Commit bf998156d24bcb127318ad5bf531ac3bdfcd6449

Authored by Huang Ying
Committed by Avi Kivity
1 parent 540ad6b62b

KVM: Avoid killing userspace through guest SRAO MCE on unmapped pages

In the common case, a guest SRAO MCE causes the corresponding poisoned
page to be unmapped and a SIGBUS to be sent to QEMU-KVM; QEMU-KVM then
relays the MCE to the guest OS.
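
As an illustration only (not part of this patch and not QEMU-KVM's
actual code), a user-space VMM could field that SIGBUS roughly as
sketched below; sigbus_handler(), install_sigbus_handler() and
vmm_relay_mce_to_guest() are hypothetical names, while sigaction(),
SA_SIGINFO, BUS_MCEERR_AO and BUS_MCEERR_AR are the standard
interfaces:

#define _GNU_SOURCE
#include <signal.h>
#include <string.h>

/* Hypothetical hook into the VMM's MCE-injection code. */
extern void vmm_relay_mce_to_guest(void *host_addr, int action_required);

static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
        /*
         * BUS_MCEERR_AO: poison was found asynchronously (SRAO) and the
         * mapping has been torn down; BUS_MCEERR_AR: this thread touched
         * the poisoned page itself.  si_addr is the affected host virtual
         * address.  Real code must keep the handler async-signal-safe and
         * defer the actual MCE injection into the guest.
         */
        (void)sig;
        (void)ctx;
        if (si->si_code == BUS_MCEERR_AO || si->si_code == BUS_MCEERR_AR)
                vmm_relay_mce_to_guest(si->si_addr,
                                       si->si_code == BUS_MCEERR_AR);
}

static void install_sigbus_handler(void)
{
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = sigbus_handler;
        sa.sa_flags = SA_SIGINFO;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGBUS, &sa, NULL);
}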

But it has been reported that if the poisoned page is accessed by the
guest after it has been unmapped and before the MCE is relayed to the
guest OS, userspace is killed.

The reason is as follows. Because the poisoned page has been unmapped,
the guest access causes a guest exit and kvm_mmu_page_fault() is
called. kvm_mmu_page_fault() cannot get the poisoned page for the fault
address, so kernel and userspace MMIO processing are tried in turn.
During userspace MMIO processing the poisoned page is accessed again,
and userspace is killed by force_sig_info().
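
For reference, this is the pre-patch error path that the mmu.c and
paging_tmpl.h hunks below remove, with the failure chain spelled out in
comments added here for explanation only:

        /* Pre-patch: every error pfn was treated as MMIO. */
        if (is_error_pfn(pfn)) {
                kvm_release_pfn_clean(pfn);
                /*
                 * Returning 1 makes kvm_mmu_page_fault() fall back to MMIO
                 * emulation.  For a hwpoisoned pfn the userspace MMIO path
                 * then touches the bad page again and QEMU-KVM is killed
                 * by force_sig_info().
                 */
                return 1;
        }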

To fix the bug, make kvm_mmu_page_fault() send a HWPOISON signal
(SIGBUS) to QEMU-KVM and skip kernel and userspace MMIO processing for
the poisoned page.

[xiao: fix warning introduced by avi]

Reported-by: Max Asbock <masbock@linux.vnet.ibm.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>

Showing 6 changed files with 95 additions and 15 deletions

arch/x86/kvm/mmu.c

@@ -32,6 +32,7 @@
 #include <linux/compiler.h>
 #include <linux/srcu.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
@@ -1960,6 +1961,27 @@
         return pt_write;
 }
 
+static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+{
+        char buf[1];
+        void __user *hva;
+        int r;
+
+        /* Touch the page, so send SIGBUS */
+        hva = (void __user *)gfn_to_hva(kvm, gfn);
+        r = copy_from_user(buf, hva, 1);
+}
+
+static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+{
+        kvm_release_pfn_clean(pfn);
+        if (is_hwpoison_pfn(pfn)) {
+                kvm_send_hwpoison_signal(kvm, gfn);
+                return 0;
+        }
+        return 1;
+}
+
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 {
         int r;
@@ -1983,10 +2005,8 @@
         pfn = gfn_to_pfn(vcpu->kvm, gfn);
 
         /* mmio */
-        if (is_error_pfn(pfn)) {
-                kvm_release_pfn_clean(pfn);
-                return 1;
-        }
+        if (is_error_pfn(pfn))
+                return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
 
         spin_lock(&vcpu->kvm->mmu_lock);
         if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2198,10 +2218,8 @@
         mmu_seq = vcpu->kvm->mmu_notifier_seq;
         smp_rmb();
         pfn = gfn_to_pfn(vcpu->kvm, gfn);
-        if (is_error_pfn(pfn)) {
-                kvm_release_pfn_clean(pfn);
-                return 1;
-        }
+        if (is_error_pfn(pfn))
+                return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
         spin_lock(&vcpu->kvm->mmu_lock);
         if (mmu_notifier_retry(vcpu, mmu_seq))
                 goto out_unlock;

arch/x86/kvm/paging_tmpl.h
... ... @@ -431,11 +431,8 @@
431 431 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
432 432  
433 433 /* mmio */
434   - if (is_error_pfn(pfn)) {
435   - pgprintk("gfn %lx is mmio\n", walker.gfn);
436   - kvm_release_pfn_clean(pfn);
437   - return 1;
438   - }
  434 + if (is_error_pfn(pfn))
  435 + return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
439 436  
440 437 spin_lock(&vcpu->kvm->mmu_lock);
441 438 if (mmu_notifier_retry(vcpu, mmu_seq))
include/linux/kvm_host.h
@@ -266,6 +266,7 @@
 
 int is_error_page(struct page *page);
 int is_error_pfn(pfn_t pfn);
+int is_hwpoison_pfn(pfn_t pfn);
 int kvm_is_error_hva(unsigned long addr);
 int kvm_set_memory_region(struct kvm *kvm,
                           struct kvm_userspace_memory_region *mem,
include/linux/mm.h

@@ -1465,6 +1465,14 @@
 extern void shake_page(struct page *p, int access);
 extern atomic_long_t mce_bad_pages;
 extern int soft_offline_page(struct page *page, int flags);
+#ifdef CONFIG_MEMORY_FAILURE
+int is_hwpoison_address(unsigned long addr);
+#else
+static inline int is_hwpoison_address(unsigned long addr)
+{
+        return 0;
+}
+#endif
 
 extern void dump_page(struct page *page);
 
mm/memory-failure.c

@@ -45,6 +45,7 @@
 #include <linux/page-isolation.h>
 #include <linux/suspend.h>
 #include <linux/slab.h>
+#include <linux/swapops.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1296,4 +1297,33 @@
         /* keep elevated page count for bad page */
         return ret;
 }
+
+int is_hwpoison_address(unsigned long addr)
+{
+        pgd_t *pgdp;
+        pud_t pud, *pudp;
+        pmd_t pmd, *pmdp;
+        pte_t pte, *ptep;
+        swp_entry_t entry;
+
+        pgdp = pgd_offset(current->mm, addr);
+        if (!pgd_present(*pgdp))
+                return 0;
+        pudp = pud_offset(pgdp, addr);
+        pud = *pudp;
+        if (!pud_present(pud) || pud_large(pud))
+                return 0;
+        pmdp = pmd_offset(pudp, addr);
+        pmd = *pmdp;
+        if (!pmd_present(pmd) || pmd_large(pmd))
+                return 0;
+        ptep = pte_offset_map(pmdp, addr);
+        pte = *ptep;
+        pte_unmap(ptep);
+        if (!is_swap_pte(pte))
+                return 0;
+        entry = pte_to_swp_entry(pte);
+        return is_hwpoison_entry(entry);
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_address);

virt/kvm/kvm_main.c

@@ -92,6 +92,9 @@
 
 static bool largepages_enabled = true;
 
+struct page *hwpoison_page;
+pfn_t hwpoison_pfn;
+
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
         if (pfn_valid(pfn)) {
@@ -810,16 +813,22 @@
 
 int is_error_page(struct page *page)
 {
-        return page == bad_page;
+        return page == bad_page || page == hwpoison_page;
 }
 EXPORT_SYMBOL_GPL(is_error_page);
 
 int is_error_pfn(pfn_t pfn)
 {
-        return pfn == bad_pfn;
+        return pfn == bad_pfn || pfn == hwpoison_pfn;
 }
 EXPORT_SYMBOL_GPL(is_error_pfn);
 
+int is_hwpoison_pfn(pfn_t pfn)
+{
+        return pfn == hwpoison_pfn;
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
+
 static inline unsigned long bad_hva(void)
 {
         return PAGE_OFFSET;
@@ -945,6 +954,11 @@
         if (unlikely(npages != 1)) {
                 struct vm_area_struct *vma;
 
+                if (is_hwpoison_address(addr)) {
+                        get_page(hwpoison_page);
+                        return page_to_pfn(hwpoison_page);
+                }
+
                 down_read(&current->mm->mmap_sem);
                 vma = find_vma(current->mm, addr);
 
@@ -2197,6 +2211,15 @@
 
         bad_pfn = page_to_pfn(bad_page);
 
+        hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+        if (hwpoison_page == NULL) {
+                r = -ENOMEM;
+                goto out_free_0;
+        }
+
+        hwpoison_pfn = page_to_pfn(hwpoison_page);
+
         if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
                 r = -ENOMEM;
                 goto out_free_0;
@@ -2269,6 +2292,8 @@
 out_free_0a:
         free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
+        if (hwpoison_page)
+                __free_page(hwpoison_page);
         __free_page(bad_page);
 out:
         kvm_arch_exit();
@@ -2290,6 +2315,7 @@
         kvm_arch_hardware_unsetup();
         kvm_arch_exit();
         free_cpumask_var(cpus_hardware_enabled);
+        __free_page(hwpoison_page);
         __free_page(bad_page);
 }
 EXPORT_SYMBOL_GPL(kvm_exit);