Commit eb64c3c6cdb8fa8a4d324eb71a9033b62e150918

Authored by Linus Torvalds

Merge tag 'stable/for-linus-3.19-rc0b-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip

Pull additional xen update from David Vrabel:
 "Xen: additional features for 3.19-rc0

   - Linear p2m for x86 PV guests, which simplifies the p2m code,
     improves performance, and will allow for > 512 GB PV guests in the
     future.

  A last-minute, configuration-specific issue was discovered with this
  change, which is why it was not included in my previous pull request.
  This has now been fixed and tested"

* tag 'stable/for-linus-3.19-rc0b-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip:
  xen: switch to post-init routines in xen mmu.c earlier
  Revert "swiotlb-xen: pass dev_addr to swiotlb_tbl_unmap_single"
  xen: annotate xen_set_identity_and_remap_chunk() with __init
  xen: introduce helper functions to do safe read and write accesses
  xen: Speed up set_phys_to_machine() by using read-only mappings
  xen: switch to linear virtual mapped sparse p2m list
  xen: Hide get_phys_to_machine() to be able to tune common path
  x86: Introduce function to get pmd entry pointer
  xen: Delay invalidating extra memory
  xen: Delay m2p_override initialization
  xen: Delay remapping memory of pv-domain
  xen: use common page allocation function in p2m.c
  xen: Make functions static
  xen: fix some style issues in p2m.c
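
For orientation before the diff: the centerpiece of this pull is the switch to a
linear, virtually mapped p2m list, which turns the common pfn-to-mfn lookup into
a bounds check plus a single array read, with the slower helpers reserved for
holes and out-of-range pfns. Below is a minimal, self-contained sketch in plain
C of that lookup shape. The array, the sizes and slow_lookup() are illustrative
stand-ins only; the real implementation is __pfn_to_mfn() in the
arch/x86/include/asm/xen/page.h hunk further down.

/*
 * Userspace sketch of the linear p2m lookup idea (not the kernel code).
 * Fast path: index a virtually contiguous array by pfn.  Fall back to a
 * slower lookup only for holes or pfns beyond the mapped range, and treat
 * pfns beyond the maximum as identity-mapped.
 */
#include <stdio.h>

#define INVALID_P2M_ENTRY   (~0UL)
#define IDENTITY_FRAME_BIT  (1UL << (sizeof(unsigned long) * 8 - 2))
#define IDENTITY_FRAME(m)   ((m) | IDENTITY_FRAME_BIT)

static unsigned long p2m_table[8] = {          /* toy linear p2m array */
	100, 101, INVALID_P2M_ENTRY, 103, 104, 105, 106, 107
};
static unsigned long p2m_size = 8;             /* entries backed by the array */
static unsigned long max_p2m_pfn = 16;         /* pfns known to the slow path */

static unsigned long slow_lookup(unsigned long pfn)
{
	/* Stand-in for get_phys_to_machine(): the special-case path. */
	return INVALID_P2M_ENTRY;
}

static unsigned long pfn_to_mfn_linear(unsigned long pfn)
{
	unsigned long mfn;

	if (pfn < p2m_size)
		mfn = p2m_table[pfn];          /* common case: one array read */
	else if (pfn < max_p2m_pfn)
		return slow_lookup(pfn);       /* sparse tail of the table */
	else
		return IDENTITY_FRAME(pfn);    /* beyond the table: identity */

	if (mfn == INVALID_P2M_ENTRY)
		return slow_lookup(pfn);       /* hole in the linear array */

	return mfn;
}

int main(void)
{
	printf("pfn 1 -> mfn %lu\n", pfn_to_mfn_linear(1));                     /* 101 */
	printf("pfn 2 valid? %d\n", pfn_to_mfn_linear(2) != INVALID_P2M_ENTRY); /* 0 */
	printf("pfn 20 identity? %d\n",
	       pfn_to_mfn_linear(20) == IDENTITY_FRAME(20));                    /* 1 */
	return 0;
}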

Showing 7 changed files

arch/x86/include/asm/pgtable_types.h
1 #ifndef _ASM_X86_PGTABLE_DEFS_H 1 #ifndef _ASM_X86_PGTABLE_DEFS_H
2 #define _ASM_X86_PGTABLE_DEFS_H 2 #define _ASM_X86_PGTABLE_DEFS_H
3 3
4 #include <linux/const.h> 4 #include <linux/const.h>
5 #include <asm/page_types.h> 5 #include <asm/page_types.h>
6 6
7 #define FIRST_USER_ADDRESS 0 7 #define FIRST_USER_ADDRESS 0
8 8
9 #define _PAGE_BIT_PRESENT 0 /* is present */ 9 #define _PAGE_BIT_PRESENT 0 /* is present */
10 #define _PAGE_BIT_RW 1 /* writeable */ 10 #define _PAGE_BIT_RW 1 /* writeable */
11 #define _PAGE_BIT_USER 2 /* userspace addressable */ 11 #define _PAGE_BIT_USER 2 /* userspace addressable */
12 #define _PAGE_BIT_PWT 3 /* page write through */ 12 #define _PAGE_BIT_PWT 3 /* page write through */
13 #define _PAGE_BIT_PCD 4 /* page cache disabled */ 13 #define _PAGE_BIT_PCD 4 /* page cache disabled */
14 #define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ 14 #define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
15 #define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ 15 #define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
16 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ 16 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
17 #define _PAGE_BIT_PAT 7 /* on 4KB pages */ 17 #define _PAGE_BIT_PAT 7 /* on 4KB pages */
18 #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ 18 #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
19 #define _PAGE_BIT_SOFTW1 9 /* available for programmer */ 19 #define _PAGE_BIT_SOFTW1 9 /* available for programmer */
20 #define _PAGE_BIT_SOFTW2 10 /* " */ 20 #define _PAGE_BIT_SOFTW2 10 /* " */
21 #define _PAGE_BIT_SOFTW3 11 /* " */ 21 #define _PAGE_BIT_SOFTW3 11 /* " */
22 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23 #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 23 #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
24 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 24 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
25 #define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ 25 #define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
26 #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ 26 #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
27 #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ 27 #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
28 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 28 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
29 29
30 /* 30 /*
31 * Swap offsets on configurations that allow automatic NUMA balancing use the 31 * Swap offsets on configurations that allow automatic NUMA balancing use the
32 * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from 32 * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
33 * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the 33 * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
34 * maximum possible swap space from 16TB to 8TB. 34 * maximum possible swap space from 16TB to 8TB.
35 */ 35 */
36 #define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) 36 #define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1)
37 37
38 /* If _PAGE_BIT_PRESENT is clear, we use these: */ 38 /* If _PAGE_BIT_PRESENT is clear, we use these: */
39 /* - if the user mapped it with PROT_NONE; pte_present gives true */ 39 /* - if the user mapped it with PROT_NONE; pte_present gives true */
40 #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL 40 #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
41 /* - set: nonlinear file mapping, saved PTE; unset:swap */ 41 /* - set: nonlinear file mapping, saved PTE; unset:swap */
42 #define _PAGE_BIT_FILE _PAGE_BIT_DIRTY 42 #define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
43 43
44 #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) 44 #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
45 #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) 45 #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
46 #define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) 46 #define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
47 #define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) 47 #define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
48 #define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) 48 #define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
49 #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) 49 #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
50 #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) 50 #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
51 #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) 51 #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
52 #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) 52 #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
53 #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) 53 #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
54 #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) 54 #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
55 #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) 55 #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
56 #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 56 #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
57 #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 57 #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
58 #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 58 #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
59 #define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING) 59 #define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
60 #define __HAVE_ARCH_PTE_SPECIAL 60 #define __HAVE_ARCH_PTE_SPECIAL
61 61
62 #ifdef CONFIG_KMEMCHECK 62 #ifdef CONFIG_KMEMCHECK
63 #define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) 63 #define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
64 #else 64 #else
65 #define _PAGE_HIDDEN (_AT(pteval_t, 0)) 65 #define _PAGE_HIDDEN (_AT(pteval_t, 0))
66 #endif 66 #endif
67 67
68 /* 68 /*
69 * The same hidden bit is used by kmemcheck, but since kmemcheck 69 * The same hidden bit is used by kmemcheck, but since kmemcheck
70 * works on kernel pages while soft-dirty engine on user space, 70 * works on kernel pages while soft-dirty engine on user space,
71 * they do not conflict with each other. 71 * they do not conflict with each other.
72 */ 72 */
73 73
74 #ifdef CONFIG_MEM_SOFT_DIRTY 74 #ifdef CONFIG_MEM_SOFT_DIRTY
75 #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) 75 #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
76 #else 76 #else
77 #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) 77 #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0))
78 #endif 78 #endif
79 79
80 /* 80 /*
81 * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page 81 * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
82 * that is not present. The hinting fault gathers numa placement statistics 82 * that is not present. The hinting fault gathers numa placement statistics
83 * (see pte_numa()). The bit is always zero when the PTE is not present. 83 * (see pte_numa()). The bit is always zero when the PTE is not present.
84 * 84 *
85 * The bit picked must be always zero when the pmd is present and not 85 * The bit picked must be always zero when the pmd is present and not
86 * present, so that we don't lose information when we set it while 86 * present, so that we don't lose information when we set it while
87 * atomically clearing the present bit. 87 * atomically clearing the present bit.
88 */ 88 */
89 #ifdef CONFIG_NUMA_BALANCING 89 #ifdef CONFIG_NUMA_BALANCING
90 #define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) 90 #define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
91 #else 91 #else
92 #define _PAGE_NUMA (_AT(pteval_t, 0)) 92 #define _PAGE_NUMA (_AT(pteval_t, 0))
93 #endif 93 #endif
94 94
95 /* 95 /*
96 * Tracking soft dirty bit when a page goes to a swap is tricky. 96 * Tracking soft dirty bit when a page goes to a swap is tricky.
97 * We need a bit which can be stored in pte _and_ not conflict 97 * We need a bit which can be stored in pte _and_ not conflict
98 * with swap entry format. On x86 bits 6 and 7 are *not* involved 98 * with swap entry format. On x86 bits 6 and 7 are *not* involved
99 * into swap entry computation, but bit 6 is used for nonlinear 99 * into swap entry computation, but bit 6 is used for nonlinear
100 * file mapping, so we borrow bit 7 for soft dirty tracking. 100 * file mapping, so we borrow bit 7 for soft dirty tracking.
101 * 101 *
102 * Please note that this bit must be treated as swap dirty page 102 * Please note that this bit must be treated as swap dirty page
103 * mark if and only if the PTE has present bit clear! 103 * mark if and only if the PTE has present bit clear!
104 */ 104 */
105 #ifdef CONFIG_MEM_SOFT_DIRTY 105 #ifdef CONFIG_MEM_SOFT_DIRTY
106 #define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE 106 #define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE
107 #else 107 #else
108 #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) 108 #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0))
109 #endif 109 #endif
110 110
111 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 111 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
112 #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) 112 #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
113 #else 113 #else
114 #define _PAGE_NX (_AT(pteval_t, 0)) 114 #define _PAGE_NX (_AT(pteval_t, 0))
115 #endif 115 #endif
116 116
117 #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) 117 #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
118 #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) 118 #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
119 119
120 #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ 120 #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
121 _PAGE_ACCESSED | _PAGE_DIRTY) 121 _PAGE_ACCESSED | _PAGE_DIRTY)
122 #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ 122 #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
123 _PAGE_DIRTY) 123 _PAGE_DIRTY)
124 124
125 /* Set of bits not changed in pte_modify */ 125 /* Set of bits not changed in pte_modify */
126 #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ 126 #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
127 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ 127 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
128 _PAGE_SOFT_DIRTY | _PAGE_NUMA) 128 _PAGE_SOFT_DIRTY | _PAGE_NUMA)
129 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) 129 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
130 130
131 /* 131 /*
132 * The cache modes defined here are used to translate between pure SW usage 132 * The cache modes defined here are used to translate between pure SW usage
133 * and the HW defined cache mode bits and/or PAT entries. 133 * and the HW defined cache mode bits and/or PAT entries.
134 * 134 *
135 * The resulting bits for PWT, PCD and PAT should be chosen in a way 135 * The resulting bits for PWT, PCD and PAT should be chosen in a way
136 * to have the WB mode at index 0 (all bits clear). This is the default 136 * to have the WB mode at index 0 (all bits clear). This is the default
137 * right now and likely would break too much if changed. 137 * right now and likely would break too much if changed.
138 */ 138 */
139 #ifndef __ASSEMBLY__ 139 #ifndef __ASSEMBLY__
140 enum page_cache_mode { 140 enum page_cache_mode {
141 _PAGE_CACHE_MODE_WB = 0, 141 _PAGE_CACHE_MODE_WB = 0,
142 _PAGE_CACHE_MODE_WC = 1, 142 _PAGE_CACHE_MODE_WC = 1,
143 _PAGE_CACHE_MODE_UC_MINUS = 2, 143 _PAGE_CACHE_MODE_UC_MINUS = 2,
144 _PAGE_CACHE_MODE_UC = 3, 144 _PAGE_CACHE_MODE_UC = 3,
145 _PAGE_CACHE_MODE_WT = 4, 145 _PAGE_CACHE_MODE_WT = 4,
146 _PAGE_CACHE_MODE_WP = 5, 146 _PAGE_CACHE_MODE_WP = 5,
147 _PAGE_CACHE_MODE_NUM = 8 147 _PAGE_CACHE_MODE_NUM = 8
148 }; 148 };
149 #endif 149 #endif
150 150
151 #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) 151 #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
152 #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) 152 #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC))
153 153
154 #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) 154 #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
155 #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ 155 #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
156 _PAGE_ACCESSED | _PAGE_NX) 156 _PAGE_ACCESSED | _PAGE_NX)
157 157
158 #define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \ 158 #define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
159 _PAGE_USER | _PAGE_ACCESSED) 159 _PAGE_USER | _PAGE_ACCESSED)
160 #define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ 160 #define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
161 _PAGE_ACCESSED | _PAGE_NX) 161 _PAGE_ACCESSED | _PAGE_NX)
162 #define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ 162 #define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
163 _PAGE_ACCESSED) 163 _PAGE_ACCESSED)
164 #define PAGE_COPY PAGE_COPY_NOEXEC 164 #define PAGE_COPY PAGE_COPY_NOEXEC
165 #define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \ 165 #define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
166 _PAGE_ACCESSED | _PAGE_NX) 166 _PAGE_ACCESSED | _PAGE_NX)
167 #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ 167 #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
168 _PAGE_ACCESSED) 168 _PAGE_ACCESSED)
169 169
170 #define __PAGE_KERNEL_EXEC \ 170 #define __PAGE_KERNEL_EXEC \
171 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL) 171 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
172 #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) 172 #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
173 173
174 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) 174 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
175 #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) 175 #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
176 #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_NOCACHE) 176 #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_NOCACHE)
177 #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) 177 #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
178 #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) 178 #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
179 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) 179 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
180 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) 180 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
181 181
182 #define __PAGE_KERNEL_IO (__PAGE_KERNEL) 182 #define __PAGE_KERNEL_IO (__PAGE_KERNEL)
183 #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) 183 #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE)
184 184
185 #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) 185 #define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
186 #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) 186 #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
187 #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) 187 #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
188 #define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) 188 #define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
189 #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) 189 #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
190 #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) 190 #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
191 #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) 191 #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
192 #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) 192 #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
193 #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) 193 #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR)
194 194
195 #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) 195 #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
196 #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) 196 #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
197 197
198 /* xwr */ 198 /* xwr */
199 #define __P000 PAGE_NONE 199 #define __P000 PAGE_NONE
200 #define __P001 PAGE_READONLY 200 #define __P001 PAGE_READONLY
201 #define __P010 PAGE_COPY 201 #define __P010 PAGE_COPY
202 #define __P011 PAGE_COPY 202 #define __P011 PAGE_COPY
203 #define __P100 PAGE_READONLY_EXEC 203 #define __P100 PAGE_READONLY_EXEC
204 #define __P101 PAGE_READONLY_EXEC 204 #define __P101 PAGE_READONLY_EXEC
205 #define __P110 PAGE_COPY_EXEC 205 #define __P110 PAGE_COPY_EXEC
206 #define __P111 PAGE_COPY_EXEC 206 #define __P111 PAGE_COPY_EXEC
207 207
208 #define __S000 PAGE_NONE 208 #define __S000 PAGE_NONE
209 #define __S001 PAGE_READONLY 209 #define __S001 PAGE_READONLY
210 #define __S010 PAGE_SHARED 210 #define __S010 PAGE_SHARED
211 #define __S011 PAGE_SHARED 211 #define __S011 PAGE_SHARED
212 #define __S100 PAGE_READONLY_EXEC 212 #define __S100 PAGE_READONLY_EXEC
213 #define __S101 PAGE_READONLY_EXEC 213 #define __S101 PAGE_READONLY_EXEC
214 #define __S110 PAGE_SHARED_EXEC 214 #define __S110 PAGE_SHARED_EXEC
215 #define __S111 PAGE_SHARED_EXEC 215 #define __S111 PAGE_SHARED_EXEC
216 216
217 /* 217 /*
218 * early identity mapping pte attrib macros. 218 * early identity mapping pte attrib macros.
219 */ 219 */
220 #ifdef CONFIG_X86_64 220 #ifdef CONFIG_X86_64
221 #define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC 221 #define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
222 #else 222 #else
223 #define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ 223 #define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
224 #define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */ 224 #define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */
225 #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ 225 #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
226 #endif 226 #endif
227 227
228 #ifdef CONFIG_X86_32 228 #ifdef CONFIG_X86_32
229 # include <asm/pgtable_32_types.h> 229 # include <asm/pgtable_32_types.h>
230 #else 230 #else
231 # include <asm/pgtable_64_types.h> 231 # include <asm/pgtable_64_types.h>
232 #endif 232 #endif
233 233
234 #ifndef __ASSEMBLY__ 234 #ifndef __ASSEMBLY__
235 235
236 #include <linux/types.h> 236 #include <linux/types.h>
237 237
238 /* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ 238 /* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
239 #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) 239 #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
240 240
241 /* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ 241 /* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
242 #define PTE_FLAGS_MASK (~PTE_PFN_MASK) 242 #define PTE_FLAGS_MASK (~PTE_PFN_MASK)
243 243
244 typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; 244 typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
245 245
246 typedef struct { pgdval_t pgd; } pgd_t; 246 typedef struct { pgdval_t pgd; } pgd_t;
247 247
248 static inline pgd_t native_make_pgd(pgdval_t val) 248 static inline pgd_t native_make_pgd(pgdval_t val)
249 { 249 {
250 return (pgd_t) { val }; 250 return (pgd_t) { val };
251 } 251 }
252 252
253 static inline pgdval_t native_pgd_val(pgd_t pgd) 253 static inline pgdval_t native_pgd_val(pgd_t pgd)
254 { 254 {
255 return pgd.pgd; 255 return pgd.pgd;
256 } 256 }
257 257
258 static inline pgdval_t pgd_flags(pgd_t pgd) 258 static inline pgdval_t pgd_flags(pgd_t pgd)
259 { 259 {
260 return native_pgd_val(pgd) & PTE_FLAGS_MASK; 260 return native_pgd_val(pgd) & PTE_FLAGS_MASK;
261 } 261 }
262 262
263 #if PAGETABLE_LEVELS > 3 263 #if PAGETABLE_LEVELS > 3
264 typedef struct { pudval_t pud; } pud_t; 264 typedef struct { pudval_t pud; } pud_t;
265 265
266 static inline pud_t native_make_pud(pmdval_t val) 266 static inline pud_t native_make_pud(pmdval_t val)
267 { 267 {
268 return (pud_t) { val }; 268 return (pud_t) { val };
269 } 269 }
270 270
271 static inline pudval_t native_pud_val(pud_t pud) 271 static inline pudval_t native_pud_val(pud_t pud)
272 { 272 {
273 return pud.pud; 273 return pud.pud;
274 } 274 }
275 #else 275 #else
276 #include <asm-generic/pgtable-nopud.h> 276 #include <asm-generic/pgtable-nopud.h>
277 277
278 static inline pudval_t native_pud_val(pud_t pud) 278 static inline pudval_t native_pud_val(pud_t pud)
279 { 279 {
280 return native_pgd_val(pud.pgd); 280 return native_pgd_val(pud.pgd);
281 } 281 }
282 #endif 282 #endif
283 283
284 #if PAGETABLE_LEVELS > 2 284 #if PAGETABLE_LEVELS > 2
285 typedef struct { pmdval_t pmd; } pmd_t; 285 typedef struct { pmdval_t pmd; } pmd_t;
286 286
287 static inline pmd_t native_make_pmd(pmdval_t val) 287 static inline pmd_t native_make_pmd(pmdval_t val)
288 { 288 {
289 return (pmd_t) { val }; 289 return (pmd_t) { val };
290 } 290 }
291 291
292 static inline pmdval_t native_pmd_val(pmd_t pmd) 292 static inline pmdval_t native_pmd_val(pmd_t pmd)
293 { 293 {
294 return pmd.pmd; 294 return pmd.pmd;
295 } 295 }
296 #else 296 #else
297 #include <asm-generic/pgtable-nopmd.h> 297 #include <asm-generic/pgtable-nopmd.h>
298 298
299 static inline pmdval_t native_pmd_val(pmd_t pmd) 299 static inline pmdval_t native_pmd_val(pmd_t pmd)
300 { 300 {
301 return native_pgd_val(pmd.pud.pgd); 301 return native_pgd_val(pmd.pud.pgd);
302 } 302 }
303 #endif 303 #endif
304 304
305 static inline pudval_t pud_flags(pud_t pud) 305 static inline pudval_t pud_flags(pud_t pud)
306 { 306 {
307 return native_pud_val(pud) & PTE_FLAGS_MASK; 307 return native_pud_val(pud) & PTE_FLAGS_MASK;
308 } 308 }
309 309
310 static inline pmdval_t pmd_flags(pmd_t pmd) 310 static inline pmdval_t pmd_flags(pmd_t pmd)
311 { 311 {
312 return native_pmd_val(pmd) & PTE_FLAGS_MASK; 312 return native_pmd_val(pmd) & PTE_FLAGS_MASK;
313 } 313 }
314 314
315 static inline pte_t native_make_pte(pteval_t val) 315 static inline pte_t native_make_pte(pteval_t val)
316 { 316 {
317 return (pte_t) { .pte = val }; 317 return (pte_t) { .pte = val };
318 } 318 }
319 319
320 static inline pteval_t native_pte_val(pte_t pte) 320 static inline pteval_t native_pte_val(pte_t pte)
321 { 321 {
322 return pte.pte; 322 return pte.pte;
323 } 323 }
324 324
325 static inline pteval_t pte_flags(pte_t pte) 325 static inline pteval_t pte_flags(pte_t pte)
326 { 326 {
327 return native_pte_val(pte) & PTE_FLAGS_MASK; 327 return native_pte_val(pte) & PTE_FLAGS_MASK;
328 } 328 }
329 329
330 #ifdef CONFIG_NUMA_BALANCING 330 #ifdef CONFIG_NUMA_BALANCING
331 /* Set of bits that distinguishes present, prot_none and numa ptes */ 331 /* Set of bits that distinguishes present, prot_none and numa ptes */
332 #define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT) 332 #define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)
333 static inline pteval_t ptenuma_flags(pte_t pte) 333 static inline pteval_t ptenuma_flags(pte_t pte)
334 { 334 {
335 return pte_flags(pte) & _PAGE_NUMA_MASK; 335 return pte_flags(pte) & _PAGE_NUMA_MASK;
336 } 336 }
337 337
338 static inline pmdval_t pmdnuma_flags(pmd_t pmd) 338 static inline pmdval_t pmdnuma_flags(pmd_t pmd)
339 { 339 {
340 return pmd_flags(pmd) & _PAGE_NUMA_MASK; 340 return pmd_flags(pmd) & _PAGE_NUMA_MASK;
341 } 341 }
342 #endif /* CONFIG_NUMA_BALANCING */ 342 #endif /* CONFIG_NUMA_BALANCING */
343 343
344 #define pgprot_val(x) ((x).pgprot) 344 #define pgprot_val(x) ((x).pgprot)
345 #define __pgprot(x) ((pgprot_t) { (x) } ) 345 #define __pgprot(x) ((pgprot_t) { (x) } )
346 346
347 extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM]; 347 extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM];
348 extern uint8_t __pte2cachemode_tbl[8]; 348 extern uint8_t __pte2cachemode_tbl[8];
349 349
350 #define __pte2cm_idx(cb) \ 350 #define __pte2cm_idx(cb) \
351 ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \ 351 ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \
352 (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \ 352 (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \
353 (((cb) >> _PAGE_BIT_PWT) & 1)) 353 (((cb) >> _PAGE_BIT_PWT) & 1))
354 #define __cm_idx2pte(i) \ 354 #define __cm_idx2pte(i) \
355 ((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \ 355 ((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \
356 (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \ 356 (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \
357 (((i) & 1) << _PAGE_BIT_PWT)) 357 (((i) & 1) << _PAGE_BIT_PWT))
358 358
359 static inline unsigned long cachemode2protval(enum page_cache_mode pcm) 359 static inline unsigned long cachemode2protval(enum page_cache_mode pcm)
360 { 360 {
361 if (likely(pcm == 0)) 361 if (likely(pcm == 0))
362 return 0; 362 return 0;
363 return __cachemode2pte_tbl[pcm]; 363 return __cachemode2pte_tbl[pcm];
364 } 364 }
365 static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) 365 static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
366 { 366 {
367 return __pgprot(cachemode2protval(pcm)); 367 return __pgprot(cachemode2protval(pcm));
368 } 368 }
369 static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) 369 static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
370 { 370 {
371 unsigned long masked; 371 unsigned long masked;
372 372
373 masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; 373 masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK;
374 if (likely(masked == 0)) 374 if (likely(masked == 0))
375 return 0; 375 return 0;
376 return __pte2cachemode_tbl[__pte2cm_idx(masked)]; 376 return __pte2cachemode_tbl[__pte2cm_idx(masked)];
377 } 377 }
378 static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) 378 static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot)
379 { 379 {
380 pgprot_t new; 380 pgprot_t new;
381 unsigned long val; 381 unsigned long val;
382 382
383 val = pgprot_val(pgprot); 383 val = pgprot_val(pgprot);
384 pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | 384 pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
385 ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); 385 ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
386 return new; 386 return new;
387 } 387 }
388 static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) 388 static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
389 { 389 {
390 pgprot_t new; 390 pgprot_t new;
391 unsigned long val; 391 unsigned long val;
392 392
393 val = pgprot_val(pgprot); 393 val = pgprot_val(pgprot);
394 pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | 394 pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
395 ((val & _PAGE_PAT_LARGE) >> 395 ((val & _PAGE_PAT_LARGE) >>
396 (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); 396 (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
397 return new; 397 return new;
398 } 398 }
399 399
400 400
401 typedef struct page *pgtable_t; 401 typedef struct page *pgtable_t;
402 402
403 extern pteval_t __supported_pte_mask; 403 extern pteval_t __supported_pte_mask;
404 extern void set_nx(void); 404 extern void set_nx(void);
405 extern int nx_enabled; 405 extern int nx_enabled;
406 406
407 #define pgprot_writecombine pgprot_writecombine 407 #define pgprot_writecombine pgprot_writecombine
408 extern pgprot_t pgprot_writecombine(pgprot_t prot); 408 extern pgprot_t pgprot_writecombine(pgprot_t prot);
409 409
410 /* Indicate that x86 has its own track and untrack pfn vma functions */ 410 /* Indicate that x86 has its own track and untrack pfn vma functions */
411 #define __HAVE_PFNMAP_TRACKING 411 #define __HAVE_PFNMAP_TRACKING
412 412
413 #define __HAVE_PHYS_MEM_ACCESS_PROT 413 #define __HAVE_PHYS_MEM_ACCESS_PROT
414 struct file; 414 struct file;
415 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 415 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
416 unsigned long size, pgprot_t vma_prot); 416 unsigned long size, pgprot_t vma_prot);
417 int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, 417 int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
418 unsigned long size, pgprot_t *vma_prot); 418 unsigned long size, pgprot_t *vma_prot);
419 419
420 /* Install a pte for a particular vaddr in kernel space. */ 420 /* Install a pte for a particular vaddr in kernel space. */
421 void set_pte_vaddr(unsigned long vaddr, pte_t pte); 421 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
422 422
423 #ifdef CONFIG_X86_32 423 #ifdef CONFIG_X86_32
424 extern void native_pagetable_init(void); 424 extern void native_pagetable_init(void);
425 #else 425 #else
426 #define native_pagetable_init paging_init 426 #define native_pagetable_init paging_init
427 #endif 427 #endif
428 428
429 struct seq_file; 429 struct seq_file;
430 extern void arch_report_meminfo(struct seq_file *m); 430 extern void arch_report_meminfo(struct seq_file *m);
431 431
432 enum pg_level { 432 enum pg_level {
433 PG_LEVEL_NONE, 433 PG_LEVEL_NONE,
434 PG_LEVEL_4K, 434 PG_LEVEL_4K,
435 PG_LEVEL_2M, 435 PG_LEVEL_2M,
436 PG_LEVEL_1G, 436 PG_LEVEL_1G,
437 PG_LEVEL_NUM 437 PG_LEVEL_NUM
438 }; 438 };
439 439
440 #ifdef CONFIG_PROC_FS 440 #ifdef CONFIG_PROC_FS
441 extern void update_page_count(int level, unsigned long pages); 441 extern void update_page_count(int level, unsigned long pages);
442 #else 442 #else
443 static inline void update_page_count(int level, unsigned long pages) { } 443 static inline void update_page_count(int level, unsigned long pages) { }
444 #endif 444 #endif
445 445
446 /* 446 /*
447 * Helper function that returns the kernel pagetable entry controlling 447 * Helper function that returns the kernel pagetable entry controlling
448 * the virtual address 'address'. NULL means no pagetable entry present. 448 * the virtual address 'address'. NULL means no pagetable entry present.
449 * NOTE: the return type is pte_t but if the pmd is PSE then we return it 449 * NOTE: the return type is pte_t but if the pmd is PSE then we return it
450 * as a pte too. 450 * as a pte too.
451 */ 451 */
452 extern pte_t *lookup_address(unsigned long address, unsigned int *level); 452 extern pte_t *lookup_address(unsigned long address, unsigned int *level);
453 extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, 453 extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
454 unsigned int *level); 454 unsigned int *level);
455 extern pmd_t *lookup_pmd_address(unsigned long address);
455 extern phys_addr_t slow_virt_to_phys(void *__address); 456 extern phys_addr_t slow_virt_to_phys(void *__address);
456 extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, 457 extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
457 unsigned numpages, unsigned long page_flags); 458 unsigned numpages, unsigned long page_flags);
458 void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address, 459 void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
459 unsigned numpages); 460 unsigned numpages);
460 #endif /* !__ASSEMBLY__ */ 461 #endif /* !__ASSEMBLY__ */
461 462
462 #endif /* _ASM_X86_PGTABLE_DEFS_H */ 463 #endif /* _ASM_X86_PGTABLE_DEFS_H */
463 464
arch/x86/include/asm/xen/page.h
1 #ifndef _ASM_X86_XEN_PAGE_H 1 #ifndef _ASM_X86_XEN_PAGE_H
2 #define _ASM_X86_XEN_PAGE_H 2 #define _ASM_X86_XEN_PAGE_H
3 3
4 #include <linux/kernel.h> 4 #include <linux/kernel.h>
5 #include <linux/types.h> 5 #include <linux/types.h>
6 #include <linux/spinlock.h> 6 #include <linux/spinlock.h>
7 #include <linux/pfn.h> 7 #include <linux/pfn.h>
8 #include <linux/mm.h> 8 #include <linux/mm.h>
9 9
10 #include <asm/uaccess.h> 10 #include <asm/uaccess.h>
11 #include <asm/page.h> 11 #include <asm/page.h>
12 #include <asm/pgtable.h> 12 #include <asm/pgtable.h>
13 13
14 #include <xen/interface/xen.h> 14 #include <xen/interface/xen.h>
15 #include <xen/grant_table.h> 15 #include <xen/grant_table.h>
16 #include <xen/features.h> 16 #include <xen/features.h>
17 17
18 /* Xen machine address */ 18 /* Xen machine address */
19 typedef struct xmaddr { 19 typedef struct xmaddr {
20 phys_addr_t maddr; 20 phys_addr_t maddr;
21 } xmaddr_t; 21 } xmaddr_t;
22 22
23 /* Xen pseudo-physical address */ 23 /* Xen pseudo-physical address */
24 typedef struct xpaddr { 24 typedef struct xpaddr {
25 phys_addr_t paddr; 25 phys_addr_t paddr;
26 } xpaddr_t; 26 } xpaddr_t;
27 27
28 #define XMADDR(x) ((xmaddr_t) { .maddr = (x) }) 28 #define XMADDR(x) ((xmaddr_t) { .maddr = (x) })
29 #define XPADDR(x) ((xpaddr_t) { .paddr = (x) }) 29 #define XPADDR(x) ((xpaddr_t) { .paddr = (x) })
30 30
31 /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ 31 /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
32 #define INVALID_P2M_ENTRY (~0UL) 32 #define INVALID_P2M_ENTRY (~0UL)
33 #define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1)) 33 #define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1))
34 #define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2)) 34 #define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2))
35 #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) 35 #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
36 #define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) 36 #define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT)
37 37
38 /* Maximum amount of memory we can handle in a domain in pages */ 38 /* Maximum amount of memory we can handle in a domain in pages */
39 #define MAX_DOMAIN_PAGES \ 39 #define MAX_DOMAIN_PAGES \
40 ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) 40 ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
41 41
42 extern unsigned long *machine_to_phys_mapping; 42 extern unsigned long *machine_to_phys_mapping;
43 extern unsigned long machine_to_phys_nr; 43 extern unsigned long machine_to_phys_nr;
44 extern unsigned long *xen_p2m_addr;
45 extern unsigned long xen_p2m_size;
46 extern unsigned long xen_max_p2m_pfn;
44 47
45 extern unsigned long get_phys_to_machine(unsigned long pfn); 48 extern unsigned long get_phys_to_machine(unsigned long pfn);
46 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); 49 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
47 extern bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn);
48 extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); 50 extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
49 extern unsigned long set_phys_range_identity(unsigned long pfn_s, 51 extern unsigned long set_phys_range_identity(unsigned long pfn_s,
50 unsigned long pfn_e); 52 unsigned long pfn_e);
51 53
52 extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, 54 extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
53 struct gnttab_map_grant_ref *kmap_ops, 55 struct gnttab_map_grant_ref *kmap_ops,
54 struct page **pages, unsigned int count); 56 struct page **pages, unsigned int count);
55 extern int m2p_add_override(unsigned long mfn, struct page *page,
56 struct gnttab_map_grant_ref *kmap_op);
57 extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, 57 extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
58 struct gnttab_map_grant_ref *kmap_ops, 58 struct gnttab_map_grant_ref *kmap_ops,
59 struct page **pages, unsigned int count); 59 struct page **pages, unsigned int count);
60 extern int m2p_remove_override(struct page *page,
61 struct gnttab_map_grant_ref *kmap_op,
62 unsigned long mfn);
63 extern struct page *m2p_find_override(unsigned long mfn);
64 extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); 60 extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
65 61
62 /*
63 * Helper functions to write or read unsigned long values to/from
64 * memory, when the access may fault.
65 */
66 static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val)
67 {
68 return __put_user(val, (unsigned long __user *)addr);
69 }
70
71 static inline int xen_safe_read_ulong(unsigned long *addr, unsigned long *val)
72 {
73 return __get_user(*val, (unsigned long __user *)addr);
74 }
75
76 /*
77 * When to use pfn_to_mfn(), __pfn_to_mfn() or get_phys_to_machine():
78 * - pfn_to_mfn() returns either INVALID_P2M_ENTRY or the mfn. No indicator
79 * bits (identity or foreign) are set.
80 * - __pfn_to_mfn() returns the found entry of the p2m table. A possibly set
81 * identity or foreign indicator will be still set. __pfn_to_mfn() is
82 * encapsulating get_phys_to_machine() which is called in special cases only.
83 * - get_phys_to_machine() is to be called by __pfn_to_mfn() only in special
84 * cases needing an extended handling.
85 */
86 static inline unsigned long __pfn_to_mfn(unsigned long pfn)
87 {
88 unsigned long mfn;
89
90 if (pfn < xen_p2m_size)
91 mfn = xen_p2m_addr[pfn];
92 else if (unlikely(pfn < xen_max_p2m_pfn))
93 return get_phys_to_machine(pfn);
94 else
95 return IDENTITY_FRAME(pfn);
96
97 if (unlikely(mfn == INVALID_P2M_ENTRY))
98 return get_phys_to_machine(pfn);
99
100 return mfn;
101 }
102
66 static inline unsigned long pfn_to_mfn(unsigned long pfn) 103 static inline unsigned long pfn_to_mfn(unsigned long pfn)
67 { 104 {
68 unsigned long mfn; 105 unsigned long mfn;
69 106
70 if (xen_feature(XENFEAT_auto_translated_physmap)) 107 if (xen_feature(XENFEAT_auto_translated_physmap))
71 return pfn; 108 return pfn;
72 109
73 mfn = get_phys_to_machine(pfn); 110 mfn = __pfn_to_mfn(pfn);
74 111
75 if (mfn != INVALID_P2M_ENTRY) 112 if (mfn != INVALID_P2M_ENTRY)
76 mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); 113 mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
77 114
78 return mfn; 115 return mfn;
79 } 116 }
80 117
81 static inline int phys_to_machine_mapping_valid(unsigned long pfn) 118 static inline int phys_to_machine_mapping_valid(unsigned long pfn)
82 { 119 {
83 if (xen_feature(XENFEAT_auto_translated_physmap)) 120 if (xen_feature(XENFEAT_auto_translated_physmap))
84 return 1; 121 return 1;
85 122
86 return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY; 123 return __pfn_to_mfn(pfn) != INVALID_P2M_ENTRY;
87 } 124 }
88 125
89 static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn) 126 static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn)
90 { 127 {
91 unsigned long pfn; 128 unsigned long pfn;
92 int ret; 129 int ret;
93 130
94 if (xen_feature(XENFEAT_auto_translated_physmap)) 131 if (xen_feature(XENFEAT_auto_translated_physmap))
95 return mfn; 132 return mfn;
96 133
97 if (unlikely(mfn >= machine_to_phys_nr)) 134 if (unlikely(mfn >= machine_to_phys_nr))
98 return ~0; 135 return ~0;
99 136
100 /* 137 /*
101 * The array access can fail (e.g., device space beyond end of RAM). 138 * The array access can fail (e.g., device space beyond end of RAM).
102 * In such cases it doesn't matter what we return (we return garbage), 139 * In such cases it doesn't matter what we return (we return garbage),
103 * but we must handle the fault without crashing! 140 * but we must handle the fault without crashing!
104 */ 141 */
105 ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); 142 ret = xen_safe_read_ulong(&machine_to_phys_mapping[mfn], &pfn);
106 if (ret < 0) 143 if (ret < 0)
107 return ~0; 144 return ~0;
108 145
109 return pfn; 146 return pfn;
110 } 147 }
111 148
112 static inline unsigned long mfn_to_pfn(unsigned long mfn) 149 static inline unsigned long mfn_to_pfn(unsigned long mfn)
113 { 150 {
114 unsigned long pfn; 151 unsigned long pfn;
115 152
116 if (xen_feature(XENFEAT_auto_translated_physmap)) 153 if (xen_feature(XENFEAT_auto_translated_physmap))
117 return mfn; 154 return mfn;
118 155
119 pfn = mfn_to_pfn_no_overrides(mfn); 156 pfn = mfn_to_pfn_no_overrides(mfn);
120 if (get_phys_to_machine(pfn) != mfn) { 157 if (__pfn_to_mfn(pfn) != mfn) {
121 /* 158 /*
122 * If this appears to be a foreign mfn (because the pfn 159 * If this appears to be a foreign mfn (because the pfn
123 * doesn't map back to the mfn), then check the local override 160 * doesn't map back to the mfn), then check the local override
124 * table to see if there's a better pfn to use. 161 * table to see if there's a better pfn to use.
125 * 162 *
126 * m2p_find_override_pfn returns ~0 if it doesn't find anything. 163 * m2p_find_override_pfn returns ~0 if it doesn't find anything.
127 */ 164 */
128 pfn = m2p_find_override_pfn(mfn, ~0); 165 pfn = m2p_find_override_pfn(mfn, ~0);
129 } 166 }
130 167
131 /* 168 /*
132 * pfn is ~0 if there are no entries in the m2p for mfn or if the 169 * pfn is ~0 if there are no entries in the m2p for mfn or if the
133 * entry doesn't map back to the mfn and m2p_override doesn't have a 170 * entry doesn't map back to the mfn and m2p_override doesn't have a
134 * valid entry for it. 171 * valid entry for it.
135 */ 172 */
136 if (pfn == ~0 && 173 if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn))
137 get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn))
138 pfn = mfn; 174 pfn = mfn;
139 175
140 return pfn; 176 return pfn;
141 } 177 }
142 178
143 static inline xmaddr_t phys_to_machine(xpaddr_t phys) 179 static inline xmaddr_t phys_to_machine(xpaddr_t phys)
144 { 180 {
145 unsigned offset = phys.paddr & ~PAGE_MASK; 181 unsigned offset = phys.paddr & ~PAGE_MASK;
146 return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset); 182 return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
147 } 183 }
148 184
149 static inline xpaddr_t machine_to_phys(xmaddr_t machine) 185 static inline xpaddr_t machine_to_phys(xmaddr_t machine)
150 { 186 {
151 unsigned offset = machine.maddr & ~PAGE_MASK; 187 unsigned offset = machine.maddr & ~PAGE_MASK;
152 return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); 188 return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
153 } 189 }
154 190
155 /* 191 /*
156 * We detect special mappings in one of two ways: 192 * We detect special mappings in one of two ways:
157 * 1. If the MFN is an I/O page then Xen will set the m2p entry 193 * 1. If the MFN is an I/O page then Xen will set the m2p entry
158 * to be outside our maximum possible pseudophys range. 194 * to be outside our maximum possible pseudophys range.
159 * 2. If the MFN belongs to a different domain then we will certainly 195 * 2. If the MFN belongs to a different domain then we will certainly
160 * not have MFN in our p2m table. Conversely, if the page is ours, 196 * not have MFN in our p2m table. Conversely, if the page is ours,
161 * then we'll have p2m(m2p(MFN))==MFN. 197 * then we'll have p2m(m2p(MFN))==MFN.
162 * If we detect a special mapping then it doesn't have a 'struct page'. 198 * If we detect a special mapping then it doesn't have a 'struct page'.
163 * We force !pfn_valid() by returning an out-of-range pointer. 199 * We force !pfn_valid() by returning an out-of-range pointer.
164 * 200 *
165 * NB. These checks require that, for any MFN that is not in our reservation, 201 * NB. These checks require that, for any MFN that is not in our reservation,
166 * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if 202 * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
167 * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. 203 * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN.
168 * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. 204 * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
169 * 205 *
170 * NB2. When deliberately mapping foreign pages into the p2m table, you *must* 206 * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
171 * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we 207 * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
172 * require. In all the cases we care about, the FOREIGN_FRAME bit is 208 * require. In all the cases we care about, the FOREIGN_FRAME bit is
173 * masked (e.g., pfn_to_mfn()) so behaviour there is correct. 209 * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
174 */ 210 */
175 static inline unsigned long mfn_to_local_pfn(unsigned long mfn) 211 static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
176 { 212 {
177 unsigned long pfn; 213 unsigned long pfn;
178 214
179 if (xen_feature(XENFEAT_auto_translated_physmap)) 215 if (xen_feature(XENFEAT_auto_translated_physmap))
180 return mfn; 216 return mfn;
181 217
182 pfn = mfn_to_pfn(mfn); 218 pfn = mfn_to_pfn(mfn);
183 if (get_phys_to_machine(pfn) != mfn) 219 if (__pfn_to_mfn(pfn) != mfn)
184 return -1; /* force !pfn_valid() */ 220 return -1; /* force !pfn_valid() */
185 return pfn; 221 return pfn;
186 } 222 }
187 223
188 /* VIRT <-> MACHINE conversion */ 224 /* VIRT <-> MACHINE conversion */
189 #define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) 225 #define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
190 #define virt_to_pfn(v) (PFN_DOWN(__pa(v))) 226 #define virt_to_pfn(v) (PFN_DOWN(__pa(v)))
191 #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) 227 #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v)))
192 #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) 228 #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
193 229
194 static inline unsigned long pte_mfn(pte_t pte) 230 static inline unsigned long pte_mfn(pte_t pte)
195 { 231 {
196 return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; 232 return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
197 } 233 }
198 234
199 static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot) 235 static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
200 { 236 {
201 pte_t pte; 237 pte_t pte;
202 238
203 pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) | 239 pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) |
204 massage_pgprot(pgprot); 240 massage_pgprot(pgprot);
205 241
206 return pte; 242 return pte;
207 } 243 }
208 244
209 static inline pteval_t pte_val_ma(pte_t pte) 245 static inline pteval_t pte_val_ma(pte_t pte)
210 { 246 {
211 return pte.pte; 247 return pte.pte;
212 } 248 }
213 249
214 static inline pte_t __pte_ma(pteval_t x) 250 static inline pte_t __pte_ma(pteval_t x)
215 { 251 {
216 return (pte_t) { .pte = x }; 252 return (pte_t) { .pte = x };
217 } 253 }
218 254
219 #define pmd_val_ma(v) ((v).pmd) 255 #define pmd_val_ma(v) ((v).pmd)
220 #ifdef __PAGETABLE_PUD_FOLDED 256 #ifdef __PAGETABLE_PUD_FOLDED
221 #define pud_val_ma(v) ((v).pgd.pgd) 257 #define pud_val_ma(v) ((v).pgd.pgd)
222 #else 258 #else
223 #define pud_val_ma(v) ((v).pud) 259 #define pud_val_ma(v) ((v).pud)
224 #endif 260 #endif
225 #define __pmd_ma(x) ((pmd_t) { (x) } ) 261 #define __pmd_ma(x) ((pmd_t) { (x) } )
226 262
227 #define pgd_val_ma(x) ((x).pgd) 263 #define pgd_val_ma(x) ((x).pgd)
228 264
229 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); 265 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid);
230 266
231 xmaddr_t arbitrary_virt_to_machine(void *address); 267 xmaddr_t arbitrary_virt_to_machine(void *address);
232 unsigned long arbitrary_virt_to_mfn(void *vaddr); 268 unsigned long arbitrary_virt_to_mfn(void *vaddr);
233 void make_lowmem_page_readonly(void *vaddr); 269 void make_lowmem_page_readonly(void *vaddr);
234 void make_lowmem_page_readwrite(void *vaddr); 270 void make_lowmem_page_readwrite(void *vaddr);
235 271
236 #define xen_remap(cookie, size) ioremap((cookie), (size)); 272 #define xen_remap(cookie, size) ioremap((cookie), (size));
237 #define xen_unmap(cookie) iounmap((cookie)) 273 #define xen_unmap(cookie) iounmap((cookie))
238 274
239 static inline bool xen_arch_need_swiotlb(struct device *dev, 275 static inline bool xen_arch_need_swiotlb(struct device *dev,
arch/x86/mm/pageattr.c
1 /* 1 /*
2 * Copyright 2002 Andi Kleen, SuSE Labs. 2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback. 3 * Thanks to Ben LaHaise for precious feedback.
4 */ 4 */
5 #include <linux/highmem.h> 5 #include <linux/highmem.h>
6 #include <linux/bootmem.h> 6 #include <linux/bootmem.h>
7 #include <linux/module.h> 7 #include <linux/module.h>
8 #include <linux/sched.h> 8 #include <linux/sched.h>
9 #include <linux/mm.h> 9 #include <linux/mm.h>
10 #include <linux/interrupt.h> 10 #include <linux/interrupt.h>
11 #include <linux/seq_file.h> 11 #include <linux/seq_file.h>
12 #include <linux/debugfs.h> 12 #include <linux/debugfs.h>
13 #include <linux/pfn.h> 13 #include <linux/pfn.h>
14 #include <linux/percpu.h> 14 #include <linux/percpu.h>
15 #include <linux/gfp.h> 15 #include <linux/gfp.h>
16 #include <linux/pci.h> 16 #include <linux/pci.h>
17 17
18 #include <asm/e820.h> 18 #include <asm/e820.h>
19 #include <asm/processor.h> 19 #include <asm/processor.h>
20 #include <asm/tlbflush.h> 20 #include <asm/tlbflush.h>
21 #include <asm/sections.h> 21 #include <asm/sections.h>
22 #include <asm/setup.h> 22 #include <asm/setup.h>
23 #include <asm/uaccess.h> 23 #include <asm/uaccess.h>
24 #include <asm/pgalloc.h> 24 #include <asm/pgalloc.h>
25 #include <asm/proto.h> 25 #include <asm/proto.h>
26 #include <asm/pat.h> 26 #include <asm/pat.h>
27 27
28 /* 28 /*
29 * The current flushing context - we pass it instead of 5 arguments: 29 * The current flushing context - we pass it instead of 5 arguments:
30 */ 30 */
31 struct cpa_data { 31 struct cpa_data {
32 unsigned long *vaddr; 32 unsigned long *vaddr;
33 pgd_t *pgd; 33 pgd_t *pgd;
34 pgprot_t mask_set; 34 pgprot_t mask_set;
35 pgprot_t mask_clr; 35 pgprot_t mask_clr;
36 int numpages; 36 int numpages;
37 int flags; 37 int flags;
38 unsigned long pfn; 38 unsigned long pfn;
39 unsigned force_split : 1; 39 unsigned force_split : 1;
40 int curpage; 40 int curpage;
41 struct page **pages; 41 struct page **pages;
42 }; 42 };
43 43
44 /* 44 /*
45 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) 45 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
46 * using cpa_lock. So that we don't allow any other cpu, with stale large tlb 46 * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
47 * entries change the page attribute in parallel to some other cpu 47 * entries change the page attribute in parallel to some other cpu
48 * splitting a large page entry along with changing the attribute. 48 * splitting a large page entry along with changing the attribute.
49 */ 49 */
50 static DEFINE_SPINLOCK(cpa_lock); 50 static DEFINE_SPINLOCK(cpa_lock);
51 51
52 #define CPA_FLUSHTLB 1 52 #define CPA_FLUSHTLB 1
53 #define CPA_ARRAY 2 53 #define CPA_ARRAY 2
54 #define CPA_PAGES_ARRAY 4 54 #define CPA_PAGES_ARRAY 4
55 55
56 #ifdef CONFIG_PROC_FS 56 #ifdef CONFIG_PROC_FS
57 static unsigned long direct_pages_count[PG_LEVEL_NUM]; 57 static unsigned long direct_pages_count[PG_LEVEL_NUM];
58 58
59 void update_page_count(int level, unsigned long pages) 59 void update_page_count(int level, unsigned long pages)
60 { 60 {
61 /* Protect against CPA */ 61 /* Protect against CPA */
62 spin_lock(&pgd_lock); 62 spin_lock(&pgd_lock);
63 direct_pages_count[level] += pages; 63 direct_pages_count[level] += pages;
64 spin_unlock(&pgd_lock); 64 spin_unlock(&pgd_lock);
65 } 65 }
66 66
67 static void split_page_count(int level) 67 static void split_page_count(int level)
68 { 68 {
69 direct_pages_count[level]--; 69 direct_pages_count[level]--;
70 direct_pages_count[level - 1] += PTRS_PER_PTE; 70 direct_pages_count[level - 1] += PTRS_PER_PTE;
71 } 71 }
72 72
73 void arch_report_meminfo(struct seq_file *m) 73 void arch_report_meminfo(struct seq_file *m)
74 { 74 {
75 seq_printf(m, "DirectMap4k: %8lu kB\n", 75 seq_printf(m, "DirectMap4k: %8lu kB\n",
76 direct_pages_count[PG_LEVEL_4K] << 2); 76 direct_pages_count[PG_LEVEL_4K] << 2);
77 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 77 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
78 seq_printf(m, "DirectMap2M: %8lu kB\n", 78 seq_printf(m, "DirectMap2M: %8lu kB\n",
79 direct_pages_count[PG_LEVEL_2M] << 11); 79 direct_pages_count[PG_LEVEL_2M] << 11);
80 #else 80 #else
81 seq_printf(m, "DirectMap4M: %8lu kB\n", 81 seq_printf(m, "DirectMap4M: %8lu kB\n",
82 direct_pages_count[PG_LEVEL_2M] << 12); 82 direct_pages_count[PG_LEVEL_2M] << 12);
83 #endif 83 #endif
84 #ifdef CONFIG_X86_64 84 #ifdef CONFIG_X86_64
85 if (direct_gbpages) 85 if (direct_gbpages)
86 seq_printf(m, "DirectMap1G: %8lu kB\n", 86 seq_printf(m, "DirectMap1G: %8lu kB\n",
87 direct_pages_count[PG_LEVEL_1G] << 20); 87 direct_pages_count[PG_LEVEL_1G] << 20);
88 #endif 88 #endif
89 } 89 }
90 #else 90 #else
91 static inline void split_page_count(int level) { } 91 static inline void split_page_count(int level) { }
92 #endif 92 #endif
93 93
94 #ifdef CONFIG_X86_64 94 #ifdef CONFIG_X86_64
95 95
96 static inline unsigned long highmap_start_pfn(void) 96 static inline unsigned long highmap_start_pfn(void)
97 { 97 {
98 return __pa_symbol(_text) >> PAGE_SHIFT; 98 return __pa_symbol(_text) >> PAGE_SHIFT;
99 } 99 }
100 100
101 static inline unsigned long highmap_end_pfn(void) 101 static inline unsigned long highmap_end_pfn(void)
102 { 102 {
103 return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; 103 return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
104 } 104 }
105 105
106 #endif 106 #endif
107 107
108 #ifdef CONFIG_DEBUG_PAGEALLOC 108 #ifdef CONFIG_DEBUG_PAGEALLOC
109 # define debug_pagealloc 1 109 # define debug_pagealloc 1
110 #else 110 #else
111 # define debug_pagealloc 0 111 # define debug_pagealloc 0
112 #endif 112 #endif
113 113
114 static inline int 114 static inline int
115 within(unsigned long addr, unsigned long start, unsigned long end) 115 within(unsigned long addr, unsigned long start, unsigned long end)
116 { 116 {
117 return addr >= start && addr < end; 117 return addr >= start && addr < end;
118 } 118 }
119 119
120 /* 120 /*
121 * Flushing functions 121 * Flushing functions
122 */ 122 */
123 123
124 /** 124 /**
125 * clflush_cache_range - flush a cache range with clflush 125 * clflush_cache_range - flush a cache range with clflush
126 * @vaddr: virtual start address 126 * @vaddr: virtual start address
127 * @size: number of bytes to flush 127 * @size: number of bytes to flush
128 * 128 *
129 * clflushopt is an unordered instruction which needs fencing with mfence or 129 * clflushopt is an unordered instruction which needs fencing with mfence or
130 * sfence to avoid ordering issues. 130 * sfence to avoid ordering issues.
131 */ 131 */
132 void clflush_cache_range(void *vaddr, unsigned int size) 132 void clflush_cache_range(void *vaddr, unsigned int size)
133 { 133 {
134 void *vend = vaddr + size - 1; 134 void *vend = vaddr + size - 1;
135 135
136 mb(); 136 mb();
137 137
138 for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) 138 for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
139 clflushopt(vaddr); 139 clflushopt(vaddr);
140 /* 140 /*
141 * Flush any possible final partial cacheline: 141 * Flush any possible final partial cacheline:
142 */ 142 */
143 clflushopt(vend); 143 clflushopt(vend);
144 144
145 mb(); 145 mb();
146 } 146 }
147 EXPORT_SYMBOL_GPL(clflush_cache_range); 147 EXPORT_SYMBOL_GPL(clflush_cache_range);
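/*
 * Hypothetical usage sketch (not part of this diff): a caller that wrote a
 * buffer through a cacheable mapping and needs the data visible to a
 * non-coherent observer can simply do
 *
 *	clflush_cache_range(buf, len);
 *
 * and rely on the mb() pair above for ordering; buf and len are
 * illustrative names.
 */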
148 148
149 static void __cpa_flush_all(void *arg) 149 static void __cpa_flush_all(void *arg)
150 { 150 {
151 unsigned long cache = (unsigned long)arg; 151 unsigned long cache = (unsigned long)arg;
152 152
153 /* 153 /*
154 * Flush all to work around Errata in early athlons regarding 154 * Flush all to work around Errata in early athlons regarding
155 * large page flushing. 155 * large page flushing.
156 */ 156 */
157 __flush_tlb_all(); 157 __flush_tlb_all();
158 158
159 if (cache && boot_cpu_data.x86 >= 4) 159 if (cache && boot_cpu_data.x86 >= 4)
160 wbinvd(); 160 wbinvd();
161 } 161 }
162 162
163 static void cpa_flush_all(unsigned long cache) 163 static void cpa_flush_all(unsigned long cache)
164 { 164 {
165 BUG_ON(irqs_disabled()); 165 BUG_ON(irqs_disabled());
166 166
167 on_each_cpu(__cpa_flush_all, (void *) cache, 1); 167 on_each_cpu(__cpa_flush_all, (void *) cache, 1);
168 } 168 }
169 169
170 static void __cpa_flush_range(void *arg) 170 static void __cpa_flush_range(void *arg)
171 { 171 {
172 /* 172 /*
173 * We could optimize that further and do individual per page 173 * We could optimize that further and do individual per page
174 * tlb invalidates for a low number of pages. Caveat: we must 174 * tlb invalidates for a low number of pages. Caveat: we must
175 * flush the high aliases on 64bit as well. 175 * flush the high aliases on 64bit as well.
176 */ 176 */
177 __flush_tlb_all(); 177 __flush_tlb_all();
178 } 178 }
179 179
180 static void cpa_flush_range(unsigned long start, int numpages, int cache) 180 static void cpa_flush_range(unsigned long start, int numpages, int cache)
181 { 181 {
182 unsigned int i, level; 182 unsigned int i, level;
183 unsigned long addr; 183 unsigned long addr;
184 184
185 BUG_ON(irqs_disabled()); 185 BUG_ON(irqs_disabled());
186 WARN_ON(PAGE_ALIGN(start) != start); 186 WARN_ON(PAGE_ALIGN(start) != start);
187 187
188 on_each_cpu(__cpa_flush_range, NULL, 1); 188 on_each_cpu(__cpa_flush_range, NULL, 1);
189 189
190 if (!cache) 190 if (!cache)
191 return; 191 return;
192 192
193 /* 193 /*
194 * We only need to flush on one CPU, 194 * We only need to flush on one CPU,
195 * clflush is a MESI-coherent instruction that 195 * clflush is a MESI-coherent instruction that
196 * will cause all other CPUs to flush the same 196 * will cause all other CPUs to flush the same
197 * cachelines: 197 * cachelines:
198 */ 198 */
199 for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { 199 for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
200 pte_t *pte = lookup_address(addr, &level); 200 pte_t *pte = lookup_address(addr, &level);
201 201
202 /* 202 /*
203 * Only flush present addresses: 203 * Only flush present addresses:
204 */ 204 */
205 if (pte && (pte_val(*pte) & _PAGE_PRESENT)) 205 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
206 clflush_cache_range((void *) addr, PAGE_SIZE); 206 clflush_cache_range((void *) addr, PAGE_SIZE);
207 } 207 }
208 } 208 }
209 209
210 static void cpa_flush_array(unsigned long *start, int numpages, int cache, 210 static void cpa_flush_array(unsigned long *start, int numpages, int cache,
211 int in_flags, struct page **pages) 211 int in_flags, struct page **pages)
212 { 212 {
213 unsigned int i, level; 213 unsigned int i, level;
214 unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ 214 unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
215 215
216 BUG_ON(irqs_disabled()); 216 BUG_ON(irqs_disabled());
217 217
218 on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); 218 on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
219 219
220 if (!cache || do_wbinvd) 220 if (!cache || do_wbinvd)
221 return; 221 return;
222 222
223 /* 223 /*
224 * We only need to flush on one CPU, 224 * We only need to flush on one CPU,
225 * clflush is a MESI-coherent instruction that 225 * clflush is a MESI-coherent instruction that
226 * will cause all other CPUs to flush the same 226 * will cause all other CPUs to flush the same
227 * cachelines: 227 * cachelines:
228 */ 228 */
229 for (i = 0; i < numpages; i++) { 229 for (i = 0; i < numpages; i++) {
230 unsigned long addr; 230 unsigned long addr;
231 pte_t *pte; 231 pte_t *pte;
232 232
233 if (in_flags & CPA_PAGES_ARRAY) 233 if (in_flags & CPA_PAGES_ARRAY)
234 addr = (unsigned long)page_address(pages[i]); 234 addr = (unsigned long)page_address(pages[i]);
235 else 235 else
236 addr = start[i]; 236 addr = start[i];
237 237
238 pte = lookup_address(addr, &level); 238 pte = lookup_address(addr, &level);
239 239
240 /* 240 /*
241 * Only flush present addresses: 241 * Only flush present addresses:
242 */ 242 */
243 if (pte && (pte_val(*pte) & _PAGE_PRESENT)) 243 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
244 clflush_cache_range((void *)addr, PAGE_SIZE); 244 clflush_cache_range((void *)addr, PAGE_SIZE);
245 } 245 }
246 } 246 }
247 247
248 /* 248 /*
249 * Certain areas of memory on x86 require very specific protection flags, 249 * Certain areas of memory on x86 require very specific protection flags,
250 * for example the BIOS area or kernel text. Callers don't always get this 250 * for example the BIOS area or kernel text. Callers don't always get this
251 * right (again, ioremap() on BIOS memory is not uncommon) so this function 251 * right (again, ioremap() on BIOS memory is not uncommon) so this function
252 * checks and fixes these known static required protection bits. 252 * checks and fixes these known static required protection bits.
253 */ 253 */
254 static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, 254 static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
255 unsigned long pfn) 255 unsigned long pfn)
256 { 256 {
257 pgprot_t forbidden = __pgprot(0); 257 pgprot_t forbidden = __pgprot(0);
258 258
259 /* 259 /*
260 * The BIOS area between 640k and 1Mb needs to be executable for 260 * The BIOS area between 640k and 1Mb needs to be executable for
261 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. 261 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
262 */ 262 */
263 #ifdef CONFIG_PCI_BIOS 263 #ifdef CONFIG_PCI_BIOS
264 if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) 264 if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
265 pgprot_val(forbidden) |= _PAGE_NX; 265 pgprot_val(forbidden) |= _PAGE_NX;
266 #endif 266 #endif
267 267
268 /* 268 /*
269 * The kernel text needs to be executable for obvious reasons. 269 * The kernel text needs to be executable for obvious reasons.
270 * This does not cover __inittext since that is gone later on. On 270 * This does not cover __inittext since that is gone later on. On
271 * 64bit we do not enforce !NX on the low mapping. 271 * 64bit we do not enforce !NX on the low mapping.
272 */ 272 */
273 if (within(address, (unsigned long)_text, (unsigned long)_etext)) 273 if (within(address, (unsigned long)_text, (unsigned long)_etext))
274 pgprot_val(forbidden) |= _PAGE_NX; 274 pgprot_val(forbidden) |= _PAGE_NX;
275 275
276 /* 276 /*
277 * The .rodata section needs to be read-only. Using the pfn 277 * The .rodata section needs to be read-only. Using the pfn
278 * catches all aliases. 278 * catches all aliases.
279 */ 279 */
280 if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, 280 if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
281 __pa_symbol(__end_rodata) >> PAGE_SHIFT)) 281 __pa_symbol(__end_rodata) >> PAGE_SHIFT))
282 pgprot_val(forbidden) |= _PAGE_RW; 282 pgprot_val(forbidden) |= _PAGE_RW;
283 283
284 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) 284 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
285 /* 285 /*
286 * Once the kernel maps the text as RO (kernel_set_to_readonly is set), 286 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
287 * kernel text mappings for the large page aligned text, rodata sections 287 * kernel text mappings for the large page aligned text, rodata sections
288 * will always be read-only. The kernel identity mappings covering 288 * will always be read-only. The kernel identity mappings covering
289 * the holes caused by this alignment can be anything the user asks for. 289 * the holes caused by this alignment can be anything the user asks for.
290 * 290 *
291 * This will preserve the large page mappings for kernel text/data 291 * This will preserve the large page mappings for kernel text/data
292 * at no extra cost. 292 * at no extra cost.
293 */ 293 */
294 if (kernel_set_to_readonly && 294 if (kernel_set_to_readonly &&
295 within(address, (unsigned long)_text, 295 within(address, (unsigned long)_text,
296 (unsigned long)__end_rodata_hpage_align)) { 296 (unsigned long)__end_rodata_hpage_align)) {
297 unsigned int level; 297 unsigned int level;
298 298
299 /* 299 /*
300 * Don't enforce the !RW mapping for the kernel text mapping, 300 * Don't enforce the !RW mapping for the kernel text mapping,
301 * if the current mapping is already using small page mapping. 301 * if the current mapping is already using small page mapping.
302 * No need to work hard to preserve large page mappings in this 302 * No need to work hard to preserve large page mappings in this
303 * case. 303 * case.
304 * 304 *
305 * This also fixes the Linux Xen paravirt guest boot failure 305 * This also fixes the Linux Xen paravirt guest boot failure
306 * (because of unexpected read-only mappings for kernel identity 306 * (because of unexpected read-only mappings for kernel identity
307 * mappings). In this paravirt guest case, the kernel text 307 * mappings). In this paravirt guest case, the kernel text
308 * mapping and the kernel identity mapping share the same 308 * mapping and the kernel identity mapping share the same
309 * page-table pages. Thus we can't really use different 309 * page-table pages. Thus we can't really use different
310 * protections for the kernel text and identity mappings. Also, 310 * protections for the kernel text and identity mappings. Also,
311 * these shared mappings are made of small page mappings. 311 * these shared mappings are made of small page mappings.
312 * Thus, not enforcing the !RW mapping for the small page kernel 312 * Thus, not enforcing the !RW mapping for the small page kernel
313 * text mapping helps the Linux Xen paravirt guest boot 313 * text mapping helps the Linux Xen paravirt guest boot
314 * as well. 314 * as well.
315 */ 315 */
316 if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) 316 if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
317 pgprot_val(forbidden) |= _PAGE_RW; 317 pgprot_val(forbidden) |= _PAGE_RW;
318 } 318 }
319 #endif 319 #endif
320 320
321 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 321 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
322 322
323 return prot; 323 return prot;
324 } 324 }
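/*
 * Hypothetical sketch (not part of this diff): the effect of
 * static_protections() on a .rodata page could be checked like this,
 * with req and fixed as illustrative names:
 *
 *	pgprot_t req = __pgprot(_PAGE_PRESENT | _PAGE_RW);
 *	unsigned long pfn = __pa_symbol(__start_rodata) >> PAGE_SHIFT;
 *	pgprot_t fixed = static_protections(req,
 *			(unsigned long)__start_rodata, pfn);
 *
 * pgprot_val(fixed) no longer contains _PAGE_RW, keeping .rodata read-only.
 */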
325 325
326 /* 326 /*
327 * Lookup the page table entry for a virtual address in a specific pgd. 327 * Lookup the page table entry for a virtual address in a specific pgd.
328 * Return a pointer to the entry and the level of the mapping. 328 * Return a pointer to the entry and the level of the mapping.
329 */ 329 */
330 pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, 330 pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
331 unsigned int *level) 331 unsigned int *level)
332 { 332 {
333 pud_t *pud; 333 pud_t *pud;
334 pmd_t *pmd; 334 pmd_t *pmd;
335 335
336 *level = PG_LEVEL_NONE; 336 *level = PG_LEVEL_NONE;
337 337
338 if (pgd_none(*pgd)) 338 if (pgd_none(*pgd))
339 return NULL; 339 return NULL;
340 340
341 pud = pud_offset(pgd, address); 341 pud = pud_offset(pgd, address);
342 if (pud_none(*pud)) 342 if (pud_none(*pud))
343 return NULL; 343 return NULL;
344 344
345 *level = PG_LEVEL_1G; 345 *level = PG_LEVEL_1G;
346 if (pud_large(*pud) || !pud_present(*pud)) 346 if (pud_large(*pud) || !pud_present(*pud))
347 return (pte_t *)pud; 347 return (pte_t *)pud;
348 348
349 pmd = pmd_offset(pud, address); 349 pmd = pmd_offset(pud, address);
350 if (pmd_none(*pmd)) 350 if (pmd_none(*pmd))
351 return NULL; 351 return NULL;
352 352
353 *level = PG_LEVEL_2M; 353 *level = PG_LEVEL_2M;
354 if (pmd_large(*pmd) || !pmd_present(*pmd)) 354 if (pmd_large(*pmd) || !pmd_present(*pmd))
355 return (pte_t *)pmd; 355 return (pte_t *)pmd;
356 356
357 *level = PG_LEVEL_4K; 357 *level = PG_LEVEL_4K;
358 358
359 return pte_offset_kernel(pmd, address); 359 return pte_offset_kernel(pmd, address);
360 } 360 }
361 361
362 /* 362 /*
363 * Lookup the page table entry for a virtual address. Return a pointer 363 * Lookup the page table entry for a virtual address. Return a pointer
364 * to the entry and the level of the mapping. 364 * to the entry and the level of the mapping.
365 * 365 *
366 * Note: We return pud and pmd either when the entry is marked large 366 * Note: We return pud and pmd either when the entry is marked large
367 * or when the present bit is not set. Otherwise we would return a 367 * or when the present bit is not set. Otherwise we would return a
368 * pointer to a nonexisting mapping. 368 * pointer to a nonexisting mapping.
369 */ 369 */
370 pte_t *lookup_address(unsigned long address, unsigned int *level) 370 pte_t *lookup_address(unsigned long address, unsigned int *level)
371 { 371 {
372 return lookup_address_in_pgd(pgd_offset_k(address), address, level); 372 return lookup_address_in_pgd(pgd_offset_k(address), address, level);
373 } 373 }
374 EXPORT_SYMBOL_GPL(lookup_address); 374 EXPORT_SYMBOL_GPL(lookup_address);
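/*
 * Hypothetical usage sketch (not part of this diff): reporting the mapping
 * level of a kernel virtual address; the helper name is illustrative.
 */
static void __maybe_unused report_mapping_level(unsigned long vaddr)
{
	unsigned int level;
	pte_t *pte = lookup_address(vaddr, &level);

	if (!pte || !(pte_val(*pte) & _PAGE_PRESENT))
		pr_info("%#lx is not mapped\n", vaddr);
	else
		pr_info("%#lx mapped at level %u, pfn %#lx\n",
			vaddr, level, pte_pfn(*pte));
}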
375 375
376 static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, 376 static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
377 unsigned int *level) 377 unsigned int *level)
378 { 378 {
379 if (cpa->pgd) 379 if (cpa->pgd)
380 return lookup_address_in_pgd(cpa->pgd + pgd_index(address), 380 return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
381 address, level); 381 address, level);
382 382
383 return lookup_address(address, level); 383 return lookup_address(address, level);
384 } 384 }
385 385
386 /* 386 /*
387 * Lookup the PMD entry for a virtual address. Return a pointer to the entry
388 * or NULL if not present.
389 */
390 pmd_t *lookup_pmd_address(unsigned long address)
391 {
392 pgd_t *pgd;
393 pud_t *pud;
394
395 pgd = pgd_offset_k(address);
396 if (pgd_none(*pgd))
397 return NULL;
398
399 pud = pud_offset(pgd, address);
400 if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
401 return NULL;
402
403 return pmd_offset(pud, address);
404 }
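/*
 * Hypothetical usage sketch (not part of this diff): the new helper lets a
 * caller check whether an address already has a PMD entry before touching
 * lower levels; the wrapper name is illustrative.
 */
static bool __maybe_unused vaddr_has_pmd_entry(unsigned long vaddr)
{
	pmd_t *pmd = lookup_pmd_address(vaddr);

	return pmd && !pmd_none(*pmd);
}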
405
406 /*
387 * This is necessary because __pa() does not work on some 407 * This is necessary because __pa() does not work on some
388 * kinds of memory, like vmalloc() or the alloc_remap() 408 * kinds of memory, like vmalloc() or the alloc_remap()
389 * areas on 32-bit NUMA systems. The percpu areas can 409 * areas on 32-bit NUMA systems. The percpu areas can
390 * end up in this kind of memory, for instance. 410 * end up in this kind of memory, for instance.
391 * 411 *
392 * This could be optimized, but it is only intended to be 412 * This could be optimized, but it is only intended to be
393 * used at initialization time, and keeping it 413 * used at initialization time, and keeping it
394 * unoptimized should increase the testing coverage for 414 * unoptimized should increase the testing coverage for
395 * the more obscure platforms. 415 * the more obscure platforms.
396 */ 416 */
397 phys_addr_t slow_virt_to_phys(void *__virt_addr) 417 phys_addr_t slow_virt_to_phys(void *__virt_addr)
398 { 418 {
399 unsigned long virt_addr = (unsigned long)__virt_addr; 419 unsigned long virt_addr = (unsigned long)__virt_addr;
400 phys_addr_t phys_addr; 420 phys_addr_t phys_addr;
401 unsigned long offset; 421 unsigned long offset;
402 enum pg_level level; 422 enum pg_level level;
403 unsigned long psize; 423 unsigned long psize;
404 unsigned long pmask; 424 unsigned long pmask;
405 pte_t *pte; 425 pte_t *pte;
406 426
407 pte = lookup_address(virt_addr, &level); 427 pte = lookup_address(virt_addr, &level);
408 BUG_ON(!pte); 428 BUG_ON(!pte);
409 psize = page_level_size(level); 429 psize = page_level_size(level);
410 pmask = page_level_mask(level); 430 pmask = page_level_mask(level);
411 offset = virt_addr & ~pmask; 431 offset = virt_addr & ~pmask;
412 phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; 432 phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
413 return (phys_addr | offset); 433 return (phys_addr | offset);
414 } 434 }
415 EXPORT_SYMBOL_GPL(slow_virt_to_phys); 435 EXPORT_SYMBOL_GPL(slow_virt_to_phys);
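/*
 * Hypothetical usage sketch (not part of this diff): for memory outside the
 * direct map, e.g. a vmalloc() buffer, __pa(buf) must not be used; instead
 *
 *	phys_addr_t pa = slow_virt_to_phys(buf);
 *
 * returns the physical address of the backing page plus the offset within
 * it (buf is an illustrative name).
 */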
416 436
417 /* 437 /*
418 * Set the new pmd in all the pgds we know about: 438 * Set the new pmd in all the pgds we know about:
419 */ 439 */
420 static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) 440 static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
421 { 441 {
422 /* change init_mm */ 442 /* change init_mm */
423 set_pte_atomic(kpte, pte); 443 set_pte_atomic(kpte, pte);
424 #ifdef CONFIG_X86_32 444 #ifdef CONFIG_X86_32
425 if (!SHARED_KERNEL_PMD) { 445 if (!SHARED_KERNEL_PMD) {
426 struct page *page; 446 struct page *page;
427 447
428 list_for_each_entry(page, &pgd_list, lru) { 448 list_for_each_entry(page, &pgd_list, lru) {
429 pgd_t *pgd; 449 pgd_t *pgd;
430 pud_t *pud; 450 pud_t *pud;
431 pmd_t *pmd; 451 pmd_t *pmd;
432 452
433 pgd = (pgd_t *)page_address(page) + pgd_index(address); 453 pgd = (pgd_t *)page_address(page) + pgd_index(address);
434 pud = pud_offset(pgd, address); 454 pud = pud_offset(pgd, address);
435 pmd = pmd_offset(pud, address); 455 pmd = pmd_offset(pud, address);
436 set_pte_atomic((pte_t *)pmd, pte); 456 set_pte_atomic((pte_t *)pmd, pte);
437 } 457 }
438 } 458 }
439 #endif 459 #endif
440 } 460 }
441 461
442 static int 462 static int
443 try_preserve_large_page(pte_t *kpte, unsigned long address, 463 try_preserve_large_page(pte_t *kpte, unsigned long address,
444 struct cpa_data *cpa) 464 struct cpa_data *cpa)
445 { 465 {
446 unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; 466 unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
447 pte_t new_pte, old_pte, *tmp; 467 pte_t new_pte, old_pte, *tmp;
448 pgprot_t old_prot, new_prot, req_prot; 468 pgprot_t old_prot, new_prot, req_prot;
449 int i, do_split = 1; 469 int i, do_split = 1;
450 enum pg_level level; 470 enum pg_level level;
451 471
452 if (cpa->force_split) 472 if (cpa->force_split)
453 return 1; 473 return 1;
454 474
455 spin_lock(&pgd_lock); 475 spin_lock(&pgd_lock);
456 /* 476 /*
457 * Check for races, another CPU might have split this page 477 * Check for races, another CPU might have split this page
458 * up already: 478 * up already:
459 */ 479 */
460 tmp = _lookup_address_cpa(cpa, address, &level); 480 tmp = _lookup_address_cpa(cpa, address, &level);
461 if (tmp != kpte) 481 if (tmp != kpte)
462 goto out_unlock; 482 goto out_unlock;
463 483
464 switch (level) { 484 switch (level) {
465 case PG_LEVEL_2M: 485 case PG_LEVEL_2M:
466 #ifdef CONFIG_X86_64 486 #ifdef CONFIG_X86_64
467 case PG_LEVEL_1G: 487 case PG_LEVEL_1G:
468 #endif 488 #endif
469 psize = page_level_size(level); 489 psize = page_level_size(level);
470 pmask = page_level_mask(level); 490 pmask = page_level_mask(level);
471 break; 491 break;
472 default: 492 default:
473 do_split = -EINVAL; 493 do_split = -EINVAL;
474 goto out_unlock; 494 goto out_unlock;
475 } 495 }
476 496
477 /* 497 /*
478 * Calculate the number of pages, which fit into this large 498 * Calculate the number of pages, which fit into this large
479 * page starting at address: 499 * page starting at address:
480 */ 500 */
481 nextpage_addr = (address + psize) & pmask; 501 nextpage_addr = (address + psize) & pmask;
482 numpages = (nextpage_addr - address) >> PAGE_SHIFT; 502 numpages = (nextpage_addr - address) >> PAGE_SHIFT;
483 if (numpages < cpa->numpages) 503 if (numpages < cpa->numpages)
484 cpa->numpages = numpages; 504 cpa->numpages = numpages;
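	/*
	 * Worked example with hypothetical numbers: for a 2M page
	 * (psize = 0x200000, pmask = ~0x1fffffUL) and
	 * address = 0xffff888001234000, nextpage_addr is
	 * 0xffff888001400000, so numpages = 0x1cc 4k pages remain
	 * inside this large page.
	 */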
485 505
486 /* 506 /*
487 * We are safe now. Check whether the new pgprot is the same: 507 * We are safe now. Check whether the new pgprot is the same:
488 * Convert protection attributes to 4k-format, as cpa->mask* are set 508 * Convert protection attributes to 4k-format, as cpa->mask* are set
489 * up accordingly. 509 * up accordingly.
490 */ 510 */
491 old_pte = *kpte; 511 old_pte = *kpte;
492 old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte)); 512 old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte));
493 513
494 pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); 514 pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
495 pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); 515 pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
496 516
497 /* 517 /*
498 * req_prot is in the format of 4k pages. It must be converted to large 518 * req_prot is in the format of 4k pages. It must be converted to large
499 * page format: the caching mode includes the PAT bit located at 519 * page format: the caching mode includes the PAT bit located at
500 * different bit positions in the two formats. 520 * different bit positions in the two formats.
501 */ 521 */
502 req_prot = pgprot_4k_2_large(req_prot); 522 req_prot = pgprot_4k_2_large(req_prot);
503 523
504 /* 524 /*
505 * Set the PSE and GLOBAL flags only if the PRESENT flag is 525 * Set the PSE and GLOBAL flags only if the PRESENT flag is
506 * set otherwise pmd_present/pmd_huge will return true even on 526 * set otherwise pmd_present/pmd_huge will return true even on
507 * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL 527 * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL
508 * for the ancient hardware that doesn't support it. 528 * for the ancient hardware that doesn't support it.
509 */ 529 */
510 if (pgprot_val(req_prot) & _PAGE_PRESENT) 530 if (pgprot_val(req_prot) & _PAGE_PRESENT)
511 pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL; 531 pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL;
512 else 532 else
513 pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); 533 pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL);
514 534
515 req_prot = canon_pgprot(req_prot); 535 req_prot = canon_pgprot(req_prot);
516 536
517 /* 537 /*
518 * old_pte points to the large page base address. So we need 538 * old_pte points to the large page base address. So we need
519 * to add the offset of the virtual address: 539 * to add the offset of the virtual address:
520 */ 540 */
521 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); 541 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
522 cpa->pfn = pfn; 542 cpa->pfn = pfn;
523 543
524 new_prot = static_protections(req_prot, address, pfn); 544 new_prot = static_protections(req_prot, address, pfn);
525 545
526 /* 546 /*
527 * We need to check the full range, whether 547 * We need to check the full range, whether
528 * static_protection() requires a different pgprot for one of 548 * static_protection() requires a different pgprot for one of
529 * the pages in the range we try to preserve: 549 * the pages in the range we try to preserve:
530 */ 550 */
531 addr = address & pmask; 551 addr = address & pmask;
532 pfn = pte_pfn(old_pte); 552 pfn = pte_pfn(old_pte);
533 for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { 553 for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
534 pgprot_t chk_prot = static_protections(req_prot, addr, pfn); 554 pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
535 555
536 if (pgprot_val(chk_prot) != pgprot_val(new_prot)) 556 if (pgprot_val(chk_prot) != pgprot_val(new_prot))
537 goto out_unlock; 557 goto out_unlock;
538 } 558 }
539 559
540 /* 560 /*
541 * If there are no changes, return. cpa->numpages has been updated 561 * If there are no changes, return. cpa->numpages has been updated
542 * above: 562 * above:
543 */ 563 */
544 if (pgprot_val(new_prot) == pgprot_val(old_prot)) { 564 if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
545 do_split = 0; 565 do_split = 0;
546 goto out_unlock; 566 goto out_unlock;
547 } 567 }
548 568
549 /* 569 /*
550 * We need to change the attributes. Check, whether we can 570 * We need to change the attributes. Check, whether we can
551 * change the large page in one go. We request a split, when 571 * change the large page in one go. We request a split, when
552 * the address is not aligned and the number of pages is 572 * the address is not aligned and the number of pages is
553 * smaller than the number of pages in the large page. Note 573 * smaller than the number of pages in the large page. Note
554 * that we limited the number of possible pages already to 574 * that we limited the number of possible pages already to
555 * the number of pages in the large page. 575 * the number of pages in the large page.
556 */ 576 */
557 if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { 577 if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
558 /* 578 /*
559 * The address is aligned and the number of pages 579 * The address is aligned and the number of pages
560 * covers the full page. 580 * covers the full page.
561 */ 581 */
562 new_pte = pfn_pte(pte_pfn(old_pte), new_prot); 582 new_pte = pfn_pte(pte_pfn(old_pte), new_prot);
563 __set_pmd_pte(kpte, address, new_pte); 583 __set_pmd_pte(kpte, address, new_pte);
564 cpa->flags |= CPA_FLUSHTLB; 584 cpa->flags |= CPA_FLUSHTLB;
565 do_split = 0; 585 do_split = 0;
566 } 586 }
567 587
568 out_unlock: 588 out_unlock:
569 spin_unlock(&pgd_lock); 589 spin_unlock(&pgd_lock);
570 590
571 return do_split; 591 return do_split;
572 } 592 }
573 593
574 static int 594 static int
575 __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, 595 __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
576 struct page *base) 596 struct page *base)
577 { 597 {
578 pte_t *pbase = (pte_t *)page_address(base); 598 pte_t *pbase = (pte_t *)page_address(base);
579 unsigned long pfn, pfninc = 1; 599 unsigned long pfn, pfninc = 1;
580 unsigned int i, level; 600 unsigned int i, level;
581 pte_t *tmp; 601 pte_t *tmp;
582 pgprot_t ref_prot; 602 pgprot_t ref_prot;
583 603
584 spin_lock(&pgd_lock); 604 spin_lock(&pgd_lock);
585 /* 605 /*
586 * Check for races, another CPU might have split this page 606 * Check for races, another CPU might have split this page
587 * up for us already: 607 * up for us already:
588 */ 608 */
589 tmp = _lookup_address_cpa(cpa, address, &level); 609 tmp = _lookup_address_cpa(cpa, address, &level);
590 if (tmp != kpte) { 610 if (tmp != kpte) {
591 spin_unlock(&pgd_lock); 611 spin_unlock(&pgd_lock);
592 return 1; 612 return 1;
593 } 613 }
594 614
595 paravirt_alloc_pte(&init_mm, page_to_pfn(base)); 615 paravirt_alloc_pte(&init_mm, page_to_pfn(base));
596 ref_prot = pte_pgprot(pte_clrhuge(*kpte)); 616 ref_prot = pte_pgprot(pte_clrhuge(*kpte));
597 617
598 /* promote PAT bit to correct position */ 618 /* promote PAT bit to correct position */
599 if (level == PG_LEVEL_2M) 619 if (level == PG_LEVEL_2M)
600 ref_prot = pgprot_large_2_4k(ref_prot); 620 ref_prot = pgprot_large_2_4k(ref_prot);
601 621
602 #ifdef CONFIG_X86_64 622 #ifdef CONFIG_X86_64
603 if (level == PG_LEVEL_1G) { 623 if (level == PG_LEVEL_1G) {
604 pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; 624 pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
605 /* 625 /*
606 * Set the PSE flags only if the PRESENT flag is set 626 * Set the PSE flags only if the PRESENT flag is set
607 * otherwise pmd_present/pmd_huge will return true 627 * otherwise pmd_present/pmd_huge will return true
608 * even on a non present pmd. 628 * even on a non present pmd.
609 */ 629 */
610 if (pgprot_val(ref_prot) & _PAGE_PRESENT) 630 if (pgprot_val(ref_prot) & _PAGE_PRESENT)
611 pgprot_val(ref_prot) |= _PAGE_PSE; 631 pgprot_val(ref_prot) |= _PAGE_PSE;
612 else 632 else
613 pgprot_val(ref_prot) &= ~_PAGE_PSE; 633 pgprot_val(ref_prot) &= ~_PAGE_PSE;
614 } 634 }
615 #endif 635 #endif
616 636
617 /* 637 /*
618 * Set the GLOBAL flags only if the PRESENT flag is set 638 * Set the GLOBAL flags only if the PRESENT flag is set
619 * otherwise pmd/pte_present will return true even on a non 639 * otherwise pmd/pte_present will return true even on a non
620 * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL 640 * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL
621 * for the ancient hardware that doesn't support it. 641 * for the ancient hardware that doesn't support it.
622 */ 642 */
623 if (pgprot_val(ref_prot) & _PAGE_PRESENT) 643 if (pgprot_val(ref_prot) & _PAGE_PRESENT)
624 pgprot_val(ref_prot) |= _PAGE_GLOBAL; 644 pgprot_val(ref_prot) |= _PAGE_GLOBAL;
625 else 645 else
626 pgprot_val(ref_prot) &= ~_PAGE_GLOBAL; 646 pgprot_val(ref_prot) &= ~_PAGE_GLOBAL;
627 647
628 /* 648 /*
629 * Get the target pfn from the original entry: 649 * Get the target pfn from the original entry:
630 */ 650 */
631 pfn = pte_pfn(*kpte); 651 pfn = pte_pfn(*kpte);
632 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 652 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
633 set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); 653 set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
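	/*
	 * Worked example: splitting a 2M page produces PTRS_PER_PTE == 512
	 * 4k PTEs with pfninc == 1; splitting a 1G page produces 512 2M
	 * entries with pfninc == PMD_PAGE_SIZE >> PAGE_SHIFT == 512, so in
	 * both cases the children cover exactly the original range.
	 */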
634 654
635 if (pfn_range_is_mapped(PFN_DOWN(__pa(address)), 655 if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
636 PFN_DOWN(__pa(address)) + 1)) 656 PFN_DOWN(__pa(address)) + 1))
637 split_page_count(level); 657 split_page_count(level);
638 658
639 /* 659 /*
640 * Install the new, split up pagetable. 660 * Install the new, split up pagetable.
641 * 661 *
642 * We use the standard kernel pagetable protections for the new 662 * We use the standard kernel pagetable protections for the new
643 * pagetable protections, the actual ptes set above control the 663 * pagetable protections, the actual ptes set above control the
644 * primary protection behavior: 664 * primary protection behavior:
645 */ 665 */
646 __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); 666 __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
647 667
648 /* 668 /*
649 * Intel Atom errata AAH41 workaround. 669 * Intel Atom errata AAH41 workaround.
650 * 670 *
651 * The real fix should be in hw or in a microcode update, but 671 * The real fix should be in hw or in a microcode update, but
652 * we also probabilistically try to reduce the window of having 672 * we also probabilistically try to reduce the window of having
653 * a large TLB mixed with 4K TLBs while instruction fetches are 673 * a large TLB mixed with 4K TLBs while instruction fetches are
654 * going on. 674 * going on.
655 */ 675 */
656 __flush_tlb_all(); 676 __flush_tlb_all();
657 spin_unlock(&pgd_lock); 677 spin_unlock(&pgd_lock);
658 678
659 return 0; 679 return 0;
660 } 680 }
661 681
662 static int split_large_page(struct cpa_data *cpa, pte_t *kpte, 682 static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
663 unsigned long address) 683 unsigned long address)
664 { 684 {
665 struct page *base; 685 struct page *base;
666 686
667 if (!debug_pagealloc) 687 if (!debug_pagealloc)
668 spin_unlock(&cpa_lock); 688 spin_unlock(&cpa_lock);
669 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); 689 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
670 if (!debug_pagealloc) 690 if (!debug_pagealloc)
671 spin_lock(&cpa_lock); 691 spin_lock(&cpa_lock);
672 if (!base) 692 if (!base)
673 return -ENOMEM; 693 return -ENOMEM;
674 694
675 if (__split_large_page(cpa, kpte, address, base)) 695 if (__split_large_page(cpa, kpte, address, base))
676 __free_page(base); 696 __free_page(base);
677 697
678 return 0; 698 return 0;
679 } 699 }
680 700
681 static bool try_to_free_pte_page(pte_t *pte) 701 static bool try_to_free_pte_page(pte_t *pte)
682 { 702 {
683 int i; 703 int i;
684 704
685 for (i = 0; i < PTRS_PER_PTE; i++) 705 for (i = 0; i < PTRS_PER_PTE; i++)
686 if (!pte_none(pte[i])) 706 if (!pte_none(pte[i]))
687 return false; 707 return false;
688 708
689 free_page((unsigned long)pte); 709 free_page((unsigned long)pte);
690 return true; 710 return true;
691 } 711 }
692 712
693 static bool try_to_free_pmd_page(pmd_t *pmd) 713 static bool try_to_free_pmd_page(pmd_t *pmd)
694 { 714 {
695 int i; 715 int i;
696 716
697 for (i = 0; i < PTRS_PER_PMD; i++) 717 for (i = 0; i < PTRS_PER_PMD; i++)
698 if (!pmd_none(pmd[i])) 718 if (!pmd_none(pmd[i]))
699 return false; 719 return false;
700 720
701 free_page((unsigned long)pmd); 721 free_page((unsigned long)pmd);
702 return true; 722 return true;
703 } 723 }
704 724
705 static bool try_to_free_pud_page(pud_t *pud) 725 static bool try_to_free_pud_page(pud_t *pud)
706 { 726 {
707 int i; 727 int i;
708 728
709 for (i = 0; i < PTRS_PER_PUD; i++) 729 for (i = 0; i < PTRS_PER_PUD; i++)
710 if (!pud_none(pud[i])) 730 if (!pud_none(pud[i]))
711 return false; 731 return false;
712 732
713 free_page((unsigned long)pud); 733 free_page((unsigned long)pud);
714 return true; 734 return true;
715 } 735 }
716 736
717 static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) 737 static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
718 { 738 {
719 pte_t *pte = pte_offset_kernel(pmd, start); 739 pte_t *pte = pte_offset_kernel(pmd, start);
720 740
721 while (start < end) { 741 while (start < end) {
722 set_pte(pte, __pte(0)); 742 set_pte(pte, __pte(0));
723 743
724 start += PAGE_SIZE; 744 start += PAGE_SIZE;
725 pte++; 745 pte++;
726 } 746 }
727 747
728 if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { 748 if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
729 pmd_clear(pmd); 749 pmd_clear(pmd);
730 return true; 750 return true;
731 } 751 }
732 return false; 752 return false;
733 } 753 }
734 754
735 static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, 755 static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
736 unsigned long start, unsigned long end) 756 unsigned long start, unsigned long end)
737 { 757 {
738 if (unmap_pte_range(pmd, start, end)) 758 if (unmap_pte_range(pmd, start, end))
739 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) 759 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
740 pud_clear(pud); 760 pud_clear(pud);
741 } 761 }
742 762
743 static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) 763 static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
744 { 764 {
745 pmd_t *pmd = pmd_offset(pud, start); 765 pmd_t *pmd = pmd_offset(pud, start);
746 766
747 /* 767 /*
748 * Not on a 2MB page boundary? 768 * Not on a 2MB page boundary?
749 */ 769 */
750 if (start & (PMD_SIZE - 1)) { 770 if (start & (PMD_SIZE - 1)) {
751 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; 771 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
752 unsigned long pre_end = min_t(unsigned long, end, next_page); 772 unsigned long pre_end = min_t(unsigned long, end, next_page);
753 773
754 __unmap_pmd_range(pud, pmd, start, pre_end); 774 __unmap_pmd_range(pud, pmd, start, pre_end);
755 775
756 start = pre_end; 776 start = pre_end;
757 pmd++; 777 pmd++;
758 } 778 }
759 779
760 /* 780 /*
761 * Try to unmap in 2M chunks. 781 * Try to unmap in 2M chunks.
762 */ 782 */
763 while (end - start >= PMD_SIZE) { 783 while (end - start >= PMD_SIZE) {
764 if (pmd_large(*pmd)) 784 if (pmd_large(*pmd))
765 pmd_clear(pmd); 785 pmd_clear(pmd);
766 else 786 else
767 __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); 787 __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
768 788
769 start += PMD_SIZE; 789 start += PMD_SIZE;
770 pmd++; 790 pmd++;
771 } 791 }
772 792
773 /* 793 /*
774 * 4K leftovers? 794 * 4K leftovers?
775 */ 795 */
776 if (start < end) 796 if (start < end)
777 return __unmap_pmd_range(pud, pmd, start, end); 797 return __unmap_pmd_range(pud, pmd, start, end);
778 798
779 /* 799 /*
780 * Try again to free the PMD page if we haven't succeeded above. 800 * Try again to free the PMD page if we haven't succeeded above.
781 */ 801 */
782 if (!pud_none(*pud)) 802 if (!pud_none(*pud))
783 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) 803 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
784 pud_clear(pud); 804 pud_clear(pud);
785 } 805 }
786 806
787 static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) 807 static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
788 { 808 {
789 pud_t *pud = pud_offset(pgd, start); 809 pud_t *pud = pud_offset(pgd, start);
790 810
791 /* 811 /*
792 * Not on a GB page boundary? 812 * Not on a GB page boundary?
793 */ 813 */
794 if (start & (PUD_SIZE - 1)) { 814 if (start & (PUD_SIZE - 1)) {
795 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; 815 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
796 unsigned long pre_end = min_t(unsigned long, end, next_page); 816 unsigned long pre_end = min_t(unsigned long, end, next_page);
797 817
798 unmap_pmd_range(pud, start, pre_end); 818 unmap_pmd_range(pud, start, pre_end);
799 819
800 start = pre_end; 820 start = pre_end;
801 pud++; 821 pud++;
802 } 822 }
803 823
804 /* 824 /*
805 * Try to unmap in 1G chunks. 825 * Try to unmap in 1G chunks.
806 */ 826 */
807 while (end - start >= PUD_SIZE) { 827 while (end - start >= PUD_SIZE) {
808 828
809 if (pud_large(*pud)) 829 if (pud_large(*pud))
810 pud_clear(pud); 830 pud_clear(pud);
811 else 831 else
812 unmap_pmd_range(pud, start, start + PUD_SIZE); 832 unmap_pmd_range(pud, start, start + PUD_SIZE);
813 833
814 start += PUD_SIZE; 834 start += PUD_SIZE;
815 pud++; 835 pud++;
816 } 836 }
817 837
818 /* 838 /*
819 * 2M leftovers? 839 * 2M leftovers?
820 */ 840 */
821 if (start < end) 841 if (start < end)
822 unmap_pmd_range(pud, start, end); 842 unmap_pmd_range(pud, start, end);
823 843
824 /* 844 /*
825 * No need to try to free the PUD page because we'll free it in 845 * No need to try to free the PUD page because we'll free it in
826 * populate_pgd's error path 846 * populate_pgd's error path
827 */ 847 */
828 } 848 }
829 849
830 static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end) 850 static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
831 { 851 {
832 pgd_t *pgd_entry = root + pgd_index(addr); 852 pgd_t *pgd_entry = root + pgd_index(addr);
833 853
834 unmap_pud_range(pgd_entry, addr, end); 854 unmap_pud_range(pgd_entry, addr, end);
835 855
836 if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry))) 856 if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
837 pgd_clear(pgd_entry); 857 pgd_clear(pgd_entry);
838 } 858 }
839 859
840 static int alloc_pte_page(pmd_t *pmd) 860 static int alloc_pte_page(pmd_t *pmd)
841 { 861 {
842 pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 862 pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
843 if (!pte) 863 if (!pte)
844 return -1; 864 return -1;
845 865
846 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); 866 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
847 return 0; 867 return 0;
848 } 868 }
849 869
850 static int alloc_pmd_page(pud_t *pud) 870 static int alloc_pmd_page(pud_t *pud)
851 { 871 {
852 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 872 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
853 if (!pmd) 873 if (!pmd)
854 return -1; 874 return -1;
855 875
856 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); 876 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
857 return 0; 877 return 0;
858 } 878 }
859 879
860 static void populate_pte(struct cpa_data *cpa, 880 static void populate_pte(struct cpa_data *cpa,
861 unsigned long start, unsigned long end, 881 unsigned long start, unsigned long end,
862 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) 882 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
863 { 883 {
864 pte_t *pte; 884 pte_t *pte;
865 885
866 pte = pte_offset_kernel(pmd, start); 886 pte = pte_offset_kernel(pmd, start);
867 887
868 while (num_pages-- && start < end) { 888 while (num_pages-- && start < end) {
869 889
870 /* deal with the NX bit */ 890 /* deal with the NX bit */
871 if (!(pgprot_val(pgprot) & _PAGE_NX)) 891 if (!(pgprot_val(pgprot) & _PAGE_NX))
872 cpa->pfn &= ~_PAGE_NX; 892 cpa->pfn &= ~_PAGE_NX;
873 893
874 set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot)); 894 set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
875 895
876 start += PAGE_SIZE; 896 start += PAGE_SIZE;
877 cpa->pfn += PAGE_SIZE; 897 cpa->pfn += PAGE_SIZE;
878 pte++; 898 pte++;
879 } 899 }
880 } 900 }
881 901
882 static int populate_pmd(struct cpa_data *cpa, 902 static int populate_pmd(struct cpa_data *cpa,
883 unsigned long start, unsigned long end, 903 unsigned long start, unsigned long end,
884 unsigned num_pages, pud_t *pud, pgprot_t pgprot) 904 unsigned num_pages, pud_t *pud, pgprot_t pgprot)
885 { 905 {
886 unsigned int cur_pages = 0; 906 unsigned int cur_pages = 0;
887 pmd_t *pmd; 907 pmd_t *pmd;
888 pgprot_t pmd_pgprot; 908 pgprot_t pmd_pgprot;
889 909
890 /* 910 /*
891 * Not on a 2M boundary? 911 * Not on a 2M boundary?
892 */ 912 */
893 if (start & (PMD_SIZE - 1)) { 913 if (start & (PMD_SIZE - 1)) {
894 unsigned long pre_end = start + (num_pages << PAGE_SHIFT); 914 unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
895 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; 915 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
896 916
897 pre_end = min_t(unsigned long, pre_end, next_page); 917 pre_end = min_t(unsigned long, pre_end, next_page);
898 cur_pages = (pre_end - start) >> PAGE_SHIFT; 918 cur_pages = (pre_end - start) >> PAGE_SHIFT;
899 cur_pages = min_t(unsigned int, num_pages, cur_pages); 919 cur_pages = min_t(unsigned int, num_pages, cur_pages);
900 920
901 /* 921 /*
902 * Need a PTE page? 922 * Need a PTE page?
903 */ 923 */
904 pmd = pmd_offset(pud, start); 924 pmd = pmd_offset(pud, start);
905 if (pmd_none(*pmd)) 925 if (pmd_none(*pmd))
906 if (alloc_pte_page(pmd)) 926 if (alloc_pte_page(pmd))
907 return -1; 927 return -1;
908 928
909 populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); 929 populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
910 930
911 start = pre_end; 931 start = pre_end;
912 } 932 }
913 933
914 /* 934 /*
915 * We mapped them all? 935 * We mapped them all?
916 */ 936 */
917 if (num_pages == cur_pages) 937 if (num_pages == cur_pages)
918 return cur_pages; 938 return cur_pages;
919 939
920 pmd_pgprot = pgprot_4k_2_large(pgprot); 940 pmd_pgprot = pgprot_4k_2_large(pgprot);
921 941
922 while (end - start >= PMD_SIZE) { 942 while (end - start >= PMD_SIZE) {
923 943
924 /* 944 /*
925 * We cannot use a 1G page so allocate a PMD page if needed. 945 * We cannot use a 1G page so allocate a PMD page if needed.
926 */ 946 */
927 if (pud_none(*pud)) 947 if (pud_none(*pud))
928 if (alloc_pmd_page(pud)) 948 if (alloc_pmd_page(pud))
929 return -1; 949 return -1;
930 950
931 pmd = pmd_offset(pud, start); 951 pmd = pmd_offset(pud, start);
932 952
933 set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | 953 set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE |
934 massage_pgprot(pmd_pgprot))); 954 massage_pgprot(pmd_pgprot)));
935 955
936 start += PMD_SIZE; 956 start += PMD_SIZE;
937 cpa->pfn += PMD_SIZE; 957 cpa->pfn += PMD_SIZE;
938 cur_pages += PMD_SIZE >> PAGE_SHIFT; 958 cur_pages += PMD_SIZE >> PAGE_SHIFT;
939 } 959 }
940 960
941 /* 961 /*
942 * Map trailing 4K pages. 962 * Map trailing 4K pages.
943 */ 963 */
944 if (start < end) { 964 if (start < end) {
945 pmd = pmd_offset(pud, start); 965 pmd = pmd_offset(pud, start);
946 if (pmd_none(*pmd)) 966 if (pmd_none(*pmd))
947 if (alloc_pte_page(pmd)) 967 if (alloc_pte_page(pmd))
948 return -1; 968 return -1;
949 969
950 populate_pte(cpa, start, end, num_pages - cur_pages, 970 populate_pte(cpa, start, end, num_pages - cur_pages,
951 pmd, pgprot); 971 pmd, pgprot);
952 } 972 }
953 return num_pages; 973 return num_pages;
954 } 974 }
955 975
956 static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, 976 static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
957 pgprot_t pgprot) 977 pgprot_t pgprot)
958 { 978 {
959 pud_t *pud; 979 pud_t *pud;
960 unsigned long end; 980 unsigned long end;
961 int cur_pages = 0; 981 int cur_pages = 0;
962 pgprot_t pud_pgprot; 982 pgprot_t pud_pgprot;
963 983
964 end = start + (cpa->numpages << PAGE_SHIFT); 984 end = start + (cpa->numpages << PAGE_SHIFT);
965 985
966 /* 986 /*
967 * Not on a Gb page boundary? => map everything up to it with 987 * Not on a Gb page boundary? => map everything up to it with
968 * smaller pages. 988 * smaller pages.
969 */ 989 */
970 if (start & (PUD_SIZE - 1)) { 990 if (start & (PUD_SIZE - 1)) {
971 unsigned long pre_end; 991 unsigned long pre_end;
972 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; 992 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
973 993
974 pre_end = min_t(unsigned long, end, next_page); 994 pre_end = min_t(unsigned long, end, next_page);
975 cur_pages = (pre_end - start) >> PAGE_SHIFT; 995 cur_pages = (pre_end - start) >> PAGE_SHIFT;
976 cur_pages = min_t(int, (int)cpa->numpages, cur_pages); 996 cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
977 997
978 pud = pud_offset(pgd, start); 998 pud = pud_offset(pgd, start);
979 999
980 /* 1000 /*
981 * Need a PMD page? 1001 * Need a PMD page?
982 */ 1002 */
983 if (pud_none(*pud)) 1003 if (pud_none(*pud))
984 if (alloc_pmd_page(pud)) 1004 if (alloc_pmd_page(pud))
985 return -1; 1005 return -1;
986 1006
987 cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, 1007 cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
988 pud, pgprot); 1008 pud, pgprot);
989 if (cur_pages < 0) 1009 if (cur_pages < 0)
990 return cur_pages; 1010 return cur_pages;
991 1011
992 start = pre_end; 1012 start = pre_end;
993 } 1013 }
994 1014
995 /* We mapped them all? */ 1015 /* We mapped them all? */
996 if (cpa->numpages == cur_pages) 1016 if (cpa->numpages == cur_pages)
997 return cur_pages; 1017 return cur_pages;
998 1018
999 pud = pud_offset(pgd, start); 1019 pud = pud_offset(pgd, start);
1000 pud_pgprot = pgprot_4k_2_large(pgprot); 1020 pud_pgprot = pgprot_4k_2_large(pgprot);
1001 1021
1002 /* 1022 /*
1003 * Map everything starting from the Gb boundary, possibly with 1G pages 1023 * Map everything starting from the Gb boundary, possibly with 1G pages
1004 */ 1024 */
1005 while (end - start >= PUD_SIZE) { 1025 while (end - start >= PUD_SIZE) {
1006 set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | 1026 set_pud(pud, __pud(cpa->pfn | _PAGE_PSE |
1007 massage_pgprot(pud_pgprot))); 1027 massage_pgprot(pud_pgprot)));
1008 1028
1009 start += PUD_SIZE; 1029 start += PUD_SIZE;
1010 cpa->pfn += PUD_SIZE; 1030 cpa->pfn += PUD_SIZE;
1011 cur_pages += PUD_SIZE >> PAGE_SHIFT; 1031 cur_pages += PUD_SIZE >> PAGE_SHIFT;
1012 pud++; 1032 pud++;
1013 } 1033 }
1014 1034
1015 /* Map trailing leftover */ 1035 /* Map trailing leftover */
1016 if (start < end) { 1036 if (start < end) {
1017 int tmp; 1037 int tmp;
1018 1038
1019 pud = pud_offset(pgd, start); 1039 pud = pud_offset(pgd, start);
1020 if (pud_none(*pud)) 1040 if (pud_none(*pud))
1021 if (alloc_pmd_page(pud)) 1041 if (alloc_pmd_page(pud))
1022 return -1; 1042 return -1;
1023 1043
1024 tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, 1044 tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
1025 pud, pgprot); 1045 pud, pgprot);
1026 if (tmp < 0) 1046 if (tmp < 0)
1027 return cur_pages; 1047 return cur_pages;
1028 1048
1029 cur_pages += tmp; 1049 cur_pages += tmp;
1030 } 1050 }
1031 return cur_pages; 1051 return cur_pages;
1032 } 1052 }
1033 1053
1034 /* 1054 /*
1035 * Restrictions for kernel page table do not necessarily apply when mapping in 1055 * Restrictions for kernel page table do not necessarily apply when mapping in
1036 * an alternate PGD. 1056 * an alternate PGD.
1037 */ 1057 */
1038 static int populate_pgd(struct cpa_data *cpa, unsigned long addr) 1058 static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
1039 { 1059 {
1040 pgprot_t pgprot = __pgprot(_KERNPG_TABLE); 1060 pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
1041 pud_t *pud = NULL; /* shut up gcc */ 1061 pud_t *pud = NULL; /* shut up gcc */
1042 pgd_t *pgd_entry; 1062 pgd_t *pgd_entry;
1043 int ret; 1063 int ret;
1044 1064
1045 pgd_entry = cpa->pgd + pgd_index(addr); 1065 pgd_entry = cpa->pgd + pgd_index(addr);
1046 1066
1047 /* 1067 /*
1048 * Allocate a PUD page and hand it down for mapping. 1068 * Allocate a PUD page and hand it down for mapping.
1049 */ 1069 */
1050 if (pgd_none(*pgd_entry)) { 1070 if (pgd_none(*pgd_entry)) {
1051 pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 1071 pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
1052 if (!pud) 1072 if (!pud)
1053 return -1; 1073 return -1;
1054 1074
1055 set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); 1075 set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
1056 } 1076 }
1057 1077
1058 pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); 1078 pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
1059 pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); 1079 pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
1060 1080
1061 ret = populate_pud(cpa, addr, pgd_entry, pgprot); 1081 ret = populate_pud(cpa, addr, pgd_entry, pgprot);
1062 if (ret < 0) { 1082 if (ret < 0) {
1063 unmap_pgd_range(cpa->pgd, addr, 1083 unmap_pgd_range(cpa->pgd, addr,
1064 addr + (cpa->numpages << PAGE_SHIFT)); 1084 addr + (cpa->numpages << PAGE_SHIFT));
1065 return ret; 1085 return ret;
1066 } 1086 }
1067 1087
1068 cpa->numpages = ret; 1088 cpa->numpages = ret;
1069 return 0; 1089 return 0;
1070 } 1090 }
1071 1091
1072 static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, 1092 static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
1073 int primary) 1093 int primary)
1074 { 1094 {
1075 if (cpa->pgd) 1095 if (cpa->pgd)
1076 return populate_pgd(cpa, vaddr); 1096 return populate_pgd(cpa, vaddr);
1077 1097
1078 /* 1098 /*
1079 * Ignore all non primary paths. 1099 * Ignore all non primary paths.
1080 */ 1100 */
1081 if (!primary) 1101 if (!primary)
1082 return 0; 1102 return 0;
1083 1103
1084 /* 1104 /*
1085 * Ignore the NULL PTE for kernel identity mapping, as it is expected 1105 * Ignore the NULL PTE for kernel identity mapping, as it is expected
1086 * to have holes. 1106 * to have holes.
1087 * Also set numpages to '1' indicating that we processed cpa req for 1107 * Also set numpages to '1' indicating that we processed cpa req for
1088 * one virtual address page and its pfn. TBD: numpages can be set based 1108 * one virtual address page and its pfn. TBD: numpages can be set based
1089 * on the initial value and the level returned by lookup_address(). 1109 * on the initial value and the level returned by lookup_address().
1090 */ 1110 */
1091 if (within(vaddr, PAGE_OFFSET, 1111 if (within(vaddr, PAGE_OFFSET,
1092 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { 1112 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
1093 cpa->numpages = 1; 1113 cpa->numpages = 1;
1094 cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; 1114 cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
1095 return 0; 1115 return 0;
1096 } else { 1116 } else {
1097 WARN(1, KERN_WARNING "CPA: called for zero pte. " 1117 WARN(1, KERN_WARNING "CPA: called for zero pte. "
1098 "vaddr = %lx cpa->vaddr = %lx\n", vaddr, 1118 "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
1099 *cpa->vaddr); 1119 *cpa->vaddr);
1100 1120
1101 return -EFAULT; 1121 return -EFAULT;
1102 } 1122 }
1103 } 1123 }
1104 1124
1105 static int __change_page_attr(struct cpa_data *cpa, int primary) 1125 static int __change_page_attr(struct cpa_data *cpa, int primary)
1106 { 1126 {
1107 unsigned long address; 1127 unsigned long address;
1108 int do_split, err; 1128 int do_split, err;
1109 unsigned int level; 1129 unsigned int level;
1110 pte_t *kpte, old_pte; 1130 pte_t *kpte, old_pte;
1111 1131
1112 if (cpa->flags & CPA_PAGES_ARRAY) { 1132 if (cpa->flags & CPA_PAGES_ARRAY) {
1113 struct page *page = cpa->pages[cpa->curpage]; 1133 struct page *page = cpa->pages[cpa->curpage];
1114 if (unlikely(PageHighMem(page))) 1134 if (unlikely(PageHighMem(page)))
1115 return 0; 1135 return 0;
1116 address = (unsigned long)page_address(page); 1136 address = (unsigned long)page_address(page);
1117 } else if (cpa->flags & CPA_ARRAY) 1137 } else if (cpa->flags & CPA_ARRAY)
1118 address = cpa->vaddr[cpa->curpage]; 1138 address = cpa->vaddr[cpa->curpage];
1119 else 1139 else
1120 address = *cpa->vaddr; 1140 address = *cpa->vaddr;
1121 repeat: 1141 repeat:
1122 kpte = _lookup_address_cpa(cpa, address, &level); 1142 kpte = _lookup_address_cpa(cpa, address, &level);
1123 if (!kpte) 1143 if (!kpte)
1124 return __cpa_process_fault(cpa, address, primary); 1144 return __cpa_process_fault(cpa, address, primary);
1125 1145
1126 old_pte = *kpte; 1146 old_pte = *kpte;
1127 if (!pte_val(old_pte)) 1147 if (!pte_val(old_pte))
1128 return __cpa_process_fault(cpa, address, primary); 1148 return __cpa_process_fault(cpa, address, primary);
1129 1149
1130 if (level == PG_LEVEL_4K) { 1150 if (level == PG_LEVEL_4K) {
1131 pte_t new_pte; 1151 pte_t new_pte;
1132 pgprot_t new_prot = pte_pgprot(old_pte); 1152 pgprot_t new_prot = pte_pgprot(old_pte);
1133 unsigned long pfn = pte_pfn(old_pte); 1153 unsigned long pfn = pte_pfn(old_pte);
1134 1154
1135 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); 1155 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
1136 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); 1156 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
1137 1157
1138 new_prot = static_protections(new_prot, address, pfn); 1158 new_prot = static_protections(new_prot, address, pfn);
1139 1159
1140 /* 1160 /*
1141 * Set the GLOBAL flags only if the PRESENT flag is 1161 * Set the GLOBAL flags only if the PRESENT flag is
1142 * set otherwise pte_present will return true even on 1162 * set otherwise pte_present will return true even on
1143 * a non present pte. The canon_pgprot will clear 1163 * a non present pte. The canon_pgprot will clear
1144 * _PAGE_GLOBAL for the ancient hardware that doesn't 1164 * _PAGE_GLOBAL for the ancient hardware that doesn't
1145 * support it. 1165 * support it.
1146 */ 1166 */
1147 if (pgprot_val(new_prot) & _PAGE_PRESENT) 1167 if (pgprot_val(new_prot) & _PAGE_PRESENT)
1148 pgprot_val(new_prot) |= _PAGE_GLOBAL; 1168 pgprot_val(new_prot) |= _PAGE_GLOBAL;
1149 else 1169 else
1150 pgprot_val(new_prot) &= ~_PAGE_GLOBAL; 1170 pgprot_val(new_prot) &= ~_PAGE_GLOBAL;
1151 1171
1152 /* 1172 /*
1153 * We need to keep the pfn from the existing PTE, 1173 * We need to keep the pfn from the existing PTE,
1154 * after all we're only going to change its attributes, 1174 * after all we're only going to change its attributes,
1155 * not the memory it points to. 1175 * not the memory it points to.
1156 */ 1176 */
1157 new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); 1177 new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
1158 cpa->pfn = pfn; 1178 cpa->pfn = pfn;
1159 /* 1179 /*
1160 * Do we really change anything ? 1180 * Do we really change anything ?
1161 */ 1181 */
1162 if (pte_val(old_pte) != pte_val(new_pte)) { 1182 if (pte_val(old_pte) != pte_val(new_pte)) {
1163 set_pte_atomic(kpte, new_pte); 1183 set_pte_atomic(kpte, new_pte);
1164 cpa->flags |= CPA_FLUSHTLB; 1184 cpa->flags |= CPA_FLUSHTLB;
1165 } 1185 }
1166 cpa->numpages = 1; 1186 cpa->numpages = 1;
1167 return 0; 1187 return 0;
1168 } 1188 }
1169 1189
1170 /* 1190 /*
1171 * Check, whether we can keep the large page intact 1191 * Check, whether we can keep the large page intact
1172 * and just change the pte: 1192 * and just change the pte:
1173 */ 1193 */
1174 do_split = try_preserve_large_page(kpte, address, cpa); 1194 do_split = try_preserve_large_page(kpte, address, cpa);
1175 /* 1195 /*
1176 * When the range fits into the existing large page, 1196 * When the range fits into the existing large page,
1177 * return. cpa->numpages and the CPA_FLUSHTLB flag have been updated in 1197 * return. cpa->numpages and the CPA_FLUSHTLB flag have been updated in
1178 * try_preserve_large_page(): 1198 * try_preserve_large_page():
1179 */ 1199 */
1180 if (do_split <= 0) 1200 if (do_split <= 0)
1181 return do_split; 1201 return do_split;
1182 1202
1183 /* 1203 /*
1184 * We have to split the large page: 1204 * We have to split the large page:
1185 */ 1205 */
1186 err = split_large_page(cpa, kpte, address); 1206 err = split_large_page(cpa, kpte, address);
1187 if (!err) { 1207 if (!err) {
1188 /* 1208 /*
1189 * Do a global flush tlb after splitting the large page 1209 * Do a global flush tlb after splitting the large page
1190 * and before we do the actual change page attribute in the PTE. 1210 * and before we do the actual change page attribute in the PTE.
1191 * 1211 *
1192 * Without this, we violate the TLB application note, which says 1212 * Without this, we violate the TLB application note, which says
1193 * "The TLBs may contain both ordinary and large-page 1213 * "The TLBs may contain both ordinary and large-page
1194 * translations for a 4-KByte range of linear addresses. This 1214 * translations for a 4-KByte range of linear addresses. This
1195 * may occur if software modifies the paging structures so that 1215 * may occur if software modifies the paging structures so that
1196 * the page size used for the address range changes. If the two 1216 * the page size used for the address range changes. If the two
1197 * translations differ with respect to page frame or attributes 1217 * translations differ with respect to page frame or attributes
1198 * (e.g., permissions), processor behavior is undefined and may 1218 * (e.g., permissions), processor behavior is undefined and may
1199 * be implementation-specific." 1219 * be implementation-specific."
1200 * 1220 *
1201 * We do this global tlb flush inside the cpa_lock, so that we 1221 * We do this global tlb flush inside the cpa_lock, so that we
1202 * don't allow any other cpu with stale tlb entries to change the 1222 * don't allow any other cpu with stale tlb entries to change the
1203 * page attribute in parallel for an address that also falls into 1223 * page attribute in parallel for an address that also falls into
1204 * the just-split large page entry. 1224 * the just-split large page entry.
1205 */ 1225 */
1206 flush_tlb_all(); 1226 flush_tlb_all();
1207 goto repeat; 1227 goto repeat;
1208 } 1228 }
1209 1229
1210 return err; 1230 return err;
1211 } 1231 }
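The 4K path above boils down to clear-then-set mask arithmetic on the old protection bits: cpa->mask_clr is applied before cpa->mask_set, so a bit named in both masks ends up set. A stand-alone sketch of that ordering, using made-up bit values rather than the real _PAGE_* constants:

/* Illustrative only: the same clear-then-set mask arithmetic used in
 * __change_page_attr(), reduced to plain integers so it compiles and
 * runs in user space. */
#include <assert.h>

static unsigned long apply_masks(unsigned long prot,
                                 unsigned long mask_clr,
                                 unsigned long mask_set)
{
        prot &= ~mask_clr;      /* drop every bit in mask_clr ... */
        prot |= mask_set;       /* ... then add every bit in mask_set */
        return prot;
}

int main(void)
{
        /* 0x63 stands in for present|rw|user|accessed; clear the RW-like
         * bit (0x2) and set a GLOBAL-like bit (0x100) */
        assert(apply_masks(0x63, 0x2, 0x100) == 0x161);
        return 0;
}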
1212 1232
1213 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); 1233 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
1214 1234
1215 static int cpa_process_alias(struct cpa_data *cpa) 1235 static int cpa_process_alias(struct cpa_data *cpa)
1216 { 1236 {
1217 struct cpa_data alias_cpa; 1237 struct cpa_data alias_cpa;
1218 unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); 1238 unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
1219 unsigned long vaddr; 1239 unsigned long vaddr;
1220 int ret; 1240 int ret;
1221 1241
1222 if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) 1242 if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
1223 return 0; 1243 return 0;
1224 1244
1225 /* 1245 /*
1226 * No need to redo, when the primary call touched the direct 1246 * No need to redo, when the primary call touched the direct
1227 * mapping already: 1247 * mapping already:
1228 */ 1248 */
1229 if (cpa->flags & CPA_PAGES_ARRAY) { 1249 if (cpa->flags & CPA_PAGES_ARRAY) {
1230 struct page *page = cpa->pages[cpa->curpage]; 1250 struct page *page = cpa->pages[cpa->curpage];
1231 if (unlikely(PageHighMem(page))) 1251 if (unlikely(PageHighMem(page)))
1232 return 0; 1252 return 0;
1233 vaddr = (unsigned long)page_address(page); 1253 vaddr = (unsigned long)page_address(page);
1234 } else if (cpa->flags & CPA_ARRAY) 1254 } else if (cpa->flags & CPA_ARRAY)
1235 vaddr = cpa->vaddr[cpa->curpage]; 1255 vaddr = cpa->vaddr[cpa->curpage];
1236 else 1256 else
1237 vaddr = *cpa->vaddr; 1257 vaddr = *cpa->vaddr;
1238 1258
1239 if (!(within(vaddr, PAGE_OFFSET, 1259 if (!(within(vaddr, PAGE_OFFSET,
1240 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { 1260 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
1241 1261
1242 alias_cpa = *cpa; 1262 alias_cpa = *cpa;
1243 alias_cpa.vaddr = &laddr; 1263 alias_cpa.vaddr = &laddr;
1244 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); 1264 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1245 1265
1246 ret = __change_page_attr_set_clr(&alias_cpa, 0); 1266 ret = __change_page_attr_set_clr(&alias_cpa, 0);
1247 if (ret) 1267 if (ret)
1248 return ret; 1268 return ret;
1249 } 1269 }
1250 1270
1251 #ifdef CONFIG_X86_64 1271 #ifdef CONFIG_X86_64
1252 /* 1272 /*
1253 * If the primary call didn't touch the high mapping already 1273 * If the primary call didn't touch the high mapping already
1254 * and the physical address is inside the kernel map, we need 1274 * and the physical address is inside the kernel map, we need
1255 * to touch the high mapped kernel as well: 1275 * to touch the high mapped kernel as well:
1256 */ 1276 */
1257 if (!within(vaddr, (unsigned long)_text, _brk_end) && 1277 if (!within(vaddr, (unsigned long)_text, _brk_end) &&
1258 within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { 1278 within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
1259 unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + 1279 unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
1260 __START_KERNEL_map - phys_base; 1280 __START_KERNEL_map - phys_base;
1261 alias_cpa = *cpa; 1281 alias_cpa = *cpa;
1262 alias_cpa.vaddr = &temp_cpa_vaddr; 1282 alias_cpa.vaddr = &temp_cpa_vaddr;
1263 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); 1283 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1264 1284
1265 /* 1285 /*
1266 * The high mapping range is imprecise, so ignore the 1286 * The high mapping range is imprecise, so ignore the
1267 * return value. 1287 * return value.
1268 */ 1288 */
1269 __change_page_attr_set_clr(&alias_cpa, 0); 1289 __change_page_attr_set_clr(&alias_cpa, 0);
1270 } 1290 }
1271 #endif 1291 #endif
1272 1292
1273 return 0; 1293 return 0;
1274 } 1294 }
1275 1295
1276 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) 1296 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
1277 { 1297 {
1278 int ret, numpages = cpa->numpages; 1298 int ret, numpages = cpa->numpages;
1279 1299
1280 while (numpages) { 1300 while (numpages) {
1281 /* 1301 /*
1282 * Store the remaining nr of pages for the large page 1302 * Store the remaining nr of pages for the large page
1283 * preservation check. 1303 * preservation check.
1284 */ 1304 */
1285 cpa->numpages = numpages; 1305 cpa->numpages = numpages;
1286 /* for array changes, we can't use large page */ 1306 /* for array changes, we can't use large page */
1287 if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) 1307 if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
1288 cpa->numpages = 1; 1308 cpa->numpages = 1;
1289 1309
1290 if (!debug_pagealloc) 1310 if (!debug_pagealloc)
1291 spin_lock(&cpa_lock); 1311 spin_lock(&cpa_lock);
1292 ret = __change_page_attr(cpa, checkalias); 1312 ret = __change_page_attr(cpa, checkalias);
1293 if (!debug_pagealloc) 1313 if (!debug_pagealloc)
1294 spin_unlock(&cpa_lock); 1314 spin_unlock(&cpa_lock);
1295 if (ret) 1315 if (ret)
1296 return ret; 1316 return ret;
1297 1317
1298 if (checkalias) { 1318 if (checkalias) {
1299 ret = cpa_process_alias(cpa); 1319 ret = cpa_process_alias(cpa);
1300 if (ret) 1320 if (ret)
1301 return ret; 1321 return ret;
1302 } 1322 }
1303 1323
1304 /* 1324 /*
1305 * Adjust the number of pages with the result of the 1325 * Adjust the number of pages with the result of the
1306 * CPA operation. Either a large page has been 1326 * CPA operation. Either a large page has been
1307 * preserved or a single page update happened. 1327 * preserved or a single page update happened.
1308 */ 1328 */
1309 BUG_ON(cpa->numpages > numpages); 1329 BUG_ON(cpa->numpages > numpages);
1310 numpages -= cpa->numpages; 1330 numpages -= cpa->numpages;
1311 if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) 1331 if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
1312 cpa->curpage++; 1332 cpa->curpage++;
1313 else 1333 else
1314 *cpa->vaddr += cpa->numpages * PAGE_SIZE; 1334 *cpa->vaddr += cpa->numpages * PAGE_SIZE;
1315 1335
1316 } 1336 }
1317 return 0; 1337 return 0;
1318 } 1338 }
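The loop above consumes the request in variable-sized steps: after each inner call, cpa->numpages holds how many pages were actually handled, a full large page when one could be preserved or a single page otherwise, and the remaining count and address advance by that amount. A rough user-space sketch of the bookkeeping, with hypothetical sizes and a stand-in for the inner call:

/* Illustrative sketch of the numpages bookkeeping in
 * __change_page_attr_set_clr(); all values are made up. */
#include <stdio.h>

#define PAGE_SIZE   4096UL
#define PMD_PAGES   512         /* 4K pages per 2MB large page */

int main(void)
{
        unsigned long vaddr = 0x200000;  /* hypothetical, 2MB aligned */
        int numpages = 1030;             /* pages still to process */

        while (numpages) {
                /* stand-in for __change_page_attr(): a 2MB-aligned address
                 * with >= 512 pages left can be handled as one large page */
                int done = (!(vaddr & (PMD_PAGES * PAGE_SIZE - 1)) &&
                            numpages >= PMD_PAGES) ? PMD_PAGES : 1;

                numpages -= done;
                vaddr += done * PAGE_SIZE;
        }
        printf("processed up to %#lx\n", vaddr);
        return 0;
}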
1319 1339
1320 static int change_page_attr_set_clr(unsigned long *addr, int numpages, 1340 static int change_page_attr_set_clr(unsigned long *addr, int numpages,
1321 pgprot_t mask_set, pgprot_t mask_clr, 1341 pgprot_t mask_set, pgprot_t mask_clr,
1322 int force_split, int in_flag, 1342 int force_split, int in_flag,
1323 struct page **pages) 1343 struct page **pages)
1324 { 1344 {
1325 struct cpa_data cpa; 1345 struct cpa_data cpa;
1326 int ret, cache, checkalias; 1346 int ret, cache, checkalias;
1327 unsigned long baddr = 0; 1347 unsigned long baddr = 0;
1328 1348
1329 memset(&cpa, 0, sizeof(cpa)); 1349 memset(&cpa, 0, sizeof(cpa));
1330 1350
1331 /* 1351 /*
1332 * Check if we are requested to change an unsupported 1352 * Check if we are requested to change an unsupported
1333 * feature: 1353 * feature:
1334 */ 1354 */
1335 mask_set = canon_pgprot(mask_set); 1355 mask_set = canon_pgprot(mask_set);
1336 mask_clr = canon_pgprot(mask_clr); 1356 mask_clr = canon_pgprot(mask_clr);
1337 if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) 1357 if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
1338 return 0; 1358 return 0;
1339 1359
1340 /* Ensure we are PAGE_SIZE aligned */ 1360 /* Ensure we are PAGE_SIZE aligned */
1341 if (in_flag & CPA_ARRAY) { 1361 if (in_flag & CPA_ARRAY) {
1342 int i; 1362 int i;
1343 for (i = 0; i < numpages; i++) { 1363 for (i = 0; i < numpages; i++) {
1344 if (addr[i] & ~PAGE_MASK) { 1364 if (addr[i] & ~PAGE_MASK) {
1345 addr[i] &= PAGE_MASK; 1365 addr[i] &= PAGE_MASK;
1346 WARN_ON_ONCE(1); 1366 WARN_ON_ONCE(1);
1347 } 1367 }
1348 } 1368 }
1349 } else if (!(in_flag & CPA_PAGES_ARRAY)) { 1369 } else if (!(in_flag & CPA_PAGES_ARRAY)) {
1350 /* 1370 /*
1351 * in_flag of CPA_PAGES_ARRAY implies it is aligned. 1371 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
1352 * No need to check in that case. 1372 * No need to check in that case.
1353 */ 1373 */
1354 if (*addr & ~PAGE_MASK) { 1374 if (*addr & ~PAGE_MASK) {
1355 *addr &= PAGE_MASK; 1375 *addr &= PAGE_MASK;
1356 /* 1376 /*
1357 * People should not be passing in unaligned addresses: 1377 * People should not be passing in unaligned addresses:
1358 */ 1378 */
1359 WARN_ON_ONCE(1); 1379 WARN_ON_ONCE(1);
1360 } 1380 }
1361 /* 1381 /*
1362 * Save address for cache flush. *addr is modified in the call 1382 * Save address for cache flush. *addr is modified in the call
1363 * to __change_page_attr_set_clr() below. 1383 * to __change_page_attr_set_clr() below.
1364 */ 1384 */
1365 baddr = *addr; 1385 baddr = *addr;
1366 } 1386 }
1367 1387
1368 /* Must avoid aliasing mappings in the highmem code */ 1388 /* Must avoid aliasing mappings in the highmem code */
1369 kmap_flush_unused(); 1389 kmap_flush_unused();
1370 1390
1371 vm_unmap_aliases(); 1391 vm_unmap_aliases();
1372 1392
1373 cpa.vaddr = addr; 1393 cpa.vaddr = addr;
1374 cpa.pages = pages; 1394 cpa.pages = pages;
1375 cpa.numpages = numpages; 1395 cpa.numpages = numpages;
1376 cpa.mask_set = mask_set; 1396 cpa.mask_set = mask_set;
1377 cpa.mask_clr = mask_clr; 1397 cpa.mask_clr = mask_clr;
1378 cpa.flags = 0; 1398 cpa.flags = 0;
1379 cpa.curpage = 0; 1399 cpa.curpage = 0;
1380 cpa.force_split = force_split; 1400 cpa.force_split = force_split;
1381 1401
1382 if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY)) 1402 if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
1383 cpa.flags |= in_flag; 1403 cpa.flags |= in_flag;
1384 1404
1385 /* No alias checking for _NX bit modifications */ 1405 /* No alias checking for _NX bit modifications */
1386 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; 1406 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
1387 1407
1388 ret = __change_page_attr_set_clr(&cpa, checkalias); 1408 ret = __change_page_attr_set_clr(&cpa, checkalias);
1389 1409
1390 /* 1410 /*
1391 * Check whether we really changed something: 1411 * Check whether we really changed something:
1392 */ 1412 */
1393 if (!(cpa.flags & CPA_FLUSHTLB)) 1413 if (!(cpa.flags & CPA_FLUSHTLB))
1394 goto out; 1414 goto out;
1395 1415
1396 /* 1416 /*
1397 * No need to flush, when we did not set any of the caching 1417 * No need to flush, when we did not set any of the caching
1398 * attributes: 1418 * attributes:
1399 */ 1419 */
1400 cache = !!pgprot2cachemode(mask_set); 1420 cache = !!pgprot2cachemode(mask_set);
1401 1421
1402 /* 1422 /*
1403 * On success we use CLFLUSH, when the CPU supports it, to 1423 * On success we use CLFLUSH, when the CPU supports it, to
1404 * avoid the WBINVD. If the CPU does not support CLFLUSH, or in 1424 * avoid the WBINVD. If the CPU does not support CLFLUSH, or in
1405 * the error case, we fall back to cpa_flush_all() (which uses 1425 * the error case, we fall back to cpa_flush_all() (which uses
1406 * WBINVD): 1426 * WBINVD):
1407 */ 1427 */
1408 if (!ret && cpu_has_clflush) { 1428 if (!ret && cpu_has_clflush) {
1409 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { 1429 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
1410 cpa_flush_array(addr, numpages, cache, 1430 cpa_flush_array(addr, numpages, cache,
1411 cpa.flags, pages); 1431 cpa.flags, pages);
1412 } else 1432 } else
1413 cpa_flush_range(baddr, numpages, cache); 1433 cpa_flush_range(baddr, numpages, cache);
1414 } else 1434 } else
1415 cpa_flush_all(cache); 1435 cpa_flush_all(cache);
1416 1436
1417 out: 1437 out:
1418 return ret; 1438 return ret;
1419 } 1439 }
1420 1440
1421 static inline int change_page_attr_set(unsigned long *addr, int numpages, 1441 static inline int change_page_attr_set(unsigned long *addr, int numpages,
1422 pgprot_t mask, int array) 1442 pgprot_t mask, int array)
1423 { 1443 {
1424 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, 1444 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
1425 (array ? CPA_ARRAY : 0), NULL); 1445 (array ? CPA_ARRAY : 0), NULL);
1426 } 1446 }
1427 1447
1428 static inline int change_page_attr_clear(unsigned long *addr, int numpages, 1448 static inline int change_page_attr_clear(unsigned long *addr, int numpages,
1429 pgprot_t mask, int array) 1449 pgprot_t mask, int array)
1430 { 1450 {
1431 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, 1451 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
1432 (array ? CPA_ARRAY : 0), NULL); 1452 (array ? CPA_ARRAY : 0), NULL);
1433 } 1453 }
1434 1454
1435 static inline int cpa_set_pages_array(struct page **pages, int numpages, 1455 static inline int cpa_set_pages_array(struct page **pages, int numpages,
1436 pgprot_t mask) 1456 pgprot_t mask)
1437 { 1457 {
1438 return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, 1458 return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
1439 CPA_PAGES_ARRAY, pages); 1459 CPA_PAGES_ARRAY, pages);
1440 } 1460 }
1441 1461
1442 static inline int cpa_clear_pages_array(struct page **pages, int numpages, 1462 static inline int cpa_clear_pages_array(struct page **pages, int numpages,
1443 pgprot_t mask) 1463 pgprot_t mask)
1444 { 1464 {
1445 return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, 1465 return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
1446 CPA_PAGES_ARRAY, pages); 1466 CPA_PAGES_ARRAY, pages);
1447 } 1467 }
1448 1468
1449 int _set_memory_uc(unsigned long addr, int numpages) 1469 int _set_memory_uc(unsigned long addr, int numpages)
1450 { 1470 {
1451 /* 1471 /*
1452 * for now UC MINUS. see comments in ioremap_nocache() 1472 * for now UC MINUS. see comments in ioremap_nocache()
1453 */ 1473 */
1454 return change_page_attr_set(&addr, numpages, 1474 return change_page_attr_set(&addr, numpages,
1455 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 1475 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1456 0); 1476 0);
1457 } 1477 }
1458 1478
1459 int set_memory_uc(unsigned long addr, int numpages) 1479 int set_memory_uc(unsigned long addr, int numpages)
1460 { 1480 {
1461 int ret; 1481 int ret;
1462 1482
1463 /* 1483 /*
1464 * for now UC MINUS. see comments in ioremap_nocache() 1484 * for now UC MINUS. see comments in ioremap_nocache()
1465 */ 1485 */
1466 ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, 1486 ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1467 _PAGE_CACHE_MODE_UC_MINUS, NULL); 1487 _PAGE_CACHE_MODE_UC_MINUS, NULL);
1468 if (ret) 1488 if (ret)
1469 goto out_err; 1489 goto out_err;
1470 1490
1471 ret = _set_memory_uc(addr, numpages); 1491 ret = _set_memory_uc(addr, numpages);
1472 if (ret) 1492 if (ret)
1473 goto out_free; 1493 goto out_free;
1474 1494
1475 return 0; 1495 return 0;
1476 1496
1477 out_free: 1497 out_free:
1478 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); 1498 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1479 out_err: 1499 out_err:
1480 return ret; 1500 return ret;
1481 } 1501 }
1482 EXPORT_SYMBOL(set_memory_uc); 1502 EXPORT_SYMBOL(set_memory_uc);
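As a usage illustration only (nothing below is part of this change, and the allocation size is an arbitrary assumption), a driver wanting a small uncached buffer would pair set_memory_uc() with set_memory_wb() roughly like this:

/* Hypothetical driver snippet: allocate a few pages, switch them to
 * uncached-minus, and restore write-back before freeing. */
#include <linux/gfp.h>
#include <linux/errno.h>
#include <asm/cacheflush.h>

#define EXAMPLE_ORDER   2                       /* 4 pages, arbitrary */
#define EXAMPLE_PAGES   (1 << EXAMPLE_ORDER)

static unsigned long example_buf;

static int example_map_uncached(void)
{
        int ret;

        example_buf = __get_free_pages(GFP_KERNEL, EXAMPLE_ORDER);
        if (!example_buf)
                return -ENOMEM;

        /* reserves the memtype, then changes the page attributes */
        ret = set_memory_uc(example_buf, EXAMPLE_PAGES);
        if (ret)
                free_pages(example_buf, EXAMPLE_ORDER);
        return ret;
}

static void example_unmap_uncached(void)
{
        /* set_memory_wb() also drops the memtype reservation */
        set_memory_wb(example_buf, EXAMPLE_PAGES);
        free_pages(example_buf, EXAMPLE_ORDER);
}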
1483 1503
1484 static int _set_memory_array(unsigned long *addr, int addrinarray, 1504 static int _set_memory_array(unsigned long *addr, int addrinarray,
1485 enum page_cache_mode new_type) 1505 enum page_cache_mode new_type)
1486 { 1506 {
1487 int i, j; 1507 int i, j;
1488 int ret; 1508 int ret;
1489 1509
1490 /* 1510 /*
1491 * for now UC MINUS. see comments in ioremap_nocache() 1511 * for now UC MINUS. see comments in ioremap_nocache()
1492 */ 1512 */
1493 for (i = 0; i < addrinarray; i++) { 1513 for (i = 0; i < addrinarray; i++) {
1494 ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, 1514 ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
1495 new_type, NULL); 1515 new_type, NULL);
1496 if (ret) 1516 if (ret)
1497 goto out_free; 1517 goto out_free;
1498 } 1518 }
1499 1519
1500 ret = change_page_attr_set(addr, addrinarray, 1520 ret = change_page_attr_set(addr, addrinarray,
1501 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 1521 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1502 1); 1522 1);
1503 1523
1504 if (!ret && new_type == _PAGE_CACHE_MODE_WC) 1524 if (!ret && new_type == _PAGE_CACHE_MODE_WC)
1505 ret = change_page_attr_set_clr(addr, addrinarray, 1525 ret = change_page_attr_set_clr(addr, addrinarray,
1506 cachemode2pgprot( 1526 cachemode2pgprot(
1507 _PAGE_CACHE_MODE_WC), 1527 _PAGE_CACHE_MODE_WC),
1508 __pgprot(_PAGE_CACHE_MASK), 1528 __pgprot(_PAGE_CACHE_MASK),
1509 0, CPA_ARRAY, NULL); 1529 0, CPA_ARRAY, NULL);
1510 if (ret) 1530 if (ret)
1511 goto out_free; 1531 goto out_free;
1512 1532
1513 return 0; 1533 return 0;
1514 1534
1515 out_free: 1535 out_free:
1516 for (j = 0; j < i; j++) 1536 for (j = 0; j < i; j++)
1517 free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE); 1537 free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
1518 1538
1519 return ret; 1539 return ret;
1520 } 1540 }
1521 1541
1522 int set_memory_array_uc(unsigned long *addr, int addrinarray) 1542 int set_memory_array_uc(unsigned long *addr, int addrinarray)
1523 { 1543 {
1524 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); 1544 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
1525 } 1545 }
1526 EXPORT_SYMBOL(set_memory_array_uc); 1546 EXPORT_SYMBOL(set_memory_array_uc);
1527 1547
1528 int set_memory_array_wc(unsigned long *addr, int addrinarray) 1548 int set_memory_array_wc(unsigned long *addr, int addrinarray)
1529 { 1549 {
1530 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC); 1550 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC);
1531 } 1551 }
1532 EXPORT_SYMBOL(set_memory_array_wc); 1552 EXPORT_SYMBOL(set_memory_array_wc);
1533 1553
1534 int _set_memory_wc(unsigned long addr, int numpages) 1554 int _set_memory_wc(unsigned long addr, int numpages)
1535 { 1555 {
1536 int ret; 1556 int ret;
1537 unsigned long addr_copy = addr; 1557 unsigned long addr_copy = addr;
1538 1558
1539 ret = change_page_attr_set(&addr, numpages, 1559 ret = change_page_attr_set(&addr, numpages,
1540 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 1560 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1541 0); 1561 0);
1542 if (!ret) { 1562 if (!ret) {
1543 ret = change_page_attr_set_clr(&addr_copy, numpages, 1563 ret = change_page_attr_set_clr(&addr_copy, numpages,
1544 cachemode2pgprot( 1564 cachemode2pgprot(
1545 _PAGE_CACHE_MODE_WC), 1565 _PAGE_CACHE_MODE_WC),
1546 __pgprot(_PAGE_CACHE_MASK), 1566 __pgprot(_PAGE_CACHE_MASK),
1547 0, 0, NULL); 1567 0, 0, NULL);
1548 } 1568 }
1549 return ret; 1569 return ret;
1550 } 1570 }
1551 1571
1552 int set_memory_wc(unsigned long addr, int numpages) 1572 int set_memory_wc(unsigned long addr, int numpages)
1553 { 1573 {
1554 int ret; 1574 int ret;
1555 1575
1556 if (!pat_enabled) 1576 if (!pat_enabled)
1557 return set_memory_uc(addr, numpages); 1577 return set_memory_uc(addr, numpages);
1558 1578
1559 ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, 1579 ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1560 _PAGE_CACHE_MODE_WC, NULL); 1580 _PAGE_CACHE_MODE_WC, NULL);
1561 if (ret) 1581 if (ret)
1562 goto out_err; 1582 goto out_err;
1563 1583
1564 ret = _set_memory_wc(addr, numpages); 1584 ret = _set_memory_wc(addr, numpages);
1565 if (ret) 1585 if (ret)
1566 goto out_free; 1586 goto out_free;
1567 1587
1568 return 0; 1588 return 0;
1569 1589
1570 out_free: 1590 out_free:
1571 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); 1591 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1572 out_err: 1592 out_err:
1573 return ret; 1593 return ret;
1574 } 1594 }
1575 EXPORT_SYMBOL(set_memory_wc); 1595 EXPORT_SYMBOL(set_memory_wc);
1576 1596
1577 int _set_memory_wb(unsigned long addr, int numpages) 1597 int _set_memory_wb(unsigned long addr, int numpages)
1578 { 1598 {
1579 /* WB cache mode is hard wired to all cache attribute bits being 0 */ 1599 /* WB cache mode is hard wired to all cache attribute bits being 0 */
1580 return change_page_attr_clear(&addr, numpages, 1600 return change_page_attr_clear(&addr, numpages,
1581 __pgprot(_PAGE_CACHE_MASK), 0); 1601 __pgprot(_PAGE_CACHE_MASK), 0);
1582 } 1602 }
1583 1603
1584 int set_memory_wb(unsigned long addr, int numpages) 1604 int set_memory_wb(unsigned long addr, int numpages)
1585 { 1605 {
1586 int ret; 1606 int ret;
1587 1607
1588 ret = _set_memory_wb(addr, numpages); 1608 ret = _set_memory_wb(addr, numpages);
1589 if (ret) 1609 if (ret)
1590 return ret; 1610 return ret;
1591 1611
1592 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); 1612 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1593 return 0; 1613 return 0;
1594 } 1614 }
1595 EXPORT_SYMBOL(set_memory_wb); 1615 EXPORT_SYMBOL(set_memory_wb);
1596 1616
1597 int set_memory_array_wb(unsigned long *addr, int addrinarray) 1617 int set_memory_array_wb(unsigned long *addr, int addrinarray)
1598 { 1618 {
1599 int i; 1619 int i;
1600 int ret; 1620 int ret;
1601 1621
1602 /* WB cache mode is hard wired to all cache attribute bits being 0 */ 1622 /* WB cache mode is hard wired to all cache attribute bits being 0 */
1603 ret = change_page_attr_clear(addr, addrinarray, 1623 ret = change_page_attr_clear(addr, addrinarray,
1604 __pgprot(_PAGE_CACHE_MASK), 1); 1624 __pgprot(_PAGE_CACHE_MASK), 1);
1605 if (ret) 1625 if (ret)
1606 return ret; 1626 return ret;
1607 1627
1608 for (i = 0; i < addrinarray; i++) 1628 for (i = 0; i < addrinarray; i++)
1609 free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); 1629 free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
1610 1630
1611 return 0; 1631 return 0;
1612 } 1632 }
1613 EXPORT_SYMBOL(set_memory_array_wb); 1633 EXPORT_SYMBOL(set_memory_array_wb);
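The array variants batch the memtype reservation and the cache/TLB flushing over a scattered set of pages. A hedged sketch of how a caller might use them, assuming it already holds a list of page-aligned kernel addresses (the names and count below are made up):

/* Hypothetical batch user of the array helpers defined above. */
#include <asm/cacheflush.h>

#define EXAMPLE_NR      16

static unsigned long example_addrs[EXAMPLE_NR]; /* page-aligned kernel vaddrs */

static int example_batch_uncached(void)
{
        /* one flush for the whole batch instead of one per page */
        return set_memory_array_uc(example_addrs, EXAMPLE_NR);
}

static void example_batch_writeback(void)
{
        set_memory_array_wb(example_addrs, EXAMPLE_NR);
}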
1614 1634
1615 int set_memory_x(unsigned long addr, int numpages) 1635 int set_memory_x(unsigned long addr, int numpages)
1616 { 1636 {
1617 if (!(__supported_pte_mask & _PAGE_NX)) 1637 if (!(__supported_pte_mask & _PAGE_NX))
1618 return 0; 1638 return 0;
1619 1639
1620 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); 1640 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
1621 } 1641 }
1622 EXPORT_SYMBOL(set_memory_x); 1642 EXPORT_SYMBOL(set_memory_x);
1623 1643
1624 int set_memory_nx(unsigned long addr, int numpages) 1644 int set_memory_nx(unsigned long addr, int numpages)
1625 { 1645 {
1626 if (!(__supported_pte_mask & _PAGE_NX)) 1646 if (!(__supported_pte_mask & _PAGE_NX))
1627 return 0; 1647 return 0;
1628 1648
1629 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); 1649 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
1630 } 1650 }
1631 EXPORT_SYMBOL(set_memory_nx); 1651 EXPORT_SYMBOL(set_memory_nx);
1632 1652
1633 int set_memory_ro(unsigned long addr, int numpages) 1653 int set_memory_ro(unsigned long addr, int numpages)
1634 { 1654 {
1635 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); 1655 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
1636 } 1656 }
1637 EXPORT_SYMBOL_GPL(set_memory_ro); 1657 EXPORT_SYMBOL_GPL(set_memory_ro);
1638 1658
1639 int set_memory_rw(unsigned long addr, int numpages) 1659 int set_memory_rw(unsigned long addr, int numpages)
1640 { 1660 {
1641 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); 1661 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
1642 } 1662 }
1643 EXPORT_SYMBOL_GPL(set_memory_rw); 1663 EXPORT_SYMBOL_GPL(set_memory_rw);
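A hypothetical caller of the RO/RW helpers, briefly unprotecting one page to patch a word in it and then restoring the protection; the object, its page alignment and the single-page size are assumptions for illustration:

/* Illustrative only: addr is the page-aligned address of the page that
 * contains *slot. */
#include <linux/types.h>
#include <asm/cacheflush.h>

static void example_patch_protected(unsigned long addr, u32 *slot, u32 val)
{
        set_memory_rw(addr, 1);         /* make the single page writable */
        *slot = val;                    /* perform the update */
        set_memory_ro(addr, 1);         /* write-protect it again */
}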
1644 1664
1645 int set_memory_np(unsigned long addr, int numpages) 1665 int set_memory_np(unsigned long addr, int numpages)
1646 { 1666 {
1647 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); 1667 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
1648 } 1668 }
1649 1669
1650 int set_memory_4k(unsigned long addr, int numpages) 1670 int set_memory_4k(unsigned long addr, int numpages)
1651 { 1671 {
1652 return change_page_attr_set_clr(&addr, numpages, __pgprot(0), 1672 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
1653 __pgprot(0), 1, 0, NULL); 1673 __pgprot(0), 1, 0, NULL);
1654 } 1674 }
1655 1675
1656 int set_pages_uc(struct page *page, int numpages) 1676 int set_pages_uc(struct page *page, int numpages)
1657 { 1677 {
1658 unsigned long addr = (unsigned long)page_address(page); 1678 unsigned long addr = (unsigned long)page_address(page);
1659 1679
1660 return set_memory_uc(addr, numpages); 1680 return set_memory_uc(addr, numpages);
1661 } 1681 }
1662 EXPORT_SYMBOL(set_pages_uc); 1682 EXPORT_SYMBOL(set_pages_uc);
1663 1683
1664 static int _set_pages_array(struct page **pages, int addrinarray, 1684 static int _set_pages_array(struct page **pages, int addrinarray,
1665 enum page_cache_mode new_type) 1685 enum page_cache_mode new_type)
1666 { 1686 {
1667 unsigned long start; 1687 unsigned long start;
1668 unsigned long end; 1688 unsigned long end;
1669 int i; 1689 int i;
1670 int free_idx; 1690 int free_idx;
1671 int ret; 1691 int ret;
1672 1692
1673 for (i = 0; i < addrinarray; i++) { 1693 for (i = 0; i < addrinarray; i++) {
1674 if (PageHighMem(pages[i])) 1694 if (PageHighMem(pages[i]))
1675 continue; 1695 continue;
1676 start = page_to_pfn(pages[i]) << PAGE_SHIFT; 1696 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1677 end = start + PAGE_SIZE; 1697 end = start + PAGE_SIZE;
1678 if (reserve_memtype(start, end, new_type, NULL)) 1698 if (reserve_memtype(start, end, new_type, NULL))
1679 goto err_out; 1699 goto err_out;
1680 } 1700 }
1681 1701
1682 ret = cpa_set_pages_array(pages, addrinarray, 1702 ret = cpa_set_pages_array(pages, addrinarray,
1683 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS)); 1703 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS));
1684 if (!ret && new_type == _PAGE_CACHE_MODE_WC) 1704 if (!ret && new_type == _PAGE_CACHE_MODE_WC)
1685 ret = change_page_attr_set_clr(NULL, addrinarray, 1705 ret = change_page_attr_set_clr(NULL, addrinarray,
1686 cachemode2pgprot( 1706 cachemode2pgprot(
1687 _PAGE_CACHE_MODE_WC), 1707 _PAGE_CACHE_MODE_WC),
1688 __pgprot(_PAGE_CACHE_MASK), 1708 __pgprot(_PAGE_CACHE_MASK),
1689 0, CPA_PAGES_ARRAY, pages); 1709 0, CPA_PAGES_ARRAY, pages);
1690 if (ret) 1710 if (ret)
1691 goto err_out; 1711 goto err_out;
1692 return 0; /* Success */ 1712 return 0; /* Success */
1693 err_out: 1713 err_out:
1694 free_idx = i; 1714 free_idx = i;
1695 for (i = 0; i < free_idx; i++) { 1715 for (i = 0; i < free_idx; i++) {
1696 if (PageHighMem(pages[i])) 1716 if (PageHighMem(pages[i]))
1697 continue; 1717 continue;
1698 start = page_to_pfn(pages[i]) << PAGE_SHIFT; 1718 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1699 end = start + PAGE_SIZE; 1719 end = start + PAGE_SIZE;
1700 free_memtype(start, end); 1720 free_memtype(start, end);
1701 } 1721 }
1702 return -EINVAL; 1722 return -EINVAL;
1703 } 1723 }
1704 1724
1705 int set_pages_array_uc(struct page **pages, int addrinarray) 1725 int set_pages_array_uc(struct page **pages, int addrinarray)
1706 { 1726 {
1707 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); 1727 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
1708 } 1728 }
1709 EXPORT_SYMBOL(set_pages_array_uc); 1729 EXPORT_SYMBOL(set_pages_array_uc);
1710 1730
1711 int set_pages_array_wc(struct page **pages, int addrinarray) 1731 int set_pages_array_wc(struct page **pages, int addrinarray)
1712 { 1732 {
1713 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC); 1733 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC);
1714 } 1734 }
1715 EXPORT_SYMBOL(set_pages_array_wc); 1735 EXPORT_SYMBOL(set_pages_array_wc);
1716 1736
1717 int set_pages_wb(struct page *page, int numpages) 1737 int set_pages_wb(struct page *page, int numpages)
1718 { 1738 {
1719 unsigned long addr = (unsigned long)page_address(page); 1739 unsigned long addr = (unsigned long)page_address(page);
1720 1740
1721 return set_memory_wb(addr, numpages); 1741 return set_memory_wb(addr, numpages);
1722 } 1742 }
1723 EXPORT_SYMBOL(set_pages_wb); 1743 EXPORT_SYMBOL(set_pages_wb);
1724 1744
1725 int set_pages_array_wb(struct page **pages, int addrinarray) 1745 int set_pages_array_wb(struct page **pages, int addrinarray)
1726 { 1746 {
1727 int retval; 1747 int retval;
1728 unsigned long start; 1748 unsigned long start;
1729 unsigned long end; 1749 unsigned long end;
1730 int i; 1750 int i;
1731 1751
1732 /* WB cache mode is hard wired to all cache attribute bits being 0 */ 1752 /* WB cache mode is hard wired to all cache attribute bits being 0 */
1733 retval = cpa_clear_pages_array(pages, addrinarray, 1753 retval = cpa_clear_pages_array(pages, addrinarray,
1734 __pgprot(_PAGE_CACHE_MASK)); 1754 __pgprot(_PAGE_CACHE_MASK));
1735 if (retval) 1755 if (retval)
1736 return retval; 1756 return retval;
1737 1757
1738 for (i = 0; i < addrinarray; i++) { 1758 for (i = 0; i < addrinarray; i++) {
1739 if (PageHighMem(pages[i])) 1759 if (PageHighMem(pages[i]))
1740 continue; 1760 continue;
1741 start = page_to_pfn(pages[i]) << PAGE_SHIFT; 1761 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1742 end = start + PAGE_SIZE; 1762 end = start + PAGE_SIZE;
1743 free_memtype(start, end); 1763 free_memtype(start, end);
1744 } 1764 }
1745 1765
1746 return 0; 1766 return 0;
1747 } 1767 }
1748 EXPORT_SYMBOL(set_pages_array_wb); 1768 EXPORT_SYMBOL(set_pages_array_wb);
1749 1769
1750 int set_pages_x(struct page *page, int numpages) 1770 int set_pages_x(struct page *page, int numpages)
1751 { 1771 {
1752 unsigned long addr = (unsigned long)page_address(page); 1772 unsigned long addr = (unsigned long)page_address(page);
1753 1773
1754 return set_memory_x(addr, numpages); 1774 return set_memory_x(addr, numpages);
1755 } 1775 }
1756 EXPORT_SYMBOL(set_pages_x); 1776 EXPORT_SYMBOL(set_pages_x);
1757 1777
1758 int set_pages_nx(struct page *page, int numpages) 1778 int set_pages_nx(struct page *page, int numpages)
1759 { 1779 {
1760 unsigned long addr = (unsigned long)page_address(page); 1780 unsigned long addr = (unsigned long)page_address(page);
1761 1781
1762 return set_memory_nx(addr, numpages); 1782 return set_memory_nx(addr, numpages);
1763 } 1783 }
1764 EXPORT_SYMBOL(set_pages_nx); 1784 EXPORT_SYMBOL(set_pages_nx);
1765 1785
1766 int set_pages_ro(struct page *page, int numpages) 1786 int set_pages_ro(struct page *page, int numpages)
1767 { 1787 {
1768 unsigned long addr = (unsigned long)page_address(page); 1788 unsigned long addr = (unsigned long)page_address(page);
1769 1789
1770 return set_memory_ro(addr, numpages); 1790 return set_memory_ro(addr, numpages);
1771 } 1791 }
1772 1792
1773 int set_pages_rw(struct page *page, int numpages) 1793 int set_pages_rw(struct page *page, int numpages)
1774 { 1794 {
1775 unsigned long addr = (unsigned long)page_address(page); 1795 unsigned long addr = (unsigned long)page_address(page);
1776 1796
1777 return set_memory_rw(addr, numpages); 1797 return set_memory_rw(addr, numpages);
1778 } 1798 }
1779 1799
1780 #ifdef CONFIG_DEBUG_PAGEALLOC 1800 #ifdef CONFIG_DEBUG_PAGEALLOC
1781 1801
1782 static int __set_pages_p(struct page *page, int numpages) 1802 static int __set_pages_p(struct page *page, int numpages)
1783 { 1803 {
1784 unsigned long tempaddr = (unsigned long) page_address(page); 1804 unsigned long tempaddr = (unsigned long) page_address(page);
1785 struct cpa_data cpa = { .vaddr = &tempaddr, 1805 struct cpa_data cpa = { .vaddr = &tempaddr,
1786 .pgd = NULL, 1806 .pgd = NULL,
1787 .numpages = numpages, 1807 .numpages = numpages,
1788 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1808 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1789 .mask_clr = __pgprot(0), 1809 .mask_clr = __pgprot(0),
1790 .flags = 0}; 1810 .flags = 0};
1791 1811
1792 /* 1812 /*
1793 * No alias checking needed for setting the present flag; otherwise, 1813 * No alias checking needed for setting the present flag; otherwise,
1794 * we may need to break large pages for 64-bit kernel text 1814 * we may need to break large pages for 64-bit kernel text
1795 * mappings (this adds to complexity if we want to do this from 1815 * mappings (this adds to complexity if we want to do this from
1796 * atomic context especially). Let's keep it simple! 1816 * atomic context especially). Let's keep it simple!
1797 */ 1817 */
1798 return __change_page_attr_set_clr(&cpa, 0); 1818 return __change_page_attr_set_clr(&cpa, 0);
1799 } 1819 }
1800 1820
1801 static int __set_pages_np(struct page *page, int numpages) 1821 static int __set_pages_np(struct page *page, int numpages)
1802 { 1822 {
1803 unsigned long tempaddr = (unsigned long) page_address(page); 1823 unsigned long tempaddr = (unsigned long) page_address(page);
1804 struct cpa_data cpa = { .vaddr = &tempaddr, 1824 struct cpa_data cpa = { .vaddr = &tempaddr,
1805 .pgd = NULL, 1825 .pgd = NULL,
1806 .numpages = numpages, 1826 .numpages = numpages,
1807 .mask_set = __pgprot(0), 1827 .mask_set = __pgprot(0),
1808 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1828 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1809 .flags = 0}; 1829 .flags = 0};
1810 1830
1811 /* 1831 /*
1812 * No alias checking needed for clearing the present flag; otherwise, 1832 * No alias checking needed for clearing the present flag; otherwise,
1813 * we may need to break large pages for 64-bit kernel text 1833 * we may need to break large pages for 64-bit kernel text
1814 * mappings (this adds to complexity if we want to do this from 1834 * mappings (this adds to complexity if we want to do this from
1815 * atomic context especially). Let's keep it simple! 1835 * atomic context especially). Let's keep it simple!
1816 */ 1836 */
1817 return __change_page_attr_set_clr(&cpa, 0); 1837 return __change_page_attr_set_clr(&cpa, 0);
1818 } 1838 }
1819 1839
1820 void __kernel_map_pages(struct page *page, int numpages, int enable) 1840 void __kernel_map_pages(struct page *page, int numpages, int enable)
1821 { 1841 {
1822 if (PageHighMem(page)) 1842 if (PageHighMem(page))
1823 return; 1843 return;
1824 if (!enable) { 1844 if (!enable) {
1825 debug_check_no_locks_freed(page_address(page), 1845 debug_check_no_locks_freed(page_address(page),
1826 numpages * PAGE_SIZE); 1846 numpages * PAGE_SIZE);
1827 } 1847 }
1828 1848
1829 /* 1849 /*
1830 * The return value is ignored as the calls cannot fail. 1850 * The return value is ignored as the calls cannot fail.
1831 * Large pages for identity mappings are not used at boot time 1851 * Large pages for identity mappings are not used at boot time
1832 * and hence no memory allocations during large page split. 1852 * and hence no memory allocations during large page split.
1833 */ 1853 */
1834 if (enable) 1854 if (enable)
1835 __set_pages_p(page, numpages); 1855 __set_pages_p(page, numpages);
1836 else 1856 else
1837 __set_pages_np(page, numpages); 1857 __set_pages_np(page, numpages);
1838 1858
1839 /* 1859 /*
1840 * We should perform an IPI and flush all tlbs, 1860 * We should perform an IPI and flush all tlbs,
1841 * but that can deadlock, so flush only the current cpu: 1861 * but that can deadlock, so flush only the current cpu:
1842 */ 1862 */
1843 __flush_tlb_all(); 1863 __flush_tlb_all();
1844 1864
1845 arch_flush_lazy_mmu_mode(); 1865 arch_flush_lazy_mmu_mode();
1846 } 1866 }
1847 1867
1848 #ifdef CONFIG_HIBERNATION 1868 #ifdef CONFIG_HIBERNATION
1849 1869
1850 bool kernel_page_present(struct page *page) 1870 bool kernel_page_present(struct page *page)
1851 { 1871 {
1852 unsigned int level; 1872 unsigned int level;
1853 pte_t *pte; 1873 pte_t *pte;
1854 1874
1855 if (PageHighMem(page)) 1875 if (PageHighMem(page))
1856 return false; 1876 return false;
1857 1877
1858 pte = lookup_address((unsigned long)page_address(page), &level); 1878 pte = lookup_address((unsigned long)page_address(page), &level);
1859 return (pte_val(*pte) & _PAGE_PRESENT); 1879 return (pte_val(*pte) & _PAGE_PRESENT);
1860 } 1880 }
1861 1881
1862 #endif /* CONFIG_HIBERNATION */ 1882 #endif /* CONFIG_HIBERNATION */
1863 1883
1864 #endif /* CONFIG_DEBUG_PAGEALLOC */ 1884 #endif /* CONFIG_DEBUG_PAGEALLOC */
1865 1885
1866 int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, 1886 int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
1867 unsigned numpages, unsigned long page_flags) 1887 unsigned numpages, unsigned long page_flags)
1868 { 1888 {
1869 int retval = -EINVAL; 1889 int retval = -EINVAL;
1870 1890
1871 struct cpa_data cpa = { 1891 struct cpa_data cpa = {
1872 .vaddr = &address, 1892 .vaddr = &address,
1873 .pfn = pfn, 1893 .pfn = pfn,
1874 .pgd = pgd, 1894 .pgd = pgd,
1875 .numpages = numpages, 1895 .numpages = numpages,
1876 .mask_set = __pgprot(0), 1896 .mask_set = __pgprot(0),
1877 .mask_clr = __pgprot(0), 1897 .mask_clr = __pgprot(0),
1878 .flags = 0, 1898 .flags = 0,
1879 }; 1899 };
1880 1900
1881 if (!(__supported_pte_mask & _PAGE_NX)) 1901 if (!(__supported_pte_mask & _PAGE_NX))
1882 goto out; 1902 goto out;
1883 1903
1884 if (!(page_flags & _PAGE_NX)) 1904 if (!(page_flags & _PAGE_NX))
1885 cpa.mask_clr = __pgprot(_PAGE_NX); 1905 cpa.mask_clr = __pgprot(_PAGE_NX);
1886 1906
1887 cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); 1907 cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
1888 1908
1889 retval = __change_page_attr_set_clr(&cpa, 0); 1909 retval = __change_page_attr_set_clr(&cpa, 0);
1890 __flush_tlb_all(); 1910 __flush_tlb_all();
1891 1911
1892 out: 1912 out:
1893 return retval; 1913 return retval;
1894 } 1914 }
1895 1915
1896 void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address, 1916 void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
1897 unsigned numpages) 1917 unsigned numpages)
1898 { 1918 {
1899 unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT)); 1919 unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
1900 } 1920 }
1901 1921
1902 /* 1922 /*
1903 * The testcases use internal knowledge of the implementation that shouldn't 1923 * The testcases use internal knowledge of the implementation that shouldn't
1904 * be exposed to the rest of the kernel. Include these directly here. 1924 * be exposed to the rest of the kernel. Include these directly here.
1905 */ 1925 */
1906 #ifdef CONFIG_CPA_DEBUG 1926 #ifdef CONFIG_CPA_DEBUG
1907 #include "pageattr-test.c" 1927 #include "pageattr-test.c"
1908 #endif 1928 #endif
1909 1929
1 /* 1 /*
2 * Xen mmu operations 2 * Xen mmu operations
3 * 3 *
4 * This file contains the various mmu fetch and update operations. 4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the 5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns. 6 * domain's pfn and the overall machine mfns.
7 * 7 *
8 * Xen allows guests to directly update the pagetable, in a controlled 8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable 9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having 10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable. 11 * a separate shadow pagetable.
12 * 12 *
13 * In order to allow this, it falls on the guest domain to map its 13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear 14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can 15 * address - into a real "machine address" which the CPU's MMU can
16 * use. 16 * use.
17 * 17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be 18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new 19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, 20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts 21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn. 22 * the mfn back into a pfn.
23 * 23 *
24 * The other constraint is that all pages which make up a pagetable 24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled 25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and 26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a 27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a 28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable. 29 * pagetable.
30 * 30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen 31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on. 32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a 33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even 34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured 35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't 36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it. 37 * need to revalidate it.
38 * 38 *
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40 */ 40 */
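To make the pfn/mfn split described above concrete, here is a tiny user-space model (entirely illustrative, with an invented four-entry p2m table): pagetable entries are always built from the machine frame number looked up through the p2m, never from the guest-local pfn itself.

/* Conceptual sketch of the pfn -> mfn translation used when building
 * ptes; array contents and protection bits are made up. */
#include <stdio.h>

#define PAGE_SHIFT 12

static unsigned long p2m[4] = { 0x1a3, 0x77, 0x2001, 0x9 }; /* pfn -> mfn */

static unsigned long make_pte(unsigned long pfn, unsigned long prot)
{
        return (p2m[pfn] << PAGE_SHIFT) | prot; /* the pte holds the mfn */
}

int main(void)
{
        printf("pte for pfn 2: %#lx\n", make_pte(2, 0x63));
        return 0;
}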
41 #include <linux/sched.h> 41 #include <linux/sched.h>
42 #include <linux/highmem.h> 42 #include <linux/highmem.h>
43 #include <linux/debugfs.h> 43 #include <linux/debugfs.h>
44 #include <linux/bug.h> 44 #include <linux/bug.h>
45 #include <linux/vmalloc.h> 45 #include <linux/vmalloc.h>
46 #include <linux/module.h> 46 #include <linux/module.h>
47 #include <linux/gfp.h> 47 #include <linux/gfp.h>
48 #include <linux/memblock.h> 48 #include <linux/memblock.h>
49 #include <linux/seq_file.h> 49 #include <linux/seq_file.h>
50 #include <linux/crash_dump.h> 50 #include <linux/crash_dump.h>
51 51
52 #include <trace/events/xen.h> 52 #include <trace/events/xen.h>
53 53
54 #include <asm/pgtable.h> 54 #include <asm/pgtable.h>
55 #include <asm/tlbflush.h> 55 #include <asm/tlbflush.h>
56 #include <asm/fixmap.h> 56 #include <asm/fixmap.h>
57 #include <asm/mmu_context.h> 57 #include <asm/mmu_context.h>
58 #include <asm/setup.h> 58 #include <asm/setup.h>
59 #include <asm/paravirt.h> 59 #include <asm/paravirt.h>
60 #include <asm/e820.h> 60 #include <asm/e820.h>
61 #include <asm/linkage.h> 61 #include <asm/linkage.h>
62 #include <asm/page.h> 62 #include <asm/page.h>
63 #include <asm/init.h> 63 #include <asm/init.h>
64 #include <asm/pat.h> 64 #include <asm/pat.h>
65 #include <asm/smp.h> 65 #include <asm/smp.h>
66 66
67 #include <asm/xen/hypercall.h> 67 #include <asm/xen/hypercall.h>
68 #include <asm/xen/hypervisor.h> 68 #include <asm/xen/hypervisor.h>
69 69
70 #include <xen/xen.h> 70 #include <xen/xen.h>
71 #include <xen/page.h> 71 #include <xen/page.h>
72 #include <xen/interface/xen.h> 72 #include <xen/interface/xen.h>
73 #include <xen/interface/hvm/hvm_op.h> 73 #include <xen/interface/hvm/hvm_op.h>
74 #include <xen/interface/version.h> 74 #include <xen/interface/version.h>
75 #include <xen/interface/memory.h> 75 #include <xen/interface/memory.h>
76 #include <xen/hvc-console.h> 76 #include <xen/hvc-console.h>
77 77
78 #include "multicalls.h" 78 #include "multicalls.h"
79 #include "mmu.h" 79 #include "mmu.h"
80 #include "debugfs.h" 80 #include "debugfs.h"
81 81
82 /* 82 /*
83 * Protects atomic reservation decrease/increase against concurrent increases. 83 * Protects atomic reservation decrease/increase against concurrent increases.
84 * Also protects non-atomic updates of current_pages and balloon lists. 84 * Also protects non-atomic updates of current_pages and balloon lists.
85 */ 85 */
86 DEFINE_SPINLOCK(xen_reservation_lock); 86 DEFINE_SPINLOCK(xen_reservation_lock);
87 87
88 #ifdef CONFIG_X86_32 88 #ifdef CONFIG_X86_32
89 /* 89 /*
90 * Identity map, in addition to plain kernel map. This needs to be 90 * Identity map, in addition to plain kernel map. This needs to be
91 * large enough to allocate the page table pages needed to map the rest. 91 * large enough to allocate the page table pages needed to map the rest.
92 * Each page can map 2MB. 92 * Each page can map 2MB.
93 */ 93 */
94 #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) 94 #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
95 static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); 95 static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
96 #endif 96 #endif
97 #ifdef CONFIG_X86_64 97 #ifdef CONFIG_X86_64
98 /* l3 pud for userspace vsyscall mapping */ 98 /* l3 pud for userspace vsyscall mapping */
99 static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; 99 static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
100 #endif /* CONFIG_X86_64 */ 100 #endif /* CONFIG_X86_64 */
101 101
102 /* 102 /*
103 * Note about cr3 (pagetable base) values: 103 * Note about cr3 (pagetable base) values:
104 * 104 *
105 * xen_cr3 contains the current logical cr3 value; it contains the 105 * xen_cr3 contains the current logical cr3 value; it contains the
106 * last set cr3. This may not be the current effective cr3, because 106 * last set cr3. This may not be the current effective cr3, because
107 * its update may be being lazily deferred. However, a vcpu looking 107 * its update may be being lazily deferred. However, a vcpu looking
108 * at its own cr3 can use this value knowing that everything will 108 * at its own cr3 can use this value knowing that everything will
109 * be self-consistent. 109 * be self-consistent.
110 * 110 *
111 * xen_current_cr3 contains the actual vcpu cr3; it is set once the 111 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
112 * hypercall to set the vcpu cr3 is complete (so it may be a little 112 * hypercall to set the vcpu cr3 is complete (so it may be a little
113 * out of date, but it will never be set early). If one vcpu is 113 * out of date, but it will never be set early). If one vcpu is
114 * looking at another vcpu's cr3 value, it should use this variable. 114 * looking at another vcpu's cr3 value, it should use this variable.
115 */ 115 */
116 DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ 116 DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
117 DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ 117 DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
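Following the note above, a helper that inspects another vcpu's pagetable base would read xen_current_cr3 rather than xen_cr3. This is only an illustrative sketch (not in the file), assuming it sits where the per-CPU variables above are visible:

/* Hypothetical helper: 'cpu' is a remote CPU number. */
#include <linux/percpu.h>

static inline unsigned long example_vcpu_cr3(int cpu)
{
        /* only valid once the set-cr3 hypercall has completed */
        return per_cpu(xen_current_cr3, cpu);
}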
118 118
119 119
120 /* 120 /*
121 * Just beyond the highest usermode address. STACK_TOP_MAX has a 121 * Just beyond the highest usermode address. STACK_TOP_MAX has a
122 * redzone above it, so round it up to a PGD boundary. 122 * redzone above it, so round it up to a PGD boundary.
123 */ 123 */
124 #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) 124 #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
125 125
126 unsigned long arbitrary_virt_to_mfn(void *vaddr) 126 unsigned long arbitrary_virt_to_mfn(void *vaddr)
127 { 127 {
128 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); 128 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
129 129
130 return PFN_DOWN(maddr.maddr); 130 return PFN_DOWN(maddr.maddr);
131 } 131 }
132 132
133 xmaddr_t arbitrary_virt_to_machine(void *vaddr) 133 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
134 { 134 {
135 unsigned long address = (unsigned long)vaddr; 135 unsigned long address = (unsigned long)vaddr;
136 unsigned int level; 136 unsigned int level;
137 pte_t *pte; 137 pte_t *pte;
138 unsigned offset; 138 unsigned offset;
139 139
140 /* 140 /*
141 * if the PFN is in the linear mapped vaddr range, we can just use 141 * if the PFN is in the linear mapped vaddr range, we can just use
142 * the (quick) virt_to_machine() p2m lookup 142 * the (quick) virt_to_machine() p2m lookup
143 */ 143 */
144 if (virt_addr_valid(vaddr)) 144 if (virt_addr_valid(vaddr))
145 return virt_to_machine(vaddr); 145 return virt_to_machine(vaddr);
146 146
147 /* otherwise we have to do a (slower) full page-table walk */ 147 /* otherwise we have to do a (slower) full page-table walk */
148 148
149 pte = lookup_address(address, &level); 149 pte = lookup_address(address, &level);
150 BUG_ON(pte == NULL); 150 BUG_ON(pte == NULL);
151 offset = address & ~PAGE_MASK; 151 offset = address & ~PAGE_MASK;
152 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); 152 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
153 } 153 }
154 EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); 154 EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
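A hypothetical caller of the helper exported above: a vmalloc()ed buffer lies outside the linear mapping, so virt_to_machine() cannot be used and the full pagetable walk is required. The chosen header is an assumption:

/* Illustrative only: convert a vmalloc address into a machine address,
 * e.g. for use as a hypercall argument. */
#include <linux/vmalloc.h>
#include <asm/xen/page.h>

static phys_addr_t example_machine_addr(void *vmalloc_buf)
{
        xmaddr_t m = arbitrary_virt_to_machine(vmalloc_buf);

        return m.maddr;         /* machine address of the buffer */
}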
155 155
156 void make_lowmem_page_readonly(void *vaddr) 156 void make_lowmem_page_readonly(void *vaddr)
157 { 157 {
158 pte_t *pte, ptev; 158 pte_t *pte, ptev;
159 unsigned long address = (unsigned long)vaddr; 159 unsigned long address = (unsigned long)vaddr;
160 unsigned int level; 160 unsigned int level;
161 161
162 pte = lookup_address(address, &level); 162 pte = lookup_address(address, &level);
163 if (pte == NULL) 163 if (pte == NULL)
164 return; /* vaddr missing */ 164 return; /* vaddr missing */
165 165
166 ptev = pte_wrprotect(*pte); 166 ptev = pte_wrprotect(*pte);
167 167
168 if (HYPERVISOR_update_va_mapping(address, ptev, 0)) 168 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
169 BUG(); 169 BUG();
170 } 170 }
171 171
172 void make_lowmem_page_readwrite(void *vaddr) 172 void make_lowmem_page_readwrite(void *vaddr)
173 { 173 {
174 pte_t *pte, ptev; 174 pte_t *pte, ptev;
175 unsigned long address = (unsigned long)vaddr; 175 unsigned long address = (unsigned long)vaddr;
176 unsigned int level; 176 unsigned int level;
177 177
178 pte = lookup_address(address, &level); 178 pte = lookup_address(address, &level);
179 if (pte == NULL) 179 if (pte == NULL)
180 return; /* vaddr missing */ 180 return; /* vaddr missing */
181 181
182 ptev = pte_mkwrite(*pte); 182 ptev = pte_mkwrite(*pte);
183 183
184 if (HYPERVISOR_update_va_mapping(address, ptev, 0)) 184 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
185 BUG(); 185 BUG();
186 } 186 }
187 187
188 188
189 static bool xen_page_pinned(void *ptr) 189 static bool xen_page_pinned(void *ptr)
190 { 190 {
191 struct page *page = virt_to_page(ptr); 191 struct page *page = virt_to_page(ptr);
192 192
193 return PagePinned(page); 193 return PagePinned(page);
194 } 194 }
195 195
196 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) 196 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
197 { 197 {
198 struct multicall_space mcs; 198 struct multicall_space mcs;
199 struct mmu_update *u; 199 struct mmu_update *u;
200 200
201 trace_xen_mmu_set_domain_pte(ptep, pteval, domid); 201 trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
202 202
203 mcs = xen_mc_entry(sizeof(*u)); 203 mcs = xen_mc_entry(sizeof(*u));
204 u = mcs.args; 204 u = mcs.args;
205 205
206 /* ptep might be kmapped when using 32-bit HIGHPTE */ 206 /* ptep might be kmapped when using 32-bit HIGHPTE */
207 u->ptr = virt_to_machine(ptep).maddr; 207 u->ptr = virt_to_machine(ptep).maddr;
208 u->val = pte_val_ma(pteval); 208 u->val = pte_val_ma(pteval);
209 209
210 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); 210 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
211 211
212 xen_mc_issue(PARAVIRT_LAZY_MMU); 212 xen_mc_issue(PARAVIRT_LAZY_MMU);
213 } 213 }
214 EXPORT_SYMBOL_GPL(xen_set_domain_pte); 214 EXPORT_SYMBOL_GPL(xen_set_domain_pte);
215 215
216 static void xen_extend_mmu_update(const struct mmu_update *update) 216 static void xen_extend_mmu_update(const struct mmu_update *update)
217 { 217 {
218 struct multicall_space mcs; 218 struct multicall_space mcs;
219 struct mmu_update *u; 219 struct mmu_update *u;
220 220
221 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 221 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
222 222
223 if (mcs.mc != NULL) { 223 if (mcs.mc != NULL) {
224 mcs.mc->args[1]++; 224 mcs.mc->args[1]++;
225 } else { 225 } else {
226 mcs = __xen_mc_entry(sizeof(*u)); 226 mcs = __xen_mc_entry(sizeof(*u));
227 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 227 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
228 } 228 }
229 229
230 u = mcs.args; 230 u = mcs.args;
231 *u = *update; 231 *u = *update;
232 } 232 }
233 233
234 static void xen_extend_mmuext_op(const struct mmuext_op *op) 234 static void xen_extend_mmuext_op(const struct mmuext_op *op)
235 { 235 {
236 struct multicall_space mcs; 236 struct multicall_space mcs;
237 struct mmuext_op *u; 237 struct mmuext_op *u;
238 238
239 mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u)); 239 mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
240 240
241 if (mcs.mc != NULL) { 241 if (mcs.mc != NULL) {
242 mcs.mc->args[1]++; 242 mcs.mc->args[1]++;
243 } else { 243 } else {
244 mcs = __xen_mc_entry(sizeof(*u)); 244 mcs = __xen_mc_entry(sizeof(*u));
245 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 245 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
246 } 246 }
247 247
248 u = mcs.args; 248 u = mcs.args;
249 *u = *op; 249 *u = *op;
250 } 250 }
251 251
252 static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) 252 static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
253 { 253 {
254 struct mmu_update u; 254 struct mmu_update u;
255 255
256 preempt_disable(); 256 preempt_disable();
257 257
258 xen_mc_batch(); 258 xen_mc_batch();
259 259
260 /* ptr may be ioremapped for 64-bit pagetable setup */ 260 /* ptr may be ioremapped for 64-bit pagetable setup */
261 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 261 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
262 u.val = pmd_val_ma(val); 262 u.val = pmd_val_ma(val);
263 xen_extend_mmu_update(&u); 263 xen_extend_mmu_update(&u);
264 264
265 xen_mc_issue(PARAVIRT_LAZY_MMU); 265 xen_mc_issue(PARAVIRT_LAZY_MMU);
266 266
267 preempt_enable(); 267 preempt_enable();
268 } 268 }
269 269
270 static void xen_set_pmd(pmd_t *ptr, pmd_t val) 270 static void xen_set_pmd(pmd_t *ptr, pmd_t val)
271 { 271 {
272 trace_xen_mmu_set_pmd(ptr, val); 272 trace_xen_mmu_set_pmd(ptr, val);
273 273
274 /* If page is not pinned, we can just update the entry 274 /* If page is not pinned, we can just update the entry
275 directly */ 275 directly */
276 if (!xen_page_pinned(ptr)) { 276 if (!xen_page_pinned(ptr)) {
277 *ptr = val; 277 *ptr = val;
278 return; 278 return;
279 } 279 }
280 280
281 xen_set_pmd_hyper(ptr, val); 281 xen_set_pmd_hyper(ptr, val);
282 } 282 }
283 283
284 /* 284 /*
285 * Associate a virtual page frame with a given physical page frame 285 * Associate a virtual page frame with a given physical page frame
286 * and protection flags for that frame. 286 * and protection flags for that frame.
287 */ 287 */
288 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 288 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
289 { 289 {
290 set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); 290 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
291 } 291 }
292 292
293 static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) 293 static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
294 { 294 {
295 struct mmu_update u; 295 struct mmu_update u;
296 296
297 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) 297 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
298 return false; 298 return false;
299 299
300 xen_mc_batch(); 300 xen_mc_batch();
301 301
302 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; 302 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
303 u.val = pte_val_ma(pteval); 303 u.val = pte_val_ma(pteval);
304 xen_extend_mmu_update(&u); 304 xen_extend_mmu_update(&u);
305 305
306 xen_mc_issue(PARAVIRT_LAZY_MMU); 306 xen_mc_issue(PARAVIRT_LAZY_MMU);
307 307
308 return true; 308 return true;
309 } 309 }
310 310
311 static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) 311 static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
312 { 312 {
313 if (!xen_batched_set_pte(ptep, pteval)) { 313 if (!xen_batched_set_pte(ptep, pteval)) {
314 /* 314 /*
315 * Could call native_set_pte() here and trap and 315 * Could call native_set_pte() here and trap and
316 * emulate the PTE write but with 32-bit guests this 316 * emulate the PTE write but with 32-bit guests this
317 * needs two traps (one for each of the two 32-bit 317 * needs two traps (one for each of the two 32-bit
318 * words in the PTE) so do one hypercall directly 318 * words in the PTE) so do one hypercall directly
319 * instead. 319 * instead.
320 */ 320 */
321 struct mmu_update u; 321 struct mmu_update u;
322 322
323 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; 323 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
324 u.val = pte_val_ma(pteval); 324 u.val = pte_val_ma(pteval);
325 HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); 325 HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
326 } 326 }
327 } 327 }
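
The comment in __xen_set_pte() is a cost argument: on 32-bit PAE a pte_t is 64 bits wide, so a native store is really two 32-bit stores, and if the page is read-only each store would trap and be emulated separately, whereas one explicit mmu_update hypercall covers the whole entry. Below is a rough user-space sketch of that accounting; the counters and the union are invented, and no real faulting takes place.

#include <stdio.h>
#include <stdint.h>

/* Toy cost model: each store to a read-only PTE traps and is emulated,
 * while an explicit update costs one hypercall round trip. */
union toy_pte {
	uint64_t whole;
	uint32_t half[2];
};

static int traps, hypercalls;

static void trapped_write32(uint32_t *dst, uint32_t val)
{
	traps++;			/* one fault + emulation per store */
	*dst = val;
}

static void write_pte_native32(union toy_pte *pte, uint64_t val)
{
	/* 32-bit PAE: a 64-bit pte is written as two 32-bit words. */
	trapped_write32(&pte->half[0], (uint32_t)val);
	trapped_write32(&pte->half[1], (uint32_t)(val >> 32));
}

static void write_pte_hypercall(union toy_pte *pte, uint64_t val)
{
	hypercalls++;			/* one HYPERVISOR_mmu_update */
	pte->whole = val;
}

int main(void)
{
	union toy_pte pte = { 0 };

	write_pte_native32(&pte, 0x80000000000001ffULL);
	write_pte_hypercall(&pte, 0x80000000000001ffULL);
	printf("native: %d traps, hypercall: %d call(s)\n", traps, hypercalls);
	return 0;
}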
328 328
329 static void xen_set_pte(pte_t *ptep, pte_t pteval) 329 static void xen_set_pte(pte_t *ptep, pte_t pteval)
330 { 330 {
331 trace_xen_mmu_set_pte(ptep, pteval); 331 trace_xen_mmu_set_pte(ptep, pteval);
332 __xen_set_pte(ptep, pteval); 332 __xen_set_pte(ptep, pteval);
333 } 333 }
334 334
335 static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 335 static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
336 pte_t *ptep, pte_t pteval) 336 pte_t *ptep, pte_t pteval)
337 { 337 {
338 trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval); 338 trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
339 __xen_set_pte(ptep, pteval); 339 __xen_set_pte(ptep, pteval);
340 } 340 }
341 341
342 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, 342 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
343 unsigned long addr, pte_t *ptep) 343 unsigned long addr, pte_t *ptep)
344 { 344 {
345 /* Just return the pte as-is. We preserve the bits on commit */ 345 /* Just return the pte as-is. We preserve the bits on commit */
346 trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); 346 trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
347 return *ptep; 347 return *ptep;
348 } 348 }
349 349
350 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 350 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
351 pte_t *ptep, pte_t pte) 351 pte_t *ptep, pte_t pte)
352 { 352 {
353 struct mmu_update u; 353 struct mmu_update u;
354 354
355 trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); 355 trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
356 xen_mc_batch(); 356 xen_mc_batch();
357 357
358 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 358 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
359 u.val = pte_val_ma(pte); 359 u.val = pte_val_ma(pte);
360 xen_extend_mmu_update(&u); 360 xen_extend_mmu_update(&u);
361 361
362 xen_mc_issue(PARAVIRT_LAZY_MMU); 362 xen_mc_issue(PARAVIRT_LAZY_MMU);
363 } 363 }
364 364
365 /* Assume pteval_t is equivalent to all the other *val_t types. */ 365 /* Assume pteval_t is equivalent to all the other *val_t types. */
366 static pteval_t pte_mfn_to_pfn(pteval_t val) 366 static pteval_t pte_mfn_to_pfn(pteval_t val)
367 { 367 {
368 if (val & _PAGE_PRESENT) { 368 if (val & _PAGE_PRESENT) {
369 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 369 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
370 unsigned long pfn = mfn_to_pfn(mfn); 370 unsigned long pfn = mfn_to_pfn(mfn);
371 371
372 pteval_t flags = val & PTE_FLAGS_MASK; 372 pteval_t flags = val & PTE_FLAGS_MASK;
373 if (unlikely(pfn == ~0)) 373 if (unlikely(pfn == ~0))
374 val = flags & ~_PAGE_PRESENT; 374 val = flags & ~_PAGE_PRESENT;
375 else 375 else
376 val = ((pteval_t)pfn << PAGE_SHIFT) | flags; 376 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
377 } 377 }
378 378
379 return val; 379 return val;
380 } 380 }
381 381
382 static pteval_t pte_pfn_to_mfn(pteval_t val) 382 static pteval_t pte_pfn_to_mfn(pteval_t val)
383 { 383 {
384 if (val & _PAGE_PRESENT) { 384 if (val & _PAGE_PRESENT) {
385 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 385 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
386 pteval_t flags = val & PTE_FLAGS_MASK; 386 pteval_t flags = val & PTE_FLAGS_MASK;
387 unsigned long mfn; 387 unsigned long mfn;
388 388
389 if (!xen_feature(XENFEAT_auto_translated_physmap)) 389 if (!xen_feature(XENFEAT_auto_translated_physmap))
390 mfn = get_phys_to_machine(pfn); 390 mfn = __pfn_to_mfn(pfn);
391 else 391 else
392 mfn = pfn; 392 mfn = pfn;
393 /* 393 /*
394 * If there's no mfn for the pfn, then just create an 394 * If there's no mfn for the pfn, then just create an
395 * empty non-present pte. Unfortunately this loses 395 * empty non-present pte. Unfortunately this loses
396 * information about the original pfn, so 396 * information about the original pfn, so
397 * pte_mfn_to_pfn is asymmetric. 397 * pte_mfn_to_pfn is asymmetric.
398 */ 398 */
399 if (unlikely(mfn == INVALID_P2M_ENTRY)) { 399 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
400 mfn = 0; 400 mfn = 0;
401 flags = 0; 401 flags = 0;
402 } else 402 } else
403 mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); 403 mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
404 val = ((pteval_t)mfn << PAGE_SHIFT) | flags; 404 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
405 } 405 }
406 406
407 return val; 407 return val;
408 } 408 }
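
The asymmetry the comment describes is easy to reproduce with two toy lookup tables standing in for the p2m and m2p arrays: once a missing mfn has been collapsed to an empty value, the original pfn can no longer be recovered. The sizes and values below are invented for the example.

#include <stdio.h>

#define TOY_INVALID (~0UL)
#define TOY_PAGES   4
#define TOY_MFNS    16

/* Toy p2m (pfn -> mfn) with a hole at pfn 2, plus the inverse m2p. */
static unsigned long toy_p2m[TOY_PAGES] = { 7, 5, TOY_INVALID, 9 };
static unsigned long toy_m2p[TOY_MFNS];

/* Like pte_pfn_to_mfn(): a missing mfn becomes an empty value,
 * which loses the original pfn. */
static unsigned long toy_pfn_to_pteval(unsigned long pfn)
{
	unsigned long mfn = toy_p2m[pfn];

	return (mfn == TOY_INVALID) ? 0 : mfn;	/* flags omitted */
}

int main(void)
{
	for (unsigned long m = 0; m < TOY_MFNS; m++)
		toy_m2p[m] = TOY_INVALID;
	for (unsigned long p = 0; p < TOY_PAGES; p++)
		if (toy_p2m[p] != TOY_INVALID)
			toy_m2p[toy_p2m[p]] = p;

	for (unsigned long p = 0; p < TOY_PAGES; p++) {
		unsigned long pteval = toy_pfn_to_pteval(p);
		unsigned long back = pteval ? toy_m2p[pteval] : TOY_INVALID;

		printf("pfn %lu -> pteval %lu -> %s\n", p, pteval,
		       back == p ? "round-trips" : "original pfn lost");
	}
	return 0;
}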
409 409
410 __visible pteval_t xen_pte_val(pte_t pte) 410 __visible pteval_t xen_pte_val(pte_t pte)
411 { 411 {
412 pteval_t pteval = pte.pte; 412 pteval_t pteval = pte.pte;
413 413
414 return pte_mfn_to_pfn(pteval); 414 return pte_mfn_to_pfn(pteval);
415 } 415 }
416 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); 416 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
417 417
418 __visible pgdval_t xen_pgd_val(pgd_t pgd) 418 __visible pgdval_t xen_pgd_val(pgd_t pgd)
419 { 419 {
420 return pte_mfn_to_pfn(pgd.pgd); 420 return pte_mfn_to_pfn(pgd.pgd);
421 } 421 }
422 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); 422 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
423 423
424 __visible pte_t xen_make_pte(pteval_t pte) 424 __visible pte_t xen_make_pte(pteval_t pte)
425 { 425 {
426 pte = pte_pfn_to_mfn(pte); 426 pte = pte_pfn_to_mfn(pte);
427 427
428 return native_make_pte(pte); 428 return native_make_pte(pte);
429 } 429 }
430 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); 430 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
431 431
432 __visible pgd_t xen_make_pgd(pgdval_t pgd) 432 __visible pgd_t xen_make_pgd(pgdval_t pgd)
433 { 433 {
434 pgd = pte_pfn_to_mfn(pgd); 434 pgd = pte_pfn_to_mfn(pgd);
435 return native_make_pgd(pgd); 435 return native_make_pgd(pgd);
436 } 436 }
437 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); 437 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
438 438
439 __visible pmdval_t xen_pmd_val(pmd_t pmd) 439 __visible pmdval_t xen_pmd_val(pmd_t pmd)
440 { 440 {
441 return pte_mfn_to_pfn(pmd.pmd); 441 return pte_mfn_to_pfn(pmd.pmd);
442 } 442 }
443 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); 443 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
444 444
445 static void xen_set_pud_hyper(pud_t *ptr, pud_t val) 445 static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
446 { 446 {
447 struct mmu_update u; 447 struct mmu_update u;
448 448
449 preempt_disable(); 449 preempt_disable();
450 450
451 xen_mc_batch(); 451 xen_mc_batch();
452 452
453 /* ptr may be ioremapped for 64-bit pagetable setup */ 453 /* ptr may be ioremapped for 64-bit pagetable setup */
454 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 454 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
455 u.val = pud_val_ma(val); 455 u.val = pud_val_ma(val);
456 xen_extend_mmu_update(&u); 456 xen_extend_mmu_update(&u);
457 457
458 xen_mc_issue(PARAVIRT_LAZY_MMU); 458 xen_mc_issue(PARAVIRT_LAZY_MMU);
459 459
460 preempt_enable(); 460 preempt_enable();
461 } 461 }
462 462
463 static void xen_set_pud(pud_t *ptr, pud_t val) 463 static void xen_set_pud(pud_t *ptr, pud_t val)
464 { 464 {
465 trace_xen_mmu_set_pud(ptr, val); 465 trace_xen_mmu_set_pud(ptr, val);
466 466
467 /* If page is not pinned, we can just update the entry 467 /* If page is not pinned, we can just update the entry
468 directly */ 468 directly */
469 if (!xen_page_pinned(ptr)) { 469 if (!xen_page_pinned(ptr)) {
470 *ptr = val; 470 *ptr = val;
471 return; 471 return;
472 } 472 }
473 473
474 xen_set_pud_hyper(ptr, val); 474 xen_set_pud_hyper(ptr, val);
475 } 475 }
476 476
477 #ifdef CONFIG_X86_PAE 477 #ifdef CONFIG_X86_PAE
478 static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 478 static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
479 { 479 {
480 trace_xen_mmu_set_pte_atomic(ptep, pte); 480 trace_xen_mmu_set_pte_atomic(ptep, pte);
481 set_64bit((u64 *)ptep, native_pte_val(pte)); 481 set_64bit((u64 *)ptep, native_pte_val(pte));
482 } 482 }
483 483
484 static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 484 static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
485 { 485 {
486 trace_xen_mmu_pte_clear(mm, addr, ptep); 486 trace_xen_mmu_pte_clear(mm, addr, ptep);
487 if (!xen_batched_set_pte(ptep, native_make_pte(0))) 487 if (!xen_batched_set_pte(ptep, native_make_pte(0)))
488 native_pte_clear(mm, addr, ptep); 488 native_pte_clear(mm, addr, ptep);
489 } 489 }
490 490
491 static void xen_pmd_clear(pmd_t *pmdp) 491 static void xen_pmd_clear(pmd_t *pmdp)
492 { 492 {
493 trace_xen_mmu_pmd_clear(pmdp); 493 trace_xen_mmu_pmd_clear(pmdp);
494 set_pmd(pmdp, __pmd(0)); 494 set_pmd(pmdp, __pmd(0));
495 } 495 }
496 #endif /* CONFIG_X86_PAE */ 496 #endif /* CONFIG_X86_PAE */
497 497
498 __visible pmd_t xen_make_pmd(pmdval_t pmd) 498 __visible pmd_t xen_make_pmd(pmdval_t pmd)
499 { 499 {
500 pmd = pte_pfn_to_mfn(pmd); 500 pmd = pte_pfn_to_mfn(pmd);
501 return native_make_pmd(pmd); 501 return native_make_pmd(pmd);
502 } 502 }
503 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); 503 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
504 504
505 #if PAGETABLE_LEVELS == 4 505 #if PAGETABLE_LEVELS == 4
506 __visible pudval_t xen_pud_val(pud_t pud) 506 __visible pudval_t xen_pud_val(pud_t pud)
507 { 507 {
508 return pte_mfn_to_pfn(pud.pud); 508 return pte_mfn_to_pfn(pud.pud);
509 } 509 }
510 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); 510 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
511 511
512 __visible pud_t xen_make_pud(pudval_t pud) 512 __visible pud_t xen_make_pud(pudval_t pud)
513 { 513 {
514 pud = pte_pfn_to_mfn(pud); 514 pud = pte_pfn_to_mfn(pud);
515 515
516 return native_make_pud(pud); 516 return native_make_pud(pud);
517 } 517 }
518 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); 518 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
519 519
520 static pgd_t *xen_get_user_pgd(pgd_t *pgd) 520 static pgd_t *xen_get_user_pgd(pgd_t *pgd)
521 { 521 {
522 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); 522 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
523 unsigned offset = pgd - pgd_page; 523 unsigned offset = pgd - pgd_page;
524 pgd_t *user_ptr = NULL; 524 pgd_t *user_ptr = NULL;
525 525
526 if (offset < pgd_index(USER_LIMIT)) { 526 if (offset < pgd_index(USER_LIMIT)) {
527 struct page *page = virt_to_page(pgd_page); 527 struct page *page = virt_to_page(pgd_page);
528 user_ptr = (pgd_t *)page->private; 528 user_ptr = (pgd_t *)page->private;
529 if (user_ptr) 529 if (user_ptr)
530 user_ptr += offset; 530 user_ptr += offset;
531 } 531 }
532 532
533 return user_ptr; 533 return user_ptr;
534 } 534 }
535 535
536 static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) 536 static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
537 { 537 {
538 struct mmu_update u; 538 struct mmu_update u;
539 539
540 u.ptr = virt_to_machine(ptr).maddr; 540 u.ptr = virt_to_machine(ptr).maddr;
541 u.val = pgd_val_ma(val); 541 u.val = pgd_val_ma(val);
542 xen_extend_mmu_update(&u); 542 xen_extend_mmu_update(&u);
543 } 543 }
544 544
545 /* 545 /*
546 * Raw hypercall-based set_pgd, intended for use in early boot before 546 * Raw hypercall-based set_pgd, intended for use in early boot before

547 * there's a page structure. This implies: 547 * there's a page structure. This implies:
548 * 1. The only existing pagetable is the kernel's 548 * 1. The only existing pagetable is the kernel's
549 * 2. It is always pinned 549 * 2. It is always pinned
550 * 3. It has no user pagetable attached to it 550 * 3. It has no user pagetable attached to it
551 */ 551 */
552 static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) 552 static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
553 { 553 {
554 preempt_disable(); 554 preempt_disable();
555 555
556 xen_mc_batch(); 556 xen_mc_batch();
557 557
558 __xen_set_pgd_hyper(ptr, val); 558 __xen_set_pgd_hyper(ptr, val);
559 559
560 xen_mc_issue(PARAVIRT_LAZY_MMU); 560 xen_mc_issue(PARAVIRT_LAZY_MMU);
561 561
562 preempt_enable(); 562 preempt_enable();
563 } 563 }
564 564
565 static void xen_set_pgd(pgd_t *ptr, pgd_t val) 565 static void xen_set_pgd(pgd_t *ptr, pgd_t val)
566 { 566 {
567 pgd_t *user_ptr = xen_get_user_pgd(ptr); 567 pgd_t *user_ptr = xen_get_user_pgd(ptr);
568 568
569 trace_xen_mmu_set_pgd(ptr, user_ptr, val); 569 trace_xen_mmu_set_pgd(ptr, user_ptr, val);
570 570
571 /* If page is not pinned, we can just update the entry 571 /* If page is not pinned, we can just update the entry
572 directly */ 572 directly */
573 if (!xen_page_pinned(ptr)) { 573 if (!xen_page_pinned(ptr)) {
574 *ptr = val; 574 *ptr = val;
575 if (user_ptr) { 575 if (user_ptr) {
576 WARN_ON(xen_page_pinned(user_ptr)); 576 WARN_ON(xen_page_pinned(user_ptr));
577 *user_ptr = val; 577 *user_ptr = val;
578 } 578 }
579 return; 579 return;
580 } 580 }
581 581
582 /* If it's pinned, then we can at least batch the kernel and 582 /* If it's pinned, then we can at least batch the kernel and
583 user updates together. */ 583 user updates together. */
584 xen_mc_batch(); 584 xen_mc_batch();
585 585
586 __xen_set_pgd_hyper(ptr, val); 586 __xen_set_pgd_hyper(ptr, val);
587 if (user_ptr) 587 if (user_ptr)
588 __xen_set_pgd_hyper(user_ptr, val); 588 __xen_set_pgd_hyper(user_ptr, val);
589 589
590 xen_mc_issue(PARAVIRT_LAZY_MMU); 590 xen_mc_issue(PARAVIRT_LAZY_MMU);
591 } 591 }
592 #endif /* PAGETABLE_LEVELS == 4 */ 592 #endif /* PAGETABLE_LEVELS == 4 */
593 593
594 /* 594 /*
595 * (Yet another) pagetable walker. This one is intended for pinning a 595 * (Yet another) pagetable walker. This one is intended for pinning a
596 * pagetable. This means that it walks a pagetable and calls the 596 * pagetable. This means that it walks a pagetable and calls the
597 * callback function on each page it finds making up the page table, 597 * callback function on each page it finds making up the page table,
598 * at every level. It walks the entire pagetable, but it only bothers 598 * at every level. It walks the entire pagetable, but it only bothers
599 * pinning pte pages which are below limit. In the normal case this 599 * pinning pte pages which are below limit. In the normal case this
600 * will be STACK_TOP_MAX, but at boot we need to pin up to 600 * will be STACK_TOP_MAX, but at boot we need to pin up to
601 * FIXADDR_TOP. 601 * FIXADDR_TOP.
602 * 602 *
603 * For 32-bit the important bit is that we don't pin beyond there, 603 * For 32-bit the important bit is that we don't pin beyond there,
604 * because then we start getting into Xen's ptes. 604 * because then we start getting into Xen's ptes.
605 * 605 *
606 * For 64-bit, we must skip the Xen hole in the middle of the address 606 * For 64-bit, we must skip the Xen hole in the middle of the address
607 * space, just after the big x86-64 virtual hole. 607 * space, just after the big x86-64 virtual hole.
608 */ 608 */
609 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, 609 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
610 int (*func)(struct mm_struct *mm, struct page *, 610 int (*func)(struct mm_struct *mm, struct page *,
611 enum pt_level), 611 enum pt_level),
612 unsigned long limit) 612 unsigned long limit)
613 { 613 {
614 int flush = 0; 614 int flush = 0;
615 unsigned hole_low, hole_high; 615 unsigned hole_low, hole_high;
616 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; 616 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
617 unsigned pgdidx, pudidx, pmdidx; 617 unsigned pgdidx, pudidx, pmdidx;
618 618
619 /* The limit is the last byte to be touched */ 619 /* The limit is the last byte to be touched */
620 limit--; 620 limit--;
621 BUG_ON(limit >= FIXADDR_TOP); 621 BUG_ON(limit >= FIXADDR_TOP);
622 622
623 if (xen_feature(XENFEAT_auto_translated_physmap)) 623 if (xen_feature(XENFEAT_auto_translated_physmap))
624 return 0; 624 return 0;
625 625
626 /* 626 /*
627 * 64-bit has a great big hole in the middle of the address 627 * 64-bit has a great big hole in the middle of the address
628 * space, which contains the Xen mappings. On 32-bit these 628 * space, which contains the Xen mappings. On 32-bit these
629 * will end up making a zero-sized hole, so this is a no-op. 629 * will end up making a zero-sized hole, so this is a no-op.
630 */ 630 */
631 hole_low = pgd_index(USER_LIMIT); 631 hole_low = pgd_index(USER_LIMIT);
632 hole_high = pgd_index(PAGE_OFFSET); 632 hole_high = pgd_index(PAGE_OFFSET);
633 633
634 pgdidx_limit = pgd_index(limit); 634 pgdidx_limit = pgd_index(limit);
635 #if PTRS_PER_PUD > 1 635 #if PTRS_PER_PUD > 1
636 pudidx_limit = pud_index(limit); 636 pudidx_limit = pud_index(limit);
637 #else 637 #else
638 pudidx_limit = 0; 638 pudidx_limit = 0;
639 #endif 639 #endif
640 #if PTRS_PER_PMD > 1 640 #if PTRS_PER_PMD > 1
641 pmdidx_limit = pmd_index(limit); 641 pmdidx_limit = pmd_index(limit);
642 #else 642 #else
643 pmdidx_limit = 0; 643 pmdidx_limit = 0;
644 #endif 644 #endif
645 645
646 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { 646 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
647 pud_t *pud; 647 pud_t *pud;
648 648
649 if (pgdidx >= hole_low && pgdidx < hole_high) 649 if (pgdidx >= hole_low && pgdidx < hole_high)
650 continue; 650 continue;
651 651
652 if (!pgd_val(pgd[pgdidx])) 652 if (!pgd_val(pgd[pgdidx]))
653 continue; 653 continue;
654 654
655 pud = pud_offset(&pgd[pgdidx], 0); 655 pud = pud_offset(&pgd[pgdidx], 0);
656 656
657 if (PTRS_PER_PUD > 1) /* not folded */ 657 if (PTRS_PER_PUD > 1) /* not folded */
658 flush |= (*func)(mm, virt_to_page(pud), PT_PUD); 658 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
659 659
660 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { 660 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
661 pmd_t *pmd; 661 pmd_t *pmd;
662 662
663 if (pgdidx == pgdidx_limit && 663 if (pgdidx == pgdidx_limit &&
664 pudidx > pudidx_limit) 664 pudidx > pudidx_limit)
665 goto out; 665 goto out;
666 666
667 if (pud_none(pud[pudidx])) 667 if (pud_none(pud[pudidx]))
668 continue; 668 continue;
669 669
670 pmd = pmd_offset(&pud[pudidx], 0); 670 pmd = pmd_offset(&pud[pudidx], 0);
671 671
672 if (PTRS_PER_PMD > 1) /* not folded */ 672 if (PTRS_PER_PMD > 1) /* not folded */
673 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); 673 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
674 674
675 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { 675 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
676 struct page *pte; 676 struct page *pte;
677 677
678 if (pgdidx == pgdidx_limit && 678 if (pgdidx == pgdidx_limit &&
679 pudidx == pudidx_limit && 679 pudidx == pudidx_limit &&
680 pmdidx > pmdidx_limit) 680 pmdidx > pmdidx_limit)
681 goto out; 681 goto out;
682 682
683 if (pmd_none(pmd[pmdidx])) 683 if (pmd_none(pmd[pmdidx]))
684 continue; 684 continue;
685 685
686 pte = pmd_page(pmd[pmdidx]); 686 pte = pmd_page(pmd[pmdidx]);
687 flush |= (*func)(mm, pte, PT_PTE); 687 flush |= (*func)(mm, pte, PT_PTE);
688 } 688 }
689 } 689 }
690 } 690 }
691 691
692 out: 692 out:
693 /* Do the top level last, so that the callbacks can use it as 693 /* Do the top level last, so that the callbacks can use it as
694 a cue to do final things like tlb flushes. */ 694 a cue to do final things like tlb flushes. */
695 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); 695 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
696 696
697 return flush; 697 return flush;
698 } 698 }
699 699
700 static int xen_pgd_walk(struct mm_struct *mm, 700 static int xen_pgd_walk(struct mm_struct *mm,
701 int (*func)(struct mm_struct *mm, struct page *, 701 int (*func)(struct mm_struct *mm, struct page *,
702 enum pt_level), 702 enum pt_level),
703 unsigned long limit) 703 unsigned long limit)
704 { 704 {
705 return __xen_pgd_walk(mm, mm->pgd, func, limit); 705 return __xen_pgd_walk(mm, mm->pgd, func, limit);
706 } 706 }
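
Stripped of the hole handling and the folded-level special cases, the walk is nested loops over the index ranges, with the callback invoked once per page-table page and the top level deliberately visited last. The compressed two-level sketch below shows that shape; toy_walk and its callback type are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

#define TOY_TOP_ENTRIES 4

/* Two-level toy "pagetable": the top level points at leaf pages or NULL. */
static int leaf_a[4], leaf_b[4];
static int *toy_top[TOY_TOP_ENTRIES] = { leaf_a, NULL, leaf_b, NULL };

typedef bool (*toy_cb)(const char *level, void *page);

static bool toy_walk(toy_cb cb, unsigned int limit)
{
	bool flush = false;
	unsigned int i;

	for (i = 0; i <= limit && i < TOY_TOP_ENTRIES; i++) {
		if (!toy_top[i])
			continue;	/* like the pgd_val()/pud_none() tests */
		flush |= cb("leaf", toy_top[i]);
	}
	/* Top level last, so the callback can treat it as a completion cue. */
	flush |= cb("top", toy_top);
	return flush;
}

static bool print_page(const char *level, void *page)
{
	printf("visit %-4s page at %p\n", level, page);
	return false;	/* nothing needs flushing in this sketch */
}

int main(void)
{
	toy_walk(print_page, TOY_TOP_ENTRIES - 1);
	return 0;
}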
707 707
708 /* If we're using split pte locks, then take the page's lock and 708 /* If we're using split pte locks, then take the page's lock and
709 return a pointer to it. Otherwise return NULL. */ 709 return a pointer to it. Otherwise return NULL. */
710 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) 710 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
711 { 711 {
712 spinlock_t *ptl = NULL; 712 spinlock_t *ptl = NULL;
713 713
714 #if USE_SPLIT_PTE_PTLOCKS 714 #if USE_SPLIT_PTE_PTLOCKS
715 ptl = ptlock_ptr(page); 715 ptl = ptlock_ptr(page);
716 spin_lock_nest_lock(ptl, &mm->page_table_lock); 716 spin_lock_nest_lock(ptl, &mm->page_table_lock);
717 #endif 717 #endif
718 718
719 return ptl; 719 return ptl;
720 } 720 }
721 721
722 static void xen_pte_unlock(void *v) 722 static void xen_pte_unlock(void *v)
723 { 723 {
724 spinlock_t *ptl = v; 724 spinlock_t *ptl = v;
725 spin_unlock(ptl); 725 spin_unlock(ptl);
726 } 726 }
727 727
728 static void xen_do_pin(unsigned level, unsigned long pfn) 728 static void xen_do_pin(unsigned level, unsigned long pfn)
729 { 729 {
730 struct mmuext_op op; 730 struct mmuext_op op;
731 731
732 op.cmd = level; 732 op.cmd = level;
733 op.arg1.mfn = pfn_to_mfn(pfn); 733 op.arg1.mfn = pfn_to_mfn(pfn);
734 734
735 xen_extend_mmuext_op(&op); 735 xen_extend_mmuext_op(&op);
736 } 736 }
737 737
738 static int xen_pin_page(struct mm_struct *mm, struct page *page, 738 static int xen_pin_page(struct mm_struct *mm, struct page *page,
739 enum pt_level level) 739 enum pt_level level)
740 { 740 {
741 unsigned pgfl = TestSetPagePinned(page); 741 unsigned pgfl = TestSetPagePinned(page);
742 int flush; 742 int flush;
743 743
744 if (pgfl) 744 if (pgfl)
745 flush = 0; /* already pinned */ 745 flush = 0; /* already pinned */
746 else if (PageHighMem(page)) 746 else if (PageHighMem(page))
747 /* kmaps need flushing if we found an unpinned 747 /* kmaps need flushing if we found an unpinned
748 highpage */ 748 highpage */
749 flush = 1; 749 flush = 1;
750 else { 750 else {
751 void *pt = lowmem_page_address(page); 751 void *pt = lowmem_page_address(page);
752 unsigned long pfn = page_to_pfn(page); 752 unsigned long pfn = page_to_pfn(page);
753 struct multicall_space mcs = __xen_mc_entry(0); 753 struct multicall_space mcs = __xen_mc_entry(0);
754 spinlock_t *ptl; 754 spinlock_t *ptl;
755 755
756 flush = 0; 756 flush = 0;
757 757
758 /* 758 /*
759 * We need to hold the pagetable lock between the time 759 * We need to hold the pagetable lock between the time
760 * we make the pagetable RO and when we actually pin 760 * we make the pagetable RO and when we actually pin
761 * it. If we don't, then other users may come in and 761 * it. If we don't, then other users may come in and
762 * attempt to update the pagetable by writing it, 762 * attempt to update the pagetable by writing it,
763 * which will fail because the memory is RO but not 763 * which will fail because the memory is RO but not
764 * pinned, so Xen won't do the trap'n'emulate. 764 * pinned, so Xen won't do the trap'n'emulate.
765 * 765 *
766 * If we're using split pte locks, we can't hold the 766 * If we're using split pte locks, we can't hold the
767 * entire pagetable's worth of locks during the 767 * entire pagetable's worth of locks during the
768 * traverse, because we may wrap the preempt count (8 768 * traverse, because we may wrap the preempt count (8
769 * bits). The solution is to mark RO and pin each PTE 769 * bits). The solution is to mark RO and pin each PTE
770 * page while holding the lock. This means the number 770 * page while holding the lock. This means the number
771 * of locks we end up holding is never more than a 771 * of locks we end up holding is never more than a
772 * batch size (~32 entries, at present). 772 * batch size (~32 entries, at present).
773 * 773 *
774 * If we're not using split pte locks, we needn't pin 774 * If we're not using split pte locks, we needn't pin
775 * the PTE pages independently, because we're 775 * the PTE pages independently, because we're
776 * protected by the overall pagetable lock. 776 * protected by the overall pagetable lock.
777 */ 777 */
778 ptl = NULL; 778 ptl = NULL;
779 if (level == PT_PTE) 779 if (level == PT_PTE)
780 ptl = xen_pte_lock(page, mm); 780 ptl = xen_pte_lock(page, mm);
781 781
782 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 782 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
783 pfn_pte(pfn, PAGE_KERNEL_RO), 783 pfn_pte(pfn, PAGE_KERNEL_RO),
784 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 784 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
785 785
786 if (ptl) { 786 if (ptl) {
787 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); 787 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
788 788
789 /* Queue a deferred unlock for when this batch 789 /* Queue a deferred unlock for when this batch
790 is completed. */ 790 is completed. */
791 xen_mc_callback(xen_pte_unlock, ptl); 791 xen_mc_callback(xen_pte_unlock, ptl);
792 } 792 }
793 } 793 }
794 794
795 return flush; 795 return flush;
796 } 796 }
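
The constraint in the comment above is quantitative: each held split-PTE spinlock keeps preemption disabled, the preempt count only has 8 bits for that, and a full pagetable has far more than 255 PTE pages. The toy calculation below (the pagetable size is invented; the batch depth of roughly 32 comes from the comment) illustrates why locks are scoped to one batch at a time.

#include <stdio.h>

#define PREEMPT_BITS   8	/* the 8-bit window the comment refers to */
#define PREEMPT_LIMIT  (1 << PREEMPT_BITS)
#define TOY_PTE_PAGES  2048	/* hypothetical pagetable: 2048 PTE pages */
#define TOY_BATCH_SIZE 32	/* roughly one multicall batch */

int main(void)
{
	/* Variant 1: hold every PTE lock across the whole traverse. */
	int worst_all = TOY_PTE_PAGES;

	/* Variant 2: lock each PTE page only until its batch completes,
	 * so at most one batch worth of locks is held at a time. */
	int worst_batched = TOY_BATCH_SIZE;

	printf("hold-all locks: %d (overflows 8-bit count: %s)\n",
	       worst_all, worst_all >= PREEMPT_LIMIT ? "yes" : "no");
	printf("per-batch locks: %d (overflows 8-bit count: %s)\n",
	       worst_batched, worst_batched >= PREEMPT_LIMIT ? "yes" : "no");
	return 0;
}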
797 797
798 /* This is called just after a mm has been created, but it has not 798 /* This is called just after a mm has been created, but it has not
799 been used yet. We need to make sure that its pagetable is all 799 been used yet. We need to make sure that its pagetable is all
800 read-only, and can be pinned. */ 800 read-only, and can be pinned. */
801 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) 801 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
802 { 802 {
803 trace_xen_mmu_pgd_pin(mm, pgd); 803 trace_xen_mmu_pgd_pin(mm, pgd);
804 804
805 xen_mc_batch(); 805 xen_mc_batch();
806 806
807 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { 807 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
808 /* re-enable interrupts for flushing */ 808 /* re-enable interrupts for flushing */
809 xen_mc_issue(0); 809 xen_mc_issue(0);
810 810
811 kmap_flush_unused(); 811 kmap_flush_unused();
812 812
813 xen_mc_batch(); 813 xen_mc_batch();
814 } 814 }
815 815
816 #ifdef CONFIG_X86_64 816 #ifdef CONFIG_X86_64
817 { 817 {
818 pgd_t *user_pgd = xen_get_user_pgd(pgd); 818 pgd_t *user_pgd = xen_get_user_pgd(pgd);
819 819
820 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); 820 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
821 821
822 if (user_pgd) { 822 if (user_pgd) {
823 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); 823 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
824 xen_do_pin(MMUEXT_PIN_L4_TABLE, 824 xen_do_pin(MMUEXT_PIN_L4_TABLE,
825 PFN_DOWN(__pa(user_pgd))); 825 PFN_DOWN(__pa(user_pgd)));
826 } 826 }
827 } 827 }
828 #else /* CONFIG_X86_32 */ 828 #else /* CONFIG_X86_32 */
829 #ifdef CONFIG_X86_PAE 829 #ifdef CONFIG_X86_PAE
830 /* Need to make sure unshared kernel PMD is pinnable */ 830 /* Need to make sure unshared kernel PMD is pinnable */
831 xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), 831 xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
832 PT_PMD); 832 PT_PMD);
833 #endif 833 #endif
834 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 834 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
835 #endif /* CONFIG_X86_64 */ 835 #endif /* CONFIG_X86_64 */
836 xen_mc_issue(0); 836 xen_mc_issue(0);
837 } 837 }
838 838
839 static void xen_pgd_pin(struct mm_struct *mm) 839 static void xen_pgd_pin(struct mm_struct *mm)
840 { 840 {
841 __xen_pgd_pin(mm, mm->pgd); 841 __xen_pgd_pin(mm, mm->pgd);
842 } 842 }
843 843
844 /* 844 /*
845 * On save, we need to pin all pagetables to make sure they get their 845 * On save, we need to pin all pagetables to make sure they get their
846 * mfns turned into pfns. Search the list for any unpinned pgds and pin 846 * mfns turned into pfns. Search the list for any unpinned pgds and pin
847 * them (unpinned pgds are not currently in use, probably because the 847 * them (unpinned pgds are not currently in use, probably because the
848 * process is under construction or destruction). 848 * process is under construction or destruction).
849 * 849 *
850 * Expected to be called in stop_machine() ("equivalent to taking 850 * Expected to be called in stop_machine() ("equivalent to taking
851 * every spinlock in the system"), so the locking doesn't really 851 * every spinlock in the system"), so the locking doesn't really
852 * matter all that much. 852 * matter all that much.
853 */ 853 */
854 void xen_mm_pin_all(void) 854 void xen_mm_pin_all(void)
855 { 855 {
856 struct page *page; 856 struct page *page;
857 857
858 spin_lock(&pgd_lock); 858 spin_lock(&pgd_lock);
859 859
860 list_for_each_entry(page, &pgd_list, lru) { 860 list_for_each_entry(page, &pgd_list, lru) {
861 if (!PagePinned(page)) { 861 if (!PagePinned(page)) {
862 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); 862 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
863 SetPageSavePinned(page); 863 SetPageSavePinned(page);
864 } 864 }
865 } 865 }
866 866
867 spin_unlock(&pgd_lock); 867 spin_unlock(&pgd_lock);
868 } 868 }
869 869
870 /* 870 /*
871 * The init_mm pagetable is really pinned as soon as it's created, but 871 * The init_mm pagetable is really pinned as soon as it's created, but
872 * that's before we have page structures to store the bits. So do all 872 * that's before we have page structures to store the bits. So do all
873 * the book-keeping now. 873 * the book-keeping now.
874 */ 874 */
875 static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, 875 static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
876 enum pt_level level) 876 enum pt_level level)
877 { 877 {
878 SetPagePinned(page); 878 SetPagePinned(page);
879 return 0; 879 return 0;
880 } 880 }
881 881
882 static void __init xen_mark_init_mm_pinned(void) 882 static void __init xen_mark_init_mm_pinned(void)
883 { 883 {
884 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); 884 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
885 } 885 }
886 886
887 static int xen_unpin_page(struct mm_struct *mm, struct page *page, 887 static int xen_unpin_page(struct mm_struct *mm, struct page *page,
888 enum pt_level level) 888 enum pt_level level)
889 { 889 {
890 unsigned pgfl = TestClearPagePinned(page); 890 unsigned pgfl = TestClearPagePinned(page);
891 891
892 if (pgfl && !PageHighMem(page)) { 892 if (pgfl && !PageHighMem(page)) {
893 void *pt = lowmem_page_address(page); 893 void *pt = lowmem_page_address(page);
894 unsigned long pfn = page_to_pfn(page); 894 unsigned long pfn = page_to_pfn(page);
895 spinlock_t *ptl = NULL; 895 spinlock_t *ptl = NULL;
896 struct multicall_space mcs; 896 struct multicall_space mcs;
897 897
898 /* 898 /*
899 * Do the converse to pin_page. If we're using split 899 * Do the converse to pin_page. If we're using split
900 * pte locks, we must be holding the lock while 900 * pte locks, we must be holding the lock while
901 * the pte page is unpinned but still RO to prevent 901 * the pte page is unpinned but still RO to prevent
902 * concurrent updates from seeing it in this 902 * concurrent updates from seeing it in this
903 * partially-pinned state. 903 * partially-pinned state.
904 */ 904 */
905 if (level == PT_PTE) { 905 if (level == PT_PTE) {
906 ptl = xen_pte_lock(page, mm); 906 ptl = xen_pte_lock(page, mm);
907 907
908 if (ptl) 908 if (ptl)
909 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); 909 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
910 } 910 }
911 911
912 mcs = __xen_mc_entry(0); 912 mcs = __xen_mc_entry(0);
913 913
914 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 914 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
915 pfn_pte(pfn, PAGE_KERNEL), 915 pfn_pte(pfn, PAGE_KERNEL),
916 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 916 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
917 917
918 if (ptl) { 918 if (ptl) {
919 /* unlock when batch completed */ 919 /* unlock when batch completed */
920 xen_mc_callback(xen_pte_unlock, ptl); 920 xen_mc_callback(xen_pte_unlock, ptl);
921 } 921 }
922 } 922 }
923 923
924 return 0; /* never need to flush on unpin */ 924 return 0; /* never need to flush on unpin */
925 } 925 }
926 926
927 /* Release a pagetable's pages back as normal RW */ 927 /* Release a pagetable's pages back as normal RW */
928 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) 928 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
929 { 929 {
930 trace_xen_mmu_pgd_unpin(mm, pgd); 930 trace_xen_mmu_pgd_unpin(mm, pgd);
931 931
932 xen_mc_batch(); 932 xen_mc_batch();
933 933
934 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 934 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
935 935
936 #ifdef CONFIG_X86_64 936 #ifdef CONFIG_X86_64
937 { 937 {
938 pgd_t *user_pgd = xen_get_user_pgd(pgd); 938 pgd_t *user_pgd = xen_get_user_pgd(pgd);
939 939
940 if (user_pgd) { 940 if (user_pgd) {
941 xen_do_pin(MMUEXT_UNPIN_TABLE, 941 xen_do_pin(MMUEXT_UNPIN_TABLE,
942 PFN_DOWN(__pa(user_pgd))); 942 PFN_DOWN(__pa(user_pgd)));
943 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); 943 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
944 } 944 }
945 } 945 }
946 #endif 946 #endif
947 947
948 #ifdef CONFIG_X86_PAE 948 #ifdef CONFIG_X86_PAE
949 /* Need to make sure unshared kernel PMD is unpinned */ 949 /* Need to make sure unshared kernel PMD is unpinned */
950 xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), 950 xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
951 PT_PMD); 951 PT_PMD);
952 #endif 952 #endif
953 953
954 __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT); 954 __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
955 955
956 xen_mc_issue(0); 956 xen_mc_issue(0);
957 } 957 }
958 958
959 static void xen_pgd_unpin(struct mm_struct *mm) 959 static void xen_pgd_unpin(struct mm_struct *mm)
960 { 960 {
961 __xen_pgd_unpin(mm, mm->pgd); 961 __xen_pgd_unpin(mm, mm->pgd);
962 } 962 }
963 963
964 /* 964 /*
965 * On resume, undo any pinning done at save, so that the rest of the 965 * On resume, undo any pinning done at save, so that the rest of the
966 * kernel doesn't see any unexpected pinned pagetables. 966 * kernel doesn't see any unexpected pinned pagetables.
967 */ 967 */
968 void xen_mm_unpin_all(void) 968 void xen_mm_unpin_all(void)
969 { 969 {
970 struct page *page; 970 struct page *page;
971 971
972 spin_lock(&pgd_lock); 972 spin_lock(&pgd_lock);
973 973
974 list_for_each_entry(page, &pgd_list, lru) { 974 list_for_each_entry(page, &pgd_list, lru) {
975 if (PageSavePinned(page)) { 975 if (PageSavePinned(page)) {
976 BUG_ON(!PagePinned(page)); 976 BUG_ON(!PagePinned(page));
977 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); 977 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
978 ClearPageSavePinned(page); 978 ClearPageSavePinned(page);
979 } 979 }
980 } 980 }
981 981
982 spin_unlock(&pgd_lock); 982 spin_unlock(&pgd_lock);
983 } 983 }
984 984
985 static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 985 static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
986 { 986 {
987 spin_lock(&next->page_table_lock); 987 spin_lock(&next->page_table_lock);
988 xen_pgd_pin(next); 988 xen_pgd_pin(next);
989 spin_unlock(&next->page_table_lock); 989 spin_unlock(&next->page_table_lock);
990 } 990 }
991 991
992 static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 992 static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
993 { 993 {
994 spin_lock(&mm->page_table_lock); 994 spin_lock(&mm->page_table_lock);
995 xen_pgd_pin(mm); 995 xen_pgd_pin(mm);
996 spin_unlock(&mm->page_table_lock); 996 spin_unlock(&mm->page_table_lock);
997 } 997 }
998 998
999 999
1000 #ifdef CONFIG_SMP 1000 #ifdef CONFIG_SMP
1001 /* Another cpu may still have its %cr3 pointing at the pagetable, so 1001 /* Another cpu may still have its %cr3 pointing at the pagetable, so
1002 we need to repoint it somewhere else before we can unpin it. */ 1002 we need to repoint it somewhere else before we can unpin it. */
1003 static void drop_other_mm_ref(void *info) 1003 static void drop_other_mm_ref(void *info)
1004 { 1004 {
1005 struct mm_struct *mm = info; 1005 struct mm_struct *mm = info;
1006 struct mm_struct *active_mm; 1006 struct mm_struct *active_mm;
1007 1007
1008 active_mm = this_cpu_read(cpu_tlbstate.active_mm); 1008 active_mm = this_cpu_read(cpu_tlbstate.active_mm);
1009 1009
1010 if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) 1010 if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
1011 leave_mm(smp_processor_id()); 1011 leave_mm(smp_processor_id());
1012 1012
1013 /* If this cpu still has a stale cr3 reference, then make sure 1013 /* If this cpu still has a stale cr3 reference, then make sure
1014 it has been flushed. */ 1014 it has been flushed. */
1015 if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) 1015 if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
1016 load_cr3(swapper_pg_dir); 1016 load_cr3(swapper_pg_dir);
1017 } 1017 }
1018 1018
1019 static void xen_drop_mm_ref(struct mm_struct *mm) 1019 static void xen_drop_mm_ref(struct mm_struct *mm)
1020 { 1020 {
1021 cpumask_var_t mask; 1021 cpumask_var_t mask;
1022 unsigned cpu; 1022 unsigned cpu;
1023 1023
1024 if (current->active_mm == mm) { 1024 if (current->active_mm == mm) {
1025 if (current->mm == mm) 1025 if (current->mm == mm)
1026 load_cr3(swapper_pg_dir); 1026 load_cr3(swapper_pg_dir);
1027 else 1027 else
1028 leave_mm(smp_processor_id()); 1028 leave_mm(smp_processor_id());
1029 } 1029 }
1030 1030
1031 /* Get the "official" set of cpus referring to our pagetable. */ 1031 /* Get the "official" set of cpus referring to our pagetable. */
1032 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { 1032 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1033 for_each_online_cpu(cpu) { 1033 for_each_online_cpu(cpu) {
1034 if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) 1034 if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1035 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) 1035 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1036 continue; 1036 continue;
1037 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); 1037 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1038 } 1038 }
1039 return; 1039 return;
1040 } 1040 }
1041 cpumask_copy(mask, mm_cpumask(mm)); 1041 cpumask_copy(mask, mm_cpumask(mm));
1042 1042
1043 /* It's possible that a vcpu may have a stale reference to our 1043 /* It's possible that a vcpu may have a stale reference to our
1044 cr3, because it's in lazy mode, and it hasn't yet flushed 1044 cr3, because it's in lazy mode, and it hasn't yet flushed
1045 its set of pending hypercalls. In this case, we can 1045 its set of pending hypercalls. In this case, we can
1046 look at its actual current cr3 value, and force it to flush 1046 look at its actual current cr3 value, and force it to flush
1047 if needed. */ 1047 if needed. */
1048 for_each_online_cpu(cpu) { 1048 for_each_online_cpu(cpu) {
1049 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) 1049 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1050 cpumask_set_cpu(cpu, mask); 1050 cpumask_set_cpu(cpu, mask);
1051 } 1051 }
1052 1052
1053 if (!cpumask_empty(mask)) 1053 if (!cpumask_empty(mask))
1054 smp_call_function_many(mask, drop_other_mm_ref, mm, 1); 1054 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1055 free_cpumask_var(mask); 1055 free_cpumask_var(mask);
1056 } 1056 }
1057 #else 1057 #else
1058 static void xen_drop_mm_ref(struct mm_struct *mm) 1058 static void xen_drop_mm_ref(struct mm_struct *mm)
1059 { 1059 {
1060 if (current->active_mm == mm) 1060 if (current->active_mm == mm)
1061 load_cr3(swapper_pg_dir); 1061 load_cr3(swapper_pg_dir);
1062 } 1062 }
1063 #endif 1063 #endif
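
Across the two comments in the SMP path above, the IPI targets reduce to a per-cpu predicate: a cpu must be interrupted if it is in the mm's cpumask, or if its lazily cached cr3 still equals __pa(mm->pgd). The sketch below builds that mask from plain arrays standing in for the per-cpu state; the cpu count and cr3 values are invented.

#include <stdbool.h>
#include <stdio.h>

#define TOY_CPUS 4

/* Stand-ins for mm_cpumask(mm) and per_cpu(xen_current_cr3, cpu). */
static bool in_mm_cpumask[TOY_CPUS] = { true, false, false, true };
static unsigned long cpu_cr3[TOY_CPUS] = { 0x1000, 0x2000, 0x1000, 0x3000 };

int main(void)
{
	unsigned long mm_pgd_pa = 0x1000;	/* __pa(mm->pgd), invented */
	bool needs_ipi[TOY_CPUS];

	for (int cpu = 0; cpu < TOY_CPUS; cpu++) {
		/* Official users of the pagetable... */
		bool official = in_mm_cpumask[cpu];
		/* ...plus lazy-mode cpus with a stale cr3 reference. */
		bool stale = cpu_cr3[cpu] == mm_pgd_pa;

		needs_ipi[cpu] = official || stale;
		printf("cpu%d: official=%d stale=%d -> IPI=%d\n",
		       cpu, official, stale, needs_ipi[cpu]);
	}
	return 0;
}

When allocating the cpumask fails, the code above falls back to issuing smp_call_function_single() per matching cpu instead of one smp_call_function_many() over the assembled mask.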
1064 1064
1065 /* 1065 /*
1066 * While a process runs, Xen pins its pagetables, which means that the 1066 * While a process runs, Xen pins its pagetables, which means that the
1067 * hypervisor forces it to be read-only, and it controls all updates 1067 * hypervisor forces it to be read-only, and it controls all updates
1068 * to it. This means that all pagetable updates have to go via the 1068 * to it. This means that all pagetable updates have to go via the
1069 * hypervisor, which is moderately expensive. 1069 * hypervisor, which is moderately expensive.
1070 * 1070 *
1071 * Since we're pulling the pagetable down, we switch to use init_mm, 1071 * Since we're pulling the pagetable down, we switch to use init_mm,
1072 * unpin old process pagetable and mark it all read-write, which 1072 * unpin old process pagetable and mark it all read-write, which
1073 * allows further operations on it to be simple memory accesses. 1073 * allows further operations on it to be simple memory accesses.
1074 * 1074 *
1075 * The only subtle point is that another CPU may be still using the 1075 * The only subtle point is that another CPU may be still using the
1076 * pagetable because of lazy tlb flushing. This means we need to 1076 * pagetable because of lazy tlb flushing. This means we need to
1077 * switch all CPUs off this pagetable before we can unpin it. 1077 * switch all CPUs off this pagetable before we can unpin it.
1078 */ 1078 */
1079 static void xen_exit_mmap(struct mm_struct *mm) 1079 static void xen_exit_mmap(struct mm_struct *mm)
1080 { 1080 {
1081 get_cpu(); /* make sure we don't move around */ 1081 get_cpu(); /* make sure we don't move around */
1082 xen_drop_mm_ref(mm); 1082 xen_drop_mm_ref(mm);
1083 put_cpu(); 1083 put_cpu();
1084 1084
1085 spin_lock(&mm->page_table_lock); 1085 spin_lock(&mm->page_table_lock);
1086 1086
1087 /* pgd may not be pinned in the error exit path of execve */ 1087 /* pgd may not be pinned in the error exit path of execve */
1088 if (xen_page_pinned(mm->pgd)) 1088 if (xen_page_pinned(mm->pgd))
1089 xen_pgd_unpin(mm); 1089 xen_pgd_unpin(mm);
1090 1090
1091 spin_unlock(&mm->page_table_lock); 1091 spin_unlock(&mm->page_table_lock);
1092 } 1092 }
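
The teardown order matters: only after every other cpu has been steered off the pagetable can it be unpinned and handed back as ordinary read-write memory. The toy model below captures just that two-step ordering; the user counter is an invented stand-in for the per-cpu cr3 state.

#include <assert.h>
#include <stdio.h>

/* Toy pagetable: "pinned" means read-only and hypervisor-controlled,
 * "users" counts cpus that might still have it loaded in cr3. */
struct toy_pgtable {
	int users;
	int pinned;
};

static void toy_drop_refs(struct toy_pgtable *pt)
{
	/* Plays the role of xen_drop_mm_ref(): IPI other cpus until done. */
	pt->users = 0;
}

static void toy_unpin(struct toy_pgtable *pt)
{
	/* Unpinning while some cpu still runs on it would be a bug. */
	assert(pt->users == 0);
	pt->pinned = 0;
}

int main(void)
{
	struct toy_pgtable pt = { .users = 2, .pinned = 1 };

	toy_drop_refs(&pt);	/* step 1: no cpu points at it any more */
	toy_unpin(&pt);		/* step 2: now it can become plain RW memory */
	printf("users=%d pinned=%d\n", pt.users, pt.pinned);
	return 0;
}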
1093 1093
1094 static void xen_post_allocator_init(void); 1094 static void xen_post_allocator_init(void);
1095 1095
1096 #ifdef CONFIG_X86_64 1096 #ifdef CONFIG_X86_64
1097 static void __init xen_cleanhighmap(unsigned long vaddr, 1097 static void __init xen_cleanhighmap(unsigned long vaddr,
1098 unsigned long vaddr_end) 1098 unsigned long vaddr_end)
1099 { 1099 {
1100 unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; 1100 unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
1101 pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); 1101 pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
1102 1102
1103 /* NOTE: The loop is more greedy than the cleanup_highmap variant. 1103 /* NOTE: The loop is more greedy than the cleanup_highmap variant.
1104 * We include the PMD passed in on _both_ boundaries. */ 1104 * We include the PMD passed in on _both_ boundaries. */
1105 for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE)); 1105 for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE));
1106 pmd++, vaddr += PMD_SIZE) { 1106 pmd++, vaddr += PMD_SIZE) {
1107 if (pmd_none(*pmd)) 1107 if (pmd_none(*pmd))
1108 continue; 1108 continue;
1109 if (vaddr < (unsigned long) _text || vaddr > kernel_end) 1109 if (vaddr < (unsigned long) _text || vaddr > kernel_end)
1110 set_pmd(pmd, __pmd(0)); 1110 set_pmd(pmd, __pmd(0));
1111 } 1111 }
1112 /* In case we did something silly, we should crash in this function 1112 /* In case we did something silly, we should crash in this function
1113 * instead of somewhere later and be confusing. */ 1113 * instead of somewhere later and be confusing. */
1114 xen_mc_flush(); 1114 xen_mc_flush();
1115 } 1115 }
1116 static void __init xen_pagetable_p2m_copy(void) 1116
1117 static void __init xen_pagetable_p2m_free(void)
1117 { 1118 {
1118 unsigned long size; 1119 unsigned long size;
1119 unsigned long addr; 1120 unsigned long addr;
1120 unsigned long new_mfn_list;
1121 1121
1122 if (xen_feature(XENFEAT_auto_translated_physmap))
1123 return;
1124
1125 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 1122 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1126 1123
1127 new_mfn_list = xen_revector_p2m_tree();
1128 /* No memory or already called. */ 1124 /* No memory or already called. */
1129 if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list) 1125 if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list)
1130 return; 1126 return;
1131 1127
1132 /* using __ka address and sticking INVALID_P2M_ENTRY! */ 1128 /* using __ka address and sticking INVALID_P2M_ENTRY! */
1133 memset((void *)xen_start_info->mfn_list, 0xff, size); 1129 memset((void *)xen_start_info->mfn_list, 0xff, size);
1134 1130
1135 /* We should be in __ka space. */ 1131 /* We should be in __ka space. */
1136 BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); 1132 BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
1137 addr = xen_start_info->mfn_list; 1133 addr = xen_start_info->mfn_list;
1138 /* We roundup to the PMD, which means that if anybody at this stage is 1134 /* We roundup to the PMD, which means that if anybody at this stage is
1139 * using the __ka address of xen_start_info or xen_start_info->shared_info 1135 * using the __ka address of xen_start_info or xen_start_info->shared_info
1140 * they are going to crash. Fortunately we have already revectored 1136 * they are going to crash. Fortunately we have already revectored
1141 * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ 1137 * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
1142 size = roundup(size, PMD_SIZE); 1138 size = roundup(size, PMD_SIZE);
1143 xen_cleanhighmap(addr, addr + size); 1139 xen_cleanhighmap(addr, addr + size);
1144 1140
1145 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 1141 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1146 memblock_free(__pa(xen_start_info->mfn_list), size); 1142 memblock_free(__pa(xen_start_info->mfn_list), size);
1147 /* And revector! Bye bye old array */
1148 xen_start_info->mfn_list = new_mfn_list;
1149 1143
1150 /* At this stage, cleanup_highmap has already cleaned __ka space 1144 /* At this stage, cleanup_highmap has already cleaned __ka space
1151 * from _brk_limit way up to the max_pfn_mapped (which is the end of 1145 * from _brk_limit way up to the max_pfn_mapped (which is the end of
1152 * the ramdisk). We continue on, erasing PMD entries that point to page 1146 * the ramdisk). We continue on, erasing PMD entries that point to page
1153 * tables - do note that they are accessible at this stage via __va. 1147 * tables - do note that they are accessible at this stage via __va.
1154 * For good measure we also round up to the PMD - which means that if 1148 * For good measure we also round up to the PMD - which means that if
1155 * anybody is using __ka address to the initial boot-stack - and try 1149 * anybody is using __ka address to the initial boot-stack - and try
1156 * to use it - they are going to crash. The xen_start_info has been 1150 * to use it - they are going to crash. The xen_start_info has been
1157 * taken care of already in xen_setup_kernel_pagetable. */ 1151 * taken care of already in xen_setup_kernel_pagetable. */
1158 addr = xen_start_info->pt_base; 1152 addr = xen_start_info->pt_base;
1159 size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); 1153 size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
1160 1154
1161 xen_cleanhighmap(addr, addr + size); 1155 xen_cleanhighmap(addr, addr + size);
1162 xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); 1156 xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
1163 #ifdef DEBUG 1157 #ifdef DEBUG
1164 /* This is superfluous and is not necessary, but you know what 1158 /* This is superfluous and is not necessary, but you know what
1165 * let's do it. The MODULES_VADDR -> MODULES_END should be clear of 1159 * let's do it. The MODULES_VADDR -> MODULES_END should be clear of
1166 * anything at this stage. */ 1160 * anything at this stage. */
1167 xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); 1161 xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
1168 #endif 1162 #endif
1169 } 1163 }
1170 #endif 1164 #endif
1171 1165
1172 static void __init xen_pagetable_init(void) 1166 static void __init xen_pagetable_p2m_setup(void)
1173 { 1167 {
1174 paging_init(); 1168 if (xen_feature(XENFEAT_auto_translated_physmap))
1169 return;
1170
1171 xen_vmalloc_p2m_tree();
1172
1175 #ifdef CONFIG_X86_64 1173 #ifdef CONFIG_X86_64
1176 xen_pagetable_p2m_copy(); 1174 xen_pagetable_p2m_free();
1177 #endif 1175 #endif
1176 /* And revector! Bye bye old array */
1177 xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
1178 }
1179
1180 static void __init xen_pagetable_init(void)
1181 {
1182 paging_init();
1183 xen_post_allocator_init();
1184
1185 xen_pagetable_p2m_setup();
1186
1178 /* Allocate and initialize top and mid mfn levels for p2m structure */ 1187 /* Allocate and initialize top and mid mfn levels for p2m structure */
1179 xen_build_mfn_list_list(); 1188 xen_build_mfn_list_list();
1180 1189
1190 /* Remap memory freed due to conflicts with E820 map */
1191 if (!xen_feature(XENFEAT_auto_translated_physmap))
1192 xen_remap_memory();
1193
1181 xen_setup_shared_info(); 1194 xen_setup_shared_info();
1182 xen_post_allocator_init();
1183 } 1195 }
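
This hunk replaces the old copy-and-revector of the three-level mfn_list with a p2m presented as one virtually contiguous array: xen_vmalloc_p2m_tree() sets it up, xen_pagetable_p2m_free() releases the boot-time copy, and xen_start_info->mfn_list is pointed at xen_p2m_addr. In that layout a pfn-to-mfn lookup is essentially a bounds check plus an array index. The user-space sketch below models only that lookup shape; the sizes and values are invented and this is not the kernel's actual __pfn_to_mfn() implementation.

#include <stdio.h>

#define TOY_INVALID_P2M (~0UL)
#define TOY_NR_PAGES    8

/* Stand-in for the virtually contiguous p2m list at xen_p2m_addr. */
static unsigned long toy_p2m_addr[TOY_NR_PAGES] = {
	100, 101, 102, TOY_INVALID_P2M, 104, 105, 106, 107
};

/* Rough shape of a linear-list lookup: bounds check, then a plain
 * array dereference. */
static unsigned long toy_pfn_to_mfn(unsigned long pfn)
{
	if (pfn >= TOY_NR_PAGES)
		return TOY_INVALID_P2M;
	return toy_p2m_addr[pfn];
}

int main(void)
{
	for (unsigned long pfn = 0; pfn < TOY_NR_PAGES + 1; pfn++)
		printf("pfn %lu -> mfn %#lx\n", pfn, toy_pfn_to_mfn(pfn));
	return 0;
}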
1184 static void xen_write_cr2(unsigned long cr2) 1196 static void xen_write_cr2(unsigned long cr2)
1185 { 1197 {
1186 this_cpu_read(xen_vcpu)->arch.cr2 = cr2; 1198 this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
1187 } 1199 }
1188 1200
1189 static unsigned long xen_read_cr2(void) 1201 static unsigned long xen_read_cr2(void)
1190 { 1202 {
1191 return this_cpu_read(xen_vcpu)->arch.cr2; 1203 return this_cpu_read(xen_vcpu)->arch.cr2;
1192 } 1204 }
1193 1205
1194 unsigned long xen_read_cr2_direct(void) 1206 unsigned long xen_read_cr2_direct(void)
1195 { 1207 {
1196 return this_cpu_read(xen_vcpu_info.arch.cr2); 1208 return this_cpu_read(xen_vcpu_info.arch.cr2);
1197 } 1209 }
1198 1210
1199 void xen_flush_tlb_all(void) 1211 void xen_flush_tlb_all(void)
1200 { 1212 {
1201 struct mmuext_op *op; 1213 struct mmuext_op *op;
1202 struct multicall_space mcs; 1214 struct multicall_space mcs;
1203 1215
1204 trace_xen_mmu_flush_tlb_all(0); 1216 trace_xen_mmu_flush_tlb_all(0);
1205 1217
1206 preempt_disable(); 1218 preempt_disable();
1207 1219
1208 mcs = xen_mc_entry(sizeof(*op)); 1220 mcs = xen_mc_entry(sizeof(*op));
1209 1221
1210 op = mcs.args; 1222 op = mcs.args;
1211 op->cmd = MMUEXT_TLB_FLUSH_ALL; 1223 op->cmd = MMUEXT_TLB_FLUSH_ALL;
1212 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 1224 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1213 1225
1214 xen_mc_issue(PARAVIRT_LAZY_MMU); 1226 xen_mc_issue(PARAVIRT_LAZY_MMU);
1215 1227
1216 preempt_enable(); 1228 preempt_enable();
1217 } 1229 }
1218 static void xen_flush_tlb(void) 1230 static void xen_flush_tlb(void)
1219 { 1231 {
1220 struct mmuext_op *op; 1232 struct mmuext_op *op;
1221 struct multicall_space mcs; 1233 struct multicall_space mcs;
1222 1234
1223 trace_xen_mmu_flush_tlb(0); 1235 trace_xen_mmu_flush_tlb(0);
1224 1236
1225 preempt_disable(); 1237 preempt_disable();
1226 1238
1227 mcs = xen_mc_entry(sizeof(*op)); 1239 mcs = xen_mc_entry(sizeof(*op));
1228 1240
1229 op = mcs.args; 1241 op = mcs.args;
1230 op->cmd = MMUEXT_TLB_FLUSH_LOCAL; 1242 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1231 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 1243 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1232 1244
1233 xen_mc_issue(PARAVIRT_LAZY_MMU); 1245 xen_mc_issue(PARAVIRT_LAZY_MMU);
1234 1246
1235 preempt_enable(); 1247 preempt_enable();
1236 } 1248 }
1237 1249
1238 static void xen_flush_tlb_single(unsigned long addr) 1250 static void xen_flush_tlb_single(unsigned long addr)
1239 { 1251 {
1240 struct mmuext_op *op; 1252 struct mmuext_op *op;
1241 struct multicall_space mcs; 1253 struct multicall_space mcs;
1242 1254
1243 trace_xen_mmu_flush_tlb_single(addr); 1255 trace_xen_mmu_flush_tlb_single(addr);
1244 1256
1245 preempt_disable(); 1257 preempt_disable();
1246 1258
1247 mcs = xen_mc_entry(sizeof(*op)); 1259 mcs = xen_mc_entry(sizeof(*op));
1248 op = mcs.args; 1260 op = mcs.args;
1249 op->cmd = MMUEXT_INVLPG_LOCAL; 1261 op->cmd = MMUEXT_INVLPG_LOCAL;
1250 op->arg1.linear_addr = addr & PAGE_MASK; 1262 op->arg1.linear_addr = addr & PAGE_MASK;
1251 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 1263 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1252 1264
1253 xen_mc_issue(PARAVIRT_LAZY_MMU); 1265 xen_mc_issue(PARAVIRT_LAZY_MMU);
1254 1266
1255 preempt_enable(); 1267 preempt_enable();
1256 } 1268 }
1257 1269
1258 static void xen_flush_tlb_others(const struct cpumask *cpus, 1270 static void xen_flush_tlb_others(const struct cpumask *cpus,
1259 struct mm_struct *mm, unsigned long start, 1271 struct mm_struct *mm, unsigned long start,
1260 unsigned long end) 1272 unsigned long end)
1261 { 1273 {
1262 struct { 1274 struct {
1263 struct mmuext_op op; 1275 struct mmuext_op op;
1264 #ifdef CONFIG_SMP 1276 #ifdef CONFIG_SMP
1265 DECLARE_BITMAP(mask, num_processors); 1277 DECLARE_BITMAP(mask, num_processors);
1266 #else 1278 #else
1267 DECLARE_BITMAP(mask, NR_CPUS); 1279 DECLARE_BITMAP(mask, NR_CPUS);
1268 #endif 1280 #endif
1269 } *args; 1281 } *args;
1270 struct multicall_space mcs; 1282 struct multicall_space mcs;
1271 1283
1272 trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); 1284 trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
1273 1285
1274 if (cpumask_empty(cpus)) 1286 if (cpumask_empty(cpus))
1275 return; /* nothing to do */ 1287 return; /* nothing to do */
1276 1288
1277 mcs = xen_mc_entry(sizeof(*args)); 1289 mcs = xen_mc_entry(sizeof(*args));
1278 args = mcs.args; 1290 args = mcs.args;
1279 args->op.arg2.vcpumask = to_cpumask(args->mask); 1291 args->op.arg2.vcpumask = to_cpumask(args->mask);
1280 1292
1281 /* Remove us, and any offline CPUS. */ 1293 /* Remove us, and any offline CPUS. */
1282 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); 1294 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1283 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); 1295 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1284 1296
1285 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; 1297 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1286 if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { 1298 if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
1287 args->op.cmd = MMUEXT_INVLPG_MULTI; 1299 args->op.cmd = MMUEXT_INVLPG_MULTI;
1288 args->op.arg1.linear_addr = start; 1300 args->op.arg1.linear_addr = start;
1289 } 1301 }
1290 1302
1291 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); 1303 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1292 1304
1293 xen_mc_issue(PARAVIRT_LAZY_MMU); 1305 xen_mc_issue(PARAVIRT_LAZY_MMU);
1294 } 1306 }
1295 1307
1296 static unsigned long xen_read_cr3(void) 1308 static unsigned long xen_read_cr3(void)
1297 { 1309 {
1298 return this_cpu_read(xen_cr3); 1310 return this_cpu_read(xen_cr3);
1299 } 1311 }
1300 1312
1301 static void set_current_cr3(void *v) 1313 static void set_current_cr3(void *v)
1302 { 1314 {
1303 this_cpu_write(xen_current_cr3, (unsigned long)v); 1315 this_cpu_write(xen_current_cr3, (unsigned long)v);
1304 } 1316 }
1305 1317
1306 static void __xen_write_cr3(bool kernel, unsigned long cr3) 1318 static void __xen_write_cr3(bool kernel, unsigned long cr3)
1307 { 1319 {
1308 struct mmuext_op op; 1320 struct mmuext_op op;
1309 unsigned long mfn; 1321 unsigned long mfn;
1310 1322
1311 trace_xen_mmu_write_cr3(kernel, cr3); 1323 trace_xen_mmu_write_cr3(kernel, cr3);
1312 1324
1313 if (cr3) 1325 if (cr3)
1314 mfn = pfn_to_mfn(PFN_DOWN(cr3)); 1326 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1315 else 1327 else
1316 mfn = 0; 1328 mfn = 0;
1317 1329
1318 WARN_ON(mfn == 0 && kernel); 1330 WARN_ON(mfn == 0 && kernel);
1319 1331
1320 op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; 1332 op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1321 op.arg1.mfn = mfn; 1333 op.arg1.mfn = mfn;
1322 1334
1323 xen_extend_mmuext_op(&op); 1335 xen_extend_mmuext_op(&op);
1324 1336
1325 if (kernel) { 1337 if (kernel) {
1326 this_cpu_write(xen_cr3, cr3); 1338 this_cpu_write(xen_cr3, cr3);
1327 1339
1328 /* Update xen_current_cr3 once the batch has actually 1340 /* Update xen_current_cr3 once the batch has actually
1329 been submitted. */ 1341 been submitted. */
1330 xen_mc_callback(set_current_cr3, (void *)cr3); 1342 xen_mc_callback(set_current_cr3, (void *)cr3);
1331 } 1343 }
1332 } 1344 }
1333 static void xen_write_cr3(unsigned long cr3) 1345 static void xen_write_cr3(unsigned long cr3)
1334 { 1346 {
1335 BUG_ON(preemptible()); 1347 BUG_ON(preemptible());
1336 1348
1337 xen_mc_batch(); /* disables interrupts */ 1349 xen_mc_batch(); /* disables interrupts */
1338 1350
1339 /* Update while interrupts are disabled, so it's atomic with 1351 /* Update while interrupts are disabled, so it's atomic with
1340 respect to IPIs */ 1352 respect to IPIs */
1341 this_cpu_write(xen_cr3, cr3); 1353 this_cpu_write(xen_cr3, cr3);
1342 1354
1343 __xen_write_cr3(true, cr3); 1355 __xen_write_cr3(true, cr3);
1344 1356
1345 #ifdef CONFIG_X86_64 1357 #ifdef CONFIG_X86_64
1346 { 1358 {
1347 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); 1359 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1348 if (user_pgd) 1360 if (user_pgd)
1349 __xen_write_cr3(false, __pa(user_pgd)); 1361 __xen_write_cr3(false, __pa(user_pgd));
1350 else 1362 else
1351 __xen_write_cr3(false, 0); 1363 __xen_write_cr3(false, 0);
1352 } 1364 }
1353 #endif 1365 #endif
1354 1366
1355 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 1367 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1356 } 1368 }
1357 1369
1358 #ifdef CONFIG_X86_64 1370 #ifdef CONFIG_X86_64
1359 /* 1371 /*
1360 * At the start of the day - when Xen launches a guest, it has already 1372 * At the start of the day - when Xen launches a guest, it has already
1361 * built pagetables for the guest. We diligently look over them 1373 * built pagetables for the guest. We diligently look over them
1362 * in xen_setup_kernel_pagetable and graft them, as appropriate, into the 1374 * in xen_setup_kernel_pagetable and graft them, as appropriate, into the
1363 * init_level4_pgt and its friends. Then when we are happy we load 1375 * init_level4_pgt and its friends. Then when we are happy we load
1364 * the new init_level4_pgt - and continue on. 1376 * the new init_level4_pgt - and continue on.
1365 * 1377 *
1366 * The generic code starts (start_kernel) and 'init_mem_mapping' sets 1378 * The generic code starts (start_kernel) and 'init_mem_mapping' sets
1367 * up the rest of the pagetables. When it has completed it loads the cr3. 1379 * up the rest of the pagetables. When it has completed it loads the cr3.
1368 * N.B. that baremetal would start at 'start_kernel' (and the early 1380 * N.B. that baremetal would start at 'start_kernel' (and the early
1369 * #PF handler would create bootstrap pagetables) - so we are running 1381 * #PF handler would create bootstrap pagetables) - so we are running
1370 * with the same assumptions as what to do when write_cr3 is executed 1382 * with the same assumptions as what to do when write_cr3 is executed
1371 * at this point. 1383 * at this point.
1372 * 1384 *
1373 * Since there are no user-page tables at all, we have two variants 1385 * Since there are no user-page tables at all, we have two variants
1374 * of xen_write_cr3 - the early bootup (this one), and the late one 1386 * of xen_write_cr3 - the early bootup (this one), and the late one
1375 * (xen_write_cr3). The reason we have to do that is that in 64-bit 1387 * (xen_write_cr3). The reason we have to do that is that in 64-bit
1376 * the Linux kernel and user-space are both in ring 3 while the 1388 * the Linux kernel and user-space are both in ring 3 while the
1377 * hypervisor is in ring 0. 1389 * hypervisor is in ring 0.
1378 */ 1390 */
1379 static void __init xen_write_cr3_init(unsigned long cr3) 1391 static void __init xen_write_cr3_init(unsigned long cr3)
1380 { 1392 {
1381 BUG_ON(preemptible()); 1393 BUG_ON(preemptible());
1382 1394
1383 xen_mc_batch(); /* disables interrupts */ 1395 xen_mc_batch(); /* disables interrupts */
1384 1396
1385 /* Update while interrupts are disabled, so it's atomic with 1397 /* Update while interrupts are disabled, so it's atomic with
1386 respect to IPIs */ 1398 respect to IPIs */
1387 this_cpu_write(xen_cr3, cr3); 1399 this_cpu_write(xen_cr3, cr3);
1388 1400
1389 __xen_write_cr3(true, cr3); 1401 __xen_write_cr3(true, cr3);
1390 1402
1391 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 1403 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1392 } 1404 }
1393 #endif 1405 #endif
1394 1406
1395 static int xen_pgd_alloc(struct mm_struct *mm) 1407 static int xen_pgd_alloc(struct mm_struct *mm)
1396 { 1408 {
1397 pgd_t *pgd = mm->pgd; 1409 pgd_t *pgd = mm->pgd;
1398 int ret = 0; 1410 int ret = 0;
1399 1411
1400 BUG_ON(PagePinned(virt_to_page(pgd))); 1412 BUG_ON(PagePinned(virt_to_page(pgd)));
1401 1413
1402 #ifdef CONFIG_X86_64 1414 #ifdef CONFIG_X86_64
1403 { 1415 {
1404 struct page *page = virt_to_page(pgd); 1416 struct page *page = virt_to_page(pgd);
1405 pgd_t *user_pgd; 1417 pgd_t *user_pgd;
1406 1418
1407 BUG_ON(page->private != 0); 1419 BUG_ON(page->private != 0);
1408 1420
1409 ret = -ENOMEM; 1421 ret = -ENOMEM;
1410 1422
1411 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 1423 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1412 page->private = (unsigned long)user_pgd; 1424 page->private = (unsigned long)user_pgd;
1413 1425
1414 if (user_pgd != NULL) { 1426 if (user_pgd != NULL) {
1415 #ifdef CONFIG_X86_VSYSCALL_EMULATION 1427 #ifdef CONFIG_X86_VSYSCALL_EMULATION
1416 user_pgd[pgd_index(VSYSCALL_ADDR)] = 1428 user_pgd[pgd_index(VSYSCALL_ADDR)] =
1417 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); 1429 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1418 #endif 1430 #endif
1419 ret = 0; 1431 ret = 0;
1420 } 1432 }
1421 1433
1422 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); 1434 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1423 } 1435 }
1424 #endif 1436 #endif
1425 1437
1426 return ret; 1438 return ret;
1427 } 1439 }
1428 1440
1429 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) 1441 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1430 { 1442 {
1431 #ifdef CONFIG_X86_64 1443 #ifdef CONFIG_X86_64
1432 pgd_t *user_pgd = xen_get_user_pgd(pgd); 1444 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1433 1445
1434 if (user_pgd) 1446 if (user_pgd)
1435 free_page((unsigned long)user_pgd); 1447 free_page((unsigned long)user_pgd);
1436 #endif 1448 #endif
1437 } 1449 }
1438 1450
1439 #ifdef CONFIG_X86_32 1451 #ifdef CONFIG_X86_32
1440 static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) 1452 static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1441 { 1453 {
1442 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 1454 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1443 if (pte_val_ma(*ptep) & _PAGE_PRESENT) 1455 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1444 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & 1456 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1445 pte_val_ma(pte)); 1457 pte_val_ma(pte));
1446 1458
1447 return pte; 1459 return pte;
1448 } 1460 }
1449 #else /* CONFIG_X86_64 */ 1461 #else /* CONFIG_X86_64 */
1450 static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) 1462 static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1451 { 1463 {
1452 return pte; 1464 return pte;
1453 } 1465 }
1454 #endif /* CONFIG_X86_64 */ 1466 #endif /* CONFIG_X86_64 */
1455 1467
1456 /* 1468 /*
1457 * Init-time set_pte while constructing initial pagetables, which 1469 * Init-time set_pte while constructing initial pagetables, which
1458 * doesn't allow RO page table pages to be remapped RW. 1470 * doesn't allow RO page table pages to be remapped RW.
1459 * 1471 *
1460 * If there is no MFN for this PFN then this page is initially 1472 * If there is no MFN for this PFN then this page is initially
1461 * ballooned out so clear the PTE (as in decrease_reservation() in 1473 * ballooned out so clear the PTE (as in decrease_reservation() in
1462 * drivers/xen/balloon.c). 1474 * drivers/xen/balloon.c).
1463 * 1475 *
1464 * Many of these PTE updates are done on unpinned and writable pages 1476 * Many of these PTE updates are done on unpinned and writable pages
1465 * and doing a hypercall for these is unnecessary and expensive. At 1477 * and doing a hypercall for these is unnecessary and expensive. At
1466 * this point it is not possible to tell if a page is pinned or not, 1478 * this point it is not possible to tell if a page is pinned or not,
1467 * so always write the PTE directly and rely on Xen trapping and 1479 * so always write the PTE directly and rely on Xen trapping and
1468 * emulating any updates as necessary. 1480 * emulating any updates as necessary.
1469 */ 1481 */
1470 static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) 1482 static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1471 { 1483 {
1472 if (pte_mfn(pte) != INVALID_P2M_ENTRY) 1484 if (pte_mfn(pte) != INVALID_P2M_ENTRY)
1473 pte = mask_rw_pte(ptep, pte); 1485 pte = mask_rw_pte(ptep, pte);
1474 else 1486 else
1475 pte = __pte_ma(0); 1487 pte = __pte_ma(0);
1476 1488
1477 native_set_pte(ptep, pte); 1489 native_set_pte(ptep, pte);
1478 } 1490 }
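
The masking expression in the 32-bit mask_rw_pte() above is terse. Below is a minimal standalone sketch of the same logic, assuming the x86 bit layout where _PAGE_PRESENT is bit 0 and _PAGE_RW is bit 1; the helper name and the example values are illustrative only, not part of the patch.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t pteval_t;
    #define _PAGE_PRESENT ((pteval_t)1 << 0)
    #define _PAGE_RW      ((pteval_t)1 << 1)

    /* If the existing PTE is present, the new value may keep _PAGE_RW only
     * if the old value already had it; otherwise the new value is unchanged. */
    static pteval_t mask_rw(pteval_t old, pteval_t new)
    {
            if (old & _PAGE_PRESENT)
                    new &= (old & _PAGE_RW) | ~_PAGE_RW;
            return new;
    }

    int main(void)
    {
            pteval_t ro_old = _PAGE_PRESENT;            /* present, read-only */
            pteval_t rw_new = _PAGE_PRESENT | _PAGE_RW; /* asks for read-write */

            /* Prints 1: the read-only page-table page stays read-only. */
            printf("%llx\n", (unsigned long long)mask_rw(ro_old, rw_new));
            return 0;
    }

When the old value does have _PAGE_RW set, the mask becomes all ones and the new PTE passes through untouched.
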
1479 1491
1480 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) 1492 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1481 { 1493 {
1482 struct mmuext_op op; 1494 struct mmuext_op op;
1483 op.cmd = cmd; 1495 op.cmd = cmd;
1484 op.arg1.mfn = pfn_to_mfn(pfn); 1496 op.arg1.mfn = pfn_to_mfn(pfn);
1485 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) 1497 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1486 BUG(); 1498 BUG();
1487 } 1499 }
1488 1500
1489 /* Early in boot, while setting up the initial pagetable, assume 1501 /* Early in boot, while setting up the initial pagetable, assume
1490 everything is pinned. */ 1502 everything is pinned. */
1491 static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) 1503 static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1492 { 1504 {
1493 #ifdef CONFIG_FLATMEM 1505 #ifdef CONFIG_FLATMEM
1494 BUG_ON(mem_map); /* should only be used early */ 1506 BUG_ON(mem_map); /* should only be used early */
1495 #endif 1507 #endif
1496 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 1508 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1497 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 1509 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1498 } 1510 }
1499 1511
1500 /* Used for pmd and pud */ 1512 /* Used for pmd and pud */
1501 static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) 1513 static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1502 { 1514 {
1503 #ifdef CONFIG_FLATMEM 1515 #ifdef CONFIG_FLATMEM
1504 BUG_ON(mem_map); /* should only be used early */ 1516 BUG_ON(mem_map); /* should only be used early */
1505 #endif 1517 #endif
1506 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 1518 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1507 } 1519 }
1508 1520
1509 /* Early release_pte assumes that all pts are pinned, since there's 1521 /* Early release_pte assumes that all pts are pinned, since there's
1510 only init_mm and anything attached to that is pinned. */ 1522 only init_mm and anything attached to that is pinned. */
1511 static void __init xen_release_pte_init(unsigned long pfn) 1523 static void __init xen_release_pte_init(unsigned long pfn)
1512 { 1524 {
1513 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 1525 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1514 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1526 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1515 } 1527 }
1516 1528
1517 static void __init xen_release_pmd_init(unsigned long pfn) 1529 static void __init xen_release_pmd_init(unsigned long pfn)
1518 { 1530 {
1519 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1531 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1520 } 1532 }
1521 1533
1522 static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn) 1534 static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1523 { 1535 {
1524 struct multicall_space mcs; 1536 struct multicall_space mcs;
1525 struct mmuext_op *op; 1537 struct mmuext_op *op;
1526 1538
1527 mcs = __xen_mc_entry(sizeof(*op)); 1539 mcs = __xen_mc_entry(sizeof(*op));
1528 op = mcs.args; 1540 op = mcs.args;
1529 op->cmd = cmd; 1541 op->cmd = cmd;
1530 op->arg1.mfn = pfn_to_mfn(pfn); 1542 op->arg1.mfn = pfn_to_mfn(pfn);
1531 1543
1532 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 1544 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
1533 } 1545 }
1534 1546
1535 static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) 1547 static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
1536 { 1548 {
1537 struct multicall_space mcs; 1549 struct multicall_space mcs;
1538 unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT); 1550 unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
1539 1551
1540 mcs = __xen_mc_entry(0); 1552 mcs = __xen_mc_entry(0);
1541 MULTI_update_va_mapping(mcs.mc, (unsigned long)addr, 1553 MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
1542 pfn_pte(pfn, prot), 0); 1554 pfn_pte(pfn, prot), 0);
1543 } 1555 }
1544 1556
1545 /* This needs to make sure the new pte page is pinned iff it's being 1557 /* This needs to make sure the new pte page is pinned iff it's being
1546 attached to a pinned pagetable. */ 1558 attached to a pinned pagetable. */
1547 static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, 1559 static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
1548 unsigned level) 1560 unsigned level)
1549 { 1561 {
1550 bool pinned = PagePinned(virt_to_page(mm->pgd)); 1562 bool pinned = PagePinned(virt_to_page(mm->pgd));
1551 1563
1552 trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); 1564 trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
1553 1565
1554 if (pinned) { 1566 if (pinned) {
1555 struct page *page = pfn_to_page(pfn); 1567 struct page *page = pfn_to_page(pfn);
1556 1568
1557 SetPagePinned(page); 1569 SetPagePinned(page);
1558 1570
1559 if (!PageHighMem(page)) { 1571 if (!PageHighMem(page)) {
1560 xen_mc_batch(); 1572 xen_mc_batch();
1561 1573
1562 __set_pfn_prot(pfn, PAGE_KERNEL_RO); 1574 __set_pfn_prot(pfn, PAGE_KERNEL_RO);
1563 1575
1564 if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) 1576 if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1565 __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 1577 __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1566 1578
1567 xen_mc_issue(PARAVIRT_LAZY_MMU); 1579 xen_mc_issue(PARAVIRT_LAZY_MMU);
1568 } else { 1580 } else {
1569 /* make sure there are no stray mappings of 1581 /* make sure there are no stray mappings of
1570 this page */ 1582 this page */
1571 kmap_flush_unused(); 1583 kmap_flush_unused();
1572 } 1584 }
1573 } 1585 }
1574 } 1586 }
1575 1587
1576 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) 1588 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1577 { 1589 {
1578 xen_alloc_ptpage(mm, pfn, PT_PTE); 1590 xen_alloc_ptpage(mm, pfn, PT_PTE);
1579 } 1591 }
1580 1592
1581 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) 1593 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1582 { 1594 {
1583 xen_alloc_ptpage(mm, pfn, PT_PMD); 1595 xen_alloc_ptpage(mm, pfn, PT_PMD);
1584 } 1596 }
1585 1597
1586 /* This should never happen until we're OK to use struct page */ 1598 /* This should never happen until we're OK to use struct page */
1587 static inline void xen_release_ptpage(unsigned long pfn, unsigned level) 1599 static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
1588 { 1600 {
1589 struct page *page = pfn_to_page(pfn); 1601 struct page *page = pfn_to_page(pfn);
1590 bool pinned = PagePinned(page); 1602 bool pinned = PagePinned(page);
1591 1603
1592 trace_xen_mmu_release_ptpage(pfn, level, pinned); 1604 trace_xen_mmu_release_ptpage(pfn, level, pinned);
1593 1605
1594 if (pinned) { 1606 if (pinned) {
1595 if (!PageHighMem(page)) { 1607 if (!PageHighMem(page)) {
1596 xen_mc_batch(); 1608 xen_mc_batch();
1597 1609
1598 if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) 1610 if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1599 __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 1611 __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1600 1612
1601 __set_pfn_prot(pfn, PAGE_KERNEL); 1613 __set_pfn_prot(pfn, PAGE_KERNEL);
1602 1614
1603 xen_mc_issue(PARAVIRT_LAZY_MMU); 1615 xen_mc_issue(PARAVIRT_LAZY_MMU);
1604 } 1616 }
1605 ClearPagePinned(page); 1617 ClearPagePinned(page);
1606 } 1618 }
1607 } 1619 }
1608 1620
1609 static void xen_release_pte(unsigned long pfn) 1621 static void xen_release_pte(unsigned long pfn)
1610 { 1622 {
1611 xen_release_ptpage(pfn, PT_PTE); 1623 xen_release_ptpage(pfn, PT_PTE);
1612 } 1624 }
1613 1625
1614 static void xen_release_pmd(unsigned long pfn) 1626 static void xen_release_pmd(unsigned long pfn)
1615 { 1627 {
1616 xen_release_ptpage(pfn, PT_PMD); 1628 xen_release_ptpage(pfn, PT_PMD);
1617 } 1629 }
1618 1630
1619 #if PAGETABLE_LEVELS == 4 1631 #if PAGETABLE_LEVELS == 4
1620 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) 1632 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1621 { 1633 {
1622 xen_alloc_ptpage(mm, pfn, PT_PUD); 1634 xen_alloc_ptpage(mm, pfn, PT_PUD);
1623 } 1635 }
1624 1636
1625 static void xen_release_pud(unsigned long pfn) 1637 static void xen_release_pud(unsigned long pfn)
1626 { 1638 {
1627 xen_release_ptpage(pfn, PT_PUD); 1639 xen_release_ptpage(pfn, PT_PUD);
1628 } 1640 }
1629 #endif 1641 #endif
1630 1642
1631 void __init xen_reserve_top(void) 1643 void __init xen_reserve_top(void)
1632 { 1644 {
1633 #ifdef CONFIG_X86_32 1645 #ifdef CONFIG_X86_32
1634 unsigned long top = HYPERVISOR_VIRT_START; 1646 unsigned long top = HYPERVISOR_VIRT_START;
1635 struct xen_platform_parameters pp; 1647 struct xen_platform_parameters pp;
1636 1648
1637 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) 1649 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1638 top = pp.virt_start; 1650 top = pp.virt_start;
1639 1651
1640 reserve_top_address(-top); 1652 reserve_top_address(-top);
1641 #endif /* CONFIG_X86_32 */ 1653 #endif /* CONFIG_X86_32 */
1642 } 1654 }
1643 1655
1644 /* 1656 /*
1645 * Like __va(), but returns the address in the kernel mapping (which is 1657 * Like __va(), but returns the address in the kernel mapping (which is
1646 * all we have until the physical memory mapping has been set up). 1658 * all we have until the physical memory mapping has been set up).
1647 */ 1659 */
1648 static void *__ka(phys_addr_t paddr) 1660 static void *__ka(phys_addr_t paddr)
1649 { 1661 {
1650 #ifdef CONFIG_X86_64 1662 #ifdef CONFIG_X86_64
1651 return (void *)(paddr + __START_KERNEL_map); 1663 return (void *)(paddr + __START_KERNEL_map);
1652 #else 1664 #else
1653 return __va(paddr); 1665 return __va(paddr);
1654 #endif 1666 #endif
1655 } 1667 }
1656 1668
1657 /* Convert a machine address to physical address */ 1669 /* Convert a machine address to physical address */
1658 static unsigned long m2p(phys_addr_t maddr) 1670 static unsigned long m2p(phys_addr_t maddr)
1659 { 1671 {
1660 phys_addr_t paddr; 1672 phys_addr_t paddr;
1661 1673
1662 maddr &= PTE_PFN_MASK; 1674 maddr &= PTE_PFN_MASK;
1663 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; 1675 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1664 1676
1665 return paddr; 1677 return paddr;
1666 } 1678 }
1667 1679
1668 /* Convert a machine address to kernel virtual */ 1680 /* Convert a machine address to kernel virtual */
1669 static void *m2v(phys_addr_t maddr) 1681 static void *m2v(phys_addr_t maddr)
1670 { 1682 {
1671 return __ka(m2p(maddr)); 1683 return __ka(m2p(maddr));
1672 } 1684 }
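
As a rough, self-contained sketch of the conversion m2p() performs above: mask off the PTE flag bits, translate the machine frame number through the machine-to-physical table, and shift back to an address. The small lookup array stands in for the hypervisor-maintained machine_to_phys_mapping, and the values are made up for illustration.

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT   12
    #define PTE_PFN_MASK 0x000ffffffffff000ULL   /* frame bits of a 64-bit PTE */

    /* Stand-in for the M2P table: index = machine frame, value = pseudo-physical frame. */
    static const uint64_t m2p_table[] = { 0, 7, 3, 9 };

    static uint64_t m2p(uint64_t maddr)
    {
            maddr &= PTE_PFN_MASK;                        /* drop PTE flag bits */
            return m2p_table[maddr >> PAGE_SHIFT] << PAGE_SHIFT;
    }

    int main(void)
    {
            /* A machine address in frame 2, with typical PTE flags (0x63) attached. */
            uint64_t maddr = (2ULL << PAGE_SHIFT) | 0x63;

            printf("0x%llx\n", (unsigned long long)m2p(maddr));  /* prints 0x3000 */
            return 0;
    }

The page offset is deliberately discarded: the m2v() callers above feed it page-table entries (pgd/pud/pmd values), where only the frame matters.
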
1673 1685
1674 /* Set the page permissions on identity-mapped pages */ 1686 /* Set the page permissions on identity-mapped pages */
1675 static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags) 1687 static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags)
1676 { 1688 {
1677 unsigned long pfn = __pa(addr) >> PAGE_SHIFT; 1689 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1678 pte_t pte = pfn_pte(pfn, prot); 1690 pte_t pte = pfn_pte(pfn, prot);
1679 1691
1680 /* For PVH no need to set R/O or R/W to pin them or unpin them. */ 1692 /* For PVH no need to set R/O or R/W to pin them or unpin them. */
1681 if (xen_feature(XENFEAT_auto_translated_physmap)) 1693 if (xen_feature(XENFEAT_auto_translated_physmap))
1682 return; 1694 return;
1683 1695
1684 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) 1696 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
1685 BUG(); 1697 BUG();
1686 } 1698 }
1687 static void set_page_prot(void *addr, pgprot_t prot) 1699 static void set_page_prot(void *addr, pgprot_t prot)
1688 { 1700 {
1689 return set_page_prot_flags(addr, prot, UVMF_NONE); 1701 return set_page_prot_flags(addr, prot, UVMF_NONE);
1690 } 1702 }
1691 #ifdef CONFIG_X86_32 1703 #ifdef CONFIG_X86_32
1692 static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) 1704 static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1693 { 1705 {
1694 unsigned pmdidx, pteidx; 1706 unsigned pmdidx, pteidx;
1695 unsigned ident_pte; 1707 unsigned ident_pte;
1696 unsigned long pfn; 1708 unsigned long pfn;
1697 1709
1698 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, 1710 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1699 PAGE_SIZE); 1711 PAGE_SIZE);
1700 1712
1701 ident_pte = 0; 1713 ident_pte = 0;
1702 pfn = 0; 1714 pfn = 0;
1703 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { 1715 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1704 pte_t *pte_page; 1716 pte_t *pte_page;
1705 1717
1706 /* Reuse or allocate a page of ptes */ 1718 /* Reuse or allocate a page of ptes */
1707 if (pmd_present(pmd[pmdidx])) 1719 if (pmd_present(pmd[pmdidx]))
1708 pte_page = m2v(pmd[pmdidx].pmd); 1720 pte_page = m2v(pmd[pmdidx].pmd);
1709 else { 1721 else {
1710 /* Check for free pte pages */ 1722 /* Check for free pte pages */
1711 if (ident_pte == LEVEL1_IDENT_ENTRIES) 1723 if (ident_pte == LEVEL1_IDENT_ENTRIES)
1712 break; 1724 break;
1713 1725
1714 pte_page = &level1_ident_pgt[ident_pte]; 1726 pte_page = &level1_ident_pgt[ident_pte];
1715 ident_pte += PTRS_PER_PTE; 1727 ident_pte += PTRS_PER_PTE;
1716 1728
1717 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); 1729 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1718 } 1730 }
1719 1731
1720 /* Install mappings */ 1732 /* Install mappings */
1721 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { 1733 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1722 pte_t pte; 1734 pte_t pte;
1723 1735
1724 #ifdef CONFIG_X86_32 1736 #ifdef CONFIG_X86_32
1725 if (pfn > max_pfn_mapped) 1737 if (pfn > max_pfn_mapped)
1726 max_pfn_mapped = pfn; 1738 max_pfn_mapped = pfn;
1727 #endif 1739 #endif
1728 1740
1729 if (!pte_none(pte_page[pteidx])) 1741 if (!pte_none(pte_page[pteidx]))
1730 continue; 1742 continue;
1731 1743
1732 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); 1744 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1733 pte_page[pteidx] = pte; 1745 pte_page[pteidx] = pte;
1734 } 1746 }
1735 } 1747 }
1736 1748
1737 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) 1749 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1738 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); 1750 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1739 1751
1740 set_page_prot(pmd, PAGE_KERNEL_RO); 1752 set_page_prot(pmd, PAGE_KERNEL_RO);
1741 } 1753 }
1742 #endif 1754 #endif
1743 void __init xen_setup_machphys_mapping(void) 1755 void __init xen_setup_machphys_mapping(void)
1744 { 1756 {
1745 struct xen_machphys_mapping mapping; 1757 struct xen_machphys_mapping mapping;
1746 1758
1747 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { 1759 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1748 machine_to_phys_mapping = (unsigned long *)mapping.v_start; 1760 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1749 machine_to_phys_nr = mapping.max_mfn + 1; 1761 machine_to_phys_nr = mapping.max_mfn + 1;
1750 } else { 1762 } else {
1751 machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; 1763 machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
1752 } 1764 }
1753 #ifdef CONFIG_X86_32 1765 #ifdef CONFIG_X86_32
1754 WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1)) 1766 WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
1755 < machine_to_phys_mapping); 1767 < machine_to_phys_mapping);
1756 #endif 1768 #endif
1757 } 1769 }
1758 1770
1759 #ifdef CONFIG_X86_64 1771 #ifdef CONFIG_X86_64
1760 static void convert_pfn_mfn(void *v) 1772 static void convert_pfn_mfn(void *v)
1761 { 1773 {
1762 pte_t *pte = v; 1774 pte_t *pte = v;
1763 int i; 1775 int i;
1764 1776
1765 /* All levels are converted the same way, so just treat them 1777 /* All levels are converted the same way, so just treat them
1766 as ptes. */ 1778 as ptes. */
1767 for (i = 0; i < PTRS_PER_PTE; i++) 1779 for (i = 0; i < PTRS_PER_PTE; i++)
1768 pte[i] = xen_make_pte(pte[i].pte); 1780 pte[i] = xen_make_pte(pte[i].pte);
1769 } 1781 }
1770 static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, 1782 static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
1771 unsigned long addr) 1783 unsigned long addr)
1772 { 1784 {
1773 if (*pt_base == PFN_DOWN(__pa(addr))) { 1785 if (*pt_base == PFN_DOWN(__pa(addr))) {
1774 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); 1786 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1775 clear_page((void *)addr); 1787 clear_page((void *)addr);
1776 (*pt_base)++; 1788 (*pt_base)++;
1777 } 1789 }
1778 if (*pt_end == PFN_DOWN(__pa(addr))) { 1790 if (*pt_end == PFN_DOWN(__pa(addr))) {
1779 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); 1791 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1780 clear_page((void *)addr); 1792 clear_page((void *)addr);
1781 (*pt_end)--; 1793 (*pt_end)--;
1782 } 1794 }
1783 } 1795 }
1784 /* 1796 /*
1785 * Set up the initial kernel pagetable. 1797 * Set up the initial kernel pagetable.
1786 * 1798 *
1787 * We can construct this by grafting the Xen-provided pagetable into 1799 * We can construct this by grafting the Xen-provided pagetable into
1788 * head_64.S's preconstructed pagetables. We copy the Xen L2's into 1800 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1789 * level2_ident_pgt, and level2_kernel_pgt. This means that only the 1801 * level2_ident_pgt, and level2_kernel_pgt. This means that only the
1790 * kernel has a physical mapping to start with - but that's enough to 1802 * kernel has a physical mapping to start with - but that's enough to
1791 * get __va working. We need to fill in the rest of the physical 1803 * get __va working. We need to fill in the rest of the physical
1792 * mapping once some sort of allocator has been set up. NOTE: for 1804 * mapping once some sort of allocator has been set up. NOTE: for
1793 * PVH, the page tables are native. 1805 * PVH, the page tables are native.
1794 */ 1806 */
1795 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 1807 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1796 { 1808 {
1797 pud_t *l3; 1809 pud_t *l3;
1798 pmd_t *l2; 1810 pmd_t *l2;
1799 unsigned long addr[3]; 1811 unsigned long addr[3];
1800 unsigned long pt_base, pt_end; 1812 unsigned long pt_base, pt_end;
1801 unsigned i; 1813 unsigned i;
1802 1814
1803 /* max_pfn_mapped is the last pfn mapped in the initial memory 1815 /* max_pfn_mapped is the last pfn mapped in the initial memory
1804 * mappings. Considering that on Xen after the kernel mappings we 1816 * mappings. Considering that on Xen after the kernel mappings we
1805 * have the mappings of some pages that don't exist in pfn space, we 1817 * have the mappings of some pages that don't exist in pfn space, we
1806 * set max_pfn_mapped to the last real pfn mapped. */ 1818 * set max_pfn_mapped to the last real pfn mapped. */
1807 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); 1819 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1808 1820
1809 pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); 1821 pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
1810 pt_end = pt_base + xen_start_info->nr_pt_frames; 1822 pt_end = pt_base + xen_start_info->nr_pt_frames;
1811 1823
1812 /* Zap identity mapping */ 1824 /* Zap identity mapping */
1813 init_level4_pgt[0] = __pgd(0); 1825 init_level4_pgt[0] = __pgd(0);
1814 1826
1815 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 1827 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1816 /* Pre-constructed entries are in pfn, so convert to mfn */ 1828 /* Pre-constructed entries are in pfn, so convert to mfn */
1817 /* L4[272] -> level3_ident_pgt 1829 /* L4[272] -> level3_ident_pgt
1818 * L4[511] -> level3_kernel_pgt */ 1830 * L4[511] -> level3_kernel_pgt */
1819 convert_pfn_mfn(init_level4_pgt); 1831 convert_pfn_mfn(init_level4_pgt);
1820 1832
1821 /* L3_i[0] -> level2_ident_pgt */ 1833 /* L3_i[0] -> level2_ident_pgt */
1822 convert_pfn_mfn(level3_ident_pgt); 1834 convert_pfn_mfn(level3_ident_pgt);
1823 /* L3_k[510] -> level2_kernel_pgt 1835 /* L3_k[510] -> level2_kernel_pgt
1824 * L3_k[511] -> level2_fixmap_pgt */ 1836 * L3_k[511] -> level2_fixmap_pgt */
1825 convert_pfn_mfn(level3_kernel_pgt); 1837 convert_pfn_mfn(level3_kernel_pgt);
1826 1838
1827 /* L3_k[511][506] -> level1_fixmap_pgt */ 1839 /* L3_k[511][506] -> level1_fixmap_pgt */
1828 convert_pfn_mfn(level2_fixmap_pgt); 1840 convert_pfn_mfn(level2_fixmap_pgt);
1829 } 1841 }
1830 /* We get [511][511] and have Xen's version of level2_kernel_pgt */ 1842 /* We get [511][511] and have Xen's version of level2_kernel_pgt */
1831 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); 1843 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1832 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); 1844 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1833 1845
1834 addr[0] = (unsigned long)pgd; 1846 addr[0] = (unsigned long)pgd;
1835 addr[1] = (unsigned long)l3; 1847 addr[1] = (unsigned long)l3;
1836 addr[2] = (unsigned long)l2; 1848 addr[2] = (unsigned long)l2;
1837 /* Graft it onto L4[272][0]. Note that we are creating an aliasing problem: 1849 /* Graft it onto L4[272][0]. Note that we are creating an aliasing problem:
1838 * Both L4[272][0] and L4[511][510] have entries that point to the same 1850 * Both L4[272][0] and L4[511][510] have entries that point to the same
1839 * L2 (PMD) tables. Meaning that if you modify it in __va space 1851 * L2 (PMD) tables. Meaning that if you modify it in __va space
1840 * it will also be modified in the __ka space! (But if you just 1852 * it will also be modified in the __ka space! (But if you just
1841 * modify the PMD table to point to other PTE's or none, then you 1853 * modify the PMD table to point to other PTE's or none, then you
1842 * are OK - which is what cleanup_highmap does) */ 1854 * are OK - which is what cleanup_highmap does) */
1843 copy_page(level2_ident_pgt, l2); 1855 copy_page(level2_ident_pgt, l2);
1844 /* Graft it onto L4[511][510] */ 1856 /* Graft it onto L4[511][510] */
1845 copy_page(level2_kernel_pgt, l2); 1857 copy_page(level2_kernel_pgt, l2);
1846 1858
1847 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 1859 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1848 /* Make pagetable pieces RO */ 1860 /* Make pagetable pieces RO */
1849 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); 1861 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1850 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); 1862 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1851 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); 1863 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1852 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); 1864 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1853 set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); 1865 set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
1854 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); 1866 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1855 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); 1867 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1856 set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); 1868 set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
1857 1869
1858 /* Pin down new L4 */ 1870 /* Pin down new L4 */
1859 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, 1871 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1860 PFN_DOWN(__pa_symbol(init_level4_pgt))); 1872 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1861 1873
1862 /* Unpin Xen-provided one */ 1874 /* Unpin Xen-provided one */
1863 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 1875 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1864 1876
1865 /* 1877 /*
1866 * At this stage there can be no user pgd, and no page 1878 * At this stage there can be no user pgd, and no page
1867 * structure to attach it to, so make sure we just set kernel 1879 * structure to attach it to, so make sure we just set kernel
1868 * pgd. 1880 * pgd.
1869 */ 1881 */
1870 xen_mc_batch(); 1882 xen_mc_batch();
1871 __xen_write_cr3(true, __pa(init_level4_pgt)); 1883 __xen_write_cr3(true, __pa(init_level4_pgt));
1872 xen_mc_issue(PARAVIRT_LAZY_CPU); 1884 xen_mc_issue(PARAVIRT_LAZY_CPU);
1873 } else 1885 } else
1874 native_write_cr3(__pa(init_level4_pgt)); 1886 native_write_cr3(__pa(init_level4_pgt));
1875 1887
1876 /* We can't easily rip out the L3 and L2, as the Xen pagetables are 1888 /* We can't easily rip out the L3 and L2, as the Xen pagetables are
1877 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for 1889 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
1878 * the initial domain. For guests using the toolstack, they are in: 1890 * the initial domain. For guests using the toolstack, they are in:
1879 * [L4], [L3], [L2], [L1], [L1] order. So for dom0 we can only 1891 * [L4], [L3], [L2], [L1], [L1] order. So for dom0 we can only
1880 * rip out the [L4] (pgd), but for guests we shave off three pages. 1892 * rip out the [L4] (pgd), but for guests we shave off three pages.
1881 */ 1893 */
1882 for (i = 0; i < ARRAY_SIZE(addr); i++) 1894 for (i = 0; i < ARRAY_SIZE(addr); i++)
1883 check_pt_base(&pt_base, &pt_end, addr[i]); 1895 check_pt_base(&pt_base, &pt_end, addr[i]);
1884 1896
1885 /* Our (by three pages) smaller Xen pagetable that we are using */ 1897 /* Our (by three pages) smaller Xen pagetable that we are using */
1886 memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); 1898 memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
1887 /* Revector the xen_start_info */ 1899 /* Revector the xen_start_info */
1888 xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); 1900 xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
1889 } 1901 }
1890 #else /* !CONFIG_X86_64 */ 1902 #else /* !CONFIG_X86_64 */
1891 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); 1903 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1892 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); 1904 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1893 1905
1894 static void __init xen_write_cr3_init(unsigned long cr3) 1906 static void __init xen_write_cr3_init(unsigned long cr3)
1895 { 1907 {
1896 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); 1908 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1897 1909
1898 BUG_ON(read_cr3() != __pa(initial_page_table)); 1910 BUG_ON(read_cr3() != __pa(initial_page_table));
1899 BUG_ON(cr3 != __pa(swapper_pg_dir)); 1911 BUG_ON(cr3 != __pa(swapper_pg_dir));
1900 1912
1901 /* 1913 /*
1902 * We are switching to swapper_pg_dir for the first time (from 1914 * We are switching to swapper_pg_dir for the first time (from
1903 * initial_page_table) and therefore need to mark that page 1915 * initial_page_table) and therefore need to mark that page
1904 * read-only and then pin it. 1916 * read-only and then pin it.
1905 * 1917 *
1906 * Xen disallows sharing of kernel PMDs for PAE 1918 * Xen disallows sharing of kernel PMDs for PAE
1907 * guests. Therefore we must copy the kernel PMD from 1919 * guests. Therefore we must copy the kernel PMD from
1908 * initial_page_table into a new kernel PMD to be used in 1920 * initial_page_table into a new kernel PMD to be used in
1909 * swapper_pg_dir. 1921 * swapper_pg_dir.
1910 */ 1922 */
1911 swapper_kernel_pmd = 1923 swapper_kernel_pmd =
1912 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); 1924 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1913 copy_page(swapper_kernel_pmd, initial_kernel_pmd); 1925 copy_page(swapper_kernel_pmd, initial_kernel_pmd);
1914 swapper_pg_dir[KERNEL_PGD_BOUNDARY] = 1926 swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
1915 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); 1927 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
1916 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); 1928 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
1917 1929
1918 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); 1930 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1919 xen_write_cr3(cr3); 1931 xen_write_cr3(cr3);
1920 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); 1932 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
1921 1933
1922 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, 1934 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
1923 PFN_DOWN(__pa(initial_page_table))); 1935 PFN_DOWN(__pa(initial_page_table)));
1924 set_page_prot(initial_page_table, PAGE_KERNEL); 1936 set_page_prot(initial_page_table, PAGE_KERNEL);
1925 set_page_prot(initial_kernel_pmd, PAGE_KERNEL); 1937 set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
1926 1938
1927 pv_mmu_ops.write_cr3 = &xen_write_cr3; 1939 pv_mmu_ops.write_cr3 = &xen_write_cr3;
1928 } 1940 }
1929 1941
1930 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 1942 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1931 { 1943 {
1932 pmd_t *kernel_pmd; 1944 pmd_t *kernel_pmd;
1933 1945
1934 initial_kernel_pmd = 1946 initial_kernel_pmd =
1935 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); 1947 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1936 1948
1937 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + 1949 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1938 xen_start_info->nr_pt_frames * PAGE_SIZE + 1950 xen_start_info->nr_pt_frames * PAGE_SIZE +
1939 512*1024); 1951 512*1024);
1940 1952
1941 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 1953 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1942 copy_page(initial_kernel_pmd, kernel_pmd); 1954 copy_page(initial_kernel_pmd, kernel_pmd);
1943 1955
1944 xen_map_identity_early(initial_kernel_pmd, max_pfn); 1956 xen_map_identity_early(initial_kernel_pmd, max_pfn);
1945 1957
1946 copy_page(initial_page_table, pgd); 1958 copy_page(initial_page_table, pgd);
1947 initial_page_table[KERNEL_PGD_BOUNDARY] = 1959 initial_page_table[KERNEL_PGD_BOUNDARY] =
1948 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); 1960 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
1949 1961
1950 set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); 1962 set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
1951 set_page_prot(initial_page_table, PAGE_KERNEL_RO); 1963 set_page_prot(initial_page_table, PAGE_KERNEL_RO);
1952 set_page_prot(empty_zero_page, PAGE_KERNEL_RO); 1964 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1953 1965
1954 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 1966 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1955 1967
1956 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, 1968 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
1957 PFN_DOWN(__pa(initial_page_table))); 1969 PFN_DOWN(__pa(initial_page_table)));
1958 xen_write_cr3(__pa(initial_page_table)); 1970 xen_write_cr3(__pa(initial_page_table));
1959 1971
1960 memblock_reserve(__pa(xen_start_info->pt_base), 1972 memblock_reserve(__pa(xen_start_info->pt_base),
1961 xen_start_info->nr_pt_frames * PAGE_SIZE); 1973 xen_start_info->nr_pt_frames * PAGE_SIZE);
1962 } 1974 }
1963 #endif /* CONFIG_X86_64 */ 1975 #endif /* CONFIG_X86_64 */
1964 1976
1965 static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; 1977 static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1966 1978
1967 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) 1979 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1968 { 1980 {
1969 pte_t pte; 1981 pte_t pte;
1970 1982
1971 phys >>= PAGE_SHIFT; 1983 phys >>= PAGE_SHIFT;
1972 1984
1973 switch (idx) { 1985 switch (idx) {
1974 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: 1986 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1975 case FIX_RO_IDT: 1987 case FIX_RO_IDT:
1976 #ifdef CONFIG_X86_32 1988 #ifdef CONFIG_X86_32
1977 case FIX_WP_TEST: 1989 case FIX_WP_TEST:
1978 # ifdef CONFIG_HIGHMEM 1990 # ifdef CONFIG_HIGHMEM
1979 case FIX_KMAP_BEGIN ... FIX_KMAP_END: 1991 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1980 # endif 1992 # endif
1981 #elif defined(CONFIG_X86_VSYSCALL_EMULATION) 1993 #elif defined(CONFIG_X86_VSYSCALL_EMULATION)
1982 case VSYSCALL_PAGE: 1994 case VSYSCALL_PAGE:
1983 #endif 1995 #endif
1984 case FIX_TEXT_POKE0: 1996 case FIX_TEXT_POKE0:
1985 case FIX_TEXT_POKE1: 1997 case FIX_TEXT_POKE1:
1986 /* All local page mappings */ 1998 /* All local page mappings */
1987 pte = pfn_pte(phys, prot); 1999 pte = pfn_pte(phys, prot);
1988 break; 2000 break;
1989 2001
1990 #ifdef CONFIG_X86_LOCAL_APIC 2002 #ifdef CONFIG_X86_LOCAL_APIC
1991 case FIX_APIC_BASE: /* maps dummy local APIC */ 2003 case FIX_APIC_BASE: /* maps dummy local APIC */
1992 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); 2004 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1993 break; 2005 break;
1994 #endif 2006 #endif
1995 2007
1996 #ifdef CONFIG_X86_IO_APIC 2008 #ifdef CONFIG_X86_IO_APIC
1997 case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: 2009 case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
1998 /* 2010 /*
1999 * We just don't map the IO APIC - all access is via 2011 * We just don't map the IO APIC - all access is via
2000 * hypercalls. Keep the address in the pte for reference. 2012 * hypercalls. Keep the address in the pte for reference.
2001 */ 2013 */
2002 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); 2014 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2003 break; 2015 break;
2004 #endif 2016 #endif
2005 2017
2006 case FIX_PARAVIRT_BOOTMAP: 2018 case FIX_PARAVIRT_BOOTMAP:
2007 /* This is an MFN, but it isn't an IO mapping from the 2019 /* This is an MFN, but it isn't an IO mapping from the
2008 IO domain */ 2020 IO domain */
2009 pte = mfn_pte(phys, prot); 2021 pte = mfn_pte(phys, prot);
2010 break; 2022 break;
2011 2023
2012 default: 2024 default:
2013 /* By default, set_fixmap is used for hardware mappings */ 2025 /* By default, set_fixmap is used for hardware mappings */
2014 pte = mfn_pte(phys, prot); 2026 pte = mfn_pte(phys, prot);
2015 break; 2027 break;
2016 } 2028 }
2017 2029
2018 __native_set_fixmap(idx, pte); 2030 __native_set_fixmap(idx, pte);
2019 2031
2020 #ifdef CONFIG_X86_VSYSCALL_EMULATION 2032 #ifdef CONFIG_X86_VSYSCALL_EMULATION
2021 /* Replicate changes to map the vsyscall page into the user 2033 /* Replicate changes to map the vsyscall page into the user
2022 pagetable vsyscall mapping. */ 2034 pagetable vsyscall mapping. */
2023 if (idx == VSYSCALL_PAGE) { 2035 if (idx == VSYSCALL_PAGE) {
2024 unsigned long vaddr = __fix_to_virt(idx); 2036 unsigned long vaddr = __fix_to_virt(idx);
2025 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); 2037 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
2026 } 2038 }
2027 #endif 2039 #endif
2028 } 2040 }
2029 2041
2030 static void __init xen_post_allocator_init(void) 2042 static void __init xen_post_allocator_init(void)
2031 { 2043 {
2032 if (xen_feature(XENFEAT_auto_translated_physmap)) 2044 if (xen_feature(XENFEAT_auto_translated_physmap))
2033 return; 2045 return;
2034 2046
2035 pv_mmu_ops.set_pte = xen_set_pte; 2047 pv_mmu_ops.set_pte = xen_set_pte;
2036 pv_mmu_ops.set_pmd = xen_set_pmd; 2048 pv_mmu_ops.set_pmd = xen_set_pmd;
2037 pv_mmu_ops.set_pud = xen_set_pud; 2049 pv_mmu_ops.set_pud = xen_set_pud;
2038 #if PAGETABLE_LEVELS == 4 2050 #if PAGETABLE_LEVELS == 4
2039 pv_mmu_ops.set_pgd = xen_set_pgd; 2051 pv_mmu_ops.set_pgd = xen_set_pgd;
2040 #endif 2052 #endif
2041 2053
2042 /* This will work as long as patching hasn't happened yet 2054 /* This will work as long as patching hasn't happened yet
2043 (which it hasn't) */ 2055 (which it hasn't) */
2044 pv_mmu_ops.alloc_pte = xen_alloc_pte; 2056 pv_mmu_ops.alloc_pte = xen_alloc_pte;
2045 pv_mmu_ops.alloc_pmd = xen_alloc_pmd; 2057 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2046 pv_mmu_ops.release_pte = xen_release_pte; 2058 pv_mmu_ops.release_pte = xen_release_pte;
2047 pv_mmu_ops.release_pmd = xen_release_pmd; 2059 pv_mmu_ops.release_pmd = xen_release_pmd;
2048 #if PAGETABLE_LEVELS == 4 2060 #if PAGETABLE_LEVELS == 4
2049 pv_mmu_ops.alloc_pud = xen_alloc_pud; 2061 pv_mmu_ops.alloc_pud = xen_alloc_pud;
2050 pv_mmu_ops.release_pud = xen_release_pud; 2062 pv_mmu_ops.release_pud = xen_release_pud;
2051 #endif 2063 #endif
2052 2064
2053 #ifdef CONFIG_X86_64 2065 #ifdef CONFIG_X86_64
2054 pv_mmu_ops.write_cr3 = &xen_write_cr3; 2066 pv_mmu_ops.write_cr3 = &xen_write_cr3;
2055 SetPagePinned(virt_to_page(level3_user_vsyscall)); 2067 SetPagePinned(virt_to_page(level3_user_vsyscall));
2056 #endif 2068 #endif
2057 xen_mark_init_mm_pinned(); 2069 xen_mark_init_mm_pinned();
2058 } 2070 }
2059 2071
2060 static void xen_leave_lazy_mmu(void) 2072 static void xen_leave_lazy_mmu(void)
2061 { 2073 {
2062 preempt_disable(); 2074 preempt_disable();
2063 xen_mc_flush(); 2075 xen_mc_flush();
2064 paravirt_leave_lazy_mmu(); 2076 paravirt_leave_lazy_mmu();
2065 preempt_enable(); 2077 preempt_enable();
2066 } 2078 }
2067 2079
2068 static const struct pv_mmu_ops xen_mmu_ops __initconst = { 2080 static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2069 .read_cr2 = xen_read_cr2, 2081 .read_cr2 = xen_read_cr2,
2070 .write_cr2 = xen_write_cr2, 2082 .write_cr2 = xen_write_cr2,
2071 2083
2072 .read_cr3 = xen_read_cr3, 2084 .read_cr3 = xen_read_cr3,
2073 .write_cr3 = xen_write_cr3_init, 2085 .write_cr3 = xen_write_cr3_init,
2074 2086
2075 .flush_tlb_user = xen_flush_tlb, 2087 .flush_tlb_user = xen_flush_tlb,
2076 .flush_tlb_kernel = xen_flush_tlb, 2088 .flush_tlb_kernel = xen_flush_tlb,
2077 .flush_tlb_single = xen_flush_tlb_single, 2089 .flush_tlb_single = xen_flush_tlb_single,
2078 .flush_tlb_others = xen_flush_tlb_others, 2090 .flush_tlb_others = xen_flush_tlb_others,
2079 2091
2080 .pte_update = paravirt_nop, 2092 .pte_update = paravirt_nop,
2081 .pte_update_defer = paravirt_nop, 2093 .pte_update_defer = paravirt_nop,
2082 2094
2083 .pgd_alloc = xen_pgd_alloc, 2095 .pgd_alloc = xen_pgd_alloc,
2084 .pgd_free = xen_pgd_free, 2096 .pgd_free = xen_pgd_free,
2085 2097
2086 .alloc_pte = xen_alloc_pte_init, 2098 .alloc_pte = xen_alloc_pte_init,
2087 .release_pte = xen_release_pte_init, 2099 .release_pte = xen_release_pte_init,
2088 .alloc_pmd = xen_alloc_pmd_init, 2100 .alloc_pmd = xen_alloc_pmd_init,
2089 .release_pmd = xen_release_pmd_init, 2101 .release_pmd = xen_release_pmd_init,
2090 2102
2091 .set_pte = xen_set_pte_init, 2103 .set_pte = xen_set_pte_init,
2092 .set_pte_at = xen_set_pte_at, 2104 .set_pte_at = xen_set_pte_at,
2093 .set_pmd = xen_set_pmd_hyper, 2105 .set_pmd = xen_set_pmd_hyper,
2094 2106
2095 .ptep_modify_prot_start = __ptep_modify_prot_start, 2107 .ptep_modify_prot_start = __ptep_modify_prot_start,
2096 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 2108 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
2097 2109
2098 .pte_val = PV_CALLEE_SAVE(xen_pte_val), 2110 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
2099 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), 2111 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
2100 2112
2101 .make_pte = PV_CALLEE_SAVE(xen_make_pte), 2113 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
2102 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), 2114 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
2103 2115
2104 #ifdef CONFIG_X86_PAE 2116 #ifdef CONFIG_X86_PAE
2105 .set_pte_atomic = xen_set_pte_atomic, 2117 .set_pte_atomic = xen_set_pte_atomic,
2106 .pte_clear = xen_pte_clear, 2118 .pte_clear = xen_pte_clear,
2107 .pmd_clear = xen_pmd_clear, 2119 .pmd_clear = xen_pmd_clear,
2108 #endif /* CONFIG_X86_PAE */ 2120 #endif /* CONFIG_X86_PAE */
2109 .set_pud = xen_set_pud_hyper, 2121 .set_pud = xen_set_pud_hyper,
2110 2122
2111 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), 2123 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2112 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), 2124 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2113 2125
2114 #if PAGETABLE_LEVELS == 4 2126 #if PAGETABLE_LEVELS == 4
2115 .pud_val = PV_CALLEE_SAVE(xen_pud_val), 2127 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2116 .make_pud = PV_CALLEE_SAVE(xen_make_pud), 2128 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
2117 .set_pgd = xen_set_pgd_hyper, 2129 .set_pgd = xen_set_pgd_hyper,
2118 2130
2119 .alloc_pud = xen_alloc_pmd_init, 2131 .alloc_pud = xen_alloc_pmd_init,
2120 .release_pud = xen_release_pmd_init, 2132 .release_pud = xen_release_pmd_init,
2121 #endif /* PAGETABLE_LEVELS == 4 */ 2133 #endif /* PAGETABLE_LEVELS == 4 */
2122 2134
2123 .activate_mm = xen_activate_mm, 2135 .activate_mm = xen_activate_mm,
2124 .dup_mmap = xen_dup_mmap, 2136 .dup_mmap = xen_dup_mmap,
2125 .exit_mmap = xen_exit_mmap, 2137 .exit_mmap = xen_exit_mmap,
2126 2138
2127 .lazy_mode = { 2139 .lazy_mode = {
2128 .enter = paravirt_enter_lazy_mmu, 2140 .enter = paravirt_enter_lazy_mmu,
2129 .leave = xen_leave_lazy_mmu, 2141 .leave = xen_leave_lazy_mmu,
2130 .flush = paravirt_flush_lazy_mmu, 2142 .flush = paravirt_flush_lazy_mmu,
2131 }, 2143 },
2132 2144
2133 .set_fixmap = xen_set_fixmap, 2145 .set_fixmap = xen_set_fixmap,
2134 }; 2146 };
2135 2147
2136 void __init xen_init_mmu_ops(void) 2148 void __init xen_init_mmu_ops(void)
2137 { 2149 {
2138 x86_init.paging.pagetable_init = xen_pagetable_init; 2150 x86_init.paging.pagetable_init = xen_pagetable_init;
2139 2151
2140 /* Optimization - we can use the HVM one but it has no idea which 2152 /* Optimization - we can use the HVM one but it has no idea which
2141 * VCPUs are descheduled - which means that it will needlessly IPI 2153 * VCPUs are descheduled - which means that it will needlessly IPI
2142 * them. Xen knows so let it do the job. 2154 * them. Xen knows so let it do the job.
2143 */ 2155 */
2144 if (xen_feature(XENFEAT_auto_translated_physmap)) { 2156 if (xen_feature(XENFEAT_auto_translated_physmap)) {
2145 pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; 2157 pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
2146 return; 2158 return;
2147 } 2159 }
2148 pv_mmu_ops = xen_mmu_ops; 2160 pv_mmu_ops = xen_mmu_ops;
2149 2161
2150 memset(dummy_mapping, 0xff, PAGE_SIZE); 2162 memset(dummy_mapping, 0xff, PAGE_SIZE);
2151 } 2163 }
2152 2164
2153 /* Protected by xen_reservation_lock. */ 2165 /* Protected by xen_reservation_lock. */
2154 #define MAX_CONTIG_ORDER 9 /* 2MB */ 2166 #define MAX_CONTIG_ORDER 9 /* 2MB */
2155 static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; 2167 static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2156 2168
2157 #define VOID_PTE (mfn_pte(0, __pgprot(0))) 2169 #define VOID_PTE (mfn_pte(0, __pgprot(0)))
2158 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, 2170 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2159 unsigned long *in_frames, 2171 unsigned long *in_frames,
2160 unsigned long *out_frames) 2172 unsigned long *out_frames)
2161 { 2173 {
2162 int i; 2174 int i;
2163 struct multicall_space mcs; 2175 struct multicall_space mcs;
2164 2176
2165 xen_mc_batch(); 2177 xen_mc_batch();
2166 for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) { 2178 for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2167 mcs = __xen_mc_entry(0); 2179 mcs = __xen_mc_entry(0);
2168 2180
2169 if (in_frames) 2181 if (in_frames)
2170 in_frames[i] = virt_to_mfn(vaddr); 2182 in_frames[i] = virt_to_mfn(vaddr);
2171 2183
2172 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); 2184 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2173 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); 2185 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2174 2186
2175 if (out_frames) 2187 if (out_frames)
2176 out_frames[i] = virt_to_pfn(vaddr); 2188 out_frames[i] = virt_to_pfn(vaddr);
2177 } 2189 }
2178 xen_mc_issue(0); 2190 xen_mc_issue(0);
2179 } 2191 }
2180 2192
2181 /* 2193 /*
2182 * Update the pfn-to-mfn mappings for a virtual address range, either to 2194 * Update the pfn-to-mfn mappings for a virtual address range, either to
2183 * point to an array of mfns, or contiguously from a single starting 2195 * point to an array of mfns, or contiguously from a single starting
2184 * mfn. 2196 * mfn.
2185 */ 2197 */
2186 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, 2198 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2187 unsigned long *mfns, 2199 unsigned long *mfns,
2188 unsigned long first_mfn) 2200 unsigned long first_mfn)
2189 { 2201 {
2190 unsigned i, limit; 2202 unsigned i, limit;
2191 unsigned long mfn; 2203 unsigned long mfn;
2192 2204
2193 xen_mc_batch(); 2205 xen_mc_batch();
2194 2206
2195 limit = 1u << order; 2207 limit = 1u << order;
2196 for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { 2208 for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2197 struct multicall_space mcs; 2209 struct multicall_space mcs;
2198 unsigned flags; 2210 unsigned flags;
2199 2211
2200 mcs = __xen_mc_entry(0); 2212 mcs = __xen_mc_entry(0);
2201 if (mfns) 2213 if (mfns)
2202 mfn = mfns[i]; 2214 mfn = mfns[i];
2203 else 2215 else
2204 mfn = first_mfn + i; 2216 mfn = first_mfn + i;
2205 2217
2206 if (i < (limit - 1)) 2218 if (i < (limit - 1))
2207 flags = 0; 2219 flags = 0;
2208 else { 2220 else {
2209 if (order == 0) 2221 if (order == 0)
2210 flags = UVMF_INVLPG | UVMF_ALL; 2222 flags = UVMF_INVLPG | UVMF_ALL;
2211 else 2223 else
2212 flags = UVMF_TLB_FLUSH | UVMF_ALL; 2224 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2213 } 2225 }
2214 2226
2215 MULTI_update_va_mapping(mcs.mc, vaddr, 2227 MULTI_update_va_mapping(mcs.mc, vaddr,
2216 mfn_pte(mfn, PAGE_KERNEL), flags); 2228 mfn_pte(mfn, PAGE_KERNEL), flags);
2217 2229
2218 set_phys_to_machine(virt_to_pfn(vaddr), mfn); 2230 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2219 } 2231 }
2220 2232
2221 xen_mc_issue(0); 2233 xen_mc_issue(0);
2222 } 2234 }
2223 2235
2224 /* 2236 /*
2225 * Perform the hypercall to exchange a region of our pfns to point to 2237 * Perform the hypercall to exchange a region of our pfns to point to
2226 * memory with the required contiguous alignment. Takes the pfns as 2238 * memory with the required contiguous alignment. Takes the pfns as
2227 * input, and populates mfns as output. 2239 * input, and populates mfns as output.
2228 * 2240 *
2229 * Returns a success code indicating whether the hypervisor was able to 2241 * Returns a success code indicating whether the hypervisor was able to
2230 * satisfy the request or not. 2242 * satisfy the request or not.
2231 */ 2243 */
2232 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, 2244 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2233 unsigned long *pfns_in, 2245 unsigned long *pfns_in,
2234 unsigned long extents_out, 2246 unsigned long extents_out,
2235 unsigned int order_out, 2247 unsigned int order_out,
2236 unsigned long *mfns_out, 2248 unsigned long *mfns_out,
2237 unsigned int address_bits) 2249 unsigned int address_bits)
2238 { 2250 {
2239 long rc; 2251 long rc;
2240 int success; 2252 int success;
2241 2253
2242 struct xen_memory_exchange exchange = { 2254 struct xen_memory_exchange exchange = {
2243 .in = { 2255 .in = {
2244 .nr_extents = extents_in, 2256 .nr_extents = extents_in,
2245 .extent_order = order_in, 2257 .extent_order = order_in,
2246 .extent_start = pfns_in, 2258 .extent_start = pfns_in,
2247 .domid = DOMID_SELF 2259 .domid = DOMID_SELF
2248 }, 2260 },
2249 .out = { 2261 .out = {
2250 .nr_extents = extents_out, 2262 .nr_extents = extents_out,
2251 .extent_order = order_out, 2263 .extent_order = order_out,
2252 .extent_start = mfns_out, 2264 .extent_start = mfns_out,
2253 .address_bits = address_bits, 2265 .address_bits = address_bits,
2254 .domid = DOMID_SELF 2266 .domid = DOMID_SELF
2255 } 2267 }
2256 }; 2268 };
2257 2269
2258 BUG_ON(extents_in << order_in != extents_out << order_out); 2270 BUG_ON(extents_in << order_in != extents_out << order_out);
2259 2271
2260 rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); 2272 rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2261 success = (exchange.nr_exchanged == extents_in); 2273 success = (exchange.nr_exchanged == extents_in);
2262 2274
2263 BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); 2275 BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2264 BUG_ON(success && (rc != 0)); 2276 BUG_ON(success && (rc != 0));
2265 2277
2266 return success; 2278 return success;
2267 } 2279 }
2268 2280
2269 int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, 2281 int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
2270 unsigned int address_bits, 2282 unsigned int address_bits,
2271 dma_addr_t *dma_handle) 2283 dma_addr_t *dma_handle)
2272 { 2284 {
2273 unsigned long *in_frames = discontig_frames, out_frame; 2285 unsigned long *in_frames = discontig_frames, out_frame;
2274 unsigned long flags; 2286 unsigned long flags;
2275 int success; 2287 int success;
2276 unsigned long vstart = (unsigned long)phys_to_virt(pstart); 2288 unsigned long vstart = (unsigned long)phys_to_virt(pstart);
2277 2289
2278 /* 2290 /*
2279 * Currently an auto-translated guest will not perform I/O, nor will 2291 * Currently an auto-translated guest will not perform I/O, nor will
2280 * it require PAE page directories below 4GB. Therefore any calls to 2292 * it require PAE page directories below 4GB. Therefore any calls to
2281 * this function are redundant and can be ignored. 2293 * this function are redundant and can be ignored.
2282 */ 2294 */
2283 2295
2284 if (xen_feature(XENFEAT_auto_translated_physmap)) 2296 if (xen_feature(XENFEAT_auto_translated_physmap))
2285 return 0; 2297 return 0;
2286 2298
2287 if (unlikely(order > MAX_CONTIG_ORDER)) 2299 if (unlikely(order > MAX_CONTIG_ORDER))
2288 return -ENOMEM; 2300 return -ENOMEM;
2289 2301
2290 memset((void *) vstart, 0, PAGE_SIZE << order); 2302 memset((void *) vstart, 0, PAGE_SIZE << order);
2291 2303
2292 spin_lock_irqsave(&xen_reservation_lock, flags); 2304 spin_lock_irqsave(&xen_reservation_lock, flags);
2293 2305
2294 /* 1. Zap current PTEs, remembering MFNs. */ 2306 /* 1. Zap current PTEs, remembering MFNs. */
2295 xen_zap_pfn_range(vstart, order, in_frames, NULL); 2307 xen_zap_pfn_range(vstart, order, in_frames, NULL);
2296 2308
2297 /* 2. Get a new contiguous memory extent. */ 2309 /* 2. Get a new contiguous memory extent. */
2298 out_frame = virt_to_pfn(vstart); 2310 out_frame = virt_to_pfn(vstart);
2299 success = xen_exchange_memory(1UL << order, 0, in_frames, 2311 success = xen_exchange_memory(1UL << order, 0, in_frames,
2300 1, order, &out_frame, 2312 1, order, &out_frame,
2301 address_bits); 2313 address_bits);
2302 2314
2303 /* 3. Map the new extent in place of old pages. */ 2315 /* 3. Map the new extent in place of old pages. */
2304 if (success) 2316 if (success)
2305 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); 2317 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2306 else 2318 else
2307 xen_remap_exchanged_ptes(vstart, order, in_frames, 0); 2319 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2308 2320
2309 spin_unlock_irqrestore(&xen_reservation_lock, flags); 2321 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2310 2322
2311 *dma_handle = virt_to_machine(vstart).maddr; 2323 *dma_handle = virt_to_machine(vstart).maddr;
2312 return success ? 0 : -ENOMEM; 2324 return success ? 0 : -ENOMEM;
2313 } 2325 }
2314 EXPORT_SYMBOL_GPL(xen_create_contiguous_region); 2326 EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2315 2327
2316 void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) 2328 void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
2317 { 2329 {
2318 unsigned long *out_frames = discontig_frames, in_frame; 2330 unsigned long *out_frames = discontig_frames, in_frame;
2319 unsigned long flags; 2331 unsigned long flags;
2320 int success; 2332 int success;
2321 unsigned long vstart; 2333 unsigned long vstart;
2322 2334
2323 if (xen_feature(XENFEAT_auto_translated_physmap)) 2335 if (xen_feature(XENFEAT_auto_translated_physmap))
2324 return; 2336 return;
2325 2337
2326 if (unlikely(order > MAX_CONTIG_ORDER)) 2338 if (unlikely(order > MAX_CONTIG_ORDER))
2327 return; 2339 return;
2328 2340
2329 vstart = (unsigned long)phys_to_virt(pstart); 2341 vstart = (unsigned long)phys_to_virt(pstart);
2330 memset((void *) vstart, 0, PAGE_SIZE << order); 2342 memset((void *) vstart, 0, PAGE_SIZE << order);
2331 2343
2332 spin_lock_irqsave(&xen_reservation_lock, flags); 2344 spin_lock_irqsave(&xen_reservation_lock, flags);
2333 2345
2334 /* 1. Find start MFN of contiguous extent. */ 2346 /* 1. Find start MFN of contiguous extent. */
2335 in_frame = virt_to_mfn(vstart); 2347 in_frame = virt_to_mfn(vstart);
2336 2348
2337 /* 2. Zap current PTEs. */ 2349 /* 2. Zap current PTEs. */
2338 xen_zap_pfn_range(vstart, order, NULL, out_frames); 2350 xen_zap_pfn_range(vstart, order, NULL, out_frames);
2339 2351
2340 /* 3. Do the exchange for non-contiguous MFNs. */ 2352 /* 3. Do the exchange for non-contiguous MFNs. */
2341 success = xen_exchange_memory(1, order, &in_frame, 1UL << order, 2353 success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2342 0, out_frames, 0); 2354 0, out_frames, 0);
2343 2355
2344 /* 4. Map new pages in place of old pages. */ 2356 /* 4. Map new pages in place of old pages. */
2345 if (success) 2357 if (success)
2346 xen_remap_exchanged_ptes(vstart, order, out_frames, 0); 2358 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2347 else 2359 else
2348 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); 2360 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2349 2361
2350 spin_unlock_irqrestore(&xen_reservation_lock, flags); 2362 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2351 } 2363 }
2352 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); 2364 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
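These two exports are what give a PV guest machine-contiguous, address-limited buffers for DMA (swiotlb-xen is the main user). A minimal calling sketch, assuming a PV guest context; the wrapper names and the order/address_bits values are illustrative and not part of this file:

/* Sketch: get a machine-contiguous buffer usable by a 32-bit DMA device. */
static void *sketch_alloc_dma32(unsigned int order, dma_addr_t *dma_handle)
{
	void *vaddr = (void *)__get_free_pages(GFP_KERNEL, order);

	if (!vaddr)
		return NULL;

	/* Exchange the backing frames for one contiguous extent below 4 GB. */
	if (xen_create_contiguous_region(virt_to_phys(vaddr), order,
					 32, dma_handle)) {
		free_pages((unsigned long)vaddr, order);
		return NULL;
	}
	return vaddr;
}

static void sketch_free_dma32(void *vaddr, unsigned int order)
{
	/* Hand the contiguous extent back before freeing the guest pages. */
	xen_destroy_contiguous_region(virt_to_phys(vaddr), order);
	free_pages((unsigned long)vaddr, order);
}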
2353 2365
2354 #ifdef CONFIG_XEN_PVHVM 2366 #ifdef CONFIG_XEN_PVHVM
2355 #ifdef CONFIG_PROC_VMCORE 2367 #ifdef CONFIG_PROC_VMCORE
2356 /* 2368 /*
2357 * This function is used in two contexts: 2369 * This function is used in two contexts:
2358 * - the kdump kernel has to check whether a pfn of the crashed kernel 2370 * - the kdump kernel has to check whether a pfn of the crashed kernel
2359 * was a ballooned page. vmcore is using this function to decide 2371 * was a ballooned page. vmcore is using this function to decide
2360 * whether to access a pfn of the crashed kernel. 2372 * whether to access a pfn of the crashed kernel.
2361 * - the kexec kernel has to check whether a pfn was ballooned by the 2373 * - the kexec kernel has to check whether a pfn was ballooned by the
2362 * previous kernel. If the pfn is ballooned, handle it properly. 2374 * previous kernel. If the pfn is ballooned, handle it properly.
2363 * Returns 0 if the pfn is not backed by a RAM page, the caller may 2375 * Returns 0 if the pfn is not backed by a RAM page, the caller may
2364 * handle the pfn specially in this case. 2376 * handle the pfn specially in this case.
2365 */ 2377 */
2366 static int xen_oldmem_pfn_is_ram(unsigned long pfn) 2378 static int xen_oldmem_pfn_is_ram(unsigned long pfn)
2367 { 2379 {
2368 struct xen_hvm_get_mem_type a = { 2380 struct xen_hvm_get_mem_type a = {
2369 .domid = DOMID_SELF, 2381 .domid = DOMID_SELF,
2370 .pfn = pfn, 2382 .pfn = pfn,
2371 }; 2383 };
2372 int ram; 2384 int ram;
2373 2385
2374 if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) 2386 if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
2375 return -ENXIO; 2387 return -ENXIO;
2376 2388
2377 switch (a.mem_type) { 2389 switch (a.mem_type) {
2378 case HVMMEM_mmio_dm: 2390 case HVMMEM_mmio_dm:
2379 ram = 0; 2391 ram = 0;
2380 break; 2392 break;
2381 case HVMMEM_ram_rw: 2393 case HVMMEM_ram_rw:
2382 case HVMMEM_ram_ro: 2394 case HVMMEM_ram_ro:
2383 default: 2395 default:
2384 ram = 1; 2396 ram = 1;
2385 break; 2397 break;
2386 } 2398 }
2387 2399
2388 return ram; 2400 return ram;
2389 } 2401 }
2390 #endif 2402 #endif
2391 2403
2392 static void xen_hvm_exit_mmap(struct mm_struct *mm) 2404 static void xen_hvm_exit_mmap(struct mm_struct *mm)
2393 { 2405 {
2394 struct xen_hvm_pagetable_dying a; 2406 struct xen_hvm_pagetable_dying a;
2395 int rc; 2407 int rc;
2396 2408
2397 a.domid = DOMID_SELF; 2409 a.domid = DOMID_SELF;
2398 a.gpa = __pa(mm->pgd); 2410 a.gpa = __pa(mm->pgd);
2399 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); 2411 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2400 WARN_ON_ONCE(rc < 0); 2412 WARN_ON_ONCE(rc < 0);
2401 } 2413 }
2402 2414
2403 static int is_pagetable_dying_supported(void) 2415 static int is_pagetable_dying_supported(void)
2404 { 2416 {
2405 struct xen_hvm_pagetable_dying a; 2417 struct xen_hvm_pagetable_dying a;
2406 int rc = 0; 2418 int rc = 0;
2407 2419
2408 a.domid = DOMID_SELF; 2420 a.domid = DOMID_SELF;
2409 a.gpa = 0x00; 2421 a.gpa = 0x00;
2410 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); 2422 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2411 if (rc < 0) { 2423 if (rc < 0) {
2412 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); 2424 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2413 return 0; 2425 return 0;
2414 } 2426 }
2415 return 1; 2427 return 1;
2416 } 2428 }
2417 2429
2418 void __init xen_hvm_init_mmu_ops(void) 2430 void __init xen_hvm_init_mmu_ops(void)
2419 { 2431 {
2420 if (is_pagetable_dying_supported()) 2432 if (is_pagetable_dying_supported())
2421 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; 2433 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2422 #ifdef CONFIG_PROC_VMCORE 2434 #ifdef CONFIG_PROC_VMCORE
2423 register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram); 2435 register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram);
2424 #endif 2436 #endif
2425 } 2437 }
2426 #endif 2438 #endif
2427 2439
2428 #ifdef CONFIG_XEN_PVH 2440 #ifdef CONFIG_XEN_PVH
2429 /* 2441 /*
2430 * Map a foreign gfn (fgfn) to a local pfn (lpfn). This is for user 2442 * Map a foreign gfn (fgfn) to a local pfn (lpfn). This is for user
2431 * space creating a new guest on pvh dom0 and needing to map domU pages. 2443 * space creating a new guest on pvh dom0 and needing to map domU pages.
2432 */ 2444 */
2433 static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn, 2445 static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn,
2434 unsigned int domid) 2446 unsigned int domid)
2435 { 2447 {
2436 int rc, err = 0; 2448 int rc, err = 0;
2437 xen_pfn_t gpfn = lpfn; 2449 xen_pfn_t gpfn = lpfn;
2438 xen_ulong_t idx = fgfn; 2450 xen_ulong_t idx = fgfn;
2439 2451
2440 struct xen_add_to_physmap_range xatp = { 2452 struct xen_add_to_physmap_range xatp = {
2441 .domid = DOMID_SELF, 2453 .domid = DOMID_SELF,
2442 .foreign_domid = domid, 2454 .foreign_domid = domid,
2443 .size = 1, 2455 .size = 1,
2444 .space = XENMAPSPACE_gmfn_foreign, 2456 .space = XENMAPSPACE_gmfn_foreign,
2445 }; 2457 };
2446 set_xen_guest_handle(xatp.idxs, &idx); 2458 set_xen_guest_handle(xatp.idxs, &idx);
2447 set_xen_guest_handle(xatp.gpfns, &gpfn); 2459 set_xen_guest_handle(xatp.gpfns, &gpfn);
2448 set_xen_guest_handle(xatp.errs, &err); 2460 set_xen_guest_handle(xatp.errs, &err);
2449 2461
2450 rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); 2462 rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
2451 if (rc < 0) 2463 if (rc < 0)
2452 return rc; 2464 return rc;
2453 return err; 2465 return err;
2454 } 2466 }
2455 2467
2456 static int xlate_remove_from_p2m(unsigned long spfn, int count) 2468 static int xlate_remove_from_p2m(unsigned long spfn, int count)
2457 { 2469 {
2458 struct xen_remove_from_physmap xrp; 2470 struct xen_remove_from_physmap xrp;
2459 int i, rc; 2471 int i, rc;
2460 2472
2461 for (i = 0; i < count; i++) { 2473 for (i = 0; i < count; i++) {
2462 xrp.domid = DOMID_SELF; 2474 xrp.domid = DOMID_SELF;
2463 xrp.gpfn = spfn+i; 2475 xrp.gpfn = spfn+i;
2464 rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); 2476 rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
2465 if (rc) 2477 if (rc)
2466 break; 2478 break;
2467 } 2479 }
2468 return rc; 2480 return rc;
2469 } 2481 }
2470 2482
2471 struct xlate_remap_data { 2483 struct xlate_remap_data {
2472 unsigned long fgfn; /* foreign domain's gfn */ 2484 unsigned long fgfn; /* foreign domain's gfn */
2473 pgprot_t prot; 2485 pgprot_t prot;
2474 domid_t domid; 2486 domid_t domid;
2475 int index; 2487 int index;
2476 struct page **pages; 2488 struct page **pages;
2477 }; 2489 };
2478 2490
2479 static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, 2491 static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
2480 void *data) 2492 void *data)
2481 { 2493 {
2482 int rc; 2494 int rc;
2483 struct xlate_remap_data *remap = data; 2495 struct xlate_remap_data *remap = data;
2484 unsigned long pfn = page_to_pfn(remap->pages[remap->index++]); 2496 unsigned long pfn = page_to_pfn(remap->pages[remap->index++]);
2485 pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot)); 2497 pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot));
2486 2498
2487 rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid); 2499 rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid);
2488 if (rc) 2500 if (rc)
2489 return rc; 2501 return rc;
2490 native_set_pte(ptep, pteval); 2502 native_set_pte(ptep, pteval);
2491 2503
2492 return 0; 2504 return 0;
2493 } 2505 }
2494 2506
2495 static int xlate_remap_gfn_range(struct vm_area_struct *vma, 2507 static int xlate_remap_gfn_range(struct vm_area_struct *vma,
2496 unsigned long addr, unsigned long mfn, 2508 unsigned long addr, unsigned long mfn,
2497 int nr, pgprot_t prot, unsigned domid, 2509 int nr, pgprot_t prot, unsigned domid,
2498 struct page **pages) 2510 struct page **pages)
2499 { 2511 {
2500 int err; 2512 int err;
2501 struct xlate_remap_data pvhdata; 2513 struct xlate_remap_data pvhdata;
2502 2514
2503 BUG_ON(!pages); 2515 BUG_ON(!pages);
2504 2516
2505 pvhdata.fgfn = mfn; 2517 pvhdata.fgfn = mfn;
2506 pvhdata.prot = prot; 2518 pvhdata.prot = prot;
2507 pvhdata.domid = domid; 2519 pvhdata.domid = domid;
2508 pvhdata.index = 0; 2520 pvhdata.index = 0;
2509 pvhdata.pages = pages; 2521 pvhdata.pages = pages;
2510 err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, 2522 err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
2511 xlate_map_pte_fn, &pvhdata); 2523 xlate_map_pte_fn, &pvhdata);
2512 flush_tlb_all(); 2524 flush_tlb_all();
2513 return err; 2525 return err;
2514 } 2526 }
2515 #endif 2527 #endif
2516 2528
2517 #define REMAP_BATCH_SIZE 16 2529 #define REMAP_BATCH_SIZE 16
2518 2530
2519 struct remap_data { 2531 struct remap_data {
2520 unsigned long mfn; 2532 unsigned long mfn;
2521 pgprot_t prot; 2533 pgprot_t prot;
2522 struct mmu_update *mmu_update; 2534 struct mmu_update *mmu_update;
2523 }; 2535 };
2524 2536
2525 static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, 2537 static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2526 unsigned long addr, void *data) 2538 unsigned long addr, void *data)
2527 { 2539 {
2528 struct remap_data *rmd = data; 2540 struct remap_data *rmd = data;
2529 pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot)); 2541 pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot));
2530 2542
2531 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; 2543 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2532 rmd->mmu_update->val = pte_val_ma(pte); 2544 rmd->mmu_update->val = pte_val_ma(pte);
2533 rmd->mmu_update++; 2545 rmd->mmu_update++;
2534 2546
2535 return 0; 2547 return 0;
2536 } 2548 }
2537 2549
2538 int xen_remap_domain_mfn_range(struct vm_area_struct *vma, 2550 int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2539 unsigned long addr, 2551 unsigned long addr,
2540 xen_pfn_t mfn, int nr, 2552 xen_pfn_t mfn, int nr,
2541 pgprot_t prot, unsigned domid, 2553 pgprot_t prot, unsigned domid,
2542 struct page **pages) 2554 struct page **pages)
2543 2555
2544 { 2556 {
2545 struct remap_data rmd; 2557 struct remap_data rmd;
2546 struct mmu_update mmu_update[REMAP_BATCH_SIZE]; 2558 struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2547 int batch; 2559 int batch;
2548 unsigned long range; 2560 unsigned long range;
2549 int err = 0; 2561 int err = 0;
2550 2562
2551 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); 2563 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
2552 2564
2553 if (xen_feature(XENFEAT_auto_translated_physmap)) { 2565 if (xen_feature(XENFEAT_auto_translated_physmap)) {
2554 #ifdef CONFIG_XEN_PVH 2566 #ifdef CONFIG_XEN_PVH
2555 /* We need to update the local page tables and the xen HAP */ 2567 /* We need to update the local page tables and the xen HAP */
2556 return xlate_remap_gfn_range(vma, addr, mfn, nr, prot, 2568 return xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
2557 domid, pages); 2569 domid, pages);
2558 #else 2570 #else
2559 return -EINVAL; 2571 return -EINVAL;
2560 #endif 2572 #endif
2561 } 2573 }
2562 2574
2563 rmd.mfn = mfn; 2575 rmd.mfn = mfn;
2564 rmd.prot = prot; 2576 rmd.prot = prot;
2565 2577
2566 while (nr) { 2578 while (nr) {
2567 batch = min(REMAP_BATCH_SIZE, nr); 2579 batch = min(REMAP_BATCH_SIZE, nr);
2568 range = (unsigned long)batch << PAGE_SHIFT; 2580 range = (unsigned long)batch << PAGE_SHIFT;
2569 2581
2570 rmd.mmu_update = mmu_update; 2582 rmd.mmu_update = mmu_update;
2571 err = apply_to_page_range(vma->vm_mm, addr, range, 2583 err = apply_to_page_range(vma->vm_mm, addr, range,
2572 remap_area_mfn_pte_fn, &rmd); 2584 remap_area_mfn_pte_fn, &rmd);
2573 if (err) 2585 if (err)
2574 goto out; 2586 goto out;
2575 2587
2576 err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); 2588 err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid);
2577 if (err < 0) 2589 if (err < 0)
2578 goto out; 2590 goto out;
2579 2591
2580 nr -= batch; 2592 nr -= batch;
2581 addr += range; 2593 addr += range;
2582 } 2594 }
2583 2595
2584 err = 0; 2596 err = 0;
2585 out: 2597 out:
2586 2598
2587 xen_flush_tlb_all(); 2599 xen_flush_tlb_all();
2588 2600
2589 return err; 2601 return err;
2590 } 2602 }
2591 EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); 2603 EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
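The batched path above is what the privcmd driver relies on when the toolstack mmap()s foreign domain memory: each chunk of up to REMAP_BATCH_SIZE PTE updates becomes one mmu_update hypercall. A hedged sketch of a caller; it assumes the VMA was already set up with VM_IO | VM_PFNMAP by the driver's mmap handler, and the helper name is illustrative:

/* Sketch: map 'nr' foreign frames starting at 'first_mfn' from domain
 * 'domid' into an existing VM_IO | VM_PFNMAP vma (PV case, pages unused). */
static int sketch_map_foreign_range(struct vm_area_struct *vma,
				    unsigned long first_mfn, int nr,
				    unsigned int domid)
{
	return xen_remap_domain_mfn_range(vma, vma->vm_start, first_mfn, nr,
					  vma->vm_page_prot, domid, NULL);
}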
2592 2604
2593 /* Returns: 0 success */ 2605 /* Returns: 0 success */
2594 int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, 2606 int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
2595 int numpgs, struct page **pages) 2607 int numpgs, struct page **pages)
2596 { 2608 {
2597 if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) 2609 if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
2598 return 0; 2610 return 0;
2599 2611
2600 #ifdef CONFIG_XEN_PVH 2612 #ifdef CONFIG_XEN_PVH
2601 while (numpgs--) { 2613 while (numpgs--) {
2602 /* 2614 /*
2603 * The mmu has already cleaned up the process mmu 2615 * The mmu has already cleaned up the process mmu
2604 * resources at this point (lookup_address will return 2616 * resources at this point (lookup_address will return
2605 * NULL). 2617 * NULL).
2606 */ 2618 */
2607 unsigned long pfn = page_to_pfn(pages[numpgs]); 2619 unsigned long pfn = page_to_pfn(pages[numpgs]);
2608 2620
2609 xlate_remove_from_p2m(pfn, 1); 2621 xlate_remove_from_p2m(pfn, 1);
2610 } 2622 }
2611 /* 2623 /*
2612 * We don't need to flush tlbs because as part of 2624 * We don't need to flush tlbs because as part of
2613 * xlate_remove_from_p2m, the hypervisor will do tlb flushes 2625 * xlate_remove_from_p2m, the hypervisor will do tlb flushes
2614 * after removing the p2m entries from the EPT/NPT 2626 * after removing the p2m entries from the EPT/NPT
arch/x86/xen/p2m.c
1 /* 1 /*
2 * Xen leaves the responsibility for maintaining p2m mappings to the 2 * Xen leaves the responsibility for maintaining p2m mappings to the
3 * guests themselves, but it must also access and update the p2m array 3 * guests themselves, but it must also access and update the p2m array
4 * during suspend/resume when all the pages are reallocated. 4 * during suspend/resume when all the pages are reallocated.
5 * 5 *
6 * The p2m table is logically a flat array, but we implement it as a 6 * The logical flat p2m table is mapped to a linear kernel memory area.
7 * three-level tree to allow the address space to be sparse. 7 * For accesses by Xen a three-level tree linked via mfns only is set up to
8 * allow the address space to be sparse.
8 * 9 *
9 * Xen 10 * Xen
10 * | 11 * |
11 * p2m_top p2m_top_mfn 12 * p2m_top_mfn
12 * / \ / \ 13 * / \
13 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn 14 * p2m_mid_mfn p2m_mid_mfn
14 * / \ / \ / / 15 * / /
15 * p2m p2m p2m p2m p2m p2m p2m ... 16 * p2m p2m p2m ...
16 * 17 *
17 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. 18 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
18 * 19 *
19 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the 20 * The p2m_top_mfn level is limited to 1 page, so the maximum representable
20 * maximum representable pseudo-physical address space is: 21 * pseudo-physical address space is:
21 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages 22 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
22 * 23 *
23 * P2M_PER_PAGE depends on the architecture, as a mfn is always 24 * P2M_PER_PAGE depends on the architecture, as a mfn is always
24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to 25 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
25 * 512 and 1024 entries respectively. 26 * 512 and 1024 entries respectively.
26 * 27 *
27 * In short, these structures contain the Machine Frame Number (MFN) of the PFN. 28 * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
28 * 29 *
29 * However not all entries are filled with MFNs. Specifically for all other 30 * However not all entries are filled with MFNs. Specifically for all other
30 * leaf entries, or for the top root, or middle one, for which there is a void 31 * leaf entries, or for the top root, or middle one, for which there is a void
31 * entry, we assume it is "missing". So (for example) 32 * entry, we assume it is "missing". So (for example)
32 * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY. 33 * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
34 * We have a dedicated page p2m_missing with all entries being
35 * INVALID_P2M_ENTRY. This page may be referenced multiple times in the p2m
36 * list/tree in case there are multiple areas with P2M_PER_PAGE invalid pfns.
33 * 37 *
34 * We also have the possibility of setting 1-1 mappings on certain regions, so 38 * We also have the possibility of setting 1-1 mappings on certain regions, so
35 * that: 39 * that:
36 * pfn_to_mfn(0xc0000)=0xc0000 40 * pfn_to_mfn(0xc0000)=0xc0000
37 * 41 *
38 * The benefit of this is, that we can assume for non-RAM regions (think 42 * The benefit of this is, that we can assume for non-RAM regions (think
39 * PCI BARs, or ACPI spaces), we can create mappings easily because we 43 * PCI BARs, or ACPI spaces), we can create mappings easily because we
40 * get the PFN value to match the MFN. 44 * get the PFN value to match the MFN.
41 * 45 *
42 * For this to work efficiently we have one new page p2m_identity and 46 * For this to work efficiently we have one new page p2m_identity. All entries
43 * allocate (via reserved_brk) any other pages we need to cover the sides 47 * in p2m_identity are set to INVALID_P2M_ENTRY type (Xen toolstack only
44 * (1GB or 4MB boundary violations). All entries in p2m_identity are set to 48 * recognizes that and MFNs, no other fancy value).
45 * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
46 * no other fancy value).
47 * 49 *
48 * On lookup we spot that the entry points to p2m_identity and return the 50 * On lookup we spot that the entry points to p2m_identity and return the
49 * identity value instead of dereferencing and returning INVALID_P2M_ENTRY. 51 * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
50 * If the entry points to an allocated page, we just proceed as before and 52 * If the entry points to an allocated page, we just proceed as before and
51 * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in 53 * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
52 * appropriate functions (pfn_to_mfn). 54 * appropriate functions (pfn_to_mfn).
53 * 55 *
54 * The reason for having the IDENTITY_FRAME_BIT instead of just returning the 56 * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
55 * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a 57 * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
56 * non-identity pfn. To protect ourselves against we elect to set (and get) the 58 * non-identity pfn. To protect ourselves against we elect to set (and get) the
57 * IDENTITY_FRAME_BIT on all identity mapped PFNs. 59 * IDENTITY_FRAME_BIT on all identity mapped PFNs.
58 *
59 * This simplistic diagram is used to explain the more subtle piece of code.
60 * There is also a diagram of the P2M at the end that can help.
61 * Imagine your E820 looking as so:
62 *
63 * 1GB 2GB 4GB
64 * /-------------------+---------\/----\ /----------\ /---+-----\
65 * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
66 * \-------------------+---------/\----/ \----------/ \---+-----/
67 * ^- 1029MB ^- 2001MB
68 *
69 * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
70 * 2048MB = 524288 (0x80000)]
71 *
72 * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
73 * is actually not present (would have to kick the balloon driver to put it in).
74 *
75 * When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
76 * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
77 * of the PFN and the end PFN (263424 and 512256 respectively). The first step
78 * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
79 * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
80 * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as
81 * required to split any existing p2m_mid_missing middle pages.
82 *
83 * With the E820 example above, 263424 is not 1GB aligned so we allocate a
84 * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
85 * Each entry in the allocate page is "missing" (points to p2m_missing).
86 *
87 * Next stage is to determine if we need to do a more granular boundary check
88 * on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
89 * We check if the start pfn and end pfn violate that boundary check, and if
90 * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer
91 * granularity of setting which PFNs are missing and which ones are identity.
92 * In our example 263424 and 512256 both fail the check so we reserve_brk two
93 * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
94 * values) and assign them to p2m[1][2] and p2m[1][488] respectively.
95 *
96 * At this point we would at minimum reserve_brk one page, but could be up to
97 * three. Each call to set_phys_range_identity has at maximum a three page
98 * cost. If we were to query the P2M at this stage, all those entries from
99 * start PFN through end PFN (so 1029MB -> 2001MB) would return
100 * INVALID_P2M_ENTRY ("missing").
101 *
102 * The next step is to walk from the start pfn to the end pfn setting
103 * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
104 * If we find that the middle entry is pointing to p2m_missing we can swap it
105 * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and
106 * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions).
107 * At this point we do not need to worry about boundary alignment (so no need to
108 * reserve_brk a middle page, figure out which PFNs are "missing" and which
109 * ones are identity), as that has been done earlier. If we find that the
110 * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
111 * that page (which covers 512 PFNs) and set the appropriate PFN with
112 * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
113 * set from p2m[1][2][256->511] and p2m[1][488][0->256] with
114 * IDENTITY_FRAME_BIT set.
115 *
116 * All other regions that are void (or not filled) either point to p2m_missing
117 * (considered missing) or have the default value of INVALID_P2M_ENTRY (also
118 * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
119 * contain the INVALID_P2M_ENTRY value and are considered "missing."
120 *
121 * Finally, the region beyond the end of the E820 (4 GB in this example)
122 * is set to be identity (in case there are MMIO regions placed here).
123 *
124 * This is what the p2m ends up looking (for the E820 above) with this
125 * fabulous drawing:
126 *
127 * p2m /--------------\
128 * /-----\ | &mfn_list[0],| /-----------------\
129 * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. |
130 * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] |
131 * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] |
132 * |-----| \ | [p2m_identity]+\\ | .... |
133 * | 2 |--\ \-------------------->| ... | \\ \----------------/
134 * |-----| \ \---------------/ \\
135 * | 3 |-\ \ \\ p2m_identity [1]
136 * |-----| \ \-------------------->/---------------\ /-----------------\
137 * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... |
138 * \-----/ | | | [p2m_identity]+-->| ..., ~0 |
139 * | | | .... | \-----------------/
140 * | | +-[x], ~0, ~0.. +\
141 * | | \---------------/ \
142 * | | \-> /---------------\
143 * | V p2m_mid_missing p2m_missing | IDENTITY[@0] |
144 * | /-----------------\ /------------\ | IDENTITY[@256]|
145 * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... |
146 * | | [p2m_missing] +---->| ..., ~0 | \---------------/
147 * | | ... | \------------/
148 * | \-----------------/
149 * |
150 * | p2m_mid_identity
151 * | /-----------------\
152 * \-->| [p2m_identity] +---->[1]
153 * | [p2m_identity] +---->[1]
154 * | ... |
155 * \-----------------/
156 *
157 * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
158 */ 60 */
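With the linear list in place, the common lookup boils down to a single array access; identity entries carry IDENTITY_FRAME_BIT (which callers such as pfn_to_mfn() strip), and missing frames read back as INVALID_P2M_ENTRY. A condensed sketch of that fast path, not the real implementation; the macros are assumed to come from asm/xen/page.h as used elsewhere in this diff:

/* Sketch: fast-path semantics of a lookup in the virtually mapped p2m list. */
static inline unsigned long sketch_lookup_p2m(unsigned long pfn)
{
	if (pfn >= xen_p2m_size)	/* beyond the list: identity by definition */
		return IDENTITY_FRAME(pfn);

	return xen_p2m_addr[pfn];	/* one read, no tree walk */
}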
159 61
160 #include <linux/init.h> 62 #include <linux/init.h>
161 #include <linux/module.h> 63 #include <linux/module.h>
162 #include <linux/list.h> 64 #include <linux/list.h>
163 #include <linux/hash.h> 65 #include <linux/hash.h>
164 #include <linux/sched.h> 66 #include <linux/sched.h>
165 #include <linux/seq_file.h> 67 #include <linux/seq_file.h>
166 #include <linux/bootmem.h> 68 #include <linux/bootmem.h>
69 #include <linux/slab.h>
167 70
168 #include <asm/cache.h> 71 #include <asm/cache.h>
169 #include <asm/setup.h> 72 #include <asm/setup.h>
73 #include <asm/uaccess.h>
170 74
171 #include <asm/xen/page.h> 75 #include <asm/xen/page.h>
172 #include <asm/xen/hypercall.h> 76 #include <asm/xen/hypercall.h>
173 #include <asm/xen/hypervisor.h> 77 #include <asm/xen/hypervisor.h>
174 #include <xen/balloon.h> 78 #include <xen/balloon.h>
175 #include <xen/grant_table.h> 79 #include <xen/grant_table.h>
176 80
177 #include "p2m.h" 81 #include "p2m.h"
178 #include "multicalls.h" 82 #include "multicalls.h"
179 #include "xen-ops.h" 83 #include "xen-ops.h"
180 84
85 #define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE)
86
181 static void __init m2p_override_init(void); 87 static void __init m2p_override_init(void);
182 88
89 unsigned long *xen_p2m_addr __read_mostly;
90 EXPORT_SYMBOL_GPL(xen_p2m_addr);
91 unsigned long xen_p2m_size __read_mostly;
92 EXPORT_SYMBOL_GPL(xen_p2m_size);
183 unsigned long xen_max_p2m_pfn __read_mostly; 93 unsigned long xen_max_p2m_pfn __read_mostly;
94 EXPORT_SYMBOL_GPL(xen_max_p2m_pfn);
184 95
96 static DEFINE_SPINLOCK(p2m_update_lock);
97
185 static unsigned long *p2m_mid_missing_mfn; 98 static unsigned long *p2m_mid_missing_mfn;
186 static unsigned long *p2m_top_mfn; 99 static unsigned long *p2m_top_mfn;
187 static unsigned long **p2m_top_mfn_p; 100 static unsigned long **p2m_top_mfn_p;
101 static unsigned long *p2m_missing;
102 static unsigned long *p2m_identity;
103 static pte_t *p2m_missing_pte;
104 static pte_t *p2m_identity_pte;
188 105
189 /* Placeholders for holes in the address space */
190 static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
191 static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
192
193 static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
194
195 static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
196 static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE);
197
198 RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
199
200 /* For each I/O range remapped we may lose up to two leaf pages for the boundary
201 * violations and three mid pages to cover up to 3GB. With
202 * early_can_reuse_p2m_middle() most of the leaf pages will be reused by the
203 * remapped region.
204 */
205 RESERVE_BRK(p2m_identity_remap, PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES);
206
207 static inline unsigned p2m_top_index(unsigned long pfn) 106 static inline unsigned p2m_top_index(unsigned long pfn)
208 { 107 {
209 BUG_ON(pfn >= MAX_P2M_PFN); 108 BUG_ON(pfn >= MAX_P2M_PFN);
210 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); 109 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
211 } 110 }
212 111
213 static inline unsigned p2m_mid_index(unsigned long pfn) 112 static inline unsigned p2m_mid_index(unsigned long pfn)
214 { 113 {
215 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; 114 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
216 } 115 }
217 116
218 static inline unsigned p2m_index(unsigned long pfn) 117 static inline unsigned p2m_index(unsigned long pfn)
219 { 118 {
220 return pfn % P2M_PER_PAGE; 119 return pfn % P2M_PER_PAGE;
221 } 120 }
222 121
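On x86-64 a page holds 512 eight-byte entries, so P2M_PER_PAGE and P2M_MID_PER_PAGE are both 512 and the three helpers above split a pfn as in this worked example (pfn 0x40500 is the 1029 MB boundary reused from the old comment; the checks are only illustrative):

/* Worked example of the index split used by the mfn tree on x86-64. */
static void __maybe_unused sketch_index_example(void)
{
	unsigned long pfn = 0x40500;	/* 1029 MB */

	/* p2m_top_index: 0x40500 / (512 * 512) == 1   */
	/* p2m_mid_index: (0x40500 / 512) % 512  == 2   */
	/* p2m_index:     0x40500 % 512          == 256 */
	WARN_ON(p2m_top_index(pfn) != 1);
	WARN_ON(p2m_mid_index(pfn) != 2);
	WARN_ON(p2m_index(pfn) != 256);
}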
223 static void p2m_top_init(unsigned long ***top)
224 {
225 unsigned i;
226
227 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
228 top[i] = p2m_mid_missing;
229 }
230
231 static void p2m_top_mfn_init(unsigned long *top) 122 static void p2m_top_mfn_init(unsigned long *top)
232 { 123 {
233 unsigned i; 124 unsigned i;
234 125
235 for (i = 0; i < P2M_TOP_PER_PAGE; i++) 126 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
236 top[i] = virt_to_mfn(p2m_mid_missing_mfn); 127 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
237 } 128 }
238 129
239 static void p2m_top_mfn_p_init(unsigned long **top) 130 static void p2m_top_mfn_p_init(unsigned long **top)
240 { 131 {
241 unsigned i; 132 unsigned i;
242 133
243 for (i = 0; i < P2M_TOP_PER_PAGE; i++) 134 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
244 top[i] = p2m_mid_missing_mfn; 135 top[i] = p2m_mid_missing_mfn;
245 } 136 }
246 137
247 static void p2m_mid_init(unsigned long **mid, unsigned long *leaf) 138 static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf)
248 { 139 {
249 unsigned i; 140 unsigned i;
250 141
251 for (i = 0; i < P2M_MID_PER_PAGE; i++) 142 for (i = 0; i < P2M_MID_PER_PAGE; i++)
252 mid[i] = leaf; 143 mid[i] = virt_to_mfn(leaf);
253 } 144 }
254 145
255 static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf) 146 static void p2m_init(unsigned long *p2m)
256 { 147 {
257 unsigned i; 148 unsigned i;
258 149
259 for (i = 0; i < P2M_MID_PER_PAGE; i++) 150 for (i = 0; i < P2M_PER_PAGE; i++)
260 mid[i] = virt_to_mfn(leaf); 151 p2m[i] = INVALID_P2M_ENTRY;
261 } 152 }
262 153
263 static void p2m_init(unsigned long *p2m) 154 static void p2m_init_identity(unsigned long *p2m, unsigned long pfn)
264 { 155 {
265 unsigned i; 156 unsigned i;
266 157
267 for (i = 0; i < P2M_MID_PER_PAGE; i++) 158 for (i = 0; i < P2M_PER_PAGE; i++)
268 p2m[i] = INVALID_P2M_ENTRY; 159 p2m[i] = IDENTITY_FRAME(pfn + i);
269 } 160 }
270 161
162 static void * __ref alloc_p2m_page(void)
163 {
164 if (unlikely(!slab_is_available()))
165 return alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
166
167 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
168 }
169
170 /* Only to be called in case of a race for a page just allocated! */
171 static void free_p2m_page(void *p)
172 {
173 BUG_ON(!slab_is_available());
174 free_page((unsigned long)p);
175 }
176
271 /* 177 /*
272 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures 178 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
273 * 179 *
274 * This is called both at boot time, and after resuming from suspend: 180 * This is called both at boot time, and after resuming from suspend:
275 * - At boot time we're called rather early, and must use alloc_bootmem*() 181 * - At boot time we're called rather early, and must use alloc_bootmem*()
276 * to allocate memory. 182 * to allocate memory.
277 * 183 *
278 * - After resume we're called from within stop_machine, but the mfn 184 * - After resume we're called from within stop_machine, but the mfn
279 * tree should already be completely allocated. 185 * tree should already be completely allocated.
280 */ 186 */
281 void __ref xen_build_mfn_list_list(void) 187 void __ref xen_build_mfn_list_list(void)
282 { 188 {
283 unsigned long pfn; 189 unsigned long pfn, mfn;
190 pte_t *ptep;
191 unsigned int level, topidx, mididx;
192 unsigned long *mid_mfn_p;
284 193
285 if (xen_feature(XENFEAT_auto_translated_physmap)) 194 if (xen_feature(XENFEAT_auto_translated_physmap))
286 return; 195 return;
287 196
288 /* Pre-initialize p2m_top_mfn to be completely missing */ 197 /* Pre-initialize p2m_top_mfn to be completely missing */
289 if (p2m_top_mfn == NULL) { 198 if (p2m_top_mfn == NULL) {
290 p2m_mid_missing_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); 199 p2m_mid_missing_mfn = alloc_p2m_page();
291 p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); 200 p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
292 201
293 p2m_top_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); 202 p2m_top_mfn_p = alloc_p2m_page();
294 p2m_top_mfn_p_init(p2m_top_mfn_p); 203 p2m_top_mfn_p_init(p2m_top_mfn_p);
295 204
296 p2m_top_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); 205 p2m_top_mfn = alloc_p2m_page();
297 p2m_top_mfn_init(p2m_top_mfn); 206 p2m_top_mfn_init(p2m_top_mfn);
298 } else { 207 } else {
299 /* Reinitialise, mfns all change after migration */ 208 /* Reinitialise, mfns all change after migration */
300 p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); 209 p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
301 } 210 }
302 211
303 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { 212 for (pfn = 0; pfn < xen_max_p2m_pfn && pfn < MAX_P2M_PFN;
304 unsigned topidx = p2m_top_index(pfn); 213 pfn += P2M_PER_PAGE) {
305 unsigned mididx = p2m_mid_index(pfn); 214 topidx = p2m_top_index(pfn);
306 unsigned long **mid; 215 mididx = p2m_mid_index(pfn);
307 unsigned long *mid_mfn_p;
308 216
309 mid = p2m_top[topidx];
310 mid_mfn_p = p2m_top_mfn_p[topidx]; 217 mid_mfn_p = p2m_top_mfn_p[topidx];
218 ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn),
219 &level);
220 BUG_ON(!ptep || level != PG_LEVEL_4K);
221 mfn = pte_mfn(*ptep);
222 ptep = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
311 223
312 /* Don't bother allocating any mfn mid levels if 224 /* Don't bother allocating any mfn mid levels if
313 * they're just missing, just update the stored mfn, 225 * they're just missing, just update the stored mfn,
314 * since all could have changed over a migrate. 226 * since all could have changed over a migrate.
315 */ 227 */
316 if (mid == p2m_mid_missing) { 228 if (ptep == p2m_missing_pte || ptep == p2m_identity_pte) {
317 BUG_ON(mididx); 229 BUG_ON(mididx);
318 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); 230 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
319 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); 231 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
320 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; 232 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
321 continue; 233 continue;
322 } 234 }
323 235
324 if (mid_mfn_p == p2m_mid_missing_mfn) { 236 if (mid_mfn_p == p2m_mid_missing_mfn) {
325 /* 237 mid_mfn_p = alloc_p2m_page();
326 * XXX boot-time only! We should never find
327 * missing parts of the mfn tree after
328 * runtime.
329 */
330 mid_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
331 p2m_mid_mfn_init(mid_mfn_p, p2m_missing); 238 p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
332 239
333 p2m_top_mfn_p[topidx] = mid_mfn_p; 240 p2m_top_mfn_p[topidx] = mid_mfn_p;
334 } 241 }
335 242
336 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); 243 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
337 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); 244 mid_mfn_p[mididx] = mfn;
338 } 245 }
339 } 246 }
340 247
341 void xen_setup_mfn_list_list(void) 248 void xen_setup_mfn_list_list(void)
342 { 249 {
343 if (xen_feature(XENFEAT_auto_translated_physmap)) 250 if (xen_feature(XENFEAT_auto_translated_physmap))
344 return; 251 return;
345 252
346 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); 253 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
347 254
348 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = 255 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
349 virt_to_mfn(p2m_top_mfn); 256 virt_to_mfn(p2m_top_mfn);
350 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; 257 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
351 } 258 }
352 259
353 /* Set up p2m_top to point to the domain-builder provided p2m pages */ 260 /* Set up p2m_top to point to the domain-builder provided p2m pages */
354 void __init xen_build_dynamic_phys_to_machine(void) 261 void __init xen_build_dynamic_phys_to_machine(void)
355 { 262 {
356 unsigned long *mfn_list;
357 unsigned long max_pfn;
358 unsigned long pfn; 263 unsigned long pfn;
359 264
360 if (xen_feature(XENFEAT_auto_translated_physmap)) 265 if (xen_feature(XENFEAT_auto_translated_physmap))
361 return; 266 return;
362 267
363 mfn_list = (unsigned long *)xen_start_info->mfn_list; 268 xen_p2m_addr = (unsigned long *)xen_start_info->mfn_list;
364 max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); 269 xen_p2m_size = ALIGN(xen_start_info->nr_pages, P2M_PER_PAGE);
365 xen_max_p2m_pfn = max_pfn;
366 270
367 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); 271 for (pfn = xen_start_info->nr_pages; pfn < xen_p2m_size; pfn++)
368 p2m_init(p2m_missing); 272 xen_p2m_addr[pfn] = INVALID_P2M_ENTRY;
369 p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
370 p2m_init(p2m_identity);
371 273
372 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); 274 xen_max_p2m_pfn = xen_p2m_size;
373 p2m_mid_init(p2m_mid_missing, p2m_missing); 275 }
374 p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
375 p2m_mid_init(p2m_mid_identity, p2m_identity);
376 276
377 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); 277 #define P2M_TYPE_IDENTITY 0
378 p2m_top_init(p2m_top); 278 #define P2M_TYPE_MISSING 1
279 #define P2M_TYPE_PFN 2
280 #define P2M_TYPE_UNKNOWN 3
379 281
380 /* 282 static int xen_p2m_elem_type(unsigned long pfn)
381 * The domain builder gives us a pre-constructed p2m array in 283 {
382 * mfn_list for all the pages initially given to us, so we just 284 unsigned long mfn;
383 * need to graft that into our tree structure.
384 */
385 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
386 unsigned topidx = p2m_top_index(pfn);
387 unsigned mididx = p2m_mid_index(pfn);
388 285
389 if (p2m_top[topidx] == p2m_mid_missing) { 286 if (pfn >= xen_p2m_size)
390 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); 287 return P2M_TYPE_IDENTITY;
391 p2m_mid_init(mid, p2m_missing);
392 288
393 p2m_top[topidx] = mid; 289 mfn = xen_p2m_addr[pfn];
394 }
395 290
396 /* 291 if (mfn == INVALID_P2M_ENTRY)
397 * As long as the mfn_list has enough entries to completely 292 return P2M_TYPE_MISSING;
398 * fill a p2m page, pointing into the array is ok. But if
399 * not the entries beyond the last pfn will be undefined.
400 */
401 if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
402 unsigned long p2midx;
403 293
404 p2midx = max_pfn % P2M_PER_PAGE; 294 if (mfn & IDENTITY_FRAME_BIT)
405 for ( ; p2midx < P2M_PER_PAGE; p2midx++) 295 return P2M_TYPE_IDENTITY;
406 mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
407 }
408 p2m_top[topidx][mididx] = &mfn_list[pfn];
409 }
410 296
411 m2p_override_init(); 297 return P2M_TYPE_PFN;
412 } 298 }
413 #ifdef CONFIG_X86_64 299
414 unsigned long __init xen_revector_p2m_tree(void) 300 static void __init xen_rebuild_p2m_list(unsigned long *p2m)
415 { 301 {
416 unsigned long va_start; 302 unsigned int i, chunk;
417 unsigned long va_end;
418 unsigned long pfn; 303 unsigned long pfn;
419 unsigned long pfn_free = 0; 304 unsigned long *mfns;
420 unsigned long *mfn_list = NULL; 305 pte_t *ptep;
421 unsigned long size; 306 pmd_t *pmdp;
307 int type;
422 308
423 va_start = xen_start_info->mfn_list; 309 p2m_missing = alloc_p2m_page();
424 /*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long), 310 p2m_init(p2m_missing);
425 * so make sure it is rounded up to that */ 311 p2m_identity = alloc_p2m_page();
426 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 312 p2m_init(p2m_identity);
427 va_end = va_start + size;
428 313
429 /* If we were revectored already, don't do it again. */ 314 p2m_missing_pte = alloc_p2m_page();
430 if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET) 315 paravirt_alloc_pte(&init_mm, __pa(p2m_missing_pte) >> PAGE_SHIFT);
431 return 0; 316 p2m_identity_pte = alloc_p2m_page();
432 317 paravirt_alloc_pte(&init_mm, __pa(p2m_identity_pte) >> PAGE_SHIFT);
433 mfn_list = alloc_bootmem_align(size, PAGE_SIZE); 318 for (i = 0; i < PTRS_PER_PTE; i++) {
434 if (!mfn_list) { 319 set_pte(p2m_missing_pte + i,
435 pr_warn("Could not allocate space for a new P2M tree!\n"); 320 pfn_pte(PFN_DOWN(__pa(p2m_missing)), PAGE_KERNEL_RO));
436 return xen_start_info->mfn_list; 321 set_pte(p2m_identity_pte + i,
322 pfn_pte(PFN_DOWN(__pa(p2m_identity)), PAGE_KERNEL_RO));
437 } 323 }
438 /* Fill it out with INVALID_P2M_ENTRY value */
439 memset(mfn_list, 0xFF, size);
440 324
441 for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) { 325 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += chunk) {
442 unsigned topidx = p2m_top_index(pfn); 326 /*
443 unsigned mididx; 327 * Try to map missing/identity PMDs or p2m-pages if possible.
444 unsigned long *mid_p; 328 * We have to respect the structure of the mfn_list_list
329 * which will be built just afterwards.
330 * Chunk size to test is one p2m page if we are in the middle
331 * of a mfn_list_list mid page and the complete mid page area
332 * if we are at index 0 of the mid page. Please note that a
333 * mid page might cover more than one PMD, e.g. on 32 bit PAE
334 * kernels.
335 */
336 chunk = (pfn & (P2M_PER_PAGE * P2M_MID_PER_PAGE - 1)) ?
337 P2M_PER_PAGE : P2M_PER_PAGE * P2M_MID_PER_PAGE;
445 338
446 if (!p2m_top[topidx]) 339 type = xen_p2m_elem_type(pfn);
447 continue; 340 i = 0;
341 if (type != P2M_TYPE_PFN)
342 for (i = 1; i < chunk; i++)
343 if (xen_p2m_elem_type(pfn + i) != type)
344 break;
345 if (i < chunk)
346 /* Reset to minimal chunk size. */
347 chunk = P2M_PER_PAGE;
448 348
449 if (p2m_top[topidx] == p2m_mid_missing) 349 if (type == P2M_TYPE_PFN || i < chunk) {
350 /* Use initial p2m page contents. */
351 #ifdef CONFIG_X86_64
352 mfns = alloc_p2m_page();
353 copy_page(mfns, xen_p2m_addr + pfn);
354 #else
355 mfns = xen_p2m_addr + pfn;
356 #endif
357 ptep = populate_extra_pte((unsigned long)(p2m + pfn));
358 set_pte(ptep,
359 pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL));
450 continue; 360 continue;
361 }
451 362
452 mididx = p2m_mid_index(pfn); 363 if (chunk == P2M_PER_PAGE) {
453 mid_p = p2m_top[topidx][mididx]; 364 /* Map complete missing or identity p2m-page. */
454 if (!mid_p) 365 mfns = (type == P2M_TYPE_MISSING) ?
366 p2m_missing : p2m_identity;
367 ptep = populate_extra_pte((unsigned long)(p2m + pfn));
368 set_pte(ptep,
369 pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL_RO));
455 continue; 370 continue;
456 if ((mid_p == p2m_missing) || (mid_p == p2m_identity)) 371 }
457 continue;
458 372
459 if ((unsigned long)mid_p == INVALID_P2M_ENTRY) 373 /* Complete missing or identity PMD(s) can be mapped. */
460 continue; 374 ptep = (type == P2M_TYPE_MISSING) ?
375 p2m_missing_pte : p2m_identity_pte;
376 for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
377 pmdp = populate_extra_pmd(
378 (unsigned long)(p2m + pfn + i * PTRS_PER_PTE));
379 set_pmd(pmdp, __pmd(__pa(ptep) | _KERNPG_TABLE));
380 }
381 }
382 }
461 383
462 /* The old va. Rebase it on mfn_list */ 384 void __init xen_vmalloc_p2m_tree(void)
463 if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) { 385 {
464 unsigned long *new; 386 static struct vm_struct vm;
465 387
466 if (pfn_free > (size / sizeof(unsigned long))) { 388 vm.flags = VM_ALLOC;
467 WARN(1, "Only allocated for %ld pages, but we want %ld!\n", 389 vm.size = ALIGN(sizeof(unsigned long) * xen_max_p2m_pfn,
468 size / sizeof(unsigned long), pfn_free); 390 PMD_SIZE * PMDS_PER_MID_PAGE);
469 return 0; 391 vm_area_register_early(&vm, PMD_SIZE * PMDS_PER_MID_PAGE);
470 } 392 pr_notice("p2m virtual area at %p, size is %lx\n", vm.addr, vm.size);
471 new = &mfn_list[pfn_free];
472 393
473 copy_page(new, mid_p); 394 xen_max_p2m_pfn = vm.size / sizeof(unsigned long);
474 p2m_top[topidx][mididx] = &mfn_list[pfn_free];
475 395
476 pfn_free += P2M_PER_PAGE; 396 xen_rebuild_p2m_list(vm.addr);
477 397
478 } 398 xen_p2m_addr = vm.addr;
479 /* This should be the leafs allocated for identity from _brk. */ 399 xen_p2m_size = xen_max_p2m_pfn;
480 }
481 return (unsigned long)mfn_list;
482 400
401 xen_inv_extra_mem();
402
403 m2p_override_init();
483 } 404 }
484 #else 405
485 unsigned long __init xen_revector_p2m_tree(void)
486 {
487 return 0;
488 }
489 #endif
490 unsigned long get_phys_to_machine(unsigned long pfn) 406 unsigned long get_phys_to_machine(unsigned long pfn)
491 { 407 {
492 unsigned topidx, mididx, idx; 408 pte_t *ptep;
409 unsigned int level;
493 410
494 if (unlikely(pfn >= MAX_P2M_PFN)) 411 if (unlikely(pfn >= xen_p2m_size)) {
412 if (pfn < xen_max_p2m_pfn)
413 return xen_chk_extra_mem(pfn);
414
495 return IDENTITY_FRAME(pfn); 415 return IDENTITY_FRAME(pfn);
416 }
496 417
497 topidx = p2m_top_index(pfn); 418 ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level);
498 mididx = p2m_mid_index(pfn); 419 BUG_ON(!ptep || level != PG_LEVEL_4K);
499 idx = p2m_index(pfn);
500 420
501 /* 421 /*
502 * The INVALID_P2M_ENTRY is filled in both p2m_*identity 422 * The INVALID_P2M_ENTRY is filled in both p2m_*identity
503 * and in p2m_*missing, so returning the INVALID_P2M_ENTRY 423 * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
504 * would be wrong. 424 * would be wrong.
505 */ 425 */
506 if (p2m_top[topidx][mididx] == p2m_identity) 426 if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity)))
507 return IDENTITY_FRAME(pfn); 427 return IDENTITY_FRAME(pfn);
508 428
509 return p2m_top[topidx][mididx][idx]; 429 return xen_p2m_addr[pfn];
510 } 430 }
511 EXPORT_SYMBOL_GPL(get_phys_to_machine); 431 EXPORT_SYMBOL_GPL(get_phys_to_machine);
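Callers still have to classify the returned value themselves: a real machine frame, an identity frame with IDENTITY_FRAME_BIT set, or INVALID_P2M_ENTRY for a missing page. A small hedged helper showing that classification; the function name is illustrative and the constants are assumed from asm/xen/page.h:

/* Sketch: interpret what get_phys_to_machine() reports for a pfn. */
static const char *sketch_p2m_kind(unsigned long pfn)
{
	unsigned long mfn = get_phys_to_machine(pfn);

	if (mfn == INVALID_P2M_ENTRY)
		return "missing (e.g. ballooned out)";
	if (mfn & IDENTITY_FRAME_BIT)
		return "identity mapped (MMIO, ACPI, E820 holes)";
	return "RAM backed by a real machine frame";
}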
512 432
513 static void *alloc_p2m_page(void) 433 /*
434 * Allocate new pmd(s). It is checked whether the old pmd is still in place.
435 * If not, nothing is changed. This is okay as the only reason for allocating
436 * a new pmd is to replace p2m_missing_pte or p2m_identity_pte by an individual
437 * pmd. In case of PAE/x86-32 there are multiple pmds to allocate!
438 */
439 static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *ptep, pte_t *pte_pg)
514 { 440 {
515 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); 441 pte_t *ptechk;
516 } 442 pte_t *pteret = ptep;
443 pte_t *pte_newpg[PMDS_PER_MID_PAGE];
444 pmd_t *pmdp;
445 unsigned int level;
446 unsigned long flags;
447 unsigned long vaddr;
448 int i;
517 449
518 static void free_p2m_page(void *p) 450 /* Do all allocations first to bail out in error case. */
519 { 451 for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
520 free_page((unsigned long)p); 452 pte_newpg[i] = alloc_p2m_page();
453 if (!pte_newpg[i]) {
454 for (i--; i >= 0; i--)
455 free_p2m_page(pte_newpg[i]);
456
457 return NULL;
458 }
459 }
460
461 vaddr = addr & ~(PMD_SIZE * PMDS_PER_MID_PAGE - 1);
462
463 for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
464 copy_page(pte_newpg[i], pte_pg);
465 paravirt_alloc_pte(&init_mm, __pa(pte_newpg[i]) >> PAGE_SHIFT);
466
467 pmdp = lookup_pmd_address(vaddr);
468 BUG_ON(!pmdp);
469
470 spin_lock_irqsave(&p2m_update_lock, flags);
471
472 ptechk = lookup_address(vaddr, &level);
473 if (ptechk == pte_pg) {
474 set_pmd(pmdp,
475 __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
476 if (vaddr == (addr & ~(PMD_SIZE - 1)))
477 pteret = pte_offset_kernel(pmdp, addr);
478 pte_newpg[i] = NULL;
479 }
480
481 spin_unlock_irqrestore(&p2m_update_lock, flags);
482
483 if (pte_newpg[i]) {
484 paravirt_release_pte(__pa(pte_newpg[i]) >> PAGE_SHIFT);
485 free_p2m_page(pte_newpg[i]);
486 }
487
488 vaddr += PMD_SIZE;
489 }
490
491 return pteret;
521 } 492 }
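The scheme above is deliberately optimistic: the replacement pte page(s) are allocated with no lock held, p2m_update_lock is taken only to re-check that the pmd still points at the shared missing/identity table before the private copy is installed, and a page that lost the race is freed again. The same idea, stripped of the p2m specifics; every name below is a generic placeholder, not part of this file:

/* Skeleton of the allocate / re-check under lock / install-or-discard
 * pattern used by alloc_p2m_pmd() (alloc_p2m() below plays the same game
 * with cmpxchg for the mfn tree). */
static void *install_private_copy(void **slot, void *shared,
				  spinlock_t *lock)
{
	unsigned long flags;
	void *ret;
	void *new = kzalloc(PAGE_SIZE, GFP_KERNEL);	/* may sleep, no lock held */

	if (!new)
		return NULL;

	spin_lock_irqsave(lock, flags);
	if (*slot == shared)		/* still the shared placeholder? */
		*slot = new;		/* we won the race: install our copy */
	ret = *slot;
	spin_unlock_irqrestore(lock, flags);

	if (ret != new)
		kfree(new);		/* someone beat us to it: discard ours */
	return ret;
}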
522 493
523 /* 494 /*
524 * Fully allocate the p2m structure for a given pfn. We need to check 495 * Fully allocate the p2m structure for a given pfn. We need to check
525 * that both the top and mid levels are allocated, and make sure the 496 * that both the top and mid levels are allocated, and make sure the
526 * parallel mfn tree is kept in sync. We may race with other cpus, so 497 * parallel mfn tree is kept in sync. We may race with other cpus, so
527 * the new pages are installed with cmpxchg; if we lose the race then 498 * the new pages are installed with cmpxchg; if we lose the race then
528 * simply free the page we allocated and use the one that's there. 499 * simply free the page we allocated and use the one that's there.
529 */ 500 */
530 static bool alloc_p2m(unsigned long pfn) 501 static bool alloc_p2m(unsigned long pfn)
531 { 502 {
532 unsigned topidx, mididx; 503 unsigned topidx, mididx;
533 unsigned long ***top_p, **mid;
534 unsigned long *top_mfn_p, *mid_mfn; 504 unsigned long *top_mfn_p, *mid_mfn;
535 unsigned long *p2m_orig; 505 pte_t *ptep, *pte_pg;
506 unsigned int level;
507 unsigned long flags;
508 unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
509 unsigned long p2m_pfn;
536 510
537 topidx = p2m_top_index(pfn); 511 topidx = p2m_top_index(pfn);
538 mididx = p2m_mid_index(pfn); 512 mididx = p2m_mid_index(pfn);
539 513
540 top_p = &p2m_top[topidx]; 514 ptep = lookup_address(addr, &level);
541 mid = ACCESS_ONCE(*top_p); 515 BUG_ON(!ptep || level != PG_LEVEL_4K);
516 pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
542 517
543 if (mid == p2m_mid_missing) { 518 if (pte_pg == p2m_missing_pte || pte_pg == p2m_identity_pte) {
544 /* Mid level is missing, allocate a new one */ 519 /* PMD level is missing, allocate a new one */
545 mid = alloc_p2m_page(); 520 ptep = alloc_p2m_pmd(addr, ptep, pte_pg);
546 if (!mid) 521 if (!ptep)
547 return false; 522 return false;
548
549 p2m_mid_init(mid, p2m_missing);
550
551 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
552 free_p2m_page(mid);
553 } 523 }
554 524
555 top_mfn_p = &p2m_top_mfn[topidx]; 525 if (p2m_top_mfn) {
556 mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); 526 top_mfn_p = &p2m_top_mfn[topidx];
527 mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
557 528
558 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); 529 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
559 530
560 if (mid_mfn == p2m_mid_missing_mfn) { 531 if (mid_mfn == p2m_mid_missing_mfn) {
561 /* Separately check the mid mfn level */ 532 /* Separately check the mid mfn level */
562 unsigned long missing_mfn; 533 unsigned long missing_mfn;
563 unsigned long mid_mfn_mfn; 534 unsigned long mid_mfn_mfn;
564 unsigned long old_mfn; 535 unsigned long old_mfn;
565 536
566 mid_mfn = alloc_p2m_page(); 537 mid_mfn = alloc_p2m_page();
567 if (!mid_mfn) 538 if (!mid_mfn)
568 return false; 539 return false;
569 540
570 p2m_mid_mfn_init(mid_mfn, p2m_missing); 541 p2m_mid_mfn_init(mid_mfn, p2m_missing);
571 542
572 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); 543 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
573 mid_mfn_mfn = virt_to_mfn(mid_mfn); 544 mid_mfn_mfn = virt_to_mfn(mid_mfn);
574 old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn); 545 old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn);
575 if (old_mfn != missing_mfn) { 546 if (old_mfn != missing_mfn) {
576 free_p2m_page(mid_mfn); 547 free_p2m_page(mid_mfn);
577 mid_mfn = mfn_to_virt(old_mfn); 548 mid_mfn = mfn_to_virt(old_mfn);
578 } else { 549 } else {
579 p2m_top_mfn_p[topidx] = mid_mfn; 550 p2m_top_mfn_p[topidx] = mid_mfn;
551 }
580 } 552 }
553 } else {
554 mid_mfn = NULL;
581 } 555 }
582 556
583 p2m_orig = ACCESS_ONCE(p2m_top[topidx][mididx]); 557 p2m_pfn = pte_pfn(ACCESS_ONCE(*ptep));
584 if (p2m_orig == p2m_identity || p2m_orig == p2m_missing) { 558 if (p2m_pfn == PFN_DOWN(__pa(p2m_identity)) ||
559 p2m_pfn == PFN_DOWN(__pa(p2m_missing))) {
585 /* p2m leaf page is missing */ 560 /* p2m leaf page is missing */
586 unsigned long *p2m; 561 unsigned long *p2m;
587 562
588 p2m = alloc_p2m_page(); 563 p2m = alloc_p2m_page();
589 if (!p2m) 564 if (!p2m)
590 return false; 565 return false;
591 566
592 p2m_init(p2m); 567 if (p2m_pfn == PFN_DOWN(__pa(p2m_missing)))
593 568 p2m_init(p2m);
594 if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
595 free_p2m_page(p2m);
596 else 569 else
597 mid_mfn[mididx] = virt_to_mfn(p2m); 570 p2m_init_identity(p2m, pfn);
598 }
599 571
600 return true; 572 spin_lock_irqsave(&p2m_update_lock, flags);
601 }
602 573
603 static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) 574 if (pte_pfn(*ptep) == p2m_pfn) {
604 { 575 set_pte(ptep,
605 unsigned topidx, mididx, idx; 576 pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
606 unsigned long *p2m; 577 if (mid_mfn)
607 578 mid_mfn[mididx] = virt_to_mfn(p2m);
608 topidx = p2m_top_index(pfn); 579 p2m = NULL;
609 mididx = p2m_mid_index(pfn);
610 idx = p2m_index(pfn);
611
612 /* Pfff.. No boundary cross-over, lets get out. */
613 if (!idx && check_boundary)
614 return false;
615
616 WARN(p2m_top[topidx][mididx] == p2m_identity,
617 "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
618 topidx, mididx);
619
620 /*
621 * Could be done by xen_build_dynamic_phys_to_machine..
622 */
623 if (p2m_top[topidx][mididx] != p2m_missing)
624 return false;
625
626 /* Boundary cross-over for the edges: */
627 p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
628
629 p2m_init(p2m);
630
631 p2m_top[topidx][mididx] = p2m;
632
633 return true;
634 }
635
636 static bool __init early_alloc_p2m_middle(unsigned long pfn)
637 {
638 unsigned topidx = p2m_top_index(pfn);
639 unsigned long **mid;
640
641 mid = p2m_top[topidx];
642 if (mid == p2m_mid_missing) {
643 mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
644
645 p2m_mid_init(mid, p2m_missing);
646
647 p2m_top[topidx] = mid;
648 }
649 return true;
650 }
651
652 /*
653 * Skim over the P2M tree looking at pages that are either filled with
654 * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and
655 * replace the P2M leaf with a p2m_missing or p2m_identity.
656 * Stick the old page in the new P2M tree location.
657 */
658 static bool __init early_can_reuse_p2m_middle(unsigned long set_pfn)
659 {
660 unsigned topidx;
661 unsigned mididx;
662 unsigned ident_pfns;
663 unsigned inv_pfns;
664 unsigned long *p2m;
665 unsigned idx;
666 unsigned long pfn;
667
668 /* We only look when this entails a P2M middle layer */
669 if (p2m_index(set_pfn))
670 return false;
671
672 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) {
673 topidx = p2m_top_index(pfn);
674
675 if (!p2m_top[topidx])
676 continue;
677
678 if (p2m_top[topidx] == p2m_mid_missing)
679 continue;
680
681 mididx = p2m_mid_index(pfn);
682 p2m = p2m_top[topidx][mididx];
683 if (!p2m)
684 continue;
685
686 if ((p2m == p2m_missing) || (p2m == p2m_identity))
687 continue;
688
689 if ((unsigned long)p2m == INVALID_P2M_ENTRY)
690 continue;
691
692 ident_pfns = 0;
693 inv_pfns = 0;
694 for (idx = 0; idx < P2M_PER_PAGE; idx++) {
695 /* IDENTITY_PFNs are 1:1 */
696 if (p2m[idx] == IDENTITY_FRAME(pfn + idx))
697 ident_pfns++;
698 else if (p2m[idx] == INVALID_P2M_ENTRY)
699 inv_pfns++;
700 else
701 break;
702 } 580 }
703 if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE))
704 goto found;
705 }
706 return false;
707 found:
708 /* Found one, replace old with p2m_identity or p2m_missing */
709 p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing);
710 581
711 /* Reset where we want to stick the old page in. */ 582 spin_unlock_irqrestore(&p2m_update_lock, flags);
712 topidx = p2m_top_index(set_pfn);
713 mididx = p2m_mid_index(set_pfn);
714 583
715 /* This shouldn't happen */ 584 if (p2m)
716 if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) 585 free_p2m_page(p2m);
717 early_alloc_p2m_middle(set_pfn);
718
719 if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing))
720 return false;
721
722 p2m_init(p2m);
723 p2m_top[topidx][mididx] = p2m;
724
725 return true;
726 }
727 bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
728 {
729 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
730 if (!early_alloc_p2m_middle(pfn))
731 return false;
732
733 if (early_can_reuse_p2m_middle(pfn))
734 return __set_phys_to_machine(pfn, mfn);
735
736 if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/))
737 return false;
738
739 if (!__set_phys_to_machine(pfn, mfn))
740 return false;
741 } 586 }
742 587
743 return true; 588 return true;
744 } 589 }
745 590
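The comment above alloc_p2m() describes the lock-free half of the scheme: allocate a page, try to swing the shared pointer from the "missing" placeholder to it with a single compare-and-swap, and free the local copy if another CPU wins. A self-contained C11 sketch of that install-or-free idiom (illustrative names; atomic_compare_exchange_strong stands in for the kernel's cmpxchg):

#include <stdatomic.h>
#include <stdlib.h>

#define PAGE_WORDS 512

static unsigned long *missing_page;	/* analogue of p2m_mid_missing_mfn */
static unsigned long * _Atomic slot;	/* analogue of p2m_top_mfn[topidx] */

/* Allocate a level page and publish it, or adopt the winner's page. */
static unsigned long *install_level(void)
{
	unsigned long *expected = missing_page;
	unsigned long *newpg;

	newpg = calloc(PAGE_WORDS, sizeof(*newpg));
	if (!newpg)
		return NULL;

	if (!atomic_compare_exchange_strong(&slot, &expected, newpg)) {
		free(newpg);		/* lost the race */
		return expected;	/* use the page that is already there */
	}
	return newpg;
}

int main(void)
{
	missing_page = calloc(PAGE_WORDS, sizeof(unsigned long));
	slot = missing_page;
	return install_level() ? 0 : 1;
}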
746 static void __init early_split_p2m(unsigned long pfn)
747 {
748 unsigned long mididx, idx;
749
750 mididx = p2m_mid_index(pfn);
751 idx = p2m_index(pfn);
752
753 /*
754 * Allocate new middle and leaf pages if this pfn lies in the
755 * middle of one.
756 */
757 if (mididx || idx)
758 early_alloc_p2m_middle(pfn);
759 if (idx)
760 early_alloc_p2m(pfn, false);
761 }
762
763 unsigned long __init set_phys_range_identity(unsigned long pfn_s, 591 unsigned long __init set_phys_range_identity(unsigned long pfn_s,
764 unsigned long pfn_e) 592 unsigned long pfn_e)
765 { 593 {
766 unsigned long pfn; 594 unsigned long pfn;
767 595
768 if (unlikely(pfn_s >= MAX_P2M_PFN)) 596 if (unlikely(pfn_s >= xen_p2m_size))
769 return 0; 597 return 0;
770 598
771 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) 599 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
772 return pfn_e - pfn_s; 600 return pfn_e - pfn_s;
773 601
774 if (pfn_s > pfn_e) 602 if (pfn_s > pfn_e)
775 return 0; 603 return 0;
776 604
777 if (pfn_e > MAX_P2M_PFN) 605 if (pfn_e > xen_p2m_size)
778 pfn_e = MAX_P2M_PFN; 606 pfn_e = xen_p2m_size;
779 607
780 early_split_p2m(pfn_s); 608 for (pfn = pfn_s; pfn < pfn_e; pfn++)
781 early_split_p2m(pfn_e); 609 xen_p2m_addr[pfn] = IDENTITY_FRAME(pfn);
782 610
783 for (pfn = pfn_s; pfn < pfn_e;) {
784 unsigned topidx = p2m_top_index(pfn);
785 unsigned mididx = p2m_mid_index(pfn);
786
787 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
788 break;
789 pfn++;
790
791 /*
792 * If the PFN was set to a middle or leaf identity
793 * page the remainder must also be identity, so skip
794 * ahead to the next middle or leaf entry.
795 */
796 if (p2m_top[topidx] == p2m_mid_identity)
797 pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE);
798 else if (p2m_top[topidx][mididx] == p2m_identity)
799 pfn = ALIGN(pfn, P2M_PER_PAGE);
800 }
801
802 WARN((pfn - pfn_s) != (pfn_e - pfn_s),
803 "Identity mapping failed. We are %ld short of 1-1 mappings!\n",
804 (pfn_e - pfn_s) - (pfn - pfn_s));
805
806 return pfn - pfn_s; 611 return pfn - pfn_s;
807 } 612 }
808 613
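With the linear, virtually mapped p2m list, set_phys_range_identity() above collapses to a plain loop that writes IDENTITY_FRAME(pfn) into consecutive entries. A small standalone illustration of that encoding and loop (the bit position is chosen in the spirit of the kernel's IDENTITY_FRAME_BIT but is only illustrative here):

#include <stdio.h>

/* Illustrative identity marker in the second-highest bit of an entry. */
#define IDENTITY_BIT	(1UL << (sizeof(unsigned long) * 8 - 2))
#define IDENTITY(pfn)	((pfn) | IDENTITY_BIT)

static unsigned long p2m[1024];		/* stand-in for the flat xen_p2m_addr[] */

/* A flat list turns the 1:1 range setup into a simple store loop. */
static unsigned long set_range_identity(unsigned long s, unsigned long e)
{
	unsigned long pfn;

	for (pfn = s; pfn < e; pfn++)
		p2m[pfn] = IDENTITY(pfn);
	return e - s;
}

int main(void)
{
	printf("marked %lu pfns as 1:1\n", set_range_identity(16, 32));
	return 0;
}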
809 /* Try to install p2m mapping; fail if intermediate bits missing */
810 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) 614 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
811 { 615 {
812 unsigned topidx, mididx, idx; 616 pte_t *ptep;
617 unsigned int level;
813 618
814 /* don't track P2M changes in autotranslate guests */ 619 /* don't track P2M changes in autotranslate guests */
815 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) 620 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
816 return true; 621 return true;
817 622
818 if (unlikely(pfn >= MAX_P2M_PFN)) { 623 if (unlikely(pfn >= xen_p2m_size)) {
819 BUG_ON(mfn != INVALID_P2M_ENTRY); 624 BUG_ON(mfn != INVALID_P2M_ENTRY);
820 return true; 625 return true;
821 } 626 }
822 627
823 topidx = p2m_top_index(pfn); 628 if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
824 mididx = p2m_mid_index(pfn); 629 return true;
825 idx = p2m_index(pfn);
826 630
827 /* For sparse holes were the p2m leaf has real PFN along with 631 ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level);
828 * PCI holes, stick in the PFN as the MFN value. 632 BUG_ON(!ptep || level != PG_LEVEL_4K);
829 *
830 * set_phys_range_identity() will have allocated new middle
831 * and leaf pages as required so an existing p2m_mid_missing
832 * or p2m_missing mean that whole range will be identity so
833 * these can be switched to p2m_mid_identity or p2m_identity.
834 */
835 if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
836 if (p2m_top[topidx] == p2m_mid_identity)
837 return true;
838 633
839 if (p2m_top[topidx] == p2m_mid_missing) { 634 if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_missing)))
840 WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing,
841 p2m_mid_identity) != p2m_mid_missing);
842 return true;
843 }
844
845 if (p2m_top[topidx][mididx] == p2m_identity)
846 return true;
847
848 /* Swap over from MISSING to IDENTITY if needed. */
849 if (p2m_top[topidx][mididx] == p2m_missing) {
850 WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
851 p2m_identity) != p2m_missing);
852 return true;
853 }
854 }
855
856 if (p2m_top[topidx][mididx] == p2m_missing)
857 return mfn == INVALID_P2M_ENTRY; 635 return mfn == INVALID_P2M_ENTRY;
858 636
859 p2m_top[topidx][mididx][idx] = mfn; 637 if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity)))
638 return mfn == IDENTITY_FRAME(pfn);
860 639
861 return true; 640 return false;
862 } 641 }
863 642
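The new __set_phys_to_machine() above first attempts a fault-safe store into the linear list; if the backing page is one of the shared read-only pages (p2m_missing or p2m_identity) the store fails, and the call only succeeds when the requested value is exactly what that shared page already represents. A compact C model of that decision (the enum and constants are illustrative, not kernel definitions):

/* What kind of page backs the p2m entry we are writing. */
enum p2m_page_kind { P2M_WRITABLE, P2M_RO_MISSING, P2M_RO_IDENTITY };

#define INVALID_ENTRY	(~0UL)
#define IDENTITY(pfn)	((pfn) | (1UL << (sizeof(unsigned long) * 8 - 2)))

static int try_set_p2m(enum p2m_page_kind kind, unsigned long *slot,
		       unsigned long pfn, unsigned long mfn)
{
	if (kind == P2M_WRITABLE) {	/* normal, writable leaf page */
		*slot = mfn;
		return 1;
	}
	/*
	 * Read-only shared page: the store "faults"; report success only if
	 * the value is already what the shared page stands for, otherwise
	 * the caller has to allocate a private leaf first.
	 */
	if (kind == P2M_RO_MISSING)
		return mfn == INVALID_ENTRY;
	return mfn == IDENTITY(pfn);	/* P2M_RO_IDENTITY */
}

int main(void)
{
	unsigned long leaf = INVALID_ENTRY;

	return try_set_p2m(P2M_WRITABLE, &leaf, 7, IDENTITY(7)) &&
	       try_set_p2m(P2M_RO_MISSING, &leaf, 7, INVALID_ENTRY) ? 0 : 1;
}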
864 bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) 643 bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
865 { 644 {
866 if (unlikely(!__set_phys_to_machine(pfn, mfn))) { 645 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
867 if (!alloc_p2m(pfn)) 646 if (!alloc_p2m(pfn))
868 return false; 647 return false;
869 648
870 if (!__set_phys_to_machine(pfn, mfn)) 649 return __set_phys_to_machine(pfn, mfn);
871 return false;
872 } 650 }
873 651
874 return true; 652 return true;
875 } 653 }
876 654
877 #define M2P_OVERRIDE_HASH_SHIFT 10 655 #define M2P_OVERRIDE_HASH_SHIFT 10
878 #define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) 656 #define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
879 657
880 static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH); 658 static struct list_head *m2p_overrides;
881 static DEFINE_SPINLOCK(m2p_override_lock); 659 static DEFINE_SPINLOCK(m2p_override_lock);
882 660
883 static void __init m2p_override_init(void) 661 static void __init m2p_override_init(void)
884 { 662 {
885 unsigned i; 663 unsigned i;
886 664
887 m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, 665 m2p_overrides = alloc_bootmem_align(
888 sizeof(unsigned long)); 666 sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
667 sizeof(unsigned long));
889 668
890 for (i = 0; i < M2P_OVERRIDE_HASH; i++) 669 for (i = 0; i < M2P_OVERRIDE_HASH; i++)
891 INIT_LIST_HEAD(&m2p_overrides[i]); 670 INIT_LIST_HEAD(&m2p_overrides[i]);
892 } 671 }
893 672
894 static unsigned long mfn_hash(unsigned long mfn) 673 static unsigned long mfn_hash(unsigned long mfn)
895 { 674 {
896 return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); 675 return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
897 } 676 }
898 677
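The m2p override table above is a fixed array of M2P_OVERRIDE_HASH buckets indexed by a hash of the mfn; add and lookup are plain bucket-list operations done under m2p_override_lock. A simplified userspace version of the same structure (singly linked buckets, a multiplicative hash instead of hash_long(), and no locking):

#include <stddef.h>

#define HASH_SHIFT	10
#define HASH_SIZE	(1 << HASH_SHIFT)

struct override {
	struct override *next;
	unsigned long mfn;
	unsigned long pfn;
};

static struct override *buckets[HASH_SIZE];

static unsigned long mfn_hash(unsigned long mfn)
{
	return (mfn * 2654435761UL) % HASH_SIZE;	/* simplified hash */
}

static void override_add(struct override *o)
{
	struct override **head = &buckets[mfn_hash(o->mfn)];

	o->next = *head;
	*head = o;
}

static struct override *override_find(unsigned long mfn)
{
	struct override *o;

	for (o = buckets[mfn_hash(mfn)]; o; o = o->next)
		if (o->mfn == mfn)
			return o;
	return NULL;
}

int main(void)
{
	static struct override o = { NULL, 0x1234, 42 };

	override_add(&o);
	return override_find(0x1234) ? 0 : 1;
}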
899 int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
900 struct gnttab_map_grant_ref *kmap_ops,
901 struct page **pages, unsigned int count)
902 {
903 int i, ret = 0;
904 bool lazy = false;
905 pte_t *pte;
906
907 if (xen_feature(XENFEAT_auto_translated_physmap))
908 return 0;
909
910 if (kmap_ops &&
911 !in_interrupt() &&
912 paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
913 arch_enter_lazy_mmu_mode();
914 lazy = true;
915 }
916
917 for (i = 0; i < count; i++) {
918 unsigned long mfn, pfn;
919
920 /* Do not add to override if the map failed. */
921 if (map_ops[i].status)
922 continue;
923
924 if (map_ops[i].flags & GNTMAP_contains_pte) {
925 pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
926 (map_ops[i].host_addr & ~PAGE_MASK));
927 mfn = pte_mfn(*pte);
928 } else {
929 mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
930 }
931 pfn = page_to_pfn(pages[i]);
932
933 WARN_ON(PagePrivate(pages[i]));
934 SetPagePrivate(pages[i]);
935 set_page_private(pages[i], mfn);
936 pages[i]->index = pfn_to_mfn(pfn);
937
938 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
939 ret = -ENOMEM;
940 goto out;
941 }
942
943 if (kmap_ops) {
944 ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
945 if (ret)
946 goto out;
947 }
948 }
949
950 out:
951 if (lazy)
952 arch_leave_lazy_mmu_mode();
953
954 return ret;
955 }
956 EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
957
958 /* Add an MFN override for a particular page */ 678 /* Add an MFN override for a particular page */
959 int m2p_add_override(unsigned long mfn, struct page *page, 679 static int m2p_add_override(unsigned long mfn, struct page *page,
960 struct gnttab_map_grant_ref *kmap_op) 680 struct gnttab_map_grant_ref *kmap_op)
961 { 681 {
962 unsigned long flags; 682 unsigned long flags;
963 unsigned long pfn; 683 unsigned long pfn;
964 unsigned long uninitialized_var(address); 684 unsigned long uninitialized_var(address);
965 unsigned level; 685 unsigned level;
966 pte_t *ptep = NULL; 686 pte_t *ptep = NULL;
967 687
968 pfn = page_to_pfn(page); 688 pfn = page_to_pfn(page);
969 if (!PageHighMem(page)) { 689 if (!PageHighMem(page)) {
970 address = (unsigned long)__va(pfn << PAGE_SHIFT); 690 address = (unsigned long)__va(pfn << PAGE_SHIFT);
971 ptep = lookup_address(address, &level); 691 ptep = lookup_address(address, &level);
972 if (WARN(ptep == NULL || level != PG_LEVEL_4K, 692 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
973 "m2p_add_override: pfn %lx not mapped", pfn)) 693 "m2p_add_override: pfn %lx not mapped", pfn))
974 return -EINVAL; 694 return -EINVAL;
975 } 695 }
976 696
977 if (kmap_op != NULL) { 697 if (kmap_op != NULL) {
978 if (!PageHighMem(page)) { 698 if (!PageHighMem(page)) {
979 struct multicall_space mcs = 699 struct multicall_space mcs =
980 xen_mc_entry(sizeof(*kmap_op)); 700 xen_mc_entry(sizeof(*kmap_op));
981 701
982 MULTI_grant_table_op(mcs.mc, 702 MULTI_grant_table_op(mcs.mc,
983 GNTTABOP_map_grant_ref, kmap_op, 1); 703 GNTTABOP_map_grant_ref, kmap_op, 1);
984 704
985 xen_mc_issue(PARAVIRT_LAZY_MMU); 705 xen_mc_issue(PARAVIRT_LAZY_MMU);
986 } 706 }
987 } 707 }
988 spin_lock_irqsave(&m2p_override_lock, flags); 708 spin_lock_irqsave(&m2p_override_lock, flags);
989 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); 709 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
990 spin_unlock_irqrestore(&m2p_override_lock, flags); 710 spin_unlock_irqrestore(&m2p_override_lock, flags);
991 711
992 /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in 712 /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in
993 * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other 713 * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other
994 * pfn so that the following mfn_to_pfn(mfn) calls will return the 714 * pfn so that the following mfn_to_pfn(mfn) calls will return the
995 * pfn from the m2p_override (the backend pfn) instead. 715 * pfn from the m2p_override (the backend pfn) instead.
996 * We need to do this because the pages shared by the frontend 716 * We need to do this because the pages shared by the frontend
997 * (xen-blkfront) can be already locked (lock_page, called by 717 * (xen-blkfront) can be already locked (lock_page, called by
998 * do_read_cache_page); when the userspace backend tries to use them 718 * do_read_cache_page); when the userspace backend tries to use them
999 * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so 719 * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so
1000 * do_blockdev_direct_IO is going to try to lock the same pages 720 * do_blockdev_direct_IO is going to try to lock the same pages
1001 * again resulting in a deadlock. 721 * again resulting in a deadlock.
1002 * As a side effect get_user_pages_fast might not be safe on the 722 * As a side effect get_user_pages_fast might not be safe on the
1003 * frontend pages while they are being shared with the backend, 723 * frontend pages while they are being shared with the backend,
1004 * because mfn_to_pfn (that ends up being called by GUPF) will 724 * because mfn_to_pfn (that ends up being called by GUPF) will
1005 * return the backend pfn rather than the frontend pfn. */ 725 * return the backend pfn rather than the frontend pfn. */
1006 pfn = mfn_to_pfn_no_overrides(mfn); 726 pfn = mfn_to_pfn_no_overrides(mfn);
1007 if (get_phys_to_machine(pfn) == mfn) 727 if (__pfn_to_mfn(pfn) == mfn)
1008 set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); 728 set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
1009 729
1010 return 0; 730 return 0;
1011 } 731 }
1012 EXPORT_SYMBOL_GPL(m2p_add_override);
1013 732
1014 int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, 733 int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
1015 struct gnttab_map_grant_ref *kmap_ops, 734 struct gnttab_map_grant_ref *kmap_ops,
1016 struct page **pages, unsigned int count) 735 struct page **pages, unsigned int count)
1017 { 736 {
1018 int i, ret = 0; 737 int i, ret = 0;
1019 bool lazy = false; 738 bool lazy = false;
739 pte_t *pte;
1020 740
1021 if (xen_feature(XENFEAT_auto_translated_physmap)) 741 if (xen_feature(XENFEAT_auto_translated_physmap))
1022 return 0; 742 return 0;
1023 743
1024 if (kmap_ops && 744 if (kmap_ops &&
1025 !in_interrupt() && 745 !in_interrupt() &&
1026 paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { 746 paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
1027 arch_enter_lazy_mmu_mode(); 747 arch_enter_lazy_mmu_mode();
1028 lazy = true; 748 lazy = true;
1029 } 749 }
1030 750
1031 for (i = 0; i < count; i++) { 751 for (i = 0; i < count; i++) {
1032 unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i])); 752 unsigned long mfn, pfn;
1033 unsigned long pfn = page_to_pfn(pages[i]);
1034 753
1035 if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { 754 /* Do not add to override if the map failed. */
1036 ret = -EINVAL; 755 if (map_ops[i].status)
1037 goto out; 756 continue;
757
758 if (map_ops[i].flags & GNTMAP_contains_pte) {
759 pte = (pte_t *)(mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
760 (map_ops[i].host_addr & ~PAGE_MASK));
761 mfn = pte_mfn(*pte);
762 } else {
763 mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
1038 } 764 }
765 pfn = page_to_pfn(pages[i]);
1039 766
1040 set_page_private(pages[i], INVALID_P2M_ENTRY); 767 WARN_ON(PagePrivate(pages[i]));
1041 WARN_ON(!PagePrivate(pages[i])); 768 SetPagePrivate(pages[i]);
1042 ClearPagePrivate(pages[i]); 769 set_page_private(pages[i], mfn);
1043 set_phys_to_machine(pfn, pages[i]->index); 770 pages[i]->index = pfn_to_mfn(pfn);
1044 771
1045 if (kmap_ops) 772 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
1046 ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn); 773 ret = -ENOMEM;
1047 if (ret)
1048 goto out; 774 goto out;
775 }
776
777 if (kmap_ops) {
778 ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
779 if (ret)
780 goto out;
781 }
1049 } 782 }
1050 783
1051 out: 784 out:
1052 if (lazy) 785 if (lazy)
1053 arch_leave_lazy_mmu_mode(); 786 arch_leave_lazy_mmu_mode();
787
1054 return ret; 788 return ret;
1055 } 789 }
1056 EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); 790 EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
1057 791
1058 int m2p_remove_override(struct page *page, 792 static struct page *m2p_find_override(unsigned long mfn)
1059 struct gnttab_map_grant_ref *kmap_op,
1060 unsigned long mfn)
1061 { 793 {
1062 unsigned long flags; 794 unsigned long flags;
795 struct list_head *bucket;
796 struct page *p, *ret;
797
798 if (unlikely(!m2p_overrides))
799 return NULL;
800
801 ret = NULL;
802 bucket = &m2p_overrides[mfn_hash(mfn)];
803
804 spin_lock_irqsave(&m2p_override_lock, flags);
805
806 list_for_each_entry(p, bucket, lru) {
807 if (page_private(p) == mfn) {
808 ret = p;
809 break;
810 }
811 }
812
813 spin_unlock_irqrestore(&m2p_override_lock, flags);
814
815 return ret;
816 }
817
818 static int m2p_remove_override(struct page *page,
819 struct gnttab_map_grant_ref *kmap_op,
820 unsigned long mfn)
821 {
822 unsigned long flags;
1063 unsigned long pfn; 823 unsigned long pfn;
1064 unsigned long uninitialized_var(address); 824 unsigned long uninitialized_var(address);
1065 unsigned level; 825 unsigned level;
1066 pte_t *ptep = NULL; 826 pte_t *ptep = NULL;
1067 827
1068 pfn = page_to_pfn(page); 828 pfn = page_to_pfn(page);
1069 829
1070 if (!PageHighMem(page)) { 830 if (!PageHighMem(page)) {
1071 address = (unsigned long)__va(pfn << PAGE_SHIFT); 831 address = (unsigned long)__va(pfn << PAGE_SHIFT);
1072 ptep = lookup_address(address, &level); 832 ptep = lookup_address(address, &level);
1073 833
1074 if (WARN(ptep == NULL || level != PG_LEVEL_4K, 834 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
1075 "m2p_remove_override: pfn %lx not mapped", pfn)) 835 "m2p_remove_override: pfn %lx not mapped", pfn))
1076 return -EINVAL; 836 return -EINVAL;
1077 } 837 }
1078 838
1079 spin_lock_irqsave(&m2p_override_lock, flags); 839 spin_lock_irqsave(&m2p_override_lock, flags);
1080 list_del(&page->lru); 840 list_del(&page->lru);
1081 spin_unlock_irqrestore(&m2p_override_lock, flags); 841 spin_unlock_irqrestore(&m2p_override_lock, flags);
1082 842
1083 if (kmap_op != NULL) { 843 if (kmap_op != NULL) {
1084 if (!PageHighMem(page)) { 844 if (!PageHighMem(page)) {
1085 struct multicall_space mcs; 845 struct multicall_space mcs;
1086 struct gnttab_unmap_and_replace *unmap_op; 846 struct gnttab_unmap_and_replace *unmap_op;
1087 struct page *scratch_page = get_balloon_scratch_page(); 847 struct page *scratch_page = get_balloon_scratch_page();
1088 unsigned long scratch_page_address = (unsigned long) 848 unsigned long scratch_page_address = (unsigned long)
1089 __va(page_to_pfn(scratch_page) << PAGE_SHIFT); 849 __va(page_to_pfn(scratch_page) << PAGE_SHIFT);
1090 850
1091 /* 851 /*
1092 * It might be that we queued all the m2p grant table 852 * It might be that we queued all the m2p grant table
1093 * hypercalls in a multicall, then m2p_remove_override 853 * hypercalls in a multicall, then m2p_remove_override
1094 * gets called before the multicall has actually been 854 * gets called before the multicall has actually been
1095 * issued. In this case handle is going to -1 because 855 * issued. In this case handle is going to -1 because
1096 * it hasn't been modified yet. 856 * it hasn't been modified yet.
arch/x86/xen/setup.c
1 /* 1 /*
2 * Machine specific setup for xen 2 * Machine specific setup for xen
3 * 3 *
4 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 4 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
5 */ 5 */
6 6
7 #include <linux/module.h> 7 #include <linux/module.h>
8 #include <linux/sched.h> 8 #include <linux/sched.h>
9 #include <linux/mm.h> 9 #include <linux/mm.h>
10 #include <linux/pm.h> 10 #include <linux/pm.h>
11 #include <linux/memblock.h> 11 #include <linux/memblock.h>
12 #include <linux/cpuidle.h> 12 #include <linux/cpuidle.h>
13 #include <linux/cpufreq.h> 13 #include <linux/cpufreq.h>
14 14
15 #include <asm/elf.h> 15 #include <asm/elf.h>
16 #include <asm/vdso.h> 16 #include <asm/vdso.h>
17 #include <asm/e820.h> 17 #include <asm/e820.h>
18 #include <asm/setup.h> 18 #include <asm/setup.h>
19 #include <asm/acpi.h> 19 #include <asm/acpi.h>
20 #include <asm/numa.h> 20 #include <asm/numa.h>
21 #include <asm/xen/hypervisor.h> 21 #include <asm/xen/hypervisor.h>
22 #include <asm/xen/hypercall.h> 22 #include <asm/xen/hypercall.h>
23 23
24 #include <xen/xen.h> 24 #include <xen/xen.h>
25 #include <xen/page.h> 25 #include <xen/page.h>
26 #include <xen/interface/callback.h> 26 #include <xen/interface/callback.h>
27 #include <xen/interface/memory.h> 27 #include <xen/interface/memory.h>
28 #include <xen/interface/physdev.h> 28 #include <xen/interface/physdev.h>
29 #include <xen/features.h> 29 #include <xen/features.h>
30 #include "xen-ops.h" 30 #include "xen-ops.h"
31 #include "vdso.h" 31 #include "vdso.h"
32 #include "p2m.h" 32 #include "p2m.h"
33 #include "mmu.h"
33 34
34 /* These are code, but not functions. Defined in entry.S */ 35 /* These are code, but not functions. Defined in entry.S */
35 extern const char xen_hypervisor_callback[]; 36 extern const char xen_hypervisor_callback[];
36 extern const char xen_failsafe_callback[]; 37 extern const char xen_failsafe_callback[];
37 #ifdef CONFIG_X86_64 38 #ifdef CONFIG_X86_64
38 extern asmlinkage void nmi(void); 39 extern asmlinkage void nmi(void);
39 #endif 40 #endif
40 extern void xen_sysenter_target(void); 41 extern void xen_sysenter_target(void);
41 extern void xen_syscall_target(void); 42 extern void xen_syscall_target(void);
42 extern void xen_syscall32_target(void); 43 extern void xen_syscall32_target(void);
43 44
44 /* Amount of extra memory space we add to the e820 ranges */ 45 /* Amount of extra memory space we add to the e820 ranges */
45 struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; 46 struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
46 47
47 /* Number of pages released from the initial allocation. */ 48 /* Number of pages released from the initial allocation. */
48 unsigned long xen_released_pages; 49 unsigned long xen_released_pages;
49 50
50 /* Buffer used to remap identity mapped pages */ 51 /*
51 unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata; 52 * Buffer used to remap identity mapped pages. We only need the virtual space.
53 * The physical page behind this address is remapped as needed to different
54 * buffer pages.
55 */
56 #define REMAP_SIZE (P2M_PER_PAGE - 3)
57 static struct {
58 unsigned long next_area_mfn;
59 unsigned long target_pfn;
60 unsigned long size;
61 unsigned long mfns[REMAP_SIZE];
62 } xen_remap_buf __initdata __aligned(PAGE_SIZE);
63 static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
52 64
53 /* 65 /*
54 * The maximum amount of extra memory compared to the base size. The 66 * The maximum amount of extra memory compared to the base size. The
55 * main scaling factor is the size of struct page. At extreme ratios 67 * main scaling factor is the size of struct page. At extreme ratios
56 * of base:extra, all the base memory can be filled with page 68 * of base:extra, all the base memory can be filled with page
57 * structures for the extra memory, leaving no space for anything 69 * structures for the extra memory, leaving no space for anything
58 * else. 70 * else.
59 * 71 *
60 * 10x seems like a reasonable balance between scaling flexibility and 72 * 10x seems like a reasonable balance between scaling flexibility and
61 * leaving a practically usable system. 73 * leaving a practically usable system.
62 */ 74 */
63 #define EXTRA_MEM_RATIO (10) 75 #define EXTRA_MEM_RATIO (10)
64 76
65 static void __init xen_add_extra_mem(u64 start, u64 size) 77 static void __init xen_add_extra_mem(u64 start, u64 size)
66 { 78 {
67 unsigned long pfn;
68 int i; 79 int i;
69 80
70 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { 81 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
71 /* Add new region. */ 82 /* Add new region. */
72 if (xen_extra_mem[i].size == 0) { 83 if (xen_extra_mem[i].size == 0) {
73 xen_extra_mem[i].start = start; 84 xen_extra_mem[i].start = start;
74 xen_extra_mem[i].size = size; 85 xen_extra_mem[i].size = size;
75 break; 86 break;
76 } 87 }
77 /* Append to existing region. */ 88 /* Append to existing region. */
78 if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { 89 if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
79 xen_extra_mem[i].size += size; 90 xen_extra_mem[i].size += size;
80 break; 91 break;
81 } 92 }
82 } 93 }
83 if (i == XEN_EXTRA_MEM_MAX_REGIONS) 94 if (i == XEN_EXTRA_MEM_MAX_REGIONS)
84 printk(KERN_WARNING "Warning: not enough extra memory regions\n"); 95 printk(KERN_WARNING "Warning: not enough extra memory regions\n");
85 96
86 memblock_reserve(start, size); 97 memblock_reserve(start, size);
98 }
87 99
88 xen_max_p2m_pfn = PFN_DOWN(start + size); 100 static void __init xen_del_extra_mem(u64 start, u64 size)
89 for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { 101 {
90 unsigned long mfn = pfn_to_mfn(pfn); 102 int i;
103 u64 start_r, size_r;
91 104
92 if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) 105 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
93 continue; 106 start_r = xen_extra_mem[i].start;
94 WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", 107 size_r = xen_extra_mem[i].size;
95 pfn, mfn);
96 108
97 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 109 /* Start of region. */
110 if (start_r == start) {
111 BUG_ON(size > size_r);
112 xen_extra_mem[i].start += size;
113 xen_extra_mem[i].size -= size;
114 break;
115 }
116 /* End of region. */
117 if (start_r + size_r == start + size) {
118 BUG_ON(size > size_r);
119 xen_extra_mem[i].size -= size;
120 break;
121 }
122 /* Mid of region. */
123 if (start > start_r && start < start_r + size_r) {
124 BUG_ON(start + size > start_r + size_r);
125 xen_extra_mem[i].size = start - start_r;
126 /* Calling memblock_reserve() again is okay. */
127 xen_add_extra_mem(start + size, start_r + size_r -
128 (start + size));
129 break;
130 }
98 } 131 }
132 memblock_free(start, size);
99 } 133 }
100 134
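xen_del_extra_mem() above has to cope with the deleted range touching the front of a region, the back of a region, or lying entirely inside one (in which case the region is split and the tail re-added). The arithmetic can be shown with a small standalone helper (illustrative types; the split-off remainder is returned through *tail instead of being re-added to a global array):

#include <stdio.h>

struct region {
	unsigned long long start;
	unsigned long long size;
};

/* Remove [start, start + size) from *r; *tail receives any split-off rest. */
static void region_del(struct region *r, struct region *tail,
		       unsigned long long start, unsigned long long size)
{
	unsigned long long r_end = r->start + r->size;
	unsigned long long end = start + size;

	tail->start = 0;
	tail->size = 0;

	if (start == r->start) {			/* front of the region */
		r->start += size;
		r->size -= size;
	} else if (end == r_end) {			/* back of the region */
		r->size -= size;
	} else if (start > r->start && start < r_end) {	/* middle: split */
		r->size = start - r->start;
		tail->start = end;
		tail->size = r_end - end;
	}
}

int main(void)
{
	struct region r = { 0x1000, 0x10000 }, tail;

	region_del(&r, &tail, 0x2000, 0x1000);
	printf("head %llx+%llx, tail %llx+%llx\n",
	       r.start, r.size, tail.start, tail.size);
	return 0;
}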
101 static unsigned long __init xen_do_chunk(unsigned long start, 135 /*
102 unsigned long end, bool release) 136 * Called during boot before the p2m list can take entries beyond the
137 * hypervisor supplied p2m list. Entries in extra mem are to be regarded as
138 * invalid.
139 */
140 unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
103 { 141 {
104 struct xen_memory_reservation reservation = { 142 int i;
105 .address_bits = 0, 143 unsigned long addr = PFN_PHYS(pfn);
106 .extent_order = 0,
107 .domid = DOMID_SELF
108 };
109 unsigned long len = 0;
110 unsigned long pfn;
111 int ret;
112 144
113 for (pfn = start; pfn < end; pfn++) { 145 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
114 unsigned long frame; 146 if (addr >= xen_extra_mem[i].start &&
115 unsigned long mfn = pfn_to_mfn(pfn); 147 addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
148 return INVALID_P2M_ENTRY;
149 }
116 150
117 if (release) { 151 return IDENTITY_FRAME(pfn);
118 /* Make sure pfn exists to start with */ 152 }
119 if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
120 continue;
121 frame = mfn;
122 } else {
123 if (mfn != INVALID_P2M_ENTRY)
124 continue;
125 frame = pfn;
126 }
127 set_xen_guest_handle(reservation.extent_start, &frame);
128 reservation.nr_extents = 1;
129 153
130 ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap, 154 /*
131 &reservation); 155 * Mark all pfns of extra mem as invalid in p2m list.
132 WARN(ret != 1, "Failed to %s pfn %lx err=%d\n", 156 */
133 release ? "release" : "populate", pfn, ret); 157 void __init xen_inv_extra_mem(void)
158 {
159 unsigned long pfn, pfn_s, pfn_e;
160 int i;
134 161
135 if (ret == 1) { 162 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
136 if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) { 163 pfn_s = PFN_DOWN(xen_extra_mem[i].start);
137 if (release) 164 pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
138 break; 165 for (pfn = pfn_s; pfn < pfn_e; pfn++)
139 set_xen_guest_handle(reservation.extent_start, &frame); 166 set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
140 reservation.nr_extents = 1;
141 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
142 &reservation);
143 break;
144 }
145 len++;
146 } else
147 break;
148 } 167 }
149 if (len)
150 printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
151 release ? "Freeing" : "Populating",
152 start, end, len,
153 release ? "freed" : "added");
154
155 return len;
156 } 168 }
157 169
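xen_chk_extra_mem() and xen_inv_extra_mem() above implement the rule spelled out in the comment: any pfn that falls inside an extra-memory region is reported as invalid, everything else is treated as 1:1. A few lines of standalone C capture the check (PAGE_SHIFT and the identity encoding are illustrative):

struct mem_region { unsigned long long start, size; };

#define PAGE_SHIFT	12
#define INVALID_ENTRY	(~0UL)
#define IDENTITY(pfn)	((pfn) | (1UL << (sizeof(unsigned long) * 8 - 2)))

static unsigned long chk_extra_mem(const struct mem_region *regs, int n,
				   unsigned long pfn)
{
	unsigned long long addr = (unsigned long long)pfn << PAGE_SHIFT;
	int i;

	for (i = 0; i < n; i++)
		if (addr >= regs[i].start &&
		    addr < regs[i].start + regs[i].size)
			return INVALID_ENTRY;
	return IDENTITY(pfn);
}

int main(void)
{
	struct mem_region extra[] = { { 0x100000, 0x40000 } };

	/* pfn 0x100 lies in the extra region, pfn 0x10 does not. */
	return chk_extra_mem(extra, 1, 0x100) == INVALID_ENTRY &&
	       chk_extra_mem(extra, 1, 0x10) != INVALID_ENTRY ? 0 : 1;
}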
158 /* 170 /*
159 * Finds the next RAM pfn available in the E820 map after min_pfn. 171 * Finds the next RAM pfn available in the E820 map after min_pfn.
160 * This function updates min_pfn with the pfn found and returns 172 * This function updates min_pfn with the pfn found and returns
161 * the size of that range or zero if not found. 173 * the size of that range or zero if not found.
162 */ 174 */
163 static unsigned long __init xen_find_pfn_range( 175 static unsigned long __init xen_find_pfn_range(
164 const struct e820entry *list, size_t map_size, 176 const struct e820entry *list, size_t map_size,
165 unsigned long *min_pfn) 177 unsigned long *min_pfn)
166 { 178 {
167 const struct e820entry *entry; 179 const struct e820entry *entry;
168 unsigned int i; 180 unsigned int i;
169 unsigned long done = 0; 181 unsigned long done = 0;
170 182
171 for (i = 0, entry = list; i < map_size; i++, entry++) { 183 for (i = 0, entry = list; i < map_size; i++, entry++) {
172 unsigned long s_pfn; 184 unsigned long s_pfn;
173 unsigned long e_pfn; 185 unsigned long e_pfn;
174 186
175 if (entry->type != E820_RAM) 187 if (entry->type != E820_RAM)
176 continue; 188 continue;
177 189
178 e_pfn = PFN_DOWN(entry->addr + entry->size); 190 e_pfn = PFN_DOWN(entry->addr + entry->size);
179 191
180 /* We only care about E820 after this */ 192 /* We only care about E820 after this */
181 if (e_pfn < *min_pfn) 193 if (e_pfn < *min_pfn)
182 continue; 194 continue;
183 195
184 s_pfn = PFN_UP(entry->addr); 196 s_pfn = PFN_UP(entry->addr);
185 197
186 /* If min_pfn falls within the E820 entry, we want to start 198 /* If min_pfn falls within the E820 entry, we want to start
187 * at the min_pfn PFN. 199 * at the min_pfn PFN.
188 */ 200 */
189 if (s_pfn <= *min_pfn) { 201 if (s_pfn <= *min_pfn) {
190 done = e_pfn - *min_pfn; 202 done = e_pfn - *min_pfn;
191 } else { 203 } else {
192 done = e_pfn - s_pfn; 204 done = e_pfn - s_pfn;
193 *min_pfn = s_pfn; 205 *min_pfn = s_pfn;
194 } 206 }
195 break; 207 break;
196 } 208 }
197 209
198 return done; 210 return done;
199 } 211 }
200 212
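xen_find_pfn_range() above scans the E820 map for the next block of RAM pfns at or after *min_pfn, updates *min_pfn and returns the block length (or 0 when the map is exhausted). A standalone model of that contract (a minimal e820-like entry type; E820_RAM is represented by type 1):

#include <stdio.h>
#include <stddef.h>

struct e820_entry {
	unsigned long long addr;
	unsigned long long size;
	int type;			/* 1 == RAM, as in E820_RAM */
};

#define PAGE_SHIFT	12

/* Find the next RAM pfn range at or after *min_pfn; return its length. */
static unsigned long find_pfn_range(const struct e820_entry *map, size_t n,
				    unsigned long *min_pfn)
{
	size_t i;

	for (i = 0; i < n; i++) {
		unsigned long s_pfn, e_pfn;

		if (map[i].type != 1)
			continue;

		e_pfn = (map[i].addr + map[i].size) >> PAGE_SHIFT;
		if (e_pfn < *min_pfn)
			continue;

		s_pfn = (map[i].addr + (1UL << PAGE_SHIFT) - 1) >> PAGE_SHIFT;
		if (s_pfn <= *min_pfn)
			return e_pfn - *min_pfn;

		*min_pfn = s_pfn;
		return e_pfn - s_pfn;
	}
	return 0;
}

int main(void)
{
	struct e820_entry map[] = {
		{ 0x0,      0x9f000,    1 },
		{ 0x100000, 0x3ff00000, 1 },
	};
	unsigned long pfn = 0xa0;
	unsigned long n = find_pfn_range(map, 2, &pfn);

	printf("next RAM range: pfn %lx, %lu pages\n", pfn, n);
	return 0;
}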
213 static int __init xen_free_mfn(unsigned long mfn)
214 {
215 struct xen_memory_reservation reservation = {
216 .address_bits = 0,
217 .extent_order = 0,
218 .domid = DOMID_SELF
219 };
220
221 set_xen_guest_handle(reservation.extent_start, &mfn);
222 reservation.nr_extents = 1;
223
224 return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
225 }
226
201 /* 227 /*
202 * This releases a chunk of memory and then does the identity map. It's used as 228 * This releases a chunk of memory and then does the identity map. It's used
203 * as a fallback if the remapping fails. 229 * as a fallback if the remapping fails.
204 */ 230 */
205 static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, 231 static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
206 unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity, 232 unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity,
207 unsigned long *released) 233 unsigned long *released)
208 { 234 {
235 unsigned long len = 0;
236 unsigned long pfn, end;
237 int ret;
238
209 WARN_ON(start_pfn > end_pfn); 239 WARN_ON(start_pfn > end_pfn);
210 240
241 end = min(end_pfn, nr_pages);
242 for (pfn = start_pfn; pfn < end; pfn++) {
243 unsigned long mfn = pfn_to_mfn(pfn);
244
245 /* Make sure pfn exists to start with */
246 if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
247 continue;
248
249 ret = xen_free_mfn(mfn);
250 WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
251
252 if (ret == 1) {
253 if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
254 break;
255 len++;
256 } else
257 break;
258 }
259
211 /* Need to release pages first */ 260 /* Need to release pages first */
212 *released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true); 261 *released += len;
213 *identity += set_phys_range_identity(start_pfn, end_pfn); 262 *identity += set_phys_range_identity(start_pfn, end_pfn);
214 } 263 }
215 264
216 /* 265 /*
217 * Helper function to update both the p2m and m2p tables. 266 * Helper function to update the p2m and m2p tables and kernel mapping.
218 */ 267 */
219 static unsigned long __init xen_update_mem_tables(unsigned long pfn, 268 static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
220 unsigned long mfn)
221 { 269 {
222 struct mmu_update update = { 270 struct mmu_update update = {
223 .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, 271 .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
224 .val = pfn 272 .val = pfn
225 }; 273 };
226 274
227 /* Update p2m */ 275 /* Update p2m */
228 if (!early_set_phys_to_machine(pfn, mfn)) { 276 if (!set_phys_to_machine(pfn, mfn)) {
229 WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n", 277 WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
230 pfn, mfn); 278 pfn, mfn);
231 return false; 279 BUG();
232 } 280 }
233 281
234 /* Update m2p */ 282 /* Update m2p */
235 if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) { 283 if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
236 WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n", 284 WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
237 mfn, pfn); 285 mfn, pfn);
238 return false; 286 BUG();
239 } 287 }
240 288
241 return true; 289 /* Update kernel mapping, but not for highmem. */
290 if ((pfn << PAGE_SHIFT) >= __pa(high_memory))
291 return;
292
293 if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
294 mfn_pte(mfn, PAGE_KERNEL), 0)) {
295 WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
296 mfn, pfn);
297 BUG();
298 }
242 } 299 }
243 300
244 /* 301 /*
245 * This function updates the p2m and m2p tables with an identity map from 302 * This function updates the p2m and m2p tables with an identity map from
246 * start_pfn to start_pfn+size and remaps the underlying RAM of the original 303 * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the
247 * allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks 304 * original allocation at remap_pfn. The information needed for remapping is
248 * to not exhaust the reserved brk space. Doing it in properly aligned blocks 305 * saved in the memory itself to avoid the need for allocating buffers. The
249 * ensures we only allocate the minimum required leaf pages in the p2m table. It 306 * complete remap information is contained in a list of MFNs each containing
250 * copies the existing mfns from the p2m table under the 1:1 map, overwrites 307 * up to REMAP_SIZE MFNs and the start target PFN for doing the remap.
251 * them with the identity map and then updates the p2m and m2p tables with the 308 * This enables us to preserve the original mfn sequence while doing the
252 * remapped memory. 309 * remapping at a time when the memory management is capable of allocating
310 * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and
311 * its callers.
253 */ 312 */
254 static unsigned long __init xen_do_set_identity_and_remap_chunk( 313 static void __init xen_do_set_identity_and_remap_chunk(
255 unsigned long start_pfn, unsigned long size, unsigned long remap_pfn) 314 unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
256 { 315 {
316 unsigned long buf = (unsigned long)&xen_remap_buf;
317 unsigned long mfn_save, mfn;
257 unsigned long ident_pfn_iter, remap_pfn_iter; 318 unsigned long ident_pfn_iter, remap_pfn_iter;
258 unsigned long ident_start_pfn_align, remap_start_pfn_align; 319 unsigned long ident_end_pfn = start_pfn + size;
259 unsigned long ident_end_pfn_align, remap_end_pfn_align;
260 unsigned long ident_boundary_pfn, remap_boundary_pfn;
261 unsigned long ident_cnt = 0;
262 unsigned long remap_cnt = 0;
263 unsigned long left = size; 320 unsigned long left = size;
264 unsigned long mod; 321 unsigned long ident_cnt = 0;
265 int i; 322 unsigned int i, chunk;
266 323
267 WARN_ON(size == 0); 324 WARN_ON(size == 0);
268 325
269 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); 326 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
270 327
271 /* 328 mfn_save = virt_to_mfn(buf);
272 * Determine the proper alignment to remap memory in P2M_PER_PAGE sized
273 * blocks. We need to keep track of both the existing pfn mapping and
274 * the new pfn remapping.
275 */
276 mod = start_pfn % P2M_PER_PAGE;
277 ident_start_pfn_align =
278 mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn;
279 mod = remap_pfn % P2M_PER_PAGE;
280 remap_start_pfn_align =
281 mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn;
282 mod = (start_pfn + size) % P2M_PER_PAGE;
283 ident_end_pfn_align = start_pfn + size - mod;
284 mod = (remap_pfn + size) % P2M_PER_PAGE;
285 remap_end_pfn_align = remap_pfn + size - mod;
286 329
287 /* Iterate over each p2m leaf node in each range */ 330 for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
288 for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align; 331 ident_pfn_iter < ident_end_pfn;
289 ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align; 332 ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) {
290 ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) { 333 chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE;
291 /* Check we aren't past the end */
292 BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size);
293 BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size);
294 334
295 /* Save p2m mappings */ 335 /* Map first pfn to xen_remap_buf */
296 for (i = 0; i < P2M_PER_PAGE; i++) 336 mfn = pfn_to_mfn(ident_pfn_iter);
297 xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i); 337 set_pte_mfn(buf, mfn, PAGE_KERNEL);
298 338
299 /* Set identity map which will free a p2m leaf */ 339 /* Save mapping information in page */
300 ident_cnt += set_phys_range_identity(ident_pfn_iter, 340 xen_remap_buf.next_area_mfn = xen_remap_mfn;
301 ident_pfn_iter + P2M_PER_PAGE); 341 xen_remap_buf.target_pfn = remap_pfn_iter;
342 xen_remap_buf.size = chunk;
343 for (i = 0; i < chunk; i++)
344 xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i);
302 345
303 #ifdef DEBUG 346 /* Put remap buf into list. */
304 /* Helps verify a p2m leaf has been freed */ 347 xen_remap_mfn = mfn;
305 for (i = 0; i < P2M_PER_PAGE; i++) {
306 unsigned int pfn = ident_pfn_iter + i;
307 BUG_ON(pfn_to_mfn(pfn) != pfn);
308 }
309 #endif
310 /* Now remap memory */
311 for (i = 0; i < P2M_PER_PAGE; i++) {
312 unsigned long mfn = xen_remap_buf[i];
313 348
314 /* This will use the p2m leaf freed above */ 349 /* Set identity map */
315 if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) { 350 ident_cnt += set_phys_range_identity(ident_pfn_iter,
316 WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n", 351 ident_pfn_iter + chunk);
317 remap_pfn_iter + i, mfn);
318 return 0;
319 }
320 352
321 remap_cnt++; 353 left -= chunk;
322 }
323
324 left -= P2M_PER_PAGE;
325 } 354 }
326 355
327 /* Max boundary space possible */ 356 /* Restore old xen_remap_buf mapping */
328 BUG_ON(left > (P2M_PER_PAGE - 1) * 2); 357 set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
329
330 /* Now handle the boundary conditions */
331 ident_boundary_pfn = start_pfn;
332 remap_boundary_pfn = remap_pfn;
333 for (i = 0; i < left; i++) {
334 unsigned long mfn;
335
336 /* These two checks move from the start to end boundaries */
337 if (ident_boundary_pfn == ident_start_pfn_align)
338 ident_boundary_pfn = ident_pfn_iter;
339 if (remap_boundary_pfn == remap_start_pfn_align)
340 remap_boundary_pfn = remap_pfn_iter;
341
342 /* Check we aren't past the end */
343 BUG_ON(ident_boundary_pfn >= start_pfn + size);
344 BUG_ON(remap_boundary_pfn >= remap_pfn + size);
345
346 mfn = pfn_to_mfn(ident_boundary_pfn);
347
348 if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) {
349 WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
350 remap_pfn_iter + i, mfn);
351 return 0;
352 }
353 remap_cnt++;
354
355 ident_boundary_pfn++;
356 remap_boundary_pfn++;
357 }
358
359 /* Finish up the identity map */
360 if (ident_start_pfn_align >= ident_end_pfn_align) {
361 /*
362 * In this case we have an identity range which does not span an
363 * aligned block so everything needs to be identity mapped here.
364 * If we didn't check this we might remap too many pages since
365 * the align boundaries are not meaningful in this case.
366 */
367 ident_cnt += set_phys_range_identity(start_pfn,
368 start_pfn + size);
369 } else {
370 /* Remapped above so check each end of the chunk */
371 if (start_pfn < ident_start_pfn_align)
372 ident_cnt += set_phys_range_identity(start_pfn,
373 ident_start_pfn_align);
374 if (start_pfn + size > ident_pfn_iter)
375 ident_cnt += set_phys_range_identity(ident_pfn_iter,
376 start_pfn + size);
377 }
378
379 BUG_ON(ident_cnt != size);
380 BUG_ON(remap_cnt != size);
381
382 return size;
383 } 358 }
384 359
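The rewritten xen_do_set_identity_and_remap_chunk() above no longer remaps immediately; it records the remap work inside the memory that will later be remapped: each chunk's first page holds the next chunk's mfn, the target pfn and up to REMAP_SIZE source mfns. A userspace model of building that list (heap chunks linked by pointers stand in for pages linked by mfn, and pfn_to_mfn() is faked):

#include <stdlib.h>

#define REMAP_CHUNK	16	/* REMAP_SIZE is P2M_PER_PAGE - 3 in the kernel */

/* The remap metadata lives in the memory being remapped, so setup needs no
 * extra allocations; here ordinary heap chunks model those pages. */
struct remap_chunk {
	struct remap_chunk *next_area;	/* next_area_mfn in the kernel */
	unsigned long target_pfn;
	unsigned long size;
	unsigned long mfns[REMAP_CHUNK];
};

static struct remap_chunk *remap_list;

static int queue_remap(unsigned long start_pfn, unsigned long remap_pfn,
		       unsigned long npages)
{
	while (npages) {
		unsigned long chunk = npages < REMAP_CHUNK ? npages : REMAP_CHUNK;
		struct remap_chunk *c = malloc(sizeof(*c));
		unsigned long i;

		if (!c)
			return 0;

		c->target_pfn = remap_pfn;
		c->size = chunk;
		for (i = 0; i < chunk; i++)
			c->mfns[i] = start_pfn + i;	/* stands in for pfn_to_mfn() */

		c->next_area = remap_list;		/* push onto the list */
		remap_list = c;

		start_pfn += chunk;
		remap_pfn += chunk;
		npages -= chunk;
	}
	return 1;
}

int main(void)
{
	return queue_remap(0x8000, 0x20000, 100) ? 0 : 1;
}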
385 /* 360 /*
386 * This function takes a contiguous pfn range that needs to be identity mapped 361 * This function takes a contiguous pfn range that needs to be identity mapped
387 * and: 362 * and:
388 * 363 *
389 * 1) Finds a new range of pfns to use to remap based on E820 and remap_pfn. 364 * 1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
390 * 2) Calls the do_ function to actually do the mapping/remapping work. 365 * 2) Calls the do_ function to actually do the mapping/remapping work.
391 * 366 *
392 * The goal is to not allocate additional memory but to remap the existing 367 * The goal is to not allocate additional memory but to remap the existing
393 * pages. In the case of an error the underlying memory is simply released back 368 * pages. In the case of an error the underlying memory is simply released back
394 * to Xen and not remapped. 369 * to Xen and not remapped.
395 */ 370 */
396 static unsigned long __init xen_set_identity_and_remap_chunk( 371 static unsigned long __init xen_set_identity_and_remap_chunk(
397 const struct e820entry *list, size_t map_size, unsigned long start_pfn, 372 const struct e820entry *list, size_t map_size, unsigned long start_pfn,
398 unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, 373 unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
399 unsigned long *identity, unsigned long *remapped, 374 unsigned long *identity, unsigned long *released)
400 unsigned long *released)
401 { 375 {
402 unsigned long pfn; 376 unsigned long pfn;
403 unsigned long i = 0; 377 unsigned long i = 0;
404 unsigned long n = end_pfn - start_pfn; 378 unsigned long n = end_pfn - start_pfn;
405 379
406 while (i < n) { 380 while (i < n) {
407 unsigned long cur_pfn = start_pfn + i; 381 unsigned long cur_pfn = start_pfn + i;
408 unsigned long left = n - i; 382 unsigned long left = n - i;
409 unsigned long size = left; 383 unsigned long size = left;
410 unsigned long remap_range_size; 384 unsigned long remap_range_size;
411 385
412 /* Do not remap pages beyond the current allocation */ 386 /* Do not remap pages beyond the current allocation */
413 if (cur_pfn >= nr_pages) { 387 if (cur_pfn >= nr_pages) {
414 /* Identity map remaining pages */ 388 /* Identity map remaining pages */
415 *identity += set_phys_range_identity(cur_pfn, 389 *identity += set_phys_range_identity(cur_pfn,
416 cur_pfn + size); 390 cur_pfn + size);
417 break; 391 break;
418 } 392 }
419 if (cur_pfn + size > nr_pages) 393 if (cur_pfn + size > nr_pages)
420 size = nr_pages - cur_pfn; 394 size = nr_pages - cur_pfn;
421 395
422 remap_range_size = xen_find_pfn_range(list, map_size, 396 remap_range_size = xen_find_pfn_range(list, map_size,
423 &remap_pfn); 397 &remap_pfn);
424 if (!remap_range_size) { 398 if (!remap_range_size) {
425 pr_warning("Unable to find available pfn range, not remapping identity pages\n"); 399 pr_warning("Unable to find available pfn range, not remapping identity pages\n");
426 xen_set_identity_and_release_chunk(cur_pfn, 400 xen_set_identity_and_release_chunk(cur_pfn,
427 cur_pfn + left, nr_pages, identity, released); 401 cur_pfn + left, nr_pages, identity, released);
428 break; 402 break;
429 } 403 }
430 /* Adjust size to fit in current e820 RAM region */ 404 /* Adjust size to fit in current e820 RAM region */
431 if (size > remap_range_size) 405 if (size > remap_range_size)
432 size = remap_range_size; 406 size = remap_range_size;
433 407
434 if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) { 408 xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn);
435 WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n",
436 cur_pfn, size, remap_pfn);
437 xen_set_identity_and_release_chunk(cur_pfn,
438 cur_pfn + left, nr_pages, identity, released);
439 break;
440 }
441 409
442 /* Update variables to reflect new mappings. */ 410 /* Update variables to reflect new mappings. */
443 i += size; 411 i += size;
444 remap_pfn += size; 412 remap_pfn += size;
445 *identity += size; 413 *identity += size;
446 *remapped += size;
447 } 414 }
448 415
449 /* 416 /*
450 * If the PFNs are currently mapped, the VA mapping also needs 417 * If the PFNs are currently mapped, the VA mapping also needs
451 * to be updated to be 1:1. 418 * to be updated to be 1:1.
452 */ 419 */
453 for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) 420 for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
454 (void)HYPERVISOR_update_va_mapping( 421 (void)HYPERVISOR_update_va_mapping(
455 (unsigned long)__va(pfn << PAGE_SHIFT), 422 (unsigned long)__va(pfn << PAGE_SHIFT),
456 mfn_pte(pfn, PAGE_KERNEL_IO), 0); 423 mfn_pte(pfn, PAGE_KERNEL_IO), 0);
457 424
458 return remap_pfn; 425 return remap_pfn;
459 } 426 }
460 427
461 static unsigned long __init xen_set_identity_and_remap( 428 static void __init xen_set_identity_and_remap(
462 const struct e820entry *list, size_t map_size, unsigned long nr_pages, 429 const struct e820entry *list, size_t map_size, unsigned long nr_pages,
463 unsigned long *released) 430 unsigned long *released)
464 { 431 {
465 phys_addr_t start = 0; 432 phys_addr_t start = 0;
466 unsigned long identity = 0; 433 unsigned long identity = 0;
467 unsigned long remapped = 0;
468 unsigned long last_pfn = nr_pages; 434 unsigned long last_pfn = nr_pages;
469 const struct e820entry *entry; 435 const struct e820entry *entry;
470 unsigned long num_released = 0; 436 unsigned long num_released = 0;
471 int i; 437 int i;
472 438
473 /* 439 /*
474 * Combine non-RAM regions and gaps until a RAM region (or the 440 * Combine non-RAM regions and gaps until a RAM region (or the
475 * end of the map) is reached, then set the 1:1 map and 441 * end of the map) is reached, then set the 1:1 map and
476 * remap the memory in those non-RAM regions. 442 * remap the memory in those non-RAM regions.
477 * 443 *
478 * The combined non-RAM regions are rounded to a whole number 444 * The combined non-RAM regions are rounded to a whole number
479 * of pages so any partial pages are accessible via the 1:1 445 * of pages so any partial pages are accessible via the 1:1
480 * mapping. This is needed for some BIOSes that put (for 446 * mapping. This is needed for some BIOSes that put (for
481 * example) the DMI tables in a reserved region that begins on 447 * example) the DMI tables in a reserved region that begins on
482 * a non-page boundary. 448 * a non-page boundary.
483 */ 449 */
484 for (i = 0, entry = list; i < map_size; i++, entry++) { 450 for (i = 0, entry = list; i < map_size; i++, entry++) {
485 phys_addr_t end = entry->addr + entry->size; 451 phys_addr_t end = entry->addr + entry->size;
486 if (entry->type == E820_RAM || i == map_size - 1) { 452 if (entry->type == E820_RAM || i == map_size - 1) {
487 unsigned long start_pfn = PFN_DOWN(start); 453 unsigned long start_pfn = PFN_DOWN(start);
488 unsigned long end_pfn = PFN_UP(end); 454 unsigned long end_pfn = PFN_UP(end);
489 455
490 if (entry->type == E820_RAM) 456 if (entry->type == E820_RAM)
491 end_pfn = PFN_UP(entry->addr); 457 end_pfn = PFN_UP(entry->addr);
492 458
493 if (start_pfn < end_pfn) 459 if (start_pfn < end_pfn)
494 last_pfn = xen_set_identity_and_remap_chunk( 460 last_pfn = xen_set_identity_and_remap_chunk(
495 list, map_size, start_pfn, 461 list, map_size, start_pfn,
496 end_pfn, nr_pages, last_pfn, 462 end_pfn, nr_pages, last_pfn,
497 &identity, &remapped, 463 &identity, &num_released);
498 &num_released);
499 start = end; 464 start = end;
500 } 465 }
501 } 466 }
502 467
503 *released = num_released; 468 *released = num_released;
504 469
505 pr_info("Set %ld page(s) to 1-1 mapping\n", identity); 470 pr_info("Set %ld page(s) to 1-1 mapping\n", identity);
506 pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped,
507 last_pfn);
508 pr_info("Released %ld page(s)\n", num_released); 471 pr_info("Released %ld page(s)\n", num_released);
472 }
509 473
510 return last_pfn; 474 /*
475 * Remap the memory prepared in xen_do_set_identity_and_remap_chunk().
476 * The remap information (which mfn is remapped to which pfn) is contained in the
477 * to-be-remapped memory itself in a linked list anchored at xen_remap_mfn.
478 * This scheme allows remapping the different chunks in arbitrary order while
479 * the resulting mapping will be independent of the order.
480 */
481 void __init xen_remap_memory(void)
482 {
483 unsigned long buf = (unsigned long)&xen_remap_buf;
484 unsigned long mfn_save, mfn, pfn;
485 unsigned long remapped = 0;
486 unsigned int i;
487 unsigned long pfn_s = ~0UL;
488 unsigned long len = 0;
489
490 mfn_save = virt_to_mfn(buf);
491
492 while (xen_remap_mfn != INVALID_P2M_ENTRY) {
493 /* Map the remap information */
494 set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL);
495
496 BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]);
497
498 pfn = xen_remap_buf.target_pfn;
499 for (i = 0; i < xen_remap_buf.size; i++) {
500 mfn = xen_remap_buf.mfns[i];
501 xen_update_mem_tables(pfn, mfn);
502 remapped++;
503 pfn++;
504 }
505 if (pfn_s == ~0UL || pfn == pfn_s) {
506 pfn_s = xen_remap_buf.target_pfn;
507 len += xen_remap_buf.size;
508 } else if (pfn_s + len == xen_remap_buf.target_pfn) {
509 len += xen_remap_buf.size;
510 } else {
511 xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
512 pfn_s = xen_remap_buf.target_pfn;
513 len = xen_remap_buf.size;
514 }
515
516 mfn = xen_remap_mfn;
517 xen_remap_mfn = xen_remap_buf.next_area_mfn;
518 }
519
520 if (pfn_s != ~0UL && len)
521 xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
522
523 set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
524
525 pr_info("Remapped %ld page(s)\n", remapped);
511 } 526 }
527
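xen_remap_memory() above walks that mfn-linked list, remaps each chunk, and coalesces consecutive target pfn ranges (pfn_s/len) so xen_del_extra_mem() is called once per contiguous range rather than once per chunk. The coalescing step, simplified to chunks arriving in ascending order, looks like this (del_range() is a stand-in for xen_del_extra_mem()):

#include <stdio.h>

static void del_range(unsigned long start, unsigned long len)
{
	printf("del_extra_mem: pfn %lu, %lu pages\n", start, len);
}

int main(void)
{
	/* target_pfn/size pairs as they would come off the remap list */
	unsigned long chunks[][2] = { {100, 16}, {116, 16}, {200, 8}, {208, 4} };
	unsigned long pfn_s = ~0UL, len = 0;
	unsigned int i;

	for (i = 0; i < sizeof(chunks) / sizeof(chunks[0]); i++) {
		unsigned long start = chunks[i][0], size = chunks[i][1];

		if (pfn_s == ~0UL) {			/* first chunk */
			pfn_s = start;
			len = size;
		} else if (pfn_s + len == start) {	/* extends current range */
			len += size;
		} else {				/* gap: flush and restart */
			del_range(pfn_s, len);
			pfn_s = start;
			len = size;
		}
	}
	if (pfn_s != ~0UL && len)
		del_range(pfn_s, len);
	return 0;
}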
512 static unsigned long __init xen_get_max_pages(void) 528 static unsigned long __init xen_get_max_pages(void)
513 { 529 {
514 unsigned long max_pages = MAX_DOMAIN_PAGES; 530 unsigned long max_pages = MAX_DOMAIN_PAGES;
515 domid_t domid = DOMID_SELF; 531 domid_t domid = DOMID_SELF;
516 int ret; 532 int ret;
517 533
518 /* 534 /*
519 * For the initial domain we use the maximum reservation as 535 * For the initial domain we use the maximum reservation as
520 * the maximum page. 536 * the maximum page.
521 * 537 *
522 * For guest domains the current maximum reservation reflects 538 * For guest domains the current maximum reservation reflects
523 * the current maximum rather than the static maximum. In this 539 * the current maximum rather than the static maximum. In this
524 * case the e820 map provided to us will cover the static 540 * case the e820 map provided to us will cover the static
525 * maximum region. 541 * maximum region.
526 */ 542 */
527 if (xen_initial_domain()) { 543 if (xen_initial_domain()) {
528 ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); 544 ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
529 if (ret > 0) 545 if (ret > 0)
530 max_pages = ret; 546 max_pages = ret;
531 } 547 }
532 548
533 return min(max_pages, MAX_DOMAIN_PAGES); 549 return min(max_pages, MAX_DOMAIN_PAGES);
534 } 550 }
535 551
536 static void xen_align_and_add_e820_region(u64 start, u64 size, int type) 552 static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
537 { 553 {
538 u64 end = start + size; 554 u64 end = start + size;
539 555
540 /* Align RAM regions to page boundaries. */ 556 /* Align RAM regions to page boundaries. */
541 if (type == E820_RAM) { 557 if (type == E820_RAM) {
542 start = PAGE_ALIGN(start); 558 start = PAGE_ALIGN(start);
543 end &= ~((u64)PAGE_SIZE - 1); 559 end &= ~((u64)PAGE_SIZE - 1);
544 } 560 }
545 561
546 e820_add_region(start, end - start, type); 562 e820_add_region(start, end - start, type);
547 } 563 }
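The effect of the rounding above is that RAM regions only ever shrink: the start is rounded up and the end rounded down to page boundaries, so partial pages at either edge are dropped rather than reported as usable RAM. A minimal userspace sketch of the same arithmetic (the SKETCH_* names are invented for illustration, 4 KiB pages assumed):

#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096ULL
/* Round up to the next page boundary, like the kernel's PAGE_ALIGN(). */
#define SKETCH_PAGE_ALIGN(x) (((x) + SKETCH_PAGE_SIZE - 1) & ~(SKETCH_PAGE_SIZE - 1))

int main(void)
{
	unsigned long long start = 0x1234, end = 0x9f00;

	/* A RAM region [0x1234, 0x9f00) shrinks to [0x2000, 0x9000):
	 * the partial pages at both ends are not added to the e820 map. */
	printf("[%#llx, %#llx) -> [%#llx, %#llx)\n",
	       start, end,
	       SKETCH_PAGE_ALIGN(start), end & ~(SKETCH_PAGE_SIZE - 1));
	return 0;
}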
548 564
549 void xen_ignore_unusable(struct e820entry *list, size_t map_size) 565 void xen_ignore_unusable(struct e820entry *list, size_t map_size)
550 { 566 {
551 struct e820entry *entry; 567 struct e820entry *entry;
552 unsigned int i; 568 unsigned int i;
553 569
554 for (i = 0, entry = list; i < map_size; i++, entry++) { 570 for (i = 0, entry = list; i < map_size; i++, entry++) {
555 if (entry->type == E820_UNUSABLE) 571 if (entry->type == E820_UNUSABLE)
556 entry->type = E820_RAM; 572 entry->type = E820_RAM;
557 } 573 }
558 } 574 }
559 575
560 /** 576 /**
561 * machine_specific_memory_setup - Hook for machine specific memory setup. 577 * machine_specific_memory_setup - Hook for machine specific memory setup.
562 **/ 578 **/
563 char * __init xen_memory_setup(void) 579 char * __init xen_memory_setup(void)
564 { 580 {
565 static struct e820entry map[E820MAX] __initdata; 581 static struct e820entry map[E820MAX] __initdata;
566 582
567 unsigned long max_pfn = xen_start_info->nr_pages; 583 unsigned long max_pfn = xen_start_info->nr_pages;
568 unsigned long long mem_end; 584 unsigned long long mem_end;
569 int rc; 585 int rc;
570 struct xen_memory_map memmap; 586 struct xen_memory_map memmap;
571 unsigned long max_pages; 587 unsigned long max_pages;
572 unsigned long last_pfn = 0;
573 unsigned long extra_pages = 0; 588 unsigned long extra_pages = 0;
574 int i; 589 int i;
575 int op; 590 int op;
576 591
577 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); 592 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
578 mem_end = PFN_PHYS(max_pfn); 593 mem_end = PFN_PHYS(max_pfn);
579 594
580 memmap.nr_entries = E820MAX; 595 memmap.nr_entries = E820MAX;
581 set_xen_guest_handle(memmap.buffer, map); 596 set_xen_guest_handle(memmap.buffer, map);
582 597
583 op = xen_initial_domain() ? 598 op = xen_initial_domain() ?
584 XENMEM_machine_memory_map : 599 XENMEM_machine_memory_map :
585 XENMEM_memory_map; 600 XENMEM_memory_map;
586 rc = HYPERVISOR_memory_op(op, &memmap); 601 rc = HYPERVISOR_memory_op(op, &memmap);
587 if (rc == -ENOSYS) { 602 if (rc == -ENOSYS) {
588 BUG_ON(xen_initial_domain()); 603 BUG_ON(xen_initial_domain());
589 memmap.nr_entries = 1; 604 memmap.nr_entries = 1;
590 map[0].addr = 0ULL; 605 map[0].addr = 0ULL;
591 map[0].size = mem_end; 606 map[0].size = mem_end;
592 /* 8MB slack (to balance backend allocations). */ 607 /* 8MB slack (to balance backend allocations). */
593 map[0].size += 8ULL << 20; 608 map[0].size += 8ULL << 20;
594 map[0].type = E820_RAM; 609 map[0].type = E820_RAM;
595 rc = 0; 610 rc = 0;
596 } 611 }
597 BUG_ON(rc); 612 BUG_ON(rc);
598 BUG_ON(memmap.nr_entries == 0); 613 BUG_ON(memmap.nr_entries == 0);
599 614
600 /* 615 /*
601 * Xen won't allow a 1:1 mapping to be created to UNUSABLE 616 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
602 * regions, so if we're using the machine memory map leave the 617 * regions, so if we're using the machine memory map leave the
603 * region as RAM as it is in the pseudo-physical map. 618 * region as RAM as it is in the pseudo-physical map.
604 * 619 *
605 * UNUSABLE regions in domUs are not handled and will need 620 * UNUSABLE regions in domUs are not handled and will need
606 * a patch in the future. 621 * a patch in the future.
607 */ 622 */
608 if (xen_initial_domain()) 623 if (xen_initial_domain())
609 xen_ignore_unusable(map, memmap.nr_entries); 624 xen_ignore_unusable(map, memmap.nr_entries);
610 625
611 /* Make sure the Xen-supplied memory map is well-ordered. */ 626 /* Make sure the Xen-supplied memory map is well-ordered. */
612 sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); 627 sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
613 628
614 max_pages = xen_get_max_pages(); 629 max_pages = xen_get_max_pages();
615 if (max_pages > max_pfn) 630 if (max_pages > max_pfn)
616 extra_pages += max_pages - max_pfn; 631 extra_pages += max_pages - max_pfn;
617 632
618 /* 633 /*
619 * Set identity map on non-RAM pages and remap the underlying RAM. 634 * Set identity map on non-RAM pages and prepare remapping the
635 * underlying RAM.
620 */ 636 */
621 last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, 637 xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
622 &xen_released_pages); 638 &xen_released_pages);
623 639
624 extra_pages += xen_released_pages; 640 extra_pages += xen_released_pages;
625 641
626 if (last_pfn > max_pfn) {
627 max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
628 mem_end = PFN_PHYS(max_pfn);
629 }
630 /* 642 /*
631  * Clamp the amount of extra memory to an EXTRA_MEM_RATIO 643  * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
632  * factor of the base size. On non-highmem systems, the base 644  * factor of the base size. On non-highmem systems, the base
633 * size is the full initial memory allocation; on highmem it 645 * size is the full initial memory allocation; on highmem it
634 * is limited to the max size of lowmem, so that it doesn't 646 * is limited to the max size of lowmem, so that it doesn't
635 * get completely filled. 647 * get completely filled.
636 * 648 *
637 * In principle there could be a problem in lowmem systems if 649 * In principle there could be a problem in lowmem systems if
638 * the initial memory is also very large with respect to 650 * the initial memory is also very large with respect to
639 * lowmem, but we won't try to deal with that here. 651 * lowmem, but we won't try to deal with that here.
640 */ 652 */
641 extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), 653 extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
642 extra_pages); 654 extra_pages);
643 i = 0; 655 i = 0;
644 while (i < memmap.nr_entries) { 656 while (i < memmap.nr_entries) {
645 u64 addr = map[i].addr; 657 u64 addr = map[i].addr;
646 u64 size = map[i].size; 658 u64 size = map[i].size;
647 u32 type = map[i].type; 659 u32 type = map[i].type;
648 660
649 if (type == E820_RAM) { 661 if (type == E820_RAM) {
650 if (addr < mem_end) { 662 if (addr < mem_end) {
651 size = min(size, mem_end - addr); 663 size = min(size, mem_end - addr);
652 } else if (extra_pages) { 664 } else if (extra_pages) {
653 size = min(size, (u64)extra_pages * PAGE_SIZE); 665 size = min(size, (u64)extra_pages * PAGE_SIZE);
654 extra_pages -= size / PAGE_SIZE; 666 extra_pages -= size / PAGE_SIZE;
655 xen_add_extra_mem(addr, size); 667 xen_add_extra_mem(addr, size);
668 xen_max_p2m_pfn = PFN_DOWN(addr + size);
656 } else 669 } else
657 type = E820_UNUSABLE; 670 type = E820_UNUSABLE;
658 } 671 }
659 672
660 xen_align_and_add_e820_region(addr, size, type); 673 xen_align_and_add_e820_region(addr, size, type);
661 674
662 map[i].addr += size; 675 map[i].addr += size;
663 map[i].size -= size; 676 map[i].size -= size;
664 if (map[i].size == 0) 677 if (map[i].size == 0)
665 i++; 678 i++;
666 } 679 }
667 680
668 /* 681 /*
669 * Set the rest as identity mapped, in case PCI BARs are 682 * Set the rest as identity mapped, in case PCI BARs are
670 * located here. 683 * located here.
671 * 684 *
672 * PFNs above MAX_P2M_PFN are considered identity mapped as 685 * PFNs above MAX_P2M_PFN are considered identity mapped as
673 * well. 686 * well.
674 */ 687 */
675 set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); 688 set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
676 689
677 /* 690 /*
678 * In domU, the ISA region is normal, usable memory, but we 691 * In domU, the ISA region is normal, usable memory, but we
679 * reserve ISA memory anyway because too many things poke 692 * reserve ISA memory anyway because too many things poke
680 * about in there. 693 * about in there.
681 */ 694 */
682 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, 695 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
683 E820_RESERVED); 696 E820_RESERVED);
684 697
685 /* 698 /*
686 * Reserve Xen bits: 699 * Reserve Xen bits:
687 * - mfn_list 700 * - mfn_list
688 * - xen_start_info 701 * - xen_start_info
689 * See comment above "struct start_info" in <xen/interface/xen.h> 702 * See comment above "struct start_info" in <xen/interface/xen.h>
690  * We tried to make the memblock_reserve more selective so 703  * We tried to make the memblock_reserve more selective so
691  * that it would be clear what region is reserved. Sadly we ran 704  * that it would be clear what region is reserved. Sadly we ran
692  * into the problem wherein on a 64-bit hypervisor with a 32-bit 705  * into the problem wherein on a 64-bit hypervisor with a 32-bit
693  * initial domain, the pt_base has the cr3 value which is not 706  * initial domain, the pt_base has the cr3 value which is not
694  * necessarily where the pagetable starts! As Jan put it: " 707  * necessarily where the pagetable starts! As Jan put it: "
695 * Actually, the adjustment turns out to be correct: The page 708 * Actually, the adjustment turns out to be correct: The page
696 * tables for a 32-on-64 dom0 get allocated in the order "first L1", 709 * tables for a 32-on-64 dom0 get allocated in the order "first L1",
697 * "first L2", "first L3", so the offset to the page table base is 710 * "first L2", "first L3", so the offset to the page table base is
698 * indeed 2. When reading xen/include/public/xen.h's comment 711 * indeed 2. When reading xen/include/public/xen.h's comment
699 * very strictly, this is not a violation (since there nothing is said 712 * very strictly, this is not a violation (since there nothing is said
700 * that the first thing in the page table space is pointed to by 713 * that the first thing in the page table space is pointed to by
701 * pt_base; I admit that this seems to be implied though, namely 714 * pt_base; I admit that this seems to be implied though, namely
702 * do I think that it is implied that the page table space is the 715 * do I think that it is implied that the page table space is the
703 * range [pt_base, pt_base + nt_pt_frames), whereas that 716 * range [pt_base, pt_base + nt_pt_frames), whereas that
704 * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), 717 * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
705 * which - without a priori knowledge - the kernel would have 718 * which - without a priori knowledge - the kernel would have
706  * difficulty to figure out)." - so let's just fall back to the 719  * difficulty to figure out)." - so let's just fall back to the
707 * easy way and reserve the whole region. 720 * easy way and reserve the whole region.
708 */ 721 */
709 memblock_reserve(__pa(xen_start_info->mfn_list), 722 memblock_reserve(__pa(xen_start_info->mfn_list),
710 xen_start_info->pt_base - xen_start_info->mfn_list); 723 xen_start_info->pt_base - xen_start_info->mfn_list);
711 724
712 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 725 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
713 726
714 return "Xen"; 727 return "Xen";
715 } 728 }
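For a concrete feel of the extra-memory clamp inside xen_memory_setup() above: setup.c defines EXTRA_MEM_RATIO as 10, so a domain can gain at most ten times its base allocation in extra pages (further capped by PFN_DOWN(MAXMEM), which this sketch ignores). The snippet below is a hedged userspace illustration with invented names and figures, not kernel code.

#include <stdio.h>

/* Illustrative only; EXTRA_MEM_RATIO is 10 in setup.c and the MAXMEM
 * cap on the base size is left out to keep the sketch short. */
#define SKETCH_EXTRA_MEM_RATIO 10UL

static unsigned long clamp_extra_pages(unsigned long max_pfn,
				       unsigned long extra_pages)
{
	unsigned long limit = SKETCH_EXTRA_MEM_RATIO * max_pfn;

	return extra_pages < limit ? extra_pages : limit;
}

int main(void)
{
	/* A 256 MB domain (65536 pages) with 4 GB worth of candidate
	 * extra pages is clamped to 10x its base allocation. */
	printf("%lu pages\n", clamp_extra_pages(65536UL, 1048576UL));
	return 0;
}

Running it prints 655360 pages: the 4 GB of candidate extra pages are clamped to 2.5 GB for this 256 MB domain.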
716 729
717 /* 730 /*
718 * Machine specific memory setup for auto-translated guests. 731 * Machine specific memory setup for auto-translated guests.
719 */ 732 */
720 char * __init xen_auto_xlated_memory_setup(void) 733 char * __init xen_auto_xlated_memory_setup(void)
721 { 734 {
722 static struct e820entry map[E820MAX] __initdata; 735 static struct e820entry map[E820MAX] __initdata;
723 736
arch/x86/xen/xen-ops.h
1 #ifndef XEN_OPS_H 1 #ifndef XEN_OPS_H
2 #define XEN_OPS_H 2 #define XEN_OPS_H
3 3
4 #include <linux/init.h> 4 #include <linux/init.h>
5 #include <linux/clocksource.h> 5 #include <linux/clocksource.h>
6 #include <linux/irqreturn.h> 6 #include <linux/irqreturn.h>
7 #include <xen/xen-ops.h> 7 #include <xen/xen-ops.h>
8 8
9 /* These are code, but not functions. Defined in entry.S */ 9 /* These are code, but not functions. Defined in entry.S */
10 extern const char xen_hypervisor_callback[]; 10 extern const char xen_hypervisor_callback[];
11 extern const char xen_failsafe_callback[]; 11 extern const char xen_failsafe_callback[];
12 12
13 extern void *xen_initial_gdt; 13 extern void *xen_initial_gdt;
14 14
15 struct trap_info; 15 struct trap_info;
16 void xen_copy_trap_info(struct trap_info *traps); 16 void xen_copy_trap_info(struct trap_info *traps);
17 17
18 DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info); 18 DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
19 DECLARE_PER_CPU(unsigned long, xen_cr3); 19 DECLARE_PER_CPU(unsigned long, xen_cr3);
20 DECLARE_PER_CPU(unsigned long, xen_current_cr3); 20 DECLARE_PER_CPU(unsigned long, xen_current_cr3);
21 21
22 extern struct start_info *xen_start_info; 22 extern struct start_info *xen_start_info;
23 extern struct shared_info xen_dummy_shared_info; 23 extern struct shared_info xen_dummy_shared_info;
24 extern struct shared_info *HYPERVISOR_shared_info; 24 extern struct shared_info *HYPERVISOR_shared_info;
25 25
26 void xen_setup_mfn_list_list(void); 26 void xen_setup_mfn_list_list(void);
27 void xen_setup_shared_info(void); 27 void xen_setup_shared_info(void);
28 void xen_build_mfn_list_list(void); 28 void xen_build_mfn_list_list(void);
29 void xen_setup_machphys_mapping(void); 29 void xen_setup_machphys_mapping(void);
30 void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); 30 void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
31 void xen_reserve_top(void); 31 void xen_reserve_top(void);
32 extern unsigned long xen_max_p2m_pfn;
33 32
34 void xen_mm_pin_all(void); 33 void xen_mm_pin_all(void);
35 void xen_mm_unpin_all(void); 34 void xen_mm_unpin_all(void);
36 35
36 unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
37 void __init xen_inv_extra_mem(void);
38 void __init xen_remap_memory(void);
37 char * __init xen_memory_setup(void); 39 char * __init xen_memory_setup(void);
38 char * xen_auto_xlated_memory_setup(void); 40 char * xen_auto_xlated_memory_setup(void);
39 void __init xen_arch_setup(void); 41 void __init xen_arch_setup(void);
40 void xen_enable_sysenter(void); 42 void xen_enable_sysenter(void);
41 void xen_enable_syscall(void); 43 void xen_enable_syscall(void);
42 void xen_vcpu_restore(void); 44 void xen_vcpu_restore(void);
43 45
44 void xen_callback_vector(void); 46 void xen_callback_vector(void);
45 void xen_hvm_init_shared_info(void); 47 void xen_hvm_init_shared_info(void);
46 void xen_unplug_emulated_devices(void); 48 void xen_unplug_emulated_devices(void);
47 49
48 void __init xen_build_dynamic_phys_to_machine(void); 50 void __init xen_build_dynamic_phys_to_machine(void);
49 unsigned long __init xen_revector_p2m_tree(void); 51 void __init xen_vmalloc_p2m_tree(void);
50 52
51 void xen_init_irq_ops(void); 53 void xen_init_irq_ops(void);
52 void xen_setup_timer(int cpu); 54 void xen_setup_timer(int cpu);
53 void xen_setup_runstate_info(int cpu); 55 void xen_setup_runstate_info(int cpu);
54 void xen_teardown_timer(int cpu); 56 void xen_teardown_timer(int cpu);
55 cycle_t xen_clocksource_read(void); 57 cycle_t xen_clocksource_read(void);
56 void xen_setup_cpu_clockevents(void); 58 void xen_setup_cpu_clockevents(void);
57 void __init xen_init_time_ops(void); 59 void __init xen_init_time_ops(void);
58 void __init xen_hvm_init_time_ops(void); 60 void __init xen_hvm_init_time_ops(void);
59 61
60 irqreturn_t xen_debug_interrupt(int irq, void *dev_id); 62 irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
61 63
62 bool xen_vcpu_stolen(int vcpu); 64 bool xen_vcpu_stolen(int vcpu);
63 65
64 void xen_setup_vcpu_info_placement(void); 66 void xen_setup_vcpu_info_placement(void);
65 67
66 #ifdef CONFIG_SMP 68 #ifdef CONFIG_SMP
67 void xen_smp_init(void); 69 void xen_smp_init(void);
68 void __init xen_hvm_smp_init(void); 70 void __init xen_hvm_smp_init(void);
69 71
70 extern cpumask_var_t xen_cpu_initialized_map; 72 extern cpumask_var_t xen_cpu_initialized_map;
71 #else 73 #else
72 static inline void xen_smp_init(void) {} 74 static inline void xen_smp_init(void) {}
73 static inline void xen_hvm_smp_init(void) {} 75 static inline void xen_hvm_smp_init(void) {}
74 #endif 76 #endif
75 77
76 #ifdef CONFIG_PARAVIRT_SPINLOCKS 78 #ifdef CONFIG_PARAVIRT_SPINLOCKS
77 void __init xen_init_spinlocks(void); 79 void __init xen_init_spinlocks(void);
78 void xen_init_lock_cpu(int cpu); 80 void xen_init_lock_cpu(int cpu);
79 void xen_uninit_lock_cpu(int cpu); 81 void xen_uninit_lock_cpu(int cpu);
80 #else 82 #else
81 static inline void xen_init_spinlocks(void) 83 static inline void xen_init_spinlocks(void)
82 { 84 {
83 } 85 }
84 static inline void xen_init_lock_cpu(int cpu) 86 static inline void xen_init_lock_cpu(int cpu)
85 { 87 {
86 } 88 }
87 static inline void xen_uninit_lock_cpu(int cpu) 89 static inline void xen_uninit_lock_cpu(int cpu)
88 { 90 {
89 } 91 }
90 #endif 92 #endif
91 93
92 struct dom0_vga_console_info; 94 struct dom0_vga_console_info;
93 95
94 #ifdef CONFIG_XEN_DOM0 96 #ifdef CONFIG_XEN_DOM0
95 void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); 97 void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
96 void __init xen_init_apic(void); 98 void __init xen_init_apic(void);
97 #else 99 #else
98 static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, 100 static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
99 size_t size) 101 size_t size)
100 { 102 {
101 } 103 }
102 static inline void __init xen_init_apic(void) 104 static inline void __init xen_init_apic(void)
103 { 105 {
104 } 106 }
105 #endif 107 #endif
106 108
107 #ifdef CONFIG_XEN_EFI 109 #ifdef CONFIG_XEN_EFI
108 extern void xen_efi_init(void); 110 extern void xen_efi_init(void);
109 #else 111 #else
110 static inline void __init xen_efi_init(void) 112 static inline void __init xen_efi_init(void)
111 { 113 {
112 } 114 }
113 #endif 115 #endif
114 116
115 /* Declare an asm function, along with symbols needed to make it 117 /* Declare an asm function, along with symbols needed to make it
116 inlineable */ 118 inlineable */
117 #define DECL_ASM(ret, name, ...) \ 119 #define DECL_ASM(ret, name, ...) \
118 __visible ret name(__VA_ARGS__); \ 120 __visible ret name(__VA_ARGS__); \
119 extern char name##_end[] __visible; \ 121 extern char name##_end[] __visible; \
120 extern char name##_reloc[] __visible 122 extern char name##_reloc[] __visible
121 123
122 DECL_ASM(void, xen_irq_enable_direct, void); 124 DECL_ASM(void, xen_irq_enable_direct, void);
123 DECL_ASM(void, xen_irq_disable_direct, void); 125 DECL_ASM(void, xen_irq_disable_direct, void);
124 DECL_ASM(unsigned long, xen_save_fl_direct, void); 126 DECL_ASM(unsigned long, xen_save_fl_direct, void);
125 DECL_ASM(void, xen_restore_fl_direct, unsigned long); 127 DECL_ASM(void, xen_restore_fl_direct, unsigned long);
126 128
127 /* These are not functions, and cannot be called normally */ 129 /* These are not functions, and cannot be called normally */
128 __visible void xen_iret(void); 130 __visible void xen_iret(void);
129 __visible void xen_sysexit(void); 131 __visible void xen_sysexit(void);
130 __visible void xen_sysret32(void); 132 __visible void xen_sysret32(void);
131 __visible void xen_sysret64(void); 133 __visible void xen_sysret64(void);
132 __visible void xen_adjust_exception_frame(void); 134 __visible void xen_adjust_exception_frame(void);
133 135
134 extern int xen_panic_handler_init(void); 136 extern int xen_panic_handler_init(void);
135 137
136 void xen_pvh_secondary_vcpu_init(int cpu); 138 void xen_pvh_secondary_vcpu_init(int cpu);
137 #endif /* XEN_OPS_H */ 139 #endif /* XEN_OPS_H */