Commit eb64c3c6cdb8fa8a4d324eb71a9033b62e150918

Authored by Linus Torvalds

Merge tag 'stable/for-linus-3.19-rc0b-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip

Pull additional xen update from David Vrabel:
 "Xen: additional features for 3.19-rc0

   - Linear p2m for x86 PV guests, which simplifies the p2m code,
     improves performance, and will allow for > 512 GB PV guests in the
     future.

  A last-minute, configuration-specific issue was discovered with this
  change, which is why it was not included in my previous pull request.
  This has now been fixed and tested"

* tag 'stable/for-linus-3.19-rc0b-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip:
  xen: switch to post-init routines in xen mmu.c earlier
  Revert "swiotlb-xen: pass dev_addr to swiotlb_tbl_unmap_single"
  xen: annotate xen_set_identity_and_remap_chunk() with __init
  xen: introduce helper functions to do safe read and write accesses
  xen: Speed up set_phys_to_machine() by using read-only mappings
  xen: switch to linear virtual mapped sparse p2m list
  xen: Hide get_phys_to_machine() to be able to tune common path
  x86: Introduce function to get pmd entry pointer
  xen: Delay invalidating extra memory
  xen: Delay m2p_override initialization
  xen: Delay remapping memory of pv-domain
  xen: use common page allocation function in p2m.c
  xen: Make functions static
  xen: fix some style issues in p2m.c
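
For orientation before the diff: the centerpiece of this pull is the switch to a
linear, virtually mapped p2m list, which turns the common pfn-to-mfn lookup into
a bounds check plus a single array read, with the slower helpers reserved for
holes and out-of-range pfns. Below is a minimal, self-contained sketch in plain
C of that lookup shape. The array, the sizes and slow_lookup() are illustrative
stand-ins only; the real implementation is __pfn_to_mfn() in the
arch/x86/include/asm/xen/page.h hunk further down.

/*
 * Userspace sketch of the linear p2m lookup idea (not the kernel code).
 * Fast path: index a virtually contiguous array by pfn.  Fall back to a
 * slower lookup only for holes or pfns beyond the mapped range, and treat
 * pfns beyond the maximum as identity-mapped.
 */
#include <stdio.h>

#define INVALID_P2M_ENTRY   (~0UL)
#define IDENTITY_FRAME_BIT  (1UL << (sizeof(unsigned long) * 8 - 2))
#define IDENTITY_FRAME(m)   ((m) | IDENTITY_FRAME_BIT)

static unsigned long p2m_table[8] = {          /* toy linear p2m array */
	100, 101, INVALID_P2M_ENTRY, 103, 104, 105, 106, 107
};
static unsigned long p2m_size = 8;             /* entries backed by the array */
static unsigned long max_p2m_pfn = 16;         /* pfns known to the slow path */

static unsigned long slow_lookup(unsigned long pfn)
{
	/* Stand-in for get_phys_to_machine(): the special-case path. */
	return INVALID_P2M_ENTRY;
}

static unsigned long pfn_to_mfn_linear(unsigned long pfn)
{
	unsigned long mfn;

	if (pfn < p2m_size)
		mfn = p2m_table[pfn];          /* common case: one array read */
	else if (pfn < max_p2m_pfn)
		return slow_lookup(pfn);       /* sparse tail of the table */
	else
		return IDENTITY_FRAME(pfn);    /* beyond the table: identity */

	if (mfn == INVALID_P2M_ENTRY)
		return slow_lookup(pfn);       /* hole in the linear array */

	return mfn;
}

int main(void)
{
	printf("pfn 1 -> mfn %lu\n", pfn_to_mfn_linear(1));                     /* 101 */
	printf("pfn 2 valid? %d\n", pfn_to_mfn_linear(2) != INVALID_P2M_ENTRY); /* 0 */
	printf("pfn 20 identity? %d\n",
	       pfn_to_mfn_linear(20) == IDENTITY_FRAME(20));                    /* 1 */
	return 0;
}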

Showing 7 changed files

arch/x86/include/asm/pgtable_types.h
1 #ifndef _ASM_X86_PGTABLE_DEFS_H 1 #ifndef _ASM_X86_PGTABLE_DEFS_H
2 #define _ASM_X86_PGTABLE_DEFS_H 2 #define _ASM_X86_PGTABLE_DEFS_H
3 3
4 #include <linux/const.h> 4 #include <linux/const.h>
5 #include <asm/page_types.h> 5 #include <asm/page_types.h>
6 6
7 #define FIRST_USER_ADDRESS 0 7 #define FIRST_USER_ADDRESS 0
8 8
9 #define _PAGE_BIT_PRESENT 0 /* is present */ 9 #define _PAGE_BIT_PRESENT 0 /* is present */
10 #define _PAGE_BIT_RW 1 /* writeable */ 10 #define _PAGE_BIT_RW 1 /* writeable */
11 #define _PAGE_BIT_USER 2 /* userspace addressable */ 11 #define _PAGE_BIT_USER 2 /* userspace addressable */
12 #define _PAGE_BIT_PWT 3 /* page write through */ 12 #define _PAGE_BIT_PWT 3 /* page write through */
13 #define _PAGE_BIT_PCD 4 /* page cache disabled */ 13 #define _PAGE_BIT_PCD 4 /* page cache disabled */
14 #define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ 14 #define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
15 #define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ 15 #define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
16 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ 16 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
17 #define _PAGE_BIT_PAT 7 /* on 4KB pages */ 17 #define _PAGE_BIT_PAT 7 /* on 4KB pages */
18 #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ 18 #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
19 #define _PAGE_BIT_SOFTW1 9 /* available for programmer */ 19 #define _PAGE_BIT_SOFTW1 9 /* available for programmer */
20 #define _PAGE_BIT_SOFTW2 10 /* " */ 20 #define _PAGE_BIT_SOFTW2 10 /* " */
21 #define _PAGE_BIT_SOFTW3 11 /* " */ 21 #define _PAGE_BIT_SOFTW3 11 /* " */
22 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22 #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23 #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 23 #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
24 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 24 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
25 #define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ 25 #define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
26 #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ 26 #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
27 #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ 27 #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
28 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 28 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
29 29
30 /* 30 /*
31 * Swap offsets on configurations that allow automatic NUMA balancing use the 31 * Swap offsets on configurations that allow automatic NUMA balancing use the
32 * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from 32 * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
33 * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the 33 * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
34 * maximum possible swap space from 16TB to 8TB. 34 * maximum possible swap space from 16TB to 8TB.
35 */ 35 */
36 #define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) 36 #define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1)
37 37
38 /* If _PAGE_BIT_PRESENT is clear, we use these: */ 38 /* If _PAGE_BIT_PRESENT is clear, we use these: */
39 /* - if the user mapped it with PROT_NONE; pte_present gives true */ 39 /* - if the user mapped it with PROT_NONE; pte_present gives true */
40 #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL 40 #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
41 /* - set: nonlinear file mapping, saved PTE; unset:swap */ 41 /* - set: nonlinear file mapping, saved PTE; unset:swap */
42 #define _PAGE_BIT_FILE _PAGE_BIT_DIRTY 42 #define _PAGE_BIT_FILE _PAGE_BIT_DIRTY
43 43
44 #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) 44 #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
45 #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) 45 #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
46 #define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) 46 #define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
47 #define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) 47 #define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
48 #define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) 48 #define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
49 #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) 49 #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
50 #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) 50 #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
51 #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) 51 #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
52 #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) 52 #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
53 #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) 53 #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
54 #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) 54 #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
55 #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) 55 #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
56 #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 56 #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
57 #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 57 #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
58 #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 58 #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
59 #define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING) 59 #define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
60 #define __HAVE_ARCH_PTE_SPECIAL 60 #define __HAVE_ARCH_PTE_SPECIAL
61 61
62 #ifdef CONFIG_KMEMCHECK 62 #ifdef CONFIG_KMEMCHECK
63 #define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) 63 #define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
64 #else 64 #else
65 #define _PAGE_HIDDEN (_AT(pteval_t, 0)) 65 #define _PAGE_HIDDEN (_AT(pteval_t, 0))
66 #endif 66 #endif
67 67
68 /* 68 /*
69 * The same hidden bit is used by kmemcheck, but since kmemcheck 69 * The same hidden bit is used by kmemcheck, but since kmemcheck
70 * works on kernel pages while soft-dirty engine on user space, 70 * works on kernel pages while soft-dirty engine on user space,
71 * they do not conflict with each other. 71 * they do not conflict with each other.
72 */ 72 */
73 73
74 #ifdef CONFIG_MEM_SOFT_DIRTY 74 #ifdef CONFIG_MEM_SOFT_DIRTY
75 #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) 75 #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
76 #else 76 #else
77 #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) 77 #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0))
78 #endif 78 #endif
79 79
80 /* 80 /*
81 * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page 81 * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
82 * that is not present. The hinting fault gathers numa placement statistics 82 * that is not present. The hinting fault gathers numa placement statistics
83 * (see pte_numa()). The bit is always zero when the PTE is not present. 83 * (see pte_numa()). The bit is always zero when the PTE is not present.
84 * 84 *
85 * The bit picked must be always zero when the pmd is present and not 85 * The bit picked must be always zero when the pmd is present and not
86 * present, so that we don't lose information when we set it while 86 * present, so that we don't lose information when we set it while
87 * atomically clearing the present bit. 87 * atomically clearing the present bit.
88 */ 88 */
89 #ifdef CONFIG_NUMA_BALANCING 89 #ifdef CONFIG_NUMA_BALANCING
90 #define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) 90 #define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
91 #else 91 #else
92 #define _PAGE_NUMA (_AT(pteval_t, 0)) 92 #define _PAGE_NUMA (_AT(pteval_t, 0))
93 #endif 93 #endif
94 94
95 /* 95 /*
96 * Tracking soft dirty bit when a page goes to a swap is tricky. 96 * Tracking soft dirty bit when a page goes to a swap is tricky.
97 * We need a bit which can be stored in pte _and_ not conflict 97 * We need a bit which can be stored in pte _and_ not conflict
98 * with swap entry format. On x86 bits 6 and 7 are *not* involved 98 * with swap entry format. On x86 bits 6 and 7 are *not* involved
99 * into swap entry computation, but bit 6 is used for nonlinear 99 * into swap entry computation, but bit 6 is used for nonlinear
100 * file mapping, so we borrow bit 7 for soft dirty tracking. 100 * file mapping, so we borrow bit 7 for soft dirty tracking.
101 * 101 *
102 * Please note that this bit must be treated as swap dirty page 102 * Please note that this bit must be treated as swap dirty page
103 * mark if and only if the PTE has present bit clear! 103 * mark if and only if the PTE has present bit clear!
104 */ 104 */
105 #ifdef CONFIG_MEM_SOFT_DIRTY 105 #ifdef CONFIG_MEM_SOFT_DIRTY
106 #define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE 106 #define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE
107 #else 107 #else
108 #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) 108 #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0))
109 #endif 109 #endif
110 110
111 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 111 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
112 #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) 112 #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
113 #else 113 #else
114 #define _PAGE_NX (_AT(pteval_t, 0)) 114 #define _PAGE_NX (_AT(pteval_t, 0))
115 #endif 115 #endif
116 116
117 #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) 117 #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
118 #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) 118 #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
119 119
120 #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ 120 #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
121 _PAGE_ACCESSED | _PAGE_DIRTY) 121 _PAGE_ACCESSED | _PAGE_DIRTY)
122 #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ 122 #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
123 _PAGE_DIRTY) 123 _PAGE_DIRTY)
124 124
125 /* Set of bits not changed in pte_modify */ 125 /* Set of bits not changed in pte_modify */
126 #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ 126 #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
127 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ 127 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
128 _PAGE_SOFT_DIRTY | _PAGE_NUMA) 128 _PAGE_SOFT_DIRTY | _PAGE_NUMA)
129 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) 129 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
130 130
131 /* 131 /*
132 * The cache modes defined here are used to translate between pure SW usage 132 * The cache modes defined here are used to translate between pure SW usage
133 * and the HW defined cache mode bits and/or PAT entries. 133 * and the HW defined cache mode bits and/or PAT entries.
134 * 134 *
135 * The resulting bits for PWT, PCD and PAT should be chosen in a way 135 * The resulting bits for PWT, PCD and PAT should be chosen in a way
136 * to have the WB mode at index 0 (all bits clear). This is the default 136 * to have the WB mode at index 0 (all bits clear). This is the default
137 * right now and likely would break too much if changed. 137 * right now and likely would break too much if changed.
138 */ 138 */
139 #ifndef __ASSEMBLY__ 139 #ifndef __ASSEMBLY__
140 enum page_cache_mode { 140 enum page_cache_mode {
141 _PAGE_CACHE_MODE_WB = 0, 141 _PAGE_CACHE_MODE_WB = 0,
142 _PAGE_CACHE_MODE_WC = 1, 142 _PAGE_CACHE_MODE_WC = 1,
143 _PAGE_CACHE_MODE_UC_MINUS = 2, 143 _PAGE_CACHE_MODE_UC_MINUS = 2,
144 _PAGE_CACHE_MODE_UC = 3, 144 _PAGE_CACHE_MODE_UC = 3,
145 _PAGE_CACHE_MODE_WT = 4, 145 _PAGE_CACHE_MODE_WT = 4,
146 _PAGE_CACHE_MODE_WP = 5, 146 _PAGE_CACHE_MODE_WP = 5,
147 _PAGE_CACHE_MODE_NUM = 8 147 _PAGE_CACHE_MODE_NUM = 8
148 }; 148 };
149 #endif 149 #endif
150 150
151 #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) 151 #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
152 #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) 152 #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC))
153 153
154 #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) 154 #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
155 #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ 155 #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
156 _PAGE_ACCESSED | _PAGE_NX) 156 _PAGE_ACCESSED | _PAGE_NX)
157 157
158 #define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \ 158 #define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
159 _PAGE_USER | _PAGE_ACCESSED) 159 _PAGE_USER | _PAGE_ACCESSED)
160 #define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ 160 #define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
161 _PAGE_ACCESSED | _PAGE_NX) 161 _PAGE_ACCESSED | _PAGE_NX)
162 #define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ 162 #define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
163 _PAGE_ACCESSED) 163 _PAGE_ACCESSED)
164 #define PAGE_COPY PAGE_COPY_NOEXEC 164 #define PAGE_COPY PAGE_COPY_NOEXEC
165 #define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \ 165 #define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
166 _PAGE_ACCESSED | _PAGE_NX) 166 _PAGE_ACCESSED | _PAGE_NX)
167 #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ 167 #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
168 _PAGE_ACCESSED) 168 _PAGE_ACCESSED)
169 169
170 #define __PAGE_KERNEL_EXEC \ 170 #define __PAGE_KERNEL_EXEC \
171 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL) 171 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
172 #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) 172 #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
173 173
174 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) 174 #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
175 #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) 175 #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
176 #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_NOCACHE) 176 #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_NOCACHE)
177 #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) 177 #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
178 #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) 178 #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
179 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) 179 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
180 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) 180 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
181 181
182 #define __PAGE_KERNEL_IO (__PAGE_KERNEL) 182 #define __PAGE_KERNEL_IO (__PAGE_KERNEL)
183 #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) 183 #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE)
184 184
185 #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) 185 #define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
186 #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) 186 #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
187 #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) 187 #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
188 #define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) 188 #define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
189 #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) 189 #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
190 #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) 190 #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
191 #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) 191 #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
192 #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) 192 #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
193 #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) 193 #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR)
194 194
195 #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) 195 #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
196 #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) 196 #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
197 197
198 /* xwr */ 198 /* xwr */
199 #define __P000 PAGE_NONE 199 #define __P000 PAGE_NONE
200 #define __P001 PAGE_READONLY 200 #define __P001 PAGE_READONLY
201 #define __P010 PAGE_COPY 201 #define __P010 PAGE_COPY
202 #define __P011 PAGE_COPY 202 #define __P011 PAGE_COPY
203 #define __P100 PAGE_READONLY_EXEC 203 #define __P100 PAGE_READONLY_EXEC
204 #define __P101 PAGE_READONLY_EXEC 204 #define __P101 PAGE_READONLY_EXEC
205 #define __P110 PAGE_COPY_EXEC 205 #define __P110 PAGE_COPY_EXEC
206 #define __P111 PAGE_COPY_EXEC 206 #define __P111 PAGE_COPY_EXEC
207 207
208 #define __S000 PAGE_NONE 208 #define __S000 PAGE_NONE
209 #define __S001 PAGE_READONLY 209 #define __S001 PAGE_READONLY
210 #define __S010 PAGE_SHARED 210 #define __S010 PAGE_SHARED
211 #define __S011 PAGE_SHARED 211 #define __S011 PAGE_SHARED
212 #define __S100 PAGE_READONLY_EXEC 212 #define __S100 PAGE_READONLY_EXEC
213 #define __S101 PAGE_READONLY_EXEC 213 #define __S101 PAGE_READONLY_EXEC
214 #define __S110 PAGE_SHARED_EXEC 214 #define __S110 PAGE_SHARED_EXEC
215 #define __S111 PAGE_SHARED_EXEC 215 #define __S111 PAGE_SHARED_EXEC
216 216
217 /* 217 /*
218 * early identity mapping pte attrib macros. 218 * early identity mapping pte attrib macros.
219 */ 219 */
220 #ifdef CONFIG_X86_64 220 #ifdef CONFIG_X86_64
221 #define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC 221 #define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
222 #else 222 #else
223 #define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ 223 #define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
224 #define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */ 224 #define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */
225 #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ 225 #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
226 #endif 226 #endif
227 227
228 #ifdef CONFIG_X86_32 228 #ifdef CONFIG_X86_32
229 # include <asm/pgtable_32_types.h> 229 # include <asm/pgtable_32_types.h>
230 #else 230 #else
231 # include <asm/pgtable_64_types.h> 231 # include <asm/pgtable_64_types.h>
232 #endif 232 #endif
233 233
234 #ifndef __ASSEMBLY__ 234 #ifndef __ASSEMBLY__
235 235
236 #include <linux/types.h> 236 #include <linux/types.h>
237 237
238 /* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ 238 /* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
239 #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) 239 #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
240 240
241 /* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ 241 /* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
242 #define PTE_FLAGS_MASK (~PTE_PFN_MASK) 242 #define PTE_FLAGS_MASK (~PTE_PFN_MASK)
243 243
244 typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; 244 typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
245 245
246 typedef struct { pgdval_t pgd; } pgd_t; 246 typedef struct { pgdval_t pgd; } pgd_t;
247 247
248 static inline pgd_t native_make_pgd(pgdval_t val) 248 static inline pgd_t native_make_pgd(pgdval_t val)
249 { 249 {
250 return (pgd_t) { val }; 250 return (pgd_t) { val };
251 } 251 }
252 252
253 static inline pgdval_t native_pgd_val(pgd_t pgd) 253 static inline pgdval_t native_pgd_val(pgd_t pgd)
254 { 254 {
255 return pgd.pgd; 255 return pgd.pgd;
256 } 256 }
257 257
258 static inline pgdval_t pgd_flags(pgd_t pgd) 258 static inline pgdval_t pgd_flags(pgd_t pgd)
259 { 259 {
260 return native_pgd_val(pgd) & PTE_FLAGS_MASK; 260 return native_pgd_val(pgd) & PTE_FLAGS_MASK;
261 } 261 }
262 262
263 #if PAGETABLE_LEVELS > 3 263 #if PAGETABLE_LEVELS > 3
264 typedef struct { pudval_t pud; } pud_t; 264 typedef struct { pudval_t pud; } pud_t;
265 265
266 static inline pud_t native_make_pud(pmdval_t val) 266 static inline pud_t native_make_pud(pmdval_t val)
267 { 267 {
268 return (pud_t) { val }; 268 return (pud_t) { val };
269 } 269 }
270 270
271 static inline pudval_t native_pud_val(pud_t pud) 271 static inline pudval_t native_pud_val(pud_t pud)
272 { 272 {
273 return pud.pud; 273 return pud.pud;
274 } 274 }
275 #else 275 #else
276 #include <asm-generic/pgtable-nopud.h> 276 #include <asm-generic/pgtable-nopud.h>
277 277
278 static inline pudval_t native_pud_val(pud_t pud) 278 static inline pudval_t native_pud_val(pud_t pud)
279 { 279 {
280 return native_pgd_val(pud.pgd); 280 return native_pgd_val(pud.pgd);
281 } 281 }
282 #endif 282 #endif
283 283
284 #if PAGETABLE_LEVELS > 2 284 #if PAGETABLE_LEVELS > 2
285 typedef struct { pmdval_t pmd; } pmd_t; 285 typedef struct { pmdval_t pmd; } pmd_t;
286 286
287 static inline pmd_t native_make_pmd(pmdval_t val) 287 static inline pmd_t native_make_pmd(pmdval_t val)
288 { 288 {
289 return (pmd_t) { val }; 289 return (pmd_t) { val };
290 } 290 }
291 291
292 static inline pmdval_t native_pmd_val(pmd_t pmd) 292 static inline pmdval_t native_pmd_val(pmd_t pmd)
293 { 293 {
294 return pmd.pmd; 294 return pmd.pmd;
295 } 295 }
296 #else 296 #else
297 #include <asm-generic/pgtable-nopmd.h> 297 #include <asm-generic/pgtable-nopmd.h>
298 298
299 static inline pmdval_t native_pmd_val(pmd_t pmd) 299 static inline pmdval_t native_pmd_val(pmd_t pmd)
300 { 300 {
301 return native_pgd_val(pmd.pud.pgd); 301 return native_pgd_val(pmd.pud.pgd);
302 } 302 }
303 #endif 303 #endif
304 304
305 static inline pudval_t pud_flags(pud_t pud) 305 static inline pudval_t pud_flags(pud_t pud)
306 { 306 {
307 return native_pud_val(pud) & PTE_FLAGS_MASK; 307 return native_pud_val(pud) & PTE_FLAGS_MASK;
308 } 308 }
309 309
310 static inline pmdval_t pmd_flags(pmd_t pmd) 310 static inline pmdval_t pmd_flags(pmd_t pmd)
311 { 311 {
312 return native_pmd_val(pmd) & PTE_FLAGS_MASK; 312 return native_pmd_val(pmd) & PTE_FLAGS_MASK;
313 } 313 }
314 314
315 static inline pte_t native_make_pte(pteval_t val) 315 static inline pte_t native_make_pte(pteval_t val)
316 { 316 {
317 return (pte_t) { .pte = val }; 317 return (pte_t) { .pte = val };
318 } 318 }
319 319
320 static inline pteval_t native_pte_val(pte_t pte) 320 static inline pteval_t native_pte_val(pte_t pte)
321 { 321 {
322 return pte.pte; 322 return pte.pte;
323 } 323 }
324 324
325 static inline pteval_t pte_flags(pte_t pte) 325 static inline pteval_t pte_flags(pte_t pte)
326 { 326 {
327 return native_pte_val(pte) & PTE_FLAGS_MASK; 327 return native_pte_val(pte) & PTE_FLAGS_MASK;
328 } 328 }
329 329
330 #ifdef CONFIG_NUMA_BALANCING 330 #ifdef CONFIG_NUMA_BALANCING
331 /* Set of bits that distinguishes present, prot_none and numa ptes */ 331 /* Set of bits that distinguishes present, prot_none and numa ptes */
332 #define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT) 332 #define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)
333 static inline pteval_t ptenuma_flags(pte_t pte) 333 static inline pteval_t ptenuma_flags(pte_t pte)
334 { 334 {
335 return pte_flags(pte) & _PAGE_NUMA_MASK; 335 return pte_flags(pte) & _PAGE_NUMA_MASK;
336 } 336 }
337 337
338 static inline pmdval_t pmdnuma_flags(pmd_t pmd) 338 static inline pmdval_t pmdnuma_flags(pmd_t pmd)
339 { 339 {
340 return pmd_flags(pmd) & _PAGE_NUMA_MASK; 340 return pmd_flags(pmd) & _PAGE_NUMA_MASK;
341 } 341 }
342 #endif /* CONFIG_NUMA_BALANCING */ 342 #endif /* CONFIG_NUMA_BALANCING */
343 343
344 #define pgprot_val(x) ((x).pgprot) 344 #define pgprot_val(x) ((x).pgprot)
345 #define __pgprot(x) ((pgprot_t) { (x) } ) 345 #define __pgprot(x) ((pgprot_t) { (x) } )
346 346
347 extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM]; 347 extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM];
348 extern uint8_t __pte2cachemode_tbl[8]; 348 extern uint8_t __pte2cachemode_tbl[8];
349 349
350 #define __pte2cm_idx(cb) \ 350 #define __pte2cm_idx(cb) \
351 ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \ 351 ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \
352 (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \ 352 (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \
353 (((cb) >> _PAGE_BIT_PWT) & 1)) 353 (((cb) >> _PAGE_BIT_PWT) & 1))
354 #define __cm_idx2pte(i) \ 354 #define __cm_idx2pte(i) \
355 ((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \ 355 ((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \
356 (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \ 356 (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \
357 (((i) & 1) << _PAGE_BIT_PWT)) 357 (((i) & 1) << _PAGE_BIT_PWT))
358 358
359 static inline unsigned long cachemode2protval(enum page_cache_mode pcm) 359 static inline unsigned long cachemode2protval(enum page_cache_mode pcm)
360 { 360 {
361 if (likely(pcm == 0)) 361 if (likely(pcm == 0))
362 return 0; 362 return 0;
363 return __cachemode2pte_tbl[pcm]; 363 return __cachemode2pte_tbl[pcm];
364 } 364 }
365 static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) 365 static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
366 { 366 {
367 return __pgprot(cachemode2protval(pcm)); 367 return __pgprot(cachemode2protval(pcm));
368 } 368 }
369 static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) 369 static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
370 { 370 {
371 unsigned long masked; 371 unsigned long masked;
372 372
373 masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; 373 masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK;
374 if (likely(masked == 0)) 374 if (likely(masked == 0))
375 return 0; 375 return 0;
376 return __pte2cachemode_tbl[__pte2cm_idx(masked)]; 376 return __pte2cachemode_tbl[__pte2cm_idx(masked)];
377 } 377 }
378 static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) 378 static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot)
379 { 379 {
380 pgprot_t new; 380 pgprot_t new;
381 unsigned long val; 381 unsigned long val;
382 382
383 val = pgprot_val(pgprot); 383 val = pgprot_val(pgprot);
384 pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | 384 pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
385 ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); 385 ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
386 return new; 386 return new;
387 } 387 }
388 static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) 388 static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot)
389 { 389 {
390 pgprot_t new; 390 pgprot_t new;
391 unsigned long val; 391 unsigned long val;
392 392
393 val = pgprot_val(pgprot); 393 val = pgprot_val(pgprot);
394 pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | 394 pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) |
395 ((val & _PAGE_PAT_LARGE) >> 395 ((val & _PAGE_PAT_LARGE) >>
396 (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); 396 (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT));
397 return new; 397 return new;
398 } 398 }
399 399
400 400
401 typedef struct page *pgtable_t; 401 typedef struct page *pgtable_t;
402 402
403 extern pteval_t __supported_pte_mask; 403 extern pteval_t __supported_pte_mask;
404 extern void set_nx(void); 404 extern void set_nx(void);
405 extern int nx_enabled; 405 extern int nx_enabled;
406 406
407 #define pgprot_writecombine pgprot_writecombine 407 #define pgprot_writecombine pgprot_writecombine
408 extern pgprot_t pgprot_writecombine(pgprot_t prot); 408 extern pgprot_t pgprot_writecombine(pgprot_t prot);
409 409
410 /* Indicate that x86 has its own track and untrack pfn vma functions */ 410 /* Indicate that x86 has its own track and untrack pfn vma functions */
411 #define __HAVE_PFNMAP_TRACKING 411 #define __HAVE_PFNMAP_TRACKING
412 412
413 #define __HAVE_PHYS_MEM_ACCESS_PROT 413 #define __HAVE_PHYS_MEM_ACCESS_PROT
414 struct file; 414 struct file;
415 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 415 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
416 unsigned long size, pgprot_t vma_prot); 416 unsigned long size, pgprot_t vma_prot);
417 int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, 417 int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
418 unsigned long size, pgprot_t *vma_prot); 418 unsigned long size, pgprot_t *vma_prot);
419 419
420 /* Install a pte for a particular vaddr in kernel space. */ 420 /* Install a pte for a particular vaddr in kernel space. */
421 void set_pte_vaddr(unsigned long vaddr, pte_t pte); 421 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
422 422
423 #ifdef CONFIG_X86_32 423 #ifdef CONFIG_X86_32
424 extern void native_pagetable_init(void); 424 extern void native_pagetable_init(void);
425 #else 425 #else
426 #define native_pagetable_init paging_init 426 #define native_pagetable_init paging_init
427 #endif 427 #endif
428 428
429 struct seq_file; 429 struct seq_file;
430 extern void arch_report_meminfo(struct seq_file *m); 430 extern void arch_report_meminfo(struct seq_file *m);
431 431
432 enum pg_level { 432 enum pg_level {
433 PG_LEVEL_NONE, 433 PG_LEVEL_NONE,
434 PG_LEVEL_4K, 434 PG_LEVEL_4K,
435 PG_LEVEL_2M, 435 PG_LEVEL_2M,
436 PG_LEVEL_1G, 436 PG_LEVEL_1G,
437 PG_LEVEL_NUM 437 PG_LEVEL_NUM
438 }; 438 };
439 439
440 #ifdef CONFIG_PROC_FS 440 #ifdef CONFIG_PROC_FS
441 extern void update_page_count(int level, unsigned long pages); 441 extern void update_page_count(int level, unsigned long pages);
442 #else 442 #else
443 static inline void update_page_count(int level, unsigned long pages) { } 443 static inline void update_page_count(int level, unsigned long pages) { }
444 #endif 444 #endif
445 445
446 /* 446 /*
447 * Helper function that returns the kernel pagetable entry controlling 447 * Helper function that returns the kernel pagetable entry controlling
448 * the virtual address 'address'. NULL means no pagetable entry present. 448 * the virtual address 'address'. NULL means no pagetable entry present.
449 * NOTE: the return type is pte_t but if the pmd is PSE then we return it 449 * NOTE: the return type is pte_t but if the pmd is PSE then we return it
450 * as a pte too. 450 * as a pte too.
451 */ 451 */
452 extern pte_t *lookup_address(unsigned long address, unsigned int *level); 452 extern pte_t *lookup_address(unsigned long address, unsigned int *level);
453 extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, 453 extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
454 unsigned int *level); 454 unsigned int *level);
455 extern pmd_t *lookup_pmd_address(unsigned long address);
455 extern phys_addr_t slow_virt_to_phys(void *__address); 456 extern phys_addr_t slow_virt_to_phys(void *__address);
456 extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, 457 extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
457 unsigned numpages, unsigned long page_flags); 458 unsigned numpages, unsigned long page_flags);
458 void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address, 459 void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
459 unsigned numpages); 460 unsigned numpages);
460 #endif /* !__ASSEMBLY__ */ 461 #endif /* !__ASSEMBLY__ */
461 462
462 #endif /* _ASM_X86_PGTABLE_DEFS_H */ 463 #endif /* _ASM_X86_PGTABLE_DEFS_H */
463 464
arch/x86/include/asm/xen/page.h
1 #ifndef _ASM_X86_XEN_PAGE_H 1 #ifndef _ASM_X86_XEN_PAGE_H
2 #define _ASM_X86_XEN_PAGE_H 2 #define _ASM_X86_XEN_PAGE_H
3 3
4 #include <linux/kernel.h> 4 #include <linux/kernel.h>
5 #include <linux/types.h> 5 #include <linux/types.h>
6 #include <linux/spinlock.h> 6 #include <linux/spinlock.h>
7 #include <linux/pfn.h> 7 #include <linux/pfn.h>
8 #include <linux/mm.h> 8 #include <linux/mm.h>
9 9
10 #include <asm/uaccess.h> 10 #include <asm/uaccess.h>
11 #include <asm/page.h> 11 #include <asm/page.h>
12 #include <asm/pgtable.h> 12 #include <asm/pgtable.h>
13 13
14 #include <xen/interface/xen.h> 14 #include <xen/interface/xen.h>
15 #include <xen/grant_table.h> 15 #include <xen/grant_table.h>
16 #include <xen/features.h> 16 #include <xen/features.h>
17 17
18 /* Xen machine address */ 18 /* Xen machine address */
19 typedef struct xmaddr { 19 typedef struct xmaddr {
20 phys_addr_t maddr; 20 phys_addr_t maddr;
21 } xmaddr_t; 21 } xmaddr_t;
22 22
23 /* Xen pseudo-physical address */ 23 /* Xen pseudo-physical address */
24 typedef struct xpaddr { 24 typedef struct xpaddr {
25 phys_addr_t paddr; 25 phys_addr_t paddr;
26 } xpaddr_t; 26 } xpaddr_t;
27 27
28 #define XMADDR(x) ((xmaddr_t) { .maddr = (x) }) 28 #define XMADDR(x) ((xmaddr_t) { .maddr = (x) })
29 #define XPADDR(x) ((xpaddr_t) { .paddr = (x) }) 29 #define XPADDR(x) ((xpaddr_t) { .paddr = (x) })
30 30
31 /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ 31 /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
32 #define INVALID_P2M_ENTRY (~0UL) 32 #define INVALID_P2M_ENTRY (~0UL)
33 #define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1)) 33 #define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1))
34 #define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2)) 34 #define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2))
35 #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) 35 #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
36 #define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) 36 #define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT)
37 37
38 /* Maximum amount of memory we can handle in a domain in pages */ 38 /* Maximum amount of memory we can handle in a domain in pages */
39 #define MAX_DOMAIN_PAGES \ 39 #define MAX_DOMAIN_PAGES \
40 ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) 40 ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
41 41
42 extern unsigned long *machine_to_phys_mapping; 42 extern unsigned long *machine_to_phys_mapping;
43 extern unsigned long machine_to_phys_nr; 43 extern unsigned long machine_to_phys_nr;
44 extern unsigned long *xen_p2m_addr;
45 extern unsigned long xen_p2m_size;
46 extern unsigned long xen_max_p2m_pfn;
44 47
45 extern unsigned long get_phys_to_machine(unsigned long pfn); 48 extern unsigned long get_phys_to_machine(unsigned long pfn);
46 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); 49 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
47 extern bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn);
48 extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); 50 extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
49 extern unsigned long set_phys_range_identity(unsigned long pfn_s, 51 extern unsigned long set_phys_range_identity(unsigned long pfn_s,
50 unsigned long pfn_e); 52 unsigned long pfn_e);
51 53
52 extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, 54 extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
53 struct gnttab_map_grant_ref *kmap_ops, 55 struct gnttab_map_grant_ref *kmap_ops,
54 struct page **pages, unsigned int count); 56 struct page **pages, unsigned int count);
55 extern int m2p_add_override(unsigned long mfn, struct page *page,
56 struct gnttab_map_grant_ref *kmap_op);
57 extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, 57 extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
58 struct gnttab_map_grant_ref *kmap_ops, 58 struct gnttab_map_grant_ref *kmap_ops,
59 struct page **pages, unsigned int count); 59 struct page **pages, unsigned int count);
60 extern int m2p_remove_override(struct page *page,
61 struct gnttab_map_grant_ref *kmap_op,
62 unsigned long mfn);
63 extern struct page *m2p_find_override(unsigned long mfn);
64 extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); 60 extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
65 61
62 /*
63 * Helper functions to write or read unsigned long values to/from
64 * memory, when the access may fault.
65 */
66 static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val)
67 {
68 return __put_user(val, (unsigned long __user *)addr);
69 }
70
71 static inline int xen_safe_read_ulong(unsigned long *addr, unsigned long *val)
72 {
73 return __get_user(*val, (unsigned long __user *)addr);
74 }
75
76 /*
77 * When to use pfn_to_mfn(), __pfn_to_mfn() or get_phys_to_machine():
78 * - pfn_to_mfn() returns either INVALID_P2M_ENTRY or the mfn. No indicator
79 * bits (identity or foreign) are set.
80 * - __pfn_to_mfn() returns the found entry of the p2m table. A possibly set
81 * identity or foreign indicator will be still set. __pfn_to_mfn() is
82 * encapsulating get_phys_to_machine() which is called in special cases only.
83 * - get_phys_to_machine() is to be called by __pfn_to_mfn() only in special
84 * cases needing an extended handling.
85 */
86 static inline unsigned long __pfn_to_mfn(unsigned long pfn)
87 {
88 unsigned long mfn;
89
90 if (pfn < xen_p2m_size)
91 mfn = xen_p2m_addr[pfn];
92 else if (unlikely(pfn < xen_max_p2m_pfn))
93 return get_phys_to_machine(pfn);
94 else
95 return IDENTITY_FRAME(pfn);
96
97 if (unlikely(mfn == INVALID_P2M_ENTRY))
98 return get_phys_to_machine(pfn);
99
100 return mfn;
101 }
102
66 static inline unsigned long pfn_to_mfn(unsigned long pfn) 103 static inline unsigned long pfn_to_mfn(unsigned long pfn)
67 { 104 {
68 unsigned long mfn; 105 unsigned long mfn;
69 106
70 if (xen_feature(XENFEAT_auto_translated_physmap)) 107 if (xen_feature(XENFEAT_auto_translated_physmap))
71 return pfn; 108 return pfn;
72 109
73 mfn = get_phys_to_machine(pfn); 110 mfn = __pfn_to_mfn(pfn);
74 111
75 if (mfn != INVALID_P2M_ENTRY) 112 if (mfn != INVALID_P2M_ENTRY)
76 mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); 113 mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
77 114
78 return mfn; 115 return mfn;
79 } 116 }
80 117
81 static inline int phys_to_machine_mapping_valid(unsigned long pfn) 118 static inline int phys_to_machine_mapping_valid(unsigned long pfn)
82 { 119 {
83 if (xen_feature(XENFEAT_auto_translated_physmap)) 120 if (xen_feature(XENFEAT_auto_translated_physmap))
84 return 1; 121 return 1;
85 122
86 return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY; 123 return __pfn_to_mfn(pfn) != INVALID_P2M_ENTRY;
87 } 124 }
88 125
89 static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn) 126 static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn)
90 { 127 {
91 unsigned long pfn; 128 unsigned long pfn;
92 int ret; 129 int ret;
93 130
94 if (xen_feature(XENFEAT_auto_translated_physmap)) 131 if (xen_feature(XENFEAT_auto_translated_physmap))
95 return mfn; 132 return mfn;
96 133
97 if (unlikely(mfn >= machine_to_phys_nr)) 134 if (unlikely(mfn >= machine_to_phys_nr))
98 return ~0; 135 return ~0;
99 136
100 /* 137 /*
101 * The array access can fail (e.g., device space beyond end of RAM). 138 * The array access can fail (e.g., device space beyond end of RAM).
102 * In such cases it doesn't matter what we return (we return garbage), 139 * In such cases it doesn't matter what we return (we return garbage),
103 * but we must handle the fault without crashing! 140 * but we must handle the fault without crashing!
104 */ 141 */
105 ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); 142 ret = xen_safe_read_ulong(&machine_to_phys_mapping[mfn], &pfn);
106 if (ret < 0) 143 if (ret < 0)
107 return ~0; 144 return ~0;
108 145
109 return pfn; 146 return pfn;
110 } 147 }
111 148
112 static inline unsigned long mfn_to_pfn(unsigned long mfn) 149 static inline unsigned long mfn_to_pfn(unsigned long mfn)
113 { 150 {
114 unsigned long pfn; 151 unsigned long pfn;
115 152
116 if (xen_feature(XENFEAT_auto_translated_physmap)) 153 if (xen_feature(XENFEAT_auto_translated_physmap))
117 return mfn; 154 return mfn;
118 155
119 pfn = mfn_to_pfn_no_overrides(mfn); 156 pfn = mfn_to_pfn_no_overrides(mfn);
120 if (get_phys_to_machine(pfn) != mfn) { 157 if (__pfn_to_mfn(pfn) != mfn) {
121 /* 158 /*
122 * If this appears to be a foreign mfn (because the pfn 159 * If this appears to be a foreign mfn (because the pfn
123 * doesn't map back to the mfn), then check the local override 160 * doesn't map back to the mfn), then check the local override
124 * table to see if there's a better pfn to use. 161 * table to see if there's a better pfn to use.
125 * 162 *
126 * m2p_find_override_pfn returns ~0 if it doesn't find anything. 163 * m2p_find_override_pfn returns ~0 if it doesn't find anything.
127 */ 164 */
128 pfn = m2p_find_override_pfn(mfn, ~0); 165 pfn = m2p_find_override_pfn(mfn, ~0);
129 } 166 }
130 167
131 /* 168 /*
132 * pfn is ~0 if there are no entries in the m2p for mfn or if the 169 * pfn is ~0 if there are no entries in the m2p for mfn or if the
133 * entry doesn't map back to the mfn and m2p_override doesn't have a 170 * entry doesn't map back to the mfn and m2p_override doesn't have a
134 * valid entry for it. 171 * valid entry for it.
135 */ 172 */
136 if (pfn == ~0 && 173 if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn))
137 get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn))
138 pfn = mfn; 174 pfn = mfn;
139 175
140 return pfn; 176 return pfn;
141 } 177 }
142 178
143 static inline xmaddr_t phys_to_machine(xpaddr_t phys) 179 static inline xmaddr_t phys_to_machine(xpaddr_t phys)
144 { 180 {
145 unsigned offset = phys.paddr & ~PAGE_MASK; 181 unsigned offset = phys.paddr & ~PAGE_MASK;
146 return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset); 182 return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
147 } 183 }
148 184
149 static inline xpaddr_t machine_to_phys(xmaddr_t machine) 185 static inline xpaddr_t machine_to_phys(xmaddr_t machine)
150 { 186 {
151 unsigned offset = machine.maddr & ~PAGE_MASK; 187 unsigned offset = machine.maddr & ~PAGE_MASK;
152 return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); 188 return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
153 } 189 }
154 190
155 /* 191 /*
156 * We detect special mappings in one of two ways: 192 * We detect special mappings in one of two ways:
157 * 1. If the MFN is an I/O page then Xen will set the m2p entry 193 * 1. If the MFN is an I/O page then Xen will set the m2p entry
158 * to be outside our maximum possible pseudophys range. 194 * to be outside our maximum possible pseudophys range.
159 * 2. If the MFN belongs to a different domain then we will certainly 195 * 2. If the MFN belongs to a different domain then we will certainly
160 * not have MFN in our p2m table. Conversely, if the page is ours, 196 * not have MFN in our p2m table. Conversely, if the page is ours,
161 * then we'll have p2m(m2p(MFN))==MFN. 197 * then we'll have p2m(m2p(MFN))==MFN.
162 * If we detect a special mapping then it doesn't have a 'struct page'. 198 * If we detect a special mapping then it doesn't have a 'struct page'.
163 * We force !pfn_valid() by returning an out-of-range pointer. 199 * We force !pfn_valid() by returning an out-of-range pointer.
164 * 200 *
165 * NB. These checks require that, for any MFN that is not in our reservation, 201 * NB. These checks require that, for any MFN that is not in our reservation,
166 * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if 202 * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
167 * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. 203 * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN.
168 * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. 204 * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
169 * 205 *
170 * NB2. When deliberately mapping foreign pages into the p2m table, you *must* 206 * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
171 * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we 207 * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
172 * require. In all the cases we care about, the FOREIGN_FRAME bit is 208 * require. In all the cases we care about, the FOREIGN_FRAME bit is
173 * masked (e.g., pfn_to_mfn()) so behaviour there is correct. 209 * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
174 */ 210 */
175 static inline unsigned long mfn_to_local_pfn(unsigned long mfn) 211 static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
176 { 212 {
177 unsigned long pfn; 213 unsigned long pfn;
178 214
179 if (xen_feature(XENFEAT_auto_translated_physmap)) 215 if (xen_feature(XENFEAT_auto_translated_physmap))
180 return mfn; 216 return mfn;
181 217
182 pfn = mfn_to_pfn(mfn); 218 pfn = mfn_to_pfn(mfn);
183 if (get_phys_to_machine(pfn) != mfn) 219 if (__pfn_to_mfn(pfn) != mfn)
184 return -1; /* force !pfn_valid() */ 220 return -1; /* force !pfn_valid() */
185 return pfn; 221 return pfn;
186 } 222 }
187 223
188 /* VIRT <-> MACHINE conversion */ 224 /* VIRT <-> MACHINE conversion */
189 #define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) 225 #define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
190 #define virt_to_pfn(v) (PFN_DOWN(__pa(v))) 226 #define virt_to_pfn(v) (PFN_DOWN(__pa(v)))
191 #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) 227 #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v)))
192 #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) 228 #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
193 229
194 static inline unsigned long pte_mfn(pte_t pte) 230 static inline unsigned long pte_mfn(pte_t pte)
195 { 231 {
196 return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; 232 return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
197 } 233 }
198 234
199 static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot) 235 static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
200 { 236 {
201 pte_t pte; 237 pte_t pte;
202 238
203 pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) | 239 pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) |
204 massage_pgprot(pgprot); 240 massage_pgprot(pgprot);
205 241
206 return pte; 242 return pte;
207 } 243 }
208 244
209 static inline pteval_t pte_val_ma(pte_t pte) 245 static inline pteval_t pte_val_ma(pte_t pte)
210 { 246 {
211 return pte.pte; 247 return pte.pte;
212 } 248 }
213 249
214 static inline pte_t __pte_ma(pteval_t x) 250 static inline pte_t __pte_ma(pteval_t x)
215 { 251 {
216 return (pte_t) { .pte = x }; 252 return (pte_t) { .pte = x };
217 } 253 }
218 254
219 #define pmd_val_ma(v) ((v).pmd) 255 #define pmd_val_ma(v) ((v).pmd)
220 #ifdef __PAGETABLE_PUD_FOLDED 256 #ifdef __PAGETABLE_PUD_FOLDED
221 #define pud_val_ma(v) ((v).pgd.pgd) 257 #define pud_val_ma(v) ((v).pgd.pgd)
222 #else 258 #else
223 #define pud_val_ma(v) ((v).pud) 259 #define pud_val_ma(v) ((v).pud)
224 #endif 260 #endif
225 #define __pmd_ma(x) ((pmd_t) { (x) } ) 261 #define __pmd_ma(x) ((pmd_t) { (x) } )
226 262
227 #define pgd_val_ma(x) ((x).pgd) 263 #define pgd_val_ma(x) ((x).pgd)
228 264
229 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); 265 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid);
230 266
231 xmaddr_t arbitrary_virt_to_machine(void *address); 267 xmaddr_t arbitrary_virt_to_machine(void *address);
232 unsigned long arbitrary_virt_to_mfn(void *vaddr); 268 unsigned long arbitrary_virt_to_mfn(void *vaddr);
233 void make_lowmem_page_readonly(void *vaddr); 269 void make_lowmem_page_readonly(void *vaddr);
234 void make_lowmem_page_readwrite(void *vaddr); 270 void make_lowmem_page_readwrite(void *vaddr);
235 271
236 #define xen_remap(cookie, size) ioremap((cookie), (size)); 272 #define xen_remap(cookie, size) ioremap((cookie), (size));
237 #define xen_unmap(cookie) iounmap((cookie)) 273 #define xen_unmap(cookie) iounmap((cookie))
238 274
239 static inline bool xen_arch_need_swiotlb(struct device *dev, 275 static inline bool xen_arch_need_swiotlb(struct device *dev,
arch/x86/mm/pageattr.c
1 /* 1 /*
2 * Copyright 2002 Andi Kleen, SuSE Labs. 2 * Copyright 2002 Andi Kleen, SuSE Labs.
3 * Thanks to Ben LaHaise for precious feedback. 3 * Thanks to Ben LaHaise for precious feedback.
4 */ 4 */
5 #include <linux/highmem.h> 5 #include <linux/highmem.h>
6 #include <linux/bootmem.h> 6 #include <linux/bootmem.h>
7 #include <linux/module.h> 7 #include <linux/module.h>
8 #include <linux/sched.h> 8 #include <linux/sched.h>
9 #include <linux/mm.h> 9 #include <linux/mm.h>
10 #include <linux/interrupt.h> 10 #include <linux/interrupt.h>
11 #include <linux/seq_file.h> 11 #include <linux/seq_file.h>
12 #include <linux/debugfs.h> 12 #include <linux/debugfs.h>
13 #include <linux/pfn.h> 13 #include <linux/pfn.h>
14 #include <linux/percpu.h> 14 #include <linux/percpu.h>
15 #include <linux/gfp.h> 15 #include <linux/gfp.h>
16 #include <linux/pci.h> 16 #include <linux/pci.h>
17 17
18 #include <asm/e820.h> 18 #include <asm/e820.h>
19 #include <asm/processor.h> 19 #include <asm/processor.h>
20 #include <asm/tlbflush.h> 20 #include <asm/tlbflush.h>
21 #include <asm/sections.h> 21 #include <asm/sections.h>
22 #include <asm/setup.h> 22 #include <asm/setup.h>
23 #include <asm/uaccess.h> 23 #include <asm/uaccess.h>
24 #include <asm/pgalloc.h> 24 #include <asm/pgalloc.h>
25 #include <asm/proto.h> 25 #include <asm/proto.h>
26 #include <asm/pat.h> 26 #include <asm/pat.h>
27 27
28 /* 28 /*
29 * The current flushing context - we pass it instead of 5 arguments: 29 * The current flushing context - we pass it instead of 5 arguments:
30 */ 30 */
31 struct cpa_data { 31 struct cpa_data {
32 unsigned long *vaddr; 32 unsigned long *vaddr;
33 pgd_t *pgd; 33 pgd_t *pgd;
34 pgprot_t mask_set; 34 pgprot_t mask_set;
35 pgprot_t mask_clr; 35 pgprot_t mask_clr;
36 int numpages; 36 int numpages;
37 int flags; 37 int flags;
38 unsigned long pfn; 38 unsigned long pfn;
39 unsigned force_split : 1; 39 unsigned force_split : 1;
40 int curpage; 40 int curpage;
41 struct page **pages; 41 struct page **pages;
42 }; 42 };
43 43
44 /* 44 /*
45 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) 45 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
46 * using cpa_lock. So that we don't allow any other cpu, with stale large tlb 46 * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
47 * entries change the page attribute in parallel to some other cpu 47 * entries change the page attribute in parallel to some other cpu
48 * splitting a large page entry along with changing the attribute. 48 * splitting a large page entry along with changing the attribute.
49 */ 49 */
50 static DEFINE_SPINLOCK(cpa_lock); 50 static DEFINE_SPINLOCK(cpa_lock);
51 51
52 #define CPA_FLUSHTLB 1 52 #define CPA_FLUSHTLB 1
53 #define CPA_ARRAY 2 53 #define CPA_ARRAY 2
54 #define CPA_PAGES_ARRAY 4 54 #define CPA_PAGES_ARRAY 4
55 55
56 #ifdef CONFIG_PROC_FS 56 #ifdef CONFIG_PROC_FS
57 static unsigned long direct_pages_count[PG_LEVEL_NUM]; 57 static unsigned long direct_pages_count[PG_LEVEL_NUM];
58 58
59 void update_page_count(int level, unsigned long pages) 59 void update_page_count(int level, unsigned long pages)
60 { 60 {
61 /* Protect against CPA */ 61 /* Protect against CPA */
62 spin_lock(&pgd_lock); 62 spin_lock(&pgd_lock);
63 direct_pages_count[level] += pages; 63 direct_pages_count[level] += pages;
64 spin_unlock(&pgd_lock); 64 spin_unlock(&pgd_lock);
65 } 65 }
66 66
67 static void split_page_count(int level) 67 static void split_page_count(int level)
68 { 68 {
69 direct_pages_count[level]--; 69 direct_pages_count[level]--;
70 direct_pages_count[level - 1] += PTRS_PER_PTE; 70 direct_pages_count[level - 1] += PTRS_PER_PTE;
71 } 71 }
72 72
73 void arch_report_meminfo(struct seq_file *m) 73 void arch_report_meminfo(struct seq_file *m)
74 { 74 {
75 seq_printf(m, "DirectMap4k: %8lu kB\n", 75 seq_printf(m, "DirectMap4k: %8lu kB\n",
76 direct_pages_count[PG_LEVEL_4K] << 2); 76 direct_pages_count[PG_LEVEL_4K] << 2);
77 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 77 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
78 seq_printf(m, "DirectMap2M: %8lu kB\n", 78 seq_printf(m, "DirectMap2M: %8lu kB\n",
79 direct_pages_count[PG_LEVEL_2M] << 11); 79 direct_pages_count[PG_LEVEL_2M] << 11);
80 #else 80 #else
81 seq_printf(m, "DirectMap4M: %8lu kB\n", 81 seq_printf(m, "DirectMap4M: %8lu kB\n",
82 direct_pages_count[PG_LEVEL_2M] << 12); 82 direct_pages_count[PG_LEVEL_2M] << 12);
83 #endif 83 #endif
84 #ifdef CONFIG_X86_64 84 #ifdef CONFIG_X86_64
85 if (direct_gbpages) 85 if (direct_gbpages)
86 seq_printf(m, "DirectMap1G: %8lu kB\n", 86 seq_printf(m, "DirectMap1G: %8lu kB\n",
87 direct_pages_count[PG_LEVEL_1G] << 20); 87 direct_pages_count[PG_LEVEL_1G] << 20);
88 #endif 88 #endif
89 } 89 }
90 #else 90 #else
91 static inline void split_page_count(int level) { } 91 static inline void split_page_count(int level) { }
92 #endif 92 #endif
93 93
94 #ifdef CONFIG_X86_64 94 #ifdef CONFIG_X86_64
95 95
96 static inline unsigned long highmap_start_pfn(void) 96 static inline unsigned long highmap_start_pfn(void)
97 { 97 {
98 return __pa_symbol(_text) >> PAGE_SHIFT; 98 return __pa_symbol(_text) >> PAGE_SHIFT;
99 } 99 }
100 100
101 static inline unsigned long highmap_end_pfn(void) 101 static inline unsigned long highmap_end_pfn(void)
102 { 102 {
103 return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; 103 return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT;
104 } 104 }
105 105
106 #endif 106 #endif
107 107
108 #ifdef CONFIG_DEBUG_PAGEALLOC 108 #ifdef CONFIG_DEBUG_PAGEALLOC
109 # define debug_pagealloc 1 109 # define debug_pagealloc 1
110 #else 110 #else
111 # define debug_pagealloc 0 111 # define debug_pagealloc 0
112 #endif 112 #endif
113 113
114 static inline int 114 static inline int
115 within(unsigned long addr, unsigned long start, unsigned long end) 115 within(unsigned long addr, unsigned long start, unsigned long end)
116 { 116 {
117 return addr >= start && addr < end; 117 return addr >= start && addr < end;
118 } 118 }
119 119
120 /* 120 /*
121 * Flushing functions 121 * Flushing functions
122 */ 122 */
123 123
124 /** 124 /**
125 * clflush_cache_range - flush a cache range with clflush 125 * clflush_cache_range - flush a cache range with clflush
126 * @vaddr: virtual start address 126 * @vaddr: virtual start address
127 * @size: number of bytes to flush 127 * @size: number of bytes to flush
128 * 128 *
129 * clflushopt is an unordered instruction which needs fencing with mfence or 129 * clflushopt is an unordered instruction which needs fencing with mfence or
130 * sfence to avoid ordering issues. 130 * sfence to avoid ordering issues.
131 */ 131 */
132 void clflush_cache_range(void *vaddr, unsigned int size) 132 void clflush_cache_range(void *vaddr, unsigned int size)
133 { 133 {
134 void *vend = vaddr + size - 1; 134 void *vend = vaddr + size - 1;
135 135
136 mb(); 136 mb();
137 137
138 for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) 138 for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
139 clflushopt(vaddr); 139 clflushopt(vaddr);
140 /* 140 /*
141 * Flush any possible final partial cacheline: 141 * Flush any possible final partial cacheline:
142 */ 142 */
143 clflushopt(vend); 143 clflushopt(vend);
144 144
145 mb(); 145 mb();
146 } 146 }
147 EXPORT_SYMBOL_GPL(clflush_cache_range); 147 EXPORT_SYMBOL_GPL(clflush_cache_range);
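/*
 * Hypothetical usage sketch (not part of this diff): a caller that wrote a
 * buffer through a cacheable mapping and needs the data visible to a
 * non-coherent observer can simply do
 *
 *	clflush_cache_range(buf, len);
 *
 * and rely on the mb() pair above for ordering; buf and len are
 * illustrative names.
 */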
148 148
149 static void __cpa_flush_all(void *arg) 149 static void __cpa_flush_all(void *arg)
150 { 150 {
151 unsigned long cache = (unsigned long)arg; 151 unsigned long cache = (unsigned long)arg;
152 152
153 /* 153 /*
154 * Flush all to work around Errata in early athlons regarding 154 * Flush all to work around Errata in early athlons regarding
155 * large page flushing. 155 * large page flushing.
156 */ 156 */
157 __flush_tlb_all(); 157 __flush_tlb_all();
158 158
159 if (cache && boot_cpu_data.x86 >= 4) 159 if (cache && boot_cpu_data.x86 >= 4)
160 wbinvd(); 160 wbinvd();
161 } 161 }
162 162
163 static void cpa_flush_all(unsigned long cache) 163 static void cpa_flush_all(unsigned long cache)
164 { 164 {
165 BUG_ON(irqs_disabled()); 165 BUG_ON(irqs_disabled());
166 166
167 on_each_cpu(__cpa_flush_all, (void *) cache, 1); 167 on_each_cpu(__cpa_flush_all, (void *) cache, 1);
168 } 168 }
169 169
170 static void __cpa_flush_range(void *arg) 170 static void __cpa_flush_range(void *arg)
171 { 171 {
172 /* 172 /*
173 * We could optimize that further and do individual per page 173 * We could optimize that further and do individual per page
174 * tlb invalidates for a low number of pages. Caveat: we must 174 * tlb invalidates for a low number of pages. Caveat: we must
175 * flush the high aliases on 64bit as well. 175 * flush the high aliases on 64bit as well.
176 */ 176 */
177 __flush_tlb_all(); 177 __flush_tlb_all();
178 } 178 }
179 179
180 static void cpa_flush_range(unsigned long start, int numpages, int cache) 180 static void cpa_flush_range(unsigned long start, int numpages, int cache)
181 { 181 {
182 unsigned int i, level; 182 unsigned int i, level;
183 unsigned long addr; 183 unsigned long addr;
184 184
185 BUG_ON(irqs_disabled()); 185 BUG_ON(irqs_disabled());
186 WARN_ON(PAGE_ALIGN(start) != start); 186 WARN_ON(PAGE_ALIGN(start) != start);
187 187
188 on_each_cpu(__cpa_flush_range, NULL, 1); 188 on_each_cpu(__cpa_flush_range, NULL, 1);
189 189
190 if (!cache) 190 if (!cache)
191 return; 191 return;
192 192
193 /* 193 /*
194 * We only need to flush on one CPU, 194 * We only need to flush on one CPU,
195 * clflush is a MESI-coherent instruction that 195 * clflush is a MESI-coherent instruction that
196 * will cause all other CPUs to flush the same 196 * will cause all other CPUs to flush the same
197 * cachelines: 197 * cachelines:
198 */ 198 */
199 for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { 199 for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
200 pte_t *pte = lookup_address(addr, &level); 200 pte_t *pte = lookup_address(addr, &level);
201 201
202 /* 202 /*
203 * Only flush present addresses: 203 * Only flush present addresses:
204 */ 204 */
205 if (pte && (pte_val(*pte) & _PAGE_PRESENT)) 205 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
206 clflush_cache_range((void *) addr, PAGE_SIZE); 206 clflush_cache_range((void *) addr, PAGE_SIZE);
207 } 207 }
208 } 208 }
209 209
210 static void cpa_flush_array(unsigned long *start, int numpages, int cache, 210 static void cpa_flush_array(unsigned long *start, int numpages, int cache,
211 int in_flags, struct page **pages) 211 int in_flags, struct page **pages)
212 { 212 {
213 unsigned int i, level; 213 unsigned int i, level;
214 unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ 214 unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
215 215
216 BUG_ON(irqs_disabled()); 216 BUG_ON(irqs_disabled());
217 217
218 on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); 218 on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
219 219
220 if (!cache || do_wbinvd) 220 if (!cache || do_wbinvd)
221 return; 221 return;
222 222
223 /* 223 /*
224 * We only need to flush on one CPU, 224 * We only need to flush on one CPU,
225 * clflush is a MESI-coherent instruction that 225 * clflush is a MESI-coherent instruction that
226 * will cause all other CPUs to flush the same 226 * will cause all other CPUs to flush the same
227 * cachelines: 227 * cachelines:
228 */ 228 */
229 for (i = 0; i < numpages; i++) { 229 for (i = 0; i < numpages; i++) {
230 unsigned long addr; 230 unsigned long addr;
231 pte_t *pte; 231 pte_t *pte;
232 232
233 if (in_flags & CPA_PAGES_ARRAY) 233 if (in_flags & CPA_PAGES_ARRAY)
234 addr = (unsigned long)page_address(pages[i]); 234 addr = (unsigned long)page_address(pages[i]);
235 else 235 else
236 addr = start[i]; 236 addr = start[i];
237 237
238 pte = lookup_address(addr, &level); 238 pte = lookup_address(addr, &level);
239 239
240 /* 240 /*
241 * Only flush present addresses: 241 * Only flush present addresses:
242 */ 242 */
243 if (pte && (pte_val(*pte) & _PAGE_PRESENT)) 243 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
244 clflush_cache_range((void *)addr, PAGE_SIZE); 244 clflush_cache_range((void *)addr, PAGE_SIZE);
245 } 245 }
246 } 246 }
247 247
248 /* 248 /*
249 * Certain areas of memory on x86 require very specific protection flags, 249 * Certain areas of memory on x86 require very specific protection flags,
250 * for example the BIOS area or kernel text. Callers don't always get this 250 * for example the BIOS area or kernel text. Callers don't always get this
251 * right (again, ioremap() on BIOS memory is not uncommon) so this function 251 * right (again, ioremap() on BIOS memory is not uncommon) so this function
252 * checks and fixes these known static required protection bits. 252 * checks and fixes these known static required protection bits.
253 */ 253 */
254 static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, 254 static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
255 unsigned long pfn) 255 unsigned long pfn)
256 { 256 {
257 pgprot_t forbidden = __pgprot(0); 257 pgprot_t forbidden = __pgprot(0);
258 258
259 /* 259 /*
260 * The BIOS area between 640k and 1Mb needs to be executable for 260 * The BIOS area between 640k and 1Mb needs to be executable for
261 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. 261 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
262 */ 262 */
263 #ifdef CONFIG_PCI_BIOS 263 #ifdef CONFIG_PCI_BIOS
264 if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) 264 if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
265 pgprot_val(forbidden) |= _PAGE_NX; 265 pgprot_val(forbidden) |= _PAGE_NX;
266 #endif 266 #endif
267 267
268 /* 268 /*
269 * The kernel text needs to be executable for obvious reasons. 269 * The kernel text needs to be executable for obvious reasons.
270 * This does not cover __inittext since that is gone later on. On 270 * This does not cover __inittext since that is gone later on. On
271 * 64bit we do not enforce !NX on the low mapping. 271 * 64bit we do not enforce !NX on the low mapping.
272 */ 272 */
273 if (within(address, (unsigned long)_text, (unsigned long)_etext)) 273 if (within(address, (unsigned long)_text, (unsigned long)_etext))
274 pgprot_val(forbidden) |= _PAGE_NX; 274 pgprot_val(forbidden) |= _PAGE_NX;
275 275
276 /* 276 /*
277 * The .rodata section needs to be read-only. Using the pfn 277 * The .rodata section needs to be read-only. Using the pfn
278 * catches all aliases. 278 * catches all aliases.
279 */ 279 */
280 if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, 280 if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT,
281 __pa_symbol(__end_rodata) >> PAGE_SHIFT)) 281 __pa_symbol(__end_rodata) >> PAGE_SHIFT))
282 pgprot_val(forbidden) |= _PAGE_RW; 282 pgprot_val(forbidden) |= _PAGE_RW;
283 283
284 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) 284 #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
285 /* 285 /*
286 * Once the kernel maps the text as RO (kernel_set_to_readonly is set), 286 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
287 * kernel text mappings for the large page aligned text, rodata sections 287 * kernel text mappings for the large page aligned text, rodata sections
288 * will always be read-only. The kernel identity mappings covering 288 * will always be read-only. The kernel identity mappings covering
289 * the holes caused by this alignment can be anything the user asks for. 289 * the holes caused by this alignment can be anything the user asks for.
290 * 290 *
291 * This will preserve the large page mappings for kernel text/data 291 * This will preserve the large page mappings for kernel text/data
292 * at no extra cost. 292 * at no extra cost.
293 */ 293 */
294 if (kernel_set_to_readonly && 294 if (kernel_set_to_readonly &&
295 within(address, (unsigned long)_text, 295 within(address, (unsigned long)_text,
296 (unsigned long)__end_rodata_hpage_align)) { 296 (unsigned long)__end_rodata_hpage_align)) {
297 unsigned int level; 297 unsigned int level;
298 298
299 /* 299 /*
300 * Don't enforce the !RW mapping for the kernel text mapping, 300 * Don't enforce the !RW mapping for the kernel text mapping,
301 * if the current mapping is already using small page mapping. 301 * if the current mapping is already using small page mapping.
302 * No need to work hard to preserve large page mappings in this 302 * No need to work hard to preserve large page mappings in this
303 * case. 303 * case.
304 * 304 *
305 * This also fixes the Linux Xen paravirt guest boot failure 305 * This also fixes the Linux Xen paravirt guest boot failure
306 * (because of unexpected read-only mappings for kernel identity 306 * (because of unexpected read-only mappings for kernel identity
307 * mappings). In this paravirt guest case, the kernel text 307 * mappings). In this paravirt guest case, the kernel text
308 * mapping and the kernel identity mapping share the same 308 * mapping and the kernel identity mapping share the same
309 * page-table pages. Thus we can't really use different 309 * page-table pages. Thus we can't really use different
310 * protections for the kernel text and identity mappings. Also, 310 * protections for the kernel text and identity mappings. Also,
311 * these shared mappings are made of small page mappings. 311 * these shared mappings are made of small page mappings.
312 * Thus, not enforcing the !RW mapping for the small page kernel 312 * Thus, not enforcing the !RW mapping for the small page kernel
313 * text mapping helps the Linux Xen paravirt guest boot 313 * text mapping helps the Linux Xen paravirt guest boot
314 * as well. 314 * as well.
315 */ 315 */
316 if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) 316 if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
317 pgprot_val(forbidden) |= _PAGE_RW; 317 pgprot_val(forbidden) |= _PAGE_RW;
318 } 318 }
319 #endif 319 #endif
320 320
321 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 321 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
322 322
323 return prot; 323 return prot;
324 } 324 }
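/*
 * Hypothetical sketch (not part of this diff): the effect of
 * static_protections() on a .rodata page could be checked like this,
 * with req and fixed as illustrative names:
 *
 *	pgprot_t req = __pgprot(_PAGE_PRESENT | _PAGE_RW);
 *	unsigned long pfn = __pa_symbol(__start_rodata) >> PAGE_SHIFT;
 *	pgprot_t fixed = static_protections(req,
 *			(unsigned long)__start_rodata, pfn);
 *
 * pgprot_val(fixed) no longer contains _PAGE_RW, keeping .rodata read-only.
 */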
325 325
326 /* 326 /*
327 * Lookup the page table entry for a virtual address in a specific pgd. 327 * Lookup the page table entry for a virtual address in a specific pgd.
328 * Return a pointer to the entry and the level of the mapping. 328 * Return a pointer to the entry and the level of the mapping.
329 */ 329 */
330 pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, 330 pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
331 unsigned int *level) 331 unsigned int *level)
332 { 332 {
333 pud_t *pud; 333 pud_t *pud;
334 pmd_t *pmd; 334 pmd_t *pmd;
335 335
336 *level = PG_LEVEL_NONE; 336 *level = PG_LEVEL_NONE;
337 337
338 if (pgd_none(*pgd)) 338 if (pgd_none(*pgd))
339 return NULL; 339 return NULL;
340 340
341 pud = pud_offset(pgd, address); 341 pud = pud_offset(pgd, address);
342 if (pud_none(*pud)) 342 if (pud_none(*pud))
343 return NULL; 343 return NULL;
344 344
345 *level = PG_LEVEL_1G; 345 *level = PG_LEVEL_1G;
346 if (pud_large(*pud) || !pud_present(*pud)) 346 if (pud_large(*pud) || !pud_present(*pud))
347 return (pte_t *)pud; 347 return (pte_t *)pud;
348 348
349 pmd = pmd_offset(pud, address); 349 pmd = pmd_offset(pud, address);
350 if (pmd_none(*pmd)) 350 if (pmd_none(*pmd))
351 return NULL; 351 return NULL;
352 352
353 *level = PG_LEVEL_2M; 353 *level = PG_LEVEL_2M;
354 if (pmd_large(*pmd) || !pmd_present(*pmd)) 354 if (pmd_large(*pmd) || !pmd_present(*pmd))
355 return (pte_t *)pmd; 355 return (pte_t *)pmd;
356 356
357 *level = PG_LEVEL_4K; 357 *level = PG_LEVEL_4K;
358 358
359 return pte_offset_kernel(pmd, address); 359 return pte_offset_kernel(pmd, address);
360 } 360 }
361 361
362 /* 362 /*
363 * Lookup the page table entry for a virtual address. Return a pointer 363 * Lookup the page table entry for a virtual address. Return a pointer
364 * to the entry and the level of the mapping. 364 * to the entry and the level of the mapping.
365 * 365 *
366 * Note: We return pud and pmd either when the entry is marked large 366 * Note: We return pud and pmd either when the entry is marked large
367 * or when the present bit is not set. Otherwise we would return a 367 * or when the present bit is not set. Otherwise we would return a
368 * pointer to a nonexisting mapping. 368 * pointer to a nonexisting mapping.
369 */ 369 */
370 pte_t *lookup_address(unsigned long address, unsigned int *level) 370 pte_t *lookup_address(unsigned long address, unsigned int *level)
371 { 371 {
372 return lookup_address_in_pgd(pgd_offset_k(address), address, level); 372 return lookup_address_in_pgd(pgd_offset_k(address), address, level);
373 } 373 }
374 EXPORT_SYMBOL_GPL(lookup_address); 374 EXPORT_SYMBOL_GPL(lookup_address);
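/*
 * Hypothetical usage sketch (not part of this diff): reporting the mapping
 * level of a kernel virtual address; the helper name is illustrative.
 */
static void __maybe_unused report_mapping_level(unsigned long vaddr)
{
	unsigned int level;
	pte_t *pte = lookup_address(vaddr, &level);

	if (!pte || !(pte_val(*pte) & _PAGE_PRESENT))
		pr_info("%#lx is not mapped\n", vaddr);
	else
		pr_info("%#lx mapped at level %u, pfn %#lx\n",
			vaddr, level, pte_pfn(*pte));
}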
375 375
376 static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, 376 static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
377 unsigned int *level) 377 unsigned int *level)
378 { 378 {
379 if (cpa->pgd) 379 if (cpa->pgd)
380 return lookup_address_in_pgd(cpa->pgd + pgd_index(address), 380 return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
381 address, level); 381 address, level);
382 382
383 return lookup_address(address, level); 383 return lookup_address(address, level);
384 } 384 }
385 385
386 /* 386 /*
387 * Lookup the PMD entry for a virtual address. Return a pointer to the entry
388 * or NULL if not present.
389 */
390 pmd_t *lookup_pmd_address(unsigned long address)
391 {
392 pgd_t *pgd;
393 pud_t *pud;
394
395 pgd = pgd_offset_k(address);
396 if (pgd_none(*pgd))
397 return NULL;
398
399 pud = pud_offset(pgd, address);
400 if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
401 return NULL;
402
403 return pmd_offset(pud, address);
404 }
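/*
 * Hypothetical usage sketch (not part of this diff): the new helper lets a
 * caller check whether an address already has a PMD entry before touching
 * lower levels; the wrapper name is illustrative.
 */
static bool __maybe_unused vaddr_has_pmd_entry(unsigned long vaddr)
{
	pmd_t *pmd = lookup_pmd_address(vaddr);

	return pmd && !pmd_none(*pmd);
}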
405
406 /*
387 * This is necessary because __pa() does not work on some 407 * This is necessary because __pa() does not work on some
388 * kinds of memory, like vmalloc() or the alloc_remap() 408 * kinds of memory, like vmalloc() or the alloc_remap()
389 * areas on 32-bit NUMA systems. The percpu areas can 409 * areas on 32-bit NUMA systems. The percpu areas can
390 * end up in this kind of memory, for instance. 410 * end up in this kind of memory, for instance.
391 * 411 *
392 * This could be optimized, but it is only intended to be 412 * This could be optimized, but it is only intended to be
393 * used at initialization time, and keeping it 413 * used at initialization time, and keeping it
394 * unoptimized should increase the testing coverage for 414 * unoptimized should increase the testing coverage for
395 * the more obscure platforms. 415 * the more obscure platforms.
396 */ 416 */
397 phys_addr_t slow_virt_to_phys(void *__virt_addr) 417 phys_addr_t slow_virt_to_phys(void *__virt_addr)
398 { 418 {
399 unsigned long virt_addr = (unsigned long)__virt_addr; 419 unsigned long virt_addr = (unsigned long)__virt_addr;
400 phys_addr_t phys_addr; 420 phys_addr_t phys_addr;
401 unsigned long offset; 421 unsigned long offset;
402 enum pg_level level; 422 enum pg_level level;
403 unsigned long psize; 423 unsigned long psize;
404 unsigned long pmask; 424 unsigned long pmask;
405 pte_t *pte; 425 pte_t *pte;
406 426
407 pte = lookup_address(virt_addr, &level); 427 pte = lookup_address(virt_addr, &level);
408 BUG_ON(!pte); 428 BUG_ON(!pte);
409 psize = page_level_size(level); 429 psize = page_level_size(level);
410 pmask = page_level_mask(level); 430 pmask = page_level_mask(level);
411 offset = virt_addr & ~pmask; 431 offset = virt_addr & ~pmask;
412 phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; 432 phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
413 return (phys_addr | offset); 433 return (phys_addr | offset);
414 } 434 }
415 EXPORT_SYMBOL_GPL(slow_virt_to_phys); 435 EXPORT_SYMBOL_GPL(slow_virt_to_phys);
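/*
 * Hypothetical usage sketch (not part of this diff): for memory outside the
 * direct map, e.g. a vmalloc() buffer, __pa(buf) must not be used; instead
 *
 *	phys_addr_t pa = slow_virt_to_phys(buf);
 *
 * returns the physical address of the backing page plus the offset within
 * it (buf is an illustrative name).
 */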
416 436
417 /* 437 /*
418 * Set the new pmd in all the pgds we know about: 438 * Set the new pmd in all the pgds we know about:
419 */ 439 */
420 static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) 440 static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
421 { 441 {
422 /* change init_mm */ 442 /* change init_mm */
423 set_pte_atomic(kpte, pte); 443 set_pte_atomic(kpte, pte);
424 #ifdef CONFIG_X86_32 444 #ifdef CONFIG_X86_32
425 if (!SHARED_KERNEL_PMD) { 445 if (!SHARED_KERNEL_PMD) {
426 struct page *page; 446 struct page *page;
427 447
428 list_for_each_entry(page, &pgd_list, lru) { 448 list_for_each_entry(page, &pgd_list, lru) {
429 pgd_t *pgd; 449 pgd_t *pgd;
430 pud_t *pud; 450 pud_t *pud;
431 pmd_t *pmd; 451 pmd_t *pmd;
432 452
433 pgd = (pgd_t *)page_address(page) + pgd_index(address); 453 pgd = (pgd_t *)page_address(page) + pgd_index(address);
434 pud = pud_offset(pgd, address); 454 pud = pud_offset(pgd, address);
435 pmd = pmd_offset(pud, address); 455 pmd = pmd_offset(pud, address);
436 set_pte_atomic((pte_t *)pmd, pte); 456 set_pte_atomic((pte_t *)pmd, pte);
437 } 457 }
438 } 458 }
439 #endif 459 #endif
440 } 460 }
441 461
442 static int 462 static int
443 try_preserve_large_page(pte_t *kpte, unsigned long address, 463 try_preserve_large_page(pte_t *kpte, unsigned long address,
444 struct cpa_data *cpa) 464 struct cpa_data *cpa)
445 { 465 {
446 unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; 466 unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
447 pte_t new_pte, old_pte, *tmp; 467 pte_t new_pte, old_pte, *tmp;
448 pgprot_t old_prot, new_prot, req_prot; 468 pgprot_t old_prot, new_prot, req_prot;
449 int i, do_split = 1; 469 int i, do_split = 1;
450 enum pg_level level; 470 enum pg_level level;
451 471
452 if (cpa->force_split) 472 if (cpa->force_split)
453 return 1; 473 return 1;
454 474
455 spin_lock(&pgd_lock); 475 spin_lock(&pgd_lock);
456 /* 476 /*
457 * Check for races, another CPU might have split this page 477 * Check for races, another CPU might have split this page
458 * up already: 478 * up already:
459 */ 479 */
460 tmp = _lookup_address_cpa(cpa, address, &level); 480 tmp = _lookup_address_cpa(cpa, address, &level);
461 if (tmp != kpte) 481 if (tmp != kpte)
462 goto out_unlock; 482 goto out_unlock;
463 483
464 switch (level) { 484 switch (level) {
465 case PG_LEVEL_2M: 485 case PG_LEVEL_2M:
466 #ifdef CONFIG_X86_64 486 #ifdef CONFIG_X86_64
467 case PG_LEVEL_1G: 487 case PG_LEVEL_1G:
468 #endif 488 #endif
469 psize = page_level_size(level); 489 psize = page_level_size(level);
470 pmask = page_level_mask(level); 490 pmask = page_level_mask(level);
471 break; 491 break;
472 default: 492 default:
473 do_split = -EINVAL; 493 do_split = -EINVAL;
474 goto out_unlock; 494 goto out_unlock;
475 } 495 }
476 496
477 /* 497 /*
478 * Calculate the number of pages, which fit into this large 498 * Calculate the number of pages, which fit into this large
479 * page starting at address: 499 * page starting at address:
480 */ 500 */
481 nextpage_addr = (address + psize) & pmask; 501 nextpage_addr = (address + psize) & pmask;
482 numpages = (nextpage_addr - address) >> PAGE_SHIFT; 502 numpages = (nextpage_addr - address) >> PAGE_SHIFT;
483 if (numpages < cpa->numpages) 503 if (numpages < cpa->numpages)
484 cpa->numpages = numpages; 504 cpa->numpages = numpages;
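	/*
	 * Worked example with hypothetical numbers: for a 2M page
	 * (psize = 0x200000, pmask = ~0x1fffffUL) and
	 * address = 0xffff888001234000, nextpage_addr is
	 * 0xffff888001400000, so numpages = 0x1cc 4k pages remain
	 * inside this large page.
	 */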
485 505
486 /* 506 /*
487 * We are safe now. Check whether the new pgprot is the same: 507 * We are safe now. Check whether the new pgprot is the same:
488 * Convert protection attributes to 4k-format, as cpa->mask* are set 508 * Convert protection attributes to 4k-format, as cpa->mask* are set
489 * up accordingly. 509 * up accordingly.
490 */ 510 */
491 old_pte = *kpte; 511 old_pte = *kpte;
492 old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte)); 512 old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte));
493 513
494 pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); 514 pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
495 pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); 515 pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
496 516
497 /* 517 /*
498 * req_prot is in the format of 4k pages. It must be converted to large 518 * req_prot is in the format of 4k pages. It must be converted to large
499 * page format: the caching mode includes the PAT bit located at 519 * page format: the caching mode includes the PAT bit located at
500 * different bit positions in the two formats. 520 * different bit positions in the two formats.
501 */ 521 */
502 req_prot = pgprot_4k_2_large(req_prot); 522 req_prot = pgprot_4k_2_large(req_prot);
503 523
504 /* 524 /*
505 * Set the PSE and GLOBAL flags only if the PRESENT flag is 525 * Set the PSE and GLOBAL flags only if the PRESENT flag is
506 * set otherwise pmd_present/pmd_huge will return true even on 526 * set otherwise pmd_present/pmd_huge will return true even on
507 * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL 527 * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL
508 * for the ancient hardware that doesn't support it. 528 * for the ancient hardware that doesn't support it.
509 */ 529 */
510 if (pgprot_val(req_prot) & _PAGE_PRESENT) 530 if (pgprot_val(req_prot) & _PAGE_PRESENT)
511 pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL; 531 pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL;
512 else 532 else
513 pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); 533 pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL);
514 534
515 req_prot = canon_pgprot(req_prot); 535 req_prot = canon_pgprot(req_prot);
516 536
517 /* 537 /*
518 * old_pte points to the large page base address. So we need 538 * old_pte points to the large page base address. So we need
519 * to add the offset of the virtual address: 539 * to add the offset of the virtual address:
520 */ 540 */
521 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); 541 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
522 cpa->pfn = pfn; 542 cpa->pfn = pfn;
523 543
524 new_prot = static_protections(req_prot, address, pfn); 544 new_prot = static_protections(req_prot, address, pfn);
525 545
526 /* 546 /*
527 * We need to check the full range, whether 547 * We need to check the full range, whether
528 * static_protection() requires a different pgprot for one of 548 * static_protection() requires a different pgprot for one of
529 * the pages in the range we try to preserve: 549 * the pages in the range we try to preserve:
530 */ 550 */
531 addr = address & pmask; 551 addr = address & pmask;
532 pfn = pte_pfn(old_pte); 552 pfn = pte_pfn(old_pte);
533 for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { 553 for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
534 pgprot_t chk_prot = static_protections(req_prot, addr, pfn); 554 pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
535 555
536 if (pgprot_val(chk_prot) != pgprot_val(new_prot)) 556 if (pgprot_val(chk_prot) != pgprot_val(new_prot))
537 goto out_unlock; 557 goto out_unlock;
538 } 558 }
539 559
540 /* 560 /*
541 * If there are no changes, return. cpa->numpages has been updated 561 * If there are no changes, return. cpa->numpages has been updated
542 * above: 562 * above:
543 */ 563 */
544 if (pgprot_val(new_prot) == pgprot_val(old_prot)) { 564 if (pgprot_val(new_prot) == pgprot_val(old_prot)) {
545 do_split = 0; 565 do_split = 0;
546 goto out_unlock; 566 goto out_unlock;
547 } 567 }
548 568
549 /* 569 /*
550 * We need to change the attributes. Check, whether we can 570 * We need to change the attributes. Check, whether we can
551 * change the large page in one go. We request a split, when 571 * change the large page in one go. We request a split, when
552 * the address is not aligned and the number of pages is 572 * the address is not aligned and the number of pages is
553 * smaller than the number of pages in the large page. Note 573 * smaller than the number of pages in the large page. Note
554 * that we limited the number of possible pages already to 574 * that we limited the number of possible pages already to
555 * the number of pages in the large page. 575 * the number of pages in the large page.
556 */ 576 */
557 if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { 577 if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
558 /* 578 /*
559 * The address is aligned and the number of pages 579 * The address is aligned and the number of pages
560 * covers the full page. 580 * covers the full page.
561 */ 581 */
562 new_pte = pfn_pte(pte_pfn(old_pte), new_prot); 582 new_pte = pfn_pte(pte_pfn(old_pte), new_prot);
563 __set_pmd_pte(kpte, address, new_pte); 583 __set_pmd_pte(kpte, address, new_pte);
564 cpa->flags |= CPA_FLUSHTLB; 584 cpa->flags |= CPA_FLUSHTLB;
565 do_split = 0; 585 do_split = 0;
566 } 586 }
567 587
568 out_unlock: 588 out_unlock:
569 spin_unlock(&pgd_lock); 589 spin_unlock(&pgd_lock);
570 590
571 return do_split; 591 return do_split;
572 } 592 }
573 593
574 static int 594 static int
575 __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, 595 __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
576 struct page *base) 596 struct page *base)
577 { 597 {
578 pte_t *pbase = (pte_t *)page_address(base); 598 pte_t *pbase = (pte_t *)page_address(base);
579 unsigned long pfn, pfninc = 1; 599 unsigned long pfn, pfninc = 1;
580 unsigned int i, level; 600 unsigned int i, level;
581 pte_t *tmp; 601 pte_t *tmp;
582 pgprot_t ref_prot; 602 pgprot_t ref_prot;
583 603
584 spin_lock(&pgd_lock); 604 spin_lock(&pgd_lock);
585 /* 605 /*
586 * Check for races, another CPU might have split this page 606 * Check for races, another CPU might have split this page
587 * up for us already: 607 * up for us already:
588 */ 608 */
589 tmp = _lookup_address_cpa(cpa, address, &level); 609 tmp = _lookup_address_cpa(cpa, address, &level);
590 if (tmp != kpte) { 610 if (tmp != kpte) {
591 spin_unlock(&pgd_lock); 611 spin_unlock(&pgd_lock);
592 return 1; 612 return 1;
593 } 613 }
594 614
595 paravirt_alloc_pte(&init_mm, page_to_pfn(base)); 615 paravirt_alloc_pte(&init_mm, page_to_pfn(base));
596 ref_prot = pte_pgprot(pte_clrhuge(*kpte)); 616 ref_prot = pte_pgprot(pte_clrhuge(*kpte));
597 617
598 /* promote PAT bit to correct position */ 618 /* promote PAT bit to correct position */
599 if (level == PG_LEVEL_2M) 619 if (level == PG_LEVEL_2M)
600 ref_prot = pgprot_large_2_4k(ref_prot); 620 ref_prot = pgprot_large_2_4k(ref_prot);
601 621
602 #ifdef CONFIG_X86_64 622 #ifdef CONFIG_X86_64
603 if (level == PG_LEVEL_1G) { 623 if (level == PG_LEVEL_1G) {
604 pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; 624 pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
605 /* 625 /*
606 * Set the PSE flags only if the PRESENT flag is set 626 * Set the PSE flags only if the PRESENT flag is set
607 * otherwise pmd_present/pmd_huge will return true 627 * otherwise pmd_present/pmd_huge will return true
608 * even on a non present pmd. 628 * even on a non present pmd.
609 */ 629 */
610 if (pgprot_val(ref_prot) & _PAGE_PRESENT) 630 if (pgprot_val(ref_prot) & _PAGE_PRESENT)
611 pgprot_val(ref_prot) |= _PAGE_PSE; 631 pgprot_val(ref_prot) |= _PAGE_PSE;
612 else 632 else
613 pgprot_val(ref_prot) &= ~_PAGE_PSE; 633 pgprot_val(ref_prot) &= ~_PAGE_PSE;
614 } 634 }
615 #endif 635 #endif
616 636
617 /* 637 /*
618 * Set the GLOBAL flags only if the PRESENT flag is set 638 * Set the GLOBAL flags only if the PRESENT flag is set
619 * otherwise pmd/pte_present will return true even on a non 639 * otherwise pmd/pte_present will return true even on a non
620 * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL 640 * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL
621 * for the ancient hardware that doesn't support it. 641 * for the ancient hardware that doesn't support it.
622 */ 642 */
623 if (pgprot_val(ref_prot) & _PAGE_PRESENT) 643 if (pgprot_val(ref_prot) & _PAGE_PRESENT)
624 pgprot_val(ref_prot) |= _PAGE_GLOBAL; 644 pgprot_val(ref_prot) |= _PAGE_GLOBAL;
625 else 645 else
626 pgprot_val(ref_prot) &= ~_PAGE_GLOBAL; 646 pgprot_val(ref_prot) &= ~_PAGE_GLOBAL;
627 647
628 /* 648 /*
629 * Get the target pfn from the original entry: 649 * Get the target pfn from the original entry:
630 */ 650 */
631 pfn = pte_pfn(*kpte); 651 pfn = pte_pfn(*kpte);
632 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 652 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
633 set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); 653 set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot)));
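	/*
	 * Worked example: splitting a 2M page produces PTRS_PER_PTE == 512
	 * 4k PTEs with pfninc == 1; splitting a 1G page produces 512 2M
	 * entries with pfninc == PMD_PAGE_SIZE >> PAGE_SHIFT == 512, so in
	 * both cases the children cover exactly the original range.
	 */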
634 654
635 if (pfn_range_is_mapped(PFN_DOWN(__pa(address)), 655 if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
636 PFN_DOWN(__pa(address)) + 1)) 656 PFN_DOWN(__pa(address)) + 1))
637 split_page_count(level); 657 split_page_count(level);
638 658
639 /* 659 /*
640 * Install the new, split up pagetable. 660 * Install the new, split up pagetable.
641 * 661 *
642 * We use the standard kernel pagetable protections for the new 662 * We use the standard kernel pagetable protections for the new
643 * pagetable protections, the actual ptes set above control the 663 * pagetable protections, the actual ptes set above control the
644 * primary protection behavior: 664 * primary protection behavior:
645 */ 665 */
646 __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); 666 __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
647 667
648 /* 668 /*
649 * Intel Atom errata AAH41 workaround. 669 * Intel Atom errata AAH41 workaround.
650 * 670 *
651 * The real fix should be in hw or in a microcode update, but 671 * The real fix should be in hw or in a microcode update, but
652 * we also probabilistically try to reduce the window of having 672 * we also probabilistically try to reduce the window of having
653 * a large TLB mixed with 4K TLBs while instruction fetches are 673 * a large TLB mixed with 4K TLBs while instruction fetches are
654 * going on. 674 * going on.
655 */ 675 */
656 __flush_tlb_all(); 676 __flush_tlb_all();
657 spin_unlock(&pgd_lock); 677 spin_unlock(&pgd_lock);
658 678
659 return 0; 679 return 0;
660 } 680 }
661 681
662 static int split_large_page(struct cpa_data *cpa, pte_t *kpte, 682 static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
663 unsigned long address) 683 unsigned long address)
664 { 684 {
665 struct page *base; 685 struct page *base;
666 686
667 if (!debug_pagealloc) 687 if (!debug_pagealloc)
668 spin_unlock(&cpa_lock); 688 spin_unlock(&cpa_lock);
669 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); 689 base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
670 if (!debug_pagealloc) 690 if (!debug_pagealloc)
671 spin_lock(&cpa_lock); 691 spin_lock(&cpa_lock);
672 if (!base) 692 if (!base)
673 return -ENOMEM; 693 return -ENOMEM;
674 694
675 if (__split_large_page(cpa, kpte, address, base)) 695 if (__split_large_page(cpa, kpte, address, base))
676 __free_page(base); 696 __free_page(base);
677 697
678 return 0; 698 return 0;
679 } 699 }
680 700
681 static bool try_to_free_pte_page(pte_t *pte) 701 static bool try_to_free_pte_page(pte_t *pte)
682 { 702 {
683 int i; 703 int i;
684 704
685 for (i = 0; i < PTRS_PER_PTE; i++) 705 for (i = 0; i < PTRS_PER_PTE; i++)
686 if (!pte_none(pte[i])) 706 if (!pte_none(pte[i]))
687 return false; 707 return false;
688 708
689 free_page((unsigned long)pte); 709 free_page((unsigned long)pte);
690 return true; 710 return true;
691 } 711 }
692 712
693 static bool try_to_free_pmd_page(pmd_t *pmd) 713 static bool try_to_free_pmd_page(pmd_t *pmd)
694 { 714 {
695 int i; 715 int i;
696 716
697 for (i = 0; i < PTRS_PER_PMD; i++) 717 for (i = 0; i < PTRS_PER_PMD; i++)
698 if (!pmd_none(pmd[i])) 718 if (!pmd_none(pmd[i]))
699 return false; 719 return false;
700 720
701 free_page((unsigned long)pmd); 721 free_page((unsigned long)pmd);
702 return true; 722 return true;
703 } 723 }
704 724
705 static bool try_to_free_pud_page(pud_t *pud) 725 static bool try_to_free_pud_page(pud_t *pud)
706 { 726 {
707 int i; 727 int i;
708 728
709 for (i = 0; i < PTRS_PER_PUD; i++) 729 for (i = 0; i < PTRS_PER_PUD; i++)
710 if (!pud_none(pud[i])) 730 if (!pud_none(pud[i]))
711 return false; 731 return false;
712 732
713 free_page((unsigned long)pud); 733 free_page((unsigned long)pud);
714 return true; 734 return true;
715 } 735 }
716 736
717 static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) 737 static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
718 { 738 {
719 pte_t *pte = pte_offset_kernel(pmd, start); 739 pte_t *pte = pte_offset_kernel(pmd, start);
720 740
721 while (start < end) { 741 while (start < end) {
722 set_pte(pte, __pte(0)); 742 set_pte(pte, __pte(0));
723 743
724 start += PAGE_SIZE; 744 start += PAGE_SIZE;
725 pte++; 745 pte++;
726 } 746 }
727 747
728 if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { 748 if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
729 pmd_clear(pmd); 749 pmd_clear(pmd);
730 return true; 750 return true;
731 } 751 }
732 return false; 752 return false;
733 } 753 }
734 754
735 static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, 755 static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
736 unsigned long start, unsigned long end) 756 unsigned long start, unsigned long end)
737 { 757 {
738 if (unmap_pte_range(pmd, start, end)) 758 if (unmap_pte_range(pmd, start, end))
739 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) 759 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
740 pud_clear(pud); 760 pud_clear(pud);
741 } 761 }
742 762
743 static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) 763 static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
744 { 764 {
745 pmd_t *pmd = pmd_offset(pud, start); 765 pmd_t *pmd = pmd_offset(pud, start);
746 766
747 /* 767 /*
748 * Not on a 2MB page boundary? 768 * Not on a 2MB page boundary?
749 */ 769 */
750 if (start & (PMD_SIZE - 1)) { 770 if (start & (PMD_SIZE - 1)) {
751 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; 771 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
752 unsigned long pre_end = min_t(unsigned long, end, next_page); 772 unsigned long pre_end = min_t(unsigned long, end, next_page);
753 773
754 __unmap_pmd_range(pud, pmd, start, pre_end); 774 __unmap_pmd_range(pud, pmd, start, pre_end);
755 775
756 start = pre_end; 776 start = pre_end;
757 pmd++; 777 pmd++;
758 } 778 }
759 779
760 /* 780 /*
761 * Try to unmap in 2M chunks. 781 * Try to unmap in 2M chunks.
762 */ 782 */
763 while (end - start >= PMD_SIZE) { 783 while (end - start >= PMD_SIZE) {
764 if (pmd_large(*pmd)) 784 if (pmd_large(*pmd))
765 pmd_clear(pmd); 785 pmd_clear(pmd);
766 else 786 else
767 __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); 787 __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
768 788
769 start += PMD_SIZE; 789 start += PMD_SIZE;
770 pmd++; 790 pmd++;
771 } 791 }
772 792
773 /* 793 /*
774 * 4K leftovers? 794 * 4K leftovers?
775 */ 795 */
776 if (start < end) 796 if (start < end)
777 return __unmap_pmd_range(pud, pmd, start, end); 797 return __unmap_pmd_range(pud, pmd, start, end);
778 798
779 /* 799 /*
780 * Try again to free the PMD page if we haven't succeeded above. 800 * Try again to free the PMD page if we haven't succeeded above.
781 */ 801 */
782 if (!pud_none(*pud)) 802 if (!pud_none(*pud))
783 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) 803 if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
784 pud_clear(pud); 804 pud_clear(pud);
785 } 805 }
786 806
787 static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) 807 static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
788 { 808 {
789 pud_t *pud = pud_offset(pgd, start); 809 pud_t *pud = pud_offset(pgd, start);
790 810
791 /* 811 /*
792 * Not on a GB page boundary? 812 * Not on a GB page boundary?
793 */ 813 */
794 if (start & (PUD_SIZE - 1)) { 814 if (start & (PUD_SIZE - 1)) {
795 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; 815 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
796 unsigned long pre_end = min_t(unsigned long, end, next_page); 816 unsigned long pre_end = min_t(unsigned long, end, next_page);
797 817
798 unmap_pmd_range(pud, start, pre_end); 818 unmap_pmd_range(pud, start, pre_end);
799 819
800 start = pre_end; 820 start = pre_end;
801 pud++; 821 pud++;
802 } 822 }
803 823
804 /* 824 /*
805 * Try to unmap in 1G chunks. 825 * Try to unmap in 1G chunks.
806 */ 826 */
807 while (end - start >= PUD_SIZE) { 827 while (end - start >= PUD_SIZE) {
808 828
809 if (pud_large(*pud)) 829 if (pud_large(*pud))
810 pud_clear(pud); 830 pud_clear(pud);
811 else 831 else
812 unmap_pmd_range(pud, start, start + PUD_SIZE); 832 unmap_pmd_range(pud, start, start + PUD_SIZE);
813 833
814 start += PUD_SIZE; 834 start += PUD_SIZE;
815 pud++; 835 pud++;
816 } 836 }
817 837
818 /* 838 /*
819 * 2M leftovers? 839 * 2M leftovers?
820 */ 840 */
821 if (start < end) 841 if (start < end)
822 unmap_pmd_range(pud, start, end); 842 unmap_pmd_range(pud, start, end);
823 843
824 /* 844 /*
825 * No need to try to free the PUD page because we'll free it in 845 * No need to try to free the PUD page because we'll free it in
826 * populate_pgd's error path 846 * populate_pgd's error path
827 */ 847 */
828 } 848 }
829 849
830 static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end) 850 static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
831 { 851 {
832 pgd_t *pgd_entry = root + pgd_index(addr); 852 pgd_t *pgd_entry = root + pgd_index(addr);
833 853
834 unmap_pud_range(pgd_entry, addr, end); 854 unmap_pud_range(pgd_entry, addr, end);
835 855
836 if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry))) 856 if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry)))
837 pgd_clear(pgd_entry); 857 pgd_clear(pgd_entry);
838 } 858 }
839 859
840 static int alloc_pte_page(pmd_t *pmd) 860 static int alloc_pte_page(pmd_t *pmd)
841 { 861 {
842 pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 862 pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
843 if (!pte) 863 if (!pte)
844 return -1; 864 return -1;
845 865
846 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); 866 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
847 return 0; 867 return 0;
848 } 868 }
849 869
850 static int alloc_pmd_page(pud_t *pud) 870 static int alloc_pmd_page(pud_t *pud)
851 { 871 {
852 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 872 pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
853 if (!pmd) 873 if (!pmd)
854 return -1; 874 return -1;
855 875
856 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); 876 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
857 return 0; 877 return 0;
858 } 878 }
859 879
860 static void populate_pte(struct cpa_data *cpa, 880 static void populate_pte(struct cpa_data *cpa,
861 unsigned long start, unsigned long end, 881 unsigned long start, unsigned long end,
862 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) 882 unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
863 { 883 {
864 pte_t *pte; 884 pte_t *pte;
865 885
866 pte = pte_offset_kernel(pmd, start); 886 pte = pte_offset_kernel(pmd, start);
867 887
868 while (num_pages-- && start < end) { 888 while (num_pages-- && start < end) {
869 889
870 /* deal with the NX bit */ 890 /* deal with the NX bit */
871 if (!(pgprot_val(pgprot) & _PAGE_NX)) 891 if (!(pgprot_val(pgprot) & _PAGE_NX))
872 cpa->pfn &= ~_PAGE_NX; 892 cpa->pfn &= ~_PAGE_NX;
873 893
874 set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot)); 894 set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
875 895
876 start += PAGE_SIZE; 896 start += PAGE_SIZE;
877 cpa->pfn += PAGE_SIZE; 897 cpa->pfn += PAGE_SIZE;
878 pte++; 898 pte++;
879 } 899 }
880 } 900 }
881 901
882 static int populate_pmd(struct cpa_data *cpa, 902 static int populate_pmd(struct cpa_data *cpa,
883 unsigned long start, unsigned long end, 903 unsigned long start, unsigned long end,
884 unsigned num_pages, pud_t *pud, pgprot_t pgprot) 904 unsigned num_pages, pud_t *pud, pgprot_t pgprot)
885 { 905 {
886 unsigned int cur_pages = 0; 906 unsigned int cur_pages = 0;
887 pmd_t *pmd; 907 pmd_t *pmd;
888 pgprot_t pmd_pgprot; 908 pgprot_t pmd_pgprot;
889 909
890 /* 910 /*
891 * Not on a 2M boundary? 911 * Not on a 2M boundary?
892 */ 912 */
893 if (start & (PMD_SIZE - 1)) { 913 if (start & (PMD_SIZE - 1)) {
894 unsigned long pre_end = start + (num_pages << PAGE_SHIFT); 914 unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
895 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; 915 unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
896 916
897 pre_end = min_t(unsigned long, pre_end, next_page); 917 pre_end = min_t(unsigned long, pre_end, next_page);
898 cur_pages = (pre_end - start) >> PAGE_SHIFT; 918 cur_pages = (pre_end - start) >> PAGE_SHIFT;
899 cur_pages = min_t(unsigned int, num_pages, cur_pages); 919 cur_pages = min_t(unsigned int, num_pages, cur_pages);
900 920
901 /* 921 /*
902 * Need a PTE page? 922 * Need a PTE page?
903 */ 923 */
904 pmd = pmd_offset(pud, start); 924 pmd = pmd_offset(pud, start);
905 if (pmd_none(*pmd)) 925 if (pmd_none(*pmd))
906 if (alloc_pte_page(pmd)) 926 if (alloc_pte_page(pmd))
907 return -1; 927 return -1;
908 928
909 populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); 929 populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
910 930
911 start = pre_end; 931 start = pre_end;
912 } 932 }
913 933
914 /* 934 /*
915 * We mapped them all? 935 * We mapped them all?
916 */ 936 */
917 if (num_pages == cur_pages) 937 if (num_pages == cur_pages)
918 return cur_pages; 938 return cur_pages;
919 939
920 pmd_pgprot = pgprot_4k_2_large(pgprot); 940 pmd_pgprot = pgprot_4k_2_large(pgprot);
921 941
922 while (end - start >= PMD_SIZE) { 942 while (end - start >= PMD_SIZE) {
923 943
924 /* 944 /*
925 * We cannot use a 1G page so allocate a PMD page if needed. 945 * We cannot use a 1G page so allocate a PMD page if needed.
926 */ 946 */
927 if (pud_none(*pud)) 947 if (pud_none(*pud))
928 if (alloc_pmd_page(pud)) 948 if (alloc_pmd_page(pud))
929 return -1; 949 return -1;
930 950
931 pmd = pmd_offset(pud, start); 951 pmd = pmd_offset(pud, start);
932 952
933 set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | 953 set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE |
934 massage_pgprot(pmd_pgprot))); 954 massage_pgprot(pmd_pgprot)));
935 955
936 start += PMD_SIZE; 956 start += PMD_SIZE;
937 cpa->pfn += PMD_SIZE; 957 cpa->pfn += PMD_SIZE;
938 cur_pages += PMD_SIZE >> PAGE_SHIFT; 958 cur_pages += PMD_SIZE >> PAGE_SHIFT;
939 } 959 }
940 960
941 /* 961 /*
942 * Map trailing 4K pages. 962 * Map trailing 4K pages.
943 */ 963 */
944 if (start < end) { 964 if (start < end) {
945 pmd = pmd_offset(pud, start); 965 pmd = pmd_offset(pud, start);
946 if (pmd_none(*pmd)) 966 if (pmd_none(*pmd))
947 if (alloc_pte_page(pmd)) 967 if (alloc_pte_page(pmd))
948 return -1; 968 return -1;
949 969
950 populate_pte(cpa, start, end, num_pages - cur_pages, 970 populate_pte(cpa, start, end, num_pages - cur_pages,
951 pmd, pgprot); 971 pmd, pgprot);
952 } 972 }
953 return num_pages; 973 return num_pages;
954 } 974 }
955 975
956 static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, 976 static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
957 pgprot_t pgprot) 977 pgprot_t pgprot)
958 { 978 {
959 pud_t *pud; 979 pud_t *pud;
960 unsigned long end; 980 unsigned long end;
961 int cur_pages = 0; 981 int cur_pages = 0;
962 pgprot_t pud_pgprot; 982 pgprot_t pud_pgprot;
963 983
964 end = start + (cpa->numpages << PAGE_SHIFT); 984 end = start + (cpa->numpages << PAGE_SHIFT);
965 985
966 /* 986 /*
967 * Not on a Gb page boundary? => map everything up to it with 987 * Not on a Gb page boundary? => map everything up to it with
968 * smaller pages. 988 * smaller pages.
969 */ 989 */
970 if (start & (PUD_SIZE - 1)) { 990 if (start & (PUD_SIZE - 1)) {
971 unsigned long pre_end; 991 unsigned long pre_end;
972 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; 992 unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
973 993
974 pre_end = min_t(unsigned long, end, next_page); 994 pre_end = min_t(unsigned long, end, next_page);
975 cur_pages = (pre_end - start) >> PAGE_SHIFT; 995 cur_pages = (pre_end - start) >> PAGE_SHIFT;
976 cur_pages = min_t(int, (int)cpa->numpages, cur_pages); 996 cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
977 997
978 pud = pud_offset(pgd, start); 998 pud = pud_offset(pgd, start);
979 999
980 /* 1000 /*
981 * Need a PMD page? 1001 * Need a PMD page?
982 */ 1002 */
983 if (pud_none(*pud)) 1003 if (pud_none(*pud))
984 if (alloc_pmd_page(pud)) 1004 if (alloc_pmd_page(pud))
985 return -1; 1005 return -1;
986 1006
987 cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, 1007 cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
988 pud, pgprot); 1008 pud, pgprot);
989 if (cur_pages < 0) 1009 if (cur_pages < 0)
990 return cur_pages; 1010 return cur_pages;
991 1011
992 start = pre_end; 1012 start = pre_end;
993 } 1013 }
994 1014
995 /* We mapped them all? */ 1015 /* We mapped them all? */
996 if (cpa->numpages == cur_pages) 1016 if (cpa->numpages == cur_pages)
997 return cur_pages; 1017 return cur_pages;
998 1018
999 pud = pud_offset(pgd, start); 1019 pud = pud_offset(pgd, start);
1000 pud_pgprot = pgprot_4k_2_large(pgprot); 1020 pud_pgprot = pgprot_4k_2_large(pgprot);
1001 1021
1002 /* 1022 /*
1003 * Map everything starting from the Gb boundary, possibly with 1G pages 1023 * Map everything starting from the Gb boundary, possibly with 1G pages
1004 */ 1024 */
1005 while (end - start >= PUD_SIZE) { 1025 while (end - start >= PUD_SIZE) {
1006 set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | 1026 set_pud(pud, __pud(cpa->pfn | _PAGE_PSE |
1007 massage_pgprot(pud_pgprot))); 1027 massage_pgprot(pud_pgprot)));
1008 1028
1009 start += PUD_SIZE; 1029 start += PUD_SIZE;
1010 cpa->pfn += PUD_SIZE; 1030 cpa->pfn += PUD_SIZE;
1011 cur_pages += PUD_SIZE >> PAGE_SHIFT; 1031 cur_pages += PUD_SIZE >> PAGE_SHIFT;
1012 pud++; 1032 pud++;
1013 } 1033 }
1014 1034
1015 /* Map trailing leftover */ 1035 /* Map trailing leftover */
1016 if (start < end) { 1036 if (start < end) {
1017 int tmp; 1037 int tmp;
1018 1038
1019 pud = pud_offset(pgd, start); 1039 pud = pud_offset(pgd, start);
1020 if (pud_none(*pud)) 1040 if (pud_none(*pud))
1021 if (alloc_pmd_page(pud)) 1041 if (alloc_pmd_page(pud))
1022 return -1; 1042 return -1;
1023 1043
1024 tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, 1044 tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
1025 pud, pgprot); 1045 pud, pgprot);
1026 if (tmp < 0) 1046 if (tmp < 0)
1027 return cur_pages; 1047 return cur_pages;
1028 1048
1029 cur_pages += tmp; 1049 cur_pages += tmp;
1030 } 1050 }
1031 return cur_pages; 1051 return cur_pages;
1032 } 1052 }
1033 1053
1034 /* 1054 /*
1035 * Restrictions for kernel page table do not necessarily apply when mapping in 1055 * Restrictions for kernel page table do not necessarily apply when mapping in
1036 * an alternate PGD. 1056 * an alternate PGD.
1037 */ 1057 */
1038 static int populate_pgd(struct cpa_data *cpa, unsigned long addr) 1058 static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
1039 { 1059 {
1040 pgprot_t pgprot = __pgprot(_KERNPG_TABLE); 1060 pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
1041 pud_t *pud = NULL; /* shut up gcc */ 1061 pud_t *pud = NULL; /* shut up gcc */
1042 pgd_t *pgd_entry; 1062 pgd_t *pgd_entry;
1043 int ret; 1063 int ret;
1044 1064
1045 pgd_entry = cpa->pgd + pgd_index(addr); 1065 pgd_entry = cpa->pgd + pgd_index(addr);
1046 1066
1047 /* 1067 /*
1048 * Allocate a PUD page and hand it down for mapping. 1068 * Allocate a PUD page and hand it down for mapping.
1049 */ 1069 */
1050 if (pgd_none(*pgd_entry)) { 1070 if (pgd_none(*pgd_entry)) {
1051 pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); 1071 pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
1052 if (!pud) 1072 if (!pud)
1053 return -1; 1073 return -1;
1054 1074
1055 set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); 1075 set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
1056 } 1076 }
1057 1077
1058 pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); 1078 pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
1059 pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); 1079 pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
1060 1080
1061 ret = populate_pud(cpa, addr, pgd_entry, pgprot); 1081 ret = populate_pud(cpa, addr, pgd_entry, pgprot);
1062 if (ret < 0) { 1082 if (ret < 0) {
1063 unmap_pgd_range(cpa->pgd, addr, 1083 unmap_pgd_range(cpa->pgd, addr,
1064 addr + (cpa->numpages << PAGE_SHIFT)); 1084 addr + (cpa->numpages << PAGE_SHIFT));
1065 return ret; 1085 return ret;
1066 } 1086 }
1067 1087
1068 cpa->numpages = ret; 1088 cpa->numpages = ret;
1069 return 0; 1089 return 0;
1070 } 1090 }
1071 1091
1072 static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, 1092 static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
1073 int primary) 1093 int primary)
1074 { 1094 {
1075 if (cpa->pgd) 1095 if (cpa->pgd)
1076 return populate_pgd(cpa, vaddr); 1096 return populate_pgd(cpa, vaddr);
1077 1097
1078 /* 1098 /*
1079 * Ignore all non primary paths. 1099 * Ignore all non primary paths.
1080 */ 1100 */
1081 if (!primary) 1101 if (!primary)
1082 return 0; 1102 return 0;
1083 1103
1084 /* 1104 /*
1085 * Ignore the NULL PTE for kernel identity mapping, as it is expected 1105 * Ignore the NULL PTE for kernel identity mapping, as it is expected
1086 * to have holes. 1106 * to have holes.
1087 * Also set numpages to '1' indicating that we processed cpa req for 1107 * Also set numpages to '1' indicating that we processed cpa req for
1088 * one virtual address page and its pfn. TBD: numpages can be set based 1108 * one virtual address page and its pfn. TBD: numpages can be set based
1089 * on the initial value and the level returned by lookup_address(). 1109 * on the initial value and the level returned by lookup_address().
1090 */ 1110 */
1091 if (within(vaddr, PAGE_OFFSET, 1111 if (within(vaddr, PAGE_OFFSET,
1092 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { 1112 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) {
1093 cpa->numpages = 1; 1113 cpa->numpages = 1;
1094 cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; 1114 cpa->pfn = __pa(vaddr) >> PAGE_SHIFT;
1095 return 0; 1115 return 0;
1096 } else { 1116 } else {
1097 WARN(1, KERN_WARNING "CPA: called for zero pte. " 1117 WARN(1, KERN_WARNING "CPA: called for zero pte. "
1098 "vaddr = %lx cpa->vaddr = %lx\n", vaddr, 1118 "vaddr = %lx cpa->vaddr = %lx\n", vaddr,
1099 *cpa->vaddr); 1119 *cpa->vaddr);
1100 1120
1101 return -EFAULT; 1121 return -EFAULT;
1102 } 1122 }
1103 } 1123 }
1104 1124
1105 static int __change_page_attr(struct cpa_data *cpa, int primary) 1125 static int __change_page_attr(struct cpa_data *cpa, int primary)
1106 { 1126 {
1107 unsigned long address; 1127 unsigned long address;
1108 int do_split, err; 1128 int do_split, err;
1109 unsigned int level; 1129 unsigned int level;
1110 pte_t *kpte, old_pte; 1130 pte_t *kpte, old_pte;
1111 1131
1112 if (cpa->flags & CPA_PAGES_ARRAY) { 1132 if (cpa->flags & CPA_PAGES_ARRAY) {
1113 struct page *page = cpa->pages[cpa->curpage]; 1133 struct page *page = cpa->pages[cpa->curpage];
1114 if (unlikely(PageHighMem(page))) 1134 if (unlikely(PageHighMem(page)))
1115 return 0; 1135 return 0;
1116 address = (unsigned long)page_address(page); 1136 address = (unsigned long)page_address(page);
1117 } else if (cpa->flags & CPA_ARRAY) 1137 } else if (cpa->flags & CPA_ARRAY)
1118 address = cpa->vaddr[cpa->curpage]; 1138 address = cpa->vaddr[cpa->curpage];
1119 else 1139 else
1120 address = *cpa->vaddr; 1140 address = *cpa->vaddr;
1121 repeat: 1141 repeat:
1122 kpte = _lookup_address_cpa(cpa, address, &level); 1142 kpte = _lookup_address_cpa(cpa, address, &level);
1123 if (!kpte) 1143 if (!kpte)
1124 return __cpa_process_fault(cpa, address, primary); 1144 return __cpa_process_fault(cpa, address, primary);
1125 1145
1126 old_pte = *kpte; 1146 old_pte = *kpte;
1127 if (!pte_val(old_pte)) 1147 if (!pte_val(old_pte))
1128 return __cpa_process_fault(cpa, address, primary); 1148 return __cpa_process_fault(cpa, address, primary);
1129 1149
1130 if (level == PG_LEVEL_4K) { 1150 if (level == PG_LEVEL_4K) {
1131 pte_t new_pte; 1151 pte_t new_pte;
1132 pgprot_t new_prot = pte_pgprot(old_pte); 1152 pgprot_t new_prot = pte_pgprot(old_pte);
1133 unsigned long pfn = pte_pfn(old_pte); 1153 unsigned long pfn = pte_pfn(old_pte);
1134 1154
1135 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); 1155 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
1136 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); 1156 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
1137 1157
1138 new_prot = static_protections(new_prot, address, pfn); 1158 new_prot = static_protections(new_prot, address, pfn);
1139 1159
1140 /* 1160 /*
1141 * Set the GLOBAL flags only if the PRESENT flag is 1161 * Set the GLOBAL flags only if the PRESENT flag is
1142 * set otherwise pte_present will return true even on 1162 * set otherwise pte_present will return true even on
1143 * a non present pte. The canon_pgprot will clear 1163 * a non present pte. The canon_pgprot will clear
1144 * _PAGE_GLOBAL for the ancient hardware that doesn't 1164 * _PAGE_GLOBAL for the ancient hardware that doesn't
1145 * support it. 1165 * support it.
1146 */ 1166 */
1147 if (pgprot_val(new_prot) & _PAGE_PRESENT) 1167 if (pgprot_val(new_prot) & _PAGE_PRESENT)
1148 pgprot_val(new_prot) |= _PAGE_GLOBAL; 1168 pgprot_val(new_prot) |= _PAGE_GLOBAL;
1149 else 1169 else
1150 pgprot_val(new_prot) &= ~_PAGE_GLOBAL; 1170 pgprot_val(new_prot) &= ~_PAGE_GLOBAL;
1151 1171
1152 /* 1172 /*
1153 * We need to keep the pfn from the existing PTE, 1173 * We need to keep the pfn from the existing PTE,
1154 * after all we're only going to change its attributes, 1174 * after all we're only going to change its attributes,
1155 * not the memory it points to. 1175 * not the memory it points to.
1156 */ 1176 */
1157 new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); 1177 new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
1158 cpa->pfn = pfn; 1178 cpa->pfn = pfn;
1159 /* 1179 /*
1160 * Do we really change anything ? 1180 * Do we really change anything ?
1161 */ 1181 */
1162 if (pte_val(old_pte) != pte_val(new_pte)) { 1182 if (pte_val(old_pte) != pte_val(new_pte)) {
1163 set_pte_atomic(kpte, new_pte); 1183 set_pte_atomic(kpte, new_pte);
1164 cpa->flags |= CPA_FLUSHTLB; 1184 cpa->flags |= CPA_FLUSHTLB;
1165 } 1185 }
1166 cpa->numpages = 1; 1186 cpa->numpages = 1;
1167 return 0; 1187 return 0;
1168 } 1188 }
1169 1189
1170 /* 1190 /*
1171 * Check, whether we can keep the large page intact 1191 * Check, whether we can keep the large page intact
1172 * and just change the pte: 1192 * and just change the pte:
1173 */ 1193 */
1174 do_split = try_preserve_large_page(kpte, address, cpa); 1194 do_split = try_preserve_large_page(kpte, address, cpa);
1175 /* 1195 /*
1176 * When the range fits into the existing large page, 1196 * When the range fits into the existing large page,
1177 * return. cpa->numpages and the CPA_FLUSHTLB flag have been updated in 1197 * return. cpa->numpages and the CPA_FLUSHTLB flag have been updated in
1178 * try_preserve_large_page(): 1198 * try_preserve_large_page():
1179 */ 1199 */
1180 if (do_split <= 0) 1200 if (do_split <= 0)
1181 return do_split; 1201 return do_split;
1182 1202
1183 /* 1203 /*
1184 * We have to split the large page: 1204 * We have to split the large page:
1185 */ 1205 */
1186 err = split_large_page(cpa, kpte, address); 1206 err = split_large_page(cpa, kpte, address);
1187 if (!err) { 1207 if (!err) {
1188 /* 1208 /*
1189 * Do a global flush tlb after splitting the large page 1209 * Do a global flush tlb after splitting the large page
1190 * and before we do the actual change page attribute in the PTE. 1210 * and before we do the actual change page attribute in the PTE.
1191 * 1211 *
1192 * Without this, we violate the TLB application note, which says 1212 * Without this, we violate the TLB application note, which says
1193 * "The TLBs may contain both ordinary and large-page 1213 * "The TLBs may contain both ordinary and large-page
1194 * translations for a 4-KByte range of linear addresses. This 1214 * translations for a 4-KByte range of linear addresses. This
1195 * may occur if software modifies the paging structures so that 1215 * may occur if software modifies the paging structures so that
1196 * the page size used for the address range changes. If the two 1216 * the page size used for the address range changes. If the two
1197 * translations differ with respect to page frame or attributes 1217 * translations differ with respect to page frame or attributes
1198 * (e.g., permissions), processor behavior is undefined and may 1218 * (e.g., permissions), processor behavior is undefined and may
1199 * be implementation-specific." 1219 * be implementation-specific."
1200 * 1220 *
1201 * We do this global tlb flush inside the cpa_lock, so that we 1221 * We do this global tlb flush inside the cpa_lock, so that we
1202 * don't allow any other cpu with stale tlb entries to change the 1222 * don't allow any other cpu with stale tlb entries to change the
1203 * page attribute in parallel for an address that also falls into 1223 * page attribute in parallel for an address that also falls into
1204 * the just-split large page entry. 1224 * the just-split large page entry.
1205 */ 1225 */
1206 flush_tlb_all(); 1226 flush_tlb_all();
1207 goto repeat; 1227 goto repeat;
1208 } 1228 }
1209 1229
1210 return err; 1230 return err;
1211 } 1231 }
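The 4K path above boils down to clear-then-set mask arithmetic on the old protection bits: cpa->mask_clr is applied before cpa->mask_set, so a bit named in both masks ends up set. A stand-alone sketch of that ordering, using made-up bit values rather than the real _PAGE_* constants:

/* Illustrative only: the same clear-then-set mask arithmetic used in
 * __change_page_attr(), reduced to plain integers so it compiles and
 * runs in user space. */
#include <assert.h>

static unsigned long apply_masks(unsigned long prot,
                                 unsigned long mask_clr,
                                 unsigned long mask_set)
{
        prot &= ~mask_clr;      /* drop every bit in mask_clr ... */
        prot |= mask_set;       /* ... then add every bit in mask_set */
        return prot;
}

int main(void)
{
        /* 0x63 stands in for present|rw|user|accessed; clear the RW-like
         * bit (0x2) and set a GLOBAL-like bit (0x100) */
        assert(apply_masks(0x63, 0x2, 0x100) == 0x161);
        return 0;
}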
1212 1232
1213 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); 1233 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
1214 1234
1215 static int cpa_process_alias(struct cpa_data *cpa) 1235 static int cpa_process_alias(struct cpa_data *cpa)
1216 { 1236 {
1217 struct cpa_data alias_cpa; 1237 struct cpa_data alias_cpa;
1218 unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); 1238 unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
1219 unsigned long vaddr; 1239 unsigned long vaddr;
1220 int ret; 1240 int ret;
1221 1241
1222 if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) 1242 if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
1223 return 0; 1243 return 0;
1224 1244
1225 /* 1245 /*
1226 * No need to redo, when the primary call touched the direct 1246 * No need to redo, when the primary call touched the direct
1227 * mapping already: 1247 * mapping already:
1228 */ 1248 */
1229 if (cpa->flags & CPA_PAGES_ARRAY) { 1249 if (cpa->flags & CPA_PAGES_ARRAY) {
1230 struct page *page = cpa->pages[cpa->curpage]; 1250 struct page *page = cpa->pages[cpa->curpage];
1231 if (unlikely(PageHighMem(page))) 1251 if (unlikely(PageHighMem(page)))
1232 return 0; 1252 return 0;
1233 vaddr = (unsigned long)page_address(page); 1253 vaddr = (unsigned long)page_address(page);
1234 } else if (cpa->flags & CPA_ARRAY) 1254 } else if (cpa->flags & CPA_ARRAY)
1235 vaddr = cpa->vaddr[cpa->curpage]; 1255 vaddr = cpa->vaddr[cpa->curpage];
1236 else 1256 else
1237 vaddr = *cpa->vaddr; 1257 vaddr = *cpa->vaddr;
1238 1258
1239 if (!(within(vaddr, PAGE_OFFSET, 1259 if (!(within(vaddr, PAGE_OFFSET,
1240 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { 1260 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
1241 1261
1242 alias_cpa = *cpa; 1262 alias_cpa = *cpa;
1243 alias_cpa.vaddr = &laddr; 1263 alias_cpa.vaddr = &laddr;
1244 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); 1264 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1245 1265
1246 ret = __change_page_attr_set_clr(&alias_cpa, 0); 1266 ret = __change_page_attr_set_clr(&alias_cpa, 0);
1247 if (ret) 1267 if (ret)
1248 return ret; 1268 return ret;
1249 } 1269 }
1250 1270
1251 #ifdef CONFIG_X86_64 1271 #ifdef CONFIG_X86_64
1252 /* 1272 /*
1253 * If the primary call didn't touch the high mapping already 1273 * If the primary call didn't touch the high mapping already
1254 * and the physical address is inside the kernel map, we need 1274 * and the physical address is inside the kernel map, we need
1255 * to touch the high mapped kernel as well: 1275 * to touch the high mapped kernel as well:
1256 */ 1276 */
1257 if (!within(vaddr, (unsigned long)_text, _brk_end) && 1277 if (!within(vaddr, (unsigned long)_text, _brk_end) &&
1258 within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { 1278 within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) {
1259 unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + 1279 unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) +
1260 __START_KERNEL_map - phys_base; 1280 __START_KERNEL_map - phys_base;
1261 alias_cpa = *cpa; 1281 alias_cpa = *cpa;
1262 alias_cpa.vaddr = &temp_cpa_vaddr; 1282 alias_cpa.vaddr = &temp_cpa_vaddr;
1263 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); 1283 alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
1264 1284
1265 /* 1285 /*
1266 * The high mapping range is imprecise, so ignore the 1286 * The high mapping range is imprecise, so ignore the
1267 * return value. 1287 * return value.
1268 */ 1288 */
1269 __change_page_attr_set_clr(&alias_cpa, 0); 1289 __change_page_attr_set_clr(&alias_cpa, 0);
1270 } 1290 }
1271 #endif 1291 #endif
1272 1292
1273 return 0; 1293 return 0;
1274 } 1294 }
1275 1295
1276 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) 1296 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
1277 { 1297 {
1278 int ret, numpages = cpa->numpages; 1298 int ret, numpages = cpa->numpages;
1279 1299
1280 while (numpages) { 1300 while (numpages) {
1281 /* 1301 /*
1282 * Store the remaining nr of pages for the large page 1302 * Store the remaining nr of pages for the large page
1283 * preservation check. 1303 * preservation check.
1284 */ 1304 */
1285 cpa->numpages = numpages; 1305 cpa->numpages = numpages;
1286 /* for array changes, we can't use large page */ 1306 /* for array changes, we can't use large page */
1287 if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) 1307 if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
1288 cpa->numpages = 1; 1308 cpa->numpages = 1;
1289 1309
1290 if (!debug_pagealloc) 1310 if (!debug_pagealloc)
1291 spin_lock(&cpa_lock); 1311 spin_lock(&cpa_lock);
1292 ret = __change_page_attr(cpa, checkalias); 1312 ret = __change_page_attr(cpa, checkalias);
1293 if (!debug_pagealloc) 1313 if (!debug_pagealloc)
1294 spin_unlock(&cpa_lock); 1314 spin_unlock(&cpa_lock);
1295 if (ret) 1315 if (ret)
1296 return ret; 1316 return ret;
1297 1317
1298 if (checkalias) { 1318 if (checkalias) {
1299 ret = cpa_process_alias(cpa); 1319 ret = cpa_process_alias(cpa);
1300 if (ret) 1320 if (ret)
1301 return ret; 1321 return ret;
1302 } 1322 }
1303 1323
1304 /* 1324 /*
1305 * Adjust the number of pages with the result of the 1325 * Adjust the number of pages with the result of the
1306 * CPA operation. Either a large page has been 1326 * CPA operation. Either a large page has been
1307 * preserved or a single page update happened. 1327 * preserved or a single page update happened.
1308 */ 1328 */
1309 BUG_ON(cpa->numpages > numpages); 1329 BUG_ON(cpa->numpages > numpages);
1310 numpages -= cpa->numpages; 1330 numpages -= cpa->numpages;
1311 if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) 1331 if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
1312 cpa->curpage++; 1332 cpa->curpage++;
1313 else 1333 else
1314 *cpa->vaddr += cpa->numpages * PAGE_SIZE; 1334 *cpa->vaddr += cpa->numpages * PAGE_SIZE;
1315 1335
1316 } 1336 }
1317 return 0; 1337 return 0;
1318 } 1338 }
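The loop above consumes the request in variable-sized steps: after each inner call, cpa->numpages holds how many pages were actually handled, a full large page when one could be preserved or a single page otherwise, and the remaining count and address advance by that amount. A rough user-space sketch of the bookkeeping, with hypothetical sizes and a stand-in for the inner call:

/* Illustrative sketch of the numpages bookkeeping in
 * __change_page_attr_set_clr(); all values are made up. */
#include <stdio.h>

#define PAGE_SIZE   4096UL
#define PMD_PAGES   512         /* 4K pages per 2MB large page */

int main(void)
{
        unsigned long vaddr = 0x200000;  /* hypothetical, 2MB aligned */
        int numpages = 1030;             /* pages still to process */

        while (numpages) {
                /* stand-in for __change_page_attr(): a 2MB-aligned address
                 * with >= 512 pages left can be handled as one large page */
                int done = (!(vaddr & (PMD_PAGES * PAGE_SIZE - 1)) &&
                            numpages >= PMD_PAGES) ? PMD_PAGES : 1;

                numpages -= done;
                vaddr += done * PAGE_SIZE;
        }
        printf("processed up to %#lx\n", vaddr);
        return 0;
}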
1319 1339
1320 static int change_page_attr_set_clr(unsigned long *addr, int numpages, 1340 static int change_page_attr_set_clr(unsigned long *addr, int numpages,
1321 pgprot_t mask_set, pgprot_t mask_clr, 1341 pgprot_t mask_set, pgprot_t mask_clr,
1322 int force_split, int in_flag, 1342 int force_split, int in_flag,
1323 struct page **pages) 1343 struct page **pages)
1324 { 1344 {
1325 struct cpa_data cpa; 1345 struct cpa_data cpa;
1326 int ret, cache, checkalias; 1346 int ret, cache, checkalias;
1327 unsigned long baddr = 0; 1347 unsigned long baddr = 0;
1328 1348
1329 memset(&cpa, 0, sizeof(cpa)); 1349 memset(&cpa, 0, sizeof(cpa));
1330 1350
1331 /* 1351 /*
1332 * Check if we are requested to change an unsupported 1352 * Check if we are requested to change an unsupported
1333 * feature: 1353 * feature:
1334 */ 1354 */
1335 mask_set = canon_pgprot(mask_set); 1355 mask_set = canon_pgprot(mask_set);
1336 mask_clr = canon_pgprot(mask_clr); 1356 mask_clr = canon_pgprot(mask_clr);
1337 if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) 1357 if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
1338 return 0; 1358 return 0;
1339 1359
1340 /* Ensure we are PAGE_SIZE aligned */ 1360 /* Ensure we are PAGE_SIZE aligned */
1341 if (in_flag & CPA_ARRAY) { 1361 if (in_flag & CPA_ARRAY) {
1342 int i; 1362 int i;
1343 for (i = 0; i < numpages; i++) { 1363 for (i = 0; i < numpages; i++) {
1344 if (addr[i] & ~PAGE_MASK) { 1364 if (addr[i] & ~PAGE_MASK) {
1345 addr[i] &= PAGE_MASK; 1365 addr[i] &= PAGE_MASK;
1346 WARN_ON_ONCE(1); 1366 WARN_ON_ONCE(1);
1347 } 1367 }
1348 } 1368 }
1349 } else if (!(in_flag & CPA_PAGES_ARRAY)) { 1369 } else if (!(in_flag & CPA_PAGES_ARRAY)) {
1350 /* 1370 /*
1351 * in_flag of CPA_PAGES_ARRAY implies it is aligned. 1371 * in_flag of CPA_PAGES_ARRAY implies it is aligned.
1352 * No need to check in that case. 1372 * No need to check in that case.
1353 */ 1373 */
1354 if (*addr & ~PAGE_MASK) { 1374 if (*addr & ~PAGE_MASK) {
1355 *addr &= PAGE_MASK; 1375 *addr &= PAGE_MASK;
1356 /* 1376 /*
1357 * People should not be passing in unaligned addresses: 1377 * People should not be passing in unaligned addresses:
1358 */ 1378 */
1359 WARN_ON_ONCE(1); 1379 WARN_ON_ONCE(1);
1360 } 1380 }
1361 /* 1381 /*
1362 * Save address for cache flush. *addr is modified in the call 1382 * Save address for cache flush. *addr is modified in the call
1363 * to __change_page_attr_set_clr() below. 1383 * to __change_page_attr_set_clr() below.
1364 */ 1384 */
1365 baddr = *addr; 1385 baddr = *addr;
1366 } 1386 }
1367 1387
1368 /* Must avoid aliasing mappings in the highmem code */ 1388 /* Must avoid aliasing mappings in the highmem code */
1369 kmap_flush_unused(); 1389 kmap_flush_unused();
1370 1390
1371 vm_unmap_aliases(); 1391 vm_unmap_aliases();
1372 1392
1373 cpa.vaddr = addr; 1393 cpa.vaddr = addr;
1374 cpa.pages = pages; 1394 cpa.pages = pages;
1375 cpa.numpages = numpages; 1395 cpa.numpages = numpages;
1376 cpa.mask_set = mask_set; 1396 cpa.mask_set = mask_set;
1377 cpa.mask_clr = mask_clr; 1397 cpa.mask_clr = mask_clr;
1378 cpa.flags = 0; 1398 cpa.flags = 0;
1379 cpa.curpage = 0; 1399 cpa.curpage = 0;
1380 cpa.force_split = force_split; 1400 cpa.force_split = force_split;
1381 1401
1382 if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY)) 1402 if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY))
1383 cpa.flags |= in_flag; 1403 cpa.flags |= in_flag;
1384 1404
1385 /* No alias checking for _NX bit modifications */ 1405 /* No alias checking for _NX bit modifications */
1386 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; 1406 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
1387 1407
1388 ret = __change_page_attr_set_clr(&cpa, checkalias); 1408 ret = __change_page_attr_set_clr(&cpa, checkalias);
1389 1409
1390 /* 1410 /*
1391 * Check whether we really changed something: 1411 * Check whether we really changed something:
1392 */ 1412 */
1393 if (!(cpa.flags & CPA_FLUSHTLB)) 1413 if (!(cpa.flags & CPA_FLUSHTLB))
1394 goto out; 1414 goto out;
1395 1415
1396 /* 1416 /*
1397 * No need to flush, when we did not set any of the caching 1417 * No need to flush, when we did not set any of the caching
1398 * attributes: 1418 * attributes:
1399 */ 1419 */
1400 cache = !!pgprot2cachemode(mask_set); 1420 cache = !!pgprot2cachemode(mask_set);
1401 1421
1402 /* 1422 /*
1403 * On success we use CLFLUSH, when the CPU supports it, to 1423 * On success we use CLFLUSH, when the CPU supports it, to
1404 * avoid the WBINVD. If the CPU does not support CLFLUSH, or in 1424 * avoid the WBINVD. If the CPU does not support CLFLUSH, or in
1405 * the error case, we fall back to cpa_flush_all() (which uses 1425 * the error case, we fall back to cpa_flush_all() (which uses
1406 * WBINVD): 1426 * WBINVD):
1407 */ 1427 */
1408 if (!ret && cpu_has_clflush) { 1428 if (!ret && cpu_has_clflush) {
1409 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { 1429 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
1410 cpa_flush_array(addr, numpages, cache, 1430 cpa_flush_array(addr, numpages, cache,
1411 cpa.flags, pages); 1431 cpa.flags, pages);
1412 } else 1432 } else
1413 cpa_flush_range(baddr, numpages, cache); 1433 cpa_flush_range(baddr, numpages, cache);
1414 } else 1434 } else
1415 cpa_flush_all(cache); 1435 cpa_flush_all(cache);
1416 1436
1417 out: 1437 out:
1418 return ret; 1438 return ret;
1419 } 1439 }
1420 1440
1421 static inline int change_page_attr_set(unsigned long *addr, int numpages, 1441 static inline int change_page_attr_set(unsigned long *addr, int numpages,
1422 pgprot_t mask, int array) 1442 pgprot_t mask, int array)
1423 { 1443 {
1424 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, 1444 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
1425 (array ? CPA_ARRAY : 0), NULL); 1445 (array ? CPA_ARRAY : 0), NULL);
1426 } 1446 }
1427 1447
1428 static inline int change_page_attr_clear(unsigned long *addr, int numpages, 1448 static inline int change_page_attr_clear(unsigned long *addr, int numpages,
1429 pgprot_t mask, int array) 1449 pgprot_t mask, int array)
1430 { 1450 {
1431 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, 1451 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
1432 (array ? CPA_ARRAY : 0), NULL); 1452 (array ? CPA_ARRAY : 0), NULL);
1433 } 1453 }
1434 1454
1435 static inline int cpa_set_pages_array(struct page **pages, int numpages, 1455 static inline int cpa_set_pages_array(struct page **pages, int numpages,
1436 pgprot_t mask) 1456 pgprot_t mask)
1437 { 1457 {
1438 return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, 1458 return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0,
1439 CPA_PAGES_ARRAY, pages); 1459 CPA_PAGES_ARRAY, pages);
1440 } 1460 }
1441 1461
1442 static inline int cpa_clear_pages_array(struct page **pages, int numpages, 1462 static inline int cpa_clear_pages_array(struct page **pages, int numpages,
1443 pgprot_t mask) 1463 pgprot_t mask)
1444 { 1464 {
1445 return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, 1465 return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0,
1446 CPA_PAGES_ARRAY, pages); 1466 CPA_PAGES_ARRAY, pages);
1447 } 1467 }
1448 1468
1449 int _set_memory_uc(unsigned long addr, int numpages) 1469 int _set_memory_uc(unsigned long addr, int numpages)
1450 { 1470 {
1451 /* 1471 /*
1452 * for now UC MINUS. see comments in ioremap_nocache() 1472 * for now UC MINUS. see comments in ioremap_nocache()
1453 */ 1473 */
1454 return change_page_attr_set(&addr, numpages, 1474 return change_page_attr_set(&addr, numpages,
1455 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 1475 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1456 0); 1476 0);
1457 } 1477 }
1458 1478
1459 int set_memory_uc(unsigned long addr, int numpages) 1479 int set_memory_uc(unsigned long addr, int numpages)
1460 { 1480 {
1461 int ret; 1481 int ret;
1462 1482
1463 /* 1483 /*
1464 * for now UC MINUS. see comments in ioremap_nocache() 1484 * for now UC MINUS. see comments in ioremap_nocache()
1465 */ 1485 */
1466 ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, 1486 ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1467 _PAGE_CACHE_MODE_UC_MINUS, NULL); 1487 _PAGE_CACHE_MODE_UC_MINUS, NULL);
1468 if (ret) 1488 if (ret)
1469 goto out_err; 1489 goto out_err;
1470 1490
1471 ret = _set_memory_uc(addr, numpages); 1491 ret = _set_memory_uc(addr, numpages);
1472 if (ret) 1492 if (ret)
1473 goto out_free; 1493 goto out_free;
1474 1494
1475 return 0; 1495 return 0;
1476 1496
1477 out_free: 1497 out_free:
1478 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); 1498 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1479 out_err: 1499 out_err:
1480 return ret; 1500 return ret;
1481 } 1501 }
1482 EXPORT_SYMBOL(set_memory_uc); 1502 EXPORT_SYMBOL(set_memory_uc);
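As a usage illustration only (nothing below is part of this change, and the allocation size is an arbitrary assumption), a driver wanting a small uncached buffer would pair set_memory_uc() with set_memory_wb() roughly like this:

/* Hypothetical driver snippet: allocate a few pages, switch them to
 * uncached-minus, and restore write-back before freeing. */
#include <linux/gfp.h>
#include <linux/errno.h>
#include <asm/cacheflush.h>

#define EXAMPLE_ORDER   2                       /* 4 pages, arbitrary */
#define EXAMPLE_PAGES   (1 << EXAMPLE_ORDER)

static unsigned long example_buf;

static int example_map_uncached(void)
{
        int ret;

        example_buf = __get_free_pages(GFP_KERNEL, EXAMPLE_ORDER);
        if (!example_buf)
                return -ENOMEM;

        /* reserves the memtype, then changes the page attributes */
        ret = set_memory_uc(example_buf, EXAMPLE_PAGES);
        if (ret)
                free_pages(example_buf, EXAMPLE_ORDER);
        return ret;
}

static void example_unmap_uncached(void)
{
        /* set_memory_wb() also drops the memtype reservation */
        set_memory_wb(example_buf, EXAMPLE_PAGES);
        free_pages(example_buf, EXAMPLE_ORDER);
}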
1483 1503
1484 static int _set_memory_array(unsigned long *addr, int addrinarray, 1504 static int _set_memory_array(unsigned long *addr, int addrinarray,
1485 enum page_cache_mode new_type) 1505 enum page_cache_mode new_type)
1486 { 1506 {
1487 int i, j; 1507 int i, j;
1488 int ret; 1508 int ret;
1489 1509
1490 /* 1510 /*
1491 * for now UC MINUS. see comments in ioremap_nocache() 1511 * for now UC MINUS. see comments in ioremap_nocache()
1492 */ 1512 */
1493 for (i = 0; i < addrinarray; i++) { 1513 for (i = 0; i < addrinarray; i++) {
1494 ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, 1514 ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
1495 new_type, NULL); 1515 new_type, NULL);
1496 if (ret) 1516 if (ret)
1497 goto out_free; 1517 goto out_free;
1498 } 1518 }
1499 1519
1500 ret = change_page_attr_set(addr, addrinarray, 1520 ret = change_page_attr_set(addr, addrinarray,
1501 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 1521 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1502 1); 1522 1);
1503 1523
1504 if (!ret && new_type == _PAGE_CACHE_MODE_WC) 1524 if (!ret && new_type == _PAGE_CACHE_MODE_WC)
1505 ret = change_page_attr_set_clr(addr, addrinarray, 1525 ret = change_page_attr_set_clr(addr, addrinarray,
1506 cachemode2pgprot( 1526 cachemode2pgprot(
1507 _PAGE_CACHE_MODE_WC), 1527 _PAGE_CACHE_MODE_WC),
1508 __pgprot(_PAGE_CACHE_MASK), 1528 __pgprot(_PAGE_CACHE_MASK),
1509 0, CPA_ARRAY, NULL); 1529 0, CPA_ARRAY, NULL);
1510 if (ret) 1530 if (ret)
1511 goto out_free; 1531 goto out_free;
1512 1532
1513 return 0; 1533 return 0;
1514 1534
1515 out_free: 1535 out_free:
1516 for (j = 0; j < i; j++) 1536 for (j = 0; j < i; j++)
1517 free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE); 1537 free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
1518 1538
1519 return ret; 1539 return ret;
1520 } 1540 }
1521 1541
1522 int set_memory_array_uc(unsigned long *addr, int addrinarray) 1542 int set_memory_array_uc(unsigned long *addr, int addrinarray)
1523 { 1543 {
1524 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); 1544 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
1525 } 1545 }
1526 EXPORT_SYMBOL(set_memory_array_uc); 1546 EXPORT_SYMBOL(set_memory_array_uc);
1527 1547
1528 int set_memory_array_wc(unsigned long *addr, int addrinarray) 1548 int set_memory_array_wc(unsigned long *addr, int addrinarray)
1529 { 1549 {
1530 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC); 1550 return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC);
1531 } 1551 }
1532 EXPORT_SYMBOL(set_memory_array_wc); 1552 EXPORT_SYMBOL(set_memory_array_wc);
1533 1553
1534 int _set_memory_wc(unsigned long addr, int numpages) 1554 int _set_memory_wc(unsigned long addr, int numpages)
1535 { 1555 {
1536 int ret; 1556 int ret;
1537 unsigned long addr_copy = addr; 1557 unsigned long addr_copy = addr;
1538 1558
1539 ret = change_page_attr_set(&addr, numpages, 1559 ret = change_page_attr_set(&addr, numpages,
1540 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 1560 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
1541 0); 1561 0);
1542 if (!ret) { 1562 if (!ret) {
1543 ret = change_page_attr_set_clr(&addr_copy, numpages, 1563 ret = change_page_attr_set_clr(&addr_copy, numpages,
1544 cachemode2pgprot( 1564 cachemode2pgprot(
1545 _PAGE_CACHE_MODE_WC), 1565 _PAGE_CACHE_MODE_WC),
1546 __pgprot(_PAGE_CACHE_MASK), 1566 __pgprot(_PAGE_CACHE_MASK),
1547 0, 0, NULL); 1567 0, 0, NULL);
1548 } 1568 }
1549 return ret; 1569 return ret;
1550 } 1570 }
1551 1571
1552 int set_memory_wc(unsigned long addr, int numpages) 1572 int set_memory_wc(unsigned long addr, int numpages)
1553 { 1573 {
1554 int ret; 1574 int ret;
1555 1575
1556 if (!pat_enabled) 1576 if (!pat_enabled)
1557 return set_memory_uc(addr, numpages); 1577 return set_memory_uc(addr, numpages);
1558 1578
1559 ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, 1579 ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
1560 _PAGE_CACHE_MODE_WC, NULL); 1580 _PAGE_CACHE_MODE_WC, NULL);
1561 if (ret) 1581 if (ret)
1562 goto out_err; 1582 goto out_err;
1563 1583
1564 ret = _set_memory_wc(addr, numpages); 1584 ret = _set_memory_wc(addr, numpages);
1565 if (ret) 1585 if (ret)
1566 goto out_free; 1586 goto out_free;
1567 1587
1568 return 0; 1588 return 0;
1569 1589
1570 out_free: 1590 out_free:
1571 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); 1591 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1572 out_err: 1592 out_err:
1573 return ret; 1593 return ret;
1574 } 1594 }
1575 EXPORT_SYMBOL(set_memory_wc); 1595 EXPORT_SYMBOL(set_memory_wc);
1576 1596
1577 int _set_memory_wb(unsigned long addr, int numpages) 1597 int _set_memory_wb(unsigned long addr, int numpages)
1578 { 1598 {
1579 /* WB cache mode is hard wired to all cache attribute bits being 0 */ 1599 /* WB cache mode is hard wired to all cache attribute bits being 0 */
1580 return change_page_attr_clear(&addr, numpages, 1600 return change_page_attr_clear(&addr, numpages,
1581 __pgprot(_PAGE_CACHE_MASK), 0); 1601 __pgprot(_PAGE_CACHE_MASK), 0);
1582 } 1602 }
1583 1603
1584 int set_memory_wb(unsigned long addr, int numpages) 1604 int set_memory_wb(unsigned long addr, int numpages)
1585 { 1605 {
1586 int ret; 1606 int ret;
1587 1607
1588 ret = _set_memory_wb(addr, numpages); 1608 ret = _set_memory_wb(addr, numpages);
1589 if (ret) 1609 if (ret)
1590 return ret; 1610 return ret;
1591 1611
1592 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); 1612 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
1593 return 0; 1613 return 0;
1594 } 1614 }
1595 EXPORT_SYMBOL(set_memory_wb); 1615 EXPORT_SYMBOL(set_memory_wb);
1596 1616
1597 int set_memory_array_wb(unsigned long *addr, int addrinarray) 1617 int set_memory_array_wb(unsigned long *addr, int addrinarray)
1598 { 1618 {
1599 int i; 1619 int i;
1600 int ret; 1620 int ret;
1601 1621
1602 /* WB cache mode is hard wired to all cache attribute bits being 0 */ 1622 /* WB cache mode is hard wired to all cache attribute bits being 0 */
1603 ret = change_page_attr_clear(addr, addrinarray, 1623 ret = change_page_attr_clear(addr, addrinarray,
1604 __pgprot(_PAGE_CACHE_MASK), 1); 1624 __pgprot(_PAGE_CACHE_MASK), 1);
1605 if (ret) 1625 if (ret)
1606 return ret; 1626 return ret;
1607 1627
1608 for (i = 0; i < addrinarray; i++) 1628 for (i = 0; i < addrinarray; i++)
1609 free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); 1629 free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
1610 1630
1611 return 0; 1631 return 0;
1612 } 1632 }
1613 EXPORT_SYMBOL(set_memory_array_wb); 1633 EXPORT_SYMBOL(set_memory_array_wb);
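The array variants batch the memtype reservation and the cache/TLB flushing over a scattered set of pages. A hedged sketch of how a caller might use them, assuming it already holds a list of page-aligned kernel addresses (the names and count below are made up):

/* Hypothetical batch user of the array helpers defined above. */
#include <asm/cacheflush.h>

#define EXAMPLE_NR      16

static unsigned long example_addrs[EXAMPLE_NR]; /* page-aligned kernel vaddrs */

static int example_batch_uncached(void)
{
        /* one flush for the whole batch instead of one per page */
        return set_memory_array_uc(example_addrs, EXAMPLE_NR);
}

static void example_batch_writeback(void)
{
        set_memory_array_wb(example_addrs, EXAMPLE_NR);
}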
1614 1634
1615 int set_memory_x(unsigned long addr, int numpages) 1635 int set_memory_x(unsigned long addr, int numpages)
1616 { 1636 {
1617 if (!(__supported_pte_mask & _PAGE_NX)) 1637 if (!(__supported_pte_mask & _PAGE_NX))
1618 return 0; 1638 return 0;
1619 1639
1620 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); 1640 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
1621 } 1641 }
1622 EXPORT_SYMBOL(set_memory_x); 1642 EXPORT_SYMBOL(set_memory_x);
1623 1643
1624 int set_memory_nx(unsigned long addr, int numpages) 1644 int set_memory_nx(unsigned long addr, int numpages)
1625 { 1645 {
1626 if (!(__supported_pte_mask & _PAGE_NX)) 1646 if (!(__supported_pte_mask & _PAGE_NX))
1627 return 0; 1647 return 0;
1628 1648
1629 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); 1649 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
1630 } 1650 }
1631 EXPORT_SYMBOL(set_memory_nx); 1651 EXPORT_SYMBOL(set_memory_nx);
1632 1652
1633 int set_memory_ro(unsigned long addr, int numpages) 1653 int set_memory_ro(unsigned long addr, int numpages)
1634 { 1654 {
1635 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); 1655 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
1636 } 1656 }
1637 EXPORT_SYMBOL_GPL(set_memory_ro); 1657 EXPORT_SYMBOL_GPL(set_memory_ro);
1638 1658
1639 int set_memory_rw(unsigned long addr, int numpages) 1659 int set_memory_rw(unsigned long addr, int numpages)
1640 { 1660 {
1641 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); 1661 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
1642 } 1662 }
1643 EXPORT_SYMBOL_GPL(set_memory_rw); 1663 EXPORT_SYMBOL_GPL(set_memory_rw);
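A hypothetical caller of the RO/RW helpers, briefly unprotecting one page to patch a word in it and then restoring the protection; the object, its page alignment and the single-page size are assumptions for illustration:

/* Illustrative only: addr is the page-aligned address of the page that
 * contains *slot. */
#include <linux/types.h>
#include <asm/cacheflush.h>

static void example_patch_protected(unsigned long addr, u32 *slot, u32 val)
{
        set_memory_rw(addr, 1);         /* make the single page writable */
        *slot = val;                    /* perform the update */
        set_memory_ro(addr, 1);         /* write-protect it again */
}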
1644 1664
1645 int set_memory_np(unsigned long addr, int numpages) 1665 int set_memory_np(unsigned long addr, int numpages)
1646 { 1666 {
1647 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); 1667 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
1648 } 1668 }
1649 1669
1650 int set_memory_4k(unsigned long addr, int numpages) 1670 int set_memory_4k(unsigned long addr, int numpages)
1651 { 1671 {
1652 return change_page_attr_set_clr(&addr, numpages, __pgprot(0), 1672 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
1653 __pgprot(0), 1, 0, NULL); 1673 __pgprot(0), 1, 0, NULL);
1654 } 1674 }
1655 1675
1656 int set_pages_uc(struct page *page, int numpages) 1676 int set_pages_uc(struct page *page, int numpages)
1657 { 1677 {
1658 unsigned long addr = (unsigned long)page_address(page); 1678 unsigned long addr = (unsigned long)page_address(page);
1659 1679
1660 return set_memory_uc(addr, numpages); 1680 return set_memory_uc(addr, numpages);
1661 } 1681 }
1662 EXPORT_SYMBOL(set_pages_uc); 1682 EXPORT_SYMBOL(set_pages_uc);
1663 1683
1664 static int _set_pages_array(struct page **pages, int addrinarray, 1684 static int _set_pages_array(struct page **pages, int addrinarray,
1665 enum page_cache_mode new_type) 1685 enum page_cache_mode new_type)
1666 { 1686 {
1667 unsigned long start; 1687 unsigned long start;
1668 unsigned long end; 1688 unsigned long end;
1669 int i; 1689 int i;
1670 int free_idx; 1690 int free_idx;
1671 int ret; 1691 int ret;
1672 1692
1673 for (i = 0; i < addrinarray; i++) { 1693 for (i = 0; i < addrinarray; i++) {
1674 if (PageHighMem(pages[i])) 1694 if (PageHighMem(pages[i]))
1675 continue; 1695 continue;
1676 start = page_to_pfn(pages[i]) << PAGE_SHIFT; 1696 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1677 end = start + PAGE_SIZE; 1697 end = start + PAGE_SIZE;
1678 if (reserve_memtype(start, end, new_type, NULL)) 1698 if (reserve_memtype(start, end, new_type, NULL))
1679 goto err_out; 1699 goto err_out;
1680 } 1700 }
1681 1701
1682 ret = cpa_set_pages_array(pages, addrinarray, 1702 ret = cpa_set_pages_array(pages, addrinarray,
1683 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS)); 1703 cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS));
1684 if (!ret && new_type == _PAGE_CACHE_MODE_WC) 1704 if (!ret && new_type == _PAGE_CACHE_MODE_WC)
1685 ret = change_page_attr_set_clr(NULL, addrinarray, 1705 ret = change_page_attr_set_clr(NULL, addrinarray,
1686 cachemode2pgprot( 1706 cachemode2pgprot(
1687 _PAGE_CACHE_MODE_WC), 1707 _PAGE_CACHE_MODE_WC),
1688 __pgprot(_PAGE_CACHE_MASK), 1708 __pgprot(_PAGE_CACHE_MASK),
1689 0, CPA_PAGES_ARRAY, pages); 1709 0, CPA_PAGES_ARRAY, pages);
1690 if (ret) 1710 if (ret)
1691 goto err_out; 1711 goto err_out;
1692 return 0; /* Success */ 1712 return 0; /* Success */
1693 err_out: 1713 err_out:
1694 free_idx = i; 1714 free_idx = i;
1695 for (i = 0; i < free_idx; i++) { 1715 for (i = 0; i < free_idx; i++) {
1696 if (PageHighMem(pages[i])) 1716 if (PageHighMem(pages[i]))
1697 continue; 1717 continue;
1698 start = page_to_pfn(pages[i]) << PAGE_SHIFT; 1718 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1699 end = start + PAGE_SIZE; 1719 end = start + PAGE_SIZE;
1700 free_memtype(start, end); 1720 free_memtype(start, end);
1701 } 1721 }
1702 return -EINVAL; 1722 return -EINVAL;
1703 } 1723 }
1704 1724
1705 int set_pages_array_uc(struct page **pages, int addrinarray) 1725 int set_pages_array_uc(struct page **pages, int addrinarray)
1706 { 1726 {
1707 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); 1727 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
1708 } 1728 }
1709 EXPORT_SYMBOL(set_pages_array_uc); 1729 EXPORT_SYMBOL(set_pages_array_uc);
1710 1730
1711 int set_pages_array_wc(struct page **pages, int addrinarray) 1731 int set_pages_array_wc(struct page **pages, int addrinarray)
1712 { 1732 {
1713 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC); 1733 return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC);
1714 } 1734 }
1715 EXPORT_SYMBOL(set_pages_array_wc); 1735 EXPORT_SYMBOL(set_pages_array_wc);
1716 1736
1717 int set_pages_wb(struct page *page, int numpages) 1737 int set_pages_wb(struct page *page, int numpages)
1718 { 1738 {
1719 unsigned long addr = (unsigned long)page_address(page); 1739 unsigned long addr = (unsigned long)page_address(page);
1720 1740
1721 return set_memory_wb(addr, numpages); 1741 return set_memory_wb(addr, numpages);
1722 } 1742 }
1723 EXPORT_SYMBOL(set_pages_wb); 1743 EXPORT_SYMBOL(set_pages_wb);
1724 1744
1725 int set_pages_array_wb(struct page **pages, int addrinarray) 1745 int set_pages_array_wb(struct page **pages, int addrinarray)
1726 { 1746 {
1727 int retval; 1747 int retval;
1728 unsigned long start; 1748 unsigned long start;
1729 unsigned long end; 1749 unsigned long end;
1730 int i; 1750 int i;
1731 1751
1732 /* WB cache mode is hard wired to all cache attribute bits being 0 */ 1752 /* WB cache mode is hard wired to all cache attribute bits being 0 */
1733 retval = cpa_clear_pages_array(pages, addrinarray, 1753 retval = cpa_clear_pages_array(pages, addrinarray,
1734 __pgprot(_PAGE_CACHE_MASK)); 1754 __pgprot(_PAGE_CACHE_MASK));
1735 if (retval) 1755 if (retval)
1736 return retval; 1756 return retval;
1737 1757
1738 for (i = 0; i < addrinarray; i++) { 1758 for (i = 0; i < addrinarray; i++) {
1739 if (PageHighMem(pages[i])) 1759 if (PageHighMem(pages[i]))
1740 continue; 1760 continue;
1741 start = page_to_pfn(pages[i]) << PAGE_SHIFT; 1761 start = page_to_pfn(pages[i]) << PAGE_SHIFT;
1742 end = start + PAGE_SIZE; 1762 end = start + PAGE_SIZE;
1743 free_memtype(start, end); 1763 free_memtype(start, end);
1744 } 1764 }
1745 1765
1746 return 0; 1766 return 0;
1747 } 1767 }
1748 EXPORT_SYMBOL(set_pages_array_wb); 1768 EXPORT_SYMBOL(set_pages_array_wb);
1749 1769
1750 int set_pages_x(struct page *page, int numpages) 1770 int set_pages_x(struct page *page, int numpages)
1751 { 1771 {
1752 unsigned long addr = (unsigned long)page_address(page); 1772 unsigned long addr = (unsigned long)page_address(page);
1753 1773
1754 return set_memory_x(addr, numpages); 1774 return set_memory_x(addr, numpages);
1755 } 1775 }
1756 EXPORT_SYMBOL(set_pages_x); 1776 EXPORT_SYMBOL(set_pages_x);
1757 1777
1758 int set_pages_nx(struct page *page, int numpages) 1778 int set_pages_nx(struct page *page, int numpages)
1759 { 1779 {
1760 unsigned long addr = (unsigned long)page_address(page); 1780 unsigned long addr = (unsigned long)page_address(page);
1761 1781
1762 return set_memory_nx(addr, numpages); 1782 return set_memory_nx(addr, numpages);
1763 } 1783 }
1764 EXPORT_SYMBOL(set_pages_nx); 1784 EXPORT_SYMBOL(set_pages_nx);
1765 1785
1766 int set_pages_ro(struct page *page, int numpages) 1786 int set_pages_ro(struct page *page, int numpages)
1767 { 1787 {
1768 unsigned long addr = (unsigned long)page_address(page); 1788 unsigned long addr = (unsigned long)page_address(page);
1769 1789
1770 return set_memory_ro(addr, numpages); 1790 return set_memory_ro(addr, numpages);
1771 } 1791 }
1772 1792
1773 int set_pages_rw(struct page *page, int numpages) 1793 int set_pages_rw(struct page *page, int numpages)
1774 { 1794 {
1775 unsigned long addr = (unsigned long)page_address(page); 1795 unsigned long addr = (unsigned long)page_address(page);
1776 1796
1777 return set_memory_rw(addr, numpages); 1797 return set_memory_rw(addr, numpages);
1778 } 1798 }
1779 1799
1780 #ifdef CONFIG_DEBUG_PAGEALLOC 1800 #ifdef CONFIG_DEBUG_PAGEALLOC
1781 1801
1782 static int __set_pages_p(struct page *page, int numpages) 1802 static int __set_pages_p(struct page *page, int numpages)
1783 { 1803 {
1784 unsigned long tempaddr = (unsigned long) page_address(page); 1804 unsigned long tempaddr = (unsigned long) page_address(page);
1785 struct cpa_data cpa = { .vaddr = &tempaddr, 1805 struct cpa_data cpa = { .vaddr = &tempaddr,
1786 .pgd = NULL, 1806 .pgd = NULL,
1787 .numpages = numpages, 1807 .numpages = numpages,
1788 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1808 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1789 .mask_clr = __pgprot(0), 1809 .mask_clr = __pgprot(0),
1790 .flags = 0}; 1810 .flags = 0};
1791 1811
1792 /* 1812 /*
1793 * No alias checking needed for setting the present flag; otherwise, 1813 * No alias checking needed for setting the present flag; otherwise,
1794 * we may need to break large pages for 64-bit kernel text 1814 * we may need to break large pages for 64-bit kernel text
1795 * mappings (this adds to complexity if we want to do this from 1815 * mappings (this adds to complexity if we want to do this from
1796 * atomic context especially). Let's keep it simple! 1816 * atomic context especially). Let's keep it simple!
1797 */ 1817 */
1798 return __change_page_attr_set_clr(&cpa, 0); 1818 return __change_page_attr_set_clr(&cpa, 0);
1799 } 1819 }
1800 1820
1801 static int __set_pages_np(struct page *page, int numpages) 1821 static int __set_pages_np(struct page *page, int numpages)
1802 { 1822 {
1803 unsigned long tempaddr = (unsigned long) page_address(page); 1823 unsigned long tempaddr = (unsigned long) page_address(page);
1804 struct cpa_data cpa = { .vaddr = &tempaddr, 1824 struct cpa_data cpa = { .vaddr = &tempaddr,
1805 .pgd = NULL, 1825 .pgd = NULL,
1806 .numpages = numpages, 1826 .numpages = numpages,
1807 .mask_set = __pgprot(0), 1827 .mask_set = __pgprot(0),
1808 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1828 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1809 .flags = 0}; 1829 .flags = 0};
1810 1830
1811 /* 1831 /*
1812 * No alias checking needed for clearing the present flag; otherwise, 1832 * No alias checking needed for clearing the present flag; otherwise,
1813 * we may need to break large pages for 64-bit kernel text 1833 * we may need to break large pages for 64-bit kernel text
1814 * mappings (this adds to complexity if we want to do this from 1834 * mappings (this adds to complexity if we want to do this from
1815 * atomic context especially). Let's keep it simple! 1835 * atomic context especially). Let's keep it simple!
1816 */ 1836 */
1817 return __change_page_attr_set_clr(&cpa, 0); 1837 return __change_page_attr_set_clr(&cpa, 0);
1818 } 1838 }
1819 1839
1820 void __kernel_map_pages(struct page *page, int numpages, int enable) 1840 void __kernel_map_pages(struct page *page, int numpages, int enable)
1821 { 1841 {
1822 if (PageHighMem(page)) 1842 if (PageHighMem(page))
1823 return; 1843 return;
1824 if (!enable) { 1844 if (!enable) {
1825 debug_check_no_locks_freed(page_address(page), 1845 debug_check_no_locks_freed(page_address(page),
1826 numpages * PAGE_SIZE); 1846 numpages * PAGE_SIZE);
1827 } 1847 }
1828 1848
1829 /* 1849 /*
1830 * The return value is ignored as the calls cannot fail. 1850 * The return value is ignored as the calls cannot fail.
1831 * Large pages for identity mappings are not used at boot time 1851 * Large pages for identity mappings are not used at boot time
1832 * and hence no memory allocations during large page split. 1852 * and hence no memory allocations during large page split.
1833 */ 1853 */
1834 if (enable) 1854 if (enable)
1835 __set_pages_p(page, numpages); 1855 __set_pages_p(page, numpages);
1836 else 1856 else
1837 __set_pages_np(page, numpages); 1857 __set_pages_np(page, numpages);
1838 1858
1839 /* 1859 /*
1840 * We should perform an IPI and flush all tlbs, 1860 * We should perform an IPI and flush all tlbs,
1841 * but that can deadlock, so flush only the current cpu: 1861 * but that can deadlock, so flush only the current cpu:
1842 */ 1862 */
1843 __flush_tlb_all(); 1863 __flush_tlb_all();
1844 1864
1845 arch_flush_lazy_mmu_mode(); 1865 arch_flush_lazy_mmu_mode();
1846 } 1866 }
1847 1867
1848 #ifdef CONFIG_HIBERNATION 1868 #ifdef CONFIG_HIBERNATION
1849 1869
1850 bool kernel_page_present(struct page *page) 1870 bool kernel_page_present(struct page *page)
1851 { 1871 {
1852 unsigned int level; 1872 unsigned int level;
1853 pte_t *pte; 1873 pte_t *pte;
1854 1874
1855 if (PageHighMem(page)) 1875 if (PageHighMem(page))
1856 return false; 1876 return false;
1857 1877
1858 pte = lookup_address((unsigned long)page_address(page), &level); 1878 pte = lookup_address((unsigned long)page_address(page), &level);
1859 return (pte_val(*pte) & _PAGE_PRESENT); 1879 return (pte_val(*pte) & _PAGE_PRESENT);
1860 } 1880 }
1861 1881
1862 #endif /* CONFIG_HIBERNATION */ 1882 #endif /* CONFIG_HIBERNATION */
1863 1883
1864 #endif /* CONFIG_DEBUG_PAGEALLOC */ 1884 #endif /* CONFIG_DEBUG_PAGEALLOC */
1865 1885
1866 int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, 1886 int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
1867 unsigned numpages, unsigned long page_flags) 1887 unsigned numpages, unsigned long page_flags)
1868 { 1888 {
1869 int retval = -EINVAL; 1889 int retval = -EINVAL;
1870 1890
1871 struct cpa_data cpa = { 1891 struct cpa_data cpa = {
1872 .vaddr = &address, 1892 .vaddr = &address,
1873 .pfn = pfn, 1893 .pfn = pfn,
1874 .pgd = pgd, 1894 .pgd = pgd,
1875 .numpages = numpages, 1895 .numpages = numpages,
1876 .mask_set = __pgprot(0), 1896 .mask_set = __pgprot(0),
1877 .mask_clr = __pgprot(0), 1897 .mask_clr = __pgprot(0),
1878 .flags = 0, 1898 .flags = 0,
1879 }; 1899 };
1880 1900
1881 if (!(__supported_pte_mask & _PAGE_NX)) 1901 if (!(__supported_pte_mask & _PAGE_NX))
1882 goto out; 1902 goto out;
1883 1903
1884 if (!(page_flags & _PAGE_NX)) 1904 if (!(page_flags & _PAGE_NX))
1885 cpa.mask_clr = __pgprot(_PAGE_NX); 1905 cpa.mask_clr = __pgprot(_PAGE_NX);
1886 1906
1887 cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); 1907 cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
1888 1908
1889 retval = __change_page_attr_set_clr(&cpa, 0); 1909 retval = __change_page_attr_set_clr(&cpa, 0);
1890 __flush_tlb_all(); 1910 __flush_tlb_all();
1891 1911
1892 out: 1912 out:
1893 return retval; 1913 return retval;
1894 } 1914 }
1895 1915
1896 void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address, 1916 void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address,
1897 unsigned numpages) 1917 unsigned numpages)
1898 { 1918 {
1899 unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT)); 1919 unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT));
1900 } 1920 }
1901 1921
1902 /* 1922 /*
1903 * The testcases use internal knowledge of the implementation that shouldn't 1923 * The testcases use internal knowledge of the implementation that shouldn't
1904 * be exposed to the rest of the kernel. Include these directly here. 1924 * be exposed to the rest of the kernel. Include these directly here.
1905 */ 1925 */
1906 #ifdef CONFIG_CPA_DEBUG 1926 #ifdef CONFIG_CPA_DEBUG
1907 #include "pageattr-test.c" 1927 #include "pageattr-test.c"
1908 #endif 1928 #endif
1909 1929
1 /* 1 /*
2 * Xen mmu operations 2 * Xen mmu operations
3 * 3 *
4 * This file contains the various mmu fetch and update operations. 4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the 5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns. 6 * domain's pfn and the overall machine mfns.
7 * 7 *
8 * Xen allows guests to directly update the pagetable, in a controlled 8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion. In other words, the guest modifies the same pagetable 9 * fashion. In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having 10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable. 11 * a separate shadow pagetable.
12 * 12 *
13 * In order to allow this, it falls on the guest domain to map its 13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear 14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can 15 * address - into a real "machine address" which the CPU's MMU can
16 * use. 16 * use.
17 * 17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be 18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable. When creating a new 19 * inserted directly into the pagetable. When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, 20 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts 21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn. 22 * the mfn back into a pfn.
23 * 23 *
24 * The other constraint is that all pages which make up a pagetable 24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest. This prevents uncontrolled 25 * must be mapped read-only in the guest. This prevents uncontrolled
26 * guest updates to the pagetable. Xen strictly enforces this, and 26 * guest updates to the pagetable. Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a 27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a 28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable. 29 * pagetable.
30 * 30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen 31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on. 32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow. The solution is to "pin" a 33 * Naturally, this is quite slow. The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even 34 * pagetable, which enforces all the constraints on the pagetable even
35 * when it is not actively in use. This means that Xen can be assured 35 * when it is not actively in use. This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't 36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it. 37 * need to revalidate it.
38 * 38 *
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40 */ 40 */
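To make the pfn/mfn split described above concrete, here is a tiny user-space model (entirely illustrative, with an invented four-entry p2m table): pagetable entries are always built from the machine frame number looked up through the p2m, never from the guest-local pfn itself.

/* Conceptual sketch of the pfn -> mfn translation used when building
 * ptes; array contents and protection bits are made up. */
#include <stdio.h>

#define PAGE_SHIFT 12

static unsigned long p2m[4] = { 0x1a3, 0x77, 0x2001, 0x9 }; /* pfn -> mfn */

static unsigned long make_pte(unsigned long pfn, unsigned long prot)
{
        return (p2m[pfn] << PAGE_SHIFT) | prot; /* the pte holds the mfn */
}

int main(void)
{
        printf("pte for pfn 2: %#lx\n", make_pte(2, 0x63));
        return 0;
}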
41 #include <linux/sched.h> 41 #include <linux/sched.h>
42 #include <linux/highmem.h> 42 #include <linux/highmem.h>
43 #include <linux/debugfs.h> 43 #include <linux/debugfs.h>
44 #include <linux/bug.h> 44 #include <linux/bug.h>
45 #include <linux/vmalloc.h> 45 #include <linux/vmalloc.h>
46 #include <linux/module.h> 46 #include <linux/module.h>
47 #include <linux/gfp.h> 47 #include <linux/gfp.h>
48 #include <linux/memblock.h> 48 #include <linux/memblock.h>
49 #include <linux/seq_file.h> 49 #include <linux/seq_file.h>
50 #include <linux/crash_dump.h> 50 #include <linux/crash_dump.h>
51 51
52 #include <trace/events/xen.h> 52 #include <trace/events/xen.h>
53 53
54 #include <asm/pgtable.h> 54 #include <asm/pgtable.h>
55 #include <asm/tlbflush.h> 55 #include <asm/tlbflush.h>
56 #include <asm/fixmap.h> 56 #include <asm/fixmap.h>
57 #include <asm/mmu_context.h> 57 #include <asm/mmu_context.h>
58 #include <asm/setup.h> 58 #include <asm/setup.h>
59 #include <asm/paravirt.h> 59 #include <asm/paravirt.h>
60 #include <asm/e820.h> 60 #include <asm/e820.h>
61 #include <asm/linkage.h> 61 #include <asm/linkage.h>
62 #include <asm/page.h> 62 #include <asm/page.h>
63 #include <asm/init.h> 63 #include <asm/init.h>
64 #include <asm/pat.h> 64 #include <asm/pat.h>
65 #include <asm/smp.h> 65 #include <asm/smp.h>
66 66
67 #include <asm/xen/hypercall.h> 67 #include <asm/xen/hypercall.h>
68 #include <asm/xen/hypervisor.h> 68 #include <asm/xen/hypervisor.h>
69 69
70 #include <xen/xen.h> 70 #include <xen/xen.h>
71 #include <xen/page.h> 71 #include <xen/page.h>
72 #include <xen/interface/xen.h> 72 #include <xen/interface/xen.h>
73 #include <xen/interface/hvm/hvm_op.h> 73 #include <xen/interface/hvm/hvm_op.h>
74 #include <xen/interface/version.h> 74 #include <xen/interface/version.h>
75 #include <xen/interface/memory.h> 75 #include <xen/interface/memory.h>
76 #include <xen/hvc-console.h> 76 #include <xen/hvc-console.h>
77 77
78 #include "multicalls.h" 78 #include "multicalls.h"
79 #include "mmu.h" 79 #include "mmu.h"
80 #include "debugfs.h" 80 #include "debugfs.h"
81 81
82 /* 82 /*
83 * Protects atomic reservation decrease/increase against concurrent increases. 83 * Protects atomic reservation decrease/increase against concurrent increases.
84 * Also protects non-atomic updates of current_pages and balloon lists. 84 * Also protects non-atomic updates of current_pages and balloon lists.
85 */ 85 */
86 DEFINE_SPINLOCK(xen_reservation_lock); 86 DEFINE_SPINLOCK(xen_reservation_lock);
87 87
88 #ifdef CONFIG_X86_32 88 #ifdef CONFIG_X86_32
89 /* 89 /*
90 * Identity map, in addition to plain kernel map. This needs to be 90 * Identity map, in addition to plain kernel map. This needs to be
91 * large enough to allocate the page table pages needed to map the rest. 91 * large enough to allocate the page table pages needed to map the rest.
92 * Each page can map 2MB. 92 * Each page can map 2MB.
93 */ 93 */
94 #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) 94 #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
95 static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); 95 static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
96 #endif 96 #endif
97 #ifdef CONFIG_X86_64 97 #ifdef CONFIG_X86_64
98 /* l3 pud for userspace vsyscall mapping */ 98 /* l3 pud for userspace vsyscall mapping */
99 static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; 99 static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
100 #endif /* CONFIG_X86_64 */ 100 #endif /* CONFIG_X86_64 */
101 101
102 /* 102 /*
103 * Note about cr3 (pagetable base) values: 103 * Note about cr3 (pagetable base) values:
104 * 104 *
105 * xen_cr3 contains the current logical cr3 value; it contains the 105 * xen_cr3 contains the current logical cr3 value; it contains the
106 * last set cr3. This may not be the current effective cr3, because 106 * last set cr3. This may not be the current effective cr3, because
107 * its update may be being lazily deferred. However, a vcpu looking 107 * its update may be being lazily deferred. However, a vcpu looking
108 * at its own cr3 can use this value knowing that everything will 108 * at its own cr3 can use this value knowing that everything will
109 * be self-consistent. 109 * be self-consistent.
110 * 110 *
111 * xen_current_cr3 contains the actual vcpu cr3; it is set once the 111 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
112 * hypercall to set the vcpu cr3 is complete (so it may be a little 112 * hypercall to set the vcpu cr3 is complete (so it may be a little
113 * out of date, but it will never be set early). If one vcpu is 113 * out of date, but it will never be set early). If one vcpu is
114 * looking at another vcpu's cr3 value, it should use this variable. 114 * looking at another vcpu's cr3 value, it should use this variable.
115 */ 115 */
116 DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ 116 DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
117 DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ 117 DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
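Following the note above, a helper that inspects another vcpu's pagetable base would read xen_current_cr3 rather than xen_cr3. This is only an illustrative sketch (not in the file), assuming it sits where the per-CPU variables above are visible:

/* Hypothetical helper: 'cpu' is a remote CPU number. */
#include <linux/percpu.h>

static inline unsigned long example_vcpu_cr3(int cpu)
{
        /* only valid once the set-cr3 hypercall has completed */
        return per_cpu(xen_current_cr3, cpu);
}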
118 118
119 119
120 /* 120 /*
121 * Just beyond the highest usermode address. STACK_TOP_MAX has a 121 * Just beyond the highest usermode address. STACK_TOP_MAX has a
122 * redzone above it, so round it up to a PGD boundary. 122 * redzone above it, so round it up to a PGD boundary.
123 */ 123 */
124 #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) 124 #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
125 125
126 unsigned long arbitrary_virt_to_mfn(void *vaddr) 126 unsigned long arbitrary_virt_to_mfn(void *vaddr)
127 { 127 {
128 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); 128 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
129 129
130 return PFN_DOWN(maddr.maddr); 130 return PFN_DOWN(maddr.maddr);
131 } 131 }
132 132
133 xmaddr_t arbitrary_virt_to_machine(void *vaddr) 133 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
134 { 134 {
135 unsigned long address = (unsigned long)vaddr; 135 unsigned long address = (unsigned long)vaddr;
136 unsigned int level; 136 unsigned int level;
137 pte_t *pte; 137 pte_t *pte;
138 unsigned offset; 138 unsigned offset;
139 139
140 /* 140 /*
141 * if the PFN is in the linear mapped vaddr range, we can just use 141 * if the PFN is in the linear mapped vaddr range, we can just use
142 * the (quick) virt_to_machine() p2m lookup 142 * the (quick) virt_to_machine() p2m lookup
143 */ 143 */
144 if (virt_addr_valid(vaddr)) 144 if (virt_addr_valid(vaddr))
145 return virt_to_machine(vaddr); 145 return virt_to_machine(vaddr);
146 146
147 /* otherwise we have to do a (slower) full page-table walk */ 147 /* otherwise we have to do a (slower) full page-table walk */
148 148
149 pte = lookup_address(address, &level); 149 pte = lookup_address(address, &level);
150 BUG_ON(pte == NULL); 150 BUG_ON(pte == NULL);
151 offset = address & ~PAGE_MASK; 151 offset = address & ~PAGE_MASK;
152 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); 152 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
153 } 153 }
154 EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); 154 EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
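A hypothetical caller of the helper exported above: a vmalloc()ed buffer lies outside the linear mapping, so virt_to_machine() cannot be used and the full pagetable walk is required. The chosen header is an assumption:

/* Illustrative only: convert a vmalloc address into a machine address,
 * e.g. for use as a hypercall argument. */
#include <linux/vmalloc.h>
#include <asm/xen/page.h>

static phys_addr_t example_machine_addr(void *vmalloc_buf)
{
        xmaddr_t m = arbitrary_virt_to_machine(vmalloc_buf);

        return m.maddr;         /* machine address of the buffer */
}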
155 155
156 void make_lowmem_page_readonly(void *vaddr) 156 void make_lowmem_page_readonly(void *vaddr)
157 { 157 {
158 pte_t *pte, ptev; 158 pte_t *pte, ptev;
159 unsigned long address = (unsigned long)vaddr; 159 unsigned long address = (unsigned long)vaddr;
160 unsigned int level; 160 unsigned int level;
161 161
162 pte = lookup_address(address, &level); 162 pte = lookup_address(address, &level);
163 if (pte == NULL) 163 if (pte == NULL)
164 return; /* vaddr missing */ 164 return; /* vaddr missing */
165 165
166 ptev = pte_wrprotect(*pte); 166 ptev = pte_wrprotect(*pte);
167 167
168 if (HYPERVISOR_update_va_mapping(address, ptev, 0)) 168 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
169 BUG(); 169 BUG();
170 } 170 }
171 171
172 void make_lowmem_page_readwrite(void *vaddr) 172 void make_lowmem_page_readwrite(void *vaddr)
173 { 173 {
174 pte_t *pte, ptev; 174 pte_t *pte, ptev;
175 unsigned long address = (unsigned long)vaddr; 175 unsigned long address = (unsigned long)vaddr;
176 unsigned int level; 176 unsigned int level;
177 177
178 pte = lookup_address(address, &level); 178 pte = lookup_address(address, &level);
179 if (pte == NULL) 179 if (pte == NULL)
180 return; /* vaddr missing */ 180 return; /* vaddr missing */
181 181
182 ptev = pte_mkwrite(*pte); 182 ptev = pte_mkwrite(*pte);
183 183
184 if (HYPERVISOR_update_va_mapping(address, ptev, 0)) 184 if (HYPERVISOR_update_va_mapping(address, ptev, 0))
185 BUG(); 185 BUG();
186 } 186 }
187 187
188 188
189 static bool xen_page_pinned(void *ptr) 189 static bool xen_page_pinned(void *ptr)
190 { 190 {
191 struct page *page = virt_to_page(ptr); 191 struct page *page = virt_to_page(ptr);
192 192
193 return PagePinned(page); 193 return PagePinned(page);
194 } 194 }
195 195
196 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) 196 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
197 { 197 {
198 struct multicall_space mcs; 198 struct multicall_space mcs;
199 struct mmu_update *u; 199 struct mmu_update *u;
200 200
201 trace_xen_mmu_set_domain_pte(ptep, pteval, domid); 201 trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
202 202
203 mcs = xen_mc_entry(sizeof(*u)); 203 mcs = xen_mc_entry(sizeof(*u));
204 u = mcs.args; 204 u = mcs.args;
205 205
206 /* ptep might be kmapped when using 32-bit HIGHPTE */ 206 /* ptep might be kmapped when using 32-bit HIGHPTE */
207 u->ptr = virt_to_machine(ptep).maddr; 207 u->ptr = virt_to_machine(ptep).maddr;
208 u->val = pte_val_ma(pteval); 208 u->val = pte_val_ma(pteval);
209 209
210 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); 210 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
211 211
212 xen_mc_issue(PARAVIRT_LAZY_MMU); 212 xen_mc_issue(PARAVIRT_LAZY_MMU);
213 } 213 }
214 EXPORT_SYMBOL_GPL(xen_set_domain_pte); 214 EXPORT_SYMBOL_GPL(xen_set_domain_pte);
215 215
216 static void xen_extend_mmu_update(const struct mmu_update *update) 216 static void xen_extend_mmu_update(const struct mmu_update *update)
217 { 217 {
218 struct multicall_space mcs; 218 struct multicall_space mcs;
219 struct mmu_update *u; 219 struct mmu_update *u;
220 220
221 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 221 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
222 222
223 if (mcs.mc != NULL) { 223 if (mcs.mc != NULL) {
224 mcs.mc->args[1]++; 224 mcs.mc->args[1]++;
225 } else { 225 } else {
226 mcs = __xen_mc_entry(sizeof(*u)); 226 mcs = __xen_mc_entry(sizeof(*u));
227 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 227 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
228 } 228 }
229 229
230 u = mcs.args; 230 u = mcs.args;
231 *u = *update; 231 *u = *update;
232 } 232 }
233 233
234 static void xen_extend_mmuext_op(const struct mmuext_op *op) 234 static void xen_extend_mmuext_op(const struct mmuext_op *op)
235 { 235 {
236 struct multicall_space mcs; 236 struct multicall_space mcs;
237 struct mmuext_op *u; 237 struct mmuext_op *u;
238 238
239 mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u)); 239 mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
240 240
241 if (mcs.mc != NULL) { 241 if (mcs.mc != NULL) {
242 mcs.mc->args[1]++; 242 mcs.mc->args[1]++;
243 } else { 243 } else {
244 mcs = __xen_mc_entry(sizeof(*u)); 244 mcs = __xen_mc_entry(sizeof(*u));
245 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 245 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
246 } 246 }
247 247
248 u = mcs.args; 248 u = mcs.args;
249 *u = *op; 249 *u = *op;
250 } 250 }
251 251
252 static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) 252 static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
253 { 253 {
254 struct mmu_update u; 254 struct mmu_update u;
255 255
256 preempt_disable(); 256 preempt_disable();
257 257
258 xen_mc_batch(); 258 xen_mc_batch();
259 259
260 /* ptr may be ioremapped for 64-bit pagetable setup */ 260 /* ptr may be ioremapped for 64-bit pagetable setup */
261 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 261 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
262 u.val = pmd_val_ma(val); 262 u.val = pmd_val_ma(val);
263 xen_extend_mmu_update(&u); 263 xen_extend_mmu_update(&u);
264 264
265 xen_mc_issue(PARAVIRT_LAZY_MMU); 265 xen_mc_issue(PARAVIRT_LAZY_MMU);
266 266
267 preempt_enable(); 267 preempt_enable();
268 } 268 }
269 269
270 static void xen_set_pmd(pmd_t *ptr, pmd_t val) 270 static void xen_set_pmd(pmd_t *ptr, pmd_t val)
271 { 271 {
272 trace_xen_mmu_set_pmd(ptr, val); 272 trace_xen_mmu_set_pmd(ptr, val);
273 273
274 /* If page is not pinned, we can just update the entry 274 /* If page is not pinned, we can just update the entry
275 directly */ 275 directly */
276 if (!xen_page_pinned(ptr)) { 276 if (!xen_page_pinned(ptr)) {
277 *ptr = val; 277 *ptr = val;
278 return; 278 return;
279 } 279 }
280 280
281 xen_set_pmd_hyper(ptr, val); 281 xen_set_pmd_hyper(ptr, val);
282 } 282 }
283 283
284 /* 284 /*
285 * Associate a virtual page frame with a given physical page frame 285 * Associate a virtual page frame with a given physical page frame
286 * and protection flags for that frame. 286 * and protection flags for that frame.
287 */ 287 */
288 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 288 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
289 { 289 {
290 set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); 290 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
291 } 291 }
292 292
293 static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) 293 static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
294 { 294 {
295 struct mmu_update u; 295 struct mmu_update u;
296 296
297 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) 297 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
298 return false; 298 return false;
299 299
300 xen_mc_batch(); 300 xen_mc_batch();
301 301
302 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; 302 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
303 u.val = pte_val_ma(pteval); 303 u.val = pte_val_ma(pteval);
304 xen_extend_mmu_update(&u); 304 xen_extend_mmu_update(&u);
305 305
306 xen_mc_issue(PARAVIRT_LAZY_MMU); 306 xen_mc_issue(PARAVIRT_LAZY_MMU);
307 307
308 return true; 308 return true;
309 } 309 }
310 310
311 static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) 311 static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
312 { 312 {
313 if (!xen_batched_set_pte(ptep, pteval)) { 313 if (!xen_batched_set_pte(ptep, pteval)) {
314 /* 314 /*
315 * Could call native_set_pte() here and trap and 315 * Could call native_set_pte() here and trap and
316 * emulate the PTE write but with 32-bit guests this 316 * emulate the PTE write but with 32-bit guests this
317 * needs two traps (one for each of the two 32-bit 317 * needs two traps (one for each of the two 32-bit
318 * words in the PTE) so do one hypercall directly 318 * words in the PTE) so do one hypercall directly
319 * instead. 319 * instead.
320 */ 320 */
321 struct mmu_update u; 321 struct mmu_update u;
322 322
323 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; 323 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
324 u.val = pte_val_ma(pteval); 324 u.val = pte_val_ma(pteval);
325 HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); 325 HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
326 } 326 }
327 } 327 }
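
The comment in __xen_set_pte() is a cost argument: on 32-bit PAE a pte_t is 64 bits wide, so a native store is really two 32-bit stores, and if the page is read-only each store would trap and be emulated separately, whereas one explicit mmu_update hypercall covers the whole entry. Below is a rough user-space sketch of that accounting; the counters and the union are invented, and no real faulting takes place.

#include <stdio.h>
#include <stdint.h>

/* Toy cost model: each store to a read-only PTE traps and is emulated,
 * while an explicit update costs one hypercall round trip. */
union toy_pte {
	uint64_t whole;
	uint32_t half[2];
};

static int traps, hypercalls;

static void trapped_write32(uint32_t *dst, uint32_t val)
{
	traps++;			/* one fault + emulation per store */
	*dst = val;
}

static void write_pte_native32(union toy_pte *pte, uint64_t val)
{
	/* 32-bit PAE: a 64-bit pte is written as two 32-bit words. */
	trapped_write32(&pte->half[0], (uint32_t)val);
	trapped_write32(&pte->half[1], (uint32_t)(val >> 32));
}

static void write_pte_hypercall(union toy_pte *pte, uint64_t val)
{
	hypercalls++;			/* one HYPERVISOR_mmu_update */
	pte->whole = val;
}

int main(void)
{
	union toy_pte pte = { 0 };

	write_pte_native32(&pte, 0x80000000000001ffULL);
	write_pte_hypercall(&pte, 0x80000000000001ffULL);
	printf("native: %d traps, hypercall: %d call(s)\n", traps, hypercalls);
	return 0;
}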
328 328
329 static void xen_set_pte(pte_t *ptep, pte_t pteval) 329 static void xen_set_pte(pte_t *ptep, pte_t pteval)
330 { 330 {
331 trace_xen_mmu_set_pte(ptep, pteval); 331 trace_xen_mmu_set_pte(ptep, pteval);
332 __xen_set_pte(ptep, pteval); 332 __xen_set_pte(ptep, pteval);
333 } 333 }
334 334
335 static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 335 static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
336 pte_t *ptep, pte_t pteval) 336 pte_t *ptep, pte_t pteval)
337 { 337 {
338 trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval); 338 trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
339 __xen_set_pte(ptep, pteval); 339 __xen_set_pte(ptep, pteval);
340 } 340 }
341 341
342 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, 342 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
343 unsigned long addr, pte_t *ptep) 343 unsigned long addr, pte_t *ptep)
344 { 344 {
345 /* Just return the pte as-is. We preserve the bits on commit */ 345 /* Just return the pte as-is. We preserve the bits on commit */
346 trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); 346 trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
347 return *ptep; 347 return *ptep;
348 } 348 }
349 349
350 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 350 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
351 pte_t *ptep, pte_t pte) 351 pte_t *ptep, pte_t pte)
352 { 352 {
353 struct mmu_update u; 353 struct mmu_update u;
354 354
355 trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); 355 trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
356 xen_mc_batch(); 356 xen_mc_batch();
357 357
358 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 358 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
359 u.val = pte_val_ma(pte); 359 u.val = pte_val_ma(pte);
360 xen_extend_mmu_update(&u); 360 xen_extend_mmu_update(&u);
361 361
362 xen_mc_issue(PARAVIRT_LAZY_MMU); 362 xen_mc_issue(PARAVIRT_LAZY_MMU);
363 } 363 }
364 364
365 /* Assume pteval_t is equivalent to all the other *val_t types. */ 365 /* Assume pteval_t is equivalent to all the other *val_t types. */
366 static pteval_t pte_mfn_to_pfn(pteval_t val) 366 static pteval_t pte_mfn_to_pfn(pteval_t val)
367 { 367 {
368 if (val & _PAGE_PRESENT) { 368 if (val & _PAGE_PRESENT) {
369 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 369 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
370 unsigned long pfn = mfn_to_pfn(mfn); 370 unsigned long pfn = mfn_to_pfn(mfn);
371 371
372 pteval_t flags = val & PTE_FLAGS_MASK; 372 pteval_t flags = val & PTE_FLAGS_MASK;
373 if (unlikely(pfn == ~0)) 373 if (unlikely(pfn == ~0))
374 val = flags & ~_PAGE_PRESENT; 374 val = flags & ~_PAGE_PRESENT;
375 else 375 else
376 val = ((pteval_t)pfn << PAGE_SHIFT) | flags; 376 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
377 } 377 }
378 378
379 return val; 379 return val;
380 } 380 }
381 381
382 static pteval_t pte_pfn_to_mfn(pteval_t val) 382 static pteval_t pte_pfn_to_mfn(pteval_t val)
383 { 383 {
384 if (val & _PAGE_PRESENT) { 384 if (val & _PAGE_PRESENT) {
385 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 385 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
386 pteval_t flags = val & PTE_FLAGS_MASK; 386 pteval_t flags = val & PTE_FLAGS_MASK;
387 unsigned long mfn; 387 unsigned long mfn;
388 388
389 if (!xen_feature(XENFEAT_auto_translated_physmap)) 389 if (!xen_feature(XENFEAT_auto_translated_physmap))
390 mfn = get_phys_to_machine(pfn); 390 mfn = __pfn_to_mfn(pfn);
391 else 391 else
392 mfn = pfn; 392 mfn = pfn;
393 /* 393 /*
394 * If there's no mfn for the pfn, then just create an 394 * If there's no mfn for the pfn, then just create an
395 * empty non-present pte. Unfortunately this loses 395 * empty non-present pte. Unfortunately this loses
396 * information about the original pfn, so 396 * information about the original pfn, so
397 * pte_mfn_to_pfn is asymmetric. 397 * pte_mfn_to_pfn is asymmetric.
398 */ 398 */
399 if (unlikely(mfn == INVALID_P2M_ENTRY)) { 399 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
400 mfn = 0; 400 mfn = 0;
401 flags = 0; 401 flags = 0;
402 } else 402 } else
403 mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); 403 mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
404 val = ((pteval_t)mfn << PAGE_SHIFT) | flags; 404 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
405 } 405 }
406 406
407 return val; 407 return val;
408 } 408 }
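
The asymmetry the comment describes is easy to reproduce with two toy lookup tables standing in for the p2m and m2p arrays: once a missing mfn has been collapsed to an empty value, the original pfn can no longer be recovered. The sizes and values below are invented for the example.

#include <stdio.h>

#define TOY_INVALID (~0UL)
#define TOY_PAGES   4
#define TOY_MFNS    16

/* Toy p2m (pfn -> mfn) with a hole at pfn 2, plus the inverse m2p. */
static unsigned long toy_p2m[TOY_PAGES] = { 7, 5, TOY_INVALID, 9 };
static unsigned long toy_m2p[TOY_MFNS];

/* Like pte_pfn_to_mfn(): a missing mfn becomes an empty value,
 * which loses the original pfn. */
static unsigned long toy_pfn_to_pteval(unsigned long pfn)
{
	unsigned long mfn = toy_p2m[pfn];

	return (mfn == TOY_INVALID) ? 0 : mfn;	/* flags omitted */
}

int main(void)
{
	for (unsigned long m = 0; m < TOY_MFNS; m++)
		toy_m2p[m] = TOY_INVALID;
	for (unsigned long p = 0; p < TOY_PAGES; p++)
		if (toy_p2m[p] != TOY_INVALID)
			toy_m2p[toy_p2m[p]] = p;

	for (unsigned long p = 0; p < TOY_PAGES; p++) {
		unsigned long pteval = toy_pfn_to_pteval(p);
		unsigned long back = pteval ? toy_m2p[pteval] : TOY_INVALID;

		printf("pfn %lu -> pteval %lu -> %s\n", p, pteval,
		       back == p ? "round-trips" : "original pfn lost");
	}
	return 0;
}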
409 409
410 __visible pteval_t xen_pte_val(pte_t pte) 410 __visible pteval_t xen_pte_val(pte_t pte)
411 { 411 {
412 pteval_t pteval = pte.pte; 412 pteval_t pteval = pte.pte;
413 413
414 return pte_mfn_to_pfn(pteval); 414 return pte_mfn_to_pfn(pteval);
415 } 415 }
416 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); 416 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
417 417
418 __visible pgdval_t xen_pgd_val(pgd_t pgd) 418 __visible pgdval_t xen_pgd_val(pgd_t pgd)
419 { 419 {
420 return pte_mfn_to_pfn(pgd.pgd); 420 return pte_mfn_to_pfn(pgd.pgd);
421 } 421 }
422 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); 422 PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
423 423
424 __visible pte_t xen_make_pte(pteval_t pte) 424 __visible pte_t xen_make_pte(pteval_t pte)
425 { 425 {
426 pte = pte_pfn_to_mfn(pte); 426 pte = pte_pfn_to_mfn(pte);
427 427
428 return native_make_pte(pte); 428 return native_make_pte(pte);
429 } 429 }
430 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); 430 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
431 431
432 __visible pgd_t xen_make_pgd(pgdval_t pgd) 432 __visible pgd_t xen_make_pgd(pgdval_t pgd)
433 { 433 {
434 pgd = pte_pfn_to_mfn(pgd); 434 pgd = pte_pfn_to_mfn(pgd);
435 return native_make_pgd(pgd); 435 return native_make_pgd(pgd);
436 } 436 }
437 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); 437 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
438 438
439 __visible pmdval_t xen_pmd_val(pmd_t pmd) 439 __visible pmdval_t xen_pmd_val(pmd_t pmd)
440 { 440 {
441 return pte_mfn_to_pfn(pmd.pmd); 441 return pte_mfn_to_pfn(pmd.pmd);
442 } 442 }
443 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); 443 PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
444 444
445 static void xen_set_pud_hyper(pud_t *ptr, pud_t val) 445 static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
446 { 446 {
447 struct mmu_update u; 447 struct mmu_update u;
448 448
449 preempt_disable(); 449 preempt_disable();
450 450
451 xen_mc_batch(); 451 xen_mc_batch();
452 452
453 /* ptr may be ioremapped for 64-bit pagetable setup */ 453 /* ptr may be ioremapped for 64-bit pagetable setup */
454 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 454 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
455 u.val = pud_val_ma(val); 455 u.val = pud_val_ma(val);
456 xen_extend_mmu_update(&u); 456 xen_extend_mmu_update(&u);
457 457
458 xen_mc_issue(PARAVIRT_LAZY_MMU); 458 xen_mc_issue(PARAVIRT_LAZY_MMU);
459 459
460 preempt_enable(); 460 preempt_enable();
461 } 461 }
462 462
463 static void xen_set_pud(pud_t *ptr, pud_t val) 463 static void xen_set_pud(pud_t *ptr, pud_t val)
464 { 464 {
465 trace_xen_mmu_set_pud(ptr, val); 465 trace_xen_mmu_set_pud(ptr, val);
466 466
467 /* If page is not pinned, we can just update the entry 467 /* If page is not pinned, we can just update the entry
468 directly */ 468 directly */
469 if (!xen_page_pinned(ptr)) { 469 if (!xen_page_pinned(ptr)) {
470 *ptr = val; 470 *ptr = val;
471 return; 471 return;
472 } 472 }
473 473
474 xen_set_pud_hyper(ptr, val); 474 xen_set_pud_hyper(ptr, val);
475 } 475 }
476 476
477 #ifdef CONFIG_X86_PAE 477 #ifdef CONFIG_X86_PAE
478 static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 478 static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
479 { 479 {
480 trace_xen_mmu_set_pte_atomic(ptep, pte); 480 trace_xen_mmu_set_pte_atomic(ptep, pte);
481 set_64bit((u64 *)ptep, native_pte_val(pte)); 481 set_64bit((u64 *)ptep, native_pte_val(pte));
482 } 482 }
483 483
484 static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 484 static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
485 { 485 {
486 trace_xen_mmu_pte_clear(mm, addr, ptep); 486 trace_xen_mmu_pte_clear(mm, addr, ptep);
487 if (!xen_batched_set_pte(ptep, native_make_pte(0))) 487 if (!xen_batched_set_pte(ptep, native_make_pte(0)))
488 native_pte_clear(mm, addr, ptep); 488 native_pte_clear(mm, addr, ptep);
489 } 489 }
490 490
491 static void xen_pmd_clear(pmd_t *pmdp) 491 static void xen_pmd_clear(pmd_t *pmdp)
492 { 492 {
493 trace_xen_mmu_pmd_clear(pmdp); 493 trace_xen_mmu_pmd_clear(pmdp);
494 set_pmd(pmdp, __pmd(0)); 494 set_pmd(pmdp, __pmd(0));
495 } 495 }
496 #endif /* CONFIG_X86_PAE */ 496 #endif /* CONFIG_X86_PAE */
497 497
498 __visible pmd_t xen_make_pmd(pmdval_t pmd) 498 __visible pmd_t xen_make_pmd(pmdval_t pmd)
499 { 499 {
500 pmd = pte_pfn_to_mfn(pmd); 500 pmd = pte_pfn_to_mfn(pmd);
501 return native_make_pmd(pmd); 501 return native_make_pmd(pmd);
502 } 502 }
503 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); 503 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
504 504
505 #if PAGETABLE_LEVELS == 4 505 #if PAGETABLE_LEVELS == 4
506 __visible pudval_t xen_pud_val(pud_t pud) 506 __visible pudval_t xen_pud_val(pud_t pud)
507 { 507 {
508 return pte_mfn_to_pfn(pud.pud); 508 return pte_mfn_to_pfn(pud.pud);
509 } 509 }
510 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); 510 PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
511 511
512 __visible pud_t xen_make_pud(pudval_t pud) 512 __visible pud_t xen_make_pud(pudval_t pud)
513 { 513 {
514 pud = pte_pfn_to_mfn(pud); 514 pud = pte_pfn_to_mfn(pud);
515 515
516 return native_make_pud(pud); 516 return native_make_pud(pud);
517 } 517 }
518 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); 518 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
519 519
520 static pgd_t *xen_get_user_pgd(pgd_t *pgd) 520 static pgd_t *xen_get_user_pgd(pgd_t *pgd)
521 { 521 {
522 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); 522 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
523 unsigned offset = pgd - pgd_page; 523 unsigned offset = pgd - pgd_page;
524 pgd_t *user_ptr = NULL; 524 pgd_t *user_ptr = NULL;
525 525
526 if (offset < pgd_index(USER_LIMIT)) { 526 if (offset < pgd_index(USER_LIMIT)) {
527 struct page *page = virt_to_page(pgd_page); 527 struct page *page = virt_to_page(pgd_page);
528 user_ptr = (pgd_t *)page->private; 528 user_ptr = (pgd_t *)page->private;
529 if (user_ptr) 529 if (user_ptr)
530 user_ptr += offset; 530 user_ptr += offset;
531 } 531 }
532 532
533 return user_ptr; 533 return user_ptr;
534 } 534 }
535 535
536 static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) 536 static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
537 { 537 {
538 struct mmu_update u; 538 struct mmu_update u;
539 539
540 u.ptr = virt_to_machine(ptr).maddr; 540 u.ptr = virt_to_machine(ptr).maddr;
541 u.val = pgd_val_ma(val); 541 u.val = pgd_val_ma(val);
542 xen_extend_mmu_update(&u); 542 xen_extend_mmu_update(&u);
543 } 543 }
544 544
545 /* 545 /*
546 * Raw hypercall-based set_pgd, intended for use in early boot before 546 * Raw hypercall-based set_pgd, intended for use in early boot before

547 * there's a page structure. This implies: 547 * there's a page structure. This implies:
548 * 1. The only existing pagetable is the kernel's 548 * 1. The only existing pagetable is the kernel's
549 * 2. It is always pinned 549 * 2. It is always pinned
550 * 3. It has no user pagetable attached to it 550 * 3. It has no user pagetable attached to it
551 */ 551 */
552 static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) 552 static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
553 { 553 {
554 preempt_disable(); 554 preempt_disable();
555 555
556 xen_mc_batch(); 556 xen_mc_batch();
557 557
558 __xen_set_pgd_hyper(ptr, val); 558 __xen_set_pgd_hyper(ptr, val);
559 559
560 xen_mc_issue(PARAVIRT_LAZY_MMU); 560 xen_mc_issue(PARAVIRT_LAZY_MMU);
561 561
562 preempt_enable(); 562 preempt_enable();
563 } 563 }
564 564
565 static void xen_set_pgd(pgd_t *ptr, pgd_t val) 565 static void xen_set_pgd(pgd_t *ptr, pgd_t val)
566 { 566 {
567 pgd_t *user_ptr = xen_get_user_pgd(ptr); 567 pgd_t *user_ptr = xen_get_user_pgd(ptr);
568 568
569 trace_xen_mmu_set_pgd(ptr, user_ptr, val); 569 trace_xen_mmu_set_pgd(ptr, user_ptr, val);
570 570
571 /* If page is not pinned, we can just update the entry 571 /* If page is not pinned, we can just update the entry
572 directly */ 572 directly */
573 if (!xen_page_pinned(ptr)) { 573 if (!xen_page_pinned(ptr)) {
574 *ptr = val; 574 *ptr = val;
575 if (user_ptr) { 575 if (user_ptr) {
576 WARN_ON(xen_page_pinned(user_ptr)); 576 WARN_ON(xen_page_pinned(user_ptr));
577 *user_ptr = val; 577 *user_ptr = val;
578 } 578 }
579 return; 579 return;
580 } 580 }
581 581
582 /* If it's pinned, then we can at least batch the kernel and 582 /* If it's pinned, then we can at least batch the kernel and
583 user updates together. */ 583 user updates together. */
584 xen_mc_batch(); 584 xen_mc_batch();
585 585
586 __xen_set_pgd_hyper(ptr, val); 586 __xen_set_pgd_hyper(ptr, val);
587 if (user_ptr) 587 if (user_ptr)
588 __xen_set_pgd_hyper(user_ptr, val); 588 __xen_set_pgd_hyper(user_ptr, val);
589 589
590 xen_mc_issue(PARAVIRT_LAZY_MMU); 590 xen_mc_issue(PARAVIRT_LAZY_MMU);
591 } 591 }
592 #endif /* PAGETABLE_LEVELS == 4 */ 592 #endif /* PAGETABLE_LEVELS == 4 */
593 593
594 /* 594 /*
595 * (Yet another) pagetable walker. This one is intended for pinning a 595 * (Yet another) pagetable walker. This one is intended for pinning a
596 * pagetable. This means that it walks a pagetable and calls the 596 * pagetable. This means that it walks a pagetable and calls the
597 * callback function on each page it finds making up the page table, 597 * callback function on each page it finds making up the page table,
598 * at every level. It walks the entire pagetable, but it only bothers 598 * at every level. It walks the entire pagetable, but it only bothers
599 * pinning pte pages which are below limit. In the normal case this 599 * pinning pte pages which are below limit. In the normal case this
600 * will be STACK_TOP_MAX, but at boot we need to pin up to 600 * will be STACK_TOP_MAX, but at boot we need to pin up to
601 * FIXADDR_TOP. 601 * FIXADDR_TOP.
602 * 602 *
603 * For 32-bit the important bit is that we don't pin beyond there, 603 * For 32-bit the important bit is that we don't pin beyond there,
604 * because then we start getting into Xen's ptes. 604 * because then we start getting into Xen's ptes.
605 * 605 *
606 * For 64-bit, we must skip the Xen hole in the middle of the address 606 * For 64-bit, we must skip the Xen hole in the middle of the address
607 * space, just after the big x86-64 virtual hole. 607 * space, just after the big x86-64 virtual hole.
608 */ 608 */
609 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, 609 static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
610 int (*func)(struct mm_struct *mm, struct page *, 610 int (*func)(struct mm_struct *mm, struct page *,
611 enum pt_level), 611 enum pt_level),
612 unsigned long limit) 612 unsigned long limit)
613 { 613 {
614 int flush = 0; 614 int flush = 0;
615 unsigned hole_low, hole_high; 615 unsigned hole_low, hole_high;
616 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; 616 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
617 unsigned pgdidx, pudidx, pmdidx; 617 unsigned pgdidx, pudidx, pmdidx;
618 618
619 /* The limit is the last byte to be touched */ 619 /* The limit is the last byte to be touched */
620 limit--; 620 limit--;
621 BUG_ON(limit >= FIXADDR_TOP); 621 BUG_ON(limit >= FIXADDR_TOP);
622 622
623 if (xen_feature(XENFEAT_auto_translated_physmap)) 623 if (xen_feature(XENFEAT_auto_translated_physmap))
624 return 0; 624 return 0;
625 625
626 /* 626 /*
627 * 64-bit has a great big hole in the middle of the address 627 * 64-bit has a great big hole in the middle of the address
628 * space, which contains the Xen mappings. On 32-bit these 628 * space, which contains the Xen mappings. On 32-bit these
629 * will end up making a zero-sized hole, so this is a no-op. 629 * will end up making a zero-sized hole, so this is a no-op.
630 */ 630 */
631 hole_low = pgd_index(USER_LIMIT); 631 hole_low = pgd_index(USER_LIMIT);
632 hole_high = pgd_index(PAGE_OFFSET); 632 hole_high = pgd_index(PAGE_OFFSET);
633 633
634 pgdidx_limit = pgd_index(limit); 634 pgdidx_limit = pgd_index(limit);
635 #if PTRS_PER_PUD > 1 635 #if PTRS_PER_PUD > 1
636 pudidx_limit = pud_index(limit); 636 pudidx_limit = pud_index(limit);
637 #else 637 #else
638 pudidx_limit = 0; 638 pudidx_limit = 0;
639 #endif 639 #endif
640 #if PTRS_PER_PMD > 1 640 #if PTRS_PER_PMD > 1
641 pmdidx_limit = pmd_index(limit); 641 pmdidx_limit = pmd_index(limit);
642 #else 642 #else
643 pmdidx_limit = 0; 643 pmdidx_limit = 0;
644 #endif 644 #endif
645 645
646 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { 646 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
647 pud_t *pud; 647 pud_t *pud;
648 648
649 if (pgdidx >= hole_low && pgdidx < hole_high) 649 if (pgdidx >= hole_low && pgdidx < hole_high)
650 continue; 650 continue;
651 651
652 if (!pgd_val(pgd[pgdidx])) 652 if (!pgd_val(pgd[pgdidx]))
653 continue; 653 continue;
654 654
655 pud = pud_offset(&pgd[pgdidx], 0); 655 pud = pud_offset(&pgd[pgdidx], 0);
656 656
657 if (PTRS_PER_PUD > 1) /* not folded */ 657 if (PTRS_PER_PUD > 1) /* not folded */
658 flush |= (*func)(mm, virt_to_page(pud), PT_PUD); 658 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
659 659
660 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { 660 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
661 pmd_t *pmd; 661 pmd_t *pmd;
662 662
663 if (pgdidx == pgdidx_limit && 663 if (pgdidx == pgdidx_limit &&
664 pudidx > pudidx_limit) 664 pudidx > pudidx_limit)
665 goto out; 665 goto out;
666 666
667 if (pud_none(pud[pudidx])) 667 if (pud_none(pud[pudidx]))
668 continue; 668 continue;
669 669
670 pmd = pmd_offset(&pud[pudidx], 0); 670 pmd = pmd_offset(&pud[pudidx], 0);
671 671
672 if (PTRS_PER_PMD > 1) /* not folded */ 672 if (PTRS_PER_PMD > 1) /* not folded */
673 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); 673 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
674 674
675 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { 675 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
676 struct page *pte; 676 struct page *pte;
677 677
678 if (pgdidx == pgdidx_limit && 678 if (pgdidx == pgdidx_limit &&
679 pudidx == pudidx_limit && 679 pudidx == pudidx_limit &&
680 pmdidx > pmdidx_limit) 680 pmdidx > pmdidx_limit)
681 goto out; 681 goto out;
682 682
683 if (pmd_none(pmd[pmdidx])) 683 if (pmd_none(pmd[pmdidx]))
684 continue; 684 continue;
685 685
686 pte = pmd_page(pmd[pmdidx]); 686 pte = pmd_page(pmd[pmdidx]);
687 flush |= (*func)(mm, pte, PT_PTE); 687 flush |= (*func)(mm, pte, PT_PTE);
688 } 688 }
689 } 689 }
690 } 690 }
691 691
692 out: 692 out:
693 /* Do the top level last, so that the callbacks can use it as 693 /* Do the top level last, so that the callbacks can use it as
694 a cue to do final things like tlb flushes. */ 694 a cue to do final things like tlb flushes. */
695 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); 695 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
696 696
697 return flush; 697 return flush;
698 } 698 }
699 699
700 static int xen_pgd_walk(struct mm_struct *mm, 700 static int xen_pgd_walk(struct mm_struct *mm,
701 int (*func)(struct mm_struct *mm, struct page *, 701 int (*func)(struct mm_struct *mm, struct page *,
702 enum pt_level), 702 enum pt_level),
703 unsigned long limit) 703 unsigned long limit)
704 { 704 {
705 return __xen_pgd_walk(mm, mm->pgd, func, limit); 705 return __xen_pgd_walk(mm, mm->pgd, func, limit);
706 } 706 }
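
Stripped of the hole handling and the folded-level special cases, the walk is nested loops over the index ranges, with the callback invoked once per page-table page and the top level deliberately visited last. The compressed two-level sketch below shows that shape; toy_walk and its callback type are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

#define TOY_TOP_ENTRIES 4

/* Two-level toy "pagetable": the top level points at leaf pages or NULL. */
static int leaf_a[4], leaf_b[4];
static int *toy_top[TOY_TOP_ENTRIES] = { leaf_a, NULL, leaf_b, NULL };

typedef bool (*toy_cb)(const char *level, void *page);

static bool toy_walk(toy_cb cb, unsigned int limit)
{
	bool flush = false;
	unsigned int i;

	for (i = 0; i <= limit && i < TOY_TOP_ENTRIES; i++) {
		if (!toy_top[i])
			continue;	/* like the pgd_val()/pud_none() tests */
		flush |= cb("leaf", toy_top[i]);
	}
	/* Top level last, so the callback can treat it as a completion cue. */
	flush |= cb("top", toy_top);
	return flush;
}

static bool print_page(const char *level, void *page)
{
	printf("visit %-4s page at %p\n", level, page);
	return false;	/* nothing needs flushing in this sketch */
}

int main(void)
{
	toy_walk(print_page, TOY_TOP_ENTRIES - 1);
	return 0;
}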
707 707
708 /* If we're using split pte locks, then take the page's lock and 708 /* If we're using split pte locks, then take the page's lock and
709 return a pointer to it. Otherwise return NULL. */ 709 return a pointer to it. Otherwise return NULL. */
710 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) 710 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
711 { 711 {
712 spinlock_t *ptl = NULL; 712 spinlock_t *ptl = NULL;
713 713
714 #if USE_SPLIT_PTE_PTLOCKS 714 #if USE_SPLIT_PTE_PTLOCKS
715 ptl = ptlock_ptr(page); 715 ptl = ptlock_ptr(page);
716 spin_lock_nest_lock(ptl, &mm->page_table_lock); 716 spin_lock_nest_lock(ptl, &mm->page_table_lock);
717 #endif 717 #endif
718 718
719 return ptl; 719 return ptl;
720 } 720 }
721 721
722 static void xen_pte_unlock(void *v) 722 static void xen_pte_unlock(void *v)
723 { 723 {
724 spinlock_t *ptl = v; 724 spinlock_t *ptl = v;
725 spin_unlock(ptl); 725 spin_unlock(ptl);
726 } 726 }
727 727
728 static void xen_do_pin(unsigned level, unsigned long pfn) 728 static void xen_do_pin(unsigned level, unsigned long pfn)
729 { 729 {
730 struct mmuext_op op; 730 struct mmuext_op op;
731 731
732 op.cmd = level; 732 op.cmd = level;
733 op.arg1.mfn = pfn_to_mfn(pfn); 733 op.arg1.mfn = pfn_to_mfn(pfn);
734 734
735 xen_extend_mmuext_op(&op); 735 xen_extend_mmuext_op(&op);
736 } 736 }
737 737
738 static int xen_pin_page(struct mm_struct *mm, struct page *page, 738 static int xen_pin_page(struct mm_struct *mm, struct page *page,
739 enum pt_level level) 739 enum pt_level level)
740 { 740 {
741 unsigned pgfl = TestSetPagePinned(page); 741 unsigned pgfl = TestSetPagePinned(page);
742 int flush; 742 int flush;
743 743
744 if (pgfl) 744 if (pgfl)
745 flush = 0; /* already pinned */ 745 flush = 0; /* already pinned */
746 else if (PageHighMem(page)) 746 else if (PageHighMem(page))
747 /* kmaps need flushing if we found an unpinned 747 /* kmaps need flushing if we found an unpinned
748 highpage */ 748 highpage */
749 flush = 1; 749 flush = 1;
750 else { 750 else {
751 void *pt = lowmem_page_address(page); 751 void *pt = lowmem_page_address(page);
752 unsigned long pfn = page_to_pfn(page); 752 unsigned long pfn = page_to_pfn(page);
753 struct multicall_space mcs = __xen_mc_entry(0); 753 struct multicall_space mcs = __xen_mc_entry(0);
754 spinlock_t *ptl; 754 spinlock_t *ptl;
755 755
756 flush = 0; 756 flush = 0;
757 757
758 /* 758 /*
759 * We need to hold the pagetable lock between the time 759 * We need to hold the pagetable lock between the time
760 * we make the pagetable RO and when we actually pin 760 * we make the pagetable RO and when we actually pin
761 * it. If we don't, then other users may come in and 761 * it. If we don't, then other users may come in and
762 * attempt to update the pagetable by writing it, 762 * attempt to update the pagetable by writing it,
763 * which will fail because the memory is RO but not 763 * which will fail because the memory is RO but not
764 * pinned, so Xen won't do the trap'n'emulate. 764 * pinned, so Xen won't do the trap'n'emulate.
765 * 765 *
766 * If we're using split pte locks, we can't hold the 766 * If we're using split pte locks, we can't hold the
767 * entire pagetable's worth of locks during the 767 * entire pagetable's worth of locks during the
768 * traverse, because we may wrap the preempt count (8 768 * traverse, because we may wrap the preempt count (8
769 * bits). The solution is to mark RO and pin each PTE 769 * bits). The solution is to mark RO and pin each PTE
770 * page while holding the lock. This means the number 770 * page while holding the lock. This means the number
771 * of locks we end up holding is never more than a 771 * of locks we end up holding is never more than a
772 * batch size (~32 entries, at present). 772 * batch size (~32 entries, at present).
773 * 773 *
774 * If we're not using split pte locks, we needn't pin 774 * If we're not using split pte locks, we needn't pin
775 * the PTE pages independently, because we're 775 * the PTE pages independently, because we're
776 * protected by the overall pagetable lock. 776 * protected by the overall pagetable lock.
777 */ 777 */
778 ptl = NULL; 778 ptl = NULL;
779 if (level == PT_PTE) 779 if (level == PT_PTE)
780 ptl = xen_pte_lock(page, mm); 780 ptl = xen_pte_lock(page, mm);
781 781
782 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 782 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
783 pfn_pte(pfn, PAGE_KERNEL_RO), 783 pfn_pte(pfn, PAGE_KERNEL_RO),
784 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 784 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
785 785
786 if (ptl) { 786 if (ptl) {
787 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); 787 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
788 788
789 /* Queue a deferred unlock for when this batch 789 /* Queue a deferred unlock for when this batch
790 is completed. */ 790 is completed. */
791 xen_mc_callback(xen_pte_unlock, ptl); 791 xen_mc_callback(xen_pte_unlock, ptl);
792 } 792 }
793 } 793 }
794 794
795 return flush; 795 return flush;
796 } 796 }
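
The constraint in the comment above is quantitative: each held split-PTE spinlock keeps preemption disabled, the preempt count only has 8 bits for that, and a full pagetable has far more than 255 PTE pages. The toy calculation below (the pagetable size is invented; the batch depth of roughly 32 comes from the comment) illustrates why locks are scoped to one batch at a time.

#include <stdio.h>

#define PREEMPT_BITS   8	/* the 8-bit window the comment refers to */
#define PREEMPT_LIMIT  (1 << PREEMPT_BITS)
#define TOY_PTE_PAGES  2048	/* hypothetical pagetable: 2048 PTE pages */
#define TOY_BATCH_SIZE 32	/* roughly one multicall batch */

int main(void)
{
	/* Variant 1: hold every PTE lock across the whole traverse. */
	int worst_all = TOY_PTE_PAGES;

	/* Variant 2: lock each PTE page only until its batch completes,
	 * so at most one batch worth of locks is held at a time. */
	int worst_batched = TOY_BATCH_SIZE;

	printf("hold-all locks: %d (overflows 8-bit count: %s)\n",
	       worst_all, worst_all >= PREEMPT_LIMIT ? "yes" : "no");
	printf("per-batch locks: %d (overflows 8-bit count: %s)\n",
	       worst_batched, worst_batched >= PREEMPT_LIMIT ? "yes" : "no");
	return 0;
}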
797 797
798 /* This is called just after a mm has been created, but it has not 798 /* This is called just after a mm has been created, but it has not
799 been used yet. We need to make sure that its pagetable is all 799 been used yet. We need to make sure that its pagetable is all
800 read-only, and can be pinned. */ 800 read-only, and can be pinned. */
801 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) 801 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
802 { 802 {
803 trace_xen_mmu_pgd_pin(mm, pgd); 803 trace_xen_mmu_pgd_pin(mm, pgd);
804 804
805 xen_mc_batch(); 805 xen_mc_batch();
806 806
807 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { 807 if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
808 /* re-enable interrupts for flushing */ 808 /* re-enable interrupts for flushing */
809 xen_mc_issue(0); 809 xen_mc_issue(0);
810 810
811 kmap_flush_unused(); 811 kmap_flush_unused();
812 812
813 xen_mc_batch(); 813 xen_mc_batch();
814 } 814 }
815 815
816 #ifdef CONFIG_X86_64 816 #ifdef CONFIG_X86_64
817 { 817 {
818 pgd_t *user_pgd = xen_get_user_pgd(pgd); 818 pgd_t *user_pgd = xen_get_user_pgd(pgd);
819 819
820 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); 820 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
821 821
822 if (user_pgd) { 822 if (user_pgd) {
823 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); 823 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
824 xen_do_pin(MMUEXT_PIN_L4_TABLE, 824 xen_do_pin(MMUEXT_PIN_L4_TABLE,
825 PFN_DOWN(__pa(user_pgd))); 825 PFN_DOWN(__pa(user_pgd)));
826 } 826 }
827 } 827 }
828 #else /* CONFIG_X86_32 */ 828 #else /* CONFIG_X86_32 */
829 #ifdef CONFIG_X86_PAE 829 #ifdef CONFIG_X86_PAE
830 /* Need to make sure unshared kernel PMD is pinnable */ 830 /* Need to make sure unshared kernel PMD is pinnable */
831 xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), 831 xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
832 PT_PMD); 832 PT_PMD);
833 #endif 833 #endif
834 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 834 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
835 #endif /* CONFIG_X86_64 */ 835 #endif /* CONFIG_X86_64 */
836 xen_mc_issue(0); 836 xen_mc_issue(0);
837 } 837 }
838 838
839 static void xen_pgd_pin(struct mm_struct *mm) 839 static void xen_pgd_pin(struct mm_struct *mm)
840 { 840 {
841 __xen_pgd_pin(mm, mm->pgd); 841 __xen_pgd_pin(mm, mm->pgd);
842 } 842 }
843 843
844 /* 844 /*
845 * On save, we need to pin all pagetables to make sure they get their 845 * On save, we need to pin all pagetables to make sure they get their
846 * mfns turned into pfns. Search the list for any unpinned pgds and pin 846 * mfns turned into pfns. Search the list for any unpinned pgds and pin
847 * them (unpinned pgds are not currently in use, probably because the 847 * them (unpinned pgds are not currently in use, probably because the
848 * process is under construction or destruction). 848 * process is under construction or destruction).
849 * 849 *
850 * Expected to be called in stop_machine() ("equivalent to taking 850 * Expected to be called in stop_machine() ("equivalent to taking
851 * every spinlock in the system"), so the locking doesn't really 851 * every spinlock in the system"), so the locking doesn't really
852 * matter all that much. 852 * matter all that much.
853 */ 853 */
854 void xen_mm_pin_all(void) 854 void xen_mm_pin_all(void)
855 { 855 {
856 struct page *page; 856 struct page *page;
857 857
858 spin_lock(&pgd_lock); 858 spin_lock(&pgd_lock);
859 859
860 list_for_each_entry(page, &pgd_list, lru) { 860 list_for_each_entry(page, &pgd_list, lru) {
861 if (!PagePinned(page)) { 861 if (!PagePinned(page)) {
862 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); 862 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
863 SetPageSavePinned(page); 863 SetPageSavePinned(page);
864 } 864 }
865 } 865 }
866 866
867 spin_unlock(&pgd_lock); 867 spin_unlock(&pgd_lock);
868 } 868 }
869 869
870 /* 870 /*
871 * The init_mm pagetable is really pinned as soon as it's created, but 871 * The init_mm pagetable is really pinned as soon as it's created, but
872 * that's before we have page structures to store the bits. So do all 872 * that's before we have page structures to store the bits. So do all
873 * the book-keeping now. 873 * the book-keeping now.
874 */ 874 */
875 static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, 875 static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
876 enum pt_level level) 876 enum pt_level level)
877 { 877 {
878 SetPagePinned(page); 878 SetPagePinned(page);
879 return 0; 879 return 0;
880 } 880 }
881 881
882 static void __init xen_mark_init_mm_pinned(void) 882 static void __init xen_mark_init_mm_pinned(void)
883 { 883 {
884 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); 884 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
885 } 885 }
886 886
887 static int xen_unpin_page(struct mm_struct *mm, struct page *page, 887 static int xen_unpin_page(struct mm_struct *mm, struct page *page,
888 enum pt_level level) 888 enum pt_level level)
889 { 889 {
890 unsigned pgfl = TestClearPagePinned(page); 890 unsigned pgfl = TestClearPagePinned(page);
891 891
892 if (pgfl && !PageHighMem(page)) { 892 if (pgfl && !PageHighMem(page)) {
893 void *pt = lowmem_page_address(page); 893 void *pt = lowmem_page_address(page);
894 unsigned long pfn = page_to_pfn(page); 894 unsigned long pfn = page_to_pfn(page);
895 spinlock_t *ptl = NULL; 895 spinlock_t *ptl = NULL;
896 struct multicall_space mcs; 896 struct multicall_space mcs;
897 897
898 /* 898 /*
899 * Do the converse to pin_page. If we're using split 899 * Do the converse to pin_page. If we're using split
900 * pte locks, we must be holding the lock while 900 * pte locks, we must be holding the lock while
901 * the pte page is unpinned but still RO to prevent 901 * the pte page is unpinned but still RO to prevent
902 * concurrent updates from seeing it in this 902 * concurrent updates from seeing it in this
903 * partially-pinned state. 903 * partially-pinned state.
904 */ 904 */
905 if (level == PT_PTE) { 905 if (level == PT_PTE) {
906 ptl = xen_pte_lock(page, mm); 906 ptl = xen_pte_lock(page, mm);
907 907
908 if (ptl) 908 if (ptl)
909 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); 909 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
910 } 910 }
911 911
912 mcs = __xen_mc_entry(0); 912 mcs = __xen_mc_entry(0);
913 913
914 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 914 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
915 pfn_pte(pfn, PAGE_KERNEL), 915 pfn_pte(pfn, PAGE_KERNEL),
916 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 916 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
917 917
918 if (ptl) { 918 if (ptl) {
919 /* unlock when batch completed */ 919 /* unlock when batch completed */
920 xen_mc_callback(xen_pte_unlock, ptl); 920 xen_mc_callback(xen_pte_unlock, ptl);
921 } 921 }
922 } 922 }
923 923
924 return 0; /* never need to flush on unpin */ 924 return 0; /* never need to flush on unpin */
925 } 925 }
926 926
927 /* Release a pagetable's pages back as normal RW */ 927 /* Release a pagetable's pages back as normal RW */
928 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) 928 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
929 { 929 {
930 trace_xen_mmu_pgd_unpin(mm, pgd); 930 trace_xen_mmu_pgd_unpin(mm, pgd);
931 931
932 xen_mc_batch(); 932 xen_mc_batch();
933 933
934 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 934 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
935 935
936 #ifdef CONFIG_X86_64 936 #ifdef CONFIG_X86_64
937 { 937 {
938 pgd_t *user_pgd = xen_get_user_pgd(pgd); 938 pgd_t *user_pgd = xen_get_user_pgd(pgd);
939 939
940 if (user_pgd) { 940 if (user_pgd) {
941 xen_do_pin(MMUEXT_UNPIN_TABLE, 941 xen_do_pin(MMUEXT_UNPIN_TABLE,
942 PFN_DOWN(__pa(user_pgd))); 942 PFN_DOWN(__pa(user_pgd)));
943 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); 943 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
944 } 944 }
945 } 945 }
946 #endif 946 #endif
947 947
948 #ifdef CONFIG_X86_PAE 948 #ifdef CONFIG_X86_PAE
949 /* Need to make sure unshared kernel PMD is unpinned */ 949 /* Need to make sure unshared kernel PMD is unpinned */
950 xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), 950 xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
951 PT_PMD); 951 PT_PMD);
952 #endif 952 #endif
953 953
954 __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT); 954 __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
955 955
956 xen_mc_issue(0); 956 xen_mc_issue(0);
957 } 957 }
958 958
959 static void xen_pgd_unpin(struct mm_struct *mm) 959 static void xen_pgd_unpin(struct mm_struct *mm)
960 { 960 {
961 __xen_pgd_unpin(mm, mm->pgd); 961 __xen_pgd_unpin(mm, mm->pgd);
962 } 962 }
963 963
964 /* 964 /*
965 * On resume, undo any pinning done at save, so that the rest of the 965 * On resume, undo any pinning done at save, so that the rest of the
966 * kernel doesn't see any unexpected pinned pagetables. 966 * kernel doesn't see any unexpected pinned pagetables.
967 */ 967 */
968 void xen_mm_unpin_all(void) 968 void xen_mm_unpin_all(void)
969 { 969 {
970 struct page *page; 970 struct page *page;
971 971
972 spin_lock(&pgd_lock); 972 spin_lock(&pgd_lock);
973 973
974 list_for_each_entry(page, &pgd_list, lru) { 974 list_for_each_entry(page, &pgd_list, lru) {
975 if (PageSavePinned(page)) { 975 if (PageSavePinned(page)) {
976 BUG_ON(!PagePinned(page)); 976 BUG_ON(!PagePinned(page));
977 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); 977 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
978 ClearPageSavePinned(page); 978 ClearPageSavePinned(page);
979 } 979 }
980 } 980 }
981 981
982 spin_unlock(&pgd_lock); 982 spin_unlock(&pgd_lock);
983 } 983 }
984 984
985 static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 985 static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
986 { 986 {
987 spin_lock(&next->page_table_lock); 987 spin_lock(&next->page_table_lock);
988 xen_pgd_pin(next); 988 xen_pgd_pin(next);
989 spin_unlock(&next->page_table_lock); 989 spin_unlock(&next->page_table_lock);
990 } 990 }
991 991
992 static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 992 static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
993 { 993 {
994 spin_lock(&mm->page_table_lock); 994 spin_lock(&mm->page_table_lock);
995 xen_pgd_pin(mm); 995 xen_pgd_pin(mm);
996 spin_unlock(&mm->page_table_lock); 996 spin_unlock(&mm->page_table_lock);
997 } 997 }
998 998
999 999
1000 #ifdef CONFIG_SMP 1000 #ifdef CONFIG_SMP
1001 /* Another cpu may still have its %cr3 pointing at the pagetable, so 1001 /* Another cpu may still have its %cr3 pointing at the pagetable, so
1002 we need to repoint it somewhere else before we can unpin it. */ 1002 we need to repoint it somewhere else before we can unpin it. */
1003 static void drop_other_mm_ref(void *info) 1003 static void drop_other_mm_ref(void *info)
1004 { 1004 {
1005 struct mm_struct *mm = info; 1005 struct mm_struct *mm = info;
1006 struct mm_struct *active_mm; 1006 struct mm_struct *active_mm;
1007 1007
1008 active_mm = this_cpu_read(cpu_tlbstate.active_mm); 1008 active_mm = this_cpu_read(cpu_tlbstate.active_mm);
1009 1009
1010 if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) 1010 if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
1011 leave_mm(smp_processor_id()); 1011 leave_mm(smp_processor_id());
1012 1012
1013 /* If this cpu still has a stale cr3 reference, then make sure 1013 /* If this cpu still has a stale cr3 reference, then make sure
1014 it has been flushed. */ 1014 it has been flushed. */
1015 if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) 1015 if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
1016 load_cr3(swapper_pg_dir); 1016 load_cr3(swapper_pg_dir);
1017 } 1017 }
1018 1018
1019 static void xen_drop_mm_ref(struct mm_struct *mm) 1019 static void xen_drop_mm_ref(struct mm_struct *mm)
1020 { 1020 {
1021 cpumask_var_t mask; 1021 cpumask_var_t mask;
1022 unsigned cpu; 1022 unsigned cpu;
1023 1023
1024 if (current->active_mm == mm) { 1024 if (current->active_mm == mm) {
1025 if (current->mm == mm) 1025 if (current->mm == mm)
1026 load_cr3(swapper_pg_dir); 1026 load_cr3(swapper_pg_dir);
1027 else 1027 else
1028 leave_mm(smp_processor_id()); 1028 leave_mm(smp_processor_id());
1029 } 1029 }
1030 1030
1031 /* Get the "official" set of cpus referring to our pagetable. */ 1031 /* Get the "official" set of cpus referring to our pagetable. */
1032 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { 1032 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1033 for_each_online_cpu(cpu) { 1033 for_each_online_cpu(cpu) {
1034 if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) 1034 if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1035 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) 1035 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1036 continue; 1036 continue;
1037 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); 1037 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1038 } 1038 }
1039 return; 1039 return;
1040 } 1040 }
1041 cpumask_copy(mask, mm_cpumask(mm)); 1041 cpumask_copy(mask, mm_cpumask(mm));
1042 1042
1043 /* It's possible that a vcpu may have a stale reference to our 1043 /* It's possible that a vcpu may have a stale reference to our
1044 cr3, because it's in lazy mode, and it hasn't yet flushed 1044 cr3, because it's in lazy mode, and it hasn't yet flushed
1045 its set of pending hypercalls. In this case, we can 1045 its set of pending hypercalls. In this case, we can
1046 look at its actual current cr3 value, and force it to flush 1046 look at its actual current cr3 value, and force it to flush
1047 if needed. */ 1047 if needed. */
1048 for_each_online_cpu(cpu) { 1048 for_each_online_cpu(cpu) {
1049 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) 1049 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1050 cpumask_set_cpu(cpu, mask); 1050 cpumask_set_cpu(cpu, mask);
1051 } 1051 }
1052 1052
1053 if (!cpumask_empty(mask)) 1053 if (!cpumask_empty(mask))
1054 smp_call_function_many(mask, drop_other_mm_ref, mm, 1); 1054 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1055 free_cpumask_var(mask); 1055 free_cpumask_var(mask);
1056 } 1056 }
1057 #else 1057 #else
1058 static void xen_drop_mm_ref(struct mm_struct *mm) 1058 static void xen_drop_mm_ref(struct mm_struct *mm)
1059 { 1059 {
1060 if (current->active_mm == mm) 1060 if (current->active_mm == mm)
1061 load_cr3(swapper_pg_dir); 1061 load_cr3(swapper_pg_dir);
1062 } 1062 }
1063 #endif 1063 #endif
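
Across the two comments in the SMP path above, the IPI targets reduce to a per-cpu predicate: a cpu must be interrupted if it is in the mm's cpumask, or if its lazily cached cr3 still equals __pa(mm->pgd). The sketch below builds that mask from plain arrays standing in for the per-cpu state; the cpu count and cr3 values are invented.

#include <stdbool.h>
#include <stdio.h>

#define TOY_CPUS 4

/* Stand-ins for mm_cpumask(mm) and per_cpu(xen_current_cr3, cpu). */
static bool in_mm_cpumask[TOY_CPUS] = { true, false, false, true };
static unsigned long cpu_cr3[TOY_CPUS] = { 0x1000, 0x2000, 0x1000, 0x3000 };

int main(void)
{
	unsigned long mm_pgd_pa = 0x1000;	/* __pa(mm->pgd), invented */
	bool needs_ipi[TOY_CPUS];

	for (int cpu = 0; cpu < TOY_CPUS; cpu++) {
		/* Official users of the pagetable... */
		bool official = in_mm_cpumask[cpu];
		/* ...plus lazy-mode cpus with a stale cr3 reference. */
		bool stale = cpu_cr3[cpu] == mm_pgd_pa;

		needs_ipi[cpu] = official || stale;
		printf("cpu%d: official=%d stale=%d -> IPI=%d\n",
		       cpu, official, stale, needs_ipi[cpu]);
	}
	return 0;
}

When allocating the cpumask fails, the code above falls back to issuing smp_call_function_single() per matching cpu instead of one smp_call_function_many() over the assembled mask.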
1064 1064
1065 /* 1065 /*
1066 * While a process runs, Xen pins its pagetables, which means that the 1066 * While a process runs, Xen pins its pagetables, which means that the
1067 * hypervisor forces it to be read-only, and it controls all updates 1067 * hypervisor forces it to be read-only, and it controls all updates
1068 * to it. This means that all pagetable updates have to go via the 1068 * to it. This means that all pagetable updates have to go via the
1069 * hypervisor, which is moderately expensive. 1069 * hypervisor, which is moderately expensive.
1070 * 1070 *
1071 * Since we're pulling the pagetable down, we switch to use init_mm, 1071 * Since we're pulling the pagetable down, we switch to use init_mm,
1072 * unpin old process pagetable and mark it all read-write, which 1072 * unpin old process pagetable and mark it all read-write, which
1073 * allows further operations on it to be simple memory accesses. 1073 * allows further operations on it to be simple memory accesses.
1074 * 1074 *
1075 * The only subtle point is that another CPU may be still using the 1075 * The only subtle point is that another CPU may be still using the
1076 * pagetable because of lazy tlb flushing. This means we need to 1076 * pagetable because of lazy tlb flushing. This means we need to
1077 * switch all CPUs off this pagetable before we can unpin it. 1077 * switch all CPUs off this pagetable before we can unpin it.
1078 */ 1078 */
1079 static void xen_exit_mmap(struct mm_struct *mm) 1079 static void xen_exit_mmap(struct mm_struct *mm)
1080 { 1080 {
1081 get_cpu(); /* make sure we don't move around */ 1081 get_cpu(); /* make sure we don't move around */
1082 xen_drop_mm_ref(mm); 1082 xen_drop_mm_ref(mm);
1083 put_cpu(); 1083 put_cpu();
1084 1084
1085 spin_lock(&mm->page_table_lock); 1085 spin_lock(&mm->page_table_lock);
1086 1086
1087 /* pgd may not be pinned in the error exit path of execve */ 1087 /* pgd may not be pinned in the error exit path of execve */
1088 if (xen_page_pinned(mm->pgd)) 1088 if (xen_page_pinned(mm->pgd))
1089 xen_pgd_unpin(mm); 1089 xen_pgd_unpin(mm);
1090 1090
1091 spin_unlock(&mm->page_table_lock); 1091 spin_unlock(&mm->page_table_lock);
1092 } 1092 }
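
The teardown order matters: only after every other cpu has been steered off the pagetable can it be unpinned and handed back as ordinary read-write memory. The toy model below captures just that two-step ordering; the user counter is an invented stand-in for the per-cpu cr3 state.

#include <assert.h>
#include <stdio.h>

/* Toy pagetable: "pinned" means read-only and hypervisor-controlled,
 * "users" counts cpus that might still have it loaded in cr3. */
struct toy_pgtable {
	int users;
	int pinned;
};

static void toy_drop_refs(struct toy_pgtable *pt)
{
	/* Plays the role of xen_drop_mm_ref(): IPI other cpus until done. */
	pt->users = 0;
}

static void toy_unpin(struct toy_pgtable *pt)
{
	/* Unpinning while some cpu still runs on it would be a bug. */
	assert(pt->users == 0);
	pt->pinned = 0;
}

int main(void)
{
	struct toy_pgtable pt = { .users = 2, .pinned = 1 };

	toy_drop_refs(&pt);	/* step 1: no cpu points at it any more */
	toy_unpin(&pt);		/* step 2: now it can become plain RW memory */
	printf("users=%d pinned=%d\n", pt.users, pt.pinned);
	return 0;
}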
1093 1093
1094 static void xen_post_allocator_init(void); 1094 static void xen_post_allocator_init(void);
1095 1095
1096 #ifdef CONFIG_X86_64 1096 #ifdef CONFIG_X86_64
1097 static void __init xen_cleanhighmap(unsigned long vaddr, 1097 static void __init xen_cleanhighmap(unsigned long vaddr,
1098 unsigned long vaddr_end) 1098 unsigned long vaddr_end)
1099 { 1099 {
1100 unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; 1100 unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
1101 pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); 1101 pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
1102 1102
1103 /* NOTE: The loop is more greedy than the cleanup_highmap variant. 1103 /* NOTE: The loop is more greedy than the cleanup_highmap variant.
1104 * We include the PMD passed in on _both_ boundaries. */ 1104 * We include the PMD passed in on _both_ boundaries. */
1105 for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE)); 1105 for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE));
1106 pmd++, vaddr += PMD_SIZE) { 1106 pmd++, vaddr += PMD_SIZE) {
1107 if (pmd_none(*pmd)) 1107 if (pmd_none(*pmd))
1108 continue; 1108 continue;
1109 if (vaddr < (unsigned long) _text || vaddr > kernel_end) 1109 if (vaddr < (unsigned long) _text || vaddr > kernel_end)
1110 set_pmd(pmd, __pmd(0)); 1110 set_pmd(pmd, __pmd(0));
1111 } 1111 }
1112 /* In case we did something silly, we should crash in this function 1112 /* In case we did something silly, we should crash in this function
1113 * instead of somewhere later and be confusing. */ 1113 * instead of somewhere later and be confusing. */
1114 xen_mc_flush(); 1114 xen_mc_flush();
1115 } 1115 }
1116 static void __init xen_pagetable_p2m_copy(void) 1116
1117 static void __init xen_pagetable_p2m_free(void)
1117 { 1118 {
1118 unsigned long size; 1119 unsigned long size;
1119 unsigned long addr; 1120 unsigned long addr;
1120 unsigned long new_mfn_list;
1121 1121
1122 if (xen_feature(XENFEAT_auto_translated_physmap))
1123 return;
1124
1125 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 1122 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1126 1123
1127 new_mfn_list = xen_revector_p2m_tree();
1128 /* No memory or already called. */ 1124 /* No memory or already called. */
1129 if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list) 1125 if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list)
1130 return; 1126 return;
1131 1127
1132 /* using __ka address and sticking INVALID_P2M_ENTRY! */ 1128 /* using __ka address and sticking INVALID_P2M_ENTRY! */
1133 memset((void *)xen_start_info->mfn_list, 0xff, size); 1129 memset((void *)xen_start_info->mfn_list, 0xff, size);
1134 1130
1135 /* We should be in __ka space. */ 1131 /* We should be in __ka space. */
1136 BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); 1132 BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map);
1137 addr = xen_start_info->mfn_list; 1133 addr = xen_start_info->mfn_list;
1138 /* We roundup to the PMD, which means that if anybody at this stage is 1134 /* We roundup to the PMD, which means that if anybody at this stage is
1139 * using the __ka address of xen_start_info or xen_start_info->shared_info 1135 * using the __ka address of xen_start_info or xen_start_info->shared_info
1140 * they are going to crash. Fortunately we have already revectored 1136 * they are going to crash. Fortunately we have already revectored
1141 * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ 1137 * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */
1142 size = roundup(size, PMD_SIZE); 1138 size = roundup(size, PMD_SIZE);
1143 xen_cleanhighmap(addr, addr + size); 1139 xen_cleanhighmap(addr, addr + size);
1144 1140
1145 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 1141 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1146 memblock_free(__pa(xen_start_info->mfn_list), size); 1142 memblock_free(__pa(xen_start_info->mfn_list), size);
1147 /* And revector! Bye bye old array */
1148 xen_start_info->mfn_list = new_mfn_list;
1149 1143
1150 /* At this stage, cleanup_highmap has already cleaned __ka space 1144 /* At this stage, cleanup_highmap has already cleaned __ka space
1151 * from _brk_limit way up to the max_pfn_mapped (which is the end of 1145 * from _brk_limit way up to the max_pfn_mapped (which is the end of
1152 * the ramdisk). We continue on, erasing PMD entries that point to page 1146 * the ramdisk). We continue on, erasing PMD entries that point to page
1153 * tables - do note that they are accessible at this stage via __va. 1147 * tables - do note that they are accessible at this stage via __va.
1154 * For good measure we also round up to the PMD - which means that if 1148 * For good measure we also round up to the PMD - which means that if
1155 * anybody is using __ka address to the initial boot-stack - and try 1149 * anybody is using __ka address to the initial boot-stack - and try
1156 * to use it - they are going to crash. The xen_start_info has been 1150 * to use it - they are going to crash. The xen_start_info has been
1157 * taken care of already in xen_setup_kernel_pagetable. */ 1151 * taken care of already in xen_setup_kernel_pagetable. */
1158 addr = xen_start_info->pt_base; 1152 addr = xen_start_info->pt_base;
1159 size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); 1153 size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
1160 1154
1161 xen_cleanhighmap(addr, addr + size); 1155 xen_cleanhighmap(addr, addr + size);
1162 xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); 1156 xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
1163 #ifdef DEBUG 1157 #ifdef DEBUG
1164 /* This is superfluous and is not necessary, but you know what 1158 /* This is superfluous and is not necessary, but you know what
1165 * let's do it. The MODULES_VADDR -> MODULES_END should be clear of 1159 * let's do it. The MODULES_VADDR -> MODULES_END should be clear of
1166 * anything at this stage. */ 1160 * anything at this stage. */
1167 xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); 1161 xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
1168 #endif 1162 #endif
1169 } 1163 }
1170 #endif 1164 #endif
1171 1165
1172 static void __init xen_pagetable_init(void) 1166 static void __init xen_pagetable_p2m_setup(void)
1173 { 1167 {
1174 paging_init(); 1168 if (xen_feature(XENFEAT_auto_translated_physmap))
1169 return;
1170
1171 xen_vmalloc_p2m_tree();
1172
1175 #ifdef CONFIG_X86_64 1173 #ifdef CONFIG_X86_64
1176 xen_pagetable_p2m_copy(); 1174 xen_pagetable_p2m_free();
1177 #endif 1175 #endif
1176 /* And revector! Bye bye old array */
1177 xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
1178 }
1179
1180 static void __init xen_pagetable_init(void)
1181 {
1182 paging_init();
1183 xen_post_allocator_init();
1184
1185 xen_pagetable_p2m_setup();
1186
1178 /* Allocate and initialize top and mid mfn levels for p2m structure */ 1187 /* Allocate and initialize top and mid mfn levels for p2m structure */
1179 xen_build_mfn_list_list(); 1188 xen_build_mfn_list_list();
1180 1189
1190 /* Remap memory freed due to conflicts with E820 map */
1191 if (!xen_feature(XENFEAT_auto_translated_physmap))
1192 xen_remap_memory();
1193
1181 xen_setup_shared_info(); 1194 xen_setup_shared_info();
1182 xen_post_allocator_init();
1183 } 1195 }
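
This hunk replaces the old copy-and-revector of the three-level mfn_list with a p2m presented as one virtually contiguous array: xen_vmalloc_p2m_tree() sets it up, xen_pagetable_p2m_free() releases the boot-time copy, and xen_start_info->mfn_list is pointed at xen_p2m_addr. In that layout a pfn-to-mfn lookup is essentially a bounds check plus an array index. The user-space sketch below models only that lookup shape; the sizes and values are invented and this is not the kernel's actual __pfn_to_mfn() implementation.

#include <stdio.h>

#define TOY_INVALID_P2M (~0UL)
#define TOY_NR_PAGES    8

/* Stand-in for the virtually contiguous p2m list at xen_p2m_addr. */
static unsigned long toy_p2m_addr[TOY_NR_PAGES] = {
	100, 101, 102, TOY_INVALID_P2M, 104, 105, 106, 107
};

/* Rough shape of a linear-list lookup: bounds check, then a plain
 * array dereference. */
static unsigned long toy_pfn_to_mfn(unsigned long pfn)
{
	if (pfn >= TOY_NR_PAGES)
		return TOY_INVALID_P2M;
	return toy_p2m_addr[pfn];
}

int main(void)
{
	for (unsigned long pfn = 0; pfn < TOY_NR_PAGES + 1; pfn++)
		printf("pfn %lu -> mfn %#lx\n", pfn, toy_pfn_to_mfn(pfn));
	return 0;
}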
1184 static void xen_write_cr2(unsigned long cr2) 1196 static void xen_write_cr2(unsigned long cr2)
1185 { 1197 {
1186 this_cpu_read(xen_vcpu)->arch.cr2 = cr2; 1198 this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
1187 } 1199 }
1188 1200
1189 static unsigned long xen_read_cr2(void) 1201 static unsigned long xen_read_cr2(void)
1190 { 1202 {
1191 return this_cpu_read(xen_vcpu)->arch.cr2; 1203 return this_cpu_read(xen_vcpu)->arch.cr2;
1192 } 1204 }
1193 1205
1194 unsigned long xen_read_cr2_direct(void) 1206 unsigned long xen_read_cr2_direct(void)
1195 { 1207 {
1196 return this_cpu_read(xen_vcpu_info.arch.cr2); 1208 return this_cpu_read(xen_vcpu_info.arch.cr2);
1197 } 1209 }
1198 1210
1199 void xen_flush_tlb_all(void) 1211 void xen_flush_tlb_all(void)
1200 { 1212 {
1201 struct mmuext_op *op; 1213 struct mmuext_op *op;
1202 struct multicall_space mcs; 1214 struct multicall_space mcs;
1203 1215
1204 trace_xen_mmu_flush_tlb_all(0); 1216 trace_xen_mmu_flush_tlb_all(0);
1205 1217
1206 preempt_disable(); 1218 preempt_disable();
1207 1219
1208 mcs = xen_mc_entry(sizeof(*op)); 1220 mcs = xen_mc_entry(sizeof(*op));
1209 1221
1210 op = mcs.args; 1222 op = mcs.args;
1211 op->cmd = MMUEXT_TLB_FLUSH_ALL; 1223 op->cmd = MMUEXT_TLB_FLUSH_ALL;
1212 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 1224 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1213 1225
1214 xen_mc_issue(PARAVIRT_LAZY_MMU); 1226 xen_mc_issue(PARAVIRT_LAZY_MMU);
1215 1227
1216 preempt_enable(); 1228 preempt_enable();
1217 } 1229 }
1218 static void xen_flush_tlb(void) 1230 static void xen_flush_tlb(void)
1219 { 1231 {
1220 struct mmuext_op *op; 1232 struct mmuext_op *op;
1221 struct multicall_space mcs; 1233 struct multicall_space mcs;
1222 1234
1223 trace_xen_mmu_flush_tlb(0); 1235 trace_xen_mmu_flush_tlb(0);
1224 1236
1225 preempt_disable(); 1237 preempt_disable();
1226 1238
1227 mcs = xen_mc_entry(sizeof(*op)); 1239 mcs = xen_mc_entry(sizeof(*op));
1228 1240
1229 op = mcs.args; 1241 op = mcs.args;
1230 op->cmd = MMUEXT_TLB_FLUSH_LOCAL; 1242 op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1231 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 1243 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1232 1244
1233 xen_mc_issue(PARAVIRT_LAZY_MMU); 1245 xen_mc_issue(PARAVIRT_LAZY_MMU);
1234 1246
1235 preempt_enable(); 1247 preempt_enable();
1236 } 1248 }
1237 1249
1238 static void xen_flush_tlb_single(unsigned long addr) 1250 static void xen_flush_tlb_single(unsigned long addr)
1239 { 1251 {
1240 struct mmuext_op *op; 1252 struct mmuext_op *op;
1241 struct multicall_space mcs; 1253 struct multicall_space mcs;
1242 1254
1243 trace_xen_mmu_flush_tlb_single(addr); 1255 trace_xen_mmu_flush_tlb_single(addr);
1244 1256
1245 preempt_disable(); 1257 preempt_disable();
1246 1258
1247 mcs = xen_mc_entry(sizeof(*op)); 1259 mcs = xen_mc_entry(sizeof(*op));
1248 op = mcs.args; 1260 op = mcs.args;
1249 op->cmd = MMUEXT_INVLPG_LOCAL; 1261 op->cmd = MMUEXT_INVLPG_LOCAL;
1250 op->arg1.linear_addr = addr & PAGE_MASK; 1262 op->arg1.linear_addr = addr & PAGE_MASK;
1251 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 1263 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1252 1264
1253 xen_mc_issue(PARAVIRT_LAZY_MMU); 1265 xen_mc_issue(PARAVIRT_LAZY_MMU);
1254 1266
1255 preempt_enable(); 1267 preempt_enable();
1256 } 1268 }
1257 1269
1258 static void xen_flush_tlb_others(const struct cpumask *cpus, 1270 static void xen_flush_tlb_others(const struct cpumask *cpus,
1259 struct mm_struct *mm, unsigned long start, 1271 struct mm_struct *mm, unsigned long start,
1260 unsigned long end) 1272 unsigned long end)
1261 { 1273 {
1262 struct { 1274 struct {
1263 struct mmuext_op op; 1275 struct mmuext_op op;
1264 #ifdef CONFIG_SMP 1276 #ifdef CONFIG_SMP
1265 DECLARE_BITMAP(mask, num_processors); 1277 DECLARE_BITMAP(mask, num_processors);
1266 #else 1278 #else
1267 DECLARE_BITMAP(mask, NR_CPUS); 1279 DECLARE_BITMAP(mask, NR_CPUS);
1268 #endif 1280 #endif
1269 } *args; 1281 } *args;
1270 struct multicall_space mcs; 1282 struct multicall_space mcs;
1271 1283
1272 trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); 1284 trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
1273 1285
1274 if (cpumask_empty(cpus)) 1286 if (cpumask_empty(cpus))
1275 return; /* nothing to do */ 1287 return; /* nothing to do */
1276 1288
1277 mcs = xen_mc_entry(sizeof(*args)); 1289 mcs = xen_mc_entry(sizeof(*args));
1278 args = mcs.args; 1290 args = mcs.args;
1279 args->op.arg2.vcpumask = to_cpumask(args->mask); 1291 args->op.arg2.vcpumask = to_cpumask(args->mask);
1280 1292
1281 /* Remove us, and any offline CPUS. */ 1293 /* Remove us, and any offline CPUS. */
1282 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); 1294 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1283 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); 1295 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1284 1296
1285 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; 1297 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1286 if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { 1298 if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
1287 args->op.cmd = MMUEXT_INVLPG_MULTI; 1299 args->op.cmd = MMUEXT_INVLPG_MULTI;
1288 args->op.arg1.linear_addr = start; 1300 args->op.arg1.linear_addr = start;
1289 } 1301 }
1290 1302
1291 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); 1303 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1292 1304
1293 xen_mc_issue(PARAVIRT_LAZY_MMU); 1305 xen_mc_issue(PARAVIRT_LAZY_MMU);
1294 } 1306 }
1295 1307
1296 static unsigned long xen_read_cr3(void) 1308 static unsigned long xen_read_cr3(void)
1297 { 1309 {
1298 return this_cpu_read(xen_cr3); 1310 return this_cpu_read(xen_cr3);
1299 } 1311 }
1300 1312
1301 static void set_current_cr3(void *v) 1313 static void set_current_cr3(void *v)
1302 { 1314 {
1303 this_cpu_write(xen_current_cr3, (unsigned long)v); 1315 this_cpu_write(xen_current_cr3, (unsigned long)v);
1304 } 1316 }
1305 1317
1306 static void __xen_write_cr3(bool kernel, unsigned long cr3) 1318 static void __xen_write_cr3(bool kernel, unsigned long cr3)
1307 { 1319 {
1308 struct mmuext_op op; 1320 struct mmuext_op op;
1309 unsigned long mfn; 1321 unsigned long mfn;
1310 1322
1311 trace_xen_mmu_write_cr3(kernel, cr3); 1323 trace_xen_mmu_write_cr3(kernel, cr3);
1312 1324
1313 if (cr3) 1325 if (cr3)
1314 mfn = pfn_to_mfn(PFN_DOWN(cr3)); 1326 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1315 else 1327 else
1316 mfn = 0; 1328 mfn = 0;
1317 1329
1318 WARN_ON(mfn == 0 && kernel); 1330 WARN_ON(mfn == 0 && kernel);
1319 1331
1320 op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; 1332 op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1321 op.arg1.mfn = mfn; 1333 op.arg1.mfn = mfn;
1322 1334
1323 xen_extend_mmuext_op(&op); 1335 xen_extend_mmuext_op(&op);
1324 1336
1325 if (kernel) { 1337 if (kernel) {
1326 this_cpu_write(xen_cr3, cr3); 1338 this_cpu_write(xen_cr3, cr3);
1327 1339
1328 /* Update xen_current_cr3 once the batch has actually 1340 /* Update xen_current_cr3 once the batch has actually
1329 been submitted. */ 1341 been submitted. */
1330 xen_mc_callback(set_current_cr3, (void *)cr3); 1342 xen_mc_callback(set_current_cr3, (void *)cr3);
1331 } 1343 }
1332 } 1344 }
1333 static void xen_write_cr3(unsigned long cr3) 1345 static void xen_write_cr3(unsigned long cr3)
1334 { 1346 {
1335 BUG_ON(preemptible()); 1347 BUG_ON(preemptible());
1336 1348
1337 xen_mc_batch(); /* disables interrupts */ 1349 xen_mc_batch(); /* disables interrupts */
1338 1350
1339 /* Update while interrupts are disabled, so it's atomic with 1351 /* Update while interrupts are disabled, so it's atomic with
1340 respect to IPIs */ 1352 respect to IPIs */
1341 this_cpu_write(xen_cr3, cr3); 1353 this_cpu_write(xen_cr3, cr3);
1342 1354
1343 __xen_write_cr3(true, cr3); 1355 __xen_write_cr3(true, cr3);
1344 1356
1345 #ifdef CONFIG_X86_64 1357 #ifdef CONFIG_X86_64
1346 { 1358 {
1347 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); 1359 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1348 if (user_pgd) 1360 if (user_pgd)
1349 __xen_write_cr3(false, __pa(user_pgd)); 1361 __xen_write_cr3(false, __pa(user_pgd));
1350 else 1362 else
1351 __xen_write_cr3(false, 0); 1363 __xen_write_cr3(false, 0);
1352 } 1364 }
1353 #endif 1365 #endif
1354 1366
1355 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 1367 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1356 } 1368 }
1357 1369
1358 #ifdef CONFIG_X86_64 1370 #ifdef CONFIG_X86_64
1359 /* 1371 /*
1360 * At the start of the day - when Xen launches a guest, it has already 1372 * At the start of the day - when Xen launches a guest, it has already
1361 * built pagetables for the guest. We diligently look over them 1373 * built pagetables for the guest. We diligently look over them
1362 * in xen_setup_kernel_pagetable and graft them, as appropriate, into the 1374 * in xen_setup_kernel_pagetable and graft them, as appropriate, into the
1363 * init_level4_pgt and its friends. Then when we are happy we load 1375 * init_level4_pgt and its friends. Then when we are happy we load
1364 * the new init_level4_pgt - and continue on. 1376 * the new init_level4_pgt - and continue on.
1365 * 1377 *
1366 * The generic code starts (start_kernel) and 'init_mem_mapping' sets 1378 * The generic code starts (start_kernel) and 'init_mem_mapping' sets
1367 * up the rest of the pagetables. When it has completed it loads the cr3. 1379 * up the rest of the pagetables. When it has completed it loads the cr3.
1368 * N.B. that baremetal would start at 'start_kernel' (and the early 1380 * N.B. that baremetal would start at 'start_kernel' (and the early
1369 * #PF handler would create bootstrap pagetables) - so we are running 1381 * #PF handler would create bootstrap pagetables) - so we are running
1370 * with the same assumptions as what to do when write_cr3 is executed 1382 * with the same assumptions as what to do when write_cr3 is executed
1371 * at this point. 1383 * at this point.
1372 * 1384 *
1373 * Since there are no user-page tables at all, we have two variants 1385 * Since there are no user-page tables at all, we have two variants
1374 * of xen_write_cr3 - the early bootup (this one), and the late one 1386 * of xen_write_cr3 - the early bootup (this one), and the late one
1375 * (xen_write_cr3). The reason we have to do that is that in 64-bit 1387 * (xen_write_cr3). The reason we have to do that is that in 64-bit
1376 * the Linux kernel and user-space are both in ring 3 while the 1388 * the Linux kernel and user-space are both in ring 3 while the
1377 * hypervisor is in ring 0. 1389 * hypervisor is in ring 0.
1378 */ 1390 */
1379 static void __init xen_write_cr3_init(unsigned long cr3) 1391 static void __init xen_write_cr3_init(unsigned long cr3)
1380 { 1392 {
1381 BUG_ON(preemptible()); 1393 BUG_ON(preemptible());
1382 1394
1383 xen_mc_batch(); /* disables interrupts */ 1395 xen_mc_batch(); /* disables interrupts */
1384 1396
1385 /* Update while interrupts are disabled, so it's atomic with 1397 /* Update while interrupts are disabled, so it's atomic with
1386 respect to IPIs */ 1398 respect to IPIs */
1387 this_cpu_write(xen_cr3, cr3); 1399 this_cpu_write(xen_cr3, cr3);
1388 1400
1389 __xen_write_cr3(true, cr3); 1401 __xen_write_cr3(true, cr3);
1390 1402
1391 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 1403 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
1392 } 1404 }
1393 #endif 1405 #endif
1394 1406
1395 static int xen_pgd_alloc(struct mm_struct *mm) 1407 static int xen_pgd_alloc(struct mm_struct *mm)
1396 { 1408 {
1397 pgd_t *pgd = mm->pgd; 1409 pgd_t *pgd = mm->pgd;
1398 int ret = 0; 1410 int ret = 0;
1399 1411
1400 BUG_ON(PagePinned(virt_to_page(pgd))); 1412 BUG_ON(PagePinned(virt_to_page(pgd)));
1401 1413
1402 #ifdef CONFIG_X86_64 1414 #ifdef CONFIG_X86_64
1403 { 1415 {
1404 struct page *page = virt_to_page(pgd); 1416 struct page *page = virt_to_page(pgd);
1405 pgd_t *user_pgd; 1417 pgd_t *user_pgd;
1406 1418
1407 BUG_ON(page->private != 0); 1419 BUG_ON(page->private != 0);
1408 1420
1409 ret = -ENOMEM; 1421 ret = -ENOMEM;
1410 1422
1411 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 1423 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1412 page->private = (unsigned long)user_pgd; 1424 page->private = (unsigned long)user_pgd;
1413 1425
1414 if (user_pgd != NULL) { 1426 if (user_pgd != NULL) {
1415 #ifdef CONFIG_X86_VSYSCALL_EMULATION 1427 #ifdef CONFIG_X86_VSYSCALL_EMULATION
1416 user_pgd[pgd_index(VSYSCALL_ADDR)] = 1428 user_pgd[pgd_index(VSYSCALL_ADDR)] =
1417 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); 1429 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1418 #endif 1430 #endif
1419 ret = 0; 1431 ret = 0;
1420 } 1432 }
1421 1433
1422 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); 1434 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1423 } 1435 }
1424 #endif 1436 #endif
1425 1437
1426 return ret; 1438 return ret;
1427 } 1439 }
1428 1440
1429 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) 1441 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1430 { 1442 {
1431 #ifdef CONFIG_X86_64 1443 #ifdef CONFIG_X86_64
1432 pgd_t *user_pgd = xen_get_user_pgd(pgd); 1444 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1433 1445
1434 if (user_pgd) 1446 if (user_pgd)
1435 free_page((unsigned long)user_pgd); 1447 free_page((unsigned long)user_pgd);
1436 #endif 1448 #endif
1437 } 1449 }
1438 1450
1439 #ifdef CONFIG_X86_32 1451 #ifdef CONFIG_X86_32
1440 static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) 1452 static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1441 { 1453 {
1442 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 1454 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1443 if (pte_val_ma(*ptep) & _PAGE_PRESENT) 1455 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1444 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & 1456 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1445 pte_val_ma(pte)); 1457 pte_val_ma(pte));
1446 1458
1447 return pte; 1459 return pte;
1448 } 1460 }
1449 #else /* CONFIG_X86_64 */ 1461 #else /* CONFIG_X86_64 */
1450 static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) 1462 static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1451 { 1463 {
1452 return pte; 1464 return pte;
1453 } 1465 }
1454 #endif /* CONFIG_X86_64 */ 1466 #endif /* CONFIG_X86_64 */
1455 1467
1456 /* 1468 /*
1457 * Init-time set_pte while constructing initial pagetables, which 1469 * Init-time set_pte while constructing initial pagetables, which
1458 * doesn't allow RO page table pages to be remapped RW. 1470 * doesn't allow RO page table pages to be remapped RW.
1459 * 1471 *
1460 * If there is no MFN for this PFN then this page is initially 1472 * If there is no MFN for this PFN then this page is initially
1461 * ballooned out so clear the PTE (as in decrease_reservation() in 1473 * ballooned out so clear the PTE (as in decrease_reservation() in
1462 * drivers/xen/balloon.c). 1474 * drivers/xen/balloon.c).
1463 * 1475 *
1464 * Many of these PTE updates are done on unpinned and writable pages 1476 * Many of these PTE updates are done on unpinned and writable pages
1465 * and doing a hypercall for these is unnecessary and expensive. At 1477 * and doing a hypercall for these is unnecessary and expensive. At
1466 * this point it is not possible to tell if a page is pinned or not, 1478 * this point it is not possible to tell if a page is pinned or not,
1467 * so always write the PTE directly and rely on Xen trapping and 1479 * so always write the PTE directly and rely on Xen trapping and
1468 * emulating any updates as necessary. 1480 * emulating any updates as necessary.
1469 */ 1481 */
1470 static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) 1482 static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1471 { 1483 {
1472 if (pte_mfn(pte) != INVALID_P2M_ENTRY) 1484 if (pte_mfn(pte) != INVALID_P2M_ENTRY)
1473 pte = mask_rw_pte(ptep, pte); 1485 pte = mask_rw_pte(ptep, pte);
1474 else 1486 else
1475 pte = __pte_ma(0); 1487 pte = __pte_ma(0);
1476 1488
1477 native_set_pte(ptep, pte); 1489 native_set_pte(ptep, pte);
1478 } 1490 }
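
The masking expression in the 32-bit mask_rw_pte() above is terse. Below is a minimal standalone sketch of the same logic, assuming the x86 bit layout where _PAGE_PRESENT is bit 0 and _PAGE_RW is bit 1; the helper name and the example values are illustrative only, not part of the patch.

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t pteval_t;
    #define _PAGE_PRESENT ((pteval_t)1 << 0)
    #define _PAGE_RW      ((pteval_t)1 << 1)

    /* If the existing PTE is present, the new value may keep _PAGE_RW only
     * if the old value already had it; otherwise the new value is unchanged. */
    static pteval_t mask_rw(pteval_t old, pteval_t new)
    {
            if (old & _PAGE_PRESENT)
                    new &= (old & _PAGE_RW) | ~_PAGE_RW;
            return new;
    }

    int main(void)
    {
            pteval_t ro_old = _PAGE_PRESENT;            /* present, read-only */
            pteval_t rw_new = _PAGE_PRESENT | _PAGE_RW; /* asks for read-write */

            /* Prints 1: the read-only page-table page stays read-only. */
            printf("%llx\n", (unsigned long long)mask_rw(ro_old, rw_new));
            return 0;
    }

When the old value does have _PAGE_RW set, the mask becomes all ones and the new PTE passes through untouched.
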
1479 1491
1480 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) 1492 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1481 { 1493 {
1482 struct mmuext_op op; 1494 struct mmuext_op op;
1483 op.cmd = cmd; 1495 op.cmd = cmd;
1484 op.arg1.mfn = pfn_to_mfn(pfn); 1496 op.arg1.mfn = pfn_to_mfn(pfn);
1485 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) 1497 if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1486 BUG(); 1498 BUG();
1487 } 1499 }
1488 1500
1489 /* Early in boot, while setting up the initial pagetable, assume 1501 /* Early in boot, while setting up the initial pagetable, assume
1490 everything is pinned. */ 1502 everything is pinned. */
1491 static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) 1503 static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1492 { 1504 {
1493 #ifdef CONFIG_FLATMEM 1505 #ifdef CONFIG_FLATMEM
1494 BUG_ON(mem_map); /* should only be used early */ 1506 BUG_ON(mem_map); /* should only be used early */
1495 #endif 1507 #endif
1496 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 1508 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1497 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 1509 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1498 } 1510 }
1499 1511
1500 /* Used for pmd and pud */ 1512 /* Used for pmd and pud */
1501 static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) 1513 static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1502 { 1514 {
1503 #ifdef CONFIG_FLATMEM 1515 #ifdef CONFIG_FLATMEM
1504 BUG_ON(mem_map); /* should only be used early */ 1516 BUG_ON(mem_map); /* should only be used early */
1505 #endif 1517 #endif
1506 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 1518 make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1507 } 1519 }
1508 1520
1509 /* Early release_pte assumes that all pts are pinned, since there's 1521 /* Early release_pte assumes that all pts are pinned, since there's
1510 only init_mm and anything attached to that is pinned. */ 1522 only init_mm and anything attached to that is pinned. */
1511 static void __init xen_release_pte_init(unsigned long pfn) 1523 static void __init xen_release_pte_init(unsigned long pfn)
1512 { 1524 {
1513 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 1525 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1514 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1526 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1515 } 1527 }
1516 1528
1517 static void __init xen_release_pmd_init(unsigned long pfn) 1529 static void __init xen_release_pmd_init(unsigned long pfn)
1518 { 1530 {
1519 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1531 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1520 } 1532 }
1521 1533
1522 static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn) 1534 static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1523 { 1535 {
1524 struct multicall_space mcs; 1536 struct multicall_space mcs;
1525 struct mmuext_op *op; 1537 struct mmuext_op *op;
1526 1538
1527 mcs = __xen_mc_entry(sizeof(*op)); 1539 mcs = __xen_mc_entry(sizeof(*op));
1528 op = mcs.args; 1540 op = mcs.args;
1529 op->cmd = cmd; 1541 op->cmd = cmd;
1530 op->arg1.mfn = pfn_to_mfn(pfn); 1542 op->arg1.mfn = pfn_to_mfn(pfn);
1531 1543
1532 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 1544 MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
1533 } 1545 }
1534 1546
1535 static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) 1547 static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
1536 { 1548 {
1537 struct multicall_space mcs; 1549 struct multicall_space mcs;
1538 unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT); 1550 unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
1539 1551
1540 mcs = __xen_mc_entry(0); 1552 mcs = __xen_mc_entry(0);
1541 MULTI_update_va_mapping(mcs.mc, (unsigned long)addr, 1553 MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
1542 pfn_pte(pfn, prot), 0); 1554 pfn_pte(pfn, prot), 0);
1543 } 1555 }
1544 1556
1545 /* This needs to make sure the new pte page is pinned iff it's being 1557 /* This needs to make sure the new pte page is pinned iff it's being
1546 attached to a pinned pagetable. */ 1558 attached to a pinned pagetable. */
1547 static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, 1559 static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
1548 unsigned level) 1560 unsigned level)
1549 { 1561 {
1550 bool pinned = PagePinned(virt_to_page(mm->pgd)); 1562 bool pinned = PagePinned(virt_to_page(mm->pgd));
1551 1563
1552 trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); 1564 trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
1553 1565
1554 if (pinned) { 1566 if (pinned) {
1555 struct page *page = pfn_to_page(pfn); 1567 struct page *page = pfn_to_page(pfn);
1556 1568
1557 SetPagePinned(page); 1569 SetPagePinned(page);
1558 1570
1559 if (!PageHighMem(page)) { 1571 if (!PageHighMem(page)) {
1560 xen_mc_batch(); 1572 xen_mc_batch();
1561 1573
1562 __set_pfn_prot(pfn, PAGE_KERNEL_RO); 1574 __set_pfn_prot(pfn, PAGE_KERNEL_RO);
1563 1575
1564 if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) 1576 if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1565 __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 1577 __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1566 1578
1567 xen_mc_issue(PARAVIRT_LAZY_MMU); 1579 xen_mc_issue(PARAVIRT_LAZY_MMU);
1568 } else { 1580 } else {
1569 /* make sure there are no stray mappings of 1581 /* make sure there are no stray mappings of
1570 this page */ 1582 this page */
1571 kmap_flush_unused(); 1583 kmap_flush_unused();
1572 } 1584 }
1573 } 1585 }
1574 } 1586 }
1575 1587
1576 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) 1588 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1577 { 1589 {
1578 xen_alloc_ptpage(mm, pfn, PT_PTE); 1590 xen_alloc_ptpage(mm, pfn, PT_PTE);
1579 } 1591 }
1580 1592
1581 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) 1593 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1582 { 1594 {
1583 xen_alloc_ptpage(mm, pfn, PT_PMD); 1595 xen_alloc_ptpage(mm, pfn, PT_PMD);
1584 } 1596 }
1585 1597
1586 /* This should never happen until we're OK to use struct page */ 1598 /* This should never happen until we're OK to use struct page */
1587 static inline void xen_release_ptpage(unsigned long pfn, unsigned level) 1599 static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
1588 { 1600 {
1589 struct page *page = pfn_to_page(pfn); 1601 struct page *page = pfn_to_page(pfn);
1590 bool pinned = PagePinned(page); 1602 bool pinned = PagePinned(page);
1591 1603
1592 trace_xen_mmu_release_ptpage(pfn, level, pinned); 1604 trace_xen_mmu_release_ptpage(pfn, level, pinned);
1593 1605
1594 if (pinned) { 1606 if (pinned) {
1595 if (!PageHighMem(page)) { 1607 if (!PageHighMem(page)) {
1596 xen_mc_batch(); 1608 xen_mc_batch();
1597 1609
1598 if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) 1610 if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1599 __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 1611 __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1600 1612
1601 __set_pfn_prot(pfn, PAGE_KERNEL); 1613 __set_pfn_prot(pfn, PAGE_KERNEL);
1602 1614
1603 xen_mc_issue(PARAVIRT_LAZY_MMU); 1615 xen_mc_issue(PARAVIRT_LAZY_MMU);
1604 } 1616 }
1605 ClearPagePinned(page); 1617 ClearPagePinned(page);
1606 } 1618 }
1607 } 1619 }
1608 1620
1609 static void xen_release_pte(unsigned long pfn) 1621 static void xen_release_pte(unsigned long pfn)
1610 { 1622 {
1611 xen_release_ptpage(pfn, PT_PTE); 1623 xen_release_ptpage(pfn, PT_PTE);
1612 } 1624 }
1613 1625
1614 static void xen_release_pmd(unsigned long pfn) 1626 static void xen_release_pmd(unsigned long pfn)
1615 { 1627 {
1616 xen_release_ptpage(pfn, PT_PMD); 1628 xen_release_ptpage(pfn, PT_PMD);
1617 } 1629 }
1618 1630
1619 #if PAGETABLE_LEVELS == 4 1631 #if PAGETABLE_LEVELS == 4
1620 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) 1632 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1621 { 1633 {
1622 xen_alloc_ptpage(mm, pfn, PT_PUD); 1634 xen_alloc_ptpage(mm, pfn, PT_PUD);
1623 } 1635 }
1624 1636
1625 static void xen_release_pud(unsigned long pfn) 1637 static void xen_release_pud(unsigned long pfn)
1626 { 1638 {
1627 xen_release_ptpage(pfn, PT_PUD); 1639 xen_release_ptpage(pfn, PT_PUD);
1628 } 1640 }
1629 #endif 1641 #endif
1630 1642
1631 void __init xen_reserve_top(void) 1643 void __init xen_reserve_top(void)
1632 { 1644 {
1633 #ifdef CONFIG_X86_32 1645 #ifdef CONFIG_X86_32
1634 unsigned long top = HYPERVISOR_VIRT_START; 1646 unsigned long top = HYPERVISOR_VIRT_START;
1635 struct xen_platform_parameters pp; 1647 struct xen_platform_parameters pp;
1636 1648
1637 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) 1649 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1638 top = pp.virt_start; 1650 top = pp.virt_start;
1639 1651
1640 reserve_top_address(-top); 1652 reserve_top_address(-top);
1641 #endif /* CONFIG_X86_32 */ 1653 #endif /* CONFIG_X86_32 */
1642 } 1654 }
1643 1655
1644 /* 1656 /*
1645 * Like __va(), but returns the address in the kernel mapping (which is 1657 * Like __va(), but returns the address in the kernel mapping (which is
1646 * all we have until the physical memory mapping has been set up). 1658 * all we have until the physical memory mapping has been set up).
1647 */ 1659 */
1648 static void *__ka(phys_addr_t paddr) 1660 static void *__ka(phys_addr_t paddr)
1649 { 1661 {
1650 #ifdef CONFIG_X86_64 1662 #ifdef CONFIG_X86_64
1651 return (void *)(paddr + __START_KERNEL_map); 1663 return (void *)(paddr + __START_KERNEL_map);
1652 #else 1664 #else
1653 return __va(paddr); 1665 return __va(paddr);
1654 #endif 1666 #endif
1655 } 1667 }
1656 1668
1657 /* Convert a machine address to physical address */ 1669 /* Convert a machine address to physical address */
1658 static unsigned long m2p(phys_addr_t maddr) 1670 static unsigned long m2p(phys_addr_t maddr)
1659 { 1671 {
1660 phys_addr_t paddr; 1672 phys_addr_t paddr;
1661 1673
1662 maddr &= PTE_PFN_MASK; 1674 maddr &= PTE_PFN_MASK;
1663 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; 1675 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1664 1676
1665 return paddr; 1677 return paddr;
1666 } 1678 }
1667 1679
1668 /* Convert a machine address to kernel virtual */ 1680 /* Convert a machine address to kernel virtual */
1669 static void *m2v(phys_addr_t maddr) 1681 static void *m2v(phys_addr_t maddr)
1670 { 1682 {
1671 return __ka(m2p(maddr)); 1683 return __ka(m2p(maddr));
1672 } 1684 }
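
As a rough, self-contained sketch of the conversion m2p() performs above: mask off the PTE flag bits, translate the machine frame number through the machine-to-physical table, and shift back to an address. The small lookup array stands in for the hypervisor-maintained machine_to_phys_mapping, and the values are made up for illustration.

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT   12
    #define PTE_PFN_MASK 0x000ffffffffff000ULL   /* frame bits of a 64-bit PTE */

    /* Stand-in for the M2P table: index = machine frame, value = pseudo-physical frame. */
    static const uint64_t m2p_table[] = { 0, 7, 3, 9 };

    static uint64_t m2p(uint64_t maddr)
    {
            maddr &= PTE_PFN_MASK;                        /* drop PTE flag bits */
            return m2p_table[maddr >> PAGE_SHIFT] << PAGE_SHIFT;
    }

    int main(void)
    {
            /* A machine address in frame 2, with typical PTE flags (0x63) attached. */
            uint64_t maddr = (2ULL << PAGE_SHIFT) | 0x63;

            printf("0x%llx\n", (unsigned long long)m2p(maddr));  /* prints 0x3000 */
            return 0;
    }

The page offset is deliberately discarded: the m2v() callers above feed it page-table entries (pgd/pud/pmd values), where only the frame matters.
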
1673 1685
1674 /* Set the page permissions on identity-mapped pages */ 1686 /* Set the page permissions on identity-mapped pages */
1675 static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags) 1687 static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags)
1676 { 1688 {
1677 unsigned long pfn = __pa(addr) >> PAGE_SHIFT; 1689 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1678 pte_t pte = pfn_pte(pfn, prot); 1690 pte_t pte = pfn_pte(pfn, prot);
1679 1691
1680 /* For PVH no need to set R/O or R/W to pin them or unpin them. */ 1692 /* For PVH no need to set R/O or R/W to pin them or unpin them. */
1681 if (xen_feature(XENFEAT_auto_translated_physmap)) 1693 if (xen_feature(XENFEAT_auto_translated_physmap))
1682 return; 1694 return;
1683 1695
1684 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) 1696 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
1685 BUG(); 1697 BUG();
1686 } 1698 }
1687 static void set_page_prot(void *addr, pgprot_t prot) 1699 static void set_page_prot(void *addr, pgprot_t prot)
1688 { 1700 {
1689 return set_page_prot_flags(addr, prot, UVMF_NONE); 1701 return set_page_prot_flags(addr, prot, UVMF_NONE);
1690 } 1702 }
1691 #ifdef CONFIG_X86_32 1703 #ifdef CONFIG_X86_32
1692 static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) 1704 static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1693 { 1705 {
1694 unsigned pmdidx, pteidx; 1706 unsigned pmdidx, pteidx;
1695 unsigned ident_pte; 1707 unsigned ident_pte;
1696 unsigned long pfn; 1708 unsigned long pfn;
1697 1709
1698 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, 1710 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1699 PAGE_SIZE); 1711 PAGE_SIZE);
1700 1712
1701 ident_pte = 0; 1713 ident_pte = 0;
1702 pfn = 0; 1714 pfn = 0;
1703 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { 1715 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1704 pte_t *pte_page; 1716 pte_t *pte_page;
1705 1717
1706 /* Reuse or allocate a page of ptes */ 1718 /* Reuse or allocate a page of ptes */
1707 if (pmd_present(pmd[pmdidx])) 1719 if (pmd_present(pmd[pmdidx]))
1708 pte_page = m2v(pmd[pmdidx].pmd); 1720 pte_page = m2v(pmd[pmdidx].pmd);
1709 else { 1721 else {
1710 /* Check for free pte pages */ 1722 /* Check for free pte pages */
1711 if (ident_pte == LEVEL1_IDENT_ENTRIES) 1723 if (ident_pte == LEVEL1_IDENT_ENTRIES)
1712 break; 1724 break;
1713 1725
1714 pte_page = &level1_ident_pgt[ident_pte]; 1726 pte_page = &level1_ident_pgt[ident_pte];
1715 ident_pte += PTRS_PER_PTE; 1727 ident_pte += PTRS_PER_PTE;
1716 1728
1717 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); 1729 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1718 } 1730 }
1719 1731
1720 /* Install mappings */ 1732 /* Install mappings */
1721 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { 1733 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1722 pte_t pte; 1734 pte_t pte;
1723 1735
1724 #ifdef CONFIG_X86_32 1736 #ifdef CONFIG_X86_32
1725 if (pfn > max_pfn_mapped) 1737 if (pfn > max_pfn_mapped)
1726 max_pfn_mapped = pfn; 1738 max_pfn_mapped = pfn;
1727 #endif 1739 #endif
1728 1740
1729 if (!pte_none(pte_page[pteidx])) 1741 if (!pte_none(pte_page[pteidx]))
1730 continue; 1742 continue;
1731 1743
1732 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); 1744 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1733 pte_page[pteidx] = pte; 1745 pte_page[pteidx] = pte;
1734 } 1746 }
1735 } 1747 }
1736 1748
1737 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) 1749 for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1738 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); 1750 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1739 1751
1740 set_page_prot(pmd, PAGE_KERNEL_RO); 1752 set_page_prot(pmd, PAGE_KERNEL_RO);
1741 } 1753 }
1742 #endif 1754 #endif
1743 void __init xen_setup_machphys_mapping(void) 1755 void __init xen_setup_machphys_mapping(void)
1744 { 1756 {
1745 struct xen_machphys_mapping mapping; 1757 struct xen_machphys_mapping mapping;
1746 1758
1747 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { 1759 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1748 machine_to_phys_mapping = (unsigned long *)mapping.v_start; 1760 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1749 machine_to_phys_nr = mapping.max_mfn + 1; 1761 machine_to_phys_nr = mapping.max_mfn + 1;
1750 } else { 1762 } else {
1751 machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; 1763 machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
1752 } 1764 }
1753 #ifdef CONFIG_X86_32 1765 #ifdef CONFIG_X86_32
1754 WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1)) 1766 WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
1755 < machine_to_phys_mapping); 1767 < machine_to_phys_mapping);
1756 #endif 1768 #endif
1757 } 1769 }
1758 1770
1759 #ifdef CONFIG_X86_64 1771 #ifdef CONFIG_X86_64
1760 static void convert_pfn_mfn(void *v) 1772 static void convert_pfn_mfn(void *v)
1761 { 1773 {
1762 pte_t *pte = v; 1774 pte_t *pte = v;
1763 int i; 1775 int i;
1764 1776
1765 /* All levels are converted the same way, so just treat them 1777 /* All levels are converted the same way, so just treat them
1766 as ptes. */ 1778 as ptes. */
1767 for (i = 0; i < PTRS_PER_PTE; i++) 1779 for (i = 0; i < PTRS_PER_PTE; i++)
1768 pte[i] = xen_make_pte(pte[i].pte); 1780 pte[i] = xen_make_pte(pte[i].pte);
1769 } 1781 }
1770 static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, 1782 static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
1771 unsigned long addr) 1783 unsigned long addr)
1772 { 1784 {
1773 if (*pt_base == PFN_DOWN(__pa(addr))) { 1785 if (*pt_base == PFN_DOWN(__pa(addr))) {
1774 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); 1786 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1775 clear_page((void *)addr); 1787 clear_page((void *)addr);
1776 (*pt_base)++; 1788 (*pt_base)++;
1777 } 1789 }
1778 if (*pt_end == PFN_DOWN(__pa(addr))) { 1790 if (*pt_end == PFN_DOWN(__pa(addr))) {
1779 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); 1791 set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1780 clear_page((void *)addr); 1792 clear_page((void *)addr);
1781 (*pt_end)--; 1793 (*pt_end)--;
1782 } 1794 }
1783 } 1795 }
1784 /* 1796 /*
1785 * Set up the initial kernel pagetable. 1797 * Set up the initial kernel pagetable.
1786 * 1798 *
1787 * We can construct this by grafting the Xen-provided pagetable into 1799 * We can construct this by grafting the Xen-provided pagetable into
1788 * head_64.S's preconstructed pagetables. We copy the Xen L2's into 1800 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
1789 * level2_ident_pgt, and level2_kernel_pgt. This means that only the 1801 * level2_ident_pgt, and level2_kernel_pgt. This means that only the
1790 * kernel has a physical mapping to start with - but that's enough to 1802 * kernel has a physical mapping to start with - but that's enough to
1791 * get __va working. We need to fill in the rest of the physical 1803 * get __va working. We need to fill in the rest of the physical
1792 * mapping once some sort of allocator has been set up. NOTE: for 1804 * mapping once some sort of allocator has been set up. NOTE: for
1793 * PVH, the page tables are native. 1805 * PVH, the page tables are native.
1794 */ 1806 */
1795 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 1807 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1796 { 1808 {
1797 pud_t *l3; 1809 pud_t *l3;
1798 pmd_t *l2; 1810 pmd_t *l2;
1799 unsigned long addr[3]; 1811 unsigned long addr[3];
1800 unsigned long pt_base, pt_end; 1812 unsigned long pt_base, pt_end;
1801 unsigned i; 1813 unsigned i;
1802 1814
1803 /* max_pfn_mapped is the last pfn mapped in the initial memory 1815 /* max_pfn_mapped is the last pfn mapped in the initial memory
1804 * mappings. Considering that on Xen after the kernel mappings we 1816 * mappings. Considering that on Xen after the kernel mappings we
1805 * have the mappings of some pages that don't exist in pfn space, we 1817 * have the mappings of some pages that don't exist in pfn space, we
1806 * set max_pfn_mapped to the last real pfn mapped. */ 1818 * set max_pfn_mapped to the last real pfn mapped. */
1807 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); 1819 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1808 1820
1809 pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); 1821 pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
1810 pt_end = pt_base + xen_start_info->nr_pt_frames; 1822 pt_end = pt_base + xen_start_info->nr_pt_frames;
1811 1823
1812 /* Zap identity mapping */ 1824 /* Zap identity mapping */
1813 init_level4_pgt[0] = __pgd(0); 1825 init_level4_pgt[0] = __pgd(0);
1814 1826
1815 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 1827 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1816 /* Pre-constructed entries are in pfn, so convert to mfn */ 1828 /* Pre-constructed entries are in pfn, so convert to mfn */
1817 /* L4[272] -> level3_ident_pgt 1829 /* L4[272] -> level3_ident_pgt
1818 * L4[511] -> level3_kernel_pgt */ 1830 * L4[511] -> level3_kernel_pgt */
1819 convert_pfn_mfn(init_level4_pgt); 1831 convert_pfn_mfn(init_level4_pgt);
1820 1832
1821 /* L3_i[0] -> level2_ident_pgt */ 1833 /* L3_i[0] -> level2_ident_pgt */
1822 convert_pfn_mfn(level3_ident_pgt); 1834 convert_pfn_mfn(level3_ident_pgt);
1823 /* L3_k[510] -> level2_kernel_pgt 1835 /* L3_k[510] -> level2_kernel_pgt
1824 * L3_k[511] -> level2_fixmap_pgt */ 1836 * L3_k[511] -> level2_fixmap_pgt */
1825 convert_pfn_mfn(level3_kernel_pgt); 1837 convert_pfn_mfn(level3_kernel_pgt);
1826 1838
1827 /* L3_k[511][506] -> level1_fixmap_pgt */ 1839 /* L3_k[511][506] -> level1_fixmap_pgt */
1828 convert_pfn_mfn(level2_fixmap_pgt); 1840 convert_pfn_mfn(level2_fixmap_pgt);
1829 } 1841 }
1830 /* We get [511][511] and have Xen's version of level2_kernel_pgt */ 1842 /* We get [511][511] and have Xen's version of level2_kernel_pgt */
1831 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); 1843 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1832 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); 1844 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1833 1845
1834 addr[0] = (unsigned long)pgd; 1846 addr[0] = (unsigned long)pgd;
1835 addr[1] = (unsigned long)l3; 1847 addr[1] = (unsigned long)l3;
1836 addr[2] = (unsigned long)l2; 1848 addr[2] = (unsigned long)l2;
1837 /* Graft it onto L4[272][0]. Note that we are creating an aliasing problem: 1849 /* Graft it onto L4[272][0]. Note that we are creating an aliasing problem:
1838 * Both L4[272][0] and L4[511][510] have entries that point to the same 1850 * Both L4[272][0] and L4[511][510] have entries that point to the same
1839 * L2 (PMD) tables. Meaning that if you modify it in __va space 1851 * L2 (PMD) tables. Meaning that if you modify it in __va space
1840 * it will also be modified in the __ka space! (But if you just 1852 * it will also be modified in the __ka space! (But if you just
1841 * modify the PMD table to point to other PTE's or none, then you 1853 * modify the PMD table to point to other PTE's or none, then you
1842 * are OK - which is what cleanup_highmap does) */ 1854 * are OK - which is what cleanup_highmap does) */
1843 copy_page(level2_ident_pgt, l2); 1855 copy_page(level2_ident_pgt, l2);
1844 /* Graft it onto L4[511][510] */ 1856 /* Graft it onto L4[511][510] */
1845 copy_page(level2_kernel_pgt, l2); 1857 copy_page(level2_kernel_pgt, l2);
1846 1858
1847 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 1859 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1848 /* Make pagetable pieces RO */ 1860 /* Make pagetable pieces RO */
1849 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); 1861 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1850 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); 1862 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1851 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); 1863 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1852 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); 1864 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1853 set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); 1865 set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
1854 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); 1866 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1855 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); 1867 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1856 set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); 1868 set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
1857 1869
1858 /* Pin down new L4 */ 1870 /* Pin down new L4 */
1859 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, 1871 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1860 PFN_DOWN(__pa_symbol(init_level4_pgt))); 1872 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1861 1873
1862 /* Unpin Xen-provided one */ 1874 /* Unpin Xen-provided one */
1863 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 1875 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1864 1876
1865 /* 1877 /*
1866 * At this stage there can be no user pgd, and no page 1878 * At this stage there can be no user pgd, and no page
1867 * structure to attach it to, so make sure we just set kernel 1879 * structure to attach it to, so make sure we just set kernel
1868 * pgd. 1880 * pgd.
1869 */ 1881 */
1870 xen_mc_batch(); 1882 xen_mc_batch();
1871 __xen_write_cr3(true, __pa(init_level4_pgt)); 1883 __xen_write_cr3(true, __pa(init_level4_pgt));
1872 xen_mc_issue(PARAVIRT_LAZY_CPU); 1884 xen_mc_issue(PARAVIRT_LAZY_CPU);
1873 } else 1885 } else
1874 native_write_cr3(__pa(init_level4_pgt)); 1886 native_write_cr3(__pa(init_level4_pgt));
1875 1887
1876 /* We can't easily rip out the L3 and L2, as the Xen pagetables are 1888 /* We can't easily rip out the L3 and L2, as the Xen pagetables are
1877 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for 1889 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
1878 * the initial domain. For guests using the toolstack, they are in: 1890 * the initial domain. For guests using the toolstack, they are in:
1879 * [L4], [L3], [L2], [L1], [L1] order. So for dom0 we can only 1891 * [L4], [L3], [L2], [L1], [L1] order. So for dom0 we can only
1880 * rip out the [L4] (pgd), but for guests we shave off three pages. 1892 * rip out the [L4] (pgd), but for guests we shave off three pages.
1881 */ 1893 */
1882 for (i = 0; i < ARRAY_SIZE(addr); i++) 1894 for (i = 0; i < ARRAY_SIZE(addr); i++)
1883 check_pt_base(&pt_base, &pt_end, addr[i]); 1895 check_pt_base(&pt_base, &pt_end, addr[i]);
1884 1896
1885 /* Our (by three pages) smaller Xen pagetable that we are using */ 1897 /* Our (by three pages) smaller Xen pagetable that we are using */
1886 memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); 1898 memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE);
1887 /* Revector the xen_start_info */ 1899 /* Revector the xen_start_info */
1888 xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); 1900 xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
1889 } 1901 }
1890 #else /* !CONFIG_X86_64 */ 1902 #else /* !CONFIG_X86_64 */
1891 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); 1903 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1892 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); 1904 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1893 1905
1894 static void __init xen_write_cr3_init(unsigned long cr3) 1906 static void __init xen_write_cr3_init(unsigned long cr3)
1895 { 1907 {
1896 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); 1908 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1897 1909
1898 BUG_ON(read_cr3() != __pa(initial_page_table)); 1910 BUG_ON(read_cr3() != __pa(initial_page_table));
1899 BUG_ON(cr3 != __pa(swapper_pg_dir)); 1911 BUG_ON(cr3 != __pa(swapper_pg_dir));
1900 1912
1901 /* 1913 /*
1902 * We are switching to swapper_pg_dir for the first time (from 1914 * We are switching to swapper_pg_dir for the first time (from
1903 * initial_page_table) and therefore need to mark that page 1915 * initial_page_table) and therefore need to mark that page
1904 * read-only and then pin it. 1916 * read-only and then pin it.
1905 * 1917 *
1906 * Xen disallows sharing of kernel PMDs for PAE 1918 * Xen disallows sharing of kernel PMDs for PAE
1907 * guests. Therefore we must copy the kernel PMD from 1919 * guests. Therefore we must copy the kernel PMD from
1908 * initial_page_table into a new kernel PMD to be used in 1920 * initial_page_table into a new kernel PMD to be used in
1909 * swapper_pg_dir. 1921 * swapper_pg_dir.
1910 */ 1922 */
1911 swapper_kernel_pmd = 1923 swapper_kernel_pmd =
1912 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); 1924 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1913 copy_page(swapper_kernel_pmd, initial_kernel_pmd); 1925 copy_page(swapper_kernel_pmd, initial_kernel_pmd);
1914 swapper_pg_dir[KERNEL_PGD_BOUNDARY] = 1926 swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
1915 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); 1927 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
1916 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); 1928 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
1917 1929
1918 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); 1930 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1919 xen_write_cr3(cr3); 1931 xen_write_cr3(cr3);
1920 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); 1932 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
1921 1933
1922 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, 1934 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
1923 PFN_DOWN(__pa(initial_page_table))); 1935 PFN_DOWN(__pa(initial_page_table)));
1924 set_page_prot(initial_page_table, PAGE_KERNEL); 1936 set_page_prot(initial_page_table, PAGE_KERNEL);
1925 set_page_prot(initial_kernel_pmd, PAGE_KERNEL); 1937 set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
1926 1938
1927 pv_mmu_ops.write_cr3 = &xen_write_cr3; 1939 pv_mmu_ops.write_cr3 = &xen_write_cr3;
1928 } 1940 }
1929 1941
1930 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 1942 void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1931 { 1943 {
1932 pmd_t *kernel_pmd; 1944 pmd_t *kernel_pmd;
1933 1945
1934 initial_kernel_pmd = 1946 initial_kernel_pmd =
1935 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); 1947 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1936 1948
1937 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + 1949 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1938 xen_start_info->nr_pt_frames * PAGE_SIZE + 1950 xen_start_info->nr_pt_frames * PAGE_SIZE +
1939 512*1024); 1951 512*1024);
1940 1952
1941 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 1953 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1942 copy_page(initial_kernel_pmd, kernel_pmd); 1954 copy_page(initial_kernel_pmd, kernel_pmd);
1943 1955
1944 xen_map_identity_early(initial_kernel_pmd, max_pfn); 1956 xen_map_identity_early(initial_kernel_pmd, max_pfn);
1945 1957
1946 copy_page(initial_page_table, pgd); 1958 copy_page(initial_page_table, pgd);
1947 initial_page_table[KERNEL_PGD_BOUNDARY] = 1959 initial_page_table[KERNEL_PGD_BOUNDARY] =
1948 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); 1960 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
1949 1961
1950 set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); 1962 set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
1951 set_page_prot(initial_page_table, PAGE_KERNEL_RO); 1963 set_page_prot(initial_page_table, PAGE_KERNEL_RO);
1952 set_page_prot(empty_zero_page, PAGE_KERNEL_RO); 1964 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1953 1965
1954 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 1966 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1955 1967
1956 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, 1968 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
1957 PFN_DOWN(__pa(initial_page_table))); 1969 PFN_DOWN(__pa(initial_page_table)));
1958 xen_write_cr3(__pa(initial_page_table)); 1970 xen_write_cr3(__pa(initial_page_table));
1959 1971
1960 memblock_reserve(__pa(xen_start_info->pt_base), 1972 memblock_reserve(__pa(xen_start_info->pt_base),
1961 xen_start_info->nr_pt_frames * PAGE_SIZE); 1973 xen_start_info->nr_pt_frames * PAGE_SIZE);
1962 } 1974 }
1963 #endif /* CONFIG_X86_64 */ 1975 #endif /* CONFIG_X86_64 */
1964 1976
1965 static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; 1977 static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1966 1978
1967 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) 1979 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1968 { 1980 {
1969 pte_t pte; 1981 pte_t pte;
1970 1982
1971 phys >>= PAGE_SHIFT; 1983 phys >>= PAGE_SHIFT;
1972 1984
1973 switch (idx) { 1985 switch (idx) {
1974 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: 1986 case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1975 case FIX_RO_IDT: 1987 case FIX_RO_IDT:
1976 #ifdef CONFIG_X86_32 1988 #ifdef CONFIG_X86_32
1977 case FIX_WP_TEST: 1989 case FIX_WP_TEST:
1978 # ifdef CONFIG_HIGHMEM 1990 # ifdef CONFIG_HIGHMEM
1979 case FIX_KMAP_BEGIN ... FIX_KMAP_END: 1991 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1980 # endif 1992 # endif
1981 #elif defined(CONFIG_X86_VSYSCALL_EMULATION) 1993 #elif defined(CONFIG_X86_VSYSCALL_EMULATION)
1982 case VSYSCALL_PAGE: 1994 case VSYSCALL_PAGE:
1983 #endif 1995 #endif
1984 case FIX_TEXT_POKE0: 1996 case FIX_TEXT_POKE0:
1985 case FIX_TEXT_POKE1: 1997 case FIX_TEXT_POKE1:
1986 /* All local page mappings */ 1998 /* All local page mappings */
1987 pte = pfn_pte(phys, prot); 1999 pte = pfn_pte(phys, prot);
1988 break; 2000 break;
1989 2001
1990 #ifdef CONFIG_X86_LOCAL_APIC 2002 #ifdef CONFIG_X86_LOCAL_APIC
1991 case FIX_APIC_BASE: /* maps dummy local APIC */ 2003 case FIX_APIC_BASE: /* maps dummy local APIC */
1992 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); 2004 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1993 break; 2005 break;
1994 #endif 2006 #endif
1995 2007
1996 #ifdef CONFIG_X86_IO_APIC 2008 #ifdef CONFIG_X86_IO_APIC
1997 case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: 2009 case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
1998 /* 2010 /*
1999 * We just don't map the IO APIC - all access is via 2011 * We just don't map the IO APIC - all access is via
2000 * hypercalls. Keep the address in the pte for reference. 2012 * hypercalls. Keep the address in the pte for reference.
2001 */ 2013 */
2002 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); 2014 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2003 break; 2015 break;
2004 #endif 2016 #endif
2005 2017
2006 case FIX_PARAVIRT_BOOTMAP: 2018 case FIX_PARAVIRT_BOOTMAP:
2007 /* This is an MFN, but it isn't an IO mapping from the 2019 /* This is an MFN, but it isn't an IO mapping from the
2008 IO domain */ 2020 IO domain */
2009 pte = mfn_pte(phys, prot); 2021 pte = mfn_pte(phys, prot);
2010 break; 2022 break;
2011 2023
2012 default: 2024 default:
2013 /* By default, set_fixmap is used for hardware mappings */ 2025 /* By default, set_fixmap is used for hardware mappings */
2014 pte = mfn_pte(phys, prot); 2026 pte = mfn_pte(phys, prot);
2015 break; 2027 break;
2016 } 2028 }
2017 2029
2018 __native_set_fixmap(idx, pte); 2030 __native_set_fixmap(idx, pte);
2019 2031
2020 #ifdef CONFIG_X86_VSYSCALL_EMULATION 2032 #ifdef CONFIG_X86_VSYSCALL_EMULATION
2021 /* Replicate changes to map the vsyscall page into the user 2033 /* Replicate changes to map the vsyscall page into the user
2022 pagetable vsyscall mapping. */ 2034 pagetable vsyscall mapping. */
2023 if (idx == VSYSCALL_PAGE) { 2035 if (idx == VSYSCALL_PAGE) {
2024 unsigned long vaddr = __fix_to_virt(idx); 2036 unsigned long vaddr = __fix_to_virt(idx);
2025 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); 2037 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
2026 } 2038 }
2027 #endif 2039 #endif
2028 } 2040 }
2029 2041
2030 static void __init xen_post_allocator_init(void) 2042 static void __init xen_post_allocator_init(void)
2031 { 2043 {
2032 if (xen_feature(XENFEAT_auto_translated_physmap)) 2044 if (xen_feature(XENFEAT_auto_translated_physmap))
2033 return; 2045 return;
2034 2046
2035 pv_mmu_ops.set_pte = xen_set_pte; 2047 pv_mmu_ops.set_pte = xen_set_pte;
2036 pv_mmu_ops.set_pmd = xen_set_pmd; 2048 pv_mmu_ops.set_pmd = xen_set_pmd;
2037 pv_mmu_ops.set_pud = xen_set_pud; 2049 pv_mmu_ops.set_pud = xen_set_pud;
2038 #if PAGETABLE_LEVELS == 4 2050 #if PAGETABLE_LEVELS == 4
2039 pv_mmu_ops.set_pgd = xen_set_pgd; 2051 pv_mmu_ops.set_pgd = xen_set_pgd;
2040 #endif 2052 #endif
2041 2053
2042 /* This will work as long as patching hasn't happened yet 2054 /* This will work as long as patching hasn't happened yet
2043 (which it hasn't) */ 2055 (which it hasn't) */
2044 pv_mmu_ops.alloc_pte = xen_alloc_pte; 2056 pv_mmu_ops.alloc_pte = xen_alloc_pte;
2045 pv_mmu_ops.alloc_pmd = xen_alloc_pmd; 2057 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2046 pv_mmu_ops.release_pte = xen_release_pte; 2058 pv_mmu_ops.release_pte = xen_release_pte;
2047 pv_mmu_ops.release_pmd = xen_release_pmd; 2059 pv_mmu_ops.release_pmd = xen_release_pmd;
2048 #if PAGETABLE_LEVELS == 4 2060 #if PAGETABLE_LEVELS == 4
2049 pv_mmu_ops.alloc_pud = xen_alloc_pud; 2061 pv_mmu_ops.alloc_pud = xen_alloc_pud;
2050 pv_mmu_ops.release_pud = xen_release_pud; 2062 pv_mmu_ops.release_pud = xen_release_pud;
2051 #endif 2063 #endif
2052 2064
2053 #ifdef CONFIG_X86_64 2065 #ifdef CONFIG_X86_64
2054 pv_mmu_ops.write_cr3 = &xen_write_cr3; 2066 pv_mmu_ops.write_cr3 = &xen_write_cr3;
2055 SetPagePinned(virt_to_page(level3_user_vsyscall)); 2067 SetPagePinned(virt_to_page(level3_user_vsyscall));
2056 #endif 2068 #endif
2057 xen_mark_init_mm_pinned(); 2069 xen_mark_init_mm_pinned();
2058 } 2070 }
2059 2071
2060 static void xen_leave_lazy_mmu(void) 2072 static void xen_leave_lazy_mmu(void)
2061 { 2073 {
2062 preempt_disable(); 2074 preempt_disable();
2063 xen_mc_flush(); 2075 xen_mc_flush();
2064 paravirt_leave_lazy_mmu(); 2076 paravirt_leave_lazy_mmu();
2065 preempt_enable(); 2077 preempt_enable();
2066 } 2078 }
2067 2079
2068 static const struct pv_mmu_ops xen_mmu_ops __initconst = { 2080 static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2069 .read_cr2 = xen_read_cr2, 2081 .read_cr2 = xen_read_cr2,
2070 .write_cr2 = xen_write_cr2, 2082 .write_cr2 = xen_write_cr2,
2071 2083
2072 .read_cr3 = xen_read_cr3, 2084 .read_cr3 = xen_read_cr3,
2073 .write_cr3 = xen_write_cr3_init, 2085 .write_cr3 = xen_write_cr3_init,
2074 2086
2075 .flush_tlb_user = xen_flush_tlb, 2087 .flush_tlb_user = xen_flush_tlb,
2076 .flush_tlb_kernel = xen_flush_tlb, 2088 .flush_tlb_kernel = xen_flush_tlb,
2077 .flush_tlb_single = xen_flush_tlb_single, 2089 .flush_tlb_single = xen_flush_tlb_single,
2078 .flush_tlb_others = xen_flush_tlb_others, 2090 .flush_tlb_others = xen_flush_tlb_others,
2079 2091
2080 .pte_update = paravirt_nop, 2092 .pte_update = paravirt_nop,
2081 .pte_update_defer = paravirt_nop, 2093 .pte_update_defer = paravirt_nop,
2082 2094
2083 .pgd_alloc = xen_pgd_alloc, 2095 .pgd_alloc = xen_pgd_alloc,
2084 .pgd_free = xen_pgd_free, 2096 .pgd_free = xen_pgd_free,
2085 2097
2086 .alloc_pte = xen_alloc_pte_init, 2098 .alloc_pte = xen_alloc_pte_init,
2087 .release_pte = xen_release_pte_init, 2099 .release_pte = xen_release_pte_init,
2088 .alloc_pmd = xen_alloc_pmd_init, 2100 .alloc_pmd = xen_alloc_pmd_init,
2089 .release_pmd = xen_release_pmd_init, 2101 .release_pmd = xen_release_pmd_init,
2090 2102
2091 .set_pte = xen_set_pte_init, 2103 .set_pte = xen_set_pte_init,
2092 .set_pte_at = xen_set_pte_at, 2104 .set_pte_at = xen_set_pte_at,
2093 .set_pmd = xen_set_pmd_hyper, 2105 .set_pmd = xen_set_pmd_hyper,
2094 2106
2095 .ptep_modify_prot_start = __ptep_modify_prot_start, 2107 .ptep_modify_prot_start = __ptep_modify_prot_start,
2096 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 2108 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
2097 2109
2098 .pte_val = PV_CALLEE_SAVE(xen_pte_val), 2110 .pte_val = PV_CALLEE_SAVE(xen_pte_val),
2099 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), 2111 .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
2100 2112
2101 .make_pte = PV_CALLEE_SAVE(xen_make_pte), 2113 .make_pte = PV_CALLEE_SAVE(xen_make_pte),
2102 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), 2114 .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
2103 2115
2104 #ifdef CONFIG_X86_PAE 2116 #ifdef CONFIG_X86_PAE
2105 .set_pte_atomic = xen_set_pte_atomic, 2117 .set_pte_atomic = xen_set_pte_atomic,
2106 .pte_clear = xen_pte_clear, 2118 .pte_clear = xen_pte_clear,
2107 .pmd_clear = xen_pmd_clear, 2119 .pmd_clear = xen_pmd_clear,
2108 #endif /* CONFIG_X86_PAE */ 2120 #endif /* CONFIG_X86_PAE */
2109 .set_pud = xen_set_pud_hyper, 2121 .set_pud = xen_set_pud_hyper,
2110 2122
2111 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), 2123 .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2112 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), 2124 .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2113 2125
2114 #if PAGETABLE_LEVELS == 4 2126 #if PAGETABLE_LEVELS == 4
2115 .pud_val = PV_CALLEE_SAVE(xen_pud_val), 2127 .pud_val = PV_CALLEE_SAVE(xen_pud_val),
2116 .make_pud = PV_CALLEE_SAVE(xen_make_pud), 2128 .make_pud = PV_CALLEE_SAVE(xen_make_pud),
2117 .set_pgd = xen_set_pgd_hyper, 2129 .set_pgd = xen_set_pgd_hyper,
2118 2130
2119 .alloc_pud = xen_alloc_pmd_init, 2131 .alloc_pud = xen_alloc_pmd_init,
2120 .release_pud = xen_release_pmd_init, 2132 .release_pud = xen_release_pmd_init,
2121 #endif /* PAGETABLE_LEVELS == 4 */ 2133 #endif /* PAGETABLE_LEVELS == 4 */
2122 2134
2123 .activate_mm = xen_activate_mm, 2135 .activate_mm = xen_activate_mm,
2124 .dup_mmap = xen_dup_mmap, 2136 .dup_mmap = xen_dup_mmap,
2125 .exit_mmap = xen_exit_mmap, 2137 .exit_mmap = xen_exit_mmap,
2126 2138
2127 .lazy_mode = { 2139 .lazy_mode = {
2128 .enter = paravirt_enter_lazy_mmu, 2140 .enter = paravirt_enter_lazy_mmu,
2129 .leave = xen_leave_lazy_mmu, 2141 .leave = xen_leave_lazy_mmu,
2130 .flush = paravirt_flush_lazy_mmu, 2142 .flush = paravirt_flush_lazy_mmu,
2131 }, 2143 },
2132 2144
2133 .set_fixmap = xen_set_fixmap, 2145 .set_fixmap = xen_set_fixmap,
2134 }; 2146 };
2135 2147
2136 void __init xen_init_mmu_ops(void) 2148 void __init xen_init_mmu_ops(void)
2137 { 2149 {
2138 x86_init.paging.pagetable_init = xen_pagetable_init; 2150 x86_init.paging.pagetable_init = xen_pagetable_init;
2139 2151
2140 /* Optimization - we can use the HVM one but it has no idea which 2152 /* Optimization - we can use the HVM one but it has no idea which
2141 * VCPUs are descheduled - which means that it will needlessly IPI 2153 * VCPUs are descheduled - which means that it will needlessly IPI
2142 * them. Xen knows so let it do the job. 2154 * them. Xen knows so let it do the job.
2143 */ 2155 */
2144 if (xen_feature(XENFEAT_auto_translated_physmap)) { 2156 if (xen_feature(XENFEAT_auto_translated_physmap)) {
2145 pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; 2157 pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others;
2146 return; 2158 return;
2147 } 2159 }
2148 pv_mmu_ops = xen_mmu_ops; 2160 pv_mmu_ops = xen_mmu_ops;
2149 2161
2150 memset(dummy_mapping, 0xff, PAGE_SIZE); 2162 memset(dummy_mapping, 0xff, PAGE_SIZE);
2151 } 2163 }
2152 2164
2153 /* Protected by xen_reservation_lock. */ 2165 /* Protected by xen_reservation_lock. */
2154 #define MAX_CONTIG_ORDER 9 /* 2MB */ 2166 #define MAX_CONTIG_ORDER 9 /* 2MB */
2155 static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; 2167 static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2156 2168
2157 #define VOID_PTE (mfn_pte(0, __pgprot(0))) 2169 #define VOID_PTE (mfn_pte(0, __pgprot(0)))
2158 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, 2170 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2159 unsigned long *in_frames, 2171 unsigned long *in_frames,
2160 unsigned long *out_frames) 2172 unsigned long *out_frames)
2161 { 2173 {
2162 int i; 2174 int i;
2163 struct multicall_space mcs; 2175 struct multicall_space mcs;
2164 2176
2165 xen_mc_batch(); 2177 xen_mc_batch();
2166 for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) { 2178 for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2167 mcs = __xen_mc_entry(0); 2179 mcs = __xen_mc_entry(0);
2168 2180
2169 if (in_frames) 2181 if (in_frames)
2170 in_frames[i] = virt_to_mfn(vaddr); 2182 in_frames[i] = virt_to_mfn(vaddr);
2171 2183
2172 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); 2184 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2173 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); 2185 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2174 2186
2175 if (out_frames) 2187 if (out_frames)
2176 out_frames[i] = virt_to_pfn(vaddr); 2188 out_frames[i] = virt_to_pfn(vaddr);
2177 } 2189 }
2178 xen_mc_issue(0); 2190 xen_mc_issue(0);
2179 } 2191 }
2180 2192
2181 /* 2193 /*
2182 * Update the pfn-to-mfn mappings for a virtual address range, either to 2194 * Update the pfn-to-mfn mappings for a virtual address range, either to
2183 * point to an array of mfns, or contiguously from a single starting 2195 * point to an array of mfns, or contiguously from a single starting
2184 * mfn. 2196 * mfn.
2185 */ 2197 */
2186 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, 2198 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2187 unsigned long *mfns, 2199 unsigned long *mfns,
2188 unsigned long first_mfn) 2200 unsigned long first_mfn)
2189 { 2201 {
2190 unsigned i, limit; 2202 unsigned i, limit;
2191 unsigned long mfn; 2203 unsigned long mfn;
2192 2204
2193 xen_mc_batch(); 2205 xen_mc_batch();
2194 2206
2195 limit = 1u << order; 2207 limit = 1u << order;
2196 for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { 2208 for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2197 struct multicall_space mcs; 2209 struct multicall_space mcs;
2198 unsigned flags; 2210 unsigned flags;
2199 2211
2200 mcs = __xen_mc_entry(0); 2212 mcs = __xen_mc_entry(0);
2201 if (mfns) 2213 if (mfns)
2202 mfn = mfns[i]; 2214 mfn = mfns[i];
2203 else 2215 else
2204 mfn = first_mfn + i; 2216 mfn = first_mfn + i;
2205 2217
2206 if (i < (limit - 1)) 2218 if (i < (limit - 1))
2207 flags = 0; 2219 flags = 0;
2208 else { 2220 else {
2209 if (order == 0) 2221 if (order == 0)
2210 flags = UVMF_INVLPG | UVMF_ALL; 2222 flags = UVMF_INVLPG | UVMF_ALL;
2211 else 2223 else
2212 flags = UVMF_TLB_FLUSH | UVMF_ALL; 2224 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2213 } 2225 }
2214 2226
2215 MULTI_update_va_mapping(mcs.mc, vaddr, 2227 MULTI_update_va_mapping(mcs.mc, vaddr,
2216 mfn_pte(mfn, PAGE_KERNEL), flags); 2228 mfn_pte(mfn, PAGE_KERNEL), flags);
2217 2229
2218 set_phys_to_machine(virt_to_pfn(vaddr), mfn); 2230 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2219 } 2231 }
2220 2232
2221 xen_mc_issue(0); 2233 xen_mc_issue(0);
2222 } 2234 }
2223 2235
2224 /* 2236 /*
2225 * Perform the hypercall to exchange a region of our pfns to point to 2237 * Perform the hypercall to exchange a region of our pfns to point to
2226 * memory with the required contiguous alignment. Takes the pfns as 2238 * memory with the required contiguous alignment. Takes the pfns as
2227 * input, and populates mfns as output. 2239 * input, and populates mfns as output.
2228 * 2240 *
2229 * Returns a success code indicating whether the hypervisor was able to 2241 * Returns a success code indicating whether the hypervisor was able to
2230 * satisfy the request or not. 2242 * satisfy the request or not.
2231 */ 2243 */
2232 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, 2244 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2233 unsigned long *pfns_in, 2245 unsigned long *pfns_in,
2234 unsigned long extents_out, 2246 unsigned long extents_out,
2235 unsigned int order_out, 2247 unsigned int order_out,
2236 unsigned long *mfns_out, 2248 unsigned long *mfns_out,
2237 unsigned int address_bits) 2249 unsigned int address_bits)
2238 { 2250 {
2239 long rc; 2251 long rc;
2240 int success; 2252 int success;
2241 2253
2242 struct xen_memory_exchange exchange = { 2254 struct xen_memory_exchange exchange = {
2243 .in = { 2255 .in = {
2244 .nr_extents = extents_in, 2256 .nr_extents = extents_in,
2245 .extent_order = order_in, 2257 .extent_order = order_in,
2246 .extent_start = pfns_in, 2258 .extent_start = pfns_in,
2247 .domid = DOMID_SELF 2259 .domid = DOMID_SELF
2248 }, 2260 },
2249 .out = { 2261 .out = {
2250 .nr_extents = extents_out, 2262 .nr_extents = extents_out,
2251 .extent_order = order_out, 2263 .extent_order = order_out,
2252 .extent_start = mfns_out, 2264 .extent_start = mfns_out,
2253 .address_bits = address_bits, 2265 .address_bits = address_bits,
2254 .domid = DOMID_SELF 2266 .domid = DOMID_SELF
2255 } 2267 }
2256 }; 2268 };
2257 2269
2258 BUG_ON(extents_in << order_in != extents_out << order_out); 2270 BUG_ON(extents_in << order_in != extents_out << order_out);
2259 2271
2260 rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); 2272 rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2261 success = (exchange.nr_exchanged == extents_in); 2273 success = (exchange.nr_exchanged == extents_in);
2262 2274
2263 BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); 2275 BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2264 BUG_ON(success && (rc != 0)); 2276 BUG_ON(success && (rc != 0));
2265 2277
2266 return success; 2278 return success;
2267 } 2279 }
2268 2280
2269 int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, 2281 int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
2270 unsigned int address_bits, 2282 unsigned int address_bits,
2271 dma_addr_t *dma_handle) 2283 dma_addr_t *dma_handle)
2272 { 2284 {
2273 unsigned long *in_frames = discontig_frames, out_frame; 2285 unsigned long *in_frames = discontig_frames, out_frame;
2274 unsigned long flags; 2286 unsigned long flags;
2275 int success; 2287 int success;
2276 unsigned long vstart = (unsigned long)phys_to_virt(pstart); 2288 unsigned long vstart = (unsigned long)phys_to_virt(pstart);
2277 2289
2278 /* 2290 /*
2279 * Currently an auto-translated guest will not perform I/O, nor will 2291 * Currently an auto-translated guest will not perform I/O, nor will
2280 * it require PAE page directories below 4GB. Therefore any calls to 2292 * it require PAE page directories below 4GB. Therefore any calls to
2281 * this function are redundant and can be ignored. 2293 * this function are redundant and can be ignored.
2282 */ 2294 */
2283 2295
2284 if (xen_feature(XENFEAT_auto_translated_physmap)) 2296 if (xen_feature(XENFEAT_auto_translated_physmap))
2285 return 0; 2297 return 0;
2286 2298
2287 if (unlikely(order > MAX_CONTIG_ORDER)) 2299 if (unlikely(order > MAX_CONTIG_ORDER))
2288 return -ENOMEM; 2300 return -ENOMEM;
2289 2301
2290 memset((void *) vstart, 0, PAGE_SIZE << order); 2302 memset((void *) vstart, 0, PAGE_SIZE << order);
2291 2303
2292 spin_lock_irqsave(&xen_reservation_lock, flags); 2304 spin_lock_irqsave(&xen_reservation_lock, flags);
2293 2305
2294 /* 1. Zap current PTEs, remembering MFNs. */ 2306 /* 1. Zap current PTEs, remembering MFNs. */
2295 xen_zap_pfn_range(vstart, order, in_frames, NULL); 2307 xen_zap_pfn_range(vstart, order, in_frames, NULL);
2296 2308
2297 /* 2. Get a new contiguous memory extent. */ 2309 /* 2. Get a new contiguous memory extent. */
2298 out_frame = virt_to_pfn(vstart); 2310 out_frame = virt_to_pfn(vstart);
2299 success = xen_exchange_memory(1UL << order, 0, in_frames, 2311 success = xen_exchange_memory(1UL << order, 0, in_frames,
2300 1, order, &out_frame, 2312 1, order, &out_frame,
2301 address_bits); 2313 address_bits);
2302 2314
2303 /* 3. Map the new extent in place of old pages. */ 2315 /* 3. Map the new extent in place of old pages. */
2304 if (success) 2316 if (success)
2305 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); 2317 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2306 else 2318 else
2307 xen_remap_exchanged_ptes(vstart, order, in_frames, 0); 2319 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2308 2320
2309 spin_unlock_irqrestore(&xen_reservation_lock, flags); 2321 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2310 2322
2311 *dma_handle = virt_to_machine(vstart).maddr; 2323 *dma_handle = virt_to_machine(vstart).maddr;
2312 return success ? 0 : -ENOMEM; 2324 return success ? 0 : -ENOMEM;
2313 } 2325 }
2314 EXPORT_SYMBOL_GPL(xen_create_contiguous_region); 2326 EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2315 2327
2316 void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) 2328 void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
2317 { 2329 {
2318 unsigned long *out_frames = discontig_frames, in_frame; 2330 unsigned long *out_frames = discontig_frames, in_frame;
2319 unsigned long flags; 2331 unsigned long flags;
2320 int success; 2332 int success;
2321 unsigned long vstart; 2333 unsigned long vstart;
2322 2334
2323 if (xen_feature(XENFEAT_auto_translated_physmap)) 2335 if (xen_feature(XENFEAT_auto_translated_physmap))
2324 return; 2336 return;
2325 2337
2326 if (unlikely(order > MAX_CONTIG_ORDER)) 2338 if (unlikely(order > MAX_CONTIG_ORDER))
2327 return; 2339 return;
2328 2340
2329 vstart = (unsigned long)phys_to_virt(pstart); 2341 vstart = (unsigned long)phys_to_virt(pstart);
2330 memset((void *) vstart, 0, PAGE_SIZE << order); 2342 memset((void *) vstart, 0, PAGE_SIZE << order);
2331 2343
2332 spin_lock_irqsave(&xen_reservation_lock, flags); 2344 spin_lock_irqsave(&xen_reservation_lock, flags);
2333 2345
2334 /* 1. Find start MFN of contiguous extent. */ 2346 /* 1. Find start MFN of contiguous extent. */
2335 in_frame = virt_to_mfn(vstart); 2347 in_frame = virt_to_mfn(vstart);
2336 2348
2337 /* 2. Zap current PTEs. */ 2349 /* 2. Zap current PTEs. */
2338 xen_zap_pfn_range(vstart, order, NULL, out_frames); 2350 xen_zap_pfn_range(vstart, order, NULL, out_frames);
2339 2351
2340 /* 3. Do the exchange for non-contiguous MFNs. */ 2352 /* 3. Do the exchange for non-contiguous MFNs. */
2341 success = xen_exchange_memory(1, order, &in_frame, 1UL << order, 2353 success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2342 0, out_frames, 0); 2354 0, out_frames, 0);
2343 2355
2344 /* 4. Map new pages in place of old pages. */ 2356 /* 4. Map new pages in place of old pages. */
2345 if (success) 2357 if (success)
2346 xen_remap_exchanged_ptes(vstart, order, out_frames, 0); 2358 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2347 else 2359 else
2348 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); 2360 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2349 2361
2350 spin_unlock_irqrestore(&xen_reservation_lock, flags); 2362 spin_unlock_irqrestore(&xen_reservation_lock, flags);
2351 } 2363 }
2352 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); 2364 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
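These two exports are what give a PV guest machine-contiguous, address-limited buffers for DMA (swiotlb-xen is the main user). A minimal calling sketch, assuming a PV guest context; the wrapper names and the order/address_bits values are illustrative and not part of this file:

/* Sketch: get a machine-contiguous buffer usable by a 32-bit DMA device. */
static void *sketch_alloc_dma32(unsigned int order, dma_addr_t *dma_handle)
{
	void *vaddr = (void *)__get_free_pages(GFP_KERNEL, order);

	if (!vaddr)
		return NULL;

	/* Exchange the backing frames for one contiguous extent below 4 GB. */
	if (xen_create_contiguous_region(virt_to_phys(vaddr), order,
					 32, dma_handle)) {
		free_pages((unsigned long)vaddr, order);
		return NULL;
	}
	return vaddr;
}

static void sketch_free_dma32(void *vaddr, unsigned int order)
{
	/* Hand the contiguous extent back before freeing the guest pages. */
	xen_destroy_contiguous_region(virt_to_phys(vaddr), order);
	free_pages((unsigned long)vaddr, order);
}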
2353 2365
2354 #ifdef CONFIG_XEN_PVHVM 2366 #ifdef CONFIG_XEN_PVHVM
2355 #ifdef CONFIG_PROC_VMCORE 2367 #ifdef CONFIG_PROC_VMCORE
2356 /* 2368 /*
2357 * This function is used in two contexts: 2369 * This function is used in two contexts:
2358 * - the kdump kernel has to check whether a pfn of the crashed kernel 2370 * - the kdump kernel has to check whether a pfn of the crashed kernel
2359 * was a ballooned page. vmcore is using this function to decide 2371 * was a ballooned page. vmcore is using this function to decide
2360 * whether to access a pfn of the crashed kernel. 2372 * whether to access a pfn of the crashed kernel.
2361 * - the kexec kernel has to check whether a pfn was ballooned by the 2373 * - the kexec kernel has to check whether a pfn was ballooned by the
2362 * previous kernel. If the pfn is ballooned, handle it properly. 2374 * previous kernel. If the pfn is ballooned, handle it properly.
2363 * Returns 0 if the pfn is not backed by a RAM page, the caller may 2375 * Returns 0 if the pfn is not backed by a RAM page, the caller may
2364 * handle the pfn specially in this case. 2376 * handle the pfn specially in this case.
2365 */ 2377 */
2366 static int xen_oldmem_pfn_is_ram(unsigned long pfn) 2378 static int xen_oldmem_pfn_is_ram(unsigned long pfn)
2367 { 2379 {
2368 struct xen_hvm_get_mem_type a = { 2380 struct xen_hvm_get_mem_type a = {
2369 .domid = DOMID_SELF, 2381 .domid = DOMID_SELF,
2370 .pfn = pfn, 2382 .pfn = pfn,
2371 }; 2383 };
2372 int ram; 2384 int ram;
2373 2385
2374 if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) 2386 if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
2375 return -ENXIO; 2387 return -ENXIO;
2376 2388
2377 switch (a.mem_type) { 2389 switch (a.mem_type) {
2378 case HVMMEM_mmio_dm: 2390 case HVMMEM_mmio_dm:
2379 ram = 0; 2391 ram = 0;
2380 break; 2392 break;
2381 case HVMMEM_ram_rw: 2393 case HVMMEM_ram_rw:
2382 case HVMMEM_ram_ro: 2394 case HVMMEM_ram_ro:
2383 default: 2395 default:
2384 ram = 1; 2396 ram = 1;
2385 break; 2397 break;
2386 } 2398 }
2387 2399
2388 return ram; 2400 return ram;
2389 } 2401 }
2390 #endif 2402 #endif
2391 2403
2392 static void xen_hvm_exit_mmap(struct mm_struct *mm) 2404 static void xen_hvm_exit_mmap(struct mm_struct *mm)
2393 { 2405 {
2394 struct xen_hvm_pagetable_dying a; 2406 struct xen_hvm_pagetable_dying a;
2395 int rc; 2407 int rc;
2396 2408
2397 a.domid = DOMID_SELF; 2409 a.domid = DOMID_SELF;
2398 a.gpa = __pa(mm->pgd); 2410 a.gpa = __pa(mm->pgd);
2399 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); 2411 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2400 WARN_ON_ONCE(rc < 0); 2412 WARN_ON_ONCE(rc < 0);
2401 } 2413 }
2402 2414
2403 static int is_pagetable_dying_supported(void) 2415 static int is_pagetable_dying_supported(void)
2404 { 2416 {
2405 struct xen_hvm_pagetable_dying a; 2417 struct xen_hvm_pagetable_dying a;
2406 int rc = 0; 2418 int rc = 0;
2407 2419
2408 a.domid = DOMID_SELF; 2420 a.domid = DOMID_SELF;
2409 a.gpa = 0x00; 2421 a.gpa = 0x00;
2410 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); 2422 rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2411 if (rc < 0) { 2423 if (rc < 0) {
2412 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); 2424 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2413 return 0; 2425 return 0;
2414 } 2426 }
2415 return 1; 2427 return 1;
2416 } 2428 }
2417 2429
2418 void __init xen_hvm_init_mmu_ops(void) 2430 void __init xen_hvm_init_mmu_ops(void)
2419 { 2431 {
2420 if (is_pagetable_dying_supported()) 2432 if (is_pagetable_dying_supported())
2421 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; 2433 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2422 #ifdef CONFIG_PROC_VMCORE 2434 #ifdef CONFIG_PROC_VMCORE
2423 register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram); 2435 register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram);
2424 #endif 2436 #endif
2425 } 2437 }
2426 #endif 2438 #endif
2427 2439
2428 #ifdef CONFIG_XEN_PVH 2440 #ifdef CONFIG_XEN_PVH
2429 /* 2441 /*
2430 * Map a foreign gfn (fgfn) to a local pfn (lpfn). This is for user 2442 * Map a foreign gfn (fgfn) to a local pfn (lpfn). This is for user
2431 * space creating a new guest on pvh dom0 and needing to map domU pages. 2443 * space creating a new guest on pvh dom0 and needing to map domU pages.
2432 */ 2444 */
2433 static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn, 2445 static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn,
2434 unsigned int domid) 2446 unsigned int domid)
2435 { 2447 {
2436 int rc, err = 0; 2448 int rc, err = 0;
2437 xen_pfn_t gpfn = lpfn; 2449 xen_pfn_t gpfn = lpfn;
2438 xen_ulong_t idx = fgfn; 2450 xen_ulong_t idx = fgfn;
2439 2451
2440 struct xen_add_to_physmap_range xatp = { 2452 struct xen_add_to_physmap_range xatp = {
2441 .domid = DOMID_SELF, 2453 .domid = DOMID_SELF,
2442 .foreign_domid = domid, 2454 .foreign_domid = domid,
2443 .size = 1, 2455 .size = 1,
2444 .space = XENMAPSPACE_gmfn_foreign, 2456 .space = XENMAPSPACE_gmfn_foreign,
2445 }; 2457 };
2446 set_xen_guest_handle(xatp.idxs, &idx); 2458 set_xen_guest_handle(xatp.idxs, &idx);
2447 set_xen_guest_handle(xatp.gpfns, &gpfn); 2459 set_xen_guest_handle(xatp.gpfns, &gpfn);
2448 set_xen_guest_handle(xatp.errs, &err); 2460 set_xen_guest_handle(xatp.errs, &err);
2449 2461
2450 rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); 2462 rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
2451 if (rc < 0) 2463 if (rc < 0)
2452 return rc; 2464 return rc;
2453 return err; 2465 return err;
2454 } 2466 }
2455 2467
2456 static int xlate_remove_from_p2m(unsigned long spfn, int count) 2468 static int xlate_remove_from_p2m(unsigned long spfn, int count)
2457 { 2469 {
2458 struct xen_remove_from_physmap xrp; 2470 struct xen_remove_from_physmap xrp;
2459 int i, rc; 2471 int i, rc;
2460 2472
2461 for (i = 0; i < count; i++) { 2473 for (i = 0; i < count; i++) {
2462 xrp.domid = DOMID_SELF; 2474 xrp.domid = DOMID_SELF;
2463 xrp.gpfn = spfn+i; 2475 xrp.gpfn = spfn+i;
2464 rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); 2476 rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
2465 if (rc) 2477 if (rc)
2466 break; 2478 break;
2467 } 2479 }
2468 return rc; 2480 return rc;
2469 } 2481 }
2470 2482
2471 struct xlate_remap_data { 2483 struct xlate_remap_data {
2472 unsigned long fgfn; /* foreign domain's gfn */ 2484 unsigned long fgfn; /* foreign domain's gfn */
2473 pgprot_t prot; 2485 pgprot_t prot;
2474 domid_t domid; 2486 domid_t domid;
2475 int index; 2487 int index;
2476 struct page **pages; 2488 struct page **pages;
2477 }; 2489 };
2478 2490
2479 static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, 2491 static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
2480 void *data) 2492 void *data)
2481 { 2493 {
2482 int rc; 2494 int rc;
2483 struct xlate_remap_data *remap = data; 2495 struct xlate_remap_data *remap = data;
2484 unsigned long pfn = page_to_pfn(remap->pages[remap->index++]); 2496 unsigned long pfn = page_to_pfn(remap->pages[remap->index++]);
2485 pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot)); 2497 pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot));
2486 2498
2487 rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid); 2499 rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid);
2488 if (rc) 2500 if (rc)
2489 return rc; 2501 return rc;
2490 native_set_pte(ptep, pteval); 2502 native_set_pte(ptep, pteval);
2491 2503
2492 return 0; 2504 return 0;
2493 } 2505 }
2494 2506
2495 static int xlate_remap_gfn_range(struct vm_area_struct *vma, 2507 static int xlate_remap_gfn_range(struct vm_area_struct *vma,
2496 unsigned long addr, unsigned long mfn, 2508 unsigned long addr, unsigned long mfn,
2497 int nr, pgprot_t prot, unsigned domid, 2509 int nr, pgprot_t prot, unsigned domid,
2498 struct page **pages) 2510 struct page **pages)
2499 { 2511 {
2500 int err; 2512 int err;
2501 struct xlate_remap_data pvhdata; 2513 struct xlate_remap_data pvhdata;
2502 2514
2503 BUG_ON(!pages); 2515 BUG_ON(!pages);
2504 2516
2505 pvhdata.fgfn = mfn; 2517 pvhdata.fgfn = mfn;
2506 pvhdata.prot = prot; 2518 pvhdata.prot = prot;
2507 pvhdata.domid = domid; 2519 pvhdata.domid = domid;
2508 pvhdata.index = 0; 2520 pvhdata.index = 0;
2509 pvhdata.pages = pages; 2521 pvhdata.pages = pages;
2510 err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, 2522 err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
2511 xlate_map_pte_fn, &pvhdata); 2523 xlate_map_pte_fn, &pvhdata);
2512 flush_tlb_all(); 2524 flush_tlb_all();
2513 return err; 2525 return err;
2514 } 2526 }
2515 #endif 2527 #endif
2516 2528
2517 #define REMAP_BATCH_SIZE 16 2529 #define REMAP_BATCH_SIZE 16
2518 2530
2519 struct remap_data { 2531 struct remap_data {
2520 unsigned long mfn; 2532 unsigned long mfn;
2521 pgprot_t prot; 2533 pgprot_t prot;
2522 struct mmu_update *mmu_update; 2534 struct mmu_update *mmu_update;
2523 }; 2535 };
2524 2536
2525 static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, 2537 static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2526 unsigned long addr, void *data) 2538 unsigned long addr, void *data)
2527 { 2539 {
2528 struct remap_data *rmd = data; 2540 struct remap_data *rmd = data;
2529 pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot)); 2541 pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot));
2530 2542
2531 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; 2543 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2532 rmd->mmu_update->val = pte_val_ma(pte); 2544 rmd->mmu_update->val = pte_val_ma(pte);
2533 rmd->mmu_update++; 2545 rmd->mmu_update++;
2534 2546
2535 return 0; 2547 return 0;
2536 } 2548 }
2537 2549
2538 int xen_remap_domain_mfn_range(struct vm_area_struct *vma, 2550 int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2539 unsigned long addr, 2551 unsigned long addr,
2540 xen_pfn_t mfn, int nr, 2552 xen_pfn_t mfn, int nr,
2541 pgprot_t prot, unsigned domid, 2553 pgprot_t prot, unsigned domid,
2542 struct page **pages) 2554 struct page **pages)
2543 2555
2544 { 2556 {
2545 struct remap_data rmd; 2557 struct remap_data rmd;
2546 struct mmu_update mmu_update[REMAP_BATCH_SIZE]; 2558 struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2547 int batch; 2559 int batch;
2548 unsigned long range; 2560 unsigned long range;
2549 int err = 0; 2561 int err = 0;
2550 2562
2551 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); 2563 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
2552 2564
2553 if (xen_feature(XENFEAT_auto_translated_physmap)) { 2565 if (xen_feature(XENFEAT_auto_translated_physmap)) {
2554 #ifdef CONFIG_XEN_PVH 2566 #ifdef CONFIG_XEN_PVH
2555 /* We need to update the local page tables and the xen HAP */ 2567 /* We need to update the local page tables and the xen HAP */
2556 return xlate_remap_gfn_range(vma, addr, mfn, nr, prot, 2568 return xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
2557 domid, pages); 2569 domid, pages);
2558 #else 2570 #else
2559 return -EINVAL; 2571 return -EINVAL;
2560 #endif 2572 #endif
2561 } 2573 }
2562 2574
2563 rmd.mfn = mfn; 2575 rmd.mfn = mfn;
2564 rmd.prot = prot; 2576 rmd.prot = prot;
2565 2577
2566 while (nr) { 2578 while (nr) {
2567 batch = min(REMAP_BATCH_SIZE, nr); 2579 batch = min(REMAP_BATCH_SIZE, nr);
2568 range = (unsigned long)batch << PAGE_SHIFT; 2580 range = (unsigned long)batch << PAGE_SHIFT;
2569 2581
2570 rmd.mmu_update = mmu_update; 2582 rmd.mmu_update = mmu_update;
2571 err = apply_to_page_range(vma->vm_mm, addr, range, 2583 err = apply_to_page_range(vma->vm_mm, addr, range,
2572 remap_area_mfn_pte_fn, &rmd); 2584 remap_area_mfn_pte_fn, &rmd);
2573 if (err) 2585 if (err)
2574 goto out; 2586 goto out;
2575 2587
2576 err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); 2588 err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid);
2577 if (err < 0) 2589 if (err < 0)
2578 goto out; 2590 goto out;
2579 2591
2580 nr -= batch; 2592 nr -= batch;
2581 addr += range; 2593 addr += range;
2582 } 2594 }
2583 2595
2584 err = 0; 2596 err = 0;
2585 out: 2597 out:
2586 2598
2587 xen_flush_tlb_all(); 2599 xen_flush_tlb_all();
2588 2600
2589 return err; 2601 return err;
2590 } 2602 }
2591 EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); 2603 EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
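The batched path above is what the privcmd driver relies on when the toolstack mmap()s foreign domain memory: each chunk of up to REMAP_BATCH_SIZE PTE updates becomes one mmu_update hypercall. A hedged sketch of a caller; it assumes the VMA was already set up with VM_IO | VM_PFNMAP by the driver's mmap handler, and the helper name is illustrative:

/* Sketch: map 'nr' foreign frames starting at 'first_mfn' from domain
 * 'domid' into an existing VM_IO | VM_PFNMAP vma (PV case, pages unused). */
static int sketch_map_foreign_range(struct vm_area_struct *vma,
				    unsigned long first_mfn, int nr,
				    unsigned int domid)
{
	return xen_remap_domain_mfn_range(vma, vma->vm_start, first_mfn, nr,
					  vma->vm_page_prot, domid, NULL);
}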
2592 2604
2593 /* Returns: 0 success */ 2605 /* Returns: 0 success */
2594 int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, 2606 int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
2595 int numpgs, struct page **pages) 2607 int numpgs, struct page **pages)
2596 { 2608 {
2597 if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) 2609 if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
2598 return 0; 2610 return 0;
2599 2611
2600 #ifdef CONFIG_XEN_PVH 2612 #ifdef CONFIG_XEN_PVH
2601 while (numpgs--) { 2613 while (numpgs--) {
2602 /* 2614 /*
2603 * The mmu has already cleaned up the process mmu 2615 * The mmu has already cleaned up the process mmu
2604 * resources at this point (lookup_address will return 2616 * resources at this point (lookup_address will return
2605 * NULL). 2617 * NULL).
2606 */ 2618 */
2607 unsigned long pfn = page_to_pfn(pages[numpgs]); 2619 unsigned long pfn = page_to_pfn(pages[numpgs]);
2608 2620
2609 xlate_remove_from_p2m(pfn, 1); 2621 xlate_remove_from_p2m(pfn, 1);
2610 } 2622 }
2611 /* 2623 /*
2612 * We don't need to flush tlbs because as part of 2624 * We don't need to flush tlbs because as part of
2613 * xlate_remove_from_p2m, the hypervisor will do tlb flushes 2625 * xlate_remove_from_p2m, the hypervisor will do tlb flushes
2614 * after removing the p2m entries from the EPT/NPT 2626 * after removing the p2m entries from the EPT/NPT
arch/x86/xen/p2m.c
1 /* 1 /*
2 * Xen leaves the responsibility for maintaining p2m mappings to the 2 * Xen leaves the responsibility for maintaining p2m mappings to the
3 * guests themselves, but it must also access and update the p2m array 3 * guests themselves, but it must also access and update the p2m array
4 * during suspend/resume when all the pages are reallocated. 4 * during suspend/resume when all the pages are reallocated.
5 * 5 *
6 * The p2m table is logically a flat array, but we implement it as a 6 * The logical flat p2m table is mapped to a linear kernel memory area.
7 * three-level tree to allow the address space to be sparse. 7 * For accesses by Xen a three-level tree linked via mfns only is set up to
8 * allow the address space to be sparse.
8 * 9 *
9 * Xen 10 * Xen
10 * | 11 * |
11 * p2m_top p2m_top_mfn 12 * p2m_top_mfn
12 * / \ / \ 13 * / \
13 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn 14 * p2m_mid_mfn p2m_mid_mfn
14 * / \ / \ / / 15 * / /
15 * p2m p2m p2m p2m p2m p2m p2m ... 16 * p2m p2m p2m ...
16 * 17 *
17 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. 18 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
18 * 19 *
19 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the 20 * The p2m_top_mfn level is limited to 1 page, so the maximum representable
20 * maximum representable pseudo-physical address space is: 21 * pseudo-physical address space is:
21 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages 22 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
22 * 23 *
23 * P2M_PER_PAGE depends on the architecture, as a mfn is always 24 * P2M_PER_PAGE depends on the architecture, as a mfn is always
24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to 25 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
25 * 512 and 1024 entries respectively. 26 * 512 and 1024 entries respectively.
26 * 27 *
27 * In short, these structures contain the Machine Frame Number (MFN) of the PFN. 28 * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
28 * 29 *
29 * However not all entries are filled with MFNs. Specifically for all other 30 * However not all entries are filled with MFNs. Specifically for all other
30 * leaf entries, or for the top root, or middle one, for which there is a void 31 * leaf entries, or for the top root, or middle one, for which there is a void
31 * entry, we assume it is "missing". So (for example) 32 * entry, we assume it is "missing". So (for example)
32 * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY. 33 * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
34 * We have a dedicated page p2m_missing with all entries being
35 * INVALID_P2M_ENTRY. This page may be referenced multiple times in the p2m
36 * list/tree in case there are multiple areas with P2M_PER_PAGE invalid pfns.
33 * 37 *
34 * We also have the possibility of setting 1-1 mappings on certain regions, so 38 * We also have the possibility of setting 1-1 mappings on certain regions, so
35 * that: 39 * that:
36 * pfn_to_mfn(0xc0000)=0xc0000 40 * pfn_to_mfn(0xc0000)=0xc0000
37 * 41 *
38 * The benefit of this is, that we can assume for non-RAM regions (think 42 * The benefit of this is, that we can assume for non-RAM regions (think
39 * PCI BARs, or ACPI spaces), we can create mappings easily because we 43 * PCI BARs, or ACPI spaces), we can create mappings easily because we
40 * get the PFN value to match the MFN. 44 * get the PFN value to match the MFN.
41 * 45 *
42 * For this to work efficiently we have one new page p2m_identity and 46 * For this to work efficiently we have one new page p2m_identity. All entries
43 * allocate (via reserved_brk) any other pages we need to cover the sides 47 * in p2m_identity are set to INVALID_P2M_ENTRY type (Xen toolstack only
44 * (1GB or 4MB boundary violations). All entries in p2m_identity are set to 48 * recognizes that and MFNs, no other fancy value).
45 * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
46 * no other fancy value).
47 * 49 *
48 * On lookup we spot that the entry points to p2m_identity and return the 50 * On lookup we spot that the entry points to p2m_identity and return the
49 * identity value instead of dereferencing and returning INVALID_P2M_ENTRY. 51 * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
50 * If the entry points to an allocated page, we just proceed as before and 52 * If the entry points to an allocated page, we just proceed as before and
51 * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in 53 * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
52 * appropriate functions (pfn_to_mfn). 54 * appropriate functions (pfn_to_mfn).
53 * 55 *
54 * The reason for having the IDENTITY_FRAME_BIT instead of just returning the 56 * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
55 * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a 57 * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
56 * non-identity pfn. To protect ourselves against we elect to set (and get) the 58 * non-identity pfn. To protect ourselves against we elect to set (and get) the
57 * IDENTITY_FRAME_BIT on all identity mapped PFNs. 59 * IDENTITY_FRAME_BIT on all identity mapped PFNs.
58 *
59 * This simplistic diagram is used to explain the more subtle piece of code.
60 * There is also a diagram of the P2M at the end that can help.
61 * Imagine your E820 looking as so:
62 *
63 * 1GB 2GB 4GB
64 * /-------------------+---------\/----\ /----------\ /---+-----\
65 * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
66 * \-------------------+---------/\----/ \----------/ \---+-----/
67 * ^- 1029MB ^- 2001MB
68 *
69 * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
70 * 2048MB = 524288 (0x80000)]
71 *
72 * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
73 * is actually not present (would have to kick the balloon driver to put it in).
74 *
75 * When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
76 * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
77 * of the PFN and the end PFN (263424 and 512256 respectively). The first step
78 * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
79 * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
80 * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as
81 * required to split any existing p2m_mid_missing middle pages.
82 *
83 * With the E820 example above, 263424 is not 1GB aligned so we allocate a
84 * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
85 * Each entry in the allocate page is "missing" (points to p2m_missing).
86 *
87 * Next stage is to determine if we need to do a more granular boundary check
88 * on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
89 * We check if the start pfn and end pfn violate that boundary check, and if
90 * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer
91 * granularity of setting which PFNs are missing and which ones are identity.
92 * In our example 263424 and 512256 both fail the check so we reserve_brk two
93 * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
94 * values) and assign them to p2m[1][2] and p2m[1][488] respectively.
95 *
96 * At this point we would at minimum reserve_brk one page, but could be up to
97 * three. Each call to set_phys_range_identity has at maximum a three page
98 * cost. If we were to query the P2M at this stage, all those entries from
99 * start PFN through end PFN (so 1029MB -> 2001MB) would return
100 * INVALID_P2M_ENTRY ("missing").
101 *
102 * The next step is to walk from the start pfn to the end pfn setting
103 * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
104 * If we find that the middle entry is pointing to p2m_missing we can swap it
105 * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and
106 * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions).
107 * At this point we do not need to worry about boundary alignment (so no need to
108 * reserve_brk a middle page, figure out which PFNs are "missing" and which
109 * ones are identity), as that has been done earlier. If we find that the
110 * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
111 * that page (which covers 512 PFNs) and set the appropriate PFN with
112 * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
113 * set from p2m[1][2][256->511] and p2m[1][488][0->256] with
114 * IDENTITY_FRAME_BIT set.
115 *
116 * All other regions that are void (or not filled) either point to p2m_missing
117 * (considered missing) or have the default value of INVALID_P2M_ENTRY (also
118 * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
119 * contain the INVALID_P2M_ENTRY value and are considered "missing."
120 *
121 * Finally, the region beyond the end of the E820 (4 GB in this example)
122 * is set to be identity (in case there are MMIO regions placed here).
123 *
124 * This is what the p2m ends up looking (for the E820 above) with this
125 * fabulous drawing:
126 *
127 * p2m /--------------\
128 * /-----\ | &mfn_list[0],| /-----------------\
129 * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. |
130 * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] |
131 * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] |
132 * |-----| \ | [p2m_identity]+\\ | .... |
133 * | 2 |--\ \-------------------->| ... | \\ \----------------/
134 * |-----| \ \---------------/ \\
135 * | 3 |-\ \ \\ p2m_identity [1]
136 * |-----| \ \-------------------->/---------------\ /-----------------\
137 * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... |
138 * \-----/ | | | [p2m_identity]+-->| ..., ~0 |
139 * | | | .... | \-----------------/
140 * | | +-[x], ~0, ~0.. +\
141 * | | \---------------/ \
142 * | | \-> /---------------\
143 * | V p2m_mid_missing p2m_missing | IDENTITY[@0] |
144 * | /-----------------\ /------------\ | IDENTITY[@256]|
145 * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... |
146 * | | [p2m_missing] +---->| ..., ~0 | \---------------/
147 * | | ... | \------------/
148 * | \-----------------/
149 * |
150 * | p2m_mid_identity
151 * | /-----------------\
152 * \-->| [p2m_identity] +---->[1]
153 * | [p2m_identity] +---->[1]
154 * | ... |
155 * \-----------------/
156 *
157 * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
158 */ 60 */
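With the linear list in place, the common lookup boils down to a single array access; identity entries carry IDENTITY_FRAME_BIT (which callers such as pfn_to_mfn() strip), and missing frames read back as INVALID_P2M_ENTRY. A condensed sketch of that fast path, not the real implementation; the macros are assumed to come from asm/xen/page.h as used elsewhere in this diff:

/* Sketch: fast-path semantics of a lookup in the virtually mapped p2m list. */
static inline unsigned long sketch_lookup_p2m(unsigned long pfn)
{
	if (pfn >= xen_p2m_size)	/* beyond the list: identity by definition */
		return IDENTITY_FRAME(pfn);

	return xen_p2m_addr[pfn];	/* one read, no tree walk */
}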
159 61
160 #include <linux/init.h> 62 #include <linux/init.h>
161 #include <linux/module.h> 63 #include <linux/module.h>
162 #include <linux/list.h> 64 #include <linux/list.h>
163 #include <linux/hash.h> 65 #include <linux/hash.h>
164 #include <linux/sched.h> 66 #include <linux/sched.h>
165 #include <linux/seq_file.h> 67 #include <linux/seq_file.h>
166 #include <linux/bootmem.h> 68 #include <linux/bootmem.h>
69 #include <linux/slab.h>
167 70
168 #include <asm/cache.h> 71 #include <asm/cache.h>
169 #include <asm/setup.h> 72 #include <asm/setup.h>
73 #include <asm/uaccess.h>
170 74
171 #include <asm/xen/page.h> 75 #include <asm/xen/page.h>
172 #include <asm/xen/hypercall.h> 76 #include <asm/xen/hypercall.h>
173 #include <asm/xen/hypervisor.h> 77 #include <asm/xen/hypervisor.h>
174 #include <xen/balloon.h> 78 #include <xen/balloon.h>
175 #include <xen/grant_table.h> 79 #include <xen/grant_table.h>
176 80
177 #include "p2m.h" 81 #include "p2m.h"
178 #include "multicalls.h" 82 #include "multicalls.h"
179 #include "xen-ops.h" 83 #include "xen-ops.h"
180 84
85 #define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE)
86
181 static void __init m2p_override_init(void); 87 static void __init m2p_override_init(void);
182 88
89 unsigned long *xen_p2m_addr __read_mostly;
90 EXPORT_SYMBOL_GPL(xen_p2m_addr);
91 unsigned long xen_p2m_size __read_mostly;
92 EXPORT_SYMBOL_GPL(xen_p2m_size);
183 unsigned long xen_max_p2m_pfn __read_mostly; 93 unsigned long xen_max_p2m_pfn __read_mostly;
94 EXPORT_SYMBOL_GPL(xen_max_p2m_pfn);
184 95
96 static DEFINE_SPINLOCK(p2m_update_lock);
97
185 static unsigned long *p2m_mid_missing_mfn; 98 static unsigned long *p2m_mid_missing_mfn;
186 static unsigned long *p2m_top_mfn; 99 static unsigned long *p2m_top_mfn;
187 static unsigned long **p2m_top_mfn_p; 100 static unsigned long **p2m_top_mfn_p;
101 static unsigned long *p2m_missing;
102 static unsigned long *p2m_identity;
103 static pte_t *p2m_missing_pte;
104 static pte_t *p2m_identity_pte;
188 105
189 /* Placeholders for holes in the address space */
190 static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
191 static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
192
193 static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
194
195 static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
196 static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE);
197
198 RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
199
200 /* For each I/O range remapped we may lose up to two leaf pages for the boundary
201 * violations and three mid pages to cover up to 3GB. With
202 * early_can_reuse_p2m_middle() most of the leaf pages will be reused by the
203 * remapped region.
204 */
205 RESERVE_BRK(p2m_identity_remap, PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES);
206
207 static inline unsigned p2m_top_index(unsigned long pfn) 106 static inline unsigned p2m_top_index(unsigned long pfn)
208 { 107 {
209 BUG_ON(pfn >= MAX_P2M_PFN); 108 BUG_ON(pfn >= MAX_P2M_PFN);
210 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); 109 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
211 } 110 }
212 111
213 static inline unsigned p2m_mid_index(unsigned long pfn) 112 static inline unsigned p2m_mid_index(unsigned long pfn)
214 { 113 {
215 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; 114 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
216 } 115 }
217 116
218 static inline unsigned p2m_index(unsigned long pfn) 117 static inline unsigned p2m_index(unsigned long pfn)
219 { 118 {
220 return pfn % P2M_PER_PAGE; 119 return pfn % P2M_PER_PAGE;
221 } 120 }
222 121
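On x86-64 a page holds 512 eight-byte entries, so P2M_PER_PAGE and P2M_MID_PER_PAGE are both 512 and the three helpers above split a pfn as in this worked example (pfn 0x40500 is the 1029 MB boundary reused from the old comment; the checks are only illustrative):

/* Worked example of the index split used by the mfn tree on x86-64. */
static void __maybe_unused sketch_index_example(void)
{
	unsigned long pfn = 0x40500;	/* 1029 MB */

	/* p2m_top_index: 0x40500 / (512 * 512) == 1   */
	/* p2m_mid_index: (0x40500 / 512) % 512  == 2   */
	/* p2m_index:     0x40500 % 512          == 256 */
	WARN_ON(p2m_top_index(pfn) != 1);
	WARN_ON(p2m_mid_index(pfn) != 2);
	WARN_ON(p2m_index(pfn) != 256);
}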
223 static void p2m_top_init(unsigned long ***top)
224 {
225 unsigned i;
226
227 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
228 top[i] = p2m_mid_missing;
229 }
230
231 static void p2m_top_mfn_init(unsigned long *top) 122 static void p2m_top_mfn_init(unsigned long *top)
232 { 123 {
233 unsigned i; 124 unsigned i;
234 125
235 for (i = 0; i < P2M_TOP_PER_PAGE; i++) 126 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
236 top[i] = virt_to_mfn(p2m_mid_missing_mfn); 127 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
237 } 128 }
238 129
239 static void p2m_top_mfn_p_init(unsigned long **top) 130 static void p2m_top_mfn_p_init(unsigned long **top)
240 { 131 {
241 unsigned i; 132 unsigned i;
242 133
243 for (i = 0; i < P2M_TOP_PER_PAGE; i++) 134 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
244 top[i] = p2m_mid_missing_mfn; 135 top[i] = p2m_mid_missing_mfn;
245 } 136 }
246 137
247 static void p2m_mid_init(unsigned long **mid, unsigned long *leaf) 138 static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf)
248 { 139 {
249 unsigned i; 140 unsigned i;
250 141
251 for (i = 0; i < P2M_MID_PER_PAGE; i++) 142 for (i = 0; i < P2M_MID_PER_PAGE; i++)
252 mid[i] = leaf; 143 mid[i] = virt_to_mfn(leaf);
253 } 144 }
254 145
255 static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf) 146 static void p2m_init(unsigned long *p2m)
256 { 147 {
257 unsigned i; 148 unsigned i;
258 149
259 for (i = 0; i < P2M_MID_PER_PAGE; i++) 150 for (i = 0; i < P2M_PER_PAGE; i++)
260 mid[i] = virt_to_mfn(leaf); 151 p2m[i] = INVALID_P2M_ENTRY;
261 } 152 }
262 153
263 static void p2m_init(unsigned long *p2m) 154 static void p2m_init_identity(unsigned long *p2m, unsigned long pfn)
264 { 155 {
265 unsigned i; 156 unsigned i;
266 157
267 for (i = 0; i < P2M_MID_PER_PAGE; i++) 158 for (i = 0; i < P2M_PER_PAGE; i++)
268 p2m[i] = INVALID_P2M_ENTRY; 159 p2m[i] = IDENTITY_FRAME(pfn + i);
269 } 160 }
270 161
162 static void * __ref alloc_p2m_page(void)
163 {
164 if (unlikely(!slab_is_available()))
165 return alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
166
167 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
168 }
169
170 /* Only to be called in case of a race for a page just allocated! */
171 static void free_p2m_page(void *p)
172 {
173 BUG_ON(!slab_is_available());
174 free_page((unsigned long)p);
175 }
176
271 /* 177 /*
272 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures 178 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
273 * 179 *
274 * This is called both at boot time, and after resuming from suspend: 180 * This is called both at boot time, and after resuming from suspend:
275 * - At boot time we're called rather early, and must use alloc_bootmem*() 181 * - At boot time we're called rather early, and must use alloc_bootmem*()
276 * to allocate memory. 182 * to allocate memory.
277 * 183 *
278 * - After resume we're called from within stop_machine, but the mfn 184 * - After resume we're called from within stop_machine, but the mfn
279 * tree should already be completely allocated. 185 * tree should already be completely allocated.
280 */ 186 */
281 void __ref xen_build_mfn_list_list(void) 187 void __ref xen_build_mfn_list_list(void)
282 { 188 {
283 unsigned long pfn; 189 unsigned long pfn, mfn;
190 pte_t *ptep;
191 unsigned int level, topidx, mididx;
192 unsigned long *mid_mfn_p;
284 193
285 if (xen_feature(XENFEAT_auto_translated_physmap)) 194 if (xen_feature(XENFEAT_auto_translated_physmap))
286 return; 195 return;
287 196
288 /* Pre-initialize p2m_top_mfn to be completely missing */ 197 /* Pre-initialize p2m_top_mfn to be completely missing */
289 if (p2m_top_mfn == NULL) { 198 if (p2m_top_mfn == NULL) {
290 p2m_mid_missing_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); 199 p2m_mid_missing_mfn = alloc_p2m_page();
291 p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); 200 p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
292 201
293 p2m_top_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); 202 p2m_top_mfn_p = alloc_p2m_page();
294 p2m_top_mfn_p_init(p2m_top_mfn_p); 203 p2m_top_mfn_p_init(p2m_top_mfn_p);
295 204
296 p2m_top_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); 205 p2m_top_mfn = alloc_p2m_page();
297 p2m_top_mfn_init(p2m_top_mfn); 206 p2m_top_mfn_init(p2m_top_mfn);
298 } else { 207 } else {
299 /* Reinitialise, mfns all change after migration */ 208 /* Reinitialise, mfns all change after migration */
300 p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); 209 p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
301 } 210 }
302 211
303 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { 212 for (pfn = 0; pfn < xen_max_p2m_pfn && pfn < MAX_P2M_PFN;
304 unsigned topidx = p2m_top_index(pfn); 213 pfn += P2M_PER_PAGE) {
305 unsigned mididx = p2m_mid_index(pfn); 214 topidx = p2m_top_index(pfn);
306 unsigned long **mid; 215 mididx = p2m_mid_index(pfn);
307 unsigned long *mid_mfn_p;
308 216
309 mid = p2m_top[topidx];
310 mid_mfn_p = p2m_top_mfn_p[topidx]; 217 mid_mfn_p = p2m_top_mfn_p[topidx];
218 ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn),
219 &level);
220 BUG_ON(!ptep || level != PG_LEVEL_4K);
221 mfn = pte_mfn(*ptep);
222 ptep = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
311 223
312 /* Don't bother allocating any mfn mid levels if 224 /* Don't bother allocating any mfn mid levels if
313 * they're just missing, just update the stored mfn, 225 * they're just missing, just update the stored mfn,
314 * since all could have changed over a migrate. 226 * since all could have changed over a migrate.
315 */ 227 */
316 if (mid == p2m_mid_missing) { 228 if (ptep == p2m_missing_pte || ptep == p2m_identity_pte) {
317 BUG_ON(mididx); 229 BUG_ON(mididx);
318 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); 230 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
319 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); 231 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
320 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; 232 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
321 continue; 233 continue;
322 } 234 }
323 235
324 if (mid_mfn_p == p2m_mid_missing_mfn) { 236 if (mid_mfn_p == p2m_mid_missing_mfn) {
325 /* 237 mid_mfn_p = alloc_p2m_page();
326 * XXX boot-time only! We should never find
327 * missing parts of the mfn tree after
328 * runtime.
329 */
330 mid_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
331 p2m_mid_mfn_init(mid_mfn_p, p2m_missing); 238 p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
332 239
333 p2m_top_mfn_p[topidx] = mid_mfn_p; 240 p2m_top_mfn_p[topidx] = mid_mfn_p;
334 } 241 }
335 242
336 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); 243 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
337 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); 244 mid_mfn_p[mididx] = mfn;
338 } 245 }
339 } 246 }
340 247
341 void xen_setup_mfn_list_list(void) 248 void xen_setup_mfn_list_list(void)
342 { 249 {
343 if (xen_feature(XENFEAT_auto_translated_physmap)) 250 if (xen_feature(XENFEAT_auto_translated_physmap))
344 return; 251 return;
345 252
346 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); 253 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
347 254
348 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = 255 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
349 virt_to_mfn(p2m_top_mfn); 256 virt_to_mfn(p2m_top_mfn);
350 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; 257 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
351 } 258 }
352 259
353 /* Set up p2m_top to point to the domain-builder provided p2m pages */ 260 /* Set up p2m_top to point to the domain-builder provided p2m pages */
354 void __init xen_build_dynamic_phys_to_machine(void) 261 void __init xen_build_dynamic_phys_to_machine(void)
355 { 262 {
356 unsigned long *mfn_list;
357 unsigned long max_pfn;
358 unsigned long pfn; 263 unsigned long pfn;
359 264
360 if (xen_feature(XENFEAT_auto_translated_physmap)) 265 if (xen_feature(XENFEAT_auto_translated_physmap))
361 return; 266 return;
362 267
363 mfn_list = (unsigned long *)xen_start_info->mfn_list; 268 xen_p2m_addr = (unsigned long *)xen_start_info->mfn_list;
364 max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); 269 xen_p2m_size = ALIGN(xen_start_info->nr_pages, P2M_PER_PAGE);
365 xen_max_p2m_pfn = max_pfn;
366 270
367 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); 271 for (pfn = xen_start_info->nr_pages; pfn < xen_p2m_size; pfn++)
368 p2m_init(p2m_missing); 272 xen_p2m_addr[pfn] = INVALID_P2M_ENTRY;
369 p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
370 p2m_init(p2m_identity);
371 273
372 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); 274 xen_max_p2m_pfn = xen_p2m_size;
373 p2m_mid_init(p2m_mid_missing, p2m_missing); 275 }
374 p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
375 p2m_mid_init(p2m_mid_identity, p2m_identity);
376 276
377 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); 277 #define P2M_TYPE_IDENTITY 0
378 p2m_top_init(p2m_top); 278 #define P2M_TYPE_MISSING 1
279 #define P2M_TYPE_PFN 2
280 #define P2M_TYPE_UNKNOWN 3
379 281
380 /* 282 static int xen_p2m_elem_type(unsigned long pfn)
381 * The domain builder gives us a pre-constructed p2m array in 283 {
382 * mfn_list for all the pages initially given to us, so we just 284 unsigned long mfn;
383 * need to graft that into our tree structure.
384 */
385 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
386 unsigned topidx = p2m_top_index(pfn);
387 unsigned mididx = p2m_mid_index(pfn);
388 285
389 if (p2m_top[topidx] == p2m_mid_missing) { 286 if (pfn >= xen_p2m_size)
390 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); 287 return P2M_TYPE_IDENTITY;
391 p2m_mid_init(mid, p2m_missing);
392 288
393 p2m_top[topidx] = mid; 289 mfn = xen_p2m_addr[pfn];
394 }
395 290
396 /* 291 if (mfn == INVALID_P2M_ENTRY)
397 * As long as the mfn_list has enough entries to completely 292 return P2M_TYPE_MISSING;
398 * fill a p2m page, pointing into the array is ok. But if
399 * not the entries beyond the last pfn will be undefined.
400 */
401 if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
402 unsigned long p2midx;
403 293
404 p2midx = max_pfn % P2M_PER_PAGE; 294 if (mfn & IDENTITY_FRAME_BIT)
405 for ( ; p2midx < P2M_PER_PAGE; p2midx++) 295 return P2M_TYPE_IDENTITY;
406 mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
407 }
408 p2m_top[topidx][mididx] = &mfn_list[pfn];
409 }
410 296
411 m2p_override_init(); 297 return P2M_TYPE_PFN;
412 } 298 }
413 #ifdef CONFIG_X86_64 299
414 unsigned long __init xen_revector_p2m_tree(void) 300 static void __init xen_rebuild_p2m_list(unsigned long *p2m)
415 { 301 {
416 unsigned long va_start; 302 unsigned int i, chunk;
417 unsigned long va_end;
418 unsigned long pfn; 303 unsigned long pfn;
419 unsigned long pfn_free = 0; 304 unsigned long *mfns;
420 unsigned long *mfn_list = NULL; 305 pte_t *ptep;
421 unsigned long size; 306 pmd_t *pmdp;
307 int type;
422 308
423 va_start = xen_start_info->mfn_list; 309 p2m_missing = alloc_p2m_page();
424 /*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long), 310 p2m_init(p2m_missing);
425 * so make sure it is rounded up to that */ 311 p2m_identity = alloc_p2m_page();
426 size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 312 p2m_init(p2m_identity);
427 va_end = va_start + size;
428 313
429 /* If we were revectored already, don't do it again. */ 314 p2m_missing_pte = alloc_p2m_page();
430 if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET) 315 paravirt_alloc_pte(&init_mm, __pa(p2m_missing_pte) >> PAGE_SHIFT);
431 return 0; 316 p2m_identity_pte = alloc_p2m_page();
432 317 paravirt_alloc_pte(&init_mm, __pa(p2m_identity_pte) >> PAGE_SHIFT);
433 mfn_list = alloc_bootmem_align(size, PAGE_SIZE); 318 for (i = 0; i < PTRS_PER_PTE; i++) {
434 if (!mfn_list) { 319 set_pte(p2m_missing_pte + i,
435 pr_warn("Could not allocate space for a new P2M tree!\n"); 320 pfn_pte(PFN_DOWN(__pa(p2m_missing)), PAGE_KERNEL_RO));
436 return xen_start_info->mfn_list; 321 set_pte(p2m_identity_pte + i,
322 pfn_pte(PFN_DOWN(__pa(p2m_identity)), PAGE_KERNEL_RO));
437 } 323 }
438 /* Fill it out with INVALID_P2M_ENTRY value */
439 memset(mfn_list, 0xFF, size);
440 324
441 for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) { 325 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += chunk) {
442 unsigned topidx = p2m_top_index(pfn); 326 /*
443 unsigned mididx; 327 * Try to map missing/identity PMDs or p2m-pages if possible.
444 unsigned long *mid_p; 328 * We have to respect the structure of the mfn_list_list
329 * which will be built just afterwards.
330 * Chunk size to test is one p2m page if we are in the middle
331 * of a mfn_list_list mid page and the complete mid page area
332 * if we are at index 0 of the mid page. Please note that a
333 * mid page might cover more than one PMD, e.g. on 32 bit PAE
334 * kernels.
335 */
336 chunk = (pfn & (P2M_PER_PAGE * P2M_MID_PER_PAGE - 1)) ?
337 P2M_PER_PAGE : P2M_PER_PAGE * P2M_MID_PER_PAGE;
445 338
446 if (!p2m_top[topidx]) 339 type = xen_p2m_elem_type(pfn);
447 continue; 340 i = 0;
341 if (type != P2M_TYPE_PFN)
342 for (i = 1; i < chunk; i++)
343 if (xen_p2m_elem_type(pfn + i) != type)
344 break;
345 if (i < chunk)
346 /* Reset to minimal chunk size. */
347 chunk = P2M_PER_PAGE;
448 348
449 if (p2m_top[topidx] == p2m_mid_missing) 349 if (type == P2M_TYPE_PFN || i < chunk) {
350 /* Use initial p2m page contents. */
351 #ifdef CONFIG_X86_64
352 mfns = alloc_p2m_page();
353 copy_page(mfns, xen_p2m_addr + pfn);
354 #else
355 mfns = xen_p2m_addr + pfn;
356 #endif
357 ptep = populate_extra_pte((unsigned long)(p2m + pfn));
358 set_pte(ptep,
359 pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL));
450 continue; 360 continue;
361 }
451 362
452 mididx = p2m_mid_index(pfn); 363 if (chunk == P2M_PER_PAGE) {
453 mid_p = p2m_top[topidx][mididx]; 364 /* Map complete missing or identity p2m-page. */
454 if (!mid_p) 365 mfns = (type == P2M_TYPE_MISSING) ?
366 p2m_missing : p2m_identity;
367 ptep = populate_extra_pte((unsigned long)(p2m + pfn));
368 set_pte(ptep,
369 pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL_RO));
455 continue; 370 continue;
456 if ((mid_p == p2m_missing) || (mid_p == p2m_identity)) 371 }
457 continue;
458 372
459 if ((unsigned long)mid_p == INVALID_P2M_ENTRY) 373 /* Complete missing or identity PMD(s) can be mapped. */
460 continue; 374 ptep = (type == P2M_TYPE_MISSING) ?
375 p2m_missing_pte : p2m_identity_pte;
376 for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
377 pmdp = populate_extra_pmd(
378 (unsigned long)(p2m + pfn + i * PTRS_PER_PTE));
379 set_pmd(pmdp, __pmd(__pa(ptep) | _KERNPG_TABLE));
380 }
381 }
382 }
461 383
462 /* The old va. Rebase it on mfn_list */ 384 void __init xen_vmalloc_p2m_tree(void)
463 if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) { 385 {
464 unsigned long *new; 386 static struct vm_struct vm;
465 387
466 if (pfn_free > (size / sizeof(unsigned long))) { 388 vm.flags = VM_ALLOC;
467 WARN(1, "Only allocated for %ld pages, but we want %ld!\n", 389 vm.size = ALIGN(sizeof(unsigned long) * xen_max_p2m_pfn,
468 size / sizeof(unsigned long), pfn_free); 390 PMD_SIZE * PMDS_PER_MID_PAGE);
469 return 0; 391 vm_area_register_early(&vm, PMD_SIZE * PMDS_PER_MID_PAGE);
470 } 392 pr_notice("p2m virtual area at %p, size is %lx\n", vm.addr, vm.size);
471 new = &mfn_list[pfn_free];
472 393
473 copy_page(new, mid_p); 394 xen_max_p2m_pfn = vm.size / sizeof(unsigned long);
474 p2m_top[topidx][mididx] = &mfn_list[pfn_free];
475 395
476 pfn_free += P2M_PER_PAGE; 396 xen_rebuild_p2m_list(vm.addr);
477 397
478 } 398 xen_p2m_addr = vm.addr;
479 /* This should be the leafs allocated for identity from _brk. */ 399 xen_p2m_size = xen_max_p2m_pfn;
480 }
481 return (unsigned long)mfn_list;
482 400
401 xen_inv_extra_mem();
402
403 m2p_override_init();
483 } 404 }
484 #else 405
485 unsigned long __init xen_revector_p2m_tree(void)
486 {
487 return 0;
488 }
489 #endif
490 unsigned long get_phys_to_machine(unsigned long pfn) 406 unsigned long get_phys_to_machine(unsigned long pfn)
491 { 407 {
492 unsigned topidx, mididx, idx; 408 pte_t *ptep;
409 unsigned int level;
493 410
494 if (unlikely(pfn >= MAX_P2M_PFN)) 411 if (unlikely(pfn >= xen_p2m_size)) {
412 if (pfn < xen_max_p2m_pfn)
413 return xen_chk_extra_mem(pfn);
414
495 return IDENTITY_FRAME(pfn); 415 return IDENTITY_FRAME(pfn);
416 }
496 417
497 topidx = p2m_top_index(pfn); 418 ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level);
498 mididx = p2m_mid_index(pfn); 419 BUG_ON(!ptep || level != PG_LEVEL_4K);
499 idx = p2m_index(pfn);
500 420
501 /* 421 /*
502 * The INVALID_P2M_ENTRY is filled in both p2m_*identity 422 * The INVALID_P2M_ENTRY is filled in both p2m_*identity
503 * and in p2m_*missing, so returning the INVALID_P2M_ENTRY 423 * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
504 * would be wrong. 424 * would be wrong.
505 */ 425 */
506 if (p2m_top[topidx][mididx] == p2m_identity) 426 if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity)))
507 return IDENTITY_FRAME(pfn); 427 return IDENTITY_FRAME(pfn);
508 428
509 return p2m_top[topidx][mididx][idx]; 429 return xen_p2m_addr[pfn];
510 } 430 }
511 EXPORT_SYMBOL_GPL(get_phys_to_machine); 431 EXPORT_SYMBOL_GPL(get_phys_to_machine);
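Callers still have to classify the returned value themselves: a real machine frame, an identity frame with IDENTITY_FRAME_BIT set, or INVALID_P2M_ENTRY for a missing page. A small hedged helper showing that classification; the function name is illustrative and the constants are assumed from asm/xen/page.h:

/* Sketch: interpret what get_phys_to_machine() reports for a pfn. */
static const char *sketch_p2m_kind(unsigned long pfn)
{
	unsigned long mfn = get_phys_to_machine(pfn);

	if (mfn == INVALID_P2M_ENTRY)
		return "missing (e.g. ballooned out)";
	if (mfn & IDENTITY_FRAME_BIT)
		return "identity mapped (MMIO, ACPI, E820 holes)";
	return "RAM backed by a real machine frame";
}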
512 432
513 static void *alloc_p2m_page(void) 433 /*
434 * Allocate new pmd(s). It is checked whether the old pmd is still in place.
435 * If not, nothing is changed. This is okay as the only reason for allocating
436 * a new pmd is to replace p2m_missing_pte or p2m_identity_pte by an individual
437 * pmd. In case of PAE/x86-32 there are multiple pmds to allocate!
438 */
439 static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *ptep, pte_t *pte_pg)
514 { 440 {
515 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); 441 pte_t *ptechk;
516 } 442 pte_t *pteret = ptep;
443 pte_t *pte_newpg[PMDS_PER_MID_PAGE];
444 pmd_t *pmdp;
445 unsigned int level;
446 unsigned long flags;
447 unsigned long vaddr;
448 int i;
517 449
518 static void free_p2m_page(void *p) 450 /* Do all allocations first to bail out in error case. */
519 { 451 for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
520 free_page((unsigned long)p); 452 pte_newpg[i] = alloc_p2m_page();
453 if (!pte_newpg[i]) {
454 for (i--; i >= 0; i--)
455 free_p2m_page(pte_newpg[i]);
456
457 return NULL;
458 }
459 }
460
461 vaddr = addr & ~(PMD_SIZE * PMDS_PER_MID_PAGE - 1);
462
463 for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
464 copy_page(pte_newpg[i], pte_pg);
465 paravirt_alloc_pte(&init_mm, __pa(pte_newpg[i]) >> PAGE_SHIFT);
466
467 pmdp = lookup_pmd_address(vaddr);
468 BUG_ON(!pmdp);
469
470 spin_lock_irqsave(&p2m_update_lock, flags);
471
472 ptechk = lookup_address(vaddr, &level);
473 if (ptechk == pte_pg) {
474 set_pmd(pmdp,
475 __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
476 if (vaddr == (addr & ~(PMD_SIZE - 1)))
477 pteret = pte_offset_kernel(pmdp, addr);
478 pte_newpg[i] = NULL;
479 }
480
481 spin_unlock_irqrestore(&p2m_update_lock, flags);
482
483 if (pte_newpg[i]) {
484 paravirt_release_pte(__pa(pte_newpg[i]) >> PAGE_SHIFT);
485 free_p2m_page(pte_newpg[i]);
486 }
487
488 vaddr += PMD_SIZE;
489 }
490
491 return pteret;
521 } 492 }
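The scheme above is deliberately optimistic: the replacement pte page(s) are allocated with no lock held, p2m_update_lock is taken only to re-check that the pmd still points at the shared missing/identity table before the private copy is installed, and a page that lost the race is freed again. The same idea, stripped of the p2m specifics; every name below is a generic placeholder, not part of this file:

/* Skeleton of the allocate / re-check under lock / install-or-discard
 * pattern used by alloc_p2m_pmd() (alloc_p2m() below plays the same game
 * with cmpxchg for the mfn tree). */
static void *install_private_copy(void **slot, void *shared,
				  spinlock_t *lock)
{
	unsigned long flags;
	void *ret;
	void *new = kzalloc(PAGE_SIZE, GFP_KERNEL);	/* may sleep, no lock held */

	if (!new)
		return NULL;

	spin_lock_irqsave(lock, flags);
	if (*slot == shared)		/* still the shared placeholder? */
		*slot = new;		/* we won the race: install our copy */
	ret = *slot;
	spin_unlock_irqrestore(lock, flags);

	if (ret != new)
		kfree(new);		/* someone beat us to it: discard ours */
	return ret;
}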
522 493
523 /* 494 /*
524 * Fully allocate the p2m structure for a given pfn. We need to check 495 * Fully allocate the p2m structure for a given pfn. We need to check
525 * that both the top and mid levels are allocated, and make sure the 496 * that both the top and mid levels are allocated, and make sure the
526 * parallel mfn tree is kept in sync. We may race with other cpus, so 497 * parallel mfn tree is kept in sync. We may race with other cpus, so
527 * the new pages are installed with cmpxchg; if we lose the race then 498 * the new pages are installed with cmpxchg; if we lose the race then
528 * simply free the page we allocated and use the one that's there. 499 * simply free the page we allocated and use the one that's there.
529 */ 500 */
530 static bool alloc_p2m(unsigned long pfn) 501 static bool alloc_p2m(unsigned long pfn)
531 { 502 {
532 unsigned topidx, mididx; 503 unsigned topidx, mididx;
533 unsigned long ***top_p, **mid;
534 unsigned long *top_mfn_p, *mid_mfn; 504 unsigned long *top_mfn_p, *mid_mfn;
535 unsigned long *p2m_orig; 505 pte_t *ptep, *pte_pg;
506 unsigned int level;
507 unsigned long flags;
508 unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
509 unsigned long p2m_pfn;
536 510
537 topidx = p2m_top_index(pfn); 511 topidx = p2m_top_index(pfn);
538 mididx = p2m_mid_index(pfn); 512 mididx = p2m_mid_index(pfn);
539 513
540 top_p = &p2m_top[topidx]; 514 ptep = lookup_address(addr, &level);
541 mid = ACCESS_ONCE(*top_p); 515 BUG_ON(!ptep || level != PG_LEVEL_4K);
516 pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
542 517
543 if (mid == p2m_mid_missing) { 518 if (pte_pg == p2m_missing_pte || pte_pg == p2m_identity_pte) {
544 /* Mid level is missing, allocate a new one */ 519 /* PMD level is missing, allocate a new one */
545 mid = alloc_p2m_page(); 520 ptep = alloc_p2m_pmd(addr, ptep, pte_pg);
546 if (!mid) 521 if (!ptep)
547 return false; 522 return false;
548
549 p2m_mid_init(mid, p2m_missing);
550
551 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
552 free_p2m_page(mid);
553 } 523 }
554 524
555 top_mfn_p = &p2m_top_mfn[topidx]; 525 if (p2m_top_mfn) {
556 mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); 526 top_mfn_p = &p2m_top_mfn[topidx];
527 mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
557 528
558 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); 529 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
559 530
560 if (mid_mfn == p2m_mid_missing_mfn) { 531 if (mid_mfn == p2m_mid_missing_mfn) {
561 /* Separately check the mid mfn level */ 532 /* Separately check the mid mfn level */
562 unsigned long missing_mfn; 533 unsigned long missing_mfn;
563 unsigned long mid_mfn_mfn; 534 unsigned long mid_mfn_mfn;
564 unsigned long old_mfn; 535 unsigned long old_mfn;
565 536
566 mid_mfn = alloc_p2m_page(); 537 mid_mfn = alloc_p2m_page();
567 if (!mid_mfn) 538 if (!mid_mfn)
568 return false; 539 return false;
569 540
570 p2m_mid_mfn_init(mid_mfn, p2m_missing); 541 p2m_mid_mfn_init(mid_mfn, p2m_missing);
571 542
572 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); 543 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
573 mid_mfn_mfn = virt_to_mfn(mid_mfn); 544 mid_mfn_mfn = virt_to_mfn(mid_mfn);
574 old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn); 545 old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn);
575 if (old_mfn != missing_mfn) { 546 if (old_mfn != missing_mfn) {
576 free_p2m_page(mid_mfn); 547 free_p2m_page(mid_mfn);
577 mid_mfn = mfn_to_virt(old_mfn); 548 mid_mfn = mfn_to_virt(old_mfn);
578 } else { 549 } else {
579 p2m_top_mfn_p[topidx] = mid_mfn; 550 p2m_top_mfn_p[topidx] = mid_mfn;
551 }
580 } 552 }
553 } else {
554 mid_mfn = NULL;
581 } 555 }
582 556
583 p2m_orig = ACCESS_ONCE(p2m_top[topidx][mididx]); 557 p2m_pfn = pte_pfn(ACCESS_ONCE(*ptep));
584 if (p2m_orig == p2m_identity || p2m_orig == p2m_missing) { 558 if (p2m_pfn == PFN_DOWN(__pa(p2m_identity)) ||
559 p2m_pfn == PFN_DOWN(__pa(p2m_missing))) {
585 /* p2m leaf page is missing */ 560 /* p2m leaf page is missing */
586 unsigned long *p2m; 561 unsigned long *p2m;
587 562
588 p2m = alloc_p2m_page(); 563 p2m = alloc_p2m_page();
589 if (!p2m) 564 if (!p2m)
590 return false; 565 return false;
591 566
592 p2m_init(p2m); 567 if (p2m_pfn == PFN_DOWN(__pa(p2m_missing)))
593 568 p2m_init(p2m);
594 if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
595 free_p2m_page(p2m);
596 else 569 else
597 mid_mfn[mididx] = virt_to_mfn(p2m); 570 p2m_init_identity(p2m, pfn);
598 }
599 571
600 return true; 572 spin_lock_irqsave(&p2m_update_lock, flags);
601 }
602 573
603 static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) 574 if (pte_pfn(*ptep) == p2m_pfn) {
604 { 575 set_pte(ptep,
605 unsigned topidx, mididx, idx; 576 pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
606 unsigned long *p2m; 577 if (mid_mfn)
607 578 mid_mfn[mididx] = virt_to_mfn(p2m);
608 topidx = p2m_top_index(pfn); 579 p2m = NULL;
609 mididx = p2m_mid_index(pfn);
610 idx = p2m_index(pfn);
611
612 /* Pfff.. No boundary cross-over, lets get out. */
613 if (!idx && check_boundary)
614 return false;
615
616 WARN(p2m_top[topidx][mididx] == p2m_identity,
617 "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
618 topidx, mididx);
619
620 /*
621 * Could be done by xen_build_dynamic_phys_to_machine..
622 */
623 if (p2m_top[topidx][mididx] != p2m_missing)
624 return false;
625
626 /* Boundary cross-over for the edges: */
627 p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
628
629 p2m_init(p2m);
630
631 p2m_top[topidx][mididx] = p2m;
632
633 return true;
634 }
635
636 static bool __init early_alloc_p2m_middle(unsigned long pfn)
637 {
638 unsigned topidx = p2m_top_index(pfn);
639 unsigned long **mid;
640
641 mid = p2m_top[topidx];
642 if (mid == p2m_mid_missing) {
643 mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
644
645 p2m_mid_init(mid, p2m_missing);
646
647 p2m_top[topidx] = mid;
648 }
649 return true;
650 }
651
652 /*
653 * Skim over the P2M tree looking at pages that are either filled with
654 * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and
655 * replace the P2M leaf with a p2m_missing or p2m_identity.
656 * Stick the old page in the new P2M tree location.
657 */
658 static bool __init early_can_reuse_p2m_middle(unsigned long set_pfn)
659 {
660 unsigned topidx;
661 unsigned mididx;
662 unsigned ident_pfns;
663 unsigned inv_pfns;
664 unsigned long *p2m;
665 unsigned idx;
666 unsigned long pfn;
667
668 /* We only look when this entails a P2M middle layer */
669 if (p2m_index(set_pfn))
670 return false;
671
672 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) {
673 topidx = p2m_top_index(pfn);
674
675 if (!p2m_top[topidx])
676 continue;
677
678 if (p2m_top[topidx] == p2m_mid_missing)
679 continue;
680
681 mididx = p2m_mid_index(pfn);
682 p2m = p2m_top[topidx][mididx];
683 if (!p2m)
684 continue;
685
686 if ((p2m == p2m_missing) || (p2m == p2m_identity))
687 continue;
688
689 if ((unsigned long)p2m == INVALID_P2M_ENTRY)
690 continue;
691
692 ident_pfns = 0;
693 inv_pfns = 0;
694 for (idx = 0; idx < P2M_PER_PAGE; idx++) {
695 /* IDENTITY_PFNs are 1:1 */
696 if (p2m[idx] == IDENTITY_FRAME(pfn + idx))
697 ident_pfns++;
698 else if (p2m[idx] == INVALID_P2M_ENTRY)
699 inv_pfns++;
700 else
701 break;
702 } 580 }
703 if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE))
704 goto found;
705 }
706 return false;
707 found:
708 /* Found one, replace old with p2m_identity or p2m_missing */
709 p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing);
710 581
711 /* Reset where we want to stick the old page in. */ 582 spin_unlock_irqrestore(&p2m_update_lock, flags);
712 topidx = p2m_top_index(set_pfn);
713 mididx = p2m_mid_index(set_pfn);
714 583
715 /* This shouldn't happen */ 584 if (p2m)
716 if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) 585 free_p2m_page(p2m);
717 early_alloc_p2m_middle(set_pfn);
718
719 if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing))
720 return false;
721
722 p2m_init(p2m);
723 p2m_top[topidx][mididx] = p2m;
724
725 return true;
726 }
727 bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
728 {
729 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
730 if (!early_alloc_p2m_middle(pfn))
731 return false;
732
733 if (early_can_reuse_p2m_middle(pfn))
734 return __set_phys_to_machine(pfn, mfn);
735
736 if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/))
737 return false;
738
739 if (!__set_phys_to_machine(pfn, mfn))
740 return false;
741 } 586 }
742 587
743 return true; 588 return true;
744 } 589 }
745 590
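The comment above alloc_p2m() describes the lock-free half of the scheme: allocate a page, try to swing the shared pointer from the "missing" placeholder to it with a single compare-and-swap, and free the local copy if another CPU wins. A self-contained C11 sketch of that install-or-free idiom (illustrative names; atomic_compare_exchange_strong stands in for the kernel's cmpxchg):

#include <stdatomic.h>
#include <stdlib.h>

#define PAGE_WORDS 512

static unsigned long *missing_page;	/* analogue of p2m_mid_missing_mfn */
static unsigned long * _Atomic slot;	/* analogue of p2m_top_mfn[topidx] */

/* Allocate a level page and publish it, or adopt the winner's page. */
static unsigned long *install_level(void)
{
	unsigned long *expected = missing_page;
	unsigned long *newpg;

	newpg = calloc(PAGE_WORDS, sizeof(*newpg));
	if (!newpg)
		return NULL;

	if (!atomic_compare_exchange_strong(&slot, &expected, newpg)) {
		free(newpg);		/* lost the race */
		return expected;	/* use the page that is already there */
	}
	return newpg;
}

int main(void)
{
	missing_page = calloc(PAGE_WORDS, sizeof(unsigned long));
	slot = missing_page;
	return install_level() ? 0 : 1;
}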
746 static void __init early_split_p2m(unsigned long pfn)
747 {
748 unsigned long mididx, idx;
749
750 mididx = p2m_mid_index(pfn);
751 idx = p2m_index(pfn);
752
753 /*
754 * Allocate new middle and leaf pages if this pfn lies in the
755 * middle of one.
756 */
757 if (mididx || idx)
758 early_alloc_p2m_middle(pfn);
759 if (idx)
760 early_alloc_p2m(pfn, false);
761 }
762
763 unsigned long __init set_phys_range_identity(unsigned long pfn_s, 591 unsigned long __init set_phys_range_identity(unsigned long pfn_s,
764 unsigned long pfn_e) 592 unsigned long pfn_e)
765 { 593 {
766 unsigned long pfn; 594 unsigned long pfn;
767 595
768 if (unlikely(pfn_s >= MAX_P2M_PFN)) 596 if (unlikely(pfn_s >= xen_p2m_size))
769 return 0; 597 return 0;
770 598
771 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) 599 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
772 return pfn_e - pfn_s; 600 return pfn_e - pfn_s;
773 601
774 if (pfn_s > pfn_e) 602 if (pfn_s > pfn_e)
775 return 0; 603 return 0;
776 604
777 if (pfn_e > MAX_P2M_PFN) 605 if (pfn_e > xen_p2m_size)
778 pfn_e = MAX_P2M_PFN; 606 pfn_e = xen_p2m_size;
779 607
780 early_split_p2m(pfn_s); 608 for (pfn = pfn_s; pfn < pfn_e; pfn++)
781 early_split_p2m(pfn_e); 609 xen_p2m_addr[pfn] = IDENTITY_FRAME(pfn);
782 610
783 for (pfn = pfn_s; pfn < pfn_e;) {
784 unsigned topidx = p2m_top_index(pfn);
785 unsigned mididx = p2m_mid_index(pfn);
786
787 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
788 break;
789 pfn++;
790
791 /*
792 * If the PFN was set to a middle or leaf identity
793 * page the remainder must also be identity, so skip
794 * ahead to the next middle or leaf entry.
795 */
796 if (p2m_top[topidx] == p2m_mid_identity)
797 pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE);
798 else if (p2m_top[topidx][mididx] == p2m_identity)
799 pfn = ALIGN(pfn, P2M_PER_PAGE);
800 }
801
802 WARN((pfn - pfn_s) != (pfn_e - pfn_s),
803 "Identity mapping failed. We are %ld short of 1-1 mappings!\n",
804 (pfn_e - pfn_s) - (pfn - pfn_s));
805
806 return pfn - pfn_s; 611 return pfn - pfn_s;
807 } 612 }
808 613
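With the linear, virtually mapped p2m list, set_phys_range_identity() above collapses to a plain loop that writes IDENTITY_FRAME(pfn) into consecutive entries. A small standalone illustration of that encoding and loop (the bit position is chosen in the spirit of the kernel's IDENTITY_FRAME_BIT but is only illustrative here):

#include <stdio.h>

/* Illustrative identity marker in the second-highest bit of an entry. */
#define IDENTITY_BIT	(1UL << (sizeof(unsigned long) * 8 - 2))
#define IDENTITY(pfn)	((pfn) | IDENTITY_BIT)

static unsigned long p2m[1024];		/* stand-in for the flat xen_p2m_addr[] */

/* A flat list turns the 1:1 range setup into a simple store loop. */
static unsigned long set_range_identity(unsigned long s, unsigned long e)
{
	unsigned long pfn;

	for (pfn = s; pfn < e; pfn++)
		p2m[pfn] = IDENTITY(pfn);
	return e - s;
}

int main(void)
{
	printf("marked %lu pfns as 1:1\n", set_range_identity(16, 32));
	return 0;
}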
809 /* Try to install p2m mapping; fail if intermediate bits missing */
810 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) 614 bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
811 { 615 {
812 unsigned topidx, mididx, idx; 616 pte_t *ptep;
617 unsigned int level;
813 618
814 /* don't track P2M changes in autotranslate guests */ 619 /* don't track P2M changes in autotranslate guests */
815 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) 620 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
816 return true; 621 return true;
817 622
818 if (unlikely(pfn >= MAX_P2M_PFN)) { 623 if (unlikely(pfn >= xen_p2m_size)) {
819 BUG_ON(mfn != INVALID_P2M_ENTRY); 624 BUG_ON(mfn != INVALID_P2M_ENTRY);
820 return true; 625 return true;
821 } 626 }
822 627
823 topidx = p2m_top_index(pfn); 628 if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
824 mididx = p2m_mid_index(pfn); 629 return true;
825 idx = p2m_index(pfn);
826 630
827 /* For sparse holes were the p2m leaf has real PFN along with 631 ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level);
828 * PCI holes, stick in the PFN as the MFN value. 632 BUG_ON(!ptep || level != PG_LEVEL_4K);
829 *
830 * set_phys_range_identity() will have allocated new middle
831 * and leaf pages as required so an existing p2m_mid_missing
832 * or p2m_missing mean that whole range will be identity so
833 * these can be switched to p2m_mid_identity or p2m_identity.
834 */
835 if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
836 if (p2m_top[topidx] == p2m_mid_identity)
837 return true;
838 633
839 if (p2m_top[topidx] == p2m_mid_missing) { 634 if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_missing)))
840 WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing,
841 p2m_mid_identity) != p2m_mid_missing);
842 return true;
843 }
844
845 if (p2m_top[topidx][mididx] == p2m_identity)
846 return true;
847
848 /* Swap over from MISSING to IDENTITY if needed. */
849 if (p2m_top[topidx][mididx] == p2m_missing) {
850 WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
851 p2m_identity) != p2m_missing);
852 return true;
853 }
854 }
855
856 if (p2m_top[topidx][mididx] == p2m_missing)
857 return mfn == INVALID_P2M_ENTRY; 635 return mfn == INVALID_P2M_ENTRY;
858 636
859 p2m_top[topidx][mididx][idx] = mfn; 637 if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity)))
638 return mfn == IDENTITY_FRAME(pfn);
860 639
861 return true; 640 return false;
862 } 641 }
863 642
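The new __set_phys_to_machine() above first attempts a fault-safe store into the linear list; if the backing page is one of the shared read-only pages (p2m_missing or p2m_identity) the store fails, and the call only succeeds when the requested value is exactly what that shared page already represents. A compact C model of that decision (the enum and constants are illustrative, not kernel definitions):

/* What kind of page backs the p2m entry we are writing. */
enum p2m_page_kind { P2M_WRITABLE, P2M_RO_MISSING, P2M_RO_IDENTITY };

#define INVALID_ENTRY	(~0UL)
#define IDENTITY(pfn)	((pfn) | (1UL << (sizeof(unsigned long) * 8 - 2)))

static int try_set_p2m(enum p2m_page_kind kind, unsigned long *slot,
		       unsigned long pfn, unsigned long mfn)
{
	if (kind == P2M_WRITABLE) {	/* normal, writable leaf page */
		*slot = mfn;
		return 1;
	}
	/*
	 * Read-only shared page: the store "faults"; report success only if
	 * the value is already what the shared page stands for, otherwise
	 * the caller has to allocate a private leaf first.
	 */
	if (kind == P2M_RO_MISSING)
		return mfn == INVALID_ENTRY;
	return mfn == IDENTITY(pfn);	/* P2M_RO_IDENTITY */
}

int main(void)
{
	unsigned long leaf = INVALID_ENTRY;

	return try_set_p2m(P2M_WRITABLE, &leaf, 7, IDENTITY(7)) &&
	       try_set_p2m(P2M_RO_MISSING, &leaf, 7, INVALID_ENTRY) ? 0 : 1;
}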
864 bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) 643 bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
865 { 644 {
866 if (unlikely(!__set_phys_to_machine(pfn, mfn))) { 645 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
867 if (!alloc_p2m(pfn)) 646 if (!alloc_p2m(pfn))
868 return false; 647 return false;
869 648
870 if (!__set_phys_to_machine(pfn, mfn)) 649 return __set_phys_to_machine(pfn, mfn);
871 return false;
872 } 650 }
873 651
874 return true; 652 return true;
875 } 653 }
876 654
877 #define M2P_OVERRIDE_HASH_SHIFT 10 655 #define M2P_OVERRIDE_HASH_SHIFT 10
878 #define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) 656 #define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
879 657
880 static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH); 658 static struct list_head *m2p_overrides;
881 static DEFINE_SPINLOCK(m2p_override_lock); 659 static DEFINE_SPINLOCK(m2p_override_lock);
882 660
883 static void __init m2p_override_init(void) 661 static void __init m2p_override_init(void)
884 { 662 {
885 unsigned i; 663 unsigned i;
886 664
887 m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, 665 m2p_overrides = alloc_bootmem_align(
888 sizeof(unsigned long)); 666 sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
667 sizeof(unsigned long));
889 668
890 for (i = 0; i < M2P_OVERRIDE_HASH; i++) 669 for (i = 0; i < M2P_OVERRIDE_HASH; i++)
891 INIT_LIST_HEAD(&m2p_overrides[i]); 670 INIT_LIST_HEAD(&m2p_overrides[i]);
892 } 671 }
893 672
894 static unsigned long mfn_hash(unsigned long mfn) 673 static unsigned long mfn_hash(unsigned long mfn)
895 { 674 {
896 return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); 675 return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
897 } 676 }
898 677
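The m2p override table above is a fixed array of M2P_OVERRIDE_HASH buckets indexed by a hash of the mfn; add and lookup are plain bucket-list operations done under m2p_override_lock. A simplified userspace version of the same structure (singly linked buckets, a multiplicative hash instead of hash_long(), and no locking):

#include <stddef.h>

#define HASH_SHIFT	10
#define HASH_SIZE	(1 << HASH_SHIFT)

struct override {
	struct override *next;
	unsigned long mfn;
	unsigned long pfn;
};

static struct override *buckets[HASH_SIZE];

static unsigned long mfn_hash(unsigned long mfn)
{
	return (mfn * 2654435761UL) % HASH_SIZE;	/* simplified hash */
}

static void override_add(struct override *o)
{
	struct override **head = &buckets[mfn_hash(o->mfn)];

	o->next = *head;
	*head = o;
}

static struct override *override_find(unsigned long mfn)
{
	struct override *o;

	for (o = buckets[mfn_hash(mfn)]; o; o = o->next)
		if (o->mfn == mfn)
			return o;
	return NULL;
}

int main(void)
{
	static struct override o = { NULL, 0x1234, 42 };

	override_add(&o);
	return override_find(0x1234) ? 0 : 1;
}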
899 int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
900 struct gnttab_map_grant_ref *kmap_ops,
901 struct page **pages, unsigned int count)
902 {
903 int i, ret = 0;
904 bool lazy = false;
905 pte_t *pte;
906
907 if (xen_feature(XENFEAT_auto_translated_physmap))
908 return 0;
909
910 if (kmap_ops &&
911 !in_interrupt() &&
912 paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
913 arch_enter_lazy_mmu_mode();
914 lazy = true;
915 }
916
917 for (i = 0; i < count; i++) {
918 unsigned long mfn, pfn;
919
920 /* Do not add to override if the map failed. */
921 if (map_ops[i].status)
922 continue;
923
924 if (map_ops[i].flags & GNTMAP_contains_pte) {
925 pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
926 (map_ops[i].host_addr & ~PAGE_MASK));
927 mfn = pte_mfn(*pte);
928 } else {
929 mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
930 }
931 pfn = page_to_pfn(pages[i]);
932
933 WARN_ON(PagePrivate(pages[i]));
934 SetPagePrivate(pages[i]);
935 set_page_private(pages[i], mfn);
936 pages[i]->index = pfn_to_mfn(pfn);
937
938 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
939 ret = -ENOMEM;
940 goto out;
941 }
942
943 if (kmap_ops) {
944 ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
945 if (ret)
946 goto out;
947 }
948 }
949
950 out:
951 if (lazy)
952 arch_leave_lazy_mmu_mode();
953
954 return ret;
955 }
956 EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
957
958 /* Add an MFN override for a particular page */ 678 /* Add an MFN override for a particular page */
959 int m2p_add_override(unsigned long mfn, struct page *page, 679 static int m2p_add_override(unsigned long mfn, struct page *page,
960 struct gnttab_map_grant_ref *kmap_op) 680 struct gnttab_map_grant_ref *kmap_op)
961 { 681 {
962 unsigned long flags; 682 unsigned long flags;
963 unsigned long pfn; 683 unsigned long pfn;
964 unsigned long uninitialized_var(address); 684 unsigned long uninitialized_var(address);
965 unsigned level; 685 unsigned level;
966 pte_t *ptep = NULL; 686 pte_t *ptep = NULL;
967 687
968 pfn = page_to_pfn(page); 688 pfn = page_to_pfn(page);
969 if (!PageHighMem(page)) { 689 if (!PageHighMem(page)) {
970 address = (unsigned long)__va(pfn << PAGE_SHIFT); 690 address = (unsigned long)__va(pfn << PAGE_SHIFT);
971 ptep = lookup_address(address, &level); 691 ptep = lookup_address(address, &level);
972 if (WARN(ptep == NULL || level != PG_LEVEL_4K, 692 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
973 "m2p_add_override: pfn %lx not mapped", pfn)) 693 "m2p_add_override: pfn %lx not mapped", pfn))
974 return -EINVAL; 694 return -EINVAL;
975 } 695 }
976 696
977 if (kmap_op != NULL) { 697 if (kmap_op != NULL) {
978 if (!PageHighMem(page)) { 698 if (!PageHighMem(page)) {
979 struct multicall_space mcs = 699 struct multicall_space mcs =
980 xen_mc_entry(sizeof(*kmap_op)); 700 xen_mc_entry(sizeof(*kmap_op));
981 701
982 MULTI_grant_table_op(mcs.mc, 702 MULTI_grant_table_op(mcs.mc,
983 GNTTABOP_map_grant_ref, kmap_op, 1); 703 GNTTABOP_map_grant_ref, kmap_op, 1);
984 704
985 xen_mc_issue(PARAVIRT_LAZY_MMU); 705 xen_mc_issue(PARAVIRT_LAZY_MMU);
986 } 706 }
987 } 707 }
988 spin_lock_irqsave(&m2p_override_lock, flags); 708 spin_lock_irqsave(&m2p_override_lock, flags);
989 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); 709 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
990 spin_unlock_irqrestore(&m2p_override_lock, flags); 710 spin_unlock_irqrestore(&m2p_override_lock, flags);
991 711
992 /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in 712 /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in
993 * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other 713 * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other
994 * pfn so that the following mfn_to_pfn(mfn) calls will return the 714 * pfn so that the following mfn_to_pfn(mfn) calls will return the
995 * pfn from the m2p_override (the backend pfn) instead. 715 * pfn from the m2p_override (the backend pfn) instead.
996 * We need to do this because the pages shared by the frontend 716 * We need to do this because the pages shared by the frontend
997 * (xen-blkfront) can be already locked (lock_page, called by 717 * (xen-blkfront) can be already locked (lock_page, called by
998 * do_read_cache_page); when the userspace backend tries to use them 718 * do_read_cache_page); when the userspace backend tries to use them
999 * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so 719 * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so
1000 * do_blockdev_direct_IO is going to try to lock the same pages 720 * do_blockdev_direct_IO is going to try to lock the same pages
1001 * again resulting in a deadlock. 721 * again resulting in a deadlock.
1002 * As a side effect get_user_pages_fast might not be safe on the 722 * As a side effect get_user_pages_fast might not be safe on the
1003 * frontend pages while they are being shared with the backend, 723 * frontend pages while they are being shared with the backend,
1004 * because mfn_to_pfn (that ends up being called by GUPF) will 724 * because mfn_to_pfn (that ends up being called by GUPF) will
1005 * return the backend pfn rather than the frontend pfn. */ 725 * return the backend pfn rather than the frontend pfn. */
1006 pfn = mfn_to_pfn_no_overrides(mfn); 726 pfn = mfn_to_pfn_no_overrides(mfn);
1007 if (get_phys_to_machine(pfn) == mfn) 727 if (__pfn_to_mfn(pfn) == mfn)
1008 set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); 728 set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
1009 729
1010 return 0; 730 return 0;
1011 } 731 }
1012 EXPORT_SYMBOL_GPL(m2p_add_override);
1013 732
1014 int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, 733 int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
1015 struct gnttab_map_grant_ref *kmap_ops, 734 struct gnttab_map_grant_ref *kmap_ops,
1016 struct page **pages, unsigned int count) 735 struct page **pages, unsigned int count)
1017 { 736 {
1018 int i, ret = 0; 737 int i, ret = 0;
1019 bool lazy = false; 738 bool lazy = false;
739 pte_t *pte;
1020 740
1021 if (xen_feature(XENFEAT_auto_translated_physmap)) 741 if (xen_feature(XENFEAT_auto_translated_physmap))
1022 return 0; 742 return 0;
1023 743
1024 if (kmap_ops && 744 if (kmap_ops &&
1025 !in_interrupt() && 745 !in_interrupt() &&
1026 paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { 746 paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
1027 arch_enter_lazy_mmu_mode(); 747 arch_enter_lazy_mmu_mode();
1028 lazy = true; 748 lazy = true;
1029 } 749 }
1030 750
1031 for (i = 0; i < count; i++) { 751 for (i = 0; i < count; i++) {
1032 unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i])); 752 unsigned long mfn, pfn;
1033 unsigned long pfn = page_to_pfn(pages[i]);
1034 753
1035 if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { 754 /* Do not add to override if the map failed. */
1036 ret = -EINVAL; 755 if (map_ops[i].status)
1037 goto out; 756 continue;
757
758 if (map_ops[i].flags & GNTMAP_contains_pte) {
759 pte = (pte_t *)(mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
760 (map_ops[i].host_addr & ~PAGE_MASK));
761 mfn = pte_mfn(*pte);
762 } else {
763 mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
1038 } 764 }
765 pfn = page_to_pfn(pages[i]);
1039 766
1040 set_page_private(pages[i], INVALID_P2M_ENTRY); 767 WARN_ON(PagePrivate(pages[i]));
1041 WARN_ON(!PagePrivate(pages[i])); 768 SetPagePrivate(pages[i]);
1042 ClearPagePrivate(pages[i]); 769 set_page_private(pages[i], mfn);
1043 set_phys_to_machine(pfn, pages[i]->index); 770 pages[i]->index = pfn_to_mfn(pfn);
1044 771
1045 if (kmap_ops) 772 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
1046 ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn); 773 ret = -ENOMEM;
1047 if (ret)
1048 goto out; 774 goto out;
775 }
776
777 if (kmap_ops) {
778 ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
779 if (ret)
780 goto out;
781 }
1049 } 782 }
1050 783
1051 out: 784 out:
1052 if (lazy) 785 if (lazy)
1053 arch_leave_lazy_mmu_mode(); 786 arch_leave_lazy_mmu_mode();
787
1054 return ret; 788 return ret;
1055 } 789 }
1056 EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); 790 EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
1057 791
1058 int m2p_remove_override(struct page *page, 792 static struct page *m2p_find_override(unsigned long mfn)
1059 struct gnttab_map_grant_ref *kmap_op,
1060 unsigned long mfn)
1061 { 793 {
1062 unsigned long flags; 794 unsigned long flags;
795 struct list_head *bucket;
796 struct page *p, *ret;
797
798 if (unlikely(!m2p_overrides))
799 return NULL;
800
801 ret = NULL;
802 bucket = &m2p_overrides[mfn_hash(mfn)];
803
804 spin_lock_irqsave(&m2p_override_lock, flags);
805
806 list_for_each_entry(p, bucket, lru) {
807 if (page_private(p) == mfn) {
808 ret = p;
809 break;
810 }
811 }
812
813 spin_unlock_irqrestore(&m2p_override_lock, flags);
814
815 return ret;
816 }
817
818 static int m2p_remove_override(struct page *page,
819 struct gnttab_map_grant_ref *kmap_op,
820 unsigned long mfn)
821 {
822 unsigned long flags;
1063 unsigned long pfn; 823 unsigned long pfn;
1064 unsigned long uninitialized_var(address); 824 unsigned long uninitialized_var(address);
1065 unsigned level; 825 unsigned level;
1066 pte_t *ptep = NULL; 826 pte_t *ptep = NULL;
1067 827
1068 pfn = page_to_pfn(page); 828 pfn = page_to_pfn(page);
1069 829
1070 if (!PageHighMem(page)) { 830 if (!PageHighMem(page)) {
1071 address = (unsigned long)__va(pfn << PAGE_SHIFT); 831 address = (unsigned long)__va(pfn << PAGE_SHIFT);
1072 ptep = lookup_address(address, &level); 832 ptep = lookup_address(address, &level);
1073 833
1074 if (WARN(ptep == NULL || level != PG_LEVEL_4K, 834 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
1075 "m2p_remove_override: pfn %lx not mapped", pfn)) 835 "m2p_remove_override: pfn %lx not mapped", pfn))
1076 return -EINVAL; 836 return -EINVAL;
1077 } 837 }
1078 838
1079 spin_lock_irqsave(&m2p_override_lock, flags); 839 spin_lock_irqsave(&m2p_override_lock, flags);
1080 list_del(&page->lru); 840 list_del(&page->lru);
1081 spin_unlock_irqrestore(&m2p_override_lock, flags); 841 spin_unlock_irqrestore(&m2p_override_lock, flags);
1082 842
1083 if (kmap_op != NULL) { 843 if (kmap_op != NULL) {
1084 if (!PageHighMem(page)) { 844 if (!PageHighMem(page)) {
1085 struct multicall_space mcs; 845 struct multicall_space mcs;
1086 struct gnttab_unmap_and_replace *unmap_op; 846 struct gnttab_unmap_and_replace *unmap_op;
1087 struct page *scratch_page = get_balloon_scratch_page(); 847 struct page *scratch_page = get_balloon_scratch_page();
1088 unsigned long scratch_page_address = (unsigned long) 848 unsigned long scratch_page_address = (unsigned long)
1089 __va(page_to_pfn(scratch_page) << PAGE_SHIFT); 849 __va(page_to_pfn(scratch_page) << PAGE_SHIFT);
1090 850
1091 /* 851 /*
1092 * It might be that we queued all the m2p grant table 852 * It might be that we queued all the m2p grant table
1093 * hypercalls in a multicall, then m2p_remove_override 853 * hypercalls in a multicall, then m2p_remove_override
1094 * gets called before the multicall has actually been 854 * gets called before the multicall has actually been
1095 * issued. In this case handle is going to -1 because 855 * issued. In this case handle is going to -1 because
1096 * it hasn't been modified yet. 856 * it hasn't been modified yet.
arch/x86/xen/setup.c
1 /* 1 /*
2 * Machine specific setup for xen 2 * Machine specific setup for xen
3 * 3 *
4 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 4 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
5 */ 5 */
6 6
7 #include <linux/module.h> 7 #include <linux/module.h>
8 #include <linux/sched.h> 8 #include <linux/sched.h>
9 #include <linux/mm.h> 9 #include <linux/mm.h>
10 #include <linux/pm.h> 10 #include <linux/pm.h>
11 #include <linux/memblock.h> 11 #include <linux/memblock.h>
12 #include <linux/cpuidle.h> 12 #include <linux/cpuidle.h>
13 #include <linux/cpufreq.h> 13 #include <linux/cpufreq.h>
14 14
15 #include <asm/elf.h> 15 #include <asm/elf.h>
16 #include <asm/vdso.h> 16 #include <asm/vdso.h>
17 #include <asm/e820.h> 17 #include <asm/e820.h>
18 #include <asm/setup.h> 18 #include <asm/setup.h>
19 #include <asm/acpi.h> 19 #include <asm/acpi.h>
20 #include <asm/numa.h> 20 #include <asm/numa.h>
21 #include <asm/xen/hypervisor.h> 21 #include <asm/xen/hypervisor.h>
22 #include <asm/xen/hypercall.h> 22 #include <asm/xen/hypercall.h>
23 23
24 #include <xen/xen.h> 24 #include <xen/xen.h>
25 #include <xen/page.h> 25 #include <xen/page.h>
26 #include <xen/interface/callback.h> 26 #include <xen/interface/callback.h>
27 #include <xen/interface/memory.h> 27 #include <xen/interface/memory.h>
28 #include <xen/interface/physdev.h> 28 #include <xen/interface/physdev.h>
29 #include <xen/features.h> 29 #include <xen/features.h>
30 #include "xen-ops.h" 30 #include "xen-ops.h"
31 #include "vdso.h" 31 #include "vdso.h"
32 #include "p2m.h" 32 #include "p2m.h"
33 #include "mmu.h"
33 34
34 /* These are code, but not functions. Defined in entry.S */ 35 /* These are code, but not functions. Defined in entry.S */
35 extern const char xen_hypervisor_callback[]; 36 extern const char xen_hypervisor_callback[];
36 extern const char xen_failsafe_callback[]; 37 extern const char xen_failsafe_callback[];
37 #ifdef CONFIG_X86_64 38 #ifdef CONFIG_X86_64
38 extern asmlinkage void nmi(void); 39 extern asmlinkage void nmi(void);
39 #endif 40 #endif
40 extern void xen_sysenter_target(void); 41 extern void xen_sysenter_target(void);
41 extern void xen_syscall_target(void); 42 extern void xen_syscall_target(void);
42 extern void xen_syscall32_target(void); 43 extern void xen_syscall32_target(void);
43 44
44 /* Amount of extra memory space we add to the e820 ranges */ 45 /* Amount of extra memory space we add to the e820 ranges */
45 struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; 46 struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
46 47
47 /* Number of pages released from the initial allocation. */ 48 /* Number of pages released from the initial allocation. */
48 unsigned long xen_released_pages; 49 unsigned long xen_released_pages;
49 50
50 /* Buffer used to remap identity mapped pages */ 51 /*
51 unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata; 52 * Buffer used to remap identity mapped pages. We only need the virtual space.
53 * The physical page behind this address is remapped as needed to different
54 * buffer pages.
55 */
56 #define REMAP_SIZE (P2M_PER_PAGE - 3)
57 static struct {
58 unsigned long next_area_mfn;
59 unsigned long target_pfn;
60 unsigned long size;
61 unsigned long mfns[REMAP_SIZE];
62 } xen_remap_buf __initdata __aligned(PAGE_SIZE);
63 static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
52 64
53 /* 65 /*
54 * The maximum amount of extra memory compared to the base size. The 66 * The maximum amount of extra memory compared to the base size. The
55 * main scaling factor is the size of struct page. At extreme ratios 67 * main scaling factor is the size of struct page. At extreme ratios
56 * of base:extra, all the base memory can be filled with page 68 * of base:extra, all the base memory can be filled with page
57 * structures for the extra memory, leaving no space for anything 69 * structures for the extra memory, leaving no space for anything
58 * else. 70 * else.
59 * 71 *
60 * 10x seems like a reasonable balance between scaling flexibility and 72 * 10x seems like a reasonable balance between scaling flexibility and
61 * leaving a practically usable system. 73 * leaving a practically usable system.
62 */ 74 */
63 #define EXTRA_MEM_RATIO (10) 75 #define EXTRA_MEM_RATIO (10)
64 76
65 static void __init xen_add_extra_mem(u64 start, u64 size) 77 static void __init xen_add_extra_mem(u64 start, u64 size)
66 { 78 {
67 unsigned long pfn;
68 int i; 79 int i;
69 80
70 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { 81 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
71 /* Add new region. */ 82 /* Add new region. */
72 if (xen_extra_mem[i].size == 0) { 83 if (xen_extra_mem[i].size == 0) {
73 xen_extra_mem[i].start = start; 84 xen_extra_mem[i].start = start;
74 xen_extra_mem[i].size = size; 85 xen_extra_mem[i].size = size;
75 break; 86 break;
76 } 87 }
77 /* Append to existing region. */ 88 /* Append to existing region. */
78 if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { 89 if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
79 xen_extra_mem[i].size += size; 90 xen_extra_mem[i].size += size;
80 break; 91 break;
81 } 92 }
82 } 93 }
83 if (i == XEN_EXTRA_MEM_MAX_REGIONS) 94 if (i == XEN_EXTRA_MEM_MAX_REGIONS)
84 printk(KERN_WARNING "Warning: not enough extra memory regions\n"); 95 printk(KERN_WARNING "Warning: not enough extra memory regions\n");
85 96
86 memblock_reserve(start, size); 97 memblock_reserve(start, size);
98 }
87 99
88 xen_max_p2m_pfn = PFN_DOWN(start + size); 100 static void __init xen_del_extra_mem(u64 start, u64 size)
89 for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { 101 {
90 unsigned long mfn = pfn_to_mfn(pfn); 102 int i;
103 u64 start_r, size_r;
91 104
92 if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) 105 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
93 continue; 106 start_r = xen_extra_mem[i].start;
94 WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", 107 size_r = xen_extra_mem[i].size;
95 pfn, mfn);
96 108
97 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 109 /* Start of region. */
110 if (start_r == start) {
111 BUG_ON(size > size_r);
112 xen_extra_mem[i].start += size;
113 xen_extra_mem[i].size -= size;
114 break;
115 }
116 /* End of region. */
117 if (start_r + size_r == start + size) {
118 BUG_ON(size > size_r);
119 xen_extra_mem[i].size -= size;
120 break;
121 }
122 /* Mid of region. */
123 if (start > start_r && start < start_r + size_r) {
124 BUG_ON(start + size > start_r + size_r);
125 xen_extra_mem[i].size = start - start_r;
126 /* Calling memblock_reserve() again is okay. */
127 xen_add_extra_mem(start + size, start_r + size_r -
128 (start + size));
129 break;
130 }
98 } 131 }
132 memblock_free(start, size);
99 } 133 }
100 134
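xen_del_extra_mem() above has to cope with the deleted range touching the front of a region, the back of a region, or lying entirely inside one (in which case the region is split and the tail re-added). The arithmetic can be shown with a small standalone helper (illustrative types; the split-off remainder is returned through *tail instead of being re-added to a global array):

#include <stdio.h>

struct region {
	unsigned long long start;
	unsigned long long size;
};

/* Remove [start, start + size) from *r; *tail receives any split-off rest. */
static void region_del(struct region *r, struct region *tail,
		       unsigned long long start, unsigned long long size)
{
	unsigned long long r_end = r->start + r->size;
	unsigned long long end = start + size;

	tail->start = 0;
	tail->size = 0;

	if (start == r->start) {			/* front of the region */
		r->start += size;
		r->size -= size;
	} else if (end == r_end) {			/* back of the region */
		r->size -= size;
	} else if (start > r->start && start < r_end) {	/* middle: split */
		r->size = start - r->start;
		tail->start = end;
		tail->size = r_end - end;
	}
}

int main(void)
{
	struct region r = { 0x1000, 0x10000 }, tail;

	region_del(&r, &tail, 0x2000, 0x1000);
	printf("head %llx+%llx, tail %llx+%llx\n",
	       r.start, r.size, tail.start, tail.size);
	return 0;
}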
101 static unsigned long __init xen_do_chunk(unsigned long start, 135 /*
102 unsigned long end, bool release) 136 * Called during boot before the p2m list can take entries beyond the
137 * hypervisor supplied p2m list. Entries in extra mem are to be regarded as
138 * invalid.
139 */
140 unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
103 { 141 {
104 struct xen_memory_reservation reservation = { 142 int i;
105 .address_bits = 0, 143 unsigned long addr = PFN_PHYS(pfn);
106 .extent_order = 0,
107 .domid = DOMID_SELF
108 };
109 unsigned long len = 0;
110 unsigned long pfn;
111 int ret;
112 144
113 for (pfn = start; pfn < end; pfn++) { 145 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
114 unsigned long frame; 146 if (addr >= xen_extra_mem[i].start &&
115 unsigned long mfn = pfn_to_mfn(pfn); 147 addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
148 return INVALID_P2M_ENTRY;
149 }
116 150
117 if (release) { 151 return IDENTITY_FRAME(pfn);
118 /* Make sure pfn exists to start with */ 152 }
119 if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
120 continue;
121 frame = mfn;
122 } else {
123 if (mfn != INVALID_P2M_ENTRY)
124 continue;
125 frame = pfn;
126 }
127 set_xen_guest_handle(reservation.extent_start, &frame);
128 reservation.nr_extents = 1;
129 153
130 ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap, 154 /*
131 &reservation); 155 * Mark all pfns of extra mem as invalid in p2m list.
132 WARN(ret != 1, "Failed to %s pfn %lx err=%d\n", 156 */
133 release ? "release" : "populate", pfn, ret); 157 void __init xen_inv_extra_mem(void)
158 {
159 unsigned long pfn, pfn_s, pfn_e;
160 int i;
134 161
135 if (ret == 1) { 162 for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
136 if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) { 163 pfn_s = PFN_DOWN(xen_extra_mem[i].start);
137 if (release) 164 pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
138 break; 165 for (pfn = pfn_s; pfn < pfn_e; pfn++)
139 set_xen_guest_handle(reservation.extent_start, &frame); 166 set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
140 reservation.nr_extents = 1;
141 ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
142 &reservation);
143 break;
144 }
145 len++;
146 } else
147 break;
148 } 167 }
149 if (len)
150 printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
151 release ? "Freeing" : "Populating",
152 start, end, len,
153 release ? "freed" : "added");
154
155 return len;
156 } 168 }
157 169
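xen_chk_extra_mem() and xen_inv_extra_mem() above implement the rule spelled out in the comment: any pfn that falls inside an extra-memory region is reported as invalid, everything else is treated as 1:1. A few lines of standalone C capture the check (PAGE_SHIFT and the identity encoding are illustrative):

struct mem_region { unsigned long long start, size; };

#define PAGE_SHIFT	12
#define INVALID_ENTRY	(~0UL)
#define IDENTITY(pfn)	((pfn) | (1UL << (sizeof(unsigned long) * 8 - 2)))

static unsigned long chk_extra_mem(const struct mem_region *regs, int n,
				   unsigned long pfn)
{
	unsigned long long addr = (unsigned long long)pfn << PAGE_SHIFT;
	int i;

	for (i = 0; i < n; i++)
		if (addr >= regs[i].start &&
		    addr < regs[i].start + regs[i].size)
			return INVALID_ENTRY;
	return IDENTITY(pfn);
}

int main(void)
{
	struct mem_region extra[] = { { 0x100000, 0x40000 } };

	/* pfn 0x100 lies in the extra region, pfn 0x10 does not. */
	return chk_extra_mem(extra, 1, 0x100) == INVALID_ENTRY &&
	       chk_extra_mem(extra, 1, 0x10) != INVALID_ENTRY ? 0 : 1;
}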
158 /* 170 /*
159 * Finds the next RAM pfn available in the E820 map after min_pfn. 171 * Finds the next RAM pfn available in the E820 map after min_pfn.
160 * This function updates min_pfn with the pfn found and returns 172 * This function updates min_pfn with the pfn found and returns
161 * the size of that range or zero if not found. 173 * the size of that range or zero if not found.
162 */ 174 */
163 static unsigned long __init xen_find_pfn_range( 175 static unsigned long __init xen_find_pfn_range(
164 const struct e820entry *list, size_t map_size, 176 const struct e820entry *list, size_t map_size,
165 unsigned long *min_pfn) 177 unsigned long *min_pfn)
166 { 178 {
167 const struct e820entry *entry; 179 const struct e820entry *entry;
168 unsigned int i; 180 unsigned int i;
169 unsigned long done = 0; 181 unsigned long done = 0;
170 182
171 for (i = 0, entry = list; i < map_size; i++, entry++) { 183 for (i = 0, entry = list; i < map_size; i++, entry++) {
172 unsigned long s_pfn; 184 unsigned long s_pfn;
173 unsigned long e_pfn; 185 unsigned long e_pfn;
174 186
175 if (entry->type != E820_RAM) 187 if (entry->type != E820_RAM)
176 continue; 188 continue;
177 189
178 e_pfn = PFN_DOWN(entry->addr + entry->size); 190 e_pfn = PFN_DOWN(entry->addr + entry->size);
179 191
180 /* We only care about E820 after this */ 192 /* We only care about E820 after this */
181 if (e_pfn < *min_pfn) 193 if (e_pfn < *min_pfn)
182 continue; 194 continue;
183 195
184 s_pfn = PFN_UP(entry->addr); 196 s_pfn = PFN_UP(entry->addr);
185 197
186 /* If min_pfn falls within the E820 entry, we want to start 198 /* If min_pfn falls within the E820 entry, we want to start
187 * at the min_pfn PFN. 199 * at the min_pfn PFN.
188 */ 200 */
189 if (s_pfn <= *min_pfn) { 201 if (s_pfn <= *min_pfn) {
190 done = e_pfn - *min_pfn; 202 done = e_pfn - *min_pfn;
191 } else { 203 } else {
192 done = e_pfn - s_pfn; 204 done = e_pfn - s_pfn;
193 *min_pfn = s_pfn; 205 *min_pfn = s_pfn;
194 } 206 }
195 break; 207 break;
196 } 208 }
197 209
198 return done; 210 return done;
199 } 211 }
200 212
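xen_find_pfn_range() above scans the E820 map for the next block of RAM pfns at or after *min_pfn, updates *min_pfn and returns the block length (or 0 when the map is exhausted). A standalone model of that contract (a minimal e820-like entry type; E820_RAM is represented by type 1):

#include <stdio.h>
#include <stddef.h>

struct e820_entry {
	unsigned long long addr;
	unsigned long long size;
	int type;			/* 1 == RAM, as in E820_RAM */
};

#define PAGE_SHIFT	12

/* Find the next RAM pfn range at or after *min_pfn; return its length. */
static unsigned long find_pfn_range(const struct e820_entry *map, size_t n,
				    unsigned long *min_pfn)
{
	size_t i;

	for (i = 0; i < n; i++) {
		unsigned long s_pfn, e_pfn;

		if (map[i].type != 1)
			continue;

		e_pfn = (map[i].addr + map[i].size) >> PAGE_SHIFT;
		if (e_pfn < *min_pfn)
			continue;

		s_pfn = (map[i].addr + (1UL << PAGE_SHIFT) - 1) >> PAGE_SHIFT;
		if (s_pfn <= *min_pfn)
			return e_pfn - *min_pfn;

		*min_pfn = s_pfn;
		return e_pfn - s_pfn;
	}
	return 0;
}

int main(void)
{
	struct e820_entry map[] = {
		{ 0x0,      0x9f000,    1 },
		{ 0x100000, 0x3ff00000, 1 },
	};
	unsigned long pfn = 0xa0;
	unsigned long n = find_pfn_range(map, 2, &pfn);

	printf("next RAM range: pfn %lx, %lu pages\n", pfn, n);
	return 0;
}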
213 static int __init xen_free_mfn(unsigned long mfn)
214 {
215 struct xen_memory_reservation reservation = {
216 .address_bits = 0,
217 .extent_order = 0,
218 .domid = DOMID_SELF
219 };
220
221 set_xen_guest_handle(reservation.extent_start, &mfn);
222 reservation.nr_extents = 1;
223
224 return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
225 }
226
201 /* 227 /*
202 * This releases a chunk of memory and then does the identity map. It's used as 228 * This releases a chunk of memory and then does the identity map. It's used
203 * as a fallback if the remapping fails. 229 * as a fallback if the remapping fails.
204 */ 230 */
205 static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, 231 static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
206 unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity, 232 unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity,
207 unsigned long *released) 233 unsigned long *released)
208 { 234 {
235 unsigned long len = 0;
236 unsigned long pfn, end;
237 int ret;
238
209 WARN_ON(start_pfn > end_pfn); 239 WARN_ON(start_pfn > end_pfn);
210 240
241 end = min(end_pfn, nr_pages);
242 for (pfn = start_pfn; pfn < end; pfn++) {
243 unsigned long mfn = pfn_to_mfn(pfn);
244
245 /* Make sure pfn exists to start with */
246 if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
247 continue;
248
249 ret = xen_free_mfn(mfn);
250 WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
251
252 if (ret == 1) {
253 if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
254 break;
255 len++;
256 } else
257 break;
258 }
259
211 /* Need to release pages first */ 260 /* Need to release pages first */
212 *released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true); 261 *released += len;
213 *identity += set_phys_range_identity(start_pfn, end_pfn); 262 *identity += set_phys_range_identity(start_pfn, end_pfn);
214 } 263 }
215 264
216 /* 265 /*
217 * Helper function to update both the p2m and m2p tables. 266 * Helper function to update the p2m and m2p tables and kernel mapping.
218 */ 267 */
219 static unsigned long __init xen_update_mem_tables(unsigned long pfn, 268 static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
220 unsigned long mfn)
221 { 269 {
222 struct mmu_update update = { 270 struct mmu_update update = {
223 .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, 271 .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
224 .val = pfn 272 .val = pfn
225 }; 273 };
226 274
227 /* Update p2m */ 275 /* Update p2m */
228 if (!early_set_phys_to_machine(pfn, mfn)) { 276 if (!set_phys_to_machine(pfn, mfn)) {
229 WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n", 277 WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
230 pfn, mfn); 278 pfn, mfn);
231 return false; 279 BUG();
232 } 280 }
233 281
234 /* Update m2p */ 282 /* Update m2p */
235 if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) { 283 if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
236 WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n", 284 WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
237 mfn, pfn); 285 mfn, pfn);
238 return false; 286 BUG();
239 } 287 }
240 288
241 return true; 289 /* Update kernel mapping, but not for highmem. */
290 if ((pfn << PAGE_SHIFT) >= __pa(high_memory))
291 return;
292
293 if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
294 mfn_pte(mfn, PAGE_KERNEL), 0)) {
295 WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
296 mfn, pfn);
297 BUG();
298 }
242 } 299 }
243 300
244 /* 301 /*
245 * This function updates the p2m and m2p tables with an identity map from 302 * This function updates the p2m and m2p tables with an identity map from
246 * start_pfn to start_pfn+size and remaps the underlying RAM of the original 303 * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the
247 * allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks 304 * original allocation at remap_pfn. The information needed for remapping is
248 * to not exhaust the reserved brk space. Doing it in properly aligned blocks 305 * saved in the memory itself to avoid the need for allocating buffers. The
249 * ensures we only allocate the minimum required leaf pages in the p2m table. It 306 * complete remap information is contained in a list of MFNs each containing
250 * copies the existing mfns from the p2m table under the 1:1 map, overwrites 307 * up to REMAP_SIZE MFNs and the start target PFN for doing the remap.
251 * them with the identity map and then updates the p2m and m2p tables with the 308 * This enables us to preserve the original mfn sequence while doing the
252 * remapped memory. 309 * remapping at a time when the memory management is capable of allocating
310 * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and
311 * its callers.
253 */ 312 */
254 static unsigned long __init xen_do_set_identity_and_remap_chunk( 313 static void __init xen_do_set_identity_and_remap_chunk(
255 unsigned long start_pfn, unsigned long size, unsigned long remap_pfn) 314 unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
256 { 315 {
316 unsigned long buf = (unsigned long)&xen_remap_buf;
317 unsigned long mfn_save, mfn;
257 unsigned long ident_pfn_iter, remap_pfn_iter; 318 unsigned long ident_pfn_iter, remap_pfn_iter;
258 unsigned long ident_start_pfn_align, remap_start_pfn_align; 319 unsigned long ident_end_pfn = start_pfn + size;
259 unsigned long ident_end_pfn_align, remap_end_pfn_align;
260 unsigned long ident_boundary_pfn, remap_boundary_pfn;
261 unsigned long ident_cnt = 0;
262 unsigned long remap_cnt = 0;
263 unsigned long left = size; 320 unsigned long left = size;
264 unsigned long mod; 321 unsigned long ident_cnt = 0;
265 int i; 322 unsigned int i, chunk;
266 323
267 WARN_ON(size == 0); 324 WARN_ON(size == 0);
268 325
269 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); 326 BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
270 327
271 /* 328 mfn_save = virt_to_mfn(buf);
272 * Determine the proper alignment to remap memory in P2M_PER_PAGE sized
273 * blocks. We need to keep track of both the existing pfn mapping and
274 * the new pfn remapping.
275 */
276 mod = start_pfn % P2M_PER_PAGE;
277 ident_start_pfn_align =
278 mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn;
279 mod = remap_pfn % P2M_PER_PAGE;
280 remap_start_pfn_align =
281 mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn;
282 mod = (start_pfn + size) % P2M_PER_PAGE;
283 ident_end_pfn_align = start_pfn + size - mod;
284 mod = (remap_pfn + size) % P2M_PER_PAGE;
285 remap_end_pfn_align = remap_pfn + size - mod;
286 329
287 /* Iterate over each p2m leaf node in each range */ 330 for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
288 for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align; 331 ident_pfn_iter < ident_end_pfn;
289 ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align; 332 ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) {
290 ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) { 333 chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE;
291 /* Check we aren't past the end */
292 BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size);
293 BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size);
294 334
295 /* Save p2m mappings */ 335 /* Map first pfn to xen_remap_buf */
296 for (i = 0; i < P2M_PER_PAGE; i++) 336 mfn = pfn_to_mfn(ident_pfn_iter);
297 xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i); 337 set_pte_mfn(buf, mfn, PAGE_KERNEL);
298 338
299 /* Set identity map which will free a p2m leaf */ 339 /* Save mapping information in page */
300 ident_cnt += set_phys_range_identity(ident_pfn_iter, 340 xen_remap_buf.next_area_mfn = xen_remap_mfn;
301 ident_pfn_iter + P2M_PER_PAGE); 341 xen_remap_buf.target_pfn = remap_pfn_iter;
342 xen_remap_buf.size = chunk;
343 for (i = 0; i < chunk; i++)
344 xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i);
302 345
303 #ifdef DEBUG 346 /* Put remap buf into list. */
304 /* Helps verify a p2m leaf has been freed */ 347 xen_remap_mfn = mfn;
305 for (i = 0; i < P2M_PER_PAGE; i++) {
306 unsigned int pfn = ident_pfn_iter + i;
307 BUG_ON(pfn_to_mfn(pfn) != pfn);
308 }
309 #endif
310 /* Now remap memory */
311 for (i = 0; i < P2M_PER_PAGE; i++) {
312 unsigned long mfn = xen_remap_buf[i];
313 348
314 /* This will use the p2m leaf freed above */ 349 /* Set identity map */
315 if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) { 350 ident_cnt += set_phys_range_identity(ident_pfn_iter,
316 WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n", 351 ident_pfn_iter + chunk);
317 remap_pfn_iter + i, mfn);
318 return 0;
319 }
320 352
321 remap_cnt++; 353 left -= chunk;
322 }
323
324 left -= P2M_PER_PAGE;
325 } 354 }
326 355
327 /* Max boundary space possible */ 356 /* Restore old xen_remap_buf mapping */
328 BUG_ON(left > (P2M_PER_PAGE - 1) * 2); 357 set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
329
330 /* Now handle the boundary conditions */
331 ident_boundary_pfn = start_pfn;
332 remap_boundary_pfn = remap_pfn;
333 for (i = 0; i < left; i++) {
334 unsigned long mfn;
335
336 /* These two checks move from the start to end boundaries */
337 if (ident_boundary_pfn == ident_start_pfn_align)
338 ident_boundary_pfn = ident_pfn_iter;
339 if (remap_boundary_pfn == remap_start_pfn_align)
340 remap_boundary_pfn = remap_pfn_iter;
341
342 /* Check we aren't past the end */
343 BUG_ON(ident_boundary_pfn >= start_pfn + size);
344 BUG_ON(remap_boundary_pfn >= remap_pfn + size);
345
346 mfn = pfn_to_mfn(ident_boundary_pfn);
347
348 if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) {
349 WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
350 remap_pfn_iter + i, mfn);
351 return 0;
352 }
353 remap_cnt++;
354
355 ident_boundary_pfn++;
356 remap_boundary_pfn++;
357 }
358
359 /* Finish up the identity map */
360 if (ident_start_pfn_align >= ident_end_pfn_align) {
361 /*
362 * In this case we have an identity range which does not span an
363 * aligned block so everything needs to be identity mapped here.
364 * If we didn't check this we might remap too many pages since
365 * the align boundaries are not meaningful in this case.
366 */
367 ident_cnt += set_phys_range_identity(start_pfn,
368 start_pfn + size);
369 } else {
370 /* Remapped above so check each end of the chunk */
371 if (start_pfn < ident_start_pfn_align)
372 ident_cnt += set_phys_range_identity(start_pfn,
373 ident_start_pfn_align);
374 if (start_pfn + size > ident_pfn_iter)
375 ident_cnt += set_phys_range_identity(ident_pfn_iter,
376 start_pfn + size);
377 }
378
379 BUG_ON(ident_cnt != size);
380 BUG_ON(remap_cnt != size);
381
382 return size;
383 } 358 }
384 359
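The rewritten xen_do_set_identity_and_remap_chunk() above no longer remaps immediately; it records the remap work inside the memory that will later be remapped: each chunk's first page holds the next chunk's mfn, the target pfn and up to REMAP_SIZE source mfns. A userspace model of building that list (heap chunks linked by pointers stand in for pages linked by mfn, and pfn_to_mfn() is faked):

#include <stdlib.h>

#define REMAP_CHUNK	16	/* REMAP_SIZE is P2M_PER_PAGE - 3 in the kernel */

/* The remap metadata lives in the memory being remapped, so setup needs no
 * extra allocations; here ordinary heap chunks model those pages. */
struct remap_chunk {
	struct remap_chunk *next_area;	/* next_area_mfn in the kernel */
	unsigned long target_pfn;
	unsigned long size;
	unsigned long mfns[REMAP_CHUNK];
};

static struct remap_chunk *remap_list;

static int queue_remap(unsigned long start_pfn, unsigned long remap_pfn,
		       unsigned long npages)
{
	while (npages) {
		unsigned long chunk = npages < REMAP_CHUNK ? npages : REMAP_CHUNK;
		struct remap_chunk *c = malloc(sizeof(*c));
		unsigned long i;

		if (!c)
			return 0;

		c->target_pfn = remap_pfn;
		c->size = chunk;
		for (i = 0; i < chunk; i++)
			c->mfns[i] = start_pfn + i;	/* stands in for pfn_to_mfn() */

		c->next_area = remap_list;		/* push onto the list */
		remap_list = c;

		start_pfn += chunk;
		remap_pfn += chunk;
		npages -= chunk;
	}
	return 1;
}

int main(void)
{
	return queue_remap(0x8000, 0x20000, 100) ? 0 : 1;
}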
385 /* 360 /*
386 * This function takes a contiguous pfn range that needs to be identity mapped 361 * This function takes a contiguous pfn range that needs to be identity mapped
387 * and: 362 * and:
388 * 363 *
389 * 1) Finds a new range of pfns to use to remap based on E820 and remap_pfn. 364 * 1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
390 * 2) Calls the do_ function to actually do the mapping/remapping work. 365 * 2) Calls the do_ function to actually do the mapping/remapping work.
391 * 366 *
392 * The goal is to not allocate additional memory but to remap the existing 367 * The goal is to not allocate additional memory but to remap the existing
393 * pages. In the case of an error the underlying memory is simply released back 368 * pages. In the case of an error the underlying memory is simply released back
394 * to Xen and not remapped. 369 * to Xen and not remapped.
395 */ 370 */
396 static unsigned long __init xen_set_identity_and_remap_chunk( 371 static unsigned long __init xen_set_identity_and_remap_chunk(
397 const struct e820entry *list, size_t map_size, unsigned long start_pfn, 372 const struct e820entry *list, size_t map_size, unsigned long start_pfn,
398 unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, 373 unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
399 unsigned long *identity, unsigned long *remapped, 374 unsigned long *identity, unsigned long *released)
400 unsigned long *released)
401 { 375 {
402 unsigned long pfn; 376 unsigned long pfn;
403 unsigned long i = 0; 377 unsigned long i = 0;
404 unsigned long n = end_pfn - start_pfn; 378 unsigned long n = end_pfn - start_pfn;
405 379
406 while (i < n) { 380 while (i < n) {
407 unsigned long cur_pfn = start_pfn + i; 381 unsigned long cur_pfn = start_pfn + i;
408 unsigned long left = n - i; 382 unsigned long left = n - i;
409 unsigned long size = left; 383 unsigned long size = left;
410 unsigned long remap_range_size; 384 unsigned long remap_range_size;
411 385
412 /* Do not remap pages beyond the current allocation */ 386 /* Do not remap pages beyond the current allocation */
413 if (cur_pfn >= nr_pages) { 387 if (cur_pfn >= nr_pages) {
414 /* Identity map remaining pages */ 388 /* Identity map remaining pages */
415 *identity += set_phys_range_identity(cur_pfn, 389 *identity += set_phys_range_identity(cur_pfn,
416 cur_pfn + size); 390 cur_pfn + size);
417 break; 391 break;
418 } 392 }
419 if (cur_pfn + size > nr_pages) 393 if (cur_pfn + size > nr_pages)
420 size = nr_pages - cur_pfn; 394 size = nr_pages - cur_pfn;
421 395
422 remap_range_size = xen_find_pfn_range(list, map_size, 396 remap_range_size = xen_find_pfn_range(list, map_size,
423 &remap_pfn); 397 &remap_pfn);
424 if (!remap_range_size) { 398 if (!remap_range_size) {
425 pr_warning("Unable to find available pfn range, not remapping identity pages\n"); 399 pr_warning("Unable to find available pfn range, not remapping identity pages\n");
426 xen_set_identity_and_release_chunk(cur_pfn, 400 xen_set_identity_and_release_chunk(cur_pfn,
427 cur_pfn + left, nr_pages, identity, released); 401 cur_pfn + left, nr_pages, identity, released);
428 break; 402 break;
429 } 403 }
430 /* Adjust size to fit in current e820 RAM region */ 404 /* Adjust size to fit in current e820 RAM region */
431 if (size > remap_range_size) 405 if (size > remap_range_size)
432 size = remap_range_size; 406 size = remap_range_size;
433 407
434 if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) { 408 xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn);
435 WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n",
436 cur_pfn, size, remap_pfn);
437 xen_set_identity_and_release_chunk(cur_pfn,
438 cur_pfn + left, nr_pages, identity, released);
439 break;
440 }
441 409
442 /* Update variables to reflect new mappings. */ 410 /* Update variables to reflect new mappings. */
443 i += size; 411 i += size;
444 remap_pfn += size; 412 remap_pfn += size;
445 *identity += size; 413 *identity += size;
446 *remapped += size;
447 } 414 }
448 415
449 /* 416 /*
450 * If the PFNs are currently mapped, the VA mapping also needs 417 * If the PFNs are currently mapped, the VA mapping also needs
451 * to be updated to be 1:1. 418 * to be updated to be 1:1.
452 */ 419 */
453 for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) 420 for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
454 (void)HYPERVISOR_update_va_mapping( 421 (void)HYPERVISOR_update_va_mapping(
455 (unsigned long)__va(pfn << PAGE_SHIFT), 422 (unsigned long)__va(pfn << PAGE_SHIFT),
456 mfn_pte(pfn, PAGE_KERNEL_IO), 0); 423 mfn_pte(pfn, PAGE_KERNEL_IO), 0);
457 424
458 return remap_pfn; 425 return remap_pfn;
459 } 426 }
460 427
461 static unsigned long __init xen_set_identity_and_remap( 428 static void __init xen_set_identity_and_remap(
462 const struct e820entry *list, size_t map_size, unsigned long nr_pages, 429 const struct e820entry *list, size_t map_size, unsigned long nr_pages,
463 unsigned long *released) 430 unsigned long *released)
464 { 431 {
465 phys_addr_t start = 0; 432 phys_addr_t start = 0;
466 unsigned long identity = 0; 433 unsigned long identity = 0;
467 unsigned long remapped = 0;
468 unsigned long last_pfn = nr_pages; 434 unsigned long last_pfn = nr_pages;
469 const struct e820entry *entry; 435 const struct e820entry *entry;
470 unsigned long num_released = 0; 436 unsigned long num_released = 0;
471 int i; 437 int i;
472 438
473 /* 439 /*
474 * Combine non-RAM regions and gaps until a RAM region (or the 440 * Combine non-RAM regions and gaps until a RAM region (or the
475 * end of the map) is reached, then set the 1:1 map and 441 * end of the map) is reached, then set the 1:1 map and
476 * remap the memory in those non-RAM regions. 442 * remap the memory in those non-RAM regions.
477 * 443 *
478 * The combined non-RAM regions are rounded to a whole number 444 * The combined non-RAM regions are rounded to a whole number
479 * of pages so any partial pages are accessible via the 1:1 445 * of pages so any partial pages are accessible via the 1:1
480 * mapping. This is needed for some BIOSes that put (for 446 * mapping. This is needed for some BIOSes that put (for
481 * example) the DMI tables in a reserved region that begins on 447 * example) the DMI tables in a reserved region that begins on
482 * a non-page boundary. 448 * a non-page boundary.
483 */ 449 */
484 for (i = 0, entry = list; i < map_size; i++, entry++) { 450 for (i = 0, entry = list; i < map_size; i++, entry++) {
485 phys_addr_t end = entry->addr + entry->size; 451 phys_addr_t end = entry->addr + entry->size;
486 if (entry->type == E820_RAM || i == map_size - 1) { 452 if (entry->type == E820_RAM || i == map_size - 1) {
487 unsigned long start_pfn = PFN_DOWN(start); 453 unsigned long start_pfn = PFN_DOWN(start);
488 unsigned long end_pfn = PFN_UP(end); 454 unsigned long end_pfn = PFN_UP(end);
489 455
490 if (entry->type == E820_RAM) 456 if (entry->type == E820_RAM)
491 end_pfn = PFN_UP(entry->addr); 457 end_pfn = PFN_UP(entry->addr);
492 458
493 if (start_pfn < end_pfn) 459 if (start_pfn < end_pfn)
494 last_pfn = xen_set_identity_and_remap_chunk( 460 last_pfn = xen_set_identity_and_remap_chunk(
495 list, map_size, start_pfn, 461 list, map_size, start_pfn,
496 end_pfn, nr_pages, last_pfn, 462 end_pfn, nr_pages, last_pfn,
497 &identity, &remapped, 463 &identity, &num_released);
498 &num_released);
499 start = end; 464 start = end;
500 } 465 }
501 } 466 }
502 467
503 *released = num_released; 468 *released = num_released;
504 469
505 pr_info("Set %ld page(s) to 1-1 mapping\n", identity); 470 pr_info("Set %ld page(s) to 1-1 mapping\n", identity);
506 pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped,
507 last_pfn);
508 pr_info("Released %ld page(s)\n", num_released); 471 pr_info("Released %ld page(s)\n", num_released);
472 }
509 473
510 return last_pfn; 474 /*
475 * Remap the memory prepared in xen_do_set_identity_and_remap_chunk().
476 * The remap information (which mfn is remapped to which pfn) is contained in the
477 * to-be-remapped memory itself in a linked list anchored at xen_remap_mfn.
478 * This scheme allows remapping the different chunks in arbitrary order while
479 * the resulting mapping will be independent of the order.
480 */
481 void __init xen_remap_memory(void)
482 {
483 unsigned long buf = (unsigned long)&xen_remap_buf;
484 unsigned long mfn_save, mfn, pfn;
485 unsigned long remapped = 0;
486 unsigned int i;
487 unsigned long pfn_s = ~0UL;
488 unsigned long len = 0;
489
490 mfn_save = virt_to_mfn(buf);
491
492 while (xen_remap_mfn != INVALID_P2M_ENTRY) {
493 /* Map the remap information */
494 set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL);
495
496 BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]);
497
498 pfn = xen_remap_buf.target_pfn;
499 for (i = 0; i < xen_remap_buf.size; i++) {
500 mfn = xen_remap_buf.mfns[i];
501 xen_update_mem_tables(pfn, mfn);
502 remapped++;
503 pfn++;
504 }
505 if (pfn_s == ~0UL || pfn == pfn_s) {
506 pfn_s = xen_remap_buf.target_pfn;
507 len += xen_remap_buf.size;
508 } else if (pfn_s + len == xen_remap_buf.target_pfn) {
509 len += xen_remap_buf.size;
510 } else {
511 xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
512 pfn_s = xen_remap_buf.target_pfn;
513 len = xen_remap_buf.size;
514 }
515
516 mfn = xen_remap_mfn;
517 xen_remap_mfn = xen_remap_buf.next_area_mfn;
518 }
519
520 if (pfn_s != ~0UL && len)
521 xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
522
523 set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
524
525 pr_info("Remapped %ld page(s)\n", remapped);
511 } 526 }
527
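xen_remap_memory() above walks that mfn-linked list, remaps each chunk, and coalesces consecutive target pfn ranges (pfn_s/len) so xen_del_extra_mem() is called once per contiguous range rather than once per chunk. The coalescing step, simplified to chunks arriving in ascending order, looks like this (del_range() is a stand-in for xen_del_extra_mem()):

#include <stdio.h>

static void del_range(unsigned long start, unsigned long len)
{
	printf("del_extra_mem: pfn %lu, %lu pages\n", start, len);
}

int main(void)
{
	/* target_pfn/size pairs as they would come off the remap list */
	unsigned long chunks[][2] = { {100, 16}, {116, 16}, {200, 8}, {208, 4} };
	unsigned long pfn_s = ~0UL, len = 0;
	unsigned int i;

	for (i = 0; i < sizeof(chunks) / sizeof(chunks[0]); i++) {
		unsigned long start = chunks[i][0], size = chunks[i][1];

		if (pfn_s == ~0UL) {			/* first chunk */
			pfn_s = start;
			len = size;
		} else if (pfn_s + len == start) {	/* extends current range */
			len += size;
		} else {				/* gap: flush and restart */
			del_range(pfn_s, len);
			pfn_s = start;
			len = size;
		}
	}
	if (pfn_s != ~0UL && len)
		del_range(pfn_s, len);
	return 0;
}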
512 static unsigned long __init xen_get_max_pages(void) 528 static unsigned long __init xen_get_max_pages(void)
513 { 529 {
514 unsigned long max_pages = MAX_DOMAIN_PAGES; 530 unsigned long max_pages = MAX_DOMAIN_PAGES;
515 domid_t domid = DOMID_SELF; 531 domid_t domid = DOMID_SELF;
516 int ret; 532 int ret;
517 533
518 /* 534 /*
519 * For the initial domain we use the maximum reservation as 535 * For the initial domain we use the maximum reservation as
520 * the maximum page. 536 * the maximum page.
521 * 537 *
522 * For guest domains the current maximum reservation reflects 538 * For guest domains the current maximum reservation reflects
523 * the current maximum rather than the static maximum. In this 539 * the current maximum rather than the static maximum. In this
524 * case the e820 map provided to us will cover the static 540 * case the e820 map provided to us will cover the static
525 * maximum region. 541 * maximum region.
526 */ 542 */
527 if (xen_initial_domain()) { 543 if (xen_initial_domain()) {
528 ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); 544 ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
529 if (ret > 0) 545 if (ret > 0)
530 max_pages = ret; 546 max_pages = ret;
531 } 547 }
532 548
533 return min(max_pages, MAX_DOMAIN_PAGES); 549 return min(max_pages, MAX_DOMAIN_PAGES);
534 } 550 }
535 551
536 static void xen_align_and_add_e820_region(u64 start, u64 size, int type) 552 static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
537 { 553 {
538 u64 end = start + size; 554 u64 end = start + size;
539 555
540 /* Align RAM regions to page boundaries. */ 556 /* Align RAM regions to page boundaries. */
541 if (type == E820_RAM) { 557 if (type == E820_RAM) {
542 start = PAGE_ALIGN(start); 558 start = PAGE_ALIGN(start);
543 end &= ~((u64)PAGE_SIZE - 1); 559 end &= ~((u64)PAGE_SIZE - 1);
544 } 560 }
545 561
546 e820_add_region(start, end - start, type); 562 e820_add_region(start, end - start, type);
547 } 563 }
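The effect of the rounding above is that RAM regions only ever shrink: the start is rounded up and the end rounded down to page boundaries, so partial pages at either edge are dropped rather than reported as usable RAM. A minimal userspace sketch of the same arithmetic (the SKETCH_* names are invented for illustration, 4 KiB pages assumed):

#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096ULL
/* Round up to the next page boundary, like the kernel's PAGE_ALIGN(). */
#define SKETCH_PAGE_ALIGN(x) (((x) + SKETCH_PAGE_SIZE - 1) & ~(SKETCH_PAGE_SIZE - 1))

int main(void)
{
	unsigned long long start = 0x1234, end = 0x9f00;

	/* A RAM region [0x1234, 0x9f00) shrinks to [0x2000, 0x9000):
	 * the partial pages at both ends are not added to the e820 map. */
	printf("[%#llx, %#llx) -> [%#llx, %#llx)\n",
	       start, end,
	       SKETCH_PAGE_ALIGN(start), end & ~(SKETCH_PAGE_SIZE - 1));
	return 0;
}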
548 564
549 void xen_ignore_unusable(struct e820entry *list, size_t map_size) 565 void xen_ignore_unusable(struct e820entry *list, size_t map_size)
550 { 566 {
551 struct e820entry *entry; 567 struct e820entry *entry;
552 unsigned int i; 568 unsigned int i;
553 569
554 for (i = 0, entry = list; i < map_size; i++, entry++) { 570 for (i = 0, entry = list; i < map_size; i++, entry++) {
555 if (entry->type == E820_UNUSABLE) 571 if (entry->type == E820_UNUSABLE)
556 entry->type = E820_RAM; 572 entry->type = E820_RAM;
557 } 573 }
558 } 574 }
559 575
560 /** 576 /**
561 * machine_specific_memory_setup - Hook for machine specific memory setup. 577 * machine_specific_memory_setup - Hook for machine specific memory setup.
562 **/ 578 **/
563 char * __init xen_memory_setup(void) 579 char * __init xen_memory_setup(void)
564 { 580 {
565 static struct e820entry map[E820MAX] __initdata; 581 static struct e820entry map[E820MAX] __initdata;
566 582
567 unsigned long max_pfn = xen_start_info->nr_pages; 583 unsigned long max_pfn = xen_start_info->nr_pages;
568 unsigned long long mem_end; 584 unsigned long long mem_end;
569 int rc; 585 int rc;
570 struct xen_memory_map memmap; 586 struct xen_memory_map memmap;
571 unsigned long max_pages; 587 unsigned long max_pages;
572 unsigned long last_pfn = 0;
573 unsigned long extra_pages = 0; 588 unsigned long extra_pages = 0;
574 int i; 589 int i;
575 int op; 590 int op;
576 591
577 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); 592 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
578 mem_end = PFN_PHYS(max_pfn); 593 mem_end = PFN_PHYS(max_pfn);
579 594
580 memmap.nr_entries = E820MAX; 595 memmap.nr_entries = E820MAX;
581 set_xen_guest_handle(memmap.buffer, map); 596 set_xen_guest_handle(memmap.buffer, map);
582 597
583 op = xen_initial_domain() ? 598 op = xen_initial_domain() ?
584 XENMEM_machine_memory_map : 599 XENMEM_machine_memory_map :
585 XENMEM_memory_map; 600 XENMEM_memory_map;
586 rc = HYPERVISOR_memory_op(op, &memmap); 601 rc = HYPERVISOR_memory_op(op, &memmap);
587 if (rc == -ENOSYS) { 602 if (rc == -ENOSYS) {
588 BUG_ON(xen_initial_domain()); 603 BUG_ON(xen_initial_domain());
589 memmap.nr_entries = 1; 604 memmap.nr_entries = 1;
590 map[0].addr = 0ULL; 605 map[0].addr = 0ULL;
591 map[0].size = mem_end; 606 map[0].size = mem_end;
592 /* 8MB slack (to balance backend allocations). */ 607 /* 8MB slack (to balance backend allocations). */
593 map[0].size += 8ULL << 20; 608 map[0].size += 8ULL << 20;
594 map[0].type = E820_RAM; 609 map[0].type = E820_RAM;
595 rc = 0; 610 rc = 0;
596 } 611 }
597 BUG_ON(rc); 612 BUG_ON(rc);
598 BUG_ON(memmap.nr_entries == 0); 613 BUG_ON(memmap.nr_entries == 0);
599 614
600 /* 615 /*
601 * Xen won't allow a 1:1 mapping to be created to UNUSABLE 616 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
602 * regions, so if we're using the machine memory map leave the 617 * regions, so if we're using the machine memory map leave the
603 * region as RAM as it is in the pseudo-physical map. 618 * region as RAM as it is in the pseudo-physical map.
604 * 619 *
605 * UNUSABLE regions in domUs are not handled and will need 620 * UNUSABLE regions in domUs are not handled and will need
606 * a patch in the future. 621 * a patch in the future.
607 */ 622 */
608 if (xen_initial_domain()) 623 if (xen_initial_domain())
609 xen_ignore_unusable(map, memmap.nr_entries); 624 xen_ignore_unusable(map, memmap.nr_entries);
610 625
611 /* Make sure the Xen-supplied memory map is well-ordered. */ 626 /* Make sure the Xen-supplied memory map is well-ordered. */
612 sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); 627 sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
613 628
614 max_pages = xen_get_max_pages(); 629 max_pages = xen_get_max_pages();
615 if (max_pages > max_pfn) 630 if (max_pages > max_pfn)
616 extra_pages += max_pages - max_pfn; 631 extra_pages += max_pages - max_pfn;
617 632
618 /* 633 /*
619 * Set identity map on non-RAM pages and remap the underlying RAM. 634 * Set identity map on non-RAM pages and prepare remapping the
635 * underlying RAM.
620 */ 636 */
621 last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, 637 xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
622 &xen_released_pages); 638 &xen_released_pages);
623 639
624 extra_pages += xen_released_pages; 640 extra_pages += xen_released_pages;
625 641
626 if (last_pfn > max_pfn) {
627 max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
628 mem_end = PFN_PHYS(max_pfn);
629 }
630 /* 642 /*
631  * Clamp the amount of extra memory to an EXTRA_MEM_RATIO 643  * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
632  * factor of the base size. On non-highmem systems, the base 644  * factor of the base size. On non-highmem systems, the base
633 * size is the full initial memory allocation; on highmem it 645 * size is the full initial memory allocation; on highmem it
634 * is limited to the max size of lowmem, so that it doesn't 646 * is limited to the max size of lowmem, so that it doesn't
635 * get completely filled. 647 * get completely filled.
636 * 648 *
637 * In principle there could be a problem in lowmem systems if 649 * In principle there could be a problem in lowmem systems if
638 * the initial memory is also very large with respect to 650 * the initial memory is also very large with respect to
639 * lowmem, but we won't try to deal with that here. 651 * lowmem, but we won't try to deal with that here.
640 */ 652 */
641 extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), 653 extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
642 extra_pages); 654 extra_pages);
643 i = 0; 655 i = 0;
644 while (i < memmap.nr_entries) { 656 while (i < memmap.nr_entries) {
645 u64 addr = map[i].addr; 657 u64 addr = map[i].addr;
646 u64 size = map[i].size; 658 u64 size = map[i].size;
647 u32 type = map[i].type; 659 u32 type = map[i].type;
648 660
649 if (type == E820_RAM) { 661 if (type == E820_RAM) {
650 if (addr < mem_end) { 662 if (addr < mem_end) {
651 size = min(size, mem_end - addr); 663 size = min(size, mem_end - addr);
652 } else if (extra_pages) { 664 } else if (extra_pages) {
653 size = min(size, (u64)extra_pages * PAGE_SIZE); 665 size = min(size, (u64)extra_pages * PAGE_SIZE);
654 extra_pages -= size / PAGE_SIZE; 666 extra_pages -= size / PAGE_SIZE;
655 xen_add_extra_mem(addr, size); 667 xen_add_extra_mem(addr, size);
668 xen_max_p2m_pfn = PFN_DOWN(addr + size);
656 } else 669 } else
657 type = E820_UNUSABLE; 670 type = E820_UNUSABLE;
658 } 671 }
659 672
660 xen_align_and_add_e820_region(addr, size, type); 673 xen_align_and_add_e820_region(addr, size, type);
661 674
662 map[i].addr += size; 675 map[i].addr += size;
663 map[i].size -= size; 676 map[i].size -= size;
664 if (map[i].size == 0) 677 if (map[i].size == 0)
665 i++; 678 i++;
666 } 679 }
667 680
668 /* 681 /*
669 * Set the rest as identity mapped, in case PCI BARs are 682 * Set the rest as identity mapped, in case PCI BARs are
670 * located here. 683 * located here.
671 * 684 *
672 * PFNs above MAX_P2M_PFN are considered identity mapped as 685 * PFNs above MAX_P2M_PFN are considered identity mapped as
673 * well. 686 * well.
674 */ 687 */
675 set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); 688 set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);
676 689
677 /* 690 /*
678 * In domU, the ISA region is normal, usable memory, but we 691 * In domU, the ISA region is normal, usable memory, but we
679 * reserve ISA memory anyway because too many things poke 692 * reserve ISA memory anyway because too many things poke
680 * about in there. 693 * about in there.
681 */ 694 */
682 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, 695 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
683 E820_RESERVED); 696 E820_RESERVED);
684 697
685 /* 698 /*
686 * Reserve Xen bits: 699 * Reserve Xen bits:
687 * - mfn_list 700 * - mfn_list
688 * - xen_start_info 701 * - xen_start_info
689 * See comment above "struct start_info" in <xen/interface/xen.h> 702 * See comment above "struct start_info" in <xen/interface/xen.h>
690  * We tried to make the memblock_reserve more selective so 703  * We tried to make the memblock_reserve more selective so
691  * that it would be clear what region is reserved. Sadly we ran 704  * that it would be clear what region is reserved. Sadly we ran
692  * into the problem wherein on a 64-bit hypervisor with a 32-bit 705  * into the problem wherein on a 64-bit hypervisor with a 32-bit
693  * initial domain, the pt_base has the cr3 value which is not 706  * initial domain, the pt_base has the cr3 value which is not
694  * necessarily where the pagetable starts! As Jan put it: " 707  * necessarily where the pagetable starts! As Jan put it: "
695 * Actually, the adjustment turns out to be correct: The page 708 * Actually, the adjustment turns out to be correct: The page
696 * tables for a 32-on-64 dom0 get allocated in the order "first L1", 709 * tables for a 32-on-64 dom0 get allocated in the order "first L1",
697 * "first L2", "first L3", so the offset to the page table base is 710 * "first L2", "first L3", so the offset to the page table base is
698 * indeed 2. When reading xen/include/public/xen.h's comment 711 * indeed 2. When reading xen/include/public/xen.h's comment
699 * very strictly, this is not a violation (since there nothing is said 712 * very strictly, this is not a violation (since there nothing is said
700 * that the first thing in the page table space is pointed to by 713 * that the first thing in the page table space is pointed to by
701 * pt_base; I admit that this seems to be implied though, namely 714 * pt_base; I admit that this seems to be implied though, namely
702 * do I think that it is implied that the page table space is the 715 * do I think that it is implied that the page table space is the
703 * range [pt_base, pt_base + nt_pt_frames), whereas that 716 * range [pt_base, pt_base + nt_pt_frames), whereas that
704 * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), 717 * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
705 * which - without a priori knowledge - the kernel would have 718 * which - without a priori knowledge - the kernel would have
706  * difficulty to figure out)." - so let's just fall back to the 719  * difficulty to figure out)." - so let's just fall back to the
707 * easy way and reserve the whole region. 720 * easy way and reserve the whole region.
708 */ 721 */
709 memblock_reserve(__pa(xen_start_info->mfn_list), 722 memblock_reserve(__pa(xen_start_info->mfn_list),
710 xen_start_info->pt_base - xen_start_info->mfn_list); 723 xen_start_info->pt_base - xen_start_info->mfn_list);
711 724
712 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 725 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
713 726
714 return "Xen"; 727 return "Xen";
715 } 728 }
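For a concrete feel of the extra-memory clamp inside xen_memory_setup() above: setup.c defines EXTRA_MEM_RATIO as 10, so a domain can gain at most ten times its base allocation in extra pages (further capped by PFN_DOWN(MAXMEM), which this sketch ignores). The snippet below is a hedged userspace illustration with invented names and figures, not kernel code.

#include <stdio.h>

/* Illustrative only; EXTRA_MEM_RATIO is 10 in setup.c and the MAXMEM
 * cap on the base size is left out to keep the sketch short. */
#define SKETCH_EXTRA_MEM_RATIO 10UL

static unsigned long clamp_extra_pages(unsigned long max_pfn,
				       unsigned long extra_pages)
{
	unsigned long limit = SKETCH_EXTRA_MEM_RATIO * max_pfn;

	return extra_pages < limit ? extra_pages : limit;
}

int main(void)
{
	/* A 256 MB domain (65536 pages) with 4 GB worth of candidate
	 * extra pages is clamped to 10x its base allocation. */
	printf("%lu pages\n", clamp_extra_pages(65536UL, 1048576UL));
	return 0;
}

Running it prints 655360 pages: the 4 GB of candidate extra pages are clamped to 2.5 GB for this 256 MB domain.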
716 729
717 /* 730 /*
718 * Machine specific memory setup for auto-translated guests. 731 * Machine specific memory setup for auto-translated guests.
719 */ 732 */
720 char * __init xen_auto_xlated_memory_setup(void) 733 char * __init xen_auto_xlated_memory_setup(void)
721 { 734 {
722 static struct e820entry map[E820MAX] __initdata; 735 static struct e820entry map[E820MAX] __initdata;
723 736
arch/x86/xen/xen-ops.h
1 #ifndef XEN_OPS_H 1 #ifndef XEN_OPS_H
2 #define XEN_OPS_H 2 #define XEN_OPS_H
3 3
4 #include <linux/init.h> 4 #include <linux/init.h>
5 #include <linux/clocksource.h> 5 #include <linux/clocksource.h>
6 #include <linux/irqreturn.h> 6 #include <linux/irqreturn.h>
7 #include <xen/xen-ops.h> 7 #include <xen/xen-ops.h>
8 8
9 /* These are code, but not functions. Defined in entry.S */ 9 /* These are code, but not functions. Defined in entry.S */
10 extern const char xen_hypervisor_callback[]; 10 extern const char xen_hypervisor_callback[];
11 extern const char xen_failsafe_callback[]; 11 extern const char xen_failsafe_callback[];
12 12
13 extern void *xen_initial_gdt; 13 extern void *xen_initial_gdt;
14 14
15 struct trap_info; 15 struct trap_info;
16 void xen_copy_trap_info(struct trap_info *traps); 16 void xen_copy_trap_info(struct trap_info *traps);
17 17
18 DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info); 18 DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
19 DECLARE_PER_CPU(unsigned long, xen_cr3); 19 DECLARE_PER_CPU(unsigned long, xen_cr3);
20 DECLARE_PER_CPU(unsigned long, xen_current_cr3); 20 DECLARE_PER_CPU(unsigned long, xen_current_cr3);
21 21
22 extern struct start_info *xen_start_info; 22 extern struct start_info *xen_start_info;
23 extern struct shared_info xen_dummy_shared_info; 23 extern struct shared_info xen_dummy_shared_info;
24 extern struct shared_info *HYPERVISOR_shared_info; 24 extern struct shared_info *HYPERVISOR_shared_info;
25 25
26 void xen_setup_mfn_list_list(void); 26 void xen_setup_mfn_list_list(void);
27 void xen_setup_shared_info(void); 27 void xen_setup_shared_info(void);
28 void xen_build_mfn_list_list(void); 28 void xen_build_mfn_list_list(void);
29 void xen_setup_machphys_mapping(void); 29 void xen_setup_machphys_mapping(void);
30 void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); 30 void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
31 void xen_reserve_top(void); 31 void xen_reserve_top(void);
32 extern unsigned long xen_max_p2m_pfn;
33 32
34 void xen_mm_pin_all(void); 33 void xen_mm_pin_all(void);
35 void xen_mm_unpin_all(void); 34 void xen_mm_unpin_all(void);
36 35
36 unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
37 void __init xen_inv_extra_mem(void);
38 void __init xen_remap_memory(void);
37 char * __init xen_memory_setup(void); 39 char * __init xen_memory_setup(void);
38 char * xen_auto_xlated_memory_setup(void); 40 char * xen_auto_xlated_memory_setup(void);
39 void __init xen_arch_setup(void); 41 void __init xen_arch_setup(void);
40 void xen_enable_sysenter(void); 42 void xen_enable_sysenter(void);
41 void xen_enable_syscall(void); 43 void xen_enable_syscall(void);
42 void xen_vcpu_restore(void); 44 void xen_vcpu_restore(void);
43 45
44 void xen_callback_vector(void); 46 void xen_callback_vector(void);
45 void xen_hvm_init_shared_info(void); 47 void xen_hvm_init_shared_info(void);
46 void xen_unplug_emulated_devices(void); 48 void xen_unplug_emulated_devices(void);
47 49
48 void __init xen_build_dynamic_phys_to_machine(void); 50 void __init xen_build_dynamic_phys_to_machine(void);
49 unsigned long __init xen_revector_p2m_tree(void); 51 void __init xen_vmalloc_p2m_tree(void);
50 52
51 void xen_init_irq_ops(void); 53 void xen_init_irq_ops(void);
52 void xen_setup_timer(int cpu); 54 void xen_setup_timer(int cpu);
53 void xen_setup_runstate_info(int cpu); 55 void xen_setup_runstate_info(int cpu);
54 void xen_teardown_timer(int cpu); 56 void xen_teardown_timer(int cpu);
55 cycle_t xen_clocksource_read(void); 57 cycle_t xen_clocksource_read(void);
56 void xen_setup_cpu_clockevents(void); 58 void xen_setup_cpu_clockevents(void);
57 void __init xen_init_time_ops(void); 59 void __init xen_init_time_ops(void);
58 void __init xen_hvm_init_time_ops(void); 60 void __init xen_hvm_init_time_ops(void);
59 61
60 irqreturn_t xen_debug_interrupt(int irq, void *dev_id); 62 irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
61 63
62 bool xen_vcpu_stolen(int vcpu); 64 bool xen_vcpu_stolen(int vcpu);
63 65
64 void xen_setup_vcpu_info_placement(void); 66 void xen_setup_vcpu_info_placement(void);
65 67
66 #ifdef CONFIG_SMP 68 #ifdef CONFIG_SMP
67 void xen_smp_init(void); 69 void xen_smp_init(void);
68 void __init xen_hvm_smp_init(void); 70 void __init xen_hvm_smp_init(void);
69 71
70 extern cpumask_var_t xen_cpu_initialized_map; 72 extern cpumask_var_t xen_cpu_initialized_map;
71 #else 73 #else
72 static inline void xen_smp_init(void) {} 74 static inline void xen_smp_init(void) {}
73 static inline void xen_hvm_smp_init(void) {} 75 static inline void xen_hvm_smp_init(void) {}
74 #endif 76 #endif
75 77
76 #ifdef CONFIG_PARAVIRT_SPINLOCKS 78 #ifdef CONFIG_PARAVIRT_SPINLOCKS
77 void __init xen_init_spinlocks(void); 79 void __init xen_init_spinlocks(void);
78 void xen_init_lock_cpu(int cpu); 80 void xen_init_lock_cpu(int cpu);
79 void xen_uninit_lock_cpu(int cpu); 81 void xen_uninit_lock_cpu(int cpu);
80 #else 82 #else
81 static inline void xen_init_spinlocks(void) 83 static inline void xen_init_spinlocks(void)
82 { 84 {
83 } 85 }
84 static inline void xen_init_lock_cpu(int cpu) 86 static inline void xen_init_lock_cpu(int cpu)
85 { 87 {
86 } 88 }
87 static inline void xen_uninit_lock_cpu(int cpu) 89 static inline void xen_uninit_lock_cpu(int cpu)
88 { 90 {
89 } 91 }
90 #endif 92 #endif
91 93
92 struct dom0_vga_console_info; 94 struct dom0_vga_console_info;
93 95
94 #ifdef CONFIG_XEN_DOM0 96 #ifdef CONFIG_XEN_DOM0
95 void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); 97 void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
96 void __init xen_init_apic(void); 98 void __init xen_init_apic(void);
97 #else 99 #else
98 static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, 100 static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
99 size_t size) 101 size_t size)
100 { 102 {
101 } 103 }
102 static inline void __init xen_init_apic(void) 104 static inline void __init xen_init_apic(void)
103 { 105 {
104 } 106 }
105 #endif 107 #endif
106 108
107 #ifdef CONFIG_XEN_EFI 109 #ifdef CONFIG_XEN_EFI
108 extern void xen_efi_init(void); 110 extern void xen_efi_init(void);
109 #else 111 #else
110 static inline void __init xen_efi_init(void) 112 static inline void __init xen_efi_init(void)
111 { 113 {
112 } 114 }
113 #endif 115 #endif
114 116
115 /* Declare an asm function, along with symbols needed to make it 117 /* Declare an asm function, along with symbols needed to make it
116 inlineable */ 118 inlineable */
117 #define DECL_ASM(ret, name, ...) \ 119 #define DECL_ASM(ret, name, ...) \
118 __visible ret name(__VA_ARGS__); \ 120 __visible ret name(__VA_ARGS__); \
119 extern char name##_end[] __visible; \ 121 extern char name##_end[] __visible; \
120 extern char name##_reloc[] __visible 122 extern char name##_reloc[] __visible
121 123
122 DECL_ASM(void, xen_irq_enable_direct, void); 124 DECL_ASM(void, xen_irq_enable_direct, void);
123 DECL_ASM(void, xen_irq_disable_direct, void); 125 DECL_ASM(void, xen_irq_disable_direct, void);
124 DECL_ASM(unsigned long, xen_save_fl_direct, void); 126 DECL_ASM(unsigned long, xen_save_fl_direct, void);
125 DECL_ASM(void, xen_restore_fl_direct, unsigned long); 127 DECL_ASM(void, xen_restore_fl_direct, unsigned long);
126 128
127 /* These are not functions, and cannot be called normally */ 129 /* These are not functions, and cannot be called normally */
128 __visible void xen_iret(void); 130 __visible void xen_iret(void);
129 __visible void xen_sysexit(void); 131 __visible void xen_sysexit(void);
130 __visible void xen_sysret32(void); 132 __visible void xen_sysret32(void);
131 __visible void xen_sysret64(void); 133 __visible void xen_sysret64(void);
132 __visible void xen_adjust_exception_frame(void); 134 __visible void xen_adjust_exception_frame(void);
133 135
134 extern int xen_panic_handler_init(void); 136 extern int xen_panic_handler_init(void);
135 137
136 void xen_pvh_secondary_vcpu_init(int cpu); 138 void xen_pvh_secondary_vcpu_init(int cpu);
137 #endif /* XEN_OPS_H */ 139 #endif /* XEN_OPS_H */