Commit eb64c3c6cdb8fa8a4d324eb71a9033b62e150918
Merge tag 'stable/for-linus-3.19-rc0b-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
Pull additional xen update from David Vrabel:
 "Xen: additional features for 3.19-rc0

  - Linear p2m for x86 PV guests which simplifies the p2m code, improves
    performance and will allow for > 512 GB PV guests in the future.

  A last-minute, configuration-specific issue was discovered with this
  change, which is why it was not included in my previous pull request.
  This has now been fixed and tested"

* tag 'stable/for-linus-3.19-rc0b-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip:
  xen: switch to post-init routines in xen mmu.c earlier
  Revert "swiotlb-xen: pass dev_addr to swiotlb_tbl_unmap_single"
  xen: annotate xen_set_identity_and_remap_chunk() with __init
  xen: introduce helper functions to do safe read and write accesses
  xen: Speed up set_phys_to_machine() by using read-only mappings
  xen: switch to linear virtual mapped sparse p2m list
  xen: Hide get_phys_to_machine() to be able to tune common path
  x86: Introduce function to get pmd entry pointer
  xen: Delay invalidating extra memory
  xen: Delay m2p_override initialization
  xen: Delay remapping memory of pv-domain
  xen: use common page allocation function in p2m.c
  xen: Make functions static
  xen: fix some style issues in p2m.c
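The headline change is the switch from a three-level, tree-like p2m (pfn-to-mfn) structure to a single virtually contiguous array. The sketch below is illustrative only, with simplified types and hypothetical globals rather than the kernel's actual code (see the real lookup in __pfn_to_mfn() in the arch/x86/include/asm/xen/page.h hunk further down); it contrasts the old tree walk with the linear access that "xen: switch to linear virtual mapped sparse p2m list" enables.

/*
 * Old scheme (simplified): three chained array levels of 512 entries
 * each, i.e. two extra pointer dereferences per lookup. The fixed
 * depth caps the guest at 512 * 512 * 512 pages of 4 KiB = 512 GB,
 * the limit the commit message refers to.
 */
unsigned long ***p2m_top;	/* hypothetical top/mid/leaf tree */

static unsigned long p2m_lookup_tree(unsigned long pfn)
{
	return p2m_top[pfn >> 18][(pfn >> 9) & 511][pfn & 511];
}

/*
 * New scheme (simplified): one sparse list mapped at a fixed virtual
 * address; unpopulated ranges fault and are handled out of line.
 */
unsigned long *xen_p2m_addr;	/* virtual linear p2m list */

static unsigned long p2m_lookup_linear(unsigned long pfn)
{
	return xen_p2m_addr[pfn];	/* single memory access */
}

Besides the faster common path, the linear list removes the depth-imposed 512 GB ceiling, which is what makes larger PV guests possible in the future.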
Showing 7 changed files (inline diff)
arch/x86/include/asm/pgtable_types.h
1 | #ifndef _ASM_X86_PGTABLE_DEFS_H | 1 | #ifndef _ASM_X86_PGTABLE_DEFS_H |
2 | #define _ASM_X86_PGTABLE_DEFS_H | 2 | #define _ASM_X86_PGTABLE_DEFS_H |
3 | 3 | ||
4 | #include <linux/const.h> | 4 | #include <linux/const.h> |
5 | #include <asm/page_types.h> | 5 | #include <asm/page_types.h> |
6 | 6 | ||
7 | #define FIRST_USER_ADDRESS 0 | 7 | #define FIRST_USER_ADDRESS 0 |
8 | 8 | ||
9 | #define _PAGE_BIT_PRESENT 0 /* is present */ | 9 | #define _PAGE_BIT_PRESENT 0 /* is present */ |
10 | #define _PAGE_BIT_RW 1 /* writeable */ | 10 | #define _PAGE_BIT_RW 1 /* writeable */ |
11 | #define _PAGE_BIT_USER 2 /* userspace addressable */ | 11 | #define _PAGE_BIT_USER 2 /* userspace addressable */ |
12 | #define _PAGE_BIT_PWT 3 /* page write through */ | 12 | #define _PAGE_BIT_PWT 3 /* page write through */ |
13 | #define _PAGE_BIT_PCD 4 /* page cache disabled */ | 13 | #define _PAGE_BIT_PCD 4 /* page cache disabled */ |
14 | #define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ | 14 | #define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */ |
15 | #define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ | 15 | #define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */ |
16 | #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ | 16 | #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ |
17 | #define _PAGE_BIT_PAT 7 /* on 4KB pages */ | 17 | #define _PAGE_BIT_PAT 7 /* on 4KB pages */ |
18 | #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ | 18 | #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ |
19 | #define _PAGE_BIT_SOFTW1 9 /* available for programmer */ | 19 | #define _PAGE_BIT_SOFTW1 9 /* available for programmer */ |
20 | #define _PAGE_BIT_SOFTW2 10 /* " */ | 20 | #define _PAGE_BIT_SOFTW2 10 /* " */ |
21 | #define _PAGE_BIT_SOFTW3 11 /* " */ | 21 | #define _PAGE_BIT_SOFTW3 11 /* " */ |
22 | #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ | 22 | #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ |
23 | #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 | 23 | #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 |
24 | #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 | 24 | #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 |
25 | #define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ | 25 | #define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ |
26 | #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ | 26 | #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ |
27 | #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ | 27 | #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ |
28 | #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ | 28 | #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ |
29 | 29 | ||
30 | /* | 30 | /* |
31 | * Swap offsets on configurations that allow automatic NUMA balancing use the | 31 | * Swap offsets on configurations that allow automatic NUMA balancing use the |
32 | * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from | 32 | * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from |
33 | * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the | 33 | * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the |
34 | * maximum possible swap space from 16TB to 8TB. | 34 | * maximum possible swap space from 16TB to 8TB. |
35 | */ | 35 | */ |
36 | #define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) | 36 | #define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) |
37 | 37 | ||
38 | /* If _PAGE_BIT_PRESENT is clear, we use these: */ | 38 | /* If _PAGE_BIT_PRESENT is clear, we use these: */ |
39 | /* - if the user mapped it with PROT_NONE; pte_present gives true */ | 39 | /* - if the user mapped it with PROT_NONE; pte_present gives true */ |
40 | #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL | 40 | #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL |
41 | /* - set: nonlinear file mapping, saved PTE; unset:swap */ | 41 | /* - set: nonlinear file mapping, saved PTE; unset:swap */ |
42 | #define _PAGE_BIT_FILE _PAGE_BIT_DIRTY | 42 | #define _PAGE_BIT_FILE _PAGE_BIT_DIRTY |
43 | 43 | ||
44 | #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) | 44 | #define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT) |
45 | #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) | 45 | #define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW) |
46 | #define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) | 46 | #define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER) |
47 | #define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) | 47 | #define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT) |
48 | #define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) | 48 | #define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD) |
49 | #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) | 49 | #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) |
50 | #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) | 50 | #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) |
51 | #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) | 51 | #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) |
52 | #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) | 52 | #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) |
53 | #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) | 53 | #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) |
54 | #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) | 54 | #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) |
55 | #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) | 55 | #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) |
56 | #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) | 56 | #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) |
57 | #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) | 57 | #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) |
58 | #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) | 58 | #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) |
59 | #define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING) | 59 | #define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING) |
60 | #define __HAVE_ARCH_PTE_SPECIAL | 60 | #define __HAVE_ARCH_PTE_SPECIAL |
61 | 61 | ||
62 | #ifdef CONFIG_KMEMCHECK | 62 | #ifdef CONFIG_KMEMCHECK |
63 | #define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) | 63 | #define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) |
64 | #else | 64 | #else |
65 | #define _PAGE_HIDDEN (_AT(pteval_t, 0)) | 65 | #define _PAGE_HIDDEN (_AT(pteval_t, 0)) |
66 | #endif | 66 | #endif |
67 | 67 | ||
68 | /* | 68 | /* |
69 | * The same hidden bit is used by kmemcheck, but since kmemcheck | 69 | * The same hidden bit is used by kmemcheck, but since kmemcheck |
70 | * works on kernel pages while soft-dirty engine on user space, | 70 | * works on kernel pages while soft-dirty engine on user space, |
71 | * they do not conflict with each other. | 71 | * they do not conflict with each other. |
72 | */ | 72 | */ |
73 | 73 | ||
74 | #ifdef CONFIG_MEM_SOFT_DIRTY | 74 | #ifdef CONFIG_MEM_SOFT_DIRTY |
75 | #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) | 75 | #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) |
76 | #else | 76 | #else |
77 | #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) | 77 | #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 0)) |
78 | #endif | 78 | #endif |
79 | 79 | ||
80 | /* | 80 | /* |
81 | * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page | 81 | * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page |
82 | * that is not present. The hinting fault gathers numa placement statistics | 82 | * that is not present. The hinting fault gathers numa placement statistics |
83 | * (see pte_numa()). The bit is always zero when the PTE is not present. | 83 | * (see pte_numa()). The bit is always zero when the PTE is not present. |
84 | * | 84 | * |
85 | * The bit picked must be always zero when the pmd is present and not | 85 | * The bit picked must be always zero when the pmd is present and not |
86 | * present, so that we don't lose information when we set it while | 86 | * present, so that we don't lose information when we set it while |
87 | * atomically clearing the present bit. | 87 | * atomically clearing the present bit. |
88 | */ | 88 | */ |
89 | #ifdef CONFIG_NUMA_BALANCING | 89 | #ifdef CONFIG_NUMA_BALANCING |
90 | #define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) | 90 | #define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) |
91 | #else | 91 | #else |
92 | #define _PAGE_NUMA (_AT(pteval_t, 0)) | 92 | #define _PAGE_NUMA (_AT(pteval_t, 0)) |
93 | #endif | 93 | #endif |
94 | 94 | ||
95 | /* | 95 | /* |
96 | * Tracking soft dirty bit when a page goes to a swap is tricky. | 96 | * Tracking soft dirty bit when a page goes to a swap is tricky. |
97 | * We need a bit which can be stored in pte _and_ not conflict | 97 | * We need a bit which can be stored in pte _and_ not conflict |
98 | * with swap entry format. On x86 bits 6 and 7 are *not* involved | 98 | * with swap entry format. On x86 bits 6 and 7 are *not* involved |
99 | * into swap entry computation, but bit 6 is used for nonlinear | 99 | * into swap entry computation, but bit 6 is used for nonlinear |
100 | * file mapping, so we borrow bit 7 for soft dirty tracking. | 100 | * file mapping, so we borrow bit 7 for soft dirty tracking. |
101 | * | 101 | * |
102 | * Please note that this bit must be treated as swap dirty page | 102 | * Please note that this bit must be treated as swap dirty page |
103 | * mark if and only if the PTE has present bit clear! | 103 | * mark if and only if the PTE has present bit clear! |
104 | */ | 104 | */ |
105 | #ifdef CONFIG_MEM_SOFT_DIRTY | 105 | #ifdef CONFIG_MEM_SOFT_DIRTY |
106 | #define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE | 106 | #define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE |
107 | #else | 107 | #else |
108 | #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) | 108 | #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) |
109 | #endif | 109 | #endif |
110 | 110 | ||
111 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) | 111 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) |
112 | #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) | 112 | #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) |
113 | #else | 113 | #else |
114 | #define _PAGE_NX (_AT(pteval_t, 0)) | 114 | #define _PAGE_NX (_AT(pteval_t, 0)) |
115 | #endif | 115 | #endif |
116 | 116 | ||
117 | #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) | 117 | #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) |
118 | #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) | 118 | #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) |
119 | 119 | ||
120 | #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ | 120 | #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ |
121 | _PAGE_ACCESSED | _PAGE_DIRTY) | 121 | _PAGE_ACCESSED | _PAGE_DIRTY) |
122 | #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ | 122 | #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ |
123 | _PAGE_DIRTY) | 123 | _PAGE_DIRTY) |
124 | 124 | ||
125 | /* Set of bits not changed in pte_modify */ | 125 | /* Set of bits not changed in pte_modify */ |
126 | #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ | 126 | #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ |
127 | _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ | 127 | _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ |
128 | _PAGE_SOFT_DIRTY | _PAGE_NUMA) | 128 | _PAGE_SOFT_DIRTY | _PAGE_NUMA) |
129 | #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) | 129 | #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) |
130 | 130 | ||
131 | /* | 131 | /* |
132 | * The cache modes defined here are used to translate between pure SW usage | 132 | * The cache modes defined here are used to translate between pure SW usage |
133 | * and the HW defined cache mode bits and/or PAT entries. | 133 | * and the HW defined cache mode bits and/or PAT entries. |
134 | * | 134 | * |
135 | * The resulting bits for PWT, PCD and PAT should be chosen in a way | 135 | * The resulting bits for PWT, PCD and PAT should be chosen in a way |
136 | * to have the WB mode at index 0 (all bits clear). This is the default | 136 | * to have the WB mode at index 0 (all bits clear). This is the default |
137 | * right now and likely would break too much if changed. | 137 | * right now and likely would break too much if changed. |
138 | */ | 138 | */ |
139 | #ifndef __ASSEMBLY__ | 139 | #ifndef __ASSEMBLY__ |
140 | enum page_cache_mode { | 140 | enum page_cache_mode { |
141 | _PAGE_CACHE_MODE_WB = 0, | 141 | _PAGE_CACHE_MODE_WB = 0, |
142 | _PAGE_CACHE_MODE_WC = 1, | 142 | _PAGE_CACHE_MODE_WC = 1, |
143 | _PAGE_CACHE_MODE_UC_MINUS = 2, | 143 | _PAGE_CACHE_MODE_UC_MINUS = 2, |
144 | _PAGE_CACHE_MODE_UC = 3, | 144 | _PAGE_CACHE_MODE_UC = 3, |
145 | _PAGE_CACHE_MODE_WT = 4, | 145 | _PAGE_CACHE_MODE_WT = 4, |
146 | _PAGE_CACHE_MODE_WP = 5, | 146 | _PAGE_CACHE_MODE_WP = 5, |
147 | _PAGE_CACHE_MODE_NUM = 8 | 147 | _PAGE_CACHE_MODE_NUM = 8 |
148 | }; | 148 | }; |
149 | #endif | 149 | #endif |
150 | 150 | ||
151 | #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) | 151 | #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) |
152 | #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) | 152 | #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) |
153 | 153 | ||
154 | #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) | 154 | #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) |
155 | #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ | 155 | #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ |
156 | _PAGE_ACCESSED | _PAGE_NX) | 156 | _PAGE_ACCESSED | _PAGE_NX) |
157 | 157 | ||
158 | #define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \ | 158 | #define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \ |
159 | _PAGE_USER | _PAGE_ACCESSED) | 159 | _PAGE_USER | _PAGE_ACCESSED) |
160 | #define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ | 160 | #define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ |
161 | _PAGE_ACCESSED | _PAGE_NX) | 161 | _PAGE_ACCESSED | _PAGE_NX) |
162 | #define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ | 162 | #define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ |
163 | _PAGE_ACCESSED) | 163 | _PAGE_ACCESSED) |
164 | #define PAGE_COPY PAGE_COPY_NOEXEC | 164 | #define PAGE_COPY PAGE_COPY_NOEXEC |
165 | #define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \ | 165 | #define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \ |
166 | _PAGE_ACCESSED | _PAGE_NX) | 166 | _PAGE_ACCESSED | _PAGE_NX) |
167 | #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ | 167 | #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \ |
168 | _PAGE_ACCESSED) | 168 | _PAGE_ACCESSED) |
169 | 169 | ||
170 | #define __PAGE_KERNEL_EXEC \ | 170 | #define __PAGE_KERNEL_EXEC \ |
171 | (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL) | 171 | (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL) |
172 | #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) | 172 | #define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX) |
173 | 173 | ||
174 | #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) | 174 | #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) |
175 | #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) | 175 | #define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) |
176 | #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_NOCACHE) | 176 | #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_NOCACHE) |
177 | #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) | 177 | #define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER) |
178 | #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) | 178 | #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) |
179 | #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) | 179 | #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) |
180 | #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) | 180 | #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) |
181 | 181 | ||
182 | #define __PAGE_KERNEL_IO (__PAGE_KERNEL) | 182 | #define __PAGE_KERNEL_IO (__PAGE_KERNEL) |
183 | #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) | 183 | #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) |
184 | 184 | ||
185 | #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) | 185 | #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) |
186 | #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) | 186 | #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) |
187 | #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) | 187 | #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) |
188 | #define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) | 188 | #define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) |
189 | #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) | 189 | #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) |
190 | #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) | 190 | #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) |
191 | #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) | 191 | #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) |
192 | #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) | 192 | #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) |
193 | #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) | 193 | #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) |
194 | 194 | ||
195 | #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) | 195 | #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) |
196 | #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) | 196 | #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) |
197 | 197 | ||
198 | /* xwr */ | 198 | /* xwr */ |
199 | #define __P000 PAGE_NONE | 199 | #define __P000 PAGE_NONE |
200 | #define __P001 PAGE_READONLY | 200 | #define __P001 PAGE_READONLY |
201 | #define __P010 PAGE_COPY | 201 | #define __P010 PAGE_COPY |
202 | #define __P011 PAGE_COPY | 202 | #define __P011 PAGE_COPY |
203 | #define __P100 PAGE_READONLY_EXEC | 203 | #define __P100 PAGE_READONLY_EXEC |
204 | #define __P101 PAGE_READONLY_EXEC | 204 | #define __P101 PAGE_READONLY_EXEC |
205 | #define __P110 PAGE_COPY_EXEC | 205 | #define __P110 PAGE_COPY_EXEC |
206 | #define __P111 PAGE_COPY_EXEC | 206 | #define __P111 PAGE_COPY_EXEC |
207 | 207 | ||
208 | #define __S000 PAGE_NONE | 208 | #define __S000 PAGE_NONE |
209 | #define __S001 PAGE_READONLY | 209 | #define __S001 PAGE_READONLY |
210 | #define __S010 PAGE_SHARED | 210 | #define __S010 PAGE_SHARED |
211 | #define __S011 PAGE_SHARED | 211 | #define __S011 PAGE_SHARED |
212 | #define __S100 PAGE_READONLY_EXEC | 212 | #define __S100 PAGE_READONLY_EXEC |
213 | #define __S101 PAGE_READONLY_EXEC | 213 | #define __S101 PAGE_READONLY_EXEC |
214 | #define __S110 PAGE_SHARED_EXEC | 214 | #define __S110 PAGE_SHARED_EXEC |
215 | #define __S111 PAGE_SHARED_EXEC | 215 | #define __S111 PAGE_SHARED_EXEC |
216 | 216 | ||
217 | /* | 217 | /* |
218 | * early identity mapping pte attrib macros. | 218 | * early identity mapping pte attrib macros. |
219 | */ | 219 | */ |
220 | #ifdef CONFIG_X86_64 | 220 | #ifdef CONFIG_X86_64 |
221 | #define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC | 221 | #define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC |
222 | #else | 222 | #else |
223 | #define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ | 223 | #define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */ |
224 | #define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */ | 224 | #define PDE_IDENT_ATTR 0x063 /* PRESENT+RW+DIRTY+ACCESSED */ |
225 | #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ | 225 | #define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */ |
226 | #endif | 226 | #endif |
227 | 227 | ||
228 | #ifdef CONFIG_X86_32 | 228 | #ifdef CONFIG_X86_32 |
229 | # include <asm/pgtable_32_types.h> | 229 | # include <asm/pgtable_32_types.h> |
230 | #else | 230 | #else |
231 | # include <asm/pgtable_64_types.h> | 231 | # include <asm/pgtable_64_types.h> |
232 | #endif | 232 | #endif |
233 | 233 | ||
234 | #ifndef __ASSEMBLY__ | 234 | #ifndef __ASSEMBLY__ |
235 | 235 | ||
236 | #include <linux/types.h> | 236 | #include <linux/types.h> |
237 | 237 | ||
238 | /* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ | 238 | /* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */ |
239 | #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) | 239 | #define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK) |
240 | 240 | ||
241 | /* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ | 241 | /* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */ |
242 | #define PTE_FLAGS_MASK (~PTE_PFN_MASK) | 242 | #define PTE_FLAGS_MASK (~PTE_PFN_MASK) |
243 | 243 | ||
244 | typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; | 244 | typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; |
245 | 245 | ||
246 | typedef struct { pgdval_t pgd; } pgd_t; | 246 | typedef struct { pgdval_t pgd; } pgd_t; |
247 | 247 | ||
248 | static inline pgd_t native_make_pgd(pgdval_t val) | 248 | static inline pgd_t native_make_pgd(pgdval_t val) |
249 | { | 249 | { |
250 | return (pgd_t) { val }; | 250 | return (pgd_t) { val }; |
251 | } | 251 | } |
252 | 252 | ||
253 | static inline pgdval_t native_pgd_val(pgd_t pgd) | 253 | static inline pgdval_t native_pgd_val(pgd_t pgd) |
254 | { | 254 | { |
255 | return pgd.pgd; | 255 | return pgd.pgd; |
256 | } | 256 | } |
257 | 257 | ||
258 | static inline pgdval_t pgd_flags(pgd_t pgd) | 258 | static inline pgdval_t pgd_flags(pgd_t pgd) |
259 | { | 259 | { |
260 | return native_pgd_val(pgd) & PTE_FLAGS_MASK; | 260 | return native_pgd_val(pgd) & PTE_FLAGS_MASK; |
261 | } | 261 | } |
262 | 262 | ||
263 | #if PAGETABLE_LEVELS > 3 | 263 | #if PAGETABLE_LEVELS > 3 |
264 | typedef struct { pudval_t pud; } pud_t; | 264 | typedef struct { pudval_t pud; } pud_t; |
265 | 265 | ||
266 | static inline pud_t native_make_pud(pmdval_t val) | 266 | static inline pud_t native_make_pud(pmdval_t val) |
267 | { | 267 | { |
268 | return (pud_t) { val }; | 268 | return (pud_t) { val }; |
269 | } | 269 | } |
270 | 270 | ||
271 | static inline pudval_t native_pud_val(pud_t pud) | 271 | static inline pudval_t native_pud_val(pud_t pud) |
272 | { | 272 | { |
273 | return pud.pud; | 273 | return pud.pud; |
274 | } | 274 | } |
275 | #else | 275 | #else |
276 | #include <asm-generic/pgtable-nopud.h> | 276 | #include <asm-generic/pgtable-nopud.h> |
277 | 277 | ||
278 | static inline pudval_t native_pud_val(pud_t pud) | 278 | static inline pudval_t native_pud_val(pud_t pud) |
279 | { | 279 | { |
280 | return native_pgd_val(pud.pgd); | 280 | return native_pgd_val(pud.pgd); |
281 | } | 281 | } |
282 | #endif | 282 | #endif |
283 | 283 | ||
284 | #if PAGETABLE_LEVELS > 2 | 284 | #if PAGETABLE_LEVELS > 2 |
285 | typedef struct { pmdval_t pmd; } pmd_t; | 285 | typedef struct { pmdval_t pmd; } pmd_t; |
286 | 286 | ||
287 | static inline pmd_t native_make_pmd(pmdval_t val) | 287 | static inline pmd_t native_make_pmd(pmdval_t val) |
288 | { | 288 | { |
289 | return (pmd_t) { val }; | 289 | return (pmd_t) { val }; |
290 | } | 290 | } |
291 | 291 | ||
292 | static inline pmdval_t native_pmd_val(pmd_t pmd) | 292 | static inline pmdval_t native_pmd_val(pmd_t pmd) |
293 | { | 293 | { |
294 | return pmd.pmd; | 294 | return pmd.pmd; |
295 | } | 295 | } |
296 | #else | 296 | #else |
297 | #include <asm-generic/pgtable-nopmd.h> | 297 | #include <asm-generic/pgtable-nopmd.h> |
298 | 298 | ||
299 | static inline pmdval_t native_pmd_val(pmd_t pmd) | 299 | static inline pmdval_t native_pmd_val(pmd_t pmd) |
300 | { | 300 | { |
301 | return native_pgd_val(pmd.pud.pgd); | 301 | return native_pgd_val(pmd.pud.pgd); |
302 | } | 302 | } |
303 | #endif | 303 | #endif |
304 | 304 | ||
305 | static inline pudval_t pud_flags(pud_t pud) | 305 | static inline pudval_t pud_flags(pud_t pud) |
306 | { | 306 | { |
307 | return native_pud_val(pud) & PTE_FLAGS_MASK; | 307 | return native_pud_val(pud) & PTE_FLAGS_MASK; |
308 | } | 308 | } |
309 | 309 | ||
310 | static inline pmdval_t pmd_flags(pmd_t pmd) | 310 | static inline pmdval_t pmd_flags(pmd_t pmd) |
311 | { | 311 | { |
312 | return native_pmd_val(pmd) & PTE_FLAGS_MASK; | 312 | return native_pmd_val(pmd) & PTE_FLAGS_MASK; |
313 | } | 313 | } |
314 | 314 | ||
315 | static inline pte_t native_make_pte(pteval_t val) | 315 | static inline pte_t native_make_pte(pteval_t val) |
316 | { | 316 | { |
317 | return (pte_t) { .pte = val }; | 317 | return (pte_t) { .pte = val }; |
318 | } | 318 | } |
319 | 319 | ||
320 | static inline pteval_t native_pte_val(pte_t pte) | 320 | static inline pteval_t native_pte_val(pte_t pte) |
321 | { | 321 | { |
322 | return pte.pte; | 322 | return pte.pte; |
323 | } | 323 | } |
324 | 324 | ||
325 | static inline pteval_t pte_flags(pte_t pte) | 325 | static inline pteval_t pte_flags(pte_t pte) |
326 | { | 326 | { |
327 | return native_pte_val(pte) & PTE_FLAGS_MASK; | 327 | return native_pte_val(pte) & PTE_FLAGS_MASK; |
328 | } | 328 | } |
329 | 329 | ||
330 | #ifdef CONFIG_NUMA_BALANCING | 330 | #ifdef CONFIG_NUMA_BALANCING |
331 | /* Set of bits that distinguishes present, prot_none and numa ptes */ | 331 | /* Set of bits that distinguishes present, prot_none and numa ptes */ |
332 | #define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT) | 332 | #define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT) |
333 | static inline pteval_t ptenuma_flags(pte_t pte) | 333 | static inline pteval_t ptenuma_flags(pte_t pte) |
334 | { | 334 | { |
335 | return pte_flags(pte) & _PAGE_NUMA_MASK; | 335 | return pte_flags(pte) & _PAGE_NUMA_MASK; |
336 | } | 336 | } |
337 | 337 | ||
338 | static inline pmdval_t pmdnuma_flags(pmd_t pmd) | 338 | static inline pmdval_t pmdnuma_flags(pmd_t pmd) |
339 | { | 339 | { |
340 | return pmd_flags(pmd) & _PAGE_NUMA_MASK; | 340 | return pmd_flags(pmd) & _PAGE_NUMA_MASK; |
341 | } | 341 | } |
342 | #endif /* CONFIG_NUMA_BALANCING */ | 342 | #endif /* CONFIG_NUMA_BALANCING */ |
343 | 343 | ||
344 | #define pgprot_val(x) ((x).pgprot) | 344 | #define pgprot_val(x) ((x).pgprot) |
345 | #define __pgprot(x) ((pgprot_t) { (x) } ) | 345 | #define __pgprot(x) ((pgprot_t) { (x) } ) |
346 | 346 | ||
347 | extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM]; | 347 | extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM]; |
348 | extern uint8_t __pte2cachemode_tbl[8]; | 348 | extern uint8_t __pte2cachemode_tbl[8]; |
349 | 349 | ||
350 | #define __pte2cm_idx(cb) \ | 350 | #define __pte2cm_idx(cb) \ |
351 | ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \ | 351 | ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \ |
352 | (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \ | 352 | (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \ |
353 | (((cb) >> _PAGE_BIT_PWT) & 1)) | 353 | (((cb) >> _PAGE_BIT_PWT) & 1)) |
354 | #define __cm_idx2pte(i) \ | 354 | #define __cm_idx2pte(i) \ |
355 | ((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \ | 355 | ((((i) & 4) << (_PAGE_BIT_PAT - 2)) | \ |
356 | (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \ | 356 | (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \ |
357 | (((i) & 1) << _PAGE_BIT_PWT)) | 357 | (((i) & 1) << _PAGE_BIT_PWT)) |
358 | 358 | ||
359 | static inline unsigned long cachemode2protval(enum page_cache_mode pcm) | 359 | static inline unsigned long cachemode2protval(enum page_cache_mode pcm) |
360 | { | 360 | { |
361 | if (likely(pcm == 0)) | 361 | if (likely(pcm == 0)) |
362 | return 0; | 362 | return 0; |
363 | return __cachemode2pte_tbl[pcm]; | 363 | return __cachemode2pte_tbl[pcm]; |
364 | } | 364 | } |
365 | static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) | 365 | static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) |
366 | { | 366 | { |
367 | return __pgprot(cachemode2protval(pcm)); | 367 | return __pgprot(cachemode2protval(pcm)); |
368 | } | 368 | } |
369 | static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) | 369 | static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) |
370 | { | 370 | { |
371 | unsigned long masked; | 371 | unsigned long masked; |
372 | 372 | ||
373 | masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; | 373 | masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; |
374 | if (likely(masked == 0)) | 374 | if (likely(masked == 0)) |
375 | return 0; | 375 | return 0; |
376 | return __pte2cachemode_tbl[__pte2cm_idx(masked)]; | 376 | return __pte2cachemode_tbl[__pte2cm_idx(masked)]; |
377 | } | 377 | } |
378 | static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) | 378 | static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) |
379 | { | 379 | { |
380 | pgprot_t new; | 380 | pgprot_t new; |
381 | unsigned long val; | 381 | unsigned long val; |
382 | 382 | ||
383 | val = pgprot_val(pgprot); | 383 | val = pgprot_val(pgprot); |
384 | pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | | 384 | pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | |
385 | ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); | 385 | ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); |
386 | return new; | 386 | return new; |
387 | } | 387 | } |
388 | static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) | 388 | static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) |
389 | { | 389 | { |
390 | pgprot_t new; | 390 | pgprot_t new; |
391 | unsigned long val; | 391 | unsigned long val; |
392 | 392 | ||
393 | val = pgprot_val(pgprot); | 393 | val = pgprot_val(pgprot); |
394 | pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | | 394 | pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | |
395 | ((val & _PAGE_PAT_LARGE) >> | 395 | ((val & _PAGE_PAT_LARGE) >> |
396 | (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); | 396 | (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); |
397 | return new; | 397 | return new; |
398 | } | 398 | } |
399 | 399 | ||
400 | 400 | ||
401 | typedef struct page *pgtable_t; | 401 | typedef struct page *pgtable_t; |
402 | 402 | ||
403 | extern pteval_t __supported_pte_mask; | 403 | extern pteval_t __supported_pte_mask; |
404 | extern void set_nx(void); | 404 | extern void set_nx(void); |
405 | extern int nx_enabled; | 405 | extern int nx_enabled; |
406 | 406 | ||
407 | #define pgprot_writecombine pgprot_writecombine | 407 | #define pgprot_writecombine pgprot_writecombine |
408 | extern pgprot_t pgprot_writecombine(pgprot_t prot); | 408 | extern pgprot_t pgprot_writecombine(pgprot_t prot); |
409 | 409 | ||
410 | /* Indicate that x86 has its own track and untrack pfn vma functions */ | 410 | /* Indicate that x86 has its own track and untrack pfn vma functions */ |
411 | #define __HAVE_PFNMAP_TRACKING | 411 | #define __HAVE_PFNMAP_TRACKING |
412 | 412 | ||
413 | #define __HAVE_PHYS_MEM_ACCESS_PROT | 413 | #define __HAVE_PHYS_MEM_ACCESS_PROT |
414 | struct file; | 414 | struct file; |
415 | pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, | 415 | pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, |
416 | unsigned long size, pgprot_t vma_prot); | 416 | unsigned long size, pgprot_t vma_prot); |
417 | int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, | 417 | int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, |
418 | unsigned long size, pgprot_t *vma_prot); | 418 | unsigned long size, pgprot_t *vma_prot); |
419 | 419 | ||
420 | /* Install a pte for a particular vaddr in kernel space. */ | 420 | /* Install a pte for a particular vaddr in kernel space. */ |
421 | void set_pte_vaddr(unsigned long vaddr, pte_t pte); | 421 | void set_pte_vaddr(unsigned long vaddr, pte_t pte); |
422 | 422 | ||
423 | #ifdef CONFIG_X86_32 | 423 | #ifdef CONFIG_X86_32 |
424 | extern void native_pagetable_init(void); | 424 | extern void native_pagetable_init(void); |
425 | #else | 425 | #else |
426 | #define native_pagetable_init paging_init | 426 | #define native_pagetable_init paging_init |
427 | #endif | 427 | #endif |
428 | 428 | ||
429 | struct seq_file; | 429 | struct seq_file; |
430 | extern void arch_report_meminfo(struct seq_file *m); | 430 | extern void arch_report_meminfo(struct seq_file *m); |
431 | 431 | ||
432 | enum pg_level { | 432 | enum pg_level { |
433 | PG_LEVEL_NONE, | 433 | PG_LEVEL_NONE, |
434 | PG_LEVEL_4K, | 434 | PG_LEVEL_4K, |
435 | PG_LEVEL_2M, | 435 | PG_LEVEL_2M, |
436 | PG_LEVEL_1G, | 436 | PG_LEVEL_1G, |
437 | PG_LEVEL_NUM | 437 | PG_LEVEL_NUM |
438 | }; | 438 | }; |
439 | 439 | ||
440 | #ifdef CONFIG_PROC_FS | 440 | #ifdef CONFIG_PROC_FS |
441 | extern void update_page_count(int level, unsigned long pages); | 441 | extern void update_page_count(int level, unsigned long pages); |
442 | #else | 442 | #else |
443 | static inline void update_page_count(int level, unsigned long pages) { } | 443 | static inline void update_page_count(int level, unsigned long pages) { } |
444 | #endif | 444 | #endif |
445 | 445 | ||
446 | /* | 446 | /* |
447 | * Helper function that returns the kernel pagetable entry controlling | 447 | * Helper function that returns the kernel pagetable entry controlling |
448 | * the virtual address 'address'. NULL means no pagetable entry present. | 448 | * the virtual address 'address'. NULL means no pagetable entry present. |
449 | * NOTE: the return type is pte_t but if the pmd is PSE then we return it | 449 | * NOTE: the return type is pte_t but if the pmd is PSE then we return it |
450 | * as a pte too. | 450 | * as a pte too. |
451 | */ | 451 | */ |
452 | extern pte_t *lookup_address(unsigned long address, unsigned int *level); | 452 | extern pte_t *lookup_address(unsigned long address, unsigned int *level); |
453 | extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, | 453 | extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, |
454 | unsigned int *level); | 454 | unsigned int *level); |
455 | extern pmd_t *lookup_pmd_address(unsigned long address); | ||
455 | extern phys_addr_t slow_virt_to_phys(void *__address); | 456 | extern phys_addr_t slow_virt_to_phys(void *__address); |
456 | extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, | 457 | extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, |
457 | unsigned numpages, unsigned long page_flags); | 458 | unsigned numpages, unsigned long page_flags); |
458 | void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address, | 459 | void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address, |
459 | unsigned numpages); | 460 | unsigned numpages); |
460 | #endif /* !__ASSEMBLY__ */ | 461 | #endif /* !__ASSEMBLY__ */ |
461 | 462 | ||
462 | #endif /* _ASM_X86_PGTABLE_DEFS_H */ | 463 | #endif /* _ASM_X86_PGTABLE_DEFS_H */ |
463 | 464 |
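The only change to this file is the new lookup_pmd_address() declaration (from "x86: Introduce function to get pmd entry pointer"), which lets the Xen p2m code fetch the pmd entry covering a virtual address without going through lookup_address() and its level bookkeeping. A hedged caller-side sketch follows; pmd_none() and pmd_large() are existing x86 helpers, and the NULL-on-missing-table behaviour is assumed to mirror lookup_address(), since only the prototype is visible here.

	/* Is vaddr already covered by a large (2M) kernel mapping? */
	pmd_t *pmd = lookup_pmd_address(vaddr);

	if (pmd && !pmd_none(*pmd) && pmd_large(*pmd)) {
		/* vaddr is backed by a 2M page: reuse or split it */
	}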
arch/x86/include/asm/xen/page.h
1 | #ifndef _ASM_X86_XEN_PAGE_H | 1 | #ifndef _ASM_X86_XEN_PAGE_H |
2 | #define _ASM_X86_XEN_PAGE_H | 2 | #define _ASM_X86_XEN_PAGE_H |
3 | 3 | ||
4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
5 | #include <linux/types.h> | 5 | #include <linux/types.h> |
6 | #include <linux/spinlock.h> | 6 | #include <linux/spinlock.h> |
7 | #include <linux/pfn.h> | 7 | #include <linux/pfn.h> |
8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
9 | 9 | ||
10 | #include <asm/uaccess.h> | 10 | #include <asm/uaccess.h> |
11 | #include <asm/page.h> | 11 | #include <asm/page.h> |
12 | #include <asm/pgtable.h> | 12 | #include <asm/pgtable.h> |
13 | 13 | ||
14 | #include <xen/interface/xen.h> | 14 | #include <xen/interface/xen.h> |
15 | #include <xen/grant_table.h> | 15 | #include <xen/grant_table.h> |
16 | #include <xen/features.h> | 16 | #include <xen/features.h> |
17 | 17 | ||
18 | /* Xen machine address */ | 18 | /* Xen machine address */ |
19 | typedef struct xmaddr { | 19 | typedef struct xmaddr { |
20 | phys_addr_t maddr; | 20 | phys_addr_t maddr; |
21 | } xmaddr_t; | 21 | } xmaddr_t; |
22 | 22 | ||
23 | /* Xen pseudo-physical address */ | 23 | /* Xen pseudo-physical address */ |
24 | typedef struct xpaddr { | 24 | typedef struct xpaddr { |
25 | phys_addr_t paddr; | 25 | phys_addr_t paddr; |
26 | } xpaddr_t; | 26 | } xpaddr_t; |
27 | 27 | ||
28 | #define XMADDR(x) ((xmaddr_t) { .maddr = (x) }) | 28 | #define XMADDR(x) ((xmaddr_t) { .maddr = (x) }) |
29 | #define XPADDR(x) ((xpaddr_t) { .paddr = (x) }) | 29 | #define XPADDR(x) ((xpaddr_t) { .paddr = (x) }) |
30 | 30 | ||
31 | /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ | 31 | /**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ |
32 | #define INVALID_P2M_ENTRY (~0UL) | 32 | #define INVALID_P2M_ENTRY (~0UL) |
33 | #define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1)) | 33 | #define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1)) |
34 | #define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2)) | 34 | #define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2)) |
35 | #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) | 35 | #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) |
36 | #define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) | 36 | #define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) |
37 | 37 | ||
38 | /* Maximum amount of memory we can handle in a domain in pages */ | 38 | /* Maximum amount of memory we can handle in a domain in pages */ |
39 | #define MAX_DOMAIN_PAGES \ | 39 | #define MAX_DOMAIN_PAGES \ |
40 | ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) | 40 | ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) |
41 | 41 | ||
42 | extern unsigned long *machine_to_phys_mapping; | 42 | extern unsigned long *machine_to_phys_mapping; |
43 | extern unsigned long machine_to_phys_nr; | 43 | extern unsigned long machine_to_phys_nr; |
44 | extern unsigned long *xen_p2m_addr; | ||
45 | extern unsigned long xen_p2m_size; | ||
46 | extern unsigned long xen_max_p2m_pfn; | ||
44 | 47 | ||
45 | extern unsigned long get_phys_to_machine(unsigned long pfn); | 48 | extern unsigned long get_phys_to_machine(unsigned long pfn); |
46 | extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); | 49 | extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); |
47 | extern bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn); | ||
48 | extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); | 50 | extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); |
49 | extern unsigned long set_phys_range_identity(unsigned long pfn_s, | 51 | extern unsigned long set_phys_range_identity(unsigned long pfn_s, |
50 | unsigned long pfn_e); | 52 | unsigned long pfn_e); |
51 | 53 | ||
52 | extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, | 54 | extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, |
53 | struct gnttab_map_grant_ref *kmap_ops, | 55 | struct gnttab_map_grant_ref *kmap_ops, |
54 | struct page **pages, unsigned int count); | 56 | struct page **pages, unsigned int count); |
55 | extern int m2p_add_override(unsigned long mfn, struct page *page, | ||
56 | struct gnttab_map_grant_ref *kmap_op); | ||
57 | extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, | 57 | extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, |
58 | struct gnttab_map_grant_ref *kmap_ops, | 58 | struct gnttab_map_grant_ref *kmap_ops, |
59 | struct page **pages, unsigned int count); | 59 | struct page **pages, unsigned int count); |
60 | extern int m2p_remove_override(struct page *page, | ||
61 | struct gnttab_map_grant_ref *kmap_op, | ||
62 | unsigned long mfn); | ||
63 | extern struct page *m2p_find_override(unsigned long mfn); | ||
64 | extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); | 60 | extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); |
65 | 61 | ||
62 | /* | ||
63 | * Helper functions to write or read unsigned long values to/from | ||
64 | * memory, when the access may fault. | ||
65 | */ | ||
66 | static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val) | ||
67 | { | ||
68 | return __put_user(val, (unsigned long __user *)addr); | ||
69 | } | ||
70 | |||
71 | static inline int xen_safe_read_ulong(unsigned long *addr, unsigned long *val) | ||
72 | { | ||
73 | return __get_user(*val, (unsigned long __user *)addr); | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | * When to use pfn_to_mfn(), __pfn_to_mfn() or get_phys_to_machine(): | ||
78 | * - pfn_to_mfn() returns either INVALID_P2M_ENTRY or the mfn. No indicator | ||
79 | * bits (identity or foreign) are set. | ||
80 | * - __pfn_to_mfn() returns the found entry of the p2m table. A possibly set | ||
81 | * identity or foreign indicator will be still set. __pfn_to_mfn() is | ||
82 | * encapsulating get_phys_to_machine() which is called in special cases only. | ||
83 | * - get_phys_to_machine() is to be called by __pfn_to_mfn() only in special | ||
84 | * cases needing an extended handling. | ||
85 | */ | ||
86 | static inline unsigned long __pfn_to_mfn(unsigned long pfn) | ||
87 | { | ||
88 | unsigned long mfn; | ||
89 | |||
90 | if (pfn < xen_p2m_size) | ||
91 | mfn = xen_p2m_addr[pfn]; | ||
92 | else if (unlikely(pfn < xen_max_p2m_pfn)) | ||
93 | return get_phys_to_machine(pfn); | ||
94 | else | ||
95 | return IDENTITY_FRAME(pfn); | ||
96 | |||
97 | if (unlikely(mfn == INVALID_P2M_ENTRY)) | ||
98 | return get_phys_to_machine(pfn); | ||
99 | |||
100 | return mfn; | ||
101 | } | ||
102 | |||
66 | static inline unsigned long pfn_to_mfn(unsigned long pfn) | 103 | static inline unsigned long pfn_to_mfn(unsigned long pfn) |
67 | { | 104 | { |
68 | unsigned long mfn; | 105 | unsigned long mfn; |
69 | 106 | ||
70 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 107 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
71 | return pfn; | 108 | return pfn; |
72 | 109 | ||
73 | mfn = get_phys_to_machine(pfn); | 110 | mfn = __pfn_to_mfn(pfn); |
74 | 111 | ||
75 | if (mfn != INVALID_P2M_ENTRY) | 112 | if (mfn != INVALID_P2M_ENTRY) |
76 | mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); | 113 | mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); |
77 | 114 | ||
78 | return mfn; | 115 | return mfn; |
79 | } | 116 | } |
80 | 117 | ||
81 | static inline int phys_to_machine_mapping_valid(unsigned long pfn) | 118 | static inline int phys_to_machine_mapping_valid(unsigned long pfn) |
82 | { | 119 | { |
83 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 120 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
84 | return 1; | 121 | return 1; |
85 | 122 | ||
86 | return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY; | 123 | return __pfn_to_mfn(pfn) != INVALID_P2M_ENTRY; |
87 | } | 124 | } |
88 | 125 | ||
89 | static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn) | 126 | static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn) |
90 | { | 127 | { |
91 | unsigned long pfn; | 128 | unsigned long pfn; |
92 | int ret; | 129 | int ret; |
93 | 130 | ||
94 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 131 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
95 | return mfn; | 132 | return mfn; |
96 | 133 | ||
97 | if (unlikely(mfn >= machine_to_phys_nr)) | 134 | if (unlikely(mfn >= machine_to_phys_nr)) |
98 | return ~0; | 135 | return ~0; |
99 | 136 | ||
100 | /* | 137 | /* |
101 | * The array access can fail (e.g., device space beyond end of RAM). | 138 | * The array access can fail (e.g., device space beyond end of RAM). |
102 | * In such cases it doesn't matter what we return (we return garbage), | 139 | * In such cases it doesn't matter what we return (we return garbage), |
103 | * but we must handle the fault without crashing! | 140 | * but we must handle the fault without crashing! |
104 | */ | 141 | */ |
105 | ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); | 142 | ret = xen_safe_read_ulong(&machine_to_phys_mapping[mfn], &pfn); |
106 | if (ret < 0) | 143 | if (ret < 0) |
107 | return ~0; | 144 | return ~0; |
108 | 145 | ||
109 | return pfn; | 146 | return pfn; |
110 | } | 147 | } |
111 | 148 | ||
112 | static inline unsigned long mfn_to_pfn(unsigned long mfn) | 149 | static inline unsigned long mfn_to_pfn(unsigned long mfn) |
113 | { | 150 | { |
114 | unsigned long pfn; | 151 | unsigned long pfn; |
115 | 152 | ||
116 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 153 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
117 | return mfn; | 154 | return mfn; |
118 | 155 | ||
119 | pfn = mfn_to_pfn_no_overrides(mfn); | 156 | pfn = mfn_to_pfn_no_overrides(mfn); |
120 | if (get_phys_to_machine(pfn) != mfn) { | 157 | if (__pfn_to_mfn(pfn) != mfn) { |
121 | /* | 158 | /* |
122 | * If this appears to be a foreign mfn (because the pfn | 159 | * If this appears to be a foreign mfn (because the pfn |
123 | * doesn't map back to the mfn), then check the local override | 160 | * doesn't map back to the mfn), then check the local override |
124 | * table to see if there's a better pfn to use. | 161 | * table to see if there's a better pfn to use. |
125 | * | 162 | * |
126 | * m2p_find_override_pfn returns ~0 if it doesn't find anything. | 163 | * m2p_find_override_pfn returns ~0 if it doesn't find anything. |
127 | */ | 164 | */ |
128 | pfn = m2p_find_override_pfn(mfn, ~0); | 165 | pfn = m2p_find_override_pfn(mfn, ~0); |
129 | } | 166 | } |
130 | 167 | ||
131 | /* | 168 | /* |
132 | * pfn is ~0 if there are no entries in the m2p for mfn or if the | 169 | * pfn is ~0 if there are no entries in the m2p for mfn or if the |
133 | * entry doesn't map back to the mfn and m2p_override doesn't have a | 170 | * entry doesn't map back to the mfn and m2p_override doesn't have a |
134 | * valid entry for it. | 171 | * valid entry for it. |
135 | */ | 172 | */ |
136 | if (pfn == ~0 && | 173 | if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn)) |
137 | get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn)) | ||
138 | pfn = mfn; | 174 | pfn = mfn; |
139 | 175 | ||
140 | return pfn; | 176 | return pfn; |
141 | } | 177 | } |
142 | 178 | ||
143 | static inline xmaddr_t phys_to_machine(xpaddr_t phys) | 179 | static inline xmaddr_t phys_to_machine(xpaddr_t phys) |
144 | { | 180 | { |
145 | unsigned offset = phys.paddr & ~PAGE_MASK; | 181 | unsigned offset = phys.paddr & ~PAGE_MASK; |
146 | return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset); | 182 | return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset); |
147 | } | 183 | } |
148 | 184 | ||
149 | static inline xpaddr_t machine_to_phys(xmaddr_t machine) | 185 | static inline xpaddr_t machine_to_phys(xmaddr_t machine) |
150 | { | 186 | { |
151 | unsigned offset = machine.maddr & ~PAGE_MASK; | 187 | unsigned offset = machine.maddr & ~PAGE_MASK; |
152 | return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); | 188 | return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); |
153 | } | 189 | } |
154 | 190 | ||
155 | /* | 191 | /* |
156 | * We detect special mappings in one of two ways: | 192 | * We detect special mappings in one of two ways: |
157 | * 1. If the MFN is an I/O page then Xen will set the m2p entry | 193 | * 1. If the MFN is an I/O page then Xen will set the m2p entry |
158 | * to be outside our maximum possible pseudophys range. | 194 | * to be outside our maximum possible pseudophys range. |
159 | * 2. If the MFN belongs to a different domain then we will certainly | 195 | * 2. If the MFN belongs to a different domain then we will certainly |
160 | * not have MFN in our p2m table. Conversely, if the page is ours, | 196 | * not have MFN in our p2m table. Conversely, if the page is ours, |
161 | * then we'll have p2m(m2p(MFN))==MFN. | 197 | * then we'll have p2m(m2p(MFN))==MFN. |
162 | * If we detect a special mapping then it doesn't have a 'struct page'. | 198 | * If we detect a special mapping then it doesn't have a 'struct page'. |
163 | * We force !pfn_valid() by returning an out-of-range pointer. | 199 | * We force !pfn_valid() by returning an out-of-range pointer. |
164 | * | 200 | * |
165 | * NB. These checks require that, for any MFN that is not in our reservation, | 201 | * NB. These checks require that, for any MFN that is not in our reservation, |
166 | * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if | 202 | * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if |
167 | * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. | 203 | * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. |
168 | * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. | 204 | * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. |
169 | * | 205 | * |
170 | * NB2. When deliberately mapping foreign pages into the p2m table, you *must* | 206 | * NB2. When deliberately mapping foreign pages into the p2m table, you *must* |
171 | * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we | 207 | * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we |
172 | * require. In all the cases we care about, the FOREIGN_FRAME bit is | 208 | * require. In all the cases we care about, the FOREIGN_FRAME bit is |
173 | * masked (e.g., pfn_to_mfn()) so behaviour there is correct. | 209 | * masked (e.g., pfn_to_mfn()) so behaviour there is correct. |
174 | */ | 210 | */ |
175 | static inline unsigned long mfn_to_local_pfn(unsigned long mfn) | 211 | static inline unsigned long mfn_to_local_pfn(unsigned long mfn) |
176 | { | 212 | { |
177 | unsigned long pfn; | 213 | unsigned long pfn; |
178 | 214 | ||
179 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 215 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
180 | return mfn; | 216 | return mfn; |
181 | 217 | ||
182 | pfn = mfn_to_pfn(mfn); | 218 | pfn = mfn_to_pfn(mfn); |
183 | if (get_phys_to_machine(pfn) != mfn) | 219 | if (__pfn_to_mfn(pfn) != mfn) |
184 | return -1; /* force !pfn_valid() */ | 220 | return -1; /* force !pfn_valid() */ |
185 | return pfn; | 221 | return pfn; |
186 | } | 222 | } |
187 | 223 | ||
188 | /* VIRT <-> MACHINE conversion */ | 224 | /* VIRT <-> MACHINE conversion */ |
189 | #define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) | 225 | #define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) |
190 | #define virt_to_pfn(v) (PFN_DOWN(__pa(v))) | 226 | #define virt_to_pfn(v) (PFN_DOWN(__pa(v))) |
191 | #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) | 227 | #define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) |
192 | #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) | 228 | #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) |
193 | 229 | ||
194 | static inline unsigned long pte_mfn(pte_t pte) | 230 | static inline unsigned long pte_mfn(pte_t pte) |
195 | { | 231 | { |
196 | return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; | 232 | return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT; |
197 | } | 233 | } |
198 | 234 | ||
199 | static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot) | 235 | static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot) |
200 | { | 236 | { |
201 | pte_t pte; | 237 | pte_t pte; |
202 | 238 | ||
203 | pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) | | 239 | pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) | |
204 | massage_pgprot(pgprot); | 240 | massage_pgprot(pgprot); |
205 | 241 | ||
206 | return pte; | 242 | return pte; |
207 | } | 243 | } |
208 | 244 | ||
209 | static inline pteval_t pte_val_ma(pte_t pte) | 245 | static inline pteval_t pte_val_ma(pte_t pte) |
210 | { | 246 | { |
211 | return pte.pte; | 247 | return pte.pte; |
212 | } | 248 | } |
213 | 249 | ||
214 | static inline pte_t __pte_ma(pteval_t x) | 250 | static inline pte_t __pte_ma(pteval_t x) |
215 | { | 251 | { |
216 | return (pte_t) { .pte = x }; | 252 | return (pte_t) { .pte = x }; |
217 | } | 253 | } |
218 | 254 | ||
219 | #define pmd_val_ma(v) ((v).pmd) | 255 | #define pmd_val_ma(v) ((v).pmd) |
220 | #ifdef __PAGETABLE_PUD_FOLDED | 256 | #ifdef __PAGETABLE_PUD_FOLDED |
221 | #define pud_val_ma(v) ((v).pgd.pgd) | 257 | #define pud_val_ma(v) ((v).pgd.pgd) |
222 | #else | 258 | #else |
223 | #define pud_val_ma(v) ((v).pud) | 259 | #define pud_val_ma(v) ((v).pud) |
224 | #endif | 260 | #endif |
225 | #define __pmd_ma(x) ((pmd_t) { (x) } ) | 261 | #define __pmd_ma(x) ((pmd_t) { (x) } ) |
226 | 262 | ||
227 | #define pgd_val_ma(x) ((x).pgd) | 263 | #define pgd_val_ma(x) ((x).pgd) |
228 | 264 | ||
229 | void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); | 265 | void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); |
230 | 266 | ||
231 | xmaddr_t arbitrary_virt_to_machine(void *address); | 267 | xmaddr_t arbitrary_virt_to_machine(void *address); |
232 | unsigned long arbitrary_virt_to_mfn(void *vaddr); | 268 | unsigned long arbitrary_virt_to_mfn(void *vaddr); |
233 | void make_lowmem_page_readonly(void *vaddr); | 269 | void make_lowmem_page_readonly(void *vaddr); |
234 | void make_lowmem_page_readwrite(void *vaddr); | 270 | void make_lowmem_page_readwrite(void *vaddr); |
235 | 271 | ||
236 | #define xen_remap(cookie, size) ioremap((cookie), (size)); | 272 | #define xen_remap(cookie, size) ioremap((cookie), (size)); |
237 | #define xen_unmap(cookie) iounmap((cookie)) | 273 | #define xen_unmap(cookie) iounmap((cookie)) |
238 | 274 | ||
239 | static inline bool xen_arch_need_swiotlb(struct device *dev, | 275 | static inline bool xen_arch_need_swiotlb(struct device *dev, |
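The new xen_safe_read_ulong()/xen_safe_write_ulong() helpers in the hunk above wrap __get_user()/__put_user() so that accesses to possibly unmapped p2m/m2p pages fault gracefully instead of oopsing: on a fault the caller gets a negative error code (whatever __get_user() returns, typically -EFAULT) and can substitute a sentinel. A minimal caller sketch, modelled on the mfn_to_pfn_no_overrides() change above (the variable names here are illustrative):

	unsigned long pfn;

	/* m2p page may be unmapped: return garbage, but don't crash */
	if (xen_safe_read_ulong(&machine_to_phys_mapping[mfn], &pfn) < 0)
		pfn = ~0UL;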
arch/x86/mm/pageattr.c
1 | /* | 1 | /* |
2 | * Copyright 2002 Andi Kleen, SuSE Labs. | 2 | * Copyright 2002 Andi Kleen, SuSE Labs. |
3 | * Thanks to Ben LaHaise for precious feedback. | 3 | * Thanks to Ben LaHaise for precious feedback. |
4 | */ | 4 | */ |
5 | #include <linux/highmem.h> | 5 | #include <linux/highmem.h> |
6 | #include <linux/bootmem.h> | 6 | #include <linux/bootmem.h> |
7 | #include <linux/module.h> | 7 | #include <linux/module.h> |
8 | #include <linux/sched.h> | 8 | #include <linux/sched.h> |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/interrupt.h> | 10 | #include <linux/interrupt.h> |
11 | #include <linux/seq_file.h> | 11 | #include <linux/seq_file.h> |
12 | #include <linux/debugfs.h> | 12 | #include <linux/debugfs.h> |
13 | #include <linux/pfn.h> | 13 | #include <linux/pfn.h> |
14 | #include <linux/percpu.h> | 14 | #include <linux/percpu.h> |
15 | #include <linux/gfp.h> | 15 | #include <linux/gfp.h> |
16 | #include <linux/pci.h> | 16 | #include <linux/pci.h> |
17 | 17 | ||
18 | #include <asm/e820.h> | 18 | #include <asm/e820.h> |
19 | #include <asm/processor.h> | 19 | #include <asm/processor.h> |
20 | #include <asm/tlbflush.h> | 20 | #include <asm/tlbflush.h> |
21 | #include <asm/sections.h> | 21 | #include <asm/sections.h> |
22 | #include <asm/setup.h> | 22 | #include <asm/setup.h> |
23 | #include <asm/uaccess.h> | 23 | #include <asm/uaccess.h> |
24 | #include <asm/pgalloc.h> | 24 | #include <asm/pgalloc.h> |
25 | #include <asm/proto.h> | 25 | #include <asm/proto.h> |
26 | #include <asm/pat.h> | 26 | #include <asm/pat.h> |
27 | 27 | ||
28 | /* | 28 | /* |
29 | * The current flushing context - we pass it instead of 5 arguments: | 29 | * The current flushing context - we pass it instead of 5 arguments: |
30 | */ | 30 | */ |
31 | struct cpa_data { | 31 | struct cpa_data { |
32 | unsigned long *vaddr; | 32 | unsigned long *vaddr; |
33 | pgd_t *pgd; | 33 | pgd_t *pgd; |
34 | pgprot_t mask_set; | 34 | pgprot_t mask_set; |
35 | pgprot_t mask_clr; | 35 | pgprot_t mask_clr; |
36 | int numpages; | 36 | int numpages; |
37 | int flags; | 37 | int flags; |
38 | unsigned long pfn; | 38 | unsigned long pfn; |
39 | unsigned force_split : 1; | 39 | unsigned force_split : 1; |
40 | int curpage; | 40 | int curpage; |
41 | struct page **pages; | 41 | struct page **pages; |
42 | }; | 42 | }; |
43 | 43 | ||
44 | /* | 44 | /* |
45 | * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) | 45 | * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) |
46 | * using cpa_lock, so that we don't allow any other CPU with stale large TLB | 46 | * using cpa_lock, so that we don't allow any other CPU with stale large TLB |
47 | * entries to change the page attributes in parallel while some other CPU is | 47 | * entries to change the page attributes in parallel while some other CPU is |
48 | * splitting a large page entry and changing the attribute. | 48 | * splitting a large page entry and changing the attribute. |
49 | */ | 49 | */ |
50 | static DEFINE_SPINLOCK(cpa_lock); | 50 | static DEFINE_SPINLOCK(cpa_lock); |
51 | 51 | ||
52 | #define CPA_FLUSHTLB 1 | 52 | #define CPA_FLUSHTLB 1 |
53 | #define CPA_ARRAY 2 | 53 | #define CPA_ARRAY 2 |
54 | #define CPA_PAGES_ARRAY 4 | 54 | #define CPA_PAGES_ARRAY 4 |
55 | 55 | ||
56 | #ifdef CONFIG_PROC_FS | 56 | #ifdef CONFIG_PROC_FS |
57 | static unsigned long direct_pages_count[PG_LEVEL_NUM]; | 57 | static unsigned long direct_pages_count[PG_LEVEL_NUM]; |
58 | 58 | ||
59 | void update_page_count(int level, unsigned long pages) | 59 | void update_page_count(int level, unsigned long pages) |
60 | { | 60 | { |
61 | /* Protect against CPA */ | 61 | /* Protect against CPA */ |
62 | spin_lock(&pgd_lock); | 62 | spin_lock(&pgd_lock); |
63 | direct_pages_count[level] += pages; | 63 | direct_pages_count[level] += pages; |
64 | spin_unlock(&pgd_lock); | 64 | spin_unlock(&pgd_lock); |
65 | } | 65 | } |
66 | 66 | ||
67 | static void split_page_count(int level) | 67 | static void split_page_count(int level) |
68 | { | 68 | { |
69 | direct_pages_count[level]--; | 69 | direct_pages_count[level]--; |
70 | direct_pages_count[level - 1] += PTRS_PER_PTE; | 70 | direct_pages_count[level - 1] += PTRS_PER_PTE; |
71 | } | 71 | } |
72 | 72 | ||
73 | void arch_report_meminfo(struct seq_file *m) | 73 | void arch_report_meminfo(struct seq_file *m) |
74 | { | 74 | { |
75 | seq_printf(m, "DirectMap4k: %8lu kB\n", | 75 | seq_printf(m, "DirectMap4k: %8lu kB\n", |
76 | direct_pages_count[PG_LEVEL_4K] << 2); | 76 | direct_pages_count[PG_LEVEL_4K] << 2); |
77 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) | 77 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) |
78 | seq_printf(m, "DirectMap2M: %8lu kB\n", | 78 | seq_printf(m, "DirectMap2M: %8lu kB\n", |
79 | direct_pages_count[PG_LEVEL_2M] << 11); | 79 | direct_pages_count[PG_LEVEL_2M] << 11); |
80 | #else | 80 | #else |
81 | seq_printf(m, "DirectMap4M: %8lu kB\n", | 81 | seq_printf(m, "DirectMap4M: %8lu kB\n", |
82 | direct_pages_count[PG_LEVEL_2M] << 12); | 82 | direct_pages_count[PG_LEVEL_2M] << 12); |
83 | #endif | 83 | #endif |
84 | #ifdef CONFIG_X86_64 | 84 | #ifdef CONFIG_X86_64 |
85 | if (direct_gbpages) | 85 | if (direct_gbpages) |
86 | seq_printf(m, "DirectMap1G: %8lu kB\n", | 86 | seq_printf(m, "DirectMap1G: %8lu kB\n", |
87 | direct_pages_count[PG_LEVEL_1G] << 20); | 87 | direct_pages_count[PG_LEVEL_1G] << 20); |
88 | #endif | 88 | #endif |
89 | } | 89 | } |
90 | #else | 90 | #else |
91 | static inline void split_page_count(int level) { } | 91 | static inline void split_page_count(int level) { } |
92 | #endif | 92 | #endif |
93 | 93 | ||
94 | #ifdef CONFIG_X86_64 | 94 | #ifdef CONFIG_X86_64 |
95 | 95 | ||
96 | static inline unsigned long highmap_start_pfn(void) | 96 | static inline unsigned long highmap_start_pfn(void) |
97 | { | 97 | { |
98 | return __pa_symbol(_text) >> PAGE_SHIFT; | 98 | return __pa_symbol(_text) >> PAGE_SHIFT; |
99 | } | 99 | } |
100 | 100 | ||
101 | static inline unsigned long highmap_end_pfn(void) | 101 | static inline unsigned long highmap_end_pfn(void) |
102 | { | 102 | { |
103 | return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; | 103 | return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; |
104 | } | 104 | } |
105 | 105 | ||
106 | #endif | 106 | #endif |
107 | 107 | ||
108 | #ifdef CONFIG_DEBUG_PAGEALLOC | 108 | #ifdef CONFIG_DEBUG_PAGEALLOC |
109 | # define debug_pagealloc 1 | 109 | # define debug_pagealloc 1 |
110 | #else | 110 | #else |
111 | # define debug_pagealloc 0 | 111 | # define debug_pagealloc 0 |
112 | #endif | 112 | #endif |
113 | 113 | ||
114 | static inline int | 114 | static inline int |
115 | within(unsigned long addr, unsigned long start, unsigned long end) | 115 | within(unsigned long addr, unsigned long start, unsigned long end) |
116 | { | 116 | { |
117 | return addr >= start && addr < end; | 117 | return addr >= start && addr < end; |
118 | } | 118 | } |
119 | 119 | ||
120 | /* | 120 | /* |
121 | * Flushing functions | 121 | * Flushing functions |
122 | */ | 122 | */ |
123 | 123 | ||
124 | /** | 124 | /** |
125 | * clflush_cache_range - flush a cache range with clflush | 125 | * clflush_cache_range - flush a cache range with clflush |
126 | * @vaddr: virtual start address | 126 | * @vaddr: virtual start address |
127 | * @size: number of bytes to flush | 127 | * @size: number of bytes to flush |
128 | * | 128 | * |
129 | * clflushopt is an unordered instruction which needs fencing with mfence or | 129 | * clflushopt is an unordered instruction which needs fencing with mfence or |
130 | * sfence to avoid ordering issues. | 130 | * sfence to avoid ordering issues. |
131 | */ | 131 | */ |
132 | void clflush_cache_range(void *vaddr, unsigned int size) | 132 | void clflush_cache_range(void *vaddr, unsigned int size) |
133 | { | 133 | { |
134 | void *vend = vaddr + size - 1; | 134 | void *vend = vaddr + size - 1; |
135 | 135 | ||
136 | mb(); | 136 | mb(); |
137 | 137 | ||
138 | for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) | 138 | for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) |
139 | clflushopt(vaddr); | 139 | clflushopt(vaddr); |
140 | /* | 140 | /* |
141 | * Flush any possible final partial cacheline: | 141 | * Flush any possible final partial cacheline: |
142 | */ | 142 | */ |
143 | clflushopt(vend); | 143 | clflushopt(vend); |
144 | 144 | ||
145 | mb(); | 145 | mb(); |
146 | } | 146 | } |
147 | EXPORT_SYMBOL_GPL(clflush_cache_range); | 147 | EXPORT_SYMBOL_GPL(clflush_cache_range); |
148 | 148 | ||
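A small stand-alone sketch of the loop bounds in clflush_cache_range() above: vend points at the last byte (size - 1), so the trailing flush covers a final partial cacheline. The 64-byte line size and the addresses are assumptions for illustration only:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uintptr_t clflush_size = 64;      /* assumed cacheline size */
        uintptr_t vaddr = 0x1000, size = 200;
        uintptr_t vend = vaddr + size - 1;      /* last byte, not one-past-end */

        for (; vaddr < vend; vaddr += clflush_size)
            printf("flush line at %#lx\n", (unsigned long)vaddr);
        printf("flush final (possibly partial) line at %#lx\n",
               (unsigned long)vend);
        return 0;
    }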
149 | static void __cpa_flush_all(void *arg) | 149 | static void __cpa_flush_all(void *arg) |
150 | { | 150 | { |
151 | unsigned long cache = (unsigned long)arg; | 151 | unsigned long cache = (unsigned long)arg; |
152 | 152 | ||
153 | /* | 153 | /* |
154 | * Flush all to work around errata in early Athlons regarding | 154 | * Flush all to work around errata in early Athlons regarding |
155 | * large page flushing. | 155 | * large page flushing. |
156 | */ | 156 | */ |
157 | __flush_tlb_all(); | 157 | __flush_tlb_all(); |
158 | 158 | ||
159 | if (cache && boot_cpu_data.x86 >= 4) | 159 | if (cache && boot_cpu_data.x86 >= 4) |
160 | wbinvd(); | 160 | wbinvd(); |
161 | } | 161 | } |
162 | 162 | ||
163 | static void cpa_flush_all(unsigned long cache) | 163 | static void cpa_flush_all(unsigned long cache) |
164 | { | 164 | { |
165 | BUG_ON(irqs_disabled()); | 165 | BUG_ON(irqs_disabled()); |
166 | 166 | ||
167 | on_each_cpu(__cpa_flush_all, (void *) cache, 1); | 167 | on_each_cpu(__cpa_flush_all, (void *) cache, 1); |
168 | } | 168 | } |
169 | 169 | ||
170 | static void __cpa_flush_range(void *arg) | 170 | static void __cpa_flush_range(void *arg) |
171 | { | 171 | { |
172 | /* | 172 | /* |
173 | * We could optimize this further and do individual per-page | 173 | * We could optimize this further and do individual per-page |
174 | * TLB invalidates for a low number of pages. Caveat: we must | 174 | * TLB invalidates for a low number of pages. Caveat: we must |
175 | * flush the high aliases on 64-bit as well. | 175 | * flush the high aliases on 64-bit as well. |
176 | */ | 176 | */ |
177 | __flush_tlb_all(); | 177 | __flush_tlb_all(); |
178 | } | 178 | } |
179 | 179 | ||
180 | static void cpa_flush_range(unsigned long start, int numpages, int cache) | 180 | static void cpa_flush_range(unsigned long start, int numpages, int cache) |
181 | { | 181 | { |
182 | unsigned int i, level; | 182 | unsigned int i, level; |
183 | unsigned long addr; | 183 | unsigned long addr; |
184 | 184 | ||
185 | BUG_ON(irqs_disabled()); | 185 | BUG_ON(irqs_disabled()); |
186 | WARN_ON(PAGE_ALIGN(start) != start); | 186 | WARN_ON(PAGE_ALIGN(start) != start); |
187 | 187 | ||
188 | on_each_cpu(__cpa_flush_range, NULL, 1); | 188 | on_each_cpu(__cpa_flush_range, NULL, 1); |
189 | 189 | ||
190 | if (!cache) | 190 | if (!cache) |
191 | return; | 191 | return; |
192 | 192 | ||
193 | /* | 193 | /* |
194 | * We only need to flush on one CPU: | 194 | * We only need to flush on one CPU: |
195 | * clflush is a MESI-coherent instruction that | 195 | * clflush is a MESI-coherent instruction that |
196 | * will cause all other CPUs to flush the same | 196 | * will cause all other CPUs to flush the same |
197 | * cachelines: | 197 | * cachelines: |
198 | */ | 198 | */ |
199 | for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { | 199 | for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { |
200 | pte_t *pte = lookup_address(addr, &level); | 200 | pte_t *pte = lookup_address(addr, &level); |
201 | 201 | ||
202 | /* | 202 | /* |
203 | * Only flush present addresses: | 203 | * Only flush present addresses: |
204 | */ | 204 | */ |
205 | if (pte && (pte_val(*pte) & _PAGE_PRESENT)) | 205 | if (pte && (pte_val(*pte) & _PAGE_PRESENT)) |
206 | clflush_cache_range((void *) addr, PAGE_SIZE); | 206 | clflush_cache_range((void *) addr, PAGE_SIZE); |
207 | } | 207 | } |
208 | } | 208 | } |
209 | 209 | ||
210 | static void cpa_flush_array(unsigned long *start, int numpages, int cache, | 210 | static void cpa_flush_array(unsigned long *start, int numpages, int cache, |
211 | int in_flags, struct page **pages) | 211 | int in_flags, struct page **pages) |
212 | { | 212 | { |
213 | unsigned int i, level; | 213 | unsigned int i, level; |
214 | unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ | 214 | unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ |
215 | 215 | ||
216 | BUG_ON(irqs_disabled()); | 216 | BUG_ON(irqs_disabled()); |
217 | 217 | ||
218 | on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); | 218 | on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); |
219 | 219 | ||
220 | if (!cache || do_wbinvd) | 220 | if (!cache || do_wbinvd) |
221 | return; | 221 | return; |
222 | 222 | ||
223 | /* | 223 | /* |
224 | * We only need to flush on one CPU: | 224 | * We only need to flush on one CPU: |
225 | * clflush is a MESI-coherent instruction that | 225 | * clflush is a MESI-coherent instruction that |
226 | * will cause all other CPUs to flush the same | 226 | * will cause all other CPUs to flush the same |
227 | * cachelines: | 227 | * cachelines: |
228 | */ | 228 | */ |
229 | for (i = 0; i < numpages; i++) { | 229 | for (i = 0; i < numpages; i++) { |
230 | unsigned long addr; | 230 | unsigned long addr; |
231 | pte_t *pte; | 231 | pte_t *pte; |
232 | 232 | ||
233 | if (in_flags & CPA_PAGES_ARRAY) | 233 | if (in_flags & CPA_PAGES_ARRAY) |
234 | addr = (unsigned long)page_address(pages[i]); | 234 | addr = (unsigned long)page_address(pages[i]); |
235 | else | 235 | else |
236 | addr = start[i]; | 236 | addr = start[i]; |
237 | 237 | ||
238 | pte = lookup_address(addr, &level); | 238 | pte = lookup_address(addr, &level); |
239 | 239 | ||
240 | /* | 240 | /* |
241 | * Only flush present addresses: | 241 | * Only flush present addresses: |
242 | */ | 242 | */ |
243 | if (pte && (pte_val(*pte) & _PAGE_PRESENT)) | 243 | if (pte && (pte_val(*pte) & _PAGE_PRESENT)) |
244 | clflush_cache_range((void *)addr, PAGE_SIZE); | 244 | clflush_cache_range((void *)addr, PAGE_SIZE); |
245 | } | 245 | } |
246 | } | 246 | } |
247 | 247 | ||
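The do_wbinvd heuristic in cpa_flush_array() is plain arithmetic: 1024 pages x 4 KiB = 4 MiB, beyond which one full cache writeback-and-invalidate is assumed cheaper than clflushing every line. A sketch restating that threshold with an example page count:

    #include <stdio.h>

    int main(void)
    {
        unsigned long numpages = 2048;          /* example input */
        int do_wbinvd = numpages >= 1024;       /* the 4M threshold above */

        printf("%lu pages = %lu MiB -> %s\n",
               numpages, (numpages * 4096) >> 20,
               do_wbinvd ? "wbinvd" : "clflush per page");
        return 0;
    }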
248 | /* | 248 | /* |
249 | * Certain areas of memory on x86 require very specific protection flags, | 249 | * Certain areas of memory on x86 require very specific protection flags, |
250 | * for example the BIOS area or kernel text. Callers don't always get this | 250 | * for example the BIOS area or kernel text. Callers don't always get this |
251 | * right (again, ioremap() on BIOS memory is not uncommon) so this function | 251 | * right (again, ioremap() on BIOS memory is not uncommon) so this function |
252 | * checks and fixes these known static required protection bits. | 252 | * checks and fixes these known static required protection bits. |
253 | */ | 253 | */ |
254 | static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, | 254 | static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, |
255 | unsigned long pfn) | 255 | unsigned long pfn) |
256 | { | 256 | { |
257 | pgprot_t forbidden = __pgprot(0); | 257 | pgprot_t forbidden = __pgprot(0); |
258 | 258 | ||
259 | /* | 259 | /* |
260 | * The BIOS area between 640K and 1MB needs to be executable for | 260 | * The BIOS area between 640K and 1MB needs to be executable for |
261 | * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. | 261 | * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. |
262 | */ | 262 | */ |
263 | #ifdef CONFIG_PCI_BIOS | 263 | #ifdef CONFIG_PCI_BIOS |
264 | if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) | 264 | if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) |
265 | pgprot_val(forbidden) |= _PAGE_NX; | 265 | pgprot_val(forbidden) |= _PAGE_NX; |
266 | #endif | 266 | #endif |
267 | 267 | ||
268 | /* | 268 | /* |
269 | * The kernel text needs to be executable for obvious reasons. | 269 | * The kernel text needs to be executable for obvious reasons. |
270 | * This does not cover __inittext since that is gone later on. On | 270 | * This does not cover __inittext since that is gone later on. On |
271 | * 64-bit we do not enforce !NX on the low mapping. | 271 | * 64-bit we do not enforce !NX on the low mapping. |
272 | */ | 272 | */ |
273 | if (within(address, (unsigned long)_text, (unsigned long)_etext)) | 273 | if (within(address, (unsigned long)_text, (unsigned long)_etext)) |
274 | pgprot_val(forbidden) |= _PAGE_NX; | 274 | pgprot_val(forbidden) |= _PAGE_NX; |
275 | 275 | ||
276 | /* | 276 | /* |
277 | * The .rodata section needs to be read-only. Using the pfn | 277 | * The .rodata section needs to be read-only. Using the pfn |
278 | * catches all aliases. | 278 | * catches all aliases. |
279 | */ | 279 | */ |
280 | if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, | 280 | if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, |
281 | __pa_symbol(__end_rodata) >> PAGE_SHIFT)) | 281 | __pa_symbol(__end_rodata) >> PAGE_SHIFT)) |
282 | pgprot_val(forbidden) |= _PAGE_RW; | 282 | pgprot_val(forbidden) |= _PAGE_RW; |
283 | 283 | ||
284 | #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) | 284 | #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) |
285 | /* | 285 | /* |
286 | * Once the kernel maps the text as RO (kernel_set_to_readonly is set), | 286 | * Once the kernel maps the text as RO (kernel_set_to_readonly is set), |
287 | * kernel text mappings for the large-page-aligned text and rodata sections | 287 | * kernel text mappings for the large-page-aligned text and rodata sections |
288 | * will always be read-only. The kernel identity mappings covering | 288 | * will always be read-only. The kernel identity mappings covering |
289 | * the holes caused by this alignment can be anything the user asks for. | 289 | * the holes caused by this alignment can be anything the user asks for. |
290 | * | 290 | * |
291 | * This will preserve the large page mappings for kernel text/data | 291 | * This will preserve the large page mappings for kernel text/data |
292 | * at no extra cost. | 292 | * at no extra cost. |
293 | */ | 293 | */ |
294 | if (kernel_set_to_readonly && | 294 | if (kernel_set_to_readonly && |
295 | within(address, (unsigned long)_text, | 295 | within(address, (unsigned long)_text, |
296 | (unsigned long)__end_rodata_hpage_align)) { | 296 | (unsigned long)__end_rodata_hpage_align)) { |
297 | unsigned int level; | 297 | unsigned int level; |
298 | 298 | ||
299 | /* | 299 | /* |
300 | * Don't enforce the !RW mapping for the kernel text mapping, | 300 | * Don't enforce the !RW mapping for the kernel text mapping, |
301 | * if the current mapping is already using small page mapping. | 301 | * if the current mapping is already using small page mapping. |
302 | * No need to work hard to preserve large page mappings in this | 302 | * No need to work hard to preserve large page mappings in this |
303 | * case. | 303 | * case. |
304 | * | 304 | * |
305 | * This also fixes the Linux Xen paravirt guest boot failure | 305 | * This also fixes the Linux Xen paravirt guest boot failure |
306 | * (because of unexpected read-only mappings for kernel identity | 306 | * (because of unexpected read-only mappings for kernel identity |
307 | * mappings). In this paravirt guest case, the kernel text | 307 | * mappings). In this paravirt guest case, the kernel text |
308 | * mapping and the kernel identity mapping share the same | 308 | * mapping and the kernel identity mapping share the same |
309 | * page-table pages. Thus we can't really use different | 309 | * page-table pages. Thus we can't really use different |
310 | * protections for the kernel text and identity mappings. Also, | 310 | * protections for the kernel text and identity mappings. Also, |
311 | * these shared mappings are made of small page mappings. | 311 | * these shared mappings are made of small page mappings. |
312 | * Thus, not enforcing the !RW mapping for small-page kernel | 312 | * Thus, not enforcing the !RW mapping for small-page kernel |
313 | * text mappings also helps the Linux Xen paravirt guest to | 313 | * text mappings also helps the Linux Xen paravirt guest to |
314 | * boot correctly. | 314 | * boot correctly. |
315 | */ | 315 | */ |
316 | if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) | 316 | if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) |
317 | pgprot_val(forbidden) |= _PAGE_RW; | 317 | pgprot_val(forbidden) |= _PAGE_RW; |
318 | } | 318 | } |
319 | #endif | 319 | #endif |
320 | 320 | ||
321 | prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); | 321 | prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); |
322 | 322 | ||
323 | return prot; | 323 | return prot; |
324 | } | 324 | } |
325 | 325 | ||
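The last step of static_protections() simply clears the forbidden bits out of the requested pgprot. A sketch of that masking, using the real _PAGE_PRESENT/_PAGE_RW bit positions but made-up input values:

    #include <stdint.h>
    #include <stdio.h>

    #define _PAGE_PRESENT (1ULL << 0)
    #define _PAGE_RW      (1ULL << 1)

    int main(void)
    {
        uint64_t prot = _PAGE_PRESENT | _PAGE_RW;  /* requested: writable */
        uint64_t forbidden = _PAGE_RW;             /* e.g. pfn falls in .rodata */

        prot &= ~forbidden;                        /* RW stripped, page stays RO */
        printf("resulting prot: %#llx\n", (unsigned long long)prot);
        return 0;
    }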
326 | /* | 326 | /* |
327 | * Lookup the page table entry for a virtual address in a specific pgd. | 327 | * Lookup the page table entry for a virtual address in a specific pgd. |
328 | * Return a pointer to the entry and the level of the mapping. | 328 | * Return a pointer to the entry and the level of the mapping. |
329 | */ | 329 | */ |
330 | pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, | 330 | pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, |
331 | unsigned int *level) | 331 | unsigned int *level) |
332 | { | 332 | { |
333 | pud_t *pud; | 333 | pud_t *pud; |
334 | pmd_t *pmd; | 334 | pmd_t *pmd; |
335 | 335 | ||
336 | *level = PG_LEVEL_NONE; | 336 | *level = PG_LEVEL_NONE; |
337 | 337 | ||
338 | if (pgd_none(*pgd)) | 338 | if (pgd_none(*pgd)) |
339 | return NULL; | 339 | return NULL; |
340 | 340 | ||
341 | pud = pud_offset(pgd, address); | 341 | pud = pud_offset(pgd, address); |
342 | if (pud_none(*pud)) | 342 | if (pud_none(*pud)) |
343 | return NULL; | 343 | return NULL; |
344 | 344 | ||
345 | *level = PG_LEVEL_1G; | 345 | *level = PG_LEVEL_1G; |
346 | if (pud_large(*pud) || !pud_present(*pud)) | 346 | if (pud_large(*pud) || !pud_present(*pud)) |
347 | return (pte_t *)pud; | 347 | return (pte_t *)pud; |
348 | 348 | ||
349 | pmd = pmd_offset(pud, address); | 349 | pmd = pmd_offset(pud, address); |
350 | if (pmd_none(*pmd)) | 350 | if (pmd_none(*pmd)) |
351 | return NULL; | 351 | return NULL; |
352 | 352 | ||
353 | *level = PG_LEVEL_2M; | 353 | *level = PG_LEVEL_2M; |
354 | if (pmd_large(*pmd) || !pmd_present(*pmd)) | 354 | if (pmd_large(*pmd) || !pmd_present(*pmd)) |
355 | return (pte_t *)pmd; | 355 | return (pte_t *)pmd; |
356 | 356 | ||
357 | *level = PG_LEVEL_4K; | 357 | *level = PG_LEVEL_4K; |
358 | 358 | ||
359 | return pte_offset_kernel(pmd, address); | 359 | return pte_offset_kernel(pmd, address); |
360 | } | 360 | } |
361 | 361 | ||
362 | /* | 362 | /* |
363 | * Lookup the page table entry for a virtual address. Return a pointer | 363 | * Lookup the page table entry for a virtual address. Return a pointer |
364 | * to the entry and the level of the mapping. | 364 | * to the entry and the level of the mapping. |
365 | * | 365 | * |
366 | * Note: We return pud and pmd either when the entry is marked large | 366 | * Note: We return pud and pmd either when the entry is marked large |
367 | * or when the present bit is not set. Otherwise we would return a | 367 | * or when the present bit is not set. Otherwise we would return a |
368 | * pointer to a nonexistent mapping. | 368 | * pointer to a nonexistent mapping. |
369 | */ | 369 | */ |
370 | pte_t *lookup_address(unsigned long address, unsigned int *level) | 370 | pte_t *lookup_address(unsigned long address, unsigned int *level) |
371 | { | 371 | { |
372 | return lookup_address_in_pgd(pgd_offset_k(address), address, level); | 372 | return lookup_address_in_pgd(pgd_offset_k(address), address, level); |
373 | } | 373 | } |
374 | EXPORT_SYMBOL_GPL(lookup_address); | 374 | EXPORT_SYMBOL_GPL(lookup_address); |
375 | 375 | ||
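A kernel-context sketch (not stand-alone runnable) of the exported lookup_address() contract: the level out-parameter tells the caller whether the returned entry is a 4K pte or a large pud/pmd. The address value is hypothetical:

    unsigned int level;
    pte_t *pte = lookup_address(0xffffffff81000000UL, &level); /* made-up vaddr */

    if (pte && (pte_val(*pte) & _PAGE_PRESENT))
        pr_info("mapped at level %u (PG_LEVEL_4K/2M/1G)\n", level);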
376 | static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, | 376 | static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, |
377 | unsigned int *level) | 377 | unsigned int *level) |
378 | { | 378 | { |
379 | if (cpa->pgd) | 379 | if (cpa->pgd) |
380 | return lookup_address_in_pgd(cpa->pgd + pgd_index(address), | 380 | return lookup_address_in_pgd(cpa->pgd + pgd_index(address), |
381 | address, level); | 381 | address, level); |
382 | 382 | ||
383 | return lookup_address(address, level); | 383 | return lookup_address(address, level); |
384 | } | 384 | } |
385 | 385 | ||
386 | /* | 386 | /* |
387 | * Lookup the PMD entry for a virtual address. Return a pointer to the entry | ||
388 | * or NULL if not present. | ||
389 | */ | ||
390 | pmd_t *lookup_pmd_address(unsigned long address) | ||
391 | { | ||
392 | pgd_t *pgd; | ||
393 | pud_t *pud; | ||
394 | |||
395 | pgd = pgd_offset_k(address); | ||
396 | if (pgd_none(*pgd)) | ||
397 | return NULL; | ||
398 | |||
399 | pud = pud_offset(pgd, address); | ||
400 | if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) | ||
401 | return NULL; | ||
402 | |||
403 | return pmd_offset(pud, address); | ||
404 | } | ||
405 | |||
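A minimal kernel-context sketch (not stand-alone runnable) of how the new lookup_pmd_address() helper might be called, e.g. by code that wants to inspect or replace a PMD entry directly; the vaddr value and the caller shown are illustrative, not taken from this series:

    unsigned long vaddr = 0xffffc90000000000UL;   /* hypothetical address */
    pmd_t *pmd = lookup_pmd_address(vaddr);

    if (pmd && !pmd_none(*pmd))
        pr_info("pmd entry for %lx: %lx\n",
                vaddr, (unsigned long)pmd_val(*pmd));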
406 | /* | ||
387 | * This is necessary because __pa() does not work on some | 407 | * This is necessary because __pa() does not work on some |
388 | * kinds of memory, like vmalloc() or the alloc_remap() | 408 | * kinds of memory, like vmalloc() or the alloc_remap() |
389 | * areas on 32-bit NUMA systems. The percpu areas can | 409 | * areas on 32-bit NUMA systems. The percpu areas can |
390 | * end up in this kind of memory, for instance. | 410 | * end up in this kind of memory, for instance. |
391 | * | 411 | * |
392 | * This could be optimized, but it is only intended to be | 412 | * This could be optimized, but it is only intended to be |
393 | * used at initialization time, and keeping it | 413 | * used at initialization time, and keeping it |
394 | * unoptimized should increase the testing coverage for | 414 | * unoptimized should increase the testing coverage for |
395 | * the more obscure platforms. | 415 | * the more obscure platforms. |
396 | */ | 416 | */ |
397 | phys_addr_t slow_virt_to_phys(void *__virt_addr) | 417 | phys_addr_t slow_virt_to_phys(void *__virt_addr) |
398 | { | 418 | { |
399 | unsigned long virt_addr = (unsigned long)__virt_addr; | 419 | unsigned long virt_addr = (unsigned long)__virt_addr; |
400 | phys_addr_t phys_addr; | 420 | phys_addr_t phys_addr; |
401 | unsigned long offset; | 421 | unsigned long offset; |
402 | enum pg_level level; | 422 | enum pg_level level; |
403 | unsigned long psize; | 423 | unsigned long psize; |
404 | unsigned long pmask; | 424 | unsigned long pmask; |
405 | pte_t *pte; | 425 | pte_t *pte; |
406 | 426 | ||
407 | pte = lookup_address(virt_addr, &level); | 427 | pte = lookup_address(virt_addr, &level); |
408 | BUG_ON(!pte); | 428 | BUG_ON(!pte); |
409 | psize = page_level_size(level); | 429 | psize = page_level_size(level); |
410 | pmask = page_level_mask(level); | 430 | pmask = page_level_mask(level); |
411 | offset = virt_addr & ~pmask; | 431 | offset = virt_addr & ~pmask; |
412 | phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; | 432 | phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT; |
413 | return (phys_addr | offset); | 433 | return (phys_addr | offset); |
414 | } | 434 | } |
415 | EXPORT_SYMBOL_GPL(slow_virt_to_phys); | 435 | EXPORT_SYMBOL_GPL(slow_virt_to_phys); |
416 | 436 | ||
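The address math in slow_virt_to_phys() worked by hand for a hypothetical 2 MiB mapping: the page mask keeps the large-page base and the low bits of the virtual address supply the offset. All concrete values below are invented for illustration:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t virt   = 0xffff880000212345ULL;  /* made-up kernel vaddr */
        uint64_t psize  = 2ULL << 20;             /* PG_LEVEL_2M page size */
        uint64_t pmask  = ~(psize - 1);
        uint64_t offset = virt & ~pmask;          /* low 21 bits: 0x12345 */
        uint64_t pfn    = 0x200;                  /* assumed pte_pfn() result */

        printf("phys = %#llx\n",
               (unsigned long long)((pfn << 12) | offset));  /* 0x212345 */
        return 0;
    }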
417 | /* | 437 | /* |
418 | * Set the new pmd in all the pgds we know about: | 438 | * Set the new pmd in all the pgds we know about: |
419 | */ | 439 | */ |
420 | static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) | 440 | static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) |
421 | { | 441 | { |
422 | /* change init_mm */ | 442 | /* change init_mm */ |
423 | set_pte_atomic(kpte, pte); | 443 | set_pte_atomic(kpte, pte); |
424 | #ifdef CONFIG_X86_32 | 444 | #ifdef CONFIG_X86_32 |
425 | if (!SHARED_KERNEL_PMD) { | 445 | if (!SHARED_KERNEL_PMD) { |
426 | struct page *page; | 446 | struct page *page; |
427 | 447 | ||
428 | list_for_each_entry(page, &pgd_list, lru) { | 448 | list_for_each_entry(page, &pgd_list, lru) { |
429 | pgd_t *pgd; | 449 | pgd_t *pgd; |
430 | pud_t *pud; | 450 | pud_t *pud; |
431 | pmd_t *pmd; | 451 | pmd_t *pmd; |
432 | 452 | ||
433 | pgd = (pgd_t *)page_address(page) + pgd_index(address); | 453 | pgd = (pgd_t *)page_address(page) + pgd_index(address); |
434 | pud = pud_offset(pgd, address); | 454 | pud = pud_offset(pgd, address); |
435 | pmd = pmd_offset(pud, address); | 455 | pmd = pmd_offset(pud, address); |
436 | set_pte_atomic((pte_t *)pmd, pte); | 456 | set_pte_atomic((pte_t *)pmd, pte); |
437 | } | 457 | } |
438 | } | 458 | } |
439 | #endif | 459 | #endif |
440 | } | 460 | } |
441 | 461 | ||
442 | static int | 462 | static int |
443 | try_preserve_large_page(pte_t *kpte, unsigned long address, | 463 | try_preserve_large_page(pte_t *kpte, unsigned long address, |
444 | struct cpa_data *cpa) | 464 | struct cpa_data *cpa) |
445 | { | 465 | { |
446 | unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; | 466 | unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; |
447 | pte_t new_pte, old_pte, *tmp; | 467 | pte_t new_pte, old_pte, *tmp; |
448 | pgprot_t old_prot, new_prot, req_prot; | 468 | pgprot_t old_prot, new_prot, req_prot; |
449 | int i, do_split = 1; | 469 | int i, do_split = 1; |
450 | enum pg_level level; | 470 | enum pg_level level; |
451 | 471 | ||
452 | if (cpa->force_split) | 472 | if (cpa->force_split) |
453 | return 1; | 473 | return 1; |
454 | 474 | ||
455 | spin_lock(&pgd_lock); | 475 | spin_lock(&pgd_lock); |
456 | /* | 476 | /* |
457 | * Check for races, another CPU might have split this page | 477 | * Check for races, another CPU might have split this page |
458 | * up already: | 478 | * up already: |
459 | */ | 479 | */ |
460 | tmp = _lookup_address_cpa(cpa, address, &level); | 480 | tmp = _lookup_address_cpa(cpa, address, &level); |
461 | if (tmp != kpte) | 481 | if (tmp != kpte) |
462 | goto out_unlock; | 482 | goto out_unlock; |
463 | 483 | ||
464 | switch (level) { | 484 | switch (level) { |
465 | case PG_LEVEL_2M: | 485 | case PG_LEVEL_2M: |
466 | #ifdef CONFIG_X86_64 | 486 | #ifdef CONFIG_X86_64 |
467 | case PG_LEVEL_1G: | 487 | case PG_LEVEL_1G: |
468 | #endif | 488 | #endif |
469 | psize = page_level_size(level); | 489 | psize = page_level_size(level); |
470 | pmask = page_level_mask(level); | 490 | pmask = page_level_mask(level); |
471 | break; | 491 | break; |
472 | default: | 492 | default: |
473 | do_split = -EINVAL; | 493 | do_split = -EINVAL; |
474 | goto out_unlock; | 494 | goto out_unlock; |
475 | } | 495 | } |
476 | 496 | ||
477 | /* | 497 | /* |
478 | * Calculate the number of pages which fit into this large | 498 | * Calculate the number of pages which fit into this large |
479 | * page starting at address: | 499 | * page starting at address: |
480 | */ | 500 | */ |
481 | nextpage_addr = (address + psize) & pmask; | 501 | nextpage_addr = (address + psize) & pmask; |
482 | numpages = (nextpage_addr - address) >> PAGE_SHIFT; | 502 | numpages = (nextpage_addr - address) >> PAGE_SHIFT; |
483 | if (numpages < cpa->numpages) | 503 | if (numpages < cpa->numpages) |
484 | cpa->numpages = numpages; | 504 | cpa->numpages = numpages; |
485 | 505 | ||
486 | /* | 506 | /* |
487 | * We are safe now. Check whether the new pgprot is the same: | 507 | * We are safe now. Check whether the new pgprot is the same: |
488 | * Convert protection attributes to 4k-format, as cpa->mask* are set | 508 | * Convert protection attributes to 4k-format, as cpa->mask* are set |
489 | * up accordingly. | 509 | * up accordingly. |
490 | */ | 510 | */ |
491 | old_pte = *kpte; | 511 | old_pte = *kpte; |
492 | old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte)); | 512 | old_prot = req_prot = pgprot_large_2_4k(pte_pgprot(old_pte)); |
493 | 513 | ||
494 | pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); | 514 | pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); |
495 | pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); | 515 | pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); |
496 | 516 | ||
497 | /* | 517 | /* |
498 | * req_prot is in format of 4k pages. It must be converted to large | 518 | * req_prot is in format of 4k pages. It must be converted to large |
499 | * page format: the caching mode includes the PAT bit located at | 519 | * page format: the caching mode includes the PAT bit located at |
500 | * different bit positions in the two formats. | 520 | * different bit positions in the two formats. |
501 | */ | 521 | */ |
502 | req_prot = pgprot_4k_2_large(req_prot); | 522 | req_prot = pgprot_4k_2_large(req_prot); |
503 | 523 | ||
504 | /* | 524 | /* |
505 | * Set the PSE and GLOBAL flags only if the PRESENT flag is | 525 | * Set the PSE and GLOBAL flags only if the PRESENT flag is |
506 | * set otherwise pmd_present/pmd_huge will return true even on | 526 | * set otherwise pmd_present/pmd_huge will return true even on |
507 | * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL | 527 | * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL |
508 | * for the ancient hardware that doesn't support it. | 528 | * for the ancient hardware that doesn't support it. |
509 | */ | 529 | */ |
510 | if (pgprot_val(req_prot) & _PAGE_PRESENT) | 530 | if (pgprot_val(req_prot) & _PAGE_PRESENT) |
511 | pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL; | 531 | pgprot_val(req_prot) |= _PAGE_PSE | _PAGE_GLOBAL; |
512 | else | 532 | else |
513 | pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); | 533 | pgprot_val(req_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); |
514 | 534 | ||
515 | req_prot = canon_pgprot(req_prot); | 535 | req_prot = canon_pgprot(req_prot); |
516 | 536 | ||
517 | /* | 537 | /* |
518 | * old_pte points to the large page base address. So we need | 538 | * old_pte points to the large page base address. So we need |
519 | * to add the offset of the virtual address: | 539 | * to add the offset of the virtual address: |
520 | */ | 540 | */ |
521 | pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); | 541 | pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); |
522 | cpa->pfn = pfn; | 542 | cpa->pfn = pfn; |
523 | 543 | ||
524 | new_prot = static_protections(req_prot, address, pfn); | 544 | new_prot = static_protections(req_prot, address, pfn); |
525 | 545 | ||
526 | /* | 546 | /* |
527 | * We need to check the full range, whether | 547 | * We need to check the full range, whether |
528 | * static_protections() requires a different pgprot for one of | 548 | * static_protections() requires a different pgprot for one of |
529 | * the pages in the range we try to preserve: | 549 | * the pages in the range we try to preserve: |
530 | */ | 550 | */ |
531 | addr = address & pmask; | 551 | addr = address & pmask; |
532 | pfn = pte_pfn(old_pte); | 552 | pfn = pte_pfn(old_pte); |
533 | for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { | 553 | for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { |
534 | pgprot_t chk_prot = static_protections(req_prot, addr, pfn); | 554 | pgprot_t chk_prot = static_protections(req_prot, addr, pfn); |
535 | 555 | ||
536 | if (pgprot_val(chk_prot) != pgprot_val(new_prot)) | 556 | if (pgprot_val(chk_prot) != pgprot_val(new_prot)) |
537 | goto out_unlock; | 557 | goto out_unlock; |
538 | } | 558 | } |
539 | 559 | ||
540 | /* | 560 | /* |
541 | * If there are no changes, return. cpa->numpages has been updated | 561 | * If there are no changes, return. cpa->numpages has been updated |
542 | * above: | 562 | * above: |
543 | */ | 563 | */ |
544 | if (pgprot_val(new_prot) == pgprot_val(old_prot)) { | 564 | if (pgprot_val(new_prot) == pgprot_val(old_prot)) { |
545 | do_split = 0; | 565 | do_split = 0; |
546 | goto out_unlock; | 566 | goto out_unlock; |
547 | } | 567 | } |
548 | 568 | ||
549 | /* | 569 | /* |
550 | * We need to change the attributes. Check whether we can | 570 | * We need to change the attributes. Check whether we can |
551 | * change the large page in one go. We request a split when | 571 | * change the large page in one go. We request a split when |
552 | * the address is not aligned and the number of pages is | 572 | * the address is not aligned and the number of pages is |
553 | * smaller than the number of pages in the large page. Note | 573 | * smaller than the number of pages in the large page. Note |
554 | * that we limited the number of possible pages already to | 574 | * that we limited the number of possible pages already to |
555 | * the number of pages in the large page. | 575 | * the number of pages in the large page. |
556 | */ | 576 | */ |
557 | if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { | 577 | if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { |
558 | /* | 578 | /* |
559 | * The address is aligned and the number of pages | 579 | * The address is aligned and the number of pages |
560 | * covers the full page. | 580 | * covers the full page. |
561 | */ | 581 | */ |
562 | new_pte = pfn_pte(pte_pfn(old_pte), new_prot); | 582 | new_pte = pfn_pte(pte_pfn(old_pte), new_prot); |
563 | __set_pmd_pte(kpte, address, new_pte); | 583 | __set_pmd_pte(kpte, address, new_pte); |
564 | cpa->flags |= CPA_FLUSHTLB; | 584 | cpa->flags |= CPA_FLUSHTLB; |
565 | do_split = 0; | 585 | do_split = 0; |
566 | } | 586 | } |
567 | 587 | ||
568 | out_unlock: | 588 | out_unlock: |
569 | spin_unlock(&pgd_lock); | 589 | spin_unlock(&pgd_lock); |
570 | 590 | ||
571 | return do_split; | 591 | return do_split; |
572 | } | 592 | } |
573 | 593 | ||
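The "pages remaining inside this large page" computation at the top of try_preserve_large_page(), worked for a 2 MiB page and an unaligned start address (the values below are hypothetical):

    #include <stdio.h>

    int main(void)
    {
        unsigned long address = 0x7ff123000UL;    /* inside some 2M page */
        unsigned long psize = 2UL << 20;
        unsigned long pmask = ~(psize - 1);
        unsigned long nextpage_addr = (address + psize) & pmask;
        unsigned long numpages = (nextpage_addr - address) >> 12;

        printf("pages up to next 2M boundary: %lu\n", numpages);  /* 221 */
        return 0;
    }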
574 | static int | 594 | static int |
575 | __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, | 595 | __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, |
576 | struct page *base) | 596 | struct page *base) |
577 | { | 597 | { |
578 | pte_t *pbase = (pte_t *)page_address(base); | 598 | pte_t *pbase = (pte_t *)page_address(base); |
579 | unsigned long pfn, pfninc = 1; | 599 | unsigned long pfn, pfninc = 1; |
580 | unsigned int i, level; | 600 | unsigned int i, level; |
581 | pte_t *tmp; | 601 | pte_t *tmp; |
582 | pgprot_t ref_prot; | 602 | pgprot_t ref_prot; |
583 | 603 | ||
584 | spin_lock(&pgd_lock); | 604 | spin_lock(&pgd_lock); |
585 | /* | 605 | /* |
586 | * Check for races, another CPU might have split this page | 606 | * Check for races, another CPU might have split this page |
587 | * up for us already: | 607 | * up for us already: |
588 | */ | 608 | */ |
589 | tmp = _lookup_address_cpa(cpa, address, &level); | 609 | tmp = _lookup_address_cpa(cpa, address, &level); |
590 | if (tmp != kpte) { | 610 | if (tmp != kpte) { |
591 | spin_unlock(&pgd_lock); | 611 | spin_unlock(&pgd_lock); |
592 | return 1; | 612 | return 1; |
593 | } | 613 | } |
594 | 614 | ||
595 | paravirt_alloc_pte(&init_mm, page_to_pfn(base)); | 615 | paravirt_alloc_pte(&init_mm, page_to_pfn(base)); |
596 | ref_prot = pte_pgprot(pte_clrhuge(*kpte)); | 616 | ref_prot = pte_pgprot(pte_clrhuge(*kpte)); |
597 | 617 | ||
598 | /* promote PAT bit to correct position */ | 618 | /* promote PAT bit to correct position */ |
599 | if (level == PG_LEVEL_2M) | 619 | if (level == PG_LEVEL_2M) |
600 | ref_prot = pgprot_large_2_4k(ref_prot); | 620 | ref_prot = pgprot_large_2_4k(ref_prot); |
601 | 621 | ||
602 | #ifdef CONFIG_X86_64 | 622 | #ifdef CONFIG_X86_64 |
603 | if (level == PG_LEVEL_1G) { | 623 | if (level == PG_LEVEL_1G) { |
604 | pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; | 624 | pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; |
605 | /* | 625 | /* |
606 | * Set the PSE flags only if the PRESENT flag is set | 626 | * Set the PSE flags only if the PRESENT flag is set |
607 | * otherwise pmd_present/pmd_huge will return true | 627 | * otherwise pmd_present/pmd_huge will return true |
608 | * even on a non present pmd. | 628 | * even on a non present pmd. |
609 | */ | 629 | */ |
610 | if (pgprot_val(ref_prot) & _PAGE_PRESENT) | 630 | if (pgprot_val(ref_prot) & _PAGE_PRESENT) |
611 | pgprot_val(ref_prot) |= _PAGE_PSE; | 631 | pgprot_val(ref_prot) |= _PAGE_PSE; |
612 | else | 632 | else |
613 | pgprot_val(ref_prot) &= ~_PAGE_PSE; | 633 | pgprot_val(ref_prot) &= ~_PAGE_PSE; |
614 | } | 634 | } |
615 | #endif | 635 | #endif |
616 | 636 | ||
617 | /* | 637 | /* |
618 | * Set the GLOBAL flags only if the PRESENT flag is set | 638 | * Set the GLOBAL flags only if the PRESENT flag is set |
619 | * otherwise pmd/pte_present will return true even on a non | 639 | * otherwise pmd/pte_present will return true even on a non |
620 | * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL | 640 | * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL |
621 | * for the ancient hardware that doesn't support it. | 641 | * for the ancient hardware that doesn't support it. |
622 | */ | 642 | */ |
623 | if (pgprot_val(ref_prot) & _PAGE_PRESENT) | 643 | if (pgprot_val(ref_prot) & _PAGE_PRESENT) |
624 | pgprot_val(ref_prot) |= _PAGE_GLOBAL; | 644 | pgprot_val(ref_prot) |= _PAGE_GLOBAL; |
625 | else | 645 | else |
626 | pgprot_val(ref_prot) &= ~_PAGE_GLOBAL; | 646 | pgprot_val(ref_prot) &= ~_PAGE_GLOBAL; |
627 | 647 | ||
628 | /* | 648 | /* |
629 | * Get the target pfn from the original entry: | 649 | * Get the target pfn from the original entry: |
630 | */ | 650 | */ |
631 | pfn = pte_pfn(*kpte); | 651 | pfn = pte_pfn(*kpte); |
632 | for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) | 652 | for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) |
633 | set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); | 653 | set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); |
634 | 654 | ||
635 | if (pfn_range_is_mapped(PFN_DOWN(__pa(address)), | 655 | if (pfn_range_is_mapped(PFN_DOWN(__pa(address)), |
636 | PFN_DOWN(__pa(address)) + 1)) | 656 | PFN_DOWN(__pa(address)) + 1)) |
637 | split_page_count(level); | 657 | split_page_count(level); |
638 | 658 | ||
639 | /* | 659 | /* |
640 | * Install the new, split up pagetable. | 660 | * Install the new, split up pagetable. |
641 | * | 661 | * |
642 | * We use the standard kernel pagetable protections for the new | 662 | * We use the standard kernel pagetable protections for the new |
643 | * pagetable; the actual ptes set above control the | 663 | * pagetable; the actual ptes set above control the |
644 | * primary protection behavior: | 664 | * primary protection behavior: |
645 | */ | 665 | */ |
646 | __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); | 666 | __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); |
647 | 667 | ||
648 | /* | 668 | /* |
649 | * Intel Atom erratum AAH41 workaround. | 669 | * Intel Atom erratum AAH41 workaround. |
650 | * | 670 | * |
651 | * The real fix should be in hw or in a microcode update, but | 671 | * The real fix should be in hw or in a microcode update, but |
652 | * we also probabilistically try to reduce the window of having | 672 | * we also probabilistically try to reduce the window of having |
653 | * a large TLB mixed with 4K TLBs while instruction fetches are | 673 | * a large TLB mixed with 4K TLBs while instruction fetches are |
654 | * going on. | 674 | * going on. |
655 | */ | 675 | */ |
656 | __flush_tlb_all(); | 676 | __flush_tlb_all(); |
657 | spin_unlock(&pgd_lock); | 677 | spin_unlock(&pgd_lock); |
658 | 678 | ||
659 | return 0; | 679 | return 0; |
660 | } | 680 | } |
661 | 681 | ||
662 | static int split_large_page(struct cpa_data *cpa, pte_t *kpte, | 682 | static int split_large_page(struct cpa_data *cpa, pte_t *kpte, |
663 | unsigned long address) | 683 | unsigned long address) |
664 | { | 684 | { |
665 | struct page *base; | 685 | struct page *base; |
666 | 686 | ||
667 | if (!debug_pagealloc) | 687 | if (!debug_pagealloc) |
668 | spin_unlock(&cpa_lock); | 688 | spin_unlock(&cpa_lock); |
669 | base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); | 689 | base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); |
670 | if (!debug_pagealloc) | 690 | if (!debug_pagealloc) |
671 | spin_lock(&cpa_lock); | 691 | spin_lock(&cpa_lock); |
672 | if (!base) | 692 | if (!base) |
673 | return -ENOMEM; | 693 | return -ENOMEM; |
674 | 694 | ||
675 | if (__split_large_page(cpa, kpte, address, base)) | 695 | if (__split_large_page(cpa, kpte, address, base)) |
676 | __free_page(base); | 696 | __free_page(base); |
677 | 697 | ||
678 | return 0; | 698 | return 0; |
679 | } | 699 | } |
680 | 700 | ||
681 | static bool try_to_free_pte_page(pte_t *pte) | 701 | static bool try_to_free_pte_page(pte_t *pte) |
682 | { | 702 | { |
683 | int i; | 703 | int i; |
684 | 704 | ||
685 | for (i = 0; i < PTRS_PER_PTE; i++) | 705 | for (i = 0; i < PTRS_PER_PTE; i++) |
686 | if (!pte_none(pte[i])) | 706 | if (!pte_none(pte[i])) |
687 | return false; | 707 | return false; |
688 | 708 | ||
689 | free_page((unsigned long)pte); | 709 | free_page((unsigned long)pte); |
690 | return true; | 710 | return true; |
691 | } | 711 | } |
692 | 712 | ||
693 | static bool try_to_free_pmd_page(pmd_t *pmd) | 713 | static bool try_to_free_pmd_page(pmd_t *pmd) |
694 | { | 714 | { |
695 | int i; | 715 | int i; |
696 | 716 | ||
697 | for (i = 0; i < PTRS_PER_PMD; i++) | 717 | for (i = 0; i < PTRS_PER_PMD; i++) |
698 | if (!pmd_none(pmd[i])) | 718 | if (!pmd_none(pmd[i])) |
699 | return false; | 719 | return false; |
700 | 720 | ||
701 | free_page((unsigned long)pmd); | 721 | free_page((unsigned long)pmd); |
702 | return true; | 722 | return true; |
703 | } | 723 | } |
704 | 724 | ||
705 | static bool try_to_free_pud_page(pud_t *pud) | 725 | static bool try_to_free_pud_page(pud_t *pud) |
706 | { | 726 | { |
707 | int i; | 727 | int i; |
708 | 728 | ||
709 | for (i = 0; i < PTRS_PER_PUD; i++) | 729 | for (i = 0; i < PTRS_PER_PUD; i++) |
710 | if (!pud_none(pud[i])) | 730 | if (!pud_none(pud[i])) |
711 | return false; | 731 | return false; |
712 | 732 | ||
713 | free_page((unsigned long)pud); | 733 | free_page((unsigned long)pud); |
714 | return true; | 734 | return true; |
715 | } | 735 | } |
716 | 736 | ||
717 | static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) | 737 | static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) |
718 | { | 738 | { |
719 | pte_t *pte = pte_offset_kernel(pmd, start); | 739 | pte_t *pte = pte_offset_kernel(pmd, start); |
720 | 740 | ||
721 | while (start < end) { | 741 | while (start < end) { |
722 | set_pte(pte, __pte(0)); | 742 | set_pte(pte, __pte(0)); |
723 | 743 | ||
724 | start += PAGE_SIZE; | 744 | start += PAGE_SIZE; |
725 | pte++; | 745 | pte++; |
726 | } | 746 | } |
727 | 747 | ||
728 | if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { | 748 | if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { |
729 | pmd_clear(pmd); | 749 | pmd_clear(pmd); |
730 | return true; | 750 | return true; |
731 | } | 751 | } |
732 | return false; | 752 | return false; |
733 | } | 753 | } |
734 | 754 | ||
735 | static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, | 755 | static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, |
736 | unsigned long start, unsigned long end) | 756 | unsigned long start, unsigned long end) |
737 | { | 757 | { |
738 | if (unmap_pte_range(pmd, start, end)) | 758 | if (unmap_pte_range(pmd, start, end)) |
739 | if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) | 759 | if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) |
740 | pud_clear(pud); | 760 | pud_clear(pud); |
741 | } | 761 | } |
742 | 762 | ||
743 | static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) | 763 | static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) |
744 | { | 764 | { |
745 | pmd_t *pmd = pmd_offset(pud, start); | 765 | pmd_t *pmd = pmd_offset(pud, start); |
746 | 766 | ||
747 | /* | 767 | /* |
748 | * Not on a 2MB page boundary? | 768 | * Not on a 2MB page boundary? |
749 | */ | 769 | */ |
750 | if (start & (PMD_SIZE - 1)) { | 770 | if (start & (PMD_SIZE - 1)) { |
751 | unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; | 771 | unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; |
752 | unsigned long pre_end = min_t(unsigned long, end, next_page); | 772 | unsigned long pre_end = min_t(unsigned long, end, next_page); |
753 | 773 | ||
754 | __unmap_pmd_range(pud, pmd, start, pre_end); | 774 | __unmap_pmd_range(pud, pmd, start, pre_end); |
755 | 775 | ||
756 | start = pre_end; | 776 | start = pre_end; |
757 | pmd++; | 777 | pmd++; |
758 | } | 778 | } |
759 | 779 | ||
760 | /* | 780 | /* |
761 | * Try to unmap in 2M chunks. | 781 | * Try to unmap in 2M chunks. |
762 | */ | 782 | */ |
763 | while (end - start >= PMD_SIZE) { | 783 | while (end - start >= PMD_SIZE) { |
764 | if (pmd_large(*pmd)) | 784 | if (pmd_large(*pmd)) |
765 | pmd_clear(pmd); | 785 | pmd_clear(pmd); |
766 | else | 786 | else |
767 | __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); | 787 | __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); |
768 | 788 | ||
769 | start += PMD_SIZE; | 789 | start += PMD_SIZE; |
770 | pmd++; | 790 | pmd++; |
771 | } | 791 | } |
772 | 792 | ||
773 | /* | 793 | /* |
774 | * 4K leftovers? | 794 | * 4K leftovers? |
775 | */ | 795 | */ |
776 | if (start < end) | 796 | if (start < end) |
777 | return __unmap_pmd_range(pud, pmd, start, end); | 797 | return __unmap_pmd_range(pud, pmd, start, end); |
778 | 798 | ||
779 | /* | 799 | /* |
780 | * Try again to free the PMD page if we haven't succeeded above. | 800 | * Try again to free the PMD page if we haven't succeeded above. |
781 | */ | 801 | */ |
782 | if (!pud_none(*pud)) | 802 | if (!pud_none(*pud)) |
783 | if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) | 803 | if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) |
784 | pud_clear(pud); | 804 | pud_clear(pud); |
785 | } | 805 | } |
786 | 806 | ||
787 | static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) | 807 | static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) |
788 | { | 808 | { |
789 | pud_t *pud = pud_offset(pgd, start); | 809 | pud_t *pud = pud_offset(pgd, start); |
790 | 810 | ||
791 | /* | 811 | /* |
792 | * Not on a GB page boundary? | 812 | * Not on a GB page boundary? |
793 | */ | 813 | */ |
794 | if (start & (PUD_SIZE - 1)) { | 814 | if (start & (PUD_SIZE - 1)) { |
795 | unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; | 815 | unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; |
796 | unsigned long pre_end = min_t(unsigned long, end, next_page); | 816 | unsigned long pre_end = min_t(unsigned long, end, next_page); |
797 | 817 | ||
798 | unmap_pmd_range(pud, start, pre_end); | 818 | unmap_pmd_range(pud, start, pre_end); |
799 | 819 | ||
800 | start = pre_end; | 820 | start = pre_end; |
801 | pud++; | 821 | pud++; |
802 | } | 822 | } |
803 | 823 | ||
804 | /* | 824 | /* |
805 | * Try to unmap in 1G chunks. | 825 | * Try to unmap in 1G chunks. |
806 | */ | 826 | */ |
807 | while (end - start >= PUD_SIZE) { | 827 | while (end - start >= PUD_SIZE) { |
808 | 828 | ||
809 | if (pud_large(*pud)) | 829 | if (pud_large(*pud)) |
810 | pud_clear(pud); | 830 | pud_clear(pud); |
811 | else | 831 | else |
812 | unmap_pmd_range(pud, start, start + PUD_SIZE); | 832 | unmap_pmd_range(pud, start, start + PUD_SIZE); |
813 | 833 | ||
814 | start += PUD_SIZE; | 834 | start += PUD_SIZE; |
815 | pud++; | 835 | pud++; |
816 | } | 836 | } |
817 | 837 | ||
818 | /* | 838 | /* |
819 | * 2M leftovers? | 839 | * 2M leftovers? |
820 | */ | 840 | */ |
821 | if (start < end) | 841 | if (start < end) |
822 | unmap_pmd_range(pud, start, end); | 842 | unmap_pmd_range(pud, start, end); |
823 | 843 | ||
824 | /* | 844 | /* |
825 | * No need to try to free the PUD page because we'll free it in | 845 | * No need to try to free the PUD page because we'll free it in |
826 | * populate_pgd's error path | 846 | * populate_pgd's error path |
827 | */ | 847 | */ |
828 | } | 848 | } |
829 | 849 | ||
830 | static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end) | 850 | static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end) |
831 | { | 851 | { |
832 | pgd_t *pgd_entry = root + pgd_index(addr); | 852 | pgd_t *pgd_entry = root + pgd_index(addr); |
833 | 853 | ||
834 | unmap_pud_range(pgd_entry, addr, end); | 854 | unmap_pud_range(pgd_entry, addr, end); |
835 | 855 | ||
836 | if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry))) | 856 | if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry))) |
837 | pgd_clear(pgd_entry); | 857 | pgd_clear(pgd_entry); |
838 | } | 858 | } |
839 | 859 | ||
840 | static int alloc_pte_page(pmd_t *pmd) | 860 | static int alloc_pte_page(pmd_t *pmd) |
841 | { | 861 | { |
842 | pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); | 862 | pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); |
843 | if (!pte) | 863 | if (!pte) |
844 | return -1; | 864 | return -1; |
845 | 865 | ||
846 | set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); | 866 | set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); |
847 | return 0; | 867 | return 0; |
848 | } | 868 | } |
849 | 869 | ||
850 | static int alloc_pmd_page(pud_t *pud) | 870 | static int alloc_pmd_page(pud_t *pud) |
851 | { | 871 | { |
852 | pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); | 872 | pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); |
853 | if (!pmd) | 873 | if (!pmd) |
854 | return -1; | 874 | return -1; |
855 | 875 | ||
856 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); | 876 | set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); |
857 | return 0; | 877 | return 0; |
858 | } | 878 | } |
859 | 879 | ||
860 | static void populate_pte(struct cpa_data *cpa, | 880 | static void populate_pte(struct cpa_data *cpa, |
861 | unsigned long start, unsigned long end, | 881 | unsigned long start, unsigned long end, |
862 | unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) | 882 | unsigned num_pages, pmd_t *pmd, pgprot_t pgprot) |
863 | { | 883 | { |
864 | pte_t *pte; | 884 | pte_t *pte; |
865 | 885 | ||
866 | pte = pte_offset_kernel(pmd, start); | 886 | pte = pte_offset_kernel(pmd, start); |
867 | 887 | ||
868 | while (num_pages-- && start < end) { | 888 | while (num_pages-- && start < end) { |
869 | 889 | ||
870 | /* deal with the NX bit */ | 890 | /* deal with the NX bit */ |
871 | if (!(pgprot_val(pgprot) & _PAGE_NX)) | 891 | if (!(pgprot_val(pgprot) & _PAGE_NX)) |
872 | cpa->pfn &= ~_PAGE_NX; | 892 | cpa->pfn &= ~_PAGE_NX; |
873 | 893 | ||
874 | set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot)); | 894 | set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot)); |
875 | 895 | ||
876 | start += PAGE_SIZE; | 896 | start += PAGE_SIZE; |
877 | cpa->pfn += PAGE_SIZE; | 897 | cpa->pfn += PAGE_SIZE; |
878 | pte++; | 898 | pte++; |
879 | } | 899 | } |
880 | } | 900 | } |
881 | 901 | ||
882 | static int populate_pmd(struct cpa_data *cpa, | 902 | static int populate_pmd(struct cpa_data *cpa, |
883 | unsigned long start, unsigned long end, | 903 | unsigned long start, unsigned long end, |
884 | unsigned num_pages, pud_t *pud, pgprot_t pgprot) | 904 | unsigned num_pages, pud_t *pud, pgprot_t pgprot) |
885 | { | 905 | { |
886 | unsigned int cur_pages = 0; | 906 | unsigned int cur_pages = 0; |
887 | pmd_t *pmd; | 907 | pmd_t *pmd; |
888 | pgprot_t pmd_pgprot; | 908 | pgprot_t pmd_pgprot; |
889 | 909 | ||
890 | /* | 910 | /* |
891 | * Not on a 2M boundary? | 911 | * Not on a 2M boundary? |
892 | */ | 912 | */ |
893 | if (start & (PMD_SIZE - 1)) { | 913 | if (start & (PMD_SIZE - 1)) { |
894 | unsigned long pre_end = start + (num_pages << PAGE_SHIFT); | 914 | unsigned long pre_end = start + (num_pages << PAGE_SHIFT); |
895 | unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; | 915 | unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; |
896 | 916 | ||
897 | pre_end = min_t(unsigned long, pre_end, next_page); | 917 | pre_end = min_t(unsigned long, pre_end, next_page); |
898 | cur_pages = (pre_end - start) >> PAGE_SHIFT; | 918 | cur_pages = (pre_end - start) >> PAGE_SHIFT; |
899 | cur_pages = min_t(unsigned int, num_pages, cur_pages); | 919 | cur_pages = min_t(unsigned int, num_pages, cur_pages); |
900 | 920 | ||
901 | /* | 921 | /* |
902 | * Need a PTE page? | 922 | * Need a PTE page? |
903 | */ | 923 | */ |
904 | pmd = pmd_offset(pud, start); | 924 | pmd = pmd_offset(pud, start); |
905 | if (pmd_none(*pmd)) | 925 | if (pmd_none(*pmd)) |
906 | if (alloc_pte_page(pmd)) | 926 | if (alloc_pte_page(pmd)) |
907 | return -1; | 927 | return -1; |
908 | 928 | ||
909 | populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); | 929 | populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot); |
910 | 930 | ||
911 | start = pre_end; | 931 | start = pre_end; |
912 | } | 932 | } |
913 | 933 | ||
914 | /* | 934 | /* |
915 | * We mapped them all? | 935 | * We mapped them all? |
916 | */ | 936 | */ |
917 | if (num_pages == cur_pages) | 937 | if (num_pages == cur_pages) |
918 | return cur_pages; | 938 | return cur_pages; |
919 | 939 | ||
920 | pmd_pgprot = pgprot_4k_2_large(pgprot); | 940 | pmd_pgprot = pgprot_4k_2_large(pgprot); |
921 | 941 | ||
922 | while (end - start >= PMD_SIZE) { | 942 | while (end - start >= PMD_SIZE) { |
923 | 943 | ||
924 | /* | 944 | /* |
925 | * We cannot use a 1G page so allocate a PMD page if needed. | 945 | * We cannot use a 1G page so allocate a PMD page if needed. |
926 | */ | 946 | */ |
927 | if (pud_none(*pud)) | 947 | if (pud_none(*pud)) |
928 | if (alloc_pmd_page(pud)) | 948 | if (alloc_pmd_page(pud)) |
929 | return -1; | 949 | return -1; |
930 | 950 | ||
931 | pmd = pmd_offset(pud, start); | 951 | pmd = pmd_offset(pud, start); |
932 | 952 | ||
933 | set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | | 953 | set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | |
934 | massage_pgprot(pmd_pgprot))); | 954 | massage_pgprot(pmd_pgprot))); |
935 | 955 | ||
936 | start += PMD_SIZE; | 956 | start += PMD_SIZE; |
937 | cpa->pfn += PMD_SIZE; | 957 | cpa->pfn += PMD_SIZE; |
938 | cur_pages += PMD_SIZE >> PAGE_SHIFT; | 958 | cur_pages += PMD_SIZE >> PAGE_SHIFT; |
939 | } | 959 | } |
940 | 960 | ||
941 | /* | 961 | /* |
942 | * Map trailing 4K pages. | 962 | * Map trailing 4K pages. |
943 | */ | 963 | */ |
944 | if (start < end) { | 964 | if (start < end) { |
945 | pmd = pmd_offset(pud, start); | 965 | pmd = pmd_offset(pud, start); |
946 | if (pmd_none(*pmd)) | 966 | if (pmd_none(*pmd)) |
947 | if (alloc_pte_page(pmd)) | 967 | if (alloc_pte_page(pmd)) |
948 | return -1; | 968 | return -1; |
949 | 969 | ||
950 | populate_pte(cpa, start, end, num_pages - cur_pages, | 970 | populate_pte(cpa, start, end, num_pages - cur_pages, |
951 | pmd, pgprot); | 971 | pmd, pgprot); |
952 | } | 972 | } |
953 | return num_pages; | 973 | return num_pages; |
954 | } | 974 | } |
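
The head/body/tail walk that populate_pte() and populate_pmd() perform is easiest to see in isolation. Below is a minimal userspace sketch (illustrative only, not kernel code; the constants mirror the x86-64 values) of how a page range decomposes into 4K pages up to the first 2M boundary, whole 2M pages, and trailing 4K pages. populate_pud() below repeats the same pattern one level up, at 1G granularity.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SIZE	(1UL << 21)		/* 2M */
#define PMD_MASK	(~(PMD_SIZE - 1))

/* Walk [start, start + num_pages * 4K) the way populate_pmd() does. */
static void decompose(unsigned long start, unsigned long num_pages)
{
	unsigned long end = start + (num_pages << PAGE_SHIFT);
	unsigned long cur = 0;

	/* Head: 4K pages up to the next 2M boundary (or the whole range). */
	if (start & (PMD_SIZE - 1)) {
		unsigned long pre_end = (start + PMD_SIZE) & PMD_MASK;

		if (pre_end > end)
			pre_end = end;
		cur = (pre_end - start) >> PAGE_SHIFT;
		printf("head: %4lu 4K pages at %#lx\n", cur, start);
		start = pre_end;
	}

	/* Body: whole 2M mappings, one PSE entry each. */
	while (end - start >= PMD_SIZE) {
		printf("body:  one 2M page  at %#lx\n", start);
		start += PMD_SIZE;
		cur += PMD_SIZE >> PAGE_SHIFT;
	}

	/* Tail: whatever is left is mapped with 4K pages again. */
	if (start < end) {
		printf("tail: %4lu 4K pages at %#lx\n",
		       (end - start) >> PAGE_SHIFT, start);
		cur += (end - start) >> PAGE_SHIFT;
	}
	printf("total: %lu pages\n", cur);
}

int main(void)
{
	/* 1100 pages starting 16K past a 2M boundary. */
	decompose(0x200000UL + 0x4000UL, 1100);
	return 0;
}

For 1100 pages starting 16K past a 2M boundary this prints a 508-page head, one 2M body mapping (512 pages), and an 80-page tail.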
955 | 975 | ||
956 | static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, | 976 | static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, |
957 | pgprot_t pgprot) | 977 | pgprot_t pgprot) |
958 | { | 978 | { |
959 | pud_t *pud; | 979 | pud_t *pud; |
960 | unsigned long end; | 980 | unsigned long end; |
961 | int cur_pages = 0; | 981 | int cur_pages = 0; |
962 | pgprot_t pud_pgprot; | 982 | pgprot_t pud_pgprot; |
963 | 983 | ||
964 | end = start + (cpa->numpages << PAGE_SHIFT); | 984 | end = start + (cpa->numpages << PAGE_SHIFT); |
965 | 985 | ||
966 | /* | 986 | /* |
967 | * Not on a Gb page boundary? => map everything up to it with | 987 | * Not on a Gb page boundary? => map everything up to it with |
968 | * smaller pages. | 988 | * smaller pages. |
969 | */ | 989 | */ |
970 | if (start & (PUD_SIZE - 1)) { | 990 | if (start & (PUD_SIZE - 1)) { |
971 | unsigned long pre_end; | 991 | unsigned long pre_end; |
972 | unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; | 992 | unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; |
973 | 993 | ||
974 | pre_end = min_t(unsigned long, end, next_page); | 994 | pre_end = min_t(unsigned long, end, next_page); |
975 | cur_pages = (pre_end - start) >> PAGE_SHIFT; | 995 | cur_pages = (pre_end - start) >> PAGE_SHIFT; |
976 | cur_pages = min_t(int, (int)cpa->numpages, cur_pages); | 996 | cur_pages = min_t(int, (int)cpa->numpages, cur_pages); |
977 | 997 | ||
978 | pud = pud_offset(pgd, start); | 998 | pud = pud_offset(pgd, start); |
979 | 999 | ||
980 | /* | 1000 | /* |
981 | * Need a PMD page? | 1001 | * Need a PMD page? |
982 | */ | 1002 | */ |
983 | if (pud_none(*pud)) | 1003 | if (pud_none(*pud)) |
984 | if (alloc_pmd_page(pud)) | 1004 | if (alloc_pmd_page(pud)) |
985 | return -1; | 1005 | return -1; |
986 | 1006 | ||
987 | cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, | 1007 | cur_pages = populate_pmd(cpa, start, pre_end, cur_pages, |
988 | pud, pgprot); | 1008 | pud, pgprot); |
989 | if (cur_pages < 0) | 1009 | if (cur_pages < 0) |
990 | return cur_pages; | 1010 | return cur_pages; |
991 | 1011 | ||
992 | start = pre_end; | 1012 | start = pre_end; |
993 | } | 1013 | } |
994 | 1014 | ||
995 | /* We mapped them all? */ | 1015 | /* We mapped them all? */ |
996 | if (cpa->numpages == cur_pages) | 1016 | if (cpa->numpages == cur_pages) |
997 | return cur_pages; | 1017 | return cur_pages; |
998 | 1018 | ||
999 | pud = pud_offset(pgd, start); | 1019 | pud = pud_offset(pgd, start); |
1000 | pud_pgprot = pgprot_4k_2_large(pgprot); | 1020 | pud_pgprot = pgprot_4k_2_large(pgprot); |
1001 | 1021 | ||
1002 | /* | 1022 | /* |
1003 | * Map everything starting from the Gb boundary, possibly with 1G pages | 1023 | * Map everything starting from the Gb boundary, possibly with 1G pages |
1004 | */ | 1024 | */ |
1005 | while (end - start >= PUD_SIZE) { | 1025 | while (end - start >= PUD_SIZE) { |
1006 | set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | | 1026 | set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | |
1007 | massage_pgprot(pud_pgprot))); | 1027 | massage_pgprot(pud_pgprot))); |
1008 | 1028 | ||
1009 | start += PUD_SIZE; | 1029 | start += PUD_SIZE; |
1010 | cpa->pfn += PUD_SIZE; | 1030 | cpa->pfn += PUD_SIZE; |
1011 | cur_pages += PUD_SIZE >> PAGE_SHIFT; | 1031 | cur_pages += PUD_SIZE >> PAGE_SHIFT; |
1012 | pud++; | 1032 | pud++; |
1013 | } | 1033 | } |
1014 | 1034 | ||
1015 | /* Map trailing leftover */ | 1035 | /* Map trailing leftover */ |
1016 | if (start < end) { | 1036 | if (start < end) { |
1017 | int tmp; | 1037 | int tmp; |
1018 | 1038 | ||
1019 | pud = pud_offset(pgd, start); | 1039 | pud = pud_offset(pgd, start); |
1020 | if (pud_none(*pud)) | 1040 | if (pud_none(*pud)) |
1021 | if (alloc_pmd_page(pud)) | 1041 | if (alloc_pmd_page(pud)) |
1022 | return -1; | 1042 | return -1; |
1023 | 1043 | ||
1024 | tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, | 1044 | tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages, |
1025 | pud, pgprot); | 1045 | pud, pgprot); |
1026 | if (tmp < 0) | 1046 | if (tmp < 0) |
1027 | return cur_pages; | 1047 | return cur_pages; |
1028 | 1048 | ||
1029 | cur_pages += tmp; | 1049 | cur_pages += tmp; |
1030 | } | 1050 | } |
1031 | return cur_pages; | 1051 | return cur_pages; |
1032 | } | 1052 | } |
1033 | 1053 | ||
1034 | /* | 1054 | /* |
1035 | * Restrictions for the kernel page table do not necessarily apply when mapping in | 1055 | * Restrictions for the kernel page table do not necessarily apply when mapping in |
1036 | * an alternate PGD. | 1056 | * an alternate PGD. |
1037 | */ | 1057 | */ |
1038 | static int populate_pgd(struct cpa_data *cpa, unsigned long addr) | 1058 | static int populate_pgd(struct cpa_data *cpa, unsigned long addr) |
1039 | { | 1059 | { |
1040 | pgprot_t pgprot = __pgprot(_KERNPG_TABLE); | 1060 | pgprot_t pgprot = __pgprot(_KERNPG_TABLE); |
1041 | pud_t *pud = NULL; /* shut up gcc */ | 1061 | pud_t *pud = NULL; /* shut up gcc */ |
1042 | pgd_t *pgd_entry; | 1062 | pgd_t *pgd_entry; |
1043 | int ret; | 1063 | int ret; |
1044 | 1064 | ||
1045 | pgd_entry = cpa->pgd + pgd_index(addr); | 1065 | pgd_entry = cpa->pgd + pgd_index(addr); |
1046 | 1066 | ||
1047 | /* | 1067 | /* |
1048 | * Allocate a PUD page and hand it down for mapping. | 1068 | * Allocate a PUD page and hand it down for mapping. |
1049 | */ | 1069 | */ |
1050 | if (pgd_none(*pgd_entry)) { | 1070 | if (pgd_none(*pgd_entry)) { |
1051 | pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); | 1071 | pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); |
1052 | if (!pud) | 1072 | if (!pud) |
1053 | return -1; | 1073 | return -1; |
1054 | 1074 | ||
1055 | set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); | 1075 | set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); |
1056 | } | 1076 | } |
1057 | 1077 | ||
1058 | pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); | 1078 | pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); |
1059 | pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); | 1079 | pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); |
1060 | 1080 | ||
1061 | ret = populate_pud(cpa, addr, pgd_entry, pgprot); | 1081 | ret = populate_pud(cpa, addr, pgd_entry, pgprot); |
1062 | if (ret < 0) { | 1082 | if (ret < 0) { |
1063 | unmap_pgd_range(cpa->pgd, addr, | 1083 | unmap_pgd_range(cpa->pgd, addr, |
1064 | addr + (cpa->numpages << PAGE_SHIFT)); | 1084 | addr + (cpa->numpages << PAGE_SHIFT)); |
1065 | return ret; | 1085 | return ret; |
1066 | } | 1086 | } |
1067 | 1087 | ||
1068 | cpa->numpages = ret; | 1088 | cpa->numpages = ret; |
1069 | return 0; | 1089 | return 0; |
1070 | } | 1090 | } |
1071 | 1091 | ||
1072 | static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, | 1092 | static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr, |
1073 | int primary) | 1093 | int primary) |
1074 | { | 1094 | { |
1075 | if (cpa->pgd) | 1095 | if (cpa->pgd) |
1076 | return populate_pgd(cpa, vaddr); | 1096 | return populate_pgd(cpa, vaddr); |
1077 | 1097 | ||
1078 | /* | 1098 | /* |
1079 | * Ignore all non-primary paths. | 1099 | * Ignore all non-primary paths. |
1080 | */ | 1100 | */ |
1081 | if (!primary) | 1101 | if (!primary) |
1082 | return 0; | 1102 | return 0; |
1083 | 1103 | ||
1084 | /* | 1104 | /* |
1085 | * Ignore the NULL PTE for kernel identity mapping, as it is expected | 1105 | * Ignore the NULL PTE for kernel identity mapping, as it is expected |
1086 | * to have holes. | 1106 | * to have holes. |
1087 | * Also set numpages to '1', indicating that we processed the cpa request for | 1107 | * Also set numpages to '1', indicating that we processed the cpa request for |
1088 | * one virtual address page and its pfn. TBD: numpages can be set based | 1108 | * one virtual address page and its pfn. TBD: numpages can be set based |
1089 | * on the initial value and the level returned by lookup_address(). | 1109 | * on the initial value and the level returned by lookup_address(). |
1090 | */ | 1110 | */ |
1091 | if (within(vaddr, PAGE_OFFSET, | 1111 | if (within(vaddr, PAGE_OFFSET, |
1092 | PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { | 1112 | PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { |
1093 | cpa->numpages = 1; | 1113 | cpa->numpages = 1; |
1094 | cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; | 1114 | cpa->pfn = __pa(vaddr) >> PAGE_SHIFT; |
1095 | return 0; | 1115 | return 0; |
1096 | } else { | 1116 | } else { |
1097 | WARN(1, KERN_WARNING "CPA: called for zero pte. " | 1117 | WARN(1, KERN_WARNING "CPA: called for zero pte. " |
1098 | "vaddr = %lx cpa->vaddr = %lx\n", vaddr, | 1118 | "vaddr = %lx cpa->vaddr = %lx\n", vaddr, |
1099 | *cpa->vaddr); | 1119 | *cpa->vaddr); |
1100 | 1120 | ||
1101 | return -EFAULT; | 1121 | return -EFAULT; |
1102 | } | 1122 | } |
1103 | } | 1123 | } |
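
__cpa_process_fault() relies on within(), a simple half-open range check defined earlier in this file; the upper bound is exclusive, so the first byte past the direct map does not count as mapped. A standalone illustration of the semantics:

#include <stdio.h>

/* Same semantics as the kernel helper: addr in [start, end). */
static int within(unsigned long addr, unsigned long start, unsigned long end)
{
	return addr >= start && addr < end;
}

int main(void)
{
	printf("%d\n", within(0x1000, 0x1000, 0x2000));	/* 1: start included */
	printf("%d\n", within(0x1fff, 0x1000, 0x2000));	/* 1: inside */
	printf("%d\n", within(0x2000, 0x1000, 0x2000));	/* 0: end excluded */
	return 0;
}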
1104 | 1124 | ||
1105 | static int __change_page_attr(struct cpa_data *cpa, int primary) | 1125 | static int __change_page_attr(struct cpa_data *cpa, int primary) |
1106 | { | 1126 | { |
1107 | unsigned long address; | 1127 | unsigned long address; |
1108 | int do_split, err; | 1128 | int do_split, err; |
1109 | unsigned int level; | 1129 | unsigned int level; |
1110 | pte_t *kpte, old_pte; | 1130 | pte_t *kpte, old_pte; |
1111 | 1131 | ||
1112 | if (cpa->flags & CPA_PAGES_ARRAY) { | 1132 | if (cpa->flags & CPA_PAGES_ARRAY) { |
1113 | struct page *page = cpa->pages[cpa->curpage]; | 1133 | struct page *page = cpa->pages[cpa->curpage]; |
1114 | if (unlikely(PageHighMem(page))) | 1134 | if (unlikely(PageHighMem(page))) |
1115 | return 0; | 1135 | return 0; |
1116 | address = (unsigned long)page_address(page); | 1136 | address = (unsigned long)page_address(page); |
1117 | } else if (cpa->flags & CPA_ARRAY) | 1137 | } else if (cpa->flags & CPA_ARRAY) |
1118 | address = cpa->vaddr[cpa->curpage]; | 1138 | address = cpa->vaddr[cpa->curpage]; |
1119 | else | 1139 | else |
1120 | address = *cpa->vaddr; | 1140 | address = *cpa->vaddr; |
1121 | repeat: | 1141 | repeat: |
1122 | kpte = _lookup_address_cpa(cpa, address, &level); | 1142 | kpte = _lookup_address_cpa(cpa, address, &level); |
1123 | if (!kpte) | 1143 | if (!kpte) |
1124 | return __cpa_process_fault(cpa, address, primary); | 1144 | return __cpa_process_fault(cpa, address, primary); |
1125 | 1145 | ||
1126 | old_pte = *kpte; | 1146 | old_pte = *kpte; |
1127 | if (!pte_val(old_pte)) | 1147 | if (!pte_val(old_pte)) |
1128 | return __cpa_process_fault(cpa, address, primary); | 1148 | return __cpa_process_fault(cpa, address, primary); |
1129 | 1149 | ||
1130 | if (level == PG_LEVEL_4K) { | 1150 | if (level == PG_LEVEL_4K) { |
1131 | pte_t new_pte; | 1151 | pte_t new_pte; |
1132 | pgprot_t new_prot = pte_pgprot(old_pte); | 1152 | pgprot_t new_prot = pte_pgprot(old_pte); |
1133 | unsigned long pfn = pte_pfn(old_pte); | 1153 | unsigned long pfn = pte_pfn(old_pte); |
1134 | 1154 | ||
1135 | pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); | 1155 | pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); |
1136 | pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); | 1156 | pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); |
1137 | 1157 | ||
1138 | new_prot = static_protections(new_prot, address, pfn); | 1158 | new_prot = static_protections(new_prot, address, pfn); |
1139 | 1159 | ||
1140 | /* | 1160 | /* |
1141 | * Set the GLOBAL flag only if the PRESENT flag is | 1161 | * Set the GLOBAL flag only if the PRESENT flag is |
1142 | * set; otherwise pte_present() will return true even | 1162 | * set; otherwise pte_present() will return true even |
1143 | * on a non-present pte. canon_pgprot() will clear | 1163 | * on a non-present pte. canon_pgprot() will clear |
1144 | * _PAGE_GLOBAL for the ancient hardware that doesn't | 1164 | * _PAGE_GLOBAL for the ancient hardware that doesn't |
1145 | * support it. | 1165 | * support it. |
1146 | */ | 1166 | */ |
1147 | if (pgprot_val(new_prot) & _PAGE_PRESENT) | 1167 | if (pgprot_val(new_prot) & _PAGE_PRESENT) |
1148 | pgprot_val(new_prot) |= _PAGE_GLOBAL; | 1168 | pgprot_val(new_prot) |= _PAGE_GLOBAL; |
1149 | else | 1169 | else |
1150 | pgprot_val(new_prot) &= ~_PAGE_GLOBAL; | 1170 | pgprot_val(new_prot) &= ~_PAGE_GLOBAL; |
1151 | 1171 | ||
1152 | /* | 1172 | /* |
1153 | * We need to keep the pfn from the existing PTE, | 1173 | * We need to keep the pfn from the existing PTE, |
1154 | * after all we're only going to change its attributes, | 1174 | * after all we're only going to change its attributes, |
1155 | * not the memory it points to. | 1175 | * not the memory it points to. |
1156 | */ | 1176 | */ |
1157 | new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); | 1177 | new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); |
1158 | cpa->pfn = pfn; | 1178 | cpa->pfn = pfn; |
1159 | /* | 1179 | /* |
1160 | * Do we really change anything ? | 1180 | * Do we really change anything ? |
1161 | */ | 1181 | */ |
1162 | if (pte_val(old_pte) != pte_val(new_pte)) { | 1182 | if (pte_val(old_pte) != pte_val(new_pte)) { |
1163 | set_pte_atomic(kpte, new_pte); | 1183 | set_pte_atomic(kpte, new_pte); |
1164 | cpa->flags |= CPA_FLUSHTLB; | 1184 | cpa->flags |= CPA_FLUSHTLB; |
1165 | } | 1185 | } |
1166 | cpa->numpages = 1; | 1186 | cpa->numpages = 1; |
1167 | return 0; | 1187 | return 0; |
1168 | } | 1188 | } |
1169 | 1189 | ||
1170 | /* | 1190 | /* |
1171 | * Check whether we can keep the large page intact | 1191 | * Check whether we can keep the large page intact |
1172 | * and just change the pte: | 1192 | * and just change the pte: |
1173 | */ | 1193 | */ |
1174 | do_split = try_preserve_large_page(kpte, address, cpa); | 1194 | do_split = try_preserve_large_page(kpte, address, cpa); |
1175 | /* | 1195 | /* |
1176 | * When the range fits into the existing large page, | 1196 | * When the range fits into the existing large page, |
1177 | * return. cpa->numpages and cpa->flags have been updated in | 1197 | * return. cpa->numpages and cpa->flags have been updated in |
1178 | * try_preserve_large_page(): | 1198 | * try_preserve_large_page(): |
1179 | */ | 1199 | */ |
1180 | if (do_split <= 0) | 1200 | if (do_split <= 0) |
1181 | return do_split; | 1201 | return do_split; |
1182 | 1202 | ||
1183 | /* | 1203 | /* |
1184 | * We have to split the large page: | 1204 | * We have to split the large page: |
1185 | */ | 1205 | */ |
1186 | err = split_large_page(cpa, kpte, address); | 1206 | err = split_large_page(cpa, kpte, address); |
1187 | if (!err) { | 1207 | if (!err) { |
1188 | /* | 1208 | /* |
1189 | * Do a global flush tlb after splitting the large page | 1209 | * Do a global flush tlb after splitting the large page |
1190 | * and before we do the actual change page attribute in the PTE. | 1210 | * and before we do the actual change page attribute in the PTE. |
1191 | * | 1211 | * |
1192 | * Without this, we violate the TLB application note, which says | 1212 | * Without this, we violate the TLB application note, which says |
1193 | * "The TLBs may contain both ordinary and large-page | 1213 | * "The TLBs may contain both ordinary and large-page |
1194 | * translations for a 4-KByte range of linear addresses. This | 1214 | * translations for a 4-KByte range of linear addresses. This |
1195 | * may occur if software modifies the paging structures so that | 1215 | * may occur if software modifies the paging structures so that |
1196 | * the page size used for the address range changes. If the two | 1216 | * the page size used for the address range changes. If the two |
1197 | * translations differ with respect to page frame or attributes | 1217 | * translations differ with respect to page frame or attributes |
1198 | * (e.g., permissions), processor behavior is undefined and may | 1218 | * (e.g., permissions), processor behavior is undefined and may |
1199 | * be implementation-specific." | 1219 | * be implementation-specific." |
1200 | * | 1220 | * |
1201 | * We do this global tlb flush inside the cpa_lock, so that we | 1221 | * We do this global tlb flush inside the cpa_lock, so that we |
1202 | * don't allow any other cpu with stale tlb entries to change | 1222 | * don't allow any other cpu with stale tlb entries to change |
1203 | * the page attributes in parallel for an address that also | 1223 | * the page attributes in parallel for an address that also |
1204 | * falls into the just-split large page entry. | 1224 | * falls into the just-split large page entry. |
1205 | */ | 1225 | */ |
1206 | flush_tlb_all(); | 1226 | flush_tlb_all(); |
1207 | goto repeat; | 1227 | goto repeat; |
1208 | } | 1228 | } |
1209 | 1229 | ||
1210 | return err; | 1230 | return err; |
1211 | } | 1231 | } |
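
Stripped of the pagetable plumbing, the 4K path above is a read-modify-write of the protection bits: clear mask_clr, set mask_set, then fix up _PAGE_GLOBAL against _PAGE_PRESENT. A userspace sketch with the standard x86 bit positions (a simplification: the real path also runs static_protections() and canon_pgprot()):

#include <stdio.h>

#define _PAGE_PRESENT	0x001UL		/* bit 0 */
#define _PAGE_RW	0x002UL		/* bit 1 */
#define _PAGE_GLOBAL	0x100UL		/* bit 8 */

static unsigned long apply_masks(unsigned long prot,
				 unsigned long mask_clr,
				 unsigned long mask_set)
{
	prot &= ~mask_clr;		/* clear requested bits */
	prot |= mask_set;		/* then set requested bits */

	/* GLOBAL is only meaningful on a present pte. */
	if (prot & _PAGE_PRESENT)
		prot |= _PAGE_GLOBAL;
	else
		prot &= ~_PAGE_GLOBAL;
	return prot;
}

int main(void)
{
	unsigned long prot = _PAGE_PRESENT | _PAGE_RW;

	/* Like set_memory_ro(): clear _PAGE_RW, set nothing. */
	prot = apply_masks(prot, _PAGE_RW, 0);
	printf("after ro: %#lx\n", prot);	/* 0x101: present + global */

	/* Like set_memory_np(): clear _PAGE_PRESENT; GLOBAL goes too. */
	prot = apply_masks(prot, _PAGE_PRESENT, 0);
	printf("after np: %#lx\n", prot);	/* 0 */
	return 0;
}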
1212 | 1232 | ||
1213 | static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); | 1233 | static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); |
1214 | 1234 | ||
1215 | static int cpa_process_alias(struct cpa_data *cpa) | 1235 | static int cpa_process_alias(struct cpa_data *cpa) |
1216 | { | 1236 | { |
1217 | struct cpa_data alias_cpa; | 1237 | struct cpa_data alias_cpa; |
1218 | unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); | 1238 | unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); |
1219 | unsigned long vaddr; | 1239 | unsigned long vaddr; |
1220 | int ret; | 1240 | int ret; |
1221 | 1241 | ||
1222 | if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) | 1242 | if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1)) |
1223 | return 0; | 1243 | return 0; |
1224 | 1244 | ||
1225 | /* | 1245 | /* |
1226 | * No need to redo when the primary call touched the direct | 1246 | * No need to redo when the primary call touched the direct |
1227 | * mapping already: | 1247 | * mapping already: |
1228 | */ | 1248 | */ |
1229 | if (cpa->flags & CPA_PAGES_ARRAY) { | 1249 | if (cpa->flags & CPA_PAGES_ARRAY) { |
1230 | struct page *page = cpa->pages[cpa->curpage]; | 1250 | struct page *page = cpa->pages[cpa->curpage]; |
1231 | if (unlikely(PageHighMem(page))) | 1251 | if (unlikely(PageHighMem(page))) |
1232 | return 0; | 1252 | return 0; |
1233 | vaddr = (unsigned long)page_address(page); | 1253 | vaddr = (unsigned long)page_address(page); |
1234 | } else if (cpa->flags & CPA_ARRAY) | 1254 | } else if (cpa->flags & CPA_ARRAY) |
1235 | vaddr = cpa->vaddr[cpa->curpage]; | 1255 | vaddr = cpa->vaddr[cpa->curpage]; |
1236 | else | 1256 | else |
1237 | vaddr = *cpa->vaddr; | 1257 | vaddr = *cpa->vaddr; |
1238 | 1258 | ||
1239 | if (!(within(vaddr, PAGE_OFFSET, | 1259 | if (!(within(vaddr, PAGE_OFFSET, |
1240 | PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { | 1260 | PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) { |
1241 | 1261 | ||
1242 | alias_cpa = *cpa; | 1262 | alias_cpa = *cpa; |
1243 | alias_cpa.vaddr = &laddr; | 1263 | alias_cpa.vaddr = &laddr; |
1244 | alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); | 1264 | alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); |
1245 | 1265 | ||
1246 | ret = __change_page_attr_set_clr(&alias_cpa, 0); | 1266 | ret = __change_page_attr_set_clr(&alias_cpa, 0); |
1247 | if (ret) | 1267 | if (ret) |
1248 | return ret; | 1268 | return ret; |
1249 | } | 1269 | } |
1250 | 1270 | ||
1251 | #ifdef CONFIG_X86_64 | 1271 | #ifdef CONFIG_X86_64 |
1252 | /* | 1272 | /* |
1253 | * If the primary call didn't touch the high mapping already | 1273 | * If the primary call didn't touch the high mapping already |
1254 | * and the physical address is inside the kernel map, we need | 1274 | * and the physical address is inside the kernel map, we need |
1255 | * to touch the high mapped kernel as well: | 1275 | * to touch the high mapped kernel as well: |
1256 | */ | 1276 | */ |
1257 | if (!within(vaddr, (unsigned long)_text, _brk_end) && | 1277 | if (!within(vaddr, (unsigned long)_text, _brk_end) && |
1258 | within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { | 1278 | within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) { |
1259 | unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + | 1279 | unsigned long temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + |
1260 | __START_KERNEL_map - phys_base; | 1280 | __START_KERNEL_map - phys_base; |
1261 | alias_cpa = *cpa; | 1281 | alias_cpa = *cpa; |
1262 | alias_cpa.vaddr = &temp_cpa_vaddr; | 1282 | alias_cpa.vaddr = &temp_cpa_vaddr; |
1263 | alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); | 1283 | alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); |
1264 | 1284 | ||
1265 | /* | 1285 | /* |
1266 | * The high mapping range is imprecise, so ignore the | 1286 | * The high mapping range is imprecise, so ignore the |
1267 | * return value. | 1287 | * return value. |
1268 | */ | 1288 | */ |
1269 | __change_page_attr_set_clr(&alias_cpa, 0); | 1289 | __change_page_attr_set_clr(&alias_cpa, 0); |
1270 | } | 1290 | } |
1271 | #endif | 1291 | #endif |
1272 | 1292 | ||
1273 | return 0; | 1293 | return 0; |
1274 | } | 1294 | } |
1275 | 1295 | ||
1276 | static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) | 1296 | static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) |
1277 | { | 1297 | { |
1278 | int ret, numpages = cpa->numpages; | 1298 | int ret, numpages = cpa->numpages; |
1279 | 1299 | ||
1280 | while (numpages) { | 1300 | while (numpages) { |
1281 | /* | 1301 | /* |
1282 | * Store the remaining nr of pages for the large page | 1302 | * Store the remaining nr of pages for the large page |
1283 | * preservation check. | 1303 | * preservation check. |
1284 | */ | 1304 | */ |
1285 | cpa->numpages = numpages; | 1305 | cpa->numpages = numpages; |
1286 | /* for array changes, we can't use large page */ | 1306 | /* for array changes, we can't use large page */ |
1287 | if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) | 1307 | if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) |
1288 | cpa->numpages = 1; | 1308 | cpa->numpages = 1; |
1289 | 1309 | ||
1290 | if (!debug_pagealloc) | 1310 | if (!debug_pagealloc) |
1291 | spin_lock(&cpa_lock); | 1311 | spin_lock(&cpa_lock); |
1292 | ret = __change_page_attr(cpa, checkalias); | 1312 | ret = __change_page_attr(cpa, checkalias); |
1293 | if (!debug_pagealloc) | 1313 | if (!debug_pagealloc) |
1294 | spin_unlock(&cpa_lock); | 1314 | spin_unlock(&cpa_lock); |
1295 | if (ret) | 1315 | if (ret) |
1296 | return ret; | 1316 | return ret; |
1297 | 1317 | ||
1298 | if (checkalias) { | 1318 | if (checkalias) { |
1299 | ret = cpa_process_alias(cpa); | 1319 | ret = cpa_process_alias(cpa); |
1300 | if (ret) | 1320 | if (ret) |
1301 | return ret; | 1321 | return ret; |
1302 | } | 1322 | } |
1303 | 1323 | ||
1304 | /* | 1324 | /* |
1305 | * Adjust the number of pages with the result of the | 1325 | * Adjust the number of pages with the result of the |
1306 | * CPA operation. Either a large page has been | 1326 | * CPA operation. Either a large page has been |
1307 | * preserved or a single page update happened. | 1327 | * preserved or a single page update happened. |
1308 | */ | 1328 | */ |
1309 | BUG_ON(cpa->numpages > numpages); | 1329 | BUG_ON(cpa->numpages > numpages); |
1310 | numpages -= cpa->numpages; | 1330 | numpages -= cpa->numpages; |
1311 | if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) | 1331 | if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) |
1312 | cpa->curpage++; | 1332 | cpa->curpage++; |
1313 | else | 1333 | else |
1314 | *cpa->vaddr += cpa->numpages * PAGE_SIZE; | 1334 | *cpa->vaddr += cpa->numpages * PAGE_SIZE; |
1315 | 1335 | ||
1316 | } | 1336 | } |
1317 | return 0; | 1337 | return 0; |
1318 | } | 1338 | } |
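
The loop above never assumes one page per iteration: when try_preserve_large_page() keeps a 2M mapping intact, a single __change_page_attr() call reports up to 512 pages of progress through cpa->numpages. A toy userspace model of that accounting (the chunking rule here is invented purely for illustration):

#include <stdio.h>

/* Pretend a call handles a full 512-page chunk when pfn-aligned. */
static unsigned long process_chunk(unsigned long pfn, unsigned long remaining)
{
	if (!(pfn & 511) && remaining >= 512)
		return 512;	/* whole 2M page preserved in one call */
	return 1;		/* single 4K update */
}

int main(void)
{
	unsigned long pfn = 510, remaining = 1030, calls = 0;

	while (remaining) {
		unsigned long done = process_chunk(pfn, remaining);

		remaining -= done;	/* mirrors: numpages -= cpa->numpages */
		pfn += done;
		calls++;
	}
	printf("%lu calls\n", calls);	/* 8: 2 singles + 2 x 512 + 4 singles */
	return 0;
}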
1319 | 1339 | ||
1320 | static int change_page_attr_set_clr(unsigned long *addr, int numpages, | 1340 | static int change_page_attr_set_clr(unsigned long *addr, int numpages, |
1321 | pgprot_t mask_set, pgprot_t mask_clr, | 1341 | pgprot_t mask_set, pgprot_t mask_clr, |
1322 | int force_split, int in_flag, | 1342 | int force_split, int in_flag, |
1323 | struct page **pages) | 1343 | struct page **pages) |
1324 | { | 1344 | { |
1325 | struct cpa_data cpa; | 1345 | struct cpa_data cpa; |
1326 | int ret, cache, checkalias; | 1346 | int ret, cache, checkalias; |
1327 | unsigned long baddr = 0; | 1347 | unsigned long baddr = 0; |
1328 | 1348 | ||
1329 | memset(&cpa, 0, sizeof(cpa)); | 1349 | memset(&cpa, 0, sizeof(cpa)); |
1330 | 1350 | ||
1331 | /* | 1351 | /* |
1332 | * Check if we are requested to change an unsupported | 1352 | * Check if we are requested to change an unsupported |
1333 | * feature: | 1353 | * feature: |
1334 | */ | 1354 | */ |
1335 | mask_set = canon_pgprot(mask_set); | 1355 | mask_set = canon_pgprot(mask_set); |
1336 | mask_clr = canon_pgprot(mask_clr); | 1356 | mask_clr = canon_pgprot(mask_clr); |
1337 | if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) | 1357 | if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) |
1338 | return 0; | 1358 | return 0; |
1339 | 1359 | ||
1340 | /* Ensure we are PAGE_SIZE aligned */ | 1360 | /* Ensure we are PAGE_SIZE aligned */ |
1341 | if (in_flag & CPA_ARRAY) { | 1361 | if (in_flag & CPA_ARRAY) { |
1342 | int i; | 1362 | int i; |
1343 | for (i = 0; i < numpages; i++) { | 1363 | for (i = 0; i < numpages; i++) { |
1344 | if (addr[i] & ~PAGE_MASK) { | 1364 | if (addr[i] & ~PAGE_MASK) { |
1345 | addr[i] &= PAGE_MASK; | 1365 | addr[i] &= PAGE_MASK; |
1346 | WARN_ON_ONCE(1); | 1366 | WARN_ON_ONCE(1); |
1347 | } | 1367 | } |
1348 | } | 1368 | } |
1349 | } else if (!(in_flag & CPA_PAGES_ARRAY)) { | 1369 | } else if (!(in_flag & CPA_PAGES_ARRAY)) { |
1350 | /* | 1370 | /* |
1351 | * in_flag of CPA_PAGES_ARRAY implies it is aligned. | 1371 | * in_flag of CPA_PAGES_ARRAY implies it is aligned. |
1352 | * No need to check in that case. | 1372 | * No need to check in that case. |
1353 | */ | 1373 | */ |
1354 | if (*addr & ~PAGE_MASK) { | 1374 | if (*addr & ~PAGE_MASK) { |
1355 | *addr &= PAGE_MASK; | 1375 | *addr &= PAGE_MASK; |
1356 | /* | 1376 | /* |
1357 | * People should not be passing in unaligned addresses: | 1377 | * People should not be passing in unaligned addresses: |
1358 | */ | 1378 | */ |
1359 | WARN_ON_ONCE(1); | 1379 | WARN_ON_ONCE(1); |
1360 | } | 1380 | } |
1361 | /* | 1381 | /* |
1362 | * Save address for cache flush. *addr is modified in the call | 1382 | * Save address for cache flush. *addr is modified in the call |
1363 | * to __change_page_attr_set_clr() below. | 1383 | * to __change_page_attr_set_clr() below. |
1364 | */ | 1384 | */ |
1365 | baddr = *addr; | 1385 | baddr = *addr; |
1366 | } | 1386 | } |
1367 | 1387 | ||
1368 | /* Must avoid aliasing mappings in the highmem code */ | 1388 | /* Must avoid aliasing mappings in the highmem code */ |
1369 | kmap_flush_unused(); | 1389 | kmap_flush_unused(); |
1370 | 1390 | ||
1371 | vm_unmap_aliases(); | 1391 | vm_unmap_aliases(); |
1372 | 1392 | ||
1373 | cpa.vaddr = addr; | 1393 | cpa.vaddr = addr; |
1374 | cpa.pages = pages; | 1394 | cpa.pages = pages; |
1375 | cpa.numpages = numpages; | 1395 | cpa.numpages = numpages; |
1376 | cpa.mask_set = mask_set; | 1396 | cpa.mask_set = mask_set; |
1377 | cpa.mask_clr = mask_clr; | 1397 | cpa.mask_clr = mask_clr; |
1378 | cpa.flags = 0; | 1398 | cpa.flags = 0; |
1379 | cpa.curpage = 0; | 1399 | cpa.curpage = 0; |
1380 | cpa.force_split = force_split; | 1400 | cpa.force_split = force_split; |
1381 | 1401 | ||
1382 | if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY)) | 1402 | if (in_flag & (CPA_ARRAY | CPA_PAGES_ARRAY)) |
1383 | cpa.flags |= in_flag; | 1403 | cpa.flags |= in_flag; |
1384 | 1404 | ||
1385 | /* No alias checking for _NX bit modifications */ | 1405 | /* No alias checking for _NX bit modifications */ |
1386 | checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; | 1406 | checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; |
1387 | 1407 | ||
1388 | ret = __change_page_attr_set_clr(&cpa, checkalias); | 1408 | ret = __change_page_attr_set_clr(&cpa, checkalias); |
1389 | 1409 | ||
1390 | /* | 1410 | /* |
1391 | * Check whether we really changed something: | 1411 | * Check whether we really changed something: |
1392 | */ | 1412 | */ |
1393 | if (!(cpa.flags & CPA_FLUSHTLB)) | 1413 | if (!(cpa.flags & CPA_FLUSHTLB)) |
1394 | goto out; | 1414 | goto out; |
1395 | 1415 | ||
1396 | /* | 1416 | /* |
1397 | * No need to flush, when we did not set any of the caching | 1417 | * No need to flush, when we did not set any of the caching |
1398 | * attributes: | 1418 | * attributes: |
1399 | */ | 1419 | */ |
1400 | cache = !!pgprot2cachemode(mask_set); | 1420 | cache = !!pgprot2cachemode(mask_set); |
1401 | 1421 | ||
1402 | /* | 1422 | /* |
1403 | * On success we use CLFLUSH, when the CPU supports it, to | 1423 | * On success we use CLFLUSH, when the CPU supports it, to |
1404 | * avoid the WBINVD. If the CPU does not support CLFLUSH, or | 1424 | * avoid the WBINVD. If the CPU does not support CLFLUSH, or |
1405 | * in the error case, we fall back to cpa_flush_all() (which | 1425 | * in the error case, we fall back to cpa_flush_all() (which |
1406 | * uses WBINVD): | 1426 | * uses WBINVD): |
1407 | */ | 1427 | */ |
1408 | if (!ret && cpu_has_clflush) { | 1428 | if (!ret && cpu_has_clflush) { |
1409 | if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { | 1429 | if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { |
1410 | cpa_flush_array(addr, numpages, cache, | 1430 | cpa_flush_array(addr, numpages, cache, |
1411 | cpa.flags, pages); | 1431 | cpa.flags, pages); |
1412 | } else | 1432 | } else |
1413 | cpa_flush_range(baddr, numpages, cache); | 1433 | cpa_flush_range(baddr, numpages, cache); |
1414 | } else | 1434 | } else |
1415 | cpa_flush_all(cache); | 1435 | cpa_flush_all(cache); |
1416 | 1436 | ||
1417 | out: | 1437 | out: |
1418 | return ret; | 1438 | return ret; |
1419 | } | 1439 | } |
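
The alignment fix-up at the top of change_page_attr_set_clr() is worth a second look: ~PAGE_MASK selects the offset bits inside a page, so `addr & ~PAGE_MASK` is non-zero exactly when the address is unaligned. A tiny demonstration:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long addr = 0x12345678UL;

	if (addr & ~PAGE_MASK) {	/* offset bits set: unaligned */
		fprintf(stderr, "unaligned address %#lx\n", addr);
		addr &= PAGE_MASK;	/* round down to the page start */
	}
	printf("aligned: %#lx\n", addr);	/* 0x12345000 */
	return 0;
}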
1420 | 1440 | ||
1421 | static inline int change_page_attr_set(unsigned long *addr, int numpages, | 1441 | static inline int change_page_attr_set(unsigned long *addr, int numpages, |
1422 | pgprot_t mask, int array) | 1442 | pgprot_t mask, int array) |
1423 | { | 1443 | { |
1424 | return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, | 1444 | return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, |
1425 | (array ? CPA_ARRAY : 0), NULL); | 1445 | (array ? CPA_ARRAY : 0), NULL); |
1426 | } | 1446 | } |
1427 | 1447 | ||
1428 | static inline int change_page_attr_clear(unsigned long *addr, int numpages, | 1448 | static inline int change_page_attr_clear(unsigned long *addr, int numpages, |
1429 | pgprot_t mask, int array) | 1449 | pgprot_t mask, int array) |
1430 | { | 1450 | { |
1431 | return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, | 1451 | return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, |
1432 | (array ? CPA_ARRAY : 0), NULL); | 1452 | (array ? CPA_ARRAY : 0), NULL); |
1433 | } | 1453 | } |
1434 | 1454 | ||
1435 | static inline int cpa_set_pages_array(struct page **pages, int numpages, | 1455 | static inline int cpa_set_pages_array(struct page **pages, int numpages, |
1436 | pgprot_t mask) | 1456 | pgprot_t mask) |
1437 | { | 1457 | { |
1438 | return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, | 1458 | return change_page_attr_set_clr(NULL, numpages, mask, __pgprot(0), 0, |
1439 | CPA_PAGES_ARRAY, pages); | 1459 | CPA_PAGES_ARRAY, pages); |
1440 | } | 1460 | } |
1441 | 1461 | ||
1442 | static inline int cpa_clear_pages_array(struct page **pages, int numpages, | 1462 | static inline int cpa_clear_pages_array(struct page **pages, int numpages, |
1443 | pgprot_t mask) | 1463 | pgprot_t mask) |
1444 | { | 1464 | { |
1445 | return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, | 1465 | return change_page_attr_set_clr(NULL, numpages, __pgprot(0), mask, 0, |
1446 | CPA_PAGES_ARRAY, pages); | 1466 | CPA_PAGES_ARRAY, pages); |
1447 | } | 1467 | } |
1448 | 1468 | ||
1449 | int _set_memory_uc(unsigned long addr, int numpages) | 1469 | int _set_memory_uc(unsigned long addr, int numpages) |
1450 | { | 1470 | { |
1451 | /* | 1471 | /* |
1452 | * for now UC MINUS. see comments in ioremap_nocache() | 1472 | * for now UC MINUS. see comments in ioremap_nocache() |
1453 | */ | 1473 | */ |
1454 | return change_page_attr_set(&addr, numpages, | 1474 | return change_page_attr_set(&addr, numpages, |
1455 | cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), | 1475 | cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), |
1456 | 0); | 1476 | 0); |
1457 | } | 1477 | } |
1458 | 1478 | ||
1459 | int set_memory_uc(unsigned long addr, int numpages) | 1479 | int set_memory_uc(unsigned long addr, int numpages) |
1460 | { | 1480 | { |
1461 | int ret; | 1481 | int ret; |
1462 | 1482 | ||
1463 | /* | 1483 | /* |
1464 | * for now UC MINUS. see comments in ioremap_nocache() | 1484 | * for now UC MINUS. see comments in ioremap_nocache() |
1465 | */ | 1485 | */ |
1466 | ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, | 1486 | ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, |
1467 | _PAGE_CACHE_MODE_UC_MINUS, NULL); | 1487 | _PAGE_CACHE_MODE_UC_MINUS, NULL); |
1468 | if (ret) | 1488 | if (ret) |
1469 | goto out_err; | 1489 | goto out_err; |
1470 | 1490 | ||
1471 | ret = _set_memory_uc(addr, numpages); | 1491 | ret = _set_memory_uc(addr, numpages); |
1472 | if (ret) | 1492 | if (ret) |
1473 | goto out_free; | 1493 | goto out_free; |
1474 | 1494 | ||
1475 | return 0; | 1495 | return 0; |
1476 | 1496 | ||
1477 | out_free: | 1497 | out_free: |
1478 | free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); | 1498 | free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); |
1479 | out_err: | 1499 | out_err: |
1480 | return ret; | 1500 | return ret; |
1481 | } | 1501 | } |
1482 | EXPORT_SYMBOL(set_memory_uc); | 1502 | EXPORT_SYMBOL(set_memory_uc); |
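
set_memory_uc() couples the attribute change with a memtype reservation, so every successful call must eventually be balanced by set_memory_wb() before the memory is freed. A hypothetical module sketch of that discipline (the module itself is illustrative, not part of this commit):

#include <linux/module.h>
#include <linux/gfp.h>
#include <asm/cacheflush.h>	/* set_memory_uc()/set_memory_wb() */

static unsigned long buf;

static int __init uc_demo_init(void)
{
	int ret;

	buf = __get_free_page(GFP_KERNEL);	/* cacheable (WB) by default */
	if (!buf)
		return -ENOMEM;

	ret = set_memory_uc(buf, 1);		/* reserves memtype + flushes */
	if (ret) {
		free_page(buf);
		return ret;
	}
	return 0;
}

static void __exit uc_demo_exit(void)
{
	set_memory_wb(buf, 1);	/* releases the memtype reservation */
	free_page(buf);
}

module_init(uc_demo_init);
module_exit(uc_demo_exit);
MODULE_LICENSE("GPL");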
1483 | 1503 | ||
1484 | static int _set_memory_array(unsigned long *addr, int addrinarray, | 1504 | static int _set_memory_array(unsigned long *addr, int addrinarray, |
1485 | enum page_cache_mode new_type) | 1505 | enum page_cache_mode new_type) |
1486 | { | 1506 | { |
1487 | int i, j; | 1507 | int i, j; |
1488 | int ret; | 1508 | int ret; |
1489 | 1509 | ||
1490 | /* | 1510 | /* |
1491 | * for now UC MINUS. see comments in ioremap_nocache() | 1511 | * for now UC MINUS. see comments in ioremap_nocache() |
1492 | */ | 1512 | */ |
1493 | for (i = 0; i < addrinarray; i++) { | 1513 | for (i = 0; i < addrinarray; i++) { |
1494 | ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, | 1514 | ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, |
1495 | new_type, NULL); | 1515 | new_type, NULL); |
1496 | if (ret) | 1516 | if (ret) |
1497 | goto out_free; | 1517 | goto out_free; |
1498 | } | 1518 | } |
1499 | 1519 | ||
1500 | ret = change_page_attr_set(addr, addrinarray, | 1520 | ret = change_page_attr_set(addr, addrinarray, |
1501 | cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), | 1521 | cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), |
1502 | 1); | 1522 | 1); |
1503 | 1523 | ||
1504 | if (!ret && new_type == _PAGE_CACHE_MODE_WC) | 1524 | if (!ret && new_type == _PAGE_CACHE_MODE_WC) |
1505 | ret = change_page_attr_set_clr(addr, addrinarray, | 1525 | ret = change_page_attr_set_clr(addr, addrinarray, |
1506 | cachemode2pgprot( | 1526 | cachemode2pgprot( |
1507 | _PAGE_CACHE_MODE_WC), | 1527 | _PAGE_CACHE_MODE_WC), |
1508 | __pgprot(_PAGE_CACHE_MASK), | 1528 | __pgprot(_PAGE_CACHE_MASK), |
1509 | 0, CPA_ARRAY, NULL); | 1529 | 0, CPA_ARRAY, NULL); |
1510 | if (ret) | 1530 | if (ret) |
1511 | goto out_free; | 1531 | goto out_free; |
1512 | 1532 | ||
1513 | return 0; | 1533 | return 0; |
1514 | 1534 | ||
1515 | out_free: | 1535 | out_free: |
1516 | for (j = 0; j < i; j++) | 1536 | for (j = 0; j < i; j++) |
1517 | free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE); | 1537 | free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE); |
1518 | 1538 | ||
1519 | return ret; | 1539 | return ret; |
1520 | } | 1540 | } |
1521 | 1541 | ||
1522 | int set_memory_array_uc(unsigned long *addr, int addrinarray) | 1542 | int set_memory_array_uc(unsigned long *addr, int addrinarray) |
1523 | { | 1543 | { |
1524 | return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); | 1544 | return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); |
1525 | } | 1545 | } |
1526 | EXPORT_SYMBOL(set_memory_array_uc); | 1546 | EXPORT_SYMBOL(set_memory_array_uc); |
1527 | 1547 | ||
1528 | int set_memory_array_wc(unsigned long *addr, int addrinarray) | 1548 | int set_memory_array_wc(unsigned long *addr, int addrinarray) |
1529 | { | 1549 | { |
1530 | return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC); | 1550 | return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC); |
1531 | } | 1551 | } |
1532 | EXPORT_SYMBOL(set_memory_array_wc); | 1552 | EXPORT_SYMBOL(set_memory_array_wc); |
1533 | 1553 | ||
1534 | int _set_memory_wc(unsigned long addr, int numpages) | 1554 | int _set_memory_wc(unsigned long addr, int numpages) |
1535 | { | 1555 | { |
1536 | int ret; | 1556 | int ret; |
1537 | unsigned long addr_copy = addr; | 1557 | unsigned long addr_copy = addr; |
1538 | 1558 | ||
1539 | ret = change_page_attr_set(&addr, numpages, | 1559 | ret = change_page_attr_set(&addr, numpages, |
1540 | cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), | 1560 | cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), |
1541 | 0); | 1561 | 0); |
1542 | if (!ret) { | 1562 | if (!ret) { |
1543 | ret = change_page_attr_set_clr(&addr_copy, numpages, | 1563 | ret = change_page_attr_set_clr(&addr_copy, numpages, |
1544 | cachemode2pgprot( | 1564 | cachemode2pgprot( |
1545 | _PAGE_CACHE_MODE_WC), | 1565 | _PAGE_CACHE_MODE_WC), |
1546 | __pgprot(_PAGE_CACHE_MASK), | 1566 | __pgprot(_PAGE_CACHE_MASK), |
1547 | 0, 0, NULL); | 1567 | 0, 0, NULL); |
1548 | } | 1568 | } |
1549 | return ret; | 1569 | return ret; |
1550 | } | 1570 | } |
1551 | 1571 | ||
1552 | int set_memory_wc(unsigned long addr, int numpages) | 1572 | int set_memory_wc(unsigned long addr, int numpages) |
1553 | { | 1573 | { |
1554 | int ret; | 1574 | int ret; |
1555 | 1575 | ||
1556 | if (!pat_enabled) | 1576 | if (!pat_enabled) |
1557 | return set_memory_uc(addr, numpages); | 1577 | return set_memory_uc(addr, numpages); |
1558 | 1578 | ||
1559 | ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, | 1579 | ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, |
1560 | _PAGE_CACHE_MODE_WC, NULL); | 1580 | _PAGE_CACHE_MODE_WC, NULL); |
1561 | if (ret) | 1581 | if (ret) |
1562 | goto out_err; | 1582 | goto out_err; |
1563 | 1583 | ||
1564 | ret = _set_memory_wc(addr, numpages); | 1584 | ret = _set_memory_wc(addr, numpages); |
1565 | if (ret) | 1585 | if (ret) |
1566 | goto out_free; | 1586 | goto out_free; |
1567 | 1587 | ||
1568 | return 0; | 1588 | return 0; |
1569 | 1589 | ||
1570 | out_free: | 1590 | out_free: |
1571 | free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); | 1591 | free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); |
1572 | out_err: | 1592 | out_err: |
1573 | return ret; | 1593 | return ret; |
1574 | } | 1594 | } |
1575 | EXPORT_SYMBOL(set_memory_wc); | 1595 | EXPORT_SYMBOL(set_memory_wc); |
1576 | 1596 | ||
1577 | int _set_memory_wb(unsigned long addr, int numpages) | 1597 | int _set_memory_wb(unsigned long addr, int numpages) |
1578 | { | 1598 | { |
1579 | /* WB cache mode is hard wired to all cache attribute bits being 0 */ | 1599 | /* WB cache mode is hard wired to all cache attribute bits being 0 */ |
1580 | return change_page_attr_clear(&addr, numpages, | 1600 | return change_page_attr_clear(&addr, numpages, |
1581 | __pgprot(_PAGE_CACHE_MASK), 0); | 1601 | __pgprot(_PAGE_CACHE_MASK), 0); |
1582 | } | 1602 | } |
1583 | 1603 | ||
1584 | int set_memory_wb(unsigned long addr, int numpages) | 1604 | int set_memory_wb(unsigned long addr, int numpages) |
1585 | { | 1605 | { |
1586 | int ret; | 1606 | int ret; |
1587 | 1607 | ||
1588 | ret = _set_memory_wb(addr, numpages); | 1608 | ret = _set_memory_wb(addr, numpages); |
1589 | if (ret) | 1609 | if (ret) |
1590 | return ret; | 1610 | return ret; |
1591 | 1611 | ||
1592 | free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); | 1612 | free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); |
1593 | return 0; | 1613 | return 0; |
1594 | } | 1614 | } |
1595 | EXPORT_SYMBOL(set_memory_wb); | 1615 | EXPORT_SYMBOL(set_memory_wb); |
1596 | 1616 | ||
1597 | int set_memory_array_wb(unsigned long *addr, int addrinarray) | 1617 | int set_memory_array_wb(unsigned long *addr, int addrinarray) |
1598 | { | 1618 | { |
1599 | int i; | 1619 | int i; |
1600 | int ret; | 1620 | int ret; |
1601 | 1621 | ||
1602 | /* WB cache mode is hard wired to all cache attribute bits being 0 */ | 1622 | /* WB cache mode is hard wired to all cache attribute bits being 0 */ |
1603 | ret = change_page_attr_clear(addr, addrinarray, | 1623 | ret = change_page_attr_clear(addr, addrinarray, |
1604 | __pgprot(_PAGE_CACHE_MASK), 1); | 1624 | __pgprot(_PAGE_CACHE_MASK), 1); |
1605 | if (ret) | 1625 | if (ret) |
1606 | return ret; | 1626 | return ret; |
1607 | 1627 | ||
1608 | for (i = 0; i < addrinarray; i++) | 1628 | for (i = 0; i < addrinarray; i++) |
1609 | free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); | 1629 | free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); |
1610 | 1630 | ||
1611 | return 0; | 1631 | return 0; |
1612 | } | 1632 | } |
1613 | EXPORT_SYMBOL(set_memory_array_wb); | 1633 | EXPORT_SYMBOL(set_memory_array_wb); |
1614 | 1634 | ||
1615 | int set_memory_x(unsigned long addr, int numpages) | 1635 | int set_memory_x(unsigned long addr, int numpages) |
1616 | { | 1636 | { |
1617 | if (!(__supported_pte_mask & _PAGE_NX)) | 1637 | if (!(__supported_pte_mask & _PAGE_NX)) |
1618 | return 0; | 1638 | return 0; |
1619 | 1639 | ||
1620 | return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); | 1640 | return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); |
1621 | } | 1641 | } |
1622 | EXPORT_SYMBOL(set_memory_x); | 1642 | EXPORT_SYMBOL(set_memory_x); |
1623 | 1643 | ||
1624 | int set_memory_nx(unsigned long addr, int numpages) | 1644 | int set_memory_nx(unsigned long addr, int numpages) |
1625 | { | 1645 | { |
1626 | if (!(__supported_pte_mask & _PAGE_NX)) | 1646 | if (!(__supported_pte_mask & _PAGE_NX)) |
1627 | return 0; | 1647 | return 0; |
1628 | 1648 | ||
1629 | return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); | 1649 | return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); |
1630 | } | 1650 | } |
1631 | EXPORT_SYMBOL(set_memory_nx); | 1651 | EXPORT_SYMBOL(set_memory_nx); |
1632 | 1652 | ||
1633 | int set_memory_ro(unsigned long addr, int numpages) | 1653 | int set_memory_ro(unsigned long addr, int numpages) |
1634 | { | 1654 | { |
1635 | return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); | 1655 | return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); |
1636 | } | 1656 | } |
1637 | EXPORT_SYMBOL_GPL(set_memory_ro); | 1657 | EXPORT_SYMBOL_GPL(set_memory_ro); |
1638 | 1658 | ||
1639 | int set_memory_rw(unsigned long addr, int numpages) | 1659 | int set_memory_rw(unsigned long addr, int numpages) |
1640 | { | 1660 | { |
1641 | return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); | 1661 | return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); |
1642 | } | 1662 | } |
1643 | EXPORT_SYMBOL_GPL(set_memory_rw); | 1663 | EXPORT_SYMBOL_GPL(set_memory_rw); |
1644 | 1664 | ||
1645 | int set_memory_np(unsigned long addr, int numpages) | 1665 | int set_memory_np(unsigned long addr, int numpages) |
1646 | { | 1666 | { |
1647 | return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); | 1667 | return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); |
1648 | } | 1668 | } |
1649 | 1669 | ||
1650 | int set_memory_4k(unsigned long addr, int numpages) | 1670 | int set_memory_4k(unsigned long addr, int numpages) |
1651 | { | 1671 | { |
1652 | return change_page_attr_set_clr(&addr, numpages, __pgprot(0), | 1672 | return change_page_attr_set_clr(&addr, numpages, __pgprot(0), |
1653 | __pgprot(0), 1, 0, NULL); | 1673 | __pgprot(0), 1, 0, NULL); |
1654 | } | 1674 | } |
1655 | 1675 | ||
1656 | int set_pages_uc(struct page *page, int numpages) | 1676 | int set_pages_uc(struct page *page, int numpages) |
1657 | { | 1677 | { |
1658 | unsigned long addr = (unsigned long)page_address(page); | 1678 | unsigned long addr = (unsigned long)page_address(page); |
1659 | 1679 | ||
1660 | return set_memory_uc(addr, numpages); | 1680 | return set_memory_uc(addr, numpages); |
1661 | } | 1681 | } |
1662 | EXPORT_SYMBOL(set_pages_uc); | 1682 | EXPORT_SYMBOL(set_pages_uc); |
1663 | 1683 | ||
1664 | static int _set_pages_array(struct page **pages, int addrinarray, | 1684 | static int _set_pages_array(struct page **pages, int addrinarray, |
1665 | enum page_cache_mode new_type) | 1685 | enum page_cache_mode new_type) |
1666 | { | 1686 | { |
1667 | unsigned long start; | 1687 | unsigned long start; |
1668 | unsigned long end; | 1688 | unsigned long end; |
1669 | int i; | 1689 | int i; |
1670 | int free_idx; | 1690 | int free_idx; |
1671 | int ret; | 1691 | int ret; |
1672 | 1692 | ||
1673 | for (i = 0; i < addrinarray; i++) { | 1693 | for (i = 0; i < addrinarray; i++) { |
1674 | if (PageHighMem(pages[i])) | 1694 | if (PageHighMem(pages[i])) |
1675 | continue; | 1695 | continue; |
1676 | start = page_to_pfn(pages[i]) << PAGE_SHIFT; | 1696 | start = page_to_pfn(pages[i]) << PAGE_SHIFT; |
1677 | end = start + PAGE_SIZE; | 1697 | end = start + PAGE_SIZE; |
1678 | if (reserve_memtype(start, end, new_type, NULL)) | 1698 | if (reserve_memtype(start, end, new_type, NULL)) |
1679 | goto err_out; | 1699 | goto err_out; |
1680 | } | 1700 | } |
1681 | 1701 | ||
1682 | ret = cpa_set_pages_array(pages, addrinarray, | 1702 | ret = cpa_set_pages_array(pages, addrinarray, |
1683 | cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS)); | 1703 | cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS)); |
1684 | if (!ret && new_type == _PAGE_CACHE_MODE_WC) | 1704 | if (!ret && new_type == _PAGE_CACHE_MODE_WC) |
1685 | ret = change_page_attr_set_clr(NULL, addrinarray, | 1705 | ret = change_page_attr_set_clr(NULL, addrinarray, |
1686 | cachemode2pgprot( | 1706 | cachemode2pgprot( |
1687 | _PAGE_CACHE_MODE_WC), | 1707 | _PAGE_CACHE_MODE_WC), |
1688 | __pgprot(_PAGE_CACHE_MASK), | 1708 | __pgprot(_PAGE_CACHE_MASK), |
1689 | 0, CPA_PAGES_ARRAY, pages); | 1709 | 0, CPA_PAGES_ARRAY, pages); |
1690 | if (ret) | 1710 | if (ret) |
1691 | goto err_out; | 1711 | goto err_out; |
1692 | return 0; /* Success */ | 1712 | return 0; /* Success */ |
1693 | err_out: | 1713 | err_out: |
1694 | free_idx = i; | 1714 | free_idx = i; |
1695 | for (i = 0; i < free_idx; i++) { | 1715 | for (i = 0; i < free_idx; i++) { |
1696 | if (PageHighMem(pages[i])) | 1716 | if (PageHighMem(pages[i])) |
1697 | continue; | 1717 | continue; |
1698 | start = page_to_pfn(pages[i]) << PAGE_SHIFT; | 1718 | start = page_to_pfn(pages[i]) << PAGE_SHIFT; |
1699 | end = start + PAGE_SIZE; | 1719 | end = start + PAGE_SIZE; |
1700 | free_memtype(start, end); | 1720 | free_memtype(start, end); |
1701 | } | 1721 | } |
1702 | return -EINVAL; | 1722 | return -EINVAL; |
1703 | } | 1723 | } |
1704 | 1724 | ||
1705 | int set_pages_array_uc(struct page **pages, int addrinarray) | 1725 | int set_pages_array_uc(struct page **pages, int addrinarray) |
1706 | { | 1726 | { |
1707 | return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); | 1727 | return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); |
1708 | } | 1728 | } |
1709 | EXPORT_SYMBOL(set_pages_array_uc); | 1729 | EXPORT_SYMBOL(set_pages_array_uc); |
1710 | 1730 | ||
1711 | int set_pages_array_wc(struct page **pages, int addrinarray) | 1731 | int set_pages_array_wc(struct page **pages, int addrinarray) |
1712 | { | 1732 | { |
1713 | return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC); | 1733 | return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WC); |
1714 | } | 1734 | } |
1715 | EXPORT_SYMBOL(set_pages_array_wc); | 1735 | EXPORT_SYMBOL(set_pages_array_wc); |
1716 | 1736 | ||
1717 | int set_pages_wb(struct page *page, int numpages) | 1737 | int set_pages_wb(struct page *page, int numpages) |
1718 | { | 1738 | { |
1719 | unsigned long addr = (unsigned long)page_address(page); | 1739 | unsigned long addr = (unsigned long)page_address(page); |
1720 | 1740 | ||
1721 | return set_memory_wb(addr, numpages); | 1741 | return set_memory_wb(addr, numpages); |
1722 | } | 1742 | } |
1723 | EXPORT_SYMBOL(set_pages_wb); | 1743 | EXPORT_SYMBOL(set_pages_wb); |
1724 | 1744 | ||
1725 | int set_pages_array_wb(struct page **pages, int addrinarray) | 1745 | int set_pages_array_wb(struct page **pages, int addrinarray) |
1726 | { | 1746 | { |
1727 | int retval; | 1747 | int retval; |
1728 | unsigned long start; | 1748 | unsigned long start; |
1729 | unsigned long end; | 1749 | unsigned long end; |
1730 | int i; | 1750 | int i; |
1731 | 1751 | ||
1732 | /* WB cache mode is hard wired to all cache attribute bits being 0 */ | 1752 | /* WB cache mode is hard wired to all cache attribute bits being 0 */ |
1733 | retval = cpa_clear_pages_array(pages, addrinarray, | 1753 | retval = cpa_clear_pages_array(pages, addrinarray, |
1734 | __pgprot(_PAGE_CACHE_MASK)); | 1754 | __pgprot(_PAGE_CACHE_MASK)); |
1735 | if (retval) | 1755 | if (retval) |
1736 | return retval; | 1756 | return retval; |
1737 | 1757 | ||
1738 | for (i = 0; i < addrinarray; i++) { | 1758 | for (i = 0; i < addrinarray; i++) { |
1739 | if (PageHighMem(pages[i])) | 1759 | if (PageHighMem(pages[i])) |
1740 | continue; | 1760 | continue; |
1741 | start = page_to_pfn(pages[i]) << PAGE_SHIFT; | 1761 | start = page_to_pfn(pages[i]) << PAGE_SHIFT; |
1742 | end = start + PAGE_SIZE; | 1762 | end = start + PAGE_SIZE; |
1743 | free_memtype(start, end); | 1763 | free_memtype(start, end); |
1744 | } | 1764 | } |
1745 | 1765 | ||
1746 | return 0; | 1766 | return 0; |
1747 | } | 1767 | } |
1748 | EXPORT_SYMBOL(set_pages_array_wb); | 1768 | EXPORT_SYMBOL(set_pages_array_wb); |
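
The _array variants exist so that a pile of unrelated pages can be converted with one CPA pass and one flush, instead of one potentially WBINVD-heavy call per page; GPU memory pools are the typical users. A hedged fragment (names and pool size are illustrative) showing the usual pairing with set_pages_array_wb():

#include <linux/mm.h>
#include <linux/errno.h>
#include <asm/cacheflush.h>

#define NPAGES	16	/* illustrative pool size */

static struct page *pool[NPAGES];

static int make_pool_wc(void)
{
	int i, ret = -ENOMEM;

	for (i = 0; i < NPAGES; i++) {
		pool[i] = alloc_page(GFP_KERNEL);
		if (!pool[i])
			goto free;
	}

	ret = set_pages_array_wc(pool, NPAGES);	/* one pass for all pages */
	if (ret)
		goto free;
	return 0;

free:
	while (--i >= 0)
		__free_page(pool[i]);
	return ret;
}

static void destroy_pool(void)
{
	int i;

	set_pages_array_wb(pool, NPAGES);	/* back to cacheable first */
	for (i = 0; i < NPAGES; i++)
		__free_page(pool[i]);
}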
1749 | 1769 | ||
1750 | int set_pages_x(struct page *page, int numpages) | 1770 | int set_pages_x(struct page *page, int numpages) |
1751 | { | 1771 | { |
1752 | unsigned long addr = (unsigned long)page_address(page); | 1772 | unsigned long addr = (unsigned long)page_address(page); |
1753 | 1773 | ||
1754 | return set_memory_x(addr, numpages); | 1774 | return set_memory_x(addr, numpages); |
1755 | } | 1775 | } |
1756 | EXPORT_SYMBOL(set_pages_x); | 1776 | EXPORT_SYMBOL(set_pages_x); |
1757 | 1777 | ||
1758 | int set_pages_nx(struct page *page, int numpages) | 1778 | int set_pages_nx(struct page *page, int numpages) |
1759 | { | 1779 | { |
1760 | unsigned long addr = (unsigned long)page_address(page); | 1780 | unsigned long addr = (unsigned long)page_address(page); |
1761 | 1781 | ||
1762 | return set_memory_nx(addr, numpages); | 1782 | return set_memory_nx(addr, numpages); |
1763 | } | 1783 | } |
1764 | EXPORT_SYMBOL(set_pages_nx); | 1784 | EXPORT_SYMBOL(set_pages_nx); |
1765 | 1785 | ||
1766 | int set_pages_ro(struct page *page, int numpages) | 1786 | int set_pages_ro(struct page *page, int numpages) |
1767 | { | 1787 | { |
1768 | unsigned long addr = (unsigned long)page_address(page); | 1788 | unsigned long addr = (unsigned long)page_address(page); |
1769 | 1789 | ||
1770 | return set_memory_ro(addr, numpages); | 1790 | return set_memory_ro(addr, numpages); |
1771 | } | 1791 | } |
1772 | 1792 | ||
1773 | int set_pages_rw(struct page *page, int numpages) | 1793 | int set_pages_rw(struct page *page, int numpages) |
1774 | { | 1794 | { |
1775 | unsigned long addr = (unsigned long)page_address(page); | 1795 | unsigned long addr = (unsigned long)page_address(page); |
1776 | 1796 | ||
1777 | return set_memory_rw(addr, numpages); | 1797 | return set_memory_rw(addr, numpages); |
1778 | } | 1798 | } |
1779 | 1799 | ||
1780 | #ifdef CONFIG_DEBUG_PAGEALLOC | 1800 | #ifdef CONFIG_DEBUG_PAGEALLOC |
1781 | 1801 | ||
1782 | static int __set_pages_p(struct page *page, int numpages) | 1802 | static int __set_pages_p(struct page *page, int numpages) |
1783 | { | 1803 | { |
1784 | unsigned long tempaddr = (unsigned long) page_address(page); | 1804 | unsigned long tempaddr = (unsigned long) page_address(page); |
1785 | struct cpa_data cpa = { .vaddr = &tempaddr, | 1805 | struct cpa_data cpa = { .vaddr = &tempaddr, |
1786 | .pgd = NULL, | 1806 | .pgd = NULL, |
1787 | .numpages = numpages, | 1807 | .numpages = numpages, |
1788 | .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), | 1808 | .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), |
1789 | .mask_clr = __pgprot(0), | 1809 | .mask_clr = __pgprot(0), |
1790 | .flags = 0}; | 1810 | .flags = 0}; |
1791 | 1811 | ||
1792 | /* | 1812 | /* |
1793 | * No alias checking needed for setting the present flag; otherwise, | 1813 | * No alias checking needed for setting the present flag; otherwise, |
1794 | * we may need to break large pages for 64-bit kernel text | 1814 | * we may need to break large pages for 64-bit kernel text |
1795 | * mappings (this adds to complexity if we want to do this from | 1815 | * mappings (this adds to complexity if we want to do this from |
1796 | * atomic context especially). Let's keep it simple! | 1816 | * atomic context especially). Let's keep it simple! |
1797 | */ | 1817 | */ |
1798 | return __change_page_attr_set_clr(&cpa, 0); | 1818 | return __change_page_attr_set_clr(&cpa, 0); |
1799 | } | 1819 | } |
1800 | 1820 | ||
1801 | static int __set_pages_np(struct page *page, int numpages) | 1821 | static int __set_pages_np(struct page *page, int numpages) |
1802 | { | 1822 | { |
1803 | unsigned long tempaddr = (unsigned long) page_address(page); | 1823 | unsigned long tempaddr = (unsigned long) page_address(page); |
1804 | struct cpa_data cpa = { .vaddr = &tempaddr, | 1824 | struct cpa_data cpa = { .vaddr = &tempaddr, |
1805 | .pgd = NULL, | 1825 | .pgd = NULL, |
1806 | .numpages = numpages, | 1826 | .numpages = numpages, |
1807 | .mask_set = __pgprot(0), | 1827 | .mask_set = __pgprot(0), |
1808 | .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), | 1828 | .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), |
1809 | .flags = 0}; | 1829 | .flags = 0}; |
1810 | 1830 | ||
1811 | /* | 1831 | /* |
1812 | * No alias checking needed for clearing the present flag; otherwise, | 1832 | * No alias checking needed for clearing the present flag; otherwise, |
1813 | * we may need to break large pages for 64-bit kernel text | 1833 | * we may need to break large pages for 64-bit kernel text |
1814 | * mappings (this adds to complexity if we want to do this from | 1834 | * mappings (this adds to complexity if we want to do this from |
1815 | * atomic context especially). Let's keep it simple! | 1835 | * atomic context especially). Let's keep it simple! |
1816 | */ | 1836 | */ |
1817 | return __change_page_attr_set_clr(&cpa, 0); | 1837 | return __change_page_attr_set_clr(&cpa, 0); |
1818 | } | 1838 | } |
1819 | 1839 | ||
1820 | void __kernel_map_pages(struct page *page, int numpages, int enable) | 1840 | void __kernel_map_pages(struct page *page, int numpages, int enable) |
1821 | { | 1841 | { |
1822 | if (PageHighMem(page)) | 1842 | if (PageHighMem(page)) |
1823 | return; | 1843 | return; |
1824 | if (!enable) { | 1844 | if (!enable) { |
1825 | debug_check_no_locks_freed(page_address(page), | 1845 | debug_check_no_locks_freed(page_address(page), |
1826 | numpages * PAGE_SIZE); | 1846 | numpages * PAGE_SIZE); |
1827 | } | 1847 | } |
1828 | 1848 | ||
1829 | /* | 1849 | /* |
1830 | * The return value is ignored as the calls cannot fail. | 1850 | * The return value is ignored as the calls cannot fail. |
1831 | * Large pages for identity mappings are not used at boot time, | 1851 | * Large pages for identity mappings are not used at boot time, |
1832 | * hence there are no memory allocations during large page split. | 1852 | * hence there are no memory allocations during large page split. |
1833 | */ | 1853 | */ |
1834 | if (enable) | 1854 | if (enable) |
1835 | __set_pages_p(page, numpages); | 1855 | __set_pages_p(page, numpages); |
1836 | else | 1856 | else |
1837 | __set_pages_np(page, numpages); | 1857 | __set_pages_np(page, numpages); |
1838 | 1858 | ||
1839 | /* | 1859 | /* |
1840 | * We should perform an IPI and flush all TLBs, | 1860 | * We should perform an IPI and flush all TLBs, |
1841 | * but that can deadlock, so flush only the current cpu: | 1861 | * but that can deadlock, so flush only the current cpu: |
1842 | */ | 1862 | */ |
1843 | __flush_tlb_all(); | 1863 | __flush_tlb_all(); |
1844 | 1864 | ||
1845 | arch_flush_lazy_mmu_mode(); | 1865 | arch_flush_lazy_mmu_mode(); |
1846 | } | 1866 | } |
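
__kernel_map_pages() is the CONFIG_DEBUG_PAGEALLOC hook: freed pages are unmapped so any stray access faults immediately, and they are mapped back on allocation. A rough userspace analogue of that discipline, with mprotect() standing in for clearing and restoring _PAGE_PRESENT (a sketch for illustration only, not kernel API):

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static void map_pages(void *addr, int numpages, int enable)
{
	long psz = sysconf(_SC_PAGESIZE);
	/* PROT_NONE plays the role of clearing _PAGE_PRESENT | _PAGE_RW */
	int prot = enable ? (PROT_READ | PROT_WRITE) : PROT_NONE;

	if (mprotect(addr, numpages * psz, prot))
		perror("mprotect");
}

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	map_pages(p, 1, 0);	/* "free": any late access now faults */
	map_pages(p, 1, 1);	/* "alloc": the page is usable again */
	p[0] = 1;
	printf("page unmapped and remapped ok\n");
	munmap(p, psz);
	return 0;
}
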
1847 | 1867 | ||
1848 | #ifdef CONFIG_HIBERNATION | 1868 | #ifdef CONFIG_HIBERNATION |
1849 | 1869 | ||
1850 | bool kernel_page_present(struct page *page) | 1870 | bool kernel_page_present(struct page *page) |
1851 | { | 1871 | { |
1852 | unsigned int level; | 1872 | unsigned int level; |
1853 | pte_t *pte; | 1873 | pte_t *pte; |
1854 | 1874 | ||
1855 | if (PageHighMem(page)) | 1875 | if (PageHighMem(page)) |
1856 | return false; | 1876 | return false; |
1857 | 1877 | ||
1858 | pte = lookup_address((unsigned long)page_address(page), &level); | 1878 | pte = lookup_address((unsigned long)page_address(page), &level); |
1859 | return (pte_val(*pte) & _PAGE_PRESENT); | 1879 | return (pte_val(*pte) & _PAGE_PRESENT); |
1860 | } | 1880 | } |
1861 | 1881 | ||
1862 | #endif /* CONFIG_HIBERNATION */ | 1882 | #endif /* CONFIG_HIBERNATION */ |
1863 | 1883 | ||
1864 | #endif /* CONFIG_DEBUG_PAGEALLOC */ | 1884 | #endif /* CONFIG_DEBUG_PAGEALLOC */ |
1865 | 1885 | ||
1866 | int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, | 1886 | int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, |
1867 | unsigned numpages, unsigned long page_flags) | 1887 | unsigned numpages, unsigned long page_flags) |
1868 | { | 1888 | { |
1869 | int retval = -EINVAL; | 1889 | int retval = -EINVAL; |
1870 | 1890 | ||
1871 | struct cpa_data cpa = { | 1891 | struct cpa_data cpa = { |
1872 | .vaddr = &address, | 1892 | .vaddr = &address, |
1873 | .pfn = pfn, | 1893 | .pfn = pfn, |
1874 | .pgd = pgd, | 1894 | .pgd = pgd, |
1875 | .numpages = numpages, | 1895 | .numpages = numpages, |
1876 | .mask_set = __pgprot(0), | 1896 | .mask_set = __pgprot(0), |
1877 | .mask_clr = __pgprot(0), | 1897 | .mask_clr = __pgprot(0), |
1878 | .flags = 0, | 1898 | .flags = 0, |
1879 | }; | 1899 | }; |
1880 | 1900 | ||
1881 | if (!(__supported_pte_mask & _PAGE_NX)) | 1901 | if (!(__supported_pte_mask & _PAGE_NX)) |
1882 | goto out; | 1902 | goto out; |
1883 | 1903 | ||
1884 | if (!(page_flags & _PAGE_NX)) | 1904 | if (!(page_flags & _PAGE_NX)) |
1885 | cpa.mask_clr = __pgprot(_PAGE_NX); | 1905 | cpa.mask_clr = __pgprot(_PAGE_NX); |
1886 | 1906 | ||
1887 | cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); | 1907 | cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); |
1888 | 1908 | ||
1889 | retval = __change_page_attr_set_clr(&cpa, 0); | 1909 | retval = __change_page_attr_set_clr(&cpa, 0); |
1890 | __flush_tlb_all(); | 1910 | __flush_tlb_all(); |
1891 | 1911 | ||
1892 | out: | 1912 | out: |
1893 | return retval; | 1913 | return retval; |
1894 | } | 1914 | } |
1895 | 1915 | ||
1896 | void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address, | 1916 | void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address, |
1897 | unsigned numpages) | 1917 | unsigned numpages) |
1898 | { | 1918 | { |
1899 | unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT)); | 1919 | unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT)); |
1900 | } | 1920 | } |
1901 | 1921 | ||
1902 | /* | 1922 | /* |
1903 | * The testcases use internal knowledge of the implementation that shouldn't | 1923 | * The testcases use internal knowledge of the implementation that shouldn't |
1904 | * be exposed to the rest of the kernel. Include these directly here. | 1924 | * be exposed to the rest of the kernel. Include these directly here. |
1905 | */ | 1925 | */ |
1906 | #ifdef CONFIG_CPA_DEBUG | 1926 | #ifdef CONFIG_CPA_DEBUG |
1907 | #include "pageattr-test.c" | 1927 | #include "pageattr-test.c" |
1908 | #endif | 1928 | #endif |
1909 | 1929 |
arch/x86/xen/mmu.c
1 | /* | 1 | /* |
2 | * Xen mmu operations | 2 | * Xen mmu operations |
3 | * | 3 | * |
4 | * This file contains the various mmu fetch and update operations. | 4 | * This file contains the various mmu fetch and update operations. |
5 | * The most important job they must perform is the mapping between the | 5 | * The most important job they must perform is the mapping between the |
6 | * domain's pfn and the overall machine mfns. | 6 | * domain's pfn and the overall machine mfns. |
7 | * | 7 | * |
8 | * Xen allows guests to directly update the pagetable, in a controlled | 8 | * Xen allows guests to directly update the pagetable, in a controlled |
9 | * fashion. In other words, the guest modifies the same pagetable | 9 | * fashion. In other words, the guest modifies the same pagetable |
10 | * that the CPU actually uses, which eliminates the overhead of having | 10 | * that the CPU actually uses, which eliminates the overhead of having |
11 | * a separate shadow pagetable. | 11 | * a separate shadow pagetable. |
12 | * | 12 | * |
13 | * In order to allow this, it falls on the guest domain to map its | 13 | * In order to allow this, it falls on the guest domain to map its |
14 | * notion of a "physical" pfn - which is just a domain-local linear | 14 | * notion of a "physical" pfn - which is just a domain-local linear |
15 | * address - into a real "machine address" which the CPU's MMU can | 15 | * address - into a real "machine address" which the CPU's MMU can |
16 | * use. | 16 | * use. |
17 | * | 17 | * |
18 | * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be | 18 | * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be |
19 | * inserted directly into the pagetable. When creating a new | 19 | * inserted directly into the pagetable. When creating a new |
20 | * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, | 20 | * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, |
21 | * when reading the content back with __(pgd|pmd|pte)_val, it converts | 21 | * when reading the content back with __(pgd|pmd|pte)_val, it converts |
22 | * the mfn back into a pfn. | 22 | * the mfn back into a pfn. |
23 | * | 23 | * |
24 | * The other constraint is that all pages which make up a pagetable | 24 | * The other constraint is that all pages which make up a pagetable |
25 | * must be mapped read-only in the guest. This prevents uncontrolled | 25 | * must be mapped read-only in the guest. This prevents uncontrolled |
26 | * guest updates to the pagetable. Xen strictly enforces this, and | 26 | * guest updates to the pagetable. Xen strictly enforces this, and |
27 | * will disallow any pagetable update which will end up mapping a | 27 | * will disallow any pagetable update which will end up mapping a |
28 | * pagetable page RW, and will disallow using any writable page as a | 28 | * pagetable page RW, and will disallow using any writable page as a |
29 | * pagetable. | 29 | * pagetable. |
30 | * | 30 | * |
31 | * Naively, when loading %cr3 with the base of a new pagetable, Xen | 31 | * Naively, when loading %cr3 with the base of a new pagetable, Xen |
32 | * would need to validate the whole pagetable before going on. | 32 | * would need to validate the whole pagetable before going on. |
33 | * Naturally, this is quite slow. The solution is to "pin" a | 33 | * Naturally, this is quite slow. The solution is to "pin" a |
34 | * pagetable, which enforces all the constraints on the pagetable even | 34 | * pagetable, which enforces all the constraints on the pagetable even |
35 | * when it is not actively in use. This means that Xen can be assured | 35 | * when it is not actively in use. This means that Xen can be assured
36 | * that it is still valid when you do load it into %cr3, and doesn't | 36 | * that it is still valid when you do load it into %cr3, and doesn't |
37 | * need to revalidate it. | 37 | * need to revalidate it. |
38 | * | 38 | * |
39 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | 39 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 |
40 | */ | 40 | */ |
41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
42 | #include <linux/highmem.h> | 42 | #include <linux/highmem.h> |
43 | #include <linux/debugfs.h> | 43 | #include <linux/debugfs.h> |
44 | #include <linux/bug.h> | 44 | #include <linux/bug.h> |
45 | #include <linux/vmalloc.h> | 45 | #include <linux/vmalloc.h> |
46 | #include <linux/module.h> | 46 | #include <linux/module.h> |
47 | #include <linux/gfp.h> | 47 | #include <linux/gfp.h> |
48 | #include <linux/memblock.h> | 48 | #include <linux/memblock.h> |
49 | #include <linux/seq_file.h> | 49 | #include <linux/seq_file.h> |
50 | #include <linux/crash_dump.h> | 50 | #include <linux/crash_dump.h> |
51 | 51 | ||
52 | #include <trace/events/xen.h> | 52 | #include <trace/events/xen.h> |
53 | 53 | ||
54 | #include <asm/pgtable.h> | 54 | #include <asm/pgtable.h> |
55 | #include <asm/tlbflush.h> | 55 | #include <asm/tlbflush.h> |
56 | #include <asm/fixmap.h> | 56 | #include <asm/fixmap.h> |
57 | #include <asm/mmu_context.h> | 57 | #include <asm/mmu_context.h> |
58 | #include <asm/setup.h> | 58 | #include <asm/setup.h> |
59 | #include <asm/paravirt.h> | 59 | #include <asm/paravirt.h> |
60 | #include <asm/e820.h> | 60 | #include <asm/e820.h> |
61 | #include <asm/linkage.h> | 61 | #include <asm/linkage.h> |
62 | #include <asm/page.h> | 62 | #include <asm/page.h> |
63 | #include <asm/init.h> | 63 | #include <asm/init.h> |
64 | #include <asm/pat.h> | 64 | #include <asm/pat.h> |
65 | #include <asm/smp.h> | 65 | #include <asm/smp.h> |
66 | 66 | ||
67 | #include <asm/xen/hypercall.h> | 67 | #include <asm/xen/hypercall.h> |
68 | #include <asm/xen/hypervisor.h> | 68 | #include <asm/xen/hypervisor.h> |
69 | 69 | ||
70 | #include <xen/xen.h> | 70 | #include <xen/xen.h> |
71 | #include <xen/page.h> | 71 | #include <xen/page.h> |
72 | #include <xen/interface/xen.h> | 72 | #include <xen/interface/xen.h> |
73 | #include <xen/interface/hvm/hvm_op.h> | 73 | #include <xen/interface/hvm/hvm_op.h> |
74 | #include <xen/interface/version.h> | 74 | #include <xen/interface/version.h> |
75 | #include <xen/interface/memory.h> | 75 | #include <xen/interface/memory.h> |
76 | #include <xen/hvc-console.h> | 76 | #include <xen/hvc-console.h> |
77 | 77 | ||
78 | #include "multicalls.h" | 78 | #include "multicalls.h" |
79 | #include "mmu.h" | 79 | #include "mmu.h" |
80 | #include "debugfs.h" | 80 | #include "debugfs.h" |
81 | 81 | ||
82 | /* | 82 | /* |
83 | * Protects atomic reservation decrease/increase against concurrent increases. | 83 | * Protects atomic reservation decrease/increase against concurrent increases. |
84 | * Also protects non-atomic updates of current_pages and balloon lists. | 84 | * Also protects non-atomic updates of current_pages and balloon lists. |
85 | */ | 85 | */ |
86 | DEFINE_SPINLOCK(xen_reservation_lock); | 86 | DEFINE_SPINLOCK(xen_reservation_lock); |
87 | 87 | ||
88 | #ifdef CONFIG_X86_32 | 88 | #ifdef CONFIG_X86_32 |
89 | /* | 89 | /* |
90 | * Identity map, in addition to plain kernel map. This needs to be | 90 | * Identity map, in addition to plain kernel map. This needs to be |
91 | * large enough to allocate page table pages to allocate the rest. | 91 | * large enough to allocate page table pages to allocate the rest. |
92 | * Each page can map 2MB. | 92 | * Each page can map 2MB. |
93 | */ | 93 | */ |
94 | #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) | 94 | #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) |
95 | static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); | 95 | static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); |
96 | #endif | 96 | #endif |
97 | #ifdef CONFIG_X86_64 | 97 | #ifdef CONFIG_X86_64 |
98 | /* l3 pud for userspace vsyscall mapping */ | 98 | /* l3 pud for userspace vsyscall mapping */ |
99 | static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; | 99 | static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; |
100 | #endif /* CONFIG_X86_64 */ | 100 | #endif /* CONFIG_X86_64 */ |
101 | 101 | ||
102 | /* | 102 | /* |
103 | * Note about cr3 (pagetable base) values: | 103 | * Note about cr3 (pagetable base) values: |
104 | * | 104 | * |
105 | * xen_cr3 contains the current logical cr3 value; it contains the | 105 | * xen_cr3 contains the current logical cr3 value; it contains the |
106 | * last set cr3. This may not be the current effective cr3, because | 106 | * last set cr3. This may not be the current effective cr3, because |
107 | * its update may be being lazily deferred. However, a vcpu looking | 107 | * its update may be being lazily deferred. However, a vcpu looking |
108 | * at its own cr3 can use this value knowing that everything will | 108 | * at its own cr3 can use this value knowing that everything will
109 | * be self-consistent. | 109 | * be self-consistent. |
110 | * | 110 | * |
111 | * xen_current_cr3 contains the actual vcpu cr3; it is set once the | 111 | * xen_current_cr3 contains the actual vcpu cr3; it is set once the |
112 | * hypercall to set the vcpu cr3 is complete (so it may be a little | 112 | * hypercall to set the vcpu cr3 is complete (so it may be a little |
113 | * out of date, but it will never be set early). If one vcpu is | 113 | * out of date, but it will never be set early). If one vcpu is |
114 | * looking at another vcpu's cr3 value, it should use this variable. | 114 | * looking at another vcpu's cr3 value, it should use this variable. |
115 | */ | 115 | */ |
116 | DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ | 116 | DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ |
117 | DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ | 117 | DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ |
118 | 118 | ||
119 | 119 | ||
120 | /* | 120 | /* |
121 | * Just beyond the highest usermode address. STACK_TOP_MAX has a | 121 | * Just beyond the highest usermode address. STACK_TOP_MAX has a |
122 | * redzone above it, so round it up to a PGD boundary. | 122 | * redzone above it, so round it up to a PGD boundary. |
123 | */ | 123 | */ |
124 | #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) | 124 | #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) |
125 | 125 | ||
126 | unsigned long arbitrary_virt_to_mfn(void *vaddr) | 126 | unsigned long arbitrary_virt_to_mfn(void *vaddr) |
127 | { | 127 | { |
128 | xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); | 128 | xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); |
129 | 129 | ||
130 | return PFN_DOWN(maddr.maddr); | 130 | return PFN_DOWN(maddr.maddr); |
131 | } | 131 | } |
132 | 132 | ||
133 | xmaddr_t arbitrary_virt_to_machine(void *vaddr) | 133 | xmaddr_t arbitrary_virt_to_machine(void *vaddr) |
134 | { | 134 | { |
135 | unsigned long address = (unsigned long)vaddr; | 135 | unsigned long address = (unsigned long)vaddr; |
136 | unsigned int level; | 136 | unsigned int level; |
137 | pte_t *pte; | 137 | pte_t *pte; |
138 | unsigned offset; | 138 | unsigned offset; |
139 | 139 | ||
140 | /* | 140 | /* |
141 | * if the PFN is in the linear mapped vaddr range, we can just use | 141 | * if the PFN is in the linear mapped vaddr range, we can just use |
142 | * the (quick) virt_to_machine() p2m lookup | 142 | * the (quick) virt_to_machine() p2m lookup |
143 | */ | 143 | */ |
144 | if (virt_addr_valid(vaddr)) | 144 | if (virt_addr_valid(vaddr)) |
145 | return virt_to_machine(vaddr); | 145 | return virt_to_machine(vaddr); |
146 | 146 | ||
147 | /* otherwise we have to do a (slower) full page-table walk */ | 147 | /* otherwise we have to do a (slower) full page-table walk */ |
148 | 148 | ||
149 | pte = lookup_address(address, &level); | 149 | pte = lookup_address(address, &level); |
150 | BUG_ON(pte == NULL); | 150 | BUG_ON(pte == NULL); |
151 | offset = address & ~PAGE_MASK; | 151 | offset = address & ~PAGE_MASK; |
152 | return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); | 152 | return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); |
153 | } | 153 | } |
154 | EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); | 154 | EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); |
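
The slow path above ends by recombining the frame number found by lookup_address() with the in-page offset. That last line's arithmetic, pulled out into a self-contained model (4 KiB pages; the sample vaddr and mfn are made up):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

static uint64_t maddr_from_walk(uint64_t vaddr, uint64_t mfn)
{
	uint64_t offset = vaddr & ~PAGE_MASK;	/* in-page offset survives */

	return (mfn << PAGE_SHIFT) + offset;
}

int main(void)
{
	/* offset 0x234 inside machine frame 0x42 -> 0x42234 */
	printf("%#llx\n",
	       (unsigned long long)maddr_from_walk(0xffffc90000001234ULL, 0x42));
	return 0;
}
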
155 | 155 | ||
156 | void make_lowmem_page_readonly(void *vaddr) | 156 | void make_lowmem_page_readonly(void *vaddr) |
157 | { | 157 | { |
158 | pte_t *pte, ptev; | 158 | pte_t *pte, ptev; |
159 | unsigned long address = (unsigned long)vaddr; | 159 | unsigned long address = (unsigned long)vaddr; |
160 | unsigned int level; | 160 | unsigned int level; |
161 | 161 | ||
162 | pte = lookup_address(address, &level); | 162 | pte = lookup_address(address, &level); |
163 | if (pte == NULL) | 163 | if (pte == NULL) |
164 | return; /* vaddr missing */ | 164 | return; /* vaddr missing */ |
165 | 165 | ||
166 | ptev = pte_wrprotect(*pte); | 166 | ptev = pte_wrprotect(*pte); |
167 | 167 | ||
168 | if (HYPERVISOR_update_va_mapping(address, ptev, 0)) | 168 | if (HYPERVISOR_update_va_mapping(address, ptev, 0)) |
169 | BUG(); | 169 | BUG(); |
170 | } | 170 | } |
171 | 171 | ||
172 | void make_lowmem_page_readwrite(void *vaddr) | 172 | void make_lowmem_page_readwrite(void *vaddr) |
173 | { | 173 | { |
174 | pte_t *pte, ptev; | 174 | pte_t *pte, ptev; |
175 | unsigned long address = (unsigned long)vaddr; | 175 | unsigned long address = (unsigned long)vaddr; |
176 | unsigned int level; | 176 | unsigned int level; |
177 | 177 | ||
178 | pte = lookup_address(address, &level); | 178 | pte = lookup_address(address, &level); |
179 | if (pte == NULL) | 179 | if (pte == NULL) |
180 | return; /* vaddr missing */ | 180 | return; /* vaddr missing */ |
181 | 181 | ||
182 | ptev = pte_mkwrite(*pte); | 182 | ptev = pte_mkwrite(*pte); |
183 | 183 | ||
184 | if (HYPERVISOR_update_va_mapping(address, ptev, 0)) | 184 | if (HYPERVISOR_update_va_mapping(address, ptev, 0)) |
185 | BUG(); | 185 | BUG(); |
186 | } | 186 | } |
187 | 187 | ||
188 | 188 | ||
189 | static bool xen_page_pinned(void *ptr) | 189 | static bool xen_page_pinned(void *ptr) |
190 | { | 190 | { |
191 | struct page *page = virt_to_page(ptr); | 191 | struct page *page = virt_to_page(ptr); |
192 | 192 | ||
193 | return PagePinned(page); | 193 | return PagePinned(page); |
194 | } | 194 | } |
195 | 195 | ||
196 | void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) | 196 | void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) |
197 | { | 197 | { |
198 | struct multicall_space mcs; | 198 | struct multicall_space mcs; |
199 | struct mmu_update *u; | 199 | struct mmu_update *u; |
200 | 200 | ||
201 | trace_xen_mmu_set_domain_pte(ptep, pteval, domid); | 201 | trace_xen_mmu_set_domain_pte(ptep, pteval, domid); |
202 | 202 | ||
203 | mcs = xen_mc_entry(sizeof(*u)); | 203 | mcs = xen_mc_entry(sizeof(*u)); |
204 | u = mcs.args; | 204 | u = mcs.args; |
205 | 205 | ||
206 | /* ptep might be kmapped when using 32-bit HIGHPTE */ | 206 | /* ptep might be kmapped when using 32-bit HIGHPTE */ |
207 | u->ptr = virt_to_machine(ptep).maddr; | 207 | u->ptr = virt_to_machine(ptep).maddr; |
208 | u->val = pte_val_ma(pteval); | 208 | u->val = pte_val_ma(pteval); |
209 | 209 | ||
210 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); | 210 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); |
211 | 211 | ||
212 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 212 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
213 | } | 213 | } |
214 | EXPORT_SYMBOL_GPL(xen_set_domain_pte); | 214 | EXPORT_SYMBOL_GPL(xen_set_domain_pte); |
215 | 215 | ||
216 | static void xen_extend_mmu_update(const struct mmu_update *update) | 216 | static void xen_extend_mmu_update(const struct mmu_update *update) |
217 | { | 217 | { |
218 | struct multicall_space mcs; | 218 | struct multicall_space mcs; |
219 | struct mmu_update *u; | 219 | struct mmu_update *u; |
220 | 220 | ||
221 | mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); | 221 | mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); |
222 | 222 | ||
223 | if (mcs.mc != NULL) { | 223 | if (mcs.mc != NULL) { |
224 | mcs.mc->args[1]++; | 224 | mcs.mc->args[1]++; |
225 | } else { | 225 | } else { |
226 | mcs = __xen_mc_entry(sizeof(*u)); | 226 | mcs = __xen_mc_entry(sizeof(*u)); |
227 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); | 227 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); |
228 | } | 228 | } |
229 | 229 | ||
230 | u = mcs.args; | 230 | u = mcs.args; |
231 | *u = *update; | 231 | *u = *update; |
232 | } | 232 | } |
233 | 233 | ||
234 | static void xen_extend_mmuext_op(const struct mmuext_op *op) | 234 | static void xen_extend_mmuext_op(const struct mmuext_op *op) |
235 | { | 235 | { |
236 | struct multicall_space mcs; | 236 | struct multicall_space mcs; |
237 | struct mmuext_op *u; | 237 | struct mmuext_op *u; |
238 | 238 | ||
239 | mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u)); | 239 | mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u)); |
240 | 240 | ||
241 | if (mcs.mc != NULL) { | 241 | if (mcs.mc != NULL) { |
242 | mcs.mc->args[1]++; | 242 | mcs.mc->args[1]++; |
243 | } else { | 243 | } else { |
244 | mcs = __xen_mc_entry(sizeof(*u)); | 244 | mcs = __xen_mc_entry(sizeof(*u)); |
245 | MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); | 245 | MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); |
246 | } | 246 | } |
247 | 247 | ||
248 | u = mcs.args; | 248 | u = mcs.args; |
249 | *u = *op; | 249 | *u = *op; |
250 | } | 250 | } |
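
xen_extend_mmu_update() and xen_extend_mmuext_op() share one coalescing pattern: if the multicall batch already ends with the same hypercall, grow its argument count; otherwise open a fresh entry. A minimal userspace model of that logic (the op codes, struct and fixed-size buffer are invented for illustration, and a real batch would flush when full):

#include <stdio.h>

enum { OP_MMU_UPDATE = 1, OP_MMUEXT = 2 };

struct call { int op; int nargs; };

static struct call batch[16];
static int batch_len;

/* Append an argument to the trailing call when the op matches,
 * otherwise start a fresh call: the shape of xen_extend_*(). */
static void extend(int op)
{
	if (batch_len && batch[batch_len - 1].op == op) {
		batch[batch_len - 1].nargs++;
	} else {
		batch[batch_len].op = op;
		batch[batch_len].nargs = 1;
		batch_len++;
	}
}

int main(void)
{
	extend(OP_MMU_UPDATE);
	extend(OP_MMU_UPDATE);	/* coalesced into the first call */
	extend(OP_MMUEXT);	/* different hypercall: new entry */
	for (int i = 0; i < batch_len; i++)
		printf("call %d: op=%d nargs=%d\n", i, batch[i].op, batch[i].nargs);
	return 0;
}
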
251 | 251 | ||
252 | static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) | 252 | static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) |
253 | { | 253 | { |
254 | struct mmu_update u; | 254 | struct mmu_update u; |
255 | 255 | ||
256 | preempt_disable(); | 256 | preempt_disable(); |
257 | 257 | ||
258 | xen_mc_batch(); | 258 | xen_mc_batch(); |
259 | 259 | ||
260 | /* ptr may be ioremapped for 64-bit pagetable setup */ | 260 | /* ptr may be ioremapped for 64-bit pagetable setup */ |
261 | u.ptr = arbitrary_virt_to_machine(ptr).maddr; | 261 | u.ptr = arbitrary_virt_to_machine(ptr).maddr; |
262 | u.val = pmd_val_ma(val); | 262 | u.val = pmd_val_ma(val); |
263 | xen_extend_mmu_update(&u); | 263 | xen_extend_mmu_update(&u); |
264 | 264 | ||
265 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 265 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
266 | 266 | ||
267 | preempt_enable(); | 267 | preempt_enable(); |
268 | } | 268 | } |
269 | 269 | ||
270 | static void xen_set_pmd(pmd_t *ptr, pmd_t val) | 270 | static void xen_set_pmd(pmd_t *ptr, pmd_t val) |
271 | { | 271 | { |
272 | trace_xen_mmu_set_pmd(ptr, val); | 272 | trace_xen_mmu_set_pmd(ptr, val); |
273 | 273 | ||
274 | /* If page is not pinned, we can just update the entry | 274 | /* If page is not pinned, we can just update the entry |
275 | directly */ | 275 | directly */ |
276 | if (!xen_page_pinned(ptr)) { | 276 | if (!xen_page_pinned(ptr)) { |
277 | *ptr = val; | 277 | *ptr = val; |
278 | return; | 278 | return; |
279 | } | 279 | } |
280 | 280 | ||
281 | xen_set_pmd_hyper(ptr, val); | 281 | xen_set_pmd_hyper(ptr, val); |
282 | } | 282 | } |
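
xen_set_pmd(), xen_set_pud() and xen_set_pgd() all dispatch the same way: an unpinned page-table page is ordinary writable memory, so a direct store suffices; once pinned (validated and mapped read-only) the update has to go through the hypervisor. A toy sketch of that split, with an invented stand-in for the hypercall path:

#include <stdbool.h>
#include <stdio.h>

struct ent { unsigned long val; bool pinned; };

/* Invented stand-in for the xen_set_*_hyper() hypercall path. */
static void set_via_hypervisor(struct ent *e, unsigned long v)
{
	e->val = v;
	printf("hypercall update\n");
}

static void set_entry(struct ent *e, unsigned long v)
{
	if (!e->pinned) {
		e->val = v;	/* plain memory: direct store is fine */
		return;
	}
	set_via_hypervisor(e, v);
}

int main(void)
{
	struct ent e = { 0, false };

	set_entry(&e, 1);	/* direct store */
	e.pinned = true;
	set_entry(&e, 2);	/* via the "hypercall" */
	return 0;
}
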
283 | 283 | ||
284 | /* | 284 | /* |
285 | * Associate a virtual page frame with a given physical page frame | 285 | * Associate a virtual page frame with a given physical page frame |
286 | * and protection flags for that frame. | 286 | * and protection flags for that frame. |
287 | */ | 287 | */ |
288 | void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) | 288 | void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) |
289 | { | 289 | { |
290 | set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); | 290 | set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); |
291 | } | 291 | } |
292 | 292 | ||
293 | static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) | 293 | static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) |
294 | { | 294 | { |
295 | struct mmu_update u; | 295 | struct mmu_update u; |
296 | 296 | ||
297 | if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) | 297 | if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) |
298 | return false; | 298 | return false; |
299 | 299 | ||
300 | xen_mc_batch(); | 300 | xen_mc_batch(); |
301 | 301 | ||
302 | u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; | 302 | u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; |
303 | u.val = pte_val_ma(pteval); | 303 | u.val = pte_val_ma(pteval); |
304 | xen_extend_mmu_update(&u); | 304 | xen_extend_mmu_update(&u); |
305 | 305 | ||
306 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 306 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
307 | 307 | ||
308 | return true; | 308 | return true; |
309 | } | 309 | } |
310 | 310 | ||
311 | static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) | 311 | static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) |
312 | { | 312 | { |
313 | if (!xen_batched_set_pte(ptep, pteval)) { | 313 | if (!xen_batched_set_pte(ptep, pteval)) { |
314 | /* | 314 | /* |
315 | * Could call native_set_pte() here and trap and | 315 | * Could call native_set_pte() here and trap and |
316 | * emulate the PTE write but with 32-bit guests this | 316 | * emulate the PTE write but with 32-bit guests this |
317 | * needs two traps (one for each of the two 32-bit | 317 | * needs two traps (one for each of the two 32-bit |
318 | * words in the PTE) so do one hypercall directly | 318 | * words in the PTE) so do one hypercall directly |
319 | * instead. | 319 | * instead. |
320 | */ | 320 | */ |
321 | struct mmu_update u; | 321 | struct mmu_update u; |
322 | 322 | ||
323 | u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; | 323 | u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; |
324 | u.val = pte_val_ma(pteval); | 324 | u.val = pte_val_ma(pteval); |
325 | HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); | 325 | HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); |
326 | } | 326 | } |
327 | } | 327 | } |
328 | 328 | ||
329 | static void xen_set_pte(pte_t *ptep, pte_t pteval) | 329 | static void xen_set_pte(pte_t *ptep, pte_t pteval) |
330 | { | 330 | { |
331 | trace_xen_mmu_set_pte(ptep, pteval); | 331 | trace_xen_mmu_set_pte(ptep, pteval); |
332 | __xen_set_pte(ptep, pteval); | 332 | __xen_set_pte(ptep, pteval); |
333 | } | 333 | } |
334 | 334 | ||
335 | static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | 335 | static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, |
336 | pte_t *ptep, pte_t pteval) | 336 | pte_t *ptep, pte_t pteval) |
337 | { | 337 | { |
338 | trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval); | 338 | trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval); |
339 | __xen_set_pte(ptep, pteval); | 339 | __xen_set_pte(ptep, pteval); |
340 | } | 340 | } |
341 | 341 | ||
342 | pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, | 342 | pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, |
343 | unsigned long addr, pte_t *ptep) | 343 | unsigned long addr, pte_t *ptep) |
344 | { | 344 | { |
345 | /* Just return the pte as-is. We preserve the bits on commit */ | 345 | /* Just return the pte as-is. We preserve the bits on commit */ |
346 | trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); | 346 | trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); |
347 | return *ptep; | 347 | return *ptep; |
348 | } | 348 | } |
349 | 349 | ||
350 | void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, | 350 | void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, |
351 | pte_t *ptep, pte_t pte) | 351 | pte_t *ptep, pte_t pte) |
352 | { | 352 | { |
353 | struct mmu_update u; | 353 | struct mmu_update u; |
354 | 354 | ||
355 | trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); | 355 | trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); |
356 | xen_mc_batch(); | 356 | xen_mc_batch(); |
357 | 357 | ||
358 | u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; | 358 | u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; |
359 | u.val = pte_val_ma(pte); | 359 | u.val = pte_val_ma(pte); |
360 | xen_extend_mmu_update(&u); | 360 | xen_extend_mmu_update(&u); |
361 | 361 | ||
362 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 362 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
363 | } | 363 | } |
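
The start/commit pair exists so that accessed/dirty bits a CPU may set between the read and the write are not lost: MMU_PT_UPDATE_PRESERVE_AD asks Xen to keep them. A self-contained model of those semantics (flag values chosen to match the x86 PTE layout, otherwise illustrative):

#include <stdint.h>
#include <stdio.h>

#define PAGE_PRESENT  0x01u
#define PAGE_RW       0x02u
#define PAGE_ACCESSED 0x20u
#define PAGE_DIRTY    0x40u

int main(void)
{
	uint32_t pte = PAGE_PRESENT | PAGE_RW;

	uint32_t old = pte;			/* modify_prot_start: sample */
	pte |= PAGE_ACCESSED | PAGE_DIRTY;	/* CPU touches the page meanwhile */

	uint32_t newv = old & ~PAGE_RW;		/* caller write-protects */
	/* modify_prot_commit with PRESERVE_AD: A/D set since start survive */
	pte = newv | (pte & (PAGE_ACCESSED | PAGE_DIRTY));

	printf("pte=%#x (read-only, accessed/dirty preserved)\n", pte);
	return 0;
}
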
364 | 364 | ||
365 | /* Assume pteval_t is equivalent to all the other *val_t types. */ | 365 | /* Assume pteval_t is equivalent to all the other *val_t types. */ |
366 | static pteval_t pte_mfn_to_pfn(pteval_t val) | 366 | static pteval_t pte_mfn_to_pfn(pteval_t val) |
367 | { | 367 | { |
368 | if (val & _PAGE_PRESENT) { | 368 | if (val & _PAGE_PRESENT) { |
369 | unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; | 369 | unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; |
370 | unsigned long pfn = mfn_to_pfn(mfn); | 370 | unsigned long pfn = mfn_to_pfn(mfn); |
371 | 371 | ||
372 | pteval_t flags = val & PTE_FLAGS_MASK; | 372 | pteval_t flags = val & PTE_FLAGS_MASK; |
373 | if (unlikely(pfn == ~0)) | 373 | if (unlikely(pfn == ~0)) |
374 | val = flags & ~_PAGE_PRESENT; | 374 | val = flags & ~_PAGE_PRESENT; |
375 | else | 375 | else |
376 | val = ((pteval_t)pfn << PAGE_SHIFT) | flags; | 376 | val = ((pteval_t)pfn << PAGE_SHIFT) | flags; |
377 | } | 377 | } |
378 | 378 | ||
379 | return val; | 379 | return val; |
380 | } | 380 | } |
381 | 381 | ||
382 | static pteval_t pte_pfn_to_mfn(pteval_t val) | 382 | static pteval_t pte_pfn_to_mfn(pteval_t val) |
383 | { | 383 | { |
384 | if (val & _PAGE_PRESENT) { | 384 | if (val & _PAGE_PRESENT) { |
385 | unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; | 385 | unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; |
386 | pteval_t flags = val & PTE_FLAGS_MASK; | 386 | pteval_t flags = val & PTE_FLAGS_MASK; |
387 | unsigned long mfn; | 387 | unsigned long mfn; |
388 | 388 | ||
389 | if (!xen_feature(XENFEAT_auto_translated_physmap)) | 389 | if (!xen_feature(XENFEAT_auto_translated_physmap)) |
390 | mfn = get_phys_to_machine(pfn); | 390 | mfn = __pfn_to_mfn(pfn); |
391 | else | 391 | else |
392 | mfn = pfn; | 392 | mfn = pfn; |
393 | /* | 393 | /* |
394 | * If there's no mfn for the pfn, then just create an | 394 | * If there's no mfn for the pfn, then just create an |
395 | * empty non-present pte. Unfortunately this loses | 395 | * empty non-present pte. Unfortunately this loses |
396 | * information about the original pfn, so | 396 | * information about the original pfn, so |
397 | * pte_mfn_to_pfn is asymmetric. | 397 | * pte_mfn_to_pfn is asymmetric. |
398 | */ | 398 | */ |
399 | if (unlikely(mfn == INVALID_P2M_ENTRY)) { | 399 | if (unlikely(mfn == INVALID_P2M_ENTRY)) { |
400 | mfn = 0; | 400 | mfn = 0; |
401 | flags = 0; | 401 | flags = 0; |
402 | } else | 402 | } else |
403 | mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); | 403 | mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); |
404 | val = ((pteval_t)mfn << PAGE_SHIFT) | flags; | 404 | val = ((pteval_t)mfn << PAGE_SHIFT) | flags; |
405 | } | 405 | } |
406 | 406 | ||
407 | return val; | 407 | return val; |
408 | } | 408 | } |
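
The comment above notes that pte_mfn_to_pfn() is asymmetric: once a present pte whose pfn has no machine frame behind it is converted, it collapses to an empty pte and the original pfn is gone. A toy model that makes the information loss visible (table contents and flag mask are invented):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT     12
#define PTE_FLAGS_MASK 0xfffULL		/* toy: flags live in the low bits */
#define PTE_PFN_MASK   (~PTE_FLAGS_MASK)
#define PAGE_PRESENT   0x1ULL
#define INVALID_P2M    (~0UL)

/* Toy pfn -> mfn table; pfn 2 has no machine frame behind it. */
static unsigned long p2m[4] = { 7, 3, INVALID_P2M, 5 };

static uint64_t pfn_to_mfn_pte(uint64_t val)
{
	unsigned long pfn, mfn;
	uint64_t flags;

	if (!(val & PAGE_PRESENT))
		return val;
	pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
	flags = val & PTE_FLAGS_MASK;
	mfn = p2m[pfn];
	if (mfn == INVALID_P2M) {	/* collapse to an empty pte: the   */
		mfn = 0;		/* original pfn is lost, hence the */
		flags = 0;		/* conversion is asymmetric        */
	}
	return ((uint64_t)mfn << PAGE_SHIFT) | flags;
}

int main(void)
{
	printf("pfn 1 -> %#llx\n", (unsigned long long)
	       pfn_to_mfn_pte((1ULL << PAGE_SHIFT) | PAGE_PRESENT));
	printf("pfn 2 -> %#llx\n", (unsigned long long)
	       pfn_to_mfn_pte((2ULL << PAGE_SHIFT) | PAGE_PRESENT));
	return 0;
}
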
409 | 409 | ||
410 | __visible pteval_t xen_pte_val(pte_t pte) | 410 | __visible pteval_t xen_pte_val(pte_t pte) |
411 | { | 411 | { |
412 | pteval_t pteval = pte.pte; | 412 | pteval_t pteval = pte.pte; |
413 | 413 | ||
414 | return pte_mfn_to_pfn(pteval); | 414 | return pte_mfn_to_pfn(pteval); |
415 | } | 415 | } |
416 | PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); | 416 | PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); |
417 | 417 | ||
418 | __visible pgdval_t xen_pgd_val(pgd_t pgd) | 418 | __visible pgdval_t xen_pgd_val(pgd_t pgd) |
419 | { | 419 | { |
420 | return pte_mfn_to_pfn(pgd.pgd); | 420 | return pte_mfn_to_pfn(pgd.pgd); |
421 | } | 421 | } |
422 | PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); | 422 | PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); |
423 | 423 | ||
424 | __visible pte_t xen_make_pte(pteval_t pte) | 424 | __visible pte_t xen_make_pte(pteval_t pte) |
425 | { | 425 | { |
426 | pte = pte_pfn_to_mfn(pte); | 426 | pte = pte_pfn_to_mfn(pte); |
427 | 427 | ||
428 | return native_make_pte(pte); | 428 | return native_make_pte(pte); |
429 | } | 429 | } |
430 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); | 430 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); |
431 | 431 | ||
432 | __visible pgd_t xen_make_pgd(pgdval_t pgd) | 432 | __visible pgd_t xen_make_pgd(pgdval_t pgd) |
433 | { | 433 | { |
434 | pgd = pte_pfn_to_mfn(pgd); | 434 | pgd = pte_pfn_to_mfn(pgd); |
435 | return native_make_pgd(pgd); | 435 | return native_make_pgd(pgd); |
436 | } | 436 | } |
437 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); | 437 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); |
438 | 438 | ||
439 | __visible pmdval_t xen_pmd_val(pmd_t pmd) | 439 | __visible pmdval_t xen_pmd_val(pmd_t pmd) |
440 | { | 440 | { |
441 | return pte_mfn_to_pfn(pmd.pmd); | 441 | return pte_mfn_to_pfn(pmd.pmd); |
442 | } | 442 | } |
443 | PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); | 443 | PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); |
444 | 444 | ||
445 | static void xen_set_pud_hyper(pud_t *ptr, pud_t val) | 445 | static void xen_set_pud_hyper(pud_t *ptr, pud_t val) |
446 | { | 446 | { |
447 | struct mmu_update u; | 447 | struct mmu_update u; |
448 | 448 | ||
449 | preempt_disable(); | 449 | preempt_disable(); |
450 | 450 | ||
451 | xen_mc_batch(); | 451 | xen_mc_batch(); |
452 | 452 | ||
453 | /* ptr may be ioremapped for 64-bit pagetable setup */ | 453 | /* ptr may be ioremapped for 64-bit pagetable setup */ |
454 | u.ptr = arbitrary_virt_to_machine(ptr).maddr; | 454 | u.ptr = arbitrary_virt_to_machine(ptr).maddr; |
455 | u.val = pud_val_ma(val); | 455 | u.val = pud_val_ma(val); |
456 | xen_extend_mmu_update(&u); | 456 | xen_extend_mmu_update(&u); |
457 | 457 | ||
458 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 458 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
459 | 459 | ||
460 | preempt_enable(); | 460 | preempt_enable(); |
461 | } | 461 | } |
462 | 462 | ||
463 | static void xen_set_pud(pud_t *ptr, pud_t val) | 463 | static void xen_set_pud(pud_t *ptr, pud_t val) |
464 | { | 464 | { |
465 | trace_xen_mmu_set_pud(ptr, val); | 465 | trace_xen_mmu_set_pud(ptr, val); |
466 | 466 | ||
467 | /* If page is not pinned, we can just update the entry | 467 | /* If page is not pinned, we can just update the entry |
468 | directly */ | 468 | directly */ |
469 | if (!xen_page_pinned(ptr)) { | 469 | if (!xen_page_pinned(ptr)) { |
470 | *ptr = val; | 470 | *ptr = val; |
471 | return; | 471 | return; |
472 | } | 472 | } |
473 | 473 | ||
474 | xen_set_pud_hyper(ptr, val); | 474 | xen_set_pud_hyper(ptr, val); |
475 | } | 475 | } |
476 | 476 | ||
477 | #ifdef CONFIG_X86_PAE | 477 | #ifdef CONFIG_X86_PAE |
478 | static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) | 478 | static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) |
479 | { | 479 | { |
480 | trace_xen_mmu_set_pte_atomic(ptep, pte); | 480 | trace_xen_mmu_set_pte_atomic(ptep, pte); |
481 | set_64bit((u64 *)ptep, native_pte_val(pte)); | 481 | set_64bit((u64 *)ptep, native_pte_val(pte)); |
482 | } | 482 | } |
483 | 483 | ||
484 | static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | 484 | static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) |
485 | { | 485 | { |
486 | trace_xen_mmu_pte_clear(mm, addr, ptep); | 486 | trace_xen_mmu_pte_clear(mm, addr, ptep); |
487 | if (!xen_batched_set_pte(ptep, native_make_pte(0))) | 487 | if (!xen_batched_set_pte(ptep, native_make_pte(0))) |
488 | native_pte_clear(mm, addr, ptep); | 488 | native_pte_clear(mm, addr, ptep); |
489 | } | 489 | } |
490 | 490 | ||
491 | static void xen_pmd_clear(pmd_t *pmdp) | 491 | static void xen_pmd_clear(pmd_t *pmdp) |
492 | { | 492 | { |
493 | trace_xen_mmu_pmd_clear(pmdp); | 493 | trace_xen_mmu_pmd_clear(pmdp); |
494 | set_pmd(pmdp, __pmd(0)); | 494 | set_pmd(pmdp, __pmd(0)); |
495 | } | 495 | } |
496 | #endif /* CONFIG_X86_PAE */ | 496 | #endif /* CONFIG_X86_PAE */ |
497 | 497 | ||
498 | __visible pmd_t xen_make_pmd(pmdval_t pmd) | 498 | __visible pmd_t xen_make_pmd(pmdval_t pmd) |
499 | { | 499 | { |
500 | pmd = pte_pfn_to_mfn(pmd); | 500 | pmd = pte_pfn_to_mfn(pmd); |
501 | return native_make_pmd(pmd); | 501 | return native_make_pmd(pmd); |
502 | } | 502 | } |
503 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); | 503 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); |
504 | 504 | ||
505 | #if PAGETABLE_LEVELS == 4 | 505 | #if PAGETABLE_LEVELS == 4 |
506 | __visible pudval_t xen_pud_val(pud_t pud) | 506 | __visible pudval_t xen_pud_val(pud_t pud) |
507 | { | 507 | { |
508 | return pte_mfn_to_pfn(pud.pud); | 508 | return pte_mfn_to_pfn(pud.pud); |
509 | } | 509 | } |
510 | PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); | 510 | PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); |
511 | 511 | ||
512 | __visible pud_t xen_make_pud(pudval_t pud) | 512 | __visible pud_t xen_make_pud(pudval_t pud) |
513 | { | 513 | { |
514 | pud = pte_pfn_to_mfn(pud); | 514 | pud = pte_pfn_to_mfn(pud); |
515 | 515 | ||
516 | return native_make_pud(pud); | 516 | return native_make_pud(pud); |
517 | } | 517 | } |
518 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); | 518 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); |
519 | 519 | ||
520 | static pgd_t *xen_get_user_pgd(pgd_t *pgd) | 520 | static pgd_t *xen_get_user_pgd(pgd_t *pgd) |
521 | { | 521 | { |
522 | pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); | 522 | pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); |
523 | unsigned offset = pgd - pgd_page; | 523 | unsigned offset = pgd - pgd_page; |
524 | pgd_t *user_ptr = NULL; | 524 | pgd_t *user_ptr = NULL; |
525 | 525 | ||
526 | if (offset < pgd_index(USER_LIMIT)) { | 526 | if (offset < pgd_index(USER_LIMIT)) { |
527 | struct page *page = virt_to_page(pgd_page); | 527 | struct page *page = virt_to_page(pgd_page); |
528 | user_ptr = (pgd_t *)page->private; | 528 | user_ptr = (pgd_t *)page->private; |
529 | if (user_ptr) | 529 | if (user_ptr) |
530 | user_ptr += offset; | 530 | user_ptr += offset; |
531 | } | 531 | } |
532 | 532 | ||
533 | return user_ptr; | 533 | return user_ptr; |
534 | } | 534 | } |
535 | 535 | ||
536 | static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) | 536 | static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) |
537 | { | 537 | { |
538 | struct mmu_update u; | 538 | struct mmu_update u; |
539 | 539 | ||
540 | u.ptr = virt_to_machine(ptr).maddr; | 540 | u.ptr = virt_to_machine(ptr).maddr; |
541 | u.val = pgd_val_ma(val); | 541 | u.val = pgd_val_ma(val); |
542 | xen_extend_mmu_update(&u); | 542 | xen_extend_mmu_update(&u); |
543 | } | 543 | } |
544 | 544 | ||
545 | /* | 545 | /* |
546 | * Raw hypercall-based set_pgd, intended for use in early boot before | 546 | * Raw hypercall-based set_pgd, intended for use in early boot before
547 | * there's a page structure. This implies: | 547 | * there's a page structure. This implies: |
548 | * 1. The only existing pagetable is the kernel's | 548 | * 1. The only existing pagetable is the kernel's |
549 | * 2. It is always pinned | 549 | * 2. It is always pinned |
550 | * 3. It has no user pagetable attached to it | 550 | * 3. It has no user pagetable attached to it |
551 | */ | 551 | */ |
552 | static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) | 552 | static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) |
553 | { | 553 | { |
554 | preempt_disable(); | 554 | preempt_disable(); |
555 | 555 | ||
556 | xen_mc_batch(); | 556 | xen_mc_batch(); |
557 | 557 | ||
558 | __xen_set_pgd_hyper(ptr, val); | 558 | __xen_set_pgd_hyper(ptr, val); |
559 | 559 | ||
560 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 560 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
561 | 561 | ||
562 | preempt_enable(); | 562 | preempt_enable(); |
563 | } | 563 | } |
564 | 564 | ||
565 | static void xen_set_pgd(pgd_t *ptr, pgd_t val) | 565 | static void xen_set_pgd(pgd_t *ptr, pgd_t val) |
566 | { | 566 | { |
567 | pgd_t *user_ptr = xen_get_user_pgd(ptr); | 567 | pgd_t *user_ptr = xen_get_user_pgd(ptr); |
568 | 568 | ||
569 | trace_xen_mmu_set_pgd(ptr, user_ptr, val); | 569 | trace_xen_mmu_set_pgd(ptr, user_ptr, val); |
570 | 570 | ||
571 | /* If page is not pinned, we can just update the entry | 571 | /* If page is not pinned, we can just update the entry |
572 | directly */ | 572 | directly */ |
573 | if (!xen_page_pinned(ptr)) { | 573 | if (!xen_page_pinned(ptr)) { |
574 | *ptr = val; | 574 | *ptr = val; |
575 | if (user_ptr) { | 575 | if (user_ptr) { |
576 | WARN_ON(xen_page_pinned(user_ptr)); | 576 | WARN_ON(xen_page_pinned(user_ptr)); |
577 | *user_ptr = val; | 577 | *user_ptr = val; |
578 | } | 578 | } |
579 | return; | 579 | return; |
580 | } | 580 | } |
581 | 581 | ||
582 | /* If it's pinned, then we can at least batch the kernel and | 582 | /* If it's pinned, then we can at least batch the kernel and |
583 | user updates together. */ | 583 | user updates together. */ |
584 | xen_mc_batch(); | 584 | xen_mc_batch(); |
585 | 585 | ||
586 | __xen_set_pgd_hyper(ptr, val); | 586 | __xen_set_pgd_hyper(ptr, val); |
587 | if (user_ptr) | 587 | if (user_ptr) |
588 | __xen_set_pgd_hyper(user_ptr, val); | 588 | __xen_set_pgd_hyper(user_ptr, val); |
589 | 589 | ||
590 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 590 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
591 | } | 591 | } |
592 | #endif /* PAGETABLE_LEVELS == 4 */ | 592 | #endif /* PAGETABLE_LEVELS == 4 */ |
593 | 593 | ||
594 | /* | 594 | /* |
595 | * (Yet another) pagetable walker. This one is intended for pinning a | 595 | * (Yet another) pagetable walker. This one is intended for pinning a |
596 | * pagetable. This means that it walks a pagetable and calls the | 596 | * pagetable. This means that it walks a pagetable and calls the |
597 | * callback function on each page it finds making up the page table, | 597 | * callback function on each page it finds making up the page table, |
598 | * at every level. It walks the entire pagetable, but it only bothers | 598 | * at every level. It walks the entire pagetable, but it only bothers |
599 | * pinning pte pages which are below limit. In the normal case this | 599 | * pinning pte pages which are below limit. In the normal case this |
600 | * will be STACK_TOP_MAX, but at boot we need to pin up to | 600 | * will be STACK_TOP_MAX, but at boot we need to pin up to |
601 | * FIXADDR_TOP. | 601 | * FIXADDR_TOP. |
602 | * | 602 | * |
603 | * For 32-bit the important bit is that we don't pin beyond there, | 603 | * For 32-bit the important bit is that we don't pin beyond there, |
604 | * because then we start getting into Xen's ptes. | 604 | * because then we start getting into Xen's ptes. |
605 | * | 605 | * |
606 | * For 64-bit, we must skip the Xen hole in the middle of the address | 606 | * For 64-bit, we must skip the Xen hole in the middle of the address |
607 | * space, just after the big x86-64 virtual hole. | 607 | * space, just after the big x86-64 virtual hole. |
608 | */ | 608 | */ |
609 | static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, | 609 | static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, |
610 | int (*func)(struct mm_struct *mm, struct page *, | 610 | int (*func)(struct mm_struct *mm, struct page *, |
611 | enum pt_level), | 611 | enum pt_level), |
612 | unsigned long limit) | 612 | unsigned long limit) |
613 | { | 613 | { |
614 | int flush = 0; | 614 | int flush = 0; |
615 | unsigned hole_low, hole_high; | 615 | unsigned hole_low, hole_high; |
616 | unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; | 616 | unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; |
617 | unsigned pgdidx, pudidx, pmdidx; | 617 | unsigned pgdidx, pudidx, pmdidx; |
618 | 618 | ||
619 | /* The limit is the last byte to be touched */ | 619 | /* The limit is the last byte to be touched */ |
620 | limit--; | 620 | limit--; |
621 | BUG_ON(limit >= FIXADDR_TOP); | 621 | BUG_ON(limit >= FIXADDR_TOP); |
622 | 622 | ||
623 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 623 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
624 | return 0; | 624 | return 0; |
625 | 625 | ||
626 | /* | 626 | /* |
627 | * 64-bit has a great big hole in the middle of the address | 627 | * 64-bit has a great big hole in the middle of the address |
628 | * space, which contains the Xen mappings. On 32-bit these | 628 | * space, which contains the Xen mappings. On 32-bit these |
629 | * will end up making a zero-sized hole, so this is a no-op. | 629 | * will end up making a zero-sized hole, so this is a no-op.
630 | */ | 630 | */ |
631 | hole_low = pgd_index(USER_LIMIT); | 631 | hole_low = pgd_index(USER_LIMIT); |
632 | hole_high = pgd_index(PAGE_OFFSET); | 632 | hole_high = pgd_index(PAGE_OFFSET); |
633 | 633 | ||
634 | pgdidx_limit = pgd_index(limit); | 634 | pgdidx_limit = pgd_index(limit); |
635 | #if PTRS_PER_PUD > 1 | 635 | #if PTRS_PER_PUD > 1 |
636 | pudidx_limit = pud_index(limit); | 636 | pudidx_limit = pud_index(limit); |
637 | #else | 637 | #else |
638 | pudidx_limit = 0; | 638 | pudidx_limit = 0; |
639 | #endif | 639 | #endif |
640 | #if PTRS_PER_PMD > 1 | 640 | #if PTRS_PER_PMD > 1 |
641 | pmdidx_limit = pmd_index(limit); | 641 | pmdidx_limit = pmd_index(limit); |
642 | #else | 642 | #else |
643 | pmdidx_limit = 0; | 643 | pmdidx_limit = 0; |
644 | #endif | 644 | #endif |
645 | 645 | ||
646 | for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { | 646 | for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { |
647 | pud_t *pud; | 647 | pud_t *pud; |
648 | 648 | ||
649 | if (pgdidx >= hole_low && pgdidx < hole_high) | 649 | if (pgdidx >= hole_low && pgdidx < hole_high) |
650 | continue; | 650 | continue; |
651 | 651 | ||
652 | if (!pgd_val(pgd[pgdidx])) | 652 | if (!pgd_val(pgd[pgdidx])) |
653 | continue; | 653 | continue; |
654 | 654 | ||
655 | pud = pud_offset(&pgd[pgdidx], 0); | 655 | pud = pud_offset(&pgd[pgdidx], 0); |
656 | 656 | ||
657 | if (PTRS_PER_PUD > 1) /* not folded */ | 657 | if (PTRS_PER_PUD > 1) /* not folded */ |
658 | flush |= (*func)(mm, virt_to_page(pud), PT_PUD); | 658 | flush |= (*func)(mm, virt_to_page(pud), PT_PUD); |
659 | 659 | ||
660 | for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { | 660 | for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { |
661 | pmd_t *pmd; | 661 | pmd_t *pmd; |
662 | 662 | ||
663 | if (pgdidx == pgdidx_limit && | 663 | if (pgdidx == pgdidx_limit && |
664 | pudidx > pudidx_limit) | 664 | pudidx > pudidx_limit) |
665 | goto out; | 665 | goto out; |
666 | 666 | ||
667 | if (pud_none(pud[pudidx])) | 667 | if (pud_none(pud[pudidx])) |
668 | continue; | 668 | continue; |
669 | 669 | ||
670 | pmd = pmd_offset(&pud[pudidx], 0); | 670 | pmd = pmd_offset(&pud[pudidx], 0); |
671 | 671 | ||
672 | if (PTRS_PER_PMD > 1) /* not folded */ | 672 | if (PTRS_PER_PMD > 1) /* not folded */ |
673 | flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); | 673 | flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); |
674 | 674 | ||
675 | for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { | 675 | for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { |
676 | struct page *pte; | 676 | struct page *pte; |
677 | 677 | ||
678 | if (pgdidx == pgdidx_limit && | 678 | if (pgdidx == pgdidx_limit && |
679 | pudidx == pudidx_limit && | 679 | pudidx == pudidx_limit && |
680 | pmdidx > pmdidx_limit) | 680 | pmdidx > pmdidx_limit) |
681 | goto out; | 681 | goto out; |
682 | 682 | ||
683 | if (pmd_none(pmd[pmdidx])) | 683 | if (pmd_none(pmd[pmdidx])) |
684 | continue; | 684 | continue; |
685 | 685 | ||
686 | pte = pmd_page(pmd[pmdidx]); | 686 | pte = pmd_page(pmd[pmdidx]); |
687 | flush |= (*func)(mm, pte, PT_PTE); | 687 | flush |= (*func)(mm, pte, PT_PTE); |
688 | } | 688 | } |
689 | } | 689 | } |
690 | } | 690 | } |
691 | 691 | ||
692 | out: | 692 | out: |
693 | /* Do the top level last, so that the callbacks can use it as | 693 | /* Do the top level last, so that the callbacks can use it as |
694 | a cue to do final things like tlb flushes. */ | 694 | a cue to do final things like tlb flushes. */ |
695 | flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); | 695 | flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); |
696 | 696 | ||
697 | return flush; | 697 | return flush; |
698 | } | 698 | } |
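
The hole skip is the subtle part of the walker: pgd slots between pgd_index(USER_LIMIT) and pgd_index(PAGE_OFFSET) belong to Xen and must never be visited. A tiny model of that loop; indices 256 and 272 correspond to the 64-bit split described above, while the limit is invented:

#include <stdio.h>

int main(void)
{
	unsigned hole_low = 256, hole_high = 272, limit = 300;
	unsigned visited = 0;

	for (unsigned idx = 0; idx <= limit; idx++) {
		if (idx >= hole_low && idx < hole_high)
			continue;	/* Xen's own entries: never touch */
		visited++;		/* visit(pgd[idx]) would go here */
	}
	printf("visited %u of %u slots\n", visited, limit + 1);
	return 0;
}
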
699 | 699 | ||
700 | static int xen_pgd_walk(struct mm_struct *mm, | 700 | static int xen_pgd_walk(struct mm_struct *mm, |
701 | int (*func)(struct mm_struct *mm, struct page *, | 701 | int (*func)(struct mm_struct *mm, struct page *, |
702 | enum pt_level), | 702 | enum pt_level), |
703 | unsigned long limit) | 703 | unsigned long limit) |
704 | { | 704 | { |
705 | return __xen_pgd_walk(mm, mm->pgd, func, limit); | 705 | return __xen_pgd_walk(mm, mm->pgd, func, limit); |
706 | } | 706 | } |
707 | 707 | ||
708 | /* If we're using split pte locks, then take the page's lock and | 708 | /* If we're using split pte locks, then take the page's lock and |
709 | return a pointer to it. Otherwise return NULL. */ | 709 | return a pointer to it. Otherwise return NULL. */ |
710 | static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) | 710 | static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) |
711 | { | 711 | { |
712 | spinlock_t *ptl = NULL; | 712 | spinlock_t *ptl = NULL; |
713 | 713 | ||
714 | #if USE_SPLIT_PTE_PTLOCKS | 714 | #if USE_SPLIT_PTE_PTLOCKS |
715 | ptl = ptlock_ptr(page); | 715 | ptl = ptlock_ptr(page); |
716 | spin_lock_nest_lock(ptl, &mm->page_table_lock); | 716 | spin_lock_nest_lock(ptl, &mm->page_table_lock); |
717 | #endif | 717 | #endif |
718 | 718 | ||
719 | return ptl; | 719 | return ptl; |
720 | } | 720 | } |
721 | 721 | ||
722 | static void xen_pte_unlock(void *v) | 722 | static void xen_pte_unlock(void *v) |
723 | { | 723 | { |
724 | spinlock_t *ptl = v; | 724 | spinlock_t *ptl = v; |
725 | spin_unlock(ptl); | 725 | spin_unlock(ptl); |
726 | } | 726 | } |
727 | 727 | ||
728 | static void xen_do_pin(unsigned level, unsigned long pfn) | 728 | static void xen_do_pin(unsigned level, unsigned long pfn) |
729 | { | 729 | { |
730 | struct mmuext_op op; | 730 | struct mmuext_op op; |
731 | 731 | ||
732 | op.cmd = level; | 732 | op.cmd = level; |
733 | op.arg1.mfn = pfn_to_mfn(pfn); | 733 | op.arg1.mfn = pfn_to_mfn(pfn); |
734 | 734 | ||
735 | xen_extend_mmuext_op(&op); | 735 | xen_extend_mmuext_op(&op); |
736 | } | 736 | } |
737 | 737 | ||
738 | static int xen_pin_page(struct mm_struct *mm, struct page *page, | 738 | static int xen_pin_page(struct mm_struct *mm, struct page *page, |
739 | enum pt_level level) | 739 | enum pt_level level) |
740 | { | 740 | { |
741 | unsigned pgfl = TestSetPagePinned(page); | 741 | unsigned pgfl = TestSetPagePinned(page); |
742 | int flush; | 742 | int flush; |
743 | 743 | ||
744 | if (pgfl) | 744 | if (pgfl) |
745 | flush = 0; /* already pinned */ | 745 | flush = 0; /* already pinned */ |
746 | else if (PageHighMem(page)) | 746 | else if (PageHighMem(page)) |
747 | /* kmaps need flushing if we found an unpinned | 747 | /* kmaps need flushing if we found an unpinned |
748 | highpage */ | 748 | highpage */ |
749 | flush = 1; | 749 | flush = 1; |
750 | else { | 750 | else { |
751 | void *pt = lowmem_page_address(page); | 751 | void *pt = lowmem_page_address(page); |
752 | unsigned long pfn = page_to_pfn(page); | 752 | unsigned long pfn = page_to_pfn(page); |
753 | struct multicall_space mcs = __xen_mc_entry(0); | 753 | struct multicall_space mcs = __xen_mc_entry(0); |
754 | spinlock_t *ptl; | 754 | spinlock_t *ptl; |
755 | 755 | ||
756 | flush = 0; | 756 | flush = 0; |
757 | 757 | ||
758 | /* | 758 | /* |
759 | * We need to hold the pagetable lock between the time | 759 | * We need to hold the pagetable lock between the time |
760 | * we make the pagetable RO and when we actually pin | 760 | * we make the pagetable RO and when we actually pin |
761 | * it. If we don't, then other users may come in and | 761 | * it. If we don't, then other users may come in and |
762 | * attempt to update the pagetable by writing it, | 762 | * attempt to update the pagetable by writing it, |
763 | * which will fail because the memory is RO but not | 763 | * which will fail because the memory is RO but not |
764 | * pinned, so Xen won't do the trap'n'emulate. | 764 | * pinned, so Xen won't do the trap'n'emulate. |
765 | * | 765 | * |
766 | * If we're using split pte locks, we can't hold the | 766 | * If we're using split pte locks, we can't hold the |
767 | * entire pagetable's worth of locks during the | 767 | * entire pagetable's worth of locks during the |
768 | * traverse, because we may wrap the preempt count (8 | 768 | * traverse, because we may wrap the preempt count (8 |
769 | * bits). The solution is to mark RO and pin each PTE | 769 | * bits). The solution is to mark RO and pin each PTE |
770 | * page while holding the lock. This means the number | 770 | * page while holding the lock. This means the number |
771 | * of locks we end up holding is never more than a | 771 | * of locks we end up holding is never more than a |
772 | * batch size (~32 entries, at present). | 772 | * batch size (~32 entries, at present). |
773 | * | 773 | * |
774 | * If we're not using split pte locks, we needn't pin | 774 | * If we're not using split pte locks, we needn't pin |
775 | * the PTE pages independently, because we're | 775 | * the PTE pages independently, because we're |
776 | * protected by the overall pagetable lock. | 776 | * protected by the overall pagetable lock. |
777 | */ | 777 | */ |
778 | ptl = NULL; | 778 | ptl = NULL; |
779 | if (level == PT_PTE) | 779 | if (level == PT_PTE) |
780 | ptl = xen_pte_lock(page, mm); | 780 | ptl = xen_pte_lock(page, mm); |
781 | 781 | ||
782 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | 782 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, |
783 | pfn_pte(pfn, PAGE_KERNEL_RO), | 783 | pfn_pte(pfn, PAGE_KERNEL_RO), |
784 | level == PT_PGD ? UVMF_TLB_FLUSH : 0); | 784 | level == PT_PGD ? UVMF_TLB_FLUSH : 0); |
785 | 785 | ||
786 | if (ptl) { | 786 | if (ptl) { |
787 | xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); | 787 | xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); |
788 | 788 | ||
789 | /* Queue a deferred unlock for when this batch | 789 | /* Queue a deferred unlock for when this batch |
790 | is completed. */ | 790 | is completed. */ |
791 | xen_mc_callback(xen_pte_unlock, ptl); | 791 | xen_mc_callback(xen_pte_unlock, ptl); |
792 | } | 792 | } |
793 | } | 793 | } |
794 | 794 | ||
795 | return flush; | 795 | return flush; |
796 | } | 796 | } |
797 | 797 | ||
798 | /* This is called just after a mm has been created, but it has not | 798 | /* This is called just after a mm has been created, but it has not |
799 | been used yet. We need to make sure that its pagetable is all | 799 | been used yet. We need to make sure that its pagetable is all |
800 | read-only, and can be pinned. */ | 800 | read-only, and can be pinned. */ |
801 | static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) | 801 | static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) |
802 | { | 802 | { |
803 | trace_xen_mmu_pgd_pin(mm, pgd); | 803 | trace_xen_mmu_pgd_pin(mm, pgd); |
804 | 804 | ||
805 | xen_mc_batch(); | 805 | xen_mc_batch(); |
806 | 806 | ||
807 | if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { | 807 | if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { |
808 | /* re-enable interrupts for flushing */ | 808 | /* re-enable interrupts for flushing */ |
809 | xen_mc_issue(0); | 809 | xen_mc_issue(0); |
810 | 810 | ||
811 | kmap_flush_unused(); | 811 | kmap_flush_unused(); |
812 | 812 | ||
813 | xen_mc_batch(); | 813 | xen_mc_batch(); |
814 | } | 814 | } |
815 | 815 | ||
816 | #ifdef CONFIG_X86_64 | 816 | #ifdef CONFIG_X86_64 |
817 | { | 817 | { |
818 | pgd_t *user_pgd = xen_get_user_pgd(pgd); | 818 | pgd_t *user_pgd = xen_get_user_pgd(pgd); |
819 | 819 | ||
820 | xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); | 820 | xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); |
821 | 821 | ||
822 | if (user_pgd) { | 822 | if (user_pgd) { |
823 | xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); | 823 | xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); |
824 | xen_do_pin(MMUEXT_PIN_L4_TABLE, | 824 | xen_do_pin(MMUEXT_PIN_L4_TABLE, |
825 | PFN_DOWN(__pa(user_pgd))); | 825 | PFN_DOWN(__pa(user_pgd))); |
826 | } | 826 | } |
827 | } | 827 | } |
828 | #else /* CONFIG_X86_32 */ | 828 | #else /* CONFIG_X86_32 */ |
829 | #ifdef CONFIG_X86_PAE | 829 | #ifdef CONFIG_X86_PAE |
830 | /* Need to make sure unshared kernel PMD is pinnable */ | 830 | /* Need to make sure unshared kernel PMD is pinnable */ |
831 | xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), | 831 | xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), |
832 | PT_PMD); | 832 | PT_PMD); |
833 | #endif | 833 | #endif |
834 | xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); | 834 | xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); |
835 | #endif /* CONFIG_X86_64 */ | 835 | #endif /* CONFIG_X86_64 */ |
836 | xen_mc_issue(0); | 836 | xen_mc_issue(0); |
837 | } | 837 | } |
838 | 838 | ||
839 | static void xen_pgd_pin(struct mm_struct *mm) | 839 | static void xen_pgd_pin(struct mm_struct *mm) |
840 | { | 840 | { |
841 | __xen_pgd_pin(mm, mm->pgd); | 841 | __xen_pgd_pin(mm, mm->pgd); |
842 | } | 842 | } |
843 | 843 | ||
844 | /* | 844 | /* |
845 | * On save, we need to pin all pagetables to make sure they get their | 845 | * On save, we need to pin all pagetables to make sure they get their |
846 | * mfns turned into pfns. Search the list for any unpinned pgds and pin | 846 | * mfns turned into pfns. Search the list for any unpinned pgds and pin |
847 | * them (unpinned pgds are not currently in use, probably because the | 847 | * them (unpinned pgds are not currently in use, probably because the |
848 | * process is under construction or destruction). | 848 | * process is under construction or destruction). |
849 | * | 849 | * |
850 | * Expected to be called in stop_machine() ("equivalent to taking | 850 | * Expected to be called in stop_machine() ("equivalent to taking |
851 | * every spinlock in the system"), so the locking doesn't really | 851 | * every spinlock in the system"), so the locking doesn't really |
852 | * matter all that much. | 852 | * matter all that much. |
853 | */ | 853 | */ |
854 | void xen_mm_pin_all(void) | 854 | void xen_mm_pin_all(void) |
855 | { | 855 | { |
856 | struct page *page; | 856 | struct page *page; |
857 | 857 | ||
858 | spin_lock(&pgd_lock); | 858 | spin_lock(&pgd_lock); |
859 | 859 | ||
860 | list_for_each_entry(page, &pgd_list, lru) { | 860 | list_for_each_entry(page, &pgd_list, lru) { |
861 | if (!PagePinned(page)) { | 861 | if (!PagePinned(page)) { |
862 | __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); | 862 | __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); |
863 | SetPageSavePinned(page); | 863 | SetPageSavePinned(page); |
864 | } | 864 | } |
865 | } | 865 | } |
866 | 866 | ||
867 | spin_unlock(&pgd_lock); | 867 | spin_unlock(&pgd_lock); |
868 | } | 868 | } |
869 | 869 | ||
870 | /* | 870 | /* |
871 | * The init_mm pagetable is really pinned as soon as it's created, but | 871 | * The init_mm pagetable is really pinned as soon as it's created, but |
872 | * that's before we have page structures to store the bits. So do all | 872 | * that's before we have page structures to store the bits. So do all |
873 | * the book-keeping now. | 873 | * the book-keeping now. |
874 | */ | 874 | */ |
875 | static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, | 875 | static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, |
876 | enum pt_level level) | 876 | enum pt_level level) |
877 | { | 877 | { |
878 | SetPagePinned(page); | 878 | SetPagePinned(page); |
879 | return 0; | 879 | return 0; |
880 | } | 880 | } |
881 | 881 | ||
882 | static void __init xen_mark_init_mm_pinned(void) | 882 | static void __init xen_mark_init_mm_pinned(void) |
883 | { | 883 | { |
884 | xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); | 884 | xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); |
885 | } | 885 | } |
886 | 886 | ||
887 | static int xen_unpin_page(struct mm_struct *mm, struct page *page, | 887 | static int xen_unpin_page(struct mm_struct *mm, struct page *page, |
888 | enum pt_level level) | 888 | enum pt_level level) |
889 | { | 889 | { |
890 | unsigned pgfl = TestClearPagePinned(page); | 890 | unsigned pgfl = TestClearPagePinned(page); |
891 | 891 | ||
892 | if (pgfl && !PageHighMem(page)) { | 892 | if (pgfl && !PageHighMem(page)) { |
893 | void *pt = lowmem_page_address(page); | 893 | void *pt = lowmem_page_address(page); |
894 | unsigned long pfn = page_to_pfn(page); | 894 | unsigned long pfn = page_to_pfn(page); |
895 | spinlock_t *ptl = NULL; | 895 | spinlock_t *ptl = NULL; |
896 | struct multicall_space mcs; | 896 | struct multicall_space mcs; |
897 | 897 | ||
898 | /* | 898 | /* |
899 | * Do the converse to pin_page. If we're using split | 899 | * Do the converse to pin_page. If we're using split |
900 | * pte locks, we must be holding the lock while | 900 | * pte locks, we must be holding the lock while |
901 | * the pte page is unpinned but still RO to prevent | 901 | * the pte page is unpinned but still RO to prevent |
902 | * concurrent updates from seeing it in this | 902 | * concurrent updates from seeing it in this |
903 | * partially-pinned state. | 903 | * partially-pinned state. |
904 | */ | 904 | */ |
905 | if (level == PT_PTE) { | 905 | if (level == PT_PTE) { |
906 | ptl = xen_pte_lock(page, mm); | 906 | ptl = xen_pte_lock(page, mm); |
907 | 907 | ||
908 | if (ptl) | 908 | if (ptl) |
909 | xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); | 909 | xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); |
910 | } | 910 | } |
911 | 911 | ||
912 | mcs = __xen_mc_entry(0); | 912 | mcs = __xen_mc_entry(0); |
913 | 913 | ||
914 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | 914 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, |
915 | pfn_pte(pfn, PAGE_KERNEL), | 915 | pfn_pte(pfn, PAGE_KERNEL), |
916 | level == PT_PGD ? UVMF_TLB_FLUSH : 0); | 916 | level == PT_PGD ? UVMF_TLB_FLUSH : 0); |
917 | 917 | ||
918 | if (ptl) { | 918 | if (ptl) { |
919 | /* unlock when batch completed */ | 919 | /* unlock when batch completed */ |
920 | xen_mc_callback(xen_pte_unlock, ptl); | 920 | xen_mc_callback(xen_pte_unlock, ptl); |
921 | } | 921 | } |
922 | } | 922 | } |
923 | 923 | ||
924 | return 0; /* never need to flush on unpin */ | 924 | return 0; /* never need to flush on unpin */ |
925 | } | 925 | } |
926 | 926 | ||
927 | /* Release a pagetable's pages back as normal RW */ | 927 | /* Release a pagetable's pages back as normal RW */ |
928 | static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) | 928 | static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) |
929 | { | 929 | { |
930 | trace_xen_mmu_pgd_unpin(mm, pgd); | 930 | trace_xen_mmu_pgd_unpin(mm, pgd); |
931 | 931 | ||
932 | xen_mc_batch(); | 932 | xen_mc_batch(); |
933 | 933 | ||
934 | xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); | 934 | xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); |
935 | 935 | ||
936 | #ifdef CONFIG_X86_64 | 936 | #ifdef CONFIG_X86_64 |
937 | { | 937 | { |
938 | pgd_t *user_pgd = xen_get_user_pgd(pgd); | 938 | pgd_t *user_pgd = xen_get_user_pgd(pgd); |
939 | 939 | ||
940 | if (user_pgd) { | 940 | if (user_pgd) { |
941 | xen_do_pin(MMUEXT_UNPIN_TABLE, | 941 | xen_do_pin(MMUEXT_UNPIN_TABLE, |
942 | PFN_DOWN(__pa(user_pgd))); | 942 | PFN_DOWN(__pa(user_pgd))); |
943 | xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); | 943 | xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); |
944 | } | 944 | } |
945 | } | 945 | } |
946 | #endif | 946 | #endif |
947 | 947 | ||
948 | #ifdef CONFIG_X86_PAE | 948 | #ifdef CONFIG_X86_PAE |
949 | /* Need to make sure unshared kernel PMD is unpinned */ | 949 | /* Need to make sure unshared kernel PMD is unpinned */ |
950 | xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), | 950 | xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), |
951 | PT_PMD); | 951 | PT_PMD); |
952 | #endif | 952 | #endif |
953 | 953 | ||
954 | __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT); | 954 | __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT); |
955 | 955 | ||
956 | xen_mc_issue(0); | 956 | xen_mc_issue(0); |
957 | } | 957 | } |
958 | 958 | ||
959 | static void xen_pgd_unpin(struct mm_struct *mm) | 959 | static void xen_pgd_unpin(struct mm_struct *mm) |
960 | { | 960 | { |
961 | __xen_pgd_unpin(mm, mm->pgd); | 961 | __xen_pgd_unpin(mm, mm->pgd); |
962 | } | 962 | } |
963 | 963 | ||
964 | /* | 964 | /* |
965 | * On resume, undo any pinning done at save, so that the rest of the | 965 | * On resume, undo any pinning done at save, so that the rest of the |
966 | * kernel doesn't see any unexpected pinned pagetables. | 966 | * kernel doesn't see any unexpected pinned pagetables. |
967 | */ | 967 | */ |
968 | void xen_mm_unpin_all(void) | 968 | void xen_mm_unpin_all(void) |
969 | { | 969 | { |
970 | struct page *page; | 970 | struct page *page; |
971 | 971 | ||
972 | spin_lock(&pgd_lock); | 972 | spin_lock(&pgd_lock); |
973 | 973 | ||
974 | list_for_each_entry(page, &pgd_list, lru) { | 974 | list_for_each_entry(page, &pgd_list, lru) { |
975 | if (PageSavePinned(page)) { | 975 | if (PageSavePinned(page)) { |
976 | BUG_ON(!PagePinned(page)); | 976 | BUG_ON(!PagePinned(page)); |
977 | __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); | 977 | __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); |
978 | ClearPageSavePinned(page); | 978 | ClearPageSavePinned(page); |
979 | } | 979 | } |
980 | } | 980 | } |
981 | 981 | ||
982 | spin_unlock(&pgd_lock); | 982 | spin_unlock(&pgd_lock); |
983 | } | 983 | } |
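/*
 * Editor's illustrative sketch (user-space, hypothetical names):
 * xen_mm_pin_all() and xen_mm_unpin_all() pair up across save/resume via
 * two page flags.  SavePinned records "this one was pinned for the save",
 * so only those pgds are unpinned again on resume, while a pgd that was
 * already pinned beforehand stays pinned:
 */
#include <stdio.h>

#define FLAG_PINNED      (1u << 0)
#define FLAG_SAVE_PINNED (1u << 1)

static void pin_all(unsigned int *flags, int n)
{
	for (int i = 0; i < n; i++)
		if (!(flags[i] & FLAG_PINNED))
			flags[i] |= FLAG_PINNED | FLAG_SAVE_PINNED;
}

static void unpin_all(unsigned int *flags, int n)
{
	for (int i = 0; i < n; i++)
		if (flags[i] & FLAG_SAVE_PINNED)
			flags[i] &= ~(FLAG_PINNED | FLAG_SAVE_PINNED);
}

int main(void)
{
	unsigned int pgds[3] = { FLAG_PINNED, 0, 0 }; /* pgd 0 in active use */

	pin_all(pgds, 3);   /* cf. xen_mm_pin_all() on save     */
	unpin_all(pgds, 3); /* cf. xen_mm_unpin_all() on resume */
	printf("pgd0 still pinned: %s\n", pgds[0] & FLAG_PINNED ? "yes" : "no");
	return 0;
}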
984 | 984 | ||
985 | static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) | 985 | static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) |
986 | { | 986 | { |
987 | spin_lock(&next->page_table_lock); | 987 | spin_lock(&next->page_table_lock); |
988 | xen_pgd_pin(next); | 988 | xen_pgd_pin(next); |
989 | spin_unlock(&next->page_table_lock); | 989 | spin_unlock(&next->page_table_lock); |
990 | } | 990 | } |
991 | 991 | ||
992 | static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) | 992 | static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) |
993 | { | 993 | { |
994 | spin_lock(&mm->page_table_lock); | 994 | spin_lock(&mm->page_table_lock); |
995 | xen_pgd_pin(mm); | 995 | xen_pgd_pin(mm); |
996 | spin_unlock(&mm->page_table_lock); | 996 | spin_unlock(&mm->page_table_lock); |
997 | } | 997 | } |
998 | 998 | ||
999 | 999 | ||
1000 | #ifdef CONFIG_SMP | 1000 | #ifdef CONFIG_SMP |
1001 | /* Another cpu may still have its %cr3 pointing at the pagetable, so | 1001 | /* Another cpu may still have its %cr3 pointing at the pagetable, so |
1002 | we need to repoint it somewhere else before we can unpin it. */ | 1002 | we need to repoint it somewhere else before we can unpin it. */ |
1003 | static void drop_other_mm_ref(void *info) | 1003 | static void drop_other_mm_ref(void *info) |
1004 | { | 1004 | { |
1005 | struct mm_struct *mm = info; | 1005 | struct mm_struct *mm = info; |
1006 | struct mm_struct *active_mm; | 1006 | struct mm_struct *active_mm; |
1007 | 1007 | ||
1008 | active_mm = this_cpu_read(cpu_tlbstate.active_mm); | 1008 | active_mm = this_cpu_read(cpu_tlbstate.active_mm); |
1009 | 1009 | ||
1010 | if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) | 1010 | if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) |
1011 | leave_mm(smp_processor_id()); | 1011 | leave_mm(smp_processor_id()); |
1012 | 1012 | ||
1013 | /* If this cpu still has a stale cr3 reference, then make sure | 1013 | /* If this cpu still has a stale cr3 reference, then make sure |
1014 | it has been flushed. */ | 1014 | it has been flushed. */ |
1015 | if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) | 1015 | if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) |
1016 | load_cr3(swapper_pg_dir); | 1016 | load_cr3(swapper_pg_dir); |
1017 | } | 1017 | } |
1018 | 1018 | ||
1019 | static void xen_drop_mm_ref(struct mm_struct *mm) | 1019 | static void xen_drop_mm_ref(struct mm_struct *mm) |
1020 | { | 1020 | { |
1021 | cpumask_var_t mask; | 1021 | cpumask_var_t mask; |
1022 | unsigned cpu; | 1022 | unsigned cpu; |
1023 | 1023 | ||
1024 | if (current->active_mm == mm) { | 1024 | if (current->active_mm == mm) { |
1025 | if (current->mm == mm) | 1025 | if (current->mm == mm) |
1026 | load_cr3(swapper_pg_dir); | 1026 | load_cr3(swapper_pg_dir); |
1027 | else | 1027 | else |
1028 | leave_mm(smp_processor_id()); | 1028 | leave_mm(smp_processor_id()); |
1029 | } | 1029 | } |
1030 | 1030 | ||
1031 | /* Get the "official" set of cpus referring to our pagetable. */ | 1031 | /* Get the "official" set of cpus referring to our pagetable. */ |
1032 | if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { | 1032 | if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { |
1033 | for_each_online_cpu(cpu) { | 1033 | for_each_online_cpu(cpu) { |
1034 | if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) | 1034 | if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) |
1035 | && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) | 1035 | && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) |
1036 | continue; | 1036 | continue; |
1037 | smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); | 1037 | smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); |
1038 | } | 1038 | } |
1039 | return; | 1039 | return; |
1040 | } | 1040 | } |
1041 | cpumask_copy(mask, mm_cpumask(mm)); | 1041 | cpumask_copy(mask, mm_cpumask(mm)); |
1042 | 1042 | ||
1043 | /* It's possible that a vcpu may have a stale reference to our | 1043 | /* It's possible that a vcpu may have a stale reference to our |
1044 | cr3, because it's in lazy mode, and it hasn't yet flushed | 1044 | cr3, because it's in lazy mode, and it hasn't yet flushed |
1045 | its set of pending hypercalls. In this case, we can | 1045 | its set of pending hypercalls. In this case, we can |
1046 | look at its actual current cr3 value, and force it to flush | 1046 | look at its actual current cr3 value, and force it to flush |
1047 | if needed. */ | 1047 | if needed. */ |
1048 | for_each_online_cpu(cpu) { | 1048 | for_each_online_cpu(cpu) { |
1049 | if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) | 1049 | if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) |
1050 | cpumask_set_cpu(cpu, mask); | 1050 | cpumask_set_cpu(cpu, mask); |
1051 | } | 1051 | } |
1052 | 1052 | ||
1053 | if (!cpumask_empty(mask)) | 1053 | if (!cpumask_empty(mask)) |
1054 | smp_call_function_many(mask, drop_other_mm_ref, mm, 1); | 1054 | smp_call_function_many(mask, drop_other_mm_ref, mm, 1); |
1055 | free_cpumask_var(mask); | 1055 | free_cpumask_var(mask); |
1056 | } | 1056 | } |
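/*
 * Editor's illustrative sketch (user-space, values hypothetical):
 * xen_drop_mm_ref() builds its IPI target mask from two sources - the
 * mm's own cpumask plus any vcpu whose lazily cached cr3 still points at
 * this pagetable.  The same union in miniature:
 */
#include <stdio.h>

#define NCPUS 4

int main(void)
{
	unsigned long pgd_pa = 0x1000;	/* __pa(mm->pgd) */
	unsigned long percpu_cr3[NCPUS] = { 0x1000, 0x2000, 0x1000, 0x3000 };
	unsigned int mm_mask = 0x1;	/* cpus "officially" using the mm */
	unsigned int ipi_mask = mm_mask;

	for (int cpu = 0; cpu < NCPUS; cpu++)
		if (percpu_cr3[cpu] == pgd_pa)	/* stale lazy reference */
			ipi_mask |= 1u << cpu;

	printf("IPI mask: 0x%x\n", ipi_mask);	/* 0x5: cpu0 plus lazy cpu2 */
	return 0;
}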
1057 | #else | 1057 | #else |
1058 | static void xen_drop_mm_ref(struct mm_struct *mm) | 1058 | static void xen_drop_mm_ref(struct mm_struct *mm) |
1059 | { | 1059 | { |
1060 | if (current->active_mm == mm) | 1060 | if (current->active_mm == mm) |
1061 | load_cr3(swapper_pg_dir); | 1061 | load_cr3(swapper_pg_dir); |
1062 | } | 1062 | } |
1063 | #endif | 1063 | #endif |
1064 | 1064 | ||
1065 | /* | 1065 | /* |
1066 | * While a process runs, Xen pins its pagetables, which means that the | 1066 | * While a process runs, Xen pins its pagetables, which means that the |
1067 | * hypervisor forces them to be read-only, and it controls all updates | 1067 | * hypervisor forces them to be read-only, and it controls all updates |
1068 | * to them. This means that all pagetable updates have to go via the | 1068 | * to them. This means that all pagetable updates have to go via the |
1069 | * hypervisor, which is moderately expensive. | 1069 | * hypervisor, which is moderately expensive. |
1070 | * | 1070 | * |
1071 | * Since we're pulling the pagetable down, we switch to use init_mm, | 1071 | * Since we're pulling the pagetable down, we switch to use init_mm, |
1072 | * unpin old process pagetable and mark it all read-write, which | 1072 | * unpin old process pagetable and mark it all read-write, which |
1073 | * allows further operations on it to be simple memory accesses. | 1073 | * allows further operations on it to be simple memory accesses. |
1074 | * | 1074 | * |
1075 | * The only subtle point is that another CPU may be still using the | 1075 | * The only subtle point is that another CPU may be still using the |
1076 | * pagetable because of lazy tlb flushing. This means we need to | 1076 | * pagetable because of lazy tlb flushing. This means we need to |
1077 | * switch all CPUs off this pagetable before we can unpin it. | 1077 | * switch all CPUs off this pagetable before we can unpin it. |
1078 | */ | 1078 | */ |
1079 | static void xen_exit_mmap(struct mm_struct *mm) | 1079 | static void xen_exit_mmap(struct mm_struct *mm) |
1080 | { | 1080 | { |
1081 | get_cpu(); /* make sure we don't move around */ | 1081 | get_cpu(); /* make sure we don't move around */ |
1082 | xen_drop_mm_ref(mm); | 1082 | xen_drop_mm_ref(mm); |
1083 | put_cpu(); | 1083 | put_cpu(); |
1084 | 1084 | ||
1085 | spin_lock(&mm->page_table_lock); | 1085 | spin_lock(&mm->page_table_lock); |
1086 | 1086 | ||
1087 | /* pgd may not be pinned in the error exit path of execve */ | 1087 | /* pgd may not be pinned in the error exit path of execve */ |
1088 | if (xen_page_pinned(mm->pgd)) | 1088 | if (xen_page_pinned(mm->pgd)) |
1089 | xen_pgd_unpin(mm); | 1089 | xen_pgd_unpin(mm); |
1090 | 1090 | ||
1091 | spin_unlock(&mm->page_table_lock); | 1091 | spin_unlock(&mm->page_table_lock); |
1092 | } | 1092 | } |
1093 | 1093 | ||
1094 | static void xen_post_allocator_init(void); | 1094 | static void xen_post_allocator_init(void); |
1095 | 1095 | ||
1096 | #ifdef CONFIG_X86_64 | 1096 | #ifdef CONFIG_X86_64 |
1097 | static void __init xen_cleanhighmap(unsigned long vaddr, | 1097 | static void __init xen_cleanhighmap(unsigned long vaddr, |
1098 | unsigned long vaddr_end) | 1098 | unsigned long vaddr_end) |
1099 | { | 1099 | { |
1100 | unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; | 1100 | unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; |
1101 | pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); | 1101 | pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); |
1102 | 1102 | ||
1103 | /* NOTE: The loop is more greedy than the cleanup_highmap variant. | 1103 | /* NOTE: The loop is more greedy than the cleanup_highmap variant. |
1104 | * We include the PMD passed in on _both_ boundaries. */ | 1104 | * We include the PMD passed in on _both_ boundaries. */ |
1105 | for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE)); | 1105 | for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE)); |
1106 | pmd++, vaddr += PMD_SIZE) { | 1106 | pmd++, vaddr += PMD_SIZE) { |
1107 | if (pmd_none(*pmd)) | 1107 | if (pmd_none(*pmd)) |
1108 | continue; | 1108 | continue; |
1109 | if (vaddr < (unsigned long) _text || vaddr > kernel_end) | 1109 | if (vaddr < (unsigned long) _text || vaddr > kernel_end) |
1110 | set_pmd(pmd, __pmd(0)); | 1110 | set_pmd(pmd, __pmd(0)); |
1111 | } | 1111 | } |
1112 | /* In case we did something silly, we should crash in this function | 1112 | /* In case we did something silly, we should crash in this function |
1113 | * instead of somewhere later, which would be confusing. */ | 1113 | * instead of somewhere later, which would be confusing. */ |
1114 | xen_mc_flush(); | 1114 | xen_mc_flush(); |
1115 | } | 1115 | } |
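/*
 * Editor's illustrative sketch: xen_cleanhighmap() advances one PMD entry
 * per PMD_SIZE (2 MiB on x86-64) step and is deliberately inclusive on
 * both boundaries.  The index and rounding arithmetic, stand-alone:
 */
#include <stdio.h>

#define PMD_SHIFT	21
#define PMD_SIZE	(1UL << PMD_SHIFT)
#define PTRS_PER_PMD	512

static unsigned long pmd_index(unsigned long vaddr)
{
	return (vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
}

static unsigned long roundup_pmd(unsigned long x)
{
	return (x + PMD_SIZE - 1) & ~(PMD_SIZE - 1);
}

int main(void)
{
	unsigned long vaddr = 0xffffffff81234567UL;

	printf("pmd_index(vaddr) = %lu\n", pmd_index(vaddr));
	printf("roundup(vaddr)   = %#lx\n", roundup_pmd(vaddr));
	return 0;
}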
1116 | static void __init xen_pagetable_p2m_copy(void) | 1116 | |
1117 | static void __init xen_pagetable_p2m_free(void) | ||
1117 | { | 1118 | { |
1118 | unsigned long size; | 1119 | unsigned long size; |
1119 | unsigned long addr; | 1120 | unsigned long addr; |
1120 | unsigned long new_mfn_list; | ||
1121 | 1121 | ||
1122 | if (xen_feature(XENFEAT_auto_translated_physmap)) | ||
1123 | return; | ||
1124 | |||
1125 | size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); | 1122 | size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); |
1126 | 1123 | ||
1127 | new_mfn_list = xen_revector_p2m_tree(); | ||
1128 | /* No memory or already called. */ | 1124 | /* No memory or already called. */ |
1129 | if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list) | 1125 | if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list) |
1130 | return; | 1126 | return; |
1131 | 1127 | ||
1132 | /* using __ka address and sticking INVALID_P2M_ENTRY! */ | 1128 | /* using __ka address and sticking INVALID_P2M_ENTRY! */ |
1133 | memset((void *)xen_start_info->mfn_list, 0xff, size); | 1129 | memset((void *)xen_start_info->mfn_list, 0xff, size); |
1134 | 1130 | ||
1135 | /* We should be in __ka space. */ | 1131 | /* We should be in __ka space. */ |
1136 | BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); | 1132 | BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); |
1137 | addr = xen_start_info->mfn_list; | 1133 | addr = xen_start_info->mfn_list; |
1138 | /* We round up to the PMD, which means that if anybody at this stage is | 1134 | /* We round up to the PMD, which means that if anybody at this stage is |
1139 | * using the __ka address of xen_start_info or xen_start_info->shared_info | 1135 | * using the __ka address of xen_start_info or xen_start_info->shared_info |
1140 | * they are going to crash. Fortunately we have already revectored | 1136 | * they are going to crash. Fortunately we have already revectored |
1141 | * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ | 1137 | * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ |
1142 | size = roundup(size, PMD_SIZE); | 1138 | size = roundup(size, PMD_SIZE); |
1143 | xen_cleanhighmap(addr, addr + size); | 1139 | xen_cleanhighmap(addr, addr + size); |
1144 | 1140 | ||
1145 | size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); | 1141 | size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); |
1146 | memblock_free(__pa(xen_start_info->mfn_list), size); | 1142 | memblock_free(__pa(xen_start_info->mfn_list), size); |
1147 | /* And revector! Bye bye old array */ | ||
1148 | xen_start_info->mfn_list = new_mfn_list; | ||
1149 | 1143 | ||
1150 | /* At this stage, cleanup_highmap has already cleaned __ka space | 1144 | /* At this stage, cleanup_highmap has already cleaned __ka space |
1151 | * from _brk_limit way up to the max_pfn_mapped (which is the end of | 1145 | * from _brk_limit way up to the max_pfn_mapped (which is the end of |
1152 | * the ramdisk). We continue on, erasing PMD entries that point to page | 1146 | * the ramdisk). We continue on, erasing PMD entries that point to page |
1153 | * tables - do note that they are accessible at this stage via __va. | 1147 | * tables - do note that they are accessible at this stage via __va. |
1154 | * For good measure we also round up to the PMD - which means that if | 1148 | * For good measure we also round up to the PMD - which means that if |
1155 | * anybody is using a __ka address for the initial boot-stack - and tries | 1149 | * anybody is using a __ka address for the initial boot-stack - and tries |
1156 | * to use it - they are going to crash. The xen_start_info has been | 1150 | * to use it - they are going to crash. The xen_start_info has been |
1157 | * taken care of already in xen_setup_kernel_pagetable. */ | 1151 | * taken care of already in xen_setup_kernel_pagetable. */ |
1158 | addr = xen_start_info->pt_base; | 1152 | addr = xen_start_info->pt_base; |
1159 | size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); | 1153 | size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); |
1160 | 1154 | ||
1161 | xen_cleanhighmap(addr, addr + size); | 1155 | xen_cleanhighmap(addr, addr + size); |
1162 | xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); | 1156 | xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); |
1163 | #ifdef DEBUG | 1157 | #ifdef DEBUG |
1164 | /* This is superfluous and not necessary, but you know what, | 1158 | /* This is superfluous and not necessary, but you know what, |
1165 | * let's do it. The MODULES_VADDR -> MODULES_END should be clear of | 1159 | * let's do it. The MODULES_VADDR -> MODULES_END should be clear of |
1166 | * anything at this stage. */ | 1160 | * anything at this stage. */ |
1167 | xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); | 1161 | xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); |
1168 | #endif | 1162 | #endif |
1169 | } | 1163 | } |
1170 | #endif | 1164 | #endif |
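/*
 * Editor's illustrative sketch: the block freed above is the old linear
 * mfn_list - one unsigned long per guest page - page-aligned for
 * memblock_free() and PMD-rounded for the __ka unmap.  The arithmetic by
 * itself (guest size hypothetical):
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PMD_SIZE	(2UL << 20)
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long nr_pages = 262144; /* a 1 GiB guest */
	unsigned long size = ALIGN_UP(nr_pages * sizeof(unsigned long),
				      PAGE_SIZE);

	printf("mfn_list bytes freed : %lu\n", size); /* 2 MiB exactly */
	printf("__ka bytes unmapped  : %lu\n", ALIGN_UP(size, PMD_SIZE));
	return 0;
}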
1171 | 1165 | ||
1172 | static void __init xen_pagetable_init(void) | 1166 | static void __init xen_pagetable_p2m_setup(void) |
1173 | { | 1167 | { |
1174 | paging_init(); | 1168 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
1169 | return; | ||
1170 | |||
1171 | xen_vmalloc_p2m_tree(); | ||
1172 | |||
1175 | #ifdef CONFIG_X86_64 | 1173 | #ifdef CONFIG_X86_64 |
1176 | xen_pagetable_p2m_copy(); | 1174 | xen_pagetable_p2m_free(); |
1177 | #endif | 1175 | #endif |
1176 | /* And revector! Bye bye old array */ | ||
1177 | xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; | ||
1178 | } | ||
1179 | |||
1180 | static void __init xen_pagetable_init(void) | ||
1181 | { | ||
1182 | paging_init(); | ||
1183 | xen_post_allocator_init(); | ||
1184 | |||
1185 | xen_pagetable_p2m_setup(); | ||
1186 | |||
1178 | /* Allocate and initialize top and mid mfn levels for p2m structure */ | 1187 | /* Allocate and initialize top and mid mfn levels for p2m structure */ |
1179 | xen_build_mfn_list_list(); | 1188 | xen_build_mfn_list_list(); |
1180 | 1189 | ||
1190 | /* Remap memory freed due to conflicts with E820 map */ | ||
1191 | if (!xen_feature(XENFEAT_auto_translated_physmap)) | ||
1192 | xen_remap_memory(); | ||
1193 | |||
1181 | xen_setup_shared_info(); | 1194 | xen_setup_shared_info(); |
1182 | xen_post_allocator_init(); | ||
1183 | } | 1195 | } |
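/*
 * Editor's note (sketch): the ordering above changed in this hunk -
 * xen_post_allocator_init() now runs right after paging_init(), before
 * the p2m list is revectored and the E820-conflicting memory is
 * remapped.  The new sequence, spelled out:
 */
#include <stdio.h>

int main(void)
{
	static const char *const steps[] = {
		"paging_init",			/* generic pagetable setup       */
		"xen_post_allocator_init",	/* switch to post-init mmu ops   */
		"xen_pagetable_p2m_setup",	/* linear p2m; free old mfn_list */
		"xen_build_mfn_list_list",	/* top/mid mfn levels            */
		"xen_remap_memory",		/* skipped if auto-translated    */
		"xen_setup_shared_info",
	};

	for (unsigned int i = 0; i < sizeof(steps) / sizeof(steps[0]); i++)
		printf("%u. %s\n", i + 1, steps[i]);
	return 0;
}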
1184 | static void xen_write_cr2(unsigned long cr2) | 1196 | static void xen_write_cr2(unsigned long cr2) |
1185 | { | 1197 | { |
1186 | this_cpu_read(xen_vcpu)->arch.cr2 = cr2; | 1198 | this_cpu_read(xen_vcpu)->arch.cr2 = cr2; |
1187 | } | 1199 | } |
1188 | 1200 | ||
1189 | static unsigned long xen_read_cr2(void) | 1201 | static unsigned long xen_read_cr2(void) |
1190 | { | 1202 | { |
1191 | return this_cpu_read(xen_vcpu)->arch.cr2; | 1203 | return this_cpu_read(xen_vcpu)->arch.cr2; |
1192 | } | 1204 | } |
1193 | 1205 | ||
1194 | unsigned long xen_read_cr2_direct(void) | 1206 | unsigned long xen_read_cr2_direct(void) |
1195 | { | 1207 | { |
1196 | return this_cpu_read(xen_vcpu_info.arch.cr2); | 1208 | return this_cpu_read(xen_vcpu_info.arch.cr2); |
1197 | } | 1209 | } |
1198 | 1210 | ||
1199 | void xen_flush_tlb_all(void) | 1211 | void xen_flush_tlb_all(void) |
1200 | { | 1212 | { |
1201 | struct mmuext_op *op; | 1213 | struct mmuext_op *op; |
1202 | struct multicall_space mcs; | 1214 | struct multicall_space mcs; |
1203 | 1215 | ||
1204 | trace_xen_mmu_flush_tlb_all(0); | 1216 | trace_xen_mmu_flush_tlb_all(0); |
1205 | 1217 | ||
1206 | preempt_disable(); | 1218 | preempt_disable(); |
1207 | 1219 | ||
1208 | mcs = xen_mc_entry(sizeof(*op)); | 1220 | mcs = xen_mc_entry(sizeof(*op)); |
1209 | 1221 | ||
1210 | op = mcs.args; | 1222 | op = mcs.args; |
1211 | op->cmd = MMUEXT_TLB_FLUSH_ALL; | 1223 | op->cmd = MMUEXT_TLB_FLUSH_ALL; |
1212 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | 1224 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); |
1213 | 1225 | ||
1214 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 1226 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
1215 | 1227 | ||
1216 | preempt_enable(); | 1228 | preempt_enable(); |
1217 | } | 1229 | } |
1218 | static void xen_flush_tlb(void) | 1230 | static void xen_flush_tlb(void) |
1219 | { | 1231 | { |
1220 | struct mmuext_op *op; | 1232 | struct mmuext_op *op; |
1221 | struct multicall_space mcs; | 1233 | struct multicall_space mcs; |
1222 | 1234 | ||
1223 | trace_xen_mmu_flush_tlb(0); | 1235 | trace_xen_mmu_flush_tlb(0); |
1224 | 1236 | ||
1225 | preempt_disable(); | 1237 | preempt_disable(); |
1226 | 1238 | ||
1227 | mcs = xen_mc_entry(sizeof(*op)); | 1239 | mcs = xen_mc_entry(sizeof(*op)); |
1228 | 1240 | ||
1229 | op = mcs.args; | 1241 | op = mcs.args; |
1230 | op->cmd = MMUEXT_TLB_FLUSH_LOCAL; | 1242 | op->cmd = MMUEXT_TLB_FLUSH_LOCAL; |
1231 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | 1243 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); |
1232 | 1244 | ||
1233 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 1245 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
1234 | 1246 | ||
1235 | preempt_enable(); | 1247 | preempt_enable(); |
1236 | } | 1248 | } |
1237 | 1249 | ||
1238 | static void xen_flush_tlb_single(unsigned long addr) | 1250 | static void xen_flush_tlb_single(unsigned long addr) |
1239 | { | 1251 | { |
1240 | struct mmuext_op *op; | 1252 | struct mmuext_op *op; |
1241 | struct multicall_space mcs; | 1253 | struct multicall_space mcs; |
1242 | 1254 | ||
1243 | trace_xen_mmu_flush_tlb_single(addr); | 1255 | trace_xen_mmu_flush_tlb_single(addr); |
1244 | 1256 | ||
1245 | preempt_disable(); | 1257 | preempt_disable(); |
1246 | 1258 | ||
1247 | mcs = xen_mc_entry(sizeof(*op)); | 1259 | mcs = xen_mc_entry(sizeof(*op)); |
1248 | op = mcs.args; | 1260 | op = mcs.args; |
1249 | op->cmd = MMUEXT_INVLPG_LOCAL; | 1261 | op->cmd = MMUEXT_INVLPG_LOCAL; |
1250 | op->arg1.linear_addr = addr & PAGE_MASK; | 1262 | op->arg1.linear_addr = addr & PAGE_MASK; |
1251 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | 1263 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); |
1252 | 1264 | ||
1253 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 1265 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
1254 | 1266 | ||
1255 | preempt_enable(); | 1267 | preempt_enable(); |
1256 | } | 1268 | } |
1257 | 1269 | ||
1258 | static void xen_flush_tlb_others(const struct cpumask *cpus, | 1270 | static void xen_flush_tlb_others(const struct cpumask *cpus, |
1259 | struct mm_struct *mm, unsigned long start, | 1271 | struct mm_struct *mm, unsigned long start, |
1260 | unsigned long end) | 1272 | unsigned long end) |
1261 | { | 1273 | { |
1262 | struct { | 1274 | struct { |
1263 | struct mmuext_op op; | 1275 | struct mmuext_op op; |
1264 | #ifdef CONFIG_SMP | 1276 | #ifdef CONFIG_SMP |
1265 | DECLARE_BITMAP(mask, num_processors); | 1277 | DECLARE_BITMAP(mask, num_processors); |
1266 | #else | 1278 | #else |
1267 | DECLARE_BITMAP(mask, NR_CPUS); | 1279 | DECLARE_BITMAP(mask, NR_CPUS); |
1268 | #endif | 1280 | #endif |
1269 | } *args; | 1281 | } *args; |
1270 | struct multicall_space mcs; | 1282 | struct multicall_space mcs; |
1271 | 1283 | ||
1272 | trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); | 1284 | trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); |
1273 | 1285 | ||
1274 | if (cpumask_empty(cpus)) | 1286 | if (cpumask_empty(cpus)) |
1275 | return; /* nothing to do */ | 1287 | return; /* nothing to do */ |
1276 | 1288 | ||
1277 | mcs = xen_mc_entry(sizeof(*args)); | 1289 | mcs = xen_mc_entry(sizeof(*args)); |
1278 | args = mcs.args; | 1290 | args = mcs.args; |
1279 | args->op.arg2.vcpumask = to_cpumask(args->mask); | 1291 | args->op.arg2.vcpumask = to_cpumask(args->mask); |
1280 | 1292 | ||
1281 | /* Remove us, and any offline CPUs. */ | 1293 | /* Remove us, and any offline CPUs. */ |
1282 | cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); | 1294 | cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); |
1283 | cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); | 1295 | cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); |
1284 | 1296 | ||
1285 | args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; | 1297 | args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; |
1286 | if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { | 1298 | if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { |
1287 | args->op.cmd = MMUEXT_INVLPG_MULTI; | 1299 | args->op.cmd = MMUEXT_INVLPG_MULTI; |
1288 | args->op.arg1.linear_addr = start; | 1300 | args->op.arg1.linear_addr = start; |
1289 | } | 1301 | } |
1290 | 1302 | ||
1291 | MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); | 1303 | MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); |
1292 | 1304 | ||
1293 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 1305 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
1294 | } | 1306 | } |
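/*
 * Editor's illustrative sketch: the remote flush above degrades
 * gracefully - a range of at most one page becomes MMUEXT_INVLPG_MULTI,
 * anything larger (or TLB_FLUSH_ALL) a full flush.  The selection logic
 * in isolation:
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define TLB_FLUSH_ALL	(~0UL)

static const char *pick_flush_op(unsigned long start, unsigned long end)
{
	if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE)
		return "MMUEXT_INVLPG_MULTI";	/* single page: targeted */
	return "MMUEXT_TLB_FLUSH_MULTI";	/* otherwise: flush all  */
}

int main(void)
{
	printf("%s\n", pick_flush_op(0x1000, 0x2000));	 /* invlpg     */
	printf("%s\n", pick_flush_op(0, TLB_FLUSH_ALL)); /* full flush */
	return 0;
}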
1295 | 1307 | ||
1296 | static unsigned long xen_read_cr3(void) | 1308 | static unsigned long xen_read_cr3(void) |
1297 | { | 1309 | { |
1298 | return this_cpu_read(xen_cr3); | 1310 | return this_cpu_read(xen_cr3); |
1299 | } | 1311 | } |
1300 | 1312 | ||
1301 | static void set_current_cr3(void *v) | 1313 | static void set_current_cr3(void *v) |
1302 | { | 1314 | { |
1303 | this_cpu_write(xen_current_cr3, (unsigned long)v); | 1315 | this_cpu_write(xen_current_cr3, (unsigned long)v); |
1304 | } | 1316 | } |
1305 | 1317 | ||
1306 | static void __xen_write_cr3(bool kernel, unsigned long cr3) | 1318 | static void __xen_write_cr3(bool kernel, unsigned long cr3) |
1307 | { | 1319 | { |
1308 | struct mmuext_op op; | 1320 | struct mmuext_op op; |
1309 | unsigned long mfn; | 1321 | unsigned long mfn; |
1310 | 1322 | ||
1311 | trace_xen_mmu_write_cr3(kernel, cr3); | 1323 | trace_xen_mmu_write_cr3(kernel, cr3); |
1312 | 1324 | ||
1313 | if (cr3) | 1325 | if (cr3) |
1314 | mfn = pfn_to_mfn(PFN_DOWN(cr3)); | 1326 | mfn = pfn_to_mfn(PFN_DOWN(cr3)); |
1315 | else | 1327 | else |
1316 | mfn = 0; | 1328 | mfn = 0; |
1317 | 1329 | ||
1318 | WARN_ON(mfn == 0 && kernel); | 1330 | WARN_ON(mfn == 0 && kernel); |
1319 | 1331 | ||
1320 | op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; | 1332 | op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; |
1321 | op.arg1.mfn = mfn; | 1333 | op.arg1.mfn = mfn; |
1322 | 1334 | ||
1323 | xen_extend_mmuext_op(&op); | 1335 | xen_extend_mmuext_op(&op); |
1324 | 1336 | ||
1325 | if (kernel) { | 1337 | if (kernel) { |
1326 | this_cpu_write(xen_cr3, cr3); | 1338 | this_cpu_write(xen_cr3, cr3); |
1327 | 1339 | ||
1328 | /* Update xen_current_cr3 once the batch has actually | 1340 | /* Update xen_current_cr3 once the batch has actually |
1329 | been submitted. */ | 1341 | been submitted. */ |
1330 | xen_mc_callback(set_current_cr3, (void *)cr3); | 1342 | xen_mc_callback(set_current_cr3, (void *)cr3); |
1331 | } | 1343 | } |
1332 | } | 1344 | } |
1333 | static void xen_write_cr3(unsigned long cr3) | 1345 | static void xen_write_cr3(unsigned long cr3) |
1334 | { | 1346 | { |
1335 | BUG_ON(preemptible()); | 1347 | BUG_ON(preemptible()); |
1336 | 1348 | ||
1337 | xen_mc_batch(); /* disables interrupts */ | 1349 | xen_mc_batch(); /* disables interrupts */ |
1338 | 1350 | ||
1339 | /* Update while interrupts are disabled, so it's atomic with | 1351 | /* Update while interrupts are disabled, so it's atomic with |
1340 | respect to ipis */ | 1352 | respect to ipis */ |
1341 | this_cpu_write(xen_cr3, cr3); | 1353 | this_cpu_write(xen_cr3, cr3); |
1342 | 1354 | ||
1343 | __xen_write_cr3(true, cr3); | 1355 | __xen_write_cr3(true, cr3); |
1344 | 1356 | ||
1345 | #ifdef CONFIG_X86_64 | 1357 | #ifdef CONFIG_X86_64 |
1346 | { | 1358 | { |
1347 | pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); | 1359 | pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); |
1348 | if (user_pgd) | 1360 | if (user_pgd) |
1349 | __xen_write_cr3(false, __pa(user_pgd)); | 1361 | __xen_write_cr3(false, __pa(user_pgd)); |
1350 | else | 1362 | else |
1351 | __xen_write_cr3(false, 0); | 1363 | __xen_write_cr3(false, 0); |
1352 | } | 1364 | } |
1353 | #endif | 1365 | #endif |
1354 | 1366 | ||
1355 | xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ | 1367 | xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ |
1356 | } | 1368 | } |
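/*
 * Editor's illustrative sketch (toy p2m values): __xen_write_cr3() hands
 * Xen a machine frame number, so the frame taken from cr3 is translated
 * through the p2m before MMUEXT_NEW_BASEPTR is queued:
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

int main(void)
{
	unsigned long p2m[4] = { 7, 3, 9, 1 };	/* pfn -> mfn */
	unsigned long cr3 = 0x2000;		/* phys addr of the pgd */
	unsigned long pfn = PFN_DOWN(cr3);	/* -> pfn 2 */

	printf("mfn handed to Xen: %lu\n", p2m[pfn]);	/* -> 9 */
	return 0;
}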
1357 | 1369 | ||
1358 | #ifdef CONFIG_X86_64 | 1370 | #ifdef CONFIG_X86_64 |
1359 | /* | 1371 | /* |
1360 | * At the start of the day - when Xen launches a guest, it has already | 1372 | * At the start of the day - when Xen launches a guest, it has already |
1361 | * built pagetables for the guest. We diligently look over them | 1373 | * built pagetables for the guest. We diligently look over them |
1362 | * in xen_setup_kernel_pagetable and graft them as appropriate into the | 1374 | * in xen_setup_kernel_pagetable and graft them as appropriate into the |
1363 | * init_level4_pgt and its friends. Then when we are happy we load | 1375 | * init_level4_pgt and its friends. Then when we are happy we load |
1364 | * the new init_level4_pgt - and continue on. | 1376 | * the new init_level4_pgt - and continue on. |
1365 | * | 1377 | * |
1366 | * The generic code starts (start_kernel) and 'init_mem_mapping' sets | 1378 | * The generic code starts (start_kernel) and 'init_mem_mapping' sets |
1367 | * up the rest of the pagetables. When it has completed it loads the cr3. | 1379 | * up the rest of the pagetables. When it has completed it loads the cr3. |
1368 | * N.B. that baremetal would start at 'start_kernel' (and the early | 1380 | * N.B. that baremetal would start at 'start_kernel' (and the early |
1369 | * #PF handler would create bootstrap pagetables) - so we are running | 1381 | * #PF handler would create bootstrap pagetables) - so we are running |
1370 | * with the same assumptions as what to do when write_cr3 is executed | 1382 | * with the same assumptions as what to do when write_cr3 is executed |
1371 | * at this point. | 1383 | * at this point. |
1372 | * | 1384 | * |
1373 | * Since there are no user-page tables at all, we have two variants | 1385 | * Since there are no user-page tables at all, we have two variants |
1374 | * of xen_write_cr3 - the early bootup (this one), and the late one | 1386 | * of xen_write_cr3 - the early bootup (this one), and the late one |
1375 | * (xen_write_cr3). The reason we have to do that is that in 64-bit | 1387 | * (xen_write_cr3). The reason we have to do that is that in 64-bit |
1376 | * the Linux kernel and user-space are both in ring 3 while the | 1388 | * the Linux kernel and user-space are both in ring 3 while the |
1377 | * hypervisor is in ring 0. | 1389 | * hypervisor is in ring 0. |
1378 | */ | 1390 | */ |
1379 | static void __init xen_write_cr3_init(unsigned long cr3) | 1391 | static void __init xen_write_cr3_init(unsigned long cr3) |
1380 | { | 1392 | { |
1381 | BUG_ON(preemptible()); | 1393 | BUG_ON(preemptible()); |
1382 | 1394 | ||
1383 | xen_mc_batch(); /* disables interrupts */ | 1395 | xen_mc_batch(); /* disables interrupts */ |
1384 | 1396 | ||
1385 | /* Update while interrupts are disabled, so it's atomic with | 1397 | /* Update while interrupts are disabled, so it's atomic with |
1386 | respect to ipis */ | 1398 | respect to ipis */ |
1387 | this_cpu_write(xen_cr3, cr3); | 1399 | this_cpu_write(xen_cr3, cr3); |
1388 | 1400 | ||
1389 | __xen_write_cr3(true, cr3); | 1401 | __xen_write_cr3(true, cr3); |
1390 | 1402 | ||
1391 | xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ | 1403 | xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ |
1392 | } | 1404 | } |
1393 | #endif | 1405 | #endif |
1394 | 1406 | ||
1395 | static int xen_pgd_alloc(struct mm_struct *mm) | 1407 | static int xen_pgd_alloc(struct mm_struct *mm) |
1396 | { | 1408 | { |
1397 | pgd_t *pgd = mm->pgd; | 1409 | pgd_t *pgd = mm->pgd; |
1398 | int ret = 0; | 1410 | int ret = 0; |
1399 | 1411 | ||
1400 | BUG_ON(PagePinned(virt_to_page(pgd))); | 1412 | BUG_ON(PagePinned(virt_to_page(pgd))); |
1401 | 1413 | ||
1402 | #ifdef CONFIG_X86_64 | 1414 | #ifdef CONFIG_X86_64 |
1403 | { | 1415 | { |
1404 | struct page *page = virt_to_page(pgd); | 1416 | struct page *page = virt_to_page(pgd); |
1405 | pgd_t *user_pgd; | 1417 | pgd_t *user_pgd; |
1406 | 1418 | ||
1407 | BUG_ON(page->private != 0); | 1419 | BUG_ON(page->private != 0); |
1408 | 1420 | ||
1409 | ret = -ENOMEM; | 1421 | ret = -ENOMEM; |
1410 | 1422 | ||
1411 | user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); | 1423 | user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); |
1412 | page->private = (unsigned long)user_pgd; | 1424 | page->private = (unsigned long)user_pgd; |
1413 | 1425 | ||
1414 | if (user_pgd != NULL) { | 1426 | if (user_pgd != NULL) { |
1415 | #ifdef CONFIG_X86_VSYSCALL_EMULATION | 1427 | #ifdef CONFIG_X86_VSYSCALL_EMULATION |
1416 | user_pgd[pgd_index(VSYSCALL_ADDR)] = | 1428 | user_pgd[pgd_index(VSYSCALL_ADDR)] = |
1417 | __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); | 1429 | __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); |
1418 | #endif | 1430 | #endif |
1419 | ret = 0; | 1431 | ret = 0; |
1420 | } | 1432 | } |
1421 | 1433 | ||
1422 | BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); | 1434 | BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); |
1423 | } | 1435 | } |
1424 | #endif | 1436 | #endif |
1425 | 1437 | ||
1426 | return ret; | 1438 | return ret; |
1427 | } | 1439 | } |
1428 | 1440 | ||
1429 | static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) | 1441 | static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) |
1430 | { | 1442 | { |
1431 | #ifdef CONFIG_X86_64 | 1443 | #ifdef CONFIG_X86_64 |
1432 | pgd_t *user_pgd = xen_get_user_pgd(pgd); | 1444 | pgd_t *user_pgd = xen_get_user_pgd(pgd); |
1433 | 1445 | ||
1434 | if (user_pgd) | 1446 | if (user_pgd) |
1435 | free_page((unsigned long)user_pgd); | 1447 | free_page((unsigned long)user_pgd); |
1436 | #endif | 1448 | #endif |
1437 | } | 1449 | } |
1438 | 1450 | ||
1439 | #ifdef CONFIG_X86_32 | 1451 | #ifdef CONFIG_X86_32 |
1440 | static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) | 1452 | static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) |
1441 | { | 1453 | { |
1442 | /* If there's an existing pte, then don't allow _PAGE_RW to be set */ | 1454 | /* If there's an existing pte, then don't allow _PAGE_RW to be set */ |
1443 | if (pte_val_ma(*ptep) & _PAGE_PRESENT) | 1455 | if (pte_val_ma(*ptep) & _PAGE_PRESENT) |
1444 | pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & | 1456 | pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & |
1445 | pte_val_ma(pte)); | 1457 | pte_val_ma(pte)); |
1446 | 1458 | ||
1447 | return pte; | 1459 | return pte; |
1448 | } | 1460 | } |
1449 | #else /* CONFIG_X86_64 */ | 1461 | #else /* CONFIG_X86_64 */ |
1450 | static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) | 1462 | static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) |
1451 | { | 1463 | { |
1452 | return pte; | 1464 | return pte; |
1453 | } | 1465 | } |
1454 | #endif /* CONFIG_X86_64 */ | 1466 | #endif /* CONFIG_X86_64 */ |
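/*
 * Editor's illustrative sketch: the 32-bit mask_rw_pte() expression above
 * keeps every bit of the new pte except _PAGE_RW, which survives only if
 * the existing pte already had it set.  A stand-alone check:
 */
#include <stdio.h>

#define _PAGE_RW (1UL << 1)

static unsigned long mask_rw(unsigned long old_pte, unsigned long new_pte)
{
	/* same shape as ((old & _PAGE_RW) | ~_PAGE_RW) & new */
	return ((old_pte & _PAGE_RW) | ~_PAGE_RW) & new_pte;
}

int main(void)
{
	printf("%#lx\n", mask_rw(0x1, 0x3)); /* old RO, new RW -> 0x1 (RW stripped) */
	printf("%#lx\n", mask_rw(0x3, 0x3)); /* old RW, new RW -> 0x3 (RW kept)     */
	return 0;
}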
1455 | 1467 | ||
1456 | /* | 1468 | /* |
1457 | * Init-time set_pte while constructing initial pagetables, which | 1469 | * Init-time set_pte while constructing initial pagetables, which |
1458 | * doesn't allow RO page table pages to be remapped RW. | 1470 | * doesn't allow RO page table pages to be remapped RW. |
1459 | * | 1471 | * |
1460 | * If there is no MFN for this PFN then this page is initially | 1472 | * If there is no MFN for this PFN then this page is initially |
1461 | * ballooned out so clear the PTE (as in decrease_reservation() in | 1473 | * ballooned out so clear the PTE (as in decrease_reservation() in |
1462 | * drivers/xen/balloon.c). | 1474 | * drivers/xen/balloon.c). |
1463 | * | 1475 | * |
1464 | * Many of these PTE updates are done on unpinned and writable pages | 1476 | * Many of these PTE updates are done on unpinned and writable pages |
1465 | * and doing a hypercall for these is unnecessary and expensive. At | 1477 | * and doing a hypercall for these is unnecessary and expensive. At |
1466 | * this point it is not possible to tell if a page is pinned or not, | 1478 | * this point it is not possible to tell if a page is pinned or not, |
1467 | * so always write the PTE directly and rely on Xen trapping and | 1479 | * so always write the PTE directly and rely on Xen trapping and |
1468 | * emulating any updates as necessary. | 1480 | * emulating any updates as necessary. |
1469 | */ | 1481 | */ |
1470 | static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) | 1482 | static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) |
1471 | { | 1483 | { |
1472 | if (pte_mfn(pte) != INVALID_P2M_ENTRY) | 1484 | if (pte_mfn(pte) != INVALID_P2M_ENTRY) |
1473 | pte = mask_rw_pte(ptep, pte); | 1485 | pte = mask_rw_pte(ptep, pte); |
1474 | else | 1486 | else |
1475 | pte = __pte_ma(0); | 1487 | pte = __pte_ma(0); |
1476 | 1488 | ||
1477 | native_set_pte(ptep, pte); | 1489 | native_set_pte(ptep, pte); |
1478 | } | 1490 | } |
1479 | 1491 | ||
1480 | static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) | 1492 | static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) |
1481 | { | 1493 | { |
1482 | struct mmuext_op op; | 1494 | struct mmuext_op op; |
1483 | op.cmd = cmd; | 1495 | op.cmd = cmd; |
1484 | op.arg1.mfn = pfn_to_mfn(pfn); | 1496 | op.arg1.mfn = pfn_to_mfn(pfn); |
1485 | if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) | 1497 | if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) |
1486 | BUG(); | 1498 | BUG(); |
1487 | } | 1499 | } |
1488 | 1500 | ||
1489 | /* Early in boot, while setting up the initial pagetable, assume | 1501 | /* Early in boot, while setting up the initial pagetable, assume |
1490 | everything is pinned. */ | 1502 | everything is pinned. */ |
1491 | static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) | 1503 | static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) |
1492 | { | 1504 | { |
1493 | #ifdef CONFIG_FLATMEM | 1505 | #ifdef CONFIG_FLATMEM |
1494 | BUG_ON(mem_map); /* should only be used early */ | 1506 | BUG_ON(mem_map); /* should only be used early */ |
1495 | #endif | 1507 | #endif |
1496 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); | 1508 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); |
1497 | pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); | 1509 | pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); |
1498 | } | 1510 | } |
1499 | 1511 | ||
1500 | /* Used for pmd and pud */ | 1512 | /* Used for pmd and pud */ |
1501 | static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) | 1513 | static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) |
1502 | { | 1514 | { |
1503 | #ifdef CONFIG_FLATMEM | 1515 | #ifdef CONFIG_FLATMEM |
1504 | BUG_ON(mem_map); /* should only be used early */ | 1516 | BUG_ON(mem_map); /* should only be used early */ |
1505 | #endif | 1517 | #endif |
1506 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); | 1518 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); |
1507 | } | 1519 | } |
1508 | 1520 | ||
1509 | /* Early release_pte assumes that all pts are pinned, since there's | 1521 | /* Early release_pte assumes that all pts are pinned, since there's |
1510 | only init_mm and anything attached to that is pinned. */ | 1522 | only init_mm and anything attached to that is pinned. */ |
1511 | static void __init xen_release_pte_init(unsigned long pfn) | 1523 | static void __init xen_release_pte_init(unsigned long pfn) |
1512 | { | 1524 | { |
1513 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); | 1525 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); |
1514 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); | 1526 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); |
1515 | } | 1527 | } |
1516 | 1528 | ||
1517 | static void __init xen_release_pmd_init(unsigned long pfn) | 1529 | static void __init xen_release_pmd_init(unsigned long pfn) |
1518 | { | 1530 | { |
1519 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); | 1531 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); |
1520 | } | 1532 | } |
1521 | 1533 | ||
1522 | static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn) | 1534 | static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn) |
1523 | { | 1535 | { |
1524 | struct multicall_space mcs; | 1536 | struct multicall_space mcs; |
1525 | struct mmuext_op *op; | 1537 | struct mmuext_op *op; |
1526 | 1538 | ||
1527 | mcs = __xen_mc_entry(sizeof(*op)); | 1539 | mcs = __xen_mc_entry(sizeof(*op)); |
1528 | op = mcs.args; | 1540 | op = mcs.args; |
1529 | op->cmd = cmd; | 1541 | op->cmd = cmd; |
1530 | op->arg1.mfn = pfn_to_mfn(pfn); | 1542 | op->arg1.mfn = pfn_to_mfn(pfn); |
1531 | 1543 | ||
1532 | MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); | 1544 | MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); |
1533 | } | 1545 | } |
1534 | 1546 | ||
1535 | static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) | 1547 | static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) |
1536 | { | 1548 | { |
1537 | struct multicall_space mcs; | 1549 | struct multicall_space mcs; |
1538 | unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT); | 1550 | unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT); |
1539 | 1551 | ||
1540 | mcs = __xen_mc_entry(0); | 1552 | mcs = __xen_mc_entry(0); |
1541 | MULTI_update_va_mapping(mcs.mc, (unsigned long)addr, | 1553 | MULTI_update_va_mapping(mcs.mc, (unsigned long)addr, |
1542 | pfn_pte(pfn, prot), 0); | 1554 | pfn_pte(pfn, prot), 0); |
1543 | } | 1555 | } |
1544 | 1556 | ||
1545 | /* This needs to make sure the new pte page is pinned iff it's being | 1557 | /* This needs to make sure the new pte page is pinned iff it's being |
1546 | attached to a pinned pagetable. */ | 1558 | attached to a pinned pagetable. */ |
1547 | static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, | 1559 | static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, |
1548 | unsigned level) | 1560 | unsigned level) |
1549 | { | 1561 | { |
1550 | bool pinned = PagePinned(virt_to_page(mm->pgd)); | 1562 | bool pinned = PagePinned(virt_to_page(mm->pgd)); |
1551 | 1563 | ||
1552 | trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); | 1564 | trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); |
1553 | 1565 | ||
1554 | if (pinned) { | 1566 | if (pinned) { |
1555 | struct page *page = pfn_to_page(pfn); | 1567 | struct page *page = pfn_to_page(pfn); |
1556 | 1568 | ||
1557 | SetPagePinned(page); | 1569 | SetPagePinned(page); |
1558 | 1570 | ||
1559 | if (!PageHighMem(page)) { | 1571 | if (!PageHighMem(page)) { |
1560 | xen_mc_batch(); | 1572 | xen_mc_batch(); |
1561 | 1573 | ||
1562 | __set_pfn_prot(pfn, PAGE_KERNEL_RO); | 1574 | __set_pfn_prot(pfn, PAGE_KERNEL_RO); |
1563 | 1575 | ||
1564 | if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) | 1576 | if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) |
1565 | __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); | 1577 | __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); |
1566 | 1578 | ||
1567 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 1579 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
1568 | } else { | 1580 | } else { |
1569 | /* make sure there are no stray mappings of | 1581 | /* make sure there are no stray mappings of |
1570 | this page */ | 1582 | this page */ |
1571 | kmap_flush_unused(); | 1583 | kmap_flush_unused(); |
1572 | } | 1584 | } |
1573 | } | 1585 | } |
1574 | } | 1586 | } |
1575 | 1587 | ||
1576 | static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) | 1588 | static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) |
1577 | { | 1589 | { |
1578 | xen_alloc_ptpage(mm, pfn, PT_PTE); | 1590 | xen_alloc_ptpage(mm, pfn, PT_PTE); |
1579 | } | 1591 | } |
1580 | 1592 | ||
1581 | static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) | 1593 | static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) |
1582 | { | 1594 | { |
1583 | xen_alloc_ptpage(mm, pfn, PT_PMD); | 1595 | xen_alloc_ptpage(mm, pfn, PT_PMD); |
1584 | } | 1596 | } |
1585 | 1597 | ||
1586 | /* This should never happen until we're OK to use struct page */ | 1598 | /* This should never happen until we're OK to use struct page */ |
1587 | static inline void xen_release_ptpage(unsigned long pfn, unsigned level) | 1599 | static inline void xen_release_ptpage(unsigned long pfn, unsigned level) |
1588 | { | 1600 | { |
1589 | struct page *page = pfn_to_page(pfn); | 1601 | struct page *page = pfn_to_page(pfn); |
1590 | bool pinned = PagePinned(page); | 1602 | bool pinned = PagePinned(page); |
1591 | 1603 | ||
1592 | trace_xen_mmu_release_ptpage(pfn, level, pinned); | 1604 | trace_xen_mmu_release_ptpage(pfn, level, pinned); |
1593 | 1605 | ||
1594 | if (pinned) { | 1606 | if (pinned) { |
1595 | if (!PageHighMem(page)) { | 1607 | if (!PageHighMem(page)) { |
1596 | xen_mc_batch(); | 1608 | xen_mc_batch(); |
1597 | 1609 | ||
1598 | if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) | 1610 | if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) |
1599 | __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); | 1611 | __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); |
1600 | 1612 | ||
1601 | __set_pfn_prot(pfn, PAGE_KERNEL); | 1613 | __set_pfn_prot(pfn, PAGE_KERNEL); |
1602 | 1614 | ||
1603 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 1615 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
1604 | } | 1616 | } |
1605 | ClearPagePinned(page); | 1617 | ClearPagePinned(page); |
1606 | } | 1618 | } |
1607 | } | 1619 | } |
1608 | 1620 | ||
1609 | static void xen_release_pte(unsigned long pfn) | 1621 | static void xen_release_pte(unsigned long pfn) |
1610 | { | 1622 | { |
1611 | xen_release_ptpage(pfn, PT_PTE); | 1623 | xen_release_ptpage(pfn, PT_PTE); |
1612 | } | 1624 | } |
1613 | 1625 | ||
1614 | static void xen_release_pmd(unsigned long pfn) | 1626 | static void xen_release_pmd(unsigned long pfn) |
1615 | { | 1627 | { |
1616 | xen_release_ptpage(pfn, PT_PMD); | 1628 | xen_release_ptpage(pfn, PT_PMD); |
1617 | } | 1629 | } |
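/*
 * Editor's note (sketch; puts() stands in for the real hypercalls): the
 * ordering above is strict and symmetric - allocation makes the page RO
 * before pinning it, release unpins before making it RW again, so Xen
 * never sees a writable pinned pagetable page:
 */
#include <stdio.h>

static void alloc_ptpage(void)
{
	puts("set RO");	/* cf. __set_pfn_prot(pfn, PAGE_KERNEL_RO)           */
	puts("pin");	/* cf. __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn) */
}

static void release_ptpage(void)
{
	puts("unpin");	/* cf. __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn)  */
	puts("set RW");	/* cf. __set_pfn_prot(pfn, PAGE_KERNEL)              */
}

int main(void)
{
	alloc_ptpage();
	release_ptpage();
	return 0;
}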
1618 | 1630 | ||
1619 | #if PAGETABLE_LEVELS == 4 | 1631 | #if PAGETABLE_LEVELS == 4 |
1620 | static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) | 1632 | static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) |
1621 | { | 1633 | { |
1622 | xen_alloc_ptpage(mm, pfn, PT_PUD); | 1634 | xen_alloc_ptpage(mm, pfn, PT_PUD); |
1623 | } | 1635 | } |
1624 | 1636 | ||
1625 | static void xen_release_pud(unsigned long pfn) | 1637 | static void xen_release_pud(unsigned long pfn) |
1626 | { | 1638 | { |
1627 | xen_release_ptpage(pfn, PT_PUD); | 1639 | xen_release_ptpage(pfn, PT_PUD); |
1628 | } | 1640 | } |
1629 | #endif | 1641 | #endif |
1630 | 1642 | ||
1631 | void __init xen_reserve_top(void) | 1643 | void __init xen_reserve_top(void) |
1632 | { | 1644 | { |
1633 | #ifdef CONFIG_X86_32 | 1645 | #ifdef CONFIG_X86_32 |
1634 | unsigned long top = HYPERVISOR_VIRT_START; | 1646 | unsigned long top = HYPERVISOR_VIRT_START; |
1635 | struct xen_platform_parameters pp; | 1647 | struct xen_platform_parameters pp; |
1636 | 1648 | ||
1637 | if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) | 1649 | if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) |
1638 | top = pp.virt_start; | 1650 | top = pp.virt_start; |
1639 | 1651 | ||
1640 | reserve_top_address(-top); | 1652 | reserve_top_address(-top); |
1641 | #endif /* CONFIG_X86_32 */ | 1653 | #endif /* CONFIG_X86_32 */ |
1642 | } | 1654 | } |
1643 | 1655 | ||
1644 | /* | 1656 | /* |
1645 | * Like __va(), but returns address in the kernel mapping (which is | 1657 | * Like __va(), but returns address in the kernel mapping (which is |
1646 | * all we have until the physical memory mapping has been set up). | 1658 | * all we have until the physical memory mapping has been set up). |
1647 | */ | 1659 | */ |
1648 | static void *__ka(phys_addr_t paddr) | 1660 | static void *__ka(phys_addr_t paddr) |
1649 | { | 1661 | { |
1650 | #ifdef CONFIG_X86_64 | 1662 | #ifdef CONFIG_X86_64 |
1651 | return (void *)(paddr + __START_KERNEL_map); | 1663 | return (void *)(paddr + __START_KERNEL_map); |
1652 | #else | 1664 | #else |
1653 | return __va(paddr); | 1665 | return __va(paddr); |
1654 | #endif | 1666 | #endif |
1655 | } | 1667 | } |
1656 | 1668 | ||
1657 | /* Convert a machine address to physical address */ | 1669 | /* Convert a machine address to physical address */ |
1658 | static unsigned long m2p(phys_addr_t maddr) | 1670 | static unsigned long m2p(phys_addr_t maddr) |
1659 | { | 1671 | { |
1660 | phys_addr_t paddr; | 1672 | phys_addr_t paddr; |
1661 | 1673 | ||
1662 | maddr &= PTE_PFN_MASK; | 1674 | maddr &= PTE_PFN_MASK; |
1663 | paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; | 1675 | paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; |
1664 | 1676 | ||
1665 | return paddr; | 1677 | return paddr; |
1666 | } | 1678 | } |
1667 | 1679 | ||
1668 | /* Convert a machine address to kernel virtual */ | 1680 | /* Convert a machine address to kernel virtual */ |
1669 | static void *m2v(phys_addr_t maddr) | 1681 | static void *m2v(phys_addr_t maddr) |
1670 | { | 1682 | { |
1671 | return __ka(m2p(maddr)); | 1683 | return __ka(m2p(maddr)); |
1672 | } | 1684 | } |
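/*
 * Editor's illustrative sketch (toy table): m2p() above strips the pte
 * flag bits, translates the machine frame through the m2p table, and
 * rebuilds a physical address.  Here the mask assumes the flags live in
 * the low 12 bits only:
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PTE_PFN_MASK	(~0xfffUL)

int main(void)
{
	unsigned long m2p_table[8] = { 0, 5, 2, 7, 1, 3, 6, 4 }; /* mfn -> pfn */
	unsigned long maddr = (3UL << PAGE_SHIFT) | 0x63;	 /* mfn 3 + flags */

	unsigned long mfn = (maddr & PTE_PFN_MASK) >> PAGE_SHIFT;
	unsigned long paddr = m2p_table[mfn] << PAGE_SHIFT;

	printf("paddr = %#lx\n", paddr); /* mfn 3 -> pfn 7 -> 0x7000 */
	return 0;
}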
1673 | 1685 | ||
1674 | /* Set the page permissions on identity-mapped pages */ | 1686 | /* Set the page permissions on identity-mapped pages */ |
1675 | static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags) | 1687 | static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags) |
1676 | { | 1688 | { |
1677 | unsigned long pfn = __pa(addr) >> PAGE_SHIFT; | 1689 | unsigned long pfn = __pa(addr) >> PAGE_SHIFT; |
1678 | pte_t pte = pfn_pte(pfn, prot); | 1690 | pte_t pte = pfn_pte(pfn, prot); |
1679 | 1691 | ||
1680 | /* For PVH no need to set R/O or R/W to pin them or unpin them. */ | 1692 | /* For PVH no need to set R/O or R/W to pin them or unpin them. */ |
1681 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 1693 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
1682 | return; | 1694 | return; |
1683 | 1695 | ||
1684 | if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) | 1696 | if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) |
1685 | BUG(); | 1697 | BUG(); |
1686 | } | 1698 | } |
1687 | static void set_page_prot(void *addr, pgprot_t prot) | 1699 | static void set_page_prot(void *addr, pgprot_t prot) |
1688 | { | 1700 | { |
1689 | return set_page_prot_flags(addr, prot, UVMF_NONE); | 1701 | return set_page_prot_flags(addr, prot, UVMF_NONE); |
1690 | } | 1702 | } |
1691 | #ifdef CONFIG_X86_32 | 1703 | #ifdef CONFIG_X86_32 |
1692 | static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) | 1704 | static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) |
1693 | { | 1705 | { |
1694 | unsigned pmdidx, pteidx; | 1706 | unsigned pmdidx, pteidx; |
1695 | unsigned ident_pte; | 1707 | unsigned ident_pte; |
1696 | unsigned long pfn; | 1708 | unsigned long pfn; |
1697 | 1709 | ||
1698 | level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, | 1710 | level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, |
1699 | PAGE_SIZE); | 1711 | PAGE_SIZE); |
1700 | 1712 | ||
1701 | ident_pte = 0; | 1713 | ident_pte = 0; |
1702 | pfn = 0; | 1714 | pfn = 0; |
1703 | for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { | 1715 | for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { |
1704 | pte_t *pte_page; | 1716 | pte_t *pte_page; |
1705 | 1717 | ||
1706 | /* Reuse or allocate a page of ptes */ | 1718 | /* Reuse or allocate a page of ptes */ |
1707 | if (pmd_present(pmd[pmdidx])) | 1719 | if (pmd_present(pmd[pmdidx])) |
1708 | pte_page = m2v(pmd[pmdidx].pmd); | 1720 | pte_page = m2v(pmd[pmdidx].pmd); |
1709 | else { | 1721 | else { |
1710 | /* Check for free pte pages */ | 1722 | /* Check for free pte pages */ |
1711 | if (ident_pte == LEVEL1_IDENT_ENTRIES) | 1723 | if (ident_pte == LEVEL1_IDENT_ENTRIES) |
1712 | break; | 1724 | break; |
1713 | 1725 | ||
1714 | pte_page = &level1_ident_pgt[ident_pte]; | 1726 | pte_page = &level1_ident_pgt[ident_pte]; |
1715 | ident_pte += PTRS_PER_PTE; | 1727 | ident_pte += PTRS_PER_PTE; |
1716 | 1728 | ||
1717 | pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); | 1729 | pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); |
1718 | } | 1730 | } |
1719 | 1731 | ||
1720 | /* Install mappings */ | 1732 | /* Install mappings */ |
1721 | for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { | 1733 | for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { |
1722 | pte_t pte; | 1734 | pte_t pte; |
1723 | 1735 | ||
1724 | #ifdef CONFIG_X86_32 | 1736 | #ifdef CONFIG_X86_32 |
1725 | if (pfn > max_pfn_mapped) | 1737 | if (pfn > max_pfn_mapped) |
1726 | max_pfn_mapped = pfn; | 1738 | max_pfn_mapped = pfn; |
1727 | #endif | 1739 | #endif |
1728 | 1740 | ||
1729 | if (!pte_none(pte_page[pteidx])) | 1741 | if (!pte_none(pte_page[pteidx])) |
1730 | continue; | 1742 | continue; |
1731 | 1743 | ||
1732 | pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); | 1744 | pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); |
1733 | pte_page[pteidx] = pte; | 1745 | pte_page[pteidx] = pte; |
1734 | } | 1746 | } |
1735 | } | 1747 | } |
1736 | 1748 | ||
1737 | for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) | 1749 | for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) |
1738 | set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); | 1750 | set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); |
1739 | 1751 | ||
1740 | set_page_prot(pmd, PAGE_KERNEL_RO); | 1752 | set_page_prot(pmd, PAGE_KERNEL_RO); |
1741 | } | 1753 | } |
1742 | #endif | 1754 | #endif |
1743 | void __init xen_setup_machphys_mapping(void) | 1755 | void __init xen_setup_machphys_mapping(void) |
1744 | { | 1756 | { |
1745 | struct xen_machphys_mapping mapping; | 1757 | struct xen_machphys_mapping mapping; |
1746 | 1758 | ||
1747 | if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { | 1759 | if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { |
1748 | machine_to_phys_mapping = (unsigned long *)mapping.v_start; | 1760 | machine_to_phys_mapping = (unsigned long *)mapping.v_start; |
1749 | machine_to_phys_nr = mapping.max_mfn + 1; | 1761 | machine_to_phys_nr = mapping.max_mfn + 1; |
1750 | } else { | 1762 | } else { |
1751 | machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; | 1763 | machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; |
1752 | } | 1764 | } |
1753 | #ifdef CONFIG_X86_32 | 1765 | #ifdef CONFIG_X86_32 |
1754 | WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1)) | 1766 | WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1)) |
1755 | < machine_to_phys_mapping); | 1767 | < machine_to_phys_mapping); |
1756 | #endif | 1768 | #endif |
1757 | } | 1769 | } |
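
For context, the machine_to_phys_mapping table wired up here is what backs mfn_to_pfn() on PV guests: conceptually it is just a Xen-maintained, read-only array of pfns indexed by mfn. A hedged sketch of the core lookup (the real mfn_to_pfn() also handles auto-translated guests, identity entries and foreign frames):

    /* Conceptual core of the M2P lookup; the real code adds more checks. */
    static inline unsigned long m2p_lookup(unsigned long mfn)
    {
            if (mfn >= machine_to_phys_nr)
                    return ~0UL;                  /* not covered by the table */
            return machine_to_phys_mapping[mfn];  /* pfn recorded by Xen */
    }
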
1758 | 1770 | ||
1759 | #ifdef CONFIG_X86_64 | 1771 | #ifdef CONFIG_X86_64 |
1760 | static void convert_pfn_mfn(void *v) | 1772 | static void convert_pfn_mfn(void *v) |
1761 | { | 1773 | { |
1762 | pte_t *pte = v; | 1774 | pte_t *pte = v; |
1763 | int i; | 1775 | int i; |
1764 | 1776 | ||
1765 | /* All levels are converted the same way, so just treat them | 1777 | /* All levels are converted the same way, so just treat them |
1766 | as ptes. */ | 1778 | as ptes. */ |
1767 | for (i = 0; i < PTRS_PER_PTE; i++) | 1779 | for (i = 0; i < PTRS_PER_PTE; i++) |
1768 | pte[i] = xen_make_pte(pte[i].pte); | 1780 | pte[i] = xen_make_pte(pte[i].pte); |
1769 | } | 1781 | } |
1770 | static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, | 1782 | static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, |
1771 | unsigned long addr) | 1783 | unsigned long addr) |
1772 | { | 1784 | { |
1773 | if (*pt_base == PFN_DOWN(__pa(addr))) { | 1785 | if (*pt_base == PFN_DOWN(__pa(addr))) { |
1774 | set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); | 1786 | set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); |
1775 | clear_page((void *)addr); | 1787 | clear_page((void *)addr); |
1776 | (*pt_base)++; | 1788 | (*pt_base)++; |
1777 | } | 1789 | } |
1778 | if (*pt_end == PFN_DOWN(__pa(addr))) { | 1790 | if (*pt_end == PFN_DOWN(__pa(addr))) { |
1779 | set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); | 1791 | set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); |
1780 | clear_page((void *)addr); | 1792 | clear_page((void *)addr); |
1781 | (*pt_end)--; | 1793 | (*pt_end)--; |
1782 | } | 1794 | } |
1783 | } | 1795 | } |
1784 | /* | 1796 | /* |
1785 | * Set up the initial kernel pagetable. | 1797 | * Set up the initial kernel pagetable. |
1786 | * | 1798 | * |
1787 | * We can construct this by grafting the Xen provided pagetable into | 1799 | * We can construct this by grafting the Xen provided pagetable into |
1788 | * head_64.S's preconstructed pagetables. We copy the Xen L2's into | 1800 | * head_64.S's preconstructed pagetables. We copy the Xen L2's into |
1789 | * level2_ident_pgt, and level2_kernel_pgt. This means that only the | 1801 | * level2_ident_pgt, and level2_kernel_pgt. This means that only the |
1790 | * kernel has a physical mapping to start with - but that's enough to | 1802 | * kernel has a physical mapping to start with - but that's enough to |
1791 | * get __va working. We need to fill in the rest of the physical | 1803 | * get __va working. We need to fill in the rest of the physical |
1792 | * mapping once some sort of allocator has been set up. NOTE: for | 1804 | * mapping once some sort of allocator has been set up. NOTE: for |
1793 | * PVH, the page tables are native. | 1805 | * PVH, the page tables are native. |
1794 | */ | 1806 | */ |
1795 | void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) | 1807 | void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) |
1796 | { | 1808 | { |
1797 | pud_t *l3; | 1809 | pud_t *l3; |
1798 | pmd_t *l2; | 1810 | pmd_t *l2; |
1799 | unsigned long addr[3]; | 1811 | unsigned long addr[3]; |
1800 | unsigned long pt_base, pt_end; | 1812 | unsigned long pt_base, pt_end; |
1801 | unsigned i; | 1813 | unsigned i; |
1802 | 1814 | ||
1803 | /* max_pfn_mapped is the last pfn mapped in the initial memory | 1815 | /* max_pfn_mapped is the last pfn mapped in the initial memory |
1804 | * mappings. Since on Xen, after the kernel mappings, we also have | 1816 | * mappings. Since on Xen, after the kernel mappings, we also have |
1805 | * mappings of pages that don't exist in pfn space, we set | 1817 | * mappings of pages that don't exist in pfn space, we set |
1806 | * max_pfn_mapped to the last real pfn mapped. */ | 1818 | * max_pfn_mapped to the last real pfn mapped. */ |
1807 | max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); | 1819 | max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); |
1808 | 1820 | ||
1809 | pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); | 1821 | pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); |
1810 | pt_end = pt_base + xen_start_info->nr_pt_frames; | 1822 | pt_end = pt_base + xen_start_info->nr_pt_frames; |
1811 | 1823 | ||
1812 | /* Zap identity mapping */ | 1824 | /* Zap identity mapping */ |
1813 | init_level4_pgt[0] = __pgd(0); | 1825 | init_level4_pgt[0] = __pgd(0); |
1814 | 1826 | ||
1815 | if (!xen_feature(XENFEAT_auto_translated_physmap)) { | 1827 | if (!xen_feature(XENFEAT_auto_translated_physmap)) { |
1816 | /* Pre-constructed entries are in pfn, so convert to mfn */ | 1828 | /* Pre-constructed entries are in pfn, so convert to mfn */ |
1817 | /* L4[272] -> level3_ident_pgt | 1829 | /* L4[272] -> level3_ident_pgt |
1818 | * L4[511] -> level3_kernel_pgt */ | 1830 | * L4[511] -> level3_kernel_pgt */ |
1819 | convert_pfn_mfn(init_level4_pgt); | 1831 | convert_pfn_mfn(init_level4_pgt); |
1820 | 1832 | ||
1821 | /* L3_i[0] -> level2_ident_pgt */ | 1833 | /* L3_i[0] -> level2_ident_pgt */ |
1822 | convert_pfn_mfn(level3_ident_pgt); | 1834 | convert_pfn_mfn(level3_ident_pgt); |
1823 | /* L3_k[510] -> level2_kernel_pgt | 1835 | /* L3_k[510] -> level2_kernel_pgt |
1824 | * L3_k[511] -> level2_fixmap_pgt */ | 1836 | * L3_k[511] -> level2_fixmap_pgt */ |
1825 | convert_pfn_mfn(level3_kernel_pgt); | 1837 | convert_pfn_mfn(level3_kernel_pgt); |
1826 | 1838 | ||
1827 | /* L3_k[511][506] -> level1_fixmap_pgt */ | 1839 | /* L3_k[511][506] -> level1_fixmap_pgt */ |
1828 | convert_pfn_mfn(level2_fixmap_pgt); | 1840 | convert_pfn_mfn(level2_fixmap_pgt); |
1829 | } | 1841 | } |
1830 | /* We get [511][511] and have Xen's version of level2_kernel_pgt */ | 1842 | /* We get [511][511] and have Xen's version of level2_kernel_pgt */ |
1831 | l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); | 1843 | l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); |
1832 | l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); | 1844 | l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); |
1833 | 1845 | ||
1834 | addr[0] = (unsigned long)pgd; | 1846 | addr[0] = (unsigned long)pgd; |
1835 | addr[1] = (unsigned long)l3; | 1847 | addr[1] = (unsigned long)l3; |
1836 | addr[2] = (unsigned long)l2; | 1848 | addr[2] = (unsigned long)l2; |
1837 | /* Graft it onto L4[272][0]. Note that we are creating an aliasing problem: | 1849 | /* Graft it onto L4[272][0]. Note that we are creating an aliasing problem: |
1838 | * Both L4[272][0] and L4[511][510] have entries that point to the same | 1850 | * Both L4[272][0] and L4[511][510] have entries that point to the same |
1839 | * L2 (PMD) tables, meaning that if you modify it in __va space | 1851 | * L2 (PMD) tables, meaning that if you modify it in __va space |
1840 | * it will also be modified in the __ka space! (But if you just | 1852 | * it will also be modified in the __ka space! (But if you just |
1841 | * modify the PMD table to point to other PTEs or none, then you | 1853 | * modify the PMD table to point to other PTEs or none, then you |
1842 | * are OK - which is what cleanup_highmap does.) */ | 1854 | * are OK - which is what cleanup_highmap does.) */ |
1843 | copy_page(level2_ident_pgt, l2); | 1855 | copy_page(level2_ident_pgt, l2); |
1844 | /* Graft it onto L4[511][510] */ | 1856 | /* Graft it onto L4[511][510] */ |
1845 | copy_page(level2_kernel_pgt, l2); | 1857 | copy_page(level2_kernel_pgt, l2); |
1846 | 1858 | ||
1847 | if (!xen_feature(XENFEAT_auto_translated_physmap)) { | 1859 | if (!xen_feature(XENFEAT_auto_translated_physmap)) { |
1848 | /* Make pagetable pieces RO */ | 1860 | /* Make pagetable pieces RO */ |
1849 | set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); | 1861 | set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); |
1850 | set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); | 1862 | set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); |
1851 | set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); | 1863 | set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); |
1852 | set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); | 1864 | set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); |
1853 | set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); | 1865 | set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); |
1854 | set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); | 1866 | set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); |
1855 | set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); | 1867 | set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); |
1856 | set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); | 1868 | set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); |
1857 | 1869 | ||
1858 | /* Pin down new L4 */ | 1870 | /* Pin down new L4 */ |
1859 | pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, | 1871 | pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, |
1860 | PFN_DOWN(__pa_symbol(init_level4_pgt))); | 1872 | PFN_DOWN(__pa_symbol(init_level4_pgt))); |
1861 | 1873 | ||
1862 | /* Unpin Xen-provided one */ | 1874 | /* Unpin Xen-provided one */ |
1863 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); | 1875 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); |
1864 | 1876 | ||
1865 | /* | 1877 | /* |
1866 | * At this stage there can be no user pgd, and no page | 1878 | * At this stage there can be no user pgd, and no page |
1867 | * structure to attach it to, so make sure we just set kernel | 1879 | * structure to attach it to, so make sure we just set kernel |
1868 | * pgd. | 1880 | * pgd. |
1869 | */ | 1881 | */ |
1870 | xen_mc_batch(); | 1882 | xen_mc_batch(); |
1871 | __xen_write_cr3(true, __pa(init_level4_pgt)); | 1883 | __xen_write_cr3(true, __pa(init_level4_pgt)); |
1872 | xen_mc_issue(PARAVIRT_LAZY_CPU); | 1884 | xen_mc_issue(PARAVIRT_LAZY_CPU); |
1873 | } else | 1885 | } else |
1874 | native_write_cr3(__pa(init_level4_pgt)); | 1886 | native_write_cr3(__pa(init_level4_pgt)); |
1875 | 1887 | ||
1876 | /* We can't easily rip out the L3 and L2 pages, as the Xen pagetables | 1888 | /* We can't easily rip out the L3 and L2 pages, as the Xen pagetables |
1877 | * are laid out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for | 1889 | * are laid out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for |
1878 | * the initial domain. For guests built by the toolstack, they are in | 1890 | * the initial domain. For guests built by the toolstack, they are in |
1879 | * [L4], [L3], [L2], [L1], [L1] order. So for dom0 we can only | 1891 | * [L4], [L3], [L2], [L1], [L1] order. So for dom0 we can only |
1880 | * rip out the [L4] (pgd), but for guests we shave off three pages. | 1892 | * rip out the [L4] (pgd), but for guests we shave off three pages. |
1881 | */ | 1893 | */ |
1882 | for (i = 0; i < ARRAY_SIZE(addr); i++) | 1894 | for (i = 0; i < ARRAY_SIZE(addr); i++) |
1883 | check_pt_base(&pt_base, &pt_end, addr[i]); | 1895 | check_pt_base(&pt_base, &pt_end, addr[i]); |
1884 | 1896 | ||
1885 | /* Reserve the Xen pagetable we are still using (now up to three pages smaller) */ | 1897 | /* Reserve the Xen pagetable we are still using (now up to three pages smaller) */ |
1886 | memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); | 1898 | memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); |
1887 | /* Revector the xen_start_info */ | 1899 | /* Revector the xen_start_info */ |
1888 | xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); | 1900 | xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); |
1889 | } | 1901 | } |
1890 | #else /* !CONFIG_X86_64 */ | 1902 | #else /* !CONFIG_X86_64 */ |
1891 | static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); | 1903 | static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); |
1892 | static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); | 1904 | static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); |
1893 | 1905 | ||
1894 | static void __init xen_write_cr3_init(unsigned long cr3) | 1906 | static void __init xen_write_cr3_init(unsigned long cr3) |
1895 | { | 1907 | { |
1896 | unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); | 1908 | unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); |
1897 | 1909 | ||
1898 | BUG_ON(read_cr3() != __pa(initial_page_table)); | 1910 | BUG_ON(read_cr3() != __pa(initial_page_table)); |
1899 | BUG_ON(cr3 != __pa(swapper_pg_dir)); | 1911 | BUG_ON(cr3 != __pa(swapper_pg_dir)); |
1900 | 1912 | ||
1901 | /* | 1913 | /* |
1902 | * We are switching to swapper_pg_dir for the first time (from | 1914 | * We are switching to swapper_pg_dir for the first time (from |
1903 | * initial_page_table) and therefore need to mark that page | 1915 | * initial_page_table) and therefore need to mark that page |
1904 | * read-only and then pin it. | 1916 | * read-only and then pin it. |
1905 | * | 1917 | * |
1906 | * Xen disallows sharing of kernel PMDs for PAE | 1918 | * Xen disallows sharing of kernel PMDs for PAE |
1907 | * guests. Therefore we must copy the kernel PMD from | 1919 | * guests. Therefore we must copy the kernel PMD from |
1908 | * initial_page_table into a new kernel PMD to be used in | 1920 | * initial_page_table into a new kernel PMD to be used in |
1909 | * swapper_pg_dir. | 1921 | * swapper_pg_dir. |
1910 | */ | 1922 | */ |
1911 | swapper_kernel_pmd = | 1923 | swapper_kernel_pmd = |
1912 | extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); | 1924 | extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); |
1913 | copy_page(swapper_kernel_pmd, initial_kernel_pmd); | 1925 | copy_page(swapper_kernel_pmd, initial_kernel_pmd); |
1914 | swapper_pg_dir[KERNEL_PGD_BOUNDARY] = | 1926 | swapper_pg_dir[KERNEL_PGD_BOUNDARY] = |
1915 | __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); | 1927 | __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); |
1916 | set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); | 1928 | set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); |
1917 | 1929 | ||
1918 | set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); | 1930 | set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); |
1919 | xen_write_cr3(cr3); | 1931 | xen_write_cr3(cr3); |
1920 | pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); | 1932 | pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); |
1921 | 1933 | ||
1922 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, | 1934 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, |
1923 | PFN_DOWN(__pa(initial_page_table))); | 1935 | PFN_DOWN(__pa(initial_page_table))); |
1924 | set_page_prot(initial_page_table, PAGE_KERNEL); | 1936 | set_page_prot(initial_page_table, PAGE_KERNEL); |
1925 | set_page_prot(initial_kernel_pmd, PAGE_KERNEL); | 1937 | set_page_prot(initial_kernel_pmd, PAGE_KERNEL); |
1926 | 1938 | ||
1927 | pv_mmu_ops.write_cr3 = &xen_write_cr3; | 1939 | pv_mmu_ops.write_cr3 = &xen_write_cr3; |
1928 | } | 1940 | } |
1929 | 1941 | ||
1930 | void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) | 1942 | void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) |
1931 | { | 1943 | { |
1932 | pmd_t *kernel_pmd; | 1944 | pmd_t *kernel_pmd; |
1933 | 1945 | ||
1934 | initial_kernel_pmd = | 1946 | initial_kernel_pmd = |
1935 | extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); | 1947 | extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); |
1936 | 1948 | ||
1937 | max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + | 1949 | max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + |
1938 | xen_start_info->nr_pt_frames * PAGE_SIZE + | 1950 | xen_start_info->nr_pt_frames * PAGE_SIZE + |
1939 | 512*1024); | 1951 | 512*1024); |
1940 | 1952 | ||
1941 | kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); | 1953 | kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); |
1942 | copy_page(initial_kernel_pmd, kernel_pmd); | 1954 | copy_page(initial_kernel_pmd, kernel_pmd); |
1943 | 1955 | ||
1944 | xen_map_identity_early(initial_kernel_pmd, max_pfn); | 1956 | xen_map_identity_early(initial_kernel_pmd, max_pfn); |
1945 | 1957 | ||
1946 | copy_page(initial_page_table, pgd); | 1958 | copy_page(initial_page_table, pgd); |
1947 | initial_page_table[KERNEL_PGD_BOUNDARY] = | 1959 | initial_page_table[KERNEL_PGD_BOUNDARY] = |
1948 | __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); | 1960 | __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); |
1949 | 1961 | ||
1950 | set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); | 1962 | set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); |
1951 | set_page_prot(initial_page_table, PAGE_KERNEL_RO); | 1963 | set_page_prot(initial_page_table, PAGE_KERNEL_RO); |
1952 | set_page_prot(empty_zero_page, PAGE_KERNEL_RO); | 1964 | set_page_prot(empty_zero_page, PAGE_KERNEL_RO); |
1953 | 1965 | ||
1954 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); | 1966 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); |
1955 | 1967 | ||
1956 | pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, | 1968 | pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, |
1957 | PFN_DOWN(__pa(initial_page_table))); | 1969 | PFN_DOWN(__pa(initial_page_table))); |
1958 | xen_write_cr3(__pa(initial_page_table)); | 1970 | xen_write_cr3(__pa(initial_page_table)); |
1959 | 1971 | ||
1960 | memblock_reserve(__pa(xen_start_info->pt_base), | 1972 | memblock_reserve(__pa(xen_start_info->pt_base), |
1961 | xen_start_info->nr_pt_frames * PAGE_SIZE); | 1973 | xen_start_info->nr_pt_frames * PAGE_SIZE); |
1962 | } | 1974 | } |
1963 | #endif /* CONFIG_X86_64 */ | 1975 | #endif /* CONFIG_X86_64 */ |
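
As an aside, the L4 slot numbers quoted in the comments above (272 for the identity area, 511 for the kernel area) fall straight out of the 4-level x86-64 address split, where pgd_index(va) = (va >> 39) & 511. A worked check, assuming the classic PAGE_OFFSET and __START_KERNEL_map values of this kernel generation:

    /* pgd_index(va) = (va >> 39) & 511 with 4-level paging:
     *   pgd_index(0xffff880000000000UL) == 272   PAGE_OFFSET, direct map
     *   pgd_index(0xffffffff80000000UL) == 511   __START_KERNEL_map, kernel text
     */
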
1964 | 1976 | ||
1965 | static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; | 1977 | static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; |
1966 | 1978 | ||
1967 | static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) | 1979 | static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) |
1968 | { | 1980 | { |
1969 | pte_t pte; | 1981 | pte_t pte; |
1970 | 1982 | ||
1971 | phys >>= PAGE_SHIFT; | 1983 | phys >>= PAGE_SHIFT; |
1972 | 1984 | ||
1973 | switch (idx) { | 1985 | switch (idx) { |
1974 | case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: | 1986 | case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: |
1975 | case FIX_RO_IDT: | 1987 | case FIX_RO_IDT: |
1976 | #ifdef CONFIG_X86_32 | 1988 | #ifdef CONFIG_X86_32 |
1977 | case FIX_WP_TEST: | 1989 | case FIX_WP_TEST: |
1978 | # ifdef CONFIG_HIGHMEM | 1990 | # ifdef CONFIG_HIGHMEM |
1979 | case FIX_KMAP_BEGIN ... FIX_KMAP_END: | 1991 | case FIX_KMAP_BEGIN ... FIX_KMAP_END: |
1980 | # endif | 1992 | # endif |
1981 | #elif defined(CONFIG_X86_VSYSCALL_EMULATION) | 1993 | #elif defined(CONFIG_X86_VSYSCALL_EMULATION) |
1982 | case VSYSCALL_PAGE: | 1994 | case VSYSCALL_PAGE: |
1983 | #endif | 1995 | #endif |
1984 | case FIX_TEXT_POKE0: | 1996 | case FIX_TEXT_POKE0: |
1985 | case FIX_TEXT_POKE1: | 1997 | case FIX_TEXT_POKE1: |
1986 | /* All local page mappings */ | 1998 | /* All local page mappings */ |
1987 | pte = pfn_pte(phys, prot); | 1999 | pte = pfn_pte(phys, prot); |
1988 | break; | 2000 | break; |
1989 | 2001 | ||
1990 | #ifdef CONFIG_X86_LOCAL_APIC | 2002 | #ifdef CONFIG_X86_LOCAL_APIC |
1991 | case FIX_APIC_BASE: /* maps dummy local APIC */ | 2003 | case FIX_APIC_BASE: /* maps dummy local APIC */ |
1992 | pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); | 2004 | pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); |
1993 | break; | 2005 | break; |
1994 | #endif | 2006 | #endif |
1995 | 2007 | ||
1996 | #ifdef CONFIG_X86_IO_APIC | 2008 | #ifdef CONFIG_X86_IO_APIC |
1997 | case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: | 2009 | case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: |
1998 | /* | 2010 | /* |
1999 | * We just don't map the IO APIC - all access is via | 2011 | * We just don't map the IO APIC - all access is via |
2000 | * hypercalls. Keep the address in the pte for reference. | 2012 | * hypercalls. Keep the address in the pte for reference. |
2001 | */ | 2013 | */ |
2002 | pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); | 2014 | pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); |
2003 | break; | 2015 | break; |
2004 | #endif | 2016 | #endif |
2005 | 2017 | ||
2006 | case FIX_PARAVIRT_BOOTMAP: | 2018 | case FIX_PARAVIRT_BOOTMAP: |
2007 | /* This is an MFN, but it isn't an IO mapping from the | 2019 | /* This is an MFN, but it isn't an IO mapping from the |
2008 | IO domain */ | 2020 | IO domain */ |
2009 | pte = mfn_pte(phys, prot); | 2021 | pte = mfn_pte(phys, prot); |
2010 | break; | 2022 | break; |
2011 | 2023 | ||
2012 | default: | 2024 | default: |
2013 | /* By default, set_fixmap is used for hardware mappings */ | 2025 | /* By default, set_fixmap is used for hardware mappings */ |
2014 | pte = mfn_pte(phys, prot); | 2026 | pte = mfn_pte(phys, prot); |
2015 | break; | 2027 | break; |
2016 | } | 2028 | } |
2017 | 2029 | ||
2018 | __native_set_fixmap(idx, pte); | 2030 | __native_set_fixmap(idx, pte); |
2019 | 2031 | ||
2020 | #ifdef CONFIG_X86_VSYSCALL_EMULATION | 2032 | #ifdef CONFIG_X86_VSYSCALL_EMULATION |
2021 | /* Replicate changes to map the vsyscall page into the user | 2033 | /* Replicate changes to map the vsyscall page into the user |
2022 | pagetable vsyscall mapping. */ | 2034 | pagetable vsyscall mapping. */ |
2023 | if (idx == VSYSCALL_PAGE) { | 2035 | if (idx == VSYSCALL_PAGE) { |
2024 | unsigned long vaddr = __fix_to_virt(idx); | 2036 | unsigned long vaddr = __fix_to_virt(idx); |
2025 | set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); | 2037 | set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); |
2026 | } | 2038 | } |
2027 | #endif | 2039 | #endif |
2028 | } | 2040 | } |
2029 | 2041 | ||
2030 | static void __init xen_post_allocator_init(void) | 2042 | static void __init xen_post_allocator_init(void) |
2031 | { | 2043 | { |
2032 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 2044 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
2033 | return; | 2045 | return; |
2034 | 2046 | ||
2035 | pv_mmu_ops.set_pte = xen_set_pte; | 2047 | pv_mmu_ops.set_pte = xen_set_pte; |
2036 | pv_mmu_ops.set_pmd = xen_set_pmd; | 2048 | pv_mmu_ops.set_pmd = xen_set_pmd; |
2037 | pv_mmu_ops.set_pud = xen_set_pud; | 2049 | pv_mmu_ops.set_pud = xen_set_pud; |
2038 | #if PAGETABLE_LEVELS == 4 | 2050 | #if PAGETABLE_LEVELS == 4 |
2039 | pv_mmu_ops.set_pgd = xen_set_pgd; | 2051 | pv_mmu_ops.set_pgd = xen_set_pgd; |
2040 | #endif | 2052 | #endif |
2041 | 2053 | ||
2042 | /* This will work as long as patching hasn't happened yet | 2054 | /* This will work as long as patching hasn't happened yet |
2043 | (which it hasn't) */ | 2055 | (which it hasn't) */ |
2044 | pv_mmu_ops.alloc_pte = xen_alloc_pte; | 2056 | pv_mmu_ops.alloc_pte = xen_alloc_pte; |
2045 | pv_mmu_ops.alloc_pmd = xen_alloc_pmd; | 2057 | pv_mmu_ops.alloc_pmd = xen_alloc_pmd; |
2046 | pv_mmu_ops.release_pte = xen_release_pte; | 2058 | pv_mmu_ops.release_pte = xen_release_pte; |
2047 | pv_mmu_ops.release_pmd = xen_release_pmd; | 2059 | pv_mmu_ops.release_pmd = xen_release_pmd; |
2048 | #if PAGETABLE_LEVELS == 4 | 2060 | #if PAGETABLE_LEVELS == 4 |
2049 | pv_mmu_ops.alloc_pud = xen_alloc_pud; | 2061 | pv_mmu_ops.alloc_pud = xen_alloc_pud; |
2050 | pv_mmu_ops.release_pud = xen_release_pud; | 2062 | pv_mmu_ops.release_pud = xen_release_pud; |
2051 | #endif | 2063 | #endif |
2052 | 2064 | ||
2053 | #ifdef CONFIG_X86_64 | 2065 | #ifdef CONFIG_X86_64 |
2054 | pv_mmu_ops.write_cr3 = &xen_write_cr3; | 2066 | pv_mmu_ops.write_cr3 = &xen_write_cr3; |
2055 | SetPagePinned(virt_to_page(level3_user_vsyscall)); | 2067 | SetPagePinned(virt_to_page(level3_user_vsyscall)); |
2056 | #endif | 2068 | #endif |
2057 | xen_mark_init_mm_pinned(); | 2069 | xen_mark_init_mm_pinned(); |
2058 | } | 2070 | } |
2059 | 2071 | ||
2060 | static void xen_leave_lazy_mmu(void) | 2072 | static void xen_leave_lazy_mmu(void) |
2061 | { | 2073 | { |
2062 | preempt_disable(); | 2074 | preempt_disable(); |
2063 | xen_mc_flush(); | 2075 | xen_mc_flush(); |
2064 | paravirt_leave_lazy_mmu(); | 2076 | paravirt_leave_lazy_mmu(); |
2065 | preempt_enable(); | 2077 | preempt_enable(); |
2066 | } | 2078 | } |
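
The lazy_mode hooks wired into xen_mmu_ops below are what make bulk pagetable updates cheap: while a caller is inside lazy MMU mode, the set_pte family queues multicalls instead of trapping per entry, and leaving the mode flushes them in one hypercall. A rough sketch of the pattern as generic mm code drives it (the call site shown is assumed, not from this file):

    arch_enter_lazy_mmu_mode();                 /* -> paravirt_enter_lazy_mmu */
    for (; addr < end; addr += PAGE_SIZE, ptep++)
            set_pte_at(mm, addr, ptep, pte);    /* queued, not yet issued */
    arch_leave_lazy_mmu_mode();                 /* -> xen_leave_lazy_mmu(): */
                                                /*    xen_mc_flush() issues all */
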
2067 | 2079 | ||
2068 | static const struct pv_mmu_ops xen_mmu_ops __initconst = { | 2080 | static const struct pv_mmu_ops xen_mmu_ops __initconst = { |
2069 | .read_cr2 = xen_read_cr2, | 2081 | .read_cr2 = xen_read_cr2, |
2070 | .write_cr2 = xen_write_cr2, | 2082 | .write_cr2 = xen_write_cr2, |
2071 | 2083 | ||
2072 | .read_cr3 = xen_read_cr3, | 2084 | .read_cr3 = xen_read_cr3, |
2073 | .write_cr3 = xen_write_cr3_init, | 2085 | .write_cr3 = xen_write_cr3_init, |
2074 | 2086 | ||
2075 | .flush_tlb_user = xen_flush_tlb, | 2087 | .flush_tlb_user = xen_flush_tlb, |
2076 | .flush_tlb_kernel = xen_flush_tlb, | 2088 | .flush_tlb_kernel = xen_flush_tlb, |
2077 | .flush_tlb_single = xen_flush_tlb_single, | 2089 | .flush_tlb_single = xen_flush_tlb_single, |
2078 | .flush_tlb_others = xen_flush_tlb_others, | 2090 | .flush_tlb_others = xen_flush_tlb_others, |
2079 | 2091 | ||
2080 | .pte_update = paravirt_nop, | 2092 | .pte_update = paravirt_nop, |
2081 | .pte_update_defer = paravirt_nop, | 2093 | .pte_update_defer = paravirt_nop, |
2082 | 2094 | ||
2083 | .pgd_alloc = xen_pgd_alloc, | 2095 | .pgd_alloc = xen_pgd_alloc, |
2084 | .pgd_free = xen_pgd_free, | 2096 | .pgd_free = xen_pgd_free, |
2085 | 2097 | ||
2086 | .alloc_pte = xen_alloc_pte_init, | 2098 | .alloc_pte = xen_alloc_pte_init, |
2087 | .release_pte = xen_release_pte_init, | 2099 | .release_pte = xen_release_pte_init, |
2088 | .alloc_pmd = xen_alloc_pmd_init, | 2100 | .alloc_pmd = xen_alloc_pmd_init, |
2089 | .release_pmd = xen_release_pmd_init, | 2101 | .release_pmd = xen_release_pmd_init, |
2090 | 2102 | ||
2091 | .set_pte = xen_set_pte_init, | 2103 | .set_pte = xen_set_pte_init, |
2092 | .set_pte_at = xen_set_pte_at, | 2104 | .set_pte_at = xen_set_pte_at, |
2093 | .set_pmd = xen_set_pmd_hyper, | 2105 | .set_pmd = xen_set_pmd_hyper, |
2094 | 2106 | ||
2095 | .ptep_modify_prot_start = __ptep_modify_prot_start, | 2107 | .ptep_modify_prot_start = __ptep_modify_prot_start, |
2096 | .ptep_modify_prot_commit = __ptep_modify_prot_commit, | 2108 | .ptep_modify_prot_commit = __ptep_modify_prot_commit, |
2097 | 2109 | ||
2098 | .pte_val = PV_CALLEE_SAVE(xen_pte_val), | 2110 | .pte_val = PV_CALLEE_SAVE(xen_pte_val), |
2099 | .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), | 2111 | .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), |
2100 | 2112 | ||
2101 | .make_pte = PV_CALLEE_SAVE(xen_make_pte), | 2113 | .make_pte = PV_CALLEE_SAVE(xen_make_pte), |
2102 | .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), | 2114 | .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), |
2103 | 2115 | ||
2104 | #ifdef CONFIG_X86_PAE | 2116 | #ifdef CONFIG_X86_PAE |
2105 | .set_pte_atomic = xen_set_pte_atomic, | 2117 | .set_pte_atomic = xen_set_pte_atomic, |
2106 | .pte_clear = xen_pte_clear, | 2118 | .pte_clear = xen_pte_clear, |
2107 | .pmd_clear = xen_pmd_clear, | 2119 | .pmd_clear = xen_pmd_clear, |
2108 | #endif /* CONFIG_X86_PAE */ | 2120 | #endif /* CONFIG_X86_PAE */ |
2109 | .set_pud = xen_set_pud_hyper, | 2121 | .set_pud = xen_set_pud_hyper, |
2110 | 2122 | ||
2111 | .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), | 2123 | .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), |
2112 | .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), | 2124 | .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), |
2113 | 2125 | ||
2114 | #if PAGETABLE_LEVELS == 4 | 2126 | #if PAGETABLE_LEVELS == 4 |
2115 | .pud_val = PV_CALLEE_SAVE(xen_pud_val), | 2127 | .pud_val = PV_CALLEE_SAVE(xen_pud_val), |
2116 | .make_pud = PV_CALLEE_SAVE(xen_make_pud), | 2128 | .make_pud = PV_CALLEE_SAVE(xen_make_pud), |
2117 | .set_pgd = xen_set_pgd_hyper, | 2129 | .set_pgd = xen_set_pgd_hyper, |
2118 | 2130 | ||
2119 | .alloc_pud = xen_alloc_pmd_init, | 2131 | .alloc_pud = xen_alloc_pmd_init, |
2120 | .release_pud = xen_release_pmd_init, | 2132 | .release_pud = xen_release_pmd_init, |
2121 | #endif /* PAGETABLE_LEVELS == 4 */ | 2133 | #endif /* PAGETABLE_LEVELS == 4 */ |
2122 | 2134 | ||
2123 | .activate_mm = xen_activate_mm, | 2135 | .activate_mm = xen_activate_mm, |
2124 | .dup_mmap = xen_dup_mmap, | 2136 | .dup_mmap = xen_dup_mmap, |
2125 | .exit_mmap = xen_exit_mmap, | 2137 | .exit_mmap = xen_exit_mmap, |
2126 | 2138 | ||
2127 | .lazy_mode = { | 2139 | .lazy_mode = { |
2128 | .enter = paravirt_enter_lazy_mmu, | 2140 | .enter = paravirt_enter_lazy_mmu, |
2129 | .leave = xen_leave_lazy_mmu, | 2141 | .leave = xen_leave_lazy_mmu, |
2130 | .flush = paravirt_flush_lazy_mmu, | 2142 | .flush = paravirt_flush_lazy_mmu, |
2131 | }, | 2143 | }, |
2132 | 2144 | ||
2133 | .set_fixmap = xen_set_fixmap, | 2145 | .set_fixmap = xen_set_fixmap, |
2134 | }; | 2146 | }; |
2135 | 2147 | ||
2136 | void __init xen_init_mmu_ops(void) | 2148 | void __init xen_init_mmu_ops(void) |
2137 | { | 2149 | { |
2138 | x86_init.paging.pagetable_init = xen_pagetable_init; | 2150 | x86_init.paging.pagetable_init = xen_pagetable_init; |
2139 | 2151 | ||
2140 | /* Optimization - we could use the HVM flush_tlb_others, but it has | 2152 | /* Optimization - we could use the HVM flush_tlb_others, but it has |
2141 | * no idea which VCPUs are descheduled, which means it would needlessly | 2153 | * no idea which VCPUs are descheduled, which means it would needlessly |
2142 | * IPI them. Xen knows, so let it do the job. | 2154 | * IPI them. Xen knows, so let it do the job. |
2143 | */ | 2155 | */ |
2144 | if (xen_feature(XENFEAT_auto_translated_physmap)) { | 2156 | if (xen_feature(XENFEAT_auto_translated_physmap)) { |
2145 | pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; | 2157 | pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; |
2146 | return; | 2158 | return; |
2147 | } | 2159 | } |
2148 | pv_mmu_ops = xen_mmu_ops; | 2160 | pv_mmu_ops = xen_mmu_ops; |
2149 | 2161 | ||
2150 | memset(dummy_mapping, 0xff, PAGE_SIZE); | 2162 | memset(dummy_mapping, 0xff, PAGE_SIZE); |
2151 | } | 2163 | } |
2152 | 2164 | ||
2153 | /* Protected by xen_reservation_lock. */ | 2165 | /* Protected by xen_reservation_lock. */ |
2154 | #define MAX_CONTIG_ORDER 9 /* 2MB */ | 2166 | #define MAX_CONTIG_ORDER 9 /* 2MB */ |
2155 | static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; | 2167 | static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; |
2156 | 2168 | ||
2157 | #define VOID_PTE (mfn_pte(0, __pgprot(0))) | 2169 | #define VOID_PTE (mfn_pte(0, __pgprot(0))) |
2158 | static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, | 2170 | static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, |
2159 | unsigned long *in_frames, | 2171 | unsigned long *in_frames, |
2160 | unsigned long *out_frames) | 2172 | unsigned long *out_frames) |
2161 | { | 2173 | { |
2162 | int i; | 2174 | int i; |
2163 | struct multicall_space mcs; | 2175 | struct multicall_space mcs; |
2164 | 2176 | ||
2165 | xen_mc_batch(); | 2177 | xen_mc_batch(); |
2166 | for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) { | 2178 | for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) { |
2167 | mcs = __xen_mc_entry(0); | 2179 | mcs = __xen_mc_entry(0); |
2168 | 2180 | ||
2169 | if (in_frames) | 2181 | if (in_frames) |
2170 | in_frames[i] = virt_to_mfn(vaddr); | 2182 | in_frames[i] = virt_to_mfn(vaddr); |
2171 | 2183 | ||
2172 | MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); | 2184 | MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); |
2173 | __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); | 2185 | __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); |
2174 | 2186 | ||
2175 | if (out_frames) | 2187 | if (out_frames) |
2176 | out_frames[i] = virt_to_pfn(vaddr); | 2188 | out_frames[i] = virt_to_pfn(vaddr); |
2177 | } | 2189 | } |
2178 | xen_mc_issue(0); | 2190 | xen_mc_issue(0); |
2179 | } | 2191 | } |
2180 | 2192 | ||
2181 | /* | 2193 | /* |
2182 | * Update the pfn-to-mfn mappings for a virtual address range, either to | 2194 | * Update the pfn-to-mfn mappings for a virtual address range, either to |
2183 | * point to an array of mfns, or contiguously from a single starting | 2195 | * point to an array of mfns, or contiguously from a single starting |
2184 | * mfn. | 2196 | * mfn. |
2185 | */ | 2197 | */ |
2186 | static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, | 2198 | static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, |
2187 | unsigned long *mfns, | 2199 | unsigned long *mfns, |
2188 | unsigned long first_mfn) | 2200 | unsigned long first_mfn) |
2189 | { | 2201 | { |
2190 | unsigned i, limit; | 2202 | unsigned i, limit; |
2191 | unsigned long mfn; | 2203 | unsigned long mfn; |
2192 | 2204 | ||
2193 | xen_mc_batch(); | 2205 | xen_mc_batch(); |
2194 | 2206 | ||
2195 | limit = 1u << order; | 2207 | limit = 1u << order; |
2196 | for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { | 2208 | for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { |
2197 | struct multicall_space mcs; | 2209 | struct multicall_space mcs; |
2198 | unsigned flags; | 2210 | unsigned flags; |
2199 | 2211 | ||
2200 | mcs = __xen_mc_entry(0); | 2212 | mcs = __xen_mc_entry(0); |
2201 | if (mfns) | 2213 | if (mfns) |
2202 | mfn = mfns[i]; | 2214 | mfn = mfns[i]; |
2203 | else | 2215 | else |
2204 | mfn = first_mfn + i; | 2216 | mfn = first_mfn + i; |
2205 | 2217 | ||
2206 | if (i < (limit - 1)) | 2218 | if (i < (limit - 1)) |
2207 | flags = 0; | 2219 | flags = 0; |
2208 | else { | 2220 | else { |
2209 | if (order == 0) | 2221 | if (order == 0) |
2210 | flags = UVMF_INVLPG | UVMF_ALL; | 2222 | flags = UVMF_INVLPG | UVMF_ALL; |
2211 | else | 2223 | else |
2212 | flags = UVMF_TLB_FLUSH | UVMF_ALL; | 2224 | flags = UVMF_TLB_FLUSH | UVMF_ALL; |
2213 | } | 2225 | } |
2214 | 2226 | ||
2215 | MULTI_update_va_mapping(mcs.mc, vaddr, | 2227 | MULTI_update_va_mapping(mcs.mc, vaddr, |
2216 | mfn_pte(mfn, PAGE_KERNEL), flags); | 2228 | mfn_pte(mfn, PAGE_KERNEL), flags); |
2217 | 2229 | ||
2218 | set_phys_to_machine(virt_to_pfn(vaddr), mfn); | 2230 | set_phys_to_machine(virt_to_pfn(vaddr), mfn); |
2219 | } | 2231 | } |
2220 | 2232 | ||
2221 | xen_mc_issue(0); | 2233 | xen_mc_issue(0); |
2222 | } | 2234 | } |
2223 | 2235 | ||
2224 | /* | 2236 | /* |
2225 | * Perform the hypercall to exchange a region of our pfns to point to | 2237 | * Perform the hypercall to exchange a region of our pfns to point to |
2226 | * memory with the required contiguous alignment. Takes the pfns as | 2238 | * memory with the required contiguous alignment. Takes the pfns as |
2227 | * input, and populates mfns as output. | 2239 | * input, and populates mfns as output. |
2228 | * | 2240 | * |
2229 | * Returns a success code indicating whether the hypervisor was able to | 2241 | * Returns a success code indicating whether the hypervisor was able to |
2230 | * satisfy the request or not. | 2242 | * satisfy the request or not. |
2231 | */ | 2243 | */ |
2232 | static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, | 2244 | static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, |
2233 | unsigned long *pfns_in, | 2245 | unsigned long *pfns_in, |
2234 | unsigned long extents_out, | 2246 | unsigned long extents_out, |
2235 | unsigned int order_out, | 2247 | unsigned int order_out, |
2236 | unsigned long *mfns_out, | 2248 | unsigned long *mfns_out, |
2237 | unsigned int address_bits) | 2249 | unsigned int address_bits) |
2238 | { | 2250 | { |
2239 | long rc; | 2251 | long rc; |
2240 | int success; | 2252 | int success; |
2241 | 2253 | ||
2242 | struct xen_memory_exchange exchange = { | 2254 | struct xen_memory_exchange exchange = { |
2243 | .in = { | 2255 | .in = { |
2244 | .nr_extents = extents_in, | 2256 | .nr_extents = extents_in, |
2245 | .extent_order = order_in, | 2257 | .extent_order = order_in, |
2246 | .extent_start = pfns_in, | 2258 | .extent_start = pfns_in, |
2247 | .domid = DOMID_SELF | 2259 | .domid = DOMID_SELF |
2248 | }, | 2260 | }, |
2249 | .out = { | 2261 | .out = { |
2250 | .nr_extents = extents_out, | 2262 | .nr_extents = extents_out, |
2251 | .extent_order = order_out, | 2263 | .extent_order = order_out, |
2252 | .extent_start = mfns_out, | 2264 | .extent_start = mfns_out, |
2253 | .address_bits = address_bits, | 2265 | .address_bits = address_bits, |
2254 | .domid = DOMID_SELF | 2266 | .domid = DOMID_SELF |
2255 | } | 2267 | } |
2256 | }; | 2268 | }; |
2257 | 2269 | ||
2258 | BUG_ON(extents_in << order_in != extents_out << order_out); | 2270 | BUG_ON(extents_in << order_in != extents_out << order_out); |
2259 | 2271 | ||
2260 | rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); | 2272 | rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); |
2261 | success = (exchange.nr_exchanged == extents_in); | 2273 | success = (exchange.nr_exchanged == extents_in); |
2262 | 2274 | ||
2263 | BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); | 2275 | BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); |
2264 | BUG_ON(success && (rc != 0)); | 2276 | BUG_ON(success && (rc != 0)); |
2265 | 2277 | ||
2266 | return success; | 2278 | return success; |
2267 | } | 2279 | } |
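
Note the all-or-nothing contract the BUG_ON()s enforce: the hypervisor either exchanges every input extent (rc == 0) or none of them. The two callers below drive the helper in opposite directions; schematically:

    /* make contiguous: 2^order single frames in -> one 2^order extent out */
    xen_exchange_memory(1UL << order, 0, in_frames,
                        1, order, &out_frame, address_bits);
    /* break up again: one 2^order extent in -> 2^order single frames out */
    xen_exchange_memory(1, order, &in_frame,
                        1UL << order, 0, out_frames, 0);
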
2268 | 2280 | ||
2269 | int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, | 2281 | int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, |
2270 | unsigned int address_bits, | 2282 | unsigned int address_bits, |
2271 | dma_addr_t *dma_handle) | 2283 | dma_addr_t *dma_handle) |
2272 | { | 2284 | { |
2273 | unsigned long *in_frames = discontig_frames, out_frame; | 2285 | unsigned long *in_frames = discontig_frames, out_frame; |
2274 | unsigned long flags; | 2286 | unsigned long flags; |
2275 | int success; | 2287 | int success; |
2276 | unsigned long vstart = (unsigned long)phys_to_virt(pstart); | 2288 | unsigned long vstart = (unsigned long)phys_to_virt(pstart); |
2277 | 2289 | ||
2278 | /* | 2290 | /* |
2279 | * Currently an auto-translated guest will not perform I/O, nor will | 2291 | * Currently an auto-translated guest will not perform I/O, nor will |
2280 | * it require PAE page directories below 4GB. Therefore any calls to | 2292 | * it require PAE page directories below 4GB. Therefore any calls to |
2281 | * this function are redundant and can be ignored. | 2293 | * this function are redundant and can be ignored. |
2282 | */ | 2294 | */ |
2283 | 2295 | ||
2284 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 2296 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
2285 | return 0; | 2297 | return 0; |
2286 | 2298 | ||
2287 | if (unlikely(order > MAX_CONTIG_ORDER)) | 2299 | if (unlikely(order > MAX_CONTIG_ORDER)) |
2288 | return -ENOMEM; | 2300 | return -ENOMEM; |
2289 | 2301 | ||
2290 | memset((void *) vstart, 0, PAGE_SIZE << order); | 2302 | memset((void *) vstart, 0, PAGE_SIZE << order); |
2291 | 2303 | ||
2292 | spin_lock_irqsave(&xen_reservation_lock, flags); | 2304 | spin_lock_irqsave(&xen_reservation_lock, flags); |
2293 | 2305 | ||
2294 | /* 1. Zap current PTEs, remembering MFNs. */ | 2306 | /* 1. Zap current PTEs, remembering MFNs. */ |
2295 | xen_zap_pfn_range(vstart, order, in_frames, NULL); | 2307 | xen_zap_pfn_range(vstart, order, in_frames, NULL); |
2296 | 2308 | ||
2297 | /* 2. Get a new contiguous memory extent. */ | 2309 | /* 2. Get a new contiguous memory extent. */ |
2298 | out_frame = virt_to_pfn(vstart); | 2310 | out_frame = virt_to_pfn(vstart); |
2299 | success = xen_exchange_memory(1UL << order, 0, in_frames, | 2311 | success = xen_exchange_memory(1UL << order, 0, in_frames, |
2300 | 1, order, &out_frame, | 2312 | 1, order, &out_frame, |
2301 | address_bits); | 2313 | address_bits); |
2302 | 2314 | ||
2303 | /* 3. Map the new extent in place of old pages. */ | 2315 | /* 3. Map the new extent in place of old pages. */ |
2304 | if (success) | 2316 | if (success) |
2305 | xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); | 2317 | xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); |
2306 | else | 2318 | else |
2307 | xen_remap_exchanged_ptes(vstart, order, in_frames, 0); | 2319 | xen_remap_exchanged_ptes(vstart, order, in_frames, 0); |
2308 | 2320 | ||
2309 | spin_unlock_irqrestore(&xen_reservation_lock, flags); | 2321 | spin_unlock_irqrestore(&xen_reservation_lock, flags); |
2310 | 2322 | ||
2311 | *dma_handle = virt_to_machine(vstart).maddr; | 2323 | *dma_handle = virt_to_machine(vstart).maddr; |
2312 | return success ? 0 : -ENOMEM; | 2324 | return success ? 0 : -ENOMEM; |
2313 | } | 2325 | } |
2314 | EXPORT_SYMBOL_GPL(xen_create_contiguous_region); | 2326 | EXPORT_SYMBOL_GPL(xen_create_contiguous_region); |
2315 | 2327 | ||
2316 | void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) | 2328 | void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) |
2317 | { | 2329 | { |
2318 | unsigned long *out_frames = discontig_frames, in_frame; | 2330 | unsigned long *out_frames = discontig_frames, in_frame; |
2319 | unsigned long flags; | 2331 | unsigned long flags; |
2320 | int success; | 2332 | int success; |
2321 | unsigned long vstart; | 2333 | unsigned long vstart; |
2322 | 2334 | ||
2323 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 2335 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
2324 | return; | 2336 | return; |
2325 | 2337 | ||
2326 | if (unlikely(order > MAX_CONTIG_ORDER)) | 2338 | if (unlikely(order > MAX_CONTIG_ORDER)) |
2327 | return; | 2339 | return; |
2328 | 2340 | ||
2329 | vstart = (unsigned long)phys_to_virt(pstart); | 2341 | vstart = (unsigned long)phys_to_virt(pstart); |
2330 | memset((void *) vstart, 0, PAGE_SIZE << order); | 2342 | memset((void *) vstart, 0, PAGE_SIZE << order); |
2331 | 2343 | ||
2332 | spin_lock_irqsave(&xen_reservation_lock, flags); | 2344 | spin_lock_irqsave(&xen_reservation_lock, flags); |
2333 | 2345 | ||
2334 | /* 1. Find start MFN of contiguous extent. */ | 2346 | /* 1. Find start MFN of contiguous extent. */ |
2335 | in_frame = virt_to_mfn(vstart); | 2347 | in_frame = virt_to_mfn(vstart); |
2336 | 2348 | ||
2337 | /* 2. Zap current PTEs. */ | 2349 | /* 2. Zap current PTEs. */ |
2338 | xen_zap_pfn_range(vstart, order, NULL, out_frames); | 2350 | xen_zap_pfn_range(vstart, order, NULL, out_frames); |
2339 | 2351 | ||
2340 | /* 3. Do the exchange for non-contiguous MFNs. */ | 2352 | /* 3. Do the exchange for non-contiguous MFNs. */ |
2341 | success = xen_exchange_memory(1, order, &in_frame, 1UL << order, | 2353 | success = xen_exchange_memory(1, order, &in_frame, 1UL << order, |
2342 | 0, out_frames, 0); | 2354 | 0, out_frames, 0); |
2343 | 2355 | ||
2344 | /* 4. Map new pages in place of old pages. */ | 2356 | /* 4. Map new pages in place of old pages. */ |
2345 | if (success) | 2357 | if (success) |
2346 | xen_remap_exchanged_ptes(vstart, order, out_frames, 0); | 2358 | xen_remap_exchanged_ptes(vstart, order, out_frames, 0); |
2347 | else | 2359 | else |
2348 | xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); | 2360 | xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); |
2349 | 2361 | ||
2350 | spin_unlock_irqrestore(&xen_reservation_lock, flags); | 2362 | spin_unlock_irqrestore(&xen_reservation_lock, flags); |
2351 | } | 2363 | } |
2352 | EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); | 2364 | EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); |
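
The classic user of this pair is swiotlb-xen, which needs buffers that are machine-contiguous and below a device's DMA limit. A hedged usage sketch (the buffer, order and address width here are illustrative, not taken from this file):

    /* Make a kernel-contiguous buffer also machine-contiguous, below 4GB. */
    dma_addr_t dma_handle;
    void *buf = (void *)__get_free_pages(GFP_KERNEL, 4);    /* 2^4 pages */
    int rc = xen_create_contiguous_region(virt_to_phys(buf), 4,
                                          32, &dma_handle); /* 32-bit DMA */
    if (rc)
            pr_warn("xen: could not make region contiguous: %d\n", rc);
    /* ... program the device with dma_handle, do the I/O ... */
    xen_destroy_contiguous_region(virt_to_phys(buf), 4);
    free_pages((unsigned long)buf, 4);
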
2353 | 2365 | ||
2354 | #ifdef CONFIG_XEN_PVHVM | 2366 | #ifdef CONFIG_XEN_PVHVM |
2355 | #ifdef CONFIG_PROC_VMCORE | 2367 | #ifdef CONFIG_PROC_VMCORE |
2356 | /* | 2368 | /* |
2357 | * This function is used in two contexts: | 2369 | * This function is used in two contexts: |
2358 | * - the kdump kernel has to check whether a pfn of the crashed kernel | 2370 | * - the kdump kernel has to check whether a pfn of the crashed kernel |
2359 | * was a ballooned page. vmcore is using this function to decide | 2371 | * was a ballooned page. vmcore is using this function to decide |
2360 | * whether to access a pfn of the crashed kernel. | 2372 | * whether to access a pfn of the crashed kernel. |
2361 | * - the kexec kernel has to check whether a pfn was ballooned by the | 2373 | * - the kexec kernel has to check whether a pfn was ballooned by the |
2362 | * previous kernel. If the pfn is ballooned, handle it properly. | 2374 | * previous kernel. If the pfn is ballooned, handle it properly. |
2363 | * Returns 0 if the pfn is not backed by a RAM page; the caller may | 2375 | * Returns 0 if the pfn is not backed by a RAM page; the caller may |
2364 | * handle the pfn specially in this case. | 2376 | * handle the pfn specially in this case. |
2365 | */ | 2377 | */ |
2366 | static int xen_oldmem_pfn_is_ram(unsigned long pfn) | 2378 | static int xen_oldmem_pfn_is_ram(unsigned long pfn) |
2367 | { | 2379 | { |
2368 | struct xen_hvm_get_mem_type a = { | 2380 | struct xen_hvm_get_mem_type a = { |
2369 | .domid = DOMID_SELF, | 2381 | .domid = DOMID_SELF, |
2370 | .pfn = pfn, | 2382 | .pfn = pfn, |
2371 | }; | 2383 | }; |
2372 | int ram; | 2384 | int ram; |
2373 | 2385 | ||
2374 | if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) | 2386 | if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) |
2375 | return -ENXIO; | 2387 | return -ENXIO; |
2376 | 2388 | ||
2377 | switch (a.mem_type) { | 2389 | switch (a.mem_type) { |
2378 | case HVMMEM_mmio_dm: | 2390 | case HVMMEM_mmio_dm: |
2379 | ram = 0; | 2391 | ram = 0; |
2380 | break; | 2392 | break; |
2381 | case HVMMEM_ram_rw: | 2393 | case HVMMEM_ram_rw: |
2382 | case HVMMEM_ram_ro: | 2394 | case HVMMEM_ram_ro: |
2383 | default: | 2395 | default: |
2384 | ram = 1; | 2396 | ram = 1; |
2385 | break; | 2397 | break; |
2386 | } | 2398 | } |
2387 | 2399 | ||
2388 | return ram; | 2400 | return ram; |
2389 | } | 2401 | } |
2390 | #endif | 2402 | #endif |
2391 | 2403 | ||
2392 | static void xen_hvm_exit_mmap(struct mm_struct *mm) | 2404 | static void xen_hvm_exit_mmap(struct mm_struct *mm) |
2393 | { | 2405 | { |
2394 | struct xen_hvm_pagetable_dying a; | 2406 | struct xen_hvm_pagetable_dying a; |
2395 | int rc; | 2407 | int rc; |
2396 | 2408 | ||
2397 | a.domid = DOMID_SELF; | 2409 | a.domid = DOMID_SELF; |
2398 | a.gpa = __pa(mm->pgd); | 2410 | a.gpa = __pa(mm->pgd); |
2399 | rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); | 2411 | rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); |
2400 | WARN_ON_ONCE(rc < 0); | 2412 | WARN_ON_ONCE(rc < 0); |
2401 | } | 2413 | } |
2402 | 2414 | ||
2403 | static int is_pagetable_dying_supported(void) | 2415 | static int is_pagetable_dying_supported(void) |
2404 | { | 2416 | { |
2405 | struct xen_hvm_pagetable_dying a; | 2417 | struct xen_hvm_pagetable_dying a; |
2406 | int rc = 0; | 2418 | int rc = 0; |
2407 | 2419 | ||
2408 | a.domid = DOMID_SELF; | 2420 | a.domid = DOMID_SELF; |
2409 | a.gpa = 0x00; | 2421 | a.gpa = 0x00; |
2410 | rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); | 2422 | rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); |
2411 | if (rc < 0) { | 2423 | if (rc < 0) { |
2412 | printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); | 2424 | printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); |
2413 | return 0; | 2425 | return 0; |
2414 | } | 2426 | } |
2415 | return 1; | 2427 | return 1; |
2416 | } | 2428 | } |
2417 | 2429 | ||
2418 | void __init xen_hvm_init_mmu_ops(void) | 2430 | void __init xen_hvm_init_mmu_ops(void) |
2419 | { | 2431 | { |
2420 | if (is_pagetable_dying_supported()) | 2432 | if (is_pagetable_dying_supported()) |
2421 | pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; | 2433 | pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; |
2422 | #ifdef CONFIG_PROC_VMCORE | 2434 | #ifdef CONFIG_PROC_VMCORE |
2423 | register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram); | 2435 | register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram); |
2424 | #endif | 2436 | #endif |
2425 | } | 2437 | } |
2426 | #endif | 2438 | #endif |
2427 | 2439 | ||
2428 | #ifdef CONFIG_XEN_PVH | 2440 | #ifdef CONFIG_XEN_PVH |
2429 | /* | 2441 | /* |
2430 | * Map a foreign gfn (fgfn) to a local pfn (lpfn). This is for user | 2442 | * Map a foreign gfn (fgfn) to a local pfn (lpfn). This is for user |
2431 | * space creating a new guest on PVH dom0 and needing to map domU pages. | 2443 | * space creating a new guest on PVH dom0 and needing to map domU pages. |
2432 | */ | 2444 | */ |
2433 | static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn, | 2445 | static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn, |
2434 | unsigned int domid) | 2446 | unsigned int domid) |
2435 | { | 2447 | { |
2436 | int rc, err = 0; | 2448 | int rc, err = 0; |
2437 | xen_pfn_t gpfn = lpfn; | 2449 | xen_pfn_t gpfn = lpfn; |
2438 | xen_ulong_t idx = fgfn; | 2450 | xen_ulong_t idx = fgfn; |
2439 | 2451 | ||
2440 | struct xen_add_to_physmap_range xatp = { | 2452 | struct xen_add_to_physmap_range xatp = { |
2441 | .domid = DOMID_SELF, | 2453 | .domid = DOMID_SELF, |
2442 | .foreign_domid = domid, | 2454 | .foreign_domid = domid, |
2443 | .size = 1, | 2455 | .size = 1, |
2444 | .space = XENMAPSPACE_gmfn_foreign, | 2456 | .space = XENMAPSPACE_gmfn_foreign, |
2445 | }; | 2457 | }; |
2446 | set_xen_guest_handle(xatp.idxs, &idx); | 2458 | set_xen_guest_handle(xatp.idxs, &idx); |
2447 | set_xen_guest_handle(xatp.gpfns, &gpfn); | 2459 | set_xen_guest_handle(xatp.gpfns, &gpfn); |
2448 | set_xen_guest_handle(xatp.errs, &err); | 2460 | set_xen_guest_handle(xatp.errs, &err); |
2449 | 2461 | ||
2450 | rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); | 2462 | rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); |
2451 | if (rc < 0) | 2463 | if (rc < 0) |
2452 | return rc; | 2464 | return rc; |
2453 | return err; | 2465 | return err; |
2454 | } | 2466 | } |
2455 | 2467 | ||
2456 | static int xlate_remove_from_p2m(unsigned long spfn, int count) | 2468 | static int xlate_remove_from_p2m(unsigned long spfn, int count) |
2457 | { | 2469 | { |
2458 | struct xen_remove_from_physmap xrp; | 2470 | struct xen_remove_from_physmap xrp; |
2459 | int i, rc; | 2471 | int i, rc; |
2460 | 2472 | ||
2461 | for (i = 0; i < count; i++) { | 2473 | for (i = 0; i < count; i++) { |
2462 | xrp.domid = DOMID_SELF; | 2474 | xrp.domid = DOMID_SELF; |
2463 | xrp.gpfn = spfn+i; | 2475 | xrp.gpfn = spfn+i; |
2464 | rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); | 2476 | rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp); |
2465 | if (rc) | 2477 | if (rc) |
2466 | break; | 2478 | break; |
2467 | } | 2479 | } |
2468 | return rc; | 2480 | return rc; |
2469 | } | 2481 | } |
2470 | 2482 | ||
2471 | struct xlate_remap_data { | 2483 | struct xlate_remap_data { |
2472 | unsigned long fgfn; /* foreign domain's gfn */ | 2484 | unsigned long fgfn; /* foreign domain's gfn */ |
2473 | pgprot_t prot; | 2485 | pgprot_t prot; |
2474 | domid_t domid; | 2486 | domid_t domid; |
2475 | int index; | 2487 | int index; |
2476 | struct page **pages; | 2488 | struct page **pages; |
2477 | }; | 2489 | }; |
2478 | 2490 | ||
2479 | static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, | 2491 | static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr, |
2480 | void *data) | 2492 | void *data) |
2481 | { | 2493 | { |
2482 | int rc; | 2494 | int rc; |
2483 | struct xlate_remap_data *remap = data; | 2495 | struct xlate_remap_data *remap = data; |
2484 | unsigned long pfn = page_to_pfn(remap->pages[remap->index++]); | 2496 | unsigned long pfn = page_to_pfn(remap->pages[remap->index++]); |
2485 | pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot)); | 2497 | pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot)); |
2486 | 2498 | ||
2487 | rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid); | 2499 | rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid); |
2488 | if (rc) | 2500 | if (rc) |
2489 | return rc; | 2501 | return rc; |
2490 | native_set_pte(ptep, pteval); | 2502 | native_set_pte(ptep, pteval); |
2491 | 2503 | ||
2492 | return 0; | 2504 | return 0; |
2493 | } | 2505 | } |
2494 | 2506 | ||
2495 | static int xlate_remap_gfn_range(struct vm_area_struct *vma, | 2507 | static int xlate_remap_gfn_range(struct vm_area_struct *vma, |
2496 | unsigned long addr, unsigned long mfn, | 2508 | unsigned long addr, unsigned long mfn, |
2497 | int nr, pgprot_t prot, unsigned domid, | 2509 | int nr, pgprot_t prot, unsigned domid, |
2498 | struct page **pages) | 2510 | struct page **pages) |
2499 | { | 2511 | { |
2500 | int err; | 2512 | int err; |
2501 | struct xlate_remap_data pvhdata; | 2513 | struct xlate_remap_data pvhdata; |
2502 | 2514 | ||
2503 | BUG_ON(!pages); | 2515 | BUG_ON(!pages); |
2504 | 2516 | ||
2505 | pvhdata.fgfn = mfn; | 2517 | pvhdata.fgfn = mfn; |
2506 | pvhdata.prot = prot; | 2518 | pvhdata.prot = prot; |
2507 | pvhdata.domid = domid; | 2519 | pvhdata.domid = domid; |
2508 | pvhdata.index = 0; | 2520 | pvhdata.index = 0; |
2509 | pvhdata.pages = pages; | 2521 | pvhdata.pages = pages; |
2510 | err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, | 2522 | err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT, |
2511 | xlate_map_pte_fn, &pvhdata); | 2523 | xlate_map_pte_fn, &pvhdata); |
2512 | flush_tlb_all(); | 2524 | flush_tlb_all(); |
2513 | return err; | 2525 | return err; |
2514 | } | 2526 | } |
2515 | #endif | 2527 | #endif |
2516 | 2528 | ||
2517 | #define REMAP_BATCH_SIZE 16 | 2529 | #define REMAP_BATCH_SIZE 16 |
2518 | 2530 | ||
2519 | struct remap_data { | 2531 | struct remap_data { |
2520 | unsigned long mfn; | 2532 | unsigned long mfn; |
2521 | pgprot_t prot; | 2533 | pgprot_t prot; |
2522 | struct mmu_update *mmu_update; | 2534 | struct mmu_update *mmu_update; |
2523 | }; | 2535 | }; |
2524 | 2536 | ||
2525 | static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, | 2537 | static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, |
2526 | unsigned long addr, void *data) | 2538 | unsigned long addr, void *data) |
2527 | { | 2539 | { |
2528 | struct remap_data *rmd = data; | 2540 | struct remap_data *rmd = data; |
2529 | pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot)); | 2541 | pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot)); |
2530 | 2542 | ||
2531 | rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; | 2543 | rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; |
2532 | rmd->mmu_update->val = pte_val_ma(pte); | 2544 | rmd->mmu_update->val = pte_val_ma(pte); |
2533 | rmd->mmu_update++; | 2545 | rmd->mmu_update++; |
2534 | 2546 | ||
2535 | return 0; | 2547 | return 0; |
2536 | } | 2548 | } |
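
Unlike the xlate callback, this one never touches the PTE directly: it records a (machine address of PTE, new PTE value) pair into the mmu_update array and bumps the cursor, so one later hypercall can apply the whole batch. The pointer-as-cursor idiom in isolation (struct names illustrative, not the kernel's):

#include <stdio.h>

struct update {
        unsigned long ptr;      /* where to write */
        unsigned long val;      /* what to write */
};

struct recorder {
        struct update *next;    /* cursor: advances one slot per call */
};

static void record(struct recorder *r, unsigned long ptr, unsigned long val)
{
        r->next->ptr = ptr;
        r->next->val = val;
        r->next++;              /* same idiom as rmd->mmu_update++ */
}

int main(void)
{
        struct update batch[4];
        struct recorder r = { batch };
        int i;

        for (i = 0; i < 4; i++)
                record(&r, 0x1000 + i * 8, 0xabc0 + i);
        for (i = 0; i < 4; i++)
                printf("%#lx <- %#lx\n", batch[i].ptr, batch[i].val);
        return 0;
}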
2537 | 2549 | ||
2538 | int xen_remap_domain_mfn_range(struct vm_area_struct *vma, | 2550 | int xen_remap_domain_mfn_range(struct vm_area_struct *vma, |
2539 | unsigned long addr, | 2551 | unsigned long addr, |
2540 | xen_pfn_t mfn, int nr, | 2552 | xen_pfn_t mfn, int nr, |
2541 | pgprot_t prot, unsigned domid, | 2553 | pgprot_t prot, unsigned domid, |
2542 | struct page **pages) | 2554 | struct page **pages) |
2543 | 2555 | ||
2544 | { | 2556 | { |
2545 | struct remap_data rmd; | 2557 | struct remap_data rmd; |
2546 | struct mmu_update mmu_update[REMAP_BATCH_SIZE]; | 2558 | struct mmu_update mmu_update[REMAP_BATCH_SIZE]; |
2547 | int batch; | 2559 | int batch; |
2548 | unsigned long range; | 2560 | unsigned long range; |
2549 | int err = 0; | 2561 | int err = 0; |
2550 | 2562 | ||
2551 | BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); | 2563 | BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); |
2552 | 2564 | ||
2553 | if (xen_feature(XENFEAT_auto_translated_physmap)) { | 2565 | if (xen_feature(XENFEAT_auto_translated_physmap)) { |
2554 | #ifdef CONFIG_XEN_PVH | 2566 | #ifdef CONFIG_XEN_PVH |
2555 | /* We need to update the local page tables and the xen HAP */ | 2567 | /* We need to update the local page tables and the xen HAP */ |
2556 | return xlate_remap_gfn_range(vma, addr, mfn, nr, prot, | 2568 | return xlate_remap_gfn_range(vma, addr, mfn, nr, prot, |
2557 | domid, pages); | 2569 | domid, pages); |
2558 | #else | 2570 | #else |
2559 | return -EINVAL; | 2571 | return -EINVAL; |
2560 | #endif | 2572 | #endif |
2561 | } | 2573 | } |
2562 | 2574 | ||
2563 | rmd.mfn = mfn; | 2575 | rmd.mfn = mfn; |
2564 | rmd.prot = prot; | 2576 | rmd.prot = prot; |
2565 | 2577 | ||
2566 | while (nr) { | 2578 | while (nr) { |
2567 | batch = min(REMAP_BATCH_SIZE, nr); | 2579 | batch = min(REMAP_BATCH_SIZE, nr); |
2568 | range = (unsigned long)batch << PAGE_SHIFT; | 2580 | range = (unsigned long)batch << PAGE_SHIFT; |
2569 | 2581 | ||
2570 | rmd.mmu_update = mmu_update; | 2582 | rmd.mmu_update = mmu_update; |
2571 | err = apply_to_page_range(vma->vm_mm, addr, range, | 2583 | err = apply_to_page_range(vma->vm_mm, addr, range, |
2572 | remap_area_mfn_pte_fn, &rmd); | 2584 | remap_area_mfn_pte_fn, &rmd); |
2573 | if (err) | 2585 | if (err) |
2574 | goto out; | 2586 | goto out; |
2575 | 2587 | ||
2576 | err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); | 2588 | err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid); |
2577 | if (err < 0) | 2589 | if (err < 0) |
2578 | goto out; | 2590 | goto out; |
2579 | 2591 | ||
2580 | nr -= batch; | 2592 | nr -= batch; |
2581 | addr += range; | 2593 | addr += range; |
2582 | } | 2594 | } |
2583 | 2595 | ||
2584 | err = 0; | 2596 | err = 0; |
2585 | out: | 2597 | out: |
2586 | 2598 | ||
2587 | xen_flush_tlb_all(); | 2599 | xen_flush_tlb_all(); |
2588 | 2600 | ||
2589 | return err; | 2601 | return err; |
2590 | } | 2602 | } |
2591 | EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); | 2603 | EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); |
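
The loop above amortizes hypercall cost by filling at most REMAP_BATCH_SIZE mmu_update slots per HYPERVISOR_mmu_update call. The chunking arithmetic on its own, runnable stand-alone (batch size and page shift match the code above; process_batch is a placeholder for the hypercall):

#include <stdio.h>

#define REMAP_BATCH_SIZE 16
#define PAGE_SHIFT 12

static int process_batch(int batch) { (void)batch; return 0; }  /* placeholder */

int main(void)
{
        int nr = 37;                    /* frames to remap */
        unsigned long addr = 0x100000;

        while (nr) {
                int batch = nr < REMAP_BATCH_SIZE ? nr : REMAP_BATCH_SIZE;
                unsigned long range = (unsigned long)batch << PAGE_SHIFT;

                if (process_batch(batch))
                        break;
                printf("batch of %2d at %#lx\n", batch, addr);
                nr -= batch;
                addr += range;          /* 37 -> batches of 16, 16, 5 */
        }
        return 0;
}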
2592 | 2604 | ||
2593 | /* Returns: 0 success */ | 2605 | /* Returns: 0 success */ |
2594 | int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, | 2606 | int xen_unmap_domain_mfn_range(struct vm_area_struct *vma, |
2595 | int numpgs, struct page **pages) | 2607 | int numpgs, struct page **pages) |
2596 | { | 2608 | { |
2597 | if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) | 2609 | if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) |
2598 | return 0; | 2610 | return 0; |
2599 | 2611 | ||
2600 | #ifdef CONFIG_XEN_PVH | 2612 | #ifdef CONFIG_XEN_PVH |
2601 | while (numpgs--) { | 2613 | while (numpgs--) { |
2602 | /* | 2614 | /* |
2603 | * The mmu has already cleaned up the process mmu | 2615 | * The mmu has already cleaned up the process mmu |
2604 | * resources at this point (lookup_address will return | 2616 | * resources at this point (lookup_address will return |
2605 | * NULL). | 2617 | * NULL). |
2606 | */ | 2618 | */ |
2607 | unsigned long pfn = page_to_pfn(pages[numpgs]); | 2619 | unsigned long pfn = page_to_pfn(pages[numpgs]); |
2608 | 2620 | ||
2609 | xlate_remove_from_p2m(pfn, 1); | 2621 | xlate_remove_from_p2m(pfn, 1); |
2610 | } | 2622 | } |
2611 | /* | 2623 | /* |
2612 | * We don't need to flush tlbs because as part of | 2624 | * We don't need to flush tlbs because as part of |
2613 | * xlate_remove_from_p2m, the hypervisor will do tlb flushes | 2625 | * xlate_remove_from_p2m, the hypervisor will do tlb flushes |
2614 | * after removing the p2m entries from the EPT/NPT | 2626 | * after removing the p2m entries from the EPT/NPT |
arch/x86/xen/p2m.c
1 | /* | 1 | /* |
2 | * Xen leaves the responsibility for maintaining p2m mappings to the | 2 | * Xen leaves the responsibility for maintaining p2m mappings to the |
3 | * guests themselves, but it must also access and update the p2m array | 3 | * guests themselves, but it must also access and update the p2m array |
4 | * during suspend/resume when all the pages are reallocated. | 4 | * during suspend/resume when all the pages are reallocated. |
5 | * | 5 | * |
6 | * The p2m table is logically a flat array, but we implement it as a | 6 | * The logical flat p2m table is mapped to a linear kernel memory area. |
7 | * three-level tree to allow the address space to be sparse. | 7 | * For accesses by Xen a three-level tree linked via mfns only is set up to |
8 | * allow the address space to be sparse. | ||
8 | * | 9 | * |
9 | * Xen | 10 | * Xen |
10 | * | | 11 | * | |
11 | * p2m_top p2m_top_mfn | 12 | * p2m_top_mfn |
12 | * / \ / \ | 13 | * / \ |
13 | * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn | 14 | * p2m_mid_mfn p2m_mid_mfn |
14 | * / \ / \ / / | 15 | * / / |
15 | * p2m p2m p2m p2m p2m p2m p2m ... | 16 | * p2m p2m p2m ... |
16 | * | 17 | * |
17 | * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. | 18 | * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. |
18 | * | 19 | * |
19 | * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the | 20 | * The p2m_top_mfn level is limited to 1 page, so the maximum representable |
20 | * maximum representable pseudo-physical address space is: | 21 | * pseudo-physical address space is: |
21 | * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages | 22 | * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages |
22 | * | 23 | * |
23 | * P2M_PER_PAGE depends on the architecture, as an mfn is always | 24 | * P2M_PER_PAGE depends on the architecture, as an mfn is always |
24 | * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to | 25 | * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to |
25 | * 512 and 1024 entries respectively. | 26 | * 512 and 1024 entries respectively. |
26 | * | 27 | * |
27 | * In short, these structures contain the Machine Frame Number (MFN) of the PFN. | 28 | * In short, these structures contain the Machine Frame Number (MFN) of the PFN. |
28 | * | 29 | * |
29 | * However not all entries are filled with MFNs. Specifically for all other | 30 | * However not all entries are filled with MFNs. Specifically for all other |
30 | * leaf entries, or for the top root, or middle one, for which there is a void | 31 | * leaf entries, or for the top root, or middle one, for which there is a void |
31 | * entry, we assume it is "missing". So (for example) | 32 | * entry, we assume it is "missing". So (for example) |
32 | * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY. | 33 | * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY. |
34 | * We have a dedicated page p2m_missing with all entries being | ||
35 | * INVALID_P2M_ENTRY. This page may be referenced multiple times in the p2m | ||
36 | * list/tree in case there are multiple areas with P2M_PER_PAGE invalid pfns. | ||
33 | * | 37 | * |
34 | * We also have the possibility of setting 1-1 mappings on certain regions, so | 38 | * We also have the possibility of setting 1-1 mappings on certain regions, so |
35 | * that: | 39 | * that: |
36 | * pfn_to_mfn(0xc0000)=0xc0000 | 40 | * pfn_to_mfn(0xc0000)=0xc0000 |
37 | * | 41 | * |
38 | * The benefit of this is, that we can assume for non-RAM regions (think | 42 | * The benefit of this is, that we can assume for non-RAM regions (think |
39 | * PCI BARs, or ACPI spaces), we can create mappings easily because we | 43 | * PCI BARs, or ACPI spaces), we can create mappings easily because we |
40 | * get the PFN value to match the MFN. | 44 | * get the PFN value to match the MFN. |
41 | * | 45 | * |
42 | * For this to work efficiently we have one new page p2m_identity and | 46 | * For this to work efficiently we have one new page p2m_identity. All entries |
43 | * allocate (via reserve_brk) any other pages we need to cover the sides | 47 | * in p2m_identity are set to INVALID_P2M_ENTRY type (Xen toolstack only |
44 | * (1GB or 4MB boundary violations). All entries in p2m_identity are set to | 48 | * recognizes that and MFNs, no other fancy value). |
45 | * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs, | ||
46 | * no other fancy value). | ||
47 | * | 49 | * |
48 | * On lookup we spot that the entry points to p2m_identity and return the | 50 | * On lookup we spot that the entry points to p2m_identity and return the |
49 | * identity value instead of dereferencing and returning INVALID_P2M_ENTRY. | 51 | * identity value instead of dereferencing and returning INVALID_P2M_ENTRY. |
50 | * If the entry points to an allocated page, we just proceed as before and | 52 | * If the entry points to an allocated page, we just proceed as before and |
51 | * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in | 53 | * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in |
52 | * appropriate functions (pfn_to_mfn). | 54 | * appropriate functions (pfn_to_mfn). |
53 | * | 55 | * |
54 | * The reason for having the IDENTITY_FRAME_BIT instead of just returning the | 56 | * The reason for having the IDENTITY_FRAME_BIT instead of just returning the |
55 | * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a | 57 | * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a |
56 | * non-identity pfn. To protect ourselves against that, we elect to set (and get) | 58 | * non-identity pfn. To protect ourselves against that, we elect to set (and get) |
57 | * IDENTITY_FRAME_BIT on all identity mapped PFNs. | 59 | * IDENTITY_FRAME_BIT on all identity mapped PFNs. |
58 | * | ||
59 | * This simplistic diagram is used to explain the more subtle piece of code. | ||
60 | * There is also a diagram of the P2M at the end that can help. | ||
61 | * Imagine your E820 looking as so: | ||
62 | * | ||
63 | * 1GB 2GB 4GB | ||
64 | * /-------------------+---------\/----\ /----------\ /---+-----\ | ||
65 | * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM | | ||
66 | * \-------------------+---------/\----/ \----------/ \---+-----/ | ||
67 | * ^- 1029MB ^- 2001MB | ||
68 | * | ||
69 | * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100), | ||
70 | * 2048MB = 524288 (0x80000)] | ||
71 | * | ||
72 | * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB | ||
73 | * is actually not present (would have to kick the balloon driver to put it in). | ||
74 | * | ||
75 | * When we are told to set the PFNs for identity mapping (see patch: "xen/setup: | ||
76 | * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start | ||
77 | * of the PFN and the end PFN (263424 and 512256 respectively). The first step | ||
78 | * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page | ||
79 | * covers 512^2 of page estate (1GB) and in case the start or end PFN is not | ||
80 | * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as | ||
81 | * required to split any existing p2m_mid_missing middle pages. | ||
82 | * | ||
83 | * With the E820 example above, 263424 is not 1GB aligned so we allocate a | ||
84 | * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000. | ||
85 | * Each entry in the allocated page is "missing" (points to p2m_missing). | ||
86 | * | ||
87 | * Next stage is to determine if we need to do a more granular boundary check | ||
88 | * on the 4MB (or 2MB depending on architecture) of the start and end pfn's. | ||
89 | * We check if the start pfn and end pfn violate that boundary check, and if | ||
90 | * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer | ||
91 | * granularity of setting which PFNs are missing and which ones are identity. | ||
92 | * In our example 263424 and 512256 both fail the check so we reserve_brk two | ||
93 | * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing" | ||
94 | * values) and assign them to p2m[1][2] and p2m[1][488] respectively. | ||
95 | * | ||
96 | * At this point we would at minimum reserve_brk one page, but could be up to | ||
97 | * three. Each call to set_phys_range_identity has at maximum a three page | ||
98 | * cost. If we were to query the P2M at this stage, all those entries from | ||
99 | * start PFN through end PFN (so 1029MB -> 2001MB) would return | ||
100 | * INVALID_P2M_ENTRY ("missing"). | ||
101 | * | ||
102 | * The next step is to walk from the start pfn to the end pfn setting | ||
103 | * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity. | ||
104 | * If we find that the middle entry is pointing to p2m_missing we can swap it | ||
105 | * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and | ||
106 | * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions). | ||
107 | * At this point we do not need to worry about boundary alignment (so no need to | ||
108 | * reserve_brk a middle page, figure out which PFNs are "missing" and which | ||
109 | * ones are identity), as that has been done earlier. If we find that the | ||
110 | * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference | ||
111 | * that page (which covers 512 PFNs) and set the appropriate PFN with | ||
112 | * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we | ||
113 | * set from p2m[1][2][256->511] and p2m[1][488][0->256] with | ||
114 | * IDENTITY_FRAME_BIT set. | ||
115 | * | ||
116 | * All other regions that are void (or not filled) either point to p2m_missing | ||
117 | * (considered missing) or have the default value of INVALID_P2M_ENTRY (also | ||
118 | * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511] | ||
119 | * contain the INVALID_P2M_ENTRY value and are considered "missing." | ||
120 | * | ||
121 | * Finally, the region beyond the end of the E820 (4 GB in this example) | ||
122 | * is set to be identity (in case there are MMIO regions placed here). | ||
123 | * | ||
124 | * This is what the p2m ends up looking (for the E820 above) with this | ||
125 | * fabulous drawing: | ||
126 | * | ||
127 | * p2m /--------------\ | ||
128 | * /-----\ | &mfn_list[0],| /-----------------\ | ||
129 | * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. | | ||
130 | * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] | | ||
131 | * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] | | ||
132 | * |-----| \ | [p2m_identity]+\\ | .... | | ||
133 | * | 2 |--\ \-------------------->| ... | \\ \----------------/ | ||
134 | * |-----| \ \---------------/ \\ | ||
135 | * | 3 |-\ \ \\ p2m_identity [1] | ||
136 | * |-----| \ \-------------------->/---------------\ /-----------------\ | ||
137 | * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... | | ||
138 | * \-----/ | | | [p2m_identity]+-->| ..., ~0 | | ||
139 | * | | | .... | \-----------------/ | ||
140 | * | | +-[x], ~0, ~0.. +\ | ||
141 | * | | \---------------/ \ | ||
142 | * | | \-> /---------------\ | ||
143 | * | V p2m_mid_missing p2m_missing | IDENTITY[@0] | | ||
144 | * | /-----------------\ /------------\ | IDENTITY[@256]| | ||
145 | * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... | | ||
146 | * | | [p2m_missing] +---->| ..., ~0 | \---------------/ | ||
147 | * | | ... | \------------/ | ||
148 | * | \-----------------/ | ||
149 | * | | ||
150 | * | p2m_mid_identity | ||
151 | * | /-----------------\ | ||
152 | * \-->| [p2m_identity] +---->[1] | ||
153 | * | [p2m_identity] +---->[1] | ||
154 | * | ... | | ||
155 | * \-----------------/ | ||
156 | * | ||
157 | * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT) | ||
158 | */ | 60 | */ |
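
The IDENTITY_FRAME_BIT trick described above distinguishes "pfn_to_mfn(pfn) happens to equal pfn" from "pfn is deliberately identity-mapped" by tagging the stored value. A self-contained model of the tagging and unmasking (the bit position below is a demo value; the kernel derives it from the word width):

#include <stdio.h>

#define IDENTITY_FRAME_BIT  (1UL << 62)         /* demo value */
#define IDENTITY_FRAME(pfn) ((pfn) | IDENTITY_FRAME_BIT)
#define INVALID_P2M_ENTRY   (~0UL)

static unsigned long p2m[8] = {
        [0] = 1000,                             /* normal translation */
        [1] = INVALID_P2M_ENTRY,                /* missing */
        [2] = IDENTITY_FRAME(2),                /* identity-mapped */
};

static unsigned long pfn_to_mfn(unsigned long pfn)
{
        unsigned long mfn = p2m[pfn];

        if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT))
                return mfn & ~IDENTITY_FRAME_BIT;       /* unmask the tag */
        return mfn;
}

int main(void)
{
        printf("pfn 0 -> %lu\n", pfn_to_mfn(0));        /* 1000 */
        printf("pfn 2 -> %lu\n", pfn_to_mfn(2));        /* 2, identity */
        return 0;
}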
159 | 61 | ||
160 | #include <linux/init.h> | 62 | #include <linux/init.h> |
161 | #include <linux/module.h> | 63 | #include <linux/module.h> |
162 | #include <linux/list.h> | 64 | #include <linux/list.h> |
163 | #include <linux/hash.h> | 65 | #include <linux/hash.h> |
164 | #include <linux/sched.h> | 66 | #include <linux/sched.h> |
165 | #include <linux/seq_file.h> | 67 | #include <linux/seq_file.h> |
166 | #include <linux/bootmem.h> | 68 | #include <linux/bootmem.h> |
69 | #include <linux/slab.h> | ||
167 | 70 | ||
168 | #include <asm/cache.h> | 71 | #include <asm/cache.h> |
169 | #include <asm/setup.h> | 72 | #include <asm/setup.h> |
73 | #include <asm/uaccess.h> | ||
170 | 74 | ||
171 | #include <asm/xen/page.h> | 75 | #include <asm/xen/page.h> |
172 | #include <asm/xen/hypercall.h> | 76 | #include <asm/xen/hypercall.h> |
173 | #include <asm/xen/hypervisor.h> | 77 | #include <asm/xen/hypervisor.h> |
174 | #include <xen/balloon.h> | 78 | #include <xen/balloon.h> |
175 | #include <xen/grant_table.h> | 79 | #include <xen/grant_table.h> |
176 | 80 | ||
177 | #include "p2m.h" | 81 | #include "p2m.h" |
178 | #include "multicalls.h" | 82 | #include "multicalls.h" |
179 | #include "xen-ops.h" | 83 | #include "xen-ops.h" |
180 | 84 | ||
85 | #define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE) | ||
86 | |||
181 | static void __init m2p_override_init(void); | 87 | static void __init m2p_override_init(void); |
182 | 88 | ||
89 | unsigned long *xen_p2m_addr __read_mostly; | ||
90 | EXPORT_SYMBOL_GPL(xen_p2m_addr); | ||
91 | unsigned long xen_p2m_size __read_mostly; | ||
92 | EXPORT_SYMBOL_GPL(xen_p2m_size); | ||
183 | unsigned long xen_max_p2m_pfn __read_mostly; | 93 | unsigned long xen_max_p2m_pfn __read_mostly; |
94 | EXPORT_SYMBOL_GPL(xen_max_p2m_pfn); | ||
184 | 95 | ||
96 | static DEFINE_SPINLOCK(p2m_update_lock); | ||
97 | |||
185 | static unsigned long *p2m_mid_missing_mfn; | 98 | static unsigned long *p2m_mid_missing_mfn; |
186 | static unsigned long *p2m_top_mfn; | 99 | static unsigned long *p2m_top_mfn; |
187 | static unsigned long **p2m_top_mfn_p; | 100 | static unsigned long **p2m_top_mfn_p; |
101 | static unsigned long *p2m_missing; | ||
102 | static unsigned long *p2m_identity; | ||
103 | static pte_t *p2m_missing_pte; | ||
104 | static pte_t *p2m_identity_pte; | ||
188 | 105 | ||
189 | /* Placeholders for holes in the address space */ | ||
190 | static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); | ||
191 | static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); | ||
192 | |||
193 | static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); | ||
194 | |||
195 | static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); | ||
196 | static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE); | ||
197 | |||
198 | RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); | ||
199 | |||
200 | /* For each I/O range remapped we may lose up to two leaf pages for the boundary | ||
201 | * violations and three mid pages to cover up to 3GB. With | ||
202 | * early_can_reuse_p2m_middle() most of the leaf pages will be reused by the | ||
203 | * remapped region. | ||
204 | */ | ||
205 | RESERVE_BRK(p2m_identity_remap, PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES); | ||
206 | |||
207 | static inline unsigned p2m_top_index(unsigned long pfn) | 106 | static inline unsigned p2m_top_index(unsigned long pfn) |
208 | { | 107 | { |
209 | BUG_ON(pfn >= MAX_P2M_PFN); | 108 | BUG_ON(pfn >= MAX_P2M_PFN); |
210 | return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); | 109 | return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); |
211 | } | 110 | } |
212 | 111 | ||
213 | static inline unsigned p2m_mid_index(unsigned long pfn) | 112 | static inline unsigned p2m_mid_index(unsigned long pfn) |
214 | { | 113 | { |
215 | return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; | 114 | return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; |
216 | } | 115 | } |
217 | 116 | ||
218 | static inline unsigned p2m_index(unsigned long pfn) | 117 | static inline unsigned p2m_index(unsigned long pfn) |
219 | { | 118 | { |
220 | return pfn % P2M_PER_PAGE; | 119 | return pfn % P2M_PER_PAGE; |
221 | } | 120 | } |
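
The three helpers above decompose a pfn into tree coordinates with plain division and modulus. A quick check of the arithmetic against the 1029MB example from the removed comment (the 512-entry constants assume 64-bit; 32-bit would use 1024):

#include <stdio.h>

#define P2M_PER_PAGE     512    /* 64-bit: PAGE_SIZE / sizeof(unsigned long) */
#define P2M_MID_PER_PAGE 512

int main(void)
{
        unsigned long pfn = 263424;     /* 1029 MB, the comment's example */

        printf("topidx = %lu\n", pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE)); /* 1 */
        printf("mididx = %lu\n", (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE); /* 2 */
        printf("idx    = %lu\n", pfn % P2M_PER_PAGE);                      /* 256 */
        return 0;       /* matches p2m[1][2][256] in the old diagram */
}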
222 | 121 | ||
223 | static void p2m_top_init(unsigned long ***top) | ||
224 | { | ||
225 | unsigned i; | ||
226 | |||
227 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) | ||
228 | top[i] = p2m_mid_missing; | ||
229 | } | ||
230 | |||
231 | static void p2m_top_mfn_init(unsigned long *top) | 122 | static void p2m_top_mfn_init(unsigned long *top) |
232 | { | 123 | { |
233 | unsigned i; | 124 | unsigned i; |
234 | 125 | ||
235 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) | 126 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) |
236 | top[i] = virt_to_mfn(p2m_mid_missing_mfn); | 127 | top[i] = virt_to_mfn(p2m_mid_missing_mfn); |
237 | } | 128 | } |
238 | 129 | ||
239 | static void p2m_top_mfn_p_init(unsigned long **top) | 130 | static void p2m_top_mfn_p_init(unsigned long **top) |
240 | { | 131 | { |
241 | unsigned i; | 132 | unsigned i; |
242 | 133 | ||
243 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) | 134 | for (i = 0; i < P2M_TOP_PER_PAGE; i++) |
244 | top[i] = p2m_mid_missing_mfn; | 135 | top[i] = p2m_mid_missing_mfn; |
245 | } | 136 | } |
246 | 137 | ||
247 | static void p2m_mid_init(unsigned long **mid, unsigned long *leaf) | 138 | static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf) |
248 | { | 139 | { |
249 | unsigned i; | 140 | unsigned i; |
250 | 141 | ||
251 | for (i = 0; i < P2M_MID_PER_PAGE; i++) | 142 | for (i = 0; i < P2M_MID_PER_PAGE; i++) |
252 | mid[i] = leaf; | 143 | mid[i] = virt_to_mfn(leaf); |
253 | } | 144 | } |
254 | 145 | ||
255 | static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf) | 146 | static void p2m_init(unsigned long *p2m) |
256 | { | 147 | { |
257 | unsigned i; | 148 | unsigned i; |
258 | 149 | ||
259 | for (i = 0; i < P2M_MID_PER_PAGE; i++) | 150 | for (i = 0; i < P2M_PER_PAGE; i++) |
260 | mid[i] = virt_to_mfn(leaf); | 151 | p2m[i] = INVALID_P2M_ENTRY; |
261 | } | 152 | } |
262 | 153 | ||
263 | static void p2m_init(unsigned long *p2m) | 154 | static void p2m_init_identity(unsigned long *p2m, unsigned long pfn) |
264 | { | 155 | { |
265 | unsigned i; | 156 | unsigned i; |
266 | 157 | ||
267 | for (i = 0; i < P2M_MID_PER_PAGE; i++) | 158 | for (i = 0; i < P2M_PER_PAGE; i++) |
268 | p2m[i] = INVALID_P2M_ENTRY; | 159 | p2m[i] = IDENTITY_FRAME(pfn + i); |
269 | } | 160 | } |
270 | 161 | ||
162 | static void * __ref alloc_p2m_page(void) | ||
163 | { | ||
164 | if (unlikely(!slab_is_available())) | ||
165 | return alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); | ||
166 | |||
167 | return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); | ||
168 | } | ||
169 | |||
170 | /* Only to be called in case of a race for a page just allocated! */ | ||
171 | static void free_p2m_page(void *p) | ||
172 | { | ||
173 | BUG_ON(!slab_is_available()); | ||
174 | free_page((unsigned long)p); | ||
175 | } | ||
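
alloc_p2m_page() above unifies the two allocation paths behind one helper: bootmem before the slab allocator is up, __get_free_page() afterwards (the "use common page allocation function in p2m.c" patch of this merge). The shape of such a dual-path helper, modeled in user space with a flag standing in for slab_is_available():

#include <stdio.h>
#include <stdlib.h>

static int slab_up;                     /* stand-in for slab_is_available() */
static char boot_pool[4096];            /* pretend boot-time arena */

static void *alloc_page_common(void)
{
        if (!slab_up)
                return boot_pool;       /* boot path: carve from static pool */
        return malloc(4096);            /* runtime path: normal allocator */
}

int main(void)
{
        void *early = alloc_page_common();

        slab_up = 1;
        void *late = alloc_page_common();

        printf("early=%p late=%p\n", early, late);
        free(late);                     /* only runtime pages can be freed */
        return 0;
}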
176 | |||
271 | /* | 177 | /* |
272 | * Build the parallel p2m_top_mfn and p2m_mid_mfn structures | 178 | * Build the parallel p2m_top_mfn and p2m_mid_mfn structures |
273 | * | 179 | * |
274 | * This is called both at boot time, and after resuming from suspend: | 180 | * This is called both at boot time, and after resuming from suspend: |
275 | * - At boot time we're called rather early, and must use alloc_bootmem*() | 181 | * - At boot time we're called rather early, and must use alloc_bootmem*() |
276 | * to allocate memory. | 182 | * to allocate memory. |
277 | * | 183 | * |
278 | * - After resume we're called from within stop_machine, but the mfn | 184 | * - After resume we're called from within stop_machine, but the mfn |
279 | * tree should already be completely allocated. | 185 | * tree should already be completely allocated. |
280 | */ | 186 | */ |
281 | void __ref xen_build_mfn_list_list(void) | 187 | void __ref xen_build_mfn_list_list(void) |
282 | { | 188 | { |
283 | unsigned long pfn; | 189 | unsigned long pfn, mfn; |
190 | pte_t *ptep; | ||
191 | unsigned int level, topidx, mididx; | ||
192 | unsigned long *mid_mfn_p; | ||
284 | 193 | ||
285 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 194 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
286 | return; | 195 | return; |
287 | 196 | ||
288 | /* Pre-initialize p2m_top_mfn to be completely missing */ | 197 | /* Pre-initialize p2m_top_mfn to be completely missing */ |
289 | if (p2m_top_mfn == NULL) { | 198 | if (p2m_top_mfn == NULL) { |
290 | p2m_mid_missing_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); | 199 | p2m_mid_missing_mfn = alloc_p2m_page(); |
291 | p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); | 200 | p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); |
292 | 201 | ||
293 | p2m_top_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); | 202 | p2m_top_mfn_p = alloc_p2m_page(); |
294 | p2m_top_mfn_p_init(p2m_top_mfn_p); | 203 | p2m_top_mfn_p_init(p2m_top_mfn_p); |
295 | 204 | ||
296 | p2m_top_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); | 205 | p2m_top_mfn = alloc_p2m_page(); |
297 | p2m_top_mfn_init(p2m_top_mfn); | 206 | p2m_top_mfn_init(p2m_top_mfn); |
298 | } else { | 207 | } else { |
299 | /* Reinitialise, mfn's all change after migration */ | 208 | /* Reinitialise, mfn's all change after migration */ |
300 | p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); | 209 | p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); |
301 | } | 210 | } |
302 | 211 | ||
303 | for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { | 212 | for (pfn = 0; pfn < xen_max_p2m_pfn && pfn < MAX_P2M_PFN; |
304 | unsigned topidx = p2m_top_index(pfn); | 213 | pfn += P2M_PER_PAGE) { |
305 | unsigned mididx = p2m_mid_index(pfn); | 214 | topidx = p2m_top_index(pfn); |
306 | unsigned long **mid; | 215 | mididx = p2m_mid_index(pfn); |
307 | unsigned long *mid_mfn_p; | ||
308 | 216 | ||
309 | mid = p2m_top[topidx]; | ||
310 | mid_mfn_p = p2m_top_mfn_p[topidx]; | 217 | mid_mfn_p = p2m_top_mfn_p[topidx]; |
218 | ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), | ||
219 | &level); | ||
220 | BUG_ON(!ptep || level != PG_LEVEL_4K); | ||
221 | mfn = pte_mfn(*ptep); | ||
222 | ptep = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); | ||
311 | 223 | ||
312 | /* Don't bother allocating any mfn mid levels if | 224 | /* Don't bother allocating any mfn mid levels if |
313 | * they're just missing, just update the stored mfn, | 225 | * they're just missing, just update the stored mfn, |
314 | * since all could have changed over a migrate. | 226 | * since all could have changed over a migrate. |
315 | */ | 227 | */ |
316 | if (mid == p2m_mid_missing) { | 228 | if (ptep == p2m_missing_pte || ptep == p2m_identity_pte) { |
317 | BUG_ON(mididx); | 229 | BUG_ON(mididx); |
318 | BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); | 230 | BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); |
319 | p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); | 231 | p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); |
320 | pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; | 232 | pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; |
321 | continue; | 233 | continue; |
322 | } | 234 | } |
323 | 235 | ||
324 | if (mid_mfn_p == p2m_mid_missing_mfn) { | 236 | if (mid_mfn_p == p2m_mid_missing_mfn) { |
325 | /* | 237 | mid_mfn_p = alloc_p2m_page(); |
326 | * XXX boot-time only! We should never find | ||
327 | * missing parts of the mfn tree after | ||
328 | * runtime. | ||
329 | */ | ||
330 | mid_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); | ||
331 | p2m_mid_mfn_init(mid_mfn_p, p2m_missing); | 238 | p2m_mid_mfn_init(mid_mfn_p, p2m_missing); |
332 | 239 | ||
333 | p2m_top_mfn_p[topidx] = mid_mfn_p; | 240 | p2m_top_mfn_p[topidx] = mid_mfn_p; |
334 | } | 241 | } |
335 | 242 | ||
336 | p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); | 243 | p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); |
337 | mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); | 244 | mid_mfn_p[mididx] = mfn; |
338 | } | 245 | } |
339 | } | 246 | } |
340 | 247 | ||
341 | void xen_setup_mfn_list_list(void) | 248 | void xen_setup_mfn_list_list(void) |
342 | { | 249 | { |
343 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 250 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
344 | return; | 251 | return; |
345 | 252 | ||
346 | BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); | 253 | BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); |
347 | 254 | ||
348 | HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = | 255 | HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = |
349 | virt_to_mfn(p2m_top_mfn); | 256 | virt_to_mfn(p2m_top_mfn); |
350 | HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; | 257 | HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; |
351 | } | 258 | } |
352 | 259 | ||
353 | /* Set up p2m_top to point to the domain-builder provided p2m pages */ | 260 | /* Set up p2m_top to point to the domain-builder provided p2m pages */ |
354 | void __init xen_build_dynamic_phys_to_machine(void) | 261 | void __init xen_build_dynamic_phys_to_machine(void) |
355 | { | 262 | { |
356 | unsigned long *mfn_list; | ||
357 | unsigned long max_pfn; | ||
358 | unsigned long pfn; | 263 | unsigned long pfn; |
359 | 264 | ||
360 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 265 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
361 | return; | 266 | return; |
362 | 267 | ||
363 | mfn_list = (unsigned long *)xen_start_info->mfn_list; | 268 | xen_p2m_addr = (unsigned long *)xen_start_info->mfn_list; |
364 | max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); | 269 | xen_p2m_size = ALIGN(xen_start_info->nr_pages, P2M_PER_PAGE); |
365 | xen_max_p2m_pfn = max_pfn; | ||
366 | 270 | ||
367 | p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); | 271 | for (pfn = xen_start_info->nr_pages; pfn < xen_p2m_size; pfn++) |
368 | p2m_init(p2m_missing); | 272 | xen_p2m_addr[pfn] = INVALID_P2M_ENTRY; |
369 | p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
370 | p2m_init(p2m_identity); | ||
371 | 273 | ||
372 | p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); | 274 | xen_max_p2m_pfn = xen_p2m_size; |
373 | p2m_mid_init(p2m_mid_missing, p2m_missing); | 275 | } |
374 | p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
375 | p2m_mid_init(p2m_mid_identity, p2m_identity); | ||
376 | 276 | ||
377 | p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); | 277 | #define P2M_TYPE_IDENTITY 0 |
378 | p2m_top_init(p2m_top); | 278 | #define P2M_TYPE_MISSING 1 |
279 | #define P2M_TYPE_PFN 2 | ||
280 | #define P2M_TYPE_UNKNOWN 3 | ||
379 | 281 | ||
380 | /* | 282 | static int xen_p2m_elem_type(unsigned long pfn) |
381 | * The domain builder gives us a pre-constructed p2m array in | 283 | { |
382 | * mfn_list for all the pages initially given to us, so we just | 284 | unsigned long mfn; |
383 | * need to graft that into our tree structure. | ||
384 | */ | ||
385 | for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { | ||
386 | unsigned topidx = p2m_top_index(pfn); | ||
387 | unsigned mididx = p2m_mid_index(pfn); | ||
388 | 285 | ||
389 | if (p2m_top[topidx] == p2m_mid_missing) { | 286 | if (pfn >= xen_p2m_size) |
390 | unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); | 287 | return P2M_TYPE_IDENTITY; |
391 | p2m_mid_init(mid, p2m_missing); | ||
392 | 288 | ||
393 | p2m_top[topidx] = mid; | 289 | mfn = xen_p2m_addr[pfn]; |
394 | } | ||
395 | 290 | ||
396 | /* | 291 | if (mfn == INVALID_P2M_ENTRY) |
397 | * As long as the mfn_list has enough entries to completely | 292 | return P2M_TYPE_MISSING; |
398 | * fill a p2m page, pointing into the array is ok. But if | ||
399 | * not the entries beyond the last pfn will be undefined. | ||
400 | */ | ||
401 | if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) { | ||
402 | unsigned long p2midx; | ||
403 | 293 | ||
404 | p2midx = max_pfn % P2M_PER_PAGE; | 294 | if (mfn & IDENTITY_FRAME_BIT) |
405 | for ( ; p2midx < P2M_PER_PAGE; p2midx++) | 295 | return P2M_TYPE_IDENTITY; |
406 | mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY; | ||
407 | } | ||
408 | p2m_top[topidx][mididx] = &mfn_list[pfn]; | ||
409 | } | ||
410 | 296 | ||
411 | m2p_override_init(); | 297 | return P2M_TYPE_PFN; |
412 | } | 298 | } |
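
xen_p2m_elem_type() classifies a pfn into one of the buckets defined above so the rebuild pass can decide whether a whole chunk may share the read-only p2m_missing or p2m_identity page. The classification logic in a testable user-space form (array contents and bit value are stand-ins):

#include <stdio.h>

#define INVALID_P2M_ENTRY   (~0UL)
#define IDENTITY_FRAME_BIT  (1UL << 62)

enum { P2M_TYPE_IDENTITY, P2M_TYPE_MISSING, P2M_TYPE_PFN, P2M_TYPE_UNKNOWN };

static unsigned long p2m[4] = {
        1000, INVALID_P2M_ENTRY, 2 | IDENTITY_FRAME_BIT, 1003,
};
static unsigned long p2m_size = 4;

static int elem_type(unsigned long pfn)
{
        if (pfn >= p2m_size)
                return P2M_TYPE_IDENTITY;       /* past the list: identity */
        if (p2m[pfn] == INVALID_P2M_ENTRY)
                return P2M_TYPE_MISSING;
        if (p2m[pfn] & IDENTITY_FRAME_BIT)
                return P2M_TYPE_IDENTITY;
        return P2M_TYPE_PFN;
}

int main(void)
{
        unsigned long pfn;

        for (pfn = 0; pfn < 6; pfn++)
                printf("pfn %lu: type %d\n", pfn, elem_type(pfn));
        return 0;
}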
413 | #ifdef CONFIG_X86_64 | 299 | |
414 | unsigned long __init xen_revector_p2m_tree(void) | 300 | static void __init xen_rebuild_p2m_list(unsigned long *p2m) |
415 | { | 301 | { |
416 | unsigned long va_start; | 302 | unsigned int i, chunk; |
417 | unsigned long va_end; | ||
418 | unsigned long pfn; | 303 | unsigned long pfn; |
419 | unsigned long pfn_free = 0; | 304 | unsigned long *mfns; |
420 | unsigned long *mfn_list = NULL; | 305 | pte_t *ptep; |
421 | unsigned long size; | 306 | pmd_t *pmdp; |
307 | int type; | ||
422 | 308 | ||
423 | va_start = xen_start_info->mfn_list; | 309 | p2m_missing = alloc_p2m_page(); |
424 | /*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long), | 310 | p2m_init(p2m_missing); |
425 | * so make sure it is rounded up to that */ | 311 | p2m_identity = alloc_p2m_page(); |
426 | size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); | 312 | p2m_init(p2m_identity); |
427 | va_end = va_start + size; | ||
428 | 313 | ||
429 | /* If we were revectored already, don't do it again. */ | 314 | p2m_missing_pte = alloc_p2m_page(); |
430 | if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET) | 315 | paravirt_alloc_pte(&init_mm, __pa(p2m_missing_pte) >> PAGE_SHIFT); |
431 | return 0; | 316 | p2m_identity_pte = alloc_p2m_page(); |
432 | 317 | paravirt_alloc_pte(&init_mm, __pa(p2m_identity_pte) >> PAGE_SHIFT); | |
433 | mfn_list = alloc_bootmem_align(size, PAGE_SIZE); | 318 | for (i = 0; i < PTRS_PER_PTE; i++) { |
434 | if (!mfn_list) { | 319 | set_pte(p2m_missing_pte + i, |
435 | pr_warn("Could not allocate space for a new P2M tree!\n"); | 320 | pfn_pte(PFN_DOWN(__pa(p2m_missing)), PAGE_KERNEL_RO)); |
436 | return xen_start_info->mfn_list; | 321 | set_pte(p2m_identity_pte + i, |
322 | pfn_pte(PFN_DOWN(__pa(p2m_identity)), PAGE_KERNEL_RO)); | ||
437 | } | 323 | } |
438 | /* Fill it out with INVALID_P2M_ENTRY value */ | ||
439 | memset(mfn_list, 0xFF, size); | ||
440 | 324 | ||
441 | for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) { | 325 | for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += chunk) { |
442 | unsigned topidx = p2m_top_index(pfn); | 326 | /* |
443 | unsigned mididx; | 327 | * Try to map missing/identity PMDs or p2m-pages if possible. |
444 | unsigned long *mid_p; | 328 | * We have to respect the structure of the mfn_list_list |
329 | * which will be built just afterwards. | ||
330 | * Chunk size to test is one p2m page if we are in the middle | ||
331 | * of a mfn_list_list mid page and the complete mid page area | ||
332 | * if we are at index 0 of the mid page. Please note that a | ||
333 | * mid page might cover more than one PMD, e.g. on 32 bit PAE | ||
334 | * kernels. | ||
335 | */ | ||
336 | chunk = (pfn & (P2M_PER_PAGE * P2M_MID_PER_PAGE - 1)) ? | ||
337 | P2M_PER_PAGE : P2M_PER_PAGE * P2M_MID_PER_PAGE; | ||
445 | 338 | ||
446 | if (!p2m_top[topidx]) | 339 | type = xen_p2m_elem_type(pfn); |
447 | continue; | 340 | i = 0; |
341 | if (type != P2M_TYPE_PFN) | ||
342 | for (i = 1; i < chunk; i++) | ||
343 | if (xen_p2m_elem_type(pfn + i) != type) | ||
344 | break; | ||
345 | if (i < chunk) | ||
346 | /* Reset to minimal chunk size. */ | ||
347 | chunk = P2M_PER_PAGE; | ||
448 | 348 | ||
449 | if (p2m_top[topidx] == p2m_mid_missing) | 349 | if (type == P2M_TYPE_PFN || i < chunk) { |
350 | /* Use initial p2m page contents. */ | ||
351 | #ifdef CONFIG_X86_64 | ||
352 | mfns = alloc_p2m_page(); | ||
353 | copy_page(mfns, xen_p2m_addr + pfn); | ||
354 | #else | ||
355 | mfns = xen_p2m_addr + pfn; | ||
356 | #endif | ||
357 | ptep = populate_extra_pte((unsigned long)(p2m + pfn)); | ||
358 | set_pte(ptep, | ||
359 | pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL)); | ||
450 | continue; | 360 | continue; |
361 | } | ||
451 | 362 | ||
452 | mididx = p2m_mid_index(pfn); | 363 | if (chunk == P2M_PER_PAGE) { |
453 | mid_p = p2m_top[topidx][mididx]; | 364 | /* Map complete missing or identity p2m-page. */ |
454 | if (!mid_p) | 365 | mfns = (type == P2M_TYPE_MISSING) ? |
366 | p2m_missing : p2m_identity; | ||
367 | ptep = populate_extra_pte((unsigned long)(p2m + pfn)); | ||
368 | set_pte(ptep, | ||
369 | pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL_RO)); | ||
455 | continue; | 370 | continue; |
456 | if ((mid_p == p2m_missing) || (mid_p == p2m_identity)) | 371 | } |
457 | continue; | ||
458 | 372 | ||
459 | if ((unsigned long)mid_p == INVALID_P2M_ENTRY) | 373 | /* Complete missing or identity PMD(s) can be mapped. */ |
460 | continue; | 374 | ptep = (type == P2M_TYPE_MISSING) ? |
375 | p2m_missing_pte : p2m_identity_pte; | ||
376 | for (i = 0; i < PMDS_PER_MID_PAGE; i++) { | ||
377 | pmdp = populate_extra_pmd( | ||
378 | (unsigned long)(p2m + pfn + i * PTRS_PER_PTE)); | ||
379 | set_pmd(pmdp, __pmd(__pa(ptep) | _KERNPG_TABLE)); | ||
380 | } | ||
381 | } | ||
382 | } | ||
461 | 383 | ||
462 | /* The old va. Rebase it on mfn_list */ | 384 | void __init xen_vmalloc_p2m_tree(void) |
463 | if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) { | 385 | { |
464 | unsigned long *new; | 386 | static struct vm_struct vm; |
465 | 387 | ||
466 | if (pfn_free > (size / sizeof(unsigned long))) { | 388 | vm.flags = VM_ALLOC; |
467 | WARN(1, "Only allocated for %ld pages, but we want %ld!\n", | 389 | vm.size = ALIGN(sizeof(unsigned long) * xen_max_p2m_pfn, |
468 | size / sizeof(unsigned long), pfn_free); | 390 | PMD_SIZE * PMDS_PER_MID_PAGE); |
469 | return 0; | 391 | vm_area_register_early(&vm, PMD_SIZE * PMDS_PER_MID_PAGE); |
470 | } | 392 | pr_notice("p2m virtual area at %p, size is %lx\n", vm.addr, vm.size); |
471 | new = &mfn_list[pfn_free]; | ||
472 | 393 | ||
473 | copy_page(new, mid_p); | 394 | xen_max_p2m_pfn = vm.size / sizeof(unsigned long); |
474 | p2m_top[topidx][mididx] = &mfn_list[pfn_free]; | ||
475 | 395 | ||
476 | pfn_free += P2M_PER_PAGE; | 396 | xen_rebuild_p2m_list(vm.addr); |
477 | 397 | ||
478 | } | 398 | xen_p2m_addr = vm.addr; |
479 | /* This should be the leaves allocated for identity from _brk. */ | ||
480 | } | ||
481 | return (unsigned long)mfn_list; | ||
482 | 400 | ||
401 | xen_inv_extra_mem(); | ||
402 | |||
403 | m2p_override_init(); | ||
483 | } | 404 | } |
484 | #else | 405 | |
485 | unsigned long __init xen_revector_p2m_tree(void) | ||
486 | { | ||
487 | return 0; | ||
488 | } | ||
489 | #endif | ||
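
xen_vmalloc_p2m_tree() above sizes the linear list's virtual area by rounding the entry array up to PMD_SIZE * PMDS_PER_MID_PAGE, so that whole PMDs can later be swapped between the shared read-only pages and private ones. The sizing arithmetic alone, with x86-64 values assumed:

#include <stdio.h>

#define PTRS_PER_PTE      512
#define P2M_MID_PER_PAGE  512
#define PMD_SIZE          (1UL << 21)   /* 2 MB on x86-64 */
#define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE)    /* 1 on 64-bit */

#define ALIGN_UP(x, a)    (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long max_pfn = 263424;         /* ~1 GB guest, example value */
        unsigned long bytes = sizeof(unsigned long) * max_pfn;
        unsigned long size = ALIGN_UP(bytes, PMD_SIZE * PMDS_PER_MID_PAGE);

        printf("list needs %lu bytes, area is %lu bytes (%lu PMDs)\n",
               bytes, size, size / PMD_SIZE);
        printf("covers up to pfn %lu\n", size / sizeof(unsigned long));
        return 0;
}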
490 | unsigned long get_phys_to_machine(unsigned long pfn) | 406 | unsigned long get_phys_to_machine(unsigned long pfn) |
491 | { | 407 | { |
492 | unsigned topidx, mididx, idx; | 408 | pte_t *ptep; |
409 | unsigned int level; | ||
493 | 410 | ||
494 | if (unlikely(pfn >= MAX_P2M_PFN)) | 411 | if (unlikely(pfn >= xen_p2m_size)) { |
412 | if (pfn < xen_max_p2m_pfn) | ||
413 | return xen_chk_extra_mem(pfn); | ||
414 | |||
495 | return IDENTITY_FRAME(pfn); | 415 | return IDENTITY_FRAME(pfn); |
416 | } | ||
496 | 417 | ||
497 | topidx = p2m_top_index(pfn); | 418 | ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level); |
498 | mididx = p2m_mid_index(pfn); | 419 | BUG_ON(!ptep || level != PG_LEVEL_4K); |
499 | idx = p2m_index(pfn); | ||
500 | 420 | ||
501 | /* | 421 | /* |
502 | * The INVALID_P2M_ENTRY is filled in both p2m_*identity | 422 | * The INVALID_P2M_ENTRY is filled in both p2m_*identity |
503 | * and in p2m_*missing, so returning the INVALID_P2M_ENTRY | 423 | * and in p2m_*missing, so returning the INVALID_P2M_ENTRY |
504 | * would be wrong. | 424 | * would be wrong. |
505 | */ | 425 | */ |
506 | if (p2m_top[topidx][mididx] == p2m_identity) | 426 | if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity))) |
507 | return IDENTITY_FRAME(pfn); | 427 | return IDENTITY_FRAME(pfn); |
508 | 428 | ||
509 | return p2m_top[topidx][mididx][idx]; | 429 | return xen_p2m_addr[pfn]; |
510 | } | 430 | } |
511 | EXPORT_SYMBOL_GPL(get_phys_to_machine); | 431 | EXPORT_SYMBOL_GPL(get_phys_to_machine); |
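
After the switch to the linear list, the common lookup is a single array read; the slow path above exists because the shared p2m_identity page is filled with INVALID_P2M_ENTRY, so the mapping target, not the stored value, decides the answer. That "interpret the entry by which backing page it lives in" idea, modeled compactly (tiny page size and arrays are demo values):

#include <stdio.h>

#define INVALID_P2M_ENTRY   (~0UL)
#define IDENTITY_FRAME_BIT  (1UL << 62)
#define IDENTITY_FRAME(pfn) ((pfn) | IDENTITY_FRAME_BIT)
#define PER_PAGE 4                      /* tiny "page" for the demo */

/* Two shared backing pages, both filled with INVALID_P2M_ENTRY. */
static unsigned long missing_pg[PER_PAGE]  = { ~0UL, ~0UL, ~0UL, ~0UL };
static unsigned long identity_pg[PER_PAGE] = { ~0UL, ~0UL, ~0UL, ~0UL };
static unsigned long real_pg[PER_PAGE]     = { 1000, 1001, 1002, 1003 };

/* Which backing page each PER_PAGE-sized slot of the list maps to. */
static unsigned long *backing[3] = { real_pg, identity_pg, missing_pg };

static unsigned long lookup(unsigned long pfn)
{
        unsigned long *pg = backing[pfn / PER_PAGE];

        if (pg == identity_pg)          /* the mapping, not the value, decides */
                return IDENTITY_FRAME(pfn);
        return pg[pfn % PER_PAGE];
}

int main(void)
{
        printf("pfn 1 -> %#lx\n", lookup(1));   /* 0x3e9 from real_pg */
        printf("pfn 5 -> %#lx\n", lookup(5));   /* IDENTITY_FRAME(5) */
        printf("pfn 9 -> %#lx\n", lookup(9));   /* ~0 = missing */
        return 0;
}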
512 | 432 | ||
513 | static void *alloc_p2m_page(void) | 433 | /* |
434 | * Allocate new pmd(s). It is checked whether the old pmd is still in place. | ||
435 | * If not, nothing is changed. This is okay as the only reason for allocating | ||
436 | * a new pmd is to replace p2m_missing_pte or p2m_identity_pte by an individual | ||
437 | * pmd. In case of PAE/x86-32 there are multiple pmds to allocate! | ||
438 | */ | ||
439 | static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *ptep, pte_t *pte_pg) | ||
514 | { | 440 | { |
515 | return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); | 441 | pte_t *ptechk; |
516 | } | 442 | pte_t *pteret = ptep; |
443 | pte_t *pte_newpg[PMDS_PER_MID_PAGE]; | ||
444 | pmd_t *pmdp; | ||
445 | unsigned int level; | ||
446 | unsigned long flags; | ||
447 | unsigned long vaddr; | ||
448 | int i; | ||
517 | 449 | ||
518 | static void free_p2m_page(void *p) | 450 | /* Do all allocations first to bail out in error case. */ |
519 | { | 451 | for (i = 0; i < PMDS_PER_MID_PAGE; i++) { |
520 | free_page((unsigned long)p); | 452 | pte_newpg[i] = alloc_p2m_page(); |
453 | if (!pte_newpg[i]) { | ||
454 | for (i--; i >= 0; i--) | ||
455 | free_p2m_page(pte_newpg[i]); | ||
456 | |||
457 | return NULL; | ||
458 | } | ||
459 | } | ||
460 | |||
461 | vaddr = addr & ~(PMD_SIZE * PMDS_PER_MID_PAGE - 1); | ||
462 | |||
463 | for (i = 0; i < PMDS_PER_MID_PAGE; i++) { | ||
464 | copy_page(pte_newpg[i], pte_pg); | ||
465 | paravirt_alloc_pte(&init_mm, __pa(pte_newpg[i]) >> PAGE_SHIFT); | ||
466 | |||
467 | pmdp = lookup_pmd_address(vaddr); | ||
468 | BUG_ON(!pmdp); | ||
469 | |||
470 | spin_lock_irqsave(&p2m_update_lock, flags); | ||
471 | |||
472 | ptechk = lookup_address(vaddr, &level); | ||
473 | if (ptechk == pte_pg) { | ||
474 | set_pmd(pmdp, | ||
475 | __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE)); | ||
476 | if (vaddr == (addr & ~(PMD_SIZE - 1))) | ||
477 | pteret = pte_offset_kernel(pmdp, addr); | ||
478 | pte_newpg[i] = NULL; | ||
479 | } | ||
480 | |||
481 | spin_unlock_irqrestore(&p2m_update_lock, flags); | ||
482 | |||
483 | if (pte_newpg[i]) { | ||
484 | paravirt_release_pte(__pa(pte_newpg[i]) >> PAGE_SHIFT); | ||
485 | free_p2m_page(pte_newpg[i]); | ||
486 | } | ||
487 | |||
488 | vaddr += PMD_SIZE; | ||
489 | } | ||
490 | |||
491 | return pteret; | ||
521 | } | 492 | } |
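
alloc_p2m_pmd() above handles racing CPUs with an allocate-first, lock, re-check, install-or-free sequence: pages are allocated outside the lock, the old mapping is re-validated under p2m_update_lock before being replaced, and a loser simply frees its copy. The skeleton of that pattern with a mutex standing in for the spinlock (all names illustrative); build with -lpthread:

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;
static void *shared_slot;               /* stand-in for the pmd entry */
static void *old_value;                 /* what we expect to replace */

static int install_new(void)
{
        void *newpg = malloc(64);       /* allocate before taking the lock */
        int won = 0;

        if (!newpg)
                return -1;

        pthread_mutex_lock(&update_lock);
        if (shared_slot == old_value) { /* re-check under the lock */
                shared_slot = newpg;
                won = 1;
        }
        pthread_mutex_unlock(&update_lock);

        if (!won)
                free(newpg);            /* lost the race: discard our copy */
        return 0;
}

int main(void)
{
        shared_slot = old_value = NULL;
        install_new();
        printf("slot now %p\n", shared_slot);
        return 0;
}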
522 | 493 | ||
523 | /* | 494 | /* |
524 | * Fully allocate the p2m structure for a given pfn. We need to check | 495 | * Fully allocate the p2m structure for a given pfn. We need to check |
525 | * that both the top and mid levels are allocated, and make sure the | 496 | * that both the top and mid levels are allocated, and make sure the |
526 | * parallel mfn tree is kept in sync. We may race with other cpus, so | 497 | * parallel mfn tree is kept in sync. We may race with other cpus, so |
527 | * the new pages are installed with cmpxchg; if we lose the race then | 498 | * the new pages are installed with cmpxchg; if we lose the race then |
528 | * simply free the page we allocated and use the one that's there. | 499 | * simply free the page we allocated and use the one that's there. |
529 | */ | 500 | */ |
530 | static bool alloc_p2m(unsigned long pfn) | 501 | static bool alloc_p2m(unsigned long pfn) |
531 | { | 502 | { |
532 | unsigned topidx, mididx; | 503 | unsigned topidx, mididx; |
533 | unsigned long ***top_p, **mid; | ||
534 | unsigned long *top_mfn_p, *mid_mfn; | 504 | unsigned long *top_mfn_p, *mid_mfn; |
535 | unsigned long *p2m_orig; | 505 | pte_t *ptep, *pte_pg; |
506 | unsigned int level; | ||
507 | unsigned long flags; | ||
508 | unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); | ||
509 | unsigned long p2m_pfn; | ||
536 | 510 | ||
537 | topidx = p2m_top_index(pfn); | 511 | topidx = p2m_top_index(pfn); |
538 | mididx = p2m_mid_index(pfn); | 512 | mididx = p2m_mid_index(pfn); |
539 | 513 | ||
540 | top_p = &p2m_top[topidx]; | 514 | ptep = lookup_address(addr, &level); |
541 | mid = ACCESS_ONCE(*top_p); | 515 | BUG_ON(!ptep || level != PG_LEVEL_4K); |
516 | pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); | ||
542 | 517 | ||
543 | if (mid == p2m_mid_missing) { | 518 | if (pte_pg == p2m_missing_pte || pte_pg == p2m_identity_pte) { |
544 | /* Mid level is missing, allocate a new one */ | 519 | /* PMD level is missing, allocate a new one */ |
545 | mid = alloc_p2m_page(); | 520 | ptep = alloc_p2m_pmd(addr, ptep, pte_pg); |
546 | if (!mid) | 521 | if (!ptep) |
547 | return false; | 522 | return false; |
548 | |||
549 | p2m_mid_init(mid, p2m_missing); | ||
550 | |||
551 | if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) | ||
552 | free_p2m_page(mid); | ||
553 | } | 523 | } |
554 | 524 | ||
555 | top_mfn_p = &p2m_top_mfn[topidx]; | 525 | if (p2m_top_mfn) { |
556 | mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); | 526 | top_mfn_p = &p2m_top_mfn[topidx]; |
527 | mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); | ||
557 | 528 | ||
558 | BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); | 529 | BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); |
559 | 530 | ||
560 | if (mid_mfn == p2m_mid_missing_mfn) { | 531 | if (mid_mfn == p2m_mid_missing_mfn) { |
561 | /* Separately check the mid mfn level */ | 532 | /* Separately check the mid mfn level */ |
562 | unsigned long missing_mfn; | 533 | unsigned long missing_mfn; |
563 | unsigned long mid_mfn_mfn; | 534 | unsigned long mid_mfn_mfn; |
564 | unsigned long old_mfn; | 535 | unsigned long old_mfn; |
565 | 536 | ||
566 | mid_mfn = alloc_p2m_page(); | 537 | mid_mfn = alloc_p2m_page(); |
567 | if (!mid_mfn) | 538 | if (!mid_mfn) |
568 | return false; | 539 | return false; |
569 | 540 | ||
570 | p2m_mid_mfn_init(mid_mfn, p2m_missing); | 541 | p2m_mid_mfn_init(mid_mfn, p2m_missing); |
571 | 542 | ||
572 | missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); | 543 | missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); |
573 | mid_mfn_mfn = virt_to_mfn(mid_mfn); | 544 | mid_mfn_mfn = virt_to_mfn(mid_mfn); |
574 | old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn); | 545 | old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn); |
575 | if (old_mfn != missing_mfn) { | 546 | if (old_mfn != missing_mfn) { |
576 | free_p2m_page(mid_mfn); | 547 | free_p2m_page(mid_mfn); |
577 | mid_mfn = mfn_to_virt(old_mfn); | 548 | mid_mfn = mfn_to_virt(old_mfn); |
578 | } else { | 549 | } else { |
579 | p2m_top_mfn_p[topidx] = mid_mfn; | 550 | p2m_top_mfn_p[topidx] = mid_mfn; |
551 | } | ||
580 | } | 552 | } |
553 | } else { | ||
554 | mid_mfn = NULL; | ||
581 | } | 555 | } |
582 | 556 | ||
583 | p2m_orig = ACCESS_ONCE(p2m_top[topidx][mididx]); | 557 | p2m_pfn = pte_pfn(ACCESS_ONCE(*ptep)); |
584 | if (p2m_orig == p2m_identity || p2m_orig == p2m_missing) { | 558 | if (p2m_pfn == PFN_DOWN(__pa(p2m_identity)) || |
559 | p2m_pfn == PFN_DOWN(__pa(p2m_missing))) { | ||
585 | /* p2m leaf page is missing */ | 560 | /* p2m leaf page is missing */ |
586 | unsigned long *p2m; | 561 | unsigned long *p2m; |
587 | 562 | ||
588 | p2m = alloc_p2m_page(); | 563 | p2m = alloc_p2m_page(); |
589 | if (!p2m) | 564 | if (!p2m) |
590 | return false; | 565 | return false; |
591 | 566 | ||
592 | p2m_init(p2m); | 567 | if (p2m_pfn == PFN_DOWN(__pa(p2m_missing))) |
593 | 568 | p2m_init(p2m); | |
594 | if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig) | ||
595 | free_p2m_page(p2m); | ||
596 | else | 569 | else |
597 | mid_mfn[mididx] = virt_to_mfn(p2m); | 570 | p2m_init_identity(p2m, pfn); |
598 | } | ||
599 | 571 | ||
600 | return true; | 572 | spin_lock_irqsave(&p2m_update_lock, flags); |
601 | } | ||
602 | 573 | ||
603 | static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) | 574 | if (pte_pfn(*ptep) == p2m_pfn) { |
604 | { | 575 | set_pte(ptep, |
605 | unsigned topidx, mididx, idx; | 576 | pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL)); |
606 | unsigned long *p2m; | 577 | if (mid_mfn) |
607 | 578 | mid_mfn[mididx] = virt_to_mfn(p2m); | |
608 | topidx = p2m_top_index(pfn); | 579 | p2m = NULL; |
609 | mididx = p2m_mid_index(pfn); | ||
610 | idx = p2m_index(pfn); | ||
611 | |||
612 | /* Pfff.. No boundary cross-over, lets get out. */ | ||
613 | if (!idx && check_boundary) | ||
614 | return false; | ||
615 | |||
616 | WARN(p2m_top[topidx][mididx] == p2m_identity, | ||
617 | "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n", | ||
618 | topidx, mididx); | ||
619 | |||
620 | /* | ||
621 | * Could be done by xen_build_dynamic_phys_to_machine.. | ||
622 | */ | ||
623 | if (p2m_top[topidx][mididx] != p2m_missing) | ||
624 | return false; | ||
625 | |||
626 | /* Boundary cross-over for the edges: */ | ||
627 | p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
628 | |||
629 | p2m_init(p2m); | ||
630 | |||
631 | p2m_top[topidx][mididx] = p2m; | ||
632 | |||
633 | return true; | ||
634 | } | ||
635 | |||
636 | static bool __init early_alloc_p2m_middle(unsigned long pfn) | ||
637 | { | ||
638 | unsigned topidx = p2m_top_index(pfn); | ||
639 | unsigned long **mid; | ||
640 | |||
641 | mid = p2m_top[topidx]; | ||
642 | if (mid == p2m_mid_missing) { | ||
643 | mid = extend_brk(PAGE_SIZE, PAGE_SIZE); | ||
644 | |||
645 | p2m_mid_init(mid, p2m_missing); | ||
646 | |||
647 | p2m_top[topidx] = mid; | ||
648 | } | ||
649 | return true; | ||
650 | } | ||
651 | |||
652 | /* | ||
653 | * Skim over the P2M tree looking at pages that are either filled with | ||
654 | * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and | ||
655 | * replace the P2M leaf with a p2m_missing or p2m_identity. | ||
656 | * Stick the old page in the new P2M tree location. | ||
657 | */ | ||
658 | static bool __init early_can_reuse_p2m_middle(unsigned long set_pfn) | ||
659 | { | ||
660 | unsigned topidx; | ||
661 | unsigned mididx; | ||
662 | unsigned ident_pfns; | ||
663 | unsigned inv_pfns; | ||
664 | unsigned long *p2m; | ||
665 | unsigned idx; | ||
666 | unsigned long pfn; | ||
667 | |||
668 | /* We only look when this entails a P2M middle layer */ | ||
669 | if (p2m_index(set_pfn)) | ||
670 | return false; | ||
671 | |||
672 | for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { | ||
673 | topidx = p2m_top_index(pfn); | ||
674 | |||
675 | if (!p2m_top[topidx]) | ||
676 | continue; | ||
677 | |||
678 | if (p2m_top[topidx] == p2m_mid_missing) | ||
679 | continue; | ||
680 | |||
681 | mididx = p2m_mid_index(pfn); | ||
682 | p2m = p2m_top[topidx][mididx]; | ||
683 | if (!p2m) | ||
684 | continue; | ||
685 | |||
686 | if ((p2m == p2m_missing) || (p2m == p2m_identity)) | ||
687 | continue; | ||
688 | |||
689 | if ((unsigned long)p2m == INVALID_P2M_ENTRY) | ||
690 | continue; | ||
691 | |||
692 | ident_pfns = 0; | ||
693 | inv_pfns = 0; | ||
694 | for (idx = 0; idx < P2M_PER_PAGE; idx++) { | ||
695 | /* IDENTITY_PFNs are 1:1 */ | ||
696 | if (p2m[idx] == IDENTITY_FRAME(pfn + idx)) | ||
697 | ident_pfns++; | ||
698 | else if (p2m[idx] == INVALID_P2M_ENTRY) | ||
699 | inv_pfns++; | ||
700 | else | ||
701 | break; | ||
702 | } | 580 | } |
703 | if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE)) | ||
704 | goto found; | ||
705 | } | ||
706 | return false; | ||
707 | found: | ||
708 | /* Found one, replace old with p2m_identity or p2m_missing */ | ||
709 | p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing); | ||
710 | 581 | ||
711 | /* Reset where we want to stick the old page in. */ | 582 | spin_unlock_irqrestore(&p2m_update_lock, flags); |
712 | topidx = p2m_top_index(set_pfn); | ||
713 | mididx = p2m_mid_index(set_pfn); | ||
714 | 583 | ||
715 | /* This shouldn't happen */ | 584 | if (p2m) |
716 | if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) | 585 | free_p2m_page(p2m); |
717 | early_alloc_p2m_middle(set_pfn); | ||
718 | |||
719 | if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing)) | ||
720 | return false; | ||
721 | |||
722 | p2m_init(p2m); | ||
723 | p2m_top[topidx][mididx] = p2m; | ||
724 | |||
725 | return true; | ||
726 | } | ||
727 | bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) | ||
728 | { | ||
729 | if (unlikely(!__set_phys_to_machine(pfn, mfn))) { | ||
730 | if (!early_alloc_p2m_middle(pfn)) | ||
731 | return false; | ||
732 | |||
733 | if (early_can_reuse_p2m_middle(pfn)) | ||
734 | return __set_phys_to_machine(pfn, mfn); | ||
735 | |||
736 | if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/)) | ||
737 | return false; | ||
738 | |||
739 | if (!__set_phys_to_machine(pfn, mfn)) | ||
740 | return false; | ||
741 | } | 586 | } |
742 | 587 | ||
743 | return true; | 588 | return true; |
744 | } | 589 | } |
745 | 590 | ||
746 | static void __init early_split_p2m(unsigned long pfn) | ||
747 | { | ||
748 | unsigned long mididx, idx; | ||
749 | |||
750 | mididx = p2m_mid_index(pfn); | ||
751 | idx = p2m_index(pfn); | ||
752 | |||
753 | /* | ||
754 | * Allocate new middle and leaf pages if this pfn lies in the | ||
755 | * middle of one. | ||
756 | */ | ||
757 | if (mididx || idx) | ||
758 | early_alloc_p2m_middle(pfn); | ||
759 | if (idx) | ||
760 | early_alloc_p2m(pfn, false); | ||
761 | } | ||
762 | |||
763 | unsigned long __init set_phys_range_identity(unsigned long pfn_s, | 591 | unsigned long __init set_phys_range_identity(unsigned long pfn_s, |
764 | unsigned long pfn_e) | 592 | unsigned long pfn_e) |
765 | { | 593 | { |
766 | unsigned long pfn; | 594 | unsigned long pfn; |
767 | 595 | ||
768 | if (unlikely(pfn_s >= MAX_P2M_PFN)) | 596 | if (unlikely(pfn_s >= xen_p2m_size)) |
769 | return 0; | 597 | return 0; |
770 | 598 | ||
771 | if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) | 599 | if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) |
772 | return pfn_e - pfn_s; | 600 | return pfn_e - pfn_s; |
773 | 601 | ||
774 | if (pfn_s > pfn_e) | 602 | if (pfn_s > pfn_e) |
775 | return 0; | 603 | return 0; |
776 | 604 | ||
777 | if (pfn_e > MAX_P2M_PFN) | 605 | if (pfn_e > xen_p2m_size) |
778 | pfn_e = MAX_P2M_PFN; | 606 | pfn_e = xen_p2m_size; |
779 | 607 | ||
780 | early_split_p2m(pfn_s); | 608 | for (pfn = pfn_s; pfn < pfn_e; pfn++) |
781 | early_split_p2m(pfn_e); | 609 | xen_p2m_addr[pfn] = IDENTITY_FRAME(pfn); |
782 | 610 | ||
783 | for (pfn = pfn_s; pfn < pfn_e;) { | ||
784 | unsigned topidx = p2m_top_index(pfn); | ||
785 | unsigned mididx = p2m_mid_index(pfn); | ||
786 | |||
787 | if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn))) | ||
788 | break; | ||
789 | pfn++; | ||
790 | |||
791 | /* | ||
792 | * If the PFN was set to a middle or leaf identity | ||
793 | * page the remainder must also be identity, so skip | ||
794 | * ahead to the next middle or leaf entry. | ||
795 | */ | ||
796 | if (p2m_top[topidx] == p2m_mid_identity) | ||
797 | pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE); | ||
798 | else if (p2m_top[topidx][mididx] == p2m_identity) | ||
799 | pfn = ALIGN(pfn, P2M_PER_PAGE); | ||
800 | } | ||
801 | |||
802 | WARN((pfn - pfn_s) != (pfn_e - pfn_s), | ||
803 | "Identity mapping failed. We are %ld short of 1-1 mappings!\n", | ||
804 | (pfn_e - pfn_s) - (pfn - pfn_s)); | ||
805 | |||
806 | return pfn - pfn_s; | 611 | return pfn - pfn_s; |
807 | } | 612 | } |
808 | 613 | ||
809 | /* Try to install p2m mapping; fail if intermediate bits missing */ | ||
810 | bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) | 614 | bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) |
811 | { | 615 | { |
812 | unsigned topidx, mididx, idx; | 616 | pte_t *ptep; |
617 | unsigned int level; | ||
813 | 618 | ||
814 | /* don't track P2M changes in autotranslate guests */ | 619 | /* don't track P2M changes in autotranslate guests */ |
815 | if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) | 620 | if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) |
816 | return true; | 621 | return true; |
817 | 622 | ||
818 | if (unlikely(pfn >= MAX_P2M_PFN)) { | 623 | if (unlikely(pfn >= xen_p2m_size)) { |
819 | BUG_ON(mfn != INVALID_P2M_ENTRY); | 624 | BUG_ON(mfn != INVALID_P2M_ENTRY); |
820 | return true; | 625 | return true; |
821 | } | 626 | } |
822 | 627 | ||
823 | topidx = p2m_top_index(pfn); | 628 | if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn))) |
824 | mididx = p2m_mid_index(pfn); | 629 | return true; |
825 | idx = p2m_index(pfn); | ||
826 | 630 | ||
827 | /* For sparse holes where the p2m leaf has real PFN along with | 631 | ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level);
828 | * PCI holes, stick in the PFN as the MFN value. | 632 | BUG_ON(!ptep || level != PG_LEVEL_4K); |
829 | * | ||
830 | * set_phys_range_identity() will have allocated new middle | ||
831 | * and leaf pages as required so an existing p2m_mid_missing | ||
832 | * or p2m_missing mean that whole range will be identity so | ||
833 | * these can be switched to p2m_mid_identity or p2m_identity. | ||
834 | */ | ||
835 | if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) { | ||
836 | if (p2m_top[topidx] == p2m_mid_identity) | ||
837 | return true; | ||
838 | 633 | ||
839 | if (p2m_top[topidx] == p2m_mid_missing) { | 634 | if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_missing))) |
840 | WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing, | ||
841 | p2m_mid_identity) != p2m_mid_missing); | ||
842 | return true; | ||
843 | } | ||
844 | |||
845 | if (p2m_top[topidx][mididx] == p2m_identity) | ||
846 | return true; | ||
847 | |||
848 | /* Swap over from MISSING to IDENTITY if needed. */ | ||
849 | if (p2m_top[topidx][mididx] == p2m_missing) { | ||
850 | WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing, | ||
851 | p2m_identity) != p2m_missing); | ||
852 | return true; | ||
853 | } | ||
854 | } | ||
855 | |||
856 | if (p2m_top[topidx][mididx] == p2m_missing) | ||
857 | return mfn == INVALID_P2M_ENTRY; | 635 | return mfn == INVALID_P2M_ENTRY; |
858 | 636 | ||
859 | p2m_top[topidx][mididx][idx] = mfn; | 637 | if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity))) |
638 | return mfn == IDENTITY_FRAME(pfn); | ||
860 | 639 | ||
861 | return true; | 640 | return false; |
862 | } | 641 | } |
863 | 642 | ||
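The rewritten __set_phys_to_machine() tries an ordinary store first; only when the store faults (the slot sits on one of the shared read-only p2m_missing/p2m_identity pages) does it check whether the requested value is already implied by that shared page. A userspace model of the decision, with illustrative names:

#include <stdbool.h>

#define INVALID_P2M_ENTRY (~0UL)
#define IDENTITY_FRAME(m) ((m) | (1UL << (sizeof(long) * 8 - 2)))

enum backing { RW_PAGE, RO_MISSING, RO_IDENTITY };

static bool model_set_p2m(enum backing b, unsigned long *slot,
                          unsigned long pfn, unsigned long mfn)
{
        if (b == RW_PAGE) {               /* likely path: plain store */
                *slot = mfn;
                return true;
        }
        if (b == RO_MISSING)              /* shared page: all slots invalid */
                return mfn == INVALID_P2M_ENTRY;
        /* RO_IDENTITY: shared page already encodes the 1:1 value */
        return mfn == IDENTITY_FRAME(pfn);
}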
864 | bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) | 643 | bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) |
865 | { | 644 | { |
866 | if (unlikely(!__set_phys_to_machine(pfn, mfn))) { | 645 | if (unlikely(!__set_phys_to_machine(pfn, mfn))) { |
867 | if (!alloc_p2m(pfn)) | 646 | if (!alloc_p2m(pfn)) |
868 | return false; | 647 | return false; |
869 | 648 | ||
870 | if (!__set_phys_to_machine(pfn, mfn)) | 649 | return __set_phys_to_machine(pfn, mfn); |
871 | return false; | ||
872 | } | 650 | } |
873 | 651 | ||
874 | return true; | 652 | return true; |
875 | } | 653 | } |
876 | 654 | ||
877 | #define M2P_OVERRIDE_HASH_SHIFT 10 | 655 | #define M2P_OVERRIDE_HASH_SHIFT 10 |
878 | #define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) | 656 | #define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT) |
879 | 657 | ||
880 | static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH); | 658 | static struct list_head *m2p_overrides; |
881 | static DEFINE_SPINLOCK(m2p_override_lock); | 659 | static DEFINE_SPINLOCK(m2p_override_lock); |
882 | 660 | ||
883 | static void __init m2p_override_init(void) | 661 | static void __init m2p_override_init(void) |
884 | { | 662 | { |
885 | unsigned i; | 663 | unsigned i; |
886 | 664 | ||
887 | m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, | 665 | m2p_overrides = alloc_bootmem_align( |
888 | sizeof(unsigned long)); | 666 | sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH, |
667 | sizeof(unsigned long)); | ||
889 | 668 | ||
890 | for (i = 0; i < M2P_OVERRIDE_HASH; i++) | 669 | for (i = 0; i < M2P_OVERRIDE_HASH; i++) |
891 | INIT_LIST_HEAD(&m2p_overrides[i]); | 670 | INIT_LIST_HEAD(&m2p_overrides[i]); |
892 | } | 671 | } |
893 | 672 | ||
894 | static unsigned long mfn_hash(unsigned long mfn) | 673 | static unsigned long mfn_hash(unsigned long mfn) |
895 | { | 674 | { |
896 | return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); | 675 | return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT); |
897 | } | 676 | } |
898 | 677 | ||
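mfn_hash() simply folds the mfn into one of M2P_OVERRIDE_HASH bucket list heads. A self-contained model; the multiplier is the 64-bit golden-ratio constant used by the kernel's hash_long(), though take that value as an assumption:

#define MODEL_SHIFT   10
#define MODEL_BUCKETS (1 << MODEL_SHIFT)  /* 1024 list heads */

static unsigned long model_hash_long(unsigned long v, unsigned int bits)
{
        /* multiplicative hash standing in for hash_long() */
        return (v * 0x61C8864680B583EBUL) >> (64 - bits);
}

static unsigned long mfn_bucket(unsigned long mfn)
{
        return model_hash_long(mfn, MODEL_SHIFT); /* index into m2p_overrides[] */
}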
899 | int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, | ||
900 | struct gnttab_map_grant_ref *kmap_ops, | ||
901 | struct page **pages, unsigned int count) | ||
902 | { | ||
903 | int i, ret = 0; | ||
904 | bool lazy = false; | ||
905 | pte_t *pte; | ||
906 | |||
907 | if (xen_feature(XENFEAT_auto_translated_physmap)) | ||
908 | return 0; | ||
909 | |||
910 | if (kmap_ops && | ||
911 | !in_interrupt() && | ||
912 | paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { | ||
913 | arch_enter_lazy_mmu_mode(); | ||
914 | lazy = true; | ||
915 | } | ||
916 | |||
917 | for (i = 0; i < count; i++) { | ||
918 | unsigned long mfn, pfn; | ||
919 | |||
920 | /* Do not add to override if the map failed. */ | ||
921 | if (map_ops[i].status) | ||
922 | continue; | ||
923 | |||
924 | if (map_ops[i].flags & GNTMAP_contains_pte) { | ||
925 | pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + | ||
926 | (map_ops[i].host_addr & ~PAGE_MASK)); | ||
927 | mfn = pte_mfn(*pte); | ||
928 | } else { | ||
929 | mfn = PFN_DOWN(map_ops[i].dev_bus_addr); | ||
930 | } | ||
931 | pfn = page_to_pfn(pages[i]); | ||
932 | |||
933 | WARN_ON(PagePrivate(pages[i])); | ||
934 | SetPagePrivate(pages[i]); | ||
935 | set_page_private(pages[i], mfn); | ||
936 | pages[i]->index = pfn_to_mfn(pfn); | ||
937 | |||
938 | if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { | ||
939 | ret = -ENOMEM; | ||
940 | goto out; | ||
941 | } | ||
942 | |||
943 | if (kmap_ops) { | ||
944 | ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]); | ||
945 | if (ret) | ||
946 | goto out; | ||
947 | } | ||
948 | } | ||
949 | |||
950 | out: | ||
951 | if (lazy) | ||
952 | arch_leave_lazy_mmu_mode(); | ||
953 | |||
954 | return ret; | ||
955 | } | ||
956 | EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); | ||
957 | |||
958 | /* Add an MFN override for a particular page */ | 678 | /* Add an MFN override for a particular page */ |
959 | int m2p_add_override(unsigned long mfn, struct page *page, | 679 | static int m2p_add_override(unsigned long mfn, struct page *page, |
960 | struct gnttab_map_grant_ref *kmap_op) | 680 | struct gnttab_map_grant_ref *kmap_op) |
961 | { | 681 | { |
962 | unsigned long flags; | 682 | unsigned long flags; |
963 | unsigned long pfn; | 683 | unsigned long pfn; |
964 | unsigned long uninitialized_var(address); | 684 | unsigned long uninitialized_var(address); |
965 | unsigned level; | 685 | unsigned level; |
966 | pte_t *ptep = NULL; | 686 | pte_t *ptep = NULL; |
967 | 687 | ||
968 | pfn = page_to_pfn(page); | 688 | pfn = page_to_pfn(page); |
969 | if (!PageHighMem(page)) { | 689 | if (!PageHighMem(page)) { |
970 | address = (unsigned long)__va(pfn << PAGE_SHIFT); | 690 | address = (unsigned long)__va(pfn << PAGE_SHIFT); |
971 | ptep = lookup_address(address, &level); | 691 | ptep = lookup_address(address, &level); |
972 | if (WARN(ptep == NULL || level != PG_LEVEL_4K, | 692 | if (WARN(ptep == NULL || level != PG_LEVEL_4K, |
973 | "m2p_add_override: pfn %lx not mapped", pfn)) | 693 | "m2p_add_override: pfn %lx not mapped", pfn)) |
974 | return -EINVAL; | 694 | return -EINVAL; |
975 | } | 695 | } |
976 | 696 | ||
977 | if (kmap_op != NULL) { | 697 | if (kmap_op != NULL) { |
978 | if (!PageHighMem(page)) { | 698 | if (!PageHighMem(page)) { |
979 | struct multicall_space mcs = | 699 | struct multicall_space mcs = |
980 | xen_mc_entry(sizeof(*kmap_op)); | 700 | xen_mc_entry(sizeof(*kmap_op)); |
981 | 701 | ||
982 | MULTI_grant_table_op(mcs.mc, | 702 | MULTI_grant_table_op(mcs.mc, |
983 | GNTTABOP_map_grant_ref, kmap_op, 1); | 703 | GNTTABOP_map_grant_ref, kmap_op, 1); |
984 | 704 | ||
985 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 705 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
986 | } | 706 | } |
987 | } | 707 | } |
988 | spin_lock_irqsave(&m2p_override_lock, flags); | 708 | spin_lock_irqsave(&m2p_override_lock, flags); |
989 | list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); | 709 | list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); |
990 | spin_unlock_irqrestore(&m2p_override_lock, flags); | 710 | spin_unlock_irqrestore(&m2p_override_lock, flags); |
991 | 711 | ||
992 | /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in | 712 | /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in |
993 | * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other | 713 | * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other |
994 | * pfn so that the following mfn_to_pfn(mfn) calls will return the | 714 | * pfn so that the following mfn_to_pfn(mfn) calls will return the |
995 | * pfn from the m2p_override (the backend pfn) instead. | 715 | * pfn from the m2p_override (the backend pfn) instead. |
996 | * We need to do this because the pages shared by the frontend | 716 | * We need to do this because the pages shared by the frontend |
997 | * (xen-blkfront) can already be locked (lock_page, called by | 717 | * (xen-blkfront) can already be locked (lock_page, called by
998 | * do_read_cache_page); when the userspace backend tries to use them | 718 | * do_read_cache_page); when the userspace backend tries to use them |
999 | * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so | 719 | * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so |
1000 | * do_blockdev_direct_IO is going to try to lock the same pages | 720 | * do_blockdev_direct_IO is going to try to lock the same pages |
1001 | * again resulting in a deadlock. | 721 | * again resulting in a deadlock. |
1002 | * As a side effect get_user_pages_fast might not be safe on the | 722 | * As a side effect get_user_pages_fast might not be safe on the |
1003 | * frontend pages while they are being shared with the backend, | 723 | * frontend pages while they are being shared with the backend, |
1004 | * because mfn_to_pfn (that ends up being called by GUPF) will | 724 | * because mfn_to_pfn (that ends up being called by GUPF) will |
1005 | * return the backend pfn rather than the frontend pfn. */ | 725 | * return the backend pfn rather than the frontend pfn. */ |
1006 | pfn = mfn_to_pfn_no_overrides(mfn); | 726 | pfn = mfn_to_pfn_no_overrides(mfn); |
1007 | if (get_phys_to_machine(pfn) == mfn) | 727 | if (__pfn_to_mfn(pfn) == mfn) |
1008 | set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); | 728 | set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); |
1009 | 729 | ||
1010 | return 0; | 730 | return 0; |
1011 | } | 731 | } |
1012 | EXPORT_SYMBOL_GPL(m2p_add_override); | ||
1013 | 732 | ||
1014 | int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, | 733 | int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, |
1015 | struct gnttab_map_grant_ref *kmap_ops, | 734 | struct gnttab_map_grant_ref *kmap_ops, |
1016 | struct page **pages, unsigned int count) | 735 | struct page **pages, unsigned int count) |
1017 | { | 736 | { |
1018 | int i, ret = 0; | 737 | int i, ret = 0; |
1019 | bool lazy = false; | 738 | bool lazy = false; |
739 | pte_t *pte; | ||
1020 | 740 | ||
1021 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 741 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
1022 | return 0; | 742 | return 0; |
1023 | 743 | ||
1024 | if (kmap_ops && | 744 | if (kmap_ops && |
1025 | !in_interrupt() && | 745 | !in_interrupt() && |
1026 | paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { | 746 | paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { |
1027 | arch_enter_lazy_mmu_mode(); | 747 | arch_enter_lazy_mmu_mode(); |
1028 | lazy = true; | 748 | lazy = true; |
1029 | } | 749 | } |
1030 | 750 | ||
1031 | for (i = 0; i < count; i++) { | 751 | for (i = 0; i < count; i++) { |
1032 | unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i])); | 752 | unsigned long mfn, pfn; |
1033 | unsigned long pfn = page_to_pfn(pages[i]); | ||
1034 | 753 | ||
1035 | if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) { | 754 | /* Do not add to override if the map failed. */ |
1036 | ret = -EINVAL; | 755 | if (map_ops[i].status) |
1037 | goto out; | 756 | continue; |
757 | |||
758 | if (map_ops[i].flags & GNTMAP_contains_pte) { | ||
759 | pte = (pte_t *)(mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + | ||
760 | (map_ops[i].host_addr & ~PAGE_MASK)); | ||
761 | mfn = pte_mfn(*pte); | ||
762 | } else { | ||
763 | mfn = PFN_DOWN(map_ops[i].dev_bus_addr); | ||
1038 | } | 764 | } |
765 | pfn = page_to_pfn(pages[i]); | ||
1039 | 766 | ||
1040 | set_page_private(pages[i], INVALID_P2M_ENTRY); | 767 | WARN_ON(PagePrivate(pages[i])); |
1041 | WARN_ON(!PagePrivate(pages[i])); | 768 | SetPagePrivate(pages[i]); |
1042 | ClearPagePrivate(pages[i]); | 769 | set_page_private(pages[i], mfn); |
1043 | set_phys_to_machine(pfn, pages[i]->index); | 770 | pages[i]->index = pfn_to_mfn(pfn); |
1044 | 771 | ||
1045 | if (kmap_ops) | 772 | if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) { |
1046 | ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn); | 773 | ret = -ENOMEM; |
1047 | if (ret) | ||
1048 | goto out; | 774 | goto out; |
775 | } | ||
776 | |||
777 | if (kmap_ops) { | ||
778 | ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]); | ||
779 | if (ret) | ||
780 | goto out; | ||
781 | } | ||
1049 | } | 782 | } |
1050 | 783 | ||
1051 | out: | 784 | out: |
1052 | if (lazy) | 785 | if (lazy) |
1053 | arch_leave_lazy_mmu_mode(); | 786 | arch_leave_lazy_mmu_mode(); |
787 | |||
1054 | return ret; | 788 | return ret; |
1055 | } | 789 | } |
1056 | EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping); | 790 | EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping); |
1057 | 791 | ||
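set_foreign_p2m_mapping() stashes the foreign mfn in page->private and the page's original mapping in page->index, so the unmap path can later restore the p2m entry. A miniature of that bookkeeping, with a plain struct standing in for struct page:

struct model_page {
        unsigned long private;    /* foreign mfn while the grant is live */
        unsigned long index;      /* original mfn, kept for teardown */
};

static void track_grant(struct model_page *pg, unsigned long foreign_mfn,
                        unsigned long original_mfn)
{
        pg->private = foreign_mfn;
        pg->index   = original_mfn;
}

static unsigned long untrack_grant(struct model_page *pg)
{
        pg->private = ~0UL;       /* INVALID_P2M_ENTRY */
        return pg->index;         /* value to write back into the p2m */
}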
1058 | int m2p_remove_override(struct page *page, | 792 | static struct page *m2p_find_override(unsigned long mfn) |
1059 | struct gnttab_map_grant_ref *kmap_op, | ||
1060 | unsigned long mfn) | ||
1061 | { | 793 | { |
1062 | unsigned long flags; | 794 | unsigned long flags; |
795 | struct list_head *bucket; | ||
796 | struct page *p, *ret; | ||
797 | |||
798 | if (unlikely(!m2p_overrides)) | ||
799 | return NULL; | ||
800 | |||
801 | ret = NULL; | ||
802 | bucket = &m2p_overrides[mfn_hash(mfn)]; | ||
803 | |||
804 | spin_lock_irqsave(&m2p_override_lock, flags); | ||
805 | |||
806 | list_for_each_entry(p, bucket, lru) { | ||
807 | if (page_private(p) == mfn) { | ||
808 | ret = p; | ||
809 | break; | ||
810 | } | ||
811 | } | ||
812 | |||
813 | spin_unlock_irqrestore(&m2p_override_lock, flags); | ||
814 | |||
815 | return ret; | ||
816 | } | ||
817 | |||
818 | static int m2p_remove_override(struct page *page, | ||
819 | struct gnttab_map_grant_ref *kmap_op, | ||
820 | unsigned long mfn) | ||
821 | { | ||
822 | unsigned long flags; | ||
1063 | unsigned long pfn; | 823 | unsigned long pfn; |
1064 | unsigned long uninitialized_var(address); | 824 | unsigned long uninitialized_var(address); |
1065 | unsigned level; | 825 | unsigned level; |
1066 | pte_t *ptep = NULL; | 826 | pte_t *ptep = NULL; |
1067 | 827 | ||
1068 | pfn = page_to_pfn(page); | 828 | pfn = page_to_pfn(page); |
1069 | 829 | ||
1070 | if (!PageHighMem(page)) { | 830 | if (!PageHighMem(page)) { |
1071 | address = (unsigned long)__va(pfn << PAGE_SHIFT); | 831 | address = (unsigned long)__va(pfn << PAGE_SHIFT); |
1072 | ptep = lookup_address(address, &level); | 832 | ptep = lookup_address(address, &level); |
1073 | 833 | ||
1074 | if (WARN(ptep == NULL || level != PG_LEVEL_4K, | 834 | if (WARN(ptep == NULL || level != PG_LEVEL_4K, |
1075 | "m2p_remove_override: pfn %lx not mapped", pfn)) | 835 | "m2p_remove_override: pfn %lx not mapped", pfn)) |
1076 | return -EINVAL; | 836 | return -EINVAL; |
1077 | } | 837 | } |
1078 | 838 | ||
1079 | spin_lock_irqsave(&m2p_override_lock, flags); | 839 | spin_lock_irqsave(&m2p_override_lock, flags); |
1080 | list_del(&page->lru); | 840 | list_del(&page->lru); |
1081 | spin_unlock_irqrestore(&m2p_override_lock, flags); | 841 | spin_unlock_irqrestore(&m2p_override_lock, flags); |
1082 | 842 | ||
1083 | if (kmap_op != NULL) { | 843 | if (kmap_op != NULL) { |
1084 | if (!PageHighMem(page)) { | 844 | if (!PageHighMem(page)) { |
1085 | struct multicall_space mcs; | 845 | struct multicall_space mcs; |
1086 | struct gnttab_unmap_and_replace *unmap_op; | 846 | struct gnttab_unmap_and_replace *unmap_op; |
1087 | struct page *scratch_page = get_balloon_scratch_page(); | 847 | struct page *scratch_page = get_balloon_scratch_page(); |
1088 | unsigned long scratch_page_address = (unsigned long) | 848 | unsigned long scratch_page_address = (unsigned long) |
1089 | __va(page_to_pfn(scratch_page) << PAGE_SHIFT); | 849 | __va(page_to_pfn(scratch_page) << PAGE_SHIFT); |
1090 | 850 | ||
1091 | /* | 851 | /* |
1092 | * It might be that we queued all the m2p grant table | 852 | * It might be that we queued all the m2p grant table |
1093 | * hypercalls in a multicall, then m2p_remove_override | 853 | * hypercalls in a multicall, then m2p_remove_override |
1094 | * gets called before the multicall has actually been | 854 | * gets called before the multicall has actually been
1095 | * issued. In this case the handle is going to be -1 because | 855 | * issued. In this case the handle is going to be -1 because
1096 | * it hasn't been modified yet. | 856 | * it hasn't been modified yet. |
arch/x86/xen/setup.c
1 | /* | 1 | /* |
2 | * Machine specific setup for xen | 2 | * Machine specific setup for xen |
3 | * | 3 | * |
4 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | 4 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/module.h> | 7 | #include <linux/module.h> |
8 | #include <linux/sched.h> | 8 | #include <linux/sched.h> |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/pm.h> | 10 | #include <linux/pm.h> |
11 | #include <linux/memblock.h> | 11 | #include <linux/memblock.h> |
12 | #include <linux/cpuidle.h> | 12 | #include <linux/cpuidle.h> |
13 | #include <linux/cpufreq.h> | 13 | #include <linux/cpufreq.h> |
14 | 14 | ||
15 | #include <asm/elf.h> | 15 | #include <asm/elf.h> |
16 | #include <asm/vdso.h> | 16 | #include <asm/vdso.h> |
17 | #include <asm/e820.h> | 17 | #include <asm/e820.h> |
18 | #include <asm/setup.h> | 18 | #include <asm/setup.h> |
19 | #include <asm/acpi.h> | 19 | #include <asm/acpi.h> |
20 | #include <asm/numa.h> | 20 | #include <asm/numa.h> |
21 | #include <asm/xen/hypervisor.h> | 21 | #include <asm/xen/hypervisor.h> |
22 | #include <asm/xen/hypercall.h> | 22 | #include <asm/xen/hypercall.h> |
23 | 23 | ||
24 | #include <xen/xen.h> | 24 | #include <xen/xen.h> |
25 | #include <xen/page.h> | 25 | #include <xen/page.h> |
26 | #include <xen/interface/callback.h> | 26 | #include <xen/interface/callback.h> |
27 | #include <xen/interface/memory.h> | 27 | #include <xen/interface/memory.h> |
28 | #include <xen/interface/physdev.h> | 28 | #include <xen/interface/physdev.h> |
29 | #include <xen/features.h> | 29 | #include <xen/features.h> |
30 | #include "xen-ops.h" | 30 | #include "xen-ops.h" |
31 | #include "vdso.h" | 31 | #include "vdso.h" |
32 | #include "p2m.h" | 32 | #include "p2m.h" |
33 | #include "mmu.h" | ||
33 | 34 | ||
34 | /* These are code, but not functions. Defined in entry.S */ | 35 | /* These are code, but not functions. Defined in entry.S */ |
35 | extern const char xen_hypervisor_callback[]; | 36 | extern const char xen_hypervisor_callback[]; |
36 | extern const char xen_failsafe_callback[]; | 37 | extern const char xen_failsafe_callback[]; |
37 | #ifdef CONFIG_X86_64 | 38 | #ifdef CONFIG_X86_64 |
38 | extern asmlinkage void nmi(void); | 39 | extern asmlinkage void nmi(void); |
39 | #endif | 40 | #endif |
40 | extern void xen_sysenter_target(void); | 41 | extern void xen_sysenter_target(void); |
41 | extern void xen_syscall_target(void); | 42 | extern void xen_syscall_target(void); |
42 | extern void xen_syscall32_target(void); | 43 | extern void xen_syscall32_target(void); |
43 | 44 | ||
44 | /* Amount of extra memory space we add to the e820 ranges */ | 45 | /* Amount of extra memory space we add to the e820 ranges */ |
45 | struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; | 46 | struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; |
46 | 47 | ||
47 | /* Number of pages released from the initial allocation. */ | 48 | /* Number of pages released from the initial allocation. */ |
48 | unsigned long xen_released_pages; | 49 | unsigned long xen_released_pages; |
49 | 50 | ||
50 | /* Buffer used to remap identity mapped pages */ | 51 | /* |
51 | unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata; | 52 | * Buffer used to remap identity mapped pages. We only need the virtual space. |
53 | * The physical page behind this address is remapped as needed to different | ||
54 | * buffer pages. | ||
55 | */ | ||
56 | #define REMAP_SIZE (P2M_PER_PAGE - 3) | ||
57 | static struct { | ||
58 | unsigned long next_area_mfn; | ||
59 | unsigned long target_pfn; | ||
60 | unsigned long size; | ||
61 | unsigned long mfns[REMAP_SIZE]; | ||
62 | } xen_remap_buf __initdata __aligned(PAGE_SIZE); | ||
63 | static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; | ||
52 | 64 | ||
53 | /* | 65 | /* |
54 | * The maximum amount of extra memory compared to the base size. The | 66 | * The maximum amount of extra memory compared to the base size. The |
55 | * main scaling factor is the size of struct page. At extreme ratios | 67 | * main scaling factor is the size of struct page. At extreme ratios |
56 | * of base:extra, all the base memory can be filled with page | 68 | * of base:extra, all the base memory can be filled with page |
57 | * structures for the extra memory, leaving no space for anything | 69 | * structures for the extra memory, leaving no space for anything |
58 | * else. | 70 | * else. |
59 | * | 71 | * |
60 | * 10x seems like a reasonable balance between scaling flexibility and | 72 | * 10x seems like a reasonable balance between scaling flexibility and |
61 | * leaving a practically usable system. | 73 | * leaving a practically usable system. |
62 | */ | 74 | */ |
63 | #define EXTRA_MEM_RATIO (10) | 75 | #define EXTRA_MEM_RATIO (10) |
64 | 76 | ||
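A back-of-envelope check of the 10x cap above, assuming a 64-byte struct page and 4 KiB pages (typical values, neither guaranteed): the page structures for the extra memory then consume roughly 15.6% of base memory, leaving the rest usable.

#include <stdio.h>

int main(void)
{
        double ratio = 10.0, page_struct = 64.0, page_size = 4096.0;

        /* struct pages for the extra memory, as a share of base memory */
        printf("overhead = %.1f%% of base\n",
               100.0 * ratio * page_struct / page_size);   /* ~15.6% */
        return 0;
}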
65 | static void __init xen_add_extra_mem(u64 start, u64 size) | 77 | static void __init xen_add_extra_mem(u64 start, u64 size) |
66 | { | 78 | { |
67 | unsigned long pfn; | ||
68 | int i; | 79 | int i; |
69 | 80 | ||
70 | for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { | 81 | for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { |
71 | /* Add new region. */ | 82 | /* Add new region. */ |
72 | if (xen_extra_mem[i].size == 0) { | 83 | if (xen_extra_mem[i].size == 0) { |
73 | xen_extra_mem[i].start = start; | 84 | xen_extra_mem[i].start = start; |
74 | xen_extra_mem[i].size = size; | 85 | xen_extra_mem[i].size = size; |
75 | break; | 86 | break; |
76 | } | 87 | } |
77 | /* Append to existing region. */ | 88 | /* Append to existing region. */ |
78 | if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { | 89 | if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { |
79 | xen_extra_mem[i].size += size; | 90 | xen_extra_mem[i].size += size; |
80 | break; | 91 | break; |
81 | } | 92 | } |
82 | } | 93 | } |
83 | if (i == XEN_EXTRA_MEM_MAX_REGIONS) | 94 | if (i == XEN_EXTRA_MEM_MAX_REGIONS) |
84 | printk(KERN_WARNING "Warning: not enough extra memory regions\n"); | 95 | printk(KERN_WARNING "Warning: not enough extra memory regions\n"); |
85 | 96 | ||
86 | memblock_reserve(start, size); | 97 | memblock_reserve(start, size); |
98 | } | ||
87 | 99 | ||
88 | xen_max_p2m_pfn = PFN_DOWN(start + size); | 100 | static void __init xen_del_extra_mem(u64 start, u64 size) |
89 | for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { | 101 | { |
90 | unsigned long mfn = pfn_to_mfn(pfn); | 102 | int i; |
103 | u64 start_r, size_r; | ||
91 | 104 | ||
92 | if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) | 105 | for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { |
93 | continue; | 106 | start_r = xen_extra_mem[i].start; |
94 | WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", | 107 | size_r = xen_extra_mem[i].size; |
95 | pfn, mfn); | ||
96 | 108 | ||
97 | __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); | 109 | /* Start of region. */ |
110 | if (start_r == start) { | ||
111 | BUG_ON(size > size_r); | ||
112 | xen_extra_mem[i].start += size; | ||
113 | xen_extra_mem[i].size -= size; | ||
114 | break; | ||
115 | } | ||
116 | /* End of region. */ | ||
117 | if (start_r + size_r == start + size) { | ||
118 | BUG_ON(size > size_r); | ||
119 | xen_extra_mem[i].size -= size; | ||
120 | break; | ||
121 | } | ||
122 | /* Mid of region. */ | ||
123 | if (start > start_r && start < start_r + size_r) { | ||
124 | BUG_ON(start + size > start_r + size_r); | ||
125 | xen_extra_mem[i].size = start - start_r; | ||
126 | /* Calling memblock_reserve() again is okay. */ | ||
127 | xen_add_extra_mem(start + size, start_r + size_r - | ||
128 | (start + size)); | ||
129 | break; | ||
130 | } | ||
98 | } | 131 | } |
132 | memblock_free(start, size); | ||
99 | } | 133 | } |
100 | 134 | ||
101 | static unsigned long __init xen_do_chunk(unsigned long start, | 135 | /* |
102 | unsigned long end, bool release) | 136 | * Called during boot before the p2m list can take entries beyond the |
137 | * hypervisor supplied p2m list. Entries in extra mem are to be regarded as | ||
138 | * invalid. | ||
139 | */ | ||
140 | unsigned long __ref xen_chk_extra_mem(unsigned long pfn) | ||
103 | { | 141 | { |
104 | struct xen_memory_reservation reservation = { | 142 | int i; |
105 | .address_bits = 0, | 143 | unsigned long addr = PFN_PHYS(pfn); |
106 | .extent_order = 0, | ||
107 | .domid = DOMID_SELF | ||
108 | }; | ||
109 | unsigned long len = 0; | ||
110 | unsigned long pfn; | ||
111 | int ret; | ||
112 | 144 | ||
113 | for (pfn = start; pfn < end; pfn++) { | 145 | for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { |
114 | unsigned long frame; | 146 | if (addr >= xen_extra_mem[i].start && |
115 | unsigned long mfn = pfn_to_mfn(pfn); | 147 | addr < xen_extra_mem[i].start + xen_extra_mem[i].size) |
148 | return INVALID_P2M_ENTRY; | ||
149 | } | ||
116 | 150 | ||
117 | if (release) { | 151 | return IDENTITY_FRAME(pfn); |
118 | /* Make sure pfn exists to start with */ | 152 | } |
119 | if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) | ||
120 | continue; | ||
121 | frame = mfn; | ||
122 | } else { | ||
123 | if (mfn != INVALID_P2M_ENTRY) | ||
124 | continue; | ||
125 | frame = pfn; | ||
126 | } | ||
127 | set_xen_guest_handle(reservation.extent_start, &frame); | ||
128 | reservation.nr_extents = 1; | ||
129 | 153 | ||
130 | ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap, | 154 | /* |
131 | &reservation); | 155 | * Mark all pfns of extra mem as invalid in p2m list. |
132 | WARN(ret != 1, "Failed to %s pfn %lx err=%d\n", | 156 | */ |
133 | release ? "release" : "populate", pfn, ret); | 157 | void __init xen_inv_extra_mem(void) |
158 | { | ||
159 | unsigned long pfn, pfn_s, pfn_e; | ||
160 | int i; | ||
134 | 161 | ||
135 | if (ret == 1) { | 162 | for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { |
136 | if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) { | 163 | pfn_s = PFN_DOWN(xen_extra_mem[i].start); |
137 | if (release) | 164 | pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size); |
138 | break; | 165 | for (pfn = pfn_s; pfn < pfn_e; pfn++) |
139 | set_xen_guest_handle(reservation.extent_start, &frame); | 166 | set_phys_to_machine(pfn, INVALID_P2M_ENTRY); |
140 | reservation.nr_extents = 1; | ||
141 | ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, | ||
142 | &reservation); | ||
143 | break; | ||
144 | } | ||
145 | len++; | ||
146 | } else | ||
147 | break; | ||
148 | } | 167 | } |
149 | if (len) | ||
150 | printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n", | ||
151 | release ? "Freeing" : "Populating", | ||
152 | start, end, len, | ||
153 | release ? "freed" : "added"); | ||
154 | |||
155 | return len; | ||
156 | } | 168 | } |
157 | 169 | ||
158 | /* | 170 | /* |
159 | * Finds the next RAM pfn available in the E820 map after min_pfn. | 171 | * Finds the next RAM pfn available in the E820 map after min_pfn. |
160 | * This function updates min_pfn with the pfn found and returns | 172 | * This function updates min_pfn with the pfn found and returns |
161 | * the size of that range or zero if not found. | 173 | * the size of that range or zero if not found. |
162 | */ | 174 | */ |
163 | static unsigned long __init xen_find_pfn_range( | 175 | static unsigned long __init xen_find_pfn_range( |
164 | const struct e820entry *list, size_t map_size, | 176 | const struct e820entry *list, size_t map_size, |
165 | unsigned long *min_pfn) | 177 | unsigned long *min_pfn) |
166 | { | 178 | { |
167 | const struct e820entry *entry; | 179 | const struct e820entry *entry; |
168 | unsigned int i; | 180 | unsigned int i; |
169 | unsigned long done = 0; | 181 | unsigned long done = 0; |
170 | 182 | ||
171 | for (i = 0, entry = list; i < map_size; i++, entry++) { | 183 | for (i = 0, entry = list; i < map_size; i++, entry++) { |
172 | unsigned long s_pfn; | 184 | unsigned long s_pfn; |
173 | unsigned long e_pfn; | 185 | unsigned long e_pfn; |
174 | 186 | ||
175 | if (entry->type != E820_RAM) | 187 | if (entry->type != E820_RAM) |
176 | continue; | 188 | continue; |
177 | 189 | ||
178 | e_pfn = PFN_DOWN(entry->addr + entry->size); | 190 | e_pfn = PFN_DOWN(entry->addr + entry->size); |
179 | 191 | ||
180 | /* We only care about E820 after this */ | 192 | /* We only care about E820 after this */ |
181 | if (e_pfn < *min_pfn) | 193 | if (e_pfn < *min_pfn) |
182 | continue; | 194 | continue; |
183 | 195 | ||
184 | s_pfn = PFN_UP(entry->addr); | 196 | s_pfn = PFN_UP(entry->addr); |
185 | 197 | ||
186 | /* If min_pfn falls within the E820 entry, we want to start | 198 | /* If min_pfn falls within the E820 entry, we want to start |
187 | * at the min_pfn PFN. | 199 | * at the min_pfn PFN. |
188 | */ | 200 | */ |
189 | if (s_pfn <= *min_pfn) { | 201 | if (s_pfn <= *min_pfn) { |
190 | done = e_pfn - *min_pfn; | 202 | done = e_pfn - *min_pfn; |
191 | } else { | 203 | } else { |
192 | done = e_pfn - s_pfn; | 204 | done = e_pfn - s_pfn; |
193 | *min_pfn = s_pfn; | 205 | *min_pfn = s_pfn; |
194 | } | 206 | } |
195 | break; | 207 | break; |
196 | } | 208 | } |
197 | 209 | ||
198 | return done; | 210 | return done; |
199 | } | 211 | } |
200 | 212 | ||
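One illustrative way to drive the helper above (the real callers thread remap_pfn through instead of a fresh cursor):

unsigned long pfn = 0, n;

while ((n = xen_find_pfn_range(list, map_size, &pfn)) != 0) {
        /* [pfn, pfn + n) is RAM according to the E820 map */
        pfn += n;                 /* continue searching past this range */
}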
213 | static int __init xen_free_mfn(unsigned long mfn) | ||
214 | { | ||
215 | struct xen_memory_reservation reservation = { | ||
216 | .address_bits = 0, | ||
217 | .extent_order = 0, | ||
218 | .domid = DOMID_SELF | ||
219 | }; | ||
220 | |||
221 | set_xen_guest_handle(reservation.extent_start, &mfn); | ||
222 | reservation.nr_extents = 1; | ||
223 | |||
224 | return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); | ||
225 | } | ||
226 | |||
201 | /* | 227 | /* |
202 | * This releases a chunk of memory and then does the identity map. It's used as | 228 | * This releases a chunk of memory and then does the identity map. It's used |
203 | * as a fallback if the remapping fails. | 229 | * as a fallback if the remapping fails. |
204 | */ | 230 | */ |
205 | static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, | 231 | static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, |
206 | unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity, | 232 | unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity, |
207 | unsigned long *released) | 233 | unsigned long *released) |
208 | { | 234 | { |
235 | unsigned long len = 0; | ||
236 | unsigned long pfn, end; | ||
237 | int ret; | ||
238 | |||
209 | WARN_ON(start_pfn > end_pfn); | 239 | WARN_ON(start_pfn > end_pfn); |
210 | 240 | ||
241 | end = min(end_pfn, nr_pages); | ||
242 | for (pfn = start_pfn; pfn < end; pfn++) { | ||
243 | unsigned long mfn = pfn_to_mfn(pfn); | ||
244 | |||
245 | /* Make sure pfn exists to start with */ | ||
246 | if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) | ||
247 | continue; | ||
248 | |||
249 | ret = xen_free_mfn(mfn); | ||
250 | WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); | ||
251 | |||
252 | if (ret == 1) { | ||
253 | if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY)) | ||
254 | break; | ||
255 | len++; | ||
256 | } else | ||
257 | break; | ||
258 | } | ||
259 | |||
211 | /* Need to release pages first */ | 260 | /* Need to release pages first */ |
212 | *released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true); | 261 | *released += len; |
213 | *identity += set_phys_range_identity(start_pfn, end_pfn); | 262 | *identity += set_phys_range_identity(start_pfn, end_pfn); |
214 | } | 263 | } |
215 | 264 | ||
216 | /* | 265 | /* |
217 | * Helper function to update both the p2m and m2p tables. | 266 | * Helper function to update the p2m and m2p tables and kernel mapping. |
218 | */ | 267 | */ |
219 | static unsigned long __init xen_update_mem_tables(unsigned long pfn, | 268 | static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn) |
220 | unsigned long mfn) | ||
221 | { | 269 | { |
222 | struct mmu_update update = { | 270 | struct mmu_update update = { |
223 | .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, | 271 | .ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, |
224 | .val = pfn | 272 | .val = pfn |
225 | }; | 273 | }; |
226 | 274 | ||
227 | /* Update p2m */ | 275 | /* Update p2m */ |
228 | if (!early_set_phys_to_machine(pfn, mfn)) { | 276 | if (!set_phys_to_machine(pfn, mfn)) { |
229 | WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n", | 277 | WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n", |
230 | pfn, mfn); | 278 | pfn, mfn); |
231 | return false; | 279 | BUG(); |
232 | } | 280 | } |
233 | 281 | ||
234 | /* Update m2p */ | 282 | /* Update m2p */ |
235 | if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) { | 283 | if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) { |
236 | WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n", | 284 | WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n", |
237 | mfn, pfn); | 285 | mfn, pfn); |
238 | return false; | 286 | BUG(); |
239 | } | 287 | } |
240 | 288 | ||
241 | return true; | 289 | /* Update kernel mapping, but not for highmem. */ |
290 | if ((pfn << PAGE_SHIFT) >= __pa(high_memory)) | ||
291 | return; | ||
292 | |||
293 | if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT), | ||
294 | mfn_pte(mfn, PAGE_KERNEL), 0)) { | ||
295 | WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n", | ||
296 | mfn, pfn); | ||
297 | BUG(); | ||
298 | } | ||
242 | } | 299 | } |
243 | 300 | ||
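The m2p half of xen_update_mem_tables() builds a single mmu_update request. A minimal model of its shape; tagging MMU_MACHPHYS_UPDATE into the low bits of the machine address follows xen/interface/xen.h, but take the constant as an assumption:

#define MODEL_PAGE_SHIFT     12
#define MMU_MACHPHYS_UPDATE  1    /* request type, tagged into ptr */

struct model_mmu_update {
        unsigned long long ptr;   /* machine address | request type */
        unsigned long long val;
};

static struct model_mmu_update make_m2p_update(unsigned long pfn,
                                               unsigned long mfn)
{
        struct model_mmu_update u = {
                .ptr = ((unsigned long long)mfn << MODEL_PAGE_SHIFT) |
                       MMU_MACHPHYS_UPDATE,
                .val = pfn,       /* i.e. m2p[mfn] = pfn */
        };
        return u;
}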
244 | /* | 301 | /* |
245 | * This function updates the p2m and m2p tables with an identity map from | 302 | * This function updates the p2m and m2p tables with an identity map from |
246 | * start_pfn to start_pfn+size and remaps the underlying RAM of the original | 303 | * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the |
247 | * allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks | 304 | * original allocation at remap_pfn. The information needed for remapping is |
248 | * to not exhaust the reserved brk space. Doing it in properly aligned blocks | 305 | * saved in the memory itself to avoid the need for allocating buffers. The |
249 | * ensures we only allocate the minimum required leaf pages in the p2m table. It | 306 | * complete remap information is contained in a list of MFNs each containing |
250 | * copies the existing mfns from the p2m table under the 1:1 map, overwrites | 307 | * up to REMAP_SIZE MFNs and the start target PFN for doing the remap. |
251 | * them with the identity map and then updates the p2m and m2p tables with the | 308 | * This enables us to preserve the original mfn sequence while doing the |
252 | * remapped memory. | 309 | * remapping at a time when the memory management is capable of allocating |
310 | * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and | ||
311 | * its callers. | ||
253 | */ | 312 | */ |
254 | static unsigned long __init xen_do_set_identity_and_remap_chunk( | 313 | static void __init xen_do_set_identity_and_remap_chunk( |
255 | unsigned long start_pfn, unsigned long size, unsigned long remap_pfn) | 314 | unsigned long start_pfn, unsigned long size, unsigned long remap_pfn) |
256 | { | 315 | { |
316 | unsigned long buf = (unsigned long)&xen_remap_buf; | ||
317 | unsigned long mfn_save, mfn; | ||
257 | unsigned long ident_pfn_iter, remap_pfn_iter; | 318 | unsigned long ident_pfn_iter, remap_pfn_iter; |
258 | unsigned long ident_start_pfn_align, remap_start_pfn_align; | 319 | unsigned long ident_end_pfn = start_pfn + size; |
259 | unsigned long ident_end_pfn_align, remap_end_pfn_align; | ||
260 | unsigned long ident_boundary_pfn, remap_boundary_pfn; | ||
261 | unsigned long ident_cnt = 0; | ||
262 | unsigned long remap_cnt = 0; | ||
263 | unsigned long left = size; | 320 | unsigned long left = size; |
264 | unsigned long mod; | 321 | unsigned long ident_cnt = 0; |
265 | int i; | 322 | unsigned int i, chunk; |
266 | 323 | ||
267 | WARN_ON(size == 0); | 324 | WARN_ON(size == 0); |
268 | 325 | ||
269 | BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); | 326 | BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); |
270 | 327 | ||
271 | /* | 328 | mfn_save = virt_to_mfn(buf); |
272 | * Determine the proper alignment to remap memory in P2M_PER_PAGE sized | ||
273 | * blocks. We need to keep track of both the existing pfn mapping and | ||
274 | * the new pfn remapping. | ||
275 | */ | ||
276 | mod = start_pfn % P2M_PER_PAGE; | ||
277 | ident_start_pfn_align = | ||
278 | mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn; | ||
279 | mod = remap_pfn % P2M_PER_PAGE; | ||
280 | remap_start_pfn_align = | ||
281 | mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn; | ||
282 | mod = (start_pfn + size) % P2M_PER_PAGE; | ||
283 | ident_end_pfn_align = start_pfn + size - mod; | ||
284 | mod = (remap_pfn + size) % P2M_PER_PAGE; | ||
285 | remap_end_pfn_align = remap_pfn + size - mod; | ||
286 | 329 | ||
287 | /* Iterate over each p2m leaf node in each range */ | 330 | for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn; |
288 | for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align; | 331 | ident_pfn_iter < ident_end_pfn; |
289 | ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align; | 332 | ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) { |
290 | ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) { | 333 | chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE; |
291 | /* Check we aren't past the end */ | ||
292 | BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size); | ||
293 | BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size); | ||
294 | 334 | ||
295 | /* Save p2m mappings */ | 335 | /* Map first pfn to xen_remap_buf */ |
296 | for (i = 0; i < P2M_PER_PAGE; i++) | 336 | mfn = pfn_to_mfn(ident_pfn_iter); |
297 | xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i); | 337 | set_pte_mfn(buf, mfn, PAGE_KERNEL); |
298 | 338 | ||
299 | /* Set identity map which will free a p2m leaf */ | 339 | /* Save mapping information in page */ |
300 | ident_cnt += set_phys_range_identity(ident_pfn_iter, | 340 | xen_remap_buf.next_area_mfn = xen_remap_mfn; |
301 | ident_pfn_iter + P2M_PER_PAGE); | 341 | xen_remap_buf.target_pfn = remap_pfn_iter; |
342 | xen_remap_buf.size = chunk; | ||
343 | for (i = 0; i < chunk; i++) | ||
344 | xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i); | ||
302 | 345 | ||
303 | #ifdef DEBUG | 346 | /* Put remap buf into list. */ |
304 | /* Helps verify a p2m leaf has been freed */ | 347 | xen_remap_mfn = mfn; |
305 | for (i = 0; i < P2M_PER_PAGE; i++) { | ||
306 | unsigned int pfn = ident_pfn_iter + i; | ||
307 | BUG_ON(pfn_to_mfn(pfn) != pfn); | ||
308 | } | ||
309 | #endif | ||
310 | /* Now remap memory */ | ||
311 | for (i = 0; i < P2M_PER_PAGE; i++) { | ||
312 | unsigned long mfn = xen_remap_buf[i]; | ||
313 | 348 | ||
314 | /* This will use the p2m leaf freed above */ | 349 | /* Set identity map */ |
315 | if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) { | 350 | ident_cnt += set_phys_range_identity(ident_pfn_iter, |
316 | WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n", | 351 | ident_pfn_iter + chunk); |
317 | remap_pfn_iter + i, mfn); | ||
318 | return 0; | ||
319 | } | ||
320 | 352 | ||
321 | remap_cnt++; | 353 | left -= chunk; |
322 | } | ||
323 | |||
324 | left -= P2M_PER_PAGE; | ||
325 | } | 354 | } |
326 | 355 | ||
327 | /* Max boundary space possible */ | 356 | /* Restore old xen_remap_buf mapping */ |
328 | BUG_ON(left > (P2M_PER_PAGE - 1) * 2); | 357 | set_pte_mfn(buf, mfn_save, PAGE_KERNEL); |
329 | |||
330 | /* Now handle the boundary conditions */ | ||
331 | ident_boundary_pfn = start_pfn; | ||
332 | remap_boundary_pfn = remap_pfn; | ||
333 | for (i = 0; i < left; i++) { | ||
334 | unsigned long mfn; | ||
335 | |||
336 | /* These two checks move from the start to end boundaries */ | ||
337 | if (ident_boundary_pfn == ident_start_pfn_align) | ||
338 | ident_boundary_pfn = ident_pfn_iter; | ||
339 | if (remap_boundary_pfn == remap_start_pfn_align) | ||
340 | remap_boundary_pfn = remap_pfn_iter; | ||
341 | |||
342 | /* Check we aren't past the end */ | ||
343 | BUG_ON(ident_boundary_pfn >= start_pfn + size); | ||
344 | BUG_ON(remap_boundary_pfn >= remap_pfn + size); | ||
345 | |||
346 | mfn = pfn_to_mfn(ident_boundary_pfn); | ||
347 | |||
348 | if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) { | ||
349 | WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n", | ||
350 | remap_pfn_iter + i, mfn); | ||
351 | return 0; | ||
352 | } | ||
353 | remap_cnt++; | ||
354 | |||
355 | ident_boundary_pfn++; | ||
356 | remap_boundary_pfn++; | ||
357 | } | ||
358 | |||
359 | /* Finish up the identity map */ | ||
360 | if (ident_start_pfn_align >= ident_end_pfn_align) { | ||
361 | /* | ||
362 | * In this case we have an identity range which does not span an | ||
363 | * aligned block so everything needs to be identity mapped here. | ||
364 | * If we didn't check this we might remap too many pages since | ||
365 | * the align boundaries are not meaningful in this case. | ||
366 | */ | ||
367 | ident_cnt += set_phys_range_identity(start_pfn, | ||
368 | start_pfn + size); | ||
369 | } else { | ||
370 | /* Remapped above so check each end of the chunk */ | ||
371 | if (start_pfn < ident_start_pfn_align) | ||
372 | ident_cnt += set_phys_range_identity(start_pfn, | ||
373 | ident_start_pfn_align); | ||
374 | if (start_pfn + size > ident_pfn_iter) | ||
375 | ident_cnt += set_phys_range_identity(ident_pfn_iter, | ||
376 | start_pfn + size); | ||
377 | } | ||
378 | |||
379 | BUG_ON(ident_cnt != size); | ||
380 | BUG_ON(remap_cnt != size); | ||
381 | |||
382 | return size; | ||
383 | } | 358 | } |
384 | 359 | ||
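The chunk headers written above form a linked list threaded through the very memory that will later be remapped. A userspace model of the producer side, with malloc'd blocks standing in for machine pages and a pointer standing in for the anchor mfn:

#include <stdlib.h>

#define MODEL_REMAP_SIZE 509      /* P2M_PER_PAGE - 3 on x86-64 */

struct remap_chunk {
        struct remap_chunk *next; /* the kernel stores an mfn here */
        unsigned long target_pfn;
        unsigned long size;
        unsigned long mfns[MODEL_REMAP_SIZE];
};

static struct remap_chunk *remap_list;   /* models xen_remap_mfn */

static void push_chunk(unsigned long target, const unsigned long *mfns,
                       unsigned long n)
{
        struct remap_chunk *c = malloc(sizeof(*c));

        if (!c)
                return;
        c->next = remap_list;     /* prepend, newest chunk first */
        c->target_pfn = target;
        c->size = n;
        for (unsigned long i = 0; i < n; i++)
                c->mfns[i] = mfns[i];
        remap_list = c;
}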
385 | /* | 360 | /* |
386 | * This function takes a contiguous pfn range that needs to be identity mapped | 361 | * This function takes a contiguous pfn range that needs to be identity mapped |
387 | * and: | 362 | * and: |
388 | * | 363 | * |
389 | * 1) Finds a new range of pfns to use to remap based on E820 and remap_pfn. | 364 | * 1) Finds a new range of pfns to use to remap based on E820 and remap_pfn. |
390 | * 2) Calls the do_ function to actually do the mapping/remapping work. | 365 | * 2) Calls the do_ function to actually do the mapping/remapping work. |
391 | * | 366 | * |
392 | * The goal is to not allocate additional memory but to remap the existing | 367 | * The goal is to not allocate additional memory but to remap the existing |
393 | * pages. In the case of an error the underlying memory is simply released back | 368 | * pages. In the case of an error the underlying memory is simply released back |
394 | * to Xen and not remapped. | 369 | * to Xen and not remapped. |
395 | */ | 370 | */ |
396 | static unsigned long __init xen_set_identity_and_remap_chunk( | 371 | static unsigned long __init xen_set_identity_and_remap_chunk( |
397 | const struct e820entry *list, size_t map_size, unsigned long start_pfn, | 372 | const struct e820entry *list, size_t map_size, unsigned long start_pfn, |
398 | unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, | 373 | unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, |
399 | unsigned long *identity, unsigned long *remapped, | 374 | unsigned long *identity, unsigned long *released) |
400 | unsigned long *released) | ||
401 | { | 375 | { |
402 | unsigned long pfn; | 376 | unsigned long pfn; |
403 | unsigned long i = 0; | 377 | unsigned long i = 0; |
404 | unsigned long n = end_pfn - start_pfn; | 378 | unsigned long n = end_pfn - start_pfn; |
405 | 379 | ||
406 | while (i < n) { | 380 | while (i < n) { |
407 | unsigned long cur_pfn = start_pfn + i; | 381 | unsigned long cur_pfn = start_pfn + i; |
408 | unsigned long left = n - i; | 382 | unsigned long left = n - i; |
409 | unsigned long size = left; | 383 | unsigned long size = left; |
410 | unsigned long remap_range_size; | 384 | unsigned long remap_range_size; |
411 | 385 | ||
412 | /* Do not remap pages beyond the current allocation */ | 386 | /* Do not remap pages beyond the current allocation */ |
413 | if (cur_pfn >= nr_pages) { | 387 | if (cur_pfn >= nr_pages) { |
414 | /* Identity map remaining pages */ | 388 | /* Identity map remaining pages */ |
415 | *identity += set_phys_range_identity(cur_pfn, | 389 | *identity += set_phys_range_identity(cur_pfn, |
416 | cur_pfn + size); | 390 | cur_pfn + size); |
417 | break; | 391 | break; |
418 | } | 392 | } |
419 | if (cur_pfn + size > nr_pages) | 393 | if (cur_pfn + size > nr_pages) |
420 | size = nr_pages - cur_pfn; | 394 | size = nr_pages - cur_pfn; |
421 | 395 | ||
422 | remap_range_size = xen_find_pfn_range(list, map_size, | 396 | remap_range_size = xen_find_pfn_range(list, map_size, |
423 | &remap_pfn); | 397 | &remap_pfn); |
424 | if (!remap_range_size) { | 398 | if (!remap_range_size) { |
425 | pr_warning("Unable to find available pfn range, not remapping identity pages\n"); | 399 | pr_warning("Unable to find available pfn range, not remapping identity pages\n"); |
426 | xen_set_identity_and_release_chunk(cur_pfn, | 400 | xen_set_identity_and_release_chunk(cur_pfn, |
427 | cur_pfn + left, nr_pages, identity, released); | 401 | cur_pfn + left, nr_pages, identity, released); |
428 | break; | 402 | break; |
429 | } | 403 | } |
430 | /* Adjust size to fit in current e820 RAM region */ | 404 | /* Adjust size to fit in current e820 RAM region */ |
431 | if (size > remap_range_size) | 405 | if (size > remap_range_size) |
432 | size = remap_range_size; | 406 | size = remap_range_size; |
433 | 407 | ||
434 | if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) { | 408 | xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn); |
435 | WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n", | ||
436 | cur_pfn, size, remap_pfn); | ||
437 | xen_set_identity_and_release_chunk(cur_pfn, | ||
438 | cur_pfn + left, nr_pages, identity, released); | ||
439 | break; | ||
440 | } | ||
441 | 409 | ||
442 | /* Update variables to reflect new mappings. */ | 410 | /* Update variables to reflect new mappings. */ |
443 | i += size; | 411 | i += size; |
444 | remap_pfn += size; | 412 | remap_pfn += size; |
445 | *identity += size; | 413 | *identity += size; |
446 | *remapped += size; | ||
447 | } | 414 | } |
448 | 415 | ||
449 | /* | 416 | /* |
450 | * If the PFNs are currently mapped, the VA mapping also needs | 417 | * If the PFNs are currently mapped, the VA mapping also needs |
451 | * to be updated to be 1:1. | 418 | * to be updated to be 1:1. |
452 | */ | 419 | */ |
453 | for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) | 420 | for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) |
454 | (void)HYPERVISOR_update_va_mapping( | 421 | (void)HYPERVISOR_update_va_mapping( |
455 | (unsigned long)__va(pfn << PAGE_SHIFT), | 422 | (unsigned long)__va(pfn << PAGE_SHIFT), |
456 | mfn_pte(pfn, PAGE_KERNEL_IO), 0); | 423 | mfn_pte(pfn, PAGE_KERNEL_IO), 0); |
457 | 424 | ||
458 | return remap_pfn; | 425 | return remap_pfn; |
459 | } | 426 | } |
460 | 427 | ||
461 | static unsigned long __init xen_set_identity_and_remap( | 428 | static void __init xen_set_identity_and_remap( |
462 | const struct e820entry *list, size_t map_size, unsigned long nr_pages, | 429 | const struct e820entry *list, size_t map_size, unsigned long nr_pages, |
463 | unsigned long *released) | 430 | unsigned long *released) |
464 | { | 431 | { |
465 | phys_addr_t start = 0; | 432 | phys_addr_t start = 0; |
466 | unsigned long identity = 0; | 433 | unsigned long identity = 0; |
467 | unsigned long remapped = 0; | ||
468 | unsigned long last_pfn = nr_pages; | 434 | unsigned long last_pfn = nr_pages; |
469 | const struct e820entry *entry; | 435 | const struct e820entry *entry; |
470 | unsigned long num_released = 0; | 436 | unsigned long num_released = 0; |
471 | int i; | 437 | int i; |
472 | 438 | ||
473 | /* | 439 | /* |
474 | * Combine non-RAM regions and gaps until a RAM region (or the | 440 | * Combine non-RAM regions and gaps until a RAM region (or the |
475 | * end of the map) is reached, then set the 1:1 map and | 441 | * end of the map) is reached, then set the 1:1 map and |
476 | * remap the memory in those non-RAM regions. | 442 | * remap the memory in those non-RAM regions. |
477 | * | 443 | * |
478 | * The combined non-RAM regions are rounded to a whole number | 444 | * The combined non-RAM regions are rounded to a whole number |
479 | * of pages so any partial pages are accessible via the 1:1 | 445 | * of pages so any partial pages are accessible via the 1:1 |
480 | * mapping. This is needed for some BIOSes that put (for | 446 | * mapping. This is needed for some BIOSes that put (for |
481 | * example) the DMI tables in a reserved region that begins on | 447 | * example) the DMI tables in a reserved region that begins on |
482 | * a non-page boundary. | 448 | * a non-page boundary. |
483 | */ | 449 | */ |
484 | for (i = 0, entry = list; i < map_size; i++, entry++) { | 450 | for (i = 0, entry = list; i < map_size; i++, entry++) { |
485 | phys_addr_t end = entry->addr + entry->size; | 451 | phys_addr_t end = entry->addr + entry->size; |
486 | if (entry->type == E820_RAM || i == map_size - 1) { | 452 | if (entry->type == E820_RAM || i == map_size - 1) { |
487 | unsigned long start_pfn = PFN_DOWN(start); | 453 | unsigned long start_pfn = PFN_DOWN(start); |
488 | unsigned long end_pfn = PFN_UP(end); | 454 | unsigned long end_pfn = PFN_UP(end); |
489 | 455 | ||
490 | if (entry->type == E820_RAM) | 456 | if (entry->type == E820_RAM) |
491 | end_pfn = PFN_UP(entry->addr); | 457 | end_pfn = PFN_UP(entry->addr); |
492 | 458 | ||
493 | if (start_pfn < end_pfn) | 459 | if (start_pfn < end_pfn) |
494 | last_pfn = xen_set_identity_and_remap_chunk( | 460 | last_pfn = xen_set_identity_and_remap_chunk( |
495 | list, map_size, start_pfn, | 461 | list, map_size, start_pfn, |
496 | end_pfn, nr_pages, last_pfn, | 462 | end_pfn, nr_pages, last_pfn, |
497 | &identity, &remapped, | 463 | &identity, &num_released); |
498 | &num_released); | ||
499 | start = end; | 464 | start = end; |
500 | } | 465 | } |
501 | } | 466 | } |
502 | 467 | ||
503 | *released = num_released; | 468 | *released = num_released; |
504 | 469 | ||
505 | pr_info("Set %ld page(s) to 1-1 mapping\n", identity); | 470 | pr_info("Set %ld page(s) to 1-1 mapping\n", identity); |
506 | pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped, | ||
507 | last_pfn); | ||
508 | pr_info("Released %ld page(s)\n", num_released); | 471 | pr_info("Released %ld page(s)\n", num_released); |
472 | } | ||
509 | 473 | ||
510 | return last_pfn; | 474 | /* |
475 | * Remap the memory prepared in xen_do_set_identity_and_remap_chunk(). | ||
476 | * The remap information (which mfn is remapped to which pfn) lives in the | ||
477 | * memory to be remapped itself, in a linked list anchored at xen_remap_mfn. | ||
478 | * This scheme allows the different chunks to be remapped in arbitrary order, | ||
479 | * and the resulting mapping is independent of that order. | ||
480 | */ | ||
481 | void __init xen_remap_memory(void) | ||
482 | { | ||
483 | unsigned long buf = (unsigned long)&xen_remap_buf; | ||
484 | unsigned long mfn_save, mfn, pfn; | ||
485 | unsigned long remapped = 0; | ||
486 | unsigned int i; | ||
487 | unsigned long pfn_s = ~0UL; | ||
488 | unsigned long len = 0; | ||
489 | |||
490 | mfn_save = virt_to_mfn(buf); | ||
491 | |||
492 | while (xen_remap_mfn != INVALID_P2M_ENTRY) { | ||
493 | /* Map the remap information */ | ||
494 | set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL); | ||
495 | |||
496 | BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]); | ||
497 | |||
498 | pfn = xen_remap_buf.target_pfn; | ||
499 | for (i = 0; i < xen_remap_buf.size; i++) { | ||
500 | mfn = xen_remap_buf.mfns[i]; | ||
501 | xen_update_mem_tables(pfn, mfn); | ||
502 | remapped++; | ||
503 | pfn++; | ||
504 | } | ||
505 | if (pfn_s == ~0UL || pfn == pfn_s) { | ||
506 | pfn_s = xen_remap_buf.target_pfn; | ||
507 | len += xen_remap_buf.size; | ||
508 | } else if (pfn_s + len == xen_remap_buf.target_pfn) { | ||
509 | len += xen_remap_buf.size; | ||
510 | } else { | ||
511 | xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); | ||
512 | pfn_s = xen_remap_buf.target_pfn; | ||
513 | len = xen_remap_buf.size; | ||
514 | } | ||
515 | |||
516 | mfn = xen_remap_mfn; | ||
517 | xen_remap_mfn = xen_remap_buf.next_area_mfn; | ||
518 | } | ||
519 | |||
520 | if (pfn_s != ~0UL && len) | ||
521 | xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); | ||
522 | |||
523 | set_pte_mfn(buf, mfn_save, PAGE_KERNEL); | ||
524 | |||
525 | pr_info("Remapped %ld page(s)\n", remapped); | ||
511 | } | 526 | } |
527 | |||
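And the consumer side, mirroring how xen_remap_memory() walks xen_remap_mfn: chunks are drained in whatever order they sit on the list, since each carries its own target pfn (this continues the producer model sketched earlier):

static void drain_chunks(void (*update)(unsigned long pfn, unsigned long mfn))
{
        while (remap_list) {
                struct remap_chunk *c = remap_list;

                for (unsigned long i = 0; i < c->size; i++)
                        update(c->target_pfn + i, c->mfns[i]);
                remap_list = c->next;
                free(c);          /* the kernel instead remaps the page */
        }
}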
512 | static unsigned long __init xen_get_max_pages(void) | 528 | static unsigned long __init xen_get_max_pages(void) |
513 | { | 529 | { |
514 | unsigned long max_pages = MAX_DOMAIN_PAGES; | 530 | unsigned long max_pages = MAX_DOMAIN_PAGES; |
515 | domid_t domid = DOMID_SELF; | 531 | domid_t domid = DOMID_SELF; |
516 | int ret; | 532 | int ret; |
517 | 533 | ||
518 | /* | 534 | /* |
519 | * For the initial domain we use the maximum reservation as | 535 | * For the initial domain we use the maximum reservation as |
520 | * the maximum page. | 536 | * the maximum page. |
521 | * | 537 | * |
522 | * For guest domains the current maximum reservation reflects | 538 | * For guest domains the current maximum reservation reflects |
523 | * the current maximum rather than the static maximum. In this | 539 | * the current maximum rather than the static maximum. In this |
524 | * case the e820 map provided to us will cover the static | 540 | * case the e820 map provided to us will cover the static |
525 | * maximum region. | 541 | * maximum region. |
526 | */ | 542 | */ |
527 | if (xen_initial_domain()) { | 543 | if (xen_initial_domain()) { |
528 | ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); | 544 | ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); |
529 | if (ret > 0) | 545 | if (ret > 0) |
530 | max_pages = ret; | 546 | max_pages = ret; |
531 | } | 547 | } |
532 | 548 | ||
533 | return min(max_pages, MAX_DOMAIN_PAGES); | 549 | return min(max_pages, MAX_DOMAIN_PAGES); |
534 | } | 550 | } |
535 | 551 | ||
536 | static void xen_align_and_add_e820_region(u64 start, u64 size, int type) | 552 | static void xen_align_and_add_e820_region(u64 start, u64 size, int type) |
537 | { | 553 | { |
538 | u64 end = start + size; | 554 | u64 end = start + size; |
539 | 555 | ||
540 | /* Align RAM regions to page boundaries. */ | 556 | /* Align RAM regions to page boundaries. */ |
541 | if (type == E820_RAM) { | 557 | if (type == E820_RAM) { |
542 | start = PAGE_ALIGN(start); | 558 | start = PAGE_ALIGN(start); |
543 | end &= ~((u64)PAGE_SIZE - 1); | 559 | end &= ~((u64)PAGE_SIZE - 1); |
544 | } | 560 | } |
545 | 561 | ||
546 | e820_add_region(start, end - start, type); | 562 | e820_add_region(start, end - start, type); |
547 | } | 563 | } |
548 | 564 | ||
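Note the asymmetry in xen_align_and_add_e820_region() above: the start of a RAM region is rounded up while its end is rounded down, so partial pages are dropped rather than promoted to RAM; non-RAM regions are added unaligned. A worked example, assuming 4 KiB pages (values purely illustrative):

    /* Illustrative only, PAGE_SIZE == 4096:
     *   start = 0x1234 -> PAGE_ALIGN(start)      = 0x2000  (rounded up)
     *   end   = 0x5678 -> end & ~(PAGE_SIZE - 1) = 0x5000  (rounded down)
     * so a RAM entry [0x1234, 0x5678) is registered only as [0x2000, 0x5000).
     */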
549 | void xen_ignore_unusable(struct e820entry *list, size_t map_size) | 565 | void xen_ignore_unusable(struct e820entry *list, size_t map_size) |
550 | { | 566 | { |
551 | struct e820entry *entry; | 567 | struct e820entry *entry; |
552 | unsigned int i; | 568 | unsigned int i; |
553 | 569 | ||
554 | for (i = 0, entry = list; i < map_size; i++, entry++) { | 570 | for (i = 0, entry = list; i < map_size; i++, entry++) { |
555 | if (entry->type == E820_UNUSABLE) | 571 | if (entry->type == E820_UNUSABLE) |
556 | entry->type = E820_RAM; | 572 | entry->type = E820_RAM; |
557 | } | 573 | } |
558 | } | 574 | } |
559 | 575 | ||
560 | /** | 576 | /** |
561 | * machine_specific_memory_setup - Hook for machine specific memory setup. | 577 | * machine_specific_memory_setup - Hook for machine specific memory setup. |
562 | **/ | 578 | **/ |
563 | char * __init xen_memory_setup(void) | 579 | char * __init xen_memory_setup(void) |
564 | { | 580 | { |
565 | static struct e820entry map[E820MAX] __initdata; | 581 | static struct e820entry map[E820MAX] __initdata; |
566 | 582 | ||
567 | unsigned long max_pfn = xen_start_info->nr_pages; | 583 | unsigned long max_pfn = xen_start_info->nr_pages; |
568 | unsigned long long mem_end; | 584 | unsigned long long mem_end; |
569 | int rc; | 585 | int rc; |
570 | struct xen_memory_map memmap; | 586 | struct xen_memory_map memmap; |
571 | unsigned long max_pages; | 587 | unsigned long max_pages; |
572 | unsigned long last_pfn = 0; | ||
573 | unsigned long extra_pages = 0; | 588 | unsigned long extra_pages = 0; |
574 | int i; | 589 | int i; |
575 | int op; | 590 | int op; |
576 | 591 | ||
577 | max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); | 592 | max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); |
578 | mem_end = PFN_PHYS(max_pfn); | 593 | mem_end = PFN_PHYS(max_pfn); |
579 | 594 | ||
580 | memmap.nr_entries = E820MAX; | 595 | memmap.nr_entries = E820MAX; |
581 | set_xen_guest_handle(memmap.buffer, map); | 596 | set_xen_guest_handle(memmap.buffer, map); |
582 | 597 | ||
583 | op = xen_initial_domain() ? | 598 | op = xen_initial_domain() ? |
584 | XENMEM_machine_memory_map : | 599 | XENMEM_machine_memory_map : |
585 | XENMEM_memory_map; | 600 | XENMEM_memory_map; |
586 | rc = HYPERVISOR_memory_op(op, &memmap); | 601 | rc = HYPERVISOR_memory_op(op, &memmap); |
587 | if (rc == -ENOSYS) { | 602 | if (rc == -ENOSYS) { |
588 | BUG_ON(xen_initial_domain()); | 603 | BUG_ON(xen_initial_domain()); |
589 | memmap.nr_entries = 1; | 604 | memmap.nr_entries = 1; |
590 | map[0].addr = 0ULL; | 605 | map[0].addr = 0ULL; |
591 | map[0].size = mem_end; | 606 | map[0].size = mem_end; |
592 | /* 8MB slack (to balance backend allocations). */ | 607 | /* 8MB slack (to balance backend allocations). */ |
593 | map[0].size += 8ULL << 20; | 608 | map[0].size += 8ULL << 20; |
594 | map[0].type = E820_RAM; | 609 | map[0].type = E820_RAM; |
595 | rc = 0; | 610 | rc = 0; |
596 | } | 611 | } |
597 | BUG_ON(rc); | 612 | BUG_ON(rc); |
598 | BUG_ON(memmap.nr_entries == 0); | 613 | BUG_ON(memmap.nr_entries == 0); |
599 | 614 | ||
600 | /* | 615 | /* |
601 | * Xen won't allow a 1:1 mapping to be created to UNUSABLE | 616 | * Xen won't allow a 1:1 mapping to be created to UNUSABLE |
602 | * regions, so if we're using the machine memory map leave the | 617 | * regions, so if we're using the machine memory map leave the |
603 | * region as RAM as it is in the pseudo-physical map. | 618 | * region as RAM as it is in the pseudo-physical map. |
604 | * | 619 | * |
605 | * UNUSABLE regions in domUs are not handled and will need | 620 | * UNUSABLE regions in domUs are not handled and will need |
606 | * a patch in the future. | 621 | * a patch in the future. |
607 | */ | 622 | */ |
608 | if (xen_initial_domain()) | 623 | if (xen_initial_domain()) |
609 | xen_ignore_unusable(map, memmap.nr_entries); | 624 | xen_ignore_unusable(map, memmap.nr_entries); |
610 | 625 | ||
611 | /* Make sure the Xen-supplied memory map is well-ordered. */ | 626 | /* Make sure the Xen-supplied memory map is well-ordered. */ |
612 | sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); | 627 | sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); |
613 | 628 | ||
614 | max_pages = xen_get_max_pages(); | 629 | max_pages = xen_get_max_pages(); |
615 | if (max_pages > max_pfn) | 630 | if (max_pages > max_pfn) |
616 | extra_pages += max_pages - max_pfn; | 631 | extra_pages += max_pages - max_pfn; |
617 | 632 | ||
618 | /* | 633 | /* |
619 | * Set identity map on non-RAM pages and remap the underlying RAM. | 634 | * Set identity map on non-RAM pages and prepare to remap the
635 | * underlying RAM. | ||
620 | */ | 636 | */ |
621 | last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, | 637 | xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, |
622 | &xen_released_pages); | 638 | &xen_released_pages); |
623 | 639 | ||
624 | extra_pages += xen_released_pages; | 640 | extra_pages += xen_released_pages; |
625 | 641 | ||
626 | if (last_pfn > max_pfn) { | ||
627 | max_pfn = min(MAX_DOMAIN_PAGES, last_pfn); | ||
628 | mem_end = PFN_PHYS(max_pfn); | ||
629 | } | ||
630 | /* | 642 | /* |
631 | * Clamp the amount of extra memory to an EXTRA_MEM_RATIO | 643 | * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
632 | * factor of the base size. On non-highmem systems, the base | 644 | * factor of the base size. On non-highmem systems, the base
633 | * size is the full initial memory allocation; on highmem it | 645 | * size is the full initial memory allocation; on highmem it |
634 | * is limited to the max size of lowmem, so that it doesn't | 646 | * is limited to the max size of lowmem, so that it doesn't |
635 | * get completely filled. | 647 | * get completely filled. |
636 | * | 648 | * |
637 | * In principle there could be a problem in lowmem systems if | 649 | * In principle there could be a problem in lowmem systems if |
638 | * the initial memory is also very large with respect to | 650 | * the initial memory is also very large with respect to |
639 | * lowmem, but we won't try to deal with that here. | 651 | * lowmem, but we won't try to deal with that here. |
640 | */ | 652 | */ |
641 | extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), | 653 | extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), |
642 | extra_pages); | 654 | extra_pages); |
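EXTRA_MEM_RATIO is defined as 10 earlier in this file, so the clamp above caps delayed extra memory at ten times the (lowmem-limited) base allocation. Worked numbers, purely illustrative:

    /* Illustrative only: a 1 GiB 64-bit domain, max_pfn = 262144
     * (1 GiB / 4 KiB), well below PFN_DOWN(MAXMEM):
     *   extra_pages = min(10 * 262144, extra_pages)
     * i.e. at most ~10 GiB of extra memory is kept; any surplus beyond
     * that is simply not registered. */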
643 | i = 0; | 655 | i = 0; |
644 | while (i < memmap.nr_entries) { | 656 | while (i < memmap.nr_entries) { |
645 | u64 addr = map[i].addr; | 657 | u64 addr = map[i].addr; |
646 | u64 size = map[i].size; | 658 | u64 size = map[i].size; |
647 | u32 type = map[i].type; | 659 | u32 type = map[i].type; |
648 | 660 | ||
649 | if (type == E820_RAM) { | 661 | if (type == E820_RAM) { |
650 | if (addr < mem_end) { | 662 | if (addr < mem_end) { |
651 | size = min(size, mem_end - addr); | 663 | size = min(size, mem_end - addr); |
652 | } else if (extra_pages) { | 664 | } else if (extra_pages) { |
653 | size = min(size, (u64)extra_pages * PAGE_SIZE); | 665 | size = min(size, (u64)extra_pages * PAGE_SIZE); |
654 | extra_pages -= size / PAGE_SIZE; | 666 | extra_pages -= size / PAGE_SIZE; |
655 | xen_add_extra_mem(addr, size); | 667 | xen_add_extra_mem(addr, size); |
668 | xen_max_p2m_pfn = PFN_DOWN(addr + size); | ||
656 | } else | 669 | } else |
657 | type = E820_UNUSABLE; | 670 | type = E820_UNUSABLE; |
658 | } | 671 | } |
659 | 672 | ||
660 | xen_align_and_add_e820_region(addr, size, type); | 673 | xen_align_and_add_e820_region(addr, size, type); |
661 | 674 | ||
662 | map[i].addr += size; | 675 | map[i].addr += size; |
663 | map[i].size -= size; | 676 | map[i].size -= size; |
664 | if (map[i].size == 0) | 677 | if (map[i].size == 0) |
665 | i++; | 678 | i++; |
666 | } | 679 | } |
667 | 680 | ||
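The loop above consumes the extra_pages budget front to back: RAM below mem_end stays RAM (clamped), RAM above it is turned into delayed extra memory until the budget runs out, and the remainder of the entry is marked unusable on the next iteration. A worked split under assumed numbers (illustrative only):

    /* Illustrative only: mem_end = 4 GiB, extra_pages = 0x10000 pages
     * (65536 * 4 KiB = 256 MiB). A RAM entry [4 GiB, 5 GiB) is split as:
     *   [4 GiB, 4 GiB + 256 MiB) -> xen_add_extra_mem(), budget now 0
     *   [4 GiB + 256 MiB, 5 GiB) -> E820_UNUSABLE on the next pass,
     * because the entry's addr/size are advanced in place and the loop
     * only moves to the next entry once size reaches zero. */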
668 | /* | 681 | /* |
669 | * Set the rest as identity mapped, in case PCI BARs are | 682 | * Set the rest as identity mapped, in case PCI BARs are |
670 | * located here. | 683 | * located here. |
671 | * | 684 | * |
672 | * PFNs above MAX_P2M_PFN are considered identity mapped as | 685 | * PFNs above MAX_P2M_PFN are considered identity mapped as |
673 | * well. | 686 | * well. |
674 | */ | 687 | */ |
675 | set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); | 688 | set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); |
676 | 689 | ||
677 | /* | 690 | /* |
678 | * In domU, the ISA region is normal, usable memory, but we | 691 | * In domU, the ISA region is normal, usable memory, but we |
679 | * reserve ISA memory anyway because too many things poke | 692 | * reserve ISA memory anyway because too many things poke |
680 | * about in there. | 693 | * about in there. |
681 | */ | 694 | */ |
682 | e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, | 695 | e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, |
683 | E820_RESERVED); | 696 | E820_RESERVED); |
684 | 697 | ||
685 | /* | 698 | /* |
686 | * Reserve Xen bits: | 699 | * Reserve Xen bits: |
687 | * - mfn_list | 700 | * - mfn_list |
688 | * - xen_start_info | 701 | * - xen_start_info |
689 | * See comment above "struct start_info" in <xen/interface/xen.h> | 702 | * See comment above "struct start_info" in <xen/interface/xen.h> |
690 | * We tried to make the memblock_reserve more selective so | 703 | * We tried to make the memblock_reserve more selective so
691 | * that it would be clear what region is reserved. Sadly we ran | 704 | * that it would be clear what region is reserved. Sadly we ran |
692 | * into the problem wherein on a 64-bit hypervisor with a 32-bit | 705 | * into the problem wherein on a 64-bit hypervisor with a 32-bit
693 | * initial domain, the pt_base has the cr3 value which is not | 706 | * initial domain, the pt_base has the cr3 value which is not |
694 | * necessarily where the pagetable starts! As Jan put it: " | 707 | * necessarily where the pagetable starts! As Jan put it: "
695 | * Actually, the adjustment turns out to be correct: The page | 708 | * Actually, the adjustment turns out to be correct: The page |
696 | * tables for a 32-on-64 dom0 get allocated in the order "first L1", | 709 | * tables for a 32-on-64 dom0 get allocated in the order "first L1", |
697 | * "first L2", "first L3", so the offset to the page table base is | 710 | * "first L2", "first L3", so the offset to the page table base is |
698 | * indeed 2. When reading xen/include/public/xen.h's comment | 711 | * indeed 2. When reading xen/include/public/xen.h's comment |
699 | * very strictly, this is not a violation (since nothing is said there | 712 | * very strictly, this is not a violation (since nothing is said there
700 | * that the first thing in the page table space is pointed to by | 713 | * that the first thing in the page table space is pointed to by |
701 | * pt_base; I admit that this seems to be implied though, namely | 714 | * pt_base; I admit that this seems to be implied though, namely |
702 | * do I think that it is implied that the page table space is the | 715 | * do I think that it is implied that the page table space is the |
703 | * range [pt_base, pt_base + nr_pt_frames), whereas that | 716 | * range [pt_base, pt_base + nr_pt_frames), whereas that
704 | * range here indeed is [pt_base - 2, pt_base - 2 + nr_pt_frames), | 717 | * range here indeed is [pt_base - 2, pt_base - 2 + nr_pt_frames),
705 | * which - without a priori knowledge - the kernel would have | 718 | * which - without a priori knowledge - the kernel would have |
706 | * difficulty figuring out)." - so let's just fall back to the | 719 | * difficulty figuring out)." - so let's just fall back to the
707 | * easy way and reserve the whole region. | 720 | * easy way and reserve the whole region. |
708 | */ | 721 | */ |
709 | memblock_reserve(__pa(xen_start_info->mfn_list), | 722 | memblock_reserve(__pa(xen_start_info->mfn_list), |
710 | xen_start_info->pt_base - xen_start_info->mfn_list); | 723 | xen_start_info->pt_base - xen_start_info->mfn_list); |
711 | 724 | ||
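A sketch of the 32-on-64 dom0 situation the quoted comment describes, restated with no new claims (the addresses are frame-granular, as in the quote):

    /* Illustrative only: on a 32-on-64 dom0 the page tables really occupy
     *   [pt_base - 2 frames, pt_base - 2 frames + nr_pt_frames)
     * while pt_base itself holds the cr3 value, two frames in. Rather
     * than encode that layout knowledge, everything from the MFN list up
     * to pt_base is reserved wholesale:
     *
     *   memblock_reserve(__pa(mfn_list), pt_base - mfn_list);
     */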
712 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); | 725 | sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); |
713 | 726 | ||
714 | return "Xen"; | 727 | return "Xen"; |
715 | } | 728 | } |
716 | 729 | ||
717 | /* | 730 | /* |
718 | * Machine specific memory setup for auto-translated guests. | 731 | * Machine specific memory setup for auto-translated guests. |
719 | */ | 732 | */ |
720 | char * __init xen_auto_xlated_memory_setup(void) | 733 | char * __init xen_auto_xlated_memory_setup(void) |
721 | { | 734 | { |
722 | static struct e820entry map[E820MAX] __initdata; | 735 | static struct e820entry map[E820MAX] __initdata; |
723 | 736 |
arch/x86/xen/xen-ops.h
1 | #ifndef XEN_OPS_H | 1 | #ifndef XEN_OPS_H |
2 | #define XEN_OPS_H | 2 | #define XEN_OPS_H |
3 | 3 | ||
4 | #include <linux/init.h> | 4 | #include <linux/init.h> |
5 | #include <linux/clocksource.h> | 5 | #include <linux/clocksource.h> |
6 | #include <linux/irqreturn.h> | 6 | #include <linux/irqreturn.h> |
7 | #include <xen/xen-ops.h> | 7 | #include <xen/xen-ops.h> |
8 | 8 | ||
9 | /* These are code, but not functions. Defined in entry.S */ | 9 | /* These are code, but not functions. Defined in entry.S */ |
10 | extern const char xen_hypervisor_callback[]; | 10 | extern const char xen_hypervisor_callback[]; |
11 | extern const char xen_failsafe_callback[]; | 11 | extern const char xen_failsafe_callback[]; |
12 | 12 | ||
13 | extern void *xen_initial_gdt; | 13 | extern void *xen_initial_gdt; |
14 | 14 | ||
15 | struct trap_info; | 15 | struct trap_info; |
16 | void xen_copy_trap_info(struct trap_info *traps); | 16 | void xen_copy_trap_info(struct trap_info *traps); |
17 | 17 | ||
18 | DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info); | 18 | DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info); |
19 | DECLARE_PER_CPU(unsigned long, xen_cr3); | 19 | DECLARE_PER_CPU(unsigned long, xen_cr3); |
20 | DECLARE_PER_CPU(unsigned long, xen_current_cr3); | 20 | DECLARE_PER_CPU(unsigned long, xen_current_cr3); |
21 | 21 | ||
22 | extern struct start_info *xen_start_info; | 22 | extern struct start_info *xen_start_info; |
23 | extern struct shared_info xen_dummy_shared_info; | 23 | extern struct shared_info xen_dummy_shared_info; |
24 | extern struct shared_info *HYPERVISOR_shared_info; | 24 | extern struct shared_info *HYPERVISOR_shared_info; |
25 | 25 | ||
26 | void xen_setup_mfn_list_list(void); | 26 | void xen_setup_mfn_list_list(void); |
27 | void xen_setup_shared_info(void); | 27 | void xen_setup_shared_info(void); |
28 | void xen_build_mfn_list_list(void); | 28 | void xen_build_mfn_list_list(void); |
29 | void xen_setup_machphys_mapping(void); | 29 | void xen_setup_machphys_mapping(void); |
30 | void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); | 30 | void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); |
31 | void xen_reserve_top(void); | 31 | void xen_reserve_top(void); |
32 | extern unsigned long xen_max_p2m_pfn; | ||
33 | 32 | ||
34 | void xen_mm_pin_all(void); | 33 | void xen_mm_pin_all(void); |
35 | void xen_mm_unpin_all(void); | 34 | void xen_mm_unpin_all(void); |
36 | 35 | ||
36 | unsigned long __ref xen_chk_extra_mem(unsigned long pfn); | ||
37 | void __init xen_inv_extra_mem(void); | ||
38 | void __init xen_remap_memory(void); | ||
37 | char * __init xen_memory_setup(void); | 39 | char * __init xen_memory_setup(void); |
38 | char * xen_auto_xlated_memory_setup(void); | 40 | char * xen_auto_xlated_memory_setup(void); |
39 | void __init xen_arch_setup(void); | 41 | void __init xen_arch_setup(void); |
40 | void xen_enable_sysenter(void); | 42 | void xen_enable_sysenter(void); |
41 | void xen_enable_syscall(void); | 43 | void xen_enable_syscall(void); |
42 | void xen_vcpu_restore(void); | 44 | void xen_vcpu_restore(void); |
43 | 45 | ||
44 | void xen_callback_vector(void); | 46 | void xen_callback_vector(void); |
45 | void xen_hvm_init_shared_info(void); | 47 | void xen_hvm_init_shared_info(void); |
46 | void xen_unplug_emulated_devices(void); | 48 | void xen_unplug_emulated_devices(void); |
47 | 49 | ||
48 | void __init xen_build_dynamic_phys_to_machine(void); | 50 | void __init xen_build_dynamic_phys_to_machine(void); |
49 | unsigned long __init xen_revector_p2m_tree(void); | 51 | void __init xen_vmalloc_p2m_tree(void); |
50 | 52 | ||
51 | void xen_init_irq_ops(void); | 53 | void xen_init_irq_ops(void); |
52 | void xen_setup_timer(int cpu); | 54 | void xen_setup_timer(int cpu); |
53 | void xen_setup_runstate_info(int cpu); | 55 | void xen_setup_runstate_info(int cpu); |
54 | void xen_teardown_timer(int cpu); | 56 | void xen_teardown_timer(int cpu); |
55 | cycle_t xen_clocksource_read(void); | 57 | cycle_t xen_clocksource_read(void); |
56 | void xen_setup_cpu_clockevents(void); | 58 | void xen_setup_cpu_clockevents(void); |
57 | void __init xen_init_time_ops(void); | 59 | void __init xen_init_time_ops(void); |
58 | void __init xen_hvm_init_time_ops(void); | 60 | void __init xen_hvm_init_time_ops(void); |
59 | 61 | ||
60 | irqreturn_t xen_debug_interrupt(int irq, void *dev_id); | 62 | irqreturn_t xen_debug_interrupt(int irq, void *dev_id); |
61 | 63 | ||
62 | bool xen_vcpu_stolen(int vcpu); | 64 | bool xen_vcpu_stolen(int vcpu); |
63 | 65 | ||
64 | void xen_setup_vcpu_info_placement(void); | 66 | void xen_setup_vcpu_info_placement(void); |
65 | 67 | ||
66 | #ifdef CONFIG_SMP | 68 | #ifdef CONFIG_SMP |
67 | void xen_smp_init(void); | 69 | void xen_smp_init(void); |
68 | void __init xen_hvm_smp_init(void); | 70 | void __init xen_hvm_smp_init(void); |
69 | 71 | ||
70 | extern cpumask_var_t xen_cpu_initialized_map; | 72 | extern cpumask_var_t xen_cpu_initialized_map; |
71 | #else | 73 | #else |
72 | static inline void xen_smp_init(void) {} | 74 | static inline void xen_smp_init(void) {} |
73 | static inline void xen_hvm_smp_init(void) {} | 75 | static inline void xen_hvm_smp_init(void) {} |
74 | #endif | 76 | #endif |
75 | 77 | ||
76 | #ifdef CONFIG_PARAVIRT_SPINLOCKS | 78 | #ifdef CONFIG_PARAVIRT_SPINLOCKS |
77 | void __init xen_init_spinlocks(void); | 79 | void __init xen_init_spinlocks(void); |
78 | void xen_init_lock_cpu(int cpu); | 80 | void xen_init_lock_cpu(int cpu); |
79 | void xen_uninit_lock_cpu(int cpu); | 81 | void xen_uninit_lock_cpu(int cpu); |
80 | #else | 82 | #else |
81 | static inline void xen_init_spinlocks(void) | 83 | static inline void xen_init_spinlocks(void) |
82 | { | 84 | { |
83 | } | 85 | } |
84 | static inline void xen_init_lock_cpu(int cpu) | 86 | static inline void xen_init_lock_cpu(int cpu) |
85 | { | 87 | { |
86 | } | 88 | } |
87 | static inline void xen_uninit_lock_cpu(int cpu) | 89 | static inline void xen_uninit_lock_cpu(int cpu) |
88 | { | 90 | { |
89 | } | 91 | } |
90 | #endif | 92 | #endif |
91 | 93 | ||
92 | struct dom0_vga_console_info; | 94 | struct dom0_vga_console_info; |
93 | 95 | ||
94 | #ifdef CONFIG_XEN_DOM0 | 96 | #ifdef CONFIG_XEN_DOM0 |
95 | void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); | 97 | void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); |
96 | void __init xen_init_apic(void); | 98 | void __init xen_init_apic(void); |
97 | #else | 99 | #else |
98 | static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, | 100 | static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, |
99 | size_t size) | 101 | size_t size) |
100 | { | 102 | { |
101 | } | 103 | } |
102 | static inline void __init xen_init_apic(void) | 104 | static inline void __init xen_init_apic(void) |
103 | { | 105 | { |
104 | } | 106 | } |
105 | #endif | 107 | #endif |
106 | 108 | ||
107 | #ifdef CONFIG_XEN_EFI | 109 | #ifdef CONFIG_XEN_EFI |
108 | extern void xen_efi_init(void); | 110 | extern void xen_efi_init(void); |
109 | #else | 111 | #else |
110 | static inline void __init xen_efi_init(void) | 112 | static inline void __init xen_efi_init(void) |
111 | { | 113 | { |
112 | } | 114 | } |
113 | #endif | 115 | #endif |
114 | 116 | ||
115 | /* Declare an asm function, along with symbols needed to make it | 117 | /* Declare an asm function, along with symbols needed to make it |
116 | inlineable */ | 118 | inlineable */ |
117 | #define DECL_ASM(ret, name, ...) \ | 119 | #define DECL_ASM(ret, name, ...) \ |
118 | __visible ret name(__VA_ARGS__); \ | 120 | __visible ret name(__VA_ARGS__); \ |
119 | extern char name##_end[] __visible; \ | 121 | extern char name##_end[] __visible; \ |
120 | extern char name##_reloc[] __visible | 122 | extern char name##_reloc[] __visible |
121 | 123 | ||
122 | DECL_ASM(void, xen_irq_enable_direct, void); | 124 | DECL_ASM(void, xen_irq_enable_direct, void); |
123 | DECL_ASM(void, xen_irq_disable_direct, void); | 125 | DECL_ASM(void, xen_irq_disable_direct, void); |
124 | DECL_ASM(unsigned long, xen_save_fl_direct, void); | 126 | DECL_ASM(unsigned long, xen_save_fl_direct, void); |
125 | DECL_ASM(void, xen_restore_fl_direct, unsigned long); | 127 | DECL_ASM(void, xen_restore_fl_direct, unsigned long); |
126 | 128 | ||
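For reference, the first use above, DECL_ASM(void, xen_irq_enable_direct, void), expands mechanically to roughly the following (a sketch of the macro's output, not separate declarations in the header):

    __visible void xen_irq_enable_direct(void);
    extern char xen_irq_enable_direct_end[] __visible;
    extern char xen_irq_enable_direct_reloc[] __visible;

which gives the patching code the start, end, and relocation symbols it needs to inline the asm body, per the comment above the macro.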
127 | /* These are not functions, and cannot be called normally */ | 129 | /* These are not functions, and cannot be called normally */ |
128 | __visible void xen_iret(void); | 130 | __visible void xen_iret(void); |
129 | __visible void xen_sysexit(void); | 131 | __visible void xen_sysexit(void); |
130 | __visible void xen_sysret32(void); | 132 | __visible void xen_sysret32(void); |
131 | __visible void xen_sysret64(void); | 133 | __visible void xen_sysret64(void); |
132 | __visible void xen_adjust_exception_frame(void); | 134 | __visible void xen_adjust_exception_frame(void); |
133 | 135 | ||
134 | extern int xen_panic_handler_init(void); | 136 | extern int xen_panic_handler_init(void); |
135 | 137 | ||
136 | void xen_pvh_secondary_vcpu_init(int cpu); | 138 | void xen_pvh_secondary_vcpu_init(int cpu); |
137 | #endif /* XEN_OPS_H */ | 139 | #endif /* XEN_OPS_H */ |