Blame view
drivers/lguest/page_tables.c
35.4 KB
2e04ef769 lguest: fix comme... |
1 2 |
/*P:700 * The pagetable code, on the other hand, still shows the scars of |
f938d2c89 lguest: documenta... |
3 4 5 |
* previous encounters. It's functional, and as neat as it can be in the * circumstances, but be wary, for these things are subtle and break easily. * The Guest provides a virtual to physical mapping, but we can neither trust |
a6bd8e130 lguest: comment d... |
6 |
* it nor use it: we verify and convert it here then point the CPU to the |
2e04ef769 lguest: fix comme... |
7 8 |
* converted Guest pages when running the Guest. :*/ |
f938d2c89 lguest: documenta... |
9 10 |
/* Copyright (C) Rusty Russell IBM Corporation 2006. |
d7e28ffe6 lguest: the host ... |
11 12 |
* GPL v2 and any later version */ #include <linux/mm.h> |
5a0e3ad6a include cleanup: ... |
13 |
#include <linux/gfp.h> |
d7e28ffe6 lguest: the host ... |
14 15 16 17 18 |
#include <linux/types.h> #include <linux/spinlock.h> #include <linux/random.h> #include <linux/percpu.h> #include <asm/tlbflush.h> |
47436aa4a Boot with virtual... |
19 |
#include <asm/uaccess.h> |
d7e28ffe6 lguest: the host ... |
20 |
#include "lg.h" |
2e04ef769 lguest: fix comme... |
21 22 |
/*M:008 * We hold reference to pages, which prevents them from being swapped. |
f56a384e9 lguest: documenta... |
23 24 |
* It'd be nice to have a callback in the "struct mm_struct" when Linux wants * to swap out. If we had this, and a shrinker callback to trim PTE pages, we |
2e04ef769 lguest: fix comme... |
25 26 |
* could probably consider launching Guests as non-root. :*/ |
f56a384e9 lguest: documenta... |
27 |
|
bff672e63 lguest: documenta... |
28 29 30 |
/*H:300 * The Page Table Code * |
a91d74a3c lguest: update co... |
31 32 33 34 |
* We use two-level page tables for the Guest, or three-level with PAE. If * you're not entirely comfortable with virtual addresses, physical addresses * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page * Table Handling" (with diagrams!). |
bff672e63 lguest: documenta... |
35 36 37 38 39 40 41 42 43 |
* * The Guest keeps page tables, but we maintain the actual ones here: these are * called "shadow" page tables. Which is a very Guest-centric name: these are * the real page tables the CPU uses, although we keep them up to date to * reflect the Guest's. (See what I mean about weird naming? Since when do * shadows reflect anything?) * * Anyway, this is the most complicated part of the Host code. There are seven * parts to this: |
e1e72965e lguest: documenta... |
44 45 46 |
* (i) Looking up a page table entry when the Guest faults, * (ii) Making sure the Guest stack is mapped, * (iii) Setting up a page table entry when the Guest tells us one has changed, |
bff672e63 lguest: documenta... |
47 |
* (iv) Switching page tables, |
e1e72965e lguest: documenta... |
48 |
* (v) Flushing (throwing away) page tables, |
bff672e63 lguest: documenta... |
49 50 |
* (vi) Mapping the Switcher when the Guest is about to run, * (vii) Setting up the page tables initially. |
2e04ef769 lguest: fix comme... |
51 |
:*/ |
bff672e63 lguest: documenta... |
52 |
|
2e04ef769 lguest: fix comme... |
53 |
/* |
a91d74a3c lguest: update co... |
54 55 |
* The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) * or 512 PTE entries with PAE (2MB). |
2e04ef769 lguest: fix comme... |
56 |
*/ |
df29f43e6 Pagetables to use... |
57 |
#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) |
d7e28ffe6 lguest: the host ... |
58 |
|
2e04ef769 lguest: fix comme... |
59 60 61 62 |
/* * For PAE we need the PMD index as well. We use the last 2MB, so we * will need the last pmd entry of the last pmd page. */ |
acdd0b629 lguest: PAE support |
63 64 65 66 67 68 69 70 |
#ifdef CONFIG_X86_PAE #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) #define RESERVE_MEM 2U #define CHECK_GPGD_MASK _PAGE_PRESENT #else #define RESERVE_MEM 4U #define CHECK_GPGD_MASK _PAGE_TABLE #endif |
2e04ef769 lguest: fix comme... |
71 72 |
/* * We actually need a separate PTE page for each CPU. Remember that after the |
bff672e63 lguest: documenta... |
73 |
* Switcher code itself comes two pages for each CPU, and we don't want this |
2e04ef769 lguest: fix comme... |
74 75 |
* CPU's guest to see the pages of any other CPU. */ |
df29f43e6 Pagetables to use... |
76 |
static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); |
d7e28ffe6 lguest: the host ... |
77 |
#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) |
2e04ef769 lguest: fix comme... |
78 79 |
/*H:320 * The page table code is curly enough to need helper functions to keep it |
a91d74a3c lguest: update co... |
80 81 |
* clear and clean. The kernel itself provides many of them; one advantage * of insisting that the Guest and Host use the same CONFIG_PAE setting. |
bff672e63 lguest: documenta... |
82 |
* |
df29f43e6 Pagetables to use... |
83 |
* There are two functions which return pointers to the shadow (aka "real") |
bff672e63 lguest: documenta... |
84 85 86 |
* page tables. * * spgd_addr() takes the virtual address and returns a pointer to the top-level |
e1e72965e lguest: documenta... |
87 88 |
* page directory entry (PGD) for that address. Since we keep track of several * page tables, the "i" argument tells us which one we're interested in (it's |
2e04ef769 lguest: fix comme... |
89 90 |
* usually the current one). */ |
382ac6b3f lguest: get rid o... |
91 |
/*
 * Return a pointer to the shadow PGD entry covering @vaddr in the i'th
 * shadow page table we keep for this Guest.
 */
static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
{
	unsigned int pgd_idx = pgd_index(vaddr);

#ifndef CONFIG_X86_PAE
	/* The Switcher's PGD slot is off-limits to the Guest. */
	if (pgd_idx >= SWITCHER_PGD_INDEX) {
		kill_guest(cpu, "attempt to access switcher pages");
		pgd_idx = 0;
	}
#endif
	/* Hand back the pgd_idx'th entry of the i'th shadow pgdir. */
	return &cpu->lg->pgdirs[i].pgdir[pgd_idx];
}
acdd0b629 lguest: PAE support |
105 |
#ifdef CONFIG_X86_PAE |
2e04ef769 lguest: fix comme... |
106 107 |
/* * This routine then takes the PGD entry given above, which contains the |
acdd0b629 lguest: PAE support |
108 |
* address of the PMD page. It then returns a pointer to the PMD entry for the |
2e04ef769 lguest: fix comme... |
109 110 |
* given address. */ |
acdd0b629 lguest: PAE support |
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { unsigned int index = pmd_index(vaddr); pmd_t *page; /* We kill any Guest trying to touch the Switcher addresses. */ if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && index >= SWITCHER_PMD_INDEX) { kill_guest(cpu, "attempt to access switcher pages"); index = 0; } /* You should never call this if the PGD entry wasn't valid */ BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); page = __va(pgd_pfn(spgd) << PAGE_SHIFT); return &page[index]; } #endif |
2e04ef769 lguest: fix comme... |
130 131 |
/* * This routine then takes the page directory entry returned above, which |
e1e72965e lguest: documenta... |
132 |
* contains the address of the page table entry (PTE) page. It then returns a |
2e04ef769 lguest: fix comme... |
133 134 |
* pointer to the PTE entry for the given address. */ |
acdd0b629 lguest: PAE support |
135 |
/*
 * Return a pointer to the shadow PTE entry for @vaddr, given the (present)
 * shadow PGD entry.  With PAE we go through the mid-level PMD first.
 */
static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
{
#ifdef CONFIG_X86_PAE
	pmd_t *pmd = spmd_addr(cpu, spgd, vaddr);
	pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT);

	/* You should never call this if the PMD entry wasn't valid */
	BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT));
#else
	pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
	/* You should never call this if the PGD entry wasn't valid */
	BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
#endif
	return &page[pte_index(vaddr)];
}
2e04ef769 lguest: fix comme... |
150 |
/* |
9f54288de lguest: update co... |
151 |
* These functions are just like the above, except they access the Guest |
2e04ef769 lguest: fix comme... |
152 153 |
* page tables. Hence they return a Guest address. */ |
1713608f2 lguest: per-vcpu ... |
154 |
static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) |
d7e28ffe6 lguest: the host ... |
155 |
{ |
df29f43e6 Pagetables to use... |
156 |
unsigned int index = vaddr >> (PGDIR_SHIFT); |
1713608f2 lguest: per-vcpu ... |
157 |
return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); |
d7e28ffe6 lguest: the host ... |
158 |
} |
acdd0b629 lguest: PAE support |
159 |
#ifdef CONFIG_X86_PAE |
a91d74a3c lguest: update co... |
160 |
/* Follow the PGD to the PMD. */ |
acdd0b629 lguest: PAE support |
161 |
/* Follow the Guest PGD entry down to the Guest address of its PMD entry. */
static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
{
	unsigned long gpage;

	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
	gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
	return gpage + pmd_index(vaddr) * sizeof(pmd_t);
}
acdd0b629 lguest: PAE support |
167 |
|
a91d74a3c lguest: update co... |
168 |
/* Follow the PMD to the PTE. */ |
acdd0b629 lguest: PAE support |
169 |
static unsigned long gpte_addr(struct lg_cpu *cpu, |
92b4d8df8 lguest: PAE fixes |
170 |
pmd_t gpmd, unsigned long vaddr) |
acdd0b629 lguest: PAE support |
171 |
{ |
92b4d8df8 lguest: PAE fixes |
172 |
unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT; |
acdd0b629 lguest: PAE support |
173 |
|
acdd0b629 lguest: PAE support |
174 |
BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT)); |
92b4d8df8 lguest: PAE fixes |
175 176 |
return gpage + pte_index(vaddr) * sizeof(pte_t); } |
acdd0b629 lguest: PAE support |
177 |
#else |
a91d74a3c lguest: update co... |
178 |
/* Follow the PGD to the PTE (no mid-level for !PAE). */ |
92b4d8df8 lguest: PAE fixes |
179 180 181 182 183 184 |
/* Without PAE there is no mid-level: go straight from the PGD to the PTE. */
static unsigned long gpte_addr(struct lg_cpu *cpu,
			       pgd_t gpgd, unsigned long vaddr)
{
	unsigned long gpage;

	BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
	gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
	return gpage + pte_index(vaddr) * sizeof(pte_t);
}
92b4d8df8 lguest: PAE fixes |
187 |
#endif |
a6bd8e130 lguest: comment d... |
188 |
/*:*/ |
9f54288de lguest: update co... |
189 |
/*M:007 |
2e04ef769 lguest: fix comme... |
190 191 192 |
* get_pfn is slow: we could probably try to grab batches of pages here as * an optimization (ie. pre-faulting). :*/ |
d7e28ffe6 lguest: the host ... |
193 |
|
2e04ef769 lguest: fix comme... |
194 195 |
/*H:350 * This routine takes a page number given by the Guest and converts it to |
bff672e63 lguest: documenta... |
196 197 198 199 200 |
* an actual, physical page number. It can fail for several reasons: the * virtual address might not be mapped by the Launcher, the write flag is set * and the page is read-only, or the write flag was set and the page was * shared so had to be copied, but we ran out of memory. * |
a6bd8e130 lguest: comment d... |
201 |
* This holds a reference to the page, so release_pte() is careful to put that |
2e04ef769 lguest: fix comme... |
202 203 |
* back. */ |
d7e28ffe6 lguest: the host ... |
204 205 206 |
static unsigned long get_pfn(unsigned long virtpfn, int write) { struct page *page; |
71a3f4edc lguest: use get_u... |
207 208 209 210 |
/* gup me one page at this address please! */ if (get_user_pages_fast(virtpfn << PAGE_SHIFT, 1, write, &page) == 1) return page_to_pfn(page); |
bff672e63 lguest: documenta... |
211 |
/* This value indicates failure. */ |
71a3f4edc lguest: use get_u... |
212 |
return -1UL; |
d7e28ffe6 lguest: the host ... |
213 |
} |
2e04ef769 lguest: fix comme... |
214 215 |
/*H:340 * Converting a Guest page table entry to a shadow (ie. real) page table |
bff672e63 lguest: documenta... |
216 217 |
* entry can be a little tricky. The flags are (almost) the same, but the * Guest PTE contains a virtual page number: the CPU needs the real page |
2e04ef769 lguest: fix comme... |
218 219 |
* number. */ |
382ac6b3f lguest: get rid o... |
220 |
/*
 * Convert a Guest PTE into a shadow (real) PTE: same flags (minus
 * _PAGE_GLOBAL), but the virtual page number is replaced by the real one.
 * On failure the Guest is killed and a non-present PTE (flags = 0) is
 * returned so teardown won't try to release it.
 */
static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
{
	unsigned long pfn, base, flags;

	/*
	 * The Guest sets the global flag, because it thinks that it is using
	 * PGE.  We only told it to use PGE so it would tell us whether it was
	 * flushing a kernel mapping or a userspace mapping.  We don't actually
	 * use the global bit, so throw it away.
	 */
	flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);

	/* The Guest's pages are offset inside the Launcher. */
	base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;

	/*
	 * We need a temporary "unsigned long" variable to hold the answer from
	 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
	 * fit in spte.pfn.  get_pfn() finds the real physical number of the
	 * page, given the virtual number.
	 */
	pfn = get_pfn(base + pte_pfn(gpte), write);
	if (pfn == -1UL) {
		kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
		/*
		 * When we destroy the Guest, we'll go through the shadow page
		 * tables and release_pte() them.  Make sure we don't think
		 * this one is valid!
		 */
		flags = 0;
	}
	/* Now we assemble our shadow PTE from the page number and flags. */
	return pfn_pte(pfn, __pgprot(flags));
}
bff672e63 lguest: documenta... |
254 |
/*H:460 And to complete the chain, release_pte() looks like this: */ |
df29f43e6 Pagetables to use... |
255 |
/* Drop the page reference that get_pfn() took for a present shadow PTE. */
static void release_pte(pte_t pte)
{
	/* Non-present entries never held a reference: nothing to put back. */
	if (!(pte_flags(pte) & _PAGE_PRESENT))
		return;

	put_page(pte_page(pte));
}
bff672e63 lguest: documenta... |
264 |
/*:*/ |
d7e28ffe6 lguest: the host ... |
265 |
|
382ac6b3f lguest: get rid o... |
266 |
/*
 * Sanity-check a Guest PTE: no huge-page flag, and the page number must lie
 * below the Guest's pfn_limit (so it can't map the Launcher binary).
 */
static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
{
	bool bad_flags = (pte_flags(gpte) & _PAGE_PSE) != 0;
	bool bad_pfn = pte_pfn(gpte) >= cpu->lg->pfn_limit;

	if (bad_flags || bad_pfn)
		kill_guest(cpu, "bad page table entry");
}
382ac6b3f lguest: get rid o... |
272 |
/*
 * Sanity-check a Guest PGD entry: only the CHECK_GPGD_MASK flags may be set,
 * and the page number must lie below the Guest's pfn_limit.
 */
static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
{
	bool bad_flags = (pgd_flags(gpgd) & ~CHECK_GPGD_MASK) != 0;
	bool bad_pfn = pgd_pfn(gpgd) >= cpu->lg->pfn_limit;

	if (bad_flags || bad_pfn)
		kill_guest(cpu, "bad page directory entry");
}
acdd0b629 lguest: PAE support |
278 279 280 281 282 283 284 285 |
#ifdef CONFIG_X86_PAE
/*
 * Sanity-check a Guest PMD entry: only _PAGE_TABLE flags may be set, and the
 * page number must lie below the Guest's pfn_limit.
 */
static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
{
	if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
	    (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
		kill_guest(cpu, "bad page middle directory entry");
}
#endif
bff672e63 lguest: documenta... |
286 |
/*H:330 |
e1e72965e lguest: documenta... |
287 |
* (i) Looking up a page table entry when the Guest faults. |
bff672e63 lguest: documenta... |
288 289 290 291 292 293 294 |
* * We saw this call in run_guest(): when we see a page fault in the Guest, we * come here. That's because we only set up the shadow page tables lazily as * they're needed, so we get page faults all the time and quietly fix them up * and return to the Guest without it knowing. * * If we fixed up the fault (ie. we mapped the address), this routine returns |
2e04ef769 lguest: fix comme... |
295 296 |
* true. Otherwise, it was a real fault and we need to tell the Guest. */ |
df1693abc lguest: use bool ... |
297 |
/*
 * Handle a Guest page fault at @vaddr with page-fault error code @errcode:
 * walk the Guest's page tables (or fake a linear mapping), build/refresh the
 * corresponding shadow entries, and write the accessed/dirty bits back to the
 * Guest PTE.  Returns true if the fault was fixed up, false if it is a real
 * fault the Guest must handle.
 */
bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
{
	pgd_t gpgd;
	pgd_t *spgd;
	unsigned long gpte_ptr;
	pte_t gpte;
	pte_t *spte;

	/* Mid level for PAE. */
#ifdef CONFIG_X86_PAE
	pmd_t *spmd;
	pmd_t gpmd;
#endif

	/* First step: get the top-level Guest page table entry. */
	if (unlikely(cpu->linear_pages)) {
		/* Faking up a linear mapping. */
		gpgd = __pgd(CHECK_GPGD_MASK);
	} else {
		gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
		/* Toplevel not present?  We can't map it in. */
		if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
			return false;
	}

	/* Now look at the matching shadow entry. */
	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
		/* No shadow entry: allocate a new shadow PTE page. */
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
		/*
		 * This is not really the Guest's fault, but killing it is
		 * simple for this corner case.
		 */
		if (!ptepage) {
			kill_guest(cpu, "out of memory allocating pte page");
			return false;
		}
		/* We check that the Guest pgd is OK. */
		check_gpgd(cpu, gpgd);
		/*
		 * And we copy the flags to the shadow PGD entry.  The page
		 * number in the shadow PGD is the page we just allocated.
		 */
		set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
	}
#ifdef CONFIG_X86_PAE
	if (unlikely(cpu->linear_pages)) {
		/* Faking up a linear mapping. */
		gpmd = __pmd(_PAGE_TABLE);
	} else {
		gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
		/* Middle level not present?  We can't map it in. */
		if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
			return false;
	}

	/* Now look at the matching shadow entry. */
	spmd = spmd_addr(cpu, *spgd, vaddr);
	if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
		/* No shadow entry: allocate a new shadow PTE page. */
		unsigned long ptepage = get_zeroed_page(GFP_KERNEL);

		/*
		 * This is not really the Guest's fault, but killing it is
		 * simple for this corner case.
		 */
		if (!ptepage) {
			kill_guest(cpu, "out of memory allocating pte page");
			return false;
		}

		/* We check that the Guest pmd is OK. */
		check_gpmd(cpu, gpmd);

		/*
		 * And we copy the flags to the shadow PMD entry.  The page
		 * number in the shadow PMD is the page we just allocated.
		 */
		set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
	}

	/*
	 * OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later.
	 */
	gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
#else
	/*
	 * OK, now we look at the lower level in the Guest page table: keep its
	 * address, because we might update it later.
	 */
	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
#endif

	if (unlikely(cpu->linear_pages)) {
		/* Linear?  Make up a PTE which points to same page. */
		gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
	} else {
		/* Read the actual PTE value. */
		gpte = lgread(cpu, gpte_ptr, pte_t);
	}

	/* If this page isn't in the Guest page tables, we can't page it in. */
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		return false;

	/*
	 * Check they're not trying to write to a page the Guest wants
	 * read-only (bit 2 of errcode == write).
	 */
	if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
		return false;

	/* User access to a kernel-only page? (bit 3 == user access) */
	if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
		return false;

	/*
	 * Check that the Guest PTE flags are OK, and the page number is below
	 * the pfn_limit (ie. not mapping the Launcher binary).
	 */
	check_gpte(cpu, gpte);

	/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
	gpte = pte_mkyoung(gpte);
	if (errcode & 2)
		gpte = pte_mkdirty(gpte);

	/* Get the pointer to the shadow PTE entry we're going to set. */
	spte = spte_addr(cpu, *spgd, vaddr);

	/*
	 * If there was a valid shadow PTE entry here before, we release it.
	 * This can happen with a write to a previously read-only entry.
	 */
	release_pte(*spte);

	/*
	 * If this is a write, we insist that the Guest page is writable (the
	 * final arg to gpte_to_spte()).
	 */
	if (pte_dirty(gpte))
		*spte = gpte_to_spte(cpu, gpte, 1);
	else
		/*
		 * If this is a read, don't set the "writable" bit in the page
		 * table entry, even if the Guest says it's writable.  That way
		 * we will come back here when a write does actually occur, so
		 * we can update the Guest's _PAGE_DIRTY flag.
		 */
		set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));

	/*
	 * Finally, we write the Guest PTE entry back: we've set the
	 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
	 */
	if (likely(!cpu->linear_pages))
		lgwrite(cpu, gpte_ptr, pte_t, gpte);

	/*
	 * The fault is fixed, the page table is populated, the mapping
	 * manipulated, the result returned and the code complete.  A small
	 * delay and a trace of alliteration are the only indications the Guest
	 * has that a page fault occurred at all.
	 */
	return true;
}
e1e72965e lguest: documenta... |
462 463 |
/*H:360 * (ii) Making sure the Guest stack is mapped. |
bff672e63 lguest: documenta... |
464 |
* |
e1e72965e lguest: documenta... |
465 466 467 468 |
* Remember that direct traps into the Guest need a mapped Guest kernel stack. * pin_stack_pages() calls us here: we could simply call demand_page(), but as * we've seen that logic is quite long, and usually the stack pages are already * mapped, so it's overkill. |
bff672e63 lguest: documenta... |
469 470 |
* * This is a quick version which answers the question: is this virtual address |
2e04ef769 lguest: fix comme... |
471 472 |
* mapped by the shadow page tables, and is it writable? */ |
df1693abc lguest: use bool ... |
473 |
/*
 * Quick check: is @vaddr currently mapped writable by the shadow page
 * tables?  Used by pin_page() to skip the full demand_page() walk for
 * already-mapped stack pages.
 */
static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
{
	pgd_t *spgd;
	unsigned long flags;
#ifdef CONFIG_X86_PAE
	pmd_t *spmd;
#endif
	/* Look at the current top level entry: is it present? */
	spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
	if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
		return false;

#ifdef CONFIG_X86_PAE
	/* With PAE, also check the mid-level entry is present. */
	spmd = spmd_addr(cpu, *spgd, vaddr);
	if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
		return false;
#endif
	/*
	 * Check the flags on the pte entry itself: it must be present and
	 * writable.
	 */
	flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));

	return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}
2e04ef769 lguest: fix comme... |
498 499 |
/* * So, when pin_stack_pages() asks us to pin a page, we check if it's already |
bff672e63 lguest: documenta... |
500 |
* in the page tables, and if not, we call demand_page() with error code 2 |
2e04ef769 lguest: fix comme... |
501 502 |
* (meaning "write"). */ |
1713608f2 lguest: per-vcpu ... |
503 |
/*
 * Make sure the Guest stack page at @vaddr is mapped writable, faulting it
 * in via demand_page() (error code 2 == write) if necessary.
 */
void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
{
	/* Already mapped writable?  Nothing to do. */
	if (page_writable(cpu, vaddr))
		return;

	/* Otherwise fault it in as a write; failure is fatal to the Guest. */
	if (!demand_page(cpu, vaddr, 2))
		kill_guest(cpu, "bad stack page %#lx", vaddr);
}
a91d74a3c lguest: update co... |
508 |
/*:*/ |
d7e28ffe6 lguest: the host ... |
509 |
|
acdd0b629 lguest: PAE support |
510 511 512 513 514 515 516 517 518 519 520 521 522 |
#ifdef CONFIG_X86_PAE
/* Release one shadow PMD entry: every PTE beneath it, then the PTE page. */
static void release_pmd(pmd_t *spmd)
{
	/* If the entry's not present, there's nothing to release. */
	if (pmd_flags(*spmd) & _PAGE_PRESENT) {
		unsigned int i;
		pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
		/* For each entry in the page, we might need to release it. */
		for (i = 0; i < PTRS_PER_PTE; i++)
			release_pte(ptepage[i]);
		/* Now we can free the page of PTEs */
		free_page((long)ptepage);
		/* And zero out the PMD entry so we never release it twice. */
		set_pmd(spmd, __pmd(0));
	}
}

/* Release one shadow PGD entry: every PMD beneath it, then the PMD page. */
static void release_pgd(pgd_t *spgd)
{
	/* If the entry's not present, there's nothing to release. */
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
		unsigned int i;
		pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);

		/* Release each mid-level entry (which releases its PTEs). */
		for (i = 0; i < PTRS_PER_PMD; i++)
			release_pmd(&pmdpage[i]);

		/* Now we can free the page of PMDs */
		free_page((long)pmdpage);
		/* And zero out the PGD entry so we never release it twice. */
		set_pgd(spgd, __pgd(0));
	}
}
#else /* !CONFIG_X86_PAE */
a91d74a3c lguest: update co... |
545 546 547 548 549 |
/*H:450 * If we chase down the release_pgd() code, the non-PAE version looks like * this. The PAE version is almost identical, but instead of calling * release_pte it calls release_pmd(), which looks much like this. */ |
90603d15f lguest: use nativ... |
550 |
/*
 * Release one shadow PGD entry: drop the reference on every PTE beneath it,
 * free the PTE page, and clear the entry so it is never released twice.
 */
static void release_pgd(pgd_t *spgd)
{
	/* If the entry's not present, there's nothing to release. */
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
		unsigned int i;
		/*
		 * Converting the pfn to find the actual PTE page is easy: turn
		 * the page number into a physical address, then convert to a
		 * virtual address (easy for kernel pages like this one).
		 */
		pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
		/* For each entry in the page, we might need to release it. */
		for (i = 0; i < PTRS_PER_PTE; i++)
			release_pte(ptepage[i]);
		/* Now we can free the page of PTEs */
		free_page((long)ptepage);
		/*
		 * And zero out the PGD entry so we never release it twice.
		 * Use set_pgd() for consistency with the PAE version above
		 * and the other shadow-table updates in this file, instead
		 * of a raw store.
		 */
		set_pgd(spgd, __pgd(0));
	}
}
acdd0b629 lguest: PAE support |
570 |
#endif |
2e04ef769 lguest: fix comme... |
571 572 573 |
/*H:445 * We saw flush_user_mappings() twice: once from the flush_user_mappings() |
e1e72965e lguest: documenta... |
574 |
* hypercall and once in new_pgdir() when we re-used a top-level pgdir page. |
2e04ef769 lguest: fix comme... |
575 576 |
* It simply releases every PTE page from 0 up to the Guest's kernel address. */ |
d7e28ffe6 lguest: the host ... |
577 578 579 |
static void flush_user_mappings(struct lguest *lg, int idx) { unsigned int i; |
bff672e63 lguest: documenta... |
580 |
/* Release every pgd entry up to the kernel's address. */ |
47436aa4a Boot with virtual... |
581 |
for (i = 0; i < pgd_index(lg->kernel_address); i++) |
90603d15f lguest: use nativ... |
582 |
release_pgd(lg->pgdirs[idx].pgdir + i); |
d7e28ffe6 lguest: the host ... |
583 |
} |
2e04ef769 lguest: fix comme... |
584 585 |
/*H:440 * (v) Flushing (throwing away) page tables, |
e1e72965e lguest: documenta... |
586 587 |
* * The Guest has a hypercall to throw away the page tables: it's used when a |
2e04ef769 lguest: fix comme... |
588 589 |
* large number of mappings have been changed. */ |
1713608f2 lguest: per-vcpu ... |
590 |
void guest_pagetable_flush_user(struct lg_cpu *cpu) |
d7e28ffe6 lguest: the host ... |
591 |
{ |
bff672e63 lguest: documenta... |
592 |
/* Drop the userspace part of the current page table. */ |
1713608f2 lguest: per-vcpu ... |
593 |
flush_user_mappings(cpu->lg, cpu->cpu_pgd); |
d7e28ffe6 lguest: the host ... |
594 |
} |
bff672e63 lguest: documenta... |
595 |
/*:*/ |
d7e28ffe6 lguest: the host ... |
596 |
|
47436aa4a Boot with virtual... |
597 |
/* We walk down the guest page tables to get a guest-physical address */ |
1713608f2 lguest: per-vcpu ... |
598 |
unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) |
47436aa4a Boot with virtual... |
599 600 601 |
{ pgd_t gpgd; pte_t gpte; |
acdd0b629 lguest: PAE support |
602 603 604 |
#ifdef CONFIG_X86_PAE pmd_t gpmd; #endif |
5dea1c88e lguest: use a spe... |
605 606 607 608 |
/* Still not set up? Just map 1:1. */ if (unlikely(cpu->linear_pages)) return vaddr; |
47436aa4a Boot with virtual... |
609 |
/* First step: get the top-level Guest page table entry. */ |
382ac6b3f lguest: get rid o... |
610 |
gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); |
47436aa4a Boot with virtual... |
611 |
/* Toplevel not present? We can't map it in. */ |
6afbdd059 lguest: fix spuri... |
612 |
if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) { |
382ac6b3f lguest: get rid o... |
613 |
kill_guest(cpu, "Bad address %#lx", vaddr); |
6afbdd059 lguest: fix spuri... |
614 615 |
return -1UL; } |
47436aa4a Boot with virtual... |
616 |
|
acdd0b629 lguest: PAE support |
617 618 619 620 |
#ifdef CONFIG_X86_PAE gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) kill_guest(cpu, "Bad address %#lx", vaddr); |
92b4d8df8 lguest: PAE fixes |
621 622 |
gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); #else |
acdd0b629 lguest: PAE support |
623 |
gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); |
92b4d8df8 lguest: PAE fixes |
624 |
#endif |
47436aa4a Boot with virtual... |
625 |
if (!(pte_flags(gpte) & _PAGE_PRESENT)) |
382ac6b3f lguest: get rid o... |
626 |
kill_guest(cpu, "Bad address %#lx", vaddr); |
47436aa4a Boot with virtual... |
627 628 629 |
return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); } |
2e04ef769 lguest: fix comme... |
630 631 |
/* * We keep several page tables. This is a simple routine to find the page |
bff672e63 lguest: documenta... |
632 |
* table (if any) corresponding to this top-level address the Guest has given |
2e04ef769 lguest: fix comme... |
633 634 |
* us. */ |
d7e28ffe6 lguest: the host ... |
635 636 637 638 |
static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) { unsigned int i; for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) |
4357bd945 lguest: Revert 1c... |
639 |
if (lg->pgdirs[i].pgdir && lg->pgdirs[i].gpgdir == pgtable) |
d7e28ffe6 lguest: the host ... |
640 641 642 |
break; return i; } |
2e04ef769 lguest: fix comme... |
643 644 |
/*H:435 * And this is us, creating the new page directory. If we really do |
bff672e63 lguest: documenta... |
645 |
* allocate a new one (and so the kernel parts are not there), we set |
2e04ef769 lguest: fix comme... |
646 647 |
* blank_pgdir. */ |
1713608f2 lguest: per-vcpu ... |
648 |
static unsigned int new_pgdir(struct lg_cpu *cpu, |
ee3db0f2b Rename "cr3" to "... |
649 |
unsigned long gpgdir, |
d7e28ffe6 lguest: the host ... |
650 651 652 |
int *blank_pgdir) { unsigned int next; |
acdd0b629 lguest: PAE support |
653 654 655 |
#ifdef CONFIG_X86_PAE pmd_t *pmd_table; #endif |
d7e28ffe6 lguest: the host ... |
656 |
|
2e04ef769 lguest: fix comme... |
657 658 659 660 |
/* * We pick one entry at random to throw out. Choosing the Least * Recently Used might be better, but this is easy. */ |
382ac6b3f lguest: get rid o... |
661 |
next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); |
bff672e63 lguest: documenta... |
662 |
/* If it's never been allocated at all before, try now. */ |
382ac6b3f lguest: get rid o... |
663 664 665 |
if (!cpu->lg->pgdirs[next].pgdir) { cpu->lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); |
bff672e63 lguest: documenta... |
666 |
/* If the allocation fails, just keep using the one we have */ |
382ac6b3f lguest: get rid o... |
667 |
if (!cpu->lg->pgdirs[next].pgdir) |
1713608f2 lguest: per-vcpu ... |
668 |
next = cpu->cpu_pgd; |
acdd0b629 lguest: PAE support |
669 670 |
else { #ifdef CONFIG_X86_PAE |
2e04ef769 lguest: fix comme... |
671 672 673 674 |
/* * In PAE mode, allocate a pmd page and populate the * last pgd entry. */ |
acdd0b629 lguest: PAE support |
675 676 677 678 679 680 681 682 683 |
pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd_table) { free_page((long)cpu->lg->pgdirs[next].pgdir); set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); next = cpu->cpu_pgd; } else { set_pgd(cpu->lg->pgdirs[next].pgdir + SWITCHER_PGD_INDEX, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); |
2e04ef769 lguest: fix comme... |
684 685 686 687 |
/* * This is a blank page, so there are no kernel * mappings: caller must map the stack! */ |
acdd0b629 lguest: PAE support |
688 689 690 |
*blank_pgdir = 1; } #else |
d7e28ffe6 lguest: the host ... |
691 |
*blank_pgdir = 1; |
acdd0b629 lguest: PAE support |
692 693 |
#endif } |
d7e28ffe6 lguest: the host ... |
694 |
} |
bff672e63 lguest: documenta... |
695 |
/* Record which Guest toplevel this shadows. */ |
382ac6b3f lguest: get rid o... |
696 |
cpu->lg->pgdirs[next].gpgdir = gpgdir; |
d7e28ffe6 lguest: the host ... |
697 |
/* Release all the non-kernel mappings. */ |
382ac6b3f lguest: get rid o... |
698 |
flush_user_mappings(cpu->lg, next); |
d7e28ffe6 lguest: the host ... |
699 700 701 |
return next; } |
2e04ef769 lguest: fix comme... |
702 703 |
/*H:470
 * Finally, a routine which throws away everything: all PGD entries in all
 * the shadow page tables, including the Guest's kernel mappings.  This is used
 * when we destroy the Guest.
 */
static void release_all_pagetables(struct lguest *lg)
{
	unsigned int i, j;

	/* Every shadow pagetable this Guest has */
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		if (lg->pgdirs[i].pgdir) {
#ifdef CONFIG_X86_PAE
			pgd_t *spgd;
			pmd_t *pmdpage;
			unsigned int k;

			/* Get the last pmd page. */
			spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
			pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);

			/*
			 * And release the pmd entries of that pmd page,
			 * except for the switcher pmd (the last slot, which
			 * new_pgdir() populated and must stay mapped).
			 */
			for (k = 0; k < SWITCHER_PMD_INDEX; k++)
				release_pmd(&pmdpage[k]);
#endif
			/* Every PGD entry except the Switcher at the top */
			for (j = 0; j < SWITCHER_PGD_INDEX; j++)
				release_pgd(lg->pgdirs[i].pgdir + j);
		}
}
2e04ef769 lguest: fix comme... |
733 734 |
/* * We also throw away everything when a Guest tells us it's changed a kernel |
bff672e63 lguest: documenta... |
735 |
* mapping. Since kernel mappings are in every page table, it's easiest to |
e1e72965e lguest: documenta... |
736 |
* throw them all away. This traps the Guest in amber for a while as |
2e04ef769 lguest: fix comme... |
737 738 |
* everything faults back in, but it's rare. */ |
4665ac8e2 lguest: makes spe... |
739 |
void guest_pagetable_clear_all(struct lg_cpu *cpu) |
d7e28ffe6 lguest: the host ... |
740 |
{ |
4665ac8e2 lguest: makes spe... |
741 |
release_all_pagetables(cpu->lg); |
bff672e63 lguest: documenta... |
742 |
/* We need the Guest kernel stack mapped again. */ |
4665ac8e2 lguest: makes spe... |
743 |
pin_stack_pages(cpu); |
d7e28ffe6 lguest: the host ... |
744 |
} |
5dea1c88e lguest: use a spe... |
745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 |
/*H:430 * (iv) Switching page tables * * Now we've seen all the page table setting and manipulation, let's see * what happens when the Guest changes page tables (ie. changes the top-level * pgdir). This occurs on almost every context switch. */ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) { int newpgdir, repin = 0; /* * The very first time they call this, we're actually running without * any page tables; we've been making it up. Throw them away now. */ if (unlikely(cpu->linear_pages)) { release_all_pagetables(cpu->lg); cpu->linear_pages = false; /* Force allocation of a new pgdir. */ newpgdir = ARRAY_SIZE(cpu->lg->pgdirs); } else { /* Look to see if we have this one already. */ newpgdir = find_pgdir(cpu->lg, pgtable); } /* * If not, we allocate or mug an existing one: if it's a fresh one, * repin gets set to 1. */ if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) newpgdir = new_pgdir(cpu, pgtable, &repin); /* Change the current pgd index to the new one. */ cpu->cpu_pgd = newpgdir; /* If it was completely blank, we map in the Guest kernel stack */ if (repin) pin_stack_pages(cpu); } |
e1e72965e lguest: documenta... |
783 |
/*:*/ |
2e04ef769 lguest: fix comme... |
784 785 786 |
/*M:009 * Since we throw away all mappings when a kernel mapping changes, our |
e1e72965e lguest: documenta... |
787 788 789 790 791 |
* performance sucks for guests using highmem. In fact, a guest with * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is * usually slower than a Guest with less memory. * * This, of course, cannot be fixed. It would take some kind of... well, I |
2e04ef769 lguest: fix comme... |
792 793 |
* don't know, but the term "puissant code-fu" comes to mind. :*/ |
d7e28ffe6 lguest: the host ... |
794 |
|
2e04ef769 lguest: fix comme... |
795 796 |
/*H:420
 * This is the routine which actually sets the page table entry for the
 * "idx"'th shadow page table.
 *
 * Normally, we can just throw out the old entry and replace it with 0: if they
 * use it demand_page() will put the new entry in.  We need to do this anyway:
 * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
 * is read from, and _PAGE_DIRTY when it's written to.
 *
 * But Avi Kivity pointed out that most Operating Systems (Linux included) set
 * these bits on PTEs immediately anyway.  This is done to save the CPU from
 * having to update them, but it helps us the same way: if they set
 * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately.
 *
 * NOTE: under PAE the if-braces deliberately open and close inside matching
 * #ifdef blocks, so the shared middle section serves both configurations.
 */
static void do_set_pte(struct lg_cpu *cpu, int idx,
		       unsigned long vaddr, pte_t gpte)
{
	/* Look up the matching shadow page directory entry. */
	pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
#ifdef CONFIG_X86_PAE
	pmd_t *spmd;
#endif

	/* If the top level isn't present, there's no entry to update. */
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
#ifdef CONFIG_X86_PAE
		/* Under PAE the middle level must be present too. */
		spmd = spmd_addr(cpu, *spgd, vaddr);
		if (pmd_flags(*spmd) & _PAGE_PRESENT) {
#endif
			/* Otherwise, start by releasing the existing entry. */
			pte_t *spte = spte_addr(cpu, *spgd, vaddr);
			release_pte(*spte);

			/*
			 * If they're setting this entry as dirty or accessed,
			 * we might as well put that entry they've given us in
			 * now.  This shaves 10% off a copy-on-write
			 * micro-benchmark.
			 */
			if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
				check_gpte(cpu, gpte);
				set_pte(spte,
					gpte_to_spte(cpu, gpte,
						pte_flags(gpte) & _PAGE_DIRTY));
			} else {
				/*
				 * Otherwise kill it and we can demand_page()
				 * it in later.
				 */
				set_pte(spte, __pte(0));
			}
#ifdef CONFIG_X86_PAE
		}
#endif
	}
}
2e04ef769 lguest: fix comme... |
851 852 |
/*H:410 * Updating a PTE entry is a little trickier. |
bff672e63 lguest: documenta... |
853 854 855 856 857 858 859 |
* * We keep track of several different page tables (the Guest uses one for each * process, so it makes sense to cache at least a few). Each of these have * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for * all processes. So when the page table above that address changes, we update * all the page tables, not just the current one. This is rare. * |
a6bd8e130 lguest: comment d... |
860 |
* The benefit is that when we have to track a new page table, we can keep all |
2e04ef769 lguest: fix comme... |
861 862 |
* the kernel mappings. This speeds up context switch immensely. */ |
382ac6b3f lguest: get rid o... |
863 |
void guest_set_pte(struct lg_cpu *cpu, |
ee3db0f2b Rename "cr3" to "... |
864 |
unsigned long gpgdir, unsigned long vaddr, pte_t gpte) |
d7e28ffe6 lguest: the host ... |
865 |
{ |
2e04ef769 lguest: fix comme... |
866 867 868 869 |
/* * Kernel mappings must be changed on all top levels. Slow, but doesn't * happen often. */ |
382ac6b3f lguest: get rid o... |
870 |
if (vaddr >= cpu->lg->kernel_address) { |
d7e28ffe6 lguest: the host ... |
871 |
unsigned int i; |
382ac6b3f lguest: get rid o... |
872 873 874 |
for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) if (cpu->lg->pgdirs[i].pgdir) do_set_pte(cpu, i, vaddr, gpte); |
d7e28ffe6 lguest: the host ... |
875 |
} else { |
bff672e63 lguest: documenta... |
876 |
/* Is this page table one we have a shadow for? */ |
382ac6b3f lguest: get rid o... |
877 878 |
int pgdir = find_pgdir(cpu->lg, gpgdir); if (pgdir != ARRAY_SIZE(cpu->lg->pgdirs)) |
bff672e63 lguest: documenta... |
879 |
/* If so, do the update. */ |
382ac6b3f lguest: get rid o... |
880 |
do_set_pte(cpu, pgdir, vaddr, gpte); |
d7e28ffe6 lguest: the host ... |
881 882 |
} } |
bff672e63 lguest: documenta... |
883 |
/*H:400 |
e1e72965e lguest: documenta... |
884 |
* (iii) Setting up a page table entry when the Guest tells us one has changed. |
bff672e63 lguest: documenta... |
885 886 887 888 889 890 891 892 893 894 |
* * Just like we did in interrupts_and_traps.c, it makes sense for us to deal * with the other side of page tables while we're here: what happens when the * Guest asks for a page table to be updated? * * We already saw that demand_page() will fill in the shadow page tables when * needed, so we can simply remove shadow page table entries whenever the Guest * tells us they've changed. When the Guest tries to use the new entry it will * fault and demand_page() will fix it up. * |
fd589a8f0 trivial: fix typo... |
895 |
* So with that in mind here's our code to update a (top-level) PGD entry: |
bff672e63 lguest: documenta... |
896 |
*/ |
ebe0ba84f lguest: replace h... |
897 |
void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) |
d7e28ffe6 lguest: the host ... |
898 899 900 901 902 |
{ int pgdir; if (idx >= SWITCHER_PGD_INDEX) return; |
bff672e63 lguest: documenta... |
903 |
/* If they're talking about a page table we have a shadow for... */ |
ee3db0f2b Rename "cr3" to "... |
904 |
pgdir = find_pgdir(lg, gpgdir); |
d7e28ffe6 lguest: the host ... |
905 |
if (pgdir < ARRAY_SIZE(lg->pgdirs)) |
bff672e63 lguest: documenta... |
906 |
/* ... throw it away. */ |
90603d15f lguest: use nativ... |
907 |
release_pgd(lg->pgdirs[pgdir].pgdir + idx); |
d7e28ffe6 lguest: the host ... |
908 |
} |
a91d74a3c lguest: update co... |
909 |
|
acdd0b629 lguest: PAE support |
910 |
#ifdef CONFIG_X86_PAE
/* For setting a mid-level, we just throw everything away.  It's easy. */
void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
{
	/*
	 * Brutal but simple: drop every shadow pagetable and let it all
	 * fault back in via demand_page().  cpus[0] is passed because
	 * guest_pagetable_clear_all() needs a vcpu to re-pin the stack for
	 * -- NOTE(review): confirm this is the right vcpu for SMP guests.
	 */
	guest_pagetable_clear_all(&lg->cpus[0]);
}
#endif
d7e28ffe6 lguest: the host ... |
917 |
|
2e04ef769 lguest: fix comme... |
918 919 |
/*H:500 * (vii) Setting up the page tables initially. |
bff672e63 lguest: documenta... |
920 |
* |
5dea1c88e lguest: use a spe... |
921 922 923 924 |
* When a Guest is first created, set initialize a shadow page table which * we will populate on future faults. The Guest doesn't have any actual * pagetables yet, so we set linear_pages to tell demand_page() to fake it * for the moment. |
2e04ef769 lguest: fix comme... |
925 |
*/ |
58a245664 lguest: move the ... |
926 |
int init_guest_pagetable(struct lguest *lg) |
d7e28ffe6 lguest: the host ... |
927 |
{ |
5dea1c88e lguest: use a spe... |
928 929 |
struct lg_cpu *cpu = &lg->cpus[0]; int allocated = 0; |
58a245664 lguest: move the ... |
930 |
|
5dea1c88e lguest: use a spe... |
931 932 933 |
/* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */ cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated); if (!allocated) |
d7e28ffe6 lguest: the host ... |
934 |
return -ENOMEM; |
a91d74a3c lguest: update co... |
935 |
|
5dea1c88e lguest: use a spe... |
936 937 |
/* We start with a linear mapping until the initialize. */ cpu->linear_pages = true; |
d7e28ffe6 lguest: the host ... |
938 939 |
return 0; } |
a91d74a3c lguest: update co... |
940 |
/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
void page_table_guest_data_init(struct lg_cpu *cpu)
{
	/* We get the kernel address: above this is all kernel memory. */
	if (get_user(cpu->lg->kernel_address,
		     &cpu->lg->lguest_data->kernel_address)
		/*
		 * We tell the Guest that it can't use the top 2 or 4 MB
		 * of virtual addresses used by the Switcher.
		 */
		|| put_user(RESERVE_MEM * 1024 * 1024,
			    &cpu->lg->lguest_data->reserve_mem)) {
		/* Either access to the Guest's lguest_data page failed. */
		kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
		return;
	}

	/*
	 * In flush_user_mappings() we loop from 0 to
	 * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
	 * Switcher mappings, so check that now.  (Under PAE only the exact
	 * pgd+pmd slot the Switcher occupies is fatal; without PAE any pgd
	 * index at or above the Switcher's is.)
	 */
#ifdef CONFIG_X86_PAE
	if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
	    pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
#else
	if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
#endif
		kill_guest(cpu, "bad kernel address %#lx",
				 cpu->lg->kernel_address);
}
bff672e63 lguest: documenta... |
970 |
/* When a Guest dies, our cleanup is fairly simple. */ |
d7e28ffe6 lguest: the host ... |
971 972 973 |
void free_guest_pagetable(struct lguest *lg) { unsigned int i; |
bff672e63 lguest: documenta... |
974 |
/* Throw away all page table pages. */ |
d7e28ffe6 lguest: the host ... |
975 |
release_all_pagetables(lg); |
bff672e63 lguest: documenta... |
976 |
/* Now free the top levels: free_page() can handle 0 just fine. */ |
d7e28ffe6 lguest: the host ... |
977 978 979 |
for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) free_page((long)lg->pgdirs[i].pgdir); } |
2e04ef769 lguest: fix comme... |
980 981 |
/*H:480
 * (vi) Mapping the Switcher when the Guest is about to run.
 *
 * The Switcher and the two pages for this CPU need to be visible in the
 * Guest (and not the pages for other CPUs).  We have the appropriate PTE pages
 * for each CPU already set up, we just need to hook them in now we know which
 * Guest is about to run on this CPU.
 */
void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
{
	/* The per-CPU Switcher PTE page built by populate_switcher_pte_page(). */
	pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages);
	pte_t regs_pte;

#ifdef CONFIG_X86_PAE
	pmd_t switcher_pmd;
	pmd_t *pmd_table;

	/* A pmd entry pointing at this CPU's Switcher PTE page. */
	switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT,
			       PAGE_KERNEL_EXEC);

	/*
	 * Figure out where the pmd page is, by reading the PGD, and converting
	 * it to a virtual address (new_pgdir() put it in SWITCHER_PGD_INDEX).
	 */
	pmd_table = __va(pgd_pfn(cpu->lg->
			pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
								<< PAGE_SHIFT);
	/* Now write it into the shadow page table. */
	set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
#else
	pgd_t switcher_pgd;

	/*
	 * Make the last PGD entry for this Guest point to the Switcher's PTE
	 * page for this CPU (with appropriate flags).
	 */
	switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);

	cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;

#endif
	/*
	 * We also change the Switcher PTE page.  When we're running the Guest,
	 * we want the Guest's "regs" page to appear where the first Switcher
	 * page for this CPU is.  This is an optimization: when the Switcher
	 * saves the Guest registers, it saves them into the first page of this
	 * CPU's "struct lguest_pages": if we make sure the Guest's register
	 * page is already mapped there, we don't have to copy them out
	 * again.
	 */
	regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL);
	set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte);
}
bff672e63 lguest: documenta... |
1029 |
/*:*/ |
d7e28ffe6 lguest: the host ... |
1030 1031 1032 1033 1034 1035 1036 1037 |
/* Free every possible CPU's Switcher PTE page (free_page(0) is harmless). */
static void free_switcher_pte_pages(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)
		free_page((long)switcher_pte_page(cpu));
}
2e04ef769 lguest: fix comme... |
1038 1039 |
/*H:520
 * Setting up the Switcher PTE page for given CPU is fairly easy, given
 * the CPU number and the "struct page"s for the Switcher code itself.
 *
 * Currently the Switcher is less than a page long, so "pages" is always 1.
 */
static __init void populate_switcher_pte_page(unsigned int cpu,
					      struct page *switcher_page[],
					      unsigned int pages)
{
	unsigned int i;
	pte_t *pte = switcher_pte_page(cpu);

	/*
	 * The first entries are easy: they map the Switcher code itself,
	 * present and accessed but deliberately NOT writable.
	 */
	for (i = 0; i < pages; i++) {
		set_pte(&pte[i], mk_pte(switcher_page[i],
				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
	}

	/* The only other thing we map is this CPU's pair of pages. */
	i = pages + cpu*2;

	/* First page (Guest registers) is writable from the Guest */
	set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));

	/*
	 * The second page contains the "struct lguest_ro_state", and is
	 * read-only.
	 */
	set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
}
2e04ef769 lguest: fix comme... |
1069 1070 |
/* * We've made it through the page table code. Perhaps our tired brains are |
e1e72965e lguest: documenta... |
1071 1072 |
* still processing the details, or perhaps we're simply glad it's over. * |
a6bd8e130 lguest: comment d... |
1073 1074 1075 1076 1077 |
* If nothing else, note that all this complexity in juggling shadow page tables * in sync with the Guest's page tables is for one reason: for most Guests this * page table dance determines how bad performance will be. This is why Xen * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD * have implemented shadow page table support directly into hardware. |
e1e72965e lguest: documenta... |
1078 |
* |
2e04ef769 lguest: fix comme... |
1079 1080 |
* There is just one file remaining in the Host. */ |
e1e72965e lguest: documenta... |
1081 |
|
2e04ef769 lguest: fix comme... |
1082 1083 1084 1085 |
/*H:510 * At boot or module load time, init_pagetables() allocates and populates * the Switcher PTE page for each CPU. */ |
d7e28ffe6 lguest: the host ... |
1086 1087 1088 1089 1090 |
__init int init_pagetables(struct page **switcher_page, unsigned int pages) { unsigned int i; for_each_possible_cpu(i) { |
df29f43e6 Pagetables to use... |
1091 |
switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); |
d7e28ffe6 lguest: the host ... |
1092 1093 1094 1095 1096 1097 1098 1099 |
if (!switcher_pte_page(i)) { free_switcher_pte_pages(); return -ENOMEM; } populate_switcher_pte_page(i, switcher_page, pages); } return 0; } |
bff672e63 lguest: documenta... |
1100 |
/*:*/ |
d7e28ffe6 lguest: the host ... |
1101 |
|
bff672e63 lguest: documenta... |
1102 |
/* Cleaning up simply involves freeing the PTE page for each CPU. */
/* NOTE(review): appears to be the teardown counterpart of
 * init_pagetables() -- confirm it is only called at module unload. */
void free_pagetables(void)
{
	free_switcher_pte_pages();
}