Commit da181a8b3916aa7f2e3c5775d2bd2fe3454cf82d

Authored by Rusty Russell
Committed by Andi Kleen
1 parent 13623d7930

[PATCH] paravirt: Add MMU virtualization to paravirt_ops

Add the three bare TLB accessor functions to paravirt_ops.  Most amusingly,
flush_tlb is redefined on SMP, so I can't call the paravirt op flush_tlb.
Instead, I chose to indicate the actual flush type, kernel (global) vs. user
(non-global).  Global in this sense means using the global bit in the page
table entry, which makes TLB entries persistent across CR3 reloads, not
global as in the SMP sense of invoking remote shootdowns, so the term is
confusingly overloaded.

AK: folded in fix from Zach for PAE compilation

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>

Showing 7 changed files with 226 additions and 26 deletions Side-by-side Diff

arch/i386/kernel/paravirt.c
... ... @@ -31,6 +31,7 @@
31 31 #include <asm/delay.h>
32 32 #include <asm/fixmap.h>
33 33 #include <asm/apic.h>
  34 +#include <asm/tlbflush.h>
34 35  
35 36 /* nop stub */
36 37 static void native_nop(void)
... ... @@ -379,6 +380,97 @@
379 380 asm volatile("outb %al,$0x80");
380 381 }
381 382  
/*
 * Flush all non-global TLB entries on bare hardware (CR3 reload).
 * Wired up as paravirt_ops.flush_tlb_user.
 */
static fastcall void native_flush_tlb(void)
{
	__native_flush_tlb();
}
  387 +
/*
 * Global pages have to be flushed a bit differently. Not a real
 * performance problem because this does not happen often.
 * Wired up as paravirt_ops.flush_tlb_kernel.
 */
static fastcall void native_flush_tlb_global(void)
{
	__native_flush_tlb_global();
}
  396 +
/* Invalidate the TLB entry for a single linear address (invlpg). */
static fastcall void native_flush_tlb_single(u32 addr)
{
	__native_flush_tlb_single(addr);
}
  401 +
  402 +#ifndef CONFIG_X86_PAE
/*
 * Without PAE a pte is a single machine word, so one plain store
 * installs the whole entry.
 */
static fastcall void native_set_pte(pte_t *ptep, pte_t pteval)
{
	*ptep = pteval;
}
  407 +
/* Same as native_set_pte; mm and addr exist only to match the
 * set_pte_at hook signature and are unused here. */
static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
{
	*ptep = pteval;
}
  412 +
/* A non-PAE pmd entry is likewise a single-word store. */
static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
	*pmdp = pmdval;
}
  417 +
  418 +#else /* CONFIG_X86_PAE */
  419 +
/*
 * A PAE pte is 64 bits wide and must be written as two 32-bit halves.
 * Store the high half first: the entry only becomes live once the low
 * half (which carries the present bit) is written, and smp_wmb()
 * keeps the two stores in that order.  Caller must guarantee the old
 * pte is not in active use (see the set_pte rules in pgtable-3level.h).
 */
static fastcall void native_set_pte(pte_t *ptep, pte_t pte)
{
	ptep->pte_high = pte.pte_high;
	smp_wmb();
	ptep->pte_low = pte.pte_low;
}
  426 +
/* PAE variant of set_pte_at: same high-then-low store ordering as
 * native_set_pte; mm and addr are unused here. */
static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
{
	ptep->pte_high = pte.pte_high;
	smp_wmb();
	ptep->pte_low = pte.pte_low;
}
  433 +
/*
 * Install a pte that may replace a live mapping: first zero the low
 * half (clearing the present bit) so the hardware can never observe a
 * mix of old and new halves, then store the new high half, then the
 * new low half.
 */
static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
{
	ptep->pte_low = 0;
	smp_wmb();
	ptep->pte_high = pte.pte_high;
	smp_wmb();
	ptep->pte_low = pte.pte_low;
}
  442 +
/* Write the full 64-bit pte in a single atomic operation via
 * set_64bit, for sites that cannot tolerate a half-written entry. */
static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
{
	set_64bit((unsigned long long *)ptep,pte_val(pteval));
}
  447 +
/* PAE pmd entries are 64-bit too; use an atomic 64-bit store. */
static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
	set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
}
  452 +
/* pud entries are written with a plain struct assignment. */
static fastcall void native_set_pud(pud_t *pudp, pud_t pudval)
{
	*pudp = pudval;
}
  457 +
/*
 * Clear a PAE pte: zero the low half (present bit) first so the entry
 * is never seen half-cleared, then zero the high half.
 */
static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	ptep->pte_low = 0;
	smp_wmb();
	ptep->pte_high = 0;
}
  464 +
/* Clear a PAE pmd with the same low-word-first ordering as pte_clear. */
static fastcall void native_pmd_clear(pmd_t *pmd)
{
	u32 *tmp = (u32 *)pmd;
	*tmp = 0;
	smp_wmb();
	*(tmp + 1) = 0;
}
  472 +#endif /* CONFIG_X86_PAE */
  473 +
382 474 /* These are in entry.S */
383 475 extern fastcall void native_iret(void);
384 476 extern fastcall void native_irq_enable_sysexit(void);
... ... @@ -452,6 +544,23 @@
452 544 .apic_write = native_apic_write,
453 545 .apic_write_atomic = native_apic_write_atomic,
454 546 .apic_read = native_apic_read,
  547 +#endif
  548 +
  549 + .flush_tlb_user = native_flush_tlb,
  550 + .flush_tlb_kernel = native_flush_tlb_global,
  551 + .flush_tlb_single = native_flush_tlb_single,
  552 +
  553 + .set_pte = native_set_pte,
  554 + .set_pte_at = native_set_pte_at,
  555 + .set_pmd = native_set_pmd,
  556 + .pte_update = (void *)native_nop,
  557 + .pte_update_defer = (void *)native_nop,
  558 +#ifdef CONFIG_X86_PAE
  559 + .set_pte_atomic = native_set_pte_atomic,
  560 + .set_pte_present = native_set_pte_present,
  561 + .set_pud = native_set_pud,
  562 + .pte_clear = native_pte_clear,
  563 + .pmd_clear = native_pmd_clear,
455 564 #endif
456 565  
457 566 .irq_enable_sysexit = native_irq_enable_sysexit,
arch/i386/mm/boot_ioremap.c
... ... @@ -16,6 +16,7 @@
16 16 */
17 17  
18 18 #undef CONFIG_X86_PAE
  19 +#undef CONFIG_PARAVIRT
19 20 #include <asm/page.h>
20 21 #include <asm/pgtable.h>
21 22 #include <asm/tlbflush.h>
include/asm-i386/paravirt.h
... ... @@ -4,6 +4,7 @@
4 4 * para-virtualization: those hooks are defined here. */
5 5 #include <linux/linkage.h>
6 6 #include <linux/stringify.h>
  7 +#include <asm/page.h>
7 8  
8 9 #ifdef CONFIG_PARAVIRT
9 10 /* These are the most performance critical ops, so we want to be able to patch
... ... @@ -27,6 +28,7 @@
27 28 struct thread_struct;
28 29 struct Xgt_desc_struct;
29 30 struct tss_struct;
  31 +struct mm_struct;
30 32 struct paravirt_ops
31 33 {
32 34 unsigned int kernel_rpl;
... ... @@ -121,6 +123,23 @@
121 123 unsigned long (fastcall *apic_read)(unsigned long reg);
122 124 #endif
123 125  
  126 + void (fastcall *flush_tlb_user)(void);
  127 + void (fastcall *flush_tlb_kernel)(void);
  128 + void (fastcall *flush_tlb_single)(u32 addr);
  129 +
  130 + void (fastcall *set_pte)(pte_t *ptep, pte_t pteval);
  131 + void (fastcall *set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval);
  132 + void (fastcall *set_pmd)(pmd_t *pmdp, pmd_t pmdval);
  133 + void (fastcall *pte_update)(struct mm_struct *mm, u32 addr, pte_t *ptep);
  134 + void (fastcall *pte_update_defer)(struct mm_struct *mm, u32 addr, pte_t *ptep);
  135 +#ifdef CONFIG_X86_PAE
  136 + void (fastcall *set_pte_atomic)(pte_t *ptep, pte_t pteval);
  137 + void (fastcall *set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte);
  138 + void (fastcall *set_pud)(pud_t *pudp, pud_t pudval);
  139 + void (fastcall *pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
  140 + void (fastcall *pmd_clear)(pmd_t *pmdp);
  141 +#endif
  142 +
124 143 /* These two are jmp to, not actually called. */
125 144 void (fastcall *irq_enable_sysexit)(void);
126 145 void (fastcall *iret)(void);
... ... @@ -296,6 +315,62 @@
296 315 }
297 316 #endif
298 317  
  318 +
  319 +#define __flush_tlb() paravirt_ops.flush_tlb_user()
  320 +#define __flush_tlb_global() paravirt_ops.flush_tlb_kernel()
  321 +#define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr)
  322 +
/* Install a pte through the current paravirt backend. */
static inline void set_pte(pte_t *ptep, pte_t pteval)
{
	paravirt_ops.set_pte(ptep, pteval);
}
  327 +
/* Install a pte for (mm, addr) through the paravirt backend. */
static inline void set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
{
	paravirt_ops.set_pte_at(mm, addr, ptep, pteval);
}
  332 +
/* Install a pmd entry through the paravirt backend. */
static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
	paravirt_ops.set_pmd(pmdp, pmdval);
}
  337 +
/* Notify the backend of a pte modified without going through the
 * set_pte interfaces (see the pte_update rules in pgtable.h). */
static inline void pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep)
{
	paravirt_ops.pte_update(mm, addr, ptep);
}
  342 +
/* Deferrable form of pte_update, dispatched to the backend. */
static inline void pte_update_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
{
	paravirt_ops.pte_update_defer(mm, addr, ptep);
}
  347 +
  348 +#ifdef CONFIG_X86_PAE
/* Atomically install a 64-bit PAE pte through the backend. */
static inline void set_pte_atomic(pte_t *ptep, pte_t pteval)
{
	paravirt_ops.set_pte_atomic(ptep, pteval);
}
  353 +
/* Replace a possibly-live pte through the backend. */
static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
{
	paravirt_ops.set_pte_present(mm, addr, ptep, pte);
}
  358 +
/* Install a pud entry through the backend (PAE only). */
static inline void set_pud(pud_t *pudp, pud_t pudval)
{
	paravirt_ops.set_pud(pudp, pudval);
}
  363 +
/* Clear a pte through the backend (PAE only). */
static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	paravirt_ops.pte_clear(mm, addr, ptep);
}
  368 +
/* Clear a pmd through the backend (PAE only). */
static inline void pmd_clear(pmd_t *pmdp)
{
	paravirt_ops.pmd_clear(pmdp);
}
  373 +#endif
299 374  
300 375 /* These all sit in the .parainstructions section to tell us what to patch. */
301 376 struct paravirt_patch {
include/asm-i386/pgtable-2level.h
... ... @@ -13,11 +13,14 @@
13 13 * within a page table are directly modified. Thus, the following
14 14 * hook is made available.
15 15 */
  16 +#ifndef CONFIG_PARAVIRT
16 17 #define set_pte(pteptr, pteval) (*(pteptr) = pteval)
17 18 #define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval)
  19 +#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
  20 +#endif
  21 +
18 22 #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
19 23 #define set_pte_present(mm,addr,ptep,pteval) set_pte_at(mm,addr,ptep,pteval)
20   -#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
21 24  
22 25 #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
23 26 #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
include/asm-i386/pgtable-3level.h
... ... @@ -44,6 +44,7 @@
44 44 return pte_x(pte);
45 45 }
46 46  
  47 +#ifndef CONFIG_PARAVIRT
47 48 /* Rules for using set_pte: the pte being assigned *must* be
48 49 * either not present or in a state where the hardware will
49 50 * not attempt to update the pte. In places where this is
... ... @@ -81,25 +82,6 @@
81 82 (*(pudptr) = (pudval))
82 83  
83 84 /*
84   - * Pentium-II erratum A13: in PAE mode we explicitly have to flush
85   - * the TLB via cr3 if the top-level pgd is changed...
86   - * We do not let the generic code free and clear pgd entries due to
87   - * this erratum.
88   - */
89   -static inline void pud_clear (pud_t * pud) { }
90   -
91   -#define pud_page(pud) \
92   -((struct page *) __va(pud_val(pud) & PAGE_MASK))
93   -
94   -#define pud_page_vaddr(pud) \
95   -((unsigned long) __va(pud_val(pud) & PAGE_MASK))
96   -
97   -
98   -/* Find an entry in the second-level page table.. */
99   -#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
100   - pmd_index(address))
101   -
102   -/*
103 85 * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
104 86 * entry, so clear the bottom half first and enforce ordering with a compiler
105 87 * barrier.
... ... @@ -118,6 +100,26 @@
118 100 smp_wmb();
119 101 *(tmp + 1) = 0;
120 102 }
  103 +#endif
  104 +
/*
 * Pentium-II erratum A13: in PAE mode we explicitly have to flush
 * the TLB via cr3 if the top-level pgd is changed...
 * We do not let the generic code free and clear pgd entries due to
 * this erratum.
 */
/* Intentionally empty: see the erratum note above. */
static inline void pud_clear (pud_t * pud) { }

#define pud_page(pud) \
((struct page *) __va(pud_val(pud) & PAGE_MASK))

#define pud_page_vaddr(pud) \
((unsigned long) __va(pud_val(pud) & PAGE_MASK))


/* Find an entry in the second-level page table.. */
#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
	pmd_index(address))
121 123  
122 124 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
123 125 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
include/asm-i386/pgtable.h
... ... @@ -15,6 +15,7 @@
15 15 #include <asm/processor.h>
16 16 #include <asm/fixmap.h>
17 17 #include <linux/threads.h>
  18 +#include <asm/paravirt.h>
18 19  
19 20 #ifndef _I386_BITOPS_H
20 21 #include <asm/bitops.h>
... ... @@ -246,6 +247,7 @@
246 247 # include <asm/pgtable-2level.h>
247 248 #endif
248 249  
  250 +#ifndef CONFIG_PARAVIRT
249 251 /*
250 252 * Rules for using pte_update - it must be called after any PTE update which
251 253 * has not been done using the set_pte / clear_pte interfaces. It is used by
... ... @@ -261,7 +263,7 @@
261 263 */
262 264 #define pte_update(mm, addr, ptep) do { } while (0)
263 265 #define pte_update_defer(mm, addr, ptep) do { } while (0)
264   -
  266 +#endif
265 267  
266 268 /*
267 269 * We only update the dirty/accessed state if we set
include/asm-i386/tlbflush.h
... ... @@ -4,7 +4,15 @@
4 4 #include <linux/mm.h>
5 5 #include <asm/processor.h>
6 6  
7   -#define __flush_tlb() \
/*
 * Under CONFIG_PARAVIRT the low-level flush primitives dispatch
 * through paravirt_ops (asm/paravirt.h); otherwise they map directly
 * onto the native implementations below.
 */
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
#define __flush_tlb() __native_flush_tlb()
#define __flush_tlb_global() __native_flush_tlb_global()
#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
#endif
  14 +
  15 +#define __native_flush_tlb() \
8 16 do { \
9 17 unsigned int tmpreg; \
10 18 \
... ... @@ -19,7 +27,7 @@
19 27 * Global pages have to be flushed a bit differently. Not a real
20 28 * performance problem because this does not happen often.
21 29 */
22   -#define __flush_tlb_global() \
  30 +#define __native_flush_tlb_global() \
23 31 do { \
24 32 unsigned int tmpreg, cr4, cr4_orig; \
25 33 \
... ... @@ -36,6 +44,9 @@
36 44 : "memory"); \
37 45 } while (0)
38 46  
/* Invalidate the TLB entry for one linear address. */
#define __native_flush_tlb_single(addr) \
	__asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory")
  49 +
39 50 # define __flush_tlb_all() \
40 51 do { \
41 52 if (cpu_has_pge) \
... ... @@ -45,9 +56,6 @@
45 56 } while (0)
46 57  
47 58 #define cpu_has_invlpg (boot_cpu_data.x86 > 3)
48   -
49   -#define __flush_tlb_single(addr) \
50   - __asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory")
51 59  
52 60 #ifdef CONFIG_X86_INVLPG
53 61 # define __flush_tlb_one(addr) __flush_tlb_single(addr)