Commit bf72aeba2ffef599d1d386425c9e46b82be657cd

Authored by Paul Mackerras
1 parent 31925323b1

powerpc: Use 64k pages without needing cache-inhibited large pages

Some POWER5+ machines can do 64k hardware pages for normal memory but
not for cache-inhibited pages.  This patch lets us use 64k hardware
pages for most user processes on such machines (assuming the kernel
has been configured with CONFIG_PPC_64K_PAGES=y).  User processes
start out using 64k pages and get switched to 4k pages if they use any
non-cacheable mappings.
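
In outline, the demotion looks like the following sketch, condensed
from the hash_page() changes in arch/powerpc/mm/hash_utils_64.c below
(error handling and the kernel-region cases are omitted):

	/* If a 64k process faults on a non-cacheable PTE, drop it to
	 * 4k pages and rebuild the bolted SLB entries so the new
	 * segment page-size encoding takes effect. */
	if (mmu_ci_restrictions && psize == MMU_PAGE_64K &&
	    (pte_val(*ptep) & _PAGE_NO_CACHE)) {
		psize = MMU_PAGE_4K;
		mm->context.user_psize = MMU_PAGE_4K;
		mm->context.sllp = SLB_VSID_USER |
			mmu_psize_defs[MMU_PAGE_4K].sllp;
		get_paca()->context = mm->context;
		slb_flush_and_rebolt();
	}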

With this, we use 64k pages for the vmalloc region and 4k pages for
the imalloc region.  If anything creates a non-cacheable mapping in
the vmalloc region, the vmalloc region will get switched to 4k pages.
I don't know of any driver other than the DRM that would do this,
though, and these machines don't have AGP.
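
For kernel faults, the page size is chosen from the effective address;
roughly (see the VMALLOC_REGION_ID case added to hash_page() below):

	/* Below VMALLOC_END is vmalloc space, which stays at 64k pages
	 * unless a non-cacheable mapping forces it down to 4k; above it
	 * is the imalloc/ioremap area, which only gets 64k pages when
	 * cache-inhibited large pages are known to work. */
	if (ea < VMALLOC_END)
		psize = mmu_vmalloc_psize;
	else
		psize = mmu_io_psize;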

When a region gets switched from 64k pages to 4k pages, we do not have
to clear out all the 64k HPTEs from the hash table immediately.  We
use the _PAGE_COMBO bit in the Linux PTE to indicate whether the page
was hashed in as a 64k page or a set of 4k pages.  If hash_page is
trying to insert a 4k page for a Linux PTE and it sees that it has
already been inserted as a 64k page, it first invalidates the 64k HPTE
before inserting the 4k HPTE.  The hash invalidation routines also use
the _PAGE_COMBO bit, to determine whether to look for a 64k HPTE or a
set of 4k HPTEs to remove.  With those two changes, we can tolerate a
mix of 4k and 64k HPTEs in the hash table, and they will all get
removed when the address space is torn down.
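
The per-PTE size is recovered from that bit by pte_pagesize_index(),
added in include/asm-powerpc/pgtable-64k.h below and used by the TLB
batching code in tlb_64.c to pick the right invalidation size:

	/* _PAGE_COMBO set: the PTE was hashed as a set of 4k HPTEs;
	 * clear: it was hashed as a single 64k HPTE. */
	#define pte_pagesize_index(pte) \
		(((pte) & _PAGE_COMBO) ? MMU_PAGE_4K : MMU_PAGE_64K)

	/* hpte_update() records this per-PTE size in the batch so that
	 * flush_hash_page()/flush_hash_range() remove the right HPTE(s). */
	psize = pte_pagesize_index(pte);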

Signed-off-by: Paul Mackerras <paulus@samba.org>

Showing 13 changed files with 160 additions and 39 deletions

arch/powerpc/kernel/asm-offsets.c
... ... @@ -122,6 +122,8 @@
122 122 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
123 123 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
124 124 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
  125 + DEFINE(PACACONTEXTSLLP, offsetof(struct paca_struct, context.sllp));
  126 + DEFINE(PACAVMALLOCSLLP, offsetof(struct paca_struct, vmalloc_sllp));
125 127 #ifdef CONFIG_HUGETLB_PAGE
126 128 DEFINE(PACALOWHTLBAREAS, offsetof(struct paca_struct, context.low_htlb_areas));
127 129 DEFINE(PACAHIGHHTLBAREAS, offsetof(struct paca_struct, context.high_htlb_areas));
arch/powerpc/kernel/prom.c
... ... @@ -948,7 +948,10 @@
948 948 {CPU_FTR_CTRL, 0, 0, 3, 0},
949 949 {CPU_FTR_NOEXECUTE, 0, 0, 6, 0},
950 950 {CPU_FTR_NODSISRALIGN, 0, 1, 1, 1},
  951 +#if 0
  952 + /* put this back once we know how to test if firmware does 64k IO */
951 953 {CPU_FTR_CI_LARGE_PAGE, 0, 1, 2, 0},
  954 +#endif
952 955 };
953 956  
954 957 static void __init check_cpu_pa_features(unsigned long node)
arch/powerpc/mm/hash_low_64.S
... ... @@ -369,6 +369,7 @@
369 369 rlwinm r30,r4,32-9+7,31-7,31-7 /* _PAGE_RW -> _PAGE_DIRTY */
370 370 or r30,r30,r31
371 371 ori r30,r30,_PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE
  372 + oris r30,r30,_PAGE_COMBO@h
372 373 /* Write the linux PTE atomically (setting busy) */
373 374 stdcx. r30,0,r6
374 375 bne- 1b
... ... @@ -428,6 +429,14 @@
428 429 andi. r0,r31,_PAGE_HASHPTE
429 430 li r26,0 /* Default hidx */
430 431 beq htab_insert_pte
  432 +
  433 + /*
  434 + * Check if the pte was already inserted into the hash table
  435 + * as a 64k HW page, and invalidate the 64k HPTE if so.
  436 + */
  437 + andis. r0,r31,_PAGE_COMBO@h
  438 + beq htab_inval_old_hpte
  439 +
431 440 ld r6,STK_PARM(r6)(r1)
432 441 ori r26,r6,0x8000 /* Load the hidx mask */
433 442 ld r26,0(r26)
... ... @@ -498,6 +507,19 @@
498 507 /* Try all again */
499 508 b htab_insert_pte
500 509  
  510 + /*
  511 + * Call out to C code to invalidate an 64k HW HPTE that is
  512 + * useless now that the segment has been switched to 4k pages.
  513 + */
  514 +htab_inval_old_hpte:
  515 + mr r3,r29 /* virtual addr */
  516 + mr r4,r31 /* PTE.pte */
  517 + li r5,0 /* PTE.hidx */
  518 + li r6,MMU_PAGE_64K /* psize */
  519 + ld r7,STK_PARM(r8)(r1) /* local */
  520 + bl .flush_hash_page
  521 + b htab_insert_pte
  522 +
501 523 htab_bail_ok:
502 524 li r3,0
503 525 b htab_bail
... ... @@ -638,6 +660,12 @@
638 660 * is changing this PTE anyway and might hash it.
639 661 */
640 662 bne- ht64_bail_ok
  663 +BEGIN_FTR_SECTION
  664 + /* Check if PTE has the cache-inhibit bit set */
  665 + andi. r0,r31,_PAGE_NO_CACHE
  666 + /* If so, bail out and refault as a 4k page */
  667 + bne- ht64_bail_ok
  668 +END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE)
641 669 /* Prepare new PTE value (turn access RW into DIRTY, then
642 670 * add BUSY,HASHPTE and ACCESSED)
643 671 */
arch/powerpc/mm/hash_utils_64.c
... ... @@ -92,10 +92,15 @@
92 92 unsigned long htab_hash_mask;
93 93 int mmu_linear_psize = MMU_PAGE_4K;
94 94 int mmu_virtual_psize = MMU_PAGE_4K;
  95 +int mmu_vmalloc_psize = MMU_PAGE_4K;
  96 +int mmu_io_psize = MMU_PAGE_4K;
95 97 #ifdef CONFIG_HUGETLB_PAGE
96 98 int mmu_huge_psize = MMU_PAGE_16M;
97 99 unsigned int HPAGE_SHIFT;
98 100 #endif
  101 +#ifdef CONFIG_PPC_64K_PAGES
  102 +int mmu_ci_restrictions;
  103 +#endif
99 104  
100 105 /* There are definitions of page sizes arrays to be used when none
101 106 * is provided by the firmware.
... ... @@ -308,20 +313,31 @@
308 313 else if (mmu_psize_defs[MMU_PAGE_1M].shift)
309 314 mmu_linear_psize = MMU_PAGE_1M;
310 315  
  316 +#ifdef CONFIG_PPC_64K_PAGES
311 317 /*
312 318 * Pick a size for the ordinary pages. Default is 4K, we support
313   - * 64K if cache inhibited large pages are supported by the
314   - * processor
  319 + * 64K for user mappings and vmalloc if supported by the processor.
  320 + * We only use 64k for ioremap if the processor
  321 + * (and firmware) support cache-inhibited large pages.
  322 + * If not, we use 4k and set mmu_ci_restrictions so that
  323 + * hash_page knows to switch processes that use cache-inhibited
  324 + * mappings to 4k pages.
315 325 */
316   -#ifdef CONFIG_PPC_64K_PAGES
317   - if (mmu_psize_defs[MMU_PAGE_64K].shift &&
318   - cpu_has_feature(CPU_FTR_CI_LARGE_PAGE))
  326 + if (mmu_psize_defs[MMU_PAGE_64K].shift) {
319 327 mmu_virtual_psize = MMU_PAGE_64K;
  328 + mmu_vmalloc_psize = MMU_PAGE_64K;
  329 + if (cpu_has_feature(CPU_FTR_CI_LARGE_PAGE))
  330 + mmu_io_psize = MMU_PAGE_64K;
  331 + else
  332 + mmu_ci_restrictions = 1;
  333 + }
320 334 #endif
321 335  
322   - printk(KERN_DEBUG "Page orders: linear mapping = %d, others = %d\n",
  336 + printk(KERN_DEBUG "Page orders: linear mapping = %d, "
  337 + "virtual = %d, io = %d\n",
323 338 mmu_psize_defs[mmu_linear_psize].shift,
324   - mmu_psize_defs[mmu_virtual_psize].shift);
  339 + mmu_psize_defs[mmu_virtual_psize].shift,
  340 + mmu_psize_defs[mmu_io_psize].shift);
325 341  
326 342 #ifdef CONFIG_HUGETLB_PAGE
327 343 /* Init large page size. Currently, we pick 16M or 1M depending
... ... @@ -556,6 +572,7 @@
556 572 pte_t *ptep;
557 573 cpumask_t tmp;
558 574 int rc, user_region = 0, local = 0;
  575 + int psize;
559 576  
560 577 DBG_LOW("hash_page(ea=%016lx, access=%lx, trap=%lx\n",
561 578 ea, access, trap);
562 579  
... ... @@ -575,10 +592,15 @@
575 592 return 1;
576 593 }
577 594 vsid = get_vsid(mm->context.id, ea);
  595 + psize = mm->context.user_psize;
578 596 break;
579 597 case VMALLOC_REGION_ID:
580 598 mm = &init_mm;
581 599 vsid = get_kernel_vsid(ea);
  600 + if (ea < VMALLOC_END)
  601 + psize = mmu_vmalloc_psize;
  602 + else
  603 + psize = mmu_io_psize;
582 604 break;
583 605 default:
584 606 /* Not a valid range
... ... @@ -629,7 +651,40 @@
629 651 #ifndef CONFIG_PPC_64K_PAGES
630 652 rc = __hash_page_4K(ea, access, vsid, ptep, trap, local);
631 653 #else
632   - if (mmu_virtual_psize == MMU_PAGE_64K)
  654 + if (mmu_ci_restrictions) {
  655 + /* If this PTE is non-cacheable, switch to 4k */
  656 + if (psize == MMU_PAGE_64K &&
  657 + (pte_val(*ptep) & _PAGE_NO_CACHE)) {
  658 + if (user_region) {
  659 + psize = MMU_PAGE_4K;
  660 + mm->context.user_psize = MMU_PAGE_4K;
  661 + mm->context.sllp = SLB_VSID_USER |
  662 + mmu_psize_defs[MMU_PAGE_4K].sllp;
  663 + } else if (ea < VMALLOC_END) {
  664 + /*
  665 + * some driver did a non-cacheable mapping
  666 + * in vmalloc space, so switch vmalloc
  667 + * to 4k pages
  668 + */
  669 + printk(KERN_ALERT "Reducing vmalloc segment "
  670 + "to 4kB pages because of "
  671 + "non-cacheable mapping\n");
  672 + psize = mmu_vmalloc_psize = MMU_PAGE_4K;
  673 + }
  674 + }
  675 + if (user_region) {
  676 + if (psize != get_paca()->context.user_psize) {
  677 + get_paca()->context = mm->context;
  678 + slb_flush_and_rebolt();
  679 + }
  680 + } else if (get_paca()->vmalloc_sllp !=
  681 + mmu_psize_defs[mmu_vmalloc_psize].sllp) {
  682 + get_paca()->vmalloc_sllp =
  683 + mmu_psize_defs[mmu_vmalloc_psize].sllp;
  684 + slb_flush_and_rebolt();
  685 + }
  686 + }
  687 + if (psize == MMU_PAGE_64K)
633 688 rc = __hash_page_64K(ea, access, vsid, ptep, trap, local);
634 689 else
635 690 rc = __hash_page_4K(ea, access, vsid, ptep, trap, local);
... ... @@ -681,7 +736,18 @@
681 736 #ifndef CONFIG_PPC_64K_PAGES
682 737 __hash_page_4K(ea, access, vsid, ptep, trap, local);
683 738 #else
684   - if (mmu_virtual_psize == MMU_PAGE_64K)
  739 + if (mmu_ci_restrictions) {
  740 + /* If this PTE is non-cacheable, switch to 4k */
  741 + if (mm->context.user_psize == MMU_PAGE_64K &&
  742 + (pte_val(*ptep) & _PAGE_NO_CACHE)) {
  743 + mm->context.user_psize = MMU_PAGE_4K;
  744 + mm->context.sllp = SLB_VSID_USER |
  745 + mmu_psize_defs[MMU_PAGE_4K].sllp;
  746 + get_paca()->context = mm->context;
  747 + slb_flush_and_rebolt();
  748 + }
  749 + }
  750 + if (mm->context.user_psize == MMU_PAGE_64K)
685 751 __hash_page_64K(ea, access, vsid, ptep, trap, local);
686 752 else
687 753 __hash_page_4K(ea, access, vsid, ptep, trap, local);
arch/powerpc/mm/mmu_context_64.c
... ... @@ -49,6 +49,9 @@
49 49 }
50 50  
51 51 mm->context.id = index;
  52 + mm->context.user_psize = mmu_virtual_psize;
  53 + mm->context.sllp = SLB_VSID_USER |
  54 + mmu_psize_defs[mmu_virtual_psize].sllp;
52 55  
53 56 return 0;
54 57 }
arch/powerpc/mm/slb.c
... ... @@ -60,19 +60,19 @@
60 60 : "memory" );
61 61 }
62 62  
63   -static void slb_flush_and_rebolt(void)
  63 +void slb_flush_and_rebolt(void)
64 64 {
65 65 /* If you change this make sure you change SLB_NUM_BOLTED
66 66 * appropriately too. */
67   - unsigned long linear_llp, virtual_llp, lflags, vflags;
  67 + unsigned long linear_llp, vmalloc_llp, lflags, vflags;
68 68 unsigned long ksp_esid_data;
69 69  
70 70 WARN_ON(!irqs_disabled());
71 71  
72 72 linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
73   - virtual_llp = mmu_psize_defs[mmu_virtual_psize].sllp;
  73 + vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
74 74 lflags = SLB_VSID_KERNEL | linear_llp;
75   - vflags = SLB_VSID_KERNEL | virtual_llp;
  75 + vflags = SLB_VSID_KERNEL | vmalloc_llp;
76 76  
77 77 ksp_esid_data = mk_esid_data(get_paca()->kstack, 2);
78 78 if ((ksp_esid_data & ESID_MASK) == PAGE_OFFSET)
... ... @@ -164,11 +164,10 @@
164 164  
165 165 void slb_initialize(void)
166 166 {
167   - unsigned long linear_llp, virtual_llp;
  167 + unsigned long linear_llp, vmalloc_llp, io_llp;
168 168 static int slb_encoding_inited;
169 169 extern unsigned int *slb_miss_kernel_load_linear;
170   - extern unsigned int *slb_miss_kernel_load_virtual;
171   - extern unsigned int *slb_miss_user_load_normal;
  170 + extern unsigned int *slb_miss_kernel_load_io;
172 171 #ifdef CONFIG_HUGETLB_PAGE
173 172 extern unsigned int *slb_miss_user_load_huge;
174 173 unsigned long huge_llp;
... ... @@ -178,18 +177,19 @@
178 177  
179 178 /* Prepare our SLB miss handler based on our page size */
180 179 linear_llp = mmu_psize_defs[mmu_linear_psize].sllp;
181   - virtual_llp = mmu_psize_defs[mmu_virtual_psize].sllp;
  180 + io_llp = mmu_psize_defs[mmu_io_psize].sllp;
  181 + vmalloc_llp = mmu_psize_defs[mmu_vmalloc_psize].sllp;
  182 + get_paca()->vmalloc_sllp = SLB_VSID_KERNEL | vmalloc_llp;
  183 +
182 184 if (!slb_encoding_inited) {
183 185 slb_encoding_inited = 1;
184 186 patch_slb_encoding(slb_miss_kernel_load_linear,
185 187 SLB_VSID_KERNEL | linear_llp);
186   - patch_slb_encoding(slb_miss_kernel_load_virtual,
187   - SLB_VSID_KERNEL | virtual_llp);
188   - patch_slb_encoding(slb_miss_user_load_normal,
189   - SLB_VSID_USER | virtual_llp);
  188 + patch_slb_encoding(slb_miss_kernel_load_io,
  189 + SLB_VSID_KERNEL | io_llp);
190 190  
191 191 DBG("SLB: linear LLP = %04x\n", linear_llp);
192   - DBG("SLB: virtual LLP = %04x\n", virtual_llp);
  192 + DBG("SLB: io LLP = %04x\n", io_llp);
193 193 #ifdef CONFIG_HUGETLB_PAGE
194 194 patch_slb_encoding(slb_miss_user_load_huge,
195 195 SLB_VSID_USER | huge_llp);
... ... @@ -204,7 +204,7 @@
204 204 unsigned long lflags, vflags;
205 205  
206 206 lflags = SLB_VSID_KERNEL | linear_llp;
207   - vflags = SLB_VSID_KERNEL | virtual_llp;
  207 + vflags = SLB_VSID_KERNEL | vmalloc_llp;
208 208  
209 209 /* Invalidate the entire SLB (even slot 0) & all the ERATS */
210 210 asm volatile("isync":::"memory");
... ... @@ -212,7 +212,6 @@
212 212 asm volatile("isync; slbia; isync":::"memory");
213 213 create_slbe(PAGE_OFFSET, lflags, 0);
214 214  
215   - /* VMALLOC space has 4K pages always for now */
216 215 create_slbe(VMALLOC_START, vflags, 1);
217 216  
218 217 /* We don't bolt the stack for the time being - we're in boot,
arch/powerpc/mm/slb_low.S
... ... @@ -59,10 +59,19 @@
59 59 li r11,0
60 60 b slb_finish_load
61 61  
62   -1: /* vmalloc/ioremap mapping encoding bits, the "li" instruction below
  62 +1: /* vmalloc/ioremap mapping encoding bits, the "li" instructions below
63 63 * will be patched by the kernel at boot
64 64 */
65   -_GLOBAL(slb_miss_kernel_load_virtual)
  65 +BEGIN_FTR_SECTION
  66 + /* check whether this is in vmalloc or ioremap space */
  67 + clrldi r11,r10,48
  68 + cmpldi r11,(VMALLOC_SIZE >> 28) - 1
  69 + bgt 5f
  70 + lhz r11,PACAVMALLOCSLLP(r13)
  71 + b slb_finish_load
  72 +5:
  73 +END_FTR_SECTION_IFCLR(CPU_FTR_CI_LARGE_PAGE)
  74 +_GLOBAL(slb_miss_kernel_load_io)
66 75 li r11,0
67 76 b slb_finish_load
68 77  
... ... @@ -96,9 +105,7 @@
96 105 1:
97 106 #endif /* CONFIG_HUGETLB_PAGE */
98 107  
99   -_GLOBAL(slb_miss_user_load_normal)
100   - li r11,0
101   -
  108 + lhz r11,PACACONTEXTSLLP(r13)
102 109 2:
103 110 ld r9,PACACONTEXTID(r13)
104 111 rldimi r10,r9,USER_ESID_BITS,0
arch/powerpc/mm/tlb_64.c
... ... @@ -131,7 +131,7 @@
131 131 {
132 132 struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
133 133 unsigned long vsid;
134   - unsigned int psize = mmu_virtual_psize;
  134 + unsigned int psize;
135 135 int i;
136 136  
137 137 i = batch->index;
... ... @@ -148,7 +148,8 @@
148 148 #else
149 149 BUG();
150 150 #endif
151   - }
  151 + } else
  152 + psize = pte_pagesize_index(pte);
152 153  
153 154 /*
154 155 * This can happen when we are in the middle of a TLB batch and
include/asm-powerpc/mmu.h
... ... @@ -165,7 +165,17 @@
165 165 extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
166 166 extern int mmu_linear_psize;
167 167 extern int mmu_virtual_psize;
  168 +extern int mmu_vmalloc_psize;
  169 +extern int mmu_io_psize;
168 170  
  171 +/*
  172 + * If the processor supports 64k normal pages but not 64k cache
  173 + * inhibited pages, we have to be prepared to switch processes
  174 + * to use 4k pages when they create cache-inhibited mappings.
  175 + * If this is the case, mmu_ci_restrictions will be set to 1.
  176 + */
  177 +extern int mmu_ci_restrictions;
  178 +
169 179 #ifdef CONFIG_HUGETLB_PAGE
170 180 /*
171 181 * The page size index of the huge pages for use by hugetlbfs
... ... @@ -256,6 +266,7 @@
256 266  
257 267 extern void stabs_alloc(void);
258 268 extern void slb_initialize(void);
  269 +extern void slb_flush_and_rebolt(void);
259 270 extern void stab_initialize(unsigned long stab);
260 271  
261 272 #endif /* __ASSEMBLY__ */
... ... @@ -359,6 +370,8 @@
359 370  
360 371 typedef struct {
361 372 mm_context_id_t id;
  373 + u16 user_psize; /* page size index */
  374 + u16 sllp; /* SLB entry page size encoding */
362 375 #ifdef CONFIG_HUGETLB_PAGE
363 376 u16 low_htlb_areas, high_htlb_areas;
364 377 #endif
include/asm-powerpc/paca.h
... ... @@ -81,6 +81,7 @@
81 81 * on the linear mapping */
82 82  
83 83 mm_context_t context;
  84 + u16 vmalloc_sllp;
84 85 u16 slb_cache[SLB_CACHE_ENTRIES];
85 86 u16 slb_cache_ptr;
86 87  
include/asm-powerpc/pgtable-4k.h
... ... @@ -78,6 +78,8 @@
78 78  
79 79 #define pte_iterate_hashed_end() } while(0)
80 80  
  81 +#define pte_pagesize_index(pte) MMU_PAGE_4K
  82 +
81 83 /*
82 84 * 4-level page tables related bits
83 85 */
include/asm-powerpc/pgtable-64k.h
... ... @@ -90,6 +90,8 @@
90 90  
91 91 #define pte_iterate_hashed_end() } while(0); } } while(0)
92 92  
  93 +#define pte_pagesize_index(pte) \
  94 + (((pte) & _PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K)
93 95  
94 96 #endif /* __ASSEMBLY__ */
95 97 #endif /* __KERNEL__ */
include/asm-powerpc/pgtable.h
... ... @@ -47,8 +47,8 @@
47 47 /*
48 48 * Define the address range of the vmalloc VM area.
49 49 */
50   -#define VMALLOC_START (0xD000000000000000ul)
51   -#define VMALLOC_SIZE (0x80000000000UL)
  50 +#define VMALLOC_START ASM_CONST(0xD000000000000000)
  51 +#define VMALLOC_SIZE ASM_CONST(0x80000000000)
52 52 #define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE)
53 53  
54 54 /*
... ... @@ -413,12 +413,6 @@
413 413 flush_tlb_pending();
414 414 }
415 415 pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
416   -
417   -#ifdef CONFIG_PPC_64K_PAGES
418   - if (mmu_virtual_psize != MMU_PAGE_64K)
419   - pte = __pte(pte_val(pte) | _PAGE_COMBO);
420   -#endif /* CONFIG_PPC_64K_PAGES */
421   -
422 416 *ptep = pte;
423 417 }
424 418