Commit 0d9ea75443dc7e37843e656b8ebc947a6d16d618

Authored by Jon Tollefson
Committed by Linus Torvalds
1 parent f4a67cceee

powerpc: support multiple hugepage sizes

Instead of using the variable mmu_huge_psize to keep track of the huge
page size, we use an array of MMU_PAGE_* values.  For each supported huge
page size we need to know the hugepte_shift value and have a
pgtable_cache.  The hstate or an mmu_huge_psizes index is passed to
functions so that they know which huge page size they should use.

The hugepage sizes 16M and 64K are set up (if available on the hardware) so
that they don't have to be set on the boot command line in order to be
used.  The number of 16G pages, however, has to be specified at boot time
(e.g.  hugepagesz=16G hugepages=5).
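
In outline (an illustrative sketch, not code from this patch;
take_huge_page_path() is a placeholder), the core change is that a
single-size comparison becomes an array lookup, where a non-zero entry
doubles as that size's hugepte_shift:

	/* before: exactly one huge page size per boot */
	if (psize == mmu_huge_psize)
		take_huge_page_path();

	/* after: any size registered in mmu_huge_psizes[] qualifies */
	if (mmu_huge_psizes[psize])
		take_huge_page_path();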

Signed-off-by: Jon Tollefson <kniht@linux.vnet.ibm.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 9 changed files with 199 additions and 118 deletions

Documentation/kernel-parameters.txt
... ... @@ -776,11 +776,11 @@
776 776  
777 777 hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot.
778 778 hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages.
779   - On x86 this option can be specified multiple times
780   - interleaved with hugepages= to reserve huge pages
781   - of different sizes. Valid pages sizes on x86-64
782   - are 2M (when the CPU supports "pse") and 1G (when the
783   - CPU supports the "pdpe1gb" cpuinfo flag)
  779 + On x86-64 and powerpc, this option can be specified
  780 + multiple times interleaved with hugepages= to reserve
  781 + huge pages of different sizes. Valid page sizes on
  782 + x86-64 are 2M (when the CPU supports "pse") and 1G
  783 + (when the CPU supports the "pdpe1gb" cpuinfo flag)
784 784 Note that 1GB pages can only be allocated at boot time
785 785 using hugepages= and not freed afterwards.
786 786 default_hugepagesz=
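
For instance (an illustrative command line; both sizes must be supported
by the hardware), a powerpc system could now reserve per-size pools with:

	hugepagesz=16M hugepages=128 hugepagesz=16G hugepages=2
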
arch/powerpc/mm/hash_utils_64.c
... ... @@ -103,7 +103,6 @@
103 103 int mmu_highuser_ssize = MMU_SEGSIZE_256M;
104 104 u16 mmu_slb_size = 64;
105 105 #ifdef CONFIG_HUGETLB_PAGE
106   -int mmu_huge_psize = MMU_PAGE_16M;
107 106 unsigned int HPAGE_SHIFT;
108 107 #endif
109 108 #ifdef CONFIG_PPC_64K_PAGES
... ... @@ -460,15 +459,15 @@
460 459 /* Reserve 16G huge page memory sections for huge pages */
461 460 of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
462 461  
463   -/* Init large page size. Currently, we pick 16M or 1M depending
  462 +/* Set default large page size. Currently, we pick 16M or 1M depending
464 463 * on what is available
465 464 */
466 465 if (mmu_psize_defs[MMU_PAGE_16M].shift)
467   - set_huge_psize(MMU_PAGE_16M);
  466 + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
468 467 /* With 4k/4level pagetables, we can't (for now) cope with a
469 468 * huge page size < PMD_SIZE */
470 469 else if (mmu_psize_defs[MMU_PAGE_1M].shift)
471   - set_huge_psize(MMU_PAGE_1M);
  470 + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
472 471 #endif /* CONFIG_HUGETLB_PAGE */
473 472 }
474 473  
... ... @@ -889,7 +888,7 @@
889 888  
890 889 #ifdef CONFIG_HUGETLB_PAGE
891 890 /* Handle hugepage regions */
892   - if (HPAGE_SHIFT && psize == mmu_huge_psize) {
  891 + if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
893 892 DBG_LOW(" -> huge page !\n");
894 893 return hash_huge_page(mm, access, ea, vsid, local, trap);
895 894 }
arch/powerpc/mm/hugetlbpage.c
... ... @@ -37,16 +37,31 @@
37 37 static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
38 38 static unsigned nr_gpages;
39 39  
40   -unsigned int hugepte_shift;
41   -#define PTRS_PER_HUGEPTE (1 << hugepte_shift)
42   -#define HUGEPTE_TABLE_SIZE (sizeof(pte_t) << hugepte_shift)
  40 +/* Array of valid huge page sizes - a non-zero value (hugepte_shift) is
  41 + * stored for each huge page size that is valid.
  42 + */
  43 +unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
43 44  
44   -#define HUGEPD_SHIFT (HPAGE_SHIFT + hugepte_shift)
45   -#define HUGEPD_SIZE (1UL << HUGEPD_SHIFT)
46   -#define HUGEPD_MASK (~(HUGEPD_SIZE-1))
  45 +#define hugepte_shift mmu_huge_psizes
  46 +#define PTRS_PER_HUGEPTE(psize) (1 << hugepte_shift[psize])
  47 +#define HUGEPTE_TABLE_SIZE(psize) (sizeof(pte_t) << hugepte_shift[psize])
47 48  
48   -#define huge_pgtable_cache (pgtable_cache[HUGEPTE_CACHE_NUM])
  49 +#define HUGEPD_SHIFT(psize) (mmu_psize_to_shift(psize) \
  50 + + hugepte_shift[psize])
  51 +#define HUGEPD_SIZE(psize) (1UL << HUGEPD_SHIFT(psize))
  52 +#define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1))
49 53  
  54 +/* Subtract one from array size because we don't need a cache for 4K since
  55 + * it is not a huge page size */
  56 +#define huge_pgtable_cache(psize) (pgtable_cache[HUGEPTE_CACHE_NUM \
  57 + + psize-1])
  58 +#define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize])
  59 +
  60 +static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
  61 + "unused_4K", "hugepte_cache_64K", "unused_64K_AP",
  62 + "hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G"
  63 +};
  64 +
50 65 /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad()
51 66 * will choke on pointers to hugepte tables, which is handy for
52 67 * catching screwups early. */
... ... @@ -56,24 +71,49 @@
56 71  
57 72 #define hugepd_none(hpd) ((hpd).pd == 0)
58 73  
  74 +static inline int shift_to_mmu_psize(unsigned int shift)
  75 +{
  76 + switch (shift) {
  77 +#ifndef CONFIG_PPC_64K_PAGES
  78 + case PAGE_SHIFT_64K:
  79 + return MMU_PAGE_64K;
  80 +#endif
  81 + case PAGE_SHIFT_16M:
  82 + return MMU_PAGE_16M;
  83 + case PAGE_SHIFT_16G:
  84 + return MMU_PAGE_16G;
  85 + }
  86 + return -1;
  87 +}
  88 +
  89 +static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
  90 +{
  91 + if (mmu_psize_defs[mmu_psize].shift)
  92 + return mmu_psize_defs[mmu_psize].shift;
  93 + BUG();
  94 +}
  95 +
59 96 static inline pte_t *hugepd_page(hugepd_t hpd)
60 97 {
61 98 BUG_ON(!(hpd.pd & HUGEPD_OK));
62 99 return (pte_t *)(hpd.pd & ~HUGEPD_OK);
63 100 }
64 101  
65   -static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
  102 +static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
  103 + struct hstate *hstate)
66 104 {
67   - unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
  105 + unsigned int shift = huge_page_shift(hstate);
  106 + int psize = shift_to_mmu_psize(shift);
  107 + unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
68 108 pte_t *dir = hugepd_page(*hpdp);
69 109  
70 110 return dir + idx;
71 111 }
72 112  
73 113 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
74   - unsigned long address)
  114 + unsigned long address, unsigned int psize)
75 115 {
76   - pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
  116 + pte_t *new = kmem_cache_alloc(huge_pgtable_cache(psize),
77 117 GFP_KERNEL|__GFP_REPEAT);
78 118  
79 119 if (! new)
... ... @@ -81,7 +121,7 @@
81 121  
82 122 spin_lock(&mm->page_table_lock);
83 123 if (!hugepd_none(*hpdp))
84   - kmem_cache_free(huge_pgtable_cache, new);
  124 + kmem_cache_free(huge_pgtable_cache(psize), new);
85 125 else
86 126 hpdp->pd = (unsigned long)new | HUGEPD_OK;
87 127 spin_unlock(&mm->page_table_lock);
... ... @@ -90,21 +130,22 @@
90 130  
91 131 /* Base page size affects how we walk hugetlb page tables */
92 132 #ifdef CONFIG_PPC_64K_PAGES
93   -#define hpmd_offset(pud, addr) pmd_offset(pud, addr)
94   -#define hpmd_alloc(mm, pud, addr) pmd_alloc(mm, pud, addr)
  133 +#define hpmd_offset(pud, addr, h) pmd_offset(pud, addr)
  134 +#define hpmd_alloc(mm, pud, addr, h) pmd_alloc(mm, pud, addr)
95 135 #else
96 136 static inline
97   -pmd_t *hpmd_offset(pud_t *pud, unsigned long addr)
  137 +pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
98 138 {
99   - if (HPAGE_SHIFT == PAGE_SHIFT_64K)
  139 + if (huge_page_shift(hstate) == PAGE_SHIFT_64K)
100 140 return pmd_offset(pud, addr);
101 141 else
102 142 return (pmd_t *) pud;
103 143 }
104 144 static inline
105   -pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr)
  145 +pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
  146 + struct hstate *hstate)
106 147 {
107   - if (HPAGE_SHIFT == PAGE_SHIFT_64K)
  148 + if (huge_page_shift(hstate) == PAGE_SHIFT_64K)
108 149 return pmd_alloc(mm, pud, addr);
109 150 else
110 151 return (pmd_t *) pud;
... ... @@ -128,8 +169,9 @@
128 169 }
129 170  
130 171 /* Moves the gigantic page addresses from the temporary list to the
131   - * huge_boot_pages list. */
132   -int alloc_bootmem_huge_page(struct hstate *h)
  172 + * huge_boot_pages list.
  173 + */
  174 +int alloc_bootmem_huge_page(struct hstate *hstate)
133 175 {
134 176 struct huge_bootmem_page *m;
135 177 if (nr_gpages == 0)
... ... @@ -137,7 +179,7 @@
137 179 m = phys_to_virt(gpage_freearray[--nr_gpages]);
138 180 gpage_freearray[nr_gpages] = 0;
139 181 list_add(&m->list, &huge_boot_pages);
140   - m->hstate = h;
  182 + m->hstate = hstate;
141 183 return 1;
142 184 }
143 185  
... ... @@ -149,17 +191,25 @@
149 191 pud_t *pu;
150 192 pmd_t *pm;
151 193  
152   - BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
  194 + unsigned int psize;
  195 + unsigned int shift;
  196 + unsigned long sz;
  197 + struct hstate *hstate;
  198 + psize = get_slice_psize(mm, addr);
  199 + shift = mmu_psize_to_shift(psize);
  200 + sz = ((1UL) << shift);
  201 + hstate = size_to_hstate(sz);
153 202  
154   - addr &= HPAGE_MASK;
  203 + addr &= hstate->mask;
155 204  
156 205 pg = pgd_offset(mm, addr);
157 206 if (!pgd_none(*pg)) {
158 207 pu = pud_offset(pg, addr);
159 208 if (!pud_none(*pu)) {
160   - pm = hpmd_offset(pu, addr);
  209 + pm = hpmd_offset(pu, addr, hstate);
161 210 if (!pmd_none(*pm))
162   - return hugepte_offset((hugepd_t *)pm, addr);
  211 + return hugepte_offset((hugepd_t *)pm, addr,
  212 + hstate);
163 213 }
164 214 }
165 215  
... ... @@ -173,16 +223,20 @@
173 223 pud_t *pu;
174 224 pmd_t *pm;
175 225 hugepd_t *hpdp = NULL;
  226 + struct hstate *hstate;
  227 + unsigned int psize;
  228 + hstate = size_to_hstate(sz);
176 229  
177   - BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
  230 + psize = get_slice_psize(mm, addr);
  231 + BUG_ON(!mmu_huge_psizes[psize]);
178 232  
179   - addr &= HPAGE_MASK;
  233 + addr &= hstate->mask;
180 234  
181 235 pg = pgd_offset(mm, addr);
182 236 pu = pud_alloc(mm, pg, addr);
183 237  
184 238 if (pu) {
185   - pm = hpmd_alloc(mm, pu, addr);
  239 + pm = hpmd_alloc(mm, pu, addr, hstate);
186 240 if (pm)
187 241 hpdp = (hugepd_t *)pm;
188 242 }
... ... @@ -190,10 +244,10 @@
190 244 if (! hpdp)
191 245 return NULL;
192 246  
193   - if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
  247 + if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
194 248 return NULL;
195 249  
196   - return hugepte_offset(hpdp, addr);
  250 + return hugepte_offset(hpdp, addr, hstate);
197 251 }
198 252  
199 253 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
... ... @@ -201,19 +255,22 @@
201 255 return 0;
202 256 }
203 257  
204   -static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
  258 +static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
  259 + unsigned int psize)
205 260 {
206 261 pte_t *hugepte = hugepd_page(*hpdp);
207 262  
208 263 hpdp->pd = 0;
209 264 tlb->need_flush = 1;
210   - pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
  265 + pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
  266 + HUGEPTE_CACHE_NUM+psize-1,
211 267 PGF_CACHENUM_MASK));
212 268 }
213 269  
214 270 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
215 271 unsigned long addr, unsigned long end,
216   - unsigned long floor, unsigned long ceiling)
  272 + unsigned long floor, unsigned long ceiling,
  273 + unsigned int psize)
217 274 {
218 275 pmd_t *pmd;
219 276 unsigned long next;
... ... @@ -225,7 +282,7 @@
225 282 next = pmd_addr_end(addr, end);
226 283 if (pmd_none(*pmd))
227 284 continue;
228   - free_hugepte_range(tlb, (hugepd_t *)pmd);
  285 + free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
229 286 } while (pmd++, addr = next, addr != end);
230 287  
231 288 start &= PUD_MASK;
... ... @@ -251,6 +308,9 @@
251 308 pud_t *pud;
252 309 unsigned long next;
253 310 unsigned long start;
  311 + unsigned int shift;
  312 + unsigned int psize = get_slice_psize(tlb->mm, addr);
  313 + shift = mmu_psize_to_shift(psize);
254 314  
255 315 start = addr;
256 316 pud = pud_offset(pgd, addr);
... ... @@ -259,16 +319,18 @@
259 319 #ifdef CONFIG_PPC_64K_PAGES
260 320 if (pud_none_or_clear_bad(pud))
261 321 continue;
262   - hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
  322 + hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling,
  323 + psize);
263 324 #else
264   - if (HPAGE_SHIFT == PAGE_SHIFT_64K) {
  325 + if (shift == PAGE_SHIFT_64K) {
265 326 if (pud_none_or_clear_bad(pud))
266 327 continue;
267   - hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
  328 + hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
  329 + ceiling, psize);
268 330 } else {
269 331 if (pud_none(*pud))
270 332 continue;
271   - free_hugepte_range(tlb, (hugepd_t *)pud);
  333 + free_hugepte_range(tlb, (hugepd_t *)pud, psize);
272 334 }
273 335 #endif
274 336 } while (pud++, addr = next, addr != end);
... ... @@ -336,27 +398,29 @@
336 398 * now has no other vmas using it, so can be freed, we don't
337 399 * bother to round floor or end up - the tests don't need that.
338 400 */
  401 + unsigned int psize = get_slice_psize(tlb->mm, addr);
339 402  
340   - addr &= HUGEPD_MASK;
  403 + addr &= HUGEPD_MASK(psize);
341 404 if (addr < floor) {
342   - addr += HUGEPD_SIZE;
  405 + addr += HUGEPD_SIZE(psize);
343 406 if (!addr)
344 407 return;
345 408 }
346 409 if (ceiling) {
347   - ceiling &= HUGEPD_MASK;
  410 + ceiling &= HUGEPD_MASK(psize);
348 411 if (!ceiling)
349 412 return;
350 413 }
351 414 if (end - 1 > ceiling - 1)
352   - end -= HUGEPD_SIZE;
  415 + end -= HUGEPD_SIZE(psize);
353 416 if (addr > end - 1)
354 417 return;
355 418  
356 419 start = addr;
357 420 pgd = pgd_offset(tlb->mm, addr);
358 421 do {
359   - BUG_ON(get_slice_psize(tlb->mm, addr) != mmu_huge_psize);
  422 + psize = get_slice_psize(tlb->mm, addr);
  423 + BUG_ON(!mmu_huge_psizes[psize]);
360 424 next = pgd_addr_end(addr, end);
361 425 if (pgd_none_or_clear_bad(pgd))
362 426 continue;
... ... @@ -373,7 +437,11 @@
373 437 * necessary anymore if we make hpte_need_flush() get the
374 438 * page size from the slices
375 439 */
376   - pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);
  440 + unsigned int psize = get_slice_psize(mm, addr);
  441 + unsigned int shift = mmu_psize_to_shift(psize);
  442 + unsigned long sz = ((1UL) << shift);
  443 + struct hstate *hstate = size_to_hstate(sz);
  444 + pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
377 445 }
378 446 *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
379 447 }
... ... @@ -390,14 +458,19 @@
390 458 {
391 459 pte_t *ptep;
392 460 struct page *page;
  461 + unsigned int mmu_psize = get_slice_psize(mm, address);
393 462  
394   - if (get_slice_psize(mm, address) != mmu_huge_psize)
  463 + /* Verify it is a huge page else bail. */
  464 + if (!mmu_huge_psizes[mmu_psize])
395 465 return ERR_PTR(-EINVAL);
396 466  
397 467 ptep = huge_pte_offset(mm, address);
398 468 page = pte_page(*ptep);
399   - if (page)
400   - page += (address % HPAGE_SIZE) / PAGE_SIZE;
  469 + if (page) {
  470 + unsigned int shift = mmu_psize_to_shift(mmu_psize);
  471 + unsigned long sz = ((1UL) << shift);
  472 + page += (address % sz) / PAGE_SIZE;
  473 + }
401 474  
402 475 return page;
403 476 }
... ... @@ -425,15 +498,16 @@
425 498 unsigned long len, unsigned long pgoff,
426 499 unsigned long flags)
427 500 {
428   - return slice_get_unmapped_area(addr, len, flags,
429   - mmu_huge_psize, 1, 0);
  501 + struct hstate *hstate = hstate_file(file);
  502 + int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
  503 + return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
430 504 }
431 505  
432 506 /*
433 507 * Called by asm hashtable.S for doing lazy icache flush
434 508 */
435 509 static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
436   - pte_t pte, int trap)
  510 + pte_t pte, int trap, unsigned long sz)
437 511 {
438 512 struct page *page;
439 513 int i;
... ... @@ -446,7 +520,7 @@
446 520 /* page is dirty */
447 521 if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
448 522 if (trap == 0x400) {
449   - for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
  523 + for (i = 0; i < (sz / PAGE_SIZE); i++)
450 524 __flush_dcache_icache(page_address(page+i));
451 525 set_bit(PG_arch_1, &page->flags);
452 526 } else {
... ... @@ -462,11 +536,16 @@
462 536 {
463 537 pte_t *ptep;
464 538 unsigned long old_pte, new_pte;
465   - unsigned long va, rflags, pa;
  539 + unsigned long va, rflags, pa, sz;
466 540 long slot;
467 541 int err = 1;
468 542 int ssize = user_segment_size(ea);
  543 + unsigned int mmu_psize;
  544 + int shift;
  545 + mmu_psize = get_slice_psize(mm, ea);
469 546  
  547 + if (!mmu_huge_psizes[mmu_psize])
  548 + goto out;
470 549 ptep = huge_pte_offset(mm, ea);
471 550  
472 551 /* Search the Linux page table for a match with va */
... ... @@ -509,30 +588,32 @@
509 588 rflags = 0x2 | (!(new_pte & _PAGE_RW));
510 589 /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
511 590 rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
  591 + shift = mmu_psize_to_shift(mmu_psize);
  592 + sz = ((1UL) << shift);
512 593 if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
513 594 /* No CPU has hugepages but lacks no execute, so we
514 595 * don't need to worry about that case */
515 596 rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
516   - trap);
  597 + trap, sz);
517 598  
518 599 /* Check if pte already has an hpte (case 2) */
519 600 if (unlikely(old_pte & _PAGE_HASHPTE)) {
520 601 /* There MIGHT be an HPTE for this pte */
521 602 unsigned long hash, slot;
522 603  
523   - hash = hpt_hash(va, HPAGE_SHIFT, ssize);
  604 + hash = hpt_hash(va, shift, ssize);
524 605 if (old_pte & _PAGE_F_SECOND)
525 606 hash = ~hash;
526 607 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
527 608 slot += (old_pte & _PAGE_F_GIX) >> 12;
528 609  
529   - if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
  610 + if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
530 611 ssize, local) == -1)
531 612 old_pte &= ~_PAGE_HPTEFLAGS;
532 613 }
533 614  
534 615 if (likely(!(old_pte & _PAGE_HASHPTE))) {
535   - unsigned long hash = hpt_hash(va, HPAGE_SHIFT, ssize);
  616 + unsigned long hash = hpt_hash(va, shift, ssize);
536 617 unsigned long hpte_group;
537 618  
538 619 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
... ... @@ -553,7 +634,7 @@
553 634  
554 635 /* Insert into the hash table, primary slot */
555 636 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
556   - mmu_huge_psize, ssize);
  637 + mmu_psize, ssize);
557 638  
558 639 /* Primary is full, try the secondary */
559 640 if (unlikely(slot == -1)) {
... ... @@ -561,7 +642,7 @@
561 642 HPTES_PER_GROUP) & ~0x7UL;
562 643 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
563 644 HPTE_V_SECONDARY,
564   - mmu_huge_psize, ssize);
  645 + mmu_psize, ssize);
565 646 if (slot == -1) {
566 647 if (mftb() & 0x1)
567 648 hpte_group = ((hash & htab_hash_mask) *
... ... @@ -598,66 +679,50 @@
598 679 (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
599 680 mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
600 681 mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
601   - /* Return if huge page size is the same as the
602   - * base page size. */
603   - if (mmu_psize_defs[psize].shift == PAGE_SHIFT)
  682 + /* Return if huge page size has already been setup or is the
  683 + * same as the base page size. */
  684 + if (mmu_huge_psizes[psize] ||
  685 + mmu_psize_defs[psize].shift == PAGE_SHIFT)
604 686 return;
  687 + hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
605 688  
606   - HPAGE_SHIFT = mmu_psize_defs[psize].shift;
607   - mmu_huge_psize = psize;
608   -
609   - switch (HPAGE_SHIFT) {
  689 + switch (mmu_psize_defs[psize].shift) {
610 690 case PAGE_SHIFT_64K:
611 691 /* We only allow 64k hpages with 4k base page,
612 692 * which was checked above, and always put them
613 693 * at the PMD */
614   - hugepte_shift = PMD_SHIFT;
  694 + hugepte_shift[psize] = PMD_SHIFT;
615 695 break;
616 696 case PAGE_SHIFT_16M:
617 697 /* 16M pages can be at two different levels
618 698 * of pagestables based on base page size */
619 699 if (PAGE_SHIFT == PAGE_SHIFT_64K)
620   - hugepte_shift = PMD_SHIFT;
  700 + hugepte_shift[psize] = PMD_SHIFT;
621 701 else /* 4k base page */
622   - hugepte_shift = PUD_SHIFT;
  702 + hugepte_shift[psize] = PUD_SHIFT;
623 703 break;
624 704 case PAGE_SHIFT_16G:
625 705 /* 16G pages are always at PGD level */
626   - hugepte_shift = PGDIR_SHIFT;
  706 + hugepte_shift[psize] = PGDIR_SHIFT;
627 707 break;
628 708 }
629   - hugepte_shift -= HPAGE_SHIFT;
  709 + hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
630 710 } else
631   - HPAGE_SHIFT = 0;
  711 + hugepte_shift[psize] = 0;
632 712 }
633 713  
634 714 static int __init hugepage_setup_sz(char *str)
635 715 {
636 716 unsigned long long size;
637   - int mmu_psize = -1;
  717 + int mmu_psize;
638 718 int shift;
639 719  
640 720 size = memparse(str, &str);
641 721  
642 722 shift = __ffs(size);
643   - switch (shift) {
644   -#ifndef CONFIG_PPC_64K_PAGES
645   - case PAGE_SHIFT_64K:
646   - mmu_psize = MMU_PAGE_64K;
647   - break;
648   -#endif
649   - case PAGE_SHIFT_16M:
650   - mmu_psize = MMU_PAGE_16M;
651   - break;
652   - case PAGE_SHIFT_16G:
653   - mmu_psize = MMU_PAGE_16G;
654   - break;
655   - }
656   -
657   - if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift) {
  723 + mmu_psize = shift_to_mmu_psize(shift);
  724 + if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
658 725 set_huge_psize(mmu_psize);
659   - hugetlb_add_hstate(shift - PAGE_SHIFT);
660   - }
661 726 else
662 727 printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
663 728  
... ... @@ -672,16 +737,31 @@
672 737  
673 738 static int __init hugetlbpage_init(void)
674 739 {
  740 + unsigned int psize;
  741 +
675 742 if (!cpu_has_feature(CPU_FTR_16M_PAGE))
676 743 return -ENODEV;
  744 + /* Add supported huge page sizes. Need to change HUGE_MAX_HSTATE
  745 + * and adjust PTE_NONCACHE_NUM if the number of supported huge page
  746 + * sizes changes.
  747 + */
  748 + set_huge_psize(MMU_PAGE_16M);
  749 + set_huge_psize(MMU_PAGE_64K);
  750 + set_huge_psize(MMU_PAGE_16G);
677 751  
678   - huge_pgtable_cache = kmem_cache_create("hugepte_cache",
679   - HUGEPTE_TABLE_SIZE,
680   - HUGEPTE_TABLE_SIZE,
681   - 0,
682   - zero_ctor);
683   - if (! huge_pgtable_cache)
684   - panic("hugetlbpage_init(): could not create hugepte cache\n");
  752 + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
  753 + if (mmu_huge_psizes[psize]) {
  754 + huge_pgtable_cache(psize) = kmem_cache_create(
  755 + HUGEPTE_CACHE_NAME(psize),
  756 + HUGEPTE_TABLE_SIZE(psize),
  757 + HUGEPTE_TABLE_SIZE(psize),
  758 + 0,
  759 + zero_ctor);
  760 + if (!huge_pgtable_cache(psize))
  761 + panic("hugetlbpage_init(): could not create %s\n",
  762 + HUGEPTE_CACHE_NAME(psize));
  763 + }
  764 + }
685 765  
686 766 return 0;
687 767 }
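
To make the new macro arithmetic above concrete, here is the 16M-on-4K
case traced by hand (a sketch following the definitions in this file;
PAGE_SHIFT_16M is 24):

	/* set_huge_psize(MMU_PAGE_16M) with a 4K base page: */
	hugepte_shift[MMU_PAGE_16M] = PUD_SHIFT - 24;

	/* one hugepte table standing in for a PUD entry thus holds */
	PTRS_PER_HUGEPTE(MMU_PAGE_16M) == 1 << (PUD_SHIFT - 24)	/* ptes */

	/* and the range it maps is exactly one PUD slot: */
	HUGEPD_SHIFT(MMU_PAGE_16M) == 24 + (PUD_SHIFT - 24) == PUD_SHIFT
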
arch/powerpc/mm/init_64.c
... ... @@ -153,10 +153,10 @@
153 153 };
154 154  
155 155 #ifdef CONFIG_HUGETLB_PAGE
156   -/* Hugepages need one extra cache, initialized in hugetlbpage.c. We
157   - * can't put into the tables above, because HPAGE_SHIFT is not compile
158   - * time constant. */
159   -struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+1];
  156 +/* Hugepages need an extra cache per hugepagesize, initialized in
  157 + * hugetlbpage.c. We can't put into the tables above, because HPAGE_SHIFT
  158 + * is not compile time constant. */
  159 +struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT];
160 160 #else
161 161 struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
162 162 #endif
arch/powerpc/mm/tlb_64.c
... ... @@ -147,7 +147,7 @@
147 147 */
148 148 if (huge) {
149 149 #ifdef CONFIG_HUGETLB_PAGE
150   - psize = mmu_huge_psize;
  150 + psize = get_slice_psize(mm, addr);
151 151 #else
152 152 BUG();
153 153 psize = pte_pagesize_index(mm, addr, pte); /* shutup gcc */
include/asm-powerpc/hugetlb.h
... ... @@ -24,9 +24,10 @@
24 24 static inline int prepare_hugepage_range(struct file *file,
25 25 unsigned long addr, unsigned long len)
26 26 {
27   - if (len & ~HPAGE_MASK)
  27 + struct hstate *h = hstate_file(file);
  28 + if (len & ~huge_page_mask(h))
28 29 return -EINVAL;
29   - if (addr & ~HPAGE_MASK)
  30 + if (addr & ~huge_page_mask(h))
30 31 return -EINVAL;
31 32 return 0;
32 33 }
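
With per-file hstates, the alignment requirement scales with each
mapping's own page size instead of one global HPAGE_MASK. Plain
arithmetic, not patch code:

	/* 64K hstate: huge_page_mask(h) == ~0xffffUL   -> 64K alignment */
	/* 16M hstate: huge_page_mask(h) == ~0xffffffUL -> 16M alignment */
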
include/asm-powerpc/mmu-hash64.h
... ... @@ -194,9 +194,9 @@
194 194  
195 195 #ifdef CONFIG_HUGETLB_PAGE
196 196 /*
197   - * The page size index of the huge pages for use by hugetlbfs
  197 + * The page size indexes of the huge pages for use by hugetlbfs
198 198 */
199   -extern int mmu_huge_psize;
  199 +extern unsigned int mmu_huge_psizes[MMU_PAGE_COUNT];
200 200  
201 201 #endif /* CONFIG_HUGETLB_PAGE */
202 202  
include/asm-powerpc/page_64.h
... ... @@ -90,6 +90,7 @@
90 90 #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
91 91 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
92 92 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
  93 +#define HUGE_MAX_HSTATE 3
93 94  
94 95 #endif /* __ASSEMBLY__ */
95 96  
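
The value 3 matches the three sizes hugetlbpage_init() now tries to
register; per the comment there, it must grow if more sizes are added:

	/* set_huge_psize(): 64K, 16M, 16G -> at most 3 hstates */
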
include/asm-powerpc/pgalloc-64.h
... ... @@ -22,7 +22,7 @@
22 22 #define PUD_CACHE_NUM 1
23 23 #define PMD_CACHE_NUM 1
24 24 #define HUGEPTE_CACHE_NUM 2
25   -#define PTE_NONCACHE_NUM 3 /* from GFP rather than kmem_cache */
  25 +#define PTE_NONCACHE_NUM 7 /* from GFP rather than kmem_cache */
26 26  
27 27 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
28 28 {
... ... @@ -119,7 +119,7 @@
119 119 __free_page(ptepage);
120 120 }
121 121  
122   -#define PGF_CACHENUM_MASK 0x3
  122 +#define PGF_CACHENUM_MASK 0x7
123 123  
124 124 typedef struct pgtable_free {
125 125 unsigned long val;
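
The two widened constants move together. Tracing the index arithmetic
from hugetlbpage.c (whose cache-name table lists the six MMU_PAGE_*
slots, with MMU_PAGE_16G last at index 5):

	/* huge_pgtable_cache(psize) == pgtable_cache[HUGEPTE_CACHE_NUM + psize - 1],
	 * so the highest cache number is 2 + 5 - 1 = 6. Cache numbers now
	 * span 0..6, PTE_NONCACHE_NUM takes the next free value (7), and
	 * PGF_CACHENUM_MASK must cover all three low bits (0x7). */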