Commit c594adad5653491813959277fb87a2fef54c4e05

Authored by David Gibson
Committed by Paul Mackerras
1 parent 9a5573e378

[PATCH] Dynamic hugepage addresses for ppc64

Paulus, I think this is now a reasonable candidate for the post-2.6.13
queue.

Relax address restrictions for hugepages on ppc64

Presently, 64-bit applications on ppc64 may only use hugepages in the
address region from 1-1.5T.  Furthermore, if hugepages are enabled in
the kernel config, they may only use hugepages and never normal pages
in this area.  This patch relaxes this restriction, allowing any
address to be used with hugepages, but with a 1TB granularity.  That
is, if you map a hugepage anywhere in the region 1TB-2TB, that entire
area will be reserved exclusively for hugepages for the remainder of
the process's lifetime.  This works analogously to hugepages in 32-bit
applications, where hugepages can be mapped anywhere, but with 256MB
(mmu segment) granularity.
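
As an illustration (not part of the patch), here is a minimal user-space
sketch of what the relaxed placement means for a 64-bit process; the
hugetlbfs mount point, file name and 16MB page size are assumptions for
the sketch:

/* Map one 16MB hugepage through hugetlbfs.  Before this patch the
 * kernel could only place the mapping in the fixed 1T-1.5T window;
 * with it, any free 1TB area may be chosen, and that whole area then
 * becomes hugepage-only for the rest of the process's lifetime. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(16UL * 1024 * 1024)	/* 16MB hugepages on ppc64 */

int main(void)
{
	int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);	/* assumed hugetlbfs mount */
	void *p;

	if (fd < 0) {
		perror("open");
		return 1;
	}

	p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	printf("hugepage mapped at %p\n", p);	/* may fall in any 1TB area */
	munmap(p, HPAGE_SIZE);
	close(fd);
	return 0;
}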

This patch applies on top of the four level pagetable patch
(http://patchwork.ozlabs.org/linuxppc64/patch?id=1936).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
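
For reference, a small stand-alone sketch of the area-bitmask arithmetic the
patch relies on, mirroring the LOW_ESID_MASK/HTLB_AREA_MASK macros shown in
the include/asm-ppc64/page.h hunk below; the sample addresses are
illustrative only:

/* Each bit of the u16 mask covers one 256MB segment (low, below 4GB)
 * or one 1TB area (high, above 4GB).  Nothing here touches the kernel;
 * compile and run anywhere to see which bits a given range touches. */
#include <stdio.h>

#define SID_SHIFT		28	/* 256MB segments */
#define HTLB_AREA_SHIFT		40	/* 1TB areas */

#define GET_ESID(addr)		((addr) >> SID_SHIFT)
#define GET_HTLB_AREA(addr)	((addr) >> HTLB_AREA_SHIFT)

#define LOW_ESID_MASK(addr, len)  (((1U << (GET_ESID((addr)+(len)-1)+1)) \
				    - (1U << GET_ESID(addr))) & 0xffff)
#define HTLB_AREA_MASK(addr, len) (((1U << (GET_HTLB_AREA((addr)+(len)-1)+1)) \
				    - (1U << GET_HTLB_AREA(addr))) & 0xffff)

int main(void)
{
	/* 32MB mapped at 1.5TB touches only high area 1 (the 1TB-2TB area) */
	printf("high mask: 0x%04x\n",
	       HTLB_AREA_MASK(0x18000000000UL, 0x2000000UL));	/* 0x0002 */

	/* 512MB mapped at 768MB touches low segments 3 and 4 */
	printf("low  mask: 0x%04x\n",
	       LOW_ESID_MASK(0x30000000UL, 0x20000000UL));	/* 0x0018 */
	return 0;
}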

Showing 5 changed files with 190 additions and 78 deletions

arch/ppc64/kernel/asm-offsets.c
... ... @@ -94,7 +94,8 @@
94 94 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
95 95 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
96 96 #ifdef CONFIG_HUGETLB_PAGE
97   - DEFINE(PACAHTLBSEGS, offsetof(struct paca_struct, context.htlb_segs));
  97 + DEFINE(PACALOWHTLBAREAS, offsetof(struct paca_struct, context.low_htlb_areas));
  98 + DEFINE(PACAHIGHHTLBAREAS, offsetof(struct paca_struct, context.high_htlb_areas));
98 99 #endif /* CONFIG_HUGETLB_PAGE */
99 100 DEFINE(PACADEFAULTDECR, offsetof(struct paca_struct, default_decr));
100 101 DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen));
arch/ppc64/mm/hugetlbpage.c
... ... @@ -27,6 +27,9 @@
27 27  
28 28 #include <linux/sysctl.h>
29 29  
  30 +#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT)
  31 +#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
  32 +
30 33 /* Modelled after find_linux_pte() */
31 34 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
32 35 {
... ... @@ -129,15 +132,17 @@
129 132 return 0;
130 133 }
131 134  
132   -static void flush_segments(void *parm)
  135 +static void flush_low_segments(void *parm)
133 136 {
134   - u16 segs = (unsigned long) parm;
  137 + u16 areas = (unsigned long) parm;
135 138 unsigned long i;
136 139  
137 140 asm volatile("isync" : : : "memory");
138 141  
139   - for (i = 0; i < 16; i++) {
140   - if (! (segs & (1U << i)))
  142 + BUILD_BUG_ON((sizeof(areas)*8) != NUM_LOW_AREAS);
  143 +
  144 + for (i = 0; i < NUM_LOW_AREAS; i++) {
  145 + if (! (areas & (1U << i)))
141 146 continue;
142 147 asm volatile("slbie %0" : : "r" (i << SID_SHIFT));
143 148 }
... ... @@ -145,13 +150,33 @@
145 150 asm volatile("isync" : : : "memory");
146 151 }
147 152  
148   -static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
  153 +static void flush_high_segments(void *parm)
149 154 {
150   - unsigned long start = seg << SID_SHIFT;
151   - unsigned long end = (seg+1) << SID_SHIFT;
  155 + u16 areas = (unsigned long) parm;
  156 + unsigned long i, j;
  157 +
  158 + asm volatile("isync" : : : "memory");
  159 +
  160 + BUILD_BUG_ON((sizeof(areas)*8) != NUM_HIGH_AREAS);
  161 +
  162 + for (i = 0; i < NUM_HIGH_AREAS; i++) {
  163 + if (! (areas & (1U << i)))
  164 + continue;
  165 + for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
  166 + asm volatile("slbie %0"
  167 + :: "r" ((i << HTLB_AREA_SHIFT) + (j << SID_SHIFT)));
  168 + }
  169 +
  170 + asm volatile("isync" : : : "memory");
  171 +}
  172 +
  173 +static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
  174 +{
  175 + unsigned long start = area << SID_SHIFT;
  176 + unsigned long end = (area+1) << SID_SHIFT;
152 177 struct vm_area_struct *vma;
153 178  
154   - BUG_ON(seg >= 16);
  179 + BUG_ON(area >= NUM_LOW_AREAS);
155 180  
156 181 /* Check no VMAs are in the region */
157 182 vma = find_vma(mm, start);
... ... @@ -161,20 +186,39 @@
161 186 return 0;
162 187 }
163 188  
164   -static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
  189 +static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
165 190 {
  191 + unsigned long start = area << HTLB_AREA_SHIFT;
  192 + unsigned long end = (area+1) << HTLB_AREA_SHIFT;
  193 + struct vm_area_struct *vma;
  194 +
  195 + BUG_ON(area >= NUM_HIGH_AREAS);
  196 +
  197 + /* Check no VMAs are in the region */
  198 + vma = find_vma(mm, start);
  199 + if (vma && (vma->vm_start < end))
  200 + return -EBUSY;
  201 +
  202 + return 0;
  203 +}
  204 +
  205 +static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
  206 +{
166 207 unsigned long i;
167 208  
168   - newsegs &= ~(mm->context.htlb_segs);
169   - if (! newsegs)
  209 + BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
  210 + BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);
  211 +
  212 + newareas &= ~(mm->context.low_htlb_areas);
  213 + if (! newareas)
170 214 return 0; /* The segments we want are already open */
171 215  
172   - for (i = 0; i < 16; i++)
173   - if ((1 << i) & newsegs)
174   - if (prepare_low_seg_for_htlb(mm, i) != 0)
  216 + for (i = 0; i < NUM_LOW_AREAS; i++)
  217 + if ((1 << i) & newareas)
  218 + if (prepare_low_area_for_htlb(mm, i) != 0)
175 219 return -EBUSY;
176 220  
177   - mm->context.htlb_segs |= newsegs;
  221 + mm->context.low_htlb_areas |= newareas;
178 222  
179 223 /* update the paca copy of the context struct */
180 224 get_paca()->context = mm->context;
... ... @@ -182,29 +226,63 @@
182 226 /* the context change must make it to memory before the flush,
183 227 * so that further SLB misses do the right thing. */
184 228 mb();
185   - on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1);
  229 + on_each_cpu(flush_low_segments, (void *)(unsigned long)newareas, 0, 1);
186 230  
187 231 return 0;
188 232 }
189 233  
  234 +static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
  235 +{
  236 + unsigned long i;
  237 +
  238 + BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
  239 + BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
  240 + != NUM_HIGH_AREAS);
  241 +
  242 + newareas &= ~(mm->context.high_htlb_areas);
  243 + if (! newareas)
  244 + return 0; /* The areas we want are already open */
  245 +
  246 + for (i = 0; i < NUM_HIGH_AREAS; i++)
  247 + if ((1 << i) & newareas)
  248 + if (prepare_high_area_for_htlb(mm, i) != 0)
  249 + return -EBUSY;
  250 +
  251 + mm->context.high_htlb_areas |= newareas;
  252 +
  253 + /* update the paca copy of the context struct */
  254 + get_paca()->context = mm->context;
  255 +
  256 + /* the context change must make it to memory before the flush,
  257 + * so that further SLB misses do the right thing. */
  258 + mb();
  259 + on_each_cpu(flush_high_segments, (void *)(unsigned long)newareas, 0, 1);
  260 +
  261 + return 0;
  262 +}
  263 +
190 264 int prepare_hugepage_range(unsigned long addr, unsigned long len)
191 265 {
192   - if (within_hugepage_high_range(addr, len))
193   - return 0;
194   - else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) {
195   - int err;
196   - /* Yes, we need both tests, in case addr+len overflows
197   - * 64-bit arithmetic */
198   - err = open_low_hpage_segs(current->mm,
  266 + int err;
  267 +
  268 + if ( (addr+len) < addr )
  269 + return -EINVAL;
  270 +
  271 + if ((addr + len) < 0x100000000UL)
  272 + err = open_low_hpage_areas(current->mm,
199 273 LOW_ESID_MASK(addr, len));
200   - if (err)
201   - printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
202   - " failed (segs: 0x%04hx)\n", addr, len,
203   - LOW_ESID_MASK(addr, len));
  274 + else
  275 + err = open_high_hpage_areas(current->mm,
  276 + HTLB_AREA_MASK(addr, len));
  277 + if (err) {
  278 + printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
  279 + " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
  280 + addr, len,
  281 + LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
204 282 return err;
205 283 }
206 284  
207   - return -EINVAL;
  285 + return 0;
208 286 }
209 287  
210 288 struct page *
... ... @@ -276,8 +354,8 @@
276 354 vma = find_vma(mm, addr);
277 355 continue;
278 356 }
279   - if (touches_hugepage_high_range(addr, len)) {
280   - addr = TASK_HPAGE_END;
  357 + if (touches_hugepage_high_range(mm, addr, len)) {
  358 + addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
281 359 vma = find_vma(mm, addr);
282 360 continue;
283 361 }
... ... @@ -356,8 +434,9 @@
356 434 if (touches_hugepage_low_range(mm, addr, len)) {
357 435 addr = (addr & ((~0) << SID_SHIFT)) - len;
358 436 goto hugepage_recheck;
359   - } else if (touches_hugepage_high_range(addr, len)) {
360   - addr = TASK_HPAGE_BASE - len;
  437 + } else if (touches_hugepage_high_range(mm, addr, len)) {
  438 + addr = (addr & ((~0UL) << HTLB_AREA_SHIFT)) - len;
  439 + goto hugepage_recheck;
361 440 }
362 441  
363 442 /*
... ... @@ -448,23 +527,28 @@
448 527 return -ENOMEM;
449 528 }
450 529  
451   -static unsigned long htlb_get_high_area(unsigned long len)
  530 +static unsigned long htlb_get_high_area(unsigned long len, u16 areamask)
452 531 {
453   - unsigned long addr = TASK_HPAGE_BASE;
  532 + unsigned long addr = 0x100000000UL;
454 533 struct vm_area_struct *vma;
455 534  
456 535 vma = find_vma(current->mm, addr);
457   - for (vma = find_vma(current->mm, addr);
458   - addr + len <= TASK_HPAGE_END;
459   - vma = vma->vm_next) {
  536 + while (addr + len <= TASK_SIZE_USER64) {
460 537 BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
461   - BUG_ON(! within_hugepage_high_range(addr, len));
462 538  
  539 + if (! __within_hugepage_high_range(addr, len, areamask)) {
  540 + addr = ALIGN(addr+1, 1UL<<HTLB_AREA_SHIFT);
  541 + vma = find_vma(current->mm, addr);
  542 + continue;
  543 + }
  544 +
463 545 if (!vma || (addr + len) <= vma->vm_start)
464 546 return addr;
465 547 addr = ALIGN(vma->vm_end, HPAGE_SIZE);
466   - /* Because we're in a hugepage region, this alignment
467   - * should not skip us over any VMAs */
  548 + /* Depending on segmask this might not be a confirmed
  549 + * hugepage region, so the ALIGN could have skipped
  550 + * some VMAs */
  551 + vma = find_vma(current->mm, addr);
468 552 }
469 553  
470 554 return -ENOMEM;
... ... @@ -474,6 +558,9 @@
474 558 unsigned long len, unsigned long pgoff,
475 559 unsigned long flags)
476 560 {
  561 + int lastshift;
  562 + u16 areamask, curareas;
  563 +
477 564 if (len & ~HPAGE_MASK)
478 565 return -EINVAL;
479 566  
... ... @@ -481,31 +568,49 @@
481 568 return -EINVAL;
482 569  
483 570 if (test_thread_flag(TIF_32BIT)) {
484   - int lastshift = 0;
485   - u16 segmask, cursegs = current->mm->context.htlb_segs;
  571 + curareas = current->mm->context.low_htlb_areas;
486 572  
487 573 /* First see if we can do the mapping in the existing
488   - * low hpage segments */
489   - addr = htlb_get_low_area(len, cursegs);
  574 + * low areas */
  575 + addr = htlb_get_low_area(len, curareas);
490 576 if (addr != -ENOMEM)
491 577 return addr;
492 578  
493   - for (segmask = LOW_ESID_MASK(0x100000000UL-len, len);
494   - ! lastshift; segmask >>=1) {
495   - if (segmask & 1)
  579 + lastshift = 0;
  580 + for (areamask = LOW_ESID_MASK(0x100000000UL-len, len);
  581 + ! lastshift; areamask >>=1) {
  582 + if (areamask & 1)
496 583 lastshift = 1;
497 584  
498   - addr = htlb_get_low_area(len, cursegs | segmask);
  585 + addr = htlb_get_low_area(len, curareas | areamask);
499 586 if ((addr != -ENOMEM)
500   - && open_low_hpage_segs(current->mm, segmask) == 0)
  587 + && open_low_hpage_areas(current->mm, areamask) == 0)
501 588 return addr;
502 589 }
503   - printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
504   - " enough segments\n");
505   - return -ENOMEM;
506 590 } else {
507   - return htlb_get_high_area(len);
  591 + curareas = current->mm->context.high_htlb_areas;
  592 +
  593 + /* First see if we can do the mapping in the existing
  594 + * high areas */
  595 + addr = htlb_get_high_area(len, curareas);
  596 + if (addr != -ENOMEM)
  597 + return addr;
  598 +
  599 + lastshift = 0;
  600 + for (areamask = HTLB_AREA_MASK(TASK_SIZE_USER64-len, len);
  601 + ! lastshift; areamask >>=1) {
  602 + if (areamask & 1)
  603 + lastshift = 1;
  604 +
  605 + addr = htlb_get_high_area(len, curareas | areamask);
  606 + if ((addr != -ENOMEM)
  607 + && open_high_hpage_areas(current->mm, areamask) == 0)
  608 + return addr;
  609 + }
508 610 }
  611 + printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
  612 + " enough areas\n");
  613 + return -ENOMEM;
509 614 }
510 615  
511 616 int hash_huge_page(struct mm_struct *mm, unsigned long access,
arch/ppc64/mm/slb_low.S
... ... @@ -89,28 +89,29 @@
89 89 b 9f
90 90  
91 91 0: /* user address: proto-VSID = context<<15 | ESID */
92   - li r11,SLB_VSID_USER
93   -
94 92 srdi. r9,r3,USER_ESID_BITS
95 93 bne- 8f /* invalid ea bits set */
96 94  
97 95 #ifdef CONFIG_HUGETLB_PAGE
98 96 BEGIN_FTR_SECTION
99   - /* check against the hugepage ranges */
100   - cmpldi r3,(TASK_HPAGE_END>>SID_SHIFT)
101   - bge 6f /* >= TASK_HPAGE_END */
102   - cmpldi r3,(TASK_HPAGE_BASE>>SID_SHIFT)
103   - bge 5f /* TASK_HPAGE_BASE..TASK_HPAGE_END */
  97 + lhz r9,PACAHIGHHTLBAREAS(r13)
  98 + srdi r11,r3,(HTLB_AREA_SHIFT-SID_SHIFT)
  99 + srd r9,r9,r11
  100 + andi. r9,r9,1
  101 + bne 5f
  102 +
  103 + li r11,SLB_VSID_USER
  104 +
104 105 cmpldi r3,16
105   - bge 6f /* 4GB..TASK_HPAGE_BASE */
  106 + bge 6f
106 107  
107   - lhz r9,PACAHTLBSEGS(r13)
  108 + lhz r9,PACALOWHTLBAREAS(r13)
108 109 srd r9,r9,r3
109 110 andi. r9,r9,1
  111 +
110 112 beq 6f
111 113  
112   -5: /* this is a hugepage user address */
113   - li r11,(SLB_VSID_USER|SLB_VSID_L)
  114 +5: li r11,SLB_VSID_USER|SLB_VSID_L
114 115 END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
115 116 #endif /* CONFIG_HUGETLB_PAGE */
116 117  
include/asm-ppc64/mmu.h
... ... @@ -307,7 +307,7 @@
307 307 typedef struct {
308 308 mm_context_id_t id;
309 309 #ifdef CONFIG_HUGETLB_PAGE
310   - u16 htlb_segs; /* bitmask */
  310 + u16 low_htlb_areas, high_htlb_areas;
311 311 #endif
312 312 } mm_context_t;
313 313  
include/asm-ppc64/page.h
... ... @@ -37,40 +37,45 @@
37 37  
38 38 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
39 39  
40   -/* For 64-bit processes the hugepage range is 1T-1.5T */
41   -#define TASK_HPAGE_BASE ASM_CONST(0x0000010000000000)
42   -#define TASK_HPAGE_END ASM_CONST(0x0000018000000000)
  40 +#define HTLB_AREA_SHIFT 40
  41 +#define HTLB_AREA_SIZE (1UL << HTLB_AREA_SHIFT)
  42 +#define GET_HTLB_AREA(x) ((x) >> HTLB_AREA_SHIFT)
43 43  
44 44 #define LOW_ESID_MASK(addr, len) (((1U << (GET_ESID(addr+len-1)+1)) \
45 45 - (1U << GET_ESID(addr))) & 0xffff)
  46 +#define HTLB_AREA_MASK(addr, len) (((1U << (GET_HTLB_AREA(addr+len-1)+1)) \
  47 + - (1U << GET_HTLB_AREA(addr))) & 0xffff)
46 48  
47 49 #define ARCH_HAS_HUGEPAGE_ONLY_RANGE
48 50 #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
49 51 #define ARCH_HAS_SETCLEAR_HUGE_PTE
50 52  
51 53 #define touches_hugepage_low_range(mm, addr, len) \
52   - (LOW_ESID_MASK((addr), (len)) & mm->context.htlb_segs)
53   -#define touches_hugepage_high_range(addr, len) \
54   - (((addr) > (TASK_HPAGE_BASE-(len))) && ((addr) < TASK_HPAGE_END))
  54 + (LOW_ESID_MASK((addr), (len)) & (mm)->context.low_htlb_areas)
  55 +#define touches_hugepage_high_range(mm, addr, len) \
  56 + (HTLB_AREA_MASK((addr), (len)) & (mm)->context.high_htlb_areas)
55 57  
56 58 #define __within_hugepage_low_range(addr, len, segmask) \
57 59 ((LOW_ESID_MASK((addr), (len)) | (segmask)) == (segmask))
58 60 #define within_hugepage_low_range(addr, len) \
59 61 __within_hugepage_low_range((addr), (len), \
60   - current->mm->context.htlb_segs)
61   -#define within_hugepage_high_range(addr, len) (((addr) >= TASK_HPAGE_BASE) \
62   - && ((addr)+(len) <= TASK_HPAGE_END) && ((addr)+(len) >= (addr)))
  62 + current->mm->context.low_htlb_areas)
  63 +#define __within_hugepage_high_range(addr, len, zonemask) \
  64 + ((HTLB_AREA_MASK((addr), (len)) | (zonemask)) == (zonemask))
  65 +#define within_hugepage_high_range(addr, len) \
  66 + __within_hugepage_high_range((addr), (len), \
  67 + current->mm->context.high_htlb_areas)
63 68  
64 69 #define is_hugepage_only_range(mm, addr, len) \
65   - (touches_hugepage_high_range((addr), (len)) || \
  70 + (touches_hugepage_high_range((mm), (addr), (len)) || \
66 71 touches_hugepage_low_range((mm), (addr), (len)))
67 72 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
68 73  
69 74 #define in_hugepage_area(context, addr) \
70 75 (cpu_has_feature(CPU_FTR_16M_PAGE) && \
71   - ( (((addr) >= TASK_HPAGE_BASE) && ((addr) < TASK_HPAGE_END)) || \
  76 + ( ((1 << GET_HTLB_AREA(addr)) & (context).high_htlb_areas) || \
72 77 ( ((addr) < 0x100000000L) && \
73   - ((1 << GET_ESID(addr)) & (context).htlb_segs) ) ) )
  78 + ((1 << GET_ESID(addr)) & (context).low_htlb_areas) ) ) )
74 79  
75 80 #else /* !CONFIG_HUGETLB_PAGE */
76 81