Commit 5c1f6ee9a31cbdac90bbb8ae1ba4475031ac74b4

Authored by Aneesh Kumar K.V
Committed by Benjamin Herrenschmidt
1 parent d614bb0412

powerpc: Reduce PTE table memory wastage

We allocate one page for the last level of the Linux page table. With THP
and a large page size of 16MB, that means a large part of that page is
wasted: to map a 16MB area with a 64K base page size, we only need 2K of
PTE space. This patch reduces the wastage by sharing the page allocated
for the last level of the Linux page table among multiple pmd entries. We
call these smaller chunks PTE page fragments, and the allocated page the
PTE page.

To support systems which don't have 64K HPTE support, we also add another
2K to the PTE page fragment. The second half of the fragment is used to
store the slot and secondary bit information of an HPTE. With this we now
have a 4K PTE fragment.
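
For illustration, a minimal sketch of the size arithmetic above; the
EXAMPLE_* names and values are assumptions based on the usual 64K-page
ppc64 configuration, not identifiers from this patch:

/*
 * Worked example of the numbers above, assuming a 64K base page,
 * 8-byte PTEs and a 16MB huge page.
 */
#define EXAMPLE_HPAGE_SIZE	(16UL << 20)	/* 16MB huge page */
#define EXAMPLE_BASE_PAGE	(64UL << 10)	/* 64K base page */
#define EXAMPLE_PTE_BYTES	8UL		/* sizeof(pte_t) */

/* 16MB / 64K = 256 PTEs; 256 * 8 bytes = 2K of real PTE space */
#define EXAMPLE_PTE_SPACE \
	((EXAMPLE_HPAGE_SIZE / EXAMPLE_BASE_PAGE) * EXAMPLE_PTE_BYTES)
/* plus another 2K for the HPTE slot/secondary-bit half = 4K fragment */
#define EXAMPLE_FRAG_SIZE	(2 * EXAMPLE_PTE_SPACE)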

We use a simple approach to share the PTE page. On allocation, we bump the
PTE page refcount to 16 and share the PTE page across the next 16 pte
alloc requests. This should help the node locality of the PTE page
fragments, assuming that the subsequent pte alloc requests mostly come
from the same NUMA node. We don't try to reuse freed PTE page fragments,
so we may still waste some space.
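
A simplified sketch of that sharing scheme, mirroring page_table_alloc()
and __alloc_for_cache() in the diff below; locking, pgtable_page_ctor()
and the kernel/user distinction are omitted, and the helper name is made
up for illustration:

/*
 * mm->context.pte_frag acts as a cursor into the current PTE page;
 * each call hands out one PTE_FRAG_SIZE chunk of that page.
 */
static pte_t *pte_frag_alloc_sketch(struct mm_struct *mm)
{
	void *ret = mm->context.pte_frag;

	if (ret) {
		void *next = ret + PTE_FRAG_SIZE;
		/* cursor wrapped to a page boundary: page fully carved up */
		if (((unsigned long)next & ~PAGE_MASK) == 0)
			next = NULL;
		mm->context.pte_frag = next;
		return (pte_t *)ret;
	}
	/*
	 * No partially used page: allocate one and pre-bias its refcount
	 * so that each of the PTE_FRAG_NR fragments holds one reference.
	 */
	ret = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
	if (ret) {
		atomic_set(&virt_to_page(ret)->_count, PTE_FRAG_NR);
		mm->context.pte_frag = ret + PTE_FRAG_SIZE;
	}
	return (pte_t *)ret;
}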

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>

Showing 7 changed files with 195 additions and 58 deletions

arch/powerpc/include/asm/mmu-book3e.h
... ... @@ -231,6 +231,10 @@
231 231 u64 high_slices_psize; /* 4 bits per slice for now */
232 232 u16 user_psize; /* page size index */
233 233 #endif
  234 +#ifdef CONFIG_PPC_64K_PAGES
  235 + /* for 4K PTE fragment support */
  236 + void *pte_frag;
  237 +#endif
234 238 } mm_context_t;
235 239  
236 240 /* Page size definitions, common between 32 and 64-bit
arch/powerpc/include/asm/mmu-hash64.h
... ... @@ -516,6 +516,10 @@
516 516 unsigned long acop; /* mask of enabled coprocessor types */
517 517 unsigned int cop_pid; /* pid value used with coprocessors */
518 518 #endif /* CONFIG_PPC_ICSWX */
  519 +#ifdef CONFIG_PPC_64K_PAGES
  520 + /* for 4K PTE fragment support */
  521 + void *pte_frag;
  522 +#endif
519 523 } mm_context_t;
520 524  
521 525  
arch/powerpc/include/asm/page.h
... ... @@ -393,7 +393,11 @@
393 393  
394 394 struct vm_area_struct;
395 395  
  396 +#ifdef CONFIG_PPC_64K_PAGES
  397 +typedef pte_t *pgtable_t;
  398 +#else
396 399 typedef struct page *pgtable_t;
  400 +#endif
397 401  
398 402 #include <asm-generic/memory_model.h>
399 403 #endif /* __ASSEMBLY__ */
arch/powerpc/include/asm/pgalloc-64.h
... ... @@ -152,7 +152,24 @@
152 152 }
153 153  
154 154 #else /* if CONFIG_PPC_64K_PAGES */
  155 +/*
  156 + * we support 16 fragments per PTE page.
  157 + */
  158 +#define PTE_FRAG_NR 16
  159 +/*
  160 + * We use a 2K PTE page fragment and another 2K for storing
  161 + * real_pte_t hash index
  162 + */
  163 +#define PTE_FRAG_SIZE_SHIFT 12
  164 +#define PTE_FRAG_SIZE (2 * PTRS_PER_PTE * sizeof(pte_t))
155 165  
  166 +extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int);
  167 +extern void page_table_free(struct mm_struct *, unsigned long *, int);
  168 +extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
  169 +#ifdef CONFIG_SMP
  170 +extern void __tlb_remove_table(void *_table);
  171 +#endif
  172 +
156 173 #define pud_populate(mm, pud, pmd) pud_set(pud, (unsigned long)pmd)
157 174  
158 175 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
... ... @@ -164,90 +181,42 @@
164 181 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
165 182 pgtable_t pte_page)
166 183 {
167   - pmd_populate_kernel(mm, pmd, page_address(pte_page));
  184 + pmd_set(pmd, (unsigned long)pte_page);
168 185 }
169 186  
170 187 static inline pgtable_t pmd_pgtable(pmd_t pmd)
171 188 {
172   - return pmd_page(pmd);
  189 + return (pgtable_t)(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE);
173 190 }
174 191  
175 192 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
176 193 unsigned long address)
177 194 {
178   - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO);
  195 + return (pte_t *)page_table_alloc(mm, address, 1);
179 196 }
180 197  
181 198 static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
182   - unsigned long address)
  199 + unsigned long address)
183 200 {
184   - struct page *page;
185   - pte_t *pte;
186   -
187   - pte = pte_alloc_one_kernel(mm, address);
188   - if (!pte)
189   - return NULL;
190   - page = virt_to_page(pte);
191   - pgtable_page_ctor(page);
192   - return page;
  201 + return (pgtable_t)page_table_alloc(mm, address, 0);
193 202 }
194 203  
195 204 static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
196 205 {
197   - free_page((unsigned long)pte);
  206 + page_table_free(mm, (unsigned long *)pte, 1);
198 207 }
199 208  
200 209 static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
201 210 {
202   - pgtable_page_dtor(ptepage);
203   - __free_page(ptepage);
  211 + page_table_free(mm, (unsigned long *)ptepage, 0);
204 212 }
205 213  
206   -static inline void pgtable_free(void *table, unsigned index_size)
207   -{
208   - if (!index_size)
209   - free_page((unsigned long)table);
210   - else {
211   - BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
212   - kmem_cache_free(PGT_CACHE(index_size), table);
213   - }
214   -}
215   -
216   -#ifdef CONFIG_SMP
217   -static inline void pgtable_free_tlb(struct mmu_gather *tlb,
218   - void *table, int shift)
219   -{
220   - unsigned long pgf = (unsigned long)table;
221   - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
222   - pgf |= shift;
223   - tlb_remove_table(tlb, (void *)pgf);
224   -}
225   -
226   -static inline void __tlb_remove_table(void *_table)
227   -{
228   - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
229   - unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
230   -
231   - pgtable_free(table, shift);
232   -}
233   -#else /* !CONFIG_SMP */
234   -static inline void pgtable_free_tlb(struct mmu_gather *tlb,
235   - void *table, int shift)
236   -{
237   - pgtable_free(table, shift);
238   -}
239   -#endif /* CONFIG_SMP */
240   -
241 214 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
242 215 unsigned long address)
243 216 {
244   - struct page *page = page_address(table);
245   -
246 217 tlb_flush_pgtable(tlb, address);
247   - pgtable_page_dtor(page);
248   - pgtable_free_tlb(tlb, page, 0);
  218 + pgtable_free_tlb(tlb, table, 0);
249 219 }
250   -
251 220 #endif /* CONFIG_PPC_64K_PAGES */
252 221  
253 222 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
... ... @@ -260,7 +229,6 @@
260 229 {
261 230 kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd);
262 231 }
263   -
264 232  
265 233 #define __pmd_free_tlb(tlb, pmd, addr) \
266 234 pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE)
arch/powerpc/kernel/setup_64.c
... ... @@ -583,7 +583,9 @@
583 583 init_mm.end_code = (unsigned long) _etext;
584 584 init_mm.end_data = (unsigned long) _edata;
585 585 init_mm.brk = klimit;
586   -
  586 +#ifdef CONFIG_PPC_64K_PAGES
  587 + init_mm.context.pte_frag = NULL;
  588 +#endif
587 589 irqstack_early_init();
588 590 exc_lvl_early_init();
589 591 emergency_stack_init();
arch/powerpc/mm/mmu_context_hash64.c
... ... @@ -23,6 +23,7 @@
23 23 #include <linux/slab.h>
24 24  
25 25 #include <asm/mmu_context.h>
  26 +#include <asm/pgalloc.h>
26 27  
27 28 #include "icswx.h"
28 29  
... ... @@ -85,6 +86,9 @@
85 86 spin_lock_init(mm->context.cop_lockp);
86 87 #endif /* CONFIG_PPC_ICSWX */
87 88  
  89 +#ifdef CONFIG_PPC_64K_PAGES
  90 + mm->context.pte_frag = NULL;
  91 +#endif
88 92 return 0;
89 93 }
90 94  
... ... @@ -96,13 +100,46 @@
96 100 }
97 101 EXPORT_SYMBOL_GPL(__destroy_context);
98 102  
  103 +#ifdef CONFIG_PPC_64K_PAGES
  104 +static void destroy_pagetable_page(struct mm_struct *mm)
  105 +{
  106 + int count;
  107 + void *pte_frag;
  108 + struct page *page;
  109 +
  110 + pte_frag = mm->context.pte_frag;
  111 + if (!pte_frag)
  112 + return;
  113 +
  114 + page = virt_to_page(pte_frag);
  115 + /* drop all the pending references */
  116 + count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
  117 + /* We allow PTE_FRAG_NR fragments from a PTE page */
  118 + count = atomic_sub_return(PTE_FRAG_NR - count, &page->_count);
  119 + if (!count) {
  120 + pgtable_page_dtor(page);
  121 + free_hot_cold_page(page, 0);
  122 + }
  123 +}
  124 +
  125 +#else
  126 +static inline void destroy_pagetable_page(struct mm_struct *mm)
  127 +{
  128 + return;
  129 +}
  130 +#endif
  131 +
  132 +
99 133 void destroy_context(struct mm_struct *mm)
100 134 {
  135 +
101 136 #ifdef CONFIG_PPC_ICSWX
102 137 drop_cop(mm->context.acop, mm);
103 138 kfree(mm->context.cop_lockp);
104 139 mm->context.cop_lockp = NULL;
105 140 #endif /* CONFIG_PPC_ICSWX */
  141 +
  142 + destroy_pagetable_page(mm);
106 143 __destroy_context(mm->context.id);
107 144 subpage_prot_free(mm);
108 145 mm->context.id = MMU_NO_CONTEXT;
arch/powerpc/mm/pgtable_64.c
... ... @@ -337,4 +337,122 @@
337 337 EXPORT_SYMBOL(iounmap);
338 338 EXPORT_SYMBOL(__iounmap);
339 339 EXPORT_SYMBOL(__iounmap_at);
  340 +
  341 +#ifdef CONFIG_PPC_64K_PAGES
  342 +static pte_t *get_from_cache(struct mm_struct *mm)
  343 +{
  344 + void *pte_frag, *ret;
  345 +
  346 + spin_lock(&mm->page_table_lock);
  347 + ret = mm->context.pte_frag;
  348 + if (ret) {
  349 + pte_frag = ret + PTE_FRAG_SIZE;
  350 + /*
  351 + * If we have taken up all the fragments mark PTE page NULL
  352 + */
  353 + if (((unsigned long)pte_frag & ~PAGE_MASK) == 0)
  354 + pte_frag = NULL;
  355 + mm->context.pte_frag = pte_frag;
  356 + }
  357 + spin_unlock(&mm->page_table_lock);
  358 + return (pte_t *)ret;
  359 +}
  360 +
  361 +static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel)
  362 +{
  363 + void *ret = NULL;
  364 + struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK |
  365 + __GFP_REPEAT | __GFP_ZERO);
  366 + if (!page)
  367 + return NULL;
  368 +
  369 + ret = page_address(page);
  370 + spin_lock(&mm->page_table_lock);
  371 + /*
  372 + * If we find pgtable_page set, we return
  373 + * the allocated page with single fragement
  374 + * count.
  375 + */
  376 + if (likely(!mm->context.pte_frag)) {
  377 + atomic_set(&page->_count, PTE_FRAG_NR);
  378 + mm->context.pte_frag = ret + PTE_FRAG_SIZE;
  379 + }
  380 + spin_unlock(&mm->page_table_lock);
  381 +
  382 + if (!kernel)
  383 + pgtable_page_ctor(page);
  384 +
  385 + return (pte_t *)ret;
  386 +}
  387 +
  388 +pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
  389 +{
  390 + pte_t *pte;
  391 +
  392 + pte = get_from_cache(mm);
  393 + if (pte)
  394 + return pte;
  395 +
  396 + return __alloc_for_cache(mm, kernel);
  397 +}
  398 +
  399 +void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
  400 +{
  401 + struct page *page = virt_to_page(table);
  402 + if (put_page_testzero(page)) {
  403 + if (!kernel)
  404 + pgtable_page_dtor(page);
  405 + free_hot_cold_page(page, 0);
  406 + }
  407 +}
  408 +
  409 +#ifdef CONFIG_SMP
  410 +static void page_table_free_rcu(void *table)
  411 +{
  412 + struct page *page = virt_to_page(table);
  413 + if (put_page_testzero(page)) {
  414 + pgtable_page_dtor(page);
  415 + free_hot_cold_page(page, 0);
  416 + }
  417 +}
  418 +
  419 +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
  420 +{
  421 + unsigned long pgf = (unsigned long)table;
  422 +
  423 + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
  424 + pgf |= shift;
  425 + tlb_remove_table(tlb, (void *)pgf);
  426 +}
  427 +
  428 +void __tlb_remove_table(void *_table)
  429 +{
  430 + void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
  431 + unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
  432 +
  433 + if (!shift)
  434 + /* PTE page needs special handling */
  435 + page_table_free_rcu(table);
  436 + else {
  437 + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
  438 + kmem_cache_free(PGT_CACHE(shift), table);
  439 + }
  440 +}
  441 +#else
  442 +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
  443 +{
  444 + if (!shift) {
  445 + /* PTE page needs special handling */
  446 + struct page *page = virt_to_page(table);
  447 + if (put_page_testzero(page)) {
  448 + pgtable_page_dtor(page);
  449 + free_hot_cold_page(page, 0);
  450 + }
  451 + } else {
  452 + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
  453 + kmem_cache_free(PGT_CACHE(shift), table);
  454 + }
  455 +}
  456 +#endif
  457 +#endif /* CONFIG_PPC_64K_PAGES */