Commit 5da7ca86078964cbfe6c83efc1205904587706fe

Authored by Christoph Lameter
Committed by Linus Torvalds
1 parent 96df9333c9

[PATCH] Add NUMA policy support for huge pages.

The huge_zonelist() function in the memory policy layer provides a list of
zones ordered by NUMA distance.  The hugetlb layer will walk that list looking
for a zone that has available huge pages and is also in the nodeset of the
current cpuset.
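
As a rough illustration of the walk described above, the sketch below picks the
first node in a NUMA-distance-ordered list that both has a free huge page and is
allowed by the current cpuset, which is the decision dequeue_huge_page() makes
after this patch.  It is a minimal, self-contained userspace sketch, not kernel
code: free_huge_pages_node[] and cpuset_allows_node[] are illustrative stand-ins
for the kernel's per-node free lists and cpuset check.

#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 4

/* Illustrative stand-ins (assumptions, not kernel data structures). */
static int  free_huge_pages_node[MAX_NODES] = { 0, 2, 0, 1 };
static bool cpuset_allows_node[MAX_NODES]   = { true, false, true, true };

/*
 * Walk a node ordering (closest first, as huge_zonelist() would provide)
 * and take the first node that has a free huge page and is in the
 * cpuset's nodeset.
 */
static int pick_huge_page_node(const int *node_order, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		int nid = node_order[i];

		if (cpuset_allows_node[nid] && free_huge_pages_node[nid] > 0)
			return nid;
	}
	return -1;	/* no suitable node: the allocation fails */
}

int main(void)
{
	/* NUMA-distance order as seen from node 0 (made up for the example). */
	int order_from_node0[MAX_NODES] = { 0, 1, 2, 3 };

	printf("allocate huge page from node %d\n",
	       pick_huge_page_node(order_from_node0, MAX_NODES));
	return 0;
}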

This patch does not contain the folding of find_or_alloc_huge_page() that was
controversial in the earlier discussion.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Acked-by: William Lee Irwin III <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 4 changed files with 54 additions and 21 deletions

include/linux/hugetlb.h
... ... @@ -22,7 +22,7 @@
22 22 int hugetlb_report_node_meminfo(int, char *);
23 23 int is_hugepage_mem_enough(size_t);
24 24 unsigned long hugetlb_total_pages(void);
25   -struct page *alloc_huge_page(void);
  25 +struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
26 26 void free_huge_page(struct page *);
27 27 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
28 28 unsigned long address, int write_access);
... ... @@ -97,7 +97,7 @@
97 97 #define is_hugepage_only_range(mm, addr, len) 0
98 98 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
99 99 do { } while (0)
100   -#define alloc_huge_page() ({ NULL; })
  100 +#define alloc_huge_page(vma, addr) ({ NULL; })
101 101 #define free_huge_page(p) ({ (void)(p); BUG(); })
102 102 #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; })
103 103  
include/linux/mempolicy.h
... ... @@ -156,6 +156,8 @@
156 156 extern void numa_policy_init(void);
157 157 extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);
158 158 extern struct mempolicy default_policy;
  159 +extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
  160 + unsigned long addr);
159 161  
160 162 #else
161 163  
... ... @@ -230,6 +232,12 @@
230 232 static inline void numa_policy_rebind(const nodemask_t *old,
231 233 const nodemask_t *new)
232 234 {
  235 +}
  236 +
  237 +static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
  238 + unsigned long addr)
  239 +{
  240 + return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
233 241 }
234 242  
235 243 #endif /* CONFIG_NUMA */
mm/hugetlb.c
... ... @@ -11,6 +11,8 @@
11 11 #include <linux/highmem.h>
12 12 #include <linux/nodemask.h>
13 13 #include <linux/pagemap.h>
  14 +#include <linux/mempolicy.h>
  15 +
14 16 #include <asm/page.h>
15 17 #include <asm/pgtable.h>
16 18  
17 19  
... ... @@ -36,11 +38,12 @@
36 38 free_huge_pages_node[nid]++;
37 39 }
38 40  
39   -static struct page *dequeue_huge_page(void)
  41 +static struct page *dequeue_huge_page(struct vm_area_struct *vma,
  42 + unsigned long address)
40 43 {
41 44 int nid = numa_node_id();
42 45 struct page *page = NULL;
43   - struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists;
  46 + struct zonelist *zonelist = huge_zonelist(vma, address);
44 47 struct zone **z;
45 48  
46 49 for (z = zonelist->zones; *z; z++) {
47 50  
... ... @@ -87,13 +90,13 @@
87 90 spin_unlock(&hugetlb_lock);
88 91 }
89 92  
90   -struct page *alloc_huge_page(void)
  93 +struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
91 94 {
92 95 struct page *page;
93 96 int i;
94 97  
95 98 spin_lock(&hugetlb_lock);
96   - page = dequeue_huge_page();
  99 + page = dequeue_huge_page(vma, addr);
97 100 if (!page) {
98 101 spin_unlock(&hugetlb_lock);
99 102 return NULL;
... ... @@ -196,7 +199,7 @@
196 199 spin_lock(&hugetlb_lock);
197 200 try_to_free_low(count);
198 201 while (count < nr_huge_pages) {
199   - struct page *page = dequeue_huge_page();
  202 + struct page *page = dequeue_huge_page(NULL, 0);
200 203 if (!page)
201 204 break;
202 205 update_and_free_page(page);
... ... @@ -365,8 +368,9 @@
365 368 flush_tlb_range(vma, start, end);
366 369 }
367 370  
368   -static struct page *find_or_alloc_huge_page(struct address_space *mapping,
369   - unsigned long idx, int shared)
  371 +static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma,
  372 + unsigned long addr, struct address_space *mapping,
  373 + unsigned long idx, int shared)
370 374 {
371 375 struct page *page;
372 376 int err;
... ... @@ -378,7 +382,7 @@
378 382  
379 383 if (hugetlb_get_quota(mapping))
380 384 goto out;
381   - page = alloc_huge_page();
  385 + page = alloc_huge_page(vma, addr);
382 386 if (!page) {
383 387 hugetlb_put_quota(mapping);
384 388 goto out;
... ... @@ -418,7 +422,7 @@
418 422 }
419 423  
420 424 page_cache_get(old_page);
421   - new_page = alloc_huge_page();
  425 + new_page = alloc_huge_page(vma, address);
422 426  
423 427 if (!new_page) {
424 428 page_cache_release(old_page);
... ... @@ -467,7 +471,7 @@
467 471 * Use page lock to guard against racing truncation
468 472 * before we get page_table_lock.
469 473 */
470   - page = find_or_alloc_huge_page(mapping, idx,
  474 + page = find_or_alloc_huge_page(vma, address, mapping, idx,
471 475 vma->vm_flags & VM_SHARED);
472 476 if (!page)
473 477 goto out;
mm/mempolicy.c
... ... @@ -785,6 +785,34 @@
785 785 return nid;
786 786 }
787 787  
  788 +/* Determine a node number for interleave */
  789 +static inline unsigned interleave_nid(struct mempolicy *pol,
  790 + struct vm_area_struct *vma, unsigned long addr, int shift)
  791 +{
  792 + if (vma) {
  793 + unsigned long off;
  794 +
  795 + off = vma->vm_pgoff;
  796 + off += (addr - vma->vm_start) >> shift;
  797 + return offset_il_node(pol, vma, off);
  798 + } else
  799 + return interleave_nodes(pol);
  800 +}
  801 +
  802 +/* Return a zonelist suitable for a huge page allocation. */
  803 +struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
  804 +{
  805 + struct mempolicy *pol = get_vma_policy(current, vma, addr);
  806 +
  807 + if (pol->policy == MPOL_INTERLEAVE) {
  808 + unsigned nid;
  809 +
  810 + nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
  811 + return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
  812 + }
  813 + return zonelist_policy(GFP_HIGHUSER, pol);
  814 +}
  815 +
788 816 /* Allocate a page in interleaved policy.
789 817 Own path because it needs to do special accounting. */
790 818 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
... ... @@ -833,15 +861,8 @@
833 861  
834 862 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
835 863 unsigned nid;
836   - if (vma) {
837   - unsigned long off;
838   - off = vma->vm_pgoff;
839   - off += (addr - vma->vm_start) >> PAGE_SHIFT;
840   - nid = offset_il_node(pol, vma, off);
841   - } else {
842   - /* fall back to process interleaving */
843   - nid = interleave_nodes(pol);
844   - }
  864 +
  865 + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
845 866 return alloc_page_interleave(gfp, 0, nid);
846 867 }
847 868 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
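
For reference, the interleave arithmetic that the patch factors into
interleave_nid() can be illustrated in isolation: the huge page index within the
VMA (the offset shifted by HPAGE_SHIFT rather than PAGE_SHIFT) selects a node
from the interleave set, so consecutive huge pages in a mapping rotate across
the policy's nodes.  The sketch below is a simplified, self-contained
illustration; the nodes[] array and the modulo step stand in for the kernel's
nodemask and offset_il_node(), and the 2 MB huge page size is an assumption made
only for this example.

#include <stdio.h>

#define HPAGE_SHIFT 21	/* assumed 2 MB huge pages, for illustration only */

/*
 * Simplified stand-in for interleave_nid() under MPOL_INTERLEAVE:
 * nodes[]/nnodes model the interleave nodemask, vm_pgoff/vm_start
 * describe the VMA, addr is the faulting address.
 */
static int interleave_nid_sketch(const int *nodes, int nnodes,
				 unsigned long vm_pgoff,
				 unsigned long vm_start,
				 unsigned long addr, int shift)
{
	unsigned long off = vm_pgoff + ((addr - vm_start) >> shift);

	return nodes[off % nnodes];	/* offset_il_node(), in spirit */
}

int main(void)
{
	int nodes[] = { 0, 1, 2, 3 };
	unsigned long vm_start = 0x40000000UL;
	int i;

	/* The first four huge pages of the mapping land on nodes 0, 1, 2, 3. */
	for (i = 0; i < 4; i++) {
		unsigned long addr = vm_start + ((unsigned long)i << HPAGE_SHIFT);

		printf("huge page %d -> node %d\n", i,
		       interleave_nid_sketch(nodes, 4, 0, vm_start, addr,
					     HPAGE_SHIFT));
	}
	return 0;
}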