Commit 5da7ca86078964cbfe6c83efc1205904587706fe
Committed by Linus Torvalds
1 parent 96df9333c9
Exists in master and in 20 other branches
[PATCH] Add NUMA policy support for huge pages.
The huge_zonelist() function in the memory policy layer provides a list of zones ordered by NUMA distance. The hugetlb layer will walk that list looking for a zone that has available huge pages but is also in the nodeset of the current cpuset.

This patch does not contain the folding of find_or_alloc_huge_page() that was controversial in the earlier discussion.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Acked-by: William Lee Irwin III <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
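The diff below only shows the header of the zonelist walk in dequeue_huge_page(); the loop body itself is outside the patch context. As a rough sketch of the mechanism the message describes, and not the elided kernel code itself, the allocator side might look like the following (the cpuset_zone_allowed() check and the file-local free-list bookkeeping are assumptions here):

	/*
	 * Sketch only: walk the policy-ordered zonelist returned by
	 * huge_zonelist() and take the first zone whose node both has free
	 * huge pages and is allowed by the current cpuset.  Assumes the
	 * file-local hugepage_freelists[], free_huge_pages and
	 * free_huge_pages_node[] from mm/hugetlb.c; called under
	 * hugetlb_lock.
	 */
	static struct page *dequeue_huge_page_sketch(struct vm_area_struct *vma,
						     unsigned long address)
	{
		int nid = numa_node_id();
		struct page *page = NULL;
		struct zonelist *zonelist = huge_zonelist(vma, address);
		struct zone **z;

		for (z = zonelist->zones; *z; z++) {
			nid = (*z)->zone_pgdat->node_id;
			/* assumed cpuset nodeset check */
			if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
			    !list_empty(&hugepage_freelists[nid]))
				break;
		}

		if (*z) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
		}
		return page;
	}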
Showing 4 changed files with 54 additions and 21 deletions
include/linux/hugetlb.h
... | ... | @@ -22,7 +22,7 @@ |
22 | 22 | int hugetlb_report_node_meminfo(int, char *); |
23 | 23 | int is_hugepage_mem_enough(size_t); |
24 | 24 | unsigned long hugetlb_total_pages(void); |
25 | -struct page *alloc_huge_page(void); | |
25 | +struct page *alloc_huge_page(struct vm_area_struct *, unsigned long); | |
26 | 26 | void free_huge_page(struct page *); |
27 | 27 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
28 | 28 | unsigned long address, int write_access); |
... | ... | @@ -97,7 +97,7 @@ |
97 | 97 | #define is_hugepage_only_range(mm, addr, len) 0 |
98 | 98 | #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \ |
99 | 99 | do { } while (0) |
100 | -#define alloc_huge_page() ({ NULL; }) | |
100 | +#define alloc_huge_page(vma, addr) ({ NULL; }) | |
101 | 101 | #define free_huge_page(p) ({ (void)(p); BUG(); }) |
102 | 102 | #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) |
103 | 103 |
include/linux/mempolicy.h
... | ... | @@ -156,6 +156,8 @@ |
156 | 156 | extern void numa_policy_init(void); |
157 | 157 | extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new); |
158 | 158 | extern struct mempolicy default_policy; |
159 | +extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, | |
160 | + unsigned long addr); | |
159 | 161 | |
160 | 162 | #else |
161 | 163 | |
... | ... | @@ -230,6 +232,12 @@ |
230 | 232 | static inline void numa_policy_rebind(const nodemask_t *old, |
231 | 233 | const nodemask_t *new) |
232 | 234 | { |
235 | +} | |
236 | + | |
237 | +static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, | |
238 | + unsigned long addr) | |
239 | +{ | |
240 | + return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER); | |
233 | 241 | } |
234 | 242 | |
235 | 243 | #endif /* CONFIG_NUMA */ |
mm/hugetlb.c
... | ... | @@ -11,6 +11,8 @@ |
11 | 11 | #include <linux/highmem.h> |
12 | 12 | #include <linux/nodemask.h> |
13 | 13 | #include <linux/pagemap.h> |
14 | +#include <linux/mempolicy.h> | |
15 | + | |
14 | 16 | #include <asm/page.h> |
15 | 17 | #include <asm/pgtable.h> |
16 | 18 | |
17 | 19 | |
... | ... | @@ -36,11 +38,12 @@ |
36 | 38 | free_huge_pages_node[nid]++; |
37 | 39 | } |
38 | 40 | |
39 | -static struct page *dequeue_huge_page(void) | |
41 | +static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |
42 | + unsigned long address) | |
40 | 43 | { |
41 | 44 | int nid = numa_node_id(); |
42 | 45 | struct page *page = NULL; |
43 | - struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists; | |
46 | + struct zonelist *zonelist = huge_zonelist(vma, address); | |
44 | 47 | struct zone **z; |
45 | 48 | |
46 | 49 | for (z = zonelist->zones; *z; z++) { |
47 | 50 | |
... | ... | @@ -87,13 +90,13 @@ |
87 | 90 | spin_unlock(&hugetlb_lock); |
88 | 91 | } |
89 | 92 | |
90 | -struct page *alloc_huge_page(void) | |
93 | +struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) | |
91 | 94 | { |
92 | 95 | struct page *page; |
93 | 96 | int i; |
94 | 97 | |
95 | 98 | spin_lock(&hugetlb_lock); |
96 | - page = dequeue_huge_page(); | |
99 | + page = dequeue_huge_page(vma, addr); | |
97 | 100 | if (!page) { |
98 | 101 | spin_unlock(&hugetlb_lock); |
99 | 102 | return NULL; |
... | ... | @@ -196,7 +199,7 @@ |
196 | 199 | spin_lock(&hugetlb_lock); |
197 | 200 | try_to_free_low(count); |
198 | 201 | while (count < nr_huge_pages) { |
199 | - struct page *page = dequeue_huge_page(); | |
202 | + struct page *page = dequeue_huge_page(NULL, 0); | |
200 | 203 | if (!page) |
201 | 204 | break; |
202 | 205 | update_and_free_page(page); |
... | ... | @@ -365,8 +368,9 @@ |
365 | 368 | flush_tlb_range(vma, start, end); |
366 | 369 | } |
367 | 370 | |
368 | -static struct page *find_or_alloc_huge_page(struct address_space *mapping, | |
369 | - unsigned long idx, int shared) | |
371 | +static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma, | |
372 | + unsigned long addr, struct address_space *mapping, | |
373 | + unsigned long idx, int shared) | |
370 | 374 | { |
371 | 375 | struct page *page; |
372 | 376 | int err; |
... | ... | @@ -378,7 +382,7 @@ |
378 | 382 | |
379 | 383 | if (hugetlb_get_quota(mapping)) |
380 | 384 | goto out; |
381 | - page = alloc_huge_page(); | |
385 | + page = alloc_huge_page(vma, addr); | |
382 | 386 | if (!page) { |
383 | 387 | hugetlb_put_quota(mapping); |
384 | 388 | goto out; |
... | ... | @@ -418,7 +422,7 @@ |
418 | 422 | } |
419 | 423 | |
420 | 424 | page_cache_get(old_page); |
421 | - new_page = alloc_huge_page(); | |
425 | + new_page = alloc_huge_page(vma, address); | |
422 | 426 | |
423 | 427 | if (!new_page) { |
424 | 428 | page_cache_release(old_page); |
... | ... | @@ -467,7 +471,7 @@ |
467 | 471 | * Use page lock to guard against racing truncation |
468 | 472 | * before we get page_table_lock. |
469 | 473 | */ |
470 | - page = find_or_alloc_huge_page(mapping, idx, | |
474 | + page = find_or_alloc_huge_page(vma, address, mapping, idx, | |
471 | 475 | vma->vm_flags & VM_SHARED); |
472 | 476 | if (!page) |
473 | 477 | goto out; |
mm/mempolicy.c
... | ... | @@ -785,6 +785,34 @@ |
785 | 785 | return nid; |
786 | 786 | } |
787 | 787 | |
788 | +/* Determine a node number for interleave */ | |
789 | +static inline unsigned interleave_nid(struct mempolicy *pol, | |
790 | + struct vm_area_struct *vma, unsigned long addr, int shift) | |
791 | +{ | |
792 | + if (vma) { | |
793 | + unsigned long off; | |
794 | + | |
795 | + off = vma->vm_pgoff; | |
796 | + off += (addr - vma->vm_start) >> shift; | |
797 | + return offset_il_node(pol, vma, off); | |
798 | + } else | |
799 | + return interleave_nodes(pol); | |
800 | +} | |
801 | + | |
802 | +/* Return a zonelist suitable for a huge page allocation. */ | |
803 | +struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) | |
804 | +{ | |
805 | + struct mempolicy *pol = get_vma_policy(current, vma, addr); | |
806 | + | |
807 | + if (pol->policy == MPOL_INTERLEAVE) { | |
808 | + unsigned nid; | |
809 | + | |
810 | + nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); | |
811 | + return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER); | |
812 | + } | |
813 | + return zonelist_policy(GFP_HIGHUSER, pol); | |
814 | +} | |
815 | + | |
788 | 816 | /* Allocate a page in interleaved policy. |
789 | 817 | Own path because it needs to do special accounting. */ |
790 | 818 | static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, |
... | ... | @@ -833,15 +861,8 @@ |
833 | 861 | |
834 | 862 | if (unlikely(pol->policy == MPOL_INTERLEAVE)) { |
835 | 863 | unsigned nid; |
836 | - if (vma) { | |
837 | - unsigned long off; | |
838 | - off = vma->vm_pgoff; | |
839 | - off += (addr - vma->vm_start) >> PAGE_SHIFT; | |
840 | - nid = offset_il_node(pol, vma, off); | |
841 | - } else { | |
842 | - /* fall back to process interleaving */ | |
843 | - nid = interleave_nodes(pol); | |
844 | - } | |
864 | + | |
865 | + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); | |
845 | 866 | return alloc_page_interleave(gfp, 0, nid); |
846 | 867 | } |
847 | 868 | return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); |
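The new interleave_nid() helper factors out the node computation that alloc_page_vma() previously did inline, with the shift passed in so that huge-page allocations advance the interleave offset once per huge page rather than once per base page. A small worked example (the numbers are illustrative only, not taken from the patch):

	/*
	 * Illustrative only: assume 2 MB huge pages (HPAGE_SHIFT = 21),
	 * an MPOL_INTERLEAVE policy spanning 4 nodes, vma->vm_start =
	 * 0x40000000 and vma->vm_pgoff = 0.  A fault at
	 * addr = 0x40600000 gives
	 *
	 *     off = 0 + ((0x40600000 - 0x40000000) >> 21) = 3
	 *
	 * so offset_il_node() picks (roughly) the 4th node of the
	 * interleave set; the next huge page in the mapping wraps back
	 * to the 1st node, and so on.
	 */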