Commit 5da7ca86078964cbfe6c83efc1205904587706fe

Authored by Christoph Lameter
Committed by Linus Torvalds
1 parent 96df9333c9

[PATCH] Add NUMA policy support for huge pages.

The huge_zonelist() function in the memory policy layer provides a list of
zones ordered by NUMA distance.  The hugetlb layer will walk that list looking
for a zone that has available huge pages and is also in the nodeset of the
current cpuset.
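
As a rough illustration of the walk described above, the sketch below picks the
first node in a NUMA-distance-ordered list that both has a free huge page and is
allowed by the current cpuset, which is the decision dequeue_huge_page() makes
after this patch.  It is a minimal, self-contained userspace sketch, not kernel
code: free_huge_pages_node[] and cpuset_allows_node[] are illustrative stand-ins
for the kernel's per-node free lists and cpuset check.

#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 4

/* Illustrative stand-ins (assumptions, not kernel data structures). */
static int  free_huge_pages_node[MAX_NODES] = { 0, 2, 0, 1 };
static bool cpuset_allows_node[MAX_NODES]   = { true, false, true, true };

/*
 * Walk a node ordering (closest first, as huge_zonelist() would provide)
 * and take the first node that has a free huge page and is in the
 * cpuset's nodeset.
 */
static int pick_huge_page_node(const int *node_order, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		int nid = node_order[i];

		if (cpuset_allows_node[nid] && free_huge_pages_node[nid] > 0)
			return nid;
	}
	return -1;	/* no suitable node: the allocation fails */
}

int main(void)
{
	/* NUMA-distance order as seen from node 0 (made up for the example). */
	int order_from_node0[MAX_NODES] = { 0, 1, 2, 3 };

	printf("allocate huge page from node %d\n",
	       pick_huge_page_node(order_from_node0, MAX_NODES));
	return 0;
}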

This patch does not contain the folding of find_or_alloc_huge_page() that was
controversial in the earlier discussion.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Acked-by: William Lee Irwin III <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 4 changed files with 54 additions and 21 deletions

include/linux/hugetlb.h
... ... @@ -22,7 +22,7 @@
22 22 int hugetlb_report_node_meminfo(int, char *);
23 23 int is_hugepage_mem_enough(size_t);
24 24 unsigned long hugetlb_total_pages(void);
25   -struct page *alloc_huge_page(void);
  25 +struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
26 26 void free_huge_page(struct page *);
27 27 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
28 28 unsigned long address, int write_access);
... ... @@ -97,7 +97,7 @@
97 97 #define is_hugepage_only_range(mm, addr, len) 0
98 98 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
99 99 do { } while (0)
100   -#define alloc_huge_page() ({ NULL; })
  100 +#define alloc_huge_page(vma, addr) ({ NULL; })
101 101 #define free_huge_page(p) ({ (void)(p); BUG(); })
102 102 #define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; })
103 103  
include/linux/mempolicy.h
... ... @@ -156,6 +156,8 @@
156 156 extern void numa_policy_init(void);
157 157 extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);
158 158 extern struct mempolicy default_policy;
  159 +extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
  160 + unsigned long addr);
159 161  
160 162 #else
161 163  
... ... @@ -230,6 +232,12 @@
230 232 static inline void numa_policy_rebind(const nodemask_t *old,
231 233 const nodemask_t *new)
232 234 {
  235 +}
  236 +
  237 +static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
  238 + unsigned long addr)
  239 +{
  240 + return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
233 241 }
234 242  
235 243 #endif /* CONFIG_NUMA */
mm/hugetlb.c
... ... @@ -11,6 +11,8 @@
11 11 #include <linux/highmem.h>
12 12 #include <linux/nodemask.h>
13 13 #include <linux/pagemap.h>
  14 +#include <linux/mempolicy.h>
  15 +
14 16 #include <asm/page.h>
15 17 #include <asm/pgtable.h>
16 18  
17 19  
... ... @@ -36,11 +38,12 @@
36 38 free_huge_pages_node[nid]++;
37 39 }
38 40  
39   -static struct page *dequeue_huge_page(void)
  41 +static struct page *dequeue_huge_page(struct vm_area_struct *vma,
  42 + unsigned long address)
40 43 {
41 44 int nid = numa_node_id();
42 45 struct page *page = NULL;
43   - struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists;
  46 + struct zonelist *zonelist = huge_zonelist(vma, address);
44 47 struct zone **z;
45 48  
46 49 for (z = zonelist->zones; *z; z++) {
47 50  
... ... @@ -87,13 +90,13 @@
87 90 spin_unlock(&hugetlb_lock);
88 91 }
89 92  
90   -struct page *alloc_huge_page(void)
  93 +struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
91 94 {
92 95 struct page *page;
93 96 int i;
94 97  
95 98 spin_lock(&hugetlb_lock);
96   - page = dequeue_huge_page();
  99 + page = dequeue_huge_page(vma, addr);
97 100 if (!page) {
98 101 spin_unlock(&hugetlb_lock);
99 102 return NULL;
... ... @@ -196,7 +199,7 @@
196 199 spin_lock(&hugetlb_lock);
197 200 try_to_free_low(count);
198 201 while (count < nr_huge_pages) {
199   - struct page *page = dequeue_huge_page();
  202 + struct page *page = dequeue_huge_page(NULL, 0);
200 203 if (!page)
201 204 break;
202 205 update_and_free_page(page);
... ... @@ -365,8 +368,9 @@
365 368 flush_tlb_range(vma, start, end);
366 369 }
367 370  
368   -static struct page *find_or_alloc_huge_page(struct address_space *mapping,
369   - unsigned long idx, int shared)
  371 +static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma,
  372 + unsigned long addr, struct address_space *mapping,
  373 + unsigned long idx, int shared)
370 374 {
371 375 struct page *page;
372 376 int err;
... ... @@ -378,7 +382,7 @@
378 382  
379 383 if (hugetlb_get_quota(mapping))
380 384 goto out;
381   - page = alloc_huge_page();
  385 + page = alloc_huge_page(vma, addr);
382 386 if (!page) {
383 387 hugetlb_put_quota(mapping);
384 388 goto out;
... ... @@ -418,7 +422,7 @@
418 422 }
419 423  
420 424 page_cache_get(old_page);
421   - new_page = alloc_huge_page();
  425 + new_page = alloc_huge_page(vma, address);
422 426  
423 427 if (!new_page) {
424 428 page_cache_release(old_page);
... ... @@ -467,7 +471,7 @@
467 471 * Use page lock to guard against racing truncation
468 472 * before we get page_table_lock.
469 473 */
470   - page = find_or_alloc_huge_page(mapping, idx,
  474 + page = find_or_alloc_huge_page(vma, address, mapping, idx,
471 475 vma->vm_flags & VM_SHARED);
472 476 if (!page)
473 477 goto out;
mm/mempolicy.c
... ... @@ -785,6 +785,34 @@
785 785 return nid;
786 786 }
787 787  
  788 +/* Determine a node number for interleave */
  789 +static inline unsigned interleave_nid(struct mempolicy *pol,
  790 + struct vm_area_struct *vma, unsigned long addr, int shift)
  791 +{
  792 + if (vma) {
  793 + unsigned long off;
  794 +
  795 + off = vma->vm_pgoff;
  796 + off += (addr - vma->vm_start) >> shift;
  797 + return offset_il_node(pol, vma, off);
  798 + } else
  799 + return interleave_nodes(pol);
  800 +}
  801 +
  802 +/* Return a zonelist suitable for a huge page allocation. */
  803 +struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
  804 +{
  805 + struct mempolicy *pol = get_vma_policy(current, vma, addr);
  806 +
  807 + if (pol->policy == MPOL_INTERLEAVE) {
  808 + unsigned nid;
  809 +
  810 + nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
  811 + return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
  812 + }
  813 + return zonelist_policy(GFP_HIGHUSER, pol);
  814 +}
  815 +
788 816 /* Allocate a page in interleaved policy.
789 817 Own path because it needs to do special accounting. */
790 818 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
... ... @@ -833,15 +861,8 @@
833 861  
834 862 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
835 863 unsigned nid;
836   - if (vma) {
837   - unsigned long off;
838   - off = vma->vm_pgoff;
839   - off += (addr - vma->vm_start) >> PAGE_SHIFT;
840   - nid = offset_il_node(pol, vma, off);
841   - } else {
842   - /* fall back to process interleaving */
843   - nid = interleave_nodes(pol);
844   - }
  864 +
  865 + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
845 866 return alloc_page_interleave(gfp, 0, nid);
846 867 }
847 868 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
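
For reference, the interleave arithmetic that the patch factors into
interleave_nid() can be illustrated in isolation: the huge page index within the
VMA (the offset shifted by HPAGE_SHIFT rather than PAGE_SHIFT) selects a node
from the interleave set, so consecutive huge pages in a mapping rotate across
the policy's nodes.  The sketch below is a simplified, self-contained
illustration; the nodes[] array and the modulo step stand in for the kernel's
nodemask and offset_il_node(), and the 2 MB huge page size is an assumption made
only for this example.

#include <stdio.h>

#define HPAGE_SHIFT 21	/* assumed 2 MB huge pages, for illustration only */

/*
 * Simplified stand-in for interleave_nid() under MPOL_INTERLEAVE:
 * nodes[]/nnodes model the interleave nodemask, vm_pgoff/vm_start
 * describe the VMA, addr is the faulting address.
 */
static int interleave_nid_sketch(const int *nodes, int nnodes,
				 unsigned long vm_pgoff,
				 unsigned long vm_start,
				 unsigned long addr, int shift)
{
	unsigned long off = vm_pgoff + ((addr - vm_start) >> shift);

	return nodes[off % nnodes];	/* offset_il_node(), in spirit */
}

int main(void)
{
	int nodes[] = { 0, 1, 2, 3 };
	unsigned long vm_start = 0x40000000UL;
	int i;

	/* The first four huge pages of the mapping land on nodes 0, 1, 2, 3. */
	for (i = 0; i < 4; i++) {
		unsigned long addr = vm_start + ((unsigned long)i << HPAGE_SHIFT);

		printf("huge page %d -> node %d\n", i,
		       interleave_nid_sketch(nodes, 4, 0, vm_start, addr,
					     HPAGE_SHIFT));
	}
	return 0;
}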