Commit 480eccf9ae1073b87bb4fe118971fbf134a5bc61

Authored by Lee Schermerhorn
Committed by Linus Torvalds
1 parent 28f300d236

Fix NUMA Memory Policy Reference Counting

This patch fixes the reference counting of memory policy in the page
allocation paths and in show_numa_map().  It was extracted from my "Memory
Policy Cleanups and Enhancements" series as a stand-alone patch.

Shared policy lookup [shmem] has always added a reference to the policy,
but that reference was never dropped after page allocation or after
formatting the numa map data.
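
To illustrate the leak, here is the pre-patch allocation path, condensed
[see the alloc_page_vma() hunk below for full context]:

	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	/* for shmem, ->get_policy() took a reference on pol ... */
	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
	/* ... and no path ever dropped that reference */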

Default system policy should not require additional ref counting, nor
should the current task's task policy.  However, show_numa_map() calls
get_vma_policy() to examine what may be [likely is] another task's policy.
The latter case needs protection against freeing of the policy.

This patch adds a reference count to a mempolicy returned by
get_vma_policy() when the policy is a vma policy or another task's
mempolicy.  Again, shared policy is already reference counted on lookup.  A
matching "unref" [__mpol_free()] is performed in alloc_page_vma() for
shared and vma policies, and in show_numa_map() for shared and another
task's mempolicy.  We can call __mpol_free() directly, saving an admittedly
inexpensive inline NULL test, because we know we have a non-NULL policy.
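
In caller terms, condensed from the alloc_page_vma() and show_numa_map()
hunks below, the pattern becomes:

	struct mempolicy *pol = get_vma_policy(current, vma, addr);
	/* ... use pol for allocation or formatting ... */
	if (pol != &default_policy && pol != current->mempolicy)
		__mpol_free(pol);	/* shared, vma or other task's policy */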

Handling policy ref counts for hugepages is a bit trickier.
huge_zonelist() returns a zone list that might come from a shared or vma
'BIND policy.  In this case, we should hold the reference until after the
huge page allocation in dequeue_huge_page().  The patch modifies
huge_zonelist() to return a pointer to the mempolicy if it needs to be
unref'd after allocation.
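
Condensed from the dequeue_huge_page() hunk below, hugetlb's use of the new
interface looks like:

	struct mempolicy *mpol;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol);
	/* ... walk zonelist, dequeue a huge page ... */
	mpol_free(mpol);	/* no-op if NULL; else drops the 'BIND ref */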

Kernel Build [16cpu, 32GB, ia64] - average of 10 runs:

              w/o patch             w/ refcount patch
             Avg    Std Dev           Avg    Std Dev
Real:     100.59       0.38        100.63       0.43
User:    1209.60       0.37       1209.91       0.31
System:    81.52       0.42         81.64       0.34

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Andi Kleen <ak@suse.de>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 75 additions and 12 deletions

include/linux/mempolicy.h
... ... @@ -159,7 +159,7 @@
159 159  
160 160 extern struct mempolicy default_policy;
161 161 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
162   - unsigned long addr, gfp_t gfp_flags);
  162 + unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol);
163 163 extern unsigned slab_node(struct mempolicy *policy);
164 164  
165 165 extern enum zone_type policy_zone;
... ... @@ -256,7 +256,7 @@
256 256 #define set_cpuset_being_rebound(x) do {} while (0)
257 257  
258 258 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
259   - unsigned long addr, gfp_t gfp_flags)
  259 + unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol)
260 260 {
261 261 return NODE_DATA(0)->node_zonelists + gfp_zone(gfp_flags);
262 262 }
mm/hugetlb.c
... ... @@ -71,8 +71,9 @@
71 71 {
72 72 int nid;
73 73 struct page *page = NULL;
  74 + struct mempolicy *mpol;
74 75 struct zonelist *zonelist = huge_zonelist(vma, address,
75   - htlb_alloc_mask);
  76 + htlb_alloc_mask, &mpol);
76 77 struct zone **z;
77 78  
78 79 for (z = zonelist->zones; *z; z++) {
... ... @@ -87,6 +88,7 @@
87 88 break;
88 89 }
89 90 }
  91 + mpol_free(mpol); /* unref if mpol !NULL */
90 92 return page;
91 93 }
92 94  
mm/mempolicy.c
... ... @@ -1077,21 +1077,37 @@
1077 1077  
1078 1078 #endif
1079 1079  
1080   -/* Return effective policy for a VMA */
  1080 +/*
  1081 + * get_vma_policy(@task, @vma, @addr)
  1082 + * @task - task for fallback if vma policy == default
  1083 + * @vma - virtual memory area whose policy is sought
  1084 + * @addr - address in @vma for shared policy lookup
  1085 + *
  1086 + * Returns effective policy for a VMA at specified address.
  1087 + * Falls back to @task or system default policy, as necessary.
  1088 + * Returned policy has extra reference count if shared, vma,
  1089 + * or some other task's policy [show_numa_maps() can pass
  1090 + * @task != current]. It is the caller's responsibility to
  1091 + * free the reference in these cases.
  1092 + */
1081 1093 static struct mempolicy * get_vma_policy(struct task_struct *task,
1082 1094 struct vm_area_struct *vma, unsigned long addr)
1083 1095 {
1084 1096 struct mempolicy *pol = task->mempolicy;
  1097 + int shared_pol = 0;
1085 1098  
1086 1099 if (vma) {
1087   - if (vma->vm_ops && vma->vm_ops->get_policy)
  1100 + if (vma->vm_ops && vma->vm_ops->get_policy) {
1088 1101 pol = vma->vm_ops->get_policy(vma, addr);
1089   - else if (vma->vm_policy &&
  1102 + shared_pol = 1; /* if pol non-NULL, lookup already added ref */
  1103 + } else if (vma->vm_policy &&
1090 1104 vma->vm_policy->policy != MPOL_DEFAULT)
1091 1105 pol = vma->vm_policy;
1092 1106 }
1093 1107 if (!pol)
1094 1108 pol = &default_policy;
  1109 + else if (!shared_pol && pol != current->mempolicy)
  1110 + mpol_get(pol); /* vma or other task's policy */
1095 1111 return pol;
1096 1112 }
1097 1113  
... ... @@ -1207,19 +1223,45 @@
1207 1223 }
1208 1224  
1209 1225 #ifdef CONFIG_HUGETLBFS
1210   -/* Return a zonelist suitable for a huge page allocation. */
  1226 +/*
  1227 + * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
  1228 + * @vma = virtual memory area whose policy is sought
  1229 + * @addr = address in @vma for shared policy lookup and interleave policy
  1230 + * @gfp_flags = for requested zone
  1231 + * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
  1232 + *
  1233 + * Returns a zonelist suitable for a huge page allocation.
  1234 + * If the effective policy is 'BIND, returns pointer to policy's zonelist.
  1235 + * If it is also a policy for which get_vma_policy() returns an extra
  1236 + * reference, we must hold that reference until after allocation.
  1237 + * In that case, return policy via @mpol so hugetlb allocation can drop
  1238 + * the reference. For non-'BIND referenced policies, we can/do drop the
  1239 + * reference here, so the caller doesn't need to know about the special case
  1240 + * for default and current task policy.
  1241 + */
1211 1242 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1212   - gfp_t gfp_flags)
  1243 + gfp_t gfp_flags, struct mempolicy **mpol)
1213 1244 {
1214 1245 struct mempolicy *pol = get_vma_policy(current, vma, addr);
  1246 + struct zonelist *zl;
1215 1247  
  1248 + *mpol = NULL; /* probably no unref needed */
1216 1249 if (pol->policy == MPOL_INTERLEAVE) {
1217 1250 unsigned nid;
1218 1251  
1219 1252 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
  1253 + __mpol_free(pol); /* finished with pol */
1220 1254 return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
1221 1255 }
1222   - return zonelist_policy(GFP_HIGHUSER, pol);
  1256 +
  1257 + zl = zonelist_policy(GFP_HIGHUSER, pol);
  1258 + if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
  1259 + if (pol->policy != MPOL_BIND)
  1260 + __mpol_free(pol); /* finished with pol */
  1261 + else
  1262 + *mpol = pol; /* unref needed after allocation */
  1263 + }
  1264 + return zl;
1223 1265 }
1224 1266 #endif
1225 1267  
... ... @@ -1264,6 +1306,7 @@
1264 1306 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1265 1307 {
1266 1308 struct mempolicy *pol = get_vma_policy(current, vma, addr);
  1309 + struct zonelist *zl;
1267 1310  
1268 1311 cpuset_update_task_memory_state();
1269 1312  
... ... @@ -1273,7 +1316,19 @@
1273 1316 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1274 1317 return alloc_page_interleave(gfp, 0, nid);
1275 1318 }
1276   - return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
  1319 + zl = zonelist_policy(gfp, pol);
  1320 + if (pol != &default_policy && pol != current->mempolicy) {
  1321 + /*
  1322 + * slow path: ref counted policy -- shared or vma
  1323 + */
  1324 + struct page *page = __alloc_pages(gfp, 0, zl);
  1325 + __mpol_free(pol);
  1326 + return page;
  1327 + }
  1328 + /*
  1329 + * fast path: default or task policy
  1330 + */
  1331 + return __alloc_pages(gfp, 0, zl);
1277 1332 }
1278 1333  
1279 1334 /**
... ... @@ -1872,6 +1927,7 @@
1872 1927 struct numa_maps *md;
1873 1928 struct file *file = vma->vm_file;
1874 1929 struct mm_struct *mm = vma->vm_mm;
  1930 + struct mempolicy *pol;
1875 1931 int n;
1876 1932 char buffer[50];
1877 1933  
... ... @@ -1882,8 +1938,13 @@
1882 1938 if (!md)
1883 1939 return 0;
1884 1940  
1885   - mpol_to_str(buffer, sizeof(buffer),
1886   - get_vma_policy(priv->task, vma, vma->vm_start));
  1941 + pol = get_vma_policy(priv->task, vma, vma->vm_start);
  1942 + mpol_to_str(buffer, sizeof(buffer), pol);
  1943 + /*
  1944 + * unref shared or other task's mempolicy
  1945 + */
  1946 + if (pol != &default_policy && pol != current->mempolicy)
  1947 + __mpol_free(pol);
1887 1948  
1888 1949 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1889 1950