Commit 42288fe366c4f1ce7522bc9f27d0bc2a81c55264

Authored by Mel Gorman
Committed by Linus Torvalds
1 parent 5439ca6b8f

mm: mempolicy: Convert shared_policy mutex to spinlock

Sasha was fuzzing with trinity and reported the following problem:

  BUG: sleeping function called from invalid context at kernel/mutex.c:269
  in_atomic(): 1, irqs_disabled(): 0, pid: 6361, name: trinity-main
  2 locks held by trinity-main/6361:
   #0:  (&mm->mmap_sem){++++++}, at: [<ffffffff810aa314>] __do_page_fault+0x1e4/0x4f0
   #1:  (&(&mm->page_table_lock)->rlock){+.+...}, at: [<ffffffff8122f017>] handle_pte_fault+0x3f7/0x6a0
  Pid: 6361, comm: trinity-main Tainted: G        W
  3.7.0-rc2-next-20121024-sasha-00001-gd95ef01-dirty #74
  Call Trace:
    __might_sleep+0x1c3/0x1e0
    mutex_lock_nested+0x29/0x50
    mpol_shared_policy_lookup+0x2e/0x90
    shmem_get_policy+0x2e/0x30
    get_vma_policy+0x5a/0xa0
    mpol_misplaced+0x41/0x1d0
    handle_pte_fault+0x465/0x6a0

This was triggered by a different version of automatic NUMA balancing
but in theory the current version is vulnerable to the same problem.

do_numa_page
  -> numa_migrate_prep
    -> mpol_misplaced
      -> get_vma_policy
        -> shmem_get_policy

It's very unlikely this will happen as shared pages are not marked
pte_numa -- see the page_mapcount() check in change_pte_range() -- but
it is possible.

To address this, this patch restores sp->lock as originally implemented
by Kosaki Motohiro.  In the path where get_vma_policy() is called, it
should not be calling sp_alloc() so it is not necessary to treat the PTL
specially.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 49 additions and 21 deletions Side-by-side Diff

include/linux/mempolicy.h
... ... @@ -123,7 +123,7 @@
123 123  
124 124 struct shared_policy {
125 125 struct rb_root root;
126   - struct mutex mutex;
  126 + spinlock_t lock;
127 127 };
128 128  
129 129 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
... ... @@ -2132,7 +2132,7 @@
2132 2132 */
2133 2133  
2134 2134 /* lookup first element intersecting start-end */
2135   -/* Caller holds sp->mutex */
  2135 +/* Caller holds sp->lock */
2136 2136 static struct sp_node *
2137 2137 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2138 2138 {
2139 2139  
... ... @@ -2196,13 +2196,13 @@
2196 2196  
2197 2197 if (!sp->root.rb_node)
2198 2198 return NULL;
2199   - mutex_lock(&sp->mutex);
  2199 + spin_lock(&sp->lock);
2200 2200 sn = sp_lookup(sp, idx, idx+1);
2201 2201 if (sn) {
2202 2202 mpol_get(sn->policy);
2203 2203 pol = sn->policy;
2204 2204 }
2205   - mutex_unlock(&sp->mutex);
  2205 + spin_unlock(&sp->lock);
2206 2206 return pol;
2207 2207 }
2208 2208  
... ... @@ -2328,6 +2328,14 @@
2328 2328 sp_free(n);
2329 2329 }
2330 2330  
  2331 +static void sp_node_init(struct sp_node *node, unsigned long start,
  2332 + unsigned long end, struct mempolicy *pol)
  2333 +{
  2334 + node->start = start;
  2335 + node->end = end;
  2336 + node->policy = pol;
  2337 +}
  2338 +
2331 2339 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2332 2340 struct mempolicy *pol)
2333 2341 {
2334 2342  
... ... @@ -2344,11 +2352,8 @@
2344 2352 return NULL;
2345 2353 }
2346 2354 newpol->flags |= MPOL_F_SHARED;
  2355 + sp_node_init(n, start, end, newpol);
2347 2356  
2348   - n->start = start;
2349   - n->end = end;
2350   - n->policy = newpol;
2351   -
2352 2357 return n;
2353 2358 }
2354 2359  
2355 2360  
... ... @@ -2357,9 +2362,12 @@
2357 2362 unsigned long end, struct sp_node *new)
2358 2363 {
2359 2364 struct sp_node *n;
  2365 + struct sp_node *n_new = NULL;
  2366 + struct mempolicy *mpol_new = NULL;
2360 2367 int ret = 0;
2361 2368  
2362   - mutex_lock(&sp->mutex);
  2369 +restart:
  2370 + spin_lock(&sp->lock);
2363 2371 n = sp_lookup(sp, start, end);
2364 2372 /* Take care of old policies in the same range. */
2365 2373 while (n && n->start < end) {
2366 2374  
... ... @@ -2372,14 +2380,16 @@
2372 2380 } else {
2373 2381 /* Old policy spanning whole new range. */
2374 2382 if (n->end > end) {
2375   - struct sp_node *new2;
2376   - new2 = sp_alloc(end, n->end, n->policy);
2377   - if (!new2) {
2378   - ret = -ENOMEM;
2379   - goto out;
2380   - }
  2383 + if (!n_new)
  2384 + goto alloc_new;
  2385 +
  2386 + *mpol_new = *n->policy;
  2387 + atomic_set(&mpol_new->refcnt, 1);
  2388 + sp_node_init(n_new, end, n->end, mpol_new);
  2389 + sp_insert(sp, n_new);
2381 2390 n->end = start;
2382   - sp_insert(sp, new2);
  2391 + n_new = NULL;
  2392 + mpol_new = NULL;
2383 2393 break;
2384 2394 } else
2385 2395 n->end = start;
2386 2396  
... ... @@ -2390,9 +2400,27 @@
2390 2400 }
2391 2401 if (new)
2392 2402 sp_insert(sp, new);
2393   -out:
2394   - mutex_unlock(&sp->mutex);
  2403 + spin_unlock(&sp->lock);
  2404 + ret = 0;
  2405 +
  2406 +err_out:
  2407 + if (mpol_new)
  2408 + mpol_put(mpol_new);
  2409 + if (n_new)
  2410 + kmem_cache_free(sn_cache, n_new);
  2411 +
2395 2412 return ret;
  2413 +
  2414 +alloc_new:
  2415 + spin_unlock(&sp->lock);
  2416 + ret = -ENOMEM;
  2417 + n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
  2418 + if (!n_new)
  2419 + goto err_out;
  2420 + mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
  2421 + if (!mpol_new)
  2422 + goto err_out;
  2423 + goto restart;
2396 2424 }
2397 2425  
2398 2426 /**
... ... @@ -2410,7 +2438,7 @@
2410 2438 int ret;
2411 2439  
2412 2440 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2413   - mutex_init(&sp->mutex);
  2441 + spin_lock_init(&sp->lock);
2414 2442  
2415 2443 if (mpol) {
2416 2444 struct vm_area_struct pvma;
2417 2445  
... ... @@ -2476,14 +2504,14 @@
2476 2504  
2477 2505 if (!p->root.rb_node)
2478 2506 return;
2479   - mutex_lock(&p->mutex);
  2507 + spin_lock(&p->lock);
2480 2508 next = rb_first(&p->root);
2481 2509 while (next) {
2482 2510 n = rb_entry(next, struct sp_node, nd);
2483 2511 next = rb_next(&n->nd);
2484 2512 sp_delete(p, n);
2485 2513 }
2486   - mutex_unlock(&p->mutex);
  2514 + spin_unlock(&p->lock);
2487 2515 }
2488 2516  
2489 2517 #ifdef CONFIG_NUMA_BALANCING