Commit 03afc0e25f7fc03537014a770f4c54ebbe63a24c

Authored by Vladimir Davydov
Committed by Linus Torvalds
1 parent bfc8c90139

slab: get_online_mems for kmem_cache_{create,destroy,shrink}

When we create a sl[au]b cache, we allocate kmem_cache_node structures
for each online NUMA node.  To handle nodes being taken online/offline,
we register a memory hotplug notifier and, for each kmem cache,
allocate/free the kmem_cache_node corresponding to the node that changes
its state.
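
For slub, the notifier side of this (the slab_mem_going_online_callback
step in the diagram below) does roughly the following.  This is a
simplified sketch to show the shape of the path, not the exact kernel
code:

    /*
     * Simplified sketch of the slub memory hotplug notifier path: for
     * every existing cache, allocate the kmem_cache_node for the node
     * that is coming online.  (kmem_cache_node here is slub's internal
     * cache of struct kmem_cache_node objects.)
     */
    static int slab_mem_going_online_callback(void *arg)
    {
            struct memory_notify *marg = arg;
            int nid = marg->status_change_nid_normal;
            struct kmem_cache *s;
            int ret = 0;

            if (nid < 0)    /* no normal memory is being added */
                    return 0;

            mutex_lock(&slab_mutex);
            list_for_each_entry(s, &slab_caches, list) {
                    struct kmem_cache_node *n;

                    n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
                    if (!n) {
                            ret = -ENOMEM;
                            break;
                    }
                    init_kmem_cache_node(n);
                    s->node[nid] = n;
            }
            mutex_unlock(&slab_mutex);
            return ret;
    }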

To synchronize the two paths we hold the slab_mutex both on the cache
creation/destruction path and while tuning per-node parts of kmem caches
in the memory hotplug handler.  That is not quite right, though, because
it does not guarantee that a newly created cache will have all of its
kmem_cache_nodes initialized if it races with memory hotplug.  For
instance, in the case of slub:

    CPU0                            CPU1
    ----                            ----
    kmem_cache_create:              online_pages:
     __kmem_cache_create:            slab_memory_callback:
                                      slab_mem_going_online_callback:
                                       lock slab_mutex
                                       for each slab_caches list entry
                                           allocate kmem_cache node
                                       unlock slab_mutex
      lock slab_mutex
      init_kmem_cache_nodes:
       for_each_node_state(node, N_NORMAL_MEMORY)
           allocate kmem_cache node
      add kmem_cache to slab_caches list
      unlock slab_mutex
                                    online_pages (continued):
                                     node_states_set_node

As a result, we get a kmem cache whose kmem_cache_nodes are not all
allocated.

To avoid issues like that, we should hold get/put_online_mems() across
the whole kmem cache creation/destruction/shrink paths, just as we
already do for cpu hotplug.  This patch does exactly that.
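
Concretely, after this patch the affected paths follow this nesting (a
minimal sketch of the pattern visible in the diff below, not a verbatim
quote):

    get_online_cpus();
    get_online_mems();
    mutex_lock(&slab_mutex);
    /*
     * Create or destroy the cache here.  The set of nodes with normal
     * memory, and hence the per-node parts of every kmem cache, cannot
     * change until put_online_mems(), so a cache created here cannot
     * miss a kmem_cache_node for a node that comes online concurrently.
     */
    mutex_unlock(&slab_mutex);
    put_online_mems();
    put_online_cpus();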

Note that once this is applied, there is no longer any need to take the
slab_mutex in kmem_cache_shrink (it was held there to protect against
cpu hotplug), so it is removed from that path.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Jiang Liu <liuj97@gmail.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 5 changed files with 39 additions and 31 deletions

... ... @@ -2480,8 +2480,7 @@
2480 2480 return nr_freed;
2481 2481 }
2482 2482  
2483   -/* Called with slab_mutex held to protect against cpu hotplug */
2484   -static int __cache_shrink(struct kmem_cache *cachep)
  2483 +int __kmem_cache_shrink(struct kmem_cache *cachep)
2485 2484 {
2486 2485 int ret = 0, i = 0;
2487 2486 struct kmem_cache_node *n;
2488 2487  
... ... @@ -2502,32 +2501,11 @@
2502 2501 return (ret ? 1 : 0);
2503 2502 }
2504 2503  
2505   -/**
2506   - * kmem_cache_shrink - Shrink a cache.
2507   - * @cachep: The cache to shrink.
2508   - *
2509   - * Releases as many slabs as possible for a cache.
2510   - * To help debugging, a zero exit status indicates all slabs were released.
2511   - */
2512   -int kmem_cache_shrink(struct kmem_cache *cachep)
2513   -{
2514   - int ret;
2515   - BUG_ON(!cachep || in_interrupt());
2516   -
2517   - get_online_cpus();
2518   - mutex_lock(&slab_mutex);
2519   - ret = __cache_shrink(cachep);
2520   - mutex_unlock(&slab_mutex);
2521   - put_online_cpus();
2522   - return ret;
2523   -}
2524   -EXPORT_SYMBOL(kmem_cache_shrink);
2525   -
2526 2504 int __kmem_cache_shutdown(struct kmem_cache *cachep)
2527 2505 {
2528 2506 int i;
2529 2507 struct kmem_cache_node *n;
2530   - int rc = __cache_shrink(cachep);
  2508 + int rc = __kmem_cache_shrink(cachep);
2531 2509  
2532 2510 if (rc)
2533 2511 return rc;
... ... @@ -91,6 +91,7 @@
91 91 #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
92 92  
93 93 int __kmem_cache_shutdown(struct kmem_cache *);
  94 +int __kmem_cache_shrink(struct kmem_cache *);
94 95 void slab_kmem_cache_release(struct kmem_cache *);
95 96  
96 97 struct seq_file;
... ... @@ -205,6 +205,8 @@
205 205 int err;
206 206  
207 207 get_online_cpus();
  208 + get_online_mems();
  209 +
208 210 mutex_lock(&slab_mutex);
209 211  
210 212 err = kmem_cache_sanity_check(name, size);
... ... @@ -239,6 +241,8 @@
239 241  
240 242 out_unlock:
241 243 mutex_unlock(&slab_mutex);
  244 +
  245 + put_online_mems();
242 246 put_online_cpus();
243 247  
244 248 if (err) {
... ... @@ -272,6 +276,8 @@
272 276 char *cache_name;
273 277  
274 278 get_online_cpus();
  279 + get_online_mems();
  280 +
275 281 mutex_lock(&slab_mutex);
276 282  
277 283 /*
... ... @@ -295,6 +301,8 @@
295 301  
296 302 out_unlock:
297 303 mutex_unlock(&slab_mutex);
  304 +
  305 + put_online_mems();
298 306 put_online_cpus();
299 307 }
300 308  
... ... @@ -328,6 +336,8 @@
328 336 void kmem_cache_destroy(struct kmem_cache *s)
329 337 {
330 338 get_online_cpus();
  339 + get_online_mems();
  340 +
331 341 mutex_lock(&slab_mutex);
332 342  
333 343 s->refcount--;
334 344  
335 345  
... ... @@ -359,14 +369,35 @@
359 369 #else
360 370 slab_kmem_cache_release(s);
361 371 #endif
362   - goto out_put_cpus;
  372 + goto out;
363 373  
364 374 out_unlock:
365 375 mutex_unlock(&slab_mutex);
366   -out_put_cpus:
  376 +out:
  377 + put_online_mems();
367 378 put_online_cpus();
368 379 }
369 380 EXPORT_SYMBOL(kmem_cache_destroy);
  381 +
  382 +/**
  383 + * kmem_cache_shrink - Shrink a cache.
  384 + * @cachep: The cache to shrink.
  385 + *
  386 + * Releases as many slabs as possible for a cache.
  387 + * To help debugging, a zero exit status indicates all slabs were released.
  388 + */
  389 +int kmem_cache_shrink(struct kmem_cache *cachep)
  390 +{
  391 + int ret;
  392 +
  393 + get_online_cpus();
  394 + get_online_mems();
  395 + ret = __kmem_cache_shrink(cachep);
  396 + put_online_mems();
  397 + put_online_cpus();
  398 + return ret;
  399 +}
  400 +EXPORT_SYMBOL(kmem_cache_shrink);
370 401  
371 402 int slab_is_available(void)
372 403 {
... ... @@ -620,11 +620,10 @@
620 620 return 0;
621 621 }
622 622  
623   -int kmem_cache_shrink(struct kmem_cache *d)
  623 +int __kmem_cache_shrink(struct kmem_cache *d)
624 624 {
625 625 return 0;
626 626 }
627   -EXPORT_SYMBOL(kmem_cache_shrink);
628 627  
629 628 struct kmem_cache kmem_cache_boot = {
630 629 .name = "kmem_cache",
... ... @@ -3398,7 +3398,7 @@
3398 3398 * being allocated from last increasing the chance that the last objects
3399 3399 * are freed in them.
3400 3400 */
3401   -int kmem_cache_shrink(struct kmem_cache *s)
  3401 +int __kmem_cache_shrink(struct kmem_cache *s)
3402 3402 {
3403 3403 int node;
3404 3404 int i;
... ... @@ -3454,7 +3454,6 @@
3454 3454 kfree(slabs_by_inuse);
3455 3455 return 0;
3456 3456 }
3457   -EXPORT_SYMBOL(kmem_cache_shrink);
3458 3457  
3459 3458 static int slab_mem_going_offline_callback(void *arg)
3460 3459 {
... ... @@ -3462,7 +3461,7 @@
3462 3461  
3463 3462 mutex_lock(&slab_mutex);
3464 3463 list_for_each_entry(s, &slab_caches, list)
3465   - kmem_cache_shrink(s);
  3464 + __kmem_cache_shrink(s);
3466 3465 mutex_unlock(&slab_mutex);
3467 3466  
3468 3467 return 0;