Commit 832f37f5d5f5c7281880c21eb09508750b67f540

Authored by Vladimir Davydov
Committed by Linus Torvalds
1 parent 2788cf0c40

slub: never fail to shrink cache

SLUB's version of __kmem_cache_shrink() not only removes empty slabs, but
also tries to rearrange the partial lists, placing the most nearly full
slabs at the head to cope with fragmentation.  To achieve that, it
allocates a temporary array of list heads used to sort slabs by the
number of objects in use.  If that allocation fails, the whole procedure
is aborted.

This is unacceptable for the kernel memory accounting extension of the
memory cgroup, where we want to make sure that kmem_cache_shrink()
successfully discards empty slabs.  Although the allocation failure is
utterly unlikely with the current page allocator implementation, which
retries GFP_KERNEL allocations of order <= 2 infinitely, it is better not
to rely on that.

This patch therefore makes __kmem_cache_shrink() allocate the array on
the stack instead of calling kmalloc, which may fail.  The array size is
set to 32, because most SLUB caches store no more than 32 objects per
slab page.  Slab pages with <= 32 free objects are sorted by the number
of objects in use with the help of the array and promoted to the head of
the partial list, while slab pages with > 32 free objects are left at
the end of the list with no ordering imposed on them.
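
As a rough illustration of that bucketing rule, here is a small
userspace sketch (not kernel code; struct fake_slab and the sample
counts are made up for this example): an empty slab is discarded, a
slab with at most 32 free objects goes into bucket (free - 1), and
anything with more free objects stays unsorted at the tail.

    #include <stdio.h>

    #define SHRINK_PROMOTE_MAX 32

    /* Illustrative stand-in for a partial slab page. */
    struct fake_slab { const char *name; int objects; int inuse; };

    int main(void)
    {
            struct fake_slab partial[] = {
                    { "A", 16, 15 },  /* 1 free  -> bucket 0, ends up at the head  */
                    { "B", 16,  0 },  /* 16 free -> completely empty, discarded    */
                    { "C", 16,  8 },  /* 8 free  -> bucket 7                       */
                    { "D", 64,  1 },  /* 63 free -> more than 32, left at the tail */
            };
            int i;

            for (i = 0; i < (int)(sizeof(partial) / sizeof(partial[0])); i++) {
                    int free = partial[i].objects - partial[i].inuse;

                    if (free == partial[i].objects)
                            printf("%s: discard (empty)\n", partial[i].name);
                    else if (free <= SHRINK_PROMOTE_MAX)
                            printf("%s: promote via bucket %d\n",
                                   partial[i].name, free - 1);
                    else
                            printf("%s: left at the tail, unsorted\n",
                                   partial[i].name);
            }
            return 0;
    }

The buckets are spliced back onto the partial list from bucket 31 down
to bucket 0, so the slabs with the fewest free objects end up closest to
the head, which is exactly the "filled up most first" order described
above.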

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 31 additions and 27 deletions

... ... @@ -3358,11 +3358,12 @@
3358 3358 }
3359 3359 EXPORT_SYMBOL(kfree);
3360 3360  
  3361 +#define SHRINK_PROMOTE_MAX 32
  3362 +
3361 3363 /*
3362   - * kmem_cache_shrink removes empty slabs from the partial lists and sorts
3363   - * the remaining slabs by the number of items in use. The slabs with the
3364   - * most items in use come first. New allocations will then fill those up
3365   - * and thus they can be removed from the partial lists.
  3364 + * kmem_cache_shrink discards empty slabs and promotes the slabs filled
  3365 + * up most to the head of the partial lists. New allocations will then
  3366 + * fill those up and thus they can be removed from the partial lists.
3366 3367 *
3367 3368 * The slabs with the least items are placed last. This results in them
3368 3369 * being allocated from last increasing the chance that the last objects
... ... @@ -3375,51 +3376,57 @@
3375 3376 struct kmem_cache_node *n;
3376 3377 struct page *page;
3377 3378 struct page *t;
3378   - int objects = oo_objects(s->max);
3379   - struct list_head *slabs_by_inuse =
3380   - kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
  3379 + struct list_head discard;
  3380 + struct list_head promote[SHRINK_PROMOTE_MAX];
3381 3381 unsigned long flags;
3382 3382  
3383   - if (!slabs_by_inuse)
3384   - return -ENOMEM;
3385   -
3386 3383 flush_all(s);
3387 3384 for_each_kmem_cache_node(s, node, n) {
3388 3385 if (!n->nr_partial)
3389 3386 continue;
3390 3387  
3391   - for (i = 0; i < objects; i++)
3392   - INIT_LIST_HEAD(slabs_by_inuse + i);
  3388 + INIT_LIST_HEAD(&discard);
  3389 + for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
  3390 + INIT_LIST_HEAD(promote + i);
3393 3391  
3394 3392 spin_lock_irqsave(&n->list_lock, flags);
3395 3393  
3396 3394 /*
3397   - * Build lists indexed by the items in use in each slab.
  3395 + * Build lists of slabs to discard or promote.
3398 3396 *
3399 3397 * Note that concurrent frees may occur while we hold the
3400 3398 * list_lock. page->inuse here is the upper limit.
3401 3399 */
3402 3400 list_for_each_entry_safe(page, t, &n->partial, lru) {
3403   - list_move(&page->lru, slabs_by_inuse + page->inuse);
3404   - if (!page->inuse)
  3401 + int free = page->objects - page->inuse;
  3402 +
  3403 + /* Do not reread page->inuse */
  3404 + barrier();
  3405 +
  3406 + /* We do not keep full slabs on the list */
  3407 + BUG_ON(free <= 0);
  3408 +
  3409 + if (free == page->objects) {
  3410 + list_move(&page->lru, &discard);
3405 3411 n->nr_partial--;
  3412 + } else if (free <= SHRINK_PROMOTE_MAX)
  3413 + list_move(&page->lru, promote + free - 1);
3406 3414 }
3407 3415  
3408 3416 /*
3409   - * Rebuild the partial list with the slabs filled up most
3410   - * first and the least used slabs at the end.
  3417 + * Promote the slabs filled up most to the head of the
  3418 + * partial list.
3411 3419 */
3412   - for (i = objects - 1; i > 0; i--)
3413   - list_splice(slabs_by_inuse + i, n->partial.prev);
  3420 + for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
  3421 + list_splice(promote + i, &n->partial);
3414 3422  
3415 3423 spin_unlock_irqrestore(&n->list_lock, flags);
3416 3424  
3417 3425 /* Release empty slabs */
3418   - list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
  3426 + list_for_each_entry_safe(page, t, &discard, lru)
3419 3427 discard_slab(s, page);
3420 3428 }
3421 3429  
3422   - kfree(slabs_by_inuse);
3423 3430 return 0;
3424 3431 }
3425 3432  
... ... @@ -4686,12 +4693,9 @@
4686 4693 static ssize_t shrink_store(struct kmem_cache *s,
4687 4694 const char *buf, size_t length)
4688 4695 {
4689   - if (buf[0] == '1') {
4690   - int rc = kmem_cache_shrink(s);
4691   -
4692   - if (rc)
4693   - return rc;
4694   - } else
  4696 + if (buf[0] == '1')
  4697 + kmem_cache_shrink(s);
  4698 + else
4695 4699 return -EINVAL;
4696 4700 return length;
4697 4701 }