Commit 832f37f5d5f5c7281880c21eb09508750b67f540

Authored by Vladimir Davydov
Committed by Linus Torvalds
1 parent 2788cf0c40

slub: never fail to shrink cache

SLUB's version of __kmem_cache_shrink() not only removes empty slabs, but
also tries to rearrange the partial lists, placing the most nearly full
slabs at the head to cope with fragmentation.  To achieve that, it
allocates a temporary array of list heads used to sort slabs by the
number of objects in use.  If that allocation fails, the whole procedure
is aborted.

This is unacceptable for the kernel memory accounting extension of the
memory cgroup, where we want to make sure that kmem_cache_shrink()
successfully discards empty slabs.  Although the allocation failure is
utterly unlikely with the current page allocator implementation, which
retries GFP_KERNEL allocations of order <= 2 infinitely, it is better not
to rely on that.

This patch therefore makes __kmem_cache_shrink() allocate the array on
the stack instead of calling kmalloc, which may fail.  The array size is
set to 32, because most SLUB caches store no more than 32 objects per
slab page.  Slab pages with <= 32 free objects are sorted by the number
of objects in use with the help of the array and promoted to the head of
the partial list, while slab pages with > 32 free objects are left at
the end of the list with no ordering imposed on them.
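
As a rough illustration of that bucketing rule, here is a small
userspace sketch (not kernel code; struct fake_slab and the sample
counts are made up for this example): an empty slab is discarded, a
slab with at most 32 free objects goes into bucket (free - 1), and
anything with more free objects stays unsorted at the tail.

    #include <stdio.h>

    #define SHRINK_PROMOTE_MAX 32

    /* Illustrative stand-in for a partial slab page. */
    struct fake_slab { const char *name; int objects; int inuse; };

    int main(void)
    {
            struct fake_slab partial[] = {
                    { "A", 16, 15 },  /* 1 free  -> bucket 0, ends up at the head  */
                    { "B", 16,  0 },  /* 16 free -> completely empty, discarded    */
                    { "C", 16,  8 },  /* 8 free  -> bucket 7                       */
                    { "D", 64,  1 },  /* 63 free -> more than 32, left at the tail */
            };
            int i;

            for (i = 0; i < (int)(sizeof(partial) / sizeof(partial[0])); i++) {
                    int free = partial[i].objects - partial[i].inuse;

                    if (free == partial[i].objects)
                            printf("%s: discard (empty)\n", partial[i].name);
                    else if (free <= SHRINK_PROMOTE_MAX)
                            printf("%s: promote via bucket %d\n",
                                   partial[i].name, free - 1);
                    else
                            printf("%s: left at the tail, unsorted\n",
                                   partial[i].name);
            }
            return 0;
    }

The buckets are spliced back onto the partial list from bucket 31 down
to bucket 0, so the slabs with the fewest free objects end up closest to
the head, which is exactly the "filled up most first" order described
above.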

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 31 additions and 27 deletions

... ... @@ -3358,11 +3358,12 @@
3358 3358 }
3359 3359 EXPORT_SYMBOL(kfree);
3360 3360  
  3361 +#define SHRINK_PROMOTE_MAX 32
  3362 +
3361 3363 /*
3362   - * kmem_cache_shrink removes empty slabs from the partial lists and sorts
3363   - * the remaining slabs by the number of items in use. The slabs with the
3364   - * most items in use come first. New allocations will then fill those up
3365   - * and thus they can be removed from the partial lists.
  3364 + * kmem_cache_shrink discards empty slabs and promotes the slabs filled
  3365 + * up most to the head of the partial lists. New allocations will then
  3366 + * fill those up and thus they can be removed from the partial lists.
3366 3367 *
3367 3368 * The slabs with the least items are placed last. This results in them
3368 3369 * being allocated from last increasing the chance that the last objects
... ... @@ -3375,51 +3376,57 @@
3375 3376 struct kmem_cache_node *n;
3376 3377 struct page *page;
3377 3378 struct page *t;
3378   - int objects = oo_objects(s->max);
3379   - struct list_head *slabs_by_inuse =
3380   - kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
  3379 + struct list_head discard;
  3380 + struct list_head promote[SHRINK_PROMOTE_MAX];
3381 3381 unsigned long flags;
3382 3382  
3383   - if (!slabs_by_inuse)
3384   - return -ENOMEM;
3385   -
3386 3383 flush_all(s);
3387 3384 for_each_kmem_cache_node(s, node, n) {
3388 3385 if (!n->nr_partial)
3389 3386 continue;
3390 3387  
3391   - for (i = 0; i < objects; i++)
3392   - INIT_LIST_HEAD(slabs_by_inuse + i);
  3388 + INIT_LIST_HEAD(&discard);
  3389 + for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
  3390 + INIT_LIST_HEAD(promote + i);
3393 3391  
3394 3392 spin_lock_irqsave(&n->list_lock, flags);
3395 3393  
3396 3394 /*
3397   - * Build lists indexed by the items in use in each slab.
  3395 + * Build lists of slabs to discard or promote.
3398 3396 *
3399 3397 * Note that concurrent frees may occur while we hold the
3400 3398 * list_lock. page->inuse here is the upper limit.
3401 3399 */
3402 3400 list_for_each_entry_safe(page, t, &n->partial, lru) {
3403   - list_move(&page->lru, slabs_by_inuse + page->inuse);
3404   - if (!page->inuse)
  3401 + int free = page->objects - page->inuse;
  3402 +
  3403 + /* Do not reread page->inuse */
  3404 + barrier();
  3405 +
  3406 + /* We do not keep full slabs on the list */
  3407 + BUG_ON(free <= 0);
  3408 +
  3409 + if (free == page->objects) {
  3410 + list_move(&page->lru, &discard);
3405 3411 n->nr_partial--;
  3412 + } else if (free <= SHRINK_PROMOTE_MAX)
  3413 + list_move(&page->lru, promote + free - 1);
3406 3414 }
3407 3415  
3408 3416 /*
3409   - * Rebuild the partial list with the slabs filled up most
3410   - * first and the least used slabs at the end.
  3417 + * Promote the slabs filled up most to the head of the
  3418 + * partial list.
3411 3419 */
3412   - for (i = objects - 1; i > 0; i--)
3413   - list_splice(slabs_by_inuse + i, n->partial.prev);
  3420 + for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
  3421 + list_splice(promote + i, &n->partial);
3414 3422  
3415 3423 spin_unlock_irqrestore(&n->list_lock, flags);
3416 3424  
3417 3425 /* Release empty slabs */
3418   - list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
  3426 + list_for_each_entry_safe(page, t, &discard, lru)
3419 3427 discard_slab(s, page);
3420 3428 }
3421 3429  
3422   - kfree(slabs_by_inuse);
3423 3430 return 0;
3424 3431 }
3425 3432  
... ... @@ -4686,12 +4693,9 @@
4686 4693 static ssize_t shrink_store(struct kmem_cache *s,
4687 4694 const char *buf, size_t length)
4688 4695 {
4689   - if (buf[0] == '1') {
4690   - int rc = kmem_cache_shrink(s);
4691   -
4692   - if (rc)
4693   - return rc;
4694   - } else
  4696 + if (buf[0] == '1')
  4697 + kmem_cache_shrink(s);
  4698 + else
4695 4699 return -EINVAL;
4696 4700 return length;
4697 4701 }