Commit d35426ed1fdd2c16d41ff4b83cfa2206dad57c07

Authored by Joonsoo Kim
Committed by Greg Kroah-Hartman
1 parent 347f31e9ad

slab: fix oops when reading /proc/slab_allocators

commit 03787301420376ae41fbaf4267f4a6253d152ac5 upstream.

Commit b1cb0982bdd6 ("change the management method of free objects of
the slab") introduced a bug in the slab leak detector
('/proc/slab_allocators').  The detector works as follows:

 1. Traverse all objects on all slabs.
 2. Determine whether each object is active or not.
 3. If active, print who allocated the object.

However, that commit changed how free objects are managed, so the
logic that decides whether an object is active also had to change.
Previously, objects sitting in the cpu caches were regarded as
inactive; with that commit, they are mistakenly regarded as active.
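
A simplified sketch of that check, condensed from the handle_slab()
hunk removed at the end of the diff below (the helper name is
illustrative, not from the patch): an object is presumed active unless
its index appears in the free portion of the slab's freelist, and free
objects parked in a per-cpu array_cache never appear there.

  /* Illustrative condensation of the removed handle_slab() loop. */
  static bool obj_presumed_active(struct kmem_cache *c, struct page *page,
                                  unsigned int objnr)
  {
          unsigned int j;

          /* Scan the free portion of the slab freelist for this index. */
          for (j = page->active; j < c->num; j++)
                  if (slab_freelist(page)[j] == objnr)
                          return false;   /* on the freelist: really free */

          /*
           * Not on the freelist: treated as active, even when the object
           * actually sits free in a per-cpu array_cache.
           */
          return true;
  }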

This introduces a kernel oops when DEBUG_PAGEALLOC is enabled.  With
DEBUG_PAGEALLOC, kernel_map_pages() is used to detect corruption of
free slab memory: the page table mapping is removed when an object is
freed and restored when the object becomes active.  When the slab leak
detector inspects an object held in the cpu caches, it mistakenly
considers the object active and tries to access the object's memory to
retrieve the caller of the allocation.  At that point no page table
mapping exists for the object, so an oops occurs.
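
A condensed sketch of that unmap/remap behaviour (the wrapper names
are illustrative; in mm/slab.c the kernel_map_pages() calls sit inline
in the free/alloc debug-check paths for page-sized objects, next to
the poisoning code):

  /* Illustrative wrappers around what mm/slab.c does inline. */
  static void debug_unmap_freed_obj(struct kmem_cache *cachep, void *objp)
  {
          /*
           * Object freed: drop its page table mapping so any later access
           * (e.g. the leak detector reading a "free" object) faults.
           */
          kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, 0);
  }

  static void debug_map_active_obj(struct kmem_cache *cachep, void *objp)
  {
          /* Object allocated again: restore the mapping. */
          kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, 1);
  }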

The following is the oops message reported by Dave.

It blew up when something tried to read /proc/slab_allocators
(Just cat it, and you should see the oops below)

  Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
  Modules linked in:
  [snip...]
  CPU: 1 PID: 9386 Comm: trinity-c33 Not tainted 3.14.0-rc5+ #131
  task: ffff8801aa46e890 ti: ffff880076924000 task.ti: ffff880076924000
  RIP: 0010:[<ffffffffaa1a8f4a>]  [<ffffffffaa1a8f4a>] handle_slab+0x8a/0x180
  RSP: 0018:ffff880076925de0  EFLAGS: 00010002
  RAX: 0000000000001000 RBX: 0000000000000000 RCX: 000000005ce85ce7
  RDX: ffffea00079be100 RSI: 0000000000001000 RDI: ffff880107458000
  RBP: ffff880076925e18 R08: 0000000000000001 R09: 0000000000000000
  R10: 0000000000000000 R11: 000000000000000f R12: ffff8801e6f84000
  R13: ffffea00079be100 R14: ffff880107458000 R15: ffff88022bb8d2c0
  FS:  00007fb769e45740(0000) GS:ffff88024d040000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: ffff8801e6f84ff8 CR3: 00000000a22db000 CR4: 00000000001407e0
  DR0: 0000000002695000 DR1: 0000000002695000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000070602
  Call Trace:
    leaks_show+0xce/0x240
    seq_read+0x28e/0x490
    proc_reg_read+0x3d/0x80
    vfs_read+0x9b/0x160
    SyS_read+0x58/0xb0
    tracesys+0xd4/0xd9
  Code: f5 00 00 00 0f 1f 44 00 00 48 63 c8 44 3b 0c 8a 0f 84 e3 00 00 00 83 c0 01 44 39 c0 72 eb 41 f6 47 1a 01 0f 84 e9 00 00 00 89 f0 <4d> 8b 4c 04 f8 4d 85 c9 0f 84 88 00 00 00 49 8b 7e 08 4d 8d 46
  RIP   handle_slab+0x8a/0x180

To fix the problem, I introduce an object status buffer on each slab.
With this, object status can be tracked precisely, so the slab leak
detector no longer touches free (unmapped) objects and no kernel oops
occurs.  The memory overhead of this fix is only imposed on
CONFIG_DEBUG_SLAB_LEAK, which is mainly used for debugging, so it is
not a big problem.
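
Concretely, the status buffer is one byte per object appended to the
existing freelist index array; the following just restates the
set_obj_status()/get_obj_status() helpers added in the diff below:

  /*
   * With CONFIG_DEBUG_SLAB_LEAK, the management area at page->freelist
   * grows by one status byte per object:
   *
   *   +------------------------------+------------------------------+
   *   | num * sizeof(unsigned int)   | num * sizeof(char)           |
   *   | free-object index array      | OBJECT_FREE / OBJECT_ACTIVE  |
   *   +------------------------------+------------------------------+
   *
   * so the status of object idx is found at:
   */
  char *status = (char *)page->freelist + cachep->num * sizeof(unsigned int);

  status[idx] = OBJECT_ACTIVE;    /* set on allocation, OBJECT_FREE on free */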

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Reported-by: Dave Jones <davej@redhat.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Showing 1 changed file with 68 additions and 21 deletions

@@ -375,6 +375,39 @@
 
 #endif
 
+#define OBJECT_FREE (0)
+#define OBJECT_ACTIVE (1)
+
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+
+static void set_obj_status(struct page *page, int idx, int val)
+{
+        int freelist_size;
+        char *status;
+        struct kmem_cache *cachep = page->slab_cache;
+
+        freelist_size = cachep->num * sizeof(unsigned int);
+        status = (char *)page->freelist + freelist_size;
+        status[idx] = val;
+}
+
+static inline unsigned int get_obj_status(struct page *page, int idx)
+{
+        int freelist_size;
+        char *status;
+        struct kmem_cache *cachep = page->slab_cache;
+
+        freelist_size = cachep->num * sizeof(unsigned int);
+        status = (char *)page->freelist + freelist_size;
+
+        return status[idx];
+}
+
+#else
+static inline void set_obj_status(struct page *page, int idx, int val) {}
+
+#endif
+
 /*
  * Do not go above this order unless 0 objects fit into the slab or
  * overridden on the command line.
@@ -565,9 +598,18 @@
         return cachep->array[smp_processor_id()];
 }
 
-static size_t slab_mgmt_size(size_t nr_objs, size_t align)
+static size_t calculate_freelist_size(int nr_objs, size_t align)
 {
-        return ALIGN(nr_objs * sizeof(unsigned int), align);
+        size_t freelist_size;
+
+        freelist_size = nr_objs * sizeof(unsigned int);
+        if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+                freelist_size += nr_objs * sizeof(char);
+
+        if (align)
+                freelist_size = ALIGN(freelist_size, align);
+
+        return freelist_size;
 }
 
 /*
@@ -600,6 +642,10 @@
                 nr_objs = slab_size / buffer_size;
 
         } else {
+                int extra_space = 0;
+
+                if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+                        extra_space = sizeof(char);
                 /*
                  * Ignore padding for the initial guess. The padding
                  * is at most @align-1 bytes, and @buffer_size is at
@@ -608,17 +654,18 @@
                  * into the memory allocation when taking the padding
                  * into account.
                  */
-                nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int));
+                nr_objs = (slab_size) /
+                        (buffer_size + sizeof(unsigned int) + extra_space);
 
                 /*
                  * This calculated number will be either the right
                  * amount, or one greater than what we want.
                  */
-                if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
-                        > slab_size)
+                if (calculate_freelist_size(nr_objs, align) >
+                        slab_size - nr_objs * buffer_size)
                         nr_objs--;
 
-                mgmt_size = slab_mgmt_size(nr_objs, align);
+                mgmt_size = calculate_freelist_size(nr_objs, align);
         }
         *num = nr_objs;
         *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
@@ -2011,13 +2058,16 @@
                         continue;
 
                 if (flags & CFLGS_OFF_SLAB) {
+                        size_t freelist_size_per_obj = sizeof(unsigned int);
                         /*
                          * Max number of objs-per-slab for caches which
                          * use off-slab slabs. Needed to avoid a possible
                          * looping condition in cache_grow().
                          */
+                        if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+                                freelist_size_per_obj += sizeof(char);
                         offslab_limit = size;
-                        offslab_limit /= sizeof(unsigned int);
+                        offslab_limit /= freelist_size_per_obj;
 
                         if (num > offslab_limit)
                                 break;
@@ -2258,8 +2308,7 @@
         if (!cachep->num)
                 return -E2BIG;
 
-        freelist_size =
-                ALIGN(cachep->num * sizeof(unsigned int), cachep->align);
+        freelist_size = calculate_freelist_size(cachep->num, cachep->align);
 
         /*
          * If the slab has been placed off-slab, and we have enough space then
@@ -2272,7 +2321,7 @@
 
         if (flags & CFLGS_OFF_SLAB) {
                 /* really off slab. No need for manual alignment */
-                freelist_size = cachep->num * sizeof(unsigned int);
+                freelist_size = calculate_freelist_size(cachep->num, 0);
 
 #ifdef CONFIG_PAGE_POISONING
                 /* If we're going to use the generic kernel_map_pages()
@@ -2589,6 +2638,7 @@
                 if (cachep->ctor)
                         cachep->ctor(objp);
 #endif
+                set_obj_status(page, i, OBJECT_FREE);
                 slab_freelist(page)[i] = i;
         }
 }
@@ -2797,6 +2847,7 @@
         BUG_ON(objnr >= cachep->num);
         BUG_ON(objp != index_to_obj(cachep, page, objnr));
 
+        set_obj_status(page, objnr, OBJECT_FREE);
         if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
                 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2930,6 +2981,8 @@
 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
                                 gfp_t flags, void *objp, unsigned long caller)
 {
+        struct page *page;
+
         if (!objp)
                 return objp;
         if (cachep->flags & SLAB_POISON) {
@@ -2960,6 +3013,9 @@
                 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
                 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
         }
+
+        page = virt_to_head_page(objp);
+        set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
         objp += obj_offset(cachep);
         if (cachep->ctor && cachep->flags & SLAB_POISON)
                 cachep->ctor(objp);
@@ -4201,21 +4257,12 @@
                                                 struct page *page)
 {
         void *p;
-        int i, j;
+        int i;
 
         if (n[0] == n[1])
                 return;
         for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
-                bool active = true;
-
-                for (j = page->active; j < c->num; j++) {
-                        /* Skip freed item */
-                        if (slab_freelist(page)[j] == i) {
-                                active = false;
-                                break;
-                        }
-                }
-                if (!active)
+                if (get_obj_status(page, i) != OBJECT_ACTIVE)
                         continue;
 
                 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))