Commit d35426ed1fdd2c16d41ff4b83cfa2206dad57c07

Authored by Joonsoo Kim
Committed by Greg Kroah-Hartman
1 parent 347f31e9ad

slab: fix oops when reading /proc/slab_allocators

commit 03787301420376ae41fbaf4267f4a6253d152ac5 upstream.

Commit b1cb0982bdd6 ("change the management method of free objects of
the slab") introduced a bug in the slab leak detector
('/proc/slab_allocators').  The detector works as follows:

 1. Traverse all objects on all slabs.
 2. Determine whether each object is active or not.
 3. If active, print who allocated the object.

However, that commit changed how free objects are managed, so the
logic that decides whether an object is active also had to change.
Previously, objects sitting in the cpu caches were regarded as
inactive; with that commit, they are mistakenly regarded as active.
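
A simplified sketch of that check, condensed from the handle_slab()
hunk removed at the end of the diff below (the helper name is
illustrative, not from the patch): an object is presumed active unless
its index appears in the free portion of the slab's freelist, and free
objects parked in a per-cpu array_cache never appear there.

  /* Illustrative condensation of the removed handle_slab() loop. */
  static bool obj_presumed_active(struct kmem_cache *c, struct page *page,
                                  unsigned int objnr)
  {
          unsigned int j;

          /* Scan the free portion of the slab freelist for this index. */
          for (j = page->active; j < c->num; j++)
                  if (slab_freelist(page)[j] == objnr)
                          return false;   /* on the freelist: really free */

          /*
           * Not on the freelist: treated as active, even when the object
           * actually sits free in a per-cpu array_cache.
           */
          return true;
  }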

This introduces a kernel oops when DEBUG_PAGEALLOC is enabled.  With
DEBUG_PAGEALLOC, kernel_map_pages() is used to detect corruption of
free slab memory: the page table mapping is removed when an object is
freed and restored when the object becomes active.  When the slab leak
detector inspects an object held in the cpu caches, it mistakenly
considers the object active and tries to access the object's memory to
retrieve the caller of the allocation.  At that point no page table
mapping exists for the object, so an oops occurs.
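
A condensed sketch of that unmap/remap behaviour (the wrapper names
are illustrative; in mm/slab.c the kernel_map_pages() calls sit inline
in the free/alloc debug-check paths for page-sized objects, next to
the poisoning code):

  /* Illustrative wrappers around what mm/slab.c does inline. */
  static void debug_unmap_freed_obj(struct kmem_cache *cachep, void *objp)
  {
          /*
           * Object freed: drop its page table mapping so any later access
           * (e.g. the leak detector reading a "free" object) faults.
           */
          kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, 0);
  }

  static void debug_map_active_obj(struct kmem_cache *cachep, void *objp)
  {
          /* Object allocated again: restore the mapping. */
          kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, 1);
  }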

The following is the oops message reported by Dave.

It blew up when something tried to read /proc/slab_allocators
(Just cat it, and you should see the oops below)

  Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
  Modules linked in:
  [snip...]
  CPU: 1 PID: 9386 Comm: trinity-c33 Not tainted 3.14.0-rc5+ #131
  task: ffff8801aa46e890 ti: ffff880076924000 task.ti: ffff880076924000
  RIP: 0010:[<ffffffffaa1a8f4a>]  [<ffffffffaa1a8f4a>] handle_slab+0x8a/0x180
  RSP: 0018:ffff880076925de0  EFLAGS: 00010002
  RAX: 0000000000001000 RBX: 0000000000000000 RCX: 000000005ce85ce7
  RDX: ffffea00079be100 RSI: 0000000000001000 RDI: ffff880107458000
  RBP: ffff880076925e18 R08: 0000000000000001 R09: 0000000000000000
  R10: 0000000000000000 R11: 000000000000000f R12: ffff8801e6f84000
  R13: ffffea00079be100 R14: ffff880107458000 R15: ffff88022bb8d2c0
  FS:  00007fb769e45740(0000) GS:ffff88024d040000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: ffff8801e6f84ff8 CR3: 00000000a22db000 CR4: 00000000001407e0
  DR0: 0000000002695000 DR1: 0000000002695000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000070602
  Call Trace:
    leaks_show+0xce/0x240
    seq_read+0x28e/0x490
    proc_reg_read+0x3d/0x80
    vfs_read+0x9b/0x160
    SyS_read+0x58/0xb0
    tracesys+0xd4/0xd9
  Code: f5 00 00 00 0f 1f 44 00 00 48 63 c8 44 3b 0c 8a 0f 84 e3 00 00 00 83 c0 01 44 39 c0 72 eb 41 f6 47 1a 01 0f 84 e9 00 00 00 89 f0 <4d> 8b 4c 04 f8 4d 85 c9 0f 84 88 00 00 00 49 8b 7e 08 4d 8d 46
  RIP   handle_slab+0x8a/0x180

To fix the problem, I introduce an object status buffer on each slab.
With this, object status can be tracked precisely, so the slab leak
detector no longer touches free (unmapped) objects and no kernel oops
occurs.  The memory overhead of this fix is only imposed on
CONFIG_DEBUG_SLAB_LEAK, which is mainly used for debugging, so it is
not a big problem.
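
Concretely, the status buffer is one byte per object appended to the
existing freelist index array; the following just restates the
set_obj_status()/get_obj_status() helpers added in the diff below:

  /*
   * With CONFIG_DEBUG_SLAB_LEAK, the management area at page->freelist
   * grows by one status byte per object:
   *
   *   +------------------------------+------------------------------+
   *   | num * sizeof(unsigned int)   | num * sizeof(char)           |
   *   | free-object index array      | OBJECT_FREE / OBJECT_ACTIVE  |
   *   +------------------------------+------------------------------+
   *
   * so the status of object idx is found at:
   */
  char *status = (char *)page->freelist + cachep->num * sizeof(unsigned int);

  status[idx] = OBJECT_ACTIVE;    /* set on allocation, OBJECT_FREE on free */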

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Reported-by: Dave Jones <davej@redhat.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Showing 1 changed file with 68 additions and 21 deletions

@@ -375,6 +375,39 @@
 
 #endif
 
+#define OBJECT_FREE (0)
+#define OBJECT_ACTIVE (1)
+
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+
+static void set_obj_status(struct page *page, int idx, int val)
+{
+        int freelist_size;
+        char *status;
+        struct kmem_cache *cachep = page->slab_cache;
+
+        freelist_size = cachep->num * sizeof(unsigned int);
+        status = (char *)page->freelist + freelist_size;
+        status[idx] = val;
+}
+
+static inline unsigned int get_obj_status(struct page *page, int idx)
+{
+        int freelist_size;
+        char *status;
+        struct kmem_cache *cachep = page->slab_cache;
+
+        freelist_size = cachep->num * sizeof(unsigned int);
+        status = (char *)page->freelist + freelist_size;
+
+        return status[idx];
+}
+
+#else
+static inline void set_obj_status(struct page *page, int idx, int val) {}
+
+#endif
+
 /*
  * Do not go above this order unless 0 objects fit into the slab or
  * overridden on the command line.
@@ -565,9 +598,18 @@
         return cachep->array[smp_processor_id()];
 }
 
-static size_t slab_mgmt_size(size_t nr_objs, size_t align)
+static size_t calculate_freelist_size(int nr_objs, size_t align)
 {
-        return ALIGN(nr_objs * sizeof(unsigned int), align);
+        size_t freelist_size;
+
+        freelist_size = nr_objs * sizeof(unsigned int);
+        if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+                freelist_size += nr_objs * sizeof(char);
+
+        if (align)
+                freelist_size = ALIGN(freelist_size, align);
+
+        return freelist_size;
 }
 
 /*
@@ -600,6 +642,10 @@
                 nr_objs = slab_size / buffer_size;
 
         } else {
+                int extra_space = 0;
+
+                if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+                        extra_space = sizeof(char);
                 /*
                  * Ignore padding for the initial guess. The padding
                  * is at most @align-1 bytes, and @buffer_size is at
@@ -608,17 +654,18 @@
                  * into the memory allocation when taking the padding
                  * into account.
                  */
-                nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int));
+                nr_objs = (slab_size) /
+                        (buffer_size + sizeof(unsigned int) + extra_space);
 
                 /*
                  * This calculated number will be either the right
                  * amount, or one greater than what we want.
                  */
-                if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
-                        > slab_size)
+                if (calculate_freelist_size(nr_objs, align) >
+                        slab_size - nr_objs * buffer_size)
                         nr_objs--;
 
-                mgmt_size = slab_mgmt_size(nr_objs, align);
+                mgmt_size = calculate_freelist_size(nr_objs, align);
         }
         *num = nr_objs;
         *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
@@ -2011,13 +2058,16 @@
                         continue;
 
                 if (flags & CFLGS_OFF_SLAB) {
+                        size_t freelist_size_per_obj = sizeof(unsigned int);
                         /*
                          * Max number of objs-per-slab for caches which
                          * use off-slab slabs. Needed to avoid a possible
                          * looping condition in cache_grow().
                          */
+                        if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+                                freelist_size_per_obj += sizeof(char);
                         offslab_limit = size;
-                        offslab_limit /= sizeof(unsigned int);
+                        offslab_limit /= freelist_size_per_obj;
 
                         if (num > offslab_limit)
                                 break;
@@ -2258,8 +2308,7 @@
         if (!cachep->num)
                 return -E2BIG;
 
-        freelist_size =
-                ALIGN(cachep->num * sizeof(unsigned int), cachep->align);
+        freelist_size = calculate_freelist_size(cachep->num, cachep->align);
 
         /*
          * If the slab has been placed off-slab, and we have enough space then
@@ -2272,7 +2321,7 @@
 
         if (flags & CFLGS_OFF_SLAB) {
                 /* really off slab. No need for manual alignment */
-                freelist_size = cachep->num * sizeof(unsigned int);
+                freelist_size = calculate_freelist_size(cachep->num, 0);
 
 #ifdef CONFIG_PAGE_POISONING
                 /* If we're going to use the generic kernel_map_pages()
@@ -2589,6 +2638,7 @@
                 if (cachep->ctor)
                         cachep->ctor(objp);
 #endif
+                set_obj_status(page, i, OBJECT_FREE);
                 slab_freelist(page)[i] = i;
         }
 }
@@ -2797,6 +2847,7 @@
         BUG_ON(objnr >= cachep->num);
         BUG_ON(objp != index_to_obj(cachep, page, objnr));
 
+        set_obj_status(page, objnr, OBJECT_FREE);
         if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
                 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2930,6 +2981,8 @@
 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
                                 gfp_t flags, void *objp, unsigned long caller)
 {
+        struct page *page;
+
         if (!objp)
                 return objp;
         if (cachep->flags & SLAB_POISON) {
@@ -2960,6 +3013,9 @@
                 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
                 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
         }
+
+        page = virt_to_head_page(objp);
+        set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
         objp += obj_offset(cachep);
         if (cachep->ctor && cachep->flags & SLAB_POISON)
                 cachep->ctor(objp);
@@ -4201,21 +4257,12 @@
                                                 struct page *page)
 {
         void *p;
-        int i, j;
+        int i;
 
         if (n[0] == n[1])
                 return;
         for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
-                bool active = true;
-
-                for (j = page->active; j < c->num; j++) {
-                        /* Skip freed item */
-                        if (slab_freelist(page)[j] == i) {
-                                active = false;
-                                break;
-                        }
-                }
-                if (!active)
+                if (get_obj_status(page, i) != OBJECT_ACTIVE)
                         continue;
 
                 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))