Commit 03787301420376ae41fbaf4267f4a6253d152ac5

Authored by Joonsoo Kim
Committed by Linus Torvalds
1 parent f00cdc6df7

slab: fix oops when reading /proc/slab_allocators

Commit b1cb0982bdd6 ("change the management method of free objects of
the slab") introduced a bug in the slab leak detector
('/proc/slab_allocators').  The detector works as follows:

 1. Traverse all objects on all the slabs.
 2. Determine whether each object is active or not.
 3. If active, print who allocated the object.

But that commit changed how free objects are managed, so the logic that
determines whether an object is active also had to change.  Before, we
regarded an object in a cpu cache as inactive; with that commit, we
mistakenly regard an object in a cpu cache as active.
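
For illustration, a minimal sketch of the broken check (simplified from
the handle_slab() loop that this patch removes; the helper name is
hypothetical):

  /*
   * An object is presumed active unless it is found on the slab
   * freelist.  An object sitting in a cpu cache is free but NOT on
   * the slab freelist, so it is wrongly reported as active.
   */
  static bool obj_looks_active(struct kmem_cache *c, struct page *page, int i)
  {
          int j;

          for (j = page->active; j < c->num; j++)
                  if (get_free_obj(page, j) == i)
                          return false;   /* on the freelist: free */
          return true;                    /* presumed active */
  }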

This introduces a kernel oops if DEBUG_PAGEALLOC is enabled.  With
DEBUG_PAGEALLOC, kernel_map_pages() is used to detect who corrupts free
memory in the slab: it removes the page table mapping when an object is
freed and restores it when the object becomes active.  When the slab
leak detector checks an object in a cpu cache, it mistakenly thinks the
object is active, so it tries to access the object's memory to retrieve
the caller of the allocation.  At that point no page table mapping to
the object exists, so an oops occurs.
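
Conceptually, the DEBUG_PAGEALLOC side looks like the following sketch
(hypothetical helper names; the real kernel_map_pages() calls live in
the slab poisoning paths and only apply when an object spans whole
pages):

  static void debug_unmap_obj(struct kmem_cache *cachep, void *objp)
  {
          /* object freed: remove the mapping so later reads fault */
          kernel_map_pages(virt_to_page(objp),
                           cachep->size / PAGE_SIZE, 0);
  }

  static void debug_map_obj(struct kmem_cache *cachep, void *objp)
  {
          /* object allocated: restore the mapping */
          kernel_map_pages(virt_to_page(objp),
                           cachep->size / PAGE_SIZE, 1);
  }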

Following is the oops message reported by Dave.

It blew up when something tried to read /proc/slab_allocators
(Just cat it, and you should see the oops below)

  Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
  Modules linked in:
  [snip...]
  CPU: 1 PID: 9386 Comm: trinity-c33 Not tainted 3.14.0-rc5+ #131
  task: ffff8801aa46e890 ti: ffff880076924000 task.ti: ffff880076924000
  RIP: 0010:[<ffffffffaa1a8f4a>]  [<ffffffffaa1a8f4a>] handle_slab+0x8a/0x180
  RSP: 0018:ffff880076925de0  EFLAGS: 00010002
  RAX: 0000000000001000 RBX: 0000000000000000 RCX: 000000005ce85ce7
  RDX: ffffea00079be100 RSI: 0000000000001000 RDI: ffff880107458000
  RBP: ffff880076925e18 R08: 0000000000000001 R09: 0000000000000000
  R10: 0000000000000000 R11: 000000000000000f R12: ffff8801e6f84000
  R13: ffffea00079be100 R14: ffff880107458000 R15: ffff88022bb8d2c0
  FS:  00007fb769e45740(0000) GS:ffff88024d040000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: ffff8801e6f84ff8 CR3: 00000000a22db000 CR4: 00000000001407e0
  DR0: 0000000002695000 DR1: 0000000002695000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000070602
  Call Trace:
    leaks_show+0xce/0x240
    seq_read+0x28e/0x490
    proc_reg_read+0x3d/0x80
    vfs_read+0x9b/0x160
    SyS_read+0x58/0xb0
    tracesys+0xd4/0xd9
  Code: f5 00 00 00 0f 1f 44 00 00 48 63 c8 44 3b 0c 8a 0f 84 e3 00 00 00 83 c0 01 44 39 c0 72 eb 41 f6 47 1a 01 0f 84 e9 00 00 00 89 f0 <4d> 8b 4c 04 f8 4d 85 c9 0f 84 88 00 00 00 49 8b 7e 08 4d 8d 46
  RIP   handle_slab+0x8a/0x180

To fix the problem, I introduce an object status buffer on each slab.
With this, we can track object status precisely, so the slab leak
detector never touches a free (unmapped) object and no kernel oops
occurs.  The memory overhead of this fix is imposed only with
CONFIG_DEBUG_SLAB_LEAK, which is mainly used for debugging, so the
overhead isn't a big problem.
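
The status buffer lives directly behind the freelist index array in the
slab management area, so the lookup is plain pointer arithmetic.  A
sketch of the layout (mirroring set_obj_status()/get_obj_status() in
the diff below):

  /*
   * page->freelist
   * |<- num * sizeof(freelist_idx_t) ->|<- num * sizeof(char) ->|
   * |      freelist index array        | per-object status byte |
   *
   * Status byte of object idx:
   */
  char *status = (char *)page->freelist + cachep->num * sizeof(freelist_idx_t);
  /* status[idx] is OBJECT_FREE or OBJECT_ACTIVE */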

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Reported-by: Dave Jones <davej@redhat.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 71 additions and 19 deletions

--- a/mm/slab.c
+++ b/mm/slab.c
@@ -386,6 +386,39 @@
 
 #endif
 
+#define OBJECT_FREE (0)
+#define OBJECT_ACTIVE (1)
+
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+
+static void set_obj_status(struct page *page, int idx, int val)
+{
+        int freelist_size;
+        char *status;
+        struct kmem_cache *cachep = page->slab_cache;
+
+        freelist_size = cachep->num * sizeof(freelist_idx_t);
+        status = (char *)page->freelist + freelist_size;
+        status[idx] = val;
+}
+
+static inline unsigned int get_obj_status(struct page *page, int idx)
+{
+        int freelist_size;
+        char *status;
+        struct kmem_cache *cachep = page->slab_cache;
+
+        freelist_size = cachep->num * sizeof(freelist_idx_t);
+        status = (char *)page->freelist + freelist_size;
+
+        return status[idx];
+}
+
+#else
+static inline void set_obj_status(struct page *page, int idx, int val) {}
+
+#endif
+
 /*
  * Do not go above this order unless 0 objects fit into the slab or
  * overridden on the command line.
@@ -576,12 +609,30 @@
         return cachep->array[smp_processor_id()];
 }
 
+static size_t calculate_freelist_size(int nr_objs, size_t align)
+{
+        size_t freelist_size;
+
+        freelist_size = nr_objs * sizeof(freelist_idx_t);
+        if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+                freelist_size += nr_objs * sizeof(char);
+
+        if (align)
+                freelist_size = ALIGN(freelist_size, align);
+
+        return freelist_size;
+}
+
 static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
                              size_t idx_size, size_t align)
 {
         int nr_objs;
+        size_t remained_size;
         size_t freelist_size;
+        int extra_space = 0;
 
+        if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+                extra_space = sizeof(char);
         /*
          * Ignore padding for the initial guess. The padding
          * is at most @align-1 bytes, and @buffer_size is at
@@ -590,14 +641,15 @@
          * into the memory allocation when taking the padding
          * into account.
          */
-        nr_objs = slab_size / (buffer_size + idx_size);
+        nr_objs = slab_size / (buffer_size + idx_size + extra_space);
 
         /*
          * This calculated number will be either the right
          * amount, or one greater than what we want.
          */
-        freelist_size = slab_size - nr_objs * buffer_size;
-        if (freelist_size < ALIGN(nr_objs * idx_size, align))
+        remained_size = slab_size - nr_objs * buffer_size;
+        freelist_size = calculate_freelist_size(nr_objs, align);
+        if (remained_size < freelist_size)
                 nr_objs--;
 
         return nr_objs;
@@ -635,7 +687,7 @@
         } else {
                 nr_objs = calculate_nr_objs(slab_size, buffer_size,
                                             sizeof(freelist_idx_t), align);
-                mgmt_size = ALIGN(nr_objs * sizeof(freelist_idx_t), align);
+                mgmt_size = calculate_freelist_size(nr_objs, align);
         }
         *num = nr_objs;
         *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
@@ -2041,13 +2093,16 @@
                         break;
 
                 if (flags & CFLGS_OFF_SLAB) {
+                        size_t freelist_size_per_obj = sizeof(freelist_idx_t);
                         /*
                          * Max number of objs-per-slab for caches which
                          * use off-slab slabs. Needed to avoid a possible
                          * looping condition in cache_grow().
                          */
+                        if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
+                                freelist_size_per_obj += sizeof(char);
                         offslab_limit = size;
-                        offslab_limit /= sizeof(freelist_idx_t);
+                        offslab_limit /= freelist_size_per_obj;
 
                         if (num > offslab_limit)
                                 break;
@@ -2294,8 +2349,7 @@
         if (!cachep->num)
                 return -E2BIG;
 
-        freelist_size =
-                ALIGN(cachep->num * sizeof(freelist_idx_t), cachep->align);
+        freelist_size = calculate_freelist_size(cachep->num, cachep->align);
 
         /*
          * If the slab has been placed off-slab, and we have enough space then
@@ -2308,7 +2362,7 @@
 
         if (flags & CFLGS_OFF_SLAB) {
                 /* really off slab. No need for manual alignment */
-                freelist_size = cachep->num * sizeof(freelist_idx_t);
+                freelist_size = calculate_freelist_size(cachep->num, 0);
 
 #ifdef CONFIG_PAGE_POISONING
         /* If we're going to use the generic kernel_map_pages()
@@ -2612,6 +2666,7 @@
                 if (cachep->ctor)
                         cachep->ctor(objp);
 #endif
+                set_obj_status(page, i, OBJECT_FREE);
                 set_free_obj(page, i, i);
         }
 }
@@ -2820,6 +2875,7 @@
         BUG_ON(objnr >= cachep->num);
         BUG_ON(objp != index_to_obj(cachep, page, objnr));
 
+        set_obj_status(page, objnr, OBJECT_FREE);
         if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
                 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2953,6 +3009,8 @@
 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
                                 gfp_t flags, void *objp, unsigned long caller)
 {
+        struct page *page;
+
         if (!objp)
                 return objp;
         if (cachep->flags & SLAB_POISON) {
@@ -2983,6 +3041,9 @@
                 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
                 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
         }
+
+        page = virt_to_head_page(objp);
+        set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
         objp += obj_offset(cachep);
         if (cachep->ctor && cachep->flags & SLAB_POISON)
                 cachep->ctor(objp);
@@ -4219,21 +4280,12 @@
                        struct page *page)
 {
         void *p;
-        int i, j;
+        int i;
 
         if (n[0] == n[1])
                 return;
         for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
-                bool active = true;
-
-                for (j = page->active; j < c->num; j++) {
-                        /* Skip freed item */
-                        if (get_free_obj(page, j) == i) {
-                                active = false;
-                                break;
-                        }
-                }
-                if (!active)
+                if (get_obj_status(page, i) != OBJECT_ACTIVE)
                         continue;
 
                 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))