Commit 6a2d7a955d8de6cb19ed9cd194b3c83008a22c32

Authored by Eric Dumazet
Committed by Linus Torvalds
1 parent 02a0e53d82

[PATCH] SLAB: use a multiply instead of a divide in obj_to_index()

When some objects are allocated by one CPU but freed by another CPU, we can
consume a lot of cycles doing divides in obj_to_index().

(This is a typical load on a dual-processor machine where network interrupts
are handled by one particular CPU (allocating skbufs), while the other CPU
runs the application (consuming and freeing skbufs).)

On one production server (dual-core AMD Opteron 285), I noticed this divide
accounted for 1.20% of CPU_CLK_UNHALTED events in the kernel.  But Opterons
are quite modern CPUs, and the divide is much more expensive on older
architectures:

On a 200 MHz sparcv9 machine, the division takes 64 cycles instead of 1
cycle for a multiply.

Doing some math, we can use a reciprocal multiplication instead of a divide.

If we want to compute V = (A / B)  (A and B being u32 quantities),
we can instead use:

V = ((u64)A * RECIPROCAL(B)) >> 32;

where RECIPROCAL(B) is precalculated as ((1LL << 32) + (B - 1)) / B

Note:

I wrote pure C code for clarity.  The gcc output for i386 is not optimal but
acceptable:

mull   0x14(%ebx)
mov    %edx,%eax // part of the >> 32
xor     %edx,%edx // useless
mov    %eax,(%esp) // could be avoided
mov    %edx,0x4(%esp) // useless
mov    (%esp),%ebx
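
As a quick illustration (an editorial addition, not part of the patch), here is a
minimal standalone sketch of the same trick. The two helpers mirror the new
reciprocal_value()/reciprocal_divide() API but are re-implemented locally so the
check runs in userspace; main() and the chosen divisor are purely illustrative:

/*
 * Standalone userspace check of the reciprocal-divide trick described above.
 * Exact as long as the dividend stays small, which is the case for slab
 * object offsets.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t reciprocal_value(uint32_t b)
{
	/* Precomputed once per divisor: ceil(2^32 / b). */
	return (uint32_t)(((1ULL << 32) + (b - 1)) / b);
}

static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
	/* One 32x32->64 multiply and a shift instead of a divide. */
	return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
	uint32_t b = 192;	/* e.g. a slab buffer_size */
	uint32_t r = reciprocal_value(b);
	uint32_t a;

	for (a = 0; a < 1000000; a++)
		assert(reciprocal_divide(a, r) == a / b);

	printf("reciprocal divide matches the real divide for b=%u\n", b);
	return 0;
}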

[akpm@osdl.org: small cleanups]
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 4 changed files with 57 additions and 4 deletions

include/linux/reciprocal_div.h
File was created 1 #ifndef _LINUX_RECIPROCAL_DIV_H
2 #define _LINUX_RECIPROCAL_DIV_H
3
4 #include <linux/types.h>
5
6 /*
7 * This file describes reciprocal division.
8 *
9 * This optimizes the (A/B) problem, when A and B are two u32
10 * and B is a known value (but not known at compile time)
11 *
12 * The math principle used is :
13 * Let RECIPROCAL_VALUE(B) be (((1LL << 32) + (B - 1))/ B)
14 * Then A / B = (u32)(((u64)(A) * (R)) >> 32)
15 *
16 * This replaces a divide by a multiply (and a shift), and
17 * is generally less expensive in CPU cycles.
18 */
19
20 /*
21 * Computes the reciprocal value (R) for the value B of the divisor.
22 * Should not be called before each reciprocal_divide(),
23 * or else the performance is slower than a normal divide.
24 */
25 extern u32 reciprocal_value(u32 B);
26
27
28 static inline u32 reciprocal_divide(u32 A, u32 R)
29 {
30 return (u32)(((u64)A * R) >> 32);
31 }
32 #endif
33
lib/Makefile
1 # 1 #
2 # Makefile for some libs needed in the kernel. 2 # Makefile for some libs needed in the kernel.
3 # 3 #
4 4
5 lib-y := ctype.o string.o vsprintf.o cmdline.o \ 5 lib-y := ctype.o string.o vsprintf.o cmdline.o \
6 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \ 6 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
7 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \ 7 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \
8 sha1.o irq_regs.o 8 sha1.o irq_regs.o reciprocal_div.o
9 9
10 lib-$(CONFIG_MMU) += ioremap.o 10 lib-$(CONFIG_MMU) += ioremap.o
11 lib-$(CONFIG_SMP) += cpumask.o 11 lib-$(CONFIG_SMP) += cpumask.o
12 12
13 lib-y += kobject.o kref.o kobject_uevent.o klist.o 13 lib-y += kobject.o kref.o kobject_uevent.o klist.o
14 14
15 obj-y += sort.o parser.o halfmd4.o iomap_copy.o debug_locks.o random32.o 15 obj-y += sort.o parser.o halfmd4.o iomap_copy.o debug_locks.o random32.o
16 16
17 ifeq ($(CONFIG_DEBUG_KOBJECT),y) 17 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
18 CFLAGS_kobject.o += -DDEBUG 18 CFLAGS_kobject.o += -DDEBUG
19 CFLAGS_kobject_uevent.o += -DDEBUG 19 CFLAGS_kobject_uevent.o += -DDEBUG
20 endif 20 endif
21 21
22 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o 22 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
23 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o 23 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
24 lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o 24 lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
25 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o 25 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
26 lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o 26 lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
27 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o 27 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
28 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o 28 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
29 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o 29 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
30 obj-$(CONFIG_PLIST) += plist.o 30 obj-$(CONFIG_PLIST) += plist.o
31 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o 31 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
32 obj-$(CONFIG_DEBUG_LIST) += list_debug.o 32 obj-$(CONFIG_DEBUG_LIST) += list_debug.o
33 33
34 ifneq ($(CONFIG_HAVE_DEC_LOCK),y) 34 ifneq ($(CONFIG_HAVE_DEC_LOCK),y)
35 lib-y += dec_and_lock.o 35 lib-y += dec_and_lock.o
36 endif 36 endif
37 37
38 obj-$(CONFIG_BITREVERSE) += bitrev.o 38 obj-$(CONFIG_BITREVERSE) += bitrev.o
39 obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o 39 obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o
40 obj-$(CONFIG_CRC16) += crc16.o 40 obj-$(CONFIG_CRC16) += crc16.o
41 obj-$(CONFIG_CRC32) += crc32.o 41 obj-$(CONFIG_CRC32) += crc32.o
42 obj-$(CONFIG_LIBCRC32C) += libcrc32c.o 42 obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
43 obj-$(CONFIG_GENERIC_IOMAP) += iomap.o 43 obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
44 obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o 44 obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
45 45
46 obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/ 46 obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
47 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/ 47 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/
48 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ 48 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
49 49
50 obj-$(CONFIG_TEXTSEARCH) += textsearch.o 50 obj-$(CONFIG_TEXTSEARCH) += textsearch.o
51 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o 51 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
52 obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o 52 obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
53 obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o 53 obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
54 obj-$(CONFIG_SMP) += percpu_counter.o 54 obj-$(CONFIG_SMP) += percpu_counter.o
55 obj-$(CONFIG_AUDIT_GENERIC) += audit.o 55 obj-$(CONFIG_AUDIT_GENERIC) += audit.o
56 56
57 obj-$(CONFIG_SWIOTLB) += swiotlb.o 57 obj-$(CONFIG_SWIOTLB) += swiotlb.o
58 obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o 58 obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
59 59
60 lib-$(CONFIG_GENERIC_BUG) += bug.o 60 lib-$(CONFIG_GENERIC_BUG) += bug.o
61 61
62 hostprogs-y := gen_crc32table 62 hostprogs-y := gen_crc32table
63 clean-files := crc32table.h 63 clean-files := crc32table.h
64 64
65 $(obj)/crc32.o: $(obj)/crc32table.h 65 $(obj)/crc32.o: $(obj)/crc32table.h
66 66
67 quiet_cmd_crc32 = GEN $@ 67 quiet_cmd_crc32 = GEN $@
68 cmd_crc32 = $< > $@ 68 cmd_crc32 = $< > $@
69 69
70 $(obj)/crc32table.h: $(obj)/gen_crc32table 70 $(obj)/crc32table.h: $(obj)/gen_crc32table
71 $(call cmd,crc32) 71 $(call cmd,crc32)
72 72
lib/reciprocal_div.c
File was created 1 #include <asm/div64.h>
2 #include <linux/reciprocal_div.h>
3
4 u32 reciprocal_value(u32 k)
5 {
6 u64 val = (1LL << 32) + (k - 1);
7 do_div(val, k);
8 return (u32)val;
9 }
10
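
Usage note (editorial addition, not part of the patch): a consumer computes the
reciprocal once, when the divisor becomes known, and uses reciprocal_divide()
on every fast-path division afterwards. In the sketch below only the two
library calls and the header come from this patch; the structure and function
names are hypothetical.

/*
 * Illustrative consumer of the new library (editorial sketch, not code from
 * the patch): compute the reciprocal once when the divisor becomes known,
 * then replace each fast-path divide with a multiply and a shift.
 */
#include <linux/reciprocal_div.h>

struct my_pool {			/* hypothetical example structure */
	unsigned int obj_size;		/* divisor, fixed at creation time */
	u32 reciprocal_obj_size;	/* cached reciprocal_value(obj_size) */
	void *base;
};

static void my_pool_init(struct my_pool *pool, unsigned int obj_size, void *base)
{
	pool->obj_size = obj_size;
	pool->base = base;
	/* The expensive divide happens once, here, not in the fast path. */
	pool->reciprocal_obj_size = reciprocal_value(obj_size);
}

static unsigned int my_pool_obj_index(struct my_pool *pool, void *obj)
{
	u32 offset = obj - pool->base;

	/* Equivalent to offset / pool->obj_size, without a divide. */
	return reciprocal_divide(offset, pool->reciprocal_obj_size);
}

This is the same pattern the slab change below follows: struct kmem_cache gains
a reciprocal_buffer_size field, and obj_to_index() switches from a divide to
reciprocal_divide().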
mm/slab.c
1 /* 1 /*
2 * linux/mm/slab.c 2 * linux/mm/slab.c
3 * Written by Mark Hemment, 1996/97. 3 * Written by Mark Hemment, 1996/97.
4 * (markhe@nextd.demon.co.uk) 4 * (markhe@nextd.demon.co.uk)
5 * 5 *
6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli 6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7 * 7 *
8 * Major cleanup, different bufctl logic, per-cpu arrays 8 * Major cleanup, different bufctl logic, per-cpu arrays
9 * (c) 2000 Manfred Spraul 9 * (c) 2000 Manfred Spraul
10 * 10 *
11 * Cleanup, make the head arrays unconditional, preparation for NUMA 11 * Cleanup, make the head arrays unconditional, preparation for NUMA
12 * (c) 2002 Manfred Spraul 12 * (c) 2002 Manfred Spraul
13 * 13 *
14 * An implementation of the Slab Allocator as described in outline in; 14 * An implementation of the Slab Allocator as described in outline in;
15 * UNIX Internals: The New Frontiers by Uresh Vahalia 15 * UNIX Internals: The New Frontiers by Uresh Vahalia
16 * Pub: Prentice Hall ISBN 0-13-101908-2 16 * Pub: Prentice Hall ISBN 0-13-101908-2
17 * or with a little more detail in; 17 * or with a little more detail in;
18 * The Slab Allocator: An Object-Caching Kernel Memory Allocator 18 * The Slab Allocator: An Object-Caching Kernel Memory Allocator
19 * Jeff Bonwick (Sun Microsystems). 19 * Jeff Bonwick (Sun Microsystems).
20 * Presented at: USENIX Summer 1994 Technical Conference 20 * Presented at: USENIX Summer 1994 Technical Conference
21 * 21 *
22 * The memory is organized in caches, one cache for each object type. 22 * The memory is organized in caches, one cache for each object type.
23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) 23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24 * Each cache consists out of many slabs (they are small (usually one 24 * Each cache consists out of many slabs (they are small (usually one
25 * page long) and always contiguous), and each slab contains multiple 25 * page long) and always contiguous), and each slab contains multiple
26 * initialized objects. 26 * initialized objects.
27 * 27 *
28 * This means, that your constructor is used only for newly allocated 28 * This means, that your constructor is used only for newly allocated
29 * slabs and you must pass objects with the same intializations to 29 * slabs and you must pass objects with the same intializations to
30 * kmem_cache_free. 30 * kmem_cache_free.
31 * 31 *
32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, 32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33 * normal). If you need a special memory type, then must create a new 33 * normal). If you need a special memory type, then must create a new
34 * cache for that memory type. 34 * cache for that memory type.
35 * 35 *
36 * In order to reduce fragmentation, the slabs are sorted in 3 groups: 36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37 * full slabs with 0 free objects 37 * full slabs with 0 free objects
38 * partial slabs 38 * partial slabs
39 * empty slabs with no allocated objects 39 * empty slabs with no allocated objects
40 * 40 *
41 * If partial slabs exist, then new allocations come from these slabs, 41 * If partial slabs exist, then new allocations come from these slabs,
42 * otherwise from empty slabs or new slabs are allocated. 42 * otherwise from empty slabs or new slabs are allocated.
43 * 43 *
44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache 44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs. 45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46 * 46 *
47 * Each cache has a short per-cpu head array, most allocs 47 * Each cache has a short per-cpu head array, most allocs
48 * and frees go into that array, and if that array overflows, then 1/2 48 * and frees go into that array, and if that array overflows, then 1/2
49 * of the entries in the array are given back into the global cache. 49 * of the entries in the array are given back into the global cache.
50 * The head array is strictly LIFO and should improve the cache hit rates. 50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations. 51 * On SMP, it additionally reduces the spinlock operations.
52 * 52 *
53 * The c_cpuarray may not be read with enabled local interrupts - 53 * The c_cpuarray may not be read with enabled local interrupts -
54 * it's changed with a smp_call_function(). 54 * it's changed with a smp_call_function().
55 * 55 *
56 * SMP synchronization: 56 * SMP synchronization:
57 * constructors and destructors are called without any locking. 57 * constructors and destructors are called without any locking.
58 * Several members in struct kmem_cache and struct slab never change, they 58 * Several members in struct kmem_cache and struct slab never change, they
59 * are accessed without any locking. 59 * are accessed without any locking.
60 * The per-cpu arrays are never accessed from the wrong cpu, no locking, 60 * The per-cpu arrays are never accessed from the wrong cpu, no locking,
61 * and local interrupts are disabled so slab code is preempt-safe. 61 * and local interrupts are disabled so slab code is preempt-safe.
62 * The non-constant members are protected with a per-cache irq spinlock. 62 * The non-constant members are protected with a per-cache irq spinlock.
63 * 63 *
64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch 64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65 * in 2000 - many ideas in the current implementation are derived from 65 * in 2000 - many ideas in the current implementation are derived from
66 * his patch. 66 * his patch.
67 * 67 *
68 * Further notes from the original documentation: 68 * Further notes from the original documentation:
69 * 69 *
70 * 11 April '97. Started multi-threading - markhe 70 * 11 April '97. Started multi-threading - markhe
71 * The global cache-chain is protected by the mutex 'cache_chain_mutex'. 71 * The global cache-chain is protected by the mutex 'cache_chain_mutex'.
72 * The sem is only needed when accessing/extending the cache-chain, which 72 * The sem is only needed when accessing/extending the cache-chain, which
73 * can never happen inside an interrupt (kmem_cache_create(), 73 * can never happen inside an interrupt (kmem_cache_create(),
74 * kmem_cache_shrink() and kmem_cache_reap()). 74 * kmem_cache_shrink() and kmem_cache_reap()).
75 * 75 *
76 * At present, each engine can be growing a cache. This should be blocked. 76 * At present, each engine can be growing a cache. This should be blocked.
77 * 77 *
78 * 15 March 2005. NUMA slab allocator. 78 * 15 March 2005. NUMA slab allocator.
79 * Shai Fultheim <shai@scalex86.org>. 79 * Shai Fultheim <shai@scalex86.org>.
80 * Shobhit Dayal <shobhit@calsoftinc.com> 80 * Shobhit Dayal <shobhit@calsoftinc.com>
81 * Alok N Kataria <alokk@calsoftinc.com> 81 * Alok N Kataria <alokk@calsoftinc.com>
82 * Christoph Lameter <christoph@lameter.com> 82 * Christoph Lameter <christoph@lameter.com>
83 * 83 *
84 * Modified the slab allocator to be node aware on NUMA systems. 84 * Modified the slab allocator to be node aware on NUMA systems.
85 * Each node has its own list of partial, free and full slabs. 85 * Each node has its own list of partial, free and full slabs.
86 * All object allocations for a node occur from node specific slab lists. 86 * All object allocations for a node occur from node specific slab lists.
87 */ 87 */
88 88
89 #include <linux/slab.h> 89 #include <linux/slab.h>
90 #include <linux/mm.h> 90 #include <linux/mm.h>
91 #include <linux/poison.h> 91 #include <linux/poison.h>
92 #include <linux/swap.h> 92 #include <linux/swap.h>
93 #include <linux/cache.h> 93 #include <linux/cache.h>
94 #include <linux/interrupt.h> 94 #include <linux/interrupt.h>
95 #include <linux/init.h> 95 #include <linux/init.h>
96 #include <linux/compiler.h> 96 #include <linux/compiler.h>
97 #include <linux/cpuset.h> 97 #include <linux/cpuset.h>
98 #include <linux/seq_file.h> 98 #include <linux/seq_file.h>
99 #include <linux/notifier.h> 99 #include <linux/notifier.h>
100 #include <linux/kallsyms.h> 100 #include <linux/kallsyms.h>
101 #include <linux/cpu.h> 101 #include <linux/cpu.h>
102 #include <linux/sysctl.h> 102 #include <linux/sysctl.h>
103 #include <linux/module.h> 103 #include <linux/module.h>
104 #include <linux/rcupdate.h> 104 #include <linux/rcupdate.h>
105 #include <linux/string.h> 105 #include <linux/string.h>
106 #include <linux/uaccess.h> 106 #include <linux/uaccess.h>
107 #include <linux/nodemask.h> 107 #include <linux/nodemask.h>
108 #include <linux/mempolicy.h> 108 #include <linux/mempolicy.h>
109 #include <linux/mutex.h> 109 #include <linux/mutex.h>
110 #include <linux/fault-inject.h> 110 #include <linux/fault-inject.h>
111 #include <linux/rtmutex.h> 111 #include <linux/rtmutex.h>
112 #include <linux/reciprocal_div.h>
112 113
113 #include <asm/cacheflush.h> 114 #include <asm/cacheflush.h>
114 #include <asm/tlbflush.h> 115 #include <asm/tlbflush.h>
115 #include <asm/page.h> 116 #include <asm/page.h>
116 117
117 /* 118 /*
118 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, 119 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
119 * SLAB_RED_ZONE & SLAB_POISON. 120 * SLAB_RED_ZONE & SLAB_POISON.
120 * 0 for faster, smaller code (especially in the critical paths). 121 * 0 for faster, smaller code (especially in the critical paths).
121 * 122 *
122 * STATS - 1 to collect stats for /proc/slabinfo. 123 * STATS - 1 to collect stats for /proc/slabinfo.
123 * 0 for faster, smaller code (especially in the critical paths). 124 * 0 for faster, smaller code (especially in the critical paths).
124 * 125 *
125 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) 126 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
126 */ 127 */
127 128
128 #ifdef CONFIG_DEBUG_SLAB 129 #ifdef CONFIG_DEBUG_SLAB
129 #define DEBUG 1 130 #define DEBUG 1
130 #define STATS 1 131 #define STATS 1
131 #define FORCED_DEBUG 1 132 #define FORCED_DEBUG 1
132 #else 133 #else
133 #define DEBUG 0 134 #define DEBUG 0
134 #define STATS 0 135 #define STATS 0
135 #define FORCED_DEBUG 0 136 #define FORCED_DEBUG 0
136 #endif 137 #endif
137 138
138 /* Shouldn't this be in a header file somewhere? */ 139 /* Shouldn't this be in a header file somewhere? */
139 #define BYTES_PER_WORD sizeof(void *) 140 #define BYTES_PER_WORD sizeof(void *)
140 141
141 #ifndef cache_line_size 142 #ifndef cache_line_size
142 #define cache_line_size() L1_CACHE_BYTES 143 #define cache_line_size() L1_CACHE_BYTES
143 #endif 144 #endif
144 145
145 #ifndef ARCH_KMALLOC_MINALIGN 146 #ifndef ARCH_KMALLOC_MINALIGN
146 /* 147 /*
147 * Enforce a minimum alignment for the kmalloc caches. 148 * Enforce a minimum alignment for the kmalloc caches.
148 * Usually, the kmalloc caches are cache_line_size() aligned, except when 149 * Usually, the kmalloc caches are cache_line_size() aligned, except when
149 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. 150 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
150 * Some archs want to perform DMA into kmalloc caches and need a guaranteed 151 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
151 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that. 152 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
152 * Note that this flag disables some debug features. 153 * Note that this flag disables some debug features.
153 */ 154 */
154 #define ARCH_KMALLOC_MINALIGN 0 155 #define ARCH_KMALLOC_MINALIGN 0
155 #endif 156 #endif
156 157
157 #ifndef ARCH_SLAB_MINALIGN 158 #ifndef ARCH_SLAB_MINALIGN
158 /* 159 /*
159 * Enforce a minimum alignment for all caches. 160 * Enforce a minimum alignment for all caches.
160 * Intended for archs that get misalignment faults even for BYTES_PER_WORD 161 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
161 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN. 162 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
162 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables 163 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
163 * some debug features. 164 * some debug features.
164 */ 165 */
165 #define ARCH_SLAB_MINALIGN 0 166 #define ARCH_SLAB_MINALIGN 0
166 #endif 167 #endif
167 168
168 #ifndef ARCH_KMALLOC_FLAGS 169 #ifndef ARCH_KMALLOC_FLAGS
169 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 170 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
170 #endif 171 #endif
171 172
172 /* Legal flag mask for kmem_cache_create(). */ 173 /* Legal flag mask for kmem_cache_create(). */
173 #if DEBUG 174 #if DEBUG
174 # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ 175 # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
175 SLAB_POISON | SLAB_HWCACHE_ALIGN | \ 176 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
176 SLAB_CACHE_DMA | \ 177 SLAB_CACHE_DMA | \
177 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ 178 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
178 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 179 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
179 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) 180 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
180 #else 181 #else
181 # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ 182 # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
182 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ 183 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
183 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 184 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
184 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) 185 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
185 #endif 186 #endif
186 187
187 /* 188 /*
188 * kmem_bufctl_t: 189 * kmem_bufctl_t:
189 * 190 *
190 * Bufctl's are used for linking objs within a slab 191 * Bufctl's are used for linking objs within a slab
191 * linked offsets. 192 * linked offsets.
192 * 193 *
193 * This implementation relies on "struct page" for locating the cache & 194 * This implementation relies on "struct page" for locating the cache &
194 * slab an object belongs to. 195 * slab an object belongs to.
195 * This allows the bufctl structure to be small (one int), but limits 196 * This allows the bufctl structure to be small (one int), but limits
196 * the number of objects a slab (not a cache) can contain when off-slab 197 * the number of objects a slab (not a cache) can contain when off-slab
197 * bufctls are used. The limit is the size of the largest general cache 198 * bufctls are used. The limit is the size of the largest general cache
198 * that does not use off-slab slabs. 199 * that does not use off-slab slabs.
199 * For 32bit archs with 4 kB pages, is this 56. 200 * For 32bit archs with 4 kB pages, is this 56.
200 * This is not serious, as it is only for large objects, when it is unwise 201 * This is not serious, as it is only for large objects, when it is unwise
201 * to have too many per slab. 202 * to have too many per slab.
202 * Note: This limit can be raised by introducing a general cache whose size 203 * Note: This limit can be raised by introducing a general cache whose size
203 * is less than 512 (PAGE_SIZE<<3), but greater than 256. 204 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
204 */ 205 */
205 206
206 typedef unsigned int kmem_bufctl_t; 207 typedef unsigned int kmem_bufctl_t;
207 #define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) 208 #define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
208 #define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) 209 #define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
209 #define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) 210 #define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2)
210 #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) 211 #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
211 212
212 /* 213 /*
213 * struct slab 214 * struct slab
214 * 215 *
215 * Manages the objs in a slab. Placed either at the beginning of mem allocated 216 * Manages the objs in a slab. Placed either at the beginning of mem allocated
216 * for a slab, or allocated from an general cache. 217 * for a slab, or allocated from an general cache.
217 * Slabs are chained into three list: fully used, partial, fully free slabs. 218 * Slabs are chained into three list: fully used, partial, fully free slabs.
218 */ 219 */
219 struct slab { 220 struct slab {
220 struct list_head list; 221 struct list_head list;
221 unsigned long colouroff; 222 unsigned long colouroff;
222 void *s_mem; /* including colour offset */ 223 void *s_mem; /* including colour offset */
223 unsigned int inuse; /* num of objs active in slab */ 224 unsigned int inuse; /* num of objs active in slab */
224 kmem_bufctl_t free; 225 kmem_bufctl_t free;
225 unsigned short nodeid; 226 unsigned short nodeid;
226 }; 227 };
227 228
228 /* 229 /*
229 * struct slab_rcu 230 * struct slab_rcu
230 * 231 *
231 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to 232 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
232 * arrange for kmem_freepages to be called via RCU. This is useful if 233 * arrange for kmem_freepages to be called via RCU. This is useful if
233 * we need to approach a kernel structure obliquely, from its address 234 * we need to approach a kernel structure obliquely, from its address
234 * obtained without the usual locking. We can lock the structure to 235 * obtained without the usual locking. We can lock the structure to
235 * stabilize it and check it's still at the given address, only if we 236 * stabilize it and check it's still at the given address, only if we
236 * can be sure that the memory has not been meanwhile reused for some 237 * can be sure that the memory has not been meanwhile reused for some
237 * other kind of object (which our subsystem's lock might corrupt). 238 * other kind of object (which our subsystem's lock might corrupt).
238 * 239 *
239 * rcu_read_lock before reading the address, then rcu_read_unlock after 240 * rcu_read_lock before reading the address, then rcu_read_unlock after
240 * taking the spinlock within the structure expected at that address. 241 * taking the spinlock within the structure expected at that address.
241 * 242 *
242 * We assume struct slab_rcu can overlay struct slab when destroying. 243 * We assume struct slab_rcu can overlay struct slab when destroying.
243 */ 244 */
244 struct slab_rcu { 245 struct slab_rcu {
245 struct rcu_head head; 246 struct rcu_head head;
246 struct kmem_cache *cachep; 247 struct kmem_cache *cachep;
247 void *addr; 248 void *addr;
248 }; 249 };
249 250
250 /* 251 /*
251 * struct array_cache 252 * struct array_cache
252 * 253 *
253 * Purpose: 254 * Purpose:
254 * - LIFO ordering, to hand out cache-warm objects from _alloc 255 * - LIFO ordering, to hand out cache-warm objects from _alloc
255 * - reduce the number of linked list operations 256 * - reduce the number of linked list operations
256 * - reduce spinlock operations 257 * - reduce spinlock operations
257 * 258 *
258 * The limit is stored in the per-cpu structure to reduce the data cache 259 * The limit is stored in the per-cpu structure to reduce the data cache
259 * footprint. 260 * footprint.
260 * 261 *
261 */ 262 */
262 struct array_cache { 263 struct array_cache {
263 unsigned int avail; 264 unsigned int avail;
264 unsigned int limit; 265 unsigned int limit;
265 unsigned int batchcount; 266 unsigned int batchcount;
266 unsigned int touched; 267 unsigned int touched;
267 spinlock_t lock; 268 spinlock_t lock;
268 void *entry[0]; /* 269 void *entry[0]; /*
269 * Must have this definition in here for the proper 270 * Must have this definition in here for the proper
270 * alignment of array_cache. Also simplifies accessing 271 * alignment of array_cache. Also simplifies accessing
271 * the entries. 272 * the entries.
272 * [0] is for gcc 2.95. It should really be []. 273 * [0] is for gcc 2.95. It should really be [].
273 */ 274 */
274 }; 275 };
275 276
276 /* 277 /*
277 * bootstrap: The caches do not work without cpuarrays anymore, but the 278 * bootstrap: The caches do not work without cpuarrays anymore, but the
278 * cpuarrays are allocated from the generic caches... 279 * cpuarrays are allocated from the generic caches...
279 */ 280 */
280 #define BOOT_CPUCACHE_ENTRIES 1 281 #define BOOT_CPUCACHE_ENTRIES 1
281 struct arraycache_init { 282 struct arraycache_init {
282 struct array_cache cache; 283 struct array_cache cache;
283 void *entries[BOOT_CPUCACHE_ENTRIES]; 284 void *entries[BOOT_CPUCACHE_ENTRIES];
284 }; 285 };
285 286
286 /* 287 /*
287 * The slab lists for all objects. 288 * The slab lists for all objects.
288 */ 289 */
289 struct kmem_list3 { 290 struct kmem_list3 {
290 struct list_head slabs_partial; /* partial list first, better asm code */ 291 struct list_head slabs_partial; /* partial list first, better asm code */
291 struct list_head slabs_full; 292 struct list_head slabs_full;
292 struct list_head slabs_free; 293 struct list_head slabs_free;
293 unsigned long free_objects; 294 unsigned long free_objects;
294 unsigned int free_limit; 295 unsigned int free_limit;
295 unsigned int colour_next; /* Per-node cache coloring */ 296 unsigned int colour_next; /* Per-node cache coloring */
296 spinlock_t list_lock; 297 spinlock_t list_lock;
297 struct array_cache *shared; /* shared per node */ 298 struct array_cache *shared; /* shared per node */
298 struct array_cache **alien; /* on other nodes */ 299 struct array_cache **alien; /* on other nodes */
299 unsigned long next_reap; /* updated without locking */ 300 unsigned long next_reap; /* updated without locking */
300 int free_touched; /* updated without locking */ 301 int free_touched; /* updated without locking */
301 }; 302 };
302 303
303 /* 304 /*
304 * Need this for bootstrapping a per node allocator. 305 * Need this for bootstrapping a per node allocator.
305 */ 306 */
306 #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1) 307 #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
307 struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; 308 struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
308 #define CACHE_CACHE 0 309 #define CACHE_CACHE 0
309 #define SIZE_AC 1 310 #define SIZE_AC 1
310 #define SIZE_L3 (1 + MAX_NUMNODES) 311 #define SIZE_L3 (1 + MAX_NUMNODES)
311 312
312 static int drain_freelist(struct kmem_cache *cache, 313 static int drain_freelist(struct kmem_cache *cache,
313 struct kmem_list3 *l3, int tofree); 314 struct kmem_list3 *l3, int tofree);
314 static void free_block(struct kmem_cache *cachep, void **objpp, int len, 315 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
315 int node); 316 int node);
316 static int enable_cpucache(struct kmem_cache *cachep); 317 static int enable_cpucache(struct kmem_cache *cachep);
317 static void cache_reap(struct work_struct *unused); 318 static void cache_reap(struct work_struct *unused);
318 319
319 /* 320 /*
320 * This function must be completely optimized away if a constant is passed to 321 * This function must be completely optimized away if a constant is passed to
321 * it. Mostly the same as what is in linux/slab.h except it returns an index. 322 * it. Mostly the same as what is in linux/slab.h except it returns an index.
322 */ 323 */
323 static __always_inline int index_of(const size_t size) 324 static __always_inline int index_of(const size_t size)
324 { 325 {
325 extern void __bad_size(void); 326 extern void __bad_size(void);
326 327
327 if (__builtin_constant_p(size)) { 328 if (__builtin_constant_p(size)) {
328 int i = 0; 329 int i = 0;
329 330
330 #define CACHE(x) \ 331 #define CACHE(x) \
331 if (size <=x) \ 332 if (size <=x) \
332 return i; \ 333 return i; \
333 else \ 334 else \
334 i++; 335 i++;
335 #include "linux/kmalloc_sizes.h" 336 #include "linux/kmalloc_sizes.h"
336 #undef CACHE 337 #undef CACHE
337 __bad_size(); 338 __bad_size();
338 } else 339 } else
339 __bad_size(); 340 __bad_size();
340 return 0; 341 return 0;
341 } 342 }
342 343
343 static int slab_early_init = 1; 344 static int slab_early_init = 1;
344 345
345 #define INDEX_AC index_of(sizeof(struct arraycache_init)) 346 #define INDEX_AC index_of(sizeof(struct arraycache_init))
346 #define INDEX_L3 index_of(sizeof(struct kmem_list3)) 347 #define INDEX_L3 index_of(sizeof(struct kmem_list3))
347 348
348 static void kmem_list3_init(struct kmem_list3 *parent) 349 static void kmem_list3_init(struct kmem_list3 *parent)
349 { 350 {
350 INIT_LIST_HEAD(&parent->slabs_full); 351 INIT_LIST_HEAD(&parent->slabs_full);
351 INIT_LIST_HEAD(&parent->slabs_partial); 352 INIT_LIST_HEAD(&parent->slabs_partial);
352 INIT_LIST_HEAD(&parent->slabs_free); 353 INIT_LIST_HEAD(&parent->slabs_free);
353 parent->shared = NULL; 354 parent->shared = NULL;
354 parent->alien = NULL; 355 parent->alien = NULL;
355 parent->colour_next = 0; 356 parent->colour_next = 0;
356 spin_lock_init(&parent->list_lock); 357 spin_lock_init(&parent->list_lock);
357 parent->free_objects = 0; 358 parent->free_objects = 0;
358 parent->free_touched = 0; 359 parent->free_touched = 0;
359 } 360 }
360 361
361 #define MAKE_LIST(cachep, listp, slab, nodeid) \ 362 #define MAKE_LIST(cachep, listp, slab, nodeid) \
362 do { \ 363 do { \
363 INIT_LIST_HEAD(listp); \ 364 INIT_LIST_HEAD(listp); \
364 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ 365 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
365 } while (0) 366 } while (0)
366 367
367 #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ 368 #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
368 do { \ 369 do { \
369 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ 370 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
370 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ 371 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
371 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 372 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
372 } while (0) 373 } while (0)
373 374
374 /* 375 /*
375 * struct kmem_cache 376 * struct kmem_cache
376 * 377 *
377 * manages a cache. 378 * manages a cache.
378 */ 379 */
379 380
380 struct kmem_cache { 381 struct kmem_cache {
381 /* 1) per-cpu data, touched during every alloc/free */ 382 /* 1) per-cpu data, touched during every alloc/free */
382 struct array_cache *array[NR_CPUS]; 383 struct array_cache *array[NR_CPUS];
383 /* 2) Cache tunables. Protected by cache_chain_mutex */ 384 /* 2) Cache tunables. Protected by cache_chain_mutex */
384 unsigned int batchcount; 385 unsigned int batchcount;
385 unsigned int limit; 386 unsigned int limit;
386 unsigned int shared; 387 unsigned int shared;
387 388
388 unsigned int buffer_size; 389 unsigned int buffer_size;
390 u32 reciprocal_buffer_size;
389 /* 3) touched by every alloc & free from the backend */ 391 /* 3) touched by every alloc & free from the backend */
390 struct kmem_list3 *nodelists[MAX_NUMNODES]; 392 struct kmem_list3 *nodelists[MAX_NUMNODES];
391 393
392 unsigned int flags; /* constant flags */ 394 unsigned int flags; /* constant flags */
393 unsigned int num; /* # of objs per slab */ 395 unsigned int num; /* # of objs per slab */
394 396
395 /* 4) cache_grow/shrink */ 397 /* 4) cache_grow/shrink */
396 /* order of pgs per slab (2^n) */ 398 /* order of pgs per slab (2^n) */
397 unsigned int gfporder; 399 unsigned int gfporder;
398 400
399 /* force GFP flags, e.g. GFP_DMA */ 401 /* force GFP flags, e.g. GFP_DMA */
400 gfp_t gfpflags; 402 gfp_t gfpflags;
401 403
402 size_t colour; /* cache colouring range */ 404 size_t colour; /* cache colouring range */
403 unsigned int colour_off; /* colour offset */ 405 unsigned int colour_off; /* colour offset */
404 struct kmem_cache *slabp_cache; 406 struct kmem_cache *slabp_cache;
405 unsigned int slab_size; 407 unsigned int slab_size;
406 unsigned int dflags; /* dynamic flags */ 408 unsigned int dflags; /* dynamic flags */
407 409
408 /* constructor func */ 410 /* constructor func */
409 void (*ctor) (void *, struct kmem_cache *, unsigned long); 411 void (*ctor) (void *, struct kmem_cache *, unsigned long);
410 412
411 /* de-constructor func */ 413 /* de-constructor func */
412 void (*dtor) (void *, struct kmem_cache *, unsigned long); 414 void (*dtor) (void *, struct kmem_cache *, unsigned long);
413 415
414 /* 5) cache creation/removal */ 416 /* 5) cache creation/removal */
415 const char *name; 417 const char *name;
416 struct list_head next; 418 struct list_head next;
417 419
418 /* 6) statistics */ 420 /* 6) statistics */
419 #if STATS 421 #if STATS
420 unsigned long num_active; 422 unsigned long num_active;
421 unsigned long num_allocations; 423 unsigned long num_allocations;
422 unsigned long high_mark; 424 unsigned long high_mark;
423 unsigned long grown; 425 unsigned long grown;
424 unsigned long reaped; 426 unsigned long reaped;
425 unsigned long errors; 427 unsigned long errors;
426 unsigned long max_freeable; 428 unsigned long max_freeable;
427 unsigned long node_allocs; 429 unsigned long node_allocs;
428 unsigned long node_frees; 430 unsigned long node_frees;
429 unsigned long node_overflow; 431 unsigned long node_overflow;
430 atomic_t allochit; 432 atomic_t allochit;
431 atomic_t allocmiss; 433 atomic_t allocmiss;
432 atomic_t freehit; 434 atomic_t freehit;
433 atomic_t freemiss; 435 atomic_t freemiss;
434 #endif 436 #endif
435 #if DEBUG 437 #if DEBUG
436 /* 438 /*
437 * If debugging is enabled, then the allocator can add additional 439 * If debugging is enabled, then the allocator can add additional
438 * fields and/or padding to every object. buffer_size contains the total 440 * fields and/or padding to every object. buffer_size contains the total
439 * object size including these internal fields, the following two 441 * object size including these internal fields, the following two
440 * variables contain the offset to the user object and its size. 442 * variables contain the offset to the user object and its size.
441 */ 443 */
442 int obj_offset; 444 int obj_offset;
443 int obj_size; 445 int obj_size;
444 #endif 446 #endif
445 }; 447 };
446 448
447 #define CFLGS_OFF_SLAB (0x80000000UL) 449 #define CFLGS_OFF_SLAB (0x80000000UL)
448 #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 450 #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
449 451
450 #define BATCHREFILL_LIMIT 16 452 #define BATCHREFILL_LIMIT 16
451 /* 453 /*
452 * Optimization question: fewer reaps means less probability for unnessary 454 * Optimization question: fewer reaps means less probability for unnessary
453 * cpucache drain/refill cycles. 455 * cpucache drain/refill cycles.
454 * 456 *
455 * OTOH the cpuarrays can contain lots of objects, 457 * OTOH the cpuarrays can contain lots of objects,
456 * which could lock up otherwise freeable slabs. 458 * which could lock up otherwise freeable slabs.
457 */ 459 */
458 #define REAPTIMEOUT_CPUC (2*HZ) 460 #define REAPTIMEOUT_CPUC (2*HZ)
459 #define REAPTIMEOUT_LIST3 (4*HZ) 461 #define REAPTIMEOUT_LIST3 (4*HZ)
460 462
461 #if STATS 463 #if STATS
462 #define STATS_INC_ACTIVE(x) ((x)->num_active++) 464 #define STATS_INC_ACTIVE(x) ((x)->num_active++)
463 #define STATS_DEC_ACTIVE(x) ((x)->num_active--) 465 #define STATS_DEC_ACTIVE(x) ((x)->num_active--)
464 #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 466 #define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
465 #define STATS_INC_GROWN(x) ((x)->grown++) 467 #define STATS_INC_GROWN(x) ((x)->grown++)
466 #define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) 468 #define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
467 #define STATS_SET_HIGH(x) \ 469 #define STATS_SET_HIGH(x) \
468 do { \ 470 do { \
469 if ((x)->num_active > (x)->high_mark) \ 471 if ((x)->num_active > (x)->high_mark) \
470 (x)->high_mark = (x)->num_active; \ 472 (x)->high_mark = (x)->num_active; \
471 } while (0) 473 } while (0)
472 #define STATS_INC_ERR(x) ((x)->errors++) 474 #define STATS_INC_ERR(x) ((x)->errors++)
473 #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 475 #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
474 #define STATS_INC_NODEFREES(x) ((x)->node_frees++) 476 #define STATS_INC_NODEFREES(x) ((x)->node_frees++)
475 #define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) 477 #define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++)
476 #define STATS_SET_FREEABLE(x, i) \ 478 #define STATS_SET_FREEABLE(x, i) \
477 do { \ 479 do { \
478 if ((x)->max_freeable < i) \ 480 if ((x)->max_freeable < i) \
479 (x)->max_freeable = i; \ 481 (x)->max_freeable = i; \
480 } while (0) 482 } while (0)
481 #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) 483 #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
482 #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) 484 #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
483 #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) 485 #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
484 #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) 486 #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss)
485 #else 487 #else
486 #define STATS_INC_ACTIVE(x) do { } while (0) 488 #define STATS_INC_ACTIVE(x) do { } while (0)
487 #define STATS_DEC_ACTIVE(x) do { } while (0) 489 #define STATS_DEC_ACTIVE(x) do { } while (0)
488 #define STATS_INC_ALLOCED(x) do { } while (0) 490 #define STATS_INC_ALLOCED(x) do { } while (0)
489 #define STATS_INC_GROWN(x) do { } while (0) 491 #define STATS_INC_GROWN(x) do { } while (0)
490 #define STATS_ADD_REAPED(x,y) do { } while (0) 492 #define STATS_ADD_REAPED(x,y) do { } while (0)
491 #define STATS_SET_HIGH(x) do { } while (0) 493 #define STATS_SET_HIGH(x) do { } while (0)
492 #define STATS_INC_ERR(x) do { } while (0) 494 #define STATS_INC_ERR(x) do { } while (0)
493 #define STATS_INC_NODEALLOCS(x) do { } while (0) 495 #define STATS_INC_NODEALLOCS(x) do { } while (0)
494 #define STATS_INC_NODEFREES(x) do { } while (0) 496 #define STATS_INC_NODEFREES(x) do { } while (0)
495 #define STATS_INC_ACOVERFLOW(x) do { } while (0) 497 #define STATS_INC_ACOVERFLOW(x) do { } while (0)
496 #define STATS_SET_FREEABLE(x, i) do { } while (0) 498 #define STATS_SET_FREEABLE(x, i) do { } while (0)
497 #define STATS_INC_ALLOCHIT(x) do { } while (0) 499 #define STATS_INC_ALLOCHIT(x) do { } while (0)
498 #define STATS_INC_ALLOCMISS(x) do { } while (0) 500 #define STATS_INC_ALLOCMISS(x) do { } while (0)
499 #define STATS_INC_FREEHIT(x) do { } while (0) 501 #define STATS_INC_FREEHIT(x) do { } while (0)
500 #define STATS_INC_FREEMISS(x) do { } while (0) 502 #define STATS_INC_FREEMISS(x) do { } while (0)
501 #endif 503 #endif
502 504
503 #if DEBUG 505 #if DEBUG
504 506
505 /* 507 /*
506 * memory layout of objects: 508 * memory layout of objects:
507 * 0 : objp 509 * 0 : objp
508 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that 510 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
509 * the end of an object is aligned with the end of the real 511 * the end of an object is aligned with the end of the real
510 * allocation. Catches writes behind the end of the allocation. 512 * allocation. Catches writes behind the end of the allocation.
511 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: 513 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
512 * redzone word. 514 * redzone word.
513 * cachep->obj_offset: The real object. 515 * cachep->obj_offset: The real object.
514 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 516 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
515 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address 517 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
516 * [BYTES_PER_WORD long] 518 * [BYTES_PER_WORD long]
517 */ 519 */
518 static int obj_offset(struct kmem_cache *cachep) 520 static int obj_offset(struct kmem_cache *cachep)
519 { 521 {
520 return cachep->obj_offset; 522 return cachep->obj_offset;
521 } 523 }
522 524
523 static int obj_size(struct kmem_cache *cachep) 525 static int obj_size(struct kmem_cache *cachep)
524 { 526 {
525 return cachep->obj_size; 527 return cachep->obj_size;
526 } 528 }
527 529
528 static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp) 530 static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
529 { 531 {
530 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 532 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
531 return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD); 533 return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD);
532 } 534 }
533 535
534 static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp) 536 static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
535 { 537 {
536 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 538 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
537 if (cachep->flags & SLAB_STORE_USER) 539 if (cachep->flags & SLAB_STORE_USER)
538 return (unsigned long *)(objp + cachep->buffer_size - 540 return (unsigned long *)(objp + cachep->buffer_size -
539 2 * BYTES_PER_WORD); 541 2 * BYTES_PER_WORD);
540 return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD); 542 return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD);
541 } 543 }
542 544
543 static void **dbg_userword(struct kmem_cache *cachep, void *objp) 545 static void **dbg_userword(struct kmem_cache *cachep, void *objp)
544 { 546 {
545 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 547 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
546 return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); 548 return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
547 } 549 }
548 550
549 #else 551 #else
550 552
551 #define obj_offset(x) 0 553 #define obj_offset(x) 0
552 #define obj_size(cachep) (cachep->buffer_size) 554 #define obj_size(cachep) (cachep->buffer_size)
553 #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 555 #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;})
554 #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 556 #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;})
555 #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 557 #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
556 558
557 #endif 559 #endif
558 560
559 /* 561 /*
560 * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp 562 * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
561 * order. 563 * order.
562 */ 564 */
563 #if defined(CONFIG_LARGE_ALLOCS) 565 #if defined(CONFIG_LARGE_ALLOCS)
564 #define MAX_OBJ_ORDER 13 /* up to 32Mb */ 566 #define MAX_OBJ_ORDER 13 /* up to 32Mb */
565 #define MAX_GFP_ORDER 13 /* up to 32Mb */ 567 #define MAX_GFP_ORDER 13 /* up to 32Mb */
566 #elif defined(CONFIG_MMU) 568 #elif defined(CONFIG_MMU)
567 #define MAX_OBJ_ORDER 5 /* 32 pages */ 569 #define MAX_OBJ_ORDER 5 /* 32 pages */
568 #define MAX_GFP_ORDER 5 /* 32 pages */ 570 #define MAX_GFP_ORDER 5 /* 32 pages */
569 #else 571 #else
570 #define MAX_OBJ_ORDER 8 /* up to 1Mb */ 572 #define MAX_OBJ_ORDER 8 /* up to 1Mb */
571 #define MAX_GFP_ORDER 8 /* up to 1Mb */ 573 #define MAX_GFP_ORDER 8 /* up to 1Mb */
572 #endif 574 #endif
573 575
574 /* 576 /*
575 * Do not go above this order unless 0 objects fit into the slab. 577 * Do not go above this order unless 0 objects fit into the slab.
576 */ 578 */
577 #define BREAK_GFP_ORDER_HI 1 579 #define BREAK_GFP_ORDER_HI 1
578 #define BREAK_GFP_ORDER_LO 0 580 #define BREAK_GFP_ORDER_LO 0
579 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; 581 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
580 582
581 /* 583 /*
582 * Functions for storing/retrieving the cachep and or slab from the page 584 * Functions for storing/retrieving the cachep and or slab from the page
583 * allocator. These are used to find the slab an obj belongs to. With kfree(), 585 * allocator. These are used to find the slab an obj belongs to. With kfree(),
584 * these are used to find the cache which an obj belongs to. 586 * these are used to find the cache which an obj belongs to.
585 */ 587 */
586 static inline void page_set_cache(struct page *page, struct kmem_cache *cache) 588 static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
587 { 589 {
588 page->lru.next = (struct list_head *)cache; 590 page->lru.next = (struct list_head *)cache;
589 } 591 }
590 592
591 static inline struct kmem_cache *page_get_cache(struct page *page) 593 static inline struct kmem_cache *page_get_cache(struct page *page)
592 { 594 {
593 if (unlikely(PageCompound(page))) 595 if (unlikely(PageCompound(page)))
594 page = (struct page *)page_private(page); 596 page = (struct page *)page_private(page);
595 BUG_ON(!PageSlab(page)); 597 BUG_ON(!PageSlab(page));
596 return (struct kmem_cache *)page->lru.next; 598 return (struct kmem_cache *)page->lru.next;
597 } 599 }
598 600
599 static inline void page_set_slab(struct page *page, struct slab *slab) 601 static inline void page_set_slab(struct page *page, struct slab *slab)
600 { 602 {
601 page->lru.prev = (struct list_head *)slab; 603 page->lru.prev = (struct list_head *)slab;
602 } 604 }
603 605
604 static inline struct slab *page_get_slab(struct page *page) 606 static inline struct slab *page_get_slab(struct page *page)
605 { 607 {
606 if (unlikely(PageCompound(page))) 608 if (unlikely(PageCompound(page)))
607 page = (struct page *)page_private(page); 609 page = (struct page *)page_private(page);
608 BUG_ON(!PageSlab(page)); 610 BUG_ON(!PageSlab(page));
609 return (struct slab *)page->lru.prev; 611 return (struct slab *)page->lru.prev;
610 } 612 }
611 613
612 static inline struct kmem_cache *virt_to_cache(const void *obj) 614 static inline struct kmem_cache *virt_to_cache(const void *obj)
613 { 615 {
614 struct page *page = virt_to_page(obj); 616 struct page *page = virt_to_page(obj);
615 return page_get_cache(page); 617 return page_get_cache(page);
616 } 618 }
617 619
618 static inline struct slab *virt_to_slab(const void *obj) 620 static inline struct slab *virt_to_slab(const void *obj)
619 { 621 {
620 struct page *page = virt_to_page(obj); 622 struct page *page = virt_to_page(obj);
621 return page_get_slab(page); 623 return page_get_slab(page);
622 } 624 }
623 625
624 static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, 626 static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
625 unsigned int idx) 627 unsigned int idx)
626 { 628 {
627 return slab->s_mem + cache->buffer_size * idx; 629 return slab->s_mem + cache->buffer_size * idx;
628 } 630 }
629 631
630 static inline unsigned int obj_to_index(struct kmem_cache *cache, 632 /*
631 struct slab *slab, void *obj) 633 * We want to avoid an expensive divide : (offset / cache->buffer_size)
634 * Using the fact that buffer_size is a constant for a particular cache,
635 * we can replace (offset / cache->buffer_size) by
636 * reciprocal_divide(offset, cache->reciprocal_buffer_size)
637 */
638 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
639 const struct slab *slab, void *obj)
632 { 640 {
633 return (unsigned)(obj - slab->s_mem) / cache->buffer_size; 641 u32 offset = (obj - slab->s_mem);
642 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
634 } 643 }
635 644
636 /* 645 /*
637 * These are the default caches for kmalloc. Custom caches can have other sizes. 646 * These are the default caches for kmalloc. Custom caches can have other sizes.
638 */ 647 */
639 struct cache_sizes malloc_sizes[] = { 648 struct cache_sizes malloc_sizes[] = {
640 #define CACHE(x) { .cs_size = (x) }, 649 #define CACHE(x) { .cs_size = (x) },
641 #include <linux/kmalloc_sizes.h> 650 #include <linux/kmalloc_sizes.h>
642 CACHE(ULONG_MAX) 651 CACHE(ULONG_MAX)
643 #undef CACHE 652 #undef CACHE
644 }; 653 };
645 EXPORT_SYMBOL(malloc_sizes); 654 EXPORT_SYMBOL(malloc_sizes);
646 655
647 /* Must match cache_sizes above. Out of line to keep cache footprint low. */ 656 /* Must match cache_sizes above. Out of line to keep cache footprint low. */
648 struct cache_names { 657 struct cache_names {
649 char *name; 658 char *name;
650 char *name_dma; 659 char *name_dma;
651 }; 660 };
652 661
653 static struct cache_names __initdata cache_names[] = { 662 static struct cache_names __initdata cache_names[] = {
654 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, 663 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
655 #include <linux/kmalloc_sizes.h> 664 #include <linux/kmalloc_sizes.h>
656 {NULL,} 665 {NULL,}
657 #undef CACHE 666 #undef CACHE
658 }; 667 };
659 668
660 static struct arraycache_init initarray_cache __initdata = 669 static struct arraycache_init initarray_cache __initdata =
661 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 670 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
662 static struct arraycache_init initarray_generic = 671 static struct arraycache_init initarray_generic =
663 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 672 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
664 673
665 /* internal cache of cache description objs */ 674 /* internal cache of cache description objs */
666 static struct kmem_cache cache_cache = { 675 static struct kmem_cache cache_cache = {
667 .batchcount = 1, 676 .batchcount = 1,
668 .limit = BOOT_CPUCACHE_ENTRIES, 677 .limit = BOOT_CPUCACHE_ENTRIES,
669 .shared = 1, 678 .shared = 1,
670 .buffer_size = sizeof(struct kmem_cache), 679 .buffer_size = sizeof(struct kmem_cache),
671 .name = "kmem_cache", 680 .name = "kmem_cache",
672 #if DEBUG 681 #if DEBUG
673 .obj_size = sizeof(struct kmem_cache), 682 .obj_size = sizeof(struct kmem_cache),
674 #endif 683 #endif
675 }; 684 };
676 685
677 #define BAD_ALIEN_MAGIC 0x01020304ul 686 #define BAD_ALIEN_MAGIC 0x01020304ul
678 687
679 #ifdef CONFIG_LOCKDEP 688 #ifdef CONFIG_LOCKDEP
680 689
681 /* 690 /*
682 * Slab sometimes uses the kmalloc slabs to store the slab headers 691 * Slab sometimes uses the kmalloc slabs to store the slab headers
683 * for other slabs "off slab". 692 * for other slabs "off slab".
684 * The locking for this is tricky in that it nests within the locks 693 * The locking for this is tricky in that it nests within the locks
685 * of all other slabs in a few places; to deal with this special 694 * of all other slabs in a few places; to deal with this special
686 * locking we put on-slab caches into a separate lock-class. 695 * locking we put on-slab caches into a separate lock-class.
687 * 696 *
688 * We set lock class for alien array caches which are up during init. 697 * We set lock class for alien array caches which are up during init.
689 * The lock annotation will be lost if all cpus of a node goes down and 698 * The lock annotation will be lost if all cpus of a node goes down and
690 * then comes back up during hotplug 699 * then comes back up during hotplug
691 */ 700 */
692 static struct lock_class_key on_slab_l3_key; 701 static struct lock_class_key on_slab_l3_key;
693 static struct lock_class_key on_slab_alc_key; 702 static struct lock_class_key on_slab_alc_key;
694 703
695 static inline void init_lock_keys(void) 704 static inline void init_lock_keys(void)
696 705
697 { 706 {
698 int q; 707 int q;
699 struct cache_sizes *s = malloc_sizes; 708 struct cache_sizes *s = malloc_sizes;
700 709
701 while (s->cs_size != ULONG_MAX) { 710 while (s->cs_size != ULONG_MAX) {
702 for_each_node(q) { 711 for_each_node(q) {
703 struct array_cache **alc; 712 struct array_cache **alc;
704 int r; 713 int r;
705 struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; 714 struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
706 if (!l3 || OFF_SLAB(s->cs_cachep)) 715 if (!l3 || OFF_SLAB(s->cs_cachep))
707 continue; 716 continue;
708 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 717 lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
709 alc = l3->alien; 718 alc = l3->alien;
710 /* 719 /*
711 * FIXME: This check for BAD_ALIEN_MAGIC 720 * FIXME: This check for BAD_ALIEN_MAGIC
712 * should go away when common slab code is taught to 721 * should go away when common slab code is taught to
713 * work even without alien caches. 722 * work even without alien caches.
714 * Currently, non NUMA code returns BAD_ALIEN_MAGIC 723 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
715 * for alloc_alien_cache, 724 * for alloc_alien_cache,
716 */ 725 */
717 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) 726 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
718 continue; 727 continue;
719 for_each_node(r) { 728 for_each_node(r) {
720 if (alc[r]) 729 if (alc[r])
721 lockdep_set_class(&alc[r]->lock, 730 lockdep_set_class(&alc[r]->lock,
722 &on_slab_alc_key); 731 &on_slab_alc_key);
723 } 732 }
724 } 733 }
725 s++; 734 s++;
726 } 735 }
727 } 736 }
728 #else 737 #else
729 static inline void init_lock_keys(void) 738 static inline void init_lock_keys(void)
730 { 739 {
731 } 740 }
732 #endif 741 #endif
733 742
734 /* 743 /*
735 * 1. Guard access to the cache-chain. 744 * 1. Guard access to the cache-chain.
736 * 2. Protect sanity of cpu_online_map against cpu hotplug events 745 * 2. Protect sanity of cpu_online_map against cpu hotplug events
737 */ 746 */
738 static DEFINE_MUTEX(cache_chain_mutex); 747 static DEFINE_MUTEX(cache_chain_mutex);
739 static struct list_head cache_chain; 748 static struct list_head cache_chain;
740 749
741 /* 750 /*
742 * chicken and egg problem: delay the per-cpu array allocation 751 * chicken and egg problem: delay the per-cpu array allocation
743 * until the general caches are up. 752 * until the general caches are up.
744 */ 753 */
745 static enum { 754 static enum {
746 NONE, 755 NONE,
747 PARTIAL_AC, 756 PARTIAL_AC,
748 PARTIAL_L3, 757 PARTIAL_L3,
749 FULL 758 FULL
750 } g_cpucache_up; 759 } g_cpucache_up;
751 760
752 /* 761 /*
753 * used by boot code to determine if it can use the slab-based allocator 762 * used by boot code to determine if it can use the slab-based allocator
754 */ 763 */
755 int slab_is_available(void) 764 int slab_is_available(void)
756 { 765 {
757 return g_cpucache_up == FULL; 766 return g_cpucache_up == FULL;
758 } 767 }
759 768
760 static DEFINE_PER_CPU(struct delayed_work, reap_work); 769 static DEFINE_PER_CPU(struct delayed_work, reap_work);
761 770
762 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 771 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
763 { 772 {
764 return cachep->array[smp_processor_id()]; 773 return cachep->array[smp_processor_id()];
765 } 774 }
766 775
767 static inline struct kmem_cache *__find_general_cachep(size_t size, 776 static inline struct kmem_cache *__find_general_cachep(size_t size,
768 gfp_t gfpflags) 777 gfp_t gfpflags)
769 { 778 {
770 struct cache_sizes *csizep = malloc_sizes; 779 struct cache_sizes *csizep = malloc_sizes;
771 780
772 #if DEBUG 781 #if DEBUG
773 /* This happens if someone tries to call 782 /* This happens if someone tries to call
774 * kmem_cache_create(), or __kmalloc(), before 783 * kmem_cache_create(), or __kmalloc(), before
775 * the generic caches are initialized. 784 * the generic caches are initialized.
776 */ 785 */
777 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); 786 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
778 #endif 787 #endif
779 while (size > csizep->cs_size) 788 while (size > csizep->cs_size)
780 csizep++; 789 csizep++;
781 790
782 /* 791 /*
783 * Really subtle: The last entry with cs->cs_size==ULONG_MAX 792 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
784 * has cs_{dma,}cachep==NULL. Thus no special case 793 * has cs_{dma,}cachep==NULL. Thus no special case
785 * for large kmalloc calls required. 794 * for large kmalloc calls required.
786 */ 795 */
787 if (unlikely(gfpflags & GFP_DMA)) 796 if (unlikely(gfpflags & GFP_DMA))
788 return csizep->cs_dmacachep; 797 return csizep->cs_dmacachep;
789 return csizep->cs_cachep; 798 return csizep->cs_cachep;
790 } 799 }
791 800
792 static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) 801 static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
793 { 802 {
794 return __find_general_cachep(size, gfpflags); 803 return __find_general_cachep(size, gfpflags);
795 } 804 }
796 805
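
As a usage sketch of the table walk above (hypothetical caller; assumes the 128-byte general cache is the first malloc_sizes[] entry able to hold 100 bytes):

        static void *find_cachep_example(void)
        {
                struct kmem_cache *cachep;

                /* The loop stops at the first cs_size >= 100 and, without
                 * GFP_DMA, returns that entry's cs_cachep. */
                cachep = kmem_find_general_cachep(100, GFP_KERNEL);
                if (!cachep)
                        return NULL;
                return kmem_cache_alloc(cachep, GFP_KERNEL);
        }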
797 static size_t slab_mgmt_size(size_t nr_objs, size_t align) 806 static size_t slab_mgmt_size(size_t nr_objs, size_t align)
798 { 807 {
799 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); 808 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
800 } 809 }
801 810
802 /* 811 /*
803 * Calculate the number of objects and left-over bytes for a given buffer size. 812 * Calculate the number of objects and left-over bytes for a given buffer size.
804 */ 813 */
805 static void cache_estimate(unsigned long gfporder, size_t buffer_size, 814 static void cache_estimate(unsigned long gfporder, size_t buffer_size,
806 size_t align, int flags, size_t *left_over, 815 size_t align, int flags, size_t *left_over,
807 unsigned int *num) 816 unsigned int *num)
808 { 817 {
809 int nr_objs; 818 int nr_objs;
810 size_t mgmt_size; 819 size_t mgmt_size;
811 size_t slab_size = PAGE_SIZE << gfporder; 820 size_t slab_size = PAGE_SIZE << gfporder;
812 821
813 /* 822 /*
814 * The slab management structure can be either off the slab or 823 * The slab management structure can be either off the slab or
815 * on it. For the latter case, the memory allocated for a 824 * on it. For the latter case, the memory allocated for a
816 * slab is used for: 825 * slab is used for:
817 * 826 *
818 * - The struct slab 827 * - The struct slab
819 * - One kmem_bufctl_t for each object 828 * - One kmem_bufctl_t for each object
820 * - Padding to respect alignment of @align 829 * - Padding to respect alignment of @align
821 * - @buffer_size bytes for each object 830 * - @buffer_size bytes for each object
822 * 831 *
823 * If the slab management structure is off the slab, then the 832 * If the slab management structure is off the slab, then the
824 * alignment will already be calculated into the size. Because 833 * alignment will already be calculated into the size. Because
825 * the slabs are all page aligned, the objects will be at the 834 * the slabs are all page aligned, the objects will be at the
826 * correct alignment when allocated. 835 * correct alignment when allocated.
827 */ 836 */
828 if (flags & CFLGS_OFF_SLAB) { 837 if (flags & CFLGS_OFF_SLAB) {
829 mgmt_size = 0; 838 mgmt_size = 0;
830 nr_objs = slab_size / buffer_size; 839 nr_objs = slab_size / buffer_size;
831 840
832 if (nr_objs > SLAB_LIMIT) 841 if (nr_objs > SLAB_LIMIT)
833 nr_objs = SLAB_LIMIT; 842 nr_objs = SLAB_LIMIT;
834 } else { 843 } else {
835 /* 844 /*
836 * Ignore padding for the initial guess. The padding 845 * Ignore padding for the initial guess. The padding
837 * is at most @align-1 bytes, and @buffer_size is at 846 * is at most @align-1 bytes, and @buffer_size is at
838 * least @align. In the worst case, this result will 847 * least @align. In the worst case, this result will
839 * be one greater than the number of objects that fit 848 * be one greater than the number of objects that fit
840 * into the memory allocation when taking the padding 849 * into the memory allocation when taking the padding
841 * into account. 850 * into account.
842 */ 851 */
843 nr_objs = (slab_size - sizeof(struct slab)) / 852 nr_objs = (slab_size - sizeof(struct slab)) /
844 (buffer_size + sizeof(kmem_bufctl_t)); 853 (buffer_size + sizeof(kmem_bufctl_t));
845 854
846 /* 855 /*
847 * This calculated number will be either the right 856 * This calculated number will be either the right
848 * amount, or one greater than what we want. 857 * amount, or one greater than what we want.
849 */ 858 */
850 if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size 859 if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
851 > slab_size) 860 > slab_size)
852 nr_objs--; 861 nr_objs--;
853 862
854 if (nr_objs > SLAB_LIMIT) 863 if (nr_objs > SLAB_LIMIT)
855 nr_objs = SLAB_LIMIT; 864 nr_objs = SLAB_LIMIT;
856 865
857 mgmt_size = slab_mgmt_size(nr_objs, align); 866 mgmt_size = slab_mgmt_size(nr_objs, align);
858 } 867 }
859 *num = nr_objs; 868 *num = nr_objs;
860 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; 869 *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
861 } 870 }
862 871
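
A worked instance of the on-slab branch above, with assumed numbers: PAGE_SIZE 4096, order 0, buffer_size 256, align 32, a hypothetical 32-byte struct slab and 4-byte kmem_bufctl_t.

        /*
         * nr_objs   = (4096 - 32) / (256 + 4)  = 15   (initial guess)
         * mgmt_size = ALIGN(32 + 15 * 4, 32)   = 96   (slab_mgmt_size)
         * check     : 96 + 15 * 256 = 3936 <= 4096, so 15 objects stand
         * left_over = 4096 - 15 * 256 - 96     = 160  (available for colouring)
         */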
863 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) 872 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
864 873
865 static void __slab_error(const char *function, struct kmem_cache *cachep, 874 static void __slab_error(const char *function, struct kmem_cache *cachep,
866 char *msg) 875 char *msg)
867 { 876 {
868 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 877 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
869 function, cachep->name, msg); 878 function, cachep->name, msg);
870 dump_stack(); 879 dump_stack();
871 } 880 }
872 881
873 /* 882 /*
874 * By default on NUMA we use alien caches to stage the freeing of 883 * By default on NUMA we use alien caches to stage the freeing of
875 * objects allocated from other nodes. This causes massive memory 884 * objects allocated from other nodes. This causes massive memory
876 * inefficiencies when using a fake NUMA setup to split memory into a 885 * inefficiencies when using a fake NUMA setup to split memory into a
877 * large number of small nodes, so it can be disabled on the command 886 * large number of small nodes, so it can be disabled on the command
878 * line 887 * line
879 */ 888 */
880 889
881 static int use_alien_caches __read_mostly = 1; 890 static int use_alien_caches __read_mostly = 1;
882 static int __init noaliencache_setup(char *s) 891 static int __init noaliencache_setup(char *s)
883 { 892 {
884 use_alien_caches = 0; 893 use_alien_caches = 0;
885 return 1; 894 return 1;
886 } 895 }
887 __setup("noaliencache", noaliencache_setup); 896 __setup("noaliencache", noaliencache_setup);
888 897
889 #ifdef CONFIG_NUMA 898 #ifdef CONFIG_NUMA
890 /* 899 /*
891 * Special reaping functions for NUMA systems called from cache_reap(). 900 * Special reaping functions for NUMA systems called from cache_reap().
892 * These take care of doing round-robin flushing of alien caches (containing 901 * These take care of doing round-robin flushing of alien caches (containing
893 * objects freed on a node other than the one they were allocated on) and the 902 * objects freed on a node other than the one they were allocated on) and the
894 * flushing of remote pcps by calling drain_node_pages. 903 * flushing of remote pcps by calling drain_node_pages.
895 */ 904 */
896 static DEFINE_PER_CPU(unsigned long, reap_node); 905 static DEFINE_PER_CPU(unsigned long, reap_node);
897 906
898 static void init_reap_node(int cpu) 907 static void init_reap_node(int cpu)
899 { 908 {
900 int node; 909 int node;
901 910
902 node = next_node(cpu_to_node(cpu), node_online_map); 911 node = next_node(cpu_to_node(cpu), node_online_map);
903 if (node == MAX_NUMNODES) 912 if (node == MAX_NUMNODES)
904 node = first_node(node_online_map); 913 node = first_node(node_online_map);
905 914
906 per_cpu(reap_node, cpu) = node; 915 per_cpu(reap_node, cpu) = node;
907 } 916 }
908 917
909 static void next_reap_node(void) 918 static void next_reap_node(void)
910 { 919 {
911 int node = __get_cpu_var(reap_node); 920 int node = __get_cpu_var(reap_node);
912 921
913 /* 922 /*
914 * Also drain per cpu pages on remote zones 923 * Also drain per cpu pages on remote zones
915 */ 924 */
916 if (node != numa_node_id()) 925 if (node != numa_node_id())
917 drain_node_pages(node); 926 drain_node_pages(node);
918 927
919 node = next_node(node, node_online_map); 928 node = next_node(node, node_online_map);
920 if (unlikely(node >= MAX_NUMNODES)) 929 if (unlikely(node >= MAX_NUMNODES))
921 node = first_node(node_online_map); 930 node = first_node(node_online_map);
922 __get_cpu_var(reap_node) = node; 931 __get_cpu_var(reap_node) = node;
923 } 932 }
924 933
925 #else 934 #else
926 #define init_reap_node(cpu) do { } while (0) 935 #define init_reap_node(cpu) do { } while (0)
927 #define next_reap_node(void) do { } while (0) 936 #define next_reap_node(void) do { } while (0)
928 #endif 937 #endif
929 938
930 /* 939 /*
931 * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz 940 * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
932 * via the workqueue/eventd. 941 * via the workqueue/eventd.
933 * Add the CPU number into the expiration time to minimize the possibility of 942 * Add the CPU number into the expiration time to minimize the possibility of
934 * the CPUs getting into lockstep and contending for the global cache chain 943 * the CPUs getting into lockstep and contending for the global cache chain
935 * lock. 944 * lock.
936 */ 945 */
937 static void __devinit start_cpu_timer(int cpu) 946 static void __devinit start_cpu_timer(int cpu)
938 { 947 {
939 struct delayed_work *reap_work = &per_cpu(reap_work, cpu); 948 struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
940 949
941 /* 950 /*
942 * When this gets called from do_initcalls via cpucache_init(), 951 * When this gets called from do_initcalls via cpucache_init(),
943 * init_workqueues() has already run, so keventd will be set up 952 * init_workqueues() has already run, so keventd will be set up
944 * at that time. 953 * at that time.
945 */ 954 */
946 if (keventd_up() && reap_work->work.func == NULL) { 955 if (keventd_up() && reap_work->work.func == NULL) {
947 init_reap_node(cpu); 956 init_reap_node(cpu);
948 INIT_DELAYED_WORK(reap_work, cache_reap); 957 INIT_DELAYED_WORK(reap_work, cache_reap);
949 schedule_delayed_work_on(cpu, reap_work, 958 schedule_delayed_work_on(cpu, reap_work,
950 __round_jiffies_relative(HZ, cpu)); 959 __round_jiffies_relative(HZ, cpu));
951 } 960 }
952 } 961 }
953 962
954 static struct array_cache *alloc_arraycache(int node, int entries, 963 static struct array_cache *alloc_arraycache(int node, int entries,
955 int batchcount) 964 int batchcount)
956 { 965 {
957 int memsize = sizeof(void *) * entries + sizeof(struct array_cache); 966 int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
958 struct array_cache *nc = NULL; 967 struct array_cache *nc = NULL;
959 968
960 nc = kmalloc_node(memsize, GFP_KERNEL, node); 969 nc = kmalloc_node(memsize, GFP_KERNEL, node);
961 if (nc) { 970 if (nc) {
962 nc->avail = 0; 971 nc->avail = 0;
963 nc->limit = entries; 972 nc->limit = entries;
964 nc->batchcount = batchcount; 973 nc->batchcount = batchcount;
965 nc->touched = 0; 974 nc->touched = 0;
966 spin_lock_init(&nc->lock); 975 spin_lock_init(&nc->lock);
967 } 976 }
968 return nc; 977 return nc;
969 } 978 }
970 979
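
The single kmalloc_node() above sizes the allocation as header plus pointer slots; a layout sketch, assuming struct array_cache ends in a flexible entry[] array (which is how ac->entry[ac->avail++] is used elsewhere in this file):

        /*
         * One contiguous allocation of memsize bytes:
         *
         *   [ struct array_cache: avail, limit, batchcount, touched, lock ]
         *   [ entry[0] ... entry[entries - 1]   -- void * object pointers ]
         *
         * so indexing entry[] runs directly past the header.
         */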
971 /* 980 /*
972 * Transfer objects in one arraycache to another. 981 * Transfer objects in one arraycache to another.
973 * Locking must be handled by the caller. 982 * Locking must be handled by the caller.
974 * 983 *
975 * Return the number of entries transferred. 984 * Return the number of entries transferred.
976 */ 985 */
977 static int transfer_objects(struct array_cache *to, 986 static int transfer_objects(struct array_cache *to,
978 struct array_cache *from, unsigned int max) 987 struct array_cache *from, unsigned int max)
979 { 988 {
980 /* Figure out how many entries to transfer */ 989 /* Figure out how many entries to transfer */
981 int nr = min(min(from->avail, max), to->limit - to->avail); 990 int nr = min(min(from->avail, max), to->limit - to->avail);
982 991
983 if (!nr) 992 if (!nr)
984 return 0; 993 return 0;
985 994
986 memcpy(to->entry + to->avail, from->entry + from->avail -nr, 995 memcpy(to->entry + to->avail, from->entry + from->avail -nr,
987 sizeof(void *) *nr); 996 sizeof(void *) *nr);
988 997
989 from->avail -= nr; 998 from->avail -= nr;
990 to->avail += nr; 999 to->avail += nr;
991 to->touched = 1; 1000 to->touched = 1;
992 return nr; 1001 return nr;
993 } 1002 }
994 1003
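
A worked pass through the transfer bound above (numbers are illustrative only):

        /*
         * from->avail = 30, max = 12, to->limit = 120, to->avail = 115
         * nr = min(min(30, 12), 120 - 115) = 5
         * memcpy(to->entry + 115, from->entry + 25, 5 * sizeof(void *));
         * afterwards: from->avail = 25, to->avail = 120, to->touched = 1
         */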
995 #ifndef CONFIG_NUMA 1004 #ifndef CONFIG_NUMA
996 1005
997 #define drain_alien_cache(cachep, alien) do { } while (0) 1006 #define drain_alien_cache(cachep, alien) do { } while (0)
998 #define reap_alien(cachep, l3) do { } while (0) 1007 #define reap_alien(cachep, l3) do { } while (0)
999 1008
1000 static inline struct array_cache **alloc_alien_cache(int node, int limit) 1009 static inline struct array_cache **alloc_alien_cache(int node, int limit)
1001 { 1010 {
1002 return (struct array_cache **)BAD_ALIEN_MAGIC; 1011 return (struct array_cache **)BAD_ALIEN_MAGIC;
1003 } 1012 }
1004 1013
1005 static inline void free_alien_cache(struct array_cache **ac_ptr) 1014 static inline void free_alien_cache(struct array_cache **ac_ptr)
1006 { 1015 {
1007 } 1016 }
1008 1017
1009 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 1018 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1010 { 1019 {
1011 return 0; 1020 return 0;
1012 } 1021 }
1013 1022
1014 static inline void *alternate_node_alloc(struct kmem_cache *cachep, 1023 static inline void *alternate_node_alloc(struct kmem_cache *cachep,
1015 gfp_t flags) 1024 gfp_t flags)
1016 { 1025 {
1017 return NULL; 1026 return NULL;
1018 } 1027 }
1019 1028
1020 static inline void *____cache_alloc_node(struct kmem_cache *cachep, 1029 static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1021 gfp_t flags, int nodeid) 1030 gfp_t flags, int nodeid)
1022 { 1031 {
1023 return NULL; 1032 return NULL;
1024 } 1033 }
1025 1034
1026 #else /* CONFIG_NUMA */ 1035 #else /* CONFIG_NUMA */
1027 1036
1028 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 1037 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1029 static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 1038 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1030 1039
1031 static struct array_cache **alloc_alien_cache(int node, int limit) 1040 static struct array_cache **alloc_alien_cache(int node, int limit)
1032 { 1041 {
1033 struct array_cache **ac_ptr; 1042 struct array_cache **ac_ptr;
1034 int memsize = sizeof(void *) * MAX_NUMNODES; 1043 int memsize = sizeof(void *) * MAX_NUMNODES;
1035 int i; 1044 int i;
1036 1045
1037 if (limit > 1) 1046 if (limit > 1)
1038 limit = 12; 1047 limit = 12;
1039 ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node); 1048 ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
1040 if (ac_ptr) { 1049 if (ac_ptr) {
1041 for_each_node(i) { 1050 for_each_node(i) {
1042 if (i == node || !node_online(i)) { 1051 if (i == node || !node_online(i)) {
1043 ac_ptr[i] = NULL; 1052 ac_ptr[i] = NULL;
1044 continue; 1053 continue;
1045 } 1054 }
1046 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); 1055 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
1047 if (!ac_ptr[i]) { 1056 if (!ac_ptr[i]) {
1048 for (i--; i >= 0; i--) 1057 for (i--; i >= 0; i--)
1049 kfree(ac_ptr[i]); 1058 kfree(ac_ptr[i]);
1050 kfree(ac_ptr); 1059 kfree(ac_ptr);
1051 return NULL; 1060 return NULL;
1052 } 1061 }
1053 } 1062 }
1054 } 1063 }
1055 return ac_ptr; 1064 return ac_ptr;
1056 } 1065 }
1057 1066
1058 static void free_alien_cache(struct array_cache **ac_ptr) 1067 static void free_alien_cache(struct array_cache **ac_ptr)
1059 { 1068 {
1060 int i; 1069 int i;
1061 1070
1062 if (!ac_ptr) 1071 if (!ac_ptr)
1063 return; 1072 return;
1064 for_each_node(i) 1073 for_each_node(i)
1065 kfree(ac_ptr[i]); 1074 kfree(ac_ptr[i]);
1066 kfree(ac_ptr); 1075 kfree(ac_ptr);
1067 } 1076 }
1068 1077
1069 static void __drain_alien_cache(struct kmem_cache *cachep, 1078 static void __drain_alien_cache(struct kmem_cache *cachep,
1070 struct array_cache *ac, int node) 1079 struct array_cache *ac, int node)
1071 { 1080 {
1072 struct kmem_list3 *rl3 = cachep->nodelists[node]; 1081 struct kmem_list3 *rl3 = cachep->nodelists[node];
1073 1082
1074 if (ac->avail) { 1083 if (ac->avail) {
1075 spin_lock(&rl3->list_lock); 1084 spin_lock(&rl3->list_lock);
1076 /* 1085 /*
1077 * Stuff objects into the remote node's shared array first. 1086 * Stuff objects into the remote node's shared array first.
1078 * That way we could avoid the overhead of putting the objects 1087 * That way we could avoid the overhead of putting the objects
1079 * into the free lists and getting them back later. 1088 * into the free lists and getting them back later.
1080 */ 1089 */
1081 if (rl3->shared) 1090 if (rl3->shared)
1082 transfer_objects(rl3->shared, ac, ac->limit); 1091 transfer_objects(rl3->shared, ac, ac->limit);
1083 1092
1084 free_block(cachep, ac->entry, ac->avail, node); 1093 free_block(cachep, ac->entry, ac->avail, node);
1085 ac->avail = 0; 1094 ac->avail = 0;
1086 spin_unlock(&rl3->list_lock); 1095 spin_unlock(&rl3->list_lock);
1087 } 1096 }
1088 } 1097 }
1089 1098
1090 /* 1099 /*
1091 * Called from cache_reap() to regularly drain alien caches round robin. 1100 * Called from cache_reap() to regularly drain alien caches round robin.
1092 */ 1101 */
1093 static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) 1102 static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1094 { 1103 {
1095 int node = __get_cpu_var(reap_node); 1104 int node = __get_cpu_var(reap_node);
1096 1105
1097 if (l3->alien) { 1106 if (l3->alien) {
1098 struct array_cache *ac = l3->alien[node]; 1107 struct array_cache *ac = l3->alien[node];
1099 1108
1100 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { 1109 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1101 __drain_alien_cache(cachep, ac, node); 1110 __drain_alien_cache(cachep, ac, node);
1102 spin_unlock_irq(&ac->lock); 1111 spin_unlock_irq(&ac->lock);
1103 } 1112 }
1104 } 1113 }
1105 } 1114 }
1106 1115
1107 static void drain_alien_cache(struct kmem_cache *cachep, 1116 static void drain_alien_cache(struct kmem_cache *cachep,
1108 struct array_cache **alien) 1117 struct array_cache **alien)
1109 { 1118 {
1110 int i = 0; 1119 int i = 0;
1111 struct array_cache *ac; 1120 struct array_cache *ac;
1112 unsigned long flags; 1121 unsigned long flags;
1113 1122
1114 for_each_online_node(i) { 1123 for_each_online_node(i) {
1115 ac = alien[i]; 1124 ac = alien[i];
1116 if (ac) { 1125 if (ac) {
1117 spin_lock_irqsave(&ac->lock, flags); 1126 spin_lock_irqsave(&ac->lock, flags);
1118 __drain_alien_cache(cachep, ac, i); 1127 __drain_alien_cache(cachep, ac, i);
1119 spin_unlock_irqrestore(&ac->lock, flags); 1128 spin_unlock_irqrestore(&ac->lock, flags);
1120 } 1129 }
1121 } 1130 }
1122 } 1131 }
1123 1132
1124 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 1133 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1125 { 1134 {
1126 struct slab *slabp = virt_to_slab(objp); 1135 struct slab *slabp = virt_to_slab(objp);
1127 int nodeid = slabp->nodeid; 1136 int nodeid = slabp->nodeid;
1128 struct kmem_list3 *l3; 1137 struct kmem_list3 *l3;
1129 struct array_cache *alien = NULL; 1138 struct array_cache *alien = NULL;
1130 int node; 1139 int node;
1131 1140
1132 node = numa_node_id(); 1141 node = numa_node_id();
1133 1142
1134 /* 1143 /*
1135 * Make sure we are not freeing an object from another node to the array 1144 * Make sure we are not freeing an object from another node to the array
1136 * cache on this cpu. 1145 * cache on this cpu.
1137 */ 1146 */
1138 if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches)) 1147 if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches))
1139 return 0; 1148 return 0;
1140 1149
1141 l3 = cachep->nodelists[node]; 1150 l3 = cachep->nodelists[node];
1142 STATS_INC_NODEFREES(cachep); 1151 STATS_INC_NODEFREES(cachep);
1143 if (l3->alien && l3->alien[nodeid]) { 1152 if (l3->alien && l3->alien[nodeid]) {
1144 alien = l3->alien[nodeid]; 1153 alien = l3->alien[nodeid];
1145 spin_lock(&alien->lock); 1154 spin_lock(&alien->lock);
1146 if (unlikely(alien->avail == alien->limit)) { 1155 if (unlikely(alien->avail == alien->limit)) {
1147 STATS_INC_ACOVERFLOW(cachep); 1156 STATS_INC_ACOVERFLOW(cachep);
1148 __drain_alien_cache(cachep, alien, nodeid); 1157 __drain_alien_cache(cachep, alien, nodeid);
1149 } 1158 }
1150 alien->entry[alien->avail++] = objp; 1159 alien->entry[alien->avail++] = objp;
1151 spin_unlock(&alien->lock); 1160 spin_unlock(&alien->lock);
1152 } else { 1161 } else {
1153 spin_lock(&(cachep->nodelists[nodeid])->list_lock); 1162 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1154 free_block(cachep, &objp, 1, nodeid); 1163 free_block(cachep, &objp, 1, nodeid);
1155 spin_unlock(&(cachep->nodelists[nodeid])->list_lock); 1164 spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1156 } 1165 }
1157 return 1; 1166 return 1;
1158 } 1167 }
1159 #endif 1168 #endif
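
Tying the NUMA helpers above together, a hedged trace of the remote-free path (two-node system assumed; node numbers are illustrative):

        /*
         * A cpu on node 0 frees an object whose slab lives on node 1:
         *
         *   cache_free_alien():
         *     slabp->nodeid (1) != numa_node_id() (0), alien caches enabled
         *     lock node 0's l3->alien[1]
         *     if that alien array is already full:
         *       __drain_alien_cache() flushes it to node 1, preferring node 1's
         *       shared array via transfer_objects(), otherwise free_block()
         *     park objp in alien[1] and return 1 (the caller skips the local free)
         */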
1160 1169
1161 static int __cpuinit cpuup_callback(struct notifier_block *nfb, 1170 static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1162 unsigned long action, void *hcpu) 1171 unsigned long action, void *hcpu)
1163 { 1172 {
1164 long cpu = (long)hcpu; 1173 long cpu = (long)hcpu;
1165 struct kmem_cache *cachep; 1174 struct kmem_cache *cachep;
1166 struct kmem_list3 *l3 = NULL; 1175 struct kmem_list3 *l3 = NULL;
1167 int node = cpu_to_node(cpu); 1176 int node = cpu_to_node(cpu);
1168 int memsize = sizeof(struct kmem_list3); 1177 int memsize = sizeof(struct kmem_list3);
1169 1178
1170 switch (action) { 1179 switch (action) {
1171 case CPU_UP_PREPARE: 1180 case CPU_UP_PREPARE:
1172 mutex_lock(&cache_chain_mutex); 1181 mutex_lock(&cache_chain_mutex);
1173 /* 1182 /*
1174 * We need to do this right in the beginning since 1183 * We need to do this right in the beginning since
1175 * the alloc_arraycache() calls are going to use this list. 1184 * the alloc_arraycache() calls are going to use this list.
1176 * kmalloc_node allows us to add the slab to the right 1185 * kmalloc_node allows us to add the slab to the right
1177 * kmem_list3 and not this cpu's kmem_list3 1186 * kmem_list3 and not this cpu's kmem_list3
1178 */ 1187 */
1179 1188
1180 list_for_each_entry(cachep, &cache_chain, next) { 1189 list_for_each_entry(cachep, &cache_chain, next) {
1181 /* 1190 /*
1182 * Set up the size64 kmemlist for cpu before we can 1191 * Set up the size64 kmemlist for cpu before we can
1183 * begin anything. Make sure some other cpu on this 1192 * begin anything. Make sure some other cpu on this
1184 * node has not already allocated this 1193 * node has not already allocated this
1185 */ 1194 */
1186 if (!cachep->nodelists[node]) { 1195 if (!cachep->nodelists[node]) {
1187 l3 = kmalloc_node(memsize, GFP_KERNEL, node); 1196 l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1188 if (!l3) 1197 if (!l3)
1189 goto bad; 1198 goto bad;
1190 kmem_list3_init(l3); 1199 kmem_list3_init(l3);
1191 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 1200 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1192 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 1201 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1193 1202
1194 /* 1203 /*
1195 * The l3s don't come and go as CPUs come and 1204 * The l3s don't come and go as CPUs come and
1196 * go. cache_chain_mutex is sufficient 1205 * go. cache_chain_mutex is sufficient
1197 * protection here. 1206 * protection here.
1198 */ 1207 */
1199 cachep->nodelists[node] = l3; 1208 cachep->nodelists[node] = l3;
1200 } 1209 }
1201 1210
1202 spin_lock_irq(&cachep->nodelists[node]->list_lock); 1211 spin_lock_irq(&cachep->nodelists[node]->list_lock);
1203 cachep->nodelists[node]->free_limit = 1212 cachep->nodelists[node]->free_limit =
1204 (1 + nr_cpus_node(node)) * 1213 (1 + nr_cpus_node(node)) *
1205 cachep->batchcount + cachep->num; 1214 cachep->batchcount + cachep->num;
1206 spin_unlock_irq(&cachep->nodelists[node]->list_lock); 1215 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1207 } 1216 }
1208 1217
1209 /* 1218 /*
1210 * Now we can go ahead with allocating the shared arrays and 1219 * Now we can go ahead with allocating the shared arrays and
1211 * array caches 1220 * array caches
1212 */ 1221 */
1213 list_for_each_entry(cachep, &cache_chain, next) { 1222 list_for_each_entry(cachep, &cache_chain, next) {
1214 struct array_cache *nc; 1223 struct array_cache *nc;
1215 struct array_cache *shared; 1224 struct array_cache *shared;
1216 struct array_cache **alien = NULL; 1225 struct array_cache **alien = NULL;
1217 1226
1218 nc = alloc_arraycache(node, cachep->limit, 1227 nc = alloc_arraycache(node, cachep->limit,
1219 cachep->batchcount); 1228 cachep->batchcount);
1220 if (!nc) 1229 if (!nc)
1221 goto bad; 1230 goto bad;
1222 shared = alloc_arraycache(node, 1231 shared = alloc_arraycache(node,
1223 cachep->shared * cachep->batchcount, 1232 cachep->shared * cachep->batchcount,
1224 0xbaadf00d); 1233 0xbaadf00d);
1225 if (!shared) 1234 if (!shared)
1226 goto bad; 1235 goto bad;
1227 1236
1228 if (use_alien_caches) { 1237 if (use_alien_caches) {
1229 alien = alloc_alien_cache(node, cachep->limit); 1238 alien = alloc_alien_cache(node, cachep->limit);
1230 if (!alien) 1239 if (!alien)
1231 goto bad; 1240 goto bad;
1232 } 1241 }
1233 cachep->array[cpu] = nc; 1242 cachep->array[cpu] = nc;
1234 l3 = cachep->nodelists[node]; 1243 l3 = cachep->nodelists[node];
1235 BUG_ON(!l3); 1244 BUG_ON(!l3);
1236 1245
1237 spin_lock_irq(&l3->list_lock); 1246 spin_lock_irq(&l3->list_lock);
1238 if (!l3->shared) { 1247 if (!l3->shared) {
1239 /* 1248 /*
1240 * We are serialised from CPU_DEAD or 1249 * We are serialised from CPU_DEAD or
1241 * CPU_UP_CANCELLED by the cpucontrol lock 1250 * CPU_UP_CANCELLED by the cpucontrol lock
1242 */ 1251 */
1243 l3->shared = shared; 1252 l3->shared = shared;
1244 shared = NULL; 1253 shared = NULL;
1245 } 1254 }
1246 #ifdef CONFIG_NUMA 1255 #ifdef CONFIG_NUMA
1247 if (!l3->alien) { 1256 if (!l3->alien) {
1248 l3->alien = alien; 1257 l3->alien = alien;
1249 alien = NULL; 1258 alien = NULL;
1250 } 1259 }
1251 #endif 1260 #endif
1252 spin_unlock_irq(&l3->list_lock); 1261 spin_unlock_irq(&l3->list_lock);
1253 kfree(shared); 1262 kfree(shared);
1254 free_alien_cache(alien); 1263 free_alien_cache(alien);
1255 } 1264 }
1256 break; 1265 break;
1257 case CPU_ONLINE: 1266 case CPU_ONLINE:
1258 mutex_unlock(&cache_chain_mutex); 1267 mutex_unlock(&cache_chain_mutex);
1259 start_cpu_timer(cpu); 1268 start_cpu_timer(cpu);
1260 break; 1269 break;
1261 #ifdef CONFIG_HOTPLUG_CPU 1270 #ifdef CONFIG_HOTPLUG_CPU
1262 case CPU_DOWN_PREPARE: 1271 case CPU_DOWN_PREPARE:
1263 mutex_lock(&cache_chain_mutex); 1272 mutex_lock(&cache_chain_mutex);
1264 break; 1273 break;
1265 case CPU_DOWN_FAILED: 1274 case CPU_DOWN_FAILED:
1266 mutex_unlock(&cache_chain_mutex); 1275 mutex_unlock(&cache_chain_mutex);
1267 break; 1276 break;
1268 case CPU_DEAD: 1277 case CPU_DEAD:
1269 /* 1278 /*
1270 * Even if all the cpus of a node are down, we don't free the 1279 * Even if all the cpus of a node are down, we don't free the
1271 * kmem_list3 of any cache. This is to avoid a race between 1280 * kmem_list3 of any cache. This is to avoid a race between
1272 * cpu_down and a kmalloc allocation from another cpu for 1281 * cpu_down and a kmalloc allocation from another cpu for
1273 * memory from the node of the cpu going down. The list3 1282 * memory from the node of the cpu going down. The list3
1274 * structure is usually allocated from kmem_cache_create() and 1283 * structure is usually allocated from kmem_cache_create() and
1275 * gets destroyed at kmem_cache_destroy(). 1284 * gets destroyed at kmem_cache_destroy().
1276 */ 1285 */
1277 /* fall thru */ 1286 /* fall thru */
1278 #endif 1287 #endif
1279 case CPU_UP_CANCELED: 1288 case CPU_UP_CANCELED:
1280 list_for_each_entry(cachep, &cache_chain, next) { 1289 list_for_each_entry(cachep, &cache_chain, next) {
1281 struct array_cache *nc; 1290 struct array_cache *nc;
1282 struct array_cache *shared; 1291 struct array_cache *shared;
1283 struct array_cache **alien; 1292 struct array_cache **alien;
1284 cpumask_t mask; 1293 cpumask_t mask;
1285 1294
1286 mask = node_to_cpumask(node); 1295 mask = node_to_cpumask(node);
1287 /* cpu is dead; no one can alloc from it. */ 1296 /* cpu is dead; no one can alloc from it. */
1288 nc = cachep->array[cpu]; 1297 nc = cachep->array[cpu];
1289 cachep->array[cpu] = NULL; 1298 cachep->array[cpu] = NULL;
1290 l3 = cachep->nodelists[node]; 1299 l3 = cachep->nodelists[node];
1291 1300
1292 if (!l3) 1301 if (!l3)
1293 goto free_array_cache; 1302 goto free_array_cache;
1294 1303
1295 spin_lock_irq(&l3->list_lock); 1304 spin_lock_irq(&l3->list_lock);
1296 1305
1297 /* Free limit for this kmem_list3 */ 1306 /* Free limit for this kmem_list3 */
1298 l3->free_limit -= cachep->batchcount; 1307 l3->free_limit -= cachep->batchcount;
1299 if (nc) 1308 if (nc)
1300 free_block(cachep, nc->entry, nc->avail, node); 1309 free_block(cachep, nc->entry, nc->avail, node);
1301 1310
1302 if (!cpus_empty(mask)) { 1311 if (!cpus_empty(mask)) {
1303 spin_unlock_irq(&l3->list_lock); 1312 spin_unlock_irq(&l3->list_lock);
1304 goto free_array_cache; 1313 goto free_array_cache;
1305 } 1314 }
1306 1315
1307 shared = l3->shared; 1316 shared = l3->shared;
1308 if (shared) { 1317 if (shared) {
1309 free_block(cachep, l3->shared->entry, 1318 free_block(cachep, l3->shared->entry,
1310 l3->shared->avail, node); 1319 l3->shared->avail, node);
1311 l3->shared = NULL; 1320 l3->shared = NULL;
1312 } 1321 }
1313 1322
1314 alien = l3->alien; 1323 alien = l3->alien;
1315 l3->alien = NULL; 1324 l3->alien = NULL;
1316 1325
1317 spin_unlock_irq(&l3->list_lock); 1326 spin_unlock_irq(&l3->list_lock);
1318 1327
1319 kfree(shared); 1328 kfree(shared);
1320 if (alien) { 1329 if (alien) {
1321 drain_alien_cache(cachep, alien); 1330 drain_alien_cache(cachep, alien);
1322 free_alien_cache(alien); 1331 free_alien_cache(alien);
1323 } 1332 }
1324 free_array_cache: 1333 free_array_cache:
1325 kfree(nc); 1334 kfree(nc);
1326 } 1335 }
1327 /* 1336 /*
1328 * In the previous loop, all the objects were freed to 1337 * In the previous loop, all the objects were freed to
1329 * the respective cache's slabs; now we can go ahead and 1338 * the respective cache's slabs; now we can go ahead and
1330 * shrink each nodelist to its limit. 1339 * shrink each nodelist to its limit.
1331 */ 1340 */
1332 list_for_each_entry(cachep, &cache_chain, next) { 1341 list_for_each_entry(cachep, &cache_chain, next) {
1333 l3 = cachep->nodelists[node]; 1342 l3 = cachep->nodelists[node];
1334 if (!l3) 1343 if (!l3)
1335 continue; 1344 continue;
1336 drain_freelist(cachep, l3, l3->free_objects); 1345 drain_freelist(cachep, l3, l3->free_objects);
1337 } 1346 }
1338 mutex_unlock(&cache_chain_mutex); 1347 mutex_unlock(&cache_chain_mutex);
1339 break; 1348 break;
1340 } 1349 }
1341 return NOTIFY_OK; 1350 return NOTIFY_OK;
1342 bad: 1351 bad:
1343 return NOTIFY_BAD; 1352 return NOTIFY_BAD;
1344 } 1353 }
1345 1354
1346 static struct notifier_block __cpuinitdata cpucache_notifier = { 1355 static struct notifier_block __cpuinitdata cpucache_notifier = {
1347 &cpuup_callback, NULL, 0 1356 &cpuup_callback, NULL, 0
1348 }; 1357 };
1349 1358
1350 /* 1359 /*
1351 * swap the static kmem_list3 with kmalloced memory 1360 * swap the static kmem_list3 with kmalloced memory
1352 */ 1361 */
1353 static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, 1362 static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1354 int nodeid) 1363 int nodeid)
1355 { 1364 {
1356 struct kmem_list3 *ptr; 1365 struct kmem_list3 *ptr;
1357 1366
1358 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); 1367 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
1359 BUG_ON(!ptr); 1368 BUG_ON(!ptr);
1360 1369
1361 local_irq_disable(); 1370 local_irq_disable();
1362 memcpy(ptr, list, sizeof(struct kmem_list3)); 1371 memcpy(ptr, list, sizeof(struct kmem_list3));
1363 /* 1372 /*
1364 * Do not assume that spinlocks can be initialized via memcpy: 1373 * Do not assume that spinlocks can be initialized via memcpy:
1365 */ 1374 */
1366 spin_lock_init(&ptr->list_lock); 1375 spin_lock_init(&ptr->list_lock);
1367 1376
1368 MAKE_ALL_LISTS(cachep, ptr, nodeid); 1377 MAKE_ALL_LISTS(cachep, ptr, nodeid);
1369 cachep->nodelists[nodeid] = ptr; 1378 cachep->nodelists[nodeid] = ptr;
1370 local_irq_enable(); 1379 local_irq_enable();
1371 } 1380 }
1372 1381
1373 /* 1382 /*
1374 * Initialisation. Called after the page allocator has been initialised and 1383 * Initialisation. Called after the page allocator has been initialised and
1375 * before smp_init(). 1384 * before smp_init().
1376 */ 1385 */
1377 void __init kmem_cache_init(void) 1386 void __init kmem_cache_init(void)
1378 { 1387 {
1379 size_t left_over; 1388 size_t left_over;
1380 struct cache_sizes *sizes; 1389 struct cache_sizes *sizes;
1381 struct cache_names *names; 1390 struct cache_names *names;
1382 int i; 1391 int i;
1383 int order; 1392 int order;
1384 int node; 1393 int node;
1385 1394
1386 for (i = 0; i < NUM_INIT_LISTS; i++) { 1395 for (i = 0; i < NUM_INIT_LISTS; i++) {
1387 kmem_list3_init(&initkmem_list3[i]); 1396 kmem_list3_init(&initkmem_list3[i]);
1388 if (i < MAX_NUMNODES) 1397 if (i < MAX_NUMNODES)
1389 cache_cache.nodelists[i] = NULL; 1398 cache_cache.nodelists[i] = NULL;
1390 } 1399 }
1391 1400
1392 /* 1401 /*
1393 * Fragmentation resistance on low memory - only use bigger 1402 * Fragmentation resistance on low memory - only use bigger
1394 * page orders on machines with more than 32MB of memory. 1403 * page orders on machines with more than 32MB of memory.
1395 */ 1404 */
1396 if (num_physpages > (32 << 20) >> PAGE_SHIFT) 1405 if (num_physpages > (32 << 20) >> PAGE_SHIFT)
1397 slab_break_gfp_order = BREAK_GFP_ORDER_HI; 1406 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1398 1407
1399 /* Bootstrap is tricky, because several objects are allocated 1408 /* Bootstrap is tricky, because several objects are allocated
1400 * from caches that do not exist yet: 1409 * from caches that do not exist yet:
1401 * 1) initialize the cache_cache cache: it contains the struct 1410 * 1) initialize the cache_cache cache: it contains the struct
1402 * kmem_cache structures of all caches, except cache_cache itself: 1411 * kmem_cache structures of all caches, except cache_cache itself:
1403 * cache_cache is statically allocated. 1412 * cache_cache is statically allocated.
1404 * Initially an __init data area is used for the head array and the 1413 * Initially an __init data area is used for the head array and the
1405 * kmem_list3 structures, it's replaced with a kmalloc allocated 1414 * kmem_list3 structures, it's replaced with a kmalloc allocated
1406 * array at the end of the bootstrap. 1415 * array at the end of the bootstrap.
1407 * 2) Create the first kmalloc cache. 1416 * 2) Create the first kmalloc cache.
1408 * The struct kmem_cache for the new cache is allocated normally. 1417 * The struct kmem_cache for the new cache is allocated normally.
1409 * An __init data area is used for the head array. 1418 * An __init data area is used for the head array.
1410 * 3) Create the remaining kmalloc caches, with minimally sized 1419 * 3) Create the remaining kmalloc caches, with minimally sized
1411 * head arrays. 1420 * head arrays.
1412 * 4) Replace the __init data head arrays for cache_cache and the first 1421 * 4) Replace the __init data head arrays for cache_cache and the first
1413 * kmalloc cache with kmalloc allocated arrays. 1422 * kmalloc cache with kmalloc allocated arrays.
1414 * 5) Replace the __init data for kmem_list3 for cache_cache and 1423 * 5) Replace the __init data for kmem_list3 for cache_cache and
1415 * the other caches with kmalloc allocated memory. 1424 * the other caches with kmalloc allocated memory.
1416 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1425 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1417 */ 1426 */
1418 1427
1419 node = numa_node_id(); 1428 node = numa_node_id();
1420 1429
1421 /* 1) create the cache_cache */ 1430 /* 1) create the cache_cache */
1422 INIT_LIST_HEAD(&cache_chain); 1431 INIT_LIST_HEAD(&cache_chain);
1423 list_add(&cache_cache.next, &cache_chain); 1432 list_add(&cache_cache.next, &cache_chain);
1424 cache_cache.colour_off = cache_line_size(); 1433 cache_cache.colour_off = cache_line_size();
1425 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1434 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1426 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE]; 1435 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE];
1427 1436
1428 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, 1437 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1429 cache_line_size()); 1438 cache_line_size());
1439 cache_cache.reciprocal_buffer_size =
1440 reciprocal_value(cache_cache.buffer_size);
1430 1441
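The two lines added above cache a precomputed reciprocal of buffer_size in the cache descriptor. As a sketch of how such a value turns a pointer-to-index conversion into a multiply (close in spirit to the obj_to_index() change made elsewhere in this patch; an illustration, not the exact hunk):

        static inline unsigned int obj_to_index_sketch(const struct kmem_cache *cache,
                                                       const struct slab *slabp, void *obj)
        {
                u32 offset = obj - slabp->s_mem;    /* byte offset within the slab */

                /* offset / cache->buffer_size, computed as one multiply and a shift */
                return reciprocal_divide(offset, cache->reciprocal_buffer_size);
        }
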
1431 for (order = 0; order < MAX_ORDER; order++) { 1442 for (order = 0; order < MAX_ORDER; order++) {
1432 cache_estimate(order, cache_cache.buffer_size, 1443 cache_estimate(order, cache_cache.buffer_size,
1433 cache_line_size(), 0, &left_over, &cache_cache.num); 1444 cache_line_size(), 0, &left_over, &cache_cache.num);
1434 if (cache_cache.num) 1445 if (cache_cache.num)
1435 break; 1446 break;
1436 } 1447 }
1437 BUG_ON(!cache_cache.num); 1448 BUG_ON(!cache_cache.num);
1438 cache_cache.gfporder = order; 1449 cache_cache.gfporder = order;
1439 cache_cache.colour = left_over / cache_cache.colour_off; 1450 cache_cache.colour = left_over / cache_cache.colour_off;
1440 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + 1451 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1441 sizeof(struct slab), cache_line_size()); 1452 sizeof(struct slab), cache_line_size());
1442 1453
1443 /* 2+3) create the kmalloc caches */ 1454 /* 2+3) create the kmalloc caches */
1444 sizes = malloc_sizes; 1455 sizes = malloc_sizes;
1445 names = cache_names; 1456 names = cache_names;
1446 1457
1447 /* 1458 /*
1448 * Initialize the caches that provide memory for the array cache and the 1459 * Initialize the caches that provide memory for the array cache and the
1449 * kmem_list3 structures first. Without this, further allocations will 1460 * kmem_list3 structures first. Without this, further allocations will
1450 * BUG(). 1461 * BUG().
1451 */ 1462 */
1452 1463
1453 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1464 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1454 sizes[INDEX_AC].cs_size, 1465 sizes[INDEX_AC].cs_size,
1455 ARCH_KMALLOC_MINALIGN, 1466 ARCH_KMALLOC_MINALIGN,
1456 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1467 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1457 NULL, NULL); 1468 NULL, NULL);
1458 1469
1459 if (INDEX_AC != INDEX_L3) { 1470 if (INDEX_AC != INDEX_L3) {
1460 sizes[INDEX_L3].cs_cachep = 1471 sizes[INDEX_L3].cs_cachep =
1461 kmem_cache_create(names[INDEX_L3].name, 1472 kmem_cache_create(names[INDEX_L3].name,
1462 sizes[INDEX_L3].cs_size, 1473 sizes[INDEX_L3].cs_size,
1463 ARCH_KMALLOC_MINALIGN, 1474 ARCH_KMALLOC_MINALIGN,
1464 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1475 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1465 NULL, NULL); 1476 NULL, NULL);
1466 } 1477 }
1467 1478
1468 slab_early_init = 0; 1479 slab_early_init = 0;
1469 1480
1470 while (sizes->cs_size != ULONG_MAX) { 1481 while (sizes->cs_size != ULONG_MAX) {
1471 /* 1482 /*
1472 * For performance, all the general caches are L1 aligned. 1483 * For performance, all the general caches are L1 aligned.
1473 * This should be particularly beneficial on SMP boxes, as it 1484 * This should be particularly beneficial on SMP boxes, as it
1474 * eliminates "false sharing". 1485 * eliminates "false sharing".
1475 * Note for systems short on memory removing the alignment will 1486 * Note for systems short on memory removing the alignment will
1476 * allow tighter packing of the smaller caches. 1487 * allow tighter packing of the smaller caches.
1477 */ 1488 */
1478 if (!sizes->cs_cachep) { 1489 if (!sizes->cs_cachep) {
1479 sizes->cs_cachep = kmem_cache_create(names->name, 1490 sizes->cs_cachep = kmem_cache_create(names->name,
1480 sizes->cs_size, 1491 sizes->cs_size,
1481 ARCH_KMALLOC_MINALIGN, 1492 ARCH_KMALLOC_MINALIGN,
1482 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1493 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1483 NULL, NULL); 1494 NULL, NULL);
1484 } 1495 }
1485 1496
1486 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1497 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1487 sizes->cs_size, 1498 sizes->cs_size,
1488 ARCH_KMALLOC_MINALIGN, 1499 ARCH_KMALLOC_MINALIGN,
1489 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| 1500 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1490 SLAB_PANIC, 1501 SLAB_PANIC,
1491 NULL, NULL); 1502 NULL, NULL);
1492 sizes++; 1503 sizes++;
1493 names++; 1504 names++;
1494 } 1505 }
1495 /* 4) Replace the bootstrap head arrays */ 1506 /* 4) Replace the bootstrap head arrays */
1496 { 1507 {
1497 struct array_cache *ptr; 1508 struct array_cache *ptr;
1498 1509
1499 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1510 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1500 1511
1501 local_irq_disable(); 1512 local_irq_disable();
1502 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); 1513 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1503 memcpy(ptr, cpu_cache_get(&cache_cache), 1514 memcpy(ptr, cpu_cache_get(&cache_cache),
1504 sizeof(struct arraycache_init)); 1515 sizeof(struct arraycache_init));
1505 /* 1516 /*
1506 * Do not assume that spinlocks can be initialized via memcpy: 1517 * Do not assume that spinlocks can be initialized via memcpy:
1507 */ 1518 */
1508 spin_lock_init(&ptr->lock); 1519 spin_lock_init(&ptr->lock);
1509 1520
1510 cache_cache.array[smp_processor_id()] = ptr; 1521 cache_cache.array[smp_processor_id()] = ptr;
1511 local_irq_enable(); 1522 local_irq_enable();
1512 1523
1513 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1524 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1514 1525
1515 local_irq_disable(); 1526 local_irq_disable();
1516 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) 1527 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1517 != &initarray_generic.cache); 1528 != &initarray_generic.cache);
1518 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), 1529 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1519 sizeof(struct arraycache_init)); 1530 sizeof(struct arraycache_init));
1520 /* 1531 /*
1521 * Do not assume that spinlocks can be initialized via memcpy: 1532 * Do not assume that spinlocks can be initialized via memcpy:
1522 */ 1533 */
1523 spin_lock_init(&ptr->lock); 1534 spin_lock_init(&ptr->lock);
1524 1535
1525 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1536 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1526 ptr; 1537 ptr;
1527 local_irq_enable(); 1538 local_irq_enable();
1528 } 1539 }
1529 /* 5) Replace the bootstrap kmem_list3's */ 1540 /* 5) Replace the bootstrap kmem_list3's */
1530 { 1541 {
1531 int nid; 1542 int nid;
1532 1543
1533 /* Replace the static kmem_list3 structures for the boot cpu */ 1544 /* Replace the static kmem_list3 structures for the boot cpu */
1534 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); 1545 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
1535 1546
1536 for_each_online_node(nid) { 1547 for_each_online_node(nid) {
1537 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1548 init_list(malloc_sizes[INDEX_AC].cs_cachep,
1538 &initkmem_list3[SIZE_AC + nid], nid); 1549 &initkmem_list3[SIZE_AC + nid], nid);
1539 1550
1540 if (INDEX_AC != INDEX_L3) { 1551 if (INDEX_AC != INDEX_L3) {
1541 init_list(malloc_sizes[INDEX_L3].cs_cachep, 1552 init_list(malloc_sizes[INDEX_L3].cs_cachep,
1542 &initkmem_list3[SIZE_L3 + nid], nid); 1553 &initkmem_list3[SIZE_L3 + nid], nid);
1543 } 1554 }
1544 } 1555 }
1545 } 1556 }
1546 1557
1547 /* 6) resize the head arrays to their final sizes */ 1558 /* 6) resize the head arrays to their final sizes */
1548 { 1559 {
1549 struct kmem_cache *cachep; 1560 struct kmem_cache *cachep;
1550 mutex_lock(&cache_chain_mutex); 1561 mutex_lock(&cache_chain_mutex);
1551 list_for_each_entry(cachep, &cache_chain, next) 1562 list_for_each_entry(cachep, &cache_chain, next)
1552 if (enable_cpucache(cachep)) 1563 if (enable_cpucache(cachep))
1553 BUG(); 1564 BUG();
1554 mutex_unlock(&cache_chain_mutex); 1565 mutex_unlock(&cache_chain_mutex);
1555 } 1566 }
1556 1567
1557 /* Annotate slab for lockdep -- annotate the malloc caches */ 1568 /* Annotate slab for lockdep -- annotate the malloc caches */
1558 init_lock_keys(); 1569 init_lock_keys();
1559 1570
1560 1571
1561 /* Done! */ 1572 /* Done! */
1562 g_cpucache_up = FULL; 1573 g_cpucache_up = FULL;
1563 1574
1564 /* 1575 /*
1565 * Register a cpu startup notifier callback that initializes 1576 * Register a cpu startup notifier callback that initializes
1566 * cpu_cache_get for all new cpus 1577 * cpu_cache_get for all new cpus
1567 */ 1578 */
1568 register_cpu_notifier(&cpucache_notifier); 1579 register_cpu_notifier(&cpucache_notifier);
1569 1580
1570 /* 1581 /*
1571 * The reap timers are started later, with a module init call: That part 1582 * The reap timers are started later, with a module init call: That part
1572 * of the kernel is not yet operational. 1583 * of the kernel is not yet operational.
1573 */ 1584 */
1574 } 1585 }
1575 1586
1576 static int __init cpucache_init(void) 1587 static int __init cpucache_init(void)
1577 { 1588 {
1578 int cpu; 1589 int cpu;
1579 1590
1580 /* 1591 /*
1581 * Register the timers that return unneeded pages to the page allocator 1592 * Register the timers that return unneeded pages to the page allocator
1582 */ 1593 */
1583 for_each_online_cpu(cpu) 1594 for_each_online_cpu(cpu)
1584 start_cpu_timer(cpu); 1595 start_cpu_timer(cpu);
1585 return 0; 1596 return 0;
1586 } 1597 }
1587 __initcall(cpucache_init); 1598 __initcall(cpucache_init);
1588 1599
1589 /* 1600 /*
1590 * Interface to system's page allocator. No need to hold the cache-lock. 1601 * Interface to system's page allocator. No need to hold the cache-lock.
1591 * 1602 *
1592 * If we requested dmaable memory, we will get it. Even if we 1603 * If we requested dmaable memory, we will get it. Even if we
1593 * did not request dmaable memory, we might get it, but that 1604 * did not request dmaable memory, we might get it, but that
1594 * would be relatively rare and ignorable. 1605 * would be relatively rare and ignorable.
1595 */ 1606 */
1596 static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) 1607 static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1597 { 1608 {
1598 struct page *page; 1609 struct page *page;
1599 int nr_pages; 1610 int nr_pages;
1600 int i; 1611 int i;
1601 1612
1602 #ifndef CONFIG_MMU 1613 #ifndef CONFIG_MMU
1603 /* 1614 /*
1604 * Nommu uses slabs for process anonymous memory allocations, and thus 1615 * Nommu uses slabs for process anonymous memory allocations, and thus
1605 * requires __GFP_COMP to properly refcount higher order allocations 1616 * requires __GFP_COMP to properly refcount higher order allocations
1606 */ 1617 */
1607 flags |= __GFP_COMP; 1618 flags |= __GFP_COMP;
1608 #endif 1619 #endif
1609 1620
1610 flags |= cachep->gfpflags; 1621 flags |= cachep->gfpflags;
1611 1622
1612 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1623 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1613 if (!page) 1624 if (!page)
1614 return NULL; 1625 return NULL;
1615 1626
1616 nr_pages = (1 << cachep->gfporder); 1627 nr_pages = (1 << cachep->gfporder);
1617 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1628 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1618 add_zone_page_state(page_zone(page), 1629 add_zone_page_state(page_zone(page),
1619 NR_SLAB_RECLAIMABLE, nr_pages); 1630 NR_SLAB_RECLAIMABLE, nr_pages);
1620 else 1631 else
1621 add_zone_page_state(page_zone(page), 1632 add_zone_page_state(page_zone(page),
1622 NR_SLAB_UNRECLAIMABLE, nr_pages); 1633 NR_SLAB_UNRECLAIMABLE, nr_pages);
1623 for (i = 0; i < nr_pages; i++) 1634 for (i = 0; i < nr_pages; i++)
1624 __SetPageSlab(page + i); 1635 __SetPageSlab(page + i);
1625 return page_address(page); 1636 return page_address(page);
1626 } 1637 }
1627 1638
1628 /* 1639 /*
1629 * Interface to system's page release. 1640 * Interface to system's page release.
1630 */ 1641 */
1631 static void kmem_freepages(struct kmem_cache *cachep, void *addr) 1642 static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1632 { 1643 {
1633 unsigned long i = (1 << cachep->gfporder); 1644 unsigned long i = (1 << cachep->gfporder);
1634 struct page *page = virt_to_page(addr); 1645 struct page *page = virt_to_page(addr);
1635 const unsigned long nr_freed = i; 1646 const unsigned long nr_freed = i;
1636 1647
1637 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1648 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1638 sub_zone_page_state(page_zone(page), 1649 sub_zone_page_state(page_zone(page),
1639 NR_SLAB_RECLAIMABLE, nr_freed); 1650 NR_SLAB_RECLAIMABLE, nr_freed);
1640 else 1651 else
1641 sub_zone_page_state(page_zone(page), 1652 sub_zone_page_state(page_zone(page),
1642 NR_SLAB_UNRECLAIMABLE, nr_freed); 1653 NR_SLAB_UNRECLAIMABLE, nr_freed);
1643 while (i--) { 1654 while (i--) {
1644 BUG_ON(!PageSlab(page)); 1655 BUG_ON(!PageSlab(page));
1645 __ClearPageSlab(page); 1656 __ClearPageSlab(page);
1646 page++; 1657 page++;
1647 } 1658 }
1648 if (current->reclaim_state) 1659 if (current->reclaim_state)
1649 current->reclaim_state->reclaimed_slab += nr_freed; 1660 current->reclaim_state->reclaimed_slab += nr_freed;
1650 free_pages((unsigned long)addr, cachep->gfporder); 1661 free_pages((unsigned long)addr, cachep->gfporder);
1651 } 1662 }
1652 1663
1653 static void kmem_rcu_free(struct rcu_head *head) 1664 static void kmem_rcu_free(struct rcu_head *head)
1654 { 1665 {
1655 struct slab_rcu *slab_rcu = (struct slab_rcu *)head; 1666 struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1656 struct kmem_cache *cachep = slab_rcu->cachep; 1667 struct kmem_cache *cachep = slab_rcu->cachep;
1657 1668
1658 kmem_freepages(cachep, slab_rcu->addr); 1669 kmem_freepages(cachep, slab_rcu->addr);
1659 if (OFF_SLAB(cachep)) 1670 if (OFF_SLAB(cachep))
1660 kmem_cache_free(cachep->slabp_cache, slab_rcu); 1671 kmem_cache_free(cachep->slabp_cache, slab_rcu);
1661 } 1672 }
1662 1673
1663 #if DEBUG 1674 #if DEBUG
1664 1675
1665 #ifdef CONFIG_DEBUG_PAGEALLOC 1676 #ifdef CONFIG_DEBUG_PAGEALLOC
1666 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1677 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1667 unsigned long caller) 1678 unsigned long caller)
1668 { 1679 {
1669 int size = obj_size(cachep); 1680 int size = obj_size(cachep);
1670 1681
1671 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; 1682 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1672 1683
1673 if (size < 5 * sizeof(unsigned long)) 1684 if (size < 5 * sizeof(unsigned long))
1674 return; 1685 return;
1675 1686
1676 *addr++ = 0x12345678; 1687 *addr++ = 0x12345678;
1677 *addr++ = caller; 1688 *addr++ = caller;
1678 *addr++ = smp_processor_id(); 1689 *addr++ = smp_processor_id();
1679 size -= 3 * sizeof(unsigned long); 1690 size -= 3 * sizeof(unsigned long);
1680 { 1691 {
1681 unsigned long *sptr = &caller; 1692 unsigned long *sptr = &caller;
1682 unsigned long svalue; 1693 unsigned long svalue;
1683 1694
1684 while (!kstack_end(sptr)) { 1695 while (!kstack_end(sptr)) {
1685 svalue = *sptr++; 1696 svalue = *sptr++;
1686 if (kernel_text_address(svalue)) { 1697 if (kernel_text_address(svalue)) {
1687 *addr++ = svalue; 1698 *addr++ = svalue;
1688 size -= sizeof(unsigned long); 1699 size -= sizeof(unsigned long);
1689 if (size <= sizeof(unsigned long)) 1700 if (size <= sizeof(unsigned long))
1690 break; 1701 break;
1691 } 1702 }
1692 } 1703 }
1693 1704
1694 } 1705 }
1695 *addr++ = 0x87654321; 1706 *addr++ = 0x87654321;
1696 } 1707 }
1697 #endif 1708 #endif
1698 1709
1699 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 1710 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1700 { 1711 {
1701 int size = obj_size(cachep); 1712 int size = obj_size(cachep);
1702 addr = &((char *)addr)[obj_offset(cachep)]; 1713 addr = &((char *)addr)[obj_offset(cachep)];
1703 1714
1704 memset(addr, val, size); 1715 memset(addr, val, size);
1705 *(unsigned char *)(addr + size - 1) = POISON_END; 1716 *(unsigned char *)(addr + size - 1) = POISON_END;
1706 } 1717 }
1707 1718
1708 static void dump_line(char *data, int offset, int limit) 1719 static void dump_line(char *data, int offset, int limit)
1709 { 1720 {
1710 int i; 1721 int i;
1711 unsigned char error = 0; 1722 unsigned char error = 0;
1712 int bad_count = 0; 1723 int bad_count = 0;
1713 1724
1714 printk(KERN_ERR "%03x:", offset); 1725 printk(KERN_ERR "%03x:", offset);
1715 for (i = 0; i < limit; i++) { 1726 for (i = 0; i < limit; i++) {
1716 if (data[offset + i] != POISON_FREE) { 1727 if (data[offset + i] != POISON_FREE) {
1717 error = data[offset + i]; 1728 error = data[offset + i];
1718 bad_count++; 1729 bad_count++;
1719 } 1730 }
1720 printk(" %02x", (unsigned char)data[offset + i]); 1731 printk(" %02x", (unsigned char)data[offset + i]);
1721 } 1732 }
1722 printk("\n"); 1733 printk("\n");
1723 1734
1724 if (bad_count == 1) { 1735 if (bad_count == 1) {
1725 error ^= POISON_FREE; 1736 error ^= POISON_FREE;
1726 if (!(error & (error - 1))) { 1737 if (!(error & (error - 1))) {
1727 printk(KERN_ERR "Single bit error detected. Probably " 1738 printk(KERN_ERR "Single bit error detected. Probably "
1728 "bad RAM.\n"); 1739 "bad RAM.\n");
1729 #ifdef CONFIG_X86 1740 #ifdef CONFIG_X86
1730 printk(KERN_ERR "Run memtest86+ or a similar memory " 1741 printk(KERN_ERR "Run memtest86+ or a similar memory "
1731 "test tool.\n"); 1742 "test tool.\n");
1732 #else 1743 #else
1733 printk(KERN_ERR "Run a memory test tool.\n"); 1744 printk(KERN_ERR "Run a memory test tool.\n");
1734 #endif 1745 #endif
1735 } 1746 }
1736 } 1747 }
1737 } 1748 }
1738 #endif 1749 #endif
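
The single-bit heuristic in dump_line() above works because xor-ing the bad byte with the poison value leaves a power of two when exactly one bit flipped. A worked case (POISON_FREE is 0x6b; the corrupted value is made up):

        /*
         * byte read          = 0x6f
         * error ^ 0x6b       = 0x04
         * 0x04 & (0x04 - 1)  = 0x00  -> exactly one bit differs, so the
         *                               "Single bit error ... bad RAM" hint fires
         */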
1739 1750
1740 #if DEBUG 1751 #if DEBUG
1741 1752
1742 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) 1753 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1743 { 1754 {
1744 int i, size; 1755 int i, size;
1745 char *realobj; 1756 char *realobj;
1746 1757
1747 if (cachep->flags & SLAB_RED_ZONE) { 1758 if (cachep->flags & SLAB_RED_ZONE) {
1748 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 1759 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1749 *dbg_redzone1(cachep, objp), 1760 *dbg_redzone1(cachep, objp),
1750 *dbg_redzone2(cachep, objp)); 1761 *dbg_redzone2(cachep, objp));
1751 } 1762 }
1752 1763
1753 if (cachep->flags & SLAB_STORE_USER) { 1764 if (cachep->flags & SLAB_STORE_USER) {
1754 printk(KERN_ERR "Last user: [<%p>]", 1765 printk(KERN_ERR "Last user: [<%p>]",
1755 *dbg_userword(cachep, objp)); 1766 *dbg_userword(cachep, objp));
1756 print_symbol("(%s)", 1767 print_symbol("(%s)",
1757 (unsigned long)*dbg_userword(cachep, objp)); 1768 (unsigned long)*dbg_userword(cachep, objp));
1758 printk("\n"); 1769 printk("\n");
1759 } 1770 }
1760 realobj = (char *)objp + obj_offset(cachep); 1771 realobj = (char *)objp + obj_offset(cachep);
1761 size = obj_size(cachep); 1772 size = obj_size(cachep);
1762 for (i = 0; i < size && lines; i += 16, lines--) { 1773 for (i = 0; i < size && lines; i += 16, lines--) {
1763 int limit; 1774 int limit;
1764 limit = 16; 1775 limit = 16;
1765 if (i + limit > size) 1776 if (i + limit > size)
1766 limit = size - i; 1777 limit = size - i;
1767 dump_line(realobj, i, limit); 1778 dump_line(realobj, i, limit);
1768 } 1779 }
1769 } 1780 }
1770 1781
1771 static void check_poison_obj(struct kmem_cache *cachep, void *objp) 1782 static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1772 { 1783 {
1773 char *realobj; 1784 char *realobj;
1774 int size, i; 1785 int size, i;
1775 int lines = 0; 1786 int lines = 0;
1776 1787
1777 realobj = (char *)objp + obj_offset(cachep); 1788 realobj = (char *)objp + obj_offset(cachep);
1778 size = obj_size(cachep); 1789 size = obj_size(cachep);
1779 1790
1780 for (i = 0; i < size; i++) { 1791 for (i = 0; i < size; i++) {
1781 char exp = POISON_FREE; 1792 char exp = POISON_FREE;
1782 if (i == size - 1) 1793 if (i == size - 1)
1783 exp = POISON_END; 1794 exp = POISON_END;
1784 if (realobj[i] != exp) { 1795 if (realobj[i] != exp) {
1785 int limit; 1796 int limit;
1786 /* Mismatch ! */ 1797 /* Mismatch ! */
1787 /* Print header */ 1798 /* Print header */
1788 if (lines == 0) { 1799 if (lines == 0) {
1789 printk(KERN_ERR 1800 printk(KERN_ERR
1790 "Slab corruption: start=%p, len=%d\n", 1801 "Slab corruption: start=%p, len=%d\n",
1791 realobj, size); 1802 realobj, size);
1792 print_objinfo(cachep, objp, 0); 1803 print_objinfo(cachep, objp, 0);
1793 } 1804 }
1794 /* Hexdump the affected line */ 1805 /* Hexdump the affected line */
1795 i = (i / 16) * 16; 1806 i = (i / 16) * 16;
1796 limit = 16; 1807 limit = 16;
1797 if (i + limit > size) 1808 if (i + limit > size)
1798 limit = size - i; 1809 limit = size - i;
1799 dump_line(realobj, i, limit); 1810 dump_line(realobj, i, limit);
1800 i += 16; 1811 i += 16;
1801 lines++; 1812 lines++;
1802 /* Limit to 5 lines */ 1813 /* Limit to 5 lines */
1803 if (lines > 5) 1814 if (lines > 5)
1804 break; 1815 break;
1805 } 1816 }
1806 } 1817 }
1807 if (lines != 0) { 1818 if (lines != 0) {
1808 /* Print some data about the neighboring objects, if they 1819 /* Print some data about the neighboring objects, if they
1809 * exist: 1820 * exist:
1810 */ 1821 */
1811 struct slab *slabp = virt_to_slab(objp); 1822 struct slab *slabp = virt_to_slab(objp);
1812 unsigned int objnr; 1823 unsigned int objnr;
1813 1824
1814 objnr = obj_to_index(cachep, slabp, objp); 1825 objnr = obj_to_index(cachep, slabp, objp);
1815 if (objnr) { 1826 if (objnr) {
1816 objp = index_to_obj(cachep, slabp, objnr - 1); 1827 objp = index_to_obj(cachep, slabp, objnr - 1);
1817 realobj = (char *)objp + obj_offset(cachep); 1828 realobj = (char *)objp + obj_offset(cachep);
1818 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1829 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1819 realobj, size); 1830 realobj, size);
1820 print_objinfo(cachep, objp, 2); 1831 print_objinfo(cachep, objp, 2);
1821 } 1832 }
1822 if (objnr + 1 < cachep->num) { 1833 if (objnr + 1 < cachep->num) {
1823 objp = index_to_obj(cachep, slabp, objnr + 1); 1834 objp = index_to_obj(cachep, slabp, objnr + 1);
1824 realobj = (char *)objp + obj_offset(cachep); 1835 realobj = (char *)objp + obj_offset(cachep);
1825 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1836 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1826 realobj, size); 1837 realobj, size);
1827 print_objinfo(cachep, objp, 2); 1838 print_objinfo(cachep, objp, 2);
1828 } 1839 }
1829 } 1840 }
1830 } 1841 }
1831 #endif 1842 #endif
1832 1843
1833 #if DEBUG 1844 #if DEBUG
1834 /** 1845 /**
1835 * slab_destroy_objs - destroy a slab and its objects 1846 * slab_destroy_objs - destroy a slab and its objects
1836 * @cachep: cache pointer being destroyed 1847 * @cachep: cache pointer being destroyed
1837 * @slabp: slab pointer being destroyed 1848 * @slabp: slab pointer being destroyed
1838 * 1849 *
1839 * Call the registered destructor for each object in a slab that is being 1850 * Call the registered destructor for each object in a slab that is being
1840 * destroyed. 1851 * destroyed.
1841 */ 1852 */
1842 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) 1853 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1843 { 1854 {
1844 int i; 1855 int i;
1845 for (i = 0; i < cachep->num; i++) { 1856 for (i = 0; i < cachep->num; i++) {
1846 void *objp = index_to_obj(cachep, slabp, i); 1857 void *objp = index_to_obj(cachep, slabp, i);
1847 1858
1848 if (cachep->flags & SLAB_POISON) { 1859 if (cachep->flags & SLAB_POISON) {
1849 #ifdef CONFIG_DEBUG_PAGEALLOC 1860 #ifdef CONFIG_DEBUG_PAGEALLOC
1850 if (cachep->buffer_size % PAGE_SIZE == 0 && 1861 if (cachep->buffer_size % PAGE_SIZE == 0 &&
1851 OFF_SLAB(cachep)) 1862 OFF_SLAB(cachep))
1852 kernel_map_pages(virt_to_page(objp), 1863 kernel_map_pages(virt_to_page(objp),
1853 cachep->buffer_size / PAGE_SIZE, 1); 1864 cachep->buffer_size / PAGE_SIZE, 1);
1854 else 1865 else
1855 check_poison_obj(cachep, objp); 1866 check_poison_obj(cachep, objp);
1856 #else 1867 #else
1857 check_poison_obj(cachep, objp); 1868 check_poison_obj(cachep, objp);
1858 #endif 1869 #endif
1859 } 1870 }
1860 if (cachep->flags & SLAB_RED_ZONE) { 1871 if (cachep->flags & SLAB_RED_ZONE) {
1861 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1872 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1862 slab_error(cachep, "start of a freed object " 1873 slab_error(cachep, "start of a freed object "
1863 "was overwritten"); 1874 "was overwritten");
1864 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1875 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1865 slab_error(cachep, "end of a freed object " 1876 slab_error(cachep, "end of a freed object "
1866 "was overwritten"); 1877 "was overwritten");
1867 } 1878 }
1868 if (cachep->dtor && !(cachep->flags & SLAB_POISON)) 1879 if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1869 (cachep->dtor) (objp + obj_offset(cachep), cachep, 0); 1880 (cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
1870 } 1881 }
1871 } 1882 }
1872 #else 1883 #else
1873 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) 1884 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1874 { 1885 {
1875 if (cachep->dtor) { 1886 if (cachep->dtor) {
1876 int i; 1887 int i;
1877 for (i = 0; i < cachep->num; i++) { 1888 for (i = 0; i < cachep->num; i++) {
1878 void *objp = index_to_obj(cachep, slabp, i); 1889 void *objp = index_to_obj(cachep, slabp, i);
1879 (cachep->dtor) (objp, cachep, 0); 1890 (cachep->dtor) (objp, cachep, 0);
1880 } 1891 }
1881 } 1892 }
1882 } 1893 }
1883 #endif 1894 #endif
1884 1895
1885 /** 1896 /**
1886 * slab_destroy - destroy and release all objects in a slab 1897 * slab_destroy - destroy and release all objects in a slab
1887 * @cachep: cache pointer being destroyed 1898 * @cachep: cache pointer being destroyed
1888 * @slabp: slab pointer being destroyed 1899 * @slabp: slab pointer being destroyed
1889 * 1900 *
1890 * Destroy all the objs in a slab, and release the mem back to the system. 1901 * Destroy all the objs in a slab, and release the mem back to the system.
1891 * Before calling the slab must have been unlinked from the cache. The 1902 * Before calling the slab must have been unlinked from the cache. The
1892 * cache-lock is not held/needed. 1903 * cache-lock is not held/needed.
1893 */ 1904 */
1894 static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) 1905 static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1895 { 1906 {
1896 void *addr = slabp->s_mem - slabp->colouroff; 1907 void *addr = slabp->s_mem - slabp->colouroff;
1897 1908
1898 slab_destroy_objs(cachep, slabp); 1909 slab_destroy_objs(cachep, slabp);
1899 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1910 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1900 struct slab_rcu *slab_rcu; 1911 struct slab_rcu *slab_rcu;
1901 1912
1902 slab_rcu = (struct slab_rcu *)slabp; 1913 slab_rcu = (struct slab_rcu *)slabp;
1903 slab_rcu->cachep = cachep; 1914 slab_rcu->cachep = cachep;
1904 slab_rcu->addr = addr; 1915 slab_rcu->addr = addr;
1905 call_rcu(&slab_rcu->head, kmem_rcu_free); 1916 call_rcu(&slab_rcu->head, kmem_rcu_free);
1906 } else { 1917 } else {
1907 kmem_freepages(cachep, addr); 1918 kmem_freepages(cachep, addr);
1908 if (OFF_SLAB(cachep)) 1919 if (OFF_SLAB(cachep))
1909 kmem_cache_free(cachep->slabp_cache, slabp); 1920 kmem_cache_free(cachep->slabp_cache, slabp);
1910 } 1921 }
1911 } 1922 }
1912 1923
1913 /* 1924 /*
1914 * For setting up all the kmem_list3s for caches whose buffer_size is the same 1925 * For setting up all the kmem_list3s for caches whose buffer_size is the same
1915 * as the size of kmem_list3. 1926 * as the size of kmem_list3.
1916 */ 1927 */
1917 static void set_up_list3s(struct kmem_cache *cachep, int index) 1928 static void set_up_list3s(struct kmem_cache *cachep, int index)
1918 { 1929 {
1919 int node; 1930 int node;
1920 1931
1921 for_each_online_node(node) { 1932 for_each_online_node(node) {
1922 cachep->nodelists[node] = &initkmem_list3[index + node]; 1933 cachep->nodelists[node] = &initkmem_list3[index + node];
1923 cachep->nodelists[node]->next_reap = jiffies + 1934 cachep->nodelists[node]->next_reap = jiffies +
1924 REAPTIMEOUT_LIST3 + 1935 REAPTIMEOUT_LIST3 +
1925 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 1936 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1926 } 1937 }
1927 } 1938 }
1928 1939
1929 static void __kmem_cache_destroy(struct kmem_cache *cachep) 1940 static void __kmem_cache_destroy(struct kmem_cache *cachep)
1930 { 1941 {
1931 int i; 1942 int i;
1932 struct kmem_list3 *l3; 1943 struct kmem_list3 *l3;
1933 1944
1934 for_each_online_cpu(i) 1945 for_each_online_cpu(i)
1935 kfree(cachep->array[i]); 1946 kfree(cachep->array[i]);
1936 1947
1937 /* NUMA: free the list3 structures */ 1948 /* NUMA: free the list3 structures */
1938 for_each_online_node(i) { 1949 for_each_online_node(i) {
1939 l3 = cachep->nodelists[i]; 1950 l3 = cachep->nodelists[i];
1940 if (l3) { 1951 if (l3) {
1941 kfree(l3->shared); 1952 kfree(l3->shared);
1942 free_alien_cache(l3->alien); 1953 free_alien_cache(l3->alien);
1943 kfree(l3); 1954 kfree(l3);
1944 } 1955 }
1945 } 1956 }
1946 kmem_cache_free(&cache_cache, cachep); 1957 kmem_cache_free(&cache_cache, cachep);
1947 } 1958 }
1948 1959
1949 1960
1950 /** 1961 /**
1951 * calculate_slab_order - calculate size (page order) of slabs 1962 * calculate_slab_order - calculate size (page order) of slabs
1952 * @cachep: pointer to the cache that is being created 1963 * @cachep: pointer to the cache that is being created
1953 * @size: size of objects to be created in this cache. 1964 * @size: size of objects to be created in this cache.
1954 * @align: required alignment for the objects. 1965 * @align: required alignment for the objects.
1955 * @flags: slab allocation flags 1966 * @flags: slab allocation flags
1956 * 1967 *
1957 * Also calculates the number of objects per slab. 1968 * Also calculates the number of objects per slab.
1958 * 1969 *
1959 * This could be made much more intelligent. For now, try to avoid using 1970 * This could be made much more intelligent. For now, try to avoid using
1960 * high order pages for slabs. When the gfp() functions are more friendly 1971 * high order pages for slabs. When the gfp() functions are more friendly
1961 * towards high-order requests, this should be changed. 1972 * towards high-order requests, this should be changed.
1962 */ 1973 */
1963 static size_t calculate_slab_order(struct kmem_cache *cachep, 1974 static size_t calculate_slab_order(struct kmem_cache *cachep,
1964 size_t size, size_t align, unsigned long flags) 1975 size_t size, size_t align, unsigned long flags)
1965 { 1976 {
1966 unsigned long offslab_limit; 1977 unsigned long offslab_limit;
1967 size_t left_over = 0; 1978 size_t left_over = 0;
1968 int gfporder; 1979 int gfporder;
1969 1980
1970 for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) { 1981 for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
1971 unsigned int num; 1982 unsigned int num;
1972 size_t remainder; 1983 size_t remainder;
1973 1984
1974 cache_estimate(gfporder, size, align, flags, &remainder, &num); 1985 cache_estimate(gfporder, size, align, flags, &remainder, &num);
1975 if (!num) 1986 if (!num)
1976 continue; 1987 continue;
1977 1988
1978 if (flags & CFLGS_OFF_SLAB) { 1989 if (flags & CFLGS_OFF_SLAB) {
1979 /* 1990 /*
1980 * Max number of objs-per-slab for caches which 1991 * Max number of objs-per-slab for caches which
1981 * use off-slab slabs. Needed to avoid a possible 1992 * use off-slab slabs. Needed to avoid a possible
1982 * looping condition in cache_grow(). 1993 * looping condition in cache_grow().
1983 */ 1994 */
1984 offslab_limit = size - sizeof(struct slab); 1995 offslab_limit = size - sizeof(struct slab);
1985 offslab_limit /= sizeof(kmem_bufctl_t); 1996 offslab_limit /= sizeof(kmem_bufctl_t);
1986 1997
1987 if (num > offslab_limit) 1998 if (num > offslab_limit)
1988 break; 1999 break;
1989 } 2000 }
1990 2001
1991 /* Found something acceptable - save it away */ 2002 /* Found something acceptable - save it away */
1992 cachep->num = num; 2003 cachep->num = num;
1993 cachep->gfporder = gfporder; 2004 cachep->gfporder = gfporder;
1994 left_over = remainder; 2005 left_over = remainder;
1995 2006
1996 /* 2007 /*
1997 * A VFS-reclaimable slab tends to have most allocations 2008 * A VFS-reclaimable slab tends to have most allocations
1998 * as GFP_NOFS and we really don't want to have to be allocating 2009 * as GFP_NOFS and we really don't want to have to be allocating
1999 * higher-order pages when we are unable to shrink dcache. 2010 * higher-order pages when we are unable to shrink dcache.
2000 */ 2011 */
2001 if (flags & SLAB_RECLAIM_ACCOUNT) 2012 if (flags & SLAB_RECLAIM_ACCOUNT)
2002 break; 2013 break;
2003 2014
2004 /* 2015 /*
2005 * Large number of objects is good, but very large slabs are 2016 * Large number of objects is good, but very large slabs are
2006 * currently bad for the gfp()s. 2017 * currently bad for the gfp()s.
2007 */ 2018 */
2008 if (gfporder >= slab_break_gfp_order) 2019 if (gfporder >= slab_break_gfp_order)
2009 break; 2020 break;
2010 2021
2011 /* 2022 /*
2012 * Acceptable internal fragmentation? 2023 * Acceptable internal fragmentation?
2013 */ 2024 */
2014 if (left_over * 8 <= (PAGE_SIZE << gfporder)) 2025 if (left_over * 8 <= (PAGE_SIZE << gfporder))
2015 break; 2026 break;
2016 } 2027 }
2017 return left_over; 2028 return left_over;
2018 } 2029 }
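
calculate_slab_order() above walks the page orders from small to large and accepts the first order that wastes at most one eighth of the slab (left_over * 8 <= PAGE_SIZE << gfporder), unless the SLAB_RECLAIM_ACCOUNT or slab_break_gfp_order checks stop it earlier. Below is a simplified, standalone model of that acceptance test; it ignores the slab-management overhead and alignment that cache_estimate() accounts for, and the object size is made up:

#include <stdio.h>

int main(void)
{
	const size_t page_size = 4096;
	const size_t obj_size = 700;        /* hypothetical object size */
	int order;

	for (order = 0; order <= 3; order++) {
		size_t slab_bytes = page_size << order;
		size_t num = slab_bytes / obj_size;
		size_t left_over = slab_bytes - num * obj_size;

		printf("order %d: %zu objs, %zu bytes left over%s\n",
		       order, num, left_over,
		       left_over * 8 <= slab_bytes ? " (acceptable)" : "");
		if (left_over * 8 <= slab_bytes)
			break;              /* same fragmentation cutoff as above */
	}
	return 0;
}
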
2019 2030
2020 static int setup_cpu_cache(struct kmem_cache *cachep) 2031 static int setup_cpu_cache(struct kmem_cache *cachep)
2021 { 2032 {
2022 if (g_cpucache_up == FULL) 2033 if (g_cpucache_up == FULL)
2023 return enable_cpucache(cachep); 2034 return enable_cpucache(cachep);
2024 2035
2025 if (g_cpucache_up == NONE) { 2036 if (g_cpucache_up == NONE) {
2026 /* 2037 /*
2027 * Note: the first kmem_cache_create must create the cache 2038 * Note: the first kmem_cache_create must create the cache
2028 * that's used by kmalloc(24), otherwise the creation of 2039 * that's used by kmalloc(24), otherwise the creation of
2029 * further caches will BUG(). 2040 * further caches will BUG().
2030 */ 2041 */
2031 cachep->array[smp_processor_id()] = &initarray_generic.cache; 2042 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2032 2043
2033 /* 2044 /*
2034 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is 2045 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2035 * the first cache, then we need to set up all its list3s, 2046 * the first cache, then we need to set up all its list3s,
2036 * otherwise the creation of further caches will BUG(). 2047 * otherwise the creation of further caches will BUG().
2037 */ 2048 */
2038 set_up_list3s(cachep, SIZE_AC); 2049 set_up_list3s(cachep, SIZE_AC);
2039 if (INDEX_AC == INDEX_L3) 2050 if (INDEX_AC == INDEX_L3)
2040 g_cpucache_up = PARTIAL_L3; 2051 g_cpucache_up = PARTIAL_L3;
2041 else 2052 else
2042 g_cpucache_up = PARTIAL_AC; 2053 g_cpucache_up = PARTIAL_AC;
2043 } else { 2054 } else {
2044 cachep->array[smp_processor_id()] = 2055 cachep->array[smp_processor_id()] =
2045 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 2056 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
2046 2057
2047 if (g_cpucache_up == PARTIAL_AC) { 2058 if (g_cpucache_up == PARTIAL_AC) {
2048 set_up_list3s(cachep, SIZE_L3); 2059 set_up_list3s(cachep, SIZE_L3);
2049 g_cpucache_up = PARTIAL_L3; 2060 g_cpucache_up = PARTIAL_L3;
2050 } else { 2061 } else {
2051 int node; 2062 int node;
2052 for_each_online_node(node) { 2063 for_each_online_node(node) {
2053 cachep->nodelists[node] = 2064 cachep->nodelists[node] =
2054 kmalloc_node(sizeof(struct kmem_list3), 2065 kmalloc_node(sizeof(struct kmem_list3),
2055 GFP_KERNEL, node); 2066 GFP_KERNEL, node);
2056 BUG_ON(!cachep->nodelists[node]); 2067 BUG_ON(!cachep->nodelists[node]);
2057 kmem_list3_init(cachep->nodelists[node]); 2068 kmem_list3_init(cachep->nodelists[node]);
2058 } 2069 }
2059 } 2070 }
2060 } 2071 }
2061 cachep->nodelists[numa_node_id()]->next_reap = 2072 cachep->nodelists[numa_node_id()]->next_reap =
2062 jiffies + REAPTIMEOUT_LIST3 + 2073 jiffies + REAPTIMEOUT_LIST3 +
2063 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 2074 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2064 2075
2065 cpu_cache_get(cachep)->avail = 0; 2076 cpu_cache_get(cachep)->avail = 0;
2066 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 2077 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2067 cpu_cache_get(cachep)->batchcount = 1; 2078 cpu_cache_get(cachep)->batchcount = 1;
2068 cpu_cache_get(cachep)->touched = 0; 2079 cpu_cache_get(cachep)->touched = 0;
2069 cachep->batchcount = 1; 2080 cachep->batchcount = 1;
2070 cachep->limit = BOOT_CPUCACHE_ENTRIES; 2081 cachep->limit = BOOT_CPUCACHE_ENTRIES;
2071 return 0; 2082 return 0;
2072 } 2083 }
2073 2084
2074 /** 2085 /**
2075 * kmem_cache_create - Create a cache. 2086 * kmem_cache_create - Create a cache.
2076 * @name: A string which is used in /proc/slabinfo to identify this cache. 2087 * @name: A string which is used in /proc/slabinfo to identify this cache.
2077 * @size: The size of objects to be created in this cache. 2088 * @size: The size of objects to be created in this cache.
2078 * @align: The required alignment for the objects. 2089 * @align: The required alignment for the objects.
2079 * @flags: SLAB flags 2090 * @flags: SLAB flags
2080 * @ctor: A constructor for the objects. 2091 * @ctor: A constructor for the objects.
2081 * @dtor: A destructor for the objects. 2092 * @dtor: A destructor for the objects.
2082 * 2093 *
2083 * Returns a ptr to the cache on success, NULL on failure. 2094 * Returns a ptr to the cache on success, NULL on failure.
2084 * Cannot be called within an interrupt, but can be interrupted. 2095 * Cannot be called within an interrupt, but can be interrupted.
2085 * The @ctor is run when new pages are allocated by the cache 2096 * The @ctor is run when new pages are allocated by the cache
2086 * and the @dtor is run before the pages are handed back. 2097 * and the @dtor is run before the pages are handed back.
2087 * 2098 *
2088 * @name must be valid until the cache is destroyed. This implies that 2099 * @name must be valid until the cache is destroyed. This implies that
2089 * the module calling this has to destroy the cache before getting unloaded. 2100 * the module calling this has to destroy the cache before getting unloaded.
2090 * 2101 *
2091 * The flags are 2102 * The flags are
2092 * 2103 *
2093 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 2104 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2094 * to catch references to uninitialised memory. 2105 * to catch references to uninitialised memory.
2095 * 2106 *
2096 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 2107 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2097 * for buffer overruns. 2108 * for buffer overruns.
2098 * 2109 *
2099 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 2110 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2100 * cacheline. This can be beneficial if you're counting cycles as closely 2111 * cacheline. This can be beneficial if you're counting cycles as closely
2101 * as davem. 2112 * as davem.
2102 */ 2113 */
2103 struct kmem_cache * 2114 struct kmem_cache *
2104 kmem_cache_create (const char *name, size_t size, size_t align, 2115 kmem_cache_create (const char *name, size_t size, size_t align,
2105 unsigned long flags, 2116 unsigned long flags,
2106 void (*ctor)(void*, struct kmem_cache *, unsigned long), 2117 void (*ctor)(void*, struct kmem_cache *, unsigned long),
2107 void (*dtor)(void*, struct kmem_cache *, unsigned long)) 2118 void (*dtor)(void*, struct kmem_cache *, unsigned long))
2108 { 2119 {
2109 size_t left_over, slab_size, ralign; 2120 size_t left_over, slab_size, ralign;
2110 struct kmem_cache *cachep = NULL, *pc; 2121 struct kmem_cache *cachep = NULL, *pc;
2111 2122
2112 /* 2123 /*
2113 * Sanity checks... these are all serious usage bugs. 2124 * Sanity checks... these are all serious usage bugs.
2114 */ 2125 */
2115 if (!name || in_interrupt() || (size < BYTES_PER_WORD) || 2126 if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2116 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { 2127 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
2117 printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, 2128 printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
2118 name); 2129 name);
2119 BUG(); 2130 BUG();
2120 } 2131 }
2121 2132
2122 /* 2133 /*
2123 * We use cache_chain_mutex to ensure a consistent view of 2134 * We use cache_chain_mutex to ensure a consistent view of
2124 * cpu_online_map as well. Please see cpuup_callback 2135 * cpu_online_map as well. Please see cpuup_callback
2125 */ 2136 */
2126 mutex_lock(&cache_chain_mutex); 2137 mutex_lock(&cache_chain_mutex);
2127 2138
2128 list_for_each_entry(pc, &cache_chain, next) { 2139 list_for_each_entry(pc, &cache_chain, next) {
2129 char tmp; 2140 char tmp;
2130 int res; 2141 int res;
2131 2142
2132 /* 2143 /*
2133 * This happens when the module gets unloaded and doesn't 2144 * This happens when the module gets unloaded and doesn't
2134 * destroy its slab cache and no-one else reuses the vmalloc 2145 * destroy its slab cache and no-one else reuses the vmalloc
2135 * area of the module. Print a warning. 2146 * area of the module. Print a warning.
2136 */ 2147 */
2137 res = probe_kernel_address(pc->name, tmp); 2148 res = probe_kernel_address(pc->name, tmp);
2138 if (res) { 2149 if (res) {
2139 printk("SLAB: cache with size %d has lost its name\n", 2150 printk("SLAB: cache with size %d has lost its name\n",
2140 pc->buffer_size); 2151 pc->buffer_size);
2141 continue; 2152 continue;
2142 } 2153 }
2143 2154
2144 if (!strcmp(pc->name, name)) { 2155 if (!strcmp(pc->name, name)) {
2145 printk("kmem_cache_create: duplicate cache %s\n", name); 2156 printk("kmem_cache_create: duplicate cache %s\n", name);
2146 dump_stack(); 2157 dump_stack();
2147 goto oops; 2158 goto oops;
2148 } 2159 }
2149 } 2160 }
2150 2161
2151 #if DEBUG 2162 #if DEBUG
2152 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 2163 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
2153 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { 2164 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
2154 /* No constructor, but initial state check requested */ 2165 /* No constructor, but initial state check requested */
2155 printk(KERN_ERR "%s: No con, but init state check " 2166 printk(KERN_ERR "%s: No con, but init state check "
2156 "requested - %s\n", __FUNCTION__, name); 2167 "requested - %s\n", __FUNCTION__, name);
2157 flags &= ~SLAB_DEBUG_INITIAL; 2168 flags &= ~SLAB_DEBUG_INITIAL;
2158 } 2169 }
2159 #if FORCED_DEBUG 2170 #if FORCED_DEBUG
2160 /* 2171 /*
2161 * Enable redzoning and last user accounting, except for caches with 2172 * Enable redzoning and last user accounting, except for caches with
2162 * large objects, if the increased size would increase the object size 2173 * large objects, if the increased size would increase the object size
2163 * above the next power of two: caches with object sizes just above a 2174 * above the next power of two: caches with object sizes just above a
2164 * power of two have a significant amount of internal fragmentation. 2175 * power of two have a significant amount of internal fragmentation.
2165 */ 2176 */
2166 if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) 2177 if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
2167 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 2178 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2168 if (!(flags & SLAB_DESTROY_BY_RCU)) 2179 if (!(flags & SLAB_DESTROY_BY_RCU))
2169 flags |= SLAB_POISON; 2180 flags |= SLAB_POISON;
2170 #endif 2181 #endif
2171 if (flags & SLAB_DESTROY_BY_RCU) 2182 if (flags & SLAB_DESTROY_BY_RCU)
2172 BUG_ON(flags & SLAB_POISON); 2183 BUG_ON(flags & SLAB_POISON);
2173 #endif 2184 #endif
2174 if (flags & SLAB_DESTROY_BY_RCU) 2185 if (flags & SLAB_DESTROY_BY_RCU)
2175 BUG_ON(dtor); 2186 BUG_ON(dtor);
2176 2187
2177 /* 2188 /*
2178 * Always checks flags, a caller might be expecting debug support which 2189 * Always checks flags, a caller might be expecting debug support which
2179 * isn't available. 2190 * isn't available.
2180 */ 2191 */
2181 BUG_ON(flags & ~CREATE_MASK); 2192 BUG_ON(flags & ~CREATE_MASK);
2182 2193
2183 /* 2194 /*
2184 * Check that size is in terms of words. This is needed to avoid 2195 * Check that size is in terms of words. This is needed to avoid
2185 * unaligned accesses for some archs when redzoning is used, and makes 2196 * unaligned accesses for some archs when redzoning is used, and makes
2186 * sure any on-slab bufctl's are also correctly aligned. 2197 * sure any on-slab bufctl's are also correctly aligned.
2187 */ 2198 */
2188 if (size & (BYTES_PER_WORD - 1)) { 2199 if (size & (BYTES_PER_WORD - 1)) {
2189 size += (BYTES_PER_WORD - 1); 2200 size += (BYTES_PER_WORD - 1);
2190 size &= ~(BYTES_PER_WORD - 1); 2201 size &= ~(BYTES_PER_WORD - 1);
2191 } 2202 }
2192 2203
2193 /* calculate the final buffer alignment: */ 2204 /* calculate the final buffer alignment: */
2194 2205
2195 /* 1) arch recommendation: can be overridden for debug */ 2206 /* 1) arch recommendation: can be overridden for debug */
2196 if (flags & SLAB_HWCACHE_ALIGN) { 2207 if (flags & SLAB_HWCACHE_ALIGN) {
2197 /* 2208 /*
2198 * Default alignment: as specified by the arch code. Except if 2209 * Default alignment: as specified by the arch code. Except if
2199 * an object is really small, then squeeze multiple objects into 2210 * an object is really small, then squeeze multiple objects into
2200 * one cacheline. 2211 * one cacheline.
2201 */ 2212 */
2202 ralign = cache_line_size(); 2213 ralign = cache_line_size();
2203 while (size <= ralign / 2) 2214 while (size <= ralign / 2)
2204 ralign /= 2; 2215 ralign /= 2;
2205 } else { 2216 } else {
2206 ralign = BYTES_PER_WORD; 2217 ralign = BYTES_PER_WORD;
2207 } 2218 }
2208 2219
2209 /* 2220 /*
2210 * Redzoning and user store require word alignment. Note this will be 2221 * Redzoning and user store require word alignment. Note this will be
2211 * overridden by architecture or caller mandated alignment if either 2222 * overridden by architecture or caller mandated alignment if either
2212 * is greater than BYTES_PER_WORD. 2223 * is greater than BYTES_PER_WORD.
2213 */ 2224 */
2214 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) 2225 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
2215 ralign = BYTES_PER_WORD; 2226 ralign = BYTES_PER_WORD;
2216 2227
2217 /* 2) arch mandated alignment */ 2228 /* 2) arch mandated alignment */
2218 if (ralign < ARCH_SLAB_MINALIGN) { 2229 if (ralign < ARCH_SLAB_MINALIGN) {
2219 ralign = ARCH_SLAB_MINALIGN; 2230 ralign = ARCH_SLAB_MINALIGN;
2220 } 2231 }
2221 /* 3) caller mandated alignment */ 2232 /* 3) caller mandated alignment */
2222 if (ralign < align) { 2233 if (ralign < align) {
2223 ralign = align; 2234 ralign = align;
2224 } 2235 }
2225 /* disable debug if necessary */ 2236 /* disable debug if necessary */
2226 if (ralign > BYTES_PER_WORD) 2237 if (ralign > BYTES_PER_WORD)
2227 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2238 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2228 /* 2239 /*
2229 * 4) Store it. 2240 * 4) Store it.
2230 */ 2241 */
2231 align = ralign; 2242 align = ralign;
2232 2243
2233 /* Get cache's description obj. */ 2244 /* Get cache's description obj. */
2234 cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); 2245 cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
2235 if (!cachep) 2246 if (!cachep)
2236 goto oops; 2247 goto oops;
2237 2248
2238 #if DEBUG 2249 #if DEBUG
2239 cachep->obj_size = size; 2250 cachep->obj_size = size;
2240 2251
2241 /* 2252 /*
2242 * Both debugging options require word-alignment which is calculated 2253 * Both debugging options require word-alignment which is calculated
2243 * into align above. 2254 * into align above.
2244 */ 2255 */
2245 if (flags & SLAB_RED_ZONE) { 2256 if (flags & SLAB_RED_ZONE) {
2246 /* add space for red zone words */ 2257 /* add space for red zone words */
2247 cachep->obj_offset += BYTES_PER_WORD; 2258 cachep->obj_offset += BYTES_PER_WORD;
2248 size += 2 * BYTES_PER_WORD; 2259 size += 2 * BYTES_PER_WORD;
2249 } 2260 }
2250 if (flags & SLAB_STORE_USER) { 2261 if (flags & SLAB_STORE_USER) {
2251 /* user store requires one word storage behind the end of 2262 /* user store requires one word storage behind the end of
2252 * the real object. 2263 * the real object.
2253 */ 2264 */
2254 size += BYTES_PER_WORD; 2265 size += BYTES_PER_WORD;
2255 } 2266 }
2256 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2267 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2257 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2268 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2258 && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { 2269 && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
2259 cachep->obj_offset += PAGE_SIZE - size; 2270 cachep->obj_offset += PAGE_SIZE - size;
2260 size = PAGE_SIZE; 2271 size = PAGE_SIZE;
2261 } 2272 }
2262 #endif 2273 #endif
2263 #endif 2274 #endif
2264 2275
2265 /* 2276 /*
2266 * Determine if the slab management is 'on' or 'off' slab. 2277 * Determine if the slab management is 'on' or 'off' slab.
2267 * (bootstrapping cannot cope with offslab caches so don't do 2278 * (bootstrapping cannot cope with offslab caches so don't do
2268 * it too early on.) 2279 * it too early on.)
2269 */ 2280 */
2270 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) 2281 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
2271 /* 2282 /*
2272 * Size is large, assume best to place the slab management obj 2283 * Size is large, assume best to place the slab management obj
2273 * off-slab (should allow better packing of objs). 2284 * off-slab (should allow better packing of objs).
2274 */ 2285 */
2275 flags |= CFLGS_OFF_SLAB; 2286 flags |= CFLGS_OFF_SLAB;
2276 2287
2277 size = ALIGN(size, align); 2288 size = ALIGN(size, align);
2278 2289
2279 left_over = calculate_slab_order(cachep, size, align, flags); 2290 left_over = calculate_slab_order(cachep, size, align, flags);
2280 2291
2281 if (!cachep->num) { 2292 if (!cachep->num) {
2282 printk("kmem_cache_create: couldn't create cache %s.\n", name); 2293 printk("kmem_cache_create: couldn't create cache %s.\n", name);
2283 kmem_cache_free(&cache_cache, cachep); 2294 kmem_cache_free(&cache_cache, cachep);
2284 cachep = NULL; 2295 cachep = NULL;
2285 goto oops; 2296 goto oops;
2286 } 2297 }
2287 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) 2298 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2288 + sizeof(struct slab), align); 2299 + sizeof(struct slab), align);
2289 2300
2290 /* 2301 /*
2291 * If the slab has been placed off-slab, and we have enough space then 2302 * If the slab has been placed off-slab, and we have enough space then
2292 * move it on-slab. This is at the expense of any extra colouring. 2303 * move it on-slab. This is at the expense of any extra colouring.
2293 */ 2304 */
2294 if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { 2305 if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
2295 flags &= ~CFLGS_OFF_SLAB; 2306 flags &= ~CFLGS_OFF_SLAB;
2296 left_over -= slab_size; 2307 left_over -= slab_size;
2297 } 2308 }
2298 2309
2299 if (flags & CFLGS_OFF_SLAB) { 2310 if (flags & CFLGS_OFF_SLAB) {
2300 /* really off slab. No need for manual alignment */ 2311 /* really off slab. No need for manual alignment */
2301 slab_size = 2312 slab_size =
2302 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); 2313 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2303 } 2314 }
2304 2315
2305 cachep->colour_off = cache_line_size(); 2316 cachep->colour_off = cache_line_size();
2306 /* Offset must be a multiple of the alignment. */ 2317 /* Offset must be a multiple of the alignment. */
2307 if (cachep->colour_off < align) 2318 if (cachep->colour_off < align)
2308 cachep->colour_off = align; 2319 cachep->colour_off = align;
2309 cachep->colour = left_over / cachep->colour_off; 2320 cachep->colour = left_over / cachep->colour_off;
2310 cachep->slab_size = slab_size; 2321 cachep->slab_size = slab_size;
2311 cachep->flags = flags; 2322 cachep->flags = flags;
2312 cachep->gfpflags = 0; 2323 cachep->gfpflags = 0;
2313 if (flags & SLAB_CACHE_DMA) 2324 if (flags & SLAB_CACHE_DMA)
2314 cachep->gfpflags |= GFP_DMA; 2325 cachep->gfpflags |= GFP_DMA;
2315 cachep->buffer_size = size; 2326 cachep->buffer_size = size;
2327 cachep->reciprocal_buffer_size = reciprocal_value(size);
2316 2328
2317 if (flags & CFLGS_OFF_SLAB) { 2329 if (flags & CFLGS_OFF_SLAB) {
2318 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 2330 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2319 /* 2331 /*
2320 * This is a possibility for one of the malloc_sizes caches. 2332 * This is a possibility for one of the malloc_sizes caches.
2321 * But since we go off slab only for object size greater than 2333 * But since we go off slab only for object size greater than
2322 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, 2334 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2323 * this should not happen at all. 2335 * this should not happen at all.
2324 * But leave a BUG_ON for some lucky dude. 2336 * But leave a BUG_ON for some lucky dude.
2325 */ 2337 */
2326 BUG_ON(!cachep->slabp_cache); 2338 BUG_ON(!cachep->slabp_cache);
2327 } 2339 }
2328 cachep->ctor = ctor; 2340 cachep->ctor = ctor;
2329 cachep->dtor = dtor; 2341 cachep->dtor = dtor;
2330 cachep->name = name; 2342 cachep->name = name;
2331 2343
2332 if (setup_cpu_cache(cachep)) { 2344 if (setup_cpu_cache(cachep)) {
2333 __kmem_cache_destroy(cachep); 2345 __kmem_cache_destroy(cachep);
2334 cachep = NULL; 2346 cachep = NULL;
2335 goto oops; 2347 goto oops;
2336 } 2348 }
2337 2349
2338 /* cache setup completed, link it into the list */ 2350 /* cache setup completed, link it into the list */
2339 list_add(&cachep->next, &cache_chain); 2351 list_add(&cachep->next, &cache_chain);
2340 oops: 2352 oops:
2341 if (!cachep && (flags & SLAB_PANIC)) 2353 if (!cachep && (flags & SLAB_PANIC))
2342 panic("kmem_cache_create(): failed to create slab `%s'\n", 2354 panic("kmem_cache_create(): failed to create slab `%s'\n",
2343 name); 2355 name);
2344 mutex_unlock(&cache_chain_mutex); 2356 mutex_unlock(&cache_chain_mutex);
2345 return cachep; 2357 return cachep;
2346 } 2358 }
2347 EXPORT_SYMBOL(kmem_cache_create); 2359 EXPORT_SYMBOL(kmem_cache_create);
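
The functional change to kmem_cache_create() in this hunk is the new reciprocal_buffer_size member, filled in from reciprocal_value(size) a few lines above. The obj_to_index() side of the change is not shown in this part of the diff; the following is only a sketch of how such a precomputed value is typically consumed via the reciprocal_divide() helper declared in the new header (member names follow this diff, the exact kernel hunk may differ):

/* Sketch only: turning an object pointer into its index within a slab
 * with a multiply + shift instead of a divide. The real obj_to_index()
 * hunk is not part of this excerpt. */
static inline unsigned int obj_to_index_sketch(const struct kmem_cache *cache,
					       const struct slab *slab,
					       const void *obj)
{
	u32 offset = (u32)((const char *)obj - (const char *)slab->s_mem);

	/* offset / cache->buffer_size, using the cached reciprocal */
	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
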
2348 2360
2349 #if DEBUG 2361 #if DEBUG
2350 static void check_irq_off(void) 2362 static void check_irq_off(void)
2351 { 2363 {
2352 BUG_ON(!irqs_disabled()); 2364 BUG_ON(!irqs_disabled());
2353 } 2365 }
2354 2366
2355 static void check_irq_on(void) 2367 static void check_irq_on(void)
2356 { 2368 {
2357 BUG_ON(irqs_disabled()); 2369 BUG_ON(irqs_disabled());
2358 } 2370 }
2359 2371
2360 static void check_spinlock_acquired(struct kmem_cache *cachep) 2372 static void check_spinlock_acquired(struct kmem_cache *cachep)
2361 { 2373 {
2362 #ifdef CONFIG_SMP 2374 #ifdef CONFIG_SMP
2363 check_irq_off(); 2375 check_irq_off();
2364 assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); 2376 assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
2365 #endif 2377 #endif
2366 } 2378 }
2367 2379
2368 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) 2380 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2369 { 2381 {
2370 #ifdef CONFIG_SMP 2382 #ifdef CONFIG_SMP
2371 check_irq_off(); 2383 check_irq_off();
2372 assert_spin_locked(&cachep->nodelists[node]->list_lock); 2384 assert_spin_locked(&cachep->nodelists[node]->list_lock);
2373 #endif 2385 #endif
2374 } 2386 }
2375 2387
2376 #else 2388 #else
2377 #define check_irq_off() do { } while(0) 2389 #define check_irq_off() do { } while(0)
2378 #define check_irq_on() do { } while(0) 2390 #define check_irq_on() do { } while(0)
2379 #define check_spinlock_acquired(x) do { } while(0) 2391 #define check_spinlock_acquired(x) do { } while(0)
2380 #define check_spinlock_acquired_node(x, y) do { } while(0) 2392 #define check_spinlock_acquired_node(x, y) do { } while(0)
2381 #endif 2393 #endif
2382 2394
2383 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 2395 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2384 struct array_cache *ac, 2396 struct array_cache *ac,
2385 int force, int node); 2397 int force, int node);
2386 2398
2387 static void do_drain(void *arg) 2399 static void do_drain(void *arg)
2388 { 2400 {
2389 struct kmem_cache *cachep = arg; 2401 struct kmem_cache *cachep = arg;
2390 struct array_cache *ac; 2402 struct array_cache *ac;
2391 int node = numa_node_id(); 2403 int node = numa_node_id();
2392 2404
2393 check_irq_off(); 2405 check_irq_off();
2394 ac = cpu_cache_get(cachep); 2406 ac = cpu_cache_get(cachep);
2395 spin_lock(&cachep->nodelists[node]->list_lock); 2407 spin_lock(&cachep->nodelists[node]->list_lock);
2396 free_block(cachep, ac->entry, ac->avail, node); 2408 free_block(cachep, ac->entry, ac->avail, node);
2397 spin_unlock(&cachep->nodelists[node]->list_lock); 2409 spin_unlock(&cachep->nodelists[node]->list_lock);
2398 ac->avail = 0; 2410 ac->avail = 0;
2399 } 2411 }
2400 2412
2401 static void drain_cpu_caches(struct kmem_cache *cachep) 2413 static void drain_cpu_caches(struct kmem_cache *cachep)
2402 { 2414 {
2403 struct kmem_list3 *l3; 2415 struct kmem_list3 *l3;
2404 int node; 2416 int node;
2405 2417
2406 on_each_cpu(do_drain, cachep, 1, 1); 2418 on_each_cpu(do_drain, cachep, 1, 1);
2407 check_irq_on(); 2419 check_irq_on();
2408 for_each_online_node(node) { 2420 for_each_online_node(node) {
2409 l3 = cachep->nodelists[node]; 2421 l3 = cachep->nodelists[node];
2410 if (l3 && l3->alien) 2422 if (l3 && l3->alien)
2411 drain_alien_cache(cachep, l3->alien); 2423 drain_alien_cache(cachep, l3->alien);
2412 } 2424 }
2413 2425
2414 for_each_online_node(node) { 2426 for_each_online_node(node) {
2415 l3 = cachep->nodelists[node]; 2427 l3 = cachep->nodelists[node];
2416 if (l3) 2428 if (l3)
2417 drain_array(cachep, l3, l3->shared, 1, node); 2429 drain_array(cachep, l3, l3->shared, 1, node);
2418 } 2430 }
2419 } 2431 }
2420 2432
2421 /* 2433 /*
2422 * Remove slabs from the list of free slabs. 2434 * Remove slabs from the list of free slabs.
2423 * Specify the number of slabs to drain in tofree. 2435 * Specify the number of slabs to drain in tofree.
2424 * 2436 *
2425 * Returns the actual number of slabs released. 2437 * Returns the actual number of slabs released.
2426 */ 2438 */
2427 static int drain_freelist(struct kmem_cache *cache, 2439 static int drain_freelist(struct kmem_cache *cache,
2428 struct kmem_list3 *l3, int tofree) 2440 struct kmem_list3 *l3, int tofree)
2429 { 2441 {
2430 struct list_head *p; 2442 struct list_head *p;
2431 int nr_freed; 2443 int nr_freed;
2432 struct slab *slabp; 2444 struct slab *slabp;
2433 2445
2434 nr_freed = 0; 2446 nr_freed = 0;
2435 while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { 2447 while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2436 2448
2437 spin_lock_irq(&l3->list_lock); 2449 spin_lock_irq(&l3->list_lock);
2438 p = l3->slabs_free.prev; 2450 p = l3->slabs_free.prev;
2439 if (p == &l3->slabs_free) { 2451 if (p == &l3->slabs_free) {
2440 spin_unlock_irq(&l3->list_lock); 2452 spin_unlock_irq(&l3->list_lock);
2441 goto out; 2453 goto out;
2442 } 2454 }
2443 2455
2444 slabp = list_entry(p, struct slab, list); 2456 slabp = list_entry(p, struct slab, list);
2445 #if DEBUG 2457 #if DEBUG
2446 BUG_ON(slabp->inuse); 2458 BUG_ON(slabp->inuse);
2447 #endif 2459 #endif
2448 list_del(&slabp->list); 2460 list_del(&slabp->list);
2449 /* 2461 /*
2450 * Safe to drop the lock. The slab is no longer linked 2462 * Safe to drop the lock. The slab is no longer linked
2451 * to the cache. 2463 * to the cache.
2452 */ 2464 */
2453 l3->free_objects -= cache->num; 2465 l3->free_objects -= cache->num;
2454 spin_unlock_irq(&l3->list_lock); 2466 spin_unlock_irq(&l3->list_lock);
2455 slab_destroy(cache, slabp); 2467 slab_destroy(cache, slabp);
2456 nr_freed++; 2468 nr_freed++;
2457 } 2469 }
2458 out: 2470 out:
2459 return nr_freed; 2471 return nr_freed;
2460 } 2472 }
2461 2473
2462 /* Called with cache_chain_mutex held to protect against cpu hotplug */ 2474 /* Called with cache_chain_mutex held to protect against cpu hotplug */
2463 static int __cache_shrink(struct kmem_cache *cachep) 2475 static int __cache_shrink(struct kmem_cache *cachep)
2464 { 2476 {
2465 int ret = 0, i = 0; 2477 int ret = 0, i = 0;
2466 struct kmem_list3 *l3; 2478 struct kmem_list3 *l3;
2467 2479
2468 drain_cpu_caches(cachep); 2480 drain_cpu_caches(cachep);
2469 2481
2470 check_irq_on(); 2482 check_irq_on();
2471 for_each_online_node(i) { 2483 for_each_online_node(i) {
2472 l3 = cachep->nodelists[i]; 2484 l3 = cachep->nodelists[i];
2473 if (!l3) 2485 if (!l3)
2474 continue; 2486 continue;
2475 2487
2476 drain_freelist(cachep, l3, l3->free_objects); 2488 drain_freelist(cachep, l3, l3->free_objects);
2477 2489
2478 ret += !list_empty(&l3->slabs_full) || 2490 ret += !list_empty(&l3->slabs_full) ||
2479 !list_empty(&l3->slabs_partial); 2491 !list_empty(&l3->slabs_partial);
2480 } 2492 }
2481 return (ret ? 1 : 0); 2493 return (ret ? 1 : 0);
2482 } 2494 }
2483 2495
2484 /** 2496 /**
2485 * kmem_cache_shrink - Shrink a cache. 2497 * kmem_cache_shrink - Shrink a cache.
2486 * @cachep: The cache to shrink. 2498 * @cachep: The cache to shrink.
2487 * 2499 *
2488 * Releases as many slabs as possible for a cache. 2500 * Releases as many slabs as possible for a cache.
2489 * To help debugging, a zero exit status indicates all slabs were released. 2501 * To help debugging, a zero exit status indicates all slabs were released.
2490 */ 2502 */
2491 int kmem_cache_shrink(struct kmem_cache *cachep) 2503 int kmem_cache_shrink(struct kmem_cache *cachep)
2492 { 2504 {
2493 int ret; 2505 int ret;
2494 BUG_ON(!cachep || in_interrupt()); 2506 BUG_ON(!cachep || in_interrupt());
2495 2507
2496 mutex_lock(&cache_chain_mutex); 2508 mutex_lock(&cache_chain_mutex);
2497 ret = __cache_shrink(cachep); 2509 ret = __cache_shrink(cachep);
2498 mutex_unlock(&cache_chain_mutex); 2510 mutex_unlock(&cache_chain_mutex);
2499 return ret; 2511 return ret;
2500 } 2512 }
2501 EXPORT_SYMBOL(kmem_cache_shrink); 2513 EXPORT_SYMBOL(kmem_cache_shrink);
2502 2514
2503 /** 2515 /**
2504 * kmem_cache_destroy - delete a cache 2516 * kmem_cache_destroy - delete a cache
2505 * @cachep: the cache to destroy 2517 * @cachep: the cache to destroy
2506 * 2518 *
2507 * Remove a struct kmem_cache object from the slab cache. 2519 * Remove a struct kmem_cache object from the slab cache.
2508 * 2520 *
2509 * It is expected this function will be called by a module when it is 2521 * It is expected this function will be called by a module when it is
2510 * unloaded. This will remove the cache completely, and avoid a duplicate 2522 * unloaded. This will remove the cache completely, and avoid a duplicate
2511 * cache being allocated each time a module is loaded and unloaded, if the 2523 * cache being allocated each time a module is loaded and unloaded, if the
2512 * module doesn't have persistent in-kernel storage across loads and unloads. 2524 * module doesn't have persistent in-kernel storage across loads and unloads.
2513 * 2525 *
2514 * The cache must be empty before calling this function. 2526 * The cache must be empty before calling this function.
2515 * 2527 *
2516 * The caller must guarantee that no one will allocate memory from the cache 2528 * The caller must guarantee that no one will allocate memory from the cache
2517 * during the kmem_cache_destroy(). 2529 * during the kmem_cache_destroy().
2518 */ 2530 */
2519 void kmem_cache_destroy(struct kmem_cache *cachep) 2531 void kmem_cache_destroy(struct kmem_cache *cachep)
2520 { 2532 {
2521 BUG_ON(!cachep || in_interrupt()); 2533 BUG_ON(!cachep || in_interrupt());
2522 2534
2523 /* Find the cache in the chain of caches. */ 2535 /* Find the cache in the chain of caches. */
2524 mutex_lock(&cache_chain_mutex); 2536 mutex_lock(&cache_chain_mutex);
2525 /* 2537 /*
2526 * the chain is never empty, cache_cache is never destroyed 2538 * the chain is never empty, cache_cache is never destroyed
2527 */ 2539 */
2528 list_del(&cachep->next); 2540 list_del(&cachep->next);
2529 if (__cache_shrink(cachep)) { 2541 if (__cache_shrink(cachep)) {
2530 slab_error(cachep, "Can't free all objects"); 2542 slab_error(cachep, "Can't free all objects");
2531 list_add(&cachep->next, &cache_chain); 2543 list_add(&cachep->next, &cache_chain);
2532 mutex_unlock(&cache_chain_mutex); 2544 mutex_unlock(&cache_chain_mutex);
2533 return; 2545 return;
2534 } 2546 }
2535 2547
2536 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2548 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2537 synchronize_rcu(); 2549 synchronize_rcu();
2538 2550
2539 __kmem_cache_destroy(cachep); 2551 __kmem_cache_destroy(cachep);
2540 mutex_unlock(&cache_chain_mutex); 2552 mutex_unlock(&cache_chain_mutex);
2541 } 2553 }
2542 EXPORT_SYMBOL(kmem_cache_destroy); 2554 EXPORT_SYMBOL(kmem_cache_destroy);
2543 2555
2544 /* 2556 /*
2545 * Get the memory for a slab management obj. 2557 * Get the memory for a slab management obj.
2546 * For a slab cache when the slab descriptor is off-slab, slab descriptors 2558 * For a slab cache when the slab descriptor is off-slab, slab descriptors
2547 * always come from malloc_sizes caches. The slab descriptor cannot 2559 * always come from malloc_sizes caches. The slab descriptor cannot
2548 * come from the same cache which is getting created because, 2560 * come from the same cache which is getting created because,
2549 * when we are searching for an appropriate cache for these 2561 * when we are searching for an appropriate cache for these
2550 * descriptors in kmem_cache_create, we search through the malloc_sizes array. 2562 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2551 * If we are creating a malloc_sizes cache here it would not be visible to 2563 * If we are creating a malloc_sizes cache here it would not be visible to
2552 * kmem_find_general_cachep till the initialization is complete. 2564 * kmem_find_general_cachep till the initialization is complete.
2553 * Hence we cannot have slabp_cache same as the original cache. 2565 * Hence we cannot have slabp_cache same as the original cache.
2554 */ 2566 */
2555 static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, 2567 static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2556 int colour_off, gfp_t local_flags, 2568 int colour_off, gfp_t local_flags,
2557 int nodeid) 2569 int nodeid)
2558 { 2570 {
2559 struct slab *slabp; 2571 struct slab *slabp;
2560 2572
2561 if (OFF_SLAB(cachep)) { 2573 if (OFF_SLAB(cachep)) {
2562 /* Slab management obj is off-slab. */ 2574 /* Slab management obj is off-slab. */
2563 slabp = kmem_cache_alloc_node(cachep->slabp_cache, 2575 slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2564 local_flags & ~GFP_THISNODE, nodeid); 2576 local_flags & ~GFP_THISNODE, nodeid);
2565 if (!slabp) 2577 if (!slabp)
2566 return NULL; 2578 return NULL;
2567 } else { 2579 } else {
2568 slabp = objp + colour_off; 2580 slabp = objp + colour_off;
2569 colour_off += cachep->slab_size; 2581 colour_off += cachep->slab_size;
2570 } 2582 }
2571 slabp->inuse = 0; 2583 slabp->inuse = 0;
2572 slabp->colouroff = colour_off; 2584 slabp->colouroff = colour_off;
2573 slabp->s_mem = objp + colour_off; 2585 slabp->s_mem = objp + colour_off;
2574 slabp->nodeid = nodeid; 2586 slabp->nodeid = nodeid;
2575 return slabp; 2587 return slabp;
2576 } 2588 }
2577 2589
2578 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2590 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2579 { 2591 {
2580 return (kmem_bufctl_t *) (slabp + 1); 2592 return (kmem_bufctl_t *) (slabp + 1);
2581 } 2593 }
2582 2594
2583 static void cache_init_objs(struct kmem_cache *cachep, 2595 static void cache_init_objs(struct kmem_cache *cachep,
2584 struct slab *slabp, unsigned long ctor_flags) 2596 struct slab *slabp, unsigned long ctor_flags)
2585 { 2597 {
2586 int i; 2598 int i;
2587 2599
2588 for (i = 0; i < cachep->num; i++) { 2600 for (i = 0; i < cachep->num; i++) {
2589 void *objp = index_to_obj(cachep, slabp, i); 2601 void *objp = index_to_obj(cachep, slabp, i);
2590 #if DEBUG 2602 #if DEBUG
2591 /* need to poison the objs? */ 2603 /* need to poison the objs? */
2592 if (cachep->flags & SLAB_POISON) 2604 if (cachep->flags & SLAB_POISON)
2593 poison_obj(cachep, objp, POISON_FREE); 2605 poison_obj(cachep, objp, POISON_FREE);
2594 if (cachep->flags & SLAB_STORE_USER) 2606 if (cachep->flags & SLAB_STORE_USER)
2595 *dbg_userword(cachep, objp) = NULL; 2607 *dbg_userword(cachep, objp) = NULL;
2596 2608
2597 if (cachep->flags & SLAB_RED_ZONE) { 2609 if (cachep->flags & SLAB_RED_ZONE) {
2598 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2610 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2599 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2611 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2600 } 2612 }
2601 /* 2613 /*
2602 * Constructors are not allowed to allocate memory from the same 2614 * Constructors are not allowed to allocate memory from the same
2603 * cache which they are a constructor for. Otherwise, deadlock. 2615 * cache which they are a constructor for. Otherwise, deadlock.
2604 * They must also be threaded. 2616 * They must also be threaded.
2605 */ 2617 */
2606 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2618 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2607 cachep->ctor(objp + obj_offset(cachep), cachep, 2619 cachep->ctor(objp + obj_offset(cachep), cachep,
2608 ctor_flags); 2620 ctor_flags);
2609 2621
2610 if (cachep->flags & SLAB_RED_ZONE) { 2622 if (cachep->flags & SLAB_RED_ZONE) {
2611 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2623 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2612 slab_error(cachep, "constructor overwrote the" 2624 slab_error(cachep, "constructor overwrote the"
2613 " end of an object"); 2625 " end of an object");
2614 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2626 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2615 slab_error(cachep, "constructor overwrote the" 2627 slab_error(cachep, "constructor overwrote the"
2616 " start of an object"); 2628 " start of an object");
2617 } 2629 }
2618 if ((cachep->buffer_size % PAGE_SIZE) == 0 && 2630 if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2619 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2631 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2620 kernel_map_pages(virt_to_page(objp), 2632 kernel_map_pages(virt_to_page(objp),
2621 cachep->buffer_size / PAGE_SIZE, 0); 2633 cachep->buffer_size / PAGE_SIZE, 0);
2622 #else 2634 #else
2623 if (cachep->ctor) 2635 if (cachep->ctor)
2624 cachep->ctor(objp, cachep, ctor_flags); 2636 cachep->ctor(objp, cachep, ctor_flags);
2625 #endif 2637 #endif
2626 slab_bufctl(slabp)[i] = i + 1; 2638 slab_bufctl(slabp)[i] = i + 1;
2627 } 2639 }
2628 slab_bufctl(slabp)[i - 1] = BUFCTL_END; 2640 slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2629 slabp->free = 0; 2641 slabp->free = 0;
2630 } 2642 }
2631 2643
2632 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2644 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2633 { 2645 {
2634 if (flags & GFP_DMA) 2646 if (flags & GFP_DMA)
2635 BUG_ON(!(cachep->gfpflags & GFP_DMA)); 2647 BUG_ON(!(cachep->gfpflags & GFP_DMA));
2636 else 2648 else
2637 BUG_ON(cachep->gfpflags & GFP_DMA); 2649 BUG_ON(cachep->gfpflags & GFP_DMA);
2638 } 2650 }
2639 2651
2640 static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, 2652 static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2641 int nodeid) 2653 int nodeid)
2642 { 2654 {
2643 void *objp = index_to_obj(cachep, slabp, slabp->free); 2655 void *objp = index_to_obj(cachep, slabp, slabp->free);
2644 kmem_bufctl_t next; 2656 kmem_bufctl_t next;
2645 2657
2646 slabp->inuse++; 2658 slabp->inuse++;
2647 next = slab_bufctl(slabp)[slabp->free]; 2659 next = slab_bufctl(slabp)[slabp->free];
2648 #if DEBUG 2660 #if DEBUG
2649 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2661 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2650 WARN_ON(slabp->nodeid != nodeid); 2662 WARN_ON(slabp->nodeid != nodeid);
2651 #endif 2663 #endif
2652 slabp->free = next; 2664 slabp->free = next;
2653 2665
2654 return objp; 2666 return objp;
2655 } 2667 }
2656 2668
2657 static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, 2669 static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2658 void *objp, int nodeid) 2670 void *objp, int nodeid)
2659 { 2671 {
2660 unsigned int objnr = obj_to_index(cachep, slabp, objp); 2672 unsigned int objnr = obj_to_index(cachep, slabp, objp);
2661 2673
2662 #if DEBUG 2674 #if DEBUG
2663 /* Verify that the slab belongs to the intended node */ 2675 /* Verify that the slab belongs to the intended node */
2664 WARN_ON(slabp->nodeid != nodeid); 2676 WARN_ON(slabp->nodeid != nodeid);
2665 2677
2666 if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { 2678 if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2667 printk(KERN_ERR "slab: double free detected in cache " 2679 printk(KERN_ERR "slab: double free detected in cache "
2668 "'%s', objp %p\n", cachep->name, objp); 2680 "'%s', objp %p\n", cachep->name, objp);
2669 BUG(); 2681 BUG();
2670 } 2682 }
2671 #endif 2683 #endif
2672 slab_bufctl(slabp)[objnr] = slabp->free; 2684 slab_bufctl(slabp)[objnr] = slabp->free;
2673 slabp->free = objnr; 2685 slabp->free = objnr;
2674 slabp->inuse--; 2686 slabp->inuse--;
2675 } 2687 }
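
slab_get_obj() and slab_put_obj() above pop and push entries on the per-slab free list that cache_init_objs() threads through the kmem_bufctl_t array: slot i stores the index of the next free object, slabp->free is the head, and BUFCTL_END terminates the chain. A standalone sketch of the same array-as-free-list idea (the sentinel value and sizes below are illustrative, not the kernel's):

#include <stdio.h>

#define NOBJ		4
#define FREELIST_END	0xffffu		/* sentinel for this sketch */

static unsigned short bufctl[NOBJ];
static unsigned int free_head;

static void init_freelist(void)
{
	unsigned int i;
	for (i = 0; i < NOBJ; i++)
		bufctl[i] = i + 1;		/* each slot points at the next */
	bufctl[NOBJ - 1] = FREELIST_END;	/* last entry terminates the list */
	free_head = 0;
}

static unsigned int get_obj(void)		/* pop: like slab_get_obj() */
{
	unsigned int idx = free_head;
	free_head = bufctl[idx];
	return idx;
}

static void put_obj(unsigned int idx)		/* push: like slab_put_obj() */
{
	bufctl[idx] = free_head;
	free_head = idx;
}

int main(void)
{
	init_freelist();
	unsigned int a = get_obj(), b = get_obj();
	printf("allocated %u then %u\n", a, b);		/* 0 then 1 */
	put_obj(a);
	printf("next alloc reuses %u\n", get_obj());	/* 0 again (LIFO) */
	return 0;
}
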
2676 2688
2677 /* 2689 /*
2678 * Map pages beginning at addr to the given cache and slab. This is required 2690 * Map pages beginning at addr to the given cache and slab. This is required
2679 * for the slab allocator to be able to lookup the cache and slab of a 2691 * for the slab allocator to be able to lookup the cache and slab of a
2680 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. 2692 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
2681 */ 2693 */
2682 static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, 2694 static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2683 void *addr) 2695 void *addr)
2684 { 2696 {
2685 int nr_pages; 2697 int nr_pages;
2686 struct page *page; 2698 struct page *page;
2687 2699
2688 page = virt_to_page(addr); 2700 page = virt_to_page(addr);
2689 2701
2690 nr_pages = 1; 2702 nr_pages = 1;
2691 if (likely(!PageCompound(page))) 2703 if (likely(!PageCompound(page)))
2692 nr_pages <<= cache->gfporder; 2704 nr_pages <<= cache->gfporder;
2693 2705
2694 do { 2706 do {
2695 page_set_cache(page, cache); 2707 page_set_cache(page, cache);
2696 page_set_slab(page, slab); 2708 page_set_slab(page, slab);
2697 page++; 2709 page++;
2698 } while (--nr_pages); 2710 } while (--nr_pages);
2699 } 2711 }
2700 2712
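slab_map_pages() is what later lets kfree(), the BUG_ON in kmem_cache_free() and free_block() go from a pointer back to its cache and slab. A sketch only of that reverse direction, relying on the page_get_cache()/page_get_slab() accessors defined earlier in slab.c (which, in this kernel generation, read the pointers that page_set_cache()/page_set_slab() stash in the page's otherwise unused lru field):

/* Sketch, not part of the diff: the lookups enabled by slab_map_pages(). */
static inline struct kmem_cache *virt_to_cache_sketch(const void *obj)
{
	struct page *page = virt_to_page(obj);

	return page_get_cache(page);	/* cache pointer stored per page */
}

static inline struct slab *virt_to_slab_sketch(const void *obj)
{
	struct page *page = virt_to_page(obj);

	return page_get_slab(page);	/* slab pointer stored per page */
}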
2701 /* 2713 /*
2702 * Grow (by 1) the number of slabs within a cache. This is called by 2714 * Grow (by 1) the number of slabs within a cache. This is called by
2703 * kmem_cache_alloc() when there are no active objs left in a cache. 2715 * kmem_cache_alloc() when there are no active objs left in a cache.
2704 */ 2716 */
2705 static int cache_grow(struct kmem_cache *cachep, 2717 static int cache_grow(struct kmem_cache *cachep,
2706 gfp_t flags, int nodeid, void *objp) 2718 gfp_t flags, int nodeid, void *objp)
2707 { 2719 {
2708 struct slab *slabp; 2720 struct slab *slabp;
2709 size_t offset; 2721 size_t offset;
2710 gfp_t local_flags; 2722 gfp_t local_flags;
2711 unsigned long ctor_flags; 2723 unsigned long ctor_flags;
2712 struct kmem_list3 *l3; 2724 struct kmem_list3 *l3;
2713 2725
2714 /* 2726 /*
2715 * Be lazy and only check for valid flags here, keeping it out of the 2727 * Be lazy and only check for valid flags here, keeping it out of the
2716 * critical path in kmem_cache_alloc(). 2728 * critical path in kmem_cache_alloc().
2717 */ 2729 */
2718 BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW)); 2730 BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
2719 if (flags & __GFP_NO_GROW) 2731 if (flags & __GFP_NO_GROW)
2720 return 0; 2732 return 0;
2721 2733
2722 ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2734 ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2723 local_flags = (flags & GFP_LEVEL_MASK); 2735 local_flags = (flags & GFP_LEVEL_MASK);
2724 if (!(local_flags & __GFP_WAIT)) 2736 if (!(local_flags & __GFP_WAIT))
2725 /* 2737 /*
2726 * Not allowed to sleep. Need to tell a constructor about 2738 * Not allowed to sleep. Need to tell a constructor about
2727 * this - it might need to know... 2739 * this - it might need to know...
2728 */ 2740 */
2729 ctor_flags |= SLAB_CTOR_ATOMIC; 2741 ctor_flags |= SLAB_CTOR_ATOMIC;
2730 2742
2731 /* Take the l3 list lock to change the colour_next on this node */ 2743 /* Take the l3 list lock to change the colour_next on this node */
2732 check_irq_off(); 2744 check_irq_off();
2733 l3 = cachep->nodelists[nodeid]; 2745 l3 = cachep->nodelists[nodeid];
2734 spin_lock(&l3->list_lock); 2746 spin_lock(&l3->list_lock);
2735 2747
2736 /* Get colour for the slab, and calc the next value. */ 2748 /* Get colour for the slab, and calc the next value. */
2737 offset = l3->colour_next; 2749 offset = l3->colour_next;
2738 l3->colour_next++; 2750 l3->colour_next++;
2739 if (l3->colour_next >= cachep->colour) 2751 if (l3->colour_next >= cachep->colour)
2740 l3->colour_next = 0; 2752 l3->colour_next = 0;
2741 spin_unlock(&l3->list_lock); 2753 spin_unlock(&l3->list_lock);
2742 2754
2743 offset *= cachep->colour_off; 2755 offset *= cachep->colour_off;
2744 2756
2745 if (local_flags & __GFP_WAIT) 2757 if (local_flags & __GFP_WAIT)
2746 local_irq_enable(); 2758 local_irq_enable();
2747 2759
2748 /* 2760 /*
2749 * The test for missing atomic flag is performed here, rather than 2761 * The test for missing atomic flag is performed here, rather than
2750 * the more obvious place, simply to reduce the critical path length 2762 * the more obvious place, simply to reduce the critical path length
2751 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 2763 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2752 * will eventually be caught here (where it matters). 2764 * will eventually be caught here (where it matters).
2753 */ 2765 */
2754 kmem_flagcheck(cachep, flags); 2766 kmem_flagcheck(cachep, flags);
2755 2767
2756 /* 2768 /*
2757 * Get mem for the objs. Attempt to allocate a physical page from 2769 * Get mem for the objs. Attempt to allocate a physical page from
2758 * 'nodeid'. 2770 * 'nodeid'.
2759 */ 2771 */
2760 if (!objp) 2772 if (!objp)
2761 objp = kmem_getpages(cachep, flags, nodeid); 2773 objp = kmem_getpages(cachep, flags, nodeid);
2762 if (!objp) 2774 if (!objp)
2763 goto failed; 2775 goto failed;
2764 2776
2765 /* Get slab management. */ 2777 /* Get slab management. */
2766 slabp = alloc_slabmgmt(cachep, objp, offset, 2778 slabp = alloc_slabmgmt(cachep, objp, offset,
2767 local_flags & ~GFP_THISNODE, nodeid); 2779 local_flags & ~GFP_THISNODE, nodeid);
2768 if (!slabp) 2780 if (!slabp)
2769 goto opps1; 2781 goto opps1;
2770 2782
2771 slabp->nodeid = nodeid; 2783 slabp->nodeid = nodeid;
2772 slab_map_pages(cachep, slabp, objp); 2784 slab_map_pages(cachep, slabp, objp);
2773 2785
2774 cache_init_objs(cachep, slabp, ctor_flags); 2786 cache_init_objs(cachep, slabp, ctor_flags);
2775 2787
2776 if (local_flags & __GFP_WAIT) 2788 if (local_flags & __GFP_WAIT)
2777 local_irq_disable(); 2789 local_irq_disable();
2778 check_irq_off(); 2790 check_irq_off();
2779 spin_lock(&l3->list_lock); 2791 spin_lock(&l3->list_lock);
2780 2792
2781 /* Make slab active. */ 2793 /* Make slab active. */
2782 list_add_tail(&slabp->list, &(l3->slabs_free)); 2794 list_add_tail(&slabp->list, &(l3->slabs_free));
2783 STATS_INC_GROWN(cachep); 2795 STATS_INC_GROWN(cachep);
2784 l3->free_objects += cachep->num; 2796 l3->free_objects += cachep->num;
2785 spin_unlock(&l3->list_lock); 2797 spin_unlock(&l3->list_lock);
2786 return 1; 2798 return 1;
2787 opps1: 2799 opps1:
2788 kmem_freepages(cachep, objp); 2800 kmem_freepages(cachep, objp);
2789 failed: 2801 failed:
2790 if (local_flags & __GFP_WAIT) 2802 if (local_flags & __GFP_WAIT)
2791 local_irq_disable(); 2803 local_irq_disable();
2792 return 0; 2804 return 0;
2793 } 2805 }
2794 2806
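The colouring block near the top of cache_grow() only cycles an index under the l3 list lock and scales it afterwards. A stand-alone illustration with made-up numbers (colour_off would be the L1 cache line size, colour the number of distinct offsets that fit in the slab's leftover space):

#include <stdio.h>

/* Illustration of the l3->colour_next cycling in cache_grow(). */
int main(void)
{
	unsigned int colour = 4;	/* cachep->colour      */
	unsigned int colour_off = 64;	/* cachep->colour_off  */
	unsigned int colour_next = 0;	/* l3->colour_next     */
	int slab;

	for (slab = 0; slab < 6; slab++) {
		unsigned int offset = colour_next;

		colour_next++;
		if (colour_next >= colour)
			colour_next = 0;
		offset *= colour_off;
		printf("slab %d places its first object at offset %u\n",
		       slab, offset);
	}
	return 0;	/* prints offsets 0, 64, 128, 192, 0, 64 */
}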
2795 #if DEBUG 2807 #if DEBUG
2796 2808
2797 /* 2809 /*
2798 * Perform extra freeing checks: 2810 * Perform extra freeing checks:
2799 * - detect bad pointers. 2811 * - detect bad pointers.
2800 * - POISON/RED_ZONE checking 2812 * - POISON/RED_ZONE checking
2801 * - destructor calls, for caches with POISON+dtor 2813 * - destructor calls, for caches with POISON+dtor
2802 */ 2814 */
2803 static void kfree_debugcheck(const void *objp) 2815 static void kfree_debugcheck(const void *objp)
2804 { 2816 {
2805 struct page *page; 2817 struct page *page;
2806 2818
2807 if (!virt_addr_valid(objp)) { 2819 if (!virt_addr_valid(objp)) {
2808 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2820 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2809 (unsigned long)objp); 2821 (unsigned long)objp);
2810 BUG(); 2822 BUG();
2811 } 2823 }
2812 page = virt_to_page(objp); 2824 page = virt_to_page(objp);
2813 if (!PageSlab(page)) { 2825 if (!PageSlab(page)) {
2814 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", 2826 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
2815 (unsigned long)objp); 2827 (unsigned long)objp);
2816 BUG(); 2828 BUG();
2817 } 2829 }
2818 } 2830 }
2819 2831
2820 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) 2832 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2821 { 2833 {
2822 unsigned long redzone1, redzone2; 2834 unsigned long redzone1, redzone2;
2823 2835
2824 redzone1 = *dbg_redzone1(cache, obj); 2836 redzone1 = *dbg_redzone1(cache, obj);
2825 redzone2 = *dbg_redzone2(cache, obj); 2837 redzone2 = *dbg_redzone2(cache, obj);
2826 2838
2827 /* 2839 /*
2828 * Redzone is ok. 2840 * Redzone is ok.
2829 */ 2841 */
2830 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) 2842 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2831 return; 2843 return;
2832 2844
2833 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) 2845 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2834 slab_error(cache, "double free detected"); 2846 slab_error(cache, "double free detected");
2835 else 2847 else
2836 slab_error(cache, "memory outside object was overwritten"); 2848 slab_error(cache, "memory outside object was overwritten");
2837 2849
2838 printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n", 2850 printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n",
2839 obj, redzone1, redzone2); 2851 obj, redzone1, redzone2);
2840 } 2852 }
2841 2853
2842 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 2854 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2843 void *caller) 2855 void *caller)
2844 { 2856 {
2845 struct page *page; 2857 struct page *page;
2846 unsigned int objnr; 2858 unsigned int objnr;
2847 struct slab *slabp; 2859 struct slab *slabp;
2848 2860
2849 objp -= obj_offset(cachep); 2861 objp -= obj_offset(cachep);
2850 kfree_debugcheck(objp); 2862 kfree_debugcheck(objp);
2851 page = virt_to_page(objp); 2863 page = virt_to_page(objp);
2852 2864
2853 slabp = page_get_slab(page); 2865 slabp = page_get_slab(page);
2854 2866
2855 if (cachep->flags & SLAB_RED_ZONE) { 2867 if (cachep->flags & SLAB_RED_ZONE) {
2856 verify_redzone_free(cachep, objp); 2868 verify_redzone_free(cachep, objp);
2857 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2869 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2858 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2870 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2859 } 2871 }
2860 if (cachep->flags & SLAB_STORE_USER) 2872 if (cachep->flags & SLAB_STORE_USER)
2861 *dbg_userword(cachep, objp) = caller; 2873 *dbg_userword(cachep, objp) = caller;
2862 2874
2863 objnr = obj_to_index(cachep, slabp, objp); 2875 objnr = obj_to_index(cachep, slabp, objp);
2864 2876
2865 BUG_ON(objnr >= cachep->num); 2877 BUG_ON(objnr >= cachep->num);
2866 BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); 2878 BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2867 2879
2868 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2880 if (cachep->flags & SLAB_DEBUG_INITIAL) {
2869 /* 2881 /*
2870 * Need to call the slab's constructor so the caller can 2882 * Need to call the slab's constructor so the caller can
2871 * verify its state (for debugging). Called without 2883 * verify its state (for debugging). Called without
2872 * the cache-lock held. 2884 * the cache-lock held.
2873 */ 2885 */
2874 cachep->ctor(objp + obj_offset(cachep), 2886 cachep->ctor(objp + obj_offset(cachep),
2875 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); 2887 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2876 } 2888 }
2877 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2889 if (cachep->flags & SLAB_POISON && cachep->dtor) {
2878 /* we want to cache poison the object, 2890 /* we want to cache poison the object,
2879 * call the destruction callback 2891 * call the destruction callback
2880 */ 2892 */
2881 cachep->dtor(objp + obj_offset(cachep), cachep, 0); 2893 cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2882 } 2894 }
2883 #ifdef CONFIG_DEBUG_SLAB_LEAK 2895 #ifdef CONFIG_DEBUG_SLAB_LEAK
2884 slab_bufctl(slabp)[objnr] = BUFCTL_FREE; 2896 slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2885 #endif 2897 #endif
2886 if (cachep->flags & SLAB_POISON) { 2898 if (cachep->flags & SLAB_POISON) {
2887 #ifdef CONFIG_DEBUG_PAGEALLOC 2899 #ifdef CONFIG_DEBUG_PAGEALLOC
2888 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 2900 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2889 store_stackinfo(cachep, objp, (unsigned long)caller); 2901 store_stackinfo(cachep, objp, (unsigned long)caller);
2890 kernel_map_pages(virt_to_page(objp), 2902 kernel_map_pages(virt_to_page(objp),
2891 cachep->buffer_size / PAGE_SIZE, 0); 2903 cachep->buffer_size / PAGE_SIZE, 0);
2892 } else { 2904 } else {
2893 poison_obj(cachep, objp, POISON_FREE); 2905 poison_obj(cachep, objp, POISON_FREE);
2894 } 2906 }
2895 #else 2907 #else
2896 poison_obj(cachep, objp, POISON_FREE); 2908 poison_obj(cachep, objp, POISON_FREE);
2897 #endif 2909 #endif
2898 } 2910 }
2899 return objp; 2911 return objp;
2900 } 2912 }
2901 2913
2902 static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) 2914 static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2903 { 2915 {
2904 kmem_bufctl_t i; 2916 kmem_bufctl_t i;
2905 int entries = 0; 2917 int entries = 0;
2906 2918
2907 /* Check slab's freelist to see if this obj is there. */ 2919 /* Check slab's freelist to see if this obj is there. */
2908 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2920 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2909 entries++; 2921 entries++;
2910 if (entries > cachep->num || i >= cachep->num) 2922 if (entries > cachep->num || i >= cachep->num)
2911 goto bad; 2923 goto bad;
2912 } 2924 }
2913 if (entries != cachep->num - slabp->inuse) { 2925 if (entries != cachep->num - slabp->inuse) {
2914 bad: 2926 bad:
2915 printk(KERN_ERR "slab: Internal list corruption detected in " 2927 printk(KERN_ERR "slab: Internal list corruption detected in "
2916 "cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2928 "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2917 cachep->name, cachep->num, slabp, slabp->inuse); 2929 cachep->name, cachep->num, slabp, slabp->inuse);
2918 for (i = 0; 2930 for (i = 0;
2919 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); 2931 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2920 i++) { 2932 i++) {
2921 if (i % 16 == 0) 2933 if (i % 16 == 0)
2922 printk("\n%03x:", i); 2934 printk("\n%03x:", i);
2923 printk(" %02x", ((unsigned char *)slabp)[i]); 2935 printk(" %02x", ((unsigned char *)slabp)[i]);
2924 } 2936 }
2925 printk("\n"); 2937 printk("\n");
2926 BUG(); 2938 BUG();
2927 } 2939 }
2928 } 2940 }
2929 #else 2941 #else
2930 #define kfree_debugcheck(x) do { } while(0) 2942 #define kfree_debugcheck(x) do { } while(0)
2931 #define cache_free_debugcheck(x,objp,z) (objp) 2943 #define cache_free_debugcheck(x,objp,z) (objp)
2932 #define check_slabp(x,y) do { } while(0) 2944 #define check_slabp(x,y) do { } while(0)
2933 #endif 2945 #endif
2934 2946
2935 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 2947 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2936 { 2948 {
2937 int batchcount; 2949 int batchcount;
2938 struct kmem_list3 *l3; 2950 struct kmem_list3 *l3;
2939 struct array_cache *ac; 2951 struct array_cache *ac;
2940 int node; 2952 int node;
2941 2953
2942 node = numa_node_id(); 2954 node = numa_node_id();
2943 2955
2944 check_irq_off(); 2956 check_irq_off();
2945 ac = cpu_cache_get(cachep); 2957 ac = cpu_cache_get(cachep);
2946 retry: 2958 retry:
2947 batchcount = ac->batchcount; 2959 batchcount = ac->batchcount;
2948 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2960 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2949 /* 2961 /*
2950 * If there was little recent activity on this cache, then 2962 * If there was little recent activity on this cache, then
2951 * perform only a partial refill. Otherwise we could generate 2963 * perform only a partial refill. Otherwise we could generate
2952 * refill bouncing. 2964 * refill bouncing.
2953 */ 2965 */
2954 batchcount = BATCHREFILL_LIMIT; 2966 batchcount = BATCHREFILL_LIMIT;
2955 } 2967 }
2956 l3 = cachep->nodelists[node]; 2968 l3 = cachep->nodelists[node];
2957 2969
2958 BUG_ON(ac->avail > 0 || !l3); 2970 BUG_ON(ac->avail > 0 || !l3);
2959 spin_lock(&l3->list_lock); 2971 spin_lock(&l3->list_lock);
2960 2972
2961 /* See if we can refill from the shared array */ 2973 /* See if we can refill from the shared array */
2962 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) 2974 if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
2963 goto alloc_done; 2975 goto alloc_done;
2964 2976
2965 while (batchcount > 0) { 2977 while (batchcount > 0) {
2966 struct list_head *entry; 2978 struct list_head *entry;
2967 struct slab *slabp; 2979 struct slab *slabp;
2968 /* Get the slab the allocation is to come from. */ 2980 /* Get the slab the allocation is to come from. */
2969 entry = l3->slabs_partial.next; 2981 entry = l3->slabs_partial.next;
2970 if (entry == &l3->slabs_partial) { 2982 if (entry == &l3->slabs_partial) {
2971 l3->free_touched = 1; 2983 l3->free_touched = 1;
2972 entry = l3->slabs_free.next; 2984 entry = l3->slabs_free.next;
2973 if (entry == &l3->slabs_free) 2985 if (entry == &l3->slabs_free)
2974 goto must_grow; 2986 goto must_grow;
2975 } 2987 }
2976 2988
2977 slabp = list_entry(entry, struct slab, list); 2989 slabp = list_entry(entry, struct slab, list);
2978 check_slabp(cachep, slabp); 2990 check_slabp(cachep, slabp);
2979 check_spinlock_acquired(cachep); 2991 check_spinlock_acquired(cachep);
2980 while (slabp->inuse < cachep->num && batchcount--) { 2992 while (slabp->inuse < cachep->num && batchcount--) {
2981 STATS_INC_ALLOCED(cachep); 2993 STATS_INC_ALLOCED(cachep);
2982 STATS_INC_ACTIVE(cachep); 2994 STATS_INC_ACTIVE(cachep);
2983 STATS_SET_HIGH(cachep); 2995 STATS_SET_HIGH(cachep);
2984 2996
2985 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, 2997 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
2986 node); 2998 node);
2987 } 2999 }
2988 check_slabp(cachep, slabp); 3000 check_slabp(cachep, slabp);
2989 3001
2990 /* move slabp to correct slabp list: */ 3002 /* move slabp to correct slabp list: */
2991 list_del(&slabp->list); 3003 list_del(&slabp->list);
2992 if (slabp->free == BUFCTL_END) 3004 if (slabp->free == BUFCTL_END)
2993 list_add(&slabp->list, &l3->slabs_full); 3005 list_add(&slabp->list, &l3->slabs_full);
2994 else 3006 else
2995 list_add(&slabp->list, &l3->slabs_partial); 3007 list_add(&slabp->list, &l3->slabs_partial);
2996 } 3008 }
2997 3009
2998 must_grow: 3010 must_grow:
2999 l3->free_objects -= ac->avail; 3011 l3->free_objects -= ac->avail;
3000 alloc_done: 3012 alloc_done:
3001 spin_unlock(&l3->list_lock); 3013 spin_unlock(&l3->list_lock);
3002 3014
3003 if (unlikely(!ac->avail)) { 3015 if (unlikely(!ac->avail)) {
3004 int x; 3016 int x;
3005 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 3017 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3006 3018
3007 /* cache_grow can reenable interrupts, then ac could change. */ 3019 /* cache_grow can reenable interrupts, then ac could change. */
3008 ac = cpu_cache_get(cachep); 3020 ac = cpu_cache_get(cachep);
3009 if (!x && ac->avail == 0) /* no objects in sight? abort */ 3021 if (!x && ac->avail == 0) /* no objects in sight? abort */
3010 return NULL; 3022 return NULL;
3011 3023
3012 if (!ac->avail) /* objects refilled by interrupt? */ 3024 if (!ac->avail) /* objects refilled by interrupt? */
3013 goto retry; 3025 goto retry;
3014 } 3026 }
3015 ac->touched = 1; 3027 ac->touched = 1;
3016 return ac->entry[--ac->avail]; 3028 return ac->entry[--ac->avail];
3017 } 3029 }
3018 3030
3019 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 3031 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
3020 gfp_t flags) 3032 gfp_t flags)
3021 { 3033 {
3022 might_sleep_if(flags & __GFP_WAIT); 3034 might_sleep_if(flags & __GFP_WAIT);
3023 #if DEBUG 3035 #if DEBUG
3024 kmem_flagcheck(cachep, flags); 3036 kmem_flagcheck(cachep, flags);
3025 #endif 3037 #endif
3026 } 3038 }
3027 3039
3028 #if DEBUG 3040 #if DEBUG
3029 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 3041 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3030 gfp_t flags, void *objp, void *caller) 3042 gfp_t flags, void *objp, void *caller)
3031 { 3043 {
3032 if (!objp) 3044 if (!objp)
3033 return objp; 3045 return objp;
3034 if (cachep->flags & SLAB_POISON) { 3046 if (cachep->flags & SLAB_POISON) {
3035 #ifdef CONFIG_DEBUG_PAGEALLOC 3047 #ifdef CONFIG_DEBUG_PAGEALLOC
3036 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 3048 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3037 kernel_map_pages(virt_to_page(objp), 3049 kernel_map_pages(virt_to_page(objp),
3038 cachep->buffer_size / PAGE_SIZE, 1); 3050 cachep->buffer_size / PAGE_SIZE, 1);
3039 else 3051 else
3040 check_poison_obj(cachep, objp); 3052 check_poison_obj(cachep, objp);
3041 #else 3053 #else
3042 check_poison_obj(cachep, objp); 3054 check_poison_obj(cachep, objp);
3043 #endif 3055 #endif
3044 poison_obj(cachep, objp, POISON_INUSE); 3056 poison_obj(cachep, objp, POISON_INUSE);
3045 } 3057 }
3046 if (cachep->flags & SLAB_STORE_USER) 3058 if (cachep->flags & SLAB_STORE_USER)
3047 *dbg_userword(cachep, objp) = caller; 3059 *dbg_userword(cachep, objp) = caller;
3048 3060
3049 if (cachep->flags & SLAB_RED_ZONE) { 3061 if (cachep->flags & SLAB_RED_ZONE) {
3050 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || 3062 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
3051 *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 3063 *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
3052 slab_error(cachep, "double free, or memory outside" 3064 slab_error(cachep, "double free, or memory outside"
3053 " object was overwritten"); 3065 " object was overwritten");
3054 printk(KERN_ERR 3066 printk(KERN_ERR
3055 "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", 3067 "%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
3056 objp, *dbg_redzone1(cachep, objp), 3068 objp, *dbg_redzone1(cachep, objp),
3057 *dbg_redzone2(cachep, objp)); 3069 *dbg_redzone2(cachep, objp));
3058 } 3070 }
3059 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 3071 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
3060 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 3072 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
3061 } 3073 }
3062 #ifdef CONFIG_DEBUG_SLAB_LEAK 3074 #ifdef CONFIG_DEBUG_SLAB_LEAK
3063 { 3075 {
3064 struct slab *slabp; 3076 struct slab *slabp;
3065 unsigned objnr; 3077 unsigned objnr;
3066 3078
3067 slabp = page_get_slab(virt_to_page(objp)); 3079 slabp = page_get_slab(virt_to_page(objp));
3068 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 3080 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
3069 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; 3081 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3070 } 3082 }
3071 #endif 3083 #endif
3072 objp += obj_offset(cachep); 3084 objp += obj_offset(cachep);
3073 if (cachep->ctor && cachep->flags & SLAB_POISON) { 3085 if (cachep->ctor && cachep->flags & SLAB_POISON) {
3074 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 3086 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
3075 3087
3076 if (!(flags & __GFP_WAIT)) 3088 if (!(flags & __GFP_WAIT))
3077 ctor_flags |= SLAB_CTOR_ATOMIC; 3089 ctor_flags |= SLAB_CTOR_ATOMIC;
3078 3090
3079 cachep->ctor(objp, cachep, ctor_flags); 3091 cachep->ctor(objp, cachep, ctor_flags);
3080 } 3092 }
3081 #if ARCH_SLAB_MINALIGN 3093 #if ARCH_SLAB_MINALIGN
3082 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3094 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3083 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3095 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3084 objp, ARCH_SLAB_MINALIGN); 3096 objp, ARCH_SLAB_MINALIGN);
3085 } 3097 }
3086 #endif 3098 #endif
3087 return objp; 3099 return objp;
3088 } 3100 }
3089 #else 3101 #else
3090 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 3102 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
3091 #endif 3103 #endif
3092 3104
3093 #ifdef CONFIG_FAILSLAB 3105 #ifdef CONFIG_FAILSLAB
3094 3106
3095 static struct failslab_attr { 3107 static struct failslab_attr {
3096 3108
3097 struct fault_attr attr; 3109 struct fault_attr attr;
3098 3110
3099 u32 ignore_gfp_wait; 3111 u32 ignore_gfp_wait;
3100 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 3112 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3101 struct dentry *ignore_gfp_wait_file; 3113 struct dentry *ignore_gfp_wait_file;
3102 #endif 3114 #endif
3103 3115
3104 } failslab = { 3116 } failslab = {
3105 .attr = FAULT_ATTR_INITIALIZER, 3117 .attr = FAULT_ATTR_INITIALIZER,
3106 .ignore_gfp_wait = 1, 3118 .ignore_gfp_wait = 1,
3107 }; 3119 };
3108 3120
3109 static int __init setup_failslab(char *str) 3121 static int __init setup_failslab(char *str)
3110 { 3122 {
3111 return setup_fault_attr(&failslab.attr, str); 3123 return setup_fault_attr(&failslab.attr, str);
3112 } 3124 }
3113 __setup("failslab=", setup_failslab); 3125 __setup("failslab=", setup_failslab);
3114 3126
3115 static int should_failslab(struct kmem_cache *cachep, gfp_t flags) 3127 static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3116 { 3128 {
3117 if (cachep == &cache_cache) 3129 if (cachep == &cache_cache)
3118 return 0; 3130 return 0;
3119 if (flags & __GFP_NOFAIL) 3131 if (flags & __GFP_NOFAIL)
3120 return 0; 3132 return 0;
3121 if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT)) 3133 if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
3122 return 0; 3134 return 0;
3123 3135
3124 return should_fail(&failslab.attr, obj_size(cachep)); 3136 return should_fail(&failslab.attr, obj_size(cachep));
3125 } 3137 }
3126 3138
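setup_failslab() hands everything after "failslab=" to setup_fault_attr(), i.e. the generic fault-injection syntax. To the best of my reading of that framework the fields are <interval>,<probability>,<space>,<times>; treat the exact order and semantics as an assumption to be checked against Documentation/fault-injection:

/*
 * Illustration only (kernel command line, not C):
 *
 *	failslab=1,10,0,-1
 *
 * would, assuming the generic fault_attr format, check every slab
 * allocation (interval 1), fail about 10% of them (probability 10),
 * with no "space" budget and no limit on the number of failures
 * (times -1).  __GFP_WAIT allocations stay exempt by default via the
 * ignore-gfp-wait debugfs knob set up below.
 */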
3127 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 3139 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3128 3140
3129 static int __init failslab_debugfs(void) 3141 static int __init failslab_debugfs(void)
3130 { 3142 {
3131 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 3143 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
3132 struct dentry *dir; 3144 struct dentry *dir;
3133 int err; 3145 int err;
3134 3146
3135 err = init_fault_attr_dentries(&failslab.attr, "failslab"); 3147 err = init_fault_attr_dentries(&failslab.attr, "failslab");
3136 if (err) 3148 if (err)
3137 return err; 3149 return err;
3138 dir = failslab.attr.dentries.dir; 3150 dir = failslab.attr.dentries.dir;
3139 3151
3140 failslab.ignore_gfp_wait_file = 3152 failslab.ignore_gfp_wait_file =
3141 debugfs_create_bool("ignore-gfp-wait", mode, dir, 3153 debugfs_create_bool("ignore-gfp-wait", mode, dir,
3142 &failslab.ignore_gfp_wait); 3154 &failslab.ignore_gfp_wait);
3143 3155
3144 if (!failslab.ignore_gfp_wait_file) { 3156 if (!failslab.ignore_gfp_wait_file) {
3145 err = -ENOMEM; 3157 err = -ENOMEM;
3146 debugfs_remove(failslab.ignore_gfp_wait_file); 3158 debugfs_remove(failslab.ignore_gfp_wait_file);
3147 cleanup_fault_attr_dentries(&failslab.attr); 3159 cleanup_fault_attr_dentries(&failslab.attr);
3148 } 3160 }
3149 3161
3150 return err; 3162 return err;
3151 } 3163 }
3152 3164
3153 late_initcall(failslab_debugfs); 3165 late_initcall(failslab_debugfs);
3154 3166
3155 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 3167 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3156 3168
3157 #else /* CONFIG_FAILSLAB */ 3169 #else /* CONFIG_FAILSLAB */
3158 3170
3159 static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags) 3171 static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3160 { 3172 {
3161 return 0; 3173 return 0;
3162 } 3174 }
3163 3175
3164 #endif /* CONFIG_FAILSLAB */ 3176 #endif /* CONFIG_FAILSLAB */
3165 3177
3166 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3178 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3167 { 3179 {
3168 void *objp; 3180 void *objp;
3169 struct array_cache *ac; 3181 struct array_cache *ac;
3170 3182
3171 check_irq_off(); 3183 check_irq_off();
3172 3184
3173 if (should_failslab(cachep, flags)) 3185 if (should_failslab(cachep, flags))
3174 return NULL; 3186 return NULL;
3175 3187
3176 ac = cpu_cache_get(cachep); 3188 ac = cpu_cache_get(cachep);
3177 if (likely(ac->avail)) { 3189 if (likely(ac->avail)) {
3178 STATS_INC_ALLOCHIT(cachep); 3190 STATS_INC_ALLOCHIT(cachep);
3179 ac->touched = 1; 3191 ac->touched = 1;
3180 objp = ac->entry[--ac->avail]; 3192 objp = ac->entry[--ac->avail];
3181 } else { 3193 } else {
3182 STATS_INC_ALLOCMISS(cachep); 3194 STATS_INC_ALLOCMISS(cachep);
3183 objp = cache_alloc_refill(cachep, flags); 3195 objp = cache_alloc_refill(cachep, flags);
3184 } 3196 }
3185 return objp; 3197 return objp;
3186 } 3198 }
3187 3199
3188 static __always_inline void *__cache_alloc(struct kmem_cache *cachep, 3200 static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
3189 gfp_t flags, void *caller) 3201 gfp_t flags, void *caller)
3190 { 3202 {
3191 unsigned long save_flags; 3203 unsigned long save_flags;
3192 void *objp = NULL; 3204 void *objp = NULL;
3193 3205
3194 cache_alloc_debugcheck_before(cachep, flags); 3206 cache_alloc_debugcheck_before(cachep, flags);
3195 3207
3196 local_irq_save(save_flags); 3208 local_irq_save(save_flags);
3197 3209
3198 if (unlikely(NUMA_BUILD && 3210 if (unlikely(NUMA_BUILD &&
3199 current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) 3211 current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3200 objp = alternate_node_alloc(cachep, flags); 3212 objp = alternate_node_alloc(cachep, flags);
3201 3213
3202 if (!objp) 3214 if (!objp)
3203 objp = ____cache_alloc(cachep, flags); 3215 objp = ____cache_alloc(cachep, flags);
3204 /* 3216 /*
3205 * We may just have run out of memory on the local node. 3217 * We may just have run out of memory on the local node.
3206 * ____cache_alloc_node() knows how to locate memory on other nodes 3218 * ____cache_alloc_node() knows how to locate memory on other nodes
3207 */ 3219 */
3208 if (NUMA_BUILD && !objp) 3220 if (NUMA_BUILD && !objp)
3209 objp = ____cache_alloc_node(cachep, flags, numa_node_id()); 3221 objp = ____cache_alloc_node(cachep, flags, numa_node_id());
3210 local_irq_restore(save_flags); 3222 local_irq_restore(save_flags);
3211 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 3223 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
3212 caller); 3224 caller);
3213 prefetchw(objp); 3225 prefetchw(objp);
3214 return objp; 3226 return objp;
3215 } 3227 }
3216 3228
3217 #ifdef CONFIG_NUMA 3229 #ifdef CONFIG_NUMA
3218 /* 3230 /*
3219 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. 3231 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
3220 * 3232 *
3221 * If we are in_interrupt, then process context, including cpusets and 3233 * If we are in_interrupt, then process context, including cpusets and
3222 * mempolicy, may not apply and should not be used for allocation policy. 3234 * mempolicy, may not apply and should not be used for allocation policy.
3223 */ 3235 */
3224 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) 3236 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3225 { 3237 {
3226 int nid_alloc, nid_here; 3238 int nid_alloc, nid_here;
3227 3239
3228 if (in_interrupt() || (flags & __GFP_THISNODE)) 3240 if (in_interrupt() || (flags & __GFP_THISNODE))
3229 return NULL; 3241 return NULL;
3230 nid_alloc = nid_here = numa_node_id(); 3242 nid_alloc = nid_here = numa_node_id();
3231 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3243 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3232 nid_alloc = cpuset_mem_spread_node(); 3244 nid_alloc = cpuset_mem_spread_node();
3233 else if (current->mempolicy) 3245 else if (current->mempolicy)
3234 nid_alloc = slab_node(current->mempolicy); 3246 nid_alloc = slab_node(current->mempolicy);
3235 if (nid_alloc != nid_here) 3247 if (nid_alloc != nid_here)
3236 return ____cache_alloc_node(cachep, flags, nid_alloc); 3248 return ____cache_alloc_node(cachep, flags, nid_alloc);
3237 return NULL; 3249 return NULL;
3238 } 3250 }
3239 3251
3240 /* 3252 /*
3241 * Fallback function if there was no memory available and no objects on a 3253 * Fallback function if there was no memory available and no objects on a
3242 * certain node and fall back is permitted. First we scan all the 3254 * certain node and fall back is permitted. First we scan all the
3243 * available nodelists for available objects. If that fails then we 3255 * available nodelists for available objects. If that fails then we
3244 * perform an allocation without specifying a node. This allows the page 3256 * perform an allocation without specifying a node. This allows the page
3245 * allocator to do its reclaim / fallback magic. We then insert the 3257 * allocator to do its reclaim / fallback magic. We then insert the
3246 * slab into the proper nodelist and then allocate from it. 3258 * slab into the proper nodelist and then allocate from it.
3247 */ 3259 */
3248 void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) 3260 void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3249 { 3261 {
3250 struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy)) 3262 struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
3251 ->node_zonelists[gfp_zone(flags)]; 3263 ->node_zonelists[gfp_zone(flags)];
3252 struct zone **z; 3264 struct zone **z;
3253 void *obj = NULL; 3265 void *obj = NULL;
3254 int nid; 3266 int nid;
3255 gfp_t local_flags = (flags & GFP_LEVEL_MASK); 3267 gfp_t local_flags = (flags & GFP_LEVEL_MASK);
3256 3268
3257 retry: 3269 retry:
3258 /* 3270 /*
3259 * Look through allowed nodes for objects available 3271 * Look through allowed nodes for objects available
3260 * from existing per node queues. 3272 * from existing per node queues.
3261 */ 3273 */
3262 for (z = zonelist->zones; *z && !obj; z++) { 3274 for (z = zonelist->zones; *z && !obj; z++) {
3263 nid = zone_to_nid(*z); 3275 nid = zone_to_nid(*z);
3264 3276
3265 if (cpuset_zone_allowed_hardwall(*z, flags) && 3277 if (cpuset_zone_allowed_hardwall(*z, flags) &&
3266 cache->nodelists[nid] && 3278 cache->nodelists[nid] &&
3267 cache->nodelists[nid]->free_objects) 3279 cache->nodelists[nid]->free_objects)
3268 obj = ____cache_alloc_node(cache, 3280 obj = ____cache_alloc_node(cache,
3269 flags | GFP_THISNODE, nid); 3281 flags | GFP_THISNODE, nid);
3270 } 3282 }
3271 3283
3272 if (!obj) { 3284 if (!obj) {
3273 /* 3285 /*
3274 * This allocation will be performed within the constraints 3286 * This allocation will be performed within the constraints
3275 * of the current cpuset / memory policy requirements. 3287 * of the current cpuset / memory policy requirements.
3276 * We may trigger various forms of reclaim on the allowed 3288 * We may trigger various forms of reclaim on the allowed
3277 * set and go into memory reserves if necessary. 3289 * set and go into memory reserves if necessary.
3278 */ 3290 */
3279 if (local_flags & __GFP_WAIT) 3291 if (local_flags & __GFP_WAIT)
3280 local_irq_enable(); 3292 local_irq_enable();
3281 kmem_flagcheck(cache, flags); 3293 kmem_flagcheck(cache, flags);
3282 obj = kmem_getpages(cache, flags, -1); 3294 obj = kmem_getpages(cache, flags, -1);
3283 if (local_flags & __GFP_WAIT) 3295 if (local_flags & __GFP_WAIT)
3284 local_irq_disable(); 3296 local_irq_disable();
3285 if (obj) { 3297 if (obj) {
3286 /* 3298 /*
3287 * Insert into the appropriate per node queues 3299 * Insert into the appropriate per node queues
3288 */ 3300 */
3289 nid = page_to_nid(virt_to_page(obj)); 3301 nid = page_to_nid(virt_to_page(obj));
3290 if (cache_grow(cache, flags, nid, obj)) { 3302 if (cache_grow(cache, flags, nid, obj)) {
3291 obj = ____cache_alloc_node(cache, 3303 obj = ____cache_alloc_node(cache,
3292 flags | GFP_THISNODE, nid); 3304 flags | GFP_THISNODE, nid);
3293 if (!obj) 3305 if (!obj)
3294 /* 3306 /*
3295 * Another processor may allocate the 3307 * Another processor may allocate the
3296 * objects in the slab since we are 3308 * objects in the slab since we are
3297 * not holding any locks. 3309 * not holding any locks.
3298 */ 3310 */
3299 goto retry; 3311 goto retry;
3300 } else { 3312 } else {
3301 kmem_freepages(cache, obj); 3313 kmem_freepages(cache, obj);
3302 obj = NULL; 3314 obj = NULL;
3303 } 3315 }
3304 } 3316 }
3305 } 3317 }
3306 return obj; 3318 return obj;
3307 } 3319 }
3308 3320
3309 /* 3321 /*
3310 * An interface to enable slab creation on nodeid 3322 * An interface to enable slab creation on nodeid
3311 */ 3323 */
3312 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3324 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3313 int nodeid) 3325 int nodeid)
3314 { 3326 {
3315 struct list_head *entry; 3327 struct list_head *entry;
3316 struct slab *slabp; 3328 struct slab *slabp;
3317 struct kmem_list3 *l3; 3329 struct kmem_list3 *l3;
3318 void *obj; 3330 void *obj;
3319 int x; 3331 int x;
3320 3332
3321 l3 = cachep->nodelists[nodeid]; 3333 l3 = cachep->nodelists[nodeid];
3322 BUG_ON(!l3); 3334 BUG_ON(!l3);
3323 3335
3324 retry: 3336 retry:
3325 check_irq_off(); 3337 check_irq_off();
3326 spin_lock(&l3->list_lock); 3338 spin_lock(&l3->list_lock);
3327 entry = l3->slabs_partial.next; 3339 entry = l3->slabs_partial.next;
3328 if (entry == &l3->slabs_partial) { 3340 if (entry == &l3->slabs_partial) {
3329 l3->free_touched = 1; 3341 l3->free_touched = 1;
3330 entry = l3->slabs_free.next; 3342 entry = l3->slabs_free.next;
3331 if (entry == &l3->slabs_free) 3343 if (entry == &l3->slabs_free)
3332 goto must_grow; 3344 goto must_grow;
3333 } 3345 }
3334 3346
3335 slabp = list_entry(entry, struct slab, list); 3347 slabp = list_entry(entry, struct slab, list);
3336 check_spinlock_acquired_node(cachep, nodeid); 3348 check_spinlock_acquired_node(cachep, nodeid);
3337 check_slabp(cachep, slabp); 3349 check_slabp(cachep, slabp);
3338 3350
3339 STATS_INC_NODEALLOCS(cachep); 3351 STATS_INC_NODEALLOCS(cachep);
3340 STATS_INC_ACTIVE(cachep); 3352 STATS_INC_ACTIVE(cachep);
3341 STATS_SET_HIGH(cachep); 3353 STATS_SET_HIGH(cachep);
3342 3354
3343 BUG_ON(slabp->inuse == cachep->num); 3355 BUG_ON(slabp->inuse == cachep->num);
3344 3356
3345 obj = slab_get_obj(cachep, slabp, nodeid); 3357 obj = slab_get_obj(cachep, slabp, nodeid);
3346 check_slabp(cachep, slabp); 3358 check_slabp(cachep, slabp);
3347 l3->free_objects--; 3359 l3->free_objects--;
3348 /* move slabp to correct slabp list: */ 3360 /* move slabp to correct slabp list: */
3349 list_del(&slabp->list); 3361 list_del(&slabp->list);
3350 3362
3351 if (slabp->free == BUFCTL_END) 3363 if (slabp->free == BUFCTL_END)
3352 list_add(&slabp->list, &l3->slabs_full); 3364 list_add(&slabp->list, &l3->slabs_full);
3353 else 3365 else
3354 list_add(&slabp->list, &l3->slabs_partial); 3366 list_add(&slabp->list, &l3->slabs_partial);
3355 3367
3356 spin_unlock(&l3->list_lock); 3368 spin_unlock(&l3->list_lock);
3357 goto done; 3369 goto done;
3358 3370
3359 must_grow: 3371 must_grow:
3360 spin_unlock(&l3->list_lock); 3372 spin_unlock(&l3->list_lock);
3361 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); 3373 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3362 if (x) 3374 if (x)
3363 goto retry; 3375 goto retry;
3364 3376
3365 if (!(flags & __GFP_THISNODE)) 3377 if (!(flags & __GFP_THISNODE))
3366 /* Unable to grow the cache. Fall back to other nodes. */ 3378 /* Unable to grow the cache. Fall back to other nodes. */
3367 return fallback_alloc(cachep, flags); 3379 return fallback_alloc(cachep, flags);
3368 3380
3369 return NULL; 3381 return NULL;
3370 3382
3371 done: 3383 done:
3372 return obj; 3384 return obj;
3373 } 3385 }
3374 #endif 3386 #endif
3375 3387
3376 /* 3388 /*
3377 * Caller needs to acquire the correct kmem_list3's list_lock 3389 * Caller needs to acquire the correct kmem_list3's list_lock
3378 */ 3390 */
3379 static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, 3391 static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3380 int node) 3392 int node)
3381 { 3393 {
3382 int i; 3394 int i;
3383 struct kmem_list3 *l3; 3395 struct kmem_list3 *l3;
3384 3396
3385 for (i = 0; i < nr_objects; i++) { 3397 for (i = 0; i < nr_objects; i++) {
3386 void *objp = objpp[i]; 3398 void *objp = objpp[i];
3387 struct slab *slabp; 3399 struct slab *slabp;
3388 3400
3389 slabp = virt_to_slab(objp); 3401 slabp = virt_to_slab(objp);
3390 l3 = cachep->nodelists[node]; 3402 l3 = cachep->nodelists[node];
3391 list_del(&slabp->list); 3403 list_del(&slabp->list);
3392 check_spinlock_acquired_node(cachep, node); 3404 check_spinlock_acquired_node(cachep, node);
3393 check_slabp(cachep, slabp); 3405 check_slabp(cachep, slabp);
3394 slab_put_obj(cachep, slabp, objp, node); 3406 slab_put_obj(cachep, slabp, objp, node);
3395 STATS_DEC_ACTIVE(cachep); 3407 STATS_DEC_ACTIVE(cachep);
3396 l3->free_objects++; 3408 l3->free_objects++;
3397 check_slabp(cachep, slabp); 3409 check_slabp(cachep, slabp);
3398 3410
3399 /* fixup slab chains */ 3411 /* fixup slab chains */
3400 if (slabp->inuse == 0) { 3412 if (slabp->inuse == 0) {
3401 if (l3->free_objects > l3->free_limit) { 3413 if (l3->free_objects > l3->free_limit) {
3402 l3->free_objects -= cachep->num; 3414 l3->free_objects -= cachep->num;
3403 /* No need to drop any previously held 3415 /* No need to drop any previously held
3404 * lock here, even if we have an off-slab slab 3416 * lock here, even if we have an off-slab slab
3405 * descriptor it is guaranteed to come from 3417 * descriptor it is guaranteed to come from
3406 * a different cache, refer to comments before 3418 * a different cache, refer to comments before
3407 * alloc_slabmgmt. 3419 * alloc_slabmgmt.
3408 */ 3420 */
3409 slab_destroy(cachep, slabp); 3421 slab_destroy(cachep, slabp);
3410 } else { 3422 } else {
3411 list_add(&slabp->list, &l3->slabs_free); 3423 list_add(&slabp->list, &l3->slabs_free);
3412 } 3424 }
3413 } else { 3425 } else {
3414 /* Unconditionally move a slab to the end of the 3426 /* Unconditionally move a slab to the end of the
3415 * partial list on free - this gives the other 3427 * partial list on free - this gives the other
3416 * objects maximum time to be freed, too. 3428 * objects maximum time to be freed, too.
3417 */ 3429 */
3418 list_add_tail(&slabp->list, &l3->slabs_partial); 3430 list_add_tail(&slabp->list, &l3->slabs_partial);
3419 } 3431 }
3420 } 3432 }
3421 } 3433 }
3422 3434
3423 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) 3435 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3424 { 3436 {
3425 int batchcount; 3437 int batchcount;
3426 struct kmem_list3 *l3; 3438 struct kmem_list3 *l3;
3427 int node = numa_node_id(); 3439 int node = numa_node_id();
3428 3440
3429 batchcount = ac->batchcount; 3441 batchcount = ac->batchcount;
3430 #if DEBUG 3442 #if DEBUG
3431 BUG_ON(!batchcount || batchcount > ac->avail); 3443 BUG_ON(!batchcount || batchcount > ac->avail);
3432 #endif 3444 #endif
3433 check_irq_off(); 3445 check_irq_off();
3434 l3 = cachep->nodelists[node]; 3446 l3 = cachep->nodelists[node];
3435 spin_lock(&l3->list_lock); 3447 spin_lock(&l3->list_lock);
3436 if (l3->shared) { 3448 if (l3->shared) {
3437 struct array_cache *shared_array = l3->shared; 3449 struct array_cache *shared_array = l3->shared;
3438 int max = shared_array->limit - shared_array->avail; 3450 int max = shared_array->limit - shared_array->avail;
3439 if (max) { 3451 if (max) {
3440 if (batchcount > max) 3452 if (batchcount > max)
3441 batchcount = max; 3453 batchcount = max;
3442 memcpy(&(shared_array->entry[shared_array->avail]), 3454 memcpy(&(shared_array->entry[shared_array->avail]),
3443 ac->entry, sizeof(void *) * batchcount); 3455 ac->entry, sizeof(void *) * batchcount);
3444 shared_array->avail += batchcount; 3456 shared_array->avail += batchcount;
3445 goto free_done; 3457 goto free_done;
3446 } 3458 }
3447 } 3459 }
3448 3460
3449 free_block(cachep, ac->entry, batchcount, node); 3461 free_block(cachep, ac->entry, batchcount, node);
3450 free_done: 3462 free_done:
3451 #if STATS 3463 #if STATS
3452 { 3464 {
3453 int i = 0; 3465 int i = 0;
3454 struct list_head *p; 3466 struct list_head *p;
3455 3467
3456 p = l3->slabs_free.next; 3468 p = l3->slabs_free.next;
3457 while (p != &(l3->slabs_free)) { 3469 while (p != &(l3->slabs_free)) {
3458 struct slab *slabp; 3470 struct slab *slabp;
3459 3471
3460 slabp = list_entry(p, struct slab, list); 3472 slabp = list_entry(p, struct slab, list);
3461 BUG_ON(slabp->inuse); 3473 BUG_ON(slabp->inuse);
3462 3474
3463 i++; 3475 i++;
3464 p = p->next; 3476 p = p->next;
3465 } 3477 }
3466 STATS_SET_FREEABLE(cachep, i); 3478 STATS_SET_FREEABLE(cachep, i);
3467 } 3479 }
3468 #endif 3480 #endif
3469 spin_unlock(&l3->list_lock); 3481 spin_unlock(&l3->list_lock);
3470 ac->avail -= batchcount; 3482 ac->avail -= batchcount;
3471 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); 3483 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3472 } 3484 }
3473 3485
3474 /* 3486 /*
3475 * Release an obj back to its cache. If the obj has a constructed state, it must 3487 * Release an obj back to its cache. If the obj has a constructed state, it must
3476 * be in this state _before_ it is released. Called with disabled ints. 3488 * be in this state _before_ it is released. Called with disabled ints.
3477 */ 3489 */
3478 static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3490 static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3479 { 3491 {
3480 struct array_cache *ac = cpu_cache_get(cachep); 3492 struct array_cache *ac = cpu_cache_get(cachep);
3481 3493
3482 check_irq_off(); 3494 check_irq_off();
3483 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3495 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3484 3496
3485 if (cache_free_alien(cachep, objp)) 3497 if (cache_free_alien(cachep, objp))
3486 return; 3498 return;
3487 3499
3488 if (likely(ac->avail < ac->limit)) { 3500 if (likely(ac->avail < ac->limit)) {
3489 STATS_INC_FREEHIT(cachep); 3501 STATS_INC_FREEHIT(cachep);
3490 ac->entry[ac->avail++] = objp; 3502 ac->entry[ac->avail++] = objp;
3491 return; 3503 return;
3492 } else { 3504 } else {
3493 STATS_INC_FREEMISS(cachep); 3505 STATS_INC_FREEMISS(cachep);
3494 cache_flusharray(cachep, ac); 3506 cache_flusharray(cachep, ac);
3495 ac->entry[ac->avail++] = objp; 3507 ac->entry[ac->avail++] = objp;
3496 } 3508 }
3497 } 3509 }
3498 3510
3499 /** 3511 /**
3500 * kmem_cache_alloc - Allocate an object 3512 * kmem_cache_alloc - Allocate an object
3501 * @cachep: The cache to allocate from. 3513 * @cachep: The cache to allocate from.
3502 * @flags: See kmalloc(). 3514 * @flags: See kmalloc().
3503 * 3515 *
3504 * Allocate an object from this cache. The flags are only relevant 3516 * Allocate an object from this cache. The flags are only relevant
3505 * if the cache has no available objects. 3517 * if the cache has no available objects.
3506 */ 3518 */
3507 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3519 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3508 { 3520 {
3509 return __cache_alloc(cachep, flags, __builtin_return_address(0)); 3521 return __cache_alloc(cachep, flags, __builtin_return_address(0));
3510 } 3522 }
3511 EXPORT_SYMBOL(kmem_cache_alloc); 3523 EXPORT_SYMBOL(kmem_cache_alloc);
3512 3524
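A caller's-eye sketch of the API documented above; struct foo and foo_cachep are illustrative names, and the kmem_cache_create() signature with separate ctor/dtor arguments is the one this kernel generation uses:

/* Sketch of typical cache usage; all names here are made up. */
struct foo {
	int a, b;
};

static struct kmem_cache *foo_cachep;

static int __init foo_init(void)
{
	foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
				       SLAB_HWCACHE_ALIGN, NULL, NULL);
	return foo_cachep ? 0 : -ENOMEM;
}

static struct foo *foo_get(void)
{
	return kmem_cache_alloc(foo_cachep, GFP_KERNEL);	/* may sleep */
}

static void foo_put(struct foo *f)
{
	kmem_cache_free(foo_cachep, f);	/* must go back to the same cache */
}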
3513 /** 3525 /**
3514 * kmem_cache_zalloc - Allocate an object. The memory is set to zero. 3526 * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
3515 * @cache: The cache to allocate from. 3527 * @cache: The cache to allocate from.
3516 * @flags: See kmalloc(). 3528 * @flags: See kmalloc().
3517 * 3529 *
3518 * Allocate an object from this cache and set the allocated memory to zero. 3530 * Allocate an object from this cache and set the allocated memory to zero.
3519 * The flags are only relevant if the cache has no available objects. 3531 * The flags are only relevant if the cache has no available objects.
3520 */ 3532 */
3521 void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags) 3533 void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
3522 { 3534 {
3523 void *ret = __cache_alloc(cache, flags, __builtin_return_address(0)); 3535 void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
3524 if (ret) 3536 if (ret)
3525 memset(ret, 0, obj_size(cache)); 3537 memset(ret, 0, obj_size(cache));
3526 return ret; 3538 return ret;
3527 } 3539 }
3528 EXPORT_SYMBOL(kmem_cache_zalloc); 3540 EXPORT_SYMBOL(kmem_cache_zalloc);
3529 3541
3530 /** 3542 /**
3531 * kmem_ptr_validate - check if an untrusted pointer might 3543 * kmem_ptr_validate - check if an untrusted pointer might
3532 * be a slab entry. 3544 * be a slab entry.
3533 * @cachep: the cache we're checking against 3545 * @cachep: the cache we're checking against
3534 * @ptr: pointer to validate 3546 * @ptr: pointer to validate
3535 * 3547 *
3536 * This verifies that the untrusted pointer looks sane: 3548 * This verifies that the untrusted pointer looks sane:
3537 * it is _not_ a guarantee that the pointer is actually 3549 * it is _not_ a guarantee that the pointer is actually
3538 * part of the slab cache in question, but it at least 3550 * part of the slab cache in question, but it at least
3539 * validates that the pointer can be dereferenced and 3551 * validates that the pointer can be dereferenced and
3540 * looks half-way sane. 3552 * looks half-way sane.
3541 * 3553 *
3542 * Currently only used for dentry validation. 3554 * Currently only used for dentry validation.
3543 */ 3555 */
3544 int fastcall kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) 3556 int fastcall kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3545 { 3557 {
3546 unsigned long addr = (unsigned long)ptr; 3558 unsigned long addr = (unsigned long)ptr;
3547 unsigned long min_addr = PAGE_OFFSET; 3559 unsigned long min_addr = PAGE_OFFSET;
3548 unsigned long align_mask = BYTES_PER_WORD - 1; 3560 unsigned long align_mask = BYTES_PER_WORD - 1;
3549 unsigned long size = cachep->buffer_size; 3561 unsigned long size = cachep->buffer_size;
3550 struct page *page; 3562 struct page *page;
3551 3563
3552 if (unlikely(addr < min_addr)) 3564 if (unlikely(addr < min_addr))
3553 goto out; 3565 goto out;
3554 if (unlikely(addr > (unsigned long)high_memory - size)) 3566 if (unlikely(addr > (unsigned long)high_memory - size))
3555 goto out; 3567 goto out;
3556 if (unlikely(addr & align_mask)) 3568 if (unlikely(addr & align_mask))
3557 goto out; 3569 goto out;
3558 if (unlikely(!kern_addr_valid(addr))) 3570 if (unlikely(!kern_addr_valid(addr)))
3559 goto out; 3571 goto out;
3560 if (unlikely(!kern_addr_valid(addr + size - 1))) 3572 if (unlikely(!kern_addr_valid(addr + size - 1)))
3561 goto out; 3573 goto out;
3562 page = virt_to_page(ptr); 3574 page = virt_to_page(ptr);
3563 if (unlikely(!PageSlab(page))) 3575 if (unlikely(!PageSlab(page)))
3564 goto out; 3576 goto out;
3565 if (unlikely(page_get_cache(page) != cachep)) 3577 if (unlikely(page_get_cache(page) != cachep))
3566 goto out; 3578 goto out;
3567 return 1; 3579 return 1;
3568 out: 3580 out:
3569 return 0; 3581 return 0;
3570 } 3582 }
3571 3583
3572 #ifdef CONFIG_NUMA 3584 #ifdef CONFIG_NUMA
3573 /** 3585 /**
3574 * kmem_cache_alloc_node - Allocate an object on the specified node 3586 * kmem_cache_alloc_node - Allocate an object on the specified node
3575 * @cachep: The cache to allocate from. 3587 * @cachep: The cache to allocate from.
3576 * @flags: See kmalloc(). 3588 * @flags: See kmalloc().
3577 * @nodeid: node number of the target node. 3589 * @nodeid: node number of the target node.
3578 * 3590 *
3579 * Identical to kmem_cache_alloc but it will allocate memory on the given 3591 * Identical to kmem_cache_alloc but it will allocate memory on the given
3580 * node, which can improve the performance for cpu bound structures. 3592 * node, which can improve the performance for cpu bound structures.
3581 * 3593 *
3582 * Fallback to other node is possible if __GFP_THISNODE is not set. 3594 * Fallback to other node is possible if __GFP_THISNODE is not set.
3583 */ 3595 */
3584 static __always_inline void * 3596 static __always_inline void *
3585 __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3597 __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3586 int nodeid, void *caller) 3598 int nodeid, void *caller)
3587 { 3599 {
3588 unsigned long save_flags; 3600 unsigned long save_flags;
3589 void *ptr = NULL; 3601 void *ptr = NULL;
3590 3602
3591 cache_alloc_debugcheck_before(cachep, flags); 3603 cache_alloc_debugcheck_before(cachep, flags);
3592 local_irq_save(save_flags); 3604 local_irq_save(save_flags);
3593 3605
3594 if (unlikely(nodeid == -1)) 3606 if (unlikely(nodeid == -1))
3595 nodeid = numa_node_id(); 3607 nodeid = numa_node_id();
3596 3608
3597 if (likely(cachep->nodelists[nodeid])) { 3609 if (likely(cachep->nodelists[nodeid])) {
3598 if (nodeid == numa_node_id()) { 3610 if (nodeid == numa_node_id()) {
3599 /* 3611 /*
3600 * Use the locally cached objects if possible. 3612 * Use the locally cached objects if possible.
3601 * However ____cache_alloc does not allow fallback 3613 * However ____cache_alloc does not allow fallback
3602 * to other nodes. It may fail while we still have 3614 * to other nodes. It may fail while we still have
3603 * objects on other nodes available. 3615 * objects on other nodes available.
3604 */ 3616 */
3605 ptr = ____cache_alloc(cachep, flags); 3617 ptr = ____cache_alloc(cachep, flags);
3606 } 3618 }
3607 if (!ptr) { 3619 if (!ptr) {
3608 /* ____cache_alloc_node can fall back to other nodes */ 3620 /* ____cache_alloc_node can fall back to other nodes */
3609 ptr = ____cache_alloc_node(cachep, flags, nodeid); 3621 ptr = ____cache_alloc_node(cachep, flags, nodeid);
3610 } 3622 }
3611 } else { 3623 } else {
3612 /* Node not bootstrapped yet */ 3624 /* Node not bootstrapped yet */
3613 if (!(flags & __GFP_THISNODE)) 3625 if (!(flags & __GFP_THISNODE))
3614 ptr = fallback_alloc(cachep, flags); 3626 ptr = fallback_alloc(cachep, flags);
3615 } 3627 }
3616 3628
3617 local_irq_restore(save_flags); 3629 local_irq_restore(save_flags);
3618 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3630 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3619 3631
3620 return ptr; 3632 return ptr;
3621 } 3633 }
3622 3634
3623 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3635 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3624 { 3636 {
3625 return __cache_alloc_node(cachep, flags, nodeid, 3637 return __cache_alloc_node(cachep, flags, nodeid,
3626 __builtin_return_address(0)); 3638 __builtin_return_address(0));
3627 } 3639 }
3628 EXPORT_SYMBOL(kmem_cache_alloc_node); 3640 EXPORT_SYMBOL(kmem_cache_alloc_node);
3629 3641
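A short sketch of the node-aware variant documented above, allocating one object per online node close to the CPUs that will touch it; foo_cachep is the illustrative cache from the earlier sketch:

/* Sketch only: per-node allocation with kmem_cache_alloc_node(). */
static void *foo_per_node[MAX_NUMNODES];

static int foo_alloc_per_node(void)
{
	int node;

	for_each_online_node(node) {
		foo_per_node[node] = kmem_cache_alloc_node(foo_cachep,
							   GFP_KERNEL, node);
		if (!foo_per_node[node])
			return -ENOMEM;
	}
	return 0;
}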
3630 static __always_inline void * 3642 static __always_inline void *
3631 __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) 3643 __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3632 { 3644 {
3633 struct kmem_cache *cachep; 3645 struct kmem_cache *cachep;
3634 3646
3635 cachep = kmem_find_general_cachep(size, flags); 3647 cachep = kmem_find_general_cachep(size, flags);
3636 if (unlikely(cachep == NULL)) 3648 if (unlikely(cachep == NULL))
3637 return NULL; 3649 return NULL;
3638 return kmem_cache_alloc_node(cachep, flags, node); 3650 return kmem_cache_alloc_node(cachep, flags, node);
3639 } 3651 }
3640 3652
3641 #ifdef CONFIG_DEBUG_SLAB 3653 #ifdef CONFIG_DEBUG_SLAB
3642 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3654 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3643 { 3655 {
3644 return __do_kmalloc_node(size, flags, node, 3656 return __do_kmalloc_node(size, flags, node,
3645 __builtin_return_address(0)); 3657 __builtin_return_address(0));
3646 } 3658 }
3647 EXPORT_SYMBOL(__kmalloc_node); 3659 EXPORT_SYMBOL(__kmalloc_node);
3648 3660
3649 void *__kmalloc_node_track_caller(size_t size, gfp_t flags, 3661 void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3650 int node, void *caller) 3662 int node, void *caller)
3651 { 3663 {
3652 return __do_kmalloc_node(size, flags, node, caller); 3664 return __do_kmalloc_node(size, flags, node, caller);
3653 } 3665 }
3654 EXPORT_SYMBOL(__kmalloc_node_track_caller); 3666 EXPORT_SYMBOL(__kmalloc_node_track_caller);
3655 #else 3667 #else
3656 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3668 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3657 { 3669 {
3658 return __do_kmalloc_node(size, flags, node, NULL); 3670 return __do_kmalloc_node(size, flags, node, NULL);
3659 } 3671 }
3660 EXPORT_SYMBOL(__kmalloc_node); 3672 EXPORT_SYMBOL(__kmalloc_node);
3661 #endif /* CONFIG_DEBUG_SLAB */ 3673 #endif /* CONFIG_DEBUG_SLAB */
3662 #endif /* CONFIG_NUMA */ 3674 #endif /* CONFIG_NUMA */
3663 3675
3664 /** 3676 /**
3665 * __do_kmalloc - allocate memory 3677 * __do_kmalloc - allocate memory
3666 * @size: how many bytes of memory are required. 3678 * @size: how many bytes of memory are required.
3667 * @flags: the type of memory to allocate (see kmalloc). 3679 * @flags: the type of memory to allocate (see kmalloc).
3668 * @caller: function caller for debug tracking of the caller 3680 * @caller: function caller for debug tracking of the caller
3669 */ 3681 */
3670 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3682 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3671 void *caller) 3683 void *caller)
3672 { 3684 {
3673 struct kmem_cache *cachep; 3685 struct kmem_cache *cachep;
3674 3686
3675 /* If you want to save a few bytes of .text space: replace 3687 /* If you want to save a few bytes of .text space: replace
3676 * __ with kmem_. 3688 * __ with kmem_.
3677 * Then kmalloc uses the uninlined functions instead of the inline 3689 * Then kmalloc uses the uninlined functions instead of the inline
3678 * functions. 3690 * functions.
3679 */ 3691 */
3680 cachep = __find_general_cachep(size, flags); 3692 cachep = __find_general_cachep(size, flags);
3681 if (unlikely(cachep == NULL)) 3693 if (unlikely(cachep == NULL))
3682 return NULL; 3694 return NULL;
3683 return __cache_alloc(cachep, flags, caller); 3695 return __cache_alloc(cachep, flags, caller);
3684 } 3696 }
3685 3697
3686 3698
3687 #ifdef CONFIG_DEBUG_SLAB 3699 #ifdef CONFIG_DEBUG_SLAB
3688 void *__kmalloc(size_t size, gfp_t flags) 3700 void *__kmalloc(size_t size, gfp_t flags)
3689 { 3701 {
3690 return __do_kmalloc(size, flags, __builtin_return_address(0)); 3702 return __do_kmalloc(size, flags, __builtin_return_address(0));
3691 } 3703 }
3692 EXPORT_SYMBOL(__kmalloc); 3704 EXPORT_SYMBOL(__kmalloc);
3693 3705
3694 void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) 3706 void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3695 { 3707 {
3696 return __do_kmalloc(size, flags, caller); 3708 return __do_kmalloc(size, flags, caller);
3697 } 3709 }
3698 EXPORT_SYMBOL(__kmalloc_track_caller); 3710 EXPORT_SYMBOL(__kmalloc_track_caller);
3699 3711
3700 #else 3712 #else
3701 void *__kmalloc(size_t size, gfp_t flags) 3713 void *__kmalloc(size_t size, gfp_t flags)
3702 { 3714 {
3703 return __do_kmalloc(size, flags, NULL); 3715 return __do_kmalloc(size, flags, NULL);
3704 } 3716 }
3705 EXPORT_SYMBOL(__kmalloc); 3717 EXPORT_SYMBOL(__kmalloc);
3706 #endif 3718 #endif
3707 3719
3708 /** 3720 /**
3709 * kmem_cache_free - Deallocate an object 3721 * kmem_cache_free - Deallocate an object
3710 * @cachep: The cache the allocation was from. 3722 * @cachep: The cache the allocation was from.
3711 * @objp: The previously allocated object. 3723 * @objp: The previously allocated object.
3712 * 3724 *
3713 * Free an object which was previously allocated from this 3725 * Free an object which was previously allocated from this
3714 * cache. 3726 * cache.
3715 */ 3727 */
3716 void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3728 void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3717 { 3729 {
3718 unsigned long flags; 3730 unsigned long flags;
3719 3731
3720 BUG_ON(virt_to_cache(objp) != cachep); 3732 BUG_ON(virt_to_cache(objp) != cachep);
3721 3733
3722 local_irq_save(flags); 3734 local_irq_save(flags);
3723 __cache_free(cachep, objp); 3735 __cache_free(cachep, objp);
3724 local_irq_restore(flags); 3736 local_irq_restore(flags);
3725 } 3737 }
3726 EXPORT_SYMBOL(kmem_cache_free); 3738 EXPORT_SYMBOL(kmem_cache_free);
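Editor's note: kmem_cache_alloc()/kmem_cache_free() above form the dedicated-cache half of the allocator API. A minimal usage sketch under 2.6.20-era assumptions follows; struct foo, the cache name and the flags are illustrative and not part of this patch.

    #include <linux/init.h>
    #include <linux/errno.h>
    #include <linux/slab.h>

    /* Illustrative object type and cache; not part of the patch. */
    struct foo {
            int a;
            int b;
    };

    static struct kmem_cache *foo_cache;

    static int __init foo_init(void)
    {
            /* 2.6.20-era signature: name, size, align, flags, ctor, dtor */
            foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
                                          0, SLAB_HWCACHE_ALIGN, NULL, NULL);
            return foo_cache ? 0 : -ENOMEM;
    }

    static void foo_example(void)
    {
            struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);

            if (!f)
                    return;
            f->a = 1;
            f->b = 2;
            /* Must go back to the cache it came from (see the BUG_ON above). */
            kmem_cache_free(foo_cache, f);
    }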
3727 3739
3728 /** 3740 /**
3729 * kfree - free previously allocated memory 3741 * kfree - free previously allocated memory
3730 * @objp: pointer returned by kmalloc. 3742 * @objp: pointer returned by kmalloc.
3731 * 3743 *
3732 * If @objp is NULL, no operation is performed. 3744 * If @objp is NULL, no operation is performed.
3733 * 3745 *
3734 * Don't free memory not originally allocated by kmalloc() 3746 * Don't free memory not originally allocated by kmalloc()
3735 * or you will run into trouble. 3747 * or you will run into trouble.
3736 */ 3748 */
3737 void kfree(const void *objp) 3749 void kfree(const void *objp)
3738 { 3750 {
3739 struct kmem_cache *c; 3751 struct kmem_cache *c;
3740 unsigned long flags; 3752 unsigned long flags;
3741 3753
3742 if (unlikely(!objp)) 3754 if (unlikely(!objp))
3743 return; 3755 return;
3744 local_irq_save(flags); 3756 local_irq_save(flags);
3745 kfree_debugcheck(objp); 3757 kfree_debugcheck(objp);
3746 c = virt_to_cache(objp); 3758 c = virt_to_cache(objp);
3747 debug_check_no_locks_freed(objp, obj_size(c)); 3759 debug_check_no_locks_freed(objp, obj_size(c));
3748 __cache_free(c, (void *)objp); 3760 __cache_free(c, (void *)objp);
3749 local_irq_restore(flags); 3761 local_irq_restore(flags);
3750 } 3762 }
3751 EXPORT_SYMBOL(kfree); 3763 EXPORT_SYMBOL(kfree);
3752 3764
3753 unsigned int kmem_cache_size(struct kmem_cache *cachep) 3765 unsigned int kmem_cache_size(struct kmem_cache *cachep)
3754 { 3766 {
3755 return obj_size(cachep); 3767 return obj_size(cachep);
3756 } 3768 }
3757 EXPORT_SYMBOL(kmem_cache_size); 3769 EXPORT_SYMBOL(kmem_cache_size);
3758 3770
3759 const char *kmem_cache_name(struct kmem_cache *cachep) 3771 const char *kmem_cache_name(struct kmem_cache *cachep)
3760 { 3772 {
3761 return cachep->name; 3773 return cachep->name;
3762 } 3774 }
3763 EXPORT_SYMBOL_GPL(kmem_cache_name); 3775 EXPORT_SYMBOL_GPL(kmem_cache_name);
3764 3776
3765 /* 3777 /*
3766 * This initializes kmem_list3 or resizes various caches for all nodes. 3778 * This initializes kmem_list3 or resizes various caches for all nodes.
3767 */ 3779 */
3768 static int alloc_kmemlist(struct kmem_cache *cachep) 3780 static int alloc_kmemlist(struct kmem_cache *cachep)
3769 { 3781 {
3770 int node; 3782 int node;
3771 struct kmem_list3 *l3; 3783 struct kmem_list3 *l3;
3772 struct array_cache *new_shared; 3784 struct array_cache *new_shared;
3773 struct array_cache **new_alien = NULL; 3785 struct array_cache **new_alien = NULL;
3774 3786
3775 for_each_online_node(node) { 3787 for_each_online_node(node) {
3776 3788
3777 if (use_alien_caches) { 3789 if (use_alien_caches) {
3778 new_alien = alloc_alien_cache(node, cachep->limit); 3790 new_alien = alloc_alien_cache(node, cachep->limit);
3779 if (!new_alien) 3791 if (!new_alien)
3780 goto fail; 3792 goto fail;
3781 } 3793 }
3782 3794
3783 new_shared = alloc_arraycache(node, 3795 new_shared = alloc_arraycache(node,
3784 cachep->shared*cachep->batchcount, 3796 cachep->shared*cachep->batchcount,
3785 0xbaadf00d); 3797 0xbaadf00d);
3786 if (!new_shared) { 3798 if (!new_shared) {
3787 free_alien_cache(new_alien); 3799 free_alien_cache(new_alien);
3788 goto fail; 3800 goto fail;
3789 } 3801 }
3790 3802
3791 l3 = cachep->nodelists[node]; 3803 l3 = cachep->nodelists[node];
3792 if (l3) { 3804 if (l3) {
3793 struct array_cache *shared = l3->shared; 3805 struct array_cache *shared = l3->shared;
3794 3806
3795 spin_lock_irq(&l3->list_lock); 3807 spin_lock_irq(&l3->list_lock);
3796 3808
3797 if (shared) 3809 if (shared)
3798 free_block(cachep, shared->entry, 3810 free_block(cachep, shared->entry,
3799 shared->avail, node); 3811 shared->avail, node);
3800 3812
3801 l3->shared = new_shared; 3813 l3->shared = new_shared;
3802 if (!l3->alien) { 3814 if (!l3->alien) {
3803 l3->alien = new_alien; 3815 l3->alien = new_alien;
3804 new_alien = NULL; 3816 new_alien = NULL;
3805 } 3817 }
3806 l3->free_limit = (1 + nr_cpus_node(node)) * 3818 l3->free_limit = (1 + nr_cpus_node(node)) *
3807 cachep->batchcount + cachep->num; 3819 cachep->batchcount + cachep->num;
3808 spin_unlock_irq(&l3->list_lock); 3820 spin_unlock_irq(&l3->list_lock);
3809 kfree(shared); 3821 kfree(shared);
3810 free_alien_cache(new_alien); 3822 free_alien_cache(new_alien);
3811 continue; 3823 continue;
3812 } 3824 }
3813 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); 3825 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3814 if (!l3) { 3826 if (!l3) {
3815 free_alien_cache(new_alien); 3827 free_alien_cache(new_alien);
3816 kfree(new_shared); 3828 kfree(new_shared);
3817 goto fail; 3829 goto fail;
3818 } 3830 }
3819 3831
3820 kmem_list3_init(l3); 3832 kmem_list3_init(l3);
3821 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3833 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3822 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 3834 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3823 l3->shared = new_shared; 3835 l3->shared = new_shared;
3824 l3->alien = new_alien; 3836 l3->alien = new_alien;
3825 l3->free_limit = (1 + nr_cpus_node(node)) * 3837 l3->free_limit = (1 + nr_cpus_node(node)) *
3826 cachep->batchcount + cachep->num; 3838 cachep->batchcount + cachep->num;
3827 cachep->nodelists[node] = l3; 3839 cachep->nodelists[node] = l3;
3828 } 3840 }
3829 return 0; 3841 return 0;
3830 3842
3831 fail: 3843 fail:
3832 if (!cachep->next.next) { 3844 if (!cachep->next.next) {
3833 /* Cache is not active yet. Roll back what we did */ 3845 /* Cache is not active yet. Roll back what we did */
3834 node--; 3846 node--;
3835 while (node >= 0) { 3847 while (node >= 0) {
3836 if (cachep->nodelists[node]) { 3848 if (cachep->nodelists[node]) {
3837 l3 = cachep->nodelists[node]; 3849 l3 = cachep->nodelists[node];
3838 3850
3839 kfree(l3->shared); 3851 kfree(l3->shared);
3840 free_alien_cache(l3->alien); 3852 free_alien_cache(l3->alien);
3841 kfree(l3); 3853 kfree(l3);
3842 cachep->nodelists[node] = NULL; 3854 cachep->nodelists[node] = NULL;
3843 } 3855 }
3844 node--; 3856 node--;
3845 } 3857 }
3846 } 3858 }
3847 return -ENOMEM; 3859 return -ENOMEM;
3848 } 3860 }
3849 3861
3850 struct ccupdate_struct { 3862 struct ccupdate_struct {
3851 struct kmem_cache *cachep; 3863 struct kmem_cache *cachep;
3852 struct array_cache *new[NR_CPUS]; 3864 struct array_cache *new[NR_CPUS];
3853 }; 3865 };
3854 3866
3855 static void do_ccupdate_local(void *info) 3867 static void do_ccupdate_local(void *info)
3856 { 3868 {
3857 struct ccupdate_struct *new = info; 3869 struct ccupdate_struct *new = info;
3858 struct array_cache *old; 3870 struct array_cache *old;
3859 3871
3860 check_irq_off(); 3872 check_irq_off();
3861 old = cpu_cache_get(new->cachep); 3873 old = cpu_cache_get(new->cachep);
3862 3874
3863 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3875 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3864 new->new[smp_processor_id()] = old; 3876 new->new[smp_processor_id()] = old;
3865 } 3877 }
3866 3878
3867 /* Always called with the cache_chain_mutex held */ 3879 /* Always called with the cache_chain_mutex held */
3868 static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3880 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3869 int batchcount, int shared) 3881 int batchcount, int shared)
3870 { 3882 {
3871 struct ccupdate_struct *new; 3883 struct ccupdate_struct *new;
3872 int i; 3884 int i;
3873 3885
3874 new = kzalloc(sizeof(*new), GFP_KERNEL); 3886 new = kzalloc(sizeof(*new), GFP_KERNEL);
3875 if (!new) 3887 if (!new)
3876 return -ENOMEM; 3888 return -ENOMEM;
3877 3889
3878 for_each_online_cpu(i) { 3890 for_each_online_cpu(i) {
3879 new->new[i] = alloc_arraycache(cpu_to_node(i), limit, 3891 new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3880 batchcount); 3892 batchcount);
3881 if (!new->new[i]) { 3893 if (!new->new[i]) {
3882 for (i--; i >= 0; i--) 3894 for (i--; i >= 0; i--)
3883 kfree(new->new[i]); 3895 kfree(new->new[i]);
3884 kfree(new); 3896 kfree(new);
3885 return -ENOMEM; 3897 return -ENOMEM;
3886 } 3898 }
3887 } 3899 }
3888 new->cachep = cachep; 3900 new->cachep = cachep;
3889 3901
3890 on_each_cpu(do_ccupdate_local, (void *)new, 1, 1); 3902 on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
3891 3903
3892 check_irq_on(); 3904 check_irq_on();
3893 cachep->batchcount = batchcount; 3905 cachep->batchcount = batchcount;
3894 cachep->limit = limit; 3906 cachep->limit = limit;
3895 cachep->shared = shared; 3907 cachep->shared = shared;
3896 3908
3897 for_each_online_cpu(i) { 3909 for_each_online_cpu(i) {
3898 struct array_cache *ccold = new->new[i]; 3910 struct array_cache *ccold = new->new[i];
3899 if (!ccold) 3911 if (!ccold)
3900 continue; 3912 continue;
3901 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3913 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3902 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); 3914 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
3903 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3915 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3904 kfree(ccold); 3916 kfree(ccold);
3905 } 3917 }
3906 kfree(new); 3918 kfree(new);
3907 return alloc_kmemlist(cachep); 3919 return alloc_kmemlist(cachep);
3908 } 3920 }
3909 3921
3910 /* Always called with the cache_chain_mutex held */ 3922 /* Always called with the cache_chain_mutex held */
3911 static int enable_cpucache(struct kmem_cache *cachep) 3923 static int enable_cpucache(struct kmem_cache *cachep)
3912 { 3924 {
3913 int err; 3925 int err;
3914 int limit, shared; 3926 int limit, shared;
3915 3927
3916 /* 3928 /*
3917 * The head array serves three purposes: 3929 * The head array serves three purposes:
3918 * - create a LIFO ordering, i.e. return objects that are cache-warm 3930 * - create a LIFO ordering, i.e. return objects that are cache-warm
3919 * - reduce the number of spinlock operations. 3931 * - reduce the number of spinlock operations.
3920 * - reduce the number of linked list operations on the slab and 3932 * - reduce the number of linked list operations on the slab and
3921 * bufctl chains: array operations are cheaper. 3933 * bufctl chains: array operations are cheaper.
3922 * The numbers are guessed; we should auto-tune as described by 3934 * The numbers are guessed; we should auto-tune as described by
3923 * Bonwick. 3935 * Bonwick.
3924 */ 3936 */
3925 if (cachep->buffer_size > 131072) 3937 if (cachep->buffer_size > 131072)
3926 limit = 1; 3938 limit = 1;
3927 else if (cachep->buffer_size > PAGE_SIZE) 3939 else if (cachep->buffer_size > PAGE_SIZE)
3928 limit = 8; 3940 limit = 8;
3929 else if (cachep->buffer_size > 1024) 3941 else if (cachep->buffer_size > 1024)
3930 limit = 24; 3942 limit = 24;
3931 else if (cachep->buffer_size > 256) 3943 else if (cachep->buffer_size > 256)
3932 limit = 54; 3944 limit = 54;
3933 else 3945 else
3934 limit = 120; 3946 limit = 120;
3935 3947
3936 /* 3948 /*
3937 * CPU bound tasks (e.g. network routing) can exhibit cpu bound 3949 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
3938 * allocation behaviour: Most allocs on one cpu, most free operations 3950 * allocation behaviour: Most allocs on one cpu, most free operations
3939 * on another cpu. For these cases, efficient object passing between 3951 * on another cpu. For these cases, efficient object passing between
3940 * cpus is necessary. This is provided by a shared array. The array 3952 * cpus is necessary. This is provided by a shared array. The array
3941 * replaces Bonwick's magazine layer. 3953 * replaces Bonwick's magazine layer.
3942 * On uniprocessor, it's functionally equivalent (but less efficient) 3954 * On uniprocessor, it's functionally equivalent (but less efficient)
3943 * to a larger limit. Thus disabled by default. 3955 * to a larger limit. Thus disabled by default.
3944 */ 3956 */
3945 shared = 0; 3957 shared = 0;
3946 #ifdef CONFIG_SMP 3958 #ifdef CONFIG_SMP
3947 if (cachep->buffer_size <= PAGE_SIZE) 3959 if (cachep->buffer_size <= PAGE_SIZE)
3948 shared = 8; 3960 shared = 8;
3949 #endif 3961 #endif
3950 3962
3951 #if DEBUG 3963 #if DEBUG
3952 /* 3964 /*
3953 * With debugging enabled, a large batchcount leads to excessively long 3965 * With debugging enabled, a large batchcount leads to excessively long
3954 * periods with local interrupts disabled. Limit the batchcount. 3966 * periods with local interrupts disabled. Limit the batchcount.
3955 */ 3967 */
3956 if (limit > 32) 3968 if (limit > 32)
3957 limit = 32; 3969 limit = 32;
3958 #endif 3970 #endif
3959 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); 3971 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
3960 if (err) 3972 if (err)
3961 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3973 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3962 cachep->name, -err); 3974 cachep->name, -err);
3963 return err; 3975 return err;
3964 } 3976 }
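Editor's note, to make the heuristic above concrete (assuming 4 KiB pages): a cache of 512-byte objects falls into the "> 256" bucket, so limit = 54, batchcount = (54 + 1) / 2 = 27, and shared = 8 on SMP because 512 <= PAGE_SIZE; a cache of 8 KiB objects gets limit = 8, batchcount = 4 and shared = 0. With DEBUG the 512-byte case is clamped to limit = 32, giving batchcount = 16.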
3965 3977
3966 /* 3978 /*
3967 * Drain an array if it contains any elements taking the l3 lock only if 3979 * Drain an array if it contains any elements taking the l3 lock only if
3968 * necessary. Note that the l3 listlock also protects the array_cache 3980 * necessary. Note that the l3 listlock also protects the array_cache
3969 * if drain_array() is used on the shared array. 3981 * if drain_array() is used on the shared array.
3970 */ 3982 */
3971 void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 3983 void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3972 struct array_cache *ac, int force, int node) 3984 struct array_cache *ac, int force, int node)
3973 { 3985 {
3974 int tofree; 3986 int tofree;
3975 3987
3976 if (!ac || !ac->avail) 3988 if (!ac || !ac->avail)
3977 return; 3989 return;
3978 if (ac->touched && !force) { 3990 if (ac->touched && !force) {
3979 ac->touched = 0; 3991 ac->touched = 0;
3980 } else { 3992 } else {
3981 spin_lock_irq(&l3->list_lock); 3993 spin_lock_irq(&l3->list_lock);
3982 if (ac->avail) { 3994 if (ac->avail) {
3983 tofree = force ? ac->avail : (ac->limit + 4) / 5; 3995 tofree = force ? ac->avail : (ac->limit + 4) / 5;
3984 if (tofree > ac->avail) 3996 if (tofree > ac->avail)
3985 tofree = (ac->avail + 1) / 2; 3997 tofree = (ac->avail + 1) / 2;
3986 free_block(cachep, ac->entry, tofree, node); 3998 free_block(cachep, ac->entry, tofree, node);
3987 ac->avail -= tofree; 3999 ac->avail -= tofree;
3988 memmove(ac->entry, &(ac->entry[tofree]), 4000 memmove(ac->entry, &(ac->entry[tofree]),
3989 sizeof(void *) * ac->avail); 4001 sizeof(void *) * ac->avail);
3990 } 4002 }
3991 spin_unlock_irq(&l3->list_lock); 4003 spin_unlock_irq(&l3->list_lock);
3992 } 4004 }
3993 } 4005 }
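Editor's note, in numbers: when not forced, roughly a fifth of the array's capacity is freed per pass, e.g. ac->limit = 120 gives tofree = (120 + 4) / 5 = 24; if only 10 objects are available, the clamp reduces that to (10 + 1) / 2 = 5. With force set, tofree = ac->avail and the array is drained completely.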
3994 4006
3995 /** 4007 /**
3996 * cache_reap - Reclaim memory from caches. 4008 * cache_reap - Reclaim memory from caches.
3997 * @unused: unused parameter 4009 * @unused: unused parameter
3998 * 4010 *
3999 * Called from workqueue/eventd every few seconds. 4011 * Called from workqueue/eventd every few seconds.
4000 * Purpose: 4012 * Purpose:
4001 * - clear the per-cpu caches for this CPU. 4013 * - clear the per-cpu caches for this CPU.
4002 * - return freeable pages to the main free memory pool. 4014 * - return freeable pages to the main free memory pool.
4003 * 4015 *
4004 * If we cannot acquire the cache chain mutex then just give up - we'll try 4016 * If we cannot acquire the cache chain mutex then just give up - we'll try
4005 * again on the next iteration. 4017 * again on the next iteration.
4006 */ 4018 */
4007 static void cache_reap(struct work_struct *unused) 4019 static void cache_reap(struct work_struct *unused)
4008 { 4020 {
4009 struct kmem_cache *searchp; 4021 struct kmem_cache *searchp;
4010 struct kmem_list3 *l3; 4022 struct kmem_list3 *l3;
4011 int node = numa_node_id(); 4023 int node = numa_node_id();
4012 4024
4013 if (!mutex_trylock(&cache_chain_mutex)) { 4025 if (!mutex_trylock(&cache_chain_mutex)) {
4014 /* Give up. Set up the next iteration. */ 4026 /* Give up. Set up the next iteration. */
4015 schedule_delayed_work(&__get_cpu_var(reap_work), 4027 schedule_delayed_work(&__get_cpu_var(reap_work),
4016 round_jiffies_relative(REAPTIMEOUT_CPUC)); 4028 round_jiffies_relative(REAPTIMEOUT_CPUC));
4017 return; 4029 return;
4018 } 4030 }
4019 4031
4020 list_for_each_entry(searchp, &cache_chain, next) { 4032 list_for_each_entry(searchp, &cache_chain, next) {
4021 check_irq_on(); 4033 check_irq_on();
4022 4034
4023 /* 4035 /*
4024 * We only take the l3 lock if absolutely necessary and we 4036 * We only take the l3 lock if absolutely necessary and we
4025 * have established with reasonable certainty that 4037 * have established with reasonable certainty that
4026 * we can do some work if the lock was obtained. 4038 * we can do some work if the lock was obtained.
4027 */ 4039 */
4028 l3 = searchp->nodelists[node]; 4040 l3 = searchp->nodelists[node];
4029 4041
4030 reap_alien(searchp, l3); 4042 reap_alien(searchp, l3);
4031 4043
4032 drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); 4044 drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
4033 4045
4034 /* 4046 /*
4035 * These are racy checks but it does not matter 4047 * These are racy checks but it does not matter
4036 * if we skip one check or scan twice. 4048 * if we skip one check or scan twice.
4037 */ 4049 */
4038 if (time_after(l3->next_reap, jiffies)) 4050 if (time_after(l3->next_reap, jiffies))
4039 goto next; 4051 goto next;
4040 4052
4041 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 4053 l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
4042 4054
4043 drain_array(searchp, l3, l3->shared, 0, node); 4055 drain_array(searchp, l3, l3->shared, 0, node);
4044 4056
4045 if (l3->free_touched) 4057 if (l3->free_touched)
4046 l3->free_touched = 0; 4058 l3->free_touched = 0;
4047 else { 4059 else {
4048 int freed; 4060 int freed;
4049 4061
4050 freed = drain_freelist(searchp, l3, (l3->free_limit + 4062 freed = drain_freelist(searchp, l3, (l3->free_limit +
4051 5 * searchp->num - 1) / (5 * searchp->num)); 4063 5 * searchp->num - 1) / (5 * searchp->num));
4052 STATS_ADD_REAPED(searchp, freed); 4064 STATS_ADD_REAPED(searchp, freed);
4053 } 4065 }
4054 next: 4066 next:
4055 cond_resched(); 4067 cond_resched();
4056 } 4068 }
4057 check_irq_on(); 4069 check_irq_on();
4058 mutex_unlock(&cache_chain_mutex); 4070 mutex_unlock(&cache_chain_mutex);
4059 next_reap_node(); 4071 next_reap_node();
4060 refresh_cpu_vm_stats(smp_processor_id()); 4072 refresh_cpu_vm_stats(smp_processor_id());
4061 /* Set up the next iteration */ 4073 /* Set up the next iteration */
4062 schedule_delayed_work(&__get_cpu_var(reap_work), 4074 schedule_delayed_work(&__get_cpu_var(reap_work),
4063 round_jiffies_relative(REAPTIMEOUT_CPUC)); 4075 round_jiffies_relative(REAPTIMEOUT_CPUC));
4064 } 4076 }
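Editor's note: the drain_freelist() target above works out to ceil(free_limit / (5 * num)) slabs, i.e. about a fifth of the node's free limit per reap interval, expressed in slabs. For instance, with free_limit = 244 and num = 30 objects per slab, (244 + 150 - 1) / 150 = 2 slabs are scanned for freeing on this pass.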
4065 4077
4066 #ifdef CONFIG_PROC_FS 4078 #ifdef CONFIG_PROC_FS
4067 4079
4068 static void print_slabinfo_header(struct seq_file *m) 4080 static void print_slabinfo_header(struct seq_file *m)
4069 { 4081 {
4070 /* 4082 /*
4071 * Output format version, so at least we can change it 4083 * Output format version, so at least we can change it
4072 * without _too_ many complaints. 4084 * without _too_ many complaints.
4073 */ 4085 */
4074 #if STATS 4086 #if STATS
4075 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 4087 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
4076 #else 4088 #else
4077 seq_puts(m, "slabinfo - version: 2.1\n"); 4089 seq_puts(m, "slabinfo - version: 2.1\n");
4078 #endif 4090 #endif
4079 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 4091 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
4080 "<objperslab> <pagesperslab>"); 4092 "<objperslab> <pagesperslab>");
4081 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 4093 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4082 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 4094 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4083 #if STATS 4095 #if STATS
4084 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " 4096 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
4085 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); 4097 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
4086 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 4098 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
4087 #endif 4099 #endif
4088 seq_putc(m, '\n'); 4100 seq_putc(m, '\n');
4089 } 4101 }
4090 4102
4091 static void *s_start(struct seq_file *m, loff_t *pos) 4103 static void *s_start(struct seq_file *m, loff_t *pos)
4092 { 4104 {
4093 loff_t n = *pos; 4105 loff_t n = *pos;
4094 struct list_head *p; 4106 struct list_head *p;
4095 4107
4096 mutex_lock(&cache_chain_mutex); 4108 mutex_lock(&cache_chain_mutex);
4097 if (!n) 4109 if (!n)
4098 print_slabinfo_header(m); 4110 print_slabinfo_header(m);
4099 p = cache_chain.next; 4111 p = cache_chain.next;
4100 while (n--) { 4112 while (n--) {
4101 p = p->next; 4113 p = p->next;
4102 if (p == &cache_chain) 4114 if (p == &cache_chain)
4103 return NULL; 4115 return NULL;
4104 } 4116 }
4105 return list_entry(p, struct kmem_cache, next); 4117 return list_entry(p, struct kmem_cache, next);
4106 } 4118 }
4107 4119
4108 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 4120 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4109 { 4121 {
4110 struct kmem_cache *cachep = p; 4122 struct kmem_cache *cachep = p;
4111 ++*pos; 4123 ++*pos;
4112 return cachep->next.next == &cache_chain ? 4124 return cachep->next.next == &cache_chain ?
4113 NULL : list_entry(cachep->next.next, struct kmem_cache, next); 4125 NULL : list_entry(cachep->next.next, struct kmem_cache, next);
4114 } 4126 }
4115 4127
4116 static void s_stop(struct seq_file *m, void *p) 4128 static void s_stop(struct seq_file *m, void *p)
4117 { 4129 {
4118 mutex_unlock(&cache_chain_mutex); 4130 mutex_unlock(&cache_chain_mutex);
4119 } 4131 }
4120 4132
4121 static int s_show(struct seq_file *m, void *p) 4133 static int s_show(struct seq_file *m, void *p)
4122 { 4134 {
4123 struct kmem_cache *cachep = p; 4135 struct kmem_cache *cachep = p;
4124 struct slab *slabp; 4136 struct slab *slabp;
4125 unsigned long active_objs; 4137 unsigned long active_objs;
4126 unsigned long num_objs; 4138 unsigned long num_objs;
4127 unsigned long active_slabs = 0; 4139 unsigned long active_slabs = 0;
4128 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 4140 unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4129 const char *name; 4141 const char *name;
4130 char *error = NULL; 4142 char *error = NULL;
4131 int node; 4143 int node;
4132 struct kmem_list3 *l3; 4144 struct kmem_list3 *l3;
4133 4145
4134 active_objs = 0; 4146 active_objs = 0;
4135 num_slabs = 0; 4147 num_slabs = 0;
4136 for_each_online_node(node) { 4148 for_each_online_node(node) {
4137 l3 = cachep->nodelists[node]; 4149 l3 = cachep->nodelists[node];
4138 if (!l3) 4150 if (!l3)
4139 continue; 4151 continue;
4140 4152
4141 check_irq_on(); 4153 check_irq_on();
4142 spin_lock_irq(&l3->list_lock); 4154 spin_lock_irq(&l3->list_lock);
4143 4155
4144 list_for_each_entry(slabp, &l3->slabs_full, list) { 4156 list_for_each_entry(slabp, &l3->slabs_full, list) {
4145 if (slabp->inuse != cachep->num && !error) 4157 if (slabp->inuse != cachep->num && !error)
4146 error = "slabs_full accounting error"; 4158 error = "slabs_full accounting error";
4147 active_objs += cachep->num; 4159 active_objs += cachep->num;
4148 active_slabs++; 4160 active_slabs++;
4149 } 4161 }
4150 list_for_each_entry(slabp, &l3->slabs_partial, list) { 4162 list_for_each_entry(slabp, &l3->slabs_partial, list) {
4151 if (slabp->inuse == cachep->num && !error) 4163 if (slabp->inuse == cachep->num && !error)
4152 error = "slabs_partial inuse accounting error"; 4164 error = "slabs_partial inuse accounting error";
4153 if (!slabp->inuse && !error) 4165 if (!slabp->inuse && !error)
4154 error = "slabs_partial/inuse accounting error"; 4166 error = "slabs_partial/inuse accounting error";
4155 active_objs += slabp->inuse; 4167 active_objs += slabp->inuse;
4156 active_slabs++; 4168 active_slabs++;
4157 } 4169 }
4158 list_for_each_entry(slabp, &l3->slabs_free, list) { 4170 list_for_each_entry(slabp, &l3->slabs_free, list) {
4159 if (slabp->inuse && !error) 4171 if (slabp->inuse && !error)
4160 error = "slabs_free/inuse accounting error"; 4172 error = "slabs_free/inuse accounting error";
4161 num_slabs++; 4173 num_slabs++;
4162 } 4174 }
4163 free_objects += l3->free_objects; 4175 free_objects += l3->free_objects;
4164 if (l3->shared) 4176 if (l3->shared)
4165 shared_avail += l3->shared->avail; 4177 shared_avail += l3->shared->avail;
4166 4178
4167 spin_unlock_irq(&l3->list_lock); 4179 spin_unlock_irq(&l3->list_lock);
4168 } 4180 }
4169 num_slabs += active_slabs; 4181 num_slabs += active_slabs;
4170 num_objs = num_slabs * cachep->num; 4182 num_objs = num_slabs * cachep->num;
4171 if (num_objs - active_objs != free_objects && !error) 4183 if (num_objs - active_objs != free_objects && !error)
4172 error = "free_objects accounting error"; 4184 error = "free_objects accounting error";
4173 4185
4174 name = cachep->name; 4186 name = cachep->name;
4175 if (error) 4187 if (error)
4176 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4188 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4177 4189
4178 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 4190 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4179 name, active_objs, num_objs, cachep->buffer_size, 4191 name, active_objs, num_objs, cachep->buffer_size,
4180 cachep->num, (1 << cachep->gfporder)); 4192 cachep->num, (1 << cachep->gfporder));
4181 seq_printf(m, " : tunables %4u %4u %4u", 4193 seq_printf(m, " : tunables %4u %4u %4u",
4182 cachep->limit, cachep->batchcount, cachep->shared); 4194 cachep->limit, cachep->batchcount, cachep->shared);
4183 seq_printf(m, " : slabdata %6lu %6lu %6lu", 4195 seq_printf(m, " : slabdata %6lu %6lu %6lu",
4184 active_slabs, num_slabs, shared_avail); 4196 active_slabs, num_slabs, shared_avail);
4185 #if STATS 4197 #if STATS
4186 { /* list3 stats */ 4198 { /* list3 stats */
4187 unsigned long high = cachep->high_mark; 4199 unsigned long high = cachep->high_mark;
4188 unsigned long allocs = cachep->num_allocations; 4200 unsigned long allocs = cachep->num_allocations;
4189 unsigned long grown = cachep->grown; 4201 unsigned long grown = cachep->grown;
4190 unsigned long reaped = cachep->reaped; 4202 unsigned long reaped = cachep->reaped;
4191 unsigned long errors = cachep->errors; 4203 unsigned long errors = cachep->errors;
4192 unsigned long max_freeable = cachep->max_freeable; 4204 unsigned long max_freeable = cachep->max_freeable;
4193 unsigned long node_allocs = cachep->node_allocs; 4205 unsigned long node_allocs = cachep->node_allocs;
4194 unsigned long node_frees = cachep->node_frees; 4206 unsigned long node_frees = cachep->node_frees;
4195 unsigned long overflows = cachep->node_overflow; 4207 unsigned long overflows = cachep->node_overflow;
4196 4208
4197 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 4209 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
4198 %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, 4210 %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
4199 reaped, errors, max_freeable, node_allocs, 4211 reaped, errors, max_freeable, node_allocs,
4200 node_frees, overflows); 4212 node_frees, overflows);
4201 } 4213 }
4202 /* cpu stats */ 4214 /* cpu stats */
4203 { 4215 {
4204 unsigned long allochit = atomic_read(&cachep->allochit); 4216 unsigned long allochit = atomic_read(&cachep->allochit);
4205 unsigned long allocmiss = atomic_read(&cachep->allocmiss); 4217 unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4206 unsigned long freehit = atomic_read(&cachep->freehit); 4218 unsigned long freehit = atomic_read(&cachep->freehit);
4207 unsigned long freemiss = atomic_read(&cachep->freemiss); 4219 unsigned long freemiss = atomic_read(&cachep->freemiss);
4208 4220
4209 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 4221 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4210 allochit, allocmiss, freehit, freemiss); 4222 allochit, allocmiss, freehit, freemiss);
4211 } 4223 }
4212 #endif 4224 #endif
4213 seq_putc(m, '\n'); 4225 seq_putc(m, '\n');
4214 return 0; 4226 return 0;
4215 } 4227 }
4216 4228
4217 /* 4229 /*
4218 * slabinfo_op - iterator that generates /proc/slabinfo 4230 * slabinfo_op - iterator that generates /proc/slabinfo
4219 * 4231 *
4220 * Output layout: 4232 * Output layout:
4221 * cache-name 4233 * cache-name
4222 * num-active-objs 4234 * num-active-objs
4223 * total-objs 4235 * total-objs
4224 * object size 4236 * object size
4225 * num-active-slabs 4237 * num-active-slabs
4226 * total-slabs 4238 * total-slabs
4227 * num-pages-per-slab 4239 * num-pages-per-slab
4228 * + further values on SMP and with statistics enabled 4240 * + further values on SMP and with statistics enabled
4229 */ 4241 */
4230 4242
4231 const struct seq_operations slabinfo_op = { 4243 const struct seq_operations slabinfo_op = {
4232 .start = s_start, 4244 .start = s_start,
4233 .next = s_next, 4245 .next = s_next,
4234 .stop = s_stop, 4246 .stop = s_stop,
4235 .show = s_show, 4247 .show = s_show,
4236 }; 4248 };
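Editor's note: put together, s_show() emits one line per cache in the layout described above. An illustrative line for a hypothetical 128-byte cache on a 4 KiB-page SMP box, with made-up counts and without STATS, would look roughly like:

    example_cache      11800  12400    128   31    1 : tunables  120   60    8 : slabdata    390    400      0

Here num_objs = num_slabs * objperslab (400 * 31 = 12400), and the tunables match what enable_cpucache() picks for objects in the "> 256" but "<= 1024" range.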
4237 4249
4238 #define MAX_SLABINFO_WRITE 128 4250 #define MAX_SLABINFO_WRITE 128
4239 /** 4251 /**
4240 * slabinfo_write - Tuning for the slab allocator 4252 * slabinfo_write - Tuning for the slab allocator
4241 * @file: unused 4253 * @file: unused
4242 * @buffer: user buffer 4254 * @buffer: user buffer
4243 * @count: data length 4255 * @count: data length
4244 * @ppos: unused 4256 * @ppos: unused
4245 */ 4257 */
4246 ssize_t slabinfo_write(struct file *file, const char __user * buffer, 4258 ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4247 size_t count, loff_t *ppos) 4259 size_t count, loff_t *ppos)
4248 { 4260 {
4249 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 4261 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4250 int limit, batchcount, shared, res; 4262 int limit, batchcount, shared, res;
4251 struct kmem_cache *cachep; 4263 struct kmem_cache *cachep;
4252 4264
4253 if (count > MAX_SLABINFO_WRITE) 4265 if (count > MAX_SLABINFO_WRITE)
4254 return -EINVAL; 4266 return -EINVAL;
4255 if (copy_from_user(&kbuf, buffer, count)) 4267 if (copy_from_user(&kbuf, buffer, count))
4256 return -EFAULT; 4268 return -EFAULT;
4257 kbuf[MAX_SLABINFO_WRITE] = '\0'; 4269 kbuf[MAX_SLABINFO_WRITE] = '\0';
4258 4270
4259 tmp = strchr(kbuf, ' '); 4271 tmp = strchr(kbuf, ' ');
4260 if (!tmp) 4272 if (!tmp)
4261 return -EINVAL; 4273 return -EINVAL;
4262 *tmp = '\0'; 4274 *tmp = '\0';
4263 tmp++; 4275 tmp++;
4264 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 4276 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4265 return -EINVAL; 4277 return -EINVAL;
4266 4278
4267 /* Find the cache in the chain of caches. */ 4279 /* Find the cache in the chain of caches. */
4268 mutex_lock(&cache_chain_mutex); 4280 mutex_lock(&cache_chain_mutex);
4269 res = -EINVAL; 4281 res = -EINVAL;
4270 list_for_each_entry(cachep, &cache_chain, next) { 4282 list_for_each_entry(cachep, &cache_chain, next) {
4271 if (!strcmp(cachep->name, kbuf)) { 4283 if (!strcmp(cachep->name, kbuf)) {
4272 if (limit < 1 || batchcount < 1 || 4284 if (limit < 1 || batchcount < 1 ||
4273 batchcount > limit || shared < 0) { 4285 batchcount > limit || shared < 0) {
4274 res = 0; 4286 res = 0;
4275 } else { 4287 } else {
4276 res = do_tune_cpucache(cachep, limit, 4288 res = do_tune_cpucache(cachep, limit,
4277 batchcount, shared); 4289 batchcount, shared);
4278 } 4290 }
4279 break; 4291 break;
4280 } 4292 }
4281 } 4293 }
4282 mutex_unlock(&cache_chain_mutex); 4294 mutex_unlock(&cache_chain_mutex);
4283 if (res >= 0) 4295 if (res >= 0)
4284 res = count; 4296 res = count;
4285 return res; 4297 return res;
4286 } 4298 }
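Editor's note: the write side expects "<cache-name> <limit> <batchcount> <shared>" in a single buffer, so (for a cache name that actually exists on the running system) something like echo "example_cache 120 60 8" > /proc/slabinfo from a shell with write permission re-tunes that one cache via do_tune_cpucache(). Out-of-range values (limit < 1, batchcount < 1, batchcount > limit or shared < 0) leave res = 0, so the write still returns count: it is accepted but silently ignored.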
4287 4299
4288 #ifdef CONFIG_DEBUG_SLAB_LEAK 4300 #ifdef CONFIG_DEBUG_SLAB_LEAK
4289 4301
4290 static void *leaks_start(struct seq_file *m, loff_t *pos) 4302 static void *leaks_start(struct seq_file *m, loff_t *pos)
4291 { 4303 {
4292 loff_t n = *pos; 4304 loff_t n = *pos;
4293 struct list_head *p; 4305 struct list_head *p;
4294 4306
4295 mutex_lock(&cache_chain_mutex); 4307 mutex_lock(&cache_chain_mutex);
4296 p = cache_chain.next; 4308 p = cache_chain.next;
4297 while (n--) { 4309 while (n--) {
4298 p = p->next; 4310 p = p->next;
4299 if (p == &cache_chain) 4311 if (p == &cache_chain)
4300 return NULL; 4312 return NULL;
4301 } 4313 }
4302 return list_entry(p, struct kmem_cache, next); 4314 return list_entry(p, struct kmem_cache, next);
4303 } 4315 }
4304 4316
4305 static inline int add_caller(unsigned long *n, unsigned long v) 4317 static inline int add_caller(unsigned long *n, unsigned long v)
4306 { 4318 {
4307 unsigned long *p; 4319 unsigned long *p;
4308 int l; 4320 int l;
4309 if (!v) 4321 if (!v)
4310 return 1; 4322 return 1;
4311 l = n[1]; 4323 l = n[1];
4312 p = n + 2; 4324 p = n + 2;
4313 while (l) { 4325 while (l) {
4314 int i = l/2; 4326 int i = l/2;
4315 unsigned long *q = p + 2 * i; 4327 unsigned long *q = p + 2 * i;
4316 if (*q == v) { 4328 if (*q == v) {
4317 q[1]++; 4329 q[1]++;
4318 return 1; 4330 return 1;
4319 } 4331 }
4320 if (*q > v) { 4332 if (*q > v) {
4321 l = i; 4333 l = i;
4322 } else { 4334 } else {
4323 p = q + 2; 4335 p = q + 2;
4324 l -= i + 1; 4336 l -= i + 1;
4325 } 4337 }
4326 } 4338 }
4327 if (++n[1] == n[0]) 4339 if (++n[1] == n[0])
4328 return 0; 4340 return 0;
4329 memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); 4341 memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4330 p[0] = v; 4342 p[0] = v;
4331 p[1] = 1; 4343 p[1] = 1;
4332 return 1; 4344 return 1;
4333 } 4345 }
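Editor's note: the unsigned long array that add_caller() and handle_slab() operate on has an implicit layout worth spelling out (a descriptive sketch of what the code above assumes, not new code):

    /*
     * n[0]          capacity of the buffer, in (address, count) pairs
     * n[1]          number of pairs currently stored
     * n[2], n[3]    first pair: caller address, objects allocated there
     * n[4], n[5]    second pair, and so on, kept sorted by address
     *
     * add_caller() binary-searches the pairs; on a miss it memmove()s the
     * tail up by one pair to insert the new caller, and it returns 0 only
     * when the buffer fills up, which is what makes leaks_show() below
     * retry with a buffer twice as large.
     */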
4334 4346
4335 static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) 4347 static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4336 { 4348 {
4337 void *p; 4349 void *p;
4338 int i; 4350 int i;
4339 if (n[0] == n[1]) 4351 if (n[0] == n[1])
4340 return; 4352 return;
4341 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { 4353 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4342 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) 4354 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4343 continue; 4355 continue;
4344 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4356 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4345 return; 4357 return;
4346 } 4358 }
4347 } 4359 }
4348 4360
4349 static void show_symbol(struct seq_file *m, unsigned long address) 4361 static void show_symbol(struct seq_file *m, unsigned long address)
4350 { 4362 {
4351 #ifdef CONFIG_KALLSYMS 4363 #ifdef CONFIG_KALLSYMS
4352 char *modname; 4364 char *modname;
4353 const char *name; 4365 const char *name;
4354 unsigned long offset, size; 4366 unsigned long offset, size;
4355 char namebuf[KSYM_NAME_LEN+1]; 4367 char namebuf[KSYM_NAME_LEN+1];
4356 4368
4357 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); 4369 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
4358 4370
4359 if (name) { 4371 if (name) {
4360 seq_printf(m, "%s+%#lx/%#lx", name, offset, size); 4372 seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4361 if (modname) 4373 if (modname)
4362 seq_printf(m, " [%s]", modname); 4374 seq_printf(m, " [%s]", modname);
4363 return; 4375 return;
4364 } 4376 }
4365 #endif 4377 #endif
4366 seq_printf(m, "%p", (void *)address); 4378 seq_printf(m, "%p", (void *)address);
4367 } 4379 }
4368 4380
4369 static int leaks_show(struct seq_file *m, void *p) 4381 static int leaks_show(struct seq_file *m, void *p)
4370 { 4382 {
4371 struct kmem_cache *cachep = p; 4383 struct kmem_cache *cachep = p;
4372 struct slab *slabp; 4384 struct slab *slabp;
4373 struct kmem_list3 *l3; 4385 struct kmem_list3 *l3;
4374 const char *name; 4386 const char *name;
4375 unsigned long *n = m->private; 4387 unsigned long *n = m->private;
4376 int node; 4388 int node;
4377 int i; 4389 int i;
4378 4390
4379 if (!(cachep->flags & SLAB_STORE_USER)) 4391 if (!(cachep->flags & SLAB_STORE_USER))
4380 return 0; 4392 return 0;
4381 if (!(cachep->flags & SLAB_RED_ZONE)) 4393 if (!(cachep->flags & SLAB_RED_ZONE))
4382 return 0; 4394 return 0;
4383 4395
4384 /* OK, we can do it */ 4396 /* OK, we can do it */
4385 4397
4386 n[1] = 0; 4398 n[1] = 0;
4387 4399
4388 for_each_online_node(node) { 4400 for_each_online_node(node) {
4389 l3 = cachep->nodelists[node]; 4401 l3 = cachep->nodelists[node];
4390 if (!l3) 4402 if (!l3)
4391 continue; 4403 continue;
4392 4404
4393 check_irq_on(); 4405 check_irq_on();
4394 spin_lock_irq(&l3->list_lock); 4406 spin_lock_irq(&l3->list_lock);
4395 4407
4396 list_for_each_entry(slabp, &l3->slabs_full, list) 4408 list_for_each_entry(slabp, &l3->slabs_full, list)
4397 handle_slab(n, cachep, slabp); 4409 handle_slab(n, cachep, slabp);
4398 list_for_each_entry(slabp, &l3->slabs_partial, list) 4410 list_for_each_entry(slabp, &l3->slabs_partial, list)
4399 handle_slab(n, cachep, slabp); 4411 handle_slab(n, cachep, slabp);
4400 spin_unlock_irq(&l3->list_lock); 4412 spin_unlock_irq(&l3->list_lock);
4401 } 4413 }
4402 name = cachep->name; 4414 name = cachep->name;
4403 if (n[0] == n[1]) { 4415 if (n[0] == n[1]) {
4404 /* Increase the buffer size */ 4416 /* Increase the buffer size */
4405 mutex_unlock(&cache_chain_mutex); 4417 mutex_unlock(&cache_chain_mutex);
4406 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); 4418 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4407 if (!m->private) { 4419 if (!m->private) {
4408 /* Too bad, we are really out */ 4420 /* Too bad, we are really out */
4409 m->private = n; 4421 m->private = n;
4410 mutex_lock(&cache_chain_mutex); 4422 mutex_lock(&cache_chain_mutex);
4411 return -ENOMEM; 4423 return -ENOMEM;
4412 } 4424 }
4413 *(unsigned long *)m->private = n[0] * 2; 4425 *(unsigned long *)m->private = n[0] * 2;
4414 kfree(n); 4426 kfree(n);
4415 mutex_lock(&cache_chain_mutex); 4427 mutex_lock(&cache_chain_mutex);
4416 /* Now make sure this entry will be retried */ 4428 /* Now make sure this entry will be retried */
4417 m->count = m->size; 4429 m->count = m->size;
4418 return 0; 4430 return 0;
4419 } 4431 }
4420 for (i = 0; i < n[1]; i++) { 4432 for (i = 0; i < n[1]; i++) {
4421 seq_printf(m, "%s: %lu ", name, n[2*i+3]); 4433 seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4422 show_symbol(m, n[2*i+2]); 4434 show_symbol(m, n[2*i+2]);
4423 seq_putc(m, '\n'); 4435 seq_putc(m, '\n');
4424 } 4436 }
4425 4437
4426 return 0; 4438 return 0;
4427 } 4439 }
4428 4440
4429 const struct seq_operations slabstats_op = { 4441 const struct seq_operations slabstats_op = {
4430 .start = leaks_start, 4442 .start = leaks_start,
4431 .next = s_next, 4443 .next = s_next,
4432 .stop = s_stop, 4444 .stop = s_stop,
4433 .show = leaks_show, 4445 .show = leaks_show,
4434 }; 4446 };
4435 #endif 4447 #endif
4436 #endif 4448 #endif
4437 4449
4438 /** 4450 /**
4439 * ksize - get the actual amount of memory allocated for a given object 4451 * ksize - get the actual amount of memory allocated for a given object
4440 * @objp: Pointer to the object 4452 * @objp: Pointer to the object
4441 * 4453 *
4442 * kmalloc may internally round up allocations and return more memory 4454 * kmalloc may internally round up allocations and return more memory
4443 * than requested. ksize() can be used to determine the actual amount of 4455 * than requested. ksize() can be used to determine the actual amount of
4444 * memory allocated. The caller may use this additional memory, even though 4456 * memory allocated. The caller may use this additional memory, even though
4445 * a smaller amount of memory was initially specified with the kmalloc call. 4457 * a smaller amount of memory was initially specified with the kmalloc call.
4446 * The caller must guarantee that objp points to a valid object previously 4458 * The caller must guarantee that objp points to a valid object previously
4447 * allocated with either kmalloc() or kmem_cache_alloc(). The object 4459 * allocated with either kmalloc() or kmem_cache_alloc(). The object
4448 * must not be freed during the duration of the call. 4460 * must not be freed during the duration of the call.
4449 */ 4461 */
4450 unsigned int ksize(const void *objp) 4462 unsigned int ksize(const void *objp)
4451 { 4463 {
4452 if (unlikely(objp == NULL)) 4464 if (unlikely(objp == NULL))
4453 return 0; 4465 return 0;
4454 4466
4455 return obj_size(virt_to_cache(objp)); 4467 return obj_size(virt_to_cache(objp));
4456 } 4468 }
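Editor's note: a minimal sketch of the rounding that ksize() exposes, assuming kernel context; the helper name is hypothetical and not part of the patch.

    #include <linux/slab.h>
    #include <linux/string.h>

    /* Illustrative only: zero the whole slab object, not just the bytes asked for. */
    static void *zalloc_whole_object(size_t want, gfp_t flags)
    {
            char *buf = kmalloc(want, flags);

            if (!buf)
                    return NULL;
            /*
             * kmalloc() rounds the request up to the general cache that
             * served it, so ksize(buf) >= want; the slack may be used
             * without reallocating.
             */
            memset(buf, 0, ksize(buf));
            return buf;
    }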
4457 4469