Commit 6a2d7a955d8de6cb19ed9cd194b3c83008a22c32

Authored by Eric Dumazet
Committed by Linus Torvalds
1 parent 02a0e53d82

[PATCH] SLAB: use a multiply instead of a divide in obj_to_index()

When some objects are allocated by one CPU but freed by another CPU, we can
consume a lot of cycles doing divides in obj_to_index().

(This is a typical load on a dual-processor machine where network interrupts
are handled by one particular CPU (allocating skbufs), while the other CPU
runs the application (consuming and freeing skbufs).)

On one production server (dual-core AMD Opteron 285), I noticed this divide
accounted for 1.20% of CPU_CLK_UNHALTED events in the kernel.  But Opterons
are quite modern CPUs, and the divide is much more expensive on older
architectures:

On a 200 MHz sparcv9 machine, the division takes 64 cycles instead of 1
cycle for a multiply.

Doing some math, we can use a reciprocal multiplication instead of a divide.

If we want to compute V = (A / B)  (A and B being u32 quantities),
we can instead use:

V = ((u64)A * RECIPROCAL(B)) >> 32;

where RECIPROCAL(B) is precalculated as ((1LL << 32) + (B - 1)) / B

Note:

I wrote pure C code for clarity.  The gcc output for i386 is not optimal but
acceptable:

mull   0x14(%ebx)
mov    %edx,%eax // part of the >> 32
xor     %edx,%edx // useless
mov    %eax,(%esp) // could be avoided
mov    %edx,0x4(%esp) // useless
mov    (%esp),%ebx
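
As a quick illustration (an editorial addition, not part of the patch), here is a
minimal standalone sketch of the same trick. The two helpers mirror the new
reciprocal_value()/reciprocal_divide() API but are re-implemented locally so the
check runs in userspace; main() and the chosen divisor are purely illustrative:

/*
 * Standalone userspace check of the reciprocal-divide trick described above.
 * Exact as long as the dividend stays small, which is the case for slab
 * object offsets.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t reciprocal_value(uint32_t b)
{
	/* Precomputed once per divisor: ceil(2^32 / b). */
	return (uint32_t)(((1ULL << 32) + (b - 1)) / b);
}

static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
	/* One 32x32->64 multiply and a shift instead of a divide. */
	return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
	uint32_t b = 192;	/* e.g. a slab buffer_size */
	uint32_t r = reciprocal_value(b);
	uint32_t a;

	for (a = 0; a < 1000000; a++)
		assert(reciprocal_divide(a, r) == a / b);

	printf("reciprocal divide matches the real divide for b=%u\n", b);
	return 0;
}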

[akpm@osdl.org: small cleanups]
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 4 changed files with 57 additions and 4 deletions

include/linux/reciprocal_div.h
File was created 1 #ifndef _LINUX_RECIPROCAL_DIV_H
2 #define _LINUX_RECIPROCAL_DIV_H
3
4 #include <linux/types.h>
5
6 /*
7 * This file describes reciprocal division.
8 *
9 * This optimizes the (A/B) problem, when A and B are two u32
10 * and B is a known value (but not known at compile time)
11 *
12 * The math principle used is :
13 * Let RECIPROCAL_VALUE(B) be (((1LL << 32) + (B - 1))/ B)
14 * Then A / B = (u32)(((u64)(A) * (R)) >> 32)
15 *
16 * This replaces a divide by a multiply (and a shift), and
17 * is generally less expensive in CPU cycles.
18 */
19
20 /*
21 * Computes the reciprocal value (R) for the value B of the divisor.
22 * Should not be called before each reciprocal_divide(),
23 * or else the performance is slower than a normal divide.
24 */
25 extern u32 reciprocal_value(u32 B);
26
27
28 static inline u32 reciprocal_divide(u32 A, u32 R)
29 {
30 return (u32)(((u64)A * R) >> 32);
31 }
32 #endif
33
lib/Makefile
1 # 1 #
2 # Makefile for some libs needed in the kernel. 2 # Makefile for some libs needed in the kernel.
3 # 3 #
4 4
5 lib-y := ctype.o string.o vsprintf.o cmdline.o \ 5 lib-y := ctype.o string.o vsprintf.o cmdline.o \
6 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \ 6 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
7 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \ 7 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \
8 sha1.o irq_regs.o 8 sha1.o irq_regs.o reciprocal_div.o
9 9
10 lib-$(CONFIG_MMU) += ioremap.o 10 lib-$(CONFIG_MMU) += ioremap.o
11 lib-$(CONFIG_SMP) += cpumask.o 11 lib-$(CONFIG_SMP) += cpumask.o
12 12
13 lib-y += kobject.o kref.o kobject_uevent.o klist.o 13 lib-y += kobject.o kref.o kobject_uevent.o klist.o
14 14
15 obj-y += sort.o parser.o halfmd4.o iomap_copy.o debug_locks.o random32.o 15 obj-y += sort.o parser.o halfmd4.o iomap_copy.o debug_locks.o random32.o
16 16
17 ifeq ($(CONFIG_DEBUG_KOBJECT),y) 17 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
18 CFLAGS_kobject.o += -DDEBUG 18 CFLAGS_kobject.o += -DDEBUG
19 CFLAGS_kobject_uevent.o += -DDEBUG 19 CFLAGS_kobject_uevent.o += -DDEBUG
20 endif 20 endif
21 21
22 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o 22 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
23 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o 23 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
24 lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o 24 lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
25 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o 25 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
26 lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o 26 lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
27 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o 27 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
28 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o 28 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
29 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o 29 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
30 obj-$(CONFIG_PLIST) += plist.o 30 obj-$(CONFIG_PLIST) += plist.o
31 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o 31 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
32 obj-$(CONFIG_DEBUG_LIST) += list_debug.o 32 obj-$(CONFIG_DEBUG_LIST) += list_debug.o
33 33
34 ifneq ($(CONFIG_HAVE_DEC_LOCK),y) 34 ifneq ($(CONFIG_HAVE_DEC_LOCK),y)
35 lib-y += dec_and_lock.o 35 lib-y += dec_and_lock.o
36 endif 36 endif
37 37
38 obj-$(CONFIG_BITREVERSE) += bitrev.o 38 obj-$(CONFIG_BITREVERSE) += bitrev.o
39 obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o 39 obj-$(CONFIG_CRC_CCITT) += crc-ccitt.o
40 obj-$(CONFIG_CRC16) += crc16.o 40 obj-$(CONFIG_CRC16) += crc16.o
41 obj-$(CONFIG_CRC32) += crc32.o 41 obj-$(CONFIG_CRC32) += crc32.o
42 obj-$(CONFIG_LIBCRC32C) += libcrc32c.o 42 obj-$(CONFIG_LIBCRC32C) += libcrc32c.o
43 obj-$(CONFIG_GENERIC_IOMAP) += iomap.o 43 obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
44 obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o 44 obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o
45 45
46 obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/ 46 obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
47 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/ 47 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/
48 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/ 48 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
49 49
50 obj-$(CONFIG_TEXTSEARCH) += textsearch.o 50 obj-$(CONFIG_TEXTSEARCH) += textsearch.o
51 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o 51 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
52 obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o 52 obj-$(CONFIG_TEXTSEARCH_BM) += ts_bm.o
53 obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o 53 obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
54 obj-$(CONFIG_SMP) += percpu_counter.o 54 obj-$(CONFIG_SMP) += percpu_counter.o
55 obj-$(CONFIG_AUDIT_GENERIC) += audit.o 55 obj-$(CONFIG_AUDIT_GENERIC) += audit.o
56 56
57 obj-$(CONFIG_SWIOTLB) += swiotlb.o 57 obj-$(CONFIG_SWIOTLB) += swiotlb.o
58 obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o 58 obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
59 59
60 lib-$(CONFIG_GENERIC_BUG) += bug.o 60 lib-$(CONFIG_GENERIC_BUG) += bug.o
61 61
62 hostprogs-y := gen_crc32table 62 hostprogs-y := gen_crc32table
63 clean-files := crc32table.h 63 clean-files := crc32table.h
64 64
65 $(obj)/crc32.o: $(obj)/crc32table.h 65 $(obj)/crc32.o: $(obj)/crc32table.h
66 66
67 quiet_cmd_crc32 = GEN $@ 67 quiet_cmd_crc32 = GEN $@
68 cmd_crc32 = $< > $@ 68 cmd_crc32 = $< > $@
69 69
70 $(obj)/crc32table.h: $(obj)/gen_crc32table 70 $(obj)/crc32table.h: $(obj)/gen_crc32table
71 $(call cmd,crc32) 71 $(call cmd,crc32)
72 72
lib/reciprocal_div.c
File was created 1 #include <asm/div64.h>
2 #include <linux/reciprocal_div.h>
3
4 u32 reciprocal_value(u32 k)
5 {
6 u64 val = (1LL << 32) + (k - 1);
7 do_div(val, k);
8 return (u32)val;
9 }
10
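
Usage note (editorial addition, not part of the patch): a consumer computes the
reciprocal once, when the divisor becomes known, and uses reciprocal_divide()
on every fast-path division afterwards. In the sketch below only the two
library calls and the header come from this patch; the structure and function
names are hypothetical.

/*
 * Illustrative consumer of the new library (editorial sketch, not code from
 * the patch): compute the reciprocal once when the divisor becomes known,
 * then replace each fast-path divide with a multiply and a shift.
 */
#include <linux/reciprocal_div.h>

struct my_pool {			/* hypothetical example structure */
	unsigned int obj_size;		/* divisor, fixed at creation time */
	u32 reciprocal_obj_size;	/* cached reciprocal_value(obj_size) */
	void *base;
};

static void my_pool_init(struct my_pool *pool, unsigned int obj_size, void *base)
{
	pool->obj_size = obj_size;
	pool->base = base;
	/* The expensive divide happens once, here, not in the fast path. */
	pool->reciprocal_obj_size = reciprocal_value(obj_size);
}

static unsigned int my_pool_obj_index(struct my_pool *pool, void *obj)
{
	u32 offset = obj - pool->base;

	/* Equivalent to offset / pool->obj_size, without a divide. */
	return reciprocal_divide(offset, pool->reciprocal_obj_size);
}

This is the same pattern the slab change below follows: struct kmem_cache gains
a reciprocal_buffer_size field, and obj_to_index() switches from a divide to
reciprocal_divide().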
mm/slab.c
1 /* 1 /*
2 * linux/mm/slab.c 2 * linux/mm/slab.c
3 * Written by Mark Hemment, 1996/97. 3 * Written by Mark Hemment, 1996/97.
4 * (markhe@nextd.demon.co.uk) 4 * (markhe@nextd.demon.co.uk)
5 * 5 *
6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli 6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
7 * 7 *
8 * Major cleanup, different bufctl logic, per-cpu arrays 8 * Major cleanup, different bufctl logic, per-cpu arrays
9 * (c) 2000 Manfred Spraul 9 * (c) 2000 Manfred Spraul
10 * 10 *
11 * Cleanup, make the head arrays unconditional, preparation for NUMA 11 * Cleanup, make the head arrays unconditional, preparation for NUMA
12 * (c) 2002 Manfred Spraul 12 * (c) 2002 Manfred Spraul
13 * 13 *
14 * An implementation of the Slab Allocator as described in outline in; 14 * An implementation of the Slab Allocator as described in outline in;
15 * UNIX Internals: The New Frontiers by Uresh Vahalia 15 * UNIX Internals: The New Frontiers by Uresh Vahalia
16 * Pub: Prentice Hall ISBN 0-13-101908-2 16 * Pub: Prentice Hall ISBN 0-13-101908-2
17 * or with a little more detail in; 17 * or with a little more detail in;
18 * The Slab Allocator: An Object-Caching Kernel Memory Allocator 18 * The Slab Allocator: An Object-Caching Kernel Memory Allocator
19 * Jeff Bonwick (Sun Microsystems). 19 * Jeff Bonwick (Sun Microsystems).
20 * Presented at: USENIX Summer 1994 Technical Conference 20 * Presented at: USENIX Summer 1994 Technical Conference
21 * 21 *
22 * The memory is organized in caches, one cache for each object type. 22 * The memory is organized in caches, one cache for each object type.
23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct) 23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
24 * Each cache consists out of many slabs (they are small (usually one 24 * Each cache consists out of many slabs (they are small (usually one
25 * page long) and always contiguous), and each slab contains multiple 25 * page long) and always contiguous), and each slab contains multiple
26 * initialized objects. 26 * initialized objects.
27 * 27 *
28 * This means, that your constructor is used only for newly allocated 28 * This means, that your constructor is used only for newly allocated
29 * slabs and you must pass objects with the same intializations to 29 * slabs and you must pass objects with the same intializations to
30 * kmem_cache_free. 30 * kmem_cache_free.
31 * 31 *
32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM, 32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
33 * normal). If you need a special memory type, then must create a new 33 * normal). If you need a special memory type, then must create a new
34 * cache for that memory type. 34 * cache for that memory type.
35 * 35 *
36 * In order to reduce fragmentation, the slabs are sorted in 3 groups: 36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
37 * full slabs with 0 free objects 37 * full slabs with 0 free objects
38 * partial slabs 38 * partial slabs
39 * empty slabs with no allocated objects 39 * empty slabs with no allocated objects
40 * 40 *
41 * If partial slabs exist, then new allocations come from these slabs, 41 * If partial slabs exist, then new allocations come from these slabs,
42 * otherwise from empty slabs or new slabs are allocated. 42 * otherwise from empty slabs or new slabs are allocated.
43 * 43 *
44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache 44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs. 45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
46 * 46 *
47 * Each cache has a short per-cpu head array, most allocs 47 * Each cache has a short per-cpu head array, most allocs
48 * and frees go into that array, and if that array overflows, then 1/2 48 * and frees go into that array, and if that array overflows, then 1/2
49 * of the entries in the array are given back into the global cache. 49 * of the entries in the array are given back into the global cache.
50 * The head array is strictly LIFO and should improve the cache hit rates. 50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations. 51 * On SMP, it additionally reduces the spinlock operations.
52 * 52 *
53 * The c_cpuarray may not be read with enabled local interrupts - 53 * The c_cpuarray may not be read with enabled local interrupts -
54 * it's changed with a smp_call_function(). 54 * it's changed with a smp_call_function().
55 * 55 *
56 * SMP synchronization: 56 * SMP synchronization:
57 * constructors and destructors are called without any locking. 57 * constructors and destructors are called without any locking.
58 * Several members in struct kmem_cache and struct slab never change, they 58 * Several members in struct kmem_cache and struct slab never change, they
59 * are accessed without any locking. 59 * are accessed without any locking.
60 * The per-cpu arrays are never accessed from the wrong cpu, no locking, 60 * The per-cpu arrays are never accessed from the wrong cpu, no locking,
61 * and local interrupts are disabled so slab code is preempt-safe. 61 * and local interrupts are disabled so slab code is preempt-safe.
62 * The non-constant members are protected with a per-cache irq spinlock. 62 * The non-constant members are protected with a per-cache irq spinlock.
63 * 63 *
64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch 64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
65 * in 2000 - many ideas in the current implementation are derived from 65 * in 2000 - many ideas in the current implementation are derived from
66 * his patch. 66 * his patch.
67 * 67 *
68 * Further notes from the original documentation: 68 * Further notes from the original documentation:
69 * 69 *
70 * 11 April '97. Started multi-threading - markhe 70 * 11 April '97. Started multi-threading - markhe
71 * The global cache-chain is protected by the mutex 'cache_chain_mutex'. 71 * The global cache-chain is protected by the mutex 'cache_chain_mutex'.
72 * The sem is only needed when accessing/extending the cache-chain, which 72 * The sem is only needed when accessing/extending the cache-chain, which
73 * can never happen inside an interrupt (kmem_cache_create(), 73 * can never happen inside an interrupt (kmem_cache_create(),
74 * kmem_cache_shrink() and kmem_cache_reap()). 74 * kmem_cache_shrink() and kmem_cache_reap()).
75 * 75 *
76 * At present, each engine can be growing a cache. This should be blocked. 76 * At present, each engine can be growing a cache. This should be blocked.
77 * 77 *
78 * 15 March 2005. NUMA slab allocator. 78 * 15 March 2005. NUMA slab allocator.
79 * Shai Fultheim <shai@scalex86.org>. 79 * Shai Fultheim <shai@scalex86.org>.
80 * Shobhit Dayal <shobhit@calsoftinc.com> 80 * Shobhit Dayal <shobhit@calsoftinc.com>
81 * Alok N Kataria <alokk@calsoftinc.com> 81 * Alok N Kataria <alokk@calsoftinc.com>
82 * Christoph Lameter <christoph@lameter.com> 82 * Christoph Lameter <christoph@lameter.com>
83 * 83 *
84 * Modified the slab allocator to be node aware on NUMA systems. 84 * Modified the slab allocator to be node aware on NUMA systems.
85 * Each node has its own list of partial, free and full slabs. 85 * Each node has its own list of partial, free and full slabs.
86 * All object allocations for a node occur from node specific slab lists. 86 * All object allocations for a node occur from node specific slab lists.
87 */ 87 */
88 88
89 #include <linux/slab.h> 89 #include <linux/slab.h>
90 #include <linux/mm.h> 90 #include <linux/mm.h>
91 #include <linux/poison.h> 91 #include <linux/poison.h>
92 #include <linux/swap.h> 92 #include <linux/swap.h>
93 #include <linux/cache.h> 93 #include <linux/cache.h>
94 #include <linux/interrupt.h> 94 #include <linux/interrupt.h>
95 #include <linux/init.h> 95 #include <linux/init.h>
96 #include <linux/compiler.h> 96 #include <linux/compiler.h>
97 #include <linux/cpuset.h> 97 #include <linux/cpuset.h>
98 #include <linux/seq_file.h> 98 #include <linux/seq_file.h>
99 #include <linux/notifier.h> 99 #include <linux/notifier.h>
100 #include <linux/kallsyms.h> 100 #include <linux/kallsyms.h>
101 #include <linux/cpu.h> 101 #include <linux/cpu.h>
102 #include <linux/sysctl.h> 102 #include <linux/sysctl.h>
103 #include <linux/module.h> 103 #include <linux/module.h>
104 #include <linux/rcupdate.h> 104 #include <linux/rcupdate.h>
105 #include <linux/string.h> 105 #include <linux/string.h>
106 #include <linux/uaccess.h> 106 #include <linux/uaccess.h>
107 #include <linux/nodemask.h> 107 #include <linux/nodemask.h>
108 #include <linux/mempolicy.h> 108 #include <linux/mempolicy.h>
109 #include <linux/mutex.h> 109 #include <linux/mutex.h>
110 #include <linux/fault-inject.h> 110 #include <linux/fault-inject.h>
111 #include <linux/rtmutex.h> 111 #include <linux/rtmutex.h>
112 #include <linux/reciprocal_div.h>
112 113
113 #include <asm/cacheflush.h> 114 #include <asm/cacheflush.h>
114 #include <asm/tlbflush.h> 115 #include <asm/tlbflush.h>
115 #include <asm/page.h> 116 #include <asm/page.h>
116 117
117 /* 118 /*
118 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL, 119 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
119 * SLAB_RED_ZONE & SLAB_POISON. 120 * SLAB_RED_ZONE & SLAB_POISON.
120 * 0 for faster, smaller code (especially in the critical paths). 121 * 0 for faster, smaller code (especially in the critical paths).
121 * 122 *
122 * STATS - 1 to collect stats for /proc/slabinfo. 123 * STATS - 1 to collect stats for /proc/slabinfo.
123 * 0 for faster, smaller code (especially in the critical paths). 124 * 0 for faster, smaller code (especially in the critical paths).
124 * 125 *
125 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible) 126 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
126 */ 127 */
127 128
128 #ifdef CONFIG_DEBUG_SLAB 129 #ifdef CONFIG_DEBUG_SLAB
129 #define DEBUG 1 130 #define DEBUG 1
130 #define STATS 1 131 #define STATS 1
131 #define FORCED_DEBUG 1 132 #define FORCED_DEBUG 1
132 #else 133 #else
133 #define DEBUG 0 134 #define DEBUG 0
134 #define STATS 0 135 #define STATS 0
135 #define FORCED_DEBUG 0 136 #define FORCED_DEBUG 0
136 #endif 137 #endif
137 138
138 /* Shouldn't this be in a header file somewhere? */ 139 /* Shouldn't this be in a header file somewhere? */
139 #define BYTES_PER_WORD sizeof(void *) 140 #define BYTES_PER_WORD sizeof(void *)
140 141
141 #ifndef cache_line_size 142 #ifndef cache_line_size
142 #define cache_line_size() L1_CACHE_BYTES 143 #define cache_line_size() L1_CACHE_BYTES
143 #endif 144 #endif
144 145
145 #ifndef ARCH_KMALLOC_MINALIGN 146 #ifndef ARCH_KMALLOC_MINALIGN
146 /* 147 /*
147 * Enforce a minimum alignment for the kmalloc caches. 148 * Enforce a minimum alignment for the kmalloc caches.
148 * Usually, the kmalloc caches are cache_line_size() aligned, except when 149 * Usually, the kmalloc caches are cache_line_size() aligned, except when
149 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. 150 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
150 * Some archs want to perform DMA into kmalloc caches and need a guaranteed 151 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
151 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that. 152 * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that.
152 * Note that this flag disables some debug features. 153 * Note that this flag disables some debug features.
153 */ 154 */
154 #define ARCH_KMALLOC_MINALIGN 0 155 #define ARCH_KMALLOC_MINALIGN 0
155 #endif 156 #endif
156 157
157 #ifndef ARCH_SLAB_MINALIGN 158 #ifndef ARCH_SLAB_MINALIGN
158 /* 159 /*
159 * Enforce a minimum alignment for all caches. 160 * Enforce a minimum alignment for all caches.
160 * Intended for archs that get misalignment faults even for BYTES_PER_WORD 161 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
161 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN. 162 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
162 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables 163 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
163 * some debug features. 164 * some debug features.
164 */ 165 */
165 #define ARCH_SLAB_MINALIGN 0 166 #define ARCH_SLAB_MINALIGN 0
166 #endif 167 #endif
167 168
168 #ifndef ARCH_KMALLOC_FLAGS 169 #ifndef ARCH_KMALLOC_FLAGS
169 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 170 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
170 #endif 171 #endif
171 172
172 /* Legal flag mask for kmem_cache_create(). */ 173 /* Legal flag mask for kmem_cache_create(). */
173 #if DEBUG 174 #if DEBUG
174 # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ 175 # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
175 SLAB_POISON | SLAB_HWCACHE_ALIGN | \ 176 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
176 SLAB_CACHE_DMA | \ 177 SLAB_CACHE_DMA | \
177 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ 178 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
178 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 179 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
179 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) 180 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
180 #else 181 #else
181 # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ 182 # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
182 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ 183 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
183 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 184 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
184 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) 185 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
185 #endif 186 #endif
186 187
187 /* 188 /*
188 * kmem_bufctl_t: 189 * kmem_bufctl_t:
189 * 190 *
190 * Bufctl's are used for linking objs within a slab 191 * Bufctl's are used for linking objs within a slab
191 * linked offsets. 192 * linked offsets.
192 * 193 *
193 * This implementation relies on "struct page" for locating the cache & 194 * This implementation relies on "struct page" for locating the cache &
194 * slab an object belongs to. 195 * slab an object belongs to.
195 * This allows the bufctl structure to be small (one int), but limits 196 * This allows the bufctl structure to be small (one int), but limits
196 * the number of objects a slab (not a cache) can contain when off-slab 197 * the number of objects a slab (not a cache) can contain when off-slab
197 * bufctls are used. The limit is the size of the largest general cache 198 * bufctls are used. The limit is the size of the largest general cache
198 * that does not use off-slab slabs. 199 * that does not use off-slab slabs.
199 * For 32bit archs with 4 kB pages, is this 56. 200 * For 32bit archs with 4 kB pages, is this 56.
200 * This is not serious, as it is only for large objects, when it is unwise 201 * This is not serious, as it is only for large objects, when it is unwise
201 * to have too many per slab. 202 * to have too many per slab.
202 * Note: This limit can be raised by introducing a general cache whose size 203 * Note: This limit can be raised by introducing a general cache whose size
203 * is less than 512 (PAGE_SIZE<<3), but greater than 256. 204 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
204 */ 205 */
205 206
206 typedef unsigned int kmem_bufctl_t; 207 typedef unsigned int kmem_bufctl_t;
207 #define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) 208 #define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
208 #define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) 209 #define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
209 #define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) 210 #define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2)
210 #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) 211 #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
211 212
212 /* 213 /*
213 * struct slab 214 * struct slab
214 * 215 *
215 * Manages the objs in a slab. Placed either at the beginning of mem allocated 216 * Manages the objs in a slab. Placed either at the beginning of mem allocated
216 * for a slab, or allocated from an general cache. 217 * for a slab, or allocated from an general cache.
217 * Slabs are chained into three list: fully used, partial, fully free slabs. 218 * Slabs are chained into three list: fully used, partial, fully free slabs.
218 */ 219 */
219 struct slab { 220 struct slab {
220 struct list_head list; 221 struct list_head list;
221 unsigned long colouroff; 222 unsigned long colouroff;
222 void *s_mem; /* including colour offset */ 223 void *s_mem; /* including colour offset */
223 unsigned int inuse; /* num of objs active in slab */ 224 unsigned int inuse; /* num of objs active in slab */
224 kmem_bufctl_t free; 225 kmem_bufctl_t free;
225 unsigned short nodeid; 226 unsigned short nodeid;
226 }; 227 };
227 228
228 /* 229 /*
229 * struct slab_rcu 230 * struct slab_rcu
230 * 231 *
231 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to 232 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
232 * arrange for kmem_freepages to be called via RCU. This is useful if 233 * arrange for kmem_freepages to be called via RCU. This is useful if
233 * we need to approach a kernel structure obliquely, from its address 234 * we need to approach a kernel structure obliquely, from its address
234 * obtained without the usual locking. We can lock the structure to 235 * obtained without the usual locking. We can lock the structure to
235 * stabilize it and check it's still at the given address, only if we 236 * stabilize it and check it's still at the given address, only if we
236 * can be sure that the memory has not been meanwhile reused for some 237 * can be sure that the memory has not been meanwhile reused for some
237 * other kind of object (which our subsystem's lock might corrupt). 238 * other kind of object (which our subsystem's lock might corrupt).
238 * 239 *
239 * rcu_read_lock before reading the address, then rcu_read_unlock after 240 * rcu_read_lock before reading the address, then rcu_read_unlock after
240 * taking the spinlock within the structure expected at that address. 241 * taking the spinlock within the structure expected at that address.
241 * 242 *
242 * We assume struct slab_rcu can overlay struct slab when destroying. 243 * We assume struct slab_rcu can overlay struct slab when destroying.
243 */ 244 */
244 struct slab_rcu { 245 struct slab_rcu {
245 struct rcu_head head; 246 struct rcu_head head;
246 struct kmem_cache *cachep; 247 struct kmem_cache *cachep;
247 void *addr; 248 void *addr;
248 }; 249 };
249 250
250 /* 251 /*
251 * struct array_cache 252 * struct array_cache
252 * 253 *
253 * Purpose: 254 * Purpose:
254 * - LIFO ordering, to hand out cache-warm objects from _alloc 255 * - LIFO ordering, to hand out cache-warm objects from _alloc
255 * - reduce the number of linked list operations 256 * - reduce the number of linked list operations
256 * - reduce spinlock operations 257 * - reduce spinlock operations
257 * 258 *
258 * The limit is stored in the per-cpu structure to reduce the data cache 259 * The limit is stored in the per-cpu structure to reduce the data cache
259 * footprint. 260 * footprint.
260 * 261 *
261 */ 262 */
262 struct array_cache { 263 struct array_cache {
263 unsigned int avail; 264 unsigned int avail;
264 unsigned int limit; 265 unsigned int limit;
265 unsigned int batchcount; 266 unsigned int batchcount;
266 unsigned int touched; 267 unsigned int touched;
267 spinlock_t lock; 268 spinlock_t lock;
268 void *entry[0]; /* 269 void *entry[0]; /*
269 * Must have this definition in here for the proper 270 * Must have this definition in here for the proper
270 * alignment of array_cache. Also simplifies accessing 271 * alignment of array_cache. Also simplifies accessing
271 * the entries. 272 * the entries.
272 * [0] is for gcc 2.95. It should really be []. 273 * [0] is for gcc 2.95. It should really be [].
273 */ 274 */
274 }; 275 };
275 276
276 /* 277 /*
277 * bootstrap: The caches do not work without cpuarrays anymore, but the 278 * bootstrap: The caches do not work without cpuarrays anymore, but the
278 * cpuarrays are allocated from the generic caches... 279 * cpuarrays are allocated from the generic caches...
279 */ 280 */
280 #define BOOT_CPUCACHE_ENTRIES 1 281 #define BOOT_CPUCACHE_ENTRIES 1
281 struct arraycache_init { 282 struct arraycache_init {
282 struct array_cache cache; 283 struct array_cache cache;
283 void *entries[BOOT_CPUCACHE_ENTRIES]; 284 void *entries[BOOT_CPUCACHE_ENTRIES];
284 }; 285 };
285 286
286 /* 287 /*
287 * The slab lists for all objects. 288 * The slab lists for all objects.
288 */ 289 */
289 struct kmem_list3 { 290 struct kmem_list3 {
290 struct list_head slabs_partial; /* partial list first, better asm code */ 291 struct list_head slabs_partial; /* partial list first, better asm code */
291 struct list_head slabs_full; 292 struct list_head slabs_full;
292 struct list_head slabs_free; 293 struct list_head slabs_free;
293 unsigned long free_objects; 294 unsigned long free_objects;
294 unsigned int free_limit; 295 unsigned int free_limit;
295 unsigned int colour_next; /* Per-node cache coloring */ 296 unsigned int colour_next; /* Per-node cache coloring */
296 spinlock_t list_lock; 297 spinlock_t list_lock;
297 struct array_cache *shared; /* shared per node */ 298 struct array_cache *shared; /* shared per node */
298 struct array_cache **alien; /* on other nodes */ 299 struct array_cache **alien; /* on other nodes */
299 unsigned long next_reap; /* updated without locking */ 300 unsigned long next_reap; /* updated without locking */
300 int free_touched; /* updated without locking */ 301 int free_touched; /* updated without locking */
301 }; 302 };
302 303
303 /* 304 /*
304 * Need this for bootstrapping a per node allocator. 305 * Need this for bootstrapping a per node allocator.
305 */ 306 */
306 #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1) 307 #define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
307 struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; 308 struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
308 #define CACHE_CACHE 0 309 #define CACHE_CACHE 0
309 #define SIZE_AC 1 310 #define SIZE_AC 1
310 #define SIZE_L3 (1 + MAX_NUMNODES) 311 #define SIZE_L3 (1 + MAX_NUMNODES)
311 312
312 static int drain_freelist(struct kmem_cache *cache, 313 static int drain_freelist(struct kmem_cache *cache,
313 struct kmem_list3 *l3, int tofree); 314 struct kmem_list3 *l3, int tofree);
314 static void free_block(struct kmem_cache *cachep, void **objpp, int len, 315 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
315 int node); 316 int node);
316 static int enable_cpucache(struct kmem_cache *cachep); 317 static int enable_cpucache(struct kmem_cache *cachep);
317 static void cache_reap(struct work_struct *unused); 318 static void cache_reap(struct work_struct *unused);
318 319
319 /* 320 /*
320 * This function must be completely optimized away if a constant is passed to 321 * This function must be completely optimized away if a constant is passed to
321 * it. Mostly the same as what is in linux/slab.h except it returns an index. 322 * it. Mostly the same as what is in linux/slab.h except it returns an index.
322 */ 323 */
323 static __always_inline int index_of(const size_t size) 324 static __always_inline int index_of(const size_t size)
324 { 325 {
325 extern void __bad_size(void); 326 extern void __bad_size(void);
326 327
327 if (__builtin_constant_p(size)) { 328 if (__builtin_constant_p(size)) {
328 int i = 0; 329 int i = 0;
329 330
330 #define CACHE(x) \ 331 #define CACHE(x) \
331 if (size <=x) \ 332 if (size <=x) \
332 return i; \ 333 return i; \
333 else \ 334 else \
334 i++; 335 i++;
335 #include "linux/kmalloc_sizes.h" 336 #include "linux/kmalloc_sizes.h"
336 #undef CACHE 337 #undef CACHE
337 __bad_size(); 338 __bad_size();
338 } else 339 } else
339 __bad_size(); 340 __bad_size();
340 return 0; 341 return 0;
341 } 342 }
342 343
343 static int slab_early_init = 1; 344 static int slab_early_init = 1;
344 345
345 #define INDEX_AC index_of(sizeof(struct arraycache_init)) 346 #define INDEX_AC index_of(sizeof(struct arraycache_init))
346 #define INDEX_L3 index_of(sizeof(struct kmem_list3)) 347 #define INDEX_L3 index_of(sizeof(struct kmem_list3))
347 348
348 static void kmem_list3_init(struct kmem_list3 *parent) 349 static void kmem_list3_init(struct kmem_list3 *parent)
349 { 350 {
350 INIT_LIST_HEAD(&parent->slabs_full); 351 INIT_LIST_HEAD(&parent->slabs_full);
351 INIT_LIST_HEAD(&parent->slabs_partial); 352 INIT_LIST_HEAD(&parent->slabs_partial);
352 INIT_LIST_HEAD(&parent->slabs_free); 353 INIT_LIST_HEAD(&parent->slabs_free);
353 parent->shared = NULL; 354 parent->shared = NULL;
354 parent->alien = NULL; 355 parent->alien = NULL;
355 parent->colour_next = 0; 356 parent->colour_next = 0;
356 spin_lock_init(&parent->list_lock); 357 spin_lock_init(&parent->list_lock);
357 parent->free_objects = 0; 358 parent->free_objects = 0;
358 parent->free_touched = 0; 359 parent->free_touched = 0;
359 } 360 }
360 361
361 #define MAKE_LIST(cachep, listp, slab, nodeid) \ 362 #define MAKE_LIST(cachep, listp, slab, nodeid) \
362 do { \ 363 do { \
363 INIT_LIST_HEAD(listp); \ 364 INIT_LIST_HEAD(listp); \
364 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ 365 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
365 } while (0) 366 } while (0)
366 367
367 #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ 368 #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
368 do { \ 369 do { \
369 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ 370 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
370 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ 371 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
371 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 372 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
372 } while (0) 373 } while (0)
373 374
374 /* 375 /*
375 * struct kmem_cache 376 * struct kmem_cache
376 * 377 *
377 * manages a cache. 378 * manages a cache.
378 */ 379 */
379 380
380 struct kmem_cache { 381 struct kmem_cache {
381 /* 1) per-cpu data, touched during every alloc/free */ 382 /* 1) per-cpu data, touched during every alloc/free */
382 struct array_cache *array[NR_CPUS]; 383 struct array_cache *array[NR_CPUS];
383 /* 2) Cache tunables. Protected by cache_chain_mutex */ 384 /* 2) Cache tunables. Protected by cache_chain_mutex */
384 unsigned int batchcount; 385 unsigned int batchcount;
385 unsigned int limit; 386 unsigned int limit;
386 unsigned int shared; 387 unsigned int shared;
387 388
388 unsigned int buffer_size; 389 unsigned int buffer_size;
390 u32 reciprocal_buffer_size;
389 /* 3) touched by every alloc & free from the backend */ 391 /* 3) touched by every alloc & free from the backend */
390 struct kmem_list3 *nodelists[MAX_NUMNODES]; 392 struct kmem_list3 *nodelists[MAX_NUMNODES];
391 393
392 unsigned int flags; /* constant flags */ 394 unsigned int flags; /* constant flags */
393 unsigned int num; /* # of objs per slab */ 395 unsigned int num; /* # of objs per slab */
394 396
395 /* 4) cache_grow/shrink */ 397 /* 4) cache_grow/shrink */
396 /* order of pgs per slab (2^n) */ 398 /* order of pgs per slab (2^n) */
397 unsigned int gfporder; 399 unsigned int gfporder;
398 400
399 /* force GFP flags, e.g. GFP_DMA */ 401 /* force GFP flags, e.g. GFP_DMA */
400 gfp_t gfpflags; 402 gfp_t gfpflags;
401 403
402 size_t colour; /* cache colouring range */ 404 size_t colour; /* cache colouring range */
403 unsigned int colour_off; /* colour offset */ 405 unsigned int colour_off; /* colour offset */
404 struct kmem_cache *slabp_cache; 406 struct kmem_cache *slabp_cache;
405 unsigned int slab_size; 407 unsigned int slab_size;
406 unsigned int dflags; /* dynamic flags */ 408 unsigned int dflags; /* dynamic flags */
407 409
408 /* constructor func */ 410 /* constructor func */
409 void (*ctor) (void *, struct kmem_cache *, unsigned long); 411 void (*ctor) (void *, struct kmem_cache *, unsigned long);
410 412
411 /* de-constructor func */ 413 /* de-constructor func */
412 void (*dtor) (void *, struct kmem_cache *, unsigned long); 414 void (*dtor) (void *, struct kmem_cache *, unsigned long);
413 415
414 /* 5) cache creation/removal */ 416 /* 5) cache creation/removal */
415 const char *name; 417 const char *name;
416 struct list_head next; 418 struct list_head next;
417 419
418 /* 6) statistics */ 420 /* 6) statistics */
419 #if STATS 421 #if STATS
420 unsigned long num_active; 422 unsigned long num_active;
421 unsigned long num_allocations; 423 unsigned long num_allocations;
422 unsigned long high_mark; 424 unsigned long high_mark;
423 unsigned long grown; 425 unsigned long grown;
424 unsigned long reaped; 426 unsigned long reaped;
425 unsigned long errors; 427 unsigned long errors;
426 unsigned long max_freeable; 428 unsigned long max_freeable;
427 unsigned long node_allocs; 429 unsigned long node_allocs;
428 unsigned long node_frees; 430 unsigned long node_frees;
429 unsigned long node_overflow; 431 unsigned long node_overflow;
430 atomic_t allochit; 432 atomic_t allochit;
431 atomic_t allocmiss; 433 atomic_t allocmiss;
432 atomic_t freehit; 434 atomic_t freehit;
433 atomic_t freemiss; 435 atomic_t freemiss;
434 #endif 436 #endif
435 #if DEBUG 437 #if DEBUG
436 /* 438 /*
437 * If debugging is enabled, then the allocator can add additional 439 * If debugging is enabled, then the allocator can add additional
438 * fields and/or padding to every object. buffer_size contains the total 440 * fields and/or padding to every object. buffer_size contains the total
439 * object size including these internal fields, the following two 441 * object size including these internal fields, the following two
440 * variables contain the offset to the user object and its size. 442 * variables contain the offset to the user object and its size.
441 */ 443 */
442 int obj_offset; 444 int obj_offset;
443 int obj_size; 445 int obj_size;
444 #endif 446 #endif
445 }; 447 };
446 448
447 #define CFLGS_OFF_SLAB (0x80000000UL) 449 #define CFLGS_OFF_SLAB (0x80000000UL)
448 #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 450 #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
449 451
450 #define BATCHREFILL_LIMIT 16 452 #define BATCHREFILL_LIMIT 16
451 /* 453 /*
452 * Optimization question: fewer reaps means less probability for unnessary 454 * Optimization question: fewer reaps means less probability for unnessary
453 * cpucache drain/refill cycles. 455 * cpucache drain/refill cycles.
454 * 456 *
455 * OTOH the cpuarrays can contain lots of objects, 457 * OTOH the cpuarrays can contain lots of objects,
456 * which could lock up otherwise freeable slabs. 458 * which could lock up otherwise freeable slabs.
457 */ 459 */
458 #define REAPTIMEOUT_CPUC (2*HZ) 460 #define REAPTIMEOUT_CPUC (2*HZ)
459 #define REAPTIMEOUT_LIST3 (4*HZ) 461 #define REAPTIMEOUT_LIST3 (4*HZ)
460 462
461 #if STATS 463 #if STATS
462 #define STATS_INC_ACTIVE(x) ((x)->num_active++) 464 #define STATS_INC_ACTIVE(x) ((x)->num_active++)
463 #define STATS_DEC_ACTIVE(x) ((x)->num_active--) 465 #define STATS_DEC_ACTIVE(x) ((x)->num_active--)
464 #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 466 #define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
465 #define STATS_INC_GROWN(x) ((x)->grown++) 467 #define STATS_INC_GROWN(x) ((x)->grown++)
466 #define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) 468 #define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
467 #define STATS_SET_HIGH(x) \ 469 #define STATS_SET_HIGH(x) \
468 do { \ 470 do { \
469 if ((x)->num_active > (x)->high_mark) \ 471 if ((x)->num_active > (x)->high_mark) \
470 (x)->high_mark = (x)->num_active; \ 472 (x)->high_mark = (x)->num_active; \
471 } while (0) 473 } while (0)
472 #define STATS_INC_ERR(x) ((x)->errors++) 474 #define STATS_INC_ERR(x) ((x)->errors++)
473 #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 475 #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
474 #define STATS_INC_NODEFREES(x) ((x)->node_frees++) 476 #define STATS_INC_NODEFREES(x) ((x)->node_frees++)
475 #define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++) 477 #define STATS_INC_ACOVERFLOW(x) ((x)->node_overflow++)
476 #define STATS_SET_FREEABLE(x, i) \ 478 #define STATS_SET_FREEABLE(x, i) \
477 do { \ 479 do { \
478 if ((x)->max_freeable < i) \ 480 if ((x)->max_freeable < i) \
479 (x)->max_freeable = i; \ 481 (x)->max_freeable = i; \
480 } while (0) 482 } while (0)
481 #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) 483 #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
482 #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) 484 #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
483 #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) 485 #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
484 #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss) 486 #define STATS_INC_FREEMISS(x) atomic_inc(&(x)->freemiss)
485 #else 487 #else
486 #define STATS_INC_ACTIVE(x) do { } while (0) 488 #define STATS_INC_ACTIVE(x) do { } while (0)
487 #define STATS_DEC_ACTIVE(x) do { } while (0) 489 #define STATS_DEC_ACTIVE(x) do { } while (0)
488 #define STATS_INC_ALLOCED(x) do { } while (0) 490 #define STATS_INC_ALLOCED(x) do { } while (0)
489 #define STATS_INC_GROWN(x) do { } while (0) 491 #define STATS_INC_GROWN(x) do { } while (0)
490 #define STATS_ADD_REAPED(x,y) do { } while (0) 492 #define STATS_ADD_REAPED(x,y) do { } while (0)
491 #define STATS_SET_HIGH(x) do { } while (0) 493 #define STATS_SET_HIGH(x) do { } while (0)
492 #define STATS_INC_ERR(x) do { } while (0) 494 #define STATS_INC_ERR(x) do { } while (0)
493 #define STATS_INC_NODEALLOCS(x) do { } while (0) 495 #define STATS_INC_NODEALLOCS(x) do { } while (0)
494 #define STATS_INC_NODEFREES(x) do { } while (0) 496 #define STATS_INC_NODEFREES(x) do { } while (0)
495 #define STATS_INC_ACOVERFLOW(x) do { } while (0) 497 #define STATS_INC_ACOVERFLOW(x) do { } while (0)
496 #define STATS_SET_FREEABLE(x, i) do { } while (0) 498 #define STATS_SET_FREEABLE(x, i) do { } while (0)
497 #define STATS_INC_ALLOCHIT(x) do { } while (0) 499 #define STATS_INC_ALLOCHIT(x) do { } while (0)
498 #define STATS_INC_ALLOCMISS(x) do { } while (0) 500 #define STATS_INC_ALLOCMISS(x) do { } while (0)
499 #define STATS_INC_FREEHIT(x) do { } while (0) 501 #define STATS_INC_FREEHIT(x) do { } while (0)
500 #define STATS_INC_FREEMISS(x) do { } while (0) 502 #define STATS_INC_FREEMISS(x) do { } while (0)
501 #endif 503 #endif
502 504
503 #if DEBUG 505 #if DEBUG
504 506
505 /* 507 /*
506 * memory layout of objects: 508 * memory layout of objects:
507 * 0 : objp 509 * 0 : objp
508 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that 510 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
509 * the end of an object is aligned with the end of the real 511 * the end of an object is aligned with the end of the real
510 * allocation. Catches writes behind the end of the allocation. 512 * allocation. Catches writes behind the end of the allocation.
511 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: 513 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
512 * redzone word. 514 * redzone word.
513 * cachep->obj_offset: The real object. 515 * cachep->obj_offset: The real object.
514 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 516 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
515 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address 517 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
516 * [BYTES_PER_WORD long] 518 * [BYTES_PER_WORD long]
517 */ 519 */
518 static int obj_offset(struct kmem_cache *cachep) 520 static int obj_offset(struct kmem_cache *cachep)
519 { 521 {
520 return cachep->obj_offset; 522 return cachep->obj_offset;
521 } 523 }
522 524
523 static int obj_size(struct kmem_cache *cachep) 525 static int obj_size(struct kmem_cache *cachep)
524 { 526 {
525 return cachep->obj_size; 527 return cachep->obj_size;
526 } 528 }
527 529
528 static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp) 530 static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
529 { 531 {
530 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 532 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
531 return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD); 533 return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD);
532 } 534 }
533 535
534 static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp) 536 static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
535 { 537 {
536 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 538 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
537 if (cachep->flags & SLAB_STORE_USER) 539 if (cachep->flags & SLAB_STORE_USER)
538 return (unsigned long *)(objp + cachep->buffer_size - 540 return (unsigned long *)(objp + cachep->buffer_size -
539 2 * BYTES_PER_WORD); 541 2 * BYTES_PER_WORD);
540 return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD); 542 return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD);
541 } 543 }
542 544
543 static void **dbg_userword(struct kmem_cache *cachep, void *objp) 545 static void **dbg_userword(struct kmem_cache *cachep, void *objp)
544 { 546 {
545 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 547 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
546 return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); 548 return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD);
547 } 549 }
548 550
549 #else 551 #else
550 552
551 #define obj_offset(x) 0 553 #define obj_offset(x) 0
552 #define obj_size(cachep) (cachep->buffer_size) 554 #define obj_size(cachep) (cachep->buffer_size)
553 #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 555 #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;})
554 #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) 556 #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;})
555 #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 557 #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
556 558
557 #endif 559 #endif
558 560
559 /* 561 /*
560 * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp 562 * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
561 * order. 563 * order.
562 */ 564 */
563 #if defined(CONFIG_LARGE_ALLOCS) 565 #if defined(CONFIG_LARGE_ALLOCS)
564 #define MAX_OBJ_ORDER 13 /* up to 32Mb */ 566 #define MAX_OBJ_ORDER 13 /* up to 32Mb */
565 #define MAX_GFP_ORDER 13 /* up to 32Mb */ 567 #define MAX_GFP_ORDER 13 /* up to 32Mb */
566 #elif defined(CONFIG_MMU) 568 #elif defined(CONFIG_MMU)
567 #define MAX_OBJ_ORDER 5 /* 32 pages */ 569 #define MAX_OBJ_ORDER 5 /* 32 pages */
568 #define MAX_GFP_ORDER 5 /* 32 pages */ 570 #define MAX_GFP_ORDER 5 /* 32 pages */
569 #else 571 #else
570 #define MAX_OBJ_ORDER 8 /* up to 1Mb */ 572 #define MAX_OBJ_ORDER 8 /* up to 1Mb */
571 #define MAX_GFP_ORDER 8 /* up to 1Mb */ 573 #define MAX_GFP_ORDER 8 /* up to 1Mb */
572 #endif 574 #endif
573 575
574 /* 576 /*
575 * Do not go above this order unless 0 objects fit into the slab. 577 * Do not go above this order unless 0 objects fit into the slab.
576 */ 578 */
577 #define BREAK_GFP_ORDER_HI 1 579 #define BREAK_GFP_ORDER_HI 1
578 #define BREAK_GFP_ORDER_LO 0 580 #define BREAK_GFP_ORDER_LO 0
579 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; 581 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
580 582
581 /* 583 /*
582 * Functions for storing/retrieving the cachep and or slab from the page 584 * Functions for storing/retrieving the cachep and or slab from the page
583 * allocator. These are used to find the slab an obj belongs to. With kfree(), 585 * allocator. These are used to find the slab an obj belongs to. With kfree(),
584 * these are used to find the cache which an obj belongs to. 586 * these are used to find the cache which an obj belongs to.
585 */ 587 */
586 static inline void page_set_cache(struct page *page, struct kmem_cache *cache) 588 static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
587 { 589 {
588 page->lru.next = (struct list_head *)cache; 590 page->lru.next = (struct list_head *)cache;
589 } 591 }
590 592
591 static inline struct kmem_cache *page_get_cache(struct page *page) 593 static inline struct kmem_cache *page_get_cache(struct page *page)
592 { 594 {
593 if (unlikely(PageCompound(page))) 595 if (unlikely(PageCompound(page)))
594 page = (struct page *)page_private(page); 596 page = (struct page *)page_private(page);
595 BUG_ON(!PageSlab(page)); 597 BUG_ON(!PageSlab(page));
596 return (struct kmem_cache *)page->lru.next; 598 return (struct kmem_cache *)page->lru.next;
597 } 599 }
598 600
599 static inline void page_set_slab(struct page *page, struct slab *slab) 601 static inline void page_set_slab(struct page *page, struct slab *slab)
600 { 602 {
601 page->lru.prev = (struct list_head *)slab; 603 page->lru.prev = (struct list_head *)slab;
602 } 604 }
603 605
604 static inline struct slab *page_get_slab(struct page *page) 606 static inline struct slab *page_get_slab(struct page *page)
605 { 607 {
606 if (unlikely(PageCompound(page))) 608 if (unlikely(PageCompound(page)))
607 page = (struct page *)page_private(page); 609 page = (struct page *)page_private(page);
608 BUG_ON(!PageSlab(page)); 610 BUG_ON(!PageSlab(page));
609 return (struct slab *)page->lru.prev; 611 return (struct slab *)page->lru.prev;
610 } 612 }
611 613
612 static inline struct kmem_cache *virt_to_cache(const void *obj) 614 static inline struct kmem_cache *virt_to_cache(const void *obj)
613 { 615 {
614 struct page *page = virt_to_page(obj); 616 struct page *page = virt_to_page(obj);
615 return page_get_cache(page); 617 return page_get_cache(page);
616 } 618 }
617 619
618 static inline struct slab *virt_to_slab(const void *obj) 620 static inline struct slab *virt_to_slab(const void *obj)
619 { 621 {
620 struct page *page = virt_to_page(obj); 622 struct page *page = virt_to_page(obj);
621 return page_get_slab(page); 623 return page_get_slab(page);
622 } 624 }
623 625
624 static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, 626 static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
625 unsigned int idx) 627 unsigned int idx)
626 { 628 {
627 return slab->s_mem + cache->buffer_size * idx; 629 return slab->s_mem + cache->buffer_size * idx;
628 } 630 }
629 631
630 static inline unsigned int obj_to_index(struct kmem_cache *cache, 632 /*
631 struct slab *slab, void *obj) 633 * We want to avoid an expensive divide : (offset / cache->buffer_size)
634 * Using the fact that buffer_size is a constant for a particular cache,
635 * we can replace (offset / cache->buffer_size) by
636 * reciprocal_divide(offset, cache->reciprocal_buffer_size)
637 */
638 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
639 const struct slab *slab, void *obj)
632 { 640 {
633 return (unsigned)(obj - slab->s_mem) / cache->buffer_size; 641 u32 offset = (obj - slab->s_mem);
642 return reciprocal_divide(offset, cache->reciprocal_buffer_size);
634 } 643 }
635 644
636 /* 645 /*
637 * These are the default caches for kmalloc. Custom caches can have other sizes. 646 * These are the default caches for kmalloc. Custom caches can have other sizes.
638 */ 647 */
639 struct cache_sizes malloc_sizes[] = { 648 struct cache_sizes malloc_sizes[] = {
640 #define CACHE(x) { .cs_size = (x) }, 649 #define CACHE(x) { .cs_size = (x) },
641 #include <linux/kmalloc_sizes.h> 650 #include <linux/kmalloc_sizes.h>
642 CACHE(ULONG_MAX) 651 CACHE(ULONG_MAX)
643 #undef CACHE 652 #undef CACHE
644 }; 653 };
645 EXPORT_SYMBOL(malloc_sizes); 654 EXPORT_SYMBOL(malloc_sizes);
646 655
647 /* Must match cache_sizes above. Out of line to keep cache footprint low. */ 656 /* Must match cache_sizes above. Out of line to keep cache footprint low. */
648 struct cache_names { 657 struct cache_names {
649 char *name; 658 char *name;
650 char *name_dma; 659 char *name_dma;
651 }; 660 };
652 661
653 static struct cache_names __initdata cache_names[] = { 662 static struct cache_names __initdata cache_names[] = {
654 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, 663 #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
655 #include <linux/kmalloc_sizes.h> 664 #include <linux/kmalloc_sizes.h>
656 {NULL,} 665 {NULL,}
657 #undef CACHE 666 #undef CACHE
658 }; 667 };
659 668
660 static struct arraycache_init initarray_cache __initdata = 669 static struct arraycache_init initarray_cache __initdata =
661 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 670 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
662 static struct arraycache_init initarray_generic = 671 static struct arraycache_init initarray_generic =
663 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 672 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
664 673
665 /* internal cache of cache description objs */ 674 /* internal cache of cache description objs */
666 static struct kmem_cache cache_cache = { 675 static struct kmem_cache cache_cache = {
667 .batchcount = 1, 676 .batchcount = 1,
668 .limit = BOOT_CPUCACHE_ENTRIES, 677 .limit = BOOT_CPUCACHE_ENTRIES,
669 .shared = 1, 678 .shared = 1,
670 .buffer_size = sizeof(struct kmem_cache), 679 .buffer_size = sizeof(struct kmem_cache),
671 .name = "kmem_cache", 680 .name = "kmem_cache",
672 #if DEBUG 681 #if DEBUG
673 .obj_size = sizeof(struct kmem_cache), 682 .obj_size = sizeof(struct kmem_cache),
674 #endif 683 #endif
675 }; 684 };
676 685
677 #define BAD_ALIEN_MAGIC 0x01020304ul 686 #define BAD_ALIEN_MAGIC 0x01020304ul
678 687
679 #ifdef CONFIG_LOCKDEP 688 #ifdef CONFIG_LOCKDEP
680 689
681 /* 690 /*
682 * Slab sometimes uses the kmalloc slabs to store the slab headers 691 * Slab sometimes uses the kmalloc slabs to store the slab headers
683 * for other slabs "off slab". 692 * for other slabs "off slab".
684 * The locking for this is tricky in that it nests within the locks 693 * The locking for this is tricky in that it nests within the locks
685 * of all other slabs in a few places; to deal with this special 694 * of all other slabs in a few places; to deal with this special
686 * locking we put on-slab caches into a separate lock-class. 695 * locking we put on-slab caches into a separate lock-class.
687 * 696 *
688 * We set lock class for alien array caches which are up during init. 697 * We set lock class for alien array caches which are up during init.
689 * The lock annotation will be lost if all cpus of a node goes down and 698 * The lock annotation will be lost if all cpus of a node goes down and
690 * then comes back up during hotplug 699 * then comes back up during hotplug
691 */ 700 */
692 static struct lock_class_key on_slab_l3_key; 701 static struct lock_class_key on_slab_l3_key;
693 static struct lock_class_key on_slab_alc_key; 702 static struct lock_class_key on_slab_alc_key;
694 703
695 static inline void init_lock_keys(void) 704 static inline void init_lock_keys(void)
696 705
697 { 706 {
698 int q; 707 int q;
699 struct cache_sizes *s = malloc_sizes; 708 struct cache_sizes *s = malloc_sizes;
700 709
701 while (s->cs_size != ULONG_MAX) { 710 while (s->cs_size != ULONG_MAX) {
702 for_each_node(q) { 711 for_each_node(q) {
703 struct array_cache **alc; 712 struct array_cache **alc;
704 int r; 713 int r;
705 struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; 714 struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
706 if (!l3 || OFF_SLAB(s->cs_cachep)) 715 if (!l3 || OFF_SLAB(s->cs_cachep))
707 continue; 716 continue;
708 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 717 lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
709 alc = l3->alien; 718 alc = l3->alien;
710 /* 719 /*
711 * FIXME: This check for BAD_ALIEN_MAGIC 720 * FIXME: This check for BAD_ALIEN_MAGIC
712 * should go away when common slab code is taught to 721 * should go away when common slab code is taught to
713 * work even without alien caches. 722 * work even without alien caches.
714 * Currently, non NUMA code returns BAD_ALIEN_MAGIC 723 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
715 * for alloc_alien_cache, 724 * for alloc_alien_cache,
716 */ 725 */
717 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) 726 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
718 continue; 727 continue;
719 for_each_node(r) { 728 for_each_node(r) {
720 if (alc[r]) 729 if (alc[r])
721 lockdep_set_class(&alc[r]->lock, 730 lockdep_set_class(&alc[r]->lock,
722 &on_slab_alc_key); 731 &on_slab_alc_key);
723 } 732 }
724 } 733 }
725 s++; 734 s++;
726 } 735 }
727 } 736 }
728 #else 737 #else
729 static inline void init_lock_keys(void) 738 static inline void init_lock_keys(void)
730 { 739 {
731 } 740 }
732 #endif 741 #endif
733 742
734 /* 743 /*
735 * 1. Guard access to the cache-chain. 744 * 1. Guard access to the cache-chain.
736 * 2. Protect sanity of cpu_online_map against cpu hotplug events 745 * 2. Protect sanity of cpu_online_map against cpu hotplug events
737 */ 746 */
738 static DEFINE_MUTEX(cache_chain_mutex); 747 static DEFINE_MUTEX(cache_chain_mutex);
739 static struct list_head cache_chain; 748 static struct list_head cache_chain;
740 749
741 /* 750 /*
742 * chicken and egg problem: delay the per-cpu array allocation 751 * chicken and egg problem: delay the per-cpu array allocation
743 * until the general caches are up. 752 * until the general caches are up.
744 */ 753 */
745 static enum { 754 static enum {
746 NONE, 755 NONE,
747 PARTIAL_AC, 756 PARTIAL_AC,
748 PARTIAL_L3, 757 PARTIAL_L3,
749 FULL 758 FULL
750 } g_cpucache_up; 759 } g_cpucache_up;
751 760
752 /* 761 /*
753 * used by boot code to determine if it can use the slab-based allocator 762 * used by boot code to determine if it can use the slab-based allocator
754 */ 763 */
755 int slab_is_available(void) 764 int slab_is_available(void)
756 { 765 {
757 return g_cpucache_up == FULL; 766 return g_cpucache_up == FULL;
758 } 767 }
759 768
760 static DEFINE_PER_CPU(struct delayed_work, reap_work); 769 static DEFINE_PER_CPU(struct delayed_work, reap_work);
761 770
762 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 771 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
763 { 772 {
764 return cachep->array[smp_processor_id()]; 773 return cachep->array[smp_processor_id()];
765 } 774 }
766 775
767 static inline struct kmem_cache *__find_general_cachep(size_t size, 776 static inline struct kmem_cache *__find_general_cachep(size_t size,
768 gfp_t gfpflags) 777 gfp_t gfpflags)
769 { 778 {
770 struct cache_sizes *csizep = malloc_sizes; 779 struct cache_sizes *csizep = malloc_sizes;
771 780
772 #if DEBUG 781 #if DEBUG
773 /* This happens if someone tries to call 782 /* This happens if someone tries to call
774 * kmem_cache_create(), or __kmalloc(), before 783 * kmem_cache_create(), or __kmalloc(), before
775 * the generic caches are initialized. 784 * the generic caches are initialized.
776 */ 785 */
777 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); 786 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
778 #endif 787 #endif
779 while (size > csizep->cs_size) 788 while (size > csizep->cs_size)
780 csizep++; 789 csizep++;
781 790
782 /* 791 /*
783 * Really subtle: The last entry with cs->cs_size==ULONG_MAX 792 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
784 * has cs_{dma,}cachep==NULL. Thus no special case 793 * has cs_{dma,}cachep==NULL. Thus no special case
785 * for large kmalloc calls required. 794 * for large kmalloc calls required.
786 */ 795 */
787 if (unlikely(gfpflags & GFP_DMA)) 796 if (unlikely(gfpflags & GFP_DMA))
788 return csizep->cs_dmacachep; 797 return csizep->cs_dmacachep;
789 return csizep->cs_cachep; 798 return csizep->cs_cachep;
790 } 799 }
791 800
792 static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) 801 static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
793 { 802 {
794 return __find_general_cachep(size, gfpflags); 803 return __find_general_cachep(size, gfpflags);
795 } 804 }
796 805
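
As a usage sketch of the table walk above (hypothetical caller; assumes the 128-byte general cache is the first malloc_sizes[] entry able to hold 100 bytes):

        static void *find_cachep_example(void)
        {
                struct kmem_cache *cachep;

                /* The loop stops at the first cs_size >= 100 and, without
                 * GFP_DMA, returns that entry's cs_cachep. */
                cachep = kmem_find_general_cachep(100, GFP_KERNEL);
                if (!cachep)
                        return NULL;
                return kmem_cache_alloc(cachep, GFP_KERNEL);
        }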
797 static size_t slab_mgmt_size(size_t nr_objs, size_t align) 806 static size_t slab_mgmt_size(size_t nr_objs, size_t align)
798 { 807 {
799 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); 808 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
800 } 809 }
801 810
802 /* 811 /*
803 * Calculate the number of objects and left-over bytes for a given buffer size. 812 * Calculate the number of objects and left-over bytes for a given buffer size.
804 */ 813 */
805 static void cache_estimate(unsigned long gfporder, size_t buffer_size, 814 static void cache_estimate(unsigned long gfporder, size_t buffer_size,
806 size_t align, int flags, size_t *left_over, 815 size_t align, int flags, size_t *left_over,
807 unsigned int *num) 816 unsigned int *num)
808 { 817 {
809 int nr_objs; 818 int nr_objs;
810 size_t mgmt_size; 819 size_t mgmt_size;
811 size_t slab_size = PAGE_SIZE << gfporder; 820 size_t slab_size = PAGE_SIZE << gfporder;
812 821
813 /* 822 /*
814 * The slab management structure can be either off the slab or 823 * The slab management structure can be either off the slab or
815 * on it. For the latter case, the memory allocated for a 824 * on it. For the latter case, the memory allocated for a
816 * slab is used for: 825 * slab is used for:
817 * 826 *
818 * - The struct slab 827 * - The struct slab
819 * - One kmem_bufctl_t for each object 828 * - One kmem_bufctl_t for each object
820 * - Padding to respect alignment of @align 829 * - Padding to respect alignment of @align
821 * - @buffer_size bytes for each object 830 * - @buffer_size bytes for each object
822 * 831 *
823 * If the slab management structure is off the slab, then the 832 * If the slab management structure is off the slab, then the
824 * alignment will already be calculated into the size. Because 833 * alignment will already be calculated into the size. Because
825 * the slabs are all page aligned, the objects will be at the 834 * the slabs are all page aligned, the objects will be at the
826 * correct alignment when allocated. 835 * correct alignment when allocated.
827 */ 836 */
828 if (flags & CFLGS_OFF_SLAB) { 837 if (flags & CFLGS_OFF_SLAB) {
829 mgmt_size = 0; 838 mgmt_size = 0;
830 nr_objs = slab_size / buffer_size; 839 nr_objs = slab_size / buffer_size;
831 840
832 if (nr_objs > SLAB_LIMIT) 841 if (nr_objs > SLAB_LIMIT)
833 nr_objs = SLAB_LIMIT; 842 nr_objs = SLAB_LIMIT;
834 } else { 843 } else {
835 /* 844 /*
836 * Ignore padding for the initial guess. The padding 845 * Ignore padding for the initial guess. The padding
837 * is at most @align-1 bytes, and @buffer_size is at 846 * is at most @align-1 bytes, and @buffer_size is at
838 * least @align. In the worst case, this result will 847 * least @align. In the worst case, this result will
839 * be one greater than the number of objects that fit 848 * be one greater than the number of objects that fit
840 * into the memory allocation when taking the padding 849 * into the memory allocation when taking the padding
841 * into account. 850 * into account.
842 */ 851 */
843 nr_objs = (slab_size - sizeof(struct slab)) / 852 nr_objs = (slab_size - sizeof(struct slab)) /
844 (buffer_size + sizeof(kmem_bufctl_t)); 853 (buffer_size + sizeof(kmem_bufctl_t));
845 854
846 /* 855 /*
847 * This calculated number will be either the right 856 * This calculated number will be either the right
848 * amount, or one greater than what we want. 857 * amount, or one greater than what we want.
849 */ 858 */
850 if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size 859 if (slab_mgmt_size(nr_objs, align) + nr_objs*buffer_size
851 > slab_size) 860 > slab_size)
852 nr_objs--; 861 nr_objs--;
853 862
854 if (nr_objs > SLAB_LIMIT) 863 if (nr_objs > SLAB_LIMIT)
855 nr_objs = SLAB_LIMIT; 864 nr_objs = SLAB_LIMIT;
856 865
857 mgmt_size = slab_mgmt_size(nr_objs, align); 866 mgmt_size = slab_mgmt_size(nr_objs, align);
858 } 867 }
859 *num = nr_objs; 868 *num = nr_objs;
860 *left_over = slab_size - nr_objs*buffer_size - mgmt_size; 869 *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
861 } 870 }
862 871
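
A worked instance of the on-slab branch above, with assumed numbers: PAGE_SIZE 4096, order 0, buffer_size 256, align 32, a hypothetical 32-byte struct slab and 4-byte kmem_bufctl_t.

        /*
         * nr_objs   = (4096 - 32) / (256 + 4)  = 15   (initial guess)
         * mgmt_size = ALIGN(32 + 15 * 4, 32)   = 96   (slab_mgmt_size)
         * check     : 96 + 15 * 256 = 3936 <= 4096, so 15 objects stand
         * left_over = 4096 - 15 * 256 - 96     = 160  (available for colouring)
         */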
863 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) 872 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
864 873
865 static void __slab_error(const char *function, struct kmem_cache *cachep, 874 static void __slab_error(const char *function, struct kmem_cache *cachep,
866 char *msg) 875 char *msg)
867 { 876 {
868 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 877 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
869 function, cachep->name, msg); 878 function, cachep->name, msg);
870 dump_stack(); 879 dump_stack();
871 } 880 }
872 881
873 /* 882 /*
874 * By default on NUMA we use alien caches to stage the freeing of 883 * By default on NUMA we use alien caches to stage the freeing of
875 * objects allocated from other nodes. This causes massive memory 884 * objects allocated from other nodes. This causes massive memory
876 * inefficiencies when using a fake NUMA setup to split memory into a 885 * inefficiencies when using a fake NUMA setup to split memory into a
877 * large number of small nodes, so it can be disabled on the command 886 * large number of small nodes, so it can be disabled on the command
878 * line 887 * line
879 */ 888 */
880 889
881 static int use_alien_caches __read_mostly = 1; 890 static int use_alien_caches __read_mostly = 1;
882 static int __init noaliencache_setup(char *s) 891 static int __init noaliencache_setup(char *s)
883 { 892 {
884 use_alien_caches = 0; 893 use_alien_caches = 0;
885 return 1; 894 return 1;
886 } 895 }
887 __setup("noaliencache", noaliencache_setup); 896 __setup("noaliencache", noaliencache_setup);
888 897
889 #ifdef CONFIG_NUMA 898 #ifdef CONFIG_NUMA
890 /* 899 /*
891 * Special reaping functions for NUMA systems called from cache_reap(). 900 * Special reaping functions for NUMA systems called from cache_reap().
892 * These take care of doing round-robin flushing of alien caches (containing 901 * These take care of doing round-robin flushing of alien caches (containing
893 * objects freed on a node other than the one they were allocated on) and the 902 * objects freed on a node other than the one they were allocated on) and the
894 * flushing of remote pcps by calling drain_node_pages. 903 * flushing of remote pcps by calling drain_node_pages.
895 */ 904 */
896 static DEFINE_PER_CPU(unsigned long, reap_node); 905 static DEFINE_PER_CPU(unsigned long, reap_node);
897 906
898 static void init_reap_node(int cpu) 907 static void init_reap_node(int cpu)
899 { 908 {
900 int node; 909 int node;
901 910
902 node = next_node(cpu_to_node(cpu), node_online_map); 911 node = next_node(cpu_to_node(cpu), node_online_map);
903 if (node == MAX_NUMNODES) 912 if (node == MAX_NUMNODES)
904 node = first_node(node_online_map); 913 node = first_node(node_online_map);
905 914
906 per_cpu(reap_node, cpu) = node; 915 per_cpu(reap_node, cpu) = node;
907 } 916 }
908 917
909 static void next_reap_node(void) 918 static void next_reap_node(void)
910 { 919 {
911 int node = __get_cpu_var(reap_node); 920 int node = __get_cpu_var(reap_node);
912 921
913 /* 922 /*
914 * Also drain per cpu pages on remote zones 923 * Also drain per cpu pages on remote zones
915 */ 924 */
916 if (node != numa_node_id()) 925 if (node != numa_node_id())
917 drain_node_pages(node); 926 drain_node_pages(node);
918 927
919 node = next_node(node, node_online_map); 928 node = next_node(node, node_online_map);
920 if (unlikely(node >= MAX_NUMNODES)) 929 if (unlikely(node >= MAX_NUMNODES))
921 node = first_node(node_online_map); 930 node = first_node(node_online_map);
922 __get_cpu_var(reap_node) = node; 931 __get_cpu_var(reap_node) = node;
923 } 932 }
924 933
925 #else 934 #else
926 #define init_reap_node(cpu) do { } while (0) 935 #define init_reap_node(cpu) do { } while (0)
927 #define next_reap_node(void) do { } while (0) 936 #define next_reap_node(void) do { } while (0)
928 #endif 937 #endif
929 938
930 /* 939 /*
931 * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz 940 * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
932 * via the workqueue/eventd. 941 * via the workqueue/eventd.
933 * Add the CPU number into the expiration time to minimize the possibility of 942 * Add the CPU number into the expiration time to minimize the possibility of
934 * the CPUs getting into lockstep and contending for the global cache chain 943 * the CPUs getting into lockstep and contending for the global cache chain
935 * lock. 944 * lock.
936 */ 945 */
937 static void __devinit start_cpu_timer(int cpu) 946 static void __devinit start_cpu_timer(int cpu)
938 { 947 {
939 struct delayed_work *reap_work = &per_cpu(reap_work, cpu); 948 struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
940 949
941 /* 950 /*
942 * When this gets called from do_initcalls via cpucache_init(), 951 * When this gets called from do_initcalls via cpucache_init(),
943 * init_workqueues() has already run, so keventd will be set up 952 * init_workqueues() has already run, so keventd will be set up
944 * at that time. 953 * at that time.
945 */ 954 */
946 if (keventd_up() && reap_work->work.func == NULL) { 955 if (keventd_up() && reap_work->work.func == NULL) {
947 init_reap_node(cpu); 956 init_reap_node(cpu);
948 INIT_DELAYED_WORK(reap_work, cache_reap); 957 INIT_DELAYED_WORK(reap_work, cache_reap);
949 schedule_delayed_work_on(cpu, reap_work, 958 schedule_delayed_work_on(cpu, reap_work,
950 __round_jiffies_relative(HZ, cpu)); 959 __round_jiffies_relative(HZ, cpu));
951 } 960 }
952 } 961 }
953 962
954 static struct array_cache *alloc_arraycache(int node, int entries, 963 static struct array_cache *alloc_arraycache(int node, int entries,
955 int batchcount) 964 int batchcount)
956 { 965 {
957 int memsize = sizeof(void *) * entries + sizeof(struct array_cache); 966 int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
958 struct array_cache *nc = NULL; 967 struct array_cache *nc = NULL;
959 968
960 nc = kmalloc_node(memsize, GFP_KERNEL, node); 969 nc = kmalloc_node(memsize, GFP_KERNEL, node);
961 if (nc) { 970 if (nc) {
962 nc->avail = 0; 971 nc->avail = 0;
963 nc->limit = entries; 972 nc->limit = entries;
964 nc->batchcount = batchcount; 973 nc->batchcount = batchcount;
965 nc->touched = 0; 974 nc->touched = 0;
966 spin_lock_init(&nc->lock); 975 spin_lock_init(&nc->lock);
967 } 976 }
968 return nc; 977 return nc;
969 } 978 }
970 979
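
The single kmalloc_node() above sizes the allocation as header plus pointer slots; a layout sketch, assuming struct array_cache ends in a flexible entry[] array (which is how ac->entry[ac->avail++] is used elsewhere in this file):

        /*
         * One contiguous allocation of memsize bytes:
         *
         *   [ struct array_cache: avail, limit, batchcount, touched, lock ]
         *   [ entry[0] ... entry[entries - 1]   -- void * object pointers ]
         *
         * so indexing entry[] runs directly past the header.
         */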
971 /* 980 /*
972 * Transfer objects in one arraycache to another. 981 * Transfer objects in one arraycache to another.
973 * Locking must be handled by the caller. 982 * Locking must be handled by the caller.
974 * 983 *
975 * Return the number of entries transferred. 984 * Return the number of entries transferred.
976 */ 985 */
977 static int transfer_objects(struct array_cache *to, 986 static int transfer_objects(struct array_cache *to,
978 struct array_cache *from, unsigned int max) 987 struct array_cache *from, unsigned int max)
979 { 988 {
980 /* Figure out how many entries to transfer */ 989 /* Figure out how many entries to transfer */
981 int nr = min(min(from->avail, max), to->limit - to->avail); 990 int nr = min(min(from->avail, max), to->limit - to->avail);
982 991
983 if (!nr) 992 if (!nr)
984 return 0; 993 return 0;
985 994
986 memcpy(to->entry + to->avail, from->entry + from->avail -nr, 995 memcpy(to->entry + to->avail, from->entry + from->avail -nr,
987 sizeof(void *) *nr); 996 sizeof(void *) *nr);
988 997
989 from->avail -= nr; 998 from->avail -= nr;
990 to->avail += nr; 999 to->avail += nr;
991 to->touched = 1; 1000 to->touched = 1;
992 return nr; 1001 return nr;
993 } 1002 }
994 1003
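
A worked pass through the transfer bound above (numbers are illustrative only):

        /*
         * from->avail = 30, max = 12, to->limit = 120, to->avail = 115
         * nr = min(min(30, 12), 120 - 115) = 5
         * memcpy(to->entry + 115, from->entry + 25, 5 * sizeof(void *));
         * afterwards: from->avail = 25, to->avail = 120, to->touched = 1
         */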
995 #ifndef CONFIG_NUMA 1004 #ifndef CONFIG_NUMA
996 1005
997 #define drain_alien_cache(cachep, alien) do { } while (0) 1006 #define drain_alien_cache(cachep, alien) do { } while (0)
998 #define reap_alien(cachep, l3) do { } while (0) 1007 #define reap_alien(cachep, l3) do { } while (0)
999 1008
1000 static inline struct array_cache **alloc_alien_cache(int node, int limit) 1009 static inline struct array_cache **alloc_alien_cache(int node, int limit)
1001 { 1010 {
1002 return (struct array_cache **)BAD_ALIEN_MAGIC; 1011 return (struct array_cache **)BAD_ALIEN_MAGIC;
1003 } 1012 }
1004 1013
1005 static inline void free_alien_cache(struct array_cache **ac_ptr) 1014 static inline void free_alien_cache(struct array_cache **ac_ptr)
1006 { 1015 {
1007 } 1016 }
1008 1017
1009 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 1018 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1010 { 1019 {
1011 return 0; 1020 return 0;
1012 } 1021 }
1013 1022
1014 static inline void *alternate_node_alloc(struct kmem_cache *cachep, 1023 static inline void *alternate_node_alloc(struct kmem_cache *cachep,
1015 gfp_t flags) 1024 gfp_t flags)
1016 { 1025 {
1017 return NULL; 1026 return NULL;
1018 } 1027 }
1019 1028
1020 static inline void *____cache_alloc_node(struct kmem_cache *cachep, 1029 static inline void *____cache_alloc_node(struct kmem_cache *cachep,
1021 gfp_t flags, int nodeid) 1030 gfp_t flags, int nodeid)
1022 { 1031 {
1023 return NULL; 1032 return NULL;
1024 } 1033 }
1025 1034
1026 #else /* CONFIG_NUMA */ 1035 #else /* CONFIG_NUMA */
1027 1036
1028 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 1037 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
1029 static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 1038 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
1030 1039
1031 static struct array_cache **alloc_alien_cache(int node, int limit) 1040 static struct array_cache **alloc_alien_cache(int node, int limit)
1032 { 1041 {
1033 struct array_cache **ac_ptr; 1042 struct array_cache **ac_ptr;
1034 int memsize = sizeof(void *) * MAX_NUMNODES; 1043 int memsize = sizeof(void *) * MAX_NUMNODES;
1035 int i; 1044 int i;
1036 1045
1037 if (limit > 1) 1046 if (limit > 1)
1038 limit = 12; 1047 limit = 12;
1039 ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node); 1048 ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
1040 if (ac_ptr) { 1049 if (ac_ptr) {
1041 for_each_node(i) { 1050 for_each_node(i) {
1042 if (i == node || !node_online(i)) { 1051 if (i == node || !node_online(i)) {
1043 ac_ptr[i] = NULL; 1052 ac_ptr[i] = NULL;
1044 continue; 1053 continue;
1045 } 1054 }
1046 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); 1055 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
1047 if (!ac_ptr[i]) { 1056 if (!ac_ptr[i]) {
1048 for (i--; i >= 0; i--) 1057 for (i--; i >= 0; i--)
1049 kfree(ac_ptr[i]); 1058 kfree(ac_ptr[i]);
1050 kfree(ac_ptr); 1059 kfree(ac_ptr);
1051 return NULL; 1060 return NULL;
1052 } 1061 }
1053 } 1062 }
1054 } 1063 }
1055 return ac_ptr; 1064 return ac_ptr;
1056 } 1065 }
1057 1066
1058 static void free_alien_cache(struct array_cache **ac_ptr) 1067 static void free_alien_cache(struct array_cache **ac_ptr)
1059 { 1068 {
1060 int i; 1069 int i;
1061 1070
1062 if (!ac_ptr) 1071 if (!ac_ptr)
1063 return; 1072 return;
1064 for_each_node(i) 1073 for_each_node(i)
1065 kfree(ac_ptr[i]); 1074 kfree(ac_ptr[i]);
1066 kfree(ac_ptr); 1075 kfree(ac_ptr);
1067 } 1076 }
1068 1077
1069 static void __drain_alien_cache(struct kmem_cache *cachep, 1078 static void __drain_alien_cache(struct kmem_cache *cachep,
1070 struct array_cache *ac, int node) 1079 struct array_cache *ac, int node)
1071 { 1080 {
1072 struct kmem_list3 *rl3 = cachep->nodelists[node]; 1081 struct kmem_list3 *rl3 = cachep->nodelists[node];
1073 1082
1074 if (ac->avail) { 1083 if (ac->avail) {
1075 spin_lock(&rl3->list_lock); 1084 spin_lock(&rl3->list_lock);
1076 /* 1085 /*
1077 * Stuff objects into the remote node's shared array first. 1086 * Stuff objects into the remote node's shared array first.
1078 * That way we could avoid the overhead of putting the objects 1087 * That way we could avoid the overhead of putting the objects
1079 * into the free lists and getting them back later. 1088 * into the free lists and getting them back later.
1080 */ 1089 */
1081 if (rl3->shared) 1090 if (rl3->shared)
1082 transfer_objects(rl3->shared, ac, ac->limit); 1091 transfer_objects(rl3->shared, ac, ac->limit);
1083 1092
1084 free_block(cachep, ac->entry, ac->avail, node); 1093 free_block(cachep, ac->entry, ac->avail, node);
1085 ac->avail = 0; 1094 ac->avail = 0;
1086 spin_unlock(&rl3->list_lock); 1095 spin_unlock(&rl3->list_lock);
1087 } 1096 }
1088 } 1097 }
1089 1098
1090 /* 1099 /*
1091 * Called from cache_reap() to regularly drain alien caches round robin. 1100 * Called from cache_reap() to regularly drain alien caches round robin.
1092 */ 1101 */
1093 static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) 1102 static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1094 { 1103 {
1095 int node = __get_cpu_var(reap_node); 1104 int node = __get_cpu_var(reap_node);
1096 1105
1097 if (l3->alien) { 1106 if (l3->alien) {
1098 struct array_cache *ac = l3->alien[node]; 1107 struct array_cache *ac = l3->alien[node];
1099 1108
1100 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { 1109 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
1101 __drain_alien_cache(cachep, ac, node); 1110 __drain_alien_cache(cachep, ac, node);
1102 spin_unlock_irq(&ac->lock); 1111 spin_unlock_irq(&ac->lock);
1103 } 1112 }
1104 } 1113 }
1105 } 1114 }
1106 1115
1107 static void drain_alien_cache(struct kmem_cache *cachep, 1116 static void drain_alien_cache(struct kmem_cache *cachep,
1108 struct array_cache **alien) 1117 struct array_cache **alien)
1109 { 1118 {
1110 int i = 0; 1119 int i = 0;
1111 struct array_cache *ac; 1120 struct array_cache *ac;
1112 unsigned long flags; 1121 unsigned long flags;
1113 1122
1114 for_each_online_node(i) { 1123 for_each_online_node(i) {
1115 ac = alien[i]; 1124 ac = alien[i];
1116 if (ac) { 1125 if (ac) {
1117 spin_lock_irqsave(&ac->lock, flags); 1126 spin_lock_irqsave(&ac->lock, flags);
1118 __drain_alien_cache(cachep, ac, i); 1127 __drain_alien_cache(cachep, ac, i);
1119 spin_unlock_irqrestore(&ac->lock, flags); 1128 spin_unlock_irqrestore(&ac->lock, flags);
1120 } 1129 }
1121 } 1130 }
1122 } 1131 }
1123 1132
1124 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) 1133 static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1125 { 1134 {
1126 struct slab *slabp = virt_to_slab(objp); 1135 struct slab *slabp = virt_to_slab(objp);
1127 int nodeid = slabp->nodeid; 1136 int nodeid = slabp->nodeid;
1128 struct kmem_list3 *l3; 1137 struct kmem_list3 *l3;
1129 struct array_cache *alien = NULL; 1138 struct array_cache *alien = NULL;
1130 int node; 1139 int node;
1131 1140
1132 node = numa_node_id(); 1141 node = numa_node_id();
1133 1142
1134 /* 1143 /*
1135 * Make sure we are not freeing an object from another node to the array 1144 * Make sure we are not freeing an object from another node to the array
1136 * cache on this cpu. 1145 * cache on this cpu.
1137 */ 1146 */
1138 if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches)) 1147 if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches))
1139 return 0; 1148 return 0;
1140 1149
1141 l3 = cachep->nodelists[node]; 1150 l3 = cachep->nodelists[node];
1142 STATS_INC_NODEFREES(cachep); 1151 STATS_INC_NODEFREES(cachep);
1143 if (l3->alien && l3->alien[nodeid]) { 1152 if (l3->alien && l3->alien[nodeid]) {
1144 alien = l3->alien[nodeid]; 1153 alien = l3->alien[nodeid];
1145 spin_lock(&alien->lock); 1154 spin_lock(&alien->lock);
1146 if (unlikely(alien->avail == alien->limit)) { 1155 if (unlikely(alien->avail == alien->limit)) {
1147 STATS_INC_ACOVERFLOW(cachep); 1156 STATS_INC_ACOVERFLOW(cachep);
1148 __drain_alien_cache(cachep, alien, nodeid); 1157 __drain_alien_cache(cachep, alien, nodeid);
1149 } 1158 }
1150 alien->entry[alien->avail++] = objp; 1159 alien->entry[alien->avail++] = objp;
1151 spin_unlock(&alien->lock); 1160 spin_unlock(&alien->lock);
1152 } else { 1161 } else {
1153 spin_lock(&(cachep->nodelists[nodeid])->list_lock); 1162 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1154 free_block(cachep, &objp, 1, nodeid); 1163 free_block(cachep, &objp, 1, nodeid);
1155 spin_unlock(&(cachep->nodelists[nodeid])->list_lock); 1164 spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1156 } 1165 }
1157 return 1; 1166 return 1;
1158 } 1167 }
1159 #endif 1168 #endif
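
Tying the NUMA helpers above together, a hedged trace of the remote-free path (two-node system assumed; node numbers are illustrative):

        /*
         * A cpu on node 0 frees an object whose slab lives on node 1:
         *
         *   cache_free_alien():
         *     slabp->nodeid (1) != numa_node_id() (0), alien caches enabled
         *     lock node 0's l3->alien[1]
         *     if that alien array is already full:
         *       __drain_alien_cache() flushes it to node 1, preferring node 1's
         *       shared array via transfer_objects(), otherwise free_block()
         *     park objp in alien[1] and return 1 (the caller skips the local free)
         */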
1160 1169
1161 static int __cpuinit cpuup_callback(struct notifier_block *nfb, 1170 static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1162 unsigned long action, void *hcpu) 1171 unsigned long action, void *hcpu)
1163 { 1172 {
1164 long cpu = (long)hcpu; 1173 long cpu = (long)hcpu;
1165 struct kmem_cache *cachep; 1174 struct kmem_cache *cachep;
1166 struct kmem_list3 *l3 = NULL; 1175 struct kmem_list3 *l3 = NULL;
1167 int node = cpu_to_node(cpu); 1176 int node = cpu_to_node(cpu);
1168 int memsize = sizeof(struct kmem_list3); 1177 int memsize = sizeof(struct kmem_list3);
1169 1178
1170 switch (action) { 1179 switch (action) {
1171 case CPU_UP_PREPARE: 1180 case CPU_UP_PREPARE:
1172 mutex_lock(&cache_chain_mutex); 1181 mutex_lock(&cache_chain_mutex);
1173 /* 1182 /*
1174 * We need to do this right in the beginning since 1183 * We need to do this right in the beginning since
1175 * the alloc_arraycache() calls are going to use this list. 1184 * the alloc_arraycache() calls are going to use this list.
1176 * kmalloc_node allows us to add the slab to the right 1185 * kmalloc_node allows us to add the slab to the right
1177 * kmem_list3 and not this cpu's kmem_list3 1186 * kmem_list3 and not this cpu's kmem_list3
1178 */ 1187 */
1179 1188
1180 list_for_each_entry(cachep, &cache_chain, next) { 1189 list_for_each_entry(cachep, &cache_chain, next) {
1181 /* 1190 /*
1182 * Set up the size64 kmemlist for cpu before we can 1191 * Set up the size64 kmemlist for cpu before we can
1183 * begin anything. Make sure some other cpu on this 1192 * begin anything. Make sure some other cpu on this
1184 * node has not already allocated this 1193 * node has not already allocated this
1185 */ 1194 */
1186 if (!cachep->nodelists[node]) { 1195 if (!cachep->nodelists[node]) {
1187 l3 = kmalloc_node(memsize, GFP_KERNEL, node); 1196 l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1188 if (!l3) 1197 if (!l3)
1189 goto bad; 1198 goto bad;
1190 kmem_list3_init(l3); 1199 kmem_list3_init(l3);
1191 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 1200 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
1192 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 1201 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1193 1202
1194 /* 1203 /*
1195 * The l3s don't come and go as CPUs come and 1204 * The l3s don't come and go as CPUs come and
1196 * go. cache_chain_mutex is sufficient 1205 * go. cache_chain_mutex is sufficient
1197 * protection here. 1206 * protection here.
1198 */ 1207 */
1199 cachep->nodelists[node] = l3; 1208 cachep->nodelists[node] = l3;
1200 } 1209 }
1201 1210
1202 spin_lock_irq(&cachep->nodelists[node]->list_lock); 1211 spin_lock_irq(&cachep->nodelists[node]->list_lock);
1203 cachep->nodelists[node]->free_limit = 1212 cachep->nodelists[node]->free_limit =
1204 (1 + nr_cpus_node(node)) * 1213 (1 + nr_cpus_node(node)) *
1205 cachep->batchcount + cachep->num; 1214 cachep->batchcount + cachep->num;
1206 spin_unlock_irq(&cachep->nodelists[node]->list_lock); 1215 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1207 } 1216 }
1208 1217
1209 /* 1218 /*
1210 * Now we can go ahead with allocating the shared arrays and 1219 * Now we can go ahead with allocating the shared arrays and
1211 * array caches 1220 * array caches
1212 */ 1221 */
1213 list_for_each_entry(cachep, &cache_chain, next) { 1222 list_for_each_entry(cachep, &cache_chain, next) {
1214 struct array_cache *nc; 1223 struct array_cache *nc;
1215 struct array_cache *shared; 1224 struct array_cache *shared;
1216 struct array_cache **alien = NULL; 1225 struct array_cache **alien = NULL;
1217 1226
1218 nc = alloc_arraycache(node, cachep->limit, 1227 nc = alloc_arraycache(node, cachep->limit,
1219 cachep->batchcount); 1228 cachep->batchcount);
1220 if (!nc) 1229 if (!nc)
1221 goto bad; 1230 goto bad;
1222 shared = alloc_arraycache(node, 1231 shared = alloc_arraycache(node,
1223 cachep->shared * cachep->batchcount, 1232 cachep->shared * cachep->batchcount,
1224 0xbaadf00d); 1233 0xbaadf00d);
1225 if (!shared) 1234 if (!shared)
1226 goto bad; 1235 goto bad;
1227 1236
1228 if (use_alien_caches) { 1237 if (use_alien_caches) {
1229 alien = alloc_alien_cache(node, cachep->limit); 1238 alien = alloc_alien_cache(node, cachep->limit);
1230 if (!alien) 1239 if (!alien)
1231 goto bad; 1240 goto bad;
1232 } 1241 }
1233 cachep->array[cpu] = nc; 1242 cachep->array[cpu] = nc;
1234 l3 = cachep->nodelists[node]; 1243 l3 = cachep->nodelists[node];
1235 BUG_ON(!l3); 1244 BUG_ON(!l3);
1236 1245
1237 spin_lock_irq(&l3->list_lock); 1246 spin_lock_irq(&l3->list_lock);
1238 if (!l3->shared) { 1247 if (!l3->shared) {
1239 /* 1248 /*
1240 * We are serialised from CPU_DEAD or 1249 * We are serialised from CPU_DEAD or
1241 * CPU_UP_CANCELLED by the cpucontrol lock 1250 * CPU_UP_CANCELLED by the cpucontrol lock
1242 */ 1251 */
1243 l3->shared = shared; 1252 l3->shared = shared;
1244 shared = NULL; 1253 shared = NULL;
1245 } 1254 }
1246 #ifdef CONFIG_NUMA 1255 #ifdef CONFIG_NUMA
1247 if (!l3->alien) { 1256 if (!l3->alien) {
1248 l3->alien = alien; 1257 l3->alien = alien;
1249 alien = NULL; 1258 alien = NULL;
1250 } 1259 }
1251 #endif 1260 #endif
1252 spin_unlock_irq(&l3->list_lock); 1261 spin_unlock_irq(&l3->list_lock);
1253 kfree(shared); 1262 kfree(shared);
1254 free_alien_cache(alien); 1263 free_alien_cache(alien);
1255 } 1264 }
1256 break; 1265 break;
1257 case CPU_ONLINE: 1266 case CPU_ONLINE:
1258 mutex_unlock(&cache_chain_mutex); 1267 mutex_unlock(&cache_chain_mutex);
1259 start_cpu_timer(cpu); 1268 start_cpu_timer(cpu);
1260 break; 1269 break;
1261 #ifdef CONFIG_HOTPLUG_CPU 1270 #ifdef CONFIG_HOTPLUG_CPU
1262 case CPU_DOWN_PREPARE: 1271 case CPU_DOWN_PREPARE:
1263 mutex_lock(&cache_chain_mutex); 1272 mutex_lock(&cache_chain_mutex);
1264 break; 1273 break;
1265 case CPU_DOWN_FAILED: 1274 case CPU_DOWN_FAILED:
1266 mutex_unlock(&cache_chain_mutex); 1275 mutex_unlock(&cache_chain_mutex);
1267 break; 1276 break;
1268 case CPU_DEAD: 1277 case CPU_DEAD:
1269 /* 1278 /*
1270 * Even if all the cpus of a node are down, we don't free the 1279 * Even if all the cpus of a node are down, we don't free the
1271 * kmem_list3 of any cache. This is to avoid a race between 1280 * kmem_list3 of any cache. This is to avoid a race between
1272 * cpu_down and a kmalloc allocation from another cpu for 1281 * cpu_down and a kmalloc allocation from another cpu for
1273 * memory from the node of the cpu going down. The list3 1282 * memory from the node of the cpu going down. The list3
1274 * structure is usually allocated from kmem_cache_create() and 1283 * structure is usually allocated from kmem_cache_create() and
1275 * gets destroyed at kmem_cache_destroy(). 1284 * gets destroyed at kmem_cache_destroy().
1276 */ 1285 */
1277 /* fall thru */ 1286 /* fall thru */
1278 #endif 1287 #endif
1279 case CPU_UP_CANCELED: 1288 case CPU_UP_CANCELED:
1280 list_for_each_entry(cachep, &cache_chain, next) { 1289 list_for_each_entry(cachep, &cache_chain, next) {
1281 struct array_cache *nc; 1290 struct array_cache *nc;
1282 struct array_cache *shared; 1291 struct array_cache *shared;
1283 struct array_cache **alien; 1292 struct array_cache **alien;
1284 cpumask_t mask; 1293 cpumask_t mask;
1285 1294
1286 mask = node_to_cpumask(node); 1295 mask = node_to_cpumask(node);
1287 /* cpu is dead; no one can alloc from it. */ 1296 /* cpu is dead; no one can alloc from it. */
1288 nc = cachep->array[cpu]; 1297 nc = cachep->array[cpu];
1289 cachep->array[cpu] = NULL; 1298 cachep->array[cpu] = NULL;
1290 l3 = cachep->nodelists[node]; 1299 l3 = cachep->nodelists[node];
1291 1300
1292 if (!l3) 1301 if (!l3)
1293 goto free_array_cache; 1302 goto free_array_cache;
1294 1303
1295 spin_lock_irq(&l3->list_lock); 1304 spin_lock_irq(&l3->list_lock);
1296 1305
1297 /* Free limit for this kmem_list3 */ 1306 /* Free limit for this kmem_list3 */
1298 l3->free_limit -= cachep->batchcount; 1307 l3->free_limit -= cachep->batchcount;
1299 if (nc) 1308 if (nc)
1300 free_block(cachep, nc->entry, nc->avail, node); 1309 free_block(cachep, nc->entry, nc->avail, node);
1301 1310
1302 if (!cpus_empty(mask)) { 1311 if (!cpus_empty(mask)) {
1303 spin_unlock_irq(&l3->list_lock); 1312 spin_unlock_irq(&l3->list_lock);
1304 goto free_array_cache; 1313 goto free_array_cache;
1305 } 1314 }
1306 1315
1307 shared = l3->shared; 1316 shared = l3->shared;
1308 if (shared) { 1317 if (shared) {
1309 free_block(cachep, l3->shared->entry, 1318 free_block(cachep, l3->shared->entry,
1310 l3->shared->avail, node); 1319 l3->shared->avail, node);
1311 l3->shared = NULL; 1320 l3->shared = NULL;
1312 } 1321 }
1313 1322
1314 alien = l3->alien; 1323 alien = l3->alien;
1315 l3->alien = NULL; 1324 l3->alien = NULL;
1316 1325
1317 spin_unlock_irq(&l3->list_lock); 1326 spin_unlock_irq(&l3->list_lock);
1318 1327
1319 kfree(shared); 1328 kfree(shared);
1320 if (alien) { 1329 if (alien) {
1321 drain_alien_cache(cachep, alien); 1330 drain_alien_cache(cachep, alien);
1322 free_alien_cache(alien); 1331 free_alien_cache(alien);
1323 } 1332 }
1324 free_array_cache: 1333 free_array_cache:
1325 kfree(nc); 1334 kfree(nc);
1326 } 1335 }
1327 /* 1336 /*
1328 * In the previous loop, all the objects were freed to 1337 * In the previous loop, all the objects were freed to
1329 * the respective cache's slabs; now we can go ahead and 1338 * the respective cache's slabs; now we can go ahead and
1330 * shrink each nodelist to its limit. 1339 * shrink each nodelist to its limit.
1331 */ 1340 */
1332 list_for_each_entry(cachep, &cache_chain, next) { 1341 list_for_each_entry(cachep, &cache_chain, next) {
1333 l3 = cachep->nodelists[node]; 1342 l3 = cachep->nodelists[node];
1334 if (!l3) 1343 if (!l3)
1335 continue; 1344 continue;
1336 drain_freelist(cachep, l3, l3->free_objects); 1345 drain_freelist(cachep, l3, l3->free_objects);
1337 } 1346 }
1338 mutex_unlock(&cache_chain_mutex); 1347 mutex_unlock(&cache_chain_mutex);
1339 break; 1348 break;
1340 } 1349 }
1341 return NOTIFY_OK; 1350 return NOTIFY_OK;
1342 bad: 1351 bad:
1343 return NOTIFY_BAD; 1352 return NOTIFY_BAD;
1344 } 1353 }
1345 1354
1346 static struct notifier_block __cpuinitdata cpucache_notifier = { 1355 static struct notifier_block __cpuinitdata cpucache_notifier = {
1347 &cpuup_callback, NULL, 0 1356 &cpuup_callback, NULL, 0
1348 }; 1357 };
1349 1358
1350 /* 1359 /*
1351 * swap the static kmem_list3 with kmalloced memory 1360 * swap the static kmem_list3 with kmalloced memory
1352 */ 1361 */
1353 static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, 1362 static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1354 int nodeid) 1363 int nodeid)
1355 { 1364 {
1356 struct kmem_list3 *ptr; 1365 struct kmem_list3 *ptr;
1357 1366
1358 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); 1367 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
1359 BUG_ON(!ptr); 1368 BUG_ON(!ptr);
1360 1369
1361 local_irq_disable(); 1370 local_irq_disable();
1362 memcpy(ptr, list, sizeof(struct kmem_list3)); 1371 memcpy(ptr, list, sizeof(struct kmem_list3));
1363 /* 1372 /*
1364 * Do not assume that spinlocks can be initialized via memcpy: 1373 * Do not assume that spinlocks can be initialized via memcpy:
1365 */ 1374 */
1366 spin_lock_init(&ptr->list_lock); 1375 spin_lock_init(&ptr->list_lock);
1367 1376
1368 MAKE_ALL_LISTS(cachep, ptr, nodeid); 1377 MAKE_ALL_LISTS(cachep, ptr, nodeid);
1369 cachep->nodelists[nodeid] = ptr; 1378 cachep->nodelists[nodeid] = ptr;
1370 local_irq_enable(); 1379 local_irq_enable();
1371 } 1380 }
1372 1381
1373 /* 1382 /*
1374 * Initialisation. Called after the page allocator has been initialised and 1383 * Initialisation. Called after the page allocator has been initialised and
1375 * before smp_init(). 1384 * before smp_init().
1376 */ 1385 */
1377 void __init kmem_cache_init(void) 1386 void __init kmem_cache_init(void)
1378 { 1387 {
1379 size_t left_over; 1388 size_t left_over;
1380 struct cache_sizes *sizes; 1389 struct cache_sizes *sizes;
1381 struct cache_names *names; 1390 struct cache_names *names;
1382 int i; 1391 int i;
1383 int order; 1392 int order;
1384 int node; 1393 int node;
1385 1394
1386 for (i = 0; i < NUM_INIT_LISTS; i++) { 1395 for (i = 0; i < NUM_INIT_LISTS; i++) {
1387 kmem_list3_init(&initkmem_list3[i]); 1396 kmem_list3_init(&initkmem_list3[i]);
1388 if (i < MAX_NUMNODES) 1397 if (i < MAX_NUMNODES)
1389 cache_cache.nodelists[i] = NULL; 1398 cache_cache.nodelists[i] = NULL;
1390 } 1399 }
1391 1400
1392 /* 1401 /*
1393 * Fragmentation resistance on low memory - only use bigger 1402 * Fragmentation resistance on low memory - only use bigger
1394 * page orders on machines with more than 32MB of memory. 1403 * page orders on machines with more than 32MB of memory.
1395 */ 1404 */
1396 if (num_physpages > (32 << 20) >> PAGE_SHIFT) 1405 if (num_physpages > (32 << 20) >> PAGE_SHIFT)
1397 slab_break_gfp_order = BREAK_GFP_ORDER_HI; 1406 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
1398 1407
1399 /* Bootstrap is tricky, because several objects are allocated 1408 /* Bootstrap is tricky, because several objects are allocated
1400 * from caches that do not exist yet: 1409 * from caches that do not exist yet:
1401 * 1) initialize the cache_cache cache: it contains the struct 1410 * 1) initialize the cache_cache cache: it contains the struct
1402 * kmem_cache structures of all caches, except cache_cache itself: 1411 * kmem_cache structures of all caches, except cache_cache itself:
1403 * cache_cache is statically allocated. 1412 * cache_cache is statically allocated.
1404 * Initially an __init data area is used for the head array and the 1413 * Initially an __init data area is used for the head array and the
1405 * kmem_list3 structures, it's replaced with a kmalloc allocated 1414 * kmem_list3 structures, it's replaced with a kmalloc allocated
1406 * array at the end of the bootstrap. 1415 * array at the end of the bootstrap.
1407 * 2) Create the first kmalloc cache. 1416 * 2) Create the first kmalloc cache.
1408 * The struct kmem_cache for the new cache is allocated normally. 1417 * The struct kmem_cache for the new cache is allocated normally.
1409 * An __init data area is used for the head array. 1418 * An __init data area is used for the head array.
1410 * 3) Create the remaining kmalloc caches, with minimally sized 1419 * 3) Create the remaining kmalloc caches, with minimally sized
1411 * head arrays. 1420 * head arrays.
1412 * 4) Replace the __init data head arrays for cache_cache and the first 1421 * 4) Replace the __init data head arrays for cache_cache and the first
1413 * kmalloc cache with kmalloc allocated arrays. 1422 * kmalloc cache with kmalloc allocated arrays.
1414 * 5) Replace the __init data for kmem_list3 for cache_cache and 1423 * 5) Replace the __init data for kmem_list3 for cache_cache and
1415 * the other caches with kmalloc allocated memory. 1424 * the other caches with kmalloc allocated memory.
1416 * 6) Resize the head arrays of the kmalloc caches to their final sizes. 1425 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
1417 */ 1426 */
1418 1427
1419 node = numa_node_id(); 1428 node = numa_node_id();
1420 1429
1421 /* 1) create the cache_cache */ 1430 /* 1) create the cache_cache */
1422 INIT_LIST_HEAD(&cache_chain); 1431 INIT_LIST_HEAD(&cache_chain);
1423 list_add(&cache_cache.next, &cache_chain); 1432 list_add(&cache_cache.next, &cache_chain);
1424 cache_cache.colour_off = cache_line_size(); 1433 cache_cache.colour_off = cache_line_size();
1425 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1434 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1426 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE]; 1435 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE];
1427 1436
1428 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, 1437 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1429 cache_line_size()); 1438 cache_line_size());
1439 cache_cache.reciprocal_buffer_size =
1440 reciprocal_value(cache_cache.buffer_size);
1430 1441
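The two lines added above cache a precomputed reciprocal of buffer_size in the cache descriptor. As a sketch of how such a value turns a pointer-to-index conversion into a multiply (close in spirit to the obj_to_index() change made elsewhere in this patch; an illustration, not the exact hunk):

        static inline unsigned int obj_to_index_sketch(const struct kmem_cache *cache,
                                                       const struct slab *slabp, void *obj)
        {
                u32 offset = obj - slabp->s_mem;    /* byte offset within the slab */

                /* offset / cache->buffer_size, computed as one multiply and a shift */
                return reciprocal_divide(offset, cache->reciprocal_buffer_size);
        }
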
1431 for (order = 0; order < MAX_ORDER; order++) { 1442 for (order = 0; order < MAX_ORDER; order++) {
1432 cache_estimate(order, cache_cache.buffer_size, 1443 cache_estimate(order, cache_cache.buffer_size,
1433 cache_line_size(), 0, &left_over, &cache_cache.num); 1444 cache_line_size(), 0, &left_over, &cache_cache.num);
1434 if (cache_cache.num) 1445 if (cache_cache.num)
1435 break; 1446 break;
1436 } 1447 }
1437 BUG_ON(!cache_cache.num); 1448 BUG_ON(!cache_cache.num);
1438 cache_cache.gfporder = order; 1449 cache_cache.gfporder = order;
1439 cache_cache.colour = left_over / cache_cache.colour_off; 1450 cache_cache.colour = left_over / cache_cache.colour_off;
1440 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + 1451 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1441 sizeof(struct slab), cache_line_size()); 1452 sizeof(struct slab), cache_line_size());
1442 1453
1443 /* 2+3) create the kmalloc caches */ 1454 /* 2+3) create the kmalloc caches */
1444 sizes = malloc_sizes; 1455 sizes = malloc_sizes;
1445 names = cache_names; 1456 names = cache_names;
1446 1457
1447 /* 1458 /*
1448 * Initialize the caches that provide memory for the array cache and the 1459 * Initialize the caches that provide memory for the array cache and the
1449 * kmem_list3 structures first. Without this, further allocations will 1460 * kmem_list3 structures first. Without this, further allocations will
1450 * BUG(). 1461 * BUG().
1451 */ 1462 */
1452 1463
1453 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1464 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1454 sizes[INDEX_AC].cs_size, 1465 sizes[INDEX_AC].cs_size,
1455 ARCH_KMALLOC_MINALIGN, 1466 ARCH_KMALLOC_MINALIGN,
1456 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1467 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1457 NULL, NULL); 1468 NULL, NULL);
1458 1469
1459 if (INDEX_AC != INDEX_L3) { 1470 if (INDEX_AC != INDEX_L3) {
1460 sizes[INDEX_L3].cs_cachep = 1471 sizes[INDEX_L3].cs_cachep =
1461 kmem_cache_create(names[INDEX_L3].name, 1472 kmem_cache_create(names[INDEX_L3].name,
1462 sizes[INDEX_L3].cs_size, 1473 sizes[INDEX_L3].cs_size,
1463 ARCH_KMALLOC_MINALIGN, 1474 ARCH_KMALLOC_MINALIGN,
1464 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1475 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1465 NULL, NULL); 1476 NULL, NULL);
1466 } 1477 }
1467 1478
1468 slab_early_init = 0; 1479 slab_early_init = 0;
1469 1480
1470 while (sizes->cs_size != ULONG_MAX) { 1481 while (sizes->cs_size != ULONG_MAX) {
1471 /* 1482 /*
1472 * For performance, all the general caches are L1 aligned. 1483 * For performance, all the general caches are L1 aligned.
1473 * This should be particularly beneficial on SMP boxes, as it 1484 * This should be particularly beneficial on SMP boxes, as it
1474 * eliminates "false sharing". 1485 * eliminates "false sharing".
1475 * Note for systems short on memory removing the alignment will 1486 * Note for systems short on memory removing the alignment will
1476 * allow tighter packing of the smaller caches. 1487 * allow tighter packing of the smaller caches.
1477 */ 1488 */
1478 if (!sizes->cs_cachep) { 1489 if (!sizes->cs_cachep) {
1479 sizes->cs_cachep = kmem_cache_create(names->name, 1490 sizes->cs_cachep = kmem_cache_create(names->name,
1480 sizes->cs_size, 1491 sizes->cs_size,
1481 ARCH_KMALLOC_MINALIGN, 1492 ARCH_KMALLOC_MINALIGN,
1482 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1493 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1483 NULL, NULL); 1494 NULL, NULL);
1484 } 1495 }
1485 1496
1486 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1497 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1487 sizes->cs_size, 1498 sizes->cs_size,
1488 ARCH_KMALLOC_MINALIGN, 1499 ARCH_KMALLOC_MINALIGN,
1489 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| 1500 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1490 SLAB_PANIC, 1501 SLAB_PANIC,
1491 NULL, NULL); 1502 NULL, NULL);
1492 sizes++; 1503 sizes++;
1493 names++; 1504 names++;
1494 } 1505 }
1495 /* 4) Replace the bootstrap head arrays */ 1506 /* 4) Replace the bootstrap head arrays */
1496 { 1507 {
1497 struct array_cache *ptr; 1508 struct array_cache *ptr;
1498 1509
1499 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1510 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1500 1511
1501 local_irq_disable(); 1512 local_irq_disable();
1502 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); 1513 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
1503 memcpy(ptr, cpu_cache_get(&cache_cache), 1514 memcpy(ptr, cpu_cache_get(&cache_cache),
1504 sizeof(struct arraycache_init)); 1515 sizeof(struct arraycache_init));
1505 /* 1516 /*
1506 * Do not assume that spinlocks can be initialized via memcpy: 1517 * Do not assume that spinlocks can be initialized via memcpy:
1507 */ 1518 */
1508 spin_lock_init(&ptr->lock); 1519 spin_lock_init(&ptr->lock);
1509 1520
1510 cache_cache.array[smp_processor_id()] = ptr; 1521 cache_cache.array[smp_processor_id()] = ptr;
1511 local_irq_enable(); 1522 local_irq_enable();
1512 1523
1513 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1524 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1514 1525
1515 local_irq_disable(); 1526 local_irq_disable();
1516 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) 1527 BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
1517 != &initarray_generic.cache); 1528 != &initarray_generic.cache);
1518 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), 1529 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
1519 sizeof(struct arraycache_init)); 1530 sizeof(struct arraycache_init));
1520 /* 1531 /*
1521 * Do not assume that spinlocks can be initialized via memcpy: 1532 * Do not assume that spinlocks can be initialized via memcpy:
1522 */ 1533 */
1523 spin_lock_init(&ptr->lock); 1534 spin_lock_init(&ptr->lock);
1524 1535
1525 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1536 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1526 ptr; 1537 ptr;
1527 local_irq_enable(); 1538 local_irq_enable();
1528 } 1539 }
1529 /* 5) Replace the bootstrap kmem_list3's */ 1540 /* 5) Replace the bootstrap kmem_list3's */
1530 { 1541 {
1531 int nid; 1542 int nid;
1532 1543
1533 /* Replace the static kmem_list3 structures for the boot cpu */ 1544 /* Replace the static kmem_list3 structures for the boot cpu */
1534 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); 1545 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
1535 1546
1536 for_each_online_node(nid) { 1547 for_each_online_node(nid) {
1537 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1548 init_list(malloc_sizes[INDEX_AC].cs_cachep,
1538 &initkmem_list3[SIZE_AC + nid], nid); 1549 &initkmem_list3[SIZE_AC + nid], nid);
1539 1550
1540 if (INDEX_AC != INDEX_L3) { 1551 if (INDEX_AC != INDEX_L3) {
1541 init_list(malloc_sizes[INDEX_L3].cs_cachep, 1552 init_list(malloc_sizes[INDEX_L3].cs_cachep,
1542 &initkmem_list3[SIZE_L3 + nid], nid); 1553 &initkmem_list3[SIZE_L3 + nid], nid);
1543 } 1554 }
1544 } 1555 }
1545 } 1556 }
1546 1557
1547 /* 6) resize the head arrays to their final sizes */ 1558 /* 6) resize the head arrays to their final sizes */
1548 { 1559 {
1549 struct kmem_cache *cachep; 1560 struct kmem_cache *cachep;
1550 mutex_lock(&cache_chain_mutex); 1561 mutex_lock(&cache_chain_mutex);
1551 list_for_each_entry(cachep, &cache_chain, next) 1562 list_for_each_entry(cachep, &cache_chain, next)
1552 if (enable_cpucache(cachep)) 1563 if (enable_cpucache(cachep))
1553 BUG(); 1564 BUG();
1554 mutex_unlock(&cache_chain_mutex); 1565 mutex_unlock(&cache_chain_mutex);
1555 } 1566 }
1556 1567
1557 /* Annotate slab for lockdep -- annotate the malloc caches */ 1568 /* Annotate slab for lockdep -- annotate the malloc caches */
1558 init_lock_keys(); 1569 init_lock_keys();
1559 1570
1560 1571
1561 /* Done! */ 1572 /* Done! */
1562 g_cpucache_up = FULL; 1573 g_cpucache_up = FULL;
1563 1574
1564 /* 1575 /*
1565 * Register a cpu startup notifier callback that initializes 1576 * Register a cpu startup notifier callback that initializes
1566 * cpu_cache_get for all new cpus 1577 * cpu_cache_get for all new cpus
1567 */ 1578 */
1568 register_cpu_notifier(&cpucache_notifier); 1579 register_cpu_notifier(&cpucache_notifier);
1569 1580
1570 /* 1581 /*
1571 * The reap timers are started later, with a module init call: That part 1582 * The reap timers are started later, with a module init call: That part
1572 * of the kernel is not yet operational. 1583 * of the kernel is not yet operational.
1573 */ 1584 */
1574 } 1585 }
1575 1586
1576 static int __init cpucache_init(void) 1587 static int __init cpucache_init(void)
1577 { 1588 {
1578 int cpu; 1589 int cpu;
1579 1590
1580 /* 1591 /*
1581 * Register the timers that return unneeded pages to the page allocator 1592 * Register the timers that return unneeded pages to the page allocator
1582 */ 1593 */
1583 for_each_online_cpu(cpu) 1594 for_each_online_cpu(cpu)
1584 start_cpu_timer(cpu); 1595 start_cpu_timer(cpu);
1585 return 0; 1596 return 0;
1586 } 1597 }
1587 __initcall(cpucache_init); 1598 __initcall(cpucache_init);
1588 1599
1589 /* 1600 /*
1590 * Interface to system's page allocator. No need to hold the cache-lock. 1601 * Interface to system's page allocator. No need to hold the cache-lock.
1591 * 1602 *
1592 * If we requested dmaable memory, we will get it. Even if we 1603 * If we requested dmaable memory, we will get it. Even if we
1593 * did not request dmaable memory, we might get it, but that 1604 * did not request dmaable memory, we might get it, but that
1594 * would be relatively rare and ignorable. 1605 * would be relatively rare and ignorable.
1595 */ 1606 */
1596 static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) 1607 static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1597 { 1608 {
1598 struct page *page; 1609 struct page *page;
1599 int nr_pages; 1610 int nr_pages;
1600 int i; 1611 int i;
1601 1612
1602 #ifndef CONFIG_MMU 1613 #ifndef CONFIG_MMU
1603 /* 1614 /*
1604 * Nommu uses slabs for process anonymous memory allocations, and thus 1615 * Nommu uses slabs for process anonymous memory allocations, and thus
1605 * requires __GFP_COMP to properly refcount higher order allocations 1616 * requires __GFP_COMP to properly refcount higher order allocations
1606 */ 1617 */
1607 flags |= __GFP_COMP; 1618 flags |= __GFP_COMP;
1608 #endif 1619 #endif
1609 1620
1610 flags |= cachep->gfpflags; 1621 flags |= cachep->gfpflags;
1611 1622
1612 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1623 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1613 if (!page) 1624 if (!page)
1614 return NULL; 1625 return NULL;
1615 1626
1616 nr_pages = (1 << cachep->gfporder); 1627 nr_pages = (1 << cachep->gfporder);
1617 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1628 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1618 add_zone_page_state(page_zone(page), 1629 add_zone_page_state(page_zone(page),
1619 NR_SLAB_RECLAIMABLE, nr_pages); 1630 NR_SLAB_RECLAIMABLE, nr_pages);
1620 else 1631 else
1621 add_zone_page_state(page_zone(page), 1632 add_zone_page_state(page_zone(page),
1622 NR_SLAB_UNRECLAIMABLE, nr_pages); 1633 NR_SLAB_UNRECLAIMABLE, nr_pages);
1623 for (i = 0; i < nr_pages; i++) 1634 for (i = 0; i < nr_pages; i++)
1624 __SetPageSlab(page + i); 1635 __SetPageSlab(page + i);
1625 return page_address(page); 1636 return page_address(page);
1626 } 1637 }
1627 1638
1628 /* 1639 /*
1629 * Interface to system's page release. 1640 * Interface to system's page release.
1630 */ 1641 */
1631 static void kmem_freepages(struct kmem_cache *cachep, void *addr) 1642 static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1632 { 1643 {
1633 unsigned long i = (1 << cachep->gfporder); 1644 unsigned long i = (1 << cachep->gfporder);
1634 struct page *page = virt_to_page(addr); 1645 struct page *page = virt_to_page(addr);
1635 const unsigned long nr_freed = i; 1646 const unsigned long nr_freed = i;
1636 1647
1637 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1648 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1638 sub_zone_page_state(page_zone(page), 1649 sub_zone_page_state(page_zone(page),
1639 NR_SLAB_RECLAIMABLE, nr_freed); 1650 NR_SLAB_RECLAIMABLE, nr_freed);
1640 else 1651 else
1641 sub_zone_page_state(page_zone(page), 1652 sub_zone_page_state(page_zone(page),
1642 NR_SLAB_UNRECLAIMABLE, nr_freed); 1653 NR_SLAB_UNRECLAIMABLE, nr_freed);
1643 while (i--) { 1654 while (i--) {
1644 BUG_ON(!PageSlab(page)); 1655 BUG_ON(!PageSlab(page));
1645 __ClearPageSlab(page); 1656 __ClearPageSlab(page);
1646 page++; 1657 page++;
1647 } 1658 }
1648 if (current->reclaim_state) 1659 if (current->reclaim_state)
1649 current->reclaim_state->reclaimed_slab += nr_freed; 1660 current->reclaim_state->reclaimed_slab += nr_freed;
1650 free_pages((unsigned long)addr, cachep->gfporder); 1661 free_pages((unsigned long)addr, cachep->gfporder);
1651 } 1662 }
1652 1663
1653 static void kmem_rcu_free(struct rcu_head *head) 1664 static void kmem_rcu_free(struct rcu_head *head)
1654 { 1665 {
1655 struct slab_rcu *slab_rcu = (struct slab_rcu *)head; 1666 struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1656 struct kmem_cache *cachep = slab_rcu->cachep; 1667 struct kmem_cache *cachep = slab_rcu->cachep;
1657 1668
1658 kmem_freepages(cachep, slab_rcu->addr); 1669 kmem_freepages(cachep, slab_rcu->addr);
1659 if (OFF_SLAB(cachep)) 1670 if (OFF_SLAB(cachep))
1660 kmem_cache_free(cachep->slabp_cache, slab_rcu); 1671 kmem_cache_free(cachep->slabp_cache, slab_rcu);
1661 } 1672 }
1662 1673
1663 #if DEBUG 1674 #if DEBUG
1664 1675
1665 #ifdef CONFIG_DEBUG_PAGEALLOC 1676 #ifdef CONFIG_DEBUG_PAGEALLOC
1666 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1677 static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1667 unsigned long caller) 1678 unsigned long caller)
1668 { 1679 {
1669 int size = obj_size(cachep); 1680 int size = obj_size(cachep);
1670 1681
1671 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; 1682 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1672 1683
1673 if (size < 5 * sizeof(unsigned long)) 1684 if (size < 5 * sizeof(unsigned long))
1674 return; 1685 return;
1675 1686
1676 *addr++ = 0x12345678; 1687 *addr++ = 0x12345678;
1677 *addr++ = caller; 1688 *addr++ = caller;
1678 *addr++ = smp_processor_id(); 1689 *addr++ = smp_processor_id();
1679 size -= 3 * sizeof(unsigned long); 1690 size -= 3 * sizeof(unsigned long);
1680 { 1691 {
1681 unsigned long *sptr = &caller; 1692 unsigned long *sptr = &caller;
1682 unsigned long svalue; 1693 unsigned long svalue;
1683 1694
1684 while (!kstack_end(sptr)) { 1695 while (!kstack_end(sptr)) {
1685 svalue = *sptr++; 1696 svalue = *sptr++;
1686 if (kernel_text_address(svalue)) { 1697 if (kernel_text_address(svalue)) {
1687 *addr++ = svalue; 1698 *addr++ = svalue;
1688 size -= sizeof(unsigned long); 1699 size -= sizeof(unsigned long);
1689 if (size <= sizeof(unsigned long)) 1700 if (size <= sizeof(unsigned long))
1690 break; 1701 break;
1691 } 1702 }
1692 } 1703 }
1693 1704
1694 } 1705 }
1695 *addr++ = 0x87654321; 1706 *addr++ = 0x87654321;
1696 } 1707 }
1697 #endif 1708 #endif
1698 1709
1699 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 1710 static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1700 { 1711 {
1701 int size = obj_size(cachep); 1712 int size = obj_size(cachep);
1702 addr = &((char *)addr)[obj_offset(cachep)]; 1713 addr = &((char *)addr)[obj_offset(cachep)];
1703 1714
1704 memset(addr, val, size); 1715 memset(addr, val, size);
1705 *(unsigned char *)(addr + size - 1) = POISON_END; 1716 *(unsigned char *)(addr + size - 1) = POISON_END;
1706 } 1717 }
1707 1718
1708 static void dump_line(char *data, int offset, int limit) 1719 static void dump_line(char *data, int offset, int limit)
1709 { 1720 {
1710 int i; 1721 int i;
1711 unsigned char error = 0; 1722 unsigned char error = 0;
1712 int bad_count = 0; 1723 int bad_count = 0;
1713 1724
1714 printk(KERN_ERR "%03x:", offset); 1725 printk(KERN_ERR "%03x:", offset);
1715 for (i = 0; i < limit; i++) { 1726 for (i = 0; i < limit; i++) {
1716 if (data[offset + i] != POISON_FREE) { 1727 if (data[offset + i] != POISON_FREE) {
1717 error = data[offset + i]; 1728 error = data[offset + i];
1718 bad_count++; 1729 bad_count++;
1719 } 1730 }
1720 printk(" %02x", (unsigned char)data[offset + i]); 1731 printk(" %02x", (unsigned char)data[offset + i]);
1721 } 1732 }
1722 printk("\n"); 1733 printk("\n");
1723 1734
1724 if (bad_count == 1) { 1735 if (bad_count == 1) {
1725 error ^= POISON_FREE; 1736 error ^= POISON_FREE;
1726 if (!(error & (error - 1))) { 1737 if (!(error & (error - 1))) {
1727 printk(KERN_ERR "Single bit error detected. Probably " 1738 printk(KERN_ERR "Single bit error detected. Probably "
1728 "bad RAM.\n"); 1739 "bad RAM.\n");
1729 #ifdef CONFIG_X86 1740 #ifdef CONFIG_X86
1730 printk(KERN_ERR "Run memtest86+ or a similar memory " 1741 printk(KERN_ERR "Run memtest86+ or a similar memory "
1731 "test tool.\n"); 1742 "test tool.\n");
1732 #else 1743 #else
1733 printk(KERN_ERR "Run a memory test tool.\n"); 1744 printk(KERN_ERR "Run a memory test tool.\n");
1734 #endif 1745 #endif
1735 } 1746 }
1736 } 1747 }
1737 } 1748 }
1738 #endif 1749 #endif
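
The single-bit heuristic in dump_line() above works because xor-ing the bad byte with the poison value leaves a power of two when exactly one bit flipped. A worked case (POISON_FREE is 0x6b; the corrupted value is made up):

        /*
         * byte read          = 0x6f
         * error ^ 0x6b       = 0x04
         * 0x04 & (0x04 - 1)  = 0x00  -> exactly one bit differs, so the
         *                               "Single bit error ... bad RAM" hint fires
         */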
1739 1750
1740 #if DEBUG 1751 #if DEBUG
1741 1752
1742 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) 1753 static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1743 { 1754 {
1744 int i, size; 1755 int i, size;
1745 char *realobj; 1756 char *realobj;
1746 1757
1747 if (cachep->flags & SLAB_RED_ZONE) { 1758 if (cachep->flags & SLAB_RED_ZONE) {
1748 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 1759 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1749 *dbg_redzone1(cachep, objp), 1760 *dbg_redzone1(cachep, objp),
1750 *dbg_redzone2(cachep, objp)); 1761 *dbg_redzone2(cachep, objp));
1751 } 1762 }
1752 1763
1753 if (cachep->flags & SLAB_STORE_USER) { 1764 if (cachep->flags & SLAB_STORE_USER) {
1754 printk(KERN_ERR "Last user: [<%p>]", 1765 printk(KERN_ERR "Last user: [<%p>]",
1755 *dbg_userword(cachep, objp)); 1766 *dbg_userword(cachep, objp));
1756 print_symbol("(%s)", 1767 print_symbol("(%s)",
1757 (unsigned long)*dbg_userword(cachep, objp)); 1768 (unsigned long)*dbg_userword(cachep, objp));
1758 printk("\n"); 1769 printk("\n");
1759 } 1770 }
1760 realobj = (char *)objp + obj_offset(cachep); 1771 realobj = (char *)objp + obj_offset(cachep);
1761 size = obj_size(cachep); 1772 size = obj_size(cachep);
1762 for (i = 0; i < size && lines; i += 16, lines--) { 1773 for (i = 0; i < size && lines; i += 16, lines--) {
1763 int limit; 1774 int limit;
1764 limit = 16; 1775 limit = 16;
1765 if (i + limit > size) 1776 if (i + limit > size)
1766 limit = size - i; 1777 limit = size - i;
1767 dump_line(realobj, i, limit); 1778 dump_line(realobj, i, limit);
1768 } 1779 }
1769 } 1780 }
1770 1781
1771 static void check_poison_obj(struct kmem_cache *cachep, void *objp) 1782 static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1772 { 1783 {
1773 char *realobj; 1784 char *realobj;
1774 int size, i; 1785 int size, i;
1775 int lines = 0; 1786 int lines = 0;
1776 1787
1777 realobj = (char *)objp + obj_offset(cachep); 1788 realobj = (char *)objp + obj_offset(cachep);
1778 size = obj_size(cachep); 1789 size = obj_size(cachep);
1779 1790
1780 for (i = 0; i < size; i++) { 1791 for (i = 0; i < size; i++) {
1781 char exp = POISON_FREE; 1792 char exp = POISON_FREE;
1782 if (i == size - 1) 1793 if (i == size - 1)
1783 exp = POISON_END; 1794 exp = POISON_END;
1784 if (realobj[i] != exp) { 1795 if (realobj[i] != exp) {
1785 int limit; 1796 int limit;
1786 /* Mismatch ! */ 1797 /* Mismatch ! */
1787 /* Print header */ 1798 /* Print header */
1788 if (lines == 0) { 1799 if (lines == 0) {
1789 printk(KERN_ERR 1800 printk(KERN_ERR
1790 "Slab corruption: start=%p, len=%d\n", 1801 "Slab corruption: start=%p, len=%d\n",
1791 realobj, size); 1802 realobj, size);
1792 print_objinfo(cachep, objp, 0); 1803 print_objinfo(cachep, objp, 0);
1793 } 1804 }
1794 /* Hexdump the affected line */ 1805 /* Hexdump the affected line */
1795 i = (i / 16) * 16; 1806 i = (i / 16) * 16;
1796 limit = 16; 1807 limit = 16;
1797 if (i + limit > size) 1808 if (i + limit > size)
1798 limit = size - i; 1809 limit = size - i;
1799 dump_line(realobj, i, limit); 1810 dump_line(realobj, i, limit);
1800 i += 16; 1811 i += 16;
1801 lines++; 1812 lines++;
1802 /* Limit to 5 lines */ 1813 /* Limit to 5 lines */
1803 if (lines > 5) 1814 if (lines > 5)
1804 break; 1815 break;
1805 } 1816 }
1806 } 1817 }
1807 if (lines != 0) { 1818 if (lines != 0) {
1808 /* Print some data about the neighboring objects, if they 1819 /* Print some data about the neighboring objects, if they
1809 * exist: 1820 * exist:
1810 */ 1821 */
1811 struct slab *slabp = virt_to_slab(objp); 1822 struct slab *slabp = virt_to_slab(objp);
1812 unsigned int objnr; 1823 unsigned int objnr;
1813 1824
1814 objnr = obj_to_index(cachep, slabp, objp); 1825 objnr = obj_to_index(cachep, slabp, objp);
1815 if (objnr) { 1826 if (objnr) {
1816 objp = index_to_obj(cachep, slabp, objnr - 1); 1827 objp = index_to_obj(cachep, slabp, objnr - 1);
1817 realobj = (char *)objp + obj_offset(cachep); 1828 realobj = (char *)objp + obj_offset(cachep);
1818 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1829 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1819 realobj, size); 1830 realobj, size);
1820 print_objinfo(cachep, objp, 2); 1831 print_objinfo(cachep, objp, 2);
1821 } 1832 }
1822 if (objnr + 1 < cachep->num) { 1833 if (objnr + 1 < cachep->num) {
1823 objp = index_to_obj(cachep, slabp, objnr + 1); 1834 objp = index_to_obj(cachep, slabp, objnr + 1);
1824 realobj = (char *)objp + obj_offset(cachep); 1835 realobj = (char *)objp + obj_offset(cachep);
1825 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1836 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1826 realobj, size); 1837 realobj, size);
1827 print_objinfo(cachep, objp, 2); 1838 print_objinfo(cachep, objp, 2);
1828 } 1839 }
1829 } 1840 }
1830 } 1841 }
1831 #endif 1842 #endif
1832 1843
1833 #if DEBUG 1844 #if DEBUG
1834 /** 1845 /**
1835 * slab_destroy_objs - destroy a slab and its objects 1846 * slab_destroy_objs - destroy a slab and its objects
1836 * @cachep: cache pointer being destroyed 1847 * @cachep: cache pointer being destroyed
1837 * @slabp: slab pointer being destroyed 1848 * @slabp: slab pointer being destroyed
1838 * 1849 *
1839 * Call the registered destructor for each object in a slab that is being 1850 * Call the registered destructor for each object in a slab that is being
1840 * destroyed. 1851 * destroyed.
1841 */ 1852 */
1842 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) 1853 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1843 { 1854 {
1844 int i; 1855 int i;
1845 for (i = 0; i < cachep->num; i++) { 1856 for (i = 0; i < cachep->num; i++) {
1846 void *objp = index_to_obj(cachep, slabp, i); 1857 void *objp = index_to_obj(cachep, slabp, i);
1847 1858
1848 if (cachep->flags & SLAB_POISON) { 1859 if (cachep->flags & SLAB_POISON) {
1849 #ifdef CONFIG_DEBUG_PAGEALLOC 1860 #ifdef CONFIG_DEBUG_PAGEALLOC
1850 if (cachep->buffer_size % PAGE_SIZE == 0 && 1861 if (cachep->buffer_size % PAGE_SIZE == 0 &&
1851 OFF_SLAB(cachep)) 1862 OFF_SLAB(cachep))
1852 kernel_map_pages(virt_to_page(objp), 1863 kernel_map_pages(virt_to_page(objp),
1853 cachep->buffer_size / PAGE_SIZE, 1); 1864 cachep->buffer_size / PAGE_SIZE, 1);
1854 else 1865 else
1855 check_poison_obj(cachep, objp); 1866 check_poison_obj(cachep, objp);
1856 #else 1867 #else
1857 check_poison_obj(cachep, objp); 1868 check_poison_obj(cachep, objp);
1858 #endif 1869 #endif
1859 } 1870 }
1860 if (cachep->flags & SLAB_RED_ZONE) { 1871 if (cachep->flags & SLAB_RED_ZONE) {
1861 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1872 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1862 slab_error(cachep, "start of a freed object " 1873 slab_error(cachep, "start of a freed object "
1863 "was overwritten"); 1874 "was overwritten");
1864 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1875 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1865 slab_error(cachep, "end of a freed object " 1876 slab_error(cachep, "end of a freed object "
1866 "was overwritten"); 1877 "was overwritten");
1867 } 1878 }
1868 if (cachep->dtor && !(cachep->flags & SLAB_POISON)) 1879 if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1869 (cachep->dtor) (objp + obj_offset(cachep), cachep, 0); 1880 (cachep->dtor) (objp + obj_offset(cachep), cachep, 0);
1870 } 1881 }
1871 } 1882 }
1872 #else 1883 #else
1873 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) 1884 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1874 { 1885 {
1875 if (cachep->dtor) { 1886 if (cachep->dtor) {
1876 int i; 1887 int i;
1877 for (i = 0; i < cachep->num; i++) { 1888 for (i = 0; i < cachep->num; i++) {
1878 void *objp = index_to_obj(cachep, slabp, i); 1889 void *objp = index_to_obj(cachep, slabp, i);
1879 (cachep->dtor) (objp, cachep, 0); 1890 (cachep->dtor) (objp, cachep, 0);
1880 } 1891 }
1881 } 1892 }
1882 } 1893 }
1883 #endif 1894 #endif
1884 1895
1885 /** 1896 /**
1886 * slab_destroy - destroy and release all objects in a slab 1897 * slab_destroy - destroy and release all objects in a slab
1887 * @cachep: cache pointer being destroyed 1898 * @cachep: cache pointer being destroyed
1888 * @slabp: slab pointer being destroyed 1899 * @slabp: slab pointer being destroyed
1889 * 1900 *
1890 * Destroy all the objs in a slab, and release the mem back to the system. 1901 * Destroy all the objs in a slab, and release the mem back to the system.
1891 * Before calling the slab must have been unlinked from the cache. The 1902 * Before calling the slab must have been unlinked from the cache. The
1892 * cache-lock is not held/needed. 1903 * cache-lock is not held/needed.
1893 */ 1904 */
1894 static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) 1905 static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1895 { 1906 {
1896 void *addr = slabp->s_mem - slabp->colouroff; 1907 void *addr = slabp->s_mem - slabp->colouroff;
1897 1908
1898 slab_destroy_objs(cachep, slabp); 1909 slab_destroy_objs(cachep, slabp);
1899 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1910 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1900 struct slab_rcu *slab_rcu; 1911 struct slab_rcu *slab_rcu;
1901 1912
1902 slab_rcu = (struct slab_rcu *)slabp; 1913 slab_rcu = (struct slab_rcu *)slabp;
1903 slab_rcu->cachep = cachep; 1914 slab_rcu->cachep = cachep;
1904 slab_rcu->addr = addr; 1915 slab_rcu->addr = addr;
1905 call_rcu(&slab_rcu->head, kmem_rcu_free); 1916 call_rcu(&slab_rcu->head, kmem_rcu_free);
1906 } else { 1917 } else {
1907 kmem_freepages(cachep, addr); 1918 kmem_freepages(cachep, addr);
1908 if (OFF_SLAB(cachep)) 1919 if (OFF_SLAB(cachep))
1909 kmem_cache_free(cachep->slabp_cache, slabp); 1920 kmem_cache_free(cachep->slabp_cache, slabp);
1910 } 1921 }
1911 } 1922 }
1912 1923
1913 /* 1924 /*
1914 * For setting up all the kmem_list3s for caches whose buffer_size is the same 1925 * For setting up all the kmem_list3s for caches whose buffer_size is the same
1915 * as the size of kmem_list3. 1926 * as the size of kmem_list3.
1916 */ 1927 */
1917 static void set_up_list3s(struct kmem_cache *cachep, int index) 1928 static void set_up_list3s(struct kmem_cache *cachep, int index)
1918 { 1929 {
1919 int node; 1930 int node;
1920 1931
1921 for_each_online_node(node) { 1932 for_each_online_node(node) {
1922 cachep->nodelists[node] = &initkmem_list3[index + node]; 1933 cachep->nodelists[node] = &initkmem_list3[index + node];
1923 cachep->nodelists[node]->next_reap = jiffies + 1934 cachep->nodelists[node]->next_reap = jiffies +
1924 REAPTIMEOUT_LIST3 + 1935 REAPTIMEOUT_LIST3 +
1925 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 1936 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1926 } 1937 }
1927 } 1938 }
1928 1939
1929 static void __kmem_cache_destroy(struct kmem_cache *cachep) 1940 static void __kmem_cache_destroy(struct kmem_cache *cachep)
1930 { 1941 {
1931 int i; 1942 int i;
1932 struct kmem_list3 *l3; 1943 struct kmem_list3 *l3;
1933 1944
1934 for_each_online_cpu(i) 1945 for_each_online_cpu(i)
1935 kfree(cachep->array[i]); 1946 kfree(cachep->array[i]);
1936 1947
1937 /* NUMA: free the list3 structures */ 1948 /* NUMA: free the list3 structures */
1938 for_each_online_node(i) { 1949 for_each_online_node(i) {
1939 l3 = cachep->nodelists[i]; 1950 l3 = cachep->nodelists[i];
1940 if (l3) { 1951 if (l3) {
1941 kfree(l3->shared); 1952 kfree(l3->shared);
1942 free_alien_cache(l3->alien); 1953 free_alien_cache(l3->alien);
1943 kfree(l3); 1954 kfree(l3);
1944 } 1955 }
1945 } 1956 }
1946 kmem_cache_free(&cache_cache, cachep); 1957 kmem_cache_free(&cache_cache, cachep);
1947 } 1958 }
1948 1959
1949 1960
1950 /** 1961 /**
1951 * calculate_slab_order - calculate size (page order) of slabs 1962 * calculate_slab_order - calculate size (page order) of slabs
1952 * @cachep: pointer to the cache that is being created 1963 * @cachep: pointer to the cache that is being created
1953 * @size: size of objects to be created in this cache. 1964 * @size: size of objects to be created in this cache.
1954 * @align: required alignment for the objects. 1965 * @align: required alignment for the objects.
1955 * @flags: slab allocation flags 1966 * @flags: slab allocation flags
1956 * 1967 *
1957 * Also calculates the number of objects per slab. 1968 * Also calculates the number of objects per slab.
1958 * 1969 *
1959 * This could be made much more intelligent. For now, try to avoid using 1970 * This could be made much more intelligent. For now, try to avoid using
1960 * high order pages for slabs. When the gfp() functions are more friendly 1971 * high order pages for slabs. When the gfp() functions are more friendly
1961 * towards high-order requests, this should be changed. 1972 * towards high-order requests, this should be changed.
1962 */ 1973 */
1963 static size_t calculate_slab_order(struct kmem_cache *cachep, 1974 static size_t calculate_slab_order(struct kmem_cache *cachep,
1964 size_t size, size_t align, unsigned long flags) 1975 size_t size, size_t align, unsigned long flags)
1965 { 1976 {
1966 unsigned long offslab_limit; 1977 unsigned long offslab_limit;
1967 size_t left_over = 0; 1978 size_t left_over = 0;
1968 int gfporder; 1979 int gfporder;
1969 1980
1970 for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) { 1981 for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
1971 unsigned int num; 1982 unsigned int num;
1972 size_t remainder; 1983 size_t remainder;
1973 1984
1974 cache_estimate(gfporder, size, align, flags, &remainder, &num); 1985 cache_estimate(gfporder, size, align, flags, &remainder, &num);
1975 if (!num) 1986 if (!num)
1976 continue; 1987 continue;
1977 1988
1978 if (flags & CFLGS_OFF_SLAB) { 1989 if (flags & CFLGS_OFF_SLAB) {
1979 /* 1990 /*
1980 * Max number of objs-per-slab for caches which 1991 * Max number of objs-per-slab for caches which
1981 * use off-slab slabs. Needed to avoid a possible 1992 * use off-slab slabs. Needed to avoid a possible
1982 * looping condition in cache_grow(). 1993 * looping condition in cache_grow().
1983 */ 1994 */
1984 offslab_limit = size - sizeof(struct slab); 1995 offslab_limit = size - sizeof(struct slab);
1985 offslab_limit /= sizeof(kmem_bufctl_t); 1996 offslab_limit /= sizeof(kmem_bufctl_t);
1986 1997
1987 if (num > offslab_limit) 1998 if (num > offslab_limit)
1988 break; 1999 break;
1989 } 2000 }
1990 2001
1991 /* Found something acceptable - save it away */ 2002 /* Found something acceptable - save it away */
1992 cachep->num = num; 2003 cachep->num = num;
1993 cachep->gfporder = gfporder; 2004 cachep->gfporder = gfporder;
1994 left_over = remainder; 2005 left_over = remainder;
1995 2006
1996 /* 2007 /*
1997 * A VFS-reclaimable slab tends to have most allocations 2008 * A VFS-reclaimable slab tends to have most allocations
1998 * as GFP_NOFS and we really don't want to have to be allocating 2009 * as GFP_NOFS and we really don't want to have to be allocating
1999 * higher-order pages when we are unable to shrink dcache. 2010 * higher-order pages when we are unable to shrink dcache.
2000 */ 2011 */
2001 if (flags & SLAB_RECLAIM_ACCOUNT) 2012 if (flags & SLAB_RECLAIM_ACCOUNT)
2002 break; 2013 break;
2003 2014
2004 /* 2015 /*
2005 * Large number of objects is good, but very large slabs are 2016 * Large number of objects is good, but very large slabs are
2006 * currently bad for the gfp()s. 2017 * currently bad for the gfp()s.
2007 */ 2018 */
2008 if (gfporder >= slab_break_gfp_order) 2019 if (gfporder >= slab_break_gfp_order)
2009 break; 2020 break;
2010 2021
2011 /* 2022 /*
2012 * Acceptable internal fragmentation? 2023 * Acceptable internal fragmentation?
2013 */ 2024 */
2014 if (left_over * 8 <= (PAGE_SIZE << gfporder)) 2025 if (left_over * 8 <= (PAGE_SIZE << gfporder))
2015 break; 2026 break;
2016 } 2027 }
2017 return left_over; 2028 return left_over;
2018 } 2029 }
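
calculate_slab_order() above walks the page orders from small to large and accepts the first order that wastes at most one eighth of the slab (left_over * 8 <= PAGE_SIZE << gfporder), unless the SLAB_RECLAIM_ACCOUNT or slab_break_gfp_order checks stop it earlier. Below is a simplified, standalone model of that acceptance test; it ignores the slab-management overhead and alignment that cache_estimate() accounts for, and the object size is made up:

#include <stdio.h>

int main(void)
{
	const size_t page_size = 4096;
	const size_t obj_size = 700;        /* hypothetical object size */
	int order;

	for (order = 0; order <= 3; order++) {
		size_t slab_bytes = page_size << order;
		size_t num = slab_bytes / obj_size;
		size_t left_over = slab_bytes - num * obj_size;

		printf("order %d: %zu objs, %zu bytes left over%s\n",
		       order, num, left_over,
		       left_over * 8 <= slab_bytes ? " (acceptable)" : "");
		if (left_over * 8 <= slab_bytes)
			break;              /* same fragmentation cutoff as above */
	}
	return 0;
}
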
2019 2030
2020 static int setup_cpu_cache(struct kmem_cache *cachep) 2031 static int setup_cpu_cache(struct kmem_cache *cachep)
2021 { 2032 {
2022 if (g_cpucache_up == FULL) 2033 if (g_cpucache_up == FULL)
2023 return enable_cpucache(cachep); 2034 return enable_cpucache(cachep);
2024 2035
2025 if (g_cpucache_up == NONE) { 2036 if (g_cpucache_up == NONE) {
2026 /* 2037 /*
2027 * Note: the first kmem_cache_create must create the cache 2038 * Note: the first kmem_cache_create must create the cache
2028 * that's used by kmalloc(24), otherwise the creation of 2039 * that's used by kmalloc(24), otherwise the creation of
2029 * further caches will BUG(). 2040 * further caches will BUG().
2030 */ 2041 */
2031 cachep->array[smp_processor_id()] = &initarray_generic.cache; 2042 cachep->array[smp_processor_id()] = &initarray_generic.cache;
2032 2043
2033 /* 2044 /*
2034 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is 2045 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
2035 * the first cache, then we need to set up all its list3s, 2046 * the first cache, then we need to set up all its list3s,
2036 * otherwise the creation of further caches will BUG(). 2047 * otherwise the creation of further caches will BUG().
2037 */ 2048 */
2038 set_up_list3s(cachep, SIZE_AC); 2049 set_up_list3s(cachep, SIZE_AC);
2039 if (INDEX_AC == INDEX_L3) 2050 if (INDEX_AC == INDEX_L3)
2040 g_cpucache_up = PARTIAL_L3; 2051 g_cpucache_up = PARTIAL_L3;
2041 else 2052 else
2042 g_cpucache_up = PARTIAL_AC; 2053 g_cpucache_up = PARTIAL_AC;
2043 } else { 2054 } else {
2044 cachep->array[smp_processor_id()] = 2055 cachep->array[smp_processor_id()] =
2045 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 2056 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
2046 2057
2047 if (g_cpucache_up == PARTIAL_AC) { 2058 if (g_cpucache_up == PARTIAL_AC) {
2048 set_up_list3s(cachep, SIZE_L3); 2059 set_up_list3s(cachep, SIZE_L3);
2049 g_cpucache_up = PARTIAL_L3; 2060 g_cpucache_up = PARTIAL_L3;
2050 } else { 2061 } else {
2051 int node; 2062 int node;
2052 for_each_online_node(node) { 2063 for_each_online_node(node) {
2053 cachep->nodelists[node] = 2064 cachep->nodelists[node] =
2054 kmalloc_node(sizeof(struct kmem_list3), 2065 kmalloc_node(sizeof(struct kmem_list3),
2055 GFP_KERNEL, node); 2066 GFP_KERNEL, node);
2056 BUG_ON(!cachep->nodelists[node]); 2067 BUG_ON(!cachep->nodelists[node]);
2057 kmem_list3_init(cachep->nodelists[node]); 2068 kmem_list3_init(cachep->nodelists[node]);
2058 } 2069 }
2059 } 2070 }
2060 } 2071 }
2061 cachep->nodelists[numa_node_id()]->next_reap = 2072 cachep->nodelists[numa_node_id()]->next_reap =
2062 jiffies + REAPTIMEOUT_LIST3 + 2073 jiffies + REAPTIMEOUT_LIST3 +
2063 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 2074 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2064 2075
2065 cpu_cache_get(cachep)->avail = 0; 2076 cpu_cache_get(cachep)->avail = 0;
2066 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 2077 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2067 cpu_cache_get(cachep)->batchcount = 1; 2078 cpu_cache_get(cachep)->batchcount = 1;
2068 cpu_cache_get(cachep)->touched = 0; 2079 cpu_cache_get(cachep)->touched = 0;
2069 cachep->batchcount = 1; 2080 cachep->batchcount = 1;
2070 cachep->limit = BOOT_CPUCACHE_ENTRIES; 2081 cachep->limit = BOOT_CPUCACHE_ENTRIES;
2071 return 0; 2082 return 0;
2072 } 2083 }
2073 2084
2074 /** 2085 /**
2075 * kmem_cache_create - Create a cache. 2086 * kmem_cache_create - Create a cache.
2076 * @name: A string which is used in /proc/slabinfo to identify this cache. 2087 * @name: A string which is used in /proc/slabinfo to identify this cache.
2077 * @size: The size of objects to be created in this cache. 2088 * @size: The size of objects to be created in this cache.
2078 * @align: The required alignment for the objects. 2089 * @align: The required alignment for the objects.
2079 * @flags: SLAB flags 2090 * @flags: SLAB flags
2080 * @ctor: A constructor for the objects. 2091 * @ctor: A constructor for the objects.
2081 * @dtor: A destructor for the objects. 2092 * @dtor: A destructor for the objects.
2082 * 2093 *
2083 * Returns a ptr to the cache on success, NULL on failure. 2094 * Returns a ptr to the cache on success, NULL on failure.
2084 * Cannot be called within an interrupt, but can be interrupted. 2095 * Cannot be called within an interrupt, but can be interrupted.
2085 * The @ctor is run when new pages are allocated by the cache 2096 * The @ctor is run when new pages are allocated by the cache
2086 * and the @dtor is run before the pages are handed back. 2097 * and the @dtor is run before the pages are handed back.
2087 * 2098 *
2088 * @name must be valid until the cache is destroyed. This implies that 2099 * @name must be valid until the cache is destroyed. This implies that
2089 * the module calling this has to destroy the cache before getting unloaded. 2100 * the module calling this has to destroy the cache before getting unloaded.
2090 * 2101 *
2091 * The flags are 2102 * The flags are
2092 * 2103 *
2093 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 2104 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
2094 * to catch references to uninitialised memory. 2105 * to catch references to uninitialised memory.
2095 * 2106 *
2096 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 2107 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
2097 * for buffer overruns. 2108 * for buffer overruns.
2098 * 2109 *
2099 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 2110 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
2100 * cacheline. This can be beneficial if you're counting cycles as closely 2111 * cacheline. This can be beneficial if you're counting cycles as closely
2101 * as davem. 2112 * as davem.
2102 */ 2113 */
2103 struct kmem_cache * 2114 struct kmem_cache *
2104 kmem_cache_create (const char *name, size_t size, size_t align, 2115 kmem_cache_create (const char *name, size_t size, size_t align,
2105 unsigned long flags, 2116 unsigned long flags,
2106 void (*ctor)(void*, struct kmem_cache *, unsigned long), 2117 void (*ctor)(void*, struct kmem_cache *, unsigned long),
2107 void (*dtor)(void*, struct kmem_cache *, unsigned long)) 2118 void (*dtor)(void*, struct kmem_cache *, unsigned long))
2108 { 2119 {
2109 size_t left_over, slab_size, ralign; 2120 size_t left_over, slab_size, ralign;
2110 struct kmem_cache *cachep = NULL, *pc; 2121 struct kmem_cache *cachep = NULL, *pc;
2111 2122
2112 /* 2123 /*
2113 * Sanity checks... these are all serious usage bugs. 2124 * Sanity checks... these are all serious usage bugs.
2114 */ 2125 */
2115 if (!name || in_interrupt() || (size < BYTES_PER_WORD) || 2126 if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2116 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { 2127 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
2117 printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, 2128 printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
2118 name); 2129 name);
2119 BUG(); 2130 BUG();
2120 } 2131 }
2121 2132
2122 /* 2133 /*
2123 * We use cache_chain_mutex to ensure a consistent view of 2134 * We use cache_chain_mutex to ensure a consistent view of
2124 * cpu_online_map as well. Please see cpuup_callback 2135 * cpu_online_map as well. Please see cpuup_callback
2125 */ 2136 */
2126 mutex_lock(&cache_chain_mutex); 2137 mutex_lock(&cache_chain_mutex);
2127 2138
2128 list_for_each_entry(pc, &cache_chain, next) { 2139 list_for_each_entry(pc, &cache_chain, next) {
2129 char tmp; 2140 char tmp;
2130 int res; 2141 int res;
2131 2142
2132 /* 2143 /*
2133 * This happens when the module gets unloaded and doesn't 2144 * This happens when the module gets unloaded and doesn't
2134 * destroy its slab cache and no-one else reuses the vmalloc 2145 * destroy its slab cache and no-one else reuses the vmalloc
2135 * area of the module. Print a warning. 2146 * area of the module. Print a warning.
2136 */ 2147 */
2137 res = probe_kernel_address(pc->name, tmp); 2148 res = probe_kernel_address(pc->name, tmp);
2138 if (res) { 2149 if (res) {
2139 printk("SLAB: cache with size %d has lost its name\n", 2150 printk("SLAB: cache with size %d has lost its name\n",
2140 pc->buffer_size); 2151 pc->buffer_size);
2141 continue; 2152 continue;
2142 } 2153 }
2143 2154
2144 if (!strcmp(pc->name, name)) { 2155 if (!strcmp(pc->name, name)) {
2145 printk("kmem_cache_create: duplicate cache %s\n", name); 2156 printk("kmem_cache_create: duplicate cache %s\n", name);
2146 dump_stack(); 2157 dump_stack();
2147 goto oops; 2158 goto oops;
2148 } 2159 }
2149 } 2160 }
2150 2161
2151 #if DEBUG 2162 #if DEBUG
2152 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 2163 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
2153 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { 2164 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
2154 /* No constructor, but initial state check requested */ 2165 /* No constructor, but initial state check requested */
2155 printk(KERN_ERR "%s: No con, but init state check " 2166 printk(KERN_ERR "%s: No con, but init state check "
2156 "requested - %s\n", __FUNCTION__, name); 2167 "requested - %s\n", __FUNCTION__, name);
2157 flags &= ~SLAB_DEBUG_INITIAL; 2168 flags &= ~SLAB_DEBUG_INITIAL;
2158 } 2169 }
2159 #if FORCED_DEBUG 2170 #if FORCED_DEBUG
2160 /* 2171 /*
2161 * Enable redzoning and last user accounting, except for caches with 2172 * Enable redzoning and last user accounting, except for caches with
2162 * large objects, if the increased size would increase the object size 2173 * large objects, if the increased size would increase the object size
2163 * above the next power of two: caches with object sizes just above a 2174 * above the next power of two: caches with object sizes just above a
2164 * power of two have a significant amount of internal fragmentation. 2175 * power of two have a significant amount of internal fragmentation.
2165 */ 2176 */
2166 if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) 2177 if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
2167 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 2178 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2168 if (!(flags & SLAB_DESTROY_BY_RCU)) 2179 if (!(flags & SLAB_DESTROY_BY_RCU))
2169 flags |= SLAB_POISON; 2180 flags |= SLAB_POISON;
2170 #endif 2181 #endif
2171 if (flags & SLAB_DESTROY_BY_RCU) 2182 if (flags & SLAB_DESTROY_BY_RCU)
2172 BUG_ON(flags & SLAB_POISON); 2183 BUG_ON(flags & SLAB_POISON);
2173 #endif 2184 #endif
2174 if (flags & SLAB_DESTROY_BY_RCU) 2185 if (flags & SLAB_DESTROY_BY_RCU)
2175 BUG_ON(dtor); 2186 BUG_ON(dtor);
2176 2187
2177 /* 2188 /*
2178 * Always checks flags, a caller might be expecting debug support which 2189 * Always checks flags, a caller might be expecting debug support which
2179 * isn't available. 2190 * isn't available.
2180 */ 2191 */
2181 BUG_ON(flags & ~CREATE_MASK); 2192 BUG_ON(flags & ~CREATE_MASK);
2182 2193
2183 /* 2194 /*
2184 * Check that size is in terms of words. This is needed to avoid 2195 * Check that size is in terms of words. This is needed to avoid
2185 * unaligned accesses for some archs when redzoning is used, and makes 2196 * unaligned accesses for some archs when redzoning is used, and makes
2186 * sure any on-slab bufctl's are also correctly aligned. 2197 * sure any on-slab bufctl's are also correctly aligned.
2187 */ 2198 */
2188 if (size & (BYTES_PER_WORD - 1)) { 2199 if (size & (BYTES_PER_WORD - 1)) {
2189 size += (BYTES_PER_WORD - 1); 2200 size += (BYTES_PER_WORD - 1);
2190 size &= ~(BYTES_PER_WORD - 1); 2201 size &= ~(BYTES_PER_WORD - 1);
2191 } 2202 }
2192 2203
2193 /* calculate the final buffer alignment: */ 2204 /* calculate the final buffer alignment: */
2194 2205
2195 /* 1) arch recommendation: can be overridden for debug */ 2206 /* 1) arch recommendation: can be overridden for debug */
2196 if (flags & SLAB_HWCACHE_ALIGN) { 2207 if (flags & SLAB_HWCACHE_ALIGN) {
2197 /* 2208 /*
2198 * Default alignment: as specified by the arch code. Except if 2209 * Default alignment: as specified by the arch code. Except if
2199 * an object is really small, then squeeze multiple objects into 2210 * an object is really small, then squeeze multiple objects into
2200 * one cacheline. 2211 * one cacheline.
2201 */ 2212 */
2202 ralign = cache_line_size(); 2213 ralign = cache_line_size();
2203 while (size <= ralign / 2) 2214 while (size <= ralign / 2)
2204 ralign /= 2; 2215 ralign /= 2;
2205 } else { 2216 } else {
2206 ralign = BYTES_PER_WORD; 2217 ralign = BYTES_PER_WORD;
2207 } 2218 }
2208 2219
2209 /* 2220 /*
2210 * Redzoning and user store require word alignment. Note this will be 2221 * Redzoning and user store require word alignment. Note this will be
2211 * overridden by architecture or caller mandated alignment if either 2222 * overridden by architecture or caller mandated alignment if either
2212 * is greater than BYTES_PER_WORD. 2223 * is greater than BYTES_PER_WORD.
2213 */ 2224 */
2214 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) 2225 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
2215 ralign = BYTES_PER_WORD; 2226 ralign = BYTES_PER_WORD;
2216 2227
2217 /* 2) arch mandated alignment */ 2228 /* 2) arch mandated alignment */
2218 if (ralign < ARCH_SLAB_MINALIGN) { 2229 if (ralign < ARCH_SLAB_MINALIGN) {
2219 ralign = ARCH_SLAB_MINALIGN; 2230 ralign = ARCH_SLAB_MINALIGN;
2220 } 2231 }
2221 /* 3) caller mandated alignment */ 2232 /* 3) caller mandated alignment */
2222 if (ralign < align) { 2233 if (ralign < align) {
2223 ralign = align; 2234 ralign = align;
2224 } 2235 }
2225 /* disable debug if necessary */ 2236 /* disable debug if necessary */
2226 if (ralign > BYTES_PER_WORD) 2237 if (ralign > BYTES_PER_WORD)
2227 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2238 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2228 /* 2239 /*
2229 * 4) Store it. 2240 * 4) Store it.
2230 */ 2241 */
2231 align = ralign; 2242 align = ralign;
2232 2243
2233 /* Get cache's description obj. */ 2244 /* Get cache's description obj. */
2234 cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); 2245 cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
2235 if (!cachep) 2246 if (!cachep)
2236 goto oops; 2247 goto oops;
2237 2248
2238 #if DEBUG 2249 #if DEBUG
2239 cachep->obj_size = size; 2250 cachep->obj_size = size;
2240 2251
2241 /* 2252 /*
2242 * Both debugging options require word-alignment which is calculated 2253 * Both debugging options require word-alignment which is calculated
2243 * into align above. 2254 * into align above.
2244 */ 2255 */
2245 if (flags & SLAB_RED_ZONE) { 2256 if (flags & SLAB_RED_ZONE) {
2246 /* add space for red zone words */ 2257 /* add space for red zone words */
2247 cachep->obj_offset += BYTES_PER_WORD; 2258 cachep->obj_offset += BYTES_PER_WORD;
2248 size += 2 * BYTES_PER_WORD; 2259 size += 2 * BYTES_PER_WORD;
2249 } 2260 }
2250 if (flags & SLAB_STORE_USER) { 2261 if (flags & SLAB_STORE_USER) {
2251 /* user store requires one word storage behind the end of 2262 /* user store requires one word storage behind the end of
2252 * the real object. 2263 * the real object.
2253 */ 2264 */
2254 size += BYTES_PER_WORD; 2265 size += BYTES_PER_WORD;
2255 } 2266 }
2256 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2267 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2257 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2268 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2258 && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { 2269 && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
2259 cachep->obj_offset += PAGE_SIZE - size; 2270 cachep->obj_offset += PAGE_SIZE - size;
2260 size = PAGE_SIZE; 2271 size = PAGE_SIZE;
2261 } 2272 }
2262 #endif 2273 #endif
2263 #endif 2274 #endif
2264 2275
2265 /* 2276 /*
2266 * Determine if the slab management is 'on' or 'off' slab. 2277 * Determine if the slab management is 'on' or 'off' slab.
2267 * (bootstrapping cannot cope with offslab caches so don't do 2278 * (bootstrapping cannot cope with offslab caches so don't do
2268 * it too early on.) 2279 * it too early on.)
2269 */ 2280 */
2270 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) 2281 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
2271 /* 2282 /*
2272 * Size is large, assume best to place the slab management obj 2283 * Size is large, assume best to place the slab management obj
2273 * off-slab (should allow better packing of objs). 2284 * off-slab (should allow better packing of objs).
2274 */ 2285 */
2275 flags |= CFLGS_OFF_SLAB; 2286 flags |= CFLGS_OFF_SLAB;
2276 2287
2277 size = ALIGN(size, align); 2288 size = ALIGN(size, align);
2278 2289
2279 left_over = calculate_slab_order(cachep, size, align, flags); 2290 left_over = calculate_slab_order(cachep, size, align, flags);
2280 2291
2281 if (!cachep->num) { 2292 if (!cachep->num) {
2282 printk("kmem_cache_create: couldn't create cache %s.\n", name); 2293 printk("kmem_cache_create: couldn't create cache %s.\n", name);
2283 kmem_cache_free(&cache_cache, cachep); 2294 kmem_cache_free(&cache_cache, cachep);
2284 cachep = NULL; 2295 cachep = NULL;
2285 goto oops; 2296 goto oops;
2286 } 2297 }
2287 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) 2298 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2288 + sizeof(struct slab), align); 2299 + sizeof(struct slab), align);
2289 2300
2290 /* 2301 /*
2291 * If the slab has been placed off-slab, and we have enough space then 2302 * If the slab has been placed off-slab, and we have enough space then
2292 * move it on-slab. This is at the expense of any extra colouring. 2303 * move it on-slab. This is at the expense of any extra colouring.
2293 */ 2304 */
2294 if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { 2305 if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
2295 flags &= ~CFLGS_OFF_SLAB; 2306 flags &= ~CFLGS_OFF_SLAB;
2296 left_over -= slab_size; 2307 left_over -= slab_size;
2297 } 2308 }
2298 2309
2299 if (flags & CFLGS_OFF_SLAB) { 2310 if (flags & CFLGS_OFF_SLAB) {
2300 /* really off slab. No need for manual alignment */ 2311 /* really off slab. No need for manual alignment */
2301 slab_size = 2312 slab_size =
2302 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); 2313 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
2303 } 2314 }
2304 2315
2305 cachep->colour_off = cache_line_size(); 2316 cachep->colour_off = cache_line_size();
2306 /* Offset must be a multiple of the alignment. */ 2317 /* Offset must be a multiple of the alignment. */
2307 if (cachep->colour_off < align) 2318 if (cachep->colour_off < align)
2308 cachep->colour_off = align; 2319 cachep->colour_off = align;
2309 cachep->colour = left_over / cachep->colour_off; 2320 cachep->colour = left_over / cachep->colour_off;
2310 cachep->slab_size = slab_size; 2321 cachep->slab_size = slab_size;
2311 cachep->flags = flags; 2322 cachep->flags = flags;
2312 cachep->gfpflags = 0; 2323 cachep->gfpflags = 0;
2313 if (flags & SLAB_CACHE_DMA) 2324 if (flags & SLAB_CACHE_DMA)
2314 cachep->gfpflags |= GFP_DMA; 2325 cachep->gfpflags |= GFP_DMA;
2315 cachep->buffer_size = size; 2326 cachep->buffer_size = size;
2327 cachep->reciprocal_buffer_size = reciprocal_value(size);
2316 2328
2317 if (flags & CFLGS_OFF_SLAB) { 2329 if (flags & CFLGS_OFF_SLAB) {
2318 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); 2330 cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
2319 /* 2331 /*
2320 * This is a possibility for one of the malloc_sizes caches. 2332 * This is a possibility for one of the malloc_sizes caches.
2321 * But since we go off slab only for object size greater than 2333 * But since we go off slab only for object size greater than
2322 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, 2334 * PAGE_SIZE/8, and malloc_sizes gets created in ascending order,
2323 * this should not happen at all. 2335 * this should not happen at all.
2324 * But leave a BUG_ON for some lucky dude. 2336 * But leave a BUG_ON for some lucky dude.
2325 */ 2337 */
2326 BUG_ON(!cachep->slabp_cache); 2338 BUG_ON(!cachep->slabp_cache);
2327 } 2339 }
2328 cachep->ctor = ctor; 2340 cachep->ctor = ctor;
2329 cachep->dtor = dtor; 2341 cachep->dtor = dtor;
2330 cachep->name = name; 2342 cachep->name = name;
2331 2343
2332 if (setup_cpu_cache(cachep)) { 2344 if (setup_cpu_cache(cachep)) {
2333 __kmem_cache_destroy(cachep); 2345 __kmem_cache_destroy(cachep);
2334 cachep = NULL; 2346 cachep = NULL;
2335 goto oops; 2347 goto oops;
2336 } 2348 }
2337 2349
2338 /* cache setup completed, link it into the list */ 2350 /* cache setup completed, link it into the list */
2339 list_add(&cachep->next, &cache_chain); 2351 list_add(&cachep->next, &cache_chain);
2340 oops: 2352 oops:
2341 if (!cachep && (flags & SLAB_PANIC)) 2353 if (!cachep && (flags & SLAB_PANIC))
2342 panic("kmem_cache_create(): failed to create slab `%s'\n", 2354 panic("kmem_cache_create(): failed to create slab `%s'\n",
2343 name); 2355 name);
2344 mutex_unlock(&cache_chain_mutex); 2356 mutex_unlock(&cache_chain_mutex);
2345 return cachep; 2357 return cachep;
2346 } 2358 }
2347 EXPORT_SYMBOL(kmem_cache_create); 2359 EXPORT_SYMBOL(kmem_cache_create);
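
The functional change to kmem_cache_create() in this hunk is the new reciprocal_buffer_size member, filled in from reciprocal_value(size) a few lines above. The obj_to_index() side of the change is not shown in this part of the diff; the following is only a sketch of how such a precomputed value is typically consumed via the reciprocal_divide() helper declared in the new header (member names follow this diff, the exact kernel hunk may differ):

/* Sketch only: turning an object pointer into its index within a slab
 * with a multiply + shift instead of a divide. The real obj_to_index()
 * hunk is not part of this excerpt. */
static inline unsigned int obj_to_index_sketch(const struct kmem_cache *cache,
					       const struct slab *slab,
					       const void *obj)
{
	u32 offset = (u32)((const char *)obj - (const char *)slab->s_mem);

	/* offset / cache->buffer_size, using the cached reciprocal */
	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
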
2348 2360
2349 #if DEBUG 2361 #if DEBUG
2350 static void check_irq_off(void) 2362 static void check_irq_off(void)
2351 { 2363 {
2352 BUG_ON(!irqs_disabled()); 2364 BUG_ON(!irqs_disabled());
2353 } 2365 }
2354 2366
2355 static void check_irq_on(void) 2367 static void check_irq_on(void)
2356 { 2368 {
2357 BUG_ON(irqs_disabled()); 2369 BUG_ON(irqs_disabled());
2358 } 2370 }
2359 2371
2360 static void check_spinlock_acquired(struct kmem_cache *cachep) 2372 static void check_spinlock_acquired(struct kmem_cache *cachep)
2361 { 2373 {
2362 #ifdef CONFIG_SMP 2374 #ifdef CONFIG_SMP
2363 check_irq_off(); 2375 check_irq_off();
2364 assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); 2376 assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
2365 #endif 2377 #endif
2366 } 2378 }
2367 2379
2368 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) 2380 static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2369 { 2381 {
2370 #ifdef CONFIG_SMP 2382 #ifdef CONFIG_SMP
2371 check_irq_off(); 2383 check_irq_off();
2372 assert_spin_locked(&cachep->nodelists[node]->list_lock); 2384 assert_spin_locked(&cachep->nodelists[node]->list_lock);
2373 #endif 2385 #endif
2374 } 2386 }
2375 2387
2376 #else 2388 #else
2377 #define check_irq_off() do { } while(0) 2389 #define check_irq_off() do { } while(0)
2378 #define check_irq_on() do { } while(0) 2390 #define check_irq_on() do { } while(0)
2379 #define check_spinlock_acquired(x) do { } while(0) 2391 #define check_spinlock_acquired(x) do { } while(0)
2380 #define check_spinlock_acquired_node(x, y) do { } while(0) 2392 #define check_spinlock_acquired_node(x, y) do { } while(0)
2381 #endif 2393 #endif
2382 2394
2383 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 2395 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2384 struct array_cache *ac, 2396 struct array_cache *ac,
2385 int force, int node); 2397 int force, int node);
2386 2398
2387 static void do_drain(void *arg) 2399 static void do_drain(void *arg)
2388 { 2400 {
2389 struct kmem_cache *cachep = arg; 2401 struct kmem_cache *cachep = arg;
2390 struct array_cache *ac; 2402 struct array_cache *ac;
2391 int node = numa_node_id(); 2403 int node = numa_node_id();
2392 2404
2393 check_irq_off(); 2405 check_irq_off();
2394 ac = cpu_cache_get(cachep); 2406 ac = cpu_cache_get(cachep);
2395 spin_lock(&cachep->nodelists[node]->list_lock); 2407 spin_lock(&cachep->nodelists[node]->list_lock);
2396 free_block(cachep, ac->entry, ac->avail, node); 2408 free_block(cachep, ac->entry, ac->avail, node);
2397 spin_unlock(&cachep->nodelists[node]->list_lock); 2409 spin_unlock(&cachep->nodelists[node]->list_lock);
2398 ac->avail = 0; 2410 ac->avail = 0;
2399 } 2411 }
2400 2412
2401 static void drain_cpu_caches(struct kmem_cache *cachep) 2413 static void drain_cpu_caches(struct kmem_cache *cachep)
2402 { 2414 {
2403 struct kmem_list3 *l3; 2415 struct kmem_list3 *l3;
2404 int node; 2416 int node;
2405 2417
2406 on_each_cpu(do_drain, cachep, 1, 1); 2418 on_each_cpu(do_drain, cachep, 1, 1);
2407 check_irq_on(); 2419 check_irq_on();
2408 for_each_online_node(node) { 2420 for_each_online_node(node) {
2409 l3 = cachep->nodelists[node]; 2421 l3 = cachep->nodelists[node];
2410 if (l3 && l3->alien) 2422 if (l3 && l3->alien)
2411 drain_alien_cache(cachep, l3->alien); 2423 drain_alien_cache(cachep, l3->alien);
2412 } 2424 }
2413 2425
2414 for_each_online_node(node) { 2426 for_each_online_node(node) {
2415 l3 = cachep->nodelists[node]; 2427 l3 = cachep->nodelists[node];
2416 if (l3) 2428 if (l3)
2417 drain_array(cachep, l3, l3->shared, 1, node); 2429 drain_array(cachep, l3, l3->shared, 1, node);
2418 } 2430 }
2419 } 2431 }
2420 2432
2421 /* 2433 /*
2422 * Remove slabs from the list of free slabs. 2434 * Remove slabs from the list of free slabs.
2423 * Specify the number of slabs to drain in tofree. 2435 * Specify the number of slabs to drain in tofree.
2424 * 2436 *
2425 * Returns the actual number of slabs released. 2437 * Returns the actual number of slabs released.
2426 */ 2438 */
2427 static int drain_freelist(struct kmem_cache *cache, 2439 static int drain_freelist(struct kmem_cache *cache,
2428 struct kmem_list3 *l3, int tofree) 2440 struct kmem_list3 *l3, int tofree)
2429 { 2441 {
2430 struct list_head *p; 2442 struct list_head *p;
2431 int nr_freed; 2443 int nr_freed;
2432 struct slab *slabp; 2444 struct slab *slabp;
2433 2445
2434 nr_freed = 0; 2446 nr_freed = 0;
2435 while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { 2447 while (nr_freed < tofree && !list_empty(&l3->slabs_free)) {
2436 2448
2437 spin_lock_irq(&l3->list_lock); 2449 spin_lock_irq(&l3->list_lock);
2438 p = l3->slabs_free.prev; 2450 p = l3->slabs_free.prev;
2439 if (p == &l3->slabs_free) { 2451 if (p == &l3->slabs_free) {
2440 spin_unlock_irq(&l3->list_lock); 2452 spin_unlock_irq(&l3->list_lock);
2441 goto out; 2453 goto out;
2442 } 2454 }
2443 2455
2444 slabp = list_entry(p, struct slab, list); 2456 slabp = list_entry(p, struct slab, list);
2445 #if DEBUG 2457 #if DEBUG
2446 BUG_ON(slabp->inuse); 2458 BUG_ON(slabp->inuse);
2447 #endif 2459 #endif
2448 list_del(&slabp->list); 2460 list_del(&slabp->list);
2449 /* 2461 /*
2450 * Safe to drop the lock. The slab is no longer linked 2462 * Safe to drop the lock. The slab is no longer linked
2451 * to the cache. 2463 * to the cache.
2452 */ 2464 */
2453 l3->free_objects -= cache->num; 2465 l3->free_objects -= cache->num;
2454 spin_unlock_irq(&l3->list_lock); 2466 spin_unlock_irq(&l3->list_lock);
2455 slab_destroy(cache, slabp); 2467 slab_destroy(cache, slabp);
2456 nr_freed++; 2468 nr_freed++;
2457 } 2469 }
2458 out: 2470 out:
2459 return nr_freed; 2471 return nr_freed;
2460 } 2472 }
2461 2473
2462 /* Called with cache_chain_mutex held to protect against cpu hotplug */ 2474 /* Called with cache_chain_mutex held to protect against cpu hotplug */
2463 static int __cache_shrink(struct kmem_cache *cachep) 2475 static int __cache_shrink(struct kmem_cache *cachep)
2464 { 2476 {
2465 int ret = 0, i = 0; 2477 int ret = 0, i = 0;
2466 struct kmem_list3 *l3; 2478 struct kmem_list3 *l3;
2467 2479
2468 drain_cpu_caches(cachep); 2480 drain_cpu_caches(cachep);
2469 2481
2470 check_irq_on(); 2482 check_irq_on();
2471 for_each_online_node(i) { 2483 for_each_online_node(i) {
2472 l3 = cachep->nodelists[i]; 2484 l3 = cachep->nodelists[i];
2473 if (!l3) 2485 if (!l3)
2474 continue; 2486 continue;
2475 2487
2476 drain_freelist(cachep, l3, l3->free_objects); 2488 drain_freelist(cachep, l3, l3->free_objects);
2477 2489
2478 ret += !list_empty(&l3->slabs_full) || 2490 ret += !list_empty(&l3->slabs_full) ||
2479 !list_empty(&l3->slabs_partial); 2491 !list_empty(&l3->slabs_partial);
2480 } 2492 }
2481 return (ret ? 1 : 0); 2493 return (ret ? 1 : 0);
2482 } 2494 }
2483 2495
2484 /** 2496 /**
2485 * kmem_cache_shrink - Shrink a cache. 2497 * kmem_cache_shrink - Shrink a cache.
2486 * @cachep: The cache to shrink. 2498 * @cachep: The cache to shrink.
2487 * 2499 *
2488 * Releases as many slabs as possible for a cache. 2500 * Releases as many slabs as possible for a cache.
2489 * To help debugging, a zero exit status indicates all slabs were released. 2501 * To help debugging, a zero exit status indicates all slabs were released.
2490 */ 2502 */
2491 int kmem_cache_shrink(struct kmem_cache *cachep) 2503 int kmem_cache_shrink(struct kmem_cache *cachep)
2492 { 2504 {
2493 int ret; 2505 int ret;
2494 BUG_ON(!cachep || in_interrupt()); 2506 BUG_ON(!cachep || in_interrupt());
2495 2507
2496 mutex_lock(&cache_chain_mutex); 2508 mutex_lock(&cache_chain_mutex);
2497 ret = __cache_shrink(cachep); 2509 ret = __cache_shrink(cachep);
2498 mutex_unlock(&cache_chain_mutex); 2510 mutex_unlock(&cache_chain_mutex);
2499 return ret; 2511 return ret;
2500 } 2512 }
2501 EXPORT_SYMBOL(kmem_cache_shrink); 2513 EXPORT_SYMBOL(kmem_cache_shrink);
2502 2514
2503 /** 2515 /**
2504 * kmem_cache_destroy - delete a cache 2516 * kmem_cache_destroy - delete a cache
2505 * @cachep: the cache to destroy 2517 * @cachep: the cache to destroy
2506 * 2518 *
2507 * Remove a struct kmem_cache object from the slab cache. 2519 * Remove a struct kmem_cache object from the slab cache.
2508 * 2520 *
2509 * It is expected this function will be called by a module when it is 2521 * It is expected this function will be called by a module when it is
2510 * unloaded. This will remove the cache completely, and avoid a duplicate 2522 * unloaded. This will remove the cache completely, and avoid a duplicate
2511 * cache being allocated each time a module is loaded and unloaded, if the 2523 * cache being allocated each time a module is loaded and unloaded, if the
2512 * module doesn't have persistent in-kernel storage across loads and unloads. 2524 * module doesn't have persistent in-kernel storage across loads and unloads.
2513 * 2525 *
2514 * The cache must be empty before calling this function. 2526 * The cache must be empty before calling this function.
2515 * 2527 *
2516 * The caller must guarantee that no one will allocate memory from the cache 2528 * The caller must guarantee that no one will allocate memory from the cache
2517 * during the kmem_cache_destroy(). 2529 * during the kmem_cache_destroy().
2518 */ 2530 */
2519 void kmem_cache_destroy(struct kmem_cache *cachep) 2531 void kmem_cache_destroy(struct kmem_cache *cachep)
2520 { 2532 {
2521 BUG_ON(!cachep || in_interrupt()); 2533 BUG_ON(!cachep || in_interrupt());
2522 2534
2523 /* Find the cache in the chain of caches. */ 2535 /* Find the cache in the chain of caches. */
2524 mutex_lock(&cache_chain_mutex); 2536 mutex_lock(&cache_chain_mutex);
2525 /* 2537 /*
2526 * the chain is never empty, cache_cache is never destroyed 2538 * the chain is never empty, cache_cache is never destroyed
2527 */ 2539 */
2528 list_del(&cachep->next); 2540 list_del(&cachep->next);
2529 if (__cache_shrink(cachep)) { 2541 if (__cache_shrink(cachep)) {
2530 slab_error(cachep, "Can't free all objects"); 2542 slab_error(cachep, "Can't free all objects");
2531 list_add(&cachep->next, &cache_chain); 2543 list_add(&cachep->next, &cache_chain);
2532 mutex_unlock(&cache_chain_mutex); 2544 mutex_unlock(&cache_chain_mutex);
2533 return; 2545 return;
2534 } 2546 }
2535 2547
2536 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2548 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
2537 synchronize_rcu(); 2549 synchronize_rcu();
2538 2550
2539 __kmem_cache_destroy(cachep); 2551 __kmem_cache_destroy(cachep);
2540 mutex_unlock(&cache_chain_mutex); 2552 mutex_unlock(&cache_chain_mutex);
2541 } 2553 }
2542 EXPORT_SYMBOL(kmem_cache_destroy); 2554 EXPORT_SYMBOL(kmem_cache_destroy);
2543 2555
2544 /* 2556 /*
2545 * Get the memory for a slab management obj. 2557 * Get the memory for a slab management obj.
2546 * For a slab cache when the slab descriptor is off-slab, slab descriptors 2558 * For a slab cache when the slab descriptor is off-slab, slab descriptors
2547 * always come from malloc_sizes caches. The slab descriptor cannot 2559 * always come from malloc_sizes caches. The slab descriptor cannot
2548 * come from the same cache which is getting created because, 2560 * come from the same cache which is getting created because,
2549 * when we are searching for an appropriate cache for these 2561 * when we are searching for an appropriate cache for these
2550 * descriptors in kmem_cache_create, we search through the malloc_sizes array. 2562 * descriptors in kmem_cache_create, we search through the malloc_sizes array.
2551 * If we are creating a malloc_sizes cache here it would not be visible to 2563 * If we are creating a malloc_sizes cache here it would not be visible to
2552 * kmem_find_general_cachep till the initialization is complete. 2564 * kmem_find_general_cachep till the initialization is complete.
2553 * Hence we cannot have slabp_cache same as the original cache. 2565 * Hence we cannot have slabp_cache same as the original cache.
2554 */ 2566 */
2555 static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, 2567 static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2556 int colour_off, gfp_t local_flags, 2568 int colour_off, gfp_t local_flags,
2557 int nodeid) 2569 int nodeid)
2558 { 2570 {
2559 struct slab *slabp; 2571 struct slab *slabp;
2560 2572
2561 if (OFF_SLAB(cachep)) { 2573 if (OFF_SLAB(cachep)) {
2562 /* Slab management obj is off-slab. */ 2574 /* Slab management obj is off-slab. */
2563 slabp = kmem_cache_alloc_node(cachep->slabp_cache, 2575 slabp = kmem_cache_alloc_node(cachep->slabp_cache,
2564 local_flags & ~GFP_THISNODE, nodeid); 2576 local_flags & ~GFP_THISNODE, nodeid);
2565 if (!slabp) 2577 if (!slabp)
2566 return NULL; 2578 return NULL;
2567 } else { 2579 } else {
2568 slabp = objp + colour_off; 2580 slabp = objp + colour_off;
2569 colour_off += cachep->slab_size; 2581 colour_off += cachep->slab_size;
2570 } 2582 }
2571 slabp->inuse = 0; 2583 slabp->inuse = 0;
2572 slabp->colouroff = colour_off; 2584 slabp->colouroff = colour_off;
2573 slabp->s_mem = objp + colour_off; 2585 slabp->s_mem = objp + colour_off;
2574 slabp->nodeid = nodeid; 2586 slabp->nodeid = nodeid;
2575 return slabp; 2587 return slabp;
2576 } 2588 }
2577 2589
2578 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2590 static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2579 { 2591 {
2580 return (kmem_bufctl_t *) (slabp + 1); 2592 return (kmem_bufctl_t *) (slabp + 1);
2581 } 2593 }
2582 2594
2583 static void cache_init_objs(struct kmem_cache *cachep, 2595 static void cache_init_objs(struct kmem_cache *cachep,
2584 struct slab *slabp, unsigned long ctor_flags) 2596 struct slab *slabp, unsigned long ctor_flags)
2585 { 2597 {
2586 int i; 2598 int i;
2587 2599
2588 for (i = 0; i < cachep->num; i++) { 2600 for (i = 0; i < cachep->num; i++) {
2589 void *objp = index_to_obj(cachep, slabp, i); 2601 void *objp = index_to_obj(cachep, slabp, i);
2590 #if DEBUG 2602 #if DEBUG
2591 /* need to poison the objs? */ 2603 /* need to poison the objs? */
2592 if (cachep->flags & SLAB_POISON) 2604 if (cachep->flags & SLAB_POISON)
2593 poison_obj(cachep, objp, POISON_FREE); 2605 poison_obj(cachep, objp, POISON_FREE);
2594 if (cachep->flags & SLAB_STORE_USER) 2606 if (cachep->flags & SLAB_STORE_USER)
2595 *dbg_userword(cachep, objp) = NULL; 2607 *dbg_userword(cachep, objp) = NULL;
2596 2608
2597 if (cachep->flags & SLAB_RED_ZONE) { 2609 if (cachep->flags & SLAB_RED_ZONE) {
2598 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2610 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2599 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2611 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2600 } 2612 }
2601 /* 2613 /*
2602 * Constructors are not allowed to allocate memory from the same 2614 * Constructors are not allowed to allocate memory from the same
2603 * cache which they are a constructor for. Otherwise, deadlock. 2615 * cache which they are a constructor for. Otherwise, deadlock.
2604 * They must also be threaded. 2616 * They must also be threaded.
2605 */ 2617 */
2606 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2618 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2607 cachep->ctor(objp + obj_offset(cachep), cachep, 2619 cachep->ctor(objp + obj_offset(cachep), cachep,
2608 ctor_flags); 2620 ctor_flags);
2609 2621
2610 if (cachep->flags & SLAB_RED_ZONE) { 2622 if (cachep->flags & SLAB_RED_ZONE) {
2611 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2623 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2612 slab_error(cachep, "constructor overwrote the" 2624 slab_error(cachep, "constructor overwrote the"
2613 " end of an object"); 2625 " end of an object");
2614 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2626 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2615 slab_error(cachep, "constructor overwrote the" 2627 slab_error(cachep, "constructor overwrote the"
2616 " start of an object"); 2628 " start of an object");
2617 } 2629 }
2618 if ((cachep->buffer_size % PAGE_SIZE) == 0 && 2630 if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2619 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2631 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2620 kernel_map_pages(virt_to_page(objp), 2632 kernel_map_pages(virt_to_page(objp),
2621 cachep->buffer_size / PAGE_SIZE, 0); 2633 cachep->buffer_size / PAGE_SIZE, 0);
2622 #else 2634 #else
2623 if (cachep->ctor) 2635 if (cachep->ctor)
2624 cachep->ctor(objp, cachep, ctor_flags); 2636 cachep->ctor(objp, cachep, ctor_flags);
2625 #endif 2637 #endif
2626 slab_bufctl(slabp)[i] = i + 1; 2638 slab_bufctl(slabp)[i] = i + 1;
2627 } 2639 }
2628 slab_bufctl(slabp)[i - 1] = BUFCTL_END; 2640 slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2629 slabp->free = 0; 2641 slabp->free = 0;
2630 } 2642 }
2631 2643
2632 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2644 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2633 { 2645 {
2634 if (flags & GFP_DMA) 2646 if (flags & GFP_DMA)
2635 BUG_ON(!(cachep->gfpflags & GFP_DMA)); 2647 BUG_ON(!(cachep->gfpflags & GFP_DMA));
2636 else 2648 else
2637 BUG_ON(cachep->gfpflags & GFP_DMA); 2649 BUG_ON(cachep->gfpflags & GFP_DMA);
2638 } 2650 }
2639 2651
2640 static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, 2652 static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2641 int nodeid) 2653 int nodeid)
2642 { 2654 {
2643 void *objp = index_to_obj(cachep, slabp, slabp->free); 2655 void *objp = index_to_obj(cachep, slabp, slabp->free);
2644 kmem_bufctl_t next; 2656 kmem_bufctl_t next;
2645 2657
2646 slabp->inuse++; 2658 slabp->inuse++;
2647 next = slab_bufctl(slabp)[slabp->free]; 2659 next = slab_bufctl(slabp)[slabp->free];
2648 #if DEBUG 2660 #if DEBUG
2649 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2661 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2650 WARN_ON(slabp->nodeid != nodeid); 2662 WARN_ON(slabp->nodeid != nodeid);
2651 #endif 2663 #endif
2652 slabp->free = next; 2664 slabp->free = next;
2653 2665
2654 return objp; 2666 return objp;
2655 } 2667 }
2656 2668
2657 static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, 2669 static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2658 void *objp, int nodeid) 2670 void *objp, int nodeid)
2659 { 2671 {
2660 unsigned int objnr = obj_to_index(cachep, slabp, objp); 2672 unsigned int objnr = obj_to_index(cachep, slabp, objp);
2661 2673
2662 #if DEBUG 2674 #if DEBUG
2663 /* Verify that the slab belongs to the intended node */ 2675 /* Verify that the slab belongs to the intended node */
2664 WARN_ON(slabp->nodeid != nodeid); 2676 WARN_ON(slabp->nodeid != nodeid);
2665 2677
2666 if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { 2678 if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2667 printk(KERN_ERR "slab: double free detected in cache " 2679 printk(KERN_ERR "slab: double free detected in cache "
2668 "'%s', objp %p\n", cachep->name, objp); 2680 "'%s', objp %p\n", cachep->name, objp);
2669 BUG(); 2681 BUG();
2670 } 2682 }
2671 #endif 2683 #endif
2672 slab_bufctl(slabp)[objnr] = slabp->free; 2684 slab_bufctl(slabp)[objnr] = slabp->free;
2673 slabp->free = objnr; 2685 slabp->free = objnr;
2674 slabp->inuse--; 2686 slabp->inuse--;
2675 } 2687 }
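
slab_get_obj() and slab_put_obj() above pop and push entries on the per-slab free list that cache_init_objs() threads through the kmem_bufctl_t array: slot i stores the index of the next free object, slabp->free is the head, and BUFCTL_END terminates the chain. A standalone sketch of the same array-as-free-list idea (the sentinel value and sizes below are illustrative, not the kernel's):

#include <stdio.h>

#define NOBJ		4
#define FREELIST_END	0xffffu		/* sentinel for this sketch */

static unsigned short bufctl[NOBJ];
static unsigned int free_head;

static void init_freelist(void)
{
	unsigned int i;
	for (i = 0; i < NOBJ; i++)
		bufctl[i] = i + 1;		/* each slot points at the next */
	bufctl[NOBJ - 1] = FREELIST_END;	/* last entry terminates the list */
	free_head = 0;
}

static unsigned int get_obj(void)		/* pop: like slab_get_obj() */
{
	unsigned int idx = free_head;
	free_head = bufctl[idx];
	return idx;
}

static void put_obj(unsigned int idx)		/* push: like slab_put_obj() */
{
	bufctl[idx] = free_head;
	free_head = idx;
}

int main(void)
{
	init_freelist();
	unsigned int a = get_obj(), b = get_obj();
	printf("allocated %u then %u\n", a, b);		/* 0 then 1 */
	put_obj(a);
	printf("next alloc reuses %u\n", get_obj());	/* 0 again (LIFO) */
	return 0;
}
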
2676 2688
2677 /* 2689 /*
2678 * Map pages beginning at addr to the given cache and slab. This is required 2690 * Map pages beginning at addr to the given cache and slab. This is required
2679 * for the slab allocator to be able to lookup the cache and slab of a 2691 * for the slab allocator to be able to lookup the cache and slab of a
2680 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. 2692 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
2681 */ 2693 */
2682 static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, 2694 static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2683 void *addr) 2695 void *addr)
2684 { 2696 {
2685 int nr_pages; 2697 int nr_pages;
2686 struct page *page; 2698 struct page *page;
2687 2699
2688 page = virt_to_page(addr); 2700 page = virt_to_page(addr);
2689 2701
2690 nr_pages = 1; 2702 nr_pages = 1;
2691 if (likely(!PageCompound(page))) 2703 if (likely(!PageCompound(page)))
2692 nr_pages <<= cache->gfporder; 2704 nr_pages <<= cache->gfporder;
2693 2705
2694 do { 2706 do {
2695 page_set_cache(page, cache); 2707 page_set_cache(page, cache);
2696 page_set_slab(page, slab); 2708 page_set_slab(page, slab);
2697 page++; 2709 page++;
2698 } while (--nr_pages); 2710 } while (--nr_pages);
2699 } 2711 }
2700 2712
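slab_map_pages() is what later lets kfree(), the BUG_ON in kmem_cache_free() and free_block() go from a pointer back to its cache and slab. A sketch only of that reverse direction, relying on the page_get_cache()/page_get_slab() accessors defined earlier in slab.c (which, in this kernel generation, read the pointers that page_set_cache()/page_set_slab() stash in the page's otherwise unused lru field):

/* Sketch, not part of the diff: the lookups enabled by slab_map_pages(). */
static inline struct kmem_cache *virt_to_cache_sketch(const void *obj)
{
	struct page *page = virt_to_page(obj);

	return page_get_cache(page);	/* cache pointer stored per page */
}

static inline struct slab *virt_to_slab_sketch(const void *obj)
{
	struct page *page = virt_to_page(obj);

	return page_get_slab(page);	/* slab pointer stored per page */
}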
2701 /* 2713 /*
2702 * Grow (by 1) the number of slabs within a cache. This is called by 2714 * Grow (by 1) the number of slabs within a cache. This is called by
2703 * kmem_cache_alloc() when there are no active objs left in a cache. 2715 * kmem_cache_alloc() when there are no active objs left in a cache.
2704 */ 2716 */
2705 static int cache_grow(struct kmem_cache *cachep, 2717 static int cache_grow(struct kmem_cache *cachep,
2706 gfp_t flags, int nodeid, void *objp) 2718 gfp_t flags, int nodeid, void *objp)
2707 { 2719 {
2708 struct slab *slabp; 2720 struct slab *slabp;
2709 size_t offset; 2721 size_t offset;
2710 gfp_t local_flags; 2722 gfp_t local_flags;
2711 unsigned long ctor_flags; 2723 unsigned long ctor_flags;
2712 struct kmem_list3 *l3; 2724 struct kmem_list3 *l3;
2713 2725
2714 /* 2726 /*
2715 * Be lazy and only check for valid flags here, keeping it out of the 2727 * Be lazy and only check for valid flags here, keeping it out of the
2716 * critical path in kmem_cache_alloc(). 2728 * critical path in kmem_cache_alloc().
2717 */ 2729 */
2718 BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW)); 2730 BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
2719 if (flags & __GFP_NO_GROW) 2731 if (flags & __GFP_NO_GROW)
2720 return 0; 2732 return 0;
2721 2733
2722 ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2734 ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2723 local_flags = (flags & GFP_LEVEL_MASK); 2735 local_flags = (flags & GFP_LEVEL_MASK);
2724 if (!(local_flags & __GFP_WAIT)) 2736 if (!(local_flags & __GFP_WAIT))
2725 /* 2737 /*
2726 * Not allowed to sleep. Need to tell a constructor about 2738 * Not allowed to sleep. Need to tell a constructor about
2727 * this - it might need to know... 2739 * this - it might need to know...
2728 */ 2740 */
2729 ctor_flags |= SLAB_CTOR_ATOMIC; 2741 ctor_flags |= SLAB_CTOR_ATOMIC;
2730 2742
2731 /* Take the l3 list lock to change the colour_next on this node */ 2743 /* Take the l3 list lock to change the colour_next on this node */
2732 check_irq_off(); 2744 check_irq_off();
2733 l3 = cachep->nodelists[nodeid]; 2745 l3 = cachep->nodelists[nodeid];
2734 spin_lock(&l3->list_lock); 2746 spin_lock(&l3->list_lock);
2735 2747
2736 /* Get colour for the slab, and calc the next value. */ 2748 /* Get colour for the slab, and calc the next value. */
2737 offset = l3->colour_next; 2749 offset = l3->colour_next;
2738 l3->colour_next++; 2750 l3->colour_next++;
2739 if (l3->colour_next >= cachep->colour) 2751 if (l3->colour_next >= cachep->colour)
2740 l3->colour_next = 0; 2752 l3->colour_next = 0;
2741 spin_unlock(&l3->list_lock); 2753 spin_unlock(&l3->list_lock);
2742 2754
2743 offset *= cachep->colour_off; 2755 offset *= cachep->colour_off;
2744 2756
2745 if (local_flags & __GFP_WAIT) 2757 if (local_flags & __GFP_WAIT)
2746 local_irq_enable(); 2758 local_irq_enable();
2747 2759
2748 /* 2760 /*
2749 * The test for missing atomic flag is performed here, rather than 2761 * The test for missing atomic flag is performed here, rather than
2750 * the more obvious place, simply to reduce the critical path length 2762 * the more obvious place, simply to reduce the critical path length
2751 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they 2763 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
2752 * will eventually be caught here (where it matters). 2764 * will eventually be caught here (where it matters).
2753 */ 2765 */
2754 kmem_flagcheck(cachep, flags); 2766 kmem_flagcheck(cachep, flags);
2755 2767
2756 /* 2768 /*
2757 * Get mem for the objs. Attempt to allocate a physical page from 2769 * Get mem for the objs. Attempt to allocate a physical page from
2758 * 'nodeid'. 2770 * 'nodeid'.
2759 */ 2771 */
2760 if (!objp) 2772 if (!objp)
2761 objp = kmem_getpages(cachep, flags, nodeid); 2773 objp = kmem_getpages(cachep, flags, nodeid);
2762 if (!objp) 2774 if (!objp)
2763 goto failed; 2775 goto failed;
2764 2776
2765 /* Get slab management. */ 2777 /* Get slab management. */
2766 slabp = alloc_slabmgmt(cachep, objp, offset, 2778 slabp = alloc_slabmgmt(cachep, objp, offset,
2767 local_flags & ~GFP_THISNODE, nodeid); 2779 local_flags & ~GFP_THISNODE, nodeid);
2768 if (!slabp) 2780 if (!slabp)
2769 goto opps1; 2781 goto opps1;
2770 2782
2771 slabp->nodeid = nodeid; 2783 slabp->nodeid = nodeid;
2772 slab_map_pages(cachep, slabp, objp); 2784 slab_map_pages(cachep, slabp, objp);
2773 2785
2774 cache_init_objs(cachep, slabp, ctor_flags); 2786 cache_init_objs(cachep, slabp, ctor_flags);
2775 2787
2776 if (local_flags & __GFP_WAIT) 2788 if (local_flags & __GFP_WAIT)
2777 local_irq_disable(); 2789 local_irq_disable();
2778 check_irq_off(); 2790 check_irq_off();
2779 spin_lock(&l3->list_lock); 2791 spin_lock(&l3->list_lock);
2780 2792
2781 /* Make slab active. */ 2793 /* Make slab active. */
2782 list_add_tail(&slabp->list, &(l3->slabs_free)); 2794 list_add_tail(&slabp->list, &(l3->slabs_free));
2783 STATS_INC_GROWN(cachep); 2795 STATS_INC_GROWN(cachep);
2784 l3->free_objects += cachep->num; 2796 l3->free_objects += cachep->num;
2785 spin_unlock(&l3->list_lock); 2797 spin_unlock(&l3->list_lock);
2786 return 1; 2798 return 1;
2787 opps1: 2799 opps1:
2788 kmem_freepages(cachep, objp); 2800 kmem_freepages(cachep, objp);
2789 failed: 2801 failed:
2790 if (local_flags & __GFP_WAIT) 2802 if (local_flags & __GFP_WAIT)
2791 local_irq_disable(); 2803 local_irq_disable();
2792 return 0; 2804 return 0;
2793 } 2805 }
2794 2806
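The colouring block near the top of cache_grow() only cycles an index under the l3 list lock and scales it afterwards. A stand-alone illustration with made-up numbers (colour_off would be the L1 cache line size, colour the number of distinct offsets that fit in the slab's leftover space):

#include <stdio.h>

/* Illustration of the l3->colour_next cycling in cache_grow(). */
int main(void)
{
	unsigned int colour = 4;	/* cachep->colour      */
	unsigned int colour_off = 64;	/* cachep->colour_off  */
	unsigned int colour_next = 0;	/* l3->colour_next     */
	int slab;

	for (slab = 0; slab < 6; slab++) {
		unsigned int offset = colour_next;

		colour_next++;
		if (colour_next >= colour)
			colour_next = 0;
		offset *= colour_off;
		printf("slab %d places its first object at offset %u\n",
		       slab, offset);
	}
	return 0;	/* prints offsets 0, 64, 128, 192, 0, 64 */
}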
2795 #if DEBUG 2807 #if DEBUG
2796 2808
2797 /* 2809 /*
2798 * Perform extra freeing checks: 2810 * Perform extra freeing checks:
2799 * - detect bad pointers. 2811 * - detect bad pointers.
2800 * - POISON/RED_ZONE checking 2812 * - POISON/RED_ZONE checking
2801 * - destructor calls, for caches with POISON+dtor 2813 * - destructor calls, for caches with POISON+dtor
2802 */ 2814 */
2803 static void kfree_debugcheck(const void *objp) 2815 static void kfree_debugcheck(const void *objp)
2804 { 2816 {
2805 struct page *page; 2817 struct page *page;
2806 2818
2807 if (!virt_addr_valid(objp)) { 2819 if (!virt_addr_valid(objp)) {
2808 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2820 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2809 (unsigned long)objp); 2821 (unsigned long)objp);
2810 BUG(); 2822 BUG();
2811 } 2823 }
2812 page = virt_to_page(objp); 2824 page = virt_to_page(objp);
2813 if (!PageSlab(page)) { 2825 if (!PageSlab(page)) {
2814 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", 2826 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
2815 (unsigned long)objp); 2827 (unsigned long)objp);
2816 BUG(); 2828 BUG();
2817 } 2829 }
2818 } 2830 }
2819 2831
2820 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) 2832 static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2821 { 2833 {
2822 unsigned long redzone1, redzone2; 2834 unsigned long redzone1, redzone2;
2823 2835
2824 redzone1 = *dbg_redzone1(cache, obj); 2836 redzone1 = *dbg_redzone1(cache, obj);
2825 redzone2 = *dbg_redzone2(cache, obj); 2837 redzone2 = *dbg_redzone2(cache, obj);
2826 2838
2827 /* 2839 /*
2828 * Redzone is ok. 2840 * Redzone is ok.
2829 */ 2841 */
2830 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) 2842 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2831 return; 2843 return;
2832 2844
2833 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) 2845 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2834 slab_error(cache, "double free detected"); 2846 slab_error(cache, "double free detected");
2835 else 2847 else
2836 slab_error(cache, "memory outside object was overwritten"); 2848 slab_error(cache, "memory outside object was overwritten");
2837 2849
2838 printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n", 2850 printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n",
2839 obj, redzone1, redzone2); 2851 obj, redzone1, redzone2);
2840 } 2852 }
2841 2853
2842 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 2854 static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2843 void *caller) 2855 void *caller)
2844 { 2856 {
2845 struct page *page; 2857 struct page *page;
2846 unsigned int objnr; 2858 unsigned int objnr;
2847 struct slab *slabp; 2859 struct slab *slabp;
2848 2860
2849 objp -= obj_offset(cachep); 2861 objp -= obj_offset(cachep);
2850 kfree_debugcheck(objp); 2862 kfree_debugcheck(objp);
2851 page = virt_to_page(objp); 2863 page = virt_to_page(objp);
2852 2864
2853 slabp = page_get_slab(page); 2865 slabp = page_get_slab(page);
2854 2866
2855 if (cachep->flags & SLAB_RED_ZONE) { 2867 if (cachep->flags & SLAB_RED_ZONE) {
2856 verify_redzone_free(cachep, objp); 2868 verify_redzone_free(cachep, objp);
2857 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2869 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2858 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2870 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2859 } 2871 }
2860 if (cachep->flags & SLAB_STORE_USER) 2872 if (cachep->flags & SLAB_STORE_USER)
2861 *dbg_userword(cachep, objp) = caller; 2873 *dbg_userword(cachep, objp) = caller;
2862 2874
2863 objnr = obj_to_index(cachep, slabp, objp); 2875 objnr = obj_to_index(cachep, slabp, objp);
2864 2876
2865 BUG_ON(objnr >= cachep->num); 2877 BUG_ON(objnr >= cachep->num);
2866 BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); 2878 BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2867 2879
2868 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2880 if (cachep->flags & SLAB_DEBUG_INITIAL) {
2869 /* 2881 /*
2870 * Need to call the slab's constructor so the caller can 2882 * Need to call the slab's constructor so the caller can
2871 * verify its state (for debugging). Called without 2883 * verify its state (for debugging). Called without
2872 * the cache-lock held. 2884 * the cache-lock held.
2873 */ 2885 */
2874 cachep->ctor(objp + obj_offset(cachep), 2886 cachep->ctor(objp + obj_offset(cachep),
2875 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); 2887 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2876 } 2888 }
2877 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2889 if (cachep->flags & SLAB_POISON && cachep->dtor) {
2878 /* we want to cache poison the object, 2890 /* we want to cache poison the object,
2879 * call the destruction callback 2891 * call the destruction callback
2880 */ 2892 */
2881 cachep->dtor(objp + obj_offset(cachep), cachep, 0); 2893 cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2882 } 2894 }
2883 #ifdef CONFIG_DEBUG_SLAB_LEAK 2895 #ifdef CONFIG_DEBUG_SLAB_LEAK
2884 slab_bufctl(slabp)[objnr] = BUFCTL_FREE; 2896 slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2885 #endif 2897 #endif
2886 if (cachep->flags & SLAB_POISON) { 2898 if (cachep->flags & SLAB_POISON) {
2887 #ifdef CONFIG_DEBUG_PAGEALLOC 2899 #ifdef CONFIG_DEBUG_PAGEALLOC
2888 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 2900 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2889 store_stackinfo(cachep, objp, (unsigned long)caller); 2901 store_stackinfo(cachep, objp, (unsigned long)caller);
2890 kernel_map_pages(virt_to_page(objp), 2902 kernel_map_pages(virt_to_page(objp),
2891 cachep->buffer_size / PAGE_SIZE, 0); 2903 cachep->buffer_size / PAGE_SIZE, 0);
2892 } else { 2904 } else {
2893 poison_obj(cachep, objp, POISON_FREE); 2905 poison_obj(cachep, objp, POISON_FREE);
2894 } 2906 }
2895 #else 2907 #else
2896 poison_obj(cachep, objp, POISON_FREE); 2908 poison_obj(cachep, objp, POISON_FREE);
2897 #endif 2909 #endif
2898 } 2910 }
2899 return objp; 2911 return objp;
2900 } 2912 }
2901 2913
2902 static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) 2914 static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2903 { 2915 {
2904 kmem_bufctl_t i; 2916 kmem_bufctl_t i;
2905 int entries = 0; 2917 int entries = 0;
2906 2918
2907 /* Check slab's freelist to see if this obj is there. */ 2919 /* Check slab's freelist to see if this obj is there. */
2908 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2920 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2909 entries++; 2921 entries++;
2910 if (entries > cachep->num || i >= cachep->num) 2922 if (entries > cachep->num || i >= cachep->num)
2911 goto bad; 2923 goto bad;
2912 } 2924 }
2913 if (entries != cachep->num - slabp->inuse) { 2925 if (entries != cachep->num - slabp->inuse) {
2914 bad: 2926 bad:
2915 printk(KERN_ERR "slab: Internal list corruption detected in " 2927 printk(KERN_ERR "slab: Internal list corruption detected in "
2916 "cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2928 "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2917 cachep->name, cachep->num, slabp, slabp->inuse); 2929 cachep->name, cachep->num, slabp, slabp->inuse);
2918 for (i = 0; 2930 for (i = 0;
2919 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); 2931 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2920 i++) { 2932 i++) {
2921 if (i % 16 == 0) 2933 if (i % 16 == 0)
2922 printk("\n%03x:", i); 2934 printk("\n%03x:", i);
2923 printk(" %02x", ((unsigned char *)slabp)[i]); 2935 printk(" %02x", ((unsigned char *)slabp)[i]);
2924 } 2936 }
2925 printk("\n"); 2937 printk("\n");
2926 BUG(); 2938 BUG();
2927 } 2939 }
2928 } 2940 }
2929 #else 2941 #else
2930 #define kfree_debugcheck(x) do { } while(0) 2942 #define kfree_debugcheck(x) do { } while(0)
2931 #define cache_free_debugcheck(x,objp,z) (objp) 2943 #define cache_free_debugcheck(x,objp,z) (objp)
2932 #define check_slabp(x,y) do { } while(0) 2944 #define check_slabp(x,y) do { } while(0)
2933 #endif 2945 #endif
2934 2946
2935 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 2947 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2936 { 2948 {
2937 int batchcount; 2949 int batchcount;
2938 struct kmem_list3 *l3; 2950 struct kmem_list3 *l3;
2939 struct array_cache *ac; 2951 struct array_cache *ac;
2940 int node; 2952 int node;
2941 2953
2942 node = numa_node_id(); 2954 node = numa_node_id();
2943 2955
2944 check_irq_off(); 2956 check_irq_off();
2945 ac = cpu_cache_get(cachep); 2957 ac = cpu_cache_get(cachep);
2946 retry: 2958 retry:
2947 batchcount = ac->batchcount; 2959 batchcount = ac->batchcount;
2948 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2960 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2949 /* 2961 /*
2950 * If there was little recent activity on this cache, then 2962 * If there was little recent activity on this cache, then
2951 * perform only a partial refill. Otherwise we could generate 2963 * perform only a partial refill. Otherwise we could generate
2952 * refill bouncing. 2964 * refill bouncing.
2953 */ 2965 */
2954 batchcount = BATCHREFILL_LIMIT; 2966 batchcount = BATCHREFILL_LIMIT;
2955 } 2967 }
2956 l3 = cachep->nodelists[node]; 2968 l3 = cachep->nodelists[node];
2957 2969
2958 BUG_ON(ac->avail > 0 || !l3); 2970 BUG_ON(ac->avail > 0 || !l3);
2959 spin_lock(&l3->list_lock); 2971 spin_lock(&l3->list_lock);
2960 2972
2961 /* See if we can refill from the shared array */ 2973 /* See if we can refill from the shared array */
2962 if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) 2974 if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
2963 goto alloc_done; 2975 goto alloc_done;
2964 2976
2965 while (batchcount > 0) { 2977 while (batchcount > 0) {
2966 struct list_head *entry; 2978 struct list_head *entry;
2967 struct slab *slabp; 2979 struct slab *slabp;
2968 /* Get the slab the allocation is to come from. */ 2980 /* Get the slab the allocation is to come from. */
2969 entry = l3->slabs_partial.next; 2981 entry = l3->slabs_partial.next;
2970 if (entry == &l3->slabs_partial) { 2982 if (entry == &l3->slabs_partial) {
2971 l3->free_touched = 1; 2983 l3->free_touched = 1;
2972 entry = l3->slabs_free.next; 2984 entry = l3->slabs_free.next;
2973 if (entry == &l3->slabs_free) 2985 if (entry == &l3->slabs_free)
2974 goto must_grow; 2986 goto must_grow;
2975 } 2987 }
2976 2988
2977 slabp = list_entry(entry, struct slab, list); 2989 slabp = list_entry(entry, struct slab, list);
2978 check_slabp(cachep, slabp); 2990 check_slabp(cachep, slabp);
2979 check_spinlock_acquired(cachep); 2991 check_spinlock_acquired(cachep);
2980 while (slabp->inuse < cachep->num && batchcount--) { 2992 while (slabp->inuse < cachep->num && batchcount--) {
2981 STATS_INC_ALLOCED(cachep); 2993 STATS_INC_ALLOCED(cachep);
2982 STATS_INC_ACTIVE(cachep); 2994 STATS_INC_ACTIVE(cachep);
2983 STATS_SET_HIGH(cachep); 2995 STATS_SET_HIGH(cachep);
2984 2996
2985 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, 2997 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
2986 node); 2998 node);
2987 } 2999 }
2988 check_slabp(cachep, slabp); 3000 check_slabp(cachep, slabp);
2989 3001
2990 /* move slabp to correct slabp list: */ 3002 /* move slabp to correct slabp list: */
2991 list_del(&slabp->list); 3003 list_del(&slabp->list);
2992 if (slabp->free == BUFCTL_END) 3004 if (slabp->free == BUFCTL_END)
2993 list_add(&slabp->list, &l3->slabs_full); 3005 list_add(&slabp->list, &l3->slabs_full);
2994 else 3006 else
2995 list_add(&slabp->list, &l3->slabs_partial); 3007 list_add(&slabp->list, &l3->slabs_partial);
2996 } 3008 }
2997 3009
2998 must_grow: 3010 must_grow:
2999 l3->free_objects -= ac->avail; 3011 l3->free_objects -= ac->avail;
3000 alloc_done: 3012 alloc_done:
3001 spin_unlock(&l3->list_lock); 3013 spin_unlock(&l3->list_lock);
3002 3014
3003 if (unlikely(!ac->avail)) { 3015 if (unlikely(!ac->avail)) {
3004 int x; 3016 int x;
3005 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 3017 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3006 3018
3007 /* cache_grow can reenable interrupts, then ac could change. */ 3019 /* cache_grow can reenable interrupts, then ac could change. */
3008 ac = cpu_cache_get(cachep); 3020 ac = cpu_cache_get(cachep);
3009 if (!x && ac->avail == 0) /* no objects in sight? abort */ 3021 if (!x && ac->avail == 0) /* no objects in sight? abort */
3010 return NULL; 3022 return NULL;
3011 3023
3012 if (!ac->avail) /* objects refilled by interrupt? */ 3024 if (!ac->avail) /* objects refilled by interrupt? */
3013 goto retry; 3025 goto retry;
3014 } 3026 }
3015 ac->touched = 1; 3027 ac->touched = 1;
3016 return ac->entry[--ac->avail]; 3028 return ac->entry[--ac->avail];
3017 } 3029 }
3018 3030
3019 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 3031 static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
3020 gfp_t flags) 3032 gfp_t flags)
3021 { 3033 {
3022 might_sleep_if(flags & __GFP_WAIT); 3034 might_sleep_if(flags & __GFP_WAIT);
3023 #if DEBUG 3035 #if DEBUG
3024 kmem_flagcheck(cachep, flags); 3036 kmem_flagcheck(cachep, flags);
3025 #endif 3037 #endif
3026 } 3038 }
3027 3039
3028 #if DEBUG 3040 #if DEBUG
3029 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, 3041 static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3030 gfp_t flags, void *objp, void *caller) 3042 gfp_t flags, void *objp, void *caller)
3031 { 3043 {
3032 if (!objp) 3044 if (!objp)
3033 return objp; 3045 return objp;
3034 if (cachep->flags & SLAB_POISON) { 3046 if (cachep->flags & SLAB_POISON) {
3035 #ifdef CONFIG_DEBUG_PAGEALLOC 3047 #ifdef CONFIG_DEBUG_PAGEALLOC
3036 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 3048 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3037 kernel_map_pages(virt_to_page(objp), 3049 kernel_map_pages(virt_to_page(objp),
3038 cachep->buffer_size / PAGE_SIZE, 1); 3050 cachep->buffer_size / PAGE_SIZE, 1);
3039 else 3051 else
3040 check_poison_obj(cachep, objp); 3052 check_poison_obj(cachep, objp);
3041 #else 3053 #else
3042 check_poison_obj(cachep, objp); 3054 check_poison_obj(cachep, objp);
3043 #endif 3055 #endif
3044 poison_obj(cachep, objp, POISON_INUSE); 3056 poison_obj(cachep, objp, POISON_INUSE);
3045 } 3057 }
3046 if (cachep->flags & SLAB_STORE_USER) 3058 if (cachep->flags & SLAB_STORE_USER)
3047 *dbg_userword(cachep, objp) = caller; 3059 *dbg_userword(cachep, objp) = caller;
3048 3060
3049 if (cachep->flags & SLAB_RED_ZONE) { 3061 if (cachep->flags & SLAB_RED_ZONE) {
3050 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || 3062 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
3051 *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 3063 *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
3052 slab_error(cachep, "double free, or memory outside" 3064 slab_error(cachep, "double free, or memory outside"
3053 " object was overwritten"); 3065 " object was overwritten");
3054 printk(KERN_ERR 3066 printk(KERN_ERR
3055 "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", 3067 "%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
3056 objp, *dbg_redzone1(cachep, objp), 3068 objp, *dbg_redzone1(cachep, objp),
3057 *dbg_redzone2(cachep, objp)); 3069 *dbg_redzone2(cachep, objp));
3058 } 3070 }
3059 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 3071 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
3060 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 3072 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
3061 } 3073 }
3062 #ifdef CONFIG_DEBUG_SLAB_LEAK 3074 #ifdef CONFIG_DEBUG_SLAB_LEAK
3063 { 3075 {
3064 struct slab *slabp; 3076 struct slab *slabp;
3065 unsigned objnr; 3077 unsigned objnr;
3066 3078
3067 slabp = page_get_slab(virt_to_page(objp)); 3079 slabp = page_get_slab(virt_to_page(objp));
3068 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 3080 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
3069 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; 3081 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3070 } 3082 }
3071 #endif 3083 #endif
3072 objp += obj_offset(cachep); 3084 objp += obj_offset(cachep);
3073 if (cachep->ctor && cachep->flags & SLAB_POISON) { 3085 if (cachep->ctor && cachep->flags & SLAB_POISON) {
3074 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 3086 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
3075 3087
3076 if (!(flags & __GFP_WAIT)) 3088 if (!(flags & __GFP_WAIT))
3077 ctor_flags |= SLAB_CTOR_ATOMIC; 3089 ctor_flags |= SLAB_CTOR_ATOMIC;
3078 3090
3079 cachep->ctor(objp, cachep, ctor_flags); 3091 cachep->ctor(objp, cachep, ctor_flags);
3080 } 3092 }
3081 #if ARCH_SLAB_MINALIGN 3093 #if ARCH_SLAB_MINALIGN
3082 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3094 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3083 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3095 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3084 objp, ARCH_SLAB_MINALIGN); 3096 objp, ARCH_SLAB_MINALIGN);
3085 } 3097 }
3086 #endif 3098 #endif
3087 return objp; 3099 return objp;
3088 } 3100 }
3089 #else 3101 #else
3090 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) 3102 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
3091 #endif 3103 #endif
3092 3104
3093 #ifdef CONFIG_FAILSLAB 3105 #ifdef CONFIG_FAILSLAB
3094 3106
3095 static struct failslab_attr { 3107 static struct failslab_attr {
3096 3108
3097 struct fault_attr attr; 3109 struct fault_attr attr;
3098 3110
3099 u32 ignore_gfp_wait; 3111 u32 ignore_gfp_wait;
3100 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 3112 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3101 struct dentry *ignore_gfp_wait_file; 3113 struct dentry *ignore_gfp_wait_file;
3102 #endif 3114 #endif
3103 3115
3104 } failslab = { 3116 } failslab = {
3105 .attr = FAULT_ATTR_INITIALIZER, 3117 .attr = FAULT_ATTR_INITIALIZER,
3106 .ignore_gfp_wait = 1, 3118 .ignore_gfp_wait = 1,
3107 }; 3119 };
3108 3120
3109 static int __init setup_failslab(char *str) 3121 static int __init setup_failslab(char *str)
3110 { 3122 {
3111 return setup_fault_attr(&failslab.attr, str); 3123 return setup_fault_attr(&failslab.attr, str);
3112 } 3124 }
3113 __setup("failslab=", setup_failslab); 3125 __setup("failslab=", setup_failslab);
3114 3126
3115 static int should_failslab(struct kmem_cache *cachep, gfp_t flags) 3127 static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3116 { 3128 {
3117 if (cachep == &cache_cache) 3129 if (cachep == &cache_cache)
3118 return 0; 3130 return 0;
3119 if (flags & __GFP_NOFAIL) 3131 if (flags & __GFP_NOFAIL)
3120 return 0; 3132 return 0;
3121 if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT)) 3133 if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
3122 return 0; 3134 return 0;
3123 3135
3124 return should_fail(&failslab.attr, obj_size(cachep)); 3136 return should_fail(&failslab.attr, obj_size(cachep));
3125 } 3137 }
3126 3138
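setup_failslab() hands everything after "failslab=" to setup_fault_attr(), i.e. the generic fault-injection syntax. To the best of my reading of that framework the fields are <interval>,<probability>,<space>,<times>; treat the exact order and semantics as an assumption to be checked against Documentation/fault-injection:

/*
 * Illustration only (kernel command line, not C):
 *
 *	failslab=1,10,0,-1
 *
 * would, assuming the generic fault_attr format, check every slab
 * allocation (interval 1), fail about 10% of them (probability 10),
 * with no "space" budget and no limit on the number of failures
 * (times -1).  __GFP_WAIT allocations stay exempt by default via the
 * ignore-gfp-wait debugfs knob set up below.
 */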
3127 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 3139 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
3128 3140
3129 static int __init failslab_debugfs(void) 3141 static int __init failslab_debugfs(void)
3130 { 3142 {
3131 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 3143 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
3132 struct dentry *dir; 3144 struct dentry *dir;
3133 int err; 3145 int err;
3134 3146
3135 err = init_fault_attr_dentries(&failslab.attr, "failslab"); 3147 err = init_fault_attr_dentries(&failslab.attr, "failslab");
3136 if (err) 3148 if (err)
3137 return err; 3149 return err;
3138 dir = failslab.attr.dentries.dir; 3150 dir = failslab.attr.dentries.dir;
3139 3151
3140 failslab.ignore_gfp_wait_file = 3152 failslab.ignore_gfp_wait_file =
3141 debugfs_create_bool("ignore-gfp-wait", mode, dir, 3153 debugfs_create_bool("ignore-gfp-wait", mode, dir,
3142 &failslab.ignore_gfp_wait); 3154 &failslab.ignore_gfp_wait);
3143 3155
3144 if (!failslab.ignore_gfp_wait_file) { 3156 if (!failslab.ignore_gfp_wait_file) {
3145 err = -ENOMEM; 3157 err = -ENOMEM;
3146 debugfs_remove(failslab.ignore_gfp_wait_file); 3158 debugfs_remove(failslab.ignore_gfp_wait_file);
3147 cleanup_fault_attr_dentries(&failslab.attr); 3159 cleanup_fault_attr_dentries(&failslab.attr);
3148 } 3160 }
3149 3161
3150 return err; 3162 return err;
3151 } 3163 }
3152 3164
3153 late_initcall(failslab_debugfs); 3165 late_initcall(failslab_debugfs);
3154 3166
3155 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 3167 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
3156 3168
3157 #else /* CONFIG_FAILSLAB */ 3169 #else /* CONFIG_FAILSLAB */
3158 3170
3159 static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags) 3171 static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
3160 { 3172 {
3161 return 0; 3173 return 0;
3162 } 3174 }
3163 3175
3164 #endif /* CONFIG_FAILSLAB */ 3176 #endif /* CONFIG_FAILSLAB */
3165 3177
3166 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3178 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3167 { 3179 {
3168 void *objp; 3180 void *objp;
3169 struct array_cache *ac; 3181 struct array_cache *ac;
3170 3182
3171 check_irq_off(); 3183 check_irq_off();
3172 3184
3173 if (should_failslab(cachep, flags)) 3185 if (should_failslab(cachep, flags))
3174 return NULL; 3186 return NULL;
3175 3187
3176 ac = cpu_cache_get(cachep); 3188 ac = cpu_cache_get(cachep);
3177 if (likely(ac->avail)) { 3189 if (likely(ac->avail)) {
3178 STATS_INC_ALLOCHIT(cachep); 3190 STATS_INC_ALLOCHIT(cachep);
3179 ac->touched = 1; 3191 ac->touched = 1;
3180 objp = ac->entry[--ac->avail]; 3192 objp = ac->entry[--ac->avail];
3181 } else { 3193 } else {
3182 STATS_INC_ALLOCMISS(cachep); 3194 STATS_INC_ALLOCMISS(cachep);
3183 objp = cache_alloc_refill(cachep, flags); 3195 objp = cache_alloc_refill(cachep, flags);
3184 } 3196 }
3185 return objp; 3197 return objp;
3186 } 3198 }
3187 3199
3188 static __always_inline void *__cache_alloc(struct kmem_cache *cachep, 3200 static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
3189 gfp_t flags, void *caller) 3201 gfp_t flags, void *caller)
3190 { 3202 {
3191 unsigned long save_flags; 3203 unsigned long save_flags;
3192 void *objp = NULL; 3204 void *objp = NULL;
3193 3205
3194 cache_alloc_debugcheck_before(cachep, flags); 3206 cache_alloc_debugcheck_before(cachep, flags);
3195 3207
3196 local_irq_save(save_flags); 3208 local_irq_save(save_flags);
3197 3209
3198 if (unlikely(NUMA_BUILD && 3210 if (unlikely(NUMA_BUILD &&
3199 current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) 3211 current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3200 objp = alternate_node_alloc(cachep, flags); 3212 objp = alternate_node_alloc(cachep, flags);
3201 3213
3202 if (!objp) 3214 if (!objp)
3203 objp = ____cache_alloc(cachep, flags); 3215 objp = ____cache_alloc(cachep, flags);
3204 /* 3216 /*
3205 * We may just have run out of memory on the local node. 3217 * We may just have run out of memory on the local node.
3206 * ____cache_alloc_node() knows how to locate memory on other nodes 3218 * ____cache_alloc_node() knows how to locate memory on other nodes
3207 */ 3219 */
3208 if (NUMA_BUILD && !objp) 3220 if (NUMA_BUILD && !objp)
3209 objp = ____cache_alloc_node(cachep, flags, numa_node_id()); 3221 objp = ____cache_alloc_node(cachep, flags, numa_node_id());
3210 local_irq_restore(save_flags); 3222 local_irq_restore(save_flags);
3211 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 3223 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
3212 caller); 3224 caller);
3213 prefetchw(objp); 3225 prefetchw(objp);
3214 return objp; 3226 return objp;
3215 } 3227 }
3216 3228
3217 #ifdef CONFIG_NUMA 3229 #ifdef CONFIG_NUMA
3218 /* 3230 /*
3219 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. 3231 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
3220 * 3232 *
3221 * If we are in_interrupt, then process context, including cpusets and 3233 * If we are in_interrupt, then process context, including cpusets and
3222 * mempolicy, may not apply and should not be used for allocation policy. 3234 * mempolicy, may not apply and should not be used for allocation policy.
3223 */ 3235 */
3224 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) 3236 static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3225 { 3237 {
3226 int nid_alloc, nid_here; 3238 int nid_alloc, nid_here;
3227 3239
3228 if (in_interrupt() || (flags & __GFP_THISNODE)) 3240 if (in_interrupt() || (flags & __GFP_THISNODE))
3229 return NULL; 3241 return NULL;
3230 nid_alloc = nid_here = numa_node_id(); 3242 nid_alloc = nid_here = numa_node_id();
3231 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3243 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3232 nid_alloc = cpuset_mem_spread_node(); 3244 nid_alloc = cpuset_mem_spread_node();
3233 else if (current->mempolicy) 3245 else if (current->mempolicy)
3234 nid_alloc = slab_node(current->mempolicy); 3246 nid_alloc = slab_node(current->mempolicy);
3235 if (nid_alloc != nid_here) 3247 if (nid_alloc != nid_here)
3236 return ____cache_alloc_node(cachep, flags, nid_alloc); 3248 return ____cache_alloc_node(cachep, flags, nid_alloc);
3237 return NULL; 3249 return NULL;
3238 } 3250 }
3239 3251
3240 /* 3252 /*
3241 * Fallback function if there was no memory available and no objects on a 3253 * Fallback function if there was no memory available and no objects on a
3242 * certain node and fall back is permitted. First we scan all the 3254 * certain node and fall back is permitted. First we scan all the
3243 * available nodelists for available objects. If that fails then we 3255 * available nodelists for available objects. If that fails then we
3244 * perform an allocation without specifying a node. This allows the page 3256 * perform an allocation without specifying a node. This allows the page
3245 * allocator to do its reclaim / fallback magic. We then insert the 3257 * allocator to do its reclaim / fallback magic. We then insert the
3246 * slab into the proper nodelist and then allocate from it. 3258 * slab into the proper nodelist and then allocate from it.
3247 */ 3259 */
3248 void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) 3260 void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3249 { 3261 {
3250 struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy)) 3262 struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
3251 ->node_zonelists[gfp_zone(flags)]; 3263 ->node_zonelists[gfp_zone(flags)];
3252 struct zone **z; 3264 struct zone **z;
3253 void *obj = NULL; 3265 void *obj = NULL;
3254 int nid; 3266 int nid;
3255 gfp_t local_flags = (flags & GFP_LEVEL_MASK); 3267 gfp_t local_flags = (flags & GFP_LEVEL_MASK);
3256 3268
3257 retry: 3269 retry:
3258 /* 3270 /*
3259 * Look through allowed nodes for objects available 3271 * Look through allowed nodes for objects available
3260 * from existing per node queues. 3272 * from existing per node queues.
3261 */ 3273 */
3262 for (z = zonelist->zones; *z && !obj; z++) { 3274 for (z = zonelist->zones; *z && !obj; z++) {
3263 nid = zone_to_nid(*z); 3275 nid = zone_to_nid(*z);
3264 3276
3265 if (cpuset_zone_allowed_hardwall(*z, flags) && 3277 if (cpuset_zone_allowed_hardwall(*z, flags) &&
3266 cache->nodelists[nid] && 3278 cache->nodelists[nid] &&
3267 cache->nodelists[nid]->free_objects) 3279 cache->nodelists[nid]->free_objects)
3268 obj = ____cache_alloc_node(cache, 3280 obj = ____cache_alloc_node(cache,
3269 flags | GFP_THISNODE, nid); 3281 flags | GFP_THISNODE, nid);
3270 } 3282 }
3271 3283
3272 if (!obj) { 3284 if (!obj) {
3273 /* 3285 /*
3274 * This allocation will be performed within the constraints 3286 * This allocation will be performed within the constraints
3275 * of the current cpuset / memory policy requirements. 3287 * of the current cpuset / memory policy requirements.
3276 * We may trigger various forms of reclaim on the allowed 3288 * We may trigger various forms of reclaim on the allowed
3277 * set and go into memory reserves if necessary. 3289 * set and go into memory reserves if necessary.
3278 */ 3290 */
3279 if (local_flags & __GFP_WAIT) 3291 if (local_flags & __GFP_WAIT)
3280 local_irq_enable(); 3292 local_irq_enable();
3281 kmem_flagcheck(cache, flags); 3293 kmem_flagcheck(cache, flags);
3282 obj = kmem_getpages(cache, flags, -1); 3294 obj = kmem_getpages(cache, flags, -1);
3283 if (local_flags & __GFP_WAIT) 3295 if (local_flags & __GFP_WAIT)
3284 local_irq_disable(); 3296 local_irq_disable();
3285 if (obj) { 3297 if (obj) {
3286 /* 3298 /*
3287 * Insert into the appropriate per node queues 3299 * Insert into the appropriate per node queues
3288 */ 3300 */
3289 nid = page_to_nid(virt_to_page(obj)); 3301 nid = page_to_nid(virt_to_page(obj));
3290 if (cache_grow(cache, flags, nid, obj)) { 3302 if (cache_grow(cache, flags, nid, obj)) {
3291 obj = ____cache_alloc_node(cache, 3303 obj = ____cache_alloc_node(cache,
3292 flags | GFP_THISNODE, nid); 3304 flags | GFP_THISNODE, nid);
3293 if (!obj) 3305 if (!obj)
3294 /* 3306 /*
3295 * Another processor may allocate the 3307 * Another processor may allocate the
3296 * objects in the slab since we are 3308 * objects in the slab since we are
3297 * not holding any locks. 3309 * not holding any locks.
3298 */ 3310 */
3299 goto retry; 3311 goto retry;
3300 } else { 3312 } else {
3301 kmem_freepages(cache, obj); 3313 kmem_freepages(cache, obj);
3302 obj = NULL; 3314 obj = NULL;
3303 } 3315 }
3304 } 3316 }
3305 } 3317 }
3306 return obj; 3318 return obj;
3307 } 3319 }
3308 3320
3309 /* 3321 /*
3310 * An interface to enable slab creation on nodeid 3322 * An interface to enable slab creation on nodeid
3311 */ 3323 */
3312 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3324 static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3313 int nodeid) 3325 int nodeid)
3314 { 3326 {
3315 struct list_head *entry; 3327 struct list_head *entry;
3316 struct slab *slabp; 3328 struct slab *slabp;
3317 struct kmem_list3 *l3; 3329 struct kmem_list3 *l3;
3318 void *obj; 3330 void *obj;
3319 int x; 3331 int x;
3320 3332
3321 l3 = cachep->nodelists[nodeid]; 3333 l3 = cachep->nodelists[nodeid];
3322 BUG_ON(!l3); 3334 BUG_ON(!l3);
3323 3335
3324 retry: 3336 retry:
3325 check_irq_off(); 3337 check_irq_off();
3326 spin_lock(&l3->list_lock); 3338 spin_lock(&l3->list_lock);
3327 entry = l3->slabs_partial.next; 3339 entry = l3->slabs_partial.next;
3328 if (entry == &l3->slabs_partial) { 3340 if (entry == &l3->slabs_partial) {
3329 l3->free_touched = 1; 3341 l3->free_touched = 1;
3330 entry = l3->slabs_free.next; 3342 entry = l3->slabs_free.next;
3331 if (entry == &l3->slabs_free) 3343 if (entry == &l3->slabs_free)
3332 goto must_grow; 3344 goto must_grow;
3333 } 3345 }
3334 3346
3335 slabp = list_entry(entry, struct slab, list); 3347 slabp = list_entry(entry, struct slab, list);
3336 check_spinlock_acquired_node(cachep, nodeid); 3348 check_spinlock_acquired_node(cachep, nodeid);
3337 check_slabp(cachep, slabp); 3349 check_slabp(cachep, slabp);
3338 3350
3339 STATS_INC_NODEALLOCS(cachep); 3351 STATS_INC_NODEALLOCS(cachep);
3340 STATS_INC_ACTIVE(cachep); 3352 STATS_INC_ACTIVE(cachep);
3341 STATS_SET_HIGH(cachep); 3353 STATS_SET_HIGH(cachep);
3342 3354
3343 BUG_ON(slabp->inuse == cachep->num); 3355 BUG_ON(slabp->inuse == cachep->num);
3344 3356
3345 obj = slab_get_obj(cachep, slabp, nodeid); 3357 obj = slab_get_obj(cachep, slabp, nodeid);
3346 check_slabp(cachep, slabp); 3358 check_slabp(cachep, slabp);
3347 l3->free_objects--; 3359 l3->free_objects--;
3348 /* move slabp to correct slabp list: */ 3360 /* move slabp to correct slabp list: */
3349 list_del(&slabp->list); 3361 list_del(&slabp->list);
3350 3362
3351 if (slabp->free == BUFCTL_END) 3363 if (slabp->free == BUFCTL_END)
3352 list_add(&slabp->list, &l3->slabs_full); 3364 list_add(&slabp->list, &l3->slabs_full);
3353 else 3365 else
3354 list_add(&slabp->list, &l3->slabs_partial); 3366 list_add(&slabp->list, &l3->slabs_partial);
3355 3367
3356 spin_unlock(&l3->list_lock); 3368 spin_unlock(&l3->list_lock);
3357 goto done; 3369 goto done;
3358 3370
3359 must_grow: 3371 must_grow:
3360 spin_unlock(&l3->list_lock); 3372 spin_unlock(&l3->list_lock);
3361 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); 3373 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
3362 if (x) 3374 if (x)
3363 goto retry; 3375 goto retry;
3364 3376
3365 if (!(flags & __GFP_THISNODE)) 3377 if (!(flags & __GFP_THISNODE))
3366 /* Unable to grow the cache. Fall back to other nodes. */ 3378 /* Unable to grow the cache. Fall back to other nodes. */
3367 return fallback_alloc(cachep, flags); 3379 return fallback_alloc(cachep, flags);
3368 3380
3369 return NULL; 3381 return NULL;
3370 3382
3371 done: 3383 done:
3372 return obj; 3384 return obj;
3373 } 3385 }
3374 #endif 3386 #endif
3375 3387
3376 /* 3388 /*
3377 * Caller needs to acquire the correct kmem_list3's list_lock 3389 * Caller needs to acquire the correct kmem_list3's list_lock
3378 */ 3390 */
3379 static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, 3391 static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3380 int node) 3392 int node)
3381 { 3393 {
3382 int i; 3394 int i;
3383 struct kmem_list3 *l3; 3395 struct kmem_list3 *l3;
3384 3396
3385 for (i = 0; i < nr_objects; i++) { 3397 for (i = 0; i < nr_objects; i++) {
3386 void *objp = objpp[i]; 3398 void *objp = objpp[i];
3387 struct slab *slabp; 3399 struct slab *slabp;
3388 3400
3389 slabp = virt_to_slab(objp); 3401 slabp = virt_to_slab(objp);
3390 l3 = cachep->nodelists[node]; 3402 l3 = cachep->nodelists[node];
3391 list_del(&slabp->list); 3403 list_del(&slabp->list);
3392 check_spinlock_acquired_node(cachep, node); 3404 check_spinlock_acquired_node(cachep, node);
3393 check_slabp(cachep, slabp); 3405 check_slabp(cachep, slabp);
3394 slab_put_obj(cachep, slabp, objp, node); 3406 slab_put_obj(cachep, slabp, objp, node);
3395 STATS_DEC_ACTIVE(cachep); 3407 STATS_DEC_ACTIVE(cachep);
3396 l3->free_objects++; 3408 l3->free_objects++;
3397 check_slabp(cachep, slabp); 3409 check_slabp(cachep, slabp);
3398 3410
3399 /* fixup slab chains */ 3411 /* fixup slab chains */
3400 if (slabp->inuse == 0) { 3412 if (slabp->inuse == 0) {
3401 if (l3->free_objects > l3->free_limit) { 3413 if (l3->free_objects > l3->free_limit) {
3402 l3->free_objects -= cachep->num; 3414 l3->free_objects -= cachep->num;
3403 /* No need to drop any previously held 3415 /* No need to drop any previously held
3404 * lock here, even if we have an off-slab slab 3416 * lock here, even if we have an off-slab slab
3405 * descriptor it is guaranteed to come from 3417 * descriptor it is guaranteed to come from
3406 * a different cache, refer to comments before 3418 * a different cache, refer to comments before
3407 * alloc_slabmgmt. 3419 * alloc_slabmgmt.
3408 */ 3420 */
3409 slab_destroy(cachep, slabp); 3421 slab_destroy(cachep, slabp);
3410 } else { 3422 } else {
3411 list_add(&slabp->list, &l3->slabs_free); 3423 list_add(&slabp->list, &l3->slabs_free);
3412 } 3424 }
3413 } else { 3425 } else {
3414 /* Unconditionally move a slab to the end of the 3426 /* Unconditionally move a slab to the end of the
3415 * partial list on free - this gives the other 3427 * partial list on free - this gives the other
3416 * objects maximum time to be freed, too. 3428 * objects maximum time to be freed, too.
3417 */ 3429 */
3418 list_add_tail(&slabp->list, &l3->slabs_partial); 3430 list_add_tail(&slabp->list, &l3->slabs_partial);
3419 } 3431 }
3420 } 3432 }
3421 } 3433 }
3422 3434
3423 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) 3435 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
3424 { 3436 {
3425 int batchcount; 3437 int batchcount;
3426 struct kmem_list3 *l3; 3438 struct kmem_list3 *l3;
3427 int node = numa_node_id(); 3439 int node = numa_node_id();
3428 3440
3429 batchcount = ac->batchcount; 3441 batchcount = ac->batchcount;
3430 #if DEBUG 3442 #if DEBUG
3431 BUG_ON(!batchcount || batchcount > ac->avail); 3443 BUG_ON(!batchcount || batchcount > ac->avail);
3432 #endif 3444 #endif
3433 check_irq_off(); 3445 check_irq_off();
3434 l3 = cachep->nodelists[node]; 3446 l3 = cachep->nodelists[node];
3435 spin_lock(&l3->list_lock); 3447 spin_lock(&l3->list_lock);
3436 if (l3->shared) { 3448 if (l3->shared) {
3437 struct array_cache *shared_array = l3->shared; 3449 struct array_cache *shared_array = l3->shared;
3438 int max = shared_array->limit - shared_array->avail; 3450 int max = shared_array->limit - shared_array->avail;
3439 if (max) { 3451 if (max) {
3440 if (batchcount > max) 3452 if (batchcount > max)
3441 batchcount = max; 3453 batchcount = max;
3442 memcpy(&(shared_array->entry[shared_array->avail]), 3454 memcpy(&(shared_array->entry[shared_array->avail]),
3443 ac->entry, sizeof(void *) * batchcount); 3455 ac->entry, sizeof(void *) * batchcount);
3444 shared_array->avail += batchcount; 3456 shared_array->avail += batchcount;
3445 goto free_done; 3457 goto free_done;
3446 } 3458 }
3447 } 3459 }
3448 3460
3449 free_block(cachep, ac->entry, batchcount, node); 3461 free_block(cachep, ac->entry, batchcount, node);
3450 free_done: 3462 free_done:
3451 #if STATS 3463 #if STATS
3452 { 3464 {
3453 int i = 0; 3465 int i = 0;
3454 struct list_head *p; 3466 struct list_head *p;
3455 3467
3456 p = l3->slabs_free.next; 3468 p = l3->slabs_free.next;
3457 while (p != &(l3->slabs_free)) { 3469 while (p != &(l3->slabs_free)) {
3458 struct slab *slabp; 3470 struct slab *slabp;
3459 3471
3460 slabp = list_entry(p, struct slab, list); 3472 slabp = list_entry(p, struct slab, list);
3461 BUG_ON(slabp->inuse); 3473 BUG_ON(slabp->inuse);
3462 3474
3463 i++; 3475 i++;
3464 p = p->next; 3476 p = p->next;
3465 } 3477 }
3466 STATS_SET_FREEABLE(cachep, i); 3478 STATS_SET_FREEABLE(cachep, i);
3467 } 3479 }
3468 #endif 3480 #endif
3469 spin_unlock(&l3->list_lock); 3481 spin_unlock(&l3->list_lock);
3470 ac->avail -= batchcount; 3482 ac->avail -= batchcount;
3471 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); 3483 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
3472 } 3484 }
3473 3485
3474 /* 3486 /*
3475 * Release an obj back to its cache. If the obj has a constructed state, it must 3487 * Release an obj back to its cache. If the obj has a constructed state, it must
3476 * be in this state _before_ it is released. Called with disabled ints. 3488 * be in this state _before_ it is released. Called with disabled ints.
3477 */ 3489 */
3478 static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3490 static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3479 { 3491 {
3480 struct array_cache *ac = cpu_cache_get(cachep); 3492 struct array_cache *ac = cpu_cache_get(cachep);
3481 3493
3482 check_irq_off(); 3494 check_irq_off();
3483 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3495 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3484 3496
3485 if (cache_free_alien(cachep, objp)) 3497 if (cache_free_alien(cachep, objp))
3486 return; 3498 return;
3487 3499
3488 if (likely(ac->avail < ac->limit)) { 3500 if (likely(ac->avail < ac->limit)) {
3489 STATS_INC_FREEHIT(cachep); 3501 STATS_INC_FREEHIT(cachep);
3490 ac->entry[ac->avail++] = objp; 3502 ac->entry[ac->avail++] = objp;
3491 return; 3503 return;
3492 } else { 3504 } else {
3493 STATS_INC_FREEMISS(cachep); 3505 STATS_INC_FREEMISS(cachep);
3494 cache_flusharray(cachep, ac); 3506 cache_flusharray(cachep, ac);
3495 ac->entry[ac->avail++] = objp; 3507 ac->entry[ac->avail++] = objp;
3496 } 3508 }
3497 } 3509 }
3498 3510
3499 /** 3511 /**
3500 * kmem_cache_alloc - Allocate an object 3512 * kmem_cache_alloc - Allocate an object
3501 * @cachep: The cache to allocate from. 3513 * @cachep: The cache to allocate from.
3502 * @flags: See kmalloc(). 3514 * @flags: See kmalloc().
3503 * 3515 *
3504 * Allocate an object from this cache. The flags are only relevant 3516 * Allocate an object from this cache. The flags are only relevant
3505 * if the cache has no available objects. 3517 * if the cache has no available objects.
3506 */ 3518 */
3507 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3519 void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3508 { 3520 {
3509 return __cache_alloc(cachep, flags, __builtin_return_address(0)); 3521 return __cache_alloc(cachep, flags, __builtin_return_address(0));
3510 } 3522 }
3511 EXPORT_SYMBOL(kmem_cache_alloc); 3523 EXPORT_SYMBOL(kmem_cache_alloc);
3512 3524
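A caller's-eye sketch of the API documented above; struct foo and foo_cachep are illustrative names, and the kmem_cache_create() signature with separate ctor/dtor arguments is the one this kernel generation uses:

/* Sketch of typical cache usage; all names here are made up. */
struct foo {
	int a, b;
};

static struct kmem_cache *foo_cachep;

static int __init foo_init(void)
{
	foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
				       SLAB_HWCACHE_ALIGN, NULL, NULL);
	return foo_cachep ? 0 : -ENOMEM;
}

static struct foo *foo_get(void)
{
	return kmem_cache_alloc(foo_cachep, GFP_KERNEL);	/* may sleep */
}

static void foo_put(struct foo *f)
{
	kmem_cache_free(foo_cachep, f);	/* must go back to the same cache */
}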
3513 /** 3525 /**
3514 * kmem_cache_zalloc - Allocate an object. The memory is set to zero. 3526 * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
3515 * @cache: The cache to allocate from. 3527 * @cache: The cache to allocate from.
3516 * @flags: See kmalloc(). 3528 * @flags: See kmalloc().
3517 * 3529 *
3518 * Allocate an object from this cache and set the allocated memory to zero. 3530 * Allocate an object from this cache and set the allocated memory to zero.
3519 * The flags are only relevant if the cache has no available objects. 3531 * The flags are only relevant if the cache has no available objects.
3520 */ 3532 */
3521 void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags) 3533 void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
3522 { 3534 {
3523 void *ret = __cache_alloc(cache, flags, __builtin_return_address(0)); 3535 void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
3524 if (ret) 3536 if (ret)
3525 memset(ret, 0, obj_size(cache)); 3537 memset(ret, 0, obj_size(cache));
3526 return ret; 3538 return ret;
3527 } 3539 }
3528 EXPORT_SYMBOL(kmem_cache_zalloc); 3540 EXPORT_SYMBOL(kmem_cache_zalloc);
3529 3541
3530 /** 3542 /**
3531 * kmem_ptr_validate - check if an untrusted pointer might 3543 * kmem_ptr_validate - check if an untrusted pointer might
3532 * be a slab entry. 3544 * be a slab entry.
3533 * @cachep: the cache we're checking against 3545 * @cachep: the cache we're checking against
3534 * @ptr: pointer to validate 3546 * @ptr: pointer to validate
3535 * 3547 *
3536 * This verifies that the untrusted pointer looks sane: 3548 * This verifies that the untrusted pointer looks sane:
3537 * it is _not_ a guarantee that the pointer is actually 3549 * it is _not_ a guarantee that the pointer is actually
3538 * part of the slab cache in question, but it at least 3550 * part of the slab cache in question, but it at least
3539 * validates that the pointer can be dereferenced and 3551 * validates that the pointer can be dereferenced and
3540 * looks half-way sane. 3552 * looks half-way sane.
3541 * 3553 *
3542 * Currently only used for dentry validation. 3554 * Currently only used for dentry validation.
3543 */ 3555 */
3544 int fastcall kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) 3556 int fastcall kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3545 { 3557 {
3546 unsigned long addr = (unsigned long)ptr; 3558 unsigned long addr = (unsigned long)ptr;
3547 unsigned long min_addr = PAGE_OFFSET; 3559 unsigned long min_addr = PAGE_OFFSET;
3548 unsigned long align_mask = BYTES_PER_WORD - 1; 3560 unsigned long align_mask = BYTES_PER_WORD - 1;
3549 unsigned long size = cachep->buffer_size; 3561 unsigned long size = cachep->buffer_size;
3550 struct page *page; 3562 struct page *page;
3551 3563
3552 if (unlikely(addr < min_addr)) 3564 if (unlikely(addr < min_addr))
3553 goto out; 3565 goto out;
3554 if (unlikely(addr > (unsigned long)high_memory - size)) 3566 if (unlikely(addr > (unsigned long)high_memory - size))
3555 goto out; 3567 goto out;
3556 if (unlikely(addr & align_mask)) 3568 if (unlikely(addr & align_mask))
3557 goto out; 3569 goto out;
3558 if (unlikely(!kern_addr_valid(addr))) 3570 if (unlikely(!kern_addr_valid(addr)))
3559 goto out; 3571 goto out;
3560 if (unlikely(!kern_addr_valid(addr + size - 1))) 3572 if (unlikely(!kern_addr_valid(addr + size - 1)))
3561 goto out; 3573 goto out;
3562 page = virt_to_page(ptr); 3574 page = virt_to_page(ptr);
3563 if (unlikely(!PageSlab(page))) 3575 if (unlikely(!PageSlab(page)))
3564 goto out; 3576 goto out;
3565 if (unlikely(page_get_cache(page) != cachep)) 3577 if (unlikely(page_get_cache(page) != cachep))
3566 goto out; 3578 goto out;
3567 return 1; 3579 return 1;
3568 out: 3580 out:
3569 return 0; 3581 return 0;
3570 } 3582 }
3571 3583
3572 #ifdef CONFIG_NUMA 3584 #ifdef CONFIG_NUMA
3573 /** 3585 /**
3574 * kmem_cache_alloc_node - Allocate an object on the specified node 3586 * kmem_cache_alloc_node - Allocate an object on the specified node
3575 * @cachep: The cache to allocate from. 3587 * @cachep: The cache to allocate from.
3576 * @flags: See kmalloc(). 3588 * @flags: See kmalloc().
3577 * @nodeid: node number of the target node. 3589 * @nodeid: node number of the target node.
3578 * 3590 *
3579 * Identical to kmem_cache_alloc but it will allocate memory on the given 3591 * Identical to kmem_cache_alloc but it will allocate memory on the given
3580 * node, which can improve the performance for cpu bound structures. 3592 * node, which can improve the performance for cpu bound structures.
3581 * 3593 *
3582 * Fallback to other node is possible if __GFP_THISNODE is not set. 3594 * Fallback to other node is possible if __GFP_THISNODE is not set.
3583 */ 3595 */
3584 static __always_inline void * 3596 static __always_inline void *
3585 __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3597 __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
3586 int nodeid, void *caller) 3598 int nodeid, void *caller)
3587 { 3599 {
3588 unsigned long save_flags; 3600 unsigned long save_flags;
3589 void *ptr = NULL; 3601 void *ptr = NULL;
3590 3602
3591 cache_alloc_debugcheck_before(cachep, flags); 3603 cache_alloc_debugcheck_before(cachep, flags);
3592 local_irq_save(save_flags); 3604 local_irq_save(save_flags);
3593 3605
3594 if (unlikely(nodeid == -1)) 3606 if (unlikely(nodeid == -1))
3595 nodeid = numa_node_id(); 3607 nodeid = numa_node_id();
3596 3608
3597 if (likely(cachep->nodelists[nodeid])) { 3609 if (likely(cachep->nodelists[nodeid])) {
3598 if (nodeid == numa_node_id()) { 3610 if (nodeid == numa_node_id()) {
3599 /* 3611 /*
3600 * Use the locally cached objects if possible. 3612 * Use the locally cached objects if possible.
3601 * However ____cache_alloc does not allow fallback 3613 * However ____cache_alloc does not allow fallback
3602 * to other nodes. It may fail while we still have 3614 * to other nodes. It may fail while we still have
3603 * objects on other nodes available. 3615 * objects on other nodes available.
3604 */ 3616 */
3605 ptr = ____cache_alloc(cachep, flags); 3617 ptr = ____cache_alloc(cachep, flags);
3606 } 3618 }
3607 if (!ptr) { 3619 if (!ptr) {
3608 /* ____cache_alloc_node can fall back to other nodes */ 3620 /* ____cache_alloc_node can fall back to other nodes */
3609 ptr = ____cache_alloc_node(cachep, flags, nodeid); 3621 ptr = ____cache_alloc_node(cachep, flags, nodeid);
3610 } 3622 }
3611 } else { 3623 } else {
3612 /* Node not bootstrapped yet */ 3624 /* Node not bootstrapped yet */
3613 if (!(flags & __GFP_THISNODE)) 3625 if (!(flags & __GFP_THISNODE))
3614 ptr = fallback_alloc(cachep, flags); 3626 ptr = fallback_alloc(cachep, flags);
3615 } 3627 }
3616 3628
3617 local_irq_restore(save_flags); 3629 local_irq_restore(save_flags);
3618 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3630 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3619 3631
3620 return ptr; 3632 return ptr;
3621 } 3633 }
3622 3634
3623 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3635 void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3624 { 3636 {
3625 return __cache_alloc_node(cachep, flags, nodeid, 3637 return __cache_alloc_node(cachep, flags, nodeid,
3626 __builtin_return_address(0)); 3638 __builtin_return_address(0));
3627 } 3639 }
3628 EXPORT_SYMBOL(kmem_cache_alloc_node); 3640 EXPORT_SYMBOL(kmem_cache_alloc_node);
3629 3641
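A short sketch of the node-aware variant documented above, allocating one object per online node close to the CPUs that will touch it; foo_cachep is the illustrative cache from the earlier sketch:

/* Sketch only: per-node allocation with kmem_cache_alloc_node(). */
static void *foo_per_node[MAX_NUMNODES];

static int foo_alloc_per_node(void)
{
	int node;

	for_each_online_node(node) {
		foo_per_node[node] = kmem_cache_alloc_node(foo_cachep,
							   GFP_KERNEL, node);
		if (!foo_per_node[node])
			return -ENOMEM;
	}
	return 0;
}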
3630 static __always_inline void * 3642 static __always_inline void *
3631 __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) 3643 __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3632 { 3644 {
3633 struct kmem_cache *cachep; 3645 struct kmem_cache *cachep;
3634 3646
3635 cachep = kmem_find_general_cachep(size, flags); 3647 cachep = kmem_find_general_cachep(size, flags);
3636 if (unlikely(cachep == NULL)) 3648 if (unlikely(cachep == NULL))
3637 return NULL; 3649 return NULL;
3638 return kmem_cache_alloc_node(cachep, flags, node); 3650 return kmem_cache_alloc_node(cachep, flags, node);
3639 } 3651 }
3640 3652
3641 #ifdef CONFIG_DEBUG_SLAB 3653 #ifdef CONFIG_DEBUG_SLAB
3642 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3654 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3643 { 3655 {
3644 return __do_kmalloc_node(size, flags, node, 3656 return __do_kmalloc_node(size, flags, node,
3645 __builtin_return_address(0)); 3657 __builtin_return_address(0));
3646 } 3658 }
3647 EXPORT_SYMBOL(__kmalloc_node); 3659 EXPORT_SYMBOL(__kmalloc_node);
3648 3660
3649 void *__kmalloc_node_track_caller(size_t size, gfp_t flags, 3661 void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
3650 int node, void *caller) 3662 int node, void *caller)
3651 { 3663 {
3652 return __do_kmalloc_node(size, flags, node, caller); 3664 return __do_kmalloc_node(size, flags, node, caller);
3653 } 3665 }
3654 EXPORT_SYMBOL(__kmalloc_node_track_caller); 3666 EXPORT_SYMBOL(__kmalloc_node_track_caller);
3655 #else 3667 #else
3656 void *__kmalloc_node(size_t size, gfp_t flags, int node) 3668 void *__kmalloc_node(size_t size, gfp_t flags, int node)
3657 { 3669 {
3658 return __do_kmalloc_node(size, flags, node, NULL); 3670 return __do_kmalloc_node(size, flags, node, NULL);
3659 } 3671 }
3660 EXPORT_SYMBOL(__kmalloc_node); 3672 EXPORT_SYMBOL(__kmalloc_node);
3661 #endif /* CONFIG_DEBUG_SLAB */ 3673 #endif /* CONFIG_DEBUG_SLAB */
3662 #endif /* CONFIG_NUMA */ 3674 #endif /* CONFIG_NUMA */
3663 3675
3664 /** 3676 /**
3665 * __do_kmalloc - allocate memory 3677 * __do_kmalloc - allocate memory
3666 * @size: how many bytes of memory are required. 3678 * @size: how many bytes of memory are required.
3667 * @flags: the type of memory to allocate (see kmalloc). 3679 * @flags: the type of memory to allocate (see kmalloc).
3668 * @caller: function caller for debug tracking of the caller 3680 * @caller: function caller for debug tracking of the caller
3669 */ 3681 */
3670 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3682 static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3671 void *caller) 3683 void *caller)
3672 { 3684 {
3673 struct kmem_cache *cachep; 3685 struct kmem_cache *cachep;
3674 3686
3675 /* If you want to save a few bytes of .text space: replace 3687 /* If you want to save a few bytes of .text space: replace
3676 * __ with kmem_. 3688 * __ with kmem_.
3677 * Then kmalloc uses the uninlined functions instead of the inline 3689 * Then kmalloc uses the uninlined functions instead of the inline
3678 * functions. 3690 * functions.
3679 */ 3691 */
3680 cachep = __find_general_cachep(size, flags); 3692 cachep = __find_general_cachep(size, flags);
3681 if (unlikely(cachep == NULL)) 3693 if (unlikely(cachep == NULL))
3682 return NULL; 3694 return NULL;
3683 return __cache_alloc(cachep, flags, caller); 3695 return __cache_alloc(cachep, flags, caller);
3684 } 3696 }
3685 3697
3686 3698
3687 #ifdef CONFIG_DEBUG_SLAB 3699 #ifdef CONFIG_DEBUG_SLAB
3688 void *__kmalloc(size_t size, gfp_t flags) 3700 void *__kmalloc(size_t size, gfp_t flags)
3689 { 3701 {
3690 return __do_kmalloc(size, flags, __builtin_return_address(0)); 3702 return __do_kmalloc(size, flags, __builtin_return_address(0));
3691 } 3703 }
3692 EXPORT_SYMBOL(__kmalloc); 3704 EXPORT_SYMBOL(__kmalloc);
3693 3705
3694 void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) 3706 void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3695 { 3707 {
3696 return __do_kmalloc(size, flags, caller); 3708 return __do_kmalloc(size, flags, caller);
3697 } 3709 }
3698 EXPORT_SYMBOL(__kmalloc_track_caller); 3710 EXPORT_SYMBOL(__kmalloc_track_caller);
3699 3711
3700 #else 3712 #else
3701 void *__kmalloc(size_t size, gfp_t flags) 3713 void *__kmalloc(size_t size, gfp_t flags)
3702 { 3714 {
3703 return __do_kmalloc(size, flags, NULL); 3715 return __do_kmalloc(size, flags, NULL);
3704 } 3716 }
3705 EXPORT_SYMBOL(__kmalloc); 3717 EXPORT_SYMBOL(__kmalloc);
3706 #endif 3718 #endif
3707 3719
3708 /** 3720 /**
3709 * kmem_cache_free - Deallocate an object 3721 * kmem_cache_free - Deallocate an object
3710 * @cachep: The cache the allocation was from. 3722 * @cachep: The cache the allocation was from.
3711 * @objp: The previously allocated object. 3723 * @objp: The previously allocated object.
3712 * 3724 *
3713 * Free an object which was previously allocated from this 3725 * Free an object which was previously allocated from this
3714 * cache. 3726 * cache.
3715 */ 3727 */
3716 void kmem_cache_free(struct kmem_cache *cachep, void *objp) 3728 void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3717 { 3729 {
3718 unsigned long flags; 3730 unsigned long flags;
3719 3731
3720 BUG_ON(virt_to_cache(objp) != cachep); 3732 BUG_ON(virt_to_cache(objp) != cachep);
3721 3733
3722 local_irq_save(flags); 3734 local_irq_save(flags);
3723 __cache_free(cachep, objp); 3735 __cache_free(cachep, objp);
3724 local_irq_restore(flags); 3736 local_irq_restore(flags);
3725 } 3737 }
3726 EXPORT_SYMBOL(kmem_cache_free); 3738 EXPORT_SYMBOL(kmem_cache_free);
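Editor's note: kmem_cache_alloc()/kmem_cache_free() above form the dedicated-cache half of the allocator API. A minimal usage sketch under 2.6.20-era assumptions follows; struct foo, the cache name and the flags are illustrative and not part of this patch.

    #include <linux/init.h>
    #include <linux/errno.h>
    #include <linux/slab.h>

    /* Illustrative object type and cache; not part of the patch. */
    struct foo {
            int a;
            int b;
    };

    static struct kmem_cache *foo_cache;

    static int __init foo_init(void)
    {
            /* 2.6.20-era signature: name, size, align, flags, ctor, dtor */
            foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
                                          0, SLAB_HWCACHE_ALIGN, NULL, NULL);
            return foo_cache ? 0 : -ENOMEM;
    }

    static void foo_example(void)
    {
            struct foo *f = kmem_cache_alloc(foo_cache, GFP_KERNEL);

            if (!f)
                    return;
            f->a = 1;
            f->b = 2;
            /* Must go back to the cache it came from (see the BUG_ON above). */
            kmem_cache_free(foo_cache, f);
    }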
3727 3739
3728 /** 3740 /**
3729 * kfree - free previously allocated memory 3741 * kfree - free previously allocated memory
3730 * @objp: pointer returned by kmalloc. 3742 * @objp: pointer returned by kmalloc.
3731 * 3743 *
3732 * If @objp is NULL, no operation is performed. 3744 * If @objp is NULL, no operation is performed.
3733 * 3745 *
3734 * Don't free memory not originally allocated by kmalloc() 3746 * Don't free memory not originally allocated by kmalloc()
3735 * or you will run into trouble. 3747 * or you will run into trouble.
3736 */ 3748 */
3737 void kfree(const void *objp) 3749 void kfree(const void *objp)
3738 { 3750 {
3739 struct kmem_cache *c; 3751 struct kmem_cache *c;
3740 unsigned long flags; 3752 unsigned long flags;
3741 3753
3742 if (unlikely(!objp)) 3754 if (unlikely(!objp))
3743 return; 3755 return;
3744 local_irq_save(flags); 3756 local_irq_save(flags);
3745 kfree_debugcheck(objp); 3757 kfree_debugcheck(objp);
3746 c = virt_to_cache(objp); 3758 c = virt_to_cache(objp);
3747 debug_check_no_locks_freed(objp, obj_size(c)); 3759 debug_check_no_locks_freed(objp, obj_size(c));
3748 __cache_free(c, (void *)objp); 3760 __cache_free(c, (void *)objp);
3749 local_irq_restore(flags); 3761 local_irq_restore(flags);
3750 } 3762 }
3751 EXPORT_SYMBOL(kfree); 3763 EXPORT_SYMBOL(kfree);
3752 3764
3753 unsigned int kmem_cache_size(struct kmem_cache *cachep) 3765 unsigned int kmem_cache_size(struct kmem_cache *cachep)
3754 { 3766 {
3755 return obj_size(cachep); 3767 return obj_size(cachep);
3756 } 3768 }
3757 EXPORT_SYMBOL(kmem_cache_size); 3769 EXPORT_SYMBOL(kmem_cache_size);
3758 3770
3759 const char *kmem_cache_name(struct kmem_cache *cachep) 3771 const char *kmem_cache_name(struct kmem_cache *cachep)
3760 { 3772 {
3761 return cachep->name; 3773 return cachep->name;
3762 } 3774 }
3763 EXPORT_SYMBOL_GPL(kmem_cache_name); 3775 EXPORT_SYMBOL_GPL(kmem_cache_name);
3764 3776
3765 /* 3777 /*
3766 * This initializes kmem_list3 or resizes various caches for all nodes. 3778 * This initializes kmem_list3 or resizes various caches for all nodes.
3767 */ 3779 */
3768 static int alloc_kmemlist(struct kmem_cache *cachep) 3780 static int alloc_kmemlist(struct kmem_cache *cachep)
3769 { 3781 {
3770 int node; 3782 int node;
3771 struct kmem_list3 *l3; 3783 struct kmem_list3 *l3;
3772 struct array_cache *new_shared; 3784 struct array_cache *new_shared;
3773 struct array_cache **new_alien = NULL; 3785 struct array_cache **new_alien = NULL;
3774 3786
3775 for_each_online_node(node) { 3787 for_each_online_node(node) {
3776 3788
3777 if (use_alien_caches) { 3789 if (use_alien_caches) {
3778 new_alien = alloc_alien_cache(node, cachep->limit); 3790 new_alien = alloc_alien_cache(node, cachep->limit);
3779 if (!new_alien) 3791 if (!new_alien)
3780 goto fail; 3792 goto fail;
3781 } 3793 }
3782 3794
3783 new_shared = alloc_arraycache(node, 3795 new_shared = alloc_arraycache(node,
3784 cachep->shared*cachep->batchcount, 3796 cachep->shared*cachep->batchcount,
3785 0xbaadf00d); 3797 0xbaadf00d);
3786 if (!new_shared) { 3798 if (!new_shared) {
3787 free_alien_cache(new_alien); 3799 free_alien_cache(new_alien);
3788 goto fail; 3800 goto fail;
3789 } 3801 }
3790 3802
3791 l3 = cachep->nodelists[node]; 3803 l3 = cachep->nodelists[node];
3792 if (l3) { 3804 if (l3) {
3793 struct array_cache *shared = l3->shared; 3805 struct array_cache *shared = l3->shared;
3794 3806
3795 spin_lock_irq(&l3->list_lock); 3807 spin_lock_irq(&l3->list_lock);
3796 3808
3797 if (shared) 3809 if (shared)
3798 free_block(cachep, shared->entry, 3810 free_block(cachep, shared->entry,
3799 shared->avail, node); 3811 shared->avail, node);
3800 3812
3801 l3->shared = new_shared; 3813 l3->shared = new_shared;
3802 if (!l3->alien) { 3814 if (!l3->alien) {
3803 l3->alien = new_alien; 3815 l3->alien = new_alien;
3804 new_alien = NULL; 3816 new_alien = NULL;
3805 } 3817 }
3806 l3->free_limit = (1 + nr_cpus_node(node)) * 3818 l3->free_limit = (1 + nr_cpus_node(node)) *
3807 cachep->batchcount + cachep->num; 3819 cachep->batchcount + cachep->num;
3808 spin_unlock_irq(&l3->list_lock); 3820 spin_unlock_irq(&l3->list_lock);
3809 kfree(shared); 3821 kfree(shared);
3810 free_alien_cache(new_alien); 3822 free_alien_cache(new_alien);
3811 continue; 3823 continue;
3812 } 3824 }
3813 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); 3825 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3814 if (!l3) { 3826 if (!l3) {
3815 free_alien_cache(new_alien); 3827 free_alien_cache(new_alien);
3816 kfree(new_shared); 3828 kfree(new_shared);
3817 goto fail; 3829 goto fail;
3818 } 3830 }
3819 3831
3820 kmem_list3_init(l3); 3832 kmem_list3_init(l3);
3821 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3833 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3822 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 3834 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3823 l3->shared = new_shared; 3835 l3->shared = new_shared;
3824 l3->alien = new_alien; 3836 l3->alien = new_alien;
3825 l3->free_limit = (1 + nr_cpus_node(node)) * 3837 l3->free_limit = (1 + nr_cpus_node(node)) *
3826 cachep->batchcount + cachep->num; 3838 cachep->batchcount + cachep->num;
3827 cachep->nodelists[node] = l3; 3839 cachep->nodelists[node] = l3;
3828 } 3840 }
3829 return 0; 3841 return 0;
3830 3842
3831 fail: 3843 fail:
3832 if (!cachep->next.next) { 3844 if (!cachep->next.next) {
3833 /* Cache is not active yet. Roll back what we did */ 3845 /* Cache is not active yet. Roll back what we did */
3834 node--; 3846 node--;
3835 while (node >= 0) { 3847 while (node >= 0) {
3836 if (cachep->nodelists[node]) { 3848 if (cachep->nodelists[node]) {
3837 l3 = cachep->nodelists[node]; 3849 l3 = cachep->nodelists[node];
3838 3850
3839 kfree(l3->shared); 3851 kfree(l3->shared);
3840 free_alien_cache(l3->alien); 3852 free_alien_cache(l3->alien);
3841 kfree(l3); 3853 kfree(l3);
3842 cachep->nodelists[node] = NULL; 3854 cachep->nodelists[node] = NULL;
3843 } 3855 }
3844 node--; 3856 node--;
3845 } 3857 }
3846 } 3858 }
3847 return -ENOMEM; 3859 return -ENOMEM;
3848 } 3860 }
3849 3861
3850 struct ccupdate_struct { 3862 struct ccupdate_struct {
3851 struct kmem_cache *cachep; 3863 struct kmem_cache *cachep;
3852 struct array_cache *new[NR_CPUS]; 3864 struct array_cache *new[NR_CPUS];
3853 }; 3865 };
3854 3866
3855 static void do_ccupdate_local(void *info) 3867 static void do_ccupdate_local(void *info)
3856 { 3868 {
3857 struct ccupdate_struct *new = info; 3869 struct ccupdate_struct *new = info;
3858 struct array_cache *old; 3870 struct array_cache *old;
3859 3871
3860 check_irq_off(); 3872 check_irq_off();
3861 old = cpu_cache_get(new->cachep); 3873 old = cpu_cache_get(new->cachep);
3862 3874
3863 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3875 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
3864 new->new[smp_processor_id()] = old; 3876 new->new[smp_processor_id()] = old;
3865 } 3877 }
3866 3878
3867 /* Always called with the cache_chain_mutex held */ 3879 /* Always called with the cache_chain_mutex held */
3868 static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 3880 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3869 int batchcount, int shared) 3881 int batchcount, int shared)
3870 { 3882 {
3871 struct ccupdate_struct *new; 3883 struct ccupdate_struct *new;
3872 int i; 3884 int i;
3873 3885
3874 new = kzalloc(sizeof(*new), GFP_KERNEL); 3886 new = kzalloc(sizeof(*new), GFP_KERNEL);
3875 if (!new) 3887 if (!new)
3876 return -ENOMEM; 3888 return -ENOMEM;
3877 3889
3878 for_each_online_cpu(i) { 3890 for_each_online_cpu(i) {
3879 new->new[i] = alloc_arraycache(cpu_to_node(i), limit, 3891 new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
3880 batchcount); 3892 batchcount);
3881 if (!new->new[i]) { 3893 if (!new->new[i]) {
3882 for (i--; i >= 0; i--) 3894 for (i--; i >= 0; i--)
3883 kfree(new->new[i]); 3895 kfree(new->new[i]);
3884 kfree(new); 3896 kfree(new);
3885 return -ENOMEM; 3897 return -ENOMEM;
3886 } 3898 }
3887 } 3899 }
3888 new->cachep = cachep; 3900 new->cachep = cachep;
3889 3901
3890 on_each_cpu(do_ccupdate_local, (void *)new, 1, 1); 3902 on_each_cpu(do_ccupdate_local, (void *)new, 1, 1);
3891 3903
3892 check_irq_on(); 3904 check_irq_on();
3893 cachep->batchcount = batchcount; 3905 cachep->batchcount = batchcount;
3894 cachep->limit = limit; 3906 cachep->limit = limit;
3895 cachep->shared = shared; 3907 cachep->shared = shared;
3896 3908
3897 for_each_online_cpu(i) { 3909 for_each_online_cpu(i) {
3898 struct array_cache *ccold = new->new[i]; 3910 struct array_cache *ccold = new->new[i];
3899 if (!ccold) 3911 if (!ccold)
3900 continue; 3912 continue;
3901 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3913 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3902 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); 3914 free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
3903 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); 3915 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
3904 kfree(ccold); 3916 kfree(ccold);
3905 } 3917 }
3906 kfree(new); 3918 kfree(new);
3907 return alloc_kmemlist(cachep); 3919 return alloc_kmemlist(cachep);
3908 } 3920 }
3909 3921
3910 /* Always called with the cache_chain_mutex held */ 3922 /* Always called with the cache_chain_mutex held */
3911 static int enable_cpucache(struct kmem_cache *cachep) 3923 static int enable_cpucache(struct kmem_cache *cachep)
3912 { 3924 {
3913 int err; 3925 int err;
3914 int limit, shared; 3926 int limit, shared;
3915 3927
3916 /* 3928 /*
3917 * The head array serves three purposes: 3929 * The head array serves three purposes:
3918 * - create a LIFO ordering, i.e. return objects that are cache-warm 3930 * - create a LIFO ordering, i.e. return objects that are cache-warm
3919 * - reduce the number of spinlock operations. 3931 * - reduce the number of spinlock operations.
3920 * - reduce the number of linked list operations on the slab and 3932 * - reduce the number of linked list operations on the slab and
3921 * bufctl chains: array operations are cheaper. 3933 * bufctl chains: array operations are cheaper.
3922 * The numbers are guessed; we should auto-tune as described by 3934 * The numbers are guessed; we should auto-tune as described by
3923 * Bonwick. 3935 * Bonwick.
3924 */ 3936 */
3925 if (cachep->buffer_size > 131072) 3937 if (cachep->buffer_size > 131072)
3926 limit = 1; 3938 limit = 1;
3927 else if (cachep->buffer_size > PAGE_SIZE) 3939 else if (cachep->buffer_size > PAGE_SIZE)
3928 limit = 8; 3940 limit = 8;
3929 else if (cachep->buffer_size > 1024) 3941 else if (cachep->buffer_size > 1024)
3930 limit = 24; 3942 limit = 24;
3931 else if (cachep->buffer_size > 256) 3943 else if (cachep->buffer_size > 256)
3932 limit = 54; 3944 limit = 54;
3933 else 3945 else
3934 limit = 120; 3946 limit = 120;
3935 3947
3936 /* 3948 /*
3937 * CPU bound tasks (e.g. network routing) can exhibit cpu bound 3949 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
3938 * allocation behaviour: Most allocs on one cpu, most free operations 3950 * allocation behaviour: Most allocs on one cpu, most free operations
3939 * on another cpu. For these cases, efficient object passing between 3951 * on another cpu. For these cases, efficient object passing between
3940 * cpus is necessary. This is provided by a shared array. The array 3952 * cpus is necessary. This is provided by a shared array. The array
3941 * replaces Bonwick's magazine layer. 3953 * replaces Bonwick's magazine layer.
3942 * On uniprocessor, it's functionally equivalent (but less efficient) 3954 * On uniprocessor, it's functionally equivalent (but less efficient)
3943 * to a larger limit. Thus disabled by default. 3955 * to a larger limit. Thus disabled by default.
3944 */ 3956 */
3945 shared = 0; 3957 shared = 0;
3946 #ifdef CONFIG_SMP 3958 #ifdef CONFIG_SMP
3947 if (cachep->buffer_size <= PAGE_SIZE) 3959 if (cachep->buffer_size <= PAGE_SIZE)
3948 shared = 8; 3960 shared = 8;
3949 #endif 3961 #endif
3950 3962
3951 #if DEBUG 3963 #if DEBUG
3952 /* 3964 /*
3953 * With debugging enabled, a large batchcount leads to excessively long 3965 * With debugging enabled, a large batchcount leads to excessively long
3954 * periods with local interrupts disabled. Limit the batchcount. 3966 * periods with local interrupts disabled. Limit the batchcount.
3955 */ 3967 */
3956 if (limit > 32) 3968 if (limit > 32)
3957 limit = 32; 3969 limit = 32;
3958 #endif 3970 #endif
3959 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); 3971 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
3960 if (err) 3972 if (err)
3961 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3973 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3962 cachep->name, -err); 3974 cachep->name, -err);
3963 return err; 3975 return err;
3964 } 3976 }
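Editor's note, to make the heuristic above concrete (assuming 4 KiB pages): a cache of 512-byte objects falls into the "> 256" bucket, so limit = 54, batchcount = (54 + 1) / 2 = 27, and shared = 8 on SMP because 512 <= PAGE_SIZE; a cache of 8 KiB objects gets limit = 8, batchcount = 4 and shared = 0. With DEBUG the 512-byte case is clamped to limit = 32, giving batchcount = 16.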
3965 3977
3966 /* 3978 /*
3967 * Drain an array if it contains any elements taking the l3 lock only if 3979 * Drain an array if it contains any elements taking the l3 lock only if
3968 * necessary. Note that the l3 listlock also protects the array_cache 3980 * necessary. Note that the l3 listlock also protects the array_cache
3969 * if drain_array() is used on the shared array. 3981 * if drain_array() is used on the shared array.
3970 */ 3982 */
3971 void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 3983 void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3972 struct array_cache *ac, int force, int node) 3984 struct array_cache *ac, int force, int node)
3973 { 3985 {
3974 int tofree; 3986 int tofree;
3975 3987
3976 if (!ac || !ac->avail) 3988 if (!ac || !ac->avail)
3977 return; 3989 return;
3978 if (ac->touched && !force) { 3990 if (ac->touched && !force) {
3979 ac->touched = 0; 3991 ac->touched = 0;
3980 } else { 3992 } else {
3981 spin_lock_irq(&l3->list_lock); 3993 spin_lock_irq(&l3->list_lock);
3982 if (ac->avail) { 3994 if (ac->avail) {
3983 tofree = force ? ac->avail : (ac->limit + 4) / 5; 3995 tofree = force ? ac->avail : (ac->limit + 4) / 5;
3984 if (tofree > ac->avail) 3996 if (tofree > ac->avail)
3985 tofree = (ac->avail + 1) / 2; 3997 tofree = (ac->avail + 1) / 2;
3986 free_block(cachep, ac->entry, tofree, node); 3998 free_block(cachep, ac->entry, tofree, node);
3987 ac->avail -= tofree; 3999 ac->avail -= tofree;
3988 memmove(ac->entry, &(ac->entry[tofree]), 4000 memmove(ac->entry, &(ac->entry[tofree]),
3989 sizeof(void *) * ac->avail); 4001 sizeof(void *) * ac->avail);
3990 } 4002 }
3991 spin_unlock_irq(&l3->list_lock); 4003 spin_unlock_irq(&l3->list_lock);
3992 } 4004 }
3993 } 4005 }
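Editor's note, in numbers: when not forced, roughly a fifth of the array's capacity is freed per pass, e.g. ac->limit = 120 gives tofree = (120 + 4) / 5 = 24; if only 10 objects are available, the clamp reduces that to (10 + 1) / 2 = 5. With force set, tofree = ac->avail and the array is drained completely.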
3994 4006
3995 /** 4007 /**
3996 * cache_reap - Reclaim memory from caches. 4008 * cache_reap - Reclaim memory from caches.
3997 * @unused: unused parameter 4009 * @unused: unused parameter
3998 * 4010 *
3999 * Called from workqueue/eventd every few seconds. 4011 * Called from workqueue/eventd every few seconds.
4000 * Purpose: 4012 * Purpose:
4001 * - clear the per-cpu caches for this CPU. 4013 * - clear the per-cpu caches for this CPU.
4002 * - return freeable pages to the main free memory pool. 4014 * - return freeable pages to the main free memory pool.
4003 * 4015 *
4004 * If we cannot acquire the cache chain mutex then just give up - we'll try 4016 * If we cannot acquire the cache chain mutex then just give up - we'll try
4005 * again on the next iteration. 4017 * again on the next iteration.
4006 */ 4018 */
4007 static void cache_reap(struct work_struct *unused) 4019 static void cache_reap(struct work_struct *unused)
4008 { 4020 {
4009 struct kmem_cache *searchp; 4021 struct kmem_cache *searchp;
4010 struct kmem_list3 *l3; 4022 struct kmem_list3 *l3;
4011 int node = numa_node_id(); 4023 int node = numa_node_id();
4012 4024
4013 if (!mutex_trylock(&cache_chain_mutex)) { 4025 if (!mutex_trylock(&cache_chain_mutex)) {
4014 /* Give up. Set up the next iteration. */ 4026 /* Give up. Set up the next iteration. */
4015 schedule_delayed_work(&__get_cpu_var(reap_work), 4027 schedule_delayed_work(&__get_cpu_var(reap_work),
4016 round_jiffies_relative(REAPTIMEOUT_CPUC)); 4028 round_jiffies_relative(REAPTIMEOUT_CPUC));
4017 return; 4029 return;
4018 } 4030 }
4019 4031
4020 list_for_each_entry(searchp, &cache_chain, next) { 4032 list_for_each_entry(searchp, &cache_chain, next) {
4021 check_irq_on(); 4033 check_irq_on();
4022 4034
4023 /* 4035 /*
4024 * We only take the l3 lock if absolutely necessary and we 4036 * We only take the l3 lock if absolutely necessary and we
4025 * have established with reasonable certainty that 4037 * have established with reasonable certainty that
4026 * we can do some work if the lock was obtained. 4038 * we can do some work if the lock was obtained.
4027 */ 4039 */
4028 l3 = searchp->nodelists[node]; 4040 l3 = searchp->nodelists[node];
4029 4041
4030 reap_alien(searchp, l3); 4042 reap_alien(searchp, l3);
4031 4043
4032 drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); 4044 drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
4033 4045
4034 /* 4046 /*
4035 * These are racy checks but it does not matter 4047 * These are racy checks but it does not matter
4036 * if we skip one check or scan twice. 4048 * if we skip one check or scan twice.
4037 */ 4049 */
4038 if (time_after(l3->next_reap, jiffies)) 4050 if (time_after(l3->next_reap, jiffies))
4039 goto next; 4051 goto next;
4040 4052
4041 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 4053 l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
4042 4054
4043 drain_array(searchp, l3, l3->shared, 0, node); 4055 drain_array(searchp, l3, l3->shared, 0, node);
4044 4056
4045 if (l3->free_touched) 4057 if (l3->free_touched)
4046 l3->free_touched = 0; 4058 l3->free_touched = 0;
4047 else { 4059 else {
4048 int freed; 4060 int freed;
4049 4061
4050 freed = drain_freelist(searchp, l3, (l3->free_limit + 4062 freed = drain_freelist(searchp, l3, (l3->free_limit +
4051 5 * searchp->num - 1) / (5 * searchp->num)); 4063 5 * searchp->num - 1) / (5 * searchp->num));
4052 STATS_ADD_REAPED(searchp, freed); 4064 STATS_ADD_REAPED(searchp, freed);
4053 } 4065 }
4054 next: 4066 next:
4055 cond_resched(); 4067 cond_resched();
4056 } 4068 }
4057 check_irq_on(); 4069 check_irq_on();
4058 mutex_unlock(&cache_chain_mutex); 4070 mutex_unlock(&cache_chain_mutex);
4059 next_reap_node(); 4071 next_reap_node();
4060 refresh_cpu_vm_stats(smp_processor_id()); 4072 refresh_cpu_vm_stats(smp_processor_id());
4061 /* Set up the next iteration */ 4073 /* Set up the next iteration */
4062 schedule_delayed_work(&__get_cpu_var(reap_work), 4074 schedule_delayed_work(&__get_cpu_var(reap_work),
4063 round_jiffies_relative(REAPTIMEOUT_CPUC)); 4075 round_jiffies_relative(REAPTIMEOUT_CPUC));
4064 } 4076 }
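Editor's note: the drain_freelist() target above works out to ceil(free_limit / (5 * num)) slabs, i.e. about a fifth of the node's free limit per reap interval, expressed in slabs. For instance, with free_limit = 244 and num = 30 objects per slab, (244 + 150 - 1) / 150 = 2 slabs are scanned for freeing on this pass.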
4065 4077
4066 #ifdef CONFIG_PROC_FS 4078 #ifdef CONFIG_PROC_FS
4067 4079
4068 static void print_slabinfo_header(struct seq_file *m) 4080 static void print_slabinfo_header(struct seq_file *m)
4069 { 4081 {
4070 /* 4082 /*
4071 * Output format version, so at least we can change it 4083 * Output format version, so at least we can change it
4072 * without _too_ many complaints. 4084 * without _too_ many complaints.
4073 */ 4085 */
4074 #if STATS 4086 #if STATS
4075 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 4087 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
4076 #else 4088 #else
4077 seq_puts(m, "slabinfo - version: 2.1\n"); 4089 seq_puts(m, "slabinfo - version: 2.1\n");
4078 #endif 4090 #endif
4079 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 4091 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
4080 "<objperslab> <pagesperslab>"); 4092 "<objperslab> <pagesperslab>");
4081 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 4093 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
4082 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 4094 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
4083 #if STATS 4095 #if STATS
4084 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " 4096 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
4085 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); 4097 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
4086 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 4098 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
4087 #endif 4099 #endif
4088 seq_putc(m, '\n'); 4100 seq_putc(m, '\n');
4089 } 4101 }
4090 4102
4091 static void *s_start(struct seq_file *m, loff_t *pos) 4103 static void *s_start(struct seq_file *m, loff_t *pos)
4092 { 4104 {
4093 loff_t n = *pos; 4105 loff_t n = *pos;
4094 struct list_head *p; 4106 struct list_head *p;
4095 4107
4096 mutex_lock(&cache_chain_mutex); 4108 mutex_lock(&cache_chain_mutex);
4097 if (!n) 4109 if (!n)
4098 print_slabinfo_header(m); 4110 print_slabinfo_header(m);
4099 p = cache_chain.next; 4111 p = cache_chain.next;
4100 while (n--) { 4112 while (n--) {
4101 p = p->next; 4113 p = p->next;
4102 if (p == &cache_chain) 4114 if (p == &cache_chain)
4103 return NULL; 4115 return NULL;
4104 } 4116 }
4105 return list_entry(p, struct kmem_cache, next); 4117 return list_entry(p, struct kmem_cache, next);
4106 } 4118 }
4107 4119
4108 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 4120 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4109 { 4121 {
4110 struct kmem_cache *cachep = p; 4122 struct kmem_cache *cachep = p;
4111 ++*pos; 4123 ++*pos;
4112 return cachep->next.next == &cache_chain ? 4124 return cachep->next.next == &cache_chain ?
4113 NULL : list_entry(cachep->next.next, struct kmem_cache, next); 4125 NULL : list_entry(cachep->next.next, struct kmem_cache, next);
4114 } 4126 }
4115 4127
4116 static void s_stop(struct seq_file *m, void *p) 4128 static void s_stop(struct seq_file *m, void *p)
4117 { 4129 {
4118 mutex_unlock(&cache_chain_mutex); 4130 mutex_unlock(&cache_chain_mutex);
4119 } 4131 }
4120 4132
4121 static int s_show(struct seq_file *m, void *p) 4133 static int s_show(struct seq_file *m, void *p)
4122 { 4134 {
4123 struct kmem_cache *cachep = p; 4135 struct kmem_cache *cachep = p;
4124 struct slab *slabp; 4136 struct slab *slabp;
4125 unsigned long active_objs; 4137 unsigned long active_objs;
4126 unsigned long num_objs; 4138 unsigned long num_objs;
4127 unsigned long active_slabs = 0; 4139 unsigned long active_slabs = 0;
4128 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 4140 unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4129 const char *name; 4141 const char *name;
4130 char *error = NULL; 4142 char *error = NULL;
4131 int node; 4143 int node;
4132 struct kmem_list3 *l3; 4144 struct kmem_list3 *l3;
4133 4145
4134 active_objs = 0; 4146 active_objs = 0;
4135 num_slabs = 0; 4147 num_slabs = 0;
4136 for_each_online_node(node) { 4148 for_each_online_node(node) {
4137 l3 = cachep->nodelists[node]; 4149 l3 = cachep->nodelists[node];
4138 if (!l3) 4150 if (!l3)
4139 continue; 4151 continue;
4140 4152
4141 check_irq_on(); 4153 check_irq_on();
4142 spin_lock_irq(&l3->list_lock); 4154 spin_lock_irq(&l3->list_lock);
4143 4155
4144 list_for_each_entry(slabp, &l3->slabs_full, list) { 4156 list_for_each_entry(slabp, &l3->slabs_full, list) {
4145 if (slabp->inuse != cachep->num && !error) 4157 if (slabp->inuse != cachep->num && !error)
4146 error = "slabs_full accounting error"; 4158 error = "slabs_full accounting error";
4147 active_objs += cachep->num; 4159 active_objs += cachep->num;
4148 active_slabs++; 4160 active_slabs++;
4149 } 4161 }
4150 list_for_each_entry(slabp, &l3->slabs_partial, list) { 4162 list_for_each_entry(slabp, &l3->slabs_partial, list) {
4151 if (slabp->inuse == cachep->num && !error) 4163 if (slabp->inuse == cachep->num && !error)
4152 error = "slabs_partial inuse accounting error"; 4164 error = "slabs_partial inuse accounting error";
4153 if (!slabp->inuse && !error) 4165 if (!slabp->inuse && !error)
4154 error = "slabs_partial/inuse accounting error"; 4166 error = "slabs_partial/inuse accounting error";
4155 active_objs += slabp->inuse; 4167 active_objs += slabp->inuse;
4156 active_slabs++; 4168 active_slabs++;
4157 } 4169 }
4158 list_for_each_entry(slabp, &l3->slabs_free, list) { 4170 list_for_each_entry(slabp, &l3->slabs_free, list) {
4159 if (slabp->inuse && !error) 4171 if (slabp->inuse && !error)
4160 error = "slabs_free/inuse accounting error"; 4172 error = "slabs_free/inuse accounting error";
4161 num_slabs++; 4173 num_slabs++;
4162 } 4174 }
4163 free_objects += l3->free_objects; 4175 free_objects += l3->free_objects;
4164 if (l3->shared) 4176 if (l3->shared)
4165 shared_avail += l3->shared->avail; 4177 shared_avail += l3->shared->avail;
4166 4178
4167 spin_unlock_irq(&l3->list_lock); 4179 spin_unlock_irq(&l3->list_lock);
4168 } 4180 }
4169 num_slabs += active_slabs; 4181 num_slabs += active_slabs;
4170 num_objs = num_slabs * cachep->num; 4182 num_objs = num_slabs * cachep->num;
4171 if (num_objs - active_objs != free_objects && !error) 4183 if (num_objs - active_objs != free_objects && !error)
4172 error = "free_objects accounting error"; 4184 error = "free_objects accounting error";
4173 4185
4174 name = cachep->name; 4186 name = cachep->name;
4175 if (error) 4187 if (error)
4176 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4188 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4177 4189
4178 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 4190 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4179 name, active_objs, num_objs, cachep->buffer_size, 4191 name, active_objs, num_objs, cachep->buffer_size,
4180 cachep->num, (1 << cachep->gfporder)); 4192 cachep->num, (1 << cachep->gfporder));
4181 seq_printf(m, " : tunables %4u %4u %4u", 4193 seq_printf(m, " : tunables %4u %4u %4u",
4182 cachep->limit, cachep->batchcount, cachep->shared); 4194 cachep->limit, cachep->batchcount, cachep->shared);
4183 seq_printf(m, " : slabdata %6lu %6lu %6lu", 4195 seq_printf(m, " : slabdata %6lu %6lu %6lu",
4184 active_slabs, num_slabs, shared_avail); 4196 active_slabs, num_slabs, shared_avail);
4185 #if STATS 4197 #if STATS
4186 { /* list3 stats */ 4198 { /* list3 stats */
4187 unsigned long high = cachep->high_mark; 4199 unsigned long high = cachep->high_mark;
4188 unsigned long allocs = cachep->num_allocations; 4200 unsigned long allocs = cachep->num_allocations;
4189 unsigned long grown = cachep->grown; 4201 unsigned long grown = cachep->grown;
4190 unsigned long reaped = cachep->reaped; 4202 unsigned long reaped = cachep->reaped;
4191 unsigned long errors = cachep->errors; 4203 unsigned long errors = cachep->errors;
4192 unsigned long max_freeable = cachep->max_freeable; 4204 unsigned long max_freeable = cachep->max_freeable;
4193 unsigned long node_allocs = cachep->node_allocs; 4205 unsigned long node_allocs = cachep->node_allocs;
4194 unsigned long node_frees = cachep->node_frees; 4206 unsigned long node_frees = cachep->node_frees;
4195 unsigned long overflows = cachep->node_overflow; 4207 unsigned long overflows = cachep->node_overflow;
4196 4208
4197 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 4209 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
4198 %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, 4210 %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
4199 reaped, errors, max_freeable, node_allocs, 4211 reaped, errors, max_freeable, node_allocs,
4200 node_frees, overflows); 4212 node_frees, overflows);
4201 } 4213 }
4202 /* cpu stats */ 4214 /* cpu stats */
4203 { 4215 {
4204 unsigned long allochit = atomic_read(&cachep->allochit); 4216 unsigned long allochit = atomic_read(&cachep->allochit);
4205 unsigned long allocmiss = atomic_read(&cachep->allocmiss); 4217 unsigned long allocmiss = atomic_read(&cachep->allocmiss);
4206 unsigned long freehit = atomic_read(&cachep->freehit); 4218 unsigned long freehit = atomic_read(&cachep->freehit);
4207 unsigned long freemiss = atomic_read(&cachep->freemiss); 4219 unsigned long freemiss = atomic_read(&cachep->freemiss);
4208 4220
4209 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 4221 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
4210 allochit, allocmiss, freehit, freemiss); 4222 allochit, allocmiss, freehit, freemiss);
4211 } 4223 }
4212 #endif 4224 #endif
4213 seq_putc(m, '\n'); 4225 seq_putc(m, '\n');
4214 return 0; 4226 return 0;
4215 } 4227 }
4216 4228
4217 /* 4229 /*
4218 * slabinfo_op - iterator that generates /proc/slabinfo 4230 * slabinfo_op - iterator that generates /proc/slabinfo
4219 * 4231 *
4220 * Output layout: 4232 * Output layout:
4221 * cache-name 4233 * cache-name
4222 * num-active-objs 4234 * num-active-objs
4223 * total-objs 4235 * total-objs
4224 * object size 4236 * object size
4225 * num-active-slabs 4237 * num-active-slabs
4226 * total-slabs 4238 * total-slabs
4227 * num-pages-per-slab 4239 * num-pages-per-slab
4228 * + further values on SMP and with statistics enabled 4240 * + further values on SMP and with statistics enabled
4229 */ 4241 */
4230 4242
4231 const struct seq_operations slabinfo_op = { 4243 const struct seq_operations slabinfo_op = {
4232 .start = s_start, 4244 .start = s_start,
4233 .next = s_next, 4245 .next = s_next,
4234 .stop = s_stop, 4246 .stop = s_stop,
4235 .show = s_show, 4247 .show = s_show,
4236 }; 4248 };
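Editor's note: put together, s_show() emits one line per cache in the layout described above. An illustrative line for a hypothetical 128-byte cache on a 4 KiB-page SMP box, with made-up counts and without STATS, would look roughly like:

    example_cache      11800  12400    128   31    1 : tunables  120   60    8 : slabdata    390    400      0

Here num_objs = num_slabs * objperslab (400 * 31 = 12400), and the tunables match what enable_cpucache() picks for objects in the "> 256" but "<= 1024" range.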
4237 4249
4238 #define MAX_SLABINFO_WRITE 128 4250 #define MAX_SLABINFO_WRITE 128
4239 /** 4251 /**
4240 * slabinfo_write - Tuning for the slab allocator 4252 * slabinfo_write - Tuning for the slab allocator
4241 * @file: unused 4253 * @file: unused
4242 * @buffer: user buffer 4254 * @buffer: user buffer
4243 * @count: data length 4255 * @count: data length
4244 * @ppos: unused 4256 * @ppos: unused
4245 */ 4257 */
4246 ssize_t slabinfo_write(struct file *file, const char __user * buffer, 4258 ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4247 size_t count, loff_t *ppos) 4259 size_t count, loff_t *ppos)
4248 { 4260 {
4249 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 4261 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
4250 int limit, batchcount, shared, res; 4262 int limit, batchcount, shared, res;
4251 struct kmem_cache *cachep; 4263 struct kmem_cache *cachep;
4252 4264
4253 if (count > MAX_SLABINFO_WRITE) 4265 if (count > MAX_SLABINFO_WRITE)
4254 return -EINVAL; 4266 return -EINVAL;
4255 if (copy_from_user(&kbuf, buffer, count)) 4267 if (copy_from_user(&kbuf, buffer, count))
4256 return -EFAULT; 4268 return -EFAULT;
4257 kbuf[MAX_SLABINFO_WRITE] = '\0'; 4269 kbuf[MAX_SLABINFO_WRITE] = '\0';
4258 4270
4259 tmp = strchr(kbuf, ' '); 4271 tmp = strchr(kbuf, ' ');
4260 if (!tmp) 4272 if (!tmp)
4261 return -EINVAL; 4273 return -EINVAL;
4262 *tmp = '\0'; 4274 *tmp = '\0';
4263 tmp++; 4275 tmp++;
4264 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3) 4276 if (sscanf(tmp, " %d %d %d", &limit, &batchcount, &shared) != 3)
4265 return -EINVAL; 4277 return -EINVAL;
4266 4278
4267 /* Find the cache in the chain of caches. */ 4279 /* Find the cache in the chain of caches. */
4268 mutex_lock(&cache_chain_mutex); 4280 mutex_lock(&cache_chain_mutex);
4269 res = -EINVAL; 4281 res = -EINVAL;
4270 list_for_each_entry(cachep, &cache_chain, next) { 4282 list_for_each_entry(cachep, &cache_chain, next) {
4271 if (!strcmp(cachep->name, kbuf)) { 4283 if (!strcmp(cachep->name, kbuf)) {
4272 if (limit < 1 || batchcount < 1 || 4284 if (limit < 1 || batchcount < 1 ||
4273 batchcount > limit || shared < 0) { 4285 batchcount > limit || shared < 0) {
4274 res = 0; 4286 res = 0;
4275 } else { 4287 } else {
4276 res = do_tune_cpucache(cachep, limit, 4288 res = do_tune_cpucache(cachep, limit,
4277 batchcount, shared); 4289 batchcount, shared);
4278 } 4290 }
4279 break; 4291 break;
4280 } 4292 }
4281 } 4293 }
4282 mutex_unlock(&cache_chain_mutex); 4294 mutex_unlock(&cache_chain_mutex);
4283 if (res >= 0) 4295 if (res >= 0)
4284 res = count; 4296 res = count;
4285 return res; 4297 return res;
4286 } 4298 }
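Editor's note: the write side expects "<cache-name> <limit> <batchcount> <shared>" in a single buffer, so (for a cache name that actually exists on the running system) something like echo "example_cache 120 60 8" > /proc/slabinfo from a shell with write permission re-tunes that one cache via do_tune_cpucache(). Out-of-range values (limit < 1, batchcount < 1, batchcount > limit or shared < 0) leave res = 0, so the write still returns count: it is accepted but silently ignored.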
4287 4299
4288 #ifdef CONFIG_DEBUG_SLAB_LEAK 4300 #ifdef CONFIG_DEBUG_SLAB_LEAK
4289 4301
4290 static void *leaks_start(struct seq_file *m, loff_t *pos) 4302 static void *leaks_start(struct seq_file *m, loff_t *pos)
4291 { 4303 {
4292 loff_t n = *pos; 4304 loff_t n = *pos;
4293 struct list_head *p; 4305 struct list_head *p;
4294 4306
4295 mutex_lock(&cache_chain_mutex); 4307 mutex_lock(&cache_chain_mutex);
4296 p = cache_chain.next; 4308 p = cache_chain.next;
4297 while (n--) { 4309 while (n--) {
4298 p = p->next; 4310 p = p->next;
4299 if (p == &cache_chain) 4311 if (p == &cache_chain)
4300 return NULL; 4312 return NULL;
4301 } 4313 }
4302 return list_entry(p, struct kmem_cache, next); 4314 return list_entry(p, struct kmem_cache, next);
4303 } 4315 }
4304 4316
4305 static inline int add_caller(unsigned long *n, unsigned long v) 4317 static inline int add_caller(unsigned long *n, unsigned long v)
4306 { 4318 {
4307 unsigned long *p; 4319 unsigned long *p;
4308 int l; 4320 int l;
4309 if (!v) 4321 if (!v)
4310 return 1; 4322 return 1;
4311 l = n[1]; 4323 l = n[1];
4312 p = n + 2; 4324 p = n + 2;
4313 while (l) { 4325 while (l) {
4314 int i = l/2; 4326 int i = l/2;
4315 unsigned long *q = p + 2 * i; 4327 unsigned long *q = p + 2 * i;
4316 if (*q == v) { 4328 if (*q == v) {
4317 q[1]++; 4329 q[1]++;
4318 return 1; 4330 return 1;
4319 } 4331 }
4320 if (*q > v) { 4332 if (*q > v) {
4321 l = i; 4333 l = i;
4322 } else { 4334 } else {
4323 p = q + 2; 4335 p = q + 2;
4324 l -= i + 1; 4336 l -= i + 1;
4325 } 4337 }
4326 } 4338 }
4327 if (++n[1] == n[0]) 4339 if (++n[1] == n[0])
4328 return 0; 4340 return 0;
4329 memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); 4341 memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4330 p[0] = v; 4342 p[0] = v;
4331 p[1] = 1; 4343 p[1] = 1;
4332 return 1; 4344 return 1;
4333 } 4345 }
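Editor's note: the unsigned long array that add_caller() and handle_slab() operate on has an implicit layout worth spelling out (a descriptive sketch of what the code above assumes, not new code):

    /*
     * n[0]          capacity of the buffer, in (address, count) pairs
     * n[1]          number of pairs currently stored
     * n[2], n[3]    first pair: caller address, objects allocated there
     * n[4], n[5]    second pair, and so on, kept sorted by address
     *
     * add_caller() binary-searches the pairs; on a miss it memmove()s the
     * tail up by one pair to insert the new caller, and it returns 0 only
     * when the buffer fills up, which is what makes leaks_show() below
     * retry with a buffer twice as large.
     */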
4334 4346
4335 static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) 4347 static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4336 { 4348 {
4337 void *p; 4349 void *p;
4338 int i; 4350 int i;
4339 if (n[0] == n[1]) 4351 if (n[0] == n[1])
4340 return; 4352 return;
4341 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { 4353 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4342 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) 4354 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4343 continue; 4355 continue;
4344 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4356 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4345 return; 4357 return;
4346 } 4358 }
4347 } 4359 }
4348 4360
4349 static void show_symbol(struct seq_file *m, unsigned long address) 4361 static void show_symbol(struct seq_file *m, unsigned long address)
4350 { 4362 {
4351 #ifdef CONFIG_KALLSYMS 4363 #ifdef CONFIG_KALLSYMS
4352 char *modname; 4364 char *modname;
4353 const char *name; 4365 const char *name;
4354 unsigned long offset, size; 4366 unsigned long offset, size;
4355 char namebuf[KSYM_NAME_LEN+1]; 4367 char namebuf[KSYM_NAME_LEN+1];
4356 4368
4357 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); 4369 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
4358 4370
4359 if (name) { 4371 if (name) {
4360 seq_printf(m, "%s+%#lx/%#lx", name, offset, size); 4372 seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4361 if (modname) 4373 if (modname)
4362 seq_printf(m, " [%s]", modname); 4374 seq_printf(m, " [%s]", modname);
4363 return; 4375 return;
4364 } 4376 }
4365 #endif 4377 #endif
4366 seq_printf(m, "%p", (void *)address); 4378 seq_printf(m, "%p", (void *)address);
4367 } 4379 }
4368 4380
4369 static int leaks_show(struct seq_file *m, void *p) 4381 static int leaks_show(struct seq_file *m, void *p)
4370 { 4382 {
4371 struct kmem_cache *cachep = p; 4383 struct kmem_cache *cachep = p;
4372 struct slab *slabp; 4384 struct slab *slabp;
4373 struct kmem_list3 *l3; 4385 struct kmem_list3 *l3;
4374 const char *name; 4386 const char *name;
4375 unsigned long *n = m->private; 4387 unsigned long *n = m->private;
4376 int node; 4388 int node;
4377 int i; 4389 int i;
4378 4390
4379 if (!(cachep->flags & SLAB_STORE_USER)) 4391 if (!(cachep->flags & SLAB_STORE_USER))
4380 return 0; 4392 return 0;
4381 if (!(cachep->flags & SLAB_RED_ZONE)) 4393 if (!(cachep->flags & SLAB_RED_ZONE))
4382 return 0; 4394 return 0;
4383 4395
4384 /* OK, we can do it */ 4396 /* OK, we can do it */
4385 4397
4386 n[1] = 0; 4398 n[1] = 0;
4387 4399
4388 for_each_online_node(node) { 4400 for_each_online_node(node) {
4389 l3 = cachep->nodelists[node]; 4401 l3 = cachep->nodelists[node];
4390 if (!l3) 4402 if (!l3)
4391 continue; 4403 continue;
4392 4404
4393 check_irq_on(); 4405 check_irq_on();
4394 spin_lock_irq(&l3->list_lock); 4406 spin_lock_irq(&l3->list_lock);
4395 4407
4396 list_for_each_entry(slabp, &l3->slabs_full, list) 4408 list_for_each_entry(slabp, &l3->slabs_full, list)
4397 handle_slab(n, cachep, slabp); 4409 handle_slab(n, cachep, slabp);
4398 list_for_each_entry(slabp, &l3->slabs_partial, list) 4410 list_for_each_entry(slabp, &l3->slabs_partial, list)
4399 handle_slab(n, cachep, slabp); 4411 handle_slab(n, cachep, slabp);
4400 spin_unlock_irq(&l3->list_lock); 4412 spin_unlock_irq(&l3->list_lock);
4401 } 4413 }
4402 name = cachep->name; 4414 name = cachep->name;
4403 if (n[0] == n[1]) { 4415 if (n[0] == n[1]) {
4404 /* Increase the buffer size */ 4416 /* Increase the buffer size */
4405 mutex_unlock(&cache_chain_mutex); 4417 mutex_unlock(&cache_chain_mutex);
4406 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); 4418 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4407 if (!m->private) { 4419 if (!m->private) {
4408 /* Too bad, we are really out */ 4420 /* Too bad, we are really out */
4409 m->private = n; 4421 m->private = n;
4410 mutex_lock(&cache_chain_mutex); 4422 mutex_lock(&cache_chain_mutex);
4411 return -ENOMEM; 4423 return -ENOMEM;
4412 } 4424 }
4413 *(unsigned long *)m->private = n[0] * 2; 4425 *(unsigned long *)m->private = n[0] * 2;
4414 kfree(n); 4426 kfree(n);
4415 mutex_lock(&cache_chain_mutex); 4427 mutex_lock(&cache_chain_mutex);
4416 /* Now make sure this entry will be retried */ 4428 /* Now make sure this entry will be retried */
4417 m->count = m->size; 4429 m->count = m->size;
4418 return 0; 4430 return 0;
4419 } 4431 }
4420 for (i = 0; i < n[1]; i++) { 4432 for (i = 0; i < n[1]; i++) {
4421 seq_printf(m, "%s: %lu ", name, n[2*i+3]); 4433 seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4422 show_symbol(m, n[2*i+2]); 4434 show_symbol(m, n[2*i+2]);
4423 seq_putc(m, '\n'); 4435 seq_putc(m, '\n');
4424 } 4436 }
4425 4437
4426 return 0; 4438 return 0;
4427 } 4439 }
4428 4440
4429 const struct seq_operations slabstats_op = { 4441 const struct seq_operations slabstats_op = {
4430 .start = leaks_start, 4442 .start = leaks_start,
4431 .next = s_next, 4443 .next = s_next,
4432 .stop = s_stop, 4444 .stop = s_stop,
4433 .show = leaks_show, 4445 .show = leaks_show,
4434 }; 4446 };
4435 #endif 4447 #endif
4436 #endif 4448 #endif
4437 4449
4438 /** 4450 /**
4439 * ksize - get the actual amount of memory allocated for a given object 4451 * ksize - get the actual amount of memory allocated for a given object
4440 * @objp: Pointer to the object 4452 * @objp: Pointer to the object
4441 * 4453 *
4442 * kmalloc may internally round up allocations and return more memory 4454 * kmalloc may internally round up allocations and return more memory
4443 * than requested. ksize() can be used to determine the actual amount of 4455 * than requested. ksize() can be used to determine the actual amount of
4444 * memory allocated. The caller may use this additional memory, even though 4456 * memory allocated. The caller may use this additional memory, even though
4445 * a smaller amount of memory was initially specified with the kmalloc call. 4457 * a smaller amount of memory was initially specified with the kmalloc call.
4446 * The caller must guarantee that objp points to a valid object previously 4458 * The caller must guarantee that objp points to a valid object previously
4447 * allocated with either kmalloc() or kmem_cache_alloc(). The object 4459 * allocated with either kmalloc() or kmem_cache_alloc(). The object
4448 * must not be freed during the duration of the call. 4460 * must not be freed during the duration of the call.
4449 */ 4461 */
4450 unsigned int ksize(const void *objp) 4462 unsigned int ksize(const void *objp)
4451 { 4463 {
4452 if (unlikely(objp == NULL)) 4464 if (unlikely(objp == NULL))
4453 return 0; 4465 return 0;
4454 4466
4455 return obj_size(virt_to_cache(objp)); 4467 return obj_size(virt_to_cache(objp));
4456 } 4468 }
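Editor's note: a minimal sketch of the rounding that ksize() exposes, assuming kernel context; the helper name is hypothetical and not part of the patch.

    #include <linux/slab.h>
    #include <linux/string.h>

    /* Illustrative only: zero the whole slab object, not just the bytes asked for. */
    static void *zalloc_whole_object(size_t want, gfp_t flags)
    {
            char *buf = kmalloc(want, flags);

            if (!buf)
                    return NULL;
            /*
             * kmalloc() rounds the request up to the general cache that
             * served it, so ksize(buf) >= want; the slack may be used
             * without reallocating.
             */
            memset(buf, 0, ksize(buf));
            return buf;
    }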
4457 4469