Doug / smarc-fsl-linux-kernel

1

/*

1

/*

2

* mm/percpu.c - percpu memory allocator

2

* mm/percpu.c - percpu memory allocator

3

*

3

*

4

5

6

*

6

*

7

* This file is released under the GPLv2.

7

* This file is released under the GPLv2.

8

*

8

*

9

* This is percpu allocator which can handle both static and dynamic

9

* This is percpu allocator which can handle both static and dynamic

10

* areas. Percpu areas are allocated in chunks. Each chunk is

10

* areas. Percpu areas are allocated in chunks. Each chunk is

11

* consisted of boot-time determined number of units and the first

11

* consisted of boot-time determined number of units and the first

12

* chunk is used for static percpu variables in the kernel image

12

* chunk is used for static percpu variables in the kernel image

13

* (special boot time alloc/init handling necessary as these areas

13

* (special boot time alloc/init handling necessary as these areas

14

* need to be brought up before allocation services are running).

14

* need to be brought up before allocation services are running).

15

* Unit grows as necessary and all units grow or shrink in unison.

15

* Unit grows as necessary and all units grow or shrink in unison.

16

* When a chunk is filled up, another chunk is allocated.

16

* When a chunk is filled up, another chunk is allocated.

17

*

17

*

18

* c0 c1 c2

18

* c0 c1 c2

19

* ------------------- ------------------- ------------

19

* ------------------- ------------------- ------------

20

* | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u

20

* | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u

21

* ------------------- ...... ------------------- .... ------------

21

* ------------------- ...... ------------------- .... ------------

22

*

22

*

23

* Allocation is done in offset-size areas of single unit space. Ie,

23

* Allocation is done in offset-size areas of single unit space. Ie,

24

* an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,

24

* an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,

25

* c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to

25

* c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to

26

* cpus. On NUMA, the mapping can be non-linear and even sparse.

26

* cpus. On NUMA, the mapping can be non-linear and even sparse.

27

* Percpu access can be done by configuring percpu base registers

27

* Percpu access can be done by configuring percpu base registers

28

* according to cpu to unit mapping and pcpu_unit_size.

28

* according to cpu to unit mapping and pcpu_unit_size.

29

*

29

*

30

* There are usually many small percpu allocations many of them being

30

* There are usually many small percpu allocations many of them being

31

* as small as 4 bytes. The allocator organizes chunks into lists

31

* as small as 4 bytes. The allocator organizes chunks into lists

32

* according to free size and tries to allocate from the fullest one.

32

* according to free size and tries to allocate from the fullest one.

33

* Each chunk keeps the maximum contiguous area size hint which is

33

* Each chunk keeps the maximum contiguous area size hint which is

34

* guaranteed to be equal to or larger than the maximum contiguous

34

* guaranteed to be equal to or larger than the maximum contiguous

35

* area in the chunk. This helps the allocator not to iterate the

35

* area in the chunk. This helps the allocator not to iterate the

36

* chunk maps unnecessarily.

36

* chunk maps unnecessarily.

37

*

37

*

38

* Allocation state in each chunk is kept using an array of integers

38

* Allocation state in each chunk is kept using an array of integers

39

* on chunk->map. A positive value in the map represents a free

39

* on chunk->map. A positive value in the map represents a free

40

* region and negative allocated. Allocation inside a chunk is done

40

* region and negative allocated. Allocation inside a chunk is done

41

* by scanning this map sequentially and serving the first matching

41

* by scanning this map sequentially and serving the first matching

42

* entry. This is mostly copied from the percpu_modalloc() allocator.

42

* entry. This is mostly copied from the percpu_modalloc() allocator.

43

* Chunks can be determined from the address using the index field

43

* Chunks can be determined from the address using the index field

44

* in the page struct. The index field contains a pointer to the chunk.

44

* in the page struct. The index field contains a pointer to the chunk.

45

*

45

*

46

* To use this allocator, arch code should do the followings.

46

* To use this allocator, arch code should do the followings.

47

*

47

*

48

* - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate

48

* - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate

49

* regular address to percpu pointer and back if they need to be

49

* regular address to percpu pointer and back if they need to be

50

* different from the default

50

* different from the default

51

*

51

*

52

* - use pcpu_setup_first_chunk() during percpu area initialization to

52

* - use pcpu_setup_first_chunk() during percpu area initialization to

53

* setup the first chunk containing the kernel static percpu area

53

* setup the first chunk containing the kernel static percpu area

54

*/

54

*/

55

56

#include <linux/bitmap.h>

56

#include <linux/bitmap.h>

57

#include <linux/bootmem.h>

57

#include <linux/bootmem.h>

58

#include <linux/err.h>

58

#include <linux/err.h>

59

#include <linux/list.h>

59

#include <linux/list.h>

60

#include <linux/log2.h>

60

#include <linux/log2.h>

61

#include <linux/mm.h>

61

#include <linux/mm.h>

62

#include <linux/module.h>

62

#include <linux/module.h>

63

#include <linux/mutex.h>

63

#include <linux/mutex.h>

64

#include <linux/percpu.h>

64

#include <linux/percpu.h>

65

#include <linux/pfn.h>

65

#include <linux/pfn.h>

66

#include <linux/slab.h>

66

#include <linux/slab.h>

67

#include <linux/spinlock.h>

67

#include <linux/spinlock.h>

68

#include <linux/vmalloc.h>

68

#include <linux/vmalloc.h>

69

#include <linux/workqueue.h>

69

#include <linux/workqueue.h>

70

#include <linux/kmemleak.h>

70

#include <linux/kmemleak.h>

71

72

#include <asm/cacheflush.h>

72

#include <asm/cacheflush.h>

73

#include <asm/sections.h>

73

#include <asm/sections.h>

74

#include <asm/tlbflush.h>

74

#include <asm/tlbflush.h>

75

#include <asm/io.h>

75

#include <asm/io.h>

76

77

/*
 * Slot sizing.  __pcpu_size_to_slot() computes
 * max(fls(size) - PCPU_SLOT_BASE_SHIFT + 2, 1), so with a shift of 5
 * sizes 1-15 all share slot 1, sizes 16-31 use slot 2, and every
 * further power of two moves up one slot.
 */
#define PCPU_SLOT_BASE_SHIFT		5
#define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */

79

80

#ifdef CONFIG_SMP
/*
 * Default addr <-> pcpu_ptr mapping, override in asm/percpu.h if
 * necessary.  The two macros are inverses of each other: one subtracts
 * pcpu_base_addr and adds __per_cpu_start, the other does the opposite.
 * Each expansion is fully parenthesized so it behaves as a single
 * primary expression wherever it is used.
 */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)					\
	((void __percpu *)((unsigned long)(addr) -			\
			   (unsigned long)pcpu_base_addr +		\
			   (unsigned long)__per_cpu_start))
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)						\
	((void __force *)((unsigned long)(ptr) +			\
			  (unsigned long)pcpu_base_addr -		\
			  (unsigned long)__per_cpu_start))
#endif
#else	/* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)	((void __percpu *)(addr))
#define __pcpu_ptr_to_addr(ptr)		((void __force *)(ptr))
#endif	/* CONFIG_SMP */

99

100

struct pcpu_chunk {

100

struct pcpu_chunk {

101

struct list_head list; /* linked to pcpu_slot lists */

101

struct list_head list; /* linked to pcpu_slot lists */

102

int free_size; /* free bytes in the chunk */

102

int free_size; /* free bytes in the chunk */

103

int contig_hint; /* max contiguous size hint */

103

int contig_hint; /* max contiguous size hint */

104

void *base_addr; /* base address of this chunk */

104

void *base_addr; /* base address of this chunk */

105

int map_used; /* # of map entries used */

105

int map_used; /* # of map entries used */

106

int map_alloc; /* # of map entries allocated */

106

int map_alloc; /* # of map entries allocated */

107

int *map; /* allocation map */

107

int *map; /* allocation map */

108

void *data; /* chunk data */

108

void *data; /* chunk data */

109

bool immutable; /* no [de]population allowed */

109

bool immutable; /* no [de]population allowed */

110

unsigned long populated[]; /* populated bitmap */

110

unsigned long populated[]; /* populated bitmap */

111

};

111

};

112

113

static int pcpu_unit_pages __read_mostly;

113

static int pcpu_unit_pages __read_mostly;

114

static int pcpu_unit_size __read_mostly;

114

static int pcpu_unit_size __read_mostly;

115

static int pcpu_nr_units __read_mostly;

115

static int pcpu_nr_units __read_mostly;

116

static int pcpu_atom_size __read_mostly;

116

static int pcpu_atom_size __read_mostly;

117

static int pcpu_nr_slots __read_mostly;

117

static int pcpu_nr_slots __read_mostly;

118

static size_t pcpu_chunk_struct_size __read_mostly;

118

static size_t pcpu_chunk_struct_size __read_mostly;

119

120

/* cpus with the lowest and highest unit addresses */

120

/* cpus with the lowest and highest unit addresses */

121

static unsigned int pcpu_low_unit_cpu __read_mostly;

121

static unsigned int pcpu_low_unit_cpu __read_mostly;

122

static unsigned int pcpu_high_unit_cpu __read_mostly;

122

static unsigned int pcpu_high_unit_cpu __read_mostly;

123

124

/* the address of the first chunk which starts with the kernel static area */

124

/* the address of the first chunk which starts with the kernel static area */

125

void *pcpu_base_addr __read_mostly;

125

void *pcpu_base_addr __read_mostly;

126

EXPORT_SYMBOL_GPL(pcpu_base_addr);

126

EXPORT_SYMBOL_GPL(pcpu_base_addr);

127

128

static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */

128

static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */

129

const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */

129

const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */

130

131

/* group information, used for vm allocation */

131

/* group information, used for vm allocation */

132

static int pcpu_nr_groups __read_mostly;

132

static int pcpu_nr_groups __read_mostly;

133

static const unsigned long *pcpu_group_offsets __read_mostly;

133

static const unsigned long *pcpu_group_offsets __read_mostly;

134

static const size_t *pcpu_group_sizes __read_mostly;

134

static const size_t *pcpu_group_sizes __read_mostly;

135

136

/*
 * The first chunk, which always exists.  Unlike other chunks, it can
 * be allocated and mapped in several different ways and thus often
 * doesn't live in the vmalloc area.
 */
static struct pcpu_chunk *pcpu_first_chunk;

/*
 * Optional reserved chunk.  It reserves part of the first chunk and
 * serves it for reserved allocations.  The amount of reserved offset
 * is kept in pcpu_reserved_chunk_limit.  When no reserved area exists,
 * the two variables below hold NULL and 0 respectively.
 */
static struct pcpu_chunk *pcpu_reserved_chunk;
static int pcpu_reserved_chunk_limit;

152

153

/*

153

/*

154

* Synchronization rules.

154

* Synchronization rules.

155

*

155

*

156

* There are two locks - pcpu_alloc_mutex and pcpu_lock. The former

156

* There are two locks - pcpu_alloc_mutex and pcpu_lock. The former

157

* protects allocation/reclaim paths, chunks, populated bitmap and

157

* protects allocation/reclaim paths, chunks, populated bitmap and

158

* vmalloc mapping. The latter is a spinlock and protects the index

158

* vmalloc mapping. The latter is a spinlock and protects the index

159

* data structures - chunk slots, chunks and area maps in chunks.

159

* data structures - chunk slots, chunks and area maps in chunks.

160

*

160

*

161

* During allocation, pcpu_alloc_mutex is kept locked all the time and

161

* During allocation, pcpu_alloc_mutex is kept locked all the time and

162

* pcpu_lock is grabbed and released as necessary. All actual memory

162

* pcpu_lock is grabbed and released as necessary. All actual memory

163

* allocations are done using GFP_KERNEL with pcpu_lock released. In

163

* allocations are done using GFP_KERNEL with pcpu_lock released. In

164

* general, percpu memory can't be allocated with irq off but

164

* general, percpu memory can't be allocated with irq off but

165

* irqsave/restore are still used in alloc path so that it can be used

165

* irqsave/restore are still used in alloc path so that it can be used

166

* from early init path - sched_init() specifically.

166

* from early init path - sched_init() specifically.

167

*

167

*

168

* Free path accesses and alters only the index data structures, so it

168

* Free path accesses and alters only the index data structures, so it

169

* can be safely called from atomic context. When memory needs to be

169

* can be safely called from atomic context. When memory needs to be

170

* returned to the system, free path schedules reclaim_work which

170

* returned to the system, free path schedules reclaim_work which

171

* grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be

171

* grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be

172

* reclaimed, release both locks and frees the chunks. Note that it's

172

* reclaimed, release both locks and frees the chunks. Note that it's

173

* necessary to grab both locks to remove a chunk from circulation as

173

* necessary to grab both locks to remove a chunk from circulation as

174

* allocation path might be referencing the chunk with only

174

* allocation path might be referencing the chunk with only

175

* pcpu_alloc_mutex locked.

175

* pcpu_alloc_mutex locked.

176

*/

176

*/

177

static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */

177

static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */

178

static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */

178

static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */

179

180

static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */

180

static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */

181

182

/* reclaim work to release fully free chunks, scheduled from free path */

182

/* reclaim work to release fully free chunks, scheduled from free path */

183

static void pcpu_reclaim(struct work_struct *work);

183

static void pcpu_reclaim(struct work_struct *work);

184

static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);

184

static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);

185

186

static bool pcpu_addr_in_first_chunk(void *addr)

186

static bool pcpu_addr_in_first_chunk(void *addr)

187

{

187

{

188

void *first_start = pcpu_first_chunk->base_addr;

188

void *first_start = pcpu_first_chunk->base_addr;

189

190

return addr >= first_start && addr < first_start + pcpu_unit_size;

190

return addr >= first_start && addr < first_start + pcpu_unit_size;

191

}

191

}

192

193

static bool pcpu_addr_in_reserved_chunk(void *addr)

193

static bool pcpu_addr_in_reserved_chunk(void *addr)

194

{

194

{

195

void *first_start = pcpu_first_chunk->base_addr;

195

void *first_start = pcpu_first_chunk->base_addr;

196

197

return addr >= first_start &&

197

return addr >= first_start &&

198

addr < first_start + pcpu_reserved_chunk_limit;

198

addr < first_start + pcpu_reserved_chunk_limit;

199

}

199

}

200

201

static int __pcpu_size_to_slot(int size)

201

static int __pcpu_size_to_slot(int size)

202

{

202

{

203

int highbit = fls(size); /* size is in bytes */

203

int highbit = fls(size); /* size is in bytes */

204

return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);

204

return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);

205

}

205

}

206

207

static int pcpu_size_to_slot(int size)

207

static int pcpu_size_to_slot(int size)

208

{

208

{

209

if (size == pcpu_unit_size)

209

if (size == pcpu_unit_size)

210

return pcpu_nr_slots - 1;

210

return pcpu_nr_slots - 1;

211

return __pcpu_size_to_slot(size);

211

return __pcpu_size_to_slot(size);

212

}

212

}

213

214

static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)

214

static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)

215

{

215

{

216

if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))

216

if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))

217

return 0;

217

return 0;

218

219

return pcpu_size_to_slot(chunk->free_size);

219

return pcpu_size_to_slot(chunk->free_size);

220

}

220

}

221

222

/* set the pointer to a chunk in a page struct */

222

/* set the pointer to a chunk in a page struct */

223

static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)

223

static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)

224

{

224

{

225

page->index = (unsigned long)pcpu;

225

page->index = (unsigned long)pcpu;

226

}

226

}

227

228

/* obtain pointer to a chunk from a page struct */

228

/* obtain pointer to a chunk from a page struct */

229

static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)

229

static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)

230

{

230

{

231

return (struct pcpu_chunk *)page->index;

231

return (struct pcpu_chunk *)page->index;

232

}

232

}

233

234

/*
 * Flat page index of page @page_idx within @cpu's unit: the unit
 * number times pcpu_unit_pages, plus @page_idx.
 */
static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
	int unit = pcpu_unit_map[cpu];

	return unit * pcpu_unit_pages + page_idx;
}

238

239

static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,

239

static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,

240

unsigned int cpu, int page_idx)

240

unsigned int cpu, int page_idx)

241

{

241

{

242

return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +

242

return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +

243

(page_idx << PAGE_SHIFT);

243

(page_idx << PAGE_SHIFT);

244

}

244

}

245

246

/*
 * Starting the search at *@rs, set *@rs to the next clear bit in
 * @chunk->populated and *@re to the next set bit after it.  Both
 * searches are bounded by @end.
 */
static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
					   int *rs, int *re, int end)
{
	int first = find_next_zero_bit(chunk->populated, end, *rs);

	*rs = first;
	*re = find_next_bit(chunk->populated, end, first + 1);
}

252

253

/*
 * Starting the search at *@rs, set *@rs to the next set bit in
 * @chunk->populated and *@re to the next clear bit after it.  Both
 * searches are bounded by @end.
 */
static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
					 int *rs, int *re, int end)
{
	int first = find_next_bit(chunk->populated, end, *rs);

	*rs = first;
	*re = find_next_zero_bit(chunk->populated, end, first + 1);
}

259

260

/*
 * (Un)populated page region iterators.  Walk the (un)populated page
 * regions of @chunk between @start and @end.  @rs and @re must be int
 * lvalues; on each iteration they hold the start and end page index of
 * the current region.  The loop stops once @rs is no longer below @re.
 */
#define pcpu_for_each_unpop_region(chunk, rs, re, start, end)		    \
	for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
	     (rs) < (re);						    \
	     (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))

#define pcpu_for_each_pop_region(chunk, rs, re, start, end)		    \
	for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end));   \
	     (rs) < (re);						    \
	     (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))

275

276

/**

276

/**

277

* pcpu_mem_zalloc - allocate memory

277

* pcpu_mem_zalloc - allocate memory

278

* @size: bytes to allocate

278

* @size: bytes to allocate

279

*

279

*

280

* Allocate @size bytes. If @size is smaller than PAGE_SIZE,

280

* Allocate @size bytes. If @size is smaller than PAGE_SIZE,

281

* kzalloc() is used; otherwise, vzalloc() is used. The returned

281

* kzalloc() is used; otherwise, vzalloc() is used. The returned

282

* memory is always zeroed.

282

* memory is always zeroed.

283

*

283

*

284

* CONTEXT:

284

* CONTEXT:

285

* Does GFP_KERNEL allocation.

285

* Does GFP_KERNEL allocation.

286

*

286

*

287

* RETURNS:

287

* RETURNS:

288

* Pointer to the allocated area on success, NULL on failure.

288

* Pointer to the allocated area on success, NULL on failure.

289

*/

289

*/

290

static void *pcpu_mem_zalloc(size_t size)

290

static void *pcpu_mem_zalloc(size_t size)

291

{

291

{

292

if (WARN_ON_ONCE(!slab_is_available()))

292

if (WARN_ON_ONCE(!slab_is_available()))

293

return NULL;

293

return NULL;

294

295

if (size <= PAGE_SIZE)

295

if (size <= PAGE_SIZE)

296

return kzalloc(size, GFP_KERNEL);

296

return kzalloc(size, GFP_KERNEL);

297

else

297

else

298

return vzalloc(size);

298

return vzalloc(size);

299

}

299

}

300

301

/**

301

/**

302

* pcpu_mem_free - free memory

302

* pcpu_mem_free - free memory

303

* @ptr: memory to free

303

* @ptr: memory to free

304

* @size: size of the area

304

* @size: size of the area

305

*

305

*

306

* Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc().

306

* Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc().

307

*/

307

*/

308

static void pcpu_mem_free(void *ptr, size_t size)

308

static void pcpu_mem_free(void *ptr, size_t size)

309

{

309

{

310

if (size <= PAGE_SIZE)

310

if (size <= PAGE_SIZE)

311

kfree(ptr);

311

kfree(ptr);

312

else

312

else

313

vfree(ptr);

313

vfree(ptr);

314

}

314

}

315

316

/**

316

/**

317

* pcpu_chunk_relocate - put chunk in the appropriate chunk slot

317

* pcpu_chunk_relocate - put chunk in the appropriate chunk slot

318

* @chunk: chunk of interest

318

* @chunk: chunk of interest

319

* @oslot: the previous slot it was on

319

* @oslot: the previous slot it was on

320

*

320

*

321

* This function is called after an allocation or free changed @chunk.

321

* This function is called after an allocation or free changed @chunk.

322

* New slot according to the changed state is determined and @chunk is

322

* New slot according to the changed state is determined and @chunk is

323

* moved to the slot. Note that the reserved chunk is never put on

323

* moved to the slot. Note that the reserved chunk is never put on

324

* chunk slots.

324

* chunk slots.

325

*

325

*

326

* CONTEXT:

326

* CONTEXT:

327

* pcpu_lock.

327

* pcpu_lock.

328

*/

328

*/

329

static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)

329

static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)

330

{

330

{

331

int nslot = pcpu_chunk_slot(chunk);

331

int nslot = pcpu_chunk_slot(chunk);

332

333

if (chunk != pcpu_reserved_chunk && oslot != nslot) {

333

if (chunk != pcpu_reserved_chunk && oslot != nslot) {

334

if (oslot < nslot)

334

if (oslot < nslot)

335

list_move(&chunk->list, &pcpu_slot[nslot]);

335

list_move(&chunk->list, &pcpu_slot[nslot]);

336

else

336

else

337

list_move_tail(&chunk->list, &pcpu_slot[nslot]);

337

list_move_tail(&chunk->list, &pcpu_slot[nslot]);

338

}

338

}

339

}

339

}

340

341

/**

341

/**

342

* pcpu_need_to_extend - determine whether chunk area map needs to be extended

342

* pcpu_need_to_extend - determine whether chunk area map needs to be extended

343

* @chunk: chunk of interest

343

* @chunk: chunk of interest

344

*

344

*

345

* Determine whether area map of @chunk needs to be extended to

345

* Determine whether area map of @chunk needs to be extended to

346

* accommodate a new allocation.

346

* accommodate a new allocation.

347

*

347

*

348

* CONTEXT:

348

* CONTEXT:

349

* pcpu_lock.

349

* pcpu_lock.

350

*

350

*

351

* RETURNS:

351

* RETURNS:

352

* New target map allocation length if extension is necessary, 0

352

* New target map allocation length if extension is necessary, 0

353

* otherwise.

353

* otherwise.

354

*/

354

*/

355

static int pcpu_need_to_extend(struct pcpu_chunk *chunk)

355

static int pcpu_need_to_extend(struct pcpu_chunk *chunk)

356

{

356

{

357

int new_alloc;

357

int new_alloc;

358

359

if (chunk->map_alloc >= chunk->map_used + 2)

359

if (chunk->map_alloc >= chunk->map_used + 2)

360

return 0;

360

return 0;

361

362

new_alloc = PCPU_DFL_MAP_ALLOC;

362

new_alloc = PCPU_DFL_MAP_ALLOC;

363

while (new_alloc < chunk->map_used + 2)

363

while (new_alloc < chunk->map_used + 2)

364

new_alloc *= 2;

364

new_alloc *= 2;

365

366

return new_alloc;

366

return new_alloc;

367

}

367

}

368

369

/**

369

/**

370

* pcpu_extend_area_map - extend area map of a chunk

370

* pcpu_extend_area_map - extend area map of a chunk

371

* @chunk: chunk of interest

371

* @chunk: chunk of interest

372

* @new_alloc: new target allocation length of the area map

372

* @new_alloc: new target allocation length of the area map

373

*

373

*

374

* Extend area map of @chunk to have @new_alloc entries.

374

* Extend area map of @chunk to have @new_alloc entries.

375

*

375

*

376

* CONTEXT:

376

* CONTEXT:

377

* Does GFP_KERNEL allocation. Grabs and releases pcpu_lock.

377

* Does GFP_KERNEL allocation. Grabs and releases pcpu_lock.

378

*

378

*

379

* RETURNS:

379

* RETURNS:

380

* 0 on success, -errno on failure.

380

* 0 on success, -errno on failure.

381

*/

381

*/

382

static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)

382

static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)

383

{

383

{

384

int *old = NULL, *new = NULL;

384

int *old = NULL, *new = NULL;

385

size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);

385

size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);

386

unsigned long flags;

386

unsigned long flags;

387

388

new = pcpu_mem_zalloc(new_size);

388

new = pcpu_mem_zalloc(new_size);

389

if (!new)

389

if (!new)

390

return -ENOMEM;

390

return -ENOMEM;

391

392

/* acquire pcpu_lock and switch to new area map */

392

/* acquire pcpu_lock and switch to new area map */

393

spin_lock_irqsave(&pcpu_lock, flags);

393

spin_lock_irqsave(&pcpu_lock, flags);

394

395

if (new_alloc <= chunk->map_alloc)

395

if (new_alloc <= chunk->map_alloc)

396

goto out_unlock;

396

goto out_unlock;

397

398

old_size = chunk->map_alloc * sizeof(chunk->map[0]);

398

old_size = chunk->map_alloc * sizeof(chunk->map[0]);

399

old = chunk->map;

399

old = chunk->map;

400

401

memcpy(new, old, old_size);

401

memcpy(new, old, old_size);

402

403

chunk->map_alloc = new_alloc;

403

chunk->map_alloc = new_alloc;

404

chunk->map = new;

404

chunk->map = new;

405

new = NULL;

405

new = NULL;

406

407

out_unlock:

407

out_unlock:

408

spin_unlock_irqrestore(&pcpu_lock, flags);

408

spin_unlock_irqrestore(&pcpu_lock, flags);

409

410

/*

410

/*

411

* pcpu_mem_free() might end up calling vfree() which uses

411

* pcpu_mem_free() might end up calling vfree() which uses

412

* IRQ-unsafe lock and thus can't be called under pcpu_lock.

412

* IRQ-unsafe lock and thus can't be called under pcpu_lock.

413

*/

413

*/

414

pcpu_mem_free(old, old_size);

414

pcpu_mem_free(old, old_size);

415

pcpu_mem_free(new, new_size);

415

pcpu_mem_free(new, new_size);

416

417

return 0;

417

return 0;

418

}

418

}

419

420

/**

420

/**

421

* pcpu_split_block - split a map block

421

* pcpu_split_block - split a map block

422

* @chunk: chunk of interest

422

* @chunk: chunk of interest

423

* @i: index of map block to split

423

* @i: index of map block to split

424

* @head: head size in bytes (can be 0)

424

* @head: head size in bytes (can be 0)

425

* @tail: tail size in bytes (can be 0)

425

* @tail: tail size in bytes (can be 0)

426

*

426

*

427

* Split the @i'th map block into two or three blocks. If @head is

427

* Split the @i'th map block into two or three blocks. If @head is

428

* non-zero, @head bytes block is inserted before block @i moving it

428

* non-zero, @head bytes block is inserted before block @i moving it

429

* to @i+1 and reducing its size by @head bytes.

429

* to @i+1 and reducing its size by @head bytes.

430

*

430

*

431

* If @tail is non-zero, the target block, which can be @i or @i+1

431

* If @tail is non-zero, the target block, which can be @i or @i+1

432

* depending on @head, is reduced by @tail bytes and @tail byte block

432

* depending on @head, is reduced by @tail bytes and @tail byte block

433

* is inserted after the target block.

433

* is inserted after the target block.

434

*

434

*

435

* @chunk->map must have enough free slots to accommodate the split.

435

* @chunk->map must have enough free slots to accommodate the split.

436

*

436

*

437

* CONTEXT:

437

* CONTEXT:

438

* pcpu_lock.

438

* pcpu_lock.

439

*/

439

*/

440

static void pcpu_split_block(struct pcpu_chunk *chunk, int i,

440

static void pcpu_split_block(struct pcpu_chunk *chunk, int i,

441

int head, int tail)

441

int head, int tail)

442

{

442

{

443

int nr_extra = !!head + !!tail;

443

int nr_extra = !!head + !!tail;

444

445

BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);

445

BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);

446

447

/* insert new subblocks */

447

/* insert new subblocks */

448

memmove(&chunk->map[i + nr_extra], &chunk->map[i],

448

memmove(&chunk->map[i + nr_extra], &chunk->map[i],

449

sizeof(chunk->map[0]) * (chunk->map_used - i));

449

sizeof(chunk->map[0]) * (chunk->map_used - i));

450

chunk->map_used += nr_extra;

450

chunk->map_used += nr_extra;

451

452

if (head) {

452

if (head) {

453

chunk->map[i + 1] = chunk->map[i] - head;

453

chunk->map[i + 1] = chunk->map[i] - head;

454

chunk->map[i++] = head;

454

chunk->map[i++] = head;

455

}

455

}

456

if (tail) {

456

if (tail) {

457

chunk->map[i++] -= tail;

457

chunk->map[i++] -= tail;

458

chunk->map[i] = tail;

458

chunk->map[i] = tail;

459

}

459

}

460

}

460

}

461

462

/**

462

/**

463

* pcpu_alloc_area - allocate area from a pcpu_chunk

463

* pcpu_alloc_area - allocate area from a pcpu_chunk

464

* @chunk: chunk of interest

464

* @chunk: chunk of interest

465

* @size: wanted size in bytes

465

* @size: wanted size in bytes

466

* @align: wanted align

466

* @align: wanted align

467

*

467

*

468

* Try to allocate @size bytes area aligned at @align from @chunk.

468

* Try to allocate @size bytes area aligned at @align from @chunk.

469

* Note that this function only allocates the offset. It doesn't

469

* Note that this function only allocates the offset. It doesn't

470

* populate or map the area.

470

* populate or map the area.

471

*

471

*

472

* @chunk->map must have at least two free slots.

472

* @chunk->map must have at least two free slots.

473

*

473

*

474

* CONTEXT:

474

* CONTEXT:

475

* pcpu_lock.

475

* pcpu_lock.

476

*

476

*

477

* RETURNS:

477

* RETURNS:

478

* Allocated offset in @chunk on success, -1 if no matching area is

478

* Allocated offset in @chunk on success, -1 if no matching area is

479

* found.

479

* found.

480

*/

480

*/

481

static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)

481

static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)

482

{

482

{

483

int oslot = pcpu_chunk_slot(chunk);

483

int oslot = pcpu_chunk_slot(chunk);

484

int max_contig = 0;

484

int max_contig = 0;

485

int i, off;

485

int i, off;

486

487

for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {

487

for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {

488

bool is_last = i + 1 == chunk->map_used;

488

bool is_last = i + 1 == chunk->map_used;

489

int head, tail;

489

int head, tail;

490

491

/* extra for alignment requirement */

491

/* extra for alignment requirement */

492

head = ALIGN(off, align) - off;

492

head = ALIGN(off, align) - off;

493

BUG_ON(i == 0 && head != 0);

493

BUG_ON(i == 0 && head != 0);

494

495

if (chunk->map[i] < 0)

495

if (chunk->map[i] < 0)

496

continue;

496

continue;

497

if (chunk->map[i] < head + size) {

497

if (chunk->map[i] < head + size) {

498

max_contig = max(chunk->map[i], max_contig);

498

max_contig = max(chunk->map[i], max_contig);

499

continue;

499

continue;

500

}

500

}

501

502

/*

502

/*

503

* If head is small or the previous block is free,

503

* If head is small or the previous block is free,

504

* merge'em. Note that 'small' is defined as smaller

504

* merge'em. Note that 'small' is defined as smaller

505

* than sizeof(int), which is very small but isn't too

505

* than sizeof(int), which is very small but isn't too

506

* uncommon for percpu allocations.

506

* uncommon for percpu allocations.

507

*/

507

*/

508

if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {

508

if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {

509

if (chunk->map[i - 1] > 0)

509

if (chunk->map[i - 1] > 0)

510

chunk->map[i - 1] += head;

510

chunk->map[i - 1] += head;

511

else {

511

else {

512

chunk->map[i - 1] -= head;

512

chunk->map[i - 1] -= head;

513

chunk->free_size -= head;

513

chunk->free_size -= head;

514

}

514

}

515

chunk->map[i] -= head;

515

chunk->map[i] -= head;

516

off += head;

516

off += head;

517

head = 0;

517

head = 0;

518

}

518

}

519

520

/* if tail is small, just keep it around */

520

/* if tail is small, just keep it around */

521

tail = chunk->map[i] - head - size;

521

tail = chunk->map[i] - head - size;

522

if (tail < sizeof(int))

522

if (tail < sizeof(int))

523

tail = 0;

523

tail = 0;

524

525

/* split if warranted */

525

/* split if warranted */

526

if (head || tail) {

526

if (head || tail) {

527

pcpu_split_block(chunk, i, head, tail);

527

pcpu_split_block(chunk, i, head, tail);

528

if (head) {

528

if (head) {

529

i++;

529

i++;

530

off += head;

530

off += head;

531

max_contig = max(chunk->map[i - 1], max_contig);

531

max_contig = max(chunk->map[i - 1], max_contig);

532

}

532

}

533

if (tail)

533

if (tail)

534

max_contig = max(chunk->map[i + 1], max_contig);

534

max_contig = max(chunk->map[i + 1], max_contig);

535

}

535

}

536

537

/* update hint and mark allocated */

537

/* update hint and mark allocated */

538

if (is_last)

538

if (is_last)

539

chunk->contig_hint = max_contig; /* fully scanned */

539

chunk->contig_hint = max_contig; /* fully scanned */

540

else

540

else

541

chunk->contig_hint = max(chunk->contig_hint,

541

chunk->contig_hint = max(chunk->contig_hint,

542

max_contig);

542

max_contig);

543

544

chunk->free_size -= chunk->map[i];

544

chunk->free_size -= chunk->map[i];

545

chunk->map[i] = -chunk->map[i];

545

chunk->map[i] = -chunk->map[i];

546

547

pcpu_chunk_relocate(chunk, oslot);

547

pcpu_chunk_relocate(chunk, oslot);

548

return off;

548

return off;

549

}

549

}

550

551

chunk->contig_hint = max_contig; /* fully scanned */

551

chunk->contig_hint = max_contig; /* fully scanned */

552

pcpu_chunk_relocate(chunk, oslot);

552

pcpu_chunk_relocate(chunk, oslot);

553

554

/* tell the upper layer that this chunk has no matching area */

554

/* tell the upper layer that this chunk has no matching area */

555

return -1;

555

return -1;

556

}

556

}

557

558

/**

558

/**

559

* pcpu_free_area - free area to a pcpu_chunk

559

* pcpu_free_area - free area to a pcpu_chunk

560

* @chunk: chunk of interest

560

* @chunk: chunk of interest

561

* @freeme: offset of area to free

561

* @freeme: offset of area to free

562

*

562

*

563

* Free area starting from @freeme to @chunk. Note that this function

563

* Free area starting from @freeme to @chunk. Note that this function

564

* only modifies the allocation map. It doesn't depopulate or unmap

564

* only modifies the allocation map. It doesn't depopulate or unmap

565

* the area.

565

* the area.

566

*

566

*

567

* CONTEXT:

567

* CONTEXT:

568

* pcpu_lock.

568

* pcpu_lock.

569

*/

569

*/

570

static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)

570

static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)

571

{

571

{

572

int oslot = pcpu_chunk_slot(chunk);

572

int oslot = pcpu_chunk_slot(chunk);

573

int i, off;

573

int i, off;

574

575

for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))

575

for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))

576

if (off == freeme)

576

if (off == freeme)

577

break;

577

break;

578

BUG_ON(off != freeme);

578

BUG_ON(off != freeme);

579

BUG_ON(chunk->map[i] > 0);

579

BUG_ON(chunk->map[i] > 0);

580

581

chunk->map[i] = -chunk->map[i];

581

chunk->map[i] = -chunk->map[i];

582

chunk->free_size += chunk->map[i];

582

chunk->free_size += chunk->map[i];

583

584

/* merge with previous? */

584

/* merge with previous? */

585

if (i > 0 && chunk->map[i - 1] >= 0) {

585

if (i > 0 && chunk->map[i - 1] >= 0) {

586

chunk->map[i - 1] += chunk->map[i];

586

chunk->map[i - 1] += chunk->map[i];

587

chunk->map_used--;

587

chunk->map_used--;

588

memmove(&chunk->map[i], &chunk->map[i + 1],

588

memmove(&chunk->map[i], &chunk->map[i + 1],

589

(chunk->map_used - i) * sizeof(chunk->map[0]));

589

(chunk->map_used - i) * sizeof(chunk->map[0]));

590

i--;

590

i--;

591

}

591

}

592

/* merge with next? */

592

/* merge with next? */

593

if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {

593

if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {

594

chunk->map[i] += chunk->map[i + 1];

594

chunk->map[i] += chunk->map[i + 1];

595

chunk->map_used--;

595

chunk->map_used--;

596

memmove(&chunk->map[i + 1], &chunk->map[i + 2],

596

memmove(&chunk->map[i + 1], &chunk->map[i + 2],

597

(chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));

597

(chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));

598

}

598

}

599

600

chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);

600

chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);

601

pcpu_chunk_relocate(chunk, oslot);

601

pcpu_chunk_relocate(chunk, oslot);

602

}

602

}

603

604

static struct pcpu_chunk *pcpu_alloc_chunk(void)

604

static struct pcpu_chunk *pcpu_alloc_chunk(void)

605

{

605

{

606

struct pcpu_chunk *chunk;

606

struct pcpu_chunk *chunk;

607

608

chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);

608

chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);

609

if (!chunk)

609

if (!chunk)

610

return NULL;

610

return NULL;

611

612

chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *

612

chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *

613

sizeof(chunk->map[0]));

613

sizeof(chunk->map[0]));

614

if (!chunk->map) {

614

if (!chunk->map) {

615

kfree(chunk);

615

kfree(chunk);

616

return NULL;

616

return NULL;

617

}

617

}

618

619

chunk->map_alloc = PCPU_DFL_MAP_ALLOC;

619

chunk->map_alloc = PCPU_DFL_MAP_ALLOC;

620

chunk->map[chunk->map_used++] = pcpu_unit_size;

620

chunk->map[chunk->map_used++] = pcpu_unit_size;

621

622

INIT_LIST_HEAD(&chunk->list);

622

INIT_LIST_HEAD(&chunk->list);

623

chunk->free_size = pcpu_unit_size;

623

chunk->free_size = pcpu_unit_size;

624

chunk->contig_hint = pcpu_unit_size;

624

chunk->contig_hint = pcpu_unit_size;

625

626

return chunk;

626

return chunk;

627

}

627

}

628

629

static void pcpu_free_chunk(struct pcpu_chunk *chunk)

629

static void pcpu_free_chunk(struct pcpu_chunk *chunk)

630

{

630

{

631

if (!chunk)

631

if (!chunk)

632

return;

632

return;

633

pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));

633

pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));

634

kfree(chunk);

634

kfree(chunk);

635

}

635

}

636

637

/*

637

/*

638

* Chunk management implementation.

638

* Chunk management implementation.

639

*

639

*

640

* To allow different implementations, chunk alloc/free and

640

* To allow different implementations, chunk alloc/free and

641

* [de]population are implemented in a separate file which is pulled

641

* [de]population are implemented in a separate file which is pulled

642

* into this file and compiled together. The following functions

642

* into this file and compiled together. The following functions

643

* should be implemented.

643

* should be implemented.

644

*

644

*

645

* pcpu_populate_chunk - populate the specified range of a chunk

645

* pcpu_populate_chunk - populate the specified range of a chunk

646

* pcpu_depopulate_chunk - depopulate the specified range of a chunk

646

* pcpu_depopulate_chunk - depopulate the specified range of a chunk

647

* pcpu_create_chunk - create a new chunk

647

* pcpu_create_chunk - create a new chunk

648

* pcpu_destroy_chunk - destroy a chunk, always preceded by full depop

648

* pcpu_destroy_chunk - destroy a chunk, always preceded by full depop

649

* pcpu_addr_to_page - translate address to its struct page

649

* pcpu_addr_to_page - translate address to its struct page

650

* pcpu_verify_alloc_info - check alloc_info is acceptable during init

650

* pcpu_verify_alloc_info - check alloc_info is acceptable during init

651

*/

651

*/

652

static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);

652

static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);

653

static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);

653

static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);

654

static struct pcpu_chunk *pcpu_create_chunk(void);

654

static struct pcpu_chunk *pcpu_create_chunk(void);

655

static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);

655

static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);

656

static struct page *pcpu_addr_to_page(void *addr);

656

static struct page *pcpu_addr_to_page(void *addr);

657

static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);

657

static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);

658

659

#ifdef CONFIG_NEED_PER_CPU_KM

659

#ifdef CONFIG_NEED_PER_CPU_KM

660

#include "percpu-km.c"

660

#include "percpu-km.c"

661

#else

661

#else

662

#include "percpu-vm.c"

662

#include "percpu-vm.c"

663

#endif

663

#endif

664

665

/**

665

/**

666

* pcpu_chunk_addr_search - determine chunk containing specified address

666

* pcpu_chunk_addr_search - determine chunk containing specified address

667

* @addr: address for which the chunk needs to be determined.

667

* @addr: address for which the chunk needs to be determined.

668

*

668

*

669

* RETURNS:

669

* RETURNS:

670

* The address of the found chunk.

670

* The address of the found chunk.

671

*/

671

*/

672

static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)

672

static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)

673

{

673

{

674

/* is it in the first chunk? */

674

/* is it in the first chunk? */

675

if (pcpu_addr_in_first_chunk(addr)) {

675

if (pcpu_addr_in_first_chunk(addr)) {

676

/* is it in the reserved area? */

676

/* is it in the reserved area? */

677

if (pcpu_addr_in_reserved_chunk(addr))

677

if (pcpu_addr_in_reserved_chunk(addr))

678

return pcpu_reserved_chunk;

678

return pcpu_reserved_chunk;

679

return pcpu_first_chunk;

679

return pcpu_first_chunk;

680

}

680

}

681

682

/*

682

/*

683

* The address is relative to unit0 which might be unused and

683

* The address is relative to unit0 which might be unused and

684

* thus unmapped. Offset the address to the unit space of the

684

* thus unmapped. Offset the address to the unit space of the

685

* current processor before looking it up in the vmalloc

685

* current processor before looking it up in the vmalloc

686

* space. Note that any possible cpu id can be used here, so

686

* space. Note that any possible cpu id can be used here, so

687

* there's no need to worry about preemption or cpu hotplug.

687

* there's no need to worry about preemption or cpu hotplug.

688

*/

688

*/

689

addr += pcpu_unit_offsets[raw_smp_processor_id()];

689

addr += pcpu_unit_offsets[raw_smp_processor_id()];

690

return pcpu_get_page_chunk(pcpu_addr_to_page(addr));

690

return pcpu_get_page_chunk(pcpu_addr_to_page(addr));

691

}

691

}

692

693

/**

693

/**

694

* pcpu_alloc - the percpu allocator

694

* pcpu_alloc - the percpu allocator

695

* @size: size of area to allocate in bytes

695

* @size: size of area to allocate in bytes

696

* @align: alignment of area (max PAGE_SIZE)

696

* @align: alignment of area (max PAGE_SIZE)

697

* @reserved: allocate from the reserved chunk if available

697

* @reserved: allocate from the reserved chunk if available

698

*

698

*

699

* Allocate percpu area of @size bytes aligned at @align.

699

* Allocate percpu area of @size bytes aligned at @align.

700

*

700

*

701

* CONTEXT:

701

* CONTEXT:

702

* Does GFP_KERNEL allocation.

702

* Does GFP_KERNEL allocation.

703

*

703

*

704

* RETURNS:

704

* RETURNS:

705

* Percpu pointer to the allocated area on success, NULL on failure.

705

* Percpu pointer to the allocated area on success, NULL on failure.

706

*/

706

*/

707

static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)

707

static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)

708

{

708

{

709

static int warn_limit = 10;

709

static int warn_limit = 10;

710

struct pcpu_chunk *chunk;

710

struct pcpu_chunk *chunk;

711

const char *err;

711

const char *err;

712

int slot, off, new_alloc;

712

int slot, off, new_alloc;

713

unsigned long flags;

713

unsigned long flags;

714

void __percpu *ptr;

714

void __percpu *ptr;

715

716

if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {

716

if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {

717

WARN(true, "illegal size (%zu) or align (%zu) for "

717

WARN(true, "illegal size (%zu) or align (%zu) for "

718

"percpu allocation\n", size, align);

718

"percpu allocation\n", size, align);

719

return NULL;

719

return NULL;

720

}

720

}

721

722

mutex_lock(&pcpu_alloc_mutex);

722

mutex_lock(&pcpu_alloc_mutex);

723

spin_lock_irqsave(&pcpu_lock, flags);

723

spin_lock_irqsave(&pcpu_lock, flags);

724

725

/* serve reserved allocations from the reserved chunk if available */

725

/* serve reserved allocations from the reserved chunk if available */

726

if (reserved && pcpu_reserved_chunk) {

726

if (reserved && pcpu_reserved_chunk) {

727

chunk = pcpu_reserved_chunk;

727

chunk = pcpu_reserved_chunk;

728

729

if (size > chunk->contig_hint) {

729

if (size > chunk->contig_hint) {

730

err = "alloc from reserved chunk failed";

730

err = "alloc from reserved chunk failed";

731

goto fail_unlock;

731

goto fail_unlock;

732

}

732

}

733

734

while ((new_alloc = pcpu_need_to_extend(chunk))) {

734

while ((new_alloc = pcpu_need_to_extend(chunk))) {

735

spin_unlock_irqrestore(&pcpu_lock, flags);

735

spin_unlock_irqrestore(&pcpu_lock, flags);

736

if (pcpu_extend_area_map(chunk, new_alloc) < 0) {

736

if (pcpu_extend_area_map(chunk, new_alloc) < 0) {

737

err = "failed to extend area map of reserved chunk";

737

err = "failed to extend area map of reserved chunk";

738

goto fail_unlock_mutex;

738

goto fail_unlock_mutex;

739

}

739

}

740

spin_lock_irqsave(&pcpu_lock, flags);

740

spin_lock_irqsave(&pcpu_lock, flags);

741

}

741

}

742

743

off = pcpu_alloc_area(chunk, size, align);

743

off = pcpu_alloc_area(chunk, size, align);

744

if (off >= 0)

744

if (off >= 0)

745

goto area_found;

745

goto area_found;

746

747

err = "alloc from reserved chunk failed";

747

err = "alloc from reserved chunk failed";

748

goto fail_unlock;

748

goto fail_unlock;

749

}

749

}

750

751

restart:

751

restart:

752

/* search through normal chunks */

752

/* search through normal chunks */

753

for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {

753

for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {

754

list_for_each_entry(chunk, &pcpu_slot[slot], list) {

754

list_for_each_entry(chunk, &pcpu_slot[slot], list) {

755

if (size > chunk->contig_hint)

755

if (size > chunk->contig_hint)

756

continue;

756

continue;

757

758

new_alloc = pcpu_need_to_extend(chunk);

758

new_alloc = pcpu_need_to_extend(chunk);

759

if (new_alloc) {

759

if (new_alloc) {

760

spin_unlock_irqrestore(&pcpu_lock, flags);

760

spin_unlock_irqrestore(&pcpu_lock, flags);

761

if (pcpu_extend_area_map(chunk,

761

if (pcpu_extend_area_map(chunk,

762

new_alloc) < 0) {

762

new_alloc) < 0) {

763

err = "failed to extend area map";

763

err = "failed to extend area map";

764

goto fail_unlock_mutex;

764

goto fail_unlock_mutex;

765

}

765

}

766

spin_lock_irqsave(&pcpu_lock, flags);

766

spin_lock_irqsave(&pcpu_lock, flags);

767

/*

767

/*

768

* pcpu_lock has been dropped, need to

768

* pcpu_lock has been dropped, need to

769

* restart cpu_slot list walking.

769

* restart cpu_slot list walking.

770

*/

770

*/

771

goto restart;

771

goto restart;

772

}

772

}

773

774

off = pcpu_alloc_area(chunk, size, align);

774

off = pcpu_alloc_area(chunk, size, align);

775

if (off >= 0)

775

if (off >= 0)

776

goto area_found;

776

goto area_found;

777

}

777

}

778

}

778

}

779

780

/* hmmm... no space left, create a new chunk */

780

/* hmmm... no space left, create a new chunk */

781

spin_unlock_irqrestore(&pcpu_lock, flags);

781

spin_unlock_irqrestore(&pcpu_lock, flags);

782

783

chunk = pcpu_create_chunk();

783

chunk = pcpu_create_chunk();

784

if (!chunk) {

784

if (!chunk) {

785

err = "failed to allocate new chunk";

785

err = "failed to allocate new chunk";

786

goto fail_unlock_mutex;

786

goto fail_unlock_mutex;

787

}

787

}

788

789

spin_lock_irqsave(&pcpu_lock, flags);

789

spin_lock_irqsave(&pcpu_lock, flags);

790

pcpu_chunk_relocate(chunk, -1);

790

pcpu_chunk_relocate(chunk, -1);

791

goto restart;

791

goto restart;

792

793

area_found:

793

area_found:

794

spin_unlock_irqrestore(&pcpu_lock, flags);

794

spin_unlock_irqrestore(&pcpu_lock, flags);

795

796

/* populate, map and clear the area */

796

/* populate, map and clear the area */

797

if (pcpu_populate_chunk(chunk, off, size)) {

797

if (pcpu_populate_chunk(chunk, off, size)) {

798

spin_lock_irqsave(&pcpu_lock, flags);

798

spin_lock_irqsave(&pcpu_lock, flags);

799

pcpu_free_area(chunk, off);

799

pcpu_free_area(chunk, off);

800

err = "failed to populate";

800

err = "failed to populate";

801

goto fail_unlock;

801

goto fail_unlock;

802

}

802

}

803

804

mutex_unlock(&pcpu_alloc_mutex);

804

mutex_unlock(&pcpu_alloc_mutex);

805

806

/* return address relative to base address */

806

/* return address relative to base address */

807

ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);

807

ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);

808

kmemleak_alloc_percpu(ptr, size);

808

kmemleak_alloc_percpu(ptr, size);

809

return ptr;

809

return ptr;

810

811

fail_unlock:

811

fail_unlock:

812

spin_unlock_irqrestore(&pcpu_lock, flags);

812

spin_unlock_irqrestore(&pcpu_lock, flags);

813

fail_unlock_mutex:

813

fail_unlock_mutex:

814

mutex_unlock(&pcpu_alloc_mutex);

814

mutex_unlock(&pcpu_alloc_mutex);

815

if (warn_limit) {

815

if (warn_limit) {

816

pr_warning("PERCPU: allocation failed, size=%zu align=%zu, "

816

pr_warning("PERCPU: allocation failed, size=%zu align=%zu, "

817

"%s\n", size, align, err);

817

"%s\n", size, align, err);

818

dump_stack();

818

dump_stack();

819

if (!--warn_limit)

819

if (!--warn_limit)

820

pr_info("PERCPU: limit reached, disable warning\n");

820

pr_info("PERCPU: limit reached, disable warning\n");

821

}

821

}

822

return NULL;

822

return NULL;

823

}

823

}

824

825

/**

825

/**

826

* __alloc_percpu - allocate dynamic percpu area

826

* __alloc_percpu - allocate dynamic percpu area

827

* @size: size of area to allocate in bytes

827

* @size: size of area to allocate in bytes

828

* @align: alignment of area (max PAGE_SIZE)

828

* @align: alignment of area (max PAGE_SIZE)

829

*

829

*

830

* Allocate zero-filled percpu area of @size bytes aligned at @align.

830

* Allocate zero-filled percpu area of @size bytes aligned at @align.

831

* Might sleep. Might trigger writeouts.

831

* Might sleep. Might trigger writeouts.

832

*

832

*

833

* CONTEXT:

833

* CONTEXT:

834

* Does GFP_KERNEL allocation.

834

* Does GFP_KERNEL allocation.

835

*

835

*

836

* RETURNS:

836

* RETURNS:

837

* Percpu pointer to the allocated area on success, NULL on failure.

837

* Percpu pointer to the allocated area on success, NULL on failure.

838

*/

838

*/

839

void __percpu *__alloc_percpu(size_t size, size_t align)

839

void __percpu *__alloc_percpu(size_t size, size_t align)

840

{

840

{

841

return pcpu_alloc(size, align, false);

841

return pcpu_alloc(size, align, false);

842

}

842

}

843

EXPORT_SYMBOL_GPL(__alloc_percpu);

843

EXPORT_SYMBOL_GPL(__alloc_percpu);

844

845

/**

845

/**

846

* __alloc_reserved_percpu - allocate reserved percpu area

846

* __alloc_reserved_percpu - allocate reserved percpu area

847

* @size: size of area to allocate in bytes

847

* @size: size of area to allocate in bytes

848

* @align: alignment of area (max PAGE_SIZE)

848

* @align: alignment of area (max PAGE_SIZE)

849

*

849

*

850

* Allocate zero-filled percpu area of @size bytes aligned at @align

850

* Allocate zero-filled percpu area of @size bytes aligned at @align

851

* from reserved percpu area if arch has set it up; otherwise,

851

* from reserved percpu area if arch has set it up; otherwise,

852

* allocation is served from the same dynamic area. Might sleep.

852

* allocation is served from the same dynamic area. Might sleep.

853

* Might trigger writeouts.

853

* Might trigger writeouts.

854

*

854

*

855

* CONTEXT:

855

* CONTEXT:

856

* Does GFP_KERNEL allocation.

856

* Does GFP_KERNEL allocation.

857

*

857

*

858

* RETURNS:

858

* RETURNS:

859

* Percpu pointer to the allocated area on success, NULL on failure.

859

* Percpu pointer to the allocated area on success, NULL on failure.

860

*/

860

*/

861

void __percpu *__alloc_reserved_percpu(size_t size, size_t align)

861

void __percpu *__alloc_reserved_percpu(size_t size, size_t align)

862

{

862

{

863

return pcpu_alloc(size, align, true);

863

return pcpu_alloc(size, align, true);

864

}

864

}

865

866

/**

866

/**

867

* pcpu_reclaim - reclaim fully free chunks, workqueue function

867

* pcpu_reclaim - reclaim fully free chunks, workqueue function

868

* @work: unused

868

* @work: unused

869

*

869

*

870

* Reclaim all fully free chunks except for the first one.

870

* Reclaim all fully free chunks except for the first one.

871

*

871

*

872

* CONTEXT:

872

* CONTEXT:

873

* workqueue context.

873

* workqueue context.

874

*/

874

*/

875

static void pcpu_reclaim(struct work_struct *work)

875

static void pcpu_reclaim(struct work_struct *work)

876

{

876

{

877

LIST_HEAD(todo);

877

LIST_HEAD(todo);

878

struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];

878

struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];

879

struct pcpu_chunk *chunk, *next;

879

struct pcpu_chunk *chunk, *next;

880

881

mutex_lock(&pcpu_alloc_mutex);

881

mutex_lock(&pcpu_alloc_mutex);

882

spin_lock_irq(&pcpu_lock);

882

spin_lock_irq(&pcpu_lock);

883

884

list_for_each_entry_safe(chunk, next, head, list) {

884

list_for_each_entry_safe(chunk, next, head, list) {

885

WARN_ON(chunk->immutable);

885

WARN_ON(chunk->immutable);

886

887

/* spare the first one */

887

/* spare the first one */

888

if (chunk == list_first_entry(head, struct pcpu_chunk, list))

888

if (chunk == list_first_entry(head, struct pcpu_chunk, list))

889

continue;

889

continue;

890

891

list_move(&chunk->list, &todo);

891

list_move(&chunk->list, &todo);

892

}

892

}

893

894

spin_unlock_irq(&pcpu_lock);

894

spin_unlock_irq(&pcpu_lock);

895

896

list_for_each_entry_safe(chunk, next, &todo, list) {

896

list_for_each_entry_safe(chunk, next, &todo, list) {

897

pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);

897

pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);

898

pcpu_destroy_chunk(chunk);

898

pcpu_destroy_chunk(chunk);

899

}

899

}

900

901

mutex_unlock(&pcpu_alloc_mutex);

901

mutex_unlock(&pcpu_alloc_mutex);

902

}

902

}

903

904

/**

904

/**

905

* free_percpu - free percpu area

905

* free_percpu - free percpu area

906

* @ptr: pointer to area to free

906

* @ptr: pointer to area to free

907

*

907

*

908

* Free percpu area @ptr.

908

* Free percpu area @ptr.

909

*

909

*

910

* CONTEXT:

910

* CONTEXT:

911

* Can be called from atomic context.

911

* Can be called from atomic context.

912

*/

912

*/

913

void free_percpu(void __percpu *ptr)

913

void free_percpu(void __percpu *ptr)

914

{

914

{

915

void *addr;

915

void *addr;

916

struct pcpu_chunk *chunk;

916

struct pcpu_chunk *chunk;

917

unsigned long flags;

917

unsigned long flags;

918

int off;

918

int off;

919

920

if (!ptr)

920

if (!ptr)

921

return;

921

return;

922

923

kmemleak_free_percpu(ptr);

923

kmemleak_free_percpu(ptr);

924

925

addr = __pcpu_ptr_to_addr(ptr);

925

addr = __pcpu_ptr_to_addr(ptr);

926

927

spin_lock_irqsave(&pcpu_lock, flags);

927

spin_lock_irqsave(&pcpu_lock, flags);

928

929

chunk = pcpu_chunk_addr_search(addr);

929

chunk = pcpu_chunk_addr_search(addr);

930

off = addr - chunk->base_addr;

930

off = addr - chunk->base_addr;

931

932

pcpu_free_area(chunk, off);

932

pcpu_free_area(chunk, off);

933

934

/* if there are more than one fully free chunks, wake up grim reaper */

934

/* if there are more than one fully free chunks, wake up grim reaper */

935

if (chunk->free_size == pcpu_unit_size) {

935

if (chunk->free_size == pcpu_unit_size) {

936

struct pcpu_chunk *pos;

936

struct pcpu_chunk *pos;

937

938

list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)

938

list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)

939

if (pos != chunk) {

939

if (pos != chunk) {

940

schedule_work(&pcpu_reclaim_work);

940

schedule_work(&pcpu_reclaim_work);

941

break;

941

break;

942

}

942

}

943

}

943

}

944

945

spin_unlock_irqrestore(&pcpu_lock, flags);

945

spin_unlock_irqrestore(&pcpu_lock, flags);

946

}

946

}

947

EXPORT_SYMBOL_GPL(free_percpu);

947

EXPORT_SYMBOL_GPL(free_percpu);

948

949

/**
 * is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 *
 * Test whether @addr belongs to in-kernel static percpu area.  On SMP
 * the address is compared against the static range
 * (__per_cpu_end - __per_cpu_start bytes) of every possible cpu.
 * Module static percpu areas are not considered.  For those, use
 * is_module_percpu_address().
 *
 * RETURNS:
 * %true if @addr is from in-kernel static percpu area, %false otherwise.
 */
bool is_kernel_percpu_address(unsigned long addr)
{
#ifdef CONFIG_SMP
	const size_t static_size = __per_cpu_end - __per_cpu_start;
	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
	void *target = (void *)addr;
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		void *start = per_cpu_ptr(base, cpu);
		void *end = start + static_size;

		/* outside of this cpu's static range, try the next one */
		if (target < start || target >= end)
			continue;

		return true;
	}
#endif
	/* on UP, can't distinguish from other static vars, always false */
	return false;
}

977

978

/**

978

/**

979

* per_cpu_ptr_to_phys - convert translated percpu address to physical address

979

* per_cpu_ptr_to_phys - convert translated percpu address to physical address

980

* @addr: the address to be converted to physical address

980

* @addr: the address to be converted to physical address

981

*

981

*

982

* Given @addr which is dereferenceable address obtained via one of

982

* Given @addr which is dereferenceable address obtained via one of

983

* percpu access macros, this function translates it into its physical

983

* percpu access macros, this function translates it into its physical

984

* address. The caller is responsible for ensuring @addr stays valid

984

* address. The caller is responsible for ensuring @addr stays valid

985

* until this function finishes.

985

* until this function finishes.

986

*

986

*

987

* percpu allocator has special setup for the first chunk, which currently

987

* percpu allocator has special setup for the first chunk, which currently

988

* supports either embedding in linear address space or vmalloc mapping,

988

* supports either embedding in linear address space or vmalloc mapping,

989

* and, from the second one, the backing allocator (currently either vm or

989

* and, from the second one, the backing allocator (currently either vm or

990

* km) provides translation.

990

* km) provides translation.

991

*

991

*

992

* The addr can be tranlated simply without checking if it falls into the

992

* The addr can be tranlated simply without checking if it falls into the

993

* first chunk. But the current code reflects better how percpu allocator

993

* first chunk. But the current code reflects better how percpu allocator

994

* actually works, and the verification can discover both bugs in percpu

994

* actually works, and the verification can discover both bugs in percpu

995

* allocator itself and per_cpu_ptr_to_phys() callers. So we keep current

995

* allocator itself and per_cpu_ptr_to_phys() callers. So we keep current

996

* code.

996

* code.

997

*

997

*

998

* RETURNS:

998

* RETURNS:

999

* The physical address for @addr.

999

* The physical address for @addr.

1000

*/

1000

*/

1001

phys_addr_t per_cpu_ptr_to_phys(void *addr)

1001

phys_addr_t per_cpu_ptr_to_phys(void *addr)

1002

{

1002

{

1003

void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);

1003

void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);

1004

bool in_first_chunk = false;

1004

bool in_first_chunk = false;

1005

unsigned long first_low, first_high;

1005

unsigned long first_low, first_high;

1006

unsigned int cpu;

1006

unsigned int cpu;

1007

1008

/*

1008

/*

1009

* The following test on unit_low/high isn't strictly

1009

* The following test on unit_low/high isn't strictly

1010

* necessary but will speed up lookups of addresses which

1010

* necessary but will speed up lookups of addresses which

1011

* aren't in the first chunk.

1011

* aren't in the first chunk.

1012

*/

1012

*/

1013

first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);

1013

first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);

1014

first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,

1014

first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,

1015

pcpu_unit_pages);

1015

pcpu_unit_pages);

1016

if ((unsigned long)addr >= first_low &&

1016

if ((unsigned long)addr >= first_low &&

1017

(unsigned long)addr < first_high) {

1017

(unsigned long)addr < first_high) {

1018

for_each_possible_cpu(cpu) {

1018

for_each_possible_cpu(cpu) {

1019

void *start = per_cpu_ptr(base, cpu);

1019

void *start = per_cpu_ptr(base, cpu);

1020

1021

if (addr >= start && addr < start + pcpu_unit_size) {

1021

if (addr >= start && addr < start + pcpu_unit_size) {

1022

in_first_chunk = true;

1022

in_first_chunk = true;

1023

break;

1023

break;

1024

}

1024

}

1025

}

1025

}

1026

}

1026

}

1027

1028

if (in_first_chunk) {

1028

if (in_first_chunk) {

1029

if (!is_vmalloc_addr(addr))

1029

if (!is_vmalloc_addr(addr))

1030

return __pa(addr);

1030

return __pa(addr);

1031

else

1031

else

1032

return page_to_phys(vmalloc_to_page(addr)) +

1032

return page_to_phys(vmalloc_to_page(addr)) +

1033

offset_in_page(addr);

1033

offset_in_page(addr);

1034

} else

1034

} else

1035

return page_to_phys(pcpu_addr_to_page(addr)) +

1035

return page_to_phys(pcpu_addr_to_page(addr)) +

1036

offset_in_page(addr);

1036

offset_in_page(addr);

1037

}

1037

}

1038

1039

/**

1039

/**

1040

* pcpu_alloc_alloc_info - allocate percpu allocation info

1040

* pcpu_alloc_alloc_info - allocate percpu allocation info

1041

* @nr_groups: the number of groups

1041

* @nr_groups: the number of groups

1042

* @nr_units: the number of units

1042

* @nr_units: the number of units

1043

*

1043

*

1044

* Allocate ai which is large enough for @nr_groups groups containing

1044

* Allocate ai which is large enough for @nr_groups groups containing

1045

* @nr_units units. The returned ai's groups[0].cpu_map points to the

1045

* @nr_units units. The returned ai's groups[0].cpu_map points to the

1046

* cpu_map array which is long enough for @nr_units and filled with

1046

* cpu_map array which is long enough for @nr_units and filled with

1047

* NR_CPUS. It's the caller's responsibility to initialize cpu_map

1047

* NR_CPUS. It's the caller's responsibility to initialize cpu_map

1048

* pointer of other groups.

1048

* pointer of other groups.

1049

*

1049

*

1050

* RETURNS:

1050

* RETURNS:

1051

* Pointer to the allocated pcpu_alloc_info on success, NULL on

1051

* Pointer to the allocated pcpu_alloc_info on success, NULL on

1052

* failure.

1052

* failure.

1053

*/

1053

*/

1054

struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,

1054

struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,

1055

int nr_units)

1055

int nr_units)

1056

{

1056

{

1057

struct pcpu_alloc_info *ai;

1057

struct pcpu_alloc_info *ai;

1058

size_t base_size, ai_size;

1058

size_t base_size, ai_size;

1059

void *ptr;

1059

void *ptr;

1060

int unit;

1060

int unit;

1061

1062

base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),

1062

base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),

1063

__alignof__(ai->groups[0].cpu_map[0]));

1063

__alignof__(ai->groups[0].cpu_map[0]));

1064

ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);

1064

ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);

1065

1066

ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));

1066

ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));

1067

if (!ptr)

1067

if (!ptr)

1068

return NULL;

1068

return NULL;

1069

ai = ptr;

1069

ai = ptr;

1070

ptr += base_size;

1070

ptr += base_size;

1071

1072

ai->groups[0].cpu_map = ptr;

1072

ai->groups[0].cpu_map = ptr;

1073

1074

for (unit = 0; unit < nr_units; unit++)

1074

for (unit = 0; unit < nr_units; unit++)

1075

ai->groups[0].cpu_map[unit] = NR_CPUS;

1075

ai->groups[0].cpu_map[unit] = NR_CPUS;

1076

1077

ai->nr_groups = nr_groups;

1077

ai->nr_groups = nr_groups;

1078

ai->__ai_size = PFN_ALIGN(ai_size);

1078

ai->__ai_size = PFN_ALIGN(ai_size);

1079

1080

return ai;

1080

return ai;

1081

}

1081

}

1082

1083

/**

1083

/**

1084

* pcpu_free_alloc_info - free percpu allocation info

1084

* pcpu_free_alloc_info - free percpu allocation info

1085

* @ai: pcpu_alloc_info to free

1085

* @ai: pcpu_alloc_info to free

1086

*

1086

*

1087

* Free @ai which was allocated by pcpu_alloc_alloc_info().

1087

* Free @ai which was allocated by pcpu_alloc_alloc_info().

1088

*/

1088

*/

1089

void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)

1089

void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)

1090

{

1090

{

1091

free_bootmem(__pa(ai), ai->__ai_size);

1091

free_bootmem(__pa(ai), ai->__ai_size);

1092

}

1092

}

1093

1094

/**

1094

/**

1095

* pcpu_dump_alloc_info - print out information about pcpu_alloc_info

1095

* pcpu_dump_alloc_info - print out information about pcpu_alloc_info

1096

* @lvl: loglevel

1096

* @lvl: loglevel

1097

* @ai: allocation info to dump

1097

* @ai: allocation info to dump

1098

*

1098

*

1099

* Print out information about @ai using loglevel @lvl.

1099

* Print out information about @ai using loglevel @lvl.

1100

*/

1100

*/

1101

static void pcpu_dump_alloc_info(const char *lvl,

1101

static void pcpu_dump_alloc_info(const char *lvl,

1102

const struct pcpu_alloc_info *ai)

1102

const struct pcpu_alloc_info *ai)

1103

{

1103

{

1104

int group_width = 1, cpu_width = 1, width;

1104

int group_width = 1, cpu_width = 1, width;

1105

char empty_str[] = "--------";

1105

char empty_str[] = "--------";

1106

int alloc = 0, alloc_end = 0;

1106

int alloc = 0, alloc_end = 0;

1107

int group, v;

1107

int group, v;

1108

int upa, apl; /* units per alloc, allocs per line */

1108

int upa, apl; /* units per alloc, allocs per line */

1109

1110

v = ai->nr_groups;

1110

v = ai->nr_groups;

1111

while (v /= 10)

1111

while (v /= 10)

1112

group_width++;

1112

group_width++;

1113

1114

v = num_possible_cpus();

1114

v = num_possible_cpus();

1115

while (v /= 10)

1115

while (v /= 10)

1116

cpu_width++;

1116

cpu_width++;

1117

empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';

1117

empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';

1118

1119

upa = ai->alloc_size / ai->unit_size;

1119

upa = ai->alloc_size / ai->unit_size;

1120

width = upa * (cpu_width + 1) + group_width + 3;

1120

width = upa * (cpu_width + 1) + group_width + 3;

1121

apl = rounddown_pow_of_two(max(60 / width, 1));

1121

apl = rounddown_pow_of_two(max(60 / width, 1));

1122

1123

printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",

1123

printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",

1124

lvl, ai->static_size, ai->reserved_size, ai->dyn_size,

1124

lvl, ai->static_size, ai->reserved_size, ai->dyn_size,

1125

ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);

1125

ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);

1126

1127

for (group = 0; group < ai->nr_groups; group++) {

1127

for (group = 0; group < ai->nr_groups; group++) {

1128

const struct pcpu_group_info *gi = &ai->groups[group];

1128

const struct pcpu_group_info *gi = &ai->groups[group];

1129

int unit = 0, unit_end = 0;

1129

int unit = 0, unit_end = 0;

1130

1131

BUG_ON(gi->nr_units % upa);

1131

BUG_ON(gi->nr_units % upa);

1132

for (alloc_end += gi->nr_units / upa;

1132

for (alloc_end += gi->nr_units / upa;

1133

alloc < alloc_end; alloc++) {

1133

alloc < alloc_end; alloc++) {

1134

if (!(alloc % apl)) {

1134

if (!(alloc % apl)) {

1135

printk(KERN_CONT "\n");

1135

printk(KERN_CONT "\n");

1136

printk("%spcpu-alloc: ", lvl);

1136

printk("%spcpu-alloc: ", lvl);

1137

}

1137

}

1138

printk(KERN_CONT "[%0*d] ", group_width, group);

1138

printk(KERN_CONT "[%0*d] ", group_width, group);

1139

1140

for (unit_end += upa; unit < unit_end; unit++)

1140

for (unit_end += upa; unit < unit_end; unit++)

1141

if (gi->cpu_map[unit] != NR_CPUS)

1141

if (gi->cpu_map[unit] != NR_CPUS)

1142

printk(KERN_CONT "%0*d ", cpu_width,

1142

printk(KERN_CONT "%0*d ", cpu_width,

1143

gi->cpu_map[unit]);

1143

gi->cpu_map[unit]);

1144

else

1144

else

1145

printk(KERN_CONT "%s ", empty_str);

1145

printk(KERN_CONT "%s ", empty_str);

1146

}

1146

}

1147

}

1147

}

1148

printk(KERN_CONT "\n");

1148

printk(KERN_CONT "\n");

1149

}

1149

}

1150

1151

/**

1151

/**

1152

* pcpu_setup_first_chunk - initialize the first percpu chunk

1152

* pcpu_setup_first_chunk - initialize the first percpu chunk

1153

* @ai: pcpu_alloc_info describing how to percpu area is shaped

1153

* @ai: pcpu_alloc_info describing how to percpu area is shaped

1154

* @base_addr: mapped address

1154

* @base_addr: mapped address

1155

*

1155

*

1156

* Initialize the first percpu chunk which contains the kernel static

1156

* Initialize the first percpu chunk which contains the kernel static

1157

* perpcu area. This function is to be called from arch percpu area

1157

* perpcu area. This function is to be called from arch percpu area

1158

* setup path.

1158

* setup path.

1159

*

1159

*

1160

* @ai contains all information necessary to initialize the first

1160

* @ai contains all information necessary to initialize the first

1161

* chunk and prime the dynamic percpu allocator.

1161

* chunk and prime the dynamic percpu allocator.

1162

*

1162

*

1163

* @ai->static_size is the size of static percpu area.

1163

* @ai->static_size is the size of static percpu area.

1164

*

1164

*

1165

* @ai->reserved_size, if non-zero, specifies the amount of bytes to

1165

* @ai->reserved_size, if non-zero, specifies the amount of bytes to

1166

* reserve after the static area in the first chunk. This reserves

1166

* reserve after the static area in the first chunk. This reserves

1167

* the first chunk such that it's available only through reserved

1167

* the first chunk such that it's available only through reserved

1168

* percpu allocation. This is primarily used to serve module percpu

1168

* percpu allocation. This is primarily used to serve module percpu

1169

* static areas on architectures where the addressing model has

1169

* static areas on architectures where the addressing model has

1170

* limited offset range for symbol relocations to guarantee module

1170

* limited offset range for symbol relocations to guarantee module

1171

* percpu symbols fall inside the relocatable range.

1171

* percpu symbols fall inside the relocatable range.

1172

*

1172

*

1173

* @ai->dyn_size determines the number of bytes available for dynamic

1173

* @ai->dyn_size determines the number of bytes available for dynamic

1174

* allocation in the first chunk. The area between @ai->static_size +

1174

* allocation in the first chunk. The area between @ai->static_size +

1175

* @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.

1175

* @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.

1176

*

1176

*

1177

* @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE

1177

* @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE

1178

* and equal to or larger than @ai->static_size + @ai->reserved_size +

1178

* and equal to or larger than @ai->static_size + @ai->reserved_size +

1179

* @ai->dyn_size.

1179

* @ai->dyn_size.

1180

*

1180

*

1181

* @ai->atom_size is the allocation atom size and used as alignment

1181

* @ai->atom_size is the allocation atom size and used as alignment

1182

* for vm areas.

1182

* for vm areas.

1183

*

1183

*

1184

* @ai->alloc_size is the allocation size and always multiple of

1184

* @ai->alloc_size is the allocation size and always multiple of

1185

* @ai->atom_size. This is larger than @ai->atom_size if

1185

* @ai->atom_size. This is larger than @ai->atom_size if

1186

* @ai->unit_size is larger than @ai->atom_size.

1186

* @ai->unit_size is larger than @ai->atom_size.

1187

*

1187

*

1188

* @ai->nr_groups and @ai->groups describe virtual memory layout of

1188

* @ai->nr_groups and @ai->groups describe virtual memory layout of

1189

* percpu areas. Units which should be colocated are put into the

1189

* percpu areas. Units which should be colocated are put into the

1190

* same group. Dynamic VM areas will be allocated according to these

1190

* same group. Dynamic VM areas will be allocated according to these

1191

* groupings. If @ai->nr_groups is zero, a single group containing

1191

* groupings. If @ai->nr_groups is zero, a single group containing

1192

* all units is assumed.

1192

* all units is assumed.

1193

*

1193

*

1194

* The caller should have mapped the first chunk at @base_addr and

1194

* The caller should have mapped the first chunk at @base_addr and

1195

* copied static data to each unit.

1195

* copied static data to each unit.

1196

*

1196

*

1197

* If the first chunk ends up with both reserved and dynamic areas, it

1197

* If the first chunk ends up with both reserved and dynamic areas, it

1198

* is served by two chunks - one to serve the core static and reserved

1198

* is served by two chunks - one to serve the core static and reserved

1199

* areas and the other for the dynamic area. They share the same vm

1199

* areas and the other for the dynamic area. They share the same vm

1200

* and page map but uses different area allocation map to stay away

1200

* and page map but uses different area allocation map to stay away

1201

* from each other. The latter chunk is circulated in the chunk slots

1201

* from each other. The latter chunk is circulated in the chunk slots

1202

* and available for dynamic allocation like any other chunks.

1202

* and available for dynamic allocation like any other chunks.

1203

*

1203

*

1204

* RETURNS:

1204

* RETURNS:

1205

* 0 on success, -errno on failure.

1205

* 0 on success, -errno on failure.

1206

*/

1206

*/

1207

int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,

1207

int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,

1208

void *base_addr)

1208

void *base_addr)

1209

{

1209

{

1210

static char cpus_buf[4096] __initdata;

1210

static char cpus_buf[4096] __initdata;

1211

static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;

1211

static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;

1212

static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;

1212

static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;

1213

size_t dyn_size = ai->dyn_size;

1213

size_t dyn_size = ai->dyn_size;

1214

size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;

1214

size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;

1215

struct pcpu_chunk *schunk, *dchunk = NULL;

1215

struct pcpu_chunk *schunk, *dchunk = NULL;

1216

unsigned long *group_offsets;

1216

unsigned long *group_offsets;

1217

size_t *group_sizes;

1217

size_t *group_sizes;

1218

unsigned long *unit_off;

1218

unsigned long *unit_off;

1219

unsigned int cpu;

1219

unsigned int cpu;

1220

int *unit_map;

1220

int *unit_map;

1221

int group, unit, i;

1221

int group, unit, i;

1222

1223

cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);

1223

cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);

1224

1225

#define PCPU_SETUP_BUG_ON(cond) do { \

1225

#define PCPU_SETUP_BUG_ON(cond) do { \

1226

if (unlikely(cond)) { \

1226

if (unlikely(cond)) { \

1227

pr_emerg("PERCPU: failed to initialize, %s", #cond); \

1227

pr_emerg("PERCPU: failed to initialize, %s", #cond); \

1228

pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \

1228

pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \

1229

pcpu_dump_alloc_info(KERN_EMERG, ai); \

1229

pcpu_dump_alloc_info(KERN_EMERG, ai); \

1230

BUG(); \

1230

BUG(); \

1231

} \

1231

} \

1232

} while (0)

1232

} while (0)

1233

1234

/* sanity checks */

1234

/* sanity checks */

1235

PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);

1235

PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);

1236

#ifdef CONFIG_SMP

1236

#ifdef CONFIG_SMP

1237

PCPU_SETUP_BUG_ON(!ai->static_size);

1237

PCPU_SETUP_BUG_ON(!ai->static_size);

1238

PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);

1238

PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);

1239

#endif

1239

#endif

1240

PCPU_SETUP_BUG_ON(!base_addr);

1240

PCPU_SETUP_BUG_ON(!base_addr);

1241

PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);

1241

PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);

1242

PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);

1242

PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);

1243

PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);

1243

PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);

1244

PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);

1244

PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);

1245

PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);

1245

PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);

1246

PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);

1246

PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);

1247

1248

/* process group information and build config tables accordingly */

1248

/* process group information and build config tables accordingly */

1249

group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));

1249

group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));

1250

group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));

1250

group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));

1251

unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));

1251

unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));

1252

unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));

1252

unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));

1253

1254

for (cpu = 0; cpu < nr_cpu_ids; cpu++)

1254

for (cpu = 0; cpu < nr_cpu_ids; cpu++)

1255

unit_map[cpu] = UINT_MAX;

1255

unit_map[cpu] = UINT_MAX;

1256

1257

pcpu_low_unit_cpu = NR_CPUS;

1257

pcpu_low_unit_cpu = NR_CPUS;

1258

pcpu_high_unit_cpu = NR_CPUS;

1258

pcpu_high_unit_cpu = NR_CPUS;

1259

1260

for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {

1260

for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {

1261

const struct pcpu_group_info *gi = &ai->groups[group];

1261

const struct pcpu_group_info *gi = &ai->groups[group];

1262

1263

group_offsets[group] = gi->base_offset;

1263

group_offsets[group] = gi->base_offset;

1264

group_sizes[group] = gi->nr_units * ai->unit_size;

1264

group_sizes[group] = gi->nr_units * ai->unit_size;

1265

1266

for (i = 0; i < gi->nr_units; i++) {

1266

for (i = 0; i < gi->nr_units; i++) {

1267

cpu = gi->cpu_map[i];

1267

cpu = gi->cpu_map[i];

1268

if (cpu == NR_CPUS)

1268

if (cpu == NR_CPUS)

1269

continue;

1269

continue;

1270

1271

PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids);

1271

PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids);

1272

PCPU_SETUP_BUG_ON(!cpu_possible(cpu));

1272

PCPU_SETUP_BUG_ON(!cpu_possible(cpu));

1273

PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);

1273

PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);

1274

1275

unit_map[cpu] = unit + i;

1275

unit_map[cpu] = unit + i;

1276

unit_off[cpu] = gi->base_offset + i * ai->unit_size;

1276

unit_off[cpu] = gi->base_offset + i * ai->unit_size;

1277

1278

/* determine low/high unit_cpu */

1278

/* determine low/high unit_cpu */

1279

if (pcpu_low_unit_cpu == NR_CPUS ||

1279

if (pcpu_low_unit_cpu == NR_CPUS ||

1280

unit_off[cpu] < unit_off[pcpu_low_unit_cpu])

1280

unit_off[cpu] < unit_off[pcpu_low_unit_cpu])

1281

pcpu_low_unit_cpu = cpu;

1281

pcpu_low_unit_cpu = cpu;

1282

if (pcpu_high_unit_cpu == NR_CPUS ||

1282

if (pcpu_high_unit_cpu == NR_CPUS ||

1283

unit_off[cpu] > unit_off[pcpu_high_unit_cpu])

1283

unit_off[cpu] > unit_off[pcpu_high_unit_cpu])

1284

pcpu_high_unit_cpu = cpu;

1284

pcpu_high_unit_cpu = cpu;

1285

}

1285

}

1286

}

1286

}

1287

pcpu_nr_units = unit;

1287

pcpu_nr_units = unit;

1288

1289

for_each_possible_cpu(cpu)

1289

for_each_possible_cpu(cpu)

1290

PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);

1290

PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);

1291

1292

/* we're done parsing the input, undefine BUG macro and dump config */

1292

/* we're done parsing the input, undefine BUG macro and dump config */

1293

#undef PCPU_SETUP_BUG_ON

1293

#undef PCPU_SETUP_BUG_ON

1294

pcpu_dump_alloc_info(KERN_DEBUG, ai);

1294

pcpu_dump_alloc_info(KERN_DEBUG, ai);

1295

1296

pcpu_nr_groups = ai->nr_groups;

1296

pcpu_nr_groups = ai->nr_groups;

1297

pcpu_group_offsets = group_offsets;

1297

pcpu_group_offsets = group_offsets;

1298

pcpu_group_sizes = group_sizes;

1298

pcpu_group_sizes = group_sizes;

1299

pcpu_unit_map = unit_map;

1299

pcpu_unit_map = unit_map;

1300

pcpu_unit_offsets = unit_off;

1300

pcpu_unit_offsets = unit_off;

1301

1302

/* determine basic parameters */

1302

/* determine basic parameters */

1303

pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;

1303

pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;

1304

pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;

1304

pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;

1305

pcpu_atom_size = ai->atom_size;

1305

pcpu_atom_size = ai->atom_size;

1306

pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +

1306

pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +

1307

BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);

1307

BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);

1308

1309

/*

1309

/*

1310

* Allocate chunk slots. The additional last slot is for

1310

* Allocate chunk slots. The additional last slot is for

1311

* empty chunks.

1311

* empty chunks.

1312

*/

1312

*/

1313

pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;

1313

pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;

1314

pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));

1314

pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));

1315

for (i = 0; i < pcpu_nr_slots; i++)

1315

for (i = 0; i < pcpu_nr_slots; i++)

1316

INIT_LIST_HEAD(&pcpu_slot[i]);

1316

INIT_LIST_HEAD(&pcpu_slot[i]);

1317

1318

/*

1318

/*

1319

* Initialize static chunk. If reserved_size is zero, the

1319

* Initialize static chunk. If reserved_size is zero, the

1320

* static chunk covers static area + dynamic allocation area

1320

* static chunk covers static area + dynamic allocation area

1321

* in the first chunk. If reserved_size is not zero, it

1321

* in the first chunk. If reserved_size is not zero, it

1322

* covers static area + reserved area (mostly used for module

1322

* covers static area + reserved area (mostly used for module

1323

* static percpu allocation).

1323

* static percpu allocation).

1324

*/

1324

*/

1325

schunk = alloc_bootmem(pcpu_chunk_struct_size);

1325

schunk = alloc_bootmem(pcpu_chunk_struct_size);

1326

INIT_LIST_HEAD(&schunk->list);

1326

INIT_LIST_HEAD(&schunk->list);

1327

schunk->base_addr = base_addr;

1327

schunk->base_addr = base_addr;

1328

schunk->map = smap;

1328

schunk->map = smap;

1329

schunk->map_alloc = ARRAY_SIZE(smap);

1329

schunk->map_alloc = ARRAY_SIZE(smap);

1330

schunk->immutable = true;

1330

schunk->immutable = true;

1331

bitmap_fill(schunk->populated, pcpu_unit_pages);

1331

bitmap_fill(schunk->populated, pcpu_unit_pages);

1332

1333

if (ai->reserved_size) {

1333

if (ai->reserved_size) {

1334

schunk->free_size = ai->reserved_size;

1334

schunk->free_size = ai->reserved_size;

1335

pcpu_reserved_chunk = schunk;

1335

pcpu_reserved_chunk = schunk;

1336

pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;

1336

pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;

1337

} else {

1337

} else {

1338

schunk->free_size = dyn_size;

1338

schunk->free_size = dyn_size;

1339

dyn_size = 0; /* dynamic area covered */

1339

dyn_size = 0; /* dynamic area covered */

1340

}

1340

}

1341

schunk->contig_hint = schunk->free_size;

1341

schunk->contig_hint = schunk->free_size;

1342

1343

schunk->map[schunk->map_used++] = -ai->static_size;

1343

schunk->map[schunk->map_used++] = -ai->static_size;

1344

if (schunk->free_size)

1344

if (schunk->free_size)

1345

schunk->map[schunk->map_used++] = schunk->free_size;

1345

schunk->map[schunk->map_used++] = schunk->free_size;

1346

1347

/* init dynamic chunk if necessary */

1347

/* init dynamic chunk if necessary */

1348

if (dyn_size) {

1348

if (dyn_size) {

1349

dchunk = alloc_bootmem(pcpu_chunk_struct_size);

1349

dchunk = alloc_bootmem(pcpu_chunk_struct_size);

1350

INIT_LIST_HEAD(&dchunk->list);

1350

INIT_LIST_HEAD(&dchunk->list);

1351

dchunk->base_addr = base_addr;

1351

dchunk->base_addr = base_addr;

1352

dchunk->map = dmap;

1352

dchunk->map = dmap;

1353

dchunk->map_alloc = ARRAY_SIZE(dmap);

1353

dchunk->map_alloc = ARRAY_SIZE(dmap);

1354

dchunk->immutable = true;

1354

dchunk->immutable = true;

1355

bitmap_fill(dchunk->populated, pcpu_unit_pages);

1355

bitmap_fill(dchunk->populated, pcpu_unit_pages);

1356

1357

dchunk->contig_hint = dchunk->free_size = dyn_size;

1357

dchunk->contig_hint = dchunk->free_size = dyn_size;

1358

dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;

1358

dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;

1359

dchunk->map[dchunk->map_used++] = dchunk->free_size;

1359

dchunk->map[dchunk->map_used++] = dchunk->free_size;

1360

}

1360

}

1361

1362

/* link the first chunk in */

1362

/* link the first chunk in */

1363

pcpu_first_chunk = dchunk ?: schunk;

1363

pcpu_first_chunk = dchunk ?: schunk;

1364

pcpu_chunk_relocate(pcpu_first_chunk, -1);

1364

pcpu_chunk_relocate(pcpu_first_chunk, -1);

1365

1366

/* we're done */

1366

/* we're done */

1367

pcpu_base_addr = base_addr;

1367

pcpu_base_addr = base_addr;

1368

return 0;

1368

return 0;

1369

}

1369

}

1370

1371

#ifdef CONFIG_SMP

1371

#ifdef CONFIG_SMP

1372

1373

/* printable names of the first chunk allocators, indexed by enum pcpu_fc */
const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
	[PCPU_FC_AUTO]	= "auto",
	[PCPU_FC_EMBED]	= "embed",
	[PCPU_FC_PAGE]	= "page",
};

/* first chunk allocator selection, PCPU_FC_AUTO unless changed */
enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;

1380

1381

static int __init percpu_alloc_setup(char *str)

1381

static int __init percpu_alloc_setup(char *str)

1382

{

1382

{

1383

if (0)

1383

if (0)

1384

/* nada */;

1384

/* nada */;

1385

#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK

1385

#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK

1386

else if (!strcmp(str, "embed"))

1386

else if (!strcmp(str, "embed"))

1387

pcpu_chosen_fc = PCPU_FC_EMBED;

1387

pcpu_chosen_fc = PCPU_FC_EMBED;

1388

#endif

1388

#endif

1389

#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK

1389

#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK

1390

else if (!strcmp(str, "page"))

1390

else if (!strcmp(str, "page"))

1391

pcpu_chosen_fc = PCPU_FC_PAGE;

1391

pcpu_chosen_fc = PCPU_FC_PAGE;

1392

#endif

1392

#endif

1393

else

1393

else

1394

pr_warning("PERCPU: unknown allocator %s specified\n", str);

1394

pr_warning("PERCPU: unknown allocator %s specified\n", str);

1395

1396

return 0;

1396

return 0;

1397

}

1397

}

1398

early_param("percpu_alloc", percpu_alloc_setup);

1398

early_param("percpu_alloc", percpu_alloc_setup);

1399

1400

/*
 * pcpu_embed_first_chunk() is used by the generic percpu setup.
 * Build it when the arch config needs it, or when
 * CONFIG_HAVE_SETUP_PER_CPU_AREA is not set and the generic setup is
 * therefore going to be used.
 */
#if !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) || \
	defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK)
#define BUILD_EMBED_FIRST_CHUNK
#endif

/* build pcpu_page_first_chunk() iff needed by the arch config */
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
#define BUILD_PAGE_FIRST_CHUNK
#endif

1414

1415

/* pcpu_build_alloc_info() is used by both embed and page first chunk */

1415

/* pcpu_build_alloc_info() is used by both embed and page first chunk */

1416

#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)

1416

#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)

1417

/**

1417

/**

1418

* pcpu_build_alloc_info - build alloc_info considering distances between CPUs

1418

* pcpu_build_alloc_info - build alloc_info considering distances between CPUs

1419

* @reserved_size: the size of reserved percpu area in bytes

1419

* @reserved_size: the size of reserved percpu area in bytes

1420

* @dyn_size: minimum free size for dynamic allocation in bytes

1420

* @dyn_size: minimum free size for dynamic allocation in bytes

1421

* @atom_size: allocation atom size

1421

* @atom_size: allocation atom size

1422

* @cpu_distance_fn: callback to determine distance between cpus, optional

1422

* @cpu_distance_fn: callback to determine distance between cpus, optional

1423

*

1423

*

1424

* This function determines grouping of units, their mappings to cpus

1424

* This function determines grouping of units, their mappings to cpus

1425

* and other parameters considering needed percpu size, allocation

1425

* and other parameters considering needed percpu size, allocation

1426

* atom size and distances between CPUs.

1426

* atom size and distances between CPUs.

1427

*

1427

*

1428

* Groups are always mutliples of atom size and CPUs which are of

1428

* Groups are always mutliples of atom size and CPUs which are of

1429

* LOCAL_DISTANCE both ways are grouped together and share space for

1429

* LOCAL_DISTANCE both ways are grouped together and share space for

1430

* units in the same group. The returned configuration is guaranteed

1430

* units in the same group. The returned configuration is guaranteed

1431

* to have CPUs on different nodes on different groups and >=75% usage

1431

* to have CPUs on different nodes on different groups and >=75% usage

1432

* of allocated virtual address space.

1432

* of allocated virtual address space.

1433

*

1433

*

1434

* RETURNS:

1434

* RETURNS:

1435

* On success, pointer to the new allocation_info is returned. On

1435

* On success, pointer to the new allocation_info is returned. On

1436

* failure, ERR_PTR value is returned.

1436

* failure, ERR_PTR value is returned.

1437

*/

1437

*/

1438

static struct pcpu_alloc_info * __init pcpu_build_alloc_info(

1438

static struct pcpu_alloc_info * __init pcpu_build_alloc_info(

1439

size_t reserved_size, size_t dyn_size,

1439

size_t reserved_size, size_t dyn_size,

1440

size_t atom_size,

1440

size_t atom_size,

1441

pcpu_fc_cpu_distance_fn_t cpu_distance_fn)

1441

pcpu_fc_cpu_distance_fn_t cpu_distance_fn)

1442

{

1442

{

1443

static int group_map[NR_CPUS] __initdata;

1443

static int group_map[NR_CPUS] __initdata;

1444

static int group_cnt[NR_CPUS] __initdata;

1444

static int group_cnt[NR_CPUS] __initdata;

1445

const size_t static_size = __per_cpu_end - __per_cpu_start;

1445

const size_t static_size = __per_cpu_end - __per_cpu_start;

1446

int nr_groups = 1, nr_units = 0;

1446

int nr_groups = 1, nr_units = 0;

1447

size_t size_sum, min_unit_size, alloc_size;

1447

size_t size_sum, min_unit_size, alloc_size;

1448

int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */

1448

int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */

1449

int last_allocs, group, unit;

1449

int last_allocs, group, unit;

1450

unsigned int cpu, tcpu;

1450

unsigned int cpu, tcpu;

1451

struct pcpu_alloc_info *ai;

1451

struct pcpu_alloc_info *ai;

1452

unsigned int *cpu_map;

1452

unsigned int *cpu_map;

1453

1454

/* this function may be called multiple times */

1454

/* this function may be called multiple times */

1455

memset(group_map, 0, sizeof(group_map));

1455

memset(group_map, 0, sizeof(group_map));

1456

memset(group_cnt, 0, sizeof(group_cnt));

1456

memset(group_cnt, 0, sizeof(group_cnt));

1457

1458

/* calculate size_sum and ensure dyn_size is enough for early alloc */

1458

/* calculate size_sum and ensure dyn_size is enough for early alloc */

1459

size_sum = PFN_ALIGN(static_size + reserved_size +

1459

size_sum = PFN_ALIGN(static_size + reserved_size +

1460

max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));

1460

max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));

1461

dyn_size = size_sum - static_size - reserved_size;

1461

dyn_size = size_sum - static_size - reserved_size;

1462

1463

/*

1463

/*

1464

* Determine min_unit_size, alloc_size and max_upa such that

1464

* Determine min_unit_size, alloc_size and max_upa such that

1465

* alloc_size is multiple of atom_size and is the smallest

1465

* alloc_size is multiple of atom_size and is the smallest

1466

* which can accommodate 4k aligned segments which are equal to

1466

* which can accommodate 4k aligned segments which are equal to

1467

* or larger than min_unit_size.

1467

* or larger than min_unit_size.

1468

*/

1468

*/

1469

min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

1469

min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

1470

1471

alloc_size = roundup(min_unit_size, atom_size);

1471

alloc_size = roundup(min_unit_size, atom_size);

1472

upa = alloc_size / min_unit_size;

1472

upa = alloc_size / min_unit_size;

1473

while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))

1473

while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))

1474

upa--;

1474

upa--;

1475

max_upa = upa;

1475

max_upa = upa;

1476

1477

/* group cpus according to their proximity */

1477

/* group cpus according to their proximity */

1478

for_each_possible_cpu(cpu) {

1478

for_each_possible_cpu(cpu) {

1479

group = 0;

1479

group = 0;

1480

next_group:

1480

next_group:

1481

for_each_possible_cpu(tcpu) {

1481

for_each_possible_cpu(tcpu) {

1482

if (cpu == tcpu)

1482

if (cpu == tcpu)

1483

break;

1483

break;

1484

if (group_map[tcpu] == group && cpu_distance_fn &&

1484

if (group_map[tcpu] == group && cpu_distance_fn &&

1485

(cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||

1485

(cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||

1486

cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {

1486

cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {

1487

group++;

1487

group++;

1488

nr_groups = max(nr_groups, group + 1);

1488

nr_groups = max(nr_groups, group + 1);

1489

goto next_group;

1489

goto next_group;

1490

}

1490

}

1491

}

1491

}

1492

group_map[cpu] = group;

1492

group_map[cpu] = group;

1493

group_cnt[group]++;

1493

group_cnt[group]++;

1494

}

1494

}

1495

1496

/*

1496

/*

1497

* Expand unit size until address space usage goes over 75%

1497

* Expand unit size until address space usage goes over 75%

1498

* and then as much as possible without using more address

1498

* and then as much as possible without using more address

1499

* space.

1499

* space.

1500

*/

1500

*/

1501

last_allocs = INT_MAX;

1501

last_allocs = INT_MAX;

1502

for (upa = max_upa; upa; upa--) {

1502

for (upa = max_upa; upa; upa--) {

1503

int allocs = 0, wasted = 0;

1503

int allocs = 0, wasted = 0;

1504

1505

if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))

1505

if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))

1506

continue;

1506

continue;

1507

1508

for (group = 0; group < nr_groups; group++) {

1508

for (group = 0; group < nr_groups; group++) {

1509

int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);

1509

int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);

1510

allocs += this_allocs;

1510

allocs += this_allocs;

1511

wasted += this_allocs * upa - group_cnt[group];

1511

wasted += this_allocs * upa - group_cnt[group];

1512

}

1512

}

1513

1514

/*

1514

/*

1515

* Don't accept if wastage is over 1/3. The

1515

* Don't accept if wastage is over 1/3. The

1516

* greater-than comparison ensures upa==1 always

1516

* greater-than comparison ensures upa==1 always

1517

* passes the following check.

1517

* passes the following check.

1518

*/

1518

*/

1519

if (wasted > num_possible_cpus() / 3)

1519

if (wasted > num_possible_cpus() / 3)

1520

continue;

1520

continue;

1521

1522

/* and then don't consume more memory */

1522

/* and then don't consume more memory */

1523

if (allocs > last_allocs)

1523

if (allocs > last_allocs)

1524

break;

1524

break;

1525

last_allocs = allocs;

1525

last_allocs = allocs;

1526

best_upa = upa;

1526

best_upa = upa;

1527

}

1527

}

1528

upa = best_upa;

1528

upa = best_upa;

1529

1530

/* allocate and fill alloc_info */

1530

/* allocate and fill alloc_info */

1531

for (group = 0; group < nr_groups; group++)

1531

for (group = 0; group < nr_groups; group++)

1532

nr_units += roundup(group_cnt[group], upa);

1532

nr_units += roundup(group_cnt[group], upa);

1533

1534

ai = pcpu_alloc_alloc_info(nr_groups, nr_units);

1534

ai = pcpu_alloc_alloc_info(nr_groups, nr_units);

1535

if (!ai)

1535

if (!ai)

1536

return ERR_PTR(-ENOMEM);

1536

return ERR_PTR(-ENOMEM);

1537

cpu_map = ai->groups[0].cpu_map;

1537

cpu_map = ai->groups[0].cpu_map;

1538

1539

for (group = 0; group < nr_groups; group++) {

1539

for (group = 0; group < nr_groups; group++) {

1540

ai->groups[group].cpu_map = cpu_map;

1540

ai->groups[group].cpu_map = cpu_map;

1541

cpu_map += roundup(group_cnt[group], upa);

1541

cpu_map += roundup(group_cnt[group], upa);

1542

}

1542

}

1543

1544

ai->static_size = static_size;

1544

ai->static_size = static_size;

1545

ai->reserved_size = reserved_size;

1545

ai->reserved_size = reserved_size;

1546

ai->dyn_size = dyn_size;

1546

ai->dyn_size = dyn_size;

1547

ai->unit_size = alloc_size / upa;

1547

ai->unit_size = alloc_size / upa;

1548

ai->atom_size = atom_size;

1548

ai->atom_size = atom_size;

1549

ai->alloc_size = alloc_size;

1549

ai->alloc_size = alloc_size;

1550

1551

for (group = 0, unit = 0; group_cnt[group]; group++) {

1551

for (group = 0, unit = 0; group_cnt[group]; group++) {

1552

struct pcpu_group_info *gi = &ai->groups[group];

1552

struct pcpu_group_info *gi = &ai->groups[group];

1553

1554

/*

1554

/*

1555

* Initialize base_offset as if all groups are located

1555

* Initialize base_offset as if all groups are located

1556

* back-to-back. The caller should update this to

1556

* back-to-back. The caller should update this to

1557

* reflect actual allocation.

1557

* reflect actual allocation.

1558

*/

1558

*/

1559

gi->base_offset = unit * ai->unit_size;

1559

gi->base_offset = unit * ai->unit_size;

1560

1561

for_each_possible_cpu(cpu)

1561

for_each_possible_cpu(cpu)

1562

if (group_map[cpu] == group)

1562

if (group_map[cpu] == group)

1563

gi->cpu_map[gi->nr_units++] = cpu;

1563

gi->cpu_map[gi->nr_units++] = cpu;

1564

gi->nr_units = roundup(gi->nr_units, upa);

1564

gi->nr_units = roundup(gi->nr_units, upa);

1565

unit += gi->nr_units;

1565

unit += gi->nr_units;

1566

}

1566

}

1567

BUG_ON(unit != nr_units);

1567

BUG_ON(unit != nr_units);

1568

1569

return ai;

1569

return ai;

1570

}

1570

}

1571

#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */

1571

#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */

1572

1573

#if defined(BUILD_EMBED_FIRST_CHUNK)

1573

#if defined(BUILD_EMBED_FIRST_CHUNK)

1574

/**

1574

/**

1575

* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem

1575

* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem

1576

* @reserved_size: the size of reserved percpu area in bytes

1576

* @reserved_size: the size of reserved percpu area in bytes

1577

* @dyn_size: minimum free size for dynamic allocation in bytes

1577

* @dyn_size: minimum free size for dynamic allocation in bytes

1578

* @atom_size: allocation atom size

1578

* @atom_size: allocation atom size

1579

* @cpu_distance_fn: callback to determine distance between cpus, optional

1579

* @cpu_distance_fn: callback to determine distance between cpus, optional

1580

* @alloc_fn: function to allocate percpu page

1580

* @alloc_fn: function to allocate percpu page

1581

* @free_fn: function to free percpu page

1581

* @free_fn: function to free percpu page

1582

*

1582

*

1583

* This is a helper to ease setting up embedded first percpu chunk and

1583

* This is a helper to ease setting up embedded first percpu chunk and

1584

* can be called where pcpu_setup_first_chunk() is expected.

1584

* can be called where pcpu_setup_first_chunk() is expected.

1585

*

1585

*

1586

* If this function is used to setup the first chunk, it is allocated

1586

* If this function is used to setup the first chunk, it is allocated

1587

* by calling @alloc_fn and used as-is without being mapped into

1587

* by calling @alloc_fn and used as-is without being mapped into

1588

* vmalloc area. Allocations are always whole multiples of @atom_size

1588

* vmalloc area. Allocations are always whole multiples of @atom_size

1589

* aligned to @atom_size.

1589

* aligned to @atom_size.

1590

*

1590

*

1591

* This enables the first chunk to piggy back on the linear physical

1591

* This enables the first chunk to piggy back on the linear physical

1592

* mapping which often uses larger page size. Please note that this

1592

* mapping which often uses larger page size. Please note that this

1593

* can result in very sparse cpu->unit mapping on NUMA machines thus

1593

* can result in very sparse cpu->unit mapping on NUMA machines thus

1594

* requiring large vmalloc address space. Don't use this allocator if

1594

* requiring large vmalloc address space. Don't use this allocator if

1595

* vmalloc space is not orders of magnitude larger than distances

1595

* vmalloc space is not orders of magnitude larger than distances

1596

* between node memory addresses (ie. 32bit NUMA machines).

1596

* between node memory addresses (ie. 32bit NUMA machines).

1597

*

1597

*

1598

* @dyn_size specifies the minimum dynamic area size.

1598

* @dyn_size specifies the minimum dynamic area size.

1599

*

1599

*

1600

* If the needed size is smaller than the minimum or specified unit

1600

* If the needed size is smaller than the minimum or specified unit

1601

* size, the leftover is returned using @free_fn.

1601

* size, the leftover is returned using @free_fn.

1602

*

1602

*

1603

* RETURNS:

1603

* RETURNS:

1604

* 0 on success, -errno on failure.

1604

* 0 on success, -errno on failure.

1605

*/

1605

*/

1606

int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,

1606

int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,

1607

size_t atom_size,

1607

size_t atom_size,

1608

pcpu_fc_cpu_distance_fn_t cpu_distance_fn,

1608

pcpu_fc_cpu_distance_fn_t cpu_distance_fn,

1609

pcpu_fc_alloc_fn_t alloc_fn,

1609

pcpu_fc_alloc_fn_t alloc_fn,

1610

pcpu_fc_free_fn_t free_fn)

1610

pcpu_fc_free_fn_t free_fn)

1611

{

1611

{

1612

void *base = (void *)ULONG_MAX;

1612

void *base = (void *)ULONG_MAX;

1613

void **areas = NULL;

1613

void **areas = NULL;

1614

struct pcpu_alloc_info *ai;

1614

struct pcpu_alloc_info *ai;

1615

size_t size_sum, areas_size, max_distance;

1615

size_t size_sum, areas_size, max_distance;

1616

int group, i, rc;

1616

int group, i, rc;

1617

1618

ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,

1618

ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,

1619

cpu_distance_fn);

1619

cpu_distance_fn);

1620

if (IS_ERR(ai))

1620

if (IS_ERR(ai))

1621

return PTR_ERR(ai);

1621

return PTR_ERR(ai);

1622

1623

size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;

1623

size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;

1624

areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

1624

areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

1625

1626

areas = alloc_bootmem_nopanic(areas_size);

1626

areas = alloc_bootmem_nopanic(areas_size);

1627

if (!areas) {

1627

if (!areas) {

1628

rc = -ENOMEM;

1628

rc = -ENOMEM;

1629

goto out_free;

1629

goto out_free;

1630

}

1630

}

1631

1632

/* allocate, copy and determine base address */

1632

/* allocate, copy and determine base address */

1633

for (group = 0; group < ai->nr_groups; group++) {

1633

for (group = 0; group < ai->nr_groups; group++) {

1634

struct pcpu_group_info *gi = &ai->groups[group];

1634

struct pcpu_group_info *gi = &ai->groups[group];

1635

unsigned int cpu = NR_CPUS;

1635

unsigned int cpu = NR_CPUS;

1636

void *ptr;

1636

void *ptr;

1637

1638

for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)

1638

for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)

1639

cpu = gi->cpu_map[i];

1639

cpu = gi->cpu_map[i];

1640

BUG_ON(cpu == NR_CPUS);

1640

BUG_ON(cpu == NR_CPUS);

1641

1642

/* allocate space for the whole group */

1642

/* allocate space for the whole group */

1643

ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);

1643

ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);

1644

if (!ptr) {

1644

if (!ptr) {

1645

rc = -ENOMEM;

1645

rc = -ENOMEM;

1646

goto out_free_areas;

1646

goto out_free_areas;

1647

}

1647

}

1648

/* kmemleak tracks the percpu allocations separately */

1648

/* kmemleak tracks the percpu allocations separately */

1649

kmemleak_free(ptr);

1649

kmemleak_free(ptr);

1650

areas[group] = ptr;

1650

areas[group] = ptr;

1651

1652

base = min(ptr, base);

1652

base = min(ptr, base);

1653

}

1653

}

1654

1655

/*

1655

/*

1656

* Copy data and free unused parts. This should happen after all

1656

* Copy data and free unused parts. This should happen after all

1657

* allocations are complete; otherwise, we may end up with

1657

* allocations are complete; otherwise, we may end up with

1658

* overlapping groups.

1658

* overlapping groups.

1659

*/

1659

*/

1660

for (group = 0; group < ai->nr_groups; group++) {

1660

for (group = 0; group < ai->nr_groups; group++) {

1661

struct pcpu_group_info *gi = &ai->groups[group];

1661

struct pcpu_group_info *gi = &ai->groups[group];

1662

void *ptr = areas[group];

1662

void *ptr = areas[group];

1663

1664

for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {

1664

for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {

1665

if (gi->cpu_map[i] == NR_CPUS) {

1665

if (gi->cpu_map[i] == NR_CPUS) {

1666

/* unused unit, free whole */

1666

/* unused unit, free whole */

1667

free_fn(ptr, ai->unit_size);

1667

free_fn(ptr, ai->unit_size);

1668

continue;

1668

continue;

1669

}

1669

}

1670

/* copy and return the unused part */

1670

/* copy and return the unused part */

1671

memcpy(ptr, __per_cpu_load, ai->static_size);

1671

memcpy(ptr, __per_cpu_load, ai->static_size);

1672

free_fn(ptr + size_sum, ai->unit_size - size_sum);

1672

free_fn(ptr + size_sum, ai->unit_size - size_sum);

1673

}

1673

}

1674

}

1674

}

1675

1676

/* base address is now known, determine group base offsets */

1676

/* base address is now known, determine group base offsets */

1677

max_distance = 0;

1677

max_distance = 0;

1678

for (group = 0; group < ai->nr_groups; group++) {

1678

for (group = 0; group < ai->nr_groups; group++) {

1679

ai->groups[group].base_offset = areas[group] - base;

1679

ai->groups[group].base_offset = areas[group] - base;

1680

max_distance = max_t(size_t, max_distance,

1680

max_distance = max_t(size_t, max_distance,

1681

ai->groups[group].base_offset);

1681

ai->groups[group].base_offset);

1682

}

1682

}

1683

max_distance += ai->unit_size;

1683

max_distance += ai->unit_size;

1684

1685

/* warn if maximum distance is further than 75% of vmalloc space */

1685

/* warn if maximum distance is further than 75% of vmalloc space */

1686

if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {

1686

if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {

1687

pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "

1687

pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "

1688

"space 0x%lx\n", max_distance,

1688

"space 0x%lx\n", max_distance,

1689

(unsigned long)(VMALLOC_END - VMALLOC_START));

1689

(unsigned long)(VMALLOC_END - VMALLOC_START));

1690

#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK

1690

#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK

1691

/* and fail if we have fallback */

1691

/* and fail if we have fallback */

1692

rc = -EINVAL;

1692

rc = -EINVAL;

1693

goto out_free;

1693

goto out_free;

1694

#endif

1694

#endif

1695

}

1695

}

1696

1697

pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",

1697

pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",

1698

PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,

1698

PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,

1699

ai->dyn_size, ai->unit_size);

1699

ai->dyn_size, ai->unit_size);

1700

1701

rc = pcpu_setup_first_chunk(ai, base);

1701

rc = pcpu_setup_first_chunk(ai, base);

1702

goto out_free;

1702

goto out_free;

1703

1704

out_free_areas:

1704

out_free_areas:

1705

for (group = 0; group < ai->nr_groups; group++)

1705

for (group = 0; group < ai->nr_groups; group++)

1706

free_fn(areas[group],

1706

free_fn(areas[group],

1707

ai->groups[group].nr_units * ai->unit_size);

1707

ai->groups[group].nr_units * ai->unit_size);

1708

out_free:

1708

out_free:

1709

pcpu_free_alloc_info(ai);

1709

pcpu_free_alloc_info(ai);

1710

if (areas)

1710

if (areas)

1711

free_bootmem(__pa(areas), areas_size);

1711

free_bootmem(__pa(areas), areas_size);

1712

return rc;

1712

return rc;

1713

}

1713

}

1714

#endif /* BUILD_EMBED_FIRST_CHUNK */

1714

#endif /* BUILD_EMBED_FIRST_CHUNK */

1715

1716

#ifdef BUILD_PAGE_FIRST_CHUNK

1716

#ifdef BUILD_PAGE_FIRST_CHUNK

1717

/**

1717

/**

1718

* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages

1718

* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages

1719

* @reserved_size: the size of reserved percpu area in bytes

1719

* @reserved_size: the size of reserved percpu area in bytes

1720

* @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE

1720

* @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE

1721

* @free_fn: function to free percpu page, always called with PAGE_SIZE

1721

* @free_fn: function to free percpu page, always called with PAGE_SIZE

1722

* @populate_pte_fn: function to populate pte

1722

* @populate_pte_fn: function to populate pte

1723

*

1723

*

1724

* This is a helper to ease setting up page-remapped first percpu

1724

* This is a helper to ease setting up page-remapped first percpu

1725

* chunk and can be called where pcpu_setup_first_chunk() is expected.

1725

* chunk and can be called where pcpu_setup_first_chunk() is expected.

1726

*

1726

*

1727

* This is the basic allocator. Static percpu area is allocated

1727

* This is the basic allocator. Static percpu area is allocated

1728

* page-by-page into vmalloc area.

1728

* page-by-page into vmalloc area.

1729

*

1729

*

1730

* RETURNS:

1730

* RETURNS:

1731

* 0 on success, -errno on failure.

1731

* 0 on success, -errno on failure.

1732

*/

1732

*/

1733

int __init pcpu_page_first_chunk(size_t reserved_size,

1733

int __init pcpu_page_first_chunk(size_t reserved_size,

1734

pcpu_fc_alloc_fn_t alloc_fn,

1734

pcpu_fc_alloc_fn_t alloc_fn,

1735

pcpu_fc_free_fn_t free_fn,

1735

pcpu_fc_free_fn_t free_fn,

1736

pcpu_fc_populate_pte_fn_t populate_pte_fn)

1736

pcpu_fc_populate_pte_fn_t populate_pte_fn)

1737

{

1737

{

1738

static struct vm_struct vm;

1738

static struct vm_struct vm;

1739

struct pcpu_alloc_info *ai;

1739

struct pcpu_alloc_info *ai;

1740

char psize_str[16];

1740

char psize_str[16];

1741

int unit_pages;

1741

int unit_pages;

1742

size_t pages_size;

1742

size_t pages_size;

1743

struct page **pages;

1743

struct page **pages;

1744

int unit, i, j, rc;

1744

int unit, i, j, rc;

1745

1746

snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

1746

snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

1747

1748

ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);

1748

ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);

1749

if (IS_ERR(ai))

1749

if (IS_ERR(ai))

1750

return PTR_ERR(ai);

1750

return PTR_ERR(ai);

1751

BUG_ON(ai->nr_groups != 1);

1751

BUG_ON(ai->nr_groups != 1);

1752

BUG_ON(ai->groups[0].nr_units != num_possible_cpus());

1752

BUG_ON(ai->groups[0].nr_units != num_possible_cpus());

1753

1754

unit_pages = ai->unit_size >> PAGE_SHIFT;

1754

unit_pages = ai->unit_size >> PAGE_SHIFT;

1755

1756

/* unaligned allocations can't be freed, round up to page size */

1756

/* unaligned allocations can't be freed, round up to page size */

1757

pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *

1757

pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *

1758

sizeof(pages[0]));

1758

sizeof(pages[0]));

1759

pages = alloc_bootmem(pages_size);

1759

pages = alloc_bootmem(pages_size);

1760

1761

/* allocate pages */

1761

/* allocate pages */

1762

j = 0;

1762

j = 0;

1763

for (unit = 0; unit < num_possible_cpus(); unit++)

1763

for (unit = 0; unit < num_possible_cpus(); unit++)

1764

for (i = 0; i < unit_pages; i++) {

1764

for (i = 0; i < unit_pages; i++) {

1765

unsigned int cpu = ai->groups[0].cpu_map[unit];

1765

unsigned int cpu = ai->groups[0].cpu_map[unit];

1766

void *ptr;

1766

void *ptr;

1767

1768

ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);

1768

ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);

1769

if (!ptr) {

1769

if (!ptr) {

1770

pr_warning("PERCPU: failed to allocate %s page "

1770

pr_warning("PERCPU: failed to allocate %s page "

1771

"for cpu%u\n", psize_str, cpu);

1771

"for cpu%u\n", psize_str, cpu);

1772

goto enomem;

1772

goto enomem;

1773

}

1773

}

1774

/* kmemleak tracks the percpu allocations separately */

1774

/* kmemleak tracks the percpu allocations separately */

1775

kmemleak_free(ptr);

1775

kmemleak_free(ptr);

1776

pages[j++] = virt_to_page(ptr);

1776

pages[j++] = virt_to_page(ptr);

1777

}

1777

}

1778

1779

/* allocate vm area, map the pages and copy static data */

1779

/* allocate vm area, map the pages and copy static data */

1780

vm.flags = VM_ALLOC;

1780

vm.flags = VM_ALLOC;

1781

vm.size = num_possible_cpus() * ai->unit_size;

1781

vm.size = num_possible_cpus() * ai->unit_size;

1782

vm_area_register_early(&vm, PAGE_SIZE);

1782

vm_area_register_early(&vm, PAGE_SIZE);

1783

1784

for (unit = 0; unit < num_possible_cpus(); unit++) {

1784

for (unit = 0; unit < num_possible_cpus(); unit++) {

1785

unsigned long unit_addr =

1785

unsigned long unit_addr =

1786

(unsigned long)vm.addr + unit * ai->unit_size;

1786

(unsigned long)vm.addr + unit * ai->unit_size;

1787

1788

for (i = 0; i < unit_pages; i++)

1788

for (i = 0; i < unit_pages; i++)

1789

populate_pte_fn(unit_addr + (i << PAGE_SHIFT));

1789

populate_pte_fn(unit_addr + (i << PAGE_SHIFT));

1790

1791

/* pte already populated, the following shouldn't fail */

1791

/* pte already populated, the following shouldn't fail */

1792

rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],

1792

rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],

1793

unit_pages);

1793

unit_pages);

1794

if (rc < 0)

1794

if (rc < 0)

1795

panic("failed to map percpu area, err=%d\n", rc);

1795

panic("failed to map percpu area, err=%d\n", rc);

1796

1797

/*

1797

/*

1798

* FIXME: Archs with virtual cache should flush local

1798

* FIXME: Archs with virtual cache should flush local

1799

* cache for the linear mapping here - something

1799

* cache for the linear mapping here - something

1800

* equivalent to flush_cache_vmap() on the local cpu.

1800

* equivalent to flush_cache_vmap() on the local cpu.

1801

* flush_cache_vmap() can't be used as most supporting

1801

* flush_cache_vmap() can't be used as most supporting

1802

* data structures are not set up yet.

1802

* data structures are not set up yet.

1803

*/

1803

*/

1804

1805

/* copy static data */

1805

/* copy static data */

1806

memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);

1806

memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);

1807

}

1807

}

1808

1809

/* we're ready, commit */

1809

/* we're ready, commit */

1810

pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",

1810

pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",

1811

unit_pages, psize_str, vm.addr, ai->static_size,

1811

unit_pages, psize_str, vm.addr, ai->static_size,

1812

ai->reserved_size, ai->dyn_size);

1812

ai->reserved_size, ai->dyn_size);

1813

1814

rc = pcpu_setup_first_chunk(ai, vm.addr);

1814

rc = pcpu_setup_first_chunk(ai, vm.addr);

1815

goto out_free_ar;

1815

goto out_free_ar;

1816

1817

enomem:

1817

enomem:

1818

while (--j >= 0)

1818

while (--j >= 0)

1819

free_fn(page_address(pages[j]), PAGE_SIZE);

1819

free_fn(page_address(pages[j]), PAGE_SIZE);

1820

rc = -ENOMEM;

1820

rc = -ENOMEM;

1821

out_free_ar:

1821

out_free_ar:

1822

free_bootmem(__pa(pages), pages_size);

1822

free_bootmem(__pa(pages), pages_size);

1823

pcpu_free_alloc_info(ai);

1823

pcpu_free_alloc_info(ai);

1824

return rc;

1824

return rc;

1825

}

1825

}

1826

#endif /* BUILD_PAGE_FIRST_CHUNK */

1826

#endif /* BUILD_PAGE_FIRST_CHUNK */

1827

1828

#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA

1828

#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA

1829

/*

1829

/*

1830

* Generic SMP percpu area setup.

1830

* Generic SMP percpu area setup.

1831

*

1831

*

1832

* The embedding helper is used because its behavior closely resembles

1832

* The embedding helper is used because its behavior closely resembles

1833

* the original non-dynamic generic percpu area setup. This is

1833

* the original non-dynamic generic percpu area setup. This is

1834

* important because many archs have addressing restrictions and might

1834

* important because many archs have addressing restrictions and might

1835

* fail if the percpu area is located far away from the previous

1835

* fail if the percpu area is located far away from the previous

1836

* location. As an added bonus, in non-NUMA cases, embedding is

1836

* location. As an added bonus, in non-NUMA cases, embedding is

1837

* generally a good idea TLB-wise because percpu area can piggy back

1837

* generally a good idea TLB-wise because percpu area can piggy back

1838

* on the physical linear memory mapping which uses large page

1838

* on the physical linear memory mapping which uses large page

1839

* mappings on applicable archs.

1839

* mappings on applicable archs.

1840

*/

1840

*/

1841

unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;

1841

unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;

1842

EXPORT_SYMBOL(__per_cpu_offset);

1842

EXPORT_SYMBOL(__per_cpu_offset);

1843

1844

/*
 * Default first chunk allocation callback for the generic setup.
 * @cpu is not consulted; the request is forwarded to
 * __alloc_bootmem_nopanic() with MAX_DMA_ADDRESS as the goal and its
 * result is returned unchanged.
 */
static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
				       size_t align)
{
	void *area;

	area = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
	return area;
}

1849

1850

static void __init pcpu_dfl_fc_free(void *ptr, size_t size)

1850

static void __init pcpu_dfl_fc_free(void *ptr, size_t size)

1851

{

1851

{

1852

free_bootmem(__pa(ptr), size);

1852

free_bootmem(__pa(ptr), size);

1853

}

1853

}

1854

1855

void __init setup_per_cpu_areas(void)

1855

void __init setup_per_cpu_areas(void)

1856

{

1856

{

1857

unsigned long delta;

1857

unsigned long delta;

1858

unsigned int cpu;

1858

unsigned int cpu;

1859

int rc;

1859

int rc;

1860

1861

/*

1861

/*

1862

* Always reserve area for module percpu variables. That's

1862

* Always reserve area for module percpu variables. That's

1863

* what the legacy allocator did.

1863

* what the legacy allocator did.

1864

*/

1864

*/

1865

rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,

1865

rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,

1866

PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,

1866

PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,

1867

pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);

1867

pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);

1868

if (rc < 0)

1868

if (rc < 0)

1869

panic("Failed to initialize percpu areas.");

1869

panic("Failed to initialize percpu areas.");

1870

1871

delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;

1871

delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;

1872

for_each_possible_cpu(cpu)

1872

for_each_possible_cpu(cpu)

1873

__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];

1873

__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];

1874

}

1874

}

1875

#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */

1875

#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */

1876

1877

#else /* CONFIG_SMP */

1877

#else /* CONFIG_SMP */

1878

1879

/*

1879

/*

1880

* UP percpu area setup.

1880

* UP percpu area setup.

1881

*

1881

*

1882

* UP always uses km-based percpu allocator with identity mapping.

1882

* UP always uses km-based percpu allocator with identity mapping.

1883

* Static percpu variables are indistinguishable from the usual static

1883

* Static percpu variables are indistinguishable from the usual static

1884

* variables and don't require any special preparation.

1884

* variables and don't require any special preparation.

1885

*/

1885

*/

1886

void __init setup_per_cpu_areas(void)

1886

void __init setup_per_cpu_areas(void)

1887

{

1887

{

1888

const size_t unit_size =

1888

const size_t unit_size =

1889

roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,

1889

roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,

1890

PERCPU_DYNAMIC_RESERVE));

1890

PERCPU_DYNAMIC_RESERVE));

1891

struct pcpu_alloc_info *ai;

1891

struct pcpu_alloc_info *ai;

1892

void *fc;

1892

void *fc;

1893

1894

ai = pcpu_alloc_alloc_info(1, 1);

1894

ai = pcpu_alloc_alloc_info(1, 1);

1895

fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));

1895

fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));

1896

if (!ai || !fc)

1896

if (!ai || !fc)

1897

panic("Failed to allocate memory for percpu areas.");

1897

panic("Failed to allocate memory for percpu areas.");

1898

/* kmemleak tracks the percpu allocations separately */

1899

kmemleak_free(fc);

1898

1900

1899

ai->dyn_size = unit_size;

1901

ai->dyn_size = unit_size;

1900

ai->unit_size = unit_size;

1902

ai->unit_size = unit_size;

1901

ai->atom_size = unit_size;

1903

ai->atom_size = unit_size;

1902

ai->alloc_size = unit_size;

1904

ai->alloc_size = unit_size;

1903

ai->groups[0].nr_units = 1;

1905

ai->groups[0].nr_units = 1;

1904

ai->groups[0].cpu_map[0] = 0;

1906

ai->groups[0].cpu_map[0] = 0;

1905

1907

1906

if (pcpu_setup_first_chunk(ai, fc) < 0)

1908

if (pcpu_setup_first_chunk(ai, fc) < 0)

1907

panic("Failed to initialize percpu areas.");

1909

panic("Failed to initialize percpu areas.");

1908

}

1910

}

1909

1911

1910

#endif /* CONFIG_SMP */

1912

#endif /* CONFIG_SMP */

1911

1913

1912

/*

1914

/*

1913

* First and reserved chunks are initialized with temporary allocation

1915

* First and reserved chunks are initialized with temporary allocation

1914

* map in initdata so that they can be used before slab is online.

1916

* map in initdata so that they can be used before slab is online.

1915

* This function is called after slab is brought up and replaces those

1917

* This function is called after slab is brought up and replaces those

1916

* with properly allocated maps.

1918

* with properly allocated maps.

1917

*/

1919

*/

1918

void __init percpu_init_late(void)

1920

void __init percpu_init_late(void)

1919

{

1921

{

1920

struct pcpu_chunk *target_chunks[] =

1922

struct pcpu_chunk *target_chunks[] =

1921

{ pcpu_first_chunk, pcpu_reserved_chunk, NULL };

1923

{ pcpu_first_chunk, pcpu_reserved_chunk, NULL };

1922

struct pcpu_chunk *chunk;

1924

struct pcpu_chunk *chunk;

1923

unsigned long flags;

1925

unsigned long flags;

1924

int i;

1926

int i;

1925

1927

1926

for (i = 0; (chunk = target_chunks[i]); i++) {

1928

for (i = 0; (chunk = target_chunks[i]); i++) {

1927

int *map;

1929

int *map;

1928

const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);

1930

const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);

1929

1931

1930

BUILD_BUG_ON(size > PAGE_SIZE);

1932

BUILD_BUG_ON(size > PAGE_SIZE);

1931

1933

1932

map = pcpu_mem_zalloc(size);

1934

map = pcpu_mem_zalloc(size);

1933

BUG_ON(!map);

1935

BUG_ON(!map);

1934

1936

1935

spin_lock_irqsave(&pcpu_lock, flags);

1937

spin_lock_irqsave(&pcpu_lock, flags);

1936

memcpy(map, chunk->map, size);

1938

memcpy(map, chunk->map, size);

1937

chunk->map = map;

1939

chunk->map = map;

1938

spin_unlock_irqrestore(&pcpu_lock, flags);

1940

spin_unlock_irqrestore(&pcpu_lock, flags);

1939

}

1941

}

1940

}

1942

}

1941

1943

GITLAB

kmemleak: Fix the kmemleak tracking of the percpu areas with !SMP

 /*
  * mm/percpu.c - percpu memory allocator
  *
  * Copyright (C) 2009		SUSE Linux Products GmbH
  * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
  *
  * This file is released under the GPLv2.
  *
  * This is percpu allocator which can handle both static and dynamic
  * areas.  Percpu areas are allocated in chunks.  Each chunk
  * consists of a boot-time determined number of units and the first
  * chunk is used for static percpu variables in the kernel image
  * (special boot time alloc/init handling necessary as these areas
  * need to be brought up before allocation services are running).
  * Unit grows as necessary and all units grow or shrink in unison.
  * When a chunk is filled up, another chunk is allocated.
  *
  *  c0                           c1                         c2
  *  -------------------          -------------------        ------------
  * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
  *  -------------------  ......  -------------------  ....  ------------
  *
  * Allocation is done in offset-size areas of single unit space.  Ie,
  * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
  * c1:u1, c1:u2 and c1:u3.  On UMA, units corresponds directly to
  * cpus.  On NUMA, the mapping can be non-linear and even sparse.
  * Percpu access can be done by configuring percpu base registers
  * according to cpu to unit mapping and pcpu_unit_size.
  *
  * There are usually many small percpu allocations many of them being
  * as small as 4 bytes.  The allocator organizes chunks into lists
  * according to free size and tries to allocate from the fullest one.
  * Each chunk keeps the maximum contiguous area size hint which is
  * guaranteed to be equal to or larger than the maximum contiguous
  * area in the chunk.  This helps the allocator not to iterate the
  * chunk maps unnecessarily.
  *
  * Allocation state in each chunk is kept using an array of integers
  * on chunk->map.  A positive value in the map represents a free
  * region and negative allocated.  Allocation inside a chunk is done
  * by scanning this map sequentially and serving the first matching
  * entry.  This is mostly copied from the percpu_modalloc() allocator.
  * Chunks can be determined from the address using the index field
  * in the page struct. The index field contains a pointer to the chunk.
  *
  * To use this allocator, arch code should do the following.
  *
  * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
  *   regular address to percpu pointer and back if they need to be
  *   different from the default
  *
  * - use pcpu_setup_first_chunk() during percpu area initialization to
  *   setup the first chunk containing the kernel static percpu area
  */
 #include <linux/bitmap.h>
 #include <linux/bootmem.h>
 #include <linux/err.h>
 #include <linux/list.h>
 #include <linux/log2.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/percpu.h>
 #include <linux/pfn.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 #include <linux/kmemleak.h>
 #include <asm/cacheflush.h>
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 #define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
 #define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */
 #ifdef CONFIG_SMP
 /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
 #ifndef __addr_to_pcpu_ptr
 #define __addr_to_pcpu_ptr(addr)					\
 	(void __percpu *)((unsigned long)(addr) -			\
 			  (unsigned long)pcpu_base_addr	+		\
 			  (unsigned long)__per_cpu_start)
 #endif
 #ifndef __pcpu_ptr_to_addr
 #define __pcpu_ptr_to_addr(ptr)						\
 	(void __force *)((unsigned long)(ptr) +				\
 			 (unsigned long)pcpu_base_addr -		\
 			 (unsigned long)__per_cpu_start)
 #endif
 #else	/* CONFIG_SMP */
 /* on UP, it's always identity mapped */
 #define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
 #define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
 #endif	/* CONFIG_SMP */
 /*
  * struct pcpu_chunk - bookkeeping for one percpu chunk
  *
  * Records the chunk's free space, its allocation map (as described in
  * the file header, a positive map entry is a free region and a
  * negative one is allocated) and, in the trailing flexible array, the
  * bitmap of populated pages.
  */
 struct pcpu_chunk {
 	struct list_head	list;		/* linked to pcpu_slot lists */
 	int			free_size;	/* free bytes in the chunk */
 	int			contig_hint;	/* max contiguous size hint */
 	void			*base_addr;	/* base address of this chunk */
 	int			map_used;	/* # of map entries used */
 	int			map_alloc;	/* # of map entries allocated */
 	int			*map;		/* allocation map */
 	void			*data;		/* chunk data */
 	bool			immutable;	/* no [de]population allowed */
 	unsigned long		populated[];	/* populated bitmap */
 };
 /* pages per unit; pcpu_page_idx() multiplies the unit number by this */
 static int pcpu_unit_pages __read_mostly;
 /* bytes per unit; a freshly allocated chunk starts with this much free space */
 static int pcpu_unit_size __read_mostly;
 /* NOTE(review): presumably the number of units in a chunk - confirm */
 static int pcpu_nr_units __read_mostly;
 /* NOTE(review): presumably the allocation atom size taken from alloc info - confirm */
 static int pcpu_atom_size __read_mostly;
 /* number of pcpu_slot lists; index pcpu_nr_slots - 1 is used for unit-sized free space */
 static int pcpu_nr_slots __read_mostly;
 /* number of bytes allocated for each struct pcpu_chunk by pcpu_alloc_chunk() */
 static size_t pcpu_chunk_struct_size __read_mostly;
 /* cpus with the lowest and highest unit addresses */
 static unsigned int pcpu_low_unit_cpu __read_mostly;
 static unsigned int pcpu_high_unit_cpu __read_mostly;
 /* the address of the first chunk which starts with the kernel static area */
 void *pcpu_base_addr __read_mostly;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 static const int *pcpu_unit_map __read_mostly;		/* cpu -> unit */
 const unsigned long *pcpu_unit_offsets __read_mostly;	/* cpu -> unit offset */
 /* group information, used for vm allocation */
 static int pcpu_nr_groups __read_mostly;
 static const unsigned long *pcpu_group_offsets __read_mostly;
 static const size_t *pcpu_group_sizes __read_mostly;
 /*
  * The first chunk which always exists.  Note that unlike other
  * chunks, this one can be allocated and mapped in several different
  * ways and thus often doesn't live in the vmalloc area.
  */
 static struct pcpu_chunk *pcpu_first_chunk;
 /*
  * Optional reserved chunk.  This chunk reserves part of the first
  * chunk and serves it for reserved allocations.  The amount of
  * reserved offset is in pcpu_reserved_chunk_limit.  When reserved
  * area doesn't exist, the following variables contain NULL and 0
  * respectively.
  */
 static struct pcpu_chunk *pcpu_reserved_chunk;
 static int pcpu_reserved_chunk_limit;
 /*
  * Synchronization rules.
  *
  * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
  * protects allocation/reclaim paths, chunks, populated bitmap and
  * vmalloc mapping.  The latter is a spinlock and protects the index
  * data structures - chunk slots, chunks and area maps in chunks.
  *
  * During allocation, pcpu_alloc_mutex is kept locked all the time and
  * pcpu_lock is grabbed and released as necessary.  All actual memory
  * allocations are done using GFP_KERNEL with pcpu_lock released.  In
  * general, percpu memory can't be allocated with irq off but
  * irqsave/restore are still used in alloc path so that it can be used
  * from early init path - sched_init() specifically.
  *
  * Free path accesses and alters only the index data structures, so it
  * can be safely called from atomic context.  When memory needs to be
  * returned to the system, free path schedules reclaim_work which
  * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
  * reclaimed, release both locks and frees the chunks.  Note that it's
  * necessary to grab both locks to remove a chunk from circulation as
  * allocation path might be referencing the chunk with only
  * pcpu_alloc_mutex locked.
  */
 static DEFINE_MUTEX(pcpu_alloc_mutex);	/* protects whole alloc and reclaim */
 static DEFINE_SPINLOCK(pcpu_lock);	/* protects index data structures */
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 /* reclaim work to release fully free chunks, scheduled from free path */
 static void pcpu_reclaim(struct work_struct *work);
 static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
 static bool pcpu_addr_in_first_chunk(void *addr)
 {
 	void *first_start = pcpu_first_chunk->base_addr;
 	return addr >= first_start && addr < first_start + pcpu_unit_size;
 }
 static bool pcpu_addr_in_reserved_chunk(void *addr)
 {
 	void *first_start = pcpu_first_chunk->base_addr;
 	return addr >= first_start &&
 		addr < first_start + pcpu_reserved_chunk_limit;
 }
 /*
  * Map a size in bytes to a slot index derived from the position of its
  * highest set bit.  The result is never lower than 1.
  */
 static int __pcpu_size_to_slot(int size)
 {
 	int slot = fls(size) - PCPU_SLOT_BASE_SHIFT + 2;

 	return max(slot, 1);
 }
 /*
  * Map a size in bytes to a slot index.  A size equal to the whole unit
  * maps to the last slot; everything else goes through
  * __pcpu_size_to_slot().
  */
 static int pcpu_size_to_slot(int size)
 {
 	return size == pcpu_unit_size ? pcpu_nr_slots - 1
 				      : __pcpu_size_to_slot(size);
 }
 static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
 {
 	if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
 		return 0;
 	return pcpu_size_to_slot(chunk->free_size);
 }
 /* record the owning chunk @pcpu in @page by storing its address in page->index */
 static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
 {
 	unsigned long cookie = (unsigned long)pcpu;

 	page->index = cookie;
 }
 /* read back the chunk pointer that pcpu_set_page_chunk() stored in page->index */
 static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
 {
 	unsigned long cookie = page->index;

 	return (struct pcpu_chunk *)cookie;
 }
 /*
  * Flatten (@cpu's unit, @page_idx) into a single index: the unit number
  * of @cpu times pcpu_unit_pages, plus @page_idx.
  */
 static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
 {
 	const int unit = pcpu_unit_map[cpu];

 	return unit * pcpu_unit_pages + page_idx;
 }
 /*
  * Address of page @page_idx in @cpu's unit of @chunk: the chunk base
  * address plus @cpu's unit offset plus the page offset in bytes.
  */
 static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
 				     unsigned int cpu, int page_idx)
 {
 	unsigned long addr = (unsigned long)chunk->base_addr;

 	addr += pcpu_unit_offsets[cpu];
 	addr += page_idx << PAGE_SHIFT;
 	return addr;
 }
 /*
  * Advance *@rs to the next clear bit of @chunk's populated bitmap at or
  * after its current value, and set *@re to the next set bit after that,
  * both searches bounded by @end.
  */
 static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
 					   int *rs, int *re, int end)
 {
 	const unsigned long *bitmap = chunk->populated;

 	*rs = find_next_zero_bit(bitmap, end, *rs);
 	*re = find_next_bit(bitmap, end, *rs + 1);
 }
 /*
  * Advance *@rs to the next set bit of @chunk's populated bitmap at or
  * after its current value, and set *@re to the next clear bit after
  * that, both searches bounded by @end.
  */
 static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
 					 int *rs, int *re, int end)
 {
 	const unsigned long *bitmap = chunk->populated;

 	*rs = find_next_bit(bitmap, end, *rs);
 	*re = find_next_zero_bit(bitmap, end, *rs + 1);
 }
 /*
  * (Un)populated page region iterators.  Iterate over (un)populated
  * page regions between @start and @end in @chunk.  @rs and @re should
  * be integer variables and will be set to start and end page index of
  * the current region.
  */
 #define pcpu_for_each_unpop_region(chunk, rs, re, start, end)		    \
 	for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
 	     (rs) < (re);						    \
 	     (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
 #define pcpu_for_each_pop_region(chunk, rs, re, start, end)		    \
 	for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end));   \
 	     (rs) < (re);						    \
 	     (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
 /**
  * pcpu_mem_zalloc - allocate zeroed memory
  * @size: bytes to allocate
  *
  * Allocate @size zeroed bytes.  Requests no larger than PAGE_SIZE are
  * served by kzalloc(); larger ones by vzalloc().  Warns once and fails
  * if the slab allocator is not available yet.
  *
  * CONTEXT:
  * Does GFP_KERNEL allocation.
  *
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
 static void *pcpu_mem_zalloc(size_t size)
 {
 	if (WARN_ON_ONCE(!slab_is_available()))
 		return NULL;

 	if (size > PAGE_SIZE)
 		return vzalloc(size);

 	return kzalloc(size, GFP_KERNEL);
 }
 /**
  * pcpu_mem_free - release memory obtained from pcpu_mem_zalloc()
  * @ptr: memory to free
  * @size: size that was passed to pcpu_mem_zalloc() for @ptr
  *
  * @size selects the deallocator using the same PAGE_SIZE threshold as
  * pcpu_mem_zalloc(): kfree() up to PAGE_SIZE, vfree() above it.
  */
 static void pcpu_mem_free(void *ptr, size_t size)
 {
 	if (size > PAGE_SIZE) {
 		vfree(ptr);
 		return;
 	}

 	kfree(ptr);
 }
 /**
  * pcpu_chunk_relocate - move a chunk to the slot matching its state
  * @chunk: chunk whose state changed
  * @oslot: slot index @chunk occupied before the change
  *
  * Called after an allocation or free changed @chunk.  The slot is
  * recomputed with pcpu_chunk_slot() and, when it differs from @oslot,
  * @chunk is moved onto the new slot's list with list_move() if the new
  * slot is higher and list_move_tail() otherwise.  The reserved chunk
  * is never put on the slot lists.
  *
  * CONTEXT:
  * pcpu_lock.
  */
 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
 	const int nslot = pcpu_chunk_slot(chunk);
 	struct list_head *dst;

 	if (chunk == pcpu_reserved_chunk || oslot == nslot)
 		return;

 	dst = &pcpu_slot[nslot];
 	if (nslot > oslot)
 		list_move(&chunk->list, dst);
 	else
 		list_move_tail(&chunk->list, dst);
 }
 /**
  * pcpu_need_to_extend - check whether a chunk's area map must grow
  * @chunk: chunk of interest
  *
  * The map is considered large enough when it has room for two more
  * entries than are currently used.  Otherwise the new length is found
  * by doubling PCPU_DFL_MAP_ALLOC until that requirement is met.
  *
  * CONTEXT:
  * pcpu_lock.
  *
  * RETURNS:
  * New target map allocation length if extension is necessary, 0
  * otherwise.
  */
 static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
 {
 	const int needed = chunk->map_used + 2;
 	int new_alloc;

 	if (chunk->map_alloc >= needed)
 		return 0;

 	for (new_alloc = PCPU_DFL_MAP_ALLOC; new_alloc < needed; new_alloc *= 2)
 		;

 	return new_alloc;
 }
 /**
  * pcpu_extend_area_map - extend area map of a chunk
  * @chunk: chunk of interest
  * @new_alloc: new target allocation length of the area map
  *
  * Extend area map of @chunk to have @new_alloc entries.  If the map
  * already has at least @new_alloc entries by the time pcpu_lock is
  * taken, the map is left alone and the freshly allocated buffer is
  * discarded; this still counts as success.
  *
  * CONTEXT:
  * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
 static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
 {
 	int *old = NULL, *new = NULL;
 	size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
 	unsigned long flags;
 	/* allocate before taking pcpu_lock; the lock is only held for the swap */
 	new = pcpu_mem_zalloc(new_size);
 	if (!new)
 		return -ENOMEM;
 	/* acquire pcpu_lock and switch to new area map */
 	spin_lock_irqsave(&pcpu_lock, flags);
 	/* map is already big enough; keep it and free @new below */
 	if (new_alloc <= chunk->map_alloc)
 		goto out_unlock;
 	old_size = chunk->map_alloc * sizeof(chunk->map[0]);
 	old = chunk->map;
 	memcpy(new, old, old_size);
 	chunk->map_alloc = new_alloc;
 	chunk->map = new;
 	/* the chunk owns the buffer now; make the free of @new below a no-op */
 	new = NULL;
 out_unlock:
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 	/*
 	 * pcpu_mem_free() might end up calling vfree() which uses
 	 * IRQ-unsafe lock and thus can't be called under pcpu_lock.
 	 */
 	pcpu_mem_free(old, old_size);
 	pcpu_mem_free(new, new_size);
 	return 0;
 }
 /**
  * pcpu_split_block - split a map block
  * @chunk: chunk of interest
  * @i: index of map block to split
  * @head: head size in bytes (can be 0)
  * @tail: tail size in bytes (can be 0)
  *
  * Split the @i'th map block into two or three blocks.  If @head is
  * non-zero, @head bytes block is inserted before block @i moving it
  * to @i+1 and reducing its size by @head bytes.
  *
  * If @tail is non-zero, the target block, which can be @i or @i+1
  * depending on @head, is reduced by @tail bytes and @tail byte block
  * is inserted after the target block.
  *
  * @chunk->map must have enough free slots to accommodate the split.
  *
  * CONTEXT:
  * pcpu_lock.
  */
 static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
 			     int head, int tail)
 {
 	/* one extra map entry for each of @head and @tail that is non-zero */
 	int nr_extra = !!head + !!tail;
 	BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);
 	/* insert new subblocks */
 	memmove(&chunk->map[i + nr_extra], &chunk->map[i],
 		sizeof(chunk->map[0]) * (chunk->map_used - i));
 	chunk->map_used += nr_extra;
 	if (head) {
 		/* map[i] still holds the original size; split it into head + rest */
 		chunk->map[i + 1] = chunk->map[i] - head;
 		chunk->map[i++] = head;
 	}
 	if (tail) {
 		/* @i now indexes the target block; shrink it and place the tail after it */
 		chunk->map[i++] -= tail;
 		chunk->map[i] = tail;
 	}
 }
 /**
  * pcpu_alloc_area - allocate area from a pcpu_chunk
  * @chunk: chunk of interest
  * @size: wanted size in bytes
  * @align: wanted align
  *
  * Try to allocate @size bytes area aligned at @align from @chunk.
  * Note that this function only allocates the offset.  It doesn't
  * populate or map the area.
  *
  * The map is scanned from the start and the first free block that can
  * hold the alignment padding plus @size is used.  On both success and
  * failure @chunk->contig_hint is updated and the chunk is relocated to
  * the slot matching its new state.
  *
  * @chunk->map must have at least two free slots.
  *
  * CONTEXT:
  * pcpu_lock.
  *
  * RETURNS:
  * Allocated offset in @chunk on success, -1 if no matching area is
  * found.
  */
 static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 {
 	int oslot = pcpu_chunk_slot(chunk);
 	int max_contig = 0;
 	int i, off;
 	/* @off is the byte offset of block @i; each step adds the block's size */
 	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
 		bool is_last = i + 1 == chunk->map_used;
 		int head, tail;
 		/* extra for alignment requirement */
 		head = ALIGN(off, align) - off;
 		/* block 0 starts at offset 0, which needs no alignment padding */
 		BUG_ON(i == 0 && head != 0);
 		/* negative entry: block is allocated, skip it */
 		if (chunk->map[i] < 0)
 			continue;
 		/* free but too small; remember it for the contig hint */
 		if (chunk->map[i] < head + size) {
 			max_contig = max(chunk->map[i], max_contig);
 			continue;
 		}
 		/*
 		 * If head is small or the previous block is free,
 		 * merge'em.  Note that 'small' is defined as smaller
 		 * than sizeof(int), which is very small but isn't too
 		 * uncommon for percpu allocations.
 		 */
 		if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
 			if (chunk->map[i - 1] > 0)
 				chunk->map[i - 1] += head;
 			else {
 				/* previous block is allocated: it absorbs the padding */
 				chunk->map[i - 1] -= head;
 				chunk->free_size -= head;
 			}
 			chunk->map[i] -= head;
 			off += head;
 			head = 0;
 		}
 		/* if tail is small, just keep it around */
 		tail = chunk->map[i] - head - size;
 		if (tail < sizeof(int))
 			tail = 0;
 		/* split if warranted */
 		if (head || tail) {
 			pcpu_split_block(chunk, i, head, tail);
 			if (head) {
 				/* step over the new head block to the target block */
 				i++;
 				off += head;
 				max_contig = max(chunk->map[i - 1], max_contig);
 			}
 			if (tail)
 				max_contig = max(chunk->map[i + 1], max_contig);
 		}
 		/* update hint and mark allocated */
 		if (is_last)
 			chunk->contig_hint = max_contig; /* fully scanned */
 		else
 			chunk->contig_hint = max(chunk->contig_hint,
 						 max_contig);
 		chunk->free_size -= chunk->map[i];
 		chunk->map[i] = -chunk->map[i];
 		pcpu_chunk_relocate(chunk, oslot);
 		return off;
 	}
 	chunk->contig_hint = max_contig;	/* fully scanned */
 	pcpu_chunk_relocate(chunk, oslot);
 	/* tell the upper layer that this chunk has no matching area */
 	return -1;
 }
 /**
  * pcpu_free_area - free area to a pcpu_chunk
  * @chunk: chunk of interest
  * @freeme: offset of area to free
  *
  * Free area starting from @freeme to @chunk.  Note that this function
  * only modifies the allocation map.  It doesn't depopulate or unmap
  * the area.
  *
  * @freeme must be the exact start offset of an allocated block; the
  * function BUG()s otherwise.  The freed block is merged with free
  * neighbours and the chunk is relocated to its new slot.
  *
  * CONTEXT:
  * pcpu_lock.
  */
 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 {
 	int oslot = pcpu_chunk_slot(chunk);
 	int i, off;
 	/* walk the map until the block starting at @freeme is found */
 	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
 		if (off == freeme)
 			break;
 	BUG_ON(off != freeme);
 	BUG_ON(chunk->map[i] > 0);
 	/* flip the sign to mark the block free and account its size */
 	chunk->map[i] = -chunk->map[i];
 	chunk->free_size += chunk->map[i];
 	/* merge with previous? */
 	if (i > 0 && chunk->map[i - 1] >= 0) {
 		chunk->map[i - 1] += chunk->map[i];
 		chunk->map_used--;
 		memmove(&chunk->map[i], &chunk->map[i + 1],
 			(chunk->map_used - i) * sizeof(chunk->map[0]));
 		/* @i now refers to the merged block */
 		i--;
 	}
 	/* merge with next? */
 	if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
 		chunk->map[i] += chunk->map[i + 1];
 		chunk->map_used--;
 		memmove(&chunk->map[i + 1], &chunk->map[i + 2],
 			(chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
 	}
 	chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
 	pcpu_chunk_relocate(chunk, oslot);
 }
 /**
  * pcpu_alloc_chunk - allocate and initialize a chunk descriptor
  *
  * Allocate a struct pcpu_chunk of pcpu_chunk_struct_size bytes together
  * with an area map of PCPU_DFL_MAP_ALLOC entries.  The map starts with
  * a single free entry covering the whole unit, and free_size and
  * contig_hint are both set to pcpu_unit_size.
  *
  * CONTEXT:
  * Does GFP_KERNEL allocation through pcpu_mem_zalloc().
  *
  * RETURNS:
  * Pointer to the new chunk on success, NULL on allocation failure.
  */
 static struct pcpu_chunk *pcpu_alloc_chunk(void)
 {
 	struct pcpu_chunk *chunk;

 	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
 	if (!chunk)
 		return NULL;

 	chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
 						sizeof(chunk->map[0]));
 	if (!chunk->map) {
 		/*
 		 * The descriptor came from pcpu_mem_zalloc(), which
 		 * switches to vzalloc() above PAGE_SIZE, so it has to be
 		 * released with the matching pcpu_mem_free() rather
 		 * than an unconditional kfree().
 		 */
 		pcpu_mem_free(chunk, pcpu_chunk_struct_size);
 		return NULL;
 	}

 	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
 	/* one free block spanning the entire unit */
 	chunk->map[chunk->map_used++] = pcpu_unit_size;

 	INIT_LIST_HEAD(&chunk->list);
 	chunk->free_size = pcpu_unit_size;
 	chunk->contig_hint = pcpu_unit_size;

 	return chunk;
 }
 static void pcpu_free_chunk(struct pcpu_chunk *chunk)
 {
 	if (!chunk)
 		return;
 	pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
 	kfree(chunk);
 }
 /*
  * Chunk management implementation.
  *
  * To allow different implementations, chunk alloc/free and
  * [de]population are implemented in a separate file which is pulled
  * into this file and compiled together.  The following functions
  * should be implemented.
  *
  * pcpu_populate_chunk		- populate the specified range of a chunk
  * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
  * pcpu_create_chunk		- create a new chunk
  * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
  * pcpu_addr_to_page		- translate address to its struct page
  * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
 static struct pcpu_chunk *pcpu_create_chunk(void);
 static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
 static struct page *pcpu_addr_to_page(void *addr);
 static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
 #ifdef CONFIG_NEED_PER_CPU_KM
 #include "percpu-km.c"
 #else
 #include "percpu-vm.c"
 #endif
 /**
  * pcpu_chunk_addr_search - find the chunk that contains an address
  * @addr: address for which the chunk needs to be determined.
  *
  * Addresses inside the first chunk resolve to the reserved chunk when
  * they fall in the reserved area and to the first chunk otherwise.
  * Any other address is resolved through the chunk pointer stored in
  * its page.
  *
  * RETURNS:
  * The address of the found chunk.
  */
 static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 {
 	if (!pcpu_addr_in_first_chunk(addr)) {
 		/*
 		 * @addr is relative to unit0, which may be unused and
 		 * therefore unmapped.  Shift it into the unit of the
 		 * current processor before the page lookup.  Any
 		 * possible cpu id works here, so preemption and cpu
 		 * hotplug are not a concern.
 		 */
 		addr += pcpu_unit_offsets[raw_smp_processor_id()];
 		return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
 	}

 	/* inside the first chunk: the reserved area takes precedence */
 	return pcpu_addr_in_reserved_chunk(addr) ? pcpu_reserved_chunk
 						 : pcpu_first_chunk;
 }
 /**
  * pcpu_alloc - the percpu allocator
  * @size: size of area to allocate in bytes
  * @align: alignment of area (max PAGE_SIZE)
  * @reserved: allocate from the reserved chunk if available
  *
  * Allocate percpu area of @size bytes aligned at @align.  Requests
  * with zero @size, @size above PCPU_MIN_UNIT_SIZE or @align above
  * PAGE_SIZE are rejected with a warning.
  *
  * pcpu_alloc_mutex is held for the whole operation; pcpu_lock is
  * dropped around every step that may allocate memory (area map
  * extension, chunk creation, population).
  *
  * CONTEXT:
  * Does GFP_KERNEL allocation.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
 static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
 	/* number of failure warnings still allowed before going quiet */
 	static int warn_limit = 10;
 	struct pcpu_chunk *chunk;
 	const char *err;
 	int slot, off, new_alloc;
 	unsigned long flags;
 	void __percpu *ptr;
 	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
 		WARN(true, "illegal size (%zu) or align (%zu) for "
 		     "percpu allocation\n", size, align);
 		return NULL;
 	}
 	mutex_lock(&pcpu_alloc_mutex);
 	spin_lock_irqsave(&pcpu_lock, flags);
 	/* serve reserved allocations from the reserved chunk if available */
 	if (reserved && pcpu_reserved_chunk) {
 		chunk = pcpu_reserved_chunk;
 		if (size > chunk->contig_hint) {
 			err = "alloc from reserved chunk failed";
 			goto fail_unlock;
 		}
 		/* extending drops pcpu_lock, so re-check after each extension */
 		while ((new_alloc = pcpu_need_to_extend(chunk))) {
 			spin_unlock_irqrestore(&pcpu_lock, flags);
 			if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
 				err = "failed to extend area map of reserved chunk";
 				goto fail_unlock_mutex;
 			}
 			spin_lock_irqsave(&pcpu_lock, flags);
 		}
 		off = pcpu_alloc_area(chunk, size, align);
 		if (off >= 0)
 			goto area_found;
 		err = "alloc from reserved chunk failed";
 		goto fail_unlock;
 	}
 restart:
 	/* search through normal chunks */
 	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			if (size > chunk->contig_hint)
 				continue;
 			new_alloc = pcpu_need_to_extend(chunk);
 			if (new_alloc) {
 				spin_unlock_irqrestore(&pcpu_lock, flags);
 				if (pcpu_extend_area_map(chunk,
 							 new_alloc) < 0) {
 					err = "failed to extend area map";
 					goto fail_unlock_mutex;
 				}
 				spin_lock_irqsave(&pcpu_lock, flags);
 				/*
 				 * pcpu_lock has been dropped, need to
 				 * restart cpu_slot list walking.
 				 */
 				goto restart;
 			}
 			off = pcpu_alloc_area(chunk, size, align);
 			if (off >= 0)
 				goto area_found;
 		}
 	}
 	/* hmmm... no space left, create a new chunk */
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 	chunk = pcpu_create_chunk();
 	if (!chunk) {
 		err = "failed to allocate new chunk";
 		goto fail_unlock_mutex;
 	}
 	spin_lock_irqsave(&pcpu_lock, flags);
 	/* -1 matches no slot, so the new chunk is always linked into one */
 	pcpu_chunk_relocate(chunk, -1);
 	goto restart;
 area_found:
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 	/* populate, map and clear the area */
 	if (pcpu_populate_chunk(chunk, off, size)) {
 		/* give the offset back to the map before failing */
 		spin_lock_irqsave(&pcpu_lock, flags);
 		pcpu_free_area(chunk, off);
 		err = "failed to populate";
 		goto fail_unlock;
 	}
 	mutex_unlock(&pcpu_alloc_mutex);
 	/* return address relative to base address */
 	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
 	kmemleak_alloc_percpu(ptr, size);
 	return ptr;
 	/* entered with both pcpu_lock and pcpu_alloc_mutex held */
 fail_unlock:
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 	/* entered with only pcpu_alloc_mutex held */
 fail_unlock_mutex:
 	mutex_unlock(&pcpu_alloc_mutex);
 	if (warn_limit) {
 		pr_warning("PERCPU: allocation failed, size=%zu align=%zu, "
 			   "%s\n", size, align, err);
 		dump_stack();
 		if (!--warn_limit)
 			pr_info("PERCPU: limit reached, disable warning\n");
 	}
 	return NULL;
 }
 /**
  * __alloc_percpu - allocate dynamic percpu area
  * @size: size of area to allocate in bytes
  * @align: alignment of area (max PAGE_SIZE)
  *
  * Allocate zero-filled percpu area of @size bytes aligned at @align.
  * Might sleep.  Might trigger writeouts.
  *
  * This is a thin wrapper that calls pcpu_alloc() with @reserved set
  * to false.
  *
  * CONTEXT:
  * Does GFP_KERNEL allocation.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
 void __percpu *__alloc_percpu(size_t size, size_t align)
 {
 	return pcpu_alloc(size, align, false);
 }
 EXPORT_SYMBOL_GPL(__alloc_percpu);
 /**
  * __alloc_reserved_percpu - allocate reserved percpu area
  * @size: size of area to allocate in bytes
  * @align: alignment of area (max PAGE_SIZE)
  *
  * Allocate zero-filled percpu area of @size bytes aligned at @align
  * from reserved percpu area if arch has set it up; otherwise,
  * allocation is served from the same dynamic area.  Might sleep.
  * Might trigger writeouts.
  *
  * This is a thin wrapper that calls pcpu_alloc() with @reserved set
  * to true.
  *
  * CONTEXT:
  * Does GFP_KERNEL allocation.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
 void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
 {
 	return pcpu_alloc(size, align, true);
 }
 /**
  * pcpu_reclaim - reclaim fully free chunks, workqueue function
  * @work: unused
  *
  * Reclaim all fully free chunks except for the first one.
  *
  * CONTEXT:
  * workqueue context.
  */
 static void pcpu_reclaim(struct work_struct *work)
 {
 	LIST_HEAD(todo);
 	struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
 	struct pcpu_chunk *chunk, *next;
 	mutex_lock(&pcpu_alloc_mutex);
 	spin_lock_irq(&pcpu_lock);
 	list_for_each_entry_safe(chunk, next, head, list) {
 		WARN_ON(chunk->immutable);
 		/* spare the first one */
 		if (chunk == list_first_entry(head, struct pcpu_chunk, list))
 			continue;
 		list_move(&chunk->list, &todo);
 	}
 	spin_unlock_irq(&pcpu_lock);
 	list_for_each_entry_safe(chunk, next, &todo, list) {
 		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
 		pcpu_destroy_chunk(chunk);
 	}
 	mutex_unlock(&pcpu_alloc_mutex);
 }
 /**
  * free_percpu - free percpu area
  * @ptr: pointer to area to free
  *
  * Release the percpu area @ptr.  A NULL @ptr is silently ignored.  If
  * the owning chunk becomes completely free and another chunk already
  * sits in the last slot, the reclaim work item is scheduled.
  *
  * CONTEXT:
  * Can be called from atomic context.
  */
 void free_percpu(void __percpu *ptr)
 {
 	struct pcpu_chunk *owner;
 	unsigned long irq_flags;
 	void *vaddr;
 	int area_off;
 	if (!ptr)
 		return;
 	kmemleak_free_percpu(ptr);
 	vaddr = __pcpu_ptr_to_addr(ptr);
 	spin_lock_irqsave(&pcpu_lock, irq_flags);
 	/* locate the owning chunk and give the area back to it */
 	owner = pcpu_chunk_addr_search(vaddr);
 	area_off = vaddr - owner->base_addr;
 	pcpu_free_area(owner, area_off);
 	/* if there are more than one fully free chunks, wake up grim reaper */
 	if (owner->free_size == pcpu_unit_size) {
 		struct pcpu_chunk *other;
 		list_for_each_entry(other, &pcpu_slot[pcpu_nr_slots - 1], list) {
 			if (other == owner)
 				continue;
 			schedule_work(&pcpu_reclaim_work);
 			break;
 		}
 	}
 	spin_unlock_irqrestore(&pcpu_lock, irq_flags);
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 /**
  * is_kernel_percpu_address - test whether address is from static percpu area
  * @addr: address to test
  *
  * Check @addr against the static percpu range of every possible cpu.
  * Only the in-kernel static percpu area is considered; for module
  * static percpu areas use is_module_percpu_address().
  *
  * RETURNS:
  * %true if @addr is from in-kernel static percpu area, %false otherwise.
  */
 bool is_kernel_percpu_address(unsigned long addr)
 {
 #ifdef CONFIG_SMP
 	const size_t static_size = __per_cpu_end - __per_cpu_start;
 	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
 	void *target = (void *)addr;
 	unsigned int cpu;
 	for_each_possible_cpu(cpu) {
 		void *lo = per_cpu_ptr(base, cpu);
 		void *hi = lo + static_size;
 		/* outside of this cpu's static area, try the next one */
 		if (target < lo || target >= hi)
 			continue;
 		return true;
 	}
 #endif
 	/* on UP, can't distinguish from other static vars, always false */
 	return false;
 }
 /**
  * per_cpu_ptr_to_phys - convert translated percpu address to physical address
  * @addr: the address to be converted to physical address
  *
  * Given @addr which is dereferenceable address obtained via one of
  * percpu access macros, this function translates it into its physical
  * address.  The caller is responsible for ensuring @addr stays valid
  * until this function finishes.
  *
  * percpu allocator has special setup for the first chunk, which currently
  * supports either embedding in linear address space or vmalloc mapping,
  * and, from the second one, the backing allocator (currently either vm or
  * km) provides translation.
  *
  * The addr could be translated simply without checking whether it falls
  * into the first chunk.  The current code however reflects better how
  * the percpu allocator actually works, and the verification can discover
  * bugs both in the percpu allocator itself and in per_cpu_ptr_to_phys()
  * callers, so the check is kept.
  *
  * RETURNS:
  * The physical address for @addr.
  */
 phys_addr_t per_cpu_ptr_to_phys(void *addr)
 {
 	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
 	unsigned long va = (unsigned long)addr;
 	unsigned long first_low, first_high;
 	bool in_first_chunk = false;
 	unsigned int cpu;
 	/*
 	 * The range test on first_low/first_high isn't strictly
 	 * necessary but speeds up lookups of addresses which aren't
 	 * in the first chunk.
 	 */
 	first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
 	first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
 				     pcpu_unit_pages);
 	if (va >= first_low && va < first_high) {
 		for_each_possible_cpu(cpu) {
 			void *start = per_cpu_ptr(base, cpu);
 			if (addr < start || addr >= start + pcpu_unit_size)
 				continue;
 			in_first_chunk = true;
 			break;
 		}
 	}
 	/* chunks other than the first one are translated page-wise */
 	if (!in_first_chunk)
 		return page_to_phys(pcpu_addr_to_page(addr)) +
 		       offset_in_page(addr);
 	/* the first chunk is either vmalloc mapped or linearly mapped */
 	if (is_vmalloc_addr(addr))
 		return page_to_phys(vmalloc_to_page(addr)) +
 		       offset_in_page(addr);
 	return __pa(addr);
 }
 /**
  * pcpu_alloc_alloc_info - allocate percpu allocation info
  * @nr_groups: the number of groups
  * @nr_units: the number of units
  *
  * Allocate an ai large enough to hold @nr_groups group descriptors
  * followed by a cpu_map array of @nr_units entries.  On return
  * groups[0].cpu_map points at that array and every entry is set to
  * NR_CPUS.  Initializing the cpu_map pointers of the remaining groups
  * is left to the caller.
  *
  * RETURNS:
  * Pointer to the allocated pcpu_alloc_info on success, NULL on
  * failure.
  */
 struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
 						      int nr_units)
 {
 	struct pcpu_alloc_info *ai;
 	size_t hdr_size, total_size, alloc_size;
 	int idx;
 	/* header plus group array, padded so that cpu_map is aligned */
 	hdr_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
 			 __alignof__(ai->groups[0].cpu_map[0]));
 	total_size = hdr_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
 	/* the page-rounded size is what gets allocated and later freed */
 	alloc_size = PFN_ALIGN(total_size);
 	ai = alloc_bootmem_nopanic(alloc_size);
 	if (!ai)
 		return NULL;
 	/* the cpu_map array lives right behind the padded header */
 	ai->groups[0].cpu_map = (void *)ai + hdr_size;
 	for (idx = 0; idx < nr_units; idx++)
 		ai->groups[0].cpu_map[idx] = NR_CPUS;
 	ai->nr_groups = nr_groups;
 	ai->__ai_size = alloc_size;
 	return ai;
 }
 /**
  * pcpu_free_alloc_info - free percpu allocation info
  * @ai: pcpu_alloc_info to free
  *
  * Return @ai, which was obtained from pcpu_alloc_alloc_info(), to
  * bootmem using the size recorded in @ai->__ai_size.
  */
 void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
 {
 	const size_t nr_bytes = ai->__ai_size;
 	free_bootmem(__pa(ai), nr_bytes);
 }
 /**
  * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
  * @lvl: loglevel
  * @ai: allocation info to dump
  *
  * Print a summary line with the sizes stored in @ai followed by the
  * unit to cpu mapping of every group, all at loglevel @lvl.
  */
 static void pcpu_dump_alloc_info(const char *lvl,
 				 const struct pcpu_alloc_info *ai)
 {
 	char empty_str[] = "--------";
 	int group_width = 1, cpu_width = 1;
 	int line_width, rem;
 	int alloc = 0, alloc_end = 0;
 	int upa, apl;	/* units per alloc, allocs per line */
 	int group;
 	/* count the decimal digits needed for group and cpu numbers */
 	rem = ai->nr_groups;
 	for (rem /= 10; rem; rem /= 10)
 		group_width++;
 	rem = num_possible_cpus();
 	for (rem /= 10; rem; rem /= 10)
 		cpu_width++;
 	/* cut the placeholder for unused units down to the cpu column width */
 	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
 	upa = ai->alloc_size / ai->unit_size;
 	line_width = upa * (cpu_width + 1) + group_width + 3;
 	apl = rounddown_pow_of_two(max(60 / line_width, 1));
 	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
 	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
 	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
 	for (group = 0; group < ai->nr_groups; group++) {
 		const struct pcpu_group_info *gi = &ai->groups[group];
 		int unit = 0, unit_end = 0;
 		BUG_ON(gi->nr_units % upa);
 		/* alloc keeps counting across groups, units restart per group */
 		alloc_end += gi->nr_units / upa;
 		while (alloc < alloc_end) {
 			/* start a fresh output line every apl allocs */
 			if (alloc % apl == 0) {
 				printk(KERN_CONT "\n");
 				printk("%spcpu-alloc: ", lvl);
 			}
 			printk(KERN_CONT "[%0*d] ", group_width, group);
 			unit_end += upa;
 			while (unit < unit_end) {
 				if (gi->cpu_map[unit] == NR_CPUS)
 					printk(KERN_CONT "%s ", empty_str);
 				else
 					printk(KERN_CONT "%0*d ", cpu_width,
 					       gi->cpu_map[unit]);
 				unit++;
 			}
 			alloc++;
 		}
 	}
 	printk(KERN_CONT "\n");
 }
 /**
  * pcpu_setup_first_chunk - initialize the first percpu chunk
  * @ai: pcpu_alloc_info describing how the percpu area is shaped
  * @base_addr: mapped address
  *
  * Initialize the first percpu chunk which contains the kernel static
  * percpu area.  This function is to be called from arch percpu area
  * setup path.
  *
  * @ai contains all information necessary to initialize the first
  * chunk and prime the dynamic percpu allocator.
  *
  * @ai->static_size is the size of static percpu area.
  *
  * @ai->reserved_size, if non-zero, specifies the amount of bytes to
  * reserve after the static area in the first chunk.  This reserves
  * the first chunk such that it's available only through reserved
  * percpu allocation.  This is primarily used to serve module percpu
  * static areas on architectures where the addressing model has
  * limited offset range for symbol relocations to guarantee module
  * percpu symbols fall inside the relocatable range.
  *
  * @ai->dyn_size determines the number of bytes available for dynamic
  * allocation in the first chunk.  The area between @ai->static_size +
  * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
  *
  * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
  * and equal to or larger than @ai->static_size + @ai->reserved_size +
  * @ai->dyn_size.
  *
  * @ai->atom_size is the allocation atom size and used as alignment
  * for vm areas.
  *
  * @ai->alloc_size is the allocation size and always multiple of
  * @ai->atom_size.  This is larger than @ai->atom_size if
  * @ai->unit_size is larger than @ai->atom_size.
  *
  * @ai->nr_groups and @ai->groups describe virtual memory layout of
  * percpu areas.  Units which should be colocated are put into the
  * same group.  Dynamic VM areas will be allocated according to these
  * groupings.  If @ai->nr_groups is zero, a single group containing
  * all units is assumed.
  *
  * The caller should have mapped the first chunk at @base_addr and
  * copied static data to each unit.
  *
  * If the first chunk ends up with both reserved and dynamic areas, it
  * is served by two chunks - one to serve the core static and reserved
  * areas and the other for the dynamic area.  They share the same vm
  * and page map but uses different area allocation map to stay away
  * from each other.  The latter chunk is circulated in the chunk slots
  * and available for dynamic allocation like any other chunks.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
 int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 				  void *base_addr)
 {
 	static char cpus_buf[4096] __initdata;
 	static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
 	static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
 	size_t dyn_size = ai->dyn_size;
 	size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
 	struct pcpu_chunk *schunk, *dchunk = NULL;
 	unsigned long *group_offsets;
 	size_t *group_sizes;
 	unsigned long *unit_off;
 	unsigned int cpu;
 	int *unit_map;
 	int group, unit, i;
 	/* format the possible mask up front so that failures can report it */
 	cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
 	/*
 	 * On a failed sanity check, log the offending condition, the
 	 * possible cpu mask and the full alloc info, then BUG().  Each
 	 * message is newline terminated so that it forms its own log line.
 	 */
 #define PCPU_SETUP_BUG_ON(cond)	do {					\
 	if (unlikely(cond)) {						\
 		pr_emerg("PERCPU: failed to initialize, %s\n", #cond);	\
 		pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf);	\
 		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
 		BUG();							\
 	}								\
 } while (0)
 	/* sanity checks */
 	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
 #ifdef CONFIG_SMP
 	PCPU_SETUP_BUG_ON(!ai->static_size);
 	PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
 #endif
 	PCPU_SETUP_BUG_ON(!base_addr);
 	PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
 	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
 	PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
 	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
 	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
 	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
 	/* process group information and build config tables accordingly */
 	group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
 	group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
 	unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
 	unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
 	/* UINT_MAX marks a cpu which hasn't been assigned a unit yet */
 	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
 		unit_map[cpu] = UINT_MAX;
 	/* NR_CPUS means "not determined yet" for the low/high unit cpus */
 	pcpu_low_unit_cpu = NR_CPUS;
 	pcpu_high_unit_cpu = NR_CPUS;
 	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
 		const struct pcpu_group_info *gi = &ai->groups[group];
 		group_offsets[group] = gi->base_offset;
 		group_sizes[group] = gi->nr_units * ai->unit_size;
 		for (i = 0; i < gi->nr_units; i++) {
 			cpu = gi->cpu_map[i];
 			/* NR_CPUS entries are unused units */
 			if (cpu == NR_CPUS)
 				continue;
 			/*
 			 * unit_map[] and unit_off[] hold nr_cpu_ids
 			 * entries, so nr_cpu_ids itself is already out
 			 * of range.
 			 */
 			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
 			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
 			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
 			unit_map[cpu] = unit + i;
 			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
 			/* determine low/high unit_cpu */
 			if (pcpu_low_unit_cpu == NR_CPUS ||
 			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
 				pcpu_low_unit_cpu = cpu;
 			if (pcpu_high_unit_cpu == NR_CPUS ||
 			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
 				pcpu_high_unit_cpu = cpu;
 		}
 	}
 	pcpu_nr_units = unit;
 	/* every possible cpu must have ended up with a unit */
 	for_each_possible_cpu(cpu)
 		PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
 	/* we're done parsing the input, undefine BUG macro and dump config */
 #undef PCPU_SETUP_BUG_ON
 	pcpu_dump_alloc_info(KERN_DEBUG, ai);
 	pcpu_nr_groups = ai->nr_groups;
 	pcpu_group_offsets = group_offsets;
 	pcpu_group_sizes = group_sizes;
 	pcpu_unit_map = unit_map;
 	pcpu_unit_offsets = unit_off;
 	/* determine basic parameters */
 	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_atom_size = ai->atom_size;
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
 		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
 	/*
 	 * Allocate chunk slots.  The additional last slot is for
 	 * empty chunks.
 	 */
 	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
 	pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
 	for (i = 0; i < pcpu_nr_slots; i++)
 		INIT_LIST_HEAD(&pcpu_slot[i]);
 	/*
 	 * Initialize static chunk.  If reserved_size is zero, the
 	 * static chunk covers static area + dynamic allocation area
 	 * in the first chunk.  If reserved_size is not zero, it
 	 * covers static area + reserved area (mostly used for module
 	 * static percpu allocation).
 	 */
 	schunk = alloc_bootmem(pcpu_chunk_struct_size);
 	INIT_LIST_HEAD(&schunk->list);
 	schunk->base_addr = base_addr;
 	schunk->map = smap;
 	schunk->map_alloc = ARRAY_SIZE(smap);
 	schunk->immutable = true;
 	bitmap_fill(schunk->populated, pcpu_unit_pages);
 	if (ai->reserved_size) {
 		schunk->free_size = ai->reserved_size;
 		pcpu_reserved_chunk = schunk;
 		pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
 	} else {
 		schunk->free_size = dyn_size;
 		dyn_size = 0;			/* dynamic area covered */
 	}
 	schunk->contig_hint = schunk->free_size;
 	/* the static area is entered with a negated size, free space as is */
 	schunk->map[schunk->map_used++] = -ai->static_size;
 	if (schunk->free_size)
 		schunk->map[schunk->map_used++] = schunk->free_size;
 	/* init dynamic chunk if necessary */
 	if (dyn_size) {
 		dchunk = alloc_bootmem(pcpu_chunk_struct_size);
 		INIT_LIST_HEAD(&dchunk->list);
 		dchunk->base_addr = base_addr;
 		dchunk->map = dmap;
 		dchunk->map_alloc = ARRAY_SIZE(dmap);
 		dchunk->immutable = true;
 		bitmap_fill(dchunk->populated, pcpu_unit_pages);
 		dchunk->contig_hint = dchunk->free_size = dyn_size;
 		dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
 		dchunk->map[dchunk->map_used++] = dchunk->free_size;
 	}
 	/* link the first chunk in */
 	pcpu_first_chunk = dchunk ?: schunk;
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
 	/* we're done */
 	pcpu_base_addr = base_addr;
 	return 0;
 }
 #ifdef CONFIG_SMP
 /* printable names of the first chunk allocators, indexed by enum pcpu_fc */
 const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
 	[PCPU_FC_AUTO]	= "auto",
 	[PCPU_FC_EMBED]	= "embed",
 	[PCPU_FC_PAGE]	= "page",
 };
 /* selected first chunk allocator; percpu_alloc_setup() may override it */
 enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
 /*
  * Parse the "percpu_alloc=" early parameter and record the requested
  * first chunk allocator in pcpu_chosen_fc.  Only the allocators built
  * into this configuration are recognized; anything else is reported
  * and otherwise ignored.
  *
  * Returns 0 when a value was supplied (recognized or not) and -EINVAL
  * when the parameter was given without a value.
  */
 static int __init percpu_alloc_setup(char *str)
 {
 	/* a bare "percpu_alloc" without "=<value>" hands us a NULL string */
 	if (!str)
 		return -EINVAL;
 	if (0)
 		/* nada */;
 #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
 	else if (!strcmp(str, "embed"))
 		pcpu_chosen_fc = PCPU_FC_EMBED;
 #endif
 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
 	else if (!strcmp(str, "page"))
 		pcpu_chosen_fc = PCPU_FC_PAGE;
 #endif
 	else
 		pr_warning("PERCPU: unknown allocator %s specified\n", str);
 	return 0;
 }
 early_param("percpu_alloc", percpu_alloc_setup);
 /*
  * pcpu_embed_first_chunk() is used by the generic percpu setup.
  * Build it if needed by the arch config or the generic setup is going
  * to be used.
  */
 #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
 	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
 #define BUILD_EMBED_FIRST_CHUNK
 #endif
 /* build pcpu_page_first_chunk() iff needed by the arch config */
 #if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
 #define BUILD_PAGE_FIRST_CHUNK
 #endif
 /* pcpu_build_alloc_info() is used by both embed and page first chunk */
 #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
 /**
  * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
  * @reserved_size: the size of reserved percpu area in bytes
  * @dyn_size: minimum free size for dynamic allocation in bytes
  * @atom_size: allocation atom size
  * @cpu_distance_fn: callback to determine distance between cpus, optional
  *
  * This function determines grouping of units, their mappings to cpus
  * and other parameters considering needed percpu size, allocation
  * atom size and distances between CPUs.
  *
  * Groups are always multiples of atom size and CPUs which are of
  * LOCAL_DISTANCE both ways are grouped together and share space for
  * units in the same group.  The returned configuration is guaranteed
  * to have CPUs on different nodes on different groups and >=75% usage
  * of allocated virtual address space.
  *
  * RETURNS:
  * On success, pointer to the new allocation_info is returned.  On
  * failure, ERR_PTR value is returned.
  */
 static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
 				size_t reserved_size, size_t dyn_size,
 				size_t atom_size,
 				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
 {
 	static int group_map[NR_CPUS] __initdata;
 	static int group_cnt[NR_CPUS] __initdata;
 	const size_t static_size = __per_cpu_end - __per_cpu_start;
 	int nr_groups = 1, nr_units = 0;
 	size_t size_sum, min_unit_size, alloc_size;
 	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
 	int last_allocs, group, unit;
 	unsigned int cpu, tcpu;
 	struct pcpu_alloc_info *ai;
 	unsigned int *cpu_map;
 	/* this function may be called multiple times */
 	memset(group_map, 0, sizeof(group_map));
 	memset(group_cnt, 0, sizeof(group_cnt));
 	/* calculate size_sum and ensure dyn_size is enough for early alloc */
 	size_sum = PFN_ALIGN(static_size + reserved_size +
 			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
 	dyn_size = size_sum - static_size - reserved_size;
 	/*
 	 * Determine min_unit_size, alloc_size and max_upa such that
 	 * alloc_size is multiple of atom_size and is the smallest
 	 * which can accommodate 4k aligned segments which are equal to
 	 * or larger than min_unit_size.
 	 */
 	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
 	alloc_size = roundup(min_unit_size, atom_size);
 	upa = alloc_size / min_unit_size;
 	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
 		upa--;
 	max_upa = upa;
 	/* group cpus according to their proximity */
 	for_each_possible_cpu(cpu) {
 		group = 0;
 	next_group:
 		/*
 		 * Compare against every cpu placed so far; a non-local
 		 * member in the candidate group bumps @cpu to the next
 		 * group and restarts the scan.
 		 */
 		for_each_possible_cpu(tcpu) {
 			if (cpu == tcpu)
 				break;
 			if (group_map[tcpu] == group && cpu_distance_fn &&
 			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
 			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
 				group++;
 				nr_groups = max(nr_groups, group + 1);
 				goto next_group;
 			}
 		}
 		group_map[cpu] = group;
 		group_cnt[group]++;
 	}
 	/*
 	 * Expand unit size until address space usage goes over 75%
 	 * and then as much as possible without using more address
 	 * space.
 	 */
 	last_allocs = INT_MAX;
 	for (upa = max_upa; upa; upa--) {
 		int allocs = 0, wasted = 0;
 		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
 			continue;
 		for (group = 0; group < nr_groups; group++) {
 			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
 			allocs += this_allocs;
 			wasted += this_allocs * upa - group_cnt[group];
 		}
 		/*
 		 * Don't accept if wastage is over 1/3.  The
 		 * greater-than comparison ensures upa==1 always
 		 * passes the following check.
 		 */
 		if (wasted > num_possible_cpus() / 3)
 			continue;
 		/* and then don't consume more memory */
 		if (allocs > last_allocs)
 			break;
 		last_allocs = allocs;
 		best_upa = upa;
 	}
 	upa = best_upa;
 	/* allocate and fill alloc_info */
 	for (group = 0; group < nr_groups; group++)
 		nr_units += roundup(group_cnt[group], upa);
 	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
 	if (!ai)
 		return ERR_PTR(-ENOMEM);
 	/* carve the single cpu_map array up between the groups */
 	cpu_map = ai->groups[0].cpu_map;
 	for (group = 0; group < nr_groups; group++) {
 		ai->groups[group].cpu_map = cpu_map;
 		cpu_map += roundup(group_cnt[group], upa);
 	}
 	ai->static_size = static_size;
 	ai->reserved_size = reserved_size;
 	ai->dyn_size = dyn_size;
 	ai->unit_size = alloc_size / upa;
 	ai->atom_size = atom_size;
 	ai->alloc_size = alloc_size;
 	/*
 	 * Groups are numbered densely, so groups 0..nr_groups-1 are
 	 * exactly the populated ones.  Bound the walk by nr_groups
 	 * rather than by a zero group_cnt[] entry; the latter would
 	 * read past the array when all NR_CPUS slots are in use.
 	 */
 	for (group = 0, unit = 0; group < nr_groups; group++) {
 		struct pcpu_group_info *gi = &ai->groups[group];
 		/*
 		 * Initialize base_offset as if all groups are located
 		 * back-to-back.  The caller should update this to
 		 * reflect actual allocation.
 		 */
 		gi->base_offset = unit * ai->unit_size;
 		for_each_possible_cpu(cpu)
 			if (group_map[cpu] == group)
 				gi->cpu_map[gi->nr_units++] = cpu;
 		/* pad the group to a whole number of allocations */
 		gi->nr_units = roundup(gi->nr_units, upa);
 		unit += gi->nr_units;
 	}
 	BUG_ON(unit != nr_units);
 	return ai;
 }
 #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
 #if defined(BUILD_EMBED_FIRST_CHUNK)
 /**
  * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
  * @reserved_size: the size of reserved percpu area in bytes
  * @dyn_size: minimum free size for dynamic allocation in bytes
  * @atom_size: allocation atom size
  * @cpu_distance_fn: callback to determine distance between cpus, optional
  * @alloc_fn: function to allocate percpu page
  * @free_fn: function to free percpu page
  *
  * This is a helper to ease setting up embedded first percpu chunk and
  * can be called where pcpu_setup_first_chunk() is expected.
  *
  * If this function is used to setup the first chunk, it is allocated
  * by calling @alloc_fn and used as-is without being mapped into
  * vmalloc area.  Allocations are always whole multiples of @atom_size
  * aligned to @atom_size.
  *
  * This enables the first chunk to piggy back on the linear physical
  * mapping which often uses larger page size.  Please note that this
  * can result in very sparse cpu->unit mapping on NUMA machines thus
  * requiring large vmalloc address space.  Don't use this allocator if
  * vmalloc space is not orders of magnitude larger than distances
  * between node memory addresses (ie. 32bit NUMA machines).
  *
  * @dyn_size specifies the minimum dynamic area size.
  *
  * If the needed size is smaller than the minimum or specified unit
  * size, the leftover is returned using @free_fn.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
 int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
 				  size_t atom_size,
 				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
 				  pcpu_fc_alloc_fn_t alloc_fn,
 				  pcpu_fc_free_fn_t free_fn)
 {
 	void *base = (void *)ULONG_MAX;
 	void **areas = NULL;
 	struct pcpu_alloc_info *ai;
 	size_t size_sum, areas_size, max_distance;
 	int nr_allocated = 0;	/* number of groups with a live area */
 	int group, i, rc;
 	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
 				   cpu_distance_fn);
 	if (IS_ERR(ai))
 		return PTR_ERR(ai);
 	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
 	/* page-rounded so that the array can be handed back to bootmem */
 	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
 	areas = alloc_bootmem_nopanic(areas_size);
 	if (!areas) {
 		rc = -ENOMEM;
 		goto out_free;
 	}
 	/* allocate and determine base address */
 	for (group = 0; group < ai->nr_groups; group++) {
 		struct pcpu_group_info *gi = &ai->groups[group];
 		unsigned int cpu = NR_CPUS;
 		void *ptr;
 		/* pick the first real cpu of the group for the allocation */
 		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
 			cpu = gi->cpu_map[i];
 		BUG_ON(cpu == NR_CPUS);
 		/* allocate space for the whole group */
 		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
 		if (!ptr) {
 			rc = -ENOMEM;
 			goto out_free_areas;
 		}
 		/* kmemleak tracks the percpu allocations separately */
 		kmemleak_free(ptr);
 		areas[group] = ptr;
 		nr_allocated++;
 		base = min(ptr, base);
 	}
 	/* base address is now known, determine group base offsets */
 	max_distance = 0;
 	for (group = 0; group < ai->nr_groups; group++) {
 		ai->groups[group].base_offset = areas[group] - base;
 		max_distance = max_t(size_t, max_distance,
 				     ai->groups[group].base_offset);
 	}
 	max_distance += ai->unit_size;
 	/*
 	 * Warn if maximum distance is further than 75% of vmalloc space.
 	 * This is checked while the group areas are still whole so that
 	 * the failure path can return them in full.
 	 */
 	if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
 		pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
 			   "space 0x%lx\n", max_distance,
 			   (unsigned long)(VMALLOC_END - VMALLOC_START));
 #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
 		/* and fail if we have fallback */
 		rc = -EINVAL;
 		goto out_free_areas;
 #endif
 	}
 	/*
 	 * Copy data and free unused parts.  This should happen after all
 	 * allocations are complete; otherwise, we may end up with
 	 * overlapping groups.
 	 */
 	for (group = 0; group < ai->nr_groups; group++) {
 		struct pcpu_group_info *gi = &ai->groups[group];
 		void *ptr = areas[group];
 		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
 			if (gi->cpu_map[i] == NR_CPUS) {
 				/* unused unit, free whole */
 				free_fn(ptr, ai->unit_size);
 				continue;
 			}
 			/* copy and return the unused part */
 			memcpy(ptr, __per_cpu_load, ai->static_size);
 			free_fn(ptr + size_sum, ai->unit_size - size_sum);
 		}
 	}
 	pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
 		PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
 		ai->dyn_size, ai->unit_size);
 	rc = pcpu_setup_first_chunk(ai, base);
 	goto out_free;
 out_free_areas:
 	/* hand back only the groups which were actually allocated */
 	for (group = 0; group < nr_allocated; group++)
 		free_fn(areas[group],
 			ai->groups[group].nr_units * ai->unit_size);
 out_free:
 	pcpu_free_alloc_info(ai);
 	if (areas)
 		free_bootmem(__pa(areas), areas_size);
 	return rc;
 }
 #endif /* BUILD_EMBED_FIRST_CHUNK */
 #ifdef BUILD_PAGE_FIRST_CHUNK
 /**
  * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
  * @reserved_size: the size of reserved percpu area in bytes
  * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
  * @free_fn: function to free percpu page, always called with PAGE_SIZE
  * @populate_pte_fn: function to populate pte
  *
  * This is a helper to ease setting up page-remapped first percpu
  * chunk and can be called where pcpu_setup_first_chunk() is expected.
  *
  * This is the basic allocator.  Static percpu area is allocated
  * page-by-page into vmalloc area.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
 int __init pcpu_page_first_chunk(size_t reserved_size,
 				 pcpu_fc_alloc_fn_t alloc_fn,
 				 pcpu_fc_free_fn_t free_fn,
 				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct vm;
 	struct pcpu_alloc_info *ai;
 	char psize_str[16];
 	int unit_pages;
 	size_t pages_size;
 	struct page **pages;
 	int unit, i, j, rc;
 	/* page size in KiB, used only in the log messages below */
 	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
 	/* no distance callback is passed, a single group is expected */
 	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
 	if (IS_ERR(ai))
 		return PTR_ERR(ai);
 	BUG_ON(ai->nr_groups != 1);
 	BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
 	unit_pages = ai->unit_size >> PAGE_SHIFT;
 	/* unaligned allocations can't be freed, round up to page size */
 	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
 			       sizeof(pages[0]));
 	pages = alloc_bootmem(pages_size);
 	/* allocate pages */
 	/* j counts the pages obtained so far and indexes pages[] */
 	j = 0;
 	for (unit = 0; unit < num_possible_cpus(); unit++)
 		for (i = 0; i < unit_pages; i++) {
 			unsigned int cpu = ai->groups[0].cpu_map[unit];
 			void *ptr;
 			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
 			if (!ptr) {
 				pr_warning("PERCPU: failed to allocate %s page "
 					   "for cpu%u\n", psize_str, cpu);
 				goto enomem;
 			}
 			/* kmemleak tracks the percpu allocations separately */
 			kmemleak_free(ptr);
 			pages[j++] = virt_to_page(ptr);
 		}
 	/* allocate vm area, map the pages and copy static data */
 	vm.flags = VM_ALLOC;
 	vm.size = num_possible_cpus() * ai->unit_size;
 	vm_area_register_early(&vm, PAGE_SIZE);
 	for (unit = 0; unit < num_possible_cpus(); unit++) {
 		unsigned long unit_addr =
 			(unsigned long)vm.addr + unit * ai->unit_size;
 		/* populate the page tables for every page of this unit first */
 		for (i = 0; i < unit_pages; i++)
 			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
 		/* pte already populated, the following shouldn't fail */
 		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
 				      unit_pages);
 		if (rc < 0)
 			panic("failed to map percpu area, err=%d\n", rc);
 		/*
 		 * FIXME: Archs with virtual cache should flush local
 		 * cache for the linear mapping here - something
 		 * equivalent to flush_cache_vmap() on the local cpu.
 		 * flush_cache_vmap() can't be used as most supporting
 		 * data structures are not set up yet.
 		 */
 		/* copy static data */
 		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
 	}
 	/* we're ready, commit */
 	pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
 		unit_pages, psize_str, vm.addr, ai->static_size,
 		ai->reserved_size, ai->dyn_size);
 	rc = pcpu_setup_first_chunk(ai, vm.addr);
 	goto out_free_ar;
 enomem:
 	/* give back the j pages which were obtained before the failure */
 	while (--j >= 0)
 		free_fn(page_address(pages[j]), PAGE_SIZE);
 	rc = -ENOMEM;
 out_free_ar:
 	/* the pages array and ai are released on success and failure alike */
 	free_bootmem(__pa(pages), pages_size);
 	pcpu_free_alloc_info(ai);
 	return rc;
 }
 #endif /* BUILD_PAGE_FIRST_CHUNK */
 #ifndef	CONFIG_HAVE_SETUP_PER_CPU_AREA
 /*
  * Generic SMP percpu area setup.
  *
  * The embedding helper is used because its behavior closely resembles
  * the original non-dynamic generic percpu area setup.  This is
  * important because many archs have addressing restrictions and might
  * fail if the percpu area is located far away from the previous
  * location.  As an added bonus, in non-NUMA cases, embedding is
  * generally a good idea TLB-wise because percpu area can piggy back
  * on the physical linear memory mapping which uses large page
  * mappings on applicable archs.
  */
 /* one offset per cpu; setup_per_cpu_areas() fills in every possible cpu */
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(__per_cpu_offset);
 /*
  * Default first chunk allocation callback: get @size bytes aligned at
  * @align from bootmem with MAX_DMA_ADDRESS as the goal.  @cpu is not
  * used.  Returns NULL on failure.
  */
 static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
 				       size_t align)
 {
 	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
 	return __alloc_bootmem_nopanic(size, align, goal);
 }
 /*
  * Default first chunk free callback: return @size bytes starting at
  * @ptr to bootmem.
  */
 static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
 {
 	const unsigned long paddr = __pa(ptr);
 	free_bootmem(paddr, size);
 }
 /*
  * Set up the first chunk with the embedding helper and the default
  * bootmem callbacks, then derive __per_cpu_offset[] for every possible
  * cpu.  Panics if the first chunk cannot be initialized.
  */
 void __init setup_per_cpu_areas(void)
 {
 	unsigned long base_delta;
 	unsigned int cpu;
 	int err;
 	/*
 	 * Always reserve area for module percpu variables.  That's
 	 * what the legacy allocator did.
 	 */
 	err = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
 				     PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
 				     pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
 	if (err < 0)
 		panic("Failed to initialize percpu areas.");
 	/* distance between the chunk base and the linked percpu section */
 	base_delta = (unsigned long)pcpu_base_addr;
 	base_delta -= (unsigned long)__per_cpu_start;
 	for_each_possible_cpu(cpu) {
 		__per_cpu_offset[cpu] = base_delta + pcpu_unit_offsets[cpu];
 	}
 }
 #endif	/* CONFIG_HAVE_SETUP_PER_CPU_AREA */
 #else	/* CONFIG_SMP */
 /*
  * UP percpu area setup.
  *
  * UP always uses km-based percpu allocator with identity mapping.
  * Static percpu variables are indistinguishable from the usual static
  * variables and don't require any special preparation.
  */
 void __init setup_per_cpu_areas(void)
 {
 	/*
 	 * Size of the single unit: the larger of PCPU_MIN_UNIT_SIZE and
 	 * PERCPU_DYNAMIC_RESERVE, rounded up to a power of two.
 	 */
 	const size_t unit_size =
 		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
 					 PERCPU_DYNAMIC_RESERVE));
 	struct pcpu_alloc_info *ai;
 	void *fc;
 	ai = pcpu_alloc_alloc_info(1, 1);
 	fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
 	if (!ai || !fc)
 		panic("Failed to allocate memory for percpu areas.");
+	/* kmemleak tracks the percpu allocations separately */
+	kmemleak_free(fc);
 	/* the whole unit is dynamic area; one group holding one unit for cpu 0 */
 	ai->dyn_size = unit_size;
 	ai->unit_size = unit_size;
 	ai->atom_size = unit_size;
 	ai->alloc_size = unit_size;
 	ai->groups[0].nr_units = 1;
 	ai->groups[0].cpu_map[0] = 0;
 	if (pcpu_setup_first_chunk(ai, fc) < 0)
 		panic("Failed to initialize percpu areas.");
 	/*
 	 * @ai is only a temporary description of the layout.  Release it
 	 * once the first chunk is set up, as the SMP first chunk helper
 	 * in this file does, instead of leaking the boot memory.
 	 */
 	pcpu_free_alloc_info(ai);
 }
 #endif	/* CONFIG_SMP */
 /*
  * First and reserved chunks are initialized with temporary allocation
  * map in initdata so that they can be used before slab is online.
  * This function is called after slab is brought up and replaces those
  * with properly allocated maps.
  */
 void __init percpu_init_late(void)
 {
 	/* NULL-terminated; the walk below stops at the first NULL entry */
 	struct pcpu_chunk *early_chunks[] =
 		{ pcpu_first_chunk, pcpu_reserved_chunk, NULL };
 	struct pcpu_chunk **pos;
 	unsigned long irq_flags;
 	for (pos = early_chunks; *pos; pos++) {
 		struct pcpu_chunk *chunk = *pos;
 		int *new_map;
 		const size_t map_bytes =
 			PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(new_map[0]);
 		BUILD_BUG_ON(map_bytes > PAGE_SIZE);
 		/* allocate before taking the lock, failure is fatal */
 		new_map = pcpu_mem_zalloc(map_bytes);
 		BUG_ON(!new_map);
 		/* copy the early map and switch over under pcpu_lock */
 		spin_lock_irqsave(&pcpu_lock, irq_flags);
 		memcpy(new_map, chunk->map, map_bytes);
 		chunk->map = new_map;
 		spin_unlock_irqrestore(&pcpu_lock, irq_flags);
 	}
 }