Blame view

mm/percpu.c 83.1 KB
fbf59bc9d   Tejun Heo   percpu: implement...
1
  /*
88999a898   Tejun Heo   percpu: misc prep...
2
   * mm/percpu.c - percpu memory allocator
fbf59bc9d   Tejun Heo   percpu: implement...
3
4
5
6
   *
   * Copyright (C) 2009		SUSE Linux Products GmbH
   * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
   *
5e81ee3e6   Dennis Zhou (Facebook)   percpu: update he...
7
8
9
   * Copyright (C) 2017		Facebook Inc.
   * Copyright (C) 2017		Dennis Zhou <dennisszhou@gmail.com>
   *
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
10
   * This file is released under the GPLv2 license.
fbf59bc9d   Tejun Heo   percpu: implement...
11
   *
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
12
13
14
15
   * The percpu allocator handles both static and dynamic areas.  Percpu
   * areas are allocated in chunks which are divided into units.  There is
   * a 1-to-1 mapping for units to possible cpus.  These units are grouped
   * based on NUMA properties of the machine.
fbf59bc9d   Tejun Heo   percpu: implement...
16
17
18
19
20
21
   *
   *  c0                           c1                         c2
   *  -------------------          -------------------        ------------
   * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
   *  -------------------  ......  -------------------  ....  ------------
   *
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
22
23
24
25
26
27
28
29
30
   * Allocation is done by offsets into a unit's address space.  I.e., an
   * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
   * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
   * and even sparse.  Access is handled by configuring percpu base
   * registers according to the cpu to unit mappings and offsetting the
   * base address using pcpu_unit_size.
   *
   * There is special consideration for the first chunk which must handle
   * the static percpu variables in the kernel image as allocation services
5e81ee3e6   Dennis Zhou (Facebook)   percpu: update he...
31
   * are not online yet.  In short, the first chunk is structured like so:
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
32
33
34
35
36
37
38
   *
   *                  <Static | [Reserved] | Dynamic>
   *
   * The static data is copied from the original section managed by the
   * linker.  The reserved section, if non-zero, primarily manages static
   * percpu variables from kernel modules.  Finally, the dynamic section
   * takes care of normal allocations.
fbf59bc9d   Tejun Heo   percpu: implement...
39
   *
5e81ee3e6   Dennis Zhou (Facebook)   percpu: update he...
40
41
42
43
44
45
46
47
48
49
50
51
52
53
   * The allocator organizes chunks into lists according to free size and
   * tries to allocate from the fullest chunk first.  Each chunk is managed
   * by a bitmap with metadata blocks.  The allocation map is updated on
   * every allocation and free to reflect the current state while the boundary
   * map is only updated on allocation.  Each metadata block contains
   * information to help mitigate the need to iterate over large portions
   * of the bitmap.  The reverse mapping from page to chunk is stored in
   * the page's index.  Lastly, units are lazily backed and grow in unison.
   *
   * There is a unique conversion that goes on here between bytes and bits.
   * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
   * tracks the number of pages it is responsible for in nr_pages.  Helper
   * functions are used to convert between bytes, bits, and blocks.
   * All hints are managed in bits unless explicitly stated.
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
54
   *
4091fb95b   Masahiro Yamada   scripts/spelling....
55
   * To use this allocator, arch code should do the following:
fbf59bc9d   Tejun Heo   percpu: implement...
56
   *
fbf59bc9d   Tejun Heo   percpu: implement...
57
   * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
e01009833   Tejun Heo   percpu: make x86 ...
58
59
   *   regular address to percpu pointer and back if they need to be
   *   different from the default
fbf59bc9d   Tejun Heo   percpu: implement...
60
   *
8d408b4be   Tejun Heo   percpu: give more...
61
62
   * - use pcpu_setup_first_chunk() during percpu area initialization to
   *   setup the first chunk containing the kernel static percpu area
fbf59bc9d   Tejun Heo   percpu: implement...
63
   */
870d4b12a   Joe Perches   mm: percpu: use p...
64
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
fbf59bc9d   Tejun Heo   percpu: implement...
65
66
  #include <linux/bitmap.h>
  #include <linux/bootmem.h>
fd1e8a1fe   Tejun Heo   percpu: introduce...
67
  #include <linux/err.h>
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
68
  #include <linux/lcm.h>
fbf59bc9d   Tejun Heo   percpu: implement...
69
  #include <linux/list.h>
a530b7958   Tejun Heo   percpu: teach lar...
70
  #include <linux/log2.h>
fbf59bc9d   Tejun Heo   percpu: implement...
71
72
73
74
75
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/mutex.h>
  #include <linux/percpu.h>
  #include <linux/pfn.h>
fbf59bc9d   Tejun Heo   percpu: implement...
76
  #include <linux/slab.h>
ccea34b5d   Tejun Heo   percpu: finer gra...
77
  #include <linux/spinlock.h>
fbf59bc9d   Tejun Heo   percpu: implement...
78
  #include <linux/vmalloc.h>
a56dbddf0   Tejun Heo   percpu: move full...
79
  #include <linux/workqueue.h>
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
80
  #include <linux/kmemleak.h>
e9caf1e1d   Tejun Heo   percpu: include l...
81
  #include <linux/sched.h>
fbf59bc9d   Tejun Heo   percpu: implement...
82
83
  
  #include <asm/cacheflush.h>
e01009833   Tejun Heo   percpu: make x86 ...
84
  #include <asm/sections.h>
fbf59bc9d   Tejun Heo   percpu: implement...
85
  #include <asm/tlbflush.h>
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
86
  #include <asm/io.h>
fbf59bc9d   Tejun Heo   percpu: implement...
87

df95e795a   Dennis Zhou   percpu: add trace...
88
89
  #define CREATE_TRACE_POINTS
  #include <trace/events/percpu.h>
8fa3ed801   Dennis Zhou   percpu: migrate p...
90
  #include "percpu-internal.h"
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
91
92
  /* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
  #define PCPU_SLOT_BASE_SHIFT		5
1a4d76076   Tejun Heo   percpu: implement...
93
94
  #define PCPU_EMPTY_POP_PAGES_LOW	2
  #define PCPU_EMPTY_POP_PAGES_HIGH	4
fbf59bc9d   Tejun Heo   percpu: implement...
95

bbddff054   Tejun Heo   percpu: use percp...
96
  #ifdef CONFIG_SMP
e01009833   Tejun Heo   percpu: make x86 ...
97
98
99
  /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
  #ifndef __addr_to_pcpu_ptr
  /*
   * Rebase @addr from pcpu_base_addr onto __per_cpu_start:
   * addr - pcpu_base_addr + __per_cpu_start, typed as a __percpu pointer.
   */
  #define __addr_to_pcpu_ptr(addr)					\
43cf38eb5   Tejun Heo   percpu: add __per...
100
101
102
  	(void __percpu *)((unsigned long)(addr) -			\
  			  (unsigned long)pcpu_base_addr	+		\
  			  (unsigned long)__per_cpu_start)
e01009833   Tejun Heo   percpu: make x86 ...
103
104
105
  #endif
  #ifndef __pcpu_ptr_to_addr
  /*
   * Inverse of the default __addr_to_pcpu_ptr():
   * ptr + pcpu_base_addr - __per_cpu_start.
   */
  #define __pcpu_ptr_to_addr(ptr)						\
43cf38eb5   Tejun Heo   percpu: add __per...
106
107
108
  	(void __force *)((unsigned long)(ptr) +				\
  			 (unsigned long)pcpu_base_addr -		\
  			 (unsigned long)__per_cpu_start)
e01009833   Tejun Heo   percpu: make x86 ...
109
  #endif
bbddff054   Tejun Heo   percpu: use percp...
110
111
112
113
114
  #else	/* CONFIG_SMP */
  /* on UP, it's always identity mapped */
  /* both directions are plain casts; only the address-space annotation changes */
  #define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
  #define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
  #endif	/* CONFIG_SMP */
e01009833   Tejun Heo   percpu: make x86 ...
115

1328710b8   Daniel Micay   mark most percpu ...
116
117
118
119
  /* per-unit page stride: pcpu_page_idx() multiplies the unit index by this */
  static int pcpu_unit_pages __ro_after_init;
  /* a free size equal to this value is mapped to the last slot by pcpu_size_to_slot() */
  static int pcpu_unit_size __ro_after_init;
  /* NOTE(review): presumably the total number of units - not used in this part of the file, confirm */
  static int pcpu_nr_units __ro_after_init;
  /* NOTE(review): not used in this part of the file; meaning to be confirmed at the setup site */
  static int pcpu_atom_size __ro_after_init;
8fa3ed801   Dennis Zhou   percpu: migrate p...
120
  /* slot pcpu_nr_slots - 1 is the one pcpu_size_to_slot() picks for a size of pcpu_unit_size */
  int pcpu_nr_slots __ro_after_init;
1328710b8   Daniel Micay   mark most percpu ...
121
  /* NOTE(review): presumably the allocation size of a struct pcpu_chunk - confirm where it is set */
  static size_t pcpu_chunk_struct_size __ro_after_init;
fbf59bc9d   Tejun Heo   percpu: implement...
122

a855b84c3   Tejun Heo   percpu: fix chunk...
123
  /* cpus with the lowest and highest unit addresses */
1328710b8   Daniel Micay   mark most percpu ...
124
125
  static unsigned int pcpu_low_unit_cpu __ro_after_init;
  static unsigned int pcpu_high_unit_cpu __ro_after_init;
2f39e637e   Tejun Heo   percpu: allow non...
126

fbf59bc9d   Tejun Heo   percpu: implement...
127
  /* the address of the first chunk which starts with the kernel static area */
1328710b8   Daniel Micay   mark most percpu ...
128
  void *pcpu_base_addr __ro_after_init;
fbf59bc9d   Tejun Heo   percpu: implement...
129
  EXPORT_SYMBOL_GPL(pcpu_base_addr);
1328710b8   Daniel Micay   mark most percpu ...
130
131
  static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
  const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */
2f39e637e   Tejun Heo   percpu: allow non...
132

6563297ce   Tejun Heo   percpu: use group...
133
  /* group information, used for vm allocation */
1328710b8   Daniel Micay   mark most percpu ...
134
135
136
  static int pcpu_nr_groups __ro_after_init;
  static const unsigned long *pcpu_group_offsets __ro_after_init;
  static const size_t *pcpu_group_sizes __ro_after_init;
6563297ce   Tejun Heo   percpu: use group...
137

ae9e6bc9f   Tejun Heo   percpu: don't put...
138
139
140
141
142
  /*
   * The first chunk which always exists.  Note that unlike other
   * chunks, this one can be allocated and mapped in several different
   * ways and thus often doesn't live in the vmalloc area.
   */
8fa3ed801   Dennis Zhou   percpu: migrate p...
143
  struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
ae9e6bc9f   Tejun Heo   percpu: don't put...
144
145
146
  
  /*
   * Optional reserved chunk.  This chunk reserves part of the first
e22667056   Dennis Zhou (Facebook)   percpu: introduce...
147
148
   * chunk and serves it for reserved allocations.  When the reserved
   * region doesn't exist, the following variable is NULL.
ae9e6bc9f   Tejun Heo   percpu: don't put...
149
   */
8fa3ed801   Dennis Zhou   percpu: migrate p...
150
  struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
edcb46399   Tejun Heo   percpu, module: i...
151

8fa3ed801   Dennis Zhou   percpu: migrate p...
152
  DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
6710e594f   Tejun Heo   percpu: fix synch...
153
  static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */
fbf59bc9d   Tejun Heo   percpu: implement...
154

8fa3ed801   Dennis Zhou   percpu: migrate p...
155
  struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */
fbf59bc9d   Tejun Heo   percpu: implement...
156

4f996e234   Tejun Heo   percpu: fix synch...
157
158
  /* chunks which need their map areas extended, protected by pcpu_lock */
  static LIST_HEAD(pcpu_map_extend_chunks);
b539b87fe   Tejun Heo   percpu: implmeent...
159
160
161
162
  /*
   * The number of empty populated pages, protected by pcpu_lock.  The
   * reserved chunk doesn't contribute to the count.
   */
6b9b6f399   Dennis Zhou (Facebook)   percpu: expose pc...
163
  int pcpu_nr_empty_pop_pages;
b539b87fe   Tejun Heo   percpu: implmeent...
164

1a4d76076   Tejun Heo   percpu: implement...
165
166
167
168
169
170
  /*
   * Balance work is used to populate or destroy chunks asynchronously.  We
   * try to keep the number of populated free pages between
   * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
   * empty chunk.
   */
fe6bd8c3d   Tejun Heo   percpu: rename pc...
171
172
  static void pcpu_balance_workfn(struct work_struct *work);
  static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
1a4d76076   Tejun Heo   percpu: implement...
173
174
175
176
177
178
179
180
  /* while false, pcpu_schedule_balance_work() does not queue the balance work */
  static bool pcpu_async_enabled __read_mostly;
  /* NOTE(review): not referenced in this part of the file; presumably set when an atomic allocation fails - confirm */
  static bool pcpu_atomic_alloc_failed;
  
  /* queue pcpu_balance_work, but only once async operation has been enabled */
  static void pcpu_schedule_balance_work(void)
  {
  	if (!pcpu_async_enabled)
  		return;

  	schedule_work(&pcpu_balance_work);
  }
a56dbddf0   Tejun Heo   percpu: move full...
181

c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
182
  /**
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
183
184
185
   * pcpu_addr_in_chunk - check if the address is served from this chunk
   * @chunk: chunk of interest
   * @addr: percpu address
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
186
187
   *
   * RETURNS:
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
188
   * True if the address is served from this chunk.
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
189
   */
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
190
  static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
020ec6537   Tejun Heo   percpu: factor ou...
191
  {
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
192
  	void *start_addr, *end_addr;
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
193
  	if (!chunk)
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
194
  		return false;
020ec6537   Tejun Heo   percpu: factor ou...
195

560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
196
197
198
  	start_addr = chunk->base_addr + chunk->start_offset;
  	end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
  		   chunk->end_offset;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
199
200
  
  	return addr >= start_addr && addr < end_addr;
020ec6537   Tejun Heo   percpu: factor ou...
201
  }
d9b55eeb1   Tejun Heo   percpu: remove un...
202
  static int __pcpu_size_to_slot(int size)
fbf59bc9d   Tejun Heo   percpu: implement...
203
  {
cae3aeb83   Tejun Heo   percpu: clean up ...
204
  	int highbit = fls(size);	/* size is in bytes */
fbf59bc9d   Tejun Heo   percpu: implement...
205
206
  	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
  }
d9b55eeb1   Tejun Heo   percpu: remove un...
207
208
209
210
211
212
  /*
   * Slot for @size bytes.  A size of exactly pcpu_unit_size gets the last
   * slot; everything else goes through __pcpu_size_to_slot().
   */
  static int pcpu_size_to_slot(int size)
  {
  	return size == pcpu_unit_size ? pcpu_nr_slots - 1
  				      : __pcpu_size_to_slot(size);
  }
fbf59bc9d   Tejun Heo   percpu: implement...
213
214
  static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
  {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
215
  	if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0)
fbf59bc9d   Tejun Heo   percpu: implement...
216
  		return 0;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
217
  	return pcpu_size_to_slot(chunk->free_bytes);
fbf59bc9d   Tejun Heo   percpu: implement...
218
  }
88999a898   Tejun Heo   percpu: misc prep...
219
220
221
222
223
224
225
226
227
228
229
230
231
  /* record the owning chunk in the page's index field */
  static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
  {
  	unsigned long cookie = (unsigned long)pcpu;

  	page->index = cookie;
  }
  
  /* read back the chunk pointer stored in the page's index field */
  static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
  {
  	unsigned long cookie = page->index;

  	return (struct pcpu_chunk *)cookie;
  }
  
  /* flat page index: @cpu's unit number times pages-per-unit, plus @page_idx */
  static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
  {
  	int unit = pcpu_unit_map[cpu];

  	return unit * pcpu_unit_pages + page_idx;
  }
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
235
236
237
238
  /* byte offset of page @page_idx of @cpu's unit, relative to the chunk base */
  static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
  {
  	unsigned long unit_off = pcpu_unit_offsets[cpu];

  	return unit_off + (page_idx << PAGE_SHIFT);
  }
9983b6f0c   Tejun Heo   percpu: fix first...
239
240
  static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
  				     unsigned int cpu, int page_idx)
fbf59bc9d   Tejun Heo   percpu: implement...
241
  {
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
242
243
  	return (unsigned long)chunk->base_addr +
  	       pcpu_unit_page_offset(cpu, page_idx);
fbf59bc9d   Tejun Heo   percpu: implement...
244
  }
91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
245
  /*
   * Starting the search at *@rs, store the next clear bit of @bitmap in
   * *@rs and the next set bit after it in *@re.  Both searches stop at @end.
   */
  static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end)
  {
  	int first_clear = find_next_zero_bit(bitmap, end, *rs);

  	*rs = first_clear;
  	*re = find_next_bit(bitmap, end, first_clear + 1);
  }
91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
250
  /*
   * Starting the search at *@rs, store the next set bit of @bitmap in
   * *@rs and the next clear bit after it in *@re.  Both searches stop at @end.
   */
  static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end)
  {
  	int first_set = find_next_bit(bitmap, end, *rs);

  	*rs = first_set;
  	*re = find_next_zero_bit(bitmap, end, first_set + 1);
  }
  
  /*
91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
257
258
259
   * Bitmap region iterators.  Iterates over @bitmap between
   * [@start, @end).  @rs and @re should be integer variables and will
   * be set to start and end index of the current region.
ce3141a27   Tejun Heo   percpu: drop pcpu...
260
   */
91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
261
262
263
264
  /*
   * Visit each run of clear bits: on every pass @rs is the first clear bit
   * and @re the set bit (or @end) that terminates the run.  The next search
   * resumes at @re + 1; the loop stops once @rs is no longer below @re.
   */
  #define pcpu_for_each_unpop_region(bitmap, rs, re, start, end)		     \
  	for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \
  	     (rs) < (re);						     \
  	     (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end)))
ce3141a27   Tejun Heo   percpu: drop pcpu...
265

91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
266
267
268
269
  /*
   * Visit each run of set bits: on every pass @rs is the first set bit and
   * @re the clear bit (or @end) that terminates the run.  The next search
   * resumes at @re + 1; the loop stops once @rs is no longer below @re.
   */
  #define pcpu_for_each_pop_region(bitmap, rs, re, start, end)		     \
  	for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end));   \
  	     (rs) < (re);						     \
  	     (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end)))
ce3141a27   Tejun Heo   percpu: drop pcpu...
270

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
  /*
   * The following are helper functions to help access bitmaps and convert
   * between bitmap offsets to address offsets.
   */
  /* pointer to the alloc_map word at which metadata block @index begins */
  static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
  {
  	return &chunk->alloc_map[index * PCPU_BITMAP_BLOCK_BITS /
  				 BITS_PER_LONG];
  }
  
  /* index of the metadata block that contains chunk bit offset @off */
  static unsigned long pcpu_off_to_block_index(int off)
  {
  	unsigned long block_idx = off / PCPU_BITMAP_BLOCK_BITS;

  	return block_idx;
  }
  
  /*
   * Position of chunk bit offset @off within its metadata block.  The mask
   * form relies on PCPU_BITMAP_BLOCK_BITS being a power of two.
   */
  static unsigned long pcpu_off_to_block_off(int off)
  {
  	unsigned long in_block_mask = PCPU_BITMAP_BLOCK_BITS - 1;

  	return off & in_block_mask;
  }
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
290
291
292
293
  /* chunk bit offset for position @off inside metadata block @index */
  static unsigned long pcpu_block_off_to_off(int index, int off)
  {
  	unsigned long block_start = index * PCPU_BITMAP_BLOCK_BITS;

  	return block_start + off;
  }
fbf59bc9d   Tejun Heo   percpu: implement...
294
  /**
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
   * pcpu_next_md_free_region - finds the next hint free area
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of free area
   *
   * Helper function for pcpu_for_each_md_free_region.  It checks
   * block->contig_hint and performs aggregation across blocks to find the
   * next hint.  It modifies bit_off and bits in-place to be consumed in the
   * loop.
   */
  static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
  				     int *bits)
  {
  	/* split the resume point into a block index and an offset inside that block */
  	int i = pcpu_off_to_block_index(*bit_off);
  	int block_off = pcpu_off_to_block_off(*bit_off);
  	struct pcpu_block_md *block;
  
  	*bits = 0;
  	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
  	     block++, i++) {
  		/* handles contig area across blocks */
  		/*
  		 * *bits is non-zero here only when the previous iteration
  		 * ended on that block's trailing free run (right_free).
  		 * Extend it with this block's leading free run; a block that
  		 * is free from end to end keeps the run going.
  		 */
  		if (*bits) {
  			*bits += block->left_free;
  			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
  				continue;
  			return;
  		}
  
  		/*
  		 * This checks three things.  First is there a contig_hint to
  		 * check.  Second, have we checked this hint before by
  		 * comparing the block_off.  Third, is this the same as the
  		 * right contig hint.  In the last case, it spills over into
  		 * the next block and should be handled by the contig area
  		 * across blocks code.
  		 */
  		*bits = block->contig_hint;
  		if (*bits && block->contig_hint_start >= block_off &&
  		    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
  			*bit_off = pcpu_block_off_to_off(i,
  					block->contig_hint_start);
  			return;
  		}
1fa4df3e6   Dennis Zhou   percpu: fix itera...
338
339
  		/* reset to satisfy the second predicate above */
  		block_off = 0;
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
340
341
342
343
344
  
  		/*
  		 * Fall back to the trailing free run of this block.  When it
  		 * is empty *bits becomes 0, so the next block is examined
  		 * from scratch instead of being merged.
  		 */
  		*bits = block->right_free;
  		*bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
  	}
  	/*
  	 * Falling out of the loop leaves *bit_off / *bits as set for the last
  	 * block; pcpu_for_each_md_free_region compares *bit_off against
  	 * pcpu_chunk_map_bits() to decide whether that is still a region.
  	 */
  }
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
  /**
   * pcpu_next_fit_region - finds fit areas for a given allocation request
   * @chunk: chunk of interest
   * @alloc_bits: size of allocation
   * @align: alignment of area (max PAGE_SIZE)
   * @bit_off: chunk offset
   * @bits: size of free area
   *
   * Finds the next free region that is viable for use with a given size and
   * alignment.  This only returns if there is a valid area to be used for this
   * allocation.  block->first_free is returned if the allocation request fits
   * within the block to see if the request can be fulfilled prior to the contig
   * hint.
   */
  static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
  				 int align, int *bit_off, int *bits)
  {
  	/* split the resume point into a block index and an offset inside that block */
  	int i = pcpu_off_to_block_index(*bit_off);
  	int block_off = pcpu_off_to_block_off(*bit_off);
  	struct pcpu_block_md *block;
  
  	*bits = 0;
  	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
  	     block++, i++) {
  		/* handles contig area across blocks */
  		/*
  		 * *bits carries the trailing free run of the previous block.
  		 * If adding this block's leading free run is enough, the
  		 * region starting at the already recorded *bit_off fits.  A
  		 * fully free block keeps accumulating; otherwise fall through
  		 * and examine this block's own hint.
  		 */
  		if (*bits) {
  			*bits += block->left_free;
  			if (*bits >= alloc_bits)
  				return;
  			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
  				continue;
  		}
  
  		/* check block->contig_hint */
  		/* *bits temporarily holds the padding needed to align the hint start */
  		*bits = ALIGN(block->contig_hint_start, align) -
  			block->contig_hint_start;
  		/*
  		 * This uses the block offset to determine if this has been
  		 * checked in the prior iteration.
  		 */
  		if (block->contig_hint &&
  		    block->contig_hint_start >= block_off &&
  		    block->contig_hint >= *bits + alloc_bits) {
  			/*
  			 * Report the area from first_free up to the end of
  			 * the aligned allocation inside the hint, so the
  			 * caller can look for a spot before the hint.
  			 */
  			*bits += alloc_bits + block->contig_hint_start -
  				 block->first_free;
  			*bit_off = pcpu_block_off_to_off(i, block->first_free);
  			return;
  		}
1fa4df3e6   Dennis Zhou   percpu: fix itera...
393
394
  		/* reset to satisfy the second predicate above */
  		block_off = 0;
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
395
396
397
398
399
400
401
402
403
404
405
406
  
  		/*
  		 * Try the trailing free run: align its start (block-relative
  		 * at this point), size it up to the end of the block, then
  		 * convert the start to a chunk offset.
  		 */
  		*bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
  				 align);
  		*bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
  		*bit_off = pcpu_block_off_to_off(i, *bit_off);
  		if (*bits >= alloc_bits)
  			return;
  	}
  
  	/* no valid offsets were found - fail condition */
  	/* pcpu_for_each_fit_region stops once *bit_off reaches this value */
  	*bit_off = pcpu_chunk_map_bits(chunk);
  }
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
407
408
409
  /*
   * Metadata free area iterators.  These perform aggregation of free areas
   * based on the metadata blocks and return the offset @bit_off and size in
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
410
411
   * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
   * a fit is found for the allocation request.
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
412
413
414
415
416
417
   */
  /*
   * Walk the free areas reported by pcpu_next_md_free_region(), starting
   * from the value @bit_off holds on entry.  Each step resumes one bit past
   * the end of the area just visited, and the walk ends once @bit_off is no
   * longer below pcpu_chunk_map_bits().
   */
  #define pcpu_for_each_md_free_region(chunk, bit_off, bits)		\
  	for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));	\
  	     (bit_off) < pcpu_chunk_map_bits((chunk));			\
  	     (bit_off) += (bits) + 1,					\
  	     pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
418
419
420
421
422
423
424
  /*
   * Walk the candidate areas reported by pcpu_next_fit_region(), starting
   * from the value @bit_off holds on entry.  Each step resumes directly
   * after the area just visited.  The walk ends when pcpu_next_fit_region()
   * reports failure by setting @bit_off to pcpu_chunk_map_bits().
   */
  #define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
  	for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
  				  &(bits));				      \
  	     (bit_off) < pcpu_chunk_map_bits((chunk));			      \
  	     (bit_off) += (bits),					      \
  	     pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
  				  &(bits)))
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
425
  /**
90459ce06   Bob Liu   percpu: rename pc...
426
   * pcpu_mem_zalloc - allocate memory
1880d93b8   Tejun Heo   percpu: replace p...
427
   * @size: bytes to allocate
72682b162   Dennis Zhou   percpu: add __GFP...
428
   * @gfp: allocation flags
fbf59bc9d   Tejun Heo   percpu: implement...
429
   *
1880d93b8   Tejun Heo   percpu: replace p...
430
   * Allocate @size bytes.  If @size is no larger than PAGE_SIZE,
72682b162   Dennis Zhou   percpu: add __GFP...
431
432
433
   * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
   * This is to facilitate passing through whitelisted flags.  The
   * returned memory is always zeroed.
fbf59bc9d   Tejun Heo   percpu: implement...
434
   *
ccea34b5d   Tejun Heo   percpu: finer gra...
435
436
437
   * CONTEXT:
   * Does GFP_KERNEL allocation.
   *
fbf59bc9d   Tejun Heo   percpu: implement...
438
   * RETURNS:
1880d93b8   Tejun Heo   percpu: replace p...
439
   * Pointer to the allocated area on success, NULL on failure.
fbf59bc9d   Tejun Heo   percpu: implement...
440
   */
72682b162   Dennis Zhou   percpu: add __GFP...
441
  static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
  {
  	/* GFP_KERNEL is always OR-ed into the caller's flags */
  	gfp_t flags = gfp | GFP_KERNEL;

  	/* refuse (with a one-time warning) before slab is available */
  	if (WARN_ON_ONCE(!slab_is_available()))
  		return NULL;

  	/* requests above one page go to __vmalloc(), zeroed via __GFP_ZERO */
  	if (size > PAGE_SIZE)
  		return __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);

  	return kzalloc(size, flags);
  }
fbf59bc9d   Tejun Heo   percpu: implement...
451

1880d93b8   Tejun Heo   percpu: replace p...
452
453
454
  /**
   * pcpu_mem_free - free memory
   * @ptr: memory to free
1880d93b8   Tejun Heo   percpu: replace p...
455
   *
90459ce06   Bob Liu   percpu: rename pc...
456
   * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
1880d93b8   Tejun Heo   percpu: replace p...
457
   */
1d5cfdb07   Tetsuo Handa   tree wide: use kv...
458
  static void pcpu_mem_free(void *ptr)
1880d93b8   Tejun Heo   percpu: replace p...
459
  {
1d5cfdb07   Tetsuo Handa   tree wide: use kv...
460
  	/*
  	 * pcpu_mem_zalloc() hands out either kzalloc() or __vmalloc()
  	 * memory depending on size, so the free side uses kvfree().
  	 */
  	kvfree(ptr);
fbf59bc9d   Tejun Heo   percpu: implement...
461
462
463
464
465
466
467
468
469
  }
  
  /**
   * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
   * @chunk: chunk of interest
   * @oslot: the previous slot it was on
   *
   * This function is called after an allocation or free changed @chunk.
   * New slot according to the changed state is determined and @chunk is
edcb46399   Tejun Heo   percpu, module: i...
470
471
   * moved to the slot.  Note that the reserved chunk is never put on
   * chunk slots.
ccea34b5d   Tejun Heo   percpu: finer gra...
472
473
474
   *
   * CONTEXT:
   * pcpu_lock.
fbf59bc9d   Tejun Heo   percpu: implement...
475
476
477
478
   */
  static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
  {
  	int nslot = pcpu_chunk_slot(chunk);
edcb46399   Tejun Heo   percpu, module: i...
479
  	if (chunk != pcpu_reserved_chunk && oslot != nslot) {
fbf59bc9d   Tejun Heo   percpu: implement...
480
481
482
483
484
485
  		if (oslot < nslot)
  			list_move(&chunk->list, &pcpu_slot[nslot]);
  		else
  			list_move_tail(&chunk->list, &pcpu_slot[nslot]);
  	}
  }
fbf59bc9d   Tejun Heo   percpu: implement...
486
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
487
   * pcpu_cnt_pop_pages - counts populated backing pages in range
833af8427   Tejun Heo   percpu: restructu...
488
   * @chunk: chunk of interest
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
489
490
   * @bit_off: start offset
   * @bits: size of area to check
9f7dcf224   Tejun Heo   percpu: move chun...
491
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
492
493
494
   * Calculates the number of populated pages in the region
   * [page_start, page_end).  This keeps track of how many empty populated
   * pages are available and decide if async work should be scheduled.
ccea34b5d   Tejun Heo   percpu: finer gra...
495
   *
9f7dcf224   Tejun Heo   percpu: move chun...
496
   * RETURNS:
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
497
   * The nr of populated pages.
9f7dcf224   Tejun Heo   percpu: move chun...
498
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
499
500
  static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off,
  				     int bits)
9f7dcf224   Tejun Heo   percpu: move chun...
501
  {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
502
503
  	int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE);
  	int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
4f996e234   Tejun Heo   percpu: fix synch...
504

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
505
  	if (page_start >= page_end)
9f7dcf224   Tejun Heo   percpu: move chun...
506
  		return 0;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
507
508
509
510
511
512
513
514
515
  	/*
  	 * bitmap_weight counts the number of bits set in a bitmap up to
  	 * the specified number of bits.  This is counting the populated
  	 * pages up to page_end and then subtracting the populated pages
  	 * up to page_start to count the populated pages in
  	 * [page_start, page_end).
  	 */
  	return bitmap_weight(chunk->populated, page_end) -
  	       bitmap_weight(chunk->populated, page_start);
833af8427   Tejun Heo   percpu: restructu...
516
517
518
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
519
   * pcpu_chunk_update - updates the chunk metadata given a free area
833af8427   Tejun Heo   percpu: restructu...
520
   * @chunk: chunk of interest
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
521
522
   * @bit_off: chunk offset
   * @bits: size of free area
833af8427   Tejun Heo   percpu: restructu...
523
   *
13f966373   Dennis Zhou (Facebook)   percpu: skip chun...
524
   * This updates the chunk's contig hint and starting offset given a free area.
268625a6f   Dennis Zhou (Facebook)   percpu: keep trac...
525
   * Choose the best starting offset if the contig hint is equal.
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
526
527
528
   */
  static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits)
  {
13f966373   Dennis Zhou (Facebook)   percpu: skip chun...
529
530
  	if (bits > chunk->contig_bits) {
  		chunk->contig_bits_start = bit_off;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
531
  		chunk->contig_bits = bits;
268625a6f   Dennis Zhou (Facebook)   percpu: keep trac...
532
533
534
535
536
  	} else if (bits == chunk->contig_bits && chunk->contig_bits_start &&
  		   (!bit_off ||
  		    __ffs(bit_off) > __ffs(chunk->contig_bits_start))) {
  		/* use the start with the best alignment */
  		chunk->contig_bits_start = bit_off;
13f966373   Dennis Zhou (Facebook)   percpu: skip chun...
537
  	}
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
538
539
540
541
542
  }
  
  /**
   * pcpu_chunk_refresh_hint - updates metadata about a chunk
   * @chunk: chunk of interest
833af8427   Tejun Heo   percpu: restructu...
543
   *
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
544
545
546
   * Iterates over the metadata blocks to find the largest contig area.
   * It also counts the populated pages and uses the delta to update the
   * global count.
833af8427   Tejun Heo   percpu: restructu...
547
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
548
549
   * Updates:
   *      chunk->contig_bits
13f966373   Dennis Zhou (Facebook)   percpu: skip chun...
550
   *      chunk->contig_bits_start
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
551
   *      nr_empty_pop_pages (chunk and global)
833af8427   Tejun Heo   percpu: restructu...
552
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
553
  static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk)
833af8427   Tejun Heo   percpu: restructu...
554
  {
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
555
  	int bit_off, bits, nr_empty_pop_pages;
833af8427   Tejun Heo   percpu: restructu...
556

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
557
558
  	/* clear metadata */
  	chunk->contig_bits = 0;
6710e594f   Tejun Heo   percpu: fix synch...
559

525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
560
  	bit_off = chunk->first_bit;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
561
  	bits = nr_empty_pop_pages = 0;
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
562
563
  	pcpu_for_each_md_free_region(chunk, bit_off, bits) {
  		pcpu_chunk_update(chunk, bit_off, bits);
833af8427   Tejun Heo   percpu: restructu...
564

525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
565
  		nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits);
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
566
  	}
9f7dcf224   Tejun Heo   percpu: move chun...
567

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
568
569
570
571
572
573
574
575
576
577
578
  	/*
  	 * Keep track of nr_empty_pop_pages.
  	 *
  	 * The chunk maintains the previous number of free pages it held,
  	 * so the delta is used to update the global counter.  The reserved
  	 * chunk is not part of the free page count as they are populated
  	 * at init and are special to serving reserved allocations.
  	 */
  	if (chunk != pcpu_reserved_chunk)
  		pcpu_nr_empty_pop_pages +=
  			(nr_empty_pop_pages - chunk->nr_empty_pop_pages);
a002d1484   Huang Shijie   percpu: fix a mem...
579

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
580
581
  	chunk->nr_empty_pop_pages = nr_empty_pop_pages;
  }
9f7dcf224   Tejun Heo   percpu: move chun...
582

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
583
  /**
   * pcpu_block_update - updates a block given a free area
   * @block: block of interest
   * @start: start offset in block
   * @end: end offset in block
   *
   * Updates a block given a known free area.  The region [start, end) is
   * expected to be the entirety of the free area within a block.  When the
   * area is as large as the current contig hint, the starting offset with
   * the better alignment is kept.
   */
  static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
  {
  	int len = end - start;

  	block->first_free = min(block->first_free, start);

  	/* an area touching a block edge defines the free run at that edge */
  	if (!start)
  		block->left_free = len;
  	if (end == PCPU_BITMAP_BLOCK_BITS)
  		block->right_free = len;

  	if (len > block->contig_hint) {
  		block->contig_hint_start = start;
  		block->contig_hint = len;
  		return;
  	}

  	/* only a tie can still change the hint; a start of 0 is never replaced */
  	if (len != block->contig_hint || !block->contig_hint_start)
  		return;

  	/* use the start with the best alignment */
  	if (!start || __ffs(start) > __ffs(block->contig_hint_start))
  		block->contig_hint_start = start;
  }
  
  /**
   * pcpu_block_refresh_hint - rebuilds the hints of one metadata block
   * @chunk: chunk of interest
   * @index: index of the metadata block
   *
   * Clears the block's contig_hint, left_free and right_free, then scans
   * the block's part of the allocation map from block->first_free to the
   * end of the block and feeds every free region to pcpu_block_update().
   */
  static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
  {
  	unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
  	struct pcpu_block_md *md = &chunk->md_blocks[index];
  	int free_start, free_end;

  	/* forget the old hints */
  	md->contig_hint = 0;
  	md->left_free = 0;
  	md->right_free = 0;

  	/* every free region found re-establishes the hints */
  	pcpu_for_each_unpop_region(alloc_map, free_start, free_end,
  				   md->first_free, PCPU_BITMAP_BLOCK_BITS) {
  		pcpu_block_update(md, free_start, free_end);
  	}
  }
  
  /**
   * pcpu_block_update_hint_alloc - update hint on allocation path
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of request
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
644
645
646
647
   *
   * Updates metadata for the allocation path.  The metadata only has to be
   * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
   * scans are required if the block's contig hint is broken.
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
   */
  static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
  					 int bits)
  {
  	struct pcpu_block_md *s_block, *e_block, *block;
  	int s_index, e_index;	/* block indexes of the freed allocation */
  	int s_off, e_off;	/* block offsets of the freed allocation */
  
  	/*
  	 * Calculate per block offsets.
  	 * The calculation uses an inclusive range, but the resulting offsets
  	 * are [start, end).  e_index always points to the last block in the
  	 * range.
  	 */
  	s_index = pcpu_off_to_block_index(bit_off);
  	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
  	s_off = pcpu_off_to_block_off(bit_off);
  	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
  
  	s_block = chunk->md_blocks + s_index;
  	e_block = chunk->md_blocks + e_index;
  
  	/*
  	 * Update s_block.
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
672
673
674
  	 * block->first_free must be updated if the allocation takes its place.
  	 * If the allocation breaks the contig_hint, a scan is required to
  	 * restore this hint.
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
675
  	 */
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
  	if (s_off == s_block->first_free)
  		s_block->first_free = find_next_zero_bit(
  					pcpu_index_alloc_map(chunk, s_index),
  					PCPU_BITMAP_BLOCK_BITS,
  					s_off + bits);
  
  	if (s_off >= s_block->contig_hint_start &&
  	    s_off < s_block->contig_hint_start + s_block->contig_hint) {
  		/* block contig hint is broken - scan to fix it */
  		pcpu_block_refresh_hint(chunk, s_index);
  	} else {
  		/* update left and right contig manually */
  		s_block->left_free = min(s_block->left_free, s_off);
  		if (s_index == e_index)
  			s_block->right_free = min_t(int, s_block->right_free,
  					PCPU_BITMAP_BLOCK_BITS - e_off);
  		else
  			s_block->right_free = 0;
  	}
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
695
696
697
698
699
  
  	/*
  	 * Update e_block.
  	 */
  	if (s_index != e_index) {
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
  		/*
  		 * When the allocation is across blocks, the end is along
  		 * the left part of the e_block.
  		 */
  		e_block->first_free = find_next_zero_bit(
  				pcpu_index_alloc_map(chunk, e_index),
  				PCPU_BITMAP_BLOCK_BITS, e_off);
  
  		if (e_off == PCPU_BITMAP_BLOCK_BITS) {
  			/* reset the block */
  			e_block++;
  		} else {
  			if (e_off > e_block->contig_hint_start) {
  				/* contig hint is broken - scan to fix it */
  				pcpu_block_refresh_hint(chunk, e_index);
  			} else {
  				e_block->left_free = 0;
  				e_block->right_free =
  					min_t(int, e_block->right_free,
  					      PCPU_BITMAP_BLOCK_BITS - e_off);
  			}
  		}
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
722
723
724
725
726
727
728
729
  
  		/* update in-between md_blocks */
  		for (block = s_block + 1; block < e_block; block++) {
  			block->contig_hint = 0;
  			block->left_free = 0;
  			block->right_free = 0;
  		}
  	}
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
730
731
732
733
734
735
736
737
  	/*
  	 * The only time a full chunk scan is required is if the chunk
  	 * contig hint is broken.  Otherwise, it means a smaller space
  	 * was used and therefore the chunk contig hint is still correct.
  	 */
  	if (bit_off >= chunk->contig_bits_start  &&
  	    bit_off < chunk->contig_bits_start + chunk->contig_bits)
  		pcpu_chunk_refresh_hint(chunk);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
738
739
740
741
742
743
744
  }
  
  /**
   * pcpu_block_update_hint_free - updates the block hints on the free path
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of request
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
745
746
747
748
749
750
751
752
753
754
755
   *
   * Updates metadata for the allocation path.  This avoids a blind block
   * refresh by making use of the block contig hints.  If this fails, it scans
   * forward and backward to determine the extent of the free area.  This is
   * capped at the boundary of blocks.
   *
   * A chunk update is triggered if a page becomes free, a block becomes free,
   * or the free spans across blocks.  This tradeoff is to minimize iterating
   * over the block metadata to update chunk->contig_bits.  chunk->contig_bits
   * may be off by up to a page, but it will never be more than the available
   * space.  If the contig hint is contained in one block, it will be accurate.
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
756
757
758
759
760
761
762
   */
  static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
  					int bits)
  {
  	struct pcpu_block_md *s_block, *e_block, *block;
  	int s_index, e_index;	/* block indexes of the freed allocation */
  	int s_off, e_off;	/* block offsets of the freed allocation */
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
763
  	int start, end;		/* start and end of the whole free area */
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
764
765
766
767
768
769
770
771
772
773
774
775
776
777
  
  	/*
  	 * Calculate per block offsets.
  	 * The calculation uses an inclusive range, but the resulting offsets
  	 * are [start, end).  e_index always points to the last block in the
  	 * range.
  	 */
  	s_index = pcpu_off_to_block_index(bit_off);
  	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
  	s_off = pcpu_off_to_block_off(bit_off);
  	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
  
  	s_block = chunk->md_blocks + s_index;
  	e_block = chunk->md_blocks + e_index;
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
  	/*
  	 * Check if the freed area aligns with the block->contig_hint.
  	 * If it does, then the scan to find the beginning/end of the
  	 * larger free area can be avoided.
  	 *
  	 * start and end refer to beginning and end of the free area
  	 * within each their respective blocks.  This is not necessarily
  	 * the entire free area as it may span blocks past the beginning
  	 * or end of the block.
  	 */
  	start = s_off;
  	if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
  		start = s_block->contig_hint_start;
  	} else {
  		/*
  		 * Scan backwards to find the extent of the free area.
  		 * find_last_bit returns the starting bit, so if the start bit
  		 * is returned, that means there was no last bit and the
  		 * remainder of the chunk is free.
  		 */
  		int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
  					  start);
  		start = (start == l_bit) ? 0 : l_bit + 1;
  	}
  
  	end = e_off;
  	if (e_off == e_block->contig_hint_start)
  		end = e_block->contig_hint_start + e_block->contig_hint;
  	else
  		end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
  				    PCPU_BITMAP_BLOCK_BITS, end);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
809
  	/* update s_block */
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
810
811
  	e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
  	pcpu_block_update(s_block, start, e_off);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
812
813
814
815
  
  	/* freeing in the same block */
  	if (s_index != e_index) {
  		/* update e_block */
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
816
  		pcpu_block_update(e_block, 0, end);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
817
818
819
820
821
822
823
824
825
826
  
  		/* reset md_blocks in the middle */
  		for (block = s_block + 1; block < e_block; block++) {
  			block->first_free = 0;
  			block->contig_hint_start = 0;
  			block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
  			block->left_free = PCPU_BITMAP_BLOCK_BITS;
  			block->right_free = PCPU_BITMAP_BLOCK_BITS;
  		}
  	}
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
827
828
829
830
831
832
833
834
835
836
837
838
839
  	/*
  	 * Refresh chunk metadata when the free makes a page free, a block
  	 * free, or spans across blocks.  The contig hint may be off by up to
  	 * a page, but if the hint is contained in a block, it will be accurate
  	 * with the else condition below.
  	 */
  	if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) >
  	     ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) ||
  	    s_index != e_index)
  		pcpu_chunk_refresh_hint(chunk);
  	else
  		pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start),
  				  s_block->contig_hint);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
840
841
842
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
   * pcpu_is_populated - determines if the region is populated
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of area
   * @next_off: return value for the next offset to start searching
   *
   * For atomic allocations, check if the backing pages are populated.
   *
   * RETURNS:
   * Bool if the backing pages are populated.
   * next_index is to skip over unpopulated blocks in pcpu_find_block_fit.
   */
  static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
  			      int *next_off)
  {
  	int page_start, page_end, rs, re;
833af8427   Tejun Heo   percpu: restructu...
859

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
860
861
  	page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
  	page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
833af8427   Tejun Heo   percpu: restructu...
862

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
863
864
865
866
  	rs = page_start;
  	pcpu_next_unpop(chunk->populated, &rs, &re, page_end);
  	if (rs >= page_end)
  		return true;
833af8427   Tejun Heo   percpu: restructu...
867

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
868
869
  	*next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
  	return false;
9f7dcf224   Tejun Heo   percpu: move chun...
870
871
872
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
873
874
875
876
877
878
   * pcpu_find_block_fit - finds the block index to start searching
   * @chunk: chunk of interest
   * @alloc_bits: size of request in allocation units
   * @align: alignment of area (max PAGE_SIZE bytes)
   * @pop_only: use populated regions only
   *
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
879
880
881
882
883
884
885
886
   * Given a chunk and an allocation spec, find the offset to begin searching
   * for a free region.  This iterates over the bitmap metadata blocks to
   * find an offset that will be guaranteed to fit the requirements.  It is
   * not quite first fit as if the allocation does not fit in the contig hint
   * of a block or chunk, it is skipped.  This errs on the side of caution
   * to prevent excess iteration.  Poor alignment can cause the allocator to
   * skip over blocks and chunks that have valid free areas.
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
887
888
889
   * RETURNS:
   * The offset in the bitmap to begin searching.
   * -1 if no offset is found.
a16037c8d   Tejun Heo   percpu: make pcpu...
890
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
891
892
  static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
  			       size_t align, bool pop_only)
a16037c8d   Tejun Heo   percpu: make pcpu...
893
  {
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
894
  	int bit_off, bits, next_off;
a16037c8d   Tejun Heo   percpu: make pcpu...
895

13f966373   Dennis Zhou (Facebook)   percpu: skip chun...
896
897
898
899
900
901
902
903
904
905
  	/*
  	 * Check to see if the allocation can fit in the chunk's contig hint.
  	 * This is an optimization to prevent scanning by assuming if it
  	 * cannot fit in the global hint, there is memory pressure and creating
  	 * a new chunk would happen soon.
  	 */
  	bit_off = ALIGN(chunk->contig_bits_start, align) -
  		  chunk->contig_bits_start;
  	if (bit_off + alloc_bits > chunk->contig_bits)
  		return -1;
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
906
907
908
  	bit_off = chunk->first_bit;
  	bits = 0;
  	pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
909
  		if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
910
  						   &next_off))
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
911
  			break;
a16037c8d   Tejun Heo   percpu: make pcpu...
912

b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
913
  		bit_off = next_off;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
914
  		bits = 0;
a16037c8d   Tejun Heo   percpu: make pcpu...
915
  	}
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
916
917
918
919
920
  
  	if (bit_off == pcpu_chunk_map_bits(chunk))
  		return -1;
  
  	return bit_off;
a16037c8d   Tejun Heo   percpu: make pcpu...
921
922
923
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
924
   * pcpu_alloc_area - allocates an area from a pcpu_chunk
fbf59bc9d   Tejun Heo   percpu: implement...
925
   * @chunk: chunk of interest
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
926
927
928
   * @alloc_bits: size of request in allocation units
   * @align: alignment of area (max PAGE_SIZE)
   * @start: bit_off to start searching
9f7dcf224   Tejun Heo   percpu: move chun...
929
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
930
   * This function takes in a @start offset to begin searching to fit an
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
931
932
933
934
935
936
   * allocation of @alloc_bits with alignment @align.  It needs to scan
   * the allocation map because if it fits within the block's contig hint,
   * @start will be block->first_free. This is an attempt to fill the
   * allocation prior to breaking the contig hint.  The allocation and
   * boundary maps are updated accordingly if it confirms a valid
   * free area.
ccea34b5d   Tejun Heo   percpu: finer gra...
937
   *
fbf59bc9d   Tejun Heo   percpu: implement...
938
   * RETURNS:
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
939
940
   * Allocated addr offset in @chunk on success.
   * -1 if no matching area is found.
fbf59bc9d   Tejun Heo   percpu: implement...
941
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
942
943
  static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
  			   size_t align, int start)
fbf59bc9d   Tejun Heo   percpu: implement...
944
  {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
945
946
  	size_t align_mask = (align) ? (align - 1) : 0;
  	int bit_off, end, oslot;
a16037c8d   Tejun Heo   percpu: make pcpu...
947

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
948
  	lockdep_assert_held(&pcpu_lock);
fbf59bc9d   Tejun Heo   percpu: implement...
949

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
950
  	oslot = pcpu_chunk_slot(chunk);
fbf59bc9d   Tejun Heo   percpu: implement...
951

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
952
953
954
  	/*
  	 * Search to find a fit.
  	 */
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
955
  	end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
956
957
958
959
  	bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start,
  					     alloc_bits, align_mask);
  	if (bit_off >= end)
  		return -1;
fbf59bc9d   Tejun Heo   percpu: implement...
960

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
961
962
  	/* update alloc map */
  	bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
3d331ad74   Al Viro   percpu: speed all...
963

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
964
965
966
967
  	/* update boundary map */
  	set_bit(bit_off, chunk->bound_map);
  	bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
  	set_bit(bit_off + alloc_bits, chunk->bound_map);
fbf59bc9d   Tejun Heo   percpu: implement...
968

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
969
  	chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
fbf59bc9d   Tejun Heo   percpu: implement...
970

86b442fbc   Dennis Zhou (Facebook)   percpu: add first...
971
972
973
974
975
976
  	/* update first free bit */
  	if (bit_off == chunk->first_bit)
  		chunk->first_bit = find_next_zero_bit(
  					chunk->alloc_map,
  					pcpu_chunk_map_bits(chunk),
  					bit_off + alloc_bits);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
977
  	pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
fbf59bc9d   Tejun Heo   percpu: implement...
978

fbf59bc9d   Tejun Heo   percpu: implement...
979
  	pcpu_chunk_relocate(chunk, oslot);
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
980
  	return bit_off * PCPU_MIN_ALLOC_SIZE;
fbf59bc9d   Tejun Heo   percpu: implement...
981
982
983
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
984
   * pcpu_free_area - frees the corresponding offset
fbf59bc9d   Tejun Heo   percpu: implement...
985
   * @chunk: chunk of interest
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
986
   * @off: addr offset into chunk
ccea34b5d   Tejun Heo   percpu: finer gra...
987
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
988
989
   * This function determines the size of an allocation to free using
   * the boundary bitmap and clears the allocation map.
fbf59bc9d   Tejun Heo   percpu: implement...
990
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
991
  static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
fbf59bc9d   Tejun Heo   percpu: implement...
992
  {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
993
  	int bit_off, bits, end, oslot;
723ad1d90   Al Viro   percpu: store off...
994

5ccd30e40   Dennis Zhou   percpu: add missi...
995
  	lockdep_assert_held(&pcpu_lock);
30a5b5367   Dennis Zhou   percpu: expose st...
996
  	pcpu_stats_area_dealloc(chunk);
5ccd30e40   Dennis Zhou   percpu: add missi...
997

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
998
  	oslot = pcpu_chunk_slot(chunk);
fbf59bc9d   Tejun Heo   percpu: implement...
999

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1000
  	bit_off = off / PCPU_MIN_ALLOC_SIZE;
3d331ad74   Al Viro   percpu: speed all...
1001

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1002
1003
1004
1005
1006
  	/* find end index */
  	end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
  			    bit_off + 1);
  	bits = end - bit_off;
  	bitmap_clear(chunk->alloc_map, bit_off, bits);
fbf59bc9d   Tejun Heo   percpu: implement...
1007

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1008
1009
  	/* update metadata */
  	chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
b539b87fe   Tejun Heo   percpu: implmeent...
1010

86b442fbc   Dennis Zhou (Facebook)   percpu: add first...
1011
1012
  	/* update first free bit */
  	chunk->first_bit = min(chunk->first_bit, bit_off);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1013
  	pcpu_block_update_hint_free(chunk, bit_off, bits);
fbf59bc9d   Tejun Heo   percpu: implement...
1014

fbf59bc9d   Tejun Heo   percpu: implement...
1015
1016
  	pcpu_chunk_relocate(chunk, oslot);
  }
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
  /**
   * pcpu_init_md_blocks - marks every metadata block of a chunk as free
   * @chunk: chunk whose md_blocks are initialized
   *
   * Sets contig_hint, left_free and right_free of each metadata block to
   * PCPU_BITMAP_BLOCK_BITS.  No other field of the blocks is written here.
   */
  static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
  {
  	struct pcpu_block_md *md = chunk->md_blocks;
  	struct pcpu_block_md *md_end = md + pcpu_chunk_nr_blocks(chunk);

  	while (md != md_end) {
  		md->contig_hint = PCPU_BITMAP_BLOCK_BITS;
  		md->left_free = PCPU_BITMAP_BLOCK_BITS;
  		md->right_free = PCPU_BITMAP_BLOCK_BITS;
  		md++;
  	}
  }
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
  /**
   * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
   * @tmp_addr: the start of the region served
   * @map_size: size of the region served
   *
   * This is responsible for creating the chunks that serve the first chunk.  The
   * base_addr is page aligned down of @tmp_addr while the region end is page
   * aligned up.  Offsets are kept track of to determine the region served. All
   * this is done to appease the bitmap allocator in avoiding partial blocks.
   *
   * RETURNS:
   * Chunk serving the region at @tmp_addr of @map_size.
   */
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1042
  static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1043
  							 int map_size)
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1044
1045
  {
  	struct pcpu_chunk *chunk;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1046
  	unsigned long aligned_addr, lcm_align;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1047
  	int start_offset, offset_bits, region_size, region_bits;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1048
1049
1050
1051
1052
  
  	/* region calculations */
  	aligned_addr = tmp_addr & PAGE_MASK;
  
  	start_offset = tmp_addr - aligned_addr;
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1053

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1054
1055
1056
1057
1058
1059
1060
  	/*
  	 * Align the end of the region with the LCM of PAGE_SIZE and
  	 * PCPU_BITMAP_BLOCK_SIZE.  One of these constants is a multiple of
  	 * the other.
  	 */
  	lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
  	region_size = ALIGN(start_offset + map_size, lcm_align);
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1061

c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1062
  	/* allocate chunk */
8ab16c43e   Dennis Zhou (Facebook)   percpu: change th...
1063
1064
1065
  	chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) +
  				    BITS_TO_LONGS(region_size >> PAGE_SHIFT),
  				    0);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1066

10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1067
  	INIT_LIST_HEAD(&chunk->list);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1068
1069
  
  	chunk->base_addr = (void *)aligned_addr;
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1070
  	chunk->start_offset = start_offset;
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1071
  	chunk->end_offset = region_size - chunk->start_offset - map_size;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1072

8ab16c43e   Dennis Zhou (Facebook)   percpu: change th...
1073
  	chunk->nr_pages = region_size >> PAGE_SHIFT;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1074
  	region_bits = pcpu_chunk_map_bits(chunk);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1075

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1076
1077
1078
1079
1080
1081
1082
  	chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) *
  					       sizeof(chunk->alloc_map[0]), 0);
  	chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) *
  					       sizeof(chunk->bound_map[0]), 0);
  	chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) *
  					       sizeof(chunk->md_blocks[0]), 0);
  	pcpu_init_md_blocks(chunk);
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1083
1084
1085
  
  	/* manage populated page bitmap */
  	chunk->immutable = true;
8ab16c43e   Dennis Zhou (Facebook)   percpu: change th...
1086
1087
  	bitmap_fill(chunk->populated, chunk->nr_pages);
  	chunk->nr_populated = chunk->nr_pages;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1088
1089
1090
  	chunk->nr_empty_pop_pages =
  		pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE,
  				   map_size / PCPU_MIN_ALLOC_SIZE);
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1091

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1092
1093
  	chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE;
  	chunk->free_bytes = map_size;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1094
1095
1096
  
  	if (chunk->start_offset) {
  		/* hide the beginning of the bitmap */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1097
1098
1099
1100
  		offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
  		bitmap_set(chunk->alloc_map, 0, offset_bits);
  		set_bit(0, chunk->bound_map);
  		set_bit(offset_bits, chunk->bound_map);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1101

86b442fbc   Dennis Zhou (Facebook)   percpu: add first...
1102
  		chunk->first_bit = offset_bits;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1103
  		pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1104
  	}
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1105
1106
  	if (chunk->end_offset) {
  		/* hide the end of the bitmap */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1107
1108
1109
1110
1111
1112
1113
  		offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
  		bitmap_set(chunk->alloc_map,
  			   pcpu_chunk_map_bits(chunk) - offset_bits,
  			   offset_bits);
  		set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
  			chunk->bound_map);
  		set_bit(region_bits, chunk->bound_map);
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1114

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1115
1116
1117
  		pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
  					     - offset_bits, offset_bits);
  	}
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1118

10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1119
1120
  	return chunk;
  }
72682b162   Dennis Zhou   percpu: add __GFP...
1121
  /**
   * pcpu_alloc_chunk - allocate and initialize a chunk descriptor
   * @gfp: allocation flags passed to pcpu_mem_zalloc()
   *
   * Allocates the chunk structure together with its allocation bitmap,
   * boundary bitmap and metadata block array, all through
   * pcpu_mem_zalloc().  The chunk is sized for pcpu_unit_pages pages and
   * starts out with every map bit contiguous and all bytes free.
   *
   * RETURNS:
   * Pointer to the new chunk, or NULL if any of the allocations failed
   * (everything allocated so far is released before returning).
   */
  static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
  {
  	struct pcpu_chunk *new_chunk;
  	size_t alloc_map_bytes, bound_map_bytes, md_blocks_bytes;
  	int nr_bits;

  	new_chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
  	if (!new_chunk)
  		return NULL;

  	INIT_LIST_HEAD(&new_chunk->list);
  	new_chunk->nr_pages = pcpu_unit_pages;
  	nr_bits = pcpu_chunk_map_bits(new_chunk);

  	alloc_map_bytes = BITS_TO_LONGS(nr_bits) *
  			  sizeof(new_chunk->alloc_map[0]);
  	new_chunk->alloc_map = pcpu_mem_zalloc(alloc_map_bytes, gfp);
  	if (!new_chunk->alloc_map)
  		goto err_free_chunk;

  	/* the boundary map carries one more bit than the allocation map */
  	bound_map_bytes = BITS_TO_LONGS(nr_bits + 1) *
  			  sizeof(new_chunk->bound_map[0]);
  	new_chunk->bound_map = pcpu_mem_zalloc(bound_map_bytes, gfp);
  	if (!new_chunk->bound_map)
  		goto err_free_alloc_map;

  	md_blocks_bytes = pcpu_chunk_nr_blocks(new_chunk) *
  			  sizeof(new_chunk->md_blocks[0]);
  	new_chunk->md_blocks = pcpu_mem_zalloc(md_blocks_bytes, gfp);
  	if (!new_chunk->md_blocks)
  		goto err_free_bound_map;

  	pcpu_init_md_blocks(new_chunk);

  	/* a fresh chunk is one fully free, fully contiguous region */
  	new_chunk->contig_bits = nr_bits;
  	new_chunk->free_bytes = new_chunk->nr_pages * PAGE_SIZE;

  	return new_chunk;

  	/* unwind in reverse order of allocation */
  err_free_bound_map:
  	pcpu_mem_free(new_chunk->bound_map);
  err_free_alloc_map:
  	pcpu_mem_free(new_chunk->alloc_map);
  err_free_chunk:
  	pcpu_mem_free(new_chunk);

  	return NULL;
  }
  
  /**
   * pcpu_free_chunk - release a chunk descriptor and its maps
   * @chunk: chunk to free, may be NULL
   *
   * Frees the metadata block array, the boundary map, the allocation map
   * and finally the chunk structure itself.  A NULL @chunk is ignored.
   */
  static void pcpu_free_chunk(struct pcpu_chunk *chunk)
  {
  	if (chunk) {
  		pcpu_mem_free(chunk->md_blocks);
  		pcpu_mem_free(chunk->bound_map);
  		pcpu_mem_free(chunk->alloc_map);
  		pcpu_mem_free(chunk);
  	}
  }
b539b87fe   Tejun Heo   percpu: implmeent...
1174
1175
1176
1177
1178
  /**
   * pcpu_chunk_populated - post-population bookkeeping
   * @chunk: pcpu_chunk which got populated
   * @page_start: the start page
   * @page_end: the end page
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1179
   * @for_alloc: if this is to populate for allocation
b539b87fe   Tejun Heo   percpu: implmeent...
1180
1181
1182
1183
   *
   * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
   * the bookkeeping information accordingly.  Must be called after each
   * successful population.
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1184
1185
1186
   *
   * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it
   * is to serve an allocation in that area.
b539b87fe   Tejun Heo   percpu: implmeent...
1187
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1188
1189
  static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
  				 int page_end, bool for_alloc)
b539b87fe   Tejun Heo   percpu: implmeent...
1190
1191
1192
1193
1194
1195
1196
  {
  	int nr = page_end - page_start;
  
  	lockdep_assert_held(&pcpu_lock);
  
  	bitmap_set(chunk->populated, page_start, nr);
  	chunk->nr_populated += nr;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1197
1198
1199
1200
1201
  
  	if (!for_alloc) {
  		chunk->nr_empty_pop_pages += nr;
  		pcpu_nr_empty_pop_pages += nr;
  	}
b539b87fe   Tejun Heo   percpu: implmeent...
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
  }
  
  /**
   * pcpu_chunk_depopulated - post-depopulation bookkeeping
   * @chunk: pcpu_chunk which got depopulated
   * @page_start: the start page
   * @page_end: the end page
   *
   * Pages in [@page_start,@page_end) have been depopulated from @chunk.
   * Update the bookkeeping information accordingly.  Must be called after
   * each successful depopulation.
   */
  static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
  				   int page_start, int page_end)
  {
  	int nr = page_end - page_start;
  
  	lockdep_assert_held(&pcpu_lock);
  
  	bitmap_clear(chunk->populated, page_start, nr);
  	chunk->nr_populated -= nr;
0cecf50cf   Dennis Zhou (Facebook)   percpu: introduce...
1223
  	chunk->nr_empty_pop_pages -= nr;
b539b87fe   Tejun Heo   percpu: implmeent...
1224
1225
  	pcpu_nr_empty_pop_pages -= nr;
  }
9f6455325   Tejun Heo   percpu: move vmal...
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
  /*
   * Chunk management implementation.
   *
   * To allow different implementations, chunk alloc/free and
   * [de]population are implemented in a separate file which is pulled
   * into this file and compiled together.  The following functions
   * should be implemented.
   *
   * pcpu_populate_chunk		- populate the specified range of a chunk
   * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
   * pcpu_create_chunk		- create a new chunk
   * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
   * pcpu_addr_to_page		- translate address to its backing struct page
   * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
fbf59bc9d   Tejun Heo   percpu: implement...
1240
   */
72682b162   Dennis Zhou   percpu: add __GFP...
1241
1242
  static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size,
  			       gfp_t gfp);
9f6455325   Tejun Heo   percpu: move vmal...
1243
  static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
72682b162   Dennis Zhou   percpu: add __GFP...
1244
  static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
9f6455325   Tejun Heo   percpu: move vmal...
1245
1246
1247
  static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
  static struct page *pcpu_addr_to_page(void *addr);
  static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
fbf59bc9d   Tejun Heo   percpu: implement...
1248

b0c9778b1   Tejun Heo   percpu: implement...
1249
1250
1251
  #ifdef CONFIG_NEED_PER_CPU_KM
  #include "percpu-km.c"
  #else
9f6455325   Tejun Heo   percpu: move vmal...
1252
  #include "percpu-vm.c"
b0c9778b1   Tejun Heo   percpu: implement...
1253
  #endif
fbf59bc9d   Tejun Heo   percpu: implement...
1254
1255
  
  /**
88999a898   Tejun Heo   percpu: misc prep...
1256
1257
1258
   * pcpu_chunk_addr_search - determine chunk containing specified address
   * @addr: address for which the chunk needs to be determined.
   *
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1259
1260
1261
   * This is an internal function that handles all but static allocations.
   * Static percpu address values should never be passed into the allocator.
   *
88999a898   Tejun Heo   percpu: misc prep...
1262
1263
1264
1265
1266
   * RETURNS:
   * The address of the found chunk.
   */
  static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  {
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1267
  	/* is it in the dynamic region (first chunk)? */
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
1268
  	if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
88999a898   Tejun Heo   percpu: misc prep...
1269
  		return pcpu_first_chunk;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1270
1271
  
  	/* is it in the reserved region? */
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
1272
  	if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1273
  		return pcpu_reserved_chunk;
88999a898   Tejun Heo   percpu: misc prep...
1274
1275
1276
1277
1278
1279
1280
1281
1282
  
  	/*
  	 * The address is relative to unit0 which might be unused and
  	 * thus unmapped.  Offset the address to the unit space of the
  	 * current processor before looking it up in the vmalloc
  	 * space.  Note that any possible cpu id can be used here, so
  	 * there's no need to worry about preemption or cpu hotplug.
  	 */
  	addr += pcpu_unit_offsets[raw_smp_processor_id()];
9f6455325   Tejun Heo   percpu: move vmal...
1283
  	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
88999a898   Tejun Heo   percpu: misc prep...
1284
1285
1286
  }
  
  /**
edcb46399   Tejun Heo   percpu, module: i...
1287
   * pcpu_alloc - the percpu allocator
cae3aeb83   Tejun Heo   percpu: clean up ...
1288
   * @size: size of area to allocate in bytes
fbf59bc9d   Tejun Heo   percpu: implement...
1289
   * @align: alignment of area (max PAGE_SIZE)
edcb46399   Tejun Heo   percpu, module: i...
1290
   * @reserved: allocate from the reserved chunk if available
5835d96e9   Tejun Heo   percpu: implement...
1291
   * @gfp: allocation flags
fbf59bc9d   Tejun Heo   percpu: implement...
1292
   *
5835d96e9   Tejun Heo   percpu: implement...
1293
   * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1294
1295
1296
   * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
   * then no warning will be triggered on invalid or failed allocation
   * requests.
fbf59bc9d   Tejun Heo   percpu: implement...
1297
1298
1299
1300
   *
   * RETURNS:
   * Percpu pointer to the allocated area on success, NULL on failure.
   */
5835d96e9   Tejun Heo   percpu: implement...
1301
1302
  static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
  				 gfp_t gfp)
fbf59bc9d   Tejun Heo   percpu: implement...
1303
  {
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1304
1305
  	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
  	bool do_warn = !(gfp & __GFP_NOWARN);
f2badb0c9   Tejun Heo   percpu: make allo...
1306
  	static int warn_limit = 10;
fbf59bc9d   Tejun Heo   percpu: implement...
1307
  	struct pcpu_chunk *chunk;
f2badb0c9   Tejun Heo   percpu: make allo...
1308
  	const char *err;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1309
  	int slot, off, cpu, ret;
403a91b16   Jiri Kosina   percpu: allow pcp...
1310
  	unsigned long flags;
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
1311
  	void __percpu *ptr;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1312
  	size_t bits, bit_align;
fbf59bc9d   Tejun Heo   percpu: implement...
1313

723ad1d90   Al Viro   percpu: store off...
1314
  	/*
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1315
1316
1317
1318
  	 * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
  	 * therefore alignment must be a minimum of that many bytes.
  	 * An allocation may have internal fragmentation from rounding up
  	 * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
723ad1d90   Al Viro   percpu: store off...
1319
  	 */
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
1320
1321
  	if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
  		align = PCPU_MIN_ALLOC_SIZE;
723ad1d90   Al Viro   percpu: store off...
1322

d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
1323
  	size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1324
1325
  	bits = size >> PCPU_MIN_ALLOC_SHIFT;
  	bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
2f69fa829   Al Viro   percpu: allocatio...
1326

3ca45a46f   zijun_hu   percpu: ensure th...
1327
1328
  	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
  		     !is_power_of_2(align))) {
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1329
1330
  		WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation
  ",
756a025f0   Joe Perches   mm: coalesce spli...
1331
  		     size, align);
fbf59bc9d   Tejun Heo   percpu: implement...
1332
1333
  		return NULL;
  	}
6710e594f   Tejun Heo   percpu: fix synch...
1334
1335
  	if (!is_atomic)
  		mutex_lock(&pcpu_alloc_mutex);
403a91b16   Jiri Kosina   percpu: allow pcp...
1336
  	spin_lock_irqsave(&pcpu_lock, flags);
fbf59bc9d   Tejun Heo   percpu: implement...
1337

edcb46399   Tejun Heo   percpu, module: i...
1338
1339
1340
  	/* serve reserved allocations from the reserved chunk if available */
  	if (reserved && pcpu_reserved_chunk) {
  		chunk = pcpu_reserved_chunk;
833af8427   Tejun Heo   percpu: restructu...
1341

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1342
1343
  		off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
  		if (off < 0) {
833af8427   Tejun Heo   percpu: restructu...
1344
  			err = "alloc from reserved chunk failed";
ccea34b5d   Tejun Heo   percpu: finer gra...
1345
  			goto fail_unlock;
f2badb0c9   Tejun Heo   percpu: make allo...
1346
  		}
833af8427   Tejun Heo   percpu: restructu...
1347

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1348
  		off = pcpu_alloc_area(chunk, bits, bit_align, off);
edcb46399   Tejun Heo   percpu, module: i...
1349
1350
  		if (off >= 0)
  			goto area_found;
833af8427   Tejun Heo   percpu: restructu...
1351

f2badb0c9   Tejun Heo   percpu: make allo...
1352
  		err = "alloc from reserved chunk failed";
ccea34b5d   Tejun Heo   percpu: finer gra...
1353
  		goto fail_unlock;
edcb46399   Tejun Heo   percpu, module: i...
1354
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1355
  restart:
edcb46399   Tejun Heo   percpu, module: i...
1356
  	/* search through normal chunks */
fbf59bc9d   Tejun Heo   percpu: implement...
1357
1358
  	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
  		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1359
1360
1361
  			off = pcpu_find_block_fit(chunk, bits, bit_align,
  						  is_atomic);
  			if (off < 0)
fbf59bc9d   Tejun Heo   percpu: implement...
1362
  				continue;
ccea34b5d   Tejun Heo   percpu: finer gra...
1363

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1364
  			off = pcpu_alloc_area(chunk, bits, bit_align, off);
fbf59bc9d   Tejun Heo   percpu: implement...
1365
1366
  			if (off >= 0)
  				goto area_found;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1367

fbf59bc9d   Tejun Heo   percpu: implement...
1368
1369
  		}
  	}
403a91b16   Jiri Kosina   percpu: allow pcp...
1370
  	spin_unlock_irqrestore(&pcpu_lock, flags);
ccea34b5d   Tejun Heo   percpu: finer gra...
1371

b38d08f31   Tejun Heo   percpu: restructu...
1372
1373
1374
1375
1376
  	/*
  	 * No space left.  Create a new chunk.  We don't want multiple
  	 * tasks to create chunks simultaneously.  Serialize and create iff
  	 * there's still no empty chunk after grabbing the mutex.
  	 */
11df02bf9   Dennis Zhou   percpu: resolve e...
1377
1378
  	if (is_atomic) {
  		err = "atomic alloc failed, no space left";
5835d96e9   Tejun Heo   percpu: implement...
1379
  		goto fail;
11df02bf9   Dennis Zhou   percpu: resolve e...
1380
  	}
5835d96e9   Tejun Heo   percpu: implement...
1381

b38d08f31   Tejun Heo   percpu: restructu...
1382
  	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
72682b162   Dennis Zhou   percpu: add __GFP...
1383
  		chunk = pcpu_create_chunk(0);
b38d08f31   Tejun Heo   percpu: restructu...
1384
1385
1386
1387
1388
1389
1390
1391
1392
  		if (!chunk) {
  			err = "failed to allocate new chunk";
  			goto fail;
  		}
  
  		spin_lock_irqsave(&pcpu_lock, flags);
  		pcpu_chunk_relocate(chunk, -1);
  	} else {
  		spin_lock_irqsave(&pcpu_lock, flags);
f2badb0c9   Tejun Heo   percpu: make allo...
1393
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1394

ccea34b5d   Tejun Heo   percpu: finer gra...
1395
  	goto restart;
fbf59bc9d   Tejun Heo   percpu: implement...
1396
1397
  
  area_found:
30a5b5367   Dennis Zhou   percpu: expose st...
1398
  	pcpu_stats_area_alloc(chunk, size);
403a91b16   Jiri Kosina   percpu: allow pcp...
1399
  	spin_unlock_irqrestore(&pcpu_lock, flags);
ccea34b5d   Tejun Heo   percpu: finer gra...
1400

dca496451   Tejun Heo   percpu: move comm...
1401
  	/* populate if not all pages are already there */
5835d96e9   Tejun Heo   percpu: implement...
1402
  	if (!is_atomic) {
e04d32083   Tejun Heo   percpu: indent th...
1403
  		int page_start, page_end, rs, re;
dca496451   Tejun Heo   percpu: move comm...
1404

e04d32083   Tejun Heo   percpu: indent th...
1405
1406
  		page_start = PFN_DOWN(off);
  		page_end = PFN_UP(off + size);
b38d08f31   Tejun Heo   percpu: restructu...
1407

91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
1408
1409
  		pcpu_for_each_unpop_region(chunk->populated, rs, re,
  					   page_start, page_end) {
e04d32083   Tejun Heo   percpu: indent th...
1410
  			WARN_ON(chunk->immutable);
72682b162   Dennis Zhou   percpu: add __GFP...
1411
  			ret = pcpu_populate_chunk(chunk, rs, re, 0);
e04d32083   Tejun Heo   percpu: indent th...
1412
1413
1414
  
  			spin_lock_irqsave(&pcpu_lock, flags);
  			if (ret) {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1415
  				pcpu_free_area(chunk, off);
e04d32083   Tejun Heo   percpu: indent th...
1416
1417
1418
  				err = "failed to populate";
  				goto fail_unlock;
  			}
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1419
  			pcpu_chunk_populated(chunk, rs, re, true);
e04d32083   Tejun Heo   percpu: indent th...
1420
  			spin_unlock_irqrestore(&pcpu_lock, flags);
dca496451   Tejun Heo   percpu: move comm...
1421
  		}
fbf59bc9d   Tejun Heo   percpu: implement...
1422

e04d32083   Tejun Heo   percpu: indent th...
1423
1424
  		mutex_unlock(&pcpu_alloc_mutex);
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1425

1a4d76076   Tejun Heo   percpu: implement...
1426
1427
  	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
  		pcpu_schedule_balance_work();
dca496451   Tejun Heo   percpu: move comm...
1428
1429
1430
  	/* clear the areas and return address relative to base address */
  	for_each_possible_cpu(cpu)
  		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
1431
  	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
8a8c35fad   Larry Finger   mm: kmemleak_allo...
1432
  	kmemleak_alloc_percpu(ptr, size, gfp);
df95e795a   Dennis Zhou   percpu: add trace...
1433
1434
1435
  
  	trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
  			chunk->base_addr, off, ptr);
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
1436
  	return ptr;
ccea34b5d   Tejun Heo   percpu: finer gra...
1437
1438
  
  fail_unlock:
403a91b16   Jiri Kosina   percpu: allow pcp...
1439
  	spin_unlock_irqrestore(&pcpu_lock, flags);
b38d08f31   Tejun Heo   percpu: restructu...
1440
  fail:
df95e795a   Dennis Zhou   percpu: add trace...
1441
  	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1442
  	if (!is_atomic && do_warn && warn_limit) {
870d4b12a   Joe Perches   mm: percpu: use p...
1443
1444
  		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s
  ",
598d80914   Joe Perches   mm: convert pr_wa...
1445
  			size, align, is_atomic, err);
f2badb0c9   Tejun Heo   percpu: make allo...
1446
1447
  		dump_stack();
  		if (!--warn_limit)
870d4b12a   Joe Perches   mm: percpu: use p...
1448
1449
  			pr_info("limit reached, disable warning
  ");
f2badb0c9   Tejun Heo   percpu: make allo...
1450
  	}
1a4d76076   Tejun Heo   percpu: implement...
1451
1452
1453
1454
  	if (is_atomic) {
  		/* see the flag handling in pcpu_blance_workfn() */
  		pcpu_atomic_alloc_failed = true;
  		pcpu_schedule_balance_work();
6710e594f   Tejun Heo   percpu: fix synch...
1455
1456
  	} else {
  		mutex_unlock(&pcpu_alloc_mutex);
1a4d76076   Tejun Heo   percpu: implement...
1457
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1458
  	return NULL;
fbf59bc9d   Tejun Heo   percpu: implement...
1459
  }
edcb46399   Tejun Heo   percpu, module: i...
1460
1461
  
  /**
5835d96e9   Tejun Heo   percpu: implement...
1462
   * __alloc_percpu_gfp - allocate dynamic percpu area
edcb46399   Tejun Heo   percpu, module: i...
1463
1464
   * @size: size of area to allocate in bytes
   * @align: alignment of area (max PAGE_SIZE)
5835d96e9   Tejun Heo   percpu: implement...
1465
   * @gfp: allocation flags
edcb46399   Tejun Heo   percpu, module: i...
1466
   *
5835d96e9   Tejun Heo   percpu: implement...
1467
1468
   * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
   * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1469
1470
1471
   * be called from any context but is a lot more likely to fail. If @gfp
   * has __GFP_NOWARN then no warning will be triggered on invalid or failed
   * allocation requests.
ccea34b5d   Tejun Heo   percpu: finer gra...
1472
   *
edcb46399   Tejun Heo   percpu, module: i...
1473
1474
1475
   * RETURNS:
   * Percpu pointer to the allocated area on success, NULL on failure.
   */
5835d96e9   Tejun Heo   percpu: implement...
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
  void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
  {
  	return pcpu_alloc(size, align, false, gfp);
  }
  EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
  
  /**
   * __alloc_percpu - allocate dynamic percpu area
   * @size: size of area to allocate in bytes
   * @align: alignment of area (max PAGE_SIZE)
   *
   * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
   */
43cf38eb5   Tejun Heo   percpu: add __per...
1489
  void __percpu *__alloc_percpu(size_t size, size_t align)
edcb46399   Tejun Heo   percpu, module: i...
1490
  {
5835d96e9   Tejun Heo   percpu: implement...
1491
  	return pcpu_alloc(size, align, false, GFP_KERNEL);
edcb46399   Tejun Heo   percpu, module: i...
1492
  }
fbf59bc9d   Tejun Heo   percpu: implement...
1493
  EXPORT_SYMBOL_GPL(__alloc_percpu);
edcb46399   Tejun Heo   percpu, module: i...
1494
1495
1496
1497
1498
  /**
   * __alloc_reserved_percpu - allocate reserved percpu area
   * @size: size of area to allocate in bytes
   * @align: alignment of area (max PAGE_SIZE)
   *
9329ba970   Tejun Heo   percpu: update co...
1499
1500
1501
1502
   * Allocate zero-filled percpu area of @size bytes aligned at @align
   * from reserved percpu area if arch has set it up; otherwise,
   * allocation is served from the same dynamic area.  Might sleep.
   * Might trigger writeouts.
edcb46399   Tejun Heo   percpu, module: i...
1503
   *
ccea34b5d   Tejun Heo   percpu: finer gra...
1504
1505
1506
   * CONTEXT:
   * Does GFP_KERNEL allocation.
   *
edcb46399   Tejun Heo   percpu, module: i...
1507
1508
1509
   * RETURNS:
   * Percpu pointer to the allocated area on success, NULL on failure.
   */
43cf38eb5   Tejun Heo   percpu: add __per...
1510
  void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
edcb46399   Tejun Heo   percpu, module: i...
1511
  {
5835d96e9   Tejun Heo   percpu: implement...
1512
  	return pcpu_alloc(size, align, true, GFP_KERNEL);
edcb46399   Tejun Heo   percpu, module: i...
1513
  }
a56dbddf0   Tejun Heo   percpu: move full...
1514
  /**
1a4d76076   Tejun Heo   percpu: implement...
1515
   * pcpu_balance_workfn - manage the amount of free chunks and populated pages
a56dbddf0   Tejun Heo   percpu: move full...
1516
1517
   * @work: unused
   *
72682b162   Dennis Zhou   percpu: add __GFP...
1518
1519
1520
1521
1522
1523
   * Reclaim all fully free chunks except for the first one.  This is also
   * responsible for maintaining the pool of empty populated pages.  However,
   * it is possible that this is called when physical memory is scarce causing
   * OOM killer to be triggered.  We should avoid doing so until an actual
   * allocation causes the failure as it is possible that requests can be
   * serviced from already backed regions.
a56dbddf0   Tejun Heo   percpu: move full...
1524
   */
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1525
  static void pcpu_balance_workfn(struct work_struct *work)
fbf59bc9d   Tejun Heo   percpu: implement...
1526
  {
72682b162   Dennis Zhou   percpu: add __GFP...
1527
1528
  	/* gfp flags passed to underlying allocators */
  	const gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1529
1530
  	LIST_HEAD(to_free);
  	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
a56dbddf0   Tejun Heo   percpu: move full...
1531
  	struct pcpu_chunk *chunk, *next;
1a4d76076   Tejun Heo   percpu: implement...
1532
  	int slot, nr_to_pop, ret;
a56dbddf0   Tejun Heo   percpu: move full...
1533

1a4d76076   Tejun Heo   percpu: implement...
1534
1535
1536
1537
  	/*
  	 * There's no reason to keep around multiple unused chunks and VM
  	 * areas can be scarce.  Destroy all free chunks except for one.
  	 */
ccea34b5d   Tejun Heo   percpu: finer gra...
1538
1539
  	mutex_lock(&pcpu_alloc_mutex);
  	spin_lock_irq(&pcpu_lock);
a56dbddf0   Tejun Heo   percpu: move full...
1540

fe6bd8c3d   Tejun Heo   percpu: rename pc...
1541
  	list_for_each_entry_safe(chunk, next, free_head, list) {
a56dbddf0   Tejun Heo   percpu: move full...
1542
1543
1544
  		WARN_ON(chunk->immutable);
  
  		/* spare the first one */
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1545
  		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
a56dbddf0   Tejun Heo   percpu: move full...
1546
  			continue;
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1547
  		list_move(&chunk->list, &to_free);
a56dbddf0   Tejun Heo   percpu: move full...
1548
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1549
  	spin_unlock_irq(&pcpu_lock);
a56dbddf0   Tejun Heo   percpu: move full...
1550

fe6bd8c3d   Tejun Heo   percpu: rename pc...
1551
  	list_for_each_entry_safe(chunk, next, &to_free, list) {
a93ace487   Tejun Heo   percpu: move regi...
1552
  		int rs, re;
dca496451   Tejun Heo   percpu: move comm...
1553

91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
1554
1555
  		pcpu_for_each_pop_region(chunk->populated, rs, re, 0,
  					 chunk->nr_pages) {
a93ace487   Tejun Heo   percpu: move regi...
1556
  			pcpu_depopulate_chunk(chunk, rs, re);
b539b87fe   Tejun Heo   percpu: implmeent...
1557
1558
1559
  			spin_lock_irq(&pcpu_lock);
  			pcpu_chunk_depopulated(chunk, rs, re);
  			spin_unlock_irq(&pcpu_lock);
a93ace487   Tejun Heo   percpu: move regi...
1560
  		}
6081089fd   Tejun Heo   percpu: reorganiz...
1561
  		pcpu_destroy_chunk(chunk);
a56dbddf0   Tejun Heo   percpu: move full...
1562
  	}
971f3918a   Tejun Heo   percpu: fix pcpu_...
1563

1a4d76076   Tejun Heo   percpu: implement...
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
  	/*
  	 * Ensure there are certain number of free populated pages for
  	 * atomic allocs.  Fill up from the most packed so that atomic
  	 * allocs don't increase fragmentation.  If atomic allocation
  	 * failed previously, always populate the maximum amount.  This
  	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
  	 * failing indefinitely; however, large atomic allocs are not
  	 * something we support properly and can be highly unreliable and
  	 * inefficient.
  	 */
  retry_pop:
  	if (pcpu_atomic_alloc_failed) {
  		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
  		/* best effort anyway, don't worry about synchronization */
  		pcpu_atomic_alloc_failed = false;
  	} else {
  		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
  				  pcpu_nr_empty_pop_pages,
  				  0, PCPU_EMPTY_POP_PAGES_HIGH);
  	}
  
  	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
  		int nr_unpop = 0, rs, re;
  
  		if (!nr_to_pop)
  			break;
  
  		spin_lock_irq(&pcpu_lock);
  		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
8ab16c43e   Dennis Zhou (Facebook)   percpu: change th...
1593
  			nr_unpop = chunk->nr_pages - chunk->nr_populated;
1a4d76076   Tejun Heo   percpu: implement...
1594
1595
1596
1597
1598
1599
1600
1601
1602
  			if (nr_unpop)
  				break;
  		}
  		spin_unlock_irq(&pcpu_lock);
  
  		if (!nr_unpop)
  			continue;
  
  		/* @chunk can't go away while pcpu_alloc_mutex is held */
91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
1603
1604
  		pcpu_for_each_unpop_region(chunk->populated, rs, re, 0,
  					   chunk->nr_pages) {
1a4d76076   Tejun Heo   percpu: implement...
1605
  			int nr = min(re - rs, nr_to_pop);
72682b162   Dennis Zhou   percpu: add __GFP...
1606
  			ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
1a4d76076   Tejun Heo   percpu: implement...
1607
1608
1609
  			if (!ret) {
  				nr_to_pop -= nr;
  				spin_lock_irq(&pcpu_lock);
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1610
  				pcpu_chunk_populated(chunk, rs, rs + nr, false);
1a4d76076   Tejun Heo   percpu: implement...
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
  				spin_unlock_irq(&pcpu_lock);
  			} else {
  				nr_to_pop = 0;
  			}
  
  			if (!nr_to_pop)
  				break;
  		}
  	}
  
  	if (nr_to_pop) {
  		/* ran out of chunks to populate, create a new one and retry */
72682b162   Dennis Zhou   percpu: add __GFP...
1623
  		chunk = pcpu_create_chunk(gfp);
1a4d76076   Tejun Heo   percpu: implement...
1624
1625
1626
1627
1628
1629
1630
  		if (chunk) {
  			spin_lock_irq(&pcpu_lock);
  			pcpu_chunk_relocate(chunk, -1);
  			spin_unlock_irq(&pcpu_lock);
  			goto retry_pop;
  		}
  	}
971f3918a   Tejun Heo   percpu: fix pcpu_...
1631
  	mutex_unlock(&pcpu_alloc_mutex);
fbf59bc9d   Tejun Heo   percpu: implement...
1632
1633
1634
1635
1636
1637
  }
  
  /**
   * free_percpu - free percpu area
   * @ptr: pointer to area to free
   *
ccea34b5d   Tejun Heo   percpu: finer gra...
1638
1639
1640
1641
   * Free percpu area @ptr.
   *
   * CONTEXT:
   * Can be called from atomic context.
fbf59bc9d   Tejun Heo   percpu: implement...
1642
   */
43cf38eb5   Tejun Heo   percpu: add __per...
1643
  void free_percpu(void __percpu *ptr)
fbf59bc9d   Tejun Heo   percpu: implement...
1644
  {
129182e56   Andrew Morton   percpu: avoid cal...
1645
  	void *addr;
fbf59bc9d   Tejun Heo   percpu: implement...
1646
  	struct pcpu_chunk *chunk;
ccea34b5d   Tejun Heo   percpu: finer gra...
1647
  	unsigned long flags;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1648
  	int off;
fbf59bc9d   Tejun Heo   percpu: implement...
1649
1650
1651
  
  	if (!ptr)
  		return;
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
1652
  	kmemleak_free_percpu(ptr);
129182e56   Andrew Morton   percpu: avoid cal...
1653
  	addr = __pcpu_ptr_to_addr(ptr);
ccea34b5d   Tejun Heo   percpu: finer gra...
1654
  	spin_lock_irqsave(&pcpu_lock, flags);
fbf59bc9d   Tejun Heo   percpu: implement...
1655
1656
  
  	chunk = pcpu_chunk_addr_search(addr);
bba174f5e   Tejun Heo   percpu: add chunk...
1657
  	off = addr - chunk->base_addr;
fbf59bc9d   Tejun Heo   percpu: implement...
1658

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1659
  	pcpu_free_area(chunk, off);
fbf59bc9d   Tejun Heo   percpu: implement...
1660

a56dbddf0   Tejun Heo   percpu: move full...
1661
  	/* if there are more than one fully free chunks, wake up grim reaper */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1662
  	if (chunk->free_bytes == pcpu_unit_size) {
fbf59bc9d   Tejun Heo   percpu: implement...
1663
  		struct pcpu_chunk *pos;
a56dbddf0   Tejun Heo   percpu: move full...
1664
  		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
fbf59bc9d   Tejun Heo   percpu: implement...
1665
  			if (pos != chunk) {
1a4d76076   Tejun Heo   percpu: implement...
1666
  				pcpu_schedule_balance_work();
fbf59bc9d   Tejun Heo   percpu: implement...
1667
1668
1669
  				break;
  			}
  	}
df95e795a   Dennis Zhou   percpu: add trace...
1670
  	trace_percpu_free_percpu(chunk->base_addr, off, ptr);
ccea34b5d   Tejun Heo   percpu: finer gra...
1671
  	spin_unlock_irqrestore(&pcpu_lock, flags);
fbf59bc9d   Tejun Heo   percpu: implement...
1672
1673
  }
  EXPORT_SYMBOL_GPL(free_percpu);
383776fa7   Thomas Gleixner   locking/lockdep: ...
1674
  bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
10fad5e46   Tejun Heo   percpu, module: i...
1675
  {
bbddff054   Tejun Heo   percpu: use percp...
1676
  #ifdef CONFIG_SMP
10fad5e46   Tejun Heo   percpu, module: i...
1677
1678
1679
1680
1681
1682
  	const size_t static_size = __per_cpu_end - __per_cpu_start;
  	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
  	unsigned int cpu;
  
  	for_each_possible_cpu(cpu) {
  		void *start = per_cpu_ptr(base, cpu);
383776fa7   Thomas Gleixner   locking/lockdep: ...
1683
  		void *va = (void *)addr;
10fad5e46   Tejun Heo   percpu, module: i...
1684

383776fa7   Thomas Gleixner   locking/lockdep: ...
1685
  		if (va >= start && va < start + static_size) {
8ce371f98   Peter Zijlstra   lockdep: Fix per-...
1686
  			if (can_addr) {
383776fa7   Thomas Gleixner   locking/lockdep: ...
1687
  				*can_addr = (unsigned long) (va - start);
8ce371f98   Peter Zijlstra   lockdep: Fix per-...
1688
1689
1690
  				*can_addr += (unsigned long)
  					per_cpu_ptr(base, get_boot_cpu_id());
  			}
10fad5e46   Tejun Heo   percpu, module: i...
1691
  			return true;
383776fa7   Thomas Gleixner   locking/lockdep: ...
1692
1693
  		}
  	}
bbddff054   Tejun Heo   percpu: use percp...
1694
1695
  #endif
  	/* on UP, can't distinguish from other static vars, always false */
10fad5e46   Tejun Heo   percpu, module: i...
1696
1697
1698
1699
  	return false;
  }
  
  /**
   * is_kernel_percpu_address - test whether address is from static percpu area
   * @addr: address to test
   *
   * Test whether @addr belongs to in-kernel static percpu area.  Module
   * static percpu areas are not considered.  For those, use
   * is_module_percpu_address().
   *
   * This is a thin wrapper around __is_kernel_percpu_address() which
   * passes a NULL @can_addr, i.e. only the yes/no answer is requested.
   *
   * RETURNS:
   * %true if @addr is from in-kernel static percpu area, %false otherwise.
   */
  bool is_kernel_percpu_address(unsigned long addr)
  {
  	return __is_kernel_percpu_address(addr, NULL);
  }
  
  /**
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
1716
1717
1718
1719
1720
1721
1722
1723
   * per_cpu_ptr_to_phys - convert translated percpu address to physical address
   * @addr: the address to be converted to physical address
   *
   * Given @addr which is dereferenceable address obtained via one of
   * percpu access macros, this function translates it into its physical
   * address.  The caller is responsible for ensuring @addr stays valid
   * until this function finishes.
   *
67589c714   Dave Young   percpu: explain w...
1724
1725
1726
1727
1728
   * percpu allocator has special setup for the first chunk, which currently
   * supports either embedding in linear address space or vmalloc mapping,
   * and, from the second one, the backing allocator (currently either vm or
   * km) provides translation.
   *
bffc43758   Yannick Guerrini   percpu: Fix trivi...
1729
   * The addr can be translated simply without checking if it falls into the
67589c714   Dave Young   percpu: explain w...
1730
1731
1732
1733
1734
   * first chunk. But the current code reflects better how percpu allocator
   * actually works, and the verification can discover both bugs in percpu
   * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
   * code.
   *
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
1735
1736
1737
1738
1739
   * RETURNS:
   * The physical address for @addr.
   */
  phys_addr_t per_cpu_ptr_to_phys(void *addr)
  {
9983b6f0c   Tejun Heo   percpu: fix first...
1740
1741
  	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
  	bool in_first_chunk = false;
a855b84c3   Tejun Heo   percpu: fix chunk...
1742
  	unsigned long first_low, first_high;
9983b6f0c   Tejun Heo   percpu: fix first...
1743
1744
1745
  	unsigned int cpu;
  
  	/*
a855b84c3   Tejun Heo   percpu: fix chunk...
1746
  	 * The following test on unit_low/high isn't strictly
9983b6f0c   Tejun Heo   percpu: fix first...
1747
1748
  	 * necessary but will speed up lookups of addresses which
  	 * aren't in the first chunk.
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1749
1750
1751
1752
1753
  	 *
  	 * The address check is against full chunk sizes.  pcpu_base_addr
  	 * points to the beginning of the first chunk including the
  	 * static region.  Assumes good intent as the first chunk may
  	 * not be full (ie. < pcpu_unit_pages in size).
9983b6f0c   Tejun Heo   percpu: fix first...
1754
  	 */
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1755
1756
1757
1758
  	first_low = (unsigned long)pcpu_base_addr +
  		    pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
  	first_high = (unsigned long)pcpu_base_addr +
  		     pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
a855b84c3   Tejun Heo   percpu: fix chunk...
1759
1760
  	if ((unsigned long)addr >= first_low &&
  	    (unsigned long)addr < first_high) {
9983b6f0c   Tejun Heo   percpu: fix first...
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
  		for_each_possible_cpu(cpu) {
  			void *start = per_cpu_ptr(base, cpu);
  
  			if (addr >= start && addr < start + pcpu_unit_size) {
  				in_first_chunk = true;
  				break;
  			}
  		}
  	}
  
  	if (in_first_chunk) {
eac522ef4   David Howells   NOMMU: percpu sho...
1772
  		if (!is_vmalloc_addr(addr))
020ec6537   Tejun Heo   percpu: factor ou...
1773
1774
  			return __pa(addr);
  		else
9f57bd4d6   Eugene Surovegin   percpu: fix per_c...
1775
1776
  			return page_to_phys(vmalloc_to_page(addr)) +
  			       offset_in_page(addr);
020ec6537   Tejun Heo   percpu: factor ou...
1777
  	} else
9f57bd4d6   Eugene Surovegin   percpu: fix per_c...
1778
1779
  		return page_to_phys(pcpu_addr_to_page(addr)) +
  		       offset_in_page(addr);
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
1780
  }
fbf59bc9d   Tejun Heo   percpu: implement...
1781
  /**
   * pcpu_alloc_alloc_info - allocate percpu allocation info
   * @nr_groups: the number of groups
   * @nr_units: the number of units
   *
   * Allocate ai which is large enough for @nr_groups groups containing
   * @nr_units units.  The returned ai's groups[0].cpu_map points to the
   * cpu_map array which is long enough for @nr_units and filled with
   * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
   * pointer of other groups.
   *
   * RETURNS:
   * Pointer to the allocated pcpu_alloc_info on success, NULL on
   * failure.
   */
  struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
  						      int nr_units)
  {
  	struct pcpu_alloc_info *ai;
  	size_t hdr_size, total_size, alloc_size;
  	int i;

  	/* header plus the group array, padded so that cpu_map[] is aligned */
  	hdr_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
  			 __alignof__(ai->groups[0].cpu_map[0]));
  	total_size = hdr_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
  	alloc_size = PFN_ALIGN(total_size);

  	ai = memblock_virt_alloc_nopanic(alloc_size, 0);
  	if (!ai)
  		return NULL;

  	/* the cpu_map array lives right behind the padded header */
  	ai->groups[0].cpu_map = (void *)ai + hdr_size;
  	for (i = 0; i < nr_units; i++)
  		ai->groups[0].cpu_map[i] = NR_CPUS;

  	ai->nr_groups = nr_groups;
  	/* remembered so that pcpu_free_alloc_info() frees the same amount */
  	ai->__ai_size = alloc_size;

  	return ai;
  }
  
  /**
   * pcpu_free_alloc_info - free percpu allocation info
   * @ai: pcpu_alloc_info to free
   *
   * Free @ai which was allocated by pcpu_alloc_alloc_info().  The number
   * of bytes released is the size recorded in @ai->__ai_size.
   */
  void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
  {
  	const size_t size = ai->__ai_size;

  	memblock_free_early(__pa(ai), size);
  }
  
  /**
fd1e8a1fe   Tejun Heo   percpu: introduce...
1836
1837
1838
1839
1840
1841
1842
1843
   * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
   * @lvl: loglevel
   * @ai: allocation info to dump
   *
   * Print out information about @ai using loglevel @lvl.
   */
  static void pcpu_dump_alloc_info(const char *lvl,
  				 const struct pcpu_alloc_info *ai)
033e48fb8   Tejun Heo   percpu: move pcpu...
1844
  {
fd1e8a1fe   Tejun Heo   percpu: introduce...
1845
  	int group_width = 1, cpu_width = 1, width;
033e48fb8   Tejun Heo   percpu: move pcpu...
1846
  	char empty_str[] = "--------";
fd1e8a1fe   Tejun Heo   percpu: introduce...
1847
1848
1849
1850
1851
1852
1853
  	int alloc = 0, alloc_end = 0;
  	int group, v;
  	int upa, apl;	/* units per alloc, allocs per line */
  
  	v = ai->nr_groups;
  	while (v /= 10)
  		group_width++;
033e48fb8   Tejun Heo   percpu: move pcpu...
1854

fd1e8a1fe   Tejun Heo   percpu: introduce...
1855
  	v = num_possible_cpus();
033e48fb8   Tejun Heo   percpu: move pcpu...
1856
  	while (v /= 10)
fd1e8a1fe   Tejun Heo   percpu: introduce...
1857
1858
  		cpu_width++;
  	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
033e48fb8   Tejun Heo   percpu: move pcpu...
1859

fd1e8a1fe   Tejun Heo   percpu: introduce...
1860
1861
1862
  	upa = ai->alloc_size / ai->unit_size;
  	width = upa * (cpu_width + 1) + group_width + 3;
  	apl = rounddown_pow_of_two(max(60 / width, 1));
033e48fb8   Tejun Heo   percpu: move pcpu...
1863

fd1e8a1fe   Tejun Heo   percpu: introduce...
1864
1865
1866
  	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
  	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
  	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
033e48fb8   Tejun Heo   percpu: move pcpu...
1867

fd1e8a1fe   Tejun Heo   percpu: introduce...
1868
1869
1870
1871
1872
1873
1874
1875
  	for (group = 0; group < ai->nr_groups; group++) {
  		const struct pcpu_group_info *gi = &ai->groups[group];
  		int unit = 0, unit_end = 0;
  
  		BUG_ON(gi->nr_units % upa);
  		for (alloc_end += gi->nr_units / upa;
  		     alloc < alloc_end; alloc++) {
  			if (!(alloc % apl)) {
1170532bb   Joe Perches   mm: convert print...
1876
1877
  				pr_cont("
  ");
fd1e8a1fe   Tejun Heo   percpu: introduce...
1878
1879
  				printk("%spcpu-alloc: ", lvl);
  			}
1170532bb   Joe Perches   mm: convert print...
1880
  			pr_cont("[%0*d] ", group_width, group);
fd1e8a1fe   Tejun Heo   percpu: introduce...
1881
1882
1883
  
  			for (unit_end += upa; unit < unit_end; unit++)
  				if (gi->cpu_map[unit] != NR_CPUS)
1170532bb   Joe Perches   mm: convert print...
1884
1885
  					pr_cont("%0*d ",
  						cpu_width, gi->cpu_map[unit]);
fd1e8a1fe   Tejun Heo   percpu: introduce...
1886
  				else
1170532bb   Joe Perches   mm: convert print...
1887
  					pr_cont("%s ", empty_str);
033e48fb8   Tejun Heo   percpu: move pcpu...
1888
  		}
033e48fb8   Tejun Heo   percpu: move pcpu...
1889
  	}
1170532bb   Joe Perches   mm: convert print...
1890
1891
  	pr_cont("
  ");
033e48fb8   Tejun Heo   percpu: move pcpu...
1892
  }
033e48fb8   Tejun Heo   percpu: move pcpu...
1893

fbf59bc9d   Tejun Heo   percpu: implement...
1894
  /**
8d408b4be   Tejun Heo   percpu: give more...
1895
   * pcpu_setup_first_chunk - initialize the first percpu chunk
fd1e8a1fe   Tejun Heo   percpu: introduce...
1896
   * @ai: pcpu_alloc_info describing how to percpu area is shaped
38a6be525   Tejun Heo   percpu: simplify ...
1897
   * @base_addr: mapped address
8d408b4be   Tejun Heo   percpu: give more...
1898
1899
1900
   *
   * Initialize the first percpu chunk which contains the kernel static
   * perpcu area.  This function is to be called from arch percpu area
38a6be525   Tejun Heo   percpu: simplify ...
1901
   * setup path.
8d408b4be   Tejun Heo   percpu: give more...
1902
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
1903
1904
1905
1906
1907
1908
   * @ai contains all information necessary to initialize the first
   * chunk and prime the dynamic percpu allocator.
   *
   * @ai->static_size is the size of static percpu area.
   *
   * @ai->reserved_size, if non-zero, specifies the amount of bytes to
edcb46399   Tejun Heo   percpu, module: i...
1909
1910
1911
1912
1913
1914
1915
   * reserve after the static area in the first chunk.  This reserves
   * the first chunk such that it's available only through reserved
   * percpu allocation.  This is primarily used to serve module percpu
   * static areas on architectures where the addressing model has
   * limited offset range for symbol relocations to guarantee module
   * percpu symbols fall inside the relocatable range.
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
1916
1917
1918
   * @ai->dyn_size determines the number of bytes available for dynamic
   * allocation in the first chunk.  The area between @ai->static_size +
   * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
6074d5b0a   Tejun Heo   percpu: more flex...
1919
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
1920
1921
1922
   * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
   * and equal to or larger than @ai->static_size + @ai->reserved_size +
   * @ai->dyn_size.
8d408b4be   Tejun Heo   percpu: give more...
1923
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
1924
1925
   * @ai->atom_size is the allocation atom size and used as alignment
   * for vm areas.
8d408b4be   Tejun Heo   percpu: give more...
1926
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
1927
1928
1929
1930
1931
1932
1933
1934
1935
   * @ai->alloc_size is the allocation size and always multiple of
   * @ai->atom_size.  This is larger than @ai->atom_size if
   * @ai->unit_size is larger than @ai->atom_size.
   *
   * @ai->nr_groups and @ai->groups describe virtual memory layout of
   * percpu areas.  Units which should be colocated are put into the
   * same group.  Dynamic VM areas will be allocated according to these
   * groupings.  If @ai->nr_groups is zero, a single group containing
   * all units is assumed.
8d408b4be   Tejun Heo   percpu: give more...
1936
   *
38a6be525   Tejun Heo   percpu: simplify ...
1937
1938
   * The caller should have mapped the first chunk at @base_addr and
   * copied static data to each unit.
fbf59bc9d   Tejun Heo   percpu: implement...
1939
   *
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1940
1941
1942
1943
1944
1945
1946
   * The first chunk will always contain a static and a dynamic region.
   * However, the static region is not managed by any chunk.  If the first
   * chunk also contains a reserved region, it is served by two chunks -
   * one for the reserved region and one for the dynamic region.  They
   * share the same vm, but use offset regions in the area allocation map.
   * The chunk serving the dynamic region is circulated in the chunk slots
   * and available for dynamic allocation like any other chunk.
edcb46399   Tejun Heo   percpu, module: i...
1947
   *
fbf59bc9d   Tejun Heo   percpu: implement...
1948
   * RETURNS:
fb435d523   Tejun Heo   percpu: add pcpu_...
1949
   * 0 on success, -errno on failure.
fbf59bc9d   Tejun Heo   percpu: implement...
1950
   */
fb435d523   Tejun Heo   percpu: add pcpu_...
1951
1952
  int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
  				  void *base_addr)
fbf59bc9d   Tejun Heo   percpu: implement...
1953
  {
b9c39442c   Dennis Zhou (Facebook)   percpu: setup_fir...
1954
  	size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
1955
  	size_t static_size, dyn_size;
0c4169c3d   Dennis Zhou (Facebook)   percpu: setup_fir...
1956
  	struct pcpu_chunk *chunk;
6563297ce   Tejun Heo   percpu: use group...
1957
1958
  	unsigned long *group_offsets;
  	size_t *group_sizes;
fb435d523   Tejun Heo   percpu: add pcpu_...
1959
  	unsigned long *unit_off;
fbf59bc9d   Tejun Heo   percpu: implement...
1960
  	unsigned int cpu;
fd1e8a1fe   Tejun Heo   percpu: introduce...
1961
1962
  	int *unit_map;
  	int group, unit, i;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1963
1964
  	int map_size;
  	unsigned long tmp_addr;
fbf59bc9d   Tejun Heo   percpu: implement...
1965

635b75fc1   Tejun Heo   percpu: make pcpu...
1966
1967
  #define PCPU_SETUP_BUG_ON(cond)	do {					\
  	if (unlikely(cond)) {						\
870d4b12a   Joe Perches   mm: percpu: use p...
1968
1969
1970
1971
  		pr_emerg("failed to initialize, %s
  ", #cond);		\
  		pr_emerg("cpu_possible_mask=%*pb
  ",			\
807de073b   Tejun Heo   percpu: use %*pb[...
1972
  			 cpumask_pr_args(cpu_possible_mask));		\
635b75fc1   Tejun Heo   percpu: make pcpu...
1973
1974
1975
1976
  		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
  		BUG();							\
  	}								\
  } while (0)
2f39e637e   Tejun Heo   percpu: allow non...
1977
  	/* sanity checks */
635b75fc1   Tejun Heo   percpu: make pcpu...
1978
  	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
bbddff054   Tejun Heo   percpu: use percp...
1979
  #ifdef CONFIG_SMP
635b75fc1   Tejun Heo   percpu: make pcpu...
1980
  	PCPU_SETUP_BUG_ON(!ai->static_size);
f09f1243c   Alexander Kuleshov   mm/percpu: use of...
1981
  	PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
bbddff054   Tejun Heo   percpu: use percp...
1982
  #endif
635b75fc1   Tejun Heo   percpu: make pcpu...
1983
  	PCPU_SETUP_BUG_ON(!base_addr);
f09f1243c   Alexander Kuleshov   mm/percpu: use of...
1984
  	PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
635b75fc1   Tejun Heo   percpu: make pcpu...
1985
  	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
f09f1243c   Alexander Kuleshov   mm/percpu: use of...
1986
  	PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
635b75fc1   Tejun Heo   percpu: make pcpu...
1987
  	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1988
  	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
099a19d91   Tejun Heo   percpu: allow lim...
1989
  	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
fb29a2cc6   Dennis Zhou (Facebook)   percpu: setup_fir...
1990
  	PCPU_SETUP_BUG_ON(!ai->dyn_size);
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
1991
  	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1992
1993
  	PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
  			    IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
9f6455325   Tejun Heo   percpu: move vmal...
1994
  	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
8d408b4be   Tejun Heo   percpu: give more...
1995

6563297ce   Tejun Heo   percpu: use group...
1996
  	/* process group information and build config tables accordingly */
999c17e3d   Santosh Shilimkar   mm/percpu.c: use ...
1997
1998
1999
2000
2001
2002
  	group_offsets = memblock_virt_alloc(ai->nr_groups *
  					     sizeof(group_offsets[0]), 0);
  	group_sizes = memblock_virt_alloc(ai->nr_groups *
  					   sizeof(group_sizes[0]), 0);
  	unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
  	unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
2f39e637e   Tejun Heo   percpu: allow non...
2003

fd1e8a1fe   Tejun Heo   percpu: introduce...
2004
  	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
ffe0d5a57   Tejun Heo   percpu: fix unit_...
2005
  		unit_map[cpu] = UINT_MAX;
a855b84c3   Tejun Heo   percpu: fix chunk...
2006
2007
2008
  
  	pcpu_low_unit_cpu = NR_CPUS;
  	pcpu_high_unit_cpu = NR_CPUS;
2f39e637e   Tejun Heo   percpu: allow non...
2009

fd1e8a1fe   Tejun Heo   percpu: introduce...
2010
2011
  	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
  		const struct pcpu_group_info *gi = &ai->groups[group];
2f39e637e   Tejun Heo   percpu: allow non...
2012

6563297ce   Tejun Heo   percpu: use group...
2013
2014
  		group_offsets[group] = gi->base_offset;
  		group_sizes[group] = gi->nr_units * ai->unit_size;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2015
2016
2017
2018
  		for (i = 0; i < gi->nr_units; i++) {
  			cpu = gi->cpu_map[i];
  			if (cpu == NR_CPUS)
  				continue;
8d408b4be   Tejun Heo   percpu: give more...
2019

9f295664e   Dan Carpenter   percpu: off by on...
2020
  			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
635b75fc1   Tejun Heo   percpu: make pcpu...
2021
2022
  			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
  			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
fbf59bc9d   Tejun Heo   percpu: implement...
2023

fd1e8a1fe   Tejun Heo   percpu: introduce...
2024
  			unit_map[cpu] = unit + i;
fb435d523   Tejun Heo   percpu: add pcpu_...
2025
  			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
a855b84c3   Tejun Heo   percpu: fix chunk...
2026
2027
2028
2029
2030
2031
2032
  			/* determine low/high unit_cpu */
  			if (pcpu_low_unit_cpu == NR_CPUS ||
  			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
  				pcpu_low_unit_cpu = cpu;
  			if (pcpu_high_unit_cpu == NR_CPUS ||
  			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
  				pcpu_high_unit_cpu = cpu;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2033
  		}
2f39e637e   Tejun Heo   percpu: allow non...
2034
  	}
fd1e8a1fe   Tejun Heo   percpu: introduce...
2035
2036
2037
  	pcpu_nr_units = unit;
  
  	for_each_possible_cpu(cpu)
635b75fc1   Tejun Heo   percpu: make pcpu...
2038
2039
2040
2041
  		PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
  
  	/* we're done parsing the input, undefine BUG macro and dump config */
  #undef PCPU_SETUP_BUG_ON
bcbea798f   Tejun Heo   percpu: print out...
2042
  	pcpu_dump_alloc_info(KERN_DEBUG, ai);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2043

6563297ce   Tejun Heo   percpu: use group...
2044
2045
2046
  	pcpu_nr_groups = ai->nr_groups;
  	pcpu_group_offsets = group_offsets;
  	pcpu_group_sizes = group_sizes;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2047
  	pcpu_unit_map = unit_map;
fb435d523   Tejun Heo   percpu: add pcpu_...
2048
  	pcpu_unit_offsets = unit_off;
2f39e637e   Tejun Heo   percpu: allow non...
2049
2050
  
  	/* determine basic parameters */
fd1e8a1fe   Tejun Heo   percpu: introduce...
2051
  	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
d9b55eeb1   Tejun Heo   percpu: remove un...
2052
  	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
6563297ce   Tejun Heo   percpu: use group...
2053
  	pcpu_atom_size = ai->atom_size;
ce3141a27   Tejun Heo   percpu: drop pcpu...
2054
2055
  	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
  		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
cafe8816b   Tejun Heo   percpu: use negat...
2056

30a5b5367   Dennis Zhou   percpu: expose st...
2057
  	pcpu_stats_save_ai(ai);
d9b55eeb1   Tejun Heo   percpu: remove un...
2058
2059
2060
2061
2062
  	/*
  	 * Allocate chunk slots.  The additional last slot is for
  	 * empty chunks.
  	 */
  	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
999c17e3d   Santosh Shilimkar   mm/percpu.c: use ...
2063
2064
  	pcpu_slot = memblock_virt_alloc(
  			pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
fbf59bc9d   Tejun Heo   percpu: implement...
2065
2066
  	for (i = 0; i < pcpu_nr_slots; i++)
  		INIT_LIST_HEAD(&pcpu_slot[i]);
edcb46399   Tejun Heo   percpu, module: i...
2067
  	/*
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
  	 * The end of the static region needs to be aligned with the
  	 * minimum allocation size as this offsets the reserved and
  	 * dynamic region.  The first chunk ends page aligned by
  	 * expanding the dynamic region, therefore the dynamic region
  	 * can be shrunk to compensate while still staying above the
  	 * configured sizes.
  	 */
  	static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
  	dyn_size = ai->dyn_size - (static_size - ai->static_size);
  
  	/*
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
2079
2080
2081
2082
2083
2084
  	 * Initialize first chunk.
  	 * If the reserved_size is non-zero, this initializes the reserved
  	 * chunk.  If the reserved_size is zero, the reserved chunk is NULL
  	 * and the dynamic region is initialized here.  The first chunk,
  	 * pcpu_first_chunk, will always point to the chunk that serves
  	 * the dynamic region.
edcb46399   Tejun Heo   percpu, module: i...
2085
  	 */
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2086
2087
  	tmp_addr = (unsigned long)base_addr + static_size;
  	map_size = ai->reserved_size ?: dyn_size;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
2088
  	chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
61ace7fa2   Tejun Heo   percpu: improve f...
2089

edcb46399   Tejun Heo   percpu, module: i...
2090
  	/* init dynamic chunk if necessary */
b9c39442c   Dennis Zhou (Facebook)   percpu: setup_fir...
2091
  	if (ai->reserved_size) {
0c4169c3d   Dennis Zhou (Facebook)   percpu: setup_fir...
2092
  		pcpu_reserved_chunk = chunk;
b9c39442c   Dennis Zhou (Facebook)   percpu: setup_fir...
2093

d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2094
  		tmp_addr = (unsigned long)base_addr + static_size +
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
2095
  			   ai->reserved_size;
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2096
  		map_size = dyn_size;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
2097
  		chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
edcb46399   Tejun Heo   percpu, module: i...
2098
  	}
2441d15c9   Tejun Heo   percpu: cosmetic ...
2099
  	/* link the first chunk in */
0c4169c3d   Dennis Zhou (Facebook)   percpu: setup_fir...
2100
  	pcpu_first_chunk = chunk;
0cecf50cf   Dennis Zhou (Facebook)   percpu: introduce...
2101
  	pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
ae9e6bc9f   Tejun Heo   percpu: don't put...
2102
  	pcpu_chunk_relocate(pcpu_first_chunk, -1);
fbf59bc9d   Tejun Heo   percpu: implement...
2103

30a5b5367   Dennis Zhou   percpu: expose st...
2104
  	pcpu_stats_chunk_alloc();
df95e795a   Dennis Zhou   percpu: add trace...
2105
  	trace_percpu_create_chunk(base_addr);
30a5b5367   Dennis Zhou   percpu: expose st...
2106

fbf59bc9d   Tejun Heo   percpu: implement...
2107
  	/* we're done */
bba174f5e   Tejun Heo   percpu: add chunk...
2108
  	pcpu_base_addr = base_addr;
fb435d523   Tejun Heo   percpu: add pcpu_...
2109
  	return 0;
fbf59bc9d   Tejun Heo   percpu: implement...
2110
  }
66c3a7577   Tejun Heo   percpu: generaliz...
2111

bbddff054   Tejun Heo   percpu: use percp...
2112
  #ifdef CONFIG_SMP
17f3609c2   Andi Kleen   sections: fix sec...
2113
  /* names of the first chunk allocators, indexed by the PCPU_FC_* values */
  const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
  	[PCPU_FC_AUTO]  = "auto",
  	[PCPU_FC_EMBED] = "embed",
  	[PCPU_FC_PAGE]  = "page",
  };
66c3a7577   Tejun Heo   percpu: generaliz...
2118

f58dc01ba   Tejun Heo   percpu: generaliz...
2119
  /* first chunk allocator to use; stays PCPU_FC_AUTO unless percpu_alloc_setup() changes it */
  enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
66c3a7577   Tejun Heo   percpu: generaliz...
2120

f58dc01ba   Tejun Heo   percpu: generaliz...
2121
2122
  static int __init percpu_alloc_setup(char *str)
  {
5479c78ac   Cyrill Gorcunov   mm, percpu: Make ...
2123
2124
  	if (!str)
  		return -EINVAL;
f58dc01ba   Tejun Heo   percpu: generaliz...
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
  	if (0)
  		/* nada */;
  #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
  	else if (!strcmp(str, "embed"))
  		pcpu_chosen_fc = PCPU_FC_EMBED;
  #endif
  #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
  	else if (!strcmp(str, "page"))
  		pcpu_chosen_fc = PCPU_FC_PAGE;
  #endif
f58dc01ba   Tejun Heo   percpu: generaliz...
2135
  	else
870d4b12a   Joe Perches   mm: percpu: use p...
2136
2137
  		pr_warn("unknown allocator %s specified
  ", str);
66c3a7577   Tejun Heo   percpu: generaliz...
2138

f58dc01ba   Tejun Heo   percpu: generaliz...
2139
  	return 0;
66c3a7577   Tejun Heo   percpu: generaliz...
2140
  }
f58dc01ba   Tejun Heo   percpu: generaliz...
2141
  early_param("percpu_alloc", percpu_alloc_setup);
66c3a7577   Tejun Heo   percpu: generaliz...
2142

3c9a024fd   Tejun Heo   percpu: fix build...
2143
2144
2145
2146
2147
  /*
   * pcpu_embed_first_chunk() is used by the generic percpu setup.
   * Build it if needed by the arch config or the generic setup is going
   * to be used.
   */
  #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
  	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
  #define BUILD_EMBED_FIRST_CHUNK
  #endif

  /* build pcpu_page_first_chunk() iff needed by the arch config */
  #if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
  #define BUILD_PAGE_FIRST_CHUNK
  #endif
  
  /* pcpu_build_alloc_info() is used by both embed and page first chunk */
  #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
  /**
   * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
   * @reserved_size: the size of reserved percpu area in bytes
   * @dyn_size: minimum free size for dynamic allocation in bytes
   * @atom_size: allocation atom size
   * @cpu_distance_fn: callback to determine distance between cpus, optional
   *
   * This function determines grouping of units, their mappings to cpus
   * and other parameters considering needed percpu size, allocation
   * atom size and distances between CPUs.
   *
bffc43758   Yannick Guerrini   percpu: Fix trivi...
2171
   * Groups are always multiples of atom size and CPUs which are of
3c9a024fd   Tejun Heo   percpu: fix build...
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
   * LOCAL_DISTANCE both ways are grouped together and share space for
   * units in the same group.  The returned configuration is guaranteed
   * to have CPUs on different nodes on different groups and >=75% usage
   * of allocated virtual address space.
   *
   * RETURNS:
   * On success, pointer to the new allocation_info is returned.  On
   * failure, ERR_PTR value is returned.
   */
  static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
  				size_t reserved_size, size_t dyn_size,
  				size_t atom_size,
  				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
  {
  	/* per-cpu group index and per-group cpu count, both sized NR_CPUS */
  	static int group_map[NR_CPUS] __initdata;
  	static int group_cnt[NR_CPUS] __initdata;
  	const size_t static_size = __per_cpu_end - __per_cpu_start;
  	int nr_groups = 1, nr_units = 0;
  	size_t size_sum, min_unit_size, alloc_size;
  	int upa, max_upa, best_upa = 0;	/* units_per_alloc */
  	int last_allocs, group, unit;
  	unsigned int cpu, tcpu;
  	struct pcpu_alloc_info *ai;
  	unsigned int *cpu_map;
  
  	/* this function may be called multiple times */
  	memset(group_map, 0, sizeof(group_map));
  	memset(group_cnt, 0, sizeof(group_cnt));
  
  	/* calculate size_sum and ensure dyn_size is enough for early alloc */
  	size_sum = PFN_ALIGN(static_size + reserved_size +
  			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
  	dyn_size = size_sum - static_size - reserved_size;
  
  	/*
  	 * Determine min_unit_size, alloc_size and max_upa such that
  	 * alloc_size is multiple of atom_size and is the smallest
  	 * which can accommodate 4k aligned segments which are equal to
  	 * or larger than min_unit_size.
  	 */
  	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
  
  	/* determine the maximum # of units that can fit in an allocation */
  	alloc_size = roundup(min_unit_size, atom_size);
  	upa = alloc_size / min_unit_size;
  	while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
  		upa--;
  	max_upa = upa;
  
  	/*
  	 * Group cpus according to their proximity.  Each cpu starts out
  	 * as a candidate for group 0 and is pushed to the next group
  	 * whenever an earlier cpu already in the candidate group is
  	 * further than LOCAL_DISTANCE in either direction.  Without a
  	 * distance callback every cpu lands in group 0.
  	 */
  	for_each_possible_cpu(cpu) {
  		group = 0;
  	next_group:
  		for_each_possible_cpu(tcpu) {
  			/* only cpus placed before this one are compared */
  			if (cpu == tcpu)
  				break;
  			if (group_map[tcpu] == group && cpu_distance_fn &&
  			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
  			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
  				group++;
  				nr_groups = max(nr_groups, group + 1);
  				goto next_group;
  			}
  		}
  		group_map[cpu] = group;
  		group_cnt[group]++;
  	}
  
  	/*
  	 * Wasted space is caused by a ratio imbalance of upa to group_cnt.
  	 * Expand the unit_size until we use >= 75% of the units allocated.
  	 * Related to atom_size, which could be much larger than the unit_size.
  	 */
  	last_allocs = INT_MAX;
  	for (upa = max_upa; upa; upa--) {
  		int allocs = 0, wasted = 0;
  
  		/* unit size must divide alloc_size and stay page aligned */
  		if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
  			continue;
  
  		for (group = 0; group < nr_groups; group++) {
  			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
  			allocs += this_allocs;
  			wasted += this_allocs * upa - group_cnt[group];
  		}
  
  		/*
  		 * Don't accept if wastage is over 1/3.  The
  		 * greater-than comparison ensures upa==1 always
  		 * passes the following check.
  		 */
  		if (wasted > num_possible_cpus() / 3)
  			continue;
  
  		/* and then don't consume more memory */
  		if (allocs > last_allocs)
  			break;
  		last_allocs = allocs;
  		best_upa = upa;
  	}
  	/*
  	 * A zero best_upa would be used as a divisor below; fail loudly
  	 * here instead of relying on an uninitialized value.
  	 */
  	BUG_ON(!best_upa);
  	upa = best_upa;
  
  	/* allocate and fill alloc_info */
  	for (group = 0; group < nr_groups; group++)
  		nr_units += roundup(group_cnt[group], upa);
  
  	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
  	if (!ai)
  		return ERR_PTR(-ENOMEM);
  	cpu_map = ai->groups[0].cpu_map;
  
  	/* carve the single cpu_map array into one slice per group */
  	for (group = 0; group < nr_groups; group++) {
  		ai->groups[group].cpu_map = cpu_map;
  		cpu_map += roundup(group_cnt[group], upa);
  	}
  
  	ai->static_size = static_size;
  	ai->reserved_size = reserved_size;
  	ai->dyn_size = dyn_size;
  	ai->unit_size = alloc_size / upa;
  	ai->atom_size = atom_size;
  	ai->alloc_size = alloc_size;
  
  	/*
  	 * Iterate by nr_groups rather than until group_cnt[group] is
  	 * zero: group_cnt[] has only NR_CPUS entries, so when every
  	 * entry is in use there is no zero terminator inside the array
  	 * and the old condition would read past its end.
  	 */
  	for (group = 0, unit = 0; group < nr_groups; group++) {
  		struct pcpu_group_info *gi = &ai->groups[group];
  
  		/*
  		 * Initialize base_offset as if all groups are located
  		 * back-to-back.  The caller should update this to
  		 * reflect actual allocation.
  		 */
  		gi->base_offset = unit * ai->unit_size;
  
  		/*
  		 * NOTE(review): assumes pcpu_alloc_alloc_info() returns
  		 * groups with nr_units == 0 and unused cpu_map slots
  		 * already marked (callers test for NR_CPUS) - confirm.
  		 */
  		for_each_possible_cpu(cpu)
  			if (group_map[cpu] == group)
  				gi->cpu_map[gi->nr_units++] = cpu;
  		gi->nr_units = roundup(gi->nr_units, upa);
  		unit += gi->nr_units;
  	}
  	BUG_ON(unit != nr_units);
  
  	return ai;
  }
  #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
  
  #if defined(BUILD_EMBED_FIRST_CHUNK)
66c3a7577   Tejun Heo   percpu: generaliz...
2316
2317
  /**
   * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
66c3a7577   Tejun Heo   percpu: generaliz...
2318
   * @reserved_size: the size of reserved percpu area in bytes
4ba6ce250   Tejun Heo   percpu: make @dyn...
2319
   * @dyn_size: minimum free size for dynamic allocation in bytes
c8826dd53   Tejun Heo   percpu: update em...
2320
2321
2322
   * @atom_size: allocation atom size
   * @cpu_distance_fn: callback to determine distance between cpus, optional
   * @alloc_fn: function to allocate percpu page
25985edce   Lucas De Marchi   Fix common misspe...
2323
   * @free_fn: function to free percpu page
66c3a7577   Tejun Heo   percpu: generaliz...
2324
2325
2326
2327
2328
   *
   * This is a helper to ease setting up embedded first percpu chunk and
   * can be called where pcpu_setup_first_chunk() is expected.
   *
   * If this function is used to setup the first chunk, it is allocated
c8826dd53   Tejun Heo   percpu: update em...
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
   * by calling @alloc_fn and used as-is without being mapped into
   * vmalloc area.  Allocations are always whole multiples of @atom_size
   * aligned to @atom_size.
   *
   * This enables the first chunk to piggy back on the linear physical
   * mapping which often uses larger page size.  Please note that this
   * can result in very sparse cpu->unit mapping on NUMA machines thus
   * requiring large vmalloc address space.  Don't use this allocator if
   * vmalloc space is not orders of magnitude larger than distances
   * between node memory addresses (ie. 32bit NUMA machines).
66c3a7577   Tejun Heo   percpu: generaliz...
2339
   *
4ba6ce250   Tejun Heo   percpu: make @dyn...
2340
   * @dyn_size specifies the minimum dynamic area size.
66c3a7577   Tejun Heo   percpu: generaliz...
2341
2342
   *
   * If the needed size is smaller than the minimum or specified unit
c8826dd53   Tejun Heo   percpu: update em...
2343
   * size, the leftover is returned using @free_fn.
66c3a7577   Tejun Heo   percpu: generaliz...
2344
2345
   *
   * RETURNS:
fb435d523   Tejun Heo   percpu: add pcpu_...
2346
   * 0 on success, -errno on failure.
66c3a7577   Tejun Heo   percpu: generaliz...
2347
   */
4ba6ce250   Tejun Heo   percpu: make @dyn...
2348
  int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
c8826dd53   Tejun Heo   percpu: update em...
2349
2350
2351
2352
  				  size_t atom_size,
  				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
  				  pcpu_fc_alloc_fn_t alloc_fn,
  				  pcpu_fc_free_fn_t free_fn)
66c3a7577   Tejun Heo   percpu: generaliz...
2353
  {
c8826dd53   Tejun Heo   percpu: update em...
2354
2355
  	void *base = (void *)ULONG_MAX;
  	void **areas = NULL;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2356
  	struct pcpu_alloc_info *ai;
93c76b6b2   zijun_hu   mm/percpu.c: corr...
2357
2358
  	size_t size_sum, areas_size;
  	unsigned long max_distance;
9b7396624   zijun_hu   mm/percpu.c: fix ...
2359
  	int group, i, highest_group, rc;
66c3a7577   Tejun Heo   percpu: generaliz...
2360

c8826dd53   Tejun Heo   percpu: update em...
2361
2362
  	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
  				   cpu_distance_fn);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2363
2364
  	if (IS_ERR(ai))
  		return PTR_ERR(ai);
66c3a7577   Tejun Heo   percpu: generaliz...
2365

fd1e8a1fe   Tejun Heo   percpu: introduce...
2366
  	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
c8826dd53   Tejun Heo   percpu: update em...
2367
  	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
fa8a7094b   Tejun Heo   x86: implement pe...
2368

999c17e3d   Santosh Shilimkar   mm/percpu.c: use ...
2369
  	areas = memblock_virt_alloc_nopanic(areas_size, 0);
c8826dd53   Tejun Heo   percpu: update em...
2370
  	if (!areas) {
fb435d523   Tejun Heo   percpu: add pcpu_...
2371
  		rc = -ENOMEM;
c8826dd53   Tejun Heo   percpu: update em...
2372
  		goto out_free;
fa8a7094b   Tejun Heo   x86: implement pe...
2373
  	}
66c3a7577   Tejun Heo   percpu: generaliz...
2374

9b7396624   zijun_hu   mm/percpu.c: fix ...
2375
2376
  	/* allocate, copy and determine base address & max_distance */
  	highest_group = 0;
c8826dd53   Tejun Heo   percpu: update em...
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
  	for (group = 0; group < ai->nr_groups; group++) {
  		struct pcpu_group_info *gi = &ai->groups[group];
  		unsigned int cpu = NR_CPUS;
  		void *ptr;
  
  		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
  			cpu = gi->cpu_map[i];
  		BUG_ON(cpu == NR_CPUS);
  
  		/* allocate space for the whole group */
  		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
  		if (!ptr) {
  			rc = -ENOMEM;
  			goto out_free_areas;
  		}
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
2392
2393
  		/* kmemleak tracks the percpu allocations separately */
  		kmemleak_free(ptr);
c8826dd53   Tejun Heo   percpu: update em...
2394
  		areas[group] = ptr;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2395

c8826dd53   Tejun Heo   percpu: update em...
2396
  		base = min(ptr, base);
9b7396624   zijun_hu   mm/percpu.c: fix ...
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
  		if (ptr > areas[highest_group])
  			highest_group = group;
  	}
  	max_distance = areas[highest_group] - base;
  	max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
  
  	/* warn if maximum distance is further than 75% of vmalloc space */
  	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
  		pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx
  ",
  				max_distance, VMALLOC_TOTAL);
  #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
  		/* and fail if we have fallback */
  		rc = -EINVAL;
  		goto out_free_areas;
  #endif
42b642814   Tejun Heo   percpu: pcpu_embe...
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
  	}
  
  	/*
  	 * Copy data and free unused parts.  This should happen after all
  	 * allocations are complete; otherwise, we may end up with
  	 * overlapping groups.
  	 */
  	for (group = 0; group < ai->nr_groups; group++) {
  		struct pcpu_group_info *gi = &ai->groups[group];
  		void *ptr = areas[group];
c8826dd53   Tejun Heo   percpu: update em...
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
  
  		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
  			if (gi->cpu_map[i] == NR_CPUS) {
  				/* unused unit, free whole */
  				free_fn(ptr, ai->unit_size);
  				continue;
  			}
  			/* copy and return the unused part */
  			memcpy(ptr, __per_cpu_load, ai->static_size);
  			free_fn(ptr + size_sum, ai->unit_size - size_sum);
  		}
fa8a7094b   Tejun Heo   x86: implement pe...
2434
  	}
66c3a7577   Tejun Heo   percpu: generaliz...
2435

c8826dd53   Tejun Heo   percpu: update em...
2436
  	/* base address is now known, determine group base offsets */
6ea529a20   Tejun Heo   percpu: make embe...
2437
  	for (group = 0; group < ai->nr_groups; group++) {
c8826dd53   Tejun Heo   percpu: update em...
2438
  		ai->groups[group].base_offset = areas[group] - base;
6ea529a20   Tejun Heo   percpu: make embe...
2439
  	}
c8826dd53   Tejun Heo   percpu: update em...
2440

870d4b12a   Joe Perches   mm: percpu: use p...
2441
2442
  	pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu
  ",
fd1e8a1fe   Tejun Heo   percpu: introduce...
2443
2444
  		PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
  		ai->dyn_size, ai->unit_size);
d4b95f803   Tejun Heo   x86,percpu: gener...
2445

fb435d523   Tejun Heo   percpu: add pcpu_...
2446
  	rc = pcpu_setup_first_chunk(ai, base);
c8826dd53   Tejun Heo   percpu: update em...
2447
2448
2449
2450
  	goto out_free;
  
  out_free_areas:
  	for (group = 0; group < ai->nr_groups; group++)
f851c8d85   Michael Holzheu   percpu: fix bootm...
2451
2452
2453
  		if (areas[group])
  			free_fn(areas[group],
  				ai->groups[group].nr_units * ai->unit_size);
c8826dd53   Tejun Heo   percpu: update em...
2454
  out_free:
fd1e8a1fe   Tejun Heo   percpu: introduce...
2455
  	pcpu_free_alloc_info(ai);
c8826dd53   Tejun Heo   percpu: update em...
2456
  	if (areas)
999c17e3d   Santosh Shilimkar   mm/percpu.c: use ...
2457
  		memblock_free_early(__pa(areas), areas_size);
fb435d523   Tejun Heo   percpu: add pcpu_...
2458
  	return rc;
d4b95f803   Tejun Heo   x86,percpu: gener...
2459
  }
3c9a024fd   Tejun Heo   percpu: fix build...
2460
  #endif /* BUILD_EMBED_FIRST_CHUNK */
d4b95f803   Tejun Heo   x86,percpu: gener...
2461

3c9a024fd   Tejun Heo   percpu: fix build...
2462
  #ifdef BUILD_PAGE_FIRST_CHUNK
d4b95f803   Tejun Heo   x86,percpu: gener...
2463
  /**
00ae4064b   Tejun Heo   percpu: rename 4k...
2464
   * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
d4b95f803   Tejun Heo   x86,percpu: gener...
2465
2466
   * @reserved_size: the size of reserved percpu area in bytes
   * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
25985edce   Lucas De Marchi   Fix common misspe...
2467
   * @free_fn: function to free percpu page, always called with PAGE_SIZE
d4b95f803   Tejun Heo   x86,percpu: gener...
2468
2469
   * @populate_pte_fn: function to populate pte
   *
00ae4064b   Tejun Heo   percpu: rename 4k...
2470
2471
   * This is a helper to ease setting up page-remapped first percpu
   * chunk and can be called where pcpu_setup_first_chunk() is expected.
d4b95f803   Tejun Heo   x86,percpu: gener...
2472
2473
2474
2475
2476
   *
   * This is the basic allocator.  Static percpu area is allocated
   * page-by-page into vmalloc area.
   *
   * RETURNS:
fb435d523   Tejun Heo   percpu: add pcpu_...
2477
   * 0 on success, -errno on failure.
d4b95f803   Tejun Heo   x86,percpu: gener...
2478
   */
fb435d523   Tejun Heo   percpu: add pcpu_...
2479
2480
2481
2482
  int __init pcpu_page_first_chunk(size_t reserved_size,
  				 pcpu_fc_alloc_fn_t alloc_fn,
  				 pcpu_fc_free_fn_t free_fn,
  				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
d4b95f803   Tejun Heo   x86,percpu: gener...
2483
  {
8f05a6a65   Tejun Heo   percpu: make 4k f...
2484
  	static struct vm_struct vm;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2485
  	struct pcpu_alloc_info *ai;
00ae4064b   Tejun Heo   percpu: rename 4k...
2486
  	char psize_str[16];
ce3141a27   Tejun Heo   percpu: drop pcpu...
2487
  	int unit_pages;
d4b95f803   Tejun Heo   x86,percpu: gener...
2488
  	size_t pages_size;
ce3141a27   Tejun Heo   percpu: drop pcpu...
2489
  	struct page **pages;
fb435d523   Tejun Heo   percpu: add pcpu_...
2490
  	int unit, i, j, rc;
8f6066049   zijun_hu   mm/percpu.c: fix ...
2491
2492
  	int upa;
  	int nr_g0_units;
d4b95f803   Tejun Heo   x86,percpu: gener...
2493

00ae4064b   Tejun Heo   percpu: rename 4k...
2494
  	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
4ba6ce250   Tejun Heo   percpu: make @dyn...
2495
  	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2496
2497
2498
  	if (IS_ERR(ai))
  		return PTR_ERR(ai);
  	BUG_ON(ai->nr_groups != 1);
8f6066049   zijun_hu   mm/percpu.c: fix ...
2499
2500
2501
2502
2503
2504
  	upa = ai->alloc_size/ai->unit_size;
  	nr_g0_units = roundup(num_possible_cpus(), upa);
  	if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) {
  		pcpu_free_alloc_info(ai);
  		return -EINVAL;
  	}
fd1e8a1fe   Tejun Heo   percpu: introduce...
2505
2506
  
  	unit_pages = ai->unit_size >> PAGE_SHIFT;
d4b95f803   Tejun Heo   x86,percpu: gener...
2507
2508
  
  	/* unaligned allocations can't be freed, round up to page size */
fd1e8a1fe   Tejun Heo   percpu: introduce...
2509
2510
  	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
  			       sizeof(pages[0]));
999c17e3d   Santosh Shilimkar   mm/percpu.c: use ...
2511
  	pages = memblock_virt_alloc(pages_size, 0);
d4b95f803   Tejun Heo   x86,percpu: gener...
2512

8f05a6a65   Tejun Heo   percpu: make 4k f...
2513
  	/* allocate pages */
d4b95f803   Tejun Heo   x86,percpu: gener...
2514
  	j = 0;
8f6066049   zijun_hu   mm/percpu.c: fix ...
2515
2516
  	for (unit = 0; unit < num_possible_cpus(); unit++) {
  		unsigned int cpu = ai->groups[0].cpu_map[unit];
ce3141a27   Tejun Heo   percpu: drop pcpu...
2517
  		for (i = 0; i < unit_pages; i++) {
d4b95f803   Tejun Heo   x86,percpu: gener...
2518
  			void *ptr;
3cbc85652   Tejun Heo   percpu: add @alig...
2519
  			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
d4b95f803   Tejun Heo   x86,percpu: gener...
2520
  			if (!ptr) {
870d4b12a   Joe Perches   mm: percpu: use p...
2521
2522
  				pr_warn("failed to allocate %s page for cpu%u
  ",
8f6066049   zijun_hu   mm/percpu.c: fix ...
2523
  						psize_str, cpu);
d4b95f803   Tejun Heo   x86,percpu: gener...
2524
2525
  				goto enomem;
  			}
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
2526
2527
  			/* kmemleak tracks the percpu allocations separately */
  			kmemleak_free(ptr);
ce3141a27   Tejun Heo   percpu: drop pcpu...
2528
  			pages[j++] = virt_to_page(ptr);
d4b95f803   Tejun Heo   x86,percpu: gener...
2529
  		}
8f6066049   zijun_hu   mm/percpu.c: fix ...
2530
  	}
d4b95f803   Tejun Heo   x86,percpu: gener...
2531

8f05a6a65   Tejun Heo   percpu: make 4k f...
2532
2533
  	/* allocate vm area, map the pages and copy static data */
  	vm.flags = VM_ALLOC;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2534
  	vm.size = num_possible_cpus() * ai->unit_size;
8f05a6a65   Tejun Heo   percpu: make 4k f...
2535
  	vm_area_register_early(&vm, PAGE_SIZE);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2536
  	for (unit = 0; unit < num_possible_cpus(); unit++) {
1d9d32572   Tejun Heo   percpu: make @dyn...
2537
  		unsigned long unit_addr =
fd1e8a1fe   Tejun Heo   percpu: introduce...
2538
  			(unsigned long)vm.addr + unit * ai->unit_size;
8f05a6a65   Tejun Heo   percpu: make 4k f...
2539

ce3141a27   Tejun Heo   percpu: drop pcpu...
2540
  		for (i = 0; i < unit_pages; i++)
8f05a6a65   Tejun Heo   percpu: make 4k f...
2541
2542
2543
  			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
  
  		/* pte already populated, the following shouldn't fail */
fb435d523   Tejun Heo   percpu: add pcpu_...
2544
2545
2546
2547
2548
  		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
  				      unit_pages);
  		if (rc < 0)
  			panic("failed to map percpu area, err=%d
  ", rc);
66c3a7577   Tejun Heo   percpu: generaliz...
2549

8f05a6a65   Tejun Heo   percpu: make 4k f...
2550
2551
2552
2553
2554
2555
2556
2557
2558
  		/*
  		 * FIXME: Archs with virtual cache should flush local
  		 * cache for the linear mapping here - something
  		 * equivalent to flush_cache_vmap() on the local cpu.
  		 * flush_cache_vmap() can't be used as most supporting
  		 * data structures are not set up yet.
  		 */
  
  		/* copy static data */
fd1e8a1fe   Tejun Heo   percpu: introduce...
2559
  		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
66c3a7577   Tejun Heo   percpu: generaliz...
2560
2561
2562
  	}
  
  	/* we're ready, commit */
870d4b12a   Joe Perches   mm: percpu: use p...
2563
2564
  	pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu
  ",
fd1e8a1fe   Tejun Heo   percpu: introduce...
2565
2566
  		unit_pages, psize_str, vm.addr, ai->static_size,
  		ai->reserved_size, ai->dyn_size);
d4b95f803   Tejun Heo   x86,percpu: gener...
2567

fb435d523   Tejun Heo   percpu: add pcpu_...
2568
  	rc = pcpu_setup_first_chunk(ai, vm.addr);
d4b95f803   Tejun Heo   x86,percpu: gener...
2569
2570
2571
2572
  	goto out_free_ar;
  
  enomem:
  	while (--j >= 0)
ce3141a27   Tejun Heo   percpu: drop pcpu...
2573
  		free_fn(page_address(pages[j]), PAGE_SIZE);
fb435d523   Tejun Heo   percpu: add pcpu_...
2574
  	rc = -ENOMEM;
d4b95f803   Tejun Heo   x86,percpu: gener...
2575
  out_free_ar:
999c17e3d   Santosh Shilimkar   mm/percpu.c: use ...
2576
  	memblock_free_early(__pa(pages), pages_size);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2577
  	pcpu_free_alloc_info(ai);
fb435d523   Tejun Heo   percpu: add pcpu_...
2578
  	return rc;
d4b95f803   Tejun Heo   x86,percpu: gener...
2579
  }
3c9a024fd   Tejun Heo   percpu: fix build...
2580
  #endif /* BUILD_PAGE_FIRST_CHUNK */
d4b95f803   Tejun Heo   x86,percpu: gener...
2581

bbddff054   Tejun Heo   percpu: use percp...
2582
  #ifndef	CONFIG_HAVE_SETUP_PER_CPU_AREA
8c4bfc6e8   Tejun Heo   x86,percpu: gener...
2583
  /*
bbddff054   Tejun Heo   percpu: use percp...
2584
   * Generic SMP percpu area setup.
e74e39620   Tejun Heo   percpu: use dynam...
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
   *
   * The embedding helper is used because its behavior closely resembles
   * the original non-dynamic generic percpu area setup.  This is
   * important because many archs have addressing restrictions and might
   * fail if the percpu area is located far away from the previous
   * location.  As an added bonus, in non-NUMA cases, embedding is
   * generally a good idea TLB-wise because percpu area can piggy back
   * on the physical linear memory mapping which uses large page
   * mappings on applicable archs.
   */
e74e39620   Tejun Heo   percpu: use dynam...
2595
2596
  /*
   * Per-cpu offset table, indexed by cpu number.  The generic
   * setup_per_cpu_areas() in this file stores one entry for each
   * possible cpu.
   */
  unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
  EXPORT_SYMBOL(__per_cpu_offset);
c8826dd53   Tejun Heo   percpu: update em...
2597
2598
2599
  /*
   * Default first-chunk allocation callback: grab @size bytes aligned to
   * @align from memblock, searching from MAX_DMA_ADDRESS.  @cpu is not
   * used.  Returns whatever the memblock allocator returns.
   */
  static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
  				       size_t align)
  {
  	void *area;
  
  	area = memblock_virt_alloc_from_nopanic(size, align,
  						__pa(MAX_DMA_ADDRESS));
  	return area;
  }
66c3a7577   Tejun Heo   percpu: generaliz...
2603

c8826dd53   Tejun Heo   percpu: update em...
2604
2605
  /*
   * Default first-chunk free callback: return @size bytes starting at
   * virtual address @ptr to memblock.
   */
  static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
  {
  	/* memblock_free_early() takes a physical address */
  	memblock_free_early(__pa(ptr), size);
  }
e74e39620   Tejun Heo   percpu: use dynam...
2608
2609
  void __init setup_per_cpu_areas(void)
  {
  	unsigned long base_delta;
  	unsigned int cpu;
  	int err;
  
  	/*
  	 * The module percpu reserve is always requested, matching what
  	 * the legacy allocator did.
  	 */
  	err = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
  				     PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
  				     pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
  	if (err < 0)
  		panic("Failed to initialize percpu areas.");
  
  	/* distance from the static percpu section to the first chunk */
  	base_delta = (unsigned long)pcpu_base_addr -
  		     (unsigned long)__per_cpu_start;
  
  	for_each_possible_cpu(cpu) {
  		__per_cpu_offset[cpu] = base_delta + pcpu_unit_offsets[cpu];
  	}
  }
bbddff054   Tejun Heo   percpu: use percp...
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
  #endif	/* CONFIG_HAVE_SETUP_PER_CPU_AREA */
  
  #else	/* CONFIG_SMP */
  
  /*
   * UP percpu area setup.
   *
   * UP always uses km-based percpu allocator with identity mapping.
   * Static percpu variables are indistinguishable from the usual static
   * variables and don't require any special preparation.
   *
   * A single group with a single unit mapped to cpu 0 is described and
   * handed to pcpu_setup_first_chunk().  Any failure panics.
   */
  void __init setup_per_cpu_areas(void)
  {
  	const size_t unit_size =
  		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
  					 PERCPU_DYNAMIC_RESERVE));
  	struct pcpu_alloc_info *ai;
  	void *fc;
  
  	ai = pcpu_alloc_alloc_info(1, 1);
  	fc = memblock_virt_alloc_from_nopanic(unit_size,
  					      PAGE_SIZE,
  					      __pa(MAX_DMA_ADDRESS));
  	if (!ai || !fc)
  		panic("Failed to allocate memory for percpu areas.");
  	/* kmemleak tracks the percpu allocations separately */
  	kmemleak_free(fc);
  
  	/* the whole unit is dynamic area; static/reserved sizes are not set */
  	ai->dyn_size = unit_size;
  	ai->unit_size = unit_size;
  	ai->atom_size = unit_size;
  	ai->alloc_size = unit_size;
  	ai->groups[0].nr_units = 1;
  	ai->groups[0].cpu_map[0] = 0;
  
  	if (pcpu_setup_first_chunk(ai, fc) < 0)
  		panic("Failed to initialize percpu areas.");
  
  	/*
  	 * The alloc_info is only a temporary description; release it the
  	 * same way the SMP first-chunk helpers do after setup.
  	 */
  	pcpu_free_alloc_info(ai);
  }
  
  #endif	/* CONFIG_SMP */
099a19d91   Tejun Heo   percpu: allow lim...
2668
2669
  
  /*
   * The percpu allocator is brought up early in boot, before either slab
   * or workqueue is available.  Async management therefore stays plugged
   * until this initcall runs and enables it.
   */
  static int __init percpu_enable_async(void)
  {
  	/* from here on async management is allowed */
  	pcpu_async_enabled = true;
  
  	return 0;
  }
  subsys_initcall(percpu_enable_async);