Blame view

mm/percpu.c 84.6 KB
fbf59bc9d   Tejun Heo   percpu: implement...
1
  /*
88999a898   Tejun Heo   percpu: misc prep...
2
   * mm/percpu.c - percpu memory allocator
fbf59bc9d   Tejun Heo   percpu: implement...
3
4
5
6
   *
   * Copyright (C) 2009		SUSE Linux Products GmbH
   * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
   *
5e81ee3e6   Dennis Zhou (Facebook)   percpu: update he...
7
8
9
   * Copyright (C) 2017		Facebook Inc.
   * Copyright (C) 2017		Dennis Zhou <dennisszhou@gmail.com>
   *
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
10
   * This file is released under the GPLv2 license.
fbf59bc9d   Tejun Heo   percpu: implement...
11
   *
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
12
13
14
15
   * The percpu allocator handles both static and dynamic areas.  Percpu
   * areas are allocated in chunks which are divided into units.  There is
   * a 1-to-1 mapping for units to possible cpus.  These units are grouped
   * based on NUMA properties of the machine.
fbf59bc9d   Tejun Heo   percpu: implement...
16
17
18
19
20
21
   *
   *  c0                           c1                         c2
   *  -------------------          -------------------        ------------
   * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
   *  -------------------  ......  -------------------  ....  ------------
   *
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
22
23
24
25
26
27
28
29
30
   * Allocation is done by offsets into a unit's address space.  I.e., an
   * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
   * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
   * and even sparse.  Access is handled by configuring percpu base
   * registers according to the cpu to unit mappings and offsetting the
   * base address using pcpu_unit_size.
   *
   * There is special consideration for the first chunk which must handle
   * the static percpu variables in the kernel image as allocation services
5e81ee3e6   Dennis Zhou (Facebook)   percpu: update he...
31
   * are not online yet.  In short, the first chunk is structured like so:
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
32
33
34
35
36
37
38
   *
   *                  <Static | [Reserved] | Dynamic>
   *
   * The static data is copied from the original section managed by the
   * linker.  The reserved section, if non-zero, primarily manages static
   * percpu variables from kernel modules.  Finally, the dynamic section
   * takes care of normal allocations.
fbf59bc9d   Tejun Heo   percpu: implement...
39
   *
5e81ee3e6   Dennis Zhou (Facebook)   percpu: update he...
40
41
42
43
44
45
46
47
48
49
50
51
52
53
   * The allocator organizes chunks into lists according to free size and
   * tries to allocate from the fullest chunk first.  Each chunk is managed
   * by a bitmap with metadata blocks.  The allocation map is updated on
   * every allocation and free to reflect the current state while the boundary
   * map is only updated on allocation.  Each metadata block contains
   * information to help mitigate the need to iterate over large portions
   * of the bitmap.  The reverse mapping from page to chunk is stored in
   * the page's index.  Lastly, units are lazily backed and grow in unison.
   *
   * There is a unique conversion that goes on here between bytes and bits.
   * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
   * tracks the number of pages it is responsible for in nr_pages.  Helper
   * functions are used to convert between bytes, bits, and blocks.
   * All hints are managed in bits unless explicitly stated.
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
54
   *
4091fb95b   Masahiro Yamada   scripts/spelling....
55
   * To use this allocator, arch code should do the following:
fbf59bc9d   Tejun Heo   percpu: implement...
56
   *
fbf59bc9d   Tejun Heo   percpu: implement...
57
   * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
e01009833   Tejun Heo   percpu: make x86 ...
58
59
   *   regular address to percpu pointer and back if they need to be
   *   different from the default
fbf59bc9d   Tejun Heo   percpu: implement...
60
   *
8d408b4be   Tejun Heo   percpu: give more...
61
62
   * - use pcpu_setup_first_chunk() during percpu area initialization to
   *   setup the first chunk containing the kernel static percpu area
fbf59bc9d   Tejun Heo   percpu: implement...
63
   */
870d4b12a   Joe Perches   mm: percpu: use p...
64
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
fbf59bc9d   Tejun Heo   percpu: implement...
65
66
  #include <linux/bitmap.h>
  #include <linux/bootmem.h>
fd1e8a1fe   Tejun Heo   percpu: introduce...
67
  #include <linux/err.h>
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
68
  #include <linux/lcm.h>
fbf59bc9d   Tejun Heo   percpu: implement...
69
  #include <linux/list.h>
a530b7958   Tejun Heo   percpu: teach lar...
70
  #include <linux/log2.h>
fbf59bc9d   Tejun Heo   percpu: implement...
71
72
73
74
75
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/mutex.h>
  #include <linux/percpu.h>
  #include <linux/pfn.h>
fbf59bc9d   Tejun Heo   percpu: implement...
76
  #include <linux/slab.h>
ccea34b5d   Tejun Heo   percpu: finer gra...
77
  #include <linux/spinlock.h>
fbf59bc9d   Tejun Heo   percpu: implement...
78
  #include <linux/vmalloc.h>
a56dbddf0   Tejun Heo   percpu: move full...
79
  #include <linux/workqueue.h>
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
80
  #include <linux/kmemleak.h>
71546d100   Tejun Heo   percpu: include l...
81
  #include <linux/sched.h>
fbf59bc9d   Tejun Heo   percpu: implement...
82
83
  
  #include <asm/cacheflush.h>
e01009833   Tejun Heo   percpu: make x86 ...
84
  #include <asm/sections.h>
fbf59bc9d   Tejun Heo   percpu: implement...
85
  #include <asm/tlbflush.h>
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
86
  #include <asm/io.h>
fbf59bc9d   Tejun Heo   percpu: implement...
87

df95e795a   Dennis Zhou   percpu: add trace...
88
89
  #define CREATE_TRACE_POINTS
  #include <trace/events/percpu.h>
8fa3ed801   Dennis Zhou   percpu: migrate p...
90
  #include "percpu-internal.h"
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
91
92
  /* the slots are sorted by free bytes left, 1-15 bytes share the same slot */
  #define PCPU_SLOT_BASE_SHIFT		5
1a4d76076   Tejun Heo   percpu: implement...
93
94
  #define PCPU_EMPTY_POP_PAGES_LOW	2
  #define PCPU_EMPTY_POP_PAGES_HIGH	4
fbf59bc9d   Tejun Heo   percpu: implement...
95

bbddff054   Tejun Heo   percpu: use percp...
96
  #ifdef CONFIG_SMP
e01009833   Tejun Heo   percpu: make x86 ...
97
98
99
  /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
  #ifndef __addr_to_pcpu_ptr
  #define __addr_to_pcpu_ptr(addr)					\
43cf38eb5   Tejun Heo   percpu: add __per...
100
101
102
  	(void __percpu *)((unsigned long)(addr) -			\
  			  (unsigned long)pcpu_base_addr	+		\
  			  (unsigned long)__per_cpu_start)
e01009833   Tejun Heo   percpu: make x86 ...
103
104
105
  #endif
  #ifndef __pcpu_ptr_to_addr
  #define __pcpu_ptr_to_addr(ptr)						\
43cf38eb5   Tejun Heo   percpu: add __per...
106
107
108
  	(void __force *)((unsigned long)(ptr) +				\
  			 (unsigned long)pcpu_base_addr -		\
  			 (unsigned long)__per_cpu_start)
e01009833   Tejun Heo   percpu: make x86 ...
109
  #endif
bbddff054   Tejun Heo   percpu: use percp...
110
111
112
113
114
  #else	/* CONFIG_SMP */
  /* on UP, it's always identity mapped */
  #define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
  #define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
  #endif	/* CONFIG_SMP */
e01009833   Tejun Heo   percpu: make x86 ...
115

1328710b8   Daniel Micay   mark most percpu ...
116
117
118
119
  /* pages per unit; pcpu_page_idx() multiplies a unit number by this */
  static int pcpu_unit_pages __ro_after_init;
  /* size of one unit; pcpu_size_to_slot() special-cases a byte size equal to it */
  static int pcpu_unit_size __ro_after_init;
  /* NOTE(review): presumably the total number of units - confirm where it is set */
  static int pcpu_nr_units __ro_after_init;
  /* NOTE(review): not referenced in this part of the file - meaning to be confirmed */
  static int pcpu_atom_size __ro_after_init;
8fa3ed801   Dennis Zhou   percpu: migrate p...
120
  int pcpu_nr_slots __ro_after_init;
1328710b8   Daniel Micay   mark most percpu ...
121
  static size_t pcpu_chunk_struct_size __ro_after_init;
fbf59bc9d   Tejun Heo   percpu: implement...
122

a855b84c3   Tejun Heo   percpu: fix chunk...
123
  /* cpus with the lowest and highest unit addresses */
1328710b8   Daniel Micay   mark most percpu ...
124
125
  static unsigned int pcpu_low_unit_cpu __ro_after_init;
  static unsigned int pcpu_high_unit_cpu __ro_after_init;
2f39e637e   Tejun Heo   percpu: allow non...
126

fbf59bc9d   Tejun Heo   percpu: implement...
127
  /* the address of the first chunk which starts with the kernel static area */
1328710b8   Daniel Micay   mark most percpu ...
128
  void *pcpu_base_addr __ro_after_init;
fbf59bc9d   Tejun Heo   percpu: implement...
129
  EXPORT_SYMBOL_GPL(pcpu_base_addr);
1328710b8   Daniel Micay   mark most percpu ...
130
131
  static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
  const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */
2f39e637e   Tejun Heo   percpu: allow non...
132

6563297ce   Tejun Heo   percpu: use group...
133
  /* group information, used for vm allocation */
1328710b8   Daniel Micay   mark most percpu ...
134
135
136
  static int pcpu_nr_groups __ro_after_init;
  static const unsigned long *pcpu_group_offsets __ro_after_init;
  static const size_t *pcpu_group_sizes __ro_after_init;
6563297ce   Tejun Heo   percpu: use group...
137

ae9e6bc9f   Tejun Heo   percpu: don't put...
138
139
140
141
142
  /*
   * The first chunk which always exists.  Note that unlike other
   * chunks, this one can be allocated and mapped in several different
   * ways and thus often doesn't live in the vmalloc area.
   */
8fa3ed801   Dennis Zhou   percpu: migrate p...
143
  struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
ae9e6bc9f   Tejun Heo   percpu: don't put...
144
145
146
  
  /*
   * Optional reserved chunk.  This chunk reserves part of the first
e22667056   Dennis Zhou (Facebook)   percpu: introduce...
147
148
   * chunk and serves it for reserved allocations.  When the reserved
   * region doesn't exist, the following variable is NULL.
ae9e6bc9f   Tejun Heo   percpu: don't put...
149
   */
8fa3ed801   Dennis Zhou   percpu: migrate p...
150
  struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
edcb46399   Tejun Heo   percpu, module: i...
151

8fa3ed801   Dennis Zhou   percpu: migrate p...
152
  DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
6710e594f   Tejun Heo   percpu: fix synch...
153
  static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */
fbf59bc9d   Tejun Heo   percpu: implement...
154

8fa3ed801   Dennis Zhou   percpu: migrate p...
155
  struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */
fbf59bc9d   Tejun Heo   percpu: implement...
156

4f996e234   Tejun Heo   percpu: fix synch...
157
158
  /* chunks which need their map areas extended, protected by pcpu_lock */
  static LIST_HEAD(pcpu_map_extend_chunks);
b539b87fe   Tejun Heo   percpu: implmeent...
159
160
161
162
  /*
   * The number of empty populated pages, protected by pcpu_lock.  The
   * reserved chunk doesn't contribute to the count.
   */
6b9b6f399   Dennis Zhou (Facebook)   percpu: expose pc...
163
  int pcpu_nr_empty_pop_pages;
b539b87fe   Tejun Heo   percpu: implmeent...
164

1a4d76076   Tejun Heo   percpu: implement...
165
  /*
7e8a6304d   Dennis Zhou (Facebook)   /proc/meminfo: ad...
166
167
168
169
170
171
172
173
   * The number of populated pages in use by the allocator, protected by
   * pcpu_lock.  This number is kept per unit per chunk (i.e. when a page gets
   * allocated/deallocated, it is allocated/deallocated in all units of a chunk
   * and increments/decrements this count by 1).
   */
  static unsigned long pcpu_nr_populated;
  
  /*
1a4d76076   Tejun Heo   percpu: implement...
174
175
176
177
178
   * Balance work is used to populate or destroy chunks asynchronously.  We
   * try to keep the number of populated free pages between
   * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
   * empty chunk.
   */
fe6bd8c3d   Tejun Heo   percpu: rename pc...
179
180
  static void pcpu_balance_workfn(struct work_struct *work);
  static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
1a4d76076   Tejun Heo   percpu: implement...
181
182
183
184
185
186
187
188
  /* while false, pcpu_schedule_balance_work() does not queue the balance work */
  static bool pcpu_async_enabled __read_mostly;
  /* NOTE(review): presumably records a failed atomic allocation - not used in the visible code, confirm */
  static bool pcpu_atomic_alloc_failed;
  
  /* queue pcpu_balance_work, but only once async operation has been enabled */
  static void pcpu_schedule_balance_work(void)
  {
  	if (!pcpu_async_enabled)
  		return;

  	schedule_work(&pcpu_balance_work);
  }
a56dbddf0   Tejun Heo   percpu: move full...
189

c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
190
  /**
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
191
192
193
   * pcpu_addr_in_chunk - check if the address is served from this chunk
   * @chunk: chunk of interest
   * @addr: percpu address
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
194
195
   *
   * RETURNS:
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
196
   * True if the address is served from this chunk.
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
197
   */
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
198
  static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
020ec6537   Tejun Heo   percpu: factor ou...
199
  {
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
200
  	void *start_addr, *end_addr;
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
201
  	if (!chunk)
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
202
  		return false;
020ec6537   Tejun Heo   percpu: factor ou...
203

560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
204
205
206
  	start_addr = chunk->base_addr + chunk->start_offset;
  	end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
  		   chunk->end_offset;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
207
208
  
  	return addr >= start_addr && addr < end_addr;
020ec6537   Tejun Heo   percpu: factor ou...
209
  }
d9b55eeb1   Tejun Heo   percpu: remove un...
210
  static int __pcpu_size_to_slot(int size)
fbf59bc9d   Tejun Heo   percpu: implement...
211
  {
cae3aeb83   Tejun Heo   percpu: clean up ...
212
  	int highbit = fls(size);	/* size is in bytes */
fbf59bc9d   Tejun Heo   percpu: implement...
213
214
  	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
  }
d9b55eeb1   Tejun Heo   percpu: remove un...
215
216
217
218
219
220
  /*
   * Slot for @size bytes.  A size of exactly pcpu_unit_size always maps to
   * the last slot; everything else goes through __pcpu_size_to_slot().
   */
  static int pcpu_size_to_slot(int size)
  {
  	return size == pcpu_unit_size ? pcpu_nr_slots - 1
  				      : __pcpu_size_to_slot(size);
  }
fbf59bc9d   Tejun Heo   percpu: implement...
221
222
  static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
  {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
223
  	if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0)
fbf59bc9d   Tejun Heo   percpu: implement...
224
  		return 0;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
225
  	return pcpu_size_to_slot(chunk->free_bytes);
fbf59bc9d   Tejun Heo   percpu: implement...
226
  }
88999a898   Tejun Heo   percpu: misc prep...
227
228
229
230
231
232
233
234
235
236
237
238
239
  /* stash the chunk pointer in the page struct's index field */
  static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
  {
  	unsigned long owner = (unsigned long)pcpu;

  	page->index = owner;
  }
  
  /* read back the chunk pointer kept in the page struct's index field */
  static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
  {
  	unsigned long owner = page->index;

  	return (struct pcpu_chunk *)owner;
  }
  
  /*
   * Flat page index of page @page_idx inside @cpu's unit: the unit number
   * from pcpu_unit_map times the pages per unit, plus @page_idx.
   */
  static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
  {
  	int unit_first_page = pcpu_unit_map[cpu] * pcpu_unit_pages;

  	return unit_first_page + page_idx;
  }
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
243
244
245
246
  /* byte offset of page @page_idx of @cpu's unit: unit offset plus page offset */
  static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
  {
  	int page_bytes = page_idx << PAGE_SHIFT;

  	return pcpu_unit_offsets[cpu] + page_bytes;
  }
9983b6f0c   Tejun Heo   percpu: fix first...
247
248
  static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
  				     unsigned int cpu, int page_idx)
fbf59bc9d   Tejun Heo   percpu: implement...
249
  {
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
250
251
  	return (unsigned long)chunk->base_addr +
  	       pcpu_unit_page_offset(cpu, page_idx);
fbf59bc9d   Tejun Heo   percpu: implement...
252
  }
91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
253
  /*
   * Find the next run of clear bits in @bitmap at or after *@rs, looking
   * no further than @end.  *@rs becomes the first clear bit and *@re the
   * first set bit after it; each is @end when nothing is found.
   */
  static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end)
  {
  	int first_clear = find_next_zero_bit(bitmap, end, *rs);

  	*rs = first_clear;
  	*re = find_next_bit(bitmap, end, first_clear + 1);
  }
91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
258
  /*
   * Find the next run of set bits in @bitmap at or after *@rs, looking no
   * further than @end.  *@rs becomes the first set bit and *@re the first
   * clear bit after it; each is @end when nothing is found.
   */
  static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end)
  {
  	int first_set = find_next_bit(bitmap, end, *rs);

  	*rs = first_set;
  	*re = find_next_zero_bit(bitmap, end, first_set + 1);
  }
  
  /*
91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
265
266
267
   * Bitmap region iterators.  Iterates over the regions of @bitmap between
   * [@start, @end).  @rs and @re should be integer variables and will be
   * set to the start and end index of the current region (a run of clear
   * bits for the unpop variant, a run of set bits for the pop variant).
ce3141a27   Tejun Heo   percpu: drop pcpu...
268
   */
91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
269
270
271
272
  /*
   * Visit each run of clear bits via pcpu_next_unpop().  The loop stops as
   * soon as a lookup yields an empty range (@rs >= @re).  Bit @re is
   * already known not to be clear, so the next search resumes at @re + 1.
   */
  #define pcpu_for_each_unpop_region(bitmap, rs, re, start, end)		     \
  	for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \
  	     (rs) < (re);						     \
  	     (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end)))
ce3141a27   Tejun Heo   percpu: drop pcpu...
273

91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
274
275
276
277
  /*
   * Visit each run of set bits via pcpu_next_pop().  The loop stops as soon
   * as a lookup yields an empty range (@rs >= @re).  Bit @re is already
   * known not to be set, so the next search resumes at @re + 1.
   */
  #define pcpu_for_each_pop_region(bitmap, rs, re, start, end)		     \
  	for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end));   \
  	     (rs) < (re);						     \
  	     (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end)))
ce3141a27   Tejun Heo   percpu: drop pcpu...
278

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
  /*
   * The following are helper functions to help access bitmaps and convert
   * between bitmap offsets to address offsets.
   */
  /* pointer to the alloc_map word holding the first bit of block @index */
  static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
  {
  	return &chunk->alloc_map[index * PCPU_BITMAP_BLOCK_BITS /
  				 BITS_PER_LONG];
  }
  
  /* index of the metadata block that contains chunk bit offset @off */
  static unsigned long pcpu_off_to_block_index(int off)
  {
  	unsigned long block_idx = off / PCPU_BITMAP_BLOCK_BITS;

  	return block_idx;
  }
  
  /* offset of chunk bit offset @off inside its block (mask with block bits - 1) */
  static unsigned long pcpu_off_to_block_off(int off)
  {
  	unsigned long in_block = off & (PCPU_BITMAP_BLOCK_BITS - 1);

  	return in_block;
  }
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
298
299
300
301
  /* chunk bit offset of bit @off inside block @index */
  static unsigned long pcpu_block_off_to_off(int index, int off)
  {
  	unsigned long chunk_off = index * PCPU_BITMAP_BLOCK_BITS + off;

  	return chunk_off;
  }
fbf59bc9d   Tejun Heo   percpu: implement...
302
  /**
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
   * pcpu_next_md_free_region - finds the next hint free area
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of free area
   *
   * Helper function for pcpu_for_each_md_free_region.  It checks
   * block->contig_hint and performs aggregation across blocks to find the
   * next hint.  It modifies bit_off and bits in-place to be consumed in the
   * loop.
   */
  static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
  				     int *bits)
  {
  	/* split the starting offset into a block index and an in-block offset */
  	int i = pcpu_off_to_block_index(*bit_off);
  	int block_off = pcpu_off_to_block_off(*bit_off);
  	struct pcpu_block_md *block;
  
  	/* zero means no free run is being carried over from an earlier block */
  	*bits = 0;
  	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
  	     block++, i++) {
  		/*
  		 * handles contig area across blocks
  		 *
  		 * *bits is only nonzero here when the previous iteration
  		 * recorded a free run at the tail of its block.  Extend it by
  		 * this block's leading free bits.  A completely free block
  		 * keeps the run going; anything else ends it and the
  		 * aggregated area is reported.
  		 */
  		if (*bits) {
  			*bits += block->left_free;
  			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
  				continue;
  			return;
  		}
  
  		/*
  		 * This checks three things.  First is there a contig_hint to
  		 * check.  Second, have we checked this hint before by
  		 * comparing the block_off.  Third, is this the same as the
  		 * right contig hint.  In the last case, it spills over into
  		 * the next block and should be handled by the contig area
  		 * across blocks code.
  		 */
  		*bits = block->contig_hint;
  		if (*bits && block->contig_hint_start >= block_off &&
  		    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
  			*bit_off = pcpu_block_off_to_off(i,
  					block->contig_hint_start);
  			return;
  		}
1fa4df3e6   Dennis Zhou   percpu: fix itera...
346
347
  		/* reset to satisfy the second predicate above */
  		block_off = 0;
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
348
349
350
351
352
  
  		/*
  		 * The hint was not usable: record the free run at the tail of
  		 * this block (possibly zero bits) and where it starts, so the
  		 * next iteration can extend it.  If this was the last block
  		 * these values are what the caller sees.
  		 */
  		*bits = block->right_free;
  		*bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
  	}
  }
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
  /**
   * pcpu_next_fit_region - finds fit areas for a given allocation request
   * @chunk: chunk of interest
   * @alloc_bits: size of allocation
   * @align: alignment of area (max PAGE_SIZE)
   * @bit_off: chunk offset
   * @bits: size of free area
   *
   * Finds the next free region that is viable for use with a given size and
   * alignment.  This only returns if there is a valid area to be used for this
   * allocation.  block->first_free is returned if the allocation request fits
   * within the block to see if the request can be fulfilled prior to the contig
   * hint.
   */
  static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
  				 int align, int *bit_off, int *bits)
  {
  	/* split the starting offset into a block index and an in-block offset */
  	int i = pcpu_off_to_block_index(*bit_off);
  	int block_off = pcpu_off_to_block_off(*bit_off);
  	struct pcpu_block_md *block;
  
  	/* zero means no tail area is being carried over from an earlier block */
  	*bits = 0;
  	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
  	     block++, i++) {
  		/*
  		 * handles contig area across blocks
  		 *
  		 * A nonzero *bits is the aligned tail area recorded by the
  		 * previous iteration, with *bit_off still naming its start.
  		 * Add this block's leading free bits: return if the request
  		 * now fits, keep accumulating if the whole block is free,
  		 * otherwise fall through and examine this block's own hint.
  		 */
  		if (*bits) {
  			*bits += block->left_free;
  			if (*bits >= alloc_bits)
  				return;
  			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
  				continue;
  		}
  
  		/* check block->contig_hint */
  		/* *bits temporarily holds the padding needed to align the hint start */
  		*bits = ALIGN(block->contig_hint_start, align) -
  			block->contig_hint_start;
  		/*
  		 * This uses the block offset to determine if this has been
  		 * checked in the prior iteration.
  		 */
  		if (block->contig_hint &&
  		    block->contig_hint_start >= block_off &&
  		    block->contig_hint >= *bits + alloc_bits) {
  			/*
  			 * Report the area from first_free up to the end of
  			 * the aligned request inside the hint:
  			 * padding + alloc_bits + (hint start - first_free).
  			 */
  			*bits += alloc_bits + block->contig_hint_start -
  				 block->first_free;
  			*bit_off = pcpu_block_off_to_off(i, block->first_free);
  			return;
  		}
1fa4df3e6   Dennis Zhou   percpu: fix itera...
401
402
  		/* reset to satisfy the second predicate above */
  		block_off = 0;
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
403
404
405
406
407
408
409
410
411
412
413
414
  
  		/*
  		 * Try the free run at the tail of the block, starting at its
  		 * first aligned bit.  *bit_off is block-relative until it is
  		 * converted to a chunk offset two lines below.
  		 */
  		*bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
  				 align);
  		*bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
  		*bit_off = pcpu_block_off_to_off(i, *bit_off);
  		if (*bits >= alloc_bits)
  			return;
  	}
  
  	/* no valid offsets were found - fail condition */
  	*bit_off = pcpu_chunk_map_bits(chunk);
  }
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
415
416
417
  /*
   * Metadata free area iterators.  These perform aggregation of free areas
   * based on the metadata blocks and return the offset @bit_off and size in
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
418
419
   * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
   * a fit is found for the allocation request.
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
420
421
422
423
424
425
   */
  /*
   * Walk the free areas reported by pcpu_next_md_free_region(), starting
   * from the caller-initialised @bit_off and ending once @bit_off reaches
   * pcpu_chunk_map_bits().  Each step skips the area just visited plus one
   * more bit before searching again.
   */
  #define pcpu_for_each_md_free_region(chunk, bit_off, bits)		\
  	for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));	\
  	     (bit_off) < pcpu_chunk_map_bits((chunk));			\
  	     (bit_off) += (bits) + 1,					\
  	     pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
426
427
428
429
430
431
432
  /*
   * Walk the candidate areas reported by pcpu_next_fit_region(), starting
   * from the caller-initialised @bit_off.  The walk ends once @bit_off
   * reaches pcpu_chunk_map_bits(), which is also the value the helper
   * stores when it finds nothing.  Each step advances past the area just
   * visited.
   */
  #define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
  	for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
  				  &(bits));				      \
  	     (bit_off) < pcpu_chunk_map_bits((chunk));			      \
  	     (bit_off) += (bits),					      \
  	     pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
  				  &(bits)))
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
433
  /**
90459ce06   Bob Liu   percpu: rename pc...
434
   * pcpu_mem_zalloc - allocate memory
1880d93b8   Tejun Heo   percpu: replace p...
435
   * @size: bytes to allocate
47504ee04   Dennis Zhou   percpu: add __GFP...
436
   * @gfp: allocation flags
fbf59bc9d   Tejun Heo   percpu: implement...
437
   *
1880d93b8   Tejun Heo   percpu: replace p...
438
   * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
47504ee04   Dennis Zhou   percpu: add __GFP...
439
440
441
   * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
   * This is to facilitate passing through whitelisted flags.  The
   * returned memory is always zeroed.
fbf59bc9d   Tejun Heo   percpu: implement...
442
443
   *
   * RETURNS:
1880d93b8   Tejun Heo   percpu: replace p...
444
   * Pointer to the allocated area on success, NULL on failure.
fbf59bc9d   Tejun Heo   percpu: implement...
445
   */
47504ee04   Dennis Zhou   percpu: add __GFP...
446
  static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
fbf59bc9d   Tejun Heo   percpu: implement...
447
  {
099a19d91   Tejun Heo   percpu: allow lim...
448
449
  	if (WARN_ON_ONCE(!slab_is_available()))
  		return NULL;
1880d93b8   Tejun Heo   percpu: replace p...
450
  	if (size <= PAGE_SIZE)
554fef1c3   Dennis Zhou   percpu: allow sel...
451
  		return kzalloc(size, gfp);
7af4c0932   Jesper Juhl   percpu: zero memo...
452
  	else
554fef1c3   Dennis Zhou   percpu: allow sel...
453
  		return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL);
1880d93b8   Tejun Heo   percpu: replace p...
454
  }
fbf59bc9d   Tejun Heo   percpu: implement...
455

1880d93b8   Tejun Heo   percpu: replace p...
456
457
458
  /**
   * pcpu_mem_free - release memory obtained from pcpu_mem_zalloc()
   * @ptr: memory to free
   *
   * @ptr is handed to kvfree(), so it does not matter which of the two
   * allocators pcpu_mem_zalloc() used for it.
   */
  static void pcpu_mem_free(void *ptr)
  {
  	kvfree(ptr);
  }
  
  /**
   * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
   * @chunk: chunk of interest
   * @oslot: the previous slot it was on
   *
   * This function is called after an allocation or free changed @chunk.
   * New slot according to the changed state is determined and @chunk is
edcb46399   Tejun Heo   percpu, module: i...
474
475
   * moved to the slot.  Note that the reserved chunk is never put on
   * chunk slots.
ccea34b5d   Tejun Heo   percpu: finer gra...
476
477
478
   *
   * CONTEXT:
   * pcpu_lock.
fbf59bc9d   Tejun Heo   percpu: implement...
479
480
481
482
   */
  static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
  {
  	int nslot = pcpu_chunk_slot(chunk);
edcb46399   Tejun Heo   percpu, module: i...
483
  	if (chunk != pcpu_reserved_chunk && oslot != nslot) {
fbf59bc9d   Tejun Heo   percpu: implement...
484
485
486
487
488
489
  		if (oslot < nslot)
  			list_move(&chunk->list, &pcpu_slot[nslot]);
  		else
  			list_move_tail(&chunk->list, &pcpu_slot[nslot]);
  	}
  }
fbf59bc9d   Tejun Heo   percpu: implement...
490
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
491
   * pcpu_cnt_pop_pages- counts populated backing pages in range
833af8427   Tejun Heo   percpu: restructu...
492
   * @chunk: chunk of interest
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
493
494
   * @bit_off: start offset
   * @bits: size of area to check
9f7dcf224   Tejun Heo   percpu: move chun...
495
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
496
497
498
   * Calculates the number of populated pages in the region
   * [page_start, page_end).  This keeps track of how many empty populated
   * pages are available and decide if async work should be scheduled.
ccea34b5d   Tejun Heo   percpu: finer gra...
499
   *
9f7dcf224   Tejun Heo   percpu: move chun...
500
   * RETURNS:
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
501
   * The nr of populated pages.
9f7dcf224   Tejun Heo   percpu: move chun...
502
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
503
504
  static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off,
  				     int bits)
9f7dcf224   Tejun Heo   percpu: move chun...
505
  {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
506
507
  	int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE);
  	int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
4f996e234   Tejun Heo   percpu: fix synch...
508

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
509
  	if (page_start >= page_end)
9f7dcf224   Tejun Heo   percpu: move chun...
510
  		return 0;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
511
512
513
514
515
516
517
518
519
  	/*
  	 * bitmap_weight counts the number of bits set in a bitmap up to
  	 * the specified number of bits.  This is counting the populated
  	 * pages up to page_end and then subtracting the populated pages
  	 * up to page_start to count the populated pages in
  	 * [page_start, page_end).
  	 */
  	return bitmap_weight(chunk->populated, page_end) -
  	       bitmap_weight(chunk->populated, page_start);
833af8427   Tejun Heo   percpu: restructu...
520
521
522
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
523
   * pcpu_chunk_update - updates the chunk metadata given a free area
833af8427   Tejun Heo   percpu: restructu...
524
   * @chunk: chunk of interest
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
525
526
   * @bit_off: chunk offset
   * @bits: size of free area
833af8427   Tejun Heo   percpu: restructu...
527
   *
13f966373   Dennis Zhou (Facebook)   percpu: skip chun...
528
   * This updates the chunk's contig hint and starting offset given a free area.
268625a6f   Dennis Zhou (Facebook)   percpu: keep trac...
529
   * Choose the best starting offset if the contig hint is equal.
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
530
531
532
   */
  static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits)
  {
13f966373   Dennis Zhou (Facebook)   percpu: skip chun...
533
534
  	if (bits > chunk->contig_bits) {
  		chunk->contig_bits_start = bit_off;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
535
  		chunk->contig_bits = bits;
268625a6f   Dennis Zhou (Facebook)   percpu: keep trac...
536
537
538
539
540
  	} else if (bits == chunk->contig_bits && chunk->contig_bits_start &&
  		   (!bit_off ||
  		    __ffs(bit_off) > __ffs(chunk->contig_bits_start))) {
  		/* use the start with the best alignment */
  		chunk->contig_bits_start = bit_off;
13f966373   Dennis Zhou (Facebook)   percpu: skip chun...
541
  	}
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
542
543
544
545
546
  }
  
  /**
   * pcpu_chunk_refresh_hint - updates metadata about a chunk
   * @chunk: chunk of interest
833af8427   Tejun Heo   percpu: restructu...
547
   *
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
548
549
550
   * Iterates over the metadata blocks to find the largest contig area.
   * It also counts the populated pages and uses the delta to update the
   * global count.
833af8427   Tejun Heo   percpu: restructu...
551
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
552
553
   * Updates:
   *      chunk->contig_bits
13f966373   Dennis Zhou (Facebook)   percpu: skip chun...
554
   *      chunk->contig_bits_start
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
555
   *      nr_empty_pop_pages (chunk and global)
833af8427   Tejun Heo   percpu: restructu...
556
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
557
  static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk)
833af8427   Tejun Heo   percpu: restructu...
558
  {
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
559
  	int bit_off, bits, nr_empty_pop_pages;
833af8427   Tejun Heo   percpu: restructu...
560

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
561
562
  	/* clear metadata */
  	chunk->contig_bits = 0;
6710e594f   Tejun Heo   percpu: fix synch...
563

525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
564
  	bit_off = chunk->first_bit;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
565
  	bits = nr_empty_pop_pages = 0;
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
566
567
  	pcpu_for_each_md_free_region(chunk, bit_off, bits) {
  		pcpu_chunk_update(chunk, bit_off, bits);
833af8427   Tejun Heo   percpu: restructu...
568

525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
569
  		nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits);
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
570
  	}
9f7dcf224   Tejun Heo   percpu: move chun...
571

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
572
573
574
575
576
577
578
579
580
581
582
  	/*
  	 * Keep track of nr_empty_pop_pages.
  	 *
  	 * The chunk maintains the previous number of free pages it held,
  	 * so the delta is used to update the global counter.  The reserved
  	 * chunk is not part of the free page count as they are populated
  	 * at init and are special to serving reserved allocations.
  	 */
  	if (chunk != pcpu_reserved_chunk)
  		pcpu_nr_empty_pop_pages +=
  			(nr_empty_pop_pages - chunk->nr_empty_pop_pages);
a002d1484   Huang Shijie   percpu: fix a mem...
583

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
584
585
  	chunk->nr_empty_pop_pages = nr_empty_pop_pages;
  }
9f7dcf224   Tejun Heo   percpu: move chun...
586

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
587
  /**
   * pcpu_block_update - fold one free area into a block's metadata
   * @block: block of interest
   * @start: start offset in block
   * @end: end offset in block
   *
   * The region [start, end) is expected to be a complete free area within
   * the block.  first_free is lowered to @start if needed, left_free and
   * right_free are set when the area touches the respective block edge,
   * and the contig hint is replaced when the area is larger, or equally
   * large but better aligned.
   */
  static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
  {
  	const int len = end - start;
  
  	if (start < block->first_free)
  		block->first_free = start;
  
  	/* area touches the left and/or right edge of the block */
  	if (!start)
  		block->left_free = len;
  	if (end == PCPU_BITMAP_BLOCK_BITS)
  		block->right_free = len;
  
  	/* strictly larger area: it becomes the hint */
  	if (len > block->contig_hint) {
  		block->contig_hint = len;
  		block->contig_hint_start = start;
  		return;
  	}
  
  	/* only a tie on size with a non-zero recorded start can change it */
  	if (len < block->contig_hint || !block->contig_hint_start)
  		return;
  
  	/* use the start with the best alignment */
  	if (!start || __ffs(start) > __ffs(block->contig_hint_start))
  		block->contig_hint_start = start;
  }
  
  /**
   * pcpu_block_refresh_hint
   * @chunk: chunk of interest
   * @index: index of the metadata block
   *
   * Scans over the block beginning at first_free and updates the block
   * metadata accordingly.
   */
  static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
  {
  	struct pcpu_block_md *block = chunk->md_blocks + index;
  	unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
  	int rs, re;	/* region start, region end */
  
  	/* clear hints */
  	block->contig_hint = 0;
  	block->left_free = block->right_free = 0;
  
  	/* iterate over free areas and update the contig hints */
  	pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free,
  				   PCPU_BITMAP_BLOCK_BITS) {
  		pcpu_block_update(block, rs, re);
  	}
  }
  
  /**
   * pcpu_block_update_hint_alloc - update hint on allocation path
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of request
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
648
649
650
651
   *
   * Updates metadata for the allocation path.  The metadata only has to be
   * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
   * scans are required if the block's contig hint is broken.
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
   */
  static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
  					 int bits)
  {
  	struct pcpu_block_md *s_block, *e_block, *block;
  	int s_index, e_index;	/* block indexes of the freed allocation */
  	int s_off, e_off;	/* block offsets of the freed allocation */
  
  	/*
  	 * Calculate per block offsets.
  	 * The calculation uses an inclusive range, but the resulting offsets
  	 * are [start, end).  e_index always points to the last block in the
  	 * range.
  	 */
  	s_index = pcpu_off_to_block_index(bit_off);
  	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
  	s_off = pcpu_off_to_block_off(bit_off);
  	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
  
  	s_block = chunk->md_blocks + s_index;
  	e_block = chunk->md_blocks + e_index;
  
  	/*
  	 * Update s_block.
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
676
677
678
  	 * block->first_free must be updated if the allocation takes its place.
  	 * If the allocation breaks the contig_hint, a scan is required to
  	 * restore this hint.
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
679
  	 */
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
  	if (s_off == s_block->first_free)
  		s_block->first_free = find_next_zero_bit(
  					pcpu_index_alloc_map(chunk, s_index),
  					PCPU_BITMAP_BLOCK_BITS,
  					s_off + bits);
  
  	if (s_off >= s_block->contig_hint_start &&
  	    s_off < s_block->contig_hint_start + s_block->contig_hint) {
  		/* block contig hint is broken - scan to fix it */
  		pcpu_block_refresh_hint(chunk, s_index);
  	} else {
  		/* update left and right contig manually */
  		s_block->left_free = min(s_block->left_free, s_off);
  		if (s_index == e_index)
  			s_block->right_free = min_t(int, s_block->right_free,
  					PCPU_BITMAP_BLOCK_BITS - e_off);
  		else
  			s_block->right_free = 0;
  	}
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
699
700
701
702
703
  
  	/*
  	 * Update e_block.
  	 */
  	if (s_index != e_index) {
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
  		/*
  		 * When the allocation is across blocks, the end is along
  		 * the left part of the e_block.
  		 */
  		e_block->first_free = find_next_zero_bit(
  				pcpu_index_alloc_map(chunk, e_index),
  				PCPU_BITMAP_BLOCK_BITS, e_off);
  
  		if (e_off == PCPU_BITMAP_BLOCK_BITS) {
  			/* reset the block */
  			e_block++;
  		} else {
  			if (e_off > e_block->contig_hint_start) {
  				/* contig hint is broken - scan to fix it */
  				pcpu_block_refresh_hint(chunk, e_index);
  			} else {
  				e_block->left_free = 0;
  				e_block->right_free =
  					min_t(int, e_block->right_free,
  					      PCPU_BITMAP_BLOCK_BITS - e_off);
  			}
  		}
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
726
727
728
729
730
731
732
733
  
  		/* update in-between md_blocks */
  		for (block = s_block + 1; block < e_block; block++) {
  			block->contig_hint = 0;
  			block->left_free = 0;
  			block->right_free = 0;
  		}
  	}
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
734
735
736
737
738
739
740
741
  	/*
  	 * The only time a full chunk scan is required is if the chunk
  	 * contig hint is broken.  Otherwise, it means a smaller space
  	 * was used and therefore the chunk contig hint is still correct.
  	 */
  	if (bit_off >= chunk->contig_bits_start  &&
  	    bit_off < chunk->contig_bits_start + chunk->contig_bits)
  		pcpu_chunk_refresh_hint(chunk);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
742
743
744
745
746
747
748
  }
  
  /**
   * pcpu_block_update_hint_free - updates the block hints on the free path
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of request
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
749
750
751
752
753
754
755
756
757
758
759
   *
   * Updates metadata for the allocation path.  This avoids a blind block
   * refresh by making use of the block contig hints.  If this fails, it scans
   * forward and backward to determine the extent of the free area.  This is
   * capped at the boundary of blocks.
   *
   * A chunk update is triggered if a page becomes free, a block becomes free,
   * or the free spans across blocks.  This tradeoff is to minimize iterating
   * over the block metadata to update chunk->contig_bits.  chunk->contig_bits
   * may be off by up to a page, but it will never be more than the available
   * space.  If the contig hint is contained in one block, it will be accurate.
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
760
761
762
763
764
765
766
   */
  static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
  					int bits)
  {
  	struct pcpu_block_md *s_block, *e_block, *block;
  	int s_index, e_index;	/* block indexes of the freed allocation */
  	int s_off, e_off;	/* block offsets of the freed allocation */
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
767
  	int start, end;		/* start and end of the whole free area */
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
768
769
770
771
772
773
774
775
776
777
778
779
780
781
  
  	/*
  	 * Calculate per block offsets.
  	 * The calculation uses an inclusive range, but the resulting offsets
  	 * are [start, end).  e_index always points to the last block in the
  	 * range.
  	 */
  	s_index = pcpu_off_to_block_index(bit_off);
  	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
  	s_off = pcpu_off_to_block_off(bit_off);
  	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
  
  	s_block = chunk->md_blocks + s_index;
  	e_block = chunk->md_blocks + e_index;
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
  	/*
  	 * Check if the freed area aligns with the block->contig_hint.
  	 * If it does, then the scan to find the beginning/end of the
  	 * larger free area can be avoided.
  	 *
  	 * start and end refer to beginning and end of the free area
  	 * within each their respective blocks.  This is not necessarily
  	 * the entire free area as it may span blocks past the beginning
  	 * or end of the block.
  	 */
  	start = s_off;
  	if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
  		start = s_block->contig_hint_start;
  	} else {
  		/*
  		 * Scan backwards to find the extent of the free area.
  		 * find_last_bit returns the starting bit, so if the start bit
  		 * is returned, that means there was no last bit and the
  		 * remainder of the chunk is free.
  		 */
  		int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
  					  start);
  		start = (start == l_bit) ? 0 : l_bit + 1;
  	}
  
  	end = e_off;
  	if (e_off == e_block->contig_hint_start)
  		end = e_block->contig_hint_start + e_block->contig_hint;
  	else
  		end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
  				    PCPU_BITMAP_BLOCK_BITS, end);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
813
  	/* update s_block */
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
814
815
  	e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
  	pcpu_block_update(s_block, start, e_off);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
816
817
818
819
  
  	/* freeing in the same block */
  	if (s_index != e_index) {
  		/* update e_block */
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
820
  		pcpu_block_update(e_block, 0, end);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
821
822
823
824
825
826
827
828
829
830
  
  		/* reset md_blocks in the middle */
  		for (block = s_block + 1; block < e_block; block++) {
  			block->first_free = 0;
  			block->contig_hint_start = 0;
  			block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
  			block->left_free = PCPU_BITMAP_BLOCK_BITS;
  			block->right_free = PCPU_BITMAP_BLOCK_BITS;
  		}
  	}
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
831
832
833
834
835
836
837
838
839
840
841
842
843
  	/*
  	 * Refresh chunk metadata when the free makes a page free, a block
  	 * free, or spans across blocks.  The contig hint may be off by up to
  	 * a page, but if the hint is contained in a block, it will be accurate
  	 * with the else condition below.
  	 */
  	if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) >
  	     ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) ||
  	    s_index != e_index)
  		pcpu_chunk_refresh_hint(chunk);
  	else
  		pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start),
  				  s_block->contig_hint);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
844
845
846
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
   * pcpu_is_populated - determines if the region is populated
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of area
   * @next_off: return value for the next offset to start searching
   *
   * For atomic allocations, check if the backing pages are populated.
   *
   * RETURNS:
   * Bool if the backing pages are populated.
   * next_index is to skip over unpopulated blocks in pcpu_find_block_fit.
   */
  static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
  			      int *next_off)
  {
  	int page_start, page_end, rs, re;
833af8427   Tejun Heo   percpu: restructu...
863

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
864
865
  	page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
  	page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
833af8427   Tejun Heo   percpu: restructu...
866

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
867
868
869
870
  	rs = page_start;
  	pcpu_next_unpop(chunk->populated, &rs, &re, page_end);
  	if (rs >= page_end)
  		return true;
833af8427   Tejun Heo   percpu: restructu...
871

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
872
873
  	*next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
  	return false;
9f7dcf224   Tejun Heo   percpu: move chun...
874
875
876
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
877
878
879
880
881
882
   * pcpu_find_block_fit - finds the block index to start searching
   * @chunk: chunk of interest
   * @alloc_bits: size of request in allocation units
   * @align: alignment of area (max PAGE_SIZE bytes)
   * @pop_only: use populated regions only
   *
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
883
884
885
886
887
888
889
890
   * Given a chunk and an allocation spec, find the offset to begin searching
   * for a free region.  This iterates over the bitmap metadata blocks to
   * find an offset that will be guaranteed to fit the requirements.  It is
   * not quite first fit as if the allocation does not fit in the contig hint
   * of a block or chunk, it is skipped.  This errs on the side of caution
   * to prevent excess iteration.  Poor alignment can cause the allocator to
   * skip over blocks and chunks that have valid free areas.
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
891
892
893
   * RETURNS:
   * The offset in the bitmap to begin searching.
   * -1 if no offset is found.
a16037c8d   Tejun Heo   percpu: make pcpu...
894
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
895
896
  static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
  			       size_t align, bool pop_only)
a16037c8d   Tejun Heo   percpu: make pcpu...
897
  {
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
898
  	int bit_off, bits, next_off;
a16037c8d   Tejun Heo   percpu: make pcpu...
899

13f966373   Dennis Zhou (Facebook)   percpu: skip chun...
900
901
902
903
904
905
906
907
908
909
  	/*
  	 * Check to see if the allocation can fit in the chunk's contig hint.
  	 * This is an optimization to prevent scanning by assuming if it
  	 * cannot fit in the global hint, there is memory pressure and creating
  	 * a new chunk would happen soon.
  	 */
  	bit_off = ALIGN(chunk->contig_bits_start, align) -
  		  chunk->contig_bits_start;
  	if (bit_off + alloc_bits > chunk->contig_bits)
  		return -1;
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
910
911
912
  	bit_off = chunk->first_bit;
  	bits = 0;
  	pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
913
  		if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
914
  						   &next_off))
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
915
  			break;
a16037c8d   Tejun Heo   percpu: make pcpu...
916

b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
917
  		bit_off = next_off;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
918
  		bits = 0;
a16037c8d   Tejun Heo   percpu: make pcpu...
919
  	}
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
920
921
922
923
924
  
  	if (bit_off == pcpu_chunk_map_bits(chunk))
  		return -1;
  
  	return bit_off;
a16037c8d   Tejun Heo   percpu: make pcpu...
925
926
927
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
928
   * pcpu_alloc_area - allocates an area from a pcpu_chunk
fbf59bc9d   Tejun Heo   percpu: implement...
929
   * @chunk: chunk of interest
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
930
931
932
   * @alloc_bits: size of request in allocation units
   * @align: alignment of area (max PAGE_SIZE)
   * @start: bit_off to start searching
9f7dcf224   Tejun Heo   percpu: move chun...
933
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
934
   * This function takes in a @start offset to begin searching to fit an
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
935
936
937
938
939
940
   * allocation of @alloc_bits with alignment @align.  It needs to scan
   * the allocation map because if it fits within the block's contig hint,
   * @start will be block->first_free. This is an attempt to fill the
   * allocation prior to breaking the contig hint.  The allocation and
   * boundary maps are updated accordingly if it confirms a valid
   * free area.
ccea34b5d   Tejun Heo   percpu: finer gra...
941
   *
fbf59bc9d   Tejun Heo   percpu: implement...
942
   * RETURNS:
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
943
944
   * Allocated addr offset in @chunk on success.
   * -1 if no matching area is found.
fbf59bc9d   Tejun Heo   percpu: implement...
945
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
946
947
  static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
  			   size_t align, int start)
fbf59bc9d   Tejun Heo   percpu: implement...
948
  {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
949
950
  	size_t align_mask = (align) ? (align - 1) : 0;
  	int bit_off, end, oslot;
a16037c8d   Tejun Heo   percpu: make pcpu...
951

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
952
  	lockdep_assert_held(&pcpu_lock);
fbf59bc9d   Tejun Heo   percpu: implement...
953

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
954
  	oslot = pcpu_chunk_slot(chunk);
fbf59bc9d   Tejun Heo   percpu: implement...
955

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
956
957
958
  	/*
  	 * Search to find a fit.
  	 */
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
959
  	end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
960
961
962
963
  	bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start,
  					     alloc_bits, align_mask);
  	if (bit_off >= end)
  		return -1;
fbf59bc9d   Tejun Heo   percpu: implement...
964

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
965
966
  	/* update alloc map */
  	bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
3d331ad74   Al Viro   percpu: speed all...
967

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
968
969
970
971
  	/* update boundary map */
  	set_bit(bit_off, chunk->bound_map);
  	bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
  	set_bit(bit_off + alloc_bits, chunk->bound_map);
fbf59bc9d   Tejun Heo   percpu: implement...
972

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
973
  	chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
fbf59bc9d   Tejun Heo   percpu: implement...
974

86b442fbc   Dennis Zhou (Facebook)   percpu: add first...
975
976
977
978
979
980
  	/* update first free bit */
  	if (bit_off == chunk->first_bit)
  		chunk->first_bit = find_next_zero_bit(
  					chunk->alloc_map,
  					pcpu_chunk_map_bits(chunk),
  					bit_off + alloc_bits);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
981
  	pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
fbf59bc9d   Tejun Heo   percpu: implement...
982

fbf59bc9d   Tejun Heo   percpu: implement...
983
  	pcpu_chunk_relocate(chunk, oslot);
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
984
  	return bit_off * PCPU_MIN_ALLOC_SIZE;
fbf59bc9d   Tejun Heo   percpu: implement...
985
986
987
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
988
   * pcpu_free_area - frees the corresponding offset
fbf59bc9d   Tejun Heo   percpu: implement...
989
   * @chunk: chunk of interest
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
990
   * @off: addr offset into chunk
ccea34b5d   Tejun Heo   percpu: finer gra...
991
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
992
993
   * This function determines the size of an allocation to free using
   * the boundary bitmap and clears the allocation map.
fbf59bc9d   Tejun Heo   percpu: implement...
994
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
995
  static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
fbf59bc9d   Tejun Heo   percpu: implement...
996
  {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
997
  	int bit_off, bits, end, oslot;
723ad1d90   Al Viro   percpu: store off...
998

5ccd30e40   Dennis Zhou   percpu: add missi...
999
  	lockdep_assert_held(&pcpu_lock);
30a5b5367   Dennis Zhou   percpu: expose st...
1000
  	pcpu_stats_area_dealloc(chunk);
5ccd30e40   Dennis Zhou   percpu: add missi...
1001

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1002
  	oslot = pcpu_chunk_slot(chunk);
fbf59bc9d   Tejun Heo   percpu: implement...
1003

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1004
  	bit_off = off / PCPU_MIN_ALLOC_SIZE;
3d331ad74   Al Viro   percpu: speed all...
1005

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1006
1007
1008
1009
1010
  	/* find end index */
  	end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
  			    bit_off + 1);
  	bits = end - bit_off;
  	bitmap_clear(chunk->alloc_map, bit_off, bits);
fbf59bc9d   Tejun Heo   percpu: implement...
1011

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1012
1013
  	/* update metadata */
  	chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
b539b87fe   Tejun Heo   percpu: implmeent...
1014

86b442fbc   Dennis Zhou (Facebook)   percpu: add first...
1015
1016
  	/* update first free bit */
  	chunk->first_bit = min(chunk->first_bit, bit_off);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1017
  	pcpu_block_update_hint_free(chunk, bit_off, bits);
fbf59bc9d   Tejun Heo   percpu: implement...
1018

fbf59bc9d   Tejun Heo   percpu: implement...
1019
1020
  	pcpu_chunk_relocate(chunk, oslot);
  }
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
  /*
   * pcpu_init_md_blocks - mark every metadata block of @chunk as fully free
   *
   * Sets contig_hint, left_free and right_free of each block to
   * PCPU_BITMAP_BLOCK_BITS.  The remaining fields are not touched here.
   */
  static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
  {
  	struct pcpu_block_md *md = chunk->md_blocks;
  	struct pcpu_block_md *const md_end = md + pcpu_chunk_nr_blocks(chunk);
  
  	while (md != md_end) {
  		md->left_free = PCPU_BITMAP_BLOCK_BITS;
  		md->right_free = PCPU_BITMAP_BLOCK_BITS;
  		md->contig_hint = PCPU_BITMAP_BLOCK_BITS;
  		md++;
  	}
  }
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
  /**
   * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
   * @tmp_addr: the start of the region served
   * @map_size: size of the region served
   *
   * This is responsible for creating the chunks that serve the first chunk.  The
   * base_addr is page aligned down of @tmp_addr while the region end is page
   * aligned up.  Offsets are kept track of to determine the region served. All
   * this is done to appease the bitmap allocator in avoiding partial blocks.
   *
   * RETURNS:
   * Chunk serving the region at @tmp_addr of @map_size.
   */
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1046
  static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1047
  							 int map_size)
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1048
1049
  {
  	struct pcpu_chunk *chunk;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1050
  	unsigned long aligned_addr, lcm_align;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1051
  	int start_offset, offset_bits, region_size, region_bits;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1052
1053
1054
1055
1056
  
  	/* region calculations */
  	aligned_addr = tmp_addr & PAGE_MASK;
  
  	start_offset = tmp_addr - aligned_addr;
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1057

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1058
1059
1060
1061
1062
1063
1064
  	/*
  	 * Align the end of the region with the LCM of PAGE_SIZE and
  	 * PCPU_BITMAP_BLOCK_SIZE.  One of these constants is a multiple of
  	 * the other.
  	 */
  	lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
  	region_size = ALIGN(start_offset + map_size, lcm_align);
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1065

c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1066
  	/* allocate chunk */
8ab16c43e   Dennis Zhou (Facebook)   percpu: change th...
1067
1068
1069
  	chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) +
  				    BITS_TO_LONGS(region_size >> PAGE_SHIFT),
  				    0);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1070

10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1071
  	INIT_LIST_HEAD(&chunk->list);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1072
1073
  
  	chunk->base_addr = (void *)aligned_addr;
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1074
  	chunk->start_offset = start_offset;
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1075
  	chunk->end_offset = region_size - chunk->start_offset - map_size;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1076

8ab16c43e   Dennis Zhou (Facebook)   percpu: change th...
1077
  	chunk->nr_pages = region_size >> PAGE_SHIFT;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1078
  	region_bits = pcpu_chunk_map_bits(chunk);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1079

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1080
1081
1082
1083
1084
1085
1086
  	chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) *
  					       sizeof(chunk->alloc_map[0]), 0);
  	chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) *
  					       sizeof(chunk->bound_map[0]), 0);
  	chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) *
  					       sizeof(chunk->md_blocks[0]), 0);
  	pcpu_init_md_blocks(chunk);
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1087
1088
1089
  
  	/* manage populated page bitmap */
  	chunk->immutable = true;
8ab16c43e   Dennis Zhou (Facebook)   percpu: change th...
1090
1091
  	bitmap_fill(chunk->populated, chunk->nr_pages);
  	chunk->nr_populated = chunk->nr_pages;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1092
1093
1094
  	chunk->nr_empty_pop_pages =
  		pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE,
  				   map_size / PCPU_MIN_ALLOC_SIZE);
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1095

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1096
1097
  	chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE;
  	chunk->free_bytes = map_size;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1098
1099
1100
  
  	if (chunk->start_offset) {
  		/* hide the beginning of the bitmap */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1101
1102
1103
1104
  		offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
  		bitmap_set(chunk->alloc_map, 0, offset_bits);
  		set_bit(0, chunk->bound_map);
  		set_bit(offset_bits, chunk->bound_map);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1105

86b442fbc   Dennis Zhou (Facebook)   percpu: add first...
1106
  		chunk->first_bit = offset_bits;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1107
  		pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1108
  	}
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1109
1110
  	if (chunk->end_offset) {
  		/* hide the end of the bitmap */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1111
1112
1113
1114
1115
1116
1117
  		offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
  		bitmap_set(chunk->alloc_map,
  			   pcpu_chunk_map_bits(chunk) - offset_bits,
  			   offset_bits);
  		set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
  			chunk->bound_map);
  		set_bit(region_bits, chunk->bound_map);
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1118

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1119
1120
1121
  		pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
  					     - offset_bits, offset_bits);
  	}
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1122

10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1123
1124
  	return chunk;
  }
47504ee04   Dennis Zhou   percpu: add __GFP...
1125
  /**
   * pcpu_alloc_chunk - allocate a chunk descriptor and its bookkeeping maps
   * @gfp: allocation flags handed to pcpu_mem_zalloc()
   *
   * Allocates a zeroed chunk descriptor sized for pcpu_unit_pages pages
   * together with its alloc_map, bound_map and md_blocks arrays, then
   * initialises the metadata so that the whole region is accounted as free.
   *
   * RETURNS:
   * The new chunk on success, NULL if any of the allocations failed (in
   * which case everything allocated so far has been released).
   */
  static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
  {
  	struct pcpu_chunk *chunk;
  	size_t map_bytes;
  	int nr_bits;

  	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
  	if (!chunk)
  		return NULL;

  	INIT_LIST_HEAD(&chunk->list);
  	chunk->nr_pages = pcpu_unit_pages;
  	nr_bits = pcpu_chunk_map_bits(chunk);

  	/* one bit per allocation unit */
  	map_bytes = BITS_TO_LONGS(nr_bits) * sizeof(chunk->alloc_map[0]);
  	chunk->alloc_map = pcpu_mem_zalloc(map_bytes, gfp);
  	if (!chunk->alloc_map)
  		goto free_chunk;

  	/* bound_map carries one extra bit beyond the region */
  	map_bytes = BITS_TO_LONGS(nr_bits + 1) * sizeof(chunk->bound_map[0]);
  	chunk->bound_map = pcpu_mem_zalloc(map_bytes, gfp);
  	if (!chunk->bound_map)
  		goto free_alloc_map;

  	map_bytes = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
  	chunk->md_blocks = pcpu_mem_zalloc(map_bytes, gfp);
  	if (!chunk->md_blocks)
  		goto free_bound_map;

  	pcpu_init_md_blocks(chunk);

  	/* the entire region starts out free and contiguous */
  	chunk->contig_bits = nr_bits;
  	chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;

  	return chunk;

  	/* unwind in reverse order of allocation */
  free_bound_map:
  	pcpu_mem_free(chunk->bound_map);
  free_alloc_map:
  	pcpu_mem_free(chunk->alloc_map);
  free_chunk:
  	pcpu_mem_free(chunk);

  	return NULL;
  }
  
  /*
   * Release a chunk descriptor together with its md_blocks, bound_map and
   * alloc_map arrays.  Passing NULL is allowed and does nothing.
   */
  static void pcpu_free_chunk(struct pcpu_chunk *chunk)
  {
  	if (chunk) {
  		/* the maps go first, the descriptor that points at them last */
  		pcpu_mem_free(chunk->md_blocks);
  		pcpu_mem_free(chunk->bound_map);
  		pcpu_mem_free(chunk->alloc_map);
  		pcpu_mem_free(chunk);
  	}
  }
b539b87fe   Tejun Heo   percpu: implmeent...
1178
1179
1180
1181
1182
  /**
   * pcpu_chunk_populated - post-population bookkeeping
   * @chunk: pcpu_chunk which got populated
   * @page_start: the start page
   * @page_end: the end page
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1183
   * @for_alloc: if this is to populate for allocation
b539b87fe   Tejun Heo   percpu: implmeent...
1184
1185
1186
1187
   *
   * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
   * the bookkeeping information accordingly.  Must be called after each
   * successful population.
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1188
1189
1190
   *
   * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it
   * is to serve an allocation in that area.
b539b87fe   Tejun Heo   percpu: implmeent...
1191
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1192
1193
  static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
  				 int page_end, bool for_alloc)
b539b87fe   Tejun Heo   percpu: implmeent...
1194
1195
1196
1197
1198
1199
1200
  {
  	int nr = page_end - page_start;
  
  	lockdep_assert_held(&pcpu_lock);
  
  	bitmap_set(chunk->populated, page_start, nr);
  	chunk->nr_populated += nr;
7e8a6304d   Dennis Zhou (Facebook)   /proc/meminfo: ad...
1201
  	pcpu_nr_populated += nr;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1202
1203
1204
1205
1206
  
  	if (!for_alloc) {
  		chunk->nr_empty_pop_pages += nr;
  		pcpu_nr_empty_pop_pages += nr;
  	}
b539b87fe   Tejun Heo   percpu: implmeent...
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
  }
  
  /**
   * pcpu_chunk_depopulated - post-depopulation bookkeeping
   * @chunk: pcpu_chunk which got depopulated
   * @page_start: the start page
   * @page_end: the end page
   *
   * Pages in [@page_start,@page_end) have been depopulated from @chunk.
   * Update the bookkeeping information accordingly.  Must be called after
   * each successful depopulation.
   */
  static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
  				   int page_start, int page_end)
  {
  	int nr = page_end - page_start;
  
  	lockdep_assert_held(&pcpu_lock);
  
  	bitmap_clear(chunk->populated, page_start, nr);
  	chunk->nr_populated -= nr;
0cecf50cf   Dennis Zhou (Facebook)   percpu: introduce...
1228
  	chunk->nr_empty_pop_pages -= nr;
b539b87fe   Tejun Heo   percpu: implmeent...
1229
  	pcpu_nr_empty_pop_pages -= nr;
7e8a6304d   Dennis Zhou (Facebook)   /proc/meminfo: ad...
1230
  	pcpu_nr_populated -= nr;
b539b87fe   Tejun Heo   percpu: implmeent...
1231
  }
9f6455325   Tejun Heo   percpu: move vmal...
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
  /*
   * Chunk management implementation.
   *
   * To allow different implementations, chunk alloc/free and
   * [de]population are implemented in a separate file which is pulled
   * into this file and compiled together.  The following functions
   * should be implemented.
   *
   * pcpu_populate_chunk		- populate the specified range of a chunk
   * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
   * pcpu_create_chunk		- create a new chunk
   * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
   * pcpu_addr_to_page		- translate address to the backing struct page
   * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
fbf59bc9d   Tejun Heo   percpu: implement...
1246
   */
15d9f3d11   Dennis Zhou   percpu: match chu...
1247
  static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
47504ee04   Dennis Zhou   percpu: add __GFP...
1248
  			       int page_start, int page_end, gfp_t gfp);
15d9f3d11   Dennis Zhou   percpu: match chu...
1249
1250
  static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
  				  int page_start, int page_end);
47504ee04   Dennis Zhou   percpu: add __GFP...
1251
  static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
9f6455325   Tejun Heo   percpu: move vmal...
1252
1253
1254
  static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
  static struct page *pcpu_addr_to_page(void *addr);
  static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
fbf59bc9d   Tejun Heo   percpu: implement...
1255

b0c9778b1   Tejun Heo   percpu: implement...
1256
1257
1258
  #ifdef CONFIG_NEED_PER_CPU_KM
  #include "percpu-km.c"
  #else
9f6455325   Tejun Heo   percpu: move vmal...
1259
  #include "percpu-vm.c"
b0c9778b1   Tejun Heo   percpu: implement...
1260
  #endif
fbf59bc9d   Tejun Heo   percpu: implement...
1261
1262
  
  /**
88999a898   Tejun Heo   percpu: misc prep...
1263
1264
1265
   * pcpu_chunk_addr_search - determine chunk containing specified address
   * @addr: address for which the chunk needs to be determined.
   *
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1266
1267
1268
   * This is an internal function that handles all but static allocations.
   * Static percpu address values should never be passed into the allocator.
   *
88999a898   Tejun Heo   percpu: misc prep...
1269
1270
1271
1272
1273
   * RETURNS:
   * The address of the found chunk.
   */
  static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  {
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1274
  	/* is it in the dynamic region (first chunk)? */
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
1275
  	if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
88999a898   Tejun Heo   percpu: misc prep...
1276
  		return pcpu_first_chunk;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1277
1278
  
  	/* is it in the reserved region? */
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
1279
  	if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1280
  		return pcpu_reserved_chunk;
88999a898   Tejun Heo   percpu: misc prep...
1281
1282
1283
1284
1285
1286
1287
1288
1289
  
  	/*
  	 * The address is relative to unit0 which might be unused and
  	 * thus unmapped.  Offset the address to the unit space of the
  	 * current processor before looking it up in the vmalloc
  	 * space.  Note that any possible cpu id can be used here, so
  	 * there's no need to worry about preemption or cpu hotplug.
  	 */
  	addr += pcpu_unit_offsets[raw_smp_processor_id()];
9f6455325   Tejun Heo   percpu: move vmal...
1290
  	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
88999a898   Tejun Heo   percpu: misc prep...
1291
1292
1293
  }
  
  /**
edcb46399   Tejun Heo   percpu, module: i...
1294
   * pcpu_alloc - the percpu allocator
cae3aeb83   Tejun Heo   percpu: clean up ...
1295
   * @size: size of area to allocate in bytes
fbf59bc9d   Tejun Heo   percpu: implement...
1296
   * @align: alignment of area (max PAGE_SIZE)
edcb46399   Tejun Heo   percpu, module: i...
1297
   * @reserved: allocate from the reserved chunk if available
5835d96e9   Tejun Heo   percpu: implement...
1298
   * @gfp: allocation flags
fbf59bc9d   Tejun Heo   percpu: implement...
1299
   *
5835d96e9   Tejun Heo   percpu: implement...
1300
   * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1301
1302
1303
   * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
   * then no warning will be triggered on invalid or failed allocation
   * requests.
fbf59bc9d   Tejun Heo   percpu: implement...
1304
1305
1306
1307
   *
   * RETURNS:
   * Percpu pointer to the allocated area on success, NULL on failure.
   */
5835d96e9   Tejun Heo   percpu: implement...
1308
1309
  static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
  				 gfp_t gfp)
fbf59bc9d   Tejun Heo   percpu: implement...
1310
  {
554fef1c3   Dennis Zhou   percpu: allow sel...
1311
1312
  	/* whitelisted flags that can be passed to the backing allocators */
  	gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1313
1314
  	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
  	bool do_warn = !(gfp & __GFP_NOWARN);
f2badb0c9   Tejun Heo   percpu: make allo...
1315
  	static int warn_limit = 10;
fbf59bc9d   Tejun Heo   percpu: implement...
1316
  	struct pcpu_chunk *chunk;
f2badb0c9   Tejun Heo   percpu: make allo...
1317
  	const char *err;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1318
  	int slot, off, cpu, ret;
403a91b16   Jiri Kosina   percpu: allow pcp...
1319
  	unsigned long flags;
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
1320
  	void __percpu *ptr;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1321
  	size_t bits, bit_align;
fbf59bc9d   Tejun Heo   percpu: implement...
1322

723ad1d90   Al Viro   percpu: store off...
1323
  	/*
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1324
1325
1326
1327
  	 * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
  	 * therefore alignment must be a minimum of that many bytes.
  	 * An allocation may have internal fragmentation from rounding up
  	 * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
723ad1d90   Al Viro   percpu: store off...
1328
  	 */
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
1329
1330
  	if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
  		align = PCPU_MIN_ALLOC_SIZE;
723ad1d90   Al Viro   percpu: store off...
1331

d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
1332
  	size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1333
1334
  	bits = size >> PCPU_MIN_ALLOC_SHIFT;
  	bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
2f69fa829   Al Viro   percpu: allocatio...
1335

3ca45a46f   zijun_hu   percpu: ensure th...
1336
1337
  	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
  		     !is_power_of_2(align))) {
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1338
1339
  		WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation
  ",
756a025f0   Joe Perches   mm: coalesce spli...
1340
  		     size, align);
fbf59bc9d   Tejun Heo   percpu: implement...
1341
1342
  		return NULL;
  	}
f52ba1fef   Kirill Tkhai   mm: Allow to kill...
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
  	if (!is_atomic) {
  		/*
  		 * pcpu_balance_workfn() allocates memory under this mutex,
  		 * and it may wait for memory reclaim. Allow current task
  		 * to become OOM victim, in case of memory pressure.
  		 */
  		if (gfp & __GFP_NOFAIL)
  			mutex_lock(&pcpu_alloc_mutex);
  		else if (mutex_lock_killable(&pcpu_alloc_mutex))
  			return NULL;
  	}
6710e594f   Tejun Heo   percpu: fix synch...
1354

403a91b16   Jiri Kosina   percpu: allow pcp...
1355
  	spin_lock_irqsave(&pcpu_lock, flags);
fbf59bc9d   Tejun Heo   percpu: implement...
1356

edcb46399   Tejun Heo   percpu, module: i...
1357
1358
1359
  	/* serve reserved allocations from the reserved chunk if available */
  	if (reserved && pcpu_reserved_chunk) {
  		chunk = pcpu_reserved_chunk;
833af8427   Tejun Heo   percpu: restructu...
1360

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1361
1362
  		off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
  		if (off < 0) {
833af8427   Tejun Heo   percpu: restructu...
1363
  			err = "alloc from reserved chunk failed";
ccea34b5d   Tejun Heo   percpu: finer gra...
1364
  			goto fail_unlock;
f2badb0c9   Tejun Heo   percpu: make allo...
1365
  		}
833af8427   Tejun Heo   percpu: restructu...
1366

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1367
  		off = pcpu_alloc_area(chunk, bits, bit_align, off);
edcb46399   Tejun Heo   percpu, module: i...
1368
1369
  		if (off >= 0)
  			goto area_found;
833af8427   Tejun Heo   percpu: restructu...
1370

f2badb0c9   Tejun Heo   percpu: make allo...
1371
  		err = "alloc from reserved chunk failed";
ccea34b5d   Tejun Heo   percpu: finer gra...
1372
  		goto fail_unlock;
edcb46399   Tejun Heo   percpu, module: i...
1373
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1374
  restart:
edcb46399   Tejun Heo   percpu, module: i...
1375
  	/* search through normal chunks */
fbf59bc9d   Tejun Heo   percpu: implement...
1376
1377
  	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
  		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1378
1379
1380
  			off = pcpu_find_block_fit(chunk, bits, bit_align,
  						  is_atomic);
  			if (off < 0)
fbf59bc9d   Tejun Heo   percpu: implement...
1381
  				continue;
ccea34b5d   Tejun Heo   percpu: finer gra...
1382

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1383
  			off = pcpu_alloc_area(chunk, bits, bit_align, off);
fbf59bc9d   Tejun Heo   percpu: implement...
1384
1385
  			if (off >= 0)
  				goto area_found;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1386

fbf59bc9d   Tejun Heo   percpu: implement...
1387
1388
  		}
  	}
403a91b16   Jiri Kosina   percpu: allow pcp...
1389
  	spin_unlock_irqrestore(&pcpu_lock, flags);
ccea34b5d   Tejun Heo   percpu: finer gra...
1390

b38d08f31   Tejun Heo   percpu: restructu...
1391
1392
1393
1394
1395
  	/*
  	 * No space left.  Create a new chunk.  We don't want multiple
  	 * tasks to create chunks simultaneously.  Serialize and create iff
  	 * there's still no empty chunk after grabbing the mutex.
  	 */
11df02bf9   Dennis Zhou   percpu: resolve e...
1396
1397
  	if (is_atomic) {
  		err = "atomic alloc failed, no space left";
5835d96e9   Tejun Heo   percpu: implement...
1398
  		goto fail;
11df02bf9   Dennis Zhou   percpu: resolve e...
1399
  	}
5835d96e9   Tejun Heo   percpu: implement...
1400

b38d08f31   Tejun Heo   percpu: restructu...
1401
  	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
554fef1c3   Dennis Zhou   percpu: allow sel...
1402
  		chunk = pcpu_create_chunk(pcpu_gfp);
b38d08f31   Tejun Heo   percpu: restructu...
1403
1404
1405
1406
1407
1408
1409
1410
1411
  		if (!chunk) {
  			err = "failed to allocate new chunk";
  			goto fail;
  		}
  
  		spin_lock_irqsave(&pcpu_lock, flags);
  		pcpu_chunk_relocate(chunk, -1);
  	} else {
  		spin_lock_irqsave(&pcpu_lock, flags);
f2badb0c9   Tejun Heo   percpu: make allo...
1412
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1413

ccea34b5d   Tejun Heo   percpu: finer gra...
1414
  	goto restart;
fbf59bc9d   Tejun Heo   percpu: implement...
1415
1416
  
  area_found:
30a5b5367   Dennis Zhou   percpu: expose st...
1417
  	pcpu_stats_area_alloc(chunk, size);
403a91b16   Jiri Kosina   percpu: allow pcp...
1418
  	spin_unlock_irqrestore(&pcpu_lock, flags);
ccea34b5d   Tejun Heo   percpu: finer gra...
1419

dca496451   Tejun Heo   percpu: move comm...
1420
  	/* populate if not all pages are already there */
5835d96e9   Tejun Heo   percpu: implement...
1421
  	if (!is_atomic) {
e04d32083   Tejun Heo   percpu: indent th...
1422
  		int page_start, page_end, rs, re;
dca496451   Tejun Heo   percpu: move comm...
1423

e04d32083   Tejun Heo   percpu: indent th...
1424
1425
  		page_start = PFN_DOWN(off);
  		page_end = PFN_UP(off + size);
b38d08f31   Tejun Heo   percpu: restructu...
1426

91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
1427
1428
  		pcpu_for_each_unpop_region(chunk->populated, rs, re,
  					   page_start, page_end) {
e04d32083   Tejun Heo   percpu: indent th...
1429
  			WARN_ON(chunk->immutable);
554fef1c3   Dennis Zhou   percpu: allow sel...
1430
  			ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
e04d32083   Tejun Heo   percpu: indent th...
1431
1432
1433
  
  			spin_lock_irqsave(&pcpu_lock, flags);
  			if (ret) {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1434
  				pcpu_free_area(chunk, off);
e04d32083   Tejun Heo   percpu: indent th...
1435
1436
1437
  				err = "failed to populate";
  				goto fail_unlock;
  			}
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1438
  			pcpu_chunk_populated(chunk, rs, re, true);
e04d32083   Tejun Heo   percpu: indent th...
1439
  			spin_unlock_irqrestore(&pcpu_lock, flags);
dca496451   Tejun Heo   percpu: move comm...
1440
  		}
fbf59bc9d   Tejun Heo   percpu: implement...
1441

e04d32083   Tejun Heo   percpu: indent th...
1442
1443
  		mutex_unlock(&pcpu_alloc_mutex);
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1444

1a4d76076   Tejun Heo   percpu: implement...
1445
1446
  	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
  		pcpu_schedule_balance_work();
dca496451   Tejun Heo   percpu: move comm...
1447
1448
1449
  	/* clear the areas and return address relative to base address */
  	for_each_possible_cpu(cpu)
  		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
1450
  	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
8a8c35fad   Larry Finger   mm: kmemleak_allo...
1451
  	kmemleak_alloc_percpu(ptr, size, gfp);
df95e795a   Dennis Zhou   percpu: add trace...
1452
1453
1454
  
  	trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
  			chunk->base_addr, off, ptr);
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
1455
  	return ptr;
ccea34b5d   Tejun Heo   percpu: finer gra...
1456
1457
  
  fail_unlock:
403a91b16   Jiri Kosina   percpu: allow pcp...
1458
  	spin_unlock_irqrestore(&pcpu_lock, flags);
b38d08f31   Tejun Heo   percpu: restructu...
1459
  fail:
df95e795a   Dennis Zhou   percpu: add trace...
1460
  	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1461
  	if (!is_atomic && do_warn && warn_limit) {
870d4b12a   Joe Perches   mm: percpu: use p...
1462
1463
  		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s
  ",
598d80914   Joe Perches   mm: convert pr_wa...
1464
  			size, align, is_atomic, err);
f2badb0c9   Tejun Heo   percpu: make allo...
1465
1466
  		dump_stack();
  		if (!--warn_limit)
870d4b12a   Joe Perches   mm: percpu: use p...
1467
1468
  			pr_info("limit reached, disable warning
  ");
f2badb0c9   Tejun Heo   percpu: make allo...
1469
  	}
1a4d76076   Tejun Heo   percpu: implement...
1470
1471
1472
1473
  	if (is_atomic) {
  		/* see the flag handling in pcpu_blance_workfn() */
  		pcpu_atomic_alloc_failed = true;
  		pcpu_schedule_balance_work();
6710e594f   Tejun Heo   percpu: fix synch...
1474
1475
  	} else {
  		mutex_unlock(&pcpu_alloc_mutex);
1a4d76076   Tejun Heo   percpu: implement...
1476
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1477
  	return NULL;
fbf59bc9d   Tejun Heo   percpu: implement...
1478
  }
edcb46399   Tejun Heo   percpu, module: i...
1479
1480
  
  /**
5835d96e9   Tejun Heo   percpu: implement...
1481
   * __alloc_percpu_gfp - allocate dynamic percpu area
edcb46399   Tejun Heo   percpu, module: i...
1482
1483
   * @size: size of area to allocate in bytes
   * @align: alignment of area (max PAGE_SIZE)
5835d96e9   Tejun Heo   percpu: implement...
1484
   * @gfp: allocation flags
edcb46399   Tejun Heo   percpu, module: i...
1485
   *
5835d96e9   Tejun Heo   percpu: implement...
1486
1487
   * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
   * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1488
1489
1490
   * be called from any context but is a lot more likely to fail. If @gfp
   * has __GFP_NOWARN then no warning will be triggered on invalid or failed
   * allocation requests.
ccea34b5d   Tejun Heo   percpu: finer gra...
1491
   *
edcb46399   Tejun Heo   percpu, module: i...
1492
1493
1494
   * RETURNS:
   * Percpu pointer to the allocated area on success, NULL on failure.
   */
5835d96e9   Tejun Heo   percpu: implement...
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
  void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
  {
  	return pcpu_alloc(size, align, false, gfp);
  }
  EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
  
  /**
   * __alloc_percpu - allocate dynamic percpu area
   * @size: size of area to allocate in bytes
   * @align: alignment of area (max PAGE_SIZE)
   *
   * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
   */
43cf38eb5   Tejun Heo   percpu: add __per...
1508
  void __percpu *__alloc_percpu(size_t size, size_t align)
edcb46399   Tejun Heo   percpu, module: i...
1509
  {
5835d96e9   Tejun Heo   percpu: implement...
1510
  	return pcpu_alloc(size, align, false, GFP_KERNEL);
edcb46399   Tejun Heo   percpu, module: i...
1511
  }
fbf59bc9d   Tejun Heo   percpu: implement...
1512
  EXPORT_SYMBOL_GPL(__alloc_percpu);
edcb46399   Tejun Heo   percpu, module: i...
1513
1514
1515
1516
1517
  /**
   * __alloc_reserved_percpu - allocate reserved percpu area
   * @size: size of area to allocate in bytes
   * @align: alignment of area (max PAGE_SIZE)
   *
9329ba970   Tejun Heo   percpu: update co...
1518
1519
1520
1521
   * Allocate zero-filled percpu area of @size bytes aligned at @align
   * from reserved percpu area if arch has set it up; otherwise,
   * allocation is served from the same dynamic area.  Might sleep.
   * Might trigger writeouts.
edcb46399   Tejun Heo   percpu, module: i...
1522
   *
ccea34b5d   Tejun Heo   percpu: finer gra...
1523
1524
1525
   * CONTEXT:
   * Does GFP_KERNEL allocation.
   *
edcb46399   Tejun Heo   percpu, module: i...
1526
1527
1528
   * RETURNS:
   * Percpu pointer to the allocated area on success, NULL on failure.
   */
43cf38eb5   Tejun Heo   percpu: add __per...
1529
  void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
edcb46399   Tejun Heo   percpu, module: i...
1530
  {
5835d96e9   Tejun Heo   percpu: implement...
1531
  	return pcpu_alloc(size, align, true, GFP_KERNEL);
edcb46399   Tejun Heo   percpu, module: i...
1532
  }
a56dbddf0   Tejun Heo   percpu: move full...
1533
  /**
1a4d76076   Tejun Heo   percpu: implement...
1534
   * pcpu_balance_workfn - manage the amount of free chunks and populated pages
a56dbddf0   Tejun Heo   percpu: move full...
1535
1536
   * @work: unused
   *
47504ee04   Dennis Zhou   percpu: add __GFP...
1537
1538
1539
1540
1541
1542
   * Reclaim all fully free chunks except for the first one.  This is also
   * responsible for maintaining the pool of empty populated pages.  However,
   * it is possible that this is called when physical memory is scarce causing
   * OOM killer to be triggered.  We should avoid doing so until an actual
   * allocation causes the failure as it is possible that requests can be
   * serviced from already backed regions.
a56dbddf0   Tejun Heo   percpu: move full...
1543
   */
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1544
  static void pcpu_balance_workfn(struct work_struct *work)
fbf59bc9d   Tejun Heo   percpu: implement...
1545
  {
47504ee04   Dennis Zhou   percpu: add __GFP...
1546
  	/* gfp flags passed to underlying allocators */
554fef1c3   Dennis Zhou   percpu: allow sel...
1547
  	const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1548
1549
  	LIST_HEAD(to_free);
  	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
a56dbddf0   Tejun Heo   percpu: move full...
1550
  	struct pcpu_chunk *chunk, *next;
1a4d76076   Tejun Heo   percpu: implement...
1551
  	int slot, nr_to_pop, ret;
a56dbddf0   Tejun Heo   percpu: move full...
1552

1a4d76076   Tejun Heo   percpu: implement...
1553
1554
1555
1556
  	/*
  	 * There's no reason to keep around multiple unused chunks and VM
  	 * areas can be scarce.  Destroy all free chunks except for one.
  	 */
ccea34b5d   Tejun Heo   percpu: finer gra...
1557
1558
  	mutex_lock(&pcpu_alloc_mutex);
  	spin_lock_irq(&pcpu_lock);
a56dbddf0   Tejun Heo   percpu: move full...
1559

fe6bd8c3d   Tejun Heo   percpu: rename pc...
1560
  	list_for_each_entry_safe(chunk, next, free_head, list) {
a56dbddf0   Tejun Heo   percpu: move full...
1561
1562
1563
  		WARN_ON(chunk->immutable);
  
  		/* spare the first one */
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1564
  		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
a56dbddf0   Tejun Heo   percpu: move full...
1565
  			continue;
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1566
  		list_move(&chunk->list, &to_free);
a56dbddf0   Tejun Heo   percpu: move full...
1567
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1568
  	spin_unlock_irq(&pcpu_lock);
a56dbddf0   Tejun Heo   percpu: move full...
1569

fe6bd8c3d   Tejun Heo   percpu: rename pc...
1570
  	list_for_each_entry_safe(chunk, next, &to_free, list) {
a93ace487   Tejun Heo   percpu: move regi...
1571
  		int rs, re;
dca496451   Tejun Heo   percpu: move comm...
1572

91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
1573
1574
  		pcpu_for_each_pop_region(chunk->populated, rs, re, 0,
  					 chunk->nr_pages) {
a93ace487   Tejun Heo   percpu: move regi...
1575
  			pcpu_depopulate_chunk(chunk, rs, re);
b539b87fe   Tejun Heo   percpu: implmeent...
1576
1577
1578
  			spin_lock_irq(&pcpu_lock);
  			pcpu_chunk_depopulated(chunk, rs, re);
  			spin_unlock_irq(&pcpu_lock);
a93ace487   Tejun Heo   percpu: move regi...
1579
  		}
6081089fd   Tejun Heo   percpu: reorganiz...
1580
  		pcpu_destroy_chunk(chunk);
accd4f36a   Eric Dumazet   percpu: add a sch...
1581
  		cond_resched();
a56dbddf0   Tejun Heo   percpu: move full...
1582
  	}
971f3918a   Tejun Heo   percpu: fix pcpu_...
1583

1a4d76076   Tejun Heo   percpu: implement...
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
  	/*
  	 * Ensure there are certain number of free populated pages for
  	 * atomic allocs.  Fill up from the most packed so that atomic
  	 * allocs don't increase fragmentation.  If atomic allocation
  	 * failed previously, always populate the maximum amount.  This
  	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
  	 * failing indefinitely; however, large atomic allocs are not
  	 * something we support properly and can be highly unreliable and
  	 * inefficient.
  	 */
  retry_pop:
  	if (pcpu_atomic_alloc_failed) {
  		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
  		/* best effort anyway, don't worry about synchronization */
  		pcpu_atomic_alloc_failed = false;
  	} else {
  		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
  				  pcpu_nr_empty_pop_pages,
  				  0, PCPU_EMPTY_POP_PAGES_HIGH);
  	}
  
  	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
  		int nr_unpop = 0, rs, re;
  
  		if (!nr_to_pop)
  			break;
  
  		spin_lock_irq(&pcpu_lock);
  		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
8ab16c43e   Dennis Zhou (Facebook)   percpu: change th...
1613
  			nr_unpop = chunk->nr_pages - chunk->nr_populated;
1a4d76076   Tejun Heo   percpu: implement...
1614
1615
1616
1617
1618
1619
1620
1621
1622
  			if (nr_unpop)
  				break;
  		}
  		spin_unlock_irq(&pcpu_lock);
  
  		if (!nr_unpop)
  			continue;
  
  		/* @chunk can't go away while pcpu_alloc_mutex is held */
91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
1623
1624
  		pcpu_for_each_unpop_region(chunk->populated, rs, re, 0,
  					   chunk->nr_pages) {
1a4d76076   Tejun Heo   percpu: implement...
1625
  			int nr = min(re - rs, nr_to_pop);
47504ee04   Dennis Zhou   percpu: add __GFP...
1626
  			ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
1a4d76076   Tejun Heo   percpu: implement...
1627
1628
1629
  			if (!ret) {
  				nr_to_pop -= nr;
  				spin_lock_irq(&pcpu_lock);
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1630
  				pcpu_chunk_populated(chunk, rs, rs + nr, false);
1a4d76076   Tejun Heo   percpu: implement...
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
  				spin_unlock_irq(&pcpu_lock);
  			} else {
  				nr_to_pop = 0;
  			}
  
  			if (!nr_to_pop)
  				break;
  		}
  	}
  
  	if (nr_to_pop) {
  		/* ran out of chunks to populate, create a new one and retry */
47504ee04   Dennis Zhou   percpu: add __GFP...
1643
  		chunk = pcpu_create_chunk(gfp);
1a4d76076   Tejun Heo   percpu: implement...
1644
1645
1646
1647
1648
1649
1650
  		if (chunk) {
  			spin_lock_irq(&pcpu_lock);
  			pcpu_chunk_relocate(chunk, -1);
  			spin_unlock_irq(&pcpu_lock);
  			goto retry_pop;
  		}
  	}
971f3918a   Tejun Heo   percpu: fix pcpu_...
1651
  	mutex_unlock(&pcpu_alloc_mutex);
fbf59bc9d   Tejun Heo   percpu: implement...
1652
1653
1654
1655
1656
1657
  }
  
  /**
   * free_percpu - free percpu area
   * @ptr: pointer to area to free
   *
ccea34b5d   Tejun Heo   percpu: finer gra...
1658
1659
1660
1661
   * Free percpu area @ptr.
   *
   * CONTEXT:
   * Can be called from atomic context.
fbf59bc9d   Tejun Heo   percpu: implement...
1662
   */
43cf38eb5   Tejun Heo   percpu: add __per...
1663
  void free_percpu(void __percpu *ptr)
fbf59bc9d   Tejun Heo   percpu: implement...
1664
  {
129182e56   Andrew Morton   percpu: avoid cal...
1665
  	void *addr;
fbf59bc9d   Tejun Heo   percpu: implement...
1666
  	struct pcpu_chunk *chunk;
ccea34b5d   Tejun Heo   percpu: finer gra...
1667
  	unsigned long flags;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1668
  	int off;
fbf59bc9d   Tejun Heo   percpu: implement...
1669
1670
1671
  
  	if (!ptr)
  		return;
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
1672
  	kmemleak_free_percpu(ptr);
129182e56   Andrew Morton   percpu: avoid cal...
1673
  	addr = __pcpu_ptr_to_addr(ptr);
ccea34b5d   Tejun Heo   percpu: finer gra...
1674
  	spin_lock_irqsave(&pcpu_lock, flags);
fbf59bc9d   Tejun Heo   percpu: implement...
1675
1676
  
  	chunk = pcpu_chunk_addr_search(addr);
bba174f5e   Tejun Heo   percpu: add chunk...
1677
  	off = addr - chunk->base_addr;
fbf59bc9d   Tejun Heo   percpu: implement...
1678

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1679
  	pcpu_free_area(chunk, off);
fbf59bc9d   Tejun Heo   percpu: implement...
1680

a56dbddf0   Tejun Heo   percpu: move full...
1681
  	/* if there are more than one fully free chunks, wake up grim reaper */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1682
  	if (chunk->free_bytes == pcpu_unit_size) {
fbf59bc9d   Tejun Heo   percpu: implement...
1683
  		struct pcpu_chunk *pos;
a56dbddf0   Tejun Heo   percpu: move full...
1684
  		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
fbf59bc9d   Tejun Heo   percpu: implement...
1685
  			if (pos != chunk) {
1a4d76076   Tejun Heo   percpu: implement...
1686
  				pcpu_schedule_balance_work();
fbf59bc9d   Tejun Heo   percpu: implement...
1687
1688
1689
  				break;
  			}
  	}
df95e795a   Dennis Zhou   percpu: add trace...
1690
  	trace_percpu_free_percpu(chunk->base_addr, off, ptr);
ccea34b5d   Tejun Heo   percpu: finer gra...
1691
  	spin_unlock_irqrestore(&pcpu_lock, flags);
fbf59bc9d   Tejun Heo   percpu: implement...
1692
1693
  }
  EXPORT_SYMBOL_GPL(free_percpu);
383776fa7   Thomas Gleixner   locking/lockdep: ...
1694
  /**
   * __is_kernel_percpu_address - test whether address is from static percpu area
   * @addr: address to test
   * @can_addr: optional; when non-NULL and @addr matches, receives the
   *	      offset of @addr within the matching CPU's static area added
   *	      to the address of the boot CPU's copy
   *
   * On SMP, every possible CPU's copy of the area starting at
   * pcpu_base_addr is checked; @addr matches if it lies within the first
   * (__per_cpu_end - __per_cpu_start) bytes of one of them.
   *
   * RETURNS:
   * %true if @addr matched, %false otherwise.  Always %false on UP.
   */
  bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
  {
  #ifdef CONFIG_SMP
  	const size_t static_size = __per_cpu_end - __per_cpu_start;
  	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
  	unsigned int cpu;

  	for_each_possible_cpu(cpu) {
  		void *start = per_cpu_ptr(base, cpu);
  		void *va = (void *)addr;

  		if (va >= start && va < start + static_size) {
  			if (can_addr) {
  				/* offset in this CPU's copy ... */
  				*can_addr = (unsigned long) (va - start);
  				/* ... rebased onto the boot CPU's copy */
  				*can_addr += (unsigned long)
  					per_cpu_ptr(base, get_boot_cpu_id());
  			}
  			return true;
  		}
  	}
  #endif
  	/* on UP, can't distinguish from other static vars, always false */
  	return false;
  }
  
  /**
   * is_kernel_percpu_address - test whether address is from static percpu area
   * @addr: address to test
   *
   * Check whether @addr falls inside the in-kernel static percpu area.
   * Static percpu areas belonging to modules are not covered here; use
   * is_module_percpu_address() for those.
   *
   * RETURNS:
   * %true if @addr is from in-kernel static percpu area, %false otherwise.
   */
  bool is_kernel_percpu_address(unsigned long addr)
  {
  	/* the caller has no use for the canonical address */
  	unsigned long *no_can_addr = NULL;

  	return __is_kernel_percpu_address(addr, no_can_addr);
  }
  
  /**
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
1736
1737
1738
1739
1740
1741
1742
1743
   * per_cpu_ptr_to_phys - convert translated percpu address to physical address
   * @addr: the address to be converted to physical address
   *
   * Given @addr which is dereferenceable address obtained via one of
   * percpu access macros, this function translates it into its physical
   * address.  The caller is responsible for ensuring @addr stays valid
   * until this function finishes.
   *
67589c714   Dave Young   percpu: explain w...
1744
1745
1746
1747
1748
   * percpu allocator has special setup for the first chunk, which currently
   * supports either embedding in linear address space or vmalloc mapping,
   * and, from the second one, the backing allocator (currently either vm or
   * km) provides translation.
   *
bffc43758   Yannick Guerrini   percpu: Fix trivi...
1749
   * The addr can be translated simply without checking if it falls into the
67589c714   Dave Young   percpu: explain w...
1750
1751
1752
1753
1754
   * first chunk. But the current code reflects better how percpu allocator
   * actually works, and the verification can discover both bugs in percpu
   * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
   * code.
   *
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
1755
1756
1757
1758
1759
   * RETURNS:
   * The physical address for @addr.
   */
  phys_addr_t per_cpu_ptr_to_phys(void *addr)
  {
9983b6f0c   Tejun Heo   percpu: fix first...
1760
1761
  	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
  	bool in_first_chunk = false;
a855b84c3   Tejun Heo   percpu: fix chunk...
1762
  	unsigned long first_low, first_high;
9983b6f0c   Tejun Heo   percpu: fix first...
1763
1764
1765
  	unsigned int cpu;
  
  	/*
a855b84c3   Tejun Heo   percpu: fix chunk...
1766
  	 * The following test on unit_low/high isn't strictly
9983b6f0c   Tejun Heo   percpu: fix first...
1767
1768
  	 * necessary but will speed up lookups of addresses which
  	 * aren't in the first chunk.
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1769
1770
1771
1772
1773
  	 *
  	 * The address check is against full chunk sizes.  pcpu_base_addr
  	 * points to the beginning of the first chunk including the
  	 * static region.  Assumes good intent as the first chunk may
  	 * not be full (ie. < pcpu_unit_pages in size).
9983b6f0c   Tejun Heo   percpu: fix first...
1774
  	 */
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1775
1776
1777
1778
  	first_low = (unsigned long)pcpu_base_addr +
  		    pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
  	first_high = (unsigned long)pcpu_base_addr +
  		     pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
a855b84c3   Tejun Heo   percpu: fix chunk...
1779
1780
  	if ((unsigned long)addr >= first_low &&
  	    (unsigned long)addr < first_high) {
9983b6f0c   Tejun Heo   percpu: fix first...
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
  		for_each_possible_cpu(cpu) {
  			void *start = per_cpu_ptr(base, cpu);
  
  			if (addr >= start && addr < start + pcpu_unit_size) {
  				in_first_chunk = true;
  				break;
  			}
  		}
  	}
  
  	if (in_first_chunk) {
eac522ef4   David Howells   NOMMU: percpu sho...
1792
  		if (!is_vmalloc_addr(addr))
020ec6537   Tejun Heo   percpu: factor ou...
1793
1794
  			return __pa(addr);
  		else
9f57bd4d6   Eugene Surovegin   percpu: fix per_c...
1795
1796
  			return page_to_phys(vmalloc_to_page(addr)) +
  			       offset_in_page(addr);
020ec6537   Tejun Heo   percpu: factor ou...
1797
  	} else
9f57bd4d6   Eugene Surovegin   percpu: fix per_c...
1798
1799
  		return page_to_phys(pcpu_addr_to_page(addr)) +
  		       offset_in_page(addr);
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
1800
  }
fbf59bc9d   Tejun Heo   percpu: implement...
1801
  /**
   * pcpu_alloc_alloc_info - allocate percpu allocation info
   * @nr_groups: the number of groups
   * @nr_units: the number of units
   *
   * Allocate ai which is large enough for @nr_groups groups containing
   * @nr_units units.  The returned ai's groups[0].cpu_map points to the
   * cpu_map array which is long enough for @nr_units and filled with
   * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
   * pointer of other groups.
   *
   * The allocation size is rounded up with PFN_ALIGN() and recorded in
   * ai->__ai_size.
   *
   * RETURNS:
   * Pointer to the allocated pcpu_alloc_info on success, NULL on
   * failure.
   */
  struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
  						      int nr_units)
  {
  	struct pcpu_alloc_info *ai;
  	size_t base_size, ai_size;
  	void *ptr;
  	int unit;

  	/* header plus group array, padded so that cpu_map[] is aligned */
  	base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
  			  __alignof__(ai->groups[0].cpu_map[0]));
  	ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);

  	ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE);
  	if (!ptr)
  		return NULL;
  	ai = ptr;
  	ptr += base_size;

  	/* the cpu_map array lives right behind the group array */
  	ai->groups[0].cpu_map = ptr;

  	for (unit = 0; unit < nr_units; unit++)
  		ai->groups[0].cpu_map[unit] = NR_CPUS;

  	ai->nr_groups = nr_groups;
  	ai->__ai_size = PFN_ALIGN(ai_size);

  	return ai;
  }
  
  /**
   * pcpu_free_alloc_info - free percpu allocation info
   * @ai: pcpu_alloc_info to free
   *
   * Free @ai which was allocated by pcpu_alloc_alloc_info().  The size
   * handed back is the one recorded in @ai->__ai_size.
   */
  void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
  {
  	memblock_free_early(__pa(ai), ai->__ai_size);
  }
  
  /**
fd1e8a1fe   Tejun Heo   percpu: introduce...
1856
1857
1858
1859
1860
1861
1862
1863
   * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
   * @lvl: loglevel
   * @ai: allocation info to dump
   *
   * Print out information about @ai using loglevel @lvl.
   */
  static void pcpu_dump_alloc_info(const char *lvl,
  				 const struct pcpu_alloc_info *ai)
033e48fb8   Tejun Heo   percpu: move pcpu...
1864
  {
fd1e8a1fe   Tejun Heo   percpu: introduce...
1865
  	int group_width = 1, cpu_width = 1, width;
033e48fb8   Tejun Heo   percpu: move pcpu...
1866
  	char empty_str[] = "--------";
fd1e8a1fe   Tejun Heo   percpu: introduce...
1867
1868
1869
1870
1871
1872
1873
  	int alloc = 0, alloc_end = 0;
  	int group, v;
  	int upa, apl;	/* units per alloc, allocs per line */
  
  	v = ai->nr_groups;
  	while (v /= 10)
  		group_width++;
033e48fb8   Tejun Heo   percpu: move pcpu...
1874

fd1e8a1fe   Tejun Heo   percpu: introduce...
1875
  	v = num_possible_cpus();
033e48fb8   Tejun Heo   percpu: move pcpu...
1876
  	while (v /= 10)
fd1e8a1fe   Tejun Heo   percpu: introduce...
1877
1878
  		cpu_width++;
  	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
033e48fb8   Tejun Heo   percpu: move pcpu...
1879

fd1e8a1fe   Tejun Heo   percpu: introduce...
1880
1881
1882
  	upa = ai->alloc_size / ai->unit_size;
  	width = upa * (cpu_width + 1) + group_width + 3;
  	apl = rounddown_pow_of_two(max(60 / width, 1));
033e48fb8   Tejun Heo   percpu: move pcpu...
1883

fd1e8a1fe   Tejun Heo   percpu: introduce...
1884
1885
1886
  	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
  	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
  	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
033e48fb8   Tejun Heo   percpu: move pcpu...
1887

fd1e8a1fe   Tejun Heo   percpu: introduce...
1888
1889
1890
1891
1892
1893
1894
1895
  	for (group = 0; group < ai->nr_groups; group++) {
  		const struct pcpu_group_info *gi = &ai->groups[group];
  		int unit = 0, unit_end = 0;
  
  		BUG_ON(gi->nr_units % upa);
  		for (alloc_end += gi->nr_units / upa;
  		     alloc < alloc_end; alloc++) {
  			if (!(alloc % apl)) {
1170532bb   Joe Perches   mm: convert print...
1896
1897
  				pr_cont("
  ");
fd1e8a1fe   Tejun Heo   percpu: introduce...
1898
1899
  				printk("%spcpu-alloc: ", lvl);
  			}
1170532bb   Joe Perches   mm: convert print...
1900
  			pr_cont("[%0*d] ", group_width, group);
fd1e8a1fe   Tejun Heo   percpu: introduce...
1901
1902
1903
  
  			for (unit_end += upa; unit < unit_end; unit++)
  				if (gi->cpu_map[unit] != NR_CPUS)
1170532bb   Joe Perches   mm: convert print...
1904
1905
  					pr_cont("%0*d ",
  						cpu_width, gi->cpu_map[unit]);
fd1e8a1fe   Tejun Heo   percpu: introduce...
1906
  				else
1170532bb   Joe Perches   mm: convert print...
1907
  					pr_cont("%s ", empty_str);
033e48fb8   Tejun Heo   percpu: move pcpu...
1908
  		}
033e48fb8   Tejun Heo   percpu: move pcpu...
1909
  	}
1170532bb   Joe Perches   mm: convert print...
1910
1911
  	pr_cont("
  ");
033e48fb8   Tejun Heo   percpu: move pcpu...
1912
  }
033e48fb8   Tejun Heo   percpu: move pcpu...
1913

fbf59bc9d   Tejun Heo   percpu: implement...
1914
  /**
8d408b4be   Tejun Heo   percpu: give more...
1915
   * pcpu_setup_first_chunk - initialize the first percpu chunk
fd1e8a1fe   Tejun Heo   percpu: introduce...
1916
   * @ai: pcpu_alloc_info describing how to percpu area is shaped
38a6be525   Tejun Heo   percpu: simplify ...
1917
   * @base_addr: mapped address
8d408b4be   Tejun Heo   percpu: give more...
1918
1919
1920
   *
   * Initialize the first percpu chunk which contains the kernel static
   * perpcu area.  This function is to be called from arch percpu area
38a6be525   Tejun Heo   percpu: simplify ...
1921
   * setup path.
8d408b4be   Tejun Heo   percpu: give more...
1922
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
1923
1924
1925
1926
1927
1928
   * @ai contains all information necessary to initialize the first
   * chunk and prime the dynamic percpu allocator.
   *
   * @ai->static_size is the size of static percpu area.
   *
   * @ai->reserved_size, if non-zero, specifies the amount of bytes to
edcb46399   Tejun Heo   percpu, module: i...
1929
1930
1931
1932
1933
1934
1935
   * reserve after the static area in the first chunk.  This reserves
   * the first chunk such that it's available only through reserved
   * percpu allocation.  This is primarily used to serve module percpu
   * static areas on architectures where the addressing model has
   * limited offset range for symbol relocations to guarantee module
   * percpu symbols fall inside the relocatable range.
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
1936
1937
1938
   * @ai->dyn_size determines the number of bytes available for dynamic
   * allocation in the first chunk.  The area between @ai->static_size +
   * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
6074d5b0a   Tejun Heo   percpu: more flex...
1939
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
1940
1941
1942
   * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
   * and equal to or larger than @ai->static_size + @ai->reserved_size +
   * @ai->dyn_size.
8d408b4be   Tejun Heo   percpu: give more...
1943
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
1944
1945
   * @ai->atom_size is the allocation atom size and used as alignment
   * for vm areas.
8d408b4be   Tejun Heo   percpu: give more...
1946
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
1947
1948
1949
1950
1951
1952
1953
1954
1955
   * @ai->alloc_size is the allocation size and always multiple of
   * @ai->atom_size.  This is larger than @ai->atom_size if
   * @ai->unit_size is larger than @ai->atom_size.
   *
   * @ai->nr_groups and @ai->groups describe virtual memory layout of
   * percpu areas.  Units which should be colocated are put into the
   * same group.  Dynamic VM areas will be allocated according to these
   * groupings.  If @ai->nr_groups is zero, a single group containing
   * all units is assumed.
8d408b4be   Tejun Heo   percpu: give more...
1956
   *
38a6be525   Tejun Heo   percpu: simplify ...
1957
1958
   * The caller should have mapped the first chunk at @base_addr and
   * copied static data to each unit.
fbf59bc9d   Tejun Heo   percpu: implement...
1959
   *
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1960
1961
1962
1963
1964
1965
1966
   * The first chunk will always contain a static and a dynamic region.
   * However, the static region is not managed by any chunk.  If the first
   * chunk also contains a reserved region, it is served by two chunks -
   * one for the reserved region and one for the dynamic region.  They
   * share the same vm, but use offset regions in the area allocation map.
   * The chunk serving the dynamic region is circulated in the chunk slots
   * and available for dynamic allocation like any other chunk.
edcb46399   Tejun Heo   percpu, module: i...
1967
   *
fbf59bc9d   Tejun Heo   percpu: implement...
1968
   * RETURNS:
fb435d523   Tejun Heo   percpu: add pcpu_...
1969
   * 0 on success, -errno on failure.
fbf59bc9d   Tejun Heo   percpu: implement...
1970
   */
fb435d523   Tejun Heo   percpu: add pcpu_...
1971
1972
  int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
  				  void *base_addr)
fbf59bc9d   Tejun Heo   percpu: implement...
1973
  {
b9c39442c   Dennis Zhou (Facebook)   percpu: setup_fir...
1974
  	size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
1975
  	size_t static_size, dyn_size;
0c4169c3d   Dennis Zhou (Facebook)   percpu: setup_fir...
1976
  	struct pcpu_chunk *chunk;
6563297ce   Tejun Heo   percpu: use group...
1977
1978
  	unsigned long *group_offsets;
  	size_t *group_sizes;
fb435d523   Tejun Heo   percpu: add pcpu_...
1979
  	unsigned long *unit_off;
fbf59bc9d   Tejun Heo   percpu: implement...
1980
  	unsigned int cpu;
fd1e8a1fe   Tejun Heo   percpu: introduce...
1981
1982
  	int *unit_map;
  	int group, unit, i;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1983
1984
  	int map_size;
  	unsigned long tmp_addr;
fbf59bc9d   Tejun Heo   percpu: implement...
1985

635b75fc1   Tejun Heo   percpu: make pcpu...
1986
1987
  #define PCPU_SETUP_BUG_ON(cond)	do {					\
  	if (unlikely(cond)) {						\
870d4b12a   Joe Perches   mm: percpu: use p...
1988
1989
1990
1991
  		pr_emerg("failed to initialize, %s
  ", #cond);		\
  		pr_emerg("cpu_possible_mask=%*pb
  ",			\
807de073b   Tejun Heo   percpu: use %*pb[...
1992
  			 cpumask_pr_args(cpu_possible_mask));		\
635b75fc1   Tejun Heo   percpu: make pcpu...
1993
1994
1995
1996
  		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
  		BUG();							\
  	}								\
  } while (0)
2f39e637e   Tejun Heo   percpu: allow non...
1997
  	/* sanity checks */
635b75fc1   Tejun Heo   percpu: make pcpu...
1998
  	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
bbddff054   Tejun Heo   percpu: use percp...
1999
  #ifdef CONFIG_SMP
635b75fc1   Tejun Heo   percpu: make pcpu...
2000
  	PCPU_SETUP_BUG_ON(!ai->static_size);
f09f1243c   Alexander Kuleshov   mm/percpu: use of...
2001
  	PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
bbddff054   Tejun Heo   percpu: use percp...
2002
  #endif
635b75fc1   Tejun Heo   percpu: make pcpu...
2003
  	PCPU_SETUP_BUG_ON(!base_addr);
f09f1243c   Alexander Kuleshov   mm/percpu: use of...
2004
  	PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
635b75fc1   Tejun Heo   percpu: make pcpu...
2005
  	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
f09f1243c   Alexander Kuleshov   mm/percpu: use of...
2006
  	PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
635b75fc1   Tejun Heo   percpu: make pcpu...
2007
  	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
2008
  	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
099a19d91   Tejun Heo   percpu: allow lim...
2009
  	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
fb29a2cc6   Dennis Zhou (Facebook)   percpu: setup_fir...
2010
  	PCPU_SETUP_BUG_ON(!ai->dyn_size);
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2011
  	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
2012
2013
  	PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
  			    IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
9f6455325   Tejun Heo   percpu: move vmal...
2014
  	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
8d408b4be   Tejun Heo   percpu: give more...
2015

6563297ce   Tejun Heo   percpu: use group...
2016
  	/* process group information and build config tables accordingly */
999c17e3d   Santosh Shilimkar   mm/percpu.c: use ...
2017
2018
2019
2020
2021
2022
  	group_offsets = memblock_virt_alloc(ai->nr_groups *
  					     sizeof(group_offsets[0]), 0);
  	group_sizes = memblock_virt_alloc(ai->nr_groups *
  					   sizeof(group_sizes[0]), 0);
  	unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
  	unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
2f39e637e   Tejun Heo   percpu: allow non...
2023

fd1e8a1fe   Tejun Heo   percpu: introduce...
2024
  	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
ffe0d5a57   Tejun Heo   percpu: fix unit_...
2025
  		unit_map[cpu] = UINT_MAX;
a855b84c3   Tejun Heo   percpu: fix chunk...
2026
2027
2028
  
  	pcpu_low_unit_cpu = NR_CPUS;
  	pcpu_high_unit_cpu = NR_CPUS;
2f39e637e   Tejun Heo   percpu: allow non...
2029

fd1e8a1fe   Tejun Heo   percpu: introduce...
2030
2031
  	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
  		const struct pcpu_group_info *gi = &ai->groups[group];
2f39e637e   Tejun Heo   percpu: allow non...
2032

6563297ce   Tejun Heo   percpu: use group...
2033
2034
  		group_offsets[group] = gi->base_offset;
  		group_sizes[group] = gi->nr_units * ai->unit_size;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2035
2036
2037
2038
  		for (i = 0; i < gi->nr_units; i++) {
  			cpu = gi->cpu_map[i];
  			if (cpu == NR_CPUS)
  				continue;
8d408b4be   Tejun Heo   percpu: give more...
2039

9f295664e   Dan Carpenter   percpu: off by on...
2040
  			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
635b75fc1   Tejun Heo   percpu: make pcpu...
2041
2042
  			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
  			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
fbf59bc9d   Tejun Heo   percpu: implement...
2043

fd1e8a1fe   Tejun Heo   percpu: introduce...
2044
  			unit_map[cpu] = unit + i;
fb435d523   Tejun Heo   percpu: add pcpu_...
2045
  			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
a855b84c3   Tejun Heo   percpu: fix chunk...
2046
2047
2048
2049
2050
2051
2052
  			/* determine low/high unit_cpu */
  			if (pcpu_low_unit_cpu == NR_CPUS ||
  			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
  				pcpu_low_unit_cpu = cpu;
  			if (pcpu_high_unit_cpu == NR_CPUS ||
  			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
  				pcpu_high_unit_cpu = cpu;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2053
  		}
2f39e637e   Tejun Heo   percpu: allow non...
2054
  	}
fd1e8a1fe   Tejun Heo   percpu: introduce...
2055
2056
2057
  	pcpu_nr_units = unit;
  
  	for_each_possible_cpu(cpu)
635b75fc1   Tejun Heo   percpu: make pcpu...
2058
2059
2060
2061
  		PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
  
  	/* we're done parsing the input, undefine BUG macro and dump config */
  #undef PCPU_SETUP_BUG_ON
bcbea798f   Tejun Heo   percpu: print out...
2062
  	pcpu_dump_alloc_info(KERN_DEBUG, ai);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2063

6563297ce   Tejun Heo   percpu: use group...
2064
2065
2066
  	pcpu_nr_groups = ai->nr_groups;
  	pcpu_group_offsets = group_offsets;
  	pcpu_group_sizes = group_sizes;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2067
  	pcpu_unit_map = unit_map;
fb435d523   Tejun Heo   percpu: add pcpu_...
2068
  	pcpu_unit_offsets = unit_off;
2f39e637e   Tejun Heo   percpu: allow non...
2069
2070
  
  	/* determine basic parameters */
fd1e8a1fe   Tejun Heo   percpu: introduce...
2071
  	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
d9b55eeb1   Tejun Heo   percpu: remove un...
2072
  	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
6563297ce   Tejun Heo   percpu: use group...
2073
  	pcpu_atom_size = ai->atom_size;
ce3141a27   Tejun Heo   percpu: drop pcpu...
2074
2075
  	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
  		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
cafe8816b   Tejun Heo   percpu: use negat...
2076

30a5b5367   Dennis Zhou   percpu: expose st...
2077
  	pcpu_stats_save_ai(ai);
d9b55eeb1   Tejun Heo   percpu: remove un...
2078
2079
2080
2081
2082
  	/*
  	 * Allocate chunk slots.  The additional last slot is for
  	 * empty chunks.
  	 */
  	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
999c17e3d   Santosh Shilimkar   mm/percpu.c: use ...
2083
2084
  	pcpu_slot = memblock_virt_alloc(
  			pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
fbf59bc9d   Tejun Heo   percpu: implement...
2085
2086
  	for (i = 0; i < pcpu_nr_slots; i++)
  		INIT_LIST_HEAD(&pcpu_slot[i]);
edcb46399   Tejun Heo   percpu, module: i...
2087
  	/*
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
  	 * The end of the static region needs to be aligned with the
  	 * minimum allocation size as this offsets the reserved and
  	 * dynamic region.  The first chunk ends page aligned by
  	 * expanding the dynamic region, therefore the dynamic region
  	 * can be shrunk to compensate while still staying above the
  	 * configured sizes.
  	 */
  	static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
  	dyn_size = ai->dyn_size - (static_size - ai->static_size);
  
  	/*
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
2099
2100
2101
2102
2103
2104
  	 * Initialize first chunk.
  	 * If the reserved_size is non-zero, this initializes the reserved
  	 * chunk.  If the reserved_size is zero, the reserved chunk is NULL
  	 * and the dynamic region is initialized here.  The first chunk,
  	 * pcpu_first_chunk, will always point to the chunk that serves
  	 * the dynamic region.
edcb46399   Tejun Heo   percpu, module: i...
2105
  	 */
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2106
2107
  	tmp_addr = (unsigned long)base_addr + static_size;
  	map_size = ai->reserved_size ?: dyn_size;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
2108
  	chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
61ace7fa2   Tejun Heo   percpu: improve f...
2109

edcb46399   Tejun Heo   percpu, module: i...
2110
  	/* init dynamic chunk if necessary */
b9c39442c   Dennis Zhou (Facebook)   percpu: setup_fir...
2111
  	if (ai->reserved_size) {
0c4169c3d   Dennis Zhou (Facebook)   percpu: setup_fir...
2112
  		pcpu_reserved_chunk = chunk;
b9c39442c   Dennis Zhou (Facebook)   percpu: setup_fir...
2113

d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2114
  		tmp_addr = (unsigned long)base_addr + static_size +
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
2115
  			   ai->reserved_size;
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2116
  		map_size = dyn_size;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
2117
  		chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
edcb46399   Tejun Heo   percpu, module: i...
2118
  	}
2441d15c9   Tejun Heo   percpu: cosmetic ...
2119
  	/* link the first chunk in */
0c4169c3d   Dennis Zhou (Facebook)   percpu: setup_fir...
2120
  	pcpu_first_chunk = chunk;
0cecf50cf   Dennis Zhou (Facebook)   percpu: introduce...
2121
  	pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
ae9e6bc9f   Tejun Heo   percpu: don't put...
2122
  	pcpu_chunk_relocate(pcpu_first_chunk, -1);
fbf59bc9d   Tejun Heo   percpu: implement...
2123

7e8a6304d   Dennis Zhou (Facebook)   /proc/meminfo: ad...
2124
2125
  	/* include all regions of the first chunk */
  	pcpu_nr_populated += PFN_DOWN(size_sum);
30a5b5367   Dennis Zhou   percpu: expose st...
2126
  	pcpu_stats_chunk_alloc();
df95e795a   Dennis Zhou   percpu: add trace...
2127
  	trace_percpu_create_chunk(base_addr);
30a5b5367   Dennis Zhou   percpu: expose st...
2128

fbf59bc9d   Tejun Heo   percpu: implement...
2129
  	/* we're done */
bba174f5e   Tejun Heo   percpu: add chunk...
2130
  	pcpu_base_addr = base_addr;
fb435d523   Tejun Heo   percpu: add pcpu_...
2131
  	return 0;
fbf59bc9d   Tejun Heo   percpu: implement...
2132
  }
66c3a7577   Tejun Heo   percpu: generaliz...
2133

bbddff054   Tejun Heo   percpu: use percp...
2134
  #ifdef CONFIG_SMP
17f3609c2   Andi Kleen   sections: fix sec...
2135
  const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
f58dc01ba   Tejun Heo   percpu: generaliz...
2136
2137
2138
  	[PCPU_FC_AUTO]	= "auto",
  	[PCPU_FC_EMBED]	= "embed",
  	[PCPU_FC_PAGE]	= "page",
f58dc01ba   Tejun Heo   percpu: generaliz...
2139
  };
66c3a7577   Tejun Heo   percpu: generaliz...
2140

f58dc01ba   Tejun Heo   percpu: generaliz...
2141
  enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
66c3a7577   Tejun Heo   percpu: generaliz...
2142

f58dc01ba   Tejun Heo   percpu: generaliz...
2143
2144
  static int __init percpu_alloc_setup(char *str)
  {
5479c78ac   Cyrill Gorcunov   mm, percpu: Make ...
2145
2146
  	if (!str)
  		return -EINVAL;
f58dc01ba   Tejun Heo   percpu: generaliz...
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
  	if (0)
  		/* nada */;
  #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
  	else if (!strcmp(str, "embed"))
  		pcpu_chosen_fc = PCPU_FC_EMBED;
  #endif
  #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
  	else if (!strcmp(str, "page"))
  		pcpu_chosen_fc = PCPU_FC_PAGE;
  #endif
f58dc01ba   Tejun Heo   percpu: generaliz...
2157
  	else
870d4b12a   Joe Perches   mm: percpu: use p...
2158
2159
  		pr_warn("unknown allocator %s specified
  ", str);
66c3a7577   Tejun Heo   percpu: generaliz...
2160

f58dc01ba   Tejun Heo   percpu: generaliz...
2161
  	return 0;
66c3a7577   Tejun Heo   percpu: generaliz...
2162
  }
f58dc01ba   Tejun Heo   percpu: generaliz...
2163
  early_param("percpu_alloc", percpu_alloc_setup);
66c3a7577   Tejun Heo   percpu: generaliz...
2164

3c9a024fd   Tejun Heo   percpu: fix build...
2165
2166
2167
2168
2169
  /*
   * pcpu_embed_first_chunk() is used by the generic percpu setup.
   * Build it if needed by the arch config or the generic setup is going
   * to be used.
   */
  #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
  	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
  #define BUILD_EMBED_FIRST_CHUNK
  #endif

  /* build pcpu_page_first_chunk() iff needed by the arch config */
  #if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
  #define BUILD_PAGE_FIRST_CHUNK
  #endif
  
  /* pcpu_build_alloc_info() is used by both embed and page first chunk */
  #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
  /**
   * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
   * @reserved_size: the size of reserved percpu area in bytes
   * @dyn_size: minimum free size for dynamic allocation in bytes
   * @atom_size: allocation atom size
   * @cpu_distance_fn: callback to determine distance between cpus, optional
   *
   * This function determines grouping of units, their mappings to cpus
   * and other parameters considering needed percpu size, allocation
   * atom size and distances between CPUs.
   *
   * Groups are always multiples of atom size and CPUs which are of
   * LOCAL_DISTANCE both ways are grouped together and share space for
   * units in the same group.  The returned configuration is guaranteed
   * to have CPUs on different nodes on different groups and >=75% usage
   * of allocated virtual address space.
   *
   * RETURNS:
   * On success, pointer to the new allocation_info is returned.  On
   * failure, ERR_PTR value is returned.
   */
  static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
  				size_t reserved_size, size_t dyn_size,
  				size_t atom_size,
  				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
  {
  	static int group_map[NR_CPUS] __initdata;
  	static int group_cnt[NR_CPUS] __initdata;
  	const size_t static_size = __per_cpu_end - __per_cpu_start;
  	int nr_groups = 1, nr_units = 0;
  	size_t size_sum, min_unit_size, alloc_size;
  	int upa, max_upa, best_upa = 0;	/* units_per_alloc */
  	int last_allocs, group, unit;
  	unsigned int cpu, tcpu;
  	struct pcpu_alloc_info *ai;
  	unsigned int *cpu_map;

  	/* this function may be called multiple times */
  	memset(group_map, 0, sizeof(group_map));
  	memset(group_cnt, 0, sizeof(group_cnt));

  	/* calculate size_sum and ensure dyn_size is enough for early alloc */
  	size_sum = PFN_ALIGN(static_size + reserved_size +
  			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
  	dyn_size = size_sum - static_size - reserved_size;

  	/*
  	 * Determine min_unit_size, alloc_size and max_upa such that
  	 * alloc_size is multiple of atom_size and is the smallest
  	 * which can accommodate 4k aligned segments which are equal to
  	 * or larger than min_unit_size.
  	 */
  	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

  	/* determine the maximum # of units that can fit in an allocation */
  	alloc_size = roundup(min_unit_size, atom_size);
  	upa = alloc_size / min_unit_size;
  	while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
  		upa--;
  	max_upa = upa;

  	/*
  	 * Group cpus according to their proximity.  A cpu is moved to the
  	 * next group as soon as it is found to be non-local (in either
  	 * direction) to an earlier cpu already placed in the candidate
  	 * group, so groups are numbered densely from 0.
  	 */
  	for_each_possible_cpu(cpu) {
  		group = 0;
  	next_group:
  		for_each_possible_cpu(tcpu) {
  			if (cpu == tcpu)
  				break;
  			if (group_map[tcpu] == group && cpu_distance_fn &&
  			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
  			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
  				group++;
  				nr_groups = max(nr_groups, group + 1);
  				goto next_group;
  			}
  		}
  		group_map[cpu] = group;
  		group_cnt[group]++;
  	}

  	/*
  	 * Wasted space is caused by a ratio imbalance of upa to group_cnt.
  	 * Expand the unit_size until we use >= 75% of the units allocated.
  	 * Related to atom_size, which could be much larger than the unit_size.
  	 */
  	last_allocs = INT_MAX;
  	for (upa = max_upa; upa; upa--) {
  		int allocs = 0, wasted = 0;

  		if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
  			continue;

  		for (group = 0; group < nr_groups; group++) {
  			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
  			allocs += this_allocs;
  			wasted += this_allocs * upa - group_cnt[group];
  		}

  		/*
  		 * Don't accept if wastage is over 1/3.  The
  		 * greater-than comparison ensures upa==1 always
  		 * passes the following check.
  		 */
  		if (wasted > num_possible_cpus() / 3)
  			continue;

  		/* and then don't consume more memory */
  		if (allocs > last_allocs)
  			break;
  		last_allocs = allocs;
  		best_upa = upa;
  	}
  	/* no candidate accepted would make every division below invalid */
  	BUG_ON(!best_upa);
  	upa = best_upa;

  	/* allocate and fill alloc_info */
  	for (group = 0; group < nr_groups; group++)
  		nr_units += roundup(group_cnt[group], upa);

  	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
  	if (!ai)
  		return ERR_PTR(-ENOMEM);
  	cpu_map = ai->groups[0].cpu_map;

  	/* carve the single cpu_map array into per-group slices */
  	for (group = 0; group < nr_groups; group++) {
  		ai->groups[group].cpu_map = cpu_map;
  		cpu_map += roundup(group_cnt[group], upa);
  	}

  	ai->static_size = static_size;
  	ai->reserved_size = reserved_size;
  	ai->dyn_size = dyn_size;
  	ai->unit_size = alloc_size / upa;
  	ai->atom_size = atom_size;
  	ai->alloc_size = alloc_size;

  	/*
  	 * Iterate by nr_groups rather than until group_cnt[] hits zero:
  	 * group_cnt[] has NR_CPUS entries and no terminating zero when
  	 * every cpu ends up in its own group.
  	 */
  	for (group = 0, unit = 0; group < nr_groups; group++) {
  		struct pcpu_group_info *gi = &ai->groups[group];

  		/*
  		 * Initialize base_offset as if all groups are located
  		 * back-to-back.  The caller should update this to
  		 * reflect actual allocation.
  		 */
  		gi->base_offset = unit * ai->unit_size;

  		for_each_possible_cpu(cpu)
  			if (group_map[cpu] == group)
  				gi->cpu_map[gi->nr_units++] = cpu;
  		gi->nr_units = roundup(gi->nr_units, upa);
  		unit += gi->nr_units;
  	}
  	BUG_ON(unit != nr_units);

  	return ai;
  }
  #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
  
  #if defined(BUILD_EMBED_FIRST_CHUNK)
  /**
   * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
   * @reserved_size: the size of reserved percpu area in bytes
   * @dyn_size: minimum free size for dynamic allocation in bytes
   * @atom_size: allocation atom size
   * @cpu_distance_fn: callback to determine distance between cpus, optional
   * @alloc_fn: function to allocate percpu page
   * @free_fn: function to free percpu page
   *
   * This is a helper to ease setting up embedded first percpu chunk and
   * can be called where pcpu_setup_first_chunk() is expected.
   *
   * If this function is used to setup the first chunk, it is allocated
   * by calling @alloc_fn and used as-is without being mapped into
   * vmalloc area.  Allocations are always whole multiples of @atom_size
   * aligned to @atom_size.
   *
   * This enables the first chunk to piggy back on the linear physical
   * mapping which often uses larger page size.  Please note that this
   * can result in very sparse cpu->unit mapping on NUMA machines thus
   * requiring large vmalloc address space.  Don't use this allocator if
   * vmalloc space is not orders of magnitude larger than distances
   * between node memory addresses (ie. 32bit NUMA machines).
   *
   * @dyn_size specifies the minimum dynamic area size.
   *
   * If the needed size is smaller than the minimum or specified unit
   * size, the leftover is returned using @free_fn.
   *
   * RETURNS:
   * 0 on success, -errno on failure.
   */
  int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
  				  size_t atom_size,
  				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
  				  pcpu_fc_alloc_fn_t alloc_fn,
  				  pcpu_fc_free_fn_t free_fn)
  {
  	void *base = (void *)ULONG_MAX;
  	void **areas = NULL;
  	struct pcpu_alloc_info *ai;
  	size_t size_sum, areas_size;
  	unsigned long max_distance;
  	int group, i, highest_group, rc;

  	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
  				   cpu_distance_fn);
  	if (IS_ERR(ai))
  		return PTR_ERR(ai);

  	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
  	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

  	areas = memblock_virt_alloc_nopanic(areas_size, 0);
  	if (!areas) {
  		rc = -ENOMEM;
  		goto out_free;
  	}

  	/* allocate, copy and determine base address & max_distance */
  	highest_group = 0;
  	for (group = 0; group < ai->nr_groups; group++) {
  		struct pcpu_group_info *gi = &ai->groups[group];
  		unsigned int cpu = NR_CPUS;
  		void *ptr;

  		/* pick the first mapped cpu of the group for the allocation */
  		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
  			cpu = gi->cpu_map[i];
  		BUG_ON(cpu == NR_CPUS);

  		/* allocate space for the whole group */
  		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
  		if (!ptr) {
  			rc = -ENOMEM;
  			goto out_free_areas;
  		}
  		/* kmemleak tracks the percpu allocations separately */
  		kmemleak_free(ptr);
  		areas[group] = ptr;

  		base = min(ptr, base);
  		if (ptr > areas[highest_group])
  			highest_group = group;
  	}
  	/* span from the lowest group base to the end of the highest group */
  	max_distance = areas[highest_group] - base;
  	max_distance += ai->unit_size * ai->groups[highest_group].nr_units;

  	/* warn if maximum distance is further than 75% of vmalloc space */
  	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
  		pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
  				max_distance, VMALLOC_TOTAL);
  #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
  		/* and fail if we have fallback */
  		rc = -EINVAL;
  		goto out_free_areas;
  #endif
  	}

  	/*
  	 * Copy data and free unused parts.  This should happen after all
  	 * allocations are complete; otherwise, we may end up with
  	 * overlapping groups.
  	 */
  	for (group = 0; group < ai->nr_groups; group++) {
  		struct pcpu_group_info *gi = &ai->groups[group];
  		void *ptr = areas[group];

  		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
  			if (gi->cpu_map[i] == NR_CPUS) {
  				/* unused unit, free whole */
  				free_fn(ptr, ai->unit_size);
  				continue;
  			}
  			/* copy and return the unused part */
  			memcpy(ptr, __per_cpu_load, ai->static_size);
  			free_fn(ptr + size_sum, ai->unit_size - size_sum);
  		}
  	}

  	/* base address is now known, determine group base offsets */
  	for (group = 0; group < ai->nr_groups; group++) {
  		ai->groups[group].base_offset = areas[group] - base;
  	}

  	pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
  		PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
  		ai->dyn_size, ai->unit_size);

  	rc = pcpu_setup_first_chunk(ai, base);
  	goto out_free;

  out_free_areas:
  	/* only groups that were successfully allocated have a non-NULL area */
  	for (group = 0; group < ai->nr_groups; group++)
  		if (areas[group])
  			free_fn(areas[group],
  				ai->groups[group].nr_units * ai->unit_size);
  out_free:
  	pcpu_free_alloc_info(ai);
  	if (areas)
  		memblock_free_early(__pa(areas), areas_size);
  	return rc;
  }
  #endif /* BUILD_EMBED_FIRST_CHUNK */
d4b95f803   Tejun Heo   x86,percpu: gener...
2483

3c9a024fd   Tejun Heo   percpu: fix build...
2484
  #ifdef BUILD_PAGE_FIRST_CHUNK
  /**
   * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
   * @reserved_size: the size of reserved percpu area in bytes
   * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
   * @free_fn: function to free percpu page, always called with PAGE_SIZE
   * @populate_pte_fn: function to populate pte
   *
   * This is a helper to ease setting up page-remapped first percpu
   * chunk and can be called where pcpu_setup_first_chunk() is expected.
   *
   * This is the basic allocator.  Static percpu area is allocated
   * page-by-page into vmalloc area.
   *
   * RETURNS:
   * 0 on success, -errno on failure.
   */
  int __init pcpu_page_first_chunk(size_t reserved_size,
  				 pcpu_fc_alloc_fn_t alloc_fn,
  				 pcpu_fc_free_fn_t free_fn,
  				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
  {
  	static struct vm_struct vm;
  	struct pcpu_alloc_info *ai;
  	char psize_str[16];
  	int unit_pages;
  	size_t pages_size;
  	struct page **pages;
  	int unit, i, j, rc;
  	int upa;
  	int nr_g0_units;

  	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

  	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
  	if (IS_ERR(ai))
  		return PTR_ERR(ai);
  	BUG_ON(ai->nr_groups != 1);
  	/* the single group must hold every possible cpu, rounded up to upa */
  	upa = ai->alloc_size/ai->unit_size;
  	nr_g0_units = roundup(num_possible_cpus(), upa);
  	if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) {
  		pcpu_free_alloc_info(ai);
  		return -EINVAL;
  	}

  	unit_pages = ai->unit_size >> PAGE_SHIFT;

  	/* unaligned allocations can't be freed, round up to page size */
  	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
  			       sizeof(pages[0]));
  	pages = memblock_virt_alloc(pages_size, 0);

  	/* allocate pages */
  	j = 0;
  	for (unit = 0; unit < num_possible_cpus(); unit++) {
  		unsigned int cpu = ai->groups[0].cpu_map[unit];
  		for (i = 0; i < unit_pages; i++) {
  			void *ptr;

  			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
  			if (!ptr) {
  				pr_warn("failed to allocate %s page for cpu%u\n",
  						psize_str, cpu);
  				goto enomem;
  			}
  			/* kmemleak tracks the percpu allocations separately */
  			kmemleak_free(ptr);
  			pages[j++] = virt_to_page(ptr);
  		}
  	}

  	/* allocate vm area, map the pages and copy static data */
  	vm.flags = VM_ALLOC;
  	vm.size = num_possible_cpus() * ai->unit_size;
  	vm_area_register_early(&vm, PAGE_SIZE);

  	for (unit = 0; unit < num_possible_cpus(); unit++) {
  		unsigned long unit_addr =
  			(unsigned long)vm.addr + unit * ai->unit_size;

  		for (i = 0; i < unit_pages; i++)
  			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));

  		/* pte already populated, the following shouldn't fail */
  		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
  				      unit_pages);
  		if (rc < 0)
  			panic("failed to map percpu area, err=%d\n", rc);

  		/*
  		 * FIXME: Archs with virtual cache should flush local
  		 * cache for the linear mapping here - something
  		 * equivalent to flush_cache_vmap() on the local cpu.
  		 * flush_cache_vmap() can't be used as most supporting
  		 * data structures are not set up yet.
  		 */

  		/* copy static data */
  		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
  	}

  	/* we're ready, commit */
  	pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n",
  		unit_pages, psize_str, vm.addr, ai->static_size,
  		ai->reserved_size, ai->dyn_size);

  	rc = pcpu_setup_first_chunk(ai, vm.addr);
  	goto out_free_ar;

  enomem:
  	/* release the j pages obtained so far, newest first */
  	while (--j >= 0)
  		free_fn(page_address(pages[j]), PAGE_SIZE);
  	rc = -ENOMEM;
  out_free_ar:
  	memblock_free_early(__pa(pages), pages_size);
  	pcpu_free_alloc_info(ai);
  	return rc;
  }
  #endif /* BUILD_PAGE_FIRST_CHUNK */
d4b95f803   Tejun Heo   x86,percpu: gener...
2603

bbddff054   Tejun Heo   percpu: use percp...
2604
  #ifndef	CONFIG_HAVE_SETUP_PER_CPU_AREA
8c4bfc6e8   Tejun Heo   x86,percpu: gener...
2605
  /*
bbddff054   Tejun Heo   percpu: use percp...
2606
   * Generic SMP percpu area setup.
e74e39620   Tejun Heo   percpu: use dynam...
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
   *
   * The embedding helper is used because its behavior closely resembles
   * the original non-dynamic generic percpu area setup.  This is
   * important because many archs have addressing restrictions and might
   * fail if the percpu area is located far away from the previous
   * location.  As an added bonus, in non-NUMA cases, embedding is
   * generally a good idea TLB-wise because percpu area can piggy back
   * on the physical linear memory mapping which uses large page
   * mappings on applicable archs.
   */
e74e39620   Tejun Heo   percpu: use dynam...
2617
2618
  /*
   * Per-cpu offset table, indexed by cpu number.  setup_per_cpu_areas()
   * below fills in one entry for each possible cpu.
   */
  unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
  EXPORT_SYMBOL(__per_cpu_offset);
c8826dd53   Tejun Heo   percpu: update em...
2619
2620
2621
  /*
   * Default first chunk allocation callback for the generic setup.
   * @cpu is not used; @size bytes aligned to @align are requested from
   * memblock starting at __pa(MAX_DMA_ADDRESS).  The _nopanic variant is
   * used, so the result is passed back to the caller to check.
   */
  static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
  				       size_t align)
  {
  	return  memblock_virt_alloc_from_nopanic(
  			size, align, __pa(MAX_DMA_ADDRESS));
  }
66c3a7577   Tejun Heo   percpu: generaliz...
2625

c8826dd53   Tejun Heo   percpu: update em...
2626
2627
  /*
   * Default first chunk free callback for the generic setup: hands @size
   * bytes starting at virtual address @ptr back to memblock.
   */
  static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
  {
  	memblock_free_early(__pa(ptr), size);
  }
e74e39620   Tejun Heo   percpu: use dynam...
2630
2631
  /*
   * Generic SMP entry point: embed the first chunk using the default
   * alloc/free callbacks, then derive each possible cpu's entry in
   * __per_cpu_offset[] from the chunk base address and its unit offset.
   * Panics if the first chunk cannot be set up.
   */
  void __init setup_per_cpu_areas(void)
  {
  	unsigned long delta;
  	unsigned int cpu;
  	int rc;

  	/*
  	 * Always reserve area for module percpu variables.  That's
  	 * what the legacy allocator did.
  	 */
  	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
  				    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
  				    pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
  	if (rc < 0)
  		panic("Failed to initialize percpu areas.");

  	/* distance from the linked static percpu section to the chunk base */
  	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
  	for_each_possible_cpu(cpu)
  		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
  }
bbddff054   Tejun Heo   percpu: use percp...
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
  #endif	/* CONFIG_HAVE_SETUP_PER_CPU_AREA */
  
  #else	/* CONFIG_SMP */
  
  /*
   * UP percpu area setup.
   *
   * UP always uses km-based percpu allocator with identity mapping.
   * Static percpu variables are indistinguishable from the usual static
   * variables and don't require any special preparation.
   *
   * A single group with a single unit mapped to cpu 0 is described, with
   * the whole unit handed over as dynamic area.  Panics if either the
   * alloc_info or the unit itself cannot be allocated, or if the first
   * chunk setup fails.
   */
  void __init setup_per_cpu_areas(void)
  {
  	const size_t unit_size =
  		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
  					 PERCPU_DYNAMIC_RESERVE));
  	struct pcpu_alloc_info *ai;
  	void *fc;

  	ai = pcpu_alloc_alloc_info(1, 1);
  	fc = memblock_virt_alloc_from_nopanic(unit_size,
  					      PAGE_SIZE,
  					      __pa(MAX_DMA_ADDRESS));
  	if (!ai || !fc)
  		panic("Failed to allocate memory for percpu areas.");
  	/* kmemleak tracks the percpu allocations separately */
  	kmemleak_free(fc);

  	ai->dyn_size = unit_size;
  	ai->unit_size = unit_size;
  	ai->atom_size = unit_size;
  	ai->alloc_size = unit_size;
  	ai->groups[0].nr_units = 1;
  	ai->groups[0].cpu_map[0] = 0;

  	if (pcpu_setup_first_chunk(ai, fc) < 0)
  		panic("Failed to initialize percpu areas.");
  	pcpu_free_alloc_info(ai);
  }
  
  #endif	/* CONFIG_SMP */
099a19d91   Tejun Heo   percpu: allow lim...
2691
2692
  
  /*
   * pcpu_nr_pages - calculate total number of populated backing pages
   *
   * The figure is the product of the populated page count and the number
   * of units.  Metadata is deliberately left out of the number exposed in
   * meminfo: backing pages scale with the number of cpus and can quickly
   * outweigh the memory used for metadata, and leaving it out keeps this
   * calculation nice and simple.
   *
   * RETURNS:
   * Total number of populated backing pages in use by the allocator.
   */
  unsigned long pcpu_nr_pages(void)
  {
  	const unsigned long nr_backing = pcpu_nr_populated * pcpu_nr_units;

  	return nr_backing;
  }
  
  /*
   * Percpu allocator is initialized early during boot when neither slab nor
   * workqueue is available.  Plug async management until everything is up
   * and running.
   *
   * Registered through subsys_initcall(); sets pcpu_async_enabled and
   * always returns 0.
   */
  static int __init percpu_enable_async(void)
  {
  	pcpu_async_enabled = true;
  	return 0;
  }
  subsys_initcall(percpu_enable_async);