Blame view

mm/percpu.c 92.3 KB
55716d264   Thomas Gleixner   treewide: Replace...
1
  // SPDX-License-Identifier: GPL-2.0-only
fbf59bc9d   Tejun Heo   percpu: implement...
2
  /*
88999a898   Tejun Heo   percpu: misc prep...
3
   * mm/percpu.c - percpu memory allocator
fbf59bc9d   Tejun Heo   percpu: implement...
4
5
6
7
   *
   * Copyright (C) 2009		SUSE Linux Products GmbH
   * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
   *
5e81ee3e6   Dennis Zhou (Facebook)   percpu: update he...
8
9
10
   * Copyright (C) 2017		Facebook Inc.
   * Copyright (C) 2017		Dennis Zhou <dennisszhou@gmail.com>
   *
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
11
12
13
14
   * The percpu allocator handles both static and dynamic areas.  Percpu
   * areas are allocated in chunks which are divided into units.  There is
   * a 1-to-1 mapping for units to possible cpus.  These units are grouped
   * based on NUMA properties of the machine.
fbf59bc9d   Tejun Heo   percpu: implement...
15
16
17
18
19
20
   *
   *  c0                           c1                         c2
   *  -------------------          -------------------        ------------
   * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
   *  -------------------  ......  -------------------  ....  ------------
   *
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
21
22
23
24
25
26
27
28
29
   * Allocation is done by offsets into a unit's address space.  I.e., an
   * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
   * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
   * and even sparse.  Access is handled by configuring percpu base
   * registers according to the cpu to unit mappings and offsetting the
   * base address using pcpu_unit_size.
   *
   * There is special consideration for the first chunk which must handle
   * the static percpu variables in the kernel image as allocation services
5e81ee3e6   Dennis Zhou (Facebook)   percpu: update he...
30
   * are not online yet.  In short, the first chunk is structured like so:
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
31
32
33
34
35
36
37
   *
   *                  <Static | [Reserved] | Dynamic>
   *
   * The static data is copied from the original section managed by the
   * linker.  The reserved section, if non-zero, primarily manages static
   * percpu variables from kernel modules.  Finally, the dynamic section
   * takes care of normal allocations.
fbf59bc9d   Tejun Heo   percpu: implement...
38
   *
5e81ee3e6   Dennis Zhou (Facebook)   percpu: update he...
39
40
41
42
43
44
45
46
47
48
49
50
51
52
   * The allocator organizes chunks into lists according to free size and
   * tries to allocate from the fullest chunk first.  Each chunk is managed
   * by a bitmap with metadata blocks.  The allocation map is updated on
   * every allocation and free to reflect the current state while the boundary
   * map is only updated on allocation.  Each metadata block contains
   * information to help mitigate the need to iterate over large portions
   * of the bitmap.  The reverse mapping from page to chunk is stored in
   * the page's index.  Lastly, units are lazily backed and grow in unison.
   *
   * There is a unique conversion that goes on here between bytes and bits.
   * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
   * tracks the number of pages it is responsible for in nr_pages.  Helper
   * functions are used to convert between bytes, bits, and blocks.
   * All hints are managed in bits unless explicitly stated.
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
53
   *
4091fb95b   Masahiro Yamada   scripts/spelling....
54
   * To use this allocator, arch code should do the following:
fbf59bc9d   Tejun Heo   percpu: implement...
55
   *
fbf59bc9d   Tejun Heo   percpu: implement...
56
   * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
e01009833   Tejun Heo   percpu: make x86 ...
57
58
   *   regular address to percpu pointer and back if they need to be
   *   different from the default
fbf59bc9d   Tejun Heo   percpu: implement...
59
   *
8d408b4be   Tejun Heo   percpu: give more...
60
61
   * - use pcpu_setup_first_chunk() during percpu area initialization to
   *   setup the first chunk containing the kernel static percpu area
fbf59bc9d   Tejun Heo   percpu: implement...
62
   */
870d4b12a   Joe Perches   mm: percpu: use p...
63
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
fbf59bc9d   Tejun Heo   percpu: implement...
64
  #include <linux/bitmap.h>
57c8a661d   Mike Rapoport   mm: remove includ...
65
  #include <linux/memblock.h>
fd1e8a1fe   Tejun Heo   percpu: introduce...
66
  #include <linux/err.h>
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
67
  #include <linux/lcm.h>
fbf59bc9d   Tejun Heo   percpu: implement...
68
  #include <linux/list.h>
a530b7958   Tejun Heo   percpu: teach lar...
69
  #include <linux/log2.h>
fbf59bc9d   Tejun Heo   percpu: implement...
70
71
72
73
74
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/mutex.h>
  #include <linux/percpu.h>
  #include <linux/pfn.h>
fbf59bc9d   Tejun Heo   percpu: implement...
75
  #include <linux/slab.h>
ccea34b5d   Tejun Heo   percpu: finer gra...
76
  #include <linux/spinlock.h>
fbf59bc9d   Tejun Heo   percpu: implement...
77
  #include <linux/vmalloc.h>
a56dbddf0   Tejun Heo   percpu: move full...
78
  #include <linux/workqueue.h>
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
79
  #include <linux/kmemleak.h>
71546d100   Tejun Heo   percpu: include l...
80
  #include <linux/sched.h>
fbf59bc9d   Tejun Heo   percpu: implement...
81
82
  
  #include <asm/cacheflush.h>
e01009833   Tejun Heo   percpu: make x86 ...
83
  #include <asm/sections.h>
fbf59bc9d   Tejun Heo   percpu: implement...
84
  #include <asm/tlbflush.h>
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
85
  #include <asm/io.h>
fbf59bc9d   Tejun Heo   percpu: implement...
86

df95e795a   Dennis Zhou   percpu: add trace...
87
88
  #define CREATE_TRACE_POINTS
  #include <trace/events/percpu.h>
8fa3ed801   Dennis Zhou   percpu: migrate p...
89
  #include "percpu-internal.h"
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
90
91
  /* the slots are sorted by free bytes left, 1-15 bytes share the same slot */
  #define PCPU_SLOT_BASE_SHIFT		5
8744d8594   Dennis Zhou   percpu: relegate ...
92
93
  /* chunks in slots below this are subject to being sidelined on failed alloc */
  #define PCPU_SLOT_FAIL_THRESHOLD	3
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
94

1a4d76076   Tejun Heo   percpu: implement...
95
96
  #define PCPU_EMPTY_POP_PAGES_LOW	2
  #define PCPU_EMPTY_POP_PAGES_HIGH	4
fbf59bc9d   Tejun Heo   percpu: implement...
97

bbddff054   Tejun Heo   percpu: use percp...
98
  #ifdef CONFIG_SMP
e01009833   Tejun Heo   percpu: make x86 ...
99
100
101
  /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
  #ifndef __addr_to_pcpu_ptr
  #define __addr_to_pcpu_ptr(addr)					\
43cf38eb5   Tejun Heo   percpu: add __per...
102
103
104
  	(void __percpu *)((unsigned long)(addr) -			\
  			  (unsigned long)pcpu_base_addr	+		\
  			  (unsigned long)__per_cpu_start)
e01009833   Tejun Heo   percpu: make x86 ...
105
106
107
  #endif
  #ifndef __pcpu_ptr_to_addr
  #define __pcpu_ptr_to_addr(ptr)						\
43cf38eb5   Tejun Heo   percpu: add __per...
108
109
110
  	(void __force *)((unsigned long)(ptr) +				\
  			 (unsigned long)pcpu_base_addr -		\
  			 (unsigned long)__per_cpu_start)
e01009833   Tejun Heo   percpu: make x86 ...
111
  #endif
bbddff054   Tejun Heo   percpu: use percp...
112
113
114
115
116
  #else	/* CONFIG_SMP */
  /* on UP, it's always identity mapped */
  #define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
  #define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
  #endif	/* CONFIG_SMP */
e01009833   Tejun Heo   percpu: make x86 ...
117

1328710b8   Daniel Micay   mark most percpu ...
118
119
120
121
  static int pcpu_unit_pages __ro_after_init;
  static int pcpu_unit_size __ro_after_init;
  static int pcpu_nr_units __ro_after_init;
  static int pcpu_atom_size __ro_after_init;
8fa3ed801   Dennis Zhou   percpu: migrate p...
122
  int pcpu_nr_slots __ro_after_init;
1328710b8   Daniel Micay   mark most percpu ...
123
  static size_t pcpu_chunk_struct_size __ro_after_init;
fbf59bc9d   Tejun Heo   percpu: implement...
124

a855b84c3   Tejun Heo   percpu: fix chunk...
125
  /* cpus with the lowest and highest unit addresses */
1328710b8   Daniel Micay   mark most percpu ...
126
127
  static unsigned int pcpu_low_unit_cpu __ro_after_init;
  static unsigned int pcpu_high_unit_cpu __ro_after_init;
2f39e637e   Tejun Heo   percpu: allow non...
128

fbf59bc9d   Tejun Heo   percpu: implement...
129
  /* the address of the first chunk which starts with the kernel static area */
1328710b8   Daniel Micay   mark most percpu ...
130
  void *pcpu_base_addr __ro_after_init;
fbf59bc9d   Tejun Heo   percpu: implement...
131
  EXPORT_SYMBOL_GPL(pcpu_base_addr);
1328710b8   Daniel Micay   mark most percpu ...
132
133
  static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
  const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */
2f39e637e   Tejun Heo   percpu: allow non...
134

6563297ce   Tejun Heo   percpu: use group...
135
  /* group information, used for vm allocation */
1328710b8   Daniel Micay   mark most percpu ...
136
137
138
  static int pcpu_nr_groups __ro_after_init;
  static const unsigned long *pcpu_group_offsets __ro_after_init;
  static const size_t *pcpu_group_sizes __ro_after_init;
6563297ce   Tejun Heo   percpu: use group...
139

ae9e6bc9f   Tejun Heo   percpu: don't put...
140
141
142
143
144
  /*
   * The first chunk which always exists.  Note that unlike other
   * chunks, this one can be allocated and mapped in several different
   * ways and thus often doesn't live in the vmalloc area.
   */
8fa3ed801   Dennis Zhou   percpu: migrate p...
145
  struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
ae9e6bc9f   Tejun Heo   percpu: don't put...
146
147
148
  
  /*
   * Optional reserved chunk.  This chunk reserves part of the first
e22667056   Dennis Zhou (Facebook)   percpu: introduce...
149
150
   * chunk and serves it for reserved allocations.  When the reserved
   * region doesn't exist, the following variable is NULL.
ae9e6bc9f   Tejun Heo   percpu: don't put...
151
   */
8fa3ed801   Dennis Zhou   percpu: migrate p...
152
  struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
edcb46399   Tejun Heo   percpu, module: i...
153

8fa3ed801   Dennis Zhou   percpu: migrate p...
154
  DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
6710e594f   Tejun Heo   percpu: fix synch...
155
  static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */
fbf59bc9d   Tejun Heo   percpu: implement...
156

8fa3ed801   Dennis Zhou   percpu: migrate p...
157
  struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */
fbf59bc9d   Tejun Heo   percpu: implement...
158

4f996e234   Tejun Heo   percpu: fix synch...
159
160
  /* chunks which need their map areas extended, protected by pcpu_lock */
  static LIST_HEAD(pcpu_map_extend_chunks);
b539b87fe   Tejun Heo   percpu: implmeent...
161
162
163
164
  /*
   * The number of empty populated pages, protected by pcpu_lock.  The
   * reserved chunk doesn't contribute to the count.
   */
6b9b6f399   Dennis Zhou (Facebook)   percpu: expose pc...
165
  int pcpu_nr_empty_pop_pages;
b539b87fe   Tejun Heo   percpu: implmeent...
166

1a4d76076   Tejun Heo   percpu: implement...
167
  /*
7e8a6304d   Dennis Zhou (Facebook)   /proc/meminfo: ad...
168
169
170
171
172
173
174
175
   * The number of populated pages in use by the allocator, protected by
   * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
   * allocated/deallocated, it is allocated/deallocated in all units of a chunk
   * and increments/decrements this count by 1).
   */
  static unsigned long pcpu_nr_populated;
  
  /*
1a4d76076   Tejun Heo   percpu: implement...
176
177
178
179
180
   * Balance work is used to populate or destroy chunks asynchronously.  We
   * try to keep the number of populated free pages between
   * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
   * empty chunk.
   */
fe6bd8c3d   Tejun Heo   percpu: rename pc...
181
182
  static void pcpu_balance_workfn(struct work_struct *work);
  static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
1a4d76076   Tejun Heo   percpu: implement...
183
184
185
186
187
188
189
190
  static bool pcpu_async_enabled __read_mostly;
  static bool pcpu_atomic_alloc_failed;
  
  /*
   * Queue pcpu_balance_work via schedule_work().  Nothing is queued while
   * pcpu_async_enabled is false.
   */
  static void pcpu_schedule_balance_work(void)
  {
  	if (!pcpu_async_enabled)
  		return;

  	schedule_work(&pcpu_balance_work);
  }
a56dbddf0   Tejun Heo   percpu: move full...
191

c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
192
  /**
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
193
194
195
   * pcpu_addr_in_chunk - check if the address is served from this chunk
   * @chunk: chunk of interest
   * @addr: percpu address
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
196
197
   *
   * RETURNS:
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
198
   * True if the address is served from this chunk.
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
199
   */
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
200
  static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
020ec6537   Tejun Heo   percpu: factor ou...
201
  {
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
202
  	void *start_addr, *end_addr;
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
203
  	if (!chunk)
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
204
  		return false;
020ec6537   Tejun Heo   percpu: factor ou...
205

560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
206
207
208
  	start_addr = chunk->base_addr + chunk->start_offset;
  	end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
  		   chunk->end_offset;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
209
210
  
  	return addr >= start_addr && addr < end_addr;
020ec6537   Tejun Heo   percpu: factor ou...
211
  }
d9b55eeb1   Tejun Heo   percpu: remove un...
212
  static int __pcpu_size_to_slot(int size)
fbf59bc9d   Tejun Heo   percpu: implement...
213
  {
cae3aeb83   Tejun Heo   percpu: clean up ...
214
  	int highbit = fls(size);	/* size is in bytes */
fbf59bc9d   Tejun Heo   percpu: implement...
215
216
  	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
  }
d9b55eeb1   Tejun Heo   percpu: remove un...
217
218
219
220
221
222
  /*
   * Map a size in bytes to a slot index.  A size equal to pcpu_unit_size
   * always lands in the last slot; anything else goes through
   * __pcpu_size_to_slot().
   */
  static int pcpu_size_to_slot(int size)
  {
  	return size == pcpu_unit_size ? pcpu_nr_slots - 1 :
  					__pcpu_size_to_slot(size);
  }
fbf59bc9d   Tejun Heo   percpu: implement...
223
224
  static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
  {
92c14cab4   Dennis Zhou   percpu: convert c...
225
226
227
228
  	const struct pcpu_block_md *chunk_md = &chunk->chunk_md;
  
  	if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE ||
  	    chunk_md->contig_hint == 0)
fbf59bc9d   Tejun Heo   percpu: implement...
229
  		return 0;
92c14cab4   Dennis Zhou   percpu: convert c...
230
  	return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
fbf59bc9d   Tejun Heo   percpu: implement...
231
  }
88999a898   Tejun Heo   percpu: misc prep...
232
233
234
235
236
237
238
239
240
241
242
243
244
  /* record the owning chunk in a page struct by storing it in page->index */
  static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
  {
  	unsigned long cookie = (unsigned long)pcpu;

  	page->index = cookie;
  }
  
  /* read back the chunk pointer previously stored in page->index */
  static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
  {
  	unsigned long cookie = page->index;

  	return (struct pcpu_chunk *)cookie;
  }
  
  /*
   * Flatten (@cpu, @page_idx) into a single index: the cpu's unit number
   * from pcpu_unit_map times pcpu_unit_pages, plus the page index within
   * the unit.
   */
  static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
  {
  	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
  }
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
248
249
250
251
  /*
   * Byte offset of page @page_idx of @cpu's unit: the cpu's entry in
   * pcpu_unit_offsets plus the page index scaled by the page size.
   */
  static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
  {
  	unsigned long page_off = page_idx << PAGE_SHIFT;

  	return pcpu_unit_offsets[cpu] + page_off;
  }
9983b6f0c   Tejun Heo   percpu: fix first...
252
253
  static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
  				     unsigned int cpu, int page_idx)
fbf59bc9d   Tejun Heo   percpu: implement...
254
  {
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
255
256
  	return (unsigned long)chunk->base_addr +
  	       pcpu_unit_page_offset(cpu, page_idx);
fbf59bc9d   Tejun Heo   percpu: implement...
257
  }
91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
258
  /*
   * Find the next run of clear bits in @bitmap below @end.  On entry *@rs is
   * where the search starts; on return *@rs is the first clear bit found and
   * *@re is the next set bit after it, both as reported by
   * find_next_zero_bit() / find_next_bit() with @end as the size.
   */
  static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end)
  {
  	*rs = find_next_zero_bit(bitmap, end, *rs);
  	*re = find_next_bit(bitmap, end, *rs + 1);
  }
91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
263
  /*
   * Find the next run of set bits in @bitmap below @end.  On entry *@rs is
   * where the search starts; on return *@rs is the first set bit found and
   * *@re is the next clear bit after it, both as reported by
   * find_next_bit() / find_next_zero_bit() with @end as the size.
   */
  static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end)
  {
  	*rs = find_next_bit(bitmap, end, *rs);
  	*re = find_next_zero_bit(bitmap, end, *rs + 1);
  }
  
  /*
91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
270
271
272
   * Bitmap region iterators.  Iterates over @bitmap between
   * [@start, @end).  @rs and @re should be integer variables and will be
   * set to the start and end index of the current unpopulated (clear) or
   * populated (set) region, depending on the iterator used.
ce3141a27   Tejun Heo   percpu: drop pcpu...
273
   */
91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
274
275
276
277
  /*
   * Walk the runs of clear bits found by pcpu_next_unpop().  The loop stops
   * once (rs) < (re) no longer holds; each step resumes scanning at
   * (re) + 1.
   */
  #define pcpu_for_each_unpop_region(bitmap, rs, re, start, end)		     \
  	for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \
  	     (rs) < (re);						     \
  	     (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end)))
ce3141a27   Tejun Heo   percpu: drop pcpu...
278

91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
279
280
281
282
  /*
   * Walk the runs of set bits found by pcpu_next_pop().  The loop stops once
   * (rs) < (re) no longer holds; each step resumes scanning at (re) + 1.
   */
  #define pcpu_for_each_pop_region(bitmap, rs, re, start, end)		     \
  	for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end));   \
  	     (rs) < (re);						     \
  	     (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end)))
ce3141a27   Tejun Heo   percpu: drop pcpu...
283

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
  /*
   * The following are helper functions to help access bitmaps and convert
   * between bitmap offsets to address offsets.
   */
  /* pointer to the alloc_map word where metadata block @index begins */
  static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
  {
  	return &chunk->alloc_map[index * PCPU_BITMAP_BLOCK_BITS /
  				 BITS_PER_LONG];
  }
  
  /* index of the metadata block that contains chunk bit offset @off */
  static unsigned long pcpu_off_to_block_index(int off)
  {
  	unsigned long block_idx = off / PCPU_BITMAP_BLOCK_BITS;

  	return block_idx;
  }
  
  /*
   * Offset of chunk bit offset @off within its metadata block.  The value is
   * obtained by masking with PCPU_BITMAP_BLOCK_BITS - 1.
   */
  static unsigned long pcpu_off_to_block_off(int off)
  {
  	unsigned long mask = PCPU_BITMAP_BLOCK_BITS - 1;

  	return off & mask;
  }
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
303
304
305
306
  /* chunk bit offset of bit @off inside metadata block @index */
  static unsigned long pcpu_block_off_to_off(int index, int off)
  {
  	return off + index * PCPU_BITMAP_BLOCK_BITS;
  }
382b88e96   Dennis Zhou   percpu: add block...
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
  /*
   * pcpu_next_hint - determine which hint to use
   * @block: block of interest
   * @alloc_bits: size of allocation
   *
   * Picks the offset within @block at which a scan for @alloc_bits should
   * begin.  The default is first_free, giving first-fit behaviour.  When a
   * scan_hint is recorded, lies before the contig_hint, and is too small for
   * the request, scanning may instead start right after the scan_hint,
   * leaving the contig_hint as the fallback.
   *
   * RETURNS:
   * The block offset from which to start scanning.
   */
  static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
  {
  	/* no scan_hint recorded: nothing can be skipped */
  	if (!block->scan_hint)
  		return block->first_free;

  	/* the contig_hint does not start after the scan_hint */
  	if (block->contig_hint_start <= block->scan_hint_start)
  		return block->first_free;

  	/* the request is not larger than the scan_hint */
  	if (alloc_bits <= block->scan_hint)
  		return block->first_free;

  	/* resume just past the scan_hint area */
  	return block->scan_hint_start + block->scan_hint;
  }
fbf59bc9d   Tejun Heo   percpu: implement...
334
  /**
   * pcpu_next_md_free_region - finds the next hint free area
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of free area
   *
   * Helper function for pcpu_for_each_md_free_region.  It checks
   * block->contig_hint and performs aggregation across blocks to find the
   * next hint.  It modifies bit_off and bits in-place to be consumed in the
   * loop.  If the last block is reached without returning early, @bit_off
   * and @bits are left describing the region accumulated so far.
   */
  static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
  				     int *bits)
  {
  	int i = pcpu_off_to_block_index(*bit_off);
  	int block_off = pcpu_off_to_block_off(*bit_off);
  	struct pcpu_block_md *block;

  	*bits = 0;
  	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
  	     block++, i++) {
  		/* handles contig area across blocks */
  		if (*bits) {
  			*bits += block->left_free;
  			/* a fully free block extends the area further */
  			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
  				continue;
  			return;
  		}

  		/*
  		 * This checks three things.  First is there a contig_hint to
  		 * check.  Second, have we checked this hint before by
  		 * comparing the block_off.  Third, is this the same as the
  		 * right contig hint.  In the last case, it spills over into
  		 * the next block and should be handled by the contig area
  		 * across blocks code.
  		 */
  		*bits = block->contig_hint;
  		if (*bits && block->contig_hint_start >= block_off &&
  		    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
  			*bit_off = pcpu_block_off_to_off(i,
  					block->contig_hint_start);
  			return;
  		}
  		/* reset to satisfy the second predicate above */
  		block_off = 0;

  		/* start a cross-block area from this block's trailing free bits */
  		*bits = block->right_free;
  		*bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
  	}
  }
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
  /**
   * pcpu_next_fit_region - finds fit areas for a given allocation request
   * @chunk: chunk of interest
   * @alloc_bits: size of allocation
   * @align: alignment of area (max PAGE_SIZE)
   * @bit_off: chunk offset
   * @bits: size of free area
   *
   * Finds the next free region that is viable for use with a given size and
   * alignment.  This only returns if there is a valid area to be used for this
   * allocation.  block->first_free is returned if the allocation request fits
   * within the block to see if the request can be fulfilled prior to the contig
   * hint.  When no region fits, @bit_off is set to pcpu_chunk_map_bits(@chunk)
   * to signal failure.
   */
  static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
  				 int align, int *bit_off, int *bits)
  {
  	int i = pcpu_off_to_block_index(*bit_off);
  	int block_off = pcpu_off_to_block_off(*bit_off);
  	struct pcpu_block_md *block;

  	*bits = 0;
  	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
  	     block++, i++) {
  		/* handles contig area across blocks */
  		if (*bits) {
  			*bits += block->left_free;
  			if (*bits >= alloc_bits)
  				return;
  			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
  				continue;
  		}

  		/* check block->contig_hint */
  		*bits = ALIGN(block->contig_hint_start, align) -
  			block->contig_hint_start;
  		/*
  		 * This uses the block offset to determine if this has been
  		 * checked in the prior iteration.
  		 */
  		if (block->contig_hint &&
  		    block->contig_hint_start >= block_off &&
  		    block->contig_hint >= *bits + alloc_bits) {
  			int start = pcpu_next_hint(block, alloc_bits);

  			*bits += alloc_bits + block->contig_hint_start -
  				 start;
  			*bit_off = pcpu_block_off_to_off(i, start);
  			return;
  		}
  		/* reset to satisfy the second predicate above */
  		block_off = 0;

  		/* fall back to the aligned tail of the block's right_free */
  		*bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
  				 align);
  		*bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
  		*bit_off = pcpu_block_off_to_off(i, *bit_off);
  		if (*bits >= alloc_bits)
  			return;
  	}

  	/* no valid offsets were found - fail condition */
  	*bit_off = pcpu_chunk_map_bits(chunk);
  }
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
448
449
450
  /*
   * Metadata free area iterators.  These perform aggregation of free areas
   * based on the metadata blocks and return the offset @bit_off and size in
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
451
452
   * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
   * a fit is found for the allocation request.
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
453
454
455
456
457
458
   */
  /*
   * Iterate over the areas reported by pcpu_next_md_free_region().  The loop
   * ends once (bit_off) reaches pcpu_chunk_map_bits(chunk); each step
   * advances (bit_off) by (bits) + 1 before looking up the next area.
   */
  #define pcpu_for_each_md_free_region(chunk, bit_off, bits)		\
  	for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));	\
  	     (bit_off) < pcpu_chunk_map_bits((chunk));			\
  	     (bit_off) += (bits) + 1,					\
  	     pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
459
460
461
462
463
464
465
  /*
   * Iterate over the candidate areas reported by pcpu_next_fit_region().
   * The loop ends once (bit_off) reaches pcpu_chunk_map_bits(chunk), which
   * is also the value pcpu_next_fit_region() stores on failure; each step
   * advances (bit_off) by (bits) before looking up the next area.
   */
  #define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
  	for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
  				  &(bits));				      \
  	     (bit_off) < pcpu_chunk_map_bits((chunk));			      \
  	     (bit_off) += (bits),					      \
  	     pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
  				  &(bits)))
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
466
  /**
90459ce06   Bob Liu   percpu: rename pc...
467
   * pcpu_mem_zalloc - allocate memory
1880d93b8   Tejun Heo   percpu: replace p...
468
   * @size: bytes to allocate
47504ee04   Dennis Zhou   percpu: add __GFP...
469
   * @gfp: allocation flags
fbf59bc9d   Tejun Heo   percpu: implement...
470
   *
1880d93b8   Tejun Heo   percpu: replace p...
471
   * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
47504ee04   Dennis Zhou   percpu: add __GFP...
472
473
474
   * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
   * This is to facilitate passing through whitelisted flags.  The
   * returned memory is always zeroed.
fbf59bc9d   Tejun Heo   percpu: implement...
475
476
   *
   * RETURNS:
1880d93b8   Tejun Heo   percpu: replace p...
477
   * Pointer to the allocated area on success, NULL on failure.
fbf59bc9d   Tejun Heo   percpu: implement...
478
   */
47504ee04   Dennis Zhou   percpu: add __GFP...
479
  static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
fbf59bc9d   Tejun Heo   percpu: implement...
480
  {
099a19d91   Tejun Heo   percpu: allow lim...
481
482
  	if (WARN_ON_ONCE(!slab_is_available()))
  		return NULL;
1880d93b8   Tejun Heo   percpu: replace p...
483
  	if (size <= PAGE_SIZE)
554fef1c3   Dennis Zhou   percpu: allow sel...
484
  		return kzalloc(size, gfp);
7af4c0932   Jesper Juhl   percpu: zero memo...
485
  	else
554fef1c3   Dennis Zhou   percpu: allow sel...
486
  		return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL);
1880d93b8   Tejun Heo   percpu: replace p...
487
  }
fbf59bc9d   Tejun Heo   percpu: implement...
488

1880d93b8   Tejun Heo   percpu: replace p...
489
490
491
  /**
   * pcpu_mem_free - free memory
   * @ptr: memory to free
   *
   * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
   * The memory is released with kvfree().
   */
  static void pcpu_mem_free(void *ptr)
  {
  	kvfree(ptr);
  }
8744d8594   Dennis Zhou   percpu: relegate ...
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
  /*
   * Move @chunk onto the list of slot @slot, at the head when @move_front is
   * true and at the tail otherwise.  The reserved chunk is left untouched.
   */
  static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
  			      bool move_front)
  {
  	struct list_head *slot_list;

  	if (chunk == pcpu_reserved_chunk)
  		return;

  	slot_list = &pcpu_slot[slot];
  	if (move_front)
  		list_move(&chunk->list, slot_list);
  	else
  		list_move_tail(&chunk->list, slot_list);
  }
  
  static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
  {
  	__pcpu_chunk_move(chunk, slot, true);
  }
fbf59bc9d   Tejun Heo   percpu: implement...
514
515
516
517
518
519
520
  /**
   * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
   * @chunk: chunk of interest
   * @oslot: the previous slot it was on
   *
   * This function is called after an allocation or free changed @chunk.
   * New slot according to the changed state is determined and @chunk is
   * moved to the slot.  Note that the reserved chunk is never put on
   * chunk slots.  The chunk goes to the front of the new slot's list when
   * the new slot index is higher than @oslot, and to the tail otherwise.
   *
   * CONTEXT:
   * pcpu_lock.
   */
  static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
  {
  	int nslot = pcpu_chunk_slot(chunk);

  	if (oslot != nslot)
  		__pcpu_chunk_move(chunk, nslot, oslot < nslot);
  }
b239f7daf   Dennis Zhou   percpu: set PCPU_...
533
534
  /*
   * pcpu_update_empty_pages - update empty page counters
833af8427   Tejun Heo   percpu: restructu...
535
   * @chunk: chunk of interest
b239f7daf   Dennis Zhou   percpu: set PCPU_...
536
   * @nr: nr of empty pages
833af8427   Tejun Heo   percpu: restructu...
537
   *
b239f7daf   Dennis Zhou   percpu: set PCPU_...
538
539
540
   * This is used to keep track of the empty pages now based on the premise
   * a md_block covers a page.  The hint update functions recognize if a block
   * is made full or broken to calculate deltas for keeping track of free pages.
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
541
   */
b239f7daf   Dennis Zhou   percpu: set PCPU_...
542
  static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
543
  {
b239f7daf   Dennis Zhou   percpu: set PCPU_...
544
545
546
  	chunk->nr_empty_pop_pages += nr;
  	if (chunk != pcpu_reserved_chunk)
  		pcpu_nr_empty_pop_pages += nr;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
547
  }
d9f3a01ee   Dennis Zhou   percpu: introduce...
548
549
550
551
552
553
  /*
   * pcpu_region_overlap - determines if two regions overlap
   * @a: start of first region, inclusive
   * @b: end of first region, exclusive
   * @x: start of second region, inclusive
   * @y: end of second region, exclusive
   *
   * This is used to determine if the hint region [a, b) overlaps with the
   * allocated region [x, y).  Regions that merely touch (b == x or y == a)
   * do not overlap.
   *
   * RETURNS:
   * True if the two regions overlap, false otherwise.
   */
  static inline bool pcpu_region_overlap(int a, int b, int x, int y)
  {
  	return (a < y) && (x < b);
  }
9f7dcf224   Tejun Heo   percpu: move chun...
562

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
563
  /**
   * pcpu_block_update - fold a known free area into a block's hints
   * @block: block metadata to update
   * @start: start offset of the free area, inclusive
   * @end: end offset of the free area, exclusive
   *
   * The region [start, end) is expected to be a complete free area within
   * @block.  first_free is lowered to @start if needed, left_free is set
   * when the area begins at offset 0 and right_free when it ends at
   * block->nr_bits.  The contig_hint/scan_hint pair is then adjusted
   * depending on how the area's length compares with the current
   * contig_hint.  When the lengths are equal the start offset with the
   * better alignment is kept as contig_hint_start.
   */
  static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
  {
  	int len = end - start;

  	if (start < block->first_free)
  		block->first_free = start;

  	if (!start)
  		block->left_free = len;

  	if (end == block->nr_bits)
  		block->right_free = len;

  	if (len < block->contig_hint) {
  		/*
  		 * Smaller than the contig_hint: the area can only become the
  		 * scan_hint, and only when it lies before the contig_hint
  		 * and is either larger than the current scan_hint or equal
  		 * in size but farther along.
  		 */
  		if (start >= block->contig_hint_start)
  			return;

  		if (len > block->scan_hint ||
  		    (len == block->scan_hint &&
  		     start > block->scan_hint_start)) {
  			block->scan_hint_start = start;
  			block->scan_hint = len;
  		}
  		return;
  	}

  	if (len == block->contig_hint) {
  		if (block->contig_hint_start &&
  		    (!start ||
  		     __ffs(start) > __ffs(block->contig_hint_start))) {
  			/* same size but better aligned: take over the hint */
  			block->contig_hint_start = start;
  			if (start < block->scan_hint_start &&
  			    block->contig_hint > block->scan_hint)
  				block->scan_hint = 0;
  		} else if (start > block->scan_hint_start ||
  			   block->contig_hint > block->scan_hint) {
  			/*
  			 * As large as the contig_hint: record it as the
  			 * scan_hint when it is farther than or larger than
  			 * the current scan_hint.
  			 */
  			block->scan_hint_start = start;
  			block->scan_hint = len;
  		}
  		return;
  	}

  	/* len > contig_hint: the area becomes the new contig_hint */
  	if (start <= block->contig_hint_start) {
  		block->scan_hint = 0;
  	} else if (block->contig_hint > block->scan_hint) {
  		/* demote the old contig_hint to be the new scan_hint */
  		block->scan_hint_start = block->contig_hint_start;
  		block->scan_hint = block->contig_hint;
  	} else if (start < block->scan_hint_start) {
  		/*
  		 * The old contig_hint is not larger than the scan_hint and
  		 * the new contig_hint starts before the scan_hint, so drop
  		 * the scan_hint to keep
  		 * scan_hint_start < contig_hint_start.
  		 */
  		block->scan_hint = 0;
  	}

  	block->contig_hint_start = start;
  	block->contig_hint = len;
  }
b89462a9c   Dennis Zhou   percpu: remember ...
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
  /*
   * pcpu_block_update_scan - update a block given a free area seen by a scan
   * @chunk: chunk of interest
   * @bit_off: chunk offset of the free area
   * @bits: size of the free area
   *
   * The allocation scan can step over free areas, for instance when an
   * alignment requirement leaves a hole behind.  This reports such an area
   * to the metadata of the block containing @bit_off, as it may change the
   * block's scan_hint.  The start is first extended backwards to the bit
   * after the previous allocated bit in the block so free bits skipped
   * because of alignment are included.
   *
   * Areas extending past the end of their block are ignored.
   */
  static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
  				   int bits)
  {
  	int area_start = pcpu_off_to_block_off(bit_off);
  	int area_end = area_start + bits;
  	int blk_index, prev_set;

  	if (area_end > PCPU_BITMAP_BLOCK_BITS)
  		return;

  	blk_index = pcpu_off_to_block_index(bit_off);

  	/*
  	 * Look backwards for the last allocated bit before the area.  When
  	 * find_last_bit() hands back the size it was given, the block is
  	 * treated as free from offset 0.
  	 */
  	prev_set = find_last_bit(pcpu_index_alloc_map(chunk, blk_index),
  				 area_start);
  	if (prev_set == area_start)
  		area_start = 0;
  	else
  		area_start = prev_set + 1;

  	pcpu_block_update(chunk->md_blocks + blk_index, area_start, area_end);
  }
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
673
  /**
92c14cab4   Dennis Zhou   percpu: convert c...
674
675
   * pcpu_chunk_refresh_hint - updates metadata about a chunk
   * @chunk: chunk of interest
d33d9f3dd   Dennis Zhou   percpu: use chunk...
676
   * @full_scan: if we should scan from the beginning
92c14cab4   Dennis Zhou   percpu: convert c...
677
678
   *
   * Iterates over the metadata blocks to find the largest contig area.
d33d9f3dd   Dennis Zhou   percpu: use chunk...
679
680
681
682
683
   * A full scan can be avoided on the allocation path as this is triggered
   * if we broke the contig_hint.  In doing so, the scan_hint will be before
   * the contig_hint or after if the scan_hint == contig_hint.  This cannot
   * be prevented on freeing as we want to find the largest area possibly
   * spanning blocks.
92c14cab4   Dennis Zhou   percpu: convert c...
684
   */
d33d9f3dd   Dennis Zhou   percpu: use chunk...
685
  static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
92c14cab4   Dennis Zhou   percpu: convert c...
686
687
688
  {
  	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
  	int bit_off, bits;
d33d9f3dd   Dennis Zhou   percpu: use chunk...
689
690
691
692
693
694
695
696
697
698
  	/* promote scan_hint to contig_hint */
  	if (!full_scan && chunk_md->scan_hint) {
  		bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
  		chunk_md->contig_hint_start = chunk_md->scan_hint_start;
  		chunk_md->contig_hint = chunk_md->scan_hint;
  		chunk_md->scan_hint = 0;
  	} else {
  		bit_off = chunk_md->first_free;
  		chunk_md->contig_hint = 0;
  	}
92c14cab4   Dennis Zhou   percpu: convert c...
699

92c14cab4   Dennis Zhou   percpu: convert c...
700
701
702
  	bits = 0;
  	pcpu_for_each_md_free_region(chunk, bit_off, bits) {
  		pcpu_block_update(chunk_md, bit_off, bit_off + bits);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
  	}
  }
  
  /**
   * pcpu_block_refresh_hint
   * @chunk: chunk of interest
   * @index: index of the metadata block
   *
   * Scans over the block beginning at first_free and updates the block
   * metadata accordingly.
   */
  static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
  {
  	struct pcpu_block_md *block = chunk->md_blocks + index;
  	unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
da3afdd5b   Dennis Zhou   percpu: use block...
718
719
720
721
722
723
724
725
726
727
728
729
  	int rs, re, start;	/* region start, region end */
  
  	/* promote scan_hint to contig_hint */
  	if (block->scan_hint) {
  		start = block->scan_hint_start + block->scan_hint;
  		block->contig_hint_start = block->scan_hint_start;
  		block->contig_hint = block->scan_hint;
  		block->scan_hint = 0;
  	} else {
  		start = block->first_free;
  		block->contig_hint = 0;
  	}
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
730

da3afdd5b   Dennis Zhou   percpu: use block...
731
  	block->right_free = 0;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
732
733
  
  	/* iterate over free areas and update the contig hints */
da3afdd5b   Dennis Zhou   percpu: use block...
734
  	pcpu_for_each_unpop_region(alloc_map, rs, re, start,
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
735
736
737
738
739
740
741
742
743
744
  				   PCPU_BITMAP_BLOCK_BITS) {
  		pcpu_block_update(block, rs, re);
  	}
  }
  
  /**
   * pcpu_block_update_hint_alloc - update hint on allocation path
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of request
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
745
746
747
748
   *
   * Updates metadata for the allocation path.  The metadata only has to be
   * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
   * scans are required if the block's contig hint is broken.
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
749
750
751
752
   */
  static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
  					 int bits)
  {
92c14cab4   Dennis Zhou   percpu: convert c...
753
  	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
b239f7daf   Dennis Zhou   percpu: set PCPU_...
754
  	int nr_empty_pages = 0;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
  	struct pcpu_block_md *s_block, *e_block, *block;
  	int s_index, e_index;	/* block indexes of the freed allocation */
  	int s_off, e_off;	/* block offsets of the freed allocation */
  
  	/*
  	 * Calculate per block offsets.
  	 * The calculation uses an inclusive range, but the resulting offsets
  	 * are [start, end).  e_index always points to the last block in the
  	 * range.
  	 */
  	s_index = pcpu_off_to_block_index(bit_off);
  	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
  	s_off = pcpu_off_to_block_off(bit_off);
  	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
  
  	s_block = chunk->md_blocks + s_index;
  	e_block = chunk->md_blocks + e_index;
  
  	/*
  	 * Update s_block.
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
775
776
777
  	 * block->first_free must be updated if the allocation takes its place.
  	 * If the allocation breaks the contig_hint, a scan is required to
  	 * restore this hint.
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
778
  	 */
b239f7daf   Dennis Zhou   percpu: set PCPU_...
779
780
  	if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
  		nr_empty_pages++;
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
781
782
783
784
785
  	if (s_off == s_block->first_free)
  		s_block->first_free = find_next_zero_bit(
  					pcpu_index_alloc_map(chunk, s_index),
  					PCPU_BITMAP_BLOCK_BITS,
  					s_off + bits);
382b88e96   Dennis Zhou   percpu: add block...
786
787
788
789
790
  	if (pcpu_region_overlap(s_block->scan_hint_start,
  				s_block->scan_hint_start + s_block->scan_hint,
  				s_off,
  				s_off + bits))
  		s_block->scan_hint = 0;
d9f3a01ee   Dennis Zhou   percpu: introduce...
791
792
793
794
795
  	if (pcpu_region_overlap(s_block->contig_hint_start,
  				s_block->contig_hint_start +
  				s_block->contig_hint,
  				s_off,
  				s_off + bits)) {
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
796
  		/* block contig hint is broken - scan to fix it */
da3afdd5b   Dennis Zhou   percpu: use block...
797
798
  		if (!s_off)
  			s_block->left_free = 0;
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
799
800
801
802
803
804
805
806
807
808
  		pcpu_block_refresh_hint(chunk, s_index);
  	} else {
  		/* update left and right contig manually */
  		s_block->left_free = min(s_block->left_free, s_off);
  		if (s_index == e_index)
  			s_block->right_free = min_t(int, s_block->right_free,
  					PCPU_BITMAP_BLOCK_BITS - e_off);
  		else
  			s_block->right_free = 0;
  	}
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
809
810
811
812
813
  
  	/*
  	 * Update e_block.
  	 */
  	if (s_index != e_index) {
b239f7daf   Dennis Zhou   percpu: set PCPU_...
814
815
  		if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
  			nr_empty_pages++;
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
816
817
818
819
820
821
822
823
824
825
826
827
  		/*
  		 * When the allocation is across blocks, the end is along
  		 * the left part of the e_block.
  		 */
  		e_block->first_free = find_next_zero_bit(
  				pcpu_index_alloc_map(chunk, e_index),
  				PCPU_BITMAP_BLOCK_BITS, e_off);
  
  		if (e_off == PCPU_BITMAP_BLOCK_BITS) {
  			/* reset the block */
  			e_block++;
  		} else {
382b88e96   Dennis Zhou   percpu: add block...
828
829
  			if (e_off > e_block->scan_hint_start)
  				e_block->scan_hint = 0;
da3afdd5b   Dennis Zhou   percpu: use block...
830
  			e_block->left_free = 0;
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
831
832
833
834
  			if (e_off > e_block->contig_hint_start) {
  				/* contig hint is broken - scan to fix it */
  				pcpu_block_refresh_hint(chunk, e_index);
  			} else {
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
835
836
837
838
839
  				e_block->right_free =
  					min_t(int, e_block->right_free,
  					      PCPU_BITMAP_BLOCK_BITS - e_off);
  			}
  		}
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
840
841
  
  		/* update in-between md_blocks */
b239f7daf   Dennis Zhou   percpu: set PCPU_...
842
  		nr_empty_pages += (e_index - s_index - 1);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
843
  		for (block = s_block + 1; block < e_block; block++) {
382b88e96   Dennis Zhou   percpu: add block...
844
  			block->scan_hint = 0;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
845
846
847
848
849
  			block->contig_hint = 0;
  			block->left_free = 0;
  			block->right_free = 0;
  		}
  	}
b239f7daf   Dennis Zhou   percpu: set PCPU_...
850
851
  	if (nr_empty_pages)
  		pcpu_update_empty_pages(chunk, -nr_empty_pages);
d33d9f3dd   Dennis Zhou   percpu: use chunk...
852
853
854
855
856
857
  	if (pcpu_region_overlap(chunk_md->scan_hint_start,
  				chunk_md->scan_hint_start +
  				chunk_md->scan_hint,
  				bit_off,
  				bit_off + bits))
  		chunk_md->scan_hint = 0;
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
858
859
860
861
862
  	/*
  	 * The only time a full chunk scan is required is if the chunk
  	 * contig hint is broken.  Otherwise, it means a smaller space
  	 * was used and therefore the chunk contig hint is still correct.
  	 */
92c14cab4   Dennis Zhou   percpu: convert c...
863
864
865
  	if (pcpu_region_overlap(chunk_md->contig_hint_start,
  				chunk_md->contig_hint_start +
  				chunk_md->contig_hint,
d9f3a01ee   Dennis Zhou   percpu: introduce...
866
867
  				bit_off,
  				bit_off + bits))
d33d9f3dd   Dennis Zhou   percpu: use chunk...
868
  		pcpu_chunk_refresh_hint(chunk, false);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
869
870
871
872
873
874
875
  }
  
  /**
   * pcpu_block_update_hint_free - updates the block hints on the free path
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of request
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
876
877
878
879
880
881
882
883
   *
   * Updates metadata for the allocation path.  This avoids a blind block
   * refresh by making use of the block contig hints.  If this fails, it scans
   * forward and backward to determine the extent of the free area.  This is
   * capped at the boundary of blocks.
   *
   * A chunk update is triggered if a page becomes free, a block becomes free,
   * or the free spans across blocks.  This tradeoff is to minimize iterating
92c14cab4   Dennis Zhou   percpu: convert c...
884
885
886
887
   * over the block metadata to update chunk_md->contig_hint.
   * chunk_md->contig_hint may be off by up to a page, but it will never be more
   * than the available space.  If the contig hint is contained in one block, it
   * will be accurate.
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
888
889
890
891
   */
  static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
  					int bits)
  {
b239f7daf   Dennis Zhou   percpu: set PCPU_...
892
  	int nr_empty_pages = 0;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
893
894
895
  	struct pcpu_block_md *s_block, *e_block, *block;
  	int s_index, e_index;	/* block indexes of the freed allocation */
  	int s_off, e_off;	/* block offsets of the freed allocation */
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
896
  	int start, end;		/* start and end of the whole free area */
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
897
898
899
900
901
902
903
904
905
906
907
908
909
910
  
  	/*
  	 * Calculate per block offsets.
  	 * The calculation uses an inclusive range, but the resulting offsets
  	 * are [start, end).  e_index always points to the last block in the
  	 * range.
  	 */
  	s_index = pcpu_off_to_block_index(bit_off);
  	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
  	s_off = pcpu_off_to_block_off(bit_off);
  	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
  
  	s_block = chunk->md_blocks + s_index;
  	e_block = chunk->md_blocks + e_index;
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
  	/*
  	 * Check if the freed area aligns with the block->contig_hint.
  	 * If it does, then the scan to find the beginning/end of the
  	 * larger free area can be avoided.
  	 *
  	 * start and end refer to beginning and end of the free area
  	 * within each their respective blocks.  This is not necessarily
  	 * the entire free area as it may span blocks past the beginning
  	 * or end of the block.
  	 */
  	start = s_off;
  	if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
  		start = s_block->contig_hint_start;
  	} else {
  		/*
  		 * Scan backwards to find the extent of the free area.
  		 * find_last_bit returns the starting bit, so if the start bit
  		 * is returned, that means there was no last bit and the
  		 * remainder of the chunk is free.
  		 */
  		int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
  					  start);
  		start = (start == l_bit) ? 0 : l_bit + 1;
  	}
  
  	end = e_off;
  	if (e_off == e_block->contig_hint_start)
  		end = e_block->contig_hint_start + e_block->contig_hint;
  	else
  		end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
  				    PCPU_BITMAP_BLOCK_BITS, end);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
942
  	/* update s_block */
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
943
  	e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
b239f7daf   Dennis Zhou   percpu: set PCPU_...
944
945
  	if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
  		nr_empty_pages++;
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
946
  	pcpu_block_update(s_block, start, e_off);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
947
948
949
950
  
  	/* freeing in the same block */
  	if (s_index != e_index) {
  		/* update e_block */
b239f7daf   Dennis Zhou   percpu: set PCPU_...
951
952
  		if (end == PCPU_BITMAP_BLOCK_BITS)
  			nr_empty_pages++;
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
953
  		pcpu_block_update(e_block, 0, end);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
954
955
  
  		/* reset md_blocks in the middle */
b239f7daf   Dennis Zhou   percpu: set PCPU_...
956
  		nr_empty_pages += (e_index - s_index - 1);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
957
958
  		for (block = s_block + 1; block < e_block; block++) {
  			block->first_free = 0;
382b88e96   Dennis Zhou   percpu: add block...
959
  			block->scan_hint = 0;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
960
961
962
963
964
965
  			block->contig_hint_start = 0;
  			block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
  			block->left_free = PCPU_BITMAP_BLOCK_BITS;
  			block->right_free = PCPU_BITMAP_BLOCK_BITS;
  		}
  	}
b239f7daf   Dennis Zhou   percpu: set PCPU_...
966
967
  	if (nr_empty_pages)
  		pcpu_update_empty_pages(chunk, nr_empty_pages);
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
968
  	/*
b239f7daf   Dennis Zhou   percpu: set PCPU_...
969
970
971
972
  	 * Refresh chunk metadata when the free makes a block free or spans
  	 * across blocks.  The contig_hint may be off by up to a page, but if
  	 * the contig_hint is contained in a block, it will be accurate with
  	 * the else condition below.
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
973
  	 */
b239f7daf   Dennis Zhou   percpu: set PCPU_...
974
  	if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
d33d9f3dd   Dennis Zhou   percpu: use chunk...
975
  		pcpu_chunk_refresh_hint(chunk, true);
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
976
  	else
92c14cab4   Dennis Zhou   percpu: convert c...
977
978
979
  		pcpu_block_update(&chunk->chunk_md,
  				  pcpu_block_off_to_off(s_index, start),
  				  end);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
980
981
982
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
   * pcpu_is_populated - determines if the region is populated
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of area
   * @next_off: return value for the next offset to start searching
   *
   * For atomic allocations, check if the backing pages are populated.
   *
   * RETURNS:
   * Bool if the backing pages are populated.
   * next_index is to skip over unpopulated blocks in pcpu_find_block_fit.
   */
  static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
  			      int *next_off)
  {
  	int page_start, page_end, rs, re;
833af8427   Tejun Heo   percpu: restructu...
999

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1000
1001
  	page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
  	page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
833af8427   Tejun Heo   percpu: restructu...
1002

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1003
1004
1005
1006
  	rs = page_start;
  	pcpu_next_unpop(chunk->populated, &rs, &re, page_end);
  	if (rs >= page_end)
  		return true;
833af8427   Tejun Heo   percpu: restructu...
1007

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1008
1009
  	*next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
  	return false;
9f7dcf224   Tejun Heo   percpu: move chun...
1010
1011
1012
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1013
1014
1015
1016
1017
1018
   * pcpu_find_block_fit - finds the block index to start searching
   * @chunk: chunk of interest
   * @alloc_bits: size of request in allocation units
   * @align: alignment of area (max PAGE_SIZE bytes)
   * @pop_only: use populated regions only
   *
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
1019
1020
1021
1022
1023
1024
1025
1026
   * Given a chunk and an allocation spec, find the offset to begin searching
   * for a free region.  This iterates over the bitmap metadata blocks to
   * find an offset that will be guaranteed to fit the requirements.  It is
   * not quite first fit as if the allocation does not fit in the contig hint
   * of a block or chunk, it is skipped.  This errs on the side of caution
   * to prevent excess iteration.  Poor alignment can cause the allocator to
   * skip over blocks and chunks that have valid free areas.
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1027
1028
1029
   * RETURNS:
   * The offset in the bitmap to begin searching.
   * -1 if no offset is found.
a16037c8d   Tejun Heo   percpu: make pcpu...
1030
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1031
1032
  static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
  			       size_t align, bool pop_only)
a16037c8d   Tejun Heo   percpu: make pcpu...
1033
  {
92c14cab4   Dennis Zhou   percpu: convert c...
1034
  	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
1035
  	int bit_off, bits, next_off;
a16037c8d   Tejun Heo   percpu: make pcpu...
1036

13f966373   Dennis Zhou (Facebook)   percpu: skip chun...
1037
1038
1039
1040
1041
1042
  	/*
  	 * Check to see if the allocation can fit in the chunk's contig hint.
  	 * This is an optimization to prevent scanning by assuming if it
  	 * cannot fit in the global hint, there is memory pressure and creating
  	 * a new chunk would happen soon.
  	 */
92c14cab4   Dennis Zhou   percpu: convert c...
1043
1044
1045
  	bit_off = ALIGN(chunk_md->contig_hint_start, align) -
  		  chunk_md->contig_hint_start;
  	if (bit_off + alloc_bits > chunk_md->contig_hint)
13f966373   Dennis Zhou (Facebook)   percpu: skip chun...
1046
  		return -1;
d33d9f3dd   Dennis Zhou   percpu: use chunk...
1047
  	bit_off = pcpu_next_hint(chunk_md, alloc_bits);
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
1048
1049
  	bits = 0;
  	pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1050
  		if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
1051
  						   &next_off))
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1052
  			break;
a16037c8d   Tejun Heo   percpu: make pcpu...
1053

b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
1054
  		bit_off = next_off;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1055
  		bits = 0;
a16037c8d   Tejun Heo   percpu: make pcpu...
1056
  	}
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1057
1058
1059
1060
1061
  
  	if (bit_off == pcpu_chunk_map_bits(chunk))
  		return -1;
  
  	return bit_off;
a16037c8d   Tejun Heo   percpu: make pcpu...
1062
  }
b89462a9c   Dennis Zhou   percpu: remember ...
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
  /*
   * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
   * @map: the address to base the search on
   * @size: the bitmap size in bits
   * @start: the bitnumber to start searching at
   * @nr: the number of zeroed bits we're looking for
   * @align_mask: alignment mask for zero area
   * @largest_off: offset of the largest area skipped
   * @largest_bits: size of the largest area skipped
   *
   * The @align_mask should be one less than a power of 2.
   *
   * Works like bitmap_find_next_zero_area_off() but additionally remembers
   * the largest candidate area that had to be rejected because it was too
   * small.  On equal size the candidate with the better aligned offset is
   * kept.  Bits skipped when aligning the candidate start are not counted;
   * pcpu_block_update_scan() scans backwards to try and recover those.
   * This is imperfect, but in general is good enough.
   *
   * RETURNS:
   * The aligned offset of a run of @nr zero bits, or a value greater than
   * @size if no such run fits before the end of the bitmap.
   */
  static unsigned long pcpu_find_zero_area(unsigned long *map,
  					 unsigned long size,
  					 unsigned long start,
  					 unsigned long nr,
  					 unsigned long align_mask,
  					 unsigned long *largest_off,
  					 unsigned long *largest_bits)
  {
  	for (;;) {
  		unsigned long cand, cand_end, blocker, gap;

  		/* next free bit, rounded up to the requested alignment */
  		cand = find_next_zero_bit(map, size, start);
  		cand = __ALIGN_MASK(cand, align_mask);

  		cand_end = cand + nr;
  		if (cand_end > size)
  			return cand_end;

  		blocker = find_next_bit(map, cand_end, cand);
  		if (blocker >= cand_end)
  			return cand;

  		/* remember largest unused area with best alignment */
  		gap = blocker - cand;
  		if (gap > *largest_bits ||
  		    (gap == *largest_bits && *largest_off &&
  		     (!cand || __ffs(cand) > __ffs(*largest_off)))) {
  			*largest_off = cand;
  			*largest_bits = gap;
  		}

  		/* retry just past the allocated bit that got in the way */
  		start = blocker + 1;
  	}
  }
a16037c8d   Tejun Heo   percpu: make pcpu...
1118
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1119
   * pcpu_alloc_area - allocates an area from a pcpu_chunk
fbf59bc9d   Tejun Heo   percpu: implement...
1120
   * @chunk: chunk of interest
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1121
1122
1123
   * @alloc_bits: size of request in allocation units
   * @align: alignment of area (max PAGE_SIZE)
   * @start: bit_off to start searching
9f7dcf224   Tejun Heo   percpu: move chun...
1124
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1125
   * This function takes in a @start offset to begin searching to fit an
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
1126
1127
1128
1129
1130
1131
   * allocation of @alloc_bits with alignment @align.  It needs to scan
   * the allocation map because if it fits within the block's contig hint,
   * @start will be block->first_free. This is an attempt to fill the
   * allocation prior to breaking the contig hint.  The allocation and
   * boundary maps are updated accordingly if it confirms a valid
   * free area.
ccea34b5d   Tejun Heo   percpu: finer gra...
1132
   *
fbf59bc9d   Tejun Heo   percpu: implement...
1133
   * RETURNS:
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1134
1135
   * Allocated addr offset in @chunk on success.
   * -1 if no matching area is found.
fbf59bc9d   Tejun Heo   percpu: implement...
1136
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1137
1138
  static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
  			   size_t align, int start)
  {
  	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
  	size_t align_mask = (align) ? (align - 1) : 0;
  	unsigned long area_off = 0, area_bits = 0;
  	int bit_off, end, oslot;

  	lockdep_assert_held(&pcpu_lock);

  	/* remember the current slot so the chunk can be relocated afterwards */
  	oslot = pcpu_chunk_slot(chunk);

  	/*
  	 * Search to find a fit.  The scan window is bounded to
  	 * @start + @alloc_bits + one bitmap block, clamped to the size of
  	 * the chunk's bitmap.
  	 */
  	end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
  		    pcpu_chunk_map_bits(chunk));
  	bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
  				      align_mask, &area_off, &area_bits);
  	if (bit_off >= end)
  		return -1;

  	/* pass along the area reported by the search, if there was one */
  	if (area_bits)
  		pcpu_block_update_scan(chunk, area_off, area_bits);

  	/* update alloc map */
  	bitmap_set(chunk->alloc_map, bit_off, alloc_bits);

  	/*
  	 * update boundary map: mark the start and the end of the new
  	 * allocation and clear any boundary bits in between
  	 */
  	set_bit(bit_off, chunk->bound_map);
  	bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
  	set_bit(bit_off + alloc_bits, chunk->bound_map);

  	chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;

  	/*
  	 * update first free bit: only needed when the allocation started
  	 * exactly at the old first free bit
  	 */
  	if (bit_off == chunk_md->first_free)
  		chunk_md->first_free = find_next_zero_bit(
  					chunk->alloc_map,
  					pcpu_chunk_map_bits(chunk),
  					bit_off + alloc_bits);

  	pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);

  	pcpu_chunk_relocate(chunk, oslot);

  	/* convert the bit offset back to a byte offset into the chunk */
  	return bit_off * PCPU_MIN_ALLOC_SIZE;
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1184
   * pcpu_free_area - frees the corresponding offset
fbf59bc9d   Tejun Heo   percpu: implement...
1185
   * @chunk: chunk of interest
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1186
   * @off: addr offset into chunk
ccea34b5d   Tejun Heo   percpu: finer gra...
1187
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1188
1189
   * This function determines the size of an allocation to free using
   * the boundary bitmap and clears the allocation map.
fbf59bc9d   Tejun Heo   percpu: implement...
1190
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1191
  static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
fbf59bc9d   Tejun Heo   percpu: implement...
1192
  {
92c14cab4   Dennis Zhou   percpu: convert c...
1193
  	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1194
  	int bit_off, bits, end, oslot;
723ad1d90   Al Viro   percpu: store off...
1195

5ccd30e40   Dennis Zhou   percpu: add missi...
1196
  	lockdep_assert_held(&pcpu_lock);
30a5b5367   Dennis Zhou   percpu: expose st...
1197
  	pcpu_stats_area_dealloc(chunk);
5ccd30e40   Dennis Zhou   percpu: add missi...
1198

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1199
  	oslot = pcpu_chunk_slot(chunk);
fbf59bc9d   Tejun Heo   percpu: implement...
1200

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1201
  	bit_off = off / PCPU_MIN_ALLOC_SIZE;
3d331ad74   Al Viro   percpu: speed all...
1202

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1203
1204
1205
1206
1207
  	/* find end index */
  	end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
  			    bit_off + 1);
  	bits = end - bit_off;
  	bitmap_clear(chunk->alloc_map, bit_off, bits);
fbf59bc9d   Tejun Heo   percpu: implement...
1208

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1209
1210
  	/* update metadata */
  	chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
b539b87fe   Tejun Heo   percpu: implmeent...
1211

86b442fbc   Dennis Zhou (Facebook)   percpu: add first...
1212
  	/* update first free bit */
92c14cab4   Dennis Zhou   percpu: convert c...
1213
  	chunk_md->first_free = min(chunk_md->first_free, bit_off);
86b442fbc   Dennis Zhou (Facebook)   percpu: add first...
1214

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1215
  	pcpu_block_update_hint_free(chunk, bit_off, bits);
fbf59bc9d   Tejun Heo   percpu: implement...
1216

fbf59bc9d   Tejun Heo   percpu: implement...
1217
1218
  	pcpu_chunk_relocate(chunk, oslot);
  }
047924c96   Dennis Zhou   percpu: make pcpu...
1219
1220
1221
1222
1223
1224
1225
1226
1227
  static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
  {
  	/* size of the region this metadata block describes */
  	block->nr_bits = nr_bits;

  	/* nothing is allocated yet: the first free bit is bit 0 ... */
  	block->first_free = 0;
  	block->scan_hint = 0;

  	/* ... and the free run covers all @nr_bits from either edge */
  	block->left_free = nr_bits;
  	block->right_free = nr_bits;
  	block->contig_hint = nr_bits;
  }
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1228
1229
1230
  static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
  {
  	struct pcpu_block_md *md_block;
92c14cab4   Dennis Zhou   percpu: convert c...
1231
1232
  	/* init the chunk's block */
  	pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1233
1234
  	for (md_block = chunk->md_blocks;
  	     md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
047924c96   Dennis Zhou   percpu: make pcpu...
1235
1236
  	     md_block++)
  		pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1237
  }
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
  /**
   * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
   * @tmp_addr: the start of the region served
   * @map_size: size of the region served
   *
   * This is responsible for creating the chunks that serve the first chunk.  The
   * base_addr is page aligned down of @tmp_addr while the region end is page
   * aligned up.  Offsets are kept track of to determine the region served. All
   * this is done to appease the bitmap allocator in avoiding partial blocks.
   *
   * RETURNS:
   * Chunk serving the region at @tmp_addr of @map_size.
   */
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1251
  static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1252
  							 int map_size)
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1253
1254
  {
  	struct pcpu_chunk *chunk;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1255
  	unsigned long aligned_addr, lcm_align;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1256
  	int start_offset, offset_bits, region_size, region_bits;
f655f4053   Mike Rapoport   mm/percpu: add ch...
1257
  	size_t alloc_size;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1258
1259
1260
1261
1262
  
  	/* region calculations */
  	aligned_addr = tmp_addr & PAGE_MASK;
  
  	start_offset = tmp_addr - aligned_addr;
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1263

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1264
1265
1266
1267
1268
1269
1270
  	/*
  	 * Align the end of the region with the LCM of PAGE_SIZE and
  	 * PCPU_BITMAP_BLOCK_SIZE.  One of these constants is a multiple of
  	 * the other.
  	 */
  	lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
  	region_size = ALIGN(start_offset + map_size, lcm_align);
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1271

c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1272
  	/* allocate chunk */
f655f4053   Mike Rapoport   mm/percpu: add ch...
1273
  	alloc_size = sizeof(struct pcpu_chunk) +
e44bd84cd   Sunghyun Jin   percpu: fix first...
1274
  		BITS_TO_LONGS(region_size >> PAGE_SHIFT) * sizeof(unsigned long);
f655f4053   Mike Rapoport   mm/percpu: add ch...
1275
1276
1277
1278
1279
  	chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!chunk)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1280

10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1281
  	INIT_LIST_HEAD(&chunk->list);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1282
1283
  
  	chunk->base_addr = (void *)aligned_addr;
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1284
  	chunk->start_offset = start_offset;
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1285
  	chunk->end_offset = region_size - chunk->start_offset - map_size;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1286

8ab16c43e   Dennis Zhou (Facebook)   percpu: change th...
1287
  	chunk->nr_pages = region_size >> PAGE_SHIFT;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1288
  	region_bits = pcpu_chunk_map_bits(chunk);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1289

f655f4053   Mike Rapoport   mm/percpu: add ch...
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
  	alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
  	chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!chunk->alloc_map)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
  
  	alloc_size =
  		BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
  	chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!chunk->bound_map)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
  
  	alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
  	chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!chunk->md_blocks)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1311
  	pcpu_init_md_blocks(chunk);
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1312
1313
1314
  
  	/* manage populated page bitmap */
  	chunk->immutable = true;
8ab16c43e   Dennis Zhou (Facebook)   percpu: change th...
1315
1316
  	bitmap_fill(chunk->populated, chunk->nr_pages);
  	chunk->nr_populated = chunk->nr_pages;
b239f7daf   Dennis Zhou   percpu: set PCPU_...
1317
  	chunk->nr_empty_pop_pages = chunk->nr_pages;
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1318

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1319
  	chunk->free_bytes = map_size;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1320
1321
1322
  
  	if (chunk->start_offset) {
  		/* hide the beginning of the bitmap */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1323
1324
1325
1326
  		offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
  		bitmap_set(chunk->alloc_map, 0, offset_bits);
  		set_bit(0, chunk->bound_map);
  		set_bit(offset_bits, chunk->bound_map);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1327

92c14cab4   Dennis Zhou   percpu: convert c...
1328
  		chunk->chunk_md.first_free = offset_bits;
86b442fbc   Dennis Zhou (Facebook)   percpu: add first...
1329

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1330
  		pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1331
  	}
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1332
1333
  	if (chunk->end_offset) {
  		/* hide the end of the bitmap */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1334
1335
1336
1337
1338
1339
1340
  		offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
  		bitmap_set(chunk->alloc_map,
  			   pcpu_chunk_map_bits(chunk) - offset_bits,
  			   offset_bits);
  		set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
  			chunk->bound_map);
  		set_bit(region_bits, chunk->bound_map);
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1341

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1342
1343
1344
  		pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
  					     - offset_bits, offset_bits);
  	}
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1345

10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1346
1347
  	return chunk;
  }
47504ee04   Dennis Zhou   percpu: add __GFP...
1348
  /*
   * Allocate a chunk descriptor of pcpu_unit_pages pages together with its
   * allocation bitmap, boundary bitmap and block metadata, using @gfp for
   * every allocation.  On any failure everything allocated so far is freed
   * in reverse order and NULL is returned.
   */
  static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
  {
  	struct pcpu_chunk *chunk;
  	int region_bits;

  	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
  	if (!chunk)
  		return NULL;

  	INIT_LIST_HEAD(&chunk->list);
  	chunk->nr_pages = pcpu_unit_pages;
  	region_bits = pcpu_chunk_map_bits(chunk);

  	chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
  					   sizeof(chunk->alloc_map[0]), gfp);
  	if (!chunk->alloc_map)
  		goto alloc_map_fail;

  	/* the boundary map carries one extra bit for the end of the region */
  	chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
  					   sizeof(chunk->bound_map[0]), gfp);
  	if (!chunk->bound_map)
  		goto bound_map_fail;

  	chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
  					   sizeof(chunk->md_blocks[0]), gfp);
  	if (!chunk->md_blocks)
  		goto md_blocks_fail;

  	pcpu_init_md_blocks(chunk);

  	/* init metadata */
  	chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;

  	return chunk;

  md_blocks_fail:
  	pcpu_mem_free(chunk->bound_map);
  bound_map_fail:
  	pcpu_mem_free(chunk->alloc_map);
  alloc_map_fail:
  	pcpu_mem_free(chunk);

  	return NULL;
  }
  
  /*
   * Free @chunk's block metadata, boundary map, allocation map and finally
   * the chunk descriptor itself.  A NULL @chunk is ignored.
   */
  static void pcpu_free_chunk(struct pcpu_chunk *chunk)
  {
  	if (!chunk)
  		return;
  	pcpu_mem_free(chunk->md_blocks);
  	pcpu_mem_free(chunk->bound_map);
  	pcpu_mem_free(chunk->alloc_map);
  	pcpu_mem_free(chunk);
  }
b539b87fe   Tejun Heo   percpu: implmeent...
1400
1401
1402
1403
1404
1405
1406
1407
1408
  /**
   * pcpu_chunk_populated - post-population bookkeeping
   * @chunk: pcpu_chunk which got populated
   * @page_start: the start page
   * @page_end: the end page
   *
   * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
   * the bookkeeping information accordingly.  Must be called after each
   * successful population.
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1409
1410
1411
   *
   * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it
   * is to serve an allocation in that area.
b539b87fe   Tejun Heo   percpu: implmeent...
1412
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1413
  static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
b239f7daf   Dennis Zhou   percpu: set PCPU_...
1414
  				 int page_end)
b539b87fe   Tejun Heo   percpu: implmeent...
1415
1416
1417
1418
1419
1420
1421
  {
  	int nr = page_end - page_start;
  
  	lockdep_assert_held(&pcpu_lock);
  
  	bitmap_set(chunk->populated, page_start, nr);
  	chunk->nr_populated += nr;
7e8a6304d   Dennis Zhou (Facebook)   /proc/meminfo: ad...
1422
  	pcpu_nr_populated += nr;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1423

b239f7daf   Dennis Zhou   percpu: set PCPU_...
1424
  	pcpu_update_empty_pages(chunk, nr);
b539b87fe   Tejun Heo   percpu: implmeent...
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
  }
  
  /**
   * pcpu_chunk_depopulated - post-depopulation bookkeeping
   * @chunk: pcpu_chunk which got depopulated
   * @page_start: the start page
   * @page_end: the end page
   *
   * Pages in [@page_start,@page_end) have been depopulated from @chunk.
   * Update the bookkeeping information accordingly.  Must be called after
   * each successful depopulation.
   */
  static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
  				   int page_start, int page_end)
  {
  	int nr = page_end - page_start;
  
  	lockdep_assert_held(&pcpu_lock);
  
  	bitmap_clear(chunk->populated, page_start, nr);
  	chunk->nr_populated -= nr;
7e8a6304d   Dennis Zhou (Facebook)   /proc/meminfo: ad...
1446
  	pcpu_nr_populated -= nr;
b239f7daf   Dennis Zhou   percpu: set PCPU_...
1447
1448
  
  	pcpu_update_empty_pages(chunk, -nr);
b539b87fe   Tejun Heo   percpu: implmeent...
1449
  }
9f6455325   Tejun Heo   percpu: move vmal...
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
  /*
   * Chunk management implementation.
   *
   * To allow different implementations, chunk alloc/free and
   * [de]population are implemented in a separate file which is pulled
   * into this file and compiled together.  The following functions
   * should be implemented.
   *
   * pcpu_populate_chunk		- populate the specified range of a chunk
   * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
   * pcpu_create_chunk		- create a new chunk
   * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
   * pcpu_addr_to_page		- translate address to physical address
   * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
fbf59bc9d   Tejun Heo   percpu: implement...
1464
   */
15d9f3d11   Dennis Zhou   percpu: match chu...
1465
  static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
47504ee04   Dennis Zhou   percpu: add __GFP...
1466
  			       int page_start, int page_end, gfp_t gfp);
15d9f3d11   Dennis Zhou   percpu: match chu...
1467
1468
  static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
  				  int page_start, int page_end);
47504ee04   Dennis Zhou   percpu: add __GFP...
1469
  static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
9f6455325   Tejun Heo   percpu: move vmal...
1470
1471
1472
  static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
  static struct page *pcpu_addr_to_page(void *addr);
  static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
fbf59bc9d   Tejun Heo   percpu: implement...
1473

b0c9778b1   Tejun Heo   percpu: implement...
1474
1475
1476
  #ifdef CONFIG_NEED_PER_CPU_KM
  #include "percpu-km.c"
  #else
9f6455325   Tejun Heo   percpu: move vmal...
1477
  #include "percpu-vm.c"
b0c9778b1   Tejun Heo   percpu: implement...
1478
  #endif
fbf59bc9d   Tejun Heo   percpu: implement...
1479
1480
  
  /**
88999a898   Tejun Heo   percpu: misc prep...
1481
1482
1483
   * pcpu_chunk_addr_search - determine chunk containing specified address
   * @addr: address for which the chunk needs to be determined.
   *
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1484
1485
1486
   * This is an internal function that handles all but static allocations.
   * Static percpu address values should never be passed into the allocator.
   *
88999a898   Tejun Heo   percpu: misc prep...
1487
1488
1489
1490
1491
   * RETURNS:
   * The address of the found chunk.
   */
  static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  {
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1492
  	/* is it in the dynamic region (first chunk)? */
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
1493
  	if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
88999a898   Tejun Heo   percpu: misc prep...
1494
  		return pcpu_first_chunk;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1495
1496
  
  	/* is it in the reserved region? */
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
1497
  	if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1498
  		return pcpu_reserved_chunk;
88999a898   Tejun Heo   percpu: misc prep...
1499
1500
1501
1502
1503
1504
1505
1506
1507
  
  	/*
  	 * The address is relative to unit0 which might be unused and
  	 * thus unmapped.  Offset the address to the unit space of the
  	 * current processor before looking it up in the vmalloc
  	 * space.  Note that any possible cpu id can be used here, so
  	 * there's no need to worry about preemption or cpu hotplug.
  	 */
  	addr += pcpu_unit_offsets[raw_smp_processor_id()];
9f6455325   Tejun Heo   percpu: move vmal...
1508
  	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
88999a898   Tejun Heo   percpu: misc prep...
1509
1510
1511
  }
  
  /**
edcb46399   Tejun Heo   percpu, module: i...
1512
   * pcpu_alloc - the percpu allocator
cae3aeb83   Tejun Heo   percpu: clean up ...
1513
   * @size: size of area to allocate in bytes
fbf59bc9d   Tejun Heo   percpu: implement...
1514
   * @align: alignment of area (max PAGE_SIZE)
edcb46399   Tejun Heo   percpu, module: i...
1515
   * @reserved: allocate from the reserved chunk if available
5835d96e9   Tejun Heo   percpu: implement...
1516
   * @gfp: allocation flags
fbf59bc9d   Tejun Heo   percpu: implement...
1517
   *
5835d96e9   Tejun Heo   percpu: implement...
1518
   * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1519
1520
1521
   * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
   * then no warning will be triggered on invalid or failed allocation
   * requests.
fbf59bc9d   Tejun Heo   percpu: implement...
1522
1523
1524
1525
   *
   * RETURNS:
   * Percpu pointer to the allocated area on success, NULL on failure.
   */
5835d96e9   Tejun Heo   percpu: implement...
1526
1527
  static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
  				 gfp_t gfp)
fbf59bc9d   Tejun Heo   percpu: implement...
1528
  {
554fef1c3   Dennis Zhou   percpu: allow sel...
1529
1530
  	/* whitelisted flags that can be passed to the backing allocators */
  	gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1531
1532
  	bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
  	bool do_warn = !(gfp & __GFP_NOWARN);
f2badb0c9   Tejun Heo   percpu: make allo...
1533
  	static int warn_limit = 10;
8744d8594   Dennis Zhou   percpu: relegate ...
1534
  	struct pcpu_chunk *chunk, *next;
f2badb0c9   Tejun Heo   percpu: make allo...
1535
  	const char *err;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1536
  	int slot, off, cpu, ret;
403a91b16   Jiri Kosina   percpu: allow pcp...
1537
  	unsigned long flags;
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
1538
  	void __percpu *ptr;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1539
  	size_t bits, bit_align;
fbf59bc9d   Tejun Heo   percpu: implement...
1540

723ad1d90   Al Viro   percpu: store off...
1541
  	/*
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1542
1543
1544
1545
  	 * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
  	 * therefore alignment must be a minimum of that many bytes.
  	 * An allocation may have internal fragmentation from rounding up
  	 * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
723ad1d90   Al Viro   percpu: store off...
1546
  	 */
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
1547
1548
  	if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
  		align = PCPU_MIN_ALLOC_SIZE;
723ad1d90   Al Viro   percpu: store off...
1549

d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
1550
  	size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1551
1552
  	bits = size >> PCPU_MIN_ALLOC_SHIFT;
  	bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
2f69fa829   Al Viro   percpu: allocatio...
1553

3ca45a46f   zijun_hu   percpu: ensure th...
1554
1555
  	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
  		     !is_power_of_2(align))) {
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1556
1557
  		WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation
  ",
756a025f0   Joe Perches   mm: coalesce spli...
1558
  		     size, align);
fbf59bc9d   Tejun Heo   percpu: implement...
1559
1560
  		return NULL;
  	}
f52ba1fef   Kirill Tkhai   mm: Allow to kill...
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
  	if (!is_atomic) {
  		/*
  		 * pcpu_balance_workfn() allocates memory under this mutex,
  		 * and it may wait for memory reclaim. Allow current task
  		 * to become OOM victim, in case of memory pressure.
  		 */
  		if (gfp & __GFP_NOFAIL)
  			mutex_lock(&pcpu_alloc_mutex);
  		else if (mutex_lock_killable(&pcpu_alloc_mutex))
  			return NULL;
  	}
6710e594f   Tejun Heo   percpu: fix synch...
1572

403a91b16   Jiri Kosina   percpu: allow pcp...
1573
  	spin_lock_irqsave(&pcpu_lock, flags);
fbf59bc9d   Tejun Heo   percpu: implement...
1574

edcb46399   Tejun Heo   percpu, module: i...
1575
1576
1577
  	/* serve reserved allocations from the reserved chunk if available */
  	if (reserved && pcpu_reserved_chunk) {
  		chunk = pcpu_reserved_chunk;
833af8427   Tejun Heo   percpu: restructu...
1578

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1579
1580
  		off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
  		if (off < 0) {
833af8427   Tejun Heo   percpu: restructu...
1581
  			err = "alloc from reserved chunk failed";
ccea34b5d   Tejun Heo   percpu: finer gra...
1582
  			goto fail_unlock;
f2badb0c9   Tejun Heo   percpu: make allo...
1583
  		}
833af8427   Tejun Heo   percpu: restructu...
1584

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1585
  		off = pcpu_alloc_area(chunk, bits, bit_align, off);
edcb46399   Tejun Heo   percpu, module: i...
1586
1587
  		if (off >= 0)
  			goto area_found;
833af8427   Tejun Heo   percpu: restructu...
1588

f2badb0c9   Tejun Heo   percpu: make allo...
1589
  		err = "alloc from reserved chunk failed";
ccea34b5d   Tejun Heo   percpu: finer gra...
1590
  		goto fail_unlock;
edcb46399   Tejun Heo   percpu, module: i...
1591
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1592
  restart:
edcb46399   Tejun Heo   percpu, module: i...
1593
  	/* search through normal chunks */
fbf59bc9d   Tejun Heo   percpu: implement...
1594
  	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
8744d8594   Dennis Zhou   percpu: relegate ...
1595
  		list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1596
1597
  			off = pcpu_find_block_fit(chunk, bits, bit_align,
  						  is_atomic);
8744d8594   Dennis Zhou   percpu: relegate ...
1598
1599
1600
  			if (off < 0) {
  				if (slot < PCPU_SLOT_FAIL_THRESHOLD)
  					pcpu_chunk_move(chunk, 0);
fbf59bc9d   Tejun Heo   percpu: implement...
1601
  				continue;
8744d8594   Dennis Zhou   percpu: relegate ...
1602
  			}
ccea34b5d   Tejun Heo   percpu: finer gra...
1603

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1604
  			off = pcpu_alloc_area(chunk, bits, bit_align, off);
fbf59bc9d   Tejun Heo   percpu: implement...
1605
1606
  			if (off >= 0)
  				goto area_found;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1607

fbf59bc9d   Tejun Heo   percpu: implement...
1608
1609
  		}
  	}
403a91b16   Jiri Kosina   percpu: allow pcp...
1610
  	spin_unlock_irqrestore(&pcpu_lock, flags);
ccea34b5d   Tejun Heo   percpu: finer gra...
1611

b38d08f31   Tejun Heo   percpu: restructu...
1612
1613
1614
1615
1616
  	/*
  	 * No space left.  Create a new chunk.  We don't want multiple
  	 * tasks to create chunks simultaneously.  Serialize and create iff
  	 * there's still no empty chunk after grabbing the mutex.
  	 */
11df02bf9   Dennis Zhou   percpu: resolve e...
1617
1618
  	if (is_atomic) {
  		err = "atomic alloc failed, no space left";
5835d96e9   Tejun Heo   percpu: implement...
1619
  		goto fail;
11df02bf9   Dennis Zhou   percpu: resolve e...
1620
  	}
5835d96e9   Tejun Heo   percpu: implement...
1621

b38d08f31   Tejun Heo   percpu: restructu...
1622
  	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
554fef1c3   Dennis Zhou   percpu: allow sel...
1623
  		chunk = pcpu_create_chunk(pcpu_gfp);
b38d08f31   Tejun Heo   percpu: restructu...
1624
1625
1626
1627
1628
1629
1630
1631
1632
  		if (!chunk) {
  			err = "failed to allocate new chunk";
  			goto fail;
  		}
  
  		spin_lock_irqsave(&pcpu_lock, flags);
  		pcpu_chunk_relocate(chunk, -1);
  	} else {
  		spin_lock_irqsave(&pcpu_lock, flags);
f2badb0c9   Tejun Heo   percpu: make allo...
1633
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1634

ccea34b5d   Tejun Heo   percpu: finer gra...
1635
  	goto restart;
fbf59bc9d   Tejun Heo   percpu: implement...
1636
1637
  
  area_found:
30a5b5367   Dennis Zhou   percpu: expose st...
1638
  	pcpu_stats_area_alloc(chunk, size);
403a91b16   Jiri Kosina   percpu: allow pcp...
1639
  	spin_unlock_irqrestore(&pcpu_lock, flags);
ccea34b5d   Tejun Heo   percpu: finer gra...
1640

dca496451   Tejun Heo   percpu: move comm...
1641
  	/* populate if not all pages are already there */
5835d96e9   Tejun Heo   percpu: implement...
1642
  	if (!is_atomic) {
e04d32083   Tejun Heo   percpu: indent th...
1643
  		int page_start, page_end, rs, re;
dca496451   Tejun Heo   percpu: move comm...
1644

e04d32083   Tejun Heo   percpu: indent th...
1645
1646
  		page_start = PFN_DOWN(off);
  		page_end = PFN_UP(off + size);
b38d08f31   Tejun Heo   percpu: restructu...
1647

91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
1648
1649
  		pcpu_for_each_unpop_region(chunk->populated, rs, re,
  					   page_start, page_end) {
e04d32083   Tejun Heo   percpu: indent th...
1650
  			WARN_ON(chunk->immutable);
554fef1c3   Dennis Zhou   percpu: allow sel...
1651
  			ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
e04d32083   Tejun Heo   percpu: indent th...
1652
1653
1654
  
  			spin_lock_irqsave(&pcpu_lock, flags);
  			if (ret) {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1655
  				pcpu_free_area(chunk, off);
e04d32083   Tejun Heo   percpu: indent th...
1656
1657
1658
  				err = "failed to populate";
  				goto fail_unlock;
  			}
b239f7daf   Dennis Zhou   percpu: set PCPU_...
1659
  			pcpu_chunk_populated(chunk, rs, re);
e04d32083   Tejun Heo   percpu: indent th...
1660
  			spin_unlock_irqrestore(&pcpu_lock, flags);
dca496451   Tejun Heo   percpu: move comm...
1661
  		}
fbf59bc9d   Tejun Heo   percpu: implement...
1662

e04d32083   Tejun Heo   percpu: indent th...
1663
1664
  		mutex_unlock(&pcpu_alloc_mutex);
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1665

1a4d76076   Tejun Heo   percpu: implement...
1666
1667
  	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
  		pcpu_schedule_balance_work();
dca496451   Tejun Heo   percpu: move comm...
1668
1669
1670
  	/* clear the areas and return address relative to base address */
  	for_each_possible_cpu(cpu)
  		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
1671
  	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
8a8c35fad   Larry Finger   mm: kmemleak_allo...
1672
  	kmemleak_alloc_percpu(ptr, size, gfp);
df95e795a   Dennis Zhou   percpu: add trace...
1673
1674
1675
  
  	trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
  			chunk->base_addr, off, ptr);
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
1676
  	return ptr;
ccea34b5d   Tejun Heo   percpu: finer gra...
1677
1678
  
  fail_unlock:
403a91b16   Jiri Kosina   percpu: allow pcp...
1679
  	spin_unlock_irqrestore(&pcpu_lock, flags);
b38d08f31   Tejun Heo   percpu: restructu...
1680
  fail:
df95e795a   Dennis Zhou   percpu: add trace...
1681
  	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1682
  	if (!is_atomic && do_warn && warn_limit) {
870d4b12a   Joe Perches   mm: percpu: use p...
1683
1684
  		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s
  ",
598d80914   Joe Perches   mm: convert pr_wa...
1685
  			size, align, is_atomic, err);
f2badb0c9   Tejun Heo   percpu: make allo...
1686
1687
  		dump_stack();
  		if (!--warn_limit)
870d4b12a   Joe Perches   mm: percpu: use p...
1688
1689
  			pr_info("limit reached, disable warning
  ");
f2badb0c9   Tejun Heo   percpu: make allo...
1690
  	}
1a4d76076   Tejun Heo   percpu: implement...
1691
1692
1693
1694
  	if (is_atomic) {
  		/* see the flag handling in pcpu_balance_workfn() */
  		pcpu_atomic_alloc_failed = true;
  		pcpu_schedule_balance_work();
6710e594f   Tejun Heo   percpu: fix synch...
1695
1696
  	} else {
  		mutex_unlock(&pcpu_alloc_mutex);
1a4d76076   Tejun Heo   percpu: implement...
1697
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1698
  	return NULL;
fbf59bc9d   Tejun Heo   percpu: implement...
1699
  }
edcb46399   Tejun Heo   percpu, module: i...
1700
1701
  
  /**
5835d96e9   Tejun Heo   percpu: implement...
1702
   * __alloc_percpu_gfp - allocate dynamic percpu area
edcb46399   Tejun Heo   percpu, module: i...
1703
1704
   * @size: size of area to allocate in bytes
   * @align: alignment of area (max PAGE_SIZE)
5835d96e9   Tejun Heo   percpu: implement...
1705
   * @gfp: allocation flags
edcb46399   Tejun Heo   percpu, module: i...
1706
   *
5835d96e9   Tejun Heo   percpu: implement...
1707
1708
   * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
   * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1709
1710
1711
   * be called from any context but is a lot more likely to fail. If @gfp
   * has __GFP_NOWARN then no warning will be triggered on invalid or failed
   * allocation requests.
ccea34b5d   Tejun Heo   percpu: finer gra...
1712
   *
edcb46399   Tejun Heo   percpu, module: i...
1713
1714
1715
   * RETURNS:
   * Percpu pointer to the allocated area on success, NULL on failure.
   */
5835d96e9   Tejun Heo   percpu: implement...
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
  void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
  {
  	/* not a reserved-area request; the caller's @gfp is passed through */
  	const bool reserved = false;

  	return pcpu_alloc(size, align, reserved, gfp);
  }
  EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
  
  /**
   * __alloc_percpu - allocate dynamic percpu area
   * @size: size of area to allocate in bytes
   * @align: alignment of area (max PAGE_SIZE)
   *
   * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
   */
43cf38eb5   Tejun Heo   percpu: add __per...
1729
  void __percpu *__alloc_percpu(size_t size, size_t align)
  {
  	/* not a reserved-area request; always allocates with GFP_KERNEL */
  	const bool reserved = false;

  	return pcpu_alloc(size, align, reserved, GFP_KERNEL);
  }
  EXPORT_SYMBOL_GPL(__alloc_percpu);
edcb46399   Tejun Heo   percpu, module: i...
1734
1735
1736
1737
1738
  /**
   * __alloc_reserved_percpu - allocate reserved percpu area
   * @size: size of area to allocate in bytes
   * @align: alignment of area (max PAGE_SIZE)
   *
9329ba970   Tejun Heo   percpu: update co...
1739
1740
1741
1742
   * Allocate zero-filled percpu area of @size bytes aligned at @align
   * from reserved percpu area if arch has set it up; otherwise,
   * allocation is served from the same dynamic area.  Might sleep.
   * Might trigger writeouts.
edcb46399   Tejun Heo   percpu, module: i...
1743
   *
ccea34b5d   Tejun Heo   percpu: finer gra...
1744
1745
1746
   * CONTEXT:
   * Does GFP_KERNEL allocation.
   *
edcb46399   Tejun Heo   percpu, module: i...
1747
1748
1749
   * RETURNS:
   * Percpu pointer to the allocated area on success, NULL on failure.
   */
43cf38eb5   Tejun Heo   percpu: add __per...
1750
  void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
  {
  	/* request the reserved area; always allocates with GFP_KERNEL */
  	const bool reserved = true;

  	return pcpu_alloc(size, align, reserved, GFP_KERNEL);
  }
a56dbddf0   Tejun Heo   percpu: move full...
1754
  /**
1a4d76076   Tejun Heo   percpu: implement...
1755
   * pcpu_balance_workfn - manage the amount of free chunks and populated pages
a56dbddf0   Tejun Heo   percpu: move full...
1756
1757
   * @work: unused
   *
47504ee04   Dennis Zhou   percpu: add __GFP...
1758
1759
1760
1761
1762
1763
   * Reclaim all fully free chunks except for the first one.  This is also
   * responsible for maintaining the pool of empty populated pages.  However,
   * it is possible that this is called when physical memory is scarce causing
   * OOM killer to be triggered.  We should avoid doing so until an actual
   * allocation causes the failure as it is possible that requests can be
   * serviced from already backed regions.
a56dbddf0   Tejun Heo   percpu: move full...
1764
   */
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1765
  static void pcpu_balance_workfn(struct work_struct *work)
fbf59bc9d   Tejun Heo   percpu: implement...
1766
  {
47504ee04   Dennis Zhou   percpu: add __GFP...
1767
  	/* gfp flags passed to underlying allocators */
554fef1c3   Dennis Zhou   percpu: allow sel...
1768
  	const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1769
1770
  	LIST_HEAD(to_free);
  	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
a56dbddf0   Tejun Heo   percpu: move full...
1771
  	struct pcpu_chunk *chunk, *next;
1a4d76076   Tejun Heo   percpu: implement...
1772
  	int slot, nr_to_pop, ret;
a56dbddf0   Tejun Heo   percpu: move full...
1773

1a4d76076   Tejun Heo   percpu: implement...
1774
1775
1776
1777
  	/*
  	 * There's no reason to keep around multiple unused chunks and VM
  	 * areas can be scarce.  Destroy all free chunks except for one.
  	 */
ccea34b5d   Tejun Heo   percpu: finer gra...
1778
1779
  	mutex_lock(&pcpu_alloc_mutex);
  	spin_lock_irq(&pcpu_lock);
a56dbddf0   Tejun Heo   percpu: move full...
1780

fe6bd8c3d   Tejun Heo   percpu: rename pc...
1781
  	list_for_each_entry_safe(chunk, next, free_head, list) {
a56dbddf0   Tejun Heo   percpu: move full...
1782
1783
1784
  		WARN_ON(chunk->immutable);
  
  		/* spare the first one */
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1785
  		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
a56dbddf0   Tejun Heo   percpu: move full...
1786
  			continue;
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1787
  		list_move(&chunk->list, &to_free);
a56dbddf0   Tejun Heo   percpu: move full...
1788
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1789
  	spin_unlock_irq(&pcpu_lock);
a56dbddf0   Tejun Heo   percpu: move full...
1790

fe6bd8c3d   Tejun Heo   percpu: rename pc...
1791
  	list_for_each_entry_safe(chunk, next, &to_free, list) {
a93ace487   Tejun Heo   percpu: move regi...
1792
  		int rs, re;
dca496451   Tejun Heo   percpu: move comm...
1793

91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
1794
1795
  		pcpu_for_each_pop_region(chunk->populated, rs, re, 0,
  					 chunk->nr_pages) {
a93ace487   Tejun Heo   percpu: move regi...
1796
  			pcpu_depopulate_chunk(chunk, rs, re);
b539b87fe   Tejun Heo   percpu: implmeent...
1797
1798
1799
  			spin_lock_irq(&pcpu_lock);
  			pcpu_chunk_depopulated(chunk, rs, re);
  			spin_unlock_irq(&pcpu_lock);
a93ace487   Tejun Heo   percpu: move regi...
1800
  		}
6081089fd   Tejun Heo   percpu: reorganiz...
1801
  		pcpu_destroy_chunk(chunk);
accd4f36a   Eric Dumazet   percpu: add a sch...
1802
  		cond_resched();
a56dbddf0   Tejun Heo   percpu: move full...
1803
  	}
971f3918a   Tejun Heo   percpu: fix pcpu_...
1804

1a4d76076   Tejun Heo   percpu: implement...
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
  	/*
  	 * Ensure there are certain number of free populated pages for
  	 * atomic allocs.  Fill up from the most packed so that atomic
  	 * allocs don't increase fragmentation.  If atomic allocation
  	 * failed previously, always populate the maximum amount.  This
  	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
  	 * failing indefinitely; however, large atomic allocs are not
  	 * something we support properly and can be highly unreliable and
  	 * inefficient.
  	 */
  retry_pop:
  	if (pcpu_atomic_alloc_failed) {
  		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
  		/* best effort anyway, don't worry about synchronization */
  		pcpu_atomic_alloc_failed = false;
  	} else {
  		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
  				  pcpu_nr_empty_pop_pages,
  				  0, PCPU_EMPTY_POP_PAGES_HIGH);
  	}
  
  	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
  		int nr_unpop = 0, rs, re;
  
  		if (!nr_to_pop)
  			break;
  
  		spin_lock_irq(&pcpu_lock);
  		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
8ab16c43e   Dennis Zhou (Facebook)   percpu: change th...
1834
  			nr_unpop = chunk->nr_pages - chunk->nr_populated;
1a4d76076   Tejun Heo   percpu: implement...
1835
1836
1837
1838
1839
1840
1841
1842
1843
  			if (nr_unpop)
  				break;
  		}
  		spin_unlock_irq(&pcpu_lock);
  
  		if (!nr_unpop)
  			continue;
  
  		/* @chunk can't go away while pcpu_alloc_mutex is held */
91e914c5a   Dennis Zhou (Facebook)   percpu: generaliz...
1844
1845
  		pcpu_for_each_unpop_region(chunk->populated, rs, re, 0,
  					   chunk->nr_pages) {
1a4d76076   Tejun Heo   percpu: implement...
1846
  			int nr = min(re - rs, nr_to_pop);
47504ee04   Dennis Zhou   percpu: add __GFP...
1847
  			ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
1a4d76076   Tejun Heo   percpu: implement...
1848
1849
1850
  			if (!ret) {
  				nr_to_pop -= nr;
  				spin_lock_irq(&pcpu_lock);
b239f7daf   Dennis Zhou   percpu: set PCPU_...
1851
  				pcpu_chunk_populated(chunk, rs, rs + nr);
1a4d76076   Tejun Heo   percpu: implement...
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
  				spin_unlock_irq(&pcpu_lock);
  			} else {
  				nr_to_pop = 0;
  			}
  
  			if (!nr_to_pop)
  				break;
  		}
  	}
  
  	if (nr_to_pop) {
  		/* ran out of chunks to populate, create a new one and retry */
47504ee04   Dennis Zhou   percpu: add __GFP...
1864
  		chunk = pcpu_create_chunk(gfp);
1a4d76076   Tejun Heo   percpu: implement...
1865
1866
1867
1868
1869
1870
1871
  		if (chunk) {
  			spin_lock_irq(&pcpu_lock);
  			pcpu_chunk_relocate(chunk, -1);
  			spin_unlock_irq(&pcpu_lock);
  			goto retry_pop;
  		}
  	}
971f3918a   Tejun Heo   percpu: fix pcpu_...
1872
  	mutex_unlock(&pcpu_alloc_mutex);
fbf59bc9d   Tejun Heo   percpu: implement...
1873
1874
1875
1876
1877
1878
  }
  
  /**
   * free_percpu - free percpu area
   * @ptr: pointer to area to free
   *
ccea34b5d   Tejun Heo   percpu: finer gra...
1879
1880
1881
1882
   * Free percpu area @ptr.
   *
   * CONTEXT:
   * Can be called from atomic context.
fbf59bc9d   Tejun Heo   percpu: implement...
1883
   */
43cf38eb5   Tejun Heo   percpu: add __per...
1884
  void free_percpu(void __percpu *ptr)
  {
  	struct pcpu_chunk *chunk;
  	unsigned long flags;
  	bool need_balance = false;
  	void *addr;
  	int off;

  	/* a NULL pointer is silently ignored */
  	if (!ptr)
  		return;

  	kmemleak_free_percpu(ptr);

  	addr = __pcpu_ptr_to_addr(ptr);

  	spin_lock_irqsave(&pcpu_lock, flags);

  	/* find the chunk holding @addr and the area's offset inside it */
  	chunk = pcpu_chunk_addr_search(addr);
  	off = addr - chunk->base_addr;

  	pcpu_free_area(chunk, off);

  	/*
  	 * When this chunk is now fully free and the last slot holds any
  	 * other chunk as well, there is more than one fully free chunk:
  	 * ask for the balance work to run.
  	 */
  	if (chunk->free_bytes == pcpu_unit_size) {
  		struct pcpu_chunk *other;

  		list_for_each_entry(other, &pcpu_slot[pcpu_nr_slots - 1], list) {
  			if (other == chunk)
  				continue;

  			need_balance = true;
  			break;
  		}
  	}

  	trace_percpu_free_percpu(chunk->base_addr, off, ptr);

  	spin_unlock_irqrestore(&pcpu_lock, flags);

  	/* the work is scheduled only after pcpu_lock has been dropped */
  	if (need_balance)
  		pcpu_schedule_balance_work();
  }
  EXPORT_SYMBOL_GPL(free_percpu);
383776fa7   Thomas Gleixner   locking/lockdep: ...
1919
  bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
  {
  #ifdef CONFIG_SMP
  	const size_t static_size = __per_cpu_end - __per_cpu_start;
  	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
  	void *va = (void *)addr;
  	unsigned int cpu;

  	/* compare @addr against every possible cpu's static percpu range */
  	for_each_possible_cpu(cpu) {
  		void *start = per_cpu_ptr(base, cpu);

  		if (va < start || va >= start + static_size)
  			continue;

  		/*
  		 * On request, also report the matching offset rebased onto
  		 * the boot cpu's copy of the area.
  		 */
  		if (can_addr)
  			*can_addr = (unsigned long)(va - start) +
  				    (unsigned long)
  					per_cpu_ptr(base, get_boot_cpu_id());

  		return true;
  	}
  #endif
  	/* on UP, can't distinguish from other static vars, always false */
  	return false;
  }
  
  /**
383776fa7   Thomas Gleixner   locking/lockdep: ...
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
   * is_kernel_percpu_address - test whether address is from static percpu area
   * @addr: address to test
   *
   * Test whether @addr belongs to in-kernel static percpu area.  Module
   * static percpu areas are not considered.  For those, use
   * is_module_percpu_address().
   *
   * RETURNS:
   * %true if @addr is from in-kernel static percpu area, %false otherwise.
   */
  bool is_kernel_percpu_address(unsigned long addr)
  {
  	/* only the yes/no answer is wanted, so no output pointer is given */
  	unsigned long *no_can_addr = NULL;

  	return __is_kernel_percpu_address(addr, no_can_addr);
  }
  
  /**
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
1961
1962
1963
1964
1965
1966
1967
1968
   * per_cpu_ptr_to_phys - convert translated percpu address to physical address
   * @addr: the address to be converted to physical address
   *
   * Given @addr which is dereferenceable address obtained via one of
   * percpu access macros, this function translates it into its physical
   * address.  The caller is responsible for ensuring @addr stays valid
   * until this function finishes.
   *
67589c714   Dave Young   percpu: explain w...
1969
1970
1971
1972
1973
   * percpu allocator has special setup for the first chunk, which currently
   * supports either embedding in linear address space or vmalloc mapping,
   * and, from the second one, the backing allocator (currently either vm or
   * km) provides translation.
   *
bffc43758   Yannick Guerrini   percpu: Fix trivi...
1974
   * The addr can be translated simply without checking if it falls into the
67589c714   Dave Young   percpu: explain w...
1975
1976
1977
1978
1979
   * first chunk. But the current code reflects better how percpu allocator
   * actually works, and the verification can discover both bugs in percpu
   * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
   * code.
   *
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
1980
1981
1982
1983
1984
   * RETURNS:
   * The physical address for @addr.
   */
  phys_addr_t per_cpu_ptr_to_phys(void *addr)
  {
9983b6f0c   Tejun Heo   percpu: fix first...
1985
1986
  	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
  	bool in_first_chunk = false;
a855b84c3   Tejun Heo   percpu: fix chunk...
1987
  	unsigned long first_low, first_high;
9983b6f0c   Tejun Heo   percpu: fix first...
1988
1989
1990
  	unsigned int cpu;
  
  	/*
a855b84c3   Tejun Heo   percpu: fix chunk...
1991
  	 * The following test on unit_low/high isn't strictly
9983b6f0c   Tejun Heo   percpu: fix first...
1992
1993
  	 * necessary but will speed up lookups of addresses which
  	 * aren't in the first chunk.
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1994
1995
1996
1997
1998
  	 *
  	 * The address check is against full chunk sizes.  pcpu_base_addr
  	 * points to the beginning of the first chunk including the
  	 * static region.  Assumes good intent as the first chunk may
  	 * not be full (ie. < pcpu_unit_pages in size).
9983b6f0c   Tejun Heo   percpu: fix first...
1999
  	 */
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
2000
2001
2002
2003
  	first_low = (unsigned long)pcpu_base_addr +
  		    pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
  	first_high = (unsigned long)pcpu_base_addr +
  		     pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
a855b84c3   Tejun Heo   percpu: fix chunk...
2004
2005
  	if ((unsigned long)addr >= first_low &&
  	    (unsigned long)addr < first_high) {
9983b6f0c   Tejun Heo   percpu: fix first...
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
  		for_each_possible_cpu(cpu) {
  			void *start = per_cpu_ptr(base, cpu);
  
  			if (addr >= start && addr < start + pcpu_unit_size) {
  				in_first_chunk = true;
  				break;
  			}
  		}
  	}
  
  	if (in_first_chunk) {
eac522ef4   David Howells   NOMMU: percpu sho...
2017
  		if (!is_vmalloc_addr(addr))
020ec6537   Tejun Heo   percpu: factor ou...
2018
2019
  			return __pa(addr);
  		else
9f57bd4d6   Eugene Surovegin   percpu: fix per_c...
2020
2021
  			return page_to_phys(vmalloc_to_page(addr)) +
  			       offset_in_page(addr);
020ec6537   Tejun Heo   percpu: factor ou...
2022
  	} else
9f57bd4d6   Eugene Surovegin   percpu: fix per_c...
2023
2024
  		return page_to_phys(pcpu_addr_to_page(addr)) +
  		       offset_in_page(addr);
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
2025
  }
fbf59bc9d   Tejun Heo   percpu: implement...
2026
  /**
fd1e8a1fe   Tejun Heo   percpu: introduce...
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
   * pcpu_alloc_alloc_info - allocate percpu allocation info
   * @nr_groups: the number of groups
   * @nr_units: the number of units
   *
   * Allocate ai which is large enough for @nr_groups groups containing
   * @nr_units units.  The returned ai's groups[0].cpu_map points to the
   * cpu_map array which is long enough for @nr_units and filled with
   * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
   * pointer of other groups.
   *
   * RETURNS:
   * Pointer to the allocated pcpu_alloc_info on success, NULL on
   * failure.
   */
  struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
  						      int nr_units)
  {
  	struct pcpu_alloc_info *ai;
  	size_t base_size, ai_size;
  	int unit = 0;

  	/* header plus groups[], padded so the cpu_map array is aligned */
  	base_size = ALIGN(struct_size(ai, groups, nr_groups),
  			  __alignof__(ai->groups[0].cpu_map[0]));
  	/* the cpu_map array for @nr_units follows the padded header */
  	ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);

  	ai = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
  	if (!ai)
  		return NULL;

  	/* group 0 owns the whole map; every entry starts out as NR_CPUS */
  	ai->groups[0].cpu_map = (void *)ai + base_size;
  	while (unit < nr_units)
  		ai->groups[0].cpu_map[unit++] = NR_CPUS;

  	ai->nr_groups = nr_groups;
  	/* remember the allocated size for pcpu_free_alloc_info() */
  	ai->__ai_size = PFN_ALIGN(ai_size);

  	return ai;
  }
  
  /**
   * pcpu_free_alloc_info - free percpu allocation info
   * @ai: pcpu_alloc_info to free
   *
   * Free @ai which was allocated by pcpu_alloc_alloc_info().
   */
  void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
  {
  	/* physical address of @ai; the length comes from @ai->__ai_size */
  	const phys_addr_t ai_phys = __pa(ai);

  	memblock_free_early(ai_phys, ai->__ai_size);
  }
  
  /**
fd1e8a1fe   Tejun Heo   percpu: introduce...
2080
2081
2082
2083
2084
2085
2086
2087
   * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
   * @lvl: loglevel
   * @ai: allocation info to dump
   *
   * Print out information about @ai using loglevel @lvl.
   */
  static void pcpu_dump_alloc_info(const char *lvl,
  				 const struct pcpu_alloc_info *ai)
033e48fb8   Tejun Heo   percpu: move pcpu...
2088
  {
fd1e8a1fe   Tejun Heo   percpu: introduce...
2089
  	int group_width = 1, cpu_width = 1, width;
033e48fb8   Tejun Heo   percpu: move pcpu...
2090
  	char empty_str[] = "--------";
fd1e8a1fe   Tejun Heo   percpu: introduce...
2091
2092
2093
2094
2095
2096
2097
  	int alloc = 0, alloc_end = 0;
  	int group, v;
  	int upa, apl;	/* units per alloc, allocs per line */
  
  	v = ai->nr_groups;
  	while (v /= 10)
  		group_width++;
033e48fb8   Tejun Heo   percpu: move pcpu...
2098

fd1e8a1fe   Tejun Heo   percpu: introduce...
2099
  	v = num_possible_cpus();
033e48fb8   Tejun Heo   percpu: move pcpu...
2100
  	while (v /= 10)
fd1e8a1fe   Tejun Heo   percpu: introduce...
2101
2102
  		cpu_width++;
  	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
033e48fb8   Tejun Heo   percpu: move pcpu...
2103

fd1e8a1fe   Tejun Heo   percpu: introduce...
2104
2105
2106
  	upa = ai->alloc_size / ai->unit_size;
  	width = upa * (cpu_width + 1) + group_width + 3;
  	apl = rounddown_pow_of_two(max(60 / width, 1));
033e48fb8   Tejun Heo   percpu: move pcpu...
2107

fd1e8a1fe   Tejun Heo   percpu: introduce...
2108
2109
2110
  	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
  	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
  	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
033e48fb8   Tejun Heo   percpu: move pcpu...
2111

fd1e8a1fe   Tejun Heo   percpu: introduce...
2112
2113
2114
2115
2116
2117
2118
2119
  	for (group = 0; group < ai->nr_groups; group++) {
  		const struct pcpu_group_info *gi = &ai->groups[group];
  		int unit = 0, unit_end = 0;
  
  		BUG_ON(gi->nr_units % upa);
  		for (alloc_end += gi->nr_units / upa;
  		     alloc < alloc_end; alloc++) {
  			if (!(alloc % apl)) {
1170532bb   Joe Perches   mm: convert print...
2120
2121
  				pr_cont("
  ");
fd1e8a1fe   Tejun Heo   percpu: introduce...
2122
2123
  				printk("%spcpu-alloc: ", lvl);
  			}
1170532bb   Joe Perches   mm: convert print...
2124
  			pr_cont("[%0*d] ", group_width, group);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2125
2126
2127
  
  			for (unit_end += upa; unit < unit_end; unit++)
  				if (gi->cpu_map[unit] != NR_CPUS)
1170532bb   Joe Perches   mm: convert print...
2128
2129
  					pr_cont("%0*d ",
  						cpu_width, gi->cpu_map[unit]);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2130
  				else
1170532bb   Joe Perches   mm: convert print...
2131
  					pr_cont("%s ", empty_str);
033e48fb8   Tejun Heo   percpu: move pcpu...
2132
  		}
033e48fb8   Tejun Heo   percpu: move pcpu...
2133
  	}
1170532bb   Joe Perches   mm: convert print...
2134
2135
  	pr_cont("
  ");
033e48fb8   Tejun Heo   percpu: move pcpu...
2136
  }
033e48fb8   Tejun Heo   percpu: move pcpu...
2137

fbf59bc9d   Tejun Heo   percpu: implement...
2138
  /**
8d408b4be   Tejun Heo   percpu: give more...
2139
   * pcpu_setup_first_chunk - initialize the first percpu chunk
fd1e8a1fe   Tejun Heo   percpu: introduce...
2140
   * @ai: pcpu_alloc_info describing how the percpu area is shaped
38a6be525   Tejun Heo   percpu: simplify ...
2141
   * @base_addr: mapped address
8d408b4be   Tejun Heo   percpu: give more...
2142
2143
   *
   * Initialize the first percpu chunk which contains the kernel static
69ab285b6   Christophe JAILLET   percpu: fix typo ...
2144
   * percpu area.  This function is to be called from arch percpu area
38a6be525   Tejun Heo   percpu: simplify ...
2145
   * setup path.
8d408b4be   Tejun Heo   percpu: give more...
2146
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
2147
2148
2149
2150
2151
2152
   * @ai contains all information necessary to initialize the first
   * chunk and prime the dynamic percpu allocator.
   *
   * @ai->static_size is the size of static percpu area.
   *
   * @ai->reserved_size, if non-zero, specifies the amount of bytes to
edcb46399   Tejun Heo   percpu, module: i...
2153
2154
2155
2156
2157
2158
2159
   * reserve after the static area in the first chunk.  This reserves
   * the first chunk such that it's available only through reserved
   * percpu allocation.  This is primarily used to serve module percpu
   * static areas on architectures where the addressing model has
   * limited offset range for symbol relocations to guarantee module
   * percpu symbols fall inside the relocatable range.
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
2160
2161
2162
   * @ai->dyn_size determines the number of bytes available for dynamic
   * allocation in the first chunk.  The area between @ai->static_size +
   * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
6074d5b0a   Tejun Heo   percpu: more flex...
2163
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
2164
2165
2166
   * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
   * and equal to or larger than @ai->static_size + @ai->reserved_size +
   * @ai->dyn_size.
8d408b4be   Tejun Heo   percpu: give more...
2167
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
2168
2169
   * @ai->atom_size is the allocation atom size and used as alignment
   * for vm areas.
8d408b4be   Tejun Heo   percpu: give more...
2170
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
2171
2172
2173
2174
2175
2176
2177
2178
2179
   * @ai->alloc_size is the allocation size and always multiple of
   * @ai->atom_size.  This is larger than @ai->atom_size if
   * @ai->unit_size is larger than @ai->atom_size.
   *
   * @ai->nr_groups and @ai->groups describe virtual memory layout of
   * percpu areas.  Units which should be colocated are put into the
   * same group.  Dynamic VM areas will be allocated according to these
   * groupings.  If @ai->nr_groups is zero, a single group containing
   * all units is assumed.
8d408b4be   Tejun Heo   percpu: give more...
2180
   *
38a6be525   Tejun Heo   percpu: simplify ...
2181
2182
   * The caller should have mapped the first chunk at @base_addr and
   * copied static data to each unit.
fbf59bc9d   Tejun Heo   percpu: implement...
2183
   *
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
2184
2185
2186
2187
2188
2189
2190
   * The first chunk will always contain a static and a dynamic region.
   * However, the static region is not managed by any chunk.  If the first
   * chunk also contains a reserved region, it is served by two chunks -
   * one for the reserved region and one for the dynamic region.  They
   * share the same vm, but use offset regions in the area allocation map.
   * The chunk serving the dynamic region is circulated in the chunk slots
   * and available for dynamic allocation like any other chunk.
fbf59bc9d   Tejun Heo   percpu: implement...
2191
   */
163fa2343   Kefeng Wang   percpu: Make pcpu...
2192
2193
  void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
  				   void *base_addr)
fbf59bc9d   Tejun Heo   percpu: implement...
2194
  {
b9c39442c   Dennis Zhou (Facebook)   percpu: setup_fir...
2195
  	size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2196
  	size_t static_size, dyn_size;
0c4169c3d   Dennis Zhou (Facebook)   percpu: setup_fir...
2197
  	struct pcpu_chunk *chunk;
6563297ce   Tejun Heo   percpu: use group...
2198
2199
  	unsigned long *group_offsets;
  	size_t *group_sizes;
fb435d523   Tejun Heo   percpu: add pcpu_...
2200
  	unsigned long *unit_off;
fbf59bc9d   Tejun Heo   percpu: implement...
2201
  	unsigned int cpu;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2202
2203
  	int *unit_map;
  	int group, unit, i;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
2204
2205
  	int map_size;
  	unsigned long tmp_addr;
f655f4053   Mike Rapoport   mm/percpu: add ch...
2206
  	size_t alloc_size;
fbf59bc9d   Tejun Heo   percpu: implement...
2207

635b75fc1   Tejun Heo   percpu: make pcpu...
2208
2209
  #define PCPU_SETUP_BUG_ON(cond)	do {					\
  	if (unlikely(cond)) {						\
870d4b12a   Joe Perches   mm: percpu: use p...
2210
2211
2212
2213
  		pr_emerg("failed to initialize, %s
  ", #cond);		\
  		pr_emerg("cpu_possible_mask=%*pb
  ",			\
807de073b   Tejun Heo   percpu: use %*pb[...
2214
  			 cpumask_pr_args(cpu_possible_mask));		\
635b75fc1   Tejun Heo   percpu: make pcpu...
2215
2216
2217
2218
  		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
  		BUG();							\
  	}								\
  } while (0)
2f39e637e   Tejun Heo   percpu: allow non...
2219
  	/* sanity checks */
635b75fc1   Tejun Heo   percpu: make pcpu...
2220
  	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
bbddff054   Tejun Heo   percpu: use percp...
2221
  #ifdef CONFIG_SMP
635b75fc1   Tejun Heo   percpu: make pcpu...
2222
  	PCPU_SETUP_BUG_ON(!ai->static_size);
f09f1243c   Alexander Kuleshov   mm/percpu: use of...
2223
  	PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
bbddff054   Tejun Heo   percpu: use percp...
2224
  #endif
635b75fc1   Tejun Heo   percpu: make pcpu...
2225
  	PCPU_SETUP_BUG_ON(!base_addr);
f09f1243c   Alexander Kuleshov   mm/percpu: use of...
2226
  	PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
635b75fc1   Tejun Heo   percpu: make pcpu...
2227
  	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
f09f1243c   Alexander Kuleshov   mm/percpu: use of...
2228
  	PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
635b75fc1   Tejun Heo   percpu: make pcpu...
2229
  	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
2230
  	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
099a19d91   Tejun Heo   percpu: allow lim...
2231
  	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
fb29a2cc6   Dennis Zhou (Facebook)   percpu: setup_fir...
2232
  	PCPU_SETUP_BUG_ON(!ai->dyn_size);
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2233
  	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
2234
2235
  	PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
  			    IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
9f6455325   Tejun Heo   percpu: move vmal...
2236
  	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
8d408b4be   Tejun Heo   percpu: give more...
2237

6563297ce   Tejun Heo   percpu: use group...
2238
  	/* process group information and build config tables accordingly */
f655f4053   Mike Rapoport   mm/percpu: add ch...
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
  	alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
  	group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!group_offsets)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
  
  	alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
  	group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!group_sizes)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
  
  	alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
  	unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!unit_map)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
  
  	alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
  	unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!unit_off)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
2f39e637e   Tejun Heo   percpu: allow non...
2266

fd1e8a1fe   Tejun Heo   percpu: introduce...
2267
  	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
ffe0d5a57   Tejun Heo   percpu: fix unit_...
2268
  		unit_map[cpu] = UINT_MAX;
a855b84c3   Tejun Heo   percpu: fix chunk...
2269
2270
2271
  
  	pcpu_low_unit_cpu = NR_CPUS;
  	pcpu_high_unit_cpu = NR_CPUS;
2f39e637e   Tejun Heo   percpu: allow non...
2272

fd1e8a1fe   Tejun Heo   percpu: introduce...
2273
2274
  	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
  		const struct pcpu_group_info *gi = &ai->groups[group];
2f39e637e   Tejun Heo   percpu: allow non...
2275

6563297ce   Tejun Heo   percpu: use group...
2276
2277
  		group_offsets[group] = gi->base_offset;
  		group_sizes[group] = gi->nr_units * ai->unit_size;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2278
2279
2280
2281
  		for (i = 0; i < gi->nr_units; i++) {
  			cpu = gi->cpu_map[i];
  			if (cpu == NR_CPUS)
  				continue;
8d408b4be   Tejun Heo   percpu: give more...
2282

9f295664e   Dan Carpenter   percpu: off by on...
2283
  			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
635b75fc1   Tejun Heo   percpu: make pcpu...
2284
2285
  			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
  			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
fbf59bc9d   Tejun Heo   percpu: implement...
2286

fd1e8a1fe   Tejun Heo   percpu: introduce...
2287
  			unit_map[cpu] = unit + i;
fb435d523   Tejun Heo   percpu: add pcpu_...
2288
  			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
a855b84c3   Tejun Heo   percpu: fix chunk...
2289
2290
2291
2292
2293
2294
2295
  			/* determine low/high unit_cpu */
  			if (pcpu_low_unit_cpu == NR_CPUS ||
  			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
  				pcpu_low_unit_cpu = cpu;
  			if (pcpu_high_unit_cpu == NR_CPUS ||
  			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
  				pcpu_high_unit_cpu = cpu;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2296
  		}
2f39e637e   Tejun Heo   percpu: allow non...
2297
  	}
fd1e8a1fe   Tejun Heo   percpu: introduce...
2298
2299
2300
  	pcpu_nr_units = unit;
  
  	for_each_possible_cpu(cpu)
635b75fc1   Tejun Heo   percpu: make pcpu...
2301
2302
2303
2304
  		PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
  
  	/* we're done parsing the input, undefine BUG macro and dump config */
  #undef PCPU_SETUP_BUG_ON
bcbea798f   Tejun Heo   percpu: print out...
2305
  	pcpu_dump_alloc_info(KERN_DEBUG, ai);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2306

6563297ce   Tejun Heo   percpu: use group...
2307
2308
2309
  	pcpu_nr_groups = ai->nr_groups;
  	pcpu_group_offsets = group_offsets;
  	pcpu_group_sizes = group_sizes;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2310
  	pcpu_unit_map = unit_map;
fb435d523   Tejun Heo   percpu: add pcpu_...
2311
  	pcpu_unit_offsets = unit_off;
2f39e637e   Tejun Heo   percpu: allow non...
2312
2313
  
  	/* determine basic parameters */
fd1e8a1fe   Tejun Heo   percpu: introduce...
2314
  	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
d9b55eeb1   Tejun Heo   percpu: remove un...
2315
  	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
6563297ce   Tejun Heo   percpu: use group...
2316
  	pcpu_atom_size = ai->atom_size;
ce3141a27   Tejun Heo   percpu: drop pcpu...
2317
2318
  	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
  		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
cafe8816b   Tejun Heo   percpu: use negat...
2319

30a5b5367   Dennis Zhou   percpu: expose st...
2320
  	pcpu_stats_save_ai(ai);
d9b55eeb1   Tejun Heo   percpu: remove un...
2321
2322
2323
2324
2325
  	/*
  	 * Allocate chunk slots.  The additional last slot is for
  	 * empty chunks.
  	 */
  	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
7e1c4e279   Mike Rapoport   memblock: stop us...
2326
2327
  	pcpu_slot = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]),
  				   SMP_CACHE_BYTES);
f655f4053   Mike Rapoport   mm/percpu: add ch...
2328
2329
2330
2331
  	if (!pcpu_slot)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      pcpu_nr_slots * sizeof(pcpu_slot[0]));
fbf59bc9d   Tejun Heo   percpu: implement...
2332
2333
  	for (i = 0; i < pcpu_nr_slots; i++)
  		INIT_LIST_HEAD(&pcpu_slot[i]);
edcb46399   Tejun Heo   percpu, module: i...
2334
  	/*
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
  	 * The end of the static region needs to be aligned with the
  	 * minimum allocation size as this offsets the reserved and
  	 * dynamic region.  The first chunk ends page aligned by
  	 * expanding the dynamic region, therefore the dynamic region
  	 * can be shrunk to compensate while still staying above the
  	 * configured sizes.
  	 */
  	static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
  	dyn_size = ai->dyn_size - (static_size - ai->static_size);
  
  	/*
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
2346
2347
2348
2349
2350
2351
  	 * Initialize first chunk.
  	 * If the reserved_size is non-zero, this initializes the reserved
  	 * chunk.  If the reserved_size is zero, the reserved chunk is NULL
  	 * and the dynamic region is initialized here.  The first chunk,
  	 * pcpu_first_chunk, will always point to the chunk that serves
  	 * the dynamic region.
edcb46399   Tejun Heo   percpu, module: i...
2352
  	 */
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2353
2354
  	tmp_addr = (unsigned long)base_addr + static_size;
  	map_size = ai->reserved_size ?: dyn_size;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
2355
  	chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
61ace7fa2   Tejun Heo   percpu: improve f...
2356

edcb46399   Tejun Heo   percpu, module: i...
2357
  	/* init dynamic chunk if necessary */
b9c39442c   Dennis Zhou (Facebook)   percpu: setup_fir...
2358
  	if (ai->reserved_size) {
0c4169c3d   Dennis Zhou (Facebook)   percpu: setup_fir...
2359
  		pcpu_reserved_chunk = chunk;
b9c39442c   Dennis Zhou (Facebook)   percpu: setup_fir...
2360

d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2361
  		tmp_addr = (unsigned long)base_addr + static_size +
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
2362
  			   ai->reserved_size;
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2363
  		map_size = dyn_size;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
2364
  		chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
edcb46399   Tejun Heo   percpu, module: i...
2365
  	}
2441d15c9   Tejun Heo   percpu: cosmetic ...
2366
  	/* link the first chunk in */
0c4169c3d   Dennis Zhou (Facebook)   percpu: setup_fir...
2367
  	pcpu_first_chunk = chunk;
0cecf50cf   Dennis Zhou (Facebook)   percpu: introduce...
2368
  	pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
ae9e6bc9f   Tejun Heo   percpu: don't put...
2369
  	pcpu_chunk_relocate(pcpu_first_chunk, -1);
fbf59bc9d   Tejun Heo   percpu: implement...
2370

7e8a6304d   Dennis Zhou (Facebook)   /proc/meminfo: ad...
2371
2372
  	/* include all regions of the first chunk */
  	pcpu_nr_populated += PFN_DOWN(size_sum);
30a5b5367   Dennis Zhou   percpu: expose st...
2373
  	pcpu_stats_chunk_alloc();
df95e795a   Dennis Zhou   percpu: add trace...
2374
  	trace_percpu_create_chunk(base_addr);
30a5b5367   Dennis Zhou   percpu: expose st...
2375

fbf59bc9d   Tejun Heo   percpu: implement...
2376
  	/* we're done */
bba174f5e   Tejun Heo   percpu: add chunk...
2377
  	pcpu_base_addr = base_addr;
fbf59bc9d   Tejun Heo   percpu: implement...
2378
  }
66c3a7577   Tejun Heo   percpu: generaliz...
2379

bbddff054   Tejun Heo   percpu: use percp...
2380
  #ifdef CONFIG_SMP
17f3609c2   Andi Kleen   sections: fix sec...
2381
  /*
   * Human readable names of the first chunk allocators, indexed by
   * enum pcpu_fc.  The "embed" and "page" strings are the same ones
   * percpu_alloc_setup() compares its argument against.
   */
  const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
  	[PCPU_FC_AUTO] = "auto",
  	[PCPU_FC_EMBED] = "embed",
  	[PCPU_FC_PAGE] = "page",
  };
66c3a7577   Tejun Heo   percpu: generaliz...
2386

f58dc01ba   Tejun Heo   percpu: generaliz...
2387
  /*
   * First chunk allocator to use.  Starts out as PCPU_FC_AUTO and is
   * overwritten by percpu_alloc_setup() when a recognized name is passed
   * through the "percpu_alloc" early parameter.
   */
  enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
66c3a7577   Tejun Heo   percpu: generaliz...
2388

f58dc01ba   Tejun Heo   percpu: generaliz...
2389
2390
  static int __init percpu_alloc_setup(char *str)
  {
5479c78ac   Cyrill Gorcunov   mm, percpu: Make ...
2391
2392
  	if (!str)
  		return -EINVAL;
f58dc01ba   Tejun Heo   percpu: generaliz...
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
  	if (0)
  		/* nada */;
  #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
  	else if (!strcmp(str, "embed"))
  		pcpu_chosen_fc = PCPU_FC_EMBED;
  #endif
  #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
  	else if (!strcmp(str, "page"))
  		pcpu_chosen_fc = PCPU_FC_PAGE;
  #endif
f58dc01ba   Tejun Heo   percpu: generaliz...
2403
  	else
870d4b12a   Joe Perches   mm: percpu: use p...
2404
2405
  		pr_warn("unknown allocator %s specified
  ", str);
66c3a7577   Tejun Heo   percpu: generaliz...
2406

f58dc01ba   Tejun Heo   percpu: generaliz...
2407
  	return 0;
66c3a7577   Tejun Heo   percpu: generaliz...
2408
  }
f58dc01ba   Tejun Heo   percpu: generaliz...
2409
  early_param("percpu_alloc", percpu_alloc_setup);
66c3a7577   Tejun Heo   percpu: generaliz...
2410

3c9a024fd   Tejun Heo   percpu: fix build...
2411
2412
2413
2414
2415
  /*
   * The generic percpu setup relies on pcpu_embed_first_chunk(), so
   * build it whenever the arch does not provide its own
   * setup_per_cpu_areas() or explicitly asks for the embed allocator.
   */
  #if !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) || \
  	defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK)
  #define BUILD_EMBED_FIRST_CHUNK
  #endif

  /* pcpu_page_first_chunk() is built iff the arch config needs it */
  #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
  #define BUILD_PAGE_FIRST_CHUNK
  #endif
  
  /* pcpu_build_alloc_info() is used by both embed and page first chunk */
  #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
  /**
   * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
   * @reserved_size: the size of reserved percpu area in bytes
   * @dyn_size: minimum free size for dynamic allocation in bytes
   * @atom_size: allocation atom size
   * @cpu_distance_fn: callback to determine distance between cpus, optional
   *
   * This function determines grouping of units, their mappings to cpus
   * and other parameters considering needed percpu size, allocation
   * atom size and distances between CPUs.
   *
   * Groups are always multiples of atom size and CPUs which are of
   * LOCAL_DISTANCE both ways are grouped together and share space for
   * units in the same group.  The returned configuration is guaranteed
   * to have CPUs on different nodes on different groups and >=75% usage
   * of allocated virtual address space.
   *
   * When @cpu_distance_fn is NULL no distance comparison is made and
   * all possible CPUs end up in group 0.
   *
   * RETURNS:
   * On success, pointer to the new allocation_info is returned.  On
   * failure, ERR_PTR value is returned.
   */
  static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
  				size_t reserved_size, size_t dyn_size,
  				size_t atom_size,
  				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
  {
  	static int group_map[NR_CPUS] __initdata;
  	static int group_cnt[NR_CPUS] __initdata;
  	const size_t static_size = __per_cpu_end - __per_cpu_start;
  	int nr_groups = 1, nr_units = 0;
  	size_t size_sum, min_unit_size, alloc_size;
  	int upa, max_upa, best_upa;	/* units_per_alloc */
  	int last_allocs, group, unit;
  	unsigned int cpu, tcpu;
  	struct pcpu_alloc_info *ai;
  	unsigned int *cpu_map;

  	/* this function may be called multiple times */
  	memset(group_map, 0, sizeof(group_map));
  	memset(group_cnt, 0, sizeof(group_cnt));

  	/* calculate size_sum and ensure dyn_size is enough for early alloc */
  	size_sum = PFN_ALIGN(static_size + reserved_size +
  			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
  	dyn_size = size_sum - static_size - reserved_size;

  	/*
  	 * Determine min_unit_size, alloc_size and max_upa such that
  	 * alloc_size is multiple of atom_size and is the smallest
  	 * which can accommodate 4k aligned segments which are equal to
  	 * or larger than min_unit_size.
  	 */
  	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

  	/* determine the maximum # of units that can fit in an allocation */
  	alloc_size = roundup(min_unit_size, atom_size);
  	upa = alloc_size / min_unit_size;
  	while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
  		upa--;
  	max_upa = upa;

  	/*
  	 * Group cpus according to their proximity.  A cpu is compared
  	 * against every cpu that precedes it in the possible mask; if it
  	 * is not local to a member of the candidate group, the next
  	 * group is tried and the scan restarts.
  	 */
  	for_each_possible_cpu(cpu) {
  		group = 0;
  	next_group:
  		for_each_possible_cpu(tcpu) {
  			if (cpu == tcpu)
  				break;
  			if (group_map[tcpu] == group && cpu_distance_fn &&
  			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
  			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
  				group++;
  				nr_groups = max(nr_groups, group + 1);
  				goto next_group;
  			}
  		}
  		group_map[cpu] = group;
  		group_cnt[group]++;
  	}

  	/*
  	 * Wasted space is caused by a ratio imbalance of upa to group_cnt.
  	 * Expand the unit_size until we use >= 75% of the units allocated.
  	 * Related to atom_size, which could be much larger than the unit_size.
  	 */
  	last_allocs = INT_MAX;
  	best_upa = 0;
  	for (upa = max_upa; upa; upa--) {
  		int allocs = 0, wasted = 0;

  		if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
  			continue;

  		for (group = 0; group < nr_groups; group++) {
  			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
  			allocs += this_allocs;
  			wasted += this_allocs * upa - group_cnt[group];
  		}

  		/*
  		 * Don't accept if wastage is over 1/3.  The
  		 * greater-than comparison ensures upa==1 always
  		 * passes the following check.
  		 */
  		if (wasted > num_possible_cpus() / 3)
  			continue;

  		/* and then don't consume more memory */
  		if (allocs > last_allocs)
  			break;
  		last_allocs = allocs;
  		best_upa = upa;
  	}
  	/*
  	 * The loop above is expected to always pick a value (see the
  	 * upa==1 note).  Check it explicitly rather than hiding a
  	 * potential uninitialized use: a zero upa would otherwise be
  	 * used as a divisor by roundup() below.
  	 */
  	BUG_ON(!best_upa);
  	upa = best_upa;

  	/* allocate and fill alloc_info */
  	for (group = 0; group < nr_groups; group++)
  		nr_units += roundup(group_cnt[group], upa);

  	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
  	if (!ai)
  		return ERR_PTR(-ENOMEM);
  	cpu_map = ai->groups[0].cpu_map;

  	/* carve the single cpu_map array into per-group slices */
  	for (group = 0; group < nr_groups; group++) {
  		ai->groups[group].cpu_map = cpu_map;
  		cpu_map += roundup(group_cnt[group], upa);
  	}

  	ai->static_size = static_size;
  	ai->reserved_size = reserved_size;
  	ai->dyn_size = dyn_size;
  	ai->unit_size = alloc_size / upa;
  	ai->atom_size = atom_size;
  	ai->alloc_size = alloc_size;

  	for (group = 0, unit = 0; group < nr_groups; group++) {
  		struct pcpu_group_info *gi = &ai->groups[group];

  		/*
  		 * Initialize base_offset as if all groups are located
  		 * back-to-back.  The caller should update this to
  		 * reflect actual allocation.
  		 */
  		gi->base_offset = unit * ai->unit_size;

  		for_each_possible_cpu(cpu)
  			if (group_map[cpu] == group)
  				gi->cpu_map[gi->nr_units++] = cpu;
  		/* pad the group to a whole number of allocations */
  		gi->nr_units = roundup(gi->nr_units, upa);
  		unit += gi->nr_units;
  	}
  	BUG_ON(unit != nr_units);

  	return ai;
  }
  #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
  
  #if defined(BUILD_EMBED_FIRST_CHUNK)
66c3a7577   Tejun Heo   percpu: generaliz...
2583
2584
  /**
   * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
66c3a7577   Tejun Heo   percpu: generaliz...
2585
   * @reserved_size: the size of reserved percpu area in bytes
4ba6ce250   Tejun Heo   percpu: make @dyn...
2586
   * @dyn_size: minimum free size for dynamic allocation in bytes
c8826dd53   Tejun Heo   percpu: update em...
2587
2588
2589
   * @atom_size: allocation atom size
   * @cpu_distance_fn: callback to determine distance between cpus, optional
   * @alloc_fn: function to allocate percpu page
25985edce   Lucas De Marchi   Fix common misspe...
2590
   * @free_fn: function to free percpu page
66c3a7577   Tejun Heo   percpu: generaliz...
2591
2592
2593
2594
2595
   *
   * This is a helper to ease setting up embedded first percpu chunk and
   * can be called where pcpu_setup_first_chunk() is expected.
   *
   * If this function is used to setup the first chunk, it is allocated
c8826dd53   Tejun Heo   percpu: update em...
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
   * by calling @alloc_fn and used as-is without being mapped into
   * vmalloc area.  Allocations are always whole multiples of @atom_size
   * aligned to @atom_size.
   *
   * This enables the first chunk to piggy back on the linear physical
   * mapping which often uses larger page size.  Please note that this
   * can result in very sparse cpu->unit mapping on NUMA machines thus
   * requiring large vmalloc address space.  Don't use this allocator if
   * vmalloc space is not orders of magnitude larger than distances
   * between node memory addresses (ie. 32bit NUMA machines).
66c3a7577   Tejun Heo   percpu: generaliz...
2606
   *
4ba6ce250   Tejun Heo   percpu: make @dyn...
2607
   * @dyn_size specifies the minimum dynamic area size.
66c3a7577   Tejun Heo   percpu: generaliz...
2608
2609
   *
   * If the needed size is smaller than the minimum or specified unit
c8826dd53   Tejun Heo   percpu: update em...
2610
   * size, the leftover is returned using @free_fn.
66c3a7577   Tejun Heo   percpu: generaliz...
2611
2612
   *
   * RETURNS:
fb435d523   Tejun Heo   percpu: add pcpu_...
2613
   * 0 on success, -errno on failure.
66c3a7577   Tejun Heo   percpu: generaliz...
2614
   */
4ba6ce250   Tejun Heo   percpu: make @dyn...
2615
  int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
c8826dd53   Tejun Heo   percpu: update em...
2616
2617
2618
2619
  				  size_t atom_size,
  				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
  				  pcpu_fc_alloc_fn_t alloc_fn,
  				  pcpu_fc_free_fn_t free_fn)
66c3a7577   Tejun Heo   percpu: generaliz...
2620
  {
c8826dd53   Tejun Heo   percpu: update em...
2621
2622
  	void *base = (void *)ULONG_MAX;
  	void **areas = NULL;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2623
  	struct pcpu_alloc_info *ai;
93c76b6b2   zijun_hu   mm/percpu.c: corr...
2624
2625
  	size_t size_sum, areas_size;
  	unsigned long max_distance;
163fa2343   Kefeng Wang   percpu: Make pcpu...
2626
  	int group, i, highest_group, rc = 0;
66c3a7577   Tejun Heo   percpu: generaliz...
2627

c8826dd53   Tejun Heo   percpu: update em...
2628
2629
  	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
  				   cpu_distance_fn);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2630
2631
  	if (IS_ERR(ai))
  		return PTR_ERR(ai);
66c3a7577   Tejun Heo   percpu: generaliz...
2632

fd1e8a1fe   Tejun Heo   percpu: introduce...
2633
  	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
c8826dd53   Tejun Heo   percpu: update em...
2634
  	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
fa8a7094b   Tejun Heo   x86: implement pe...
2635

26fb3dae0   Mike Rapoport   memblock: drop me...
2636
  	areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
c8826dd53   Tejun Heo   percpu: update em...
2637
  	if (!areas) {
fb435d523   Tejun Heo   percpu: add pcpu_...
2638
  		rc = -ENOMEM;
c8826dd53   Tejun Heo   percpu: update em...
2639
  		goto out_free;
fa8a7094b   Tejun Heo   x86: implement pe...
2640
  	}
66c3a7577   Tejun Heo   percpu: generaliz...
2641

9b7396624   zijun_hu   mm/percpu.c: fix ...
2642
2643
  	/* allocate, copy and determine base address & max_distance */
  	highest_group = 0;
c8826dd53   Tejun Heo   percpu: update em...
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
  	for (group = 0; group < ai->nr_groups; group++) {
  		struct pcpu_group_info *gi = &ai->groups[group];
  		unsigned int cpu = NR_CPUS;
  		void *ptr;
  
  		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
  			cpu = gi->cpu_map[i];
  		BUG_ON(cpu == NR_CPUS);
  
  		/* allocate space for the whole group */
  		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
  		if (!ptr) {
  			rc = -ENOMEM;
  			goto out_free_areas;
  		}
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
2659
2660
  		/* kmemleak tracks the percpu allocations separately */
  		kmemleak_free(ptr);
c8826dd53   Tejun Heo   percpu: update em...
2661
  		areas[group] = ptr;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2662

c8826dd53   Tejun Heo   percpu: update em...
2663
  		base = min(ptr, base);
9b7396624   zijun_hu   mm/percpu.c: fix ...
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
  		if (ptr > areas[highest_group])
  			highest_group = group;
  	}
  	max_distance = areas[highest_group] - base;
  	max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
  
  	/* warn if maximum distance is further than 75% of vmalloc space */
  	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
  		pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx
  ",
  				max_distance, VMALLOC_TOTAL);
  #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
  		/* and fail if we have fallback */
  		rc = -EINVAL;
  		goto out_free_areas;
  #endif
42b642814   Tejun Heo   percpu: pcpu_embe...
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
  	}
  
  	/*
  	 * Copy data and free unused parts.  This should happen after all
  	 * allocations are complete; otherwise, we may end up with
  	 * overlapping groups.
  	 */
  	for (group = 0; group < ai->nr_groups; group++) {
  		struct pcpu_group_info *gi = &ai->groups[group];
  		void *ptr = areas[group];
c8826dd53   Tejun Heo   percpu: update em...
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
  
  		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
  			if (gi->cpu_map[i] == NR_CPUS) {
  				/* unused unit, free whole */
  				free_fn(ptr, ai->unit_size);
  				continue;
  			}
  			/* copy and return the unused part */
  			memcpy(ptr, __per_cpu_load, ai->static_size);
  			free_fn(ptr + size_sum, ai->unit_size - size_sum);
  		}
fa8a7094b   Tejun Heo   x86: implement pe...
2701
  	}
66c3a7577   Tejun Heo   percpu: generaliz...
2702

c8826dd53   Tejun Heo   percpu: update em...
2703
  	/* base address is now known, determine group base offsets */
6ea529a20   Tejun Heo   percpu: make embe...
2704
  	for (group = 0; group < ai->nr_groups; group++) {
c8826dd53   Tejun Heo   percpu: update em...
2705
  		ai->groups[group].base_offset = areas[group] - base;
6ea529a20   Tejun Heo   percpu: make embe...
2706
  	}
c8826dd53   Tejun Heo   percpu: update em...
2707

00206a69e   Matteo Croce   percpu: stop prin...
2708
2709
2710
  	pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu
  ",
  		PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
fd1e8a1fe   Tejun Heo   percpu: introduce...
2711
  		ai->dyn_size, ai->unit_size);
d4b95f803   Tejun Heo   x86,percpu: gener...
2712

163fa2343   Kefeng Wang   percpu: Make pcpu...
2713
  	pcpu_setup_first_chunk(ai, base);
c8826dd53   Tejun Heo   percpu: update em...
2714
2715
2716
2717
  	goto out_free;
  
  out_free_areas:
  	for (group = 0; group < ai->nr_groups; group++)
f851c8d85   Michael Holzheu   percpu: fix bootm...
2718
2719
2720
  		if (areas[group])
  			free_fn(areas[group],
  				ai->groups[group].nr_units * ai->unit_size);
c8826dd53   Tejun Heo   percpu: update em...
2721
  out_free:
fd1e8a1fe   Tejun Heo   percpu: introduce...
2722
  	pcpu_free_alloc_info(ai);
c8826dd53   Tejun Heo   percpu: update em...
2723
  	if (areas)
999c17e3d   Santosh Shilimkar   mm/percpu.c: use ...
2724
  		memblock_free_early(__pa(areas), areas_size);
fb435d523   Tejun Heo   percpu: add pcpu_...
2725
  	return rc;
d4b95f803   Tejun Heo   x86,percpu: gener...
2726
  }
3c9a024fd   Tejun Heo   percpu: fix build...
2727
  #endif /* BUILD_EMBED_FIRST_CHUNK */
d4b95f803   Tejun Heo   x86,percpu: gener...
2728

3c9a024fd   Tejun Heo   percpu: fix build...
2729
  #ifdef BUILD_PAGE_FIRST_CHUNK
d4b95f803   Tejun Heo   x86,percpu: gener...
2730
  /**
00ae4064b   Tejun Heo   percpu: rename 4k...
2731
   * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
d4b95f803   Tejun Heo   x86,percpu: gener...
2732
2733
   * @reserved_size: the size of reserved percpu area in bytes
   * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
25985edce   Lucas De Marchi   Fix common misspe...
2734
   * @free_fn: function to free percpu page, always called with PAGE_SIZE
d4b95f803   Tejun Heo   x86,percpu: gener...
2735
2736
   * @populate_pte_fn: function to populate pte
   *
00ae4064b   Tejun Heo   percpu: rename 4k...
2737
2738
   * This is a helper to ease setting up page-remapped first percpu
   * chunk and can be called where pcpu_setup_first_chunk() is expected.
d4b95f803   Tejun Heo   x86,percpu: gener...
2739
2740
2741
2742
2743
   *
   * This is the basic allocator.  Static percpu area is allocated
   * page-by-page into vmalloc area.
   *
   * RETURNS:
fb435d523   Tejun Heo   percpu: add pcpu_...
2744
   * 0 on success, -errno on failure.
d4b95f803   Tejun Heo   x86,percpu: gener...
2745
   */
fb435d523   Tejun Heo   percpu: add pcpu_...
2746
2747
2748
2749
  int __init pcpu_page_first_chunk(size_t reserved_size,
  				 pcpu_fc_alloc_fn_t alloc_fn,
  				 pcpu_fc_free_fn_t free_fn,
  				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
d4b95f803   Tejun Heo   x86,percpu: gener...
2750
  {
8f05a6a65   Tejun Heo   percpu: make 4k f...
2751
  	static struct vm_struct vm;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2752
  	struct pcpu_alloc_info *ai;
00ae4064b   Tejun Heo   percpu: rename 4k...
2753
  	char psize_str[16];
ce3141a27   Tejun Heo   percpu: drop pcpu...
2754
  	int unit_pages;
d4b95f803   Tejun Heo   x86,percpu: gener...
2755
  	size_t pages_size;
ce3141a27   Tejun Heo   percpu: drop pcpu...
2756
  	struct page **pages;
163fa2343   Kefeng Wang   percpu: Make pcpu...
2757
  	int unit, i, j, rc = 0;
8f6066049   zijun_hu   mm/percpu.c: fix ...
2758
2759
  	int upa;
  	int nr_g0_units;
d4b95f803   Tejun Heo   x86,percpu: gener...
2760

00ae4064b   Tejun Heo   percpu: rename 4k...
2761
  	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
4ba6ce250   Tejun Heo   percpu: make @dyn...
2762
  	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2763
2764
2765
  	if (IS_ERR(ai))
  		return PTR_ERR(ai);
  	BUG_ON(ai->nr_groups != 1);
8f6066049   zijun_hu   mm/percpu.c: fix ...
2766
2767
  	upa = ai->alloc_size/ai->unit_size;
  	nr_g0_units = roundup(num_possible_cpus(), upa);
0b59c25f9   Igor Stoppa   mm: percpu: remov...
2768
  	if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
8f6066049   zijun_hu   mm/percpu.c: fix ...
2769
2770
2771
  		pcpu_free_alloc_info(ai);
  		return -EINVAL;
  	}
fd1e8a1fe   Tejun Heo   percpu: introduce...
2772
2773
  
  	unit_pages = ai->unit_size >> PAGE_SHIFT;
d4b95f803   Tejun Heo   x86,percpu: gener...
2774
2775
  
  	/* unaligned allocations can't be freed, round up to page size */
fd1e8a1fe   Tejun Heo   percpu: introduce...
2776
2777
  	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
  			       sizeof(pages[0]));
7e1c4e279   Mike Rapoport   memblock: stop us...
2778
  	pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
f655f4053   Mike Rapoport   mm/percpu: add ch...
2779
2780
2781
2782
  	if (!pages)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      pages_size);
d4b95f803   Tejun Heo   x86,percpu: gener...
2783

8f05a6a65   Tejun Heo   percpu: make 4k f...
2784
  	/* allocate pages */
d4b95f803   Tejun Heo   x86,percpu: gener...
2785
  	j = 0;
8f6066049   zijun_hu   mm/percpu.c: fix ...
2786
2787
  	for (unit = 0; unit < num_possible_cpus(); unit++) {
  		unsigned int cpu = ai->groups[0].cpu_map[unit];
ce3141a27   Tejun Heo   percpu: drop pcpu...
2788
  		for (i = 0; i < unit_pages; i++) {
d4b95f803   Tejun Heo   x86,percpu: gener...
2789
  			void *ptr;
3cbc85652   Tejun Heo   percpu: add @alig...
2790
  			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
d4b95f803   Tejun Heo   x86,percpu: gener...
2791
  			if (!ptr) {
870d4b12a   Joe Perches   mm: percpu: use p...
2792
2793
  				pr_warn("failed to allocate %s page for cpu%u
  ",
8f6066049   zijun_hu   mm/percpu.c: fix ...
2794
  						psize_str, cpu);
d4b95f803   Tejun Heo   x86,percpu: gener...
2795
2796
  				goto enomem;
  			}
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
2797
2798
  			/* kmemleak tracks the percpu allocations separately */
  			kmemleak_free(ptr);
ce3141a27   Tejun Heo   percpu: drop pcpu...
2799
  			pages[j++] = virt_to_page(ptr);
d4b95f803   Tejun Heo   x86,percpu: gener...
2800
  		}
8f6066049   zijun_hu   mm/percpu.c: fix ...
2801
  	}
d4b95f803   Tejun Heo   x86,percpu: gener...
2802

8f05a6a65   Tejun Heo   percpu: make 4k f...
2803
2804
  	/* allocate vm area, map the pages and copy static data */
  	vm.flags = VM_ALLOC;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2805
  	vm.size = num_possible_cpus() * ai->unit_size;
8f05a6a65   Tejun Heo   percpu: make 4k f...
2806
  	vm_area_register_early(&vm, PAGE_SIZE);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2807
  	for (unit = 0; unit < num_possible_cpus(); unit++) {
1d9d32572   Tejun Heo   percpu: make @dyn...
2808
  		unsigned long unit_addr =
fd1e8a1fe   Tejun Heo   percpu: introduce...
2809
  			(unsigned long)vm.addr + unit * ai->unit_size;
8f05a6a65   Tejun Heo   percpu: make 4k f...
2810

ce3141a27   Tejun Heo   percpu: drop pcpu...
2811
  		for (i = 0; i < unit_pages; i++)
8f05a6a65   Tejun Heo   percpu: make 4k f...
2812
2813
2814
  			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
  
  		/* pte already populated, the following shouldn't fail */
fb435d523   Tejun Heo   percpu: add pcpu_...
2815
2816
2817
2818
2819
  		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
  				      unit_pages);
  		if (rc < 0)
  			panic("failed to map percpu area, err=%d
  ", rc);
66c3a7577   Tejun Heo   percpu: generaliz...
2820

8f05a6a65   Tejun Heo   percpu: make 4k f...
2821
2822
2823
2824
2825
2826
2827
2828
2829
  		/*
  		 * FIXME: Archs with virtual cache should flush local
  		 * cache for the linear mapping here - something
  		 * equivalent to flush_cache_vmap() on the local cpu.
  		 * flush_cache_vmap() can't be used as most supporting
  		 * data structures are not set up yet.
  		 */
  
  		/* copy static data */
fd1e8a1fe   Tejun Heo   percpu: introduce...
2830
  		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
66c3a7577   Tejun Heo   percpu: generaliz...
2831
2832
2833
  	}
  
  	/* we're ready, commit */
00206a69e   Matteo Croce   percpu: stop prin...
2834
2835
2836
  	pr_info("%d %s pages/cpu s%zu r%zu d%zu
  ",
  		unit_pages, psize_str, ai->static_size,
fd1e8a1fe   Tejun Heo   percpu: introduce...
2837
  		ai->reserved_size, ai->dyn_size);
d4b95f803   Tejun Heo   x86,percpu: gener...
2838

163fa2343   Kefeng Wang   percpu: Make pcpu...
2839
  	pcpu_setup_first_chunk(ai, vm.addr);
d4b95f803   Tejun Heo   x86,percpu: gener...
2840
2841
2842
2843
  	goto out_free_ar;
  
  enomem:
  	while (--j >= 0)
ce3141a27   Tejun Heo   percpu: drop pcpu...
2844
  		free_fn(page_address(pages[j]), PAGE_SIZE);
fb435d523   Tejun Heo   percpu: add pcpu_...
2845
  	rc = -ENOMEM;
d4b95f803   Tejun Heo   x86,percpu: gener...
2846
  out_free_ar:
999c17e3d   Santosh Shilimkar   mm/percpu.c: use ...
2847
  	memblock_free_early(__pa(pages), pages_size);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2848
  	pcpu_free_alloc_info(ai);
fb435d523   Tejun Heo   percpu: add pcpu_...
2849
  	return rc;
d4b95f803   Tejun Heo   x86,percpu: gener...
2850
  }
3c9a024fd   Tejun Heo   percpu: fix build...
2851
  #endif /* BUILD_PAGE_FIRST_CHUNK */
d4b95f803   Tejun Heo   x86,percpu: gener...
2852

bbddff054   Tejun Heo   percpu: use percp...
2853
  #ifndef	CONFIG_HAVE_SETUP_PER_CPU_AREA
8c4bfc6e8   Tejun Heo   x86,percpu: gener...
2854
  /*
bbddff054   Tejun Heo   percpu: use percp...
2855
   * Generic SMP percpu area setup.
e74e39620   Tejun Heo   percpu: use dynam...
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
   *
   * The embedding helper is used because its behavior closely resembles
   * the original non-dynamic generic percpu area setup.  This is
   * important because many archs have addressing restrictions and might
   * fail if the percpu area is located far away from the previous
   * location.  As an added bonus, in non-NUMA cases, embedding is
   * generally a good idea TLB-wise because percpu area can piggy back
   * on the physical linear memory mapping which uses large page
   * mappings on applicable archs.
   */
e74e39620   Tejun Heo   percpu: use dynam...
2866
2867
  /*
   * Per-cpu offset table used by the generic setup below.
   * setup_per_cpu_areas() stores, for each possible cpu,
   * (pcpu_base_addr - __per_cpu_start) + pcpu_unit_offsets[cpu].
   * The symbol is exported.
   */
  unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
  EXPORT_SYMBOL(__per_cpu_offset);
c8826dd53   Tejun Heo   percpu: update em...
2868
2869
2870
  /*
   * Default allocation callback handed to pcpu_embed_first_chunk() by the
   * generic setup_per_cpu_areas().
   *
   * @cpu:   not consulted by this implementation
   * @size:  number of bytes requested
   * @align: requested alignment
   *
   * Returns whatever memblock_alloc_from() returns for @size/@align with
   * __pa(MAX_DMA_ADDRESS) as its address argument.
   */
  static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
  				       size_t align)
  {
  	void *area;
  
  	area = memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
  
  	return area;
  }
66c3a7577   Tejun Heo   percpu: generaliz...
2873

c8826dd53   Tejun Heo   percpu: update em...
2874
2875
  /*
   * Default free callback handed to pcpu_embed_first_chunk() by the
   * generic setup_per_cpu_areas().
   *
   * @ptr:  start of the area to release
   * @size: number of bytes to release
   *
   * The area is given back through memblock_free_early(), which is passed
   * __pa(@ptr) rather than @ptr itself.
   */
  static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
  {
999c17e3d   Santosh Shilimkar   mm/percpu.c: use ...
2876
  	memblock_free_early(__pa(ptr), size);
c8826dd53   Tejun Heo   percpu: update em...
2877
  }
e74e39620   Tejun Heo   percpu: use dynam...
2878
2879
  /*
   * Generic SMP variant: build the first chunk with the embedding helper
   * and then fill __per_cpu_offset[] for every possible cpu.  Panics if
   * the first chunk cannot be set up.
   */
  void __init setup_per_cpu_areas(void)
  {
  	unsigned long base_delta;
  	unsigned int cpu;
  
  	/*
  	 * Always reserve area for module percpu variables.  That's
  	 * what the legacy allocator did.
  	 */
  	if (pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
  				   PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
  				   pcpu_dfl_fc_alloc, pcpu_dfl_fc_free) < 0)
  		panic("Failed to initialize percpu areas.");
  
  	/* distance from the linked percpu section to the chunk base */
  	base_delta = (unsigned long)pcpu_base_addr;
  	base_delta -= (unsigned long)__per_cpu_start;
  
  	for_each_possible_cpu(cpu) {
  		__per_cpu_offset[cpu] = base_delta + pcpu_unit_offsets[cpu];
  	}
  }
bbddff054   Tejun Heo   percpu: use percp...
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
  #endif	/* CONFIG_HAVE_SETUP_PER_CPU_AREA */
  
  #else	/* CONFIG_SMP */
  
  /*
   * UP percpu area setup.
   *
   * UP always uses km-based percpu allocator with identity mapping.
   * Static percpu variables are indistinguishable from the usual static
   * variables and don't require any special preparation.
   */
  void __init setup_per_cpu_areas(void)
  {
  	/* larger of the two minimum sizes, rounded up to a power of two */
  	const size_t unit_size =
  		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
  					 PERCPU_DYNAMIC_RESERVE));
  	struct pcpu_alloc_info *info = pcpu_alloc_alloc_info(1, 1);
  	void *chunk_base = memblock_alloc_from(unit_size, PAGE_SIZE,
  					       __pa(MAX_DMA_ADDRESS));
  
  	/* either allocation failing is fatal */
  	if (!info || !chunk_base)
  		panic("Failed to allocate memory for percpu areas.");
  
  	/* kmemleak tracks the percpu allocations separately */
  	kmemleak_free(chunk_base);
  
  	/* group 0 holds exactly one unit, and that unit maps to cpu 0 */
  	info->groups[0].cpu_map[0] = 0;
  	info->groups[0].nr_units = 1;
  
  	/* dyn, unit, atom and alloc sizes are all the single unit's size */
  	info->alloc_size = unit_size;
  	info->atom_size = unit_size;
  	info->unit_size = unit_size;
  	info->dyn_size = unit_size;
  
  	/* the alloc info is released once the first chunk has been set up */
  	pcpu_setup_first_chunk(info, chunk_base);
  	pcpu_free_alloc_info(info);
  }
  
  #endif	/* CONFIG_SMP */
099a19d91   Tejun Heo   percpu: allow lim...
2935
2936
  
  /*
7e8a6304d   Dennis Zhou (Facebook)   /proc/meminfo: ad...
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
   * pcpu_nr_pages - calculate total number of populated backing pages
   *
   * This reflects the number of pages populated to back chunks.  Metadata is
   * excluded in the number exposed in meminfo as the number of backing pages
   * scales with the number of cpus and can quickly outweigh the memory used for
   * metadata.  It also keeps this calculation nice and simple.
   *
   * RETURNS:
   * Total number of populated backing pages in use by the allocator.
   */
  unsigned long pcpu_nr_pages(void)
  {
  	/* populated page count multiplied by the number of units */
  	const unsigned long nr_backing = pcpu_nr_populated * pcpu_nr_units;
  
  	return nr_backing;
  }
  
  /*
1a4d76076   Tejun Heo   percpu: implement...
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
   * Percpu allocator is initialized early during boot when neither slab nor
   * workqueue is available.  Plug async management until everything is up
   * and running.
   */
  static int __init percpu_enable_async(void)
  {
  	/* flip the flag that keeps async management plugged during early boot */
  	pcpu_async_enabled = true;
  	/* always reports success */
  	return 0;
  }
  /* registered to run at the subsys initcall level */
  subsys_initcall(percpu_enable_async);