Blame view

mm/percpu.c 95.5 KB
55716d264   Thomas Gleixner   treewide: Replace...
1
  // SPDX-License-Identifier: GPL-2.0-only
fbf59bc9d   Tejun Heo   percpu: implement...
2
  /*
88999a898   Tejun Heo   percpu: misc prep...
3
   * mm/percpu.c - percpu memory allocator
fbf59bc9d   Tejun Heo   percpu: implement...
4
5
6
7
   *
   * Copyright (C) 2009		SUSE Linux Products GmbH
   * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
   *
5e81ee3e6   Dennis Zhou (Facebook)   percpu: update he...
8
   * Copyright (C) 2017		Facebook Inc.
bfacd38f8   Dennis Zhou   percpu: update co...
9
   * Copyright (C) 2017		Dennis Zhou <dennis@kernel.org>
5e81ee3e6   Dennis Zhou (Facebook)   percpu: update he...
10
   *
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
11
12
13
14
   * The percpu allocator handles both static and dynamic areas.  Percpu
   * areas are allocated in chunks which are divided into units.  There is
   * a 1-to-1 mapping for units to possible cpus.  These units are grouped
   * based on NUMA properties of the machine.
fbf59bc9d   Tejun Heo   percpu: implement...
15
16
17
18
19
20
   *
   *  c0                           c1                         c2
   *  -------------------          -------------------        ------------
   * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
   *  -------------------  ......  -------------------  ....  ------------
   *
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
21
22
23
24
25
26
27
28
29
   * Allocation is done by offsets into a unit's address space.  I.e., an
   * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
   * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
   * and even sparse.  Access is handled by configuring percpu base
   * registers according to the cpu to unit mappings and offsetting the
   * base address using pcpu_unit_size.
   *
   * There is special consideration for the first chunk which must handle
   * the static percpu variables in the kernel image as allocation services
5e81ee3e6   Dennis Zhou (Facebook)   percpu: update he...
30
   * are not online yet.  In short, the first chunk is structured like so:
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
31
32
33
34
35
36
37
   *
   *                  <Static | [Reserved] | Dynamic>
   *
   * The static data is copied from the original section managed by the
   * linker.  The reserved section, if non-zero, primarily manages static
   * percpu variables from kernel modules.  Finally, the dynamic section
   * takes care of normal allocations.
fbf59bc9d   Tejun Heo   percpu: implement...
38
   *
5e81ee3e6   Dennis Zhou (Facebook)   percpu: update he...
39
   * The allocator organizes chunks into lists according to free size and
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
40
41
42
43
44
45
46
47
   * memcg-awareness.  To make a percpu allocation memcg-aware the __GFP_ACCOUNT
   * flag should be passed.  All memcg-aware allocations are sharing one set
   * of chunks and all unaccounted allocations and allocations performed
   * by processes belonging to the root memory cgroup are using the second set.
   *
   * The allocator tries to allocate from the fullest chunk first. Each chunk
   * is managed by a bitmap with metadata blocks.  The allocation map is updated
   * on every allocation and free to reflect the current state while the boundary
5e81ee3e6   Dennis Zhou (Facebook)   percpu: update he...
48
49
50
51
52
53
54
55
56
57
   * map is only updated on allocation.  Each metadata block contains
   * information to help mitigate the need to iterate over large portions
   * of the bitmap.  The reverse mapping from page to chunk is stored in
   * the page's index.  Lastly, units are lazily backed and grow in unison.
   *
   * There is a unique conversion that goes on here between bytes and bits.
   * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
   * tracks the number of pages it is responsible for in nr_pages.  Helper
   * functions are used to convert between bytes, bits, and blocks.
   * All hints are managed in bits unless explicitly stated.
9c0151627   Dennis Zhou (Facebook)   percpu: update th...
58
   *
4091fb95b   Masahiro Yamada   scripts/spelling....
59
   * To use this allocator, arch code should do the following:
fbf59bc9d   Tejun Heo   percpu: implement...
60
   *
fbf59bc9d   Tejun Heo   percpu: implement...
61
   * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
e01009833   Tejun Heo   percpu: make x86 ...
62
63
   *   regular address to percpu pointer and back if they need to be
   *   different from the default
fbf59bc9d   Tejun Heo   percpu: implement...
64
   *
8d408b4be   Tejun Heo   percpu: give more...
65
66
   * - use pcpu_setup_first_chunk() during percpu area initialization to
   *   setup the first chunk containing the kernel static percpu area
fbf59bc9d   Tejun Heo   percpu: implement...
67
   */
870d4b12a   Joe Perches   mm: percpu: use p...
68
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
fbf59bc9d   Tejun Heo   percpu: implement...
69
  #include <linux/bitmap.h>
57c8a661d   Mike Rapoport   mm: remove includ...
70
  #include <linux/memblock.h>
fd1e8a1fe   Tejun Heo   percpu: introduce...
71
  #include <linux/err.h>
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
72
  #include <linux/lcm.h>
fbf59bc9d   Tejun Heo   percpu: implement...
73
  #include <linux/list.h>
a530b7958   Tejun Heo   percpu: teach lar...
74
  #include <linux/log2.h>
fbf59bc9d   Tejun Heo   percpu: implement...
75
76
77
78
79
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/mutex.h>
  #include <linux/percpu.h>
  #include <linux/pfn.h>
fbf59bc9d   Tejun Heo   percpu: implement...
80
  #include <linux/slab.h>
ccea34b5d   Tejun Heo   percpu: finer gra...
81
  #include <linux/spinlock.h>
fbf59bc9d   Tejun Heo   percpu: implement...
82
  #include <linux/vmalloc.h>
a56dbddf0   Tejun Heo   percpu: move full...
83
  #include <linux/workqueue.h>
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
84
  #include <linux/kmemleak.h>
71546d100   Tejun Heo   percpu: include l...
85
  #include <linux/sched.h>
28307d938   Filipe Manana   percpu: make pcpu...
86
  #include <linux/sched/mm.h>
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
87
  #include <linux/memcontrol.h>
fbf59bc9d   Tejun Heo   percpu: implement...
88
89
  
  #include <asm/cacheflush.h>
e01009833   Tejun Heo   percpu: make x86 ...
90
  #include <asm/sections.h>
fbf59bc9d   Tejun Heo   percpu: implement...
91
  #include <asm/tlbflush.h>
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
92
  #include <asm/io.h>
fbf59bc9d   Tejun Heo   percpu: implement...
93

df95e795a   Dennis Zhou   percpu: add trace...
94
95
  #define CREATE_TRACE_POINTS
  #include <trace/events/percpu.h>
8fa3ed801   Dennis Zhou   percpu: migrate p...
96
  #include "percpu-internal.h"
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
97
98
  /* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
  #define PCPU_SLOT_BASE_SHIFT		5
8744d8594   Dennis Zhou   percpu: relegate ...
99
100
  /* chunks in slots below this are subject to being sidelined on failed alloc */
  #define PCPU_SLOT_FAIL_THRESHOLD	3
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
101

1a4d76076   Tejun Heo   percpu: implement...
102
103
  /*
   * Bounds on the number of empty populated pages; the balance work
   * description further down in this file says the allocator tries to stay
   * between LOW and HIGH so that atomic allocations can be served.
   */
  #define PCPU_EMPTY_POP_PAGES_LOW	2
  #define PCPU_EMPTY_POP_PAGES_HIGH	4
fbf59bc9d   Tejun Heo   percpu: implement...
104

bbddff054   Tejun Heo   percpu: use percp...
105
  #ifdef CONFIG_SMP
  /*
   * Default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary.
   *
   * A percpu pointer is the regular address rebased from pcpu_base_addr onto
   * __per_cpu_start; the reverse macro undoes exactly that rebasing.
   */
  #ifndef __addr_to_pcpu_ptr
  #define __addr_to_pcpu_ptr(addr)					\
  	(void __percpu *)((unsigned long)(addr) -			\
  			  (unsigned long)pcpu_base_addr	+		\
  			  (unsigned long)__per_cpu_start)
  #endif
  #ifndef __pcpu_ptr_to_addr
  #define __pcpu_ptr_to_addr(ptr)						\
  	(void __force *)((unsigned long)(ptr) +				\
  			 (unsigned long)pcpu_base_addr -		\
  			 (unsigned long)__per_cpu_start)
  #endif
  #else	/* CONFIG_SMP */
  /* on UP, it's always identity mapped */
  #define __addr_to_pcpu_ptr(addr)	(void __percpu *)(addr)
  #define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
  #endif	/* CONFIG_SMP */
e01009833   Tejun Heo   percpu: make x86 ...
124

1328710b8   Daniel Micay   mark most percpu ...
125
126
127
128
  /*
   * Unit geometry.  pcpu_unit_pages is the per-unit page stride used by
   * pcpu_page_idx(); pcpu_unit_size is the size that pcpu_size_to_slot()
   * maps to the last slot.
   * NOTE(review): pcpu_nr_units and pcpu_atom_size are not referenced in this
   * part of the file; presumably the number of units and the allocation atom
   * size - confirm against their users.
   */
  static int pcpu_unit_pages __ro_after_init;
  static int pcpu_unit_size __ro_after_init;
  static int pcpu_nr_units __ro_after_init;
  static int pcpu_atom_size __ro_after_init;
8fa3ed801   Dennis Zhou   percpu: migrate p...
129
  int pcpu_nr_slots __ro_after_init;
1328710b8   Daniel Micay   mark most percpu ...
130
  static size_t pcpu_chunk_struct_size __ro_after_init;
fbf59bc9d   Tejun Heo   percpu: implement...
131

a855b84c3   Tejun Heo   percpu: fix chunk...
132
  /* cpus with the lowest and highest unit addresses */
1328710b8   Daniel Micay   mark most percpu ...
133
134
  static unsigned int pcpu_low_unit_cpu __ro_after_init;
  static unsigned int pcpu_high_unit_cpu __ro_after_init;
2f39e637e   Tejun Heo   percpu: allow non...
135

fbf59bc9d   Tejun Heo   percpu: implement...
136
  /* the address of the first chunk which starts with the kernel static area */
1328710b8   Daniel Micay   mark most percpu ...
137
  void *pcpu_base_addr __ro_after_init;
fbf59bc9d   Tejun Heo   percpu: implement...
138
  EXPORT_SYMBOL_GPL(pcpu_base_addr);
1328710b8   Daniel Micay   mark most percpu ...
139
140
  static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
  const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */
2f39e637e   Tejun Heo   percpu: allow non...
141

6563297ce   Tejun Heo   percpu: use group...
142
  /* group information, used for vm allocation */
1328710b8   Daniel Micay   mark most percpu ...
143
144
145
  static int pcpu_nr_groups __ro_after_init;
  static const unsigned long *pcpu_group_offsets __ro_after_init;
  static const size_t *pcpu_group_sizes __ro_after_init;
6563297ce   Tejun Heo   percpu: use group...
146

ae9e6bc9f   Tejun Heo   percpu: don't put...
147
148
149
150
151
  /*
   * The first chunk which always exists.  Note that unlike other
   * chunks, this one can be allocated and mapped in several different
   * ways and thus often doesn't live in the vmalloc area.
   */
8fa3ed801   Dennis Zhou   percpu: migrate p...
152
  struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
ae9e6bc9f   Tejun Heo   percpu: don't put...
153
154
155
  
  /*
   * Optional reserved chunk.  This chunk reserves part of the first
e22667056   Dennis Zhou (Facebook)   percpu: introduce...
156
157
   * chunk and serves it for reserved allocations.  When the reserved
   * region doesn't exist, the following variable is NULL.
ae9e6bc9f   Tejun Heo   percpu: don't put...
158
   */
8fa3ed801   Dennis Zhou   percpu: migrate p...
159
  struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
edcb46399   Tejun Heo   percpu, module: i...
160

8fa3ed801   Dennis Zhou   percpu: migrate p...
161
  DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
6710e594f   Tejun Heo   percpu: fix synch...
162
  static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */
fbf59bc9d   Tejun Heo   percpu: implement...
163

3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
164
  struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
fbf59bc9d   Tejun Heo   percpu: implement...
165

4f996e234   Tejun Heo   percpu: fix synch...
166
167
  /* chunks which need their map areas extended, protected by pcpu_lock */
  static LIST_HEAD(pcpu_map_extend_chunks);
b539b87fe   Tejun Heo   percpu: implmeent...
168
169
170
171
  /*
   * The number of empty populated pages, protected by pcpu_lock.  The
   * reserved chunk doesn't contribute to the count.
   */
6b9b6f399   Dennis Zhou (Facebook)   percpu: expose pc...
172
  int pcpu_nr_empty_pop_pages;
b539b87fe   Tejun Heo   percpu: implmeent...
173

1a4d76076   Tejun Heo   percpu: implement...
174
  /*
7e8a6304d   Dennis Zhou (Facebook)   /proc/meminfo: ad...
175
176
177
178
179
180
181
182
   * The number of populated pages in use by the allocator, protected by
   * pcpu_lock.  This number is kept per unit per chunk (i.e. when a page gets
   * allocated/deallocated, it is allocated/deallocated in all units of a chunk
   * and increments/decrements this count by 1).
   */
  static unsigned long pcpu_nr_populated;
  
  /*
1a4d76076   Tejun Heo   percpu: implement...
183
184
185
186
187
   * Balance work is used to populate or destroy chunks asynchronously.  We
   * try to keep the number of populated free pages between
   * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
   * empty chunk.
   */
fe6bd8c3d   Tejun Heo   percpu: rename pc...
188
189
  static void pcpu_balance_workfn(struct work_struct *work);
  static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
1a4d76076   Tejun Heo   percpu: implement...
190
191
192
193
194
195
196
197
  /* gates pcpu_schedule_balance_work(): balance work is only queued when set */
  static bool pcpu_async_enabled __read_mostly;
  /*
   * NOTE(review): not referenced in this part of the file; presumably records
   * that an atomic allocation failed - confirm against its users.
   */
  static bool pcpu_atomic_alloc_failed;
  
  /* queue pcpu_balance_work, but only once async operation has been enabled */
  static void pcpu_schedule_balance_work(void)
  {
  	if (!pcpu_async_enabled)
  		return;

  	schedule_work(&pcpu_balance_work);
  }
a56dbddf0   Tejun Heo   percpu: move full...
198

c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
199
  /**
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
200
201
202
   * pcpu_addr_in_chunk - check if the address is served from this chunk
   * @chunk: chunk of interest
   * @addr: percpu address
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
203
204
   *
   * RETURNS:
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
205
   * True if the address is served from this chunk.
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
206
   */
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
207
  static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
020ec6537   Tejun Heo   percpu: factor ou...
208
  {
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
209
  	void *start_addr, *end_addr;
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
210
  	if (!chunk)
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
211
  		return false;
020ec6537   Tejun Heo   percpu: factor ou...
212

560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
213
214
215
  	start_addr = chunk->base_addr + chunk->start_offset;
  	end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
  		   chunk->end_offset;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
216
217
  
  	return addr >= start_addr && addr < end_addr;
020ec6537   Tejun Heo   percpu: factor ou...
218
  }
d9b55eeb1   Tejun Heo   percpu: remove un...
219
  static int __pcpu_size_to_slot(int size)
fbf59bc9d   Tejun Heo   percpu: implement...
220
  {
cae3aeb83   Tejun Heo   percpu: clean up ...
221
  	int highbit = fls(size);	/* size is in bytes */
fbf59bc9d   Tejun Heo   percpu: implement...
222
223
  	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
  }
d9b55eeb1   Tejun Heo   percpu: remove un...
224
225
226
227
228
229
  /*
   * Slot index for @size bytes.  A size equal to pcpu_unit_size maps to the
   * last slot; every other size goes through __pcpu_size_to_slot().
   */
  static int pcpu_size_to_slot(int size)
  {
  	return size == pcpu_unit_size ? pcpu_nr_slots - 1
  				      : __pcpu_size_to_slot(size);
  }
fbf59bc9d   Tejun Heo   percpu: implement...
230
231
  static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
  {
92c14cab4   Dennis Zhou   percpu: convert c...
232
233
234
235
  	const struct pcpu_block_md *chunk_md = &chunk->chunk_md;
  
  	if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE ||
  	    chunk_md->contig_hint == 0)
fbf59bc9d   Tejun Heo   percpu: implement...
236
  		return 0;
92c14cab4   Dennis Zhou   percpu: convert c...
237
  	return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
fbf59bc9d   Tejun Heo   percpu: implement...
238
  }
88999a898   Tejun Heo   percpu: misc prep...
239
240
241
242
243
244
245
246
247
248
249
250
251
  /* record @pcpu in @page->index so the chunk can be looked up from the page */
  static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
  {
  	unsigned long chunk_addr = (unsigned long)pcpu;

  	page->index = chunk_addr;
  }
  
  /* read back the chunk pointer that pcpu_set_page_chunk() put in @page->index */
  static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
  {
  	unsigned long chunk_addr = page->index;

  	return (struct pcpu_chunk *)chunk_addr;
  }
  
  /*
   * Flat page index of page @page_idx within @cpu's unit: the unit number from
   * pcpu_unit_map[] times the pages per unit, plus @page_idx.
   */
  static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
  {
  	return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
  }
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
255
256
257
258
  /* byte offset of page @page_idx inside @cpu's unit, unit offset included */
  static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
  {
  	int page_bytes = page_idx << PAGE_SHIFT;

  	return pcpu_unit_offsets[cpu] + page_bytes;
  }
9983b6f0c   Tejun Heo   percpu: fix first...
259
260
  static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
  				     unsigned int cpu, int page_idx)
fbf59bc9d   Tejun Heo   percpu: implement...
261
  {
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
262
263
  	return (unsigned long)chunk->base_addr +
  	       pcpu_unit_page_offset(cpu, page_idx);
fbf59bc9d   Tejun Heo   percpu: implement...
264
  }
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
  /*
   * The following are helper functions to help access bitmaps and convert
   * between bitmap offsets to address offsets.
   */
  /* pointer to the first alloc_map word that belongs to block @index */
  static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
  {
  	return &chunk->alloc_map[index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG];
  }
  
  /* index of the block that contains chunk bit offset @off */
  static unsigned long pcpu_off_to_block_index(int off)
  {
  	unsigned long block_index = off / PCPU_BITMAP_BLOCK_BITS;

  	return block_index;
  }
  
  /* offset of chunk bit offset @off within its block (masks the low bits) */
  static unsigned long pcpu_off_to_block_off(int off)
  {
  	unsigned long block_off = off & (PCPU_BITMAP_BLOCK_BITS - 1);

  	return block_off;
  }
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
284
285
286
287
  /* chunk bit offset for in-block offset @off of block @index */
  static unsigned long pcpu_block_off_to_off(int index, int off)
  {
  	unsigned long chunk_off = index * PCPU_BITMAP_BLOCK_BITS + off;

  	return chunk_off;
  }
382b88e96   Dennis Zhou   percpu: add block...
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
  /*
   * pcpu_next_hint - pick the in-block offset to start scanning from
   * @block: block of interest
   * @alloc_bits: size of allocation
   *
   * Scanning normally starts at first_free so that requests are served first
   * fit.  When a scan_hint exists, the contig_hint starts after it, and the
   * request is larger than the scan_hint, scanning may instead start right
   * after the scan_hint, with the contig_hint remaining as the fallback.
   */
  static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
  {
  	/* no scan_hint recorded, nothing can be skipped */
  	if (!block->scan_hint)
  		return block->first_free;

  	/* the contig_hint does not start after the scan_hint */
  	if (block->contig_hint_start <= block->scan_hint_start)
  		return block->first_free;

  	/* the request is not larger than the scan_hint, so it must be scanned */
  	if (alloc_bits <= block->scan_hint)
  		return block->first_free;

  	return block->scan_hint_start + block->scan_hint;
  }
fbf59bc9d   Tejun Heo   percpu: implement...
315
  /**
   * pcpu_next_md_free_region - finds the next hint free area
   * @chunk: chunk of interest
   * @bit_off: chunk offset, read as the starting point and updated in place
   * @bits: size of free area, written in place
   *
   * Helper function for pcpu_for_each_md_free_region.  It checks
   * block->contig_hint and performs aggregation across blocks to find the
   * next hint.  It modifies bit_off and bits in-place to be consumed in the
   * loop.  If the loop below runs off the last block, @bit_off and @bits are
   * left at whatever the final iteration stored.
   */
  static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
  				     int *bits)
  {
  	int i = pcpu_off_to_block_index(*bit_off);
  	int block_off = pcpu_off_to_block_off(*bit_off);
  	struct pcpu_block_md *block;

  	*bits = 0;
  	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
  	     block++, i++) {
  		/* handles contig area across blocks */
  		if (*bits) {
  			*bits += block->left_free;
  			/* a fully free block keeps the region growing */
  			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
  				continue;
  			return;
  		}

  		/*
  		 * This checks three things.  First is there a contig_hint to
  		 * check.  Second, have we checked this hint before by
  		 * comparing the block_off.  Third, is this the same as the
  		 * right contig hint.  In the last case, it spills over into
  		 * the next block and should be handled by the contig area
  		 * across blocks code.
  		 */
  		*bits = block->contig_hint;
  		if (*bits && block->contig_hint_start >= block_off &&
  		    *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
  			*bit_off = pcpu_block_off_to_off(i,
  					block->contig_hint_start);
  			return;
  		}
  		/* reset to satisfy the second predicate above */
  		block_off = 0;

  		/* carry the free tail of this block into the next iteration */
  		*bits = block->right_free;
  		*bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
  	}
  }
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
  /**
   * pcpu_next_fit_region - finds fit areas for a given allocation request
   * @chunk: chunk of interest
   * @alloc_bits: size of allocation
   * @align: alignment of area (max PAGE_SIZE)
   * @bit_off: chunk offset, read as the starting point and updated in place
   * @bits: size of free area, written in place
   *
   * Finds the next free region that is viable for use with a given size and
   * alignment.  This only returns if there is a valid area to be used for this
   * allocation.  block->first_free is returned if the allocation request fits
   * within the block to see if the request can be fulfilled prior to the contig
   * hint.
   *
   * When no viable region exists, @bit_off is set to
   * pcpu_chunk_map_bits(@chunk), which is the end condition of
   * pcpu_for_each_fit_region.
   */
  static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
  				 int align, int *bit_off, int *bits)
  {
  	int i = pcpu_off_to_block_index(*bit_off);
  	int block_off = pcpu_off_to_block_off(*bit_off);
  	struct pcpu_block_md *block;

  	*bits = 0;
  	for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
  	     block++, i++) {
  		/* handles contig area across blocks */
  		if (*bits) {
  			*bits += block->left_free;
  			if (*bits >= alloc_bits)
  				return;
  			/* a fully free block keeps the region growing */
  			if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
  				continue;
  		}

  		/* check block->contig_hint */
  		*bits = ALIGN(block->contig_hint_start, align) -
  			block->contig_hint_start;
  		/*
  		 * This uses the block offset to determine if this has been
  		 * checked in the prior iteration.
  		 */
  		if (block->contig_hint &&
  		    block->contig_hint_start >= block_off &&
  		    block->contig_hint >= *bits + alloc_bits) {
  			int start = pcpu_next_hint(block, alloc_bits);

  			*bits += alloc_bits + block->contig_hint_start -
  				 start;
  			*bit_off = pcpu_block_off_to_off(i, start);
  			return;
  		}
  		/* reset to satisfy the second predicate above */
  		block_off = 0;

  		/* try the aligned part of the free tail of this block */
  		*bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
  				 align);
  		*bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
  		*bit_off = pcpu_block_off_to_off(i, *bit_off);
  		if (*bits >= alloc_bits)
  			return;
  	}

  	/* no valid offsets were found - fail condition */
  	*bit_off = pcpu_chunk_map_bits(chunk);
  }
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
429
430
431
  /*
   * Metadata free area iterators.  These perform aggregation of free areas
   * based on the metadata blocks and return the offset @bit_off and size in
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
432
433
   * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
   * a fit is found for the allocation request.
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
434
435
436
437
438
439
   */
  /*
   * Walk the free regions of @chunk via pcpu_next_md_free_region().
   * @bit_off and @bits must be int lvalues: they are passed by address and
   * rewritten on every step, and @bit_off must hold the starting offset on
   * entry.  The walk stops once @bit_off reaches pcpu_chunk_map_bits(@chunk).
   * Each step resumes one bit past the end of the region just visited
   * (presumably because the bit right after a free region is in use -
   * confirm).
   */
  #define pcpu_for_each_md_free_region(chunk, bit_off, bits)		\
  	for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));	\
  	     (bit_off) < pcpu_chunk_map_bits((chunk));			\
  	     (bit_off) += (bits) + 1,					\
  	     pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
440
441
442
443
444
445
446
  /*
   * Walk the regions of @chunk that pcpu_next_fit_region() reports as able to
   * hold @alloc_bits at alignment @align.  @bit_off and @bits must be int
   * lvalues: they are passed by address and rewritten on every step, and
   * @bit_off must hold the starting offset on entry.  The walk stops once
   * @bit_off reaches pcpu_chunk_map_bits(@chunk), which is also the value
   * pcpu_next_fit_region() stores when nothing fits.  Each step resumes at the
   * end of the region just visited.
   */
  #define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
  	for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
  				  &(bits));				      \
  	     (bit_off) < pcpu_chunk_map_bits((chunk));			      \
  	     (bit_off) += (bits),					      \
  	     pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
  				  &(bits)))
525ca84da   Dennis Zhou (Facebook)   percpu: use metad...
447
  /**
90459ce06   Bob Liu   percpu: rename pc...
448
   * pcpu_mem_zalloc - allocate memory
1880d93b8   Tejun Heo   percpu: replace p...
449
   * @size: bytes to allocate
47504ee04   Dennis Zhou   percpu: add __GFP...
450
   * @gfp: allocation flags
fbf59bc9d   Tejun Heo   percpu: implement...
451
   *
1880d93b8   Tejun Heo   percpu: replace p...
452
   * Allocate @size bytes.  If @size is smaller than PAGE_SIZE,
47504ee04   Dennis Zhou   percpu: add __GFP...
453
454
455
   * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
   * This is to facilitate passing through whitelisted flags.  The
   * returned memory is always zeroed.
fbf59bc9d   Tejun Heo   percpu: implement...
456
457
   *
   * RETURNS:
1880d93b8   Tejun Heo   percpu: replace p...
458
   * Pointer to the allocated area on success, NULL on failure.
fbf59bc9d   Tejun Heo   percpu: implement...
459
   */
47504ee04   Dennis Zhou   percpu: add __GFP...
460
  static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
fbf59bc9d   Tejun Heo   percpu: implement...
461
  {
099a19d91   Tejun Heo   percpu: allow lim...
462
463
  	if (WARN_ON_ONCE(!slab_is_available()))
  		return NULL;
1880d93b8   Tejun Heo   percpu: replace p...
464
  	if (size <= PAGE_SIZE)
554fef1c3   Dennis Zhou   percpu: allow sel...
465
  		return kzalloc(size, gfp);
7af4c0932   Jesper Juhl   percpu: zero memo...
466
  	else
88dca4ca5   Christoph Hellwig   mm: remove the pg...
467
  		return __vmalloc(size, gfp | __GFP_ZERO);
1880d93b8   Tejun Heo   percpu: replace p...
468
  }
fbf59bc9d   Tejun Heo   percpu: implement...
469

1880d93b8   Tejun Heo   percpu: replace p...
470
471
472
  /**
   * pcpu_mem_free - free memory
   * @ptr: memory to free
   *
   * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
   * kvfree() is used, so the same call serves both the kzalloc() and the
   * __vmalloc() paths of pcpu_mem_zalloc().
   */
  static void pcpu_mem_free(void *ptr)
  {
  	kvfree(ptr);
  }
8744d8594   Dennis Zhou   percpu: relegate ...
480
481
482
483
  /*
   * Move @chunk onto list @slot of the chunk lists for its chunk type.
   * @move_front selects the head of that list (list_move()) over the tail
   * (list_move_tail()).  The reserved chunk is left untouched.
   */
  static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
  			      bool move_front)
  {
  	if (chunk != pcpu_reserved_chunk) {
  		struct list_head *pcpu_slot;

  		pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
  		if (move_front)
  			list_move(&chunk->list, &pcpu_slot[slot]);
  		else
  			list_move_tail(&chunk->list, &pcpu_slot[slot]);
  	}
  }
  
  static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
  {
  	__pcpu_chunk_move(chunk, slot, true);
  }
fbf59bc9d   Tejun Heo   percpu: implement...
498
499
500
501
502
503
504
  /**
   * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
   * @chunk: chunk of interest
   * @oslot: the previous slot it was on
   *
   * This function is called after an allocation or free changed @chunk.
   * New slot according to the changed state is determined and @chunk is
   * moved to the slot.  Note that the reserved chunk is never put on
   * chunk slots.
   *
   * Nothing is moved when the slot is unchanged.  Otherwise the chunk goes to
   * the front of the new slot's list when the new slot index is higher than
   * @oslot, and to the tail when it is lower.
   *
   * CONTEXT:
   * pcpu_lock.
   */
  static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
  {
  	int nslot = pcpu_chunk_slot(chunk);

  	if (oslot != nslot)
  		__pcpu_chunk_move(chunk, nslot, oslot < nslot);
  }
b239f7daf   Dennis Zhou   percpu: set PCPU_...
517
518
  /*
   * pcpu_update_empty_pages - update empty page counters
833af8427   Tejun Heo   percpu: restructu...
519
   * @chunk: chunk of interest
b239f7daf   Dennis Zhou   percpu: set PCPU_...
520
   * @nr: nr of empty pages
833af8427   Tejun Heo   percpu: restructu...
521
   *
b239f7daf   Dennis Zhou   percpu: set PCPU_...
522
523
524
   * This is used to keep track of the empty pages now based on the premise
   * a md_block covers a page.  The hint update functions recognize if a block
   * is made full or broken to calculate deltas for keeping track of free pages.
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
525
   */
b239f7daf   Dennis Zhou   percpu: set PCPU_...
526
  static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
527
  {
b239f7daf   Dennis Zhou   percpu: set PCPU_...
528
529
530
  	chunk->nr_empty_pop_pages += nr;
  	if (chunk != pcpu_reserved_chunk)
  		pcpu_nr_empty_pop_pages += nr;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
531
  }
d9f3a01ee   Dennis Zhou   percpu: introduce...
532
533
534
535
536
537
  /*
   * pcpu_region_overlap - determines if two regions overlap
   * @a: start of first region, inclusive
   * @b: end of first region, exclusive
   * @x: start of second region, inclusive
   * @y: end of second region, exclusive
   *
   * This is used to determine if the hint region [a, b) overlaps with the
   * allocated region [x, y).  Regions that merely touch (b == x or y == a)
   * do not overlap.
   */
  static inline bool pcpu_region_overlap(int a, int b, int x, int y)
  {
  	return (a < y) && (x < b);
  }
9f7dcf224   Tejun Heo   percpu: move chun...
546

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
547
  /**
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
548
549
550
551
552
553
   * pcpu_block_update - updates a block given a free area
   * @block: block of interest
   * @start: start offset in block
   * @end: end offset in block
   *
   * Updates a block given a known free area.  The region [start, end) is
268625a6f   Dennis Zhou (Facebook)   percpu: keep trac...
554
555
   * expected to be the entirety of the free area within a block.  Chooses
   * the best starting offset if the contig hints are equal.
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
556
557
558
559
560
561
562
563
   */
  static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
  {
  	int contig = end - start;
  
  	/* pull first_free back if this free area begins before it */
  	block->first_free = min(block->first_free, start);
  	/* an area touching an edge of the block defines that edge's free count */
  	if (start == 0)
  		block->left_free = contig;
047924c96   Dennis Zhou   percpu: make pcpu...
564
  	if (end == block->nr_bits)
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
565
566
567
  		block->right_free = contig;
  
  	/* case 1: the area is larger than the contig_hint and replaces it */
  	if (contig > block->contig_hint) {
382b88e96   Dennis Zhou   percpu: add block...
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
  		/* promote the old contig_hint to be the new scan_hint */
  		if (start > block->contig_hint_start) {
  			if (block->contig_hint > block->scan_hint) {
  				block->scan_hint_start =
  					block->contig_hint_start;
  				block->scan_hint = block->contig_hint;
  			} else if (start < block->scan_hint_start) {
  				/*
  				 * The old contig_hint == scan_hint.  But, the
  				 * new contig is larger so hold the invariant
  				 * scan_hint_start < contig_hint_start.
  				 */
  				block->scan_hint = 0;
  			}
  		} else {
  			/*
  			 * The new contig_hint does not start after the old
  			 * one, so no scan_hint is kept.
  			 */
  			block->scan_hint = 0;
  		}
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
585
586
  		block->contig_hint_start = start;
  		block->contig_hint = contig;
382b88e96   Dennis Zhou   percpu: add block...
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
  	} else if (contig == block->contig_hint) {
  		/*
  		 * case 2: the area ties the contig_hint.  A start of 0 or one
  		 * with more trailing zero bits (__ffs) wins the contig_hint;
  		 * otherwise the area is a candidate for the scan_hint.
  		 */
  		if (block->contig_hint_start &&
  		    (!start ||
  		     __ffs(start) > __ffs(block->contig_hint_start))) {
  			/* start has a better alignment so use it */
  			block->contig_hint_start = start;
  			if (start < block->scan_hint_start &&
  			    block->contig_hint > block->scan_hint)
  				block->scan_hint = 0;
  		} else if (start > block->scan_hint_start ||
  			   block->contig_hint > block->scan_hint) {
  			/*
  			 * Knowing contig == contig_hint, update the scan_hint
  			 * if it is farther than or larger than the current
  			 * scan_hint.
  			 */
  			block->scan_hint_start = start;
  			block->scan_hint = contig;
  		}
  	} else {
  		/*
  		 * The region is smaller than the contig_hint.  So only update
  		 * the scan_hint if it is larger than or equal and farther than
  		 * the current scan_hint.
  		 */
  		if ((start < block->contig_hint_start &&
  		     (contig > block->scan_hint ||
  		      (contig == block->scan_hint &&
  		       start > block->scan_hint_start)))) {
  			block->scan_hint_start = start;
  			block->scan_hint = contig;
  		}
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
619
620
  	}
  }
b89462a9c   Dennis Zhou   percpu: remember ...
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
  /*
   * pcpu_block_update_scan - update a block given a free area from a scan
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of free area
   *
   * Finding the final allocation spot first goes through pcpu_find_block_fit()
   * to find a block that can hold the allocation and then pcpu_alloc_area()
   * where a scan is used.  When allocations require specific alignments,
   * we can inadvertently create holes which will not be seen in the alloc
   * or free paths.
   *
   * This takes a given free area hole and updates a block as it may change the
   * scan_hint.  We need to scan backwards to ensure we don't miss free bits
   * from alignment.
   */
  static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
  				   int bits)
  {
  	int blk_idx = pcpu_off_to_block_index(bit_off);
  	int first = pcpu_off_to_block_off(bit_off);
  	int last = first + bits;
  	int prev_set;
  
  	/* an area that runs past the end of its block is ignored */
  	if (last > PCPU_BITMAP_BLOCK_BITS)
  		return;
  
  	/*
  	 * Look backwards for the closest allocated bit so free bits that
  	 * alignment skipped over are folded into the area.  Getting @first
  	 * back from find_last_bit() is treated as "nothing allocated before
  	 * it", in which case the area starts at the beginning of the block.
  	 */
  	prev_set = find_last_bit(pcpu_index_alloc_map(chunk, blk_idx), first);
  	if (prev_set == first)
  		first = 0;
  	else
  		first = prev_set + 1;
  
  	pcpu_block_update(chunk->md_blocks + blk_idx, first, last);
  }
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
657
  /**
92c14cab4   Dennis Zhou   percpu: convert c...
658
659
   * pcpu_chunk_refresh_hint - updates metadata about a chunk
   * @chunk: chunk of interest
d33d9f3dd   Dennis Zhou   percpu: use chunk...
660
   * @full_scan: if we should scan from the beginning
92c14cab4   Dennis Zhou   percpu: convert c...
661
662
   *
   * Iterates over the metadata blocks to find the largest contig area.
d33d9f3dd   Dennis Zhou   percpu: use chunk...
663
664
665
666
667
   * A full scan can be avoided on the allocation path as this is triggered
   * if we broke the contig_hint.  In doing so, the scan_hint will be before
   * the contig_hint or after if the scan_hint == contig_hint.  This cannot
   * be prevented on freeing as we want to find the largest area possibly
   * spanning blocks.
92c14cab4   Dennis Zhou   percpu: convert c...
668
   */
d33d9f3dd   Dennis Zhou   percpu: use chunk...
669
  static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
  {
  	struct pcpu_block_md *md = &chunk->chunk_md;
  	int off, len = 0;
  
  	if (full_scan || !md->scan_hint) {
  		/* nothing to promote: rebuild starting at the first free bit */
  		off = md->first_free;
  		md->contig_hint = 0;
  	} else {
  		/* promote scan_hint to contig_hint and resume just past it */
  		off = md->scan_hint_start + md->scan_hint;
  		md->contig_hint_start = md->scan_hint_start;
  		md->contig_hint = md->scan_hint;
  		md->scan_hint = 0;
  	}
  
  	/* feed every remaining free region into the chunk-level hints */
  	pcpu_for_each_md_free_region(chunk, off, len)
  		pcpu_block_update(md, off, off + len);
  }
  
  /**
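   * pcpu_block_refresh_hint - rescans a block to rebuild its hints
   * @chunk: chunk of interest
   * @index: index of the metadata block
   *
   * Scans over the block and updates the block metadata accordingly.  If a
   * scan_hint is set, it is promoted to the contig_hint and the scan resumes
   * just past it; otherwise the scan begins at first_free.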
   */
  static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
  {
  	struct pcpu_block_md *block = chunk->md_blocks + index;
  	unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
e837dfde1   Dennis Zhou   bitmap: genericiz...
701
  	unsigned int rs, re, start;	/* region start, region end */
da3afdd5b   Dennis Zhou   percpu: use block...
702
703
704
705
706
707
708
709
710
711
712
  
  	/* promote scan_hint to contig_hint */
  	if (block->scan_hint) {
  		start = block->scan_hint_start + block->scan_hint;
  		block->contig_hint_start = block->scan_hint_start;
  		block->contig_hint = block->scan_hint;
  		block->scan_hint = 0;
  	} else {
  		start = block->first_free;
  		block->contig_hint = 0;
  	}
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
713

da3afdd5b   Dennis Zhou   percpu: use block...
714
  	block->right_free = 0;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
715
716
  
  	/* iterate over free areas and update the contig hints */
e837dfde1   Dennis Zhou   bitmap: genericiz...
717
718
  	bitmap_for_each_clear_region(alloc_map, rs, re, start,
  				     PCPU_BITMAP_BLOCK_BITS)
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
719
  		pcpu_block_update(block, rs, re);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
720
721
722
723
724
725
726
  }
  
  /**
   * pcpu_block_update_hint_alloc - update hint on allocation path
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of request
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
727
728
729
730
   *
   * Updates metadata for the allocation path.  The metadata only has to be
   * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
   * scans are required if the block's contig hint is broken.
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
731
732
733
734
   */
  static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
  					 int bits)
  {
92c14cab4   Dennis Zhou   percpu: convert c...
735
  	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
b239f7daf   Dennis Zhou   percpu: set PCPU_...
736
  	int nr_empty_pages = 0;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
  	struct pcpu_block_md *s_block, *e_block, *block;
  	int s_index, e_index;	/* block indexes of the allocation */
  	int s_off, e_off;	/* block offsets of the allocation */
  
  	/*
  	 * Calculate per block offsets.
  	 * The calculation uses an inclusive range, but the resulting offsets
  	 * are [start, end).  e_index always points to the last block in the
  	 * range.
  	 */
  	s_index = pcpu_off_to_block_index(bit_off);
  	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
  	s_off = pcpu_off_to_block_off(bit_off);
  	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
  
  	s_block = chunk->md_blocks + s_index;
  	e_block = chunk->md_blocks + e_index;
  
  	/*
  	 * Update s_block.
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
757
758
759
  	 * block->first_free must be updated if the allocation takes its place.
  	 * If the allocation breaks the contig_hint, a scan is required to
  	 * restore this hint.
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
760
  	 */
b239f7daf   Dennis Zhou   percpu: set PCPU_...
761
762
  	/* a contig_hint covering the whole block means it was empty until now */
  	if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
  		nr_empty_pages++;
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
763
764
765
766
767
  	if (s_off == s_block->first_free)
  		s_block->first_free = find_next_zero_bit(
  					pcpu_index_alloc_map(chunk, s_index),
  					PCPU_BITMAP_BLOCK_BITS,
  					s_off + bits);
382b88e96   Dennis Zhou   percpu: add block...
768
769
770
771
772
  	/* the scan_hint is dropped if the allocation overlaps it */
  	if (pcpu_region_overlap(s_block->scan_hint_start,
  				s_block->scan_hint_start + s_block->scan_hint,
  				s_off,
  				s_off + bits))
  		s_block->scan_hint = 0;
d9f3a01ee   Dennis Zhou   percpu: introduce...
773
774
775
776
777
  	if (pcpu_region_overlap(s_block->contig_hint_start,
  				s_block->contig_hint_start +
  				s_block->contig_hint,
  				s_off,
  				s_off + bits)) {
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
778
  		/* block contig hint is broken - scan to fix it */
da3afdd5b   Dennis Zhou   percpu: use block...
779
780
  		if (!s_off)
  			s_block->left_free = 0;
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
781
782
783
784
785
786
787
788
789
790
  		pcpu_block_refresh_hint(chunk, s_index);
  	} else {
  		/* update left and right contig manually */
  		s_block->left_free = min(s_block->left_free, s_off);
  		if (s_index == e_index)
  			s_block->right_free = min_t(int, s_block->right_free,
  					PCPU_BITMAP_BLOCK_BITS - e_off);
  		else
  			s_block->right_free = 0;
  	}
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
791
792
793
794
795
  
  	/*
  	 * Update e_block.
  	 */
  	if (s_index != e_index) {
b239f7daf   Dennis Zhou   percpu: set PCPU_...
796
797
  		if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
  			nr_empty_pages++;
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
798
799
800
801
802
803
804
805
806
807
808
809
  		/*
  		 * When the allocation is across blocks, the end is along
  		 * the left part of the e_block.
  		 */
  		e_block->first_free = find_next_zero_bit(
  				pcpu_index_alloc_map(chunk, e_index),
  				PCPU_BITMAP_BLOCK_BITS, e_off);
  
  		if (e_off == PCPU_BITMAP_BLOCK_BITS) {
  			/*
  			 * The allocation ends exactly at the end of e_block:
  			 * step past it so the loop below resets it as well.
  			 */
  			e_block++;
  		} else {
382b88e96   Dennis Zhou   percpu: add block...
810
811
  			if (e_off > e_block->scan_hint_start)
  				e_block->scan_hint = 0;
da3afdd5b   Dennis Zhou   percpu: use block...
812
  			e_block->left_free = 0;
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
813
814
815
816
  			if (e_off > e_block->contig_hint_start) {
  				/* contig hint is broken - scan to fix it */
  				pcpu_block_refresh_hint(chunk, e_index);
  			} else {
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
817
818
819
820
821
  				e_block->right_free =
  					min_t(int, e_block->right_free,
  					      PCPU_BITMAP_BLOCK_BITS - e_off);
  			}
  		}
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
822
823
  
  		/* update in-between md_blocks */
b239f7daf   Dennis Zhou   percpu: set PCPU_...
824
  		/* counts every block strictly between s_index and e_index */
  		nr_empty_pages += (e_index - s_index - 1);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
825
  		for (block = s_block + 1; block < e_block; block++) {
382b88e96   Dennis Zhou   percpu: add block...
826
  			block->scan_hint = 0;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
827
828
829
830
831
  			block->contig_hint = 0;
  			block->left_free = 0;
  			block->right_free = 0;
  		}
  	}
b239f7daf   Dennis Zhou   percpu: set PCPU_...
832
833
  	/* the counted blocks are no longer empty, hence the negative delta */
  	if (nr_empty_pages)
  		pcpu_update_empty_pages(chunk, -nr_empty_pages);
d33d9f3dd   Dennis Zhou   percpu: use chunk...
834
835
836
837
838
839
  	/* chunk-level hints use chunk offsets, hence bit_off rather than s_off */
  	if (pcpu_region_overlap(chunk_md->scan_hint_start,
  				chunk_md->scan_hint_start +
  				chunk_md->scan_hint,
  				bit_off,
  				bit_off + bits))
  		chunk_md->scan_hint = 0;
fc3043345   Dennis Zhou (Facebook)   percpu: update al...
840
841
842
843
844
  	/*
  	 * The only time a full chunk scan is required is if the chunk
  	 * contig hint is broken.  Otherwise, it means a smaller space
  	 * was used and therefore the chunk contig hint is still correct.
  	 */
92c14cab4   Dennis Zhou   percpu: convert c...
845
846
847
  	if (pcpu_region_overlap(chunk_md->contig_hint_start,
  				chunk_md->contig_hint_start +
  				chunk_md->contig_hint,
d9f3a01ee   Dennis Zhou   percpu: introduce...
848
849
  				bit_off,
  				bit_off + bits))
d33d9f3dd   Dennis Zhou   percpu: use chunk...
850
  		pcpu_chunk_refresh_hint(chunk, false);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
851
852
853
854
855
856
857
  }
  
  /**
   * pcpu_block_update_hint_free - updates the block hints on the free path
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of request
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
858
859
860
861
862
863
864
865
   *
   * Updates metadata for the allocation path.  This avoids a blind block
   * refresh by making use of the block contig hints.  If this fails, it scans
   * forward and backward to determine the extent of the free area.  This is
   * capped at the boundary of blocks.
   *
   * A chunk update is triggered if a page becomes free, a block becomes free,
   * or the free spans across blocks.  This tradeoff is to minimize iterating
92c14cab4   Dennis Zhou   percpu: convert c...
866
867
868
869
   * over the block metadata to update chunk_md->contig_hint.
   * chunk_md->contig_hint may be off by up to a page, but it will never be more
   * than the available space.  If the contig hint is contained in one block, it
   * will be accurate.
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
870
871
872
873
   */
  static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
  					int bits)
  {
b239f7daf   Dennis Zhou   percpu: set PCPU_...
874
  	int nr_empty_pages = 0;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
875
876
877
  	struct pcpu_block_md *s_block, *e_block, *block;
  	int s_index, e_index;	/* block indexes of the freed allocation */
  	int s_off, e_off;	/* block offsets of the freed allocation */
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
878
  	int start, end;		/* start and end of the whole free area */
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
879
880
881
882
883
884
885
886
887
888
889
890
891
892
  
  	/*
  	 * Calculate per block offsets.
  	 * The calculation uses an inclusive range, but the resulting offsets
  	 * are [start, end).  e_index always points to the last block in the
  	 * range.
  	 */
  	s_index = pcpu_off_to_block_index(bit_off);
  	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
  	s_off = pcpu_off_to_block_off(bit_off);
  	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
  
  	s_block = chunk->md_blocks + s_index;
  	e_block = chunk->md_blocks + e_index;
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
  	/*
  	 * Check if the freed area aligns with the block->contig_hint.
  	 * If it does, then the scan to find the beginning/end of the
  	 * larger free area can be avoided.
  	 *
  	 * start and end refer to beginning and end of the free area
  	 * within each their respective blocks.  This is not necessarily
  	 * the entire free area as it may span blocks past the beginning
  	 * or end of the block.
  	 */
  	start = s_off;
  	if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
  		start = s_block->contig_hint_start;
  	} else {
  		/*
  		 * Scan backwards to find the extent of the free area.
  		 * find_last_bit returns the starting bit, so if the start bit
  		 * is returned, that means there was no last bit and the
  		 * remainder of the chunk is free.
  		 */
  		int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
  					  start);
  		start = (start == l_bit) ? 0 : l_bit + 1;
  	}
  
  	end = e_off;
  	if (e_off == e_block->contig_hint_start)
  		end = e_block->contig_hint_start + e_block->contig_hint;
  	else
  		end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
  				    PCPU_BITMAP_BLOCK_BITS, end);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
924
  	/* update s_block */
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
925
  	e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
b239f7daf   Dennis Zhou   percpu: set PCPU_...
926
927
  	if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
  		nr_empty_pages++;
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
928
  	pcpu_block_update(s_block, start, e_off);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
929
930
931
932
  
  	/* freeing in the same block */
  	if (s_index != e_index) {
  		/* update e_block */
b239f7daf   Dennis Zhou   percpu: set PCPU_...
933
934
  		if (end == PCPU_BITMAP_BLOCK_BITS)
  			nr_empty_pages++;
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
935
  		pcpu_block_update(e_block, 0, end);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
936
937
  
  		/* reset md_blocks in the middle */
b239f7daf   Dennis Zhou   percpu: set PCPU_...
938
  		nr_empty_pages += (e_index - s_index - 1);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
939
940
  		for (block = s_block + 1; block < e_block; block++) {
  			block->first_free = 0;
382b88e96   Dennis Zhou   percpu: add block...
941
  			block->scan_hint = 0;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
942
943
944
945
946
947
  			block->contig_hint_start = 0;
  			block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
  			block->left_free = PCPU_BITMAP_BLOCK_BITS;
  			block->right_free = PCPU_BITMAP_BLOCK_BITS;
  		}
  	}
b239f7daf   Dennis Zhou   percpu: set PCPU_...
948
949
  	if (nr_empty_pages)
  		pcpu_update_empty_pages(chunk, nr_empty_pages);
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
950
  	/*
b239f7daf   Dennis Zhou   percpu: set PCPU_...
951
952
953
954
  	 * Refresh chunk metadata when the free makes a block free or spans
  	 * across blocks.  The contig_hint may be off by up to a page, but if
  	 * the contig_hint is contained in a block, it will be accurate with
  	 * the else condition below.
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
955
  	 */
b239f7daf   Dennis Zhou   percpu: set PCPU_...
956
  	if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
d33d9f3dd   Dennis Zhou   percpu: use chunk...
957
  		pcpu_chunk_refresh_hint(chunk, true);
b185cd0dc   Dennis Zhou (Facebook)   percpu: update fr...
958
  	else
92c14cab4   Dennis Zhou   percpu: convert c...
959
960
961
  		pcpu_block_update(&chunk->chunk_md,
  				  pcpu_block_off_to_off(s_index, start),
  				  end);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
962
963
964
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
   * pcpu_is_populated - determines if the region is populated
   * @chunk: chunk of interest
   * @bit_off: chunk offset
   * @bits: size of area
   * @next_off: return value for the next offset to start searching
   *
   * For atomic allocations, check if the backing pages are populated.
   *
   * RETURNS:
   * True if the backing pages are populated.  Otherwise false, with @next_off
   * set so that pcpu_find_block_fit can skip over the unpopulated pages.
   */
  static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
  			      int *next_off)
  {
e837dfde1   Dennis Zhou   bitmap: genericiz...
980
  	unsigned int page_start, page_end, rs, re;
833af8427   Tejun Heo   percpu: restructu...
981

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
982
983
  	page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
  	page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
833af8427   Tejun Heo   percpu: restructu...
984

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
985
  	rs = page_start;
e837dfde1   Dennis Zhou   bitmap: genericiz...
986
  	bitmap_next_clear_region(chunk->populated, &rs, &re, page_end);
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
987
988
  	if (rs >= page_end)
  		return true;
833af8427   Tejun Heo   percpu: restructu...
989

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
990
991
  	*next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
  	return false;
9f7dcf224   Tejun Heo   percpu: move chun...
992
993
994
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
995
996
997
998
999
1000
   * pcpu_find_block_fit - finds the block index to start searching
   * @chunk: chunk of interest
   * @alloc_bits: size of request in allocation units
   * @align: alignment of area (max PAGE_SIZE bytes)
   * @pop_only: use populated regions only
   *
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
1001
1002
1003
1004
1005
1006
1007
1008
   * Given a chunk and an allocation spec, find the offset to begin searching
   * for a free region.  This iterates over the bitmap metadata blocks to
   * find an offset that will be guaranteed to fit the requirements.  It is
   * not quite first fit as if the allocation does not fit in the contig hint
   * of a block or chunk, it is skipped.  This errs on the side of caution
   * to prevent excess iteration.  Poor alignment can cause the allocator to
   * skip over blocks and chunks that have valid free areas.
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1009
1010
1011
   * RETURNS:
   * The offset in the bitmap to begin searching.
   * -1 if no offset is found.
a16037c8d   Tejun Heo   percpu: make pcpu...
1012
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1013
1014
  static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
  			       size_t align, bool pop_only)
  {
  	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
  	int bit_off, bits, next_off;
  
  	/*
  	 * Bail out early when the request, once aligned, does not fit in the
  	 * chunk's contig hint.  bit_off temporarily holds the number of bits
  	 * lost to aligning the start of the hint.
  	 */
  	bit_off = ALIGN(chunk_md->contig_hint_start, align) -
  		  chunk_md->contig_hint_start;
  	if (bit_off + alloc_bits > chunk_md->contig_hint)
  		return -1;
  
  	bit_off = pcpu_next_hint(chunk_md, alloc_bits);
  	bits = 0;
  	pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
  		if (pop_only && !pcpu_is_populated(chunk, bit_off, bits,
  						   &next_off)) {
  			/* jump to the offset pcpu_is_populated() reported */
  			bit_off = next_off;
  			bits = 0;
  			continue;
  		}
  
  		/* this region is acceptable */
  		break;
  	}
  
  	/* running off the end of the map means nothing was found */
  	return (bit_off == pcpu_chunk_map_bits(chunk)) ? -1 : bit_off;
  }
b89462a9c   Dennis Zhou   percpu: remember ...
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
  /*
   * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
   * @map: the address to base the search on
   * @size: the bitmap size in bits
   * @start: the bitnumber to start searching at
   * @nr: the number of zeroed bits we're looking for
   * @align_mask: alignment mask for zero area
   * @largest_off: offset of the largest area skipped
   * @largest_bits: size of the largest area skipped
   *
   * The @align_mask should be one less than a power of 2.
   *
   * This is a modified version of bitmap_find_next_zero_area_off() to remember
   * the largest area that was skipped.  This is imperfect, but in general is
   * good enough.  The largest remembered region is the largest failed region
   * seen.  This does not include anything we possibly skipped due to alignment.
   * pcpu_block_update_scan() does scan backwards to try and recover what was
   * lost to alignment.  While this can cause scanning to miss earlier possible
   * free areas, smaller allocations will eventually fill those holes.
   */
  static unsigned long pcpu_find_zero_area(unsigned long *map,
  					 unsigned long size,
  					 unsigned long start,
  					 unsigned long nr,
  					 unsigned long align_mask,
  					 unsigned long *largest_off,
  					 unsigned long *largest_bits)
  {
  	unsigned long index, end, i, area_bits;
  
  	for (;;) {
  		/* next free bit at or after @start, rounded up to alignment */
  		index = find_next_zero_bit(map, size, start);
  		index = __ALIGN_MASK(index, align_mask);
  
  		/* the candidate would run past the map: report failure */
  		end = index + nr;
  		if (end > size)
  			return end;
  
  		/* no allocated bit inside [index, end): the candidate fits */
  		i = find_next_bit(map, end, index);
  		if (i >= end)
  			return index;
  
  		/*
  		 * The candidate is cut short at bit i.  Remember it if it is
  		 * the largest failed area so far or, on a tie, if its offset
  		 * is 0 or has more trailing zero bits than the recorded one.
  		 */
  		area_bits = i - index;
  		if (area_bits > *largest_bits ||
  		    (area_bits == *largest_bits && *largest_off &&
  		     (!index || __ffs(index) > __ffs(*largest_off)))) {
  			*largest_off = index;
  			*largest_bits = area_bits;
  		}
  
  		/* resume the search just past the blocking bit */
  		start = i + 1;
  	}
  }
a16037c8d   Tejun Heo   percpu: make pcpu...
1100
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1101
   * pcpu_alloc_area - allocates an area from a pcpu_chunk
fbf59bc9d   Tejun Heo   percpu: implement...
1102
   * @chunk: chunk of interest
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1103
1104
1105
   * @alloc_bits: size of request in allocation units
   * @align: alignment of area (max PAGE_SIZE)
   * @start: bit_off to start searching
9f7dcf224   Tejun Heo   percpu: move chun...
1106
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1107
   * This function takes in a @start offset to begin searching to fit an
b4c2116cf   Dennis Zhou (Facebook)   percpu: update pc...
1108
1109
1110
1111
1112
1113
   * allocation of @alloc_bits with alignment @align.  It needs to scan
   * the allocation map because if it fits within the block's contig hint,
   * @start will be block->first_free. This is an attempt to fill the
   * allocation prior to breaking the contig hint.  The allocation and
   * boundary maps are updated accordingly if it confirms a valid
   * free area.
ccea34b5d   Tejun Heo   percpu: finer gra...
1114
   *
fbf59bc9d   Tejun Heo   percpu: implement...
1115
   * RETURNS:
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1116
1117
   * Allocated addr offset in @chunk on success.
   * -1 if no matching area is found.
fbf59bc9d   Tejun Heo   percpu: implement...
1118
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1119
1120
  static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
  			   size_t align, int start)
  {
  	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
  	unsigned long skipped_off = 0, skipped_bits = 0;
  	size_t align_mask = 0;
  	int bit_off, end, oslot;
  
  	lockdep_assert_held(&pcpu_lock);
  
  	if (align)
  		align_mask = align - 1;
  
  	/* taken before the chunk changes; handed to pcpu_chunk_relocate() */
  	oslot = pcpu_chunk_slot(chunk);
  
  	/*
  	 * Search for a fit, looking no further than one block past the
  	 * request and never beyond the end of the chunk's map.
  	 */
  	end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
  		    pcpu_chunk_map_bits(chunk));
  	bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
  				      align_mask, &skipped_off, &skipped_bits);
  	if (bit_off >= end)
  		return -1;
  
  	/* record the largest area the search had to pass over */
  	if (skipped_bits)
  		pcpu_block_update_scan(chunk, skipped_off, skipped_bits);
  
  	/* update alloc map */
  	bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
  
  	/*
  	 * update boundary map: set the first bit of the area and the bit
  	 * just past it, clear everything in between
  	 */
  	set_bit(bit_off, chunk->bound_map);
  	bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
  	set_bit(bit_off + alloc_bits, chunk->bound_map);
  
  	chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
  
  	/* update first free bit when the allocation consumed it */
  	if (bit_off == chunk_md->first_free)
  		chunk_md->first_free = find_next_zero_bit(
  					chunk->alloc_map,
  					pcpu_chunk_map_bits(chunk),
  					bit_off + alloc_bits);
  
  	pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
  	pcpu_chunk_relocate(chunk, oslot);
  
  	/* convert the bit offset back into a byte offset */
  	return bit_off * PCPU_MIN_ALLOC_SIZE;
  }
  
  /**
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1166
   * pcpu_free_area - frees the corresponding offset
fbf59bc9d   Tejun Heo   percpu: implement...
1167
   * @chunk: chunk of interest
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1168
   * @off: addr offset into chunk
ccea34b5d   Tejun Heo   percpu: finer gra...
1169
   *
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1170
1171
   * This function determines the size of an allocation to free using
   * the boundary bitmap and clears the allocation map.
5b32af91b   Roman Gushchin   percpu: return nu...
1172
1173
1174
   *
   * RETURNS:
   * Number of freed bytes.
fbf59bc9d   Tejun Heo   percpu: implement...
1175
   */
5b32af91b   Roman Gushchin   percpu: return nu...
1176
  static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
fbf59bc9d   Tejun Heo   percpu: implement...
1177
  {
92c14cab4   Dennis Zhou   percpu: convert c...
1178
  	struct pcpu_block_md *chunk_md = &chunk->chunk_md;
5b32af91b   Roman Gushchin   percpu: return nu...
1179
  	int bit_off, bits, end, oslot, freed;
723ad1d90   Al Viro   percpu: store off...
1180

5ccd30e40   Dennis Zhou   percpu: add missi...
1181
  	lockdep_assert_held(&pcpu_lock);
30a5b5367   Dennis Zhou   percpu: expose st...
1182
  	pcpu_stats_area_dealloc(chunk);
5ccd30e40   Dennis Zhou   percpu: add missi...
1183

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1184
  	oslot = pcpu_chunk_slot(chunk);
fbf59bc9d   Tejun Heo   percpu: implement...
1185

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1186
  	bit_off = off / PCPU_MIN_ALLOC_SIZE;
3d331ad74   Al Viro   percpu: speed all...
1187

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1188
1189
1190
1191
1192
  	/* find end index */
  	end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
  			    bit_off + 1);
  	bits = end - bit_off;
  	bitmap_clear(chunk->alloc_map, bit_off, bits);
fbf59bc9d   Tejun Heo   percpu: implement...
1193

5b32af91b   Roman Gushchin   percpu: return nu...
1194
  	freed = bits * PCPU_MIN_ALLOC_SIZE;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1195
  	/* update metadata */
5b32af91b   Roman Gushchin   percpu: return nu...
1196
  	chunk->free_bytes += freed;
b539b87fe   Tejun Heo   percpu: implmeent...
1197

86b442fbc   Dennis Zhou (Facebook)   percpu: add first...
1198
  	/* update first free bit */
92c14cab4   Dennis Zhou   percpu: convert c...
1199
  	chunk_md->first_free = min(chunk_md->first_free, bit_off);
86b442fbc   Dennis Zhou (Facebook)   percpu: add first...
1200

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1201
  	pcpu_block_update_hint_free(chunk, bit_off, bits);
fbf59bc9d   Tejun Heo   percpu: implement...
1202

fbf59bc9d   Tejun Heo   percpu: implement...
1203
  	pcpu_chunk_relocate(chunk, oslot);
5b32af91b   Roman Gushchin   percpu: return nu...
1204
1205
  
  	return freed;
fbf59bc9d   Tejun Heo   percpu: implement...
1206
  }
047924c96   Dennis Zhou   percpu: make pcpu...
1207
1208
1209
1210
1211
1212
1213
1214
1215
  /*
   * Initialise every field of @block for a region of @nr_bits bits:
   * contig_hint, left_free and right_free are set to @nr_bits, while
   * scan_hint and first_free are set to 0.
   */
  static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
  {
  	block->nr_bits = nr_bits;

  	block->first_free = 0;
  	block->scan_hint = 0;

  	block->left_free = nr_bits;
  	block->right_free = nr_bits;
  	block->contig_hint = nr_bits;
  }
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1216
1217
1218
  /*
   * Initialise all block metadata of @chunk: the chunk-wide block is sized to
   * the whole allocation map, each entry of md_blocks to
   * PCPU_BITMAP_BLOCK_BITS.
   */
  static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
  {
  	int nr_blocks;
  	int i;

  	/* init the chunk's block */
  	pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));

  	/* init the per-block metadata array */
  	nr_blocks = pcpu_chunk_nr_blocks(chunk);
  	for (i = 0; i < nr_blocks; i++)
  		pcpu_init_md_block(&chunk->md_blocks[i],
  				   PCPU_BITMAP_BLOCK_BITS);
  }
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
  /**
   * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
   * @tmp_addr: the start of the region served
   * @map_size: size of the region served
   *
   * This is responsible for creating the chunks that serve the first chunk.  The
   * base_addr is page aligned down of @tmp_addr while the region end is page
   * aligned up.  Offsets are kept track of to determine the region served. All
   * this is done to appease the bitmap allocator in avoiding partial blocks.
   *
   * RETURNS:
   * Chunk serving the region at @tmp_addr of @map_size.
   */
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1239
  static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1240
  							 int map_size)
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1241
1242
  {
  	struct pcpu_chunk *chunk;
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1243
  	unsigned long aligned_addr, lcm_align;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1244
  	int start_offset, offset_bits, region_size, region_bits;
f655f4053   Mike Rapoport   mm/percpu: add ch...
1245
  	size_t alloc_size;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1246
1247
1248
1249
1250
  
  	/* region calculations */
  	aligned_addr = tmp_addr & PAGE_MASK;
  
  	start_offset = tmp_addr - aligned_addr;
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1251

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1252
1253
1254
1255
1256
1257
1258
  	/*
  	 * Align the end of the region with the LCM of PAGE_SIZE and
  	 * PCPU_BITMAP_BLOCK_SIZE.  One of these constants is a multiple of
  	 * the other.
  	 */
  	lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
  	region_size = ALIGN(start_offset + map_size, lcm_align);
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1259

c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1260
  	/* allocate chunk */
61cf93d3e   Dennis Zhou   percpu: convert f...
1261
1262
  	alloc_size = struct_size(chunk, populated,
  				 BITS_TO_LONGS(region_size >> PAGE_SHIFT));
f655f4053   Mike Rapoport   mm/percpu: add ch...
1263
1264
1265
1266
1267
  	chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!chunk)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1268

10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1269
  	INIT_LIST_HEAD(&chunk->list);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1270
1271
  
  	chunk->base_addr = (void *)aligned_addr;
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1272
  	chunk->start_offset = start_offset;
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1273
  	chunk->end_offset = region_size - chunk->start_offset - map_size;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1274

8ab16c43e   Dennis Zhou (Facebook)   percpu: change th...
1275
  	chunk->nr_pages = region_size >> PAGE_SHIFT;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1276
  	region_bits = pcpu_chunk_map_bits(chunk);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1277

f655f4053   Mike Rapoport   mm/percpu: add ch...
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
  	alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
  	chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!chunk->alloc_map)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
  
  	alloc_size =
  		BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
  	chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!chunk->bound_map)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
  
  	alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
  	chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!chunk->md_blocks)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1299
1300
1301
1302
  #ifdef CONFIG_MEMCG_KMEM
  	/* first chunk isn't memcg-aware */
  	chunk->obj_cgroups = NULL;
  #endif
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1303
  	pcpu_init_md_blocks(chunk);
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1304
1305
1306
  
  	/* manage populated page bitmap */
  	chunk->immutable = true;
8ab16c43e   Dennis Zhou (Facebook)   percpu: change th...
1307
1308
  	bitmap_fill(chunk->populated, chunk->nr_pages);
  	chunk->nr_populated = chunk->nr_pages;
b239f7daf   Dennis Zhou   percpu: set PCPU_...
1309
  	chunk->nr_empty_pop_pages = chunk->nr_pages;
10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1310

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1311
  	chunk->free_bytes = map_size;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1312
1313
1314
  
  	if (chunk->start_offset) {
  		/* hide the beginning of the bitmap */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1315
1316
1317
1318
  		offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
  		bitmap_set(chunk->alloc_map, 0, offset_bits);
  		set_bit(0, chunk->bound_map);
  		set_bit(offset_bits, chunk->bound_map);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1319

92c14cab4   Dennis Zhou   percpu: convert c...
1320
  		chunk->chunk_md.first_free = offset_bits;
86b442fbc   Dennis Zhou (Facebook)   percpu: add first...
1321

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1322
  		pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1323
  	}
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1324
1325
  	if (chunk->end_offset) {
  		/* hide the end of the bitmap */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1326
1327
1328
1329
1330
1331
1332
  		offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
  		bitmap_set(chunk->alloc_map,
  			   pcpu_chunk_map_bits(chunk) - offset_bits,
  			   offset_bits);
  		set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
  			chunk->bound_map);
  		set_bit(region_bits, chunk->bound_map);
6b9d7c8e8   Dennis Zhou (Facebook)   percpu: end chunk...
1333

ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
1334
1335
1336
  		pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
  					     - offset_bits, offset_bits);
  	}
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1337

10edf5b0b   Dennis Zhou (Facebook)   percpu: unify all...
1338
1339
  	return chunk;
  }
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1340
  /*
   * Allocate a chunk descriptor for pcpu_unit_pages pages together with its
   * allocation map, boundary map and block metadata, all through
   * pcpu_mem_zalloc() with @gfp.  With CONFIG_MEMCG_KMEM, a chunk whose @type
   * is a memcg type additionally gets an obj_cgroups array with one pointer
   * per allocation map bit.
   *
   * Returns the new chunk, or NULL if any allocation fails; whatever had
   * been allocated up to that point is freed before returning.
   */
  static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
  {
  	struct pcpu_chunk *chunk;
  	size_t nr_bytes;
  	int map_bits;

  	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
  	if (!chunk)
  		return NULL;

  	INIT_LIST_HEAD(&chunk->list);
  	chunk->nr_pages = pcpu_unit_pages;
  	map_bits = pcpu_chunk_map_bits(chunk);

  	nr_bytes = BITS_TO_LONGS(map_bits) * sizeof(chunk->alloc_map[0]);
  	chunk->alloc_map = pcpu_mem_zalloc(nr_bytes, gfp);
  	if (!chunk->alloc_map)
  		goto free_chunk;

  	/* the boundary map carries one bit more than the allocation map */
  	nr_bytes = BITS_TO_LONGS(map_bits + 1) * sizeof(chunk->bound_map[0]);
  	chunk->bound_map = pcpu_mem_zalloc(nr_bytes, gfp);
  	if (!chunk->bound_map)
  		goto free_alloc_map;

  	nr_bytes = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
  	chunk->md_blocks = pcpu_mem_zalloc(nr_bytes, gfp);
  	if (!chunk->md_blocks)
  		goto free_bound_map;

  #ifdef CONFIG_MEMCG_KMEM
  	if (pcpu_is_memcg_chunk(type)) {
  		nr_bytes = pcpu_chunk_map_bits(chunk) *
  			   sizeof(struct obj_cgroup *);
  		chunk->obj_cgroups = pcpu_mem_zalloc(nr_bytes, gfp);
  		if (!chunk->obj_cgroups)
  			goto free_md_blocks;
  	}
  #endif
  	pcpu_init_md_blocks(chunk);

  	/* init metadata */
  	chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;

  	return chunk;

  	/* unwind in reverse order of allocation */
  #ifdef CONFIG_MEMCG_KMEM
  free_md_blocks:
  	pcpu_mem_free(chunk->md_blocks);
  #endif
  free_bound_map:
  	pcpu_mem_free(chunk->bound_map);
  free_alloc_map:
  	pcpu_mem_free(chunk->alloc_map);
  free_chunk:
  	pcpu_mem_free(chunk);

  	return NULL;
  }
  
  /*
   * Release @chunk and the arrays hanging off it through pcpu_mem_free().
   * A NULL @chunk is accepted and ignored.
   */
  static void pcpu_free_chunk(struct pcpu_chunk *chunk)
  {
  	if (!chunk)
  		return;

  	/* the maps and metadata go first, the descriptor itself last */
  	pcpu_mem_free(chunk->alloc_map);
  	pcpu_mem_free(chunk->bound_map);
  	pcpu_mem_free(chunk->md_blocks);
  #ifdef CONFIG_MEMCG_KMEM
  	pcpu_mem_free(chunk->obj_cgroups);
  #endif
  	pcpu_mem_free(chunk);
  }
b539b87fe   Tejun Heo   percpu: implmeent...
1407
1408
1409
1410
1411
1412
1413
1414
1415
  /**
   * pcpu_chunk_populated - post-population bookkeeping
   * @chunk: pcpu_chunk which got populated
   * @page_start: the start page
   * @page_end: the end page
   *
   * Pages in [@page_start,@page_end) have been populated to @chunk.  Update
   * the bookkeeping information accordingly.  Must be called after each
   * successful population.
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1416
1417
1418
   *
   * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it
   * is to serve an allocation in that area.
b539b87fe   Tejun Heo   percpu: implmeent...
1419
   */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1420
  static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
b239f7daf   Dennis Zhou   percpu: set PCPU_...
1421
  				 int page_end)
b539b87fe   Tejun Heo   percpu: implmeent...
1422
1423
1424
1425
1426
1427
1428
  {
  	int nr = page_end - page_start;
  
  	lockdep_assert_held(&pcpu_lock);
  
  	bitmap_set(chunk->populated, page_start, nr);
  	chunk->nr_populated += nr;
7e8a6304d   Dennis Zhou (Facebook)   /proc/meminfo: ad...
1429
  	pcpu_nr_populated += nr;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1430

b239f7daf   Dennis Zhou   percpu: set PCPU_...
1431
  	pcpu_update_empty_pages(chunk, nr);
b539b87fe   Tejun Heo   percpu: implmeent...
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
  }
  
  /**
   * pcpu_chunk_depopulated - post-depopulation bookkeeping
   * @chunk: pcpu_chunk which got depopulated
   * @page_start: the start page
   * @page_end: the end page
   *
   * Pages in [@page_start,@page_end) have been depopulated from @chunk.
   * Update the bookkeeping information accordingly.  Must be called after
   * each successful depopulation.
   */
  static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
  				   int page_start, int page_end)
  {
  	int nr = page_end - page_start;
  
  	lockdep_assert_held(&pcpu_lock);
  
  	bitmap_clear(chunk->populated, page_start, nr);
  	chunk->nr_populated -= nr;
7e8a6304d   Dennis Zhou (Facebook)   /proc/meminfo: ad...
1453
  	pcpu_nr_populated -= nr;
b239f7daf   Dennis Zhou   percpu: set PCPU_...
1454
1455
  
  	pcpu_update_empty_pages(chunk, -nr);
b539b87fe   Tejun Heo   percpu: implmeent...
1456
  }
9f6455325   Tejun Heo   percpu: move vmal...
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
  /*
   * Chunk management implementation.
   *
   * To allow different implementations, chunk alloc/free and
   * [de]population are implemented in a separate file which is pulled
   * into this file and compiled together.  The following functions
   * should be implemented.
   *
   * pcpu_populate_chunk		- populate the specified range of a chunk
   * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
   * pcpu_create_chunk		- create a new chunk
   * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
   * pcpu_addr_to_page		- translate address to its struct page
   * pcpu_verify_alloc_info	- check alloc_info is acceptable during init
fbf59bc9d   Tejun Heo   percpu: implement...
1471
   */
15d9f3d11   Dennis Zhou   percpu: match chu...
1472
  /* [de]population of the page range [page_start, page_end) of a chunk */
  static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int page_start,
  			       int page_end, gfp_t gfp);
  static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start,
  				  int page_end);

  /* chunk creation and destruction */
  static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
  					    gfp_t gfp);
  static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);

  /* address translation and init-time validation */
  static struct page *pcpu_addr_to_page(void *addr);
  static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);

  /* pull in exactly one implementation of the functions declared above */
  #ifdef CONFIG_NEED_PER_CPU_KM
  #include "percpu-km.c"
  #else
  #include "percpu-vm.c"
  #endif
fbf59bc9d   Tejun Heo   percpu: implement...
1487
1488
  
  /**
88999a898   Tejun Heo   percpu: misc prep...
1489
1490
1491
   * pcpu_chunk_addr_search - determine chunk containing specified address
   * @addr: address for which the chunk needs to be determined.
   *
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1492
1493
1494
   * This is an internal function that handles all but static allocations.
   * Static percpu address values should never be passed into the allocator.
   *
88999a898   Tejun Heo   percpu: misc prep...
1495
1496
1497
1498
1499
   * RETURNS:
   * The address of the found chunk.
   */
  static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  {
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1500
  	/* is it in the dynamic region (first chunk)? */
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
1501
  	if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
88999a898   Tejun Heo   percpu: misc prep...
1502
  		return pcpu_first_chunk;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1503
1504
  
  	/* is it in the reserved region? */
560f2c236   Dennis Zhou (Facebook)   percpu: combine p...
1505
  	if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
1506
  		return pcpu_reserved_chunk;
88999a898   Tejun Heo   percpu: misc prep...
1507
1508
1509
1510
1511
1512
1513
1514
1515
  
  	/*
  	 * The address is relative to unit0 which might be unused and
  	 * thus unmapped.  Offset the address to the unit space of the
  	 * current processor before looking it up in the vmalloc
  	 * space.  Note that any possible cpu id can be used here, so
  	 * there's no need to worry about preemption or cpu hotplug.
  	 */
  	addr += pcpu_unit_offsets[raw_smp_processor_id()];
9f6455325   Tejun Heo   percpu: move vmal...
1516
  	return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
88999a898   Tejun Heo   percpu: misc prep...
1517
  }
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1518
1519
1520
1521
1522
  #ifdef CONFIG_MEMCG_KMEM
  /*
   * Decide which chunk type an allocation of @size bytes should come from
   * and, for accounted allocations, charge it up front.
   *
   * Returns PCPU_CHUNK_ROOT when kmem accounting is disabled, @gfp lacks
   * __GFP_ACCOUNT or no obj_cgroup is returned for current.  Otherwise
   * @size times the number of possible cpus is charged: on success the
   * obj_cgroup is stored in *@objcgp and PCPU_CHUNK_MEMCG is returned, on
   * failure the obj_cgroup is put and PCPU_FAIL_ALLOC is returned.
   */
  static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
  						     struct obj_cgroup **objcgp)
  {
  	struct obj_cgroup *objcg;
  	size_t charge;

  	if (!memcg_kmem_enabled())
  		return PCPU_CHUNK_ROOT;
  	if (!(gfp & __GFP_ACCOUNT))
  		return PCPU_CHUNK_ROOT;

  	objcg = get_obj_cgroup_from_current();
  	if (!objcg)
  		return PCPU_CHUNK_ROOT;

  	/* the area exists once per possible cpu, so charge for all of them */
  	charge = size * num_possible_cpus();
  	if (obj_cgroup_charge(objcg, gfp, charge)) {
  		obj_cgroup_put(objcg);
  		return PCPU_FAIL_ALLOC;
  	}

  	*objcgp = objcg;
  	return PCPU_CHUNK_MEMCG;
  }
  
  /*
   * Finish the memcg side of an allocation once its outcome is known.
   *
   * Does nothing when @objcg is NULL.  With a @chunk, @objcg is recorded in
   * the chunk's obj_cgroups slot for @off and MEMCG_PERCPU_B is raised by
   * @size times the number of possible cpus.  Without a @chunk, the same
   * amount is uncharged from @objcg and the obj_cgroup is put.
   */
  static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
  				       struct pcpu_chunk *chunk, int off,
  				       size_t size)
  {
  	if (!objcg)
  		return;

  	if (!chunk) {
  		/* no area was obtained: give back the charge and the objcg */
  		obj_cgroup_uncharge(objcg, size * num_possible_cpus());
  		obj_cgroup_put(objcg);
  		return;
  	}

  	chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg;

  	rcu_read_lock();
  	mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
  			size * num_possible_cpus());
  	rcu_read_unlock();
  }
  
  /*
   * Undo the memcg accounting of the area at @off in @chunk.
   *
   * Does nothing unless @chunk is a memcg chunk.  Otherwise the obj_cgroup
   * stored for @off is taken out of the chunk's obj_cgroups array, @size
   * times the number of possible cpus is uncharged from it and subtracted
   * from MEMCG_PERCPU_B, and the obj_cgroup is put.
   */
  static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
  {
  	struct obj_cgroup **slot;
  	struct obj_cgroup *objcg;
  	size_t charge;

  	if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)))
  		return;

  	/* detach the obj_cgroup from the chunk before releasing it */
  	slot = &chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
  	objcg = *slot;
  	*slot = NULL;

  	charge = size * num_possible_cpus();
  	obj_cgroup_uncharge(objcg, charge);

  	rcu_read_lock();
  	mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, -charge);
  	rcu_read_unlock();

  	obj_cgroup_put(objcg);
  }
  
  #else /* CONFIG_MEMCG_KMEM */
  /*
   * !CONFIG_MEMCG_KMEM variant: nothing is charged, *@objcgp is left
   * untouched and PCPU_CHUNK_ROOT is always returned.
   */
  static enum pcpu_chunk_type
  pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
  {
  	return PCPU_CHUNK_ROOT;
  }
  
  /* !CONFIG_MEMCG_KMEM variant: intentionally empty, there is nothing to account. */
  static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
  				       struct pcpu_chunk *chunk, int off,
  				       size_t size)
  {
  }
  
  /* !CONFIG_MEMCG_KMEM variant: intentionally empty, there is nothing to uncharge. */
  static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
  {
  }
  #endif /* CONFIG_MEMCG_KMEM */
88999a898   Tejun Heo   percpu: misc prep...
1594
  /**
edcb46399   Tejun Heo   percpu, module: i...
1595
   * pcpu_alloc - the percpu allocator
cae3aeb83   Tejun Heo   percpu: clean up ...
1596
   * @size: size of area to allocate in bytes
fbf59bc9d   Tejun Heo   percpu: implement...
1597
   * @align: alignment of area (max PAGE_SIZE)
edcb46399   Tejun Heo   percpu, module: i...
1598
   * @reserved: allocate from the reserved chunk if available
5835d96e9   Tejun Heo   percpu: implement...
1599
   * @gfp: allocation flags
fbf59bc9d   Tejun Heo   percpu: implement...
1600
   *
5835d96e9   Tejun Heo   percpu: implement...
1601
   * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1602
1603
1604
   * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
   * then no warning will be triggered on invalid or failed allocation
   * requests.
fbf59bc9d   Tejun Heo   percpu: implement...
1605
1606
1607
1608
   *
   * RETURNS:
   * Percpu pointer to the allocated area on success, NULL on failure.
   */
5835d96e9   Tejun Heo   percpu: implement...
1609
1610
  static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
  				 gfp_t gfp)
fbf59bc9d   Tejun Heo   percpu: implement...
1611
  {
28307d938   Filipe Manana   percpu: make pcpu...
1612
1613
1614
  	gfp_t pcpu_gfp;
  	bool is_atomic;
  	bool do_warn;
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1615
1616
1617
  	enum pcpu_chunk_type type;
  	struct list_head *pcpu_slot;
  	struct obj_cgroup *objcg = NULL;
f2badb0c9   Tejun Heo   percpu: make allo...
1618
  	static int warn_limit = 10;
8744d8594   Dennis Zhou   percpu: relegate ...
1619
  	struct pcpu_chunk *chunk, *next;
f2badb0c9   Tejun Heo   percpu: make allo...
1620
  	const char *err;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1621
  	int slot, off, cpu, ret;
403a91b16   Jiri Kosina   percpu: allow pcp...
1622
  	unsigned long flags;
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
1623
  	void __percpu *ptr;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1624
  	size_t bits, bit_align;
fbf59bc9d   Tejun Heo   percpu: implement...
1625

28307d938   Filipe Manana   percpu: make pcpu...
1626
1627
1628
1629
1630
  	gfp = current_gfp_context(gfp);
  	/* whitelisted flags that can be passed to the backing allocators */
  	pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
  	is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
  	do_warn = !(gfp & __GFP_NOWARN);
723ad1d90   Al Viro   percpu: store off...
1631
  	/*
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1632
1633
1634
1635
  	 * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
  	 * therefore alignment must be a minimum of that many bytes.
  	 * An allocation may have internal fragmentation from rounding up
  	 * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
723ad1d90   Al Viro   percpu: store off...
1636
  	 */
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
1637
1638
  	if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
  		align = PCPU_MIN_ALLOC_SIZE;
723ad1d90   Al Viro   percpu: store off...
1639

d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
1640
  	size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1641
1642
  	bits = size >> PCPU_MIN_ALLOC_SHIFT;
  	bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
2f69fa829   Al Viro   percpu: allocatio...
1643

3ca45a46f   zijun_hu   percpu: ensure th...
1644
1645
  	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
  		     !is_power_of_2(align))) {
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1646
1647
  		WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation
  ",
756a025f0   Joe Perches   mm: coalesce spli...
1648
  		     size, align);
fbf59bc9d   Tejun Heo   percpu: implement...
1649
1650
  		return NULL;
  	}
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1651
1652
1653
1654
  	type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg);
  	if (unlikely(type == PCPU_FAIL_ALLOC))
  		return NULL;
  	pcpu_slot = pcpu_chunk_list(type);
f52ba1fef   Kirill Tkhai   mm: Allow to kill...
1655
1656
1657
1658
1659
1660
  	if (!is_atomic) {
  		/*
  		 * pcpu_balance_workfn() allocates memory under this mutex,
  		 * and it may wait for memory reclaim. Allow current task
  		 * to become OOM victim, in case of memory pressure.
  		 */
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1661
  		if (gfp & __GFP_NOFAIL) {
f52ba1fef   Kirill Tkhai   mm: Allow to kill...
1662
  			mutex_lock(&pcpu_alloc_mutex);
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1663
1664
  		} else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
  			pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
f52ba1fef   Kirill Tkhai   mm: Allow to kill...
1665
  			return NULL;
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1666
  		}
f52ba1fef   Kirill Tkhai   mm: Allow to kill...
1667
  	}
6710e594f   Tejun Heo   percpu: fix synch...
1668

403a91b16   Jiri Kosina   percpu: allow pcp...
1669
  	spin_lock_irqsave(&pcpu_lock, flags);
fbf59bc9d   Tejun Heo   percpu: implement...
1670

edcb46399   Tejun Heo   percpu, module: i...
1671
1672
1673
  	/* serve reserved allocations from the reserved chunk if available */
  	if (reserved && pcpu_reserved_chunk) {
  		chunk = pcpu_reserved_chunk;
833af8427   Tejun Heo   percpu: restructu...
1674

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1675
1676
  		off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
  		if (off < 0) {
833af8427   Tejun Heo   percpu: restructu...
1677
  			err = "alloc from reserved chunk failed";
ccea34b5d   Tejun Heo   percpu: finer gra...
1678
  			goto fail_unlock;
f2badb0c9   Tejun Heo   percpu: make allo...
1679
  		}
833af8427   Tejun Heo   percpu: restructu...
1680

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1681
  		off = pcpu_alloc_area(chunk, bits, bit_align, off);
edcb46399   Tejun Heo   percpu, module: i...
1682
1683
  		if (off >= 0)
  			goto area_found;
833af8427   Tejun Heo   percpu: restructu...
1684

f2badb0c9   Tejun Heo   percpu: make allo...
1685
  		err = "alloc from reserved chunk failed";
ccea34b5d   Tejun Heo   percpu: finer gra...
1686
  		goto fail_unlock;
edcb46399   Tejun Heo   percpu, module: i...
1687
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1688
  restart:
edcb46399   Tejun Heo   percpu, module: i...
1689
  	/* search through normal chunks */
fbf59bc9d   Tejun Heo   percpu: implement...
1690
  	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
8744d8594   Dennis Zhou   percpu: relegate ...
1691
  		list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1692
1693
  			off = pcpu_find_block_fit(chunk, bits, bit_align,
  						  is_atomic);
8744d8594   Dennis Zhou   percpu: relegate ...
1694
1695
1696
  			if (off < 0) {
  				if (slot < PCPU_SLOT_FAIL_THRESHOLD)
  					pcpu_chunk_move(chunk, 0);
fbf59bc9d   Tejun Heo   percpu: implement...
1697
  				continue;
8744d8594   Dennis Zhou   percpu: relegate ...
1698
  			}
ccea34b5d   Tejun Heo   percpu: finer gra...
1699

40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1700
  			off = pcpu_alloc_area(chunk, bits, bit_align, off);
fbf59bc9d   Tejun Heo   percpu: implement...
1701
1702
  			if (off >= 0)
  				goto area_found;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1703

fbf59bc9d   Tejun Heo   percpu: implement...
1704
1705
  		}
  	}
403a91b16   Jiri Kosina   percpu: allow pcp...
1706
  	spin_unlock_irqrestore(&pcpu_lock, flags);
ccea34b5d   Tejun Heo   percpu: finer gra...
1707

b38d08f31   Tejun Heo   percpu: restructu...
1708
1709
1710
1711
1712
  	/*
  	 * No space left.  Create a new chunk.  We don't want multiple
  	 * tasks to create chunks simultaneously.  Serialize and create iff
  	 * there's still no empty chunk after grabbing the mutex.
  	 */
11df02bf9   Dennis Zhou   percpu: resolve e...
1713
1714
  	if (is_atomic) {
  		err = "atomic alloc failed, no space left";
5835d96e9   Tejun Heo   percpu: implement...
1715
  		goto fail;
11df02bf9   Dennis Zhou   percpu: resolve e...
1716
  	}
5835d96e9   Tejun Heo   percpu: implement...
1717

b38d08f31   Tejun Heo   percpu: restructu...
1718
  	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1719
  		chunk = pcpu_create_chunk(type, pcpu_gfp);
b38d08f31   Tejun Heo   percpu: restructu...
1720
1721
1722
1723
1724
1725
1726
1727
1728
  		if (!chunk) {
  			err = "failed to allocate new chunk";
  			goto fail;
  		}
  
  		spin_lock_irqsave(&pcpu_lock, flags);
  		pcpu_chunk_relocate(chunk, -1);
  	} else {
  		spin_lock_irqsave(&pcpu_lock, flags);
f2badb0c9   Tejun Heo   percpu: make allo...
1729
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1730

ccea34b5d   Tejun Heo   percpu: finer gra...
1731
  	goto restart;
fbf59bc9d   Tejun Heo   percpu: implement...
1732
1733
  
  area_found:
30a5b5367   Dennis Zhou   percpu: expose st...
1734
  	pcpu_stats_area_alloc(chunk, size);
403a91b16   Jiri Kosina   percpu: allow pcp...
1735
  	spin_unlock_irqrestore(&pcpu_lock, flags);
ccea34b5d   Tejun Heo   percpu: finer gra...
1736

dca496451   Tejun Heo   percpu: move comm...
1737
  	/* populate if not all pages are already there */
5835d96e9   Tejun Heo   percpu: implement...
1738
  	if (!is_atomic) {
e837dfde1   Dennis Zhou   bitmap: genericiz...
1739
  		unsigned int page_start, page_end, rs, re;
dca496451   Tejun Heo   percpu: move comm...
1740

e04d32083   Tejun Heo   percpu: indent th...
1741
1742
  		page_start = PFN_DOWN(off);
  		page_end = PFN_UP(off + size);
b38d08f31   Tejun Heo   percpu: restructu...
1743

e837dfde1   Dennis Zhou   bitmap: genericiz...
1744
1745
  		bitmap_for_each_clear_region(chunk->populated, rs, re,
  					     page_start, page_end) {
e04d32083   Tejun Heo   percpu: indent th...
1746
  			WARN_ON(chunk->immutable);
554fef1c3   Dennis Zhou   percpu: allow sel...
1747
  			ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
e04d32083   Tejun Heo   percpu: indent th...
1748
1749
1750
  
  			spin_lock_irqsave(&pcpu_lock, flags);
  			if (ret) {
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
1751
  				pcpu_free_area(chunk, off);
e04d32083   Tejun Heo   percpu: indent th...
1752
1753
1754
  				err = "failed to populate";
  				goto fail_unlock;
  			}
b239f7daf   Dennis Zhou   percpu: set PCPU_...
1755
  			pcpu_chunk_populated(chunk, rs, re);
e04d32083   Tejun Heo   percpu: indent th...
1756
  			spin_unlock_irqrestore(&pcpu_lock, flags);
dca496451   Tejun Heo   percpu: move comm...
1757
  		}
fbf59bc9d   Tejun Heo   percpu: implement...
1758

e04d32083   Tejun Heo   percpu: indent th...
1759
1760
  		mutex_unlock(&pcpu_alloc_mutex);
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1761

1a4d76076   Tejun Heo   percpu: implement...
1762
1763
  	if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
  		pcpu_schedule_balance_work();
dca496451   Tejun Heo   percpu: move comm...
1764
1765
1766
  	/* clear the areas and return address relative to base address */
  	for_each_possible_cpu(cpu)
  		memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
1767
  	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
8a8c35fad   Larry Finger   mm: kmemleak_allo...
1768
  	kmemleak_alloc_percpu(ptr, size, gfp);
df95e795a   Dennis Zhou   percpu: add trace...
1769
1770
1771
  
  	trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
  			chunk->base_addr, off, ptr);
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1772
  	pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
1773
  	return ptr;
ccea34b5d   Tejun Heo   percpu: finer gra...
1774
1775
  
  fail_unlock:
403a91b16   Jiri Kosina   percpu: allow pcp...
1776
  	spin_unlock_irqrestore(&pcpu_lock, flags);
b38d08f31   Tejun Heo   percpu: restructu...
1777
  fail:
df95e795a   Dennis Zhou   percpu: add trace...
1778
  	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1779
  	if (!is_atomic && do_warn && warn_limit) {
870d4b12a   Joe Perches   mm: percpu: use p...
1780
1781
  		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s
  ",
598d80914   Joe Perches   mm: convert pr_wa...
1782
  			size, align, is_atomic, err);
f2badb0c9   Tejun Heo   percpu: make allo...
1783
1784
  		dump_stack();
  		if (!--warn_limit)
870d4b12a   Joe Perches   mm: percpu: use p...
1785
1786
  			pr_info("limit reached, disable warning
  ");
f2badb0c9   Tejun Heo   percpu: make allo...
1787
  	}
1a4d76076   Tejun Heo   percpu: implement...
1788
1789
1790
1791
  	if (is_atomic) {
  		/* see the flag handling in pcpu_balance_workfn() */
  		pcpu_atomic_alloc_failed = true;
  		pcpu_schedule_balance_work();
6710e594f   Tejun Heo   percpu: fix synch...
1792
1793
  	} else {
  		mutex_unlock(&pcpu_alloc_mutex);
1a4d76076   Tejun Heo   percpu: implement...
1794
  	}
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1795
1796
  
  	pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
ccea34b5d   Tejun Heo   percpu: finer gra...
1797
  	return NULL;
fbf59bc9d   Tejun Heo   percpu: implement...
1798
  }
edcb46399   Tejun Heo   percpu, module: i...
1799
1800
  
  /**
5835d96e9   Tejun Heo   percpu: implement...
1801
   * __alloc_percpu_gfp - allocate dynamic percpu area
edcb46399   Tejun Heo   percpu, module: i...
1802
1803
   * @size: size of area to allocate in bytes
   * @align: alignment of area (max PAGE_SIZE)
5835d96e9   Tejun Heo   percpu: implement...
1804
   * @gfp: allocation flags
edcb46399   Tejun Heo   percpu, module: i...
1805
   *
5835d96e9   Tejun Heo   percpu: implement...
1806
1807
   * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
   * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
0ea7eeec2   Daniel Borkmann   mm, percpu: add s...
1808
1809
1810
   * be called from any context but is a lot more likely to fail. If @gfp
   * has __GFP_NOWARN then no warning will be triggered on invalid or failed
   * allocation requests.
ccea34b5d   Tejun Heo   percpu: finer gra...
1811
   *
edcb46399   Tejun Heo   percpu, module: i...
1812
1813
1814
   * RETURNS:
   * Percpu pointer to the allocated area on success, NULL on failure.
   */
5835d96e9   Tejun Heo   percpu: implement...
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
  void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
  {
  	return pcpu_alloc(size, align, false, gfp);
  }
  EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
  
  /**
   * __alloc_percpu - allocate dynamic percpu area
   * @size: size of area to allocate in bytes
   * @align: alignment of area (max PAGE_SIZE)
   *
   * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
   */
43cf38eb5   Tejun Heo   percpu: add __per...
1828
  void __percpu *__alloc_percpu(size_t size, size_t align)
edcb46399   Tejun Heo   percpu, module: i...
1829
  {
5835d96e9   Tejun Heo   percpu: implement...
1830
  	return pcpu_alloc(size, align, false, GFP_KERNEL);
edcb46399   Tejun Heo   percpu, module: i...
1831
  }
fbf59bc9d   Tejun Heo   percpu: implement...
1832
  EXPORT_SYMBOL_GPL(__alloc_percpu);
edcb46399   Tejun Heo   percpu, module: i...
1833
1834
1835
1836
1837
  /**
   * __alloc_reserved_percpu - allocate reserved percpu area
   * @size: size of area to allocate in bytes
   * @align: alignment of area (max PAGE_SIZE)
   *
9329ba970   Tejun Heo   percpu: update co...
1838
1839
1840
1841
   * Allocate zero-filled percpu area of @size bytes aligned at @align
   * from reserved percpu area if arch has set it up; otherwise,
   * allocation is served from the same dynamic area.  Might sleep.
   * Might trigger writeouts.
edcb46399   Tejun Heo   percpu, module: i...
1842
   *
ccea34b5d   Tejun Heo   percpu: finer gra...
1843
1844
1845
   * CONTEXT:
   * Does GFP_KERNEL allocation.
   *
edcb46399   Tejun Heo   percpu, module: i...
1846
1847
1848
   * RETURNS:
   * Percpu pointer to the allocated area on success, NULL on failure.
   */
43cf38eb5   Tejun Heo   percpu: add __per...
1849
  void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
edcb46399   Tejun Heo   percpu, module: i...
1850
  {
5835d96e9   Tejun Heo   percpu: implement...
1851
  	return pcpu_alloc(size, align, true, GFP_KERNEL);
edcb46399   Tejun Heo   percpu, module: i...
1852
  }
a56dbddf0   Tejun Heo   percpu: move full...
1853
  /**
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1854
1855
   * __pcpu_balance_workfn - manage the amount of free chunks and populated pages
   * @type: chunk type
a56dbddf0   Tejun Heo   percpu: move full...
1856
   *
47504ee04   Dennis Zhou   percpu: add __GFP...
1857
1858
1859
1860
1861
1862
   * Reclaim all fully free chunks except for the first one.  This is also
   * responsible for maintaining the pool of empty populated pages.  However,
   * it is possible that this is called when physical memory is scarce causing
   * OOM killer to be triggered.  We should avoid doing so until an actual
   * allocation causes the failure as it is possible that requests can be
   * serviced from already backed regions.
a56dbddf0   Tejun Heo   percpu: move full...
1863
   */
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1864
  static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
fbf59bc9d   Tejun Heo   percpu: implement...
1865
  {
47504ee04   Dennis Zhou   percpu: add __GFP...
1866
  	/* gfp flags passed to underlying allocators */
554fef1c3   Dennis Zhou   percpu: allow sel...
1867
  	const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1868
  	LIST_HEAD(to_free);
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1869
  	struct list_head *pcpu_slot = pcpu_chunk_list(type);
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1870
  	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
a56dbddf0   Tejun Heo   percpu: move full...
1871
  	struct pcpu_chunk *chunk, *next;
1a4d76076   Tejun Heo   percpu: implement...
1872
  	int slot, nr_to_pop, ret;
a56dbddf0   Tejun Heo   percpu: move full...
1873

1a4d76076   Tejun Heo   percpu: implement...
1874
1875
1876
1877
  	/*
  	 * There's no reason to keep around multiple unused chunks and VM
  	 * areas can be scarce.  Destroy all free chunks except for one.
  	 */
ccea34b5d   Tejun Heo   percpu: finer gra...
1878
1879
  	mutex_lock(&pcpu_alloc_mutex);
  	spin_lock_irq(&pcpu_lock);
a56dbddf0   Tejun Heo   percpu: move full...
1880

fe6bd8c3d   Tejun Heo   percpu: rename pc...
1881
  	list_for_each_entry_safe(chunk, next, free_head, list) {
a56dbddf0   Tejun Heo   percpu: move full...
1882
1883
1884
  		WARN_ON(chunk->immutable);
  
  		/* spare the first one */
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1885
  		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
a56dbddf0   Tejun Heo   percpu: move full...
1886
  			continue;
fe6bd8c3d   Tejun Heo   percpu: rename pc...
1887
  		list_move(&chunk->list, &to_free);
a56dbddf0   Tejun Heo   percpu: move full...
1888
  	}
ccea34b5d   Tejun Heo   percpu: finer gra...
1889
  	spin_unlock_irq(&pcpu_lock);
a56dbddf0   Tejun Heo   percpu: move full...
1890

fe6bd8c3d   Tejun Heo   percpu: rename pc...
1891
  	list_for_each_entry_safe(chunk, next, &to_free, list) {
e837dfde1   Dennis Zhou   bitmap: genericiz...
1892
  		unsigned int rs, re;
dca496451   Tejun Heo   percpu: move comm...
1893

e837dfde1   Dennis Zhou   bitmap: genericiz...
1894
1895
  		bitmap_for_each_set_region(chunk->populated, rs, re, 0,
  					   chunk->nr_pages) {
a93ace487   Tejun Heo   percpu: move regi...
1896
  			pcpu_depopulate_chunk(chunk, rs, re);
b539b87fe   Tejun Heo   percpu: implmeent...
1897
1898
1899
  			spin_lock_irq(&pcpu_lock);
  			pcpu_chunk_depopulated(chunk, rs, re);
  			spin_unlock_irq(&pcpu_lock);
a93ace487   Tejun Heo   percpu: move regi...
1900
  		}
6081089fd   Tejun Heo   percpu: reorganiz...
1901
  		pcpu_destroy_chunk(chunk);
accd4f36a   Eric Dumazet   percpu: add a sch...
1902
  		cond_resched();
a56dbddf0   Tejun Heo   percpu: move full...
1903
  	}
971f3918a   Tejun Heo   percpu: fix pcpu_...
1904

1a4d76076   Tejun Heo   percpu: implement...
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
  	/*
  	 * Ensure there are certain number of free populated pages for
  	 * atomic allocs.  Fill up from the most packed so that atomic
  	 * allocs don't increase fragmentation.  If atomic allocation
  	 * failed previously, always populate the maximum amount.  This
  	 * should prevent atomic allocs larger than PAGE_SIZE from keeping
  	 * failing indefinitely; however, large atomic allocs are not
  	 * something we support properly and can be highly unreliable and
  	 * inefficient.
  	 */
  retry_pop:
  	if (pcpu_atomic_alloc_failed) {
  		nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
  		/* best effort anyway, don't worry about synchronization */
  		pcpu_atomic_alloc_failed = false;
  	} else {
  		nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
  				  pcpu_nr_empty_pop_pages,
  				  0, PCPU_EMPTY_POP_PAGES_HIGH);
  	}
  
  	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
e837dfde1   Dennis Zhou   bitmap: genericiz...
1927
  		unsigned int nr_unpop = 0, rs, re;
1a4d76076   Tejun Heo   percpu: implement...
1928
1929
1930
1931
1932
1933
  
  		if (!nr_to_pop)
  			break;
  
  		spin_lock_irq(&pcpu_lock);
  		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
8ab16c43e   Dennis Zhou (Facebook)   percpu: change th...
1934
  			nr_unpop = chunk->nr_pages - chunk->nr_populated;
1a4d76076   Tejun Heo   percpu: implement...
1935
1936
1937
1938
1939
1940
1941
1942
1943
  			if (nr_unpop)
  				break;
  		}
  		spin_unlock_irq(&pcpu_lock);
  
  		if (!nr_unpop)
  			continue;
  
  		/* @chunk can't go away while pcpu_alloc_mutex is held */
e837dfde1   Dennis Zhou   bitmap: genericiz...
1944
1945
1946
  		bitmap_for_each_clear_region(chunk->populated, rs, re, 0,
  					     chunk->nr_pages) {
  			int nr = min_t(int, re - rs, nr_to_pop);
1a4d76076   Tejun Heo   percpu: implement...
1947

47504ee04   Dennis Zhou   percpu: add __GFP...
1948
  			ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
1a4d76076   Tejun Heo   percpu: implement...
1949
1950
1951
  			if (!ret) {
  				nr_to_pop -= nr;
  				spin_lock_irq(&pcpu_lock);
b239f7daf   Dennis Zhou   percpu: set PCPU_...
1952
  				pcpu_chunk_populated(chunk, rs, rs + nr);
1a4d76076   Tejun Heo   percpu: implement...
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
  				spin_unlock_irq(&pcpu_lock);
  			} else {
  				nr_to_pop = 0;
  			}
  
  			if (!nr_to_pop)
  				break;
  		}
  	}
  
  	if (nr_to_pop) {
  		/* ran out of chunks to populate, create a new one and retry */
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1965
  		chunk = pcpu_create_chunk(type, gfp);
1a4d76076   Tejun Heo   percpu: implement...
1966
1967
1968
1969
1970
1971
1972
  		if (chunk) {
  			spin_lock_irq(&pcpu_lock);
  			pcpu_chunk_relocate(chunk, -1);
  			spin_unlock_irq(&pcpu_lock);
  			goto retry_pop;
  		}
  	}
971f3918a   Tejun Heo   percpu: fix pcpu_...
1973
  	mutex_unlock(&pcpu_alloc_mutex);
fbf59bc9d   Tejun Heo   percpu: implement...
1974
1975
1976
  }
  
  /**
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
   * pcpu_balance_workfn - manage the amount of free chunks and populated pages
   * @work: unused
   *
   * Call __pcpu_balance_workfn() for each chunk type.
   */
  static void pcpu_balance_workfn(struct work_struct *work)
  {
  	enum pcpu_chunk_type type;
  
  	for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
  		__pcpu_balance_workfn(type);
  }
  
  /**
fbf59bc9d   Tejun Heo   percpu: implement...
1991
1992
1993
   * free_percpu - free percpu area
   * @ptr: pointer to area to free
   *
ccea34b5d   Tejun Heo   percpu: finer gra...
1994
1995
1996
1997
   * Free percpu area @ptr.
   *
   * CONTEXT:
   * Can be called from atomic context.
fbf59bc9d   Tejun Heo   percpu: implement...
1998
   */
43cf38eb5   Tejun Heo   percpu: add __per...
1999
  void free_percpu(void __percpu *ptr)
fbf59bc9d   Tejun Heo   percpu: implement...
2000
  {
129182e56   Andrew Morton   percpu: avoid cal...
2001
  	void *addr;
fbf59bc9d   Tejun Heo   percpu: implement...
2002
  	struct pcpu_chunk *chunk;
ccea34b5d   Tejun Heo   percpu: finer gra...
2003
  	unsigned long flags;
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
2004
  	int size, off;
198790d9a   John Sperbeck   percpu: remove sp...
2005
  	bool need_balance = false;
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
2006
  	struct list_head *pcpu_slot;
fbf59bc9d   Tejun Heo   percpu: implement...
2007
2008
2009
  
  	if (!ptr)
  		return;
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
2010
  	kmemleak_free_percpu(ptr);
129182e56   Andrew Morton   percpu: avoid cal...
2011
  	addr = __pcpu_ptr_to_addr(ptr);
ccea34b5d   Tejun Heo   percpu: finer gra...
2012
  	spin_lock_irqsave(&pcpu_lock, flags);
fbf59bc9d   Tejun Heo   percpu: implement...
2013
2014
  
  	chunk = pcpu_chunk_addr_search(addr);
bba174f5e   Tejun Heo   percpu: add chunk...
2015
  	off = addr - chunk->base_addr;
fbf59bc9d   Tejun Heo   percpu: implement...
2016

3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
2017
2018
2019
2020
2021
  	size = pcpu_free_area(chunk, off);
  
  	pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
  
  	pcpu_memcg_free_hook(chunk, off, size);
fbf59bc9d   Tejun Heo   percpu: implement...
2022

a56dbddf0   Tejun Heo   percpu: move full...
2023
  	/* if there are more than one fully free chunks, wake up grim reaper */
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
2024
  	if (chunk->free_bytes == pcpu_unit_size) {
fbf59bc9d   Tejun Heo   percpu: implement...
2025
  		struct pcpu_chunk *pos;
a56dbddf0   Tejun Heo   percpu: move full...
2026
  		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
fbf59bc9d   Tejun Heo   percpu: implement...
2027
  			if (pos != chunk) {
198790d9a   John Sperbeck   percpu: remove sp...
2028
  				need_balance = true;
fbf59bc9d   Tejun Heo   percpu: implement...
2029
2030
2031
  				break;
  			}
  	}
df95e795a   Dennis Zhou   percpu: add trace...
2032
  	trace_percpu_free_percpu(chunk->base_addr, off, ptr);
ccea34b5d   Tejun Heo   percpu: finer gra...
2033
  	spin_unlock_irqrestore(&pcpu_lock, flags);
198790d9a   John Sperbeck   percpu: remove sp...
2034
2035
2036
  
  	if (need_balance)
  		pcpu_schedule_balance_work();
fbf59bc9d   Tejun Heo   percpu: implement...
2037
2038
  }
  EXPORT_SYMBOL_GPL(free_percpu);
383776fa7   Thomas Gleixner   locking/lockdep: ...
2039
  bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
10fad5e46   Tejun Heo   percpu, module: i...
2040
  {
bbddff054   Tejun Heo   percpu: use percp...
2041
  #ifdef CONFIG_SMP
10fad5e46   Tejun Heo   percpu, module: i...
2042
2043
2044
2045
2046
2047
  	const size_t static_size = __per_cpu_end - __per_cpu_start;
  	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
  	unsigned int cpu;
  
  	for_each_possible_cpu(cpu) {
  		void *start = per_cpu_ptr(base, cpu);
383776fa7   Thomas Gleixner   locking/lockdep: ...
2048
  		void *va = (void *)addr;
10fad5e46   Tejun Heo   percpu, module: i...
2049

383776fa7   Thomas Gleixner   locking/lockdep: ...
2050
  		if (va >= start && va < start + static_size) {
8ce371f98   Peter Zijlstra   lockdep: Fix per-...
2051
  			if (can_addr) {
383776fa7   Thomas Gleixner   locking/lockdep: ...
2052
  				*can_addr = (unsigned long) (va - start);
8ce371f98   Peter Zijlstra   lockdep: Fix per-...
2053
2054
2055
  				*can_addr += (unsigned long)
  					per_cpu_ptr(base, get_boot_cpu_id());
  			}
10fad5e46   Tejun Heo   percpu, module: i...
2056
  			return true;
383776fa7   Thomas Gleixner   locking/lockdep: ...
2057
2058
  		}
  	}
bbddff054   Tejun Heo   percpu: use percp...
2059
2060
  #endif
  	/* on UP, can't distinguish from other static vars, always false */
10fad5e46   Tejun Heo   percpu, module: i...
2061
2062
2063
2064
  	return false;
  }
  
  /**
   * is_kernel_percpu_address - test whether address is from static percpu area
   * @addr: address to test
   *
   * Check whether @addr lies in the in-kernel static percpu area.  Static
   * percpu areas belonging to modules are not covered; use
   * is_module_percpu_address() for those.
   *
   * RETURNS:
   * %true if @addr is from in-kernel static percpu area, %false otherwise.
   */
  bool is_kernel_percpu_address(unsigned long addr)
  {
  	/* the canonical address is of no interest here, hence NULL */
  	bool in_static_area = __is_kernel_percpu_address(addr, NULL);

  	return in_static_area;
  }
  
  /**
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
2081
2082
2083
2084
2085
2086
2087
2088
   * per_cpu_ptr_to_phys - convert translated percpu address to physical address
   * @addr: the address to be converted to physical address
   *
   * Given @addr which is dereferenceable address obtained via one of
   * percpu access macros, this function translates it into its physical
   * address.  The caller is responsible for ensuring @addr stays valid
   * until this function finishes.
   *
67589c714   Dave Young   percpu: explain w...
2089
2090
2091
2092
2093
   * percpu allocator has special setup for the first chunk, which currently
   * supports either embedding in linear address space or vmalloc mapping,
   * and, from the second one, the backing allocator (currently either vm or
   * km) provides translation.
   *
bffc43758   Yannick Guerrini   percpu: Fix trivi...
2094
   * The addr can be translated simply without checking if it falls into the
67589c714   Dave Young   percpu: explain w...
2095
2096
2097
2098
2099
   * first chunk. But the current code reflects better how percpu allocator
   * actually works, and the verification can discover both bugs in percpu
   * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
   * code.
   *
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
2100
2101
2102
2103
2104
   * RETURNS:
   * The physical address for @addr.
   */
  phys_addr_t per_cpu_ptr_to_phys(void *addr)
  {
9983b6f0c   Tejun Heo   percpu: fix first...
2105
2106
  	void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
  	bool in_first_chunk = false;
a855b84c3   Tejun Heo   percpu: fix chunk...
2107
  	unsigned long first_low, first_high;
9983b6f0c   Tejun Heo   percpu: fix first...
2108
2109
2110
  	unsigned int cpu;
  
  	/*
a855b84c3   Tejun Heo   percpu: fix chunk...
2111
  	 * The following test on unit_low/high isn't strictly
9983b6f0c   Tejun Heo   percpu: fix first...
2112
2113
  	 * necessary but will speed up lookups of addresses which
  	 * aren't in the first chunk.
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
2114
2115
2116
2117
2118
  	 *
  	 * The address check is against full chunk sizes.  pcpu_base_addr
  	 * points to the beginning of the first chunk including the
  	 * static region.  Assumes good intent as the first chunk may
  	 * not be full (ie. < pcpu_unit_pages in size).
9983b6f0c   Tejun Heo   percpu: fix first...
2119
  	 */
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
2120
2121
2122
2123
  	first_low = (unsigned long)pcpu_base_addr +
  		    pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
  	first_high = (unsigned long)pcpu_base_addr +
  		     pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
a855b84c3   Tejun Heo   percpu: fix chunk...
2124
2125
  	if ((unsigned long)addr >= first_low &&
  	    (unsigned long)addr < first_high) {
9983b6f0c   Tejun Heo   percpu: fix first...
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
  		for_each_possible_cpu(cpu) {
  			void *start = per_cpu_ptr(base, cpu);
  
  			if (addr >= start && addr < start + pcpu_unit_size) {
  				in_first_chunk = true;
  				break;
  			}
  		}
  	}
  
  	if (in_first_chunk) {
eac522ef4   David Howells   NOMMU: percpu sho...
2137
  		if (!is_vmalloc_addr(addr))
020ec6537   Tejun Heo   percpu: factor ou...
2138
2139
  			return __pa(addr);
  		else
9f57bd4d6   Eugene Surovegin   percpu: fix per_c...
2140
2141
  			return page_to_phys(vmalloc_to_page(addr)) +
  			       offset_in_page(addr);
020ec6537   Tejun Heo   percpu: factor ou...
2142
  	} else
9f57bd4d6   Eugene Surovegin   percpu: fix per_c...
2143
2144
  		return page_to_phys(pcpu_addr_to_page(addr)) +
  		       offset_in_page(addr);
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
2145
  }
36abe273c   Prasad Sodagudi   ANDROID: percpu: ...
2146
  EXPORT_SYMBOL_GPL(per_cpu_ptr_to_phys);
3b034b0d0   Vivek Goyal   percpu: Fix kdump...
2147

fbf59bc9d   Tejun Heo   percpu: implement...
2148
  /**
   * pcpu_alloc_alloc_info - allocate percpu allocation info
   * @nr_groups: the number of groups
   * @nr_units: the number of units
   *
   * Allocate an ai with room for @nr_groups groups plus one cpu_map array
   * of @nr_units entries placed right behind the groups.  groups[0].cpu_map
   * of the returned ai points at that array and every entry is preset to
   * NR_CPUS.  Setting up the cpu_map pointer of the remaining groups is
   * left to the caller.
   *
   * RETURNS:
   * Pointer to the allocated pcpu_alloc_info on success, NULL on
   * failure.
   */
  struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
  						      int nr_units)
  {
  	struct pcpu_alloc_info *ai;
  	size_t hdr_size, total_size;
  	int i;

  	/* header plus groups[], padded so that the cpu_map array is aligned */
  	hdr_size = ALIGN(struct_size(ai, groups, nr_groups),
  			 __alignof__(ai->groups[0].cpu_map[0]));
  	total_size = hdr_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);

  	ai = memblock_alloc(PFN_ALIGN(total_size), PAGE_SIZE);
  	if (!ai)
  		return NULL;

  	/* the shared cpu_map array lives directly behind the header */
  	ai->groups[0].cpu_map = (void *)ai + hdr_size;
  	for (i = 0; i < nr_units; i++)
  		ai->groups[0].cpu_map[i] = NR_CPUS;

  	ai->nr_groups = nr_groups;
  	/* remember the page-aligned size for pcpu_free_alloc_info() */
  	ai->__ai_size = PFN_ALIGN(total_size);

  	return ai;
  }
  
  /**
   * pcpu_free_alloc_info - free percpu allocation info
   * @ai: pcpu_alloc_info to free
   *
   * Give back @ai, which must have come from pcpu_alloc_alloc_info().  The
   * size recorded in @ai->__ai_size at allocation time is what gets freed.
   */
  void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
  {
  	phys_addr_t base = __pa(ai);
  	phys_addr_t len = ai->__ai_size;

  	memblock_free_early(base, len);
  }
  
  /**
fd1e8a1fe   Tejun Heo   percpu: introduce...
2202
2203
2204
2205
2206
2207
2208
2209
   * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
   * @lvl: loglevel
   * @ai: allocation info to dump
   *
   * Print out information about @ai using loglevel @lvl.
   */
  static void pcpu_dump_alloc_info(const char *lvl,
  				 const struct pcpu_alloc_info *ai)
033e48fb8   Tejun Heo   percpu: move pcpu...
2210
  {
fd1e8a1fe   Tejun Heo   percpu: introduce...
2211
  	int group_width = 1, cpu_width = 1, width;
033e48fb8   Tejun Heo   percpu: move pcpu...
2212
  	char empty_str[] = "--------";
fd1e8a1fe   Tejun Heo   percpu: introduce...
2213
2214
2215
2216
2217
2218
2219
  	int alloc = 0, alloc_end = 0;
  	int group, v;
  	int upa, apl;	/* units per alloc, allocs per line */
  
  	v = ai->nr_groups;
  	while (v /= 10)
  		group_width++;
033e48fb8   Tejun Heo   percpu: move pcpu...
2220

fd1e8a1fe   Tejun Heo   percpu: introduce...
2221
  	v = num_possible_cpus();
033e48fb8   Tejun Heo   percpu: move pcpu...
2222
  	while (v /= 10)
fd1e8a1fe   Tejun Heo   percpu: introduce...
2223
2224
  		cpu_width++;
  	empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
033e48fb8   Tejun Heo   percpu: move pcpu...
2225

fd1e8a1fe   Tejun Heo   percpu: introduce...
2226
2227
2228
  	upa = ai->alloc_size / ai->unit_size;
  	width = upa * (cpu_width + 1) + group_width + 3;
  	apl = rounddown_pow_of_two(max(60 / width, 1));
033e48fb8   Tejun Heo   percpu: move pcpu...
2229

fd1e8a1fe   Tejun Heo   percpu: introduce...
2230
2231
2232
  	printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
  	       lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
  	       ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
033e48fb8   Tejun Heo   percpu: move pcpu...
2233

fd1e8a1fe   Tejun Heo   percpu: introduce...
2234
2235
2236
2237
2238
2239
2240
2241
  	for (group = 0; group < ai->nr_groups; group++) {
  		const struct pcpu_group_info *gi = &ai->groups[group];
  		int unit = 0, unit_end = 0;
  
  		BUG_ON(gi->nr_units % upa);
  		for (alloc_end += gi->nr_units / upa;
  		     alloc < alloc_end; alloc++) {
  			if (!(alloc % apl)) {
1170532bb   Joe Perches   mm: convert print...
2242
2243
  				pr_cont("
  ");
fd1e8a1fe   Tejun Heo   percpu: introduce...
2244
2245
  				printk("%spcpu-alloc: ", lvl);
  			}
1170532bb   Joe Perches   mm: convert print...
2246
  			pr_cont("[%0*d] ", group_width, group);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2247
2248
2249
  
  			for (unit_end += upa; unit < unit_end; unit++)
  				if (gi->cpu_map[unit] != NR_CPUS)
1170532bb   Joe Perches   mm: convert print...
2250
2251
  					pr_cont("%0*d ",
  						cpu_width, gi->cpu_map[unit]);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2252
  				else
1170532bb   Joe Perches   mm: convert print...
2253
  					pr_cont("%s ", empty_str);
033e48fb8   Tejun Heo   percpu: move pcpu...
2254
  		}
033e48fb8   Tejun Heo   percpu: move pcpu...
2255
  	}
1170532bb   Joe Perches   mm: convert print...
2256
2257
  	pr_cont("
  ");
033e48fb8   Tejun Heo   percpu: move pcpu...
2258
  }
033e48fb8   Tejun Heo   percpu: move pcpu...
2259

fbf59bc9d   Tejun Heo   percpu: implement...
2260
  /**
8d408b4be   Tejun Heo   percpu: give more...
2261
   * pcpu_setup_first_chunk - initialize the first percpu chunk
fd1e8a1fe   Tejun Heo   percpu: introduce...
2262
   * @ai: pcpu_alloc_info describing how the percpu area is shaped
38a6be525   Tejun Heo   percpu: simplify ...
2263
   * @base_addr: mapped address
8d408b4be   Tejun Heo   percpu: give more...
2264
2265
   *
   * Initialize the first percpu chunk which contains the kernel static
69ab285b6   Christophe JAILLET   percpu: fix typo ...
2266
   * percpu area.  This function is to be called from arch percpu area
38a6be525   Tejun Heo   percpu: simplify ...
2267
   * setup path.
8d408b4be   Tejun Heo   percpu: give more...
2268
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
2269
2270
2271
2272
2273
2274
   * @ai contains all information necessary to initialize the first
   * chunk and prime the dynamic percpu allocator.
   *
   * @ai->static_size is the size of static percpu area.
   *
   * @ai->reserved_size, if non-zero, specifies the amount of bytes to
edcb46399   Tejun Heo   percpu, module: i...
2275
2276
2277
2278
2279
2280
2281
   * reserve after the static area in the first chunk.  This reserves
   * the first chunk such that it's available only through reserved
   * percpu allocation.  This is primarily used to serve module percpu
   * static areas on architectures where the addressing model has
   * limited offset range for symbol relocations to guarantee module
   * percpu symbols fall inside the relocatable range.
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
2282
2283
2284
   * @ai->dyn_size determines the number of bytes available for dynamic
   * allocation in the first chunk.  The area between @ai->static_size +
   * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
6074d5b0a   Tejun Heo   percpu: more flex...
2285
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
2286
2287
2288
   * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
   * and equal to or larger than @ai->static_size + @ai->reserved_size +
   * @ai->dyn_size.
8d408b4be   Tejun Heo   percpu: give more...
2289
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
2290
2291
   * @ai->atom_size is the allocation atom size and used as alignment
   * for vm areas.
8d408b4be   Tejun Heo   percpu: give more...
2292
   *
fd1e8a1fe   Tejun Heo   percpu: introduce...
2293
2294
2295
2296
2297
2298
2299
2300
2301
   * @ai->alloc_size is the allocation size and always multiple of
   * @ai->atom_size.  This is larger than @ai->atom_size if
   * @ai->unit_size is larger than @ai->atom_size.
   *
   * @ai->nr_groups and @ai->groups describe virtual memory layout of
   * percpu areas.  Units which should be colocated are put into the
   * same group.  Dynamic VM areas will be allocated according to these
   * groupings.  If @ai->nr_groups is zero, a single group containing
   * all units is assumed.
8d408b4be   Tejun Heo   percpu: give more...
2302
   *
38a6be525   Tejun Heo   percpu: simplify ...
2303
2304
   * The caller should have mapped the first chunk at @base_addr and
   * copied static data to each unit.
fbf59bc9d   Tejun Heo   percpu: implement...
2305
   *
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
2306
2307
2308
2309
2310
2311
2312
   * The first chunk will always contain a static and a dynamic region.
   * However, the static region is not managed by any chunk.  If the first
   * chunk also contains a reserved region, it is served by two chunks -
   * one for the reserved region and one for the dynamic region.  They
   * share the same vm, but use offset regions in the area allocation map.
   * The chunk serving the dynamic region is circulated in the chunk slots
   * and available for dynamic allocation like any other chunk.
fbf59bc9d   Tejun Heo   percpu: implement...
2313
   */
163fa2343   Kefeng Wang   percpu: Make pcpu...
2314
2315
  void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
  				   void *base_addr)
fbf59bc9d   Tejun Heo   percpu: implement...
2316
  {
b9c39442c   Dennis Zhou (Facebook)   percpu: setup_fir...
2317
  	size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2318
  	size_t static_size, dyn_size;
0c4169c3d   Dennis Zhou (Facebook)   percpu: setup_fir...
2319
  	struct pcpu_chunk *chunk;
6563297ce   Tejun Heo   percpu: use group...
2320
2321
  	unsigned long *group_offsets;
  	size_t *group_sizes;
fb435d523   Tejun Heo   percpu: add pcpu_...
2322
  	unsigned long *unit_off;
fbf59bc9d   Tejun Heo   percpu: implement...
2323
  	unsigned int cpu;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2324
2325
  	int *unit_map;
  	int group, unit, i;
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
2326
2327
  	int map_size;
  	unsigned long tmp_addr;
f655f4053   Mike Rapoport   mm/percpu: add ch...
2328
  	size_t alloc_size;
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
2329
  	enum pcpu_chunk_type type;
fbf59bc9d   Tejun Heo   percpu: implement...
2330

635b75fc1   Tejun Heo   percpu: make pcpu...
2331
2332
  #define PCPU_SETUP_BUG_ON(cond)	do {					\
  	if (unlikely(cond)) {						\
870d4b12a   Joe Perches   mm: percpu: use p...
2333
2334
2335
2336
  		pr_emerg("failed to initialize, %s
  ", #cond);		\
  		pr_emerg("cpu_possible_mask=%*pb
  ",			\
807de073b   Tejun Heo   percpu: use %*pb[...
2337
  			 cpumask_pr_args(cpu_possible_mask));		\
635b75fc1   Tejun Heo   percpu: make pcpu...
2338
2339
2340
2341
  		pcpu_dump_alloc_info(KERN_EMERG, ai);			\
  		BUG();							\
  	}								\
  } while (0)
2f39e637e   Tejun Heo   percpu: allow non...
2342
  	/* sanity checks */
635b75fc1   Tejun Heo   percpu: make pcpu...
2343
  	PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
bbddff054   Tejun Heo   percpu: use percp...
2344
  #ifdef CONFIG_SMP
635b75fc1   Tejun Heo   percpu: make pcpu...
2345
  	PCPU_SETUP_BUG_ON(!ai->static_size);
f09f1243c   Alexander Kuleshov   mm/percpu: use of...
2346
  	PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
bbddff054   Tejun Heo   percpu: use percp...
2347
  #endif
635b75fc1   Tejun Heo   percpu: make pcpu...
2348
  	PCPU_SETUP_BUG_ON(!base_addr);
f09f1243c   Alexander Kuleshov   mm/percpu: use of...
2349
  	PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
635b75fc1   Tejun Heo   percpu: make pcpu...
2350
  	PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
f09f1243c   Alexander Kuleshov   mm/percpu: use of...
2351
  	PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
635b75fc1   Tejun Heo   percpu: make pcpu...
2352
  	PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
2353
  	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
099a19d91   Tejun Heo   percpu: allow lim...
2354
  	PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
fb29a2cc6   Dennis Zhou (Facebook)   percpu: setup_fir...
2355
  	PCPU_SETUP_BUG_ON(!ai->dyn_size);
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2356
  	PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
ca460b3c9   Dennis Zhou (Facebook)   percpu: introduce...
2357
2358
  	PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
  			    IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
9f6455325   Tejun Heo   percpu: move vmal...
2359
  	PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
8d408b4be   Tejun Heo   percpu: give more...
2360

6563297ce   Tejun Heo   percpu: use group...
2361
  	/* process group information and build config tables accordingly */
f655f4053   Mike Rapoport   mm/percpu: add ch...
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
  	alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
  	group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!group_offsets)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
  
  	alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
  	group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!group_sizes)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
  
  	alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
  	unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!unit_map)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
  
  	alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
  	unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
  	if (!unit_off)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      alloc_size);
2f39e637e   Tejun Heo   percpu: allow non...
2389

fd1e8a1fe   Tejun Heo   percpu: introduce...
2390
  	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
ffe0d5a57   Tejun Heo   percpu: fix unit_...
2391
  		unit_map[cpu] = UINT_MAX;
a855b84c3   Tejun Heo   percpu: fix chunk...
2392
2393
2394
  
  	pcpu_low_unit_cpu = NR_CPUS;
  	pcpu_high_unit_cpu = NR_CPUS;
2f39e637e   Tejun Heo   percpu: allow non...
2395

fd1e8a1fe   Tejun Heo   percpu: introduce...
2396
2397
  	for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
  		const struct pcpu_group_info *gi = &ai->groups[group];
2f39e637e   Tejun Heo   percpu: allow non...
2398

6563297ce   Tejun Heo   percpu: use group...
2399
2400
  		group_offsets[group] = gi->base_offset;
  		group_sizes[group] = gi->nr_units * ai->unit_size;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2401
2402
2403
2404
  		for (i = 0; i < gi->nr_units; i++) {
  			cpu = gi->cpu_map[i];
  			if (cpu == NR_CPUS)
  				continue;
8d408b4be   Tejun Heo   percpu: give more...
2405

9f295664e   Dan Carpenter   percpu: off by on...
2406
  			PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
635b75fc1   Tejun Heo   percpu: make pcpu...
2407
2408
  			PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
  			PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
fbf59bc9d   Tejun Heo   percpu: implement...
2409

fd1e8a1fe   Tejun Heo   percpu: introduce...
2410
  			unit_map[cpu] = unit + i;
fb435d523   Tejun Heo   percpu: add pcpu_...
2411
  			unit_off[cpu] = gi->base_offset + i * ai->unit_size;
a855b84c3   Tejun Heo   percpu: fix chunk...
2412
2413
2414
2415
2416
2417
2418
  			/* determine low/high unit_cpu */
  			if (pcpu_low_unit_cpu == NR_CPUS ||
  			    unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
  				pcpu_low_unit_cpu = cpu;
  			if (pcpu_high_unit_cpu == NR_CPUS ||
  			    unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
  				pcpu_high_unit_cpu = cpu;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2419
  		}
2f39e637e   Tejun Heo   percpu: allow non...
2420
  	}
fd1e8a1fe   Tejun Heo   percpu: introduce...
2421
2422
2423
  	pcpu_nr_units = unit;
  
  	for_each_possible_cpu(cpu)
635b75fc1   Tejun Heo   percpu: make pcpu...
2424
2425
2426
2427
  		PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);
  
  	/* we're done parsing the input, undefine BUG macro and dump config */
  #undef PCPU_SETUP_BUG_ON
bcbea798f   Tejun Heo   percpu: print out...
2428
  	pcpu_dump_alloc_info(KERN_DEBUG, ai);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2429

6563297ce   Tejun Heo   percpu: use group...
2430
2431
2432
  	pcpu_nr_groups = ai->nr_groups;
  	pcpu_group_offsets = group_offsets;
  	pcpu_group_sizes = group_sizes;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2433
  	pcpu_unit_map = unit_map;
fb435d523   Tejun Heo   percpu: add pcpu_...
2434
  	pcpu_unit_offsets = unit_off;
2f39e637e   Tejun Heo   percpu: allow non...
2435
2436
  
  	/* determine basic parameters */
fd1e8a1fe   Tejun Heo   percpu: introduce...
2437
  	pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
d9b55eeb1   Tejun Heo   percpu: remove un...
2438
  	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
6563297ce   Tejun Heo   percpu: use group...
2439
  	pcpu_atom_size = ai->atom_size;
61cf93d3e   Dennis Zhou   percpu: convert f...
2440
2441
  	pcpu_chunk_struct_size = struct_size(chunk, populated,
  					     BITS_TO_LONGS(pcpu_unit_pages));
cafe8816b   Tejun Heo   percpu: use negat...
2442

30a5b5367   Dennis Zhou   percpu: expose st...
2443
  	pcpu_stats_save_ai(ai);
d9b55eeb1   Tejun Heo   percpu: remove un...
2444
2445
2446
2447
2448
  	/*
  	 * Allocate chunk slots.  The additional last slot is for
  	 * empty chunks.
  	 */
  	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
2449
2450
2451
2452
2453
  	pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
  					  sizeof(pcpu_chunk_lists[0]) *
  					  PCPU_NR_CHUNK_TYPES,
  					  SMP_CACHE_BYTES);
  	if (!pcpu_chunk_lists)
f655f4053   Mike Rapoport   mm/percpu: add ch...
2454
2455
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
3c7be18ac   Roman Gushchin   mm: memcg/percpu:...
2456
2457
2458
2459
2460
2461
  		      pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
  		      PCPU_NR_CHUNK_TYPES);
  
  	for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
  		for (i = 0; i < pcpu_nr_slots; i++)
  			INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);
fbf59bc9d   Tejun Heo   percpu: implement...
2462

edcb46399   Tejun Heo   percpu, module: i...
2463
  	/*
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
  	 * The end of the static region needs to be aligned with the
  	 * minimum allocation size as this offsets the reserved and
  	 * dynamic region.  The first chunk ends page aligned by
  	 * expanding the dynamic region, therefore the dynamic region
  	 * can be shrunk to compensate while still staying above the
  	 * configured sizes.
  	 */
  	static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
  	dyn_size = ai->dyn_size - (static_size - ai->static_size);
  
  	/*
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
2475
2476
2477
2478
2479
2480
  	 * Initialize first chunk.
  	 * If the reserved_size is non-zero, this initializes the reserved
  	 * chunk.  If the reserved_size is zero, the reserved chunk is NULL
  	 * and the dynamic region is initialized here.  The first chunk,
  	 * pcpu_first_chunk, will always point to the chunk that serves
  	 * the dynamic region.
edcb46399   Tejun Heo   percpu, module: i...
2481
  	 */
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2482
2483
  	tmp_addr = (unsigned long)base_addr + static_size;
  	map_size = ai->reserved_size ?: dyn_size;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
2484
  	chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
61ace7fa2   Tejun Heo   percpu: improve f...
2485

edcb46399   Tejun Heo   percpu, module: i...
2486
  	/* init dynamic chunk if necessary */
b9c39442c   Dennis Zhou (Facebook)   percpu: setup_fir...
2487
  	if (ai->reserved_size) {
0c4169c3d   Dennis Zhou (Facebook)   percpu: setup_fir...
2488
  		pcpu_reserved_chunk = chunk;
b9c39442c   Dennis Zhou (Facebook)   percpu: setup_fir...
2489

d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2490
  		tmp_addr = (unsigned long)base_addr + static_size +
c0ebfdc3f   Dennis Zhou (Facebook)   percpu: modify ba...
2491
  			   ai->reserved_size;
d2f3c3849   Dennis Zhou (Facebook)   percpu: increase ...
2492
  		map_size = dyn_size;
40064aeca   Dennis Zhou (Facebook)   percpu: replace a...
2493
  		chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
edcb46399   Tejun Heo   percpu, module: i...
2494
  	}
2441d15c9   Tejun Heo   percpu: cosmetic ...
2495
  	/* link the first chunk in */
0c4169c3d   Dennis Zhou (Facebook)   percpu: setup_fir...
2496
  	pcpu_first_chunk = chunk;
0cecf50cf   Dennis Zhou (Facebook)   percpu: introduce...
2497
  	pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
ae9e6bc9f   Tejun Heo   percpu: don't put...
2498
  	pcpu_chunk_relocate(pcpu_first_chunk, -1);
fbf59bc9d   Tejun Heo   percpu: implement...
2499

7e8a6304d   Dennis Zhou (Facebook)   /proc/meminfo: ad...
2500
2501
  	/* include all regions of the first chunk */
  	pcpu_nr_populated += PFN_DOWN(size_sum);
30a5b5367   Dennis Zhou   percpu: expose st...
2502
  	pcpu_stats_chunk_alloc();
df95e795a   Dennis Zhou   percpu: add trace...
2503
  	trace_percpu_create_chunk(base_addr);
30a5b5367   Dennis Zhou   percpu: expose st...
2504

fbf59bc9d   Tejun Heo   percpu: implement...
2505
  	/* we're done */
bba174f5e   Tejun Heo   percpu: add chunk...
2506
  	pcpu_base_addr = base_addr;
fbf59bc9d   Tejun Heo   percpu: implement...
2507
  }
66c3a7577   Tejun Heo   percpu: generaliz...
2508

bbddff054   Tejun Heo   percpu: use percp...
2509
  #ifdef CONFIG_SMP
17f3609c2   Andi Kleen   sections: fix sec...
2510
  const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
f58dc01ba   Tejun Heo   percpu: generaliz...
2511
2512
2513
  	[PCPU_FC_AUTO]	= "auto",
  	[PCPU_FC_EMBED]	= "embed",
  	[PCPU_FC_PAGE]	= "page",
f58dc01ba   Tejun Heo   percpu: generaliz...
2514
  };
66c3a7577   Tejun Heo   percpu: generaliz...
2515

f58dc01ba   Tejun Heo   percpu: generaliz...
2516
  enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
66c3a7577   Tejun Heo   percpu: generaliz...
2517

f58dc01ba   Tejun Heo   percpu: generaliz...
2518
2519
  static int __init percpu_alloc_setup(char *str)
  {
5479c78ac   Cyrill Gorcunov   mm, percpu: Make ...
2520
2521
  	if (!str)
  		return -EINVAL;
f58dc01ba   Tejun Heo   percpu: generaliz...
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
  	if (0)
  		/* nada */;
  #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
  	else if (!strcmp(str, "embed"))
  		pcpu_chosen_fc = PCPU_FC_EMBED;
  #endif
  #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
  	else if (!strcmp(str, "page"))
  		pcpu_chosen_fc = PCPU_FC_PAGE;
  #endif
f58dc01ba   Tejun Heo   percpu: generaliz...
2532
  	else
870d4b12a   Joe Perches   mm: percpu: use p...
2533
2534
  		pr_warn("unknown allocator %s specified
  ", str);
66c3a7577   Tejun Heo   percpu: generaliz...
2535

f58dc01ba   Tejun Heo   percpu: generaliz...
2536
  	return 0;
66c3a7577   Tejun Heo   percpu: generaliz...
2537
  }
f58dc01ba   Tejun Heo   percpu: generaliz...
2538
  early_param("percpu_alloc", percpu_alloc_setup);
66c3a7577   Tejun Heo   percpu: generaliz...
2539

3c9a024fd   Tejun Heo   percpu: fix build...
2540
2541
2542
2543
2544
  /*
   * pcpu_embed_first_chunk() is used by the generic percpu setup.
   * Build it if needed by the arch config or the generic setup is going
   * to be used.
   */
  #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
  	!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
  #define BUILD_EMBED_FIRST_CHUNK
  #endif

  /* build pcpu_page_first_chunk() iff needed by the arch config */
  #if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
  #define BUILD_PAGE_FIRST_CHUNK
  #endif
  
  /* pcpu_build_alloc_info() is used by both embed and page first chunk */
  #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
  /**
   * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
   * @reserved_size: the size of reserved percpu area in bytes
   * @dyn_size: minimum free size for dynamic allocation in bytes
   * @atom_size: allocation atom size
   * @cpu_distance_fn: callback to determine distance between cpus, optional
   *
   * This function determines grouping of units, their mappings to cpus
   * and other parameters considering needed percpu size, allocation
   * atom size and distances between CPUs.
   *
   * Groups are always multiples of atom size and CPUs which are of
   * LOCAL_DISTANCE both ways are grouped together and share space for
   * units in the same group.  The returned configuration is guaranteed
   * to have CPUs on different nodes on different groups and >=75% usage
   * of allocated virtual address space.
   *
   * If @cpu_distance_fn is NULL, all possible CPUs end up in a single
   * group.
   *
   * RETURNS:
   * On success, pointer to the new allocation_info is returned.  On
   * failure, ERR_PTR value is returned.
   */
  static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
  				size_t reserved_size, size_t dyn_size,
  				size_t atom_size,
  				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
  {
  	static int group_map[NR_CPUS] __initdata;
  	static int group_cnt[NR_CPUS] __initdata;
  	const size_t static_size = __per_cpu_end - __per_cpu_start;
  	int nr_groups = 1, nr_units = 0;
  	size_t size_sum, min_unit_size, alloc_size;
  	int upa, max_upa, best_upa;	/* units_per_alloc */
  	int last_allocs, group, unit;
  	unsigned int cpu, tcpu;
  	struct pcpu_alloc_info *ai;
  	unsigned int *cpu_map;

  	/* this function may be called multiple times */
  	memset(group_map, 0, sizeof(group_map));
  	memset(group_cnt, 0, sizeof(group_cnt));

  	/* calculate size_sum and ensure dyn_size is enough for early alloc */
  	size_sum = PFN_ALIGN(static_size + reserved_size +
  			    max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
  	dyn_size = size_sum - static_size - reserved_size;

  	/*
  	 * Determine min_unit_size, alloc_size and max_upa such that
  	 * alloc_size is multiple of atom_size and is the smallest
  	 * which can accommodate 4k aligned segments which are equal to
  	 * or larger than min_unit_size.
  	 */
  	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

  	/* determine the maximum # of units that can fit in an allocation */
  	alloc_size = roundup(min_unit_size, atom_size);
  	upa = alloc_size / min_unit_size;
  	while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
  		upa--;
  	max_upa = upa;

  	/* group cpus according to their proximity */
  	for_each_possible_cpu(cpu) {
  		group = 0;
  	next_group:
  		/* only cpus already placed (those before @cpu) are compared */
  		for_each_possible_cpu(tcpu) {
  			if (cpu == tcpu)
  				break;
  			if (group_map[tcpu] == group && cpu_distance_fn &&
  			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
  			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
  				group++;
  				nr_groups = max(nr_groups, group + 1);
  				goto next_group;
  			}
  		}
  		group_map[cpu] = group;
  		group_cnt[group]++;
  	}

  	/*
  	 * Wasted space is caused by a ratio imbalance of upa to group_cnt.
  	 * Expand the unit_size until we use >= 75% of the units allocated.
  	 * Related to atom_size, which could be much larger than the unit_size.
  	 */
  	last_allocs = INT_MAX;
  	/* 0 is never a valid upa, so it doubles as "nothing selected yet" */
  	best_upa = 0;
  	for (upa = max_upa; upa; upa--) {
  		int allocs = 0, wasted = 0;

  		if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
  			continue;

  		for (group = 0; group < nr_groups; group++) {
  			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
  			allocs += this_allocs;
  			wasted += this_allocs * upa - group_cnt[group];
  		}

  		/*
  		 * Don't accept if wastage is over 1/3.  The
  		 * greater-than comparison ensures upa==1 always
  		 * passes the following check.
  		 */
  		if (wasted > num_possible_cpus() / 3)
  			continue;

  		/* and then don't consume more memory */
  		if (allocs > last_allocs)
  			break;
  		last_allocs = allocs;
  		best_upa = upa;
  	}
  	/* the search must have settled on a candidate before it is used */
  	BUG_ON(!best_upa);
  	upa = best_upa;

  	/* allocate and fill alloc_info */
  	for (group = 0; group < nr_groups; group++)
  		nr_units += roundup(group_cnt[group], upa);

  	ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
  	if (!ai)
  		return ERR_PTR(-ENOMEM);
  	cpu_map = ai->groups[0].cpu_map;

  	/* carve the shared cpu_map array into one slice per group */
  	for (group = 0; group < nr_groups; group++) {
  		ai->groups[group].cpu_map = cpu_map;
  		cpu_map += roundup(group_cnt[group], upa);
  	}

  	ai->static_size = static_size;
  	ai->reserved_size = reserved_size;
  	ai->dyn_size = dyn_size;
  	ai->unit_size = alloc_size / upa;
  	ai->atom_size = atom_size;
  	ai->alloc_size = alloc_size;

  	for (group = 0, unit = 0; group < nr_groups; group++) {
  		struct pcpu_group_info *gi = &ai->groups[group];

  		/*
  		 * Initialize base_offset as if all groups are located
  		 * back-to-back.  The caller should update this to
  		 * reflect actual allocation.
  		 */
  		gi->base_offset = unit * ai->unit_size;

  		for_each_possible_cpu(cpu)
  			if (group_map[cpu] == group)
  				gi->cpu_map[gi->nr_units++] = cpu;
  		gi->nr_units = roundup(gi->nr_units, upa);
  		unit += gi->nr_units;
  	}
  	BUG_ON(unit != nr_units);

  	return ai;
  }
  #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
  
  #if defined(BUILD_EMBED_FIRST_CHUNK)
66c3a7577   Tejun Heo   percpu: generaliz...
2712
2713
  /**
   * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
66c3a7577   Tejun Heo   percpu: generaliz...
2714
   * @reserved_size: the size of reserved percpu area in bytes
4ba6ce250   Tejun Heo   percpu: make @dyn...
2715
   * @dyn_size: minimum free size for dynamic allocation in bytes
c8826dd53   Tejun Heo   percpu: update em...
2716
2717
2718
   * @atom_size: allocation atom size
   * @cpu_distance_fn: callback to determine distance between cpus, optional
   * @alloc_fn: function to allocate percpu page
25985edce   Lucas De Marchi   Fix common misspe...
2719
   * @free_fn: function to free percpu page
66c3a7577   Tejun Heo   percpu: generaliz...
2720
2721
2722
2723
2724
   *
   * This is a helper to ease setting up embedded first percpu chunk and
   * can be called where pcpu_setup_first_chunk() is expected.
   *
   * If this function is used to setup the first chunk, it is allocated
c8826dd53   Tejun Heo   percpu: update em...
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
   * by calling @alloc_fn and used as-is without being mapped into
   * vmalloc area.  Allocations are always whole multiples of @atom_size
   * aligned to @atom_size.
   *
   * This enables the first chunk to piggy back on the linear physical
   * mapping which often uses larger page size.  Please note that this
   * can result in very sparse cpu->unit mapping on NUMA machines thus
   * requiring large vmalloc address space.  Don't use this allocator if
   * vmalloc space is not orders of magnitude larger than distances
   * between node memory addresses (ie. 32bit NUMA machines).
66c3a7577   Tejun Heo   percpu: generaliz...
2735
   *
4ba6ce250   Tejun Heo   percpu: make @dyn...
2736
   * @dyn_size specifies the minimum dynamic area size.
66c3a7577   Tejun Heo   percpu: generaliz...
2737
2738
   *
   * If the needed size is smaller than the minimum or specified unit
c8826dd53   Tejun Heo   percpu: update em...
2739
   * size, the leftover is returned using @free_fn.
66c3a7577   Tejun Heo   percpu: generaliz...
2740
2741
   *
   * RETURNS:
fb435d523   Tejun Heo   percpu: add pcpu_...
2742
   * 0 on success, -errno on failure.
66c3a7577   Tejun Heo   percpu: generaliz...
2743
   */
4ba6ce250   Tejun Heo   percpu: make @dyn...
2744
  int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
c8826dd53   Tejun Heo   percpu: update em...
2745
2746
2747
2748
  				  size_t atom_size,
  				  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
  				  pcpu_fc_alloc_fn_t alloc_fn,
  				  pcpu_fc_free_fn_t free_fn)
66c3a7577   Tejun Heo   percpu: generaliz...
2749
  {
c8826dd53   Tejun Heo   percpu: update em...
2750
2751
  	void *base = (void *)ULONG_MAX;
  	void **areas = NULL;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2752
  	struct pcpu_alloc_info *ai;
93c76b6b2   zijun_hu   mm/percpu.c: corr...
2753
2754
  	size_t size_sum, areas_size;
  	unsigned long max_distance;
163fa2343   Kefeng Wang   percpu: Make pcpu...
2755
  	int group, i, highest_group, rc = 0;
66c3a7577   Tejun Heo   percpu: generaliz...
2756

c8826dd53   Tejun Heo   percpu: update em...
2757
2758
  	ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
  				   cpu_distance_fn);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2759
2760
  	if (IS_ERR(ai))
  		return PTR_ERR(ai);
66c3a7577   Tejun Heo   percpu: generaliz...
2761

fd1e8a1fe   Tejun Heo   percpu: introduce...
2762
  	size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
c8826dd53   Tejun Heo   percpu: update em...
2763
  	areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
fa8a7094b   Tejun Heo   x86: implement pe...
2764

26fb3dae0   Mike Rapoport   memblock: drop me...
2765
  	areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
c8826dd53   Tejun Heo   percpu: update em...
2766
  	if (!areas) {
fb435d523   Tejun Heo   percpu: add pcpu_...
2767
  		rc = -ENOMEM;
c8826dd53   Tejun Heo   percpu: update em...
2768
  		goto out_free;
fa8a7094b   Tejun Heo   x86: implement pe...
2769
  	}
66c3a7577   Tejun Heo   percpu: generaliz...
2770

9b7396624   zijun_hu   mm/percpu.c: fix ...
2771
2772
  	/* allocate, copy and determine base address & max_distance */
  	highest_group = 0;
c8826dd53   Tejun Heo   percpu: update em...
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
  	for (group = 0; group < ai->nr_groups; group++) {
  		struct pcpu_group_info *gi = &ai->groups[group];
  		unsigned int cpu = NR_CPUS;
  		void *ptr;
  
  		for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
  			cpu = gi->cpu_map[i];
  		BUG_ON(cpu == NR_CPUS);
  
  		/* allocate space for the whole group */
  		ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
  		if (!ptr) {
  			rc = -ENOMEM;
  			goto out_free_areas;
  		}
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
2788
2789
  		/* kmemleak tracks the percpu allocations separately */
  		kmemleak_free(ptr);
c8826dd53   Tejun Heo   percpu: update em...
2790
  		areas[group] = ptr;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2791

c8826dd53   Tejun Heo   percpu: update em...
2792
  		base = min(ptr, base);
9b7396624   zijun_hu   mm/percpu.c: fix ...
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
  		if (ptr > areas[highest_group])
  			highest_group = group;
  	}
  	max_distance = areas[highest_group] - base;
  	max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
  
  	/* warn if maximum distance is further than 75% of vmalloc space */
  	if (max_distance > VMALLOC_TOTAL * 3 / 4) {
  		pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx
  ",
  				max_distance, VMALLOC_TOTAL);
  #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
  		/* and fail if we have fallback */
  		rc = -EINVAL;
  		goto out_free_areas;
  #endif
42b642814   Tejun Heo   percpu: pcpu_embe...
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
  	}
  
  	/*
  	 * Copy data and free unused parts.  This should happen after all
  	 * allocations are complete; otherwise, we may end up with
  	 * overlapping groups.
  	 */
  	for (group = 0; group < ai->nr_groups; group++) {
  		struct pcpu_group_info *gi = &ai->groups[group];
  		void *ptr = areas[group];
c8826dd53   Tejun Heo   percpu: update em...
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
  
  		for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
  			if (gi->cpu_map[i] == NR_CPUS) {
  				/* unused unit, free whole */
  				free_fn(ptr, ai->unit_size);
  				continue;
  			}
  			/* copy and return the unused part */
  			memcpy(ptr, __per_cpu_load, ai->static_size);
  			free_fn(ptr + size_sum, ai->unit_size - size_sum);
  		}
fa8a7094b   Tejun Heo   x86: implement pe...
2830
  	}
66c3a7577   Tejun Heo   percpu: generaliz...
2831

c8826dd53   Tejun Heo   percpu: update em...
2832
  	/* base address is now known, determine group base offsets */
6ea529a20   Tejun Heo   percpu: make embe...
2833
  	for (group = 0; group < ai->nr_groups; group++) {
c8826dd53   Tejun Heo   percpu: update em...
2834
  		ai->groups[group].base_offset = areas[group] - base;
6ea529a20   Tejun Heo   percpu: make embe...
2835
  	}
c8826dd53   Tejun Heo   percpu: update em...
2836

00206a69e   Matteo Croce   percpu: stop prin...
2837
2838
2839
  	pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu
  ",
  		PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
fd1e8a1fe   Tejun Heo   percpu: introduce...
2840
  		ai->dyn_size, ai->unit_size);
d4b95f803   Tejun Heo   x86,percpu: gener...
2841

163fa2343   Kefeng Wang   percpu: Make pcpu...
2842
  	pcpu_setup_first_chunk(ai, base);
c8826dd53   Tejun Heo   percpu: update em...
2843
2844
2845
2846
  	goto out_free;
  
  out_free_areas:
  	for (group = 0; group < ai->nr_groups; group++)
f851c8d85   Michael Holzheu   percpu: fix bootm...
2847
2848
2849
  		if (areas[group])
  			free_fn(areas[group],
  				ai->groups[group].nr_units * ai->unit_size);
c8826dd53   Tejun Heo   percpu: update em...
2850
  out_free:
fd1e8a1fe   Tejun Heo   percpu: introduce...
2851
  	pcpu_free_alloc_info(ai);
c8826dd53   Tejun Heo   percpu: update em...
2852
  	if (areas)
999c17e3d   Santosh Shilimkar   mm/percpu.c: use ...
2853
  		memblock_free_early(__pa(areas), areas_size);
fb435d523   Tejun Heo   percpu: add pcpu_...
2854
  	return rc;
d4b95f803   Tejun Heo   x86,percpu: gener...
2855
  }
3c9a024fd   Tejun Heo   percpu: fix build...
2856
  #endif /* BUILD_EMBED_FIRST_CHUNK */
d4b95f803   Tejun Heo   x86,percpu: gener...
2857

3c9a024fd   Tejun Heo   percpu: fix build...
2858
  #ifdef BUILD_PAGE_FIRST_CHUNK
d4b95f803   Tejun Heo   x86,percpu: gener...
2859
  /**
00ae4064b   Tejun Heo   percpu: rename 4k...
2860
   * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
d4b95f803   Tejun Heo   x86,percpu: gener...
2861
2862
   * @reserved_size: the size of reserved percpu area in bytes
   * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
25985edce   Lucas De Marchi   Fix common misspe...
2863
   * @free_fn: function to free percpu page, always called with PAGE_SIZE
d4b95f803   Tejun Heo   x86,percpu: gener...
2864
2865
   * @populate_pte_fn: function to populate pte
   *
00ae4064b   Tejun Heo   percpu: rename 4k...
2866
2867
   * This is a helper to ease setting up page-remapped first percpu
   * chunk and can be called where pcpu_setup_first_chunk() is expected.
d4b95f803   Tejun Heo   x86,percpu: gener...
2868
2869
2870
2871
2872
   *
   * This is the basic allocator.  Static percpu area is allocated
   * page-by-page into vmalloc area.
   *
   * RETURNS:
fb435d523   Tejun Heo   percpu: add pcpu_...
2873
   * 0 on success, -errno on failure.
d4b95f803   Tejun Heo   x86,percpu: gener...
2874
   */
fb435d523   Tejun Heo   percpu: add pcpu_...
2875
2876
2877
2878
  int __init pcpu_page_first_chunk(size_t reserved_size,
  				 pcpu_fc_alloc_fn_t alloc_fn,
  				 pcpu_fc_free_fn_t free_fn,
  				 pcpu_fc_populate_pte_fn_t populate_pte_fn)
d4b95f803   Tejun Heo   x86,percpu: gener...
2879
  {
8f05a6a65   Tejun Heo   percpu: make 4k f...
2880
  	static struct vm_struct vm;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2881
  	struct pcpu_alloc_info *ai;
00ae4064b   Tejun Heo   percpu: rename 4k...
2882
  	char psize_str[16];
ce3141a27   Tejun Heo   percpu: drop pcpu...
2883
  	int unit_pages;
d4b95f803   Tejun Heo   x86,percpu: gener...
2884
  	size_t pages_size;
ce3141a27   Tejun Heo   percpu: drop pcpu...
2885
  	struct page **pages;
163fa2343   Kefeng Wang   percpu: Make pcpu...
2886
  	int unit, i, j, rc = 0;
8f6066049   zijun_hu   mm/percpu.c: fix ...
2887
2888
  	int upa;
  	int nr_g0_units;
d4b95f803   Tejun Heo   x86,percpu: gener...
2889

00ae4064b   Tejun Heo   percpu: rename 4k...
2890
  	snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
4ba6ce250   Tejun Heo   percpu: make @dyn...
2891
  	ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2892
2893
2894
  	if (IS_ERR(ai))
  		return PTR_ERR(ai);
  	BUG_ON(ai->nr_groups != 1);
8f6066049   zijun_hu   mm/percpu.c: fix ...
2895
2896
  	upa = ai->alloc_size/ai->unit_size;
  	nr_g0_units = roundup(num_possible_cpus(), upa);
0b59c25f9   Igor Stoppa   mm: percpu: remov...
2897
  	if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
8f6066049   zijun_hu   mm/percpu.c: fix ...
2898
2899
2900
  		pcpu_free_alloc_info(ai);
  		return -EINVAL;
  	}
fd1e8a1fe   Tejun Heo   percpu: introduce...
2901
2902
  
  	unit_pages = ai->unit_size >> PAGE_SHIFT;
d4b95f803   Tejun Heo   x86,percpu: gener...
2903
2904
  
  	/* unaligned allocations can't be freed, round up to page size */
fd1e8a1fe   Tejun Heo   percpu: introduce...
2905
2906
  	pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
  			       sizeof(pages[0]));
7e1c4e279   Mike Rapoport   memblock: stop us...
2907
  	pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
f655f4053   Mike Rapoport   mm/percpu: add ch...
2908
2909
2910
2911
  	if (!pages)
  		panic("%s: Failed to allocate %zu bytes
  ", __func__,
  		      pages_size);
d4b95f803   Tejun Heo   x86,percpu: gener...
2912

8f05a6a65   Tejun Heo   percpu: make 4k f...
2913
  	/* allocate pages */
d4b95f803   Tejun Heo   x86,percpu: gener...
2914
  	j = 0;
8f6066049   zijun_hu   mm/percpu.c: fix ...
2915
2916
  	for (unit = 0; unit < num_possible_cpus(); unit++) {
  		unsigned int cpu = ai->groups[0].cpu_map[unit];
ce3141a27   Tejun Heo   percpu: drop pcpu...
2917
  		for (i = 0; i < unit_pages; i++) {
d4b95f803   Tejun Heo   x86,percpu: gener...
2918
  			void *ptr;
3cbc85652   Tejun Heo   percpu: add @alig...
2919
  			ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
d4b95f803   Tejun Heo   x86,percpu: gener...
2920
  			if (!ptr) {
870d4b12a   Joe Perches   mm: percpu: use p...
2921
2922
  				pr_warn("failed to allocate %s page for cpu%u
  ",
8f6066049   zijun_hu   mm/percpu.c: fix ...
2923
  						psize_str, cpu);
d4b95f803   Tejun Heo   x86,percpu: gener...
2924
2925
  				goto enomem;
  			}
f528f0b8e   Catalin Marinas   kmemleak: Handle ...
2926
2927
  			/* kmemleak tracks the percpu allocations separately */
  			kmemleak_free(ptr);
ce3141a27   Tejun Heo   percpu: drop pcpu...
2928
  			pages[j++] = virt_to_page(ptr);
d4b95f803   Tejun Heo   x86,percpu: gener...
2929
  		}
8f6066049   zijun_hu   mm/percpu.c: fix ...
2930
  	}
d4b95f803   Tejun Heo   x86,percpu: gener...
2931

8f05a6a65   Tejun Heo   percpu: make 4k f...
2932
2933
  	/* allocate vm area, map the pages and copy static data */
  	vm.flags = VM_ALLOC;
fd1e8a1fe   Tejun Heo   percpu: introduce...
2934
  	vm.size = num_possible_cpus() * ai->unit_size;
8f05a6a65   Tejun Heo   percpu: make 4k f...
2935
  	vm_area_register_early(&vm, PAGE_SIZE);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2936
  	for (unit = 0; unit < num_possible_cpus(); unit++) {
1d9d32572   Tejun Heo   percpu: make @dyn...
2937
  		unsigned long unit_addr =
fd1e8a1fe   Tejun Heo   percpu: introduce...
2938
  			(unsigned long)vm.addr + unit * ai->unit_size;
8f05a6a65   Tejun Heo   percpu: make 4k f...
2939

ce3141a27   Tejun Heo   percpu: drop pcpu...
2940
  		for (i = 0; i < unit_pages; i++)
8f05a6a65   Tejun Heo   percpu: make 4k f...
2941
2942
2943
  			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
  
  		/* pte already populated, the following shouldn't fail */
fb435d523   Tejun Heo   percpu: add pcpu_...
2944
2945
2946
2947
2948
  		rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
  				      unit_pages);
  		if (rc < 0)
  			panic("failed to map percpu area, err=%d
  ", rc);
66c3a7577   Tejun Heo   percpu: generaliz...
2949

8f05a6a65   Tejun Heo   percpu: make 4k f...
2950
2951
2952
2953
2954
2955
2956
2957
2958
  		/*
  		 * FIXME: Archs with virtual cache should flush local
  		 * cache for the linear mapping here - something
  		 * equivalent to flush_cache_vmap() on the local cpu.
  		 * flush_cache_vmap() can't be used as most supporting
  		 * data structures are not set up yet.
  		 */
  
  		/* copy static data */
fd1e8a1fe   Tejun Heo   percpu: introduce...
2959
  		memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
66c3a7577   Tejun Heo   percpu: generaliz...
2960
2961
2962
  	}
  
  	/* we're ready, commit */
00206a69e   Matteo Croce   percpu: stop prin...
2963
2964
2965
  	pr_info("%d %s pages/cpu s%zu r%zu d%zu
  ",
  		unit_pages, psize_str, ai->static_size,
fd1e8a1fe   Tejun Heo   percpu: introduce...
2966
  		ai->reserved_size, ai->dyn_size);
d4b95f803   Tejun Heo   x86,percpu: gener...
2967

163fa2343   Kefeng Wang   percpu: Make pcpu...
2968
  	pcpu_setup_first_chunk(ai, vm.addr);
d4b95f803   Tejun Heo   x86,percpu: gener...
2969
2970
2971
2972
  	goto out_free_ar;
  
  enomem:
  	while (--j >= 0)
ce3141a27   Tejun Heo   percpu: drop pcpu...
2973
  		free_fn(page_address(pages[j]), PAGE_SIZE);
fb435d523   Tejun Heo   percpu: add pcpu_...
2974
  	rc = -ENOMEM;
d4b95f803   Tejun Heo   x86,percpu: gener...
2975
  out_free_ar:
999c17e3d   Santosh Shilimkar   mm/percpu.c: use ...
2976
  	memblock_free_early(__pa(pages), pages_size);
fd1e8a1fe   Tejun Heo   percpu: introduce...
2977
  	pcpu_free_alloc_info(ai);
fb435d523   Tejun Heo   percpu: add pcpu_...
2978
  	return rc;
d4b95f803   Tejun Heo   x86,percpu: gener...
2979
  }
3c9a024fd   Tejun Heo   percpu: fix build...
2980
  #endif /* BUILD_PAGE_FIRST_CHUNK */
d4b95f803   Tejun Heo   x86,percpu: gener...
2981

bbddff054   Tejun Heo   percpu: use percp...
2982
  #ifndef	CONFIG_HAVE_SETUP_PER_CPU_AREA
8c4bfc6e8   Tejun Heo   x86,percpu: gener...
2983
  /*
bbddff054   Tejun Heo   percpu: use percp...
2984
   * Generic SMP percpu area setup.
e74e39620   Tejun Heo   percpu: use dynam...
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
   *
   * The embedding helper is used because its behavior closely resembles
   * the original non-dynamic generic percpu area setup.  This is
   * important because many archs have addressing restrictions and might
   * fail if the percpu area is located far away from the previous
   * location.  As an added bonus, in non-NUMA cases, embedding is
   * generally a good idea TLB-wise because percpu area can piggy back
   * on the physical linear memory mapping which uses large page
   * mappings on applicable archs.
   */
e74e39620   Tejun Heo   percpu: use dynam...
2995
2996
  unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
  EXPORT_SYMBOL(__per_cpu_offset);
c8826dd53   Tejun Heo   percpu: update em...
2997
2998
2999
  /*
   * Default first-chunk allocation callback: obtains @size bytes aligned
   * to @align from memblock, passing __pa(MAX_DMA_ADDRESS) as the lower
   * bound.  @cpu is not used.
   */
  static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
  				       size_t align)
  {
  	void *area = memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));

  	return area;
  }
66c3a7577   Tejun Heo   percpu: generaliz...
3002

c8826dd53   Tejun Heo   percpu: update em...
3003
3004
  /*
   * Default first-chunk free callback: returns the @size bytes starting
   * at @ptr to memblock.
   */
  static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
  {
  	memblock_free_early(__pa(ptr), size);
  }
e74e39620   Tejun Heo   percpu: use dynam...
3007
3008
  void __init setup_per_cpu_areas(void)
  {
  	unsigned long base_delta;
  	unsigned int cpu;
  	int err;

  	/*
  	 * The legacy allocator always set aside room for module percpu
  	 * variables; keep doing so.
  	 */
  	err = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
  				     PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
  				     pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
  	if (err < 0)
  		panic("Failed to initialize percpu areas.");

  	/* each cpu's offset is the base delta plus that cpu's unit offset */
  	base_delta = (unsigned long)pcpu_base_addr -
  		     (unsigned long)__per_cpu_start;
  	for_each_possible_cpu(cpu) {
  		__per_cpu_offset[cpu] = base_delta + pcpu_unit_offsets[cpu];
  	}
  }
bbddff054   Tejun Heo   percpu: use percp...
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
  #endif	/* CONFIG_HAVE_SETUP_PER_CPU_AREA */
  
  #else	/* CONFIG_SMP */
  
  /*
   * UP percpu area setup.
   *
   * On UP the km-based percpu allocator with identity mapping is always
   * used.  Static percpu variables cannot be told apart from ordinary
   * static variables and need no special preparation.
   *
   * A single group with one unit mapped to cpu 0 is described and handed
   * to pcpu_setup_first_chunk(); the alloc info is freed afterwards.
   */
  void __init setup_per_cpu_areas(void)
  {
  	const size_t unit_size =
  		roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
  					 PERCPU_DYNAMIC_RESERVE));
  	struct pcpu_alloc_info *info;
  	void *chunk_mem;

  	info = pcpu_alloc_alloc_info(1, 1);
  	chunk_mem = memblock_alloc_from(unit_size, PAGE_SIZE,
  					__pa(MAX_DMA_ADDRESS));
  	if (!(info && chunk_mem))
  		panic("Failed to allocate memory for percpu areas.");

  	/* percpu allocations are tracked separately by kmemleak */
  	kmemleak_free(chunk_mem);

  	/* the whole unit is dynamic area; every size equals unit_size */
  	info->dyn_size = unit_size;
  	info->unit_size = unit_size;
  	info->atom_size = unit_size;
  	info->alloc_size = unit_size;

  	/* one unit, owned by cpu 0 */
  	info->groups[0].nr_units = 1;
  	info->groups[0].cpu_map[0] = 0;

  	pcpu_setup_first_chunk(info, chunk_mem);
  	pcpu_free_alloc_info(info);
  }
  
  #endif	/* CONFIG_SMP */
099a19d91   Tejun Heo   percpu: allow lim...
3064
3065
  
  /*
   * pcpu_nr_pages - calculate total number of populated backing pages
   *
   * Reports how many pages are populated to back chunks.  Metadata is
   * left out of the figure exposed in meminfo: the backing page count
   * scales with the number of cpus and can quickly outweigh the memory
   * used for metadata, and leaving it out keeps the calculation simple.
   *
   * RETURNS:
   * Total number of populated backing pages in use by the allocator.
   */
  unsigned long pcpu_nr_pages(void)
  {
  	const unsigned long nr_backing_pages =
  		pcpu_nr_populated * pcpu_nr_units;

  	return nr_backing_pages;
  }
224f180d7   Vijayanand Jitta   ANDROID: mm: Expo...
3080
  EXPORT_SYMBOL_GPL(pcpu_nr_pages);
7e8a6304d   Dennis Zhou (Facebook)   /proc/meminfo: ad...
3081
3082
  
  /*
   * The percpu allocator is brought up early during boot, when neither
   * slab nor workqueue is available.  Async management stays plugged
   * until this initcall sets pcpu_async_enabled.
   */
  static int __init percpu_enable_async(void)
  {
  	pcpu_async_enabled = true;

  	return 0;
  }
  subsys_initcall(percpu_enable_async);