Blame view

mm/vmalloc.c 89.7 KB
457c89965   Thomas Gleixner   treewide: Add SPD...
1
  // SPDX-License-Identifier: GPL-2.0-only
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2
3
4
5
6
7
8
  /*
   *  linux/mm/vmalloc.c
   *
   *  Copyright (C) 1993  Linus Torvalds
   *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
   *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
   *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
930fc45a4   Christoph Lameter   [PATCH] vmalloc_node
9
   *  Numa awareness, Christoph Lameter, SGI, June 2005
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
10
   */
db64fe022   Nick Piggin   mm: rewrite vmap ...
11
  #include <linux/vmalloc.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
12
13
14
  #include <linux/mm.h>
  #include <linux/module.h>
  #include <linux/highmem.h>
c3edc4010   Ingo Molnar   sched/headers: Mo...
15
  #include <linux/sched/signal.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
16
17
18
  #include <linux/slab.h>
  #include <linux/spinlock.h>
  #include <linux/interrupt.h>
5f6a6a9c4   Alexey Dobriyan   proc: move /proc/...
19
  #include <linux/proc_fs.h>
a10aa5798   Christoph Lameter   vmalloc: show vma...
20
  #include <linux/seq_file.h>
868b104d7   Rick Edgecombe   mm/vmalloc: Add f...
21
  #include <linux/set_memory.h>
3ac7fe5a4   Thomas Gleixner   infrastructure to...
22
  #include <linux/debugobjects.h>
230169693   Christoph Lameter   vmallocinfo: add ...
23
  #include <linux/kallsyms.h>
db64fe022   Nick Piggin   mm: rewrite vmap ...
24
  #include <linux/list.h>
4da56b99d   Chris Wilson   mm/vmap: Add a no...
25
  #include <linux/notifier.h>
db64fe022   Nick Piggin   mm: rewrite vmap ...
26
27
28
  #include <linux/rbtree.h>
  #include <linux/radix-tree.h>
  #include <linux/rcupdate.h>
f0aa66179   Tejun Heo   vmalloc: implemen...
29
  #include <linux/pfn.h>
89219d37a   Catalin Marinas   kmemleak: Add the...
30
  #include <linux/kmemleak.h>
60063497a   Arun Sharma   atomic: use <linu...
31
  #include <linux/atomic.h>
3b32123d7   Gideon Israel Dsouza   mm: use macros fr...
32
  #include <linux/compiler.h>
32fcfd407   Al Viro   make vfree() safe...
33
  #include <linux/llist.h>
0f616be12   Toshi Kani   mm: change __get_...
34
  #include <linux/bitops.h>
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
35
  #include <linux/rbtree_augmented.h>
f4f235309   Jann Horn   vmalloc: fix rema...
36
  #include <linux/overflow.h>
3b32123d7   Gideon Israel Dsouza   mm: use macros fr...
37

7c0f6ba68   Linus Torvalds   Replace <asm/uacc...
38
  #include <linux/uaccess.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
39
  #include <asm/tlbflush.h>
2dca6999e   David Miller   mm, perf_event: M...
40
  #include <asm/shmparam.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
41

dd56b0464   Mel Gorman   mm: page_alloc: h...
42
  #include "internal.h"
32fcfd407   Al Viro   make vfree() safe...
43
44
45
46
47
48
49
50
51
52
53
  /*
   * Bookkeeping for deferred frees: free_work() detaches everything queued
   * on @list and hands each node to __vunmap().
   */
  struct vfree_deferred {
  	struct llist_head list;	/* nodes waiting to be processed by free_work() */
  	struct work_struct wq;	/* work item; free_work() recovers the container from it */
  };
  /* One vfree_deferred instance per CPU. */
  static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);
  
  static void __vunmap(const void *, int);
  
  /*
   * Work handler: detach every node queued on the owning vfree_deferred
   * and pass each one to __vunmap().
   */
  static void free_work(struct work_struct *w)
  {
  	struct vfree_deferred *deferred;
  	struct llist_node *pending, *cur, *nxt;
  
  	deferred = container_of(w, struct vfree_deferred, wq);
  	pending = llist_del_all(&deferred->list);
  
  	/*
  	 * The node pointer itself is what gets handed to __vunmap(), so
  	 * the successor is fetched before the call (the _safe iterator).
  	 */
  	llist_for_each_safe(cur, nxt, pending)
  		__vunmap((void *)cur, 1);
  }
db64fe022   Nick Piggin   mm: rewrite vmap ...
59
  /*** Page table manipulation functions ***/
b221385bc   Adrian Bunk   [PATCH] mm/: make...
60

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
61
62
63
64
65
66
67
68
69
70
  /* Clear every kernel PTE covering [addr, end) below @pmd. */
  static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
  {
  	pte_t *ptep = pte_offset_kernel(pmd, addr);
  
  	for (;;) {
  		pte_t old = ptep_get_and_clear(&init_mm, addr, ptep);
  
  		/* Warn about an entry that is neither empty nor present. */
  		WARN_ON(!(pte_none(old) || pte_present(old)));
  
  		ptep++;
  		addr += PAGE_SIZE;
  		if (addr == end)
  			break;
  	}
  }
db64fe022   Nick Piggin   mm: rewrite vmap ...
71
  /* Walk the PMD entries covering [addr, end) below @pud and unmap them. */
  static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
  {
  	pmd_t *pmdp = pmd_offset(pud, addr);
  	unsigned long stop;
  
  	do {
  		stop = pmd_addr_end(addr, end);
  
  		/*
  		 * Descend only when neither pmd_clear_huge() nor
  		 * pmd_none_or_clear_bad() reports the entry as handled.
  		 */
  		if (!pmd_clear_huge(pmdp) && !pmd_none_or_clear_bad(pmdp)) {
  			vunmap_pte_range(pmdp, addr, stop);
  
  			cond_resched();
  		}
  	} while (pmdp++, addr = stop, addr != end);
  }
c2febafc6   Kirill A. Shutemov   mm: convert gener...
88
  /* Walk the PUD entries covering [addr, end) below @p4d and unmap them. */
  static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
  {
  	pud_t *pudp = pud_offset(p4d, addr);
  	unsigned long stop;
  
  	do {
  		stop = pud_addr_end(addr, end);
  
  		/*
  		 * Descend only when neither pud_clear_huge() nor
  		 * pud_none_or_clear_bad() reports the entry as handled.
  		 */
  		if (!pud_clear_huge(pudp) && !pud_none_or_clear_bad(pudp))
  			vunmap_pmd_range(pudp, addr, stop);
  	} while (pudp++, addr = stop, addr != end);
  }
c2febafc6   Kirill A. Shutemov   mm: convert gener...
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
  /* Walk the P4D entries covering [addr, end) below @pgd and unmap them. */
  static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
  {
  	p4d_t *p4dp = p4d_offset(pgd, addr);
  	unsigned long stop;
  
  	do {
  		stop = p4d_addr_end(addr, end);
  
  		/*
  		 * Descend only when neither p4d_clear_huge() nor
  		 * p4d_none_or_clear_bad() reports the entry as handled.
  		 */
  		if (!p4d_clear_huge(p4dp) && !p4d_none_or_clear_bad(p4dp))
  			vunmap_pud_range(p4dp, addr, stop);
  	} while (p4dp++, addr = stop, addr != end);
  }
db64fe022   Nick Piggin   mm: rewrite vmap ...
117
  static void vunmap_page_range(unsigned long addr, unsigned long end)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
118
119
120
  {
  	pgd_t *pgd;
  	unsigned long next;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
121
122
123
  
  	BUG_ON(addr >= end);
  	pgd = pgd_offset_k(addr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
124
125
126
127
  	do {
  		next = pgd_addr_end(addr, end);
  		if (pgd_none_or_clear_bad(pgd))
  			continue;
c2febafc6   Kirill A. Shutemov   mm: convert gener...
128
  		vunmap_p4d_range(pgd, addr, next);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
129
  	} while (pgd++, addr = next, addr != end);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
130
131
132
  }
  
  /*
   * Install PTEs for [addr, end) below @pmd, taking pages from @pages
   * starting at index *@nr.
   *
   * *@nr is a running index into @pages; it is advanced once per installed
   * PTE so that higher-level callers can keep track of the position.
   *
   * Returns 0 on success, -ENOMEM if the PTE table cannot be allocated or a
   * page pointer is NULL, and -EBUSY if a PTE is already populated.
   */
  static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
  		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
  {
  	pte_t *ptep = pte_alloc_kernel(pmd, addr);
  
  	if (!ptep)
  		return -ENOMEM;
  
  	for (;;) {
  		struct page *pg = pages[*nr];
  
  		if (WARN_ON(!pte_none(*ptep)))
  			return -EBUSY;
  		if (WARN_ON(!pg))
  			return -ENOMEM;
  
  		set_pte_at(&init_mm, addr, ptep, mk_pte(pg, prot));
  		(*nr)++;
  
  		ptep++;
  		addr += PAGE_SIZE;
  		if (addr == end)
  			break;
  	}
  
  	return 0;
  }
db64fe022   Nick Piggin   mm: rewrite vmap ...
155
156
  /*
   * Map [addr, end) below @pud by walking (and allocating as needed) the
   * PMD level and delegating each sub-range to vmap_pte_range().
   *
   * @pages and @nr are passed through unchanged to the lower level.
   *
   * Returns 0 on success, -ENOMEM if the PMD table cannot be allocated, or
   * the error reported by vmap_pte_range(). The lower-level error is passed
   * up as-is so that e.g. -EBUSY is not misreported as -ENOMEM.
   */
  static int vmap_pmd_range(pud_t *pud, unsigned long addr,
  		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
  {
  	pmd_t *pmd;
  	unsigned long next;
  	int err;
  
  	pmd = pmd_alloc(&init_mm, pud, addr);
  	if (!pmd)
  		return -ENOMEM;
  	do {
  		next = pmd_addr_end(addr, end);
  		err = vmap_pte_range(pmd, addr, next, prot, pages, nr);
  		if (err)
  			return err;
  	} while (pmd++, addr = next, addr != end);
  	return 0;
  }
c2febafc6   Kirill A. Shutemov   mm: convert gener...
171
  /*
   * Map [addr, end) below @p4d by walking (and allocating as needed) the
   * PUD level and delegating each sub-range to vmap_pmd_range().
   *
   * @pages and @nr are passed through unchanged to the lower level.
   *
   * Returns 0 on success, -ENOMEM if the PUD table cannot be allocated, or
   * the error reported by vmap_pmd_range(). The lower-level error is passed
   * up as-is rather than being replaced with -ENOMEM.
   */
  static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
  		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
  {
  	pud_t *pud;
  	unsigned long next;
  	int err;
  
  	pud = pud_alloc(&init_mm, p4d, addr);
  	if (!pud)
  		return -ENOMEM;
  	do {
  		next = pud_addr_end(addr, end);
  		err = vmap_pmd_range(pud, addr, next, prot, pages, nr);
  		if (err)
  			return err;
  	} while (pud++, addr = next, addr != end);
  	return 0;
  }
c2febafc6   Kirill A. Shutemov   mm: convert gener...
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
  /*
   * Map [addr, end) below @pgd by walking (and allocating as needed) the
   * P4D level and delegating each sub-range to vmap_pud_range().
   *
   * @pages and @nr are passed through unchanged to the lower level.
   *
   * Returns 0 on success, -ENOMEM if the P4D table cannot be allocated, or
   * the error reported by vmap_pud_range(). The lower-level error is passed
   * up as-is rather than being replaced with -ENOMEM.
   */
  static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
  		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
  {
  	p4d_t *p4d;
  	unsigned long next;
  	int err;
  
  	p4d = p4d_alloc(&init_mm, pgd, addr);
  	if (!p4d)
  		return -ENOMEM;
  	do {
  		next = p4d_addr_end(addr, end);
  		err = vmap_pud_range(p4d, addr, next, prot, pages, nr);
  		if (err)
  			return err;
  	} while (p4d++, addr = next, addr != end);
  	return 0;
  }
db64fe022   Nick Piggin   mm: rewrite vmap ...
202
203
204
205
206
207
  /*
   * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
   * will have pfns corresponding to the "pages" array.
   *
   * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
   */
8fc489850   Tejun Heo   vmalloc: add un/m...
208
209
  static int vmap_page_range_noflush(unsigned long start, unsigned long end,
  				   pgprot_t prot, struct page **pages)
  {
  	unsigned long cur = start;
  	unsigned long stop;
  	pgd_t *pgdp;
  	int mapped = 0;	/* running index into @pages, bumped by the PTE level */
  	int rc = 0;
  
  	BUG_ON(cur >= end);
  
  	pgdp = pgd_offset_k(cur);
  	for (;;) {
  		stop = pgd_addr_end(cur, end);
  
  		rc = vmap_p4d_range(pgdp, cur, stop, prot, pages, &mapped);
  		if (rc)
  			return rc;
  
  		pgdp++;
  		cur = stop;
  		if (cur == end)
  			break;
  	}
  
  	/* On success the caller gets the number of pages that were mapped. */
  	return mapped;
  }
8fc489850   Tejun Heo   vmalloc: add un/m...
228
229
230
231
232
233
234
235
236
  /*
   * Same as vmap_page_range_noflush(), followed by flush_cache_vmap() on
   * [start, end). The flush is issued whether or not the mapping succeeded;
   * the value from vmap_page_range_noflush() is returned unchanged.
   */
  static int vmap_page_range(unsigned long start, unsigned long end,
  			   pgprot_t prot, struct page **pages)
  {
  	const int mapped = vmap_page_range_noflush(start, end, prot, pages);
  
  	flush_cache_vmap(start, end);
  
  	return mapped;
  }
81ac3ad90   KAMEZAWA Hiroyuki   kcore: register m...
237
  int is_vmalloc_or_module_addr(const void *x)
  {
  	/*
  	 * Some architectures (ARM, x86-64 and sparc64) keep modules in a
  	 * dedicated region and only fall back on vmalloc() when that
  	 * fails; the others simply place them in the vmalloc space.
  	 */
  #if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
  	const unsigned long where = (unsigned long)x;
  
  	if (where >= MODULES_VADDR && where < MODULES_END)
  		return 1;
  #endif
  	return is_vmalloc_addr(x);
  }
48667e7a4   Christoph Lameter   Move vmalloc_to_p...
251
  /*
add688fbd   malc   Revert "mm/vmallo...
252
   * Walk a vmap address to the struct page it maps.
48667e7a4   Christoph Lameter   Move vmalloc_to_p...
253
   */
add688fbd   malc   Revert "mm/vmallo...
254
  struct page *vmalloc_to_page(const void *vmalloc_addr)
48667e7a4   Christoph Lameter   Move vmalloc_to_p...
255
256
  {
  	unsigned long addr = (unsigned long) vmalloc_addr;
add688fbd   malc   Revert "mm/vmallo...
257
  	struct page *page = NULL;
48667e7a4   Christoph Lameter   Move vmalloc_to_p...
258
  	pgd_t *pgd = pgd_offset_k(addr);
c2febafc6   Kirill A. Shutemov   mm: convert gener...
259
260
261
262
  	p4d_t *p4d;
  	pud_t *pud;
  	pmd_t *pmd;
  	pte_t *ptep, pte;
48667e7a4   Christoph Lameter   Move vmalloc_to_p...
263

7aa413def   Ingo Molnar   x86, MM: virtual ...
264
265
266
267
  	/*
  	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
  	 * architectures that do not vmalloc module space
  	 */
73bdf0a60   Linus Torvalds   Introduce is_vmal...
268
  	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
59ea74633   Jiri Slaby   MM: virtual addre...
269

c2febafc6   Kirill A. Shutemov   mm: convert gener...
270
271
272
273
274
275
  	if (pgd_none(*pgd))
  		return NULL;
  	p4d = p4d_offset(pgd, addr);
  	if (p4d_none(*p4d))
  		return NULL;
  	pud = pud_offset(p4d, addr);
029c54b09   Ard Biesheuvel   mm/vmalloc.c: hug...
276
277
278
279
280
281
282
283
284
285
286
  
  	/*
  	 * Don't dereference bad PUD or PMD (below) entries. This will also
  	 * identify huge mappings, which we may encounter on architectures
  	 * that define CONFIG_HAVE_ARCH_HUGE_VMAP=y. Such regions will be
  	 * identified as vmalloc addresses by is_vmalloc_addr(), but are
  	 * not [unambiguously] associated with a struct page, so there is
  	 * no correct value to return for them.
  	 */
  	WARN_ON_ONCE(pud_bad(*pud));
  	if (pud_none(*pud) || pud_bad(*pud))
c2febafc6   Kirill A. Shutemov   mm: convert gener...
287
288
  		return NULL;
  	pmd = pmd_offset(pud, addr);
029c54b09   Ard Biesheuvel   mm/vmalloc.c: hug...
289
290
  	WARN_ON_ONCE(pmd_bad(*pmd));
  	if (pmd_none(*pmd) || pmd_bad(*pmd))
c2febafc6   Kirill A. Shutemov   mm: convert gener...
291
292
293
294
295
296
297
  		return NULL;
  
  	ptep = pte_offset_map(pmd, addr);
  	pte = *ptep;
  	if (pte_present(pte))
  		page = pte_page(pte);
  	pte_unmap(ptep);
add688fbd   malc   Revert "mm/vmallo...
298
  	return page;
48667e7a4   Christoph Lameter   Move vmalloc_to_p...
299
  }
add688fbd   malc   Revert "mm/vmallo...
300
  EXPORT_SYMBOL(vmalloc_to_page);
48667e7a4   Christoph Lameter   Move vmalloc_to_p...
301
302
  
  /*
add688fbd   malc   Revert "mm/vmallo...
303
   * Map a vmalloc()-space virtual address to the physical page frame number.
48667e7a4   Christoph Lameter   Move vmalloc_to_p...
304
   */
add688fbd   malc   Revert "mm/vmallo...
305
  unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
  {
  	/* The lookup result is converted as-is; no NULL check is done here. */
  	struct page *backing = vmalloc_to_page(vmalloc_addr);
  
  	return page_to_pfn(backing);
  }
add688fbd   malc   Revert "mm/vmallo...
309
  EXPORT_SYMBOL(vmalloc_to_pfn);
48667e7a4   Christoph Lameter   Move vmalloc_to_p...
310

db64fe022   Nick Piggin   mm: rewrite vmap ...
311
312
  
  /*** Global kva allocator ***/
bb850f4da   Uladzislau Rezki (Sony)   mm/vmap: add DEBU...
313
  #define DEBUG_AUGMENT_PROPAGATE_CHECK 0
a6cf4e0fe   Uladzislau Rezki (Sony)   mm/vmap: add DEBU...
314
  #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
bb850f4da   Uladzislau Rezki (Sony)   mm/vmap: add DEBU...
315

db64fe022   Nick Piggin   mm: rewrite vmap ...
316

db64fe022   Nick Piggin   mm: rewrite vmap ...
317
  static DEFINE_SPINLOCK(vmap_area_lock);
f1c4069e1   Joonsoo Kim   mm, vmalloc: expo...
318
319
  /* Export for kexec only */
  LIST_HEAD(vmap_area_list);
80c4bd7a5   Chris Wilson   mm/vmalloc: keep ...
320
  static LLIST_HEAD(vmap_purge_list);
89699605f   Nick Piggin   mm: vmap area cache
321
  static struct rb_root vmap_area_root = RB_ROOT;
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
322
  static bool vmap_initialized __read_mostly;
89699605f   Nick Piggin   mm: vmap area cache
323

68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
  /*
   * This kmem_cache is used for vmap_area objects. Instead of
   * allocating from slab we reuse an object from this cache to
   * make things faster. Especially in "no edge" splitting of
   * free block.
   */
  static struct kmem_cache *vmap_area_cachep;
  
  /*
   * This linked list is used in pair with free_vmap_area_root.
   * It gives O(1) access to prev/next to perform fast coalescing.
   */
  static LIST_HEAD(free_vmap_area_list);
  
  /*
   * This augmented red-black tree represents the free vmap space.
   * All vmap_area objects in this tree are sorted by va->va_start
   * address. It is used for allocation and merging when a vmap
   * object is released.
   *
   * Each vmap_area node contains a maximum available free block
   * of its sub-tree, right or left. Therefore it is possible to
   * find a lowest match of free area.
   */
  static struct rb_root free_vmap_area_root = RB_ROOT;
82dd23e84   Uladzislau Rezki (Sony)   mm/vmalloc.c: pre...
349
350
351
352
353
354
  /*
   * Preload a CPU with one object for "no edge" split case. The
   * aim is to get rid of allocations from the atomic context, thus
   * to use more permissive allocation masks.
   */
  static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
355
356
357
358
359
360
361
362
363
364
365
366
367
368
  /* Length of the address range described by @va (va_end - va_start). */
  static __always_inline unsigned long
  va_size(struct vmap_area *va)
  {
  	const unsigned long first = va->va_start;
  	const unsigned long last = va->va_end;
  
  	return last - first;
  }
  
  /* Cached subtree_max_size of the area owning @node, or 0 for a NULL node. */
  static __always_inline unsigned long
  get_subtree_max_size(struct rb_node *node)
  {
  	struct vmap_area *owner = rb_entry_safe(node, struct vmap_area, rb_node);
  
  	if (!owner)
  		return 0;
  
  	return owner->subtree_max_size;
  }
89699605f   Nick Piggin   mm: vmap area cache
369

68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
370
371
372
373
374
375
376
377
378
379
  /*
   * Recompute the largest free block available in the subtree rooted at
   * @va: the maximum of its own size and the cached maxima of both children.
   */
  static __always_inline unsigned long
  compute_subtree_max_size(struct vmap_area *va)
  {
  	const unsigned long own = va_size(va);
  	const unsigned long left = get_subtree_max_size(va->rb_node.rb_left);
  	const unsigned long right = get_subtree_max_size(va->rb_node.rb_right);
  
  	return max3(own, left, right);
  }
315cc066b   Michel Lespinasse   augmented rbtree:...
380
381
  /*
   * Declares free_vmap_area_rb_augment_cb, the callback set handed to
   * rb_insert_augmented()/rb_erase_augmented() for the free-area tree.
   * The arguments name the node type (struct vmap_area), its rb_node
   * member, the augmented field (subtree_max_size, unsigned long) and the
   * per-node function (va_size).
   * NOTE(review): the generated callback semantics come from
   * <linux/rbtree_augmented.h> - confirm there.
   */
  RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
  	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
382
383
384
385
  
  static void purge_vmap_area_lazy(void);
  static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
  static unsigned long lazy_max_pages(void);
db64fe022   Nick Piggin   mm: rewrite vmap ...
386

97105f0ab   Roman Gushchin   mm: vmalloc: show...
387
388
389
390
391
392
  static atomic_long_t nr_vmalloc_pages;
  
  /* Current value of the nr_vmalloc_pages counter. */
  unsigned long vmalloc_nr_pages(void)
  {
  	const long count = atomic_long_read(&nr_vmalloc_pages);
  
  	return count;
  }
db64fe022   Nick Piggin   mm: rewrite vmap ...
393
  static struct vmap_area *__find_vmap_area(unsigned long addr)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
394
  {
db64fe022   Nick Piggin   mm: rewrite vmap ...
395
396
397
398
399
400
401
402
  	struct rb_node *n = vmap_area_root.rb_node;
  
  	while (n) {
  		struct vmap_area *va;
  
  		va = rb_entry(n, struct vmap_area, rb_node);
  		if (addr < va->va_start)
  			n = n->rb_left;
cef2ac3f6   HATAYAMA Daisuke   vmalloc: make fin...
403
  		else if (addr >= va->va_end)
db64fe022   Nick Piggin   mm: rewrite vmap ...
404
405
406
407
408
409
410
  			n = n->rb_right;
  		else
  			return va;
  	}
  
  	return NULL;
  }
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
  /*
   * This function returns back addresses of parent node
   * and its left or right link for further processing.
   */
  static __always_inline struct rb_node **
  find_va_links(struct vmap_area *va,
  	struct rb_root *root, struct rb_node *from,
  	struct rb_node **parent)
  {
  	struct vmap_area *tmp_va;
  	struct rb_node **link;
  
  	if (root) {
  		link = &root->rb_node;
  		if (unlikely(!*link)) {
  			*parent = NULL;
  			return link;
  		}
  	} else {
  		link = &from;
  	}
db64fe022   Nick Piggin   mm: rewrite vmap ...
432

68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
433
434
435
436
437
438
439
  	/*
  	 * Go to the bottom of the tree. When we hit the last point
  	 * we end up with parent rb_node and correct direction, i name
  	 * it link, where the new va->rb_node will be attached to.
  	 */
  	do {
  		tmp_va = rb_entry(*link, struct vmap_area, rb_node);
db64fe022   Nick Piggin   mm: rewrite vmap ...
440

68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
441
442
443
444
445
446
447
448
449
450
451
  		/*
  		 * During the traversal we also do some sanity check.
  		 * Trigger the BUG() if there are sides(left/right)
  		 * or full overlaps.
  		 */
  		if (va->va_start < tmp_va->va_end &&
  				va->va_end <= tmp_va->va_start)
  			link = &(*link)->rb_left;
  		else if (va->va_end > tmp_va->va_start &&
  				va->va_start >= tmp_va->va_end)
  			link = &(*link)->rb_right;
db64fe022   Nick Piggin   mm: rewrite vmap ...
452
453
  		else
  			BUG();
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
  	} while (*link);
  
  	*parent = &tmp_va->rb_node;
  	return link;
  }
  
  static __always_inline struct list_head *
  get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
  {
  	struct list_head *list;
  
  	if (unlikely(!parent))
  		/*
  		 * The red-black tree where we try to find VA neighbors
  		 * before merging or inserting is empty, i.e. it means
  		 * there is no free vmap space. Normally it does not
  		 * happen but we handle this case anyway.
  		 */
  		return NULL;
  
  	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
  	return (&parent->rb_right == link ? list->next : list);
  }
  
  static __always_inline void
  link_va(struct vmap_area *va, struct rb_root *root,
  	struct rb_node *parent, struct rb_node **link, struct list_head *head)
  {
  	/*
  	 * VA is still not in the list, but we can
  	 * identify its future previous list_head node.
  	 */
  	if (likely(parent)) {
  		head = &rb_entry(parent, struct vmap_area, rb_node)->list;
  		if (&parent->rb_right != link)
  			head = head->prev;
db64fe022   Nick Piggin   mm: rewrite vmap ...
490
  	}
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
  	/* Insert to the rb-tree */
  	rb_link_node(&va->rb_node, parent, link);
  	if (root == &free_vmap_area_root) {
  		/*
  		 * Some explanation here. Just perform simple insertion
  		 * to the tree. We do not set va->subtree_max_size to
  		 * its current size before calling rb_insert_augmented().
  		 * It is because of we populate the tree from the bottom
  		 * to parent levels when the node _is_ in the tree.
  		 *
  		 * Therefore we set subtree_max_size to zero after insertion,
  		 * to let __augment_tree_propagate_from() puts everything to
  		 * the correct order later on.
  		 */
  		rb_insert_augmented(&va->rb_node,
  			root, &free_vmap_area_rb_augment_cb);
  		va->subtree_max_size = 0;
  	} else {
  		rb_insert_color(&va->rb_node, root);
  	}
db64fe022   Nick Piggin   mm: rewrite vmap ...
511

68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
512
513
  	/* Address-sort this list */
  	list_add(&va->list, head);
db64fe022   Nick Piggin   mm: rewrite vmap ...
514
  }
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
515
516
517
  static __always_inline void
  unlink_va(struct vmap_area *va, struct rb_root *root)
  {
460e42d19   Uladzislau Rezki (Sony)   mm/vmalloc.c: swi...
518
519
  	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
  		return;
db64fe022   Nick Piggin   mm: rewrite vmap ...
520

460e42d19   Uladzislau Rezki (Sony)   mm/vmalloc.c: swi...
521
522
523
524
525
526
527
528
  	if (root == &free_vmap_area_root)
  		rb_erase_augmented(&va->rb_node,
  			root, &free_vmap_area_rb_augment_cb);
  	else
  		rb_erase(&va->rb_node, root);
  
  	list_del(&va->list);
  	RB_CLEAR_NODE(&va->rb_node);
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
529
  }
bb850f4da   Uladzislau Rezki (Sony)   mm/vmap: add DEBU...
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
  #if DEBUG_AUGMENT_PROPAGATE_CHECK
  /*
   * Debug helper: verify the augmented data of the subtree rooted at @n.
   *
   * For @n, walk down following the child whose cached maximum equals
   * @n's subtree_max_size and check that some node on that path really
   * has that size. A mismatch is reported with pr_emerg(). The check is
   * then repeated recursively for both children.
   */
  static void
  augment_tree_propagate_check(struct rb_node *n)
  {
  	struct vmap_area *va;
  	struct rb_node *node;
  	unsigned long size;
  	bool found = false;
  
  	if (n == NULL)
  		return;
  
  	va = rb_entry(n, struct vmap_area, rb_node);
  	size = va->subtree_max_size;
  	node = n;
  
  	while (node) {
  		va = rb_entry(node, struct vmap_area, rb_node);
  
  		if (get_subtree_max_size(node->rb_left) == size) {
  			node = node->rb_left;
  		} else {
  			if (va_size(va) == size) {
  				found = true;
  				break;
  			}
  
  			node = node->rb_right;
  		}
  	}
  
  	if (!found) {
  		va = rb_entry(n, struct vmap_area, rb_node);
  		pr_emerg("tree is corrupted: %lu, %lu\n",
  			va_size(va), va->subtree_max_size);
  	}
  
  	augment_tree_propagate_check(n->rb_left);
  	augment_tree_propagate_check(n->rb_right);
  }
  #endif
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
  /*
   * This function populates subtree_max_size from bottom to upper
   * levels starting from VA point. The propagation must be done
   * when VA size is modified by changing its va_start/va_end. Or
   * in case of newly inserting of VA to the tree.
   *
   * It means that augment_tree_propagate_from() must be called:
   * - After VA has been inserted to the tree(free path);
   * - After VA has been shrunk(allocation path);
   * - After VA has been increased(merging path).
   *
   * Please note that, it does not mean that upper parent nodes
   * and their subtree_max_size are recalculated all the time up
   * to the root node.
   *
   *       4--8
   *        /\
   *       /  \
   *      /    \
   *    2--2  8--8
   *
   * For example, if we modify the node 4, shrinking it to 2, then
   * no modification is required. If we shrink the node 2 to 1
   * its subtree_max_size is updated only, and set to 1. If we shrink
   * the node 8 to 6, then its subtree_max_size is set to 6 and parent
   * node becomes 4--6.
   */
  static __always_inline void
  augment_tree_propagate_from(struct vmap_area *va)
  {
  	struct rb_node *cur;
  
  	for (cur = &va->rb_node; cur; cur = rb_parent(cur)) {
  		struct vmap_area *area =
  			rb_entry(cur, struct vmap_area, rb_node);
  		const unsigned long fresh = compute_subtree_max_size(area);
  
  		/*
  		 * When the recomputed maximum matches the cached one the
  		 * tree is already propagated correctly from here upwards,
  		 * so stop early to save cycles.
  		 */
  		if (area->subtree_max_size == fresh)
  			break;
  
  		area->subtree_max_size = fresh;
  	}
  
  #if DEBUG_AUGMENT_PROPAGATE_CHECK
  	augment_tree_propagate_check(free_vmap_area_root.rb_node);
  #endif
  }
  
  /*
   * Insert @va into @root (searching from the root) and into the list;
   * @head is the list predecessor used when the tree is empty.
   */
  static void
  insert_vmap_area(struct vmap_area *va,
  	struct rb_root *root, struct list_head *head)
  {
  	struct rb_node *parent;
  	struct rb_node **link = find_va_links(va, root, NULL, &parent);
  
  	link_va(va, root, parent, link, head);
  }
  
  /*
   * Insert @va into @root and the list, then propagate the augmented
   * subtree_max_size upwards. When @from is non-NULL the position search
   * starts at that node instead of the tree root.
   */
  static void
  insert_vmap_area_augment(struct vmap_area *va,
  	struct rb_node *from, struct rb_root *root,
  	struct list_head *head)
  {
  	struct rb_node *parent;
  	struct rb_node **link;
  
  	/* A NULL root makes find_va_links() begin at @from. */
  	link = find_va_links(va, from ? NULL : root, from, &parent);
  
  	link_va(va, root, parent, link, head);
  	augment_tree_propagate_from(va);
  }
  
  /*
   * Merge de-allocated chunk of VA memory with previous
   * and next free blocks. If coalesce is not done a new
   * free area is inserted. If VA has been merged, it is
   * freed.
   */
  static __always_inline void
  merge_or_add_vmap_area(struct vmap_area *va,
  	struct rb_root *root, struct list_head *head)
  {
  	struct vmap_area *sibling;
  	struct list_head *next;
  	struct rb_node **link;
  	struct rb_node *parent;
  	bool merged = false;
  
  	/*
  	 * Find a place in the tree where VA potentially will be
  	 * inserted, unless it is merged with its sibling/siblings.
  	 */
  	link = find_va_links(va, root, NULL, &parent);
  
  	/*
  	 * Get next node of VA to check if merging can be done.
  	 */
  	next = get_va_next_sibling(parent, link);
  	if (unlikely(next == NULL))
  		goto insert;
  
  	/*
  	 * start            end
  	 * |                |
  	 * |<------VA------>|<-----Next----->|
  	 *                  |                |
  	 *                  start            end
  	 */
  	if (next != head) {
  		sibling = list_entry(next, struct vmap_area, list);
  		if (sibling->va_start == va->va_end) {
  			sibling->va_start = va->va_start;
  
  			/* Check and update the tree if needed. */
  			augment_tree_propagate_from(sibling);
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
  			/* Free vmap_area object. */
  			kmem_cache_free(vmap_area_cachep, va);
  
  			/* Point to the new merged area. */
  			va = sibling;
  			merged = true;
  		}
  	}
  
  	/*
  	 * start            end
  	 * |                |
  	 * |<-----Prev----->|<------VA------>|
  	 *                  |                |
  	 *                  start            end
  	 */
  	if (next->prev != head) {
  		sibling = list_entry(next->prev, struct vmap_area, list);
  		if (sibling->va_end == va->va_start) {
  			sibling->va_end = va->va_end;
  
  			/* Check and update the tree if needed. */
  			augment_tree_propagate_from(sibling);
54f63d9d8   Uladzislau Rezki (Sony)   mm/vmalloc.c: get...
721
722
  			if (merged)
  				unlink_va(va, root);
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
723
724
725
  
  			/* Free vmap_area object. */
  			kmem_cache_free(vmap_area_cachep, va);
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
  			return;
  		}
  	}
  
  insert:
  	if (!merged) {
  		link_va(va, root, parent, link, head);
  		augment_tree_propagate_from(va);
  	}
  }
  
  /*
   * Check whether a request of "size" bytes, aligned to "align" and placed
   * no lower than "vstart", fits inside the free area "va".
   */
  static __always_inline bool
  is_within_this_va(struct vmap_area *va, unsigned long size,
  	unsigned long align, unsigned long vstart)
  {
  	unsigned long base, start, end;
  
  	/* The candidate begins at the higher of the VA start and vstart. */
  	base = (va->va_start > vstart) ? va->va_start : vstart;
  	start = ALIGN(base, align);
  	end = start + size;
  
  	/* A big size or alignment can wrap the address around. */
  	if (end < start || start < vstart)
  		return false;
  
  	return end <= va->va_end;
  }
  
  /*
   * Find the first free block (lowest start address) in the free tree
   * that can satisfy a request described by the passed parameters.
   *
   * Returns the matching free vmap_area, or NULL when the walk ends
   * without finding one.
   */
  static __always_inline struct vmap_area *
  find_vmap_lowest_match(unsigned long size,
  	unsigned long align, unsigned long vstart)
  {
  	struct vmap_area *va;
  	struct rb_node *node;
  	unsigned long length;
  
  	/* Start from the root. */
  	node = free_vmap_area_root.rb_node;
  
  	/* Adjust the search size for alignment overhead. */
  	length = size + align - 1;
  
  	while (node) {
  		va = rb_entry(node, struct vmap_area, rb_node);
  
  		if (get_subtree_max_size(node->rb_left) >= length &&
  				vstart < va->va_start) {
  			node = node->rb_left;
  		} else {
  			if (is_within_this_va(va, size, align, vstart))
  				return va;
  
  			/*
  			 * Does not make sense to go deeper towards the right
  			 * sub-tree if it does not have a free block that is
  			 * equal or bigger to the requested search length.
  			 */
  			if (get_subtree_max_size(node->rb_right) >= length) {
  				node = node->rb_right;
  				continue;
  			}
  
  			/*
  			 * OK. We roll back and find the first right sub-tree,
  			 * that will satisfy the search criteria. It can happen
  			 * only once due to "vstart" restriction.
  			 */
  			while ((node = rb_parent(node))) {
  				va = rb_entry(node, struct vmap_area, rb_node);
  				if (is_within_this_va(va, size, align, vstart))
  					return va;
  
  				if (get_subtree_max_size(node->rb_right) >= length &&
  						vstart <= va->va_start) {
  					node = node->rb_right;
  					break;
  				}
  			}
  		}
  	}
  
  	return NULL;
  }
a6cf4e0fe   Uladzislau Rezki (Sony)   mm/vmap: add DEBU...
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
  #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
  #include <linux/random.h>
  
  /*
   * Debug helper: walk free_vmap_area_list in list order and return the
   * first entry accepted by is_within_this_va(), or NULL if none is.
   */
  static struct vmap_area *
  find_vmap_lowest_linear_match(unsigned long size,
  	unsigned long align, unsigned long vstart)
  {
  	struct vmap_area *va;
  
  	list_for_each_entry(va, &free_vmap_area_list, list) {
  		if (is_within_this_va(va, size, align, vstart))
  			return va;
  	}
  
  	return NULL;
  }
  
  /*
   * Debug helper: pick a random "vstart" above VMALLOC_START and verify
   * that the tree lookup and the linear list lookup return the same free
   * area for a request of "size" bytes with an alignment of 1. A mismatch
   * is reported with pr_emerg().
   */
  static void
  find_vmap_lowest_match_check(unsigned long size)
  {
  	struct vmap_area *va_1, *va_2;
  	unsigned long vstart;
  	unsigned int rnd;
  
  	get_random_bytes(&rnd, sizeof(rnd));
  	vstart = VMALLOC_START + rnd;
  
  	va_1 = find_vmap_lowest_match(size, 1, vstart);
  	va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
  
  	/* The format string must stay on one line, ending in "\n". */
  	if (va_1 != va_2)
  		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
  			va_1, va_2, vstart);
  }
  #endif
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
  /* How a requested range sits inside the free area it is carved from. */
  enum fit_type {
  	/* The range does not lie within the free area. */
  	NOTHING_FIT = 0,
  	/* Full fit: the range covers the whole free area. */
  	FL_FIT_TYPE = 1,
  	/* Left edge fit: the range starts where the free area starts. */
  	LE_FIT_TYPE = 2,
  	/* Right edge fit: the range ends where the free area ends. */
  	RE_FIT_TYPE = 3,
  	/* No edge fit: the range touches neither edge. */
  	NE_FIT_TYPE = 4
  };
  
  static __always_inline enum fit_type
  classify_va_fit_type(struct vmap_area *va,
  	unsigned long nva_start_addr, unsigned long size)
  {
  	enum fit_type type;
  
  	/* Check if it is within VA. */
  	if (nva_start_addr < va->va_start ||
  			nva_start_addr + size > va->va_end)
  		return NOTHING_FIT;
  
  	/* Now classify. */
  	if (va->va_start == nva_start_addr) {
  		if (va->va_end == nva_start_addr + size)
  			type = FL_FIT_TYPE;
  		else
  			type = LE_FIT_TYPE;
  	} else if (va->va_end == nva_start_addr + size) {
  		type = RE_FIT_TYPE;
  	} else {
  		type = NE_FIT_TYPE;
  	}
  
  	return type;
  }
  
  static __always_inline int
  adjust_va_to_fit_type(struct vmap_area *va,
  	unsigned long nva_start_addr, unsigned long size,
  	enum fit_type type)
  {
2c9292336   Arnd Bergmann   mm/vmalloc.c: avo...
893
  	struct vmap_area *lva = NULL;
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
  
  	if (type == FL_FIT_TYPE) {
  		/*
  		 * No need to split VA, it fully fits.
  		 *
  		 * |               |
  		 * V      NVA      V
  		 * |---------------|
  		 */
  		unlink_va(va, &free_vmap_area_root);
  		kmem_cache_free(vmap_area_cachep, va);
  	} else if (type == LE_FIT_TYPE) {
  		/*
  		 * Split left edge of fit VA.
  		 *
  		 * |       |
  		 * V  NVA  V   R
  		 * |-------|-------|
  		 */
  		va->va_start += size;
  	} else if (type == RE_FIT_TYPE) {
  		/*
  		 * Split right edge of fit VA.
  		 *
  		 *         |       |
  		 *     L   V  NVA  V
  		 * |-------|-------|
  		 */
  		va->va_end = nva_start_addr;
  	} else if (type == NE_FIT_TYPE) {
  		/*
  		 * Split no edge of fit VA.
  		 *
  		 *     |       |
  		 *   L V  NVA  V R
  		 * |---|-------|---|
  		 */
82dd23e84   Uladzislau Rezki (Sony)   mm/vmalloc.c: pre...
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
  		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
  		if (unlikely(!lva)) {
  			/*
  			 * For percpu allocator we do not do any pre-allocation
  			 * and leave it as it is. The reason is it most likely
  			 * never ends up with NE_FIT_TYPE splitting. In case of
  			 * percpu allocations offsets and sizes are aligned to
  			 * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
  			 * are its main fitting cases.
  			 *
  			 * There are a few exceptions though, as an example it is
  			 * a first allocation (early boot up) when we have "one"
  			 * big free space that has to be split.
  			 */
  			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
  			if (!lva)
  				return -1;
  		}
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
  
  		/*
  		 * Build the remainder.
  		 */
  		lva->va_start = va->va_start;
  		lva->va_end = nva_start_addr;
  
  		/*
  		 * Shrink this VA to remaining size.
  		 */
  		va->va_start = nva_start_addr + size;
  	} else {
  		return -1;
  	}
  
  	if (type != FL_FIT_TYPE) {
  		augment_tree_propagate_from(va);
2c9292336   Arnd Bergmann   mm/vmalloc.c: avo...
966
  		if (lva)	/* type == NE_FIT_TYPE */
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
967
968
969
970
971
972
973
974
975
976
977
978
979
  			insert_vmap_area_augment(lva, &va->rb_node,
  				&free_vmap_area_root, &free_vmap_area_list);
  	}
  
  	return 0;
  }
  
  /*
   * Returns a start address of the newly allocated area, if success.
   * Otherwise a vend is returned that indicates failure.
   */
  static __always_inline unsigned long
  __alloc_vmap_area(unsigned long size, unsigned long align,
cacca6baf   Uladzislau Rezki (Sony)   mm/vmalloc.c: rem...
980
  	unsigned long vstart, unsigned long vend)
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
  {
  	unsigned long nva_start_addr;
  	struct vmap_area *va;
  	enum fit_type type;
  	int ret;
  
  	va = find_vmap_lowest_match(size, align, vstart);
  	if (unlikely(!va))
  		return vend;
  
  	if (va->va_start > vstart)
  		nva_start_addr = ALIGN(va->va_start, align);
  	else
  		nva_start_addr = ALIGN(vstart, align);
  
  	/* Check the "vend" restriction. */
  	if (nva_start_addr + size > vend)
  		return vend;
  
  	/* Classify what we have found. */
  	type = classify_va_fit_type(va, nva_start_addr, size);
  	if (WARN_ON_ONCE(type == NOTHING_FIT))
  		return vend;
  
  	/* Update the free vmap_area. */
  	ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
  	if (ret)
  		return vend;
a6cf4e0fe   Uladzislau Rezki (Sony)   mm/vmap: add DEBU...
1009
1010
1011
  #if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
  	find_vmap_lowest_match_check(size);
  #endif
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
1012
1013
  	return nva_start_addr;
  }
4da56b99d   Chris Wilson   mm/vmap: Add a no...
1014

db64fe022   Nick Piggin   mm: rewrite vmap ...
1015
1016
1017
1018
1019
1020
1021
1022
1023
  /*
   * Allocate a region of KVA of the specified size and alignment, within the
   * vstart and vend.
   *
   * Returns the new vmap_area on success. Returns ERR_PTR(-ENOMEM) if the
   * vmap_area object cannot be allocated, and ERR_PTR(-EBUSY) if vmap is
   * not initialized yet or no address range could be found even after
   * purging lazily freed areas and calling the vmap purge notifiers.
   */
  static struct vmap_area *alloc_vmap_area(unsigned long size,
  				unsigned long align,
  				unsigned long vstart, unsigned long vend,
  				int node, gfp_t gfp_mask)
  {
  	struct vmap_area *va, *pva;
  	unsigned long addr;
  	int purged = 0;
  
  	BUG_ON(!size);
  	BUG_ON(offset_in_page(size));
  	BUG_ON(!is_power_of_2(align));
  
  	if (unlikely(!vmap_initialized))
  		return ERR_PTR(-EBUSY);
  
  	might_sleep();
  
  	va = kmem_cache_alloc_node(vmap_area_cachep,
  			gfp_mask & GFP_RECLAIM_MASK, node);
  	if (unlikely(!va))
  		return ERR_PTR(-ENOMEM);
  
  	/*
  	 * Only scan the relevant parts containing pointers to other objects
  	 * to avoid false negatives.
  	 */
  	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
  
  retry:
  	/*
  	 * Preload this CPU with one extra vmap_area object to ensure
  	 * that we have it available when fit type of free area is
  	 * NE_FIT_TYPE.
  	 *
  	 * The preload is done in non-atomic context, thus it allows us
  	 * to use more permissive allocation masks to be more stable under
  	 * low memory condition and high memory pressure.
  	 *
  	 * Even if it fails we do not really care about that. Just proceed
  	 * as it is. "overflow" path will refill the cache we allocate from.
  	 */
  	preempt_disable();
  	if (!__this_cpu_read(ne_fit_preload_node)) {
  		preempt_enable();
  		pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
  		preempt_disable();
  
  		/* The slot may have been filled meanwhile; drop our spare. */
  		if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) {
  			if (pva)
  				kmem_cache_free(vmap_area_cachep, pva);
  		}
  	}
  
  	spin_lock(&vmap_area_lock);
  	preempt_enable();
  
  	/*
  	 * If an allocation fails, the "vend" address is
  	 * returned. Therefore trigger the overflow path.
  	 */
  	addr = __alloc_vmap_area(size, align, vstart, vend);
  	if (unlikely(addr == vend))
  		goto overflow;
  
  	va->va_start = addr;
  	va->va_end = addr + size;
  	va->vm = NULL;
  	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
  
  	spin_unlock(&vmap_area_lock);
  
  	BUG_ON(!IS_ALIGNED(va->va_start, align));
  	BUG_ON(va->va_start < vstart);
  	BUG_ON(va->va_end > vend);
  
  	return va;
  
  overflow:
  	spin_unlock(&vmap_area_lock);
  	if (!purged) {
  		purge_vmap_area_lazy();
  		purged = 1;
  		goto retry;
  	}
  
  	if (gfpflags_allow_blocking(gfp_mask)) {
  		unsigned long freed = 0;
  		blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
  		if (freed > 0) {
  			purged = 0;
  			goto retry;
  		}
  	}
  
  	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
  		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
  			size);
  
  	kmem_cache_free(vmap_area_cachep, va);
  	return ERR_PTR(-EBUSY);
  }
4da56b99d   Chris Wilson   mm/vmap: Add a no...
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
  /*
   * Add "nb" to vmap_notify_list. The result of
   * blocking_notifier_chain_register() is passed back unchanged.
   */
  int register_vmap_purge_notifier(struct notifier_block *nb)
  {
  	int err;
  
  	err = blocking_notifier_chain_register(&vmap_notify_list, nb);
  	return err;
  }
  EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);
  
  /*
   * Remove "nb" from vmap_notify_list. The result of
   * blocking_notifier_chain_unregister() is passed back unchanged.
   */
  int unregister_vmap_purge_notifier(struct notifier_block *nb)
  {
  	int err;
  
  	err = blocking_notifier_chain_unregister(&vmap_notify_list, nb);
  	return err;
  }
  EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
db64fe022   Nick Piggin   mm: rewrite vmap ...
1124
1125
  static void __free_vmap_area(struct vmap_area *va)
  {
ca23e405e   Tejun Heo   vmalloc: implemen...
1126
  	/*
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
1127
  	 * Remove from the busy tree/list.
ca23e405e   Tejun Heo   vmalloc: implemen...
1128
  	 */
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
1129
  	unlink_va(va, &vmap_area_root);
ca23e405e   Tejun Heo   vmalloc: implemen...
1130

68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
1131
1132
1133
1134
1135
  	/*
  	 * Merge VA with its neighbors, otherwise just add it.
  	 */
  	merge_or_add_vmap_area(va,
  		&free_vmap_area_root, &free_vmap_area_list);
db64fe022   Nick Piggin   mm: rewrite vmap ...
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
  }
  
  /*
   * Free a region of KVA allocated by alloc_vmap_area.
   *
   * Takes vmap_area_lock around __free_vmap_area(), which does the
   * actual work.
   */
  static void free_vmap_area(struct vmap_area *va)
  {
  	spin_lock(&vmap_area_lock);
  	__free_vmap_area(va);
  	spin_unlock(&vmap_area_lock);
  }
  
  /*
   * Clear the pagetable entries of a given vmap_area, i.e. of the range
   * [va->va_start, va->va_end) passed to vunmap_page_range().
   */
  static void unmap_vmap_area(struct vmap_area *va)
  {
  	vunmap_page_range(va->va_start, va->va_end);
  }
  
  /*
   * lazy_max_pages is the maximum amount of virtual address space we gather up
   * before attempting to purge with a TLB flush.
   *
   * There is a tradeoff here: a larger number will cover more kernel page tables
   * and take slightly longer to purge, but it will linearly reduce the number of
   * global TLB flushes that must be performed. It would seem natural to scale
   * this number up linearly with the number of CPUs (because vmapping activity
   * could also scale linearly with the number of CPUs), however it is likely
   * that in practice, workloads might be constrained in other ways that mean
   * vmap activity will not scale linearly with CPUs. Also, I want to be
   * conservative and not introduce a big latency on huge systems, so go with
   * a less aggressive log scale. It will still be an improvement over the old
   * code, and it will be simple to change the scale factor if we find that it
   * becomes a problem on bigger systems.
   */
  static unsigned long lazy_max_pages(void)
  {
  	/* fls() of the online CPU count gives the log scale factor. */
  	unsigned int log = fls(num_online_cpus());
  
  	/* 32MB worth of pages per step of the log scale. */
  	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
  }
4d36e6f80   Uladzislau Rezki (Sony)   mm/vmalloc.c: con...
1180
  static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
db64fe022   Nick Piggin   mm: rewrite vmap ...
1181

0574ecd14   Christoph Hellwig   mm: refactor __pu...
1182
1183
1184
1185
1186
  /*
   * Serialize vmap purging.  There is no actual critical section protected
   * by this lock, but we want to avoid concurrent calls for performance
   * reasons and to make the pcpu_get_vm_areas more deterministic.
   */
f9e099776   Christoph Hellwig   mm: turn vmap_pur...
1187
  static DEFINE_MUTEX(vmap_purge_lock);
0574ecd14   Christoph Hellwig   mm: refactor __pu...
1188

02b709df8   Nick Piggin   mm: purge fragmen...
1189
1190
  /* for per-CPU blocks */
  static void purge_fragmented_blocks_allcpus(void);
db64fe022   Nick Piggin   mm: rewrite vmap ...
1191
  /*
3ee48b6af   Cliff Wickman   mm, x86: Saving v...
1192
1193
1194
1195
1196
   * called before a call to iounmap() if the caller wants vm_area_struct's
   * immediately freed.
   */
  void set_iounmap_nonlazy(void)
  {
4d36e6f80   Uladzislau Rezki (Sony)   mm/vmalloc.c: con...
1197
  	atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
3ee48b6af   Cliff Wickman   mm, x86: Saving v...
1198
1199
1200
  }
  
  /*
db64fe022   Nick Piggin   mm: rewrite vmap ...
1201
   * Purges all lazily-freed vmap areas.
db64fe022   Nick Piggin   mm: rewrite vmap ...
1202
   */
0574ecd14   Christoph Hellwig   mm: refactor __pu...
1203
  static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
db64fe022   Nick Piggin   mm: rewrite vmap ...
1204
  {
4d36e6f80   Uladzislau Rezki (Sony)   mm/vmalloc.c: con...
1205
  	unsigned long resched_threshold;
80c4bd7a5   Chris Wilson   mm/vmalloc: keep ...
1206
  	struct llist_node *valist;
db64fe022   Nick Piggin   mm: rewrite vmap ...
1207
  	struct vmap_area *va;
cbb766766   Vegard Nossum   mm: fix lazy vmap...
1208
  	struct vmap_area *n_va;
db64fe022   Nick Piggin   mm: rewrite vmap ...
1209

0574ecd14   Christoph Hellwig   mm: refactor __pu...
1210
  	lockdep_assert_held(&vmap_purge_lock);
02b709df8   Nick Piggin   mm: purge fragmen...
1211

80c4bd7a5   Chris Wilson   mm/vmalloc: keep ...
1212
  	valist = llist_del_all(&vmap_purge_list);
68571be99   Uladzislau Rezki (Sony)   mm/vmalloc.c: add...
1213
1214
1215
1216
  	if (unlikely(valist == NULL))
  		return false;
  
  	/*
3f8fd02b1   Joerg Roedel   mm/vmalloc: Sync ...
1217
1218
1219
  	 * First make sure the mappings are removed from all page-tables
  	 * before they are freed.
  	 */
66f28e110   Joerg Roedel   x86/mm: split vma...
1220
  	vmalloc_sync_unmappings();
3f8fd02b1   Joerg Roedel   mm/vmalloc: Sync ...
1221
1222
  
  	/*
68571be99   Uladzislau Rezki (Sony)   mm/vmalloc.c: add...
1223
1224
1225
  	 * TODO: to calculate a flush range without looping.
  	 * The list can be up to lazy_max_pages() elements.
  	 */
80c4bd7a5   Chris Wilson   mm/vmalloc: keep ...
1226
  	llist_for_each_entry(va, valist, purge_list) {
0574ecd14   Christoph Hellwig   mm: refactor __pu...
1227
1228
1229
1230
  		if (va->va_start < start)
  			start = va->va_start;
  		if (va->va_end > end)
  			end = va->va_end;
db64fe022   Nick Piggin   mm: rewrite vmap ...
1231
  	}
db64fe022   Nick Piggin   mm: rewrite vmap ...
1232

0574ecd14   Christoph Hellwig   mm: refactor __pu...
1233
  	flush_tlb_kernel_range(start, end);
4d36e6f80   Uladzislau Rezki (Sony)   mm/vmalloc.c: con...
1234
  	resched_threshold = lazy_max_pages() << 1;
db64fe022   Nick Piggin   mm: rewrite vmap ...
1235

0574ecd14   Christoph Hellwig   mm: refactor __pu...
1236
  	spin_lock(&vmap_area_lock);
763b218dd   Joel Fernandes   mm: add preempt p...
1237
  	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
4d36e6f80   Uladzislau Rezki (Sony)   mm/vmalloc.c: con...
1238
  		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
763b218dd   Joel Fernandes   mm: add preempt p...
1239

dd3b8353b   Uladzislau Rezki (Sony)   mm/vmalloc: do no...
1240
1241
1242
1243
1244
1245
1246
  		/*
  		 * Finally insert or merge lazily-freed area. It is
  		 * detached and there is no need to "unlink" it from
  		 * anything.
  		 */
  		merge_or_add_vmap_area(va,
  			&free_vmap_area_root, &free_vmap_area_list);
4d36e6f80   Uladzislau Rezki (Sony)   mm/vmalloc.c: con...
1247
  		atomic_long_sub(nr, &vmap_lazy_nr);
68571be99   Uladzislau Rezki (Sony)   mm/vmalloc.c: add...
1248

4d36e6f80   Uladzislau Rezki (Sony)   mm/vmalloc.c: con...
1249
  		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
68571be99   Uladzislau Rezki (Sony)   mm/vmalloc.c: add...
1250
  			cond_resched_lock(&vmap_area_lock);
763b218dd   Joel Fernandes   mm: add preempt p...
1251
  	}
0574ecd14   Christoph Hellwig   mm: refactor __pu...
1252
1253
  	spin_unlock(&vmap_area_lock);
  	return true;
db64fe022   Nick Piggin   mm: rewrite vmap ...
1254
1255
1256
  }
  
  /*
496850e5f   Nick Piggin   mm: vmalloc failu...
1257
1258
1259
1260
1261
   * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
   * is already purging.
   */
  static void try_purge_vmap_area_lazy(void)
  {
f9e099776   Christoph Hellwig   mm: turn vmap_pur...
1262
  	if (mutex_trylock(&vmap_purge_lock)) {
0574ecd14   Christoph Hellwig   mm: refactor __pu...
1263
  		__purge_vmap_area_lazy(ULONG_MAX, 0);
f9e099776   Christoph Hellwig   mm: turn vmap_pur...
1264
  		mutex_unlock(&vmap_purge_lock);
0574ecd14   Christoph Hellwig   mm: refactor __pu...
1265
  	}
496850e5f   Nick Piggin   mm: vmalloc failu...
1266
1267
1268
  }
  
  /*
db64fe022   Nick Piggin   mm: rewrite vmap ...
1269
1270
1271
1272
   * Kick off a purge of the outstanding lazy areas.
   */
  static void purge_vmap_area_lazy(void)
  {
f9e099776   Christoph Hellwig   mm: turn vmap_pur...
1273
  	mutex_lock(&vmap_purge_lock);
0574ecd14   Christoph Hellwig   mm: refactor __pu...
1274
1275
  	purge_fragmented_blocks_allcpus();
  	__purge_vmap_area_lazy(ULONG_MAX, 0);
f9e099776   Christoph Hellwig   mm: turn vmap_pur...
1276
  	mutex_unlock(&vmap_purge_lock);
db64fe022   Nick Piggin   mm: rewrite vmap ...
1277
1278
1279
  }
  
  /*
64141da58   Jeremy Fitzhardinge   vmalloc: eagerly ...
1280
1281
1282
   * Free a vmap area, caller ensuring that the area has been unmapped
   * and flush_cache_vunmap had been called for the correct range
   * previously.
db64fe022   Nick Piggin   mm: rewrite vmap ...
1283
   */
64141da58   Jeremy Fitzhardinge   vmalloc: eagerly ...
1284
  static void free_vmap_area_noflush(struct vmap_area *va)
db64fe022   Nick Piggin   mm: rewrite vmap ...
1285
  {
4d36e6f80   Uladzislau Rezki (Sony)   mm/vmalloc.c: con...
1286
  	unsigned long nr_lazy;
80c4bd7a5   Chris Wilson   mm/vmalloc: keep ...
1287

dd3b8353b   Uladzislau Rezki (Sony)   mm/vmalloc: do no...
1288
1289
1290
  	spin_lock(&vmap_area_lock);
  	unlink_va(va, &vmap_area_root);
  	spin_unlock(&vmap_area_lock);
4d36e6f80   Uladzislau Rezki (Sony)   mm/vmalloc.c: con...
1291
1292
  	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
  				PAGE_SHIFT, &vmap_lazy_nr);
80c4bd7a5   Chris Wilson   mm/vmalloc: keep ...
1293
1294
1295
1296
1297
  
  	/* After this point, we may free va at any time */
  	llist_add(&va->purge_list, &vmap_purge_list);
  
  	if (unlikely(nr_lazy > lazy_max_pages()))
496850e5f   Nick Piggin   mm: vmalloc failu...
1298
  		try_purge_vmap_area_lazy();
db64fe022   Nick Piggin   mm: rewrite vmap ...
1299
  }
b29acbdcf   Nick Piggin   mm: vmalloc fix l...
1300
1301
1302
1303
1304
1305
  /*
   * Free and unmap a vmap area
   *
   * Flushes the cache for the range, clears its page table entries and
   * queues the area for lazy freeing. With debug_pagealloc enabled the
   * TLB is flushed for the range right away.
   */
  static void free_unmap_vmap_area(struct vmap_area *va)
  {
  	flush_cache_vunmap(va->va_start, va->va_end);
  	unmap_vmap_area(va);
  	if (debug_pagealloc_enabled_static())
  		flush_tlb_kernel_range(va->va_start, va->va_end);
  
  	free_vmap_area_noflush(va);
  }
db64fe022   Nick Piggin   mm: rewrite vmap ...
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
  /*
   * Locked wrapper around __find_vmap_area(): performs the lookup for
   * "addr" with vmap_area_lock held and returns its result unchanged.
   */
  static struct vmap_area *find_vmap_area(unsigned long addr)
  {
  	struct vmap_area *va;
  
  	spin_lock(&vmap_area_lock);
  	va = __find_vmap_area(addr);
  	spin_unlock(&vmap_area_lock);
  
  	return va;
  }
db64fe022   Nick Piggin   mm: rewrite vmap ...
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
  /*** Per cpu kva allocator ***/
  
  /*
   * vmap space is limited especially on 32 bit architectures. Ensure there is
   * room for at least 16 percpu vmap blocks per CPU.
   */
  /*
   * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
   * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
   * instead (we just need a rough idea)
   */
  #if BITS_PER_LONG == 32
  #define VMALLOC_SPACE		(128UL*1024*1024)
  #else
  #define VMALLOC_SPACE		(128UL*1024*1024*1024)
  #endif
  
  #define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
  #define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
  #define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
  #define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
  #define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
  #define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
  /*
   * Pages per vmap block: the 16-blocks-per-CPU share of the guessed
   * vmalloc space, clamped to [VMAP_BBMAP_BITS_MIN, VMAP_BBMAP_BITS_MAX].
   */
  #define VMAP_BBMAP_BITS		\
  		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
  		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
  			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
  
  #define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
  
  /* Per-CPU queue of vmap blocks (see the vmap_block_queue per-CPU variable). */
  struct vmap_block_queue {
  	spinlock_t lock;	/* taken around additions to "free" */
  	struct list_head free;	/* blocks linked via vmap_block.free_list */
  };
  
  struct vmap_block {
  	spinlock_t lock;
  	struct vmap_area *va;
db64fe022   Nick Piggin   mm: rewrite vmap ...
1359
  	unsigned long free, dirty;
7d61bfe8f   Roman Pen   mm/vmalloc: get r...
1360
  	unsigned long dirty_min, dirty_max; /*< dirty range */
de5604231   Nick Piggin   mm: percpu-vmap f...
1361
1362
  	struct list_head free_list;
  	struct rcu_head rcu_head;
02b709df8   Nick Piggin   mm: purge fragmen...
1363
  	struct list_head purge;
db64fe022   Nick Piggin   mm: rewrite vmap ...
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
  };
  
  /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
  static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
  
  /*
   * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
   * in the free path. Could get rid of this if we change the API to return a
   * "cookie" from alloc, to be passed to free. But no big deal yet.
   */
  static DEFINE_SPINLOCK(vmap_block_tree_lock);
  static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
  
  /*
   * We should probably have a fallback mechanism to allocate virtual memory
   * out of partially filled vmap blocks. However vmap block sizing should be
   * fairly reasonable according to the vmalloc size, so it shouldn't be a
   * big problem.
   */
  
  /*
   * Convert an address to the index of its vmap block, counted from
   * VMALLOC_START rounded down to a VMAP_BLOCK_SIZE boundary.
   */
  static unsigned long addr_to_vb_idx(unsigned long addr)
  {
  	unsigned long base = VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
  
  	return (addr - base) / VMAP_BLOCK_SIZE;
  }
cf725ce27   Roman Pen   mm/vmalloc: occup...
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
  /*
   * Return the address "pages_off" pages past "va_start". BUGs if that
   * address does not map to the same vmap block index as "va_start".
   */
  static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
  {
  	unsigned long offset = pages_off << PAGE_SHIFT;
  	unsigned long addr = va_start + offset;
  
  	BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
  	return (void *)addr;
  }
  
  /**
   * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this
   *                  block. Of course pages number can't exceed VMAP_BBMAP_BITS
   * @order:    how many 2^order pages should be occupied in newly allocated block
   * @gfp_mask: flags for the page level allocator
   *
   * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
   */
  static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
  {
  	struct vmap_block_queue *vbq;
  	struct vmap_block *vb;
  	struct vmap_area *va;
  	unsigned long vb_idx;
  	int node, err;
  	void *vaddr;
  
  	node = numa_node_id();
  
  	vb = kmalloc_node(sizeof(struct vmap_block),
  			gfp_mask & GFP_RECLAIM_MASK, node);
  	if (unlikely(!vb))
  		return ERR_PTR(-ENOMEM);
  
  	va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
  					VMALLOC_START, VMALLOC_END,
  					node, gfp_mask);
  	if (IS_ERR(va)) {
  		kfree(vb);
  		return ERR_CAST(va);
  	}
  
  	err = radix_tree_preload(gfp_mask);
  	if (unlikely(err)) {
  		kfree(vb);
  		free_vmap_area(va);
  		return ERR_PTR(err);
  	}
  
  	vaddr = vmap_block_vaddr(va->va_start, 0);
  	spin_lock_init(&vb->lock);
  	vb->va = va;
  	/* At least something should be left free */
  	BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
  	vb->free = VMAP_BBMAP_BITS - (1UL << order);
  	vb->dirty = 0;
  	/* Empty dirty range: min above max. */
  	vb->dirty_min = VMAP_BBMAP_BITS;
  	vb->dirty_max = 0;
  	INIT_LIST_HEAD(&vb->free_list);
  
  	vb_idx = addr_to_vb_idx(va->va_start);
  	spin_lock(&vmap_block_tree_lock);
  	err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
  	spin_unlock(&vmap_block_tree_lock);
  	BUG_ON(err);
  	radix_tree_preload_end();
  
  	vbq = &get_cpu_var(vmap_block_queue);
  	spin_lock(&vbq->lock);
  	list_add_tail_rcu(&vb->free_list, &vbq->free);
  	spin_unlock(&vbq->lock);
  	put_cpu_var(vmap_block_queue);
  
  	return vaddr;
  }
db64fe022   Nick Piggin   mm: rewrite vmap ...
1463
1464
1465
1466
  static void free_vmap_block(struct vmap_block *vb)
  {
  	struct vmap_block *tmp;
  	unsigned long vb_idx;
db64fe022   Nick Piggin   mm: rewrite vmap ...
1467
1468
1469
1470
1471
  	vb_idx = addr_to_vb_idx(vb->va->va_start);
  	spin_lock(&vmap_block_tree_lock);
  	tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
  	spin_unlock(&vmap_block_tree_lock);
  	BUG_ON(tmp != vb);
64141da58   Jeremy Fitzhardinge   vmalloc: eagerly ...
1472
  	free_vmap_area_noflush(vb->va);
22a3c7d18   Lai Jiangshan   vmalloc,rcu: Conv...
1473
  	kfree_rcu(vb, rcu_head);
db64fe022   Nick Piggin   mm: rewrite vmap ...
1474
  }
02b709df8   Nick Piggin   mm: purge fragmen...
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
  static void purge_fragmented_blocks(int cpu)
  {
  	LIST_HEAD(purge);
  	struct vmap_block *vb;
  	struct vmap_block *n_vb;
  	struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
  
  	rcu_read_lock();
  	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
  
  		if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
  			continue;
  
  		spin_lock(&vb->lock);
  		if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
  			vb->free = 0; /* prevent further allocs after releasing lock */
  			vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
7d61bfe8f   Roman Pen   mm/vmalloc: get r...
1492
1493
  			vb->dirty_min = 0;
  			vb->dirty_max = VMAP_BBMAP_BITS;
02b709df8   Nick Piggin   mm: purge fragmen...
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
  			spin_lock(&vbq->lock);
  			list_del_rcu(&vb->free_list);
  			spin_unlock(&vbq->lock);
  			spin_unlock(&vb->lock);
  			list_add_tail(&vb->purge, &purge);
  		} else
  			spin_unlock(&vb->lock);
  	}
  	rcu_read_unlock();
  
  	list_for_each_entry_safe(vb, n_vb, &purge, purge) {
  		list_del(&vb->purge);
  		free_vmap_block(vb);
  	}
  }
02b709df8   Nick Piggin   mm: purge fragmen...
1509
1510
1511
1512
1513
1514
1515
  static void purge_fragmented_blocks_allcpus(void)
  {
  	int cpu;
  
  	for_each_possible_cpu(cpu)
  		purge_fragmented_blocks(cpu);
  }
db64fe022   Nick Piggin   mm: rewrite vmap ...
1516
1517
1518
1519
  /*
   * Reserve a range of 2^get_order(@size) pages from a vmap block on this
   * CPU's free queue, creating a new block when none has enough free pages.
   *
   * @size:     page-aligned byte count, at most PAGE_SIZE * VMAP_MAX_ALLOC
   * @gfp_mask: flags forwarded to new_vmap_block()
   *
   * Returns the reserved address, NULL (with a warning) when @size is 0, or
   * whatever new_vmap_block() returns when a new block is needed.
   */
  static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
  {
  	struct vmap_block_queue *queue;
  	struct vmap_block *blk;
  	unsigned long nr_pages;
  	unsigned int order;
  	void *vaddr = NULL;
  
  	BUG_ON(offset_in_page(size));
  	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
  	if (WARN_ON(size == 0)) {
  		/*
  		 * A zero-byte request is never what the caller wants, and
  		 * get_order(0) returns a funny result.  Warn and bail out.
  		 */
  		return NULL;
  	}
  
  	order = get_order(size);
  	nr_pages = 1UL << order;
  
  	rcu_read_lock();
  	queue = &get_cpu_var(vmap_block_queue);
  	list_for_each_entry_rcu(blk, &queue->free, free_list) {
  		spin_lock(&blk->lock);
  		if (blk->free >= nr_pages) {
  			/* Hand out the pages right after the used ones. */
  			unsigned long first_page = VMAP_BBMAP_BITS - blk->free;
  
  			vaddr = vmap_block_vaddr(blk->va->va_start, first_page);
  			blk->free -= nr_pages;
  			if (!blk->free) {
  				/* Nothing left: take it off the free queue. */
  				spin_lock(&queue->lock);
  				list_del_rcu(&blk->free_list);
  				spin_unlock(&queue->lock);
  			}
  			spin_unlock(&blk->lock);
  			break;
  		}
  		spin_unlock(&blk->lock);
  	}
  	put_cpu_var(vmap_block_queue);
  	rcu_read_unlock();
  
  	/* Allocate new block if nothing was found */
  	if (!vaddr)
  		vaddr = new_vmap_block(order, gfp_mask);
  
  	return vaddr;
  }
  
  static void vb_free(const void *addr, unsigned long size)
  {
  	unsigned long offset;
  	unsigned long vb_idx;
  	unsigned int order;
  	struct vmap_block *vb;
891c49abf   Alexander Kuleshov   mm/vmalloc: use o...
1572
  	BUG_ON(offset_in_page(size));
db64fe022   Nick Piggin   mm: rewrite vmap ...
1573
  	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
b29acbdcf   Nick Piggin   mm: vmalloc fix l...
1574
1575
  
  	flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
db64fe022   Nick Piggin   mm: rewrite vmap ...
1576
1577
1578
  	order = get_order(size);
  
  	offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
7d61bfe8f   Roman Pen   mm/vmalloc: get r...
1579
  	offset >>= PAGE_SHIFT;
db64fe022   Nick Piggin   mm: rewrite vmap ...
1580
1581
1582
1583
1584
1585
  
  	vb_idx = addr_to_vb_idx((unsigned long)addr);
  	rcu_read_lock();
  	vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
  	rcu_read_unlock();
  	BUG_ON(!vb);
64141da58   Jeremy Fitzhardinge   vmalloc: eagerly ...
1586
  	vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
d30dce351   Vlastimil Babka   mm, debug_pageall...
1587
  	if (debug_pagealloc_enabled_static())
82a2e924f   Chintan Pandya   mm: vmalloc: clea...
1588
1589
  		flush_tlb_kernel_range((unsigned long)addr,
  					(unsigned long)addr + size);
db64fe022   Nick Piggin   mm: rewrite vmap ...
1590
  	spin_lock(&vb->lock);
7d61bfe8f   Roman Pen   mm/vmalloc: get r...
1591
1592
1593
1594
  
  	/* Expand dirty range */
  	vb->dirty_min = min(vb->dirty_min, offset);
  	vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
d086817dc   MinChan Kim   vmap: remove need...
1595

db64fe022   Nick Piggin   mm: rewrite vmap ...
1596
1597
  	vb->dirty += 1UL << order;
  	if (vb->dirty == VMAP_BBMAP_BITS) {
de5604231   Nick Piggin   mm: percpu-vmap f...
1598
  		BUG_ON(vb->free);
db64fe022   Nick Piggin   mm: rewrite vmap ...
1599
1600
1601
1602
1603
  		spin_unlock(&vb->lock);
  		free_vmap_block(vb);
  	} else
  		spin_unlock(&vb->lock);
  }
868b104d7   Rick Edgecombe   mm/vmalloc: Add f...
1604
  static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
db64fe022   Nick Piggin   mm: rewrite vmap ...
1605
  {
db64fe022   Nick Piggin   mm: rewrite vmap ...
1606
  	int cpu;
db64fe022   Nick Piggin   mm: rewrite vmap ...
1607

9b4633340   Jeremy Fitzhardinge   vmap: cope with v...
1608
1609
  	if (unlikely(!vmap_initialized))
  		return;
5803ed292   Christoph Hellwig   mm: mark all call...
1610
  	might_sleep();
db64fe022   Nick Piggin   mm: rewrite vmap ...
1611
1612
1613
1614
1615
1616
  	for_each_possible_cpu(cpu) {
  		struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
  		struct vmap_block *vb;
  
  		rcu_read_lock();
  		list_for_each_entry_rcu(vb, &vbq->free, free_list) {
db64fe022   Nick Piggin   mm: rewrite vmap ...
1617
  			spin_lock(&vb->lock);
7d61bfe8f   Roman Pen   mm/vmalloc: get r...
1618
1619
  			if (vb->dirty) {
  				unsigned long va_start = vb->va->va_start;
db64fe022   Nick Piggin   mm: rewrite vmap ...
1620
  				unsigned long s, e;
b136be5e0   Joonsoo Kim   mm, vmalloc: use ...
1621

7d61bfe8f   Roman Pen   mm/vmalloc: get r...
1622
1623
  				s = va_start + (vb->dirty_min << PAGE_SHIFT);
  				e = va_start + (vb->dirty_max << PAGE_SHIFT);
db64fe022   Nick Piggin   mm: rewrite vmap ...
1624

7d61bfe8f   Roman Pen   mm/vmalloc: get r...
1625
1626
  				start = min(s, start);
  				end   = max(e, end);
db64fe022   Nick Piggin   mm: rewrite vmap ...
1627

7d61bfe8f   Roman Pen   mm/vmalloc: get r...
1628
  				flush = 1;
db64fe022   Nick Piggin   mm: rewrite vmap ...
1629
1630
1631
1632
1633
  			}
  			spin_unlock(&vb->lock);
  		}
  		rcu_read_unlock();
  	}
f9e099776   Christoph Hellwig   mm: turn vmap_pur...
1634
  	mutex_lock(&vmap_purge_lock);
0574ecd14   Christoph Hellwig   mm: refactor __pu...
1635
1636
1637
  	purge_fragmented_blocks_allcpus();
  	if (!__purge_vmap_area_lazy(start, end) && flush)
  		flush_tlb_kernel_range(start, end);
f9e099776   Christoph Hellwig   mm: turn vmap_pur...
1638
  	mutex_unlock(&vmap_purge_lock);
db64fe022   Nick Piggin   mm: rewrite vmap ...
1639
  }
868b104d7   Rick Edgecombe   mm/vmalloc: Add f...
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
  
  /**
   * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
   *
   * The vmap/vmalloc layer flushes kernel virtual mappings lazily, mainly to
   * amortize the cost of TLB flushes.  As a consequence, any page you hold
   * now may, in a former life, have been mapped into kernel virtual address
   * space by the vmap layer, and some CPUs may still have TLB entries that
   * reference it (in addition to the regular 1:1 kernel mapping).
   *
   * vm_unmap_aliases flushes all such lazy mappings.  After it returns, none
   * of the pages we have control over will have any aliases from the vmap
   * layer.
   */
  void vm_unmap_aliases(void)
  {
  	/* Start from an empty range (start > end) with no flush requested. */
  	_vm_unmap_aliases(ULONG_MAX, 0, 0);
  }
  EXPORT_SYMBOL_GPL(vm_unmap_aliases);
  
  /**
   * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
   * @mem: the pointer returned by vm_map_ram
   * @count: the count passed to that vm_map_ram call (cannot unmap partial)
   */
  void vm_unmap_ram(const void *mem, unsigned int count)
  {
65ee03c4b   Guillermo Julián Moreno   mm: fix overflow ...
1670
  	unsigned long size = (unsigned long)count << PAGE_SHIFT;
db64fe022   Nick Piggin   mm: rewrite vmap ...
1671
  	unsigned long addr = (unsigned long)mem;
9c3acf604   Christoph Hellwig   mm: remove free_u...
1672
  	struct vmap_area *va;
db64fe022   Nick Piggin   mm: rewrite vmap ...
1673

5803ed292   Christoph Hellwig   mm: mark all call...
1674
  	might_sleep();
db64fe022   Nick Piggin   mm: rewrite vmap ...
1675
1676
1677
  	BUG_ON(!addr);
  	BUG_ON(addr < VMALLOC_START);
  	BUG_ON(addr > VMALLOC_END);
a1c0b1a07   Shawn Lin   mm/vmalloc: use P...
1678
  	BUG_ON(!PAGE_ALIGNED(addr));
db64fe022   Nick Piggin   mm: rewrite vmap ...
1679

9c3acf604   Christoph Hellwig   mm: remove free_u...
1680
  	if (likely(count <= VMAP_MAX_ALLOC)) {
05e3ff950   Chintan Pandya   mm: vmalloc: pass...
1681
  		debug_check_no_locks_freed(mem, size);
db64fe022   Nick Piggin   mm: rewrite vmap ...
1682
  		vb_free(mem, size);
9c3acf604   Christoph Hellwig   mm: remove free_u...
1683
1684
1685
1686
1687
  		return;
  	}
  
  	va = find_vmap_area(addr);
  	BUG_ON(!va);
05e3ff950   Chintan Pandya   mm: vmalloc: pass...
1688
1689
  	debug_check_no_locks_freed((void *)va->va_start,
  				    (va->va_end - va->va_start));
9c3acf604   Christoph Hellwig   mm: remove free_u...
1690
  	free_unmap_vmap_area(va);
db64fe022   Nick Piggin   mm: rewrite vmap ...
1691
1692
1693
1694
1695
1696
1697
1698
1699
  }
  EXPORT_SYMBOL(vm_unmap_ram);
  
  /**
   * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
   * @pages: an array of pointers to the pages to be mapped
   * @count: number of pages
   * @node: prefer to allocate data structures on this node
   * @prot: memory protection to use. PAGE_KERNEL for regular RAM
   *
   * If you use this function for at most VMAP_MAX_ALLOC pages, it could be
   * faster than vmap so it's good.  But if you mix long-life and short-life
   * objects with vm_map_ram(), it could consume lots of address space through
   * fragmentation (especially on a 32bit machine).  You could see failures in
   * the end.  Please use this function for short-lived objects.
   *
   * A @count of zero is treated as a failure: vb_alloc() warns and returns
   * %NULL for it, and that %NULL is passed back instead of being mapped.
   *
   * Returns: a pointer to the address that has been mapped, or %NULL on failure
   */
  void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
  {
  	unsigned long size = (unsigned long)count << PAGE_SHIFT;
  	unsigned long addr;
  	void *mem;
  
  	if (likely(count <= VMAP_MAX_ALLOC)) {
  		mem = vb_alloc(size, GFP_KERNEL);
  		/*
  		 * vb_alloc() reports allocation failure as an ERR_PTR() but
  		 * returns NULL for a zero-sized request.  Neither is a
  		 * usable address, so do not go on to map at address 0.
  		 */
  		if (IS_ERR_OR_NULL(mem))
  			return NULL;
  		addr = (unsigned long)mem;
  	} else {
  		struct vmap_area *va;
  
  		va = alloc_vmap_area(size, PAGE_SIZE,
  				VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
  		if (IS_ERR(va))
  			return NULL;
  
  		addr = va->va_start;
  		mem = (void *)addr;
  	}
  
  	/* On mapping failure, release the address range reserved above. */
  	if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
  		vm_unmap_ram(mem, count);
  		return NULL;
  	}
  	return mem;
  }
  EXPORT_SYMBOL(vm_map_ram);
4341fa454   Joonsoo Kim   mm, vmalloc: remo...
1737
  /*
   * Singly linked list (through ->next) of the vm_structs added by
   * vm_area_add_early(), kept sorted by address.  vmalloc_init() walks it
   * to import the entries as vmap areas.
   */
  static struct vm_struct *vmlist __initdata;
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
1738

f0aa66179   Tejun Heo   vmalloc: implemen...
1739
  /**
   * vm_area_add_early - add vmap area early during boot
   * @vm: vm_struct to add
   *
   * This function is used to add a fixed kernel vm area to vmlist before
   * vmalloc_init() is called.  @vm->addr, @vm->size, and @vm->flags
   * should contain proper values and the other fields should be zero.
   *
   * The list is kept sorted by address; overlapping an existing entry is a
   * BUG.
   *
   * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
   */
  void __init vm_area_add_early(struct vm_struct *vm)
  {
  	struct vm_struct **link = &vmlist;
  	struct vm_struct *cur;
  
  	BUG_ON(vmap_initialized);
  
  	while ((cur = *link) != NULL) {
  		if (cur->addr >= vm->addr) {
  			/* Insert before @cur; @vm must end at or before it. */
  			BUG_ON(cur->addr < vm->addr + vm->size);
  			break;
  		}
  		/* @cur lies below @vm and must end at or before it. */
  		BUG_ON(cur->addr + cur->size > vm->addr);
  		link = &cur->next;
  	}
  
  	vm->next = *link;
  	*link = vm;
  }
  
  /**
f0aa66179   Tejun Heo   vmalloc: implemen...
1766
1767
   * vm_area_register_early - register vmap area early during boot
   * @vm: vm_struct to register
c0c0a2937   Tejun Heo   vmalloc: add @ali...
1768
   * @align: requested alignment
f0aa66179   Tejun Heo   vmalloc: implemen...
1769
1770
1771
1772
1773
1774
1775
1776
   *
   * This function is used to register kernel vm area before
   * vmalloc_init() is called.  @vm->size and @vm->flags should contain
   * proper values on entry and other fields should be zero.  On return,
   * vm->addr contains the allocated address.
   *
   * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
   */
c0c0a2937   Tejun Heo   vmalloc: add @ali...
1777
  void __init vm_area_register_early(struct vm_struct *vm, size_t align)
f0aa66179   Tejun Heo   vmalloc: implemen...
1778
1779
  {
  	static size_t vm_init_off __initdata;
c0c0a2937   Tejun Heo   vmalloc: add @ali...
1780
1781
1782
1783
  	unsigned long addr;
  
  	addr = ALIGN(VMALLOC_START + vm_init_off, align);
  	vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
f0aa66179   Tejun Heo   vmalloc: implemen...
1784

c0c0a2937   Tejun Heo   vmalloc: add @ali...
1785
  	vm->addr = (void *)addr;
f0aa66179   Tejun Heo   vmalloc: implemen...
1786

be9b7335e   Nicolas Pitre   mm: add vm_area_a...
1787
  	vm_area_add_early(vm);
f0aa66179   Tejun Heo   vmalloc: implemen...
1788
  }
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
  /*
   * Populate the free vmap area tree/list with every gap between the busy
   * areas on vmap_area_list, over the address range [1, ULONG_MAX].
   * A failed vmap_area allocation triggers a one-time warning and that gap
   * is skipped.
   */
  static void vmap_init_free_space(void)
  {
  	const unsigned long space_end = ULONG_MAX;
  	unsigned long cursor = 1;
  	struct vmap_area *busy, *gap;
  
  	/*
  	 *     B     F     B     B     B     F
  	 * -|-----|.....|-----|-----|-----|.....|-
  	 *  |           The KVA space           |
  	 *  |<--------------------------------->|
  	 */
  	list_for_each_entry(busy, &vmap_area_list, list) {
  		/* Non-empty hole between the previous busy end and this one? */
  		if (busy->va_start != cursor) {
  			gap = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
  			if (!WARN_ON_ONCE(!gap)) {
  				gap->va_start = cursor;
  				gap->va_end = busy->va_start;
  
  				insert_vmap_area_augment(gap, NULL,
  							 &free_vmap_area_root,
  							 &free_vmap_area_list);
  			}
  		}
  
  		cursor = busy->va_end;
  	}
  
  	/* Tail hole after the last busy area. */
  	if (space_end != cursor) {
  		gap = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
  		if (!WARN_ON_ONCE(!gap)) {
  			gap->va_start = cursor;
  			gap->va_end = space_end;
  
  			insert_vmap_area_augment(gap, NULL,
  						 &free_vmap_area_root,
  						 &free_vmap_area_list);
  		}
  	}
  }
db64fe022   Nick Piggin   mm: rewrite vmap ...
1829
1830
  /*
   * Boot-time setup of the vmap layer: create the vmap_area cache,
   * initialise the per-CPU block queues and deferred-vfree state, import
   * the early vmlist entries as busy areas, build the free space tree and
   * finally mark the layer as initialised.
   */
  void __init vmalloc_init(void)
  {
  	struct vm_struct *early;
  	int cpu;
  
  	/*
  	 * Create the cache for vmap_area objects.
  	 */
  	vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
  
  	for_each_possible_cpu(cpu) {
  		struct vmap_block_queue *queue = &per_cpu(vmap_block_queue, cpu);
  		struct vfree_deferred *deferred = &per_cpu(vfree_deferred, cpu);
  
  		spin_lock_init(&queue->lock);
  		INIT_LIST_HEAD(&queue->free);
  
  		init_llist_head(&deferred->list);
  		INIT_WORK(&deferred->wq, free_work);
  	}
  
  	/* Import existing vmlist entries. */
  	for (early = vmlist; early; early = early->next) {
  		struct vmap_area *va;
  
  		va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
  		if (WARN_ON_ONCE(!va))
  			continue;
  
  		va->va_start = (unsigned long)early->addr;
  		va->va_end = va->va_start + early->size;
  		va->vm = early;
  		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
  	}
  
  	/*
  	 * Now we can initialize a free vmap space.
  	 */
  	vmap_init_free_space();
  	vmap_initialized = true;
  }
8fc489850   Tejun Heo   vmalloc: add un/m...
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
  /**
   * map_kernel_range_noflush - map kernel VM area with the specified pages
   * @addr: start of the VM area to map
   * @size: size of the VM area to map
   * @prot: page protection flags to use
   * @pages: pages to map
   *
   * Map PFN_UP(@size) pages at @addr.  The VM area that @addr and @size
   * specify should have been allocated using get_vm_area() and its
   * friends.
   *
   * NOTE:
   * This function does NOT do any cache flushing.  The caller is
   * responsible for calling flush_cache_vmap() on to-be-mapped areas
   * before calling this function.
   *
   * RETURNS:
   * The number of pages mapped on success, -errno on failure.
   */
  int map_kernel_range_noflush(unsigned long addr, unsigned long size,
  			     pgprot_t prot, struct page **pages)
  {
  	unsigned long end = addr + size;
  
  	return vmap_page_range_noflush(addr, end, prot, pages);
  }
  
  /**
   * unmap_kernel_range_noflush - unmap kernel VM area
   * @addr: start of the VM area to unmap
   * @size: size of the VM area to unmap
   *
   * Unmap PFN_UP(@size) pages at @addr.  The VM area that @addr and @size
   * specify should have been allocated using get_vm_area() and its
   * friends.
   *
   * NOTE:
   * This function does NOT do any cache flushing.  The caller is
   * responsible for calling flush_cache_vunmap() on to-be-unmapped areas
   * before calling this function and flush_tlb_kernel_range() after.
   */
  void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
  {
  	unsigned long end = addr + size;
  
  	vunmap_page_range(addr, end);
  }
  EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
8fc489850   Tejun Heo   vmalloc: add un/m...
1911
1912
1913
1914
1915
1916
1917
1918
1919
  
  /**
   * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
   * @addr: start of the VM area to unmap
   * @size: size of the VM area to unmap
   *
   * Like unmap_kernel_range_noflush(), but flushes the vcache before the
   * unmapping and the TLB after it.
   */
  void unmap_kernel_range(unsigned long addr, unsigned long size)
  {
  	const unsigned long stop = addr + size;
  
  	/* Order matters: cache flush, page table teardown, TLB flush. */
  	flush_cache_vunmap(addr, stop);
  	vunmap_page_range(addr, stop);
  	flush_tlb_kernel_range(addr, stop);
  }
  EXPORT_SYMBOL_GPL(unmap_kernel_range);
db64fe022   Nick Piggin   mm: rewrite vmap ...
1929

f6f8ed473   WANG Chao   mm/vmalloc.c: cle...
1930
  /*
   * Map @pages with protection @prot over the whole of @area, i.e. from
   * area->addr for get_vm_area_size(area) bytes.
   *
   * Returns 0 when vmap_page_range() returns a positive value, otherwise
   * passes its return value through unchanged.
   */
  int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
  {
  	unsigned long start = (unsigned long)area->addr;
  	unsigned long stop = start + get_vm_area_size(area);
  	int ret = vmap_page_range(start, stop, prot, pages);
  
  	if (ret > 0)
  		return 0;
  
  	return ret;
  }
  EXPORT_SYMBOL_GPL(map_vm_area);
f5252e009   Mitsuo Hayasaka   mm: avoid null po...
1940
  static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
5e6cafc83   Marek Szyprowski   mm: vmalloc: use ...
1941
  			      unsigned long flags, const void *caller)
cf88c7900   Tejun Heo   vmalloc: separate...
1942
  {
c69480ade   Joonsoo Kim   mm, vmalloc: prot...
1943
  	spin_lock(&vmap_area_lock);
cf88c7900   Tejun Heo   vmalloc: separate...
1944
1945
1946
1947
  	vm->flags = flags;
  	vm->addr = (void *)va->va_start;
  	vm->size = va->va_end - va->va_start;
  	vm->caller = caller;
db1aecafe   Minchan Kim   mm/vmalloc.c: cha...
1948
  	va->vm = vm;
c69480ade   Joonsoo Kim   mm, vmalloc: prot...
1949
  	spin_unlock(&vmap_area_lock);
f5252e009   Mitsuo Hayasaka   mm: avoid null po...
1950
  }
cf88c7900   Tejun Heo   vmalloc: separate...
1951

20fc02b47   Zhang Yanfei   mm/vmalloc.c: ren...
1952
  static void clear_vm_uninitialized_flag(struct vm_struct *vm)
f5252e009   Mitsuo Hayasaka   mm: avoid null po...
1953
  {
d4033afdf   Joonsoo Kim   mm, vmalloc: iter...
1954
  	/*
20fc02b47   Zhang Yanfei   mm/vmalloc.c: ren...
1955
  	 * Before removing VM_UNINITIALIZED,
d4033afdf   Joonsoo Kim   mm, vmalloc: iter...
1956
1957
1958
1959
  	 * we should make sure that vm has proper values.
  	 * Pair with smp_rmb() in show_numa_info().
  	 */
  	smp_wmb();
20fc02b47   Zhang Yanfei   mm/vmalloc.c: ren...
1960
  	vm->flags &= ~VM_UNINITIALIZED;
cf88c7900   Tejun Heo   vmalloc: separate...
1961
  }
db64fe022   Nick Piggin   mm: rewrite vmap ...
1962
  /*
   * Common worker behind the get_vm_area() family: allocate a vm_struct and
   * a vmap area of @size bytes (page aligned, plus one guard page unless
   * VM_NO_GUARD is set) inside [@start, @end), and tie the two together.
   *
   * For VM_IOREMAP the alignment is derived from the size, clamped between
   * PAGE_SHIFT and IOREMAP_MAX_ORDER, and overrides @align.
   *
   * Must not be called from interrupt context.  Returns the new vm_struct,
   * or NULL if @size rounds to zero or an allocation fails.
   */
  static struct vm_struct *__get_vm_area_node(unsigned long size,
  		unsigned long align, unsigned long flags, unsigned long start,
  		unsigned long end, int node, gfp_t gfp_mask, const void *caller)
  {
  	struct vm_struct *vm;
  	struct vmap_area *va;
  
  	BUG_ON(in_interrupt());
  
  	size = PAGE_ALIGN(size);
  	if (unlikely(!size))
  		return NULL;
  
  	if (flags & VM_IOREMAP) {
  		int shift = clamp_t(int, get_count_order_long(size),
  				    PAGE_SHIFT, IOREMAP_MAX_ORDER);
  
  		align = 1ul << shift;
  	}
  
  	vm = kzalloc_node(sizeof(*vm), gfp_mask & GFP_RECLAIM_MASK, node);
  	if (unlikely(!vm))
  		return NULL;
  
  	/* Reserve a trailing guard page unless the caller opted out. */
  	if (!(flags & VM_NO_GUARD))
  		size += PAGE_SIZE;
  
  	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
  	if (IS_ERR(va)) {
  		kfree(vm);
  		return NULL;
  	}
  
  	setup_vmalloc_vm(vm, va, flags, caller);
  
  	return vm;
  }
930fc45a4   Christoph Lameter   [PATCH] vmalloc_node
1993
1994
1995
  struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
  				unsigned long start, unsigned long end)
  {
00ef2d2f8   David Rientjes   mm: use NUMA_NO_NODE
1996
1997
  	return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
  				  GFP_KERNEL, __builtin_return_address(0));
930fc45a4   Christoph Lameter   [PATCH] vmalloc_node
1998
  }
5992b6dac   Rusty Russell   lguest: export sy...
1999
  EXPORT_SYMBOL_GPL(__get_vm_area);
930fc45a4   Christoph Lameter   [PATCH] vmalloc_node
2000

c29686129   Benjamin Herrenschmidt   vmalloc: add __ge...
2001
2002
  struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
  				       unsigned long start, unsigned long end,
5e6cafc83   Marek Szyprowski   mm: vmalloc: use ...
2003
  				       const void *caller)
c29686129   Benjamin Herrenschmidt   vmalloc: add __ge...
2004
  {
00ef2d2f8   David Rientjes   mm: use NUMA_NO_NODE
2005
2006
  	return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
  				  GFP_KERNEL, caller);
c29686129   Benjamin Herrenschmidt   vmalloc: add __ge...
2007
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2008
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2009
2010
2011
   * get_vm_area - reserve a contiguous kernel virtual area
   * @size:	 size of the area
   * @flags:	 %VM_IOREMAP for I/O mappings or VM_ALLOC
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2012
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2013
2014
2015
   * Search an area of @size in the kernel virtual mapping area,
   * and reserved it for out purposes.  Returns the area descriptor
   * on success or %NULL on failure.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2016
2017
   *
   * Return: the area descriptor on success or %NULL on failure.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2018
2019
2020
   */
  struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
  {
2dca6999e   David Miller   mm, perf_event: M...
2021
  	return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
00ef2d2f8   David Rientjes   mm: use NUMA_NO_NODE
2022
2023
  				  NUMA_NO_NODE, GFP_KERNEL,
  				  __builtin_return_address(0));
230169693   Christoph Lameter   vmallocinfo: add ...
2024
2025
2026
  }
  
  struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
5e6cafc83   Marek Szyprowski   mm: vmalloc: use ...
2027
  				const void *caller)
230169693   Christoph Lameter   vmallocinfo: add ...
2028
  {
2dca6999e   David Miller   mm, perf_event: M...
2029
  	return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
00ef2d2f8   David Rientjes   mm: use NUMA_NO_NODE
2030
  				  NUMA_NO_NODE, GFP_KERNEL, caller);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2031
  }
e9da6e990   Marek Szyprowski   ARM: dma-mapping:...
2032
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2033
2034
   * find_vm_area - find a continuous kernel virtual area
   * @addr:	  base address
e9da6e990   Marek Szyprowski   ARM: dma-mapping:...
2035
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2036
2037
2038
   * Search for the kernel VM area starting at @addr, and return it.
   * It is up to the caller to do all required locking to keep the returned
   * pointer valid.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2039
2040
   *
   * Return: pointer to the found area or %NULL on faulure
e9da6e990   Marek Szyprowski   ARM: dma-mapping:...
2041
2042
   */
  struct vm_struct *find_vm_area(const void *addr)
833423143   Nick Piggin   [PATCH] mm: intro...
2043
  {
db64fe022   Nick Piggin   mm: rewrite vmap ...
2044
  	struct vmap_area *va;
833423143   Nick Piggin   [PATCH] mm: intro...
2045

db64fe022   Nick Piggin   mm: rewrite vmap ...
2046
  	va = find_vmap_area((unsigned long)addr);
688fcbfc0   Pengfei Li   mm/vmalloc: modif...
2047
2048
  	if (!va)
  		return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2049

688fcbfc0   Pengfei Li   mm/vmalloc: modif...
2050
  	return va->vm;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2051
  }
7856dfeb2   Andi Kleen   [PATCH] x86_64: F...
2052
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2053
2054
   * remove_vm_area - find and remove a continuous kernel virtual area
   * @addr:	    base address
7856dfeb2   Andi Kleen   [PATCH] x86_64: F...
2055
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2056
2057
2058
   * Search for the kernel VM area starting at @addr, and remove it.
   * This function returns the found VM area, but using it is NOT safe
   * on SMP machines, except for its size or flags.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2059
2060
   *
   * Return: pointer to the found area or %NULL on failure
7856dfeb2   Andi Kleen   [PATCH] x86_64: F...
2061
   */
b3bdda02a   Christoph Lameter   vmalloc: add cons...
2062
  struct vm_struct *remove_vm_area(const void *addr)
7856dfeb2   Andi Kleen   [PATCH] x86_64: F...
2063
  {
db64fe022   Nick Piggin   mm: rewrite vmap ...
2064
  	struct vmap_area *va;
5803ed292   Christoph Hellwig   mm: mark all call...
2065
  	might_sleep();
dd3b8353b   Uladzislau Rezki (Sony)   mm/vmalloc: do no...
2066
2067
  	spin_lock(&vmap_area_lock);
  	va = __find_vmap_area((unsigned long)addr);
688fcbfc0   Pengfei Li   mm/vmalloc: modif...
2068
  	if (va && va->vm) {
db1aecafe   Minchan Kim   mm/vmalloc.c: cha...
2069
  		struct vm_struct *vm = va->vm;
f5252e009   Mitsuo Hayasaka   mm: avoid null po...
2070

c69480ade   Joonsoo Kim   mm, vmalloc: prot...
2071
  		va->vm = NULL;
c69480ade   Joonsoo Kim   mm, vmalloc: prot...
2072
  		spin_unlock(&vmap_area_lock);
a5af5aa8b   Andrey Ryabinin   kasan, module, vm...
2073
  		kasan_free_shadow(vm);
dd32c2799   KAMEZAWA Hiroyuki   vmalloc: unmap vm...
2074
  		free_unmap_vmap_area(va);
dd32c2799   KAMEZAWA Hiroyuki   vmalloc: unmap vm...
2075

db64fe022   Nick Piggin   mm: rewrite vmap ...
2076
2077
  		return vm;
  	}
dd3b8353b   Uladzislau Rezki (Sony)   mm/vmalloc: do no...
2078
2079
  
  	spin_unlock(&vmap_area_lock);
db64fe022   Nick Piggin   mm: rewrite vmap ...
2080
  	return NULL;
7856dfeb2   Andi Kleen   [PATCH] x86_64: F...
2081
  }
868b104d7   Rick Edgecombe   mm/vmalloc: Add f...
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
  /*
   * Call @set_direct_map on every page of @area for which page_address()
   * returns a non-NULL address; other pages are skipped. The callback's
   * return value is not examined.
   */
  static inline void set_area_direct_map(const struct vm_struct *area,
  				       int (*set_direct_map)(struct page *page))
  {
  	int idx;

  	for (idx = 0; idx < area->nr_pages; idx++) {
  		if (!page_address(area->pages[idx]))
  			continue;
  		set_direct_map(area->pages[idx]);
  	}
  }
  
  /* Handle removing and resetting vm mappings related to the vm_struct. */
  static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
  {
868b104d7   Rick Edgecombe   mm/vmalloc: Add f...
2095
2096
  	unsigned long start = ULONG_MAX, end = 0;
  	int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
31e67340c   Rick Edgecombe   mm/vmalloc: Avoid...
2097
  	int flush_dmap = 0;
868b104d7   Rick Edgecombe   mm/vmalloc: Add f...
2098
  	int i;
868b104d7   Rick Edgecombe   mm/vmalloc: Add f...
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
  	remove_vm_area(area->addr);
  
  	/* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
  	if (!flush_reset)
  		return;
  
  	/*
  	 * If not deallocating pages, just do the flush of the VM area and
  	 * return.
  	 */
  	if (!deallocate_pages) {
  		vm_unmap_aliases();
  		return;
  	}
  
  	/*
  	 * If execution gets here, flush the vm mapping and reset the direct
  	 * map. Find the start and end range of the direct mappings to make sure
  	 * the vm_unmap_aliases() flush includes the direct map.
  	 */
  	for (i = 0; i < area->nr_pages; i++) {
8e41f8726   Rick Edgecombe   mm/vmalloc: Fix c...
2120
2121
  		unsigned long addr = (unsigned long)page_address(area->pages[i]);
  		if (addr) {
868b104d7   Rick Edgecombe   mm/vmalloc: Add f...
2122
  			start = min(addr, start);
8e41f8726   Rick Edgecombe   mm/vmalloc: Fix c...
2123
  			end = max(addr + PAGE_SIZE, end);
31e67340c   Rick Edgecombe   mm/vmalloc: Avoid...
2124
  			flush_dmap = 1;
868b104d7   Rick Edgecombe   mm/vmalloc: Add f...
2125
2126
2127
2128
2129
2130
2131
2132
2133
  		}
  	}
  
  	/*
  	 * Set direct map to something invalid so that it won't be cached if
  	 * there are any accesses after the TLB flush, then flush the TLB and
  	 * reset the direct map permissions to the default.
  	 */
  	set_area_direct_map(area, set_direct_map_invalid_noflush);
31e67340c   Rick Edgecombe   mm/vmalloc: Avoid...
2134
  	_vm_unmap_aliases(start, end, flush_dmap);
868b104d7   Rick Edgecombe   mm/vmalloc: Add f...
2135
2136
  	set_area_direct_map(area, set_direct_map_default_noflush);
  }
b3bdda02a   Christoph Lameter   vmalloc: add cons...
2137
  static void __vunmap(const void *addr, int deallocate_pages)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2138
2139
2140
2141
2142
  {
  	struct vm_struct *area;
  
  	if (!addr)
  		return;
e69e9d4ae   HATAYAMA Daisuke   vmalloc: introduc...
2143
2144
  	if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)
  ",
ab15d9b4c   Dan Carpenter   mm/vmalloc.c: unb...
2145
  			addr))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2146
  		return;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2147

6ade20327   Liviu Dudau   mm/vmalloc.c: don...
2148
  	area = find_vm_area(addr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2149
  	if (unlikely(!area)) {
4c8573e25   Arjan van de Ven   Use WARN() in mm/...
2150
2151
  		WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)
  ",
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2152
  				addr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2153
2154
  		return;
  	}
05e3ff950   Chintan Pandya   mm: vmalloc: pass...
2155
2156
  	debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
  	debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
9a11b49a8   Ingo Molnar   [PATCH] lockdep: ...
2157

868b104d7   Rick Edgecombe   mm/vmalloc: Add f...
2158
  	vm_remove_mappings(area, deallocate_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2159
2160
2161
2162
  	if (deallocate_pages) {
  		int i;
  
  		for (i = 0; i < area->nr_pages; i++) {
bf53d6f8f   Christoph Lameter   vmalloc: clean up...
2163
2164
2165
  			struct page *page = area->pages[i];
  
  			BUG_ON(!page);
4949148ad   Vladimir Davydov   mm: charge/unchar...
2166
  			__free_pages(page, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2167
  		}
97105f0ab   Roman Gushchin   mm: vmalloc: show...
2168
  		atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2169

244d63ee3   David Rientjes   mm, vmalloc: remo...
2170
  		kvfree(area->pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2171
2172
2173
2174
2175
  	}
  
  	kfree(area);
  	return;
  }
bf22e37a6   Andrey Ryabinin   mm: add vfree_ato...
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
  
  static inline void __vfree_deferred(const void *addr)
  {
  	/*
  	 * raw_cpu_ptr() is used because this may run in preemptible
  	 * context. Getting preempted is absolutely fine: llist_add() is
  	 * lockless, so it works even when we end up adding to another
  	 * cpu's list.  schedule_work() should be fine with this too.
  	 */
  	struct vfree_deferred *deferred = raw_cpu_ptr(&vfree_deferred);
  	/* The memory being freed doubles as the list node. */
  	struct llist_node *node = (struct llist_node *)addr;

  	/* Kick the work item only when llist_add() returns nonzero. */
  	if (!llist_add(node, &deferred->list))
  		return;

  	schedule_work(&deferred->wq);
  }
  
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2192
2193
   * vfree_atomic - release memory allocated by vmalloc()
   * @addr:	  memory base address
bf22e37a6   Andrey Ryabinin   mm: add vfree_ato...
2194
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2195
2196
   * This one is just like vfree() but can be called in any atomic context
   * except NMIs.
bf22e37a6   Andrey Ryabinin   mm: add vfree_ato...
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
   */
  void vfree_atomic(const void *addr)
  {
  	BUG_ON(in_nmi());

  	kmemleak_free(addr);

  	/* NULL is a no-op; anything else is always freed via the deferred path. */
  	if (addr)
  		__vfree_deferred(addr);
  }
c67dc6247   Roman Penyaev   mm/vmalloc: do no...
2208
2209
2210
2211
2212
2213
2214
  /*
   * Free @addr right away via __vunmap(), or hand it to the deferred
   * path when called from interrupt context.
   */
  static void __vfree(const void *addr)
  {
  	if (unlikely(in_interrupt())) {
  		__vfree_deferred(addr);
  		return;
  	}

  	__vunmap(addr, 1);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2215
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2216
2217
   * vfree - release memory allocated by vmalloc()
   * @addr:  memory base address
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2218
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2219
2220
2221
   * Free the virtually continuous memory area starting at @addr, as
   * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
   * NULL, no operation is performed.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2222
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2223
2224
2225
   * Must not be called in NMI context (strictly speaking, only if we don't
   * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
   * conventions for vfree() arch-dependent would be a really bad idea)
c9fcee513   Andrew Morton   mm/vmalloc.c: add...
2226
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2227
   * May sleep if called *not* from interrupt context.
3ca4ea3a7   Andrey Ryabinin   mm/vmalloc.c: imp...
2228
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2229
   * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2230
   */
b3bdda02a   Christoph Lameter   vmalloc: add cons...
2231
  void vfree(const void *addr)
  {
  	BUG_ON(in_nmi());

  	kmemleak_free(addr);

  	/* Sleeping is only possible outside of interrupt context. */
  	might_sleep_if(!in_interrupt());

  	if (addr)
  		__vfree(addr);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2242
2243
2244
  EXPORT_SYMBOL(vfree);
  
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2245
2246
   * vunmap - release virtual mapping obtained by vmap()
   * @addr:   memory base address
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2247
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2248
2249
   * Free the virtually contiguous memory area starting at @addr,
   * which was created from the page array passed to vmap().
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2250
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2251
   * Must not be called in interrupt context.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2252
   */
b3bdda02a   Christoph Lameter   vmalloc: add cons...
2253
  void vunmap(const void *addr)
  {
  	BUG_ON(in_interrupt());
  	might_sleep();

  	if (!addr)
  		return;

  	/* Drop the mapping only; the pages are not freed here. */
  	__vunmap(addr, 0);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2260
2261
2262
  EXPORT_SYMBOL(vunmap);
  
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2263
2264
2265
2266
2267
2268
2269
2270
   * vmap - map an array of pages into virtually contiguous space
   * @pages: array of page pointers
   * @count: number of pages to map
   * @flags: vm_area->flags
   * @prot: page protection for the mapping
   *
   * Maps @count pages from @pages into contiguous kernel virtual
   * space.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2271
2272
   *
   * Return: the address of the area or %NULL on failure
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2273
2274
   */
  void *vmap(struct page **pages, unsigned int count,
  	   unsigned long flags, pgprot_t prot)
  {
  	unsigned long nbytes;		/* size of the mapping in bytes */
  	struct vm_struct *vm;

  	might_sleep();

  	/* Refuse requests for more pages than totalram_pages() reports. */
  	if (count > totalram_pages())
  		return NULL;

  	/* Widen before shifting so the byte count is computed in long. */
  	nbytes = (unsigned long)count << PAGE_SHIFT;
  	vm = get_vm_area_caller(nbytes, flags, __builtin_return_address(0));
  	if (!vm)
  		return NULL;

  	if (!map_vm_area(vm, prot, pages))
  		return vm->addr;

  	/* Mapping failed: release the area that was just reserved. */
  	vunmap(vm->addr);
  	return NULL;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2295
  EXPORT_SYMBOL(vmap);
8594a21cf   Michal Hocko   mm, vmalloc: fix ...
2296
2297
2298
  static void *__vmalloc_node(unsigned long size, unsigned long align,
  			    gfp_t gfp_mask, pgprot_t prot,
  			    int node, const void *caller);
e31d9eb5c   Adrian Bunk   make __vmalloc_ar...
2299
  static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
3722e13cf   Wanpeng Li   mm/vmalloc: don't...
2300
  				 pgprot_t prot, int node)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2301
2302
2303
  {
  	struct page **pages;
  	unsigned int nr_pages, array_size, i;
930f036b4   David Rientjes   mm, vmalloc: cons...
2304
  	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
704b862f9   Laura Abbott   mm/vmalloc.c: don...
2305
2306
2307
2308
  	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
  	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
  					0 :
  					__GFP_HIGHMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2309

762216ab4   Wanpeng Li   mm/vmalloc: use w...
2310
  	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2311
  	array_size = (nr_pages * sizeof(struct page *));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2312
  	/* Please note that the recursion is strictly bounded. */
8757d5fa6   Jan Kiszka   [PATCH] mm: fix o...
2313
  	if (array_size > PAGE_SIZE) {
704b862f9   Laura Abbott   mm/vmalloc.c: don...
2314
  		pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
3722e13cf   Wanpeng Li   mm/vmalloc: don't...
2315
  				PAGE_KERNEL, node, area->caller);
286e1ea3a   Andrew Morton   [PATCH] vmalloc()...
2316
  	} else {
976d6dfbb   Jan Beulich   vmalloc(): adjust...
2317
  		pages = kmalloc_node(array_size, nested_gfp, node);
286e1ea3a   Andrew Morton   [PATCH] vmalloc()...
2318
  	}
7ea362427   Austin Kim   mm/vmalloc.c: mov...
2319
2320
  
  	if (!pages) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2321
2322
2323
2324
  		remove_vm_area(area->addr);
  		kfree(area);
  		return NULL;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2325

7ea362427   Austin Kim   mm/vmalloc.c: mov...
2326
2327
  	area->pages = pages;
  	area->nr_pages = nr_pages;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2328
  	for (i = 0; i < area->nr_pages; i++) {
bf53d6f8f   Christoph Lameter   vmalloc: clean up...
2329
  		struct page *page;
4b90951c0   Jianguo Wu   mm/vmalloc: use N...
2330
  		if (node == NUMA_NO_NODE)
704b862f9   Laura Abbott   mm/vmalloc.c: don...
2331
  			page = alloc_page(alloc_mask|highmem_mask);
930fc45a4   Christoph Lameter   [PATCH] vmalloc_node
2332
  		else
704b862f9   Laura Abbott   mm/vmalloc.c: don...
2333
  			page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
bf53d6f8f   Christoph Lameter   vmalloc: clean up...
2334
2335
  
  		if (unlikely(!page)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2336
2337
  			/* Successfully allocated i pages, free them in __vunmap() */
  			area->nr_pages = i;
97105f0ab   Roman Gushchin   mm: vmalloc: show...
2338
  			atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2339
2340
  			goto fail;
  		}
bf53d6f8f   Christoph Lameter   vmalloc: clean up...
2341
  		area->pages[i] = page;
704b862f9   Laura Abbott   mm/vmalloc.c: don...
2342
  		if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
660654f90   Eric Dumazet   mm/vmalloc.c: add...
2343
  			cond_resched();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2344
  	}
97105f0ab   Roman Gushchin   mm: vmalloc: show...
2345
  	atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2346

f6f8ed473   WANG Chao   mm/vmalloc.c: cle...
2347
  	if (map_vm_area(area, prot, pages))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2348
2349
2350
2351
  		goto fail;
  	return area->addr;
  
  fail:
a8e99259e   Michal Hocko   mm, page_alloc: w...
2352
  	warn_alloc(gfp_mask, NULL,
7877cdcc3   Michal Hocko   mm: consolidate w...
2353
  			  "vmalloc: allocation failure, allocated %ld of %ld bytes",
22943ab11   Dave Hansen   mm: print vmalloc...
2354
  			  (area->nr_pages*PAGE_SIZE), area->size);
c67dc6247   Roman Penyaev   mm/vmalloc: do no...
2355
  	__vfree(area->addr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2356
2357
2358
2359
  	return NULL;
  }
  
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
   * __vmalloc_node_range - allocate virtually contiguous memory
   * @size:		  allocation size
   * @align:		  desired alignment
   * @start:		  vm area range start
   * @end:		  vm area range end
   * @gfp_mask:		  flags for the page level allocator
   * @prot:		  protection mask for the allocated pages
   * @vm_flags:		  additional vm area flags (e.g. %VM_NO_GUARD)
   * @node:		  node to use for allocation or NUMA_NO_NODE
   * @caller:		  caller's return address
   *
   * Allocate enough pages to cover @size from the page level
   * allocator with @gfp_mask flags.  Map them into contiguous
   * kernel virtual space, using a pagetable protection of @prot.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2374
2375
   *
   * Return: the address of the area or %NULL on failure
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2376
   */
d0a21265d   David Rientjes   mm: unify module_...
2377
2378
  void *__vmalloc_node_range(unsigned long size, unsigned long align,
  			unsigned long start, unsigned long end, gfp_t gfp_mask,
cb9e3c292   Andrey Ryabinin   mm: vmalloc: pass...
2379
2380
  			pgprot_t prot, unsigned long vm_flags, int node,
  			const void *caller)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2381
2382
  {
  	struct vm_struct *area;
89219d37a   Catalin Marinas   kmemleak: Add the...
2383
2384
  	void *addr;
  	unsigned long real_size = size;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2385
2386
  
  	size = PAGE_ALIGN(size);
ca79b0c21   Arun KS   mm: convert total...
2387
  	if (!size || (size >> PAGE_SHIFT) > totalram_pages())
de7d2b567   Joe Perches   mm/vmalloc.c: rep...
2388
  		goto fail;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2389

cb9e3c292   Andrey Ryabinin   mm: vmalloc: pass...
2390
2391
  	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
  				vm_flags, start, end, node, gfp_mask, caller);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2392
  	if (!area)
de7d2b567   Joe Perches   mm/vmalloc.c: rep...
2393
  		goto fail;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2394

3722e13cf   Wanpeng Li   mm/vmalloc: don't...
2395
  	addr = __vmalloc_area_node(area, gfp_mask, prot, node);
1368edf06   Mel Gorman   mm: vmalloc: chec...
2396
  	if (!addr)
b82225f3f   Wanpeng Li   revert mm/vmalloc...
2397
  		return NULL;
89219d37a   Catalin Marinas   kmemleak: Add the...
2398
2399
  
  	/*
20fc02b47   Zhang Yanfei   mm/vmalloc.c: ren...
2400
2401
  	 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
  	 * flag. It means that vm_struct is not fully initialized.
4341fa454   Joonsoo Kim   mm, vmalloc: remo...
2402
  	 * Now, it is fully initialized, so remove this flag here.
f5252e009   Mitsuo Hayasaka   mm: avoid null po...
2403
  	 */
20fc02b47   Zhang Yanfei   mm/vmalloc.c: ren...
2404
  	clear_vm_uninitialized_flag(area);
f5252e009   Mitsuo Hayasaka   mm: avoid null po...
2405

94f4a1618   Catalin Marinas   mm: kmemleak: tre...
2406
  	kmemleak_vmalloc(area, size, gfp_mask);
89219d37a   Catalin Marinas   kmemleak: Add the...
2407
2408
  
  	return addr;
de7d2b567   Joe Perches   mm/vmalloc.c: rep...
2409
2410
  
  fail:
a8e99259e   Michal Hocko   mm, page_alloc: w...
2411
  	warn_alloc(gfp_mask, NULL,
7877cdcc3   Michal Hocko   mm: consolidate w...
2412
  			  "vmalloc: allocation failure: %lu bytes", real_size);
de7d2b567   Joe Perches   mm/vmalloc.c: rep...
2413
  	return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2414
  }
153178edc   Uladzislau Rezki (Sony)   vmalloc: export _...
2415
2416
2417
2418
2419
2420
2421
2422
  /*
   * This is only for performance analysis of vmalloc and stress purpose.
   * It is required by vmalloc test module, therefore do not use it other
   * than that.
   */
  #ifdef CONFIG_TEST_VMALLOC_MODULE
  EXPORT_SYMBOL_GPL(__vmalloc_node_range);
  #endif
d0a21265d   David Rientjes   mm: unify module_...
2423
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2424
2425
2426
2427
2428
2429
2430
   * __vmalloc_node - allocate virtually contiguous memory
   * @size:	    allocation size
   * @align:	    desired alignment
   * @gfp_mask:	    flags for the page level allocator
   * @prot:	    protection mask for the allocated pages
   * @node:	    node to use for allocation or NUMA_NO_NODE
   * @caller:	    caller's return address
a7c3e901a   Michal Hocko   mm: introduce kv[...
2431
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2432
2433
2434
   * Allocate enough pages to cover @size from the page level
   * allocator with @gfp_mask flags.  Map them into contiguous
   * kernel virtual space, using a pagetable protection of @prot.
a7c3e901a   Michal Hocko   mm: introduce kv[...
2435
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2436
2437
   * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
   * and __GFP_NOFAIL are not supported
a7c3e901a   Michal Hocko   mm: introduce kv[...
2438
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2439
2440
   * Any use of gfp flags outside of GFP_KERNEL should be consulted
   * with mm people.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2441
2442
   *
   * Return: pointer to the allocated memory or %NULL on error
d0a21265d   David Rientjes   mm: unify module_...
2443
   */
8594a21cf   Michal Hocko   mm, vmalloc: fix ...
2444
  static void *__vmalloc_node(unsigned long size, unsigned long align,
d0a21265d   David Rientjes   mm: unify module_...
2445
  			    gfp_t gfp_mask, pgprot_t prot,
5e6cafc83   Marek Szyprowski   mm: vmalloc: use ...
2446
  			    int node, const void *caller)
d0a21265d   David Rientjes   mm: unify module_...
2447
2448
  {
  	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
cb9e3c292   Andrey Ryabinin   mm: vmalloc: pass...
2449
  				gfp_mask, prot, 0, node, caller);
d0a21265d   David Rientjes   mm: unify module_...
2450
  }
930fc45a4   Christoph Lameter   [PATCH] vmalloc_node
2451
2452
  /*
   * Like __vmalloc_node() with alignment 1 and no node preference; the
   * return address of this function is recorded as the caller.
   */
  void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
  {
  	const void *caller = __builtin_return_address(0);

  	return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, caller);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2456
  EXPORT_SYMBOL(__vmalloc);
8594a21cf   Michal Hocko   mm, vmalloc: fix ...
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
  /*
   * PAGE_KERNEL allocation of @size bytes on @node with gfp @flags and
   * alignment 1, using this function's return address as the caller.
   */
  static inline void *__vmalloc_node_flags(unsigned long size,
  					int node, gfp_t flags)
  {
  	const void *caller = __builtin_return_address(0);

  	return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
  }
  
  
  /*
   * Same as __vmalloc_node_flags(), except that the caller address to
   * record is supplied explicitly in @caller.
   */
  void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
  				  void *caller)
  {
  	const unsigned long align = 1;

  	return __vmalloc_node(size, align, flags, PAGE_KERNEL, node, caller);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2470
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2471
2472
2473
2474
2475
   * vmalloc - allocate virtually contiguous memory
   * @size:    allocation size
   *
   * Allocate enough pages to cover @size from the page level
   * allocator and map them into contiguous kernel virtual space.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2476
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2477
2478
   * For tight control over page level allocator and protection flags
   * use __vmalloc() instead.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2479
2480
   *
   * Return: pointer to the allocated memory or %NULL on error
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2481
2482
2483
   */
  void *vmalloc(unsigned long size)
  {
00ef2d2f8   David Rientjes   mm: use NUMA_NO_NODE
2484
  	return __vmalloc_node_flags(size, NUMA_NO_NODE,
19809c2da   Michal Hocko   mm, vmalloc: use ...
2485
  				    GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2486
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2487
  EXPORT_SYMBOL(vmalloc);
930fc45a4   Christoph Lameter   [PATCH] vmalloc_node
2488
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2489
2490
2491
2492
2493
2494
2495
2496
2497
   * vzalloc - allocate virtually contiguous memory with zero fill
   * @size:    allocation size
   *
   * Allocate enough pages to cover @size from the page level
   * allocator and map them into contiguous kernel virtual space.
   * The memory allocated is set to zero.
   *
   * For tight control over page level allocator and protection flags
   * use __vmalloc() instead.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2498
2499
   *
   * Return: pointer to the allocated memory or %NULL on error
e1ca7788d   Dave Young   mm: add vzalloc()...
2500
2501
2502
   */
  void *vzalloc(unsigned long size)
  {
00ef2d2f8   David Rientjes   mm: use NUMA_NO_NODE
2503
  	return __vmalloc_node_flags(size, NUMA_NO_NODE,
19809c2da   Michal Hocko   mm, vmalloc: use ...
2504
  				GFP_KERNEL | __GFP_ZERO);
e1ca7788d   Dave Young   mm: add vzalloc()...
2505
2506
2507
2508
  }
  EXPORT_SYMBOL(vzalloc);
  
  /**
ead04089b   Rolf Eike Beer   [PATCH] Fix kerne...
2509
2510
   * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
   * @size: allocation size
833423143   Nick Piggin   [PATCH] mm: intro...
2511
   *
ead04089b   Rolf Eike Beer   [PATCH] Fix kerne...
2512
2513
   * The resulting memory area is zeroed so it can be mapped to userspace
   * without leaking data.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2514
2515
   *
   * Return: pointer to the allocated memory or %NULL on error
833423143   Nick Piggin   [PATCH] mm: intro...
2516
2517
2518
   */
  void *vmalloc_user(unsigned long size)
  {
bc84c5352   Roman Penyaev   mm/vmalloc: pass ...
2519
2520
2521
2522
  	return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
  				    GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
  				    VM_USERMAP, NUMA_NO_NODE,
  				    __builtin_return_address(0));
833423143   Nick Piggin   [PATCH] mm: intro...
2523
2524
2525
2526
  }
  EXPORT_SYMBOL(vmalloc_user);
  
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2527
2528
2529
   * vmalloc_node - allocate memory on a specific node
   * @size:	  allocation size
   * @node:	  numa node
930fc45a4   Christoph Lameter   [PATCH] vmalloc_node
2530
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2531
2532
   * Allocate enough pages to cover @size from the page level
   * allocator and map them into contiguous kernel virtual space.
930fc45a4   Christoph Lameter   [PATCH] vmalloc_node
2533
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2534
2535
   * For tight control over page level allocator and protection flags
   * use __vmalloc() instead.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2536
2537
   *
   * Return: pointer to the allocated memory or %NULL on error
930fc45a4   Christoph Lameter   [PATCH] vmalloc_node
2538
2539
2540
   */
  void *vmalloc_node(unsigned long size, int node)
  {
19809c2da   Michal Hocko   mm, vmalloc: use ...
2541
  	return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
230169693   Christoph Lameter   vmallocinfo: add ...
2542
  					node, __builtin_return_address(0));
930fc45a4   Christoph Lameter   [PATCH] vmalloc_node
2543
2544
  }
  EXPORT_SYMBOL(vmalloc_node);
e1ca7788d   Dave Young   mm: add vzalloc()...
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
  /**
   * vzalloc_node - allocate memory on a specific node with zero fill
   * @size:	allocation size
   * @node:	numa node
   *
   * Allocate enough pages to cover @size from the page level
   * allocator and map them into contiguous kernel virtual space.
   * The memory allocated is set to zero.
   *
   * For tight control over page level allocator and protection flags
   * use __vmalloc_node() instead.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2556
2557
   *
   * Return: pointer to the allocated memory or %NULL on error
e1ca7788d   Dave Young   mm: add vzalloc()...
2558
2559
2560
2561
   */
  void *vzalloc_node(unsigned long size, int node)
  {
  	return __vmalloc_node_flags(size, node,
19809c2da   Michal Hocko   mm, vmalloc: use ...
2562
  			 GFP_KERNEL | __GFP_ZERO);
e1ca7788d   Dave Young   mm: add vzalloc()...
2563
2564
  }
  EXPORT_SYMBOL(vzalloc_node);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2565
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2566
2567
   * vmalloc_exec - allocate virtually contiguous, executable memory
   * @size:	  allocation size
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2568
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2569
2570
2571
   * Kernel-internal function to allocate enough pages to cover @size
   * from the page level allocator and map them into contiguous and
   * executable kernel virtual space.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2572
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2573
2574
   * For tight control over page level allocator and protection flags
   * use __vmalloc() instead.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2575
2576
   *
   * Return: pointer to the allocated memory or %NULL on error
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2577
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2578
2579
  void *vmalloc_exec(unsigned long size)
  {
868b104d7   Rick Edgecombe   mm/vmalloc: Add f...
2580
2581
2582
  	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
  			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
  			NUMA_NO_NODE, __builtin_return_address(0));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2583
  }
0d08e0d3a   Andi Kleen   [PATCH] x86-64: F...
2584
  #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
698d0831b   Michal Hocko   vmalloc: fix __GF...
2585
  #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
0d08e0d3a   Andi Kleen   [PATCH] x86-64: F...
2586
  #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
698d0831b   Michal Hocko   vmalloc: fix __GF...
2587
  #define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
0d08e0d3a   Andi Kleen   [PATCH] x86-64: F...
2588
  #else
698d0831b   Michal Hocko   vmalloc: fix __GF...
2589
2590
2591
2592
2593
  /*
   * 64b systems should always have either DMA or DMA32 zones. For others
   * GFP_DMA32 should do the right thing and use the normal zone.
   */
  /* Parenthesized so the expansion stays intact inside larger expressions. */
  #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
0d08e0d3a   Andi Kleen   [PATCH] x86-64: F...
2594
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2595
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2596
2597
   * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
   * @size:	allocation size
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2598
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2599
2600
   * Allocate enough 32bit PA addressable pages to cover @size from the
   * page level allocator and map them into contiguous kernel virtual space.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2601
2602
   *
   * Return: pointer to the allocated memory or %NULL on error
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2603
2604
2605
   */
  void *vmalloc_32(unsigned long size)
  {
2dca6999e   David Miller   mm, perf_event: M...
2606
  	return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
00ef2d2f8   David Rientjes   mm: use NUMA_NO_NODE
2607
  			      NUMA_NO_NODE, __builtin_return_address(0));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2608
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2609
  EXPORT_SYMBOL(vmalloc_32);
833423143   Nick Piggin   [PATCH] mm: intro...
2610
  /**
ead04089b   Rolf Eike Beer   [PATCH] Fix kerne...
2611
   * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2612
   * @size:	     allocation size
ead04089b   Rolf Eike Beer   [PATCH] Fix kerne...
2613
2614
2615
   *
   * The resulting memory area is 32bit addressable and zeroed so it can be
   * mapped to userspace without leaking data.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2616
2617
   *
   * Return: pointer to the allocated memory or %NULL on error
833423143   Nick Piggin   [PATCH] mm: intro...
2618
2619
2620
   */
  void *vmalloc_32_user(unsigned long size)
  {
bc84c5352   Roman Penyaev   mm/vmalloc: pass ...
2621
2622
2623
2624
  	return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
  				    GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
  				    VM_USERMAP, NUMA_NO_NODE,
  				    __builtin_return_address(0));
833423143   Nick Piggin   [PATCH] mm: intro...
2625
2626
  }
  EXPORT_SYMBOL(vmalloc_32_user);
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
  /*
   * small helper routine , copy contents to buf from addr.
   * If the page is not present, fill zero.
   */
  
  static int aligned_vread(char *buf, char *addr, unsigned long count)
  {
  	struct page *p;
  	int copied = 0;
  
  	while (count) {
  		unsigned long offset, length;
891c49abf   Alexander Kuleshov   mm/vmalloc: use o...
2639
  		offset = offset_in_page(addr);
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
  		length = PAGE_SIZE - offset;
  		if (length > count)
  			length = count;
  		p = vmalloc_to_page(addr);
  		/*
  		 * To do safe access to this _mapped_ area, we need
  		 * lock. But adding lock here means that we need to add
  		 * overhead of vmalloc()/vfree() calles for this _debug_
  		 * interface, rarely used. Instead of that, we'll use
  		 * kmap() and get small overhead in this access function.
  		 */
  		if (p) {
  			/*
  			 * we can expect USER0 is not used (see vread/vwrite's
  			 * function description)
  			 */
9b04c5fec   Cong Wang   mm: remove the se...
2656
  			void *map = kmap_atomic(p);
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2657
  			memcpy(buf, map + offset, length);
9b04c5fec   Cong Wang   mm: remove the se...
2658
  			kunmap_atomic(map);
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
  		} else
  			memset(buf, 0, length);
  
  		addr += length;
  		buf += length;
  		copied += length;
  		count -= length;
  	}
  	return copied;
  }
  
  static int aligned_vwrite(char *buf, char *addr, unsigned long count)
  {
  	struct page *p;
  	int copied = 0;
  
  	while (count) {
  		unsigned long offset, length;
891c49abf   Alexander Kuleshov   mm/vmalloc: use o...
2677
  		offset = offset_in_page(addr);
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
  		length = PAGE_SIZE - offset;
  		if (length > count)
  			length = count;
  		p = vmalloc_to_page(addr);
  		/*
  		 * To do safe access to this _mapped_ area, we need
  		 * lock. But adding lock here means that we need to add
  		 * overhead of vmalloc()/vfree() calles for this _debug_
  		 * interface, rarely used. Instead of that, we'll use
  		 * kmap() and get small overhead in this access function.
  		 */
  		if (p) {
  			/*
  			 * we can expect USER0 is not used (see vread/vwrite's
  			 * function description)
  			 */
9b04c5fec   Cong Wang   mm: remove the se...
2694
  			void *map = kmap_atomic(p);
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2695
  			memcpy(map + offset, buf, length);
9b04c5fec   Cong Wang   mm: remove the se...
2696
  			kunmap_atomic(map);
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
  		}
  		addr += length;
  		buf += length;
  		copied += length;
  		count -= length;
  	}
  	return copied;
  }
  
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2707
2708
2709
2710
2711
   * vread() - read vmalloc area in a safe way.
   * @buf:     buffer for reading data
   * @addr:    vm address.
   * @count:   number of bytes to be read.
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
   * This function checks that addr is a valid vmalloc'ed area, and
   * copy data from that area to a given buffer. If the given memory range
   * of [addr...addr+count) includes some valid address, data is copied to
   * proper area of @buf. If there are memory holes, they'll be zero-filled.
   * IOREMAP area is treated as memory hole and no copy is done.
   *
   * If [addr...addr+count) doesn't intersect any live
   * vm_struct area, returns 0. @buf should be kernel's buffer.
   *
   * Note: In usual ops, vread() is never necessary because the caller
   * should know vmalloc() area is valid and can use memcpy().
   * This is for routines which have to access vmalloc area without
d9009d67f   Geert Uytterhoeven   mm/vmalloc.c: spe...
2724
   * any information, as /dev/kmem.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2725
2726
2727
2728
   *
   * Return: number of bytes for which addr and buf should be increased
   * (same number as @count) or %0 if [addr...addr+count) doesn't
   * include any intersection with valid vmalloc area
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2729
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2730
2731
  long vread(char *buf, char *addr, unsigned long count)
  {
e81ce85f9   Joonsoo Kim   mm, vmalloc: iter...
2732
2733
  	struct vmap_area *va;
  	struct vm_struct *vm;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2734
  	char *vaddr, *buf_start = buf;
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2735
  	unsigned long buflen = count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2736
2737
2738
2739
2740
  	unsigned long n;
  
  	/* Don't allow overflow */
  	if ((unsigned long) addr + count < count)
  		count = -(unsigned long) addr;
e81ce85f9   Joonsoo Kim   mm, vmalloc: iter...
2741
2742
2743
2744
  	spin_lock(&vmap_area_lock);
  	list_for_each_entry(va, &vmap_area_list, list) {
  		if (!count)
  			break;
688fcbfc0   Pengfei Li   mm/vmalloc: modif...
2745
  		if (!va->vm)
e81ce85f9   Joonsoo Kim   mm, vmalloc: iter...
2746
2747
2748
2749
  			continue;
  
  		vm = va->vm;
  		vaddr = (char *) vm->addr;
762216ab4   Wanpeng Li   mm/vmalloc: use w...
2750
  		if (addr >= vaddr + get_vm_area_size(vm))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2751
2752
2753
2754
2755
2756
2757
2758
2759
  			continue;
  		while (addr < vaddr) {
  			if (count == 0)
  				goto finished;
  			*buf = '\0';
  			buf++;
  			addr++;
  			count--;
  		}
762216ab4   Wanpeng Li   mm/vmalloc: use w...
2760
  		n = vaddr + get_vm_area_size(vm) - addr;
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2761
2762
  		if (n > count)
  			n = count;
e81ce85f9   Joonsoo Kim   mm, vmalloc: iter...
2763
  		if (!(vm->flags & VM_IOREMAP))
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2764
2765
2766
2767
2768
2769
  			aligned_vread(buf, addr, n);
  		else /* IOREMAP area is treated as memory hole */
  			memset(buf, 0, n);
  		buf += n;
  		addr += n;
  		count -= n;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2770
2771
  	}
  finished:
e81ce85f9   Joonsoo Kim   mm, vmalloc: iter...
2772
  	spin_unlock(&vmap_area_lock);
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2773
2774
2775
2776
2777
2778
2779
2780
  
  	if (buf == buf_start)
  		return 0;
  	/* zero-fill memory holes */
  	if (buf != buf_start + buflen)
  		memset(buf, 0, buflen - (buf - buf_start));
  
  	return buflen;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2781
  }
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2782
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2783
2784
2785
2786
2787
   * vwrite() - write vmalloc area in a safe way.
   * @buf:      buffer for source data
   * @addr:     vm address.
   * @count:    number of bytes to be written.
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
   * This function checks that addr is a valid vmalloc'ed area, and
   * copy data from a buffer to the given addr. If specified range of
   * [addr...addr+count) includes some valid address, data is copied from
   * proper area of @buf. If there are memory holes, no copy to hole.
   * IOREMAP area is treated as memory hole and no copy is done.
   *
   * If [addr...addr+count) doesn't intersect any live
   * vm_struct area, returns 0. @buf should be kernel's buffer.
   *
   * Note: In usual ops, vwrite() is never necessary because the caller
   * should know vmalloc() area is valid and can use memcpy().
   * This is for routines which have to access vmalloc area without
d9009d67f   Geert Uytterhoeven   mm/vmalloc.c: spe...
2800
   * any information, as /dev/kmem.
a862f68a8   Mike Rapoport   docs/core-api/mm:...
2801
2802
2803
2804
   *
   * Return: number of bytes for which addr and buf should be
   * increased (same number as @count) or %0 if [addr...addr+count)
   * doesn't include any intersection with valid vmalloc area
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2805
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2806
2807
  long vwrite(char *buf, char *addr, unsigned long count)
  {
e81ce85f9   Joonsoo Kim   mm, vmalloc: iter...
2808
2809
  	struct vmap_area *va;
  	struct vm_struct *vm;
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2810
2811
2812
  	char *vaddr;
  	unsigned long n, buflen;
  	int copied = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2813
2814
2815
2816
  
  	/* Don't allow overflow */
  	if ((unsigned long) addr + count < count)
  		count = -(unsigned long) addr;
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2817
  	buflen = count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2818

e81ce85f9   Joonsoo Kim   mm, vmalloc: iter...
2819
2820
2821
2822
  	spin_lock(&vmap_area_lock);
  	list_for_each_entry(va, &vmap_area_list, list) {
  		if (!count)
  			break;
688fcbfc0   Pengfei Li   mm/vmalloc: modif...
2823
  		if (!va->vm)
e81ce85f9   Joonsoo Kim   mm, vmalloc: iter...
2824
2825
2826
2827
  			continue;
  
  		vm = va->vm;
  		vaddr = (char *) vm->addr;
762216ab4   Wanpeng Li   mm/vmalloc: use w...
2828
  		if (addr >= vaddr + get_vm_area_size(vm))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2829
2830
2831
2832
2833
2834
2835
2836
  			continue;
  		while (addr < vaddr) {
  			if (count == 0)
  				goto finished;
  			buf++;
  			addr++;
  			count--;
  		}
762216ab4   Wanpeng Li   mm/vmalloc: use w...
2837
  		n = vaddr + get_vm_area_size(vm) - addr;
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2838
2839
  		if (n > count)
  			n = count;
e81ce85f9   Joonsoo Kim   mm, vmalloc: iter...
2840
  		if (!(vm->flags & VM_IOREMAP)) {
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2841
2842
2843
2844
2845
2846
  			aligned_vwrite(buf, addr, n);
  			copied++;
  		}
  		buf += n;
  		addr += n;
  		count -= n;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2847
2848
  	}
  finished:
e81ce85f9   Joonsoo Kim   mm, vmalloc: iter...
2849
  	spin_unlock(&vmap_area_lock);
d0107eb07   KAMEZAWA Hiroyuki   kcore: fix vread/...
2850
2851
2852
  	if (!copied)
  		return 0;
  	return buflen;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2853
  }
833423143   Nick Piggin   [PATCH] mm: intro...
2854
2855
  
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2856
2857
2858
2859
   * remap_vmalloc_range_partial - map vmalloc pages to userspace
   * @vma:		vma to cover
   * @uaddr:		target user address to start at
   * @kaddr:		virtual address of vmalloc kernel memory
f4f235309   Jann Horn   vmalloc: fix rema...
2860
   * @pgoff:		offset from @kaddr to start at
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2861
   * @size:		size of map area
7682486b3   Randy Dunlap   mm: fix various k...
2862
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2863
   * Returns:	0 for success, -Exxx on failure
833423143   Nick Piggin   [PATCH] mm: intro...
2864
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2865
2866
2867
2868
   * This function checks that @kaddr is a valid vmalloc'ed area,
   * and that it is big enough to cover the range starting at
   * @uaddr in @vma. Will return failure if that criteria isn't
   * met.
833423143   Nick Piggin   [PATCH] mm: intro...
2869
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2870
   * Similar to remap_pfn_range() (see mm/memory.c)
833423143   Nick Piggin   [PATCH] mm: intro...
2871
   */
e69e9d4ae   HATAYAMA Daisuke   vmalloc: introduc...
2872
  int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
f4f235309   Jann Horn   vmalloc: fix rema...
2873
2874
  				void *kaddr, unsigned long pgoff,
  				unsigned long size)
833423143   Nick Piggin   [PATCH] mm: intro...
2875
2876
  {
  	struct vm_struct *area;
f4f235309   Jann Horn   vmalloc: fix rema...
2877
2878
2879
2880
2881
  	unsigned long off;
  	unsigned long end_index;
  
  	if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
  		return -EINVAL;
833423143   Nick Piggin   [PATCH] mm: intro...
2882

e69e9d4ae   HATAYAMA Daisuke   vmalloc: introduc...
2883
2884
2885
  	size = PAGE_ALIGN(size);
  
  	if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
833423143   Nick Piggin   [PATCH] mm: intro...
2886
  		return -EINVAL;
e69e9d4ae   HATAYAMA Daisuke   vmalloc: introduc...
2887
  	area = find_vm_area(kaddr);
833423143   Nick Piggin   [PATCH] mm: intro...
2888
  	if (!area)
db64fe022   Nick Piggin   mm: rewrite vmap ...
2889
  		return -EINVAL;
833423143   Nick Piggin   [PATCH] mm: intro...
2890

fe9041c24   Christoph Hellwig   vmalloc: lift the...
2891
  	if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
db64fe022   Nick Piggin   mm: rewrite vmap ...
2892
  		return -EINVAL;
833423143   Nick Piggin   [PATCH] mm: intro...
2893

f4f235309   Jann Horn   vmalloc: fix rema...
2894
2895
  	if (check_add_overflow(size, off, &end_index) ||
  	    end_index > get_vm_area_size(area))
db64fe022   Nick Piggin   mm: rewrite vmap ...
2896
  		return -EINVAL;
f4f235309   Jann Horn   vmalloc: fix rema...
2897
  	kaddr += off;
833423143   Nick Piggin   [PATCH] mm: intro...
2898

833423143   Nick Piggin   [PATCH] mm: intro...
2899
  	do {
e69e9d4ae   HATAYAMA Daisuke   vmalloc: introduc...
2900
  		struct page *page = vmalloc_to_page(kaddr);
db64fe022   Nick Piggin   mm: rewrite vmap ...
2901
  		int ret;
833423143   Nick Piggin   [PATCH] mm: intro...
2902
2903
2904
2905
2906
  		ret = vm_insert_page(vma, uaddr, page);
  		if (ret)
  			return ret;
  
  		uaddr += PAGE_SIZE;
e69e9d4ae   HATAYAMA Daisuke   vmalloc: introduc...
2907
2908
2909
  		kaddr += PAGE_SIZE;
  		size -= PAGE_SIZE;
  	} while (size > 0);
833423143   Nick Piggin   [PATCH] mm: intro...
2910

314e51b98   Konstantin Khlebnikov   mm: kill vma flag...
2911
  	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
833423143   Nick Piggin   [PATCH] mm: intro...
2912

db64fe022   Nick Piggin   mm: rewrite vmap ...
2913
  	return 0;
833423143   Nick Piggin   [PATCH] mm: intro...
2914
  }
e69e9d4ae   HATAYAMA Daisuke   vmalloc: introduc...
2915
2916
2917
  EXPORT_SYMBOL(remap_vmalloc_range_partial);
  
  /**
   * remap_vmalloc_range - map vmalloc pages to userspace
   * @vma:		vma to cover (map full range of vma)
   * @addr:		vmalloc memory
   * @pgoff:		number of pages into addr before first page to map
   *
   * This function checks that addr is a valid vmalloc'ed area, and
   * that it is big enough to cover the vma. Will return failure if
   * that criteria isn't met.
   *
   * It is remap_vmalloc_range_partial() applied to the whole of @vma:
   * the mapping starts at vma->vm_start and spans the vma's full length.
   *
   * Similar to remap_pfn_range() (see mm/memory.c)
   *
   * Return: 0 for success, -Exxx on failure
   */
  int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
  						unsigned long pgoff)
  {
  	unsigned long user_start = vma->vm_start;
  	unsigned long span = vma->vm_end - vma->vm_start;

  	return remap_vmalloc_range_partial(vma, user_start, addr, pgoff, span);
  }
  EXPORT_SYMBOL(remap_vmalloc_range);
1eeb66a1b   Christoph Hellwig   move die notifier...
2939
  /*
66f28e110   Joerg Roedel   x86/mm: split vma...
2940
2941
   * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose
   * not to have one.
3f8fd02b1   Joerg Roedel   mm/vmalloc: Sync ...
2942
2943
2944
   *
   * The purpose of this function is to make sure the vmalloc area
   * mappings are identical in all page-tables in the system.
1eeb66a1b   Christoph Hellwig   move die notifier...
2945
   */
66f28e110   Joerg Roedel   x86/mm: split vma...
2946
  void __weak vmalloc_sync_mappings(void)
1eeb66a1b   Christoph Hellwig   move die notifier...
2947
2948
  {
  }
5f4352fbf   Jeremy Fitzhardinge   Allocate and free...
2949

66f28e110   Joerg Roedel   x86/mm: split vma...
2950
2951
2952
  /*
   * Default no-op stub, declared __weak; it is the version used when the
   * architecture chose not to provide vmalloc_sync_unmappings() itself.
   */
  void __weak vmalloc_sync_unmappings(void)
  {
  }
5f4352fbf   Jeremy Fitzhardinge   Allocate and free...
2953

8b1e0f81f   Anshuman Khandual   mm/pgtable: drop ...
2954
  /*
   * Callback handed to apply_to_page_range() by alloc_vm_area().
   *
   * @data is either NULL or the address of a pte_t ** cursor. With a
   * cursor, @pte is stored in the slot the cursor points at and the cursor
   * moves on to the next slot. @addr is unused. Always returns 0.
   */
  static int f(pte_t *pte, unsigned long addr, void *data)
  {
  	pte_t ***cursor = data;

  	if (!cursor)
  		return 0;

  	**cursor = pte;
  	++*cursor;

  	return 0;
  }
  
  /**
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2966
2967
2968
   * alloc_vm_area - allocate a range of kernel address space
   * @size:	   size of the area
   * @ptes:	   returns the PTEs for the address space
7682486b3   Randy Dunlap   mm: fix various k...
2969
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2970
   * Returns:	NULL on failure, vm_struct on success
5f4352fbf   Jeremy Fitzhardinge   Allocate and free...
2971
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2972
2973
2974
   * This function reserves a range of kernel address space, and
   * allocates pagetables to map that range.  No actual mappings
   * are created.
cd12909cb   David Vrabel   xen: map foreign ...
2975
   *
92eac1681   Mike Rapoport   docs/mm: vmalloc:...
2976
2977
   * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
   * allocated for the VM area are returned.
5f4352fbf   Jeremy Fitzhardinge   Allocate and free...
2978
   */
cd12909cb   David Vrabel   xen: map foreign ...
2979
  struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
5f4352fbf   Jeremy Fitzhardinge   Allocate and free...
2980
2981
  {
  	struct vm_struct *area;
230169693   Christoph Lameter   vmallocinfo: add ...
2982
2983
  	area = get_vm_area_caller(size, VM_IOREMAP,
  				__builtin_return_address(0));
5f4352fbf   Jeremy Fitzhardinge   Allocate and free...
2984
2985
2986
2987
2988
2989
2990
2991
  	if (area == NULL)
  		return NULL;
  
  	/*
  	 * This ensures that page tables are constructed for this region
  	 * of kernel virtual address space and mapped into init_mm.
  	 */
  	if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
cd12909cb   David Vrabel   xen: map foreign ...
2992
  				size, f, ptes ? &ptes : NULL)) {
5f4352fbf   Jeremy Fitzhardinge   Allocate and free...
2993
2994
2995
  		free_vm_area(area);
  		return NULL;
  	}
5f4352fbf   Jeremy Fitzhardinge   Allocate and free...
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
  	return area;
  }
  EXPORT_SYMBOL_GPL(alloc_vm_area);
  
  /*
   * Remove the vm area at area->addr and free @area itself.
   *
   * BUGs if the area removed for that address is not @area.
   */
  void free_vm_area(struct vm_struct *area)
  {
  	struct vm_struct *removed = remove_vm_area(area->addr);

  	BUG_ON(removed != area);
  	kfree(area);
  }
  EXPORT_SYMBOL_GPL(free_vm_area);
a10aa5798   Christoph Lameter   vmalloc: show vma...
3008

4f8b02b4e   Tejun Heo   vmalloc: pcpu_get...
3009
  #ifdef CONFIG_SMP
ca23e405e   Tejun Heo   vmalloc: implemen...
3010
3011
  static struct vmap_area *node_to_va(struct rb_node *n)
  {
4583e7731   Geliang Tang   mm/vmalloc.c: use...
3012
  	return rb_entry_safe(n, struct vmap_area, rb_node);
ca23e405e   Tejun Heo   vmalloc: implemen...
3013
3014
3015
  }
  
  /**
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3016
3017
   * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
   * @addr: target address
ca23e405e   Tejun Heo   vmalloc: implemen...
3018
   *
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3019
3020
3021
3022
   * Returns: vmap_area if it is found. If there is no such area
   *   the first highest(reverse order) vmap_area is returned
   *   i.e. va->va_start < addr && va->va_end < addr or NULL
   *   if there are no any areas before @addr.
ca23e405e   Tejun Heo   vmalloc: implemen...
3023
   */
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3024
3025
  static struct vmap_area *
  pvm_find_va_enclose_addr(unsigned long addr)
ca23e405e   Tejun Heo   vmalloc: implemen...
3026
  {
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3027
3028
3029
3030
3031
  	struct vmap_area *va, *tmp;
  	struct rb_node *n;
  
  	n = free_vmap_area_root.rb_node;
  	va = NULL;
ca23e405e   Tejun Heo   vmalloc: implemen...
3032
3033
  
  	while (n) {
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3034
3035
3036
3037
3038
  		tmp = rb_entry(n, struct vmap_area, rb_node);
  		if (tmp->va_start <= addr) {
  			va = tmp;
  			if (tmp->va_end >= addr)
  				break;
ca23e405e   Tejun Heo   vmalloc: implemen...
3039
  			n = n->rb_right;
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3040
3041
3042
  		} else {
  			n = n->rb_left;
  		}
ca23e405e   Tejun Heo   vmalloc: implemen...
3043
  	}
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3044
  	return va;
ca23e405e   Tejun Heo   vmalloc: implemen...
3045
3046
3047
  }
  
  /**
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3048
3049
3050
3051
3052
   * pvm_determine_end_from_reverse - find the highest aligned address
   * of free block below VMALLOC_END
   * @va:
   *   in - the VA we start the search(reverse order);
   *   out - the VA with the highest aligned end address.
ca23e405e   Tejun Heo   vmalloc: implemen...
3053
   *
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3054
   * Returns: determined end address within vmap_area
ca23e405e   Tejun Heo   vmalloc: implemen...
3055
   */
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3056
3057
  static unsigned long
  pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
ca23e405e   Tejun Heo   vmalloc: implemen...
3058
  {
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3059
  	unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
ca23e405e   Tejun Heo   vmalloc: implemen...
3060
  	unsigned long addr;
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3061
3062
3063
3064
3065
3066
3067
  	if (likely(*va)) {
  		list_for_each_entry_from_reverse((*va),
  				&free_vmap_area_list, list) {
  			addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
  			if ((*va)->va_start < addr)
  				return addr;
  		}
ca23e405e   Tejun Heo   vmalloc: implemen...
3068
  	}
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3069
  	return 0;
ca23e405e   Tejun Heo   vmalloc: implemen...
3070
3071
3072
3073
3074
3075
3076
3077
  }
  
  /**
   * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
   * @offsets: array containing offset of each area
   * @sizes: array containing size of each area
   * @nr_vms: the number of areas to allocate
   * @align: alignment, all entries in @offsets and @sizes must be aligned to this
ca23e405e   Tejun Heo   vmalloc: implemen...
3078
3079
3080
3081
3082
3083
   *
   * Returns: kmalloc'd vm_struct pointer array pointing to allocated
   *	    vm_structs on success, %NULL on failure
   *
   * Percpu allocator wants to use congruent vm areas so that it can
   * maintain the offsets among percpu areas.  This function allocates
ec3f64fc9   David Rientjes   mm: remove gfp ma...
3084
3085
3086
3087
   * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
   * be scattered pretty far, distance between two areas easily going up
   * to gigabytes.  To avoid interacting with regular vmallocs, these
   * areas are allocated from top.
ca23e405e   Tejun Heo   vmalloc: implemen...
3088
   *
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3089
3090
3091
3092
3093
3094
   * Despite its complicated look, this allocator is rather simple. It
   * does everything top-down and scans free blocks from the end looking
   * for matching base. While scanning, if any of the areas do not fit the
   * base address is pulled down to fit the area. Scanning is repeated till
   * all the areas fit and then all necessary data structures are inserted
   * and the result is returned.
ca23e405e   Tejun Heo   vmalloc: implemen...
3095
3096
3097
   */
  struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
  				     const size_t *sizes, int nr_vms,
ec3f64fc9   David Rientjes   mm: remove gfp ma...
3098
  				     size_t align)
ca23e405e   Tejun Heo   vmalloc: implemen...
3099
3100
3101
  {
  	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
  	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3102
  	struct vmap_area **vas, *va;
ca23e405e   Tejun Heo   vmalloc: implemen...
3103
3104
  	struct vm_struct **vms;
  	int area, area2, last_area, term_area;
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3105
  	unsigned long base, start, size, end, last_end;
ca23e405e   Tejun Heo   vmalloc: implemen...
3106
  	bool purged = false;
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3107
  	enum fit_type type;
ca23e405e   Tejun Heo   vmalloc: implemen...
3108

ca23e405e   Tejun Heo   vmalloc: implemen...
3109
  	/* verify parameters and allocate data structures */
891c49abf   Alexander Kuleshov   mm/vmalloc: use o...
3110
  	BUG_ON(offset_in_page(align) || !is_power_of_2(align));
ca23e405e   Tejun Heo   vmalloc: implemen...
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
  	for (last_area = 0, area = 0; area < nr_vms; area++) {
  		start = offsets[area];
  		end = start + sizes[area];
  
  		/* is everything aligned properly? */
  		BUG_ON(!IS_ALIGNED(offsets[area], align));
  		BUG_ON(!IS_ALIGNED(sizes[area], align));
  
  		/* detect the area with the highest address */
  		if (start > offsets[last_area])
  			last_area = area;
c568da282   Wei Yang   mm/vmalloc.c: hal...
3122
  		for (area2 = area + 1; area2 < nr_vms; area2++) {
ca23e405e   Tejun Heo   vmalloc: implemen...
3123
3124
  			unsigned long start2 = offsets[area2];
  			unsigned long end2 = start2 + sizes[area2];
c568da282   Wei Yang   mm/vmalloc.c: hal...
3125
  			BUG_ON(start2 < end && start < end2);
ca23e405e   Tejun Heo   vmalloc: implemen...
3126
3127
3128
3129
3130
3131
3132
3133
  		}
  	}
  	last_end = offsets[last_area] + sizes[last_area];
  
  	if (vmalloc_end - vmalloc_start < last_end) {
  		WARN_ON(true);
  		return NULL;
  	}
4d67d8605   Thomas Meyer   mm: use kcalloc()...
3134
3135
  	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
  	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
ca23e405e   Tejun Heo   vmalloc: implemen...
3136
  	if (!vas || !vms)
f1db7afd9   Kautuk Consul   mm/vmalloc.c: eli...
3137
  		goto err_free2;
ca23e405e   Tejun Heo   vmalloc: implemen...
3138
3139
  
  	for (area = 0; area < nr_vms; area++) {
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3140
  		vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
ec3f64fc9   David Rientjes   mm: remove gfp ma...
3141
  		vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
ca23e405e   Tejun Heo   vmalloc: implemen...
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
  		if (!vas[area] || !vms[area])
  			goto err_free;
  	}
  retry:
  	spin_lock(&vmap_area_lock);
  
  	/* start scanning - we scan from the top, begin with the last area */
  	area = term_area = last_area;
  	start = offsets[area];
  	end = start + sizes[area];
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3152
3153
  	va = pvm_find_va_enclose_addr(vmalloc_end);
  	base = pvm_determine_end_from_reverse(&va, align) - end;
ca23e405e   Tejun Heo   vmalloc: implemen...
3154
3155
  
  	while (true) {
ca23e405e   Tejun Heo   vmalloc: implemen...
3156
3157
3158
3159
  		/*
  		 * base might have underflowed, add last_end before
  		 * comparing.
  		 */
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3160
3161
  		if (base + last_end < vmalloc_start + last_end)
  			goto overflow;
ca23e405e   Tejun Heo   vmalloc: implemen...
3162
3163
  
  		/*
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3164
  		 * Fitting base has not been found.
ca23e405e   Tejun Heo   vmalloc: implemen...
3165
  		 */
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3166
3167
  		if (va == NULL)
  			goto overflow;
ca23e405e   Tejun Heo   vmalloc: implemen...
3168
3169
  
  		/*
5336e52c9   Kuppuswamy Sathyanarayanan   mm/vmalloc.c: fix...
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
  		 * If required width exceeds current VA block, move
  		 * base downwards and then recheck.
  		 */
  		if (base + end > va->va_end) {
  			base = pvm_determine_end_from_reverse(&va, align) - end;
  			term_area = area;
  			continue;
  		}
  
  		/*
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3180
  		 * If this VA does not fit, move base downwards and recheck.
ca23e405e   Tejun Heo   vmalloc: implemen...
3181
  		 */
5336e52c9   Kuppuswamy Sathyanarayanan   mm/vmalloc.c: fix...
3182
  		if (base + start < va->va_start) {
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3183
3184
  			va = node_to_va(rb_prev(&va->rb_node));
  			base = pvm_determine_end_from_reverse(&va, align) - end;
ca23e405e   Tejun Heo   vmalloc: implemen...
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
  			term_area = area;
  			continue;
  		}
  
  		/*
  		 * This area fits, move on to the previous one.  If
  		 * the previous one is the terminal one, we're done.
  		 */
  		area = (area + nr_vms - 1) % nr_vms;
  		if (area == term_area)
  			break;
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3196

ca23e405e   Tejun Heo   vmalloc: implemen...
3197
3198
  		start = offsets[area];
  		end = start + sizes[area];
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3199
  		va = pvm_find_va_enclose_addr(base + end);
ca23e405e   Tejun Heo   vmalloc: implemen...
3200
  	}
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3201

ca23e405e   Tejun Heo   vmalloc: implemen...
3202
3203
  	/* we've found a fitting base, insert all va's */
  	for (area = 0; area < nr_vms; area++) {
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3204
  		int ret;
ca23e405e   Tejun Heo   vmalloc: implemen...
3205

68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3206
3207
  		start = base + offsets[area];
  		size = sizes[area];
ca23e405e   Tejun Heo   vmalloc: implemen...
3208

68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
  		va = pvm_find_va_enclose_addr(start);
  		if (WARN_ON_ONCE(va == NULL))
  			/* It is a BUG(), but trigger recovery instead. */
  			goto recovery;
  
  		type = classify_va_fit_type(va, start, size);
  		if (WARN_ON_ONCE(type == NOTHING_FIT))
  			/* It is a BUG(), but trigger recovery instead. */
  			goto recovery;
  
  		ret = adjust_va_to_fit_type(va, start, size, type);
  		if (unlikely(ret))
  			goto recovery;
  
  		/* Allocated area. */
  		va = vas[area];
  		va->va_start = start;
  		va->va_end = start + size;
  
  		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
  	}
ca23e405e   Tejun Heo   vmalloc: implemen...
3230
3231
3232
3233
3234
  
  	spin_unlock(&vmap_area_lock);
  
  	/* insert all vm's */
  	for (area = 0; area < nr_vms; area++)
3645cb4a4   Zhang Yanfei   mm, vmalloc: call...
3235
3236
  		setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
  				 pcpu_get_vm_areas);
ca23e405e   Tejun Heo   vmalloc: implemen...
3237
3238
3239
  
  	kfree(vas);
  	return vms;
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
  recovery:
  	/* Remove previously inserted areas. */
  	while (area--) {
  		__free_vmap_area(vas[area]);
  		vas[area] = NULL;
  	}
  
  overflow:
  	spin_unlock(&vmap_area_lock);
  	if (!purged) {
  		purge_vmap_area_lazy();
  		purged = true;
  
  		/* Before "retry", check if we recover. */
  		for (area = 0; area < nr_vms; area++) {
  			if (vas[area])
  				continue;
  
  			vas[area] = kmem_cache_zalloc(
  				vmap_area_cachep, GFP_KERNEL);
  			if (!vas[area])
  				goto err_free;
  		}
  
  		goto retry;
  	}
ca23e405e   Tejun Heo   vmalloc: implemen...
3266
3267
  err_free:
  	for (area = 0; area < nr_vms; area++) {
68ad4a330   Uladzislau Rezki (Sony)   mm/vmalloc.c: kee...
3268
3269
  		if (vas[area])
  			kmem_cache_free(vmap_area_cachep, vas[area]);
f1db7afd9   Kautuk Consul   mm/vmalloc.c: eli...
3270
  		kfree(vms[area]);
ca23e405e   Tejun Heo   vmalloc: implemen...
3271
  	}
f1db7afd9   Kautuk Consul   mm/vmalloc.c: eli...
3272
  err_free2:
ca23e405e   Tejun Heo   vmalloc: implemen...
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
  	kfree(vas);
  	kfree(vms);
  	return NULL;
  }
  
  /**
   * pcpu_free_vm_areas - release vmalloc areas used by the percpu allocator
   * @vms: array of vm_struct pointers obtained from pcpu_get_vm_areas()
   * @nr_vms: number of entries in @vms
   *
   * Passes every entry of @vms to free_vm_area() and afterwards kfree()s
   * the @vms array itself.
   */
  void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
  {
  	int idx = 0;

  	/* Drop the individual areas first; the array is still needed here. */
  	while (idx < nr_vms) {
  		free_vm_area(vms[idx]);
  		idx++;
  	}

  	kfree(vms);
  }
4f8b02b4e   Tejun Heo   vmalloc: pcpu_get...
3293
  #endif	/* CONFIG_SMP */
a10aa5798   Christoph Lameter   vmalloc: show vma...
3294
3295
3296
  
  #ifdef CONFIG_PROC_FS
  /*
   * seq_file ->start callback: take vmap_area_lock and look up the entry at
   * offset *pos on vmap_area_list.  The lock is deliberately left held on
   * return; s_stop() is the callback that releases it.
   */
  static void *s_start(struct seq_file *m, loff_t *pos)
  	__acquires(&vmap_area_lock)
  {
  	void *entry;

  	spin_lock(&vmap_area_lock);
  	entry = seq_list_start(&vmap_area_list, *pos);

  	return entry;
  }
  
  /*
   * seq_file ->next callback: hand the current element @p, the list head and
   * the position cursor to seq_list_next() and return whatever it yields.
   */
  static void *s_next(struct seq_file *m, void *p, loff_t *pos)
  {
  	void *following;

  	following = seq_list_next(p, &vmap_area_list, pos);

  	return following;
  }
  
  static void s_stop(struct seq_file *m, void *p)
d4033afdf   Joonsoo Kim   mm, vmalloc: iter...
3309
  	__releases(&vmap_area_lock)
a10aa5798   Christoph Lameter   vmalloc: show vma...
3310
  {
d4033afdf   Joonsoo Kim   mm, vmalloc: iter...
3311
  	spin_unlock(&vmap_area_lock);
a10aa5798   Christoph Lameter   vmalloc: show vma...
3312
  }
a47a126ad   Eric Dumazet   vmallocinfo: add ...
3313
3314
  static void show_numa_info(struct seq_file *m, struct vm_struct *v)
  {
e5adfffc8   Kirill A. Shutemov   mm: use IS_ENABLE...
3315
  	if (IS_ENABLED(CONFIG_NUMA)) {
a47a126ad   Eric Dumazet   vmallocinfo: add ...
3316
3317
3318
3319
  		unsigned int nr, *counters = m->private;
  
  		if (!counters)
  			return;
af12346cd   Wanpeng Li   mm/vmalloc: rever...
3320
3321
  		if (v->flags & VM_UNINITIALIZED)
  			return;
7e5b528b4   Dmitry Vyukov   mm/vmalloc.c: fix...
3322
3323
  		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
  		smp_rmb();
af12346cd   Wanpeng Li   mm/vmalloc: rever...
3324

a47a126ad   Eric Dumazet   vmallocinfo: add ...
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
  		memset(counters, 0, nr_node_ids * sizeof(unsigned int));
  
  		for (nr = 0; nr < v->nr_pages; nr++)
  			counters[page_to_nid(v->pages[nr])]++;
  
  		for_each_node_state(nr, N_HIGH_MEMORY)
  			if (counters[nr])
  				seq_printf(m, " N%u=%u", nr, counters[nr]);
  	}
  }
dd3b8353b   Uladzislau Rezki (Sony)   mm/vmalloc: do no...
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
  /*
   * Print one "unpurged vm_area" line (start, end, size in bytes) for every
   * vmap_area currently linked on vmap_purge_list.
   *
   * The list head is sampled once with READ_ONCE(); an empty list produces
   * no output.
   */
  static void show_purge_info(struct seq_file *m)
  {
  	struct llist_node *head;
  	struct vmap_area *va;

  	head = READ_ONCE(vmap_purge_list.first);
  	if (!head)
  		return;

  	llist_for_each_entry(va, head, purge_list) {
  		seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
  			(void *)va->va_start, (void *)va->va_end,
  			va->va_end - va->va_start);
  	}
  }
a10aa5798   Christoph Lameter   vmalloc: show vma...
3351
3352
  static int s_show(struct seq_file *m, void *p)
  {
3f5000693   zijun_hu   mm/vmalloc.c: sim...
3353
  	struct vmap_area *va;
d4033afdf   Joonsoo Kim   mm, vmalloc: iter...
3354
  	struct vm_struct *v;
3f5000693   zijun_hu   mm/vmalloc.c: sim...
3355
  	va = list_entry(p, struct vmap_area, list);
c2ce8c142   Wanpeng Li   mm/vmalloc: fix s...
3356
  	/*
688fcbfc0   Pengfei Li   mm/vmalloc: modif...
3357
3358
  	 * s_show can encounter race with remove_vm_area, !vm on behalf
  	 * of vmap area is being tear down or vm_map_ram allocation.
c2ce8c142   Wanpeng Li   mm/vmalloc: fix s...
3359
  	 */
688fcbfc0   Pengfei Li   mm/vmalloc: modif...
3360
  	if (!va->vm) {
dd3b8353b   Uladzislau Rezki (Sony)   mm/vmalloc: do no...
3361
3362
  		seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram
  ",
78c72746f   Yisheng Xie   vmalloc: show laz...
3363
  			(void *)va->va_start, (void *)va->va_end,
dd3b8353b   Uladzislau Rezki (Sony)   mm/vmalloc: do no...
3364
  			va->va_end - va->va_start);
78c72746f   Yisheng Xie   vmalloc: show laz...
3365

d4033afdf   Joonsoo Kim   mm, vmalloc: iter...
3366
  		return 0;
78c72746f   Yisheng Xie   vmalloc: show laz...
3367
  	}
d4033afdf   Joonsoo Kim   mm, vmalloc: iter...
3368
3369
  
  	v = va->vm;
a10aa5798   Christoph Lameter   vmalloc: show vma...
3370

45ec16908   Kees Cook   mm: use %pK for /...
3371
  	seq_printf(m, "0x%pK-0x%pK %7ld",
a10aa5798   Christoph Lameter   vmalloc: show vma...
3372
  		v->addr, v->addr + v->size, v->size);
62c70bce8   Joe Perches   mm: convert sprin...
3373
3374
  	if (v->caller)
  		seq_printf(m, " %pS", v->caller);
230169693   Christoph Lameter   vmallocinfo: add ...
3375

a10aa5798   Christoph Lameter   vmalloc: show vma...
3376
3377
3378
3379
  	if (v->nr_pages)
  		seq_printf(m, " pages=%d", v->nr_pages);
  
  	if (v->phys_addr)
199eaa05a   Miles Chen   mm: cleanups for ...
3380
  		seq_printf(m, " phys=%pa", &v->phys_addr);
a10aa5798   Christoph Lameter   vmalloc: show vma...
3381
3382
  
  	if (v->flags & VM_IOREMAP)
f4527c908   Fabian Frederick   mm/vmalloc.c: rep...
3383
  		seq_puts(m, " ioremap");
a10aa5798   Christoph Lameter   vmalloc: show vma...
3384
3385
  
  	if (v->flags & VM_ALLOC)
f4527c908   Fabian Frederick   mm/vmalloc.c: rep...
3386
  		seq_puts(m, " vmalloc");
a10aa5798   Christoph Lameter   vmalloc: show vma...
3387
3388
  
  	if (v->flags & VM_MAP)
f4527c908   Fabian Frederick   mm/vmalloc.c: rep...
3389
  		seq_puts(m, " vmap");
a10aa5798   Christoph Lameter   vmalloc: show vma...
3390
3391
  
  	if (v->flags & VM_USERMAP)
f4527c908   Fabian Frederick   mm/vmalloc.c: rep...
3392
  		seq_puts(m, " user");
a10aa5798   Christoph Lameter   vmalloc: show vma...
3393

fe9041c24   Christoph Hellwig   vmalloc: lift the...
3394
3395
  	if (v->flags & VM_DMA_COHERENT)
  		seq_puts(m, " dma-coherent");
244d63ee3   David Rientjes   mm, vmalloc: remo...
3396
  	if (is_vmalloc_addr(v->pages))
f4527c908   Fabian Frederick   mm/vmalloc.c: rep...
3397
  		seq_puts(m, " vpages");
a10aa5798   Christoph Lameter   vmalloc: show vma...
3398

a47a126ad   Eric Dumazet   vmallocinfo: add ...
3399
  	show_numa_info(m, v);
a10aa5798   Christoph Lameter   vmalloc: show vma...
3400
3401
  	seq_putc(m, '
  ');
dd3b8353b   Uladzislau Rezki (Sony)   mm/vmalloc: do no...
3402
3403
3404
3405
3406
3407
3408
3409
3410
  
  	/*
  	 * As a final step, dump "unpurged" areas. Note,
  	 * that entire "/proc/vmallocinfo" output will not
  	 * be address sorted, because the purge list is not
  	 * sorted.
  	 */
  	if (list_is_last(&va->list, &vmap_area_list))
  		show_purge_info(m);
a10aa5798   Christoph Lameter   vmalloc: show vma...
3411
3412
  	return 0;
  }
5f6a6a9c4   Alexey Dobriyan   proc: move /proc/...
3413
  static const struct seq_operations vmalloc_op = {
a10aa5798   Christoph Lameter   vmalloc: show vma...
3414
3415
3416
3417
3418
  	.start = s_start,
  	.next = s_next,
  	.stop = s_stop,
  	.show = s_show,
  };
5f6a6a9c4   Alexey Dobriyan   proc: move /proc/...
3419

5f6a6a9c4   Alexey Dobriyan   proc: move /proc/...
3420
3421
  /*
   * Register the "vmallocinfo" proc entry with mode 0400 and vmalloc_op as
   * its seq_file operations.  With CONFIG_NUMA the private variant is used,
   * sized for nr_node_ids unsigned ints.  The result of the registration
   * call is not checked; 0 is returned unconditionally.
   */
  static int __init proc_vmalloc_init(void)
  {
  	if (!IS_ENABLED(CONFIG_NUMA)) {
  		proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
  		return 0;
  	}

  	proc_create_seq_private("vmallocinfo", 0400, NULL, &vmalloc_op,
  				nr_node_ids * sizeof(unsigned int), NULL);
  	return 0;
  }
  module_init(proc_vmalloc_init);
db3808c1b   Joonsoo Kim   mm, vmalloc: move...
3431

a10aa5798   Christoph Lameter   vmalloc: show vma...
3432
  #endif