Commit e1ca7788dec6773b1a2bce51b7141948f2b8bccf

Authored by Dave Young
Committed by Linus Torvalds
1 parent 7bbc0905ea

mm: add vzalloc() and vzalloc_node() helpers

Add vzalloc() and vzalloc_node() to encapsulate the
vmalloc-then-memset-zero operation.

Use __GFP_ZERO to zero-fill the allocated memory.

Signed-off-by: Dave Young <hidave.darkstar@gmail.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Acked-by: Greg Ungerer <gerg@snapgear.com>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 94 additions and 3 deletions
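
For callers, the two helpers collapse the usual allocate-then-clear pair into a single call. The fragment below is an illustrative sketch only (the buffer and size names are made up, not taken from this commit) showing the conversion the helpers enable:

	/* Before: open-coded vmalloc() followed by memset() to clear it. */
	buf = vmalloc(nr_entries * sizeof(*buf));
	if (!buf)
		return -ENOMEM;
	memset(buf, 0, nr_entries * sizeof(*buf));

	/* After: one call; zeroing comes from __GFP_ZERO inside vzalloc(). */
	buf = vzalloc(nr_entries * sizeof(*buf));
	if (!buf)
		return -ENOMEM;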

include/linux/vmalloc.h
1 #ifndef _LINUX_VMALLOC_H 1 #ifndef _LINUX_VMALLOC_H
2 #define _LINUX_VMALLOC_H 2 #define _LINUX_VMALLOC_H
3 3
4 #include <linux/spinlock.h> 4 #include <linux/spinlock.h>
5 #include <linux/init.h> 5 #include <linux/init.h>
6 #include <asm/page.h> /* pgprot_t */ 6 #include <asm/page.h> /* pgprot_t */
7 7
8 struct vm_area_struct; /* vma defining user mapping in mm_types.h */ 8 struct vm_area_struct; /* vma defining user mapping in mm_types.h */
9 9
10 extern bool vmap_lazy_unmap; 10 extern bool vmap_lazy_unmap;
11 11
12 /* bits in flags of vmalloc's vm_struct below */ 12 /* bits in flags of vmalloc's vm_struct below */
13 #define VM_IOREMAP 0x00000001 /* ioremap() and friends */ 13 #define VM_IOREMAP 0x00000001 /* ioremap() and friends */
14 #define VM_ALLOC 0x00000002 /* vmalloc() */ 14 #define VM_ALLOC 0x00000002 /* vmalloc() */
15 #define VM_MAP 0x00000004 /* vmap()ed pages */ 15 #define VM_MAP 0x00000004 /* vmap()ed pages */
16 #define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */ 16 #define VM_USERMAP 0x00000008 /* suitable for remap_vmalloc_range */
17 #define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */ 17 #define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */
18 /* bits [20..32] reserved for arch specific ioremap internals */ 18 /* bits [20..32] reserved for arch specific ioremap internals */
19 19
20 /* 20 /*
21 * Maximum alignment for ioremap() regions. 21 * Maximum alignment for ioremap() regions.
22 * Can be overriden by arch-specific value. 22 * Can be overriden by arch-specific value.
23 */ 23 */
24 #ifndef IOREMAP_MAX_ORDER 24 #ifndef IOREMAP_MAX_ORDER
25 #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */ 25 #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT) /* 128 pages */
26 #endif 26 #endif
27 27
28 struct vm_struct { 28 struct vm_struct {
29 struct vm_struct *next; 29 struct vm_struct *next;
30 void *addr; 30 void *addr;
31 unsigned long size; 31 unsigned long size;
32 unsigned long flags; 32 unsigned long flags;
33 struct page **pages; 33 struct page **pages;
34 unsigned int nr_pages; 34 unsigned int nr_pages;
35 phys_addr_t phys_addr; 35 phys_addr_t phys_addr;
36 void *caller; 36 void *caller;
37 }; 37 };
38 38
39 /* 39 /*
40 * Highlevel APIs for driver use 40 * Highlevel APIs for driver use
41 */ 41 */
42 extern void vm_unmap_ram(const void *mem, unsigned int count); 42 extern void vm_unmap_ram(const void *mem, unsigned int count);
43 extern void *vm_map_ram(struct page **pages, unsigned int count, 43 extern void *vm_map_ram(struct page **pages, unsigned int count,
44 int node, pgprot_t prot); 44 int node, pgprot_t prot);
45 extern void vm_unmap_aliases(void); 45 extern void vm_unmap_aliases(void);
46 46
47 #ifdef CONFIG_MMU 47 #ifdef CONFIG_MMU
48 extern void __init vmalloc_init(void); 48 extern void __init vmalloc_init(void);
49 #else 49 #else
50 static inline void vmalloc_init(void) 50 static inline void vmalloc_init(void)
51 { 51 {
52 } 52 }
53 #endif 53 #endif
54 54
55 extern void *vmalloc(unsigned long size); 55 extern void *vmalloc(unsigned long size);
56 extern void *vzalloc(unsigned long size);
56 extern void *vmalloc_user(unsigned long size); 57 extern void *vmalloc_user(unsigned long size);
57 extern void *vmalloc_node(unsigned long size, int node); 58 extern void *vmalloc_node(unsigned long size, int node);
59 extern void *vzalloc_node(unsigned long size, int node);
58 extern void *vmalloc_exec(unsigned long size); 60 extern void *vmalloc_exec(unsigned long size);
59 extern void *vmalloc_32(unsigned long size); 61 extern void *vmalloc_32(unsigned long size);
60 extern void *vmalloc_32_user(unsigned long size); 62 extern void *vmalloc_32_user(unsigned long size);
61 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); 63 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
62 extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, 64 extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask,
63 pgprot_t prot); 65 pgprot_t prot);
64 extern void vfree(const void *addr); 66 extern void vfree(const void *addr);
65 67
66 extern void *vmap(struct page **pages, unsigned int count, 68 extern void *vmap(struct page **pages, unsigned int count,
67 unsigned long flags, pgprot_t prot); 69 unsigned long flags, pgprot_t prot);
68 extern void vunmap(const void *addr); 70 extern void vunmap(const void *addr);
69 71
70 extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 72 extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
71 unsigned long pgoff); 73 unsigned long pgoff);
72 void vmalloc_sync_all(void); 74 void vmalloc_sync_all(void);
73 75
74 /* 76 /*
75 * Lowlevel-APIs (not for driver use!) 77 * Lowlevel-APIs (not for driver use!)
76 */ 78 */
77 79
78 static inline size_t get_vm_area_size(const struct vm_struct *area) 80 static inline size_t get_vm_area_size(const struct vm_struct *area)
79 { 81 {
80 /* return actual size without guard page */ 82 /* return actual size without guard page */
81 return area->size - PAGE_SIZE; 83 return area->size - PAGE_SIZE;
82 } 84 }
83 85
84 extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); 86 extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
85 extern struct vm_struct *get_vm_area_caller(unsigned long size, 87 extern struct vm_struct *get_vm_area_caller(unsigned long size,
86 unsigned long flags, void *caller); 88 unsigned long flags, void *caller);
87 extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 89 extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
88 unsigned long start, unsigned long end); 90 unsigned long start, unsigned long end);
89 extern struct vm_struct *__get_vm_area_caller(unsigned long size, 91 extern struct vm_struct *__get_vm_area_caller(unsigned long size,
90 unsigned long flags, 92 unsigned long flags,
91 unsigned long start, unsigned long end, 93 unsigned long start, unsigned long end,
92 void *caller); 94 void *caller);
93 extern struct vm_struct *get_vm_area_node(unsigned long size, 95 extern struct vm_struct *get_vm_area_node(unsigned long size,
94 unsigned long flags, int node, 96 unsigned long flags, int node,
95 gfp_t gfp_mask); 97 gfp_t gfp_mask);
96 extern struct vm_struct *remove_vm_area(const void *addr); 98 extern struct vm_struct *remove_vm_area(const void *addr);
97 99
98 extern int map_vm_area(struct vm_struct *area, pgprot_t prot, 100 extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
99 struct page ***pages); 101 struct page ***pages);
100 extern int map_kernel_range_noflush(unsigned long start, unsigned long size, 102 extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
101 pgprot_t prot, struct page **pages); 103 pgprot_t prot, struct page **pages);
102 extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); 104 extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
103 extern void unmap_kernel_range(unsigned long addr, unsigned long size); 105 extern void unmap_kernel_range(unsigned long addr, unsigned long size);
104 106
105 /* Allocate/destroy a 'vmalloc' VM area. */ 107 /* Allocate/destroy a 'vmalloc' VM area. */
106 extern struct vm_struct *alloc_vm_area(size_t size); 108 extern struct vm_struct *alloc_vm_area(size_t size);
107 extern void free_vm_area(struct vm_struct *area); 109 extern void free_vm_area(struct vm_struct *area);
108 110
109 /* for /dev/kmem */ 111 /* for /dev/kmem */
110 extern long vread(char *buf, char *addr, unsigned long count); 112 extern long vread(char *buf, char *addr, unsigned long count);
111 extern long vwrite(char *buf, char *addr, unsigned long count); 113 extern long vwrite(char *buf, char *addr, unsigned long count);
112 114
113 /* 115 /*
114 * Internals. Dont't use.. 116 * Internals. Dont't use..
115 */ 117 */
116 extern rwlock_t vmlist_lock; 118 extern rwlock_t vmlist_lock;
117 extern struct vm_struct *vmlist; 119 extern struct vm_struct *vmlist;
118 extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); 120 extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
119 121
120 #ifdef CONFIG_SMP 122 #ifdef CONFIG_SMP
121 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 123 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
122 const size_t *sizes, int nr_vms, 124 const size_t *sizes, int nr_vms,
123 size_t align, gfp_t gfp_mask); 125 size_t align, gfp_t gfp_mask);
124 126
125 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); 127 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms);
126 #endif 128 #endif
127 129
128 #endif /* _LINUX_VMALLOC_H */ 130 #endif /* _LINUX_VMALLOC_H */
129 131
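The third changed file, mm/vmalloc.c, is not reproduced in this listing. Conceptually, the MMU-side vzalloc() amounts to the sketch below, mirroring the nommu version further down by folding __GFP_ZERO into the usual gfp mask; the real patch may route through an internal node-aware helper whose name varies between kernel versions, so treat this as an approximation rather than the literal hunk:

	/* Sketch only -- not the literal mm/vmalloc.c hunk from this commit. */
	void *vzalloc(unsigned long size)
	{
		return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
				 PAGE_KERNEL);
	}

	/*
	 * vzalloc_node() takes the same approach on the node-aware allocation
	 * path; the internal helper it calls is version-specific, so it is
	 * not guessed at here.
	 */
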
mm/nommu.c
1 /* 1 /*
2 * linux/mm/nommu.c 2 * linux/mm/nommu.c
3 * 3 *
4 * Replacement code for mm functions to support CPU's that don't 4 * Replacement code for mm functions to support CPU's that don't
5 * have any form of memory management unit (thus no virtual memory). 5 * have any form of memory management unit (thus no virtual memory).
6 * 6 *
7 * See Documentation/nommu-mmap.txt 7 * See Documentation/nommu-mmap.txt
8 * 8 *
9 * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com> 9 * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> 10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> 11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> 12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
13 * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org> 13 * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org>
14 */ 14 */
15 15
16 #include <linux/module.h> 16 #include <linux/module.h>
17 #include <linux/mm.h> 17 #include <linux/mm.h>
18 #include <linux/mman.h> 18 #include <linux/mman.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/file.h> 20 #include <linux/file.h>
21 #include <linux/highmem.h> 21 #include <linux/highmem.h>
22 #include <linux/pagemap.h> 22 #include <linux/pagemap.h>
23 #include <linux/slab.h> 23 #include <linux/slab.h>
24 #include <linux/vmalloc.h> 24 #include <linux/vmalloc.h>
25 #include <linux/tracehook.h> 25 #include <linux/tracehook.h>
26 #include <linux/blkdev.h> 26 #include <linux/blkdev.h>
27 #include <linux/backing-dev.h> 27 #include <linux/backing-dev.h>
28 #include <linux/mount.h> 28 #include <linux/mount.h>
29 #include <linux/personality.h> 29 #include <linux/personality.h>
30 #include <linux/security.h> 30 #include <linux/security.h>
31 #include <linux/syscalls.h> 31 #include <linux/syscalls.h>
32 32
33 #include <asm/uaccess.h> 33 #include <asm/uaccess.h>
34 #include <asm/tlb.h> 34 #include <asm/tlb.h>
35 #include <asm/tlbflush.h> 35 #include <asm/tlbflush.h>
36 #include <asm/mmu_context.h> 36 #include <asm/mmu_context.h>
37 #include "internal.h" 37 #include "internal.h"
38 38
39 #if 0 39 #if 0
40 #define kenter(FMT, ...) \ 40 #define kenter(FMT, ...) \
41 printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) 41 printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
42 #define kleave(FMT, ...) \ 42 #define kleave(FMT, ...) \
43 printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) 43 printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
44 #define kdebug(FMT, ...) \ 44 #define kdebug(FMT, ...) \
45 printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) 45 printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__)
46 #else 46 #else
47 #define kenter(FMT, ...) \ 47 #define kenter(FMT, ...) \
48 no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) 48 no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
49 #define kleave(FMT, ...) \ 49 #define kleave(FMT, ...) \
50 no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) 50 no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
51 #define kdebug(FMT, ...) \ 51 #define kdebug(FMT, ...) \
52 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) 52 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
53 #endif 53 #endif
54 54
55 void *high_memory; 55 void *high_memory;
56 struct page *mem_map; 56 struct page *mem_map;
57 unsigned long max_mapnr; 57 unsigned long max_mapnr;
58 unsigned long num_physpages; 58 unsigned long num_physpages;
59 unsigned long highest_memmap_pfn; 59 unsigned long highest_memmap_pfn;
60 struct percpu_counter vm_committed_as; 60 struct percpu_counter vm_committed_as;
61 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 61 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
62 int sysctl_overcommit_ratio = 50; /* default is 50% */ 62 int sysctl_overcommit_ratio = 50; /* default is 50% */
63 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 63 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
64 int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; 64 int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
65 int heap_stack_gap = 0; 65 int heap_stack_gap = 0;
66 66
67 atomic_long_t mmap_pages_allocated; 67 atomic_long_t mmap_pages_allocated;
68 68
69 EXPORT_SYMBOL(mem_map); 69 EXPORT_SYMBOL(mem_map);
70 EXPORT_SYMBOL(num_physpages); 70 EXPORT_SYMBOL(num_physpages);
71 71
72 /* list of mapped, potentially shareable regions */ 72 /* list of mapped, potentially shareable regions */
73 static struct kmem_cache *vm_region_jar; 73 static struct kmem_cache *vm_region_jar;
74 struct rb_root nommu_region_tree = RB_ROOT; 74 struct rb_root nommu_region_tree = RB_ROOT;
75 DECLARE_RWSEM(nommu_region_sem); 75 DECLARE_RWSEM(nommu_region_sem);
76 76
77 const struct vm_operations_struct generic_file_vm_ops = { 77 const struct vm_operations_struct generic_file_vm_ops = {
78 }; 78 };
79 79
80 /* 80 /*
81 * Return the total memory allocated for this pointer, not 81 * Return the total memory allocated for this pointer, not
82 * just what the caller asked for. 82 * just what the caller asked for.
83 * 83 *
84 * Doesn't have to be accurate, i.e. may have races. 84 * Doesn't have to be accurate, i.e. may have races.
85 */ 85 */
86 unsigned int kobjsize(const void *objp) 86 unsigned int kobjsize(const void *objp)
87 { 87 {
88 struct page *page; 88 struct page *page;
89 89
90 /* 90 /*
91 * If the object we have should not have ksize performed on it, 91 * If the object we have should not have ksize performed on it,
92 * return size of 0 92 * return size of 0
93 */ 93 */
94 if (!objp || !virt_addr_valid(objp)) 94 if (!objp || !virt_addr_valid(objp))
95 return 0; 95 return 0;
96 96
97 page = virt_to_head_page(objp); 97 page = virt_to_head_page(objp);
98 98
99 /* 99 /*
100 * If the allocator sets PageSlab, we know the pointer came from 100 * If the allocator sets PageSlab, we know the pointer came from
101 * kmalloc(). 101 * kmalloc().
102 */ 102 */
103 if (PageSlab(page)) 103 if (PageSlab(page))
104 return ksize(objp); 104 return ksize(objp);
105 105
106 /* 106 /*
107 * If it's not a compound page, see if we have a matching VMA 107 * If it's not a compound page, see if we have a matching VMA
108 * region. This test is intentionally done in reverse order, 108 * region. This test is intentionally done in reverse order,
109 * so if there's no VMA, we still fall through and hand back 109 * so if there's no VMA, we still fall through and hand back
110 * PAGE_SIZE for 0-order pages. 110 * PAGE_SIZE for 0-order pages.
111 */ 111 */
112 if (!PageCompound(page)) { 112 if (!PageCompound(page)) {
113 struct vm_area_struct *vma; 113 struct vm_area_struct *vma;
114 114
115 vma = find_vma(current->mm, (unsigned long)objp); 115 vma = find_vma(current->mm, (unsigned long)objp);
116 if (vma) 116 if (vma)
117 return vma->vm_end - vma->vm_start; 117 return vma->vm_end - vma->vm_start;
118 } 118 }
119 119
120 /* 120 /*
121 * The ksize() function is only guaranteed to work for pointers 121 * The ksize() function is only guaranteed to work for pointers
122 * returned by kmalloc(). So handle arbitrary pointers here. 122 * returned by kmalloc(). So handle arbitrary pointers here.
123 */ 123 */
124 return PAGE_SIZE << compound_order(page); 124 return PAGE_SIZE << compound_order(page);
125 } 125 }
126 126
127 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 127 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
128 unsigned long start, int nr_pages, unsigned int foll_flags, 128 unsigned long start, int nr_pages, unsigned int foll_flags,
129 struct page **pages, struct vm_area_struct **vmas) 129 struct page **pages, struct vm_area_struct **vmas)
130 { 130 {
131 struct vm_area_struct *vma; 131 struct vm_area_struct *vma;
132 unsigned long vm_flags; 132 unsigned long vm_flags;
133 int i; 133 int i;
134 134
135 /* calculate required read or write permissions. 135 /* calculate required read or write permissions.
136 * If FOLL_FORCE is set, we only require the "MAY" flags. 136 * If FOLL_FORCE is set, we only require the "MAY" flags.
137 */ 137 */
138 vm_flags = (foll_flags & FOLL_WRITE) ? 138 vm_flags = (foll_flags & FOLL_WRITE) ?
139 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 139 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
140 vm_flags &= (foll_flags & FOLL_FORCE) ? 140 vm_flags &= (foll_flags & FOLL_FORCE) ?
141 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 141 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
142 142
143 for (i = 0; i < nr_pages; i++) { 143 for (i = 0; i < nr_pages; i++) {
144 vma = find_vma(mm, start); 144 vma = find_vma(mm, start);
145 if (!vma) 145 if (!vma)
146 goto finish_or_fault; 146 goto finish_or_fault;
147 147
148 /* protect what we can, including chardevs */ 148 /* protect what we can, including chardevs */
149 if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || 149 if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
150 !(vm_flags & vma->vm_flags)) 150 !(vm_flags & vma->vm_flags))
151 goto finish_or_fault; 151 goto finish_or_fault;
152 152
153 if (pages) { 153 if (pages) {
154 pages[i] = virt_to_page(start); 154 pages[i] = virt_to_page(start);
155 if (pages[i]) 155 if (pages[i])
156 page_cache_get(pages[i]); 156 page_cache_get(pages[i]);
157 } 157 }
158 if (vmas) 158 if (vmas)
159 vmas[i] = vma; 159 vmas[i] = vma;
160 start = (start + PAGE_SIZE) & PAGE_MASK; 160 start = (start + PAGE_SIZE) & PAGE_MASK;
161 } 161 }
162 162
163 return i; 163 return i;
164 164
165 finish_or_fault: 165 finish_or_fault:
166 return i ? : -EFAULT; 166 return i ? : -EFAULT;
167 } 167 }
168 168
169 /* 169 /*
170 * get a list of pages in an address range belonging to the specified process 170 * get a list of pages in an address range belonging to the specified process
171 * and indicate the VMA that covers each page 171 * and indicate the VMA that covers each page
172 * - this is potentially dodgy as we may end incrementing the page count of a 172 * - this is potentially dodgy as we may end incrementing the page count of a
173 * slab page or a secondary page from a compound page 173 * slab page or a secondary page from a compound page
174 * - don't permit access to VMAs that don't support it, such as I/O mappings 174 * - don't permit access to VMAs that don't support it, such as I/O mappings
175 */ 175 */
176 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 176 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
177 unsigned long start, int nr_pages, int write, int force, 177 unsigned long start, int nr_pages, int write, int force,
178 struct page **pages, struct vm_area_struct **vmas) 178 struct page **pages, struct vm_area_struct **vmas)
179 { 179 {
180 int flags = 0; 180 int flags = 0;
181 181
182 if (write) 182 if (write)
183 flags |= FOLL_WRITE; 183 flags |= FOLL_WRITE;
184 if (force) 184 if (force)
185 flags |= FOLL_FORCE; 185 flags |= FOLL_FORCE;
186 186
187 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 187 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
188 } 188 }
189 EXPORT_SYMBOL(get_user_pages); 189 EXPORT_SYMBOL(get_user_pages);
190 190
191 /** 191 /**
192 * follow_pfn - look up PFN at a user virtual address 192 * follow_pfn - look up PFN at a user virtual address
193 * @vma: memory mapping 193 * @vma: memory mapping
194 * @address: user virtual address 194 * @address: user virtual address
195 * @pfn: location to store found PFN 195 * @pfn: location to store found PFN
196 * 196 *
197 * Only IO mappings and raw PFN mappings are allowed. 197 * Only IO mappings and raw PFN mappings are allowed.
198 * 198 *
199 * Returns zero and the pfn at @pfn on success, -ve otherwise. 199 * Returns zero and the pfn at @pfn on success, -ve otherwise.
200 */ 200 */
201 int follow_pfn(struct vm_area_struct *vma, unsigned long address, 201 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
202 unsigned long *pfn) 202 unsigned long *pfn)
203 { 203 {
204 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 204 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
205 return -EINVAL; 205 return -EINVAL;
206 206
207 *pfn = address >> PAGE_SHIFT; 207 *pfn = address >> PAGE_SHIFT;
208 return 0; 208 return 0;
209 } 209 }
210 EXPORT_SYMBOL(follow_pfn); 210 EXPORT_SYMBOL(follow_pfn);
211 211
212 DEFINE_RWLOCK(vmlist_lock); 212 DEFINE_RWLOCK(vmlist_lock);
213 struct vm_struct *vmlist; 213 struct vm_struct *vmlist;
214 214
215 void vfree(const void *addr) 215 void vfree(const void *addr)
216 { 216 {
217 kfree(addr); 217 kfree(addr);
218 } 218 }
219 EXPORT_SYMBOL(vfree); 219 EXPORT_SYMBOL(vfree);
220 220
221 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 221 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
222 { 222 {
223 /* 223 /*
224 * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() 224 * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
225 * returns only a logical address. 225 * returns only a logical address.
226 */ 226 */
227 return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); 227 return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
228 } 228 }
229 EXPORT_SYMBOL(__vmalloc); 229 EXPORT_SYMBOL(__vmalloc);
230 230
231 void *vmalloc_user(unsigned long size) 231 void *vmalloc_user(unsigned long size)
232 { 232 {
233 void *ret; 233 void *ret;
234 234
235 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 235 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
236 PAGE_KERNEL); 236 PAGE_KERNEL);
237 if (ret) { 237 if (ret) {
238 struct vm_area_struct *vma; 238 struct vm_area_struct *vma;
239 239
240 down_write(&current->mm->mmap_sem); 240 down_write(&current->mm->mmap_sem);
241 vma = find_vma(current->mm, (unsigned long)ret); 241 vma = find_vma(current->mm, (unsigned long)ret);
242 if (vma) 242 if (vma)
243 vma->vm_flags |= VM_USERMAP; 243 vma->vm_flags |= VM_USERMAP;
244 up_write(&current->mm->mmap_sem); 244 up_write(&current->mm->mmap_sem);
245 } 245 }
246 246
247 return ret; 247 return ret;
248 } 248 }
249 EXPORT_SYMBOL(vmalloc_user); 249 EXPORT_SYMBOL(vmalloc_user);
250 250
251 struct page *vmalloc_to_page(const void *addr) 251 struct page *vmalloc_to_page(const void *addr)
252 { 252 {
253 return virt_to_page(addr); 253 return virt_to_page(addr);
254 } 254 }
255 EXPORT_SYMBOL(vmalloc_to_page); 255 EXPORT_SYMBOL(vmalloc_to_page);
256 256
257 unsigned long vmalloc_to_pfn(const void *addr) 257 unsigned long vmalloc_to_pfn(const void *addr)
258 { 258 {
259 return page_to_pfn(virt_to_page(addr)); 259 return page_to_pfn(virt_to_page(addr));
260 } 260 }
261 EXPORT_SYMBOL(vmalloc_to_pfn); 261 EXPORT_SYMBOL(vmalloc_to_pfn);
262 262
263 long vread(char *buf, char *addr, unsigned long count) 263 long vread(char *buf, char *addr, unsigned long count)
264 { 264 {
265 memcpy(buf, addr, count); 265 memcpy(buf, addr, count);
266 return count; 266 return count;
267 } 267 }
268 268
269 long vwrite(char *buf, char *addr, unsigned long count) 269 long vwrite(char *buf, char *addr, unsigned long count)
270 { 270 {
271 /* Don't allow overflow */ 271 /* Don't allow overflow */
272 if ((unsigned long) addr + count < count) 272 if ((unsigned long) addr + count < count)
273 count = -(unsigned long) addr; 273 count = -(unsigned long) addr;
274 274
275 memcpy(addr, buf, count); 275 memcpy(addr, buf, count);
276 return(count); 276 return(count);
277 } 277 }
278 278
279 /* 279 /*
280 * vmalloc - allocate virtually continguos memory 280 * vmalloc - allocate virtually continguos memory
281 * 281 *
282 * @size: allocation size 282 * @size: allocation size
283 * 283 *
284 * Allocate enough pages to cover @size from the page level 284 * Allocate enough pages to cover @size from the page level
285 * allocator and map them into continguos kernel virtual space. 285 * allocator and map them into continguos kernel virtual space.
286 * 286 *
287 * For tight control over page level allocator and protection flags 287 * For tight control over page level allocator and protection flags
288 * use __vmalloc() instead. 288 * use __vmalloc() instead.
289 */ 289 */
290 void *vmalloc(unsigned long size) 290 void *vmalloc(unsigned long size)
291 { 291 {
292 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); 292 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
293 } 293 }
294 EXPORT_SYMBOL(vmalloc); 294 EXPORT_SYMBOL(vmalloc);
295 295
296 /*
297 * vzalloc - allocate virtually continguos memory with zero fill
298 *
299 * @size: allocation size
300 *
301 * Allocate enough pages to cover @size from the page level
302 * allocator and map them into continguos kernel virtual space.
303 * The memory allocated is set to zero.
304 *
305 * For tight control over page level allocator and protection flags
306 * use __vmalloc() instead.
307 */
308 void *vzalloc(unsigned long size)
309 {
310 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
311 PAGE_KERNEL);
312 }
313 EXPORT_SYMBOL(vzalloc);
314
315 /**
316 * vmalloc_node - allocate memory on a specific node
317 * @size: allocation size
318 * @node: numa node
319 *
320 * Allocate enough pages to cover @size from the page level
321 * allocator and map them into contiguous kernel virtual space.
322 *
323 * For tight control over page level allocator and protection flags
324 * use __vmalloc() instead.
325 */
296 void *vmalloc_node(unsigned long size, int node) 326 void *vmalloc_node(unsigned long size, int node)
297 { 327 {
298 return vmalloc(size); 328 return vmalloc(size);
299 } 329 }
300 EXPORT_SYMBOL(vmalloc_node); 330
331 /**
332 * vzalloc_node - allocate memory on a specific node with zero fill
333 * @size: allocation size
334 * @node: numa node
335 *
336 * Allocate enough pages to cover @size from the page level
337 * allocator and map them into contiguous kernel virtual space.
338 * The memory allocated is set to zero.
339 *
340 * For tight control over page level allocator and protection flags
341 * use __vmalloc() instead.
342 */
343 void *vzalloc_node(unsigned long size, int node)
344 {
345 return vzalloc(size);
346 }
347 EXPORT_SYMBOL(vzalloc_node);
301 348
302 #ifndef PAGE_KERNEL_EXEC 349 #ifndef PAGE_KERNEL_EXEC
303 # define PAGE_KERNEL_EXEC PAGE_KERNEL 350 # define PAGE_KERNEL_EXEC PAGE_KERNEL
304 #endif 351 #endif
305 352
306 /** 353 /**
307 * vmalloc_exec - allocate virtually contiguous, executable memory 354 * vmalloc_exec - allocate virtually contiguous, executable memory
308 * @size: allocation size 355 * @size: allocation size
309 * 356 *
310 * Kernel-internal function to allocate enough pages to cover @size 357 * Kernel-internal function to allocate enough pages to cover @size
311 * the page level allocator and map them into contiguous and 358 * the page level allocator and map them into contiguous and
312 * executable kernel virtual space. 359 * executable kernel virtual space.
313 * 360 *
314 * For tight control over page level allocator and protection flags 361 * For tight control over page level allocator and protection flags
315 * use __vmalloc() instead. 362 * use __vmalloc() instead.
316 */ 363 */
317 364
318 void *vmalloc_exec(unsigned long size) 365 void *vmalloc_exec(unsigned long size)
319 { 366 {
320 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); 367 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
321 } 368 }
322 369
323 /** 370 /**
324 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 371 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
325 * @size: allocation size 372 * @size: allocation size
326 * 373 *
327 * Allocate enough 32bit PA addressable pages to cover @size from the 374 * Allocate enough 32bit PA addressable pages to cover @size from the
328 * page level allocator and map them into continguos kernel virtual space. 375 * page level allocator and map them into continguos kernel virtual space.
329 */ 376 */
330 void *vmalloc_32(unsigned long size) 377 void *vmalloc_32(unsigned long size)
331 { 378 {
332 return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); 379 return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
333 } 380 }
334 EXPORT_SYMBOL(vmalloc_32); 381 EXPORT_SYMBOL(vmalloc_32);
335 382
336 /** 383 /**
337 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory 384 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
338 * @size: allocation size 385 * @size: allocation size
339 * 386 *
340 * The resulting memory area is 32bit addressable and zeroed so it can be 387 * The resulting memory area is 32bit addressable and zeroed so it can be
341 * mapped to userspace without leaking data. 388 * mapped to userspace without leaking data.
342 * 389 *
343 * VM_USERMAP is set on the corresponding VMA so that subsequent calls to 390 * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
344 * remap_vmalloc_range() are permissible. 391 * remap_vmalloc_range() are permissible.
345 */ 392 */
346 void *vmalloc_32_user(unsigned long size) 393 void *vmalloc_32_user(unsigned long size)
347 { 394 {
348 /* 395 /*
349 * We'll have to sort out the ZONE_DMA bits for 64-bit, 396 * We'll have to sort out the ZONE_DMA bits for 64-bit,
350 * but for now this can simply use vmalloc_user() directly. 397 * but for now this can simply use vmalloc_user() directly.
351 */ 398 */
352 return vmalloc_user(size); 399 return vmalloc_user(size);
353 } 400 }
354 EXPORT_SYMBOL(vmalloc_32_user); 401 EXPORT_SYMBOL(vmalloc_32_user);
355 402
356 void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) 403 void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
357 { 404 {
358 BUG(); 405 BUG();
359 return NULL; 406 return NULL;
360 } 407 }
361 EXPORT_SYMBOL(vmap); 408 EXPORT_SYMBOL(vmap);
362 409
363 void vunmap(const void *addr) 410 void vunmap(const void *addr)
364 { 411 {
365 BUG(); 412 BUG();
366 } 413 }
367 EXPORT_SYMBOL(vunmap); 414 EXPORT_SYMBOL(vunmap);
368 415
369 void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) 416 void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
370 { 417 {
371 BUG(); 418 BUG();
372 return NULL; 419 return NULL;
373 } 420 }
374 EXPORT_SYMBOL(vm_map_ram); 421 EXPORT_SYMBOL(vm_map_ram);
375 422
376 void vm_unmap_ram(const void *mem, unsigned int count) 423 void vm_unmap_ram(const void *mem, unsigned int count)
377 { 424 {
378 BUG(); 425 BUG();
379 } 426 }
380 EXPORT_SYMBOL(vm_unmap_ram); 427 EXPORT_SYMBOL(vm_unmap_ram);
381 428
382 void vm_unmap_aliases(void) 429 void vm_unmap_aliases(void)
383 { 430 {
384 } 431 }
385 EXPORT_SYMBOL_GPL(vm_unmap_aliases); 432 EXPORT_SYMBOL_GPL(vm_unmap_aliases);
386 433
387 /* 434 /*
388 * Implement a stub for vmalloc_sync_all() if the architecture chose not to 435 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
389 * have one. 436 * have one.
390 */ 437 */
391 void __attribute__((weak)) vmalloc_sync_all(void) 438 void __attribute__((weak)) vmalloc_sync_all(void)
392 { 439 {
393 } 440 }
394 441
395 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 442 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
396 struct page *page) 443 struct page *page)
397 { 444 {
398 return -EINVAL; 445 return -EINVAL;
399 } 446 }
400 EXPORT_SYMBOL(vm_insert_page); 447 EXPORT_SYMBOL(vm_insert_page);
401 448
402 /* 449 /*
403 * sys_brk() for the most part doesn't need the global kernel 450 * sys_brk() for the most part doesn't need the global kernel
404 * lock, except when an application is doing something nasty 451 * lock, except when an application is doing something nasty
405 * like trying to un-brk an area that has already been mapped 452 * like trying to un-brk an area that has already been mapped
406 * to a regular file. in this case, the unmapping will need 453 * to a regular file. in this case, the unmapping will need
407 * to invoke file system routines that need the global lock. 454 * to invoke file system routines that need the global lock.
408 */ 455 */
409 SYSCALL_DEFINE1(brk, unsigned long, brk) 456 SYSCALL_DEFINE1(brk, unsigned long, brk)
410 { 457 {
411 struct mm_struct *mm = current->mm; 458 struct mm_struct *mm = current->mm;
412 459
413 if (brk < mm->start_brk || brk > mm->context.end_brk) 460 if (brk < mm->start_brk || brk > mm->context.end_brk)
414 return mm->brk; 461 return mm->brk;
415 462
416 if (mm->brk == brk) 463 if (mm->brk == brk)
417 return mm->brk; 464 return mm->brk;
418 465
419 /* 466 /*
420 * Always allow shrinking brk 467 * Always allow shrinking brk
421 */ 468 */
422 if (brk <= mm->brk) { 469 if (brk <= mm->brk) {
423 mm->brk = brk; 470 mm->brk = brk;
424 return brk; 471 return brk;
425 } 472 }
426 473
427 /* 474 /*
428 * Ok, looks good - let it rip. 475 * Ok, looks good - let it rip.
429 */ 476 */
430 flush_icache_range(mm->brk, brk); 477 flush_icache_range(mm->brk, brk);
431 return mm->brk = brk; 478 return mm->brk = brk;
432 } 479 }
433 480
434 /* 481 /*
435 * initialise the VMA and region record slabs 482 * initialise the VMA and region record slabs
436 */ 483 */
437 void __init mmap_init(void) 484 void __init mmap_init(void)
438 { 485 {
439 int ret; 486 int ret;
440 487
441 ret = percpu_counter_init(&vm_committed_as, 0); 488 ret = percpu_counter_init(&vm_committed_as, 0);
442 VM_BUG_ON(ret); 489 VM_BUG_ON(ret);
443 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); 490 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
444 } 491 }
445 492
446 /* 493 /*
447 * validate the region tree 494 * validate the region tree
448 * - the caller must hold the region lock 495 * - the caller must hold the region lock
449 */ 496 */
450 #ifdef CONFIG_DEBUG_NOMMU_REGIONS 497 #ifdef CONFIG_DEBUG_NOMMU_REGIONS
451 static noinline void validate_nommu_regions(void) 498 static noinline void validate_nommu_regions(void)
452 { 499 {
453 struct vm_region *region, *last; 500 struct vm_region *region, *last;
454 struct rb_node *p, *lastp; 501 struct rb_node *p, *lastp;
455 502
456 lastp = rb_first(&nommu_region_tree); 503 lastp = rb_first(&nommu_region_tree);
457 if (!lastp) 504 if (!lastp)
458 return; 505 return;
459 506
460 last = rb_entry(lastp, struct vm_region, vm_rb); 507 last = rb_entry(lastp, struct vm_region, vm_rb);
461 BUG_ON(unlikely(last->vm_end <= last->vm_start)); 508 BUG_ON(unlikely(last->vm_end <= last->vm_start));
462 BUG_ON(unlikely(last->vm_top < last->vm_end)); 509 BUG_ON(unlikely(last->vm_top < last->vm_end));
463 510
464 while ((p = rb_next(lastp))) { 511 while ((p = rb_next(lastp))) {
465 region = rb_entry(p, struct vm_region, vm_rb); 512 region = rb_entry(p, struct vm_region, vm_rb);
466 last = rb_entry(lastp, struct vm_region, vm_rb); 513 last = rb_entry(lastp, struct vm_region, vm_rb);
467 514
468 BUG_ON(unlikely(region->vm_end <= region->vm_start)); 515 BUG_ON(unlikely(region->vm_end <= region->vm_start));
469 BUG_ON(unlikely(region->vm_top < region->vm_end)); 516 BUG_ON(unlikely(region->vm_top < region->vm_end));
470 BUG_ON(unlikely(region->vm_start < last->vm_top)); 517 BUG_ON(unlikely(region->vm_start < last->vm_top));
471 518
472 lastp = p; 519 lastp = p;
473 } 520 }
474 } 521 }
475 #else 522 #else
476 static void validate_nommu_regions(void) 523 static void validate_nommu_regions(void)
477 { 524 {
478 } 525 }
479 #endif 526 #endif
480 527
481 /* 528 /*
482 * add a region into the global tree 529 * add a region into the global tree
483 */ 530 */
484 static void add_nommu_region(struct vm_region *region) 531 static void add_nommu_region(struct vm_region *region)
485 { 532 {
486 struct vm_region *pregion; 533 struct vm_region *pregion;
487 struct rb_node **p, *parent; 534 struct rb_node **p, *parent;
488 535
489 validate_nommu_regions(); 536 validate_nommu_regions();
490 537
491 parent = NULL; 538 parent = NULL;
492 p = &nommu_region_tree.rb_node; 539 p = &nommu_region_tree.rb_node;
493 while (*p) { 540 while (*p) {
494 parent = *p; 541 parent = *p;
495 pregion = rb_entry(parent, struct vm_region, vm_rb); 542 pregion = rb_entry(parent, struct vm_region, vm_rb);
496 if (region->vm_start < pregion->vm_start) 543 if (region->vm_start < pregion->vm_start)
497 p = &(*p)->rb_left; 544 p = &(*p)->rb_left;
498 else if (region->vm_start > pregion->vm_start) 545 else if (region->vm_start > pregion->vm_start)
499 p = &(*p)->rb_right; 546 p = &(*p)->rb_right;
500 else if (pregion == region) 547 else if (pregion == region)
501 return; 548 return;
502 else 549 else
503 BUG(); 550 BUG();
504 } 551 }
505 552
506 rb_link_node(&region->vm_rb, parent, p); 553 rb_link_node(&region->vm_rb, parent, p);
507 rb_insert_color(&region->vm_rb, &nommu_region_tree); 554 rb_insert_color(&region->vm_rb, &nommu_region_tree);
508 555
509 validate_nommu_regions(); 556 validate_nommu_regions();
510 } 557 }
511 558
512 /* 559 /*
513 * delete a region from the global tree 560 * delete a region from the global tree
514 */ 561 */
515 static void delete_nommu_region(struct vm_region *region) 562 static void delete_nommu_region(struct vm_region *region)
516 { 563 {
517 BUG_ON(!nommu_region_tree.rb_node); 564 BUG_ON(!nommu_region_tree.rb_node);
518 565
519 validate_nommu_regions(); 566 validate_nommu_regions();
520 rb_erase(&region->vm_rb, &nommu_region_tree); 567 rb_erase(&region->vm_rb, &nommu_region_tree);
521 validate_nommu_regions(); 568 validate_nommu_regions();
522 } 569 }
523 570
524 /* 571 /*
525 * free a contiguous series of pages 572 * free a contiguous series of pages
526 */ 573 */
527 static void free_page_series(unsigned long from, unsigned long to) 574 static void free_page_series(unsigned long from, unsigned long to)
528 { 575 {
529 for (; from < to; from += PAGE_SIZE) { 576 for (; from < to; from += PAGE_SIZE) {
530 struct page *page = virt_to_page(from); 577 struct page *page = virt_to_page(from);
531 578
532 kdebug("- free %lx", from); 579 kdebug("- free %lx", from);
533 atomic_long_dec(&mmap_pages_allocated); 580 atomic_long_dec(&mmap_pages_allocated);
534 if (page_count(page) != 1) 581 if (page_count(page) != 1)
535 kdebug("free page %p: refcount not one: %d", 582 kdebug("free page %p: refcount not one: %d",
536 page, page_count(page)); 583 page, page_count(page));
537 put_page(page); 584 put_page(page);
538 } 585 }
539 } 586 }
540 587
541 /* 588 /*
542 * release a reference to a region 589 * release a reference to a region
543 * - the caller must hold the region semaphore for writing, which this releases 590 * - the caller must hold the region semaphore for writing, which this releases
544 * - the region may not have been added to the tree yet, in which case vm_top 591 * - the region may not have been added to the tree yet, in which case vm_top
545 * will equal vm_start 592 * will equal vm_start
546 */ 593 */
547 static void __put_nommu_region(struct vm_region *region) 594 static void __put_nommu_region(struct vm_region *region)
548 __releases(nommu_region_sem) 595 __releases(nommu_region_sem)
549 { 596 {
550 kenter("%p{%d}", region, region->vm_usage); 597 kenter("%p{%d}", region, region->vm_usage);
551 598
552 BUG_ON(!nommu_region_tree.rb_node); 599 BUG_ON(!nommu_region_tree.rb_node);
553 600
554 if (--region->vm_usage == 0) { 601 if (--region->vm_usage == 0) {
555 if (region->vm_top > region->vm_start) 602 if (region->vm_top > region->vm_start)
556 delete_nommu_region(region); 603 delete_nommu_region(region);
557 up_write(&nommu_region_sem); 604 up_write(&nommu_region_sem);
558 605
559 if (region->vm_file) 606 if (region->vm_file)
560 fput(region->vm_file); 607 fput(region->vm_file);
561 608
562 /* IO memory and memory shared directly out of the pagecache 609 /* IO memory and memory shared directly out of the pagecache
563 * from ramfs/tmpfs mustn't be released here */ 610 * from ramfs/tmpfs mustn't be released here */
564 if (region->vm_flags & VM_MAPPED_COPY) { 611 if (region->vm_flags & VM_MAPPED_COPY) {
565 kdebug("free series"); 612 kdebug("free series");
566 free_page_series(region->vm_start, region->vm_top); 613 free_page_series(region->vm_start, region->vm_top);
567 } 614 }
568 kmem_cache_free(vm_region_jar, region); 615 kmem_cache_free(vm_region_jar, region);
569 } else { 616 } else {
570 up_write(&nommu_region_sem); 617 up_write(&nommu_region_sem);
571 } 618 }
572 } 619 }
573 620
574 /* 621 /*
575 * release a reference to a region 622 * release a reference to a region
576 */ 623 */
577 static void put_nommu_region(struct vm_region *region) 624 static void put_nommu_region(struct vm_region *region)
578 { 625 {
579 down_write(&nommu_region_sem); 626 down_write(&nommu_region_sem);
580 __put_nommu_region(region); 627 __put_nommu_region(region);
581 } 628 }
582 629
583 /* 630 /*
584 * update protection on a vma 631 * update protection on a vma
585 */ 632 */
586 static void protect_vma(struct vm_area_struct *vma, unsigned long flags) 633 static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
587 { 634 {
588 #ifdef CONFIG_MPU 635 #ifdef CONFIG_MPU
589 struct mm_struct *mm = vma->vm_mm; 636 struct mm_struct *mm = vma->vm_mm;
590 long start = vma->vm_start & PAGE_MASK; 637 long start = vma->vm_start & PAGE_MASK;
591 while (start < vma->vm_end) { 638 while (start < vma->vm_end) {
592 protect_page(mm, start, flags); 639 protect_page(mm, start, flags);
593 start += PAGE_SIZE; 640 start += PAGE_SIZE;
594 } 641 }
595 update_protections(mm); 642 update_protections(mm);
596 #endif 643 #endif
597 } 644 }
598 645
599 /* 646 /*
600 * add a VMA into a process's mm_struct in the appropriate place in the list 647 * add a VMA into a process's mm_struct in the appropriate place in the list
601 * and tree and add to the address space's page tree also if not an anonymous 648 * and tree and add to the address space's page tree also if not an anonymous
602 * page 649 * page
603 * - should be called with mm->mmap_sem held writelocked 650 * - should be called with mm->mmap_sem held writelocked
604 */ 651 */
605 static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) 652 static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
606 { 653 {
607 struct vm_area_struct *pvma, **pp, *next; 654 struct vm_area_struct *pvma, **pp, *next;
608 struct address_space *mapping; 655 struct address_space *mapping;
609 struct rb_node **p, *parent; 656 struct rb_node **p, *parent;
610 657
611 kenter(",%p", vma); 658 kenter(",%p", vma);
612 659
613 BUG_ON(!vma->vm_region); 660 BUG_ON(!vma->vm_region);
614 661
615 mm->map_count++; 662 mm->map_count++;
616 vma->vm_mm = mm; 663 vma->vm_mm = mm;
617 664
618 protect_vma(vma, vma->vm_flags); 665 protect_vma(vma, vma->vm_flags);
619 666
620 /* add the VMA to the mapping */ 667 /* add the VMA to the mapping */
621 if (vma->vm_file) { 668 if (vma->vm_file) {
622 mapping = vma->vm_file->f_mapping; 669 mapping = vma->vm_file->f_mapping;
623 670
624 flush_dcache_mmap_lock(mapping); 671 flush_dcache_mmap_lock(mapping);
625 vma_prio_tree_insert(vma, &mapping->i_mmap); 672 vma_prio_tree_insert(vma, &mapping->i_mmap);
626 flush_dcache_mmap_unlock(mapping); 673 flush_dcache_mmap_unlock(mapping);
627 } 674 }
628 675
629 /* add the VMA to the tree */ 676 /* add the VMA to the tree */
630 parent = NULL; 677 parent = NULL;
631 p = &mm->mm_rb.rb_node; 678 p = &mm->mm_rb.rb_node;
632 while (*p) { 679 while (*p) {
633 parent = *p; 680 parent = *p;
634 pvma = rb_entry(parent, struct vm_area_struct, vm_rb); 681 pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
635 682
636 /* sort by: start addr, end addr, VMA struct addr in that order 683 /* sort by: start addr, end addr, VMA struct addr in that order
637 * (the latter is necessary as we may get identical VMAs) */ 684 * (the latter is necessary as we may get identical VMAs) */
638 if (vma->vm_start < pvma->vm_start) 685 if (vma->vm_start < pvma->vm_start)
639 p = &(*p)->rb_left; 686 p = &(*p)->rb_left;
640 else if (vma->vm_start > pvma->vm_start) 687 else if (vma->vm_start > pvma->vm_start)
641 p = &(*p)->rb_right; 688 p = &(*p)->rb_right;
642 else if (vma->vm_end < pvma->vm_end) 689 else if (vma->vm_end < pvma->vm_end)
643 p = &(*p)->rb_left; 690 p = &(*p)->rb_left;
644 else if (vma->vm_end > pvma->vm_end) 691 else if (vma->vm_end > pvma->vm_end)
645 p = &(*p)->rb_right; 692 p = &(*p)->rb_right;
646 else if (vma < pvma) 693 else if (vma < pvma)
647 p = &(*p)->rb_left; 694 p = &(*p)->rb_left;
648 else if (vma > pvma) 695 else if (vma > pvma)
649 p = &(*p)->rb_right; 696 p = &(*p)->rb_right;
650 else 697 else
651 BUG(); 698 BUG();
652 } 699 }
653 700
654 rb_link_node(&vma->vm_rb, parent, p); 701 rb_link_node(&vma->vm_rb, parent, p);
655 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 702 rb_insert_color(&vma->vm_rb, &mm->mm_rb);
656 703
657 /* add VMA to the VMA list also */ 704 /* add VMA to the VMA list also */
658 for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) { 705 for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) {
659 if (pvma->vm_start > vma->vm_start) 706 if (pvma->vm_start > vma->vm_start)
660 break; 707 break;
661 if (pvma->vm_start < vma->vm_start) 708 if (pvma->vm_start < vma->vm_start)
662 continue; 709 continue;
663 if (pvma->vm_end < vma->vm_end) 710 if (pvma->vm_end < vma->vm_end)
664 break; 711 break;
665 } 712 }
666 713
667 next = *pp; 714 next = *pp;
668 *pp = vma; 715 *pp = vma;
669 vma->vm_next = next; 716 vma->vm_next = next;
670 if (next) 717 if (next)
671 next->vm_prev = vma; 718 next->vm_prev = vma;
672 } 719 }
673 720
674 /* 721 /*
675 * delete a VMA from its owning mm_struct and address space 722 * delete a VMA from its owning mm_struct and address space
676 */ 723 */
677 static void delete_vma_from_mm(struct vm_area_struct *vma) 724 static void delete_vma_from_mm(struct vm_area_struct *vma)
678 { 725 {
679 struct vm_area_struct **pp; 726 struct vm_area_struct **pp;
680 struct address_space *mapping; 727 struct address_space *mapping;
681 struct mm_struct *mm = vma->vm_mm; 728 struct mm_struct *mm = vma->vm_mm;
682 729
683 kenter("%p", vma); 730 kenter("%p", vma);
684 731
685 protect_vma(vma, 0); 732 protect_vma(vma, 0);
686 733
687 mm->map_count--; 734 mm->map_count--;
688 if (mm->mmap_cache == vma) 735 if (mm->mmap_cache == vma)
689 mm->mmap_cache = NULL; 736 mm->mmap_cache = NULL;
690 737
691 /* remove the VMA from the mapping */ 738 /* remove the VMA from the mapping */
692 if (vma->vm_file) { 739 if (vma->vm_file) {
693 mapping = vma->vm_file->f_mapping; 740 mapping = vma->vm_file->f_mapping;
694 741
695 flush_dcache_mmap_lock(mapping); 742 flush_dcache_mmap_lock(mapping);
696 vma_prio_tree_remove(vma, &mapping->i_mmap); 743 vma_prio_tree_remove(vma, &mapping->i_mmap);
697 flush_dcache_mmap_unlock(mapping); 744 flush_dcache_mmap_unlock(mapping);
698 } 745 }
699 746
700 /* remove from the MM's tree and list */ 747 /* remove from the MM's tree and list */
701 rb_erase(&vma->vm_rb, &mm->mm_rb); 748 rb_erase(&vma->vm_rb, &mm->mm_rb);
702 for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) { 749 for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) {
703 if (*pp == vma) { 750 if (*pp == vma) {
704 *pp = vma->vm_next; 751 *pp = vma->vm_next;
705 break; 752 break;
706 } 753 }
707 } 754 }
708 755
709 vma->vm_mm = NULL; 756 vma->vm_mm = NULL;
710 } 757 }
711 758
712 /* 759 /*
713 * destroy a VMA record 760 * destroy a VMA record
714 */ 761 */
715 static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) 762 static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
716 { 763 {
717 kenter("%p", vma); 764 kenter("%p", vma);
718 if (vma->vm_ops && vma->vm_ops->close) 765 if (vma->vm_ops && vma->vm_ops->close)
719 vma->vm_ops->close(vma); 766 vma->vm_ops->close(vma);
720 if (vma->vm_file) { 767 if (vma->vm_file) {
721 fput(vma->vm_file); 768 fput(vma->vm_file);
722 if (vma->vm_flags & VM_EXECUTABLE) 769 if (vma->vm_flags & VM_EXECUTABLE)
723 removed_exe_file_vma(mm); 770 removed_exe_file_vma(mm);
724 } 771 }
725 put_nommu_region(vma->vm_region); 772 put_nommu_region(vma->vm_region);
726 kmem_cache_free(vm_area_cachep, vma); 773 kmem_cache_free(vm_area_cachep, vma);
727 } 774 }
728 775
729 /* 776 /*
730 * look up the first VMA in which addr resides, NULL if none 777 * look up the first VMA in which addr resides, NULL if none
731 * - should be called with mm->mmap_sem at least held readlocked 778 * - should be called with mm->mmap_sem at least held readlocked
732 */ 779 */
733 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 780 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
734 { 781 {
735 struct vm_area_struct *vma; 782 struct vm_area_struct *vma;
736 struct rb_node *n = mm->mm_rb.rb_node; 783 struct rb_node *n = mm->mm_rb.rb_node;
737 784
738 /* check the cache first */ 785 /* check the cache first */
739 vma = mm->mmap_cache; 786 vma = mm->mmap_cache;
740 if (vma && vma->vm_start <= addr && vma->vm_end > addr) 787 if (vma && vma->vm_start <= addr && vma->vm_end > addr)
741 return vma; 788 return vma;
742 789
743 /* trawl the tree (there may be multiple mappings in which addr 790 /* trawl the tree (there may be multiple mappings in which addr
744 * resides) */ 791 * resides) */
745 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { 792 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
746 vma = rb_entry(n, struct vm_area_struct, vm_rb); 793 vma = rb_entry(n, struct vm_area_struct, vm_rb);
747 if (vma->vm_start > addr) 794 if (vma->vm_start > addr)
748 return NULL; 795 return NULL;
749 if (vma->vm_end > addr) { 796 if (vma->vm_end > addr) {
750 mm->mmap_cache = vma; 797 mm->mmap_cache = vma;
751 return vma; 798 return vma;
752 } 799 }
753 } 800 }
754 801
755 return NULL; 802 return NULL;
756 } 803 }
757 EXPORT_SYMBOL(find_vma); 804 EXPORT_SYMBOL(find_vma);
758 805
759 /* 806 /*
760 * find a VMA 807 * find a VMA
761 * - we don't extend stack VMAs under NOMMU conditions 808 * - we don't extend stack VMAs under NOMMU conditions
762 */ 809 */
763 struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) 810 struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
764 { 811 {
765 return find_vma(mm, addr); 812 return find_vma(mm, addr);
766 } 813 }
767 814
768 /* 815 /*
769 * expand a stack to a given address 816 * expand a stack to a given address
770 * - not supported under NOMMU conditions 817 * - not supported under NOMMU conditions
771 */ 818 */
772 int expand_stack(struct vm_area_struct *vma, unsigned long address) 819 int expand_stack(struct vm_area_struct *vma, unsigned long address)
773 { 820 {
774 return -ENOMEM; 821 return -ENOMEM;
775 } 822 }
776 823
777 /* 824 /*
778 * look up the first VMA exactly that exactly matches addr 825 * look up the first VMA exactly that exactly matches addr
779 * - should be called with mm->mmap_sem at least held readlocked 826 * - should be called with mm->mmap_sem at least held readlocked
780 */ 827 */
781 static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, 828 static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
782 unsigned long addr, 829 unsigned long addr,
783 unsigned long len) 830 unsigned long len)
784 { 831 {
785 struct vm_area_struct *vma; 832 struct vm_area_struct *vma;
786 struct rb_node *n = mm->mm_rb.rb_node; 833 struct rb_node *n = mm->mm_rb.rb_node;
787 unsigned long end = addr + len; 834 unsigned long end = addr + len;
788 835
789 /* check the cache first */ 836 /* check the cache first */
790 vma = mm->mmap_cache; 837 vma = mm->mmap_cache;
791 if (vma && vma->vm_start == addr && vma->vm_end == end) 838 if (vma && vma->vm_start == addr && vma->vm_end == end)
792 return vma; 839 return vma;
793 840
794 /* trawl the tree (there may be multiple mappings in which addr 841 /* trawl the tree (there may be multiple mappings in which addr
795 * resides) */ 842 * resides) */
796 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { 843 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
797 vma = rb_entry(n, struct vm_area_struct, vm_rb); 844 vma = rb_entry(n, struct vm_area_struct, vm_rb);
798 if (vma->vm_start < addr) 845 if (vma->vm_start < addr)
799 continue; 846 continue;
800 if (vma->vm_start > addr) 847 if (vma->vm_start > addr)
801 return NULL; 848 return NULL;
802 if (vma->vm_end == end) { 849 if (vma->vm_end == end) {
803 mm->mmap_cache = vma; 850 mm->mmap_cache = vma;
804 return vma; 851 return vma;
805 } 852 }
806 } 853 }
807 854
808 return NULL; 855 return NULL;
809 } 856 }
810 857
811 /* 858 /*
812 * determine whether a mapping should be permitted and, if so, what sort of 859 * determine whether a mapping should be permitted and, if so, what sort of
813 * mapping we're capable of supporting 860 * mapping we're capable of supporting
814 */ 861 */
815 static int validate_mmap_request(struct file *file, 862 static int validate_mmap_request(struct file *file,
816 unsigned long addr, 863 unsigned long addr,
817 unsigned long len, 864 unsigned long len,
818 unsigned long prot, 865 unsigned long prot,
819 unsigned long flags, 866 unsigned long flags,
820 unsigned long pgoff, 867 unsigned long pgoff,
821 unsigned long *_capabilities) 868 unsigned long *_capabilities)
822 { 869 {
823 unsigned long capabilities, rlen; 870 unsigned long capabilities, rlen;
824 unsigned long reqprot = prot; 871 unsigned long reqprot = prot;
825 int ret; 872 int ret;
826 873
827 /* do the simple checks first */ 874 /* do the simple checks first */
828 if (flags & MAP_FIXED) { 875 if (flags & MAP_FIXED) {
829 printk(KERN_DEBUG 876 printk(KERN_DEBUG
830 "%d: Can't do fixed-address/overlay mmap of RAM\n", 877 "%d: Can't do fixed-address/overlay mmap of RAM\n",
831 current->pid); 878 current->pid);
832 return -EINVAL; 879 return -EINVAL;
833 } 880 }
834 881
835 if ((flags & MAP_TYPE) != MAP_PRIVATE && 882 if ((flags & MAP_TYPE) != MAP_PRIVATE &&
836 (flags & MAP_TYPE) != MAP_SHARED) 883 (flags & MAP_TYPE) != MAP_SHARED)
837 return -EINVAL; 884 return -EINVAL;
838 885
839 if (!len) 886 if (!len)
840 return -EINVAL; 887 return -EINVAL;
841 888
842 /* Careful about overflows.. */ 889 /* Careful about overflows.. */
843 rlen = PAGE_ALIGN(len); 890 rlen = PAGE_ALIGN(len);
844 if (!rlen || rlen > TASK_SIZE) 891 if (!rlen || rlen > TASK_SIZE)
845 return -ENOMEM; 892 return -ENOMEM;
846 893
847 /* offset overflow? */ 894 /* offset overflow? */
848 if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff) 895 if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff)
849 return -EOVERFLOW; 896 return -EOVERFLOW;
850 897
851 if (file) { 898 if (file) {
852 /* validate file mapping requests */ 899 /* validate file mapping requests */
853 struct address_space *mapping; 900 struct address_space *mapping;
854 901
855 /* files must support mmap */ 902 /* files must support mmap */
856 if (!file->f_op || !file->f_op->mmap) 903 if (!file->f_op || !file->f_op->mmap)
857 return -ENODEV; 904 return -ENODEV;
858 905
859 /* work out if what we've got could possibly be shared 906 /* work out if what we've got could possibly be shared
860 * - we support chardevs that provide their own "memory" 907 * - we support chardevs that provide their own "memory"
861 * - we support files/blockdevs that are memory backed 908 * - we support files/blockdevs that are memory backed
862 */ 909 */
863 mapping = file->f_mapping; 910 mapping = file->f_mapping;
864 if (!mapping) 911 if (!mapping)
865 mapping = file->f_path.dentry->d_inode->i_mapping; 912 mapping = file->f_path.dentry->d_inode->i_mapping;
866 913
867 capabilities = 0; 914 capabilities = 0;
868 if (mapping && mapping->backing_dev_info) 915 if (mapping && mapping->backing_dev_info)
869 capabilities = mapping->backing_dev_info->capabilities; 916 capabilities = mapping->backing_dev_info->capabilities;
870 917
871 if (!capabilities) { 918 if (!capabilities) {
872 /* no explicit capabilities set, so assume some 919 /* no explicit capabilities set, so assume some
873 * defaults */ 920 * defaults */
874 switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) { 921 switch (file->f_path.dentry->d_inode->i_mode & S_IFMT) {
875 case S_IFREG: 922 case S_IFREG:
876 case S_IFBLK: 923 case S_IFBLK:
877 capabilities = BDI_CAP_MAP_COPY; 924 capabilities = BDI_CAP_MAP_COPY;
878 break; 925 break;
879 926
880 case S_IFCHR: 927 case S_IFCHR:
881 capabilities = 928 capabilities =
882 BDI_CAP_MAP_DIRECT | 929 BDI_CAP_MAP_DIRECT |
883 BDI_CAP_READ_MAP | 930 BDI_CAP_READ_MAP |
884 BDI_CAP_WRITE_MAP; 931 BDI_CAP_WRITE_MAP;
885 break; 932 break;
886 933
887 default: 934 default:
888 return -EINVAL; 935 return -EINVAL;
889 } 936 }
890 } 937 }
891 938
892 /* eliminate any capabilities that we can't support on this 939 /* eliminate any capabilities that we can't support on this
893 * device */ 940 * device */
894 if (!file->f_op->get_unmapped_area) 941 if (!file->f_op->get_unmapped_area)
895 capabilities &= ~BDI_CAP_MAP_DIRECT; 942 capabilities &= ~BDI_CAP_MAP_DIRECT;
896 if (!file->f_op->read) 943 if (!file->f_op->read)
897 capabilities &= ~BDI_CAP_MAP_COPY; 944 capabilities &= ~BDI_CAP_MAP_COPY;
898 945
899 /* The file shall have been opened with read permission. */ 946 /* The file shall have been opened with read permission. */
900 if (!(file->f_mode & FMODE_READ)) 947 if (!(file->f_mode & FMODE_READ))
901 return -EACCES; 948 return -EACCES;
902 949
903 if (flags & MAP_SHARED) { 950 if (flags & MAP_SHARED) {
904 /* do checks for writing, appending and locking */ 951 /* do checks for writing, appending and locking */
905 if ((prot & PROT_WRITE) && 952 if ((prot & PROT_WRITE) &&
906 !(file->f_mode & FMODE_WRITE)) 953 !(file->f_mode & FMODE_WRITE))
907 return -EACCES; 954 return -EACCES;
908 955
909 if (IS_APPEND(file->f_path.dentry->d_inode) && 956 if (IS_APPEND(file->f_path.dentry->d_inode) &&
910 (file->f_mode & FMODE_WRITE)) 957 (file->f_mode & FMODE_WRITE))
911 return -EACCES; 958 return -EACCES;
912 959
913 if (locks_verify_locked(file->f_path.dentry->d_inode)) 960 if (locks_verify_locked(file->f_path.dentry->d_inode))
914 return -EAGAIN; 961 return -EAGAIN;
915 962
916 if (!(capabilities & BDI_CAP_MAP_DIRECT)) 963 if (!(capabilities & BDI_CAP_MAP_DIRECT))
917 return -ENODEV; 964 return -ENODEV;
918 965
919 /* we mustn't privatise shared mappings */ 966 /* we mustn't privatise shared mappings */
920 capabilities &= ~BDI_CAP_MAP_COPY; 967 capabilities &= ~BDI_CAP_MAP_COPY;
921 } 968 }
922 else { 969 else {
923 /* we're going to read the file into private memory we 970 /* we're going to read the file into private memory we
924 * allocate */ 971 * allocate */
925 if (!(capabilities & BDI_CAP_MAP_COPY)) 972 if (!(capabilities & BDI_CAP_MAP_COPY))
926 return -ENODEV; 973 return -ENODEV;
927 974
928 /* we don't permit a private writable mapping to be 975 /* we don't permit a private writable mapping to be
929 * shared with the backing device */ 976 * shared with the backing device */
930 if (prot & PROT_WRITE) 977 if (prot & PROT_WRITE)
931 capabilities &= ~BDI_CAP_MAP_DIRECT; 978 capabilities &= ~BDI_CAP_MAP_DIRECT;
932 } 979 }
933 980
934 if (capabilities & BDI_CAP_MAP_DIRECT) { 981 if (capabilities & BDI_CAP_MAP_DIRECT) {
935 if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || 982 if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
936 ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || 983 ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
937 ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) 984 ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
938 ) { 985 ) {
939 capabilities &= ~BDI_CAP_MAP_DIRECT; 986 capabilities &= ~BDI_CAP_MAP_DIRECT;
940 if (flags & MAP_SHARED) { 987 if (flags & MAP_SHARED) {
941 printk(KERN_WARNING 988 printk(KERN_WARNING
942 "MAP_SHARED not completely supported on !MMU\n"); 989 "MAP_SHARED not completely supported on !MMU\n");
943 return -EINVAL; 990 return -EINVAL;
944 } 991 }
945 } 992 }
946 } 993 }
947 994
948 /* handle executable mappings and implied executable 995 /* handle executable mappings and implied executable
949 * mappings */ 996 * mappings */
950 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { 997 if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) {
951 if (prot & PROT_EXEC) 998 if (prot & PROT_EXEC)
952 return -EPERM; 999 return -EPERM;
953 } 1000 }
954 else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { 1001 else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
955 /* handle implication of PROT_EXEC by PROT_READ */ 1002 /* handle implication of PROT_EXEC by PROT_READ */
956 if (current->personality & READ_IMPLIES_EXEC) { 1003 if (current->personality & READ_IMPLIES_EXEC) {
957 if (capabilities & BDI_CAP_EXEC_MAP) 1004 if (capabilities & BDI_CAP_EXEC_MAP)
958 prot |= PROT_EXEC; 1005 prot |= PROT_EXEC;
959 } 1006 }
960 } 1007 }
961 else if ((prot & PROT_READ) && 1008 else if ((prot & PROT_READ) &&
962 (prot & PROT_EXEC) && 1009 (prot & PROT_EXEC) &&
963 !(capabilities & BDI_CAP_EXEC_MAP) 1010 !(capabilities & BDI_CAP_EXEC_MAP)
964 ) { 1011 ) {
965 /* backing file is not executable, try to copy */ 1012 /* backing file is not executable, try to copy */
966 capabilities &= ~BDI_CAP_MAP_DIRECT; 1013 capabilities &= ~BDI_CAP_MAP_DIRECT;
967 } 1014 }
968 } 1015 }
969 else { 1016 else {
970 /* anonymous mappings are always memory backed and can be 1017 /* anonymous mappings are always memory backed and can be
971 * privately mapped 1018 * privately mapped
972 */ 1019 */
973 capabilities = BDI_CAP_MAP_COPY; 1020 capabilities = BDI_CAP_MAP_COPY;
974 1021
975 /* handle PROT_EXEC implication by PROT_READ */ 1022 /* handle PROT_EXEC implication by PROT_READ */
976 if ((prot & PROT_READ) && 1023 if ((prot & PROT_READ) &&
977 (current->personality & READ_IMPLIES_EXEC)) 1024 (current->personality & READ_IMPLIES_EXEC))
978 prot |= PROT_EXEC; 1025 prot |= PROT_EXEC;
979 } 1026 }
980 1027
981 /* allow the security API to have its say */ 1028 /* allow the security API to have its say */
982 ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); 1029 ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
983 if (ret < 0) 1030 if (ret < 0)
984 return ret; 1031 return ret;
985 1032
986 /* looks okay */ 1033 /* looks okay */
987 *_capabilities = capabilities; 1034 *_capabilities = capabilities;
988 return 0; 1035 return 0;
989 } 1036 }
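
To make the capability narrowing above concrete, here is a small self-contained userspace sketch, not kernel code: the CAP_* constants and narrow_caps() helper are stand-ins for the BDI_CAP_* handling in validate_mmap_request(), showing how a MAP_SHARED request must keep direct-mapping capability while a private writable request falls back to a copy.

#include <stdio.h>

#define CAP_MAP_COPY   0x01  /* can make a private copy of the data */
#define CAP_MAP_DIRECT 0x02  /* can map the backing store directly */
#define CAP_READ_MAP   0x04  /* a direct mapping may be read */
#define CAP_WRITE_MAP  0x08  /* a direct mapping may be written */

#define XPROT_READ  0x1
#define XPROT_WRITE 0x2

/* returns the narrowed capability mask, or -1 for "cannot be mapped" */
static int narrow_caps(unsigned caps, unsigned prot, int shared)
{
        if (shared) {
                /* a shared mapping must sit directly on the backing store */
                if (!(caps & CAP_MAP_DIRECT))
                        return -1;
                caps &= ~CAP_MAP_COPY;          /* mustn't privatise it */
        } else if (prot & XPROT_WRITE) {
                caps &= ~CAP_MAP_DIRECT;        /* private writes need a copy */
        }

        if ((caps & CAP_MAP_DIRECT) &&
            (prot & XPROT_WRITE) && !(caps & CAP_WRITE_MAP))
                caps &= ~CAP_MAP_DIRECT;        /* device won't take writes */

        return (int)caps;
}

int main(void)
{
        unsigned chardev = CAP_MAP_DIRECT | CAP_READ_MAP | CAP_WRITE_MAP;

        printf("shared r/w on a chardev       -> caps %#x\n",
               (unsigned)narrow_caps(chardev, XPROT_READ | XPROT_WRITE, 1));
        printf("private r/w on a regular file -> caps %#x\n",
               (unsigned)narrow_caps(CAP_MAP_COPY, XPROT_READ | XPROT_WRITE, 0));
        return 0;
}
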
990 1037
991 /* 1038 /*
992 * we've determined that we can make the mapping, now translate what we 1039 * we've determined that we can make the mapping, now translate what we
993 * now know into VMA flags 1040 * now know into VMA flags
994 */ 1041 */
995 static unsigned long determine_vm_flags(struct file *file, 1042 static unsigned long determine_vm_flags(struct file *file,
996 unsigned long prot, 1043 unsigned long prot,
997 unsigned long flags, 1044 unsigned long flags,
998 unsigned long capabilities) 1045 unsigned long capabilities)
999 { 1046 {
1000 unsigned long vm_flags; 1047 unsigned long vm_flags;
1001 1048
1002 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); 1049 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
1003 /* vm_flags |= mm->def_flags; */ 1050 /* vm_flags |= mm->def_flags; */
1004 1051
1005 if (!(capabilities & BDI_CAP_MAP_DIRECT)) { 1052 if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
1006 /* attempt to share read-only copies of mapped file chunks */ 1053 /* attempt to share read-only copies of mapped file chunks */
1007 vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 1054 vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1008 if (file && !(prot & PROT_WRITE)) 1055 if (file && !(prot & PROT_WRITE))
1009 vm_flags |= VM_MAYSHARE; 1056 vm_flags |= VM_MAYSHARE;
1010 } else { 1057 } else {
1011 /* overlay a shareable mapping on the backing device or inode 1058 /* overlay a shareable mapping on the backing device or inode
1012 * if possible - used for chardevs, ramfs/tmpfs/shmfs and 1059 * if possible - used for chardevs, ramfs/tmpfs/shmfs and
1013 * romfs/cramfs */ 1060 * romfs/cramfs */
1014 vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS); 1061 vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS);
1015 if (flags & MAP_SHARED) 1062 if (flags & MAP_SHARED)
1016 vm_flags |= VM_SHARED; 1063 vm_flags |= VM_SHARED;
1017 } 1064 }
1018 1065
1019 /* refuse to let anyone share private mappings with this process if 1066 /* refuse to let anyone share private mappings with this process if
1020 * it's being traced - otherwise breakpoints set in it may interfere 1067 * it's being traced - otherwise breakpoints set in it may interfere
1021 * with another untraced process 1068 * with another untraced process
1022 */ 1069 */
1023 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) 1070 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
1024 vm_flags &= ~VM_MAYSHARE; 1071 vm_flags &= ~VM_MAYSHARE;
1025 1072
1026 return vm_flags; 1073 return vm_flags;
1027 } 1074 }
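
As a rough illustration of the two branches in determine_vm_flags(), the sketch below uses local stand-in constants (VMF_*, CAP_MAP_DIRECT), not the kernel's definitions: without direct-mapping capability only a read-only file mapping keeps its may-share bit, otherwise the mapping is overlaid on the backing store and MAP_SHARED adds the shared bit.

#include <stdio.h>

#define CAP_MAP_DIRECT 0x02
#define VMF_MAYSHARE   0x01
#define VMF_SHARED     0x02

static unsigned pick_vm_flags(int have_file, int want_write, int map_shared,
                              unsigned caps)
{
        unsigned vmf = 0;

        if (!(caps & CAP_MAP_DIRECT)) {
                /* private copies: only read-only file maps may be shared */
                if (have_file && !want_write)
                        vmf |= VMF_MAYSHARE;
        } else {
                /* overlay the backing store itself */
                vmf |= VMF_MAYSHARE;
                if (map_shared)
                        vmf |= VMF_SHARED;
        }
        return vmf;
}

int main(void)
{
        printf("copy-backed, read-only file -> %#x\n", pick_vm_flags(1, 0, 0, 0));
        printf("direct, MAP_SHARED          -> %#x\n",
               pick_vm_flags(1, 1, 1, CAP_MAP_DIRECT));
        return 0;
}
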
1028 1075
1029 /* 1076 /*
1030 * set up a shared mapping on a file (the driver or filesystem provides and 1077 * set up a shared mapping on a file (the driver or filesystem provides and
1031 * pins the storage) 1078 * pins the storage)
1032 */ 1079 */
1033 static int do_mmap_shared_file(struct vm_area_struct *vma) 1080 static int do_mmap_shared_file(struct vm_area_struct *vma)
1034 { 1081 {
1035 int ret; 1082 int ret;
1036 1083
1037 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1084 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1038 if (ret == 0) { 1085 if (ret == 0) {
1039 vma->vm_region->vm_top = vma->vm_region->vm_end; 1086 vma->vm_region->vm_top = vma->vm_region->vm_end;
1040 return 0; 1087 return 0;
1041 } 1088 }
1042 if (ret != -ENOSYS) 1089 if (ret != -ENOSYS)
1043 return ret; 1090 return ret;
1044 1091
1045 /* getting -ENOSYS indicates that direct mmap isn't possible (as 1092 /* getting -ENOSYS indicates that direct mmap isn't possible (as
1046 * opposed to tried but failed) so we can only give a suitable error as 1093 * opposed to tried but failed) so we can only give a suitable error as
1047 * it's not possible to make a private copy if MAP_SHARED was given */ 1094 * it's not possible to make a private copy if MAP_SHARED was given */
1048 return -ENODEV; 1095 return -ENODEV;
1049 } 1096 }
1050 1097
1051 /* 1098 /*
1052 * set up a private mapping or an anonymous shared mapping 1099 * set up a private mapping or an anonymous shared mapping
1053 */ 1100 */
1054 static int do_mmap_private(struct vm_area_struct *vma, 1101 static int do_mmap_private(struct vm_area_struct *vma,
1055 struct vm_region *region, 1102 struct vm_region *region,
1056 unsigned long len, 1103 unsigned long len,
1057 unsigned long capabilities) 1104 unsigned long capabilities)
1058 { 1105 {
1059 struct page *pages; 1106 struct page *pages;
1060 unsigned long total, point, n, rlen; 1107 unsigned long total, point, n, rlen;
1061 void *base; 1108 void *base;
1062 int ret, order; 1109 int ret, order;
1063 1110
1064 /* invoke the file's mapping function so that it can keep track of 1111 /* invoke the file's mapping function so that it can keep track of
1065 * shared mappings on devices or memory 1112 * shared mappings on devices or memory
1066 * - VM_MAYSHARE will be set if it may attempt to share 1113 * - VM_MAYSHARE will be set if it may attempt to share
1067 */ 1114 */
1068 if (capabilities & BDI_CAP_MAP_DIRECT) { 1115 if (capabilities & BDI_CAP_MAP_DIRECT) {
1069 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1116 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1070 if (ret == 0) { 1117 if (ret == 0) {
1071 /* shouldn't return success if we're not sharing */ 1118 /* shouldn't return success if we're not sharing */
1072 BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); 1119 BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
1073 vma->vm_region->vm_top = vma->vm_region->vm_end; 1120 vma->vm_region->vm_top = vma->vm_region->vm_end;
1074 return 0; 1121 return 0;
1075 } 1122 }
1076 if (ret != -ENOSYS) 1123 if (ret != -ENOSYS)
1077 return ret; 1124 return ret;
1078 1125
1079 /* getting an ENOSYS error indicates that direct mmap isn't 1126 /* getting an ENOSYS error indicates that direct mmap isn't
1080 * possible (as opposed to tried but failed) so we'll try to 1127 * possible (as opposed to tried but failed) so we'll try to
1081 * make a private copy of the data and map that instead */ 1128 * make a private copy of the data and map that instead */
1082 } 1129 }
1083 1130
1084 rlen = PAGE_ALIGN(len); 1131 rlen = PAGE_ALIGN(len);
1085 1132
1086 /* allocate some memory to hold the mapping 1133 /* allocate some memory to hold the mapping
1087 * - note that this may not return a page-aligned address if the object 1134 * - note that this may not return a page-aligned address if the object
1088 * we're allocating is smaller than a page 1135 * we're allocating is smaller than a page
1089 */ 1136 */
1090 order = get_order(rlen); 1137 order = get_order(rlen);
1091 kdebug("alloc order %d for %lx", order, len); 1138 kdebug("alloc order %d for %lx", order, len);
1092 1139
1093 pages = alloc_pages(GFP_KERNEL, order); 1140 pages = alloc_pages(GFP_KERNEL, order);
1094 if (!pages) 1141 if (!pages)
1095 goto enomem; 1142 goto enomem;
1096 1143
1097 total = 1 << order; 1144 total = 1 << order;
1098 atomic_long_add(total, &mmap_pages_allocated); 1145 atomic_long_add(total, &mmap_pages_allocated);
1099 1146
1100 point = rlen >> PAGE_SHIFT; 1147 point = rlen >> PAGE_SHIFT;
1101 1148
1102 /* we allocated a power-of-2 sized page set, so we may want to trim off 1149 /* we allocated a power-of-2 sized page set, so we may want to trim off
1103 * the excess */ 1150 * the excess */
1104 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { 1151 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
1105 while (total > point) { 1152 while (total > point) {
1106 order = ilog2(total - point); 1153 order = ilog2(total - point);
1107 n = 1 << order; 1154 n = 1 << order;
1108 kdebug("shave %lu/%lu @%lu", n, total - point, total); 1155 kdebug("shave %lu/%lu @%lu", n, total - point, total);
1109 atomic_long_sub(n, &mmap_pages_allocated); 1156 atomic_long_sub(n, &mmap_pages_allocated);
1110 total -= n; 1157 total -= n;
1111 set_page_refcounted(pages + total); 1158 set_page_refcounted(pages + total);
1112 __free_pages(pages + total, order); 1159 __free_pages(pages + total, order);
1113 } 1160 }
1114 } 1161 }
1115 1162
1116 for (point = 1; point < total; point++) 1163 for (point = 1; point < total; point++)
1117 set_page_refcounted(&pages[point]); 1164 set_page_refcounted(&pages[point]);
1118 1165
1119 base = page_address(pages); 1166 base = page_address(pages);
1120 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; 1167 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
1121 region->vm_start = (unsigned long) base; 1168 region->vm_start = (unsigned long) base;
1122 region->vm_end = region->vm_start + rlen; 1169 region->vm_end = region->vm_start + rlen;
1123 region->vm_top = region->vm_start + (total << PAGE_SHIFT); 1170 region->vm_top = region->vm_start + (total << PAGE_SHIFT);
1124 1171
1125 vma->vm_start = region->vm_start; 1172 vma->vm_start = region->vm_start;
1126 vma->vm_end = region->vm_start + len; 1173 vma->vm_end = region->vm_start + len;
1127 1174
1128 if (vma->vm_file) { 1175 if (vma->vm_file) {
1129 /* read the contents of a file into the copy */ 1176 /* read the contents of a file into the copy */
1130 mm_segment_t old_fs; 1177 mm_segment_t old_fs;
1131 loff_t fpos; 1178 loff_t fpos;
1132 1179
1133 fpos = vma->vm_pgoff; 1180 fpos = vma->vm_pgoff;
1134 fpos <<= PAGE_SHIFT; 1181 fpos <<= PAGE_SHIFT;
1135 1182
1136 old_fs = get_fs(); 1183 old_fs = get_fs();
1137 set_fs(KERNEL_DS); 1184 set_fs(KERNEL_DS);
1138 ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos); 1185 ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos);
1139 set_fs(old_fs); 1186 set_fs(old_fs);
1140 1187
1141 if (ret < 0) 1188 if (ret < 0)
1142 goto error_free; 1189 goto error_free;
1143 1190
1144 /* clear the last little bit */ 1191 /* clear the last little bit */
1145 if (ret < rlen) 1192 if (ret < rlen)
1146 memset(base + ret, 0, rlen - ret); 1193 memset(base + ret, 0, rlen - ret);
1147 1194
1148 } 1195 }
1149 1196
1150 return 0; 1197 return 0;
1151 1198
1152 error_free: 1199 error_free:
1153 free_page_series(region->vm_start, region->vm_end); 1200 free_page_series(region->vm_start, region->vm_end);
1154 region->vm_start = vma->vm_start = 0; 1201 region->vm_start = vma->vm_start = 0;
1155 region->vm_end = vma->vm_end = 0; 1202 region->vm_end = vma->vm_end = 0;
1156 region->vm_top = 0; 1203 region->vm_top = 0;
1157 return ret; 1204 return ret;
1158 1205
1159 enomem: 1206 enomem:
1160 printk("Allocation of length %lu from process %d (%s) failed\n", 1207 printk("Allocation of length %lu from process %d (%s) failed\n",
1161 len, current->pid, current->comm); 1208 len, current->pid, current->comm);
1162 show_free_areas(); 1209 show_free_areas();
1163 return -ENOMEM; 1210 return -ENOMEM;
1164 } 1211 }
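
The allocation and trim dance in do_mmap_private() can be followed with plain arithmetic: a request for five pages is served from an order-3 (eight page) block, and the surplus is handed back in decreasing power-of-two chunks. The sketch below is a userspace illustration of that loop; no memory is really allocated.

#include <stdio.h>

static unsigned ilog2u(unsigned long n)         /* floor(log2(n)), n >= 1 */
{
        unsigned r = 0;
        while (n >>= 1)
                r++;
        return r;
}

static unsigned order_for(unsigned long pages)  /* smallest covering order */
{
        unsigned order = 0;
        while ((1UL << order) < pages)
                order++;
        return order;
}

int main(void)
{
        unsigned long want = 5;                 /* pages actually needed */
        unsigned order = order_for(want);
        unsigned long total = 1UL << order;     /* 8 pages handed out */

        printf("alloc order %u (%lu pages) for %lu pages\n", order, total, want);
        while (total > want) {
                unsigned long chunk = 1UL << ilog2u(total - want);
                total -= chunk;
                printf("  free %lu page(s) starting at page index %lu\n",
                       chunk, total);
        }
        printf("left with exactly %lu pages\n", total);
        return 0;
}
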
1165 1212
1166 /* 1213 /*
1167 * handle mapping creation for uClinux 1214 * handle mapping creation for uClinux
1168 */ 1215 */
1169 unsigned long do_mmap_pgoff(struct file *file, 1216 unsigned long do_mmap_pgoff(struct file *file,
1170 unsigned long addr, 1217 unsigned long addr,
1171 unsigned long len, 1218 unsigned long len,
1172 unsigned long prot, 1219 unsigned long prot,
1173 unsigned long flags, 1220 unsigned long flags,
1174 unsigned long pgoff) 1221 unsigned long pgoff)
1175 { 1222 {
1176 struct vm_area_struct *vma; 1223 struct vm_area_struct *vma;
1177 struct vm_region *region; 1224 struct vm_region *region;
1178 struct rb_node *rb; 1225 struct rb_node *rb;
1179 unsigned long capabilities, vm_flags, result; 1226 unsigned long capabilities, vm_flags, result;
1180 int ret; 1227 int ret;
1181 1228
1182 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); 1229 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1183 1230
1184 /* decide whether we should attempt the mapping, and if so what sort of 1231 /* decide whether we should attempt the mapping, and if so what sort of
1185 * mapping */ 1232 * mapping */
1186 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, 1233 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
1187 &capabilities); 1234 &capabilities);
1188 if (ret < 0) { 1235 if (ret < 0) {
1189 kleave(" = %d [val]", ret); 1236 kleave(" = %d [val]", ret);
1190 return ret; 1237 return ret;
1191 } 1238 }
1192 1239
1193 /* we ignore the address hint */ 1240 /* we ignore the address hint */
1194 addr = 0; 1241 addr = 0;
1195 1242
1196 /* we've determined that we can make the mapping, now translate what we 1243 /* we've determined that we can make the mapping, now translate what we
1197 * now know into VMA flags */ 1244 * now know into VMA flags */
1198 vm_flags = determine_vm_flags(file, prot, flags, capabilities); 1245 vm_flags = determine_vm_flags(file, prot, flags, capabilities);
1199 1246
1200 /* we're going to need to record the mapping */ 1247 /* we're going to need to record the mapping */
1201 region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); 1248 region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
1202 if (!region) 1249 if (!region)
1203 goto error_getting_region; 1250 goto error_getting_region;
1204 1251
1205 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 1252 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1206 if (!vma) 1253 if (!vma)
1207 goto error_getting_vma; 1254 goto error_getting_vma;
1208 1255
1209 region->vm_usage = 1; 1256 region->vm_usage = 1;
1210 region->vm_flags = vm_flags; 1257 region->vm_flags = vm_flags;
1211 region->vm_pgoff = pgoff; 1258 region->vm_pgoff = pgoff;
1212 1259
1213 INIT_LIST_HEAD(&vma->anon_vma_chain); 1260 INIT_LIST_HEAD(&vma->anon_vma_chain);
1214 vma->vm_flags = vm_flags; 1261 vma->vm_flags = vm_flags;
1215 vma->vm_pgoff = pgoff; 1262 vma->vm_pgoff = pgoff;
1216 1263
1217 if (file) { 1264 if (file) {
1218 region->vm_file = file; 1265 region->vm_file = file;
1219 get_file(file); 1266 get_file(file);
1220 vma->vm_file = file; 1267 vma->vm_file = file;
1221 get_file(file); 1268 get_file(file);
1222 if (vm_flags & VM_EXECUTABLE) { 1269 if (vm_flags & VM_EXECUTABLE) {
1223 added_exe_file_vma(current->mm); 1270 added_exe_file_vma(current->mm);
1224 vma->vm_mm = current->mm; 1271 vma->vm_mm = current->mm;
1225 } 1272 }
1226 } 1273 }
1227 1274
1228 down_write(&nommu_region_sem); 1275 down_write(&nommu_region_sem);
1229 1276
1230 /* if we want to share, we need to check for regions created by other 1277 /* if we want to share, we need to check for regions created by other
1231 * mmap() calls that overlap with our proposed mapping 1278 * mmap() calls that overlap with our proposed mapping
1232 * - we can only share with a superset match on most regular files 1279 * - we can only share with a superset match on most regular files
1233 * - shared mappings on character devices and memory backed files are 1280 * - shared mappings on character devices and memory backed files are
1234 * permitted to overlap inexactly as far as we are concerned for in 1281 * permitted to overlap inexactly as far as we are concerned for in
1235 * these cases, sharing is handled in the driver or filesystem rather 1282 * these cases, sharing is handled in the driver or filesystem rather
1236 * than here 1283 * than here
1237 */ 1284 */
1238 if (vm_flags & VM_MAYSHARE) { 1285 if (vm_flags & VM_MAYSHARE) {
1239 struct vm_region *pregion; 1286 struct vm_region *pregion;
1240 unsigned long pglen, rpglen, pgend, rpgend, start; 1287 unsigned long pglen, rpglen, pgend, rpgend, start;
1241 1288
1242 pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1289 pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1243 pgend = pgoff + pglen; 1290 pgend = pgoff + pglen;
1244 1291
1245 for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { 1292 for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
1246 pregion = rb_entry(rb, struct vm_region, vm_rb); 1293 pregion = rb_entry(rb, struct vm_region, vm_rb);
1247 1294
1248 if (!(pregion->vm_flags & VM_MAYSHARE)) 1295 if (!(pregion->vm_flags & VM_MAYSHARE))
1249 continue; 1296 continue;
1250 1297
1251 /* search for overlapping mappings on the same file */ 1298 /* search for overlapping mappings on the same file */
1252 if (pregion->vm_file->f_path.dentry->d_inode != 1299 if (pregion->vm_file->f_path.dentry->d_inode !=
1253 file->f_path.dentry->d_inode) 1300 file->f_path.dentry->d_inode)
1254 continue; 1301 continue;
1255 1302
1256 if (pregion->vm_pgoff >= pgend) 1303 if (pregion->vm_pgoff >= pgend)
1257 continue; 1304 continue;
1258 1305
1259 rpglen = pregion->vm_end - pregion->vm_start; 1306 rpglen = pregion->vm_end - pregion->vm_start;
1260 rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT; 1307 rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
1261 rpgend = pregion->vm_pgoff + rpglen; 1308 rpgend = pregion->vm_pgoff + rpglen;
1262 if (pgoff >= rpgend) 1309 if (pgoff >= rpgend)
1263 continue; 1310 continue;
1264 1311
1265 /* handle inexactly overlapping matches between 1312 /* handle inexactly overlapping matches between
1266 * mappings */ 1313 * mappings */
1267 if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && 1314 if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
1268 !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { 1315 !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
1269 /* new mapping is not a subset of the region */ 1316 /* new mapping is not a subset of the region */
1270 if (!(capabilities & BDI_CAP_MAP_DIRECT)) 1317 if (!(capabilities & BDI_CAP_MAP_DIRECT))
1271 goto sharing_violation; 1318 goto sharing_violation;
1272 continue; 1319 continue;
1273 } 1320 }
1274 1321
1275 /* we've found a region we can share */ 1322 /* we've found a region we can share */
1276 pregion->vm_usage++; 1323 pregion->vm_usage++;
1277 vma->vm_region = pregion; 1324 vma->vm_region = pregion;
1278 start = pregion->vm_start; 1325 start = pregion->vm_start;
1279 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; 1326 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
1280 vma->vm_start = start; 1327 vma->vm_start = start;
1281 vma->vm_end = start + len; 1328 vma->vm_end = start + len;
1282 1329
1283 if (pregion->vm_flags & VM_MAPPED_COPY) { 1330 if (pregion->vm_flags & VM_MAPPED_COPY) {
1284 kdebug("share copy"); 1331 kdebug("share copy");
1285 vma->vm_flags |= VM_MAPPED_COPY; 1332 vma->vm_flags |= VM_MAPPED_COPY;
1286 } else { 1333 } else {
1287 kdebug("share mmap"); 1334 kdebug("share mmap");
1288 ret = do_mmap_shared_file(vma); 1335 ret = do_mmap_shared_file(vma);
1289 if (ret < 0) { 1336 if (ret < 0) {
1290 vma->vm_region = NULL; 1337 vma->vm_region = NULL;
1291 vma->vm_start = 0; 1338 vma->vm_start = 0;
1292 vma->vm_end = 0; 1339 vma->vm_end = 0;
1293 pregion->vm_usage--; 1340 pregion->vm_usage--;
1294 pregion = NULL; 1341 pregion = NULL;
1295 goto error_just_free; 1342 goto error_just_free;
1296 } 1343 }
1297 } 1344 }
1298 fput(region->vm_file); 1345 fput(region->vm_file);
1299 kmem_cache_free(vm_region_jar, region); 1346 kmem_cache_free(vm_region_jar, region);
1300 region = pregion; 1347 region = pregion;
1301 result = start; 1348 result = start;
1302 goto share; 1349 goto share;
1303 } 1350 }
1304 1351
1305 /* obtain the address at which to make a shared mapping 1352 /* obtain the address at which to make a shared mapping
1306 * - this is the hook for quasi-memory character devices to 1353 * - this is the hook for quasi-memory character devices to
1307 * tell us the location of a shared mapping 1354 * tell us the location of a shared mapping
1308 */ 1355 */
1309 if (capabilities & BDI_CAP_MAP_DIRECT) { 1356 if (capabilities & BDI_CAP_MAP_DIRECT) {
1310 addr = file->f_op->get_unmapped_area(file, addr, len, 1357 addr = file->f_op->get_unmapped_area(file, addr, len,
1311 pgoff, flags); 1358 pgoff, flags);
1312 if (IS_ERR((void *) addr)) { 1359 if (IS_ERR((void *) addr)) {
1313 ret = addr; 1360 ret = addr;
1314 if (ret != (unsigned long) -ENOSYS) 1361 if (ret != (unsigned long) -ENOSYS)
1315 goto error_just_free; 1362 goto error_just_free;
1316 1363
1317 /* the driver refused to tell us where to site 1364 /* the driver refused to tell us where to site
1318 * the mapping so we'll have to attempt to copy 1365 * the mapping so we'll have to attempt to copy
1319 * it */ 1366 * it */
1320 ret = (unsigned long) -ENODEV; 1367 ret = (unsigned long) -ENODEV;
1321 if (!(capabilities & BDI_CAP_MAP_COPY)) 1368 if (!(capabilities & BDI_CAP_MAP_COPY))
1322 goto error_just_free; 1369 goto error_just_free;
1323 1370
1324 capabilities &= ~BDI_CAP_MAP_DIRECT; 1371 capabilities &= ~BDI_CAP_MAP_DIRECT;
1325 } else { 1372 } else {
1326 vma->vm_start = region->vm_start = addr; 1373 vma->vm_start = region->vm_start = addr;
1327 vma->vm_end = region->vm_end = addr + len; 1374 vma->vm_end = region->vm_end = addr + len;
1328 } 1375 }
1329 } 1376 }
1330 } 1377 }
1331 1378
1332 vma->vm_region = region; 1379 vma->vm_region = region;
1333 1380
1334 /* set up the mapping 1381 /* set up the mapping
1335 * - the region is filled in if BDI_CAP_MAP_DIRECT is still set 1382 * - the region is filled in if BDI_CAP_MAP_DIRECT is still set
1336 */ 1383 */
1337 if (file && vma->vm_flags & VM_SHARED) 1384 if (file && vma->vm_flags & VM_SHARED)
1338 ret = do_mmap_shared_file(vma); 1385 ret = do_mmap_shared_file(vma);
1339 else 1386 else
1340 ret = do_mmap_private(vma, region, len, capabilities); 1387 ret = do_mmap_private(vma, region, len, capabilities);
1341 if (ret < 0) 1388 if (ret < 0)
1342 goto error_just_free; 1389 goto error_just_free;
1343 add_nommu_region(region); 1390 add_nommu_region(region);
1344 1391
1345 /* clear anonymous mappings that don't ask for uninitialized data */ 1392 /* clear anonymous mappings that don't ask for uninitialized data */
1346 if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) 1393 if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
1347 memset((void *)region->vm_start, 0, 1394 memset((void *)region->vm_start, 0,
1348 region->vm_end - region->vm_start); 1395 region->vm_end - region->vm_start);
1349 1396
1350 /* okay... we have a mapping; now we have to register it */ 1397 /* okay... we have a mapping; now we have to register it */
1351 result = vma->vm_start; 1398 result = vma->vm_start;
1352 1399
1353 current->mm->total_vm += len >> PAGE_SHIFT; 1400 current->mm->total_vm += len >> PAGE_SHIFT;
1354 1401
1355 share: 1402 share:
1356 add_vma_to_mm(current->mm, vma); 1403 add_vma_to_mm(current->mm, vma);
1357 1404
1358 /* we flush the region from the icache only when the first executable 1405 /* we flush the region from the icache only when the first executable
1359 * mapping of it is made */ 1406 * mapping of it is made */
1360 if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { 1407 if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
1361 flush_icache_range(region->vm_start, region->vm_end); 1408 flush_icache_range(region->vm_start, region->vm_end);
1362 region->vm_icache_flushed = true; 1409 region->vm_icache_flushed = true;
1363 } 1410 }
1364 1411
1365 up_write(&nommu_region_sem); 1412 up_write(&nommu_region_sem);
1366 1413
1367 kleave(" = %lx", result); 1414 kleave(" = %lx", result);
1368 return result; 1415 return result;
1369 1416
1370 error_just_free: 1417 error_just_free:
1371 up_write(&nommu_region_sem); 1418 up_write(&nommu_region_sem);
1372 error: 1419 error:
1373 if (region->vm_file) 1420 if (region->vm_file)
1374 fput(region->vm_file); 1421 fput(region->vm_file);
1375 kmem_cache_free(vm_region_jar, region); 1422 kmem_cache_free(vm_region_jar, region);
1376 if (vma->vm_file) 1423 if (vma->vm_file)
1377 fput(vma->vm_file); 1424 fput(vma->vm_file);
1378 if (vma->vm_flags & VM_EXECUTABLE) 1425 if (vma->vm_flags & VM_EXECUTABLE)
1379 removed_exe_file_vma(vma->vm_mm); 1426 removed_exe_file_vma(vma->vm_mm);
1380 kmem_cache_free(vm_area_cachep, vma); 1427 kmem_cache_free(vm_area_cachep, vma);
1381 kleave(" = %d", ret); 1428 kleave(" = %d", ret);
1382 return ret; 1429 return ret;
1383 1430
1384 sharing_violation: 1431 sharing_violation:
1385 up_write(&nommu_region_sem); 1432 up_write(&nommu_region_sem);
1386 printk(KERN_WARNING "Attempt to share mismatched mappings\n"); 1433 printk(KERN_WARNING "Attempt to share mismatched mappings\n");
1387 ret = -EINVAL; 1434 ret = -EINVAL;
1388 goto error; 1435 goto error;
1389 1436
1390 error_getting_vma: 1437 error_getting_vma:
1391 kmem_cache_free(vm_region_jar, region); 1438 kmem_cache_free(vm_region_jar, region);
1392 printk(KERN_WARNING "Allocation of vma for %lu byte allocation" 1439 printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
1393 " from process %d failed\n", 1440 " from process %d failed\n",
1394 len, current->pid); 1441 len, current->pid);
1395 show_free_areas(); 1442 show_free_areas();
1396 return -ENOMEM; 1443 return -ENOMEM;
1397 1444
1398 error_getting_region: 1445 error_getting_region:
1399 printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" 1446 printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
1400 " from process %d failed\n", 1447 " from process %d failed\n",
1401 len, current->pid); 1448 len, current->pid);
1402 show_free_areas(); 1449 show_free_areas();
1403 return -ENOMEM; 1450 return -ENOMEM;
1404 } 1451 }
1405 EXPORT_SYMBOL(do_mmap_pgoff); 1452 EXPORT_SYMBOL(do_mmap_pgoff);
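
The region-sharing search in do_mmap_pgoff() boils down to a page-range subset test: the proposed mapping may piggy-back on an existing shareable region only if it matches it exactly or sits wholly inside it. A small sketch of that test, with offsets and lengths in pages:

#include <stdio.h>

static const char *classify(unsigned long pgoff, unsigned long pglen,
                            unsigned long rpgoff, unsigned long rpglen)
{
        unsigned long pgend = pgoff + pglen;
        unsigned long rpgend = rpgoff + rpglen;

        if (rpgoff >= pgend || pgoff >= rpgend)
                return "no overlap - keep searching";
        if (pgoff == rpgoff && pglen == rpglen)
                return "exact match - share the region";
        if (pgoff >= rpgoff && pgend <= rpgend)
                return "subset - share the region";
        return "partial overlap - only OK if the device handles sharing";
}

int main(void)
{
        /* existing region covers pages [10, 20) of the file */
        printf("[12,16): %s\n", classify(12, 4, 10, 10));
        printf("[18,24): %s\n", classify(18, 6, 10, 10));
        printf("[30,32): %s\n", classify(30, 2, 10, 10));
        return 0;
}
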
1406 1453
1407 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1454 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1408 unsigned long, prot, unsigned long, flags, 1455 unsigned long, prot, unsigned long, flags,
1409 unsigned long, fd, unsigned long, pgoff) 1456 unsigned long, fd, unsigned long, pgoff)
1410 { 1457 {
1411 struct file *file = NULL; 1458 struct file *file = NULL;
1412 unsigned long retval = -EBADF; 1459 unsigned long retval = -EBADF;
1413 1460
1414 if (!(flags & MAP_ANONYMOUS)) { 1461 if (!(flags & MAP_ANONYMOUS)) {
1415 file = fget(fd); 1462 file = fget(fd);
1416 if (!file) 1463 if (!file)
1417 goto out; 1464 goto out;
1418 } 1465 }
1419 1466
1420 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1467 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1421 1468
1422 down_write(&current->mm->mmap_sem); 1469 down_write(&current->mm->mmap_sem);
1423 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); 1470 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1424 up_write(&current->mm->mmap_sem); 1471 up_write(&current->mm->mmap_sem);
1425 1472
1426 if (file) 1473 if (file)
1427 fput(file); 1474 fput(file);
1428 out: 1475 out:
1429 return retval; 1476 return retval;
1430 } 1477 }
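
For reference, a minimal userspace caller of the path above; on a no-MMU kernel the address hint is ignored and MAP_FIXED is refused, so portable code passes NULL and accepts whatever address comes back.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 4096;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return EXIT_FAILURE;
        }
        memset(p, 0, len);      /* anonymous mappings come back zeroed anyway */
        munmap(p, len);
        return EXIT_SUCCESS;
}
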
1431 1478
1432 #ifdef __ARCH_WANT_SYS_OLD_MMAP 1479 #ifdef __ARCH_WANT_SYS_OLD_MMAP
1433 struct mmap_arg_struct { 1480 struct mmap_arg_struct {
1434 unsigned long addr; 1481 unsigned long addr;
1435 unsigned long len; 1482 unsigned long len;
1436 unsigned long prot; 1483 unsigned long prot;
1437 unsigned long flags; 1484 unsigned long flags;
1438 unsigned long fd; 1485 unsigned long fd;
1439 unsigned long offset; 1486 unsigned long offset;
1440 }; 1487 };
1441 1488
1442 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) 1489 SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1443 { 1490 {
1444 struct mmap_arg_struct a; 1491 struct mmap_arg_struct a;
1445 1492
1446 if (copy_from_user(&a, arg, sizeof(a))) 1493 if (copy_from_user(&a, arg, sizeof(a)))
1447 return -EFAULT; 1494 return -EFAULT;
1448 if (a.offset & ~PAGE_MASK) 1495 if (a.offset & ~PAGE_MASK)
1449 return -EINVAL; 1496 return -EINVAL;
1450 1497
1451 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, 1498 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
1452 a.offset >> PAGE_SHIFT); 1499 a.offset >> PAGE_SHIFT);
1453 } 1500 }
1454 #endif /* __ARCH_WANT_SYS_OLD_MMAP */ 1501 #endif /* __ARCH_WANT_SYS_OLD_MMAP */
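
The legacy wrapper above only has to validate and convert the byte offset; a trivial illustration of that check, assuming a 4 KiB page size purely for the example:

#include <stdio.h>

#define PG_SIZE 4096UL
#define PG_MASK (~(PG_SIZE - 1))

int main(void)
{
        unsigned long offsets[] = { 0, 8192, 12345 };

        for (int i = 0; i < 3; i++) {
                unsigned long off = offsets[i];

                if (off & ~PG_MASK)
                        printf("offset %lu: rejected (-EINVAL), not page aligned\n",
                               off);
                else
                        printf("offset %lu: pgoff %lu\n", off, off / PG_SIZE);
        }
        return 0;
}
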
1455 1502
1456 /* 1503 /*
1457 * split a vma into two pieces at address 'addr', a new vma is allocated either 1504 * split a vma into two pieces at address 'addr', a new vma is allocated either
1458 * for the first part or the tail. 1505 * for the first part or the tail.
1459 */ 1506 */
1460 int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, 1507 int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1461 unsigned long addr, int new_below) 1508 unsigned long addr, int new_below)
1462 { 1509 {
1463 struct vm_area_struct *new; 1510 struct vm_area_struct *new;
1464 struct vm_region *region; 1511 struct vm_region *region;
1465 unsigned long npages; 1512 unsigned long npages;
1466 1513
1467 kenter(""); 1514 kenter("");
1468 1515
1469 /* we're only permitted to split anonymous regions (these should have 1516 /* we're only permitted to split anonymous regions (these should have
1470 * only a single usage on the region) */ 1517 * only a single usage on the region) */
1471 if (vma->vm_file) 1518 if (vma->vm_file)
1472 return -ENOMEM; 1519 return -ENOMEM;
1473 1520
1474 if (mm->map_count >= sysctl_max_map_count) 1521 if (mm->map_count >= sysctl_max_map_count)
1475 return -ENOMEM; 1522 return -ENOMEM;
1476 1523
1477 region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL); 1524 region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
1478 if (!region) 1525 if (!region)
1479 return -ENOMEM; 1526 return -ENOMEM;
1480 1527
1481 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 1528 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
1482 if (!new) { 1529 if (!new) {
1483 kmem_cache_free(vm_region_jar, region); 1530 kmem_cache_free(vm_region_jar, region);
1484 return -ENOMEM; 1531 return -ENOMEM;
1485 } 1532 }
1486 1533
1487 /* most fields are the same, copy all, and then fixup */ 1534 /* most fields are the same, copy all, and then fixup */
1488 *new = *vma; 1535 *new = *vma;
1489 *region = *vma->vm_region; 1536 *region = *vma->vm_region;
1490 new->vm_region = region; 1537 new->vm_region = region;
1491 1538
1492 npages = (addr - vma->vm_start) >> PAGE_SHIFT; 1539 npages = (addr - vma->vm_start) >> PAGE_SHIFT;
1493 1540
1494 if (new_below) { 1541 if (new_below) {
1495 region->vm_top = region->vm_end = new->vm_end = addr; 1542 region->vm_top = region->vm_end = new->vm_end = addr;
1496 } else { 1543 } else {
1497 region->vm_start = new->vm_start = addr; 1544 region->vm_start = new->vm_start = addr;
1498 region->vm_pgoff = new->vm_pgoff += npages; 1545 region->vm_pgoff = new->vm_pgoff += npages;
1499 } 1546 }
1500 1547
1501 if (new->vm_ops && new->vm_ops->open) 1548 if (new->vm_ops && new->vm_ops->open)
1502 new->vm_ops->open(new); 1549 new->vm_ops->open(new);
1503 1550
1504 delete_vma_from_mm(vma); 1551 delete_vma_from_mm(vma);
1505 down_write(&nommu_region_sem); 1552 down_write(&nommu_region_sem);
1506 delete_nommu_region(vma->vm_region); 1553 delete_nommu_region(vma->vm_region);
1507 if (new_below) { 1554 if (new_below) {
1508 vma->vm_region->vm_start = vma->vm_start = addr; 1555 vma->vm_region->vm_start = vma->vm_start = addr;
1509 vma->vm_region->vm_pgoff = vma->vm_pgoff += npages; 1556 vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
1510 } else { 1557 } else {
1511 vma->vm_region->vm_end = vma->vm_end = addr; 1558 vma->vm_region->vm_end = vma->vm_end = addr;
1512 vma->vm_region->vm_top = addr; 1559 vma->vm_region->vm_top = addr;
1513 } 1560 }
1514 add_nommu_region(vma->vm_region); 1561 add_nommu_region(vma->vm_region);
1515 add_nommu_region(new->vm_region); 1562 add_nommu_region(new->vm_region);
1516 up_write(&nommu_region_sem); 1563 up_write(&nommu_region_sem);
1517 add_vma_to_mm(mm, vma); 1564 add_vma_to_mm(mm, vma);
1518 add_vma_to_mm(mm, new); 1565 add_vma_to_mm(mm, new);
1519 return 0; 1566 return 0;
1520 } 1567 }
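
The fix-ups in split_vma() are mostly offset arithmetic: whichever half starts at the split address also has its page offset advanced by the number of pages skipped. A standalone sketch of that bookkeeping, using a local struct rather than the kernel's vm_area_struct:

#include <stdio.h>

#define PG_SHIFT 12     /* assume 4 KiB pages for the example */

struct range { unsigned long start, end, pgoff; };

static void split(struct range *low, struct range *high,
                  const struct range *orig, unsigned long addr)
{
        unsigned long npages = (addr - orig->start) >> PG_SHIFT;

        *low = *orig;
        low->end = addr;

        *high = *orig;
        high->start = addr;
        high->pgoff = orig->pgoff + npages;
}

int main(void)
{
        struct range orig = { 0x100000, 0x108000, 0 };  /* 8 pages */
        struct range low, high;

        split(&low, &high, &orig, 0x103000);
        printf("low : %#lx-%#lx pgoff %lu\n", low.start, low.end, low.pgoff);
        printf("high: %#lx-%#lx pgoff %lu\n", high.start, high.end, high.pgoff);
        return 0;
}
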
1521 1568
1522 /* 1569 /*
1523 * shrink a VMA by removing the specified chunk from either the beginning or 1570 * shrink a VMA by removing the specified chunk from either the beginning or
1524 * the end 1571 * the end
1525 */ 1572 */
1526 static int shrink_vma(struct mm_struct *mm, 1573 static int shrink_vma(struct mm_struct *mm,
1527 struct vm_area_struct *vma, 1574 struct vm_area_struct *vma,
1528 unsigned long from, unsigned long to) 1575 unsigned long from, unsigned long to)
1529 { 1576 {
1530 struct vm_region *region; 1577 struct vm_region *region;
1531 1578
1532 kenter(""); 1579 kenter("");
1533 1580
1534 /* adjust the VMA's pointers, which may reposition it in the MM's tree 1581 /* adjust the VMA's pointers, which may reposition it in the MM's tree
1535 * and list */ 1582 * and list */
1536 delete_vma_from_mm(vma); 1583 delete_vma_from_mm(vma);
1537 if (from > vma->vm_start) 1584 if (from > vma->vm_start)
1538 vma->vm_end = from; 1585 vma->vm_end = from;
1539 else 1586 else
1540 vma->vm_start = to; 1587 vma->vm_start = to;
1541 add_vma_to_mm(mm, vma); 1588 add_vma_to_mm(mm, vma);
1542 1589
1543 /* cut the backing region down to size */ 1590 /* cut the backing region down to size */
1544 region = vma->vm_region; 1591 region = vma->vm_region;
1545 BUG_ON(region->vm_usage != 1); 1592 BUG_ON(region->vm_usage != 1);
1546 1593
1547 down_write(&nommu_region_sem); 1594 down_write(&nommu_region_sem);
1548 delete_nommu_region(region); 1595 delete_nommu_region(region);
1549 if (from > region->vm_start) { 1596 if (from > region->vm_start) {
1550 to = region->vm_top; 1597 to = region->vm_top;
1551 region->vm_top = region->vm_end = from; 1598 region->vm_top = region->vm_end = from;
1552 } else { 1599 } else {
1553 region->vm_start = to; 1600 region->vm_start = to;
1554 } 1601 }
1555 add_nommu_region(region); 1602 add_nommu_region(region);
1556 up_write(&nommu_region_sem); 1603 up_write(&nommu_region_sem);
1557 1604
1558 free_page_series(from, to); 1605 free_page_series(from, to);
1559 return 0; 1606 return 0;
1560 } 1607 }
1561 1608
1562 /* 1609 /*
1563 * release a mapping 1610 * release a mapping
1564 * - under NOMMU conditions the chunk to be unmapped must be backed by a single 1611 * - under NOMMU conditions the chunk to be unmapped must be backed by a single
1565 * VMA, though it need not cover the whole VMA 1612 * VMA, though it need not cover the whole VMA
1566 */ 1613 */
1567 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) 1614 int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1568 { 1615 {
1569 struct vm_area_struct *vma; 1616 struct vm_area_struct *vma;
1570 struct rb_node *rb; 1617 struct rb_node *rb;
1571 unsigned long end = start + len; 1618 unsigned long end = start + len;
1572 int ret; 1619 int ret;
1573 1620
1574 kenter(",%lx,%zx", start, len); 1621 kenter(",%lx,%zx", start, len);
1575 1622
1576 if (len == 0) 1623 if (len == 0)
1577 return -EINVAL; 1624 return -EINVAL;
1578 1625
1579 /* find the first potentially overlapping VMA */ 1626 /* find the first potentially overlapping VMA */
1580 vma = find_vma(mm, start); 1627 vma = find_vma(mm, start);
1581 if (!vma) { 1628 if (!vma) {
1582 static int limit = 0; 1629 static int limit = 0;
1583 if (limit < 5) { 1630 if (limit < 5) {
1584 printk(KERN_WARNING 1631 printk(KERN_WARNING
1585 "munmap of memory not mmapped by process %d" 1632 "munmap of memory not mmapped by process %d"
1586 " (%s): 0x%lx-0x%lx\n", 1633 " (%s): 0x%lx-0x%lx\n",
1587 current->pid, current->comm, 1634 current->pid, current->comm,
1588 start, start + len - 1); 1635 start, start + len - 1);
1589 limit++; 1636 limit++;
1590 } 1637 }
1591 return -EINVAL; 1638 return -EINVAL;
1592 } 1639 }
1593 1640
1594 /* we're allowed to split an anonymous VMA but not a file-backed one */ 1641 /* we're allowed to split an anonymous VMA but not a file-backed one */
1595 if (vma->vm_file) { 1642 if (vma->vm_file) {
1596 do { 1643 do {
1597 if (start > vma->vm_start) { 1644 if (start > vma->vm_start) {
1598 kleave(" = -EINVAL [miss]"); 1645 kleave(" = -EINVAL [miss]");
1599 return -EINVAL; 1646 return -EINVAL;
1600 } 1647 }
1601 if (end == vma->vm_end) 1648 if (end == vma->vm_end)
1602 goto erase_whole_vma; 1649 goto erase_whole_vma;
1603 rb = rb_next(&vma->vm_rb); 1650 rb = rb_next(&vma->vm_rb);
1604 vma = rb_entry(rb, struct vm_area_struct, vm_rb); 1651 vma = rb_entry(rb, struct vm_area_struct, vm_rb);
1605 } while (rb); 1652 } while (rb);
1606 kleave(" = -EINVAL [split file]"); 1653 kleave(" = -EINVAL [split file]");
1607 return -EINVAL; 1654 return -EINVAL;
1608 } else { 1655 } else {
1609 /* the chunk must be a subset of the VMA found */ 1656 /* the chunk must be a subset of the VMA found */
1610 if (start == vma->vm_start && end == vma->vm_end) 1657 if (start == vma->vm_start && end == vma->vm_end)
1611 goto erase_whole_vma; 1658 goto erase_whole_vma;
1612 if (start < vma->vm_start || end > vma->vm_end) { 1659 if (start < vma->vm_start || end > vma->vm_end) {
1613 kleave(" = -EINVAL [superset]"); 1660 kleave(" = -EINVAL [superset]");
1614 return -EINVAL; 1661 return -EINVAL;
1615 } 1662 }
1616 if (start & ~PAGE_MASK) { 1663 if (start & ~PAGE_MASK) {
1617 kleave(" = -EINVAL [unaligned start]"); 1664 kleave(" = -EINVAL [unaligned start]");
1618 return -EINVAL; 1665 return -EINVAL;
1619 } 1666 }
1620 if (end != vma->vm_end && end & ~PAGE_MASK) { 1667 if (end != vma->vm_end && end & ~PAGE_MASK) {
1621 kleave(" = -EINVAL [unaligned split]"); 1668 kleave(" = -EINVAL [unaligned split]");
1622 return -EINVAL; 1669 return -EINVAL;
1623 } 1670 }
1624 if (start != vma->vm_start && end != vma->vm_end) { 1671 if (start != vma->vm_start && end != vma->vm_end) {
1625 ret = split_vma(mm, vma, start, 1); 1672 ret = split_vma(mm, vma, start, 1);
1626 if (ret < 0) { 1673 if (ret < 0) {
1627 kleave(" = %d [split]", ret); 1674 kleave(" = %d [split]", ret);
1628 return ret; 1675 return ret;
1629 } 1676 }
1630 } 1677 }
1631 return shrink_vma(mm, vma, start, end); 1678 return shrink_vma(mm, vma, start, end);
1632 } 1679 }
1633 1680
1634 erase_whole_vma: 1681 erase_whole_vma:
1635 delete_vma_from_mm(vma); 1682 delete_vma_from_mm(vma);
1636 delete_vma(mm, vma); 1683 delete_vma(mm, vma);
1637 kleave(" = 0"); 1684 kleave(" = 0");
1638 return 0; 1685 return 0;
1639 } 1686 }
1640 EXPORT_SYMBOL(do_munmap); 1687 EXPORT_SYMBOL(do_munmap);
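
A userspace usage example that stays within the nommu do_munmap() rules: file-backed mappings must be released whole, but an anonymous mapping may be trimmed by unmapping a page-aligned chunk at either end.

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        size_t len = 4 * (size_t)page;

        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return EXIT_FAILURE;
        }

        /* give back the last two pages; the first two stay mapped */
        if (munmap(p + 2 * page, 2 * (size_t)page) != 0)
                perror("munmap (tail)");

        p[0] = 'x';             /* the remaining pages are still valid */
        munmap(p, 2 * (size_t)page);
        return EXIT_SUCCESS;
}
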
1641 1688
1642 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) 1689 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
1643 { 1690 {
1644 int ret; 1691 int ret;
1645 struct mm_struct *mm = current->mm; 1692 struct mm_struct *mm = current->mm;
1646 1693
1647 down_write(&mm->mmap_sem); 1694 down_write(&mm->mmap_sem);
1648 ret = do_munmap(mm, addr, len); 1695 ret = do_munmap(mm, addr, len);
1649 up_write(&mm->mmap_sem); 1696 up_write(&mm->mmap_sem);
1650 return ret; 1697 return ret;
1651 } 1698 }
1652 1699
1653 /* 1700 /*
1654 * release all the mappings made in a process's VM space 1701 * release all the mappings made in a process's VM space
1655 */ 1702 */
1656 void exit_mmap(struct mm_struct *mm) 1703 void exit_mmap(struct mm_struct *mm)
1657 { 1704 {
1658 struct vm_area_struct *vma; 1705 struct vm_area_struct *vma;
1659 1706
1660 if (!mm) 1707 if (!mm)
1661 return; 1708 return;
1662 1709
1663 kenter(""); 1710 kenter("");
1664 1711
1665 mm->total_vm = 0; 1712 mm->total_vm = 0;
1666 1713
1667 while ((vma = mm->mmap)) { 1714 while ((vma = mm->mmap)) {
1668 mm->mmap = vma->vm_next; 1715 mm->mmap = vma->vm_next;
1669 delete_vma_from_mm(vma); 1716 delete_vma_from_mm(vma);
1670 delete_vma(mm, vma); 1717 delete_vma(mm, vma);
1671 } 1718 }
1672 1719
1673 kleave(""); 1720 kleave("");
1674 } 1721 }
1675 1722
1676 unsigned long do_brk(unsigned long addr, unsigned long len) 1723 unsigned long do_brk(unsigned long addr, unsigned long len)
1677 { 1724 {
1678 return -ENOMEM; 1725 return -ENOMEM;
1679 } 1726 }
1680 1727
1681 /* 1728 /*
1682 * expand (or shrink) an existing mapping, potentially moving it at the same 1729 * expand (or shrink) an existing mapping, potentially moving it at the same
1683 * time (controlled by the MREMAP_MAYMOVE flag and available VM space) 1730 * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
1684 * 1731 *
1685 * under NOMMU conditions, we only permit changing a mapping's size, and only 1732 * under NOMMU conditions, we only permit changing a mapping's size, and only
1686 * as long as it stays within the region allocated by do_mmap_private() and the 1733 * as long as it stays within the region allocated by do_mmap_private() and the
1687 * block is not shareable 1734 * block is not shareable
1688 * 1735 *
1689 * MREMAP_FIXED is not supported under NOMMU conditions 1736 * MREMAP_FIXED is not supported under NOMMU conditions
1690 */ 1737 */
1691 unsigned long do_mremap(unsigned long addr, 1738 unsigned long do_mremap(unsigned long addr,
1692 unsigned long old_len, unsigned long new_len, 1739 unsigned long old_len, unsigned long new_len,
1693 unsigned long flags, unsigned long new_addr) 1740 unsigned long flags, unsigned long new_addr)
1694 { 1741 {
1695 struct vm_area_struct *vma; 1742 struct vm_area_struct *vma;
1696 1743
1697 /* insanity checks first */ 1744 /* insanity checks first */
1698 if (old_len == 0 || new_len == 0) 1745 if (old_len == 0 || new_len == 0)
1699 return (unsigned long) -EINVAL; 1746 return (unsigned long) -EINVAL;
1700 1747
1701 if (addr & ~PAGE_MASK) 1748 if (addr & ~PAGE_MASK)
1702 return -EINVAL; 1749 return -EINVAL;
1703 1750
1704 if (flags & MREMAP_FIXED && new_addr != addr) 1751 if (flags & MREMAP_FIXED && new_addr != addr)
1705 return (unsigned long) -EINVAL; 1752 return (unsigned long) -EINVAL;
1706 1753
1707 vma = find_vma_exact(current->mm, addr, old_len); 1754 vma = find_vma_exact(current->mm, addr, old_len);
1708 if (!vma) 1755 if (!vma)
1709 return (unsigned long) -EINVAL; 1756 return (unsigned long) -EINVAL;
1710 1757
1711 if (vma->vm_end != vma->vm_start + old_len) 1758 if (vma->vm_end != vma->vm_start + old_len)
1712 return (unsigned long) -EFAULT; 1759 return (unsigned long) -EFAULT;
1713 1760
1714 if (vma->vm_flags & VM_MAYSHARE) 1761 if (vma->vm_flags & VM_MAYSHARE)
1715 return (unsigned long) -EPERM; 1762 return (unsigned long) -EPERM;
1716 1763
1717 if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) 1764 if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
1718 return (unsigned long) -ENOMEM; 1765 return (unsigned long) -ENOMEM;
1719 1766
1720 /* all checks complete - do it */ 1767 /* all checks complete - do it */
1721 vma->vm_end = vma->vm_start + new_len; 1768 vma->vm_end = vma->vm_start + new_len;
1722 return vma->vm_start; 1769 return vma->vm_start;
1723 } 1770 }
1724 EXPORT_SYMBOL(do_mremap); 1771 EXPORT_SYMBOL(do_mremap);
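
Correspondingly, a userspace mremap() call that respects the restrictions documented above: resize in place only, no move, no MREMAP_FIXED; growing can only succeed if the original allocation left slack in the backing region.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        size_t old_len = 4 * (size_t)page;

        void *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return EXIT_FAILURE;
        }

        /* shrink to two pages; an in-place shrink is always permitted */
        void *q = mremap(p, old_len, 2 * (size_t)page, 0);
        if (q == MAP_FAILED)
                perror("mremap");
        else
                printf("resized at %p (unchanged address expected)\n", q);

        munmap(q == MAP_FAILED ? p : q, 2 * (size_t)page);
        return EXIT_SUCCESS;
}
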
1725 1772
1726 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, 1773 SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1727 unsigned long, new_len, unsigned long, flags, 1774 unsigned long, new_len, unsigned long, flags,
1728 unsigned long, new_addr) 1775 unsigned long, new_addr)
1729 { 1776 {
1730 unsigned long ret; 1777 unsigned long ret;
1731 1778
1732 down_write(&current->mm->mmap_sem); 1779 down_write(&current->mm->mmap_sem);
1733 ret = do_mremap(addr, old_len, new_len, flags, new_addr); 1780 ret = do_mremap(addr, old_len, new_len, flags, new_addr);
1734 up_write(&current->mm->mmap_sem); 1781 up_write(&current->mm->mmap_sem);
1735 return ret; 1782 return ret;
1736 } 1783 }
1737 1784
1738 struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1785 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1739 unsigned int foll_flags) 1786 unsigned int foll_flags)
1740 { 1787 {
1741 return NULL; 1788 return NULL;
1742 } 1789 }
1743 1790
1744 int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, 1791 int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
1745 unsigned long to, unsigned long size, pgprot_t prot) 1792 unsigned long to, unsigned long size, pgprot_t prot)
1746 { 1793 {
1747 vma->vm_start = vma->vm_pgoff << PAGE_SHIFT; 1794 vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
1748 return 0; 1795 return 0;
1749 } 1796 }
1750 EXPORT_SYMBOL(remap_pfn_range); 1797 EXPORT_SYMBOL(remap_pfn_range);
1751 1798
1752 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 1799 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1753 unsigned long pgoff) 1800 unsigned long pgoff)
1754 { 1801 {
1755 unsigned int size = vma->vm_end - vma->vm_start; 1802 unsigned int size = vma->vm_end - vma->vm_start;
1756 1803
1757 if (!(vma->vm_flags & VM_USERMAP)) 1804 if (!(vma->vm_flags & VM_USERMAP))
1758 return -EINVAL; 1805 return -EINVAL;
1759 1806
1760 vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT)); 1807 vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
1761 vma->vm_end = vma->vm_start + size; 1808 vma->vm_end = vma->vm_start + size;
1762 1809
1763 return 0; 1810 return 0;
1764 } 1811 }
1765 EXPORT_SYMBOL(remap_vmalloc_range); 1812 EXPORT_SYMBOL(remap_vmalloc_range);
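
A driver that wants user space to map a vmalloc'd buffer would call remap_vmalloc_range() from its mmap handler. The fragment below is illustrative only and not from this tree; it assumes a buffer set up elsewhere with vmalloc_user(), which marks the area VM_USERMAP so the check above passes, and it builds against kernel headers rather than as a standalone program.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

#define MYDEV_BUF_SIZE (64 * 1024)

static void *mydev_buf; /* assumed: mydev_buf = vmalloc_user(MYDEV_BUF_SIZE); */

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
        /* refuse requests larger than the buffer we carved out */
        if (vma->vm_end - vma->vm_start > MYDEV_BUF_SIZE)
                return -EINVAL;

        /* point the VMA at the vmalloc'd buffer, honouring the page offset */
        return remap_vmalloc_range(vma, mydev_buf, vma->vm_pgoff);
}
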
1766 1813
1767 void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 1814 void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1768 { 1815 {
1769 } 1816 }
1770 1817
1771 unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, 1818 unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
1772 unsigned long len, unsigned long pgoff, unsigned long flags) 1819 unsigned long len, unsigned long pgoff, unsigned long flags)
1773 { 1820 {
1774 return -ENOMEM; 1821 return -ENOMEM;
1775 } 1822 }
1776 1823
1777 void arch_unmap_area(struct mm_struct *mm, unsigned long addr) 1824 void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1778 { 1825 {
1779 } 1826 }
1780 1827
1781 void unmap_mapping_range(struct address_space *mapping, 1828 void unmap_mapping_range(struct address_space *mapping,
1782 loff_t const holebegin, loff_t const holelen, 1829 loff_t const holebegin, loff_t const holelen,
1783 int even_cows) 1830 int even_cows)
1784 { 1831 {
1785 } 1832 }
1786 EXPORT_SYMBOL(unmap_mapping_range); 1833 EXPORT_SYMBOL(unmap_mapping_range);
1787 1834
1788 /* 1835 /*
1789 * Check that a process has enough memory to allocate a new virtual 1836 * Check that a process has enough memory to allocate a new virtual
1790 * mapping. 0 means there is enough memory for the allocation to 1837 * mapping. 0 means there is enough memory for the allocation to
1791 * succeed and -ENOMEM implies there is not. 1838 * succeed and -ENOMEM implies there is not.
1792 * 1839 *
1793 * We currently support three overcommit policies, which are set via the 1840 * We currently support three overcommit policies, which are set via the
1794 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting 1841 * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
1795 * 1842 *
1796 * Strict overcommit modes added 2002 Feb 26 by Alan Cox. 1843 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
1797 * Additional code 2002 Jul 20 by Robert Love. 1844 * Additional code 2002 Jul 20 by Robert Love.
1798 * 1845 *
1799 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. 1846 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
1800 * 1847 *
1801 * Note this is a helper function intended to be used by LSMs which 1848 * Note this is a helper function intended to be used by LSMs which
1802 * wish to use this logic. 1849 * wish to use this logic.
1803 */ 1850 */
1804 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 1851 int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1805 { 1852 {
1806 unsigned long free, allowed; 1853 unsigned long free, allowed;
1807 1854
1808 vm_acct_memory(pages); 1855 vm_acct_memory(pages);
1809 1856
1810 /* 1857 /*
1811 * Sometimes we want to use more memory than we have 1858 * Sometimes we want to use more memory than we have
1812 */ 1859 */
1813 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) 1860 if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
1814 return 0; 1861 return 0;
1815 1862
1816 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 1863 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
1817 unsigned long n; 1864 unsigned long n;
1818 1865
1819 free = global_page_state(NR_FILE_PAGES); 1866 free = global_page_state(NR_FILE_PAGES);
1820 free += nr_swap_pages; 1867 free += nr_swap_pages;
1821 1868
1822 /* 1869 /*
1823 * Any slabs which are created with the 1870 * Any slabs which are created with the
1824 * SLAB_RECLAIM_ACCOUNT flag claim to have contents 1871 * SLAB_RECLAIM_ACCOUNT flag claim to have contents
1825 * which are reclaimable, under pressure. The dentry 1872 * which are reclaimable, under pressure. The dentry
1826 * cache and most inode caches should fall into this 1873 * cache and most inode caches should fall into this
1827 */ 1874 */
1828 free += global_page_state(NR_SLAB_RECLAIMABLE); 1875 free += global_page_state(NR_SLAB_RECLAIMABLE);
1829 1876
1830 /* 1877 /*
1831 * Leave the last 3% for root 1878 * Leave the last 3% for root
1832 */ 1879 */
1833 if (!cap_sys_admin) 1880 if (!cap_sys_admin)
1834 free -= free / 32; 1881 free -= free / 32;
1835 1882
1836 if (free > pages) 1883 if (free > pages)
1837 return 0; 1884 return 0;
1838 1885
1839 /* 1886 /*
1840 * nr_free_pages() is very expensive on large systems, 1887 * nr_free_pages() is very expensive on large systems,
1841 * only call if we're about to fail. 1888 * only call if we're about to fail.
1842 */ 1889 */
1843 n = nr_free_pages(); 1890 n = nr_free_pages();
1844 1891
1845 /* 1892 /*
1846 * Leave reserved pages. The pages are not for anonymous pages. 1893 * Leave reserved pages. The pages are not for anonymous pages.
1847 */ 1894 */
1848 if (n <= totalreserve_pages) 1895 if (n <= totalreserve_pages)
1849 goto error; 1896 goto error;
1850 else 1897 else
1851 n -= totalreserve_pages; 1898 n -= totalreserve_pages;
1852 1899
1853 /* 1900 /*
1854 * Leave the last 3% for root 1901 * Leave the last 3% for root
1855 */ 1902 */
1856 if (!cap_sys_admin) 1903 if (!cap_sys_admin)
1857 n -= n / 32; 1904 n -= n / 32;
1858 free += n; 1905 free += n;
1859 1906
1860 if (free > pages) 1907 if (free > pages)
1861 return 0; 1908 return 0;
1862 1909
1863 goto error; 1910 goto error;
1864 } 1911 }
1865 1912
1866 allowed = totalram_pages * sysctl_overcommit_ratio / 100; 1913 allowed = totalram_pages * sysctl_overcommit_ratio / 100;
1867 /* 1914 /*
1868 * Leave the last 3% for root 1915 * Leave the last 3% for root
1869 */ 1916 */
1870 if (!cap_sys_admin) 1917 if (!cap_sys_admin)
1871 allowed -= allowed / 32; 1918 allowed -= allowed / 32;
1872 allowed += total_swap_pages; 1919 allowed += total_swap_pages;
1873 1920
1874 /* Don't let a single process grow too big: 1921 /* Don't let a single process grow too big:
1875 leave 3% of the size of this process for other processes */ 1922 leave 3% of the size of this process for other processes */
1876 if (mm) 1923 if (mm)
1877 allowed -= mm->total_vm / 32; 1924 allowed -= mm->total_vm / 32;
1878 1925
1879 if (percpu_counter_read_positive(&vm_committed_as) < allowed) 1926 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
1880 return 0; 1927 return 0;
1881 1928
1882 error: 1929 error:
1883 vm_unacct_memory(pages); 1930 vm_unacct_memory(pages);
1884 1931
1885 return -ENOMEM; 1932 return -ENOMEM;
1886 } 1933 }
1887 1934
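The strict-overcommit path after the OVERCOMMIT_GUESS block reduces to simple arithmetic. A minimal user-space sketch with hypothetical numbers (1 GiB of RAM, 512 MiB of swap, overcommit_ratio = 50, non-root caller; the per-mm total_vm adjustment is left out) illustrates the commit limit it produces:

#include <stdio.h>

int main(void)
{
	/* hypothetical machine: 262144 RAM pages (1 GiB), 131072 swap pages */
	unsigned long totalram_pages = 262144;
	unsigned long total_swap_pages = 131072;
	unsigned long overcommit_ratio = 50;
	unsigned long allowed;

	allowed = totalram_pages * overcommit_ratio / 100;	/* 131072 */
	allowed -= allowed / 32;	/* keep ~3% back for root: 126976 */
	allowed += total_swap_pages;	/* 258048 pages, about 1008 MiB */

	printf("commit limit: %lu pages\n", allowed);
	return 0;
}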
1888 int in_gate_area_no_task(unsigned long addr) 1935 int in_gate_area_no_task(unsigned long addr)
1889 { 1936 {
1890 return 0; 1937 return 0;
1891 } 1938 }
1892 1939
1893 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1940 int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1894 { 1941 {
1895 BUG(); 1942 BUG();
1896 return 0; 1943 return 0;
1897 } 1944 }
1898 EXPORT_SYMBOL(filemap_fault); 1945 EXPORT_SYMBOL(filemap_fault);
1899 1946
1900 /* 1947 /*
1901 * Access another process' address space. 1948 * Access another process' address space.
1902 * - source/target buffer must be kernel space 1949 * - source/target buffer must be kernel space
1903 */ 1950 */
1904 int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) 1951 int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
1905 { 1952 {
1906 struct vm_area_struct *vma; 1953 struct vm_area_struct *vma;
1907 struct mm_struct *mm; 1954 struct mm_struct *mm;
1908 1955
1909 if (addr + len < addr) 1956 if (addr + len < addr)
1910 return 0; 1957 return 0;
1911 1958
1912 mm = get_task_mm(tsk); 1959 mm = get_task_mm(tsk);
1913 if (!mm) 1960 if (!mm)
1914 return 0; 1961 return 0;
1915 1962
1916 down_read(&mm->mmap_sem); 1963 down_read(&mm->mmap_sem);
1917 1964
1918 /* the access must start within one of the target process's mappings */ 1965 /* the access must start within one of the target process's mappings */
1919 vma = find_vma(mm, addr); 1966 vma = find_vma(mm, addr);
1920 if (vma) { 1967 if (vma) {
1921 /* don't overrun this mapping */ 1968 /* don't overrun this mapping */
1922 if (addr + len >= vma->vm_end) 1969 if (addr + len >= vma->vm_end)
1923 len = vma->vm_end - addr; 1970 len = vma->vm_end - addr;
1924 1971
1925 /* only read or write mappings where it is permitted */ 1972 /* only read or write mappings where it is permitted */
1926 if (write && vma->vm_flags & VM_MAYWRITE) 1973 if (write && vma->vm_flags & VM_MAYWRITE)
1927 copy_to_user_page(vma, NULL, addr, 1974 copy_to_user_page(vma, NULL, addr,
1928 (void *) addr, buf, len); 1975 (void *) addr, buf, len);
1929 else if (!write && vma->vm_flags & VM_MAYREAD) 1976 else if (!write && vma->vm_flags & VM_MAYREAD)
1930 copy_from_user_page(vma, NULL, addr, 1977 copy_from_user_page(vma, NULL, addr,
1931 buf, (void *) addr, len); 1978 buf, (void *) addr, len);
1932 else 1979 else
1933 len = 0; 1980 len = 0;
1934 } else { 1981 } else {
1935 len = 0; 1982 len = 0;
1936 } 1983 }
1937 1984
1938 up_read(&mm->mmap_sem); 1985 up_read(&mm->mmap_sem);
1939 mmput(mm); 1986 mmput(mm);
1940 return len; 1987 return len;
1941 } 1988 }
1942 1989
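A hedged sketch of a caller of access_process_vm() on the read path; peek_remote and the fixed 16-byte buffer are illustrative names, not part of this file:

#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/kernel.h>

static int peek_remote(struct task_struct *tsk, unsigned long addr)
{
	char buf[16];
	int copied;

	/* write == 0: read from the target's mapping into our kernel buffer */
	copied = access_process_vm(tsk, addr, buf, sizeof(buf), 0);
	if (copied <= 0)
		return -EFAULT;	/* no mapping at addr, or VM_MAYREAD not set */

	printk(KERN_DEBUG "copied %d bytes from %lx\n", copied, addr);
	return 0;
}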
1943 /** 1990 /**
1944 * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode 1991 * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode
1945 * @inode: The inode to check 1992 * @inode: The inode to check
1946 * @size: The current filesize of the inode 1993 * @size: The current filesize of the inode
1947 * @newsize: The proposed filesize of the inode 1994 * @newsize: The proposed filesize of the inode
1948 * 1995 *
1949 * Check the shared mappings on an inode on behalf of a shrinking truncate to 1996 * Check the shared mappings on an inode on behalf of a shrinking truncate to
1950 * make sure that any outstanding VMAs aren't broken and then shrink the 1997 * make sure that any outstanding VMAs aren't broken and then shrink the
1951 * vm_regions that extend beyond it so that do_mmap_pgoff() doesn't 1998 * vm_regions that extend beyond it so that do_mmap_pgoff() doesn't
1952 * automatically grant mappings that are too large. 1999 * automatically grant mappings that are too large.
1953 */ 2000 */
1954 int nommu_shrink_inode_mappings(struct inode *inode, size_t size, 2001 int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
1955 size_t newsize) 2002 size_t newsize)
1956 { 2003 {
1957 struct vm_area_struct *vma; 2004 struct vm_area_struct *vma;
1958 struct prio_tree_iter iter; 2005 struct prio_tree_iter iter;
1959 struct vm_region *region; 2006 struct vm_region *region;
1960 pgoff_t low, high; 2007 pgoff_t low, high;
1961 size_t r_size, r_top; 2008 size_t r_size, r_top;
1962 2009
1963 low = newsize >> PAGE_SHIFT; 2010 low = newsize >> PAGE_SHIFT;
1964 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 2011 high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1965 2012
1966 down_write(&nommu_region_sem); 2013 down_write(&nommu_region_sem);
1967 2014
1968 /* search for VMAs that fall within the dead zone */ 2015 /* search for VMAs that fall within the dead zone */
1969 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2016 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
1970 low, high) { 2017 low, high) {
1971 /* found one - only interested if it's shared out of the page 2018 /* found one - only interested if it's shared out of the page
1972 * cache */ 2019 * cache */
1973 if (vma->vm_flags & VM_SHARED) { 2020 if (vma->vm_flags & VM_SHARED) {
1974 up_write(&nommu_region_sem); 2021 up_write(&nommu_region_sem);
1975 return -ETXTBSY; /* not quite true, but near enough */ 2022 return -ETXTBSY; /* not quite true, but near enough */
1976 } 2023 }
1977 } 2024 }
1978 2025
1979 /* reduce any regions that overlap the dead zone - if in existence, 2026 /* reduce any regions that overlap the dead zone - if in existence,
1980 * these will be pointed to by VMAs that don't overlap the dead zone 2027 * these will be pointed to by VMAs that don't overlap the dead zone
1981 * 2028 *
1982 * we don't check for any regions that start beyond the EOF as there 2029 * we don't check for any regions that start beyond the EOF as there
1983 * shouldn't be any 2030 * shouldn't be any
1984 */ 2031 */
1985 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2032 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
1986 0, ULONG_MAX) { 2033 0, ULONG_MAX) {
1987 if (!(vma->vm_flags & VM_SHARED)) 2034 if (!(vma->vm_flags & VM_SHARED))
1988 continue; 2035 continue;
1989 2036
1990 region = vma->vm_region; 2037 region = vma->vm_region;
1991 r_size = region->vm_top - region->vm_start; 2038 r_size = region->vm_top - region->vm_start;
1992 r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size; 2039 r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size;
1993 2040
1994 if (r_top > newsize) { 2041 if (r_top > newsize) {
1995 region->vm_top -= r_top - newsize; 2042 region->vm_top -= r_top - newsize;
1996 if (region->vm_end > region->vm_top) 2043 if (region->vm_end > region->vm_top)
1997 region->vm_end = region->vm_top; 2044 region->vm_end = region->vm_top;
1998 } 2045 }
1999 } 2046 }
2000 2047
2001 up_write(&nommu_region_sem); 2048 up_write(&nommu_region_sem);
2002 return 0; 2049 return 0;
2003 } 2050 }
2004 2051
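A hedged sketch of how a nommu filesystem's truncate path might use nommu_shrink_inode_mappings(); example_setsize is a hypothetical helper and the locking a real filesystem would need is omitted:

#include <linux/fs.h>
#include <linux/mm.h>

static int example_setsize(struct inode *inode, loff_t newsize)
{
	int ret;

	if (newsize < inode->i_size) {
		/* refuse the shrink while shared mappings still cover the tail */
		ret = nommu_shrink_inode_mappings(inode, inode->i_size, newsize);
		if (ret)	/* typically -ETXTBSY */
			return ret;
	}
	truncate_setsize(inode, newsize);
	return 0;
}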
1 /* 1 /*
2 * linux/mm/vmalloc.c 2 * linux/mm/vmalloc.c
3 * 3 *
4 * Copyright (C) 1993 Linus Torvalds 4 * Copyright (C) 1993 Linus Torvalds
5 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 5 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
6 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000 6 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
7 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 7 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
8 * Numa awareness, Christoph Lameter, SGI, June 2005 8 * Numa awareness, Christoph Lameter, SGI, June 2005
9 */ 9 */
10 10
11 #include <linux/vmalloc.h> 11 #include <linux/vmalloc.h>
12 #include <linux/mm.h> 12 #include <linux/mm.h>
13 #include <linux/module.h> 13 #include <linux/module.h>
14 #include <linux/highmem.h> 14 #include <linux/highmem.h>
15 #include <linux/sched.h> 15 #include <linux/sched.h>
16 #include <linux/slab.h> 16 #include <linux/slab.h>
17 #include <linux/spinlock.h> 17 #include <linux/spinlock.h>
18 #include <linux/interrupt.h> 18 #include <linux/interrupt.h>
19 #include <linux/proc_fs.h> 19 #include <linux/proc_fs.h>
20 #include <linux/seq_file.h> 20 #include <linux/seq_file.h>
21 #include <linux/debugobjects.h> 21 #include <linux/debugobjects.h>
22 #include <linux/kallsyms.h> 22 #include <linux/kallsyms.h>
23 #include <linux/list.h> 23 #include <linux/list.h>
24 #include <linux/rbtree.h> 24 #include <linux/rbtree.h>
25 #include <linux/radix-tree.h> 25 #include <linux/radix-tree.h>
26 #include <linux/rcupdate.h> 26 #include <linux/rcupdate.h>
27 #include <linux/pfn.h> 27 #include <linux/pfn.h>
28 #include <linux/kmemleak.h> 28 #include <linux/kmemleak.h>
29 #include <asm/atomic.h> 29 #include <asm/atomic.h>
30 #include <asm/uaccess.h> 30 #include <asm/uaccess.h>
31 #include <asm/tlbflush.h> 31 #include <asm/tlbflush.h>
32 #include <asm/shmparam.h> 32 #include <asm/shmparam.h>
33 33
34 bool vmap_lazy_unmap __read_mostly = true; 34 bool vmap_lazy_unmap __read_mostly = true;
35 35
36 /*** Page table manipulation functions ***/ 36 /*** Page table manipulation functions ***/
37 37
38 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 38 static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
39 { 39 {
40 pte_t *pte; 40 pte_t *pte;
41 41
42 pte = pte_offset_kernel(pmd, addr); 42 pte = pte_offset_kernel(pmd, addr);
43 do { 43 do {
44 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte); 44 pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
45 WARN_ON(!pte_none(ptent) && !pte_present(ptent)); 45 WARN_ON(!pte_none(ptent) && !pte_present(ptent));
46 } while (pte++, addr += PAGE_SIZE, addr != end); 46 } while (pte++, addr += PAGE_SIZE, addr != end);
47 } 47 }
48 48
49 static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) 49 static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
50 { 50 {
51 pmd_t *pmd; 51 pmd_t *pmd;
52 unsigned long next; 52 unsigned long next;
53 53
54 pmd = pmd_offset(pud, addr); 54 pmd = pmd_offset(pud, addr);
55 do { 55 do {
56 next = pmd_addr_end(addr, end); 56 next = pmd_addr_end(addr, end);
57 if (pmd_none_or_clear_bad(pmd)) 57 if (pmd_none_or_clear_bad(pmd))
58 continue; 58 continue;
59 vunmap_pte_range(pmd, addr, next); 59 vunmap_pte_range(pmd, addr, next);
60 } while (pmd++, addr = next, addr != end); 60 } while (pmd++, addr = next, addr != end);
61 } 61 }
62 62
63 static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) 63 static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
64 { 64 {
65 pud_t *pud; 65 pud_t *pud;
66 unsigned long next; 66 unsigned long next;
67 67
68 pud = pud_offset(pgd, addr); 68 pud = pud_offset(pgd, addr);
69 do { 69 do {
70 next = pud_addr_end(addr, end); 70 next = pud_addr_end(addr, end);
71 if (pud_none_or_clear_bad(pud)) 71 if (pud_none_or_clear_bad(pud))
72 continue; 72 continue;
73 vunmap_pmd_range(pud, addr, next); 73 vunmap_pmd_range(pud, addr, next);
74 } while (pud++, addr = next, addr != end); 74 } while (pud++, addr = next, addr != end);
75 } 75 }
76 76
77 static void vunmap_page_range(unsigned long addr, unsigned long end) 77 static void vunmap_page_range(unsigned long addr, unsigned long end)
78 { 78 {
79 pgd_t *pgd; 79 pgd_t *pgd;
80 unsigned long next; 80 unsigned long next;
81 81
82 BUG_ON(addr >= end); 82 BUG_ON(addr >= end);
83 pgd = pgd_offset_k(addr); 83 pgd = pgd_offset_k(addr);
84 do { 84 do {
85 next = pgd_addr_end(addr, end); 85 next = pgd_addr_end(addr, end);
86 if (pgd_none_or_clear_bad(pgd)) 86 if (pgd_none_or_clear_bad(pgd))
87 continue; 87 continue;
88 vunmap_pud_range(pgd, addr, next); 88 vunmap_pud_range(pgd, addr, next);
89 } while (pgd++, addr = next, addr != end); 89 } while (pgd++, addr = next, addr != end);
90 } 90 }
91 91
92 static int vmap_pte_range(pmd_t *pmd, unsigned long addr, 92 static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
93 unsigned long end, pgprot_t prot, struct page **pages, int *nr) 93 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
94 { 94 {
95 pte_t *pte; 95 pte_t *pte;
96 96
97 /* 97 /*
98 * nr is a running index into the array which helps higher level 98 * nr is a running index into the array which helps higher level
99 * callers keep track of where we're up to. 99 * callers keep track of where we're up to.
100 */ 100 */
101 101
102 pte = pte_alloc_kernel(pmd, addr); 102 pte = pte_alloc_kernel(pmd, addr);
103 if (!pte) 103 if (!pte)
104 return -ENOMEM; 104 return -ENOMEM;
105 do { 105 do {
106 struct page *page = pages[*nr]; 106 struct page *page = pages[*nr];
107 107
108 if (WARN_ON(!pte_none(*pte))) 108 if (WARN_ON(!pte_none(*pte)))
109 return -EBUSY; 109 return -EBUSY;
110 if (WARN_ON(!page)) 110 if (WARN_ON(!page))
111 return -ENOMEM; 111 return -ENOMEM;
112 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); 112 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
113 (*nr)++; 113 (*nr)++;
114 } while (pte++, addr += PAGE_SIZE, addr != end); 114 } while (pte++, addr += PAGE_SIZE, addr != end);
115 return 0; 115 return 0;
116 } 116 }
117 117
118 static int vmap_pmd_range(pud_t *pud, unsigned long addr, 118 static int vmap_pmd_range(pud_t *pud, unsigned long addr,
119 unsigned long end, pgprot_t prot, struct page **pages, int *nr) 119 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
120 { 120 {
121 pmd_t *pmd; 121 pmd_t *pmd;
122 unsigned long next; 122 unsigned long next;
123 123
124 pmd = pmd_alloc(&init_mm, pud, addr); 124 pmd = pmd_alloc(&init_mm, pud, addr);
125 if (!pmd) 125 if (!pmd)
126 return -ENOMEM; 126 return -ENOMEM;
127 do { 127 do {
128 next = pmd_addr_end(addr, end); 128 next = pmd_addr_end(addr, end);
129 if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) 129 if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
130 return -ENOMEM; 130 return -ENOMEM;
131 } while (pmd++, addr = next, addr != end); 131 } while (pmd++, addr = next, addr != end);
132 return 0; 132 return 0;
133 } 133 }
134 134
135 static int vmap_pud_range(pgd_t *pgd, unsigned long addr, 135 static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
136 unsigned long end, pgprot_t prot, struct page **pages, int *nr) 136 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
137 { 137 {
138 pud_t *pud; 138 pud_t *pud;
139 unsigned long next; 139 unsigned long next;
140 140
141 pud = pud_alloc(&init_mm, pgd, addr); 141 pud = pud_alloc(&init_mm, pgd, addr);
142 if (!pud) 142 if (!pud)
143 return -ENOMEM; 143 return -ENOMEM;
144 do { 144 do {
145 next = pud_addr_end(addr, end); 145 next = pud_addr_end(addr, end);
146 if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) 146 if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
147 return -ENOMEM; 147 return -ENOMEM;
148 } while (pud++, addr = next, addr != end); 148 } while (pud++, addr = next, addr != end);
149 return 0; 149 return 0;
150 } 150 }
151 151
152 /* 152 /*
153 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and 153 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
154 * will have pfns corresponding to the "pages" array. 154 * will have pfns corresponding to the "pages" array.
155 * 155 *
156 * I.e. the pte at addr+N*PAGE_SIZE shall point to the pfn corresponding to pages[N] 156 * I.e. the pte at addr+N*PAGE_SIZE shall point to the pfn corresponding to pages[N]
157 */ 157 */
158 static int vmap_page_range_noflush(unsigned long start, unsigned long end, 158 static int vmap_page_range_noflush(unsigned long start, unsigned long end,
159 pgprot_t prot, struct page **pages) 159 pgprot_t prot, struct page **pages)
160 { 160 {
161 pgd_t *pgd; 161 pgd_t *pgd;
162 unsigned long next; 162 unsigned long next;
163 unsigned long addr = start; 163 unsigned long addr = start;
164 int err = 0; 164 int err = 0;
165 int nr = 0; 165 int nr = 0;
166 166
167 BUG_ON(addr >= end); 167 BUG_ON(addr >= end);
168 pgd = pgd_offset_k(addr); 168 pgd = pgd_offset_k(addr);
169 do { 169 do {
170 next = pgd_addr_end(addr, end); 170 next = pgd_addr_end(addr, end);
171 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); 171 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
172 if (err) 172 if (err)
173 return err; 173 return err;
174 } while (pgd++, addr = next, addr != end); 174 } while (pgd++, addr = next, addr != end);
175 175
176 return nr; 176 return nr;
177 } 177 }
178 178
179 static int vmap_page_range(unsigned long start, unsigned long end, 179 static int vmap_page_range(unsigned long start, unsigned long end,
180 pgprot_t prot, struct page **pages) 180 pgprot_t prot, struct page **pages)
181 { 181 {
182 int ret; 182 int ret;
183 183
184 ret = vmap_page_range_noflush(start, end, prot, pages); 184 ret = vmap_page_range_noflush(start, end, prot, pages);
185 flush_cache_vmap(start, end); 185 flush_cache_vmap(start, end);
186 return ret; 186 return ret;
187 } 187 }
188 188
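The comment above vmap_page_range_noflush() promises that the Nth page of the mapped range is exactly pages[N]. A minimal sketch of that contract, expressed through the exported vmap()/vmalloc_to_page() interfaces (check_contract is a hypothetical helper):

#include <linux/vmalloc.h>
#include <linux/mm.h>

static void check_contract(struct page **pages, unsigned int count)
{
	void *addr = vmap(pages, count, VM_MAP, PAGE_KERNEL);
	unsigned int n;

	if (!addr)
		return;
	for (n = 0; n < count; n++)
		/* the pte at addr + n*PAGE_SIZE must resolve back to pages[n] */
		WARN_ON(vmalloc_to_page((char *)addr + n * PAGE_SIZE) != pages[n]);
	vunmap(addr);
}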
189 int is_vmalloc_or_module_addr(const void *x) 189 int is_vmalloc_or_module_addr(const void *x)
190 { 190 {
191 /* 191 /*
192 * ARM, x86-64 and sparc64 put modules in a special place, 192 * ARM, x86-64 and sparc64 put modules in a special place,
193 * and fall back on vmalloc() if that fails. Others 193 * and fall back on vmalloc() if that fails. Others
194 * just put it in the vmalloc space. 194 * just put it in the vmalloc space.
195 */ 195 */
196 #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) 196 #if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
197 unsigned long addr = (unsigned long)x; 197 unsigned long addr = (unsigned long)x;
198 if (addr >= MODULES_VADDR && addr < MODULES_END) 198 if (addr >= MODULES_VADDR && addr < MODULES_END)
199 return 1; 199 return 1;
200 #endif 200 #endif
201 return is_vmalloc_addr(x); 201 return is_vmalloc_addr(x);
202 } 202 }
203 203
204 /* 204 /*
205 * Walk a vmap address to the struct page it maps. 205 * Walk a vmap address to the struct page it maps.
206 */ 206 */
207 struct page *vmalloc_to_page(const void *vmalloc_addr) 207 struct page *vmalloc_to_page(const void *vmalloc_addr)
208 { 208 {
209 unsigned long addr = (unsigned long) vmalloc_addr; 209 unsigned long addr = (unsigned long) vmalloc_addr;
210 struct page *page = NULL; 210 struct page *page = NULL;
211 pgd_t *pgd = pgd_offset_k(addr); 211 pgd_t *pgd = pgd_offset_k(addr);
212 212
213 /* 213 /*
214 * XXX we might need to change this if we add VIRTUAL_BUG_ON for 214 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
215 * architectures that do not vmalloc module space 215 * architectures that do not vmalloc module space
216 */ 216 */
217 VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); 217 VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
218 218
219 if (!pgd_none(*pgd)) { 219 if (!pgd_none(*pgd)) {
220 pud_t *pud = pud_offset(pgd, addr); 220 pud_t *pud = pud_offset(pgd, addr);
221 if (!pud_none(*pud)) { 221 if (!pud_none(*pud)) {
222 pmd_t *pmd = pmd_offset(pud, addr); 222 pmd_t *pmd = pmd_offset(pud, addr);
223 if (!pmd_none(*pmd)) { 223 if (!pmd_none(*pmd)) {
224 pte_t *ptep, pte; 224 pte_t *ptep, pte;
225 225
226 ptep = pte_offset_map(pmd, addr); 226 ptep = pte_offset_map(pmd, addr);
227 pte = *ptep; 227 pte = *ptep;
228 if (pte_present(pte)) 228 if (pte_present(pte))
229 page = pte_page(pte); 229 page = pte_page(pte);
230 pte_unmap(ptep); 230 pte_unmap(ptep);
231 } 231 }
232 } 232 }
233 } 233 }
234 return page; 234 return page;
235 } 235 }
236 EXPORT_SYMBOL(vmalloc_to_page); 236 EXPORT_SYMBOL(vmalloc_to_page);
237 237
238 /* 238 /*
239 * Map a vmalloc()-space virtual address to the physical page frame number. 239 * Map a vmalloc()-space virtual address to the physical page frame number.
240 */ 240 */
241 unsigned long vmalloc_to_pfn(const void *vmalloc_addr) 241 unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
242 { 242 {
243 return page_to_pfn(vmalloc_to_page(vmalloc_addr)); 243 return page_to_pfn(vmalloc_to_page(vmalloc_addr));
244 } 244 }
245 EXPORT_SYMBOL(vmalloc_to_pfn); 245 EXPORT_SYMBOL(vmalloc_to_pfn);
246 246
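A hypothetical driver-side sketch of vmalloc_to_page() in use, walking the physical pages behind a vmalloc() buffer one PAGE_SIZE step at a time (touch_backing_pages and the hand-off comment are illustrative only):

#include <linux/vmalloc.h>
#include <linux/mm.h>

static void touch_backing_pages(void *buf, unsigned long size)
{
	unsigned long offset;

	for (offset = 0; offset < size; offset += PAGE_SIZE) {
		struct page *page = vmalloc_to_page((char *)buf + offset);

		get_page(page);		/* pin while handing it elsewhere */
		/* ... build an sg entry or similar here ... */
		put_page(page);
	}
}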
247 247
248 /*** Global kva allocator ***/ 248 /*** Global kva allocator ***/
249 249
250 #define VM_LAZY_FREE 0x01 250 #define VM_LAZY_FREE 0x01
251 #define VM_LAZY_FREEING 0x02 251 #define VM_LAZY_FREEING 0x02
252 #define VM_VM_AREA 0x04 252 #define VM_VM_AREA 0x04
253 253
254 struct vmap_area { 254 struct vmap_area {
255 unsigned long va_start; 255 unsigned long va_start;
256 unsigned long va_end; 256 unsigned long va_end;
257 unsigned long flags; 257 unsigned long flags;
258 struct rb_node rb_node; /* address sorted rbtree */ 258 struct rb_node rb_node; /* address sorted rbtree */
259 struct list_head list; /* address sorted list */ 259 struct list_head list; /* address sorted list */
260 struct list_head purge_list; /* "lazy purge" list */ 260 struct list_head purge_list; /* "lazy purge" list */
261 void *private; 261 void *private;
262 struct rcu_head rcu_head; 262 struct rcu_head rcu_head;
263 }; 263 };
264 264
265 static DEFINE_SPINLOCK(vmap_area_lock); 265 static DEFINE_SPINLOCK(vmap_area_lock);
266 static struct rb_root vmap_area_root = RB_ROOT; 266 static struct rb_root vmap_area_root = RB_ROOT;
267 static LIST_HEAD(vmap_area_list); 267 static LIST_HEAD(vmap_area_list);
268 static unsigned long vmap_area_pcpu_hole; 268 static unsigned long vmap_area_pcpu_hole;
269 269
270 static struct vmap_area *__find_vmap_area(unsigned long addr) 270 static struct vmap_area *__find_vmap_area(unsigned long addr)
271 { 271 {
272 struct rb_node *n = vmap_area_root.rb_node; 272 struct rb_node *n = vmap_area_root.rb_node;
273 273
274 while (n) { 274 while (n) {
275 struct vmap_area *va; 275 struct vmap_area *va;
276 276
277 va = rb_entry(n, struct vmap_area, rb_node); 277 va = rb_entry(n, struct vmap_area, rb_node);
278 if (addr < va->va_start) 278 if (addr < va->va_start)
279 n = n->rb_left; 279 n = n->rb_left;
280 else if (addr > va->va_start) 280 else if (addr > va->va_start)
281 n = n->rb_right; 281 n = n->rb_right;
282 else 282 else
283 return va; 283 return va;
284 } 284 }
285 285
286 return NULL; 286 return NULL;
287 } 287 }
288 288
289 static void __insert_vmap_area(struct vmap_area *va) 289 static void __insert_vmap_area(struct vmap_area *va)
290 { 290 {
291 struct rb_node **p = &vmap_area_root.rb_node; 291 struct rb_node **p = &vmap_area_root.rb_node;
292 struct rb_node *parent = NULL; 292 struct rb_node *parent = NULL;
293 struct rb_node *tmp; 293 struct rb_node *tmp;
294 294
295 while (*p) { 295 while (*p) {
296 struct vmap_area *tmp_va; 296 struct vmap_area *tmp_va;
297 297
298 parent = *p; 298 parent = *p;
299 tmp_va = rb_entry(parent, struct vmap_area, rb_node); 299 tmp_va = rb_entry(parent, struct vmap_area, rb_node);
300 if (va->va_start < tmp_va->va_end) 300 if (va->va_start < tmp_va->va_end)
301 p = &(*p)->rb_left; 301 p = &(*p)->rb_left;
302 else if (va->va_end > tmp_va->va_start) 302 else if (va->va_end > tmp_va->va_start)
303 p = &(*p)->rb_right; 303 p = &(*p)->rb_right;
304 else 304 else
305 BUG(); 305 BUG();
306 } 306 }
307 307
308 rb_link_node(&va->rb_node, parent, p); 308 rb_link_node(&va->rb_node, parent, p);
309 rb_insert_color(&va->rb_node, &vmap_area_root); 309 rb_insert_color(&va->rb_node, &vmap_area_root);
310 310
311 /* address-sort this list so it is usable like the vmlist */ 311 /* address-sort this list so it is usable like the vmlist */
312 tmp = rb_prev(&va->rb_node); 312 tmp = rb_prev(&va->rb_node);
313 if (tmp) { 313 if (tmp) {
314 struct vmap_area *prev; 314 struct vmap_area *prev;
315 prev = rb_entry(tmp, struct vmap_area, rb_node); 315 prev = rb_entry(tmp, struct vmap_area, rb_node);
316 list_add_rcu(&va->list, &prev->list); 316 list_add_rcu(&va->list, &prev->list);
317 } else 317 } else
318 list_add_rcu(&va->list, &vmap_area_list); 318 list_add_rcu(&va->list, &vmap_area_list);
319 } 319 }
320 320
321 static void purge_vmap_area_lazy(void); 321 static void purge_vmap_area_lazy(void);
322 322
323 /* 323 /*
324 * Allocate a region of KVA of the specified size and alignment, within the 324 * Allocate a region of KVA of the specified size and alignment, within the
325 * vstart and vend. 325 * vstart and vend.
326 */ 326 */
327 static struct vmap_area *alloc_vmap_area(unsigned long size, 327 static struct vmap_area *alloc_vmap_area(unsigned long size,
328 unsigned long align, 328 unsigned long align,
329 unsigned long vstart, unsigned long vend, 329 unsigned long vstart, unsigned long vend,
330 int node, gfp_t gfp_mask) 330 int node, gfp_t gfp_mask)
331 { 331 {
332 struct vmap_area *va; 332 struct vmap_area *va;
333 struct rb_node *n; 333 struct rb_node *n;
334 unsigned long addr; 334 unsigned long addr;
335 int purged = 0; 335 int purged = 0;
336 336
337 BUG_ON(!size); 337 BUG_ON(!size);
338 BUG_ON(size & ~PAGE_MASK); 338 BUG_ON(size & ~PAGE_MASK);
339 339
340 va = kmalloc_node(sizeof(struct vmap_area), 340 va = kmalloc_node(sizeof(struct vmap_area),
341 gfp_mask & GFP_RECLAIM_MASK, node); 341 gfp_mask & GFP_RECLAIM_MASK, node);
342 if (unlikely(!va)) 342 if (unlikely(!va))
343 return ERR_PTR(-ENOMEM); 343 return ERR_PTR(-ENOMEM);
344 344
345 retry: 345 retry:
346 addr = ALIGN(vstart, align); 346 addr = ALIGN(vstart, align);
347 347
348 spin_lock(&vmap_area_lock); 348 spin_lock(&vmap_area_lock);
349 if (addr + size - 1 < addr) 349 if (addr + size - 1 < addr)
350 goto overflow; 350 goto overflow;
351 351
352 /* XXX: could have a last_hole cache */ 352 /* XXX: could have a last_hole cache */
353 n = vmap_area_root.rb_node; 353 n = vmap_area_root.rb_node;
354 if (n) { 354 if (n) {
355 struct vmap_area *first = NULL; 355 struct vmap_area *first = NULL;
356 356
357 do { 357 do {
358 struct vmap_area *tmp; 358 struct vmap_area *tmp;
359 tmp = rb_entry(n, struct vmap_area, rb_node); 359 tmp = rb_entry(n, struct vmap_area, rb_node);
360 if (tmp->va_end >= addr) { 360 if (tmp->va_end >= addr) {
361 if (!first && tmp->va_start < addr + size) 361 if (!first && tmp->va_start < addr + size)
362 first = tmp; 362 first = tmp;
363 n = n->rb_left; 363 n = n->rb_left;
364 } else { 364 } else {
365 first = tmp; 365 first = tmp;
366 n = n->rb_right; 366 n = n->rb_right;
367 } 367 }
368 } while (n); 368 } while (n);
369 369
370 if (!first) 370 if (!first)
371 goto found; 371 goto found;
372 372
373 if (first->va_end < addr) { 373 if (first->va_end < addr) {
374 n = rb_next(&first->rb_node); 374 n = rb_next(&first->rb_node);
375 if (n) 375 if (n)
376 first = rb_entry(n, struct vmap_area, rb_node); 376 first = rb_entry(n, struct vmap_area, rb_node);
377 else 377 else
378 goto found; 378 goto found;
379 } 379 }
380 380
381 while (addr + size > first->va_start && addr + size <= vend) { 381 while (addr + size > first->va_start && addr + size <= vend) {
382 addr = ALIGN(first->va_end + PAGE_SIZE, align); 382 addr = ALIGN(first->va_end + PAGE_SIZE, align);
383 if (addr + size - 1 < addr) 383 if (addr + size - 1 < addr)
384 goto overflow; 384 goto overflow;
385 385
386 n = rb_next(&first->rb_node); 386 n = rb_next(&first->rb_node);
387 if (n) 387 if (n)
388 first = rb_entry(n, struct vmap_area, rb_node); 388 first = rb_entry(n, struct vmap_area, rb_node);
389 else 389 else
390 goto found; 390 goto found;
391 } 391 }
392 } 392 }
393 found: 393 found:
394 if (addr + size > vend) { 394 if (addr + size > vend) {
395 overflow: 395 overflow:
396 spin_unlock(&vmap_area_lock); 396 spin_unlock(&vmap_area_lock);
397 if (!purged) { 397 if (!purged) {
398 purge_vmap_area_lazy(); 398 purge_vmap_area_lazy();
399 purged = 1; 399 purged = 1;
400 goto retry; 400 goto retry;
401 } 401 }
402 if (printk_ratelimit()) 402 if (printk_ratelimit())
403 printk(KERN_WARNING 403 printk(KERN_WARNING
404 "vmap allocation for size %lu failed: " 404 "vmap allocation for size %lu failed: "
405 "use vmalloc=<size> to increase size.\n", size); 405 "use vmalloc=<size> to increase size.\n", size);
406 kfree(va); 406 kfree(va);
407 return ERR_PTR(-EBUSY); 407 return ERR_PTR(-EBUSY);
408 } 408 }
409 409
410 BUG_ON(addr & (align-1)); 410 BUG_ON(addr & (align-1));
411 411
412 va->va_start = addr; 412 va->va_start = addr;
413 va->va_end = addr + size; 413 va->va_end = addr + size;
414 va->flags = 0; 414 va->flags = 0;
415 __insert_vmap_area(va); 415 __insert_vmap_area(va);
416 spin_unlock(&vmap_area_lock); 416 spin_unlock(&vmap_area_lock);
417 417
418 return va; 418 return va;
419 } 419 }
420 420
421 static void rcu_free_va(struct rcu_head *head) 421 static void rcu_free_va(struct rcu_head *head)
422 { 422 {
423 struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); 423 struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
424 424
425 kfree(va); 425 kfree(va);
426 } 426 }
427 427
428 static void __free_vmap_area(struct vmap_area *va) 428 static void __free_vmap_area(struct vmap_area *va)
429 { 429 {
430 BUG_ON(RB_EMPTY_NODE(&va->rb_node)); 430 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
431 rb_erase(&va->rb_node, &vmap_area_root); 431 rb_erase(&va->rb_node, &vmap_area_root);
432 RB_CLEAR_NODE(&va->rb_node); 432 RB_CLEAR_NODE(&va->rb_node);
433 list_del_rcu(&va->list); 433 list_del_rcu(&va->list);
434 434
435 /* 435 /*
436 * Track the highest possible candidate for pcpu area 436 * Track the highest possible candidate for pcpu area
437 * allocation. Areas outside of vmalloc area can be returned 437 * allocation. Areas outside of vmalloc area can be returned
438 * here too, consider only end addresses which fall inside 438 * here too, consider only end addresses which fall inside
439 * vmalloc area proper. 439 * vmalloc area proper.
440 */ 440 */
441 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) 441 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
442 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); 442 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
443 443
444 call_rcu(&va->rcu_head, rcu_free_va); 444 call_rcu(&va->rcu_head, rcu_free_va);
445 } 445 }
446 446
447 /* 447 /*
448 * Free a region of KVA allocated by alloc_vmap_area 448 * Free a region of KVA allocated by alloc_vmap_area
449 */ 449 */
450 static void free_vmap_area(struct vmap_area *va) 450 static void free_vmap_area(struct vmap_area *va)
451 { 451 {
452 spin_lock(&vmap_area_lock); 452 spin_lock(&vmap_area_lock);
453 __free_vmap_area(va); 453 __free_vmap_area(va);
454 spin_unlock(&vmap_area_lock); 454 spin_unlock(&vmap_area_lock);
455 } 455 }
456 456
457 /* 457 /*
458 * Clear the pagetable entries of a given vmap_area 458 * Clear the pagetable entries of a given vmap_area
459 */ 459 */
460 static void unmap_vmap_area(struct vmap_area *va) 460 static void unmap_vmap_area(struct vmap_area *va)
461 { 461 {
462 vunmap_page_range(va->va_start, va->va_end); 462 vunmap_page_range(va->va_start, va->va_end);
463 } 463 }
464 464
465 static void vmap_debug_free_range(unsigned long start, unsigned long end) 465 static void vmap_debug_free_range(unsigned long start, unsigned long end)
466 { 466 {
467 /* 467 /*
468 * Unmap page tables and force a TLB flush immediately if 468 * Unmap page tables and force a TLB flush immediately if
469 * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free 469 * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
470 * bugs similarly to those in linear kernel virtual address 470 * bugs similarly to those in linear kernel virtual address
471 * space after a page has been freed. 471 * space after a page has been freed.
472 * 472 *
473 * All the lazy freeing logic is still retained, in order to 473 * All the lazy freeing logic is still retained, in order to
474 * minimise intrusiveness of this debugging feature. 474 * minimise intrusiveness of this debugging feature.
475 * 475 *
476 * This is going to be *slow*; by contrast, the linear kernel virtual 476 * This is going to be *slow*; by contrast, the linear kernel virtual
477 * address debugging doesn't need a broadcast TLB flush, so it is a 477 * address debugging doesn't need a broadcast TLB flush, so it is a
478 * lot faster. 478 * lot faster.
479 */ 479 */
480 #ifdef CONFIG_DEBUG_PAGEALLOC 480 #ifdef CONFIG_DEBUG_PAGEALLOC
481 vunmap_page_range(start, end); 481 vunmap_page_range(start, end);
482 flush_tlb_kernel_range(start, end); 482 flush_tlb_kernel_range(start, end);
483 #endif 483 #endif
484 } 484 }
485 485
486 /* 486 /*
487 * lazy_max_pages is the maximum amount of virtual address space we gather up 487 * lazy_max_pages is the maximum amount of virtual address space we gather up
488 * before attempting to purge with a TLB flush. 488 * before attempting to purge with a TLB flush.
489 * 489 *
490 * There is a tradeoff here: a larger number will cover more kernel page tables 490 * There is a tradeoff here: a larger number will cover more kernel page tables
491 * and take slightly longer to purge, but it will linearly reduce the number of 491 * and take slightly longer to purge, but it will linearly reduce the number of
492 * global TLB flushes that must be performed. It would seem natural to scale 492 * global TLB flushes that must be performed. It would seem natural to scale
493 * this number up linearly with the number of CPUs (because vmapping activity 493 * this number up linearly with the number of CPUs (because vmapping activity
494 * could also scale linearly with the number of CPUs), however it is likely 494 * could also scale linearly with the number of CPUs), however it is likely
495 * that in practice, workloads might be constrained in other ways that mean 495 * that in practice, workloads might be constrained in other ways that mean
496 * vmap activity will not scale linearly with CPUs. Also, I want to be 496 * vmap activity will not scale linearly with CPUs. Also, I want to be
497 * conservative and not introduce a big latency on huge systems, so go with 497 * conservative and not introduce a big latency on huge systems, so go with
498 * a less aggressive log scale. It will still be an improvement over the old 498 * a less aggressive log scale. It will still be an improvement over the old
499 * code, and it will be simple to change the scale factor if we find that it 499 * code, and it will be simple to change the scale factor if we find that it
500 * becomes a problem on bigger systems. 500 * becomes a problem on bigger systems.
501 */ 501 */
502 static unsigned long lazy_max_pages(void) 502 static unsigned long lazy_max_pages(void)
503 { 503 {
504 unsigned int log; 504 unsigned int log;
505 505
506 if (!vmap_lazy_unmap) 506 if (!vmap_lazy_unmap)
507 return 0; 507 return 0;
508 508
509 log = fls(num_online_cpus()); 509 log = fls(num_online_cpus());
510 510
511 return log * (32UL * 1024 * 1024 / PAGE_SIZE); 511 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
512 } 512 }
513 513
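A worked example of the log scale above, with hypothetical parameters (4 online CPUs, 4 KiB pages): fls(4) is 3, so up to 3 * (32 MiB / 4 KiB) = 24576 pages of stale mappings, about 96 MiB of virtual space, may accumulate before a purge. The user-space sketch below merely mirrors that arithmetic (fls_approx stands in for the kernel's fls()):

#include <stdio.h>

static int fls_approx(unsigned int x)	/* index of the highest set bit */
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned long page_size = 4096;
	int cpus = 4;
	unsigned long pages = fls_approx(cpus) * (32UL * 1024 * 1024 / page_size);

	printf("purge threshold: %lu pages (%lu MiB)\n",
	       pages, pages * page_size >> 20);
	return 0;
}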
514 static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); 514 static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
515 515
516 /* for per-CPU blocks */ 516 /* for per-CPU blocks */
517 static void purge_fragmented_blocks_allcpus(void); 517 static void purge_fragmented_blocks_allcpus(void);
518 518
519 /* 519 /*
520 * called before a call to iounmap() if the caller wants vm_area_struct's 520 * called before a call to iounmap() if the caller wants vm_area_struct's
521 * immediately freed. 521 * immediately freed.
522 */ 522 */
523 void set_iounmap_nonlazy(void) 523 void set_iounmap_nonlazy(void)
524 { 524 {
525 atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); 525 atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
526 } 526 }
527 527
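A hypothetical caller of set_iounmap_nonlazy(), shown only as a sketch: a path that cannot tolerate the mapping lingering forces the next iounmap() to tear the area down synchronously. The extern is repeated here purely to keep the sketch self-contained; the real prototype is architecture-provided.

#include <linux/io.h>

extern void set_iounmap_nonlazy(void);	/* defined in this file */

static void unmap_immediately(void __iomem *regs)
{
	/* make the next iounmap() purge instead of deferring lazily */
	set_iounmap_nonlazy();
	iounmap(regs);
}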
528 /* 528 /*
529 * Purges all lazily-freed vmap areas. 529 * Purges all lazily-freed vmap areas.
530 * 530 *
531 * If sync is 0 then don't purge if there is already a purge in progress. 531 * If sync is 0 then don't purge if there is already a purge in progress.
532 * If force_flush is 1, then flush kernel TLBs between *start and *end even 532 * If force_flush is 1, then flush kernel TLBs between *start and *end even
533 * if we found no lazy vmap areas to unmap (callers can use this to optimise 533 * if we found no lazy vmap areas to unmap (callers can use this to optimise
534 * their own TLB flushing). 534 * their own TLB flushing).
535 * Returns with *start = min(*start, lowest purged address) 535 * Returns with *start = min(*start, lowest purged address)
536 * *end = max(*end, highest purged address) 536 * *end = max(*end, highest purged address)
537 */ 537 */
538 static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, 538 static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
539 int sync, int force_flush) 539 int sync, int force_flush)
540 { 540 {
541 static DEFINE_SPINLOCK(purge_lock); 541 static DEFINE_SPINLOCK(purge_lock);
542 LIST_HEAD(valist); 542 LIST_HEAD(valist);
543 struct vmap_area *va; 543 struct vmap_area *va;
544 struct vmap_area *n_va; 544 struct vmap_area *n_va;
545 int nr = 0; 545 int nr = 0;
546 546
547 /* 547 /*
548 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers 548 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
549 * should not expect such behaviour. This just simplifies locking for 549 * should not expect such behaviour. This just simplifies locking for
550 * the case that isn't actually used at the moment anyway. 550 * the case that isn't actually used at the moment anyway.
551 */ 551 */
552 if (!sync && !force_flush) { 552 if (!sync && !force_flush) {
553 if (!spin_trylock(&purge_lock)) 553 if (!spin_trylock(&purge_lock))
554 return; 554 return;
555 } else 555 } else
556 spin_lock(&purge_lock); 556 spin_lock(&purge_lock);
557 557
558 if (sync) 558 if (sync)
559 purge_fragmented_blocks_allcpus(); 559 purge_fragmented_blocks_allcpus();
560 560
561 rcu_read_lock(); 561 rcu_read_lock();
562 list_for_each_entry_rcu(va, &vmap_area_list, list) { 562 list_for_each_entry_rcu(va, &vmap_area_list, list) {
563 if (va->flags & VM_LAZY_FREE) { 563 if (va->flags & VM_LAZY_FREE) {
564 if (va->va_start < *start) 564 if (va->va_start < *start)
565 *start = va->va_start; 565 *start = va->va_start;
566 if (va->va_end > *end) 566 if (va->va_end > *end)
567 *end = va->va_end; 567 *end = va->va_end;
568 nr += (va->va_end - va->va_start) >> PAGE_SHIFT; 568 nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
569 unmap_vmap_area(va); 569 unmap_vmap_area(va);
570 list_add_tail(&va->purge_list, &valist); 570 list_add_tail(&va->purge_list, &valist);
571 va->flags |= VM_LAZY_FREEING; 571 va->flags |= VM_LAZY_FREEING;
572 va->flags &= ~VM_LAZY_FREE; 572 va->flags &= ~VM_LAZY_FREE;
573 } 573 }
574 } 574 }
575 rcu_read_unlock(); 575 rcu_read_unlock();
576 576
577 if (nr) 577 if (nr)
578 atomic_sub(nr, &vmap_lazy_nr); 578 atomic_sub(nr, &vmap_lazy_nr);
579 579
580 if (nr || force_flush) 580 if (nr || force_flush)
581 flush_tlb_kernel_range(*start, *end); 581 flush_tlb_kernel_range(*start, *end);
582 582
583 if (nr) { 583 if (nr) {
584 spin_lock(&vmap_area_lock); 584 spin_lock(&vmap_area_lock);
585 list_for_each_entry_safe(va, n_va, &valist, purge_list) 585 list_for_each_entry_safe(va, n_va, &valist, purge_list)
586 __free_vmap_area(va); 586 __free_vmap_area(va);
587 spin_unlock(&vmap_area_lock); 587 spin_unlock(&vmap_area_lock);
588 } 588 }
589 spin_unlock(&purge_lock); 589 spin_unlock(&purge_lock);
590 } 590 }
591 591
592 /* 592 /*
593 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody 593 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
594 * is already purging. 594 * is already purging.
595 */ 595 */
596 static void try_purge_vmap_area_lazy(void) 596 static void try_purge_vmap_area_lazy(void)
597 { 597 {
598 unsigned long start = ULONG_MAX, end = 0; 598 unsigned long start = ULONG_MAX, end = 0;
599 599
600 __purge_vmap_area_lazy(&start, &end, 0, 0); 600 __purge_vmap_area_lazy(&start, &end, 0, 0);
601 } 601 }
602 602
603 /* 603 /*
604 * Kick off a purge of the outstanding lazy areas. 604 * Kick off a purge of the outstanding lazy areas.
605 */ 605 */
606 static void purge_vmap_area_lazy(void) 606 static void purge_vmap_area_lazy(void)
607 { 607 {
608 unsigned long start = ULONG_MAX, end = 0; 608 unsigned long start = ULONG_MAX, end = 0;
609 609
610 __purge_vmap_area_lazy(&start, &end, 1, 0); 610 __purge_vmap_area_lazy(&start, &end, 1, 0);
611 } 611 }
612 612
613 /* 613 /*
614 * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been 614 * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
615 * called for the correct range previously. 615 * called for the correct range previously.
616 */ 616 */
617 static void free_unmap_vmap_area_noflush(struct vmap_area *va) 617 static void free_unmap_vmap_area_noflush(struct vmap_area *va)
618 { 618 {
619 va->flags |= VM_LAZY_FREE; 619 va->flags |= VM_LAZY_FREE;
620 atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); 620 atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
621 if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) 621 if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
622 try_purge_vmap_area_lazy(); 622 try_purge_vmap_area_lazy();
623 } 623 }
624 624
625 /* 625 /*
626 * Free and unmap a vmap area 626 * Free and unmap a vmap area
627 */ 627 */
628 static void free_unmap_vmap_area(struct vmap_area *va) 628 static void free_unmap_vmap_area(struct vmap_area *va)
629 { 629 {
630 flush_cache_vunmap(va->va_start, va->va_end); 630 flush_cache_vunmap(va->va_start, va->va_end);
631 free_unmap_vmap_area_noflush(va); 631 free_unmap_vmap_area_noflush(va);
632 } 632 }
633 633
634 static struct vmap_area *find_vmap_area(unsigned long addr) 634 static struct vmap_area *find_vmap_area(unsigned long addr)
635 { 635 {
636 struct vmap_area *va; 636 struct vmap_area *va;
637 637
638 spin_lock(&vmap_area_lock); 638 spin_lock(&vmap_area_lock);
639 va = __find_vmap_area(addr); 639 va = __find_vmap_area(addr);
640 spin_unlock(&vmap_area_lock); 640 spin_unlock(&vmap_area_lock);
641 641
642 return va; 642 return va;
643 } 643 }
644 644
645 static void free_unmap_vmap_area_addr(unsigned long addr) 645 static void free_unmap_vmap_area_addr(unsigned long addr)
646 { 646 {
647 struct vmap_area *va; 647 struct vmap_area *va;
648 648
649 va = find_vmap_area(addr); 649 va = find_vmap_area(addr);
650 BUG_ON(!va); 650 BUG_ON(!va);
651 free_unmap_vmap_area(va); 651 free_unmap_vmap_area(va);
652 } 652 }
653 653
654 654
655 /*** Per cpu kva allocator ***/ 655 /*** Per cpu kva allocator ***/
656 656
657 /* 657 /*
658 * vmap space is limited especially on 32 bit architectures. Ensure there is 658 * vmap space is limited especially on 32 bit architectures. Ensure there is
659 * room for at least 16 percpu vmap blocks per CPU. 659 * room for at least 16 percpu vmap blocks per CPU.
660 */ 660 */
661 /* 661 /*
662 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able 662 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
663 * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess 663 * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess
664 * instead (we just need a rough idea) 664 * instead (we just need a rough idea)
665 */ 665 */
666 #if BITS_PER_LONG == 32 666 #if BITS_PER_LONG == 32
667 #define VMALLOC_SPACE (128UL*1024*1024) 667 #define VMALLOC_SPACE (128UL*1024*1024)
668 #else 668 #else
669 #define VMALLOC_SPACE (128UL*1024*1024*1024) 669 #define VMALLOC_SPACE (128UL*1024*1024*1024)
670 #endif 670 #endif
671 671
672 #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) 672 #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
673 #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ 673 #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
674 #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ 674 #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
675 #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) 675 #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
676 #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ 676 #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
677 #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ 677 #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
678 #define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ 678 #define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
679 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ 679 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
680 VMALLOC_PAGES / NR_CPUS / 16)) 680 VMALLOC_PAGES / NR_CPUS / 16))
681 681
682 #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 682 #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
683 683
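Plugging hypothetical numbers into the macros above (a 32-bit kernel, 4 KiB pages, NR_CPUS = 4) gives a feel for the resulting block size:

/*
 * VMALLOC_PAGES       = 128 MiB / 4 KiB              = 32768
 * per-CPU candidate   = 32768 / 4 / 16               = 512
 * VMAP_BBMAP_BITS     = min(1024, max(32 * 2, 512))  = 512
 * VMAP_BLOCK_SIZE     = 512 * 4 KiB                  = 2 MiB per vmap block
 */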
684 static bool vmap_initialized __read_mostly = false; 684 static bool vmap_initialized __read_mostly = false;
685 685
686 struct vmap_block_queue { 686 struct vmap_block_queue {
687 spinlock_t lock; 687 spinlock_t lock;
688 struct list_head free; 688 struct list_head free;
689 }; 689 };
690 690
691 struct vmap_block { 691 struct vmap_block {
692 spinlock_t lock; 692 spinlock_t lock;
693 struct vmap_area *va; 693 struct vmap_area *va;
694 struct vmap_block_queue *vbq; 694 struct vmap_block_queue *vbq;
695 unsigned long free, dirty; 695 unsigned long free, dirty;
696 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); 696 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
697 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 697 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
698 struct list_head free_list; 698 struct list_head free_list;
699 struct rcu_head rcu_head; 699 struct rcu_head rcu_head;
700 struct list_head purge; 700 struct list_head purge;
701 }; 701 };
702 702
703 /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ 703 /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
704 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); 704 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
705 705
706 /* 706 /*
707 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block 707 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
708 * in the free path. Could get rid of this if we change the API to return a 708 * in the free path. Could get rid of this if we change the API to return a
709 * "cookie" from alloc, to be passed to free. But no big deal yet. 709 * "cookie" from alloc, to be passed to free. But no big deal yet.
710 */ 710 */
711 static DEFINE_SPINLOCK(vmap_block_tree_lock); 711 static DEFINE_SPINLOCK(vmap_block_tree_lock);
712 static RADIX_TREE(vmap_block_tree, GFP_ATOMIC); 712 static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
713 713
714 /* 714 /*
715 * We should probably have a fallback mechanism to allocate virtual memory 715 * We should probably have a fallback mechanism to allocate virtual memory
716 * out of partially filled vmap blocks. However vmap block sizing should be 716 * out of partially filled vmap blocks. However vmap block sizing should be
717 * fairly reasonable according to the vmalloc size, so it shouldn't be a 717 * fairly reasonable according to the vmalloc size, so it shouldn't be a
718 * big problem. 718 * big problem.
719 */ 719 */
720 720
721 static unsigned long addr_to_vb_idx(unsigned long addr) 721 static unsigned long addr_to_vb_idx(unsigned long addr)
722 { 722 {
723 addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); 723 addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
724 addr /= VMAP_BLOCK_SIZE; 724 addr /= VMAP_BLOCK_SIZE;
725 return addr; 725 return addr;
726 } 726 }
727 727
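A worked example of the index calculation, using hypothetical values (VMAP_BLOCK_SIZE = 2 MiB, VMALLOC_START = 0xf8000000):

/*
 * addr                  = 0xf8340000
 * base                  = 0xf8000000 & ~(2 MiB - 1)  = 0xf8000000
 * (addr - base) / 2 MiB = 0x340000 / 0x200000        = 1
 *
 * so every allocation carved out of the second 2 MiB block shares
 * radix-tree index 1.
 */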
728 static struct vmap_block *new_vmap_block(gfp_t gfp_mask) 728 static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
729 { 729 {
730 struct vmap_block_queue *vbq; 730 struct vmap_block_queue *vbq;
731 struct vmap_block *vb; 731 struct vmap_block *vb;
732 struct vmap_area *va; 732 struct vmap_area *va;
733 unsigned long vb_idx; 733 unsigned long vb_idx;
734 int node, err; 734 int node, err;
735 735
736 node = numa_node_id(); 736 node = numa_node_id();
737 737
738 vb = kmalloc_node(sizeof(struct vmap_block), 738 vb = kmalloc_node(sizeof(struct vmap_block),
739 gfp_mask & GFP_RECLAIM_MASK, node); 739 gfp_mask & GFP_RECLAIM_MASK, node);
740 if (unlikely(!vb)) 740 if (unlikely(!vb))
741 return ERR_PTR(-ENOMEM); 741 return ERR_PTR(-ENOMEM);
742 742
743 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, 743 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
744 VMALLOC_START, VMALLOC_END, 744 VMALLOC_START, VMALLOC_END,
745 node, gfp_mask); 745 node, gfp_mask);
746 if (unlikely(IS_ERR(va))) { 746 if (unlikely(IS_ERR(va))) {
747 kfree(vb); 747 kfree(vb);
748 return ERR_CAST(va); 748 return ERR_CAST(va);
749 } 749 }
750 750
751 err = radix_tree_preload(gfp_mask); 751 err = radix_tree_preload(gfp_mask);
752 if (unlikely(err)) { 752 if (unlikely(err)) {
753 kfree(vb); 753 kfree(vb);
754 free_vmap_area(va); 754 free_vmap_area(va);
755 return ERR_PTR(err); 755 return ERR_PTR(err);
756 } 756 }
757 757
758 spin_lock_init(&vb->lock); 758 spin_lock_init(&vb->lock);
759 vb->va = va; 759 vb->va = va;
760 vb->free = VMAP_BBMAP_BITS; 760 vb->free = VMAP_BBMAP_BITS;
761 vb->dirty = 0; 761 vb->dirty = 0;
762 bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); 762 bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
763 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); 763 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
764 INIT_LIST_HEAD(&vb->free_list); 764 INIT_LIST_HEAD(&vb->free_list);
765 765
766 vb_idx = addr_to_vb_idx(va->va_start); 766 vb_idx = addr_to_vb_idx(va->va_start);
767 spin_lock(&vmap_block_tree_lock); 767 spin_lock(&vmap_block_tree_lock);
768 err = radix_tree_insert(&vmap_block_tree, vb_idx, vb); 768 err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
769 spin_unlock(&vmap_block_tree_lock); 769 spin_unlock(&vmap_block_tree_lock);
770 BUG_ON(err); 770 BUG_ON(err);
771 radix_tree_preload_end(); 771 radix_tree_preload_end();
772 772
773 vbq = &get_cpu_var(vmap_block_queue); 773 vbq = &get_cpu_var(vmap_block_queue);
774 vb->vbq = vbq; 774 vb->vbq = vbq;
775 spin_lock(&vbq->lock); 775 spin_lock(&vbq->lock);
776 list_add_rcu(&vb->free_list, &vbq->free); 776 list_add_rcu(&vb->free_list, &vbq->free);
777 spin_unlock(&vbq->lock); 777 spin_unlock(&vbq->lock);
778 put_cpu_var(vmap_block_queue); 778 put_cpu_var(vmap_block_queue);
779 779
780 return vb; 780 return vb;
781 } 781 }
782 782
783 static void rcu_free_vb(struct rcu_head *head) 783 static void rcu_free_vb(struct rcu_head *head)
784 { 784 {
785 struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); 785 struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
786 786
787 kfree(vb); 787 kfree(vb);
788 } 788 }
789 789
790 static void free_vmap_block(struct vmap_block *vb) 790 static void free_vmap_block(struct vmap_block *vb)
791 { 791 {
792 struct vmap_block *tmp; 792 struct vmap_block *tmp;
793 unsigned long vb_idx; 793 unsigned long vb_idx;
794 794
795 vb_idx = addr_to_vb_idx(vb->va->va_start); 795 vb_idx = addr_to_vb_idx(vb->va->va_start);
796 spin_lock(&vmap_block_tree_lock); 796 spin_lock(&vmap_block_tree_lock);
797 tmp = radix_tree_delete(&vmap_block_tree, vb_idx); 797 tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
798 spin_unlock(&vmap_block_tree_lock); 798 spin_unlock(&vmap_block_tree_lock);
799 BUG_ON(tmp != vb); 799 BUG_ON(tmp != vb);
800 800
801 free_unmap_vmap_area_noflush(vb->va); 801 free_unmap_vmap_area_noflush(vb->va);
802 call_rcu(&vb->rcu_head, rcu_free_vb); 802 call_rcu(&vb->rcu_head, rcu_free_vb);
803 } 803 }
804 804
805 static void purge_fragmented_blocks(int cpu) 805 static void purge_fragmented_blocks(int cpu)
806 { 806 {
807 LIST_HEAD(purge); 807 LIST_HEAD(purge);
808 struct vmap_block *vb; 808 struct vmap_block *vb;
809 struct vmap_block *n_vb; 809 struct vmap_block *n_vb;
810 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); 810 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
811 811
812 rcu_read_lock(); 812 rcu_read_lock();
813 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 813 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
814 814
815 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) 815 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
816 continue; 816 continue;
817 817
818 spin_lock(&vb->lock); 818 spin_lock(&vb->lock);
819 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { 819 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
820 vb->free = 0; /* prevent further allocs after releasing lock */ 820 vb->free = 0; /* prevent further allocs after releasing lock */
821 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ 821 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
822 bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); 822 bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
823 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); 823 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
824 spin_lock(&vbq->lock); 824 spin_lock(&vbq->lock);
825 list_del_rcu(&vb->free_list); 825 list_del_rcu(&vb->free_list);
826 spin_unlock(&vbq->lock); 826 spin_unlock(&vbq->lock);
827 spin_unlock(&vb->lock); 827 spin_unlock(&vb->lock);
828 list_add_tail(&vb->purge, &purge); 828 list_add_tail(&vb->purge, &purge);
829 } else 829 } else
830 spin_unlock(&vb->lock); 830 spin_unlock(&vb->lock);
831 } 831 }
832 rcu_read_unlock(); 832 rcu_read_unlock();
833 833
834 list_for_each_entry_safe(vb, n_vb, &purge, purge) { 834 list_for_each_entry_safe(vb, n_vb, &purge, purge) {
835 list_del(&vb->purge); 835 list_del(&vb->purge);
836 free_vmap_block(vb); 836 free_vmap_block(vb);
837 } 837 }
838 } 838 }
839 839
840 static void purge_fragmented_blocks_thiscpu(void) 840 static void purge_fragmented_blocks_thiscpu(void)
841 { 841 {
842 purge_fragmented_blocks(smp_processor_id()); 842 purge_fragmented_blocks(smp_processor_id());
843 } 843 }
844 844
845 static void purge_fragmented_blocks_allcpus(void) 845 static void purge_fragmented_blocks_allcpus(void)
846 { 846 {
847 int cpu; 847 int cpu;
848 848
849 for_each_possible_cpu(cpu) 849 for_each_possible_cpu(cpu)
850 purge_fragmented_blocks(cpu); 850 purge_fragmented_blocks(cpu);
851 } 851 }
852 852
853 static void *vb_alloc(unsigned long size, gfp_t gfp_mask) 853 static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
854 { 854 {
855 struct vmap_block_queue *vbq; 855 struct vmap_block_queue *vbq;
856 struct vmap_block *vb; 856 struct vmap_block *vb;
857 unsigned long addr = 0; 857 unsigned long addr = 0;
858 unsigned int order; 858 unsigned int order;
859 int purge = 0; 859 int purge = 0;
860 860
861 BUG_ON(size & ~PAGE_MASK); 861 BUG_ON(size & ~PAGE_MASK);
862 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 862 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
863 order = get_order(size); 863 order = get_order(size);
864 864
865 again: 865 again:
866 rcu_read_lock(); 866 rcu_read_lock();
867 vbq = &get_cpu_var(vmap_block_queue); 867 vbq = &get_cpu_var(vmap_block_queue);
868 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 868 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
869 int i; 869 int i;
870 870
871 spin_lock(&vb->lock); 871 spin_lock(&vb->lock);
872 if (vb->free < 1UL << order) 872 if (vb->free < 1UL << order)
873 goto next; 873 goto next;
874 874
875 i = bitmap_find_free_region(vb->alloc_map, 875 i = bitmap_find_free_region(vb->alloc_map,
876 VMAP_BBMAP_BITS, order); 876 VMAP_BBMAP_BITS, order);
877 877
878 if (i < 0) { 878 if (i < 0) {
879 if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { 879 if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
880 /* fragmented and no outstanding allocations */ 880 /* fragmented and no outstanding allocations */
881 BUG_ON(vb->dirty != VMAP_BBMAP_BITS); 881 BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
882 purge = 1; 882 purge = 1;
883 } 883 }
884 goto next; 884 goto next;
885 } 885 }
886 addr = vb->va->va_start + (i << PAGE_SHIFT); 886 addr = vb->va->va_start + (i << PAGE_SHIFT);
887 BUG_ON(addr_to_vb_idx(addr) != 887 BUG_ON(addr_to_vb_idx(addr) !=
888 addr_to_vb_idx(vb->va->va_start)); 888 addr_to_vb_idx(vb->va->va_start));
889 vb->free -= 1UL << order; 889 vb->free -= 1UL << order;
890 if (vb->free == 0) { 890 if (vb->free == 0) {
891 spin_lock(&vbq->lock); 891 spin_lock(&vbq->lock);
892 list_del_rcu(&vb->free_list); 892 list_del_rcu(&vb->free_list);
893 spin_unlock(&vbq->lock); 893 spin_unlock(&vbq->lock);
894 } 894 }
895 spin_unlock(&vb->lock); 895 spin_unlock(&vb->lock);
896 break; 896 break;
897 next: 897 next:
898 spin_unlock(&vb->lock); 898 spin_unlock(&vb->lock);
899 } 899 }
900 900
901 if (purge) 901 if (purge)
902 purge_fragmented_blocks_thiscpu(); 902 purge_fragmented_blocks_thiscpu();
903 903
904 put_cpu_var(vmap_block_queue); 904 put_cpu_var(vmap_block_queue);
905 rcu_read_unlock(); 905 rcu_read_unlock();
906 906
907 if (!addr) { 907 if (!addr) {
908 vb = new_vmap_block(gfp_mask); 908 vb = new_vmap_block(gfp_mask);
909 if (IS_ERR(vb)) 909 if (IS_ERR(vb))
910 return vb; 910 return vb;
911 goto again; 911 goto again;
912 } 912 }
913 913
914 return (void *)addr; 914 return (void *)addr;
915 } 915 }
916 916
917 static void vb_free(const void *addr, unsigned long size) 917 static void vb_free(const void *addr, unsigned long size)
918 { 918 {
919 unsigned long offset; 919 unsigned long offset;
920 unsigned long vb_idx; 920 unsigned long vb_idx;
921 unsigned int order; 921 unsigned int order;
922 struct vmap_block *vb; 922 struct vmap_block *vb;
923 923
924 BUG_ON(size & ~PAGE_MASK); 924 BUG_ON(size & ~PAGE_MASK);
925 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 925 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
926 926
927 flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); 927 flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
928 928
929 order = get_order(size); 929 order = get_order(size);
930 930
931 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); 931 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
932 932
933 vb_idx = addr_to_vb_idx((unsigned long)addr); 933 vb_idx = addr_to_vb_idx((unsigned long)addr);
934 rcu_read_lock(); 934 rcu_read_lock();
935 vb = radix_tree_lookup(&vmap_block_tree, vb_idx); 935 vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
936 rcu_read_unlock(); 936 rcu_read_unlock();
937 BUG_ON(!vb); 937 BUG_ON(!vb);
938 938
939 spin_lock(&vb->lock); 939 spin_lock(&vb->lock);
940 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); 940 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
941 941
942 vb->dirty += 1UL << order; 942 vb->dirty += 1UL << order;
943 if (vb->dirty == VMAP_BBMAP_BITS) { 943 if (vb->dirty == VMAP_BBMAP_BITS) {
944 BUG_ON(vb->free); 944 BUG_ON(vb->free);
945 spin_unlock(&vb->lock); 945 spin_unlock(&vb->lock);
946 free_vmap_block(vb); 946 free_vmap_block(vb);
947 } else 947 } else
948 spin_unlock(&vb->lock); 948 spin_unlock(&vb->lock);
949 } 949 }
950 950
951 /** 951 /**
952 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer 952 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
953 * 953 *
954 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily 954 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
955 * to amortize TLB flushing overheads. This means that any page you 955 * to amortize TLB flushing overheads. This means that any page you
956 * have now may, in a former life, have been mapped into a kernel virtual 956 * have now may, in a former life, have been mapped into a kernel virtual
957 * address by the vmap layer, so there might be some CPUs with TLB entries 957 * address by the vmap layer, so there might be some CPUs with TLB entries
958 * still referencing that page (in addition to the regular 1:1 kernel mapping). 958 * still referencing that page (in addition to the regular 1:1 kernel mapping).
959 * 959 *
960 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can 960 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
961 * be sure that none of the pages we have control over will have any aliases 961 * be sure that none of the pages we have control over will have any aliases
962 * from the vmap layer. 962 * from the vmap layer.
963 */ 963 */
964 void vm_unmap_aliases(void) 964 void vm_unmap_aliases(void)
965 { 965 {
966 unsigned long start = ULONG_MAX, end = 0; 966 unsigned long start = ULONG_MAX, end = 0;
967 int cpu; 967 int cpu;
968 int flush = 0; 968 int flush = 0;
969 969
970 if (unlikely(!vmap_initialized)) 970 if (unlikely(!vmap_initialized))
971 return; 971 return;
972 972
973 for_each_possible_cpu(cpu) { 973 for_each_possible_cpu(cpu) {
974 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); 974 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
975 struct vmap_block *vb; 975 struct vmap_block *vb;
976 976
977 rcu_read_lock(); 977 rcu_read_lock();
978 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 978 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
979 int i; 979 int i;
980 980
981 spin_lock(&vb->lock); 981 spin_lock(&vb->lock);
982 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); 982 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
983 while (i < VMAP_BBMAP_BITS) { 983 while (i < VMAP_BBMAP_BITS) {
984 unsigned long s, e; 984 unsigned long s, e;
985 int j; 985 int j;
986 j = find_next_zero_bit(vb->dirty_map, 986 j = find_next_zero_bit(vb->dirty_map,
987 VMAP_BBMAP_BITS, i); 987 VMAP_BBMAP_BITS, i);
988 988
989 s = vb->va->va_start + (i << PAGE_SHIFT); 989 s = vb->va->va_start + (i << PAGE_SHIFT);
990 e = vb->va->va_start + (j << PAGE_SHIFT); 990 e = vb->va->va_start + (j << PAGE_SHIFT);
991 vunmap_page_range(s, e); 991 vunmap_page_range(s, e);
992 flush = 1; 992 flush = 1;
993 993
994 if (s < start) 994 if (s < start)
995 start = s; 995 start = s;
996 if (e > end) 996 if (e > end)
997 end = e; 997 end = e;
998 998
999 i = j; 999 i = j;
1000 i = find_next_bit(vb->dirty_map, 1000 i = find_next_bit(vb->dirty_map,
1001 VMAP_BBMAP_BITS, i); 1001 VMAP_BBMAP_BITS, i);
1002 } 1002 }
1003 spin_unlock(&vb->lock); 1003 spin_unlock(&vb->lock);
1004 } 1004 }
1005 rcu_read_unlock(); 1005 rcu_read_unlock();
1006 } 1006 }
1007 1007
1008 __purge_vmap_area_lazy(&start, &end, 1, flush); 1008 __purge_vmap_area_lazy(&start, &end, 1, flush);
1009 } 1009 }
1010 EXPORT_SYMBOL_GPL(vm_unmap_aliases); 1010 EXPORT_SYMBOL_GPL(vm_unmap_aliases);
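For illustration only, a hedged sketch of how a caller might rely on the guarantee described above: flushing lazy aliases before doing something that must not race with stale vmap mappings of its pages. demo_flush_aliases() is an invented name, not an interface added by this patch.

#include <linux/vmalloc.h>

/*
 * Hypothetical helper: make sure no lazily-kept vmap aliases of our
 * pages remain before, e.g., changing their attributes or handing
 * them to hardware.
 */
static void demo_flush_aliases(void)
{
	/* After this returns, no vmap-layer aliases of any page survive. */
	vm_unmap_aliases();
}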
1011 1011
1012 /** 1012 /**
1013 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram 1013 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
1014 * @mem: the pointer returned by vm_map_ram 1014 * @mem: the pointer returned by vm_map_ram
1015 * @count: the count passed to that vm_map_ram call (cannot unmap partial) 1015 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
1016 */ 1016 */
1017 void vm_unmap_ram(const void *mem, unsigned int count) 1017 void vm_unmap_ram(const void *mem, unsigned int count)
1018 { 1018 {
1019 unsigned long size = count << PAGE_SHIFT; 1019 unsigned long size = count << PAGE_SHIFT;
1020 unsigned long addr = (unsigned long)mem; 1020 unsigned long addr = (unsigned long)mem;
1021 1021
1022 BUG_ON(!addr); 1022 BUG_ON(!addr);
1023 BUG_ON(addr < VMALLOC_START); 1023 BUG_ON(addr < VMALLOC_START);
1024 BUG_ON(addr > VMALLOC_END); 1024 BUG_ON(addr > VMALLOC_END);
1025 BUG_ON(addr & (PAGE_SIZE-1)); 1025 BUG_ON(addr & (PAGE_SIZE-1));
1026 1026
1027 debug_check_no_locks_freed(mem, size); 1027 debug_check_no_locks_freed(mem, size);
1028 vmap_debug_free_range(addr, addr+size); 1028 vmap_debug_free_range(addr, addr+size);
1029 1029
1030 if (likely(count <= VMAP_MAX_ALLOC)) 1030 if (likely(count <= VMAP_MAX_ALLOC))
1031 vb_free(mem, size); 1031 vb_free(mem, size);
1032 else 1032 else
1033 free_unmap_vmap_area_addr(addr); 1033 free_unmap_vmap_area_addr(addr);
1034 } 1034 }
1035 EXPORT_SYMBOL(vm_unmap_ram); 1035 EXPORT_SYMBOL(vm_unmap_ram);
1036 1036
1037 /** 1037 /**
1038 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) 1038 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
1039 * @pages: an array of pointers to the pages to be mapped 1039 * @pages: an array of pointers to the pages to be mapped
1040 * @count: number of pages 1040 * @count: number of pages
1041 * @node: prefer to allocate data structures on this node 1041 * @node: prefer to allocate data structures on this node
1042 * @prot: memory protection to use. PAGE_KERNEL for regular RAM 1042 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
1043 * 1043 *
1044 * Returns: a pointer to the address that has been mapped, or %NULL on failure 1044 * Returns: a pointer to the address that has been mapped, or %NULL on failure
1045 */ 1045 */
1046 void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) 1046 void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
1047 { 1047 {
1048 unsigned long size = count << PAGE_SHIFT; 1048 unsigned long size = count << PAGE_SHIFT;
1049 unsigned long addr; 1049 unsigned long addr;
1050 void *mem; 1050 void *mem;
1051 1051
1052 if (likely(count <= VMAP_MAX_ALLOC)) { 1052 if (likely(count <= VMAP_MAX_ALLOC)) {
1053 mem = vb_alloc(size, GFP_KERNEL); 1053 mem = vb_alloc(size, GFP_KERNEL);
1054 if (IS_ERR(mem)) 1054 if (IS_ERR(mem))
1055 return NULL; 1055 return NULL;
1056 addr = (unsigned long)mem; 1056 addr = (unsigned long)mem;
1057 } else { 1057 } else {
1058 struct vmap_area *va; 1058 struct vmap_area *va;
1059 va = alloc_vmap_area(size, PAGE_SIZE, 1059 va = alloc_vmap_area(size, PAGE_SIZE,
1060 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); 1060 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
1061 if (IS_ERR(va)) 1061 if (IS_ERR(va))
1062 return NULL; 1062 return NULL;
1063 1063
1064 addr = va->va_start; 1064 addr = va->va_start;
1065 mem = (void *)addr; 1065 mem = (void *)addr;
1066 } 1066 }
1067 if (vmap_page_range(addr, addr + size, prot, pages) < 0) { 1067 if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
1068 vm_unmap_ram(mem, count); 1068 vm_unmap_ram(mem, count);
1069 return NULL; 1069 return NULL;
1070 } 1070 }
1071 return mem; 1071 return mem;
1072 } 1072 }
1073 EXPORT_SYMBOL(vm_map_ram); 1073 EXPORT_SYMBOL(vm_map_ram);
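As a rough usage sketch of the vm_map_ram()/vm_unmap_ram() pair documented above (small mappings go through the per-CPU vmap blocks, larger ones through alloc_vmap_area()): the wrapper names demo_map()/demo_unmap() below are invented, and the caller is assumed to already own the struct page array.

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Hypothetical wrappers; not part of this patch. */
static void *demo_map(struct page **pages, unsigned int count)
{
	/* node = -1: no NUMA preference; PAGE_KERNEL: normal cached mapping */
	return vm_map_ram(pages, count, -1, PAGE_KERNEL);
}

static void demo_unmap(void *mem, unsigned int count)
{
	/* @count must be the same count that was passed to vm_map_ram() */
	vm_unmap_ram(mem, count);
}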
1074 1074
1075 /** 1075 /**
1076 * vm_area_register_early - register vmap area early during boot 1076 * vm_area_register_early - register vmap area early during boot
1077 * @vm: vm_struct to register 1077 * @vm: vm_struct to register
1078 * @align: requested alignment 1078 * @align: requested alignment
1079 * 1079 *
1080 * This function is used to register kernel vm area before 1080 * This function is used to register kernel vm area before
1081 * vmalloc_init() is called. @vm->size and @vm->flags should contain 1081 * vmalloc_init() is called. @vm->size and @vm->flags should contain
1082 * proper values on entry and other fields should be zero. On return, 1082 * proper values on entry and other fields should be zero. On return,
1083 * vm->addr contains the allocated address. 1083 * vm->addr contains the allocated address.
1084 * 1084 *
1085 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING. 1085 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
1086 */ 1086 */
1087 void __init vm_area_register_early(struct vm_struct *vm, size_t align) 1087 void __init vm_area_register_early(struct vm_struct *vm, size_t align)
1088 { 1088 {
1089 static size_t vm_init_off __initdata; 1089 static size_t vm_init_off __initdata;
1090 unsigned long addr; 1090 unsigned long addr;
1091 1091
1092 addr = ALIGN(VMALLOC_START + vm_init_off, align); 1092 addr = ALIGN(VMALLOC_START + vm_init_off, align);
1093 vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START; 1093 vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
1094 1094
1095 vm->addr = (void *)addr; 1095 vm->addr = (void *)addr;
1096 1096
1097 vm->next = vmlist; 1097 vm->next = vmlist;
1098 vmlist = vm; 1098 vmlist = vm;
1099 } 1099 }
1100 1100
1101 void __init vmalloc_init(void) 1101 void __init vmalloc_init(void)
1102 { 1102 {
1103 struct vmap_area *va; 1103 struct vmap_area *va;
1104 struct vm_struct *tmp; 1104 struct vm_struct *tmp;
1105 int i; 1105 int i;
1106 1106
1107 for_each_possible_cpu(i) { 1107 for_each_possible_cpu(i) {
1108 struct vmap_block_queue *vbq; 1108 struct vmap_block_queue *vbq;
1109 1109
1110 vbq = &per_cpu(vmap_block_queue, i); 1110 vbq = &per_cpu(vmap_block_queue, i);
1111 spin_lock_init(&vbq->lock); 1111 spin_lock_init(&vbq->lock);
1112 INIT_LIST_HEAD(&vbq->free); 1112 INIT_LIST_HEAD(&vbq->free);
1113 } 1113 }
1114 1114
1115 /* Import existing vmlist entries. */ 1115 /* Import existing vmlist entries. */
1116 for (tmp = vmlist; tmp; tmp = tmp->next) { 1116 for (tmp = vmlist; tmp; tmp = tmp->next) {
1117 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); 1117 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1118 va->flags = tmp->flags | VM_VM_AREA; 1118 va->flags = tmp->flags | VM_VM_AREA;
1119 va->va_start = (unsigned long)tmp->addr; 1119 va->va_start = (unsigned long)tmp->addr;
1120 va->va_end = va->va_start + tmp->size; 1120 va->va_end = va->va_start + tmp->size;
1121 __insert_vmap_area(va); 1121 __insert_vmap_area(va);
1122 } 1122 }
1123 1123
1124 vmap_area_pcpu_hole = VMALLOC_END; 1124 vmap_area_pcpu_hole = VMALLOC_END;
1125 1125
1126 vmap_initialized = true; 1126 vmap_initialized = true;
1127 } 1127 }
1128 1128
1129 /** 1129 /**
1130 * map_kernel_range_noflush - map kernel VM area with the specified pages 1130 * map_kernel_range_noflush - map kernel VM area with the specified pages
1131 * @addr: start of the VM area to map 1131 * @addr: start of the VM area to map
1132 * @size: size of the VM area to map 1132 * @size: size of the VM area to map
1133 * @prot: page protection flags to use 1133 * @prot: page protection flags to use
1134 * @pages: pages to map 1134 * @pages: pages to map
1135 * 1135 *
1136 * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size 1136 * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
1137 * specify should have been allocated using get_vm_area() and its 1137 * specify should have been allocated using get_vm_area() and its
1138 * friends. 1138 * friends.
1139 * 1139 *
1140 * NOTE: 1140 * NOTE:
1141 * This function does NOT do any cache flushing. The caller is 1141 * This function does NOT do any cache flushing. The caller is
1142 * responsible for calling flush_cache_vmap() on to-be-mapped areas 1142 * responsible for calling flush_cache_vmap() on to-be-mapped areas
1143 * before calling this function. 1143 * before calling this function.
1144 * 1144 *
1145 * RETURNS: 1145 * RETURNS:
1146 * The number of pages mapped on success, -errno on failure. 1146 * The number of pages mapped on success, -errno on failure.
1147 */ 1147 */
1148 int map_kernel_range_noflush(unsigned long addr, unsigned long size, 1148 int map_kernel_range_noflush(unsigned long addr, unsigned long size,
1149 pgprot_t prot, struct page **pages) 1149 pgprot_t prot, struct page **pages)
1150 { 1150 {
1151 return vmap_page_range_noflush(addr, addr + size, prot, pages); 1151 return vmap_page_range_noflush(addr, addr + size, prot, pages);
1152 } 1152 }
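A minimal sketch of the flushing contract stated in the comment above, assuming the caller has already reserved the area and owns the pages; demo_map_noflush() is a made-up wrapper, not kernel API.

#include <linux/vmalloc.h>
#include <asm/cacheflush.h>

static int demo_map_noflush(unsigned long addr, unsigned long size,
			    pgprot_t prot, struct page **pages)
{
	/* The caller, not map_kernel_range_noflush(), flushes the cache. */
	flush_cache_vmap(addr, addr + size);
	return map_kernel_range_noflush(addr, size, prot, pages);
}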
1153 1153
1154 /** 1154 /**
1155 * unmap_kernel_range_noflush - unmap kernel VM area 1155 * unmap_kernel_range_noflush - unmap kernel VM area
1156 * @addr: start of the VM area to unmap 1156 * @addr: start of the VM area to unmap
1157 * @size: size of the VM area to unmap 1157 * @size: size of the VM area to unmap
1158 * 1158 *
1159 * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size 1159 * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size
1160 * specify should have been allocated using get_vm_area() and its 1160 * specify should have been allocated using get_vm_area() and its
1161 * friends. 1161 * friends.
1162 * 1162 *
1163 * NOTE: 1163 * NOTE:
1164 * This function does NOT do any cache flushing. The caller is 1164 * This function does NOT do any cache flushing. The caller is
1165 * responsible for calling flush_cache_vunmap() on to-be-unmapped areas 1165 * responsible for calling flush_cache_vunmap() on to-be-unmapped areas
1166 * before calling this function and flush_tlb_kernel_range() after. 1166 * before calling this function and flush_tlb_kernel_range() after.
1167 */ 1167 */
1168 void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) 1168 void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
1169 { 1169 {
1170 vunmap_page_range(addr, addr + size); 1170 vunmap_page_range(addr, addr + size);
1171 } 1171 }
1172 1172
1173 /** 1173 /**
1174 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB 1174 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
1175 * @addr: start of the VM area to unmap 1175 * @addr: start of the VM area to unmap
1176 * @size: size of the VM area to unmap 1176 * @size: size of the VM area to unmap
1177 * 1177 *
1178 * Similar to unmap_kernel_range_noflush() but flushes vcache before 1178 * Similar to unmap_kernel_range_noflush() but flushes vcache before
1179 * the unmapping and tlb after. 1179 * the unmapping and tlb after.
1180 */ 1180 */
1181 void unmap_kernel_range(unsigned long addr, unsigned long size) 1181 void unmap_kernel_range(unsigned long addr, unsigned long size)
1182 { 1182 {
1183 unsigned long end = addr + size; 1183 unsigned long end = addr + size;
1184 1184
1185 flush_cache_vunmap(addr, end); 1185 flush_cache_vunmap(addr, end);
1186 vunmap_page_range(addr, end); 1186 vunmap_page_range(addr, end);
1187 flush_tlb_kernel_range(addr, end); 1187 flush_tlb_kernel_range(addr, end);
1188 } 1188 }
1189 1189
1190 int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) 1190 int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
1191 { 1191 {
1192 unsigned long addr = (unsigned long)area->addr; 1192 unsigned long addr = (unsigned long)area->addr;
1193 unsigned long end = addr + area->size - PAGE_SIZE; 1193 unsigned long end = addr + area->size - PAGE_SIZE;
1194 int err; 1194 int err;
1195 1195
1196 err = vmap_page_range(addr, end, prot, *pages); 1196 err = vmap_page_range(addr, end, prot, *pages);
1197 if (err > 0) { 1197 if (err > 0) {
1198 *pages += err; 1198 *pages += err;
1199 err = 0; 1199 err = 0;
1200 } 1200 }
1201 1201
1202 return err; 1202 return err;
1203 } 1203 }
1204 EXPORT_SYMBOL_GPL(map_vm_area); 1204 EXPORT_SYMBOL_GPL(map_vm_area);
1205 1205
1206 /*** Old vmalloc interfaces ***/ 1206 /*** Old vmalloc interfaces ***/
1207 DEFINE_RWLOCK(vmlist_lock); 1207 DEFINE_RWLOCK(vmlist_lock);
1208 struct vm_struct *vmlist; 1208 struct vm_struct *vmlist;
1209 1209
1210 static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1210 static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1211 unsigned long flags, void *caller) 1211 unsigned long flags, void *caller)
1212 { 1212 {
1213 struct vm_struct *tmp, **p; 1213 struct vm_struct *tmp, **p;
1214 1214
1215 vm->flags = flags; 1215 vm->flags = flags;
1216 vm->addr = (void *)va->va_start; 1216 vm->addr = (void *)va->va_start;
1217 vm->size = va->va_end - va->va_start; 1217 vm->size = va->va_end - va->va_start;
1218 vm->caller = caller; 1218 vm->caller = caller;
1219 va->private = vm; 1219 va->private = vm;
1220 va->flags |= VM_VM_AREA; 1220 va->flags |= VM_VM_AREA;
1221 1221
1222 write_lock(&vmlist_lock); 1222 write_lock(&vmlist_lock);
1223 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { 1223 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1224 if (tmp->addr >= vm->addr) 1224 if (tmp->addr >= vm->addr)
1225 break; 1225 break;
1226 } 1226 }
1227 vm->next = *p; 1227 vm->next = *p;
1228 *p = vm; 1228 *p = vm;
1229 write_unlock(&vmlist_lock); 1229 write_unlock(&vmlist_lock);
1230 } 1230 }
1231 1231
1232 static struct vm_struct *__get_vm_area_node(unsigned long size, 1232 static struct vm_struct *__get_vm_area_node(unsigned long size,
1233 unsigned long align, unsigned long flags, unsigned long start, 1233 unsigned long align, unsigned long flags, unsigned long start,
1234 unsigned long end, int node, gfp_t gfp_mask, void *caller) 1234 unsigned long end, int node, gfp_t gfp_mask, void *caller)
1235 { 1235 {
1236 struct vmap_area *va; 1236 struct vmap_area *va;
1237 struct vm_struct *area; 1237 struct vm_struct *area;
1238 1238
1239 BUG_ON(in_interrupt()); 1239 BUG_ON(in_interrupt());
1240 if (flags & VM_IOREMAP) { 1240 if (flags & VM_IOREMAP) {
1241 int bit = fls(size); 1241 int bit = fls(size);
1242 1242
1243 if (bit > IOREMAP_MAX_ORDER) 1243 if (bit > IOREMAP_MAX_ORDER)
1244 bit = IOREMAP_MAX_ORDER; 1244 bit = IOREMAP_MAX_ORDER;
1245 else if (bit < PAGE_SHIFT) 1245 else if (bit < PAGE_SHIFT)
1246 bit = PAGE_SHIFT; 1246 bit = PAGE_SHIFT;
1247 1247
1248 align = 1ul << bit; 1248 align = 1ul << bit;
1249 } 1249 }
1250 1250
1251 size = PAGE_ALIGN(size); 1251 size = PAGE_ALIGN(size);
1252 if (unlikely(!size)) 1252 if (unlikely(!size))
1253 return NULL; 1253 return NULL;
1254 1254
1255 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 1255 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
1256 if (unlikely(!area)) 1256 if (unlikely(!area))
1257 return NULL; 1257 return NULL;
1258 1258
1259 /* 1259 /*
1260 * We always allocate a guard page. 1260 * We always allocate a guard page.
1261 */ 1261 */
1262 size += PAGE_SIZE; 1262 size += PAGE_SIZE;
1263 1263
1264 va = alloc_vmap_area(size, align, start, end, node, gfp_mask); 1264 va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
1265 if (IS_ERR(va)) { 1265 if (IS_ERR(va)) {
1266 kfree(area); 1266 kfree(area);
1267 return NULL; 1267 return NULL;
1268 } 1268 }
1269 1269
1270 insert_vmalloc_vm(area, va, flags, caller); 1270 insert_vmalloc_vm(area, va, flags, caller);
1271 return area; 1271 return area;
1272 } 1272 }
1273 1273
1274 struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1274 struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1275 unsigned long start, unsigned long end) 1275 unsigned long start, unsigned long end)
1276 { 1276 {
1277 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1277 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1278 __builtin_return_address(0)); 1278 __builtin_return_address(0));
1279 } 1279 }
1280 EXPORT_SYMBOL_GPL(__get_vm_area); 1280 EXPORT_SYMBOL_GPL(__get_vm_area);
1281 1281
1282 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 1282 struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1283 unsigned long start, unsigned long end, 1283 unsigned long start, unsigned long end,
1284 void *caller) 1284 void *caller)
1285 { 1285 {
1286 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1286 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1287 caller); 1287 caller);
1288 } 1288 }
1289 1289
1290 /** 1290 /**
1291 * get_vm_area - reserve a contiguous kernel virtual area 1291 * get_vm_area - reserve a contiguous kernel virtual area
1292 * @size: size of the area 1292 * @size: size of the area
1293 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC 1293 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
1294 * 1294 *
1295 * Search an area of @size in the kernel virtual mapping area, 1295 * Search an area of @size in the kernel virtual mapping area,
1296 * and reserve it for our purposes. Returns the area descriptor 1296 * and reserve it for our purposes. Returns the area descriptor
1297 * on success or %NULL on failure. 1297 * on success or %NULL on failure.
1298 */ 1298 */
1299 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 1299 struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1300 { 1300 {
1301 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1301 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1302 -1, GFP_KERNEL, __builtin_return_address(0)); 1302 -1, GFP_KERNEL, __builtin_return_address(0));
1303 } 1303 }
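Putting get_vm_area() and map_vm_area() together, a hedged sketch (demo_area_map() and its arguments are invented): note that map_vm_area() advances the page pointer it is given, which is why a local copy is passed.

#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *demo_area_map(struct page **pages, unsigned int count)
{
	struct vm_struct *area;
	struct page **p = pages;	/* map_vm_area() advances this */

	area = get_vm_area(count << PAGE_SHIFT, VM_MAP);
	if (!area)
		return NULL;

	if (map_vm_area(area, PAGE_KERNEL, &p)) {
		/* tears down the partially mapped area, as vmap() does */
		vunmap(area->addr);
		return NULL;
	}
	return area->addr;
}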
1304 1304
1305 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1305 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1306 void *caller) 1306 void *caller)
1307 { 1307 {
1308 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1308 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1309 -1, GFP_KERNEL, caller); 1309 -1, GFP_KERNEL, caller);
1310 } 1310 }
1311 1311
1312 struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, 1312 struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1313 int node, gfp_t gfp_mask) 1313 int node, gfp_t gfp_mask)
1314 { 1314 {
1315 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1315 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1316 node, gfp_mask, __builtin_return_address(0)); 1316 node, gfp_mask, __builtin_return_address(0));
1317 } 1317 }
1318 1318
1319 static struct vm_struct *find_vm_area(const void *addr) 1319 static struct vm_struct *find_vm_area(const void *addr)
1320 { 1320 {
1321 struct vmap_area *va; 1321 struct vmap_area *va;
1322 1322
1323 va = find_vmap_area((unsigned long)addr); 1323 va = find_vmap_area((unsigned long)addr);
1324 if (va && va->flags & VM_VM_AREA) 1324 if (va && va->flags & VM_VM_AREA)
1325 return va->private; 1325 return va->private;
1326 1326
1327 return NULL; 1327 return NULL;
1328 } 1328 }
1329 1329
1330 /** 1330 /**
1331 * remove_vm_area - find and remove a contiguous kernel virtual area 1331 * remove_vm_area - find and remove a contiguous kernel virtual area
1332 * @addr: base address 1332 * @addr: base address
1333 * 1333 *
1334 * Search for the kernel VM area starting at @addr, and remove it. 1334 * Search for the kernel VM area starting at @addr, and remove it.
1335 * This function returns the found VM area, but using it is NOT safe 1335 * This function returns the found VM area, but using it is NOT safe
1336 * on SMP machines, except for its size or flags. 1336 * on SMP machines, except for its size or flags.
1337 */ 1337 */
1338 struct vm_struct *remove_vm_area(const void *addr) 1338 struct vm_struct *remove_vm_area(const void *addr)
1339 { 1339 {
1340 struct vmap_area *va; 1340 struct vmap_area *va;
1341 1341
1342 va = find_vmap_area((unsigned long)addr); 1342 va = find_vmap_area((unsigned long)addr);
1343 if (va && va->flags & VM_VM_AREA) { 1343 if (va && va->flags & VM_VM_AREA) {
1344 struct vm_struct *vm = va->private; 1344 struct vm_struct *vm = va->private;
1345 struct vm_struct *tmp, **p; 1345 struct vm_struct *tmp, **p;
1346 /* 1346 /*
1347 * remove from list and disallow access to this vm_struct 1347 * remove from list and disallow access to this vm_struct
1348 * before unmap. (address range conflicts are handled by 1348 * before unmap. (address range conflicts are handled by
1349 * vmap.) 1349 * vmap.)
1350 */ 1350 */
1351 write_lock(&vmlist_lock); 1351 write_lock(&vmlist_lock);
1352 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) 1352 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1353 ; 1353 ;
1354 *p = tmp->next; 1354 *p = tmp->next;
1355 write_unlock(&vmlist_lock); 1355 write_unlock(&vmlist_lock);
1356 1356
1357 vmap_debug_free_range(va->va_start, va->va_end); 1357 vmap_debug_free_range(va->va_start, va->va_end);
1358 free_unmap_vmap_area(va); 1358 free_unmap_vmap_area(va);
1359 vm->size -= PAGE_SIZE; 1359 vm->size -= PAGE_SIZE;
1360 1360
1361 return vm; 1361 return vm;
1362 } 1362 }
1363 return NULL; 1363 return NULL;
1364 } 1364 }
1365 1365
1366 static void __vunmap(const void *addr, int deallocate_pages) 1366 static void __vunmap(const void *addr, int deallocate_pages)
1367 { 1367 {
1368 struct vm_struct *area; 1368 struct vm_struct *area;
1369 1369
1370 if (!addr) 1370 if (!addr)
1371 return; 1371 return;
1372 1372
1373 if ((PAGE_SIZE-1) & (unsigned long)addr) { 1373 if ((PAGE_SIZE-1) & (unsigned long)addr) {
1374 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 1374 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
1375 return; 1375 return;
1376 } 1376 }
1377 1377
1378 area = remove_vm_area(addr); 1378 area = remove_vm_area(addr);
1379 if (unlikely(!area)) { 1379 if (unlikely(!area)) {
1380 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 1380 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
1381 addr); 1381 addr);
1382 return; 1382 return;
1383 } 1383 }
1384 1384
1385 debug_check_no_locks_freed(addr, area->size); 1385 debug_check_no_locks_freed(addr, area->size);
1386 debug_check_no_obj_freed(addr, area->size); 1386 debug_check_no_obj_freed(addr, area->size);
1387 1387
1388 if (deallocate_pages) { 1388 if (deallocate_pages) {
1389 int i; 1389 int i;
1390 1390
1391 for (i = 0; i < area->nr_pages; i++) { 1391 for (i = 0; i < area->nr_pages; i++) {
1392 struct page *page = area->pages[i]; 1392 struct page *page = area->pages[i];
1393 1393
1394 BUG_ON(!page); 1394 BUG_ON(!page);
1395 __free_page(page); 1395 __free_page(page);
1396 } 1396 }
1397 1397
1398 if (area->flags & VM_VPAGES) 1398 if (area->flags & VM_VPAGES)
1399 vfree(area->pages); 1399 vfree(area->pages);
1400 else 1400 else
1401 kfree(area->pages); 1401 kfree(area->pages);
1402 } 1402 }
1403 1403
1404 kfree(area); 1404 kfree(area);
1405 return; 1405 return;
1406 } 1406 }
1407 1407
1408 /** 1408 /**
1409 * vfree - release memory allocated by vmalloc() 1409 * vfree - release memory allocated by vmalloc()
1410 * @addr: memory base address 1410 * @addr: memory base address
1411 * 1411 *
1412 * Free the virtually contiguous memory area starting at @addr, as 1412 * Free the virtually contiguous memory area starting at @addr, as
1413 * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is 1413 * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
1414 * NULL, no operation is performed. 1414 * NULL, no operation is performed.
1415 * 1415 *
1416 * Must not be called in interrupt context. 1416 * Must not be called in interrupt context.
1417 */ 1417 */
1418 void vfree(const void *addr) 1418 void vfree(const void *addr)
1419 { 1419 {
1420 BUG_ON(in_interrupt()); 1420 BUG_ON(in_interrupt());
1421 1421
1422 kmemleak_free(addr); 1422 kmemleak_free(addr);
1423 1423
1424 __vunmap(addr, 1); 1424 __vunmap(addr, 1);
1425 } 1425 }
1426 EXPORT_SYMBOL(vfree); 1426 EXPORT_SYMBOL(vfree);
1427 1427
1428 /** 1428 /**
1429 * vunmap - release virtual mapping obtained by vmap() 1429 * vunmap - release virtual mapping obtained by vmap()
1430 * @addr: memory base address 1430 * @addr: memory base address
1431 * 1431 *
1432 * Free the virtually contiguous memory area starting at @addr, 1432 * Free the virtually contiguous memory area starting at @addr,
1433 * which was created from the page array passed to vmap(). 1433 * which was created from the page array passed to vmap().
1434 * 1434 *
1435 * Must not be called in interrupt context. 1435 * Must not be called in interrupt context.
1436 */ 1436 */
1437 void vunmap(const void *addr) 1437 void vunmap(const void *addr)
1438 { 1438 {
1439 BUG_ON(in_interrupt()); 1439 BUG_ON(in_interrupt());
1440 might_sleep(); 1440 might_sleep();
1441 __vunmap(addr, 0); 1441 __vunmap(addr, 0);
1442 } 1442 }
1443 EXPORT_SYMBOL(vunmap); 1443 EXPORT_SYMBOL(vunmap);
1444 1444
1445 /** 1445 /**
1446 * vmap - map an array of pages into virtually contiguous space 1446 * vmap - map an array of pages into virtually contiguous space
1447 * @pages: array of page pointers 1447 * @pages: array of page pointers
1448 * @count: number of pages to map 1448 * @count: number of pages to map
1449 * @flags: vm_area->flags 1449 * @flags: vm_area->flags
1450 * @prot: page protection for the mapping 1450 * @prot: page protection for the mapping
1451 * 1451 *
1452 * Maps @count pages from @pages into contiguous kernel virtual 1452 * Maps @count pages from @pages into contiguous kernel virtual
1453 * space. 1453 * space.
1454 */ 1454 */
1455 void *vmap(struct page **pages, unsigned int count, 1455 void *vmap(struct page **pages, unsigned int count,
1456 unsigned long flags, pgprot_t prot) 1456 unsigned long flags, pgprot_t prot)
1457 { 1457 {
1458 struct vm_struct *area; 1458 struct vm_struct *area;
1459 1459
1460 might_sleep(); 1460 might_sleep();
1461 1461
1462 if (count > totalram_pages) 1462 if (count > totalram_pages)
1463 return NULL; 1463 return NULL;
1464 1464
1465 area = get_vm_area_caller((count << PAGE_SHIFT), flags, 1465 area = get_vm_area_caller((count << PAGE_SHIFT), flags,
1466 __builtin_return_address(0)); 1466 __builtin_return_address(0));
1467 if (!area) 1467 if (!area)
1468 return NULL; 1468 return NULL;
1469 1469
1470 if (map_vm_area(area, prot, &pages)) { 1470 if (map_vm_area(area, prot, &pages)) {
1471 vunmap(area->addr); 1471 vunmap(area->addr);
1472 return NULL; 1472 return NULL;
1473 } 1473 }
1474 1474
1475 return area->addr; 1475 return area->addr;
1476 } 1476 }
1477 EXPORT_SYMBOL(vmap); 1477 EXPORT_SYMBOL(vmap);
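A hedged end-to-end sketch of vmap(): allocate a few order-0 pages and map them contiguously. NR_DEMO_PAGES and demo_vmap_buffer() are illustrative names, and the caller is assumed to supply an array with room for NR_DEMO_PAGES entries.

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

#define NR_DEMO_PAGES	4	/* arbitrary, for illustration only */

static void *demo_vmap_buffer(struct page *pages[NR_DEMO_PAGES])
{
	void *addr;
	int i;

	for (i = 0; i < NR_DEMO_PAGES; i++) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i])
			goto out_free;
	}

	/* VM_MAP marks the area as created by vmap() */
	addr = vmap(pages, NR_DEMO_PAGES, VM_MAP, PAGE_KERNEL);
	if (addr)
		return addr;

out_free:
	while (--i >= 0)
		__free_page(pages[i]);
	return NULL;
}

Note that vmap() does not take ownership of the pages: after a later vunmap() of the returned address, the caller still frees each page with __free_page().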
1478 1478
1479 static void *__vmalloc_node(unsigned long size, unsigned long align, 1479 static void *__vmalloc_node(unsigned long size, unsigned long align,
1480 gfp_t gfp_mask, pgprot_t prot, 1480 gfp_t gfp_mask, pgprot_t prot,
1481 int node, void *caller); 1481 int node, void *caller);
1482 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1482 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1483 pgprot_t prot, int node, void *caller) 1483 pgprot_t prot, int node, void *caller)
1484 { 1484 {
1485 struct page **pages; 1485 struct page **pages;
1486 unsigned int nr_pages, array_size, i; 1486 unsigned int nr_pages, array_size, i;
1487 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 1487 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
1488 1488
1489 nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; 1489 nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
1490 array_size = (nr_pages * sizeof(struct page *)); 1490 array_size = (nr_pages * sizeof(struct page *));
1491 1491
1492 area->nr_pages = nr_pages; 1492 area->nr_pages = nr_pages;
1493 /* Please note that the recursion is strictly bounded. */ 1493 /* Please note that the recursion is strictly bounded. */
1494 if (array_size > PAGE_SIZE) { 1494 if (array_size > PAGE_SIZE) {
1495 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, 1495 pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
1496 PAGE_KERNEL, node, caller); 1496 PAGE_KERNEL, node, caller);
1497 area->flags |= VM_VPAGES; 1497 area->flags |= VM_VPAGES;
1498 } else { 1498 } else {
1499 pages = kmalloc_node(array_size, nested_gfp, node); 1499 pages = kmalloc_node(array_size, nested_gfp, node);
1500 } 1500 }
1501 area->pages = pages; 1501 area->pages = pages;
1502 area->caller = caller; 1502 area->caller = caller;
1503 if (!area->pages) { 1503 if (!area->pages) {
1504 remove_vm_area(area->addr); 1504 remove_vm_area(area->addr);
1505 kfree(area); 1505 kfree(area);
1506 return NULL; 1506 return NULL;
1507 } 1507 }
1508 1508
1509 for (i = 0; i < area->nr_pages; i++) { 1509 for (i = 0; i < area->nr_pages; i++) {
1510 struct page *page; 1510 struct page *page;
1511 1511
1512 if (node < 0) 1512 if (node < 0)
1513 page = alloc_page(gfp_mask); 1513 page = alloc_page(gfp_mask);
1514 else 1514 else
1515 page = alloc_pages_node(node, gfp_mask, 0); 1515 page = alloc_pages_node(node, gfp_mask, 0);
1516 1516
1517 if (unlikely(!page)) { 1517 if (unlikely(!page)) {
1518 /* Successfully allocated i pages, free them in __vunmap() */ 1518 /* Successfully allocated i pages, free them in __vunmap() */
1519 area->nr_pages = i; 1519 area->nr_pages = i;
1520 goto fail; 1520 goto fail;
1521 } 1521 }
1522 area->pages[i] = page; 1522 area->pages[i] = page;
1523 } 1523 }
1524 1524
1525 if (map_vm_area(area, prot, &pages)) 1525 if (map_vm_area(area, prot, &pages))
1526 goto fail; 1526 goto fail;
1527 return area->addr; 1527 return area->addr;
1528 1528
1529 fail: 1529 fail:
1530 vfree(area->addr); 1530 vfree(area->addr);
1531 return NULL; 1531 return NULL;
1532 } 1532 }
1533 1533
1534 void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) 1534 void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1535 { 1535 {
1536 void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, 1536 void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
1537 __builtin_return_address(0)); 1537 __builtin_return_address(0));
1538 1538
1539 /* 1539 /*
1540 * A ref_count = 3 is needed because the vm_struct and vmap_area 1540 * A ref_count = 3 is needed because the vm_struct and vmap_area
1541 * structures allocated in the __get_vm_area_node() function contain 1541 * structures allocated in the __get_vm_area_node() function contain
1542 * references to the virtual address of the vmalloc'ed block. 1542 * references to the virtual address of the vmalloc'ed block.
1543 */ 1543 */
1544 kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); 1544 kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
1545 1545
1546 return addr; 1546 return addr;
1547 } 1547 }
1548 1548
1549 /** 1549 /**
1550 * __vmalloc_node - allocate virtually contiguous memory 1550 * __vmalloc_node - allocate virtually contiguous memory
1551 * @size: allocation size 1551 * @size: allocation size
1552 * @align: desired alignment 1552 * @align: desired alignment
1553 * @gfp_mask: flags for the page level allocator 1553 * @gfp_mask: flags for the page level allocator
1554 * @prot: protection mask for the allocated pages 1554 * @prot: protection mask for the allocated pages
1555 * @node: node to use for allocation or -1 1555 * @node: node to use for allocation or -1
1556 * @caller: caller's return address 1556 * @caller: caller's return address
1557 * 1557 *
1558 * Allocate enough pages to cover @size from the page level 1558 * Allocate enough pages to cover @size from the page level
1559 * allocator with @gfp_mask flags. Map them into contiguous 1559 * allocator with @gfp_mask flags. Map them into contiguous
1560 * kernel virtual space, using a pagetable protection of @prot. 1560 * kernel virtual space, using a pagetable protection of @prot.
1561 */ 1561 */
1562 static void *__vmalloc_node(unsigned long size, unsigned long align, 1562 static void *__vmalloc_node(unsigned long size, unsigned long align,
1563 gfp_t gfp_mask, pgprot_t prot, 1563 gfp_t gfp_mask, pgprot_t prot,
1564 int node, void *caller) 1564 int node, void *caller)
1565 { 1565 {
1566 struct vm_struct *area; 1566 struct vm_struct *area;
1567 void *addr; 1567 void *addr;
1568 unsigned long real_size = size; 1568 unsigned long real_size = size;
1569 1569
1570 size = PAGE_ALIGN(size); 1570 size = PAGE_ALIGN(size);
1571 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1571 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1572 return NULL; 1572 return NULL;
1573 1573
1574 area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, 1574 area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START,
1575 VMALLOC_END, node, gfp_mask, caller); 1575 VMALLOC_END, node, gfp_mask, caller);
1576 1576
1577 if (!area) 1577 if (!area)
1578 return NULL; 1578 return NULL;
1579 1579
1580 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1580 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1581 1581
1582 /* 1582 /*
1583 * A ref_count = 3 is needed because the vm_struct and vmap_area 1583 * A ref_count = 3 is needed because the vm_struct and vmap_area
1584 * structures allocated in the __get_vm_area_node() function contain 1584 * structures allocated in the __get_vm_area_node() function contain
1585 * references to the virtual address of the vmalloc'ed block. 1585 * references to the virtual address of the vmalloc'ed block.
1586 */ 1586 */
1587 kmemleak_alloc(addr, real_size, 3, gfp_mask); 1587 kmemleak_alloc(addr, real_size, 3, gfp_mask);
1588 1588
1589 return addr; 1589 return addr;
1590 } 1590 }
1591 1591
1592 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1592 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1593 { 1593 {
1594 return __vmalloc_node(size, 1, gfp_mask, prot, -1, 1594 return __vmalloc_node(size, 1, gfp_mask, prot, -1,
1595 __builtin_return_address(0)); 1595 __builtin_return_address(0));
1596 } 1596 }
1597 EXPORT_SYMBOL(__vmalloc); 1597 EXPORT_SYMBOL(__vmalloc);
1598 1598
1599 static inline void *__vmalloc_node_flags(unsigned long size,
1600 int node, gfp_t flags)
1601 {
1602 return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
1603 node, __builtin_return_address(0));
1604 }
1605
1599 /** 1606 /**
1600 * vmalloc - allocate virtually contiguous memory 1607 * vmalloc - allocate virtually contiguous memory
1601 * @size: allocation size 1608 * @size: allocation size
1602 * Allocate enough pages to cover @size from the page level 1609 * Allocate enough pages to cover @size from the page level
1603 * allocator and map them into contiguous kernel virtual space. 1610 * allocator and map them into contiguous kernel virtual space.
1604 * 1611 *
1605 * For tight control over page level allocator and protection flags 1612 * For tight control over page level allocator and protection flags
1606 * use __vmalloc() instead. 1613 * use __vmalloc() instead.
1607 */ 1614 */
1608 void *vmalloc(unsigned long size) 1615 void *vmalloc(unsigned long size)
1609 { 1616 {
1610 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1617 return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM);
1611 -1, __builtin_return_address(0));
1612 } 1618 }
1613 EXPORT_SYMBOL(vmalloc); 1619 EXPORT_SYMBOL(vmalloc);
1614 1620
1615 /** 1621 /**
1622 * vzalloc - allocate virtually contiguous memory with zero fill
1623 * @size: allocation size
1624 * Allocate enough pages to cover @size from the page level
1625 * allocator and map them into contiguous kernel virtual space.
1626 * The memory allocated is set to zero.
1627 *
1628 * For tight control over page level allocator and protection flags
1629 * use __vmalloc() instead.
1630 */
1631 void *vzalloc(unsigned long size)
1632 {
1633 return __vmalloc_node_flags(size, -1,
1634 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1635 }
1636 EXPORT_SYMBOL(vzalloc);
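The conversion this commit enables, shown as a hedged before/after sketch (demo_alloc_old() and demo_alloc_new() are invented names): an open-coded vmalloc()-plus-memset() becomes a single vzalloc() call, with the zeroing done by __GFP_ZERO inside the page allocator.

#include <linux/string.h>
#include <linux/vmalloc.h>

/* Before this patch: allocate, then zero by hand. */
static void *demo_alloc_old(unsigned long size)
{
	void *p = vmalloc(size);

	if (p)
		memset(p, 0, size);
	return p;
}

/* With this patch: one call, zeroed via __GFP_ZERO. */
static void *demo_alloc_new(unsigned long size)
{
	return vzalloc(size);
}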
1637
1638 /**
1616 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 1639 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
1617 * @size: allocation size 1640 * @size: allocation size
1618 * 1641 *
1619 * The resulting memory area is zeroed so it can be mapped to userspace 1642 * The resulting memory area is zeroed so it can be mapped to userspace
1620 * without leaking data. 1643 * without leaking data.
1621 */ 1644 */
1622 void *vmalloc_user(unsigned long size) 1645 void *vmalloc_user(unsigned long size)
1623 { 1646 {
1624 struct vm_struct *area; 1647 struct vm_struct *area;
1625 void *ret; 1648 void *ret;
1626 1649
1627 ret = __vmalloc_node(size, SHMLBA, 1650 ret = __vmalloc_node(size, SHMLBA,
1628 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 1651 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1629 PAGE_KERNEL, -1, __builtin_return_address(0)); 1652 PAGE_KERNEL, -1, __builtin_return_address(0));
1630 if (ret) { 1653 if (ret) {
1631 area = find_vm_area(ret); 1654 area = find_vm_area(ret);
1632 area->flags |= VM_USERMAP; 1655 area->flags |= VM_USERMAP;
1633 } 1656 }
1634 return ret; 1657 return ret;
1635 } 1658 }
1636 EXPORT_SYMBOL(vmalloc_user); 1659 EXPORT_SYMBOL(vmalloc_user);
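A hedged sketch of the intended pairing: a buffer from vmalloc_user() (zeroed and marked VM_USERMAP) exposed to userspace from a driver's mmap handler via remap_vmalloc_range(). demo_buf and demo_mmap() are hypothetical.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *demo_buf;	/* assumed to have been allocated with vmalloc_user() */

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* Legal only because vmalloc_user() marked the area VM_USERMAP. */
	return remap_vmalloc_range(vma, demo_buf, vma->vm_pgoff);
}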
1637 1660
1638 /** 1661 /**
1639 * vmalloc_node - allocate memory on a specific node 1662 * vmalloc_node - allocate memory on a specific node
1640 * @size: allocation size 1663 * @size: allocation size
1641 * @node: numa node 1664 * @node: numa node
1642 * 1665 *
1643 * Allocate enough pages to cover @size from the page level 1666 * Allocate enough pages to cover @size from the page level
1644 * allocator and map them into contiguous kernel virtual space. 1667 * allocator and map them into contiguous kernel virtual space.
1645 * 1668 *
1646 * For tight control over page level allocator and protection flags 1669 * For tight control over page level allocator and protection flags
1647 * use __vmalloc() instead. 1670 * use __vmalloc() instead.
1648 */ 1671 */
1649 void *vmalloc_node(unsigned long size, int node) 1672 void *vmalloc_node(unsigned long size, int node)
1650 { 1673 {
1651 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1674 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1652 node, __builtin_return_address(0)); 1675 node, __builtin_return_address(0));
1653 } 1676 }
1654 EXPORT_SYMBOL(vmalloc_node); 1677 EXPORT_SYMBOL(vmalloc_node);
1678
1679 /**
1680 * vzalloc_node - allocate memory on a specific node with zero fill
1681 * @size: allocation size
1682 * @node: numa node
1683 *
1684 * Allocate enough pages to cover @size from the page level
1685 * allocator and map them into contiguous kernel virtual space.
1686 * The memory allocated is set to zero.
1687 *
1688 * For tight control over page level allocator and protection flags
1689 * use __vmalloc_node() instead.
1690 */
1691 void *vzalloc_node(unsigned long size, int node)
1692 {
1693 return __vmalloc_node_flags(size, node,
1694 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1695 }
1696 EXPORT_SYMBOL(vzalloc_node);
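A minimal sketch, assuming per-node data that should live close to the CPU that touches it; demo_alloc_local() and the use of numa_node_id() are illustrative, not part of this patch.

#include <linux/topology.h>
#include <linux/vmalloc.h>

static void *demo_alloc_local(unsigned long size)
{
	/* zero-filled, with pages preferably on the local NUMA node */
	return vzalloc_node(size, numa_node_id());
}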
1655 1697
1656 #ifndef PAGE_KERNEL_EXEC 1698 #ifndef PAGE_KERNEL_EXEC
1657 # define PAGE_KERNEL_EXEC PAGE_KERNEL 1699 # define PAGE_KERNEL_EXEC PAGE_KERNEL
1658 #endif 1700 #endif
1659 1701
1660 /** 1702 /**
1661 * vmalloc_exec - allocate virtually contiguous, executable memory 1703 * vmalloc_exec - allocate virtually contiguous, executable memory
1662 * @size: allocation size 1704 * @size: allocation size
1663 * 1705 *
1664 * Kernel-internal function to allocate enough pages to cover @size 1706 * Kernel-internal function to allocate enough pages to cover @size
1665 * from the page level allocator and map them into contiguous and 1707 * from the page level allocator and map them into contiguous and
1666 * executable kernel virtual space. 1708 * executable kernel virtual space.
1667 * 1709 *
1668 * For tight control over page level allocator and protection flags 1710 * For tight control over page level allocator and protection flags
1669 * use __vmalloc() instead. 1711 * use __vmalloc() instead.
1670 */ 1712 */
1671 1713
1672 void *vmalloc_exec(unsigned long size) 1714 void *vmalloc_exec(unsigned long size)
1673 { 1715 {
1674 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, 1716 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
1675 -1, __builtin_return_address(0)); 1717 -1, __builtin_return_address(0));
1676 } 1718 }
1677 1719
1678 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 1720 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
1679 #define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL 1721 #define GFP_VMALLOC32 GFP_DMA32 | GFP_KERNEL
1680 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) 1722 #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
1681 #define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL 1723 #define GFP_VMALLOC32 GFP_DMA | GFP_KERNEL
1682 #else 1724 #else
1683 #define GFP_VMALLOC32 GFP_KERNEL 1725 #define GFP_VMALLOC32 GFP_KERNEL
1684 #endif 1726 #endif
1685 1727
1686 /** 1728 /**
1687 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 1729 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
1688 * @size: allocation size 1730 * @size: allocation size
1689 * 1731 *
1690 * Allocate enough 32bit PA addressable pages to cover @size from the 1732 * Allocate enough 32bit PA addressable pages to cover @size from the
1691 * page level allocator and map them into contiguous kernel virtual space. 1733 * page level allocator and map them into contiguous kernel virtual space.
1692 */ 1734 */
1693 void *vmalloc_32(unsigned long size) 1735 void *vmalloc_32(unsigned long size)
1694 { 1736 {
1695 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, 1737 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
1696 -1, __builtin_return_address(0)); 1738 -1, __builtin_return_address(0));
1697 } 1739 }
1698 EXPORT_SYMBOL(vmalloc_32); 1740 EXPORT_SYMBOL(vmalloc_32);
1699 1741
1700 /** 1742 /**
1701 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory 1743 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
1702 * @size: allocation size 1744 * @size: allocation size
1703 * 1745 *
1704 * The resulting memory area is 32bit addressable and zeroed so it can be 1746 * The resulting memory area is 32bit addressable and zeroed so it can be
1705 * mapped to userspace without leaking data. 1747 * mapped to userspace without leaking data.
1706 */ 1748 */
1707 void *vmalloc_32_user(unsigned long size) 1749 void *vmalloc_32_user(unsigned long size)
1708 { 1750 {
1709 struct vm_struct *area; 1751 struct vm_struct *area;
1710 void *ret; 1752 void *ret;
1711 1753
1712 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 1754 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1713 -1, __builtin_return_address(0)); 1755 -1, __builtin_return_address(0));
1714 if (ret) { 1756 if (ret) {
1715 area = find_vm_area(ret); 1757 area = find_vm_area(ret);
1716 area->flags |= VM_USERMAP; 1758 area->flags |= VM_USERMAP;
1717 } 1759 }
1718 return ret; 1760 return ret;
1719 } 1761 }
1720 EXPORT_SYMBOL(vmalloc_32_user); 1762 EXPORT_SYMBOL(vmalloc_32_user);
1721 1763
1722 /* 1764 /*
1723 * Small helper routine to copy contents to buf from addr. 1765 * Small helper routine to copy contents to buf from addr.
1724 * If the page is not present, fill zero. 1766 * If the page is not present, fill zero.
1725 */ 1767 */
1726 1768
1727 static int aligned_vread(char *buf, char *addr, unsigned long count) 1769 static int aligned_vread(char *buf, char *addr, unsigned long count)
1728 { 1770 {
1729 struct page *p; 1771 struct page *p;
1730 int copied = 0; 1772 int copied = 0;
1731 1773
1732 while (count) { 1774 while (count) {
1733 unsigned long offset, length; 1775 unsigned long offset, length;
1734 1776
1735 offset = (unsigned long)addr & ~PAGE_MASK; 1777 offset = (unsigned long)addr & ~PAGE_MASK;
1736 length = PAGE_SIZE - offset; 1778 length = PAGE_SIZE - offset;
1737 if (length > count) 1779 if (length > count)
1738 length = count; 1780 length = count;
1739 p = vmalloc_to_page(addr); 1781 p = vmalloc_to_page(addr);
1740 /* 1782 /*
1741 * To do safe access to this _mapped_ area, we need 1783 * To do safe access to this _mapped_ area, we need
1742 * lock. But adding lock here means that we need to add 1784 * lock. But adding lock here means that we need to add
1743 * overhead of vmalloc()/vfree() calls for this _debug_ 1785 * overhead of vmalloc()/vfree() calls for this _debug_
1744 * interface, rarely used. Instead of that, we'll use 1786 * interface, rarely used. Instead of that, we'll use
1745 * kmap() and get small overhead in this access function. 1787 * kmap() and get small overhead in this access function.
1746 */ 1788 */
1747 if (p) { 1789 if (p) {
1748 /* 1790 /*
1749 * we can expect USER0 is not used (see vread/vwrite's 1791 * we can expect USER0 is not used (see vread/vwrite's
1750 * function description) 1792 * function description)
1751 */ 1793 */
1752 void *map = kmap_atomic(p, KM_USER0); 1794 void *map = kmap_atomic(p, KM_USER0);
1753 memcpy(buf, map + offset, length); 1795 memcpy(buf, map + offset, length);
1754 kunmap_atomic(map, KM_USER0); 1796 kunmap_atomic(map, KM_USER0);
1755 } else 1797 } else
1756 memset(buf, 0, length); 1798 memset(buf, 0, length);
1757 1799
1758 addr += length; 1800 addr += length;
1759 buf += length; 1801 buf += length;
1760 copied += length; 1802 copied += length;
1761 count -= length; 1803 count -= length;
1762 } 1804 }
1763 return copied; 1805 return copied;
1764 } 1806 }
1765 1807
1766 static int aligned_vwrite(char *buf, char *addr, unsigned long count) 1808 static int aligned_vwrite(char *buf, char *addr, unsigned long count)
1767 { 1809 {
1768 struct page *p; 1810 struct page *p;
1769 int copied = 0; 1811 int copied = 0;
1770 1812
1771 while (count) { 1813 while (count) {
1772 unsigned long offset, length; 1814 unsigned long offset, length;
1773 1815
1774 offset = (unsigned long)addr & ~PAGE_MASK; 1816 offset = (unsigned long)addr & ~PAGE_MASK;
1775 length = PAGE_SIZE - offset; 1817 length = PAGE_SIZE - offset;
1776 if (length > count) 1818 if (length > count)
1777 length = count; 1819 length = count;
1778 p = vmalloc_to_page(addr); 1820 p = vmalloc_to_page(addr);
1779 /* 1821 /*
1780 * To do safe access to this _mapped_ area, we need 1822 * To do safe access to this _mapped_ area, we need
1781 * lock. But adding lock here means that we need to add 1823 * lock. But adding lock here means that we need to add
1782 * overhead of vmalloc()/vfree() calls for this _debug_ 1824 * overhead of vmalloc()/vfree() calls for this _debug_
1783 * interface, rarely used. Instead of that, we'll use 1825 * interface, rarely used. Instead of that, we'll use
1784 * kmap() and get small overhead in this access function. 1826 * kmap() and get small overhead in this access function.
1785 */ 1827 */
1786 if (p) { 1828 if (p) {
1787 /* 1829 /*
1788 * we can expect USER0 is not used (see vread/vwrite's 1830 * we can expect USER0 is not used (see vread/vwrite's
1789 * function description) 1831 * function description)
1790 */ 1832 */
1791 void *map = kmap_atomic(p, KM_USER0); 1833 void *map = kmap_atomic(p, KM_USER0);
1792 memcpy(map + offset, buf, length); 1834 memcpy(map + offset, buf, length);
1793 kunmap_atomic(map, KM_USER0); 1835 kunmap_atomic(map, KM_USER0);
1794 } 1836 }
1795 addr += length; 1837 addr += length;
1796 buf += length; 1838 buf += length;
1797 copied += length; 1839 copied += length;
1798 count -= length; 1840 count -= length;
1799 } 1841 }
1800 return copied; 1842 return copied;
1801 } 1843 }
1802 1844
1803 /** 1845 /**
1804 * vread() - read vmalloc area in a safe way. 1846 * vread() - read vmalloc area in a safe way.
1805 * @buf: buffer for reading data 1847 * @buf: buffer for reading data
1806 * @addr: vm address. 1848 * @addr: vm address.
1807 * @count: number of bytes to be read. 1849 * @count: number of bytes to be read.
1808 * 1850 *
1809 * Returns # of bytes by which addr and buf should be increased 1851 * Returns # of bytes by which addr and buf should be increased
1810 * (same number as @count). Returns 0 if [addr...addr+count) doesn't 1852 * (same number as @count). Returns 0 if [addr...addr+count) doesn't
1811 * include any intersection with a live vmalloc area. 1853 * include any intersection with a live vmalloc area.
1812 * 1854 *
1813 * This function checks that addr is a valid vmalloc'ed area, and 1855 * This function checks that addr is a valid vmalloc'ed area, and
1814 * copies data from that area to a given buffer. If the given memory range 1856 * copies data from that area to a given buffer. If the given memory range
1815 * of [addr...addr+count) includes some valid address, data is copied to 1857 * of [addr...addr+count) includes some valid address, data is copied to
1816 * the proper area of @buf. If there are memory holes, they'll be zero-filled. 1858 * the proper area of @buf. If there are memory holes, they'll be zero-filled.
1817 * IOREMAP area is treated as memory hole and no copy is done. 1859 * IOREMAP area is treated as memory hole and no copy is done.
1818 * 1860 *
1819 * If [addr...addr+count) doesn't include any intersection with a live 1861 * If [addr...addr+count) doesn't include any intersection with a live
1820 * vm_struct area, returns 0. 1862 * vm_struct area, returns 0.
1821 * @buf should be kernel's buffer. Because this function uses KM_USER0, 1863 * @buf should be kernel's buffer. Because this function uses KM_USER0,
1822 * the caller should guarantee KM_USER0 is not used. 1864 * the caller should guarantee KM_USER0 is not used.
1823 * 1865 *
1824 * Note: In usual ops, vread() is never necessary because the caller 1866 * Note: In usual ops, vread() is never necessary because the caller
1825 * should know vmalloc() area is valid and can use memcpy(). 1867 * should know vmalloc() area is valid and can use memcpy().
1826 * This is for routines which have to access vmalloc area without 1868 * This is for routines which have to access vmalloc area without
1827 * any information, such as /dev/kmem. 1869 * any information, such as /dev/kmem.
1828 * 1870 *
1829 */ 1871 */
1830 1872
1831 long vread(char *buf, char *addr, unsigned long count) 1873 long vread(char *buf, char *addr, unsigned long count)
1832 { 1874 {
1833 struct vm_struct *tmp; 1875 struct vm_struct *tmp;
1834 char *vaddr, *buf_start = buf; 1876 char *vaddr, *buf_start = buf;
1835 unsigned long buflen = count; 1877 unsigned long buflen = count;
1836 unsigned long n; 1878 unsigned long n;
1837 1879
1838 /* Don't allow overflow */ 1880 /* Don't allow overflow */
1839 if ((unsigned long) addr + count < count) 1881 if ((unsigned long) addr + count < count)
1840 count = -(unsigned long) addr; 1882 count = -(unsigned long) addr;
1841 1883
1842 read_lock(&vmlist_lock); 1884 read_lock(&vmlist_lock);
1843 for (tmp = vmlist; count && tmp; tmp = tmp->next) { 1885 for (tmp = vmlist; count && tmp; tmp = tmp->next) {
1844 vaddr = (char *) tmp->addr; 1886 vaddr = (char *) tmp->addr;
1845 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1887 if (addr >= vaddr + tmp->size - PAGE_SIZE)
1846 continue; 1888 continue;
1847 while (addr < vaddr) { 1889 while (addr < vaddr) {
1848 if (count == 0) 1890 if (count == 0)
1849 goto finished; 1891 goto finished;
1850 *buf = '\0'; 1892 *buf = '\0';
1851 buf++; 1893 buf++;
1852 addr++; 1894 addr++;
1853 count--; 1895 count--;
1854 } 1896 }
1855 n = vaddr + tmp->size - PAGE_SIZE - addr; 1897 n = vaddr + tmp->size - PAGE_SIZE - addr;
1856 if (n > count) 1898 if (n > count)
1857 n = count; 1899 n = count;
1858 if (!(tmp->flags & VM_IOREMAP)) 1900 if (!(tmp->flags & VM_IOREMAP))
1859 aligned_vread(buf, addr, n); 1901 aligned_vread(buf, addr, n);
1860 else /* IOREMAP area is treated as memory hole */ 1902 else /* IOREMAP area is treated as memory hole */
1861 memset(buf, 0, n); 1903 memset(buf, 0, n);
1862 buf += n; 1904 buf += n;
1863 addr += n; 1905 addr += n;
1864 count -= n; 1906 count -= n;
1865 } 1907 }
1866 finished: 1908 finished:
1867 read_unlock(&vmlist_lock); 1909 read_unlock(&vmlist_lock);
1868 1910
1869 if (buf == buf_start) 1911 if (buf == buf_start)
1870 return 0; 1912 return 0;
1871 /* zero-fill memory holes */ 1913 /* zero-fill memory holes */
1872 if (buf != buf_start + buflen) 1914 if (buf != buf_start + buflen)
1873 memset(buf, 0, buflen - (buf - buf_start)); 1915 memset(buf, 0, buflen - (buf - buf_start));
1874 1916
1875 return buflen; 1917 return buflen;
1876 } 1918 }
1877 1919
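
A minimal caller sketch, not part of this change and with hypothetical names (dump_vmalloc_range), showing the /dev/kmem-style use case described above: vread() tolerates holes and unmapped pages in the range, which a plain memcpy() would fault on, and returns the holes zero-filled.

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>

/* Hypothetical helper: copy a possibly sparse vmalloc range to userspace. */
static ssize_t dump_vmalloc_range(char __user *ubuf, char *kaddr,
				  unsigned long len)
{
	char *kbuf;
	long copied;
	ssize_t ret;

	kbuf = kmalloc(len, GFP_KERNEL);	/* assume len is small */
	if (!kbuf)
		return -ENOMEM;

	copied = vread(kbuf, kaddr, len);	/* 0 means no live vmalloc area was hit */
	if (copied && copy_to_user(ubuf, kbuf, copied))
		ret = -EFAULT;
	else
		ret = copied;

	kfree(kbuf);
	return ret;
}
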
1878 /** 1920 /**
1879 * vwrite() - write vmalloc area in a safe way. 1921 * vwrite() - write vmalloc area in a safe way.
1880 * @buf: buffer for source data 1922 * @buf: buffer for source data
1881 * @addr: vm address. 1923 * @addr: vm address.
1882 * @count: number of bytes to be written. 1924 * @count: number of bytes to be written.
1883 * 1925 *
1884 * Returns # of bytes by which addr and buf should be increased 1926 * Returns # of bytes by which addr and buf should be increased
1885 * (same number as @count). 1927 * (same number as @count).
1886 * If [addr...addr+count) doesn't include any intersection with a valid 1928 * If [addr...addr+count) doesn't include any intersection with a valid
1887 * vmalloc area, returns 0. 1929 * vmalloc area, returns 0.
1888 * 1930 *
1889 * This function checks that addr is a valid vmalloc'ed area, and 1931 * This function checks that addr is a valid vmalloc'ed area, and
1890 * copies data from a buffer to the given addr. If the specified range of 1932 * copies data from a buffer to the given addr. If the specified range of
1891 * [addr...addr+count) includes some valid address, data is copied from 1933 * [addr...addr+count) includes some valid address, data is copied from
1892 * the proper area of @buf. If there are memory holes, no copy is done to them. 1934 * the proper area of @buf. If there are memory holes, no copy is done to them.
1893 * An IOREMAP area is treated as a memory hole and no copy is done. 1935 * An IOREMAP area is treated as a memory hole and no copy is done.
1894 * 1936 *
1895 * If [addr...addr+count) doesn't include any intersection with a live 1937 * If [addr...addr+count) doesn't include any intersection with a live
1896 * vm_struct area, returns 0. 1938 * vm_struct area, returns 0.
1897 * @buf should be a kernel buffer. Because this function uses KM_USER0, 1939 * @buf should be a kernel buffer. Because this function uses KM_USER0,
1898 * the caller should guarantee KM_USER0 is not used. 1940 * the caller should guarantee KM_USER0 is not used.
1899 * 1941 *
1900 * Note: In usual ops, vwrite() is never necessary because the caller 1942 * Note: In usual ops, vwrite() is never necessary because the caller
1901 * should know the vmalloc() area is valid and can use memcpy(). 1943 * should know the vmalloc() area is valid and can use memcpy().
1902 * This is for routines which have to access the vmalloc area without 1944 * This is for routines which have to access the vmalloc area without
1903 * any information, such as /dev/kmem. 1945 * any information, such as /dev/kmem.
1904 * 1946 *
1905 * The caller should guarantee KM_USER1 is not used. 1947 * The caller should guarantee KM_USER1 is not used.
1906 */ 1948 */
1907 1949
1908 long vwrite(char *buf, char *addr, unsigned long count) 1950 long vwrite(char *buf, char *addr, unsigned long count)
1909 { 1951 {
1910 struct vm_struct *tmp; 1952 struct vm_struct *tmp;
1911 char *vaddr; 1953 char *vaddr;
1912 unsigned long n, buflen; 1954 unsigned long n, buflen;
1913 int copied = 0; 1955 int copied = 0;
1914 1956
1915 /* Don't allow overflow */ 1957 /* Don't allow overflow */
1916 if ((unsigned long) addr + count < count) 1958 if ((unsigned long) addr + count < count)
1917 count = -(unsigned long) addr; 1959 count = -(unsigned long) addr;
1918 buflen = count; 1960 buflen = count;
1919 1961
1920 read_lock(&vmlist_lock); 1962 read_lock(&vmlist_lock);
1921 for (tmp = vmlist; count && tmp; tmp = tmp->next) { 1963 for (tmp = vmlist; count && tmp; tmp = tmp->next) {
1922 vaddr = (char *) tmp->addr; 1964 vaddr = (char *) tmp->addr;
1923 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1965 if (addr >= vaddr + tmp->size - PAGE_SIZE)
1924 continue; 1966 continue;
1925 while (addr < vaddr) { 1967 while (addr < vaddr) {
1926 if (count == 0) 1968 if (count == 0)
1927 goto finished; 1969 goto finished;
1928 buf++; 1970 buf++;
1929 addr++; 1971 addr++;
1930 count--; 1972 count--;
1931 } 1973 }
1932 n = vaddr + tmp->size - PAGE_SIZE - addr; 1974 n = vaddr + tmp->size - PAGE_SIZE - addr;
1933 if (n > count) 1975 if (n > count)
1934 n = count; 1976 n = count;
1935 if (!(tmp->flags & VM_IOREMAP)) { 1977 if (!(tmp->flags & VM_IOREMAP)) {
1936 aligned_vwrite(buf, addr, n); 1978 aligned_vwrite(buf, addr, n);
1937 copied++; 1979 copied++;
1938 } 1980 }
1939 buf += n; 1981 buf += n;
1940 addr += n; 1982 addr += n;
1941 count -= n; 1983 count -= n;
1942 } 1984 }
1943 finished: 1985 finished:
1944 read_unlock(&vmlist_lock); 1986 read_unlock(&vmlist_lock);
1945 if (!copied) 1987 if (!copied)
1946 return 0; 1988 return 0;
1947 return buflen; 1989 return buflen;
1948 } 1990 }
1949 1991
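
A hypothetical mirror of the read sketch above (poke_vmalloc_range is not a real kernel function): vwrite() silently skips holes and VM_IOREMAP ranges, and returns 0 when nothing in the range belongs to a live area.

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>

/* Hypothetical helper: push user data into a live vmalloc range. */
static ssize_t poke_vmalloc_range(char *kaddr, const char __user *ubuf,
				  unsigned long len)
{
	char *kbuf;
	long written;

	kbuf = kmalloc(len, GFP_KERNEL);
	if (!kbuf)
		return -ENOMEM;
	if (copy_from_user(kbuf, ubuf, len)) {
		kfree(kbuf);
		return -EFAULT;
	}

	written = vwrite(kbuf, kaddr, len);	/* 0 means nothing intersected a live area */

	kfree(kbuf);
	return written;
}
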
1950 /** 1992 /**
1951 * remap_vmalloc_range - map vmalloc pages to userspace 1993 * remap_vmalloc_range - map vmalloc pages to userspace
1952 * @vma: vma to cover (map full range of vma) 1994 * @vma: vma to cover (map full range of vma)
1953 * @addr: vmalloc memory 1995 * @addr: vmalloc memory
1954 * @pgoff: number of pages into addr before first page to map 1996 * @pgoff: number of pages into addr before first page to map
1955 * 1997 *
1956 * Returns: 0 for success, -Exxx on failure 1998 * Returns: 0 for success, -Exxx on failure
1957 * 1999 *
1958 * This function checks that addr is a valid vmalloc'ed area, and 2000 * This function checks that addr is a valid vmalloc'ed area, and
1959 * that it is big enough to cover the vma. Will return failure if 2001 * that it is big enough to cover the vma. Will return failure if
1960 * that criterion isn't met. 2002 * that criterion isn't met.
1961 * 2003 *
1962 * Similar to remap_pfn_range() (see mm/memory.c) 2004 * Similar to remap_pfn_range() (see mm/memory.c)
1963 */ 2005 */
1964 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 2006 int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1965 unsigned long pgoff) 2007 unsigned long pgoff)
1966 { 2008 {
1967 struct vm_struct *area; 2009 struct vm_struct *area;
1968 unsigned long uaddr = vma->vm_start; 2010 unsigned long uaddr = vma->vm_start;
1969 unsigned long usize = vma->vm_end - vma->vm_start; 2011 unsigned long usize = vma->vm_end - vma->vm_start;
1970 2012
1971 if ((PAGE_SIZE-1) & (unsigned long)addr) 2013 if ((PAGE_SIZE-1) & (unsigned long)addr)
1972 return -EINVAL; 2014 return -EINVAL;
1973 2015
1974 area = find_vm_area(addr); 2016 area = find_vm_area(addr);
1975 if (!area) 2017 if (!area)
1976 return -EINVAL; 2018 return -EINVAL;
1977 2019
1978 if (!(area->flags & VM_USERMAP)) 2020 if (!(area->flags & VM_USERMAP))
1979 return -EINVAL; 2021 return -EINVAL;
1980 2022
1981 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) 2023 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
1982 return -EINVAL; 2024 return -EINVAL;
1983 2025
1984 addr += pgoff << PAGE_SHIFT; 2026 addr += pgoff << PAGE_SHIFT;
1985 do { 2027 do {
1986 struct page *page = vmalloc_to_page(addr); 2028 struct page *page = vmalloc_to_page(addr);
1987 int ret; 2029 int ret;
1988 2030
1989 ret = vm_insert_page(vma, uaddr, page); 2031 ret = vm_insert_page(vma, uaddr, page);
1990 if (ret) 2032 if (ret)
1991 return ret; 2033 return ret;
1992 2034
1993 uaddr += PAGE_SIZE; 2035 uaddr += PAGE_SIZE;
1994 addr += PAGE_SIZE; 2036 addr += PAGE_SIZE;
1995 usize -= PAGE_SIZE; 2037 usize -= PAGE_SIZE;
1996 } while (usize > 0); 2038 } while (usize > 0);
1997 2039
1998 /* Prevent "things" like memory migration? VM_flags need a cleanup... */ 2040 /* Prevent "things" like memory migration? VM_flags need a cleanup... */
1999 vma->vm_flags |= VM_RESERVED; 2041 vma->vm_flags |= VM_RESERVED;
2000 2042
2001 return 0; 2043 return 0;
2002 } 2044 }
2003 EXPORT_SYMBOL(remap_vmalloc_range); 2045 EXPORT_SYMBOL(remap_vmalloc_range);
2004 2046
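
A hypothetical driver mmap handler (mydev_buf, mydev_mmap and MYDEV_BUF_SIZE are invented names) illustrating the VM_USERMAP check above: the backing buffer has to come from vmalloc_user() or another VM_USERMAP allocation, otherwise remap_vmalloc_range() returns -EINVAL.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *mydev_buf;		/* vmalloc_user(MYDEV_BUF_SIZE) at probe time */

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* vm_pgoff selects which page of the buffer the mapping starts at */
	return remap_vmalloc_range(vma, mydev_buf, vma->vm_pgoff);
}
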
2005 /* 2047 /*
2006 * Implement a stub for vmalloc_sync_all() if the architecture chose not to 2048 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
2007 * have one. 2049 * have one.
2008 */ 2050 */
2009 void __attribute__((weak)) vmalloc_sync_all(void) 2051 void __attribute__((weak)) vmalloc_sync_all(void)
2010 { 2052 {
2011 } 2053 }
2012 2054
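
For reference, the weak attribute means an architecture overrides this stub simply by providing a strong definition of the same symbol; x86, for example, supplies its own vmalloc_sync_all() (in arch/x86/mm/fault.c at the time of this commit). A schematic of such an override, with the body elided:

/* In arch code: a non-weak definition wins over the weak stub above. */
void vmalloc_sync_all(void)
{
	/* propagate vmalloc-range page-table entries into every pgd ... */
}
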
2013 2055
2014 static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) 2056 static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
2015 { 2057 {
2016 /* apply_to_page_range() does all the hard work. */ 2058 /* apply_to_page_range() does all the hard work. */
2017 return 0; 2059 return 0;
2018 } 2060 }
2019 2061
2020 /** 2062 /**
2021 * alloc_vm_area - allocate a range of kernel address space 2063 * alloc_vm_area - allocate a range of kernel address space
2022 * @size: size of the area 2064 * @size: size of the area
2023 * 2065 *
2024 * Returns: NULL on failure, vm_struct on success 2066 * Returns: NULL on failure, vm_struct on success
2025 * 2067 *
2026 * This function reserves a range of kernel address space, and 2068 * This function reserves a range of kernel address space, and
2027 * allocates pagetables to map that range. No actual mappings 2069 * allocates pagetables to map that range. No actual mappings
2028 * are created. If the kernel address space is not shared 2070 * are created. If the kernel address space is not shared
2029 * between processes, it syncs the pagetable across all 2071 * between processes, it syncs the pagetable across all
2030 * processes. 2072 * processes.
2031 */ 2073 */
2032 struct vm_struct *alloc_vm_area(size_t size) 2074 struct vm_struct *alloc_vm_area(size_t size)
2033 { 2075 {
2034 struct vm_struct *area; 2076 struct vm_struct *area;
2035 2077
2036 area = get_vm_area_caller(size, VM_IOREMAP, 2078 area = get_vm_area_caller(size, VM_IOREMAP,
2037 __builtin_return_address(0)); 2079 __builtin_return_address(0));
2038 if (area == NULL) 2080 if (area == NULL)
2039 return NULL; 2081 return NULL;
2040 2082
2041 /* 2083 /*
2042 * This ensures that page tables are constructed for this region 2084 * This ensures that page tables are constructed for this region
2043 * of kernel virtual address space and mapped into init_mm. 2085 * of kernel virtual address space and mapped into init_mm.
2044 */ 2086 */
2045 if (apply_to_page_range(&init_mm, (unsigned long)area->addr, 2087 if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
2046 area->size, f, NULL)) { 2088 area->size, f, NULL)) {
2047 free_vm_area(area); 2089 free_vm_area(area);
2048 return NULL; 2090 return NULL;
2049 } 2091 }
2050 2092
2051 /* Make sure the pagetables are constructed in process kernel 2093 /* Make sure the pagetables are constructed in process kernel
2052 mappings */ 2094 mappings */
2053 vmalloc_sync_all(); 2095 vmalloc_sync_all();
2054 2096
2055 return area; 2097 return area;
2056 } 2098 }
2057 EXPORT_SYMBOL_GPL(alloc_vm_area); 2099 EXPORT_SYMBOL_GPL(alloc_vm_area);
2058 2100
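
A hedged usage sketch with hypothetical names (reserve_hole, release_hole): this pairing reserves a page-table-backed hole in kernel address space that is later populated by other means, the pattern the Xen grant-table code relies on, and is torn down with free_vm_area() just below.

#include <linux/kernel.h>
#include <linux/vmalloc.h>

static struct vm_struct *my_hole;

static int reserve_hole(size_t size)
{
	my_hole = alloc_vm_area(size);	/* page tables built, no pages mapped */
	if (!my_hole)
		return -ENOMEM;

	pr_info("reserved %lu bytes at %p\n",
		(unsigned long)my_hole->size, my_hole->addr);
	return 0;
}

static void release_hole(void)
{
	free_vm_area(my_hole);		/* unmaps the range and kfree()s the vm_struct */
}
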
2059 void free_vm_area(struct vm_struct *area) 2101 void free_vm_area(struct vm_struct *area)
2060 { 2102 {
2061 struct vm_struct *ret; 2103 struct vm_struct *ret;
2062 ret = remove_vm_area(area->addr); 2104 ret = remove_vm_area(area->addr);
2063 BUG_ON(ret != area); 2105 BUG_ON(ret != area);
2064 kfree(area); 2106 kfree(area);
2065 } 2107 }
2066 EXPORT_SYMBOL_GPL(free_vm_area); 2108 EXPORT_SYMBOL_GPL(free_vm_area);
2067 2109
2068 #ifdef CONFIG_SMP 2110 #ifdef CONFIG_SMP
2069 static struct vmap_area *node_to_va(struct rb_node *n) 2111 static struct vmap_area *node_to_va(struct rb_node *n)
2070 { 2112 {
2071 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; 2113 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
2072 } 2114 }
2073 2115
2074 /** 2116 /**
2075 * pvm_find_next_prev - find the next and prev vmap_area surrounding @end 2117 * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
2076 * @end: target address 2118 * @end: target address
2077 * @pnext: out arg for the next vmap_area 2119 * @pnext: out arg for the next vmap_area
2078 * @pprev: out arg for the previous vmap_area 2120 * @pprev: out arg for the previous vmap_area
2079 * 2121 *
2080 * Returns: %true if either or both of next and prev are found, 2122 * Returns: %true if either or both of next and prev are found,
2081 * %false if no vmap_area exists 2123 * %false if no vmap_area exists
2082 * 2124 *
2083 * Find the vmap_areas whose end addresses enclose @end, i.e. if not 2125 * Find the vmap_areas whose end addresses enclose @end, i.e. if not
2084 * NULL, *pnext->va_end > @end and *pprev->va_end <= @end. 2126 * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
2085 */ 2127 */
2086 static bool pvm_find_next_prev(unsigned long end, 2128 static bool pvm_find_next_prev(unsigned long end,
2087 struct vmap_area **pnext, 2129 struct vmap_area **pnext,
2088 struct vmap_area **pprev) 2130 struct vmap_area **pprev)
2089 { 2131 {
2090 struct rb_node *n = vmap_area_root.rb_node; 2132 struct rb_node *n = vmap_area_root.rb_node;
2091 struct vmap_area *va = NULL; 2133 struct vmap_area *va = NULL;
2092 2134
2093 while (n) { 2135 while (n) {
2094 va = rb_entry(n, struct vmap_area, rb_node); 2136 va = rb_entry(n, struct vmap_area, rb_node);
2095 if (end < va->va_end) 2137 if (end < va->va_end)
2096 n = n->rb_left; 2138 n = n->rb_left;
2097 else if (end > va->va_end) 2139 else if (end > va->va_end)
2098 n = n->rb_right; 2140 n = n->rb_right;
2099 else 2141 else
2100 break; 2142 break;
2101 } 2143 }
2102 2144
2103 if (!va) 2145 if (!va)
2104 return false; 2146 return false;
2105 2147
2106 if (va->va_end > end) { 2148 if (va->va_end > end) {
2107 *pnext = va; 2149 *pnext = va;
2108 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); 2150 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2109 } else { 2151 } else {
2110 *pprev = va; 2152 *pprev = va;
2111 *pnext = node_to_va(rb_next(&(*pprev)->rb_node)); 2153 *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
2112 } 2154 }
2113 return true; 2155 return true;
2114 } 2156 }
2115 2157
2116 /** 2158 /**
2117 * pvm_determine_end - find the highest aligned address between two vmap_areas 2159 * pvm_determine_end - find the highest aligned address between two vmap_areas
2118 * @pnext: in/out arg for the next vmap_area 2160 * @pnext: in/out arg for the next vmap_area
2119 * @pprev: in/out arg for the previous vmap_area 2161 * @pprev: in/out arg for the previous vmap_area
2120 * @align: alignment 2162 * @align: alignment
2121 * 2163 *
2122 * Returns: determined end address 2164 * Returns: determined end address
2123 * 2165 *
2124 * Find the highest aligned address between *@pnext and *@pprev below 2166 * Find the highest aligned address between *@pnext and *@pprev below
2125 * VMALLOC_END. *@pnext and *@pprev are adjusted so that the 2167 * VMALLOC_END. *@pnext and *@pprev are adjusted so that the
2126 * aligned-down address is between the end addresses of the two vmap_areas. 2168 * aligned-down address is between the end addresses of the two vmap_areas.
2127 * 2169 *
2128 * Please note that the address returned by this function may fall 2170 * Please note that the address returned by this function may fall
2129 * inside *@pnext vmap_area. The caller is responsible for checking 2171 * inside *@pnext vmap_area. The caller is responsible for checking
2130 * that. 2172 * that.
2131 */ 2173 */
2132 static unsigned long pvm_determine_end(struct vmap_area **pnext, 2174 static unsigned long pvm_determine_end(struct vmap_area **pnext,
2133 struct vmap_area **pprev, 2175 struct vmap_area **pprev,
2134 unsigned long align) 2176 unsigned long align)
2135 { 2177 {
2136 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 2178 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2137 unsigned long addr; 2179 unsigned long addr;
2138 2180
2139 if (*pnext) 2181 if (*pnext)
2140 addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end); 2182 addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
2141 else 2183 else
2142 addr = vmalloc_end; 2184 addr = vmalloc_end;
2143 2185
2144 while (*pprev && (*pprev)->va_end > addr) { 2186 while (*pprev && (*pprev)->va_end > addr) {
2145 *pnext = *pprev; 2187 *pnext = *pprev;
2146 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); 2188 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2147 } 2189 }
2148 2190
2149 return addr; 2191 return addr;
2150 } 2192 }
2151 2193
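
A worked illustration with made-up values may help here; it only restates what the masking and min() above compute.

/*
 * Illustration (invented numbers): align = 0x10000 (64KB),
 * VMALLOC_END = 0xfff00000, (*pnext)->va_start = 0xffe2a000:
 *
 *	addr = min(0xffe2a000 & ~0xffff, 0xfff00000 & ~0xffff)
 *	     = min(0xffe20000, 0xfff00000)
 *	     = 0xffe20000
 *
 * *pprev is then walked backwards until its va_end is <= 0xffe20000,
 * and the caller may place its block just below the returned address.
 */
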
2152 /** 2194 /**
2153 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator 2195 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
2154 * @offsets: array containing offset of each area 2196 * @offsets: array containing offset of each area
2155 * @sizes: array containing size of each area 2197 * @sizes: array containing size of each area
2156 * @nr_vms: the number of areas to allocate 2198 * @nr_vms: the number of areas to allocate
2157 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 2199 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
2158 * @gfp_mask: allocation mask 2200 * @gfp_mask: allocation mask
2159 * 2201 *
2160 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 2202 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
2161 * vm_structs on success, %NULL on failure 2203 * vm_structs on success, %NULL on failure
2162 * 2204 *
2163 * Percpu allocator wants to use congruent vm areas so that it can 2205 * Percpu allocator wants to use congruent vm areas so that it can
2164 * maintain the offsets among percpu areas. This function allocates 2206 * maintain the offsets among percpu areas. This function allocates
2165 * congruent vmalloc areas for it. These areas tend to be scattered 2207 * congruent vmalloc areas for it. These areas tend to be scattered
2166 * far apart, with the distance between two areas easily reaching 2208 * far apart, with the distance between two areas easily reaching
2167 * gigabytes. To avoid interacting with regular vmallocs, these areas 2209 * gigabytes. To avoid interacting with regular vmallocs, these areas
2168 * are allocated from the top. 2210 * are allocated from the top.
2169 * 2211 *
2170 * Despite its complicated look, this allocator is rather simple. It 2212 * Despite its complicated look, this allocator is rather simple. It
2171 * does everything top-down and scans areas from the end looking for 2213 * does everything top-down and scans areas from the end looking for
2172 * a matching slot. While scanning, if any of the areas overlaps with 2214 * a matching slot. While scanning, if any of the areas overlaps with
2173 * an existing vmap_area, the base address is pulled down to fit the 2215 * an existing vmap_area, the base address is pulled down to fit the
2174 * area. Scanning is repeated until all the areas fit, and then all 2216 * area. Scanning is repeated until all the areas fit, and then all
2175 * necessary data structures are inserted and the result is returned. 2217 * necessary data structures are inserted and the result is returned.
2176 */ 2218 */
2177 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 2219 struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2178 const size_t *sizes, int nr_vms, 2220 const size_t *sizes, int nr_vms,
2179 size_t align, gfp_t gfp_mask) 2221 size_t align, gfp_t gfp_mask)
2180 { 2222 {
2181 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 2223 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
2182 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 2224 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2183 struct vmap_area **vas, *prev, *next; 2225 struct vmap_area **vas, *prev, *next;
2184 struct vm_struct **vms; 2226 struct vm_struct **vms;
2185 int area, area2, last_area, term_area; 2227 int area, area2, last_area, term_area;
2186 unsigned long base, start, end, last_end; 2228 unsigned long base, start, end, last_end;
2187 bool purged = false; 2229 bool purged = false;
2188 2230
2189 gfp_mask &= GFP_RECLAIM_MASK; 2231 gfp_mask &= GFP_RECLAIM_MASK;
2190 2232
2191 /* verify parameters and allocate data structures */ 2233 /* verify parameters and allocate data structures */
2192 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); 2234 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
2193 for (last_area = 0, area = 0; area < nr_vms; area++) { 2235 for (last_area = 0, area = 0; area < nr_vms; area++) {
2194 start = offsets[area]; 2236 start = offsets[area];
2195 end = start + sizes[area]; 2237 end = start + sizes[area];
2196 2238
2197 /* is everything aligned properly? */ 2239 /* is everything aligned properly? */
2198 BUG_ON(!IS_ALIGNED(offsets[area], align)); 2240 BUG_ON(!IS_ALIGNED(offsets[area], align));
2199 BUG_ON(!IS_ALIGNED(sizes[area], align)); 2241 BUG_ON(!IS_ALIGNED(sizes[area], align));
2200 2242
2201 /* detect the area with the highest address */ 2243 /* detect the area with the highest address */
2202 if (start > offsets[last_area]) 2244 if (start > offsets[last_area])
2203 last_area = area; 2245 last_area = area;
2204 2246
2205 for (area2 = 0; area2 < nr_vms; area2++) { 2247 for (area2 = 0; area2 < nr_vms; area2++) {
2206 unsigned long start2 = offsets[area2]; 2248 unsigned long start2 = offsets[area2];
2207 unsigned long end2 = start2 + sizes[area2]; 2249 unsigned long end2 = start2 + sizes[area2];
2208 2250
2209 if (area2 == area) 2251 if (area2 == area)
2210 continue; 2252 continue;
2211 2253
2212 BUG_ON(start2 >= start && start2 < end); 2254 BUG_ON(start2 >= start && start2 < end);
2213 BUG_ON(end2 <= end && end2 > start); 2255 BUG_ON(end2 <= end && end2 > start);
2214 } 2256 }
2215 } 2257 }
2216 last_end = offsets[last_area] + sizes[last_area]; 2258 last_end = offsets[last_area] + sizes[last_area];
2217 2259
2218 if (vmalloc_end - vmalloc_start < last_end) { 2260 if (vmalloc_end - vmalloc_start < last_end) {
2219 WARN_ON(true); 2261 WARN_ON(true);
2220 return NULL; 2262 return NULL;
2221 } 2263 }
2222 2264
2223 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); 2265 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
2224 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); 2266 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
2225 if (!vas || !vms) 2267 if (!vas || !vms)
2226 goto err_free; 2268 goto err_free;
2227 2269
2228 for (area = 0; area < nr_vms; area++) { 2270 for (area = 0; area < nr_vms; area++) {
2229 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); 2271 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
2230 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); 2272 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
2231 if (!vas[area] || !vms[area]) 2273 if (!vas[area] || !vms[area])
2232 goto err_free; 2274 goto err_free;
2233 } 2275 }
2234 retry: 2276 retry:
2235 spin_lock(&vmap_area_lock); 2277 spin_lock(&vmap_area_lock);
2236 2278
2237 /* start scanning - we scan from the top, begin with the last area */ 2279 /* start scanning - we scan from the top, begin with the last area */
2238 area = term_area = last_area; 2280 area = term_area = last_area;
2239 start = offsets[area]; 2281 start = offsets[area];
2240 end = start + sizes[area]; 2282 end = start + sizes[area];
2241 2283
2242 if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) { 2284 if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
2243 base = vmalloc_end - last_end; 2285 base = vmalloc_end - last_end;
2244 goto found; 2286 goto found;
2245 } 2287 }
2246 base = pvm_determine_end(&next, &prev, align) - end; 2288 base = pvm_determine_end(&next, &prev, align) - end;
2247 2289
2248 while (true) { 2290 while (true) {
2249 BUG_ON(next && next->va_end <= base + end); 2291 BUG_ON(next && next->va_end <= base + end);
2250 BUG_ON(prev && prev->va_end > base + end); 2292 BUG_ON(prev && prev->va_end > base + end);
2251 2293
2252 /* 2294 /*
2253 * base might have underflowed, add last_end before 2295 * base might have underflowed, add last_end before
2254 * comparing. 2296 * comparing.
2255 */ 2297 */
2256 if (base + last_end < vmalloc_start + last_end) { 2298 if (base + last_end < vmalloc_start + last_end) {
2257 spin_unlock(&vmap_area_lock); 2299 spin_unlock(&vmap_area_lock);
2258 if (!purged) { 2300 if (!purged) {
2259 purge_vmap_area_lazy(); 2301 purge_vmap_area_lazy();
2260 purged = true; 2302 purged = true;
2261 goto retry; 2303 goto retry;
2262 } 2304 }
2263 goto err_free; 2305 goto err_free;
2264 } 2306 }
2265 2307
2266 /* 2308 /*
2267 * If next overlaps, move base downwards so that it's 2309 * If next overlaps, move base downwards so that it's
2268 * right below next and then recheck. 2310 * right below next and then recheck.
2269 */ 2311 */
2270 if (next && next->va_start < base + end) { 2312 if (next && next->va_start < base + end) {
2271 base = pvm_determine_end(&next, &prev, align) - end; 2313 base = pvm_determine_end(&next, &prev, align) - end;
2272 term_area = area; 2314 term_area = area;
2273 continue; 2315 continue;
2274 } 2316 }
2275 2317
2276 /* 2318 /*
2277 * If prev overlaps, shift down next and prev and move 2319 * If prev overlaps, shift down next and prev and move
2278 * base so that it's right below new next and then 2320 * base so that it's right below new next and then
2279 * recheck. 2321 * recheck.
2280 */ 2322 */
2281 if (prev && prev->va_end > base + start) { 2323 if (prev && prev->va_end > base + start) {
2282 next = prev; 2324 next = prev;
2283 prev = node_to_va(rb_prev(&next->rb_node)); 2325 prev = node_to_va(rb_prev(&next->rb_node));
2284 base = pvm_determine_end(&next, &prev, align) - end; 2326 base = pvm_determine_end(&next, &prev, align) - end;
2285 term_area = area; 2327 term_area = area;
2286 continue; 2328 continue;
2287 } 2329 }
2288 2330
2289 /* 2331 /*
2290 * This area fits, move on to the previous one. If 2332 * This area fits, move on to the previous one. If
2291 * the previous one is the terminal one, we're done. 2333 * the previous one is the terminal one, we're done.
2292 */ 2334 */
2293 area = (area + nr_vms - 1) % nr_vms; 2335 area = (area + nr_vms - 1) % nr_vms;
2294 if (area == term_area) 2336 if (area == term_area)
2295 break; 2337 break;
2296 start = offsets[area]; 2338 start = offsets[area];
2297 end = start + sizes[area]; 2339 end = start + sizes[area];
2298 pvm_find_next_prev(base + end, &next, &prev); 2340 pvm_find_next_prev(base + end, &next, &prev);
2299 } 2341 }
2300 found: 2342 found:
2301 /* we've found a fitting base, insert all va's */ 2343 /* we've found a fitting base, insert all va's */
2302 for (area = 0; area < nr_vms; area++) { 2344 for (area = 0; area < nr_vms; area++) {
2303 struct vmap_area *va = vas[area]; 2345 struct vmap_area *va = vas[area];
2304 2346
2305 va->va_start = base + offsets[area]; 2347 va->va_start = base + offsets[area];
2306 va->va_end = va->va_start + sizes[area]; 2348 va->va_end = va->va_start + sizes[area];
2307 __insert_vmap_area(va); 2349 __insert_vmap_area(va);
2308 } 2350 }
2309 2351
2310 vmap_area_pcpu_hole = base + offsets[last_area]; 2352 vmap_area_pcpu_hole = base + offsets[last_area];
2311 2353
2312 spin_unlock(&vmap_area_lock); 2354 spin_unlock(&vmap_area_lock);
2313 2355
2314 /* insert all vm's */ 2356 /* insert all vm's */
2315 for (area = 0; area < nr_vms; area++) 2357 for (area = 0; area < nr_vms; area++)
2316 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, 2358 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
2317 pcpu_get_vm_areas); 2359 pcpu_get_vm_areas);
2318 2360
2319 kfree(vas); 2361 kfree(vas);
2320 return vms; 2362 return vms;
2321 2363
2322 err_free: 2364 err_free:
2323 for (area = 0; area < nr_vms; area++) { 2365 for (area = 0; area < nr_vms; area++) {
2324 if (vas) 2366 if (vas)
2325 kfree(vas[area]); 2367 kfree(vas[area]);
2326 if (vms) 2368 if (vms)
2327 kfree(vms[area]); 2369 kfree(vms[area]);
2328 } 2370 }
2329 kfree(vas); 2371 kfree(vas);
2330 kfree(vms); 2372 kfree(vms);
2331 return NULL; 2373 return NULL;
2332 } 2374 }
2333 2375
2334 /** 2376 /**
2335 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator 2377 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
2336 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() 2378 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
2337 * @nr_vms: the number of allocated areas 2379 * @nr_vms: the number of allocated areas
2338 * 2380 *
2339 * Free vm_structs and the array allocated by pcpu_get_vm_areas(). 2381 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
2340 */ 2382 */
2341 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) 2383 void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2342 { 2384 {
2343 int i; 2385 int i;
2344 2386
2345 for (i = 0; i < nr_vms; i++) 2387 for (i = 0; i < nr_vms; i++)
2346 free_vm_area(vms[i]); 2388 free_vm_area(vms[i]);
2347 kfree(vms); 2389 kfree(vms);
2348 } 2390 }
2349 #endif /* CONFIG_SMP */ 2391 #endif /* CONFIG_SMP */
2350 2392
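
A hypothetical call sketch (grab_congruent_areas is an invented name; the real caller lives in mm/percpu.c): two congruent areas whose starts keep a fixed offset from each other, which is exactly the property the percpu allocator needs, released again with pcpu_free_vm_areas().

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static int grab_congruent_areas(void)
{
	/* illustrative numbers only: two 256KB areas whose starts are 1MB apart */
	const unsigned long offsets[] = { 0, 1UL << 20 };
	const size_t sizes[] = { 1UL << 18, 1UL << 18 };
	struct vm_struct **vms;

	vms = pcpu_get_vm_areas(offsets, sizes, 2, PAGE_SIZE, GFP_KERNEL);
	if (!vms)
		return -ENOMEM;

	/* vms[1]->addr - vms[0]->addr is now exactly 1MB */

	pcpu_free_vm_areas(vms, 2);	/* tear everything down again */
	return 0;
}
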
2351 #ifdef CONFIG_PROC_FS 2393 #ifdef CONFIG_PROC_FS
2352 static void *s_start(struct seq_file *m, loff_t *pos) 2394 static void *s_start(struct seq_file *m, loff_t *pos)
2353 __acquires(&vmlist_lock) 2395 __acquires(&vmlist_lock)
2354 { 2396 {
2355 loff_t n = *pos; 2397 loff_t n = *pos;
2356 struct vm_struct *v; 2398 struct vm_struct *v;
2357 2399
2358 read_lock(&vmlist_lock); 2400 read_lock(&vmlist_lock);
2359 v = vmlist; 2401 v = vmlist;
2360 while (n > 0 && v) { 2402 while (n > 0 && v) {
2361 n--; 2403 n--;
2362 v = v->next; 2404 v = v->next;
2363 } 2405 }
2364 if (!n) 2406 if (!n)
2365 return v; 2407 return v;
2366 2408
2367 return NULL; 2409 return NULL;
2368 2410
2369 } 2411 }
2370 2412
2371 static void *s_next(struct seq_file *m, void *p, loff_t *pos) 2413 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2372 { 2414 {
2373 struct vm_struct *v = p; 2415 struct vm_struct *v = p;
2374 2416
2375 ++*pos; 2417 ++*pos;
2376 return v->next; 2418 return v->next;
2377 } 2419 }
2378 2420
2379 static void s_stop(struct seq_file *m, void *p) 2421 static void s_stop(struct seq_file *m, void *p)
2380 __releases(&vmlist_lock) 2422 __releases(&vmlist_lock)
2381 { 2423 {
2382 read_unlock(&vmlist_lock); 2424 read_unlock(&vmlist_lock);
2383 } 2425 }
2384 2426
2385 static void show_numa_info(struct seq_file *m, struct vm_struct *v) 2427 static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2386 { 2428 {
2387 if (NUMA_BUILD) { 2429 if (NUMA_BUILD) {
2388 unsigned int nr, *counters = m->private; 2430 unsigned int nr, *counters = m->private;
2389 2431
2390 if (!counters) 2432 if (!counters)
2391 return; 2433 return;
2392 2434
2393 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 2435 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
2394 2436
2395 for (nr = 0; nr < v->nr_pages; nr++) 2437 for (nr = 0; nr < v->nr_pages; nr++)
2396 counters[page_to_nid(v->pages[nr])]++; 2438 counters[page_to_nid(v->pages[nr])]++;
2397 2439
2398 for_each_node_state(nr, N_HIGH_MEMORY) 2440 for_each_node_state(nr, N_HIGH_MEMORY)
2399 if (counters[nr]) 2441 if (counters[nr])
2400 seq_printf(m, " N%u=%u", nr, counters[nr]); 2442 seq_printf(m, " N%u=%u", nr, counters[nr]);
2401 } 2443 }
2402 } 2444 }
2403 2445
2404 static int s_show(struct seq_file *m, void *p) 2446 static int s_show(struct seq_file *m, void *p)
2405 { 2447 {
2406 struct vm_struct *v = p; 2448 struct vm_struct *v = p;
2407 2449
2408 seq_printf(m, "0x%p-0x%p %7ld", 2450 seq_printf(m, "0x%p-0x%p %7ld",
2409 v->addr, v->addr + v->size, v->size); 2451 v->addr, v->addr + v->size, v->size);
2410 2452
2411 if (v->caller) { 2453 if (v->caller) {
2412 char buff[KSYM_SYMBOL_LEN]; 2454 char buff[KSYM_SYMBOL_LEN];
2413 2455
2414 seq_putc(m, ' '); 2456 seq_putc(m, ' ');
2415 sprint_symbol(buff, (unsigned long)v->caller); 2457 sprint_symbol(buff, (unsigned long)v->caller);
2416 seq_puts(m, buff); 2458 seq_puts(m, buff);
2417 } 2459 }
2418 2460
2419 if (v->nr_pages) 2461 if (v->nr_pages)
2420 seq_printf(m, " pages=%d", v->nr_pages); 2462 seq_printf(m, " pages=%d", v->nr_pages);
2421 2463
2422 if (v->phys_addr) 2464 if (v->phys_addr)
2423 seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); 2465 seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr);
2424 2466
2425 if (v->flags & VM_IOREMAP) 2467 if (v->flags & VM_IOREMAP)
2426 seq_printf(m, " ioremap"); 2468 seq_printf(m, " ioremap");
2427 2469
2428 if (v->flags & VM_ALLOC) 2470 if (v->flags & VM_ALLOC)
2429 seq_printf(m, " vmalloc"); 2471 seq_printf(m, " vmalloc");
2430 2472
2431 if (v->flags & VM_MAP) 2473 if (v->flags & VM_MAP)
2432 seq_printf(m, " vmap"); 2474 seq_printf(m, " vmap");
2433 2475
2434 if (v->flags & VM_USERMAP) 2476 if (v->flags & VM_USERMAP)
2435 seq_printf(m, " user"); 2477 seq_printf(m, " user");
2436 2478
2437 if (v->flags & VM_VPAGES) 2479 if (v->flags & VM_VPAGES)
2438 seq_printf(m, " vpages"); 2480 seq_printf(m, " vpages");
2439 2481
2440 show_numa_info(m, v); 2482 show_numa_info(m, v);
2441 seq_putc(m, '\n'); 2483 seq_putc(m, '\n');
2442 return 0; 2484 return 0;
2443 } 2485 }
2444 2486
2445 static const struct seq_operations vmalloc_op = { 2487 static const struct seq_operations vmalloc_op = {
2446 .start = s_start, 2488 .start = s_start,
2447 .next = s_next, 2489 .next = s_next,
2448 .stop = s_stop, 2490 .stop = s_stop,
2449 .show = s_show, 2491 .show = s_show,
2450 }; 2492 };
2451 2493
2452 static int vmalloc_open(struct inode *inode, struct file *file) 2494 static int vmalloc_open(struct inode *inode, struct file *file)
2453 { 2495 {
2454 unsigned int *ptr = NULL; 2496 unsigned int *ptr = NULL;
2455 int ret; 2497 int ret;
2456 2498
2457 if (NUMA_BUILD) { 2499 if (NUMA_BUILD) {
2458 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2500 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
2459 if (ptr == NULL) 2501 if (ptr == NULL)
2460 return -ENOMEM; 2502 return -ENOMEM;
2461 } 2503 }
2462 ret = seq_open(file, &vmalloc_op); 2504 ret = seq_open(file, &vmalloc_op);
2463 if (!ret) { 2505 if (!ret) {
2464 struct seq_file *m = file->private_data; 2506 struct seq_file *m = file->private_data;
2465 m->private = ptr; 2507 m->private = ptr;
2466 } else 2508 } else
2467 kfree(ptr); 2509 kfree(ptr);
2468 return ret; 2510 return ret;
2469 } 2511 }
2470 2512
2471 static const struct file_operations proc_vmalloc_operations = { 2513 static const struct file_operations proc_vmalloc_operations = {
2472 .open = vmalloc_open, 2514 .open = vmalloc_open,
2473 .read = seq_read, 2515 .read = seq_read,
2474 .llseek = seq_lseek, 2516 .llseek = seq_lseek,
2475 .release = seq_release_private, 2517 .release = seq_release_private,
2476 }; 2518 };
2477 2519
2478 static int __init proc_vmalloc_init(void) 2520 static int __init proc_vmalloc_init(void)
2479 { 2521 {
2480 proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations); 2522 proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
2481 return 0; 2523 return 0;
2482 } 2524 }
2483 module_init(proc_vmalloc_init); 2525 module_init(proc_vmalloc_init);
2484 #endif 2526 #endif
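
Going by the seq_printf() format strings in s_show() above, a line of /proc/vmallocinfo comes out roughly as follows (address range, caller symbol and page counts are invented for illustration): the range and size, the caller, pages=, the flag names, and the per-node counts.

	0xf8a00000-0xf8a05000   20480 my_driver_init+0x2f/0x100 [my_driver] pages=4 vmalloc N0=4
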
2485 2527