Commit 5ddd36b9c59887c6416e21daf984fbdd9b1818df

Authored by Stephen Wilson
Committed by Al Viro
1 parent 206cb63657

mm: implement access_remote_vm

Provide an alternative to access_process_vm that allows the caller to obtain a
reference to the supplied mm_struct.

Signed-off-by: Stephen Wilson <wilsons@start.ca>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Showing 2 changed files with 18 additions and 0 deletions
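The patch itself is small. Below is a minimal sketch of the interface it adds, assuming the new wrapper delegates to the __access_remote_vm() helper factored out in the parent commit; the declaration placement, kernel-doc wording and formatting here are illustrative rather than a verbatim copy of the 18 added lines.

/* include/linux/mm.h (sketch): declared alongside access_process_vm() */
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, int write);

/*
 * mm/memory.c (sketch): unlike access_process_vm(), which resolves the
 * target mm from a task_struct via get_task_mm(), this variant takes the
 * mm directly; the caller is expected to hold its own reference on the
 * mm and drop it (e.g. with mmput()) when finished.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
		void *buf, int len, int write)
{
	return __access_remote_vm(NULL, mm, addr, buf, len, write);
}

Decoupling the copy path from task_struct lets a caller that already holds an mm reference (for example a /proc/<pid>/mem style reader) keep accessing the address space without going back through the task.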

include/linux/mm.h

#ifndef _LINUX_MM_H
#define _LINUX_MM_H

#include <linux/errno.h>

#ifdef __KERNEL__

#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/mmzone.h>
#include <linux/rbtree.h>
#include <linux/prio_tree.h>
#include <linux/debug_locks.h>
#include <linux/mm_types.h>
#include <linux/range.h>
#include <linux/pfn.h>
#include <linux/bit_spinlock.h>

struct mempolicy;
struct anon_vma;
struct file_ra_state;
struct user_struct;
struct writeback_control;

#ifndef CONFIG_DISCONTIGMEM	/* Don't use mapnrs, do it properly */
extern unsigned long max_mapnr;
#endif

extern unsigned long num_physpages;
extern unsigned long totalram_pages;
extern void * high_memory;
extern int page_cluster;

#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
#else
#define sysctl_legacy_va_layout 0
#endif

#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/processor.h>

#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))

/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)

/*
 * Linux kernel virtual memory manager primitives.
 * The idea being to have a "virtual" mm in the same way
 * we have a virtual fs - giving a cleaner interface to the
 * mm details, and allowing different kinds of memory mappings
 * (from shared memory to executable loading to arbitrary
 * mmap() functions).
 */

extern struct kmem_cache *vm_area_cachep;

#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
extern struct rw_semaphore nommu_region_sem;

extern unsigned int kobjsize(const void *objp);
#endif

/*
 * vm_flags in vm_area_struct, see mm_types.h.
 */
#define VM_READ		0x00000001	/* currently active flags */
#define VM_WRITE	0x00000002
#define VM_EXEC		0x00000004
#define VM_SHARED	0x00000008

/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD	0x00000010	/* limits for mprotect() etc */
#define VM_MAYWRITE	0x00000020
#define VM_MAYEXEC	0x00000040
#define VM_MAYSHARE	0x00000080

#define VM_GROWSDOWN	0x00000100	/* general info on the segment */
#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
#define VM_GROWSUP	0x00000200
#else
#define VM_GROWSUP	0x00000000
#define VM_NOHUGEPAGE	0x00000200	/* MADV_NOHUGEPAGE marked this vma */
#endif
#define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
#define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */

#define VM_EXECUTABLE	0x00001000
#define VM_LOCKED	0x00002000
#define VM_IO		0x00004000	/* Memory mapped I/O or similar */

/* Used by sys_madvise() */
#define VM_SEQ_READ	0x00008000	/* App will access data sequentially */
#define VM_RAND_READ	0x00010000	/* App will not benefit from clustered reads */

#define VM_DONTCOPY	0x00020000	/* Do not copy this vma on fork */
#define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
#define VM_RESERVED	0x00080000	/* Count as reserved_vm like IO */
#define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
#define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
#define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
#define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
#define VM_MAPPED_COPY	0x01000000	/* T if mapped copy of data (nommu mmap) */
#else
#define VM_HUGEPAGE	0x01000000	/* MADV_HUGEPAGE marked this vma */
#endif
#define VM_INSERTPAGE	0x02000000	/* The vma has had "vm_insert_page()" done on it */
#define VM_ALWAYSDUMP	0x04000000	/* Always include in core dumps */

#define VM_CAN_NONLINEAR 0x08000000	/* Has ->fault & does nonlinear pages */
#define VM_MIXEDMAP	0x10000000	/* Can contain "struct page" and pure PFN pages */
#define VM_SAO		0x20000000	/* Strong Access Ordering (powerpc) */
#define VM_PFN_AT_MMAP	0x40000000	/* PFNMAP vma that is fully mapped at mmap time */
#define VM_MERGEABLE	0x80000000	/* KSM may merge identical pages */

/* Bits set in the VMA until the stack is in its final location */
#define VM_STACK_INCOMPLETE_SETUP	(VM_RAND_READ | VM_SEQ_READ)

#ifndef VM_STACK_DEFAULT_FLAGS		/* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif

#ifdef CONFIG_STACK_GROWSUP
#define VM_STACK_FLAGS	(VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
#else
#define VM_STACK_FLAGS	(VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)
#endif

#define VM_READHINTMASK			(VM_SEQ_READ | VM_RAND_READ)
#define VM_ClearReadHint(v)		(v)->vm_flags &= ~VM_READHINTMASK
#define VM_NormalReadHint(v)		(!((v)->vm_flags & VM_READHINTMASK))
#define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
#define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)

/*
 * special vmas that are non-mergable, non-mlock()able
 */
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)

/*
 * mapping from the currently active vm_flags protection bits (the
 * low four bits) to a page protection mask..
 */
extern pgprot_t protection_map[16];

#define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
#define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
#define FAULT_FLAG_MKWRITE	0x04	/* Fault was mkwrite of existing pte */
#define FAULT_FLAG_ALLOW_RETRY	0x08	/* Retry fault if blocking */

/*
 * This interface is used by x86 PAT code to identify a pfn mapping that is
 * linear over entire vma. This is to optimize PAT code that deals with
 * marking the physical region with a particular prot. This is not for generic
 * mm use. Note also that this check will not work if the pfn mapping is
 * linear for a vma starting at physical address 0. In which case PAT code
 * falls back to slow path of reserving physical range page by page.
 */
static inline int is_linear_pfn_mapping(struct vm_area_struct *vma)
{
	return (vma->vm_flags & VM_PFN_AT_MMAP);
}

static inline int is_pfn_mapping(struct vm_area_struct *vma)
{
	return (vma->vm_flags & VM_PFNMAP);
}

/*
 * vm_fault is filled by the the pagefault handler and passed to the vma's
 * ->fault function. The vma's ->fault is responsible for returning a bitmask
 * of VM_FAULT_xxx flags that give details about how the fault was handled.
 *
 * pgoff should be used in favour of virtual_address, if possible. If pgoff
 * is used, one may set VM_CAN_NONLINEAR in the vma->vm_flags to get nonlinear
 * mapping support.
 */
struct vm_fault {
	unsigned int flags;		/* FAULT_FLAG_xxx flags */
	pgoff_t pgoff;			/* Logical page offset based on vma */
	void __user *virtual_address;	/* Faulting virtual address */

	struct page *page;		/* ->fault handlers should return a
					 * page here, unless VM_FAULT_NOPAGE
					 * is set (which is also implied by
					 * VM_FAULT_ERROR).
					 */
};

/*
 * These are the virtual MM functions - opening of an area, closing and
 * unmapping it (needed to keep files on disk up-to-date etc), pointer
 * to the functions called when a no-page or a wp-page exception occurs.
 */
struct vm_operations_struct {
	void (*open)(struct vm_area_struct * area);
	void (*close)(struct vm_area_struct * area);
	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);

	/* notification that a previously read-only page is about to become
	 * writable, if an error is returned it will cause a SIGBUS */
	int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);

	/* called by access_process_vm when get_user_pages() fails, typically
	 * for use by special VMAs that can switch between memory and hardware
	 */
	int (*access)(struct vm_area_struct *vma, unsigned long addr,
		      void *buf, int len, int write);
#ifdef CONFIG_NUMA
	/*
	 * set_policy() op must add a reference to any non-NULL @new mempolicy
	 * to hold the policy upon return. Caller should pass NULL @new to
	 * remove a policy and fall back to surrounding context--i.e. do not
	 * install a MPOL_DEFAULT policy, nor the task or system default
	 * mempolicy.
	 */
	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

	/*
	 * get_policy() op must add reference [mpol_get()] to any policy at
	 * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure
	 * in mm/mempolicy.c will do this automatically.
	 * get_policy() must NOT add a ref if the policy at (vma,addr) is not
	 * marked as MPOL_SHARED. vma policies are protected by the mmap_sem.
	 * If no [shared/vma] mempolicy exists at the addr, get_policy() op
	 * must return NULL--i.e., do not "fallback" to task or system default
	 * policy.
	 */
	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
					unsigned long addr);
	int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
		const nodemask_t *to, unsigned long flags);
#endif
};

struct mmu_gather;
struct inode;

#define page_private(page)		((page)->private)
#define set_page_private(page, v)	((page)->private = (v))

/*
 * FIXME: take this include out, include page-flags.h in
 * files which need it (119 of them)
 */
#include <linux/page-flags.h>
#include <linux/huge_mm.h>

/*
 * Methods to modify the page usage count.
 *
 * What counts for a page usage:
 * - cache mapping (page->mapping)
 * - private data (page->private)
 * - page mapped in a task's page tables, each mapping
 *   is counted separately
 *
 * Also, many kernel routines increase the page count before a critical
 * routine so they can be sure the page doesn't go away from under them.
 */

/*
 * Drop a ref, return true if the refcount fell to zero (the page has no users)
 */
static inline int put_page_testzero(struct page *page)
{
	VM_BUG_ON(atomic_read(&page->_count) == 0);
	return atomic_dec_and_test(&page->_count);
}

/*
 * Try to grab a ref unless the page has a refcount of zero, return false if
 * that is the case.
 */
static inline int get_page_unless_zero(struct page *page)
{
	return atomic_inc_not_zero(&page->_count);
}

extern int page_is_ram(unsigned long pfn);

/* Support for virtually mapped pages */
struct page *vmalloc_to_page(const void *addr);
unsigned long vmalloc_to_pfn(const void *addr);

/*
 * Determine if an address is within the vmalloc range
 *
 * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
 * is no special casing required.
 */
static inline int is_vmalloc_addr(const void *x)
{
#ifdef CONFIG_MMU
	unsigned long addr = (unsigned long)x;

	return addr >= VMALLOC_START && addr < VMALLOC_END;
#else
	return 0;
#endif
}
#ifdef CONFIG_MMU
extern int is_vmalloc_or_module_addr(const void *x);
#else
static inline int is_vmalloc_or_module_addr(const void *x)
{
	return 0;
}
#endif

static inline void compound_lock(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	bit_spin_lock(PG_compound_lock, &page->flags);
#endif
}

static inline void compound_unlock(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	bit_spin_unlock(PG_compound_lock, &page->flags);
#endif
}

static inline unsigned long compound_lock_irqsave(struct page *page)
{
	unsigned long uninitialized_var(flags);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	local_irq_save(flags);
	compound_lock(page);
#endif
	return flags;
}

static inline void compound_unlock_irqrestore(struct page *page,
					      unsigned long flags)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	compound_unlock(page);
	local_irq_restore(flags);
#endif
}

static inline struct page *compound_head(struct page *page)
{
	if (unlikely(PageTail(page)))
		return page->first_page;
	return page;
}

static inline int page_count(struct page *page)
{
	return atomic_read(&compound_head(page)->_count);
}

static inline void get_page(struct page *page)
{
	/*
	 * Getting a normal page or the head of a compound page
	 * requires to already have an elevated page->_count. Only if
	 * we're getting a tail page, the elevated page->_count is
	 * required only in the head page, so for tail pages the
	 * bugcheck only verifies that the page->_count isn't
	 * negative.
	 */
	VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page));
	atomic_inc(&page->_count);
	/*
	 * Getting a tail page will elevate both the head and tail
	 * page->_count(s).
	 */
	if (unlikely(PageTail(page))) {
		/*
		 * This is safe only because
		 * __split_huge_page_refcount can't run under
		 * get_page().
		 */
		VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
		atomic_inc(&page->first_page->_count);
	}
}

static inline struct page *virt_to_head_page(const void *x)
{
	struct page *page = virt_to_page(x);
	return compound_head(page);
}

/*
 * Setup the page count before being freed into the page allocator for
 * the first time (boot or memory hotplug)
 */
static inline void init_page_count(struct page *page)
{
	atomic_set(&page->_count, 1);
}

/*
 * PageBuddy() indicate that the page is free and in the buddy system
 * (see mm/page_alloc.c).
 *
 * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to
 * -2 so that an underflow of the page_mapcount() won't be mistaken
 * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very
 * efficiently by most CPU architectures.
 */
#define PAGE_BUDDY_MAPCOUNT_VALUE (-128)

static inline int PageBuddy(struct page *page)
{
	return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE;
}

static inline void __SetPageBuddy(struct page *page)
{
	VM_BUG_ON(atomic_read(&page->_mapcount) != -1);
	atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE);
}

static inline void __ClearPageBuddy(struct page *page)
{
	VM_BUG_ON(!PageBuddy(page));
	atomic_set(&page->_mapcount, -1);
}

void put_page(struct page *page);
void put_pages_list(struct list_head *pages);

void split_page(struct page *page, unsigned int order);
int split_free_page(struct page *page);

/*
 * Compound pages have a destructor function. Provide a
 * prototype for that function and accessor functions.
 * These are _only_ valid on the head of a PG_compound page.
 */
typedef void compound_page_dtor(struct page *);

static inline void set_compound_page_dtor(struct page *page,
						compound_page_dtor *dtor)
{
	page[1].lru.next = (void *)dtor;
}

static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
{
	return (compound_page_dtor *)page[1].lru.next;
}

static inline int compound_order(struct page *page)
{
	if (!PageHead(page))
		return 0;
	return (unsigned long)page[1].lru.prev;
}

static inline int compound_trans_order(struct page *page)
{
	int order;
	unsigned long flags;

	if (!PageHead(page))
		return 0;

	flags = compound_lock_irqsave(page);
	order = compound_order(page);
	compound_unlock_irqrestore(page, flags);
	return order;
}

static inline void set_compound_order(struct page *page, unsigned long order)
{
	page[1].lru.prev = (void *)order;
}

#ifdef CONFIG_MMU
/*
 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
 * servicing faults for write access. In the normal case, do always want
 * pte_mkwrite. But get_user_pages can cause write faults for mappings
 * that do not have writing enabled, when used by access_process_vm.
 */
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
	if (likely(vma->vm_flags & VM_WRITE))
		pte = pte_mkwrite(pte);
	return pte;
}
#endif

/*
 * Multiple processes may "see" the same page. E.g. for untouched
 * mappings of /dev/null, all processes see the same page full of
 * zeroes, and text pages of executables and shared libraries have
 * only one copy in memory, at most, normally.
 *
 * For the non-reserved pages, page_count(page) denotes a reference count.
 * page_count() == 0 means the page is free. page->lru is then used for
 * freelist management in the buddy allocator.
 * page_count() > 0 means the page has been allocated.
 *
 * Pages are allocated by the slab allocator in order to provide memory
 * to kmalloc and kmem_cache_alloc. In this case, the management of the
 * page, and the fields in 'struct page' are the responsibility of mm/slab.c
 * unless a particular usage is carefully commented. (the responsibility of
 * freeing the kmalloc memory is the caller's, of course).
 *
 * A page may be used by anyone else who does a __get_free_page().
 * In this case, page_count still tracks the references, and should only
 * be used through the normal accessor functions. The top bits of page->flags
 * and page->virtual store page management information, but all other fields
 * are unused and could be used privately, carefully. The management of this
 * page is the responsibility of the one who allocated it, and those who have
 * subsequently been given references to it.
 *
 * The other pages (we may call them "pagecache pages") are completely
 * managed by the Linux memory manager: I/O, buffers, swapping etc.
 * The following discussion applies only to them.
 *
 * A pagecache page contains an opaque `private' member, which belongs to the
 * page's address_space. Usually, this is the address of a circular list of
 * the page's disk buffers. PG_private must be set to tell the VM to call
 * into the filesystem to release these pages.
 *
 * A page may belong to an inode's memory mapping. In this case, page->mapping
 * is the pointer to the inode, and page->index is the file offset of the page,
 * in units of PAGE_CACHE_SIZE.
 *
 * If pagecache pages are not associated with an inode, they are said to be
 * anonymous pages. These may become associated with the swapcache, and in that
 * case PG_swapcache is set, and page->private is an offset into the swapcache.
 *
 * In either case (swapcache or inode backed), the pagecache itself holds one
 * reference to the page. Setting PG_private should also increment the
 * refcount. The each user mapping also has a reference to the page.
 *
 * The pagecache pages are stored in a per-mapping radix tree, which is
 * rooted at mapping->page_tree, and indexed by offset.
 * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
 * lists, we instead now tag pages as dirty/writeback in the radix tree.
 *
 * All pagecache pages may be subject to I/O:
 * - inode pages may need to be read from disk,
 * - inode pages which have been modified and are MAP_SHARED may need
 *   to be written back to the inode on disk,
 * - anonymous pages (including MAP_PRIVATE file mappings) which have been
 *   modified may need to be swapped out to swap space and (later) to be read
 *   back into memory.
 */

/*
 * The zone field is never updated after free_area_init_core()
 * sets it, so none of the operations on it need to be atomic.
 */


/*
 * page->flags layout:
 *
 * There are three possibilities for how page->flags get
 * laid out. The first is for the normal case, without
 * sparsemem. The second is for sparsemem when there is
 * plenty of space for node and section. The last is when
 * we have run out of space and have to fall back to an
 * alternate (slower) way of determining the node.
 *
 * No sparsemem or sparsemem vmemmap: | NODE | ZONE | ... | FLAGS |
 * classic sparse with space for node:| SECTION | NODE | ZONE | ... | FLAGS |
 * classic sparse no space for node:  | SECTION | ZONE | ... | FLAGS |
 */
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTIONS_WIDTH		SECTIONS_SHIFT
#else
#define SECTIONS_WIDTH		0
#endif

#define ZONES_WIDTH		ZONES_SHIFT

#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH		NODES_SHIFT
#else
#ifdef CONFIG_SPARSEMEM_VMEMMAP
#error "Vmemmap: No space for nodes field in page flags"
#endif
#define NODES_WIDTH		0
#endif

/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
#define SECTIONS_PGOFF		((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
#define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)

/*
 * We are going to use the flags for the page to node mapping if its in
 * there. This includes the case where there is no node, so it is implicit.
 */
#if !(NODES_WIDTH > 0 || NODES_SHIFT == 0)
#define NODE_NOT_IN_PAGE_FLAGS
#endif

#ifndef PFN_SECTION_SHIFT
#define PFN_SECTION_SHIFT 0
#endif

/*
 * Define the bit shifts to access each section. For non-existant
 * sections we define the shift as 0; that plus a 0 mask ensures
 * the compiler will optimise away reference to them.
 */
#define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))

/* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */
#ifdef NODE_NOT_IN_PAGE_FLAGS
#define ZONEID_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF		((SECTIONS_PGOFF < ZONES_PGOFF)? \
						SECTIONS_PGOFF : ZONES_PGOFF)
#else
#define ZONEID_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF		((NODES_PGOFF < ZONES_PGOFF)? \
						NODES_PGOFF : ZONES_PGOFF)
#endif

#define ZONEID_PGSHIFT		(ZONEID_PGOFF * (ZONEID_SHIFT != 0))

#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#endif

#define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
#define NODES_MASK		((1UL << NODES_WIDTH) - 1)
#define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
#define ZONEID_MASK		((1UL << ZONEID_SHIFT) - 1)

static inline enum zone_type page_zonenum(struct page *page)
{
	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
}

/*
 * The identification function is only used by the buddy allocator for
 * determining if two pages could be buddies. We are not really
 * identifying a zone since we could be using a the section number
 * id if we have not node id available in page flags.
 * We guarantee only that it will return the same value for two
 * combinable pages in a zone.
 */
static inline int page_zone_id(struct page *page)
{
	return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK;
}

static inline int zone_to_nid(struct zone *zone)
{
#ifdef CONFIG_NUMA
	return zone->node;
#else
	return 0;
#endif
}

#ifdef NODE_NOT_IN_PAGE_FLAGS
extern int page_to_nid(struct page *page);
#else
static inline int page_to_nid(struct page *page)
{
	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
}
#endif

static inline struct zone *page_zone(struct page *page)
{
	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
}

#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
static inline unsigned long page_to_section(struct page *page)
{
	return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
}
#endif

static inline void set_page_zone(struct page *page, enum zone_type zone)
{
	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
	page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}

static inline void set_page_node(struct page *page, unsigned long node)
{
	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
	page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}

static inline void set_page_section(struct page *page, unsigned long section)
{
	page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
	page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
}

static inline void set_page_links(struct page *page, enum zone_type zone,
				  unsigned long node, unsigned long pfn)
{
	set_page_zone(page, zone);
	set_page_node(page, node);
	set_page_section(page, pfn_to_section_nr(pfn));
}

/*
 * Some inline functions in vmstat.h depend on page_zone()
 */
#include <linux/vmstat.h>

static __always_inline void *lowmem_page_address(struct page *page)
{
	return __va(PFN_PHYS(page_to_pfn(page)));
}

#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
#define HASHED_PAGE_VIRTUAL
#endif

#if defined(WANT_PAGE_VIRTUAL)
#define page_address(page) ((page)->virtual)
#define set_page_address(page, address)		\
	do {					\
		(page)->virtual = (address);	\
	} while(0)
#define page_address_init() do { } while(0)
#endif

#if defined(HASHED_PAGE_VIRTUAL)
void *page_address(struct page *page);
void set_page_address(struct page *page, void *virtual);
void page_address_init(void);
#endif

#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
#define page_address(page) lowmem_page_address(page)
#define set_page_address(page, address) do { } while(0)
#define page_address_init() do { } while(0)
#endif

749 /* 749 /*
750 * On an anonymous page mapped into a user virtual memory area, 750 * On an anonymous page mapped into a user virtual memory area,
751 * page->mapping points to its anon_vma, not to a struct address_space; 751 * page->mapping points to its anon_vma, not to a struct address_space;
752 * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. 752 * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h.
753 * 753 *
754 * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, 754 * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
755 * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit; 755 * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit;
756 * and then page->mapping points, not to an anon_vma, but to a private 756 * and then page->mapping points, not to an anon_vma, but to a private
757 * structure which KSM associates with that merged page. See ksm.h. 757 * structure which KSM associates with that merged page. See ksm.h.
758 * 758 *
759 * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used. 759 * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used.
760 * 760 *
761 * Please note that, confusingly, "page_mapping" refers to the inode 761 * Please note that, confusingly, "page_mapping" refers to the inode
762 * address_space which maps the page from disk; whereas "page_mapped" 762 * address_space which maps the page from disk; whereas "page_mapped"
763 * refers to user virtual address space into which the page is mapped. 763 * refers to user virtual address space into which the page is mapped.
764 */ 764 */
765 #define PAGE_MAPPING_ANON 1 765 #define PAGE_MAPPING_ANON 1
766 #define PAGE_MAPPING_KSM 2 766 #define PAGE_MAPPING_KSM 2
767 #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) 767 #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
768 768
769 extern struct address_space swapper_space; 769 extern struct address_space swapper_space;
770 static inline struct address_space *page_mapping(struct page *page) 770 static inline struct address_space *page_mapping(struct page *page)
771 { 771 {
772 struct address_space *mapping = page->mapping; 772 struct address_space *mapping = page->mapping;
773 773
774 VM_BUG_ON(PageSlab(page)); 774 VM_BUG_ON(PageSlab(page));
775 if (unlikely(PageSwapCache(page))) 775 if (unlikely(PageSwapCache(page)))
776 mapping = &swapper_space; 776 mapping = &swapper_space;
777 else if ((unsigned long)mapping & PAGE_MAPPING_ANON) 777 else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
778 mapping = NULL; 778 mapping = NULL;
779 return mapping; 779 return mapping;
780 } 780 }
781 781
782 /* Neutral page->mapping pointer to address_space or anon_vma or other */ 782 /* Neutral page->mapping pointer to address_space or anon_vma or other */
783 static inline void *page_rmapping(struct page *page) 783 static inline void *page_rmapping(struct page *page)
784 { 784 {
785 return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS); 785 return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS);
786 } 786 }
787 787
788 static inline int PageAnon(struct page *page) 788 static inline int PageAnon(struct page *page)
789 { 789 {
790 return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; 790 return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
791 } 791 }
792 792
793 /* 793 /*
794 * Return the pagecache index of the passed page. Regular pagecache pages 794 * Return the pagecache index of the passed page. Regular pagecache pages
795 * use ->index whereas swapcache pages use ->private 795 * use ->index whereas swapcache pages use ->private
796 */ 796 */
797 static inline pgoff_t page_index(struct page *page) 797 static inline pgoff_t page_index(struct page *page)
798 { 798 {
799 if (unlikely(PageSwapCache(page))) 799 if (unlikely(PageSwapCache(page)))
800 return page_private(page); 800 return page_private(page);
801 return page->index; 801 return page->index;
802 } 802 }
803 803
804 /* 804 /*
805 * The atomic page->_mapcount, like _count, starts from -1: 805 * The atomic page->_mapcount, like _count, starts from -1:
806 * so that transitions both from it and to it can be tracked, 806 * so that transitions both from it and to it can be tracked,
807 * using atomic_inc_and_test and atomic_add_negative(-1). 807 * using atomic_inc_and_test and atomic_add_negative(-1).
808 */ 808 */
809 static inline void reset_page_mapcount(struct page *page) 809 static inline void reset_page_mapcount(struct page *page)
810 { 810 {
811 atomic_set(&(page)->_mapcount, -1); 811 atomic_set(&(page)->_mapcount, -1);
812 } 812 }
813 813
814 static inline int page_mapcount(struct page *page) 814 static inline int page_mapcount(struct page *page)
815 { 815 {
816 return atomic_read(&(page)->_mapcount) + 1; 816 return atomic_read(&(page)->_mapcount) + 1;
817 } 817 }
818 818
819 /* 819 /*
820 * Return true if this page is mapped into pagetables. 820 * Return true if this page is mapped into pagetables.
821 */ 821 */
822 static inline int page_mapped(struct page *page) 822 static inline int page_mapped(struct page *page)
823 { 823 {
824 return atomic_read(&(page)->_mapcount) >= 0; 824 return atomic_read(&(page)->_mapcount) >= 0;
825 } 825 }
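The -1 start value documented above is what lets rmap detect the first map and the last unmap of a page with a single atomic operation. A hedged illustration, loosely following what the rmap add/remove paths do; the helper names are made up for this example.

static inline int example_first_mapping(struct page *page)
{
	/* -1 -> 0 transition: this pte is the first to map the page */
	return atomic_inc_and_test(&page->_mapcount);
}

static inline int example_last_unmapping(struct page *page)
{
	/* 0 -> -1 transition: no pte maps the page any more */
	return atomic_add_negative(-1, &page->_mapcount);
}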
826 826
827 /* 827 /*
828 * Different kinds of faults, as returned by handle_mm_fault(). 828 * Different kinds of faults, as returned by handle_mm_fault().
829 * Used to decide whether a process gets delivered SIGBUS or 829 * Used to decide whether a process gets delivered SIGBUS or
830 * just gets major/minor fault counters bumped up. 830 * just gets major/minor fault counters bumped up.
831 */ 831 */
832 832
833 #define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */ 833 #define VM_FAULT_MINOR 0 /* For backwards compat. Remove me quickly. */
834 834
835 #define VM_FAULT_OOM 0x0001 835 #define VM_FAULT_OOM 0x0001
836 #define VM_FAULT_SIGBUS 0x0002 836 #define VM_FAULT_SIGBUS 0x0002
837 #define VM_FAULT_MAJOR 0x0004 837 #define VM_FAULT_MAJOR 0x0004
838 #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ 838 #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
839 #define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */ 839 #define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */
840 #define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */ 840 #define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */
841 841
842 #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ 842 #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
843 #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ 843 #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
844 #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ 844 #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
845 845
846 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ 846 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
847 847
848 #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ 848 #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
849 VM_FAULT_HWPOISON_LARGE) 849 VM_FAULT_HWPOISON_LARGE)
850 850
851 /* Encode hstate index for a hwpoisoned large page */ 851 /* Encode hstate index for a hwpoisoned large page */
852 #define VM_FAULT_SET_HINDEX(x) ((x) << 12) 852 #define VM_FAULT_SET_HINDEX(x) ((x) << 12)
853 #define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf) 853 #define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
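A small worked example of the bit layout above: how a hwpoisoned huge-page fault would encode its hstate index, and how a caller classifies the return value. Both helpers are illustrative, not kernel API.

static inline int example_hpage_poison_fault(unsigned int hindex)
{
	/* hindex must fit in the 4 bits covered by the 0xf000 mask */
	return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hindex);
}

static inline void example_classify_fault(int ret)
{
	if (ret & VM_FAULT_ERROR) {
		/* OOM, SIGBUS or hwpoison: no page was installed */
		if (ret & VM_FAULT_HWPOISON_LARGE)
			(void)VM_FAULT_GET_HINDEX(ret);	/* which hstate */
	} else if (ret & VM_FAULT_MAJOR) {
		/* bump the major fault counter */
	}
}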
854 854
855 /* 855 /*
856 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. 856 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
857 */ 857 */
858 extern void pagefault_out_of_memory(void); 858 extern void pagefault_out_of_memory(void);
859 859
860 #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) 860 #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)
861 861
862 extern void show_free_areas(void); 862 extern void show_free_areas(void);
863 863
864 int shmem_lock(struct file *file, int lock, struct user_struct *user); 864 int shmem_lock(struct file *file, int lock, struct user_struct *user);
865 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); 865 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags);
866 int shmem_zero_setup(struct vm_area_struct *); 866 int shmem_zero_setup(struct vm_area_struct *);
867 867
868 #ifndef CONFIG_MMU 868 #ifndef CONFIG_MMU
869 extern unsigned long shmem_get_unmapped_area(struct file *file, 869 extern unsigned long shmem_get_unmapped_area(struct file *file,
870 unsigned long addr, 870 unsigned long addr,
871 unsigned long len, 871 unsigned long len,
872 unsigned long pgoff, 872 unsigned long pgoff,
873 unsigned long flags); 873 unsigned long flags);
874 #endif 874 #endif
875 875
876 extern int can_do_mlock(void); 876 extern int can_do_mlock(void);
877 extern int user_shm_lock(size_t, struct user_struct *); 877 extern int user_shm_lock(size_t, struct user_struct *);
878 extern void user_shm_unlock(size_t, struct user_struct *); 878 extern void user_shm_unlock(size_t, struct user_struct *);
879 879
880 /* 880 /*
881 * Parameter block passed down to zap_pte_range in exceptional cases. 881 * Parameter block passed down to zap_pte_range in exceptional cases.
882 */ 882 */
883 struct zap_details { 883 struct zap_details {
884 struct vm_area_struct *nonlinear_vma; /* Check page->index if set */ 884 struct vm_area_struct *nonlinear_vma; /* Check page->index if set */
885 struct address_space *check_mapping; /* Check page->mapping if set */ 885 struct address_space *check_mapping; /* Check page->mapping if set */
886 pgoff_t first_index; /* Lowest page->index to unmap */ 886 pgoff_t first_index; /* Lowest page->index to unmap */
887 pgoff_t last_index; /* Highest page->index to unmap */ 887 pgoff_t last_index; /* Highest page->index to unmap */
888 spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */ 888 spinlock_t *i_mmap_lock; /* For unmap_mapping_range: */
889 unsigned long truncate_count; /* Compare vm_truncate_count */ 889 unsigned long truncate_count; /* Compare vm_truncate_count */
890 }; 890 };
891 891
892 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, 892 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
893 pte_t pte); 893 pte_t pte);
894 894
895 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, 895 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
896 unsigned long size); 896 unsigned long size);
897 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, 897 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
898 unsigned long size, struct zap_details *); 898 unsigned long size, struct zap_details *);
899 unsigned long unmap_vmas(struct mmu_gather **tlb, 899 unsigned long unmap_vmas(struct mmu_gather **tlb,
900 struct vm_area_struct *start_vma, unsigned long start_addr, 900 struct vm_area_struct *start_vma, unsigned long start_addr,
901 unsigned long end_addr, unsigned long *nr_accounted, 901 unsigned long end_addr, unsigned long *nr_accounted,
902 struct zap_details *); 902 struct zap_details *);
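The simplest callers of zap_page_range() pass NULL for struct zap_details, meaning "no filtering, unmap everything in the range". A hedged sketch; the wrapper name is invented.

static inline void example_zap_whole_vma(struct vm_area_struct *vma)
{
	/* NULL details: no nonlinear/mapping/index filtering */
	zap_page_range(vma, vma->vm_start,
		       vma->vm_end - vma->vm_start, NULL);
}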
903 903
904 /** 904 /**
905 * mm_walk - callbacks for walk_page_range 905 * mm_walk - callbacks for walk_page_range
906 * @pgd_entry: if set, called for each non-empty PGD (top-level) entry 906 * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
907 * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry 907 * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
908 * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry 908 * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
909 * @pte_entry: if set, called for each non-empty PTE (4th-level) entry 909 * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
910 * @pte_hole: if set, called for each hole at all levels 910 * @pte_hole: if set, called for each hole at all levels
911 * @hugetlb_entry: if set, called for each hugetlb entry 911 * @hugetlb_entry: if set, called for each hugetlb entry
912 * 912 *
913 * (see walk_page_range for more details) 913 * (see walk_page_range for more details)
914 */ 914 */
915 struct mm_walk { 915 struct mm_walk {
916 int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, struct mm_walk *); 916 int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, struct mm_walk *);
917 int (*pud_entry)(pud_t *, unsigned long, unsigned long, struct mm_walk *); 917 int (*pud_entry)(pud_t *, unsigned long, unsigned long, struct mm_walk *);
918 int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *); 918 int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *);
919 int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *); 919 int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *);
920 int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *); 920 int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *);
921 int (*hugetlb_entry)(pte_t *, unsigned long, 921 int (*hugetlb_entry)(pte_t *, unsigned long,
922 unsigned long, unsigned long, struct mm_walk *); 922 unsigned long, unsigned long, struct mm_walk *);
923 struct mm_struct *mm; 923 struct mm_struct *mm;
924 void *private; 924 void *private;
925 }; 925 };
926 926
927 int walk_page_range(unsigned long addr, unsigned long end, 927 int walk_page_range(unsigned long addr, unsigned long end,
928 struct mm_walk *walk); 928 struct mm_walk *walk);
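A hedged usage sketch for the walker: count the present ptes in a range of an mm by supplying only a pte_entry callback and passing the counter through ->private. The function names are illustrative; the caller is assumed to hold mmap_sem for read.

static int example_count_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *count = walk->private;

	if (pte_present(*pte))
		(*count)++;
	return 0;			/* non-zero aborts the walk */
}

static unsigned long example_count_present(struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	unsigned long count = 0;
	struct mm_walk walk = {
		.pte_entry	= example_count_pte,
		.mm		= mm,
		.private	= &count,
	};

	walk_page_range(start, end, &walk);
	return count;
}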
929 void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, 929 void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
930 unsigned long end, unsigned long floor, unsigned long ceiling); 930 unsigned long end, unsigned long floor, unsigned long ceiling);
931 int copy_page_range(struct mm_struct *dst, struct mm_struct *src, 931 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
932 struct vm_area_struct *vma); 932 struct vm_area_struct *vma);
933 void unmap_mapping_range(struct address_space *mapping, 933 void unmap_mapping_range(struct address_space *mapping,
934 loff_t const holebegin, loff_t const holelen, int even_cows); 934 loff_t const holebegin, loff_t const holelen, int even_cows);
935 int follow_pfn(struct vm_area_struct *vma, unsigned long address, 935 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
936 unsigned long *pfn); 936 unsigned long *pfn);
937 int follow_phys(struct vm_area_struct *vma, unsigned long address, 937 int follow_phys(struct vm_area_struct *vma, unsigned long address,
938 unsigned int flags, unsigned long *prot, resource_size_t *phys); 938 unsigned int flags, unsigned long *prot, resource_size_t *phys);
939 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, 939 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
940 void *buf, int len, int write); 940 void *buf, int len, int write);
941 941
942 static inline void unmap_shared_mapping_range(struct address_space *mapping, 942 static inline void unmap_shared_mapping_range(struct address_space *mapping,
943 loff_t const holebegin, loff_t const holelen) 943 loff_t const holebegin, loff_t const holelen)
944 { 944 {
945 unmap_mapping_range(mapping, holebegin, holelen, 0); 945 unmap_mapping_range(mapping, holebegin, holelen, 0);
946 } 946 }
947 947
948 extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new); 948 extern void truncate_pagecache(struct inode *inode, loff_t old, loff_t new);
949 extern void truncate_setsize(struct inode *inode, loff_t newsize); 949 extern void truncate_setsize(struct inode *inode, loff_t newsize);
950 extern int vmtruncate(struct inode *inode, loff_t offset); 950 extern int vmtruncate(struct inode *inode, loff_t offset);
951 extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end); 951 extern int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end);
952 952
953 int truncate_inode_page(struct address_space *mapping, struct page *page); 953 int truncate_inode_page(struct address_space *mapping, struct page *page);
954 int generic_error_remove_page(struct address_space *mapping, struct page *page); 954 int generic_error_remove_page(struct address_space *mapping, struct page *page);
955 955
956 int invalidate_inode_page(struct page *page); 956 int invalidate_inode_page(struct page *page);
957 957
958 #ifdef CONFIG_MMU 958 #ifdef CONFIG_MMU
959 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 959 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
960 unsigned long address, unsigned int flags); 960 unsigned long address, unsigned int flags);
961 #else 961 #else
962 static inline int handle_mm_fault(struct mm_struct *mm, 962 static inline int handle_mm_fault(struct mm_struct *mm,
963 struct vm_area_struct *vma, unsigned long address, 963 struct vm_area_struct *vma, unsigned long address,
964 unsigned int flags) 964 unsigned int flags)
965 { 965 {
966 /* should never happen if there's no MMU */ 966 /* should never happen if there's no MMU */
967 BUG(); 967 BUG();
968 return VM_FAULT_SIGBUS; 968 return VM_FAULT_SIGBUS;
969 } 969 }
970 #endif 970 #endif
971 971
972 extern int make_pages_present(unsigned long addr, unsigned long end); 972 extern int make_pages_present(unsigned long addr, unsigned long end);
973 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); 973 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
974 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
975 void *buf, int len, int write);
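access_remote_vm() is the interface this commit adds: it behaves like access_process_vm(), except that the caller hands in the mm_struct itself and is responsible for holding a reference to it (for instance one obtained earlier with get_task_mm()). A minimal, hypothetical read wrapper:

static int example_peek_remote(struct mm_struct *mm, unsigned long addr,
			       void *out, int len)
{
	int copied;

	/* write == 0: copy from the remote address space into out */
	copied = access_remote_vm(mm, addr, out, len, 0);
	return copied == len ? 0 : -EFAULT;
}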
974 976
975 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 977 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
976 unsigned long start, int len, unsigned int foll_flags, 978 unsigned long start, int len, unsigned int foll_flags,
977 struct page **pages, struct vm_area_struct **vmas, 979 struct page **pages, struct vm_area_struct **vmas,
978 int *nonblocking); 980 int *nonblocking);
979 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 981 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
980 unsigned long start, int nr_pages, int write, int force, 982 unsigned long start, int nr_pages, int write, int force,
981 struct page **pages, struct vm_area_struct **vmas); 983 struct page **pages, struct vm_area_struct **vmas);
982 int get_user_pages_fast(unsigned long start, int nr_pages, int write, 984 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
983 struct page **pages); 985 struct page **pages);
984 struct page *get_dump_page(unsigned long addr); 986 struct page *get_dump_page(unsigned long addr);
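A hedged sketch of the lockless pin path declared above: try to pin one user page with get_user_pages_fast(); a real caller would fall back to get_user_pages() under mmap_sem when the fast path cannot pin everything. The helper name is invented.

static struct page *example_pin_one_page(unsigned long uaddr, int write)
{
	struct page *page;

	if (get_user_pages_fast(uaddr, 1, write, &page) != 1)
		return NULL;
	return page;		/* drop later with put_page(page) */
}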
985 987
986 extern int try_to_release_page(struct page * page, gfp_t gfp_mask); 988 extern int try_to_release_page(struct page * page, gfp_t gfp_mask);
987 extern void do_invalidatepage(struct page *page, unsigned long offset); 989 extern void do_invalidatepage(struct page *page, unsigned long offset);
988 990
989 int __set_page_dirty_nobuffers(struct page *page); 991 int __set_page_dirty_nobuffers(struct page *page);
990 int __set_page_dirty_no_writeback(struct page *page); 992 int __set_page_dirty_no_writeback(struct page *page);
991 int redirty_page_for_writepage(struct writeback_control *wbc, 993 int redirty_page_for_writepage(struct writeback_control *wbc,
992 struct page *page); 994 struct page *page);
993 void account_page_dirtied(struct page *page, struct address_space *mapping); 995 void account_page_dirtied(struct page *page, struct address_space *mapping);
994 void account_page_writeback(struct page *page); 996 void account_page_writeback(struct page *page);
995 int set_page_dirty(struct page *page); 997 int set_page_dirty(struct page *page);
996 int set_page_dirty_lock(struct page *page); 998 int set_page_dirty_lock(struct page *page);
997 int clear_page_dirty_for_io(struct page *page); 999 int clear_page_dirty_for_io(struct page *page);
998 1000
999 /* Is the vma a continuation of the stack vma above it? */ 1001 /* Is the vma a continuation of the stack vma above it? */
1000 static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr) 1002 static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr)
1001 { 1003 {
1002 return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN); 1004 return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
1003 } 1005 }
1004 1006
1005 extern unsigned long move_page_tables(struct vm_area_struct *vma, 1007 extern unsigned long move_page_tables(struct vm_area_struct *vma,
1006 unsigned long old_addr, struct vm_area_struct *new_vma, 1008 unsigned long old_addr, struct vm_area_struct *new_vma,
1007 unsigned long new_addr, unsigned long len); 1009 unsigned long new_addr, unsigned long len);
1008 extern unsigned long do_mremap(unsigned long addr, 1010 extern unsigned long do_mremap(unsigned long addr,
1009 unsigned long old_len, unsigned long new_len, 1011 unsigned long old_len, unsigned long new_len,
1010 unsigned long flags, unsigned long new_addr); 1012 unsigned long flags, unsigned long new_addr);
1011 extern int mprotect_fixup(struct vm_area_struct *vma, 1013 extern int mprotect_fixup(struct vm_area_struct *vma,
1012 struct vm_area_struct **pprev, unsigned long start, 1014 struct vm_area_struct **pprev, unsigned long start,
1013 unsigned long end, unsigned long newflags); 1015 unsigned long end, unsigned long newflags);
1014 1016
1015 /* 1017 /*
1016 * doesn't attempt to fault and will return short. 1018 * doesn't attempt to fault and will return short.
1017 */ 1019 */
1018 int __get_user_pages_fast(unsigned long start, int nr_pages, int write, 1020 int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1019 struct page **pages); 1021 struct page **pages);
1020 /* 1022 /*
1021 * per-process(per-mm_struct) statistics. 1023 * per-process(per-mm_struct) statistics.
1022 */ 1024 */
1023 #if defined(SPLIT_RSS_COUNTING) 1025 #if defined(SPLIT_RSS_COUNTING)
1024 /* 1026 /*
1025 * The mm counters are not protected by its page_table_lock, 1027 * The mm counters are not protected by its page_table_lock,
1026 * so must be incremented atomically. 1028 * so must be incremented atomically.
1027 */ 1029 */
1028 static inline void set_mm_counter(struct mm_struct *mm, int member, long value) 1030 static inline void set_mm_counter(struct mm_struct *mm, int member, long value)
1029 { 1031 {
1030 atomic_long_set(&mm->rss_stat.count[member], value); 1032 atomic_long_set(&mm->rss_stat.count[member], value);
1031 } 1033 }
1032 1034
1033 unsigned long get_mm_counter(struct mm_struct *mm, int member); 1035 unsigned long get_mm_counter(struct mm_struct *mm, int member);
1034 1036
1035 static inline void add_mm_counter(struct mm_struct *mm, int member, long value) 1037 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
1036 { 1038 {
1037 atomic_long_add(value, &mm->rss_stat.count[member]); 1039 atomic_long_add(value, &mm->rss_stat.count[member]);
1038 } 1040 }
1039 1041
1040 static inline void inc_mm_counter(struct mm_struct *mm, int member) 1042 static inline void inc_mm_counter(struct mm_struct *mm, int member)
1041 { 1043 {
1042 atomic_long_inc(&mm->rss_stat.count[member]); 1044 atomic_long_inc(&mm->rss_stat.count[member]);
1043 } 1045 }
1044 1046
1045 static inline void dec_mm_counter(struct mm_struct *mm, int member) 1047 static inline void dec_mm_counter(struct mm_struct *mm, int member)
1046 { 1048 {
1047 atomic_long_dec(&mm->rss_stat.count[member]); 1049 atomic_long_dec(&mm->rss_stat.count[member]);
1048 } 1050 }
1049 1051
1050 #else /* !USE_SPLIT_PTLOCKS */ 1052 #else /* !USE_SPLIT_PTLOCKS */
1051 /* 1053 /*
1052 * The mm counters are protected by its page_table_lock, 1054 * The mm counters are protected by its page_table_lock,
1053 * so can be incremented directly. 1055 * so can be incremented directly.
1054 */ 1056 */
1055 static inline void set_mm_counter(struct mm_struct *mm, int member, long value) 1057 static inline void set_mm_counter(struct mm_struct *mm, int member, long value)
1056 { 1058 {
1057 mm->rss_stat.count[member] = value; 1059 mm->rss_stat.count[member] = value;
1058 } 1060 }
1059 1061
1060 static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) 1062 static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
1061 { 1063 {
1062 return mm->rss_stat.count[member]; 1064 return mm->rss_stat.count[member];
1063 } 1065 }
1064 1066
1065 static inline void add_mm_counter(struct mm_struct *mm, int member, long value) 1067 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
1066 { 1068 {
1067 mm->rss_stat.count[member] += value; 1069 mm->rss_stat.count[member] += value;
1068 } 1070 }
1069 1071
1070 static inline void inc_mm_counter(struct mm_struct *mm, int member) 1072 static inline void inc_mm_counter(struct mm_struct *mm, int member)
1071 { 1073 {
1072 mm->rss_stat.count[member]++; 1074 mm->rss_stat.count[member]++;
1073 } 1075 }
1074 1076
1075 static inline void dec_mm_counter(struct mm_struct *mm, int member) 1077 static inline void dec_mm_counter(struct mm_struct *mm, int member)
1076 { 1078 {
1077 mm->rss_stat.count[member]--; 1079 mm->rss_stat.count[member]--;
1078 } 1080 }
1079 1081
1080 #endif /* !USE_SPLIT_PTLOCKS */ 1082 #endif /* !USE_SPLIT_PTLOCKS */
1081 1083
1082 static inline unsigned long get_mm_rss(struct mm_struct *mm) 1084 static inline unsigned long get_mm_rss(struct mm_struct *mm)
1083 { 1085 {
1084 return get_mm_counter(mm, MM_FILEPAGES) + 1086 return get_mm_counter(mm, MM_FILEPAGES) +
1085 get_mm_counter(mm, MM_ANONPAGES); 1087 get_mm_counter(mm, MM_ANONPAGES);
1086 } 1088 }
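Whichever branch of the #if is compiled in, the counter helpers present the same interface. A sketch of typical accounting, roughly as the fault and rmap paths do when an anonymous page is added or removed; the wrapper names are illustrative.

static inline void example_account_anon(struct mm_struct *mm)
{
	inc_mm_counter(mm, MM_ANONPAGES);	/* page mapped in */
}

static inline void example_unaccount_anon(struct mm_struct *mm)
{
	dec_mm_counter(mm, MM_ANONPAGES);	/* page unmapped */
}

/* get_mm_rss(mm) then reports MM_FILEPAGES + MM_ANONPAGES, in pages. */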
1087 1089
1088 static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) 1090 static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
1089 { 1091 {
1090 return max(mm->hiwater_rss, get_mm_rss(mm)); 1092 return max(mm->hiwater_rss, get_mm_rss(mm));
1091 } 1093 }
1092 1094
1093 static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) 1095 static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
1094 { 1096 {
1095 return max(mm->hiwater_vm, mm->total_vm); 1097 return max(mm->hiwater_vm, mm->total_vm);
1096 } 1098 }
1097 1099
1098 static inline void update_hiwater_rss(struct mm_struct *mm) 1100 static inline void update_hiwater_rss(struct mm_struct *mm)
1099 { 1101 {
1100 unsigned long _rss = get_mm_rss(mm); 1102 unsigned long _rss = get_mm_rss(mm);
1101 1103
1102 if ((mm)->hiwater_rss < _rss) 1104 if ((mm)->hiwater_rss < _rss)
1103 (mm)->hiwater_rss = _rss; 1105 (mm)->hiwater_rss = _rss;
1104 } 1106 }
1105 1107
1106 static inline void update_hiwater_vm(struct mm_struct *mm) 1108 static inline void update_hiwater_vm(struct mm_struct *mm)
1107 { 1109 {
1108 if (mm->hiwater_vm < mm->total_vm) 1110 if (mm->hiwater_vm < mm->total_vm)
1109 mm->hiwater_vm = mm->total_vm; 1111 mm->hiwater_vm = mm->total_vm;
1110 } 1112 }
1111 1113
1112 static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, 1114 static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
1113 struct mm_struct *mm) 1115 struct mm_struct *mm)
1114 { 1116 {
1115 unsigned long hiwater_rss = get_mm_hiwater_rss(mm); 1117 unsigned long hiwater_rss = get_mm_hiwater_rss(mm);
1116 1118
1117 if (*maxrss < hiwater_rss) 1119 if (*maxrss < hiwater_rss)
1118 *maxrss = hiwater_rss; 1120 *maxrss = hiwater_rss;
1119 } 1121 }
1120 1122
1121 #if defined(SPLIT_RSS_COUNTING) 1123 #if defined(SPLIT_RSS_COUNTING)
1122 void sync_mm_rss(struct task_struct *task, struct mm_struct *mm); 1124 void sync_mm_rss(struct task_struct *task, struct mm_struct *mm);
1123 #else 1125 #else
1124 static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) 1126 static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
1125 { 1127 {
1126 } 1128 }
1127 #endif 1129 #endif
1128 1130
1129 /* 1131 /*
1130 * A callback you can register to apply pressure to ageable caches. 1132 * A callback you can register to apply pressure to ageable caches.
1131 * 1133 *
1132 * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'. It should 1134 * 'shrink' is passed a count 'nr_to_scan' and a 'gfpmask'. It should
1133 * look through the least-recently-used 'nr_to_scan' entries and 1135 * look through the least-recently-used 'nr_to_scan' entries and
1134 * attempt to free them up. It should return the number of objects 1136 * attempt to free them up. It should return the number of objects
1135 * which remain in the cache. If it returns -1, it means it cannot do 1137 * which remain in the cache. If it returns -1, it means it cannot do
1136 * any scanning at this time (eg. there is a risk of deadlock). 1138 * any scanning at this time (eg. there is a risk of deadlock).
1137 * 1139 *
1138 * The 'gfpmask' refers to the allocation we are currently trying to 1140 * The 'gfpmask' refers to the allocation we are currently trying to
1139 * fulfil. 1141 * fulfil.
1140 * 1142 *
1141 * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is 1143 * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
1142 * querying the cache size, so a fastpath for that case is appropriate. 1144 * querying the cache size, so a fastpath for that case is appropriate.
1143 */ 1145 */
1144 struct shrinker { 1146 struct shrinker {
1145 int (*shrink)(struct shrinker *, int nr_to_scan, gfp_t gfp_mask); 1147 int (*shrink)(struct shrinker *, int nr_to_scan, gfp_t gfp_mask);
1146 int seeks; /* seeks to recreate an obj */ 1148 int seeks; /* seeks to recreate an obj */
1147 1149
1148 /* These are for internal use */ 1150 /* These are for internal use */
1149 struct list_head list; 1151 struct list_head list;
1150 long nr; /* objs pending delete */ 1152 long nr; /* objs pending delete */
1151 }; 1153 };
1152 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ 1154 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
1153 extern void register_shrinker(struct shrinker *); 1155 extern void register_shrinker(struct shrinker *);
1154 extern void unregister_shrinker(struct shrinker *); 1156 extern void unregister_shrinker(struct shrinker *);
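A hedged example of the callback contract documented above, for a hypothetical object cache: nr_to_scan == 0 is only a size query, -1 signals that scanning is unsafe right now, and otherwise the return value is the number of objects left. Everything here (names, the bare counter) is illustrative.

static int example_cache_count;

static int example_cache_shrink(struct shrinker *s, int nr_to_scan,
				gfp_t gfp_mask)
{
	if (nr_to_scan) {
		if (!(gfp_mask & __GFP_FS))
			return -1;	/* risk of deadlock, skip this time */
		/* free up to nr_to_scan least-recently-used objects here */
	}
	return example_cache_count;	/* objects remaining in the cache */
}

static struct shrinker example_cache_shrinker = {
	.shrink	= example_cache_shrink,
	.seeks	= DEFAULT_SEEKS,
};

/* register_shrinker(&example_cache_shrinker) at init,
   unregister_shrinker(&example_cache_shrinker) at teardown. */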
1155 1157
1156 int vma_wants_writenotify(struct vm_area_struct *vma); 1158 int vma_wants_writenotify(struct vm_area_struct *vma);
1157 1159
1158 extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, 1160 extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1159 spinlock_t **ptl); 1161 spinlock_t **ptl);
1160 static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1162 static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1161 spinlock_t **ptl) 1163 spinlock_t **ptl)
1162 { 1164 {
1163 pte_t *ptep; 1165 pte_t *ptep;
1164 __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl)); 1166 __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl));
1165 return ptep; 1167 return ptep;
1166 } 1168 }
1167 1169
1168 #ifdef __PAGETABLE_PUD_FOLDED 1170 #ifdef __PAGETABLE_PUD_FOLDED
1169 static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, 1171 static inline int __pud_alloc(struct mm_struct *mm, pgd_t *pgd,
1170 unsigned long address) 1172 unsigned long address)
1171 { 1173 {
1172 return 0; 1174 return 0;
1173 } 1175 }
1174 #else 1176 #else
1175 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); 1177 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
1176 #endif 1178 #endif
1177 1179
1178 #ifdef __PAGETABLE_PMD_FOLDED 1180 #ifdef __PAGETABLE_PMD_FOLDED
1179 static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, 1181 static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
1180 unsigned long address) 1182 unsigned long address)
1181 { 1183 {
1182 return 0; 1184 return 0;
1183 } 1185 }
1184 #else 1186 #else
1185 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); 1187 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
1186 #endif 1188 #endif
1187 1189
1188 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 1190 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
1189 pmd_t *pmd, unsigned long address); 1191 pmd_t *pmd, unsigned long address);
1190 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); 1192 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
1191 1193
1192 /* 1194 /*
1193 * The following ifdef needed to get the 4level-fixup.h header to work. 1195 * The following ifdef needed to get the 4level-fixup.h header to work.
1194 * Remove it when 4level-fixup.h has been removed. 1196 * Remove it when 4level-fixup.h has been removed.
1195 */ 1197 */
1196 #if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK) 1198 #if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK)
1197 static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 1199 static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
1198 { 1200 {
1199 return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))? 1201 return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))?
1200 NULL: pud_offset(pgd, address); 1202 NULL: pud_offset(pgd, address);
1201 } 1203 }
1202 1204
1203 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 1205 static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
1204 { 1206 {
1205 return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? 1207 return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
1206 NULL: pmd_offset(pud, address); 1208 NULL: pmd_offset(pud, address);
1207 } 1209 }
1208 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ 1210 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
1209 1211
1210 #if USE_SPLIT_PTLOCKS 1212 #if USE_SPLIT_PTLOCKS
1211 /* 1213 /*
1212 * We tuck a spinlock to guard each pagetable page into its struct page, 1214 * We tuck a spinlock to guard each pagetable page into its struct page,
1213 * at page->private, with BUILD_BUG_ON to make sure that this will not 1215 * at page->private, with BUILD_BUG_ON to make sure that this will not
1214 * overflow into the next struct page (as it might with DEBUG_SPINLOCK). 1216 * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
1215 * When freeing, reset page->mapping so free_pages_check won't complain. 1217 * When freeing, reset page->mapping so free_pages_check won't complain.
1216 */ 1218 */
1217 #define __pte_lockptr(page) &((page)->ptl) 1219 #define __pte_lockptr(page) &((page)->ptl)
1218 #define pte_lock_init(_page) do { \ 1220 #define pte_lock_init(_page) do { \
1219 spin_lock_init(__pte_lockptr(_page)); \ 1221 spin_lock_init(__pte_lockptr(_page)); \
1220 } while (0) 1222 } while (0)
1221 #define pte_lock_deinit(page) ((page)->mapping = NULL) 1223 #define pte_lock_deinit(page) ((page)->mapping = NULL)
1222 #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) 1224 #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
1223 #else /* !USE_SPLIT_PTLOCKS */ 1225 #else /* !USE_SPLIT_PTLOCKS */
1224 /* 1226 /*
1225 * We use mm->page_table_lock to guard all pagetable pages of the mm. 1227 * We use mm->page_table_lock to guard all pagetable pages of the mm.
1226 */ 1228 */
1227 #define pte_lock_init(page) do {} while (0) 1229 #define pte_lock_init(page) do {} while (0)
1228 #define pte_lock_deinit(page) do {} while (0) 1230 #define pte_lock_deinit(page) do {} while (0)
1229 #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) 1231 #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;})
1230 #endif /* USE_SPLIT_PTLOCKS */ 1232 #endif /* USE_SPLIT_PTLOCKS */
1231 1233
1232 static inline void pgtable_page_ctor(struct page *page) 1234 static inline void pgtable_page_ctor(struct page *page)
1233 { 1235 {
1234 pte_lock_init(page); 1236 pte_lock_init(page);
1235 inc_zone_page_state(page, NR_PAGETABLE); 1237 inc_zone_page_state(page, NR_PAGETABLE);
1236 } 1238 }
1237 1239
1238 static inline void pgtable_page_dtor(struct page *page) 1240 static inline void pgtable_page_dtor(struct page *page)
1239 { 1241 {
1240 pte_lock_deinit(page); 1242 pte_lock_deinit(page);
1241 dec_zone_page_state(page, NR_PAGETABLE); 1243 dec_zone_page_state(page, NR_PAGETABLE);
1242 } 1244 }
1243 1245
1244 #define pte_offset_map_lock(mm, pmd, address, ptlp) \ 1246 #define pte_offset_map_lock(mm, pmd, address, ptlp) \
1245 ({ \ 1247 ({ \
1246 spinlock_t *__ptl = pte_lockptr(mm, pmd); \ 1248 spinlock_t *__ptl = pte_lockptr(mm, pmd); \
1247 pte_t *__pte = pte_offset_map(pmd, address); \ 1249 pte_t *__pte = pte_offset_map(pmd, address); \
1248 *(ptlp) = __ptl; \ 1250 *(ptlp) = __ptl; \
1249 spin_lock(__ptl); \ 1251 spin_lock(__ptl); \
1250 __pte; \ 1252 __pte; \
1251 }) 1253 })
1252 1254
1253 #define pte_unmap_unlock(pte, ptl) do { \ 1255 #define pte_unmap_unlock(pte, ptl) do { \
1254 spin_unlock(ptl); \ 1256 spin_unlock(ptl); \
1255 pte_unmap(pte); \ 1257 pte_unmap(pte); \
1256 } while (0) 1258 } while (0)
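The two macros above pair up into the standard pte access pattern: map and lock, inspect or modify, then unmap and unlock in one step. A sketch, assuming the pmd is already known to be present; the function name is invented.

static int example_pte_is_present(struct mm_struct *mm, pmd_t *pmd,
				  unsigned long addr)
{
	spinlock_t *ptl;
	pte_t *pte;
	int ret;

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	ret = pte_present(*pte);	/* any pte inspection goes here */
	pte_unmap_unlock(pte, ptl);
	return ret;
}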
1257 1259
1258 #define pte_alloc_map(mm, vma, pmd, address) \ 1260 #define pte_alloc_map(mm, vma, pmd, address) \
1259 ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \ 1261 ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \
1260 pmd, address))? \ 1262 pmd, address))? \
1261 NULL: pte_offset_map(pmd, address)) 1263 NULL: pte_offset_map(pmd, address))
1262 1264
1263 #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ 1265 #define pte_alloc_map_lock(mm, pmd, address, ptlp) \
1264 ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \ 1266 ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \
1265 pmd, address))? \ 1267 pmd, address))? \
1266 NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) 1268 NULL: pte_offset_map_lock(mm, pmd, address, ptlp))
1267 1269
1268 #define pte_alloc_kernel(pmd, address) \ 1270 #define pte_alloc_kernel(pmd, address) \
1269 ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ 1271 ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
1270 NULL: pte_offset_kernel(pmd, address)) 1272 NULL: pte_offset_kernel(pmd, address))
1271 1273
1272 extern void free_area_init(unsigned long * zones_size); 1274 extern void free_area_init(unsigned long * zones_size);
1273 extern void free_area_init_node(int nid, unsigned long * zones_size, 1275 extern void free_area_init_node(int nid, unsigned long * zones_size,
1274 unsigned long zone_start_pfn, unsigned long *zholes_size); 1276 unsigned long zone_start_pfn, unsigned long *zholes_size);
1275 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 1277 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
1276 /* 1278 /*
1277 * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its 1279 * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its
1278 * zones, allocate the backing mem_map and account for memory holes in a more 1280 * zones, allocate the backing mem_map and account for memory holes in a more
1279 * architecture independent manner. This is a substitute for creating the 1281 * architecture independent manner. This is a substitute for creating the
1280 * zone_sizes[] and zholes_size[] arrays and passing them to 1282 * zone_sizes[] and zholes_size[] arrays and passing them to
1281 * free_area_init_node() 1283 * free_area_init_node()
1282 * 1284 *
1283 * An architecture is expected to register range of page frames backed by 1285 * An architecture is expected to register range of page frames backed by
1284 * physical memory with add_active_range() before calling 1286 * physical memory with add_active_range() before calling
1285 * free_area_init_nodes() passing in the PFN each zone ends at. At a basic 1287 * free_area_init_nodes() passing in the PFN each zone ends at. At a basic
1286 * usage, an architecture is expected to do something like 1288 * usage, an architecture is expected to do something like
1287 * 1289 *
1288 * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, 1290 * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
1289 * max_highmem_pfn}; 1291 * max_highmem_pfn};
1290 * for_each_valid_physical_page_range() 1292 * for_each_valid_physical_page_range()
1291 * add_active_range(node_id, start_pfn, end_pfn) 1293 * add_active_range(node_id, start_pfn, end_pfn)
1292 * free_area_init_nodes(max_zone_pfns); 1294 * free_area_init_nodes(max_zone_pfns);
1293 * 1295 *
1294 * If the architecture guarantees that there are no holes in the ranges 1296 * If the architecture guarantees that there are no holes in the ranges
1295 * registered with add_active_range(), free_bootmem_active_regions() 1297 * registered with add_active_range(), free_bootmem_active_regions()
1296 * will call free_bootmem_node() for each registered physical page range. 1298 * will call free_bootmem_node() for each registered physical page range.
1297 * Similarly sparse_memory_present_with_active_regions() calls 1299 * Similarly sparse_memory_present_with_active_regions() calls
1298 * memory_present() for each range when SPARSEMEM is enabled. 1300 * memory_present() for each range when SPARSEMEM is enabled.
1299 * 1301 *
1300 * See mm/page_alloc.c for more information on each function exposed by 1302 * See mm/page_alloc.c for more information on each function exposed by
1301 * CONFIG_ARCH_POPULATES_NODE_MAP 1303 * CONFIG_ARCH_POPULATES_NODE_MAP
1302 */ 1304 */
1303 extern void free_area_init_nodes(unsigned long *max_zone_pfn); 1305 extern void free_area_init_nodes(unsigned long *max_zone_pfn);
1304 extern void add_active_range(unsigned int nid, unsigned long start_pfn, 1306 extern void add_active_range(unsigned int nid, unsigned long start_pfn,
1305 unsigned long end_pfn); 1307 unsigned long end_pfn);
1306 extern void remove_active_range(unsigned int nid, unsigned long start_pfn, 1308 extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
1307 unsigned long end_pfn); 1309 unsigned long end_pfn);
1308 extern void remove_all_active_ranges(void); 1310 extern void remove_all_active_ranges(void);
1309 void sort_node_map(void); 1311 void sort_node_map(void);
1310 unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, 1312 unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
1311 unsigned long end_pfn); 1313 unsigned long end_pfn);
1312 extern unsigned long absent_pages_in_range(unsigned long start_pfn, 1314 extern unsigned long absent_pages_in_range(unsigned long start_pfn,
1313 unsigned long end_pfn); 1315 unsigned long end_pfn);
1314 extern void get_pfn_range_for_nid(unsigned int nid, 1316 extern void get_pfn_range_for_nid(unsigned int nid,
1315 unsigned long *start_pfn, unsigned long *end_pfn); 1317 unsigned long *start_pfn, unsigned long *end_pfn);
1316 extern unsigned long find_min_pfn_with_active_regions(void); 1318 extern unsigned long find_min_pfn_with_active_regions(void);
1317 extern void free_bootmem_with_active_regions(int nid, 1319 extern void free_bootmem_with_active_regions(int nid,
1318 unsigned long max_low_pfn); 1320 unsigned long max_low_pfn);
1319 int add_from_early_node_map(struct range *range, int az, 1321 int add_from_early_node_map(struct range *range, int az,
1320 int nr_range, int nid); 1322 int nr_range, int nid);
1321 u64 __init find_memory_core_early(int nid, u64 size, u64 align, 1323 u64 __init find_memory_core_early(int nid, u64 size, u64 align,
1322 u64 goal, u64 limit); 1324 u64 goal, u64 limit);
1323 typedef int (*work_fn_t)(unsigned long, unsigned long, void *); 1325 typedef int (*work_fn_t)(unsigned long, unsigned long, void *);
1324 extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); 1326 extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data);
1325 extern void sparse_memory_present_with_active_regions(int nid); 1327 extern void sparse_memory_present_with_active_regions(int nid);
1326 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 1328 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
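A concretised version of the usage outlined in the comment above, for a hypothetical single-node architecture with one DMA and one normal zone. The pfn parameters and zone indices are placeholders (ZONE_DMA assumes CONFIG_ZONE_DMA); a real port derives them from its memory map.

static void __init example_arch_zone_init(unsigned long min_pfn,
					  unsigned long dma_limit_pfn,
					  unsigned long max_pfn)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

	max_zone_pfns[ZONE_DMA]    = dma_limit_pfn;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	/* one call per contiguous physical range backed by memory */
	add_active_range(0, min_pfn, max_pfn);

	free_area_init_nodes(max_zone_pfns);
}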
1327 1329
1328 #if !defined(CONFIG_ARCH_POPULATES_NODE_MAP) && \ 1330 #if !defined(CONFIG_ARCH_POPULATES_NODE_MAP) && \
1329 !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) 1331 !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID)
1330 static inline int __early_pfn_to_nid(unsigned long pfn) 1332 static inline int __early_pfn_to_nid(unsigned long pfn)
1331 { 1333 {
1332 return 0; 1334 return 0;
1333 } 1335 }
1334 #else 1336 #else
1335 /* please see mm/page_alloc.c */ 1337 /* please see mm/page_alloc.c */
1336 extern int __meminit early_pfn_to_nid(unsigned long pfn); 1338 extern int __meminit early_pfn_to_nid(unsigned long pfn);
1337 #ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 1339 #ifdef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
1338 /* there is a per-arch backend function. */ 1340 /* there is a per-arch backend function. */
1339 extern int __meminit __early_pfn_to_nid(unsigned long pfn); 1341 extern int __meminit __early_pfn_to_nid(unsigned long pfn);
1340 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 1342 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
1341 #endif 1343 #endif
1342 1344
1343 extern void set_dma_reserve(unsigned long new_dma_reserve); 1345 extern void set_dma_reserve(unsigned long new_dma_reserve);
1344 extern void memmap_init_zone(unsigned long, int, unsigned long, 1346 extern void memmap_init_zone(unsigned long, int, unsigned long,
1345 unsigned long, enum memmap_context); 1347 unsigned long, enum memmap_context);
1346 extern void setup_per_zone_wmarks(void); 1348 extern void setup_per_zone_wmarks(void);
1347 extern void calculate_zone_inactive_ratio(struct zone *zone); 1349 extern void calculate_zone_inactive_ratio(struct zone *zone);
1348 extern void mem_init(void); 1350 extern void mem_init(void);
1349 extern void __init mmap_init(void); 1351 extern void __init mmap_init(void);
1350 extern void show_mem(void); 1352 extern void show_mem(void);
1351 extern void si_meminfo(struct sysinfo * val); 1353 extern void si_meminfo(struct sysinfo * val);
1352 extern void si_meminfo_node(struct sysinfo *val, int nid); 1354 extern void si_meminfo_node(struct sysinfo *val, int nid);
1353 extern int after_bootmem; 1355 extern int after_bootmem;
1354 1356
1355 extern void setup_per_cpu_pageset(void); 1357 extern void setup_per_cpu_pageset(void);
1356 1358
1357 extern void zone_pcp_update(struct zone *zone); 1359 extern void zone_pcp_update(struct zone *zone);
1358 1360
1359 /* nommu.c */ 1361 /* nommu.c */
1360 extern atomic_long_t mmap_pages_allocated; 1362 extern atomic_long_t mmap_pages_allocated;
1361 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); 1363 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
1362 1364
1363 /* prio_tree.c */ 1365 /* prio_tree.c */
1364 void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old); 1366 void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
1365 void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *); 1367 void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
1366 void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *); 1368 void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *);
1367 struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, 1369 struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
1368 struct prio_tree_iter *iter); 1370 struct prio_tree_iter *iter);
1369 1371
1370 #define vma_prio_tree_foreach(vma, iter, root, begin, end) \ 1372 #define vma_prio_tree_foreach(vma, iter, root, begin, end) \
1371 for (prio_tree_iter_init(iter, root, begin, end), vma = NULL; \ 1373 for (prio_tree_iter_init(iter, root, begin, end), vma = NULL; \
1372 (vma = vma_prio_tree_next(vma, iter)); ) 1374 (vma = vma_prio_tree_next(vma, iter)); )
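A hedged sketch of the iteration pattern this macro provides, as rmap and truncation use it: visit every vma that maps a given page offset of a file. It assumes struct address_space (linux/fs.h) and that the caller already holds mapping->i_mmap_lock; the function name is illustrative.

static unsigned long example_count_sharers(struct address_space *mapping,
					   pgoff_t pgoff)
{
	struct vm_area_struct *vma;
	struct prio_tree_iter iter;
	unsigned long n = 0;

	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
		n++;			/* one hit per vma covering pgoff */
	return n;
}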
1373 1375
1374 static inline void vma_nonlinear_insert(struct vm_area_struct *vma, 1376 static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
1375 struct list_head *list) 1377 struct list_head *list)
1376 { 1378 {
1377 vma->shared.vm_set.parent = NULL; 1379 vma->shared.vm_set.parent = NULL;
1378 list_add_tail(&vma->shared.vm_set.list, list); 1380 list_add_tail(&vma->shared.vm_set.list, list);
1379 } 1381 }
1380 1382
1381 /* mmap.c */ 1383 /* mmap.c */
1382 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); 1384 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
1383 extern int vma_adjust(struct vm_area_struct *vma, unsigned long start, 1385 extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
1384 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert); 1386 unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert);
1385 extern struct vm_area_struct *vma_merge(struct mm_struct *, 1387 extern struct vm_area_struct *vma_merge(struct mm_struct *,
1386 struct vm_area_struct *prev, unsigned long addr, unsigned long end, 1388 struct vm_area_struct *prev, unsigned long addr, unsigned long end,
1387 unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, 1389 unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
1388 struct mempolicy *); 1390 struct mempolicy *);
1389 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); 1391 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
1390 extern int split_vma(struct mm_struct *, 1392 extern int split_vma(struct mm_struct *,
1391 struct vm_area_struct *, unsigned long addr, int new_below); 1393 struct vm_area_struct *, unsigned long addr, int new_below);
1392 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); 1394 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
1393 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, 1395 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
1394 struct rb_node **, struct rb_node *); 1396 struct rb_node **, struct rb_node *);
1395 extern void unlink_file_vma(struct vm_area_struct *); 1397 extern void unlink_file_vma(struct vm_area_struct *);
1396 extern struct vm_area_struct *copy_vma(struct vm_area_struct **, 1398 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
1397 unsigned long addr, unsigned long len, pgoff_t pgoff); 1399 unsigned long addr, unsigned long len, pgoff_t pgoff);
1398 extern void exit_mmap(struct mm_struct *); 1400 extern void exit_mmap(struct mm_struct *);
1399 1401
1400 extern int mm_take_all_locks(struct mm_struct *mm); 1402 extern int mm_take_all_locks(struct mm_struct *mm);
1401 extern void mm_drop_all_locks(struct mm_struct *mm); 1403 extern void mm_drop_all_locks(struct mm_struct *mm);
1402 1404
1403 #ifdef CONFIG_PROC_FS 1405 #ifdef CONFIG_PROC_FS
1404 /* From fs/proc/base.c. callers must _not_ hold the mm's exe_file_lock */ 1406 /* From fs/proc/base.c. callers must _not_ hold the mm's exe_file_lock */
1405 extern void added_exe_file_vma(struct mm_struct *mm); 1407 extern void added_exe_file_vma(struct mm_struct *mm);
1406 extern void removed_exe_file_vma(struct mm_struct *mm); 1408 extern void removed_exe_file_vma(struct mm_struct *mm);
1407 #else 1409 #else
1408 static inline void added_exe_file_vma(struct mm_struct *mm) 1410 static inline void added_exe_file_vma(struct mm_struct *mm)
1409 {} 1411 {}
1410 1412
1411 static inline void removed_exe_file_vma(struct mm_struct *mm) 1413 static inline void removed_exe_file_vma(struct mm_struct *mm)
1412 {} 1414 {}
1413 #endif /* CONFIG_PROC_FS */ 1415 #endif /* CONFIG_PROC_FS */
1414 1416
1415 extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); 1417 extern int may_expand_vm(struct mm_struct *mm, unsigned long npages);
1416 extern int install_special_mapping(struct mm_struct *mm, 1418 extern int install_special_mapping(struct mm_struct *mm,
1417 unsigned long addr, unsigned long len, 1419 unsigned long addr, unsigned long len,
1418 unsigned long flags, struct page **pages); 1420 unsigned long flags, struct page **pages);
1419 1421
1420 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); 1422 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
1421 1423
1422 extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, 1424 extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1423 unsigned long len, unsigned long prot, 1425 unsigned long len, unsigned long prot,
1424 unsigned long flag, unsigned long pgoff); 1426 unsigned long flag, unsigned long pgoff);
1425 extern unsigned long mmap_region(struct file *file, unsigned long addr, 1427 extern unsigned long mmap_region(struct file *file, unsigned long addr,
1426 unsigned long len, unsigned long flags, 1428 unsigned long len, unsigned long flags,
1427 unsigned int vm_flags, unsigned long pgoff); 1429 unsigned int vm_flags, unsigned long pgoff);
1428 1430
1429 static inline unsigned long do_mmap(struct file *file, unsigned long addr, 1431 static inline unsigned long do_mmap(struct file *file, unsigned long addr,
1430 unsigned long len, unsigned long prot, 1432 unsigned long len, unsigned long prot,
1431 unsigned long flag, unsigned long offset) 1433 unsigned long flag, unsigned long offset)
1432 { 1434 {
1433 unsigned long ret = -EINVAL; 1435 unsigned long ret = -EINVAL;
1434 if ((offset + PAGE_ALIGN(len)) < offset) 1436 if ((offset + PAGE_ALIGN(len)) < offset)
1435 goto out; 1437 goto out;
1436 if (!(offset & ~PAGE_MASK)) 1438 if (!(offset & ~PAGE_MASK))
1437 ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); 1439 ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
1438 out: 1440 out:
1439 return ret; 1441 return ret;
1440 } 1442 }
1441 1443
1442 extern int do_munmap(struct mm_struct *, unsigned long, size_t); 1444 extern int do_munmap(struct mm_struct *, unsigned long, size_t);
1443 1445
1444 extern unsigned long do_brk(unsigned long, unsigned long); 1446 extern unsigned long do_brk(unsigned long, unsigned long);
1445 1447
1446 /* filemap.c */ 1448 /* filemap.c */
1447 extern unsigned long page_unuse(struct page *); 1449 extern unsigned long page_unuse(struct page *);
1448 extern void truncate_inode_pages(struct address_space *, loff_t); 1450 extern void truncate_inode_pages(struct address_space *, loff_t);
1449 extern void truncate_inode_pages_range(struct address_space *, 1451 extern void truncate_inode_pages_range(struct address_space *,
1450 loff_t lstart, loff_t lend); 1452 loff_t lstart, loff_t lend);
1451 1453
1452 /* generic vm_area_ops exported for stackable file systems */ 1454 /* generic vm_area_ops exported for stackable file systems */
1453 extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); 1455 extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
1454 1456
1455 /* mm/page-writeback.c */ 1457 /* mm/page-writeback.c */
1456 int write_one_page(struct page *page, int wait); 1458 int write_one_page(struct page *page, int wait);
1457 void task_dirty_inc(struct task_struct *tsk); 1459 void task_dirty_inc(struct task_struct *tsk);
1458 1460
1459 /* readahead.c */ 1461 /* readahead.c */
1460 #define VM_MAX_READAHEAD 128 /* kbytes */ 1462 #define VM_MAX_READAHEAD 128 /* kbytes */
1461 #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ 1463 #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */
1462 1464
1463 int force_page_cache_readahead(struct address_space *mapping, struct file *filp, 1465 int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
1464 pgoff_t offset, unsigned long nr_to_read); 1466 pgoff_t offset, unsigned long nr_to_read);
1465 1467
1466 void page_cache_sync_readahead(struct address_space *mapping, 1468 void page_cache_sync_readahead(struct address_space *mapping,
1467 struct file_ra_state *ra, 1469 struct file_ra_state *ra,
1468 struct file *filp, 1470 struct file *filp,
1469 pgoff_t offset, 1471 pgoff_t offset,
1470 unsigned long size); 1472 unsigned long size);
1471 1473
1472 void page_cache_async_readahead(struct address_space *mapping, 1474 void page_cache_async_readahead(struct address_space *mapping,
1473 struct file_ra_state *ra, 1475 struct file_ra_state *ra,
1474 struct file *filp, 1476 struct file *filp,
1475 struct page *pg, 1477 struct page *pg,
1476 pgoff_t offset, 1478 pgoff_t offset,
1477 unsigned long size); 1479 unsigned long size);
1478 1480
1479 unsigned long max_sane_readahead(unsigned long nr); 1481 unsigned long max_sane_readahead(unsigned long nr);
1480 unsigned long ra_submit(struct file_ra_state *ra, 1482 unsigned long ra_submit(struct file_ra_state *ra,
1481 struct address_space *mapping, 1483 struct address_space *mapping,
1482 struct file *filp); 1484 struct file *filp);
1483 1485
1484 /* Do stack extension */ 1486 /* Do stack extension */
1485 extern int expand_stack(struct vm_area_struct *vma, unsigned long address); 1487 extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
1486 #if VM_GROWSUP 1488 #if VM_GROWSUP
1487 extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); 1489 extern int expand_upwards(struct vm_area_struct *vma, unsigned long address);
1488 #else 1490 #else
1489 #define expand_upwards(vma, address) do { } while (0) 1491 #define expand_upwards(vma, address) do { } while (0)
1490 #endif 1492 #endif
1491 extern int expand_stack_downwards(struct vm_area_struct *vma, 1493 extern int expand_stack_downwards(struct vm_area_struct *vma,
1492 unsigned long address); 1494 unsigned long address);
1493 1495
1494 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ 1496 /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
1495 extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); 1497 extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
1496 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, 1498 extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
1497 struct vm_area_struct **pprev); 1499 struct vm_area_struct **pprev);
1498 1500
1499 /* Look up the first VMA which intersects the interval start_addr..end_addr-1, 1501 /* Look up the first VMA which intersects the interval start_addr..end_addr-1,
1500 NULL if none. Assume start_addr < end_addr. */ 1502 NULL if none. Assume start_addr < end_addr. */
1501 static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) 1503 static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
1502 { 1504 {
1503 struct vm_area_struct * vma = find_vma(mm,start_addr); 1505 struct vm_area_struct * vma = find_vma(mm,start_addr);
1504 1506
1505 if (vma && end_addr <= vma->vm_start) 1507 if (vma && end_addr <= vma->vm_start)
1506 vma = NULL; 1508 vma = NULL;
1507 return vma; 1509 return vma;
1508 } 1510 }
1509 1511
1510 static inline unsigned long vma_pages(struct vm_area_struct *vma) 1512 static inline unsigned long vma_pages(struct vm_area_struct *vma)
1511 { 1513 {
1512 return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 1514 return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
1513 } 1515 }
1514 1516
1515 #ifdef CONFIG_MMU 1517 #ifdef CONFIG_MMU
1516 pgprot_t vm_get_page_prot(unsigned long vm_flags); 1518 pgprot_t vm_get_page_prot(unsigned long vm_flags);
1517 #else 1519 #else
1518 static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) 1520 static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
1519 { 1521 {
1520 return __pgprot(0); 1522 return __pgprot(0);
1521 } 1523 }
1522 #endif 1524 #endif
1523 1525
1524 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); 1526 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
1525 int remap_pfn_range(struct vm_area_struct *, unsigned long addr, 1527 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
1526 unsigned long pfn, unsigned long size, pgprot_t); 1528 unsigned long pfn, unsigned long size, pgprot_t);
1527 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); 1529 int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
1528 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1530 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1529 unsigned long pfn); 1531 unsigned long pfn);
1530 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 1532 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1531 unsigned long pfn); 1533 unsigned long pfn);
1532 1534
1533 struct page *follow_page(struct vm_area_struct *, unsigned long address, 1535 struct page *follow_page(struct vm_area_struct *, unsigned long address,
1534 unsigned int foll_flags); 1536 unsigned int foll_flags);
1535 #define FOLL_WRITE 0x01 /* check pte is writable */ 1537 #define FOLL_WRITE 0x01 /* check pte is writable */
1536 #define FOLL_TOUCH 0x02 /* mark page accessed */ 1538 #define FOLL_TOUCH 0x02 /* mark page accessed */
1537 #define FOLL_GET 0x04 /* do get_page on page */ 1539 #define FOLL_GET 0x04 /* do get_page on page */
1538 #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ 1540 #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */
1539 #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ 1541 #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
1540 #define FOLL_MLOCK 0x40 /* mark page as mlocked */ 1542 #define FOLL_MLOCK 0x40 /* mark page as mlocked */
1541 #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ 1543 #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
1542 #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ 1544 #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
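
As a hedged illustration of how the FOLL_* flags combine with follow_page() (not part of this commit): take a reference on the page backing one user address, assuming mmap_sem is already held for reading. The function name peek_user_page() is made up.

	/* Illustrative sketch: the caller holds mm->mmap_sem for read and must
	 * put_page() the result.  Returns NULL when nothing usable is mapped. */
	static struct page *peek_user_page(struct vm_area_struct *vma,
					   unsigned long addr)
	{
		/* FOLL_GET takes a reference; FOLL_TOUCH marks the page accessed. */
		struct page *page = follow_page(vma, addr, FOLL_GET | FOLL_TOUCH);

		if (!page || IS_ERR(page))
			return NULL;
		return page;	/* release with put_page() when done */
	}
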
1543 1545
1544 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, 1546 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
1545 void *data); 1547 void *data);
1546 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, 1548 extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
1547 unsigned long size, pte_fn_t fn, void *data); 1549 unsigned long size, pte_fn_t fn, void *data);
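
To make the pte_fn_t contract concrete, a hedged sketch of a callback handed to apply_to_page_range(); it only reads each pte slot. The names count_present_pte() and count_present_ptes() are hypothetical, not kernel API.

	/* Illustrative callback: invoked once per pte slot in [addr, addr + size). */
	static int count_present_pte(pte_t *pte, pgtable_t token,
				     unsigned long addr, void *data)
	{
		unsigned long *count = data;

		if (pte_present(*pte))
			(*count)++;
		return 0;		/* returning non-zero aborts the walk */
	}

	static unsigned long count_present_ptes(struct mm_struct *mm,
						unsigned long addr,
						unsigned long size)
	{
		unsigned long count = 0;

		/* Note: apply_to_page_range() allocates missing page tables on the
		 * way down, so it is normally used on kernel/vmalloc-style ranges. */
		apply_to_page_range(mm, addr, size, count_present_pte, &count);
		return count;
	}
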
1548 1550
1549 #ifdef CONFIG_PROC_FS 1551 #ifdef CONFIG_PROC_FS
1550 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); 1552 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
1551 #else 1553 #else
1552 static inline void vm_stat_account(struct mm_struct *mm, 1554 static inline void vm_stat_account(struct mm_struct *mm,
1553 unsigned long flags, struct file *file, long pages) 1555 unsigned long flags, struct file *file, long pages)
1554 { 1556 {
1555 } 1557 }
1556 #endif /* CONFIG_PROC_FS */ 1558 #endif /* CONFIG_PROC_FS */
1557 1559
1558 #ifdef CONFIG_DEBUG_PAGEALLOC 1560 #ifdef CONFIG_DEBUG_PAGEALLOC
1559 extern int debug_pagealloc_enabled; 1561 extern int debug_pagealloc_enabled;
1560 1562
1561 extern void kernel_map_pages(struct page *page, int numpages, int enable); 1563 extern void kernel_map_pages(struct page *page, int numpages, int enable);
1562 1564
1563 static inline void enable_debug_pagealloc(void) 1565 static inline void enable_debug_pagealloc(void)
1564 { 1566 {
1565 debug_pagealloc_enabled = 1; 1567 debug_pagealloc_enabled = 1;
1566 } 1568 }
1567 #ifdef CONFIG_HIBERNATION 1569 #ifdef CONFIG_HIBERNATION
1568 extern bool kernel_page_present(struct page *page); 1570 extern bool kernel_page_present(struct page *page);
1569 #endif /* CONFIG_HIBERNATION */ 1571 #endif /* CONFIG_HIBERNATION */
1570 #else 1572 #else
1571 static inline void 1573 static inline void
1572 kernel_map_pages(struct page *page, int numpages, int enable) {} 1574 kernel_map_pages(struct page *page, int numpages, int enable) {}
1573 static inline void enable_debug_pagealloc(void) 1575 static inline void enable_debug_pagealloc(void)
1574 { 1576 {
1575 } 1577 }
1576 #ifdef CONFIG_HIBERNATION 1578 #ifdef CONFIG_HIBERNATION
1577 static inline bool kernel_page_present(struct page *page) { return true; } 1579 static inline bool kernel_page_present(struct page *page) { return true; }
1578 #endif /* CONFIG_HIBERNATION */ 1580 #endif /* CONFIG_HIBERNATION */
1579 #endif 1581 #endif
1580 1582
1581 extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); 1583 extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
1582 #ifdef __HAVE_ARCH_GATE_AREA 1584 #ifdef __HAVE_ARCH_GATE_AREA
1583 int in_gate_area_no_mm(unsigned long addr); 1585 int in_gate_area_no_mm(unsigned long addr);
1584 int in_gate_area(struct mm_struct *mm, unsigned long addr); 1586 int in_gate_area(struct mm_struct *mm, unsigned long addr);
1585 #else 1587 #else
1586 int in_gate_area_no_mm(unsigned long addr); 1588 int in_gate_area_no_mm(unsigned long addr);
1587 #define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_mm(addr);}) 1589 #define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_mm(addr);})
1588 #endif /* __HAVE_ARCH_GATE_AREA */ 1590 #endif /* __HAVE_ARCH_GATE_AREA */
1589 1591
1590 int drop_caches_sysctl_handler(struct ctl_table *, int, 1592 int drop_caches_sysctl_handler(struct ctl_table *, int,
1591 void __user *, size_t *, loff_t *); 1593 void __user *, size_t *, loff_t *);
1592 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, 1594 unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
1593 unsigned long lru_pages); 1595 unsigned long lru_pages);
1594 1596
1595 #ifndef CONFIG_MMU 1597 #ifndef CONFIG_MMU
1596 #define randomize_va_space 0 1598 #define randomize_va_space 0
1597 #else 1599 #else
1598 extern int randomize_va_space; 1600 extern int randomize_va_space;
1599 #endif 1601 #endif
1600 1602
1601 const char * arch_vma_name(struct vm_area_struct *vma); 1603 const char * arch_vma_name(struct vm_area_struct *vma);
1602 void print_vma_addr(char *prefix, unsigned long rip); 1604 void print_vma_addr(char *prefix, unsigned long rip);
1603 1605
1604 void sparse_mem_maps_populate_node(struct page **map_map, 1606 void sparse_mem_maps_populate_node(struct page **map_map,
1605 unsigned long pnum_begin, 1607 unsigned long pnum_begin,
1606 unsigned long pnum_end, 1608 unsigned long pnum_end,
1607 unsigned long map_count, 1609 unsigned long map_count,
1608 int nodeid); 1610 int nodeid);
1609 1611
1610 struct page *sparse_mem_map_populate(unsigned long pnum, int nid); 1612 struct page *sparse_mem_map_populate(unsigned long pnum, int nid);
1611 pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); 1613 pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
1612 pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node); 1614 pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node);
1613 pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); 1615 pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
1614 pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); 1616 pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node);
1615 void *vmemmap_alloc_block(unsigned long size, int node); 1617 void *vmemmap_alloc_block(unsigned long size, int node);
1616 void *vmemmap_alloc_block_buf(unsigned long size, int node); 1618 void *vmemmap_alloc_block_buf(unsigned long size, int node);
1617 void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); 1619 void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
1618 int vmemmap_populate_basepages(struct page *start_page, 1620 int vmemmap_populate_basepages(struct page *start_page,
1619 unsigned long pages, int node); 1621 unsigned long pages, int node);
1620 int vmemmap_populate(struct page *start_page, unsigned long pages, int node); 1622 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
1621 void vmemmap_populate_print_last(void); 1623 void vmemmap_populate_print_last(void);
1622 1624
1623 1625
1624 enum mf_flags { 1626 enum mf_flags {
1625 MF_COUNT_INCREASED = 1 << 0, 1627 MF_COUNT_INCREASED = 1 << 0,
1626 }; 1628 };
1627 extern void memory_failure(unsigned long pfn, int trapno); 1629 extern void memory_failure(unsigned long pfn, int trapno);
1628 extern int __memory_failure(unsigned long pfn, int trapno, int flags); 1630 extern int __memory_failure(unsigned long pfn, int trapno, int flags);
1629 extern int unpoison_memory(unsigned long pfn); 1631 extern int unpoison_memory(unsigned long pfn);
1630 extern int sysctl_memory_failure_early_kill; 1632 extern int sysctl_memory_failure_early_kill;
1631 extern int sysctl_memory_failure_recovery; 1633 extern int sysctl_memory_failure_recovery;
1632 extern void shake_page(struct page *p, int access); 1634 extern void shake_page(struct page *p, int access);
1633 extern atomic_long_t mce_bad_pages; 1635 extern atomic_long_t mce_bad_pages;
1634 extern int soft_offline_page(struct page *page, int flags); 1636 extern int soft_offline_page(struct page *page, int flags);
1635 1637
1636 extern void dump_page(struct page *page); 1638 extern void dump_page(struct page *page);
1637 1639
1638 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) 1640 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
1639 extern void clear_huge_page(struct page *page, 1641 extern void clear_huge_page(struct page *page,
1640 unsigned long addr, 1642 unsigned long addr,
1641 unsigned int pages_per_huge_page); 1643 unsigned int pages_per_huge_page);
1642 extern void copy_user_huge_page(struct page *dst, struct page *src, 1644 extern void copy_user_huge_page(struct page *dst, struct page *src,
1643 unsigned long addr, struct vm_area_struct *vma, 1645 unsigned long addr, struct vm_area_struct *vma,
1644 unsigned int pages_per_huge_page); 1646 unsigned int pages_per_huge_page);
1645 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 1647 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
1646 1648
1647 #endif /* __KERNEL__ */ 1649 #endif /* __KERNEL__ */
1648 #endif /* _LINUX_MM_H */ 1650 #endif /* _LINUX_MM_H */
1649 1651
1 /* 1 /*
2 * linux/mm/memory.c 2 * linux/mm/memory.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
5 */ 5 */
6 6
7 /* 7 /*
8 * demand-loading started 01.12.91 - seems it is high on the list of 8 * demand-loading started 01.12.91 - seems it is high on the list of
9 * things wanted, and it should be easy to implement. - Linus 9 * things wanted, and it should be easy to implement. - Linus
10 */ 10 */
11 11
12 /* 12 /*
13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared 13 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
14 * pages started 02.12.91, seems to work. - Linus. 14 * pages started 02.12.91, seems to work. - Linus.
15 * 15 *
16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it 16 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
17 * would have taken more than the 6M I have free, but it worked well as 17 * would have taken more than the 6M I have free, but it worked well as
18 * far as I could see. 18 * far as I could see.
19 * 19 *
20 * Also corrected some "invalidate()"s - I wasn't doing enough of them. 20 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
21 */ 21 */
22 22
23 /* 23 /*
24 * Real VM (paging to/from disk) started 18.12.91. Much more work and 24 * Real VM (paging to/from disk) started 18.12.91. Much more work and
25 * thought has to go into this. Oh, well.. 25 * thought has to go into this. Oh, well..
26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. 26 * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why.
27 * Found it. Everything seems to work now. 27 * Found it. Everything seems to work now.
28 * 20.12.91 - Ok, making the swap-device changeable like the root. 28 * 20.12.91 - Ok, making the swap-device changeable like the root.
29 */ 29 */
30 30
31 /* 31 /*
32 * 05.04.94 - Multi-page memory management added for v1.1. 32 * 05.04.94 - Multi-page memory management added for v1.1.
33 * Idea by Alex Bligh (alex@cconcepts.co.uk) 33 * Idea by Alex Bligh (alex@cconcepts.co.uk)
34 * 34 *
35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG 35 * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG
36 * (Gerhard.Wichert@pdb.siemens.de) 36 * (Gerhard.Wichert@pdb.siemens.de)
37 * 37 *
38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) 38 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
39 */ 39 */
40 40
41 #include <linux/kernel_stat.h> 41 #include <linux/kernel_stat.h>
42 #include <linux/mm.h> 42 #include <linux/mm.h>
43 #include <linux/hugetlb.h> 43 #include <linux/hugetlb.h>
44 #include <linux/mman.h> 44 #include <linux/mman.h>
45 #include <linux/swap.h> 45 #include <linux/swap.h>
46 #include <linux/highmem.h> 46 #include <linux/highmem.h>
47 #include <linux/pagemap.h> 47 #include <linux/pagemap.h>
48 #include <linux/ksm.h> 48 #include <linux/ksm.h>
49 #include <linux/rmap.h> 49 #include <linux/rmap.h>
50 #include <linux/module.h> 50 #include <linux/module.h>
51 #include <linux/delayacct.h> 51 #include <linux/delayacct.h>
52 #include <linux/init.h> 52 #include <linux/init.h>
53 #include <linux/writeback.h> 53 #include <linux/writeback.h>
54 #include <linux/memcontrol.h> 54 #include <linux/memcontrol.h>
55 #include <linux/mmu_notifier.h> 55 #include <linux/mmu_notifier.h>
56 #include <linux/kallsyms.h> 56 #include <linux/kallsyms.h>
57 #include <linux/swapops.h> 57 #include <linux/swapops.h>
58 #include <linux/elf.h> 58 #include <linux/elf.h>
59 #include <linux/gfp.h> 59 #include <linux/gfp.h>
60 60
61 #include <asm/io.h> 61 #include <asm/io.h>
62 #include <asm/pgalloc.h> 62 #include <asm/pgalloc.h>
63 #include <asm/uaccess.h> 63 #include <asm/uaccess.h>
64 #include <asm/tlb.h> 64 #include <asm/tlb.h>
65 #include <asm/tlbflush.h> 65 #include <asm/tlbflush.h>
66 #include <asm/pgtable.h> 66 #include <asm/pgtable.h>
67 67
68 #include "internal.h" 68 #include "internal.h"
69 69
70 #ifndef CONFIG_NEED_MULTIPLE_NODES 70 #ifndef CONFIG_NEED_MULTIPLE_NODES
71 /* use the per-pgdat data instead for discontigmem - mbligh */ 71 /* use the per-pgdat data instead for discontigmem - mbligh */
72 unsigned long max_mapnr; 72 unsigned long max_mapnr;
73 struct page *mem_map; 73 struct page *mem_map;
74 74
75 EXPORT_SYMBOL(max_mapnr); 75 EXPORT_SYMBOL(max_mapnr);
76 EXPORT_SYMBOL(mem_map); 76 EXPORT_SYMBOL(mem_map);
77 #endif 77 #endif
78 78
79 unsigned long num_physpages; 79 unsigned long num_physpages;
80 /* 80 /*
81 * A number of key systems in x86 including ioremap() rely on the assumption 81 * A number of key systems in x86 including ioremap() rely on the assumption
82 * that high_memory defines the upper bound on direct map memory, then end 82 * that high_memory defines the upper bound on direct map memory, then end
83 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and 83 * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and
84 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL 84 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL
85 * and ZONE_HIGHMEM. 85 * and ZONE_HIGHMEM.
86 */ 86 */
87 void * high_memory; 87 void * high_memory;
88 88
89 EXPORT_SYMBOL(num_physpages); 89 EXPORT_SYMBOL(num_physpages);
90 EXPORT_SYMBOL(high_memory); 90 EXPORT_SYMBOL(high_memory);
91 91
92 /* 92 /*
93 * Randomize the address space (stacks, mmaps, brk, etc.). 93 * Randomize the address space (stacks, mmaps, brk, etc.).
94 * 94 *
95 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, 95 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
96 * as ancient (libc5 based) binaries can segfault. ) 96 * as ancient (libc5 based) binaries can segfault. )
97 */ 97 */
98 int randomize_va_space __read_mostly = 98 int randomize_va_space __read_mostly =
99 #ifdef CONFIG_COMPAT_BRK 99 #ifdef CONFIG_COMPAT_BRK
100 1; 100 1;
101 #else 101 #else
102 2; 102 2;
103 #endif 103 #endif
104 104
105 static int __init disable_randmaps(char *s) 105 static int __init disable_randmaps(char *s)
106 { 106 {
107 randomize_va_space = 0; 107 randomize_va_space = 0;
108 return 1; 108 return 1;
109 } 109 }
110 __setup("norandmaps", disable_randmaps); 110 __setup("norandmaps", disable_randmaps);
111 111
112 unsigned long zero_pfn __read_mostly; 112 unsigned long zero_pfn __read_mostly;
113 unsigned long highest_memmap_pfn __read_mostly; 113 unsigned long highest_memmap_pfn __read_mostly;
114 114
115 /* 115 /*
116 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() 116 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
117 */ 117 */
118 static int __init init_zero_pfn(void) 118 static int __init init_zero_pfn(void)
119 { 119 {
120 zero_pfn = page_to_pfn(ZERO_PAGE(0)); 120 zero_pfn = page_to_pfn(ZERO_PAGE(0));
121 return 0; 121 return 0;
122 } 122 }
123 core_initcall(init_zero_pfn); 123 core_initcall(init_zero_pfn);
124 124
125 125
126 #if defined(SPLIT_RSS_COUNTING) 126 #if defined(SPLIT_RSS_COUNTING)
127 127
128 static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) 128 static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
129 { 129 {
130 int i; 130 int i;
131 131
132 for (i = 0; i < NR_MM_COUNTERS; i++) { 132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (task->rss_stat.count[i]) { 133 if (task->rss_stat.count[i]) {
134 add_mm_counter(mm, i, task->rss_stat.count[i]); 134 add_mm_counter(mm, i, task->rss_stat.count[i]);
135 task->rss_stat.count[i] = 0; 135 task->rss_stat.count[i] = 0;
136 } 136 }
137 } 137 }
138 task->rss_stat.events = 0; 138 task->rss_stat.events = 0;
139 } 139 }
140 140
141 static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) 141 static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
142 { 142 {
143 struct task_struct *task = current; 143 struct task_struct *task = current;
144 144
145 if (likely(task->mm == mm)) 145 if (likely(task->mm == mm))
146 task->rss_stat.count[member] += val; 146 task->rss_stat.count[member] += val;
147 else 147 else
148 add_mm_counter(mm, member, val); 148 add_mm_counter(mm, member, val);
149 } 149 }
150 #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) 150 #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
151 #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) 151 #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
152 152
153 /* sync counter once per 64 page faults */ 153 /* sync counter once per 64 page faults */
154 #define TASK_RSS_EVENTS_THRESH (64) 154 #define TASK_RSS_EVENTS_THRESH (64)
155 static void check_sync_rss_stat(struct task_struct *task) 155 static void check_sync_rss_stat(struct task_struct *task)
156 { 156 {
157 if (unlikely(task != current)) 157 if (unlikely(task != current))
158 return; 158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) 159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 __sync_task_rss_stat(task, task->mm); 160 __sync_task_rss_stat(task, task->mm);
161 } 161 }
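
The scheme above (per-task counters folded into the shared mm counters roughly once every TASK_RSS_EVENTS_THRESH events) can be modelled with a small userspace analogy. The sketch below is illustrative only; every name in it is made up and it is not kernel code.

	#include <stdio.h>

	#define EVENTS_THRESH 64

	struct shared { long counter; };		/* stands in for mm->rss_stat   */
	struct local  { long cached; int events; };	/* stands in for task->rss_stat */

	/* Fast path: batch updates locally, flush to the shared counter rarely. */
	static void fast_add(struct shared *s, struct local *l, long val)
	{
		l->cached += val;
		if (++l->events > EVENTS_THRESH) {	/* sync about once per 64 events */
			s->counter += l->cached;
			l->cached = 0;
			l->events = 0;
		}
	}

	int main(void)
	{
		struct shared s = { 0 };
		struct local l = { 0, 0 };
		int i;

		for (i = 0; i < 200; i++)
			fast_add(&s, &l, 1);
		/* The shared counter lags by whatever is still cached locally. */
		printf("shared=%ld cached=%ld\n", s.counter, l.cached);
		return 0;
	}
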
162 162
163 unsigned long get_mm_counter(struct mm_struct *mm, int member) 163 unsigned long get_mm_counter(struct mm_struct *mm, int member)
164 { 164 {
165 long val = 0; 165 long val = 0;
166 166
167 /* 167 /*
168 * Don't use task->mm here, to avoid the need for task_get_mm().. 168 * Don't use task->mm here, to avoid the need for task_get_mm()..
169 * The caller must guarantee that task->mm is still valid. 169 * The caller must guarantee that task->mm is still valid.
170 */ 170 */
171 val = atomic_long_read(&mm->rss_stat.count[member]); 171 val = atomic_long_read(&mm->rss_stat.count[member]);
172 /* 172 /*
173 * The counter is updated asynchronously and may temporarily go negative. 173 * The counter is updated asynchronously and may temporarily go negative.
174 * A negative value is never what callers expect, so clamp it to zero. 174 * A negative value is never what callers expect, so clamp it to zero.
175 */ 175 */
176 if (val < 0) 176 if (val < 0)
177 return 0; 177 return 0;
178 return (unsigned long)val; 178 return (unsigned long)val;
179 } 179 }
180 180
181 void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) 181 void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
182 { 182 {
183 __sync_task_rss_stat(task, mm); 183 __sync_task_rss_stat(task, mm);
184 } 184 }
185 #else 185 #else
186 186
187 #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) 187 #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
188 #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) 188 #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
189 189
190 static void check_sync_rss_stat(struct task_struct *task) 190 static void check_sync_rss_stat(struct task_struct *task)
191 { 191 {
192 } 192 }
193 193
194 #endif 194 #endif
195 195
196 /* 196 /*
197 * If a p?d_bad entry is found while walking page tables, report 197 * If a p?d_bad entry is found while walking page tables, report
198 * the error, before resetting entry to p?d_none. Usually (but 198 * the error, before resetting entry to p?d_none. Usually (but
199 * very seldom) called out from the p?d_none_or_clear_bad macros. 199 * very seldom) called out from the p?d_none_or_clear_bad macros.
200 */ 200 */
201 201
202 void pgd_clear_bad(pgd_t *pgd) 202 void pgd_clear_bad(pgd_t *pgd)
203 { 203 {
204 pgd_ERROR(*pgd); 204 pgd_ERROR(*pgd);
205 pgd_clear(pgd); 205 pgd_clear(pgd);
206 } 206 }
207 207
208 void pud_clear_bad(pud_t *pud) 208 void pud_clear_bad(pud_t *pud)
209 { 209 {
210 pud_ERROR(*pud); 210 pud_ERROR(*pud);
211 pud_clear(pud); 211 pud_clear(pud);
212 } 212 }
213 213
214 void pmd_clear_bad(pmd_t *pmd) 214 void pmd_clear_bad(pmd_t *pmd)
215 { 215 {
216 pmd_ERROR(*pmd); 216 pmd_ERROR(*pmd);
217 pmd_clear(pmd); 217 pmd_clear(pmd);
218 } 218 }
219 219
220 /* 220 /*
221 * Note: this doesn't free the actual pages themselves. That 221 * Note: this doesn't free the actual pages themselves. That
222 * has been handled earlier when unmapping all the memory regions. 222 * has been handled earlier when unmapping all the memory regions.
223 */ 223 */
224 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, 224 static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
225 unsigned long addr) 225 unsigned long addr)
226 { 226 {
227 pgtable_t token = pmd_pgtable(*pmd); 227 pgtable_t token = pmd_pgtable(*pmd);
228 pmd_clear(pmd); 228 pmd_clear(pmd);
229 pte_free_tlb(tlb, token, addr); 229 pte_free_tlb(tlb, token, addr);
230 tlb->mm->nr_ptes--; 230 tlb->mm->nr_ptes--;
231 } 231 }
232 232
233 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, 233 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
234 unsigned long addr, unsigned long end, 234 unsigned long addr, unsigned long end,
235 unsigned long floor, unsigned long ceiling) 235 unsigned long floor, unsigned long ceiling)
236 { 236 {
237 pmd_t *pmd; 237 pmd_t *pmd;
238 unsigned long next; 238 unsigned long next;
239 unsigned long start; 239 unsigned long start;
240 240
241 start = addr; 241 start = addr;
242 pmd = pmd_offset(pud, addr); 242 pmd = pmd_offset(pud, addr);
243 do { 243 do {
244 next = pmd_addr_end(addr, end); 244 next = pmd_addr_end(addr, end);
245 if (pmd_none_or_clear_bad(pmd)) 245 if (pmd_none_or_clear_bad(pmd))
246 continue; 246 continue;
247 free_pte_range(tlb, pmd, addr); 247 free_pte_range(tlb, pmd, addr);
248 } while (pmd++, addr = next, addr != end); 248 } while (pmd++, addr = next, addr != end);
249 249
250 start &= PUD_MASK; 250 start &= PUD_MASK;
251 if (start < floor) 251 if (start < floor)
252 return; 252 return;
253 if (ceiling) { 253 if (ceiling) {
254 ceiling &= PUD_MASK; 254 ceiling &= PUD_MASK;
255 if (!ceiling) 255 if (!ceiling)
256 return; 256 return;
257 } 257 }
258 if (end - 1 > ceiling - 1) 258 if (end - 1 > ceiling - 1)
259 return; 259 return;
260 260
261 pmd = pmd_offset(pud, start); 261 pmd = pmd_offset(pud, start);
262 pud_clear(pud); 262 pud_clear(pud);
263 pmd_free_tlb(tlb, pmd, start); 263 pmd_free_tlb(tlb, pmd, start);
264 } 264 }
265 265
266 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 266 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
267 unsigned long addr, unsigned long end, 267 unsigned long addr, unsigned long end,
268 unsigned long floor, unsigned long ceiling) 268 unsigned long floor, unsigned long ceiling)
269 { 269 {
270 pud_t *pud; 270 pud_t *pud;
271 unsigned long next; 271 unsigned long next;
272 unsigned long start; 272 unsigned long start;
273 273
274 start = addr; 274 start = addr;
275 pud = pud_offset(pgd, addr); 275 pud = pud_offset(pgd, addr);
276 do { 276 do {
277 next = pud_addr_end(addr, end); 277 next = pud_addr_end(addr, end);
278 if (pud_none_or_clear_bad(pud)) 278 if (pud_none_or_clear_bad(pud))
279 continue; 279 continue;
280 free_pmd_range(tlb, pud, addr, next, floor, ceiling); 280 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
281 } while (pud++, addr = next, addr != end); 281 } while (pud++, addr = next, addr != end);
282 282
283 start &= PGDIR_MASK; 283 start &= PGDIR_MASK;
284 if (start < floor) 284 if (start < floor)
285 return; 285 return;
286 if (ceiling) { 286 if (ceiling) {
287 ceiling &= PGDIR_MASK; 287 ceiling &= PGDIR_MASK;
288 if (!ceiling) 288 if (!ceiling)
289 return; 289 return;
290 } 290 }
291 if (end - 1 > ceiling - 1) 291 if (end - 1 > ceiling - 1)
292 return; 292 return;
293 293
294 pud = pud_offset(pgd, start); 294 pud = pud_offset(pgd, start);
295 pgd_clear(pgd); 295 pgd_clear(pgd);
296 pud_free_tlb(tlb, pud, start); 296 pud_free_tlb(tlb, pud, start);
297 } 297 }
298 298
299 /* 299 /*
300 * This function frees user-level page tables of a process. 300 * This function frees user-level page tables of a process.
301 * 301 *
302 * Must be called with pagetable lock held. 302 * Must be called with pagetable lock held.
303 */ 303 */
304 void free_pgd_range(struct mmu_gather *tlb, 304 void free_pgd_range(struct mmu_gather *tlb,
305 unsigned long addr, unsigned long end, 305 unsigned long addr, unsigned long end,
306 unsigned long floor, unsigned long ceiling) 306 unsigned long floor, unsigned long ceiling)
307 { 307 {
308 pgd_t *pgd; 308 pgd_t *pgd;
309 unsigned long next; 309 unsigned long next;
310 310
311 /* 311 /*
312 * The next few lines have given us lots of grief... 312 * The next few lines have given us lots of grief...
313 * 313 *
314 * Why are we testing PMD* at this top level? Because often 314 * Why are we testing PMD* at this top level? Because often
315 * there will be no work to do at all, and we'd prefer not to 315 * there will be no work to do at all, and we'd prefer not to
316 * go all the way down to the bottom just to discover that. 316 * go all the way down to the bottom just to discover that.
317 * 317 *
318 * Why all these "- 1"s? Because 0 represents both the bottom 318 * Why all these "- 1"s? Because 0 represents both the bottom
319 * of the address space and the top of it (using -1 for the 319 * of the address space and the top of it (using -1 for the
320 * top wouldn't help much: the masks would do the wrong thing). 320 * top wouldn't help much: the masks would do the wrong thing).
321 * The rule is that addr 0 and floor 0 refer to the bottom of 321 * The rule is that addr 0 and floor 0 refer to the bottom of
322 * the address space, but end 0 and ceiling 0 refer to the top 322 * the address space, but end 0 and ceiling 0 refer to the top
323 * Comparisons need to use "end - 1" and "ceiling - 1" (though 323 * Comparisons need to use "end - 1" and "ceiling - 1" (though
324 * that end 0 case should be mythical). 324 * that end 0 case should be mythical).
325 * 325 *
326 * Wherever addr is brought up or ceiling brought down, we must 326 * Wherever addr is brought up or ceiling brought down, we must
327 * be careful to reject "the opposite 0" before it confuses the 327 * be careful to reject "the opposite 0" before it confuses the
328 * subsequent tests. But what about where end is brought down 328 * subsequent tests. But what about where end is brought down
329 * by PMD_SIZE below? no, end can't go down to 0 there. 329 * by PMD_SIZE below? no, end can't go down to 0 there.
330 * 330 *
331 * Whereas we round start (addr) and ceiling down, by different 331 * Whereas we round start (addr) and ceiling down, by different
332 * masks at different levels, in order to test whether a table 332 * masks at different levels, in order to test whether a table
333 * now has no other vmas using it, so can be freed, we don't 333 * now has no other vmas using it, so can be freed, we don't
334 * bother to round floor or end up - the tests don't need that. 334 * bother to round floor or end up - the tests don't need that.
335 */ 335 */
336 336
337 addr &= PMD_MASK; 337 addr &= PMD_MASK;
338 if (addr < floor) { 338 if (addr < floor) {
339 addr += PMD_SIZE; 339 addr += PMD_SIZE;
340 if (!addr) 340 if (!addr)
341 return; 341 return;
342 } 342 }
343 if (ceiling) { 343 if (ceiling) {
344 ceiling &= PMD_MASK; 344 ceiling &= PMD_MASK;
345 if (!ceiling) 345 if (!ceiling)
346 return; 346 return;
347 } 347 }
348 if (end - 1 > ceiling - 1) 348 if (end - 1 > ceiling - 1)
349 end -= PMD_SIZE; 349 end -= PMD_SIZE;
350 if (addr > end - 1) 350 if (addr > end - 1)
351 return; 351 return;
352 352
353 pgd = pgd_offset(tlb->mm, addr); 353 pgd = pgd_offset(tlb->mm, addr);
354 do { 354 do {
355 next = pgd_addr_end(addr, end); 355 next = pgd_addr_end(addr, end);
356 if (pgd_none_or_clear_bad(pgd)) 356 if (pgd_none_or_clear_bad(pgd))
357 continue; 357 continue;
358 free_pud_range(tlb, pgd, addr, next, floor, ceiling); 358 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
359 } while (pgd++, addr = next, addr != end); 359 } while (pgd++, addr = next, addr != end);
360 } 360 }
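
The "- 1" comparisons described in the comment above rely on unsigned wraparound: with ceiling == 0 meaning "top of the address space", ceiling - 1 wraps to the largest unsigned long, so a zero ceiling never constrains the range. A small, runnable userspace illustration:

	#include <stdio.h>

	/* end/ceiling use 0 to mean "the very top"; comparing end - 1 with
	 * ceiling - 1 makes ceiling == 0 wrap to ULONG_MAX.  Illustrative only. */
	static int beyond_ceiling(unsigned long end, unsigned long ceiling)
	{
		return end - 1 > ceiling - 1;
	}

	int main(void)
	{
		/* ceiling == 0: nothing is ever beyond "no ceiling" */
		printf("%d\n", beyond_ceiling(0x100000UL, 0));			/* 0 */
		/* a real ceiling below end does limit the range */
		printf("%d\n", beyond_ceiling(0x200000UL, 0x100000UL));		/* 1 */
		/* end equal to the ceiling is not beyond it */
		printf("%d\n", beyond_ceiling(0x100000UL, 0x100000UL));		/* 0 */
		return 0;
	}
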
361 361
362 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, 362 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
363 unsigned long floor, unsigned long ceiling) 363 unsigned long floor, unsigned long ceiling)
364 { 364 {
365 while (vma) { 365 while (vma) {
366 struct vm_area_struct *next = vma->vm_next; 366 struct vm_area_struct *next = vma->vm_next;
367 unsigned long addr = vma->vm_start; 367 unsigned long addr = vma->vm_start;
368 368
369 /* 369 /*
370 * Hide vma from rmap and truncate_pagecache before freeing 370 * Hide vma from rmap and truncate_pagecache before freeing
371 * pgtables 371 * pgtables
372 */ 372 */
373 unlink_anon_vmas(vma); 373 unlink_anon_vmas(vma);
374 unlink_file_vma(vma); 374 unlink_file_vma(vma);
375 375
376 if (is_vm_hugetlb_page(vma)) { 376 if (is_vm_hugetlb_page(vma)) {
377 hugetlb_free_pgd_range(tlb, addr, vma->vm_end, 377 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
378 floor, next? next->vm_start: ceiling); 378 floor, next? next->vm_start: ceiling);
379 } else { 379 } else {
380 /* 380 /*
381 * Optimization: gather nearby vmas into one call down 381 * Optimization: gather nearby vmas into one call down
382 */ 382 */
383 while (next && next->vm_start <= vma->vm_end + PMD_SIZE 383 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
384 && !is_vm_hugetlb_page(next)) { 384 && !is_vm_hugetlb_page(next)) {
385 vma = next; 385 vma = next;
386 next = vma->vm_next; 386 next = vma->vm_next;
387 unlink_anon_vmas(vma); 387 unlink_anon_vmas(vma);
388 unlink_file_vma(vma); 388 unlink_file_vma(vma);
389 } 389 }
390 free_pgd_range(tlb, addr, vma->vm_end, 390 free_pgd_range(tlb, addr, vma->vm_end,
391 floor, next? next->vm_start: ceiling); 391 floor, next? next->vm_start: ceiling);
392 } 392 }
393 vma = next; 393 vma = next;
394 } 394 }
395 } 395 }
396 396
397 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, 397 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
398 pmd_t *pmd, unsigned long address) 398 pmd_t *pmd, unsigned long address)
399 { 399 {
400 pgtable_t new = pte_alloc_one(mm, address); 400 pgtable_t new = pte_alloc_one(mm, address);
401 int wait_split_huge_page; 401 int wait_split_huge_page;
402 if (!new) 402 if (!new)
403 return -ENOMEM; 403 return -ENOMEM;
404 404
405 /* 405 /*
406 * Ensure all pte setup (eg. pte page lock and page clearing) are 406 * Ensure all pte setup (eg. pte page lock and page clearing) are
407 * visible before the pte is made visible to other CPUs by being 407 * visible before the pte is made visible to other CPUs by being
408 * put into page tables. 408 * put into page tables.
409 * 409 *
410 * The other side of the story is the pointer chasing in the page 410 * The other side of the story is the pointer chasing in the page
411 * table walking code (when walking the page table without locking; 411 * table walking code (when walking the page table without locking;
412 * ie. most of the time). Fortunately, these data accesses consist 412 * ie. most of the time). Fortunately, these data accesses consist
413 * of a chain of data-dependent loads, meaning most CPUs (alpha 413 * of a chain of data-dependent loads, meaning most CPUs (alpha
414 * being the notable exception) will already guarantee loads are 414 * being the notable exception) will already guarantee loads are
415 * seen in-order. See the alpha page table accessors for the 415 * seen in-order. See the alpha page table accessors for the
416 * smp_read_barrier_depends() barriers in page table walking code. 416 * smp_read_barrier_depends() barriers in page table walking code.
417 */ 417 */
418 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 418 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
419 419
420 spin_lock(&mm->page_table_lock); 420 spin_lock(&mm->page_table_lock);
421 wait_split_huge_page = 0; 421 wait_split_huge_page = 0;
422 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 422 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
423 mm->nr_ptes++; 423 mm->nr_ptes++;
424 pmd_populate(mm, pmd, new); 424 pmd_populate(mm, pmd, new);
425 new = NULL; 425 new = NULL;
426 } else if (unlikely(pmd_trans_splitting(*pmd))) 426 } else if (unlikely(pmd_trans_splitting(*pmd)))
427 wait_split_huge_page = 1; 427 wait_split_huge_page = 1;
428 spin_unlock(&mm->page_table_lock); 428 spin_unlock(&mm->page_table_lock);
429 if (new) 429 if (new)
430 pte_free(mm, new); 430 pte_free(mm, new);
431 if (wait_split_huge_page) 431 if (wait_split_huge_page)
432 wait_split_huge_page(vma->anon_vma, pmd); 432 wait_split_huge_page(vma->anon_vma, pmd);
433 return 0; 433 return 0;
434 } 434 }
435 435
436 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) 436 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
437 { 437 {
438 pte_t *new = pte_alloc_one_kernel(&init_mm, address); 438 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
439 if (!new) 439 if (!new)
440 return -ENOMEM; 440 return -ENOMEM;
441 441
442 smp_wmb(); /* See comment in __pte_alloc */ 442 smp_wmb(); /* See comment in __pte_alloc */
443 443
444 spin_lock(&init_mm.page_table_lock); 444 spin_lock(&init_mm.page_table_lock);
445 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ 445 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
446 pmd_populate_kernel(&init_mm, pmd, new); 446 pmd_populate_kernel(&init_mm, pmd, new);
447 new = NULL; 447 new = NULL;
448 } else 448 } else
449 VM_BUG_ON(pmd_trans_splitting(*pmd)); 449 VM_BUG_ON(pmd_trans_splitting(*pmd));
450 spin_unlock(&init_mm.page_table_lock); 450 spin_unlock(&init_mm.page_table_lock);
451 if (new) 451 if (new)
452 pte_free_kernel(&init_mm, new); 452 pte_free_kernel(&init_mm, new);
453 return 0; 453 return 0;
454 } 454 }
455 455
456 static inline void init_rss_vec(int *rss) 456 static inline void init_rss_vec(int *rss)
457 { 457 {
458 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); 458 memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
459 } 459 }
460 460
461 static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) 461 static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
462 { 462 {
463 int i; 463 int i;
464 464
465 if (current->mm == mm) 465 if (current->mm == mm)
466 sync_mm_rss(current, mm); 466 sync_mm_rss(current, mm);
467 for (i = 0; i < NR_MM_COUNTERS; i++) 467 for (i = 0; i < NR_MM_COUNTERS; i++)
468 if (rss[i]) 468 if (rss[i])
469 add_mm_counter(mm, i, rss[i]); 469 add_mm_counter(mm, i, rss[i]);
470 } 470 }
471 471
472 /* 472 /*
473 * This function is called to print an error when a bad pte 473 * This function is called to print an error when a bad pte
474 * is found. For example, we might have a PFN-mapped pte in 474 * is found. For example, we might have a PFN-mapped pte in
475 * a region that doesn't allow it. 475 * a region that doesn't allow it.
476 * 476 *
477 * The calling function must still handle the error. 477 * The calling function must still handle the error.
478 */ 478 */
479 static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, 479 static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
480 pte_t pte, struct page *page) 480 pte_t pte, struct page *page)
481 { 481 {
482 pgd_t *pgd = pgd_offset(vma->vm_mm, addr); 482 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
483 pud_t *pud = pud_offset(pgd, addr); 483 pud_t *pud = pud_offset(pgd, addr);
484 pmd_t *pmd = pmd_offset(pud, addr); 484 pmd_t *pmd = pmd_offset(pud, addr);
485 struct address_space *mapping; 485 struct address_space *mapping;
486 pgoff_t index; 486 pgoff_t index;
487 static unsigned long resume; 487 static unsigned long resume;
488 static unsigned long nr_shown; 488 static unsigned long nr_shown;
489 static unsigned long nr_unshown; 489 static unsigned long nr_unshown;
490 490
491 /* 491 /*
492 * Allow a burst of 60 reports, then keep quiet for that minute; 492 * Allow a burst of 60 reports, then keep quiet for that minute;
493 * or allow a steady drip of one report per second. 493 * or allow a steady drip of one report per second.
494 */ 494 */
495 if (nr_shown == 60) { 495 if (nr_shown == 60) {
496 if (time_before(jiffies, resume)) { 496 if (time_before(jiffies, resume)) {
497 nr_unshown++; 497 nr_unshown++;
498 return; 498 return;
499 } 499 }
500 if (nr_unshown) { 500 if (nr_unshown) {
501 printk(KERN_ALERT 501 printk(KERN_ALERT
502 "BUG: Bad page map: %lu messages suppressed\n", 502 "BUG: Bad page map: %lu messages suppressed\n",
503 nr_unshown); 503 nr_unshown);
504 nr_unshown = 0; 504 nr_unshown = 0;
505 } 505 }
506 nr_shown = 0; 506 nr_shown = 0;
507 } 507 }
508 if (nr_shown++ == 0) 508 if (nr_shown++ == 0)
509 resume = jiffies + 60 * HZ; 509 resume = jiffies + 60 * HZ;
510 510
511 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; 511 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
512 index = linear_page_index(vma, addr); 512 index = linear_page_index(vma, addr);
513 513
514 printk(KERN_ALERT 514 printk(KERN_ALERT
515 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", 515 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
516 current->comm, 516 current->comm,
517 (long long)pte_val(pte), (long long)pmd_val(*pmd)); 517 (long long)pte_val(pte), (long long)pmd_val(*pmd));
518 if (page) 518 if (page)
519 dump_page(page); 519 dump_page(page);
520 printk(KERN_ALERT 520 printk(KERN_ALERT
521 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", 521 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
522 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); 522 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
523 /* 523 /*
524 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y 524 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
525 */ 525 */
526 if (vma->vm_ops) 526 if (vma->vm_ops)
527 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n", 527 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
528 (unsigned long)vma->vm_ops->fault); 528 (unsigned long)vma->vm_ops->fault);
529 if (vma->vm_file && vma->vm_file->f_op) 529 if (vma->vm_file && vma->vm_file->f_op)
530 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n", 530 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
531 (unsigned long)vma->vm_file->f_op->mmap); 531 (unsigned long)vma->vm_file->f_op->mmap);
532 dump_stack(); 532 dump_stack();
533 add_taint(TAINT_BAD_PAGE); 533 add_taint(TAINT_BAD_PAGE);
534 } 534 }
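
The throttling above (a burst of 60 reports, then silence for the rest of that minute, with suppressed messages tallied) is easy to model outside the kernel. The sketch below is a userspace analogy using wall-clock seconds instead of jiffies; it is not the kernel implementation.

	#include <stdio.h>
	#include <time.h>

	#define BURST 60

	/* Returns 1 if a report may be printed now, 0 if it is suppressed. */
	static int ratelimit(void)
	{
		static time_t resume;
		static unsigned long nr_shown, nr_unshown;
		time_t now = time(NULL);

		if (nr_shown == BURST) {
			if (now < resume) {		/* still inside the quiet minute */
				nr_unshown++;
				return 0;
			}
			if (nr_unshown) {
				printf("%lu messages suppressed\n", nr_unshown);
				nr_unshown = 0;
			}
			nr_shown = 0;
		}
		if (nr_shown++ == 0)
			resume = now + 60;		/* quiet period ends a minute later */
		return 1;
	}

	int main(void)
	{
		int i, printed = 0;

		for (i = 0; i < 1000; i++)
			printed += ratelimit();
		printf("printed %d of 1000\n", printed);
		return 0;
	}
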
535 535
536 static inline int is_cow_mapping(unsigned int flags) 536 static inline int is_cow_mapping(unsigned int flags)
537 { 537 {
538 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 538 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
539 } 539 }
540 540
541 #ifndef is_zero_pfn 541 #ifndef is_zero_pfn
542 static inline int is_zero_pfn(unsigned long pfn) 542 static inline int is_zero_pfn(unsigned long pfn)
543 { 543 {
544 return pfn == zero_pfn; 544 return pfn == zero_pfn;
545 } 545 }
546 #endif 546 #endif
547 547
548 #ifndef my_zero_pfn 548 #ifndef my_zero_pfn
549 static inline unsigned long my_zero_pfn(unsigned long addr) 549 static inline unsigned long my_zero_pfn(unsigned long addr)
550 { 550 {
551 return zero_pfn; 551 return zero_pfn;
552 } 552 }
553 #endif 553 #endif
554 554
555 /* 555 /*
556 * vm_normal_page -- This function gets the "struct page" associated with a pte. 556 * vm_normal_page -- This function gets the "struct page" associated with a pte.
557 * 557 *
558 * "Special" mappings do not wish to be associated with a "struct page" (either 558 * "Special" mappings do not wish to be associated with a "struct page" (either
559 * it doesn't exist, or it exists but they don't want to touch it). In this 559 * it doesn't exist, or it exists but they don't want to touch it). In this
560 * case, NULL is returned here. "Normal" mappings do have a struct page. 560 * case, NULL is returned here. "Normal" mappings do have a struct page.
561 * 561 *
562 * There are 2 broad cases. Firstly, an architecture may define a pte_special() 562 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
563 * pte bit, in which case this function is trivial. Secondly, an architecture 563 * pte bit, in which case this function is trivial. Secondly, an architecture
564 * may not have a spare pte bit, which requires a more complicated scheme, 564 * may not have a spare pte bit, which requires a more complicated scheme,
565 * described below. 565 * described below.
566 * 566 *
567 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a 567 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
568 * special mapping (even if there are underlying and valid "struct pages"). 568 * special mapping (even if there are underlying and valid "struct pages").
569 * COWed pages of a VM_PFNMAP are always normal. 569 * COWed pages of a VM_PFNMAP are always normal.
570 * 570 *
571 * The way we recognize COWed pages within VM_PFNMAP mappings is through the 571 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
572 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit 572 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
573 * set, and the vm_pgoff will point to the first PFN mapped: thus every special 573 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
574 * mapping will always honor the rule 574 * mapping will always honor the rule
575 * 575 *
576 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) 576 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
577 * 577 *
578 * And for normal mappings this is false. 578 * And for normal mappings this is false.
579 * 579 *
580 * This restricts such mappings to be a linear translation from virtual address 580 * This restricts such mappings to be a linear translation from virtual address
581 * to pfn. To get around this restriction, we allow arbitrary mappings so long 581 * to pfn. To get around this restriction, we allow arbitrary mappings so long
582 * as the vma is not a COW mapping; in that case, we know that all ptes are 582 * as the vma is not a COW mapping; in that case, we know that all ptes are
583 * special (because none can have been COWed). 583 * special (because none can have been COWed).
584 * 584 *
585 * 585 *
586 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. 586 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
587 * 587 *
588 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct 588 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
589 * page" backing, however the difference is that _all_ pages with a struct 589 * page" backing, however the difference is that _all_ pages with a struct
590 * page (that is, those where pfn_valid is true) are refcounted and considered 590 * page (that is, those where pfn_valid is true) are refcounted and considered
591 * normal pages by the VM. The disadvantage is that pages are refcounted 591 * normal pages by the VM. The disadvantage is that pages are refcounted
592 * (which can be slower and simply not an option for some PFNMAP users). The 592 * (which can be slower and simply not an option for some PFNMAP users). The
593 * advantage is that we don't have to follow the strict linearity rule of 593 * advantage is that we don't have to follow the strict linearity rule of
594 * PFNMAP mappings in order to support COWable mappings. 594 * PFNMAP mappings in order to support COWable mappings.
595 * 595 *
596 */ 596 */
597 #ifdef __HAVE_ARCH_PTE_SPECIAL 597 #ifdef __HAVE_ARCH_PTE_SPECIAL
598 # define HAVE_PTE_SPECIAL 1 598 # define HAVE_PTE_SPECIAL 1
599 #else 599 #else
600 # define HAVE_PTE_SPECIAL 0 600 # define HAVE_PTE_SPECIAL 0
601 #endif 601 #endif
602 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, 602 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
603 pte_t pte) 603 pte_t pte)
604 { 604 {
605 unsigned long pfn = pte_pfn(pte); 605 unsigned long pfn = pte_pfn(pte);
606 606
607 if (HAVE_PTE_SPECIAL) { 607 if (HAVE_PTE_SPECIAL) {
608 if (likely(!pte_special(pte))) 608 if (likely(!pte_special(pte)))
609 goto check_pfn; 609 goto check_pfn;
610 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) 610 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
611 return NULL; 611 return NULL;
612 if (!is_zero_pfn(pfn)) 612 if (!is_zero_pfn(pfn))
613 print_bad_pte(vma, addr, pte, NULL); 613 print_bad_pte(vma, addr, pte, NULL);
614 return NULL; 614 return NULL;
615 } 615 }
616 616
617 /* !HAVE_PTE_SPECIAL case follows: */ 617 /* !HAVE_PTE_SPECIAL case follows: */
618 618
619 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { 619 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
620 if (vma->vm_flags & VM_MIXEDMAP) { 620 if (vma->vm_flags & VM_MIXEDMAP) {
621 if (!pfn_valid(pfn)) 621 if (!pfn_valid(pfn))
622 return NULL; 622 return NULL;
623 goto out; 623 goto out;
624 } else { 624 } else {
625 unsigned long off; 625 unsigned long off;
626 off = (addr - vma->vm_start) >> PAGE_SHIFT; 626 off = (addr - vma->vm_start) >> PAGE_SHIFT;
627 if (pfn == vma->vm_pgoff + off) 627 if (pfn == vma->vm_pgoff + off)
628 return NULL; 628 return NULL;
629 if (!is_cow_mapping(vma->vm_flags)) 629 if (!is_cow_mapping(vma->vm_flags))
630 return NULL; 630 return NULL;
631 } 631 }
632 } 632 }
633 633
634 if (is_zero_pfn(pfn)) 634 if (is_zero_pfn(pfn))
635 return NULL; 635 return NULL;
636 check_pfn: 636 check_pfn:
637 if (unlikely(pfn > highest_memmap_pfn)) { 637 if (unlikely(pfn > highest_memmap_pfn)) {
638 print_bad_pte(vma, addr, pte, NULL); 638 print_bad_pte(vma, addr, pte, NULL);
639 return NULL; 639 return NULL;
640 } 640 }
641 641
642 /* 642 /*
643 * NOTE! We still have PageReserved() pages in the page tables. 643 * NOTE! We still have PageReserved() pages in the page tables.
644 * eg. VDSO mappings can cause them to exist. 644 * eg. VDSO mappings can cause them to exist.
645 */ 645 */
646 out: 646 out:
647 return pfn_to_page(pfn); 647 return pfn_to_page(pfn);
648 } 648 }
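
The linearity rule quoted in the comment before vm_normal_page() -- in a raw VM_PFNMAP vma a pte is special exactly when pfn == vm_pgoff + ((addr - vm_start) >> PAGE_SHIFT) -- is plain arithmetic. The sketch below is an illustrative userspace computation, not kernel code, and assumes 4K pages.

	#include <stdio.h>

	#define PAGE_SHIFT 12	/* assumed 4K pages for the example */

	/* Mirrors the !HAVE_PTE_SPECIAL rule for a raw VM_PFNMAP mapping: a pfn
	 * still sitting at its original linear offset is "special" (no struct
	 * page to touch); anything else must be a COWed, normal page. */
	static int pfnmap_pte_is_special(unsigned long pfn, unsigned long addr,
					 unsigned long vm_start,
					 unsigned long vm_pgoff)
	{
		unsigned long off = (addr - vm_start) >> PAGE_SHIFT;

		return pfn == vm_pgoff + off;
	}

	int main(void)
	{
		unsigned long vm_start = 0x40000000UL, vm_pgoff = 0x80000UL;

		/* untouched mapping: the pfn follows the linear rule -> special */
		printf("%d\n", pfnmap_pte_is_special(0x80003UL, 0x40003000UL,
						     vm_start, vm_pgoff));	/* 1 */
		/* after COW the page came from the allocator -> normal */
		printf("%d\n", pfnmap_pte_is_special(0x12345UL, 0x40003000UL,
						     vm_start, vm_pgoff));	/* 0 */
		return 0;
	}
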
649 649
650 /* 650 /*
651 * copy one vm_area from one task to the other. Assumes the page tables 651 * copy one vm_area from one task to the other. Assumes the page tables
652 * already present in the new task to be cleared in the whole range 652 * already present in the new task to be cleared in the whole range
653 * covered by this vma. 653 * covered by this vma.
654 */ 654 */
655 655
656 static inline unsigned long 656 static inline unsigned long
657 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 657 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
658 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, 658 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
659 unsigned long addr, int *rss) 659 unsigned long addr, int *rss)
660 { 660 {
661 unsigned long vm_flags = vma->vm_flags; 661 unsigned long vm_flags = vma->vm_flags;
662 pte_t pte = *src_pte; 662 pte_t pte = *src_pte;
663 struct page *page; 663 struct page *page;
664 664
665 /* pte contains position in swap or file, so copy. */ 665 /* pte contains position in swap or file, so copy. */
666 if (unlikely(!pte_present(pte))) { 666 if (unlikely(!pte_present(pte))) {
667 if (!pte_file(pte)) { 667 if (!pte_file(pte)) {
668 swp_entry_t entry = pte_to_swp_entry(pte); 668 swp_entry_t entry = pte_to_swp_entry(pte);
669 669
670 if (swap_duplicate(entry) < 0) 670 if (swap_duplicate(entry) < 0)
671 return entry.val; 671 return entry.val;
672 672
673 /* make sure dst_mm is on swapoff's mmlist. */ 673 /* make sure dst_mm is on swapoff's mmlist. */
674 if (unlikely(list_empty(&dst_mm->mmlist))) { 674 if (unlikely(list_empty(&dst_mm->mmlist))) {
675 spin_lock(&mmlist_lock); 675 spin_lock(&mmlist_lock);
676 if (list_empty(&dst_mm->mmlist)) 676 if (list_empty(&dst_mm->mmlist))
677 list_add(&dst_mm->mmlist, 677 list_add(&dst_mm->mmlist,
678 &src_mm->mmlist); 678 &src_mm->mmlist);
679 spin_unlock(&mmlist_lock); 679 spin_unlock(&mmlist_lock);
680 } 680 }
681 if (likely(!non_swap_entry(entry))) 681 if (likely(!non_swap_entry(entry)))
682 rss[MM_SWAPENTS]++; 682 rss[MM_SWAPENTS]++;
683 else if (is_write_migration_entry(entry) && 683 else if (is_write_migration_entry(entry) &&
684 is_cow_mapping(vm_flags)) { 684 is_cow_mapping(vm_flags)) {
685 /* 685 /*
686 * COW mappings require pages in both parent 686 * COW mappings require pages in both parent
687 * and child to be set to read. 687 * and child to be set to read.
688 */ 688 */
689 make_migration_entry_read(&entry); 689 make_migration_entry_read(&entry);
690 pte = swp_entry_to_pte(entry); 690 pte = swp_entry_to_pte(entry);
691 set_pte_at(src_mm, addr, src_pte, pte); 691 set_pte_at(src_mm, addr, src_pte, pte);
692 } 692 }
693 } 693 }
694 goto out_set_pte; 694 goto out_set_pte;
695 } 695 }
696 696
697 /* 697 /*
698 * If it's a COW mapping, write protect it both 698 * If it's a COW mapping, write protect it both
699 * in the parent and the child 699 * in the parent and the child
700 */ 700 */
701 if (is_cow_mapping(vm_flags)) { 701 if (is_cow_mapping(vm_flags)) {
702 ptep_set_wrprotect(src_mm, addr, src_pte); 702 ptep_set_wrprotect(src_mm, addr, src_pte);
703 pte = pte_wrprotect(pte); 703 pte = pte_wrprotect(pte);
704 } 704 }
705 705
706 /* 706 /*
707 * If it's a shared mapping, mark it clean in 707 * If it's a shared mapping, mark it clean in
708 * the child 708 * the child
709 */ 709 */
710 if (vm_flags & VM_SHARED) 710 if (vm_flags & VM_SHARED)
711 pte = pte_mkclean(pte); 711 pte = pte_mkclean(pte);
712 pte = pte_mkold(pte); 712 pte = pte_mkold(pte);
713 713
714 page = vm_normal_page(vma, addr, pte); 714 page = vm_normal_page(vma, addr, pte);
715 if (page) { 715 if (page) {
716 get_page(page); 716 get_page(page);
717 page_dup_rmap(page); 717 page_dup_rmap(page);
718 if (PageAnon(page)) 718 if (PageAnon(page))
719 rss[MM_ANONPAGES]++; 719 rss[MM_ANONPAGES]++;
720 else 720 else
721 rss[MM_FILEPAGES]++; 721 rss[MM_FILEPAGES]++;
722 } 722 }
723 723
724 out_set_pte: 724 out_set_pte:
725 set_pte_at(dst_mm, addr, dst_pte, pte); 725 set_pte_at(dst_mm, addr, dst_pte, pte);
726 return 0; 726 return 0;
727 } 727 }
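
The write-protection applied in copy_one_pte() is what makes copy-on-write visible after fork(): parent and child initially share each private page read-only, and the first write faults in a private copy. A short, hedged userspace demonstration of that behaviour:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/wait.h>

	int main(void)
	{
		/* Private anonymous memory: the kind of mapping that fork()
		 * write-protects in both parent and child. */
		char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		pid_t pid;

		if (p == MAP_FAILED)
			return 1;
		strcpy(p, "parent");

		pid = fork();
		if (pid == 0) {
			strcpy(p, "child");	/* write fault -> private copy */
			_exit(0);
		}
		waitpid(pid, NULL, 0);
		printf("parent still sees: %s\n", p);	/* prints "parent" */
		munmap(p, 4096);
		return 0;
	}
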
728 728
729 int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 729 int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
730 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 730 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
731 unsigned long addr, unsigned long end) 731 unsigned long addr, unsigned long end)
732 { 732 {
733 pte_t *orig_src_pte, *orig_dst_pte; 733 pte_t *orig_src_pte, *orig_dst_pte;
734 pte_t *src_pte, *dst_pte; 734 pte_t *src_pte, *dst_pte;
735 spinlock_t *src_ptl, *dst_ptl; 735 spinlock_t *src_ptl, *dst_ptl;
736 int progress = 0; 736 int progress = 0;
737 int rss[NR_MM_COUNTERS]; 737 int rss[NR_MM_COUNTERS];
738 swp_entry_t entry = (swp_entry_t){0}; 738 swp_entry_t entry = (swp_entry_t){0};
739 739
740 again: 740 again:
741 init_rss_vec(rss); 741 init_rss_vec(rss);
742 742
743 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 743 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
744 if (!dst_pte) 744 if (!dst_pte)
745 return -ENOMEM; 745 return -ENOMEM;
746 src_pte = pte_offset_map(src_pmd, addr); 746 src_pte = pte_offset_map(src_pmd, addr);
747 src_ptl = pte_lockptr(src_mm, src_pmd); 747 src_ptl = pte_lockptr(src_mm, src_pmd);
748 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 748 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
749 orig_src_pte = src_pte; 749 orig_src_pte = src_pte;
750 orig_dst_pte = dst_pte; 750 orig_dst_pte = dst_pte;
751 arch_enter_lazy_mmu_mode(); 751 arch_enter_lazy_mmu_mode();
752 752
753 do { 753 do {
754 /* 754 /*
755 * We are holding two locks at this point - either of them 755 * We are holding two locks at this point - either of them
756 * could generate latencies in another task on another CPU. 756 * could generate latencies in another task on another CPU.
757 */ 757 */
758 if (progress >= 32) { 758 if (progress >= 32) {
759 progress = 0; 759 progress = 0;
760 if (need_resched() || 760 if (need_resched() ||
761 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) 761 spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
762 break; 762 break;
763 } 763 }
764 if (pte_none(*src_pte)) { 764 if (pte_none(*src_pte)) {
765 progress++; 765 progress++;
766 continue; 766 continue;
767 } 767 }
768 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, 768 entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
769 vma, addr, rss); 769 vma, addr, rss);
770 if (entry.val) 770 if (entry.val)
771 break; 771 break;
772 progress += 8; 772 progress += 8;
773 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 773 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
774 774
775 arch_leave_lazy_mmu_mode(); 775 arch_leave_lazy_mmu_mode();
776 spin_unlock(src_ptl); 776 spin_unlock(src_ptl);
777 pte_unmap(orig_src_pte); 777 pte_unmap(orig_src_pte);
778 add_mm_rss_vec(dst_mm, rss); 778 add_mm_rss_vec(dst_mm, rss);
779 pte_unmap_unlock(orig_dst_pte, dst_ptl); 779 pte_unmap_unlock(orig_dst_pte, dst_ptl);
780 cond_resched(); 780 cond_resched();
781 781
782 if (entry.val) { 782 if (entry.val) {
783 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) 783 if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
784 return -ENOMEM; 784 return -ENOMEM;
785 progress = 0; 785 progress = 0;
786 } 786 }
787 if (addr != end) 787 if (addr != end)
788 goto again; 788 goto again;
789 return 0; 789 return 0;
790 } 790 }
791 791
792 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 792 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
793 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, 793 pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
794 unsigned long addr, unsigned long end) 794 unsigned long addr, unsigned long end)
795 { 795 {
796 pmd_t *src_pmd, *dst_pmd; 796 pmd_t *src_pmd, *dst_pmd;
797 unsigned long next; 797 unsigned long next;
798 798
799 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); 799 dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
800 if (!dst_pmd) 800 if (!dst_pmd)
801 return -ENOMEM; 801 return -ENOMEM;
802 src_pmd = pmd_offset(src_pud, addr); 802 src_pmd = pmd_offset(src_pud, addr);
803 do { 803 do {
804 next = pmd_addr_end(addr, end); 804 next = pmd_addr_end(addr, end);
805 if (pmd_trans_huge(*src_pmd)) { 805 if (pmd_trans_huge(*src_pmd)) {
806 int err; 806 int err;
807 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); 807 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
808 err = copy_huge_pmd(dst_mm, src_mm, 808 err = copy_huge_pmd(dst_mm, src_mm,
809 dst_pmd, src_pmd, addr, vma); 809 dst_pmd, src_pmd, addr, vma);
810 if (err == -ENOMEM) 810 if (err == -ENOMEM)
811 return -ENOMEM; 811 return -ENOMEM;
812 if (!err) 812 if (!err)
813 continue; 813 continue;
814 /* fall through */ 814 /* fall through */
815 } 815 }
816 if (pmd_none_or_clear_bad(src_pmd)) 816 if (pmd_none_or_clear_bad(src_pmd))
817 continue; 817 continue;
818 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, 818 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
819 vma, addr, next)) 819 vma, addr, next))
820 return -ENOMEM; 820 return -ENOMEM;
821 } while (dst_pmd++, src_pmd++, addr = next, addr != end); 821 } while (dst_pmd++, src_pmd++, addr = next, addr != end);
822 return 0; 822 return 0;
823 } 823 }
824 824
825 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 825 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
826 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, 826 pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
827 unsigned long addr, unsigned long end) 827 unsigned long addr, unsigned long end)
828 { 828 {
829 pud_t *src_pud, *dst_pud; 829 pud_t *src_pud, *dst_pud;
830 unsigned long next; 830 unsigned long next;
831 831
832 dst_pud = pud_alloc(dst_mm, dst_pgd, addr); 832 dst_pud = pud_alloc(dst_mm, dst_pgd, addr);
833 if (!dst_pud) 833 if (!dst_pud)
834 return -ENOMEM; 834 return -ENOMEM;
835 src_pud = pud_offset(src_pgd, addr); 835 src_pud = pud_offset(src_pgd, addr);
836 do { 836 do {
837 next = pud_addr_end(addr, end); 837 next = pud_addr_end(addr, end);
838 if (pud_none_or_clear_bad(src_pud)) 838 if (pud_none_or_clear_bad(src_pud))
839 continue; 839 continue;
840 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, 840 if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
841 vma, addr, next)) 841 vma, addr, next))
842 return -ENOMEM; 842 return -ENOMEM;
843 } while (dst_pud++, src_pud++, addr = next, addr != end); 843 } while (dst_pud++, src_pud++, addr = next, addr != end);
844 return 0; 844 return 0;
845 } 845 }
846 846
847 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 847 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
848 struct vm_area_struct *vma) 848 struct vm_area_struct *vma)
849 { 849 {
850 pgd_t *src_pgd, *dst_pgd; 850 pgd_t *src_pgd, *dst_pgd;
851 unsigned long next; 851 unsigned long next;
852 unsigned long addr = vma->vm_start; 852 unsigned long addr = vma->vm_start;
853 unsigned long end = vma->vm_end; 853 unsigned long end = vma->vm_end;
854 int ret; 854 int ret;
855 855
856 /* 856 /*
857 * Don't copy ptes where a page fault will fill them correctly. 857 * Don't copy ptes where a page fault will fill them correctly.
858 * Fork becomes much lighter when there are big shared or private 858 * Fork becomes much lighter when there are big shared or private
859 * readonly mappings. The tradeoff is that copy_page_range is more 859 * readonly mappings. The tradeoff is that copy_page_range is more
860 * efficient than faulting. 860 * efficient than faulting.
861 */ 861 */
862 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { 862 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
863 if (!vma->anon_vma) 863 if (!vma->anon_vma)
864 return 0; 864 return 0;
865 } 865 }
866 866
867 if (is_vm_hugetlb_page(vma)) 867 if (is_vm_hugetlb_page(vma))
868 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 868 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
869 869
870 if (unlikely(is_pfn_mapping(vma))) { 870 if (unlikely(is_pfn_mapping(vma))) {
871 /* 871 /*
872 * We do not free on error cases below as remove_vma 872 * We do not free on error cases below as remove_vma
873 * gets called on error from higher level routine 873 * gets called on error from higher level routine
874 */ 874 */
875 ret = track_pfn_vma_copy(vma); 875 ret = track_pfn_vma_copy(vma);
876 if (ret) 876 if (ret)
877 return ret; 877 return ret;
878 } 878 }
879 879
880 /* 880 /*
881 * We need to invalidate the secondary MMU mappings only when 881 * We need to invalidate the secondary MMU mappings only when
882 * there could be a permission downgrade on the ptes of the 882 * there could be a permission downgrade on the ptes of the
883 * parent mm. And a permission downgrade will only happen if 883 * parent mm. And a permission downgrade will only happen if
884 * is_cow_mapping() returns true. 884 * is_cow_mapping() returns true.
885 */ 885 */
886 if (is_cow_mapping(vma->vm_flags)) 886 if (is_cow_mapping(vma->vm_flags))
887 mmu_notifier_invalidate_range_start(src_mm, addr, end); 887 mmu_notifier_invalidate_range_start(src_mm, addr, end);
888 888
889 ret = 0; 889 ret = 0;
890 dst_pgd = pgd_offset(dst_mm, addr); 890 dst_pgd = pgd_offset(dst_mm, addr);
891 src_pgd = pgd_offset(src_mm, addr); 891 src_pgd = pgd_offset(src_mm, addr);
892 do { 892 do {
893 next = pgd_addr_end(addr, end); 893 next = pgd_addr_end(addr, end);
894 if (pgd_none_or_clear_bad(src_pgd)) 894 if (pgd_none_or_clear_bad(src_pgd))
895 continue; 895 continue;
896 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, 896 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
897 vma, addr, next))) { 897 vma, addr, next))) {
898 ret = -ENOMEM; 898 ret = -ENOMEM;
899 break; 899 break;
900 } 900 }
901 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 901 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
902 902
903 if (is_cow_mapping(vma->vm_flags)) 903 if (is_cow_mapping(vma->vm_flags))
904 mmu_notifier_invalidate_range_end(src_mm, 904 mmu_notifier_invalidate_range_end(src_mm,
905 vma->vm_start, end); 905 vma->vm_start, end);
906 return ret; 906 return ret;
907 } 907 }
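/*
 * Illustrative sketch, not in-tree code: how a fork-style caller might
 * drive copy_page_range() over every VMA of the parent mm.  The real
 * caller is dup_mmap() in kernel/fork.c, which also duplicates the VMA
 * list itself; the helper name below is hypothetical.
 */
static int copy_all_vmas_sketch(struct mm_struct *dst_mm,
				struct mm_struct *src_mm)
{
	struct vm_area_struct *vma;
	int err;

	for (vma = src_mm->mmap; vma; vma = vma->vm_next) {
		/*
		 * copy_page_range() itself returns early for mappings
		 * that a later page fault can refill, so it is safe to
		 * call it unconditionally here.
		 */
		err = copy_page_range(dst_mm, src_mm, vma);
		if (err)
			return err;	/* typically -ENOMEM */
	}
	return 0;
}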
908 908
909 static unsigned long zap_pte_range(struct mmu_gather *tlb, 909 static unsigned long zap_pte_range(struct mmu_gather *tlb,
910 struct vm_area_struct *vma, pmd_t *pmd, 910 struct vm_area_struct *vma, pmd_t *pmd,
911 unsigned long addr, unsigned long end, 911 unsigned long addr, unsigned long end,
912 long *zap_work, struct zap_details *details) 912 long *zap_work, struct zap_details *details)
913 { 913 {
914 struct mm_struct *mm = tlb->mm; 914 struct mm_struct *mm = tlb->mm;
915 pte_t *pte; 915 pte_t *pte;
916 spinlock_t *ptl; 916 spinlock_t *ptl;
917 int rss[NR_MM_COUNTERS]; 917 int rss[NR_MM_COUNTERS];
918 918
919 init_rss_vec(rss); 919 init_rss_vec(rss);
920 920
921 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 921 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
922 arch_enter_lazy_mmu_mode(); 922 arch_enter_lazy_mmu_mode();
923 do { 923 do {
924 pte_t ptent = *pte; 924 pte_t ptent = *pte;
925 if (pte_none(ptent)) { 925 if (pte_none(ptent)) {
926 (*zap_work)--; 926 (*zap_work)--;
927 continue; 927 continue;
928 } 928 }
929 929
930 (*zap_work) -= PAGE_SIZE; 930 (*zap_work) -= PAGE_SIZE;
931 931
932 if (pte_present(ptent)) { 932 if (pte_present(ptent)) {
933 struct page *page; 933 struct page *page;
934 934
935 page = vm_normal_page(vma, addr, ptent); 935 page = vm_normal_page(vma, addr, ptent);
936 if (unlikely(details) && page) { 936 if (unlikely(details) && page) {
937 /* 937 /*
938 * unmap_shared_mapping_pages() wants to 938 * unmap_shared_mapping_pages() wants to
939 * invalidate cache without truncating: 939 * invalidate cache without truncating:
940 * unmap shared but keep private pages. 940 * unmap shared but keep private pages.
941 */ 941 */
942 if (details->check_mapping && 942 if (details->check_mapping &&
943 details->check_mapping != page->mapping) 943 details->check_mapping != page->mapping)
944 continue; 944 continue;
945 /* 945 /*
946 * Each page->index must be checked when 946 * Each page->index must be checked when
947 * invalidating or truncating nonlinear. 947 * invalidating or truncating nonlinear.
948 */ 948 */
949 if (details->nonlinear_vma && 949 if (details->nonlinear_vma &&
950 (page->index < details->first_index || 950 (page->index < details->first_index ||
951 page->index > details->last_index)) 951 page->index > details->last_index))
952 continue; 952 continue;
953 } 953 }
954 ptent = ptep_get_and_clear_full(mm, addr, pte, 954 ptent = ptep_get_and_clear_full(mm, addr, pte,
955 tlb->fullmm); 955 tlb->fullmm);
956 tlb_remove_tlb_entry(tlb, pte, addr); 956 tlb_remove_tlb_entry(tlb, pte, addr);
957 if (unlikely(!page)) 957 if (unlikely(!page))
958 continue; 958 continue;
959 if (unlikely(details) && details->nonlinear_vma 959 if (unlikely(details) && details->nonlinear_vma
960 && linear_page_index(details->nonlinear_vma, 960 && linear_page_index(details->nonlinear_vma,
961 addr) != page->index) 961 addr) != page->index)
962 set_pte_at(mm, addr, pte, 962 set_pte_at(mm, addr, pte,
963 pgoff_to_pte(page->index)); 963 pgoff_to_pte(page->index));
964 if (PageAnon(page)) 964 if (PageAnon(page))
965 rss[MM_ANONPAGES]--; 965 rss[MM_ANONPAGES]--;
966 else { 966 else {
967 if (pte_dirty(ptent)) 967 if (pte_dirty(ptent))
968 set_page_dirty(page); 968 set_page_dirty(page);
969 if (pte_young(ptent) && 969 if (pte_young(ptent) &&
970 likely(!VM_SequentialReadHint(vma))) 970 likely(!VM_SequentialReadHint(vma)))
971 mark_page_accessed(page); 971 mark_page_accessed(page);
972 rss[MM_FILEPAGES]--; 972 rss[MM_FILEPAGES]--;
973 } 973 }
974 page_remove_rmap(page); 974 page_remove_rmap(page);
975 if (unlikely(page_mapcount(page) < 0)) 975 if (unlikely(page_mapcount(page) < 0))
976 print_bad_pte(vma, addr, ptent, page); 976 print_bad_pte(vma, addr, ptent, page);
977 tlb_remove_page(tlb, page); 977 tlb_remove_page(tlb, page);
978 continue; 978 continue;
979 } 979 }
980 /* 980 /*
981 * If details->check_mapping, we leave swap entries; 981 * If details->check_mapping, we leave swap entries;
982 * if details->nonlinear_vma, we leave file entries. 982 * if details->nonlinear_vma, we leave file entries.
983 */ 983 */
984 if (unlikely(details)) 984 if (unlikely(details))
985 continue; 985 continue;
986 if (pte_file(ptent)) { 986 if (pte_file(ptent)) {
987 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) 987 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
988 print_bad_pte(vma, addr, ptent, NULL); 988 print_bad_pte(vma, addr, ptent, NULL);
989 } else { 989 } else {
990 swp_entry_t entry = pte_to_swp_entry(ptent); 990 swp_entry_t entry = pte_to_swp_entry(ptent);
991 991
992 if (!non_swap_entry(entry)) 992 if (!non_swap_entry(entry))
993 rss[MM_SWAPENTS]--; 993 rss[MM_SWAPENTS]--;
994 if (unlikely(!free_swap_and_cache(entry))) 994 if (unlikely(!free_swap_and_cache(entry)))
995 print_bad_pte(vma, addr, ptent, NULL); 995 print_bad_pte(vma, addr, ptent, NULL);
996 } 996 }
997 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 997 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
998 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 998 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
999 999
1000 add_mm_rss_vec(mm, rss); 1000 add_mm_rss_vec(mm, rss);
1001 arch_leave_lazy_mmu_mode(); 1001 arch_leave_lazy_mmu_mode();
1002 pte_unmap_unlock(pte - 1, ptl); 1002 pte_unmap_unlock(pte - 1, ptl);
1003 1003
1004 return addr; 1004 return addr;
1005 } 1005 }
1006 1006
1007 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, 1007 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1008 struct vm_area_struct *vma, pud_t *pud, 1008 struct vm_area_struct *vma, pud_t *pud,
1009 unsigned long addr, unsigned long end, 1009 unsigned long addr, unsigned long end,
1010 long *zap_work, struct zap_details *details) 1010 long *zap_work, struct zap_details *details)
1011 { 1011 {
1012 pmd_t *pmd; 1012 pmd_t *pmd;
1013 unsigned long next; 1013 unsigned long next;
1014 1014
1015 pmd = pmd_offset(pud, addr); 1015 pmd = pmd_offset(pud, addr);
1016 do { 1016 do {
1017 next = pmd_addr_end(addr, end); 1017 next = pmd_addr_end(addr, end);
1018 if (pmd_trans_huge(*pmd)) { 1018 if (pmd_trans_huge(*pmd)) {
1019 if (next-addr != HPAGE_PMD_SIZE) { 1019 if (next-addr != HPAGE_PMD_SIZE) {
1020 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1020 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1021 split_huge_page_pmd(vma->vm_mm, pmd); 1021 split_huge_page_pmd(vma->vm_mm, pmd);
1022 } else if (zap_huge_pmd(tlb, vma, pmd)) { 1022 } else if (zap_huge_pmd(tlb, vma, pmd)) {
1023 (*zap_work)--; 1023 (*zap_work)--;
1024 continue; 1024 continue;
1025 } 1025 }
1026 /* fall through */ 1026 /* fall through */
1027 } 1027 }
1028 if (pmd_none_or_clear_bad(pmd)) { 1028 if (pmd_none_or_clear_bad(pmd)) {
1029 (*zap_work)--; 1029 (*zap_work)--;
1030 continue; 1030 continue;
1031 } 1031 }
1032 next = zap_pte_range(tlb, vma, pmd, addr, next, 1032 next = zap_pte_range(tlb, vma, pmd, addr, next,
1033 zap_work, details); 1033 zap_work, details);
1034 } while (pmd++, addr = next, (addr != end && *zap_work > 0)); 1034 } while (pmd++, addr = next, (addr != end && *zap_work > 0));
1035 1035
1036 return addr; 1036 return addr;
1037 } 1037 }
1038 1038
1039 static inline unsigned long zap_pud_range(struct mmu_gather *tlb, 1039 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1040 struct vm_area_struct *vma, pgd_t *pgd, 1040 struct vm_area_struct *vma, pgd_t *pgd,
1041 unsigned long addr, unsigned long end, 1041 unsigned long addr, unsigned long end,
1042 long *zap_work, struct zap_details *details) 1042 long *zap_work, struct zap_details *details)
1043 { 1043 {
1044 pud_t *pud; 1044 pud_t *pud;
1045 unsigned long next; 1045 unsigned long next;
1046 1046
1047 pud = pud_offset(pgd, addr); 1047 pud = pud_offset(pgd, addr);
1048 do { 1048 do {
1049 next = pud_addr_end(addr, end); 1049 next = pud_addr_end(addr, end);
1050 if (pud_none_or_clear_bad(pud)) { 1050 if (pud_none_or_clear_bad(pud)) {
1051 (*zap_work)--; 1051 (*zap_work)--;
1052 continue; 1052 continue;
1053 } 1053 }
1054 next = zap_pmd_range(tlb, vma, pud, addr, next, 1054 next = zap_pmd_range(tlb, vma, pud, addr, next,
1055 zap_work, details); 1055 zap_work, details);
1056 } while (pud++, addr = next, (addr != end && *zap_work > 0)); 1056 } while (pud++, addr = next, (addr != end && *zap_work > 0));
1057 1057
1058 return addr; 1058 return addr;
1059 } 1059 }
1060 1060
1061 static unsigned long unmap_page_range(struct mmu_gather *tlb, 1061 static unsigned long unmap_page_range(struct mmu_gather *tlb,
1062 struct vm_area_struct *vma, 1062 struct vm_area_struct *vma,
1063 unsigned long addr, unsigned long end, 1063 unsigned long addr, unsigned long end,
1064 long *zap_work, struct zap_details *details) 1064 long *zap_work, struct zap_details *details)
1065 { 1065 {
1066 pgd_t *pgd; 1066 pgd_t *pgd;
1067 unsigned long next; 1067 unsigned long next;
1068 1068
1069 if (details && !details->check_mapping && !details->nonlinear_vma) 1069 if (details && !details->check_mapping && !details->nonlinear_vma)
1070 details = NULL; 1070 details = NULL;
1071 1071
1072 BUG_ON(addr >= end); 1072 BUG_ON(addr >= end);
1073 mem_cgroup_uncharge_start(); 1073 mem_cgroup_uncharge_start();
1074 tlb_start_vma(tlb, vma); 1074 tlb_start_vma(tlb, vma);
1075 pgd = pgd_offset(vma->vm_mm, addr); 1075 pgd = pgd_offset(vma->vm_mm, addr);
1076 do { 1076 do {
1077 next = pgd_addr_end(addr, end); 1077 next = pgd_addr_end(addr, end);
1078 if (pgd_none_or_clear_bad(pgd)) { 1078 if (pgd_none_or_clear_bad(pgd)) {
1079 (*zap_work)--; 1079 (*zap_work)--;
1080 continue; 1080 continue;
1081 } 1081 }
1082 next = zap_pud_range(tlb, vma, pgd, addr, next, 1082 next = zap_pud_range(tlb, vma, pgd, addr, next,
1083 zap_work, details); 1083 zap_work, details);
1084 } while (pgd++, addr = next, (addr != end && *zap_work > 0)); 1084 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
1085 tlb_end_vma(tlb, vma); 1085 tlb_end_vma(tlb, vma);
1086 mem_cgroup_uncharge_end(); 1086 mem_cgroup_uncharge_end();
1087 1087
1088 return addr; 1088 return addr;
1089 } 1089 }
1090 1090
1091 #ifdef CONFIG_PREEMPT 1091 #ifdef CONFIG_PREEMPT
1092 # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) 1092 # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
1093 #else 1093 #else
1094 /* No preempt: go for improved straight-line efficiency */ 1094 /* No preempt: go for improved straight-line efficiency */
1095 # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) 1095 # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
1096 #endif 1096 #endif
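/*
 * Illustrative note: with a typical 4KB PAGE_SIZE the batch size above
 * works out to 8 * 4KB = 32KB per mmu_gather under CONFIG_PREEMPT,
 * versus 1024 * 4KB = 4MB on non-preemptible kernels, before
 * unmap_vmas() finishes the gather and reschedules if needed.
 */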
1097 1097
1098 /** 1098 /**
1099 * unmap_vmas - unmap a range of memory covered by a list of vma's 1099 * unmap_vmas - unmap a range of memory covered by a list of vma's
1100 * @tlbp: address of the caller's struct mmu_gather 1100 * @tlbp: address of the caller's struct mmu_gather
1101 * @vma: the starting vma 1101 * @vma: the starting vma
1102 * @start_addr: virtual address at which to start unmapping 1102 * @start_addr: virtual address at which to start unmapping
1103 * @end_addr: virtual address at which to end unmapping 1103 * @end_addr: virtual address at which to end unmapping
1104 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here 1104 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
1105 * @details: details of nonlinear truncation or shared cache invalidation 1105 * @details: details of nonlinear truncation or shared cache invalidation
1106 * 1106 *
1107 * Returns the end address of the unmapping (restart addr if interrupted). 1107 * Returns the end address of the unmapping (restart addr if interrupted).
1108 * 1108 *
1109 * Unmap all pages in the vma list. 1109 * Unmap all pages in the vma list.
1110 * 1110 *
1111 * We aim to not hold locks for too long (for scheduling latency reasons). 1111 * We aim to not hold locks for too long (for scheduling latency reasons).
1112 * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to 1112 * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
1113 * return the ending mmu_gather to the caller. 1113 * return the ending mmu_gather to the caller.
1114 * 1114 *
1115 * Only addresses between `start' and `end' will be unmapped. 1115 * Only addresses between `start' and `end' will be unmapped.
1116 * 1116 *
1117 * The VMA list must be sorted in ascending virtual address order. 1117 * The VMA list must be sorted in ascending virtual address order.
1118 * 1118 *
1119 * unmap_vmas() assumes that the caller will flush the whole unmapped address 1119 * unmap_vmas() assumes that the caller will flush the whole unmapped address
1120 * range after unmap_vmas() returns. So the only responsibility here is to 1120 * range after unmap_vmas() returns. So the only responsibility here is to
1121 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 1121 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1122 * drops the lock and schedules. 1122 * drops the lock and schedules.
1123 */ 1123 */
1124 unsigned long unmap_vmas(struct mmu_gather **tlbp, 1124 unsigned long unmap_vmas(struct mmu_gather **tlbp,
1125 struct vm_area_struct *vma, unsigned long start_addr, 1125 struct vm_area_struct *vma, unsigned long start_addr,
1126 unsigned long end_addr, unsigned long *nr_accounted, 1126 unsigned long end_addr, unsigned long *nr_accounted,
1127 struct zap_details *details) 1127 struct zap_details *details)
1128 { 1128 {
1129 long zap_work = ZAP_BLOCK_SIZE; 1129 long zap_work = ZAP_BLOCK_SIZE;
1130 unsigned long tlb_start = 0; /* For tlb_finish_mmu */ 1130 unsigned long tlb_start = 0; /* For tlb_finish_mmu */
1131 int tlb_start_valid = 0; 1131 int tlb_start_valid = 0;
1132 unsigned long start = start_addr; 1132 unsigned long start = start_addr;
1133 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; 1133 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
1134 int fullmm = (*tlbp)->fullmm; 1134 int fullmm = (*tlbp)->fullmm;
1135 struct mm_struct *mm = vma->vm_mm; 1135 struct mm_struct *mm = vma->vm_mm;
1136 1136
1137 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1137 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1138 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 1138 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
1139 unsigned long end; 1139 unsigned long end;
1140 1140
1141 start = max(vma->vm_start, start_addr); 1141 start = max(vma->vm_start, start_addr);
1142 if (start >= vma->vm_end) 1142 if (start >= vma->vm_end)
1143 continue; 1143 continue;
1144 end = min(vma->vm_end, end_addr); 1144 end = min(vma->vm_end, end_addr);
1145 if (end <= vma->vm_start) 1145 if (end <= vma->vm_start)
1146 continue; 1146 continue;
1147 1147
1148 if (vma->vm_flags & VM_ACCOUNT) 1148 if (vma->vm_flags & VM_ACCOUNT)
1149 *nr_accounted += (end - start) >> PAGE_SHIFT; 1149 *nr_accounted += (end - start) >> PAGE_SHIFT;
1150 1150
1151 if (unlikely(is_pfn_mapping(vma))) 1151 if (unlikely(is_pfn_mapping(vma)))
1152 untrack_pfn_vma(vma, 0, 0); 1152 untrack_pfn_vma(vma, 0, 0);
1153 1153
1154 while (start != end) { 1154 while (start != end) {
1155 if (!tlb_start_valid) { 1155 if (!tlb_start_valid) {
1156 tlb_start = start; 1156 tlb_start = start;
1157 tlb_start_valid = 1; 1157 tlb_start_valid = 1;
1158 } 1158 }
1159 1159
1160 if (unlikely(is_vm_hugetlb_page(vma))) { 1160 if (unlikely(is_vm_hugetlb_page(vma))) {
1161 /* 1161 /*
1162 * It is undesirable to test vma->vm_file as it 1162 * It is undesirable to test vma->vm_file as it
1163 * should be non-null for valid hugetlb area. 1163 * should be non-null for valid hugetlb area.
1164 * However, vm_file will be NULL in the error 1164 * However, vm_file will be NULL in the error
1165 * cleanup path of do_mmap_pgoff. When 1165 * cleanup path of do_mmap_pgoff. When
1166 * hugetlbfs ->mmap method fails, 1166 * hugetlbfs ->mmap method fails,
1167 * do_mmap_pgoff() nullifies vma->vm_file 1167 * do_mmap_pgoff() nullifies vma->vm_file
1168 * before calling this function to clean up. 1168 * before calling this function to clean up.
1169 * Since no pte has actually been setup, it is 1169 * Since no pte has actually been setup, it is
1170 * safe to do nothing in this case. 1170 * safe to do nothing in this case.
1171 */ 1171 */
1172 if (vma->vm_file) { 1172 if (vma->vm_file) {
1173 unmap_hugepage_range(vma, start, end, NULL); 1173 unmap_hugepage_range(vma, start, end, NULL);
1174 zap_work -= (end - start) / 1174 zap_work -= (end - start) /
1175 pages_per_huge_page(hstate_vma(vma)); 1175 pages_per_huge_page(hstate_vma(vma));
1176 } 1176 }
1177 1177
1178 start = end; 1178 start = end;
1179 } else 1179 } else
1180 start = unmap_page_range(*tlbp, vma, 1180 start = unmap_page_range(*tlbp, vma,
1181 start, end, &zap_work, details); 1181 start, end, &zap_work, details);
1182 1182
1183 if (zap_work > 0) { 1183 if (zap_work > 0) {
1184 BUG_ON(start != end); 1184 BUG_ON(start != end);
1185 break; 1185 break;
1186 } 1186 }
1187 1187
1188 tlb_finish_mmu(*tlbp, tlb_start, start); 1188 tlb_finish_mmu(*tlbp, tlb_start, start);
1189 1189
1190 if (need_resched() || 1190 if (need_resched() ||
1191 (i_mmap_lock && spin_needbreak(i_mmap_lock))) { 1191 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
1192 if (i_mmap_lock) { 1192 if (i_mmap_lock) {
1193 *tlbp = NULL; 1193 *tlbp = NULL;
1194 goto out; 1194 goto out;
1195 } 1195 }
1196 cond_resched(); 1196 cond_resched();
1197 } 1197 }
1198 1198
1199 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); 1199 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
1200 tlb_start_valid = 0; 1200 tlb_start_valid = 0;
1201 zap_work = ZAP_BLOCK_SIZE; 1201 zap_work = ZAP_BLOCK_SIZE;
1202 } 1202 }
1203 } 1203 }
1204 out: 1204 out:
1205 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1205 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1206 return start; /* which is now the end (or restart) address */ 1206 return start; /* which is now the end (or restart) address */
1207 } 1207 }
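/*
 * Illustrative sketch, not in-tree code: the whole-address-space form of
 * the gather/unmap/finish pattern, roughly what exit_mmap() does when an
 * mm is torn down (accounting and page-table freeing omitted).  The
 * helper name is hypothetical; zap_page_range() below shows the
 * single-range form.
 */
static void teardown_mm_sketch(struct mm_struct *mm)
{
	struct mmu_gather *tlb;
	unsigned long nr_accounted = 0;
	unsigned long end;

	lru_add_drain();
	tlb = tlb_gather_mmu(mm, 1);		/* fullmm teardown */
	/* start 0, end -1: cover every VMA on the mm->mmap list */
	end = unmap_vmas(&tlb, mm->mmap, 0, -1, &nr_accounted, NULL);
	tlb_finish_mmu(tlb, 0, end);
}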
1208 1208
1209 /** 1209 /**
1210 * zap_page_range - remove user pages in a given range 1210 * zap_page_range - remove user pages in a given range
1211 * @vma: vm_area_struct holding the applicable pages 1211 * @vma: vm_area_struct holding the applicable pages
1212 * @address: starting address of pages to zap 1212 * @address: starting address of pages to zap
1213 * @size: number of bytes to zap 1213 * @size: number of bytes to zap
1214 * @details: details of nonlinear truncation or shared cache invalidation 1214 * @details: details of nonlinear truncation or shared cache invalidation
1215 */ 1215 */
1216 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, 1216 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1217 unsigned long size, struct zap_details *details) 1217 unsigned long size, struct zap_details *details)
1218 { 1218 {
1219 struct mm_struct *mm = vma->vm_mm; 1219 struct mm_struct *mm = vma->vm_mm;
1220 struct mmu_gather *tlb; 1220 struct mmu_gather *tlb;
1221 unsigned long end = address + size; 1221 unsigned long end = address + size;
1222 unsigned long nr_accounted = 0; 1222 unsigned long nr_accounted = 0;
1223 1223
1224 lru_add_drain(); 1224 lru_add_drain();
1225 tlb = tlb_gather_mmu(mm, 0); 1225 tlb = tlb_gather_mmu(mm, 0);
1226 update_hiwater_rss(mm); 1226 update_hiwater_rss(mm);
1227 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); 1227 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1228 if (tlb) 1228 if (tlb)
1229 tlb_finish_mmu(tlb, address, end); 1229 tlb_finish_mmu(tlb, address, end);
1230 return end; 1230 return end;
1231 } 1231 }
1232 1232
1233 /** 1233 /**
1234 * zap_vma_ptes - remove ptes mapping the vma 1234 * zap_vma_ptes - remove ptes mapping the vma
1235 * @vma: vm_area_struct holding ptes to be zapped 1235 * @vma: vm_area_struct holding ptes to be zapped
1236 * @address: starting address of pages to zap 1236 * @address: starting address of pages to zap
1237 * @size: number of bytes to zap 1237 * @size: number of bytes to zap
1238 * 1238 *
1239 * This function only unmaps ptes assigned to VM_PFNMAP vmas. 1239 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
1240 * 1240 *
1241 * The entire address range must be fully contained within the vma. 1241 * The entire address range must be fully contained within the vma.
1242 * 1242 *
1243 * Returns 0 if successful. 1243 * Returns 0 if successful.
1244 */ 1244 */
1245 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, 1245 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1246 unsigned long size) 1246 unsigned long size)
1247 { 1247 {
1248 if (address < vma->vm_start || address + size > vma->vm_end || 1248 if (address < vma->vm_start || address + size > vma->vm_end ||
1249 !(vma->vm_flags & VM_PFNMAP)) 1249 !(vma->vm_flags & VM_PFNMAP))
1250 return -1; 1250 return -1;
1251 zap_page_range(vma, address, size, NULL); 1251 zap_page_range(vma, address, size, NULL);
1252 return 0; 1252 return 0;
1253 } 1253 }
1254 EXPORT_SYMBOL_GPL(zap_vma_ptes); 1254 EXPORT_SYMBOL_GPL(zap_vma_ptes);
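/*
 * Illustrative sketch, not in-tree code: a driver that set up a
 * VM_PFNMAP mapping revoking every PTE it installed in the VMA.  The
 * helper name is hypothetical; the range must lie entirely inside the
 * VMA, as zap_vma_ptes() requires.
 */
static int revoke_pfnmap_sketch(struct vm_area_struct *vma)
{
	return zap_vma_ptes(vma, vma->vm_start,
			    vma->vm_end - vma->vm_start);
}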
1255 1255
1256 /** 1256 /**
1257 * follow_page - look up a page descriptor from a user-virtual address 1257 * follow_page - look up a page descriptor from a user-virtual address
1258 * @vma: vm_area_struct mapping @address 1258 * @vma: vm_area_struct mapping @address
1259 * @address: virtual address to look up 1259 * @address: virtual address to look up
1260 * @flags: flags modifying lookup behaviour 1260 * @flags: flags modifying lookup behaviour
1261 * 1261 *
1262 * @flags can have FOLL_ flags set, defined in <linux/mm.h> 1262 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
1263 * 1263 *
1264 * Returns the mapped (struct page *), %NULL if no mapping exists, or 1264 * Returns the mapped (struct page *), %NULL if no mapping exists, or
1265 * an error pointer if there is a mapping to something not represented 1265 * an error pointer if there is a mapping to something not represented
1266 * by a page descriptor (see also vm_normal_page()). 1266 * by a page descriptor (see also vm_normal_page()).
1267 */ 1267 */
1268 struct page *follow_page(struct vm_area_struct *vma, unsigned long address, 1268 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1269 unsigned int flags) 1269 unsigned int flags)
1270 { 1270 {
1271 pgd_t *pgd; 1271 pgd_t *pgd;
1272 pud_t *pud; 1272 pud_t *pud;
1273 pmd_t *pmd; 1273 pmd_t *pmd;
1274 pte_t *ptep, pte; 1274 pte_t *ptep, pte;
1275 spinlock_t *ptl; 1275 spinlock_t *ptl;
1276 struct page *page; 1276 struct page *page;
1277 struct mm_struct *mm = vma->vm_mm; 1277 struct mm_struct *mm = vma->vm_mm;
1278 1278
1279 page = follow_huge_addr(mm, address, flags & FOLL_WRITE); 1279 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
1280 if (!IS_ERR(page)) { 1280 if (!IS_ERR(page)) {
1281 BUG_ON(flags & FOLL_GET); 1281 BUG_ON(flags & FOLL_GET);
1282 goto out; 1282 goto out;
1283 } 1283 }
1284 1284
1285 page = NULL; 1285 page = NULL;
1286 pgd = pgd_offset(mm, address); 1286 pgd = pgd_offset(mm, address);
1287 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 1287 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
1288 goto no_page_table; 1288 goto no_page_table;
1289 1289
1290 pud = pud_offset(pgd, address); 1290 pud = pud_offset(pgd, address);
1291 if (pud_none(*pud)) 1291 if (pud_none(*pud))
1292 goto no_page_table; 1292 goto no_page_table;
1293 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { 1293 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1294 BUG_ON(flags & FOLL_GET); 1294 BUG_ON(flags & FOLL_GET);
1295 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 1295 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1296 goto out; 1296 goto out;
1297 } 1297 }
1298 if (unlikely(pud_bad(*pud))) 1298 if (unlikely(pud_bad(*pud)))
1299 goto no_page_table; 1299 goto no_page_table;
1300 1300
1301 pmd = pmd_offset(pud, address); 1301 pmd = pmd_offset(pud, address);
1302 if (pmd_none(*pmd)) 1302 if (pmd_none(*pmd))
1303 goto no_page_table; 1303 goto no_page_table;
1304 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { 1304 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1305 BUG_ON(flags & FOLL_GET); 1305 BUG_ON(flags & FOLL_GET);
1306 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1306 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1307 goto out; 1307 goto out;
1308 } 1308 }
1309 if (pmd_trans_huge(*pmd)) { 1309 if (pmd_trans_huge(*pmd)) {
1310 if (flags & FOLL_SPLIT) { 1310 if (flags & FOLL_SPLIT) {
1311 split_huge_page_pmd(mm, pmd); 1311 split_huge_page_pmd(mm, pmd);
1312 goto split_fallthrough; 1312 goto split_fallthrough;
1313 } 1313 }
1314 spin_lock(&mm->page_table_lock); 1314 spin_lock(&mm->page_table_lock);
1315 if (likely(pmd_trans_huge(*pmd))) { 1315 if (likely(pmd_trans_huge(*pmd))) {
1316 if (unlikely(pmd_trans_splitting(*pmd))) { 1316 if (unlikely(pmd_trans_splitting(*pmd))) {
1317 spin_unlock(&mm->page_table_lock); 1317 spin_unlock(&mm->page_table_lock);
1318 wait_split_huge_page(vma->anon_vma, pmd); 1318 wait_split_huge_page(vma->anon_vma, pmd);
1319 } else { 1319 } else {
1320 page = follow_trans_huge_pmd(mm, address, 1320 page = follow_trans_huge_pmd(mm, address,
1321 pmd, flags); 1321 pmd, flags);
1322 spin_unlock(&mm->page_table_lock); 1322 spin_unlock(&mm->page_table_lock);
1323 goto out; 1323 goto out;
1324 } 1324 }
1325 } else 1325 } else
1326 spin_unlock(&mm->page_table_lock); 1326 spin_unlock(&mm->page_table_lock);
1327 /* fall through */ 1327 /* fall through */
1328 } 1328 }
1329 split_fallthrough: 1329 split_fallthrough:
1330 if (unlikely(pmd_bad(*pmd))) 1330 if (unlikely(pmd_bad(*pmd)))
1331 goto no_page_table; 1331 goto no_page_table;
1332 1332
1333 ptep = pte_offset_map_lock(mm, pmd, address, &ptl); 1333 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
1334 1334
1335 pte = *ptep; 1335 pte = *ptep;
1336 if (!pte_present(pte)) 1336 if (!pte_present(pte))
1337 goto no_page; 1337 goto no_page;
1338 if ((flags & FOLL_WRITE) && !pte_write(pte)) 1338 if ((flags & FOLL_WRITE) && !pte_write(pte))
1339 goto unlock; 1339 goto unlock;
1340 1340
1341 page = vm_normal_page(vma, address, pte); 1341 page = vm_normal_page(vma, address, pte);
1342 if (unlikely(!page)) { 1342 if (unlikely(!page)) {
1343 if ((flags & FOLL_DUMP) || 1343 if ((flags & FOLL_DUMP) ||
1344 !is_zero_pfn(pte_pfn(pte))) 1344 !is_zero_pfn(pte_pfn(pte)))
1345 goto bad_page; 1345 goto bad_page;
1346 page = pte_page(pte); 1346 page = pte_page(pte);
1347 } 1347 }
1348 1348
1349 if (flags & FOLL_GET) 1349 if (flags & FOLL_GET)
1350 get_page(page); 1350 get_page(page);
1351 if (flags & FOLL_TOUCH) { 1351 if (flags & FOLL_TOUCH) {
1352 if ((flags & FOLL_WRITE) && 1352 if ((flags & FOLL_WRITE) &&
1353 !pte_dirty(pte) && !PageDirty(page)) 1353 !pte_dirty(pte) && !PageDirty(page))
1354 set_page_dirty(page); 1354 set_page_dirty(page);
1355 /* 1355 /*
1356 * pte_mkyoung() would be more correct here, but atomic care 1356 * pte_mkyoung() would be more correct here, but atomic care
1357 * is needed to avoid losing the dirty bit: it is easier to use 1357 * is needed to avoid losing the dirty bit: it is easier to use
1358 * mark_page_accessed(). 1358 * mark_page_accessed().
1359 */ 1359 */
1360 mark_page_accessed(page); 1360 mark_page_accessed(page);
1361 } 1361 }
1362 if (flags & FOLL_MLOCK) { 1362 if (flags & FOLL_MLOCK) {
1363 /* 1363 /*
1364 * The preliminary mapping check is mainly to avoid the 1364 * The preliminary mapping check is mainly to avoid the
1365 * pointless overhead of lock_page on the ZERO_PAGE 1365 * pointless overhead of lock_page on the ZERO_PAGE
1366 * which might bounce very badly if there is contention. 1366 * which might bounce very badly if there is contention.
1367 * 1367 *
1368 * If the page is already locked, we don't need to 1368 * If the page is already locked, we don't need to
1369 * handle it now - vmscan will handle it later if and 1369 * handle it now - vmscan will handle it later if and
1370 * when it attempts to reclaim the page. 1370 * when it attempts to reclaim the page.
1371 */ 1371 */
1372 if (page->mapping && trylock_page(page)) { 1372 if (page->mapping && trylock_page(page)) {
1373 lru_add_drain(); /* push cached pages to LRU */ 1373 lru_add_drain(); /* push cached pages to LRU */
1374 /* 1374 /*
1375 * Because we lock page here and migration is 1375 * Because we lock page here and migration is
1376 * blocked by the pte's page reference, we need 1376 * blocked by the pte's page reference, we need
1377 * only check for file-cache page truncation. 1377 * only check for file-cache page truncation.
1378 */ 1378 */
1379 if (page->mapping) 1379 if (page->mapping)
1380 mlock_vma_page(page); 1380 mlock_vma_page(page);
1381 unlock_page(page); 1381 unlock_page(page);
1382 } 1382 }
1383 } 1383 }
1384 unlock: 1384 unlock:
1385 pte_unmap_unlock(ptep, ptl); 1385 pte_unmap_unlock(ptep, ptl);
1386 out: 1386 out:
1387 return page; 1387 return page;
1388 1388
1389 bad_page: 1389 bad_page:
1390 pte_unmap_unlock(ptep, ptl); 1390 pte_unmap_unlock(ptep, ptl);
1391 return ERR_PTR(-EFAULT); 1391 return ERR_PTR(-EFAULT);
1392 1392
1393 no_page: 1393 no_page:
1394 pte_unmap_unlock(ptep, ptl); 1394 pte_unmap_unlock(ptep, ptl);
1395 if (!pte_none(pte)) 1395 if (!pte_none(pte))
1396 return page; 1396 return page;
1397 1397
1398 no_page_table: 1398 no_page_table:
1399 /* 1399 /*
1400 * When core dumping an enormous anonymous area that nobody 1400 * When core dumping an enormous anonymous area that nobody
1401 * has touched so far, we don't want to allocate unnecessary pages or 1401 * has touched so far, we don't want to allocate unnecessary pages or
1402 * page tables. Return error instead of NULL to skip handle_mm_fault, 1402 * page tables. Return error instead of NULL to skip handle_mm_fault,
1403 * then get_dump_page() will return NULL to leave a hole in the dump. 1403 * then get_dump_page() will return NULL to leave a hole in the dump.
1404 * But we can only make this optimization where a hole would surely 1404 * But we can only make this optimization where a hole would surely
1405 * be zero-filled if handle_mm_fault() actually did handle it. 1405 * be zero-filled if handle_mm_fault() actually did handle it.
1406 */ 1406 */
1407 if ((flags & FOLL_DUMP) && 1407 if ((flags & FOLL_DUMP) &&
1408 (!vma->vm_ops || !vma->vm_ops->fault)) 1408 (!vma->vm_ops || !vma->vm_ops->fault))
1409 return ERR_PTR(-EFAULT); 1409 return ERR_PTR(-EFAULT);
1410 return page; 1410 return page;
1411 } 1411 }
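/*
 * Illustrative sketch, not in-tree code: pinning the single page behind
 * one user address with follow_page().  Assumes the caller holds
 * mmap_sem; the helper name is hypothetical and error pointers (PFN or
 * other special mappings) are collapsed to NULL for brevity.
 */
static struct page *lookup_one_page_sketch(struct vm_area_struct *vma,
					   unsigned long address)
{
	struct page *page;

	page = follow_page(vma, address, FOLL_GET | FOLL_TOUCH);
	if (IS_ERR(page) || !page)
		return NULL;
	/* FOLL_GET took a reference: caller must put_page() when done */
	return page;
}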
1412 1412
1413 /** 1413 /**
1414 * __get_user_pages() - pin user pages in memory 1414 * __get_user_pages() - pin user pages in memory
1415 * @tsk: task_struct of target task 1415 * @tsk: task_struct of target task
1416 * @mm: mm_struct of target mm 1416 * @mm: mm_struct of target mm
1417 * @start: starting user address 1417 * @start: starting user address
1418 * @nr_pages: number of pages from start to pin 1418 * @nr_pages: number of pages from start to pin
1419 * @gup_flags: flags modifying pin behaviour 1419 * @gup_flags: flags modifying pin behaviour
1420 * @pages: array that receives pointers to the pages pinned. 1420 * @pages: array that receives pointers to the pages pinned.
1421 * Should be at least nr_pages long. Or NULL, if caller 1421 * Should be at least nr_pages long. Or NULL, if caller
1422 * only intends to ensure the pages are faulted in. 1422 * only intends to ensure the pages are faulted in.
1423 * @vmas: array of pointers to vmas corresponding to each page. 1423 * @vmas: array of pointers to vmas corresponding to each page.
1424 * Or NULL if the caller does not require them. 1424 * Or NULL if the caller does not require them.
1425 * @nonblocking: whether waiting for disk IO or mmap_sem contention 1425 * @nonblocking: whether waiting for disk IO or mmap_sem contention
1426 * 1426 *
1427 * Returns number of pages pinned. This may be fewer than the number 1427 * Returns number of pages pinned. This may be fewer than the number
1428 * requested. If nr_pages is 0 or negative, returns 0. If no pages 1428 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1429 * were pinned, returns -errno. Each page returned must be released 1429 * were pinned, returns -errno. Each page returned must be released
1430 * with a put_page() call when it is finished with. vmas will only 1430 * with a put_page() call when it is finished with. vmas will only
1431 * remain valid while mmap_sem is held. 1431 * remain valid while mmap_sem is held.
1432 * 1432 *
1433 * Must be called with mmap_sem held for read or write. 1433 * Must be called with mmap_sem held for read or write.
1434 * 1434 *
1435 * __get_user_pages walks a process's page tables and takes a reference to 1435 * __get_user_pages walks a process's page tables and takes a reference to
1436 * each struct page that each user address corresponds to at a given 1436 * each struct page that each user address corresponds to at a given
1437 * instant. That is, it takes the page that would be accessed if a user 1437 * instant. That is, it takes the page that would be accessed if a user
1438 * thread accesses the given user virtual address at that instant. 1438 * thread accesses the given user virtual address at that instant.
1439 * 1439 *
1440 * This does not guarantee that the page exists in the user mappings when 1440 * This does not guarantee that the page exists in the user mappings when
1441 * __get_user_pages returns, and there may even be a completely different 1441 * __get_user_pages returns, and there may even be a completely different
1442 * page there in some cases (eg. if mmapped pagecache has been invalidated 1442 * page there in some cases (eg. if mmapped pagecache has been invalidated
1443 * and subsequently re faulted). However it does guarantee that the page 1443 * and subsequently re faulted). However it does guarantee that the page
1444 * won't be freed completely. And mostly callers simply care that the page 1444 * won't be freed completely. And mostly callers simply care that the page
1445 * contains data that was valid *at some point in time*. Typically, an IO 1445 * contains data that was valid *at some point in time*. Typically, an IO
1446 * or similar operation cannot guarantee anything stronger anyway because 1446 * or similar operation cannot guarantee anything stronger anyway because
1447 * locks can't be held over the syscall boundary. 1447 * locks can't be held over the syscall boundary.
1448 * 1448 *
1449 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If 1449 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
1450 * the page is written to, set_page_dirty (or set_page_dirty_lock, as 1450 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
1451 * appropriate) must be called after the page is finished with, and 1451 * appropriate) must be called after the page is finished with, and
1452 * before put_page is called. 1452 * before put_page is called.
1453 * 1453 *
1454 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO 1454 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
1455 * or mmap_sem contention, and if waiting is needed to pin all pages, 1455 * or mmap_sem contention, and if waiting is needed to pin all pages,
1456 * *@nonblocking will be set to 0. 1456 * *@nonblocking will be set to 0.
1457 * 1457 *
1458 * In most cases, get_user_pages or get_user_pages_fast should be used 1458 * In most cases, get_user_pages or get_user_pages_fast should be used
1459 * instead of __get_user_pages. __get_user_pages should be used only if 1459 * instead of __get_user_pages. __get_user_pages should be used only if
1460 * you need some special @gup_flags. 1460 * you need some special @gup_flags.
1461 */ 1461 */
1462 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1462 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1463 unsigned long start, int nr_pages, unsigned int gup_flags, 1463 unsigned long start, int nr_pages, unsigned int gup_flags,
1464 struct page **pages, struct vm_area_struct **vmas, 1464 struct page **pages, struct vm_area_struct **vmas,
1465 int *nonblocking) 1465 int *nonblocking)
1466 { 1466 {
1467 int i; 1467 int i;
1468 unsigned long vm_flags; 1468 unsigned long vm_flags;
1469 1469
1470 if (nr_pages <= 0) 1470 if (nr_pages <= 0)
1471 return 0; 1471 return 0;
1472 1472
1473 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); 1473 VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
1474 1474
1475 /* 1475 /*
1476 * Require read or write permissions. 1476 * Require read or write permissions.
1477 * If FOLL_FORCE is set, we only require the "MAY" flags. 1477 * If FOLL_FORCE is set, we only require the "MAY" flags.
1478 */ 1478 */
1479 vm_flags = (gup_flags & FOLL_WRITE) ? 1479 vm_flags = (gup_flags & FOLL_WRITE) ?
1480 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 1480 (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
1481 vm_flags &= (gup_flags & FOLL_FORCE) ? 1481 vm_flags &= (gup_flags & FOLL_FORCE) ?
1482 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 1482 (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
1483 i = 0; 1483 i = 0;
1484 1484
1485 do { 1485 do {
1486 struct vm_area_struct *vma; 1486 struct vm_area_struct *vma;
1487 1487
1488 vma = find_extend_vma(mm, start); 1488 vma = find_extend_vma(mm, start);
1489 if (!vma && in_gate_area(mm, start)) { 1489 if (!vma && in_gate_area(mm, start)) {
1490 unsigned long pg = start & PAGE_MASK; 1490 unsigned long pg = start & PAGE_MASK;
1491 struct vm_area_struct *gate_vma = get_gate_vma(mm); 1491 struct vm_area_struct *gate_vma = get_gate_vma(mm);
1492 pgd_t *pgd; 1492 pgd_t *pgd;
1493 pud_t *pud; 1493 pud_t *pud;
1494 pmd_t *pmd; 1494 pmd_t *pmd;
1495 pte_t *pte; 1495 pte_t *pte;
1496 1496
1497 /* user gate pages are read-only */ 1497 /* user gate pages are read-only */
1498 if (gup_flags & FOLL_WRITE) 1498 if (gup_flags & FOLL_WRITE)
1499 return i ? : -EFAULT; 1499 return i ? : -EFAULT;
1500 if (pg > TASK_SIZE) 1500 if (pg > TASK_SIZE)
1501 pgd = pgd_offset_k(pg); 1501 pgd = pgd_offset_k(pg);
1502 else 1502 else
1503 pgd = pgd_offset_gate(mm, pg); 1503 pgd = pgd_offset_gate(mm, pg);
1504 BUG_ON(pgd_none(*pgd)); 1504 BUG_ON(pgd_none(*pgd));
1505 pud = pud_offset(pgd, pg); 1505 pud = pud_offset(pgd, pg);
1506 BUG_ON(pud_none(*pud)); 1506 BUG_ON(pud_none(*pud));
1507 pmd = pmd_offset(pud, pg); 1507 pmd = pmd_offset(pud, pg);
1508 if (pmd_none(*pmd)) 1508 if (pmd_none(*pmd))
1509 return i ? : -EFAULT; 1509 return i ? : -EFAULT;
1510 VM_BUG_ON(pmd_trans_huge(*pmd)); 1510 VM_BUG_ON(pmd_trans_huge(*pmd));
1511 pte = pte_offset_map(pmd, pg); 1511 pte = pte_offset_map(pmd, pg);
1512 if (pte_none(*pte)) { 1512 if (pte_none(*pte)) {
1513 pte_unmap(pte); 1513 pte_unmap(pte);
1514 return i ? : -EFAULT; 1514 return i ? : -EFAULT;
1515 } 1515 }
1516 if (pages) { 1516 if (pages) {
1517 struct page *page; 1517 struct page *page;
1518 1518
1519 page = vm_normal_page(gate_vma, start, *pte); 1519 page = vm_normal_page(gate_vma, start, *pte);
1520 if (!page) { 1520 if (!page) {
1521 if (!(gup_flags & FOLL_DUMP) && 1521 if (!(gup_flags & FOLL_DUMP) &&
1522 is_zero_pfn(pte_pfn(*pte))) 1522 is_zero_pfn(pte_pfn(*pte)))
1523 page = pte_page(*pte); 1523 page = pte_page(*pte);
1524 else { 1524 else {
1525 pte_unmap(pte); 1525 pte_unmap(pte);
1526 return i ? : -EFAULT; 1526 return i ? : -EFAULT;
1527 } 1527 }
1528 } 1528 }
1529 pages[i] = page; 1529 pages[i] = page;
1530 get_page(page); 1530 get_page(page);
1531 } 1531 }
1532 pte_unmap(pte); 1532 pte_unmap(pte);
1533 if (vmas) 1533 if (vmas)
1534 vmas[i] = gate_vma; 1534 vmas[i] = gate_vma;
1535 i++; 1535 i++;
1536 start += PAGE_SIZE; 1536 start += PAGE_SIZE;
1537 nr_pages--; 1537 nr_pages--;
1538 continue; 1538 continue;
1539 } 1539 }
1540 1540
1541 if (!vma || 1541 if (!vma ||
1542 (vma->vm_flags & (VM_IO | VM_PFNMAP)) || 1542 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1543 !(vm_flags & vma->vm_flags)) 1543 !(vm_flags & vma->vm_flags))
1544 return i ? : -EFAULT; 1544 return i ? : -EFAULT;
1545 1545
1546 if (is_vm_hugetlb_page(vma)) { 1546 if (is_vm_hugetlb_page(vma)) {
1547 i = follow_hugetlb_page(mm, vma, pages, vmas, 1547 i = follow_hugetlb_page(mm, vma, pages, vmas,
1548 &start, &nr_pages, i, gup_flags); 1548 &start, &nr_pages, i, gup_flags);
1549 continue; 1549 continue;
1550 } 1550 }
1551 1551
1552 do { 1552 do {
1553 struct page *page; 1553 struct page *page;
1554 unsigned int foll_flags = gup_flags; 1554 unsigned int foll_flags = gup_flags;
1555 1555
1556 /* 1556 /*
1557 * If we have a pending SIGKILL, don't keep faulting 1557 * If we have a pending SIGKILL, don't keep faulting
1558 * pages and potentially allocating memory. 1558 * pages and potentially allocating memory.
1559 */ 1559 */
1560 if (unlikely(fatal_signal_pending(current))) 1560 if (unlikely(fatal_signal_pending(current)))
1561 return i ? i : -ERESTARTSYS; 1561 return i ? i : -ERESTARTSYS;
1562 1562
1563 cond_resched(); 1563 cond_resched();
1564 while (!(page = follow_page(vma, start, foll_flags))) { 1564 while (!(page = follow_page(vma, start, foll_flags))) {
1565 int ret; 1565 int ret;
1566 unsigned int fault_flags = 0; 1566 unsigned int fault_flags = 0;
1567 1567
1568 if (foll_flags & FOLL_WRITE) 1568 if (foll_flags & FOLL_WRITE)
1569 fault_flags |= FAULT_FLAG_WRITE; 1569 fault_flags |= FAULT_FLAG_WRITE;
1570 if (nonblocking) 1570 if (nonblocking)
1571 fault_flags |= FAULT_FLAG_ALLOW_RETRY; 1571 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1572 1572
1573 ret = handle_mm_fault(mm, vma, start, 1573 ret = handle_mm_fault(mm, vma, start,
1574 fault_flags); 1574 fault_flags);
1575 1575
1576 if (ret & VM_FAULT_ERROR) { 1576 if (ret & VM_FAULT_ERROR) {
1577 if (ret & VM_FAULT_OOM) 1577 if (ret & VM_FAULT_OOM)
1578 return i ? i : -ENOMEM; 1578 return i ? i : -ENOMEM;
1579 if (ret & (VM_FAULT_HWPOISON | 1579 if (ret & (VM_FAULT_HWPOISON |
1580 VM_FAULT_HWPOISON_LARGE)) { 1580 VM_FAULT_HWPOISON_LARGE)) {
1581 if (i) 1581 if (i)
1582 return i; 1582 return i;
1583 else if (gup_flags & FOLL_HWPOISON) 1583 else if (gup_flags & FOLL_HWPOISON)
1584 return -EHWPOISON; 1584 return -EHWPOISON;
1585 else 1585 else
1586 return -EFAULT; 1586 return -EFAULT;
1587 } 1587 }
1588 if (ret & VM_FAULT_SIGBUS) 1588 if (ret & VM_FAULT_SIGBUS)
1589 return i ? i : -EFAULT; 1589 return i ? i : -EFAULT;
1590 BUG(); 1590 BUG();
1591 } 1591 }
1592 1592
1593 if (tsk) { 1593 if (tsk) {
1594 if (ret & VM_FAULT_MAJOR) 1594 if (ret & VM_FAULT_MAJOR)
1595 tsk->maj_flt++; 1595 tsk->maj_flt++;
1596 else 1596 else
1597 tsk->min_flt++; 1597 tsk->min_flt++;
1598 } 1598 }
1599 1599
1600 if (ret & VM_FAULT_RETRY) { 1600 if (ret & VM_FAULT_RETRY) {
1601 *nonblocking = 0; 1601 *nonblocking = 0;
1602 return i; 1602 return i;
1603 } 1603 }
1604 1604
1605 /* 1605 /*
1606 * The VM_FAULT_WRITE bit tells us that 1606 * The VM_FAULT_WRITE bit tells us that
1607 * do_wp_page has broken COW when necessary, 1607 * do_wp_page has broken COW when necessary,
1608 * even if maybe_mkwrite decided not to set 1608 * even if maybe_mkwrite decided not to set
1609 * pte_write. We can thus safely do subsequent 1609 * pte_write. We can thus safely do subsequent
1610 * page lookups as if they were reads. But only 1610 * page lookups as if they were reads. But only
1611 * do so when looping for pte_write is futile: 1611 * do so when looping for pte_write is futile:
1612 * in some cases userspace may also be wanting 1612 * in some cases userspace may also be wanting
1613 * to write to the gotten user page, which a 1613 * to write to the gotten user page, which a
1614 * read fault here might prevent (a readonly 1614 * read fault here might prevent (a readonly
1615 * page might get reCOWed by userspace write). 1615 * page might get reCOWed by userspace write).
1616 */ 1616 */
1617 if ((ret & VM_FAULT_WRITE) && 1617 if ((ret & VM_FAULT_WRITE) &&
1618 !(vma->vm_flags & VM_WRITE)) 1618 !(vma->vm_flags & VM_WRITE))
1619 foll_flags &= ~FOLL_WRITE; 1619 foll_flags &= ~FOLL_WRITE;
1620 1620
1621 cond_resched(); 1621 cond_resched();
1622 } 1622 }
1623 if (IS_ERR(page)) 1623 if (IS_ERR(page))
1624 return i ? i : PTR_ERR(page); 1624 return i ? i : PTR_ERR(page);
1625 if (pages) { 1625 if (pages) {
1626 pages[i] = page; 1626 pages[i] = page;
1627 1627
1628 flush_anon_page(vma, page, start); 1628 flush_anon_page(vma, page, start);
1629 flush_dcache_page(page); 1629 flush_dcache_page(page);
1630 } 1630 }
1631 if (vmas) 1631 if (vmas)
1632 vmas[i] = vma; 1632 vmas[i] = vma;
1633 i++; 1633 i++;
1634 start += PAGE_SIZE; 1634 start += PAGE_SIZE;
1635 nr_pages--; 1635 nr_pages--;
1636 } while (nr_pages && start < vma->vm_end); 1636 } while (nr_pages && start < vma->vm_end);
1637 } while (nr_pages); 1637 } while (nr_pages);
1638 return i; 1638 return i;
1639 } 1639 }
1640 EXPORT_SYMBOL(__get_user_pages); 1640 EXPORT_SYMBOL(__get_user_pages);
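/*
 * Illustrative sketch, not in-tree code: using the @nonblocking argument
 * described above.  When __get_user_pages() clears *nonblocking, the
 * FAULT_FLAG_ALLOW_RETRY path has already dropped mmap_sem, so the
 * caller only unlocks if it still holds the semaphore.  The helper name
 * is hypothetical.
 */
static int pin_pages_nonblocking_sketch(struct task_struct *tsk,
					struct mm_struct *mm,
					unsigned long start, int nr_pages,
					struct page **pages)
{
	int locked = 1;
	int ret;

	down_read(&mm->mmap_sem);
	ret = __get_user_pages(tsk, mm, start, nr_pages,
			       FOLL_TOUCH | FOLL_GET,
			       pages, NULL, &locked);
	if (locked)
		up_read(&mm->mmap_sem);
	return ret;
}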
1641 1641
1642 /** 1642 /**
1643 * get_user_pages() - pin user pages in memory 1643 * get_user_pages() - pin user pages in memory
1644 * @tsk: the task_struct to use for page fault accounting, or 1644 * @tsk: the task_struct to use for page fault accounting, or
1645 * NULL if faults are not to be recorded. 1645 * NULL if faults are not to be recorded.
1646 * @mm: mm_struct of target mm 1646 * @mm: mm_struct of target mm
1647 * @start: starting user address 1647 * @start: starting user address
1648 * @nr_pages: number of pages from start to pin 1648 * @nr_pages: number of pages from start to pin
1649 * @write: whether pages will be written to by the caller 1649 * @write: whether pages will be written to by the caller
1650 * @force: whether to force write access even if user mapping is 1650 * @force: whether to force write access even if user mapping is
1651 * readonly. This will result in the page being COWed even 1651 * readonly. This will result in the page being COWed even
1652 * in MAP_SHARED mappings. You do not want this. 1652 * in MAP_SHARED mappings. You do not want this.
1653 * @pages: array that receives pointers to the pages pinned. 1653 * @pages: array that receives pointers to the pages pinned.
1654 * Should be at least nr_pages long. Or NULL, if caller 1654 * Should be at least nr_pages long. Or NULL, if caller
1655 * only intends to ensure the pages are faulted in. 1655 * only intends to ensure the pages are faulted in.
1656 * @vmas: array of pointers to vmas corresponding to each page. 1656 * @vmas: array of pointers to vmas corresponding to each page.
1657 * Or NULL if the caller does not require them. 1657 * Or NULL if the caller does not require them.
1658 * 1658 *
1659 * Returns number of pages pinned. This may be fewer than the number 1659 * Returns number of pages pinned. This may be fewer than the number
1660 * requested. If nr_pages is 0 or negative, returns 0. If no pages 1660 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1661 * were pinned, returns -errno. Each page returned must be released 1661 * were pinned, returns -errno. Each page returned must be released
1662 * with a put_page() call when it is finished with. vmas will only 1662 * with a put_page() call when it is finished with. vmas will only
1663 * remain valid while mmap_sem is held. 1663 * remain valid while mmap_sem is held.
1664 * 1664 *
1665 * Must be called with mmap_sem held for read or write. 1665 * Must be called with mmap_sem held for read or write.
1666 * 1666 *
1667 * get_user_pages walks a process's page tables and takes a reference to 1667 * get_user_pages walks a process's page tables and takes a reference to
1668 * each struct page that each user address corresponds to at a given 1668 * each struct page that each user address corresponds to at a given
1669 * instant. That is, it takes the page that would be accessed if a user 1669 * instant. That is, it takes the page that would be accessed if a user
1670 * thread accesses the given user virtual address at that instant. 1670 * thread accesses the given user virtual address at that instant.
1671 * 1671 *
1672 * This does not guarantee that the page exists in the user mappings when 1672 * This does not guarantee that the page exists in the user mappings when
1673 * get_user_pages returns, and there may even be a completely different 1673 * get_user_pages returns, and there may even be a completely different
1674 * page there in some cases (eg. if mmapped pagecache has been invalidated 1674 * page there in some cases (eg. if mmapped pagecache has been invalidated
1675 * and subsequently re faulted). However it does guarantee that the page 1675 * and subsequently re faulted). However it does guarantee that the page
1676 * won't be freed completely. And mostly callers simply care that the page 1676 * won't be freed completely. And mostly callers simply care that the page
1677 * contains data that was valid *at some point in time*. Typically, an IO 1677 * contains data that was valid *at some point in time*. Typically, an IO
1678 * or similar operation cannot guarantee anything stronger anyway because 1678 * or similar operation cannot guarantee anything stronger anyway because
1679 * locks can't be held over the syscall boundary. 1679 * locks can't be held over the syscall boundary.
1680 * 1680 *
1681 * If write=0, the page must not be written to. If the page is written to, 1681 * If write=0, the page must not be written to. If the page is written to,
1682 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called 1682 * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called
1683 * after the page is finished with, and before put_page is called. 1683 * after the page is finished with, and before put_page is called.
1684 * 1684 *
1685 * get_user_pages is typically used for fewer-copy IO operations, to get a 1685 * get_user_pages is typically used for fewer-copy IO operations, to get a
1686 * handle on the memory by some means other than accesses via the user virtual 1686 * handle on the memory by some means other than accesses via the user virtual
1687 * addresses. The pages may be submitted for DMA to devices or accessed via 1687 * addresses. The pages may be submitted for DMA to devices or accessed via
1688 * their kernel linear mapping (via the kmap APIs). Care should be taken to 1688 * their kernel linear mapping (via the kmap APIs). Care should be taken to
1689 * use the correct cache flushing APIs. 1689 * use the correct cache flushing APIs.
1690 * 1690 *
1691 * See also get_user_pages_fast, for performance critical applications. 1691 * See also get_user_pages_fast, for performance critical applications.
1692 */ 1692 */
1693 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1693 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1694 unsigned long start, int nr_pages, int write, int force, 1694 unsigned long start, int nr_pages, int write, int force,
1695 struct page **pages, struct vm_area_struct **vmas) 1695 struct page **pages, struct vm_area_struct **vmas)
1696 { 1696 {
1697 int flags = FOLL_TOUCH; 1697 int flags = FOLL_TOUCH;
1698 1698
1699 if (pages) 1699 if (pages)
1700 flags |= FOLL_GET; 1700 flags |= FOLL_GET;
1701 if (write) 1701 if (write)
1702 flags |= FOLL_WRITE; 1702 flags |= FOLL_WRITE;
1703 if (force) 1703 if (force)
1704 flags |= FOLL_FORCE; 1704 flags |= FOLL_FORCE;
1705 1705
1706 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, 1706 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1707 NULL); 1707 NULL);
1708 } 1708 }
1709 EXPORT_SYMBOL(get_user_pages); 1709 EXPORT_SYMBOL(get_user_pages);
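
A hedged usage sketch of the call sequence the comment above describes: pin a user buffer under mmap_sem, use the pages, then dirty and release them. The helper names and the surrounding driver context are hypothetical, not part of this file.

static int example_pin_user_buffer(unsigned long uaddr, int nr_pages,
				   int write, struct page **pages)
{
	int got;

	down_read(&current->mm->mmap_sem);
	got = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
			     nr_pages, write, 0 /* force */, pages, NULL);
	up_read(&current->mm->mmap_sem);
	return got;		/* number of pages pinned, or -errno */
}

static void example_release_user_buffer(struct page **pages, int nr, int dirtied)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (dirtied)
			set_page_dirty_lock(pages[i]);	/* before put_page, per the comment above */
		put_page(pages[i]);
	}
}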
1710 1710
1711 /** 1711 /**
1712 * get_dump_page() - pin user page in memory while writing it to core dump 1712 * get_dump_page() - pin user page in memory while writing it to core dump
1713 * @addr: user address 1713 * @addr: user address
1714 * 1714 *
1715 * Returns struct page pointer of user page pinned for dump, 1715 * Returns struct page pointer of user page pinned for dump,
1716 * to be freed afterwards by page_cache_release() or put_page(). 1716 * to be freed afterwards by page_cache_release() or put_page().
1717 * 1717 *
1718 * Returns NULL on any kind of failure - a hole must then be inserted into 1718 * Returns NULL on any kind of failure - a hole must then be inserted into
1719 * the corefile, to preserve alignment with its headers; and also returns 1719 * the corefile, to preserve alignment with its headers; and also returns
1720 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - 1720 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
1721 * allowing a hole to be left in the corefile to save diskspace. 1721 * allowing a hole to be left in the corefile to save diskspace.
1722 * 1722 *
1723 * Called without mmap_sem, but after all other threads have been killed. 1723 * Called without mmap_sem, but after all other threads have been killed.
1724 */ 1724 */
1725 #ifdef CONFIG_ELF_CORE 1725 #ifdef CONFIG_ELF_CORE
1726 struct page *get_dump_page(unsigned long addr) 1726 struct page *get_dump_page(unsigned long addr)
1727 { 1727 {
1728 struct vm_area_struct *vma; 1728 struct vm_area_struct *vma;
1729 struct page *page; 1729 struct page *page;
1730 1730
1731 if (__get_user_pages(current, current->mm, addr, 1, 1731 if (__get_user_pages(current, current->mm, addr, 1,
1732 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, 1732 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1733 NULL) < 1) 1733 NULL) < 1)
1734 return NULL; 1734 return NULL;
1735 flush_cache_page(vma, addr, page_to_pfn(page)); 1735 flush_cache_page(vma, addr, page_to_pfn(page));
1736 return page; 1736 return page;
1737 } 1737 }
1738 #endif /* CONFIG_ELF_CORE */ 1738 #endif /* CONFIG_ELF_CORE */
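
A hedged sketch of the pattern the get_dump_page() kerneldoc describes: a returned page is written out and released, while a NULL return becomes a page-sized hole. example_dump_write() and example_dump_seek() stand in for the core-dump writer and are assumptions, not kernel APIs.

static int example_dump_one_page(struct file *file, unsigned long addr)
{
	struct page *page = get_dump_page(addr);
	int ok;

	if (page) {
		void *kaddr = kmap(page);

		ok = example_dump_write(file, kaddr, PAGE_SIZE);	/* hypothetical writer */
		kunmap(page);
		page_cache_release(page);	/* drop the pin taken by get_dump_page */
	} else {
		/* hole: keeps the core file aligned with its headers */
		ok = example_dump_seek(file, PAGE_SIZE);		/* hypothetical */
	}
	return ok;
}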
1739 1739
1740 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, 1740 pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1741 spinlock_t **ptl) 1741 spinlock_t **ptl)
1742 { 1742 {
1743 pgd_t * pgd = pgd_offset(mm, addr); 1743 pgd_t * pgd = pgd_offset(mm, addr);
1744 pud_t * pud = pud_alloc(mm, pgd, addr); 1744 pud_t * pud = pud_alloc(mm, pgd, addr);
1745 if (pud) { 1745 if (pud) {
1746 pmd_t * pmd = pmd_alloc(mm, pud, addr); 1746 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1747 if (pmd) { 1747 if (pmd) {
1748 VM_BUG_ON(pmd_trans_huge(*pmd)); 1748 VM_BUG_ON(pmd_trans_huge(*pmd));
1749 return pte_alloc_map_lock(mm, pmd, addr, ptl); 1749 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1750 } 1750 }
1751 } 1751 }
1752 return NULL; 1752 return NULL;
1753 } 1753 }
1754 1754
1755 /* 1755 /*
1756 * This is the old fallback for page remapping. 1756 * This is the old fallback for page remapping.
1757 * 1757 *
1758 * For historical reasons, it only allows reserved pages. Only 1758 * For historical reasons, it only allows reserved pages. Only
1759 * old drivers should use this, and they needed to mark their 1759 * old drivers should use this, and they needed to mark their
1760 * pages reserved for the old functions anyway. 1760 * pages reserved for the old functions anyway.
1761 */ 1761 */
1762 static int insert_page(struct vm_area_struct *vma, unsigned long addr, 1762 static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1763 struct page *page, pgprot_t prot) 1763 struct page *page, pgprot_t prot)
1764 { 1764 {
1765 struct mm_struct *mm = vma->vm_mm; 1765 struct mm_struct *mm = vma->vm_mm;
1766 int retval; 1766 int retval;
1767 pte_t *pte; 1767 pte_t *pte;
1768 spinlock_t *ptl; 1768 spinlock_t *ptl;
1769 1769
1770 retval = -EINVAL; 1770 retval = -EINVAL;
1771 if (PageAnon(page)) 1771 if (PageAnon(page))
1772 goto out; 1772 goto out;
1773 retval = -ENOMEM; 1773 retval = -ENOMEM;
1774 flush_dcache_page(page); 1774 flush_dcache_page(page);
1775 pte = get_locked_pte(mm, addr, &ptl); 1775 pte = get_locked_pte(mm, addr, &ptl);
1776 if (!pte) 1776 if (!pte)
1777 goto out; 1777 goto out;
1778 retval = -EBUSY; 1778 retval = -EBUSY;
1779 if (!pte_none(*pte)) 1779 if (!pte_none(*pte))
1780 goto out_unlock; 1780 goto out_unlock;
1781 1781
1782 /* Ok, finally just insert the thing.. */ 1782 /* Ok, finally just insert the thing.. */
1783 get_page(page); 1783 get_page(page);
1784 inc_mm_counter_fast(mm, MM_FILEPAGES); 1784 inc_mm_counter_fast(mm, MM_FILEPAGES);
1785 page_add_file_rmap(page); 1785 page_add_file_rmap(page);
1786 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 1786 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1787 1787
1788 retval = 0; 1788 retval = 0;
1789 pte_unmap_unlock(pte, ptl); 1789 pte_unmap_unlock(pte, ptl);
1790 return retval; 1790 return retval;
1791 out_unlock: 1791 out_unlock:
1792 pte_unmap_unlock(pte, ptl); 1792 pte_unmap_unlock(pte, ptl);
1793 out: 1793 out:
1794 return retval; 1794 return retval;
1795 } 1795 }
1796 1796
1797 /** 1797 /**
1798 * vm_insert_page - insert single page into user vma 1798 * vm_insert_page - insert single page into user vma
1799 * @vma: user vma to map to 1799 * @vma: user vma to map to
1800 * @addr: target user address of this page 1800 * @addr: target user address of this page
1801 * @page: source kernel page 1801 * @page: source kernel page
1802 * 1802 *
1803 * This allows drivers to insert individual pages they've allocated 1803 * This allows drivers to insert individual pages they've allocated
1804 * into a user vma. 1804 * into a user vma.
1805 * 1805 *
1806 * The page has to be a nice clean _individual_ kernel allocation. 1806 * The page has to be a nice clean _individual_ kernel allocation.
1807 * If you allocate a compound page, you need to have marked it as 1807 * If you allocate a compound page, you need to have marked it as
1808 * such (__GFP_COMP), or manually just split the page up yourself 1808 * such (__GFP_COMP), or manually just split the page up yourself
1809 * (see split_page()). 1809 * (see split_page()).
1810 * 1810 *
1811 * NOTE! Traditionally this was done with "remap_pfn_range()" which 1811 * NOTE! Traditionally this was done with "remap_pfn_range()" which
1812 * took an arbitrary page protection parameter. This doesn't allow 1812 * took an arbitrary page protection parameter. This doesn't allow
1813 * that. Your vma protection will have to be set up correctly, which 1813 * that. Your vma protection will have to be set up correctly, which
1814 * means that if you want a shared writable mapping, you'd better 1814 * means that if you want a shared writable mapping, you'd better
1815 * ask for a shared writable mapping! 1815 * ask for a shared writable mapping!
1816 * 1816 *
1817 * The page does not need to be reserved. 1817 * The page does not need to be reserved.
1818 */ 1818 */
1819 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 1819 int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1820 struct page *page) 1820 struct page *page)
1821 { 1821 {
1822 if (addr < vma->vm_start || addr >= vma->vm_end) 1822 if (addr < vma->vm_start || addr >= vma->vm_end)
1823 return -EFAULT; 1823 return -EFAULT;
1824 if (!page_count(page)) 1824 if (!page_count(page))
1825 return -EINVAL; 1825 return -EINVAL;
1826 vma->vm_flags |= VM_INSERTPAGE; 1826 vma->vm_flags |= VM_INSERTPAGE;
1827 return insert_page(vma, addr, page, vma->vm_page_prot); 1827 return insert_page(vma, addr, page, vma->vm_page_prot);
1828 } 1828 }
1829 EXPORT_SYMBOL(vm_insert_page); 1829 EXPORT_SYMBOL(vm_insert_page);
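
A hedged sketch of the typical caller: a driver mmap() handler inserting the individual pages of a vmalloc()ed buffer, one page at a time. example_buf and example_buf_size are hypothetical driver state, not part of this file.

static void *example_buf;		/* hypothetical: vmalloc()ed elsewhere */
static unsigned long example_buf_size;

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long uaddr, off = 0;
	int ret;

	if (vma->vm_end - vma->vm_start > example_buf_size)
		return -EINVAL;

	for (uaddr = vma->vm_start; uaddr < vma->vm_end;
	     uaddr += PAGE_SIZE, off += PAGE_SIZE) {
		/* each page is an individual, refcounted kernel allocation */
		ret = vm_insert_page(vma, uaddr,
				     vmalloc_to_page(example_buf + off));
		if (ret)
			return ret;
	}
	return 0;
}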
1830 1830
1831 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1831 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1832 unsigned long pfn, pgprot_t prot) 1832 unsigned long pfn, pgprot_t prot)
1833 { 1833 {
1834 struct mm_struct *mm = vma->vm_mm; 1834 struct mm_struct *mm = vma->vm_mm;
1835 int retval; 1835 int retval;
1836 pte_t *pte, entry; 1836 pte_t *pte, entry;
1837 spinlock_t *ptl; 1837 spinlock_t *ptl;
1838 1838
1839 retval = -ENOMEM; 1839 retval = -ENOMEM;
1840 pte = get_locked_pte(mm, addr, &ptl); 1840 pte = get_locked_pte(mm, addr, &ptl);
1841 if (!pte) 1841 if (!pte)
1842 goto out; 1842 goto out;
1843 retval = -EBUSY; 1843 retval = -EBUSY;
1844 if (!pte_none(*pte)) 1844 if (!pte_none(*pte))
1845 goto out_unlock; 1845 goto out_unlock;
1846 1846
1847 /* Ok, finally just insert the thing.. */ 1847 /* Ok, finally just insert the thing.. */
1848 entry = pte_mkspecial(pfn_pte(pfn, prot)); 1848 entry = pte_mkspecial(pfn_pte(pfn, prot));
1849 set_pte_at(mm, addr, pte, entry); 1849 set_pte_at(mm, addr, pte, entry);
1850 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ 1850 update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
1851 1851
1852 retval = 0; 1852 retval = 0;
1853 out_unlock: 1853 out_unlock:
1854 pte_unmap_unlock(pte, ptl); 1854 pte_unmap_unlock(pte, ptl);
1855 out: 1855 out:
1856 return retval; 1856 return retval;
1857 } 1857 }
1858 1858
1859 /** 1859 /**
1860 * vm_insert_pfn - insert single pfn into user vma 1860 * vm_insert_pfn - insert single pfn into user vma
1861 * @vma: user vma to map to 1861 * @vma: user vma to map to
1862 * @addr: target user address of this page 1862 * @addr: target user address of this page
1863 * @pfn: source kernel pfn 1863 * @pfn: source kernel pfn
1864 * 1864 *
1865 * Similar to vm_insert_page, this allows drivers to insert individual pages 1865 * Similar to vm_insert_page, this allows drivers to insert individual pages
1866 * they've allocated into a user vma. Same comments apply. 1866 * they've allocated into a user vma. Same comments apply.
1867 * 1867 *
1868 * This function should only be called from a vm_ops->fault handler, and 1868 * This function should only be called from a vm_ops->fault handler, and
1869 * in that case the handler should return NULL. 1869 * in that case the handler should return NULL.
1870 * 1870 *
1871 * vma cannot be a COW mapping. 1871 * vma cannot be a COW mapping.
1872 * 1872 *
1873 * As this is called only for pages that do not currently exist, we 1873 * As this is called only for pages that do not currently exist, we
1874 * do not need to flush old virtual caches or the TLB. 1874 * do not need to flush old virtual caches or the TLB.
1875 */ 1875 */
1876 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1876 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1877 unsigned long pfn) 1877 unsigned long pfn)
1878 { 1878 {
1879 int ret; 1879 int ret;
1880 pgprot_t pgprot = vma->vm_page_prot; 1880 pgprot_t pgprot = vma->vm_page_prot;
1881 /* 1881 /*
1882 * Technically, architectures with pte_special can avoid all these 1882 * Technically, architectures with pte_special can avoid all these
1883 * restrictions (same for remap_pfn_range). However we would like 1883 * restrictions (same for remap_pfn_range). However we would like
1884 * consistency in testing and feature parity among all, so we should 1884 * consistency in testing and feature parity among all, so we should
1885 * try to keep these invariants in place for everybody. 1885 * try to keep these invariants in place for everybody.
1886 */ 1886 */
1887 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); 1887 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1888 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == 1888 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1889 (VM_PFNMAP|VM_MIXEDMAP)); 1889 (VM_PFNMAP|VM_MIXEDMAP));
1890 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); 1890 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1891 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); 1891 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1892 1892
1893 if (addr < vma->vm_start || addr >= vma->vm_end) 1893 if (addr < vma->vm_start || addr >= vma->vm_end)
1894 return -EFAULT; 1894 return -EFAULT;
1895 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) 1895 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
1896 return -EINVAL; 1896 return -EINVAL;
1897 1897
1898 ret = insert_pfn(vma, addr, pfn, pgprot); 1898 ret = insert_pfn(vma, addr, pfn, pgprot);
1899 1899
1900 if (ret) 1900 if (ret)
1901 untrack_pfn_vma(vma, pfn, PAGE_SIZE); 1901 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
1902 1902
1903 return ret; 1903 return ret;
1904 } 1904 }
1905 EXPORT_SYMBOL(vm_insert_pfn); 1905 EXPORT_SYMBOL(vm_insert_pfn);
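
A hedged sketch of a vm_ops->fault handler for a VM_PFNMAP mapping using vm_insert_pfn(). example_base_pfn is a hypothetical device frame number, and returning VM_FAULT_NOPAGE is shown as the usual way such handlers report that the pte was installed directly.

static unsigned long example_base_pfn;	/* hypothetical device pfn base */

static int example_pfn_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long addr = (unsigned long)vmf->virtual_address;
	unsigned long pfn = example_base_pfn + vma->vm_pgoff +
			((addr - vma->vm_start) >> PAGE_SHIFT);

	if (vm_insert_pfn(vma, addr, pfn))
		return VM_FAULT_SIGBUS;
	return VM_FAULT_NOPAGE;		/* pte installed above, no struct page */
}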
1906 1906
1907 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, 1907 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1908 unsigned long pfn) 1908 unsigned long pfn)
1909 { 1909 {
1910 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); 1910 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1911 1911
1912 if (addr < vma->vm_start || addr >= vma->vm_end) 1912 if (addr < vma->vm_start || addr >= vma->vm_end)
1913 return -EFAULT; 1913 return -EFAULT;
1914 1914
1915 /* 1915 /*
1916 * If we don't have pte special, then we have to use the pfn_valid() 1916 * If we don't have pte special, then we have to use the pfn_valid()
1917 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* 1917 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
1918 * refcount the page if pfn_valid is true (hence insert_page rather 1918 * refcount the page if pfn_valid is true (hence insert_page rather
1919 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP 1919 * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
1920 * without pte special, it would there be refcounted as a normal page. 1920 * without pte special, it would there be refcounted as a normal page.
1921 */ 1921 */
1922 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { 1922 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1923 struct page *page; 1923 struct page *page;
1924 1924
1925 page = pfn_to_page(pfn); 1925 page = pfn_to_page(pfn);
1926 return insert_page(vma, addr, page, vma->vm_page_prot); 1926 return insert_page(vma, addr, page, vma->vm_page_prot);
1927 } 1927 }
1928 return insert_pfn(vma, addr, pfn, vma->vm_page_prot); 1928 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1929 } 1929 }
1930 EXPORT_SYMBOL(vm_insert_mixed); 1930 EXPORT_SYMBOL(vm_insert_mixed);
1931 1931
1932 /* 1932 /*
1933 * maps a range of physical memory into the requested pages. the old 1933 * maps a range of physical memory into the requested pages. the old
1934 * mappings are removed. any references to nonexistent pages result 1934 * mappings are removed. any references to nonexistent pages result
1935 * in null mappings (currently treated as "copy-on-access") 1935 * in null mappings (currently treated as "copy-on-access")
1936 */ 1936 */
1937 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, 1937 static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1938 unsigned long addr, unsigned long end, 1938 unsigned long addr, unsigned long end,
1939 unsigned long pfn, pgprot_t prot) 1939 unsigned long pfn, pgprot_t prot)
1940 { 1940 {
1941 pte_t *pte; 1941 pte_t *pte;
1942 spinlock_t *ptl; 1942 spinlock_t *ptl;
1943 1943
1944 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); 1944 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1945 if (!pte) 1945 if (!pte)
1946 return -ENOMEM; 1946 return -ENOMEM;
1947 arch_enter_lazy_mmu_mode(); 1947 arch_enter_lazy_mmu_mode();
1948 do { 1948 do {
1949 BUG_ON(!pte_none(*pte)); 1949 BUG_ON(!pte_none(*pte));
1950 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); 1950 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1951 pfn++; 1951 pfn++;
1952 } while (pte++, addr += PAGE_SIZE, addr != end); 1952 } while (pte++, addr += PAGE_SIZE, addr != end);
1953 arch_leave_lazy_mmu_mode(); 1953 arch_leave_lazy_mmu_mode();
1954 pte_unmap_unlock(pte - 1, ptl); 1954 pte_unmap_unlock(pte - 1, ptl);
1955 return 0; 1955 return 0;
1956 } 1956 }
1957 1957
1958 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, 1958 static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1959 unsigned long addr, unsigned long end, 1959 unsigned long addr, unsigned long end,
1960 unsigned long pfn, pgprot_t prot) 1960 unsigned long pfn, pgprot_t prot)
1961 { 1961 {
1962 pmd_t *pmd; 1962 pmd_t *pmd;
1963 unsigned long next; 1963 unsigned long next;
1964 1964
1965 pfn -= addr >> PAGE_SHIFT; 1965 pfn -= addr >> PAGE_SHIFT;
1966 pmd = pmd_alloc(mm, pud, addr); 1966 pmd = pmd_alloc(mm, pud, addr);
1967 if (!pmd) 1967 if (!pmd)
1968 return -ENOMEM; 1968 return -ENOMEM;
1969 VM_BUG_ON(pmd_trans_huge(*pmd)); 1969 VM_BUG_ON(pmd_trans_huge(*pmd));
1970 do { 1970 do {
1971 next = pmd_addr_end(addr, end); 1971 next = pmd_addr_end(addr, end);
1972 if (remap_pte_range(mm, pmd, addr, next, 1972 if (remap_pte_range(mm, pmd, addr, next,
1973 pfn + (addr >> PAGE_SHIFT), prot)) 1973 pfn + (addr >> PAGE_SHIFT), prot))
1974 return -ENOMEM; 1974 return -ENOMEM;
1975 } while (pmd++, addr = next, addr != end); 1975 } while (pmd++, addr = next, addr != end);
1976 return 0; 1976 return 0;
1977 } 1977 }
1978 1978
1979 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd, 1979 static inline int remap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1980 unsigned long addr, unsigned long end, 1980 unsigned long addr, unsigned long end,
1981 unsigned long pfn, pgprot_t prot) 1981 unsigned long pfn, pgprot_t prot)
1982 { 1982 {
1983 pud_t *pud; 1983 pud_t *pud;
1984 unsigned long next; 1984 unsigned long next;
1985 1985
1986 pfn -= addr >> PAGE_SHIFT; 1986 pfn -= addr >> PAGE_SHIFT;
1987 pud = pud_alloc(mm, pgd, addr); 1987 pud = pud_alloc(mm, pgd, addr);
1988 if (!pud) 1988 if (!pud)
1989 return -ENOMEM; 1989 return -ENOMEM;
1990 do { 1990 do {
1991 next = pud_addr_end(addr, end); 1991 next = pud_addr_end(addr, end);
1992 if (remap_pmd_range(mm, pud, addr, next, 1992 if (remap_pmd_range(mm, pud, addr, next,
1993 pfn + (addr >> PAGE_SHIFT), prot)) 1993 pfn + (addr >> PAGE_SHIFT), prot))
1994 return -ENOMEM; 1994 return -ENOMEM;
1995 } while (pud++, addr = next, addr != end); 1995 } while (pud++, addr = next, addr != end);
1996 return 0; 1996 return 0;
1997 } 1997 }
1998 1998
1999 /** 1999 /**
2000 * remap_pfn_range - remap kernel memory to userspace 2000 * remap_pfn_range - remap kernel memory to userspace
2001 * @vma: user vma to map to 2001 * @vma: user vma to map to
2002 * @addr: target user address to start at 2002 * @addr: target user address to start at
2003 * @pfn: physical address of kernel memory 2003 * @pfn: physical address of kernel memory
2004 * @size: size of map area 2004 * @size: size of map area
2005 * @prot: page protection flags for this mapping 2005 * @prot: page protection flags for this mapping
2006 * 2006 *
2007 * Note: this is only safe if the mm semaphore is held when called. 2007 * Note: this is only safe if the mm semaphore is held when called.
2008 */ 2008 */
2009 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, 2009 int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2010 unsigned long pfn, unsigned long size, pgprot_t prot) 2010 unsigned long pfn, unsigned long size, pgprot_t prot)
2011 { 2011 {
2012 pgd_t *pgd; 2012 pgd_t *pgd;
2013 unsigned long next; 2013 unsigned long next;
2014 unsigned long end = addr + PAGE_ALIGN(size); 2014 unsigned long end = addr + PAGE_ALIGN(size);
2015 struct mm_struct *mm = vma->vm_mm; 2015 struct mm_struct *mm = vma->vm_mm;
2016 int err; 2016 int err;
2017 2017
2018 /* 2018 /*
2019 * Physically remapped pages are special. Tell the 2019 * Physically remapped pages are special. Tell the
2020 * rest of the world about it: 2020 * rest of the world about it:
2021 * VM_IO tells people not to look at these pages 2021 * VM_IO tells people not to look at these pages
2022 * (accesses can have side effects). 2022 * (accesses can have side effects).
2023 * VM_RESERVED is specified all over the place, because 2023 * VM_RESERVED is specified all over the place, because
2024 * in 2.4 it kept swapout's vma scan off this vma; but 2024 * in 2.4 it kept swapout's vma scan off this vma; but
2025 * in 2.6 the LRU scan won't even find its pages, so this 2025 * in 2.6 the LRU scan won't even find its pages, so this
2026 * flag means no more than count its pages in reserved_vm, 2026 * flag means no more than count its pages in reserved_vm,
2027 * and omit it from core dump, even when VM_IO is turned off. 2027 * and omit it from core dump, even when VM_IO is turned off.
2028 * VM_PFNMAP tells the core MM that the base pages are just 2028 * VM_PFNMAP tells the core MM that the base pages are just
2029 * raw PFN mappings, and do not have a "struct page" associated 2029 * raw PFN mappings, and do not have a "struct page" associated
2030 * with them. 2030 * with them.
2031 * 2031 *
2032 * There's a horrible special case to handle copy-on-write 2032 * There's a horrible special case to handle copy-on-write
2033 * behaviour that some programs depend on. We mark the "original" 2033 * behaviour that some programs depend on. We mark the "original"
2034 * un-COW'ed pages by matching them up with "vma->vm_pgoff". 2034 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
2035 */ 2035 */
2036 if (addr == vma->vm_start && end == vma->vm_end) { 2036 if (addr == vma->vm_start && end == vma->vm_end) {
2037 vma->vm_pgoff = pfn; 2037 vma->vm_pgoff = pfn;
2038 vma->vm_flags |= VM_PFN_AT_MMAP; 2038 vma->vm_flags |= VM_PFN_AT_MMAP;
2039 } else if (is_cow_mapping(vma->vm_flags)) 2039 } else if (is_cow_mapping(vma->vm_flags))
2040 return -EINVAL; 2040 return -EINVAL;
2041 2041
2042 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; 2042 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
2043 2043
2044 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); 2044 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
2045 if (err) { 2045 if (err) {
2046 /* 2046 /*
2047 * To indicate that track_pfn related cleanup is not 2047 * To indicate that track_pfn related cleanup is not
2048 * needed from higher level routine calling unmap_vmas 2048 * needed from higher level routine calling unmap_vmas
2049 */ 2049 */
2050 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); 2050 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
2051 vma->vm_flags &= ~VM_PFN_AT_MMAP; 2051 vma->vm_flags &= ~VM_PFN_AT_MMAP;
2052 return -EINVAL; 2052 return -EINVAL;
2053 } 2053 }
2054 2054
2055 BUG_ON(addr >= end); 2055 BUG_ON(addr >= end);
2056 pfn -= addr >> PAGE_SHIFT; 2056 pfn -= addr >> PAGE_SHIFT;
2057 pgd = pgd_offset(mm, addr); 2057 pgd = pgd_offset(mm, addr);
2058 flush_cache_range(vma, addr, end); 2058 flush_cache_range(vma, addr, end);
2059 do { 2059 do {
2060 next = pgd_addr_end(addr, end); 2060 next = pgd_addr_end(addr, end);
2061 err = remap_pud_range(mm, pgd, addr, next, 2061 err = remap_pud_range(mm, pgd, addr, next,
2062 pfn + (addr >> PAGE_SHIFT), prot); 2062 pfn + (addr >> PAGE_SHIFT), prot);
2063 if (err) 2063 if (err)
2064 break; 2064 break;
2065 } while (pgd++, addr = next, addr != end); 2065 } while (pgd++, addr = next, addr != end);
2066 2066
2067 if (err) 2067 if (err)
2068 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); 2068 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
2069 2069
2070 return err; 2070 return err;
2071 } 2071 }
2072 EXPORT_SYMBOL(remap_pfn_range); 2072 EXPORT_SYMBOL(remap_pfn_range);
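
A hedged sketch of the classic caller: a driver mmap() handler remapping a contiguous physical region in one call, with the mm semaphore already held by the mmap path as the note above requires. example_phys_base is a hypothetical physical address, and pgprot_noncached() is shown only as a common choice for device memory.

static resource_size_t example_phys_base;	/* hypothetical device address */

static int example_io_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	return remap_pfn_range(vma, vma->vm_start,
			       (example_phys_base >> PAGE_SHIFT) + vma->vm_pgoff,
			       size, vma->vm_page_prot);
}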
2073 2073
2074 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, 2074 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
2075 unsigned long addr, unsigned long end, 2075 unsigned long addr, unsigned long end,
2076 pte_fn_t fn, void *data) 2076 pte_fn_t fn, void *data)
2077 { 2077 {
2078 pte_t *pte; 2078 pte_t *pte;
2079 int err; 2079 int err;
2080 pgtable_t token; 2080 pgtable_t token;
2081 spinlock_t *uninitialized_var(ptl); 2081 spinlock_t *uninitialized_var(ptl);
2082 2082
2083 pte = (mm == &init_mm) ? 2083 pte = (mm == &init_mm) ?
2084 pte_alloc_kernel(pmd, addr) : 2084 pte_alloc_kernel(pmd, addr) :
2085 pte_alloc_map_lock(mm, pmd, addr, &ptl); 2085 pte_alloc_map_lock(mm, pmd, addr, &ptl);
2086 if (!pte) 2086 if (!pte)
2087 return -ENOMEM; 2087 return -ENOMEM;
2088 2088
2089 BUG_ON(pmd_huge(*pmd)); 2089 BUG_ON(pmd_huge(*pmd));
2090 2090
2091 arch_enter_lazy_mmu_mode(); 2091 arch_enter_lazy_mmu_mode();
2092 2092
2093 token = pmd_pgtable(*pmd); 2093 token = pmd_pgtable(*pmd);
2094 2094
2095 do { 2095 do {
2096 err = fn(pte++, token, addr, data); 2096 err = fn(pte++, token, addr, data);
2097 if (err) 2097 if (err)
2098 break; 2098 break;
2099 } while (addr += PAGE_SIZE, addr != end); 2099 } while (addr += PAGE_SIZE, addr != end);
2100 2100
2101 arch_leave_lazy_mmu_mode(); 2101 arch_leave_lazy_mmu_mode();
2102 2102
2103 if (mm != &init_mm) 2103 if (mm != &init_mm)
2104 pte_unmap_unlock(pte-1, ptl); 2104 pte_unmap_unlock(pte-1, ptl);
2105 return err; 2105 return err;
2106 } 2106 }
2107 2107
2108 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, 2108 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
2109 unsigned long addr, unsigned long end, 2109 unsigned long addr, unsigned long end,
2110 pte_fn_t fn, void *data) 2110 pte_fn_t fn, void *data)
2111 { 2111 {
2112 pmd_t *pmd; 2112 pmd_t *pmd;
2113 unsigned long next; 2113 unsigned long next;
2114 int err; 2114 int err;
2115 2115
2116 BUG_ON(pud_huge(*pud)); 2116 BUG_ON(pud_huge(*pud));
2117 2117
2118 pmd = pmd_alloc(mm, pud, addr); 2118 pmd = pmd_alloc(mm, pud, addr);
2119 if (!pmd) 2119 if (!pmd)
2120 return -ENOMEM; 2120 return -ENOMEM;
2121 do { 2121 do {
2122 next = pmd_addr_end(addr, end); 2122 next = pmd_addr_end(addr, end);
2123 err = apply_to_pte_range(mm, pmd, addr, next, fn, data); 2123 err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
2124 if (err) 2124 if (err)
2125 break; 2125 break;
2126 } while (pmd++, addr = next, addr != end); 2126 } while (pmd++, addr = next, addr != end);
2127 return err; 2127 return err;
2128 } 2128 }
2129 2129
2130 static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd, 2130 static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
2131 unsigned long addr, unsigned long end, 2131 unsigned long addr, unsigned long end,
2132 pte_fn_t fn, void *data) 2132 pte_fn_t fn, void *data)
2133 { 2133 {
2134 pud_t *pud; 2134 pud_t *pud;
2135 unsigned long next; 2135 unsigned long next;
2136 int err; 2136 int err;
2137 2137
2138 pud = pud_alloc(mm, pgd, addr); 2138 pud = pud_alloc(mm, pgd, addr);
2139 if (!pud) 2139 if (!pud)
2140 return -ENOMEM; 2140 return -ENOMEM;
2141 do { 2141 do {
2142 next = pud_addr_end(addr, end); 2142 next = pud_addr_end(addr, end);
2143 err = apply_to_pmd_range(mm, pud, addr, next, fn, data); 2143 err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
2144 if (err) 2144 if (err)
2145 break; 2145 break;
2146 } while (pud++, addr = next, addr != end); 2146 } while (pud++, addr = next, addr != end);
2147 return err; 2147 return err;
2148 } 2148 }
2149 2149
2150 /* 2150 /*
2151 * Scan a region of virtual memory, filling in page tables as necessary 2151 * Scan a region of virtual memory, filling in page tables as necessary
2152 * and calling a provided function on each leaf page table. 2152 * and calling a provided function on each leaf page table.
2153 */ 2153 */
2154 int apply_to_page_range(struct mm_struct *mm, unsigned long addr, 2154 int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2155 unsigned long size, pte_fn_t fn, void *data) 2155 unsigned long size, pte_fn_t fn, void *data)
2156 { 2156 {
2157 pgd_t *pgd; 2157 pgd_t *pgd;
2158 unsigned long next; 2158 unsigned long next;
2159 unsigned long end = addr + size; 2159 unsigned long end = addr + size;
2160 int err; 2160 int err;
2161 2161
2162 BUG_ON(addr >= end); 2162 BUG_ON(addr >= end);
2163 pgd = pgd_offset(mm, addr); 2163 pgd = pgd_offset(mm, addr);
2164 do { 2164 do {
2165 next = pgd_addr_end(addr, end); 2165 next = pgd_addr_end(addr, end);
2166 err = apply_to_pud_range(mm, pgd, addr, next, fn, data); 2166 err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
2167 if (err) 2167 if (err)
2168 break; 2168 break;
2169 } while (pgd++, addr = next, addr != end); 2169 } while (pgd++, addr = next, addr != end);
2170 2170
2171 return err; 2171 return err;
2172 } 2172 }
2173 EXPORT_SYMBOL_GPL(apply_to_page_range); 2173 EXPORT_SYMBOL_GPL(apply_to_page_range);
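
A hedged sketch of the callback shape apply_to_page_range() expects (pte_fn_t). The example counts present leaf ptes over a kernel-address range; note that, as the comment above says, the walk fills in missing page tables, so it is normally applied to ranges the caller is about to populate anyway. All names here are hypothetical.

static int example_count_present(pte_t *pte, pgtable_t token,
				 unsigned long addr, void *data)
{
	unsigned long *count = data;

	if (pte_present(*pte))
		(*count)++;
	return 0;			/* non-zero would stop the walk */
}

static unsigned long example_present_pages(unsigned long start, unsigned long size)
{
	unsigned long count = 0;

	apply_to_page_range(&init_mm, start, size,
			    example_count_present, &count);
	return count;
}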
2174 2174
2175 /* 2175 /*
2176 * handle_pte_fault chooses page fault handler according to an entry 2176 * handle_pte_fault chooses page fault handler according to an entry
2177 * which was read non-atomically. Before making any commitment, on 2177 * which was read non-atomically. Before making any commitment, on
2178 * those architectures or configurations (e.g. i386 with PAE) which 2178 * those architectures or configurations (e.g. i386 with PAE) which
2179 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault 2179 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
2180 * must check under lock before unmapping the pte and proceeding 2180 * must check under lock before unmapping the pte and proceeding
2181 * (but do_wp_page is only called after already making such a check; 2181 * (but do_wp_page is only called after already making such a check;
2182 * and do_anonymous_page can safely check later on). 2182 * and do_anonymous_page can safely check later on).
2183 */ 2183 */
2184 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, 2184 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2185 pte_t *page_table, pte_t orig_pte) 2185 pte_t *page_table, pte_t orig_pte)
2186 { 2186 {
2187 int same = 1; 2187 int same = 1;
2188 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) 2188 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
2189 if (sizeof(pte_t) > sizeof(unsigned long)) { 2189 if (sizeof(pte_t) > sizeof(unsigned long)) {
2190 spinlock_t *ptl = pte_lockptr(mm, pmd); 2190 spinlock_t *ptl = pte_lockptr(mm, pmd);
2191 spin_lock(ptl); 2191 spin_lock(ptl);
2192 same = pte_same(*page_table, orig_pte); 2192 same = pte_same(*page_table, orig_pte);
2193 spin_unlock(ptl); 2193 spin_unlock(ptl);
2194 } 2194 }
2195 #endif 2195 #endif
2196 pte_unmap(page_table); 2196 pte_unmap(page_table);
2197 return same; 2197 return same;
2198 } 2198 }
2199 2199
2200 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 2200 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2201 { 2201 {
2202 /* 2202 /*
2203 * If the source page was a PFN mapping, we don't have 2203 * If the source page was a PFN mapping, we don't have
2204 * a "struct page" for it. We do a best-effort copy by 2204 * a "struct page" for it. We do a best-effort copy by
2205 * just copying from the original user address. If that 2205 * just copying from the original user address. If that
2206 * fails, we just zero-fill it. Live with it. 2206 * fails, we just zero-fill it. Live with it.
2207 */ 2207 */
2208 if (unlikely(!src)) { 2208 if (unlikely(!src)) {
2209 void *kaddr = kmap_atomic(dst, KM_USER0); 2209 void *kaddr = kmap_atomic(dst, KM_USER0);
2210 void __user *uaddr = (void __user *)(va & PAGE_MASK); 2210 void __user *uaddr = (void __user *)(va & PAGE_MASK);
2211 2211
2212 /* 2212 /*
2213 * This really shouldn't fail, because the page is there 2213 * This really shouldn't fail, because the page is there
2214 * in the page tables. But it might just be unreadable, 2214 * in the page tables. But it might just be unreadable,
2215 * in which case we just give up and fill the result with 2215 * in which case we just give up and fill the result with
2216 * zeroes. 2216 * zeroes.
2217 */ 2217 */
2218 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 2218 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2219 clear_page(kaddr); 2219 clear_page(kaddr);
2220 kunmap_atomic(kaddr, KM_USER0); 2220 kunmap_atomic(kaddr, KM_USER0);
2221 flush_dcache_page(dst); 2221 flush_dcache_page(dst);
2222 } else 2222 } else
2223 copy_user_highpage(dst, src, va, vma); 2223 copy_user_highpage(dst, src, va, vma);
2224 } 2224 }
2225 2225
2226 /* 2226 /*
2227 * This routine handles present pages, when users try to write 2227 * This routine handles present pages, when users try to write
2228 * to a shared page. It is done by copying the page to a new address 2228 * to a shared page. It is done by copying the page to a new address
2229 * and decrementing the shared-page counter for the old page. 2229 * and decrementing the shared-page counter for the old page.
2230 * 2230 *
2231 * Note that this routine assumes that the protection checks have been 2231 * Note that this routine assumes that the protection checks have been
2232 * done by the caller (the low-level page fault routine in most cases). 2232 * done by the caller (the low-level page fault routine in most cases).
2233 * Thus we can safely just mark it writable once we've done any necessary 2233 * Thus we can safely just mark it writable once we've done any necessary
2234 * COW. 2234 * COW.
2235 * 2235 *
2236 * We also mark the page dirty at this point even though the page will 2236 * We also mark the page dirty at this point even though the page will
2237 * change only once the write actually happens. This avoids a few races, 2237 * change only once the write actually happens. This avoids a few races,
2238 * and potentially makes it more efficient. 2238 * and potentially makes it more efficient.
2239 * 2239 *
2240 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2240 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2241 * but allow concurrent faults), with pte both mapped and locked. 2241 * but allow concurrent faults), with pte both mapped and locked.
2242 * We return with mmap_sem still held, but pte unmapped and unlocked. 2242 * We return with mmap_sem still held, but pte unmapped and unlocked.
2243 */ 2243 */
2244 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 2244 static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2245 unsigned long address, pte_t *page_table, pmd_t *pmd, 2245 unsigned long address, pte_t *page_table, pmd_t *pmd,
2246 spinlock_t *ptl, pte_t orig_pte) 2246 spinlock_t *ptl, pte_t orig_pte)
2247 __releases(ptl) 2247 __releases(ptl)
2248 { 2248 {
2249 struct page *old_page, *new_page; 2249 struct page *old_page, *new_page;
2250 pte_t entry; 2250 pte_t entry;
2251 int ret = 0; 2251 int ret = 0;
2252 int page_mkwrite = 0; 2252 int page_mkwrite = 0;
2253 struct page *dirty_page = NULL; 2253 struct page *dirty_page = NULL;
2254 2254
2255 old_page = vm_normal_page(vma, address, orig_pte); 2255 old_page = vm_normal_page(vma, address, orig_pte);
2256 if (!old_page) { 2256 if (!old_page) {
2257 /* 2257 /*
2258 * VM_MIXEDMAP !pfn_valid() case 2258 * VM_MIXEDMAP !pfn_valid() case
2259 * 2259 *
2260 * We should not cow pages in a shared writeable mapping. 2260 * We should not cow pages in a shared writeable mapping.
2261 * Just mark the pages writable as we can't do any dirty 2261 * Just mark the pages writable as we can't do any dirty
2262 * accounting on raw pfn maps. 2262 * accounting on raw pfn maps.
2263 */ 2263 */
2264 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2264 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2265 (VM_WRITE|VM_SHARED)) 2265 (VM_WRITE|VM_SHARED))
2266 goto reuse; 2266 goto reuse;
2267 goto gotten; 2267 goto gotten;
2268 } 2268 }
2269 2269
2270 /* 2270 /*
2271 * Take out anonymous pages first, anonymous shared vmas are 2271 * Take out anonymous pages first, anonymous shared vmas are
2272 * not dirty accountable. 2272 * not dirty accountable.
2273 */ 2273 */
2274 if (PageAnon(old_page) && !PageKsm(old_page)) { 2274 if (PageAnon(old_page) && !PageKsm(old_page)) {
2275 if (!trylock_page(old_page)) { 2275 if (!trylock_page(old_page)) {
2276 page_cache_get(old_page); 2276 page_cache_get(old_page);
2277 pte_unmap_unlock(page_table, ptl); 2277 pte_unmap_unlock(page_table, ptl);
2278 lock_page(old_page); 2278 lock_page(old_page);
2279 page_table = pte_offset_map_lock(mm, pmd, address, 2279 page_table = pte_offset_map_lock(mm, pmd, address,
2280 &ptl); 2280 &ptl);
2281 if (!pte_same(*page_table, orig_pte)) { 2281 if (!pte_same(*page_table, orig_pte)) {
2282 unlock_page(old_page); 2282 unlock_page(old_page);
2283 goto unlock; 2283 goto unlock;
2284 } 2284 }
2285 page_cache_release(old_page); 2285 page_cache_release(old_page);
2286 } 2286 }
2287 if (reuse_swap_page(old_page)) { 2287 if (reuse_swap_page(old_page)) {
2288 /* 2288 /*
2289 * The page is all ours. Move it to our anon_vma so 2289 * The page is all ours. Move it to our anon_vma so
2290 * the rmap code will not search our parent or siblings. 2290 * the rmap code will not search our parent or siblings.
2291 * Protected against the rmap code by the page lock. 2291 * Protected against the rmap code by the page lock.
2292 */ 2292 */
2293 page_move_anon_rmap(old_page, vma, address); 2293 page_move_anon_rmap(old_page, vma, address);
2294 unlock_page(old_page); 2294 unlock_page(old_page);
2295 goto reuse; 2295 goto reuse;
2296 } 2296 }
2297 unlock_page(old_page); 2297 unlock_page(old_page);
2298 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2298 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2299 (VM_WRITE|VM_SHARED))) { 2299 (VM_WRITE|VM_SHARED))) {
2300 /* 2300 /*
2301 * Only catch write-faults on shared writable pages, 2301 * Only catch write-faults on shared writable pages,
2302 * read-only shared pages can get COWed by 2302 * read-only shared pages can get COWed by
2303 * get_user_pages(.write=1, .force=1). 2303 * get_user_pages(.write=1, .force=1).
2304 */ 2304 */
2305 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 2305 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2306 struct vm_fault vmf; 2306 struct vm_fault vmf;
2307 int tmp; 2307 int tmp;
2308 2308
2309 vmf.virtual_address = (void __user *)(address & 2309 vmf.virtual_address = (void __user *)(address &
2310 PAGE_MASK); 2310 PAGE_MASK);
2311 vmf.pgoff = old_page->index; 2311 vmf.pgoff = old_page->index;
2312 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; 2312 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2313 vmf.page = old_page; 2313 vmf.page = old_page;
2314 2314
2315 /* 2315 /*
2316 * Notify the address space that the page is about to 2316 * Notify the address space that the page is about to
2317 * become writable so that it can prohibit this or wait 2317 * become writable so that it can prohibit this or wait
2318 * for the page to get into an appropriate state. 2318 * for the page to get into an appropriate state.
2319 * 2319 *
2320 * We do this without the lock held, so that it can 2320 * We do this without the lock held, so that it can
2321 * sleep if it needs to. 2321 * sleep if it needs to.
2322 */ 2322 */
2323 page_cache_get(old_page); 2323 page_cache_get(old_page);
2324 pte_unmap_unlock(page_table, ptl); 2324 pte_unmap_unlock(page_table, ptl);
2325 2325
2326 tmp = vma->vm_ops->page_mkwrite(vma, &vmf); 2326 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2327 if (unlikely(tmp & 2327 if (unlikely(tmp &
2328 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { 2328 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2329 ret = tmp; 2329 ret = tmp;
2330 goto unwritable_page; 2330 goto unwritable_page;
2331 } 2331 }
2332 if (unlikely(!(tmp & VM_FAULT_LOCKED))) { 2332 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
2333 lock_page(old_page); 2333 lock_page(old_page);
2334 if (!old_page->mapping) { 2334 if (!old_page->mapping) {
2335 ret = 0; /* retry the fault */ 2335 ret = 0; /* retry the fault */
2336 unlock_page(old_page); 2336 unlock_page(old_page);
2337 goto unwritable_page; 2337 goto unwritable_page;
2338 } 2338 }
2339 } else 2339 } else
2340 VM_BUG_ON(!PageLocked(old_page)); 2340 VM_BUG_ON(!PageLocked(old_page));
2341 2341
2342 /* 2342 /*
2343 * Since we dropped the lock we need to revalidate 2343 * Since we dropped the lock we need to revalidate
2344 * the PTE as someone else may have changed it. If 2344 * the PTE as someone else may have changed it. If
2345 * they did, we just return, as we can count on the 2345 * they did, we just return, as we can count on the
2346 * MMU to tell us if they didn't also make it writable. 2346 * MMU to tell us if they didn't also make it writable.
2347 */ 2347 */
2348 page_table = pte_offset_map_lock(mm, pmd, address, 2348 page_table = pte_offset_map_lock(mm, pmd, address,
2349 &ptl); 2349 &ptl);
2350 if (!pte_same(*page_table, orig_pte)) { 2350 if (!pte_same(*page_table, orig_pte)) {
2351 unlock_page(old_page); 2351 unlock_page(old_page);
2352 goto unlock; 2352 goto unlock;
2353 } 2353 }
2354 2354
2355 page_mkwrite = 1; 2355 page_mkwrite = 1;
2356 } 2356 }
2357 dirty_page = old_page; 2357 dirty_page = old_page;
2358 get_page(dirty_page); 2358 get_page(dirty_page);
2359 2359
2360 reuse: 2360 reuse:
2361 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2361 flush_cache_page(vma, address, pte_pfn(orig_pte));
2362 entry = pte_mkyoung(orig_pte); 2362 entry = pte_mkyoung(orig_pte);
2363 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2363 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2364 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2364 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2365 update_mmu_cache(vma, address, page_table); 2365 update_mmu_cache(vma, address, page_table);
2366 pte_unmap_unlock(page_table, ptl); 2366 pte_unmap_unlock(page_table, ptl);
2367 ret |= VM_FAULT_WRITE; 2367 ret |= VM_FAULT_WRITE;
2368 2368
2369 if (!dirty_page) 2369 if (!dirty_page)
2370 return ret; 2370 return ret;
2371 2371
2372 /* 2372 /*
2373 * Yes, Virginia, this is actually required to prevent a race 2373 * Yes, Virginia, this is actually required to prevent a race
2374 * with clear_page_dirty_for_io() from clearing the page dirty 2374 * with clear_page_dirty_for_io() from clearing the page dirty
2375 * bit after it clears all dirty ptes, but before a racing 2375 * bit after it clears all dirty ptes, but before a racing
2376 * do_wp_page installs a dirty pte. 2376 * do_wp_page installs a dirty pte.
2377 * 2377 *
2378 * __do_fault is protected similarly. 2378 * __do_fault is protected similarly.
2379 */ 2379 */
2380 if (!page_mkwrite) { 2380 if (!page_mkwrite) {
2381 wait_on_page_locked(dirty_page); 2381 wait_on_page_locked(dirty_page);
2382 set_page_dirty_balance(dirty_page, page_mkwrite); 2382 set_page_dirty_balance(dirty_page, page_mkwrite);
2383 } 2383 }
2384 put_page(dirty_page); 2384 put_page(dirty_page);
2385 if (page_mkwrite) { 2385 if (page_mkwrite) {
2386 struct address_space *mapping = dirty_page->mapping; 2386 struct address_space *mapping = dirty_page->mapping;
2387 2387
2388 set_page_dirty(dirty_page); 2388 set_page_dirty(dirty_page);
2389 unlock_page(dirty_page); 2389 unlock_page(dirty_page);
2390 page_cache_release(dirty_page); 2390 page_cache_release(dirty_page);
2391 if (mapping) { 2391 if (mapping) {
2392 /* 2392 /*
2393 * Some device drivers do not set page.mapping 2393 * Some device drivers do not set page.mapping
2394 * but still dirty their pages 2394 * but still dirty their pages
2395 */ 2395 */
2396 balance_dirty_pages_ratelimited(mapping); 2396 balance_dirty_pages_ratelimited(mapping);
2397 } 2397 }
2398 } 2398 }
2399 2399
2400 /* file_update_time outside page_lock */ 2400 /* file_update_time outside page_lock */
2401 if (vma->vm_file) 2401 if (vma->vm_file)
2402 file_update_time(vma->vm_file); 2402 file_update_time(vma->vm_file);
2403 2403
2404 return ret; 2404 return ret;
2405 } 2405 }
2406 2406
2407 /* 2407 /*
2408 * Ok, we need to copy. Oh, well.. 2408 * Ok, we need to copy. Oh, well..
2409 */ 2409 */
2410 page_cache_get(old_page); 2410 page_cache_get(old_page);
2411 gotten: 2411 gotten:
2412 pte_unmap_unlock(page_table, ptl); 2412 pte_unmap_unlock(page_table, ptl);
2413 2413
2414 if (unlikely(anon_vma_prepare(vma))) 2414 if (unlikely(anon_vma_prepare(vma)))
2415 goto oom; 2415 goto oom;
2416 2416
2417 if (is_zero_pfn(pte_pfn(orig_pte))) { 2417 if (is_zero_pfn(pte_pfn(orig_pte))) {
2418 new_page = alloc_zeroed_user_highpage_movable(vma, address); 2418 new_page = alloc_zeroed_user_highpage_movable(vma, address);
2419 if (!new_page) 2419 if (!new_page)
2420 goto oom; 2420 goto oom;
2421 } else { 2421 } else {
2422 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 2422 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
2423 if (!new_page) 2423 if (!new_page)
2424 goto oom; 2424 goto oom;
2425 cow_user_page(new_page, old_page, address, vma); 2425 cow_user_page(new_page, old_page, address, vma);
2426 } 2426 }
2427 __SetPageUptodate(new_page); 2427 __SetPageUptodate(new_page);
2428 2428
2429 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2429 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2430 goto oom_free_new; 2430 goto oom_free_new;
2431 2431
2432 /* 2432 /*
2433 * Re-check the pte - we dropped the lock 2433 * Re-check the pte - we dropped the lock
2434 */ 2434 */
2435 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2435 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2436 if (likely(pte_same(*page_table, orig_pte))) { 2436 if (likely(pte_same(*page_table, orig_pte))) {
2437 if (old_page) { 2437 if (old_page) {
2438 if (!PageAnon(old_page)) { 2438 if (!PageAnon(old_page)) {
2439 dec_mm_counter_fast(mm, MM_FILEPAGES); 2439 dec_mm_counter_fast(mm, MM_FILEPAGES);
2440 inc_mm_counter_fast(mm, MM_ANONPAGES); 2440 inc_mm_counter_fast(mm, MM_ANONPAGES);
2441 } 2441 }
2442 } else 2442 } else
2443 inc_mm_counter_fast(mm, MM_ANONPAGES); 2443 inc_mm_counter_fast(mm, MM_ANONPAGES);
2444 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2444 flush_cache_page(vma, address, pte_pfn(orig_pte));
2445 entry = mk_pte(new_page, vma->vm_page_prot); 2445 entry = mk_pte(new_page, vma->vm_page_prot);
2446 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2446 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2447 /* 2447 /*
2448 * Clear the pte entry and flush it first, before updating the 2448 * Clear the pte entry and flush it first, before updating the
2449 * pte with the new entry. This will avoid a race condition 2449 * pte with the new entry. This will avoid a race condition
2450 * seen in the presence of one thread doing SMC and another 2450 * seen in the presence of one thread doing SMC and another
2451 * thread doing COW. 2451 * thread doing COW.
2452 */ 2452 */
2453 ptep_clear_flush(vma, address, page_table); 2453 ptep_clear_flush(vma, address, page_table);
2454 page_add_new_anon_rmap(new_page, vma, address); 2454 page_add_new_anon_rmap(new_page, vma, address);
2455 /* 2455 /*
2456 * We call the notify macro here because, when using secondary 2456 * We call the notify macro here because, when using secondary
2457 * mmu page tables (such as kvm shadow page tables), we want the 2457 * mmu page tables (such as kvm shadow page tables), we want the
2458 * new page to be mapped directly into the secondary page table. 2458 * new page to be mapped directly into the secondary page table.
2459 */ 2459 */
2460 set_pte_at_notify(mm, address, page_table, entry); 2460 set_pte_at_notify(mm, address, page_table, entry);
2461 update_mmu_cache(vma, address, page_table); 2461 update_mmu_cache(vma, address, page_table);
2462 if (old_page) { 2462 if (old_page) {
2463 /* 2463 /*
2464 * Only after switching the pte to the new page may 2464 * Only after switching the pte to the new page may
2465 * we remove the mapcount here. Otherwise another 2465 * we remove the mapcount here. Otherwise another
2466 * process may come and find the rmap count decremented 2466 * process may come and find the rmap count decremented
2467 * before the pte is switched to the new page, and 2467 * before the pte is switched to the new page, and
2468 * "reuse" the old page writing into it while our pte 2468 * "reuse" the old page writing into it while our pte
2469 * here still points into it and can be read by other 2469 * here still points into it and can be read by other
2470 * threads. 2470 * threads.
2471 * 2471 *
2472 * The critical issue is to order this 2472 * The critical issue is to order this
2473 * page_remove_rmap with the ptp_clear_flush above. 2473 * page_remove_rmap with the ptp_clear_flush above.
2474 * Those stores are ordered by (if nothing else,) 2474 * Those stores are ordered by (if nothing else,)
2475 * the barrier present in the atomic_add_negative 2475 * the barrier present in the atomic_add_negative
2476 * in page_remove_rmap. 2476 * in page_remove_rmap.
2477 * 2477 *
2478 * Then the TLB flush in ptep_clear_flush ensures that 2478 * Then the TLB flush in ptep_clear_flush ensures that
2479 * no process can access the old page before the 2479 * no process can access the old page before the
2480 * decremented mapcount is visible. And the old page 2480 * decremented mapcount is visible. And the old page
2481 * cannot be reused until after the decremented 2481 * cannot be reused until after the decremented
2482 * mapcount is visible. So transitively, TLBs to 2482 * mapcount is visible. So transitively, TLBs to
2483 * old page will be flushed before it can be reused. 2483 * old page will be flushed before it can be reused.
2484 */ 2484 */
2485 page_remove_rmap(old_page); 2485 page_remove_rmap(old_page);
2486 } 2486 }
2487 2487
2488 /* Free the old page.. */ 2488 /* Free the old page.. */
2489 new_page = old_page; 2489 new_page = old_page;
2490 ret |= VM_FAULT_WRITE; 2490 ret |= VM_FAULT_WRITE;
2491 } else 2491 } else
2492 mem_cgroup_uncharge_page(new_page); 2492 mem_cgroup_uncharge_page(new_page);
2493 2493
2494 if (new_page) 2494 if (new_page)
2495 page_cache_release(new_page); 2495 page_cache_release(new_page);
2496 unlock: 2496 unlock:
2497 pte_unmap_unlock(page_table, ptl); 2497 pte_unmap_unlock(page_table, ptl);
2498 if (old_page) { 2498 if (old_page) {
2499 /* 2499 /*
2500 * Don't let another task, with possibly unlocked vma, 2500 * Don't let another task, with possibly unlocked vma,
2501 * keep the mlocked page. 2501 * keep the mlocked page.
2502 */ 2502 */
2503 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { 2503 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
2504 lock_page(old_page); /* LRU manipulation */ 2504 lock_page(old_page); /* LRU manipulation */
2505 munlock_vma_page(old_page); 2505 munlock_vma_page(old_page);
2506 unlock_page(old_page); 2506 unlock_page(old_page);
2507 } 2507 }
2508 page_cache_release(old_page); 2508 page_cache_release(old_page);
2509 } 2509 }
2510 return ret; 2510 return ret;
2511 oom_free_new: 2511 oom_free_new:
2512 page_cache_release(new_page); 2512 page_cache_release(new_page);
2513 oom: 2513 oom:
2514 if (old_page) { 2514 if (old_page) {
2515 if (page_mkwrite) { 2515 if (page_mkwrite) {
2516 unlock_page(old_page); 2516 unlock_page(old_page);
2517 page_cache_release(old_page); 2517 page_cache_release(old_page);
2518 } 2518 }
2519 page_cache_release(old_page); 2519 page_cache_release(old_page);
2520 } 2520 }
2521 return VM_FAULT_OOM; 2521 return VM_FAULT_OOM;
2522 2522
2523 unwritable_page: 2523 unwritable_page:
2524 page_cache_release(old_page); 2524 page_cache_release(old_page);
2525 return ret; 2525 return ret;
2526 } 2526 }
2527 2527
2528 /* 2528 /*
2529 * Helper functions for unmap_mapping_range(). 2529 * Helper functions for unmap_mapping_range().
2530 * 2530 *
2531 * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ 2531 * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
2532 * 2532 *
2533 * We have to restart searching the prio_tree whenever we drop the lock, 2533 * We have to restart searching the prio_tree whenever we drop the lock,
2534 * since the iterator is only valid while the lock is held, and anyway 2534 * since the iterator is only valid while the lock is held, and anyway
2535 * a later vma might be split and reinserted earlier while lock dropped. 2535 * a later vma might be split and reinserted earlier while lock dropped.
2536 * 2536 *
2537 * The list of nonlinear vmas could be handled more efficiently, using 2537 * The list of nonlinear vmas could be handled more efficiently, using
2538 * a placeholder, but handle it in the same way until a need is shown. 2538 * a placeholder, but handle it in the same way until a need is shown.
2539 * It is important to search the prio_tree before nonlinear list: a vma 2539 * It is important to search the prio_tree before nonlinear list: a vma
2540 * may become nonlinear and be shifted from prio_tree to nonlinear list 2540 * may become nonlinear and be shifted from prio_tree to nonlinear list
2541 * while the lock is dropped; but never shifted from list to prio_tree. 2541 * while the lock is dropped; but never shifted from list to prio_tree.
2542 * 2542 *
2543 * In order to make forward progress despite restarting the search, 2543 * In order to make forward progress despite restarting the search,
2544 * vm_truncate_count is used to mark a vma as now dealt with, so we can 2544 * vm_truncate_count is used to mark a vma as now dealt with, so we can
2545 * quickly skip it next time around. Since the prio_tree search only 2545 * quickly skip it next time around. Since the prio_tree search only
2546 * shows us those vmas affected by unmapping the range in question, we 2546 * shows us those vmas affected by unmapping the range in question, we
2547 * can't efficiently keep all vmas in step with mapping->truncate_count: 2547 * can't efficiently keep all vmas in step with mapping->truncate_count:
2548 * so instead reset them all whenever it wraps back to 0 (then go to 1). 2548 * so instead reset them all whenever it wraps back to 0 (then go to 1).
2549 * mapping->truncate_count and vma->vm_truncate_count are protected by 2549 * mapping->truncate_count and vma->vm_truncate_count are protected by
2550 * i_mmap_lock. 2550 * i_mmap_lock.
2551 * 2551 *
2552 * In order to make forward progress despite repeatedly restarting some 2552 * In order to make forward progress despite repeatedly restarting some
2553 * large vma, note the restart_addr from unmap_vmas when it breaks out: 2553 * large vma, note the restart_addr from unmap_vmas when it breaks out:
2554 * and restart from that address when we reach that vma again. It might 2554 * and restart from that address when we reach that vma again. It might
2555 * have been split or merged, shrunk or extended, but never shifted: so 2555 * have been split or merged, shrunk or extended, but never shifted: so
2556 * restart_addr remains valid so long as it remains in the vma's range. 2556 * restart_addr remains valid so long as it remains in the vma's range.
2557 * unmap_mapping_range forces truncate_count to leap over page-aligned 2557 * unmap_mapping_range forces truncate_count to leap over page-aligned
2558 * values so we can save vma's restart_addr in its truncate_count field. 2558 * values so we can save vma's restart_addr in its truncate_count field.
2559 */ 2559 */
2560 #define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK)) 2560 #define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
2561 2561
2562 static void reset_vma_truncate_counts(struct address_space *mapping) 2562 static void reset_vma_truncate_counts(struct address_space *mapping)
2563 { 2563 {
2564 struct vm_area_struct *vma; 2564 struct vm_area_struct *vma;
2565 struct prio_tree_iter iter; 2565 struct prio_tree_iter iter;
2566 2566
2567 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) 2567 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
2568 vma->vm_truncate_count = 0; 2568 vma->vm_truncate_count = 0;
2569 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 2569 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
2570 vma->vm_truncate_count = 0; 2570 vma->vm_truncate_count = 0;
2571 } 2571 }
2572 2572
2573 static int unmap_mapping_range_vma(struct vm_area_struct *vma, 2573 static int unmap_mapping_range_vma(struct vm_area_struct *vma,
2574 unsigned long start_addr, unsigned long end_addr, 2574 unsigned long start_addr, unsigned long end_addr,
2575 struct zap_details *details) 2575 struct zap_details *details)
2576 { 2576 {
2577 unsigned long restart_addr; 2577 unsigned long restart_addr;
2578 int need_break; 2578 int need_break;
2579 2579
2580 /* 2580 /*
2581 * files that support invalidating or truncating portions of the 2581 * files that support invalidating or truncating portions of the
2582 * file from under mmapped areas must have their ->fault function 2582 * file from under mmapped areas must have their ->fault function
2582 * file from under mmapped areas must have their ->fault function 2582 * file from under mmapped areas must have their ->fault function
2583 * return a locked page (and set VM_FAULT_LOCKED in the return). 2583 * return a locked page (and set VM_FAULT_LOCKED in the return).
2584 * This provides synchronisation against concurrent unmapping here. 2584 * This provides synchronisation against concurrent unmapping here.
2585 */ 2585 */
2586 2586
2587 again: 2587 again:
2588 restart_addr = vma->vm_truncate_count; 2588 restart_addr = vma->vm_truncate_count;
2589 if (is_restart_addr(restart_addr) && start_addr < restart_addr) { 2589 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
2590 start_addr = restart_addr; 2590 start_addr = restart_addr;
2591 if (start_addr >= end_addr) { 2591 if (start_addr >= end_addr) {
2592 /* Top of vma has been split off since last time */ 2592 /* Top of vma has been split off since last time */
2593 vma->vm_truncate_count = details->truncate_count; 2593 vma->vm_truncate_count = details->truncate_count;
2594 return 0; 2594 return 0;
2595 } 2595 }
2596 } 2596 }
2597 2597
2598 restart_addr = zap_page_range(vma, start_addr, 2598 restart_addr = zap_page_range(vma, start_addr,
2599 end_addr - start_addr, details); 2599 end_addr - start_addr, details);
2600 need_break = need_resched() || spin_needbreak(details->i_mmap_lock); 2600 need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
2601 2601
2602 if (restart_addr >= end_addr) { 2602 if (restart_addr >= end_addr) {
2603 /* We have now completed this vma: mark it so */ 2603 /* We have now completed this vma: mark it so */
2604 vma->vm_truncate_count = details->truncate_count; 2604 vma->vm_truncate_count = details->truncate_count;
2605 if (!need_break) 2605 if (!need_break)
2606 return 0; 2606 return 0;
2607 } else { 2607 } else {
2608 /* Note restart_addr in vma's truncate_count field */ 2608 /* Note restart_addr in vma's truncate_count field */
2609 vma->vm_truncate_count = restart_addr; 2609 vma->vm_truncate_count = restart_addr;
2610 if (!need_break) 2610 if (!need_break)
2611 goto again; 2611 goto again;
2612 } 2612 }
2613 2613
2614 spin_unlock(details->i_mmap_lock); 2614 spin_unlock(details->i_mmap_lock);
2615 cond_resched(); 2615 cond_resched();
2616 spin_lock(details->i_mmap_lock); 2616 spin_lock(details->i_mmap_lock);
2617 return -EINTR; 2617 return -EINTR;
2618 } 2618 }
2619 2619
2620 static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 2620 static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2621 struct zap_details *details) 2621 struct zap_details *details)
2622 { 2622 {
2623 struct vm_area_struct *vma; 2623 struct vm_area_struct *vma;
2624 struct prio_tree_iter iter; 2624 struct prio_tree_iter iter;
2625 pgoff_t vba, vea, zba, zea; 2625 pgoff_t vba, vea, zba, zea;
2626 2626
2627 restart: 2627 restart:
2628 vma_prio_tree_foreach(vma, &iter, root, 2628 vma_prio_tree_foreach(vma, &iter, root,
2629 details->first_index, details->last_index) { 2629 details->first_index, details->last_index) {
2630 /* Skip quickly over those we have already dealt with */ 2630 /* Skip quickly over those we have already dealt with */
2631 if (vma->vm_truncate_count == details->truncate_count) 2631 if (vma->vm_truncate_count == details->truncate_count)
2632 continue; 2632 continue;
2633 2633
2634 vba = vma->vm_pgoff; 2634 vba = vma->vm_pgoff;
2635 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; 2635 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
2636 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ 2636 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
2637 zba = details->first_index; 2637 zba = details->first_index;
2638 if (zba < vba) 2638 if (zba < vba)
2639 zba = vba; 2639 zba = vba;
2640 zea = details->last_index; 2640 zea = details->last_index;
2641 if (zea > vea) 2641 if (zea > vea)
2642 zea = vea; 2642 zea = vea;
2643 2643
2644 if (unmap_mapping_range_vma(vma, 2644 if (unmap_mapping_range_vma(vma,
2645 ((zba - vba) << PAGE_SHIFT) + vma->vm_start, 2645 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2646 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, 2646 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2647 details) < 0) 2647 details) < 0)
2648 goto restart; 2648 goto restart;
2649 } 2649 }
2650 } 2650 }
2651 2651
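unmap_mapping_range_tree() clamps the hole's file-page range [first_index, last_index] to each vma's own page range [vba, vea] and then converts file page offsets back into virtual addresses. The arithmetic is easy to check in isolation; the sketch below is a hypothetical standalone reimplementation with a cut-down vma struct, 4 KiB pages assumed:

#include <stdio.h>

#define PAGE_SHIFT 12

struct vma_sketch {			/* just the fields the calculation needs */
	unsigned long vm_start, vm_end;	/* virtual range, page aligned */
	unsigned long vm_pgoff;		/* file offset of vm_start, in pages */
};

/* Clamp the hole [first, last] (file pages, inclusive) to one vma and
 * return the virtual [*start, *end) range to zap; 0 if nothing overlaps. */
static int zap_range(const struct vma_sketch *vma,
		     unsigned long first, unsigned long last,
		     unsigned long *start, unsigned long *end)
{
	unsigned long vba = vma->vm_pgoff;
	unsigned long vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
	unsigned long zba = first < vba ? vba : first;
	unsigned long zea = last > vea ? vea : last;

	if (zba > zea)
		return 0;
	*start = ((zba - vba) << PAGE_SHIFT) + vma->vm_start;
	*end   = ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start;
	return 1;
}

int main(void)
{
	/* vma maps file pages 10..19 at 0x700000000000 */
	struct vma_sketch vma = { 0x700000000000UL, 0x70000000a000UL, 10 };
	unsigned long s, e;

	if (zap_range(&vma, 12, ~0UL, &s, &e))	/* hole from file page 12 onward */
		printf("zap [%#lx, %#lx)\n", s, e);
	return 0;
}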
2652 static inline void unmap_mapping_range_list(struct list_head *head, 2652 static inline void unmap_mapping_range_list(struct list_head *head,
2653 struct zap_details *details) 2653 struct zap_details *details)
2654 { 2654 {
2655 struct vm_area_struct *vma; 2655 struct vm_area_struct *vma;
2656 2656
2657 /* 2657 /*
2658 * In nonlinear VMAs there is no correspondence between virtual address 2658 * In nonlinear VMAs there is no correspondence between virtual address
2659 * offset and file offset. So we must perform an exhaustive search 2659 * offset and file offset. So we must perform an exhaustive search
2660 * across *all* the pages in each nonlinear VMA, not just the pages 2660 * across *all* the pages in each nonlinear VMA, not just the pages
2661 * whose virtual address lies outside the file truncation point. 2661 * whose virtual address lies outside the file truncation point.
2662 */ 2662 */
2663 restart: 2663 restart:
2664 list_for_each_entry(vma, head, shared.vm_set.list) { 2664 list_for_each_entry(vma, head, shared.vm_set.list) {
2665 /* Skip quickly over those we have already dealt with */ 2665 /* Skip quickly over those we have already dealt with */
2666 if (vma->vm_truncate_count == details->truncate_count) 2666 if (vma->vm_truncate_count == details->truncate_count)
2667 continue; 2667 continue;
2668 details->nonlinear_vma = vma; 2668 details->nonlinear_vma = vma;
2669 if (unmap_mapping_range_vma(vma, vma->vm_start, 2669 if (unmap_mapping_range_vma(vma, vma->vm_start,
2670 vma->vm_end, details) < 0) 2670 vma->vm_end, details) < 0)
2671 goto restart; 2671 goto restart;
2672 } 2672 }
2673 } 2673 }
2674 2674
2675 /** 2675 /**
2676 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. 2676 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
2677 * @mapping: the address space containing mmaps to be unmapped. 2677 * @mapping: the address space containing mmaps to be unmapped.
2678 * @holebegin: byte in first page to unmap, relative to the start of 2678 * @holebegin: byte in first page to unmap, relative to the start of
2679 * the underlying file. This will be rounded down to a PAGE_SIZE 2679 * the underlying file. This will be rounded down to a PAGE_SIZE
2680 * boundary. Note that this is different from truncate_pagecache(), which 2680 * boundary. Note that this is different from truncate_pagecache(), which
2681 * must keep the partial page. In contrast, we must get rid of 2681 * must keep the partial page. In contrast, we must get rid of
2682 * partial pages. 2682 * partial pages.
2683 * @holelen: size of prospective hole in bytes. This will be rounded 2683 * @holelen: size of prospective hole in bytes. This will be rounded
2684 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the 2684 * up to a PAGE_SIZE boundary. A holelen of zero truncates to the
2685 * end of the file. 2685 * end of the file.
2686 * @even_cows: 1 when truncating a file, unmap even private COWed pages; 2686 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
2687 * but 0 when invalidating pagecache, don't throw away private data. 2687 * but 0 when invalidating pagecache, don't throw away private data.
2688 */ 2688 */
2689 void unmap_mapping_range(struct address_space *mapping, 2689 void unmap_mapping_range(struct address_space *mapping,
2690 loff_t const holebegin, loff_t const holelen, int even_cows) 2690 loff_t const holebegin, loff_t const holelen, int even_cows)
2691 { 2691 {
2692 struct zap_details details; 2692 struct zap_details details;
2693 pgoff_t hba = holebegin >> PAGE_SHIFT; 2693 pgoff_t hba = holebegin >> PAGE_SHIFT;
2694 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 2694 pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2695 2695
2696 /* Check for overflow. */ 2696 /* Check for overflow. */
2697 if (sizeof(holelen) > sizeof(hlen)) { 2697 if (sizeof(holelen) > sizeof(hlen)) {
2698 long long holeend = 2698 long long holeend =
2699 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; 2699 (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
2700 if (holeend & ~(long long)ULONG_MAX) 2700 if (holeend & ~(long long)ULONG_MAX)
2701 hlen = ULONG_MAX - hba + 1; 2701 hlen = ULONG_MAX - hba + 1;
2702 } 2702 }
2703 2703
2704 details.check_mapping = even_cows? NULL: mapping; 2704 details.check_mapping = even_cows? NULL: mapping;
2705 details.nonlinear_vma = NULL; 2705 details.nonlinear_vma = NULL;
2706 details.first_index = hba; 2706 details.first_index = hba;
2707 details.last_index = hba + hlen - 1; 2707 details.last_index = hba + hlen - 1;
2708 if (details.last_index < details.first_index) 2708 if (details.last_index < details.first_index)
2709 details.last_index = ULONG_MAX; 2709 details.last_index = ULONG_MAX;
2710 details.i_mmap_lock = &mapping->i_mmap_lock; 2710 details.i_mmap_lock = &mapping->i_mmap_lock;
2711 2711
2712 mutex_lock(&mapping->unmap_mutex); 2712 mutex_lock(&mapping->unmap_mutex);
2713 spin_lock(&mapping->i_mmap_lock); 2713 spin_lock(&mapping->i_mmap_lock);
2714 2714
2715 /* Protect against endless unmapping loops */ 2715 /* Protect against endless unmapping loops */
2716 mapping->truncate_count++; 2716 mapping->truncate_count++;
2717 if (unlikely(is_restart_addr(mapping->truncate_count))) { 2717 if (unlikely(is_restart_addr(mapping->truncate_count))) {
2718 if (mapping->truncate_count == 0) 2718 if (mapping->truncate_count == 0)
2719 reset_vma_truncate_counts(mapping); 2719 reset_vma_truncate_counts(mapping);
2720 mapping->truncate_count++; 2720 mapping->truncate_count++;
2721 } 2721 }
2722 details.truncate_count = mapping->truncate_count; 2722 details.truncate_count = mapping->truncate_count;
2723 2723
2724 if (unlikely(!prio_tree_empty(&mapping->i_mmap))) 2724 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2725 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2725 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2726 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2726 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2727 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2727 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2728 spin_unlock(&mapping->i_mmap_lock); 2728 spin_unlock(&mapping->i_mmap_lock);
2729 mutex_unlock(&mapping->unmap_mutex); 2729 mutex_unlock(&mapping->unmap_mutex);
2730 } 2730 }
2731 EXPORT_SYMBOL(unmap_mapping_range); 2731 EXPORT_SYMBOL(unmap_mapping_range);
2732 2732
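From userspace, the effect of unmap_mapping_range() is most visible on truncate: pages of an existing mapping that now fall beyond end of file are unmapped, and touching them afterwards raises SIGBUS. A small demonstration program (not part of this patch; the /tmp path, 4 KiB pages and trimmed error handling are assumptions):

#include <fcntl.h>
#include <signal.h>
#include <unistd.h>
#include <sys/mman.h>

static void on_sigbus(int sig)
{
	static const char msg[] = "SIGBUS after truncate\n";

	(void)sig;
	write(STDOUT_FILENO, msg, sizeof(msg) - 1);
	_exit(0);
}

int main(void)
{
	int fd = open("/tmp/trunc-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	volatile char *p;

	ftruncate(fd, 2 * 4096);
	p = mmap(NULL, 2 * 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	p[4096] = 'x';			/* second page is within the file: fine */

	signal(SIGBUS, on_sigbus);
	ftruncate(fd, 4096);		/* unmap_mapping_range() tears down page 1 */
	p[4096] = 'y';			/* now beyond EOF: raises SIGBUS */
	return 1;
}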
2733 int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) 2733 int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2734 { 2734 {
2735 struct address_space *mapping = inode->i_mapping; 2735 struct address_space *mapping = inode->i_mapping;
2736 2736
2737 /* 2737 /*
2738 * If the underlying filesystem is not going to provide 2738 * If the underlying filesystem is not going to provide
2739 * a way to truncate a range of blocks (punch a hole) - 2739 * a way to truncate a range of blocks (punch a hole) -
2740 * we should return failure right now. 2740 * we should return failure right now.
2741 */ 2741 */
2742 if (!inode->i_op->truncate_range) 2742 if (!inode->i_op->truncate_range)
2743 return -ENOSYS; 2743 return -ENOSYS;
2744 2744
2745 mutex_lock(&inode->i_mutex); 2745 mutex_lock(&inode->i_mutex);
2746 down_write(&inode->i_alloc_sem); 2746 down_write(&inode->i_alloc_sem);
2747 unmap_mapping_range(mapping, offset, (end - offset), 1); 2747 unmap_mapping_range(mapping, offset, (end - offset), 1);
2748 truncate_inode_pages_range(mapping, offset, end); 2748 truncate_inode_pages_range(mapping, offset, end);
2749 unmap_mapping_range(mapping, offset, (end - offset), 1); 2749 unmap_mapping_range(mapping, offset, (end - offset), 1);
2750 inode->i_op->truncate_range(inode, offset, end); 2750 inode->i_op->truncate_range(inode, offset, end);
2751 up_write(&inode->i_alloc_sem); 2751 up_write(&inode->i_alloc_sem);
2752 mutex_unlock(&inode->i_mutex); 2752 mutex_unlock(&inode->i_mutex);
2753 2753
2754 return 0; 2754 return 0;
2755 } 2755 }
2756 2756
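vmtruncate_range() is the hole-punching path; in kernels of this vintage it is reached from madvise(MADV_REMOVE), and tmpfs is the main filesystem that supplies ->truncate_range. A hedged userspace sketch, assuming /dev/shm is tmpfs and 4 KiB pages:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	/* Assumption: /dev/shm is tmpfs, which implements ->truncate_range. */
	int fd = open("/dev/shm/hole-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	size_t len = 16 * 4096;
	char *p;

	ftruncate(fd, len);
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	memset(p, 'a', len);				/* populate backing pages */

	/* Punch out file pages 4..7: mappings are torn down, blocks are freed,
	 * and a later read of the hole faults in zeroes. */
	if (madvise(p + 4 * 4096, 4 * 4096, MADV_REMOVE) != 0)
		perror("madvise(MADV_REMOVE)");
	printf("byte in the hole: %d\n", p[5 * 4096]);	/* 0 after the punch */
	return 0;
}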
2757 /* 2757 /*
2758 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2758 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2759 * but allow concurrent faults), and pte mapped but not yet locked. 2759 * but allow concurrent faults), and pte mapped but not yet locked.
2760 * We return with mmap_sem still held, but pte unmapped and unlocked. 2760 * We return with mmap_sem still held, but pte unmapped and unlocked.
2761 */ 2761 */
2762 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 2762 static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2763 unsigned long address, pte_t *page_table, pmd_t *pmd, 2763 unsigned long address, pte_t *page_table, pmd_t *pmd,
2764 unsigned int flags, pte_t orig_pte) 2764 unsigned int flags, pte_t orig_pte)
2765 { 2765 {
2766 spinlock_t *ptl; 2766 spinlock_t *ptl;
2767 struct page *page, *swapcache = NULL; 2767 struct page *page, *swapcache = NULL;
2768 swp_entry_t entry; 2768 swp_entry_t entry;
2769 pte_t pte; 2769 pte_t pte;
2770 int locked; 2770 int locked;
2771 struct mem_cgroup *ptr = NULL; 2771 struct mem_cgroup *ptr = NULL;
2772 int exclusive = 0; 2772 int exclusive = 0;
2773 int ret = 0; 2773 int ret = 0;
2774 2774
2775 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2775 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2776 goto out; 2776 goto out;
2777 2777
2778 entry = pte_to_swp_entry(orig_pte); 2778 entry = pte_to_swp_entry(orig_pte);
2779 if (unlikely(non_swap_entry(entry))) { 2779 if (unlikely(non_swap_entry(entry))) {
2780 if (is_migration_entry(entry)) { 2780 if (is_migration_entry(entry)) {
2781 migration_entry_wait(mm, pmd, address); 2781 migration_entry_wait(mm, pmd, address);
2782 } else if (is_hwpoison_entry(entry)) { 2782 } else if (is_hwpoison_entry(entry)) {
2783 ret = VM_FAULT_HWPOISON; 2783 ret = VM_FAULT_HWPOISON;
2784 } else { 2784 } else {
2785 print_bad_pte(vma, address, orig_pte, NULL); 2785 print_bad_pte(vma, address, orig_pte, NULL);
2786 ret = VM_FAULT_SIGBUS; 2786 ret = VM_FAULT_SIGBUS;
2787 } 2787 }
2788 goto out; 2788 goto out;
2789 } 2789 }
2790 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2790 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2791 page = lookup_swap_cache(entry); 2791 page = lookup_swap_cache(entry);
2792 if (!page) { 2792 if (!page) {
2793 grab_swap_token(mm); /* Contend for token _before_ read-in */ 2793 grab_swap_token(mm); /* Contend for token _before_ read-in */
2794 page = swapin_readahead(entry, 2794 page = swapin_readahead(entry,
2795 GFP_HIGHUSER_MOVABLE, vma, address); 2795 GFP_HIGHUSER_MOVABLE, vma, address);
2796 if (!page) { 2796 if (!page) {
2797 /* 2797 /*
2798 * Back out if somebody else faulted in this pte 2798 * Back out if somebody else faulted in this pte
2799 * while we released the pte lock. 2799 * while we released the pte lock.
2800 */ 2800 */
2801 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2801 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2802 if (likely(pte_same(*page_table, orig_pte))) 2802 if (likely(pte_same(*page_table, orig_pte)))
2803 ret = VM_FAULT_OOM; 2803 ret = VM_FAULT_OOM;
2804 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2804 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2805 goto unlock; 2805 goto unlock;
2806 } 2806 }
2807 2807
2808 /* Had to read the page from swap area: Major fault */ 2808 /* Had to read the page from swap area: Major fault */
2809 ret = VM_FAULT_MAJOR; 2809 ret = VM_FAULT_MAJOR;
2810 count_vm_event(PGMAJFAULT); 2810 count_vm_event(PGMAJFAULT);
2811 } else if (PageHWPoison(page)) { 2811 } else if (PageHWPoison(page)) {
2812 /* 2812 /*
2813 * hwpoisoned dirty swapcache pages are kept for killing 2813 * hwpoisoned dirty swapcache pages are kept for killing
2814 * owner processes (which may be unknown at hwpoison time) 2814 * owner processes (which may be unknown at hwpoison time)
2815 */ 2815 */
2816 ret = VM_FAULT_HWPOISON; 2816 ret = VM_FAULT_HWPOISON;
2817 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2817 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2818 goto out_release; 2818 goto out_release;
2819 } 2819 }
2820 2820
2821 locked = lock_page_or_retry(page, mm, flags); 2821 locked = lock_page_or_retry(page, mm, flags);
2822 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2822 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2823 if (!locked) { 2823 if (!locked) {
2824 ret |= VM_FAULT_RETRY; 2824 ret |= VM_FAULT_RETRY;
2825 goto out_release; 2825 goto out_release;
2826 } 2826 }
2827 2827
2828 /* 2828 /*
2829 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not 2829 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
2830 * release the swapcache from under us. The page pin, and pte_same 2830 * release the swapcache from under us. The page pin, and pte_same
2831 * test below, are not enough to exclude that. Even if it is still 2831 * test below, are not enough to exclude that. Even if it is still
2832 * swapcache, we need to check that the page's swap has not changed. 2832 * swapcache, we need to check that the page's swap has not changed.
2833 */ 2833 */
2834 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 2834 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2835 goto out_page; 2835 goto out_page;
2836 2836
2837 if (ksm_might_need_to_copy(page, vma, address)) { 2837 if (ksm_might_need_to_copy(page, vma, address)) {
2838 swapcache = page; 2838 swapcache = page;
2839 page = ksm_does_need_to_copy(page, vma, address); 2839 page = ksm_does_need_to_copy(page, vma, address);
2840 2840
2841 if (unlikely(!page)) { 2841 if (unlikely(!page)) {
2842 ret = VM_FAULT_OOM; 2842 ret = VM_FAULT_OOM;
2843 page = swapcache; 2843 page = swapcache;
2844 swapcache = NULL; 2844 swapcache = NULL;
2845 goto out_page; 2845 goto out_page;
2846 } 2846 }
2847 } 2847 }
2848 2848
2849 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { 2849 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
2850 ret = VM_FAULT_OOM; 2850 ret = VM_FAULT_OOM;
2851 goto out_page; 2851 goto out_page;
2852 } 2852 }
2853 2853
2854 /* 2854 /*
2855 * Back out if somebody else already faulted in this pte. 2855 * Back out if somebody else already faulted in this pte.
2856 */ 2856 */
2857 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2857 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2858 if (unlikely(!pte_same(*page_table, orig_pte))) 2858 if (unlikely(!pte_same(*page_table, orig_pte)))
2859 goto out_nomap; 2859 goto out_nomap;
2860 2860
2861 if (unlikely(!PageUptodate(page))) { 2861 if (unlikely(!PageUptodate(page))) {
2862 ret = VM_FAULT_SIGBUS; 2862 ret = VM_FAULT_SIGBUS;
2863 goto out_nomap; 2863 goto out_nomap;
2864 } 2864 }
2865 2865
2866 /* 2866 /*
2867 * The page isn't present yet, go ahead with the fault. 2867 * The page isn't present yet, go ahead with the fault.
2868 * 2868 *
2869 * Be careful about the sequence of operations here. 2869 * Be careful about the sequence of operations here.
2870 * To get its accounting right, reuse_swap_page() must be called 2870 * To get its accounting right, reuse_swap_page() must be called
2871 * while the page is counted on swap but not yet in mapcount i.e. 2871 * while the page is counted on swap but not yet in mapcount i.e.
2872 * before page_add_anon_rmap() and swap_free(); try_to_free_swap() 2872 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
2873 * must be called after the swap_free(), or it will never succeed. 2873 * must be called after the swap_free(), or it will never succeed.
2874 * Because delete_from_swap_page() may be called by reuse_swap_page(), 2874 * Because delete_from_swap_page() may be called by reuse_swap_page(),
2875 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry 2875 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
2876 * in page->private. In this case, a record in swap_cgroup is silently 2876 * in page->private. In this case, a record in swap_cgroup is silently
2877 * discarded at swap_free(). 2877 * discarded at swap_free().
2878 */ 2878 */
2879 2879
2880 inc_mm_counter_fast(mm, MM_ANONPAGES); 2880 inc_mm_counter_fast(mm, MM_ANONPAGES);
2881 dec_mm_counter_fast(mm, MM_SWAPENTS); 2881 dec_mm_counter_fast(mm, MM_SWAPENTS);
2882 pte = mk_pte(page, vma->vm_page_prot); 2882 pte = mk_pte(page, vma->vm_page_prot);
2883 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { 2883 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2884 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2884 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2885 flags &= ~FAULT_FLAG_WRITE; 2885 flags &= ~FAULT_FLAG_WRITE;
2886 ret |= VM_FAULT_WRITE; 2886 ret |= VM_FAULT_WRITE;
2887 exclusive = 1; 2887 exclusive = 1;
2888 } 2888 }
2889 flush_icache_page(vma, page); 2889 flush_icache_page(vma, page);
2890 set_pte_at(mm, address, page_table, pte); 2890 set_pte_at(mm, address, page_table, pte);
2891 do_page_add_anon_rmap(page, vma, address, exclusive); 2891 do_page_add_anon_rmap(page, vma, address, exclusive);
2892 /* It's better to call commit-charge after rmap is established */ 2892 /* It's better to call commit-charge after rmap is established */
2893 mem_cgroup_commit_charge_swapin(page, ptr); 2893 mem_cgroup_commit_charge_swapin(page, ptr);
2894 2894
2895 swap_free(entry); 2895 swap_free(entry);
2896 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 2896 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2897 try_to_free_swap(page); 2897 try_to_free_swap(page);
2898 unlock_page(page); 2898 unlock_page(page);
2899 if (swapcache) { 2899 if (swapcache) {
2900 /* 2900 /*
2901 * Hold the lock to avoid the swap entry being reused 2901 * Hold the lock to avoid the swap entry being reused
2902 * until we take the PT lock for the pte_same() check 2902 * until we take the PT lock for the pte_same() check
2903 * (to avoid false positives from pte_same). For 2903 * (to avoid false positives from pte_same). For
2904 * further safety release the lock after the swap_free 2904 * further safety release the lock after the swap_free
2905 * so that the swap count won't change under a 2905 * so that the swap count won't change under a
2906 * parallel locked swapcache. 2906 * parallel locked swapcache.
2907 */ 2907 */
2908 unlock_page(swapcache); 2908 unlock_page(swapcache);
2909 page_cache_release(swapcache); 2909 page_cache_release(swapcache);
2910 } 2910 }
2911 2911
2912 if (flags & FAULT_FLAG_WRITE) { 2912 if (flags & FAULT_FLAG_WRITE) {
2913 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); 2913 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
2914 if (ret & VM_FAULT_ERROR) 2914 if (ret & VM_FAULT_ERROR)
2915 ret &= VM_FAULT_ERROR; 2915 ret &= VM_FAULT_ERROR;
2916 goto out; 2916 goto out;
2917 } 2917 }
2918 2918
2919 /* No need to invalidate - it was non-present before */ 2919 /* No need to invalidate - it was non-present before */
2920 update_mmu_cache(vma, address, page_table); 2920 update_mmu_cache(vma, address, page_table);
2921 unlock: 2921 unlock:
2922 pte_unmap_unlock(page_table, ptl); 2922 pte_unmap_unlock(page_table, ptl);
2923 out: 2923 out:
2924 return ret; 2924 return ret;
2925 out_nomap: 2925 out_nomap:
2926 mem_cgroup_cancel_charge_swapin(ptr); 2926 mem_cgroup_cancel_charge_swapin(ptr);
2927 pte_unmap_unlock(page_table, ptl); 2927 pte_unmap_unlock(page_table, ptl);
2928 out_page: 2928 out_page:
2929 unlock_page(page); 2929 unlock_page(page);
2930 out_release: 2930 out_release:
2931 page_cache_release(page); 2931 page_cache_release(page);
2932 if (swapcache) { 2932 if (swapcache) {
2933 unlock_page(swapcache); 2933 unlock_page(swapcache);
2934 page_cache_release(swapcache); 2934 page_cache_release(swapcache);
2935 } 2935 }
2936 return ret; 2936 return ret;
2937 } 2937 }
2938 2938
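do_swap_page() reports VM_FAULT_MAJOR (and bumps PGMAJFAULT) only when the page actually had to be read back from the swap device; faults satisfied from memory stay minor. From userspace the split shows up in getrusage() as ru_majflt versus ru_minflt. The example below only demonstrates how to read those counters while provoking minor faults; forcing a real swap-in is left out because it cannot be done deterministically here:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>

static void show(const char *tag)
{
	struct rusage ru;

	getrusage(RUSAGE_SELF, &ru);
	printf("%-12s minflt=%ld majflt=%ld\n", tag, ru.ru_minflt, ru.ru_majflt);
}

int main(void)
{
	size_t len = 64 * 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	show("before");
	memset(p, 1, len);	/* one minor fault per page (do_anonymous_page) */
	show("after touch");
	/* A page that had to be read back from the swap device would instead
	 * bump majflt, the counter fed by VM_FAULT_MAJOR / PGMAJFAULT above. */
	return 0;
}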
2939 /* 2939 /*
2940 * This is like a special single-page "expand_{down|up}wards()", 2940 * This is like a special single-page "expand_{down|up}wards()",
2941 * except we must first make sure that 'address{-|+}PAGE_SIZE' 2941 * except we must first make sure that 'address{-|+}PAGE_SIZE'
2942 * doesn't hit another vma. 2942 * doesn't hit another vma.
2943 */ 2943 */
2944 static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) 2944 static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
2945 { 2945 {
2946 address &= PAGE_MASK; 2946 address &= PAGE_MASK;
2947 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { 2947 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
2948 struct vm_area_struct *prev = vma->vm_prev; 2948 struct vm_area_struct *prev = vma->vm_prev;
2949 2949
2950 /* 2950 /*
2951 * Is there a mapping abutting this one below? 2951 * Is there a mapping abutting this one below?
2952 * 2952 *
2953 * That's only ok if it's the same stack mapping 2953 * That's only ok if it's the same stack mapping
2954 * that has gotten split.. 2954 * that has gotten split..
2955 */ 2955 */
2956 if (prev && prev->vm_end == address) 2956 if (prev && prev->vm_end == address)
2957 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; 2957 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
2958 2958
2959 expand_stack(vma, address - PAGE_SIZE); 2959 expand_stack(vma, address - PAGE_SIZE);
2960 } 2960 }
2961 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { 2961 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
2962 struct vm_area_struct *next = vma->vm_next; 2962 struct vm_area_struct *next = vma->vm_next;
2963 2963
2964 /* As VM_GROWSDOWN but s/below/above/ */ 2964 /* As VM_GROWSDOWN but s/below/above/ */
2965 if (next && next->vm_start == address + PAGE_SIZE) 2965 if (next && next->vm_start == address + PAGE_SIZE)
2966 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; 2966 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
2967 2967
2968 expand_upwards(vma, address + PAGE_SIZE); 2968 expand_upwards(vma, address + PAGE_SIZE);
2969 } 2969 }
2970 return 0; 2970 return 0;
2971 } 2971 }
2972 2972
2973 /* 2973 /*
2974 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2974 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2975 * but allow concurrent faults), and pte mapped but not yet locked. 2975 * but allow concurrent faults), and pte mapped but not yet locked.
2976 * We return with mmap_sem still held, but pte unmapped and unlocked. 2976 * We return with mmap_sem still held, but pte unmapped and unlocked.
2977 */ 2977 */
2978 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 2978 static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2979 unsigned long address, pte_t *page_table, pmd_t *pmd, 2979 unsigned long address, pte_t *page_table, pmd_t *pmd,
2980 unsigned int flags) 2980 unsigned int flags)
2981 { 2981 {
2982 struct page *page; 2982 struct page *page;
2983 spinlock_t *ptl; 2983 spinlock_t *ptl;
2984 pte_t entry; 2984 pte_t entry;
2985 2985
2986 pte_unmap(page_table); 2986 pte_unmap(page_table);
2987 2987
2988 /* Check if we need to add a guard page to the stack */ 2988 /* Check if we need to add a guard page to the stack */
2989 if (check_stack_guard_page(vma, address) < 0) 2989 if (check_stack_guard_page(vma, address) < 0)
2990 return VM_FAULT_SIGBUS; 2990 return VM_FAULT_SIGBUS;
2991 2991
2992 /* Use the zero-page for reads */ 2992 /* Use the zero-page for reads */
2993 if (!(flags & FAULT_FLAG_WRITE)) { 2993 if (!(flags & FAULT_FLAG_WRITE)) {
2994 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), 2994 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2995 vma->vm_page_prot)); 2995 vma->vm_page_prot));
2996 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2996 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2997 if (!pte_none(*page_table)) 2997 if (!pte_none(*page_table))
2998 goto unlock; 2998 goto unlock;
2999 goto setpte; 2999 goto setpte;
3000 } 3000 }
3001 3001
3002 /* Allocate our own private page. */ 3002 /* Allocate our own private page. */
3003 if (unlikely(anon_vma_prepare(vma))) 3003 if (unlikely(anon_vma_prepare(vma)))
3004 goto oom; 3004 goto oom;
3005 page = alloc_zeroed_user_highpage_movable(vma, address); 3005 page = alloc_zeroed_user_highpage_movable(vma, address);
3006 if (!page) 3006 if (!page)
3007 goto oom; 3007 goto oom;
3008 __SetPageUptodate(page); 3008 __SetPageUptodate(page);
3009 3009
3010 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) 3010 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
3011 goto oom_free_page; 3011 goto oom_free_page;
3012 3012
3013 entry = mk_pte(page, vma->vm_page_prot); 3013 entry = mk_pte(page, vma->vm_page_prot);
3014 if (vma->vm_flags & VM_WRITE) 3014 if (vma->vm_flags & VM_WRITE)
3015 entry = pte_mkwrite(pte_mkdirty(entry)); 3015 entry = pte_mkwrite(pte_mkdirty(entry));
3016 3016
3017 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 3017 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3018 if (!pte_none(*page_table)) 3018 if (!pte_none(*page_table))
3019 goto release; 3019 goto release;
3020 3020
3021 inc_mm_counter_fast(mm, MM_ANONPAGES); 3021 inc_mm_counter_fast(mm, MM_ANONPAGES);
3022 page_add_new_anon_rmap(page, vma, address); 3022 page_add_new_anon_rmap(page, vma, address);
3023 setpte: 3023 setpte:
3024 set_pte_at(mm, address, page_table, entry); 3024 set_pte_at(mm, address, page_table, entry);
3025 3025
3026 /* No need to invalidate - it was non-present before */ 3026 /* No need to invalidate - it was non-present before */
3027 update_mmu_cache(vma, address, page_table); 3027 update_mmu_cache(vma, address, page_table);
3028 unlock: 3028 unlock:
3029 pte_unmap_unlock(page_table, ptl); 3029 pte_unmap_unlock(page_table, ptl);
3030 return 0; 3030 return 0;
3031 release: 3031 release:
3032 mem_cgroup_uncharge_page(page); 3032 mem_cgroup_uncharge_page(page);
3033 page_cache_release(page); 3033 page_cache_release(page);
3034 goto unlock; 3034 goto unlock;
3035 oom_free_page: 3035 oom_free_page:
3036 page_cache_release(page); 3036 page_cache_release(page);
3037 oom: 3037 oom:
3038 return VM_FAULT_OOM; 3038 return VM_FAULT_OOM;
3039 } 3039 }
3040 3040
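do_anonymous_page() maps the shared zero page for read faults (the !FAULT_FLAG_WRITE branch above, which also skips the MM_ANONPAGES counter) and only allocates a private zeroed page on write. That is observable from userspace: reading a fresh anonymous mapping barely moves RSS, writing it does. A sketch that reads the resident-page count from /proc/self/statm (the procfs path and 4 KiB page size are assumptions):

#include <stdio.h>
#include <sys/mman.h>

/* Resident pages of the current process, from /proc/self/statm (field 2). */
static long resident_pages(void)
{
	long size = 0, rss = 0;
	FILE *f = fopen("/proc/self/statm", "r");

	if (f) {
		fscanf(f, "%ld %ld", &size, &rss);
		fclose(f);
	}
	return rss;
}

int main(void)
{
	size_t len = 1024 * 4096;		/* 4 MiB */
	volatile char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	long sum = 0;
	size_t i;

	printf("rss before reads:  %ld pages\n", resident_pages());
	for (i = 0; i < len; i += 4096)
		sum += p[i];			/* read faults map the shared zero page */
	printf("rss after reads:   %ld pages (barely changes), sum=%ld\n",
	       resident_pages(), sum);
	for (i = 0; i < len; i += 4096)
		p[i] = 1;			/* write faults allocate private pages */
	printf("rss after writes:  %ld pages (grows by ~1024)\n", resident_pages());
	return 0;
}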
3041 /* 3041 /*
3042 * __do_fault() tries to create a new page mapping. It aggressively 3042 * __do_fault() tries to create a new page mapping. It aggressively
3043 * tries to share with existing pages, but makes a separate copy if 3043 * tries to share with existing pages, but makes a separate copy if
3044 * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid 3044 * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
3045 * the next page fault. 3045 * the next page fault.
3046 * 3046 *
3047 * As this is called only for pages that do not currently exist, we 3047 * As this is called only for pages that do not currently exist, we
3048 * do not need to flush old virtual caches or the TLB. 3048 * do not need to flush old virtual caches or the TLB.
3049 * 3049 *
3050 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3050 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3051 * but allow concurrent faults), and pte neither mapped nor locked. 3051 * but allow concurrent faults), and pte neither mapped nor locked.
3052 * We return with mmap_sem still held, but pte unmapped and unlocked. 3052 * We return with mmap_sem still held, but pte unmapped and unlocked.
3053 */ 3053 */
3054 static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3054 static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3055 unsigned long address, pmd_t *pmd, 3055 unsigned long address, pmd_t *pmd,
3056 pgoff_t pgoff, unsigned int flags, pte_t orig_pte) 3056 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3057 { 3057 {
3058 pte_t *page_table; 3058 pte_t *page_table;
3059 spinlock_t *ptl; 3059 spinlock_t *ptl;
3060 struct page *page; 3060 struct page *page;
3061 pte_t entry; 3061 pte_t entry;
3062 int anon = 0; 3062 int anon = 0;
3063 int charged = 0; 3063 int charged = 0;
3064 struct page *dirty_page = NULL; 3064 struct page *dirty_page = NULL;
3065 struct vm_fault vmf; 3065 struct vm_fault vmf;
3066 int ret; 3066 int ret;
3067 int page_mkwrite = 0; 3067 int page_mkwrite = 0;
3068 3068
3069 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 3069 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3070 vmf.pgoff = pgoff; 3070 vmf.pgoff = pgoff;
3071 vmf.flags = flags; 3071 vmf.flags = flags;
3072 vmf.page = NULL; 3072 vmf.page = NULL;
3073 3073
3074 ret = vma->vm_ops->fault(vma, &vmf); 3074 ret = vma->vm_ops->fault(vma, &vmf);
3075 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | 3075 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3076 VM_FAULT_RETRY))) 3076 VM_FAULT_RETRY)))
3077 return ret; 3077 return ret;
3078 3078
3079 if (unlikely(PageHWPoison(vmf.page))) { 3079 if (unlikely(PageHWPoison(vmf.page))) {
3080 if (ret & VM_FAULT_LOCKED) 3080 if (ret & VM_FAULT_LOCKED)
3081 unlock_page(vmf.page); 3081 unlock_page(vmf.page);
3082 return VM_FAULT_HWPOISON; 3082 return VM_FAULT_HWPOISON;
3083 } 3083 }
3084 3084
3085 /* 3085 /*
3086 * For consistency in subsequent calls, make the faulted page always 3086 * For consistency in subsequent calls, make the faulted page always
3087 * locked. 3087 * locked.
3088 */ 3088 */
3089 if (unlikely(!(ret & VM_FAULT_LOCKED))) 3089 if (unlikely(!(ret & VM_FAULT_LOCKED)))
3090 lock_page(vmf.page); 3090 lock_page(vmf.page);
3091 else 3091 else
3092 VM_BUG_ON(!PageLocked(vmf.page)); 3092 VM_BUG_ON(!PageLocked(vmf.page));
3093 3093
3094 /* 3094 /*
3095 * Should we do an early C-O-W break? 3095 * Should we do an early C-O-W break?
3096 */ 3096 */
3097 page = vmf.page; 3097 page = vmf.page;
3098 if (flags & FAULT_FLAG_WRITE) { 3098 if (flags & FAULT_FLAG_WRITE) {
3099 if (!(vma->vm_flags & VM_SHARED)) { 3099 if (!(vma->vm_flags & VM_SHARED)) {
3100 anon = 1; 3100 anon = 1;
3101 if (unlikely(anon_vma_prepare(vma))) { 3101 if (unlikely(anon_vma_prepare(vma))) {
3102 ret = VM_FAULT_OOM; 3102 ret = VM_FAULT_OOM;
3103 goto out; 3103 goto out;
3104 } 3104 }
3105 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, 3105 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
3106 vma, address); 3106 vma, address);
3107 if (!page) { 3107 if (!page) {
3108 ret = VM_FAULT_OOM; 3108 ret = VM_FAULT_OOM;
3109 goto out; 3109 goto out;
3110 } 3110 }
3111 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { 3111 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
3112 ret = VM_FAULT_OOM; 3112 ret = VM_FAULT_OOM;
3113 page_cache_release(page); 3113 page_cache_release(page);
3114 goto out; 3114 goto out;
3115 } 3115 }
3116 charged = 1; 3116 charged = 1;
3117 copy_user_highpage(page, vmf.page, address, vma); 3117 copy_user_highpage(page, vmf.page, address, vma);
3118 __SetPageUptodate(page); 3118 __SetPageUptodate(page);
3119 } else { 3119 } else {
3120 /* 3120 /*
3121 * If the page will be shareable, see if the backing 3121 * If the page will be shareable, see if the backing
3122 * address space wants to know that the page is about 3122 * address space wants to know that the page is about
3123 * to become writable 3123 * to become writable
3124 */ 3124 */
3125 if (vma->vm_ops->page_mkwrite) { 3125 if (vma->vm_ops->page_mkwrite) {
3126 int tmp; 3126 int tmp;
3127 3127
3128 unlock_page(page); 3128 unlock_page(page);
3129 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; 3129 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
3130 tmp = vma->vm_ops->page_mkwrite(vma, &vmf); 3130 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
3131 if (unlikely(tmp & 3131 if (unlikely(tmp &
3132 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { 3132 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
3133 ret = tmp; 3133 ret = tmp;
3134 goto unwritable_page; 3134 goto unwritable_page;
3135 } 3135 }
3136 if (unlikely(!(tmp & VM_FAULT_LOCKED))) { 3136 if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
3137 lock_page(page); 3137 lock_page(page);
3138 if (!page->mapping) { 3138 if (!page->mapping) {
3139 ret = 0; /* retry the fault */ 3139 ret = 0; /* retry the fault */
3140 unlock_page(page); 3140 unlock_page(page);
3141 goto unwritable_page; 3141 goto unwritable_page;
3142 } 3142 }
3143 } else 3143 } else
3144 VM_BUG_ON(!PageLocked(page)); 3144 VM_BUG_ON(!PageLocked(page));
3145 page_mkwrite = 1; 3145 page_mkwrite = 1;
3146 } 3146 }
3147 } 3147 }
3148 3148
3149 } 3149 }
3150 3150
3151 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 3151 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
3152 3152
3153 /* 3153 /*
3154 * This silly early PAGE_DIRTY setting removes a race 3154 * This silly early PAGE_DIRTY setting removes a race
3155 * due to the bad i386 page protection. But it's valid 3155 * due to the bad i386 page protection. But it's valid
3156 * for other architectures too. 3156 * for other architectures too.
3157 * 3157 *
3158 * Note that if FAULT_FLAG_WRITE is set, we either now have 3158 * Note that if FAULT_FLAG_WRITE is set, we either now have
3159 * an exclusive copy of the page, or this is a shared mapping, 3159 * an exclusive copy of the page, or this is a shared mapping,
3160 * so we can make it writable and dirty to avoid having to 3160 * so we can make it writable and dirty to avoid having to
3161 * handle that later. 3161 * handle that later.
3162 */ 3162 */
3163 /* Only go through if we didn't race with anybody else... */ 3163 /* Only go through if we didn't race with anybody else... */
3164 if (likely(pte_same(*page_table, orig_pte))) { 3164 if (likely(pte_same(*page_table, orig_pte))) {
3165 flush_icache_page(vma, page); 3165 flush_icache_page(vma, page);
3166 entry = mk_pte(page, vma->vm_page_prot); 3166 entry = mk_pte(page, vma->vm_page_prot);
3167 if (flags & FAULT_FLAG_WRITE) 3167 if (flags & FAULT_FLAG_WRITE)
3168 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 3168 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3169 if (anon) { 3169 if (anon) {
3170 inc_mm_counter_fast(mm, MM_ANONPAGES); 3170 inc_mm_counter_fast(mm, MM_ANONPAGES);
3171 page_add_new_anon_rmap(page, vma, address); 3171 page_add_new_anon_rmap(page, vma, address);
3172 } else { 3172 } else {
3173 inc_mm_counter_fast(mm, MM_FILEPAGES); 3173 inc_mm_counter_fast(mm, MM_FILEPAGES);
3174 page_add_file_rmap(page); 3174 page_add_file_rmap(page);
3175 if (flags & FAULT_FLAG_WRITE) { 3175 if (flags & FAULT_FLAG_WRITE) {
3176 dirty_page = page; 3176 dirty_page = page;
3177 get_page(dirty_page); 3177 get_page(dirty_page);
3178 } 3178 }
3179 } 3179 }
3180 set_pte_at(mm, address, page_table, entry); 3180 set_pte_at(mm, address, page_table, entry);
3181 3181
3182 /* no need to invalidate: a not-present page won't be cached */ 3182 /* no need to invalidate: a not-present page won't be cached */
3183 update_mmu_cache(vma, address, page_table); 3183 update_mmu_cache(vma, address, page_table);
3184 } else { 3184 } else {
3185 if (charged) 3185 if (charged)
3186 mem_cgroup_uncharge_page(page); 3186 mem_cgroup_uncharge_page(page);
3187 if (anon) 3187 if (anon)
3188 page_cache_release(page); 3188 page_cache_release(page);
3189 else 3189 else
3190 anon = 1; /* no anon but release faulted_page */ 3190 anon = 1; /* no anon but release faulted_page */
3191 } 3191 }
3192 3192
3193 pte_unmap_unlock(page_table, ptl); 3193 pte_unmap_unlock(page_table, ptl);
3194 3194
3195 out: 3195 out:
3196 if (dirty_page) { 3196 if (dirty_page) {
3197 struct address_space *mapping = page->mapping; 3197 struct address_space *mapping = page->mapping;
3198 3198
3199 if (set_page_dirty(dirty_page)) 3199 if (set_page_dirty(dirty_page))
3200 page_mkwrite = 1; 3200 page_mkwrite = 1;
3201 unlock_page(dirty_page); 3201 unlock_page(dirty_page);
3202 put_page(dirty_page); 3202 put_page(dirty_page);
3203 if (page_mkwrite && mapping) { 3203 if (page_mkwrite && mapping) {
3204 /* 3204 /*
3205 * Some device drivers do not set page.mapping but still 3205 * Some device drivers do not set page.mapping but still
3206 * dirty their pages 3206 * dirty their pages
3207 */ 3207 */
3208 balance_dirty_pages_ratelimited(mapping); 3208 balance_dirty_pages_ratelimited(mapping);
3209 } 3209 }
3210 3210
3211 /* file_update_time outside page_lock */ 3211 /* file_update_time outside page_lock */
3212 if (vma->vm_file) 3212 if (vma->vm_file)
3213 file_update_time(vma->vm_file); 3213 file_update_time(vma->vm_file);
3214 } else { 3214 } else {
3215 unlock_page(vmf.page); 3215 unlock_page(vmf.page);
3216 if (anon) 3216 if (anon)
3217 page_cache_release(vmf.page); 3217 page_cache_release(vmf.page);
3218 } 3218 }
3219 3219
3220 return ret; 3220 return ret;
3221 3221
3222 unwritable_page: 3222 unwritable_page:
3223 page_cache_release(page); 3223 page_cache_release(page);
3224 return ret; 3224 return ret;
3225 } 3225 }
3226 3226
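The FAULT_FLAG_WRITE handling in __do_fault() splits into the early-COW branch for private mappings (anon = 1, copy_user_highpage) and the page_mkwrite/dirty-page branch for shared ones. The userspace consequence is the familiar MAP_PRIVATE versus MAP_SHARED behaviour, shown by the small program below (hypothetical /tmp file, error handling trimmed):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	int fd = open("/tmp/cow-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	char buf;
	char *priv, *shared;

	ftruncate(fd, 4096);

	priv   = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	shared = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,  fd, 0);

	priv[0] = 'P';		/* write fault: __do_fault copies to a private anon page */
	shared[0] = 'S';	/* write fault: page_mkwrite/dirty path, reaches the file */

	msync(shared, 4096, MS_SYNC);
	pread(fd, &buf, 1, 0);
	printf("first byte of file: '%c' (the MAP_PRIVATE write never reached it)\n", buf);
	return 0;
}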
3227 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3227 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3228 unsigned long address, pte_t *page_table, pmd_t *pmd, 3228 unsigned long address, pte_t *page_table, pmd_t *pmd,
3229 unsigned int flags, pte_t orig_pte) 3229 unsigned int flags, pte_t orig_pte)
3230 { 3230 {
3231 pgoff_t pgoff = (((address & PAGE_MASK) 3231 pgoff_t pgoff = (((address & PAGE_MASK)
3232 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 3232 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3233 3233
3234 pte_unmap(page_table); 3234 pte_unmap(page_table);
3235 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3235 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3236 } 3236 }
3237 3237
3238 /* 3238 /*
3239 * Fault of a previously existing named mapping. Repopulate the pte 3239 * Fault of a previously existing named mapping. Repopulate the pte
3240 * from the encoded file_pte if possible. This enables swappable 3240 * from the encoded file_pte if possible. This enables swappable
3241 * nonlinear vmas. 3241 * nonlinear vmas.
3242 * 3242 *
3243 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3243 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3244 * but allow concurrent faults), and pte mapped but not yet locked. 3244 * but allow concurrent faults), and pte mapped but not yet locked.
3245 * We return with mmap_sem still held, but pte unmapped and unlocked. 3245 * We return with mmap_sem still held, but pte unmapped and unlocked.
3246 */ 3246 */
3247 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3247 static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3248 unsigned long address, pte_t *page_table, pmd_t *pmd, 3248 unsigned long address, pte_t *page_table, pmd_t *pmd,
3249 unsigned int flags, pte_t orig_pte) 3249 unsigned int flags, pte_t orig_pte)
3250 { 3250 {
3251 pgoff_t pgoff; 3251 pgoff_t pgoff;
3252 3252
3253 flags |= FAULT_FLAG_NONLINEAR; 3253 flags |= FAULT_FLAG_NONLINEAR;
3254 3254
3255 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 3255 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3256 return 0; 3256 return 0;
3257 3257
3258 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { 3258 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
3259 /* 3259 /*
3260 * Page table corrupted: show pte and kill process. 3260 * Page table corrupted: show pte and kill process.
3261 */ 3261 */
3262 print_bad_pte(vma, address, orig_pte, NULL); 3262 print_bad_pte(vma, address, orig_pte, NULL);
3263 return VM_FAULT_SIGBUS; 3263 return VM_FAULT_SIGBUS;
3264 } 3264 }
3265 3265
3266 pgoff = pte_to_pgoff(orig_pte); 3266 pgoff = pte_to_pgoff(orig_pte);
3267 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3267 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3268 } 3268 }
3269 3269
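File ptes of the kind do_nonlinear_fault() decodes are created by remap_file_pages(2), which rewires part of a linear MAP_SHARED mapping to a different file offset without creating a new vma. A sketch of that, assuming the syscall is available (it was current in this era and later deprecated); _GNU_SOURCE is needed for the glibc declaration:

#define _GNU_SOURCE		/* for remap_file_pages() */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	int fd = open("/tmp/nonlinear-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	size_t len = 4 * 4096;
	char *p;
	int i;

	ftruncate(fd, len);
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	for (i = 0; i < 4; i++)
		p[i * 4096] = '0' + i;		/* tag each file page */

	/* Rewire the first window of the mapping to file page 3: the pte becomes
	 * a file pte, and the next touch goes through do_nonlinear_fault(). */
	if (remap_file_pages(p, 4096, 0, 3, 0) != 0)
		perror("remap_file_pages");
	printf("page 0 of the mapping now shows file page '%c'\n", p[0]);
	return 0;
}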
3270 /* 3270 /*
3271 * These routines also need to handle stuff like marking pages dirty 3271 * These routines also need to handle stuff like marking pages dirty
3272 * and/or accessed for architectures that don't do it in hardware (most 3272 * and/or accessed for architectures that don't do it in hardware (most
3273 * RISC architectures). The early dirtying is also good on the i386. 3273 * RISC architectures). The early dirtying is also good on the i386.
3274 * 3274 *
3275 * There is also a hook called "update_mmu_cache()" that architectures 3275 * There is also a hook called "update_mmu_cache()" that architectures
3276 * with external mmu caches can use to update those (ie the Sparc or 3276 * with external mmu caches can use to update those (ie the Sparc or
3277 * PowerPC hashed page tables that act as extended TLBs). 3277 * PowerPC hashed page tables that act as extended TLBs).
3278 * 3278 *
3279 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3279 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3280 * but allow concurrent faults), and pte mapped but not yet locked. 3280 * but allow concurrent faults), and pte mapped but not yet locked.
3281 * We return with mmap_sem still held, but pte unmapped and unlocked. 3281 * We return with mmap_sem still held, but pte unmapped and unlocked.
3282 */ 3282 */
3283 int handle_pte_fault(struct mm_struct *mm, 3283 int handle_pte_fault(struct mm_struct *mm,
3284 struct vm_area_struct *vma, unsigned long address, 3284 struct vm_area_struct *vma, unsigned long address,
3285 pte_t *pte, pmd_t *pmd, unsigned int flags) 3285 pte_t *pte, pmd_t *pmd, unsigned int flags)
3286 { 3286 {
3287 pte_t entry; 3287 pte_t entry;
3288 spinlock_t *ptl; 3288 spinlock_t *ptl;
3289 3289
3290 entry = *pte; 3290 entry = *pte;
3291 if (!pte_present(entry)) { 3291 if (!pte_present(entry)) {
3292 if (pte_none(entry)) { 3292 if (pte_none(entry)) {
3293 if (vma->vm_ops) { 3293 if (vma->vm_ops) {
3294 if (likely(vma->vm_ops->fault)) 3294 if (likely(vma->vm_ops->fault))
3295 return do_linear_fault(mm, vma, address, 3295 return do_linear_fault(mm, vma, address,
3296 pte, pmd, flags, entry); 3296 pte, pmd, flags, entry);
3297 } 3297 }
3298 return do_anonymous_page(mm, vma, address, 3298 return do_anonymous_page(mm, vma, address,
3299 pte, pmd, flags); 3299 pte, pmd, flags);
3300 } 3300 }
3301 if (pte_file(entry)) 3301 if (pte_file(entry))
3302 return do_nonlinear_fault(mm, vma, address, 3302 return do_nonlinear_fault(mm, vma, address,
3303 pte, pmd, flags, entry); 3303 pte, pmd, flags, entry);
3304 return do_swap_page(mm, vma, address, 3304 return do_swap_page(mm, vma, address,
3305 pte, pmd, flags, entry); 3305 pte, pmd, flags, entry);
3306 } 3306 }
3307 3307
3308 ptl = pte_lockptr(mm, pmd); 3308 ptl = pte_lockptr(mm, pmd);
3309 spin_lock(ptl); 3309 spin_lock(ptl);
3310 if (unlikely(!pte_same(*pte, entry))) 3310 if (unlikely(!pte_same(*pte, entry)))
3311 goto unlock; 3311 goto unlock;
3312 if (flags & FAULT_FLAG_WRITE) { 3312 if (flags & FAULT_FLAG_WRITE) {
3313 if (!pte_write(entry)) 3313 if (!pte_write(entry))
3314 return do_wp_page(mm, vma, address, 3314 return do_wp_page(mm, vma, address,
3315 pte, pmd, ptl, entry); 3315 pte, pmd, ptl, entry);
3316 entry = pte_mkdirty(entry); 3316 entry = pte_mkdirty(entry);
3317 } 3317 }
3318 entry = pte_mkyoung(entry); 3318 entry = pte_mkyoung(entry);
3319 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { 3319 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
3320 update_mmu_cache(vma, address, pte); 3320 update_mmu_cache(vma, address, pte);
3321 } else { 3321 } else {
3322 /* 3322 /*
3323 * This is needed only for protection faults but the arch code 3323 * This is needed only for protection faults but the arch code
3324 * is not yet telling us if this is a protection fault or not. 3324 * is not yet telling us if this is a protection fault or not.
3325 * This still avoids useless tlb flushes for .text page faults 3325 * This still avoids useless tlb flushes for .text page faults
3326 * with threads. 3326 * with threads.
3327 */ 3327 */
3328 if (flags & FAULT_FLAG_WRITE) 3328 if (flags & FAULT_FLAG_WRITE)
3329 flush_tlb_fix_spurious_fault(vma, address); 3329 flush_tlb_fix_spurious_fault(vma, address);
3330 } 3330 }
3331 unlock: 3331 unlock:
3332 pte_unmap_unlock(pte, ptl); 3332 pte_unmap_unlock(pte, ptl);
3333 return 0; 3333 return 0;
3334 } 3334 }
3335 3335
3336 /* 3336 /*
3337 * By the time we get here, we already hold the mm semaphore 3337 * By the time we get here, we already hold the mm semaphore
3338 */ 3338 */
3339 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3339 int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3340 unsigned long address, unsigned int flags) 3340 unsigned long address, unsigned int flags)
3341 { 3341 {
3342 pgd_t *pgd; 3342 pgd_t *pgd;
3343 pud_t *pud; 3343 pud_t *pud;
3344 pmd_t *pmd; 3344 pmd_t *pmd;
3345 pte_t *pte; 3345 pte_t *pte;
3346 3346
3347 __set_current_state(TASK_RUNNING); 3347 __set_current_state(TASK_RUNNING);
3348 3348
3349 count_vm_event(PGFAULT); 3349 count_vm_event(PGFAULT);
3350 3350
3351 /* do counter updates before entering really critical section. */ 3351 /* do counter updates before entering really critical section. */
3352 check_sync_rss_stat(current); 3352 check_sync_rss_stat(current);
3353 3353
3354 if (unlikely(is_vm_hugetlb_page(vma))) 3354 if (unlikely(is_vm_hugetlb_page(vma)))
3355 return hugetlb_fault(mm, vma, address, flags); 3355 return hugetlb_fault(mm, vma, address, flags);
3356 3356
3357 pgd = pgd_offset(mm, address); 3357 pgd = pgd_offset(mm, address);
3358 pud = pud_alloc(mm, pgd, address); 3358 pud = pud_alloc(mm, pgd, address);
3359 if (!pud) 3359 if (!pud)
3360 return VM_FAULT_OOM; 3360 return VM_FAULT_OOM;
3361 pmd = pmd_alloc(mm, pud, address); 3361 pmd = pmd_alloc(mm, pud, address);
3362 if (!pmd) 3362 if (!pmd)
3363 return VM_FAULT_OOM; 3363 return VM_FAULT_OOM;
3364 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { 3364 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3365 if (!vma->vm_ops) 3365 if (!vma->vm_ops)
3366 return do_huge_pmd_anonymous_page(mm, vma, address, 3366 return do_huge_pmd_anonymous_page(mm, vma, address,
3367 pmd, flags); 3367 pmd, flags);
3368 } else { 3368 } else {
3369 pmd_t orig_pmd = *pmd; 3369 pmd_t orig_pmd = *pmd;
3370 barrier(); 3370 barrier();
3371 if (pmd_trans_huge(orig_pmd)) { 3371 if (pmd_trans_huge(orig_pmd)) {
3372 if (flags & FAULT_FLAG_WRITE && 3372 if (flags & FAULT_FLAG_WRITE &&
3373 !pmd_write(orig_pmd) && 3373 !pmd_write(orig_pmd) &&
3374 !pmd_trans_splitting(orig_pmd)) 3374 !pmd_trans_splitting(orig_pmd))
3375 return do_huge_pmd_wp_page(mm, vma, address, 3375 return do_huge_pmd_wp_page(mm, vma, address,
3376 pmd, orig_pmd); 3376 pmd, orig_pmd);
3377 return 0; 3377 return 0;
3378 } 3378 }
3379 } 3379 }
3380 3380
3381 /* 3381 /*
3382 * Use __pte_alloc instead of pte_alloc_map, because we can't 3382 * Use __pte_alloc instead of pte_alloc_map, because we can't
3383 * run pte_offset_map on the pmd, if a huge pmd could 3383 * run pte_offset_map on the pmd, if a huge pmd could
3384 * materialize from under us from a different thread. 3384 * materialize from under us from a different thread.
3385 */ 3385 */
3386 if (unlikely(__pte_alloc(mm, vma, pmd, address))) 3386 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
3387 return VM_FAULT_OOM; 3387 return VM_FAULT_OOM;
3388 /* if a huge pmd materialized from under us just retry later */ 3388 /* if a huge pmd materialized from under us just retry later */
3389 if (unlikely(pmd_trans_huge(*pmd))) 3389 if (unlikely(pmd_trans_huge(*pmd)))
3390 return 0; 3390 return 0;
3391 /* 3391 /*
3392 * A regular pmd is established and it can't morph into a huge pmd 3392 * A regular pmd is established and it can't morph into a huge pmd
3393 * from under us anymore at this point because we hold the mmap_sem 3393 * from under us anymore at this point because we hold the mmap_sem
3394 * read mode and khugepaged takes it in write mode. So now it's 3394 * read mode and khugepaged takes it in write mode. So now it's
3395 * safe to run pte_offset_map(). 3395 * safe to run pte_offset_map().
3396 */ 3396 */
3397 pte = pte_offset_map(pmd, address); 3397 pte = pte_offset_map(pmd, address);
3398 3398
3399 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3399 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3400 } 3400 }
3401 3401
3402 #ifndef __PAGETABLE_PUD_FOLDED 3402 #ifndef __PAGETABLE_PUD_FOLDED
3403 /* 3403 /*
3404 * Allocate page upper directory. 3404 * Allocate page upper directory.
3405 * We've already handled the fast-path in-line. 3405 * We've already handled the fast-path in-line.
3406 */ 3406 */
3407 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 3407 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
3408 { 3408 {
3409 pud_t *new = pud_alloc_one(mm, address); 3409 pud_t *new = pud_alloc_one(mm, address);
3410 if (!new) 3410 if (!new)
3411 return -ENOMEM; 3411 return -ENOMEM;
3412 3412
3413 smp_wmb(); /* See comment in __pte_alloc */ 3413 smp_wmb(); /* See comment in __pte_alloc */
3414 3414
3415 spin_lock(&mm->page_table_lock); 3415 spin_lock(&mm->page_table_lock);
3416 if (pgd_present(*pgd)) /* Another has populated it */ 3416 if (pgd_present(*pgd)) /* Another has populated it */
3417 pud_free(mm, new); 3417 pud_free(mm, new);
3418 else 3418 else
3419 pgd_populate(mm, pgd, new); 3419 pgd_populate(mm, pgd, new);
3420 spin_unlock(&mm->page_table_lock); 3420 spin_unlock(&mm->page_table_lock);
3421 return 0; 3421 return 0;
3422 } 3422 }
3423 #endif /* __PAGETABLE_PUD_FOLDED */ 3423 #endif /* __PAGETABLE_PUD_FOLDED */
3424 3424
3425 #ifndef __PAGETABLE_PMD_FOLDED 3425 #ifndef __PAGETABLE_PMD_FOLDED
3426 /* 3426 /*
3427 * Allocate page middle directory. 3427 * Allocate page middle directory.
3428 * We've already handled the fast-path in-line. 3428 * We've already handled the fast-path in-line.
3429 */ 3429 */
3430 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 3430 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3431 { 3431 {
3432 pmd_t *new = pmd_alloc_one(mm, address); 3432 pmd_t *new = pmd_alloc_one(mm, address);
3433 if (!new) 3433 if (!new)
3434 return -ENOMEM; 3434 return -ENOMEM;
3435 3435
3436 smp_wmb(); /* See comment in __pte_alloc */ 3436 smp_wmb(); /* See comment in __pte_alloc */
3437 3437
3438 spin_lock(&mm->page_table_lock); 3438 spin_lock(&mm->page_table_lock);
3439 #ifndef __ARCH_HAS_4LEVEL_HACK 3439 #ifndef __ARCH_HAS_4LEVEL_HACK
3440 if (pud_present(*pud)) /* Another has populated it */ 3440 if (pud_present(*pud)) /* Another has populated it */
3441 pmd_free(mm, new); 3441 pmd_free(mm, new);
3442 else 3442 else
3443 pud_populate(mm, pud, new); 3443 pud_populate(mm, pud, new);
3444 #else 3444 #else
3445 if (pgd_present(*pud)) /* Another has populated it */ 3445 if (pgd_present(*pud)) /* Another has populated it */
3446 pmd_free(mm, new); 3446 pmd_free(mm, new);
3447 else 3447 else
3448 pgd_populate(mm, pud, new); 3448 pgd_populate(mm, pud, new);
3449 #endif /* __ARCH_HAS_4LEVEL_HACK */ 3449 #endif /* __ARCH_HAS_4LEVEL_HACK */
3450 spin_unlock(&mm->page_table_lock); 3450 spin_unlock(&mm->page_table_lock);
3451 return 0; 3451 return 0;
3452 } 3452 }
3453 #endif /* __PAGETABLE_PMD_FOLDED */ 3453 #endif /* __PAGETABLE_PMD_FOLDED */
3454 3454
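__pud_alloc() and __pmd_alloc() follow the same shape: allocate the new table outside the lock, then take page_table_lock and either install it or free it because another thread already populated the entry. (The kernel additionally needs the smp_wmb() because lock-free walkers may see the entry before the lock is released.) A generic userspace sketch of just the allocate-then-check-under-lock pattern, with hypothetical names:

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *slot;			/* plays the role of the pud/pmd entry */

/* Allocate outside the lock, then install only if nobody beat us to it,
 * mirroring the __pud_alloc()/__pmd_alloc() structure above. */
static void *get_or_create_slot(void)
{
	void *new = calloc(1, 64);	/* the potentially wasted allocation */
	void *ret;

	if (!new)
		return NULL;

	pthread_mutex_lock(&table_lock);
	if (slot)			/* "Another has populated it" */
		free(new);
	else
		slot = new;
	ret = slot;
	pthread_mutex_unlock(&table_lock);
	return ret;
}

static void *worker(void *arg)
{
	(void)arg;
	return get_or_create_slot();
}

int main(void)
{
	pthread_t a, b;
	void *ra, *rb;

	pthread_create(&a, NULL, worker, NULL);
	pthread_create(&b, NULL, worker, NULL);
	pthread_join(a, &ra);
	pthread_join(b, &rb);
	printf("both threads see the same slot: %s\n", ra == rb ? "yes" : "no");
	return 0;
}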
3455 int make_pages_present(unsigned long addr, unsigned long end) 3455 int make_pages_present(unsigned long addr, unsigned long end)
3456 { 3456 {
3457 int ret, len, write; 3457 int ret, len, write;
3458 struct vm_area_struct * vma; 3458 struct vm_area_struct * vma;
3459 3459
3460 vma = find_vma(current->mm, addr); 3460 vma = find_vma(current->mm, addr);
3461 if (!vma) 3461 if (!vma)
3462 return -ENOMEM; 3462 return -ENOMEM;
3463 /* 3463 /*
3464 * We want to touch writable mappings with a write fault in order 3464 * We want to touch writable mappings with a write fault in order
3465 * to break COW, except for shared mappings because these don't COW 3465 * to break COW, except for shared mappings because these don't COW
3466 * and we would not want to dirty them for nothing. 3466 * and we would not want to dirty them for nothing.
3467 */ 3467 */
3468 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; 3468 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3469 BUG_ON(addr >= end); 3469 BUG_ON(addr >= end);
3470 BUG_ON(end > vma->vm_end); 3470 BUG_ON(end > vma->vm_end);
3471 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 3471 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
3472 ret = get_user_pages(current, current->mm, addr, 3472 ret = get_user_pages(current, current->mm, addr,
3473 len, write, 0, NULL, NULL); 3473 len, write, 0, NULL, NULL);
3474 if (ret < 0) 3474 if (ret < 0)
3475 return ret; 3475 return ret;
3476 return ret == len ? 0 : -EFAULT; 3476 return ret == len ? 0 : -EFAULT;
3477 } 3477 }
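
The len computation counts every page touched by the half-open range [addr, end). A worked instance (assuming PAGE_SIZE == 4096; the values are illustrative only):

/* addr = 0x1800 and end = 0x3001 touch pages 1, 2 and 3 */
unsigned long addr = 0x1800, end = 0x3001;
int len = DIV_ROUND_UP(end, PAGE_SIZE) - addr / PAGE_SIZE;   /* 4 - 1 = 3 */

get_user_pages() then faults in exactly that many pages; anything short of len is reported as -EFAULT.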
3478 3478
3479 #if !defined(__HAVE_ARCH_GATE_AREA) 3479 #if !defined(__HAVE_ARCH_GATE_AREA)
3480 3480
3481 #if defined(AT_SYSINFO_EHDR) 3481 #if defined(AT_SYSINFO_EHDR)
3482 static struct vm_area_struct gate_vma; 3482 static struct vm_area_struct gate_vma;
3483 3483
3484 static int __init gate_vma_init(void) 3484 static int __init gate_vma_init(void)
3485 { 3485 {
3486 gate_vma.vm_mm = NULL; 3486 gate_vma.vm_mm = NULL;
3487 gate_vma.vm_start = FIXADDR_USER_START; 3487 gate_vma.vm_start = FIXADDR_USER_START;
3488 gate_vma.vm_end = FIXADDR_USER_END; 3488 gate_vma.vm_end = FIXADDR_USER_END;
3489 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; 3489 gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC;
3490 gate_vma.vm_page_prot = __P101; 3490 gate_vma.vm_page_prot = __P101;
3491 /* 3491 /*
3492 * Make sure the vDSO gets into every core dump. 3492 * Make sure the vDSO gets into every core dump.
3493 * Dumping its contents makes post-mortem fully interpretable later 3493 * Dumping its contents makes post-mortem fully interpretable later
3494 * without matching up the same kernel and hardware config to see 3494 * without matching up the same kernel and hardware config to see
3495 * what PC values meant. 3495 * what PC values meant.
3496 */ 3496 */
3497 gate_vma.vm_flags |= VM_ALWAYSDUMP; 3497 gate_vma.vm_flags |= VM_ALWAYSDUMP;
3498 return 0; 3498 return 0;
3499 } 3499 }
3500 __initcall(gate_vma_init); 3500 __initcall(gate_vma_init);
3501 #endif 3501 #endif
3502 3502
3503 struct vm_area_struct *get_gate_vma(struct mm_struct *mm) 3503 struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
3504 { 3504 {
3505 #ifdef AT_SYSINFO_EHDR 3505 #ifdef AT_SYSINFO_EHDR
3506 return &gate_vma; 3506 return &gate_vma;
3507 #else 3507 #else
3508 return NULL; 3508 return NULL;
3509 #endif 3509 #endif
3510 } 3510 }
3511 3511
3512 int in_gate_area_no_mm(unsigned long addr) 3512 int in_gate_area_no_mm(unsigned long addr)
3513 { 3513 {
3514 #ifdef AT_SYSINFO_EHDR 3514 #ifdef AT_SYSINFO_EHDR
3515 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) 3515 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
3516 return 1; 3516 return 1;
3517 #endif 3517 #endif
3518 return 0; 3518 return 0;
3519 } 3519 }
3520 3520
3521 #endif /* __HAVE_ARCH_GATE_AREA */ 3521 #endif /* __HAVE_ARCH_GATE_AREA */
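
For callers, the gate VMA behaves like any other mapping except that it belongs to no mm. A minimal sketch of the lookup pattern (an assumption loosely modelled on get_user_pages()-style walkers; mm and addr are placeholders):

struct vm_area_struct *vma = find_vma(mm, addr);

if (!vma && in_gate_area_no_mm(addr))
        vma = get_gate_vma(mm);         /* the static gate_vma set up above */
if (!vma)
        return -EFAULT;                 /* genuinely unmapped */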
3522 3522
3523 static int __follow_pte(struct mm_struct *mm, unsigned long address, 3523 static int __follow_pte(struct mm_struct *mm, unsigned long address,
3524 pte_t **ptepp, spinlock_t **ptlp) 3524 pte_t **ptepp, spinlock_t **ptlp)
3525 { 3525 {
3526 pgd_t *pgd; 3526 pgd_t *pgd;
3527 pud_t *pud; 3527 pud_t *pud;
3528 pmd_t *pmd; 3528 pmd_t *pmd;
3529 pte_t *ptep; 3529 pte_t *ptep;
3530 3530
3531 pgd = pgd_offset(mm, address); 3531 pgd = pgd_offset(mm, address);
3532 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 3532 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
3533 goto out; 3533 goto out;
3534 3534
3535 pud = pud_offset(pgd, address); 3535 pud = pud_offset(pgd, address);
3536 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 3536 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
3537 goto out; 3537 goto out;
3538 3538
3539 pmd = pmd_offset(pud, address); 3539 pmd = pmd_offset(pud, address);
3540 VM_BUG_ON(pmd_trans_huge(*pmd)); 3540 VM_BUG_ON(pmd_trans_huge(*pmd));
3541 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 3541 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3542 goto out; 3542 goto out;
3543 3543
3544 /* We cannot handle huge page PFN maps. Luckily they don't exist. */ 3544 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
3545 if (pmd_huge(*pmd)) 3545 if (pmd_huge(*pmd))
3546 goto out; 3546 goto out;
3547 3547
3548 ptep = pte_offset_map_lock(mm, pmd, address, ptlp); 3548 ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
3549 if (!ptep) 3549 if (!ptep)
3550 goto out; 3550 goto out;
3551 if (!pte_present(*ptep)) 3551 if (!pte_present(*ptep))
3552 goto unlock; 3552 goto unlock;
3553 *ptepp = ptep; 3553 *ptepp = ptep;
3554 return 0; 3554 return 0;
3555 unlock: 3555 unlock:
3556 pte_unmap_unlock(ptep, *ptlp); 3556 pte_unmap_unlock(ptep, *ptlp);
3557 out: 3557 out:
3558 return -EINVAL; 3558 return -EINVAL;
3559 } 3559 }
3560 3560
3561 static inline int follow_pte(struct mm_struct *mm, unsigned long address, 3561 static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3562 pte_t **ptepp, spinlock_t **ptlp) 3562 pte_t **ptepp, spinlock_t **ptlp)
3563 { 3563 {
3564 int res; 3564 int res;
3565 3565
3566 /* (void) is needed to make gcc happy */ 3566 /* (void) is needed to make gcc happy */
3567 (void) __cond_lock(*ptlp, 3567 (void) __cond_lock(*ptlp,
3568 !(res = __follow_pte(mm, address, ptepp, ptlp))); 3568 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3569 return res; 3569 return res;
3570 } 3570 }
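
The caller contract, sketched (follow_pfn() and follow_phys() below are the in-tree users): on success the pte is mapped and *ptlp is held, so every successful call must be paired with pte_unmap_unlock().

pte_t *ptep, pte;
spinlock_t *ptl;

if (follow_pte(mm, address, &ptep, &ptl))
        return -EINVAL;                 /* nothing usable mapped at address */
pte = *ptep;                            /* snapshot while the lock is held */
pte_unmap_unlock(ptep, ptl);
/* work with the snapshot, e.g. pte_pfn(pte), after dropping the lock */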
3571 3571
3572 /** 3572 /**
3573 * follow_pfn - look up PFN at a user virtual address 3573 * follow_pfn - look up PFN at a user virtual address
3574 * @vma: memory mapping 3574 * @vma: memory mapping
3575 * @address: user virtual address 3575 * @address: user virtual address
3576 * @pfn: location to store found PFN 3576 * @pfn: location to store found PFN
3577 * 3577 *
3578 * Only IO mappings and raw PFN mappings are allowed. 3578 * Only IO mappings and raw PFN mappings are allowed.
3579 * 3579 *
3580 * Returns zero and the pfn at @pfn on success, -ve otherwise. 3580 * Returns zero and the pfn at @pfn on success, -ve otherwise.
3581 */ 3581 */
3582 int follow_pfn(struct vm_area_struct *vma, unsigned long address, 3582 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
3583 unsigned long *pfn) 3583 unsigned long *pfn)
3584 { 3584 {
3585 int ret = -EINVAL; 3585 int ret = -EINVAL;
3586 spinlock_t *ptl; 3586 spinlock_t *ptl;
3587 pte_t *ptep; 3587 pte_t *ptep;
3588 3588
3589 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 3589 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3590 return ret; 3590 return ret;
3591 3591
3592 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); 3592 ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
3593 if (ret) 3593 if (ret)
3594 return ret; 3594 return ret;
3595 *pfn = pte_pfn(*ptep); 3595 *pfn = pte_pfn(*ptep);
3596 pte_unmap_unlock(ptep, ptl); 3596 pte_unmap_unlock(ptep, ptl);
3597 return 0; 3597 return 0;
3598 } 3598 }
3599 EXPORT_SYMBOL(follow_pfn); 3599 EXPORT_SYMBOL(follow_pfn);
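
An illustrative caller (assumptions: the mapping was created with remap_pfn_range(), so VM_IO | VM_PFNMAP is set, and mm/addr are placeholders). mmap_sem must be held across find_vma() and the lookup:

struct vm_area_struct *vma;
unsigned long pfn;

down_read(&mm->mmap_sem);
vma = find_vma(mm, addr);
if (vma && vma->vm_start <= addr && !follow_pfn(vma, addr, &pfn))
        printk(KERN_DEBUG "addr %#lx maps pfn %#lx\n", addr, pfn);
up_read(&mm->mmap_sem);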
3600 3600
3601 #ifdef CONFIG_HAVE_IOREMAP_PROT 3601 #ifdef CONFIG_HAVE_IOREMAP_PROT
3602 int follow_phys(struct vm_area_struct *vma, 3602 int follow_phys(struct vm_area_struct *vma,
3603 unsigned long address, unsigned int flags, 3603 unsigned long address, unsigned int flags,
3604 unsigned long *prot, resource_size_t *phys) 3604 unsigned long *prot, resource_size_t *phys)
3605 { 3605 {
3606 int ret = -EINVAL; 3606 int ret = -EINVAL;
3607 pte_t *ptep, pte; 3607 pte_t *ptep, pte;
3608 spinlock_t *ptl; 3608 spinlock_t *ptl;
3609 3609
3610 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) 3610 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
3611 goto out; 3611 goto out;
3612 3612
3613 if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) 3613 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
3614 goto out; 3614 goto out;
3615 pte = *ptep; 3615 pte = *ptep;
3616 3616
3617 if ((flags & FOLL_WRITE) && !pte_write(pte)) 3617 if ((flags & FOLL_WRITE) && !pte_write(pte))
3618 goto unlock; 3618 goto unlock;
3619 3619
3620 *prot = pgprot_val(pte_pgprot(pte)); 3620 *prot = pgprot_val(pte_pgprot(pte));
3621 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; 3621 *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
3622 3622
3623 ret = 0; 3623 ret = 0;
3624 unlock: 3624 unlock:
3625 pte_unmap_unlock(ptep, ptl); 3625 pte_unmap_unlock(ptep, ptl);
3626 out: 3626 out:
3627 return ret; 3627 return ret;
3628 } 3628 }
3629 3629
3630 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, 3630 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3631 void *buf, int len, int write) 3631 void *buf, int len, int write)
3632 { 3632 {
3633 resource_size_t phys_addr; 3633 resource_size_t phys_addr;
3634 unsigned long prot = 0; 3634 unsigned long prot = 0;
3635 void __iomem *maddr; 3635 void __iomem *maddr;
3636 int offset = addr & (PAGE_SIZE-1); 3636 int offset = addr & (PAGE_SIZE-1);
3637 3637
3638 if (follow_phys(vma, addr, write, &prot, &phys_addr)) 3638 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3639 return -EINVAL; 3639 return -EINVAL;
3640 3640
3641 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); 3641 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
3642 if (write) 3642 if (write)
3643 memcpy_toio(maddr + offset, buf, len); 3643 memcpy_toio(maddr + offset, buf, len);
3644 else 3644 else
3645 memcpy_fromio(buf, maddr + offset, len); 3645 memcpy_fromio(buf, maddr + offset, len);
3646 iounmap(maddr); 3646 iounmap(maddr);
3647 3647
3648 return len; 3648 return len;
3649 } 3649 }
3650 #endif 3650 #endif
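
generic_access_phys() is meant to be plugged into a mapping's vm_ops so that ptrace-style access works on I/O mappings; the sketch below mirrors what drivers/char/mem.c does for /dev/mem (my_vm_ops and my_mmap are hypothetical names):

static const struct vm_operations_struct my_vm_ops = {
        .access = generic_access_phys,
};

static int my_mmap(struct file *file, struct vm_area_struct *vma)
{
        vma->vm_ops = &my_vm_ops;
        /* remap_pfn_range() marks the VMA VM_IO | VM_PFNMAP */
        return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
                               vma->vm_end - vma->vm_start,
                               vma->vm_page_prot);
}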
3651 3651
3652 /* 3652 /*
3653 * Access another process' address space as given in mm. If non-NULL, use the 3653 * Access another process' address space as given in mm. If non-NULL, use the
3654 * given task for page fault accounting. 3654 * given task for page fault accounting.
3655 */ 3655 */
3656 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, 3656 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3657 unsigned long addr, void *buf, int len, int write) 3657 unsigned long addr, void *buf, int len, int write)
3658 { 3658 {
3659 struct vm_area_struct *vma; 3659 struct vm_area_struct *vma;
3660 void *old_buf = buf; 3660 void *old_buf = buf;
3661 3661
3662 down_read(&mm->mmap_sem); 3662 down_read(&mm->mmap_sem);
3663 /* ignore errors, just check how much was successfully transferred */ 3663 /* ignore errors, just check how much was successfully transferred */
3664 while (len) { 3664 while (len) {
3665 int bytes, ret, offset; 3665 int bytes, ret, offset;
3666 void *maddr; 3666 void *maddr;
3667 struct page *page = NULL; 3667 struct page *page = NULL;
3668 3668
3669 ret = get_user_pages(tsk, mm, addr, 1, 3669 ret = get_user_pages(tsk, mm, addr, 1,
3670 write, 1, &page, &vma); 3670 write, 1, &page, &vma);
3671 if (ret <= 0) { 3671 if (ret <= 0) {
3672 /* 3672 /*
3673 * Check if this is a VM_IO | VM_PFNMAP VMA, which 3673 * Check if this is a VM_IO | VM_PFNMAP VMA, which
3674 * we can access using slightly different code. 3674 * we can access using slightly different code.
3675 */ 3675 */
3676 #ifdef CONFIG_HAVE_IOREMAP_PROT 3676 #ifdef CONFIG_HAVE_IOREMAP_PROT
3677 vma = find_vma(mm, addr); 3677 vma = find_vma(mm, addr);
3678 if (!vma) 3678 if (!vma)
3679 break; 3679 break;
3680 if (vma->vm_ops && vma->vm_ops->access) 3680 if (vma->vm_ops && vma->vm_ops->access)
3681 ret = vma->vm_ops->access(vma, addr, buf, 3681 ret = vma->vm_ops->access(vma, addr, buf,
3682 len, write); 3682 len, write);
3683 if (ret <= 0) 3683 if (ret <= 0)
3684 #endif 3684 #endif
3685 break; 3685 break;
3686 bytes = ret; 3686 bytes = ret;
3687 } else { 3687 } else {
3688 bytes = len; 3688 bytes = len;
3689 offset = addr & (PAGE_SIZE-1); 3689 offset = addr & (PAGE_SIZE-1);
3690 if (bytes > PAGE_SIZE-offset) 3690 if (bytes > PAGE_SIZE-offset)
3691 bytes = PAGE_SIZE-offset; 3691 bytes = PAGE_SIZE-offset;
3692 3692
3693 maddr = kmap(page); 3693 maddr = kmap(page);
3694 if (write) { 3694 if (write) {
3695 copy_to_user_page(vma, page, addr, 3695 copy_to_user_page(vma, page, addr,
3696 maddr + offset, buf, bytes); 3696 maddr + offset, buf, bytes);
3697 set_page_dirty_lock(page); 3697 set_page_dirty_lock(page);
3698 } else { 3698 } else {
3699 copy_from_user_page(vma, page, addr, 3699 copy_from_user_page(vma, page, addr,
3700 buf, maddr + offset, bytes); 3700 buf, maddr + offset, bytes);
3701 } 3701 }
3702 kunmap(page); 3702 kunmap(page);
3703 page_cache_release(page); 3703 page_cache_release(page);
3704 } 3704 }
3705 len -= bytes; 3705 len -= bytes;
3706 buf += bytes; 3706 buf += bytes;
3707 addr += bytes; 3707 addr += bytes;
3708 } 3708 }
3709 up_read(&mm->mmap_sem); 3709 up_read(&mm->mmap_sem);
3710 3710
3711 return buf - old_buf; 3711 return buf - old_buf;
3712 } 3712 }
3713 3713
3714 /**
3715 * access_remote_vm - access another process' address space
3716 * @mm: the mm_struct of the target address space
3717 * @addr: start address to access
3718 * @buf: source or destination buffer
3719 * @len: number of bytes to transfer
3720 * @write: whether the access is a write
3721 *
3722 * The caller must hold a reference on @mm.
3723 */
3724 int access_remote_vm(struct mm_struct *mm, unsigned long addr,
3725 void *buf, int len, int write)
3726 {
3727 return __access_remote_vm(NULL, mm, addr, buf, len, write);
3728 }
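
The intended caller pattern, sketched (task, addr, kbuf and count are placeholders): the caller pins the target address space itself, typically via get_task_mm()/mmput(), which is the mm_struct reference the commit message refers to.

struct mm_struct *mm = get_task_mm(task);       /* takes a reference on mm */
int copied = 0;

if (mm) {
        copied = access_remote_vm(mm, addr, kbuf, count, 0 /* read */);
        mmput(mm);
}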
3729
3714 /* 3730 /*
3715 * Access another process' address space. 3731 * Access another process' address space.
3716 * Source/target buffer must be kernel space, 3732 * Source/target buffer must be kernel space,
3717 * Do not walk the page table directly, use get_user_pages 3733 * Do not walk the page table directly, use get_user_pages
3718 */ 3734 */
3719 int access_process_vm(struct task_struct *tsk, unsigned long addr, 3735 int access_process_vm(struct task_struct *tsk, unsigned long addr,
3720 void *buf, int len, int write) 3736 void *buf, int len, int write)
3721 { 3737 {
3722 struct mm_struct *mm; 3738 struct mm_struct *mm;
3723 int ret; 3739 int ret;
3724 3740
3725 mm = get_task_mm(tsk); 3741 mm = get_task_mm(tsk);
3726 if (!mm) 3742 if (!mm)
3727 return 0; 3743 return 0;
3728 3744
3729 ret = __access_remote_vm(tsk, mm, addr, buf, len, write); 3745 ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
3730 mmput(mm); 3746 mmput(mm);
3731 3747
3732 return ret; 3748 return ret;
3733 } 3749 }
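
For comparison, this is roughly how ptrace peeks one word of the child's memory through access_process_vm() (trimmed from kernel/ptrace.c's generic_ptrace_peekdata()):

unsigned long tmp;
int copied;

copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
if (copied != sizeof(tmp))
        return -EIO;
return put_user(tmp, (unsigned long __user *)data);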
3734 3750
3735 /* 3751 /*
3736 * Print the name of a VMA. 3752 * Print the name of a VMA.
3737 */ 3753 */
3738 void print_vma_addr(char *prefix, unsigned long ip) 3754 void print_vma_addr(char *prefix, unsigned long ip)
3739 { 3755 {
3740 struct mm_struct *mm = current->mm; 3756 struct mm_struct *mm = current->mm;
3741 struct vm_area_struct *vma; 3757 struct vm_area_struct *vma;
3742 3758
3743 /* 3759 /*
3744 * Do not print if we are in atomic 3760 * Do not print if we are in atomic
3745 * contexts (in exception stacks, etc.): 3761 * contexts (in exception stacks, etc.):
3746 */ 3762 */
3747 if (preempt_count()) 3763 if (preempt_count())
3748 return; 3764 return;
3749 3765
3750 down_read(&mm->mmap_sem); 3766 down_read(&mm->mmap_sem);
3751 vma = find_vma(mm, ip); 3767 vma = find_vma(mm, ip);
3752 if (vma && vma->vm_file) { 3768 if (vma && vma->vm_file) {
3753 struct file *f = vma->vm_file; 3769 struct file *f = vma->vm_file;
3754 char *buf = (char *)__get_free_page(GFP_KERNEL); 3770 char *buf = (char *)__get_free_page(GFP_KERNEL);
3755 if (buf) { 3771 if (buf) {
3756 char *p, *s; 3772 char *p, *s;
3757 3773
3758 p = d_path(&f->f_path, buf, PAGE_SIZE); 3774 p = d_path(&f->f_path, buf, PAGE_SIZE);
3759 if (IS_ERR(p)) 3775 if (IS_ERR(p))
3760 p = "?"; 3776 p = "?";
3761 s = strrchr(p, '/'); 3777 s = strrchr(p, '/');
3762 if (s) 3778 if (s)
3763 p = s+1; 3779 p = s+1;
3764 printk("%s%s[%lx+%lx]", prefix, p, 3780 printk("%s%s[%lx+%lx]", prefix, p,
3765 vma->vm_start, 3781 vma->vm_start,
3766 vma->vm_end - vma->vm_start); 3782 vma->vm_end - vma->vm_start);
3767 free_page((unsigned long)buf); 3783 free_page((unsigned long)buf);
3768 } 3784 }
3769 } 3785 }
3770 up_read(&current->mm->mmap_sem); 3786 up_read(&current->mm->mmap_sem);
3771 } 3787 }
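
A typical caller is an architecture's fault handler reporting which mapping an instruction pointer fell in; a trimmed sketch modelled on x86's show_signal_msg() (field names are illustrative):

printk(KERN_INFO "%s[%d]: segfault at %lx ip %lx",
       tsk->comm, task_pid_nr(tsk), address, regs->ip);
print_vma_addr(KERN_CONT " in ", regs->ip);
printk(KERN_CONT "\n");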
3772 3788
3773 #ifdef CONFIG_PROVE_LOCKING 3789 #ifdef CONFIG_PROVE_LOCKING
3774 void might_fault(void) 3790 void might_fault(void)
3775 { 3791 {
3776 /* 3792 /*
3777 * Some code (nfs/sunrpc) uses socket ops on kernel memory while 3793 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
3778 * holding the mmap_sem, this is safe because kernel memory doesn't 3794 * holding the mmap_sem, this is safe because kernel memory doesn't
3779 * get paged out, therefore we'll never actually fault, and the 3795 * get paged out, therefore we'll never actually fault, and the
3780 * below annotations will generate false positives. 3796 * below annotations will generate false positives.
3781 */ 3797 */
3782 if (segment_eq(get_fs(), KERNEL_DS)) 3798 if (segment_eq(get_fs(), KERNEL_DS))
3783 return; 3799 return;
3784 3800
3785 might_sleep(); 3801 might_sleep();
3786 /* 3802 /*
3787 * it would be nicer only to annotate paths which are not under 3803 * it would be nicer only to annotate paths which are not under
3788 * pagefault_disable, however that requires a larger audit and 3804 * pagefault_disable, however that requires a larger audit and
3789 * providing helpers like get_user_atomic. 3805 * providing helpers like get_user_atomic.
3790 */ 3806 */
3791 if (!in_atomic() && current->mm) 3807 if (!in_atomic() && current->mm)
3792 might_lock_read(&current->mm->mmap_sem); 3808 might_lock_read(&current->mm->mmap_sem);
3793 } 3809 }
3794 EXPORT_SYMBOL(might_fault); 3810 EXPORT_SYMBOL(might_fault);
3795 #endif 3811 #endif
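
might_fault() sits at the top of the uaccess primitives so lockdep sees the potential mmap_sem acquisition even on runs where no fault actually happens; a sketch with a hypothetical wrapper (my_copy_from_user is not a real kernel helper):

static inline unsigned long my_copy_from_user(void *to,
                                              const void __user *from,
                                              unsigned long n)
{
        might_fault();          /* may sleep and take mmap_sem on a fault */
        return __copy_from_user(to, from, n);
}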
3796 3812
3797 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) 3813 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3798 static void clear_gigantic_page(struct page *page, 3814 static void clear_gigantic_page(struct page *page,
3799 unsigned long addr, 3815 unsigned long addr,
3800 unsigned int pages_per_huge_page) 3816 unsigned int pages_per_huge_page)
3801 { 3817 {
3802 int i; 3818 int i;
3803 struct page *p = page; 3819 struct page *p = page;
3804 3820
3805 might_sleep(); 3821 might_sleep();
3806 for (i = 0; i < pages_per_huge_page; 3822 for (i = 0; i < pages_per_huge_page;
3807 i++, p = mem_map_next(p, page, i)) { 3823 i++, p = mem_map_next(p, page, i)) {
3808 cond_resched(); 3824 cond_resched();
3809 clear_user_highpage(p, addr + i * PAGE_SIZE); 3825 clear_user_highpage(p, addr + i * PAGE_SIZE);
3810 } 3826 }
3811 } 3827 }
3812 void clear_huge_page(struct page *page, 3828 void clear_huge_page(struct page *page,
3813 unsigned long addr, unsigned int pages_per_huge_page) 3829 unsigned long addr, unsigned int pages_per_huge_page)
3814 { 3830 {
3815 int i; 3831 int i;
3816 3832
3817 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { 3833 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3818 clear_gigantic_page(page, addr, pages_per_huge_page); 3834 clear_gigantic_page(page, addr, pages_per_huge_page);
3819 return; 3835 return;
3820 } 3836 }
3821 3837
3822 might_sleep(); 3838 might_sleep();
3823 for (i = 0; i < pages_per_huge_page; i++) { 3839 for (i = 0; i < pages_per_huge_page; i++) {
3824 cond_resched(); 3840 cond_resched();
3825 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 3841 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3826 } 3842 }
3827 } 3843 }
3828 3844
3829 static void copy_user_gigantic_page(struct page *dst, struct page *src, 3845 static void copy_user_gigantic_page(struct page *dst, struct page *src,
3830 unsigned long addr, 3846 unsigned long addr,
3831 struct vm_area_struct *vma, 3847 struct vm_area_struct *vma,
3832 unsigned int pages_per_huge_page) 3848 unsigned int pages_per_huge_page)
3833 { 3849 {
3834 int i; 3850 int i;
3835 struct page *dst_base = dst; 3851 struct page *dst_base = dst;
3836 struct page *src_base = src; 3852 struct page *src_base = src;
3837 3853
3838 for (i = 0; i < pages_per_huge_page; ) { 3854 for (i = 0; i < pages_per_huge_page; ) {
3839 cond_resched(); 3855 cond_resched();
3840 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); 3856 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
3841 3857
3842 i++; 3858 i++;
3843 dst = mem_map_next(dst, dst_base, i); 3859 dst = mem_map_next(dst, dst_base, i);
3844 src = mem_map_next(src, src_base, i); 3860 src = mem_map_next(src, src_base, i);
3845 } 3861 }
3846 } 3862 }
3847 3863
3848 void copy_user_huge_page(struct page *dst, struct page *src, 3864 void copy_user_huge_page(struct page *dst, struct page *src,
3849 unsigned long addr, struct vm_area_struct *vma, 3865 unsigned long addr, struct vm_area_struct *vma,
3850 unsigned int pages_per_huge_page) 3866 unsigned int pages_per_huge_page)
3851 { 3867 {
3852 int i; 3868 int i;
3853 3869
3854 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { 3870 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3855 copy_user_gigantic_page(dst, src, addr, vma, 3871 copy_user_gigantic_page(dst, src, addr, vma,
3856 pages_per_huge_page); 3872 pages_per_huge_page);
3857 return; 3873 return;
3858 } 3874 }
3859 3875
3860 might_sleep(); 3876 might_sleep();
3861 for (i = 0; i < pages_per_huge_page; i++) { 3877 for (i = 0; i < pages_per_huge_page; i++) {
3862 cond_resched(); 3878 cond_resched();
3863 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 3879 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
3864 } 3880 }
3865 } 3881 }
3866 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ 3882 #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
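
These helpers are consumed by the hugetlb fault paths; roughly as follows (an assumption modelled on mm/hugetlb.c, where h is the hstate of the mapping):

/* first touch of a freshly allocated huge page */
clear_huge_page(new_page, address, pages_per_huge_page(h));
__SetPageUptodate(new_page);

/* copy-on-write of an existing huge page */
copy_user_huge_page(new_page, old_page, address, vma,
                    pages_per_huge_page(h));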
3867 3883