  // SPDX-License-Identifier: GPL-2.0-only
  /*
   *  mm/userfaultfd.c
   *
   *  Copyright (C) 2015  Red Hat, Inc.
   */
  
  #include <linux/mm.h>
  #include <linux/sched/signal.h>
  #include <linux/pagemap.h>
  #include <linux/rmap.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
  #include <linux/userfaultfd_k.h>
  #include <linux/mmu_notifier.h>
  #include <linux/hugetlb.h>
  #include <linux/shmem_fs.h>
  #include <asm/tlbflush.h>
  #include "internal.h"
  static __always_inline
  struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
  				    unsigned long dst_start,
  				    unsigned long len)
  {
  	/*
  	 * Make sure that the dst range is both valid and fully within a
  	 * single existing vma.
  	 */
  	struct vm_area_struct *dst_vma;
  
  	dst_vma = find_vma(dst_mm, dst_start);
  	if (!dst_vma)
  		return NULL;
  
  	if (dst_start < dst_vma->vm_start ||
  	    dst_start + len > dst_vma->vm_end)
  		return NULL;
  
  	/*
  	 * Check the vma is registered in uffd, this is required to
  	 * enforce the VM_MAYWRITE check done at uffd registration
  	 * time.
  	 */
  	if (!dst_vma->vm_userfaultfd_ctx.ctx)
  		return NULL;
  
  	return dst_vma;
  }
  static int mcopy_atomic_pte(struct mm_struct *dst_mm,
  			    pmd_t *dst_pmd,
  			    struct vm_area_struct *dst_vma,
  			    unsigned long dst_addr,
  			    unsigned long src_addr,
  			    struct page **pagep,
  			    bool wp_copy)
  {
  	pte_t _dst_pte, *dst_pte;
  	spinlock_t *ptl;
  	void *page_kaddr;
  	int ret;
  	struct page *page;
  	pgoff_t offset, max_off;
  	struct inode *inode;

  	if (!*pagep) {
  		ret = -ENOMEM;
  		page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
  		if (!page)
  			goto out;
  
  		page_kaddr = kmap_atomic(page);
  		ret = copy_from_user(page_kaddr,
  				     (const void __user *) src_addr,
  				     PAGE_SIZE);
  		kunmap_atomic(page_kaddr);
  		/* fallback to copy_from_user outside mmap_lock */
  		if (unlikely(ret)) {
  			ret = -ENOENT;
  			*pagep = page;
  			/* don't free the page */
  			goto out;
  		}
  	} else {
  		page = *pagep;
  		*pagep = NULL;
  	}
  
  	/*
  	 * The memory barrier inside __SetPageUptodate makes sure that
  	 * preceding stores to the page contents become visible before
  	 * the set_pte_at() write.
  	 */
  	__SetPageUptodate(page);
  
  	ret = -ENOMEM;
  	if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
  		goto out_release;
  	_dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
  	if (dst_vma->vm_flags & VM_WRITE) {
  		if (wp_copy)
  			_dst_pte = pte_mkuffd_wp(_dst_pte);
  		else
  			_dst_pte = pte_mkwrite(_dst_pte);
  	}

  	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
  	if (dst_vma->vm_file) {
  		/* the shmem MAP_PRIVATE case requires checking the i_size */
  		inode = dst_vma->vm_file->f_inode;
  		offset = linear_page_index(dst_vma, dst_addr);
  		max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
  		ret = -EFAULT;
  		if (unlikely(offset >= max_off))
  			goto out_release_uncharge_unlock;
  	}
  	ret = -EEXIST;
  	if (!pte_none(*dst_pte))
  		goto out_release_uncharge_unlock;
  
  	inc_mm_counter(dst_mm, MM_ANONPAGES);
  	page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
  	lru_cache_add_inactive_or_unevictable(page, dst_vma);
  
  	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
  
  	/* No need to invalidate - it was non-present before */
  	update_mmu_cache(dst_vma, dst_addr, dst_pte);
  
  	pte_unmap_unlock(dst_pte, ptl);
  	ret = 0;
  out:
  	return ret;
  out_release_uncharge_unlock:
  	pte_unmap_unlock(dst_pte, ptl);
  out_release:
  	put_page(page);
  	goto out;
  }
  
  static int mfill_zeropage_pte(struct mm_struct *dst_mm,
  			      pmd_t *dst_pmd,
  			      struct vm_area_struct *dst_vma,
  			      unsigned long dst_addr)
  {
  	pte_t _dst_pte, *dst_pte;
  	spinlock_t *ptl;
  	int ret;
  	pgoff_t offset, max_off;
  	struct inode *inode;
  
  	_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
  					 dst_vma->vm_page_prot));
  	dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
  	if (dst_vma->vm_file) {
  		/* the shmem MAP_PRIVATE case requires checking the i_size */
  		inode = dst_vma->vm_file->f_inode;
  		offset = linear_page_index(dst_vma, dst_addr);
  		max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
  		ret = -EFAULT;
  		if (unlikely(offset >= max_off))
  			goto out_unlock;
  	}
  	ret = -EEXIST;
  	if (!pte_none(*dst_pte))
  		goto out_unlock;
  	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
  	/* No need to invalidate - it was non-present before */
  	update_mmu_cache(dst_vma, dst_addr, dst_pte);
  	ret = 0;
  out_unlock:
  	pte_unmap_unlock(dst_pte, ptl);
  	return ret;
  }
  
  static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
  {
  	pgd_t *pgd;
  	p4d_t *p4d;
  	pud_t *pud;
  
  	pgd = pgd_offset(mm, address);
  	p4d = p4d_alloc(mm, pgd, address);
  	if (!p4d)
  		return NULL;
  	pud = pud_alloc(mm, p4d, address);
  	if (!pud)
  		return NULL;
  	/*
	 * Note that we didn't run this because the pmd was
	 * missing; the *pmd may already be established and in
	 * turn it may also be a trans_huge_pmd.
  	 */
  	return pmd_alloc(mm, pud, address);
  }
  #ifdef CONFIG_HUGETLB_PAGE
  /*
   * __mcopy_atomic processing for HUGETLB vmas.  Note that this routine is
   * called with mmap_lock held, it will release mmap_lock before returning.
   */
  static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
  					      struct vm_area_struct *dst_vma,
  					      unsigned long dst_start,
  					      unsigned long src_start,
  					      unsigned long len,
  					      bool zeropage)
  {
  	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
  	int vm_shared = dst_vma->vm_flags & VM_SHARED;
  	ssize_t err;
  	pte_t *dst_pte;
  	unsigned long src_addr, dst_addr;
  	long copied;
  	struct page *page;
  	unsigned long vma_hpagesize;
  	pgoff_t idx;
  	u32 hash;
  	struct address_space *mapping;
  
  	/*
  	 * There is no default zero huge page for all huge page sizes as
	 * supported by hugetlb.  A PMD_SIZE huge page may exist as used
  	 * by THP.  Since we can not reliably insert a zero page, this
  	 * feature is not supported.
  	 */
  	if (zeropage) {
  		mmap_read_unlock(dst_mm);
  		return -EINVAL;
  	}
  
  	src_addr = src_start;
  	dst_addr = dst_start;
  	copied = 0;
  	page = NULL;
  	vma_hpagesize = vma_kernel_pagesize(dst_vma);
  
  	/*
  	 * Validate alignment based on huge page size
  	 */
  	err = -EINVAL;
  	if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
  		goto out_unlock;
  
  retry:
  	/*
  	 * On routine entry dst_vma is set.  If we had to drop mmap_lock and
  	 * retry, dst_vma will be set to NULL and we must lookup again.
  	 */
  	if (!dst_vma) {
  		err = -ENOENT;
  		dst_vma = find_dst_vma(dst_mm, dst_start, len);
  		if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
  			goto out_unlock;

  		err = -EINVAL;
  		if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
  			goto out_unlock;
  		vm_shared = dst_vma->vm_flags & VM_SHARED;
  	}
  	/*
	 * If not shared, ensure the dst_vma has an anon_vma.
  	 */
  	err = -ENOMEM;
  	if (!vm_shared) {
  		if (unlikely(anon_vma_prepare(dst_vma)))
  			goto out_unlock;
  	}

  	while (src_addr < src_start + len) {
  		pte_t dst_pteval;
  
  		BUG_ON(dst_addr >= dst_start + len);
  
  		/*
  		 * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
  		 * i_mmap_rwsem ensures the dst_pte remains valid even
  		 * in the case of shared pmds.  fault mutex prevents
  		 * races with other faulting threads.
  		 */
  		mapping = dst_vma->vm_file->f_mapping;
  		i_mmap_lock_read(mapping);
  		idx = linear_page_index(dst_vma, dst_addr);
  		hash = hugetlb_fault_mutex_hash(mapping, idx);
  		mutex_lock(&hugetlb_fault_mutex_table[hash]);
  
  		err = -ENOMEM;
  		dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize);
  		if (!dst_pte) {
  			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
  			i_mmap_unlock_read(mapping);
  			goto out_unlock;
  		}
  
  		err = -EEXIST;
  		dst_pteval = huge_ptep_get(dst_pte);
  		if (!huge_pte_none(dst_pteval)) {
  			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
  			i_mmap_unlock_read(mapping);
  			goto out_unlock;
  		}
  
  		err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
  						dst_addr, src_addr, &page);
  
  		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
  		i_mmap_unlock_read(mapping);
  		vm_alloc_shared = vm_shared;
  
  		cond_resched();
  		if (unlikely(err == -ENOENT)) {
  			mmap_read_unlock(dst_mm);
  			BUG_ON(!page);
  
  			err = copy_huge_page_from_user(page,
  						(const void __user *)src_addr,
  						vma_hpagesize / PAGE_SIZE,
  						true);
  			if (unlikely(err)) {
  				err = -EFAULT;
  				goto out;
  			}
  			mmap_read_lock(dst_mm);
  
  			dst_vma = NULL;
  			goto retry;
  		} else
  			BUG_ON(page);
  
  		if (!err) {
  			dst_addr += vma_hpagesize;
  			src_addr += vma_hpagesize;
  			copied += vma_hpagesize;
  
  			if (fatal_signal_pending(current))
  				err = -EINTR;
  		}
  		if (err)
  			break;
  	}
  
  out_unlock:
  	mmap_read_unlock(dst_mm);
  out:
  	if (page) {
  		/*
  		 * We encountered an error and are about to free a newly
  		 * allocated huge page.
  		 *
  		 * Reservation handling is very subtle, and is different for
  		 * private and shared mappings.  See the routine
  		 * restore_reserve_on_error for details.  Unfortunately, we
  		 * can not call restore_reserve_on_error now as it would
  		 * require holding mmap_lock.
  		 *
  		 * If a reservation for the page existed in the reservation
  		 * map of a private mapping, the map was modified to indicate
  		 * the reservation was consumed when the page was allocated.
  		 * We clear the PagePrivate flag now so that the global
  		 * reserve count will not be incremented in free_huge_page.
  		 * The reservation map will still indicate the reservation
  		 * was consumed and possibly prevent later page allocation.
  		 * This is better than leaking a global reservation.  If no
  		 * reservation existed, it is still safe to clear PagePrivate
  		 * as no adjustments to reservation counts were made during
  		 * allocation.
  		 *
  		 * The reservation map for shared mappings indicates which
  		 * pages have reservations.  When a huge page is allocated
  		 * for an address with a reservation, no change is made to
  		 * the reserve map.  In this case PagePrivate will be set
  		 * to indicate that the global reservation count should be
  		 * incremented when the page is freed.  This is the desired
  		 * behavior.  However, when a huge page is allocated for an
  		 * address without a reservation a reservation entry is added
  		 * to the reservation map, and PagePrivate will not be set.
  		 * When the page is freed, the global reserve count will NOT
		 * be incremented and it will appear as though we have leaked a
		 * reserved page.  In this case, set PagePrivate so that the
  		 * global reserve count will be incremented to match the
  		 * reservation map entry which was created.
  		 *
  		 * Note that vm_alloc_shared is based on the flags of the vma
  		 * for which the page was originally allocated.  dst_vma could
  		 * be different or NULL on error.
  		 */
  		if (vm_alloc_shared)
  			SetPagePrivate(page);
  		else
  			ClearPagePrivate(page);
  		put_page(page);
  	}
  	BUG_ON(copied < 0);
  	BUG_ON(err > 0);
  	BUG_ON(!copied && !err);
  	return copied ? copied : err;
  }
  #else /* !CONFIG_HUGETLB_PAGE */
  /* fail at build time if gcc attempts to use this */
  extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
  				      struct vm_area_struct *dst_vma,
  				      unsigned long dst_start,
  				      unsigned long src_start,
  				      unsigned long len,
  				      bool zeropage);
  #endif /* CONFIG_HUGETLB_PAGE */
  static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
  						pmd_t *dst_pmd,
  						struct vm_area_struct *dst_vma,
  						unsigned long dst_addr,
  						unsigned long src_addr,
  						struct page **page,
  						bool zeropage,
  						bool wp_copy)
  {
  	ssize_t err;
  	/*
  	 * The normal page fault path for a shmem will invoke the
  	 * fault, fill the hole in the file and COW it right away. The
  	 * result generates plain anonymous memory. So when we are
	 * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
  	 * generate anonymous memory directly without actually filling
  	 * the hole. For the MAP_PRIVATE case the robustness check
  	 * only happens in the pagetable (to verify it's still none)
  	 * and not in the radix tree.
  	 */
  	if (!(dst_vma->vm_flags & VM_SHARED)) {
  		if (!zeropage)
  			err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
  					       dst_addr, src_addr, page,
  					       wp_copy);
  		else
  			err = mfill_zeropage_pte(dst_mm, dst_pmd,
  						 dst_vma, dst_addr);
  	} else {
  		VM_WARN_ON_ONCE(wp_copy);
  		if (!zeropage)
  			err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
  						     dst_vma, dst_addr,
  						     src_addr, page);
  		else
  			err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
  						       dst_vma, dst_addr);
  	}
  
  	return err;
  }
  static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
  					      unsigned long dst_start,
  					      unsigned long src_start,
  					      unsigned long len,
  					      bool zeropage,
  					      bool *mmap_changing,
  					      __u64 mode)
  {
  	struct vm_area_struct *dst_vma;
  	ssize_t err;
  	pmd_t *dst_pmd;
  	unsigned long src_addr, dst_addr;
  	long copied;
  	struct page *page;
  	bool wp_copy;
  
  	/*
  	 * Sanitize the command parameters:
  	 */
  	BUG_ON(dst_start & ~PAGE_MASK);
  	BUG_ON(len & ~PAGE_MASK);
  
  	/* Does the address range wrap, or is the span zero-sized? */
  	BUG_ON(src_start + len <= src_start);
  	BUG_ON(dst_start + len <= dst_start);
  	src_addr = src_start;
  	dst_addr = dst_start;
  	copied = 0;
  	page = NULL;
  retry:
  	mmap_read_lock(dst_mm);
  
  	/*
  	 * If memory mappings are changing because of non-cooperative
  	 * operation (e.g. mremap) running in parallel, bail out and
  	 * request the user to retry later
  	 */
  	err = -EAGAIN;
  	if (mmap_changing && READ_ONCE(*mmap_changing))
  		goto out_unlock;
  
  	/*
  	 * Make sure the vma is not shared, that the dst range is
  	 * both valid and fully within a single existing vma.
  	 */
  	err = -ENOENT;
  	dst_vma = find_dst_vma(dst_mm, dst_start, len);
  	if (!dst_vma)
  		goto out_unlock;

  	err = -EINVAL;
  	/*
  	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
  	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
  	 */
  	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
  	    dst_vma->vm_flags & VM_SHARED))
  		goto out_unlock;
  	/*
  	 * validate 'mode' now that we know the dst_vma: don't allow
  	 * a wrprotect copy if the userfaultfd didn't register as WP.
  	 */
  	wp_copy = mode & UFFDIO_COPY_MODE_WP;
  	if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
  		goto out_unlock;
  
  	/*
  	 * If this is a HUGETLB vma, pass off to appropriate routine
  	 */
  	if (is_vm_hugetlb_page(dst_vma))
  		return  __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
  						src_start, len, zeropage);
  	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
  		goto out_unlock;
  
  	/*
	 * Ensure the dst_vma has an anon_vma or this page
  	 * would get a NULL anon_vma when moved in the
  	 * dst_vma.
  	 */
  	err = -ENOMEM;
  	if (!(dst_vma->vm_flags & VM_SHARED) &&
  	    unlikely(anon_vma_prepare(dst_vma)))
  		goto out_unlock;

  	while (src_addr < src_start + len) {
  		pmd_t dst_pmdval;

  		BUG_ON(dst_addr >= dst_start + len);

  		dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
  		if (unlikely(!dst_pmd)) {
  			err = -ENOMEM;
  			break;
  		}
  
  		dst_pmdval = pmd_read_atomic(dst_pmd);
  		/*
  		 * If the dst_pmd is mapped as THP don't
  		 * override it and just be strict.
  		 */
  		if (unlikely(pmd_trans_huge(dst_pmdval))) {
  			err = -EEXIST;
  			break;
  		}
  		if (unlikely(pmd_none(dst_pmdval)) &&
  		    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
  			err = -ENOMEM;
  			break;
  		}
		/* If a huge pmd materialized from under us, fail */
  		if (unlikely(pmd_trans_huge(*dst_pmd))) {
  			err = -EFAULT;
  			break;
  		}
  
  		BUG_ON(pmd_none(*dst_pmd));
  		BUG_ON(pmd_trans_huge(*dst_pmd));
  		err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
  				       src_addr, &page, zeropage, wp_copy);
  		cond_resched();
  		if (unlikely(err == -ENOENT)) {
  			void *page_kaddr;
  			mmap_read_unlock(dst_mm);
  			BUG_ON(!page);
  
  			page_kaddr = kmap(page);
  			err = copy_from_user(page_kaddr,
  					     (const void __user *) src_addr,
  					     PAGE_SIZE);
  			kunmap(page);
  			if (unlikely(err)) {
  				err = -EFAULT;
  				goto out;
  			}
  			goto retry;
  		} else
  			BUG_ON(page);
  		if (!err) {
  			dst_addr += PAGE_SIZE;
  			src_addr += PAGE_SIZE;
  			copied += PAGE_SIZE;
  
  			if (fatal_signal_pending(current))
  				err = -EINTR;
  		}
  		if (err)
  			break;
  	}
  out_unlock:
  	mmap_read_unlock(dst_mm);
  out:
  	if (page)
  		put_page(page);
  	BUG_ON(copied < 0);
  	BUG_ON(err > 0);
  	BUG_ON(!copied && !err);
  	return copied ? copied : err;
  }
  
  ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
  		     unsigned long src_start, unsigned long len,
  		     bool *mmap_changing, __u64 mode)
  {
  	return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
  			      mmap_changing, mode);
  }
  
  ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
  		       unsigned long len, bool *mmap_changing)
  {
  	return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
  }
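
/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * mcopy_atomic() and mfill_zeropage() above are reached from the
 * UFFDIO_COPY and UFFDIO_ZEROPAGE ioctls on a userfaultfd file
 * descriptor.  Assuming a uffd registered over the destination range
 * with UFFDIO_REGISTER_MODE_MISSING, and hypothetical variables uffd,
 * fault_addr, src_buf and page_size set up by the caller, a monitor
 * thread resolves a missing-page fault roughly like this:
 *
 *	struct uffdio_copy copy = {
 *		.dst  = fault_addr & ~(page_size - 1),	// page-aligned dst
 *		.src  = (unsigned long)src_buf,		// source in caller's mm
 *		.len  = page_size,
 *		.mode = 0,				// or UFFDIO_COPY_MODE_WP
 *	};
 *	if (ioctl(uffd, UFFDIO_COPY, &copy) == -1 && errno == EAGAIN) {
 *		// mmap_changing was set: a non-cooperative event (e.g.
 *		// mremap) raced with the copy, so retry later.  copy.copy
 *		// holds the bytes installed or a negated error code.
 *	}
 *
 * UFFDIO_ZEROPAGE takes a struct uffdio_zeropage (a range plus mode) and
 * ends up in mfill_zeropage(), i.e. __mcopy_atomic() with zeropage true.
 */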
  
  int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
  			unsigned long len, bool enable_wp, bool *mmap_changing)
  {
  	struct vm_area_struct *dst_vma;
  	pgprot_t newprot;
  	int err;
  
  	/*
  	 * Sanitize the command parameters:
  	 */
  	BUG_ON(start & ~PAGE_MASK);
  	BUG_ON(len & ~PAGE_MASK);
  
  	/* Does the address range wrap, or is the span zero-sized? */
  	BUG_ON(start + len <= start);
  	mmap_read_lock(dst_mm);
  
  	/*
  	 * If memory mappings are changing because of non-cooperative
  	 * operation (e.g. mremap) running in parallel, bail out and
  	 * request the user to retry later
  	 */
  	err = -EAGAIN;
  	if (mmap_changing && READ_ONCE(*mmap_changing))
  		goto out_unlock;
  
  	err = -ENOENT;
  	dst_vma = find_dst_vma(dst_mm, start, len);
  	/*
  	 * Make sure the vma is not shared, that the dst range is
  	 * both valid and fully within a single existing vma.
  	 */
  	if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
  		goto out_unlock;
  	if (!userfaultfd_wp(dst_vma))
  		goto out_unlock;
  	if (!vma_is_anonymous(dst_vma))
  		goto out_unlock;
  
  	if (enable_wp)
  		newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
  	else
  		newprot = vm_get_page_prot(dst_vma->vm_flags);
  
  	change_protection(dst_vma, start, start + len, newprot,
  			  enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
  
  	err = 0;
  out_unlock:
  	mmap_read_unlock(dst_mm);
  	return err;
  }
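
/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * mwriteprotect_range() is driven by the UFFDIO_WRITEPROTECT ioctl.  A
 * monitor that registered the range with UFFDIO_REGISTER_MODE_WP can arm
 * write protection and later resolve a write fault roughly as follows
 * (uffd, addr and len are hypothetical variables set up elsewhere):
 *
 *	struct uffdio_writeprotect wp = {
 *		.range = { .start = addr, .len = len },
 *		.mode  = UFFDIO_WRITEPROTECT_MODE_WP,	// enable_wp == true
 *	};
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);		// arm WP
 *
 *	// ... later, on a fault reported with UFFD_PAGEFAULT_FLAG_WP ...
 *	wp.mode = 0;					// enable_wp == false
 *	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);		// resolve and wake
 */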