Blame view

mm/gup.c 84 KB
457c89965   Thomas Gleixner   treewide: Add SPD...
1
  // SPDX-License-Identifier: GPL-2.0-only
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
2
3
4
5
  #include <linux/kernel.h>
  #include <linux/errno.h>
  #include <linux/err.h>
  #include <linux/spinlock.h>
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
6
  #include <linux/mm.h>
3565fce3a   Dan Williams   mm, x86: get_user...
7
  #include <linux/memremap.h>
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
8
9
10
11
  #include <linux/pagemap.h>
  #include <linux/rmap.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
174cd4b1e   Ingo Molnar   sched/headers: Pr...
12
  #include <linux/sched/signal.h>
2667f50e8   Steve Capper   mm: introduce a g...
13
  #include <linux/rwsem.h>
f30c59e92   Aneesh Kumar K.V   mm: Update generi...
14
  #include <linux/hugetlb.h>
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
15
16
17
  #include <linux/migrate.h>
  #include <linux/mm_inline.h>
  #include <linux/sched/mm.h>
1027e4436   Kirill A. Shutemov   mm: make GUP hand...
18

33a709b25   Dave Hansen   mm/gup, x86/mm/pk...
19
  #include <asm/mmu_context.h>
1027e4436   Kirill A. Shutemov   mm: make GUP hand...
20
  #include <asm/tlbflush.h>
2667f50e8   Steve Capper   mm: introduce a g...
21

4bbd4c776   Kirill A. Shutemov   mm: move get_user...
22
  #include "internal.h"
df06b37ff   Keith Busch   mm/gup: cache dev...
23
24
25
26
  /*
   * Per-lookup state handed down through the follow_*_mask() page table
   * walkers in this file.
   */
  struct follow_page_context {
  	/* Device pagemap; its address is passed to the devmap and PTE lookups. */
  	struct dev_pagemap *pgmap;
  	/* Set to HPAGE_PMD_NR - 1 once a transparent huge PMD has been followed. */
  	unsigned int page_mask;
  };
47e29d32a   John Hubbard   mm/gup: page->hpa...
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
  /*
   * Add @refs to the compound pin counter of @page.
   *
   * @page has to be a compound head for which hpage_pincount_available()
   * holds; both conditions are asserted below.
   */
  static void hpage_pincount_add(struct page *page, int refs)
  {
  	VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
  	VM_BUG_ON_PAGE(page != compound_head(page), page);
  
  	atomic_add(refs, compound_pincount_ptr(page));
  }
  
  /*
   * Subtract @refs from the compound pin counter of @page.
   *
   * @page has to be a compound head for which hpage_pincount_available()
   * holds; both conditions are asserted below.
   */
  static void hpage_pincount_sub(struct page *page, int refs)
  {
  	VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
  	VM_BUG_ON_PAGE(page != compound_head(page), page);
  
  	atomic_sub(refs, compound_pincount_ptr(page));
  }
a707cdd55   John Hubbard   mm/gup: move try_...
42
43
44
45
46
47
48
49
50
51
52
53
54
55
  /*
   * try_get_compound_head() - speculatively take @refs references on the
   * compound head of @page.
   *
   * Return: the compound head page with its refcount raised by @refs, or NULL
   * if the references could not be taken or if @page stopped belonging to that
   * head while they were being taken (in which case they are dropped again).
   */
  static inline struct page *try_get_compound_head(struct page *page, int refs)
  {
  	struct page *head = compound_head(page);
  
  	if (WARN_ON_ONCE(page_ref_count(head) < 0))
  		return NULL;
  	if (unlikely(!page_cache_add_speculative(head, refs)))
  		return NULL;
  
  	/*
  	 * The references on @head are held now, but the compound page may
  	 * have been split between the compound_head() lookup above and the
  	 * refcount increment. In that case @head has nothing to do with @page
  	 * any more, so recheck the relationship now that @head is stable.
  	 */
  	if (unlikely(compound_head(page) != head)) {
  		/* Undo the increment; only the final ref needs put_page(). */
  		if (refs > 1)
  			page_ref_sub(head, refs - 1);
  		put_page(head);
  		return NULL;
  	}
  
  	return head;
  }
3faa52c03   John Hubbard   mm/gup: track FOL...
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
  /*
   * try_grab_compound_head() - attempt to elevate a page's refcount, by a
   * flags-dependent amount.
   *
   * "grab" names in this file mean, "look at flags to decide whether to use
   * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
   *
   * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
   * same time. (That's true throughout the get_user_pages*() and
   * pin_user_pages*() APIs.) Cases:
   *
   *    FOLL_GET: page's refcount will be incremented by 1.
   *    FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
   *
   * Return: head page (with refcount appropriately incremented) for success, or
   * NULL upon failure. If neither FOLL_GET nor FOLL_PIN was set, that's
   * considered failure, and furthermore, a likely bug in the caller, so a warning
   * is also emitted.
   */
  static __maybe_unused struct page *try_grab_compound_head(struct page *page,
  							  int refs,
  							  unsigned int flags)
  {
  	if (flags & FOLL_GET)
  		return try_get_compound_head(page, refs);
  	else if (flags & FOLL_PIN) {
1970dc6f5   John Hubbard   mm/gup: /proc/vms...
82
  		int orig_refs = refs;
47e29d32a   John Hubbard   mm/gup: page->hpa...
83
  		/*
df3a0a21b   Pingfan Liu   mm/gup: fix omiss...
84
85
86
87
88
89
90
91
  		 * Can't do FOLL_LONGTERM + FOLL_PIN with CMA in the gup fast
  		 * path, so fail and let the caller fall back to the slow path.
  		 */
  		if (unlikely(flags & FOLL_LONGTERM) &&
  				is_migrate_cma_page(page))
  			return NULL;
  
  		/*
47e29d32a   John Hubbard   mm/gup: page->hpa...
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
  		 * When pinning a compound page of order > 1 (which is what
  		 * hpage_pincount_available() checks for), use an exact count to
  		 * track it, via hpage_pincount_add/_sub().
  		 *
  		 * However, be sure to *also* increment the normal page refcount
  		 * field at least once, so that the page really is pinned.
  		 */
  		if (!hpage_pincount_available(page))
  			refs *= GUP_PIN_COUNTING_BIAS;
  
  		page = try_get_compound_head(page, refs);
  		if (!page)
  			return NULL;
  
  		if (hpage_pincount_available(page))
  			hpage_pincount_add(page, refs);
1970dc6f5   John Hubbard   mm/gup: /proc/vms...
108
109
  		mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED,
  				    orig_refs);
47e29d32a   John Hubbard   mm/gup: page->hpa...
110
  		return page;
3faa52c03   John Hubbard   mm/gup: track FOL...
111
112
113
114
115
  	}
  
  	WARN_ON_ONCE(1);
  	return NULL;
  }
cfde6c181   Jason Gunthorpe   mm/gup: combine p...
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
  /*
   * put_compound_head() - drop @refs grabs from the compound head @page.
   *
   * For FOLL_PIN the release is accounted in NR_FOLL_PIN_RELEASED and @refs is
   * either removed from the dedicated pin counter or scaled by
   * GUP_PIN_COUNTING_BIAS, depending on hpage_pincount_available(). The
   * resulting number of references is then dropped from the page refcount.
   */
  static void put_compound_head(struct page *page, int refs, unsigned int flags)
  {
  	if (flags & FOLL_PIN) {
  		mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED,
  				    refs);
  
  		if (!hpage_pincount_available(page))
  			refs *= GUP_PIN_COUNTING_BIAS;
  		else
  			hpage_pincount_sub(page, refs);
  	}
  
  	VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
  
  	/*
  	 * Remove all but one reference in a single step and leave only the
  	 * final one to put_page(), rather than calling put_page() per ref.
  	 */
  	if (refs > 1)
  		page_ref_sub(page, refs - 1);
  	put_page(page);
  }
3faa52c03   John Hubbard   mm/gup: track FOL...
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
  /**
   * try_grab_page() - elevate a page's refcount by a flag-dependent amount
   * @page:    pointer to page to be grabbed
   * @flags:   gup flags: these are the FOLL_* flag values.
   *
   * This might not do anything at all, depending on the flags argument.
   *
   * "grab" names in this file mean, "look at flags to decide whether to use
   * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount".
   *
   * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
   * time. Cases:
   *
   *    FOLL_GET: page's refcount will be incremented by 1.
   *    FOLL_PIN: the head page's refcount will be incremented by
   *              GUP_PIN_COUNTING_BIAS, or, when hpage_pincount_available()
   *              holds, by 1 together with the dedicated pin counter.
   *
   * Return: true for success, or if no action was required (if neither FOLL_PIN
   * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
   * FOLL_PIN was set, but the page could not be grabbed.
   */
  bool __must_check try_grab_page(struct page *page, unsigned int flags)
  {
  	WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
  
  	if (flags & FOLL_GET)
  		return try_get_page(page);
  
  	/* Neither flag set: nothing to grab, which counts as success. */
  	if (!(flags & FOLL_PIN))
  		return true;
  
  	page = compound_head(page);
  
  	if (WARN_ON_ONCE(page_ref_count(page) <= 0))
  		return false;
  
  	/*
  	 * As in try_grab_compound_head(): even when the exact pin counter is
  	 * used, the normal page refcount is raised at least once as well, so
  	 * that the page really is pinned.
  	 */
  	if (hpage_pincount_available(page)) {
  		hpage_pincount_add(page, 1);
  		page_ref_add(page, 1);
  	} else {
  		page_ref_add(page, GUP_PIN_COUNTING_BIAS);
  	}
  
  	mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
  
  	return true;
  }
3faa52c03   John Hubbard   mm/gup: track FOL...
188
189
190
191
192
193
194
195
196
197
198
  /**
   * unpin_user_page() - release a dma-pinned page
   * @page:            pointer to page to be released
   *
   * Pages that were pinned via pin_user_pages*() must be released via either
   * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
   * that such pages can be separately tracked and uniquely handled. In
   * particular, interactions with RDMA and filesystems need special handling.
   */
  void unpin_user_page(struct page *page)
  {
cfde6c181   Jason Gunthorpe   mm/gup: combine p...
199
  	put_compound_head(compound_head(page), 1, FOLL_PIN);
3faa52c03   John Hubbard   mm/gup: track FOL...
200
201
  }
  EXPORT_SYMBOL(unpin_user_page);
fc1d8e7cc   John Hubbard   mm: introduce put...
202
  /**
f1f6a7dd9   John Hubbard   mm, tree-wide: re...
203
   * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
2d15eb31b   akpm@linux-foundation.org   mm/gup: add make_...
204
   * @pages:  array of pages to be maybe marked dirty, and definitely released.
fc1d8e7cc   John Hubbard   mm: introduce put...
205
   * @npages: number of pages in the @pages array.
2d15eb31b   akpm@linux-foundation.org   mm/gup: add make_...
206
   * @make_dirty: whether to mark the pages dirty
fc1d8e7cc   John Hubbard   mm: introduce put...
207
208
209
210
211
   *
   * "gup-pinned page" refers to a page that has had one of the get_user_pages()
   * variants called on that page.
   *
   * For each page in the @pages array, make that page (or its head page, if a
2d15eb31b   akpm@linux-foundation.org   mm/gup: add make_...
212
   * compound page) dirty, if @make_dirty is true, and if the page was previously
f1f6a7dd9   John Hubbard   mm, tree-wide: re...
213
214
   * listed as clean. In any case, releases all pages using unpin_user_page(),
   * possibly via unpin_user_pages(), for the non-dirty case.
fc1d8e7cc   John Hubbard   mm: introduce put...
215
   *
f1f6a7dd9   John Hubbard   mm, tree-wide: re...
216
   * Please see the unpin_user_page() documentation for details.
fc1d8e7cc   John Hubbard   mm: introduce put...
217
   *
2d15eb31b   akpm@linux-foundation.org   mm/gup: add make_...
218
219
220
   * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
   * required, then the caller should a) verify that this is really correct,
   * because _lock() is usually required, and b) hand code it:
f1f6a7dd9   John Hubbard   mm, tree-wide: re...
221
   * set_page_dirty_lock(), unpin_user_page().
fc1d8e7cc   John Hubbard   mm: introduce put...
222
223
   *
   */
f1f6a7dd9   John Hubbard   mm, tree-wide: re...
224
225
  void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
  				 bool make_dirty)
fc1d8e7cc   John Hubbard   mm: introduce put...
226
  {
2d15eb31b   akpm@linux-foundation.org   mm/gup: add make_...
227
  	unsigned long index;
fc1d8e7cc   John Hubbard   mm: introduce put...
228

2d15eb31b   akpm@linux-foundation.org   mm/gup: add make_...
229
230
231
232
233
234
235
  	/*
  	 * TODO: this can be optimized for huge pages: if a series of pages is
  	 * physically contiguous and part of the same compound page, then a
  	 * single operation to the head page should suffice.
  	 */
  
  	if (!make_dirty) {
f1f6a7dd9   John Hubbard   mm, tree-wide: re...
236
  		unpin_user_pages(pages, npages);
2d15eb31b   akpm@linux-foundation.org   mm/gup: add make_...
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
  		return;
  	}
  
  	for (index = 0; index < npages; index++) {
  		struct page *page = compound_head(pages[index]);
  		/*
  		 * Checking PageDirty at this point may race with
  		 * clear_page_dirty_for_io(), but that's OK. Two key
  		 * cases:
  		 *
  		 * 1) This code sees the page as already dirty, so it
  		 * skips the call to set_page_dirty(). That could happen
  		 * because clear_page_dirty_for_io() called
  		 * page_mkclean(), followed by set_page_dirty().
  		 * However, now the page is going to get written back,
  		 * which meets the original intention of setting it
  		 * dirty, so all is well: clear_page_dirty_for_io() goes
  		 * on to call TestClearPageDirty(), and write the page
  		 * back.
  		 *
  		 * 2) This code sees the page as clean, so it calls
  		 * set_page_dirty(). The page stays dirty, despite being
  		 * written back, so it gets written back again in the
  		 * next writeback cycle. This is harmless.
  		 */
  		if (!PageDirty(page))
  			set_page_dirty_lock(page);
f1f6a7dd9   John Hubbard   mm, tree-wide: re...
264
  		unpin_user_page(page);
2d15eb31b   akpm@linux-foundation.org   mm/gup: add make_...
265
  	}
fc1d8e7cc   John Hubbard   mm: introduce put...
266
  }
f1f6a7dd9   John Hubbard   mm, tree-wide: re...
267
  EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
fc1d8e7cc   John Hubbard   mm: introduce put...
268
269
  
  /**
   * unpin_user_pages() - release an array of gup-pinned pages.
   * @pages:  array of pages to be released.
   * @npages: number of pages in the @pages array.
   *
   * For each page in the @pages array, release the page using unpin_user_page().
   *
   * Please see the unpin_user_page() documentation for details.
   */
  void unpin_user_pages(struct page **pages, unsigned long npages)
  {
  	unsigned long i = 0;
  
  	/*
  	 * A count in the -ERRNO range most likely means that gup/pup returned
  	 * a hard error which the caller erroneously passed on to us. Pages
  	 * *might* be left pinned (leaked) if this fires, but probably not.
  	 */
  	if (WARN_ON(IS_ERR_VALUE(npages)))
  		return;
  
  	/*
  	 * TODO: this can be optimized for huge pages: if a series of pages is
  	 * physically contiguous and part of the same compound page, then a
  	 * single operation to the head page should suffice.
  	 */
  	while (i < npages)
  		unpin_user_page(pages[i++]);
  }
  EXPORT_SYMBOL(unpin_user_pages);
fc1d8e7cc   John Hubbard   mm: introduce put...
298

050a9adc6   Christoph Hellwig   mm: consolidate t...
299
  #ifdef CONFIG_MMU
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
300
301
  static struct page *no_page_table(struct vm_area_struct *vma,
  		unsigned int flags)
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
302
  {
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
303
304
305
306
307
308
309
310
  	/*
  	 * When core dumping an enormous anonymous area that nobody
  	 * has touched so far, we don't want to allocate unnecessary pages or
  	 * page tables.  Return error instead of NULL to skip handle_mm_fault,
  	 * then get_dump_page() will return NULL to leave a hole in the dump.
  	 * But we can only make this optimization where a hole would surely
  	 * be zero-filled if handle_mm_fault() actually did handle it.
  	 */
a0137f16d   Anshuman Khandual   mm/vma: replace a...
311
312
  	if ((flags & FOLL_DUMP) &&
  			(vma_is_anonymous(vma) || !vma->vm_ops->fault))
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
313
314
315
  		return ERR_PTR(-EFAULT);
  	return NULL;
  }
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
316

1027e4436   Kirill A. Shutemov   mm: make GUP hand...
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
  /*
   * follow_pfn_pte() - handle a PTE that has no struct page behind it.
   *
   * With FOLL_TOUCH the entry is marked young (and dirty for FOLL_WRITE) and
   * written back if that changed it.
   *
   * Return: -EFAULT if FOLL_GET was requested, -EEXIST otherwise.
   *
   * NOTE(review): FOLL_PIN is not rejected here although it needs a struct
   * page just as FOLL_GET does -- confirm that callers cope with -EEXIST for
   * pin requests.
   */
  static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
  		pte_t *pte, unsigned int flags)
  {
  	pte_t entry;
  
  	/* There is no page that a reference could be taken on. */
  	if (flags & FOLL_GET)
  		return -EFAULT;
  
  	/* A proper page table entry exists, but no corresponding struct page. */
  	if (!(flags & FOLL_TOUCH))
  		return -EEXIST;
  
  	entry = *pte;
  	if (flags & FOLL_WRITE)
  		entry = pte_mkdirty(entry);
  	entry = pte_mkyoung(entry);
  
  	if (pte_same(*pte, entry))
  		return -EEXIST;
  
  	set_pte_at(vma->vm_mm, address, pte, entry);
  	update_mmu_cache(vma, address, pte);
  
  	return -EEXIST;
  }
19be0eaff   Linus Torvalds   mm: remove gup_fl...
340
  /*
   * A writable pte can always be followed for write. FOLL_FORCE may also
   * write through an unwritable pte, but only once a COW cycle has been gone
   * through (FOLL_COW) and the pte is dirty.
   */
  static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
  {
  	if (pte_write(pte))
  		return true;
  
  	if (!(flags & FOLL_FORCE) || !(flags & FOLL_COW))
  		return false;
  
  	return pte_dirty(pte);
  }
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
349
  /*
   * follow_page_pte() - look up the page mapped by the PTE for @address.
   *
   * Maps and locks the PTE below @pmd and resolves it according to @flags:
   *  - a non-present PTE is only waited on when it is a migration entry and
   *    FOLL_MIGRATION is set (the lookup is then retried); otherwise the
   *    lookup ends at the no_page label;
   *  - a FOLL_WRITE lookup that can_follow_write_pte() rejects returns NULL;
   *  - a devmap PTE is only resolved for FOLL_GET/FOLL_PIN, with the pagemap
   *    obtained from get_dev_pagemap() stored in *@pgmap;
   *  - FOLL_SPLIT on a transparent compound page splits it and retries.
   *
   * The page is grabbed through try_grab_page() before the FOLL_TOUCH and
   * FOLL_MLOCK handling runs. Every return taken after the PTE lock was
   * acquired releases it first.
   *
   * Return: the page, NULL, or an ERR_PTR() value.
   */
  static struct page *follow_page_pte(struct vm_area_struct *vma,
df06b37ff   Keith Busch   mm/gup: cache dev...
350
351
  		unsigned long address, pmd_t *pmd, unsigned int flags,
  		struct dev_pagemap **pgmap)
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
352
353
354
355
356
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct page *page;
  	spinlock_t *ptl;
  	pte_t *ptep, pte;
f28d43636   Claudio Imbrenda   mm/gup/writeback:...
357
  	int ret;
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
358

eddb1c228   John Hubbard   mm/gup: introduce...
359
360
361
362
  	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
  	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
  			 (FOLL_PIN | FOLL_GET)))
  		return ERR_PTR(-EINVAL);
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
363
  retry:
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
364
  	if (unlikely(pmd_bad(*pmd)))
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
365
  		return no_page_table(vma, flags);
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
366
367
  
  	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
368
369
370
371
372
373
374
375
376
377
  	pte = *ptep;
  	if (!pte_present(pte)) {
  		swp_entry_t entry;
  		/*
  		 * KSM's break_ksm() relies upon recognizing a ksm page
  		 * even while it is being migrated, so for that case we
  		 * need migration_entry_wait().
  		 */
  		if (likely(!(flags & FOLL_MIGRATION)))
  			goto no_page;
0661a3361   Kirill A. Shutemov   mm: remove rest u...
378
  		if (pte_none(pte))
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
379
380
381
382
383
384
  			goto no_page;
  		entry = pte_to_swp_entry(pte);
  		if (!is_migration_entry(entry))
  			goto no_page;
  		pte_unmap_unlock(ptep, ptl);
  		migration_entry_wait(mm, pmd, address);
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
385
  		goto retry;
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
386
  	}
8a0516ed8   Mel Gorman   mm: convert p[te|...
387
  	if ((flags & FOLL_NUMA) && pte_protnone(pte))
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
388
  		goto no_page;
19be0eaff   Linus Torvalds   mm: remove gup_fl...
389
  	if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
390
391
392
  		pte_unmap_unlock(ptep, ptl);
  		return NULL;
  	}
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
393
394
  
  	page = vm_normal_page(vma, address, pte);
3faa52c03   John Hubbard   mm/gup: track FOL...
395
  	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
3565fce3a   Dan Williams   mm, x86: get_user...
396
  		/*
3faa52c03   John Hubbard   mm/gup: track FOL...
397
398
399
  		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
  		 * case since they are only valid while holding the pgmap
  		 * reference.
3565fce3a   Dan Williams   mm, x86: get_user...
400
  		 */
df06b37ff   Keith Busch   mm/gup: cache dev...
401
402
  		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
  		if (*pgmap)
3565fce3a   Dan Williams   mm, x86: get_user...
403
404
405
406
  			page = pte_page(pte);
  		else
  			goto no_page;
  	} else if (unlikely(!page)) {
1027e4436   Kirill A. Shutemov   mm: make GUP hand...
407
408
409
410
411
412
413
414
415
  		if (flags & FOLL_DUMP) {
  			/* Avoid special (like zero) pages in core dumps */
  			page = ERR_PTR(-EFAULT);
  			goto out;
  		}
  
  		if (is_zero_pfn(pte_pfn(pte))) {
  			page = pte_page(pte);
  		} else {
1027e4436   Kirill A. Shutemov   mm: make GUP hand...
416
417
418
419
  			ret = follow_pfn_pte(vma, address, ptep, flags);
  			page = ERR_PTR(ret);
  			goto out;
  		}
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
420
  	}
6742d293c   Kirill A. Shutemov   mm: adjust FOLL_S...
421
  	if (flags & FOLL_SPLIT && PageTransCompound(page)) {
6742d293c   Kirill A. Shutemov   mm: adjust FOLL_S...
422
423
424
425
426
427
428
429
430
431
  		/*
  		 * Hold a reference across the unlock so the page can be
  		 * locked and split outside the PTE lock.
  		 */
  		get_page(page);
  		pte_unmap_unlock(ptep, ptl);
  		lock_page(page);
  		ret = split_huge_page(page);
  		unlock_page(page);
  		put_page(page);
  		if (ret)
  			return ERR_PTR(ret);
  		goto retry;
  	}
3faa52c03   John Hubbard   mm/gup: track FOL...
432
433
434
435
  	/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
  	if (unlikely(!try_grab_page(page, flags))) {
  		page = ERR_PTR(-ENOMEM);
  		goto out;
8fde12ca7   Linus Torvalds   mm: prevent get_u...
436
  	}
f28d43636   Claudio Imbrenda   mm/gup/writeback:...
437
438
439
440
441
442
443
444
445
446
447
448
449
  	/*
  	 * We need to make the page accessible if and only if we are going
  	 * to access its content (the FOLL_PIN case).  Please see
  	 * Documentation/core-api/pin_user_pages.rst for details.
  	 */
  	if (flags & FOLL_PIN) {
  		ret = arch_make_page_accessible(page);
  		if (ret) {
  			/* Drop the pin taken by try_grab_page() above. */
  			unpin_user_page(page);
  			page = ERR_PTR(ret);
  			goto out;
  		}
  	}
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
450
451
452
453
454
455
456
457
458
459
460
  	if (flags & FOLL_TOUCH) {
  		if ((flags & FOLL_WRITE) &&
  		    !pte_dirty(pte) && !PageDirty(page))
  			set_page_dirty(page);
  		/*
  		 * pte_mkyoung() would be more correct here, but atomic care
  		 * is needed to avoid losing the dirty bit: it is easier to use
  		 * mark_page_accessed().
  		 */
  		mark_page_accessed(page);
  	}
de60f5f10   Eric B Munson   mm: introduce VM_...
461
  	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
e90309c9f   Kirill A. Shutemov   thp: allow mlocke...
462
463
464
  		/* Do not mlock pte-mapped THP */
  		if (PageTransCompound(page))
  			goto out;
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
  		/*
  		 * The preliminary mapping check is mainly to avoid the
  		 * pointless overhead of lock_page on the ZERO_PAGE
  		 * which might bounce very badly if there is contention.
  		 *
  		 * If the page is already locked, we don't need to
  		 * handle it now - vmscan will handle it later if and
  		 * when it attempts to reclaim the page.
  		 */
  		if (page->mapping && trylock_page(page)) {
  			lru_add_drain();  /* push cached pages to LRU */
  			/*
  			 * Because we lock page here, and migration is
  			 * blocked by the pte's page reference, and we
  			 * know the page is still mapped, we don't even
  			 * need to check for file-cache page truncation.
  			 */
  			mlock_vma_page(page);
  			unlock_page(page);
  		}
  	}
1027e4436   Kirill A. Shutemov   mm: make GUP hand...
486
  out:
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
487
  	pte_unmap_unlock(ptep, ptl);
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
488
  	return page;
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
489
490
491
  no_page:
  	pte_unmap_unlock(ptep, ptl);
  	/* A populated entry yields NULL; an empty one defers to no_page_table(). */
  	if (!pte_none(pte))
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
492
493
494
  		return NULL;
  	return no_page_table(vma, flags);
  }
080dbb618   Aneesh Kumar K.V   mm/follow_page_ma...
495
496
  /*
   * follow_pmd_mask() - resolve @address at the PMD level.
   *
   * Takes one READ_ONCE() snapshot of the PMD below @pudp and dispatches on it:
   * hugetlb PMDs go to follow_huge_pmd(), hugepd entries to follow_huge_pd(),
   * devmap PMDs to follow_devmap_pmd() (under pmd_lock()), and PMDs that are
   * not transparent huge to follow_page_pte(). A non-present PMD is only waited
   * on when FOLL_MIGRATION is set.
   *
   * A transparent huge PMD is re-examined under pmd_lock() and is then either
   * split (FOLL_SPLIT / FOLL_SPLIT_PMD) and followed at PTE level, or resolved
   * by follow_trans_huge_pmd(), in which case ctx->page_mask is set to
   * HPAGE_PMD_NR - 1.
   *
   * Return: the page, NULL, or an ERR_PTR() value.
   */
  static struct page *follow_pmd_mask(struct vm_area_struct *vma,
  				    unsigned long address, pud_t *pudp,
df06b37ff   Keith Busch   mm/gup: cache dev...
497
498
  				    unsigned int flags,
  				    struct follow_page_context *ctx)
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
499
  {
688272809   Huang Ying   mm, gup: prevent ...
500
  	pmd_t *pmd, pmdval;
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
501
502
503
  	spinlock_t *ptl;
  	struct page *page;
  	struct mm_struct *mm = vma->vm_mm;
080dbb618   Aneesh Kumar K.V   mm/follow_page_ma...
504
  	pmd = pmd_offset(pudp, address);
688272809   Huang Ying   mm, gup: prevent ...
505
506
507
508
509
510
  	/*
  	 * The READ_ONCE() will stabilize the pmdval in a register or
  	 * on the stack so that it will stop changing under the code.
  	 */
  	pmdval = READ_ONCE(*pmd);
  	if (pmd_none(pmdval))
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
511
  		return no_page_table(vma, flags);
be9d30458   Wei Yang   mm/gup.c: use is_...
512
  	if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
e66f17ff7   Naoya Horiguchi   mm/hugetlb: take ...
513
514
515
516
  		page = follow_huge_pmd(mm, address, pmd, flags);
  		if (page)
  			return page;
  		return no_page_table(vma, flags);
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
517
  	}
688272809   Huang Ying   mm, gup: prevent ...
518
  	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
4dc71451a   Aneesh Kumar K.V   mm/follow_page_ma...
519
  		page = follow_huge_pd(vma, address,
688272809   Huang Ying   mm, gup: prevent ...
520
  				      __hugepd(pmd_val(pmdval)), flags,
4dc71451a   Aneesh Kumar K.V   mm/follow_page_ma...
521
522
523
524
525
  				      PMD_SHIFT);
  		if (page)
  			return page;
  		return no_page_table(vma, flags);
  	}
84c3fc4e9   Zi Yan   mm: thp: check pm...
526
  retry:
688272809   Huang Ying   mm, gup: prevent ...
527
  	if (!pmd_present(pmdval)) {
84c3fc4e9   Zi Yan   mm: thp: check pm...
528
529
530
  		if (likely(!(flags & FOLL_MIGRATION)))
  			return no_page_table(vma, flags);
  		VM_BUG_ON(thp_migration_supported() &&
688272809   Huang Ying   mm, gup: prevent ...
531
532
  				  !is_pmd_migration_entry(pmdval));
  		if (is_pmd_migration_entry(pmdval))
84c3fc4e9   Zi Yan   mm: thp: check pm...
533
  			pmd_migration_entry_wait(mm, pmd);
688272809   Huang Ying   mm, gup: prevent ...
534
535
536
  		/* Take a fresh snapshot before re-evaluating the entry. */
  		pmdval = READ_ONCE(*pmd);
  		/*
  		 * MADV_DONTNEED may convert the pmd to null because
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
537
  		 * mmap_lock is held in read mode
688272809   Huang Ying   mm, gup: prevent ...
538
539
540
  		 */
  		if (pmd_none(pmdval))
  			return no_page_table(vma, flags);
84c3fc4e9   Zi Yan   mm: thp: check pm...
541
542
  		goto retry;
  	}
688272809   Huang Ying   mm, gup: prevent ...
543
  	if (pmd_devmap(pmdval)) {
3565fce3a   Dan Williams   mm, x86: get_user...
544
  		ptl = pmd_lock(mm, pmd);
df06b37ff   Keith Busch   mm/gup: cache dev...
545
  		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
3565fce3a   Dan Williams   mm, x86: get_user...
546
547
548
549
  		spin_unlock(ptl);
  		if (page)
  			return page;
  	}
688272809   Huang Ying   mm, gup: prevent ...
550
  	if (likely(!pmd_trans_huge(pmdval)))
df06b37ff   Keith Busch   mm/gup: cache dev...
551
  		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
6742d293c   Kirill A. Shutemov   mm: adjust FOLL_S...
552

688272809   Huang Ying   mm, gup: prevent ...
553
  	if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
db08f2030   Aneesh Kumar K.V   mm/gup: check for...
554
  		return no_page_table(vma, flags);
84c3fc4e9   Zi Yan   mm: thp: check pm...
555
  retry_locked:
6742d293c   Kirill A. Shutemov   mm: adjust FOLL_S...
556
  	/* From here on the live *pmd is re-checked under the PMD lock. */
  	ptl = pmd_lock(mm, pmd);
688272809   Huang Ying   mm, gup: prevent ...
557
558
559
560
  	if (unlikely(pmd_none(*pmd))) {
  		spin_unlock(ptl);
  		return no_page_table(vma, flags);
  	}
84c3fc4e9   Zi Yan   mm: thp: check pm...
561
562
563
564
565
566
567
  	if (unlikely(!pmd_present(*pmd))) {
  		spin_unlock(ptl);
  		if (likely(!(flags & FOLL_MIGRATION)))
  			return no_page_table(vma, flags);
  		pmd_migration_entry_wait(mm, pmd);
  		goto retry_locked;
  	}
6742d293c   Kirill A. Shutemov   mm: adjust FOLL_S...
568
569
  	if (unlikely(!pmd_trans_huge(*pmd))) {
  		spin_unlock(ptl);
df06b37ff   Keith Busch   mm/gup: cache dev...
570
  		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
6742d293c   Kirill A. Shutemov   mm: adjust FOLL_S...
571
  	}
bfe7b00de   Song Liu   mm, thp: introduc...
572
  	if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
6742d293c   Kirill A. Shutemov   mm: adjust FOLL_S...
573
574
575
576
577
  		int ret;
  		page = pmd_page(*pmd);
  		if (is_huge_zero_page(page)) {
  			spin_unlock(ptl);
  			ret = 0;
78ddc5347   Kirill A. Shutemov   thp: rename split...
578
  			split_huge_pmd(vma, pmd, address);
337d9abf1   Naoya Horiguchi   mm: thp: check pm...
579
580
  			if (pmd_trans_unstable(pmd))
  				ret = -EBUSY;
bfe7b00de   Song Liu   mm, thp: introduc...
581
  		} else if (flags & FOLL_SPLIT) {
8fde12ca7   Linus Torvalds   mm: prevent get_u...
582
583
584
585
  			if (unlikely(!try_get_page(page))) {
  				spin_unlock(ptl);
  				return ERR_PTR(-ENOMEM);
  			}
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
586
  			spin_unlock(ptl);
6742d293c   Kirill A. Shutemov   mm: adjust FOLL_S...
587
588
589
590
  			lock_page(page);
  			ret = split_huge_page(page);
  			unlock_page(page);
  			put_page(page);
baa355fd3   Kirill A. Shutemov   thp: file pages s...
591
592
  			/* This early return does not consult ret. */
  			if (pmd_none(*pmd))
  				return no_page_table(vma, flags);
bfe7b00de   Song Liu   mm, thp: introduc...
593
594
595
596
  		} else {  /* flags & FOLL_SPLIT_PMD */
  			spin_unlock(ptl);
  			split_huge_pmd(vma, pmd, address);
  			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
6742d293c   Kirill A. Shutemov   mm: adjust FOLL_S...
597
598
599
  		}
  
  		return ret ? ERR_PTR(ret) :
df06b37ff   Keith Busch   mm/gup: cache dev...
600
  			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
69e68b4f0   Kirill A. Shutemov   mm: cleanup follo...
601
  	}
6742d293c   Kirill A. Shutemov   mm: adjust FOLL_S...
602
603
  	page = follow_trans_huge_pmd(vma, address, pmd, flags);
  	spin_unlock(ptl);
df06b37ff   Keith Busch   mm/gup: cache dev...
604
  	ctx->page_mask = HPAGE_PMD_NR - 1;
6742d293c   Kirill A. Shutemov   mm: adjust FOLL_S...
605
  	return page;
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
606
  }
080dbb618   Aneesh Kumar K.V   mm/follow_page_ma...
607
608
  /*
   * follow_pud_mask() - resolve @address at the PUD level.
   *
   * Hugetlb PUDs are handed to follow_huge_pud() and hugepd entries to
   * follow_huge_pd(); for both, a NULL result falls back to no_page_table().
   * A devmap PUD is tried through follow_devmap_pud() under pud_lock(). In all
   * remaining cases the walk continues in follow_pmd_mask(), unless the PUD is
   * bad.
   *
   * Return: the page, NULL, or an ERR_PTR() value.
   */
  static struct page *follow_pud_mask(struct vm_area_struct *vma,
  				    unsigned long address, p4d_t *p4dp,
  				    unsigned int flags,
  				    struct follow_page_context *ctx)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	pud_t *pud = pud_offset(p4dp, address);
  	struct page *page;
  	spinlock_t *ptl;
  
  	if (pud_none(*pud))
  		return no_page_table(vma, flags);
  
  	if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
  		page = follow_huge_pud(mm, address, pud, flags);
  		return page ? page : no_page_table(vma, flags);
  	}
  
  	if (is_hugepd(__hugepd(pud_val(*pud)))) {
  		page = follow_huge_pd(vma, address,
  				      __hugepd(pud_val(*pud)), flags,
  				      PUD_SHIFT);
  		return page ? page : no_page_table(vma, flags);
  	}
  
  	if (pud_devmap(*pud)) {
  		ptl = pud_lock(mm, pud);
  		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
  		spin_unlock(ptl);
  		/* A NULL result falls through to the regular walk below. */
  		if (page)
  			return page;
  	}
  
  	if (unlikely(pud_bad(*pud)))
  		return no_page_table(vma, flags);
  
  	return follow_pmd_mask(vma, address, pud, flags, ctx);
  }
080dbb618   Aneesh Kumar K.V   mm/follow_page_ma...
645
646
  /*
   * P4D-level step of the follow_page_mask() walk.
   *
   * An empty or bad P4D entry is reported through no_page_table().  A hugepd
   * entry is resolved by follow_huge_pd(); every other entry is passed down
   * to follow_pud_mask().
   */
  static struct page *follow_p4d_mask(struct vm_area_struct *vma,
  				    unsigned long address, pgd_t *pgdp,
  				    unsigned int flags,
  				    struct follow_page_context *ctx)
  {
  	p4d_t *p4d = p4d_offset(pgdp, address);
  
  	if (p4d_none(*p4d))
  		return no_page_table(vma, flags);
  	BUILD_BUG_ON(p4d_huge(*p4d));
  	if (unlikely(p4d_bad(*p4d)))
  		return no_page_table(vma, flags);
  
  	if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
  		struct page *page;
  
  		page = follow_huge_pd(vma, address,
  				      __hugepd(p4d_val(*p4d)), flags,
  				      P4D_SHIFT);
  		return page ? page : no_page_table(vma, flags);
  	}
  
  	return follow_pud_mask(vma, address, p4d, flags, ctx);
  }
  
  /**
   * follow_page_mask - look up a page descriptor from a user-virtual address
   * @vma: vm_area_struct mapping @address
   * @address: virtual address to look up
   * @flags: flags modifying lookup behaviour
78179556e   Mike Rapoport   mm/gup.c: fix fol...
675
676
   * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
   *       pointer to output page_mask
080dbb618   Aneesh Kumar K.V   mm/follow_page_ma...
677
678
679
   *
   * @flags can have FOLL_ flags set, defined in <linux/mm.h>
   *
78179556e   Mike Rapoport   mm/gup.c: fix fol...
680
681
682
683
684
685
   * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
   * the device's dev_pagemap metadata to avoid repeating expensive lookups.
   *
   * On output, the @ctx->page_mask is set according to the size of the page.
   *
   * Return: the mapped (struct page *), %NULL if no mapping exists, or
080dbb618   Aneesh Kumar K.V   mm/follow_page_ma...
686
687
688
   * an error pointer if there is a mapping to something not represented
   * by a page descriptor (see also vm_normal_page()).
   */
a7030aea2   Bharath Vedartham   mm/gup.c: make fo...
689
  static struct page *follow_page_mask(struct vm_area_struct *vma,
080dbb618   Aneesh Kumar K.V   mm/follow_page_ma...
690
  			      unsigned long address, unsigned int flags,
df06b37ff   Keith Busch   mm/gup: cache dev...
691
  			      struct follow_page_context *ctx)
080dbb618   Aneesh Kumar K.V   mm/follow_page_ma...
692
693
694
695
  {
  	pgd_t *pgd;
  	struct page *page;
  	struct mm_struct *mm = vma->vm_mm;
df06b37ff   Keith Busch   mm/gup: cache dev...
696
  	ctx->page_mask = 0;
080dbb618   Aneesh Kumar K.V   mm/follow_page_ma...
697
698
699
700
  
  	/* make this handle hugepd */
  	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
  	if (!IS_ERR(page)) {
3faa52c03   John Hubbard   mm/gup: track FOL...
701
  		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
080dbb618   Aneesh Kumar K.V   mm/follow_page_ma...
702
703
704
705
706
707
708
  		return page;
  	}
  
  	pgd = pgd_offset(mm, address);
  
  	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
  		return no_page_table(vma, flags);
faaa5b62d   Anshuman Khandual   mm/follow_page_ma...
709
710
711
712
713
714
  	if (pgd_huge(*pgd)) {
  		page = follow_huge_pgd(mm, address, pgd, flags);
  		if (page)
  			return page;
  		return no_page_table(vma, flags);
  	}
4dc71451a   Aneesh Kumar K.V   mm/follow_page_ma...
715
716
717
718
719
720
721
722
  	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
  		page = follow_huge_pd(vma, address,
  				      __hugepd(pgd_val(*pgd)), flags,
  				      PGDIR_SHIFT);
  		if (page)
  			return page;
  		return no_page_table(vma, flags);
  	}
faaa5b62d   Anshuman Khandual   mm/follow_page_ma...
723

df06b37ff   Keith Busch   mm/gup: cache dev...
724
725
726
727
728
729
730
731
732
733
734
735
736
  	return follow_p4d_mask(vma, address, pgd, flags, ctx);
  }
  
  /*
   * Single-address wrapper around follow_page_mask(): runs the lookup with a
   * private context and drops the dev_pagemap reference the lookup may have
   * cached in it before returning the result.
   */
  struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
  			 unsigned int foll_flags)
  {
  	struct follow_page_context ctx = { NULL };
  	struct page *found = follow_page_mask(vma, address, foll_flags, &ctx);
  
  	if (ctx.pgmap)
  		put_dev_pagemap(ctx.pgmap);
  
  	return found;
  }
f2b495ca8   Kirill A. Shutemov   mm: extract in_ga...
738
739
740
741
742
  static int get_gate_page(struct mm_struct *mm, unsigned long address,
  		unsigned int gup_flags, struct vm_area_struct **vma,
  		struct page **page)
  {
  	pgd_t *pgd;
c2febafc6   Kirill A. Shutemov   mm: convert gener...
743
  	p4d_t *p4d;
f2b495ca8   Kirill A. Shutemov   mm: extract in_ga...
744
745
746
747
748
749
750
751
752
753
754
755
  	pud_t *pud;
  	pmd_t *pmd;
  	pte_t *pte;
  	int ret = -EFAULT;
  
  	/* user gate pages are read-only */
  	if (gup_flags & FOLL_WRITE)
  		return -EFAULT;
  	if (address > TASK_SIZE)
  		pgd = pgd_offset_k(address);
  	else
  		pgd = pgd_offset_gate(mm, address);
b5d1c39f3   Andy Lutomirski   mm/gup.c: remove ...
756
757
  	if (pgd_none(*pgd))
  		return -EFAULT;
c2febafc6   Kirill A. Shutemov   mm: convert gener...
758
  	p4d = p4d_offset(pgd, address);
b5d1c39f3   Andy Lutomirski   mm/gup.c: remove ...
759
760
  	if (p4d_none(*p4d))
  		return -EFAULT;
c2febafc6   Kirill A. Shutemov   mm: convert gener...
761
  	pud = pud_offset(p4d, address);
b5d1c39f3   Andy Lutomirski   mm/gup.c: remove ...
762
763
  	if (pud_none(*pud))
  		return -EFAULT;
f2b495ca8   Kirill A. Shutemov   mm: extract in_ga...
764
  	pmd = pmd_offset(pud, address);
84c3fc4e9   Zi Yan   mm: thp: check pm...
765
  	if (!pmd_present(*pmd))
f2b495ca8   Kirill A. Shutemov   mm: extract in_ga...
766
767
768
769
770
771
772
773
774
775
776
777
778
779
  		return -EFAULT;
  	VM_BUG_ON(pmd_trans_huge(*pmd));
  	pte = pte_offset_map(pmd, address);
  	if (pte_none(*pte))
  		goto unmap;
  	*vma = get_gate_vma(mm);
  	if (!page)
  		goto out;
  	*page = vm_normal_page(*vma, address, *pte);
  	if (!*page) {
  		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
  			goto unmap;
  		*page = pte_page(*pte);
  	}
9fa2dd946   Dave Hansen   mm: fix pin vs. g...
780
  	if (unlikely(!try_grab_page(*page, gup_flags))) {
8fde12ca7   Linus Torvalds   mm: prevent get_u...
781
782
783
  		ret = -ENOMEM;
  		goto unmap;
  	}
f2b495ca8   Kirill A. Shutemov   mm: extract in_ga...
784
785
786
787
788
789
  out:
  	ret = 0;
  unmap:
  	pte_unmap(pte);
  	return ret;
  }
9a95f3cf7   Paul Cassella   mm: describe mmap...
790
  /*
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
791
792
   * mmap_lock must be held on entry.  If @locked != NULL and *@flags
   * does not include FOLL_NOWAIT, the mmap_lock may be released.  If it
4f6da9341   Peter Xu   mm/gup: rename "n...
793
   * is, *@locked will be set to 0 and -EBUSY returned.
9a95f3cf7   Paul Cassella   mm: describe mmap...
794
   */
64019a2e4   Peter Xu   mm/gup: remove ta...
795
  static int faultin_page(struct vm_area_struct *vma,
4f6da9341   Peter Xu   mm/gup: rename "n...
796
  		unsigned long address, unsigned int *flags, int *locked)
167444834   Kirill A. Shutemov   mm: extract code ...
797
  {
167444834   Kirill A. Shutemov   mm: extract code ...
798
  	unsigned int fault_flags = 0;
2b7403035   Souptick Joarder   mm: Change return...
799
  	vm_fault_t ret;
167444834   Kirill A. Shutemov   mm: extract code ...
800

de60f5f10   Eric B Munson   mm: introduce VM_...
801
802
803
  	/* mlock all present pages, but do not fault in new pages */
  	if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
  		return -ENOENT;
167444834   Kirill A. Shutemov   mm: extract code ...
804
805
  	if (*flags & FOLL_WRITE)
  		fault_flags |= FAULT_FLAG_WRITE;
1b2ee1266   Dave Hansen   mm/core: Do not e...
806
807
  	if (*flags & FOLL_REMOTE)
  		fault_flags |= FAULT_FLAG_REMOTE;
4f6da9341   Peter Xu   mm/gup: rename "n...
808
  	if (locked)
71335f37c   Peter Xu   mm/gup: allow to ...
809
  		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
167444834   Kirill A. Shutemov   mm: extract code ...
810
811
  	if (*flags & FOLL_NOWAIT)
  		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
234b239be   Andres Lagar-Cavilla   kvm: Faults which...
812
  	if (*flags & FOLL_TRIED) {
4426e945d   Peter Xu   mm/gup: allow VM_...
813
814
815
816
  		/*
  		 * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
  		 * can co-exist
  		 */
234b239be   Andres Lagar-Cavilla   kvm: Faults which...
817
818
  		fault_flags |= FAULT_FLAG_TRIED;
  	}
167444834   Kirill A. Shutemov   mm: extract code ...
819

bce617ede   Peter Xu   mm: do page fault...
820
  	ret = handle_mm_fault(vma, address, fault_flags, NULL);
167444834   Kirill A. Shutemov   mm: extract code ...
821
  	if (ret & VM_FAULT_ERROR) {
9a291a7c9   James Morse   mm/hugetlb: repor...
822
823
824
825
  		int err = vm_fault_to_errno(ret, *flags);
  
  		if (err)
  			return err;
167444834   Kirill A. Shutemov   mm: extract code ...
826
827
  		BUG();
  	}
167444834   Kirill A. Shutemov   mm: extract code ...
828
  	if (ret & VM_FAULT_RETRY) {
4f6da9341   Peter Xu   mm/gup: rename "n...
829
830
  		if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
  			*locked = 0;
167444834   Kirill A. Shutemov   mm: extract code ...
831
832
833
834
835
836
837
838
839
840
841
842
843
  		return -EBUSY;
  	}
  
  	/*
  	 * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
  	 * necessary, even if maybe_mkwrite decided not to set pte_write. We
  	 * can thus safely do subsequent page lookups as if they were reads.
  	 * But only do so when looping for pte_write is futile: in some cases
  	 * userspace may also be wanting to write to the gotten user page,
  	 * which a read fault here might prevent (a readonly page might get
  	 * reCOWed by userspace write).
  	 */
  	if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
2923117b7   Mario Leinweber   mm/gup.c: fix cod...
844
  		*flags |= FOLL_COW;
167444834   Kirill A. Shutemov   mm: extract code ...
845
846
  	return 0;
  }
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
847
848
849
  static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
  {
  	vm_flags_t vm_flags = vma->vm_flags;
1b2ee1266   Dave Hansen   mm/core: Do not e...
850
851
  	int write = (gup_flags & FOLL_WRITE);
  	int foreign = (gup_flags & FOLL_REMOTE);
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
852
853
854
  
  	if (vm_flags & (VM_IO | VM_PFNMAP))
  		return -EFAULT;
7f7ccc2cc   Willy Tarreau   proc: do not acce...
855
856
  	if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
  		return -EFAULT;
1b2ee1266   Dave Hansen   mm/core: Do not e...
857
  	if (write) {
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
858
859
860
861
862
863
864
865
866
867
868
869
  		if (!(vm_flags & VM_WRITE)) {
  			if (!(gup_flags & FOLL_FORCE))
  				return -EFAULT;
  			/*
  			 * We used to let the write,force case do COW in a
  			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
  			 * set a breakpoint in a read-only mapping of an
  			 * executable, without corrupting the file (yet only
  			 * when that file had been opened for writing!).
  			 * Anon pages in shared mappings are surprising: now
  			 * just reject it.
  			 */
464353647   Hugh Dickins   mm: retire GUP WA...
870
  			if (!is_cow_mapping(vm_flags))
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
871
  				return -EFAULT;
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
872
873
874
875
876
877
878
879
880
881
882
  		}
  	} else if (!(vm_flags & VM_READ)) {
  		if (!(gup_flags & FOLL_FORCE))
  			return -EFAULT;
  		/*
  		 * Is there actually any vma we can reach here which does not
  		 * have VM_MAYREAD set?
  		 */
  		if (!(vm_flags & VM_MAYREAD))
  			return -EFAULT;
  	}
d61172b4b   Dave Hansen   mm/core, x86/mm/p...
883
884
885
886
887
  	/*
  	 * gups are always data accesses, not instruction
  	 * fetches, so execute=false here
  	 */
  	if (!arch_vma_access_permitted(vma, write, false, foreign))
33a709b25   Dave Hansen   mm/gup, x86/mm/pk...
888
  		return -EFAULT;
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
889
890
  	return 0;
  }
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
891
892
  /**
   * __get_user_pages() - pin user pages in memory
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
893
894
895
896
897
898
899
900
901
   * @mm:		mm_struct of target mm
   * @start:	starting user address
   * @nr_pages:	number of pages from start to pin
   * @gup_flags:	flags modifying pin behaviour
   * @pages:	array that receives pointers to the pages pinned.
   *		Should be at least nr_pages long. Or NULL, if caller
   *		only intends to ensure the pages are faulted in.
   * @vmas:	array of pointers to vmas corresponding to each page.
   *		Or NULL if the caller does not require them.
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
902
   * @locked:     whether we're still with the mmap_lock held
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
903
   *
d2dfbe47f   Liu Xiang   mm/gup.c: fix com...
904
905
906
907
908
909
910
   * Returns either number of pages pinned (which may be less than the
   * number requested), or an error. Details about the return value:
   *
   * -- If nr_pages is 0, returns 0.
   * -- If nr_pages is >0, but no pages were pinned, returns -errno.
   * -- If nr_pages is >0, and some pages were pinned, returns the number of
   *    pages pinned. Again, this may be less than nr_pages.
2d3a36a47   Michal Hocko   mm, mempolicy: fi...
911
   * -- 0 return value is possible when the fault would need to be retried.
d2dfbe47f   Liu Xiang   mm/gup.c: fix com...
912
913
914
   *
   * The caller is responsible for releasing returned @pages, via put_page().
   *
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
915
   * @vmas are valid only as long as mmap_lock is held.
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
916
   *
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
917
   * Must be called with mmap_lock held.  It may be released.  See below.
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
   *
   * __get_user_pages walks a process's page tables and takes a reference to
   * each struct page that each user address corresponds to at a given
   * instant. That is, it takes the page that would be accessed if a user
   * thread accesses the given user virtual address at that instant.
   *
   * This does not guarantee that the page exists in the user mappings when
   * __get_user_pages returns, and there may even be a completely different
   * page there in some cases (eg. if mmapped pagecache has been invalidated
   * and subsequently re faulted). However it does guarantee that the page
   * won't be freed completely. And mostly callers simply care that the page
   * contains data that was valid *at some point in time*. Typically, an IO
   * or similar operation cannot guarantee anything stronger anyway because
   * locks can't be held over the syscall boundary.
   *
   * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
   * the page is written to, set_page_dirty (or set_page_dirty_lock, as
   * appropriate) must be called after the page is finished with, and
   * before put_page is called.
   *
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
938
   * If @locked != NULL, *@locked will be set to 0 when mmap_lock is
4f6da9341   Peter Xu   mm/gup: rename "n...
939
940
   * released by an up_read().  That can happen if @gup_flags does not
   * have FOLL_NOWAIT.
9a95f3cf7   Paul Cassella   mm: describe mmap...
941
   *
4f6da9341   Peter Xu   mm/gup: rename "n...
942
   * A caller using such a combination of @locked and @gup_flags
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
943
   * must therefore hold the mmap_lock for reading only, and recognize
9a95f3cf7   Paul Cassella   mm: describe mmap...
944
945
   * when it's been released.  Otherwise, it must be held for either
   * reading or writing and will not be released.
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
946
947
948
949
950
   *
   * In most cases, get_user_pages or get_user_pages_fast should be used
   * instead of __get_user_pages. __get_user_pages should be used only if
   * you need some special @gup_flags.
   */
64019a2e4   Peter Xu   mm/gup: remove ta...
951
  static long __get_user_pages(struct mm_struct *mm,
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
952
953
  		unsigned long start, unsigned long nr_pages,
  		unsigned int gup_flags, struct page **pages,
4f6da9341   Peter Xu   mm/gup: rename "n...
954
  		struct vm_area_struct **vmas, int *locked)
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
955
  {
df06b37ff   Keith Busch   mm/gup: cache dev...
956
  	long ret = 0, i = 0;
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
957
  	struct vm_area_struct *vma = NULL;
df06b37ff   Keith Busch   mm/gup: cache dev...
958
  	struct follow_page_context ctx = { NULL };
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
959
960
961
  
  	if (!nr_pages)
  		return 0;
f96525941   Andrey Konovalov   mm: untag user po...
962
  	start = untagged_addr(start);
eddb1c228   John Hubbard   mm/gup: introduce...
963
  	VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
964
965
966
967
968
969
970
971
  
  	/*
  	 * If FOLL_FORCE is set then do not force a full fault as the hinting
  	 * fault information is unrelated to the reference behaviour of a task
  	 * using the address space
  	 */
  	if (!(gup_flags & FOLL_FORCE))
  		gup_flags |= FOLL_NUMA;
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
972
  	do {
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
973
974
975
976
977
978
979
980
  		struct page *page;
  		unsigned int foll_flags = gup_flags;
  		unsigned int page_increm;
  
  		/* first iteration or cross vma bound */
  		if (!vma || start >= vma->vm_end) {
  			vma = find_extend_vma(mm, start);
  			if (!vma && in_gate_area(mm, start)) {
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
981
982
983
984
  				ret = get_gate_page(mm, start & PAGE_MASK,
  						gup_flags, &vma,
  						pages ? &pages[i] : NULL);
  				if (ret)
08be37b79   John Hubbard   mm/gup: finish co...
985
  					goto out;
df06b37ff   Keith Busch   mm/gup: cache dev...
986
  				ctx.page_mask = 0;
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
987
988
  				goto next_page;
  			}
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
989

df06b37ff   Keith Busch   mm/gup: cache dev...
990
991
992
993
  			if (!vma || check_vma_flags(vma, gup_flags)) {
  				ret = -EFAULT;
  				goto out;
  			}
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
994
995
996
  			if (is_vm_hugetlb_page(vma)) {
  				i = follow_hugetlb_page(mm, vma, pages, vmas,
  						&start, &nr_pages, i,
a308c71bf   Peter Xu   mm/gup: Remove en...
997
  						gup_flags, locked);
ad415db81   Peter Xu   mm/gup: fix __get...
998
999
1000
  				if (locked && *locked == 0) {
  					/*
  					 * We've got a VM_FAULT_RETRY
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
1001
  					 * and we've lost mmap_lock.
ad415db81   Peter Xu   mm/gup: fix __get...
1002
1003
1004
1005
1006
1007
  					 * We must stop here.
  					 */
  					BUG_ON(gup_flags & FOLL_NOWAIT);
  					BUG_ON(ret != 0);
  					goto out;
  				}
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
1008
  				continue;
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1009
  			}
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
1010
1011
1012
1013
1014
1015
  		}
  retry:
  		/*
  		 * If we have a pending SIGKILL, don't keep faulting pages and
  		 * potentially allocating memory.
  		 */
fa45f1162   Davidlohr Bueso   mm/: remove calle...
1016
  		if (fatal_signal_pending(current)) {
d180870d8   Michal Hocko   mm, gup: return E...
1017
  			ret = -EINTR;
df06b37ff   Keith Busch   mm/gup: cache dev...
1018
1019
  			goto out;
  		}
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
1020
  		cond_resched();
df06b37ff   Keith Busch   mm/gup: cache dev...
1021
1022
  
  		page = follow_page_mask(vma, start, foll_flags, &ctx);
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
1023
  		if (!page) {
64019a2e4   Peter Xu   mm/gup: remove ta...
1024
  			ret = faultin_page(vma, start, &foll_flags, locked);
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
1025
1026
1027
  			switch (ret) {
  			case 0:
  				goto retry;
df06b37ff   Keith Busch   mm/gup: cache dev...
1028
1029
  			case -EBUSY:
  				ret = 0;
e4a9bc589   Joe Perches   mm: use fallthrough;
1030
  				fallthrough;
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
1031
1032
1033
  			case -EFAULT:
  			case -ENOMEM:
  			case -EHWPOISON:
df06b37ff   Keith Busch   mm/gup: cache dev...
1034
  				goto out;
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
1035
1036
  			case -ENOENT:
  				goto next_page;
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1037
  			}
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
1038
  			BUG();
1027e4436   Kirill A. Shutemov   mm: make GUP hand...
1039
1040
1041
1042
1043
1044
1045
  		} else if (PTR_ERR(page) == -EEXIST) {
  			/*
  			 * Proper page table entry exists, but no corresponding
  			 * struct page.
  			 */
  			goto next_page;
  		} else if (IS_ERR(page)) {
df06b37ff   Keith Busch   mm/gup: cache dev...
1046
1047
  			ret = PTR_ERR(page);
  			goto out;
1027e4436   Kirill A. Shutemov   mm: make GUP hand...
1048
  		}
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
1049
1050
1051
1052
  		if (pages) {
  			pages[i] = page;
  			flush_anon_page(vma, page, start);
  			flush_dcache_page(page);
df06b37ff   Keith Busch   mm/gup: cache dev...
1053
  			ctx.page_mask = 0;
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1054
  		}
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1055
  next_page:
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
1056
1057
  		if (vmas) {
  			vmas[i] = vma;
df06b37ff   Keith Busch   mm/gup: cache dev...
1058
  			ctx.page_mask = 0;
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
1059
  		}
df06b37ff   Keith Busch   mm/gup: cache dev...
1060
  		page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
fa5bb2093   Kirill A. Shutemov   mm: cleanup __get...
1061
1062
1063
1064
1065
  		if (page_increm > nr_pages)
  			page_increm = nr_pages;
  		i += page_increm;
  		start += page_increm * PAGE_SIZE;
  		nr_pages -= page_increm;
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1066
  	} while (nr_pages);
df06b37ff   Keith Busch   mm/gup: cache dev...
1067
1068
1069
1070
  out:
  	if (ctx.pgmap)
  		put_dev_pagemap(ctx.pgmap);
  	return i ? i : ret;
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1071
  }
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1072

771ab4302   Tobias Klauser   mm/gup.c: make un...
1073
1074
  static bool vma_permits_fault(struct vm_area_struct *vma,
  			      unsigned int fault_flags)
d4925e00d   Dave Hansen   mm/gup: Factor ou...
1075
  {
1b2ee1266   Dave Hansen   mm/core: Do not e...
1076
1077
  	bool write   = !!(fault_flags & FAULT_FLAG_WRITE);
  	bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
33a709b25   Dave Hansen   mm/gup, x86/mm/pk...
1078
  	vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
d4925e00d   Dave Hansen   mm/gup: Factor ou...
1079
1080
1081
  
  	if (!(vm_flags & vma->vm_flags))
  		return false;
33a709b25   Dave Hansen   mm/gup, x86/mm/pk...
1082
1083
  	/*
  	 * The architecture might have a hardware protection
1b2ee1266   Dave Hansen   mm/core: Do not e...
1084
  	 * mechanism other than read/write that can deny access.
d61172b4b   Dave Hansen   mm/core, x86/mm/p...
1085
1086
1087
  	 *
  	 * gup always represents data access, not instruction
  	 * fetches, so execute=false here:
33a709b25   Dave Hansen   mm/gup, x86/mm/pk...
1088
  	 */
d61172b4b   Dave Hansen   mm/core, x86/mm/p...
1089
  	if (!arch_vma_access_permitted(vma, write, false, foreign))
33a709b25   Dave Hansen   mm/gup, x86/mm/pk...
1090
  		return false;
d4925e00d   Dave Hansen   mm/gup: Factor ou...
1091
1092
  	return true;
  }
adc8cb406   Souptick Joarder   mm/gup.c: update ...
1093
  /**
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1094
   * fixup_user_fault() - manually resolve a user page fault
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1095
1096
1097
   * @mm:		mm_struct of target mm
   * @address:	user address
   * @fault_flags:flags to pass down to handle_mm_fault()
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
1098
   * @unlocked:	did we unlock the mmap_lock while retrying, maybe NULL if caller
548b6a1e5   Miles Chen   mm/gup.c: further...
1099
1100
   *		does not allow retry. If NULL, the caller must guarantee
   *		that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
   *
   * This is meant to be called in the specific scenario where for locking reasons
   * we try to access user memory in atomic context (within a pagefault_disable()
   * section), this returns -EFAULT, and we want to resolve the user fault before
   * trying again.
   *
   * Typically this is meant to be used by the futex code.
   *
   * The main difference with get_user_pages() is that this function will
   * unconditionally call handle_mm_fault() which will in turn perform all the
   * necessary SW fixup of the dirty and young bits in the PTE, while
4a9e1cda2   Dominik Dingel   mm: bring in addi...
1112
   * get_user_pages() only guarantees to update these in the struct page.
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1113
1114
1115
1116
1117
1118
   *
   * This is important for some architectures where those bits also gate the
   * access permission to the page because they are maintained in software.  On
   * such architectures, gup() will not be enough to make a subsequent access
   * succeed.
   *
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
1119
1120
   * This function will not return with an unlocked mmap_lock. So it has not the
   * same semantics wrt the @mm->mmap_lock as does filemap_fault().
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1121
   */
64019a2e4   Peter Xu   mm/gup: remove ta...
1122
  int fixup_user_fault(struct mm_struct *mm,
4a9e1cda2   Dominik Dingel   mm: bring in addi...
1123
1124
  		     unsigned long address, unsigned int fault_flags,
  		     bool *unlocked)
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1125
1126
  {
  	struct vm_area_struct *vma;
2b7403035   Souptick Joarder   mm: Change return...
1127
  	vm_fault_t ret, major = 0;
4a9e1cda2   Dominik Dingel   mm: bring in addi...
1128

f96525941   Andrey Konovalov   mm: untag user po...
1129
  	address = untagged_addr(address);
4a9e1cda2   Dominik Dingel   mm: bring in addi...
1130
  	if (unlocked)
71335f37c   Peter Xu   mm/gup: allow to ...
1131
  		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1132

4a9e1cda2   Dominik Dingel   mm: bring in addi...
1133
  retry:
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1134
1135
1136
  	vma = find_extend_vma(mm, address);
  	if (!vma || address < vma->vm_start)
  		return -EFAULT;
d4925e00d   Dave Hansen   mm/gup: Factor ou...
1137
  	if (!vma_permits_fault(vma, fault_flags))
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1138
  		return -EFAULT;
475f4dfc0   Peter Xu   mm/gup: fix fixup...
1139
1140
1141
  	if ((fault_flags & FAULT_FLAG_KILLABLE) &&
  	    fatal_signal_pending(current))
  		return -EINTR;
bce617ede   Peter Xu   mm: do page fault...
1142
  	ret = handle_mm_fault(vma, address, fault_flags, NULL);
4a9e1cda2   Dominik Dingel   mm: bring in addi...
1143
  	major |= ret & VM_FAULT_MAJOR;
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1144
  	if (ret & VM_FAULT_ERROR) {
9a291a7c9   James Morse   mm/hugetlb: repor...
1145
1146
1147
1148
  		int err = vm_fault_to_errno(ret, 0);
  
  		if (err)
  			return err;
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1149
1150
  		BUG();
  	}
4a9e1cda2   Dominik Dingel   mm: bring in addi...
1151
1152
  
  	if (ret & VM_FAULT_RETRY) {
d8ed45c5d   Michel Lespinasse   mmap locking API:...
1153
  		mmap_read_lock(mm);
475f4dfc0   Peter Xu   mm/gup: fix fixup...
1154
1155
1156
  		*unlocked = true;
  		fault_flags |= FAULT_FLAG_TRIED;
  		goto retry;
4a9e1cda2   Dominik Dingel   mm: bring in addi...
1157
  	}
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1158
1159
  	return 0;
  }
add6a0cd1   Paolo Bonzini   KVM: MMU: try to ...
1160
  EXPORT_SYMBOL_GPL(fixup_user_fault);
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1161

2d3a36a47   Michal Hocko   mm, mempolicy: fi...
1162
1163
1164
1165
  /*
   * Please note that this function, unlike __get_user_pages will not
   * return 0 for nr_pages > 0 without FOLL_NOWAIT
   */
64019a2e4   Peter Xu   mm/gup: remove ta...
1166
  static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1167
1168
  						unsigned long start,
  						unsigned long nr_pages,
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1169
1170
  						struct page **pages,
  						struct vm_area_struct **vmas,
e716712f8   Al Viro   __get_user_pages_...
1171
  						int *locked,
0fd71a56f   Andrea Arcangeli   mm: gup: add __ge...
1172
  						unsigned int flags)
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1173
  {
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1174
1175
1176
1177
1178
1179
1180
1181
1182
  	long ret, pages_done;
  	bool lock_dropped;
  
  	if (locked) {
  		/* if VM_FAULT_RETRY can be returned, vmas become invalid */
  		BUG_ON(vmas);
  		/* check caller initialized locked */
  		BUG_ON(*locked != 1);
  	}
008cfe441   Peter Xu   mm: Introduce mm_...
1183
  	if (flags & FOLL_PIN)
a4d63c373   Jason A. Donenfeld   mm: do not rely o...
1184
  		atomic_set(&mm->has_pinned, 1);
008cfe441   Peter Xu   mm: Introduce mm_...
1185

eddb1c228   John Hubbard   mm/gup: introduce...
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
  	/*
  	 * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
  	 * is to set FOLL_GET if the caller wants pages[] filled in (but has
  	 * carelessly failed to specify FOLL_GET), so keep doing that, but only
  	 * for FOLL_GET, not for the newer FOLL_PIN.
  	 *
  	 * FOLL_PIN always expects pages to be non-null, but no need to assert
  	 * that here, as any failures will be obvious enough.
  	 */
  	if (pages && !(flags & FOLL_PIN))
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1196
  		flags |= FOLL_GET;
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1197
1198
1199
1200
  
  	pages_done = 0;
  	lock_dropped = false;
  	for (;;) {
64019a2e4   Peter Xu   mm/gup: remove ta...
1201
  		ret = __get_user_pages(mm, start, nr_pages, flags, pages,
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
  				       vmas, locked);
  		if (!locked)
  			/* VM_FAULT_RETRY couldn't trigger, bypass */
  			return ret;
  
  		/* VM_FAULT_RETRY cannot return errors */
  		if (!*locked) {
  			BUG_ON(ret < 0);
  			BUG_ON(ret >= nr_pages);
  		}
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1212
1213
1214
1215
1216
1217
1218
  		if (ret > 0) {
  			nr_pages -= ret;
  			pages_done += ret;
  			if (!nr_pages)
  				break;
  		}
  		if (*locked) {
96312e612   Andrea Arcangeli   mm/gup.c: teach g...
1219
1220
1221
1222
  			/*
  			 * VM_FAULT_RETRY didn't trigger or it was a
  			 * FOLL_NOWAIT.
  			 */
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1223
1224
1225
1226
  			if (!pages_done)
  				pages_done = ret;
  			break;
  		}
df17277b2   Mike Rapoport   mm/gup: continue ...
1227
1228
1229
1230
1231
1232
  		/*
  		 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
  		 * For the prefault case (!pages) we only update counts.
  		 */
  		if (likely(pages))
  			pages += ret;
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1233
  		start += ret << PAGE_SHIFT;
4426e945d   Peter Xu   mm/gup: allow VM_...
1234
  		lock_dropped = true;
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1235

4426e945d   Peter Xu   mm/gup: allow VM_...
1236
  retry:
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1237
1238
  		/*
  		 * Repeat on the address that fired VM_FAULT_RETRY
4426e945d   Peter Xu   mm/gup: allow VM_...
1239
1240
1241
1242
  		 * with both FAULT_FLAG_ALLOW_RETRY and
  		 * FAULT_FLAG_TRIED.  Note that GUP can be interrupted
  		 * by fatal signals, so we need to check it before we
  		 * start trying again otherwise it can loop forever.
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1243
  		 */
4426e945d   Peter Xu   mm/gup: allow VM_...
1244

ae46d2aa6   Hillf Danton   mm/gup: Let __get...
1245
1246
1247
  		if (fatal_signal_pending(current)) {
  			if (!pages_done)
  				pages_done = -EINTR;
4426e945d   Peter Xu   mm/gup: allow VM_...
1248
  			break;
ae46d2aa6   Hillf Danton   mm/gup: Let __get...
1249
  		}
4426e945d   Peter Xu   mm/gup: allow VM_...
1250

d8ed45c5d   Michel Lespinasse   mmap locking API:...
1251
  		ret = mmap_read_lock_killable(mm);
71335f37c   Peter Xu   mm/gup: allow to ...
1252
1253
1254
1255
1256
1257
  		if (ret) {
  			BUG_ON(ret > 0);
  			if (!pages_done)
  				pages_done = ret;
  			break;
  		}
4426e945d   Peter Xu   mm/gup: allow VM_...
1258

c7b6a566b   Peter Xu   mm/gup: Mark lock...
1259
  		*locked = 1;
64019a2e4   Peter Xu   mm/gup: remove ta...
1260
  		ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
4426e945d   Peter Xu   mm/gup: allow VM_...
1261
1262
1263
1264
1265
1266
  				       pages, NULL, locked);
  		if (!*locked) {
  			/* Continue to retry until we succeeded */
  			BUG_ON(ret != 0);
  			goto retry;
  		}
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
  		if (ret != 1) {
  			BUG_ON(ret > 1);
  			if (!pages_done)
  				pages_done = ret;
  			break;
  		}
  		nr_pages--;
  		pages_done++;
  		if (!nr_pages)
  			break;
df17277b2   Mike Rapoport   mm/gup: continue ...
1277
1278
  		if (likely(pages))
  			pages++;
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1279
1280
  		start += PAGE_SIZE;
  	}
e716712f8   Al Viro   __get_user_pages_...
1281
  	if (lock_dropped && *locked) {
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1282
1283
1284
1285
  		/*
  		 * We must let the caller know we temporarily dropped the lock
  		 * and so the critical section protected by it was lost.
  		 */
d8ed45c5d   Michel Lespinasse   mmap locking API:...
1286
  		mmap_read_unlock(mm);
f0818f472   Andrea Arcangeli   mm: gup: add get_...
1287
1288
1289
1290
  		*locked = 0;
  	}
  	return pages_done;
  }
d3649f68b   Christoph Hellwig   mm: reorder code ...
1291
1292
1293
1294
1295
  /**
   * populate_vma_page_range() -  populate a range of pages in the vma.
   * @vma:   target vma
   * @start: start address
   * @end:   end address
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
1296
   * @locked: whether the mmap_lock is still held
d3649f68b   Christoph Hellwig   mm: reorder code ...
1297
1298
1299
   *
   * This takes care of mlocking the pages too if VM_LOCKED is set.
   *
0a36f7f85   Tang Yizhou   mm/gup.c: fix the...
1300
1301
   * Return either number of pages pinned in the vma, or a negative error
   * code on error.
d3649f68b   Christoph Hellwig   mm: reorder code ...
1302
   *
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
1303
   * vma->vm_mm->mmap_lock must be held.
d3649f68b   Christoph Hellwig   mm: reorder code ...
1304
   *
4f6da9341   Peter Xu   mm/gup: rename "n...
1305
   * If @locked is NULL, it may be held for read or write and will
d3649f68b   Christoph Hellwig   mm: reorder code ...
1306
1307
   * be unperturbed.
   *
4f6da9341   Peter Xu   mm/gup: rename "n...
1308
1309
   * If @locked is non-NULL, it must be held for read only and may be
   * released.  If it's released, *@locked will be set to 0.
d3649f68b   Christoph Hellwig   mm: reorder code ...
1310
1311
   */
  long populate_vma_page_range(struct vm_area_struct *vma,
4f6da9341   Peter Xu   mm/gup: rename "n...
1312
  		unsigned long start, unsigned long end, int *locked)
d3649f68b   Christoph Hellwig   mm: reorder code ...
1313
1314
1315
1316
1317
1318
1319
1320
1321
  {
  	struct mm_struct *mm = vma->vm_mm;
  	unsigned long nr_pages = (end - start) / PAGE_SIZE;
  	int gup_flags;
  
  	VM_BUG_ON(start & ~PAGE_MASK);
  	VM_BUG_ON(end   & ~PAGE_MASK);
  	VM_BUG_ON_VMA(start < vma->vm_start, vma);
  	VM_BUG_ON_VMA(end   > vma->vm_end, vma);
42fc54140   Michel Lespinasse   mmap locking API:...
1322
  	mmap_assert_locked(mm);
d3649f68b   Christoph Hellwig   mm: reorder code ...
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
  
  	gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
  	if (vma->vm_flags & VM_LOCKONFAULT)
  		gup_flags &= ~FOLL_POPULATE;
  	/*
  	 * We want to touch writable mappings with a write fault in order
  	 * to break COW, except for shared mappings because these don't COW
  	 * and we would not want to dirty them for nothing.
  	 */
  	if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
  		gup_flags |= FOLL_WRITE;
  
  	/*
  	 * We want mlock to succeed for regions that have any permissions
  	 * other than PROT_NONE.
  	 */
3122e80ef   Anshuman Khandual   mm/vma: make vma_...
1339
  	if (vma_is_accessible(vma))
d3649f68b   Christoph Hellwig   mm: reorder code ...
1340
1341
1342
1343
1344
1345
  		gup_flags |= FOLL_FORCE;
  
  	/*
  	 * We made sure addr is within a VMA, so the following will
  	 * not result in a stack expansion that recurses back here.
  	 */
64019a2e4   Peter Xu   mm/gup: remove ta...
1346
  	return __get_user_pages(mm, start, nr_pages, gup_flags,
4f6da9341   Peter Xu   mm/gup: rename "n...
1347
  				NULL, NULL, locked);
d3649f68b   Christoph Hellwig   mm: reorder code ...
1348
1349
1350
1351
1352
1353
1354
  }
  
  /*
   * __mm_populate - populate and/or mlock pages within a range of address space.
   *
   * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
   * flags. VMAs must be already marked with the desired vm_flags, and
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
1355
   * mmap_lock must not be held.
d3649f68b   Christoph Hellwig   mm: reorder code ...
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
   */
  int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
  {
  	struct mm_struct *mm = current->mm;
  	struct vm_area_struct *vma = NULL;
  	unsigned long limit = start + len;
  	unsigned long cur, next;
  	int locked = 0;
  	long rc = 0;
  
  	for (cur = start; cur < limit; cur = next) {
  		/*
  		 * Pages in [cur; limit) still have to be faulted in. Take
  		 * the mmap_lock if it is not currently held and look up the
  		 * first VMA for cur, otherwise step to the following VMA
  		 * once the current one has been consumed.
  		 */
  		if (!locked) {
  			locked = 1;
  			mmap_read_lock(mm);
  			vma = find_vma(mm, cur);
  		} else if (cur >= vma->vm_end) {
  			vma = vma->vm_next;
  		}
  		if (!vma || vma->vm_start >= limit)
  			break;
  
  		/*
  		 * Clip [cur; next) to the overlap between the requested
  		 * range and this VMA; VM_IO and VM_PFNMAP VMAs are skipped.
  		 */
  		next = min(limit, vma->vm_end);
  		if (vma->vm_flags & (VM_IO | VM_PFNMAP))
  			continue;
  		if (cur < vma->vm_start)
  			cur = vma->vm_start;
  
  		/*
  		 * Fault the pages in. populate_vma_page_range() re-checks
  		 * the vma flags, so nothing is mlocked if the vma has been
  		 * munlocked in the meantime.
  		 */
  		rc = populate_vma_page_range(vma, cur, next, &locked);
  		if (rc >= 0) {
  			/* Resume right after the pages that were handled. */
  			next = cur + rc * PAGE_SIZE;
  			rc = 0;
  			continue;
  		}
  		if (!ignore_errors)
  			break;
  		/* Error ignored: carry on with the next VMA. */
  		rc = 0;
  	}
  	if (locked)
  		mmap_read_unlock(mm);
  	/* Either 0 or a negative error code. */
  	return rc;
  }
050a9adc6   Christoph Hellwig   mm: consolidate t...
1409
  #else /* CONFIG_MMU */
64019a2e4   Peter Xu   mm/gup: remove ta...
1410
  static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
050a9adc6   Christoph Hellwig   mm: consolidate t...
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
  		unsigned long nr_pages, struct page **pages,
  		struct vm_area_struct **vmas, int *locked,
  		unsigned int foll_flags)
  {
  	struct vm_area_struct *vma;
  	unsigned long vm_flags;
  	int i;
  
  	/* calculate required read or write permissions.
  	 * If FOLL_FORCE is set, we only require the "MAY" flags.
  	 */
  	vm_flags  = (foll_flags & FOLL_WRITE) ?
  			(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
  	vm_flags &= (foll_flags & FOLL_FORCE) ?
  			(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
  
  	for (i = 0; i < nr_pages; i++) {
  		vma = find_vma(mm, start);
  		if (!vma)
  			goto finish_or_fault;
  
  		/* protect what we can, including chardevs */
  		if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
  		    !(vm_flags & vma->vm_flags))
  			goto finish_or_fault;
  
  		if (pages) {
  			pages[i] = virt_to_page(start);
  			if (pages[i])
  				get_page(pages[i]);
  		}
  		if (vmas)
  			vmas[i] = vma;
  		start = (start + PAGE_SIZE) & PAGE_MASK;
  	}
  
  	return i;
  
  finish_or_fault:
  	return i ? : -EFAULT;
  }
  #endif /* !CONFIG_MMU */
d3649f68b   Christoph Hellwig   mm: reorder code ...
1453

8f942eea1   Jann Horn   binfmt_elf_fdpic:...
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
  /**
   * get_dump_page() - pin user page in memory while writing it to core dump
   * @addr: user address
   *
   * Returns struct page pointer of user page pinned for dump,
   * to be freed afterwards by put_page().
   *
   * Returns NULL on any kind of failure - a hole must then be inserted into
   * the corefile, to preserve alignment with its headers; and also returns
   * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
   * allowing a hole to be left in the corefile to save diskspace.
   *
7f3bfab52   Jann Horn   mm/gup: take mmap...
1466
   * Called without mmap_lock (takes and releases the mmap_lock by itself).
8f942eea1   Jann Horn   binfmt_elf_fdpic:...
1467
1468
1469
1470
   */
  #ifdef CONFIG_ELF_CORE
  struct page *get_dump_page(unsigned long addr)
  {
7f3bfab52   Jann Horn   mm/gup: take mmap...
1471
  	struct mm_struct *mm = current->mm;
8f942eea1   Jann Horn   binfmt_elf_fdpic:...
1472
  	struct page *page;
7f3bfab52   Jann Horn   mm/gup: take mmap...
1473
1474
  	int locked = 1;
  	int ret;
8f942eea1   Jann Horn   binfmt_elf_fdpic:...
1475

7f3bfab52   Jann Horn   mm/gup: take mmap...
1476
  	if (mmap_read_lock_killable(mm))
8f942eea1   Jann Horn   binfmt_elf_fdpic:...
1477
  		return NULL;
7f3bfab52   Jann Horn   mm/gup: take mmap...
1478
1479
1480
1481
1482
  	ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
  				      FOLL_FORCE | FOLL_DUMP | FOLL_GET);
  	if (locked)
  		mmap_read_unlock(mm);
  	return (ret == 1) ? page : NULL;
8f942eea1   Jann Horn   binfmt_elf_fdpic:...
1483
1484
  }
  #endif /* CONFIG_ELF_CORE */
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1485
  #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
  /*
   * Scan the first @nr_pages entries of @vmas and return true as soon as one
   * of them satisfies vma_is_fsdax(); return false if none does.
   */
  static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
  {
  	struct vm_area_struct *last = NULL;
  	long idx = 0;
  
  	while (idx < nr_pages) {
  		struct vm_area_struct *cur = vmas[idx++];
  
  		/* A run of identical entries only needs testing once. */
  		if (cur != last) {
  			if (vma_is_fsdax(cur))
  				return true;
  			last = cur;
  		}
  	}
  	return false;
  }
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1504
1505
  
  #ifdef CONFIG_CMA
64019a2e4   Peter Xu   mm/gup: remove ta...
1506
  static long check_and_migrate_cma_pages(struct mm_struct *mm,
932f4a630   Ira Weiny   mm/gup: replace g...
1507
1508
  					unsigned long start,
  					unsigned long nr_pages,
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1509
  					struct page **pages,
932f4a630   Ira Weiny   mm/gup: replace g...
1510
1511
  					struct vm_area_struct **vmas,
  					unsigned int gup_flags)
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1512
  {
aa712399c   Pingfan Liu   mm/gup: speed up ...
1513
1514
  	unsigned long i;
  	unsigned long step;
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1515
1516
1517
  	bool drain_allow = true;
  	bool migrate_allow = true;
  	LIST_HEAD(cma_page_list);
b96cc6551   zhong jiang   mm/gup.c: allow C...
1518
  	long ret = nr_pages;
ed03d9245   Joonsoo Kim   mm/gup: use a sta...
1519
1520
1521
1522
  	struct migration_target_control mtc = {
  		.nid = NUMA_NO_NODE,
  		.gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
  	};
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1523
1524
  
  check_again:
aa712399c   Pingfan Liu   mm/gup: speed up ...
1525
1526
1527
1528
1529
1530
1531
1532
  	for (i = 0; i < nr_pages;) {
  
  		struct page *head = compound_head(pages[i]);
  
  		/*
  		 * gup may start from a tail page. Advance step by the left
  		 * part.
  		 */
d8c6546b1   Matthew Wilcox (Oracle)   mm: introduce com...
1533
  		step = compound_nr(head) - (pages[i] - head);
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1534
1535
1536
1537
1538
  		/*
  		 * If we get a page from the CMA zone, since we are going to
  		 * be pinning these entries, we might as well move them out
  		 * of the CMA zone if possible.
  		 */
aa712399c   Pingfan Liu   mm/gup: speed up ...
1539
1540
  		if (is_migrate_cma_page(head)) {
  			if (PageHuge(head))
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1541
  				isolate_huge_page(head, &cma_page_list);
aa712399c   Pingfan Liu   mm/gup: speed up ...
1542
  			else {
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1543
1544
1545
1546
1547
1548
1549
1550
1551
  				if (!PageLRU(head) && drain_allow) {
  					lru_add_drain_all();
  					drain_allow = false;
  				}
  
  				if (!isolate_lru_page(head)) {
  					list_add_tail(&head->lru, &cma_page_list);
  					mod_node_page_state(page_pgdat(head),
  							    NR_ISOLATED_ANON +
9de4f22a6   Huang Ying   mm: code cleanup ...
1552
  							    page_is_file_lru(head),
6c357848b   Matthew Wilcox (Oracle)   mm: replace hpage...
1553
  							    thp_nr_pages(head));
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1554
1555
1556
  				}
  			}
  		}
aa712399c   Pingfan Liu   mm/gup: speed up ...
1557
1558
  
  		i += step;
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1559
1560
1561
1562
1563
1564
  	}
  
  	if (!list_empty(&cma_page_list)) {
  		/*
  		 * drop the above get_user_pages reference.
  		 */
96e1fac16   Jason Gunthorpe   mm/gup: use unpin...
1565
1566
1567
1568
1569
  		if (gup_flags & FOLL_PIN)
  			unpin_user_pages(pages, nr_pages);
  		else
  			for (i = 0; i < nr_pages; i++)
  				put_page(pages[i]);
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1570

ed03d9245   Joonsoo Kim   mm/gup: use a sta...
1571
1572
  		if (migrate_pages(&cma_page_list, alloc_migration_target, NULL,
  			(unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
  			/*
  			 * some of the pages failed migration. Do get_user_pages
  			 * without migration.
  			 */
  			migrate_allow = false;
  
  			if (!list_empty(&cma_page_list))
  				putback_movable_pages(&cma_page_list);
  		}
  		/*
932f4a630   Ira Weiny   mm/gup: replace g...
1583
1584
1585
  		 * We did migrate all the pages, Try to get the page references
  		 * again migrating any new CMA pages which we failed to isolate
  		 * earlier.
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1586
  		 */
64019a2e4   Peter Xu   mm/gup: remove ta...
1587
  		ret = __get_user_pages_locked(mm, start, nr_pages,
932f4a630   Ira Weiny   mm/gup: replace g...
1588
1589
  						   pages, vmas, NULL,
  						   gup_flags);
b96cc6551   zhong jiang   mm/gup.c: allow C...
1590
1591
  		if ((ret > 0) && migrate_allow) {
  			nr_pages = ret;
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1592
1593
1594
1595
  			drain_allow = true;
  			goto check_again;
  		}
  	}
b96cc6551   zhong jiang   mm/gup.c: allow C...
1596
  	return ret;
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1597
1598
  }
  #else
64019a2e4   Peter Xu   mm/gup: remove ta...
1599
  /*
   * Variant built when CONFIG_CMA is not set: nothing is inspected or
   * migrated and @nr_pages is handed straight back.
   */
  static long check_and_migrate_cma_pages(struct mm_struct *mm,
  					unsigned long start, unsigned long nr_pages,
  					struct page **pages, struct vm_area_struct **vmas,
  					unsigned int gup_flags)
  {
  	return nr_pages;
  }
050a9adc6   Christoph Hellwig   mm: consolidate t...
1608
  #endif /* CONFIG_CMA */
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1609

2bb6d2837   Dan Williams   mm: introduce get...
1610
  /*
932f4a630   Ira Weiny   mm/gup: replace g...
1611
1612
   * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
   * allows us to process the FOLL_LONGTERM flag.
2bb6d2837   Dan Williams   mm: introduce get...
1613
   */
64019a2e4   Peter Xu   mm/gup: remove ta...
1614
  static long __gup_longterm_locked(struct mm_struct *mm,
932f4a630   Ira Weiny   mm/gup: replace g...
1615
1616
1617
1618
1619
  				  unsigned long start,
  				  unsigned long nr_pages,
  				  struct page **pages,
  				  struct vm_area_struct **vmas,
  				  unsigned int gup_flags)
2bb6d2837   Dan Williams   mm: introduce get...
1620
  {
932f4a630   Ira Weiny   mm/gup: replace g...
1621
1622
  	struct vm_area_struct **vmas_tmp = vmas;
  	unsigned long flags = 0;
2bb6d2837   Dan Williams   mm: introduce get...
1623
  	long rc, i;
932f4a630   Ira Weiny   mm/gup: replace g...
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
  	if (gup_flags & FOLL_LONGTERM) {
  		if (!pages)
  			return -EINVAL;
  
  		if (!vmas_tmp) {
  			vmas_tmp = kcalloc(nr_pages,
  					   sizeof(struct vm_area_struct *),
  					   GFP_KERNEL);
  			if (!vmas_tmp)
  				return -ENOMEM;
  		}
  		flags = memalloc_nocma_save();
2bb6d2837   Dan Williams   mm: introduce get...
1636
  	}
64019a2e4   Peter Xu   mm/gup: remove ta...
1637
  	rc = __get_user_pages_locked(mm, start, nr_pages, pages,
932f4a630   Ira Weiny   mm/gup: replace g...
1638
  				     vmas_tmp, NULL, gup_flags);
2bb6d2837   Dan Williams   mm: introduce get...
1639

932f4a630   Ira Weiny   mm/gup: replace g...
1640
  	if (gup_flags & FOLL_LONGTERM) {
932f4a630   Ira Weiny   mm/gup: replace g...
1641
1642
1643
1644
  		if (rc < 0)
  			goto out;
  
  		if (check_dax_vmas(vmas_tmp, rc)) {
96e1fac16   Jason Gunthorpe   mm/gup: use unpin...
1645
1646
1647
1648
1649
  			if (gup_flags & FOLL_PIN)
  				unpin_user_pages(pages, rc);
  			else
  				for (i = 0; i < rc; i++)
  					put_page(pages[i]);
932f4a630   Ira Weiny   mm/gup: replace g...
1650
1651
1652
  			rc = -EOPNOTSUPP;
  			goto out;
  		}
64019a2e4   Peter Xu   mm/gup: remove ta...
1653
  		rc = check_and_migrate_cma_pages(mm, start, rc, pages,
932f4a630   Ira Weiny   mm/gup: replace g...
1654
  						 vmas_tmp, gup_flags);
41b4dc14e   Joonsoo Kim   mm/gup: restrict ...
1655
1656
  out:
  		memalloc_nocma_restore(flags);
9a4e9f3b2   Aneesh Kumar K.V   mm: update get_us...
1657
  	}
2bb6d2837   Dan Williams   mm: introduce get...
1658

932f4a630   Ira Weiny   mm/gup: replace g...
1659
1660
  	if (vmas_tmp != vmas)
  		kfree(vmas_tmp);
2bb6d2837   Dan Williams   mm: introduce get...
1661
1662
  	return rc;
  }
932f4a630   Ira Weiny   mm/gup: replace g...
1663
  #else /* !CONFIG_FS_DAX && !CONFIG_CMA */
64019a2e4   Peter Xu   mm/gup: remove ta...
1664
  /*
   * Variant built when neither CONFIG_FS_DAX nor CONFIG_CMA is set: no
   * FOLL_LONGTERM processing is done, the request is forwarded unchanged
   * to __get_user_pages_locked() with a NULL @locked.
   */
  static __always_inline long __gup_longterm_locked(struct mm_struct *mm,
  						  unsigned long start,
  						  unsigned long nr_pages,
  						  struct page **pages,
  						  struct vm_area_struct **vmas,
  						  unsigned int flags)
  {
  	return __get_user_pages_locked(mm, start, nr_pages,
  				       pages, vmas, NULL, flags);
  }
  #endif /* CONFIG_FS_DAX || CONFIG_CMA */
447f3e45c   Barry Song   mm/gup: don't per...
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
  /*
   * Validate flags passed in by callers of the get_user_pages*() entry
   * points. Returns false, after a one-time warning, if a flag is set that
   * those callers are not allowed to pass; returns true otherwise.
   */
  static bool is_valid_gup_flags(unsigned int gup_flags)
  {
  	/*
  	 * FOLL_PIN is reserved for the pin_user_pages*() APIs, which set it
  	 * internally; a caller passing it directly is a bug.
  	 *
  	 * FOLL_LONGTERM is a more restrictive special case of FOLL_PIN and
  	 * has FOLL_PIN as its prerequisite, so it is rejected here as well.
  	 *
  	 * The || short-circuits, so at most one warning fires per call.
  	 */
  	if (WARN_ON_ONCE(gup_flags & FOLL_PIN) ||
  	    WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
  		return false;
  
  	return true;
  }
22bf29b67   John Hubbard   mm/gup: split get...
1693
  #ifdef CONFIG_MMU
64019a2e4   Peter Xu   mm/gup: remove ta...
1694
  static long __get_user_pages_remote(struct mm_struct *mm,
22bf29b67   John Hubbard   mm/gup: split get...
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
  				    unsigned long start, unsigned long nr_pages,
  				    unsigned int gup_flags, struct page **pages,
  				    struct vm_area_struct **vmas, int *locked)
  {
  	/*
  	 * Parts of FOLL_LONGTERM behavior are incompatible with
  	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
  	 * vmas. However, this only comes up if locked is set, and there are
  	 * callers that do request FOLL_LONGTERM, but do not set locked. So,
  	 * allow what we can.
  	 */
  	if (gup_flags & FOLL_LONGTERM) {
  		if (WARN_ON_ONCE(locked))
  			return -EINVAL;
  		/*
  		 * This will check the vmas (even if our vmas arg is NULL)
  		 * and return -ENOTSUPP if DAX isn't allowed in this case:
  		 */
64019a2e4   Peter Xu   mm/gup: remove ta...
1713
  		return __gup_longterm_locked(mm, start, nr_pages, pages,
22bf29b67   John Hubbard   mm/gup: split get...
1714
1715
1716
  					     vmas, gup_flags | FOLL_TOUCH |
  					     FOLL_REMOTE);
  	}
64019a2e4   Peter Xu   mm/gup: remove ta...
1717
  	return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
22bf29b67   John Hubbard   mm/gup: split get...
1718
1719
1720
  				       locked,
  				       gup_flags | FOLL_TOUCH | FOLL_REMOTE);
  }
adc8cb406   Souptick Joarder   mm/gup.c: update ...
1721
  /**
c4237f8b1   John Hubbard   mm: fix get_user_...
1722
   * get_user_pages_remote() - pin user pages in memory
c4237f8b1   John Hubbard   mm: fix get_user_...
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
   * @mm:		mm_struct of target mm
   * @start:	starting user address
   * @nr_pages:	number of pages from start to pin
   * @gup_flags:	flags modifying lookup behaviour
   * @pages:	array that receives pointers to the pages pinned.
   *		Should be at least nr_pages long. Or NULL, if caller
   *		only intends to ensure the pages are faulted in.
   * @vmas:	array of pointers to vmas corresponding to each page.
   *		Or NULL if the caller does not require them.
   * @locked:	pointer to lock flag indicating whether lock is held and
   *		subsequently whether VM_FAULT_RETRY functionality can be
   *		utilised. Lock must initially be held.
   *
   * Returns either number of pages pinned (which may be less than the
   * number requested), or an error. Details about the return value:
   *
   * -- If nr_pages is 0, returns 0.
   * -- If nr_pages is >0, but no pages were pinned, returns -errno.
   * -- If nr_pages is >0, and some pages were pinned, returns the number of
   *    pages pinned. Again, this may be less than nr_pages.
   *
   * The caller is responsible for releasing returned @pages, via put_page().
   *
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
1746
   * @vmas are valid only as long as mmap_lock is held.
c4237f8b1   John Hubbard   mm: fix get_user_...
1747
   *
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
1748
   * Must be called with mmap_lock held for read or write.
c4237f8b1   John Hubbard   mm: fix get_user_...
1749
   *
adc8cb406   Souptick Joarder   mm/gup.c: update ...
1750
1751
   * get_user_pages_remote walks a process's page tables and takes a reference
   * to each struct page that each user address corresponds to at a given
c4237f8b1   John Hubbard   mm: fix get_user_...
1752
1753
1754
1755
   * instant. That is, it takes the page that would be accessed if a user
   * thread accesses the given user virtual address at that instant.
   *
   * This does not guarantee that the page exists in the user mappings when
adc8cb406   Souptick Joarder   mm/gup.c: update ...
1756
   * get_user_pages_remote returns, and there may even be a completely different
c4237f8b1   John Hubbard   mm: fix get_user_...
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
   * page there in some cases (eg. if mmapped pagecache has been invalidated
   * and subsequently re faulted). However it does guarantee that the page
   * won't be freed completely. And mostly callers simply care that the page
   * contains data that was valid *at some point in time*. Typically, an IO
   * or similar operation cannot guarantee anything stronger anyway because
   * locks can't be held over the syscall boundary.
   *
   * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
   * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
   * be called after the page is finished with, and before put_page is called.
   *
adc8cb406   Souptick Joarder   mm/gup.c: update ...
1768
1769
1770
1771
1772
   * get_user_pages_remote is typically used for fewer-copy IO operations,
   * to get a handle on the memory by some means other than accesses
   * via the user virtual addresses. The pages may be submitted for
   * DMA to devices or accessed via their kernel linear mapping (via the
   * kmap APIs). Care should be taken to use the correct cache flushing APIs.
c4237f8b1   John Hubbard   mm: fix get_user_...
1773
1774
1775
   *
   * See also get_user_pages_fast, for performance critical applications.
   *
adc8cb406   Souptick Joarder   mm/gup.c: update ...
1776
   * get_user_pages_remote should be phased out in favor of
c4237f8b1   John Hubbard   mm: fix get_user_...
1777
   * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
adc8cb406   Souptick Joarder   mm/gup.c: update ...
1778
   * should use get_user_pages_remote because it cannot pass
c4237f8b1   John Hubbard   mm: fix get_user_...
1779
1780
   * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
   */
64019a2e4   Peter Xu   mm/gup: remove ta...
1781
  long get_user_pages_remote(struct mm_struct *mm,
c4237f8b1   John Hubbard   mm: fix get_user_...
1782
1783
1784
1785
  		unsigned long start, unsigned long nr_pages,
  		unsigned int gup_flags, struct page **pages,
  		struct vm_area_struct **vmas, int *locked)
  {
447f3e45c   Barry Song   mm/gup: don't per...
1786
  	if (!is_valid_gup_flags(gup_flags))
eddb1c228   John Hubbard   mm/gup: introduce...
1787
  		return -EINVAL;
64019a2e4   Peter Xu   mm/gup: remove ta...
1788
  	return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
22bf29b67   John Hubbard   mm/gup: split get...
1789
  				       pages, vmas, locked);
c4237f8b1   John Hubbard   mm: fix get_user_...
1790
1791
  }
  EXPORT_SYMBOL(get_user_pages_remote);
eddb1c228   John Hubbard   mm/gup: introduce...
1792
  #else /* CONFIG_MMU */
64019a2e4   Peter Xu   mm/gup: remove ta...
1793
  /* !CONFIG_MMU stub: does nothing and always returns 0. */
  long get_user_pages_remote(struct mm_struct *mm,
  			   unsigned long start, unsigned long nr_pages,
  			   unsigned int gup_flags, struct page **pages,
  			   struct vm_area_struct **vmas, int *locked)
  {
  	return 0;
  }
3faa52c03   John Hubbard   mm/gup: track FOL...
1800

64019a2e4   Peter Xu   mm/gup: remove ta...
1801
  /* !CONFIG_MMU stub: does nothing and always returns 0. */
  static long __get_user_pages_remote(struct mm_struct *mm,
  				    unsigned long start, unsigned long nr_pages,
  				    unsigned int gup_flags, struct page **pages,
  				    struct vm_area_struct **vmas, int *locked)
  {
  	return 0;
  }
eddb1c228   John Hubbard   mm/gup: introduce...
1808
  #endif /* !CONFIG_MMU */
adc8cb406   Souptick Joarder   mm/gup.c: update ...
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
  /**
   * get_user_pages() - pin user pages in memory
   * @start:      starting user address
   * @nr_pages:   number of pages from start to pin
   * @gup_flags:  flags modifying lookup behaviour
   * @pages:      array that receives pointers to the pages pinned.
   *              Should be at least nr_pages long. Or NULL, if caller
   *              only intends to ensure the pages are faulted in.
   * @vmas:       array of pointers to vmas corresponding to each page.
   *              Or NULL if the caller does not require them.
   *
64019a2e4   Peter Xu   mm/gup: remove ta...
1820
1821
1822
1823
   * This is the same as get_user_pages_remote(), just with a less-flexible
   * calling convention where we assume that the mm being operated on belongs to
   * the current task, and doesn't allow passing of a locked parameter.  We also
   * obviously don't pass FOLL_REMOTE in here.
932f4a630   Ira Weiny   mm/gup: replace g...
1824
1825
1826
1827
1828
   */
  long get_user_pages(unsigned long start, unsigned long nr_pages,
  		unsigned int gup_flags, struct page **pages,
  		struct vm_area_struct **vmas)
  {
447f3e45c   Barry Song   mm/gup: don't per...
1829
  	if (!is_valid_gup_flags(gup_flags))
eddb1c228   John Hubbard   mm/gup: introduce...
1830
  		return -EINVAL;
64019a2e4   Peter Xu   mm/gup: remove ta...
1831
  	return __gup_longterm_locked(current->mm, start, nr_pages,
932f4a630   Ira Weiny   mm/gup: replace g...
1832
1833
1834
  				     pages, vmas, gup_flags | FOLL_TOUCH);
  }
  EXPORT_SYMBOL(get_user_pages);
2bb6d2837   Dan Williams   mm: introduce get...
1835

adc8cb406   Souptick Joarder   mm/gup.c: update ...
1836
  /**
d3649f68b   Christoph Hellwig   mm: reorder code ...
1837
   * get_user_pages_locked() is suitable to replace the form:
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1838
   *
3e4e28c5a   Michel Lespinasse   mmap locking API:...
1839
   *      mmap_read_lock(mm);
d3649f68b   Christoph Hellwig   mm: reorder code ...
1840
   *      do_something()
64019a2e4   Peter Xu   mm/gup: remove ta...
1841
   *      get_user_pages(..., pages, NULL);
3e4e28c5a   Michel Lespinasse   mmap locking API:...
1842
   *      mmap_read_unlock(mm);
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1843
   *
d3649f68b   Christoph Hellwig   mm: reorder code ...
1844
   *  to:
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1845
   *
d3649f68b   Christoph Hellwig   mm: reorder code ...
1846
   *      int locked = 1;
3e4e28c5a   Michel Lespinasse   mmap locking API:...
1847
   *      mmap_read_lock(mm);
d3649f68b   Christoph Hellwig   mm: reorder code ...
1848
   *      do_something()
64019a2e4   Peter Xu   mm/gup: remove ta...
1849
   *      get_user_pages_locked(..., pages, &locked);
d3649f68b   Christoph Hellwig   mm: reorder code ...
1850
   *      if (locked)
3e4e28c5a   Michel Lespinasse   mmap locking API:...
1851
   *          mmap_read_unlock(mm);
adc8cb406   Souptick Joarder   mm/gup.c: update ...
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
   *
   * @start:      starting user address
   * @nr_pages:   number of pages from start to pin
   * @gup_flags:  flags modifying lookup behaviour
   * @pages:      array that receives pointers to the pages pinned.
   *              Should be at least nr_pages long. Or NULL, if caller
   *              only intends to ensure the pages are faulted in.
   * @locked:     pointer to lock flag indicating whether lock is held and
   *              subsequently whether VM_FAULT_RETRY functionality can be
   *              utilised. Lock must initially be held.
   *
   * We can leverage the VM_FAULT_RETRY functionality in the page fault
   * paths better by using either get_user_pages_locked() or
   * get_user_pages_unlocked().
   *
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1867
   */
d3649f68b   Christoph Hellwig   mm: reorder code ...
1868
1869
1870
  long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
  			   unsigned int gup_flags, struct page **pages,
  			   int *locked)
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1871
  {
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1872
  	/*
d3649f68b   Christoph Hellwig   mm: reorder code ...
1873
1874
1875
1876
  	 * FIXME: Current FOLL_LONGTERM behavior is incompatible with
  	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
  	 * vmas.  As there are no users of this flag in this call we simply
  	 * disallow this option for now.
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1877
  	 */
d3649f68b   Christoph Hellwig   mm: reorder code ...
1878
1879
  	if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
  		return -EINVAL;
420c2091b   John Hubbard   mm/gup: introduce...
1880
1881
1882
1883
1884
1885
  	/*
  	 * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
  	 * never directly by the caller, so enforce that:
  	 */
  	if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
  		return -EINVAL;
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1886

64019a2e4   Peter Xu   mm/gup: remove ta...
1887
  	return __get_user_pages_locked(current->mm, start, nr_pages,
d3649f68b   Christoph Hellwig   mm: reorder code ...
1888
1889
  				       pages, NULL, locked,
  				       gup_flags | FOLL_TOUCH);
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1890
  }
d3649f68b   Christoph Hellwig   mm: reorder code ...
1891
  EXPORT_SYMBOL(get_user_pages_locked);
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1892
1893
  
  /*
d3649f68b   Christoph Hellwig   mm: reorder code ...
1894
   * get_user_pages_unlocked() is suitable to replace the form:
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1895
   *
3e4e28c5a   Michel Lespinasse   mmap locking API:...
1896
   *      mmap_read_lock(mm);
64019a2e4   Peter Xu   mm/gup: remove ta...
1897
   *      get_user_pages(..., pages, NULL);
3e4e28c5a   Michel Lespinasse   mmap locking API:...
1898
   *      mmap_read_unlock(mm);
d3649f68b   Christoph Hellwig   mm: reorder code ...
1899
1900
1901
   *
   *  with:
   *
64019a2e4   Peter Xu   mm/gup: remove ta...
1902
   *      get_user_pages_unlocked(..., pages, gup_flags);
d3649f68b   Christoph Hellwig   mm: reorder code ...
1903
1904
1905
1906
   *
   * It is functionally equivalent to get_user_pages_fast so
   * get_user_pages_fast should be used instead if specific gup_flags
   * (e.g. FOLL_FORCE) are not required.
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1907
   */
d3649f68b   Christoph Hellwig   mm: reorder code ...
1908
1909
  long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
  			     struct page **pages, unsigned int gup_flags)
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1910
1911
  {
  	struct mm_struct *mm = current->mm;
d3649f68b   Christoph Hellwig   mm: reorder code ...
1912
1913
  	int locked = 1;
  	long ret;
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1914

d3649f68b   Christoph Hellwig   mm: reorder code ...
1915
1916
1917
1918
1919
1920
1921
1922
  	/*
  	 * FIXME: Current FOLL_LONGTERM behavior is incompatible with
  	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
  	 * vmas.  As there are no users of this flag in this call we simply
  	 * disallow this option for now.
  	 */
  	if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
  		return -EINVAL;
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1923

d8ed45c5d   Michel Lespinasse   mmap locking API:...
1924
  	mmap_read_lock(mm);
64019a2e4   Peter Xu   mm/gup: remove ta...
1925
  	ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
d3649f68b   Christoph Hellwig   mm: reorder code ...
1926
  				      &locked, gup_flags | FOLL_TOUCH);
acc3c8d15   Kirill A. Shutemov   mm: move mm_popul...
1927
  	if (locked)
d8ed45c5d   Michel Lespinasse   mmap locking API:...
1928
  		mmap_read_unlock(mm);
d3649f68b   Christoph Hellwig   mm: reorder code ...
1929
  	return ret;
4bbd4c776   Kirill A. Shutemov   mm: move get_user...
1930
  }
d3649f68b   Christoph Hellwig   mm: reorder code ...
1931
  EXPORT_SYMBOL(get_user_pages_unlocked);
2667f50e8   Steve Capper   mm: introduce a g...
1932
1933
  
  /*
67a929e09   Christoph Hellwig   mm: rename CONFIG...
1934
   * Fast GUP
2667f50e8   Steve Capper   mm: introduce a g...
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
   *
   * get_user_pages_fast attempts to pin user pages by walking the page
   * tables directly and avoids taking locks. Thus the walker needs to be
   * protected from page table pages being freed from under it, and should
   * block any THP splits.
   *
   * One way to achieve this is to have the walker disable interrupts, and
   * rely on IPIs from the TLB flushing code blocking before the page table
   * pages are freed. This is unsuitable for architectures that do not need
   * to broadcast an IPI when invalidating TLBs.
   *
   * Another way to achieve this is to batch up page table containing pages
   * belonging to more than one mm_user, then rcu_sched a callback to free those
   * pages. Disabling interrupts will allow the fast_gup walker to both block
   * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
   * (which is a relatively rare event). The code below adopts this strategy.
   *
   * Before activating this code, please be aware that the following assumptions
   * are currently made:
   *
ff2e6d725   Peter Zijlstra   asm-generic/tlb: ...
1955
   *  *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
e585513b7   Kirill A. Shutemov   x86/mm/gup: Switc...
1956
   *  free pages containing page tables or TLB flushing requires IPI broadcast.
2667f50e8   Steve Capper   mm: introduce a g...
1957
   *
2667f50e8   Steve Capper   mm: introduce a g...
1958
1959
1960
1961
1962
1963
1964
1965
   *  *) ptes can be read atomically by the architecture.
   *
   *  *) access_ok is sufficient to validate userspace address ranges.
   *
   * The last two assumptions can be relaxed by the addition of helper functions.
   *
   * This code is based heavily on the PowerPC implementation by Nick Piggin.
   */
67a929e09   Christoph Hellwig   mm: rename CONFIG...
1966
  #ifdef CONFIG_HAVE_FAST_GUP
39656e83d   Christoph Hellwig   mm: lift the x86_...
1967
  #ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
3faa52c03   John Hubbard   mm/gup: track FOL...
1968

39656e83d   Christoph Hellwig   mm: lift the x86_...
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
  /*
   * WARNING: only to be used in the get_user_pages_fast() implementation.
   *
   * With get_user_pages_fast(), we walk down the pagetables without taking any
   * locks.  For this we would like to load the pointers atomically, but sometimes
   * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE).  What
   * we do have is the guarantee that a PTE will only either go from not present
   * to present, or present to not present or both -- it will not switch to a
   * completely different present page without a TLB flush in between; something
   * that we are blocking by holding interrupts off.
   *
   * Setting ptes from not present to present goes:
   *
   *   ptep->pte_high = h;
   *   smp_wmb();
   *   ptep->pte_low = l;
   *
   * And present to not present goes:
   *
   *   ptep->pte_low = 0;
   *   smp_wmb();
   *   ptep->pte_high = 0;
   *
   * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
   * We load pte_high *after* loading pte_low, which ensures we don't see an older
   * value of pte_high.  *Then* we recheck pte_low, which ensures that we haven't
   * picked up a changed pte high. We might have gotten rubbish values from
   * pte_low and pte_high, but we are guaranteed that pte_low will not have the
   * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
   * operates on present ptes we're safe.
   */
  static inline pte_t gup_get_pte(pte_t *ptep)
  {
  	pte_t pte;
2667f50e8   Steve Capper   mm: introduce a g...
2003

39656e83d   Christoph Hellwig   mm: lift the x86_...
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
  	/*
  	 * Read the two halves separately, low half first, and retry until
  	 * pte_low reads the same before and after pte_high was loaded.
  	 */
  	do {
  		pte.pte_low = ptep->pte_low;
  		/* order the pte_low load before the pte_high load */
  		smp_rmb();
  		pte.pte_high = ptep->pte_high;
  		/* order the pte_high load before the pte_low re-read below */
  		smp_rmb();
  	} while (unlikely(pte.pte_low != ptep->pte_low));
  
  	return pte;
  }
  #else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
0005d20b2   Kirill A. Shutemov   mm/gup: Move page...
2014
  /*
39656e83d   Christoph Hellwig   mm: lift the x86_...
2015
   * We require that the PTE can be read atomically.
0005d20b2   Kirill A. Shutemov   mm/gup: Move page...
2016
2017
2018
   */
  static inline pte_t gup_get_pte(pte_t *ptep)
  {
481e980a7   Christophe Leroy   mm: Allow arches ...
2019
  	/* A single ptep_get() read of the entry; no retry loop is used here. */
  	return ptep_get(ptep);
0005d20b2   Kirill A. Shutemov   mm/gup: Move page...
2020
  }
39656e83d   Christoph Hellwig   mm: lift the x86_...
2021
  #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
0005d20b2   Kirill A. Shutemov   mm/gup: Move page...
2022

790c73690   Guenter Roeck   mm/gup.c: mark un...
2023
  /*
   * Walk @pages backwards from index *@nr - 1 down to @nr_start, clearing the
   * referenced flag of each page and dropping one reference on it.  On return
   * *@nr equals @nr_start.
   */
  static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
  					    unsigned int flags,
  					    struct page **pages)
  {
  	while (*nr != nr_start) {
  		struct page *victim;

  		(*nr)--;
  		victim = pages[*nr];

  		ClearPageReferenced(victim);

  		/* FOLL_PIN references go through unpin_user_page(). */
  		if (flags & FOLL_PIN)
  			unpin_user_page(victim);
  		else
  			put_page(victim);
  	}
  }
3010a5ea6   Laurent Dufour   mm: introduce ARC...
2037
  #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
2667f50e8   Steve Capper   mm: introduce a g...
2038
  /*
   * Lockless walk over the ptes under @pmd that cover [@addr, @end).
   *
   * For each pte a reference is taken on the mapped page with
   * try_grab_compound_head(), the pte is re-read to make sure it did not
   * change in the meantime, and the page is stored at @pages[*@nr], after
   * which *@nr is incremented.
   *
   * Returns 1 only if every pte in the range was handled.  Returns 0 as soon
   * as one pte cannot be handled here (protnone, access not permitted,
   * special, devmap with FOLL_LONGTERM, missing dev_pagemap, failed grab,
   * changed pte, or arch_make_page_accessible() failure).  Pages recorded
   * before the failing pte stay recorded in @pages / *@nr, except on the
   * missing-dev_pagemap path, which rolls them back via undo_dev_pagemap().
   */
  static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
  			 unsigned int flags, struct page **pages, int *nr)
  {
  	struct dev_pagemap *pgmap = NULL;
  	int nr_start = *nr, ret = 0;
  	pte_t *ptep, *ptem;

  	ptem = ptep = pte_offset_map(&pmd, addr);
  	do {
  		pte_t pte = gup_get_pte(ptep);
  		struct page *head, *page;

  		/*
  		 * Similar to the PMD case below, NUMA hinting must take slow
  		 * path using the pte_protnone check.
  		 */
  		if (pte_protnone(pte))
  			goto pte_unmap;

  		if (!pte_access_permitted(pte, flags & FOLL_WRITE))
  			goto pte_unmap;

  		if (pte_devmap(pte)) {
  			if (unlikely(flags & FOLL_LONGTERM))
  				goto pte_unmap;

  			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
  			if (unlikely(!pgmap)) {
  				undo_dev_pagemap(nr, nr_start, flags, pages);
  				goto pte_unmap;
  			}
  		} else if (pte_special(pte))
  			goto pte_unmap;

  		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
  		page = pte_page(pte);

  		head = try_grab_compound_head(page, 1, flags);
  		if (!head)
  			goto pte_unmap;

  		/* The pte changed under us: drop the reference and give up. */
  		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
  			put_compound_head(head, 1, flags);
  			goto pte_unmap;
  		}

  		VM_BUG_ON_PAGE(compound_head(page) != head, page);

  		/*
  		 * We need to make the page accessible if and only if we are
  		 * going to access its content (the FOLL_PIN case).  Please
  		 * see Documentation/core-api/pin_user_pages.rst for
  		 * details.
  		 *
  		 * The error code is deliberately not stored in 'ret': this
  		 * function returns 0 for failure, and a non-zero error code
  		 * would be indistinguishable from success.
  		 */
  		if (flags & FOLL_PIN) {
  			if (arch_make_page_accessible(page)) {
  				unpin_user_page(page);
  				goto pte_unmap;
  			}
  		}
  		SetPageReferenced(page);
  		pages[*nr] = page;
  		(*nr)++;

  	} while (ptep++, addr += PAGE_SIZE, addr != end);

  	ret = 1;

  pte_unmap:
  	if (pgmap)
  		put_dev_pagemap(pgmap);
  	pte_unmap(ptem);
  	return ret;
  }
  #else
  
  /*
   * If we can't determine whether or not a pte is special, then fail immediately
   * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
   * to be special.
   *
   * For a futex to be placed on a THP tail page, get_futex_key requires a
dadbb612f   Souptick Joarder   mm/gup.c: convert...
2116
   * get_user_pages_fast_only implementation that can pin pages. Thus it's still
2667f50e8   Steve Capper   mm: introduce a g...
2117
2118
2119
   * useful to have gup_huge_pmd even if we can't operate on ptes.
   */
  static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
b798bec47   Ira Weiny   mm/gup: change wr...
2120
  			 unsigned int flags, struct page **pages, int *nr)
2667f50e8   Steve Capper   mm: introduce a g...
2121
2122
2123
  {
  	/* No pte is ever handled in this configuration: report failure. */
  	return 0;
  }
3010a5ea6   Laurent Dufour   mm: introduce ARC...
2124
  #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
2667f50e8   Steve Capper   mm: introduce a g...
2125

175967318   Robin Murphy   mm: introduce ARC...
2126
  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
b59f65fa0   Kirill A. Shutemov   mm/gup: Implement...
2127
  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
86dfbed49   John Hubbard   mm/gup: pass a fl...
2128
2129
  			     unsigned long end, unsigned int flags,
  			     struct page **pages, int *nr)
b59f65fa0   Kirill A. Shutemov   mm/gup: Implement...
2130
2131
2132
2133
2134
2135
2136
2137
2138
  {
  	int nr_start = *nr;
  	struct dev_pagemap *pgmap = NULL;
  
  	do {
  		struct page *page = pfn_to_page(pfn);
  
  		pgmap = get_dev_pagemap(pfn, pgmap);
  		if (unlikely(!pgmap)) {
3b78d8347   John Hubbard   mm/gup: pass gup ...
2139
  			undo_dev_pagemap(nr, nr_start, flags, pages);
b59f65fa0   Kirill A. Shutemov   mm/gup: Implement...
2140
2141
2142
2143
  			return 0;
  		}
  		SetPageReferenced(page);
  		pages[*nr] = page;
3faa52c03   John Hubbard   mm/gup: track FOL...
2144
2145
2146
2147
  		if (unlikely(!try_grab_page(page, flags))) {
  			undo_dev_pagemap(nr, nr_start, flags, pages);
  			return 0;
  		}
b59f65fa0   Kirill A. Shutemov   mm/gup: Implement...
2148
2149
2150
  		(*nr)++;
  		pfn++;
  	} while (addr += PAGE_SIZE, addr != end);
832d7aa05   Christoph Hellwig   mm: optimize dev_...
2151
2152
2153
  
  	if (pgmap)
  		put_dev_pagemap(pgmap);
b59f65fa0   Kirill A. Shutemov   mm/gup: Implement...
2154
2155
  	return 1;
  }
a9b6de77b   Dan Williams   mm: fix __gup_dev...
2156
  /*
   * Grab the pages of a pmd-level device mapping for [@addr, @end) and then
   * confirm that *@pmdp still equals @orig.  If it does not, the references
   * just taken are rolled back.  Returns 1 on success, 0 otherwise.
   */
  static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
  				 unsigned long end, unsigned int flags,
  				 struct page **pages, int *nr)
  {
  	int first = *nr;
  	unsigned long pfn;

  	/* pfn of the page that backs @addr inside the pmd mapping */
  	pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);

  	if (!__gup_device_huge(pfn, addr, end, flags, pages, nr))
  		return 0;

  	if (likely(pmd_val(orig) == pmd_val(*pmdp)))
  		return 1;

  	undo_dev_pagemap(nr, first, flags, pages);
  	return 0;
  }
a9b6de77b   Dan Williams   mm: fix __gup_dev...
2173
  /*
   * Grab the pages of a pud-level device mapping for [@addr, @end) and then
   * confirm that *@pudp still equals @orig.  If it does not, the references
   * just taken are rolled back.  Returns 1 on success, 0 otherwise.
   */
  static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
  				 unsigned long end, unsigned int flags,
  				 struct page **pages, int *nr)
  {
  	int first = *nr;
  	unsigned long pfn;

  	/* pfn of the page that backs @addr inside the pud mapping */
  	pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);

  	if (!__gup_device_huge(pfn, addr, end, flags, pages, nr))
  		return 0;

  	if (likely(pud_val(orig) == pud_val(*pudp)))
  		return 1;

  	undo_dev_pagemap(nr, first, flags, pages);
  	return 0;
  }
  #else
a9b6de77b   Dan Williams   mm: fix __gup_dev...
2191
  /*
   * Fallback definition for the "#else" branch of the
   * CONFIG_ARCH_HAS_PTE_DEVMAP && CONFIG_TRANSPARENT_HUGEPAGE test.
   * NOTE(review): presumably no call to this survives compilation in such
   * configurations, hence the BUILD_BUG() - confirm against the callers.
   */
  static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
86dfbed49   John Hubbard   mm/gup: pass a fl...
2192
2193
  				 unsigned long end, unsigned int flags,
  				 struct page **pages, int *nr)
b59f65fa0   Kirill A. Shutemov   mm/gup: Implement...
2194
2195
2196
2197
  {
  	BUILD_BUG();
  	return 0;
  }
a9b6de77b   Dan Williams   mm: fix __gup_dev...
2198
  /*
   * Fallback definition for the "#else" branch of the
   * CONFIG_ARCH_HAS_PTE_DEVMAP && CONFIG_TRANSPARENT_HUGEPAGE test.
   * NOTE(review): presumably no call to this survives compilation in such
   * configurations, hence the BUILD_BUG() - confirm against the callers.
   */
  static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
86dfbed49   John Hubbard   mm/gup: pass a fl...
2199
2200
  				 unsigned long end, unsigned int flags,
  				 struct page **pages, int *nr)
b59f65fa0   Kirill A. Shutemov   mm/gup: Implement...
2201
2202
2203
2204
2205
  {
  	BUILD_BUG();
  	return 0;
  }
  #endif
a43e98208   John Hubbard   mm/gup: factor ou...
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
  static int record_subpages(struct page *page, unsigned long addr,
  			   unsigned long end, struct page **pages)
  {
  	int nr;
  
  	for (nr = 0; addr != end; addr += PAGE_SIZE)
  		pages[nr++] = page++;
  
  	return nr;
  }
cbd34da7d   Christoph Hellwig   mm: move the powe...
2216
2217
2218
2219
2220
2221
2222
2223
2224
  #ifdef CONFIG_ARCH_HAS_HUGEPD
  /*
   * Return the next @sz-aligned boundary above @addr, or @end if that comes
   * first.  Both values are compared after subtracting one, so that a
   * boundary or an @end of 0 (wrapped past the top of the address range) is
   * treated as the largest possible value rather than the smallest.
   */
  static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
  				      unsigned long sz)
  {
  	unsigned long boundary = (addr + sz) & ~(sz - 1);

  	if (boundary - 1 < end - 1)
  		return boundary;

  	return end;
  }
  
  /*
   * Handle one huge pte of size @sz that covers @addr.  @end is first clamped
   * to the end of that huge pte.  One reference per recorded subpage is taken
   * on the compound head, and the pte is re-read afterwards; if it changed,
   * the references are dropped again.  Returns 1 on success, 0 otherwise.
   */
  static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
  		       unsigned long end, unsigned int flags,
  		       struct page **pages, int *nr)
  {
  	unsigned long chunk_end = (addr + sz) & ~(sz - 1);
  	struct page *head, *first;
  	pte_t pte;
  	int refs;

  	if (chunk_end < end)
  		end = chunk_end;

  	pte = huge_ptep_get(ptep);
  	if (!pte_access_permitted(pte, flags & FOLL_WRITE))
  		return 0;

  	/* hugepages are never "special" */
  	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

  	head = pte_page(pte);
  	first = head + ((addr & (sz - 1)) >> PAGE_SHIFT);
  	refs = record_subpages(first, addr, end, pages + *nr);

  	head = try_grab_compound_head(head, refs, flags);
  	if (!head)
  		return 0;

  	if (likely(pte_val(pte) == pte_val(*ptep))) {
  		*nr += refs;
  		SetPageReferenced(head);
  		return 1;
  	}

  	/* The pte changed while we were grabbing: undo. */
  	put_compound_head(head, refs, flags);
  	return 0;
  }
  
  /*
   * Step through the huge ptes of @hugepd that cover [@addr, @end), handing
   * each one to gup_hugepte().  Returns 1 if all of them succeed, 0 as soon
   * as one fails.
   */
  static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
  		unsigned int pdshift, unsigned long end, unsigned int flags,
  		struct page **pages, int *nr)
  {
  	unsigned long sz = 1UL << hugepd_shift(hugepd);
  	pte_t *ptep = hugepte_offset(hugepd, addr, pdshift);
  	unsigned long next;

  	for (;;) {
  		next = hugepte_addr_end(addr, end, sz);

  		if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
  			return 0;

  		ptep++;
  		addr = next;
  		if (addr == end)
  			break;
  	}

  	return 1;
  }
  #else
  /*
   * Variant used when CONFIG_ARCH_HAS_HUGEPD is not set: ignores its
   * arguments and returns 0.
   */
  static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
0cd22afdc   John Hubbard   mm/gup: fix a mis...
2279
  		unsigned int pdshift, unsigned long end, unsigned int flags,
cbd34da7d   Christoph Hellwig   mm: move the powe...
2280
2281
2282
2283
2284
  		struct page **pages, int *nr)
  {
  	return 0;
  }
  #endif /* CONFIG_ARCH_HAS_HUGEPD */
2667f50e8   Steve Capper   mm: introduce a g...
2285
  /*
   * Handle a huge pmd covering [@addr, @end).  Device mappings are passed on
   * to __gup_device_huge_pmd() (and refused outright for FOLL_LONGTERM).
   * Otherwise one reference per recorded subpage is taken on the compound
   * head, and *@pmdp is re-read afterwards; if it no longer equals @orig the
   * references are dropped.  Returns 1 on success, 0 otherwise.
   */
  static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
  			unsigned long end, unsigned int flags,
  			struct page **pages, int *nr)
  {
  	struct page *head, *first;
  	int refs;

  	if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
  		return 0;

  	if (pmd_devmap(orig)) {
  		if (unlikely(flags & FOLL_LONGTERM))
  			return 0;

  		return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
  					     pages, nr);
  	}

  	/* subpage that backs @addr within the huge mapping */
  	first = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
  	refs = record_subpages(first, addr, end, pages + *nr);

  	head = try_grab_compound_head(pmd_page(orig), refs, flags);
  	if (!head)
  		return 0;

  	if (likely(pmd_val(orig) == pmd_val(*pmdp))) {
  		*nr += refs;
  		SetPageReferenced(head);
  		return 1;
  	}

  	/* The pmd changed while we were grabbing: undo. */
  	put_compound_head(head, refs, flags);
  	return 0;
  }
  
  /*
   * Handle a huge pud covering [@addr, @end).  Device mappings are passed on
   * to __gup_device_huge_pud() (and refused outright for FOLL_LONGTERM).
   * Otherwise one reference per recorded subpage is taken on the compound
   * head, and *@pudp is re-read afterwards; if it no longer equals @orig the
   * references are dropped.  Returns 1 on success, 0 otherwise.
   */
  static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
  			unsigned long end, unsigned int flags,
  			struct page **pages, int *nr)
  {
  	struct page *head, *first;
  	int refs;

  	if (!pud_access_permitted(orig, flags & FOLL_WRITE))
  		return 0;

  	if (pud_devmap(orig)) {
  		if (unlikely(flags & FOLL_LONGTERM))
  			return 0;

  		return __gup_device_huge_pud(orig, pudp, addr, end, flags,
  					     pages, nr);
  	}

  	/* subpage that backs @addr within the huge mapping */
  	first = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
  	refs = record_subpages(first, addr, end, pages + *nr);

  	head = try_grab_compound_head(pud_page(orig), refs, flags);
  	if (!head)
  		return 0;

  	if (likely(pud_val(orig) == pud_val(*pudp))) {
  		*nr += refs;
  		SetPageReferenced(head);
  		return 1;
  	}

  	/* The pud changed while we were grabbing: undo. */
  	put_compound_head(head, refs, flags);
  	return 0;
  }
f30c59e92   Aneesh Kumar K.V   mm: Update generi...
2346
  /*
   * Handle a huge pgd covering [@addr, @end).  One reference per recorded
   * subpage is taken on the compound head, and *@pgdp is re-read afterwards;
   * if it no longer equals @orig the references are dropped.  Returns 1 on
   * success, 0 otherwise.
   */
  static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
  			unsigned long end, unsigned int flags,
  			struct page **pages, int *nr)
  {
  	struct page *head, *first;
  	int refs;

  	if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
  		return 0;

  	BUILD_BUG_ON(pgd_devmap(orig));

  	/* subpage that backs @addr within the huge mapping */
  	first = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
  	refs = record_subpages(first, addr, end, pages + *nr);

  	head = try_grab_compound_head(pgd_page(orig), refs, flags);
  	if (!head)
  		return 0;

  	if (likely(pgd_val(orig) == pgd_val(*pgdp))) {
  		*nr += refs;
  		SetPageReferenced(head);
  		return 1;
  	}

  	/* The pgd changed while we were grabbing: undo. */
  	put_compound_head(head, refs, flags);
  	return 0;
  }
d3f7b1bb2   Vasily Gorbik   mm/gup: fix gup_f...
2372
  /*
   * Walk the PMD entries covering [@addr, @end) below @pud without taking
   * any page table lock, handing each entry to the matching helper (huge
   * PMD, hugepd, or regular PTE table).
   *
   * Returns 1 if the whole range was processed, 0 as soon as any entry
   * cannot be handled here; pages recorded so far stay accounted in *@nr.
   */
  static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
  		unsigned int flags, struct page **pages, int *nr)
  {
  	pmd_t *pmdp = pmd_offset_lockless(pudp, pud, addr);
  	unsigned long next;
  
  	for (;;) {
  		/* Snapshot the entry once; all checks below use the copy. */
  		pmd_t pmd = READ_ONCE(*pmdp);
  
  		next = pmd_addr_end(addr, end);
  		if (!pmd_present(pmd))
  			return 0;
  
  		if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
  			     pmd_devmap(pmd))) {
  			/*
  			 * NUMA hinting faults must go through the GUP slow
  			 * path, both for accounting and so that they can be
  			 * serialised against THP migration.
  			 */
  			if (pmd_protnone(pmd) ||
  			    !gup_huge_pmd(pmd, pmdp, addr, next, flags,
  					  pages, nr))
  				return 0;
  		} else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
  			/*
  			 * Some architectures use a different format for
  			 * hugetlbfs PMDs than for THP PMDs.
  			 */
  			if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
  					 PMD_SHIFT, next, flags, pages, nr))
  				return 0;
  		} else {
  			if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
  				return 0;
  		}
  
  		/* Advance to the next PMD slot, stopping at the range end. */
  		pmdp++;
  		addr = next;
  		if (addr == end)
  			break;
  	}
  
  	return 1;
  }
d3f7b1bb2   Vasily Gorbik   mm/gup: fix gup_f...
2410
  /*
   * Walk the PUD entries covering [@addr, @end) below @p4d without taking
   * any page table lock, handing each entry to the matching helper (huge
   * PUD, hugepd, or the PMD level).
   *
   * Returns 1 if the whole range was processed, 0 as soon as any entry
   * cannot be handled here; pages recorded so far stay accounted in *@nr.
   */
  static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
  			 unsigned int flags, struct page **pages, int *nr)
  {
  	unsigned long next;
  	pud_t *pudp;
  
  	pudp = pud_offset_lockless(p4dp, p4d, addr);
  	do {
  		/* Snapshot the entry once; all checks below use the copy. */
  		pud_t pud = READ_ONCE(*pudp);
  
  		next = pud_addr_end(addr, end);
  		if (unlikely(!pud_present(pud)))
  			return 0;
  		/*
  		 * A devmap (DAX) PUD is only reported by pud_huge() when
  		 * hugetlb support is configured, so test pud_devmap()
  		 * explicitly, mirroring the pmd_devmap() test done in
  		 * gup_pmd_range(). Otherwise such a leaf entry would be
  		 * descended into as if it pointed to a PMD table.
  		 */
  		if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
  			if (!gup_huge_pud(pud, pudp, addr, next, flags,
  					  pages, nr))
  				return 0;
  		} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
  			if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
  					 PUD_SHIFT, next, flags, pages, nr))
  				return 0;
  		} else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
  			return 0;
  	} while (pudp++, addr = next, addr != end);
  
  	return 1;
  }
d3f7b1bb2   Vasily Gorbik   mm/gup: fix gup_f...
2436
  /*
   * Walk the P4D entries covering [@addr, @end) below @pgd without taking
   * any page table lock. Each entry is either a hugepd, handled by
   * gup_huge_pd(), or is descended into through gup_pud_range().
   *
   * Returns 1 if the whole range was processed, 0 as soon as any entry
   * cannot be handled here; pages recorded so far stay accounted in *@nr.
   */
  static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
  			 unsigned int flags, struct page **pages, int *nr)
  {
  	p4d_t *p4dp = p4d_offset_lockless(pgdp, pgd, addr);
  	unsigned long next;
  
  	for (;;) {
  		/* Snapshot the entry once; all checks below use the copy. */
  		p4d_t p4d = READ_ONCE(*p4dp);
  
  		next = p4d_addr_end(addr, end);
  		if (p4d_none(p4d))
  			return 0;
  
  		/* Huge mappings at P4D level are not expected to exist. */
  		BUILD_BUG_ON(p4d_huge(p4d));
  
  		if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
  			if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
  					 P4D_SHIFT, next, flags, pages, nr))
  				return 0;
  		} else {
  			if (!gup_pud_range(p4dp, p4d, addr, next, flags,
  					   pages, nr))
  				return 0;
  		}
  
  		/* Advance to the next P4D slot, stopping at the range end. */
  		p4dp++;
  		addr = next;
  		if (addr == end)
  			break;
  	}
  
  	return 1;
  }
5b65c4677   Kirill A. Shutemov   mm, x86/mm: Fix p...
2459
  static void gup_pgd_range(unsigned long addr, unsigned long end,
b798bec47   Ira Weiny   mm/gup: change wr...
2460
  		unsigned int flags, struct page **pages, int *nr)
5b65c4677   Kirill A. Shutemov   mm, x86/mm: Fix p...
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
  {
  	unsigned long next;
  	pgd_t *pgdp;
  
  	pgdp = pgd_offset(current->mm, addr);
  	do {
  		pgd_t pgd = READ_ONCE(*pgdp);
  
  		next = pgd_addr_end(addr, end);
  		if (pgd_none(pgd))
  			return;
  		if (unlikely(pgd_huge(pgd))) {
b798bec47   Ira Weiny   mm/gup: change wr...
2473
  			if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
5b65c4677   Kirill A. Shutemov   mm, x86/mm: Fix p...
2474
2475
2476
2477
  					  pages, nr))
  				return;
  		} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
  			if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
b798bec47   Ira Weiny   mm/gup: change wr...
2478
  					 PGDIR_SHIFT, next, flags, pages, nr))
5b65c4677   Kirill A. Shutemov   mm, x86/mm: Fix p...
2479
  				return;
d3f7b1bb2   Vasily Gorbik   mm/gup: fix gup_f...
2480
  		} else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
5b65c4677   Kirill A. Shutemov   mm, x86/mm: Fix p...
2481
2482
2483
  			return;
  	} while (pgdp++, addr = next, addr != end);
  }
050a9adc6   Christoph Hellwig   mm: consolidate t...
2484
2485
2486
2487
2488
2489
  #else
  /*
   * Stub used when CONFIG_HAVE_FAST_GUP is not enabled: there is no lockless
   * walk, so nothing is recorded and *nr is left untouched.
   */
  static inline void gup_pgd_range(unsigned long addr, unsigned long end,
  		unsigned int flags, struct page **pages, int *nr)
  {
  }
  #endif /* CONFIG_HAVE_FAST_GUP */
5b65c4677   Kirill A. Shutemov   mm, x86/mm: Fix p...
2490
2491
2492
  
  #ifndef gup_fast_permitted
  /*
dadbb612f   Souptick Joarder   mm/gup.c: convert...
2493
   * Check if it's allowed to use get_user_pages_fast_only() for the range, or
5b65c4677   Kirill A. Shutemov   mm, x86/mm: Fix p...
2494
2495
   * we need to fall back to the slow version:
   */
26f4c3280   Christoph Hellwig   mm: simplify gup_...
2496
  static bool gup_fast_permitted(unsigned long start, unsigned long end)
5b65c4677   Kirill A. Shutemov   mm, x86/mm: Fix p...
2497
  {
26f4c3280   Christoph Hellwig   mm: simplify gup_...
2498
  	return true;
5b65c4677   Kirill A. Shutemov   mm, x86/mm: Fix p...
2499
2500
  }
  #endif
7af75561e   Ira Weiny   mm/gup: add FOLL_...
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
  static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
  				   unsigned int gup_flags, struct page **pages)
  {
  	int ret;
  
  	/*
  	 * FIXME: FOLL_LONGTERM does not work with
  	 * get_user_pages_unlocked() (see comments in that function)
  	 */
  	if (gup_flags & FOLL_LONGTERM) {
d8ed45c5d   Michel Lespinasse   mmap locking API:...
2511
  		mmap_read_lock(current->mm);
64019a2e4   Peter Xu   mm/gup: remove ta...
2512
  		ret = __gup_longterm_locked(current->mm,
7af75561e   Ira Weiny   mm/gup: add FOLL_...
2513
2514
  					    start, nr_pages,
  					    pages, NULL, gup_flags);
d8ed45c5d   Michel Lespinasse   mmap locking API:...
2515
  		mmap_read_unlock(current->mm);
7af75561e   Ira Weiny   mm/gup: add FOLL_...
2516
2517
2518
2519
2520
2521
2522
  	} else {
  		ret = get_user_pages_unlocked(start, nr_pages,
  					      pages, gup_flags);
  	}
  
  	return ret;
  }
bcb0f647c   Jason Gunthorpe   mm/gup: reorganiz...
2523
2524
2525
2526
2527
2528
2529
  static unsigned long lockless_pages_from_mm(unsigned long start,
  					    unsigned long end,
  					    unsigned int gup_flags,
  					    struct page **pages)
  {
  	unsigned long flags;
  	int nr_pinned = 0;
537946556   Jason Gunthorpe   mm/gup: prevent g...
2530
  	unsigned seq;
bcb0f647c   Jason Gunthorpe   mm/gup: reorganiz...
2531
2532
2533
2534
  
  	if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
  	    !gup_fast_permitted(start, end))
  		return 0;
537946556   Jason Gunthorpe   mm/gup: prevent g...
2535
2536
2537
2538
2539
  	if (gup_flags & FOLL_PIN) {
  		seq = raw_read_seqcount(&current->mm->write_protect_seq);
  		if (seq & 1)
  			return 0;
  	}
bcb0f647c   Jason Gunthorpe   mm/gup: reorganiz...
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
  	/*
  	 * Disable interrupts. The nested form is used, in order to allow full,
  	 * general purpose use of this routine.
  	 *
  	 * With interrupts disabled, we block page table pages from being freed
  	 * from under us. See struct mmu_table_batch comments in
  	 * include/asm-generic/tlb.h for more details.
  	 *
  	 * We do not adopt an rcu_read_lock() here as we also want to block IPIs
  	 * that come from THPs splitting.
  	 */
  	local_irq_save(flags);
  	gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
  	local_irq_restore(flags);
537946556   Jason Gunthorpe   mm/gup: prevent g...
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
  
  	/*
  	 * When pinning pages for DMA there could be a concurrent write protect
  	 * from fork() via copy_page_range(), in this case always fail fast GUP.
  	 */
  	if (gup_flags & FOLL_PIN) {
  		if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
  			unpin_user_pages(pages, nr_pinned);
  			return 0;
  		}
  	}
bcb0f647c   Jason Gunthorpe   mm/gup: reorganiz...
2565
2566
2567
2568
2569
  	return nr_pinned;
  }
  
  static int internal_get_user_pages_fast(unsigned long start,
  					unsigned long nr_pages,
eddb1c228   John Hubbard   mm/gup: introduce...
2570
2571
  					unsigned int gup_flags,
  					struct page **pages)
2667f50e8   Steve Capper   mm: introduce a g...
2572
  {
bcb0f647c   Jason Gunthorpe   mm/gup: reorganiz...
2573
2574
2575
  	unsigned long len, end;
  	unsigned long nr_pinned;
  	int ret;
2667f50e8   Steve Capper   mm: introduce a g...
2576

f4000fdf4   John Hubbard   mm/gup: allow FOL...
2577
  	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
376a34efa   John Hubbard   mm/gup: refactor ...
2578
2579
  				       FOLL_FORCE | FOLL_PIN | FOLL_GET |
  				       FOLL_FAST_ONLY)))
817be129e   Christoph Hellwig   mm: validate get_...
2580
  		return -EINVAL;
008cfe441   Peter Xu   mm: Introduce mm_...
2581
2582
  	if (gup_flags & FOLL_PIN)
  		atomic_set(&current->mm->has_pinned, 1);
f81cd178e   John Hubbard   mm/gup: might_loc...
2583
  	if (!(gup_flags & FOLL_FAST_ONLY))
da1c55f1b   Michel Lespinasse   mmap locking API:...
2584
  		might_lock_read(&current->mm->mmap_lock);
f81cd178e   John Hubbard   mm/gup: might_loc...
2585

f455c8548   Christoph Hellwig   mm: use untagged_...
2586
  	start = untagged_addr(start) & PAGE_MASK;
bcb0f647c   Jason Gunthorpe   mm/gup: reorganiz...
2587
2588
  	len = nr_pages << PAGE_SHIFT;
  	if (check_add_overflow(start, len, &end))
c61611f70   Michael S. Tsirkin   get_user_pages_fa...
2589
  		return 0;
96d4f267e   Linus Torvalds   Remove 'type' arg...
2590
  	if (unlikely(!access_ok((void __user *)start, len)))
c61611f70   Michael S. Tsirkin   get_user_pages_fa...
2591
  		return -EFAULT;
73e10a618   Kirill A. Shutemov   mm/gup: Provide c...
2592

bcb0f647c   Jason Gunthorpe   mm/gup: reorganiz...
2593
2594
2595
  	nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
  	if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
  		return nr_pinned;
2667f50e8   Steve Capper   mm: introduce a g...
2596

bcb0f647c   Jason Gunthorpe   mm/gup: reorganiz...
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
  	/* Slow path: try to get the remaining pages with get_user_pages */
  	start += nr_pinned << PAGE_SHIFT;
  	pages += nr_pinned;
  	ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
  				      pages);
  	if (ret < 0) {
  		/*
  		 * The caller has to unpin the pages we already pinned so
  		 * returning -errno is not an option
  		 */
  		if (nr_pinned)
  			return nr_pinned;
  		return ret;
2667f50e8   Steve Capper   mm: introduce a g...
2610
  	}
bcb0f647c   Jason Gunthorpe   mm/gup: reorganiz...
2611
  	return ret + nr_pinned;
2667f50e8   Steve Capper   mm: introduce a g...
2612
  }
bcb0f647c   Jason Gunthorpe   mm/gup: reorganiz...
2613

dadbb612f   Souptick Joarder   mm/gup.c: convert...
2614
2615
2616
2617
2618
2619
2620
2621
  /**
   * get_user_pages_fast_only() - pin user pages in memory
   * @start:      starting user address
   * @nr_pages:   number of pages from start to pin
   * @gup_flags:  flags modifying pin behaviour
   * @pages:      array that receives pointers to the pages pinned.
   *              Should be at least nr_pages long.
   *
9e1f0580d   John Hubbard   mm/gup: move __ge...
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
   * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
   * the regular GUP.
   * Note a difference with get_user_pages_fast: this always returns the
   * number of pages pinned, 0 if no pages were pinned.
   *
   * If the architecture does not support this function, simply return with no
   * pages pinned.
   *
   * Careful, careful! COW breaking can go either way, so a non-write
   * access can get ambiguous page results. If you call this function without
   * 'write' set, you'd better be sure that you're ok with that ambiguity.
   */
dadbb612f   Souptick Joarder   mm/gup.c: convert...
2634
2635
  int get_user_pages_fast_only(unsigned long start, int nr_pages,
  			     unsigned int gup_flags, struct page **pages)
9e1f0580d   John Hubbard   mm/gup: move __ge...
2636
  {
376a34efa   John Hubbard   mm/gup: refactor ...
2637
  	int nr_pinned;
9e1f0580d   John Hubbard   mm/gup: move __ge...
2638
2639
2640
  	/*
  	 * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
  	 * because gup fast is always a "pin with a +1 page refcount" request.
376a34efa   John Hubbard   mm/gup: refactor ...
2641
2642
2643
  	 *
  	 * FOLL_FAST_ONLY is required in order to match the API description of
  	 * this routine: no fall back to regular ("slow") GUP.
9e1f0580d   John Hubbard   mm/gup: move __ge...
2644
  	 */
dadbb612f   Souptick Joarder   mm/gup.c: convert...
2645
  	gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
9e1f0580d   John Hubbard   mm/gup: move __ge...
2646

376a34efa   John Hubbard   mm/gup: refactor ...
2647
2648
  	nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
  						 pages);
9e1f0580d   John Hubbard   mm/gup: move __ge...
2649
2650
  
  	/*
376a34efa   John Hubbard   mm/gup: refactor ...
2651
2652
2653
2654
  	 * As specified in the API description above, this routine is not
  	 * allowed to return negative values. However, the common core
  	 * routine internal_get_user_pages_fast() *can* return -errno.
  	 * Therefore, correct for that here:
9e1f0580d   John Hubbard   mm/gup: move __ge...
2655
  	 */
376a34efa   John Hubbard   mm/gup: refactor ...
2656
2657
  	if (nr_pinned < 0)
  		nr_pinned = 0;
9e1f0580d   John Hubbard   mm/gup: move __ge...
2658
2659
2660
  
  	return nr_pinned;
  }
dadbb612f   Souptick Joarder   mm/gup.c: convert...
2661
  EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
9e1f0580d   John Hubbard   mm/gup: move __ge...
2662

eddb1c228   John Hubbard   mm/gup: introduce...
2663
2664
  /**
   * get_user_pages_fast() - pin user pages in memory
3faa52c03   John Hubbard   mm/gup: track FOL...
2665
2666
2667
2668
2669
   * @start:      starting user address
   * @nr_pages:   number of pages from start to pin
   * @gup_flags:  flags modifying pin behaviour
   * @pages:      array that receives pointers to the pages pinned.
   *              Should be at least nr_pages long.
eddb1c228   John Hubbard   mm/gup: introduce...
2670
   *
c1e8d7c6a   Michel Lespinasse   mmap locking API:...
2671
   * Attempt to pin user pages in memory without taking mm->mmap_lock.
eddb1c228   John Hubbard   mm/gup: introduce...
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
   * If not successful, it will fall back to taking the lock and
   * calling get_user_pages().
   *
   * Returns number of pages pinned. This may be fewer than the number requested.
   * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
   * -errno.
   */
  int get_user_pages_fast(unsigned long start, int nr_pages,
  			unsigned int gup_flags, struct page **pages)
  {
447f3e45c   Barry Song   mm/gup: don't per...
2682
  	if (!is_valid_gup_flags(gup_flags))
eddb1c228   John Hubbard   mm/gup: introduce...
2683
  		return -EINVAL;
94202f126   John Hubbard   mm/gup: require F...
2684
2685
2686
2687
2688
2689
2690
  	/*
  	 * The caller may or may not have explicitly set FOLL_GET; either way is
  	 * OK. However, internally (within mm/gup.c), gup fast variants must set
  	 * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
  	 * request.
  	 */
  	gup_flags |= FOLL_GET;
eddb1c228   John Hubbard   mm/gup: introduce...
2691
2692
  	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
  }
050a9adc6   Christoph Hellwig   mm: consolidate t...
2693
  EXPORT_SYMBOL_GPL(get_user_pages_fast);
eddb1c228   John Hubbard   mm/gup: introduce...
2694
2695
2696
2697
  
  /**
   * pin_user_pages_fast() - pin user pages in memory without taking locks
   *
3faa52c03   John Hubbard   mm/gup: track FOL...
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
   * @start:      starting user address
   * @nr_pages:   number of pages from start to pin
   * @gup_flags:  flags modifying pin behaviour
   * @pages:      array that receives pointers to the pages pinned.
   *              Should be at least nr_pages long.
   *
   * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
   * get_user_pages_fast() for documentation on the function arguments, because
   * the arguments here are identical.
   *
   * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
72ef5e52b   Mauro Carvalho Chehab   docs: fix broken ...
2709
   * see Documentation/core-api/pin_user_pages.rst for further details.
eddb1c228   John Hubbard   mm/gup: introduce...
2710
2711
2712
2713
   */
  int pin_user_pages_fast(unsigned long start, int nr_pages,
  			unsigned int gup_flags, struct page **pages)
  {
3faa52c03   John Hubbard   mm/gup: track FOL...
2714
2715
2716
2717
2718
2719
  	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
  	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
  		return -EINVAL;
  
  	gup_flags |= FOLL_PIN;
  	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
eddb1c228   John Hubbard   mm/gup: introduce...
2720
2721
  }
  EXPORT_SYMBOL_GPL(pin_user_pages_fast);
104acc327   John Hubbard   mm/gup: introduce...
2722
  /*
   * FOLL_PIN counterpart of get_user_pages_fast_only(): same behavior, but
   * FOLL_PIN is set instead of FOLL_GET.
   *
   * The API rules carry over as well: the return value is never negative.
   */
  int pin_user_pages_fast_only(unsigned long start, int nr_pages,
  			     unsigned int gup_flags, struct page **pages)
  {
  	int ret;
  
  	/*
  	 * FOLL_GET and FOLL_PIN are mutually exclusive. Since this routine
  	 * may not return -errno, a bad request reports 0 pages pinned.
  	 */
  	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
  		return 0;
  
  	/*
  	 * FOLL_FAST_ONLY is what implements the "no fall back to regular
  	 * (slow) GUP" part of the API description.
  	 */
  	gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
  	ret = internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
  
  	/*
  	 * internal_get_user_pages_fast() *can* return -errno, which this
  	 * routine is not allowed to pass on: map errors to 0.
  	 */
  	return ret < 0 ? 0 : ret;
  }
  EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
eddb1c228   John Hubbard   mm/gup: introduce...
2757
  /**
64019a2e4   Peter Xu   mm/gup: remove ta...
2758
   * pin_user_pages_remote() - pin pages of a remote process
eddb1c228   John Hubbard   mm/gup: introduce...
2759
   *
3faa52c03   John Hubbard   mm/gup: track FOL...
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
   * @mm:		mm_struct of target mm
   * @start:	starting user address
   * @nr_pages:	number of pages from start to pin
   * @gup_flags:	flags modifying lookup behaviour
   * @pages:	array that receives pointers to the pages pinned.
   *		Should be at least nr_pages long. Or NULL, if caller
   *		only intends to ensure the pages are faulted in.
   * @vmas:	array of pointers to vmas corresponding to each page.
   *		Or NULL if the caller does not require them.
   * @locked:	pointer to lock flag indicating whether lock is held and
   *		subsequently whether VM_FAULT_RETRY functionality can be
   *		utilised. Lock must initially be held.
   *
   * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
   * get_user_pages_remote() for documentation on the function arguments, because
   * the arguments here are identical.
   *
   * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
72ef5e52b   Mauro Carvalho Chehab   docs: fix broken ...
2778
   * see Documentation/core-api/pin_user_pages.rst for details.
eddb1c228   John Hubbard   mm/gup: introduce...
2779
   */
64019a2e4   Peter Xu   mm/gup: remove ta...
2780
  long pin_user_pages_remote(struct mm_struct *mm,
eddb1c228   John Hubbard   mm/gup: introduce...
2781
2782
2783
2784
  			   unsigned long start, unsigned long nr_pages,
  			   unsigned int gup_flags, struct page **pages,
  			   struct vm_area_struct **vmas, int *locked)
  {
3faa52c03   John Hubbard   mm/gup: track FOL...
2785
2786
2787
2788
2789
  	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
  	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
  		return -EINVAL;
  
  	gup_flags |= FOLL_PIN;
64019a2e4   Peter Xu   mm/gup: remove ta...
2790
  	return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
3faa52c03   John Hubbard   mm/gup: track FOL...
2791
  				       pages, vmas, locked);
eddb1c228   John Hubbard   mm/gup: introduce...
2792
2793
2794
2795
2796
2797
  }
  EXPORT_SYMBOL(pin_user_pages_remote);
  
  /**
   * pin_user_pages() - pin user pages in memory for use by other devices
   *
3faa52c03   John Hubbard   mm/gup: track FOL...
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
   * @start:	starting user address
   * @nr_pages:	number of pages from start to pin
   * @gup_flags:	flags modifying lookup behaviour
   * @pages:	array that receives pointers to the pages pinned.
   *		Should be at least nr_pages long. Or NULL, if caller
   *		only intends to ensure the pages are faulted in.
   * @vmas:	array of pointers to vmas corresponding to each page.
   *		Or NULL if the caller does not require them.
   *
   * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
   * FOLL_PIN is set.
   *
   * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
72ef5e52b   Mauro Carvalho Chehab   docs: fix broken ...
2811
   * see Documentation/core-api/pin_user_pages.rst for details.
eddb1c228   John Hubbard   mm/gup: introduce...
2812
2813
2814
2815
2816
   */
  long pin_user_pages(unsigned long start, unsigned long nr_pages,
  		    unsigned int gup_flags, struct page **pages,
  		    struct vm_area_struct **vmas)
  {
3faa52c03   John Hubbard   mm/gup: track FOL...
2817
2818
2819
2820
2821
  	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
  	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
  		return -EINVAL;
  
  	gup_flags |= FOLL_PIN;
64019a2e4   Peter Xu   mm/gup: remove ta...
2822
  	return __gup_longterm_locked(current->mm, start, nr_pages,
3faa52c03   John Hubbard   mm/gup: track FOL...
2823
  				     pages, vmas, gup_flags);
eddb1c228   John Hubbard   mm/gup: introduce...
2824
2825
  }
  EXPORT_SYMBOL(pin_user_pages);
914290233   John Hubbard   mm/gup: introduce...
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
  
  /*
   * FOLL_PIN variant of get_user_pages_unlocked(): behaves the same, except
   * that FOLL_PIN is set and a request carrying FOLL_GET is rejected with
   * -EINVAL.
   */
  long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
  			     struct page **pages, unsigned int gup_flags)
  {
  	/* FOLL_PIN is about to be set, and it cannot be mixed with FOLL_GET. */
  	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
  		return -EINVAL;
  
  	return get_user_pages_unlocked(start, nr_pages, pages,
  				       gup_flags | FOLL_PIN);
  }
  EXPORT_SYMBOL(pin_user_pages_unlocked);
420c2091b   John Hubbard   mm/gup: introduce...
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
  
  /*
   * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked().
   * Behavior is the same, except that this one sets FOLL_PIN and rejects
   * FOLL_GET.
   */
  long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
  			   unsigned int gup_flags, struct page **pages,
  			   int *locked)
  {
  	/*
  	 * FIXME: Current FOLL_LONGTERM behavior is incompatible with
  	 * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
  	 * vmas.  As there are no users of this flag in this call we simply
  	 * disallow this option for now.
  	 */
  	if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
  		return -EINVAL;
  
  	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
  	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
  		return -EINVAL;
  
  	gup_flags |= FOLL_PIN;
64019a2e4   Peter Xu   mm/gup: remove ta...
2867
  	return __get_user_pages_locked(current->mm, start, nr_pages,
420c2091b   John Hubbard   mm/gup: introduce...
2868
2869
2870
2871
  				       pages, NULL, locked,
  				       gup_flags | FOLL_TOUCH);
  }
  EXPORT_SYMBOL(pin_user_pages_locked);