// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/userfaultfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *  Copyright (C) 2008-2009 Red Hat, Inc.
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  Some part derived from fs/eventfd.c (anon inode setup) and
 *  mm/ksm.c (mm hashing).
 */
#include <linux/list.h>
#include <linux/hashtable.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/bug.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>

int sysctl_unprivileged_userfaultfd __read_mostly = 1;

static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
enum userfaultfd_state {
	UFFD_STATE_WAIT_API,
	UFFD_STATE_RUNNING,
};

/*
 * Start with fault_pending_wqh and fault_wqh so they're more likely
 * to be in the same cacheline.
 *
 * Locking order:
 *	fd_wqh.lock
 *		fault_pending_wqh.lock
 *			fault_wqh.lock
 *		event_wqh.lock
 *
 * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
 * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
 * also taken in IRQ context.
 */
struct userfaultfd_ctx {
	/* waitqueue head for the pending (i.e. not read) userfaults */
	wait_queue_head_t fault_pending_wqh;
	/* waitqueue head for the userfaults */
	wait_queue_head_t fault_wqh;
	/* waitqueue head for the pseudo fd to wakeup poll/read */
	wait_queue_head_t fd_wqh;
	/* waitqueue head for events */
	wait_queue_head_t event_wqh;
	/* a refile sequence protected by fault_pending_wqh lock */
	seqcount_spinlock_t refile_seq;
	/* pseudo fd refcounting */
	refcount_t refcount;
	/* userfaultfd syscall flags */
	unsigned int flags;
	/* features requested from the userspace */
	unsigned int features;
	/* state machine */
	enum userfaultfd_state state;
	/* released */
	bool released;
	/* memory mappings are changing because of non-cooperative event */
	bool mmap_changing;
	/* mm with one or more vmas attached to this userfaultfd_ctx */
	struct mm_struct *mm;
};
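
/*
 * For illustration only (a hedged sketch, not a call site in this file):
 * nested acquisition respecting the locking order documented above looks
 * like what userfaultfd_ctx_read() below does, i.e.:
 *
 *	spin_lock_irq(&ctx->fd_wqh.lock);
 *	spin_lock(&ctx->fault_pending_wqh.lock);
 *	...
 *	spin_unlock(&ctx->fault_pending_wqh.lock);
 *	spin_unlock_irq(&ctx->fd_wqh.lock);
 *
 * IRQs stay disabled across the whole nested section because only the
 * outermost lock is taken with the _irq variant.
 */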
struct userfaultfd_fork_ctx {
	struct userfaultfd_ctx *orig;
	struct userfaultfd_ctx *new;
	struct list_head list;
};

struct userfaultfd_unmap_ctx {
	struct userfaultfd_ctx *ctx;
	unsigned long start;
	unsigned long end;
	struct list_head list;
};

struct userfaultfd_wait_queue {
	struct uffd_msg msg;
	wait_queue_entry_t wq;
	struct userfaultfd_ctx *ctx;
	bool waken;
};

struct userfaultfd_wake_range {
	unsigned long start;
	unsigned long len;
};
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
				     int wake_flags, void *key)
{
	struct userfaultfd_wake_range *range = key;
	int ret;
	struct userfaultfd_wait_queue *uwq;
	unsigned long start, len;

	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
	ret = 0;
	/* len == 0 means wake all */
	start = range->start;
	len = range->len;
	if (len && (start > uwq->msg.arg.pagefault.address ||
		    start + len <= uwq->msg.arg.pagefault.address))
		goto out;
	WRITE_ONCE(uwq->waken, true);
	/*
	 * The Program-Order guarantees provided by the scheduler
	 * ensure uwq->waken is visible before the task is woken.
	 */
	ret = wake_up_state(wq->private, mode);
	if (ret) {
		/*
		 * Wake only once, autoremove behavior.
		 *
		 * After the effect of list_del_init is visible to the other
		 * CPUs, the waitqueue may disappear from under us, see the
		 * !list_empty_careful() in handle_userfault().
		 *
		 * try_to_wake_up() has an implicit smp_mb(), and the
		 * wq->private is read before calling the extern function
		 * "wake_up_state" (which in turn calls try_to_wake_up).
		 */
		list_del_init(&wq->entry);
	}
out:
	return ret;
}

/**
 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to the userfaultfd context.
 */
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
	refcount_inc(&ctx->refcount);
}

/**
 * userfaultfd_ctx_put - Releases a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to userfaultfd context.
 *
 * The userfaultfd context reference must have been previously acquired either
 * with userfaultfd_ctx_get() or userfaultfd_ctx_fdget().
 */
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
	if (refcount_dec_and_test(&ctx->refcount)) {
		VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
		mmdrop(ctx->mm);
		kmem_cache_free(userfaultfd_ctx_cachep, ctx);
	}
}
static inline void msg_init(struct uffd_msg *msg)
{
	BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
	/*
	 * Must use memset to zero out the padding or kernel data is
	 * leaked to userland.
	 */
	memset(msg, 0, sizeof(struct uffd_msg));
}

static inline struct uffd_msg userfault_msg(unsigned long address,
					    unsigned int flags,
					    unsigned long reason,
					    unsigned int features)
{
	struct uffd_msg msg;

	msg_init(&msg);
	msg.event = UFFD_EVENT_PAGEFAULT;
	msg.arg.pagefault.address = address;
	if (flags & FAULT_FLAG_WRITE)
		/*
		 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
		 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
		 * was not set in a UFFD_EVENT_PAGEFAULT, it means it
		 * was a read fault, otherwise if set it means it's
		 * a write fault.
		 */
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
	if (reason & VM_UFFD_WP)
		/*
		 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
		 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
		 * not set in a UFFD_EVENT_PAGEFAULT, it means it was
		 * a missing fault, otherwise if set it means it's a
		 * write protect fault.
		 */
		msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
	if (features & UFFD_FEATURE_THREAD_ID)
		msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
	return msg;
}
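
/*
 * For context, a minimal sketch of the userland side that consumes the
 * uffd_msg built above and resolves a missing fault with UFFDIO_COPY.
 * Illustrative only: "uffd", "page" and "page_size" are assumed to be
 * set up by the caller (uffd via the userfaultfd() syscall followed by
 * UFFDIO_API and UFFDIO_REGISTER, page a buffer holding the contents):
 *
 *	struct uffd_msg msg;
 *	struct uffdio_copy copy;
 *
 *	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
 *		return;
 *	if (msg.event != UFFD_EVENT_PAGEFAULT)
 *		return;
 *	copy.dst = msg.arg.pagefault.address & ~(page_size - 1);
 *	copy.src = (unsigned long) page;
 *	copy.len = page_size;
 *	copy.mode = 0;
 *	ioctl(uffd, UFFDIO_COPY, &copy);
 */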
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Same functionality as userfaultfd_must_wait below with modifications for
 * hugepmd ranges.
 */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *vma,
					      unsigned long address,
					      unsigned long flags,
					      unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	pte_t *ptep, pte;
	bool ret = true;

	mmap_assert_locked(mm);

	ptep = huge_pte_offset(mm, address, vma_mmu_pagesize(vma));

	if (!ptep)
		goto out;

	ret = false;
	pte = huge_ptep_get(ptep);

	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.
	 */
	if (huge_pte_none(pte))
		ret = true;
	if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
		ret = true;
out:
	return ret;
}
#else
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
					      struct vm_area_struct *vma,
					      unsigned long address,
					      unsigned long flags,
					      unsigned long reason)
{
	return false;	/* should never get here */
}
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Verify the pagetables are still not ok after having registered into
 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
 * userfault that has already been resolved, if userfaultfd_read and
 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
 * threads.
 */
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
					 unsigned long address,
					 unsigned long flags,
					 unsigned long reason)
{
	struct mm_struct *mm = ctx->mm;
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd, _pmd;
	pte_t *pte;
	bool ret = true;

	mmap_assert_locked(mm);

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;
	p4d = p4d_offset(pgd, address);
	if (!p4d_present(*p4d))
		goto out;
	pud = pud_offset(p4d, address);
	if (!pud_present(*pud))
		goto out;
	pmd = pmd_offset(pud, address);
	/*
	 * READ_ONCE must function as a barrier with narrower scope
	 * and it must be equivalent to:
	 *	_pmd = *pmd; barrier();
	 *
	 * This is to deal with the instability (as in
	 * pmd_trans_unstable) of the pmd.
	 */
	_pmd = READ_ONCE(*pmd);
	if (pmd_none(_pmd))
		goto out;

	ret = false;
	if (!pmd_present(_pmd))
		goto out;

	if (pmd_trans_huge(_pmd)) {
		if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
			ret = true;
		goto out;
	}

	/*
	 * the pmd is stable (as in !pmd_trans_unstable) so we can re-read it
	 * and use the standard pte_offset_map() instead of parsing _pmd.
	 */
	pte = pte_offset_map(pmd, address);
	/*
	 * Lockless access: we're in a wait_event so it's ok if it
	 * changes under us.
	 */
	if (pte_none(*pte))
		ret = true;
	if (!pte_write(*pte) && (reason & VM_UFFD_WP))
		ret = true;
	pte_unmap(pte);

out:
	return ret;
}
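
/*
 * Pick the task state the faulting thread will sleep in while it waits
 * for userland to resolve the fault: interruptible faults can be woken
 * by any signal, killable ones only by fatal signals, and anything else
 * sleeps uninterruptibly.
 */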
static inline long userfaultfd_get_blocking_state(unsigned int flags)
{
	if (flags & FAULT_FLAG_INTERRUPTIBLE)
		return TASK_INTERRUPTIBLE;

	if (flags & FAULT_FLAG_KILLABLE)
		return TASK_KILLABLE;

	return TASK_UNINTERRUPTIBLE;
}

/*
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
 * recommendation in __lock_page_or_retry is not an understatement.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
 * not set.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
 * set, VM_FAULT_RETRY can still be returned if and only if there are
 * fatal_signal_pending()s, and the mmap_lock must be released before
 * returning it.
 */
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
	struct mm_struct *mm = vmf->vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue uwq;
	vm_fault_t ret = VM_FAULT_SIGBUS;
	bool must_wait;
	long blocking_state;

	/*
	 * We don't do userfault handling for the final child pid update.
	 *
	 * We also don't do userfault handling during
	 * coredumping. hugetlbfs has the special
	 * follow_hugetlb_page() to skip missing pages in the
	 * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
	 * the no_page_table() helper in follow_page_mask(), but the
	 * shmem_vm_ops->fault method is invoked even during
	 * coredumping without mmap_lock and it ends up here.
	 */
	if (current->flags & (PF_EXITING|PF_DUMPCORE))
		goto out;

	/*
	 * Coredumping runs without mmap_lock so we can only check that
	 * the mmap_lock is held, if PF_DUMPCORE was not set.
	 */
	mmap_assert_locked(mm);

	ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
	if (!ctx)
		goto out;

	BUG_ON(ctx->mm != mm);

	VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP));
	VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP));

	if (ctx->features & UFFD_FEATURE_SIGBUS)
		goto out;

	/*
	 * If it's already released don't get it. This avoids looping
	 * in __get_user_pages if userfaultfd_release waits on the
	 * caller of handle_userfault to release the mmap_lock.
	 */
	if (unlikely(READ_ONCE(ctx->released))) {
		/*
		 * Don't return VM_FAULT_SIGBUS in this case, so a non
		 * cooperative manager can close the uffd after the
		 * last UFFDIO_COPY, without risking triggering an
		 * involuntary SIGBUS if the process was starting the
		 * userfaultfd while the userfaultfd was still armed
		 * (but after the last UFFDIO_COPY). If the uffd
		 * wasn't already closed when the userfault reached
		 * this point, that would normally be solved by
		 * userfaultfd_must_wait returning 'false'.
		 *
		 * If we were to return VM_FAULT_SIGBUS here, the non
		 * cooperative manager would be instead forced to
		 * always call UFFDIO_UNREGISTER before it can safely
		 * close the uffd.
		 */
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	/*
	 * Check that we can return VM_FAULT_RETRY.
	 *
	 * NOTE: it should become possible to return VM_FAULT_RETRY
	 * even if FAULT_FLAG_TRIED is set without leading to gup()
	 * -EBUSY failures, if the userfaultfd is to be extended for
	 * VM_UFFD_WP tracking and we intend to arm the userfault
	 * without first stopping userland access to the memory. For
	 * VM_UFFD_MISSING userfaults this is enough for now.
	 */
	if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
		/*
		 * Validate the invariant that nowait must allow retry
		 * to be sure not to return SIGBUS erroneously on
		 * nowait invocations.
		 */
		BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
		if (printk_ratelimit()) {
			printk(KERN_WARNING
			       "FAULT_FLAG_ALLOW_RETRY missing %x\n",
			       vmf->flags);
			dump_stack();
		}
#endif
		goto out;
	}

	/*
	 * Handle nowait, not much to do other than tell it to retry
	 * and wait.
	 */
	ret = VM_FAULT_RETRY;
	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
		goto out;

	/* take the reference before dropping the mmap_lock */
	userfaultfd_ctx_get(ctx);

	init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
	uwq.wq.private = current;
	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
			ctx->features);
	uwq.ctx = ctx;
	uwq.waken = false;

	blocking_state = userfaultfd_get_blocking_state(vmf->flags);

	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
	/*
	 * The smp_mb() after __set_current_state prevents the reads
	 * following the spin_unlock to happen before the list_add in
	 * __add_wait_queue.
	 */
	set_current_state(blocking_state);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	if (!is_vm_hugetlb_page(vmf->vma))
		must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
						  reason);
	else
		must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma,
						       vmf->address,
						       vmf->flags, reason);
	mmap_read_unlock(mm);

	if (likely(must_wait && !READ_ONCE(ctx->released))) {
		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();
	}

	__set_current_state(TASK_RUNNING);

	/*
	 * Here we race with the list_del; list_add in
	 * userfaultfd_ctx_read(), however because we don't ever run
	 * list_del_init() to refile across the two lists, the prev
	 * and next pointers will never point to self. list_add also
	 * would never let any of the two pointers to point to
	 * self. So list_empty_careful won't risk seeing both pointers
	 * pointing to self at any time during the list refile. The
	 * only case where list_del_init() is called is the full
	 * removal in the wake function and there we don't re-list_add
	 * and it's fine not to block on the spinlock. The uwq on this
	 * kernel stack can be released after the list_del_init.
	 */
	if (!list_empty_careful(&uwq.wq.entry)) {
		spin_lock_irq(&ctx->fault_pending_wqh.lock);
		/*
		 * No need of list_del_init(), the uwq on the stack
		 * will be freed shortly anyway.
		 */
		list_del(&uwq.wq.entry);
		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
	userfaultfd_ctx_put(ctx);

out:
	return ret;
}

static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
					      struct userfaultfd_wait_queue *ewq)
{
	struct userfaultfd_ctx *release_new_ctx;

	if (WARN_ON_ONCE(current->flags & PF_EXITING))
		goto out;

	ewq->ctx = ctx;
	init_waitqueue_entry(&ewq->wq, current);
	release_new_ctx = NULL;

	spin_lock_irq(&ctx->event_wqh.lock);
	/*
	 * After the __add_wait_queue the uwq is visible to userland
	 * through poll/read().
	 */
	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (ewq->msg.event == 0)
			break;
		if (READ_ONCE(ctx->released) ||
		    fatal_signal_pending(current)) {
			/*
			 * &ewq->wq may be queued in fork_event, but
			 * __remove_wait_queue ignores the head
			 * parameter. It would be a problem if it
			 * didn't.
			 */
			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
			if (ewq->msg.event == UFFD_EVENT_FORK) {
				struct userfaultfd_ctx *new;

				new = (struct userfaultfd_ctx *)
					(unsigned long)
					ewq->msg.arg.reserved.reserved1;
				release_new_ctx = new;
			}
			break;
		}

		spin_unlock_irq(&ctx->event_wqh.lock);

		wake_up_poll(&ctx->fd_wqh, EPOLLIN);
		schedule();

		spin_lock_irq(&ctx->event_wqh.lock);
	}
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->event_wqh.lock);

	if (release_new_ctx) {
		struct vm_area_struct *vma;
		struct mm_struct *mm = release_new_ctx->mm;

		/* the various vma->vm_userfaultfd_ctx still points to it */
		mmap_write_lock(mm);
		for (vma = mm->mmap; vma; vma = vma->vm_next)
			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
				vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
			}
		mmap_write_unlock(mm);

		userfaultfd_ctx_put(release_new_ctx);
	}

	/*
	 * ctx may go away after this if the userfault pseudo fd is
	 * already released.
	 */
out:
	WRITE_ONCE(ctx->mmap_changing, false);
	userfaultfd_ctx_put(ctx);
}

static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
				       struct userfaultfd_wait_queue *ewq)
{
	ewq->msg.event = 0;
	wake_up_locked(&ctx->event_wqh);
	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
}

int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
{
	struct userfaultfd_ctx *ctx = NULL, *octx;
	struct userfaultfd_fork_ctx *fctx;

	octx = vma->vm_userfaultfd_ctx.ctx;
	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
		return 0;
	}

	list_for_each_entry(fctx, fcs, list)
		if (fctx->orig == octx) {
			ctx = fctx->new;
			break;
		}

	if (!ctx) {
		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
		if (!fctx)
			return -ENOMEM;

		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
		if (!ctx) {
			kfree(fctx);
			return -ENOMEM;
		}

		refcount_set(&ctx->refcount, 1);
		ctx->flags = octx->flags;
		ctx->state = UFFD_STATE_RUNNING;
		ctx->features = octx->features;
		ctx->released = false;
		ctx->mmap_changing = false;
		ctx->mm = vma->vm_mm;
		mmgrab(ctx->mm);

		userfaultfd_ctx_get(octx);
		WRITE_ONCE(octx->mmap_changing, true);
		fctx->orig = octx;
		fctx->new = ctx;
		list_add_tail(&fctx->list, fcs);
	}

	vma->vm_userfaultfd_ctx.ctx = ctx;
	return 0;
}

static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
{
	struct userfaultfd_ctx *ctx = fctx->orig;
	struct userfaultfd_wait_queue ewq;

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_FORK;
	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;

	userfaultfd_event_wait_completion(ctx, &ewq);
}

void dup_userfaultfd_complete(struct list_head *fcs)
{
	struct userfaultfd_fork_ctx *fctx, *n;

	list_for_each_entry_safe(fctx, n, fcs, list) {
		dup_fctx(fctx);
		list_del(&fctx->list);
		kfree(fctx);
	}
}

void mremap_userfaultfd_prep(struct vm_area_struct *vma,
			     struct vm_userfaultfd_ctx *vm_ctx)
{
	struct userfaultfd_ctx *ctx;

	ctx = vma->vm_userfaultfd_ctx.ctx;

	if (!ctx)
		return;

	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
		vm_ctx->ctx = ctx;
		userfaultfd_ctx_get(ctx);
		WRITE_ONCE(ctx->mmap_changing, true);
	} else {
		/* Drop uffd context if remap feature not enabled */
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
	}
}

void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
				 unsigned long from, unsigned long to,
				 unsigned long len)
{
	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
	struct userfaultfd_wait_queue ewq;

	if (!ctx)
		return;

	if (to & ~PAGE_MASK) {
		userfaultfd_ctx_put(ctx);
		return;
	}

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_REMAP;
	ewq.msg.arg.remap.from = from;
	ewq.msg.arg.remap.to = to;
	ewq.msg.arg.remap.len = len;

	userfaultfd_event_wait_completion(ctx, &ewq);
}

bool userfaultfd_remove(struct vm_area_struct *vma,
			unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct userfaultfd_ctx *ctx;
	struct userfaultfd_wait_queue ewq;

	ctx = vma->vm_userfaultfd_ctx.ctx;
	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
		return true;

	userfaultfd_ctx_get(ctx);
	WRITE_ONCE(ctx->mmap_changing, true);
	mmap_read_unlock(mm);

	msg_init(&ewq.msg);

	ewq.msg.event = UFFD_EVENT_REMOVE;
	ewq.msg.arg.remove.start = start;
	ewq.msg.arg.remove.end = end;

	userfaultfd_event_wait_completion(ctx, &ewq);

	return false;
}

static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
			  unsigned long start, unsigned long end)
{
	struct userfaultfd_unmap_ctx *unmap_ctx;

	list_for_each_entry(unmap_ctx, unmaps, list)
		if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
		    unmap_ctx->end == end)
			return true;

	return false;
}

int userfaultfd_unmap_prep(struct vm_area_struct *vma,
			   unsigned long start, unsigned long end,
			   struct list_head *unmaps)
{
	for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
		struct userfaultfd_unmap_ctx *unmap_ctx;
		struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;

		if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_UNMAP) ||
		    has_unmap_ctx(ctx, unmaps, start, end))
			continue;

		unmap_ctx = kzalloc(sizeof(*unmap_ctx), GFP_KERNEL);
		if (!unmap_ctx)
			return -ENOMEM;

		userfaultfd_ctx_get(ctx);
		WRITE_ONCE(ctx->mmap_changing, true);
		unmap_ctx->ctx = ctx;
		unmap_ctx->start = start;
		unmap_ctx->end = end;
		list_add_tail(&unmap_ctx->list, unmaps);
	}

	return 0;
}

void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
{
	struct userfaultfd_unmap_ctx *ctx, *n;
	struct userfaultfd_wait_queue ewq;

	list_for_each_entry_safe(ctx, n, uf, list) {
		msg_init(&ewq.msg);

		ewq.msg.event = UFFD_EVENT_UNMAP;
		ewq.msg.arg.remove.start = ctx->start;
		ewq.msg.arg.remove.end = ctx->end;

		userfaultfd_event_wait_completion(ctx->ctx, &ewq);

		list_del(&ctx->list);
		kfree(ctx);
	}
}

static int userfaultfd_release(struct inode *inode, struct file *file)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev;
	/* len == 0 means wake all */
	struct userfaultfd_wake_range range = { .len = 0, };
	unsigned long new_flags;

	WRITE_ONCE(ctx->released, true);

	if (!mmget_not_zero(mm))
		goto wakeup;

	/*
	 * Flush page faults out of all CPUs. NOTE: all page faults
	 * must be retried without returning VM_FAULT_SIGBUS if
	 * userfaultfd_ctx_get() succeeds but vma->vma_userfault_ctx
	 * changes while handle_userfault released the mmap_lock. So
	 * it's critical that released is set to true (above), before
	 * taking the mmap_lock for writing.
	 */
	mmap_write_lock(mm);
	prev = NULL;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		cond_resched();
		BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
		       !!(vma->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));
		if (vma->vm_userfaultfd_ctx.ctx != ctx) {
			prev = vma;
			continue;
		}
		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
		prev = vma_merge(mm, prev, vma->vm_start, vma->vm_end,
				 new_flags, vma->anon_vma,
				 vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma),
				 NULL_VM_UFFD_CTX,
				 vma_get_anon_name(vma));
		if (prev)
			vma = prev;
		else
			prev = vma;
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
	}
	mmap_write_unlock(mm);
	mmput(mm);
wakeup:
	/*
	 * After no new page faults can wait on this fault_*wqh, flush
	 * the last page faults that may have been already waiting on
	 * the fault_*wqh.
	 */
	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
	__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);

	/* Flush pending events that may still wait on event_wqh */
	wake_up_all(&ctx->event_wqh);

	wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
	userfaultfd_ctx_put(ctx);
	return 0;
}

/* fault_pending_wqh.lock must be held by the caller */
static inline struct userfaultfd_wait_queue *find_userfault_in(
		wait_queue_head_t *wqh)
{
	wait_queue_entry_t *wq;
	struct userfaultfd_wait_queue *uwq;

	lockdep_assert_held(&wqh->lock);

	uwq = NULL;
	if (!waitqueue_active(wqh))
		goto out;
	/* walk in reverse to provide FIFO behavior to read userfaults */
	wq = list_last_entry(&wqh->head, typeof(*wq), entry);
	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
out:
	return uwq;
}

static inline struct userfaultfd_wait_queue *find_userfault(
		struct userfaultfd_ctx *ctx)
{
	return find_userfault_in(&ctx->fault_pending_wqh);
}

static inline struct userfaultfd_wait_queue *find_userfault_evt(
		struct userfaultfd_ctx *ctx)
{
	return find_userfault_in(&ctx->event_wqh);
}

static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	__poll_t ret;

	poll_wait(file, &ctx->fd_wqh, wait);

	switch (ctx->state) {
	case UFFD_STATE_WAIT_API:
		return EPOLLERR;
	case UFFD_STATE_RUNNING:
		/*
		 * poll() never guarantees that read won't block.
		 * userfaults can be woken before they're read().
		 */
		if (unlikely(!(file->f_flags & O_NONBLOCK)))
			return EPOLLERR;
		/*
		 * lockless access to see if there are pending faults.
		 * __pollwait's last action is the add_wait_queue but
		 * the spin_unlock would allow the waitqueue_active to
		 * pass above the actual list_add inside
		 * add_wait_queue critical section. So use a full
		 * memory barrier to serialize the list_add write of
		 * add_wait_queue() with the waitqueue_active read
		 * below.
		 */
		ret = 0;
		smp_mb();
		if (waitqueue_active(&ctx->fault_pending_wqh))
			ret = EPOLLIN;
		else if (waitqueue_active(&ctx->event_wqh))
			ret = EPOLLIN;

		return ret;
	default:
		WARN_ON_ONCE(1);
		return EPOLLERR;
	}
}
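
/*
 * A minimal (illustrative) userland poll loop matching the semantics
 * above; note the uffd must have been opened with O_NONBLOCK or poll()
 * reports EPOLLERR while running. "handle_msg" is a hypothetical helper:
 *
 *	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
 *
 *	while (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
 *		struct uffd_msg msg;
 *
 *		if (read(uffd, &msg, sizeof(msg)) == sizeof(msg))
 *			handle_msg(&msg);
 *	}
 */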

static const struct file_operations userfaultfd_fops;

static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
				  struct userfaultfd_ctx *new,
				  struct uffd_msg *msg)
{
	int fd;

	fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new,
			      O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS));
	if (fd < 0)
		return fd;

	msg->arg.reserved.reserved1 = 0;
	msg->arg.fork.ufd = fd;
	return 0;
}
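
/*
 * On the userland side, once resolve_userfault_fork() succeeds the
 * UFFD_EVENT_FORK message carries the new file descriptor tracking the
 * child's address space. An illustrative sketch of the monitor thread:
 *
 *	if (msg.event == UFFD_EVENT_FORK) {
 *		int child_uffd = (int) msg.arg.fork.ufd;
 *		// track the child's memory through child_uffd
 *	}
 *
 * The new fd picks up the O_CLOEXEC/O_NONBLOCK flags of the parent
 * uffd, as passed to anon_inode_getfd() above.
 */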

static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
				    struct uffd_msg *msg)
{
	ssize_t ret;
	DECLARE_WAITQUEUE(wait, current);
	struct userfaultfd_wait_queue *uwq;
	/*
	 * Handling fork event requires sleeping operations, so
	 * we drop the event_wqh lock, then do these ops, then
	 * lock it back and wake up the waiter. While the lock is
	 * dropped the ewq may go away so we keep track of it
	 * carefully.
	 */
	LIST_HEAD(fork_event);
	struct userfaultfd_ctx *fork_nctx = NULL;

	/* always take the fd_wqh lock before the fault_pending_wqh lock */
	spin_lock_irq(&ctx->fd_wqh.lock);
__add_wait_queue(&ctx->fd_wqh, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); |
15b726ef0 userfaultfd: opti... |
970 971 972 |
spin_lock(&ctx->fault_pending_wqh.lock); uwq = find_userfault(ctx); if (uwq) { |
86039bd3b userfaultfd: add ... |
973 |
/* |
2c5b7e1be userfaultfd: avoi... |
974 975 976 977 978 979 980 981 982 |
* Use a seqcount to repeat the lockless check * in wake_userfault() to avoid missing * wakeups because during the refile both * waitqueue could become empty if this is the * only userfault. */ write_seqcount_begin(&ctx->refile_seq); /* |
15b726ef0 userfaultfd: opti... |
983 984 985 986 987 988 989 990 991 992 993 994 995 |
* The fault_pending_wqh.lock prevents the uwq * to disappear from under us. * * Refile this userfault from * fault_pending_wqh to fault_wqh, it's not * pending anymore after we read it. * * Use list_del() by hand (as * userfaultfd_wake_function also uses * list_del_init() by hand) to be sure nobody * changes __remove_wait_queue() to use * list_del_init() in turn breaking the * !list_empty_careful() check in |
2055da973 sched/wait: Disam... |
996 |
* handle_userfault(). The uwq->wq.head list |
15b726ef0 userfaultfd: opti... |
997 998 999 1000 1001 |
* must never be empty at any time during the * refile, or the waitqueue could disappear * from under us. The "wait_queue_head_t" * parameter of __remove_wait_queue() is unused * anyway. |
86039bd3b userfaultfd: add ... |
1002 |
*/ |
2055da973 sched/wait: Disam... |
1003 |
list_del(&uwq->wq.entry); |
c430d1e84 userfaultfd: use ... |
1004 |
add_wait_queue(&ctx->fault_wqh, &uwq->wq); |
15b726ef0 userfaultfd: opti... |
1005 |
|
2c5b7e1be userfaultfd: avoi... |
1006 |
write_seqcount_end(&ctx->refile_seq); |
a9b85f941 userfaultfd: chan... |
1007 1008 |
/* careful to always initialize msg if ret == 0 */ *msg = uwq->msg; |
15b726ef0 userfaultfd: opti... |
1009 |
spin_unlock(&ctx->fault_pending_wqh.lock); |
86039bd3b userfaultfd: add ... |
1010 1011 1012 |
ret = 0; break; } |
		spin_unlock(&ctx->fault_pending_wqh.lock);

		spin_lock(&ctx->event_wqh.lock);
		uwq = find_userfault_evt(ctx);
		if (uwq) {
			*msg = uwq->msg;

			if (uwq->msg.event == UFFD_EVENT_FORK) {
				fork_nctx = (struct userfaultfd_ctx *)
					(unsigned long)
					uwq->msg.arg.reserved.reserved1;
				list_move(&uwq->wq.entry, &fork_event);
				/*
				 * fork_nctx can be freed as soon as
				 * we drop the lock, unless we take a
				 * reference on it.
				 */
				userfaultfd_ctx_get(fork_nctx);
				spin_unlock(&ctx->event_wqh.lock);
				ret = 0;
				break;
			}

			userfaultfd_event_complete(ctx, uwq);
			spin_unlock(&ctx->event_wqh.lock);
			ret = 0;
			break;
		}
		spin_unlock(&ctx->event_wqh.lock);

		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (no_wait) {
			ret = -EAGAIN;
			break;
		}
		spin_unlock_irq(&ctx->fd_wqh.lock);
		schedule();
		spin_lock_irq(&ctx->fd_wqh.lock);
	}
	__remove_wait_queue(&ctx->fd_wqh, &wait);
	__set_current_state(TASK_RUNNING);
	spin_unlock_irq(&ctx->fd_wqh.lock);

	if (!ret && msg->event == UFFD_EVENT_FORK) {
		ret = resolve_userfault_fork(ctx, fork_nctx, msg);
		spin_lock_irq(&ctx->event_wqh.lock);
		if (!list_empty(&fork_event)) {
			/*
			 * The fork thread didn't abort, so we can
			 * drop the temporary refcount.
			 */
			userfaultfd_ctx_put(fork_nctx);

			uwq = list_first_entry(&fork_event,
					       typeof(*uwq),
					       wq.entry);
			/*
			 * If fork_event list wasn't empty and in turn
			 * the event wasn't already released by fork
			 * (the event is allocated on fork kernel
			 * stack), put the event back to its place in
			 * the event_wq. fork_event head will be freed
			 * as soon as we return so the event cannot
			 * stay queued there no matter the current
			 * "ret" value.
			 */
			list_del(&uwq->wq.entry);
			__add_wait_queue(&ctx->event_wqh, &uwq->wq);

			/*
			 * Leave the event in the waitqueue and report
			 * error to userland if we failed to resolve
			 * the userfault fork.
			 */
			if (likely(!ret))
				userfaultfd_event_complete(ctx, uwq);
		} else {
			/*
			 * Here the fork thread aborted and the
			 * refcount from the fork thread on fork_nctx
			 * has already been released. We still hold
			 * the reference we took before releasing the
			 * lock above. If resolve_userfault_fork
			 * failed we have to drop it because the
			 * fork_nctx has to be freed in such case. If
			 * it succeeded we'll hold it because the new
			 * uffd references it.
			 */
			if (ret)
				userfaultfd_ctx_put(fork_nctx);
		}
		spin_unlock_irq(&ctx->event_wqh.lock);
	}

	return ret;
}

static ssize_t userfaultfd_read(struct file *file, char __user *buf,
				size_t count, loff_t *ppos)
{
	struct userfaultfd_ctx *ctx = file->private_data;
	ssize_t _ret, ret = 0;
	struct uffd_msg msg;
	int no_wait = file->f_flags & O_NONBLOCK;

	if (ctx->state == UFFD_STATE_WAIT_API)
		return -EINVAL;

	for (;;) {
		if (count < sizeof(msg))
			return ret ? ret : -EINVAL;
		_ret = userfaultfd_ctx_read(ctx, no_wait, &msg);
		if (_ret < 0)
			return ret ? ret : _ret;
		if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg)))
			return ret ? ret : -EFAULT;
		ret += sizeof(msg);
		buf += sizeof(msg);
		count -= sizeof(msg);
		/*
		 * Allow reading more than one fault at a time but only
		 * block if waiting for the very first one.
		 */
		no_wait = O_NONBLOCK;
	}
}

static void __wake_userfault(struct userfaultfd_ctx *ctx,
			     struct userfaultfd_wake_range *range)
{
	spin_lock_irq(&ctx->fault_pending_wqh.lock);
	/* wake all in the range and autoremove */
	if (waitqueue_active(&ctx->fault_pending_wqh))
		__wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
				     range);
	if (waitqueue_active(&ctx->fault_wqh))
		__wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
	spin_unlock_irq(&ctx->fault_pending_wqh.lock);
}

static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
					   struct userfaultfd_wake_range *range)
{
	unsigned seq;
	bool need_wakeup;

	/*
	 * To be sure waitqueue_active() is not reordered by the CPU
	 * before the pagetable update, use an explicit SMP memory
	 * barrier here. PT lock release or mmap_read_unlock(mm) still
	 * have release semantics that can allow the
	 * waitqueue_active() to be reordered before the pte update.
	 */
	smp_mb();

	/*
	 * Use waitqueue_active because it's very frequent to
	 * change the address space atomically even if there are no
	 * userfaults yet. So we take the spinlock only when we're
	 * sure we have userfaults to wake.
	 */
	do {
		seq = read_seqcount_begin(&ctx->refile_seq);
		need_wakeup = waitqueue_active(&ctx->fault_pending_wqh) ||
			waitqueue_active(&ctx->fault_wqh);
		cond_resched();
	} while (read_seqcount_retry(&ctx->refile_seq, seq));
	if (need_wakeup)
		__wake_userfault(ctx, range);
}

static __always_inline int validate_range(struct mm_struct *mm,
					  __u64 *start, __u64 len)
{
	__u64 task_size = mm->task_size;

	*start = untagged_addr(*start);

	if (*start & ~PAGE_MASK)
		return -EINVAL;
	if (len & ~PAGE_MASK)
		return -EINVAL;
	if (!len)
		return -EINVAL;
	if (*start < mmap_min_addr)
		return -EINVAL;
	if (*start >= task_size)
		return -EINVAL;
	if (len > task_size - *start)
		return -EINVAL;
	return 0;
}

static inline bool vma_can_userfault(struct vm_area_struct *vma,
				     unsigned long vm_flags)
{
	/* FIXME: add WP support to hugetlbfs and shmem */
	return vma_is_anonymous(vma) ||
		((is_vm_hugetlb_page(vma) || vma_is_shmem(vma)) &&
		 !(vm_flags & VM_UFFD_WP));
}
86039bd3b userfaultfd: add ... |
1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 |
static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { struct mm_struct *mm = ctx->mm; struct vm_area_struct *vma, *prev, *cur; int ret; struct uffdio_register uffdio_register; struct uffdio_register __user *user_uffdio_register; unsigned long vm_flags, new_flags; bool found; |
ce53e8e6f userfaultfd: repo... |
1220 |
bool basic_ioctls; |
86039bd3b userfaultfd: add ... |
1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 |
unsigned long start, end, vma_end; user_uffdio_register = (struct uffdio_register __user *) arg; ret = -EFAULT; if (copy_from_user(&uffdio_register, user_uffdio_register, sizeof(uffdio_register)-sizeof(__u64))) goto out; ret = -EINVAL; if (!uffdio_register.mode) goto out; if (uffdio_register.mode & ~(UFFDIO_REGISTER_MODE_MISSING| UFFDIO_REGISTER_MODE_WP)) goto out; vm_flags = 0; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; |
63b2d4174 userfaultfd: wp: ... |
1239 |
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) |
86039bd3b userfaultfd: add ... |
1240 |
vm_flags |= VM_UFFD_WP; |
86039bd3b userfaultfd: add ... |
1241 |
|
7d0325749 userfaultfd: unta... |
1242 |
ret = validate_range(mm, &uffdio_register.range.start, |
86039bd3b userfaultfd: add ... |
1243 1244 1245 1246 1247 1248 |
uffdio_register.range.len); if (ret) goto out; start = uffdio_register.range.start; end = start + uffdio_register.range.len; |
d2005e3f4 userfaultfd: don'... |
1249 1250 1251 |
ret = -ENOMEM; if (!mmget_not_zero(mm)) goto out; |
d8ed45c5d mmap locking API:... |
1252 |
mmap_write_lock(mm); |
86039bd3b userfaultfd: add ... |
1253 |
vma = find_vma_prev(mm, start, &prev); |
86039bd3b userfaultfd: add ... |
1254 1255 1256 1257 1258 1259 1260 1261 1262 |
if (!vma) goto out_unlock; /* check that there's at least one vma in the range */ ret = -EINVAL; if (vma->vm_start >= end) goto out_unlock; /* |
cab350afc userfaultfd: huge... |
1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 |
* If the first vma contains huge pages, make sure start address * is aligned to huge page size. */ if (is_vm_hugetlb_page(vma)) { unsigned long vma_hpagesize = vma_kernel_pagesize(vma); if (start & (vma_hpagesize - 1)) goto out_unlock; } /* |
86039bd3b userfaultfd: add ... |
1274 |
* Search for not compatible vmas. |
86039bd3b userfaultfd: add ... |
1275 1276 |
*/ found = false; |
ce53e8e6f userfaultfd: repo... |
1277 |
basic_ioctls = false; |
86039bd3b userfaultfd: add ... |
1278 1279 1280 1281 1282 1283 1284 1285 |
for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP))); /* check not compatible vmas */ ret = -EINVAL; |
63b2d4174 userfaultfd: wp: ... |
1286 |
if (!vma_can_userfault(cur, vm_flags)) |
86039bd3b userfaultfd: add ... |
1287 |
goto out_unlock; |
29ec90660 userfaultfd: shme... |
1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 |
/* * UFFDIO_COPY will fill file holes even without * PROT_WRITE. This check enforces that if this is a * MAP_SHARED, the process has write permission to the backing * file. If VM_MAYWRITE is set it also enforces that on a * MAP_SHARED vma: there is no F_WRITE_SEAL and no further * F_WRITE_SEAL can be taken until the vma is destroyed. */ ret = -EPERM; if (unlikely(!(cur->vm_flags & VM_MAYWRITE))) goto out_unlock; |
cab350afc userfaultfd: huge... |
1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 |
/* * If this vma contains ending address, and huge pages * check alignment. */ if (is_vm_hugetlb_page(cur) && end <= cur->vm_end && end > cur->vm_start) { unsigned long vma_hpagesize = vma_kernel_pagesize(cur); ret = -EINVAL; if (end & (vma_hpagesize - 1)) goto out_unlock; } |
63b2d4174 userfaultfd: wp: ... |
1313 1314 |
if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE)) goto out_unlock; |
86039bd3b userfaultfd: add ... |
1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 |
		/*
		 * Check that this vma isn't already owned by a
		 * different userfaultfd. We can't allow more than one
		 * userfaultfd to own a single vma simultaneously or we
		 * wouldn't know which one to deliver the userfaults to.
		 */
		ret = -EBUSY;
		if (cur->vm_userfaultfd_ctx.ctx &&
		    cur->vm_userfaultfd_ctx.ctx != ctx)
			goto out_unlock; |
cab350afc userfaultfd: huge... |
1326 1327 1328 |
		/*
		 * Note vmas containing huge pages
		 */ |
ce53e8e6f userfaultfd: repo... |
1329 1330 |
		if (is_vm_hugetlb_page(cur))
			basic_ioctls = true; |
cab350afc userfaultfd: huge... |
1331 |
|
86039bd3b userfaultfd: add ... |
1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 |
		found = true;
	}
	BUG_ON(!found);

	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	do {
		cond_resched(); |
63b2d4174 userfaultfd: wp: ... |
1342 |
BUG_ON(!vma_can_userfault(vma, vm_flags)); |
86039bd3b userfaultfd: add ... |
1343 1344 |
BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); |
29ec90660 userfaultfd: shme... |
1345 |
WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); |
86039bd3b userfaultfd: add ... |
1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 |
		/*
		 * Nothing to do: this vma is already registered into this
		 * userfaultfd and with the right tracking mode too.
		 */
		if (vma->vm_userfaultfd_ctx.ctx == ctx &&
		    (vma->vm_flags & vm_flags) == vm_flags)
			goto skip;

		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end); |
9d4678eb1 fs/userfaultfd.c:... |
1358 1359 |
new_flags = (vma->vm_flags & ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags; |
86039bd3b userfaultfd: add ... |
1360 1361 1362 |
prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), |
60500a422 ANDROID: mm: add ... |
1363 1364 |
((struct vm_userfaultfd_ctx){ ctx }), vma_get_anon_name(vma)); |
86039bd3b userfaultfd: add ... |
1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 |
		if (prev) {
			vma = prev;
			goto next;
		}
		if (vma->vm_start < start) {
			ret = split_vma(mm, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(mm, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx.ctx = ctx;

	skip:
		prev = vma;
		start = vma->vm_end;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);
out_unlock: |
d8ed45c5d mmap locking API:... |
1394 |
mmap_write_unlock(mm); |
d2005e3f4 userfaultfd: don'... |
1395 |
mmput(mm); |
86039bd3b userfaultfd: add ... |
1396 |
if (!ret) { |
14819305e userfaultfd: wp: ... |
1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 |
		__u64 ioctls_out;

		ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
		    UFFD_API_RANGE_IOCTLS;

		/*
		 * Declare the WP ioctl only if the WP mode is
		 * specified and all checks passed with the range
		 */
		if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
			ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT); |
86039bd3b userfaultfd: add ... |
1408 1409 1410 1411 1412 |
		/*
		 * Now that we scanned all vmas we can already tell
		 * userland which ioctl methods are guaranteed to
		 * succeed on this range.
		 */ |
14819305e userfaultfd: wp: ... |
1413 |
if (put_user(ioctls_out, &user_uffdio_register->ioctls)) |
86039bd3b userfaultfd: add ... |
1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 |
			ret = -EFAULT;
	}
out:
	return ret;
}

static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
				  unsigned long arg)
{
	struct mm_struct *mm = ctx->mm;
	struct vm_area_struct *vma, *prev, *cur;
	int ret;
	struct uffdio_range uffdio_unregister;
	unsigned long new_flags;
	bool found;
	unsigned long start, end, vma_end;
	const void __user *buf = (void __user *)arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
		goto out; |
7d0325749 userfaultfd: unta... |
1435 |
ret = validate_range(mm, &uffdio_unregister.start, |
86039bd3b userfaultfd: add ... |
1436 1437 1438 1439 1440 1441 |
			     uffdio_unregister.len);
	if (ret)
		goto out;

	start = uffdio_unregister.start;
	end = start + uffdio_unregister.len; |
d2005e3f4 userfaultfd: don'... |
1442 1443 1444 |
	ret = -ENOMEM;
	if (!mmget_not_zero(mm))
		goto out; |
d8ed45c5d mmap locking API:... |
1445 |
mmap_write_lock(mm); |
86039bd3b userfaultfd: add ... |
1446 |
vma = find_vma_prev(mm, start, &prev); |
86039bd3b userfaultfd: add ... |
1447 1448 1449 1450 1451 1452 1453 1454 1455 |
	if (!vma)
		goto out_unlock;

	/* check that there's at least one vma in the range */
	ret = -EINVAL;
	if (vma->vm_start >= end)
		goto out_unlock;

	/* |
cab350afc userfaultfd: huge... |
1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 |
	 * If the first vma contains huge pages, make sure the start address
	 * is aligned to the huge page size.
	 */
	if (is_vm_hugetlb_page(vma)) {
		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

		if (start & (vma_hpagesize - 1))
			goto out_unlock;
	}

	/* |
86039bd3b userfaultfd: add ... |
1467 |
* Search for incompatible vmas. |
86039bd3b userfaultfd: add ... |
1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 |
	 */
	found = false;
	ret = -EINVAL;
	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
		cond_resched();

		BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
		       !!(cur->vm_flags & (VM_UFFD_MISSING | VM_UFFD_WP)));

		/*
		 * Check for incompatible vmas. This is not strictly
		 * required here, as incompatible vmas cannot have a
		 * userfaultfd_ctx registered on them, but it provides
		 * stricter behavior so unregistration errors are
		 * noticed.
		 */ |
63b2d4174 userfaultfd: wp: ... |
1484 |
if (!vma_can_userfault(cur, cur->vm_flags)) |
86039bd3b userfaultfd: add ... |
1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 |
			goto out_unlock;

		found = true;
	}
	BUG_ON(!found);

	if (vma->vm_start < start)
		prev = vma;

	ret = 0;
	do {
		cond_resched(); |
63b2d4174 userfaultfd: wp: ... |
1497 |
BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); |
86039bd3b userfaultfd: add ... |
1498 1499 1500 1501 1502 1503 1504 |
		/*
		 * Nothing to do: this vma is not registered with any
		 * userfaultfd, so there is nothing to unregister on it.
		 */
		if (!vma->vm_userfaultfd_ctx.ctx)
			goto skip; |
01e881f5a userfaultfd: chec... |
1505 |
WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); |
86039bd3b userfaultfd: add ... |
1506 1507 1508 |
		if (vma->vm_start > start)
			start = vma->vm_start;
		vma_end = min(end, vma->vm_end); |
09fa5296a userfaultfd: non-... |
1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 |
		if (userfaultfd_missing(vma)) {
			/*
			 * Wake any concurrent pending userfault while
			 * we unregister, so they will not hang
			 * permanently and userland does not need to
			 * call UFFDIO_WAKE explicitly.
			 */
			struct userfaultfd_wake_range range;
			range.start = start;
			range.len = vma_end - start;
			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
		} |
86039bd3b userfaultfd: add ... |
1521 1522 1523 1524 |
		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
		prev = vma_merge(mm, prev, start, vma_end, new_flags,
				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
				 vma_policy(vma), |
60500a422 ANDROID: mm: add ... |
1525 1526 |
NULL_VM_UFFD_CTX, vma_get_anon_name(vma)); |
86039bd3b userfaultfd: add ... |
1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 |
		if (prev) {
			vma = prev;
			goto next;
		}
		if (vma->vm_start < start) {
			ret = split_vma(mm, vma, start, 1);
			if (ret)
				break;
		}
		if (vma->vm_end > end) {
			ret = split_vma(mm, vma, end, 0);
			if (ret)
				break;
		}
	next:
		/*
		 * In the vma_merge() successful mprotect-like case 8:
		 * the next vma was merged into the current one and
		 * the current one has not been updated yet.
		 */
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

	skip:
		prev = vma;
		start = vma->vm_end;
		vma = vma->vm_next;
	} while (vma && vma->vm_start < end);
out_unlock: |
d8ed45c5d mmap locking API:... |
1556 |
mmap_write_unlock(mm); |
d2005e3f4 userfaultfd: don'... |
1557 |
mmput(mm); |
86039bd3b userfaultfd: add ... |
1558 1559 1560 1561 1562 |
out:
	return ret;
}

/* |
ba85c702e userfaultfd: wake... |
1563 1564 |
 * userfaultfd_wake may be used in combination with the
 * UFFDIO_*_MODE_DONTWAKE flags to wake up userfaults in batches. |
86039bd3b userfaultfd: add ... |
1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 |
 */
static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	int ret;
	struct uffdio_range uffdio_wake;
	struct userfaultfd_wake_range range;
	const void __user *buf = (void __user *)arg;

	ret = -EFAULT;
	if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
		goto out; |
7d0325749 userfaultfd: unta... |
1577 |
ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len); |
86039bd3b userfaultfd: add ... |
1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 |
	if (ret)
		goto out;

	range.start = uffdio_wake.start;
	range.len = uffdio_wake.len;

	/*
	 * len == 0 means wake all and we don't want to wake all here,
	 * so check it again to be sure.
	 */
	VM_BUG_ON(!range.len);

	wake_userfault(ctx, &range);
	ret = 0;

out:
	return ret;
} |
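The comment above userfaultfd_wake describes a batching pattern: resolve several faults with the DONTWAKE modes, then wake the whole range once. A minimal userspace sketch of that pattern, assuming an API-initialized uffd and an already-registered range; the helper name and parameters are hypothetical:

	/*
	 * Sketch only: batch-resolving missing-page faults, then waking once.
	 */
	#include <linux/userfaultfd.h>
	#include <sys/ioctl.h>

	static int batch_fill_and_wake(int uffd, unsigned long base,
				       unsigned long page_sz, int npages,
				       unsigned long src)
	{
		struct uffdio_copy copy;
		struct uffdio_range range;
		int i;

		for (i = 0; i < npages; i++) {
			copy.dst = base + i * page_sz;
			copy.src = src + i * page_sz;
			copy.len = page_sz;
			copy.mode = UFFDIO_COPY_MODE_DONTWAKE; /* defer wakeup */
			if (ioctl(uffd, UFFDIO_COPY, &copy) == -1)
				return -1;
		}

		/* one UFFDIO_WAKE for the whole batch */
		range.start = base;
		range.len = (unsigned long)npages * page_sz;
		return ioctl(uffd, UFFDIO_WAKE, &range);
	}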
ad465cae9 userfaultfd: UFFD... |
1596 1597 1598 1599 1600 1601 1602 1603 1604 |
static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
			    unsigned long arg)
{
	__s64 ret;
	struct uffdio_copy uffdio_copy;
	struct uffdio_copy __user *user_uffdio_copy;
	struct userfaultfd_wake_range range;

	user_uffdio_copy = (struct uffdio_copy __user *) arg; |
df2cc96e7 userfaultfd: prev... |
1605 1606 1607 |
	ret = -EAGAIN;
	if (READ_ONCE(ctx->mmap_changing))
		goto out; |
ad465cae9 userfaultfd: UFFD... |
1608 1609 1610 1611 1612 |
	ret = -EFAULT;
	if (copy_from_user(&uffdio_copy, user_uffdio_copy,
			   /* don't copy "copy" last field */
			   sizeof(uffdio_copy)-sizeof(__s64)))
		goto out; |
7d0325749 userfaultfd: unta... |
1613 |
ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len); |
ad465cae9 userfaultfd: UFFD... |
1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 |
	if (ret)
		goto out;
	/*
	 * Double check for wraparound just in case. copy_from_user()
	 * will later check that uffdio_copy.src + uffdio_copy.len fits
	 * in the userland range.
	 */
	ret = -EINVAL;
	if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
		goto out; |
72981e0e7 userfaultfd: wp: ... |
1624 |
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) |
ad465cae9 userfaultfd: UFFD... |
1625 |
goto out; |
d2005e3f4 userfaultfd: don'... |
1626 1627 |
	if (mmget_not_zero(ctx->mm)) {
		ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, |
72981e0e7 userfaultfd: wp: ... |
1628 1629 |
uffdio_copy.len, &ctx->mmap_changing, uffdio_copy.mode); |
d2005e3f4 userfaultfd: don'... |
1630 |
mmput(ctx->mm); |
96333187a userfaultfd_copy:... |
1631 |
} else { |
e86b298be userfaultfd: repl... |
1632 |
return -ESRCH; |
d2005e3f4 userfaultfd: don'... |
1633 |
} |
ad465cae9 userfaultfd: UFFD... |
1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 |
	if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
		return -EFAULT;
	if (ret < 0)
		goto out;
	BUG_ON(!ret);
	/* len == 0 would wake all */
	range.len = ret;
	if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
		range.start = uffdio_copy.dst;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
out:
	return ret;
}

static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
				unsigned long arg)
{
	__s64 ret;
	struct uffdio_zeropage uffdio_zeropage;
	struct uffdio_zeropage __user *user_uffdio_zeropage;
	struct userfaultfd_wake_range range;

	user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; |
df2cc96e7 userfaultfd: prev... |
1659 1660 1661 |
	ret = -EAGAIN;
	if (READ_ONCE(ctx->mmap_changing))
		goto out; |
ad465cae9 userfaultfd: UFFD... |
1662 1663 1664 1665 1666 |
	ret = -EFAULT;
	if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
			   /* don't copy "zeropage" last field */
			   sizeof(uffdio_zeropage)-sizeof(__s64)))
		goto out; |
7d0325749 userfaultfd: unta... |
1667 |
ret = validate_range(ctx->mm, &uffdio_zeropage.range.start, |
ad465cae9 userfaultfd: UFFD... |
1668 1669 1670 1671 1672 1673 |
			     uffdio_zeropage.range.len);
	if (ret)
		goto out;
	ret = -EINVAL;
	if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
		goto out; |
d2005e3f4 userfaultfd: don'... |
1674 1675 |
	if (mmget_not_zero(ctx->mm)) {
		ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, |
df2cc96e7 userfaultfd: prev... |
1676 1677 |
uffdio_zeropage.range.len, &ctx->mmap_changing); |
d2005e3f4 userfaultfd: don'... |
1678 |
mmput(ctx->mm); |
9d95aa4ba userfaultfd_zerop... |
1679 |
} else { |
e86b298be userfaultfd: repl... |
1680 |
return -ESRCH; |
d2005e3f4 userfaultfd: don'... |
1681 |
} |
ad465cae9 userfaultfd: UFFD... |
1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 |
	if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
		return -EFAULT;
	if (ret < 0)
		goto out;
	/* len == 0 would wake all */
	BUG_ON(!ret);
	range.len = ret;
	if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
		range.start = uffdio_zeropage.range.start;
		wake_userfault(ctx, &range);
	}
	ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
out:
	return ret;
} |
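Both UFFDIO_COPY and UFFDIO_ZEROPAGE write the number of bytes actually handled back into the last field of the user structure and fail with -EAGAIN on a short operation, as the two functions above show. A hedged userspace retry loop built on that contract (the helper name is hypothetical):

	/*
	 * Sketch only: retrying a short UFFDIO_COPY. The kernel writes the
	 * bytes handled to uffdio_copy.copy; on a short copy the ioctl
	 * fails with EAGAIN and userspace can continue from there.
	 */
	#include <errno.h>
	#include <linux/userfaultfd.h>
	#include <sys/ioctl.h>

	static int copy_all(int uffd, unsigned long dst, unsigned long src,
			    unsigned long len)
	{
		while (len) {
			struct uffdio_copy copy = {
				.dst = dst, .src = src, .len = len, .mode = 0,
			};

			if (ioctl(uffd, UFFDIO_COPY, &copy) == 0)
				return 0;		/* whole range done */
			if (errno != EAGAIN || copy.copy <= 0)
				return -1;		/* hard error */
			/* partial progress: advance past the copied bytes */
			dst += copy.copy;
			src += copy.copy;
			len -= copy.copy;
		}
		return 0;
	}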
63b2d4174 userfaultfd: wp: ... |
1697 1698 1699 1700 1701 1702 1703 |
static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
				    unsigned long arg)
{
	int ret;
	struct uffdio_writeprotect uffdio_wp;
	struct uffdio_writeprotect __user *user_uffdio_wp;
	struct userfaultfd_wake_range range; |
23080e278 userfaultfd: wp: ... |
1704 |
bool mode_wp, mode_dontwake; |
63b2d4174 userfaultfd: wp: ... |
1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 |
	if (READ_ONCE(ctx->mmap_changing))
		return -EAGAIN;

	user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;

	if (copy_from_user(&uffdio_wp, user_uffdio_wp,
			   sizeof(struct uffdio_writeprotect)))
		return -EFAULT;

	ret = validate_range(ctx->mm, &uffdio_wp.range.start,
			     uffdio_wp.range.len);
	if (ret)
		return ret;

	if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
			       UFFDIO_WRITEPROTECT_MODE_WP))
		return -EINVAL; |
23080e278 userfaultfd: wp: ... |
1723 1724 1725 1726 1727 |
	mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
	mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;

	if (mode_wp && mode_dontwake) |
63b2d4174 userfaultfd: wp: ... |
1728 1729 1730 |
		return -EINVAL;

	ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, |
23080e278 userfaultfd: wp: ... |
1731 |
uffdio_wp.range.len, mode_wp, |
63b2d4174 userfaultfd: wp: ... |
1732 1733 1734 |
				  &ctx->mmap_changing);
	if (ret)
		return ret; |
23080e278 userfaultfd: wp: ... |
1735 |
if (!mode_wp && !mode_dontwake) { |
63b2d4174 userfaultfd: wp: ... |
1736 1737 1738 1739 1740 1741 |
		range.start = uffdio_wp.range.start;
		range.len = uffdio_wp.range.len;
		wake_userfault(ctx, &range);
	}
	return ret;
} |
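A minimal userspace sketch of driving this ioctl, assuming a kernel built with the write-protect support shown here and a range already registered with UFFDIO_REGISTER_MODE_WP; the helper name and parameters are hypothetical:

	/*
	 * Sketch only: arming and releasing write protection from userspace.
	 */
	#include <linux/userfaultfd.h>
	#include <sys/ioctl.h>

	static int wp_range(int uffd, unsigned long addr, unsigned long len,
			    int protect)
	{
		struct uffdio_writeprotect wp = {
			.range = { .start = addr, .len = len },
			/*
			 * Setting MODE_WP arms protection; clearing it (mode 0)
			 * both unprotects and wakes any blocked writers, since
			 * the function above only skips the wakeup when
			 * MODE_DONTWAKE is passed (and MODE_WP|MODE_DONTWAKE
			 * is rejected with -EINVAL).
			 */
			.mode = protect ? UFFDIO_WRITEPROTECT_MODE_WP : 0,
		};

		return ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
	}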
9cd75c3cd userfaultfd: non-... |
1742 1743 1744 1745 1746 1747 1748 |
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
	/*
	 * For the current set of features the bits just coincide
	 */
	return (unsigned int)user_features;
} |
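A short sketch of the userspace side of the feature bits that uffd_ctx_features() consumes via the UFFDIO_API handshake below; the helper name is hypothetical:

	/*
	 * Sketch only: negotiating features through UFFDIO_API. Per the
	 * checks below, unknown feature bits fail with EINVAL and
	 * UFFD_FEATURE_EVENT_FORK fails with EPERM without CAP_SYS_PTRACE.
	 */
	#include <linux/userfaultfd.h>
	#include <sys/ioctl.h>

	static int uffd_handshake(int uffd, __u64 wanted)
	{
		struct uffdio_api api = {
			.api = UFFD_API,
			.features = wanted,	/* e.g. UFFD_FEATURE_EVENT_FORK */
		};

		if (ioctl(uffd, UFFDIO_API, &api) == -1)
			return -1;
		/* api.features now reports all features this kernel supports */
		return 0;
	}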
86039bd3b userfaultfd: add ... |
1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 |
/*
 * userland asks for a certain API version and we return which bits
 * and ioctl commands are implemented in this kernel for that API
 * version, or -EINVAL if it is unknown.
 */
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
			   unsigned long arg)
{
	struct uffdio_api uffdio_api;
	void __user *buf = (void __user *)arg;
	int ret; |
656031445 userfaultfd: non-... |
1760 |
__u64 features; |
86039bd3b userfaultfd: add ... |
1761 1762 1763 1764 1765 |
	ret = -EINVAL;
	if (ctx->state != UFFD_STATE_WAIT_API)
		goto out;
	ret = -EFAULT; |
a9b85f941 userfaultfd: chan... |
1766 |
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) |
86039bd3b userfaultfd: add ... |
1767 |
goto out; |
656031445 userfaultfd: non-... |
1768 |
features = uffdio_api.features; |
3c1c24d91 userfaultfd: requ... |
1769 1770 1771 1772 1773 1774 |
	ret = -EINVAL;
	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
		goto err_out;
	ret = -EPERM;
	if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
		goto err_out; |
656031445 userfaultfd: non-... |
1775 1776 |
	/* report all available features and ioctls to userland */
	uffdio_api.features = UFFD_API_FEATURES; |
86039bd3b userfaultfd: add ... |
1777 1778 1779 1780 1781 |
	uffdio_api.ioctls = UFFD_API_IOCTLS;
	ret = -EFAULT;
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		goto out;
	ctx->state = UFFD_STATE_RUNNING; |
656031445 userfaultfd: non-... |
1782 1783 |
	/* only enable the requested features for this uffd context */
	ctx->features = uffd_ctx_features(features); |
86039bd3b userfaultfd: add ... |
1784 1785 1786 |
	ret = 0;
out:
	return ret; |
3c1c24d91 userfaultfd: requ... |
1787 1788 1789 1790 1791 |
err_out:
	memset(&uffdio_api, 0, sizeof(uffdio_api));
	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
		ret = -EFAULT;
	goto out; |
86039bd3b userfaultfd: add ... |
1792 1793 1794 1795 1796 1797 1798 |
}

static long userfaultfd_ioctl(struct file *file, unsigned cmd,
			      unsigned long arg)
{
	int ret = -EINVAL;
	struct userfaultfd_ctx *ctx = file->private_data; |
e6485a47b userfaultfd: requ... |
1799 1800 |
	if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
		return -EINVAL; |
86039bd3b userfaultfd: add ... |
1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 |
	switch(cmd) {
	case UFFDIO_API:
		ret = userfaultfd_api(ctx, arg);
		break;
	case UFFDIO_REGISTER:
		ret = userfaultfd_register(ctx, arg);
		break;
	case UFFDIO_UNREGISTER:
		ret = userfaultfd_unregister(ctx, arg);
		break;
	case UFFDIO_WAKE:
		ret = userfaultfd_wake(ctx, arg);
		break; |
ad465cae9 userfaultfd: UFFD... |
1814 1815 1816 1817 1818 1819 |
	case UFFDIO_COPY:
		ret = userfaultfd_copy(ctx, arg);
		break;
	case UFFDIO_ZEROPAGE:
		ret = userfaultfd_zeropage(ctx, arg);
		break; |
63b2d4174 userfaultfd: wp: ... |
1820 1821 1822 |
	case UFFDIO_WRITEPROTECT:
		ret = userfaultfd_writeprotect(ctx, arg);
		break; |
86039bd3b userfaultfd: add ... |
1823 1824 1825 1826 1827 1828 1829 1830 |
	}
	return ret;
}

#ifdef CONFIG_PROC_FS
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct userfaultfd_ctx *ctx = f->private_data; |
ac6424b98 sched/wait: Renam... |
1831 |
wait_queue_entry_t *wq; |
86039bd3b userfaultfd: add ... |
1832 |
unsigned long pending = 0, total = 0; |
cbcfa130a fs/userfaultfd.c:... |
1833 |
spin_lock_irq(&ctx->fault_pending_wqh.lock); |
2055da973 sched/wait: Disam... |
1834 |
list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) { |
15b726ef0 userfaultfd: opti... |
1835 1836 1837 |
		pending++;
		total++;
	} |
2055da973 sched/wait: Disam... |
1838 |
list_for_each_entry(wq, &ctx->fault_wqh.head, entry) { |
86039bd3b userfaultfd: add ... |
1839 1840 |
		total++;
	} |
cbcfa130a fs/userfaultfd.c:... |
1841 |
spin_unlock_irq(&ctx->fault_pending_wqh.lock); |
86039bd3b userfaultfd: add ... |
1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 |
	/*
	 * If more protocols are added, they will all be shown
	 * separated by a space, like this:
	 *	protocols: aa:... bb:...
	 */
	seq_printf(m, "pending:\t%lu total:\t%lu API:\t%Lx:%x:%Lx ", |
045098e94 userfaultfd: repo... |
1852 |
pending, total, UFFD_API, ctx->features, |
86039bd3b userfaultfd: add ... |
1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 |
		   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif

static const struct file_operations userfaultfd_fops = {
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= userfaultfd_show_fdinfo,
#endif
	.release	= userfaultfd_release,
	.poll		= userfaultfd_poll,
	.read		= userfaultfd_read,
	.unlocked_ioctl	= userfaultfd_ioctl, |
1832f2d8f compat_ioctl: mov... |
1865 |
.compat_ioctl = compat_ptr_ioctl, |
86039bd3b userfaultfd: add ... |
1866 1867 |
	.llseek		= noop_llseek,
}; |
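The .poll and .read entries above are what a userspace monitor thread drives. A hedged sketch of that loop, assuming an API-initialized uffd; the helper name is hypothetical:

	/*
	 * Sketch only: wait for the uffd to become readable, then read one
	 * struct uffd_msg and dispatch on the event type.
	 */
	#include <linux/userfaultfd.h>
	#include <poll.h>
	#include <unistd.h>

	static int drain_one_event(int uffd, struct uffd_msg *msg)
	{
		struct pollfd pfd = { .fd = uffd, .events = POLLIN };

		if (poll(&pfd, 1, -1) == -1)
			return -1;
		/* reads are message-sized; a short read indicates an error */
		if (read(uffd, msg, sizeof(*msg)) != sizeof(*msg))
			return -1;
		if (msg->event == UFFD_EVENT_PAGEFAULT) {
			/* msg->arg.pagefault.address is the faulting address */
		}
		return 0;
	}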
3004ec9ca userfaultfd: allo... |
1868 1869 1870 1871 1872 1873 |
static void init_once_userfaultfd_ctx(void *mem)
{
	struct userfaultfd_ctx *ctx = (struct userfaultfd_ctx *) mem;

	init_waitqueue_head(&ctx->fault_pending_wqh);
	init_waitqueue_head(&ctx->fault_wqh); |
9cd75c3cd userfaultfd: non-... |
1874 |
init_waitqueue_head(&ctx->event_wqh); |
3004ec9ca userfaultfd: allo... |
1875 |
init_waitqueue_head(&ctx->fd_wqh); |
2ca97ac8b userfaultfd: Use ... |
1876 |
seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock); |
3004ec9ca userfaultfd: allo... |
1877 |
} |
284cd241a userfaultfd: conv... |
1878 |
SYSCALL_DEFINE1(userfaultfd, int, flags) |
86039bd3b userfaultfd: add ... |
1879 |
{ |
86039bd3b userfaultfd: add ... |
1880 |
struct userfaultfd_ctx *ctx; |
284cd241a userfaultfd: conv... |
1881 |
int fd; |
86039bd3b userfaultfd: add ... |
1882 |
|
cefdca0a8 userfaultfd/sysct... |
1883 1884 |
	if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE))
		return -EPERM; |
86039bd3b userfaultfd: add ... |
1885 1886 1887 1888 1889 |
	BUG_ON(!current->mm);

	/* Check the UFFD_* constants for consistency.  */
	BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); |
86039bd3b userfaultfd: add ... |
1890 |
if (flags & ~UFFD_SHARED_FCNTL_FLAGS) |
284cd241a userfaultfd: conv... |
1891 |
return -EINVAL; |
86039bd3b userfaultfd: add ... |
1892 |
|
3004ec9ca userfaultfd: allo... |
1893 |
ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); |
86039bd3b userfaultfd: add ... |
1894 |
if (!ctx) |
284cd241a userfaultfd: conv... |
1895 |
return -ENOMEM; |
86039bd3b userfaultfd: add ... |
1896 |
|
ca8804206 userfaultfd: conv... |
1897 |
refcount_set(&ctx->refcount, 1); |
86039bd3b userfaultfd: add ... |
1898 |
ctx->flags = flags; |
9cd75c3cd userfaultfd: non-... |
1899 |
ctx->features = 0; |
86039bd3b userfaultfd: add ... |
1900 1901 |
	ctx->state = UFFD_STATE_WAIT_API;
	ctx->released = false; |
df2cc96e7 userfaultfd: prev... |
1902 |
ctx->mmap_changing = false; |
86039bd3b userfaultfd: add ... |
1903 1904 |
	ctx->mm = current->mm;
	/* prevent the mm struct from being freed */ |
f1f100764 mm: add new mmgra... |
1905 |
mmgrab(ctx->mm); |
86039bd3b userfaultfd: add ... |
1906 |
|
284cd241a userfaultfd: conv... |
1907 1908 1909 |
	fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
			      O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
	if (fd < 0) { |
d2005e3f4 userfaultfd: don'... |
1910 |
mmdrop(ctx->mm); |
3004ec9ca userfaultfd: allo... |
1911 |
kmem_cache_free(userfaultfd_ctx_cachep, ctx); |
c03e946fd userfaultfd: add ... |
1912 |
} |
86039bd3b userfaultfd: add ... |
1913 |
return fd; |
86039bd3b userfaultfd: add ... |
1914 |
} |
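A minimal, self-contained userspace sketch of driving the syscall defined above end to end: create a uffd, negotiate the API, register an anonymous mapping in MISSING mode, and resolve the resulting fault with UFFDIO_COPY from a second thread. It assumes a 4 KiB page size for the staging buffer and abbreviates error handling:

	#include <fcntl.h>
	#include <linux/userfaultfd.h>
	#include <pthread.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static long page_sz;

	static void *handler(void *arg)
	{
		int uffd = (int)(long)arg;
		struct uffd_msg msg;
		struct uffdio_copy copy;
		static char page[4096];		/* assumes 4K pages for brevity */

		memset(page, 'x', sizeof(page));
		read(uffd, &msg, sizeof(msg));	/* blocks until a fault arrives */
		copy.dst = msg.arg.pagefault.address & ~(page_sz - 1);
		copy.src = (unsigned long)page;
		copy.len = page_sz;
		copy.mode = 0;			/* wake the faulting thread */
		ioctl(uffd, UFFDIO_COPY, &copy);
		return NULL;
	}

	int main(void)
	{
		struct uffdio_api api = { .api = UFFD_API };
		struct uffdio_register reg;
		pthread_t thr;
		char *area;
		int uffd;

		page_sz = sysconf(_SC_PAGESIZE);
		uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
		ioctl(uffd, UFFDIO_API, &api);

		area = mmap(NULL, page_sz, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		reg.range.start = (unsigned long)area;
		reg.range.len = page_sz;
		reg.mode = UFFDIO_REGISTER_MODE_MISSING;
		ioctl(uffd, UFFDIO_REGISTER, &reg);
		/* reg.ioctls now advertises e.g. the _UFFDIO_COPY bit */

		pthread_create(&thr, NULL, handler, (void *)(long)uffd);
		return area[0] == 'x' ? 0 : 1;	/* faults; handler fills the page */
	}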
3004ec9ca userfaultfd: allo... |
1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 |
static int __init userfaultfd_init(void)
{
	userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
						sizeof(struct userfaultfd_ctx),
						0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC,
						init_once_userfaultfd_ctx);
	return 0;
}
__initcall(userfaultfd_init); |