fs/dax.c
/*
 * fs/dax.c - Direct Access filesystem code
 * Copyright (c) 2013-2014 Intel Corporation
 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/atomic.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/pagevec.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/fs_dax.h>
/* We choose 4096 entries - same as per-zone page wait tables */
#define DAX_WAIT_TABLE_BITS 12
#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)

/* The 'colour' (ie low bits) within a PMD of a page offset.  */
#define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)

static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];

static int __init init_dax_wait_table(void)
{
	int i;

	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
		init_waitqueue_head(wait_table + i);
	return 0;
}
fs_initcall(init_dax_wait_table);

/*
 * We use the lowest available bit in an exceptional entry for locking, one
 * bit for the entry size (PMD) and two more to tell us if the entry is a
 * zero page or an empty entry that is just used for locking.  In total four
 * special bits.
 *
 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 * block allocation.
 */
#define RADIX_DAX_SHIFT		(RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
#define RADIX_DAX_ENTRY_LOCK	(1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
#define RADIX_DAX_PMD		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
#define RADIX_DAX_ZERO_PAGE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
#define RADIX_DAX_EMPTY		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))

static unsigned long dax_radix_sector(void *entry)
{
	return (unsigned long)entry >> RADIX_DAX_SHIFT;
}

static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
{
	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
			((unsigned long)sector << RADIX_DAX_SHIFT) |
			RADIX_DAX_ENTRY_LOCK);
}

static unsigned int dax_radix_order(void *entry)
{
	if ((unsigned long)entry & RADIX_DAX_PMD)
		return PMD_SHIFT - PAGE_SHIFT;
	return 0;
}
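/*
 * Worked example (assuming the generic RADIX_TREE_EXCEPTIONAL_SHIFT of 2,
 * which makes RADIX_DAX_SHIFT 6): dax_radix_locked_entry(0x2000, RADIX_DAX_PMD)
 * packs up as
 *
 *	(0x2000UL << 6) | RADIX_DAX_PMD | RADIX_DAX_ENTRY_LOCK |
 *		RADIX_TREE_EXCEPTIONAL_ENTRY
 *
 * i.e. the sector in the high bits, the four DAX flag bits below it, and the
 * exceptional-entry marker at the bottom.  dax_radix_sector() just shifts
 * the flag bits back out again.
 */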
static int dax_is_pmd_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_PMD;
}

static int dax_is_pte_entry(void *entry)
{
	return !((unsigned long)entry & RADIX_DAX_PMD);
}

static int dax_is_zero_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_ZERO_PAGE;
}

static int dax_is_empty_entry(void *entry)
{
	return (unsigned long)entry & RADIX_DAX_EMPTY;
}
/*
 * DAX radix tree locking
 */
struct exceptional_entry_key {
	struct address_space *mapping;
	pgoff_t entry_start;
};

struct wait_exceptional_entry_queue {
	wait_queue_entry_t wait;
	struct exceptional_entry_key key;
};

static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
		pgoff_t index, void *entry, struct exceptional_entry_key *key)
{
	unsigned long hash;

	/*
	 * If 'entry' is a PMD, align the 'index' that we use for the wait
	 * queue to the start of that PMD.  This ensures that all offsets in
	 * the range covered by the PMD map to the same bit lock.
	 */
	if (dax_is_pmd_entry(entry))
		index &= ~PG_PMD_COLOUR;

	key->mapping = mapping;
	key->entry_start = index;

	hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS);
	return wait_table + hash;
}
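/*
 * For instance, on x86-64 with 4k pages PG_PMD_COLOUR is 511, so for a PMD
 * entry covering indices 0x200-0x3ff a waiter on index 0x250 and a waiter on
 * index 0x3ff both hash with index 0x200 and therefore queue on the same
 * wait_queue_head.
 */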
static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
		unsigned int mode, int sync, void *keyp)
{
	struct exceptional_entry_key *key = keyp;
	struct wait_exceptional_entry_queue *ewait =
		container_of(wait, struct wait_exceptional_entry_queue, wait);

	if (key->mapping != ewait->key.mapping ||
	    key->entry_start != ewait->key.entry_start)
		return 0;
	return autoremove_wake_function(wait, mode, sync, NULL);
}

/*
 * We do not necessarily hold the mapping->tree_lock when we call this
 * function so it is possible that 'entry' is no longer a valid item in the
 * radix tree.  This is okay because all we really need to do is to find the
 * correct waitqueue where tasks might be waiting for that old 'entry' and
 * wake them.
 */
static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
		pgoff_t index, void *entry, bool wake_all)
{
	struct exceptional_entry_key key;
	wait_queue_head_t *wq;

	wq = dax_entry_waitqueue(mapping, index, entry, &key);

	/*
	 * Checking for locked entry and prepare_to_wait_exclusive() happens
	 * under mapping->tree_lock, ditto for entry handling in our callers.
	 * So at this point all tasks that could have seen our entry locked
	 * must be in the waitqueue and the following check will see them.
	 */
	if (waitqueue_active(wq))
		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
}
/*
 * Check whether the given slot is locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline int slot_locked(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
	return entry & RADIX_DAX_ENTRY_LOCK;
}

/*
 * Mark the given slot as locked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *lock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry |= RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
	return (void *)entry;
}

/*
 * Mark the given slot as unlocked.  The function must be called with
 * mapping->tree_lock held.
 */
static inline void *unlock_slot(struct address_space *mapping, void **slot)
{
	unsigned long entry = (unsigned long)
		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);

	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
	radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
	return (void *)entry;
}

/*
 * Look up an entry in the radix tree; if it is an exceptional entry, wait
 * for it to become unlocked and return it.  The caller must then call
 * put_unlocked_mapping_entry() if it decides not to lock the entry, or
 * put_locked_mapping_entry() once it has locked the entry and wants to
 * unlock it again.
 *
 * The function must be called with mapping->tree_lock held.
 */
static void *get_unlocked_mapping_entry(struct address_space *mapping,
					pgoff_t index, void ***slotp)
{
	void *entry, **slot;
	struct wait_exceptional_entry_queue ewait;
	wait_queue_head_t *wq;

	init_wait(&ewait.wait);
	ewait.wait.func = wake_exceptional_entry_func;

	for (;;) {
		entry = __radix_tree_lookup(&mapping->page_tree, index, NULL,
					  &slot);
		if (!entry ||
		    WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) ||
		    !slot_locked(mapping, slot)) {
			if (slotp)
				*slotp = slot;
			return entry;
		}

		wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key);
		prepare_to_wait_exclusive(wq, &ewait.wait,
					  TASK_UNINTERRUPTIBLE);
		spin_unlock_irq(&mapping->tree_lock);
		schedule();
		finish_wait(wq, &ewait.wait);
		spin_lock_irq(&mapping->tree_lock);
	}
}
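/*
 * The resulting locking pattern, used by the callers below, is roughly:
 *
 *	spin_lock_irq(&mapping->tree_lock);
 *	entry = get_unlocked_mapping_entry(mapping, index, &slot);
 *	if (<we want to hold the entry>)
 *		entry = lock_slot(mapping, slot);
 *	else
 *		put_unlocked_mapping_entry(mapping, index, entry);
 *	spin_unlock_irq(&mapping->tree_lock);
 */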
static void dax_unlock_mapping_entry(struct address_space *mapping,
				     pgoff_t index)
{
	void *entry, **slot;

	spin_lock_irq(&mapping->tree_lock);
	entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
			 !slot_locked(mapping, slot))) {
		spin_unlock_irq(&mapping->tree_lock);
		return;
	}
	unlock_slot(mapping, slot);
	spin_unlock_irq(&mapping->tree_lock);
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}

static void put_locked_mapping_entry(struct address_space *mapping,
		pgoff_t index)
{
	dax_unlock_mapping_entry(mapping, index);
}

/*
 * Called when we are done with radix tree entry we looked up via
 * get_unlocked_mapping_entry() and which we didn't lock in the end.
 */
static void put_unlocked_mapping_entry(struct address_space *mapping,
				       pgoff_t index, void *entry)
{
	if (!entry)
		return;

	/* We have to wake up next waiter for the radix tree entry lock */
	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
}
/*
 * Find radix tree entry at given index. If it points to an exceptional entry,
 * return it with the radix tree entry locked. If the radix tree doesn't
 * contain given index, create an empty exceptional entry for the index and
 * return with it locked.
 *
 * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will
 * either return that locked entry or will return an error.  This error will
 * happen if there are any 4k entries within the 2MiB range that we are
 * requesting.
 *
 * We always favor 4k entries over 2MiB entries. There isn't a flow where we
 * evict 4k entries in order to 'upgrade' them to a 2MiB entry.  A 2MiB
 * insertion will fail if it finds any 4k entries already in the tree, and a
 * 4k insertion will cause an existing 2MiB entry to be unmapped and
 * downgraded to 4k entries.  This happens for both 2MiB huge zero pages as
 * well as 2MiB empty entries.
 *
 * The exception to this downgrade path is for 2MiB DAX PMD entries that have
 * real storage backing them.  We will leave these real 2MiB DAX entries in
 * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry.
 *
 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
 * persistent memory the benefit is doubtful. We can add that later if we can
 * show it helps.
 */
static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
		unsigned long size_flag)
{
	bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */
	void *entry, **slot;

restart:
	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, &slot);

	if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) {
		entry = ERR_PTR(-EIO);
		goto out_unlock;
	}

	if (entry) {
		if (size_flag & RADIX_DAX_PMD) {
			if (dax_is_pte_entry(entry)) {
				put_unlocked_mapping_entry(mapping, index,
						entry);
				entry = ERR_PTR(-EEXIST);
				goto out_unlock;
			}
		} else { /* trying to grab a PTE entry */
			if (dax_is_pmd_entry(entry) &&
			    (dax_is_zero_entry(entry) ||
			     dax_is_empty_entry(entry))) {
				pmd_downgrade = true;
			}
		}
	}

	/* No entry for given index? Make sure radix tree is big enough. */
	if (!entry || pmd_downgrade) {
		int err;

		if (pmd_downgrade) {
			/*
			 * Make sure 'entry' remains valid while we drop
			 * mapping->tree_lock.
			 */
			entry = lock_slot(mapping, slot);
		}

		spin_unlock_irq(&mapping->tree_lock);
		/*
		 * Besides huge zero pages the only other thing that gets
		 * downgraded are empty entries which don't need to be
		 * unmapped.
		 */
		if (pmd_downgrade && dax_is_zero_entry(entry))
			unmap_mapping_range(mapping,
				(index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0);

		err = radix_tree_preload(
				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
		if (err) {
			if (pmd_downgrade)
				put_locked_mapping_entry(mapping, index);
			return ERR_PTR(err);
		}
		spin_lock_irq(&mapping->tree_lock);

		if (!entry) {
			/*
			 * We needed to drop the page_tree lock while calling
			 * radix_tree_preload() and we didn't have an entry to
			 * lock.  See if another thread inserted an entry at
			 * our index during this time.
			 */
			entry = __radix_tree_lookup(&mapping->page_tree, index,
					NULL, &slot);
			if (entry) {
				radix_tree_preload_end();
				spin_unlock_irq(&mapping->tree_lock);
				goto restart;
			}
		}

		if (pmd_downgrade) {
			radix_tree_delete(&mapping->page_tree, index);
			mapping->nrexceptional--;
			dax_wake_mapping_entry_waiter(mapping, index, entry,
					true);
		}

		entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY);

		err = __radix_tree_insert(&mapping->page_tree, index,
				dax_radix_order(entry), entry);
		radix_tree_preload_end();
		if (err) {
			spin_unlock_irq(&mapping->tree_lock);
			/*
			 * Our insertion of a DAX entry failed, most likely
			 * because we were inserting a PMD entry and it
			 * collided with a PTE sized entry at a different
			 * index in the PMD range.  We haven't inserted
			 * anything into the radix tree and have no waiters to
			 * wake.
			 */
			return ERR_PTR(err);
		}
		/* Good, we have inserted empty locked entry into the tree. */
		mapping->nrexceptional++;
		spin_unlock_irq(&mapping->tree_lock);
		return entry;
	}
	entry = lock_slot(mapping, slot);
 out_unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return entry;
}
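/*
 * A fault handler pairs grab_mapping_entry() with put_locked_mapping_entry(),
 * e.g. (compare dax_iomap_pte_fault() below):
 *
 *	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
 *	if (IS_ERR(entry))
 *		return dax_fault_return(PTR_ERR(entry));
 *	... install the mapping, e.g. via dax_insert_mapping_entry() ...
 *	put_locked_mapping_entry(mapping, vmf->pgoff);
 */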
static int __dax_invalidate_mapping_entry(struct address_space *mapping,
					  pgoff_t index, bool trunc)
{
	int ret = 0;
	void *entry;
	struct radix_tree_root *page_tree = &mapping->page_tree;

	spin_lock_irq(&mapping->tree_lock);
	entry = get_unlocked_mapping_entry(mapping, index, NULL);
	if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)))
		goto out;
	if (!trunc &&
	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
		goto out;
	radix_tree_delete(page_tree, index);
	mapping->nrexceptional--;
	ret = 1;
out:
	put_unlocked_mapping_entry(mapping, index, entry);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
 * entry to get unlocked before deleting it.
 */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	int ret = __dax_invalidate_mapping_entry(mapping, index, true);

	/*
	 * This gets called from the truncate / punch_hole path. As such, the
	 * caller must hold locks protecting against concurrent modifications
	 * of the radix tree (usually fs-private i_mmap_sem for writing).
	 * Since the caller has seen an exceptional entry for this index, we
	 * had better find it at that index as well...
	 */
	WARN_ON_ONCE(!ret);
	return ret;
}

/*
 * Invalidate exceptional DAX entry if it is clean.
 */
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
				      pgoff_t index)
{
	return __dax_invalidate_mapping_entry(mapping, index, false);
}
static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
		sector_t sector, size_t size, struct page *to,
		unsigned long vaddr)
{
	void *vto, *kaddr;
	pgoff_t pgoff;
	pfn_t pfn;
	long rc;
	int id;

	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
	if (rc)
		return rc;

	id = dax_read_lock();
	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
	if (rc < 0) {
		dax_read_unlock(id);
		return rc;
	}
	vto = kmap_atomic(to);
	copy_user_page(vto, (void __force *)kaddr, vaddr, to);
	kunmap_atomic(vto);
	dax_read_unlock(id);
	return 0;
}
/*
 * By this point grab_mapping_entry() has ensured that we have a locked entry
 * of the appropriate size so we don't have to worry about downgrading PMDs to
 * PTEs.  If we happen to be trying to insert a PTE and there is a PMD
 * already in the tree, we will skip the insertion and just dirty the PMD as
 * appropriate.
 */
static void *dax_insert_mapping_entry(struct address_space *mapping,
				      struct vm_fault *vmf,
				      void *entry, sector_t sector,
				      unsigned long flags)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	void *new_entry;
	pgoff_t index = vmf->pgoff;

	if (vmf->flags & FAULT_FLAG_WRITE)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
		/* we are replacing a zero page with block mapping */
		if (dax_is_pmd_entry(entry))
			unmap_mapping_range(mapping,
					(vmf->pgoff << PAGE_SHIFT) & PMD_MASK,
					PMD_SIZE, 0);
		else /* pte entry */
			unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
					PAGE_SIZE, 0);
	}

	spin_lock_irq(&mapping->tree_lock);
	new_entry = dax_radix_locked_entry(sector, flags);

	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
		/*
		 * Only swap our new entry into the radix tree if the current
		 * entry is a zero page or an empty entry.  If a normal PTE or
		 * PMD entry is already in the tree, we leave it alone.  This
		 * means that if we are trying to insert a PTE and the
		 * existing entry is a PMD, we will just leave the PMD in the
		 * tree and dirty it if necessary.
		 */
		struct radix_tree_node *node;
		void **slot;
		void *ret;

		ret = __radix_tree_lookup(page_tree, index, &node, &slot);
		WARN_ON_ONCE(ret != entry);
		__radix_tree_replace(page_tree, node, slot,
				     new_entry, NULL, NULL);
		entry = new_entry;
	}

	if (vmf->flags & FAULT_FLAG_WRITE)
		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);

	spin_unlock_irq(&mapping->tree_lock);
	return entry;
}
static inline unsigned long
pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
{
	unsigned long address;

	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
	return address;
}

/* Walk all mappings of a given index of a file and writeprotect them */
static void dax_mapping_entry_mkclean(struct address_space *mapping,
				      pgoff_t index, unsigned long pfn)
{
	struct vm_area_struct *vma;
	pte_t pte, *ptep = NULL;
	pmd_t *pmdp = NULL;
	spinlock_t *ptl;

	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
		unsigned long address, start, end;

		cond_resched();

		if (!(vma->vm_flags & VM_SHARED))
			continue;

		address = pgoff_address(index, vma);

		/*
		 * Note because we provide start/end to follow_pte_pmd it will
		 * call mmu_notifier_invalidate_range_start() on our behalf
		 * before taking any lock.
		 */
		if (follow_pte_pmd(vma->vm_mm, address, &start, &end,
				   &ptep, &pmdp, &ptl))
			continue;

		if (pmdp) {
#ifdef CONFIG_FS_DAX_PMD
			pmd_t pmd;

			if (pfn != pmd_pfn(*pmdp))
				goto unlock_pmd;
			if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
				goto unlock_pmd;

			flush_cache_page(vma, address, pfn);
			pmd = pmdp_huge_clear_flush(vma, address, pmdp);
			pmd = pmd_wrprotect(pmd);
			pmd = pmd_mkclean(pmd);
			set_pmd_at(vma->vm_mm, address, pmdp, pmd);
			mmu_notifier_invalidate_range(vma->vm_mm, start, end);
unlock_pmd:
#endif
			spin_unlock(ptl);
		} else {
			if (pfn != pte_pfn(*ptep))
				goto unlock_pte;
			if (!pte_dirty(*ptep) && !pte_write(*ptep))
				goto unlock_pte;

			flush_cache_page(vma, address, pfn);
			pte = ptep_clear_flush(vma, address, ptep);
			pte = pte_wrprotect(pte);
			pte = pte_mkclean(pte);
			set_pte_at(vma->vm_mm, address, ptep, pte);
			mmu_notifier_invalidate_range(vma->vm_mm, start, end);
unlock_pte:
			pte_unmap_unlock(ptep, ptl);
		}

		mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
	}
	i_mmap_unlock_read(mapping);
}
static int dax_writeback_one(struct block_device *bdev,
		struct dax_device *dax_dev, struct address_space *mapping,
		pgoff_t index, void *entry)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	void *entry2, **slot, *kaddr;
	long ret = 0, id;
	sector_t sector;
	pgoff_t pgoff;
	size_t size;
	pfn_t pfn;

	/*
	 * A page got tagged dirty in DAX mapping? Something is seriously
	 * wrong.
	 */
	if (WARN_ON(!radix_tree_exceptional_entry(entry)))
		return -EIO;

	spin_lock_irq(&mapping->tree_lock);
	entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
	/* Entry got punched out / reallocated? */
	if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2)))
		goto put_unlocked;
	/*
	 * Entry got reallocated elsewhere? No need to writeback. We have to
	 * compare sectors as we must not bail out due to difference in lockbit
	 * or entry type.
	 */
	if (dax_radix_sector(entry2) != dax_radix_sector(entry))
		goto put_unlocked;
	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
				dax_is_zero_entry(entry))) {
		ret = -EIO;
		goto put_unlocked;
	}

	/* Another fsync thread may have already written back this entry */
	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
		goto put_unlocked;
	/* Lock the entry to serialize with page faults */
	entry = lock_slot(mapping, slot);
	/*
	 * We can clear the tag now but we have to be careful so that concurrent
	 * dax_writeback_one() calls for the same index cannot finish before we
	 * actually flush the caches. This is achieved as the calls will look
	 * at the entry only under tree_lock and once they do that they will
	 * see the entry locked and wait for it to unlock.
	 */
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
	spin_unlock_irq(&mapping->tree_lock);

	/*
	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
	 * in the middle of a PMD, the 'index' we are given will be aligned to
	 * the start index of the PMD, as will the sector we pull from
	 * 'entry'.  This allows us to flush for PMD_SIZE and not have to
	 * worry about partial PMD writebacks.
	 */
	sector = dax_radix_sector(entry);
	size = PAGE_SIZE << dax_radix_order(entry);

	id = dax_read_lock();
	ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
	if (ret)
		goto dax_unlock;

	/*
	 * dax_direct_access() may sleep, so cannot hold tree_lock over
	 * its invocation.
	 */
	ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn);
	if (ret < 0)
		goto dax_unlock;

	if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) {
		ret = -EIO;
		goto dax_unlock;
	}

	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
	dax_flush(dax_dev, kaddr, size);
	/*
	 * After we have flushed the cache, we can clear the dirty tag. There
	 * cannot be new dirty data in the pfn after the flush has completed as
	 * the pfn mappings are writeprotected and fault waits for mapping
	 * entry lock.
	 */
	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
	spin_unlock_irq(&mapping->tree_lock);
	trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
 dax_unlock:
	dax_read_unlock(id);
	put_locked_mapping_entry(mapping, index);
	return ret;

 put_unlocked:
	put_unlocked_mapping_entry(mapping, index, entry2);
	spin_unlock_irq(&mapping->tree_lock);
	return ret;
}

/*
 * Flush the mapping to the persistent domain within the byte range of [start,
 * end]. This is required by data integrity operations to ensure file data is
 * on persistent storage prior to completion of the operation.
 */
int dax_writeback_mapping_range(struct address_space *mapping,
		struct block_device *bdev, struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	pgoff_t start_index, end_index;
	pgoff_t indices[PAGEVEC_SIZE];
	struct dax_device *dax_dev;
	struct pagevec pvec;
	bool done = false;
	int i, ret = 0;

	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
		return -EIO;

	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
		return 0;

	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
	if (!dax_dev)
		return -EIO;

	start_index = wbc->range_start >> PAGE_SHIFT;
	end_index = wbc->range_end >> PAGE_SHIFT;

	trace_dax_writeback_range(inode, start_index, end_index);

	tag_pages_for_writeback(mapping, start_index, end_index);

	pagevec_init(&pvec, 0);
	while (!done) {
		pvec.nr = find_get_entries_tag(mapping, start_index,
				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
				pvec.pages, indices);

		if (pvec.nr == 0)
			break;

		for (i = 0; i < pvec.nr; i++) {
			if (indices[i] > end_index) {
				done = true;
				break;
			}

			ret = dax_writeback_one(bdev, dax_dev, mapping,
					indices[i], pvec.pages[i]);
			if (ret < 0) {
				mapping_set_error(mapping, ret);
				goto out;
			}
		}
		start_index = indices[pvec.nr - 1] + 1;
	}
out:
	put_dax(dax_dev);
	trace_dax_writeback_range_done(inode, start_index, end_index);
	return (ret < 0 ? ret : 0);
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
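/*
 * A filesystem typically calls this from its ->writepages() method so that
 * fsync()/msync() flush CPU caches for DAX mappings.  Sketch for a
 * hypothetical filesystem "foo" (modelled on how ext4 wires it up in this
 * kernel generation):
 *
 *	static int foo_writepages(struct address_space *mapping,
 *				  struct writeback_control *wbc)
 *	{
 *		return dax_writeback_mapping_range(mapping,
 *				mapping->host->i_sb->s_bdev, wbc);
 *	}
 */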
static int dax_insert_mapping(struct address_space *mapping,
		struct block_device *bdev, struct dax_device *dax_dev,
		sector_t sector, size_t size, void *entry,
		struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long vaddr = vmf->address;
	void *ret, *kaddr;
	pgoff_t pgoff;
	int id, rc;
	pfn_t pfn;

	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
	if (rc)
		return rc;

	id = dax_read_lock();
	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
	if (rc < 0) {
		dax_read_unlock(id);
		return rc;
	}
	dax_read_unlock(id);

	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
	if (IS_ERR(ret))
		return PTR_ERR(ret);

	trace_dax_insert_mapping(mapping->host, vmf, ret);
	if (vmf->flags & FAULT_FLAG_WRITE)
		return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
	else
		return vm_insert_mixed(vma, vaddr, pfn);
}
/*
 * The user has performed a load from a hole in the file.  Allocating a new
 * page in the file would cause excessive storage usage for workloads with
 * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
 * If this page is ever written to we will re-fault and change the mapping to
 * point to real DAX storage instead.
 */
static int dax_load_hole(struct address_space *mapping, void *entry,
			 struct vm_fault *vmf)
{
	struct inode *inode = mapping->host;
	unsigned long vaddr = vmf->address;
	int ret = VM_FAULT_NOPAGE;
	struct page *zero_page;
	void *entry2;

	zero_page = ZERO_PAGE(0);
	if (unlikely(!zero_page)) {
		ret = VM_FAULT_OOM;
		goto out;
	}

	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
			RADIX_DAX_ZERO_PAGE);
	if (IS_ERR(entry2)) {
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
out:
	trace_dax_load_hole(inode, vmf, ret);
	return ret;
}
static bool dax_range_is_aligned(struct block_device *bdev,
				 unsigned int offset, unsigned int length)
{
	unsigned short sector_size = bdev_logical_block_size(bdev);

	if (!IS_ALIGNED(offset, sector_size))
		return false;
	if (!IS_ALIGNED(length, sector_size))
		return false;

	return true;
}

int __dax_zero_page_range(struct block_device *bdev,
		struct dax_device *dax_dev, sector_t sector,
		unsigned int offset, unsigned int size)
{
	if (dax_range_is_aligned(bdev, offset, size)) {
		sector_t start_sector = sector + (offset >> 9);

		return blkdev_issue_zeroout(bdev, start_sector,
				size >> 9, GFP_NOFS, 0);
	} else {
		pgoff_t pgoff;
		long rc, id;
		void *kaddr;
		pfn_t pfn;

		rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
		if (rc)
			return rc;

		id = dax_read_lock();
		rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr,
				&pfn);
		if (rc < 0) {
			dax_read_unlock(id);
			return rc;
		}
		memset(kaddr + offset, 0, size);
		dax_flush(dax_dev, kaddr + offset, size);
		dax_read_unlock(id);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(__dax_zero_page_range);
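/*
 * For example, on a device with 512-byte logical blocks, zeroing 1024 bytes
 * at a 512-byte-aligned offset takes the blkdev_issue_zeroout() path above,
 * while zeroing 100 bytes in the middle of a block falls back to memset()
 * plus dax_flush() so the zeroes are durable on persistent memory.
 */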
static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
}
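/*
 * iomap->blkno is expressed in 512-byte units, hence the >> 9.  E.g. for a
 * page-aligned pos that is 8192 bytes past iomap->offset, this returns
 * iomap->blkno + 16.
 */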
static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
		struct iomap *iomap)
{
	struct block_device *bdev = iomap->bdev;
	struct dax_device *dax_dev = iomap->dax_dev;
	struct iov_iter *iter = data;
	loff_t end = pos + length, done = 0;
	ssize_t ret = 0;
	int id;

	if (iov_iter_rw(iter) == READ) {
		end = min(end, i_size_read(inode));
		if (pos >= end)
			return 0;

		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
			return iov_iter_zero(min(length, end - pos), iter);
	}

	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
		return -EIO;

	/*
	 * Write can allocate block for an area which has a hole page mapped
	 * into page tables. We have to tear down these mappings so that data
	 * written by write(2) is visible in mmap.
	 */
	if (iomap->flags & IOMAP_F_NEW) {
		invalidate_inode_pages2_range(inode->i_mapping,
					      pos >> PAGE_SHIFT,
					      (end - 1) >> PAGE_SHIFT);
	}

	id = dax_read_lock();
	while (pos < end) {
		unsigned offset = pos & (PAGE_SIZE - 1);
		const size_t size = ALIGN(length + offset, PAGE_SIZE);
		const sector_t sector = dax_iomap_sector(iomap, pos);
		ssize_t map_len;
		pgoff_t pgoff;
		void *kaddr;
		pfn_t pfn;

		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			break;
		}

		ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
		if (ret)
			break;

		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
				&kaddr, &pfn);
		if (map_len < 0) {
			ret = map_len;
			break;
		}

		map_len = PFN_PHYS(map_len);
		kaddr += offset;
		map_len -= offset;
		if (map_len > end - pos)
			map_len = end - pos;

		/*
		 * The userspace address for the memory copy has already been
		 * validated via access_ok() in either vfs_read() or
		 * vfs_write(), depending on which operation we are doing.
		 */
		if (iov_iter_rw(iter) == WRITE)
			map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
					map_len, iter);
		else
			map_len = copy_to_iter(kaddr, map_len, iter);
		if (map_len <= 0) {
			ret = map_len ? map_len : -EFAULT;
			break;
		}

		pos += map_len;
		length -= map_len;
		done += map_len;
	}
	dax_read_unlock(id);

	return done ? done : ret;
}

/**
 * dax_iomap_rw - Perform I/O to a DAX file
 * @iocb:	The control block for this I/O
 * @iter:	The addresses to do I/O from or to
 * @ops:	iomap ops passed from the file system
 *
 * This function performs read and write operations to directly mapped
 * persistent memory.  The caller needs to take care of read/write exclusion
 * and evicting any page cache pages in the region under I/O.
 */
ssize_t
dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = mapping->host;
	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
	unsigned flags = 0;

	if (iov_iter_rw(iter) == WRITE) {
		lockdep_assert_held_exclusive(&inode->i_rwsem);
		flags |= IOMAP_WRITE;
	} else {
		lockdep_assert_held(&inode->i_rwsem);
	}

	while (iov_iter_count(iter)) {
		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
				iter, dax_iomap_actor);
		if (ret <= 0)
			break;
		pos += ret;
		done += ret;
	}

	iocb->ki_pos += done;
	return done ? done : ret;
}
EXPORT_SYMBOL_GPL(dax_iomap_rw);
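/*
 * A read path built on dax_iomap_rw() looks roughly like the following
 * (hypothetical foo_file_read_iter(), modelled on the ext2/XFS callers;
 * the write side additionally takes i_rwsem exclusively):
 *
 *	static ssize_t foo_file_read_iter(struct kiocb *iocb,
 *					  struct iov_iter *to)
 *	{
 *		struct inode *inode = iocb->ki_filp->f_mapping->host;
 *		ssize_t ret;
 *
 *		inode_lock_shared(inode);
 *		ret = dax_iomap_rw(iocb, to, &foo_iomap_ops);
 *		inode_unlock_shared(inode);
 *
 *		file_accessed(iocb->ki_filp);
 *		return ret;
 *	}
 */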
static int dax_fault_return(int error)
{
	if (error == 0)
		return VM_FAULT_NOPAGE;
	if (error == -ENOMEM)
		return VM_FAULT_OOM;
	return VM_FAULT_SIGBUS;
}
static int dax_iomap_pte_fault(struct vm_fault *vmf,
			       const struct iomap_ops *ops)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned long vaddr = vmf->address;
	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
	sector_t sector;
	struct iomap iomap = { 0 };
	unsigned flags = IOMAP_FAULT;
	int error, major = 0;
	int vmf_ret = 0;
	void *entry;

	trace_dax_pte_fault(inode, vmf, vmf_ret);
	/*
	 * Check whether offset isn't beyond end of file now. Caller is
	 * supposed to hold locks serializing us with truncate / punch hole so
	 * this is a reliable test.
	 */
	if (pos >= i_size_read(inode)) {
		vmf_ret = VM_FAULT_SIGBUS;
		goto out;
	}

	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
		flags |= IOMAP_WRITE;

	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
	if (IS_ERR(entry)) {
		vmf_ret = dax_fault_return(PTR_ERR(entry));
		goto out;
	}

	/*
	 * It is possible, particularly with mixed reads & writes to private
	 * mappings, that we have raced with a PMD fault that overlaps with
	 * the PTE we need to set up.  If so just return and the fault will be
	 * retried.
	 */
	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
		vmf_ret = VM_FAULT_NOPAGE;
		goto unlock_entry;
	}

	/*
	 * Note that we don't bother to use iomap_apply here: DAX required
	 * the file system block size to be equal the page size, which means
	 * that we never have to deal with more than a single extent here.
	 */
	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
	if (error) {
		vmf_ret = dax_fault_return(error);
		goto unlock_entry;
	}
	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
		error = -EIO;	/* fs corruption? */
		goto error_finish_iomap;
	}

	sector = dax_iomap_sector(&iomap, pos);

	if (vmf->cow_page) {
		switch (iomap.type) {
		case IOMAP_HOLE:
		case IOMAP_UNWRITTEN:
			clear_user_highpage(vmf->cow_page, vaddr);
			break;
		case IOMAP_MAPPED:
			error = copy_user_dax(iomap.bdev, iomap.dax_dev,
					sector, PAGE_SIZE, vmf->cow_page,
					vaddr);
			break;
		default:
			WARN_ON_ONCE(1);
			error = -EIO;
			break;
		}

		if (error)
			goto error_finish_iomap;

		__SetPageUptodate(vmf->cow_page);
		vmf_ret = finish_fault(vmf);
		if (!vmf_ret)
			vmf_ret = VM_FAULT_DONE_COW;
		goto finish_iomap;
	}

	switch (iomap.type) {
	case IOMAP_MAPPED:
		if (iomap.flags & IOMAP_F_NEW) {
			count_vm_event(PGMAJFAULT);
			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
			major = VM_FAULT_MAJOR;
		}
		error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
				sector, PAGE_SIZE, entry, vmf->vma, vmf);
		/* -EBUSY is fine, somebody else faulted on the same PTE */
		if (error == -EBUSY)
			error = 0;
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
			vmf_ret = dax_load_hole(mapping, entry, vmf);
			goto finish_iomap;
		}
		/*FALLTHRU*/
	default:
		WARN_ON_ONCE(1);
		error = -EIO;
		break;
	}

 error_finish_iomap:
	vmf_ret = dax_fault_return(error) | major;
 finish_iomap:
	if (ops->iomap_end) {
		int copied = PAGE_SIZE;

		if (vmf_ret & VM_FAULT_ERROR)
			copied = 0;
		/*
		 * The fault is done by now and there's no way back (other
		 * thread may be already happily using PTE we have installed).
		 * Just ignore error from ->iomap_end since we cannot do much
		 * with it.
		 */
		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
	}
 unlock_entry:
	put_locked_mapping_entry(mapping, vmf->pgoff);
 out:
	trace_dax_pte_fault_done(inode, vmf, vmf_ret);
	return vmf_ret;
}
#ifdef CONFIG_FS_DAX_PMD
static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
		loff_t pos, void *entry)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	const sector_t sector = dax_iomap_sector(iomap, pos);
	struct dax_device *dax_dev = iomap->dax_dev;
	struct block_device *bdev = iomap->bdev;
	struct inode *inode = mapping->host;
	const size_t size = PMD_SIZE;
	void *ret = NULL, *kaddr;
	long length = 0;
	pgoff_t pgoff;
	pfn_t pfn = {};
	int id;

	if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
		goto fallback;

	id = dax_read_lock();
	length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
			&kaddr, &pfn);
	if (length < 0)
		goto unlock_fallback;
	length = PFN_PHYS(length);

	if (length < size)
		goto unlock_fallback;
	if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
		goto unlock_fallback;
	if (!pfn_t_devmap(pfn))
		goto unlock_fallback;
	dax_read_unlock(id);

	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
			RADIX_DAX_PMD);
	if (IS_ERR(ret))
		goto fallback;

	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
			pfn, vmf->flags & FAULT_FLAG_WRITE);

unlock_fallback:
	dax_read_unlock(id);
fallback:
	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
	return VM_FAULT_FALLBACK;
}
f42003917 mm, dax: change p... |
1213 |
static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, |
91d25ba8a dax: use common 4... |
1214 |
void *entry) |
642261ac9 dax: add struct i... |
1215 |
{ |
f42003917 mm, dax: change p... |
1216 1217 |
struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; |
653b2ea33 dax: add tracepoi... |
1218 |
struct inode *inode = mapping->host; |
642261ac9 dax: add struct i... |
1219 |
struct page *zero_page; |
653b2ea33 dax: add tracepoi... |
1220 |
void *ret = NULL; |
642261ac9 dax: add struct i... |
1221 1222 |
spinlock_t *ptl; pmd_t pmd_entry; |
642261ac9 dax: add struct i... |
1223 |
|
f42003917 mm, dax: change p... |
1224 |
zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); |
642261ac9 dax: add struct i... |
1225 1226 |
if (unlikely(!zero_page)) |
653b2ea33 dax: add tracepoi... |
1227 |
goto fallback; |
642261ac9 dax: add struct i... |
1228 |
|
91d25ba8a dax: use common 4... |
1229 1230 |
ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); |
642261ac9 dax: add struct i... |
1231 |
if (IS_ERR(ret)) |
653b2ea33 dax: add tracepoi... |
1232 |
goto fallback; |
642261ac9 dax: add struct i... |
1233 |
|
	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
	if (!pmd_none(*(vmf->pmd))) {
		spin_unlock(ptl);
		goto fallback;
	}
	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
	pmd_entry = pmd_mkhuge(pmd_entry);
	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
	spin_unlock(ptl);
	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
	return VM_FAULT_NOPAGE;

fallback:
	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
	return VM_FAULT_FALLBACK;
}
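
/*
 * Top-level PMD fault handler: validate that a 2MiB mapping is possible
 * at this address, grab a PMD-sized radix tree entry, ask the filesystem
 * for the backing extent via ->iomap_begin(), and then install either a
 * real PMD mapping or the huge zero page.  Any failure falls back to
 * PTE-sized faults.
 */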
static int dax_iomap_pmd_fault(struct vm_fault *vmf,
		const struct iomap_ops *ops)
{
	struct vm_area_struct *vma = vmf->vma;
	struct address_space *mapping = vma->vm_file->f_mapping;
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
	struct inode *inode = mapping->host;
	int result = VM_FAULT_FALLBACK;
	struct iomap iomap = { 0 };
	pgoff_t max_pgoff, pgoff;
	void *entry;
	loff_t pos;
	int error;

	/*
	 * Check whether the offset is beyond the end of the file now.  The
	 * caller is supposed to hold locks serializing us with truncate /
	 * punch hole, so this is a reliable test.
	 */
	pgoff = linear_page_index(vma, pmd_addr);
	max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);

	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);

	/*
	 * Make sure that the faulting address's PMD offset (colour) matches
	 * the PMD offset from the start of the file.  This is necessary so
	 * that a PMD range in the page table overlaps exactly with a PMD
	 * range in the radix tree.
	 */
	if ((vmf->pgoff & PG_PMD_COLOUR) !=
	    ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
		goto fallback;

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vma->vm_flags & VM_SHARED))
		goto fallback;

	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vma->vm_start)
		goto fallback;
	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
		goto fallback;

	if (pgoff >= max_pgoff) {
		result = VM_FAULT_SIGBUS;
		goto out;
	}

	/* If the PMD would extend beyond the file size */
	if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
		goto fallback;

	/*
	 * grab_mapping_entry() will make sure we get a 2MiB empty entry, a
	 * 2MiB zero page entry or a DAX PMD.  If it can't (because a 4k page
	 * is already in the tree, for instance), it will return -EEXIST and
	 * we just fall back to 4k entries.
	 */
	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
	if (IS_ERR(entry))
		goto fallback;

	/*
	 * It is possible, particularly with mixed reads & writes to private
	 * mappings, that we have raced with a PTE fault that overlaps with
	 * the PMD we need to set up.  If so, just return and the fault will
	 * be retried.
	 */
	if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
			!pmd_devmap(*vmf->pmd)) {
		result = 0;
		goto unlock_entry;
	}

	/*
	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
	 * setting up a mapping, so really we're using iomap_begin() as a way
	 * to look up our filesystem block.
	 */
	pos = (loff_t)pgoff << PAGE_SHIFT;
	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
	if (error)
		goto unlock_entry;

	if (iomap.offset + iomap.length < pos + PMD_SIZE)
		goto finish_iomap;
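
	/*
	 * Dispatch on the extent type: mapped extents get a real PMD
	 * mapping, while holes and unwritten extents are only legitimate
	 * for reads and are served by the huge zero page.
	 */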
	switch (iomap.type) {
	case IOMAP_MAPPED:
		result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry);
		break;
	case IOMAP_UNWRITTEN:
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(write))
			break;
		result = dax_pmd_load_hole(vmf, &iomap, entry);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}

finish_iomap:
	if (ops->iomap_end) {
		int copied = PMD_SIZE;

		if (result == VM_FAULT_FALLBACK)
			copied = 0;
		/*
		 * The fault is done by now and there's no way back (another
		 * thread may already be happily using the PMD we have
		 * installed).  Just ignore errors from ->iomap_end since we
		 * cannot do much with them.
		 */
		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
				&iomap);
	}
unlock_entry:
	put_locked_mapping_entry(mapping, pgoff);
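
	/*
	 * When falling back, split any huge PMD covering the address so
	 * that the retried fault can be handled with PTEs.
	 */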
fallback:
	if (result == VM_FAULT_FALLBACK) {
		split_huge_pmd(vma, vmf->pmd, vmf->address);
		count_vm_event(THP_FAULT_FALLBACK);
	}
out:
	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
	return result;
}
#else
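/* Without CONFIG_FS_DAX_PMD, PMD faults simply fall back to PTE faults. */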
static int dax_iomap_pmd_fault(struct vm_fault *vmf,
		const struct iomap_ops *ops)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf: The description of the fault
 * @pe_size: Size of the page to fault in
 * @ops: iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in
 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
 * has done all the necessary locking for the page fault to proceed
 * successfully.
 */
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
		    const struct iomap_ops *ops)
{
	switch (pe_size) {
	case PE_SIZE_PTE:
		return dax_iomap_pte_fault(vmf, ops);
	case PE_SIZE_PMD:
		return dax_iomap_pmd_fault(vmf, ops);
	default:
		return VM_FAULT_FALLBACK;
	}
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
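
/*
 * Illustrative sketch (not part of this file): a filesystem typically
 * wires dax_iomap_fault() into its ->fault and ->huge_fault handlers
 * roughly as below.  MYFS_I(), myfs_iomap_ops and the mmap_sem field
 * are hypothetical names; the lock stands in for whatever the
 * filesystem uses to serialize page faults against truncate and hole
 * punching, as the kernel-doc above requires.
 *
 *	static int myfs_dax_huge_fault(struct vm_fault *vmf,
 *			enum page_entry_size pe_size)
 *	{
 *		struct inode *inode = file_inode(vmf->vma->vm_file);
 *		int ret;
 *
 *		down_read(&MYFS_I(inode)->mmap_sem);
 *		ret = dax_iomap_fault(vmf, pe_size, &myfs_iomap_ops);
 *		up_read(&MYFS_I(inode)->mmap_sem);
 *
 *		return ret;
 *	}
 *
 *	static int myfs_dax_fault(struct vm_fault *vmf)
 *	{
 *		return myfs_dax_huge_fault(vmf, PE_SIZE_PTE);
 *	}
 */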