Commit 1998cc048901109a29924380b8e91bc049b32951

Authored by Shaohua Li
Committed by Linus Torvalds
1 parent a394cb8ee6

mm: make madvise(MADV_WILLNEED) support swap file prefetch

Make madvise(MADV_WILLNEED) support swap file prefetch.  If memory has
been swapped out, this syscall schedules asynchronous swap-in readahead
for it.  It has no effect on memory that is not swapped out.
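
For context, the hint is issued from userspace via the madvise(2)
syscall.  A minimal sketch of a caller, assuming an anonymous mapping
whose pages have since been pushed out to swap (the mapping size and
the scenario are illustrative, not part of this patch):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 64UL << 20;        /* 64MB, illustrative size */
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        memset(buf, 1, len);    /* populate; memory pressure may later swap this out */

        /*
         * Hint that the range will be needed soon.  With this patch the
         * kernel starts swap-in readahead for any swapped-out pages and
         * returns without waiting for the I/O; before it, the hint was a
         * no-op for anonymous and shmem memory.
         */
        if (madvise(buf, len, MADV_WILLNEED))
                perror("madvise");

        return 0;
}

Subsequent accesses to buf then hit the swap cache rather than taking
synchronous swap-in faults.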

[akpm@linux-foundation.org: fix CONFIG_SWAP=n build]
[sasha.levin@oracle.com: fix BUG on madvise early failure]
Signed-off-by: Shaohua Li <shli@fusionio.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 101 additions and 4 deletions

diff --git a/mm/madvise.c b/mm/madvise.c
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -16,6 +16,9 @@
 #include <linux/ksm.h>
 #include <linux/fs.h>
 #include <linux/file.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -131,6 +134,84 @@
         return error;
 }
 
+#ifdef CONFIG_SWAP
+static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
+        unsigned long end, struct mm_walk *walk)
+{
+        pte_t *orig_pte;
+        struct vm_area_struct *vma = walk->private;
+        unsigned long index;
+
+        if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+                return 0;
+
+        for (index = start; index != end; index += PAGE_SIZE) {
+                pte_t pte;
+                swp_entry_t entry;
+                struct page *page;
+                spinlock_t *ptl;
+
+                orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
+                pte = *(orig_pte + ((index - start) / PAGE_SIZE));
+                pte_unmap_unlock(orig_pte, ptl);
+
+                if (pte_present(pte) || pte_none(pte) || pte_file(pte))
+                        continue;
+                entry = pte_to_swp_entry(pte);
+                if (unlikely(non_swap_entry(entry)))
+                        continue;
+
+                page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
+                                             vma, index);
+                if (page)
+                        page_cache_release(page);
+        }
+
+        return 0;
+}
+
+static void force_swapin_readahead(struct vm_area_struct *vma,
+                unsigned long start, unsigned long end)
+{
+        struct mm_walk walk = {
+                .mm = vma->vm_mm,
+                .pmd_entry = swapin_walk_pmd_entry,
+                .private = vma,
+        };
+
+        walk_page_range(start, end, &walk);
+
+        lru_add_drain();        /* Push any new pages onto the LRU now */
+}
+
+static void force_shm_swapin_readahead(struct vm_area_struct *vma,
+                unsigned long start, unsigned long end,
+                struct address_space *mapping)
+{
+        pgoff_t index;
+        struct page *page;
+        swp_entry_t swap;
+
+        for (; start < end; start += PAGE_SIZE) {
+                index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+
+                page = find_get_page(mapping, index);
+                if (!radix_tree_exceptional_entry(page)) {
+                        if (page)
+                                page_cache_release(page);
+                        continue;
+                }
+                swap = radix_to_swp_entry(page);
+                page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
+                                             NULL, 0);
+                if (page)
+                        page_cache_release(page);
+        }
+
+        lru_add_drain();        /* Push any new pages onto the LRU now */
+}
+#endif /* CONFIG_SWAP */
+
 /*
  * Schedule all required I/O operations. Do not wait for completion.
  */
@@ -140,6 +221,18 @@
 {
         struct file *file = vma->vm_file;
 
+#ifdef CONFIG_SWAP
+        if (!file || mapping_cap_swap_backed(file->f_mapping)) {
+                *prev = vma;
+                if (!file)
+                        force_swapin_readahead(vma, start, end);
+                else
+                        force_shm_swapin_readahead(vma, start, end,
+                                        file->f_mapping);
+                return 0;
+        }
+#endif
+
         if (!file)
                 return -EBADF;
 
@@ -371,6 +464,7 @@
         int error = -EINVAL;
         int write;
         size_t len;
+        struct blk_plug plug;
 
 #ifdef CONFIG_MEMORY_FAILURE
         if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -410,18 +504,19 @@
         if (vma && start > vma->vm_start)
                 prev = vma;
 
+        blk_start_plug(&plug);
         for (;;) {
                 /* Still start < end. */
                 error = -ENOMEM;
                 if (!vma)
-                        goto out;
+                        goto out_plug;
 
                 /* Here start < (end|vma->vm_end). */
                 if (start < vma->vm_start) {
                         unmapped_error = -ENOMEM;
                         start = vma->vm_start;
                         if (start >= end)
-                                goto out;
+                                goto out_plug;
                 }
 
                 /* Here vma->vm_start <= start < (end|vma->vm_end) */
@@ -432,18 +527,20 @@
                 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
                 error = madvise_vma(vma, &prev, start, tmp, behavior);
                 if (error)
-                        goto out;
+                        goto out_plug;
                 start = tmp;
                 if (prev && start < prev->vm_end)
                         start = prev->vm_end;
                 error = unmapped_error;
                 if (start >= end)
-                        goto out;
+                        goto out_plug;
                 if (prev)
                         vma = prev->vm_next;
                 else    /* madvise_remove dropped mmap_sem */
                         vma = find_vma(current->mm, start);
         }
+out_plug:
+        blk_finish_plug(&plug);
 out:
         if (write)
                 up_write(&current->mm->mmap_sem);