Commit bde05d1ccd512696b09db9dd2e5f33ad19152605

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent 5ceb9ce6fe

shmem: replace page if mapping excludes its zone

The GMA500 GPU driver uses GEM shmem objects, but with a new twist: the
backing RAM has to be below 4GB.  Not a problem while the boards
supported only 4GB: but now Intel's D2700MUD boards support 8GB, and
their GMA3600 is managed by the GMA500 driver.

shmem/tmpfs has never pretended to support hardware restrictions on the
backing memory, but it might have appeared to do so before v3.1, and
even now it works fine until a page is swapped out then back in.  When
read_cache_page_gfp() supplied a freshly allocated page for copy, that
compensated for whatever choice might have been made by earlier swapin
readahead; but swapoff was likely to destroy the illusion.

We'd like to continue to support GMA500, so now add a new
shmem_should_replace_page() check on the zone when about to move a page
from swapcache to filecache (in swapin and swapoff cases), with
shmem_replace_page() to allocate and substitute a suitable page (given
gma500/gem.c's mapping_set_gfp_mask GFP_KERNEL | __GFP_DMA32).
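
For context, a minimal sketch (not part of this commit) of the two sides of that
arrangement: how a driver in the gma500 mould constrains its shmem backing store,
and what the new zone test amounts to.  The example_* names here are hypothetical;
mapping_set_gfp_mask(), mapping_gfp_mask(), page_zonenum() and gfp_zone() are the
existing kernel helpers involved.

    #include <linux/fs.h>
    #include <linux/gfp.h>
    #include <linux/mm.h>
    #include <linux/pagemap.h>

    /*
     * Hypothetical driver-side setup, modelled on gma500/gem.c: every page
     * allocated for this shmem mapping must come from ZONE_DMA32 or below,
     * so that the GPU can address the backing RAM.
     */
    static void example_limit_backing_store(struct inode *inode)
    {
            mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL | __GFP_DMA32);
    }

    /*
     * The check added below reduces to a zone comparison: a page brought in
     * by swapin readahead may sit in a higher zone than the mapping's gfp
     * mask allows, and then shmem_replace_page() must copy it into a page
     * allocated from an acceptable zone before it moves to filecache.
     */
    static bool example_should_replace(struct page *page,
                                       struct address_space *mapping)
    {
            return page_zonenum(page) > gfp_zone(mapping_gfp_mask(mapping));
    }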

This does involve a minor extension to mem_cgroup_replace_page_cache()
(the page may or may not have already been charged); and I've removed a
comment and call to mem_cgroup_uncharge_cache_page(), which in fact is
always a no-op while PageSwapCache.

Also removed optimization of an unlikely path in shmem_getpage_gfp(),
now that we need to check PageSwapCache more carefully (a racing caller
might already have made the copy).  And at one point shmem_unuse_inode()
needs to use the hitherto private page_swapcount(), to guard against
racing with inode eviction.

It would make sense to extend shmem_should_replace_page(), to cover
cpuset and NUMA mempolicy restrictions too, but set that aside for now:
needs a cleanup of shmem mempolicy handling, and more testing, and ought
to handle swap faults in do_swap_page() as well as shmem.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Stephane Marchesin <marcheu@chromium.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Dave Airlie <airlied@gmail.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Rob Clark <rob.clark@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 4 changed files with 142 additions and 24 deletions

include/linux/swap.h
... ... @@ -351,6 +351,7 @@
351 351 extern unsigned int count_swap_pages(int, int);
352 352 extern sector_t map_swap_page(struct page *, struct block_device **);
353 353 extern sector_t swapdev_block(int, pgoff_t);
  354 +extern int page_swapcount(struct page *);
354 355 extern int reuse_swap_page(struct page *);
355 356 extern int try_to_free_swap(struct page *);
356 357 struct backing_dev_info;
... ... @@ -443,6 +444,11 @@
443 444  
444 445 static inline void delete_from_swap_cache(struct page *page)
445 446 {
  447 +}
  448 +
  449 +static inline int page_swapcount(struct page *page)
  450 +{
  451 + return 0;
446 452 }
447 453  
448 454 #define reuse_swap_page(page) (page_mapcount(page) == 1)
mm/memcontrol.c
... ... @@ -3373,7 +3373,7 @@
3373 3373 void mem_cgroup_replace_page_cache(struct page *oldpage,
3374 3374 struct page *newpage)
3375 3375 {
3376   - struct mem_cgroup *memcg;
  3376 + struct mem_cgroup *memcg = NULL;
3377 3377 struct page_cgroup *pc;
3378 3378 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3379 3379  
3380 3380  
... ... @@ -3383,10 +3383,19 @@
3383 3383 pc = lookup_page_cgroup(oldpage);
3384 3384 /* fix accounting on old pages */
3385 3385 lock_page_cgroup(pc);
3386   - memcg = pc->mem_cgroup;
3387   - mem_cgroup_charge_statistics(memcg, false, -1);
3388   - ClearPageCgroupUsed(pc);
  3386 + if (PageCgroupUsed(pc)) {
  3387 + memcg = pc->mem_cgroup;
  3388 + mem_cgroup_charge_statistics(memcg, false, -1);
  3389 + ClearPageCgroupUsed(pc);
  3390 + }
3389 3391 unlock_page_cgroup(pc);
  3392 +
  3393 + /*
  3394 + * When called from shmem_replace_page(), in some cases the
  3395 + * oldpage has already been charged, and in some cases not.
  3396 + */
  3397 + if (!memcg)
  3398 + return;
3390 3399  
3391 3400 if (PageSwapBacked(oldpage))
3392 3401 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
mm/shmem.c
... ... @@ -103,6 +103,9 @@
103 103 }
104 104 #endif
105 105  
  106 +static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
  107 +static int shmem_replace_page(struct page **pagep, gfp_t gfp,
  108 + struct shmem_inode_info *info, pgoff_t index);
106 109 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
107 110 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
108 111  
109 112  
... ... @@ -604,12 +607,13 @@
604 607 * If swap found in inode, free it and move page from swapcache to filecache.
605 608 */
606 609 static int shmem_unuse_inode(struct shmem_inode_info *info,
607   - swp_entry_t swap, struct page *page)
  610 + swp_entry_t swap, struct page **pagep)
608 611 {
609 612 struct address_space *mapping = info->vfs_inode.i_mapping;
610 613 void *radswap;
611 614 pgoff_t index;
612   - int error;
  615 + gfp_t gfp;
  616 + int error = 0;
613 617  
614 618 radswap = swp_to_radix_entry(swap);
615 619 index = radix_tree_locate_item(&mapping->page_tree, radswap);
616 620  
617 621  
618 622  
... ... @@ -625,22 +629,37 @@
625 629 if (shmem_swaplist.next != &info->swaplist)
626 630 list_move_tail(&shmem_swaplist, &info->swaplist);
627 631  
  632 + gfp = mapping_gfp_mask(mapping);
  633 + if (shmem_should_replace_page(*pagep, gfp)) {
  634 + mutex_unlock(&shmem_swaplist_mutex);
  635 + error = shmem_replace_page(pagep, gfp, info, index);
  636 + mutex_lock(&shmem_swaplist_mutex);
  637 + /*
  638 + * We needed to drop mutex to make that restrictive page
  639 + * allocation; but the inode might already be freed by now,
  640 + * and we cannot refer to inode or mapping or info to check.
  641 + * However, we do hold page lock on the PageSwapCache page,
  642 + * so can check if that still has our reference remaining.
  643 + */
  644 + if (!page_swapcount(*pagep))
  645 + error = -ENOENT;
  646 + }
  647 +
628 648 /*
629 649 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
630 650 * but also to hold up shmem_evict_inode(): so inode cannot be freed
631 651 * beneath us (pagelock doesn't help until the page is in pagecache).
632 652 */
633   - error = shmem_add_to_page_cache(page, mapping, index,
  653 + if (!error)
  654 + error = shmem_add_to_page_cache(*pagep, mapping, index,
634 655 GFP_NOWAIT, radswap);
635   - /* which does mem_cgroup_uncharge_cache_page on error */
636   -
637 656 if (error != -ENOMEM) {
638 657 /*
639 658 * Truncation and eviction use free_swap_and_cache(), which
640 659 * only does trylock page: if we raced, best clean up here.
641 660 */
642   - delete_from_swap_cache(page);
643   - set_page_dirty(page);
  661 + delete_from_swap_cache(*pagep);
  662 + set_page_dirty(*pagep);
644 663 if (!error) {
645 664 spin_lock(&info->lock);
646 665 info->swapped--;
647 666  
... ... @@ -660,9 +679,16 @@
660 679 struct list_head *this, *next;
661 680 struct shmem_inode_info *info;
662 681 int found = 0;
663   - int error;
  682 + int error = 0;
664 683  
665 684 /*
  685 + * There's a faint possibility that swap page was replaced before
  686 + * caller locked it: it will come back later with the right page.
  687 + */
  688 + if (unlikely(!PageSwapCache(page)))
  689 + goto out;
  690 +
  691 + /*
666 692 * Charge page using GFP_KERNEL while we can wait, before taking
667 693 * the shmem_swaplist_mutex which might hold up shmem_writepage().
668 694 * Charged back to the user (not to caller) when swap account is used.
... ... @@ -676,7 +702,7 @@
676 702 list_for_each_safe(this, next, &shmem_swaplist) {
677 703 info = list_entry(this, struct shmem_inode_info, swaplist);
678 704 if (info->swapped)
679   - found = shmem_unuse_inode(info, swap, page);
  705 + found = shmem_unuse_inode(info, swap, &page);
680 706 else
681 707 list_del_init(&info->swaplist);
682 708 cond_resched();
... ... @@ -685,8 +711,6 @@
685 711 }
686 712 mutex_unlock(&shmem_swaplist_mutex);
687 713  
688   - if (!found)
689   - mem_cgroup_uncharge_cache_page(page);
690 714 if (found < 0)
691 715 error = found;
692 716 out:
... ... @@ -856,6 +880,84 @@
856 880 #endif
857 881  
858 882 /*
  883 + * When a page is moved from swapcache to shmem filecache (either by the
  884 + * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
  885 + * shmem_unuse_inode()), it may have been read in earlier from swap, in
  886 + * ignorance of the mapping it belongs to. If that mapping has special
  887 + * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
  888 + * we may need to copy to a suitable page before moving to filecache.
  889 + *
  890 + * In a future release, this may well be extended to respect cpuset and
  891 + * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
  892 + * but for now it is a simple matter of zone.
  893 + */
  894 +static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
  895 +{
  896 + return page_zonenum(page) > gfp_zone(gfp);
  897 +}
  898 +
  899 +static int shmem_replace_page(struct page **pagep, gfp_t gfp,
  900 + struct shmem_inode_info *info, pgoff_t index)
  901 +{
  902 + struct page *oldpage, *newpage;
  903 + struct address_space *swap_mapping;
  904 + pgoff_t swap_index;
  905 + int error;
  906 +
  907 + oldpage = *pagep;
  908 + swap_index = page_private(oldpage);
  909 + swap_mapping = page_mapping(oldpage);
  910 +
  911 + /*
  912 + * We have arrived here because our zones are constrained, so don't
  913 + * limit chance of success by further cpuset and node constraints.
  914 + */
  915 + gfp &= ~GFP_CONSTRAINT_MASK;
  916 + newpage = shmem_alloc_page(gfp, info, index);
  917 + if (!newpage)
  918 + return -ENOMEM;
  919 + VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
  920 +
  921 + *pagep = newpage;
  922 + page_cache_get(newpage);
  923 + copy_highpage(newpage, oldpage);
  924 +
  925 + VM_BUG_ON(!PageLocked(oldpage));
  926 + __set_page_locked(newpage);
  927 + VM_BUG_ON(!PageUptodate(oldpage));
  928 + SetPageUptodate(newpage);
  929 + VM_BUG_ON(!PageSwapBacked(oldpage));
  930 + SetPageSwapBacked(newpage);
  931 + VM_BUG_ON(!swap_index);
  932 + set_page_private(newpage, swap_index);
  933 + VM_BUG_ON(!PageSwapCache(oldpage));
  934 + SetPageSwapCache(newpage);
  935 +
  936 + /*
  937 + * Our caller will very soon move newpage out of swapcache, but it's
  938 + * a nice clean interface for us to replace oldpage by newpage there.
  939 + */
  940 + spin_lock_irq(&swap_mapping->tree_lock);
  941 + error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
  942 + newpage);
  943 + __inc_zone_page_state(newpage, NR_FILE_PAGES);
  944 + __dec_zone_page_state(oldpage, NR_FILE_PAGES);
  945 + spin_unlock_irq(&swap_mapping->tree_lock);
  946 + BUG_ON(error);
  947 +
  948 + mem_cgroup_replace_page_cache(oldpage, newpage);
  949 + lru_cache_add_anon(newpage);
  950 +
  951 + ClearPageSwapCache(oldpage);
  952 + set_page_private(oldpage, 0);
  953 +
  954 + unlock_page(oldpage);
  955 + page_cache_release(oldpage);
  956 + page_cache_release(oldpage);
  957 + return 0;
  958 +}
  959 +
  960 +/*
859 961 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
860 962 *
861 963 * If we allocate a new one we do not mark it dirty. That's up to the
862 964  
... ... @@ -923,19 +1025,20 @@
923 1025  
924 1026 /* We have to do this with page locked to prevent races */
925 1027 lock_page(page);
  1028 + if (!PageSwapCache(page) || page->mapping) {
  1029 + error = -EEXIST; /* try again */
  1030 + goto failed;
  1031 + }
926 1032 if (!PageUptodate(page)) {
927 1033 error = -EIO;
928 1034 goto failed;
929 1035 }
930 1036 wait_on_page_writeback(page);
931 1037  
932   - /* Someone may have already done it for us */
933   - if (page->mapping) {
934   - if (page->mapping == mapping &&
935   - page->index == index)
936   - goto done;
937   - error = -EEXIST;
938   - goto failed;
  1038 + if (shmem_should_replace_page(page, gfp)) {
  1039 + error = shmem_replace_page(&page, gfp, info, index);
  1040 + if (error)
  1041 + goto failed;
939 1042 }
940 1043  
941 1044 error = mem_cgroup_cache_charge(page, current->mm,
... ... @@ -998,7 +1101,7 @@
998 1101 if (sgp == SGP_DIRTY)
999 1102 set_page_dirty(page);
1000 1103 }
1001   -done:
  1104 +
1002 1105 /* Perhaps the file has been truncated since we checked */
1003 1106 if (sgp != SGP_WRITE &&
1004 1107 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
mm/swapfile.c
... ... @@ -601,7 +601,7 @@
601 601 * This does not give an exact answer when swap count is continued,
602 602 * but does include the high COUNT_CONTINUED flag to allow for that.
603 603 */
604   -static inline int page_swapcount(struct page *page)
  604 +int page_swapcount(struct page *page)
605 605 {
606 606 int count = 0;
607 607 struct swap_info_struct *p;