Commit bde05d1ccd512696b09db9dd2e5f33ad19152605
Committed by: Linus Torvalds
1 parent: 5ceb9ce6fe
Exists in: master and 20 other branches
shmem: replace page if mapping excludes its zone
The GMA500 GPU driver uses GEM shmem objects, but with a new twist: the backing RAM has to be below 4GB. Not a problem while the boards supported only 4GB: but now Intel's D2700MUD boards support 8GB, and their GMA3600 is managed by the GMA500 driver.

shmem/tmpfs has never pretended to support hardware restrictions on the backing memory, but it might have appeared to do so before v3.1, and even now it works fine until a page is swapped out then back in. When read_cache_page_gfp() supplied a freshly allocated page for copy, that compensated for whatever choice might have been made by earlier swapin readahead; but swapoff was likely to destroy the illusion.

We'd like to continue to support GMA500, so now add a new shmem_should_replace_page() check on the zone when about to move a page from swapcache to filecache (in the swapin and swapoff cases), with shmem_replace_page() to allocate and substitute a suitable page (given gma500/gem.c's mapping_set_gfp_mask of GFP_KERNEL | __GFP_DMA32).

This does involve a minor extension to mem_cgroup_replace_page_cache() (the page may or may not have already been charged); and I've removed a comment and call to mem_cgroup_uncharge_cache_page(), which in fact is always a no-op while PageSwapCache.

Also removed the optimization of an unlikely path in shmem_getpage_gfp(), now that we need to check PageSwapCache more carefully (a racing caller might already have made the copy). And at one point shmem_unuse_inode() needs to use the hitherto-private page_swapcount(), to guard against racing with inode eviction.

It would make sense to extend shmem_should_replace_page() to cover cpuset and NUMA mempolicy restrictions too, but set that aside for now: it needs a cleanup of shmem mempolicy handling, and more testing, and ought to handle swap faults in do_swap_page() as well as shmem.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Christoph Hellwig <hch@infradead.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Stephane Marchesin <marcheu@chromium.org>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Dave Airlie <airlied@gmail.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Rob Clark <rob.clark@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
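For orientation, here is a minimal sketch of the driver-side contract this patch serves. The mapping_set_gfp_mask() call and the GFP_KERNEL | __GFP_DMA32 mask are exactly what the message attributes to gma500/gem.c; the wrapper function and its name are illustrative only, not gma500's actual code:

    #include <linux/fs.h>
    #include <linux/pagemap.h>      /* mapping_set_gfp_mask() */

    /*
     * Illustrative only: tell shmem that every page backing this
     * object must come from the low 4GB.
     */
    static void constrain_gem_backing(struct address_space *mapping)
    {
            /*
             * Allocation for this mapping is now limited to ZONE_DMA32
             * and below; after this patch, a page swapped in to a higher
             * zone is copied down before it re-enters the filecache.
             */
            mapping_set_gfp_mask(mapping, GFP_KERNEL | __GFP_DMA32);
    }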
Showing 4 changed files with 142 additions and 24 deletions
include/linux/swap.h
@@ -351,6 +351,7 @@
 extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct page *, struct block_device **);
 extern sector_t swapdev_block(int, pgoff_t);
+extern int page_swapcount(struct page *);
 extern int reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
@@ -443,6 +444,11 @@
 
 static inline void delete_from_swap_cache(struct page *page)
 {
+}
+
+static inline int page_swapcount(struct page *page)
+{
+        return 0;
 }
 
 #define reuse_swap_page(page)   (page_mapcount(page) == 1)
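The second hunk lands among swap.h's !CONFIG_SWAP stubs (note the empty delete_from_swap_cache() and the reuse_swap_page() macro around it): with swap compiled out a page can have no swap count, so the new stub reports 0, letting the new page_swapcount() caller in shmem build either way.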
mm/memcontrol.c
@@ -3373,7 +3373,7 @@
 void mem_cgroup_replace_page_cache(struct page *oldpage,
                                    struct page *newpage)
 {
-        struct mem_cgroup *memcg;
+        struct mem_cgroup *memcg = NULL;
         struct page_cgroup *pc;
         enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
 
@@ -3383,10 +3383,19 @@
         pc = lookup_page_cgroup(oldpage);
         /* fix accounting on old pages */
         lock_page_cgroup(pc);
-        memcg = pc->mem_cgroup;
-        mem_cgroup_charge_statistics(memcg, false, -1);
-        ClearPageCgroupUsed(pc);
+        if (PageCgroupUsed(pc)) {
+                memcg = pc->mem_cgroup;
+                mem_cgroup_charge_statistics(memcg, false, -1);
+                ClearPageCgroupUsed(pc);
+        }
         unlock_page_cgroup(pc);
+
+        /*
+         * When called from shmem_replace_page(), in some cases the
+         * oldpage has already been charged, and in some cases not.
+         */
+        if (!memcg)
+                return;
 
         if (PageSwapBacked(oldpage))
                 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
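The guarded statistics transfer and the early return correspond to the two callers described in the new comment: on the swapoff path (shmem_unuse()) the oldpage was charged before the swaplist walk began, while on the swapin path of shmem_getpage_gfp() mem_cgroup_cache_charge() only happens after the replacement, so oldpage may not yet be charged and there is nothing to move.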
mm/shmem.c
@@ -103,6 +103,9 @@
 }
 #endif
 
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+                                struct shmem_inode_info *info, pgoff_t index);
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
         struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
 
@@ -604,12 +607,13 @@
  * If swap found in inode, free it and move page from swapcache to filecache.
  */
 static int shmem_unuse_inode(struct shmem_inode_info *info,
-                             swp_entry_t swap, struct page *page)
+                             swp_entry_t swap, struct page **pagep)
 {
         struct address_space *mapping = info->vfs_inode.i_mapping;
         void *radswap;
         pgoff_t index;
-        int error;
+        gfp_t gfp;
+        int error = 0;
 
         radswap = swp_to_radix_entry(swap);
         index = radix_tree_locate_item(&mapping->page_tree, radswap);
 
@@ -625,22 +629,37 @@
         if (shmem_swaplist.next != &info->swaplist)
                 list_move_tail(&shmem_swaplist, &info->swaplist);
 
+        gfp = mapping_gfp_mask(mapping);
+        if (shmem_should_replace_page(*pagep, gfp)) {
+                mutex_unlock(&shmem_swaplist_mutex);
+                error = shmem_replace_page(pagep, gfp, info, index);
+                mutex_lock(&shmem_swaplist_mutex);
+                /*
+                 * We needed to drop mutex to make that restrictive page
+                 * allocation; but the inode might already be freed by now,
+                 * and we cannot refer to inode or mapping or info to check.
+                 * However, we do hold page lock on the PageSwapCache page,
+                 * so can check if that still has our reference remaining.
+                 */
+                if (!page_swapcount(*pagep))
+                        error = -ENOENT;
+        }
+
         /*
          * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
          * but also to hold up shmem_evict_inode(): so inode cannot be freed
          * beneath us (pagelock doesn't help until the page is in pagecache).
          */
-        error = shmem_add_to_page_cache(page, mapping, index,
+        if (!error)
+                error = shmem_add_to_page_cache(*pagep, mapping, index,
                                                 GFP_NOWAIT, radswap);
-        /* which does mem_cgroup_uncharge_cache_page on error */
-
         if (error != -ENOMEM) {
                 /*
                  * Truncation and eviction use free_swap_and_cache(), which
                  * only does trylock page: if we raced, best clean up here.
                  */
-                delete_from_swap_cache(page);
-                set_page_dirty(page);
+                delete_from_swap_cache(*pagep);
+                set_page_dirty(*pagep);
                 if (!error) {
                         spin_lock(&info->lock);
                         info->swapped--;
 
@@ -660,9 +679,16 @@
         struct list_head *this, *next;
         struct shmem_inode_info *info;
         int found = 0;
-        int error;
+        int error = 0;
 
         /*
+         * There's a faint possibility that swap page was replaced before
+         * caller locked it: it will come back later with the right page.
+         */
+        if (unlikely(!PageSwapCache(page)))
+                goto out;
+
+        /*
          * Charge page using GFP_KERNEL while we can wait, before taking
          * the shmem_swaplist_mutex which might hold up shmem_writepage().
          * Charged back to the user (not to caller) when swap account is used.
@@ -676,7 +702,7 @@
         list_for_each_safe(this, next, &shmem_swaplist) {
                 info = list_entry(this, struct shmem_inode_info, swaplist);
                 if (info->swapped)
-                        found = shmem_unuse_inode(info, swap, page);
+                        found = shmem_unuse_inode(info, swap, &page);
                 else
                         list_del_init(&info->swaplist);
                 cond_resched();
@@ -685,8 +711,6 @@
         }
         mutex_unlock(&shmem_swaplist_mutex);
 
-        if (!found)
-                mem_cgroup_uncharge_cache_page(page);
         if (found < 0)
                 error = found;
 out:
@@ -856,6 +880,84 @@
 #endif
 
 /*
+ * When a page is moved from swapcache to shmem filecache (either by the
+ * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
+ * shmem_unuse_inode()), it may have been read in earlier from swap, in
+ * ignorance of the mapping it belongs to.  If that mapping has special
+ * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
+ * we may need to copy to a suitable page before moving to filecache.
+ *
+ * In a future release, this may well be extended to respect cpuset and
+ * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
+ * but for now it is a simple matter of zone.
+ */
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
+{
+        return page_zonenum(page) > gfp_zone(gfp);
+}
+
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+                                struct shmem_inode_info *info, pgoff_t index)
+{
+        struct page *oldpage, *newpage;
+        struct address_space *swap_mapping;
+        pgoff_t swap_index;
+        int error;
+
+        oldpage = *pagep;
+        swap_index = page_private(oldpage);
+        swap_mapping = page_mapping(oldpage);
+
+        /*
+         * We have arrived here because our zones are constrained, so don't
+         * limit chance of success by further cpuset and node constraints.
+         */
+        gfp &= ~GFP_CONSTRAINT_MASK;
+        newpage = shmem_alloc_page(gfp, info, index);
+        if (!newpage)
+                return -ENOMEM;
+        VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
+
+        *pagep = newpage;
+        page_cache_get(newpage);
+        copy_highpage(newpage, oldpage);
+
+        VM_BUG_ON(!PageLocked(oldpage));
+        __set_page_locked(newpage);
+        VM_BUG_ON(!PageUptodate(oldpage));
+        SetPageUptodate(newpage);
+        VM_BUG_ON(!PageSwapBacked(oldpage));
+        SetPageSwapBacked(newpage);
+        VM_BUG_ON(!swap_index);
+        set_page_private(newpage, swap_index);
+        VM_BUG_ON(!PageSwapCache(oldpage));
+        SetPageSwapCache(newpage);
+
+        /*
+         * Our caller will very soon move newpage out of swapcache, but it's
+         * a nice clean interface for us to replace oldpage by newpage there.
+         */
+        spin_lock_irq(&swap_mapping->tree_lock);
+        error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
+                                                                newpage);
+        __inc_zone_page_state(newpage, NR_FILE_PAGES);
+        __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+        spin_unlock_irq(&swap_mapping->tree_lock);
+        BUG_ON(error);
+
+        mem_cgroup_replace_page_cache(oldpage, newpage);
+        lru_cache_add_anon(newpage);
+
+        ClearPageSwapCache(oldpage);
+        set_page_private(oldpage, 0);
+
+        unlock_page(oldpage);
+        page_cache_release(oldpage);
+        page_cache_release(oldpage);
+        return 0;
+}
+
+/*
  * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
  *
  * If we allocate a new one we do not mark it dirty.  That's up to the
 
@@ -923,19 +1025,20 @@
 
         /* We have to do this with page locked to prevent races */
         lock_page(page);
+        if (!PageSwapCache(page) || page->mapping) {
+                error = -EEXIST;        /* try again */
+                goto failed;
+        }
         if (!PageUptodate(page)) {
                 error = -EIO;
                 goto failed;
         }
         wait_on_page_writeback(page);
 
-        /* Someone may have already done it for us */
-        if (page->mapping) {
-                if (page->mapping == mapping &&
-                    page->index == index)
-                        goto done;
-                error = -EEXIST;
-                goto failed;
+        if (shmem_should_replace_page(page, gfp)) {
+                error = shmem_replace_page(&page, gfp, info, index);
+                if (error)
+                        goto failed;
         }
 
         error = mem_cgroup_cache_charge(page, current->mm,
@@ -998,7 +1101,7 @@
                 if (sgp == SGP_DIRTY)
                         set_page_dirty(page);
         }
-done:
+
         /* Perhaps the file has been truncated since we checked */
         if (sgp != SGP_WRITE &&
             ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
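To make the zone test above concrete, a worked reading follows; it assumes the usual x86_64 zone numbering (ZONE_DMA=0, ZONE_DMA32=1, ZONE_NORMAL=2), which is configuration-dependent:

    /*
     * Worked example of shmem_should_replace_page(), under the assumed
     * x86_64 zone layout:
     *
     *   gfp = GFP_KERNEL | __GFP_DMA32;  // gma500's mapping mask
     *   gfp_zone(gfp) == ZONE_DMA32 (1)  // highest zone the mapping allows
     *
     * Swapin readahead, knowing nothing of the mapping, may have placed
     * the page anywhere, say page_zonenum(page) == ZONE_NORMAL (2).
     * Then 2 > 1: the page must be replaced, and shmem_replace_page()
     * copies its contents into a freshly allocated ZONE_DMA32 page.
     */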
mm/swapfile.c
@@ -601,7 +601,7 @@
  * This does not give an exact answer when swap count is continued,
  * but does include the high COUNT_CONTINUED flag to allow for that.
  */
-static inline int page_swapcount(struct page *page)
+int page_swapcount(struct page *page)
 {
         int count = 0;
         struct swap_info_struct *p;
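Dropping static here (with the matching declaration added to include/linux/swap.h above) is the whole export: page_swapcount() is how shmem_unuse_inode() checks, after re-taking shmem_swaplist_mutex, whether its locked swapcache page still holds the swap reference, since the inode and mapping may have been freed while the mutex was dropped.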