Commit 479db0bf408e65baa14d2a9821abfcbc0804b847
Committed by Linus Torvalds
1 parent: 2d70b68d42
Exists in master and in 7 other branches
mm: dirty page tracking race fix
There is a race with dirty page accounting where a page may not properly be accounted for.

clear_page_dirty_for_io() calls page_mkclean; then TestClearPageDirty.

page_mkclean walks the rmaps for that page, and for each one it cleans and write protects the pte if it was dirty. It uses page_check_address to find the pte. That function has a shortcut to avoid the ptl if the pte is not present. Unfortunately, the pte can be switched to not-present then back to present by other code while holding the page table lock -- this should not be a signal for page_mkclean to ignore that pte, because it may be dirty.

For example, powerpc64's set_pte_at will clear a previously present pte before setting it to the desired value. There may also be other code in core mm or in arch which does similar things.

The consequence of the bug is loss of data integrity due to msync, and loss of dirty page accounting accuracy. XIP's __xip_unmap could easily also be unreliable (depending on the exact XIP locking scheme), which can lead to data corruption.

Fix this by having an option to always take the ptl to check the pte in page_check_address. The optimization is retained for page_referenced and try_to_unmap.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Jared Hulbert <jaredeh@gmail.com>
Cc: Carsten Otte <cotte@freenet.de>
Cc: Hugh Dickins <hugh@veritas.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
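To make the race concrete, here is an illustrative interleaving reconstructed from the description above (a sketch, not part of the patch; the powerpc64 behaviour is as characterized in the message):

    page_mkclean() via page_check_address()     another CPU, holding the pte lock
                                                (e.g. powerpc64 set_pte_at)
    ---------------------------------------     ---------------------------------
    pte = pte_offset_map(pmd, address);
                                                pte transiently cleared
                                                (now !present)
    if (!pte_present(*pte))
            return NULL;
        ^ bails out without ever taking the
          ptl; a dirty bit on that pte is
          never cleaned or write-protected
                                                pte written back
                                                (present and dirty again)

With the fix, callers that must not miss a mapped pte (the page_mkclean walk, and __xip_unmap in the diff below) pass sync=1 so page_check_address always takes the ptl before deciding, while page_referenced and try_to_unmap keep the racy sync=0 fast path.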
Showing 3 changed files with 11 additions and 7 deletions
include/linux/rmap.h
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/memcontrol.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
	spinlock_t lock;	/* Serialize access to vma list */
	/*
	 * NOTE: the LSB of the head.next is set by
	 * mm_take_all_locks() _after_ taking the above lock. So the
	 * head must only be read/written after taking the above lock
	 * to be sure to see a valid next pointer. The LSB bit itself
	 * is serialized by a system wide lock only visible to
	 * mm_take_all_locks() (mm_all_locks_mutex).
	 */
	struct list_head head;	/* List of private "related" vmas */
};

#ifdef CONFIG_MMU

extern struct kmem_cache *anon_vma_cachep;

static inline struct anon_vma *anon_vma_alloc(void)
{
	return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
}

static inline void anon_vma_free(struct anon_vma *anon_vma)
{
	kmem_cache_free(anon_vma_cachep, anon_vma);
}

static inline void anon_vma_lock(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	if (anon_vma)
		spin_lock(&anon_vma->lock);
}

static inline void anon_vma_unlock(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = vma->anon_vma;
	if (anon_vma)
		spin_unlock(&anon_vma->lock);
}

/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);	/* create anon_vma_cachep */
int anon_vma_prepare(struct vm_area_struct *);
void __anon_vma_merge(struct vm_area_struct *, struct vm_area_struct *);
void anon_vma_unlink(struct vm_area_struct *);
void anon_vma_link(struct vm_area_struct *);
void __anon_vma_link(struct vm_area_struct *);

/*
 * rmap interfaces called when adding or removing pte of page
 */
void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
void page_add_file_rmap(struct page *);
void page_remove_rmap(struct page *, struct vm_area_struct *);

#ifdef CONFIG_DEBUG_VM
void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address);
#else
static inline void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
{
	atomic_inc(&page->_mapcount);
}
#endif

/*
 * Called from mm/vmscan.c to handle paging out
 */
int page_referenced(struct page *, int is_locked, struct mem_cgroup *cnt);
int try_to_unmap(struct page *, int ignore_refs);

/*
 * Called from mm/filemap_xip.c to unmap empty zero page
 */
pte_t *page_check_address(struct page *, struct mm_struct *,
-			  unsigned long, spinlock_t **);
+			  unsigned long, spinlock_t **, int);

/*
 * Used by swapoff to help locate where page is expected in vma.
 */
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * returns the number of cleaned PTEs.
 */
int page_mkclean(struct page *);

#else	/* !CONFIG_MMU */

#define anon_vma_init() do {} while (0)
#define anon_vma_prepare(vma)	(0)
#define anon_vma_link(vma)	do {} while (0)

#define page_referenced(page,l,cnt) TestClearPageReferenced(page)
#define try_to_unmap(page, refs) SWAP_FAIL

static inline int page_mkclean(struct page *page)
{
	return 0;
}


#endif	/* CONFIG_MMU */

/*
 * Return values of try_to_unmap
 */
#define SWAP_SUCCESS	0
#define SWAP_AGAIN	1
#define SWAP_FAIL	2

#endif	/* _LINUX_RMAP_H */
mm/filemap_xip.c
/*
 * linux/mm/filemap_xip.c
 *
 * Copyright (C) 2005 IBM Corporation
 * Author: Carsten Otte <cotte@de.ibm.com>
 *
 * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
 *
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/uio.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/sched.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

/*
 * We do use our own empty page to avoid interference with other users
 * of ZERO_PAGE(), such as /dev/zero
 */
static struct page *__xip_sparse_page;

static struct page *xip_sparse_page(void)
{
	if (!__xip_sparse_page) {
		struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);

		if (page) {
			static DEFINE_SPINLOCK(xip_alloc_lock);
			spin_lock(&xip_alloc_lock);
			if (!__xip_sparse_page)
				__xip_sparse_page = page;
			else
				__free_page(page);
			spin_unlock(&xip_alloc_lock);
		}
	}
	return __xip_sparse_page;
}

/*
 * This is a file read routine for execute in place files, and uses
 * the mapping->a_ops->get_xip_mem() function for the actual low-level
 * stuff.
 *
 * Note the struct file* is not used at all.  It may be NULL.
 */
static ssize_t
do_xip_mapping_read(struct address_space *mapping,
		    struct file_ra_state *_ra,
		    struct file *filp,
		    char __user *buf,
		    size_t len,
		    loff_t *ppos)
{
	struct inode *inode = mapping->host;
	pgoff_t index, end_index;
	unsigned long offset;
	loff_t isize, pos;
	size_t copied = 0, error = 0;

	BUG_ON(!mapping->a_ops->get_xip_mem);

	pos = *ppos;
	index = pos >> PAGE_CACHE_SHIFT;
	offset = pos & ~PAGE_CACHE_MASK;

	isize = i_size_read(inode);
	if (!isize)
		goto out;

	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
	do {
		unsigned long nr, left;
		void *xip_mem;
		unsigned long xip_pfn;
		int zero = 0;

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_CACHE_SIZE;
		if (index >= end_index) {
			if (index > end_index)
				goto out;
			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (nr <= offset) {
				goto out;
			}
		}
		nr = nr - offset;
		if (nr > len)
			nr = len;

		error = mapping->a_ops->get_xip_mem(mapping, index, 0,
							&xip_mem, &xip_pfn);
		if (unlikely(error)) {
			if (error == -ENODATA) {
				/* sparse */
				zero = 1;
			} else
				goto out;
		}

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			/* address based flush */ ;

		/*
		 * Ok, we have the mem, so now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		if (!zero)
			left = __copy_to_user(buf+copied, xip_mem+offset, nr);
		else
			left = __clear_user(buf + copied, nr);

		if (left) {
			error = -EFAULT;
			goto out;
		}

		copied += (nr - left);
		offset += (nr - left);
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;
	} while (copied < len);

out:
	*ppos = pos + copied;
	if (filp)
		file_accessed(filp);

	return (copied ? copied : error);
}

ssize_t
xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	if (!access_ok(VERIFY_WRITE, buf, len))
		return -EFAULT;

	return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
			    buf, len, ppos);
}
EXPORT_SYMBOL_GPL(xip_file_read);

/*
 * __xip_unmap is invoked from xip_unmap and
 * xip_write
 *
 * This function walks all vmas of the address_space and unmaps the
 * __xip_sparse_page when found at pgoff.
 */
static void
__xip_unmap (struct address_space * mapping,
		     unsigned long pgoff)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm;
	struct prio_tree_iter iter;
	unsigned long address;
	pte_t *pte;
	pte_t pteval;
	spinlock_t *ptl;
	struct page *page;

	page = __xip_sparse_page;
	if (!page)
		return;

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		mm = vma->vm_mm;
		address = vma->vm_start +
			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-		pte = page_check_address(page, mm, address, &ptl);
+		pte = page_check_address(page, mm, address, &ptl, 1);
		if (pte) {
			/* Nuke the page table entry. */
			flush_cache_page(vma, address, pte_pfn(*pte));
			pteval = ptep_clear_flush_notify(vma, address, pte);
			page_remove_rmap(page, vma);
			dec_mm_counter(mm, file_rss);
			BUG_ON(pte_dirty(pteval));
			pte_unmap_unlock(pte, ptl);
			page_cache_release(page);
		}
	}
	spin_unlock(&mapping->i_mmap_lock);
}

/*
 * xip_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * This function is derived from filemap_fault, but used for execute in place
 */
static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	pgoff_t size;
	void *xip_mem;
	unsigned long xip_pfn;
	struct page *page;
	int error;

	/* XXX: are VM_FAULT_ codes OK? */

	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (vmf->pgoff >= size)
		return VM_FAULT_SIGBUS;

	error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
						&xip_mem, &xip_pfn);
	if (likely(!error))
		goto found;
	if (error != -ENODATA)
		return VM_FAULT_OOM;

	/* sparse block */
	if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
	    (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
	    (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
		int err;

		/* maybe shared writable, allocate new block */
		error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
							&xip_mem, &xip_pfn);
		if (error)
			return VM_FAULT_SIGBUS;
		/* unmap sparse mappings at pgoff from all other vmas */
		__xip_unmap(mapping, vmf->pgoff);

found:
		err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
							xip_pfn);
		if (err == -ENOMEM)
			return VM_FAULT_OOM;
		BUG_ON(err);
		return VM_FAULT_NOPAGE;
	} else {
		/* not shared and writable, use xip_sparse_page() */
		page = xip_sparse_page();
		if (!page)
			return VM_FAULT_OOM;

		page_cache_get(page);
		vmf->page = page;
		return 0;
	}
}

static struct vm_operations_struct xip_file_vm_ops = {
	.fault	= xip_file_fault,
};

int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	BUG_ON(!file->f_mapping->a_ops->get_xip_mem);

	file_accessed(file);
	vma->vm_ops = &xip_file_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP;
	return 0;
}
EXPORT_SYMBOL_GPL(xip_file_mmap);

static ssize_t
__xip_file_write(struct file *filp, const char __user *buf,
		  size_t count, loff_t pos, loff_t *ppos)
{
	struct address_space * mapping = filp->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	struct inode *inode = mapping->host;
	long status = 0;
	size_t bytes;
	ssize_t written = 0;

	BUG_ON(!mapping->a_ops->get_xip_mem);

	do {
		unsigned long index;
		unsigned long offset;
		size_t copied;
		void *xip_mem;
		unsigned long xip_pfn;

		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
		index = pos >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		status = a_ops->get_xip_mem(mapping, index, 0,
						&xip_mem, &xip_pfn);
		if (status == -ENODATA) {
			/* we allocate a new page unmap it */
			status = a_ops->get_xip_mem(mapping, index, 1,
							&xip_mem, &xip_pfn);
			if (!status)
				/* unmap page at pgoff from all other vmas */
				__xip_unmap(mapping, index);
		}

		if (status)
			break;

		copied = bytes -
			__copy_from_user_nocache(xip_mem + offset, buf, bytes);

		if (likely(copied > 0)) {
			status = copied;

			if (status >= 0) {
				written += status;
				count -= status;
				pos += status;
				buf += status;
			}
		}
		if (unlikely(copied != bytes))
			if (status >= 0)
				status = -EFAULT;
		if (status < 0)
			break;
	} while (count);
	*ppos = pos;
	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold i_mutex.
	 */
	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		mark_inode_dirty(inode);
	}

	return written ? written : status;
}

ssize_t
xip_file_write(struct file *filp, const char __user *buf, size_t len,
	       loff_t *ppos)
{
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	size_t count;
	loff_t pos;
	ssize_t ret;

	mutex_lock(&inode->i_mutex);

	if (!access_ok(VERIFY_READ, buf, len)) {
		ret=-EFAULT;
		goto out_up;
	}

	pos = *ppos;
	count = len;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = mapping->backing_dev_info;

	ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
	if (ret)
		goto out_backing;
	if (count == 0)
		goto out_backing;

	ret = file_remove_suid(filp);
	if (ret)
		goto out_backing;

	file_update_time(filp);

	ret = __xip_file_write (filp, buf, count, pos, ppos);

 out_backing:
	current->backing_dev_info = NULL;
 out_up:
	mutex_unlock(&inode->i_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(xip_file_write);

/*
 * truncate a page used for execute in place
 * functionality is analog to block_truncate_page but does use get_xip_mem
 * to get the page instead of page cache
 */
int
xip_truncate_page(struct address_space *mapping, loff_t from)
{
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	unsigned blocksize;
	unsigned length;
	void *xip_mem;
	unsigned long xip_pfn;
	int err;

	BUG_ON(!mapping->a_ops->get_xip_mem);

	blocksize = 1 << mapping->host->i_blkbits;
	length = offset & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;

	length = blocksize - length;

	err = mapping->a_ops->get_xip_mem(mapping, index, 0,
						&xip_mem, &xip_pfn);
	if (unlikely(err)) {
		if (err == -ENODATA)
			/* Hole? No need to truncate */
			return 0;
		else
			return err;
	}
	memset(xip_mem + offset, 0, length);
	return 0;
}
EXPORT_SYMBOL_GPL(xip_truncate_page);
mm/rmap.c
1 | /* | 1 | /* |
2 | * mm/rmap.c - physical to virtual reverse mappings | 2 | * mm/rmap.c - physical to virtual reverse mappings |
3 | * | 3 | * |
4 | * Copyright 2001, Rik van Riel <riel@conectiva.com.br> | 4 | * Copyright 2001, Rik van Riel <riel@conectiva.com.br> |
5 | * Released under the General Public License (GPL). | 5 | * Released under the General Public License (GPL). |
6 | * | 6 | * |
7 | * Simple, low overhead reverse mapping scheme. | 7 | * Simple, low overhead reverse mapping scheme. |
8 | * Please try to keep this thing as modular as possible. | 8 | * Please try to keep this thing as modular as possible. |
9 | * | 9 | * |
10 | * Provides methods for unmapping each kind of mapped page: | 10 | * Provides methods for unmapping each kind of mapped page: |
11 | * the anon methods track anonymous pages, and | 11 | * the anon methods track anonymous pages, and |
12 | * the file methods track pages belonging to an inode. | 12 | * the file methods track pages belonging to an inode. |
13 | * | 13 | * |
14 | * Original design by Rik van Riel <riel@conectiva.com.br> 2001 | 14 | * Original design by Rik van Riel <riel@conectiva.com.br> 2001 |
15 | * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 | 15 | * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 |
16 | * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 | 16 | * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 |
17 | * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004 | 17 | * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004 |
18 | */ | 18 | */ |
19 | 19 | ||
20 | /* | 20 | /* |
21 | * Lock ordering in mm: | 21 | * Lock ordering in mm: |
22 | * | 22 | * |
23 | * inode->i_mutex (while writing or truncating, not reading or faulting) | 23 | * inode->i_mutex (while writing or truncating, not reading or faulting) |
24 | * inode->i_alloc_sem (vmtruncate_range) | 24 | * inode->i_alloc_sem (vmtruncate_range) |
25 | * mm->mmap_sem | 25 | * mm->mmap_sem |
26 | * page->flags PG_locked (lock_page) | 26 | * page->flags PG_locked (lock_page) |
27 | * mapping->i_mmap_lock | 27 | * mapping->i_mmap_lock |
28 | * anon_vma->lock | 28 | * anon_vma->lock |
29 | * mm->page_table_lock or pte_lock | 29 | * mm->page_table_lock or pte_lock |
30 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 30 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
31 | * swap_lock (in swap_duplicate, swap_info_get) | 31 | * swap_lock (in swap_duplicate, swap_info_get) |
32 | * mmlist_lock (in mmput, drain_mmlist and others) | 32 | * mmlist_lock (in mmput, drain_mmlist and others) |
33 | * mapping->private_lock (in __set_page_dirty_buffers) | 33 | * mapping->private_lock (in __set_page_dirty_buffers) |
34 | * inode_lock (in set_page_dirty's __mark_inode_dirty) | 34 | * inode_lock (in set_page_dirty's __mark_inode_dirty) |
35 | * sb_lock (within inode_lock in fs/fs-writeback.c) | 35 | * sb_lock (within inode_lock in fs/fs-writeback.c) |
36 | * mapping->tree_lock (widely used, in set_page_dirty, | 36 | * mapping->tree_lock (widely used, in set_page_dirty, |
37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within inode_lock in __sync_single_inode) | 38 | * within inode_lock in __sync_single_inode) |
39 | */ | 39 | */ |
40 | 40 | ||
41 | #include <linux/mm.h> | 41 | #include <linux/mm.h> |
42 | #include <linux/pagemap.h> | 42 | #include <linux/pagemap.h> |
43 | #include <linux/swap.h> | 43 | #include <linux/swap.h> |
44 | #include <linux/swapops.h> | 44 | #include <linux/swapops.h> |
45 | #include <linux/slab.h> | 45 | #include <linux/slab.h> |
46 | #include <linux/init.h> | 46 | #include <linux/init.h> |
47 | #include <linux/rmap.h> | 47 | #include <linux/rmap.h> |
48 | #include <linux/rcupdate.h> | 48 | #include <linux/rcupdate.h> |
49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
50 | #include <linux/kallsyms.h> | 50 | #include <linux/kallsyms.h> |
51 | #include <linux/memcontrol.h> | 51 | #include <linux/memcontrol.h> |
52 | #include <linux/mmu_notifier.h> | 52 | #include <linux/mmu_notifier.h> |
53 | 53 | ||
54 | #include <asm/tlbflush.h> | 54 | #include <asm/tlbflush.h> |
55 | 55 | ||
56 | struct kmem_cache *anon_vma_cachep; | 56 | struct kmem_cache *anon_vma_cachep; |
57 | 57 | ||
58 | /* This must be called under the mmap_sem. */ | 58 | /* This must be called under the mmap_sem. */ |
59 | int anon_vma_prepare(struct vm_area_struct *vma) | 59 | int anon_vma_prepare(struct vm_area_struct *vma) |
60 | { | 60 | { |
61 | struct anon_vma *anon_vma = vma->anon_vma; | 61 | struct anon_vma *anon_vma = vma->anon_vma; |
62 | 62 | ||
63 | might_sleep(); | 63 | might_sleep(); |
64 | if (unlikely(!anon_vma)) { | 64 | if (unlikely(!anon_vma)) { |
65 | struct mm_struct *mm = vma->vm_mm; | 65 | struct mm_struct *mm = vma->vm_mm; |
66 | struct anon_vma *allocated, *locked; | 66 | struct anon_vma *allocated, *locked; |
67 | 67 | ||
68 | anon_vma = find_mergeable_anon_vma(vma); | 68 | anon_vma = find_mergeable_anon_vma(vma); |
69 | if (anon_vma) { | 69 | if (anon_vma) { |
70 | allocated = NULL; | 70 | allocated = NULL; |
71 | locked = anon_vma; | 71 | locked = anon_vma; |
72 | spin_lock(&locked->lock); | 72 | spin_lock(&locked->lock); |
73 | } else { | 73 | } else { |
74 | anon_vma = anon_vma_alloc(); | 74 | anon_vma = anon_vma_alloc(); |
75 | if (unlikely(!anon_vma)) | 75 | if (unlikely(!anon_vma)) |
76 | return -ENOMEM; | 76 | return -ENOMEM; |
77 | allocated = anon_vma; | 77 | allocated = anon_vma; |
78 | locked = NULL; | 78 | locked = NULL; |
79 | } | 79 | } |
80 | 80 | ||
81 | /* page_table_lock to protect against threads */ | 81 | /* page_table_lock to protect against threads */ |
82 | spin_lock(&mm->page_table_lock); | 82 | spin_lock(&mm->page_table_lock); |
83 | if (likely(!vma->anon_vma)) { | 83 | if (likely(!vma->anon_vma)) { |
84 | vma->anon_vma = anon_vma; | 84 | vma->anon_vma = anon_vma; |
85 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 85 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
86 | allocated = NULL; | 86 | allocated = NULL; |
87 | } | 87 | } |
88 | spin_unlock(&mm->page_table_lock); | 88 | spin_unlock(&mm->page_table_lock); |
89 | 89 | ||
90 | if (locked) | 90 | if (locked) |
91 | spin_unlock(&locked->lock); | 91 | spin_unlock(&locked->lock); |
92 | if (unlikely(allocated)) | 92 | if (unlikely(allocated)) |
93 | anon_vma_free(allocated); | 93 | anon_vma_free(allocated); |
94 | } | 94 | } |
95 | return 0; | 95 | return 0; |
96 | } | 96 | } |
97 | 97 | ||
98 | void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) | 98 | void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) |
99 | { | 99 | { |
100 | BUG_ON(vma->anon_vma != next->anon_vma); | 100 | BUG_ON(vma->anon_vma != next->anon_vma); |
101 | list_del(&next->anon_vma_node); | 101 | list_del(&next->anon_vma_node); |
102 | } | 102 | } |
103 | 103 | ||
104 | void __anon_vma_link(struct vm_area_struct *vma) | 104 | void __anon_vma_link(struct vm_area_struct *vma) |
105 | { | 105 | { |
106 | struct anon_vma *anon_vma = vma->anon_vma; | 106 | struct anon_vma *anon_vma = vma->anon_vma; |
107 | 107 | ||
108 | if (anon_vma) | 108 | if (anon_vma) |
109 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 109 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
110 | } | 110 | } |
111 | 111 | ||
112 | void anon_vma_link(struct vm_area_struct *vma) | 112 | void anon_vma_link(struct vm_area_struct *vma) |
113 | { | 113 | { |
114 | struct anon_vma *anon_vma = vma->anon_vma; | 114 | struct anon_vma *anon_vma = vma->anon_vma; |
115 | 115 | ||
116 | if (anon_vma) { | 116 | if (anon_vma) { |
117 | spin_lock(&anon_vma->lock); | 117 | spin_lock(&anon_vma->lock); |
118 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 118 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
119 | spin_unlock(&anon_vma->lock); | 119 | spin_unlock(&anon_vma->lock); |
120 | } | 120 | } |
121 | } | 121 | } |
122 | 122 | ||
123 | void anon_vma_unlink(struct vm_area_struct *vma) | 123 | void anon_vma_unlink(struct vm_area_struct *vma) |
124 | { | 124 | { |
125 | struct anon_vma *anon_vma = vma->anon_vma; | 125 | struct anon_vma *anon_vma = vma->anon_vma; |
126 | int empty; | 126 | int empty; |
127 | 127 | ||
128 | if (!anon_vma) | 128 | if (!anon_vma) |
129 | return; | 129 | return; |
130 | 130 | ||
131 | spin_lock(&anon_vma->lock); | 131 | spin_lock(&anon_vma->lock); |
132 | list_del(&vma->anon_vma_node); | 132 | list_del(&vma->anon_vma_node); |
133 | 133 | ||
134 | /* We must garbage collect the anon_vma if it's empty */ | 134 | /* We must garbage collect the anon_vma if it's empty */ |
135 | empty = list_empty(&anon_vma->head); | 135 | empty = list_empty(&anon_vma->head); |
136 | spin_unlock(&anon_vma->lock); | 136 | spin_unlock(&anon_vma->lock); |
137 | 137 | ||
138 | if (empty) | 138 | if (empty) |
139 | anon_vma_free(anon_vma); | 139 | anon_vma_free(anon_vma); |
140 | } | 140 | } |
141 | 141 | ||
142 | static void anon_vma_ctor(void *data) | 142 | static void anon_vma_ctor(void *data) |
143 | { | 143 | { |
144 | struct anon_vma *anon_vma = data; | 144 | struct anon_vma *anon_vma = data; |
145 | 145 | ||
146 | spin_lock_init(&anon_vma->lock); | 146 | spin_lock_init(&anon_vma->lock); |
147 | INIT_LIST_HEAD(&anon_vma->head); | 147 | INIT_LIST_HEAD(&anon_vma->head); |
148 | } | 148 | } |
149 | 149 | ||
150 | void __init anon_vma_init(void) | 150 | void __init anon_vma_init(void) |
151 | { | 151 | { |
152 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), | 152 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), |
153 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); | 153 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); |
154 | } | 154 | } |
155 | 155 | ||
156 | /* | 156 | /* |
157 | * Getting a lock on a stable anon_vma from a page off the LRU is | 157 | * Getting a lock on a stable anon_vma from a page off the LRU is |
158 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. | 158 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. |
159 | */ | 159 | */ |
160 | static struct anon_vma *page_lock_anon_vma(struct page *page) | 160 | static struct anon_vma *page_lock_anon_vma(struct page *page) |
161 | { | 161 | { |
162 | struct anon_vma *anon_vma; | 162 | struct anon_vma *anon_vma; |
163 | unsigned long anon_mapping; | 163 | unsigned long anon_mapping; |
164 | 164 | ||
165 | rcu_read_lock(); | 165 | rcu_read_lock(); |
166 | anon_mapping = (unsigned long) page->mapping; | 166 | anon_mapping = (unsigned long) page->mapping; |
167 | if (!(anon_mapping & PAGE_MAPPING_ANON)) | 167 | if (!(anon_mapping & PAGE_MAPPING_ANON)) |
168 | goto out; | 168 | goto out; |
169 | if (!page_mapped(page)) | 169 | if (!page_mapped(page)) |
170 | goto out; | 170 | goto out; |
171 | 171 | ||
172 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | 172 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); |
173 | spin_lock(&anon_vma->lock); | 173 | spin_lock(&anon_vma->lock); |
174 | return anon_vma; | 174 | return anon_vma; |
175 | out: | 175 | out: |
176 | rcu_read_unlock(); | 176 | rcu_read_unlock(); |
177 | return NULL; | 177 | return NULL; |
178 | } | 178 | } |
179 | 179 | ||
180 | static void page_unlock_anon_vma(struct anon_vma *anon_vma) | 180 | static void page_unlock_anon_vma(struct anon_vma *anon_vma) |
181 | { | 181 | { |
182 | spin_unlock(&anon_vma->lock); | 182 | spin_unlock(&anon_vma->lock); |
183 | rcu_read_unlock(); | 183 | rcu_read_unlock(); |
184 | } | 184 | } |
185 | 185 | ||
186 | /* | 186 | /* |
187 | * At what user virtual address is page expected in @vma? | 187 | * At what user virtual address is page expected in @vma? |
188 | * Returns virtual address or -EFAULT if page's index/offset is not | 188 | * Returns virtual address or -EFAULT if page's index/offset is not |
189 | * within the range mapped the @vma. | 189 | * within the range mapped the @vma. |
190 | */ | 190 | */ |
191 | static inline unsigned long | 191 | static inline unsigned long |
192 | vma_address(struct page *page, struct vm_area_struct *vma) | 192 | vma_address(struct page *page, struct vm_area_struct *vma) |
193 | { | 193 | { |
194 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 194 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
195 | unsigned long address; | 195 | unsigned long address; |
196 | 196 | ||
197 | address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 197 | address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
198 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { | 198 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { |
199 | /* page should be within @vma mapping range */ | 199 | /* page should be within @vma mapping range */ |
200 | return -EFAULT; | 200 | return -EFAULT; |
201 | } | 201 | } |
202 | return address; | 202 | return address; |
203 | } | 203 | } |
204 | 204 | ||
205 | /* | 205 | /* |
206 | * At what user virtual address is page expected in vma? checking that the | 206 | * At what user virtual address is page expected in vma? checking that the |
207 | * page matches the vma: currently only used on anon pages, by unuse_vma; | 207 | * page matches the vma: currently only used on anon pages, by unuse_vma; |
208 | */ | 208 | */ |
209 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 209 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
210 | { | 210 | { |
211 | if (PageAnon(page)) { | 211 | if (PageAnon(page)) { |
212 | if ((void *)vma->anon_vma != | 212 | if ((void *)vma->anon_vma != |
213 | (void *)page->mapping - PAGE_MAPPING_ANON) | 213 | (void *)page->mapping - PAGE_MAPPING_ANON) |
214 | return -EFAULT; | 214 | return -EFAULT; |
215 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { | 215 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { |
216 | if (!vma->vm_file || | 216 | if (!vma->vm_file || |
217 | vma->vm_file->f_mapping != page->mapping) | 217 | vma->vm_file->f_mapping != page->mapping) |
218 | return -EFAULT; | 218 | return -EFAULT; |
219 | } else | 219 | } else |
220 | return -EFAULT; | 220 | return -EFAULT; |
221 | return vma_address(page, vma); | 221 | return vma_address(page, vma); |
222 | } | 222 | } |
223 | 223 | ||
224 | /* | 224 | /* |
225 | * Check that @page is mapped at @address into @mm. | 225 | * Check that @page is mapped at @address into @mm. |
226 | * | 226 | * |
227 | * If @sync is false, page_check_address may perform a racy check to avoid | ||
228 | * the page table lock when the pte is not present (helpful when reclaiming | ||
229 | * highly shared pages). | ||
230 | * | ||
227 | * On success returns with pte mapped and locked. | 231 | * On success returns with pte mapped and locked. |
228 | */ | 232 | */ |
229 | pte_t *page_check_address(struct page *page, struct mm_struct *mm, | 233 | pte_t *page_check_address(struct page *page, struct mm_struct *mm, |
230 | unsigned long address, spinlock_t **ptlp) | 234 | unsigned long address, spinlock_t **ptlp, int sync) |
231 | { | 235 | { |
232 | pgd_t *pgd; | 236 | pgd_t *pgd; |
233 | pud_t *pud; | 237 | pud_t *pud; |
234 | pmd_t *pmd; | 238 | pmd_t *pmd; |
235 | pte_t *pte; | 239 | pte_t *pte; |
236 | spinlock_t *ptl; | 240 | spinlock_t *ptl; |
237 | 241 | ||
238 | pgd = pgd_offset(mm, address); | 242 | pgd = pgd_offset(mm, address); |
239 | if (!pgd_present(*pgd)) | 243 | if (!pgd_present(*pgd)) |
240 | return NULL; | 244 | return NULL; |
241 | 245 | ||
242 | pud = pud_offset(pgd, address); | 246 | pud = pud_offset(pgd, address); |
243 | if (!pud_present(*pud)) | 247 | if (!pud_present(*pud)) |
244 | return NULL; | 248 | return NULL; |
245 | 249 | ||
246 | pmd = pmd_offset(pud, address); | 250 | pmd = pmd_offset(pud, address); |
247 | if (!pmd_present(*pmd)) | 251 | if (!pmd_present(*pmd)) |
248 | return NULL; | 252 | return NULL; |
249 | 253 | ||
250 | pte = pte_offset_map(pmd, address); | 254 | pte = pte_offset_map(pmd, address); |
251 | /* Make a quick check before getting the lock */ | 255 | /* Make a quick check before getting the lock */ |
252 | if (!pte_present(*pte)) { | 256 | if (!sync && !pte_present(*pte)) { |
253 | pte_unmap(pte); | 257 | pte_unmap(pte); |
254 | return NULL; | 258 | return NULL; |
255 | } | 259 | } |
256 | 260 | ||
257 | ptl = pte_lockptr(mm, pmd); | 261 | ptl = pte_lockptr(mm, pmd); |
258 | spin_lock(ptl); | 262 | spin_lock(ptl); |
259 | if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { | 263 | if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { |
260 | *ptlp = ptl; | 264 | *ptlp = ptl; |
261 | return pte; | 265 | return pte; |
262 | } | 266 | } |
263 | pte_unmap_unlock(pte, ptl); | 267 | pte_unmap_unlock(pte, ptl); |
264 | return NULL; | 268 | return NULL; |
265 | } | 269 | } |
266 | 270 | ||
267 | /* | 271 | /* |
268 | * Subfunctions of page_referenced: page_referenced_one called | 272 | * Subfunctions of page_referenced: page_referenced_one called |
269 | * repeatedly from either page_referenced_anon or page_referenced_file. | 273 | * repeatedly from either page_referenced_anon or page_referenced_file. |
270 | */ | 274 | */ |
271 | static int page_referenced_one(struct page *page, | 275 | static int page_referenced_one(struct page *page, |
272 | struct vm_area_struct *vma, unsigned int *mapcount) | 276 | struct vm_area_struct *vma, unsigned int *mapcount) |
273 | { | 277 | { |
274 | struct mm_struct *mm = vma->vm_mm; | 278 | struct mm_struct *mm = vma->vm_mm; |
275 | unsigned long address; | 279 | unsigned long address; |
276 | pte_t *pte; | 280 | pte_t *pte; |
277 | spinlock_t *ptl; | 281 | spinlock_t *ptl; |
278 | int referenced = 0; | 282 | int referenced = 0; |
279 | 283 | ||
280 | address = vma_address(page, vma); | 284 | address = vma_address(page, vma); |
281 | if (address == -EFAULT) | 285 | if (address == -EFAULT) |
282 | goto out; | 286 | goto out; |
283 | 287 | ||
284 | pte = page_check_address(page, mm, address, &ptl); | 288 | pte = page_check_address(page, mm, address, &ptl, 0); |
285 | if (!pte) | 289 | if (!pte) |
286 | goto out; | 290 | goto out; |
287 | 291 | ||
288 | if (vma->vm_flags & VM_LOCKED) { | 292 | if (vma->vm_flags & VM_LOCKED) { |
289 | referenced++; | 293 | referenced++; |
290 | *mapcount = 1; /* break early from loop */ | 294 | *mapcount = 1; /* break early from loop */ |
291 | } else if (ptep_clear_flush_young_notify(vma, address, pte)) | 295 | } else if (ptep_clear_flush_young_notify(vma, address, pte)) |
292 | referenced++; | 296 | referenced++; |
293 | 297 | ||
294 | /* Pretend the page is referenced if the task has the | 298 | /* Pretend the page is referenced if the task has the |
295 | swap token and is in the middle of a page fault. */ | 299 | swap token and is in the middle of a page fault. */ |
296 | if (mm != current->mm && has_swap_token(mm) && | 300 | if (mm != current->mm && has_swap_token(mm) && |
297 | rwsem_is_locked(&mm->mmap_sem)) | 301 | rwsem_is_locked(&mm->mmap_sem)) |
298 | referenced++; | 302 | referenced++; |
299 | 303 | ||
300 | (*mapcount)--; | 304 | (*mapcount)--; |
301 | pte_unmap_unlock(pte, ptl); | 305 | pte_unmap_unlock(pte, ptl); |
302 | out: | 306 | out: |
303 | return referenced; | 307 | return referenced; |
304 | } | 308 | } |
305 | 309 | ||
306 | static int page_referenced_anon(struct page *page, | 310 | static int page_referenced_anon(struct page *page, |
307 | struct mem_cgroup *mem_cont) | 311 | struct mem_cgroup *mem_cont) |
308 | { | 312 | { |
309 | unsigned int mapcount; | 313 | unsigned int mapcount; |
310 | struct anon_vma *anon_vma; | 314 | struct anon_vma *anon_vma; |
311 | struct vm_area_struct *vma; | 315 | struct vm_area_struct *vma; |
312 | int referenced = 0; | 316 | int referenced = 0; |
313 | 317 | ||
314 | anon_vma = page_lock_anon_vma(page); | 318 | anon_vma = page_lock_anon_vma(page); |
315 | if (!anon_vma) | 319 | if (!anon_vma) |
316 | return referenced; | 320 | return referenced; |
317 | 321 | ||
318 | mapcount = page_mapcount(page); | 322 | mapcount = page_mapcount(page); |
319 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 323 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
320 | /* | 324 | /* |
321 | * If we are reclaiming on behalf of a cgroup, skip | 325 | * If we are reclaiming on behalf of a cgroup, skip |
322 | * counting on behalf of references from different | 326 | * counting on behalf of references from different |
323 | * cgroups | 327 | * cgroups |
324 | */ | 328 | */ |
325 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 329 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
326 | continue; | 330 | continue; |
327 | referenced += page_referenced_one(page, vma, &mapcount); | 331 | referenced += page_referenced_one(page, vma, &mapcount); |
328 | if (!mapcount) | 332 | if (!mapcount) |
329 | break; | 333 | break; |
330 | } | 334 | } |
331 | 335 | ||
332 | page_unlock_anon_vma(anon_vma); | 336 | page_unlock_anon_vma(anon_vma); |
333 | return referenced; | 337 | return referenced; |
334 | } | 338 | } |
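
The loop above decrements *mapcount once per pte it finds, so the walk stops as soon as every known mapping has been visited rather than scanning the rest of the vma list. A minimal stand-alone model of that early-exit bookkeeping; the array and counts are invented for the sketch:

#include <stdio.h>

/* One invented flag per candidate vma: did its pte have the young bit? */
static const int referenced_in[] = { 0, 1, 0, 1, 1 };

/* Count references, but stop once mapcount says every mapping that
 * exists has already been accounted for, as the loop above does. */
static int count_referenced(unsigned int mapcount)
{
	int referenced = 0;
	for (unsigned int i = 0;
	     i < sizeof(referenced_in) / sizeof(referenced_in[0]); i++) {
		referenced += referenced_in[i];
		if (--mapcount == 0)
			break;          /* all mappings visited */
	}
	return referenced;
}

int main(void)
{
	printf("%d of 3 mapped ptes were referenced\n", count_referenced(3));
	return 0;
}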
335 | 339 | ||
336 | /** | 340 | /** |
337 | * page_referenced_file - referenced check for object-based rmap | 341 | * page_referenced_file - referenced check for object-based rmap |
338 | * @page: the page we're checking references on. | 342 | * @page: the page we're checking references on. |
339 | * @mem_cont: target memory controller | 343 | * @mem_cont: target memory controller |
340 | * | 344 | * |
341 | * For an object-based mapped page, find all the places it is mapped and | 345 | * For an object-based mapped page, find all the places it is mapped and |
342 | * check/clear the referenced flag. This is done by following the page->mapping | 346 | * check/clear the referenced flag. This is done by following the page->mapping |
343 | * pointer, then walking the chain of vmas it holds. It returns the number | 347 | * pointer, then walking the chain of vmas it holds. It returns the number |
344 | * of references it found. | 348 | * of references it found. |
345 | * | 349 | * |
346 | * This function is only called from page_referenced for object-based pages. | 350 | * This function is only called from page_referenced for object-based pages. |
347 | */ | 351 | */ |
348 | static int page_referenced_file(struct page *page, | 352 | static int page_referenced_file(struct page *page, |
349 | struct mem_cgroup *mem_cont) | 353 | struct mem_cgroup *mem_cont) |
350 | { | 354 | { |
351 | unsigned int mapcount; | 355 | unsigned int mapcount; |
352 | struct address_space *mapping = page->mapping; | 356 | struct address_space *mapping = page->mapping; |
353 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 357 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
354 | struct vm_area_struct *vma; | 358 | struct vm_area_struct *vma; |
355 | struct prio_tree_iter iter; | 359 | struct prio_tree_iter iter; |
356 | int referenced = 0; | 360 | int referenced = 0; |
357 | 361 | ||
358 | /* | 362 | /* |
359 | * The caller's checks on page->mapping and !PageAnon have made | 363 | * The caller's checks on page->mapping and !PageAnon have made |
360 | * sure that this is a file page: the check for page->mapping | 364 | * sure that this is a file page: the check for page->mapping |
361 | * excludes the case just before it gets set on an anon page. | 365 | * excludes the case just before it gets set on an anon page. |
362 | */ | 366 | */ |
363 | BUG_ON(PageAnon(page)); | 367 | BUG_ON(PageAnon(page)); |
364 | 368 | ||
365 | /* | 369 | /* |
366 | * The page lock not only makes sure that page->mapping cannot | 370 | * The page lock not only makes sure that page->mapping cannot |
367 | * suddenly be NULLified by truncation, it makes sure that the | 371 | * suddenly be NULLified by truncation, it makes sure that the |
368 | * structure at mapping cannot be freed and reused yet, | 372 | * structure at mapping cannot be freed and reused yet, |
369 | * so we can safely take mapping->i_mmap_lock. | 373 | * so we can safely take mapping->i_mmap_lock. |
370 | */ | 374 | */ |
371 | BUG_ON(!PageLocked(page)); | 375 | BUG_ON(!PageLocked(page)); |
372 | 376 | ||
373 | spin_lock(&mapping->i_mmap_lock); | 377 | spin_lock(&mapping->i_mmap_lock); |
374 | 378 | ||
375 | /* | 379 | /* |
376 | * i_mmap_lock does not stabilize mapcount at all, but mapcount | 380 | * i_mmap_lock does not stabilize mapcount at all, but mapcount |
377 | * is more likely to be accurate if we note it after spinning. | 381 | * is more likely to be accurate if we note it after spinning. |
378 | */ | 382 | */ |
379 | mapcount = page_mapcount(page); | 383 | mapcount = page_mapcount(page); |
380 | 384 | ||
381 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 385 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
382 | /* | 386 | /* |
383 | * If we are reclaiming on behalf of a cgroup, skip | 387 | * If we are reclaiming on behalf of a cgroup, skip |
384 | * counting on behalf of references from different | 388 | * counting on behalf of references from different |
385 | * cgroups | 389 | * cgroups |
386 | */ | 390 | */ |
387 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 391 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
388 | continue; | 392 | continue; |
389 | if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) | 393 | if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) |
390 | == (VM_LOCKED|VM_MAYSHARE)) { | 394 | == (VM_LOCKED|VM_MAYSHARE)) { |
391 | referenced++; | 395 | referenced++; |
392 | break; | 396 | break; |
393 | } | 397 | } |
394 | referenced += page_referenced_one(page, vma, &mapcount); | 398 | referenced += page_referenced_one(page, vma, &mapcount); |
395 | if (!mapcount) | 399 | if (!mapcount) |
396 | break; | 400 | break; |
397 | } | 401 | } |
398 | 402 | ||
399 | spin_unlock(&mapping->i_mmap_lock); | 403 | spin_unlock(&mapping->i_mmap_lock); |
400 | return referenced; | 404 | return referenced; |
401 | } | 405 | } |
402 | 406 | ||
403 | /** | 407 | /** |
404 | * page_referenced - test if the page was referenced | 408 | * page_referenced - test if the page was referenced |
405 | * @page: the page to test | 409 | * @page: the page to test |
406 | * @is_locked: caller holds lock on the page | 410 | * @is_locked: caller holds lock on the page |
407 | * @mem_cont: target memory controller | 411 | * @mem_cont: target memory controller |
408 | * | 412 | * |
409 | * Quick test_and_clear_referenced for all mappings to a page, | 413 | * Quick test_and_clear_referenced for all mappings to a page, |
410 | * returns the number of ptes which referenced the page. | 414 | * returns the number of ptes which referenced the page. |
411 | */ | 415 | */ |
412 | int page_referenced(struct page *page, int is_locked, | 416 | int page_referenced(struct page *page, int is_locked, |
413 | struct mem_cgroup *mem_cont) | 417 | struct mem_cgroup *mem_cont) |
414 | { | 418 | { |
415 | int referenced = 0; | 419 | int referenced = 0; |
416 | 420 | ||
417 | if (TestClearPageReferenced(page)) | 421 | if (TestClearPageReferenced(page)) |
418 | referenced++; | 422 | referenced++; |
419 | 423 | ||
420 | if (page_mapped(page) && page->mapping) { | 424 | if (page_mapped(page) && page->mapping) { |
421 | if (PageAnon(page)) | 425 | if (PageAnon(page)) |
422 | referenced += page_referenced_anon(page, mem_cont); | 426 | referenced += page_referenced_anon(page, mem_cont); |
423 | else if (is_locked) | 427 | else if (is_locked) |
424 | referenced += page_referenced_file(page, mem_cont); | 428 | referenced += page_referenced_file(page, mem_cont); |
425 | else if (!trylock_page(page)) | 429 | else if (!trylock_page(page)) |
426 | referenced++; | 430 | referenced++; |
427 | else { | 431 | else { |
428 | if (page->mapping) | 432 | if (page->mapping) |
429 | referenced += | 433 | referenced += |
430 | page_referenced_file(page, mem_cont); | 434 | page_referenced_file(page, mem_cont); |
431 | unlock_page(page); | 435 | unlock_page(page); |
432 | } | 436 | } |
433 | } | 437 | } |
434 | 438 | ||
435 | if (page_test_and_clear_young(page)) | 439 | if (page_test_and_clear_young(page)) |
436 | referenced++; | 440 | referenced++; |
437 | 441 | ||
438 | return referenced; | 442 | return referenced; |
439 | } | 443 | } |
440 | 444 | ||
441 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | 445 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) |
442 | { | 446 | { |
443 | struct mm_struct *mm = vma->vm_mm; | 447 | struct mm_struct *mm = vma->vm_mm; |
444 | unsigned long address; | 448 | unsigned long address; |
445 | pte_t *pte; | 449 | pte_t *pte; |
446 | spinlock_t *ptl; | 450 | spinlock_t *ptl; |
447 | int ret = 0; | 451 | int ret = 0; |
448 | 452 | ||
449 | address = vma_address(page, vma); | 453 | address = vma_address(page, vma); |
450 | if (address == -EFAULT) | 454 | if (address == -EFAULT) |
451 | goto out; | 455 | goto out; |
452 | 456 | ||
453 | pte = page_check_address(page, mm, address, &ptl); | 457 | pte = page_check_address(page, mm, address, &ptl, 1); |
454 | if (!pte) | 458 | if (!pte) |
455 | goto out; | 459 | goto out; |
456 | 460 | ||
457 | if (pte_dirty(*pte) || pte_write(*pte)) { | 461 | if (pte_dirty(*pte) || pte_write(*pte)) { |
458 | pte_t entry; | 462 | pte_t entry; |
459 | 463 | ||
460 | flush_cache_page(vma, address, pte_pfn(*pte)); | 464 | flush_cache_page(vma, address, pte_pfn(*pte)); |
461 | entry = ptep_clear_flush_notify(vma, address, pte); | 465 | entry = ptep_clear_flush_notify(vma, address, pte); |
462 | entry = pte_wrprotect(entry); | 466 | entry = pte_wrprotect(entry); |
463 | entry = pte_mkclean(entry); | 467 | entry = pte_mkclean(entry); |
464 | set_pte_at(mm, address, pte, entry); | 468 | set_pte_at(mm, address, pte, entry); |
465 | ret = 1; | 469 | ret = 1; |
466 | } | 470 | } |
467 | 471 | ||
468 | pte_unmap_unlock(pte, ptl); | 472 | pte_unmap_unlock(pte, ptl); |
469 | out: | 473 | out: |
470 | return ret; | 474 | return ret; |
471 | } | 475 | } |
472 | 476 | ||
473 | static int page_mkclean_file(struct address_space *mapping, struct page *page) | 477 | static int page_mkclean_file(struct address_space *mapping, struct page *page) |
474 | { | 478 | { |
475 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 479 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
476 | struct vm_area_struct *vma; | 480 | struct vm_area_struct *vma; |
477 | struct prio_tree_iter iter; | 481 | struct prio_tree_iter iter; |
478 | int ret = 0; | 482 | int ret = 0; |
479 | 483 | ||
480 | BUG_ON(PageAnon(page)); | 484 | BUG_ON(PageAnon(page)); |
481 | 485 | ||
482 | spin_lock(&mapping->i_mmap_lock); | 486 | spin_lock(&mapping->i_mmap_lock); |
483 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 487 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
484 | if (vma->vm_flags & VM_SHARED) | 488 | if (vma->vm_flags & VM_SHARED) |
485 | ret += page_mkclean_one(page, vma); | 489 | ret += page_mkclean_one(page, vma); |
486 | } | 490 | } |
487 | spin_unlock(&mapping->i_mmap_lock); | 491 | spin_unlock(&mapping->i_mmap_lock); |
488 | return ret; | 492 | return ret; |
489 | } | 493 | } |
490 | 494 | ||
491 | int page_mkclean(struct page *page) | 495 | int page_mkclean(struct page *page) |
492 | { | 496 | { |
493 | int ret = 0; | 497 | int ret = 0; |
494 | 498 | ||
495 | BUG_ON(!PageLocked(page)); | 499 | BUG_ON(!PageLocked(page)); |
496 | 500 | ||
497 | if (page_mapped(page)) { | 501 | if (page_mapped(page)) { |
498 | struct address_space *mapping = page_mapping(page); | 502 | struct address_space *mapping = page_mapping(page); |
499 | if (mapping) { | 503 | if (mapping) { |
500 | ret = page_mkclean_file(mapping, page); | 504 | ret = page_mkclean_file(mapping, page); |
501 | if (page_test_dirty(page)) { | 505 | if (page_test_dirty(page)) { |
502 | page_clear_dirty(page); | 506 | page_clear_dirty(page); |
503 | ret = 1; | 507 | ret = 1; |
504 | } | 508 | } |
505 | } | 509 | } |
506 | } | 510 | } |
507 | 511 | ||
508 | return ret; | 512 | return ret; |
509 | } | 513 | } |
510 | EXPORT_SYMBOL_GPL(page_mkclean); | 514 | EXPORT_SYMBOL_GPL(page_mkclean); |
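
The pte transformation inside page_mkclean_one is a read-modify-write under the pte lock: if the entry is dirty or writable it is flushed, write-protected and marked clean, and the function reports that it changed something. A bit-level model of just that transformation, with invented flag values standing in for the architecture's pte bits:

#include <stdint.h>
#include <stdio.h>

#define PTE_PRESENT 0x1u
#define PTE_WRITE   0x2u
#define PTE_DIRTY   0x4u

/* Mirror of the dirty/writable test and the wrprotect + mkclean
 * rewrite; returns 1 when the entry actually needed cleaning. */
static int mkclean_one(uint32_t *pte)
{
	if (*pte & (PTE_DIRTY | PTE_WRITE)) {
		uint32_t entry = *pte;
		entry &= ~PTE_WRITE;    /* pte_wrprotect */
		entry &= ~PTE_DIRTY;    /* pte_mkclean */
		*pte = entry;           /* set_pte_at */
		return 1;
	}
	return 0;
}

int main(void)
{
	uint32_t pte = PTE_PRESENT | PTE_WRITE | PTE_DIRTY;
	int cleaned = mkclean_one(&pte);
	printf("cleaned=%d pte=%#x\n", cleaned, (unsigned)pte);
	cleaned = mkclean_one(&pte);    /* second pass finds nothing */
	printf("cleaned=%d pte=%#x\n", cleaned, (unsigned)pte);
	return 0;
}

The second call returns 0: once the entry is clean and write-protected, any new write will fault and re-dirty it, which is what keeps the accounting honest.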
511 | 515 | ||
512 | /** | 516 | /** |
513 | * __page_set_anon_rmap - setup new anonymous rmap | 517 | * __page_set_anon_rmap - setup new anonymous rmap |
514 | * @page: the page to add the mapping to | 518 | * @page: the page to add the mapping to |
515 | * @vma: the vm area in which the mapping is added | 519 | * @vma: the vm area in which the mapping is added |
516 | * @address: the user virtual address mapped | 520 | * @address: the user virtual address mapped |
517 | */ | 521 | */ |
518 | static void __page_set_anon_rmap(struct page *page, | 522 | static void __page_set_anon_rmap(struct page *page, |
519 | struct vm_area_struct *vma, unsigned long address) | 523 | struct vm_area_struct *vma, unsigned long address) |
520 | { | 524 | { |
521 | struct anon_vma *anon_vma = vma->anon_vma; | 525 | struct anon_vma *anon_vma = vma->anon_vma; |
522 | 526 | ||
523 | BUG_ON(!anon_vma); | 527 | BUG_ON(!anon_vma); |
524 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 528 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
525 | page->mapping = (struct address_space *) anon_vma; | 529 | page->mapping = (struct address_space *) anon_vma; |
526 | 530 | ||
527 | page->index = linear_page_index(vma, address); | 531 | page->index = linear_page_index(vma, address); |
528 | 532 | ||
529 | /* | 533 | /* |
530 | * nr_mapped state can be updated without turning off | 534 | * nr_mapped state can be updated without turning off |
531 | * interrupts because it is not modified via interrupt. | 535 | * interrupts because it is not modified via interrupt. |
532 | */ | 536 | */ |
533 | __inc_zone_page_state(page, NR_ANON_PAGES); | 537 | __inc_zone_page_state(page, NR_ANON_PAGES); |
534 | } | 538 | } |
535 | 539 | ||
536 | /** | 540 | /** |
537 | * __page_check_anon_rmap - sanity check anonymous rmap addition | 541 | * __page_check_anon_rmap - sanity check anonymous rmap addition |
538 | * @page: the page to add the mapping to | 542 | * @page: the page to add the mapping to |
539 | * @vma: the vm area in which the mapping is added | 543 | * @vma: the vm area in which the mapping is added |
540 | * @address: the user virtual address mapped | 544 | * @address: the user virtual address mapped |
541 | */ | 545 | */ |
542 | static void __page_check_anon_rmap(struct page *page, | 546 | static void __page_check_anon_rmap(struct page *page, |
543 | struct vm_area_struct *vma, unsigned long address) | 547 | struct vm_area_struct *vma, unsigned long address) |
544 | { | 548 | { |
545 | #ifdef CONFIG_DEBUG_VM | 549 | #ifdef CONFIG_DEBUG_VM |
546 | /* | 550 | /* |
547 | * The page's anon-rmap details (mapping and index) are guaranteed to | 551 | * The page's anon-rmap details (mapping and index) are guaranteed to |
548 | * be set up correctly at this point. | 552 | * be set up correctly at this point. |
549 | * | 553 | * |
550 | * We have exclusion against page_add_anon_rmap because the caller | 554 | * We have exclusion against page_add_anon_rmap because the caller |
551 | * always holds the page locked, except if called from page_dup_rmap, | 555 | * always holds the page locked, except if called from page_dup_rmap, |
552 | * in which case the page is already known to be setup. | 556 | * in which case the page is already known to be setup. |
553 | * | 557 | * |
554 | * We have exclusion against page_add_new_anon_rmap because those pages | 558 | * We have exclusion against page_add_new_anon_rmap because those pages |
555 | * are initially only visible via the pagetables, and the pte is locked | 559 | * are initially only visible via the pagetables, and the pte is locked |
556 | * over the call to page_add_new_anon_rmap. | 560 | * over the call to page_add_new_anon_rmap. |
557 | */ | 561 | */ |
558 | struct anon_vma *anon_vma = vma->anon_vma; | 562 | struct anon_vma *anon_vma = vma->anon_vma; |
559 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 563 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
560 | BUG_ON(page->mapping != (struct address_space *)anon_vma); | 564 | BUG_ON(page->mapping != (struct address_space *)anon_vma); |
561 | BUG_ON(page->index != linear_page_index(vma, address)); | 565 | BUG_ON(page->index != linear_page_index(vma, address)); |
562 | #endif | 566 | #endif |
563 | } | 567 | } |
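
Both helpers above lean on the same trick: page->mapping stores the anon_vma pointer with PAGE_MAPPING_ANON (the low bit) added in, so one field can hold either an address_space pointer or a tagged anon_vma. A stand-alone sketch of the tag-and-recover arithmetic; the struct here is a dummy and only the low-bit tagging matches the kernel's scheme:

#include <stdint.h>
#include <stdio.h>

#define PAGE_MAPPING_ANON 0x1   /* low pointer bit used as the tag */

struct anon_vma { int dummy; }; /* placeholder for the real struct */

int main(void)
{
	struct anon_vma avma;   /* suitably aligned, so bit 0 is free */

	/* Tag the pointer before storing it, as __page_set_anon_rmap
	 * does when it fills in page->mapping. */
	void *mapping = (void *)((uintptr_t)&avma | PAGE_MAPPING_ANON);

	/* PageAnon-style test: is the tag bit set? */
	int is_anon = ((uintptr_t)mapping & PAGE_MAPPING_ANON) != 0;

	/* Strip the tag to get the anon_vma back. */
	struct anon_vma *back = (struct anon_vma *)
		((uintptr_t)mapping & ~(uintptr_t)PAGE_MAPPING_ANON);

	printf("anon=%d match=%d\n", is_anon, back == &avma);
	return 0;
}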
564 | 568 | ||
565 | /** | 569 | /** |
566 | * page_add_anon_rmap - add pte mapping to an anonymous page | 570 | * page_add_anon_rmap - add pte mapping to an anonymous page |
567 | * @page: the page to add the mapping to | 571 | * @page: the page to add the mapping to |
568 | * @vma: the vm area in which the mapping is added | 572 | * @vma: the vm area in which the mapping is added |
569 | * @address: the user virtual address mapped | 573 | * @address: the user virtual address mapped |
570 | * | 574 | * |
571 | * The caller needs to hold the pte lock and the page must be locked. | 575 | * The caller needs to hold the pte lock and the page must be locked. |
572 | */ | 576 | */ |
573 | void page_add_anon_rmap(struct page *page, | 577 | void page_add_anon_rmap(struct page *page, |
574 | struct vm_area_struct *vma, unsigned long address) | 578 | struct vm_area_struct *vma, unsigned long address) |
575 | { | 579 | { |
576 | VM_BUG_ON(!PageLocked(page)); | 580 | VM_BUG_ON(!PageLocked(page)); |
577 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 581 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
578 | if (atomic_inc_and_test(&page->_mapcount)) | 582 | if (atomic_inc_and_test(&page->_mapcount)) |
579 | __page_set_anon_rmap(page, vma, address); | 583 | __page_set_anon_rmap(page, vma, address); |
580 | else | 584 | else |
581 | __page_check_anon_rmap(page, vma, address); | 585 | __page_check_anon_rmap(page, vma, address); |
582 | } | 586 | } |
583 | 587 | ||
584 | /** | 588 | /** |
585 | * page_add_new_anon_rmap - add pte mapping to a new anonymous page | 589 | * page_add_new_anon_rmap - add pte mapping to a new anonymous page |
586 | * @page: the page to add the mapping to | 590 | * @page: the page to add the mapping to |
587 | * @vma: the vm area in which the mapping is added | 591 | * @vma: the vm area in which the mapping is added |
588 | * @address: the user virtual address mapped | 592 | * @address: the user virtual address mapped |
589 | * | 593 | * |
590 | * Same as page_add_anon_rmap but must only be called on *new* pages. | 594 | * Same as page_add_anon_rmap but must only be called on *new* pages. |
591 | * This means the inc-and-test can be bypassed. | 595 | * This means the inc-and-test can be bypassed. |
592 | * Page does not have to be locked. | 596 | * Page does not have to be locked. |
593 | */ | 597 | */ |
594 | void page_add_new_anon_rmap(struct page *page, | 598 | void page_add_new_anon_rmap(struct page *page, |
595 | struct vm_area_struct *vma, unsigned long address) | 599 | struct vm_area_struct *vma, unsigned long address) |
596 | { | 600 | { |
597 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 601 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
598 | atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ | 602 | atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ |
599 | __page_set_anon_rmap(page, vma, address); | 603 | __page_set_anon_rmap(page, vma, address); |
600 | } | 604 | } |
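
The atomic_set to 0 here and the atomic_inc_and_test in page_add_anon_rmap encode the same convention: _mapcount starts at -1, so the increment that reaches 0 installs the first mapping, and (in page_remove_rmap below) the decrement that reaches -1 removes the last one. A C11-atomics model of that convention; inc_and_test and dec_negative are local stand-ins for the kernel's atomic_inc_and_test and atomic_add_negative:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* _mapcount convention: -1 means unmapped, 0 means mapped once. */
static atomic_int mapcount = -1;

/* Stand-in for atomic_inc_and_test: true if the new value is zero,
 * i.e. this increment installed the first mapping. */
static bool inc_and_test(atomic_int *v)
{
	return atomic_fetch_add(v, 1) + 1 == 0;
}

/* Stand-in for atomic_add_negative(-1, v): true if the new value is
 * negative, i.e. this decrement removed the last mapping. */
static bool dec_negative(atomic_int *v)
{
	return atomic_fetch_add(v, -1) - 1 < 0;
}

int main(void)
{
	printf("first map?  %d\n", inc_and_test(&mapcount)); /* 1 */
	printf("first map?  %d\n", inc_and_test(&mapcount)); /* 0 */
	printf("last unmap? %d\n", dec_negative(&mapcount)); /* 0 */
	printf("last unmap? %d\n", dec_negative(&mapcount)); /* 1 */
	return 0;
}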
601 | 605 | ||
602 | /** | 606 | /** |
603 | * page_add_file_rmap - add pte mapping to a file page | 607 | * page_add_file_rmap - add pte mapping to a file page |
604 | * @page: the page to add the mapping to | 608 | * @page: the page to add the mapping to |
605 | * | 609 | * |
606 | * The caller needs to hold the pte lock. | 610 | * The caller needs to hold the pte lock. |
607 | */ | 611 | */ |
608 | void page_add_file_rmap(struct page *page) | 612 | void page_add_file_rmap(struct page *page) |
609 | { | 613 | { |
610 | if (atomic_inc_and_test(&page->_mapcount)) | 614 | if (atomic_inc_and_test(&page->_mapcount)) |
611 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 615 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
612 | } | 616 | } |
613 | 617 | ||
614 | #ifdef CONFIG_DEBUG_VM | 618 | #ifdef CONFIG_DEBUG_VM |
615 | /** | 619 | /** |
616 | * page_dup_rmap - duplicate pte mapping to a page | 620 | * page_dup_rmap - duplicate pte mapping to a page |
617 | * @page: the page to add the mapping to | 621 | * @page: the page to add the mapping to |
618 | * @vma: the vm area being duplicated | 622 | * @vma: the vm area being duplicated |
619 | * @address: the user virtual address mapped | 623 | * @address: the user virtual address mapped |
620 | * | 624 | * |
621 | * For copy_page_range only: minimal extract from page_add_file_rmap / | 625 | * For copy_page_range only: minimal extract from page_add_file_rmap / |
622 | * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's | 626 | * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's |
623 | * quicker. | 627 | * quicker. |
624 | * | 628 | * |
625 | * The caller needs to hold the pte lock. | 629 | * The caller needs to hold the pte lock. |
626 | */ | 630 | */ |
627 | void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) | 631 | void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) |
628 | { | 632 | { |
629 | BUG_ON(page_mapcount(page) == 0); | 633 | BUG_ON(page_mapcount(page) == 0); |
630 | if (PageAnon(page)) | 634 | if (PageAnon(page)) |
631 | __page_check_anon_rmap(page, vma, address); | 635 | __page_check_anon_rmap(page, vma, address); |
632 | atomic_inc(&page->_mapcount); | 636 | atomic_inc(&page->_mapcount); |
633 | } | 637 | } |
634 | #endif | 638 | #endif |
635 | 639 | ||
636 | /** | 640 | /** |
637 | * page_remove_rmap - take down pte mapping from a page | 641 | * page_remove_rmap - take down pte mapping from a page |
638 | * @page: page to remove mapping from | 642 | * @page: page to remove mapping from |
639 | * @vma: the vm area in which the mapping is removed | 643 | * @vma: the vm area in which the mapping is removed |
640 | * | 644 | * |
641 | * The caller needs to hold the pte lock. | 645 | * The caller needs to hold the pte lock. |
642 | */ | 646 | */ |
643 | void page_remove_rmap(struct page *page, struct vm_area_struct *vma) | 647 | void page_remove_rmap(struct page *page, struct vm_area_struct *vma) |
644 | { | 648 | { |
645 | if (atomic_add_negative(-1, &page->_mapcount)) { | 649 | if (atomic_add_negative(-1, &page->_mapcount)) { |
646 | if (unlikely(page_mapcount(page) < 0)) { | 650 | if (unlikely(page_mapcount(page) < 0)) { |
647 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); | 651 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); |
648 | printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page)); | 652 | printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page)); |
649 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); | 653 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); |
650 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); | 654 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); |
651 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); | 655 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); |
652 | print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); | 656 | print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); |
653 | if (vma->vm_ops) { | 657 | if (vma->vm_ops) { |
654 | print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); | 658 | print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); |
655 | } | 659 | } |
656 | if (vma->vm_file && vma->vm_file->f_op) | 660 | if (vma->vm_file && vma->vm_file->f_op) |
657 | print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap); | 661 | print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap); |
658 | BUG(); | 662 | BUG(); |
659 | } | 663 | } |
660 | 664 | ||
661 | /* | 665 | /* |
662 | * Now that the last pte has gone, s390 must transfer dirty | 666 | * Now that the last pte has gone, s390 must transfer dirty |
663 | * flag from storage key to struct page. We can usually skip | 667 | * flag from storage key to struct page. We can usually skip |
664 | * this if the page is anon, so about to be freed; but perhaps | 668 | * this if the page is anon, so about to be freed; but perhaps |
665 | * not if it's in swapcache - there might be another pte slot | 669 | * not if it's in swapcache - there might be another pte slot |
666 | * containing the swap entry, but page not yet written to swap. | 670 | * containing the swap entry, but page not yet written to swap. |
667 | */ | 671 | */ |
668 | if ((!PageAnon(page) || PageSwapCache(page)) && | 672 | if ((!PageAnon(page) || PageSwapCache(page)) && |
669 | page_test_dirty(page)) { | 673 | page_test_dirty(page)) { |
670 | page_clear_dirty(page); | 674 | page_clear_dirty(page); |
671 | set_page_dirty(page); | 675 | set_page_dirty(page); |
672 | } | 676 | } |
673 | 677 | ||
674 | mem_cgroup_uncharge_page(page); | 678 | mem_cgroup_uncharge_page(page); |
675 | __dec_zone_page_state(page, | 679 | __dec_zone_page_state(page, |
676 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); | 680 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); |
677 | /* | 681 | /* |
678 | * It would be tidy to reset the PageAnon mapping here, | 682 | * It would be tidy to reset the PageAnon mapping here, |
679 | * but that might overwrite a racing page_add_anon_rmap | 683 | * but that might overwrite a racing page_add_anon_rmap |
680 | * which increments mapcount after us but sets mapping | 684 | * which increments mapcount after us but sets mapping |
681 | * before us: so leave the reset to free_hot_cold_page, | 685 | * before us: so leave the reset to free_hot_cold_page, |
682 | * and remember that it's only reliable while mapped. | 686 | * and remember that it's only reliable while mapped. |
683 | * Leaving it set also helps swapoff to reinstate ptes | 687 | * Leaving it set also helps swapoff to reinstate ptes |
684 | * faster for those pages still in swapcache. | 688 | * faster for those pages still in swapcache. |
685 | */ | 689 | */ |
686 | } | 690 | } |
687 | } | 691 | } |
688 | 692 | ||
689 | /* | 693 | /* |
690 | * Subfunctions of try_to_unmap: try_to_unmap_one called | 694 | * Subfunctions of try_to_unmap: try_to_unmap_one called |
691 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. | 695 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. |
692 | */ | 696 | */ |
693 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 697 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
694 | int migration) | 698 | int migration) |
695 | { | 699 | { |
696 | struct mm_struct *mm = vma->vm_mm; | 700 | struct mm_struct *mm = vma->vm_mm; |
697 | unsigned long address; | 701 | unsigned long address; |
698 | pte_t *pte; | 702 | pte_t *pte; |
699 | pte_t pteval; | 703 | pte_t pteval; |
700 | spinlock_t *ptl; | 704 | spinlock_t *ptl; |
701 | int ret = SWAP_AGAIN; | 705 | int ret = SWAP_AGAIN; |
702 | 706 | ||
703 | address = vma_address(page, vma); | 707 | address = vma_address(page, vma); |
704 | if (address == -EFAULT) | 708 | if (address == -EFAULT) |
705 | goto out; | 709 | goto out; |
706 | 710 | ||
707 | pte = page_check_address(page, mm, address, &ptl); | 711 | pte = page_check_address(page, mm, address, &ptl, 0); |
708 | if (!pte) | 712 | if (!pte) |
709 | goto out; | 713 | goto out; |
710 | 714 | ||
711 | /* | 715 | /* |
712 | * If the page is mlock()d, we cannot swap it out. | 716 | * If the page is mlock()d, we cannot swap it out. |
713 | * If it's recently referenced (perhaps page_referenced | 717 | * If it's recently referenced (perhaps page_referenced |
714 | * skipped over this mm) then we should reactivate it. | 718 | * skipped over this mm) then we should reactivate it. |
715 | */ | 719 | */ |
716 | if (!migration && ((vma->vm_flags & VM_LOCKED) || | 720 | if (!migration && ((vma->vm_flags & VM_LOCKED) || |
717 | (ptep_clear_flush_young_notify(vma, address, pte)))) { | 721 | (ptep_clear_flush_young_notify(vma, address, pte)))) { |
718 | ret = SWAP_FAIL; | 722 | ret = SWAP_FAIL; |
719 | goto out_unmap; | 723 | goto out_unmap; |
720 | } | 724 | } |
721 | 725 | ||
722 | /* Nuke the page table entry. */ | 726 | /* Nuke the page table entry. */ |
723 | flush_cache_page(vma, address, page_to_pfn(page)); | 727 | flush_cache_page(vma, address, page_to_pfn(page)); |
724 | pteval = ptep_clear_flush_notify(vma, address, pte); | 728 | pteval = ptep_clear_flush_notify(vma, address, pte); |
725 | 729 | ||
726 | /* Move the dirty bit to the physical page now the pte is gone. */ | 730 | /* Move the dirty bit to the physical page now the pte is gone. */ |
727 | if (pte_dirty(pteval)) | 731 | if (pte_dirty(pteval)) |
728 | set_page_dirty(page); | 732 | set_page_dirty(page); |
729 | 733 | ||
730 | /* Update high watermark before we lower rss */ | 734 | /* Update high watermark before we lower rss */ |
731 | update_hiwater_rss(mm); | 735 | update_hiwater_rss(mm); |
732 | 736 | ||
733 | if (PageAnon(page)) { | 737 | if (PageAnon(page)) { |
734 | swp_entry_t entry = { .val = page_private(page) }; | 738 | swp_entry_t entry = { .val = page_private(page) }; |
735 | 739 | ||
736 | if (PageSwapCache(page)) { | 740 | if (PageSwapCache(page)) { |
737 | /* | 741 | /* |
738 | * Store the swap location in the pte. | 742 | * Store the swap location in the pte. |
739 | * See handle_pte_fault() ... | 743 | * See handle_pte_fault() ... |
740 | */ | 744 | */ |
741 | swap_duplicate(entry); | 745 | swap_duplicate(entry); |
742 | if (list_empty(&mm->mmlist)) { | 746 | if (list_empty(&mm->mmlist)) { |
743 | spin_lock(&mmlist_lock); | 747 | spin_lock(&mmlist_lock); |
744 | if (list_empty(&mm->mmlist)) | 748 | if (list_empty(&mm->mmlist)) |
745 | list_add(&mm->mmlist, &init_mm.mmlist); | 749 | list_add(&mm->mmlist, &init_mm.mmlist); |
746 | spin_unlock(&mmlist_lock); | 750 | spin_unlock(&mmlist_lock); |
747 | } | 751 | } |
748 | dec_mm_counter(mm, anon_rss); | 752 | dec_mm_counter(mm, anon_rss); |
749 | #ifdef CONFIG_MIGRATION | 753 | #ifdef CONFIG_MIGRATION |
750 | } else { | 754 | } else { |
751 | /* | 755 | /* |
752 | * Store the pfn of the page in a special migration | 756 | * Store the pfn of the page in a special migration |
753 | * pte. do_swap_page() will wait until the migration | 757 | * pte. do_swap_page() will wait until the migration |
754 | * pte is removed and then restart fault handling. | 758 | * pte is removed and then restart fault handling. |
755 | */ | 759 | */ |
756 | BUG_ON(!migration); | 760 | BUG_ON(!migration); |
757 | entry = make_migration_entry(page, pte_write(pteval)); | 761 | entry = make_migration_entry(page, pte_write(pteval)); |
758 | #endif | 762 | #endif |
759 | } | 763 | } |
760 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 764 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
761 | BUG_ON(pte_file(*pte)); | 765 | BUG_ON(pte_file(*pte)); |
762 | } else | 766 | } else |
763 | #ifdef CONFIG_MIGRATION | 767 | #ifdef CONFIG_MIGRATION |
764 | if (migration) { | 768 | if (migration) { |
765 | /* Establish migration entry for a file page */ | 769 | /* Establish migration entry for a file page */ |
766 | swp_entry_t entry; | 770 | swp_entry_t entry; |
767 | entry = make_migration_entry(page, pte_write(pteval)); | 771 | entry = make_migration_entry(page, pte_write(pteval)); |
768 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 772 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
769 | } else | 773 | } else |
770 | #endif | 774 | #endif |
771 | dec_mm_counter(mm, file_rss); | 775 | dec_mm_counter(mm, file_rss); |
772 | 776 | ||
773 | 777 | ||
774 | page_remove_rmap(page, vma); | 778 | page_remove_rmap(page, vma); |
775 | page_cache_release(page); | 779 | page_cache_release(page); |
776 | 780 | ||
777 | out_unmap: | 781 | out_unmap: |
778 | pte_unmap_unlock(pte, ptl); | 782 | pte_unmap_unlock(pte, ptl); |
779 | out: | 783 | out: |
780 | return ret; | 784 | return ret; |
781 | } | 785 | } |
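
For an anonymous page the code above replaces the pte with a swap (or migration) entry: a not-present value that encodes where the contents can be found again. A toy encoding to make that concrete; the helper names mirror the kernel's, but the bit layout is invented here, since the real one is per-architecture:

#include <stdint.h>
#include <stdio.h>

/* Toy layout: bit 0 clear marks the entry not present, bits 1..6 hold
 * the swap type, the remaining bits the offset.  Widths are invented. */
#define SWP_TYPE_BITS 6

static uint64_t swp_entry_to_pte(unsigned int type, uint64_t offset)
{
	return (offset << (SWP_TYPE_BITS + 1)) | ((uint64_t)type << 1);
}

static unsigned int swp_type(uint64_t pte)
{
	return (pte >> 1) & ((1u << SWP_TYPE_BITS) - 1);
}

static uint64_t swp_offset(uint64_t pte)
{
	return pte >> (SWP_TYPE_BITS + 1);
}

int main(void)
{
	uint64_t pte = swp_entry_to_pte(2, 0x1234);
	printf("present=%d type=%u offset=%#llx\n",
	       (int)(pte & 1), swp_type(pte),
	       (unsigned long long)swp_offset(pte));
	return 0;
}

Because bit 0 stays clear, a later fault on this address sees a not-present entry, decodes it, and knows which swap device and slot to read back.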
782 | 786 | ||
783 | /* | 787 | /* |
784 | * objrmap doesn't work for nonlinear VMAs because the assumption that | 788 | * objrmap doesn't work for nonlinear VMAs because the assumption that |
785 | * offset-into-file correlates with offset-into-virtual-addresses does not hold. | 789 | * offset-into-file correlates with offset-into-virtual-addresses does not hold. |
786 | * Consequently, given a particular page and its ->index, we cannot locate the | 790 | * Consequently, given a particular page and its ->index, we cannot locate the |
787 | * ptes which are mapping that page without an exhaustive linear search. | 791 | * ptes which are mapping that page without an exhaustive linear search. |
788 | * | 792 | * |
789 | * So what this code does is a mini "virtual scan" of each nonlinear VMA which | 793 | * So what this code does is a mini "virtual scan" of each nonlinear VMA which |
790 | * maps the file to which the target page belongs. The ->vm_private_data field | 794 | * maps the file to which the target page belongs. The ->vm_private_data field |
791 | * holds the current cursor into that scan. Successive searches will circulate | 795 | * holds the current cursor into that scan. Successive searches will circulate |
792 | * around the vma's virtual address space. | 796 | * around the vma's virtual address space. |
793 | * | 797 | * |
794 | * So as more replacement pressure is applied to the pages in a nonlinear VMA, | 798 | * So as more replacement pressure is applied to the pages in a nonlinear VMA, |
795 | * more scanning pressure is placed against them as well. Eventually pages | 799 | * more scanning pressure is placed against them as well. Eventually pages |
796 | * will become fully unmapped and are eligible for eviction. | 800 | * will become fully unmapped and are eligible for eviction. |
797 | * | 801 | * |
798 | * For very sparsely populated VMAs this is a little inefficient - chances are | 802 | * For very sparsely populated VMAs this is a little inefficient - chances are |
799 | there won't be many ptes located within the scan cluster. In this case | 803 | there won't be many ptes located within the scan cluster. In this case |
800 | * maybe we could scan further - to the end of the pte page, perhaps. | 804 | * maybe we could scan further - to the end of the pte page, perhaps. |
801 | */ | 805 | */ |
802 | #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) | 806 | #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) |
803 | #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) | 807 | #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) |
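
try_to_unmap_cluster below turns the per-vma cursor into a cluster-aligned window and clamps it to the vma before walking the ptes. The same arithmetic in stand-alone form, assuming a 4 KiB page and dropping the PMD_SIZE clamp from CLUSTER_SIZE for simplicity; the addresses are invented:

#include <stdio.h>

#define PAGE_SIZE    4096UL     /* assumed for the sketch */
#define CLUSTER_SIZE (32 * PAGE_SIZE)
#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))

int main(void)
{
	unsigned long vm_start = 0x10001000UL, vm_end = 0x10060000UL;
	unsigned long cursor   = 0x0002a000UL;

	/* Align the window to a cluster boundary, then clamp it so the
	 * scan never leaves [vm_start, vm_end). */
	unsigned long address = (vm_start + cursor) & CLUSTER_MASK;
	unsigned long end = address + CLUSTER_SIZE;

	if (address < vm_start)
		address = vm_start;
	if (end > vm_end)
		end = vm_end;

	printf("scan window [%#lx, %#lx)\n", address, end);
	return 0;
}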
804 | 808 | ||
805 | static void try_to_unmap_cluster(unsigned long cursor, | 809 | static void try_to_unmap_cluster(unsigned long cursor, |
806 | unsigned int *mapcount, struct vm_area_struct *vma) | 810 | unsigned int *mapcount, struct vm_area_struct *vma) |
807 | { | 811 | { |
808 | struct mm_struct *mm = vma->vm_mm; | 812 | struct mm_struct *mm = vma->vm_mm; |
809 | pgd_t *pgd; | 813 | pgd_t *pgd; |
810 | pud_t *pud; | 814 | pud_t *pud; |
811 | pmd_t *pmd; | 815 | pmd_t *pmd; |
812 | pte_t *pte; | 816 | pte_t *pte; |
813 | pte_t pteval; | 817 | pte_t pteval; |
814 | spinlock_t *ptl; | 818 | spinlock_t *ptl; |
815 | struct page *page; | 819 | struct page *page; |
816 | unsigned long address; | 820 | unsigned long address; |
817 | unsigned long end; | 821 | unsigned long end; |
818 | 822 | ||
819 | address = (vma->vm_start + cursor) & CLUSTER_MASK; | 823 | address = (vma->vm_start + cursor) & CLUSTER_MASK; |
820 | end = address + CLUSTER_SIZE; | 824 | end = address + CLUSTER_SIZE; |
821 | if (address < vma->vm_start) | 825 | if (address < vma->vm_start) |
822 | address = vma->vm_start; | 826 | address = vma->vm_start; |
823 | if (end > vma->vm_end) | 827 | if (end > vma->vm_end) |
824 | end = vma->vm_end; | 828 | end = vma->vm_end; |
825 | 829 | ||
826 | pgd = pgd_offset(mm, address); | 830 | pgd = pgd_offset(mm, address); |
827 | if (!pgd_present(*pgd)) | 831 | if (!pgd_present(*pgd)) |
828 | return; | 832 | return; |
829 | 833 | ||
830 | pud = pud_offset(pgd, address); | 834 | pud = pud_offset(pgd, address); |
831 | if (!pud_present(*pud)) | 835 | if (!pud_present(*pud)) |
832 | return; | 836 | return; |
833 | 837 | ||
834 | pmd = pmd_offset(pud, address); | 838 | pmd = pmd_offset(pud, address); |
835 | if (!pmd_present(*pmd)) | 839 | if (!pmd_present(*pmd)) |
836 | return; | 840 | return; |
837 | 841 | ||
838 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 842 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
839 | 843 | ||
840 | /* Update high watermark before we lower rss */ | 844 | /* Update high watermark before we lower rss */ |
841 | update_hiwater_rss(mm); | 845 | update_hiwater_rss(mm); |
842 | 846 | ||
843 | for (; address < end; pte++, address += PAGE_SIZE) { | 847 | for (; address < end; pte++, address += PAGE_SIZE) { |
844 | if (!pte_present(*pte)) | 848 | if (!pte_present(*pte)) |
845 | continue; | 849 | continue; |
846 | page = vm_normal_page(vma, address, *pte); | 850 | page = vm_normal_page(vma, address, *pte); |
847 | BUG_ON(!page || PageAnon(page)); | 851 | BUG_ON(!page || PageAnon(page)); |
848 | 852 | ||
849 | if (ptep_clear_flush_young_notify(vma, address, pte)) | 853 | if (ptep_clear_flush_young_notify(vma, address, pte)) |
850 | continue; | 854 | continue; |
851 | 855 | ||
852 | /* Nuke the page table entry. */ | 856 | /* Nuke the page table entry. */ |
853 | flush_cache_page(vma, address, pte_pfn(*pte)); | 857 | flush_cache_page(vma, address, pte_pfn(*pte)); |
854 | pteval = ptep_clear_flush_notify(vma, address, pte); | 858 | pteval = ptep_clear_flush_notify(vma, address, pte); |
855 | 859 | ||
856 | /* If nonlinear, store the file page offset in the pte. */ | 860 | /* If nonlinear, store the file page offset in the pte. */ |
857 | if (page->index != linear_page_index(vma, address)) | 861 | if (page->index != linear_page_index(vma, address)) |
858 | set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); | 862 | set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); |
859 | 863 | ||
860 | /* Move the dirty bit to the physical page now the pte is gone. */ | 864 | /* Move the dirty bit to the physical page now the pte is gone. */ |
861 | if (pte_dirty(pteval)) | 865 | if (pte_dirty(pteval)) |
862 | set_page_dirty(page); | 866 | set_page_dirty(page); |
863 | 867 | ||
864 | page_remove_rmap(page, vma); | 868 | page_remove_rmap(page, vma); |
865 | page_cache_release(page); | 869 | page_cache_release(page); |
866 | dec_mm_counter(mm, file_rss); | 870 | dec_mm_counter(mm, file_rss); |
867 | (*mapcount)--; | 871 | (*mapcount)--; |
868 | } | 872 | } |
869 | pte_unmap_unlock(pte - 1, ptl); | 873 | pte_unmap_unlock(pte - 1, ptl); |
870 | } | 874 | } |
871 | 875 | ||
872 | static int try_to_unmap_anon(struct page *page, int migration) | 876 | static int try_to_unmap_anon(struct page *page, int migration) |
873 | { | 877 | { |
874 | struct anon_vma *anon_vma; | 878 | struct anon_vma *anon_vma; |
875 | struct vm_area_struct *vma; | 879 | struct vm_area_struct *vma; |
876 | int ret = SWAP_AGAIN; | 880 | int ret = SWAP_AGAIN; |
877 | 881 | ||
878 | anon_vma = page_lock_anon_vma(page); | 882 | anon_vma = page_lock_anon_vma(page); |
879 | if (!anon_vma) | 883 | if (!anon_vma) |
880 | return ret; | 884 | return ret; |
881 | 885 | ||
882 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 886 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
883 | ret = try_to_unmap_one(page, vma, migration); | 887 | ret = try_to_unmap_one(page, vma, migration); |
884 | if (ret == SWAP_FAIL || !page_mapped(page)) | 888 | if (ret == SWAP_FAIL || !page_mapped(page)) |
885 | break; | 889 | break; |
886 | } | 890 | } |
887 | 891 | ||
888 | page_unlock_anon_vma(anon_vma); | 892 | page_unlock_anon_vma(anon_vma); |
889 | return ret; | 893 | return ret; |
890 | } | 894 | } |
891 | 895 | ||
892 | /** | 896 | /** |
893 | * try_to_unmap_file - unmap file page using the object-based rmap method | 897 | * try_to_unmap_file - unmap file page using the object-based rmap method |
894 | * @page: the page to unmap | 898 | * @page: the page to unmap |
895 | * @migration: migration flag | 899 | * @migration: migration flag |
896 | * | 900 | * |
897 | * Find all the mappings of a page using the mapping pointer and the vma chains | 901 | * Find all the mappings of a page using the mapping pointer and the vma chains |
898 | * contained in the address_space struct it points to. | 902 | * contained in the address_space struct it points to. |
899 | * | 903 | * |
900 | * This function is only called from try_to_unmap for object-based pages. | 904 | * This function is only called from try_to_unmap for object-based pages. |
901 | */ | 905 | */ |
902 | static int try_to_unmap_file(struct page *page, int migration) | 906 | static int try_to_unmap_file(struct page *page, int migration) |
903 | { | 907 | { |
904 | struct address_space *mapping = page->mapping; | 908 | struct address_space *mapping = page->mapping; |
905 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 909 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
906 | struct vm_area_struct *vma; | 910 | struct vm_area_struct *vma; |
907 | struct prio_tree_iter iter; | 911 | struct prio_tree_iter iter; |
908 | int ret = SWAP_AGAIN; | 912 | int ret = SWAP_AGAIN; |
909 | unsigned long cursor; | 913 | unsigned long cursor; |
910 | unsigned long max_nl_cursor = 0; | 914 | unsigned long max_nl_cursor = 0; |
911 | unsigned long max_nl_size = 0; | 915 | unsigned long max_nl_size = 0; |
912 | unsigned int mapcount; | 916 | unsigned int mapcount; |
913 | 917 | ||
914 | spin_lock(&mapping->i_mmap_lock); | 918 | spin_lock(&mapping->i_mmap_lock); |
915 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 919 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
916 | ret = try_to_unmap_one(page, vma, migration); | 920 | ret = try_to_unmap_one(page, vma, migration); |
917 | if (ret == SWAP_FAIL || !page_mapped(page)) | 921 | if (ret == SWAP_FAIL || !page_mapped(page)) |
918 | goto out; | 922 | goto out; |
919 | } | 923 | } |
920 | 924 | ||
921 | if (list_empty(&mapping->i_mmap_nonlinear)) | 925 | if (list_empty(&mapping->i_mmap_nonlinear)) |
922 | goto out; | 926 | goto out; |
923 | 927 | ||
924 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 928 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
925 | shared.vm_set.list) { | 929 | shared.vm_set.list) { |
926 | if ((vma->vm_flags & VM_LOCKED) && !migration) | 930 | if ((vma->vm_flags & VM_LOCKED) && !migration) |
927 | continue; | 931 | continue; |
928 | cursor = (unsigned long) vma->vm_private_data; | 932 | cursor = (unsigned long) vma->vm_private_data; |
929 | if (cursor > max_nl_cursor) | 933 | if (cursor > max_nl_cursor) |
930 | max_nl_cursor = cursor; | 934 | max_nl_cursor = cursor; |
931 | cursor = vma->vm_end - vma->vm_start; | 935 | cursor = vma->vm_end - vma->vm_start; |
932 | if (cursor > max_nl_size) | 936 | if (cursor > max_nl_size) |
933 | max_nl_size = cursor; | 937 | max_nl_size = cursor; |
934 | } | 938 | } |
935 | 939 | ||
936 | if (max_nl_size == 0) { /* any nonlinears locked or reserved */ | 940 | if (max_nl_size == 0) { /* any nonlinears locked or reserved */ |
937 | ret = SWAP_FAIL; | 941 | ret = SWAP_FAIL; |
938 | goto out; | 942 | goto out; |
939 | } | 943 | } |
940 | 944 | ||
941 | /* | 945 | /* |
942 | * We don't try to search for this page in the nonlinear vmas, | 946 | * We don't try to search for this page in the nonlinear vmas, |
943 | * and page_referenced wouldn't have found it anyway. Instead | 947 | * and page_referenced wouldn't have found it anyway. Instead |
944 | * just walk the nonlinear vmas trying to age and unmap some. | 948 | * just walk the nonlinear vmas trying to age and unmap some. |
945 | * The mapcount of the page we came in with is irrelevant, | 949 | * The mapcount of the page we came in with is irrelevant, |
946 | * but even so use it as a guide to how hard we should try? | 950 | * but even so use it as a guide to how hard we should try? |
947 | */ | 951 | */ |
948 | mapcount = page_mapcount(page); | 952 | mapcount = page_mapcount(page); |
949 | if (!mapcount) | 953 | if (!mapcount) |
950 | goto out; | 954 | goto out; |
951 | cond_resched_lock(&mapping->i_mmap_lock); | 955 | cond_resched_lock(&mapping->i_mmap_lock); |
952 | 956 | ||
953 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; | 957 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; |
954 | if (max_nl_cursor == 0) | 958 | if (max_nl_cursor == 0) |
955 | max_nl_cursor = CLUSTER_SIZE; | 959 | max_nl_cursor = CLUSTER_SIZE; |
956 | 960 | ||
957 | do { | 961 | do { |
958 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 962 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
959 | shared.vm_set.list) { | 963 | shared.vm_set.list) { |
960 | if ((vma->vm_flags & VM_LOCKED) && !migration) | 964 | if ((vma->vm_flags & VM_LOCKED) && !migration) |
961 | continue; | 965 | continue; |
962 | cursor = (unsigned long) vma->vm_private_data; | 966 | cursor = (unsigned long) vma->vm_private_data; |
963 | while ( cursor < max_nl_cursor && | 967 | while ( cursor < max_nl_cursor && |
964 | cursor < vma->vm_end - vma->vm_start) { | 968 | cursor < vma->vm_end - vma->vm_start) { |
965 | try_to_unmap_cluster(cursor, &mapcount, vma); | 969 | try_to_unmap_cluster(cursor, &mapcount, vma); |
966 | cursor += CLUSTER_SIZE; | 970 | cursor += CLUSTER_SIZE; |
967 | vma->vm_private_data = (void *) cursor; | 971 | vma->vm_private_data = (void *) cursor; |
968 | if ((int)mapcount <= 0) | 972 | if ((int)mapcount <= 0) |
969 | goto out; | 973 | goto out; |
970 | } | 974 | } |
971 | vma->vm_private_data = (void *) max_nl_cursor; | 975 | vma->vm_private_data = (void *) max_nl_cursor; |
972 | } | 976 | } |
973 | cond_resched_lock(&mapping->i_mmap_lock); | 977 | cond_resched_lock(&mapping->i_mmap_lock); |
974 | max_nl_cursor += CLUSTER_SIZE; | 978 | max_nl_cursor += CLUSTER_SIZE; |
975 | } while (max_nl_cursor <= max_nl_size); | 979 | } while (max_nl_cursor <= max_nl_size); |
976 | 980 | ||
977 | /* | 981 | /* |
978 | * Don't loop forever (perhaps all the remaining pages are | 982 | * Don't loop forever (perhaps all the remaining pages are |
979 | * in locked vmas). Reset cursor on all unreserved nonlinear | 983 | * in locked vmas). Reset cursor on all unreserved nonlinear |
980 | * vmas, now forgetting on which ones it had fallen behind. | 984 | * vmas, now forgetting on which ones it had fallen behind. |
981 | */ | 985 | */ |
982 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | 986 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) |
983 | vma->vm_private_data = NULL; | 987 | vma->vm_private_data = NULL; |
984 | out: | 988 | out: |
985 | spin_unlock(&mapping->i_mmap_lock); | 989 | spin_unlock(&mapping->i_mmap_lock); |
986 | return ret; | 990 | return ret; |
987 | } | 991 | } |
988 | 992 | ||
989 | /** | 993 | /** |
990 | * try_to_unmap - try to remove all page table mappings to a page | 994 | * try_to_unmap - try to remove all page table mappings to a page |
991 | * @page: the page to get unmapped | 995 | * @page: the page to get unmapped |
992 | * @migration: migration flag | 996 | * @migration: migration flag |
993 | * | 997 | * |
994 | * Tries to remove all the page table entries which are mapping this | 998 | * Tries to remove all the page table entries which are mapping this |
995 | * page, used in the pageout path. Caller must hold the page lock. | 999 | * page, used in the pageout path. Caller must hold the page lock. |
996 | * Return values are: | 1000 | * Return values are: |
997 | * | 1001 | * |
998 | * SWAP_SUCCESS - we succeeded in removing all mappings | 1002 | * SWAP_SUCCESS - we succeeded in removing all mappings |
999 | * SWAP_AGAIN - we missed a mapping, try again later | 1003 | * SWAP_AGAIN - we missed a mapping, try again later |
1000 | * SWAP_FAIL - the page is unswappable | 1004 | * SWAP_FAIL - the page is unswappable |
1001 | */ | 1005 | */ |
1002 | int try_to_unmap(struct page *page, int migration) | 1006 | int try_to_unmap(struct page *page, int migration) |
1003 | { | 1007 | { |
1004 | int ret; | 1008 | int ret; |
1005 | 1009 | ||
1006 | BUG_ON(!PageLocked(page)); | 1010 | BUG_ON(!PageLocked(page)); |
1007 | 1011 | ||
1008 | if (PageAnon(page)) | 1012 | if (PageAnon(page)) |
1009 | ret = try_to_unmap_anon(page, migration); | 1013 | ret = try_to_unmap_anon(page, migration); |
1010 | else | 1014 | else |
1011 | ret = try_to_unmap_file(page, migration); | 1015 | ret = try_to_unmap_file(page, migration); |
1012 | 1016 | ||
1013 | if (!page_mapped(page)) | 1017 | if (!page_mapped(page)) |
1014 | ret = SWAP_SUCCESS; | 1018 | ret = SWAP_SUCCESS; |
1015 | return ret; | 1019 | return ret; |
1016 | } | 1020 | } |
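
A caller in the pageout path would dispatch on the three documented return values. A sketch of that dispatch; the constants are redeclared locally and the verdict strings are illustrative, not taken from the kernel:

#include <stdio.h>

/* Local redeclaration of the documented return values; the numeric
 * values here are arbitrary stand-ins. */
enum { SWAP_SUCCESS, SWAP_AGAIN, SWAP_FAIL };

static const char *unmap_verdict(int ret)
{
	switch (ret) {
	case SWAP_SUCCESS:
		return "all mappings gone, page can be written out";
	case SWAP_AGAIN:
		return "missed a mapping, retry on a later pass";
	case SWAP_FAIL:
		return "unswappable, reactivate the page";
	default:
		return "unexpected";
	}
}

int main(void)
{
	printf("SWAP_AGAIN -> %s\n", unmap_verdict(SWAP_AGAIN));
	return 0;
}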
1017 | 1021 | ||
1018 | 1022 |