Commit 204ec841fbea3e5138168edbc3a76d46747cc987

Authored by Peter Zijlstra
Committed by Linus Torvalds
1 parent ee6a645788

[PATCH] mm: msync() cleanup

Now that dirty-page tracking is properly done, msync doesn't need to scan the
PTEs anymore to determine the dirty status.
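
For context, a minimal userspace sketch (illustrative only, not part of this
patch; the filename and one-page size are arbitrary assumptions) of the usage
model this relies on: since MS_ASYNC no longer starts I/O or touches dirty
bits, an application that wants the data on disk follows msync() with fsync():

/* Illustrative userspace sketch, not part of this patch: with kernel-side
 * dirty-page tracking, msync(MS_ASYNC) starts no I/O, so an application
 * that wants the data on disk follows up with fsync() (or uses MS_SYNC).
 * "data.bin" and the one-page size are arbitrary example values.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("data.bin", O_RDWR | O_CREAT, 0644);
	if (fd < 0 || ftruncate(fd, 4096) < 0) {
		perror("open/ftruncate");
		return 1;
	}

	char *map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memcpy(map, "hello", 5);	/* dirties the shared page */

	/* MS_ASYNC: no I/O is started; the kernel already tracks the dirty page. */
	if (msync(map, 4096, MS_ASYNC) < 0)
		perror("msync");

	/* To write out and wait on the result, use fsync() (MS_SYNC would
	 * do the same via do_fsync). */
	if (fsync(fd) < 0)
		perror("fsync");

	munmap(map, 4096);
	close(fd);
	return 0;
}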

From: Hugh Dickins <hugh@veritas.com>

In looking to do that, I made some other tidyups: several #includes can be
removed, and sys_msync's loop termination was not quite right.

Most of those points are criticisms of the existing sys_msync, not of your
patch.  In particular, the loop termination errors were introduced in 2.6.17:
I did notice this shortly before it came out, but decided I was more likely to
get it wrong myself, and make matters worse if I tried to rush a last-minute
fix in.  And it's not terribly likely to go wrong, nor disastrous if it does
go wrong (it may miss reporting an unmapped area; it may also fsync the file
of a following vma).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 1 changed file with 33 additions and 163 deletions

... ... @@ -7,149 +7,33 @@
7 7 /*
8 8 * The msync() system call.
9 9 */
10   -#include <linux/slab.h>
11   -#include <linux/pagemap.h>
12 10 #include <linux/fs.h>
13 11 #include <linux/mm.h>
14 12 #include <linux/mman.h>
15   -#include <linux/hugetlb.h>
16   -#include <linux/writeback.h>
17 13 #include <linux/file.h>
18 14 #include <linux/syscalls.h>
19 15  
20   -#include <asm/pgtable.h>
21   -#include <asm/tlbflush.h>
22   -
23   -static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
24   - unsigned long addr, unsigned long end)
25   -{
26   - pte_t *pte;
27   - spinlock_t *ptl;
28   - int progress = 0;
29   - unsigned long ret = 0;
30   -
31   -again:
32   - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
33   - do {
34   - struct page *page;
35   -
36   - if (progress >= 64) {
37   - progress = 0;
38   - if (need_resched() || need_lockbreak(ptl))
39   - break;
40   - }
41   - progress++;
42   - if (!pte_present(*pte))
43   - continue;
44   - if (!pte_maybe_dirty(*pte))
45   - continue;
46   - page = vm_normal_page(vma, addr, *pte);
47   - if (!page)
48   - continue;
49   - if (ptep_clear_flush_dirty(vma, addr, pte) ||
50   - page_test_and_clear_dirty(page))
51   - ret += set_page_dirty(page);
52   - progress += 3;
53   - } while (pte++, addr += PAGE_SIZE, addr != end);
54   - pte_unmap_unlock(pte - 1, ptl);
55   - cond_resched();
56   - if (addr != end)
57   - goto again;
58   - return ret;
59   -}
60   -
61   -static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
62   - pud_t *pud, unsigned long addr, unsigned long end)
63   -{
64   - pmd_t *pmd;
65   - unsigned long next;
66   - unsigned long ret = 0;
67   -
68   - pmd = pmd_offset(pud, addr);
69   - do {
70   - next = pmd_addr_end(addr, end);
71   - if (pmd_none_or_clear_bad(pmd))
72   - continue;
73   - ret += msync_pte_range(vma, pmd, addr, next);
74   - } while (pmd++, addr = next, addr != end);
75   - return ret;
76   -}
77   -
78   -static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
79   - pgd_t *pgd, unsigned long addr, unsigned long end)
80   -{
81   - pud_t *pud;
82   - unsigned long next;
83   - unsigned long ret = 0;
84   -
85   - pud = pud_offset(pgd, addr);
86   - do {
87   - next = pud_addr_end(addr, end);
88   - if (pud_none_or_clear_bad(pud))
89   - continue;
90   - ret += msync_pmd_range(vma, pud, addr, next);
91   - } while (pud++, addr = next, addr != end);
92   - return ret;
93   -}
94   -
95   -static unsigned long msync_page_range(struct vm_area_struct *vma,
96   - unsigned long addr, unsigned long end)
97   -{
98   - pgd_t *pgd;
99   - unsigned long next;
100   - unsigned long ret = 0;
101   -
102   - /* For hugepages we can't go walking the page table normally,
103   - * but that's ok, hugetlbfs is memory based, so we don't need
104   - * to do anything more on an msync().
105   - */
106   - if (vma->vm_flags & VM_HUGETLB)
107   - return 0;
108   -
109   - BUG_ON(addr >= end);
110   - pgd = pgd_offset(vma->vm_mm, addr);
111   - flush_cache_range(vma, addr, end);
112   - do {
113   - next = pgd_addr_end(addr, end);
114   - if (pgd_none_or_clear_bad(pgd))
115   - continue;
116   - ret += msync_pud_range(vma, pgd, addr, next);
117   - } while (pgd++, addr = next, addr != end);
118   - return ret;
119   -}
120   -
121 16 /*
122 17 * MS_SYNC syncs the entire file - including mappings.
123 18 *
124   - * MS_ASYNC does not start I/O (it used to, up to 2.5.67). Instead, it just
125   - * marks the relevant pages dirty. The application may now run fsync() to
  19 + * MS_ASYNC does not start I/O (it used to, up to 2.5.67).
  20 + * Nor does it mark the relevant pages dirty (it used to, up to 2.6.17).
  21 + * Now it doesn't do anything, since dirty pages are properly tracked.
  22 + *
  23 + * The application may now run fsync() to
126 24 * write out the dirty pages and wait on the writeout and check the result.
127 25 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
128 26 * async writeout immediately.
129 27 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
130 28 * applications.
131 29 */
132   -static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
133   - unsigned long end, int flags,
134   - unsigned long *nr_pages_dirtied)
135   -{
136   - struct file *file = vma->vm_file;
137   -
138   - if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
139   - return -EBUSY;
140   -
141   - if (file && (vma->vm_flags & VM_SHARED))
142   - *nr_pages_dirtied = msync_page_range(vma, addr, end);
143   - return 0;
144   -}
145   -
146 30 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
147 31 {
148 32 unsigned long end;
  33 + struct mm_struct *mm = current->mm;
149 34 struct vm_area_struct *vma;
150 35 int unmapped_error = 0;
151 36 int error = -EINVAL;
152   - int done = 0;
153 37  
154 38 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
155 39 goto out;
156 40  
... ... @@ -169,65 +53,51 @@
169 53 * If the interval [start,end) covers some unmapped address ranges,
170 54 * just ignore them, but return -ENOMEM at the end.
171 55 */
172   - down_read(&current->mm->mmap_sem);
173   - vma = find_vma(current->mm, start);
174   - if (!vma) {
175   - error = -ENOMEM;
176   - goto out_unlock;
177   - }
178   - do {
179   - unsigned long nr_pages_dirtied = 0;
  56 + down_read(&mm->mmap_sem);
  57 + vma = find_vma(mm, start);
  58 + for (;;) {
180 59 struct file *file;
181 60  
  61 + /* Still start < end. */
  62 + error = -ENOMEM;
  63 + if (!vma)
  64 + goto out_unlock;
182 65 /* Here start < vma->vm_end. */
183 66 if (start < vma->vm_start) {
184   - unmapped_error = -ENOMEM;
185 67 start = vma->vm_start;
  68 + if (start >= end)
  69 + goto out_unlock;
  70 + unmapped_error = -ENOMEM;
186 71 }
187 72 /* Here vma->vm_start <= start < vma->vm_end. */
188   - if (end <= vma->vm_end) {
189   - if (start < end) {
190   - error = msync_interval(vma, start, end, flags,
191   - &nr_pages_dirtied);
192   - if (error)
193   - goto out_unlock;
194   - }
195   - error = unmapped_error;
196   - done = 1;
197   - } else {
198   - /* Here vma->vm_start <= start < vma->vm_end < end. */
199   - error = msync_interval(vma, start, vma->vm_end, flags,
200   - &nr_pages_dirtied);
201   - if (error)
202   - goto out_unlock;
  73 + if ((flags & MS_INVALIDATE) &&
  74 + (vma->vm_flags & VM_LOCKED)) {
  75 + error = -EBUSY;
  76 + goto out_unlock;
203 77 }
204 78 file = vma->vm_file;
205 79 start = vma->vm_end;
206   - if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
207   - get_file(file);
208   - up_read(&current->mm->mmap_sem);
209   - balance_dirty_pages_ratelimited_nr(file->f_mapping,
210   - nr_pages_dirtied);
211   - fput(file);
212   - down_read(&current->mm->mmap_sem);
213   - vma = find_vma(current->mm, start);
214   - } else if ((flags & MS_SYNC) && file &&
  80 + if ((flags & MS_SYNC) && file &&
215 81 (vma->vm_flags & VM_SHARED)) {
216 82 get_file(file);
217   - up_read(&current->mm->mmap_sem);
  83 + up_read(&mm->mmap_sem);
218 84 error = do_fsync(file, 0);
219 85 fput(file);
220   - down_read(&current->mm->mmap_sem);
221   - if (error)
222   - goto out_unlock;
223   - vma = find_vma(current->mm, start);
  86 + if (error || start >= end)
  87 + goto out;
  88 + down_read(&mm->mmap_sem);
  89 + vma = find_vma(mm, start);
224 90 } else {
  91 + if (start >= end) {
  92 + error = 0;
  93 + goto out_unlock;
  94 + }
225 95 vma = vma->vm_next;
226 96 }
227   - } while (vma && !done);
  97 + }
228 98 out_unlock:
229   - up_read(&current->mm->mmap_sem);
  99 + up_read(&mm->mmap_sem);
230 100 out:
231   - return error;
  101 + return error ? : unmapped_error;
232 102 }
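
For reference, a minimal userspace sketch (illustrative, not part of this
patch; the anonymous three-page mapping is just a convenient way to create a
hole) of the unmapped-range semantics the rewritten loop keeps: the mapped
parts of [start, start+len) are handled, but the call still fails with ENOMEM
when the interval covers an unmapped range.

/* Illustrative sketch, not part of this patch: if [start, start+len) covers
 * an unmapped hole, msync() still handles the mapped parts but returns
 * -ENOMEM at the end, as the comment above sys_msync() describes.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);

	char *buf = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Punch a hole: the middle page becomes an unmapped range. */
	munmap(buf + page, page);

	/* The interval spans the hole, so expect -1 with errno == ENOMEM. */
	if (msync(buf, 3 * page, MS_SYNC) < 0)
		printf("msync: %s (expected: Cannot allocate memory)\n",
		       strerror(errno));

	munmap(buf, page);
	munmap(buf + 2 * page, page);
	return 0;
}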