Commit 7307de80510a70e5e5aa98de1e80ccbb7d90a3a8
1 parent: 607d44aa3f
ocfs2: shared writeable mmap
Implement cluster consistent shared writeable mappings using the ->page_mkwrite() callback.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Showing 4 changed files with 200 additions and 39 deletions (side-by-side diff)
fs/ocfs2/aops.c
... | ... | @@ -1034,7 +1034,8 @@ |
1034 | 1034 | */ |
1035 | 1035 | static int ocfs2_grab_pages_for_write(struct address_space *mapping, |
1036 | 1036 | struct ocfs2_write_ctxt *wc, |
1037 | - u32 cpos, loff_t user_pos, int new) | |
1037 | + u32 cpos, loff_t user_pos, int new, | |
1038 | + struct page *mmap_page) | |
1038 | 1039 | { |
1039 | 1040 | int ret = 0, i; |
1040 | 1041 | unsigned long start, target_index, index; |
... | ... | @@ -1058,11 +1059,36 @@ |
1058 | 1059 | for(i = 0; i < wc->w_num_pages; i++) { |
1059 | 1060 | index = start + i; |
1060 | 1061 | |
1061 | - wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS); | |
1062 | - if (!wc->w_pages[i]) { | |
1063 | - ret = -ENOMEM; | |
1064 | - mlog_errno(ret); | |
1065 | - goto out; | |
1062 | + if (index == target_index && mmap_page) { | |
1063 | + /* | |
1064 | + * ocfs2_pagemkwrite() is a little different | |
1065 | + * and wants us to directly use the page | |
1066 | + * passed in. | |
1067 | + */ | |
1068 | + lock_page(mmap_page); | |
1069 | + | |
1070 | + if (mmap_page->mapping != mapping) { | |
1071 | + unlock_page(mmap_page); | |
1072 | + /* | |
1073 | + * Sanity check - the locking in | |
1074 | + * ocfs2_pagemkwrite() should ensure | |
1075 | + * that this code doesn't trigger. | |
1076 | + */ | |
1077 | + ret = -EINVAL; | |
1078 | + mlog_errno(ret); | |
1079 | + goto out; | |
1080 | + } | |
1081 | + | |
1082 | + page_cache_get(mmap_page); | |
1083 | + wc->w_pages[i] = mmap_page; | |
1084 | + } else { | |
1085 | + wc->w_pages[i] = find_or_create_page(mapping, index, | |
1086 | + GFP_NOFS); | |
1087 | + if (!wc->w_pages[i]) { | |
1088 | + ret = -ENOMEM; | |
1089 | + mlog_errno(ret); | |
1090 | + goto out; | |
1091 | + } | |
1066 | 1092 | } |
1067 | 1093 | |
1068 | 1094 | if (index == target_index) |
... | ... | @@ -1213,10 +1239,10 @@ |
1213 | 1239 | } |
1214 | 1240 | } |
1215 | 1241 | |
1216 | -static int ocfs2_write_begin_nolock(struct address_space *mapping, | |
1217 | - loff_t pos, unsigned len, unsigned flags, | |
1218 | - struct page **pagep, void **fsdata, | |
1219 | - struct buffer_head *di_bh) | |
1242 | +int ocfs2_write_begin_nolock(struct address_space *mapping, | |
1243 | + loff_t pos, unsigned len, unsigned flags, | |
1244 | + struct page **pagep, void **fsdata, | |
1245 | + struct buffer_head *di_bh, struct page *mmap_page) | |
1220 | 1246 | { |
1221 | 1247 | int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS; |
1222 | 1248 | unsigned int num_clusters = 0, clusters_to_alloc = 0; |
... | ... | @@ -1318,7 +1344,7 @@ |
1318 | 1344 | * extent. |
1319 | 1345 | */ |
1320 | 1346 | ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, |
1321 | - clusters_to_alloc); | |
1347 | + clusters_to_alloc, mmap_page); | |
1322 | 1348 | if (ret) { |
1323 | 1349 | mlog_errno(ret); |
1324 | 1350 | goto out_commit; |
... | ... | @@ -1386,7 +1412,7 @@ |
1386 | 1412 | } |
1387 | 1413 | |
1388 | 1414 | ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, |
1389 | - fsdata, di_bh); | |
1415 | + fsdata, di_bh, NULL); | |
1390 | 1416 | if (ret) { |
1391 | 1417 | mlog_errno(ret); |
1392 | 1418 | goto out_fail_data; |
... | ... | @@ -1407,9 +1433,9 @@ |
1407 | 1433 | return ret; |
1408 | 1434 | } |
1409 | 1435 | |
1410 | -static int ocfs2_write_end_nolock(struct address_space *mapping, | |
1411 | - loff_t pos, unsigned len, unsigned copied, | |
1412 | - struct page *page, void *fsdata) | |
1436 | +int ocfs2_write_end_nolock(struct address_space *mapping, | |
1437 | + loff_t pos, unsigned len, unsigned copied, | |
1438 | + struct page *page, void *fsdata) | |
1413 | 1439 | { |
1414 | 1440 | int i; |
1415 | 1441 | unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); |
fs/ocfs2/aops.h
... | ... | @@ -50,6 +50,15 @@ |
50 | 50 | loff_t pos, unsigned len, unsigned copied, |
51 | 51 | struct page *page, void *fsdata); |
52 | 52 | |
53 | +int ocfs2_write_end_nolock(struct address_space *mapping, | |
54 | + loff_t pos, unsigned len, unsigned copied, | |
55 | + struct page *page, void *fsdata); | |
56 | + | |
57 | +int ocfs2_write_begin_nolock(struct address_space *mapping, | |
58 | + loff_t pos, unsigned len, unsigned flags, | |
59 | + struct page **pagep, void **fsdata, | |
60 | + struct buffer_head *di_bh, struct page *mmap_page); | |
61 | + | |
53 | 62 | /* all ocfs2_dio_end_io()'s fault */ |
54 | 63 | #define ocfs2_iocb_is_rw_locked(iocb) \ |
55 | 64 | test_bit(0, (unsigned long *)&iocb->private) |
fs/ocfs2/file.c
... | ... | @@ -1001,6 +1001,13 @@ |
1001 | 1001 | goto bail_unlock; |
1002 | 1002 | } |
1003 | 1003 | |
1004 | + /* | |
1005 | + * This will intentionally not wind up calling vmtruncate(), | |
1006 | + * since all the work for a size change has been done above. | |
1007 | + * Otherwise, we could get into problems with truncate as | |
1008 | + * ip_alloc_sem is used there to protect against i_size | |
1009 | + * changes. | |
1010 | + */ | |
1004 | 1011 | status = inode_setattr(inode, attr); |
1005 | 1012 | if (status < 0) { |
1006 | 1013 | mlog_errno(status); |
fs/ocfs2/mmap.c
... | ... | @@ -37,11 +37,29 @@ |
37 | 37 | |
38 | 38 | #include "ocfs2.h" |
39 | 39 | |
40 | +#include "aops.h" | |
40 | 41 | #include "dlmglue.h" |
41 | 42 | #include "file.h" |
42 | 43 | #include "inode.h" |
43 | 44 | #include "mmap.h" |
44 | 45 | |
46 | +static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset) | |
47 | +{ | |
48 | + /* The best way to deal with signals in the vm path is | |
49 | + * to block them upfront, rather than allowing the | |
50 | + * locking paths to return -ERESTARTSYS. */ | |
51 | + sigfillset(blocked); | |
52 | + | |
53 | + /* We should technically never get a bad return value | |
54 | + * from sigprocmask */ | |
55 | + return sigprocmask(SIG_BLOCK, blocked, oldset); | |
56 | +} | |
57 | + | |
58 | +static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset) | |
59 | +{ | |
60 | + return sigprocmask(SIG_SETMASK, oldset, NULL); | |
61 | +} | |
62 | + | |
45 | 63 | static struct page *ocfs2_nopage(struct vm_area_struct * area, |
46 | 64 | unsigned long address, |
47 | 65 | int *type) |
... | ... | @@ -53,14 +71,7 @@ |
53 | 71 | mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, |
54 | 72 | type); |
55 | 73 | |
56 | - /* The best way to deal with signals in this path is | |
57 | - * to block them upfront, rather than allowing the | |
58 | - * locking paths to return -ERESTARTSYS. */ | |
59 | - sigfillset(&blocked); | |
60 | - | |
61 | - /* We should technically never get a bad ret return | |
62 | - * from sigprocmask */ | |
63 | - ret = sigprocmask(SIG_BLOCK, &blocked, &oldset); | |
74 | + ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); | |
64 | 75 | if (ret < 0) { |
65 | 76 | mlog_errno(ret); |
66 | 77 | goto out; |
... | ... | @@ -68,7 +79,7 @@ |
68 | 79 | |
69 | 80 | page = filemap_nopage(area, address, type); |
70 | 81 | |
71 | - ret = sigprocmask(SIG_SETMASK, &oldset, NULL); | |
82 | + ret = ocfs2_vm_op_unblock_sigs(&oldset); | |
72 | 83 | if (ret < 0) |
73 | 84 | mlog_errno(ret); |
74 | 85 | out: |
75 | 86 | |
76 | 87 | |
77 | 88 | |
78 | 89 | |
79 | 90 | |
80 | 91 | |
... | ... | @@ -76,27 +87,135 @@ |
76 | 87 | return page; |
77 | 88 | } |
78 | 89 | |
79 | -static struct vm_operations_struct ocfs2_file_vm_ops = { | |
80 | - .nopage = ocfs2_nopage, | |
81 | -}; | |
90 | +static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, | |
91 | + struct page *page) | |
92 | +{ | |
93 | + int ret; | |
94 | + struct address_space *mapping = inode->i_mapping; | |
95 | + loff_t pos = page->index << PAGE_CACHE_SHIFT; | |
96 | + unsigned int len = PAGE_CACHE_SIZE; | |
97 | + pgoff_t last_index; | |
98 | + struct page *locked_page = NULL; | |
99 | + void *fsdata; | |
100 | + loff_t size = i_size_read(inode); | |
82 | 101 | |
83 | -int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) | |
102 | + /* | |
103 | + * Another node might have truncated while we were waiting on | |
104 | + * cluster locks. | |
105 | + */ | |
106 | + last_index = size >> PAGE_CACHE_SHIFT; | |
107 | + if (page->index > last_index) { | |
108 | + ret = -EINVAL; | |
109 | + goto out; | |
110 | + } | |
111 | + | |
112 | + /* | |
113 | + * The i_size check above doesn't catch the case where nodes | |
114 | + * truncated and then re-extended the file. We'll re-check the | |
115 | + * page mapping after taking the page lock inside of | |
116 | + * ocfs2_write_begin_nolock(). | |
117 | + */ | |
118 | + if (!PageUptodate(page) || page->mapping != inode->i_mapping) { | |
119 | + ret = -EINVAL; | |
120 | + goto out; | |
121 | + } | |
122 | + | |
123 | + /* | |
124 | + * Call ocfs2_write_begin() and ocfs2_write_end() to take | |
125 | + * advantage of the allocation code there. We pass a write | |
126 | + * length of the whole page (chopped to i_size) to make sure | |
127 | + * the whole thing is allocated. | |
128 | + * | |
129 | + * Since we know the page is up to date, we don't have to | |
130 | + * worry about ocfs2_write_begin() skipping some buffer reads | |
131 | + * because the "write" would invalidate their data. | |
132 | + */ | |
133 | + if (page->index == last_index) | |
134 | + len = size & ~PAGE_CACHE_MASK; | |
135 | + | |
136 | + ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, | |
137 | + &fsdata, di_bh, page); | |
138 | + if (ret) { | |
139 | + if (ret != -ENOSPC) | |
140 | + mlog_errno(ret); | |
141 | + goto out; | |
142 | + } | |
143 | + | |
144 | + ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, | |
145 | + fsdata); | |
146 | + if (ret < 0) { | |
147 | + mlog_errno(ret); | |
148 | + goto out; | |
149 | + } | |
150 | + BUG_ON(ret != len); | |
151 | + ret = 0; | |
152 | +out: | |
153 | + return ret; | |
154 | +} | |
155 | + | |
156 | +static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) | |
84 | 157 | { |
85 | - int ret = 0, lock_level = 0; | |
86 | - struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); | |
158 | + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | |
159 | + struct buffer_head *di_bh = NULL; | |
160 | + sigset_t blocked, oldset; | |
161 | + int ret, ret2; | |
87 | 162 | |
163 | + ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); | |
164 | + if (ret < 0) { | |
165 | + mlog_errno(ret); | |
166 | + return ret; | |
167 | + } | |
168 | + | |
88 | 169 | /* |
89 | - * Only support shared writeable mmap for local mounts which | |
90 | - * don't know about holes. | |
170 | + * The cluster locks taken will block a truncate from another | |
171 | + * node. Taking the data lock will also ensure that we don't | |
172 | + * attempt page truncation as part of a downconvert. | |
91 | 173 | */ |
92 | - if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && | |
93 | - ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && | |
94 | - ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { | |
95 | - mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); | |
96 | - /* This is -EINVAL because generic_file_readonly_mmap | |
97 | - * returns it in a similar situation. */ | |
98 | - return -EINVAL; | |
174 | + ret = ocfs2_meta_lock(inode, &di_bh, 1); | |
175 | + if (ret < 0) { | |
176 | + mlog_errno(ret); | |
177 | + goto out; | |
99 | 178 | } |
179 | + | |
180 | + /* | |
181 | + * The alloc sem should be enough to serialize with | |
182 | + * ocfs2_truncate_file() changing i_size as well as any thread | |
183 | + * modifying the inode btree. | |
184 | + */ | |
185 | + down_write(&OCFS2_I(inode)->ip_alloc_sem); | |
186 | + | |
187 | + ret = ocfs2_data_lock(inode, 1); | |
188 | + if (ret < 0) { | |
189 | + mlog_errno(ret); | |
190 | + goto out_meta_unlock; | |
191 | + } | |
192 | + | |
193 | + ret = __ocfs2_page_mkwrite(inode, di_bh, page); | |
194 | + | |
195 | + ocfs2_data_unlock(inode, 1); | |
196 | + | |
197 | +out_meta_unlock: | |
198 | + up_write(&OCFS2_I(inode)->ip_alloc_sem); | |
199 | + | |
200 | + brelse(di_bh); | |
201 | + ocfs2_meta_unlock(inode, 1); | |
202 | + | |
203 | +out: | |
204 | + ret2 = ocfs2_vm_op_unblock_sigs(&oldset); | |
205 | + if (ret2 < 0) | |
206 | + mlog_errno(ret2); | |
207 | + | |
208 | + return ret; | |
209 | +} | |
210 | + | |
211 | +static struct vm_operations_struct ocfs2_file_vm_ops = { | |
212 | + .nopage = ocfs2_nopage, | |
213 | + .page_mkwrite = ocfs2_page_mkwrite, | |
214 | +}; | |
215 | + | |
216 | +int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) | |
217 | +{ | |
218 | + int ret = 0, lock_level = 0; | |
100 | 219 | |
101 | 220 | ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, |
102 | 221 | file->f_vfsmnt, &lock_level); |