Commit cac36bb06efe4880234524e117e0e712b10b1f16

Authored by Jens Axboe
1 parent d96e6e7164

pipe: change the ->pin() operation to ->confirm()

The name 'pin' was badly chosen; it doesn't pin a pipe buffer
in the sense most commonly used in the kernel. So change the
name to 'confirm', after debating this issue with Hugh
Dickins a bit.

A good return from ->confirm() means that the buffer is really
there, and that the contents are good.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

Showing 7 changed files with 22 additions and 20 deletions
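For context, the renamed hook is used exactly the way lo_splice_actor() in the diff below uses it: a splice actor must call ->confirm() on a pipe buffer before touching its page. A minimal sketch of that calling convention follows, assuming the pipe_buffer/splice_desc interfaces of this kernel; the actor name and body are illustrative, not part of this commit.

	/*
	 * Hedged sketch of the ->confirm() calling pattern, modelled on
	 * lo_splice_actor() below. "example_actor" is a hypothetical name;
	 * only the buf->ops->confirm() step is taken from this commit.
	 */
	static int example_actor(struct pipe_inode_info *pipe,
				 struct pipe_buffer *buf,
				 struct splice_desc *sd)
	{
		int ret;

		/*
		 * A good return means the buffer's page is really there
		 * and its contents are valid.
		 */
		ret = buf->ops->confirm(pipe, buf);
		if (unlikely(ret))
			return ret;

		/*
		 * Now safe to read buf->page, starting at buf->offset,
		 * for up to sd->len bytes.
		 */
		return sd->len;
	}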

drivers/block/loop.c
/*
 * linux/drivers/block/loop.c
 *
 * Written by Theodore Ts'o, 3/29/93
 *
 * Copyright 1993 by Theodore Ts'o. Redistribution of this file is
 * permitted under the GNU General Public License.
 *
 * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993
 * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996
 *
 * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
 * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
 *
 * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997
 *
 * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998
 *
 * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998
 *
 * Loadable modules and other fixes by AK, 1998
 *
 * Make real block number available to downstream transfer functions, enables
 * CBC (and relatives) mode encryption requiring unique IVs per data block.
 * Reed H. Petty, rhp@draper.net
 *
 * Maximum number of loop devices now dynamic via max_loop module parameter.
 * Russell Kroll <rkroll@exploits.org> 19990701
 *
 * Maximum number of loop devices when compiled-in now selectable by passing
 * max_loop=<1-255> to the kernel on boot.
 * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999
 *
 * Completely rewrite request handling to be make_request_fn style and
 * non blocking, pushing work to a helper thread. Lots of fixes from
 * Al Viro too.
 * Jens Axboe <axboe@suse.de>, Nov 2000
 *
 * Support up to 256 loop devices
 * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
 *
 * Support for falling back on the write file operation when the address space
 * operations prepare_write and/or commit_write are not available on the
 * backing filesystem.
 * Anton Altaparmakov, 16 Feb 2005
 *
 * Still To Fix:
 * - Advisory locking is ignored here.
 * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
 *
 */

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/init.h>
#include <linux/smp_lock.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/loop.h>
#include <linux/compat.h>
#include <linux/suspend.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>		/* for invalidate_bdev() */
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/kthread.h>
#include <linux/splice.h>

#include <asm/uaccess.h>

static LIST_HEAD(loop_devices);
static DEFINE_MUTEX(loop_devices_mutex);

/*
 * Transfer functions
 */
static int transfer_none(struct loop_device *lo, int cmd,
			 struct page *raw_page, unsigned raw_off,
			 struct page *loop_page, unsigned loop_off,
			 int size, sector_t real_block)
{
	char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
	char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;

	if (cmd == READ)
		memcpy(loop_buf, raw_buf, size);
	else
		memcpy(raw_buf, loop_buf, size);

	kunmap_atomic(raw_buf, KM_USER0);
	kunmap_atomic(loop_buf, KM_USER1);
	cond_resched();
	return 0;
}

static int transfer_xor(struct loop_device *lo, int cmd,
			struct page *raw_page, unsigned raw_off,
			struct page *loop_page, unsigned loop_off,
			int size, sector_t real_block)
{
	char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
	char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;
	char *in, *out, *key;
	int i, keysize;

	if (cmd == READ) {
		in = raw_buf;
		out = loop_buf;
	} else {
		in = loop_buf;
		out = raw_buf;
	}

	key = lo->lo_encrypt_key;
	keysize = lo->lo_encrypt_key_size;
	for (i = 0; i < size; i++)
		*out++ = *in++ ^ key[(i & 511) % keysize];

	kunmap_atomic(raw_buf, KM_USER0);
	kunmap_atomic(loop_buf, KM_USER1);
	cond_resched();
	return 0;
}

static int xor_init(struct loop_device *lo, const struct loop_info64 *info)
{
	if (unlikely(info->lo_encrypt_key_size <= 0))
		return -EINVAL;
	return 0;
}

static struct loop_func_table none_funcs = {
	.number = LO_CRYPT_NONE,
	.transfer = transfer_none,
};

static struct loop_func_table xor_funcs = {
	.number = LO_CRYPT_XOR,
	.transfer = transfer_xor,
	.init = xor_init
};

/* xfer_funcs[0] is special - its release function is never called */
static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
	&none_funcs,
	&xor_funcs
};

static loff_t get_loop_size(struct loop_device *lo, struct file *file)
{
	loff_t size, offset, loopsize;

	/* Compute loopsize in bytes */
	size = i_size_read(file->f_mapping->host);
	offset = lo->lo_offset;
	loopsize = size - offset;
	if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize)
		loopsize = lo->lo_sizelimit;

	/*
	 * Unfortunately, if we want to do I/O on the device,
	 * the number of 512-byte sectors has to fit into a sector_t.
	 */
	return loopsize >> 9;
}

static int
figure_loop_size(struct loop_device *lo)
{
	loff_t size = get_loop_size(lo, lo->lo_backing_file);
	sector_t x = (sector_t)size;

	if (unlikely((loff_t)x != size))
		return -EFBIG;

	set_capacity(lo->lo_disk, x);
	return 0;
}

static inline int
lo_do_transfer(struct loop_device *lo, int cmd,
	       struct page *rpage, unsigned roffs,
	       struct page *lpage, unsigned loffs,
	       int size, sector_t rblock)
{
	if (unlikely(!lo->transfer))
		return 0;

	return lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock);
}

/**
 * do_lo_send_aops - helper for writing data to a loop device
 *
 * This is the fast version for backing filesystems which implement the address
 * space operations prepare_write and commit_write.
 */
static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
		int bsize, loff_t pos, struct page *page)
{
	struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	pgoff_t index;
	unsigned offset, bv_offs;
	int len, ret;

	mutex_lock(&mapping->host->i_mutex);
	index = pos >> PAGE_CACHE_SHIFT;
	offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1);
	bv_offs = bvec->bv_offset;
	len = bvec->bv_len;
	while (len > 0) {
		sector_t IV;
		unsigned size;
		int transfer_result;

		IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
		size = PAGE_CACHE_SIZE - offset;
		if (size > len)
			size = len;
		page = grab_cache_page(mapping, index);
		if (unlikely(!page))
			goto fail;
		ret = aops->prepare_write(file, page, offset,
					  offset + size);
		if (unlikely(ret)) {
			if (ret == AOP_TRUNCATED_PAGE) {
				page_cache_release(page);
				continue;
			}
			goto unlock;
		}
		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
				bvec->bv_page, bv_offs, size, IV);
		if (unlikely(transfer_result)) {
			/*
			 * The transfer failed, but we still write the data to
			 * keep prepare/commit calls balanced.
			 */
			printk(KERN_ERR "loop: transfer error block %llu\n",
			       (unsigned long long)index);
			zero_user_page(page, offset, size, KM_USER0);
		}
		flush_dcache_page(page);
		ret = aops->commit_write(file, page, offset,
					 offset + size);
		if (unlikely(ret)) {
			if (ret == AOP_TRUNCATED_PAGE) {
				page_cache_release(page);
				continue;
			}
			goto unlock;
		}
		if (unlikely(transfer_result))
			goto unlock;
		bv_offs += size;
		len -= size;
		offset = 0;
		index++;
		pos += size;
		unlock_page(page);
		page_cache_release(page);
	}
	ret = 0;
out:
	mutex_unlock(&mapping->host->i_mutex);
	return ret;
unlock:
	unlock_page(page);
	page_cache_release(page);
fail:
	ret = -1;
	goto out;
}

/**
 * __do_lo_send_write - helper for writing data to a loop device
 *
 * This helper just factors out common code between do_lo_send_direct_write()
 * and do_lo_send_write().
 */
static int __do_lo_send_write(struct file *file,
		u8 *buf, const int len, loff_t pos)
{
	ssize_t bw;
	mm_segment_t old_fs = get_fs();

	set_fs(get_ds());
	bw = file->f_op->write(file, buf, len, &pos);
	set_fs(old_fs);
	if (likely(bw == len))
		return 0;
	printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n",
			(unsigned long long)pos, len);
	if (bw >= 0)
		bw = -EIO;
	return bw;
}

/**
 * do_lo_send_direct_write - helper for writing data to a loop device
 *
 * This is the fast, non-transforming version for backing filesystems which do
 * not implement the address space operations prepare_write and commit_write.
 * It uses the write file operation which should be present on all writeable
 * filesystems.
 */
static int do_lo_send_direct_write(struct loop_device *lo,
		struct bio_vec *bvec, int bsize, loff_t pos, struct page *page)
{
	ssize_t bw = __do_lo_send_write(lo->lo_backing_file,
			kmap(bvec->bv_page) + bvec->bv_offset,
			bvec->bv_len, pos);
	kunmap(bvec->bv_page);
	cond_resched();
	return bw;
}

/**
 * do_lo_send_write - helper for writing data to a loop device
 *
 * This is the slow, transforming version for filesystems which do not
 * implement the address space operations prepare_write and commit_write. It
 * uses the write file operation which should be present on all writeable
 * filesystems.
 *
 * Using fops->write is slower than using aops->{prepare,commit}_write in the
 * transforming case because we need to double buffer the data as we cannot do
 * the transformations in place as we do not have direct access to the
 * destination pages of the backing file.
 */
static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
		int bsize, loff_t pos, struct page *page)
{
	int ret = lo_do_transfer(lo, WRITE, page, 0, bvec->bv_page,
			bvec->bv_offset, bvec->bv_len, pos >> 9);
	if (likely(!ret))
		return __do_lo_send_write(lo->lo_backing_file,
				page_address(page), bvec->bv_len,
				pos);
	printk(KERN_ERR "loop: Transfer error at byte offset %llu, "
			"length %i.\n", (unsigned long long)pos, bvec->bv_len);
	if (ret > 0)
		ret = -EIO;
	return ret;
}

static int lo_send(struct loop_device *lo, struct bio *bio, int bsize,
		loff_t pos)
{
	int (*do_lo_send)(struct loop_device *, struct bio_vec *, int, loff_t,
			struct page *page);
	struct bio_vec *bvec;
	struct page *page = NULL;
	int i, ret = 0;

	do_lo_send = do_lo_send_aops;
	if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) {
		do_lo_send = do_lo_send_direct_write;
		if (lo->transfer != transfer_none) {
			page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
			if (unlikely(!page))
				goto fail;
			kmap(page);
			do_lo_send = do_lo_send_write;
		}
	}
	bio_for_each_segment(bvec, bio, i) {
		ret = do_lo_send(lo, bvec, bsize, pos, page);
		if (ret < 0)
			break;
		pos += bvec->bv_len;
	}
	if (page) {
		kunmap(page);
		__free_page(page);
	}
out:
	return ret;
fail:
	printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n");
	ret = -ENOMEM;
	goto out;
}

struct lo_read_data {
	struct loop_device *lo;
	struct page *page;
	unsigned offset;
	int bsize;
};

static int
lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
		struct splice_desc *sd)
{
	struct lo_read_data *p = sd->u.data;
	struct loop_device *lo = p->lo;
	struct page *page = buf->page;
	sector_t IV;
	size_t size;
	int ret;

-	ret = buf->ops->pin(pipe, buf);
+	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) +
							(buf->offset >> 9);
	size = sd->len;
	if (size > p->bsize)
		size = p->bsize;

	if (lo_do_transfer(lo, READ, page, buf->offset, p->page, p->offset, size, IV)) {
		printk(KERN_ERR "loop: transfer error block %ld\n",
		       page->index);
		size = -EINVAL;
	}

	flush_dcache_page(p->page);

	if (size > 0)
		p->offset += size;

	return size;
}

static int
lo_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
	return __splice_from_pipe(pipe, sd, lo_splice_actor);
}

static int
do_lo_receive(struct loop_device *lo,
	      struct bio_vec *bvec, int bsize, loff_t pos)
{
	struct lo_read_data cookie;
	struct splice_desc sd;
	struct file *file;
	long retval;

	cookie.lo = lo;
	cookie.page = bvec->bv_page;
	cookie.offset = bvec->bv_offset;
	cookie.bsize = bsize;

	sd.len = 0;
	sd.total_len = bvec->bv_len;
	sd.flags = 0;
	sd.pos = pos;
	sd.u.data = &cookie;

	file = lo->lo_backing_file;
	retval = splice_direct_to_actor(file, &sd, lo_direct_splice_actor);

	if (retval < 0)
		return retval;

	return 0;
}

static int
lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
{
	struct bio_vec *bvec;
	int i, ret = 0;

	bio_for_each_segment(bvec, bio, i) {
		ret = do_lo_receive(lo, bvec, bsize, pos);
		if (ret < 0)
			break;
		pos += bvec->bv_len;
	}
	return ret;
}

static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
{
	loff_t pos;
	int ret;

	pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
	if (bio_rw(bio) == WRITE)
		ret = lo_send(lo, bio, lo->lo_blocksize, pos);
	else
		ret = lo_receive(lo, bio, lo->lo_blocksize, pos);
	return ret;
}

/*
 * Add bio to back of pending list
 */
static void loop_add_bio(struct loop_device *lo, struct bio *bio)
{
	if (lo->lo_biotail) {
		lo->lo_biotail->bi_next = bio;
		lo->lo_biotail = bio;
	} else
		lo->lo_bio = lo->lo_biotail = bio;
}

/*
 * Grab first pending buffer
 */
static struct bio *loop_get_bio(struct loop_device *lo)
{
	struct bio *bio;

	if ((bio = lo->lo_bio)) {
		if (bio == lo->lo_biotail)
			lo->lo_biotail = NULL;
		lo->lo_bio = bio->bi_next;
		bio->bi_next = NULL;
	}

	return bio;
}

static int loop_make_request(request_queue_t *q, struct bio *old_bio)
{
	struct loop_device *lo = q->queuedata;
	int rw = bio_rw(old_bio);

	if (rw == READA)
		rw = READ;

	BUG_ON(!lo || (rw != READ && rw != WRITE));

	spin_lock_irq(&lo->lo_lock);
	if (lo->lo_state != Lo_bound)
		goto out;
	if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
		goto out;
	loop_add_bio(lo, old_bio);
	wake_up(&lo->lo_event);
	spin_unlock_irq(&lo->lo_lock);
	return 0;

out:
	spin_unlock_irq(&lo->lo_lock);
	bio_io_error(old_bio, old_bio->bi_size);
	return 0;
}

/*
 * kick off io on the underlying address space
 */
static void loop_unplug(request_queue_t *q)
{
	struct loop_device *lo = q->queuedata;

	clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
	blk_run_address_space(lo->lo_backing_file->f_mapping);
}

struct switch_request {
	struct file *file;
	struct completion wait;
};

static void do_loop_switch(struct loop_device *, struct switch_request *);

static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
{
	if (unlikely(!bio->bi_bdev)) {
		do_loop_switch(lo, bio->bi_private);
		bio_put(bio);
	} else {
		int ret = do_bio_filebacked(lo, bio);
		bio_endio(bio, bio->bi_size, ret);
	}
}

/*
 * worker thread that handles reads/writes to file backed loop devices,
 * to avoid blocking in our make_request_fn. it also does loop decrypting
 * on reads for block backed loop, as that is too heavy to do from
 * b_end_io context where irqs may be disabled.
 *
 * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before
 * calling kthread_stop(). Therefore once kthread_should_stop() is
 * true, make_request will not place any more requests. Therefore
 * once kthread_should_stop() is true and lo_bio is NULL, we are
 * done with the loop.
 */
static int loop_thread(void *data)
{
	struct loop_device *lo = data;
	struct bio *bio;

	/*
	 * loop can be used in an encrypted device,
	 * hence, it mustn't be stopped at all
	 * because it could be indirectly used during suspension
	 */
	current->flags |= PF_NOFREEZE;

	set_user_nice(current, -20);

	while (!kthread_should_stop() || lo->lo_bio) {

		wait_event_interruptible(lo->lo_event,
				lo->lo_bio || kthread_should_stop());

		if (!lo->lo_bio)
			continue;
		spin_lock_irq(&lo->lo_lock);
		bio = loop_get_bio(lo);
		spin_unlock_irq(&lo->lo_lock);

		BUG_ON(!bio);
		loop_handle_bio(lo, bio);
	}

	return 0;
}

/*
 * loop_switch performs the hard work of switching a backing store.
 * First it needs to flush existing IO, it does this by sending a magic
 * BIO down the pipe. The completion of this BIO does the actual switch.
 */
static int loop_switch(struct loop_device *lo, struct file *file)
{
	struct switch_request w;
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
	if (!bio)
		return -ENOMEM;
	init_completion(&w.wait);
	w.file = file;
	bio->bi_private = &w;
	bio->bi_bdev = NULL;
	loop_make_request(lo->lo_queue, bio);
	wait_for_completion(&w.wait);
	return 0;
}

/*
 * Do the actual switch; called from the BIO completion routine
 */
static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
{
	struct file *file = p->file;
	struct file *old_file = lo->lo_backing_file;
	struct address_space *mapping = file->f_mapping;

	mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
	lo->lo_backing_file = file;
	lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
		mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
	lo->old_gfp_mask = mapping_gfp_mask(mapping);
	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
	complete(&p->wait);
}


/*
 * loop_change_fd switched the backing store of a loopback device to
 * a new file. This is useful for operating system installers to free up
 * the original file and in High Availability environments to switch to
 * an alternative location for the content in case of server meltdown.
 * This can only work if the loop device is used read-only, and if the
 * new backing store is the same size and type as the old backing store.
 */
static int loop_change_fd(struct loop_device *lo, struct file *lo_file,
		       struct block_device *bdev, unsigned int arg)
{
	struct file *file, *old_file;
	struct inode *inode;
	int error;

	error = -ENXIO;
	if (lo->lo_state != Lo_bound)
		goto out;

	/* the loop device has to be read-only */
	error = -EINVAL;
	if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
		goto out;

	error = -EBADF;
	file = fget(arg);
	if (!file)
		goto out;

	inode = file->f_mapping->host;
	old_file = lo->lo_backing_file;

	error = -EINVAL;

	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
		goto out_putf;

	/* new backing store needs to support loop (eg splice_read) */
	if (!inode->i_fop->splice_read)
		goto out_putf;

	/* size of the new backing store needs to be the same */
	if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
		goto out_putf;

	/* and ... switch */
	error = loop_switch(lo, file);
	if (error)
		goto out_putf;

	fput(old_file);
	return 0;

out_putf:
	fput(file);
out:
	return error;
}

static inline int is_loop_device(struct file *file)
{
	struct inode *i = file->f_mapping->host;

	return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
}

static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
		       struct block_device *bdev, unsigned int arg)
{
	struct file *file, *f;
	struct inode *inode;
	struct address_space *mapping;
	unsigned lo_blocksize;
	int lo_flags = 0;
	int error;
	loff_t size;

	/* This is safe, since we have a reference from open(). */
	__module_get(THIS_MODULE);

	error = -EBADF;
	file = fget(arg);
	if (!file)
		goto out;

	error = -EBUSY;
	if (lo->lo_state != Lo_unbound)
		goto out_putf;

	/* Avoid recursion */
	f = file;
	while (is_loop_device(f)) {
		struct loop_device *l;

		if (f->f_mapping->host->i_rdev == lo_file->f_mapping->host->i_rdev)
			goto out_putf;

		l = f->f_mapping->host->i_bdev->bd_disk->private_data;
		if (l->lo_state == Lo_unbound) {
			error = -EINVAL;
			goto out_putf;
		}
		f = l->lo_backing_file;
	}

	mapping = file->f_mapping;
	inode = mapping->host;

	if (!(file->f_mode & FMODE_WRITE))
		lo_flags |= LO_FLAGS_READ_ONLY;

	error = -EINVAL;
	if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) {
		const struct address_space_operations *aops = mapping->a_ops;
		/*
		 * If we can't read - sorry. If we only can't write - well,
		 * it's going to be read-only.
		 */
		if (!file->f_op->splice_read)
			goto out_putf;
		if (aops->prepare_write && aops->commit_write)
			lo_flags |= LO_FLAGS_USE_AOPS;
		if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
			lo_flags |= LO_FLAGS_READ_ONLY;

		lo_blocksize = S_ISBLK(inode->i_mode) ?
			inode->i_bdev->bd_block_size : PAGE_SIZE;

		error = 0;
	} else {
		goto out_putf;
	}

	size = get_loop_size(lo, file);

	if ((loff_t)(sector_t)size != size) {
		error = -EFBIG;
		goto out_putf;
	}

	if (!(lo_file->f_mode & FMODE_WRITE))
		lo_flags |= LO_FLAGS_READ_ONLY;

	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);

	lo->lo_blocksize = lo_blocksize;
	lo->lo_device = bdev;
	lo->lo_flags = lo_flags;
	lo->lo_backing_file = file;
	lo->transfer = transfer_none;
	lo->ioctl = NULL;
	lo->lo_sizelimit = 0;
	lo->old_gfp_mask = mapping_gfp_mask(mapping);
	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));

	lo->lo_bio = lo->lo_biotail = NULL;

	/*
	 * set queue make_request_fn, and add limits based on lower level
	 * device
	 */
	blk_queue_make_request(lo->lo_queue, loop_make_request);
	lo->lo_queue->queuedata = lo;
	lo->lo_queue->unplug_fn = loop_unplug;

	set_capacity(lo->lo_disk, size);
	bd_set_size(bdev, size << 9);

	set_blocksize(bdev, lo_blocksize);

	lo->lo_thread = kthread_create(loop_thread, lo, "loop%d",
						lo->lo_number);
	if (IS_ERR(lo->lo_thread)) {
		error = PTR_ERR(lo->lo_thread);
		goto out_clr;
	}
	lo->lo_state = Lo_bound;
	wake_up_process(lo->lo_thread);
	return 0;

out_clr:
	lo->lo_thread = NULL;
	lo->lo_device = NULL;
	lo->lo_backing_file = NULL;
	lo->lo_flags = 0;
	set_capacity(lo->lo_disk, 0);
	invalidate_bdev(bdev);
	bd_set_size(bdev, 0);
	mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
	lo->lo_state = Lo_unbound;
out_putf:
	fput(file);
out:
	/* This is safe: open() is still holding a reference. */
	module_put(THIS_MODULE);
	return error;
}

static int
loop_release_xfer(struct loop_device *lo)
{
	int err = 0;
	struct loop_func_table *xfer = lo->lo_encryption;

	if (xfer) {
		if (xfer->release)
			err = xfer->release(lo);
		lo->transfer = NULL;
		lo->lo_encryption = NULL;
		module_put(xfer->owner);
	}
	return err;
}

static int
loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
	       const struct loop_info64 *i)
{
	int err = 0;

	if (xfer) {
		struct module *owner = xfer->owner;

		if (!try_module_get(owner))
			return -EINVAL;
		if (xfer->init)
			err = xfer->init(lo, i);
		if (err)
			module_put(owner);
		else
			lo->lo_encryption = xfer;
	}
	return err;
}

static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
{
	struct file *filp = lo->lo_backing_file;
	gfp_t gfp = lo->old_gfp_mask;

	if (lo->lo_state != Lo_bound)
		return -ENXIO;

	if (lo->lo_refcnt > 1)	/* we needed one fd for the ioctl */
		return -EBUSY;

	if (filp == NULL)
		return -EINVAL;

	spin_lock_irq(&lo->lo_lock);
	lo->lo_state = Lo_rundown;
	spin_unlock_irq(&lo->lo_lock);

	kthread_stop(lo->lo_thread);

	lo->lo_backing_file = NULL;

	loop_release_xfer(lo);
	lo->transfer = NULL;
	lo->ioctl = NULL;
	lo->lo_device = NULL;
	lo->lo_encryption = NULL;
	lo->lo_offset = 0;
	lo->lo_sizelimit = 0;
	lo->lo_encrypt_key_size = 0;
	lo->lo_flags = 0;
	lo->lo_thread = NULL;
	memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
	memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
	memset(lo->lo_file_name, 0, LO_NAME_SIZE);
	invalidate_bdev(bdev);
	set_capacity(lo->lo_disk, 0);
	bd_set_size(bdev, 0);
	mapping_set_gfp_mask(filp->f_mapping, gfp);
	lo->lo_state = Lo_unbound;
	fput(filp);
	/* This is safe: open() is still holding a reference. */
	module_put(THIS_MODULE);
	return 0;
}

static int
loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
{
	int err;
	struct loop_func_table *xfer;

	if (lo->lo_encrypt_key_size && lo->lo_key_owner != current->uid &&
	    !capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (lo->lo_state != Lo_bound)
		return -ENXIO;
	if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
		return -EINVAL;

	err = loop_release_xfer(lo);
	if (err)
		return err;

	if (info->lo_encrypt_type) {
		unsigned int type = info->lo_encrypt_type;

		if (type >= MAX_LO_CRYPT)
			return -EINVAL;
		xfer = xfer_funcs[type];
		if (xfer == NULL)
			return -EINVAL;
	} else
		xfer = NULL;

	err = loop_init_xfer(lo, xfer, info);
	if (err)
		return err;

	if (lo->lo_offset != info->lo_offset ||
	    lo->lo_sizelimit != info->lo_sizelimit) {
		lo->lo_offset = info->lo_offset;
		lo->lo_sizelimit = info->lo_sizelimit;
		if (figure_loop_size(lo))
			return -EFBIG;
	}

	memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
	memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE);
	lo->lo_file_name[LO_NAME_SIZE-1] = 0;
	lo->lo_crypt_name[LO_NAME_SIZE-1] = 0;

	if (!xfer)
		xfer = &none_funcs;
	lo->transfer = xfer->transfer;
	lo->ioctl = xfer->ioctl;

	lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
	lo->lo_init[0] = info->lo_init[0];
	lo->lo_init[1] = info->lo_init[1];
	if (info->lo_encrypt_key_size) {
		memcpy(lo->lo_encrypt_key, info->lo_encrypt_key,
		       info->lo_encrypt_key_size);
		lo->lo_key_owner = current->uid;
	}

	return 0;
}

static int
loop_get_status(struct loop_device *lo, struct loop_info64 *info)
{
	struct file *file = lo->lo_backing_file;
	struct kstat stat;
	int error;

	if (lo->lo_state != Lo_bound)
		return -ENXIO;
	error = vfs_getattr(file->f_path.mnt, file->f_path.dentry, &stat);
	if (error)
		return error;
	memset(info, 0, sizeof(*info));
	info->lo_number = lo->lo_number;
	info->lo_device = huge_encode_dev(stat.dev);
	info->lo_inode = stat.ino;
	info->lo_rdevice = huge_encode_dev(lo->lo_device ? stat.rdev : stat.dev);
	info->lo_offset = lo->lo_offset;
	info->lo_sizelimit = lo->lo_sizelimit;
	info->lo_flags = lo->lo_flags;
	memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE);
	memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE);
	info->lo_encrypt_type =
		lo->lo_encryption ? lo->lo_encryption->number : 0;
	if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) {
		info->lo_encrypt_key_size = lo->lo_encrypt_key_size;
		memcpy(info->lo_encrypt_key, lo->lo_encrypt_key,
		       lo->lo_encrypt_key_size);
	}
	return 0;
}

static void
loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64)
{
	memset(info64, 0, sizeof(*info64));
	info64->lo_number = info->lo_number;
	info64->lo_device = info->lo_device;
	info64->lo_inode = info->lo_inode;
	info64->lo_rdevice = info->lo_rdevice;
	info64->lo_offset = info->lo_offset;
	info64->lo_sizelimit = 0;
	info64->lo_encrypt_type = info->lo_encrypt_type;
	info64->lo_encrypt_key_size = info->lo_encrypt_key_size;
	info64->lo_flags = info->lo_flags;
	info64->lo_init[0] = info->lo_init[0];
	info64->lo_init[1] = info->lo_init[1];
	if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
		memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE);
	else
		memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);
	memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE);
}

static int
loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info)
{
	memset(info, 0, sizeof(*info));
	info->lo_number = info64->lo_number;
	info->lo_device = info64->lo_device;
1073 info->lo_inode = info64->lo_inode; 1073 info->lo_inode = info64->lo_inode;
1074 info->lo_rdevice = info64->lo_rdevice; 1074 info->lo_rdevice = info64->lo_rdevice;
1075 info->lo_offset = info64->lo_offset; 1075 info->lo_offset = info64->lo_offset;
1076 info->lo_encrypt_type = info64->lo_encrypt_type; 1076 info->lo_encrypt_type = info64->lo_encrypt_type;
1077 info->lo_encrypt_key_size = info64->lo_encrypt_key_size; 1077 info->lo_encrypt_key_size = info64->lo_encrypt_key_size;
1078 info->lo_flags = info64->lo_flags; 1078 info->lo_flags = info64->lo_flags;
1079 info->lo_init[0] = info64->lo_init[0]; 1079 info->lo_init[0] = info64->lo_init[0];
1080 info->lo_init[1] = info64->lo_init[1]; 1080 info->lo_init[1] = info64->lo_init[1];
1081 if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI) 1081 if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
1082 memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE); 1082 memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
1083 else 1083 else
1084 memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE); 1084 memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);
1085 memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE); 1085 memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
1086 1086
1087 /* error in case values were truncated */ 1087 /* error in case values were truncated */
1088 if (info->lo_device != info64->lo_device || 1088 if (info->lo_device != info64->lo_device ||
1089 info->lo_rdevice != info64->lo_rdevice || 1089 info->lo_rdevice != info64->lo_rdevice ||
1090 info->lo_inode != info64->lo_inode || 1090 info->lo_inode != info64->lo_inode ||
1091 info->lo_offset != info64->lo_offset) 1091 info->lo_offset != info64->lo_offset)
1092 return -EOVERFLOW; 1092 return -EOVERFLOW;
1093 1093
1094 return 0; 1094 return 0;
1095 } 1095 }
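
The -EOVERFLOW path works by round-tripping each value through the narrower loop_info field and comparing it back against the 64-bit source. The same pattern as a standalone program (the narrowing conversion is implementation-defined in standard C, but wraps on the ABIs Linux targets):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long off64 = 1ULL << 40; /* does not fit in 32 bits */
            int off32 = off64;                     /* narrowing assignment */

            if (off32 != off64)
                    printf("truncated: the driver would return -EOVERFLOW\n");
            return 0;
    }
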
1096 1096
1097 static int 1097 static int
1098 loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg) 1098 loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg)
1099 { 1099 {
1100 struct loop_info info; 1100 struct loop_info info;
1101 struct loop_info64 info64; 1101 struct loop_info64 info64;
1102 1102
1103 if (copy_from_user(&info, arg, sizeof (struct loop_info))) 1103 if (copy_from_user(&info, arg, sizeof (struct loop_info)))
1104 return -EFAULT; 1104 return -EFAULT;
1105 loop_info64_from_old(&info, &info64); 1105 loop_info64_from_old(&info, &info64);
1106 return loop_set_status(lo, &info64); 1106 return loop_set_status(lo, &info64);
1107 } 1107 }
1108 1108
1109 static int 1109 static int
1110 loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg) 1110 loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg)
1111 { 1111 {
1112 struct loop_info64 info64; 1112 struct loop_info64 info64;
1113 1113
1114 if (copy_from_user(&info64, arg, sizeof (struct loop_info64))) 1114 if (copy_from_user(&info64, arg, sizeof (struct loop_info64)))
1115 return -EFAULT; 1115 return -EFAULT;
1116 return loop_set_status(lo, &info64); 1116 return loop_set_status(lo, &info64);
1117 } 1117 }
1118 1118
1119 static int 1119 static int
1120 loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) { 1120 loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) {
1121 struct loop_info info; 1121 struct loop_info info;
1122 struct loop_info64 info64; 1122 struct loop_info64 info64;
1123 int err = 0; 1123 int err = 0;
1124 1124
1125 if (!arg) 1125 if (!arg)
1126 err = -EINVAL; 1126 err = -EINVAL;
1127 if (!err) 1127 if (!err)
1128 err = loop_get_status(lo, &info64); 1128 err = loop_get_status(lo, &info64);
1129 if (!err) 1129 if (!err)
1130 err = loop_info64_to_old(&info64, &info); 1130 err = loop_info64_to_old(&info64, &info);
1131 if (!err && copy_to_user(arg, &info, sizeof(info))) 1131 if (!err && copy_to_user(arg, &info, sizeof(info)))
1132 err = -EFAULT; 1132 err = -EFAULT;
1133 1133
1134 return err; 1134 return err;
1135 } 1135 }
1136 1136
1137 static int 1137 static int
1138 loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { 1138 loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
1139 struct loop_info64 info64; 1139 struct loop_info64 info64;
1140 int err = 0; 1140 int err = 0;
1141 1141
1142 if (!arg) 1142 if (!arg)
1143 err = -EINVAL; 1143 err = -EINVAL;
1144 if (!err) 1144 if (!err)
1145 err = loop_get_status(lo, &info64); 1145 err = loop_get_status(lo, &info64);
1146 if (!err && copy_to_user(arg, &info64, sizeof(info64))) 1146 if (!err && copy_to_user(arg, &info64, sizeof(info64)))
1147 err = -EFAULT; 1147 err = -EFAULT;
1148 1148
1149 return err; 1149 return err;
1150 } 1150 }
1151 1151
1152 static int lo_ioctl(struct inode * inode, struct file * file, 1152 static int lo_ioctl(struct inode * inode, struct file * file,
1153 unsigned int cmd, unsigned long arg) 1153 unsigned int cmd, unsigned long arg)
1154 { 1154 {
1155 struct loop_device *lo = inode->i_bdev->bd_disk->private_data; 1155 struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
1156 int err; 1156 int err;
1157 1157
1158 mutex_lock(&lo->lo_ctl_mutex); 1158 mutex_lock(&lo->lo_ctl_mutex);
1159 switch (cmd) { 1159 switch (cmd) {
1160 case LOOP_SET_FD: 1160 case LOOP_SET_FD:
1161 err = loop_set_fd(lo, file, inode->i_bdev, arg); 1161 err = loop_set_fd(lo, file, inode->i_bdev, arg);
1162 break; 1162 break;
1163 case LOOP_CHANGE_FD: 1163 case LOOP_CHANGE_FD:
1164 err = loop_change_fd(lo, file, inode->i_bdev, arg); 1164 err = loop_change_fd(lo, file, inode->i_bdev, arg);
1165 break; 1165 break;
1166 case LOOP_CLR_FD: 1166 case LOOP_CLR_FD:
1167 err = loop_clr_fd(lo, inode->i_bdev); 1167 err = loop_clr_fd(lo, inode->i_bdev);
1168 break; 1168 break;
1169 case LOOP_SET_STATUS: 1169 case LOOP_SET_STATUS:
1170 err = loop_set_status_old(lo, (struct loop_info __user *) arg); 1170 err = loop_set_status_old(lo, (struct loop_info __user *) arg);
1171 break; 1171 break;
1172 case LOOP_GET_STATUS: 1172 case LOOP_GET_STATUS:
1173 err = loop_get_status_old(lo, (struct loop_info __user *) arg); 1173 err = loop_get_status_old(lo, (struct loop_info __user *) arg);
1174 break; 1174 break;
1175 case LOOP_SET_STATUS64: 1175 case LOOP_SET_STATUS64:
1176 err = loop_set_status64(lo, (struct loop_info64 __user *) arg); 1176 err = loop_set_status64(lo, (struct loop_info64 __user *) arg);
1177 break; 1177 break;
1178 case LOOP_GET_STATUS64: 1178 case LOOP_GET_STATUS64:
1179 err = loop_get_status64(lo, (struct loop_info64 __user *) arg); 1179 err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
1180 break; 1180 break;
1181 default: 1181 default:
1182 err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; 1182 err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
1183 } 1183 }
1184 mutex_unlock(&lo->lo_ctl_mutex); 1184 mutex_unlock(&lo->lo_ctl_mutex);
1185 return err; 1185 return err;
1186 } 1186 }
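
The LOOP_SET_FD and LOOP_CLR_FD cases above are the heart of what losetup does. A hedged userspace sketch of the bind/detach cycle (error handling trimmed; typically requires root):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/loop.h>

    /* Bind a backing file to a loop device, then detach it again. */
    int bind_and_release(const char *loop_dev, const char *backing)
    {
            int lfd = open(loop_dev, O_RDWR);
            int bfd = open(backing, O_RDWR);
            int err = -1;

            if (lfd < 0 || bfd < 0)
                    goto out;
            err = ioctl(lfd, LOOP_SET_FD, bfd);       /* -> loop_set_fd() */
            if (!err)
                    err = ioctl(lfd, LOOP_CLR_FD, 0); /* -> loop_clr_fd() */
    out:
            if (bfd >= 0)
                    close(bfd);     /* the driver keeps its own reference */
            if (lfd >= 0)
                    close(lfd);
            return err;
    }
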
1187 1187
1188 #ifdef CONFIG_COMPAT 1188 #ifdef CONFIG_COMPAT
1189 struct compat_loop_info { 1189 struct compat_loop_info {
1190 compat_int_t lo_number; /* ioctl r/o */ 1190 compat_int_t lo_number; /* ioctl r/o */
1191 compat_dev_t lo_device; /* ioctl r/o */ 1191 compat_dev_t lo_device; /* ioctl r/o */
1192 compat_ulong_t lo_inode; /* ioctl r/o */ 1192 compat_ulong_t lo_inode; /* ioctl r/o */
1193 compat_dev_t lo_rdevice; /* ioctl r/o */ 1193 compat_dev_t lo_rdevice; /* ioctl r/o */
1194 compat_int_t lo_offset; 1194 compat_int_t lo_offset;
1195 compat_int_t lo_encrypt_type; 1195 compat_int_t lo_encrypt_type;
1196 compat_int_t lo_encrypt_key_size; /* ioctl w/o */ 1196 compat_int_t lo_encrypt_key_size; /* ioctl w/o */
1197 compat_int_t lo_flags; /* ioctl r/o */ 1197 compat_int_t lo_flags; /* ioctl r/o */
1198 char lo_name[LO_NAME_SIZE]; 1198 char lo_name[LO_NAME_SIZE];
1199 unsigned char lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */ 1199 unsigned char lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */
1200 compat_ulong_t lo_init[2]; 1200 compat_ulong_t lo_init[2];
1201 char reserved[4]; 1201 char reserved[4];
1202 }; 1202 };
1203 1203
1204 /* 1204 /*
1205 * Transfer 32-bit compatibility structure in userspace to 64-bit loop info 1205 * Transfer 32-bit compatibility structure in userspace to 64-bit loop info
1206 * - noinlined to reduce stack space usage in main part of driver 1206 * - noinlined to reduce stack space usage in main part of driver
1207 */ 1207 */
1208 static noinline int 1208 static noinline int
1209 loop_info64_from_compat(const struct compat_loop_info __user *arg, 1209 loop_info64_from_compat(const struct compat_loop_info __user *arg,
1210 struct loop_info64 *info64) 1210 struct loop_info64 *info64)
1211 { 1211 {
1212 struct compat_loop_info info; 1212 struct compat_loop_info info;
1213 1213
1214 if (copy_from_user(&info, arg, sizeof(info))) 1214 if (copy_from_user(&info, arg, sizeof(info)))
1215 return -EFAULT; 1215 return -EFAULT;
1216 1216
1217 memset(info64, 0, sizeof(*info64)); 1217 memset(info64, 0, sizeof(*info64));
1218 info64->lo_number = info.lo_number; 1218 info64->lo_number = info.lo_number;
1219 info64->lo_device = info.lo_device; 1219 info64->lo_device = info.lo_device;
1220 info64->lo_inode = info.lo_inode; 1220 info64->lo_inode = info.lo_inode;
1221 info64->lo_rdevice = info.lo_rdevice; 1221 info64->lo_rdevice = info.lo_rdevice;
1222 info64->lo_offset = info.lo_offset; 1222 info64->lo_offset = info.lo_offset;
1223 info64->lo_sizelimit = 0; 1223 info64->lo_sizelimit = 0;
1224 info64->lo_encrypt_type = info.lo_encrypt_type; 1224 info64->lo_encrypt_type = info.lo_encrypt_type;
1225 info64->lo_encrypt_key_size = info.lo_encrypt_key_size; 1225 info64->lo_encrypt_key_size = info.lo_encrypt_key_size;
1226 info64->lo_flags = info.lo_flags; 1226 info64->lo_flags = info.lo_flags;
1227 info64->lo_init[0] = info.lo_init[0]; 1227 info64->lo_init[0] = info.lo_init[0];
1228 info64->lo_init[1] = info.lo_init[1]; 1228 info64->lo_init[1] = info.lo_init[1];
1229 if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI) 1229 if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
1230 memcpy(info64->lo_crypt_name, info.lo_name, LO_NAME_SIZE); 1230 memcpy(info64->lo_crypt_name, info.lo_name, LO_NAME_SIZE);
1231 else 1231 else
1232 memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE); 1232 memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE);
1233 memcpy(info64->lo_encrypt_key, info.lo_encrypt_key, LO_KEY_SIZE); 1233 memcpy(info64->lo_encrypt_key, info.lo_encrypt_key, LO_KEY_SIZE);
1234 return 0; 1234 return 0;
1235 } 1235 }
1236 1236
1237 /* 1237 /*
1238 * Transfer 64-bit loop info to 32-bit compatibility structure in userspace 1238 * Transfer 64-bit loop info to 32-bit compatibility structure in userspace
1239 * - noinlined to reduce stack space usage in main part of driver 1239 * - noinlined to reduce stack space usage in main part of driver
1240 */ 1240 */
1241 static noinline int 1241 static noinline int
1242 loop_info64_to_compat(const struct loop_info64 *info64, 1242 loop_info64_to_compat(const struct loop_info64 *info64,
1243 struct compat_loop_info __user *arg) 1243 struct compat_loop_info __user *arg)
1244 { 1244 {
1245 struct compat_loop_info info; 1245 struct compat_loop_info info;
1246 1246
1247 memset(&info, 0, sizeof(info)); 1247 memset(&info, 0, sizeof(info));
1248 info.lo_number = info64->lo_number; 1248 info.lo_number = info64->lo_number;
1249 info.lo_device = info64->lo_device; 1249 info.lo_device = info64->lo_device;
1250 info.lo_inode = info64->lo_inode; 1250 info.lo_inode = info64->lo_inode;
1251 info.lo_rdevice = info64->lo_rdevice; 1251 info.lo_rdevice = info64->lo_rdevice;
1252 info.lo_offset = info64->lo_offset; 1252 info.lo_offset = info64->lo_offset;
1253 info.lo_encrypt_type = info64->lo_encrypt_type; 1253 info.lo_encrypt_type = info64->lo_encrypt_type;
1254 info.lo_encrypt_key_size = info64->lo_encrypt_key_size; 1254 info.lo_encrypt_key_size = info64->lo_encrypt_key_size;
1255 info.lo_flags = info64->lo_flags; 1255 info.lo_flags = info64->lo_flags;
1256 info.lo_init[0] = info64->lo_init[0]; 1256 info.lo_init[0] = info64->lo_init[0];
1257 info.lo_init[1] = info64->lo_init[1]; 1257 info.lo_init[1] = info64->lo_init[1];
1258 if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI) 1258 if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
1259 memcpy(info.lo_name, info64->lo_crypt_name, LO_NAME_SIZE); 1259 memcpy(info.lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
1260 else 1260 else
1261 memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE); 1261 memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE);
1262 memcpy(info.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE); 1262 memcpy(info.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
1263 1263
1264 /* error in case values were truncated */ 1264 /* error in case values were truncated */
1265 if (info.lo_device != info64->lo_device || 1265 if (info.lo_device != info64->lo_device ||
1266 info.lo_rdevice != info64->lo_rdevice || 1266 info.lo_rdevice != info64->lo_rdevice ||
1267 info.lo_inode != info64->lo_inode || 1267 info.lo_inode != info64->lo_inode ||
1268 info.lo_offset != info64->lo_offset || 1268 info.lo_offset != info64->lo_offset ||
1269 info.lo_init[0] != info64->lo_init[0] || 1269 info.lo_init[0] != info64->lo_init[0] ||
1270 info.lo_init[1] != info64->lo_init[1]) 1270 info.lo_init[1] != info64->lo_init[1])
1271 return -EOVERFLOW; 1271 return -EOVERFLOW;
1272 1272
1273 if (copy_to_user(arg, &info, sizeof(info))) 1273 if (copy_to_user(arg, &info, sizeof(info)))
1274 return -EFAULT; 1274 return -EFAULT;
1275 return 0; 1275 return 0;
1276 } 1276 }
1277 1277
1278 static int 1278 static int
1279 loop_set_status_compat(struct loop_device *lo, 1279 loop_set_status_compat(struct loop_device *lo,
1280 const struct compat_loop_info __user *arg) 1280 const struct compat_loop_info __user *arg)
1281 { 1281 {
1282 struct loop_info64 info64; 1282 struct loop_info64 info64;
1283 int ret; 1283 int ret;
1284 1284
1285 ret = loop_info64_from_compat(arg, &info64); 1285 ret = loop_info64_from_compat(arg, &info64);
1286 if (ret < 0) 1286 if (ret < 0)
1287 return ret; 1287 return ret;
1288 return loop_set_status(lo, &info64); 1288 return loop_set_status(lo, &info64);
1289 } 1289 }
1290 1290
1291 static int 1291 static int
1292 loop_get_status_compat(struct loop_device *lo, 1292 loop_get_status_compat(struct loop_device *lo,
1293 struct compat_loop_info __user *arg) 1293 struct compat_loop_info __user *arg)
1294 { 1294 {
1295 struct loop_info64 info64; 1295 struct loop_info64 info64;
1296 int err = 0; 1296 int err = 0;
1297 1297
1298 if (!arg) 1298 if (!arg)
1299 err = -EINVAL; 1299 err = -EINVAL;
1300 if (!err) 1300 if (!err)
1301 err = loop_get_status(lo, &info64); 1301 err = loop_get_status(lo, &info64);
1302 if (!err) 1302 if (!err)
1303 err = loop_info64_to_compat(&info64, arg); 1303 err = loop_info64_to_compat(&info64, arg);
1304 return err; 1304 return err;
1305 } 1305 }
1306 1306
1307 static long lo_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 1307 static long lo_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1308 { 1308 {
1309 struct inode *inode = file->f_path.dentry->d_inode; 1309 struct inode *inode = file->f_path.dentry->d_inode;
1310 struct loop_device *lo = inode->i_bdev->bd_disk->private_data; 1310 struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
1311 int err; 1311 int err;
1312 1312
1313 lock_kernel(); 1313 lock_kernel();
1314 switch(cmd) { 1314 switch(cmd) {
1315 case LOOP_SET_STATUS: 1315 case LOOP_SET_STATUS:
1316 mutex_lock(&lo->lo_ctl_mutex); 1316 mutex_lock(&lo->lo_ctl_mutex);
1317 err = loop_set_status_compat( 1317 err = loop_set_status_compat(
1318 lo, (const struct compat_loop_info __user *) arg); 1318 lo, (const struct compat_loop_info __user *) arg);
1319 mutex_unlock(&lo->lo_ctl_mutex); 1319 mutex_unlock(&lo->lo_ctl_mutex);
1320 break; 1320 break;
1321 case LOOP_GET_STATUS: 1321 case LOOP_GET_STATUS:
1322 mutex_lock(&lo->lo_ctl_mutex); 1322 mutex_lock(&lo->lo_ctl_mutex);
1323 err = loop_get_status_compat( 1323 err = loop_get_status_compat(
1324 lo, (struct compat_loop_info __user *) arg); 1324 lo, (struct compat_loop_info __user *) arg);
1325 mutex_unlock(&lo->lo_ctl_mutex); 1325 mutex_unlock(&lo->lo_ctl_mutex);
1326 break; 1326 break;
1327 case LOOP_CLR_FD: 1327 case LOOP_CLR_FD:
1328 case LOOP_GET_STATUS64: 1328 case LOOP_GET_STATUS64:
1329 case LOOP_SET_STATUS64: 1329 case LOOP_SET_STATUS64:
1330 arg = (unsigned long) compat_ptr(arg); 1330 arg = (unsigned long) compat_ptr(arg);
1331 case LOOP_SET_FD: 1331 case LOOP_SET_FD:
1332 case LOOP_CHANGE_FD: 1332 case LOOP_CHANGE_FD:
1333 err = lo_ioctl(inode, file, cmd, arg); 1333 err = lo_ioctl(inode, file, cmd, arg);
1334 break; 1334 break;
1335 default: 1335 default:
1336 err = -ENOIOCTLCMD; 1336 err = -ENOIOCTLCMD;
1337 break; 1337 break;
1338 } 1338 }
1339 unlock_kernel(); 1339 unlock_kernel();
1340 return err; 1340 return err;
1341 } 1341 }
1342 #endif 1342 #endif
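
Note the deliberate fallthrough in lo_compat_ioctl() above: the pointer-carrying commands first normalize the 32-bit user pointer with compat_ptr() and then fall into the same lo_ioctl() call as the fd-carrying commands, which pass their argument through untouched. A standalone sketch of that dispatch shape (the command values and normalize() are illustrative, not from the driver):

    #include <stdio.h>

    enum { CMD_TAKES_PTR = 1, CMD_TAKES_FD = 2 };

    static unsigned long normalize(unsigned long arg)
    {
            return arg & 0xffffffffUL;      /* stand-in for compat_ptr() */
    }

    static int dispatch(unsigned int cmd, unsigned long arg)
    {
            switch (cmd) {
            case CMD_TAKES_PTR:
                    arg = normalize(arg);
                    /* fall through */
            case CMD_TAKES_FD:
                    printf("native handler sees %#lx\n", arg);
                    return 0;
            default:
                    return -1;
            }
    }

    int main(void)
    {
            return dispatch(CMD_TAKES_PTR, 0xdeadbeefUL);
    }
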
1343 1343
1344 static int lo_open(struct inode *inode, struct file *file) 1344 static int lo_open(struct inode *inode, struct file *file)
1345 { 1345 {
1346 struct loop_device *lo = inode->i_bdev->bd_disk->private_data; 1346 struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
1347 1347
1348 mutex_lock(&lo->lo_ctl_mutex); 1348 mutex_lock(&lo->lo_ctl_mutex);
1349 lo->lo_refcnt++; 1349 lo->lo_refcnt++;
1350 mutex_unlock(&lo->lo_ctl_mutex); 1350 mutex_unlock(&lo->lo_ctl_mutex);
1351 1351
1352 return 0; 1352 return 0;
1353 } 1353 }
1354 1354
1355 static int lo_release(struct inode *inode, struct file *file) 1355 static int lo_release(struct inode *inode, struct file *file)
1356 { 1356 {
1357 struct loop_device *lo = inode->i_bdev->bd_disk->private_data; 1357 struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
1358 1358
1359 mutex_lock(&lo->lo_ctl_mutex); 1359 mutex_lock(&lo->lo_ctl_mutex);
1360 --lo->lo_refcnt; 1360 --lo->lo_refcnt;
1361 mutex_unlock(&lo->lo_ctl_mutex); 1361 mutex_unlock(&lo->lo_ctl_mutex);
1362 1362
1363 return 0; 1363 return 0;
1364 } 1364 }
1365 1365
1366 static struct block_device_operations lo_fops = { 1366 static struct block_device_operations lo_fops = {
1367 .owner = THIS_MODULE, 1367 .owner = THIS_MODULE,
1368 .open = lo_open, 1368 .open = lo_open,
1369 .release = lo_release, 1369 .release = lo_release,
1370 .ioctl = lo_ioctl, 1370 .ioctl = lo_ioctl,
1371 #ifdef CONFIG_COMPAT 1371 #ifdef CONFIG_COMPAT
1372 .compat_ioctl = lo_compat_ioctl, 1372 .compat_ioctl = lo_compat_ioctl,
1373 #endif 1373 #endif
1374 }; 1374 };
1375 1375
1376 /* 1376 /*
1377 * And now the modules code and kernel interface. 1377 * And now the modules code and kernel interface.
1378 */ 1378 */
1379 static int max_loop; 1379 static int max_loop;
1380 module_param(max_loop, int, 0); 1380 module_param(max_loop, int, 0);
1381 MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); 1381 MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
1382 MODULE_LICENSE("GPL"); 1382 MODULE_LICENSE("GPL");
1383 MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); 1383 MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);
1384 1384
1385 int loop_register_transfer(struct loop_func_table *funcs) 1385 int loop_register_transfer(struct loop_func_table *funcs)
1386 { 1386 {
1387 unsigned int n = funcs->number; 1387 unsigned int n = funcs->number;
1388 1388
1389 if (n >= MAX_LO_CRYPT || xfer_funcs[n]) 1389 if (n >= MAX_LO_CRYPT || xfer_funcs[n])
1390 return -EINVAL; 1390 return -EINVAL;
1391 xfer_funcs[n] = funcs; 1391 xfer_funcs[n] = funcs;
1392 return 0; 1392 return 0;
1393 } 1393 }
1394 1394
1395 int loop_unregister_transfer(int number) 1395 int loop_unregister_transfer(int number)
1396 { 1396 {
1397 unsigned int n = number; 1397 unsigned int n = number;
1398 struct loop_device *lo; 1398 struct loop_device *lo;
1399 struct loop_func_table *xfer; 1399 struct loop_func_table *xfer;
1400 1400
1401 if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL) 1401 if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL)
1402 return -EINVAL; 1402 return -EINVAL;
1403 1403
1404 xfer_funcs[n] = NULL; 1404 xfer_funcs[n] = NULL;
1405 1405
1406 list_for_each_entry(lo, &loop_devices, lo_list) { 1406 list_for_each_entry(lo, &loop_devices, lo_list) {
1407 mutex_lock(&lo->lo_ctl_mutex); 1407 mutex_lock(&lo->lo_ctl_mutex);
1408 1408
1409 if (lo->lo_encryption == xfer) 1409 if (lo->lo_encryption == xfer)
1410 loop_release_xfer(lo); 1410 loop_release_xfer(lo);
1411 1411
1412 mutex_unlock(&lo->lo_ctl_mutex); 1412 mutex_unlock(&lo->lo_ctl_mutex);
1413 } 1413 }
1414 1414
1415 return 0; 1415 return 0;
1416 } 1416 }
1417 1417
1418 EXPORT_SYMBOL(loop_register_transfer); 1418 EXPORT_SYMBOL(loop_register_transfer);
1419 EXPORT_SYMBOL(loop_unregister_transfer); 1419 EXPORT_SYMBOL(loop_unregister_transfer);
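
These two exports are the hook used by out-of-tree transfer modules such as cryptoloop. A hedged sketch of a module plugging into the table; the no-op transfer and the slot choice are illustrative only, and a real module must pick a free slot below MAX_LO_CRYPT and actually move the data between the pages:

    #include <linux/module.h>
    #include <linux/loop.h>

    static int noop_transfer(struct loop_device *lo, int cmd,
                             struct page *raw_page, unsigned raw_off,
                             struct page *loop_page, unsigned loop_off,
                             int size, sector_t real_block)
    {
            return 0;       /* a real transfer copies/encrypts the data */
    }

    static struct loop_func_table noop_funcs = {
            .number   = LO_CRYPT_CRYPTOAPI, /* must be an unused slot */
            .transfer = noop_transfer,
            .owner    = THIS_MODULE,
    };

    static int __init noop_init(void)
    {
            return loop_register_transfer(&noop_funcs);
    }

    static void __exit noop_exit(void)
    {
            loop_unregister_transfer(LO_CRYPT_CRYPTOAPI);
    }

    module_init(noop_init);
    module_exit(noop_exit);
    MODULE_LICENSE("GPL");
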
1420 1420
1421 static struct loop_device *loop_alloc(int i) 1421 static struct loop_device *loop_alloc(int i)
1422 { 1422 {
1423 struct loop_device *lo; 1423 struct loop_device *lo;
1424 struct gendisk *disk; 1424 struct gendisk *disk;
1425 1425
1426 lo = kzalloc(sizeof(*lo), GFP_KERNEL); 1426 lo = kzalloc(sizeof(*lo), GFP_KERNEL);
1427 if (!lo) 1427 if (!lo)
1428 goto out; 1428 goto out;
1429 1429
1430 lo->lo_queue = blk_alloc_queue(GFP_KERNEL); 1430 lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
1431 if (!lo->lo_queue) 1431 if (!lo->lo_queue)
1432 goto out_free_dev; 1432 goto out_free_dev;
1433 1433
1434 disk = lo->lo_disk = alloc_disk(1); 1434 disk = lo->lo_disk = alloc_disk(1);
1435 if (!disk) 1435 if (!disk)
1436 goto out_free_queue; 1436 goto out_free_queue;
1437 1437
1438 mutex_init(&lo->lo_ctl_mutex); 1438 mutex_init(&lo->lo_ctl_mutex);
1439 lo->lo_number = i; 1439 lo->lo_number = i;
1440 lo->lo_thread = NULL; 1440 lo->lo_thread = NULL;
1441 init_waitqueue_head(&lo->lo_event); 1441 init_waitqueue_head(&lo->lo_event);
1442 spin_lock_init(&lo->lo_lock); 1442 spin_lock_init(&lo->lo_lock);
1443 disk->major = LOOP_MAJOR; 1443 disk->major = LOOP_MAJOR;
1444 disk->first_minor = i; 1444 disk->first_minor = i;
1445 disk->fops = &lo_fops; 1445 disk->fops = &lo_fops;
1446 disk->private_data = lo; 1446 disk->private_data = lo;
1447 disk->queue = lo->lo_queue; 1447 disk->queue = lo->lo_queue;
1448 sprintf(disk->disk_name, "loop%d", i); 1448 sprintf(disk->disk_name, "loop%d", i);
1449 return lo; 1449 return lo;
1450 1450
1451 out_free_queue: 1451 out_free_queue:
1452 blk_cleanup_queue(lo->lo_queue); 1452 blk_cleanup_queue(lo->lo_queue);
1453 out_free_dev: 1453 out_free_dev:
1454 kfree(lo); 1454 kfree(lo);
1455 out: 1455 out:
1456 return NULL; 1456 return NULL;
1457 } 1457 }
1458 1458
1459 static void loop_free(struct loop_device *lo) 1459 static void loop_free(struct loop_device *lo)
1460 { 1460 {
1461 blk_cleanup_queue(lo->lo_queue); 1461 blk_cleanup_queue(lo->lo_queue);
1462 put_disk(lo->lo_disk); 1462 put_disk(lo->lo_disk);
1463 list_del(&lo->lo_list); 1463 list_del(&lo->lo_list);
1464 kfree(lo); 1464 kfree(lo);
1465 } 1465 }
1466 1466
1467 static struct loop_device *loop_init_one(int i) 1467 static struct loop_device *loop_init_one(int i)
1468 { 1468 {
1469 struct loop_device *lo; 1469 struct loop_device *lo;
1470 1470
1471 list_for_each_entry(lo, &loop_devices, lo_list) { 1471 list_for_each_entry(lo, &loop_devices, lo_list) {
1472 if (lo->lo_number == i) 1472 if (lo->lo_number == i)
1473 return lo; 1473 return lo;
1474 } 1474 }
1475 1475
1476 lo = loop_alloc(i); 1476 lo = loop_alloc(i);
1477 if (lo) { 1477 if (lo) {
1478 add_disk(lo->lo_disk); 1478 add_disk(lo->lo_disk);
1479 list_add_tail(&lo->lo_list, &loop_devices); 1479 list_add_tail(&lo->lo_list, &loop_devices);
1480 } 1480 }
1481 return lo; 1481 return lo;
1482 } 1482 }
1483 1483
1484 static void loop_del_one(struct loop_device *lo) 1484 static void loop_del_one(struct loop_device *lo)
1485 { 1485 {
1486 del_gendisk(lo->lo_disk); 1486 del_gendisk(lo->lo_disk);
1487 loop_free(lo); 1487 loop_free(lo);
1488 } 1488 }
1489 1489
1490 static struct kobject *loop_probe(dev_t dev, int *part, void *data) 1490 static struct kobject *loop_probe(dev_t dev, int *part, void *data)
1491 { 1491 {
1492 struct loop_device *lo; 1492 struct loop_device *lo;
1493 struct kobject *kobj; 1493 struct kobject *kobj;
1494 1494
1495 mutex_lock(&loop_devices_mutex); 1495 mutex_lock(&loop_devices_mutex);
1496 lo = loop_init_one(dev & MINORMASK); 1496 lo = loop_init_one(dev & MINORMASK);
1497 kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM); 1497 kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM);
1498 mutex_unlock(&loop_devices_mutex); 1498 mutex_unlock(&loop_devices_mutex);
1499 1499
1500 *part = 0; 1500 *part = 0;
1501 return kobj; 1501 return kobj;
1502 } 1502 }
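
loop_probe() is what makes the on-demand instantiation described in the comment inside loop_init() below work: opening a loop node whose minor has no device yet lands here via blk_register_region(). A userspace sketch (LOOP_MAJOR is 7; with the default of 8 devices, minor 8 is the first that does not pre-exist):

    #include <fcntl.h>
    #include <sys/stat.h>
    #include <sys/sysmacros.h>
    #include <unistd.h>

    int open_ninth_loop(void)
    {
            /* Creating the node and opening it triggers loop_probe(). */
            mknod("/dev/loop8", S_IFBLK | 0600, makedev(7, 8));
            return open("/dev/loop8", O_RDWR);
    }
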

1503 1503
1504 static int __init loop_init(void) 1504 static int __init loop_init(void)
1505 { 1505 {
1506 int i, nr; 1506 int i, nr;
1507 unsigned long range; 1507 unsigned long range;
1508 struct loop_device *lo, *next; 1508 struct loop_device *lo, *next;
1509 1509
1510 /* 1510 /*
1511 * The loop module can now instantiate the underlying device 1511 * The loop module can now instantiate the underlying device
1512 * structure on demand, provided that a dev node to access it exists. 1512 * structure on demand, provided that a dev node to access it exists.
1513 * However, this will not work well with user space tools that don't 1513 * However, this will not work well with user space tools that don't
1514 * know about this "feature". In order not to break any existing 1514 * know about this "feature". In order not to break any existing
1515 * tools, we do the following: 1515 * tools, we do the following:
1516 * 1516 *
1517 * (1) if max_loop is specified, create that many devices upfront; this 1517 * (1) if max_loop is specified, create that many devices upfront; this
1518 * also becomes a hard limit. 1518 * also becomes a hard limit.
1519 * (2) if max_loop is not specified, create 8 loop devices on module 1519 * (2) if max_loop is not specified, create 8 loop devices on module
1520 * load; users can further extend the set by creating dev nodes 1520 * load; users can further extend the set by creating dev nodes
1521 * themselves and having the kernel automatically instantiate the 1521 * themselves and having the kernel automatically instantiate the
1522 * actual device on demand. 1522 * actual device on demand.
1523 */ 1523 */
1524 if (max_loop > 1UL << MINORBITS) 1524 if (max_loop > 1UL << MINORBITS)
1525 return -EINVAL; 1525 return -EINVAL;
1526 1526
1527 if (max_loop) { 1527 if (max_loop) {
1528 nr = max_loop; 1528 nr = max_loop;
1529 range = max_loop; 1529 range = max_loop;
1530 } else { 1530 } else {
1531 nr = 8; 1531 nr = 8;
1532 range = 1UL << MINORBITS; 1532 range = 1UL << MINORBITS;
1533 } 1533 }
1534 1534
1535 if (register_blkdev(LOOP_MAJOR, "loop")) 1535 if (register_blkdev(LOOP_MAJOR, "loop"))
1536 return -EIO; 1536 return -EIO;
1537 1537
1538 for (i = 0; i < nr; i++) { 1538 for (i = 0; i < nr; i++) {
1539 lo = loop_alloc(i); 1539 lo = loop_alloc(i);
1540 if (!lo) 1540 if (!lo)
1541 goto Enomem; 1541 goto Enomem;
1542 list_add_tail(&lo->lo_list, &loop_devices); 1542 list_add_tail(&lo->lo_list, &loop_devices);
1543 } 1543 }
1544 1544
1545 /* point of no return */ 1545 /* point of no return */
1546 1546
1547 list_for_each_entry(lo, &loop_devices, lo_list) 1547 list_for_each_entry(lo, &loop_devices, lo_list)
1548 add_disk(lo->lo_disk); 1548 add_disk(lo->lo_disk);
1549 1549
1550 blk_register_region(MKDEV(LOOP_MAJOR, 0), range, 1550 blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
1551 THIS_MODULE, loop_probe, NULL, NULL); 1551 THIS_MODULE, loop_probe, NULL, NULL);
1552 1552
1553 printk(KERN_INFO "loop: module loaded\n"); 1553 printk(KERN_INFO "loop: module loaded\n");
1554 return 0; 1554 return 0;
1555 1555
1556 Enomem: 1556 Enomem:
1557 printk(KERN_INFO "loop: out of memory\n"); 1557 printk(KERN_INFO "loop: out of memory\n");
1558 1558
1559 list_for_each_entry_safe(lo, next, &loop_devices, lo_list) 1559 list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
1560 loop_free(lo); 1560 loop_free(lo);
1561 1561
1562 unregister_blkdev(LOOP_MAJOR, "loop"); 1562 unregister_blkdev(LOOP_MAJOR, "loop");
1563 return -ENOMEM; 1563 return -ENOMEM;
1564 } 1564 }
1565 1565
1566 static void __exit loop_exit(void) 1566 static void __exit loop_exit(void)
1567 { 1567 {
1568 unsigned long range; 1568 unsigned long range;
1569 struct loop_device *lo, *next; 1569 struct loop_device *lo, *next;
1570 1570
1571 range = max_loop ? max_loop : 1UL << MINORBITS; 1571 range = max_loop ? max_loop : 1UL << MINORBITS;
1572 1572
1573 list_for_each_entry_safe(lo, next, &loop_devices, lo_list) 1573 list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
1574 loop_del_one(lo); 1574 loop_del_one(lo);
1575 1575
1576 blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); 1576 blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
1577 if (unregister_blkdev(LOOP_MAJOR, "loop")) 1577 if (unregister_blkdev(LOOP_MAJOR, "loop"))
1578 printk(KERN_WARNING "loop: cannot unregister blkdev\n"); 1578 printk(KERN_WARNING "loop: cannot unregister blkdev\n");
1579 } 1579 }
1580 1580
1581 module_init(loop_init); 1581 module_init(loop_init);
1582 module_exit(loop_exit); 1582 module_exit(loop_exit);
1583 1583
1584 #ifndef MODULE 1584 #ifndef MODULE
1585 static int __init max_loop_setup(char *str) 1585 static int __init max_loop_setup(char *str)
1586 { 1586 {
1587 max_loop = simple_strtol(str, NULL, 0); 1587 max_loop = simple_strtol(str, NULL, 0);
1588 return 1; 1588 return 1;
1589 } 1589 }
1590 1590
1591 __setup("max_loop=", max_loop_setup); 1591 __setup("max_loop=", max_loop_setup);
1592 #endif 1592 #endif
1593 1593
fs/nfsd/vfs.c
1 #define MSNFS /* HACK HACK */ 1 #define MSNFS /* HACK HACK */
2 /* 2 /*
3 * linux/fs/nfsd/vfs.c 3 * linux/fs/nfsd/vfs.c
4 * 4 *
5 * File operations used by nfsd. Some of these have been ripped from 5 * File operations used by nfsd. Some of these have been ripped from
6 * other parts of the kernel because they weren't exported, others 6 * other parts of the kernel because they weren't exported, others
7 * are partial duplicates with added or changed functionality. 7 * are partial duplicates with added or changed functionality.
8 * 8 *
9 * Note that several functions dget() the dentry upon which they want 9 * Note that several functions dget() the dentry upon which they want
10 * to act, most notably those that create directory entries. Response 10 * to act, most notably those that create directory entries. Response
11 * dentries are dput()'d if necessary in the release callback. 11 * dentries are dput()'d if necessary in the release callback.
12 * So if you notice code paths that apparently fail to dput() the 12 * So if you notice code paths that apparently fail to dput() the
13 * dentry, don't worry--they have been taken care of. 13 * dentry, don't worry--they have been taken care of.
14 * 14 *
15 * Copyright (C) 1995-1999 Olaf Kirch <okir@monad.swb.de> 15 * Copyright (C) 1995-1999 Olaf Kirch <okir@monad.swb.de>
16 * Zerocopy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp> 16 * Zerocopy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp>
17 */ 17 */
18 18
19 #include <linux/string.h> 19 #include <linux/string.h>
20 #include <linux/time.h> 20 #include <linux/time.h>
21 #include <linux/errno.h> 21 #include <linux/errno.h>
22 #include <linux/fs.h> 22 #include <linux/fs.h>
23 #include <linux/file.h> 23 #include <linux/file.h>
24 #include <linux/mount.h> 24 #include <linux/mount.h>
25 #include <linux/major.h> 25 #include <linux/major.h>
26 #include <linux/splice.h> 26 #include <linux/splice.h>
27 #include <linux/proc_fs.h> 27 #include <linux/proc_fs.h>
28 #include <linux/stat.h> 28 #include <linux/stat.h>
29 #include <linux/fcntl.h> 29 #include <linux/fcntl.h>
30 #include <linux/net.h> 30 #include <linux/net.h>
31 #include <linux/unistd.h> 31 #include <linux/unistd.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/pagemap.h> 33 #include <linux/pagemap.h>
34 #include <linux/in.h> 34 #include <linux/in.h>
35 #include <linux/module.h> 35 #include <linux/module.h>
36 #include <linux/namei.h> 36 #include <linux/namei.h>
37 #include <linux/vfs.h> 37 #include <linux/vfs.h>
38 #include <linux/delay.h> 38 #include <linux/delay.h>
39 #include <linux/sunrpc/svc.h> 39 #include <linux/sunrpc/svc.h>
40 #include <linux/nfsd/nfsd.h> 40 #include <linux/nfsd/nfsd.h>
41 #ifdef CONFIG_NFSD_V3 41 #ifdef CONFIG_NFSD_V3
42 #include <linux/nfs3.h> 42 #include <linux/nfs3.h>
43 #include <linux/nfsd/xdr3.h> 43 #include <linux/nfsd/xdr3.h>
44 #endif /* CONFIG_NFSD_V3 */ 44 #endif /* CONFIG_NFSD_V3 */
45 #include <linux/nfsd/nfsfh.h> 45 #include <linux/nfsd/nfsfh.h>
46 #include <linux/quotaops.h> 46 #include <linux/quotaops.h>
47 #include <linux/fsnotify.h> 47 #include <linux/fsnotify.h>
48 #include <linux/posix_acl.h> 48 #include <linux/posix_acl.h>
49 #include <linux/posix_acl_xattr.h> 49 #include <linux/posix_acl_xattr.h>
50 #include <linux/xattr.h> 50 #include <linux/xattr.h>
51 #ifdef CONFIG_NFSD_V4 51 #ifdef CONFIG_NFSD_V4
52 #include <linux/nfs4.h> 52 #include <linux/nfs4.h>
53 #include <linux/nfs4_acl.h> 53 #include <linux/nfs4_acl.h>
54 #include <linux/nfsd_idmap.h> 54 #include <linux/nfsd_idmap.h>
55 #include <linux/security.h> 55 #include <linux/security.h>
56 #endif /* CONFIG_NFSD_V4 */ 56 #endif /* CONFIG_NFSD_V4 */
57 #include <linux/jhash.h> 57 #include <linux/jhash.h>
58 58
59 #include <asm/uaccess.h> 59 #include <asm/uaccess.h>
60 60
61 #define NFSDDBG_FACILITY NFSDDBG_FILEOP 61 #define NFSDDBG_FACILITY NFSDDBG_FILEOP
62 62
63 63
64 /* We must ignore files (but only files) which might have mandatory 64 /* We must ignore files (but only files) which might have mandatory
65 * locks on them because there is no way to know if the accessor has 65 * locks on them because there is no way to know if the accessor has
66 * the lock. 66 * the lock.
67 */ 67 */
68 #define IS_ISMNDLK(i) (S_ISREG((i)->i_mode) && MANDATORY_LOCK(i)) 68 #define IS_ISMNDLK(i) (S_ISREG((i)->i_mode) && MANDATORY_LOCK(i))
69 69
70 /* 70 /*
71 * This is a cache of readahead params that help us choose the proper 71 * This is a cache of readahead params that help us choose the proper
72 * readahead strategy. Initially, we set all readahead parameters to 0 72 * readahead strategy. Initially, we set all readahead parameters to 0
73 * and let the VFS handle things. 73 * and let the VFS handle things.
74 * If you increase the number of cached files very much, you'll need to 74 * If you increase the number of cached files very much, you'll need to
75 * add a hash table here. 75 * add a hash table here.
76 */ 76 */
77 struct raparms { 77 struct raparms {
78 struct raparms *p_next; 78 struct raparms *p_next;
79 unsigned int p_count; 79 unsigned int p_count;
80 ino_t p_ino; 80 ino_t p_ino;
81 dev_t p_dev; 81 dev_t p_dev;
82 int p_set; 82 int p_set;
83 struct file_ra_state p_ra; 83 struct file_ra_state p_ra;
84 unsigned int p_hindex; 84 unsigned int p_hindex;
85 }; 85 };
86 86
87 struct raparm_hbucket { 87 struct raparm_hbucket {
88 struct raparms *pb_head; 88 struct raparms *pb_head;
89 spinlock_t pb_lock; 89 spinlock_t pb_lock;
90 } ____cacheline_aligned_in_smp; 90 } ____cacheline_aligned_in_smp;
91 91
92 static struct raparms * raparml; 92 static struct raparms * raparml;
93 #define RAPARM_HASH_BITS 4 93 #define RAPARM_HASH_BITS 4
94 #define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) 94 #define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS)
95 #define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) 95 #define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
96 static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; 96 static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE];
97 97
98 /* 98 /*
99 * Called from nfsd_lookup and encode_dirent. Check if we have crossed 99 * Called from nfsd_lookup and encode_dirent. Check if we have crossed
100 * a mount point. 100 * a mount point.
101 * Returns -EAGAIN or -ETIMEDOUT leaving *dpp and *expp unchanged, 101 * Returns -EAGAIN or -ETIMEDOUT leaving *dpp and *expp unchanged,
102 * or nfs_ok having possibly changed *dpp and *expp 102 * or nfs_ok having possibly changed *dpp and *expp
103 */ 103 */
104 int 104 int
105 nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, 105 nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
106 struct svc_export **expp) 106 struct svc_export **expp)
107 { 107 {
108 struct svc_export *exp = *expp, *exp2 = NULL; 108 struct svc_export *exp = *expp, *exp2 = NULL;
109 struct dentry *dentry = *dpp; 109 struct dentry *dentry = *dpp;
110 struct vfsmount *mnt = mntget(exp->ex_mnt); 110 struct vfsmount *mnt = mntget(exp->ex_mnt);
111 struct dentry *mounts = dget(dentry); 111 struct dentry *mounts = dget(dentry);
112 int err = 0; 112 int err = 0;
113 113
114 while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)); 114 while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
115 115
116 exp2 = exp_get_by_name(exp->ex_client, mnt, mounts, &rqstp->rq_chandle); 116 exp2 = exp_get_by_name(exp->ex_client, mnt, mounts, &rqstp->rq_chandle);
117 if (IS_ERR(exp2)) { 117 if (IS_ERR(exp2)) {
118 err = PTR_ERR(exp2); 118 err = PTR_ERR(exp2);
119 dput(mounts); 119 dput(mounts);
120 mntput(mnt); 120 mntput(mnt);
121 goto out; 121 goto out;
122 } 122 }
123 if (exp2 && ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2))) { 123 if (exp2 && ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2))) {
124 /* successfully crossed mount point */ 124 /* successfully crossed mount point */
125 exp_put(exp); 125 exp_put(exp);
126 *expp = exp2; 126 *expp = exp2;
127 dput(dentry); 127 dput(dentry);
128 *dpp = mounts; 128 *dpp = mounts;
129 } else { 129 } else {
130 if (exp2) exp_put(exp2); 130 if (exp2) exp_put(exp2);
131 dput(mounts); 131 dput(mounts);
132 } 132 }
133 mntput(mnt); 133 mntput(mnt);
134 out: 134 out:
135 return err; 135 return err;
136 } 136 }
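
The follow_down() loop above walks to the top of any stack of mounts on the directory; the crossing is then accepted only under the policy restated below. A standalone sketch (the flag bit is illustrative; EX_NOHIDE() plays the child_nohide role in the real code):

    #define CROSSMNT 0x1    /* stand-in for NFSEXP_CROSSMOUNT */

    /* Crossing is allowed if the parent export opts in for all of its
     * submounts, or the child export is itself marked nohide. */
    static int may_cross(unsigned int parent_flags, int child_nohide)
    {
            return (parent_flags & CROSSMNT) || child_nohide;
    }
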
137 137
138 /* 138 /*
139 * Look up one component of a pathname. 139 * Look up one component of a pathname.
140 * N.B. After this call _both_ fhp and resfh need an fh_put 140 * N.B. After this call _both_ fhp and resfh need an fh_put
141 * 141 *
142 * If the lookup would cross a mountpoint, and the mounted filesystem 142 * If the lookup would cross a mountpoint, and the mounted filesystem
143 * is exported to the client with NFSEXP_NOHIDE, then the lookup is 143 * is exported to the client with NFSEXP_NOHIDE, then the lookup is
144 * accepted as it stands and the mounted directory is 144 * accepted as it stands and the mounted directory is
145 * returned. Otherwise the covered directory is returned. 145 * returned. Otherwise the covered directory is returned.
146 * NOTE: this mountpoint crossing is not supported properly by all 146 * NOTE: this mountpoint crossing is not supported properly by all
147 * clients and is explicitly disallowed for NFSv3 147 * clients and is explicitly disallowed for NFSv3
148 * NeilBrown <neilb@cse.unsw.edu.au> 148 * NeilBrown <neilb@cse.unsw.edu.au>
149 */ 149 */
150 __be32 150 __be32
151 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, 151 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
152 int len, struct svc_fh *resfh) 152 int len, struct svc_fh *resfh)
153 { 153 {
154 struct svc_export *exp; 154 struct svc_export *exp;
155 struct dentry *dparent; 155 struct dentry *dparent;
156 struct dentry *dentry; 156 struct dentry *dentry;
157 __be32 err; 157 __be32 err;
158 int host_err; 158 int host_err;
159 159
160 dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); 160 dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
161 161
162 /* Obtain dentry and export. */ 162 /* Obtain dentry and export. */
163 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_EXEC); 163 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_EXEC);
164 if (err) 164 if (err)
165 return err; 165 return err;
166 166
167 dparent = fhp->fh_dentry; 167 dparent = fhp->fh_dentry;
168 exp = fhp->fh_export; 168 exp = fhp->fh_export;
169 exp_get(exp); 169 exp_get(exp);
170 170
171 err = nfserr_acces; 171 err = nfserr_acces;
172 172
173 /* Lookup the name, but don't follow links */ 173 /* Lookup the name, but don't follow links */
174 if (isdotent(name, len)) { 174 if (isdotent(name, len)) {
175 if (len==1) 175 if (len==1)
176 dentry = dget(dparent); 176 dentry = dget(dparent);
177 else if (dparent != exp->ex_dentry) { 177 else if (dparent != exp->ex_dentry) {
178 dentry = dget_parent(dparent); 178 dentry = dget_parent(dparent);
179 } else if (!EX_NOHIDE(exp)) 179 } else if (!EX_NOHIDE(exp))
180 dentry = dget(dparent); /* .. == . just like at / */ 180 dentry = dget(dparent); /* .. == . just like at / */
181 else { 181 else {
182 /* checking mountpoint crossing is very different when stepping up */ 182 /* checking mountpoint crossing is very different when stepping up */
183 struct svc_export *exp2 = NULL; 183 struct svc_export *exp2 = NULL;
184 struct dentry *dp; 184 struct dentry *dp;
185 struct vfsmount *mnt = mntget(exp->ex_mnt); 185 struct vfsmount *mnt = mntget(exp->ex_mnt);
186 dentry = dget(dparent); 186 dentry = dget(dparent);
187 while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry)) 187 while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry))
188 ; 188 ;
189 dp = dget_parent(dentry); 189 dp = dget_parent(dentry);
190 dput(dentry); 190 dput(dentry);
191 dentry = dp; 191 dentry = dp;
192 192
193 exp2 = exp_parent(exp->ex_client, mnt, dentry, 193 exp2 = exp_parent(exp->ex_client, mnt, dentry,
194 &rqstp->rq_chandle); 194 &rqstp->rq_chandle);
195 if (IS_ERR(exp2)) { 195 if (IS_ERR(exp2)) {
196 host_err = PTR_ERR(exp2); 196 host_err = PTR_ERR(exp2);
197 dput(dentry); 197 dput(dentry);
198 mntput(mnt); 198 mntput(mnt);
199 goto out_nfserr; 199 goto out_nfserr;
200 } 200 }
201 if (!exp2) { 201 if (!exp2) {
202 dput(dentry); 202 dput(dentry);
203 dentry = dget(dparent); 203 dentry = dget(dparent);
204 } else { 204 } else {
205 exp_put(exp); 205 exp_put(exp);
206 exp = exp2; 206 exp = exp2;
207 } 207 }
208 mntput(mnt); 208 mntput(mnt);
209 } 209 }
210 } else { 210 } else {
211 fh_lock(fhp); 211 fh_lock(fhp);
212 dentry = lookup_one_len(name, dparent, len); 212 dentry = lookup_one_len(name, dparent, len);
213 host_err = PTR_ERR(dentry); 213 host_err = PTR_ERR(dentry);
214 if (IS_ERR(dentry)) 214 if (IS_ERR(dentry))
215 goto out_nfserr; 215 goto out_nfserr;
216 /* 216 /*
217 * check if we have crossed a mount point ... 217 * check if we have crossed a mount point ...
218 */ 218 */
219 if (d_mountpoint(dentry)) { 219 if (d_mountpoint(dentry)) {
220 if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { 220 if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
221 dput(dentry); 221 dput(dentry);
222 goto out_nfserr; 222 goto out_nfserr;
223 } 223 }
224 } 224 }
225 } 225 }
226 /* 226 /*
227 * Note: we compose the file handle now, but as the 227 * Note: we compose the file handle now, but as the
228 * dentry may be negative, it may need to be updated. 228 * dentry may be negative, it may need to be updated.
229 */ 229 */
230 err = fh_compose(resfh, exp, dentry, fhp); 230 err = fh_compose(resfh, exp, dentry, fhp);
231 if (!err && !dentry->d_inode) 231 if (!err && !dentry->d_inode)
232 err = nfserr_noent; 232 err = nfserr_noent;
233 dput(dentry); 233 dput(dentry);
234 out: 234 out:
235 exp_put(exp); 235 exp_put(exp);
236 return err; 236 return err;
237 237
238 out_nfserr: 238 out_nfserr:
239 err = nfserrno(host_err); 239 err = nfserrno(host_err);
240 goto out; 240 goto out;
241 } 241 }
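
isdotent(), defined elsewhere in nfsd, gates the special-cased branch above that handles "." and "..". A behavior-equivalent sketch of what it is assumed to test:

    /* True for the names "." and ".." (assumed semantics). */
    static int isdotent_sketch(const char *name, int len)
    {
            return name[0] == '.' &&
                   (len == 1 || (len == 2 && name[1] == '.'));
    }
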
242 242
243 /* 243 /*
244 * Set various file attributes. 244 * Set various file attributes.
245 * N.B. After this call fhp needs an fh_put 245 * N.B. After this call fhp needs an fh_put
246 */ 246 */
247 __be32 247 __be32
248 nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, 248 nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
249 int check_guard, time_t guardtime) 249 int check_guard, time_t guardtime)
250 { 250 {
251 struct dentry *dentry; 251 struct dentry *dentry;
252 struct inode *inode; 252 struct inode *inode;
253 int accmode = MAY_SATTR; 253 int accmode = MAY_SATTR;
254 int ftype = 0; 254 int ftype = 0;
255 int imode; 255 int imode;
256 __be32 err; 256 __be32 err;
257 int host_err; 257 int host_err;
258 int size_change = 0; 258 int size_change = 0;
259 259
260 if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) 260 if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
261 accmode |= MAY_WRITE|MAY_OWNER_OVERRIDE; 261 accmode |= MAY_WRITE|MAY_OWNER_OVERRIDE;
262 if (iap->ia_valid & ATTR_SIZE) 262 if (iap->ia_valid & ATTR_SIZE)
263 ftype = S_IFREG; 263 ftype = S_IFREG;
264 264
265 /* Get inode */ 265 /* Get inode */
266 err = fh_verify(rqstp, fhp, ftype, accmode); 266 err = fh_verify(rqstp, fhp, ftype, accmode);
267 if (err) 267 if (err)
268 goto out; 268 goto out;
269 269
270 dentry = fhp->fh_dentry; 270 dentry = fhp->fh_dentry;
271 inode = dentry->d_inode; 271 inode = dentry->d_inode;
272 272
273 /* Ignore any mode updates on symlinks */ 273 /* Ignore any mode updates on symlinks */
274 if (S_ISLNK(inode->i_mode)) 274 if (S_ISLNK(inode->i_mode))
275 iap->ia_valid &= ~ATTR_MODE; 275 iap->ia_valid &= ~ATTR_MODE;
276 276
277 if (!iap->ia_valid) 277 if (!iap->ia_valid)
278 goto out; 278 goto out;
279 279
280 /* NFSv2 does not differentiate between "set-[ac]time-to-now" 280 /* NFSv2 does not differentiate between "set-[ac]time-to-now"
281 * which only requires access, and "set-[ac]time-to-X" which 281 * which only requires access, and "set-[ac]time-to-X" which
282 * requires ownership. 282 * requires ownership.
283 * So if it looks like it might be "set both to the same time which 283 * So if it looks like it might be "set both to the same time which
284 * is close to now", and if inode_change_ok fails, then we 284 * is close to now", and if inode_change_ok fails, then we
285 * convert to "set to now" instead of "set to explicit time" 285 * convert to "set to now" instead of "set to explicit time"
286 * 286 *
287 * We only call inode_change_ok as the last test as technically 287 * We only call inode_change_ok as the last test as technically
288 * it is not an interface that we should be using. It is only 288 * it is not an interface that we should be using. It is only
289 * valid if the filesystem does not define its own i_op->setattr. 289 * valid if the filesystem does not define its own i_op->setattr.
290 */ 290 */
291 #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) 291 #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET)
292 #define MAX_TOUCH_TIME_ERROR (30*60) 292 #define MAX_TOUCH_TIME_ERROR (30*60)
293 if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET 293 if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET
294 && iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec 294 && iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec
295 ) { 295 ) {
296 /* Looks probable. Now just make sure time is in the right ballpark. 296 /* Looks probable. Now just make sure time is in the right ballpark.
297 * Solaris, at least, doesn't seem to care what the time request is. 297 * Solaris, at least, doesn't seem to care what the time request is.
298 * We require it be within 30 minutes of now. 298 * We require it be within 30 minutes of now.
299 */ 299 */
300 time_t delta = iap->ia_atime.tv_sec - get_seconds(); 300 time_t delta = iap->ia_atime.tv_sec - get_seconds();
301 if (delta<0) delta = -delta; 301 if (delta<0) delta = -delta;
302 if (delta < MAX_TOUCH_TIME_ERROR && 302 if (delta < MAX_TOUCH_TIME_ERROR &&
303 inode_change_ok(inode, iap) != 0) { 303 inode_change_ok(inode, iap) != 0) {
304 /* turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME 304 /* turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME
305 * this will cause notify_change to set these times to "now" 305 * this will cause notify_change to set these times to "now"
306 */ 306 */
307 iap->ia_valid &= ~BOTH_TIME_SET; 307 iap->ia_valid &= ~BOTH_TIME_SET;
308 } 308 }
309 } 309 }
310 310
311 /* The size case is special. It changes the file as well as the attributes. */ 311 /* The size case is special. It changes the file as well as the attributes. */
312 if (iap->ia_valid & ATTR_SIZE) { 312 if (iap->ia_valid & ATTR_SIZE) {
313 if (iap->ia_size < inode->i_size) { 313 if (iap->ia_size < inode->i_size) {
314 err = nfsd_permission(fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE); 314 err = nfsd_permission(fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE);
315 if (err) 315 if (err)
316 goto out; 316 goto out;
317 } 317 }
318 318
319 /* 319 /*
320 * If we are changing the size of the file, then 320 * If we are changing the size of the file, then
321 * we need to break all leases. 321 * we need to break all leases.
322 */ 322 */
323 host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK); 323 host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK);
324 if (host_err == -EWOULDBLOCK) 324 if (host_err == -EWOULDBLOCK)
325 host_err = -ETIMEDOUT; 325 host_err = -ETIMEDOUT;
326 if (host_err) /* ENOMEM or EWOULDBLOCK */ 326 if (host_err) /* ENOMEM or EWOULDBLOCK */
327 goto out_nfserr; 327 goto out_nfserr;
328 328
329 host_err = get_write_access(inode); 329 host_err = get_write_access(inode);
330 if (host_err) 330 if (host_err)
331 goto out_nfserr; 331 goto out_nfserr;
332 332
333 size_change = 1; 333 size_change = 1;
334 host_err = locks_verify_truncate(inode, NULL, iap->ia_size); 334 host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
335 if (host_err) { 335 if (host_err) {
336 put_write_access(inode); 336 put_write_access(inode);
337 goto out_nfserr; 337 goto out_nfserr;
338 } 338 }
339 DQUOT_INIT(inode); 339 DQUOT_INIT(inode);
340 } 340 }
341 341
342 imode = inode->i_mode; 342 imode = inode->i_mode;
343 if (iap->ia_valid & ATTR_MODE) { 343 if (iap->ia_valid & ATTR_MODE) {
344 iap->ia_mode &= S_IALLUGO; 344 iap->ia_mode &= S_IALLUGO;
345 imode = iap->ia_mode |= (imode & ~S_IALLUGO); 345 imode = iap->ia_mode |= (imode & ~S_IALLUGO);
346 } 346 }
347 347
348 /* Revoke setuid/setgid bit on chown/chgrp */ 348 /* Revoke setuid/setgid bit on chown/chgrp */
349 if ((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) 349 if ((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid)
350 iap->ia_valid |= ATTR_KILL_SUID; 350 iap->ia_valid |= ATTR_KILL_SUID;
351 if ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid) 351 if ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)
352 iap->ia_valid |= ATTR_KILL_SGID; 352 iap->ia_valid |= ATTR_KILL_SGID;
353 353
354 /* Change the attributes. */ 354 /* Change the attributes. */
355 355
356 iap->ia_valid |= ATTR_CTIME; 356 iap->ia_valid |= ATTR_CTIME;
357 357
358 err = nfserr_notsync; 358 err = nfserr_notsync;
359 if (!check_guard || guardtime == inode->i_ctime.tv_sec) { 359 if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
360 fh_lock(fhp); 360 fh_lock(fhp);
361 host_err = notify_change(dentry, iap); 361 host_err = notify_change(dentry, iap);
362 err = nfserrno(host_err); 362 err = nfserrno(host_err);
363 fh_unlock(fhp); 363 fh_unlock(fhp);
364 } 364 }
365 if (size_change) 365 if (size_change)
366 put_write_access(inode); 366 put_write_access(inode);
367 if (!err) 367 if (!err)
368 if (EX_ISSYNC(fhp->fh_export)) 368 if (EX_ISSYNC(fhp->fh_export))
369 write_inode_now(inode, 1); 369 write_inode_now(inode, 1);
370 out: 370 out:
371 return err; 371 return err;
372 372
373 out_nfserr: 373 out_nfserr:
374 err = nfserrno(host_err); 374 err = nfserrno(host_err);
375 goto out; 375 goto out;
376 } 376 }
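
The BOTH_TIME_SET block above boils down to a small decision function. Restated standalone (perm_check_failed stands in for inode_change_ok() returning non-zero):

    #include <time.h>

    #define MAX_TOUCH_TIME_ERROR (30*60)

    static int treat_as_set_to_now(time_t requested, time_t now,
                                   int perm_check_failed)
    {
            time_t delta = requested - now;

            if (delta < 0)
                    delta = -delta;
            /* Same-time requests within 30 minutes of "now" fall back to
             * "set to now" when the explicit-time check is refused. */
            return delta < MAX_TOUCH_TIME_ERROR && perm_check_failed;
    }
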
377 377
378 #if defined(CONFIG_NFSD_V2_ACL) || \ 378 #if defined(CONFIG_NFSD_V2_ACL) || \
379 defined(CONFIG_NFSD_V3_ACL) || \ 379 defined(CONFIG_NFSD_V3_ACL) || \
380 defined(CONFIG_NFSD_V4) 380 defined(CONFIG_NFSD_V4)
381 static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) 381 static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
382 { 382 {
383 ssize_t buflen; 383 ssize_t buflen;
384 384
385 buflen = vfs_getxattr(dentry, key, NULL, 0); 385 buflen = vfs_getxattr(dentry, key, NULL, 0);
386 if (buflen <= 0) 386 if (buflen <= 0)
387 return buflen; 387 return buflen;
388 388
389 *buf = kmalloc(buflen, GFP_KERNEL); 389 *buf = kmalloc(buflen, GFP_KERNEL);
390 if (!*buf) 390 if (!*buf)
391 return -ENOMEM; 391 return -ENOMEM;
392 392
393 return vfs_getxattr(dentry, key, *buf, buflen); 393 return vfs_getxattr(dentry, key, *buf, buflen);
394 } 394 }
395 #endif 395 #endif
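
nfsd_getxattr() uses the usual probe-then-fetch xattr idiom: query the size with a NULL buffer, allocate, then fetch. Its userspace analog with getxattr(2) looks like this; as in the kernel helper, the attribute can change between the two calls, so the second one may still fail or return a different length:

    #include <stdlib.h>
    #include <sys/xattr.h>

    ssize_t read_xattr(const char *path, const char *key, void **buf)
    {
            ssize_t len = getxattr(path, key, NULL, 0); /* size probe */

            if (len <= 0)
                    return len;
            *buf = malloc(len);
            if (!*buf)
                    return -1;
            return getxattr(path, key, *buf, len);      /* actual fetch */
    }
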
396 396
397 #if defined(CONFIG_NFSD_V4) 397 #if defined(CONFIG_NFSD_V4)
398 static int 398 static int
399 set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) 399 set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
400 { 400 {
401 int len; 401 int len;
402 size_t buflen; 402 size_t buflen;
403 char *buf = NULL; 403 char *buf = NULL;
404 int error = 0; 404 int error = 0;
405 405
406 buflen = posix_acl_xattr_size(pacl->a_count); 406 buflen = posix_acl_xattr_size(pacl->a_count);
407 buf = kmalloc(buflen, GFP_KERNEL); 407 buf = kmalloc(buflen, GFP_KERNEL);
408 error = -ENOMEM; 408 error = -ENOMEM;
409 if (buf == NULL) 409 if (buf == NULL)
410 goto out; 410 goto out;
411 411
412 len = posix_acl_to_xattr(pacl, buf, buflen); 412 len = posix_acl_to_xattr(pacl, buf, buflen);
413 if (len < 0) { 413 if (len < 0) {
414 error = len; 414 error = len;
415 goto out; 415 goto out;
416 } 416 }
417 417
418 error = vfs_setxattr(dentry, key, buf, len, 0); 418 error = vfs_setxattr(dentry, key, buf, len, 0);
419 out: 419 out:
420 kfree(buf); 420 kfree(buf);
421 return error; 421 return error;
422 } 422 }
423 423
424 __be32 424 __be32
425 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, 425 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
426 struct nfs4_acl *acl) 426 struct nfs4_acl *acl)
427 { 427 {
428 __be32 error; 428 __be32 error;
429 int host_error; 429 int host_error;
430 struct dentry *dentry; 430 struct dentry *dentry;
431 struct inode *inode; 431 struct inode *inode;
432 struct posix_acl *pacl = NULL, *dpacl = NULL; 432 struct posix_acl *pacl = NULL, *dpacl = NULL;
433 unsigned int flags = 0; 433 unsigned int flags = 0;
434 434
435 /* Get inode */ 435 /* Get inode */
436 error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR); 436 error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR);
437 if (error) 437 if (error)
438 goto out; 438 goto out;
439 439
440 dentry = fhp->fh_dentry; 440 dentry = fhp->fh_dentry;
441 inode = dentry->d_inode; 441 inode = dentry->d_inode;
442 if (S_ISDIR(inode->i_mode)) 442 if (S_ISDIR(inode->i_mode))
443 flags = NFS4_ACL_DIR; 443 flags = NFS4_ACL_DIR;
444 444
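/*
 * An NFSv4 ACL is split into up to two POSIX ACLs: the access ACL,
 * plus a default ACL when the object is a directory.
 */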
445 host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); 445 host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
446 if (host_error == -EINVAL) { 446 if (host_error == -EINVAL) {
447 error = nfserr_attrnotsupp; 447 error = nfserr_attrnotsupp;
448 goto out; 448 goto out;
449 } else if (host_error < 0) 449 } else if (host_error < 0)
450 goto out_nfserr; 450 goto out_nfserr;
451 451
452 host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); 452 host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
453 if (host_error < 0) 453 if (host_error < 0)
454 goto out_nfserr; 454 goto out_nfserr;
455 455
456 if (S_ISDIR(inode->i_mode)) { 456 if (S_ISDIR(inode->i_mode)) {
457 host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); 457 host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
458 if (host_error < 0) 458 if (host_error < 0)
459 goto out_nfserr; 459 goto out_nfserr;
460 } 460 }
461 461
462 error = nfs_ok; 462 error = nfs_ok;
463 463
464 out: 464 out:
465 posix_acl_release(pacl); 465 posix_acl_release(pacl);
466 posix_acl_release(dpacl); 466 posix_acl_release(dpacl);
467 return error; 467 return error;
468 out_nfserr: 468 out_nfserr:
469 if (host_error == -EOPNOTSUPP) 469 if (host_error == -EOPNOTSUPP)
470 error = nfserr_attrnotsupp; 470 error = nfserr_attrnotsupp;
471 else 471 else
472 error = nfserrno(host_error); 472 error = nfserrno(host_error);
473 goto out; 473 goto out;
474 } 474 }
475 475
476 static struct posix_acl * 476 static struct posix_acl *
477 _get_posix_acl(struct dentry *dentry, char *key) 477 _get_posix_acl(struct dentry *dentry, char *key)
478 { 478 {
479 void *buf = NULL; 479 void *buf = NULL;
480 struct posix_acl *pacl = NULL; 480 struct posix_acl *pacl = NULL;
481 int buflen; 481 int buflen;
482 482
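/*
 * A zero-length attribute is treated like a missing one: both are
 * reported to the caller as ERR_PTR(-ENODATA).
 */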
483 buflen = nfsd_getxattr(dentry, key, &buf); 483 buflen = nfsd_getxattr(dentry, key, &buf);
484 if (!buflen) 484 if (!buflen)
485 buflen = -ENODATA; 485 buflen = -ENODATA;
486 if (buflen <= 0) 486 if (buflen <= 0)
487 return ERR_PTR(buflen); 487 return ERR_PTR(buflen);
488 488
489 pacl = posix_acl_from_xattr(buf, buflen); 489 pacl = posix_acl_from_xattr(buf, buflen);
490 kfree(buf); 490 kfree(buf);
491 return pacl; 491 return pacl;
492 } 492 }
493 493
494 int 494 int
495 nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl) 495 nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl)
496 { 496 {
497 struct inode *inode = dentry->d_inode; 497 struct inode *inode = dentry->d_inode;
498 int error = 0; 498 int error = 0;
499 struct posix_acl *pacl = NULL, *dpacl = NULL; 499 struct posix_acl *pacl = NULL, *dpacl = NULL;
500 unsigned int flags = 0; 500 unsigned int flags = 0;
501 501
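/*
 * If no access ACL is stored, synthesize one from the mode bits so
 * the file still yields a sensible NFSv4 ACL.
 */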
502 pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS); 502 pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS);
503 if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA) 503 if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA)
504 pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); 504 pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
505 if (IS_ERR(pacl)) { 505 if (IS_ERR(pacl)) {
506 error = PTR_ERR(pacl); 506 error = PTR_ERR(pacl);
507 pacl = NULL; 507 pacl = NULL;
508 goto out; 508 goto out;
509 } 509 }
510 510
511 if (S_ISDIR(inode->i_mode)) { 511 if (S_ISDIR(inode->i_mode)) {
512 dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT); 512 dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT);
513 if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA) 513 if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA)
514 dpacl = NULL; 514 dpacl = NULL;
515 else if (IS_ERR(dpacl)) { 515 else if (IS_ERR(dpacl)) {
516 error = PTR_ERR(dpacl); 516 error = PTR_ERR(dpacl);
517 dpacl = NULL; 517 dpacl = NULL;
518 goto out; 518 goto out;
519 } 519 }
520 flags = NFS4_ACL_DIR; 520 flags = NFS4_ACL_DIR;
521 } 521 }
522 522
523 *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags); 523 *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags);
524 if (IS_ERR(*acl)) { 524 if (IS_ERR(*acl)) {
525 error = PTR_ERR(*acl); 525 error = PTR_ERR(*acl);
526 *acl = NULL; 526 *acl = NULL;
527 } 527 }
528 out: 528 out:
529 posix_acl_release(pacl); 529 posix_acl_release(pacl);
530 posix_acl_release(dpacl); 530 posix_acl_release(dpacl);
531 return error; 531 return error;
532 } 532 }
533 533
534 #endif /* defined(CONFIG_NFSD_V4) */ 534 #endif /* defined(CONFIG_NFSD_V4) */
535 535
536 #ifdef CONFIG_NFSD_V3 536 #ifdef CONFIG_NFSD_V3
537 /* 537 /*
538 * Check server access rights to a file system object 538 * Check server access rights to a file system object
539 */ 539 */
540 struct accessmap { 540 struct accessmap {
541 u32 access; 541 u32 access;
542 int how; 542 int how;
543 }; 543 };
544 static struct accessmap nfs3_regaccess[] = { 544 static struct accessmap nfs3_regaccess[] = {
545 { NFS3_ACCESS_READ, MAY_READ }, 545 { NFS3_ACCESS_READ, MAY_READ },
546 { NFS3_ACCESS_EXECUTE, MAY_EXEC }, 546 { NFS3_ACCESS_EXECUTE, MAY_EXEC },
547 { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_TRUNC }, 547 { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_TRUNC },
548 { NFS3_ACCESS_EXTEND, MAY_WRITE }, 548 { NFS3_ACCESS_EXTEND, MAY_WRITE },
549 549
550 { 0, 0 } 550 { 0, 0 }
551 }; 551 };
552 552
553 static struct accessmap nfs3_diraccess[] = { 553 static struct accessmap nfs3_diraccess[] = {
554 { NFS3_ACCESS_READ, MAY_READ }, 554 { NFS3_ACCESS_READ, MAY_READ },
555 { NFS3_ACCESS_LOOKUP, MAY_EXEC }, 555 { NFS3_ACCESS_LOOKUP, MAY_EXEC },
556 { NFS3_ACCESS_MODIFY, MAY_EXEC|MAY_WRITE|MAY_TRUNC }, 556 { NFS3_ACCESS_MODIFY, MAY_EXEC|MAY_WRITE|MAY_TRUNC },
557 { NFS3_ACCESS_EXTEND, MAY_EXEC|MAY_WRITE }, 557 { NFS3_ACCESS_EXTEND, MAY_EXEC|MAY_WRITE },
558 { NFS3_ACCESS_DELETE, MAY_REMOVE }, 558 { NFS3_ACCESS_DELETE, MAY_REMOVE },
559 559
560 { 0, 0 } 560 { 0, 0 }
561 }; 561 };
562 562
563 static struct accessmap nfs3_anyaccess[] = { 563 static struct accessmap nfs3_anyaccess[] = {
564 /* Some clients - Solaris 2.6 at least - make an access call 564 /* Some clients - Solaris 2.6 at least - make an access call
565 * to the server to check access for things like /dev/null 565 * to the server to check access for things like /dev/null
566 * (which, really, the server doesn't care about). So 566 * (which, really, the server doesn't care about). So
567 * we provide simple access checking for them, looking 567 * we provide simple access checking for them, looking
568 * mainly at mode bits, and we make sure to ignore read-only 568 * mainly at mode bits, and we make sure to ignore read-only
569 * filesystem checks. 569 * filesystem checks.
570 */ 570 */
571 { NFS3_ACCESS_READ, MAY_READ }, 571 { NFS3_ACCESS_READ, MAY_READ },
572 { NFS3_ACCESS_EXECUTE, MAY_EXEC }, 572 { NFS3_ACCESS_EXECUTE, MAY_EXEC },
573 { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_LOCAL_ACCESS }, 573 { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_LOCAL_ACCESS },
574 { NFS3_ACCESS_EXTEND, MAY_WRITE|MAY_LOCAL_ACCESS }, 574 { NFS3_ACCESS_EXTEND, MAY_WRITE|MAY_LOCAL_ACCESS },
575 575
576 { 0, 0 } 576 { 0, 0 }
577 }; 577 };
578 578
579 __be32 579 __be32
580 nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported) 580 nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported)
581 { 581 {
582 struct accessmap *map; 582 struct accessmap *map;
583 struct svc_export *export; 583 struct svc_export *export;
584 struct dentry *dentry; 584 struct dentry *dentry;
585 u32 query, result = 0, sresult = 0; 585 u32 query, result = 0, sresult = 0;
586 __be32 error; 586 __be32 error;
587 587
588 error = fh_verify(rqstp, fhp, 0, MAY_NOP); 588 error = fh_verify(rqstp, fhp, 0, MAY_NOP);
589 if (error) 589 if (error)
590 goto out; 590 goto out;
591 591
592 export = fhp->fh_export; 592 export = fhp->fh_export;
593 dentry = fhp->fh_dentry; 593 dentry = fhp->fh_dentry;
594 594
595 if (S_ISREG(dentry->d_inode->i_mode)) 595 if (S_ISREG(dentry->d_inode->i_mode))
596 map = nfs3_regaccess; 596 map = nfs3_regaccess;
597 else if (S_ISDIR(dentry->d_inode->i_mode)) 597 else if (S_ISDIR(dentry->d_inode->i_mode))
598 map = nfs3_diraccess; 598 map = nfs3_diraccess;
599 else 599 else
600 map = nfs3_anyaccess; 600 map = nfs3_anyaccess;
601 601
602 602
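/*
 * For each requested access bit, mark it as supported and probe the
 * real permission. "Not allowed" answers just leave the result bit
 * clear; any other error aborts the whole call.
 */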
603 query = *access; 603 query = *access;
604 for (; map->access; map++) { 604 for (; map->access; map++) {
605 if (map->access & query) { 605 if (map->access & query) {
606 __be32 err2; 606 __be32 err2;
607 607
608 sresult |= map->access; 608 sresult |= map->access;
609 609
610 err2 = nfsd_permission(export, dentry, map->how); 610 err2 = nfsd_permission(export, dentry, map->how);
611 switch (err2) { 611 switch (err2) {
612 case nfs_ok: 612 case nfs_ok:
613 result |= map->access; 613 result |= map->access;
614 break; 614 break;
615 615
616 /* the following error codes just mean the access was not allowed, 616 /* the following error codes just mean the access was not allowed,
617 * rather than that an error occurred */ 617 * rather than that an error occurred */
618 case nfserr_rofs: 618 case nfserr_rofs:
619 case nfserr_acces: 619 case nfserr_acces:
620 case nfserr_perm: 620 case nfserr_perm:
621 /* simply don't "or" in the access bit. */ 621 /* simply don't "or" in the access bit. */
622 break; 622 break;
623 default: 623 default:
624 error = err2; 624 error = err2;
625 goto out; 625 goto out;
626 } 626 }
627 } 627 }
628 } 628 }
629 *access = result; 629 *access = result;
630 if (supported) 630 if (supported)
631 *supported = sresult; 631 *supported = sresult;
632 632
633 out: 633 out:
634 return error; 634 return error;
635 } 635 }
636 #endif /* CONFIG_NFSD_V3 */ 636 #endif /* CONFIG_NFSD_V3 */
637 637
638 638
639 639
640 /* 640 /*
641 * Open an existing file or directory. 641 * Open an existing file or directory.
642 * The access argument indicates the type of open (read/write/lock) 642 * The access argument indicates the type of open (read/write/lock)
643 * N.B. After this call fhp needs an fh_put 643 * N.B. After this call fhp needs an fh_put
644 */ 644 */
645 __be32 645 __be32
646 nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, 646 nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
647 int access, struct file **filp) 647 int access, struct file **filp)
648 { 648 {
649 struct dentry *dentry; 649 struct dentry *dentry;
650 struct inode *inode; 650 struct inode *inode;
651 int flags = O_RDONLY|O_LARGEFILE; 651 int flags = O_RDONLY|O_LARGEFILE;
652 __be32 err; 652 __be32 err;
653 int host_err; 653 int host_err;
654 654
655 /* 655 /*
656 * If we get here, then the client has already done an "open", 656 * If we get here, then the client has already done an "open",
657 * and (hopefully) checked permission - so allow OWNER_OVERRIDE 657 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
658 * in case a chmod has now revoked permission. 658 * in case a chmod has now revoked permission.
659 */ 659 */
660 err = fh_verify(rqstp, fhp, type, access | MAY_OWNER_OVERRIDE); 660 err = fh_verify(rqstp, fhp, type, access | MAY_OWNER_OVERRIDE);
661 if (err) 661 if (err)
662 goto out; 662 goto out;
663 663
664 dentry = fhp->fh_dentry; 664 dentry = fhp->fh_dentry;
665 inode = dentry->d_inode; 665 inode = dentry->d_inode;
666 666
667 /* Disallow write access to files with the append-only bit set 667 /* Disallow write access to files with the append-only bit set
668 * or any access when mandatory locking is enabled 668 * or any access when mandatory locking is enabled
669 */ 669 */
670 err = nfserr_perm; 670 err = nfserr_perm;
671 if (IS_APPEND(inode) && (access & MAY_WRITE)) 671 if (IS_APPEND(inode) && (access & MAY_WRITE))
672 goto out; 672 goto out;
673 if (IS_ISMNDLK(inode)) 673 if (IS_ISMNDLK(inode))
674 goto out; 674 goto out;
675 675
676 if (!inode->i_fop) 676 if (!inode->i_fop)
677 goto out; 677 goto out;
678 678
679 /* 679 /*
680 * Check to see if there are any leases on this file. 680 * Check to see if there are any leases on this file.
681 * This may block while leases are broken. 681 * This may block while leases are broken.
682 */ 682 */
683 host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0)); 683 host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0));
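/*
 * NFS has no way to tell the client "try again later", so a blocked
 * lease break is turned into -ETIMEDOUT, which nfserrno() maps to
 * nfserr_dropit: the request is dropped and the client retransmits.
 */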
684 if (host_err == -EWOULDBLOCK) 684 if (host_err == -EWOULDBLOCK)
685 host_err = -ETIMEDOUT; 685 host_err = -ETIMEDOUT;
686 if (host_err) /* NOMEM or WOULDBLOCK */ 686 if (host_err) /* NOMEM or WOULDBLOCK */
687 goto out_nfserr; 687 goto out_nfserr;
688 688
689 if (access & MAY_WRITE) { 689 if (access & MAY_WRITE) {
690 if (access & MAY_READ) 690 if (access & MAY_READ)
691 flags = O_RDWR|O_LARGEFILE; 691 flags = O_RDWR|O_LARGEFILE;
692 else 692 else
693 flags = O_WRONLY|O_LARGEFILE; 693 flags = O_WRONLY|O_LARGEFILE;
694 694
695 DQUOT_INIT(inode); 695 DQUOT_INIT(inode);
696 } 696 }
697 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_mnt), flags); 697 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_mnt), flags);
698 if (IS_ERR(*filp)) 698 if (IS_ERR(*filp))
699 host_err = PTR_ERR(*filp); 699 host_err = PTR_ERR(*filp);
700 out_nfserr: 700 out_nfserr:
701 err = nfserrno(host_err); 701 err = nfserrno(host_err);
702 out: 702 out:
703 return err; 703 return err;
704 } 704 }
705 705
706 /* 706 /*
707 * Close a file. 707 * Close a file.
708 */ 708 */
709 void 709 void
710 nfsd_close(struct file *filp) 710 nfsd_close(struct file *filp)
711 { 711 {
712 fput(filp); 712 fput(filp);
713 } 713 }
714 714
715 /* 715 /*
716 * Sync a file 716 * Sync a file
717 * As this calls fsync (not fdatasync) there is no need for a write_inode 717 * As this calls fsync (not fdatasync) there is no need for a write_inode
718 * after it. 718 * after it.
719 */ 719 */
720 static inline int nfsd_dosync(struct file *filp, struct dentry *dp, 720 static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
721 const struct file_operations *fop) 721 const struct file_operations *fop)
722 { 722 {
723 struct inode *inode = dp->d_inode; 723 struct inode *inode = dp->d_inode;
724 int (*fsync) (struct file *, struct dentry *, int); 724 int (*fsync) (struct file *, struct dentry *, int);
725 int err; 725 int err;
726 726
727 err = filemap_fdatawrite(inode->i_mapping); 727 err = filemap_fdatawrite(inode->i_mapping);
728 if (err == 0 && fop && (fsync = fop->fsync)) 728 if (err == 0 && fop && (fsync = fop->fsync))
729 err = fsync(filp, dp, 0); 729 err = fsync(filp, dp, 0);
730 if (err == 0) 730 if (err == 0)
731 err = filemap_fdatawait(inode->i_mapping); 731 err = filemap_fdatawait(inode->i_mapping);
732 732
733 return err; 733 return err;
734 } 734 }
735 735
736 736
737 static int 737 static int
738 nfsd_sync(struct file *filp) 738 nfsd_sync(struct file *filp)
739 { 739 {
740 int err; 740 int err;
741 struct inode *inode = filp->f_path.dentry->d_inode; 741 struct inode *inode = filp->f_path.dentry->d_inode;
742 dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name); 742 dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name);
743 mutex_lock(&inode->i_mutex); 743 mutex_lock(&inode->i_mutex);
744 err = nfsd_dosync(filp, filp->f_path.dentry, filp->f_op); 744 err = nfsd_dosync(filp, filp->f_path.dentry, filp->f_op);
745 mutex_unlock(&inode->i_mutex); 745 mutex_unlock(&inode->i_mutex);
746 746
747 return err; 747 return err;
748 } 748 }
749 749
750 int 750 int
751 nfsd_sync_dir(struct dentry *dp) 751 nfsd_sync_dir(struct dentry *dp)
752 { 752 {
753 return nfsd_dosync(NULL, dp, dp->d_inode->i_fop); 753 return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
754 } 754 }
755 755
756 /* 756 /*
757 * Obtain the readahead parameters for the file 757 * Obtain the readahead parameters for the file
758 * specified by (dev, ino). 758 * specified by (dev, ino).
759 */ 759 */
760 760
761 static inline struct raparms * 761 static inline struct raparms *
762 nfsd_get_raparms(dev_t dev, ino_t ino) 762 nfsd_get_raparms(dev_t dev, ino_t ino)
763 { 763 {
764 struct raparms *ra, **rap, **frap = NULL; 764 struct raparms *ra, **rap, **frap = NULL;
765 int depth = 0; 765 int depth = 0;
766 unsigned int hash; 766 unsigned int hash;
767 struct raparm_hbucket *rab; 767 struct raparm_hbucket *rab;
768 768
769 hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; 769 hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK;
770 rab = &raparm_hash[hash]; 770 rab = &raparm_hash[hash];
771 771
772 spin_lock(&rab->pb_lock); 772 spin_lock(&rab->pb_lock);
773 for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { 773 for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) {
774 if (ra->p_ino == ino && ra->p_dev == dev) 774 if (ra->p_ino == ino && ra->p_dev == dev)
775 goto found; 775 goto found;
776 depth++; 776 depth++;
777 if (ra->p_count == 0) 777 if (ra->p_count == 0)
778 frap = rap; 778 frap = rap;
779 } 779 }
780 depth = nfsdstats.ra_size*11/10; 780 depth = nfsdstats.ra_size*11/10;
781 if (!frap) { 781 if (!frap) {
782 spin_unlock(&rab->pb_lock); 782 spin_unlock(&rab->pb_lock);
783 return NULL; 783 return NULL;
784 } 784 }
785 rap = frap; 785 rap = frap;
786 ra = *frap; 786 ra = *frap;
787 ra->p_dev = dev; 787 ra->p_dev = dev;
788 ra->p_ino = ino; 788 ra->p_ino = ino;
789 ra->p_set = 0; 789 ra->p_set = 0;
790 ra->p_hindex = hash; 790 ra->p_hindex = hash;
791 found: 791 found:
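/* Move the entry to the front of its hash chain, so frequently
 * used files stay cheap to look up. */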
792 if (rap != &rab->pb_head) { 792 if (rap != &rab->pb_head) {
793 *rap = ra->p_next; 793 *rap = ra->p_next;
794 ra->p_next = rab->pb_head; 794 ra->p_next = rab->pb_head;
795 rab->pb_head = ra; 795 rab->pb_head = ra;
796 } 796 }
797 ra->p_count++; 797 ra->p_count++;
798 nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; 798 nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++;
799 spin_unlock(&rab->pb_lock); 799 spin_unlock(&rab->pb_lock);
800 return ra; 800 return ra;
801 } 801 }
802 802
803 /* 803 /*
804 * Grab and keep cached pages associated with a file in the svc_rqst 804 * Grab and keep cached pages associated with a file in the svc_rqst
805 * so that they can be passed to the network sendmsg/sendpage routines 805 * so that they can be passed to the network sendmsg/sendpage routines
806 * directly. They will be released after the sending has completed. 806 * directly. They will be released after the sending has completed.
807 */ 807 */
808 static int 808 static int
809 nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 809 nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
810 struct splice_desc *sd) 810 struct splice_desc *sd)
811 { 811 {
812 struct svc_rqst *rqstp = sd->u.data; 812 struct svc_rqst *rqstp = sd->u.data;
813 struct page **pp = rqstp->rq_respages + rqstp->rq_resused; 813 struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
814 struct page *page = buf->page; 814 struct page *page = buf->page;
815 size_t size; 815 size_t size;
816 int ret; 816 int ret;
817 817
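/*
 * Make sure the pipe buffer really holds uptodate data before we
 * take a reference to its page; a non-zero return means the contents
 * could not be confirmed, and the splice is aborted.
 */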
818 ret = buf->ops->pin(pipe, buf); 818 ret = buf->ops->confirm(pipe, buf);
819 if (unlikely(ret)) 819 if (unlikely(ret))
820 return ret; 820 return ret;
821 821
822 size = sd->len; 822 size = sd->len;
823 823
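/*
 * Three cases: this is the first page of the reply (install the page
 * and start page_len), a page we have not stashed yet (append it),
 * or more data arriving in the page we appended last time (just grow
 * page_len).
 */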
824 if (rqstp->rq_res.page_len == 0) { 824 if (rqstp->rq_res.page_len == 0) {
825 get_page(page); 825 get_page(page);
826 put_page(*pp); 826 put_page(*pp);
827 *pp = page; 827 *pp = page;
828 rqstp->rq_resused++; 828 rqstp->rq_resused++;
829 rqstp->rq_res.page_base = buf->offset; 829 rqstp->rq_res.page_base = buf->offset;
830 rqstp->rq_res.page_len = size; 830 rqstp->rq_res.page_len = size;
831 } else if (page != pp[-1]) { 831 } else if (page != pp[-1]) {
832 get_page(page); 832 get_page(page);
833 if (*pp) 833 if (*pp)
834 put_page(*pp); 834 put_page(*pp);
835 *pp = page; 835 *pp = page;
836 rqstp->rq_resused++; 836 rqstp->rq_resused++;
837 rqstp->rq_res.page_len += size; 837 rqstp->rq_res.page_len += size;
838 } else 838 } else
839 rqstp->rq_res.page_len += size; 839 rqstp->rq_res.page_len += size;
840 840
841 return size; 841 return size;
842 } 842 }
843 843
844 static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, 844 static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
845 struct splice_desc *sd) 845 struct splice_desc *sd)
846 { 846 {
847 return __splice_from_pipe(pipe, sd, nfsd_splice_actor); 847 return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
848 } 848 }
849 849
850 static __be32 850 static __be32
851 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 851 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
852 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 852 loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
853 { 853 {
854 struct inode *inode; 854 struct inode *inode;
855 struct raparms *ra; 855 struct raparms *ra;
856 mm_segment_t oldfs; 856 mm_segment_t oldfs;
857 __be32 err; 857 __be32 err;
858 int host_err; 858 int host_err;
859 859
860 err = nfserr_perm; 860 err = nfserr_perm;
861 inode = file->f_path.dentry->d_inode; 861 inode = file->f_path.dentry->d_inode;
862 #ifdef MSNFS 862 #ifdef MSNFS
863 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 863 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
864 (!lock_may_read(inode, offset, *count))) 864 (!lock_may_read(inode, offset, *count)))
865 goto out; 865 goto out;
866 #endif 866 #endif
867 867
868 /* Get readahead parameters */ 868 /* Get readahead parameters */
869 ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); 869 ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
870 870
871 if (ra && ra->p_set) 871 if (ra && ra->p_set)
872 file->f_ra = ra->p_ra; 872 file->f_ra = ra->p_ra;
873 873
874 if (file->f_op->splice_read && rqstp->rq_splice_ok) { 874 if (file->f_op->splice_read && rqstp->rq_splice_ok) {
875 struct splice_desc sd = { 875 struct splice_desc sd = {
876 .len = 0, 876 .len = 0,
877 .total_len = *count, 877 .total_len = *count,
878 .pos = offset, 878 .pos = offset,
879 .u.data = rqstp, 879 .u.data = rqstp,
880 }; 880 };
881 881
882 host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); 882 host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
883 } else { 883 } else {
884 oldfs = get_fs(); 884 oldfs = get_fs();
885 set_fs(KERNEL_DS); 885 set_fs(KERNEL_DS);
886 host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); 886 host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
887 set_fs(oldfs); 887 set_fs(oldfs);
888 } 888 }
889 889
890 /* Write back readahead params */ 890 /* Write back readahead params */
891 if (ra) { 891 if (ra) {
892 struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; 892 struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
893 spin_lock(&rab->pb_lock); 893 spin_lock(&rab->pb_lock);
894 ra->p_ra = file->f_ra; 894 ra->p_ra = file->f_ra;
895 ra->p_set = 1; 895 ra->p_set = 1;
896 ra->p_count--; 896 ra->p_count--;
897 spin_unlock(&rab->pb_lock); 897 spin_unlock(&rab->pb_lock);
898 } 898 }
899 899
900 if (host_err >= 0) { 900 if (host_err >= 0) {
901 nfsdstats.io_read += host_err; 901 nfsdstats.io_read += host_err;
902 *count = host_err; 902 *count = host_err;
903 err = 0; 903 err = 0;
904 fsnotify_access(file->f_path.dentry); 904 fsnotify_access(file->f_path.dentry);
905 } else 905 } else
906 err = nfserrno(host_err); 906 err = nfserrno(host_err);
907 out: 907 out:
908 return err; 908 return err;
909 } 909 }
910 910
911 static void kill_suid(struct dentry *dentry) 911 static void kill_suid(struct dentry *dentry)
912 { 912 {
913 struct iattr ia; 913 struct iattr ia;
914 ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID; 914 ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID;
915 915
916 mutex_lock(&dentry->d_inode->i_mutex); 916 mutex_lock(&dentry->d_inode->i_mutex);
917 notify_change(dentry, &ia); 917 notify_change(dentry, &ia);
918 mutex_unlock(&dentry->d_inode->i_mutex); 918 mutex_unlock(&dentry->d_inode->i_mutex);
919 } 919 }
920 920
921 static __be32 921 static __be32
922 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 922 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
923 loff_t offset, struct kvec *vec, int vlen, 923 loff_t offset, struct kvec *vec, int vlen,
924 unsigned long cnt, int *stablep) 924 unsigned long cnt, int *stablep)
925 { 925 {
926 struct svc_export *exp; 926 struct svc_export *exp;
927 struct dentry *dentry; 927 struct dentry *dentry;
928 struct inode *inode; 928 struct inode *inode;
929 mm_segment_t oldfs; 929 mm_segment_t oldfs;
930 __be32 err = 0; 930 __be32 err = 0;
931 int host_err; 931 int host_err;
932 int stable = *stablep; 932 int stable = *stablep;
933 933
934 #ifdef MSNFS 934 #ifdef MSNFS
935 err = nfserr_perm; 935 err = nfserr_perm;
936 936
937 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 937 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
938 (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) 938 (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt)))
939 goto out; 939 goto out;
940 #endif 940 #endif
941 941
942 dentry = file->f_path.dentry; 942 dentry = file->f_path.dentry;
943 inode = dentry->d_inode; 943 inode = dentry->d_inode;
944 exp = fhp->fh_export; 944 exp = fhp->fh_export;
945 945
946 /* 946 /*
947 * Request sync writes if 947 * Request sync writes if
948 * - the sync export option has been set, or 948 * - the sync export option has been set, or
949 * - the client requested O_SYNC behavior (NFSv3 feature), or 949 * - the client requested O_SYNC behavior (NFSv3 feature), or
950 * - the file system doesn't support fsync(). 950 * - the file system doesn't support fsync().
951 * When gathered writes have been configured for this volume, 951 * When gathered writes have been configured for this volume,
952 * flushing the data to disk is handled separately below. 952 * flushing the data to disk is handled separately below.
953 */ 953 */
954 954
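/*
 * Without an ->fsync method a later COMMIT could not flush anything,
 * so the only safe answer is to make this write FILE_SYNC stable
 * right away.
 */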
955 if (!file->f_op->fsync) { /* COMMIT3 cannot work */ 955 if (!file->f_op->fsync) { /* COMMIT3 cannot work */
956 stable = 2; 956 stable = 2;
957 *stablep = 2; /* FILE_SYNC */ 957 *stablep = 2; /* FILE_SYNC */
958 } 958 }
959 959
960 if (!EX_ISSYNC(exp)) 960 if (!EX_ISSYNC(exp))
961 stable = 0; 961 stable = 0;
962 if (stable && !EX_WGATHER(exp)) 962 if (stable && !EX_WGATHER(exp))
963 file->f_flags |= O_SYNC; 963 file->f_flags |= O_SYNC;
964 964
965 /* Write the data. */ 965 /* Write the data. */
966 oldfs = get_fs(); set_fs(KERNEL_DS); 966 oldfs = get_fs(); set_fs(KERNEL_DS);
967 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); 967 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
968 set_fs(oldfs); 968 set_fs(oldfs);
969 if (host_err >= 0) { 969 if (host_err >= 0) {
970 nfsdstats.io_write += cnt; 970 nfsdstats.io_write += cnt;
971 fsnotify_modify(file->f_path.dentry); 971 fsnotify_modify(file->f_path.dentry);
972 } 972 }
973 973
974 /* clear setuid/setgid flag after write */ 974 /* clear setuid/setgid flag after write */
975 if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) 975 if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
976 kill_suid(dentry); 976 kill_suid(dentry);
977 977
978 if (host_err >= 0 && stable) { 978 if (host_err >= 0 && stable) {
979 static ino_t last_ino; 979 static ino_t last_ino;
980 static dev_t last_dev; 980 static dev_t last_dev;
981 981
982 /* 982 /*
983 * Gathered writes: If another process is currently 983 * Gathered writes: If another process is currently
984 * writing to the file, there's a high chance 984 * writing to the file, there's a high chance
985 * this is another nfsd (triggered by a bulk write 985 * this is another nfsd (triggered by a bulk write
986 * from a client's biod). Rather than syncing the 986 * from a client's biod). Rather than syncing the
987 * file with each write request, we sleep for 10 msec. 987 * file with each write request, we sleep for 10 msec.
988 * 988 *
989 * I don't know if this roughly approximates 989 * I don't know if this roughly approximates
990 * C. Juszak's idea of gathered writes, but it's a 990 * C. Juszak's idea of gathered writes, but it's a
991 * nice and simple solution (IMHO), and it seems to 991 * nice and simple solution (IMHO), and it seems to
992 * work:-) 992 * work:-)
993 */ 993 */
994 if (EX_WGATHER(exp)) { 994 if (EX_WGATHER(exp)) {
995 if (atomic_read(&inode->i_writecount) > 1 995 if (atomic_read(&inode->i_writecount) > 1
996 || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) { 996 || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
997 dprintk("nfsd: write defer %d\n", current->pid); 997 dprintk("nfsd: write defer %d\n", current->pid);
998 msleep(10); 998 msleep(10);
999 dprintk("nfsd: write resume %d\n", current->pid); 999 dprintk("nfsd: write resume %d\n", current->pid);
1000 } 1000 }
1001 1001
1002 if (inode->i_state & I_DIRTY) { 1002 if (inode->i_state & I_DIRTY) {
1003 dprintk("nfsd: write sync %d\n", current->pid); 1003 dprintk("nfsd: write sync %d\n", current->pid);
1004 host_err = nfsd_sync(file); 1004 host_err = nfsd_sync(file);
1005 } 1005 }
1006 #if 0 1006 #if 0
1007 wake_up(&inode->i_wait); 1007 wake_up(&inode->i_wait);
1008 #endif 1008 #endif
1009 } 1009 }
1010 last_ino = inode->i_ino; 1010 last_ino = inode->i_ino;
1011 last_dev = inode->i_sb->s_dev; 1011 last_dev = inode->i_sb->s_dev;
1012 } 1012 }
1013 1013
1014 dprintk("nfsd: write complete host_err=%d\n", host_err); 1014 dprintk("nfsd: write complete host_err=%d\n", host_err);
1015 if (host_err >= 0) 1015 if (host_err >= 0)
1016 err = 0; 1016 err = 0;
1017 else 1017 else
1018 err = nfserrno(host_err); 1018 err = nfserrno(host_err);
1019 out: 1019 out:
1020 return err; 1020 return err;
1021 } 1021 }
1022 1022
1023 /* 1023 /*
1024 * Read data from a file. count must contain the requested read count 1024 * Read data from a file. count must contain the requested read count
1025 * on entry. On return, *count contains the number of bytes actually read. 1025 * on entry. On return, *count contains the number of bytes actually read.
1026 * N.B. After this call fhp needs an fh_put 1026 * N.B. After this call fhp needs an fh_put
1027 */ 1027 */
1028 __be32 1028 __be32
1029 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1029 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1030 loff_t offset, struct kvec *vec, int vlen, 1030 loff_t offset, struct kvec *vec, int vlen,
1031 unsigned long *count) 1031 unsigned long *count)
1032 { 1032 {
1033 __be32 err; 1033 __be32 err;
1034 1034
1035 if (file) { 1035 if (file) {
1036 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, 1036 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
1037 MAY_READ|MAY_OWNER_OVERRIDE); 1037 MAY_READ|MAY_OWNER_OVERRIDE);
1038 if (err) 1038 if (err)
1039 goto out; 1039 goto out;
1040 err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); 1040 err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
1041 } else { 1041 } else {
1042 err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file); 1042 err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file);
1043 if (err) 1043 if (err)
1044 goto out; 1044 goto out;
1045 err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); 1045 err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
1046 nfsd_close(file); 1046 nfsd_close(file);
1047 } 1047 }
1048 out: 1048 out:
1049 return err; 1049 return err;
1050 } 1050 }
1051 1051
1052 /* 1052 /*
1053 * Write data to a file. 1053 * Write data to a file.
1054 * The stable flag requests synchronous writes. 1054 * The stable flag requests synchronous writes.
1055 * N.B. After this call fhp needs an fh_put 1055 * N.B. After this call fhp needs an fh_put
1056 */ 1056 */
1057 __be32 1057 __be32
1058 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1058 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1059 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, 1059 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt,
1060 int *stablep) 1060 int *stablep)
1061 { 1061 {
1062 __be32 err = 0; 1062 __be32 err = 0;
1063 1063
1064 if (file) { 1064 if (file) {
1065 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, 1065 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
1066 MAY_WRITE|MAY_OWNER_OVERRIDE); 1066 MAY_WRITE|MAY_OWNER_OVERRIDE);
1067 if (err) 1067 if (err)
1068 goto out; 1068 goto out;
1069 err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, 1069 err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
1070 stablep); 1070 stablep);
1071 } else { 1071 } else {
1072 err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file); 1072 err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file);
1073 if (err) 1073 if (err)
1074 goto out; 1074 goto out;
1075 1075
1076 if (cnt) 1076 if (cnt)
1077 err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, 1077 err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen,
1078 cnt, stablep); 1078 cnt, stablep);
1079 nfsd_close(file); 1079 nfsd_close(file);
1080 } 1080 }
1081 out: 1081 out:
1082 return err; 1082 return err;
1083 } 1083 }
1084 1084
1085 #ifdef CONFIG_NFSD_V3 1085 #ifdef CONFIG_NFSD_V3
1086 /* 1086 /*
1087 * Commit all pending writes to stable storage. 1087 * Commit all pending writes to stable storage.
1088 * Strictly speaking, we could sync just the indicated file region here, 1088 * Strictly speaking, we could sync just the indicated file region here,
1089 * but there's currently no way we can ask the VFS to do so. 1089 * but there's currently no way we can ask the VFS to do so.
1090 * 1090 *
1091 * Unfortunately we cannot lock the file to make sure we return full WCC 1091 * Unfortunately we cannot lock the file to make sure we return full WCC
1092 * data to the client, as locking happens lower down in the filesystem. 1092 * data to the client, as locking happens lower down in the filesystem.
1093 */ 1093 */
1094 __be32 1094 __be32
1095 nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, 1095 nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1096 loff_t offset, unsigned long count) 1096 loff_t offset, unsigned long count)
1097 { 1097 {
1098 struct file *file; 1098 struct file *file;
1099 __be32 err; 1099 __be32 err;
1100 1100
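/* Reject offset/count pairs that would wrap a 64-bit file offset. */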
1101 if ((u64)count > ~(u64)offset) 1101 if ((u64)count > ~(u64)offset)
1102 return nfserr_inval; 1102 return nfserr_inval;
1103 1103
1104 if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0) 1104 if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0)
1105 return err; 1105 return err;
1106 if (EX_ISSYNC(fhp->fh_export)) { 1106 if (EX_ISSYNC(fhp->fh_export)) {
1107 if (file->f_op && file->f_op->fsync) { 1107 if (file->f_op && file->f_op->fsync) {
1108 err = nfserrno(nfsd_sync(file)); 1108 err = nfserrno(nfsd_sync(file));
1109 } else { 1109 } else {
1110 err = nfserr_notsupp; 1110 err = nfserr_notsupp;
1111 } 1111 }
1112 } 1112 }
1113 1113
1114 nfsd_close(file); 1114 nfsd_close(file);
1115 return err; 1115 return err;
1116 } 1116 }
1117 #endif /* CONFIG_NFSD_V3 */ 1117 #endif /* CONFIG_NFSD_V3 */
1118 1118
1119 /* 1119 /*
1120 * Create a file (regular, directory, device, fifo); UNIX sockets 1120 * Create a file (regular, directory, device, fifo); UNIX sockets
1121 * not yet implemented. 1121 * not yet implemented.
1122 * If the response fh has been verified, the parent directory should 1122 * If the response fh has been verified, the parent directory should
1123 * already be locked. Note that the parent directory is left locked. 1123 * already be locked. Note that the parent directory is left locked.
1124 * 1124 *
1125 * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp 1125 * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
1126 */ 1126 */
1127 __be32 1127 __be32
1128 nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, 1128 nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1129 char *fname, int flen, struct iattr *iap, 1129 char *fname, int flen, struct iattr *iap,
1130 int type, dev_t rdev, struct svc_fh *resfhp) 1130 int type, dev_t rdev, struct svc_fh *resfhp)
1131 { 1131 {
1132 struct dentry *dentry, *dchild = NULL; 1132 struct dentry *dentry, *dchild = NULL;
1133 struct inode *dirp; 1133 struct inode *dirp;
1134 __be32 err; 1134 __be32 err;
1135 int host_err; 1135 int host_err;
1136 1136
1137 err = nfserr_perm; 1137 err = nfserr_perm;
1138 if (!flen) 1138 if (!flen)
1139 goto out; 1139 goto out;
1140 err = nfserr_exist; 1140 err = nfserr_exist;
1141 if (isdotent(fname, flen)) 1141 if (isdotent(fname, flen))
1142 goto out; 1142 goto out;
1143 1143
1144 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); 1144 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
1145 if (err) 1145 if (err)
1146 goto out; 1146 goto out;
1147 1147
1148 dentry = fhp->fh_dentry; 1148 dentry = fhp->fh_dentry;
1149 dirp = dentry->d_inode; 1149 dirp = dentry->d_inode;
1150 1150
1151 err = nfserr_notdir; 1151 err = nfserr_notdir;
1152 if (!dirp->i_op || !dirp->i_op->lookup) 1152 if (!dirp->i_op || !dirp->i_op->lookup)
1153 goto out; 1153 goto out;
1154 /* 1154 /*
1155 * Check whether the response file handle has been verified yet. 1155 * Check whether the response file handle has been verified yet.
1156 * If it has, the parent directory should already be locked. 1156 * If it has, the parent directory should already be locked.
1157 */ 1157 */
1158 if (!resfhp->fh_dentry) { 1158 if (!resfhp->fh_dentry) {
1159 /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ 1159 /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
1160 fh_lock_nested(fhp, I_MUTEX_PARENT); 1160 fh_lock_nested(fhp, I_MUTEX_PARENT);
1161 dchild = lookup_one_len(fname, dentry, flen); 1161 dchild = lookup_one_len(fname, dentry, flen);
1162 host_err = PTR_ERR(dchild); 1162 host_err = PTR_ERR(dchild);
1163 if (IS_ERR(dchild)) 1163 if (IS_ERR(dchild))
1164 goto out_nfserr; 1164 goto out_nfserr;
1165 err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); 1165 err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
1166 if (err) 1166 if (err)
1167 goto out; 1167 goto out;
1168 } else { 1168 } else {
1169 /* called from nfsd_proc_create */ 1169 /* called from nfsd_proc_create */
1170 dchild = dget(resfhp->fh_dentry); 1170 dchild = dget(resfhp->fh_dentry);
1171 if (!fhp->fh_locked) { 1171 if (!fhp->fh_locked) {
1172 /* not actually possible */ 1172 /* not actually possible */
1173 printk(KERN_ERR 1173 printk(KERN_ERR
1174 "nfsd_create: parent %s/%s not locked!\n", 1174 "nfsd_create: parent %s/%s not locked!\n",
1175 dentry->d_parent->d_name.name, 1175 dentry->d_parent->d_name.name,
1176 dentry->d_name.name); 1176 dentry->d_name.name);
1177 err = nfserr_io; 1177 err = nfserr_io;
1178 goto out; 1178 goto out;
1179 } 1179 }
1180 } 1180 }
1181 /* 1181 /*
1182 * Make sure the child dentry is still negative ... 1182 * Make sure the child dentry is still negative ...
1183 */ 1183 */
1184 err = nfserr_exist; 1184 err = nfserr_exist;
1185 if (dchild->d_inode) { 1185 if (dchild->d_inode) {
1186 dprintk("nfsd_create: dentry %s/%s not negative!\n", 1186 dprintk("nfsd_create: dentry %s/%s not negative!\n",
1187 dentry->d_name.name, dchild->d_name.name); 1187 dentry->d_name.name, dchild->d_name.name);
1188 goto out; 1188 goto out;
1189 } 1189 }
1190 1190
1191 if (!(iap->ia_valid & ATTR_MODE)) 1191 if (!(iap->ia_valid & ATTR_MODE))
1192 iap->ia_mode = 0; 1192 iap->ia_mode = 0;
1193 iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type; 1193 iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;
1194 1194
1195 /* 1195 /*
1196 * Get the dir op function pointer. 1196 * Get the dir op function pointer.
1197 */ 1197 */
1198 err = 0; 1198 err = 0;
1199 switch (type) { 1199 switch (type) {
1200 case S_IFREG: 1200 case S_IFREG:
1201 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1201 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1202 break; 1202 break;
1203 case S_IFDIR: 1203 case S_IFDIR:
1204 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); 1204 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
1205 break; 1205 break;
1206 case S_IFCHR: 1206 case S_IFCHR:
1207 case S_IFBLK: 1207 case S_IFBLK:
1208 case S_IFIFO: 1208 case S_IFIFO:
1209 case S_IFSOCK: 1209 case S_IFSOCK:
1210 host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); 1210 host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
1211 break; 1211 break;
1212 default: 1212 default:
1213 printk("nfsd: bad file type %o in nfsd_create\n", type); 1213 printk("nfsd: bad file type %o in nfsd_create\n", type);
1214 host_err = -EINVAL; 1214 host_err = -EINVAL;
1215 } 1215 }
1216 if (host_err < 0) 1216 if (host_err < 0)
1217 goto out_nfserr; 1217 goto out_nfserr;
1218 1218
1219 if (EX_ISSYNC(fhp->fh_export)) { 1219 if (EX_ISSYNC(fhp->fh_export)) {
1220 err = nfserrno(nfsd_sync_dir(dentry)); 1220 err = nfserrno(nfsd_sync_dir(dentry));
1221 write_inode_now(dchild->d_inode, 1); 1221 write_inode_now(dchild->d_inode, 1);
1222 } 1222 }
1223 1223
1224 1224
1225 /* Set file attributes. Mode has already been set and 1225 /* Set file attributes. Mode has already been set and
1226 * setting uid/gid works only for root. Irix appears to 1226 * setting uid/gid works only for root. Irix appears to
1227 * send along the gid when it tries to implement setgid 1227 * send along the gid when it tries to implement setgid
1228 * directories via NFS. 1228 * directories via NFS.
1229 */ 1229 */
1230 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) { 1230 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
1231 __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); 1231 __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
1232 if (err2) 1232 if (err2)
1233 err = err2; 1233 err = err2;
1234 } 1234 }
1235 /* 1235 /*
1236 * Update the file handle to get the new inode info. 1236 * Update the file handle to get the new inode info.
1237 */ 1237 */
1238 if (!err) 1238 if (!err)
1239 err = fh_update(resfhp); 1239 err = fh_update(resfhp);
1240 out: 1240 out:
1241 if (dchild && !IS_ERR(dchild)) 1241 if (dchild && !IS_ERR(dchild))
1242 dput(dchild); 1242 dput(dchild);
1243 return err; 1243 return err;
1244 1244
1245 out_nfserr: 1245 out_nfserr:
1246 err = nfserrno(host_err); 1246 err = nfserrno(host_err);
1247 goto out; 1247 goto out;
1248 } 1248 }
1249 1249
1250 #ifdef CONFIG_NFSD_V3 1250 #ifdef CONFIG_NFSD_V3
1251 /* 1251 /*
1252 * NFSv3 version of nfsd_create 1252 * NFSv3 version of nfsd_create
1253 */ 1253 */
1254 __be32 1254 __be32
1255 nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, 1255 nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1256 char *fname, int flen, struct iattr *iap, 1256 char *fname, int flen, struct iattr *iap,
1257 struct svc_fh *resfhp, int createmode, u32 *verifier, 1257 struct svc_fh *resfhp, int createmode, u32 *verifier,
1258 int *truncp, int *created) 1258 int *truncp, int *created)
1259 { 1259 {
1260 struct dentry *dentry, *dchild = NULL; 1260 struct dentry *dentry, *dchild = NULL;
1261 struct inode *dirp; 1261 struct inode *dirp;
1262 __be32 err; 1262 __be32 err;
1263 int host_err; 1263 int host_err;
1264 __u32 v_mtime=0, v_atime=0; 1264 __u32 v_mtime=0, v_atime=0;
1265 1265
1266 err = nfserr_perm; 1266 err = nfserr_perm;
1267 if (!flen) 1267 if (!flen)
1268 goto out; 1268 goto out;
1269 err = nfserr_exist; 1269 err = nfserr_exist;
1270 if (isdotent(fname, flen)) 1270 if (isdotent(fname, flen))
1271 goto out; 1271 goto out;
1272 if (!(iap->ia_valid & ATTR_MODE)) 1272 if (!(iap->ia_valid & ATTR_MODE))
1273 iap->ia_mode = 0; 1273 iap->ia_mode = 0;
1274 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); 1274 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
1275 if (err) 1275 if (err)
1276 goto out; 1276 goto out;
1277 1277
1278 dentry = fhp->fh_dentry; 1278 dentry = fhp->fh_dentry;
1279 dirp = dentry->d_inode; 1279 dirp = dentry->d_inode;
1280 1280
1281 /* Get all the sanity checks out of the way before 1281 /* Get all the sanity checks out of the way before
1282 * we lock the parent. */ 1282 * we lock the parent. */
1283 err = nfserr_notdir; 1283 err = nfserr_notdir;
1284 if (!dirp->i_op || !dirp->i_op->lookup) 1284 if (!dirp->i_op || !dirp->i_op->lookup)
1285 goto out; 1285 goto out;
1286 fh_lock_nested(fhp, I_MUTEX_PARENT); 1286 fh_lock_nested(fhp, I_MUTEX_PARENT);
1287 1287
1288 /* 1288 /*
1289 * Compose the response file handle. 1289 * Compose the response file handle.
1290 */ 1290 */
1291 dchild = lookup_one_len(fname, dentry, flen); 1291 dchild = lookup_one_len(fname, dentry, flen);
1292 host_err = PTR_ERR(dchild); 1292 host_err = PTR_ERR(dchild);
1293 if (IS_ERR(dchild)) 1293 if (IS_ERR(dchild))
1294 goto out_nfserr; 1294 goto out_nfserr;
1295 1295
1296 err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); 1296 err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
1297 if (err) 1297 if (err)
1298 goto out; 1298 goto out;
1299 1299
1300 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1300 if (createmode == NFS3_CREATE_EXCLUSIVE) {
1301 /* solaris7 gets confused (bugid 4218508) if these have 1301 /* solaris7 gets confused (bugid 4218508) if these have
1302 * the high bit set, so just clear the high bits. 1302 * the high bit set, so just clear the high bits.
1303 */ 1303 */
1304 v_mtime = verifier[0]&0x7fffffff; 1304 v_mtime = verifier[0]&0x7fffffff;
1305 v_atime = verifier[1]&0x7fffffff; 1305 v_atime = verifier[1]&0x7fffffff;
1306 } 1306 }
1307 1307
1308 if (dchild->d_inode) { 1308 if (dchild->d_inode) {
1309 err = 0; 1309 err = 0;
1310 1310
1311 switch (createmode) { 1311 switch (createmode) {
1312 case NFS3_CREATE_UNCHECKED: 1312 case NFS3_CREATE_UNCHECKED:
1313 if (!S_ISREG(dchild->d_inode->i_mode)) 1313 if (!S_ISREG(dchild->d_inode->i_mode))
1314 err = nfserr_exist; 1314 err = nfserr_exist;
1315 else if (truncp) { 1315 else if (truncp) {
1316 /* in nfsv4, we need to treat this case a little 1316 /* in nfsv4, we need to treat this case a little
1317 * differently. we don't want to truncate the 1317 * differently. we don't want to truncate the
1318 * file now; this would be wrong if the OPEN 1318 * file now; this would be wrong if the OPEN
1319 * fails for some other reason. furthermore, 1319 * fails for some other reason. furthermore,
1320 * if the size is nonzero, we should ignore it 1320 * if the size is nonzero, we should ignore it
1321 * according to spec! 1321 * according to spec!
1322 */ 1322 */
1323 *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size; 1323 *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size;
1324 } 1324 }
1325 else { 1325 else {
1326 iap->ia_valid &= ATTR_SIZE; 1326 iap->ia_valid &= ATTR_SIZE;
1327 goto set_attr; 1327 goto set_attr;
1328 } 1328 }
1329 break; 1329 break;
1330 case NFS3_CREATE_EXCLUSIVE: 1330 case NFS3_CREATE_EXCLUSIVE:
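/*
 * If the timestamps carry our verifier and the file is empty, this
 * is a retransmission of the same exclusive create; report success.
 */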
1331 if (dchild->d_inode->i_mtime.tv_sec == v_mtime 1331 if (dchild->d_inode->i_mtime.tv_sec == v_mtime
1332 && dchild->d_inode->i_atime.tv_sec == v_atime 1332 && dchild->d_inode->i_atime.tv_sec == v_atime
1333 && dchild->d_inode->i_size == 0) 1333 && dchild->d_inode->i_size == 0)
1334 break; 1334 break;
1335 /* fallthru */ 1335 /* fallthru */
1336 case NFS3_CREATE_GUARDED: 1336 case NFS3_CREATE_GUARDED:
1337 err = nfserr_exist; 1337 err = nfserr_exist;
1338 } 1338 }
1339 goto out; 1339 goto out;
1340 } 1340 }
1341 1341
1342 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1342 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1343 if (host_err < 0) 1343 if (host_err < 0)
1344 goto out_nfserr; 1344 goto out_nfserr;
1345 if (created) 1345 if (created)
1346 *created = 1; 1346 *created = 1;
1347 1347
1348 if (EX_ISSYNC(fhp->fh_export)) { 1348 if (EX_ISSYNC(fhp->fh_export)) {
1349 err = nfserrno(nfsd_sync_dir(dentry)); 1349 err = nfserrno(nfsd_sync_dir(dentry));
1350 /* setattr will sync the child (or not) */ 1350 /* setattr will sync the child (or not) */
1351 } 1351 }
1352 1352
1353 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1353 if (createmode == NFS3_CREATE_EXCLUSIVE) {
1354 /* Cram the verifier into atime/mtime */ 1354 /* Cram the verifier into atime/mtime */
1355 iap->ia_valid = ATTR_MTIME|ATTR_ATIME 1355 iap->ia_valid = ATTR_MTIME|ATTR_ATIME
1356 | ATTR_MTIME_SET|ATTR_ATIME_SET; 1356 | ATTR_MTIME_SET|ATTR_ATIME_SET;
1357 /* XXX someone who knows this better please fix it for nsec */ 1357 /* XXX someone who knows this better please fix it for nsec */
1358 iap->ia_mtime.tv_sec = v_mtime; 1358 iap->ia_mtime.tv_sec = v_mtime;
1359 iap->ia_atime.tv_sec = v_atime; 1359 iap->ia_atime.tv_sec = v_atime;
1360 iap->ia_mtime.tv_nsec = 0; 1360 iap->ia_mtime.tv_nsec = 0;
1361 iap->ia_atime.tv_nsec = 0; 1361 iap->ia_atime.tv_nsec = 0;
1362 } 1362 }
1363 1363
1364 /* Set file attributes. 1364 /* Set file attributes.
1365 * Irix appears to send along the gid when it tries to 1365 * Irix appears to send along the gid when it tries to
1366 * implement setgid directories via NFS. Clear out all that cruft. 1366 * implement setgid directories via NFS. Clear out all that cruft.
1367 */ 1367 */
1368 set_attr: 1368 set_attr:
1369 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) { 1369 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
1370 __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); 1370 __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
1371 if (err2) 1371 if (err2)
1372 err = err2; 1372 err = err2;
1373 } 1373 }
1374 1374
1375 /* 1375 /*
1376 * Update the filehandle to get the new inode info. 1376 * Update the filehandle to get the new inode info.
1377 */ 1377 */
1378 if (!err) 1378 if (!err)
1379 err = fh_update(resfhp); 1379 err = fh_update(resfhp);
1380 1380
1381 out: 1381 out:
1382 fh_unlock(fhp); 1382 fh_unlock(fhp);
1383 if (dchild && !IS_ERR(dchild)) 1383 if (dchild && !IS_ERR(dchild))
1384 dput(dchild); 1384 dput(dchild);
1385 return err; 1385 return err;
1386 1386
1387 out_nfserr: 1387 out_nfserr:
1388 err = nfserrno(host_err); 1388 err = nfserrno(host_err);
1389 goto out; 1389 goto out;
1390 } 1390 }
1391 #endif /* CONFIG_NFSD_V3 */ 1391 #endif /* CONFIG_NFSD_V3 */
1392 1392
1393 /* 1393 /*
1394 * Read a symlink. On entry, *lenp must contain the maximum path length that 1394 * Read a symlink. On entry, *lenp must contain the maximum path length that
1395 * fits into the buffer. On return, it contains the true length. 1395 * fits into the buffer. On return, it contains the true length.
1396 * N.B. After this call fhp needs an fh_put 1396 * N.B. After this call fhp needs an fh_put
1397 */ 1397 */
1398 __be32 1398 __be32
1399 nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) 1399 nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
1400 { 1400 {
1401 struct dentry *dentry; 1401 struct dentry *dentry;
1402 struct inode *inode; 1402 struct inode *inode;
1403 mm_segment_t oldfs; 1403 mm_segment_t oldfs;
1404 __be32 err; 1404 __be32 err;
1405 int host_err; 1405 int host_err;
1406 1406
1407 err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP); 1407 err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP);
1408 if (err) 1408 if (err)
1409 goto out; 1409 goto out;
1410 1410
1411 dentry = fhp->fh_dentry; 1411 dentry = fhp->fh_dentry;
1412 inode = dentry->d_inode; 1412 inode = dentry->d_inode;
1413 1413
1414 err = nfserr_inval; 1414 err = nfserr_inval;
1415 if (!inode->i_op || !inode->i_op->readlink) 1415 if (!inode->i_op || !inode->i_op->readlink)
1416 goto out; 1416 goto out;
1417 1417
1418 touch_atime(fhp->fh_export->ex_mnt, dentry); 1418 touch_atime(fhp->fh_export->ex_mnt, dentry);
1419 /* N.B. Why does this call need a get_fs()?? 1419 /* N.B. Why does this call need a get_fs()??
1420 * Remove the set_fs and watch the fireworks:-) --okir 1420 * Remove the set_fs and watch the fireworks:-) --okir
1421 */ 1421 */
1422 1422
1423 oldfs = get_fs(); set_fs(KERNEL_DS); 1423 oldfs = get_fs(); set_fs(KERNEL_DS);
1424 host_err = inode->i_op->readlink(dentry, buf, *lenp); 1424 host_err = inode->i_op->readlink(dentry, buf, *lenp);
1425 set_fs(oldfs); 1425 set_fs(oldfs);
1426 1426
1427 if (host_err < 0) 1427 if (host_err < 0)
1428 goto out_nfserr; 1428 goto out_nfserr;
1429 *lenp = host_err; 1429 *lenp = host_err;
1430 err = 0; 1430 err = 0;
1431 out: 1431 out:
1432 return err; 1432 return err;
1433 1433
1434 out_nfserr: 1434 out_nfserr:
1435 err = nfserrno(host_err); 1435 err = nfserrno(host_err);
1436 goto out; 1436 goto out;
1437 } 1437 }
1438 1438
1439 /* 1439 /*
1440 * Create a symlink and look up its inode 1440 * Create a symlink and look up its inode
1441 * N.B. After this call _both_ fhp and resfhp need an fh_put 1441 * N.B. After this call _both_ fhp and resfhp need an fh_put
1442 */ 1442 */
1443 __be32 1443 __be32
1444 nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, 1444 nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1445 char *fname, int flen, 1445 char *fname, int flen,
1446 char *path, int plen, 1446 char *path, int plen,
1447 struct svc_fh *resfhp, 1447 struct svc_fh *resfhp,
1448 struct iattr *iap) 1448 struct iattr *iap)
1449 { 1449 {
1450 struct dentry *dentry, *dnew; 1450 struct dentry *dentry, *dnew;
1451 __be32 err, cerr; 1451 __be32 err, cerr;
1452 int host_err; 1452 int host_err;
1453 umode_t mode; 1453 umode_t mode;
1454 1454
1455 err = nfserr_noent; 1455 err = nfserr_noent;
1456 if (!flen || !plen) 1456 if (!flen || !plen)
1457 goto out; 1457 goto out;
1458 err = nfserr_exist; 1458 err = nfserr_exist;
1459 if (isdotent(fname, flen)) 1459 if (isdotent(fname, flen))
1460 goto out; 1460 goto out;
1461 1461
1462 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); 1462 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
1463 if (err) 1463 if (err)
1464 goto out; 1464 goto out;
1465 fh_lock(fhp); 1465 fh_lock(fhp);
1466 dentry = fhp->fh_dentry; 1466 dentry = fhp->fh_dentry;
1467 dnew = lookup_one_len(fname, dentry, flen); 1467 dnew = lookup_one_len(fname, dentry, flen);
1468 host_err = PTR_ERR(dnew); 1468 host_err = PTR_ERR(dnew);
1469 if (IS_ERR(dnew)) 1469 if (IS_ERR(dnew))
1470 goto out_nfserr; 1470 goto out_nfserr;
1471 1471
1472 mode = S_IALLUGO; 1472 mode = S_IALLUGO;
1473 /* Only the MODE ATTRibute is even vaguely meaningful */ 1473 /* Only the MODE ATTRibute is even vaguely meaningful */
1474 if (iap && (iap->ia_valid & ATTR_MODE)) 1474 if (iap && (iap->ia_valid & ATTR_MODE))
1475 mode = iap->ia_mode & S_IALLUGO; 1475 mode = iap->ia_mode & S_IALLUGO;
1476 1476
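/*
 * The path that came off the wire is not guaranteed to be NUL
 * terminated; if it isn't, copy it into a terminated buffer before
 * handing it to vfs_symlink().
 */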
1477 if (unlikely(path[plen] != 0)) { 1477 if (unlikely(path[plen] != 0)) {
1478 char *path_alloced = kmalloc(plen+1, GFP_KERNEL); 1478 char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
1479 if (path_alloced == NULL) 1479 if (path_alloced == NULL)
1480 host_err = -ENOMEM; 1480 host_err = -ENOMEM;
1481 else { 1481 else {
1482 strncpy(path_alloced, path, plen); 1482 strncpy(path_alloced, path, plen);
1483 path_alloced[plen] = 0; 1483 path_alloced[plen] = 0;
1484 host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode); 1484 host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode);
1485 kfree(path_alloced); 1485 kfree(path_alloced);
1486 } 1486 }
1487 } else 1487 } else
1488 host_err = vfs_symlink(dentry->d_inode, dnew, path, mode); 1488 host_err = vfs_symlink(dentry->d_inode, dnew, path, mode);
1489 1489
1490 if (!host_err) { 1490 if (!host_err) {
1491 if (EX_ISSYNC(fhp->fh_export)) 1491 if (EX_ISSYNC(fhp->fh_export))
1492 host_err = nfsd_sync_dir(dentry); 1492 host_err = nfsd_sync_dir(dentry);
1493 } 1493 }
1494 err = nfserrno(host_err); 1494 err = nfserrno(host_err);
1495 fh_unlock(fhp); 1495 fh_unlock(fhp);
1496 1496
1497 cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); 1497 cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
1498 dput(dnew); 1498 dput(dnew);
1499 if (err==0) err = cerr; 1499 if (err==0) err = cerr;
1500 out: 1500 out:
1501 return err; 1501 return err;
1502 1502
1503 out_nfserr: 1503 out_nfserr:
1504 err = nfserrno(host_err); 1504 err = nfserrno(host_err);
1505 goto out; 1505 goto out;
1506 } 1506 }
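
A side note on the path handling above: NFS hands nfsd_symlink() a length-counted string that may not be NUL-terminated, so the code copies it into a fresh buffer only when path[plen] is non-zero. A minimal user-space sketch of the same defensive pattern follows; make_terminated() is a hypothetical helper, not a kernel function.

	#include <stdlib.h>
	#include <string.h>

	/* Return a NUL-terminated view of buf[0..len). If buf happens to
	 * be terminated already, use it in place; otherwise allocate a
	 * copy. *allocated tells the caller whether free() is needed
	 * afterwards. Illustrative sketch of the nfsd_symlink() logic. */
	static char *make_terminated(char *buf, size_t len, int *allocated)
	{
		char *copy;

		*allocated = 0;
		if (buf[len] == '\0')		/* already terminated */
			return buf;

		copy = malloc(len + 1);
		if (!copy)
			return NULL;		/* caller maps this to -ENOMEM */
		memcpy(copy, buf, len);
		copy[len] = '\0';
		*allocated = 1;
		return copy;
	}
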
1507 1507
1508 /* 1508 /*
1509 * Create a hardlink 1509 * Create a hardlink
1510 * N.B. After this call _both_ ffhp and tfhp need an fh_put 1510 * N.B. After this call _both_ ffhp and tfhp need an fh_put
1511 */ 1511 */
1512 __be32 1512 __be32
1513 nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, 1513 nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1514 char *name, int len, struct svc_fh *tfhp) 1514 char *name, int len, struct svc_fh *tfhp)
1515 { 1515 {
1516 struct dentry *ddir, *dnew, *dold; 1516 struct dentry *ddir, *dnew, *dold;
1517 struct inode *dirp, *dest; 1517 struct inode *dirp, *dest;
1518 __be32 err; 1518 __be32 err;
1519 int host_err; 1519 int host_err;
1520 1520
1521 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE); 1521 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE);
1522 if (err) 1522 if (err)
1523 goto out; 1523 goto out;
1524 err = fh_verify(rqstp, tfhp, -S_IFDIR, MAY_NOP); 1524 err = fh_verify(rqstp, tfhp, -S_IFDIR, MAY_NOP);
1525 if (err) 1525 if (err)
1526 goto out; 1526 goto out;
1527 1527
1528 err = nfserr_perm; 1528 err = nfserr_perm;
1529 if (!len) 1529 if (!len)
1530 goto out; 1530 goto out;
1531 err = nfserr_exist; 1531 err = nfserr_exist;
1532 if (isdotent(name, len)) 1532 if (isdotent(name, len))
1533 goto out; 1533 goto out;
1534 1534
1535 fh_lock_nested(ffhp, I_MUTEX_PARENT); 1535 fh_lock_nested(ffhp, I_MUTEX_PARENT);
1536 ddir = ffhp->fh_dentry; 1536 ddir = ffhp->fh_dentry;
1537 dirp = ddir->d_inode; 1537 dirp = ddir->d_inode;
1538 1538
1539 dnew = lookup_one_len(name, ddir, len); 1539 dnew = lookup_one_len(name, ddir, len);
1540 host_err = PTR_ERR(dnew); 1540 host_err = PTR_ERR(dnew);
1541 if (IS_ERR(dnew)) 1541 if (IS_ERR(dnew))
1542 goto out_nfserr; 1542 goto out_nfserr;
1543 1543
1544 dold = tfhp->fh_dentry; 1544 dold = tfhp->fh_dentry;
1545 dest = dold->d_inode; 1545 dest = dold->d_inode;
1546 1546
1547 host_err = vfs_link(dold, dirp, dnew); 1547 host_err = vfs_link(dold, dirp, dnew);
1548 if (!host_err) { 1548 if (!host_err) {
1549 if (EX_ISSYNC(ffhp->fh_export)) { 1549 if (EX_ISSYNC(ffhp->fh_export)) {
1550 err = nfserrno(nfsd_sync_dir(ddir)); 1550 err = nfserrno(nfsd_sync_dir(ddir));
1551 write_inode_now(dest, 1); 1551 write_inode_now(dest, 1);
1552 } 1552 }
1553 err = 0; 1553 err = 0;
1554 } else { 1554 } else {
1555 if (host_err == -EXDEV && rqstp->rq_vers == 2) 1555 if (host_err == -EXDEV && rqstp->rq_vers == 2)
1556 err = nfserr_acces; 1556 err = nfserr_acces;
1557 else 1557 else
1558 err = nfserrno(host_err); 1558 err = nfserrno(host_err);
1559 } 1559 }
1560 1560
1561 dput(dnew); 1561 dput(dnew);
1562 out_unlock: 1562 out_unlock:
1563 fh_unlock(ffhp); 1563 fh_unlock(ffhp);
1564 out: 1564 out:
1565 return err; 1565 return err;
1566 1566
1567 out_nfserr: 1567 out_nfserr:
1568 err = nfserrno(host_err); 1568 err = nfserrno(host_err);
1569 goto out_unlock; 1569 goto out_unlock;
1570 } 1570 }
1571 1571
1572 /* 1572 /*
1573 * Rename a file 1573 * Rename a file
1574 * N.B. After this call _both_ ffhp and tfhp need an fh_put 1574 * N.B. After this call _both_ ffhp and tfhp need an fh_put
1575 */ 1575 */
1576 __be32 1576 __be32
1577 nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, 1577 nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1578 struct svc_fh *tfhp, char *tname, int tlen) 1578 struct svc_fh *tfhp, char *tname, int tlen)
1579 { 1579 {
1580 struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; 1580 struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap;
1581 struct inode *fdir, *tdir; 1581 struct inode *fdir, *tdir;
1582 __be32 err; 1582 __be32 err;
1583 int host_err; 1583 int host_err;
1584 1584
1585 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE); 1585 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE);
1586 if (err) 1586 if (err)
1587 goto out; 1587 goto out;
1588 err = fh_verify(rqstp, tfhp, S_IFDIR, MAY_CREATE); 1588 err = fh_verify(rqstp, tfhp, S_IFDIR, MAY_CREATE);
1589 if (err) 1589 if (err)
1590 goto out; 1590 goto out;
1591 1591
1592 fdentry = ffhp->fh_dentry; 1592 fdentry = ffhp->fh_dentry;
1593 fdir = fdentry->d_inode; 1593 fdir = fdentry->d_inode;
1594 1594
1595 tdentry = tfhp->fh_dentry; 1595 tdentry = tfhp->fh_dentry;
1596 tdir = tdentry->d_inode; 1596 tdir = tdentry->d_inode;
1597 1597
1598 err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev; 1598 err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
1599 if (ffhp->fh_export != tfhp->fh_export) 1599 if (ffhp->fh_export != tfhp->fh_export)
1600 goto out; 1600 goto out;
1601 1601
1602 err = nfserr_perm; 1602 err = nfserr_perm;
1603 if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) 1603 if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
1604 goto out; 1604 goto out;
1605 1605
1606 /* cannot use fh_lock as we need deadlock protective ordering 1606 /* cannot use fh_lock as we need deadlock protective ordering
1607 * so do it by hand */ 1607 * so do it by hand */
1608 trap = lock_rename(tdentry, fdentry); 1608 trap = lock_rename(tdentry, fdentry);
1609 ffhp->fh_locked = tfhp->fh_locked = 1; 1609 ffhp->fh_locked = tfhp->fh_locked = 1;
1610 fill_pre_wcc(ffhp); 1610 fill_pre_wcc(ffhp);
1611 fill_pre_wcc(tfhp); 1611 fill_pre_wcc(tfhp);
1612 1612
1613 odentry = lookup_one_len(fname, fdentry, flen); 1613 odentry = lookup_one_len(fname, fdentry, flen);
1614 host_err = PTR_ERR(odentry); 1614 host_err = PTR_ERR(odentry);
1615 if (IS_ERR(odentry)) 1615 if (IS_ERR(odentry))
1616 goto out_nfserr; 1616 goto out_nfserr;
1617 1617
1618 host_err = -ENOENT; 1618 host_err = -ENOENT;
1619 if (!odentry->d_inode) 1619 if (!odentry->d_inode)
1620 goto out_dput_old; 1620 goto out_dput_old;
1621 host_err = -EINVAL; 1621 host_err = -EINVAL;
1622 if (odentry == trap) 1622 if (odentry == trap)
1623 goto out_dput_old; 1623 goto out_dput_old;
1624 1624
1625 ndentry = lookup_one_len(tname, tdentry, tlen); 1625 ndentry = lookup_one_len(tname, tdentry, tlen);
1626 host_err = PTR_ERR(ndentry); 1626 host_err = PTR_ERR(ndentry);
1627 if (IS_ERR(ndentry)) 1627 if (IS_ERR(ndentry))
1628 goto out_dput_old; 1628 goto out_dput_old;
1629 host_err = -ENOTEMPTY; 1629 host_err = -ENOTEMPTY;
1630 if (ndentry == trap) 1630 if (ndentry == trap)
1631 goto out_dput_new; 1631 goto out_dput_new;
1632 1632
1633 #ifdef MSNFS 1633 #ifdef MSNFS
1634 if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1634 if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
1635 ((atomic_read(&odentry->d_count) > 1) 1635 ((atomic_read(&odentry->d_count) > 1)
1636 || (atomic_read(&ndentry->d_count) > 1))) { 1636 || (atomic_read(&ndentry->d_count) > 1))) {
1637 host_err = -EPERM; 1637 host_err = -EPERM;
1638 } else 1638 } else
1639 #endif 1639 #endif
1640 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1640 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1641 if (!host_err && EX_ISSYNC(tfhp->fh_export)) { 1641 if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
1642 host_err = nfsd_sync_dir(tdentry); 1642 host_err = nfsd_sync_dir(tdentry);
1643 if (!host_err) 1643 if (!host_err)
1644 host_err = nfsd_sync_dir(fdentry); 1644 host_err = nfsd_sync_dir(fdentry);
1645 } 1645 }
1646 1646
1647 out_dput_new: 1647 out_dput_new:
1648 dput(ndentry); 1648 dput(ndentry);
1649 out_dput_old: 1649 out_dput_old:
1650 dput(odentry); 1650 dput(odentry);
1651 out_nfserr: 1651 out_nfserr:
1652 err = nfserrno(host_err); 1652 err = nfserrno(host_err);
1653 1653
1654 /* we cannot rely on fh_unlock on the two filehandles, 1654 /* we cannot rely on fh_unlock on the two filehandles,
1655 * as that would do the wrong thing if the two directories 1655 * as that would do the wrong thing if the two directories
1656 * were the same, so again we do it by hand 1656 * were the same, so again we do it by hand
1657 */ 1657 */
1658 fill_post_wcc(ffhp); 1658 fill_post_wcc(ffhp);
1659 fill_post_wcc(tfhp); 1659 fill_post_wcc(tfhp);
1660 unlock_rename(tdentry, fdentry); 1660 unlock_rename(tdentry, fdentry);
1661 ffhp->fh_locked = tfhp->fh_locked = 0; 1661 ffhp->fh_locked = tfhp->fh_locked = 0;
1662 1662
1663 out: 1663 out:
1664 return err; 1664 return err;
1665 } 1665 }
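
The rename path cannot use fh_lock() because taking two directory mutexes in arbitrary order invites AB-BA deadlock; lock_rename() acquires them in a consistent order and returns the "trap" dentry that the lookups must not resolve to. The ordering trick on its own, as a small pthreads sketch (illustrative only; the kernel version additionally handles the ancestor/descendant cases):

	#include <pthread.h>
	#include <stdint.h>

	/* Acquire two locks in a globally consistent (address) order, so
	 * two threads locking the same pair with swapped arguments
	 * cannot deadlock. */
	static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		pthread_mutex_t *tmp;

		if (a == b) {			/* same directory: one lock */
			pthread_mutex_lock(a);
			return;
		}
		if ((uintptr_t)a > (uintptr_t)b) {	/* normalize order */
			tmp = a;
			a = b;
			b = tmp;
		}
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	}
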
1666 1666
1667 /* 1667 /*
1668 * Unlink a file or directory 1668 * Unlink a file or directory
1669 * N.B. After this call fhp needs an fh_put 1669 * N.B. After this call fhp needs an fh_put
1670 */ 1670 */
1671 __be32 1671 __be32
1672 nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, 1672 nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1673 char *fname, int flen) 1673 char *fname, int flen)
1674 { 1674 {
1675 struct dentry *dentry, *rdentry; 1675 struct dentry *dentry, *rdentry;
1676 struct inode *dirp; 1676 struct inode *dirp;
1677 __be32 err; 1677 __be32 err;
1678 int host_err; 1678 int host_err;
1679 1679
1680 err = nfserr_acces; 1680 err = nfserr_acces;
1681 if (!flen || isdotent(fname, flen)) 1681 if (!flen || isdotent(fname, flen))
1682 goto out; 1682 goto out;
1683 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_REMOVE); 1683 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_REMOVE);
1684 if (err) 1684 if (err)
1685 goto out; 1685 goto out;
1686 1686
1687 fh_lock_nested(fhp, I_MUTEX_PARENT); 1687 fh_lock_nested(fhp, I_MUTEX_PARENT);
1688 dentry = fhp->fh_dentry; 1688 dentry = fhp->fh_dentry;
1689 dirp = dentry->d_inode; 1689 dirp = dentry->d_inode;
1690 1690
1691 rdentry = lookup_one_len(fname, dentry, flen); 1691 rdentry = lookup_one_len(fname, dentry, flen);
1692 host_err = PTR_ERR(rdentry); 1692 host_err = PTR_ERR(rdentry);
1693 if (IS_ERR(rdentry)) 1693 if (IS_ERR(rdentry))
1694 goto out_nfserr; 1694 goto out_nfserr;
1695 1695
1696 if (!rdentry->d_inode) { 1696 if (!rdentry->d_inode) {
1697 dput(rdentry); 1697 dput(rdentry);
1698 err = nfserr_noent; 1698 err = nfserr_noent;
1699 goto out; 1699 goto out;
1700 } 1700 }
1701 1701
1702 if (!type) 1702 if (!type)
1703 type = rdentry->d_inode->i_mode & S_IFMT; 1703 type = rdentry->d_inode->i_mode & S_IFMT;
1704 1704
1705 if (type != S_IFDIR) { /* It's UNLINK */ 1705 if (type != S_IFDIR) { /* It's UNLINK */
1706 #ifdef MSNFS 1706 #ifdef MSNFS
1707 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1707 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
1708 (atomic_read(&rdentry->d_count) > 1)) { 1708 (atomic_read(&rdentry->d_count) > 1)) {
1709 host_err = -EPERM; 1709 host_err = -EPERM;
1710 } else 1710 } else
1711 #endif 1711 #endif
1712 host_err = vfs_unlink(dirp, rdentry); 1712 host_err = vfs_unlink(dirp, rdentry);
1713 } else { /* It's RMDIR */ 1713 } else { /* It's RMDIR */
1714 host_err = vfs_rmdir(dirp, rdentry); 1714 host_err = vfs_rmdir(dirp, rdentry);
1715 } 1715 }
1716 1716
1717 dput(rdentry); 1717 dput(rdentry);
1718 1718
1719 if (host_err) 1719 if (host_err)
1720 goto out_nfserr; 1720 goto out_nfserr;
1721 if (EX_ISSYNC(fhp->fh_export)) 1721 if (EX_ISSYNC(fhp->fh_export))
1722 host_err = nfsd_sync_dir(dentry); 1722 host_err = nfsd_sync_dir(dentry);
1723 1723
1724 out_nfserr: 1724 out_nfserr:
1725 err = nfserrno(host_err); 1725 err = nfserrno(host_err);
1726 out: 1726 out:
1727 return err; 1727 return err;
1728 } 1728 }
1729 1729
1730 /* 1730 /*
1731 * Read entries from a directory. 1731 * Read entries from a directory.
1732 * We ignore the NFSv3/4 verifier for now. 1732 * We ignore the NFSv3/4 verifier for now.
1733 */ 1733 */
1734 __be32 1734 __be32
1735 nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, 1735 nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
1736 struct readdir_cd *cdp, filldir_t func) 1736 struct readdir_cd *cdp, filldir_t func)
1737 { 1737 {
1738 __be32 err; 1738 __be32 err;
1739 int host_err; 1739 int host_err;
1740 struct file *file; 1740 struct file *file;
1741 loff_t offset = *offsetp; 1741 loff_t offset = *offsetp;
1742 1742
1743 err = nfsd_open(rqstp, fhp, S_IFDIR, MAY_READ, &file); 1743 err = nfsd_open(rqstp, fhp, S_IFDIR, MAY_READ, &file);
1744 if (err) 1744 if (err)
1745 goto out; 1745 goto out;
1746 1746
1747 offset = vfs_llseek(file, offset, 0); 1747 offset = vfs_llseek(file, offset, 0);
1748 if (offset < 0) { 1748 if (offset < 0) {
1749 err = nfserrno((int)offset); 1749 err = nfserrno((int)offset);
1750 goto out_close; 1750 goto out_close;
1751 } 1751 }
1752 1752
1753 /* 1753 /*
1754 * Read the directory entries. This silly loop is necessary because 1754 * Read the directory entries. This silly loop is necessary because
1755 * readdir() is not guaranteed to fill up the entire buffer, but 1755 * readdir() is not guaranteed to fill up the entire buffer, but
1756 * may choose to do less. 1756 * may choose to do less.
1757 */ 1757 */
1758 1758
1759 do { 1759 do {
1760 cdp->err = nfserr_eof; /* will be cleared on successful read */ 1760 cdp->err = nfserr_eof; /* will be cleared on successful read */
1761 host_err = vfs_readdir(file, func, cdp); 1761 host_err = vfs_readdir(file, func, cdp);
1762 } while (host_err >= 0 && cdp->err == nfs_ok); 1762 } while (host_err >= 0 && cdp->err == nfs_ok);
1763 if (host_err) 1763 if (host_err)
1764 err = nfserrno(host_err); 1764 err = nfserrno(host_err);
1765 else 1765 else
1766 err = cdp->err; 1766 err = cdp->err;
1767 *offsetp = vfs_llseek(file, 0, 1); 1767 *offsetp = vfs_llseek(file, 0, 1);
1768 1768
1769 if (err == nfserr_eof || err == nfserr_toosmall) 1769 if (err == nfserr_eof || err == nfserr_toosmall)
1770 err = nfs_ok; /* can still be found in ->err */ 1770 err = nfs_ok; /* can still be found in ->err */
1771 out_close: 1771 out_close:
1772 nfsd_close(file); 1772 nfsd_close(file);
1773 out: 1773 out:
1774 return err; 1774 return err;
1775 } 1775 }
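
The loop here exists because a single vfs_readdir() call may emit fewer entries than the buffer allows; nfsd keeps calling until the fill callback stops reporting nfs_ok or an error comes back. The same drain-until-empty shape with the raw Linux getdents64 syscall, as a user-space, Linux-specific sketch:

	#include <sys/syscall.h>
	#include <unistd.h>

	/* Read a whole directory through getdents64(), which, like
	 * vfs_readdir(), may return fewer bytes than the buffer holds
	 * on any given call. Returns total entry bytes, or -1 on error. */
	static long drain_dir(int fd, char *buf, unsigned long bufsz)
	{
		long n, total = 0;

		for (;;) {
			n = syscall(SYS_getdents64, fd, buf, bufsz);
			if (n < 0)
				return -1;	/* error: see errno */
			if (n == 0)
				return total;	/* end of directory */
			total += n;		/* short read is normal: loop */
		}
	}
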
1776 1776
1777 /* 1777 /*
1778 * Get file system stats 1778 * Get file system stats
1779 * N.B. After this call fhp needs an fh_put 1779 * N.B. After this call fhp needs an fh_put
1780 */ 1780 */
1781 __be32 1781 __be32
1782 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) 1782 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
1783 { 1783 {
1784 __be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP); 1784 __be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP);
1785 if (!err && vfs_statfs(fhp->fh_dentry,stat)) 1785 if (!err && vfs_statfs(fhp->fh_dentry,stat))
1786 err = nfserr_io; 1786 err = nfserr_io;
1787 return err; 1787 return err;
1788 } 1788 }
1789 1789
1790 /* 1790 /*
1791 * Check for a user's access permissions to this inode. 1791 * Check for a user's access permissions to this inode.
1792 */ 1792 */
1793 __be32 1793 __be32
1794 nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc) 1794 nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
1795 { 1795 {
1796 struct inode *inode = dentry->d_inode; 1796 struct inode *inode = dentry->d_inode;
1797 int err; 1797 int err;
1798 1798
1799 if (acc == MAY_NOP) 1799 if (acc == MAY_NOP)
1800 return 0; 1800 return 0;
1801 #if 0 1801 #if 0
1802 dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n", 1802 dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
1803 acc, 1803 acc,
1804 (acc & MAY_READ)? " read" : "", 1804 (acc & MAY_READ)? " read" : "",
1805 (acc & MAY_WRITE)? " write" : "", 1805 (acc & MAY_WRITE)? " write" : "",
1806 (acc & MAY_EXEC)? " exec" : "", 1806 (acc & MAY_EXEC)? " exec" : "",
1807 (acc & MAY_SATTR)? " sattr" : "", 1807 (acc & MAY_SATTR)? " sattr" : "",
1808 (acc & MAY_TRUNC)? " trunc" : "", 1808 (acc & MAY_TRUNC)? " trunc" : "",
1809 (acc & MAY_LOCK)? " lock" : "", 1809 (acc & MAY_LOCK)? " lock" : "",
1810 (acc & MAY_OWNER_OVERRIDE)? " owneroverride" : "", 1810 (acc & MAY_OWNER_OVERRIDE)? " owneroverride" : "",
1811 inode->i_mode, 1811 inode->i_mode,
1812 IS_IMMUTABLE(inode)? " immut" : "", 1812 IS_IMMUTABLE(inode)? " immut" : "",
1813 IS_APPEND(inode)? " append" : "", 1813 IS_APPEND(inode)? " append" : "",
1814 IS_RDONLY(inode)? " ro" : ""); 1814 IS_RDONLY(inode)? " ro" : "");
1815 dprintk(" owner %d/%d user %d/%d\n", 1815 dprintk(" owner %d/%d user %d/%d\n",
1816 inode->i_uid, inode->i_gid, current->fsuid, current->fsgid); 1816 inode->i_uid, inode->i_gid, current->fsuid, current->fsgid);
1817 #endif 1817 #endif
1818 1818
1819 /* Normally we reject any write/sattr etc access on a read-only file 1819 /* Normally we reject any write/sattr etc access on a read-only file
1820 * system. But if it is IRIX doing a check on write-access for a 1820 * system. But if it is IRIX doing a check on write-access for a
1821 * device special file, we ignore rofs. 1821 * device special file, we ignore rofs.
1822 */ 1822 */
1823 if (!(acc & MAY_LOCAL_ACCESS)) 1823 if (!(acc & MAY_LOCAL_ACCESS))
1824 if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) { 1824 if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
1825 if (EX_RDONLY(exp) || IS_RDONLY(inode)) 1825 if (EX_RDONLY(exp) || IS_RDONLY(inode))
1826 return nfserr_rofs; 1826 return nfserr_rofs;
1827 if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode)) 1827 if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
1828 return nfserr_perm; 1828 return nfserr_perm;
1829 } 1829 }
1830 if ((acc & MAY_TRUNC) && IS_APPEND(inode)) 1830 if ((acc & MAY_TRUNC) && IS_APPEND(inode))
1831 return nfserr_perm; 1831 return nfserr_perm;
1832 1832
1833 if (acc & MAY_LOCK) { 1833 if (acc & MAY_LOCK) {
1834 /* If we cannot rely on authentication in NLM requests, 1834 /* If we cannot rely on authentication in NLM requests,
1835 * just allow locks, otherwise require read permission, or 1835 * just allow locks, otherwise require read permission, or
1836 * ownership 1836 * ownership
1837 */ 1837 */
1838 if (exp->ex_flags & NFSEXP_NOAUTHNLM) 1838 if (exp->ex_flags & NFSEXP_NOAUTHNLM)
1839 return 0; 1839 return 0;
1840 else 1840 else
1841 acc = MAY_READ | MAY_OWNER_OVERRIDE; 1841 acc = MAY_READ | MAY_OWNER_OVERRIDE;
1842 } 1842 }
1843 /* 1843 /*
1844 * The file owner always gets access permission for accesses that 1844 * The file owner always gets access permission for accesses that
1845 * would normally be checked at open time. This is to make 1845 * would normally be checked at open time. This is to make
1846 * file access work even when the client has done a fchmod(fd, 0). 1846 * file access work even when the client has done a fchmod(fd, 0).
1847 * 1847 *
1848 * However, `cp foo bar' should fail nevertheless when bar is 1848 * However, `cp foo bar' should fail nevertheless when bar is
1849 * readonly. A sensible way to do this might be to reject all 1849 * readonly. A sensible way to do this might be to reject all
1850 * attempts to truncate a read-only file, because a creat() call 1850 * attempts to truncate a read-only file, because a creat() call
1851 * always implies file truncation. 1851 * always implies file truncation.
1852 * ... but this isn't really fair. A process may reasonably call 1852 * ... but this isn't really fair. A process may reasonably call
1853 * ftruncate on an open file descriptor on a file with perm 000. 1853 * ftruncate on an open file descriptor on a file with perm 000.
1854 * We must trust the client to do permission checking - using "ACCESS" 1854 * We must trust the client to do permission checking - using "ACCESS"
1855 * with NFSv3. 1855 * with NFSv3.
1856 */ 1856 */
1857 if ((acc & MAY_OWNER_OVERRIDE) && 1857 if ((acc & MAY_OWNER_OVERRIDE) &&
1858 inode->i_uid == current->fsuid) 1858 inode->i_uid == current->fsuid)
1859 return 0; 1859 return 0;
1860 1860
1861 err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL); 1861 err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL);
1862 1862
1863 /* Allow read access to binaries even when mode 111 */ 1863 /* Allow read access to binaries even when mode 111 */
1864 if (err == -EACCES && S_ISREG(inode->i_mode) && 1864 if (err == -EACCES && S_ISREG(inode->i_mode) &&
1865 acc == (MAY_READ | MAY_OWNER_OVERRIDE)) 1865 acc == (MAY_READ | MAY_OWNER_OVERRIDE))
1866 err = permission(inode, MAY_EXEC, NULL); 1866 err = permission(inode, MAY_EXEC, NULL);
1867 1867
1868 return err? nfserrno(err) : 0; 1868 return err? nfserrno(err) : 0;
1869 } 1869 }
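
The MAY_OWNER_OVERRIDE branch encodes the comment above: access types that would normally be checked once at open() are granted to the file owner outright, so a client that did fchmod(fd, 0) after opening keeps working. Reduced to its bones (field names here are illustrative, not nfsd's):

	#define MAY_READ		0x01
	#define MAY_WRITE		0x02
	#define MAY_OWNER_OVERRIDE	0x10

	struct creds { unsigned int fsuid; };
	struct nodeattr { unsigned int uid; };

	/* Returns nonzero when the mode-bit check can be skipped
	 * entirely: an open-time-style access by the file owner. */
	static int owner_override(const struct nodeattr *n,
				  const struct creds *c, int acc)
	{
		return (acc & MAY_OWNER_OVERRIDE) && n->uid == c->fsuid;
	}
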
1870 1870
1871 void 1871 void
1872 nfsd_racache_shutdown(void) 1872 nfsd_racache_shutdown(void)
1873 { 1873 {
1874 if (!raparml) 1874 if (!raparml)
1875 return; 1875 return;
1876 dprintk("nfsd: freeing readahead buffers.\n"); 1876 dprintk("nfsd: freeing readahead buffers.\n");
1877 kfree(raparml); 1877 kfree(raparml);
1878 raparml = NULL; 1878 raparml = NULL;
1879 } 1879 }
1880 /* 1880 /*
1881 * Initialize readahead param cache 1881 * Initialize readahead param cache
1882 */ 1882 */
1883 int 1883 int
1884 nfsd_racache_init(int cache_size) 1884 nfsd_racache_init(int cache_size)
1885 { 1885 {
1886 int i; 1886 int i;
1887 int j = 0; 1887 int j = 0;
1888 int nperbucket; 1888 int nperbucket;
1889 1889
1890 1890
1891 if (raparml) 1891 if (raparml)
1892 return 0; 1892 return 0;
1893 if (cache_size < 2*RAPARM_HASH_SIZE) 1893 if (cache_size < 2*RAPARM_HASH_SIZE)
1894 cache_size = 2*RAPARM_HASH_SIZE; 1894 cache_size = 2*RAPARM_HASH_SIZE;
1895 raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL); 1895 raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL);
1896 1896
1897 if (!raparml) { 1897 if (!raparml) {
1898 printk(KERN_WARNING 1898 printk(KERN_WARNING
1899 "nfsd: Could not allocate memory read-ahead cache.\n"); 1899 "nfsd: Could not allocate memory read-ahead cache.\n");
1900 return -ENOMEM; 1900 return -ENOMEM;
1901 } 1901 }
1902 1902
1903 dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); 1903 dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
1904 for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) { 1904 for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) {
1905 raparm_hash[i].pb_head = NULL; 1905 raparm_hash[i].pb_head = NULL;
1906 spin_lock_init(&raparm_hash[i].pb_lock); 1906 spin_lock_init(&raparm_hash[i].pb_lock);
1907 } 1907 }
1908 nperbucket = cache_size >> RAPARM_HASH_BITS; 1908 nperbucket = cache_size >> RAPARM_HASH_BITS;
1909 for (i = 0; i < cache_size - 1; i++) { 1909 for (i = 0; i < cache_size - 1; i++) {
1910 if (i % nperbucket == 0) 1910 if (i % nperbucket == 0)
1911 raparm_hash[j++].pb_head = raparml + i; 1911 raparm_hash[j++].pb_head = raparml + i;
1912 if (i % nperbucket < nperbucket-1) 1912 if (i % nperbucket < nperbucket-1)
1913 raparml[i].p_next = raparml + i + 1; 1913 raparml[i].p_next = raparml + i + 1;
1914 } 1914 }
1915 1915
1916 nfsdstats.ra_size = cache_size; 1916 nfsdstats.ra_size = cache_size;
1917 return 0; 1917 return 0;
1918 } 1918 }
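
The init loop above threads one flat kcalloc() array into RAPARM_HASH_SIZE singly linked chains: entry i opens a new bucket whenever i is a multiple of nperbucket, and links to entry i+1 while inside a bucket. A stand-alone sketch of that carving, assuming (as the kernel code effectively does) that n is at least 2 * HASH_SIZE:

	#define HASH_BITS	4
	#define HASH_SIZE	(1 << HASH_BITS)

	struct parm {
		struct parm *next;
	};

	static struct parm *heads[HASH_SIZE];

	/* Carve a flat pool of n entries into HASH_SIZE chains of
	 * roughly n >> HASH_BITS entries each. The bound check on j
	 * guards the case where n is not an exact multiple of the
	 * per-bucket count. */
	static void carve(struct parm *pool, int n)
	{
		int i, j = 0;
		int per = n >> HASH_BITS;

		for (i = 0; i < n - 1; i++) {
			if (i % per == 0 && j < HASH_SIZE)
				heads[j++] = &pool[i];	/* new bucket */
			if (i % per < per - 1)
				pool[i].next = &pool[i + 1];	/* chain */
		}
	}
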
1919 1919
1920 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) 1920 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
1921 struct posix_acl * 1921 struct posix_acl *
1922 nfsd_get_posix_acl(struct svc_fh *fhp, int type) 1922 nfsd_get_posix_acl(struct svc_fh *fhp, int type)
1923 { 1923 {
1924 struct inode *inode = fhp->fh_dentry->d_inode; 1924 struct inode *inode = fhp->fh_dentry->d_inode;
1925 char *name; 1925 char *name;
1926 void *value = NULL; 1926 void *value = NULL;
1927 ssize_t size; 1927 ssize_t size;
1928 struct posix_acl *acl; 1928 struct posix_acl *acl;
1929 1929
1930 if (!IS_POSIXACL(inode)) 1930 if (!IS_POSIXACL(inode))
1931 return ERR_PTR(-EOPNOTSUPP); 1931 return ERR_PTR(-EOPNOTSUPP);
1932 1932
1933 switch (type) { 1933 switch (type) {
1934 case ACL_TYPE_ACCESS: 1934 case ACL_TYPE_ACCESS:
1935 name = POSIX_ACL_XATTR_ACCESS; 1935 name = POSIX_ACL_XATTR_ACCESS;
1936 break; 1936 break;
1937 case ACL_TYPE_DEFAULT: 1937 case ACL_TYPE_DEFAULT:
1938 name = POSIX_ACL_XATTR_DEFAULT; 1938 name = POSIX_ACL_XATTR_DEFAULT;
1939 break; 1939 break;
1940 default: 1940 default:
1941 return ERR_PTR(-EOPNOTSUPP); 1941 return ERR_PTR(-EOPNOTSUPP);
1942 } 1942 }
1943 1943
1944 size = nfsd_getxattr(fhp->fh_dentry, name, &value); 1944 size = nfsd_getxattr(fhp->fh_dentry, name, &value);
1945 if (size < 0) 1945 if (size < 0)
1946 return ERR_PTR(size); 1946 return ERR_PTR(size);
1947 1947
1948 acl = posix_acl_from_xattr(value, size); 1948 acl = posix_acl_from_xattr(value, size);
1949 kfree(value); 1949 kfree(value);
1950 return acl; 1950 return acl;
1951 } 1951 }
1952 1952
1953 int 1953 int
1954 nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) 1954 nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
1955 { 1955 {
1956 struct inode *inode = fhp->fh_dentry->d_inode; 1956 struct inode *inode = fhp->fh_dentry->d_inode;
1957 char *name; 1957 char *name;
1958 void *value = NULL; 1958 void *value = NULL;
1959 size_t size; 1959 size_t size;
1960 int error; 1960 int error;
1961 1961
1962 if (!IS_POSIXACL(inode) || !inode->i_op || 1962 if (!IS_POSIXACL(inode) || !inode->i_op ||
1963 !inode->i_op->setxattr || !inode->i_op->removexattr) 1963 !inode->i_op->setxattr || !inode->i_op->removexattr)
1964 return -EOPNOTSUPP; 1964 return -EOPNOTSUPP;
1965 switch(type) { 1965 switch(type) {
1966 case ACL_TYPE_ACCESS: 1966 case ACL_TYPE_ACCESS:
1967 name = POSIX_ACL_XATTR_ACCESS; 1967 name = POSIX_ACL_XATTR_ACCESS;
1968 break; 1968 break;
1969 case ACL_TYPE_DEFAULT: 1969 case ACL_TYPE_DEFAULT:
1970 name = POSIX_ACL_XATTR_DEFAULT; 1970 name = POSIX_ACL_XATTR_DEFAULT;
1971 break; 1971 break;
1972 default: 1972 default:
1973 return -EOPNOTSUPP; 1973 return -EOPNOTSUPP;
1974 } 1974 }
1975 1975
1976 if (acl && acl->a_count) { 1976 if (acl && acl->a_count) {
1977 size = posix_acl_xattr_size(acl->a_count); 1977 size = posix_acl_xattr_size(acl->a_count);
1978 value = kmalloc(size, GFP_KERNEL); 1978 value = kmalloc(size, GFP_KERNEL);
1979 if (!value) 1979 if (!value)
1980 return -ENOMEM; 1980 return -ENOMEM;
1981 error = posix_acl_to_xattr(acl, value, size); 1981 error = posix_acl_to_xattr(acl, value, size);
1982 if (error < 0) 1982 if (error < 0)
1983 goto getout; 1983 goto getout;
1984 size = error; 1984 size = error;
1985 } else 1985 } else
1986 size = 0; 1986 size = 0;
1987 1987
1988 if (size) 1988 if (size)
1989 error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0); 1989 error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0);
1990 else { 1990 else {
1991 if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT) 1991 if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT)
1992 error = 0; 1992 error = 0;
1993 else { 1993 else {
1994 error = vfs_removexattr(fhp->fh_dentry, name); 1994 error = vfs_removexattr(fhp->fh_dentry, name);
1995 if (error == -ENODATA) 1995 if (error == -ENODATA)
1996 error = 0; 1996 error = 0;
1997 } 1997 }
1998 } 1998 }
1999 1999
2000 getout: 2000 getout:
2001 kfree(value); 2001 kfree(value);
2002 return error; 2002 return error;
2003 } 2003 }
2004 #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ 2004 #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
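
nfsd_set_posix_acl() treats an empty ACL as "remove the attribute" and treats removing an attribute that was never there (-ENODATA) as success, which makes the operation idempotent. The same set-or-remove idiom against the user-space xattr calls, as a Linux-specific sketch:

	#include <errno.h>
	#include <sys/xattr.h>

	/* Set an extended attribute, or remove it when the new value is
	 * empty; removal of something already absent counts as success,
	 * mirroring the ENODATA handling in nfsd_set_posix_acl(). */
	static int set_or_remove(const char *path, const char *name,
				 const void *value, size_t size)
	{
		if (size)
			return setxattr(path, name, value, size, 0);

		if (removexattr(path, name) < 0 && errno != ENODATA)
			return -1;
		return 0;
	}
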
2005 2005
fs/ocfs2/file.c
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * file.c 4 * file.c
5 * 5 *
6 * File open, close, extend, truncate 6 * File open, close, extend, truncate
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either 12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version. 13 * version 2 of the License, or (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, 15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details. 18 * General Public License for more details.
19 * 19 *
20 * You should have received a copy of the GNU General Public 20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the 21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26 #include <linux/capability.h> 26 #include <linux/capability.h>
27 #include <linux/fs.h> 27 #include <linux/fs.h>
28 #include <linux/types.h> 28 #include <linux/types.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/highmem.h> 30 #include <linux/highmem.h>
31 #include <linux/pagemap.h> 31 #include <linux/pagemap.h>
32 #include <linux/uio.h> 32 #include <linux/uio.h>
33 #include <linux/sched.h> 33 #include <linux/sched.h>
34 #include <linux/splice.h> 34 #include <linux/splice.h>
35 #include <linux/mount.h> 35 #include <linux/mount.h>
36 #include <linux/writeback.h> 36 #include <linux/writeback.h>
37 37
38 #define MLOG_MASK_PREFIX ML_INODE 38 #define MLOG_MASK_PREFIX ML_INODE
39 #include <cluster/masklog.h> 39 #include <cluster/masklog.h>
40 40
41 #include "ocfs2.h" 41 #include "ocfs2.h"
42 42
43 #include "alloc.h" 43 #include "alloc.h"
44 #include "aops.h" 44 #include "aops.h"
45 #include "dir.h" 45 #include "dir.h"
46 #include "dlmglue.h" 46 #include "dlmglue.h"
47 #include "extent_map.h" 47 #include "extent_map.h"
48 #include "file.h" 48 #include "file.h"
49 #include "sysfile.h" 49 #include "sysfile.h"
50 #include "inode.h" 50 #include "inode.h"
51 #include "ioctl.h" 51 #include "ioctl.h"
52 #include "journal.h" 52 #include "journal.h"
53 #include "mmap.h" 53 #include "mmap.h"
54 #include "suballoc.h" 54 #include "suballoc.h"
55 #include "super.h" 55 #include "super.h"
56 56
57 #include "buffer_head_io.h" 57 #include "buffer_head_io.h"
58 58
59 static int ocfs2_sync_inode(struct inode *inode) 59 static int ocfs2_sync_inode(struct inode *inode)
60 { 60 {
61 filemap_fdatawrite(inode->i_mapping); 61 filemap_fdatawrite(inode->i_mapping);
62 return sync_mapping_buffers(inode->i_mapping); 62 return sync_mapping_buffers(inode->i_mapping);
63 } 63 }
64 64
65 static int ocfs2_file_open(struct inode *inode, struct file *file) 65 static int ocfs2_file_open(struct inode *inode, struct file *file)
66 { 66 {
67 int status; 67 int status;
68 int mode = file->f_flags; 68 int mode = file->f_flags;
69 struct ocfs2_inode_info *oi = OCFS2_I(inode); 69 struct ocfs2_inode_info *oi = OCFS2_I(inode);
70 70
71 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 71 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
72 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 72 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
73 73
74 spin_lock(&oi->ip_lock); 74 spin_lock(&oi->ip_lock);
75 75
76 /* Check that the inode hasn't been wiped from disk by another 76 /* Check that the inode hasn't been wiped from disk by another
77 * node. If it hasn't then we're safe as long as we hold the 77 * node. If it hasn't then we're safe as long as we hold the
78 * spin lock until our increment of open count. */ 78 * spin lock until our increment of open count. */
79 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 79 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
80 spin_unlock(&oi->ip_lock); 80 spin_unlock(&oi->ip_lock);
81 81
82 status = -ENOENT; 82 status = -ENOENT;
83 goto leave; 83 goto leave;
84 } 84 }
85 85
86 if (mode & O_DIRECT) 86 if (mode & O_DIRECT)
87 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 87 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
88 88
89 oi->ip_open_count++; 89 oi->ip_open_count++;
90 spin_unlock(&oi->ip_lock); 90 spin_unlock(&oi->ip_lock);
91 status = 0; 91 status = 0;
92 leave: 92 leave:
93 mlog_exit(status); 93 mlog_exit(status);
94 return status; 94 return status;
95 } 95 }
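
Note how ocfs2_file_open() performs the OCFS2_INODE_DELETED check and the ip_open_count increment inside one spinlock section, so a concurrent wipe on another node cannot slip between the two. The shape of that pattern in isolation, as a user-space analogue with illustrative names:

	#include <pthread.h>

	struct obj {
		pthread_mutex_t lock;
		int deleted;
		int open_count;
	};

	/* Check-then-increment under a single critical section: if the
	 * check and the increment ran under separate lock acquisitions,
	 * a deletion could land in between and we would pin a dead
	 * object. */
	static int obj_open(struct obj *o)
	{
		int ret = 0;

		pthread_mutex_lock(&o->lock);
		if (o->deleted)
			ret = -1;	/* raced with delete: ENOENT */
		else
			o->open_count++;
		pthread_mutex_unlock(&o->lock);
		return ret;
	}
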
96 96
97 static int ocfs2_file_release(struct inode *inode, struct file *file) 97 static int ocfs2_file_release(struct inode *inode, struct file *file)
98 { 98 {
99 struct ocfs2_inode_info *oi = OCFS2_I(inode); 99 struct ocfs2_inode_info *oi = OCFS2_I(inode);
100 100
101 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 101 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
102 file->f_path.dentry->d_name.len, 102 file->f_path.dentry->d_name.len,
103 file->f_path.dentry->d_name.name); 103 file->f_path.dentry->d_name.name);
104 104
105 spin_lock(&oi->ip_lock); 105 spin_lock(&oi->ip_lock);
106 if (!--oi->ip_open_count) 106 if (!--oi->ip_open_count)
107 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 107 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
108 spin_unlock(&oi->ip_lock); 108 spin_unlock(&oi->ip_lock);
109 109
110 mlog_exit(0); 110 mlog_exit(0);
111 111
112 return 0; 112 return 0;
113 } 113 }
114 114
115 static int ocfs2_sync_file(struct file *file, 115 static int ocfs2_sync_file(struct file *file,
116 struct dentry *dentry, 116 struct dentry *dentry,
117 int datasync) 117 int datasync)
118 { 118 {
119 int err = 0; 119 int err = 0;
120 journal_t *journal; 120 journal_t *journal;
121 struct inode *inode = dentry->d_inode; 121 struct inode *inode = dentry->d_inode;
122 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 122 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
123 123
124 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 124 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
125 dentry->d_name.len, dentry->d_name.name); 125 dentry->d_name.len, dentry->d_name.name);
126 126
127 err = ocfs2_sync_inode(dentry->d_inode); 127 err = ocfs2_sync_inode(dentry->d_inode);
128 if (err) 128 if (err)
129 goto bail; 129 goto bail;
130 130
131 journal = osb->journal->j_journal; 131 journal = osb->journal->j_journal;
132 err = journal_force_commit(journal); 132 err = journal_force_commit(journal);
133 133
134 bail: 134 bail:
135 mlog_exit(err); 135 mlog_exit(err);
136 136
137 return (err < 0) ? -EIO : 0; 137 return (err < 0) ? -EIO : 0;
138 } 138 }
139 139
140 int ocfs2_should_update_atime(struct inode *inode, 140 int ocfs2_should_update_atime(struct inode *inode,
141 struct vfsmount *vfsmnt) 141 struct vfsmount *vfsmnt)
142 { 142 {
143 struct timespec now; 143 struct timespec now;
144 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 144 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
145 145
146 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 146 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
147 return 0; 147 return 0;
148 148
149 if ((inode->i_flags & S_NOATIME) || 149 if ((inode->i_flags & S_NOATIME) ||
150 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) 150 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
151 return 0; 151 return 0;
152 152
153 /* 153 /*
154 * We can be called with no vfsmnt structure - NFSD will 154 * We can be called with no vfsmnt structure - NFSD will
155 * sometimes do this. 155 * sometimes do this.
156 * 156 *
157 * Note that our action here is different from touch_atime() - 157 * Note that our action here is different from touch_atime() -
158 * if we can't tell whether this is a noatime mount, then we 158 * if we can't tell whether this is a noatime mount, then we
159 * don't know whether to trust the value of s_atime_quantum. 159 * don't know whether to trust the value of s_atime_quantum.
160 */ 160 */
161 if (vfsmnt == NULL) 161 if (vfsmnt == NULL)
162 return 0; 162 return 0;
163 163
164 if ((vfsmnt->mnt_flags & MNT_NOATIME) || 164 if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
165 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) 165 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
166 return 0; 166 return 0;
167 167
168 if (vfsmnt->mnt_flags & MNT_RELATIME) { 168 if (vfsmnt->mnt_flags & MNT_RELATIME) {
169 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || 169 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
170 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) 170 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
171 return 1; 171 return 1;
172 172
173 return 0; 173 return 0;
174 } 174 }
175 175
176 now = CURRENT_TIME; 176 now = CURRENT_TIME;
177 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) 177 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
178 return 0; 178 return 0;
179 else 179 else
180 return 1; 180 return 1;
181 } 181 }
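
ocfs2_should_update_atime() folds two policies together: under MNT_RELATIME, update atime only when it has fallen behind mtime or ctime; otherwise rate-limit on-disk atime writes to one per s_atime_quantum seconds. The decision on its own, as a seconds-only sketch that ignores the nanosecond field the kernel also compares:

	#include <time.h>

	/* Returns nonzero when an atime update is worth a disk
	 * transaction: relatime updates only when atime has fallen
	 * behind mtime/ctime; otherwise updates are spaced at least
	 * 'quantum' seconds apart. */
	static int should_update_atime(time_t atime, time_t mtime,
				       time_t ctim, int relatime,
				       time_t quantum)
	{
		if (relatime)
			return atime <= mtime || atime <= ctim;

		return (time(NULL) - atime) > quantum;
	}
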
182 182
183 int ocfs2_update_inode_atime(struct inode *inode, 183 int ocfs2_update_inode_atime(struct inode *inode,
184 struct buffer_head *bh) 184 struct buffer_head *bh)
185 { 185 {
186 int ret; 186 int ret;
187 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 187 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
188 handle_t *handle; 188 handle_t *handle;
189 189
190 mlog_entry_void(); 190 mlog_entry_void();
191 191
192 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 192 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
193 if (handle == NULL) { 193 if (handle == NULL) {
194 ret = -ENOMEM; 194 ret = -ENOMEM;
195 mlog_errno(ret); 195 mlog_errno(ret);
196 goto out; 196 goto out;
197 } 197 }
198 198
199 inode->i_atime = CURRENT_TIME; 199 inode->i_atime = CURRENT_TIME;
200 ret = ocfs2_mark_inode_dirty(handle, inode, bh); 200 ret = ocfs2_mark_inode_dirty(handle, inode, bh);
201 if (ret < 0) 201 if (ret < 0)
202 mlog_errno(ret); 202 mlog_errno(ret);
203 203
204 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 204 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
205 out: 205 out:
206 mlog_exit(ret); 206 mlog_exit(ret);
207 return ret; 207 return ret;
208 } 208 }
209 209
210 static int ocfs2_set_inode_size(handle_t *handle, 210 static int ocfs2_set_inode_size(handle_t *handle,
211 struct inode *inode, 211 struct inode *inode,
212 struct buffer_head *fe_bh, 212 struct buffer_head *fe_bh,
213 u64 new_i_size) 213 u64 new_i_size)
214 { 214 {
215 int status; 215 int status;
216 216
217 mlog_entry_void(); 217 mlog_entry_void();
218 i_size_write(inode, new_i_size); 218 i_size_write(inode, new_i_size);
219 inode->i_blocks = ocfs2_inode_sector_count(inode); 219 inode->i_blocks = ocfs2_inode_sector_count(inode);
220 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 220 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
221 221
222 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 222 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
223 if (status < 0) { 223 if (status < 0) {
224 mlog_errno(status); 224 mlog_errno(status);
225 goto bail; 225 goto bail;
226 } 226 }
227 227
228 bail: 228 bail:
229 mlog_exit(status); 229 mlog_exit(status);
230 return status; 230 return status;
231 } 231 }
232 232
233 static int ocfs2_simple_size_update(struct inode *inode, 233 static int ocfs2_simple_size_update(struct inode *inode,
234 struct buffer_head *di_bh, 234 struct buffer_head *di_bh,
235 u64 new_i_size) 235 u64 new_i_size)
236 { 236 {
237 int ret; 237 int ret;
238 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 238 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
239 handle_t *handle = NULL; 239 handle_t *handle = NULL;
240 240
241 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 241 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
242 if (handle == NULL) { 242 if (handle == NULL) {
243 ret = -ENOMEM; 243 ret = -ENOMEM;
244 mlog_errno(ret); 244 mlog_errno(ret);
245 goto out; 245 goto out;
246 } 246 }
247 247
248 ret = ocfs2_set_inode_size(handle, inode, di_bh, 248 ret = ocfs2_set_inode_size(handle, inode, di_bh,
249 new_i_size); 249 new_i_size);
250 if (ret < 0) 250 if (ret < 0)
251 mlog_errno(ret); 251 mlog_errno(ret);
252 252
253 ocfs2_commit_trans(osb, handle); 253 ocfs2_commit_trans(osb, handle);
254 out: 254 out:
255 return ret; 255 return ret;
256 } 256 }
257 257
258 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 258 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
259 struct inode *inode, 259 struct inode *inode,
260 struct buffer_head *fe_bh, 260 struct buffer_head *fe_bh,
261 u64 new_i_size) 261 u64 new_i_size)
262 { 262 {
263 int status; 263 int status;
264 handle_t *handle; 264 handle_t *handle;
265 struct ocfs2_dinode *di; 265 struct ocfs2_dinode *di;
266 266
267 mlog_entry_void(); 267 mlog_entry_void();
268 268
269 /* TODO: This needs to actually orphan the inode in this 269 /* TODO: This needs to actually orphan the inode in this
270 * transaction. */ 270 * transaction. */
271 271
272 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 272 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
273 if (IS_ERR(handle)) { 273 if (IS_ERR(handle)) {
274 status = PTR_ERR(handle); 274 status = PTR_ERR(handle);
275 mlog_errno(status); 275 mlog_errno(status);
276 goto out; 276 goto out;
277 } 277 }
278 278
279 status = ocfs2_journal_access(handle, inode, fe_bh, 279 status = ocfs2_journal_access(handle, inode, fe_bh,
280 OCFS2_JOURNAL_ACCESS_WRITE); 280 OCFS2_JOURNAL_ACCESS_WRITE);
281 if (status < 0) { 281 if (status < 0) {
282 mlog_errno(status); 282 mlog_errno(status);
283 goto out_commit; 283 goto out_commit;
284 } 284 }
285 285
286 /* 286 /*
287 * Do this before setting i_size. 287 * Do this before setting i_size.
288 */ 288 */
289 status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); 289 status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
290 if (status) { 290 if (status) {
291 mlog_errno(status); 291 mlog_errno(status);
292 goto out_commit; 292 goto out_commit;
293 } 293 }
294 294
295 i_size_write(inode, new_i_size); 295 i_size_write(inode, new_i_size);
296 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); 296 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
297 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 297 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
298 298
299 di = (struct ocfs2_dinode *) fe_bh->b_data; 299 di = (struct ocfs2_dinode *) fe_bh->b_data;
300 di->i_size = cpu_to_le64(new_i_size); 300 di->i_size = cpu_to_le64(new_i_size);
301 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 301 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
302 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 302 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
303 303
304 status = ocfs2_journal_dirty(handle, fe_bh); 304 status = ocfs2_journal_dirty(handle, fe_bh);
305 if (status < 0) 305 if (status < 0)
306 mlog_errno(status); 306 mlog_errno(status);
307 307
308 out_commit: 308 out_commit:
309 ocfs2_commit_trans(osb, handle); 309 ocfs2_commit_trans(osb, handle);
310 out: 310 out:
311 311
312 mlog_exit(status); 312 mlog_exit(status);
313 return status; 313 return status;
314 } 314 }
315 315
316 static int ocfs2_truncate_file(struct inode *inode, 316 static int ocfs2_truncate_file(struct inode *inode,
317 struct buffer_head *di_bh, 317 struct buffer_head *di_bh,
318 u64 new_i_size) 318 u64 new_i_size)
319 { 319 {
320 int status = 0; 320 int status = 0;
321 struct ocfs2_dinode *fe = NULL; 321 struct ocfs2_dinode *fe = NULL;
322 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 322 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
323 struct ocfs2_truncate_context *tc = NULL; 323 struct ocfs2_truncate_context *tc = NULL;
324 324
325 mlog_entry("(inode = %llu, new_i_size = %llu\n", 325 mlog_entry("(inode = %llu, new_i_size = %llu\n",
326 (unsigned long long)OCFS2_I(inode)->ip_blkno, 326 (unsigned long long)OCFS2_I(inode)->ip_blkno,
327 (unsigned long long)new_i_size); 327 (unsigned long long)new_i_size);
328 328
329 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 329 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
330 truncate_inode_pages(inode->i_mapping, new_i_size); 330 truncate_inode_pages(inode->i_mapping, new_i_size);
331 331
332 fe = (struct ocfs2_dinode *) di_bh->b_data; 332 fe = (struct ocfs2_dinode *) di_bh->b_data;
333 if (!OCFS2_IS_VALID_DINODE(fe)) { 333 if (!OCFS2_IS_VALID_DINODE(fe)) {
334 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 334 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
335 status = -EIO; 335 status = -EIO;
336 goto bail; 336 goto bail;
337 } 337 }
338 338
339 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 339 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
340 "Inode %llu, inode i_size = %lld != di " 340 "Inode %llu, inode i_size = %lld != di "
341 "i_size = %llu, i_flags = 0x%x\n", 341 "i_size = %llu, i_flags = 0x%x\n",
342 (unsigned long long)OCFS2_I(inode)->ip_blkno, 342 (unsigned long long)OCFS2_I(inode)->ip_blkno,
343 i_size_read(inode), 343 i_size_read(inode),
344 (unsigned long long)le64_to_cpu(fe->i_size), 344 (unsigned long long)le64_to_cpu(fe->i_size),
345 le32_to_cpu(fe->i_flags)); 345 le32_to_cpu(fe->i_flags));
346 346
347 if (new_i_size > le64_to_cpu(fe->i_size)) { 347 if (new_i_size > le64_to_cpu(fe->i_size)) {
348 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", 348 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
349 (unsigned long long)le64_to_cpu(fe->i_size), 349 (unsigned long long)le64_to_cpu(fe->i_size),
350 (unsigned long long)new_i_size); 350 (unsigned long long)new_i_size);
351 status = -EINVAL; 351 status = -EINVAL;
352 mlog_errno(status); 352 mlog_errno(status);
353 goto bail; 353 goto bail;
354 } 354 }
355 355
356 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", 356 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
357 (unsigned long long)le64_to_cpu(fe->i_blkno), 357 (unsigned long long)le64_to_cpu(fe->i_blkno),
358 (unsigned long long)le64_to_cpu(fe->i_size), 358 (unsigned long long)le64_to_cpu(fe->i_size),
359 (unsigned long long)new_i_size); 359 (unsigned long long)new_i_size);
360 360
361 /* let's handle the simple truncate cases before doing any more 361 /* let's handle the simple truncate cases before doing any more
362 * cluster locking. */ 362 * cluster locking. */
363 if (new_i_size == le64_to_cpu(fe->i_size)) 363 if (new_i_size == le64_to_cpu(fe->i_size))
364 goto bail; 364 goto bail;
365 365
366 /* This forces other nodes to sync and drop their pages. Do 366 /* This forces other nodes to sync and drop their pages. Do
367 * this even if we have a truncate without allocation change - 367 * this even if we have a truncate without allocation change -
368 * ocfs2 cluster sizes can be much greater than page size, so 368 * ocfs2 cluster sizes can be much greater than page size, so
369 * we have to truncate them anyway. */ 369 * we have to truncate them anyway. */
370 status = ocfs2_data_lock(inode, 1); 370 status = ocfs2_data_lock(inode, 1);
371 if (status < 0) { 371 if (status < 0) {
372 mlog_errno(status); 372 mlog_errno(status);
373 goto bail; 373 goto bail;
374 } 374 }
375 375
376 /* alright, we're going to need to do a full blown alloc size 376 /* alright, we're going to need to do a full blown alloc size
377 * change. Orphan the inode so that recovery can complete the 377 * change. Orphan the inode so that recovery can complete the
378 * truncate if necessary. This does the task of marking 378 * truncate if necessary. This does the task of marking
379 * i_size. */ 379 * i_size. */
380 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 380 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
381 if (status < 0) { 381 if (status < 0) {
382 mlog_errno(status); 382 mlog_errno(status);
383 goto bail_unlock_data; 383 goto bail_unlock_data;
384 } 384 }
385 385
386 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 386 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
387 if (status < 0) { 387 if (status < 0) {
388 mlog_errno(status); 388 mlog_errno(status);
389 goto bail_unlock_data; 389 goto bail_unlock_data;
390 } 390 }
391 391
392 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 392 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
393 if (status < 0) { 393 if (status < 0) {
394 mlog_errno(status); 394 mlog_errno(status);
395 goto bail_unlock_data; 395 goto bail_unlock_data;
396 } 396 }
397 397
398 /* TODO: orphan dir cleanup here. */ 398 /* TODO: orphan dir cleanup here. */
399 bail_unlock_data: 399 bail_unlock_data:
400 ocfs2_data_unlock(inode, 1); 400 ocfs2_data_unlock(inode, 1);
401 401
402 bail: 402 bail:
403 403
404 mlog_exit(status); 404 mlog_exit(status);
405 return status; 405 return status;
406 } 406 }
407 407
408 /* 408 /*
409 * extend allocation only here. 409 * extend allocation only here.
410 * we'll update all the disk stuff, and oip->alloc_size 410 * we'll update all the disk stuff, and oip->alloc_size
411 * 411 *
412 * expect stuff to be locked, a transaction started and enough data / 412 * expect stuff to be locked, a transaction started and enough data /
413 * metadata reservations in the contexts. 413 * metadata reservations in the contexts.
414 * 414 *
415 * Will return -EAGAIN, and a reason if a restart is needed. 415 * Will return -EAGAIN, and a reason if a restart is needed.
416 * If passed in, *reason will always be set, even in error. 416 * If passed in, *reason will always be set, even in error.
417 */ 417 */
418 int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 418 int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
419 struct inode *inode, 419 struct inode *inode,
420 u32 *logical_offset, 420 u32 *logical_offset,
421 u32 clusters_to_add, 421 u32 clusters_to_add,
422 struct buffer_head *fe_bh, 422 struct buffer_head *fe_bh,
423 handle_t *handle, 423 handle_t *handle,
424 struct ocfs2_alloc_context *data_ac, 424 struct ocfs2_alloc_context *data_ac,
425 struct ocfs2_alloc_context *meta_ac, 425 struct ocfs2_alloc_context *meta_ac,
426 enum ocfs2_alloc_restarted *reason_ret) 426 enum ocfs2_alloc_restarted *reason_ret)
427 { 427 {
428 int status = 0; 428 int status = 0;
429 int free_extents; 429 int free_extents;
430 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 430 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
431 enum ocfs2_alloc_restarted reason = RESTART_NONE; 431 enum ocfs2_alloc_restarted reason = RESTART_NONE;
432 u32 bit_off, num_bits; 432 u32 bit_off, num_bits;
433 u64 block; 433 u64 block;
434 434
435 BUG_ON(!clusters_to_add); 435 BUG_ON(!clusters_to_add);
436 436
437 free_extents = ocfs2_num_free_extents(osb, inode, fe); 437 free_extents = ocfs2_num_free_extents(osb, inode, fe);
438 if (free_extents < 0) { 438 if (free_extents < 0) {
439 status = free_extents; 439 status = free_extents;
440 mlog_errno(status); 440 mlog_errno(status);
441 goto leave; 441 goto leave;
442 } 442 }
443 443
444 /* there are two cases which could cause us to EAGAIN in the 444 /* there are two cases which could cause us to EAGAIN in the
445 * we-need-more-metadata case: 445 * we-need-more-metadata case:
446 * 1) we haven't reserved *any* 446 * 1) we haven't reserved *any*
447 * 2) we are so fragmented, we've needed to add metadata too 447 * 2) we are so fragmented, we've needed to add metadata too
448 * many times. */ 448 * many times. */
449 if (!free_extents && !meta_ac) { 449 if (!free_extents && !meta_ac) {
450 mlog(0, "we haven't reserved any metadata!\n"); 450 mlog(0, "we haven't reserved any metadata!\n");
451 status = -EAGAIN; 451 status = -EAGAIN;
452 reason = RESTART_META; 452 reason = RESTART_META;
453 goto leave; 453 goto leave;
454 } else if ((!free_extents) 454 } else if ((!free_extents)
455 && (ocfs2_alloc_context_bits_left(meta_ac) 455 && (ocfs2_alloc_context_bits_left(meta_ac)
456 < ocfs2_extend_meta_needed(fe))) { 456 < ocfs2_extend_meta_needed(fe))) {
457 mlog(0, "filesystem is really fragmented...\n"); 457 mlog(0, "filesystem is really fragmented...\n");
458 status = -EAGAIN; 458 status = -EAGAIN;
459 reason = RESTART_META; 459 reason = RESTART_META;
460 goto leave; 460 goto leave;
461 } 461 }
462 462
463 status = ocfs2_claim_clusters(osb, handle, data_ac, 1, 463 status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
464 &bit_off, &num_bits); 464 &bit_off, &num_bits);
465 if (status < 0) { 465 if (status < 0) {
466 if (status != -ENOSPC) 466 if (status != -ENOSPC)
467 mlog_errno(status); 467 mlog_errno(status);
468 goto leave; 468 goto leave;
469 } 469 }
470 470
471 BUG_ON(num_bits > clusters_to_add); 471 BUG_ON(num_bits > clusters_to_add);
472 472
473 /* reserve our write early -- insert_extent may update the inode */ 473 /* reserve our write early -- insert_extent may update the inode */
474 status = ocfs2_journal_access(handle, inode, fe_bh, 474 status = ocfs2_journal_access(handle, inode, fe_bh,
475 OCFS2_JOURNAL_ACCESS_WRITE); 475 OCFS2_JOURNAL_ACCESS_WRITE);
476 if (status < 0) { 476 if (status < 0) {
477 mlog_errno(status); 477 mlog_errno(status);
478 goto leave; 478 goto leave;
479 } 479 }
480 480
481 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 481 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
482 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 482 mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
483 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 483 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
484 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, 484 status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
485 *logical_offset, block, num_bits, 485 *logical_offset, block, num_bits,
486 meta_ac); 486 meta_ac);
487 if (status < 0) { 487 if (status < 0) {
488 mlog_errno(status); 488 mlog_errno(status);
489 goto leave; 489 goto leave;
490 } 490 }
491 491
492 status = ocfs2_journal_dirty(handle, fe_bh); 492 status = ocfs2_journal_dirty(handle, fe_bh);
493 if (status < 0) { 493 if (status < 0) {
494 mlog_errno(status); 494 mlog_errno(status);
495 goto leave; 495 goto leave;
496 } 496 }
497 497
498 clusters_to_add -= num_bits; 498 clusters_to_add -= num_bits;
499 *logical_offset += num_bits; 499 *logical_offset += num_bits;
500 500
501 if (clusters_to_add) { 501 if (clusters_to_add) {
502 mlog(0, "need to alloc once more, clusters = %u, wanted = " 502 mlog(0, "need to alloc once more, clusters = %u, wanted = "
503 "%u\n", fe->i_clusters, clusters_to_add); 503 "%u\n", fe->i_clusters, clusters_to_add);
504 status = -EAGAIN; 504 status = -EAGAIN;
505 reason = RESTART_TRANS; 505 reason = RESTART_TRANS;
506 } 506 }
507 507
508 leave: 508 leave:
509 mlog_exit(status); 509 mlog_exit(status);
510 if (reason_ret) 510 if (reason_ret)
511 *reason_ret = reason; 511 *reason_ret = reason;
512 return status; 512 return status;
513 } 513 }
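
ocfs2_do_extend_allocation() deliberately allocates only what fits in the current transaction and reservation, returning -EAGAIN plus a reason (RESTART_META or RESTART_TRANS) so the caller can restock and call again. The caller-side shape of that protocol as a self-contained toy; the four-cluster cap and the names are invented for illustration:

	#include <errno.h>
	#include <stdio.h>

	enum restart { RESTART_NONE, RESTART_TRANS };

	/* Toy allocator: grants at most 4 "clusters" per call and
	 * signals -EAGAIN/RESTART_TRANS while work remains, mimicking
	 * the protocol used by ocfs2_do_extend_allocation(). */
	static int do_extend(unsigned int *added, unsigned int want,
			     enum restart *why)
	{
		unsigned int got = want > 4 ? 4 : want;

		*added += got;
		*why = (got < want) ? RESTART_TRANS : RESTART_NONE;
		return (got < want) ? -EAGAIN : 0;
	}

	int main(void)
	{
		unsigned int added = 0, want = 10;
		enum restart why;
		int ret;

		do {
			ret = do_extend(&added, want - added, &why);
			if (ret == -EAGAIN && why == RESTART_TRANS)
				printf("restart transaction at %u clusters\n",
				       added);
		} while (ret == -EAGAIN);

		printf("done: %u clusters\n", added);
		return 0;
	}
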
514 514
515 /* 515 /*
516 * For a given allocation, determine which allocators will need to be 516 * For a given allocation, determine which allocators will need to be
517 * accessed, and lock them, reserving the appropriate number of bits. 517 * accessed, and lock them, reserving the appropriate number of bits.
518 * 518 *
519 * Called from ocfs2_extend_allocation() for file systems which don't 519 * Called from ocfs2_extend_allocation() for file systems which don't
520 * support holes, and from ocfs2_write() for file systems which 520 * support holes, and from ocfs2_write() for file systems which
521 * understand sparse inodes. 521 * understand sparse inodes.
522 */ 522 */
523 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 523 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
524 u32 clusters_to_add, 524 u32 clusters_to_add,
525 struct ocfs2_alloc_context **data_ac, 525 struct ocfs2_alloc_context **data_ac,
526 struct ocfs2_alloc_context **meta_ac) 526 struct ocfs2_alloc_context **meta_ac)
527 { 527 {
528 int ret, num_free_extents; 528 int ret, num_free_extents;
529 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 529 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
530 530
531 *meta_ac = NULL; 531 *meta_ac = NULL;
532 *data_ac = NULL; 532 *data_ac = NULL;
533 533
534 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " 534 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
535 "clusters_to_add = %u\n", 535 "clusters_to_add = %u\n",
536 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 536 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
537 le32_to_cpu(di->i_clusters), clusters_to_add); 537 le32_to_cpu(di->i_clusters), clusters_to_add);
538 538
539 num_free_extents = ocfs2_num_free_extents(osb, inode, di); 539 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
540 if (num_free_extents < 0) { 540 if (num_free_extents < 0) {
541 ret = num_free_extents; 541 ret = num_free_extents;
542 mlog_errno(ret); 542 mlog_errno(ret);
543 goto out; 543 goto out;
544 } 544 }
545 545
546 /* 546 /*
547 * Sparse allocation file systems need to be more conservative 547 * Sparse allocation file systems need to be more conservative
548 * with reserving room for expansion - the actual allocation 548 * with reserving room for expansion - the actual allocation
549 * happens while we've got a journal handle open so re-taking 549 * happens while we've got a journal handle open so re-taking
550 * a cluster lock (because we ran out of room for another 550 * a cluster lock (because we ran out of room for another
551 * extent) will violate ordering rules. 551 * extent) will violate ordering rules.
552 * 552 *
553 * Most of the time we'll only be seeing this 1 cluster at a time 553 * Most of the time we'll only be seeing this 1 cluster at a time
554 * anyway. 554 * anyway.
555 */ 555 */
556 if (!num_free_extents || 556 if (!num_free_extents ||
557 (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { 557 (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
558 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); 558 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
559 if (ret < 0) { 559 if (ret < 0) {
560 if (ret != -ENOSPC) 560 if (ret != -ENOSPC)
561 mlog_errno(ret); 561 mlog_errno(ret);
562 goto out; 562 goto out;
563 } 563 }
564 } 564 }
565 565
566 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); 566 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
567 if (ret < 0) { 567 if (ret < 0) {
568 if (ret != -ENOSPC) 568 if (ret != -ENOSPC)
569 mlog_errno(ret); 569 mlog_errno(ret);
570 goto out; 570 goto out;
571 } 571 }
572 572
573 out: 573 out:
574 if (ret) { 574 if (ret) {
575 if (*meta_ac) { 575 if (*meta_ac) {
576 ocfs2_free_alloc_context(*meta_ac); 576 ocfs2_free_alloc_context(*meta_ac);
577 *meta_ac = NULL; 577 *meta_ac = NULL;
578 } 578 }
579 579
580 /* 580 /*
581 * We cannot have an error and a non-null *data_ac. 581 * We cannot have an error and a non-null *data_ac.
582 */ 582 */
583 } 583 }
584 584
585 return ret; 585 return ret;
586 } 586 }
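
Note the cleanup contract on the error path above: ocfs2_lock_allocators() frees any context it managed to reserve before failing and leaves both out-pointers NULL, so callers never clean up after a failed call. A minimal caller sketch (illustrative only, mirroring how ocfs2_extend_allocation() below consumes it):

    struct ocfs2_alloc_context *data_ac, *meta_ac;
    int ret;

    ret = ocfs2_lock_allocators(inode, di, clusters_to_add,
                                &data_ac, &meta_ac);
    if (ret < 0)
            return ret;                     /* nothing to free on failure */

    /* ... allocate under a journal handle ... */

    if (data_ac)
            ocfs2_free_alloc_context(data_ac);
    if (meta_ac)
            ocfs2_free_alloc_context(meta_ac);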
587 587
588 static int ocfs2_extend_allocation(struct inode *inode, 588 static int ocfs2_extend_allocation(struct inode *inode,
589 u32 clusters_to_add) 589 u32 clusters_to_add)
590 { 590 {
591 int status = 0; 591 int status = 0;
592 int restart_func = 0; 592 int restart_func = 0;
593 int drop_alloc_sem = 0; 593 int drop_alloc_sem = 0;
594 int credits; 594 int credits;
595 u32 prev_clusters, logical_start; 595 u32 prev_clusters, logical_start;
596 struct buffer_head *bh = NULL; 596 struct buffer_head *bh = NULL;
597 struct ocfs2_dinode *fe = NULL; 597 struct ocfs2_dinode *fe = NULL;
598 handle_t *handle = NULL; 598 handle_t *handle = NULL;
599 struct ocfs2_alloc_context *data_ac = NULL; 599 struct ocfs2_alloc_context *data_ac = NULL;
600 struct ocfs2_alloc_context *meta_ac = NULL; 600 struct ocfs2_alloc_context *meta_ac = NULL;
601 enum ocfs2_alloc_restarted why; 601 enum ocfs2_alloc_restarted why;
602 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 602 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
603 603
604 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 604 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
605 605
606 /* 606 /*
607 * This function only exists for file systems which don't 607 * This function only exists for file systems which don't
608 * support holes. 608 * support holes.
609 */ 609 */
610 BUG_ON(ocfs2_sparse_alloc(osb)); 610 BUG_ON(ocfs2_sparse_alloc(osb));
611 611
612 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 612 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
613 OCFS2_BH_CACHED, inode); 613 OCFS2_BH_CACHED, inode);
614 if (status < 0) { 614 if (status < 0) {
615 mlog_errno(status); 615 mlog_errno(status);
616 goto leave; 616 goto leave;
617 } 617 }
618 618
619 fe = (struct ocfs2_dinode *) bh->b_data; 619 fe = (struct ocfs2_dinode *) bh->b_data;
620 if (!OCFS2_IS_VALID_DINODE(fe)) { 620 if (!OCFS2_IS_VALID_DINODE(fe)) {
621 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 621 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
622 status = -EIO; 622 status = -EIO;
623 goto leave; 623 goto leave;
624 } 624 }
625 625
626 logical_start = OCFS2_I(inode)->ip_clusters; 626 logical_start = OCFS2_I(inode)->ip_clusters;
627 627
628 restart_all: 628 restart_all:
629 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 629 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
630 630
631 /* blocks people in read/write from reading our allocation 631 /* blocks people in read/write from reading our allocation
632 * until we're done changing it. We depend on i_mutex to block 632 * until we're done changing it. We depend on i_mutex to block
633 * other extend/truncate calls while we're here. Ordering wrt 633 * other extend/truncate calls while we're here. Ordering wrt
634 * start_trans is important here -- always do it before! */ 634 * start_trans is important here -- always do it before! */
635 down_write(&OCFS2_I(inode)->ip_alloc_sem); 635 down_write(&OCFS2_I(inode)->ip_alloc_sem);
636 drop_alloc_sem = 1; 636 drop_alloc_sem = 1;
637 637
638 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac, 638 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
639 &meta_ac); 639 &meta_ac);
640 if (status) { 640 if (status) {
641 mlog_errno(status); 641 mlog_errno(status);
642 goto leave; 642 goto leave;
643 } 643 }
644 644
645 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 645 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
646 handle = ocfs2_start_trans(osb, credits); 646 handle = ocfs2_start_trans(osb, credits);
647 if (IS_ERR(handle)) { 647 if (IS_ERR(handle)) {
648 status = PTR_ERR(handle); 648 status = PTR_ERR(handle);
649 handle = NULL; 649 handle = NULL;
650 mlog_errno(status); 650 mlog_errno(status);
651 goto leave; 651 goto leave;
652 } 652 }
653 653
654 restarted_transaction: 654 restarted_transaction:
655 /* reserve a write to the file entry early on - that way if we 655 /* reserve a write to the file entry early on - that way if we
656 * run out of credits in the allocation path, we can still 656 * run out of credits in the allocation path, we can still
657 * update i_size. */ 657 * update i_size. */
658 status = ocfs2_journal_access(handle, inode, bh, 658 status = ocfs2_journal_access(handle, inode, bh,
659 OCFS2_JOURNAL_ACCESS_WRITE); 659 OCFS2_JOURNAL_ACCESS_WRITE);
660 if (status < 0) { 660 if (status < 0) {
661 mlog_errno(status); 661 mlog_errno(status);
662 goto leave; 662 goto leave;
663 } 663 }
664 664
665 prev_clusters = OCFS2_I(inode)->ip_clusters; 665 prev_clusters = OCFS2_I(inode)->ip_clusters;
666 666
667 status = ocfs2_do_extend_allocation(osb, 667 status = ocfs2_do_extend_allocation(osb,
668 inode, 668 inode,
669 &logical_start, 669 &logical_start,
670 clusters_to_add, 670 clusters_to_add,
671 bh, 671 bh,
672 handle, 672 handle,
673 data_ac, 673 data_ac,
674 meta_ac, 674 meta_ac,
675 &why); 675 &why);
676 if ((status < 0) && (status != -EAGAIN)) { 676 if ((status < 0) && (status != -EAGAIN)) {
677 if (status != -ENOSPC) 677 if (status != -ENOSPC)
678 mlog_errno(status); 678 mlog_errno(status);
679 goto leave; 679 goto leave;
680 } 680 }
681 681
682 status = ocfs2_journal_dirty(handle, bh); 682 status = ocfs2_journal_dirty(handle, bh);
683 if (status < 0) { 683 if (status < 0) {
684 mlog_errno(status); 684 mlog_errno(status);
685 goto leave; 685 goto leave;
686 } 686 }
687 687
688 spin_lock(&OCFS2_I(inode)->ip_lock); 688 spin_lock(&OCFS2_I(inode)->ip_lock);
689 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 689 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
690 spin_unlock(&OCFS2_I(inode)->ip_lock); 690 spin_unlock(&OCFS2_I(inode)->ip_lock);
691 691
692 if (why != RESTART_NONE && clusters_to_add) { 692 if (why != RESTART_NONE && clusters_to_add) {
693 if (why == RESTART_META) { 693 if (why == RESTART_META) {
694 mlog(0, "restarting function.\n"); 694 mlog(0, "restarting function.\n");
695 restart_func = 1; 695 restart_func = 1;
696 } else { 696 } else {
697 BUG_ON(why != RESTART_TRANS); 697 BUG_ON(why != RESTART_TRANS);
698 698
699 mlog(0, "restarting transaction.\n"); 699 mlog(0, "restarting transaction.\n");
700 /* TODO: This can be more intelligent. */ 700 /* TODO: This can be more intelligent. */
701 credits = ocfs2_calc_extend_credits(osb->sb, 701 credits = ocfs2_calc_extend_credits(osb->sb,
702 fe, 702 fe,
703 clusters_to_add); 703 clusters_to_add);
704 status = ocfs2_extend_trans(handle, credits); 704 status = ocfs2_extend_trans(handle, credits);
705 if (status < 0) { 705 if (status < 0) {
706 /* handle still has to be committed at 706 /* handle still has to be committed at
707 * this point. */ 707 * this point. */
708 status = -ENOMEM; 708 status = -ENOMEM;
709 mlog_errno(status); 709 mlog_errno(status);
710 goto leave; 710 goto leave;
711 } 711 }
712 goto restarted_transaction; 712 goto restarted_transaction;
713 } 713 }
714 } 714 }
715 715
716 mlog(0, "fe: i_clusters = %u, i_size=%llu\n", 716 mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
717 le32_to_cpu(fe->i_clusters), 717 le32_to_cpu(fe->i_clusters),
718 (unsigned long long)le64_to_cpu(fe->i_size)); 718 (unsigned long long)le64_to_cpu(fe->i_size));
719 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 719 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
720 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 720 OCFS2_I(inode)->ip_clusters, i_size_read(inode));
721 721
722 leave: 722 leave:
723 if (drop_alloc_sem) { 723 if (drop_alloc_sem) {
724 up_write(&OCFS2_I(inode)->ip_alloc_sem); 724 up_write(&OCFS2_I(inode)->ip_alloc_sem);
725 drop_alloc_sem = 0; 725 drop_alloc_sem = 0;
726 } 726 }
727 if (handle) { 727 if (handle) {
728 ocfs2_commit_trans(osb, handle); 728 ocfs2_commit_trans(osb, handle);
729 handle = NULL; 729 handle = NULL;
730 } 730 }
731 if (data_ac) { 731 if (data_ac) {
732 ocfs2_free_alloc_context(data_ac); 732 ocfs2_free_alloc_context(data_ac);
733 data_ac = NULL; 733 data_ac = NULL;
734 } 734 }
735 if (meta_ac) { 735 if (meta_ac) {
736 ocfs2_free_alloc_context(meta_ac); 736 ocfs2_free_alloc_context(meta_ac);
737 meta_ac = NULL; 737 meta_ac = NULL;
738 } 738 }
739 if ((!status) && restart_func) { 739 if ((!status) && restart_func) {
740 restart_func = 0; 740 restart_func = 0;
741 goto restart_all; 741 goto restart_all;
742 } 742 }
743 if (bh) { 743 if (bh) {
744 brelse(bh); 744 brelse(bh);
745 bh = NULL; 745 bh = NULL;
746 } 746 }
747 747
748 mlog_exit(status); 748 mlog_exit(status);
749 return status; 749 return status;
750 } 750 }
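
Stripped of the i_size and cluster bookkeeping, the retry logic above reduces to the following shape (same helpers as in the function; a condensed sketch, not a drop-in replacement):

    for (;;) {
            status = ocfs2_do_extend_allocation(osb, inode, &logical_start,
                                                clusters_to_add, bh, handle,
                                                data_ac, meta_ac, &why);
            if (status != -EAGAIN || why != RESTART_TRANS)
                    break;          /* done, RESTART_META, or hard error */
            credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
            if (ocfs2_extend_trans(handle, credits) < 0)
                    break;          /* handle still has to be committed */
    }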
751 751
752 /* Some parts of this taken from generic_cont_expand, which turned out 752 /* Some parts of this taken from generic_cont_expand, which turned out
753 * to be too fragile to do exactly what we need without us having to 753 * to be too fragile to do exactly what we need without us having to
754 * worry about recursive locking in ->prepare_write() and 754 * worry about recursive locking in ->prepare_write() and
755 * ->commit_write(). */ 755 * ->commit_write(). */
756 static int ocfs2_write_zero_page(struct inode *inode, 756 static int ocfs2_write_zero_page(struct inode *inode,
757 u64 size) 757 u64 size)
758 { 758 {
759 struct address_space *mapping = inode->i_mapping; 759 struct address_space *mapping = inode->i_mapping;
760 struct page *page; 760 struct page *page;
761 unsigned long index; 761 unsigned long index;
762 unsigned int offset; 762 unsigned int offset;
763 handle_t *handle = NULL; 763 handle_t *handle = NULL;
764 int ret; 764 int ret;
765 765
766 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 766 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
767 /* ugh. in prepare/commit_write, if from==to==start of block, we 767 /* ugh. in prepare/commit_write, if from==to==start of block, we
768 ** skip the prepare. make sure we never send an offset for the start 768 ** skip the prepare. make sure we never send an offset for the start
769 ** of a block 769 ** of a block
770 */ 770 */
771 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { 771 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
772 offset++; 772 offset++;
773 } 773 }
774 index = size >> PAGE_CACHE_SHIFT; 774 index = size >> PAGE_CACHE_SHIFT;
775 775
776 page = grab_cache_page(mapping, index); 776 page = grab_cache_page(mapping, index);
777 if (!page) { 777 if (!page) {
778 ret = -ENOMEM; 778 ret = -ENOMEM;
779 mlog_errno(ret); 779 mlog_errno(ret);
780 goto out; 780 goto out;
781 } 781 }
782 782
783 ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); 783 ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
784 if (ret < 0) { 784 if (ret < 0) {
785 mlog_errno(ret); 785 mlog_errno(ret);
786 goto out_unlock; 786 goto out_unlock;
787 } 787 }
788 788
789 if (ocfs2_should_order_data(inode)) { 789 if (ocfs2_should_order_data(inode)) {
790 handle = ocfs2_start_walk_page_trans(inode, page, offset, 790 handle = ocfs2_start_walk_page_trans(inode, page, offset,
791 offset); 791 offset);
792 if (IS_ERR(handle)) { 792 if (IS_ERR(handle)) {
793 ret = PTR_ERR(handle); 793 ret = PTR_ERR(handle);
794 handle = NULL; 794 handle = NULL;
795 goto out_unlock; 795 goto out_unlock;
796 } 796 }
797 } 797 }
798 798
799 /* must not update i_size! */ 799 /* must not update i_size! */
800 ret = block_commit_write(page, offset, offset); 800 ret = block_commit_write(page, offset, offset);
801 if (ret < 0) 801 if (ret < 0)
802 mlog_errno(ret); 802 mlog_errno(ret);
803 else 803 else
804 ret = 0; 804 ret = 0;
805 805
806 if (handle) 806 if (handle)
807 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 807 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
808 out_unlock: 808 out_unlock:
809 unlock_page(page); 809 unlock_page(page);
810 page_cache_release(page); 810 page_cache_release(page);
811 out: 811 out:
812 return ret; 812 return ret;
813 } 813 }
814 814
815 static int ocfs2_zero_extend(struct inode *inode, 815 static int ocfs2_zero_extend(struct inode *inode,
816 u64 zero_to_size) 816 u64 zero_to_size)
817 { 817 {
818 int ret = 0; 818 int ret = 0;
819 u64 start_off; 819 u64 start_off;
820 struct super_block *sb = inode->i_sb; 820 struct super_block *sb = inode->i_sb;
821 821
822 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 822 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
823 while (start_off < zero_to_size) { 823 while (start_off < zero_to_size) {
824 ret = ocfs2_write_zero_page(inode, start_off); 824 ret = ocfs2_write_zero_page(inode, start_off);
825 if (ret < 0) { 825 if (ret < 0) {
826 mlog_errno(ret); 826 mlog_errno(ret);
827 goto out; 827 goto out;
828 } 828 }
829 829
830 start_off += sb->s_blocksize; 830 start_off += sb->s_blocksize;
831 831
832 /* 832 /*
833 * Very large extends have the potential to lock up 833 * Very large extends have the potential to lock up
834 * the cpu for extended periods of time. 834 * the cpu for extended periods of time.
835 */ 835 */
836 cond_resched(); 836 cond_resched();
837 } 837 }
838 838
839 out: 839 out:
840 return ret; 840 return ret;
841 } 841 }
842 842
843 /* 843 /*
844 * A tail_to_skip value > 0 indicates that we're being called from 844 * A tail_to_skip value > 0 indicates that we're being called from
845 * ocfs2_file_aio_write(). This has the following implications: 845 * ocfs2_file_aio_write(). This has the following implications:
846 * 846 *
847 * - we don't want to update i_size 847 * - we don't want to update i_size
848 * - di_bh will be NULL, which is fine because it's only used in the 848 * - di_bh will be NULL, which is fine because it's only used in the
849 * case where we want to update i_size. 849 * case where we want to update i_size.
850 * - ocfs2_zero_extend() will then only be filling the hole created 850 * - ocfs2_zero_extend() will then only be filling the hole created
851 * between i_size and the start of the write. 851 * between i_size and the start of the write.
852 */ 852 */
853 static int ocfs2_extend_file(struct inode *inode, 853 static int ocfs2_extend_file(struct inode *inode,
854 struct buffer_head *di_bh, 854 struct buffer_head *di_bh,
855 u64 new_i_size, 855 u64 new_i_size,
856 size_t tail_to_skip) 856 size_t tail_to_skip)
857 { 857 {
858 int ret = 0; 858 int ret = 0;
859 u32 clusters_to_add = 0; 859 u32 clusters_to_add = 0;
860 860
861 BUG_ON(!tail_to_skip && !di_bh); 861 BUG_ON(!tail_to_skip && !di_bh);
862 862
863 /* setattr sometimes calls us like this. */ 863 /* setattr sometimes calls us like this. */
864 if (new_i_size == 0) 864 if (new_i_size == 0)
865 goto out; 865 goto out;
866 866
867 if (i_size_read(inode) == new_i_size) 867 if (i_size_read(inode) == new_i_size)
868 goto out; 868 goto out;
869 BUG_ON(new_i_size < i_size_read(inode)); 869 BUG_ON(new_i_size < i_size_read(inode));
870 870
871 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 871 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
872 BUG_ON(tail_to_skip != 0); 872 BUG_ON(tail_to_skip != 0);
873 goto out_update_size; 873 goto out_update_size;
874 } 874 }
875 875
876 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 876 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
877 OCFS2_I(inode)->ip_clusters; 877 OCFS2_I(inode)->ip_clusters;
878 878
879 /* 879 /*
880 * protect the pages that ocfs2_zero_extend is going to be 880 * protect the pages that ocfs2_zero_extend is going to be
881 * pulling into the page cache. We do this before the 881 * pulling into the page cache. We do this before the
882 * metadata extend so that we don't get into the situation 882 * metadata extend so that we don't get into the situation
883 * where we've extended the metadata but can't get the data 883 * where we've extended the metadata but can't get the data
884 * lock to zero. 884 * lock to zero.
885 */ 885 */
886 ret = ocfs2_data_lock(inode, 1); 886 ret = ocfs2_data_lock(inode, 1);
887 if (ret < 0) { 887 if (ret < 0) {
888 mlog_errno(ret); 888 mlog_errno(ret);
889 goto out; 889 goto out;
890 } 890 }
891 891
892 if (clusters_to_add) { 892 if (clusters_to_add) {
893 ret = ocfs2_extend_allocation(inode, clusters_to_add); 893 ret = ocfs2_extend_allocation(inode, clusters_to_add);
894 if (ret < 0) { 894 if (ret < 0) {
895 mlog_errno(ret); 895 mlog_errno(ret);
896 goto out_unlock; 896 goto out_unlock;
897 } 897 }
898 } 898 }
899 899
900 /* 900 /*
901 * Call this even if we don't add any clusters to the tree. We 901 * Call this even if we don't add any clusters to the tree. We
902 * still need to zero the area between the old i_size and the 902 * still need to zero the area between the old i_size and the
903 * new i_size. 903 * new i_size.
904 */ 904 */
905 ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); 905 ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
906 if (ret < 0) { 906 if (ret < 0) {
907 mlog_errno(ret); 907 mlog_errno(ret);
908 goto out_unlock; 908 goto out_unlock;
909 } 909 }
910 910
911 out_update_size: 911 out_update_size:
912 if (!tail_to_skip) { 912 if (!tail_to_skip) {
913 /* We're being called from ocfs2_setattr() which wants 913 /* We're being called from ocfs2_setattr() which wants
914 * us to update i_size */ 914 * us to update i_size */
915 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); 915 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
916 if (ret < 0) 916 if (ret < 0)
917 mlog_errno(ret); 917 mlog_errno(ret);
918 } 918 }
919 919
920 out_unlock: 920 out_unlock:
921 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 921 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
922 ocfs2_data_unlock(inode, 1); 922 ocfs2_data_unlock(inode, 1);
923 923
924 out: 924 out:
925 return ret; 925 return ret;
926 } 926 }
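
Concretely, the two call modes described in the comment at the top of this function look like this (both forms appear verbatim later in this file):

    /* From ocfs2_setattr(): pass the dinode bh and update i_size. */
    status = ocfs2_extend_file(inode, bh, attr->ia_size, 0);

    /* From the write path: no dinode bh, and skip the tail that the
     * write itself is about to fill in. */
    ret = ocfs2_extend_file(inode, NULL, newsize, count);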
927 927
928 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 928 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
929 { 929 {
930 int status = 0, size_change; 930 int status = 0, size_change;
931 struct inode *inode = dentry->d_inode; 931 struct inode *inode = dentry->d_inode;
932 struct super_block *sb = inode->i_sb; 932 struct super_block *sb = inode->i_sb;
933 struct ocfs2_super *osb = OCFS2_SB(sb); 933 struct ocfs2_super *osb = OCFS2_SB(sb);
934 struct buffer_head *bh = NULL; 934 struct buffer_head *bh = NULL;
935 handle_t *handle = NULL; 935 handle_t *handle = NULL;
936 936
937 mlog_entry("(0x%p, '%.*s')\n", dentry, 937 mlog_entry("(0x%p, '%.*s')\n", dentry,
938 dentry->d_name.len, dentry->d_name.name); 938 dentry->d_name.len, dentry->d_name.name);
939 939
940 if (attr->ia_valid & ATTR_MODE) 940 if (attr->ia_valid & ATTR_MODE)
941 mlog(0, "mode change: %d\n", attr->ia_mode); 941 mlog(0, "mode change: %d\n", attr->ia_mode);
942 if (attr->ia_valid & ATTR_UID) 942 if (attr->ia_valid & ATTR_UID)
943 mlog(0, "uid change: %d\n", attr->ia_uid); 943 mlog(0, "uid change: %d\n", attr->ia_uid);
944 if (attr->ia_valid & ATTR_GID) 944 if (attr->ia_valid & ATTR_GID)
945 mlog(0, "gid change: %d\n", attr->ia_gid); 945 mlog(0, "gid change: %d\n", attr->ia_gid);
946 if (attr->ia_valid & ATTR_SIZE) 946 if (attr->ia_valid & ATTR_SIZE)
947 mlog(0, "size change...\n"); 947 mlog(0, "size change...\n");
948 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) 948 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
949 mlog(0, "time change...\n"); 949 mlog(0, "time change...\n");
950 950
951 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 951 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
952 | ATTR_GID | ATTR_UID | ATTR_MODE) 952 | ATTR_GID | ATTR_UID | ATTR_MODE)
953 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { 953 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
954 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); 954 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
955 return 0; 955 return 0;
956 } 956 }
957 957
958 status = inode_change_ok(inode, attr); 958 status = inode_change_ok(inode, attr);
959 if (status) 959 if (status)
960 return status; 960 return status;
961 961
962 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 962 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
963 if (size_change) { 963 if (size_change) {
964 status = ocfs2_rw_lock(inode, 1); 964 status = ocfs2_rw_lock(inode, 1);
965 if (status < 0) { 965 if (status < 0) {
966 mlog_errno(status); 966 mlog_errno(status);
967 goto bail; 967 goto bail;
968 } 968 }
969 } 969 }
970 970
971 status = ocfs2_meta_lock(inode, &bh, 1); 971 status = ocfs2_meta_lock(inode, &bh, 1);
972 if (status < 0) { 972 if (status < 0) {
973 if (status != -ENOENT) 973 if (status != -ENOENT)
974 mlog_errno(status); 974 mlog_errno(status);
975 goto bail_unlock_rw; 975 goto bail_unlock_rw;
976 } 976 }
977 977
978 if (size_change && attr->ia_size != i_size_read(inode)) { 978 if (size_change && attr->ia_size != i_size_read(inode)) {
979 if (i_size_read(inode) > attr->ia_size) 979 if (i_size_read(inode) > attr->ia_size)
980 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 980 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
981 else 981 else
982 status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); 982 status = ocfs2_extend_file(inode, bh, attr->ia_size, 0);
983 if (status < 0) { 983 if (status < 0) {
984 if (status != -ENOSPC) 984 if (status != -ENOSPC)
985 mlog_errno(status); 985 mlog_errno(status);
986 status = -ENOSPC; 986 status = -ENOSPC;
987 goto bail_unlock; 987 goto bail_unlock;
988 } 988 }
989 } 989 }
990 990
991 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 991 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
992 if (IS_ERR(handle)) { 992 if (IS_ERR(handle)) {
993 status = PTR_ERR(handle); 993 status = PTR_ERR(handle);
994 mlog_errno(status); 994 mlog_errno(status);
995 goto bail_unlock; 995 goto bail_unlock;
996 } 996 }
997 997
998 status = inode_setattr(inode, attr); 998 status = inode_setattr(inode, attr);
999 if (status < 0) { 999 if (status < 0) {
1000 mlog_errno(status); 1000 mlog_errno(status);
1001 goto bail_commit; 1001 goto bail_commit;
1002 } 1002 }
1003 1003
1004 status = ocfs2_mark_inode_dirty(handle, inode, bh); 1004 status = ocfs2_mark_inode_dirty(handle, inode, bh);
1005 if (status < 0) 1005 if (status < 0)
1006 mlog_errno(status); 1006 mlog_errno(status);
1007 1007
1008 bail_commit: 1008 bail_commit:
1009 ocfs2_commit_trans(osb, handle); 1009 ocfs2_commit_trans(osb, handle);
1010 bail_unlock: 1010 bail_unlock:
1011 ocfs2_meta_unlock(inode, 1); 1011 ocfs2_meta_unlock(inode, 1);
1012 bail_unlock_rw: 1012 bail_unlock_rw:
1013 if (size_change) 1013 if (size_change)
1014 ocfs2_rw_unlock(inode, 1); 1014 ocfs2_rw_unlock(inode, 1);
1015 bail: 1015 bail:
1016 if (bh) 1016 if (bh)
1017 brelse(bh); 1017 brelse(bh);
1018 1018
1019 mlog_exit(status); 1019 mlog_exit(status);
1020 return status; 1020 return status;
1021 } 1021 }
1022 1022
1023 int ocfs2_getattr(struct vfsmount *mnt, 1023 int ocfs2_getattr(struct vfsmount *mnt,
1024 struct dentry *dentry, 1024 struct dentry *dentry,
1025 struct kstat *stat) 1025 struct kstat *stat)
1026 { 1026 {
1027 struct inode *inode = dentry->d_inode; 1027 struct inode *inode = dentry->d_inode;
1028 struct super_block *sb = dentry->d_inode->i_sb; 1028 struct super_block *sb = dentry->d_inode->i_sb;
1029 struct ocfs2_super *osb = sb->s_fs_info; 1029 struct ocfs2_super *osb = sb->s_fs_info;
1030 int err; 1030 int err;
1031 1031
1032 mlog_entry_void(); 1032 mlog_entry_void();
1033 1033
1034 err = ocfs2_inode_revalidate(dentry); 1034 err = ocfs2_inode_revalidate(dentry);
1035 if (err) { 1035 if (err) {
1036 if (err != -ENOENT) 1036 if (err != -ENOENT)
1037 mlog_errno(err); 1037 mlog_errno(err);
1038 goto bail; 1038 goto bail;
1039 } 1039 }
1040 1040
1041 generic_fillattr(inode, stat); 1041 generic_fillattr(inode, stat);
1042 1042
1043 /* We set the blksize from the cluster size for performance */ 1043 /* We set the blksize from the cluster size for performance */
1044 stat->blksize = osb->s_clustersize; 1044 stat->blksize = osb->s_clustersize;
1045 1045
1046 bail: 1046 bail:
1047 mlog_exit(err); 1047 mlog_exit(err);
1048 1048
1049 return err; 1049 return err;
1050 } 1050 }
1051 1051
1052 int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) 1052 int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
1053 { 1053 {
1054 int ret; 1054 int ret;
1055 1055
1056 mlog_entry_void(); 1056 mlog_entry_void();
1057 1057
1058 ret = ocfs2_meta_lock(inode, NULL, 0); 1058 ret = ocfs2_meta_lock(inode, NULL, 0);
1059 if (ret) { 1059 if (ret) {
1060 if (ret != -ENOENT) 1060 if (ret != -ENOENT)
1061 mlog_errno(ret); 1061 mlog_errno(ret);
1062 goto out; 1062 goto out;
1063 } 1063 }
1064 1064
1065 ret = generic_permission(inode, mask, NULL); 1065 ret = generic_permission(inode, mask, NULL);
1066 1066
1067 ocfs2_meta_unlock(inode, 0); 1067 ocfs2_meta_unlock(inode, 0);
1068 out: 1068 out:
1069 mlog_exit(ret); 1069 mlog_exit(ret);
1070 return ret; 1070 return ret;
1071 } 1071 }
1072 1072
1073 static int ocfs2_write_remove_suid(struct inode *inode) 1073 static int ocfs2_write_remove_suid(struct inode *inode)
1074 { 1074 {
1075 int ret; 1075 int ret;
1076 struct buffer_head *bh = NULL; 1076 struct buffer_head *bh = NULL;
1077 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1077 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1078 handle_t *handle; 1078 handle_t *handle;
1079 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1079 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1080 struct ocfs2_dinode *di; 1080 struct ocfs2_dinode *di;
1081 1081
1082 mlog_entry("(Inode %llu, mode 0%o)\n", 1082 mlog_entry("(Inode %llu, mode 0%o)\n",
1083 (unsigned long long)oi->ip_blkno, inode->i_mode); 1083 (unsigned long long)oi->ip_blkno, inode->i_mode);
1084 1084
1085 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1085 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1086 if (handle == NULL) { 1086 if (handle == NULL) {
1087 ret = -ENOMEM; 1087 ret = -ENOMEM;
1088 mlog_errno(ret); 1088 mlog_errno(ret);
1089 goto out; 1089 goto out;
1090 } 1090 }
1091 1091
1092 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); 1092 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
1093 if (ret < 0) { 1093 if (ret < 0) {
1094 mlog_errno(ret); 1094 mlog_errno(ret);
1095 goto out_trans; 1095 goto out_trans;
1096 } 1096 }
1097 1097
1098 ret = ocfs2_journal_access(handle, inode, bh, 1098 ret = ocfs2_journal_access(handle, inode, bh,
1099 OCFS2_JOURNAL_ACCESS_WRITE); 1099 OCFS2_JOURNAL_ACCESS_WRITE);
1100 if (ret < 0) { 1100 if (ret < 0) {
1101 mlog_errno(ret); 1101 mlog_errno(ret);
1102 goto out_bh; 1102 goto out_bh;
1103 } 1103 }
1104 1104
1105 inode->i_mode &= ~S_ISUID; 1105 inode->i_mode &= ~S_ISUID;
1106 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) 1106 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
1107 inode->i_mode &= ~S_ISGID; 1107 inode->i_mode &= ~S_ISGID;
1108 1108
1109 di = (struct ocfs2_dinode *) bh->b_data; 1109 di = (struct ocfs2_dinode *) bh->b_data;
1110 di->i_mode = cpu_to_le16(inode->i_mode); 1110 di->i_mode = cpu_to_le16(inode->i_mode);
1111 1111
1112 ret = ocfs2_journal_dirty(handle, bh); 1112 ret = ocfs2_journal_dirty(handle, bh);
1113 if (ret < 0) 1113 if (ret < 0)
1114 mlog_errno(ret); 1114 mlog_errno(ret);
1115 out_bh: 1115 out_bh:
1116 brelse(bh); 1116 brelse(bh);
1117 out_trans: 1117 out_trans:
1118 ocfs2_commit_trans(osb, handle); 1118 ocfs2_commit_trans(osb, handle);
1119 out: 1119 out:
1120 mlog_exit(ret); 1120 mlog_exit(ret);
1121 return ret; 1121 return ret;
1122 } 1122 }
1123 1123
1124 /* 1124 /*
1125 * Will look for holes and unwritten extents in the range starting at 1125 * Will look for holes and unwritten extents in the range starting at
1126 * pos for count bytes (inclusive). 1126 * pos for count bytes (inclusive).
1127 */ 1127 */
1128 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, 1128 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1129 size_t count) 1129 size_t count)
1130 { 1130 {
1131 int ret = 0; 1131 int ret = 0;
1132 unsigned int extent_flags; 1132 unsigned int extent_flags;
1133 u32 cpos, clusters, extent_len, phys_cpos; 1133 u32 cpos, clusters, extent_len, phys_cpos;
1134 struct super_block *sb = inode->i_sb; 1134 struct super_block *sb = inode->i_sb;
1135 1135
1136 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1136 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1137 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 1137 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1138 1138
1139 while (clusters) { 1139 while (clusters) {
1140 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 1140 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1141 &extent_flags); 1141 &extent_flags);
1142 if (ret < 0) { 1142 if (ret < 0) {
1143 mlog_errno(ret); 1143 mlog_errno(ret);
1144 goto out; 1144 goto out;
1145 } 1145 }
1146 1146
1147 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { 1147 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1148 ret = 1; 1148 ret = 1;
1149 break; 1149 break;
1150 } 1150 }
1151 1151
1152 if (extent_len > clusters) 1152 if (extent_len > clusters)
1153 extent_len = clusters; 1153 extent_len = clusters;
1154 1154
1155 clusters -= extent_len; 1155 clusters -= extent_len;
1156 cpos += extent_len; 1156 cpos += extent_len;
1157 } 1157 }
1158 out: 1158 out:
1159 return ret; 1159 return ret;
1160 } 1160 }
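
The return convention (negative on error, 1 if any hole or unwritten extent falls in the range, 0 otherwise) is what lets the direct-io path below downgrade to buffered io; a sketch of that consumer:

    ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
    if (ret == 1) {
            *direct_io = 0;         /* holes found: do buffered io instead */
            ret = 0;                /* not an error, just a mode change */
    } else if (ret < 0)
            mlog_errno(ret);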
1161 1161
1162 static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1162 static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1163 loff_t *ppos, 1163 loff_t *ppos,
1164 size_t count, 1164 size_t count,
1165 int appending, 1165 int appending,
1166 int *direct_io) 1166 int *direct_io)
1167 { 1167 {
1168 int ret = 0, meta_level = appending; 1168 int ret = 0, meta_level = appending;
1169 struct inode *inode = dentry->d_inode; 1169 struct inode *inode = dentry->d_inode;
1170 u32 clusters; 1170 u32 clusters;
1171 loff_t newsize, saved_pos; 1171 loff_t newsize, saved_pos;
1172 1172
1173 /* 1173 /*
1174 * We sample i_size under a read level meta lock to see if our write 1174 * We sample i_size under a read level meta lock to see if our write
1175 * is extending the file, if it is we back off and get a write level 1175 * is extending the file, if it is we back off and get a write level
1176 * meta lock. 1176 * meta lock.
1177 */ 1177 */
1178 for(;;) { 1178 for(;;) {
1179 ret = ocfs2_meta_lock(inode, NULL, meta_level); 1179 ret = ocfs2_meta_lock(inode, NULL, meta_level);
1180 if (ret < 0) { 1180 if (ret < 0) {
1181 meta_level = -1; 1181 meta_level = -1;
1182 mlog_errno(ret); 1182 mlog_errno(ret);
1183 goto out; 1183 goto out;
1184 } 1184 }
1185 1185
1186 /* Clear suid / sgid if necessary. We do this here 1186 /* Clear suid / sgid if necessary. We do this here
1187 * instead of later in the write path because 1187 * instead of later in the write path because
1188 * remove_suid() calls ->setattr without any hint that 1188 * remove_suid() calls ->setattr without any hint that
1189 * we may have already done our cluster locking. Since 1189 * we may have already done our cluster locking. Since
1190 * ocfs2_setattr() *must* take cluster locks to 1190 * ocfs2_setattr() *must* take cluster locks to
1191 * proceed, this will lead us to recursively lock the 1191 * proceed, this will lead us to recursively lock the
1192 * inode. There's also the dinode i_size state which 1192 * inode. There's also the dinode i_size state which
1193 * can be lost via setattr during extending writes (we 1193 * can be lost via setattr during extending writes (we
1194 * set inode->i_size at the end of a write.) */ 1194 * set inode->i_size at the end of a write.) */
1195 if (should_remove_suid(dentry)) { 1195 if (should_remove_suid(dentry)) {
1196 if (meta_level == 0) { 1196 if (meta_level == 0) {
1197 ocfs2_meta_unlock(inode, meta_level); 1197 ocfs2_meta_unlock(inode, meta_level);
1198 meta_level = 1; 1198 meta_level = 1;
1199 continue; 1199 continue;
1200 } 1200 }
1201 1201
1202 ret = ocfs2_write_remove_suid(inode); 1202 ret = ocfs2_write_remove_suid(inode);
1203 if (ret < 0) { 1203 if (ret < 0) {
1204 mlog_errno(ret); 1204 mlog_errno(ret);
1205 goto out_unlock; 1205 goto out_unlock;
1206 } 1206 }
1207 } 1207 }
1208 1208
1209 /* work on a copy of ppos until we're sure that we won't have 1209 /* work on a copy of ppos until we're sure that we won't have
1210 * to recalculate it due to relocking. */ 1210 * to recalculate it due to relocking. */
1211 if (appending) { 1211 if (appending) {
1212 saved_pos = i_size_read(inode); 1212 saved_pos = i_size_read(inode);
1213 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); 1213 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
1214 } else { 1214 } else {
1215 saved_pos = *ppos; 1215 saved_pos = *ppos;
1216 } 1216 }
1217 1217
1218 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 1218 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
1219 loff_t end = saved_pos + count; 1219 loff_t end = saved_pos + count;
1220 1220
1221 /* 1221 /*
1222 * Skip the O_DIRECT checks if we don't need 1222 * Skip the O_DIRECT checks if we don't need
1223 * them. 1223 * them.
1224 */ 1224 */
1225 if (!direct_io || !(*direct_io)) 1225 if (!direct_io || !(*direct_io))
1226 break; 1226 break;
1227 1227
1228 /* 1228 /*
1229 * Allowing concurrent direct writes means 1229 * Allowing concurrent direct writes means
1230 * i_size changes wouldn't be synchronized, so 1230 * i_size changes wouldn't be synchronized, so
1231 * one node could wind up truncating another 1231 * one node could wind up truncating another
1232 * node's writes. 1232 * node's writes.
1233 */ 1233 */
1234 if (end > i_size_read(inode)) { 1234 if (end > i_size_read(inode)) {
1235 *direct_io = 0; 1235 *direct_io = 0;
1236 break; 1236 break;
1237 } 1237 }
1238 1238
1239 /* 1239 /*
1240 * We don't fill holes during direct io, so 1240 * We don't fill holes during direct io, so
1241 * check for them here. If any are found, the 1241 * check for them here. If any are found, the
1242 * caller will have to retake some cluster 1242 * caller will have to retake some cluster
1243 * locks and initiate the io as buffered. 1243 * locks and initiate the io as buffered.
1244 */ 1244 */
1245 ret = ocfs2_check_range_for_holes(inode, saved_pos, 1245 ret = ocfs2_check_range_for_holes(inode, saved_pos,
1246 count); 1246 count);
1247 if (ret == 1) { 1247 if (ret == 1) {
1248 *direct_io = 0; 1248 *direct_io = 0;
1249 ret = 0; 1249 ret = 0;
1250 } else if (ret < 0) 1250 } else if (ret < 0)
1251 mlog_errno(ret); 1251 mlog_errno(ret);
1252 break; 1252 break;
1253 } 1253 }
1254 1254
1255 /* 1255 /*
1256 * The rest of this loop is concerned with legacy file 1256 * The rest of this loop is concerned with legacy file
1257 * systems which don't support sparse files. 1257 * systems which don't support sparse files.
1258 */ 1258 */
1259 1259
1260 newsize = count + saved_pos; 1260 newsize = count + saved_pos;
1261 1261
1262 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", 1262 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
1263 (long long) saved_pos, (long long) newsize, 1263 (long long) saved_pos, (long long) newsize,
1264 (long long) i_size_read(inode)); 1264 (long long) i_size_read(inode));
1265 1265
1266 /* No need for a higher level metadata lock if we're 1266 /* No need for a higher level metadata lock if we're
1267 * never going past i_size. */ 1267 * never going past i_size. */
1268 if (newsize <= i_size_read(inode)) 1268 if (newsize <= i_size_read(inode))
1269 break; 1269 break;
1270 1270
1271 if (meta_level == 0) { 1271 if (meta_level == 0) {
1272 ocfs2_meta_unlock(inode, meta_level); 1272 ocfs2_meta_unlock(inode, meta_level);
1273 meta_level = 1; 1273 meta_level = 1;
1274 continue; 1274 continue;
1275 } 1275 }
1276 1276
1277 spin_lock(&OCFS2_I(inode)->ip_lock); 1277 spin_lock(&OCFS2_I(inode)->ip_lock);
1278 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - 1278 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
1279 OCFS2_I(inode)->ip_clusters; 1279 OCFS2_I(inode)->ip_clusters;
1280 spin_unlock(&OCFS2_I(inode)->ip_lock); 1280 spin_unlock(&OCFS2_I(inode)->ip_lock);
1281 1281
1282 mlog(0, "Writing at EOF, may need more allocation: " 1282 mlog(0, "Writing at EOF, may need more allocation: "
1283 "i_size = %lld, newsize = %lld, need %u clusters\n", 1283 "i_size = %lld, newsize = %lld, need %u clusters\n",
1284 (long long) i_size_read(inode), (long long) newsize, 1284 (long long) i_size_read(inode), (long long) newsize,
1285 clusters); 1285 clusters);
1286 1286
1287 /* We only want to continue the rest of this loop if 1287 /* We only want to continue the rest of this loop if
1288 * our extend will actually require more 1288 * our extend will actually require more
1289 * allocation. */ 1289 * allocation. */
1290 if (!clusters) 1290 if (!clusters)
1291 break; 1291 break;
1292 1292
1293 ret = ocfs2_extend_file(inode, NULL, newsize, count); 1293 ret = ocfs2_extend_file(inode, NULL, newsize, count);
1294 if (ret < 0) { 1294 if (ret < 0) {
1295 if (ret != -ENOSPC) 1295 if (ret != -ENOSPC)
1296 mlog_errno(ret); 1296 mlog_errno(ret);
1297 goto out_unlock; 1297 goto out_unlock;
1298 } 1298 }
1299 break; 1299 break;
1300 } 1300 }
1301 1301
1302 if (appending) 1302 if (appending)
1303 *ppos = saved_pos; 1303 *ppos = saved_pos;
1304 1304
1305 out_unlock: 1305 out_unlock:
1306 ocfs2_meta_unlock(inode, meta_level); 1306 ocfs2_meta_unlock(inode, meta_level);
1307 1307
1308 out: 1308 out:
1309 return ret; 1309 return ret;
1310 } 1310 }
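
The for(;;) above is an optimistic lock-upgrade loop: sample state under a read-level meta lock and, only when the write turns out to need it, drop and retake the lock at write level. Reduced to its skeleton (needs_write_level() is a hypothetical predicate standing in for the suid/i_size checks):

    int level = 0;                          /* 0 = read, 1 = write */
    for (;;) {
            ret = ocfs2_meta_lock(inode, NULL, level);
            if (ret < 0)
                    break;
            if (level == 1 || !needs_write_level(inode))
                    break;                  /* lock level is sufficient */
            ocfs2_meta_unlock(inode, level);
            level = 1;                      /* re-sample as a writer */
    }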
1311 1311
1312 static inline void 1312 static inline void
1313 ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) 1313 ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1314 { 1314 {
1315 const struct iovec *iov = *iovp; 1315 const struct iovec *iov = *iovp;
1316 size_t base = *basep; 1316 size_t base = *basep;
1317 1317
1318 do { 1318 do {
1319 int copy = min(bytes, iov->iov_len - base); 1319 int copy = min(bytes, iov->iov_len - base);
1320 1320
1321 bytes -= copy; 1321 bytes -= copy;
1322 base += copy; 1322 base += copy;
1323 if (iov->iov_len == base) { 1323 if (iov->iov_len == base) {
1324 iov++; 1324 iov++;
1325 base = 0; 1325 base = 0;
1326 } 1326 }
1327 } while (bytes); 1327 } while (bytes);
1328 *iovp = iov; 1328 *iovp = iov;
1329 *basep = base; 1329 *basep = base;
1330 } 1330 }
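
The advance arithmetic is easiest to check in isolation. A self-contained userspace toy using the plain struct iovec from <sys/uio.h> (same logic, kernel types dropped):

    #include <stdio.h>
    #include <sys/uio.h>

    /* Consume `bytes` across an iovec array, leaving (*iovp, *basep)
     * pointing at the first unconsumed byte. */
    static void advance(const struct iovec **iovp, size_t *basep, size_t bytes)
    {
            const struct iovec *iov = *iovp;
            size_t base = *basep;

            while (bytes) {
                    size_t copy = iov->iov_len - base;

                    if (copy > bytes)
                            copy = bytes;
                    bytes -= copy;
                    base += copy;
                    if (base == iov->iov_len) {
                            iov++;
                            base = 0;
                    }
            }
            *iovp = iov;
            *basep = base;
    }

    int main(void)
    {
            char a[4], b[8];
            struct iovec vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
            const struct iovec *cur = vec;
            size_t off = 0;

            advance(&cur, &off, 6);         /* all of a[], 2 bytes of b[] */
            printf("segment %td, offset %zu\n", cur - vec, off); /* 1, 2 */
            return 0;
    }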
1331 1331
1332 static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, 1332 static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
1333 const struct iovec *cur_iov, 1333 const struct iovec *cur_iov,
1334 size_t iov_offset) 1334 size_t iov_offset)
1335 { 1335 {
1336 int ret; 1336 int ret;
1337 char *buf; 1337 char *buf;
1338 struct page *src_page = NULL; 1338 struct page *src_page = NULL;
1339 1339
1340 buf = cur_iov->iov_base + iov_offset; 1340 buf = cur_iov->iov_base + iov_offset;
1341 1341
1342 if (!segment_eq(get_fs(), KERNEL_DS)) { 1342 if (!segment_eq(get_fs(), KERNEL_DS)) {
1343 /* 1343 /*
1344 * Pull in the user page. We want to do this outside 1344 * Pull in the user page. We want to do this outside
1345 * of the meta data locks in order to preserve locking 1345 * of the meta data locks in order to preserve locking
1346 * order in case of page fault. 1346 * order in case of page fault.
1347 */ 1347 */
1348 ret = get_user_pages(current, current->mm, 1348 ret = get_user_pages(current, current->mm,
1349 (unsigned long)buf & PAGE_CACHE_MASK, 1, 1349 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1350 0, 0, &src_page, NULL); 1350 0, 0, &src_page, NULL);
1351 if (ret == 1) 1351 if (ret == 1)
1352 bp->b_src_buf = kmap(src_page); 1352 bp->b_src_buf = kmap(src_page);
1353 else 1353 else
1354 src_page = ERR_PTR(-EFAULT); 1354 src_page = ERR_PTR(-EFAULT);
1355 } else { 1355 } else {
1356 bp->b_src_buf = buf; 1356 bp->b_src_buf = buf;
1357 } 1357 }
1358 1358
1359 return src_page; 1359 return src_page;
1360 } 1360 }
1361 1361
1362 static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, 1362 static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
1363 struct page *page) 1363 struct page *page)
1364 { 1364 {
1365 if (page) { 1365 if (page) {
1366 kunmap(page); 1366 kunmap(page);
1367 page_cache_release(page); 1367 page_cache_release(page);
1368 } 1368 }
1369 } 1369 }
1370 1370
1371 static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, 1371 static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1372 const struct iovec *iov, 1372 const struct iovec *iov,
1373 unsigned long nr_segs, 1373 unsigned long nr_segs,
1374 size_t count, 1374 size_t count,
1375 ssize_t o_direct_written) 1375 ssize_t o_direct_written)
1376 { 1376 {
1377 int ret = 0; 1377 int ret = 0;
1378 ssize_t copied, total = 0; 1378 ssize_t copied, total = 0;
1379 size_t iov_offset = 0; 1379 size_t iov_offset = 0;
1380 const struct iovec *cur_iov = iov; 1380 const struct iovec *cur_iov = iov;
1381 struct ocfs2_buffered_write_priv bp; 1381 struct ocfs2_buffered_write_priv bp;
1382 struct page *page; 1382 struct page *page;
1383 1383
1384 /* 1384 /*
1385 * handle partial DIO write. Adjust cur_iov if needed. 1385 * handle partial DIO write. Adjust cur_iov if needed.
1386 */ 1386 */
1387 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); 1387 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1388 1388
1389 do { 1389 do {
1390 bp.b_cur_off = iov_offset; 1390 bp.b_cur_off = iov_offset;
1391 bp.b_cur_iov = cur_iov; 1391 bp.b_cur_iov = cur_iov;
1392 1392
1393 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); 1393 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
1394 if (IS_ERR(page)) { 1394 if (IS_ERR(page)) {
1395 ret = PTR_ERR(page); 1395 ret = PTR_ERR(page);
1396 goto out; 1396 goto out;
1397 } 1397 }
1398 1398
1399 copied = ocfs2_buffered_write_cluster(file, *ppos, count, 1399 copied = ocfs2_buffered_write_cluster(file, *ppos, count,
1400 ocfs2_map_and_write_user_data, 1400 ocfs2_map_and_write_user_data,
1401 &bp); 1401 &bp);
1402 1402
1403 ocfs2_put_write_source(&bp, page); 1403 ocfs2_put_write_source(&bp, page);
1404 1404
1405 if (copied < 0) { 1405 if (copied < 0) {
1406 mlog_errno(copied); 1406 mlog_errno(copied);
1407 ret = copied; 1407 ret = copied;
1408 goto out; 1408 goto out;
1409 } 1409 }
1410 1410
1411 total += copied; 1411 total += copied;
1412 *ppos = *ppos + copied; 1412 *ppos = *ppos + copied;
1413 count -= copied; 1413 count -= copied;
1414 1414
1415 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); 1415 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
1416 } while(count); 1416 } while(count);
1417 1417
1418 out: 1418 out:
1419 return total ? total : ret; 1419 return total ? total : ret;
1420 } 1420 }
1421 1421
1422 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 1422 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1423 const struct iovec *iov, 1423 const struct iovec *iov,
1424 unsigned long nr_segs, 1424 unsigned long nr_segs,
1425 loff_t pos) 1425 loff_t pos)
1426 { 1426 {
1427 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 1427 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
1428 int can_do_direct, sync = 0; 1428 int can_do_direct, sync = 0;
1429 ssize_t written = 0; 1429 ssize_t written = 0;
1430 size_t ocount; /* original count */ 1430 size_t ocount; /* original count */
1431 size_t count; /* after file limit checks */ 1431 size_t count; /* after file limit checks */
1432 loff_t *ppos = &iocb->ki_pos; 1432 loff_t *ppos = &iocb->ki_pos;
1433 struct file *file = iocb->ki_filp; 1433 struct file *file = iocb->ki_filp;
1434 struct inode *inode = file->f_path.dentry->d_inode; 1434 struct inode *inode = file->f_path.dentry->d_inode;
1435 1435
1436 mlog_entry("(0x%p, %u, '%.*s')\n", file, 1436 mlog_entry("(0x%p, %u, '%.*s')\n", file,
1437 (unsigned int)nr_segs, 1437 (unsigned int)nr_segs,
1438 file->f_path.dentry->d_name.len, 1438 file->f_path.dentry->d_name.len,
1439 file->f_path.dentry->d_name.name); 1439 file->f_path.dentry->d_name.name);
1440 1440
1441 if (iocb->ki_left == 0) 1441 if (iocb->ki_left == 0)
1442 return 0; 1442 return 0;
1443 1443
1444 ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); 1444 ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
1445 if (ret) 1445 if (ret)
1446 return ret; 1446 return ret;
1447 1447
1448 count = ocount; 1448 count = ocount;
1449 1449
1450 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 1450 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1451 1451
1452 appending = file->f_flags & O_APPEND ? 1 : 0; 1452 appending = file->f_flags & O_APPEND ? 1 : 0;
1453 direct_io = file->f_flags & O_DIRECT ? 1 : 0; 1453 direct_io = file->f_flags & O_DIRECT ? 1 : 0;
1454 1454
1455 mutex_lock(&inode->i_mutex); 1455 mutex_lock(&inode->i_mutex);
1456 1456
1457 relock: 1457 relock:
1458 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 1458 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
1459 if (direct_io) { 1459 if (direct_io) {
1460 down_read(&inode->i_alloc_sem); 1460 down_read(&inode->i_alloc_sem);
1461 have_alloc_sem = 1; 1461 have_alloc_sem = 1;
1462 } 1462 }
1463 1463
1464 /* concurrent O_DIRECT writes are allowed */ 1464 /* concurrent O_DIRECT writes are allowed */
1465 rw_level = !direct_io; 1465 rw_level = !direct_io;
1466 ret = ocfs2_rw_lock(inode, rw_level); 1466 ret = ocfs2_rw_lock(inode, rw_level);
1467 if (ret < 0) { 1467 if (ret < 0) {
1468 mlog_errno(ret); 1468 mlog_errno(ret);
1469 goto out_sems; 1469 goto out_sems;
1470 } 1470 }
1471 1471
1472 can_do_direct = direct_io; 1472 can_do_direct = direct_io;
1473 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 1473 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
1474 iocb->ki_left, appending, 1474 iocb->ki_left, appending,
1475 &can_do_direct); 1475 &can_do_direct);
1476 if (ret < 0) { 1476 if (ret < 0) {
1477 mlog_errno(ret); 1477 mlog_errno(ret);
1478 goto out; 1478 goto out;
1479 } 1479 }
1480 1480
1481 /* 1481 /*
1482 * We can't complete the direct I/O as requested, fall back to 1482 * We can't complete the direct I/O as requested, fall back to
1483 * buffered I/O. 1483 * buffered I/O.
1484 */ 1484 */
1485 if (direct_io && !can_do_direct) { 1485 if (direct_io && !can_do_direct) {
1486 ocfs2_rw_unlock(inode, rw_level); 1486 ocfs2_rw_unlock(inode, rw_level);
1487 up_read(&inode->i_alloc_sem); 1487 up_read(&inode->i_alloc_sem);
1488 1488
1489 have_alloc_sem = 0; 1489 have_alloc_sem = 0;
1490 rw_level = -1; 1490 rw_level = -1;
1491 1491
1492 direct_io = 0; 1492 direct_io = 0;
1493 sync = 1; 1493 sync = 1;
1494 goto relock; 1494 goto relock;
1495 } 1495 }
1496 1496
1497 if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) 1497 if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
1498 sync = 1; 1498 sync = 1;
1499 1499
1500 /* 1500 /*
1501 * XXX: Is it ok to execute these checks a second time? 1501 * XXX: Is it ok to execute these checks a second time?
1502 */ 1502 */
1503 ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); 1503 ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
1504 if (ret) 1504 if (ret)
1505 goto out; 1505 goto out;
1506 1506
1507 /* 1507 /*
1508 * Set pos so that sync_page_range_nolock() below understands 1508 * Set pos so that sync_page_range_nolock() below understands
1509 * where to start from. We might've moved it around via the 1509 * where to start from. We might've moved it around via the
1510 * calls above. The range we want to actually sync starts from 1510 * calls above. The range we want to actually sync starts from
1511 * *ppos here. 1511 * *ppos here.
1512 * 1512 *
1513 */ 1513 */
1514 pos = *ppos; 1514 pos = *ppos;
1515 1515
1516 /* communicate with ocfs2_dio_end_io */ 1516 /* communicate with ocfs2_dio_end_io */
1517 ocfs2_iocb_set_rw_locked(iocb, rw_level); 1517 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1518 1518
1519 if (direct_io) { 1519 if (direct_io) {
1520 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 1520 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
1521 ppos, count, ocount); 1521 ppos, count, ocount);
1522 if (written < 0) { 1522 if (written < 0) {
1523 ret = written; 1523 ret = written;
1524 goto out_dio; 1524 goto out_dio;
1525 } 1525 }
1526 } else { 1526 } else {
1527 written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, 1527 written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
1528 count, written); 1528 count, written);
1529 if (written < 0) { 1529 if (written < 0) {
1530 ret = written; 1530 ret = written;
1531 if (ret != -EFAULT && ret != -ENOSPC) 1531 if (ret != -EFAULT && ret != -ENOSPC)
1532 mlog_errno(ret); 1532 mlog_errno(ret);
1533 goto out; 1533 goto out;
1534 } 1534 }
1535 } 1535 }
1536 1536
1537 out_dio: 1537 out_dio:
1538 /* buffered aio wouldn't have proper lock coverage today */ 1538 /* buffered aio wouldn't have proper lock coverage today */
1539 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 1539 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
1540 1540
1541 /* 1541 /*
1542 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io 1542 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io
1543 * function pointer which is called when o_direct io completes so that 1543 * function pointer which is called when o_direct io completes so that
1544 * it can unlock our rw lock. (it's the clustered equivalent of 1544 * it can unlock our rw lock. (it's the clustered equivalent of
1545 * i_alloc_sem; protects truncate from racing with pending ios). 1545 * i_alloc_sem; protects truncate from racing with pending ios).
1546 * Unfortunately there are error cases which call end_io and others 1546 * Unfortunately there are error cases which call end_io and others
1547 * that don't. So we don't have to unlock the rw_lock if either an 1547 * that don't. So we don't have to unlock the rw_lock if either an
1548 * async dio is going to do it in the future or an end_io after an 1548 * async dio is going to do it in the future or an end_io after an
1549 * error has already done it. 1549 * error has already done it.
1550 */ 1550 */
1551 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 1551 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1552 rw_level = -1; 1552 rw_level = -1;
1553 have_alloc_sem = 0; 1553 have_alloc_sem = 0;
1554 } 1554 }
1555 1555
1556 out: 1556 out:
1557 if (rw_level != -1) 1557 if (rw_level != -1)
1558 ocfs2_rw_unlock(inode, rw_level); 1558 ocfs2_rw_unlock(inode, rw_level);
1559 1559
1560 out_sems: 1560 out_sems:
1561 if (have_alloc_sem) 1561 if (have_alloc_sem)
1562 up_read(&inode->i_alloc_sem); 1562 up_read(&inode->i_alloc_sem);
1563 1563
1564 if (written > 0 && sync) { 1564 if (written > 0 && sync) {
1565 ssize_t err; 1565 ssize_t err;
1566 1566
1567 err = sync_page_range_nolock(inode, file->f_mapping, pos, count); 1567 err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
1568 if (err < 0) 1568 if (err < 0)
1569 written = err; 1569 written = err;
1570 } 1570 }
1571 1571
1572 mutex_unlock(&inode->i_mutex); 1572 mutex_unlock(&inode->i_mutex);
1573 1573
1574 mlog_exit(ret); 1574 mlog_exit(ret);
1575 return written ? written : ret; 1575 return written ? written : ret;
1576 } 1576 }
1577 1577
1578 static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, 1578 static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
1579 struct pipe_buffer *buf, 1579 struct pipe_buffer *buf,
1580 struct splice_desc *sd) 1580 struct splice_desc *sd)
1581 { 1581 {
1582 int ret, count, total = 0; 1582 int ret, count, total = 0;
1583 ssize_t copied = 0; 1583 ssize_t copied = 0;
1584 struct ocfs2_splice_write_priv sp; 1584 struct ocfs2_splice_write_priv sp;
1585 1585
1586 ret = buf->ops->pin(pipe, buf); 1586 ret = buf->ops->confirm(pipe, buf);
1587 if (ret) 1587 if (ret)
1588 goto out; 1588 goto out;
1589 1589
1590 sp.s_sd = sd; 1590 sp.s_sd = sd;
1591 sp.s_buf = buf; 1591 sp.s_buf = buf;
1592 sp.s_pipe = pipe; 1592 sp.s_pipe = pipe;
1593 sp.s_offset = sd->pos & ~PAGE_CACHE_MASK; 1593 sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
1594 sp.s_buf_offset = buf->offset; 1594 sp.s_buf_offset = buf->offset;
1595 1595
1596 count = sd->len; 1596 count = sd->len;
1597 if (count + sp.s_offset > PAGE_CACHE_SIZE) 1597 if (count + sp.s_offset > PAGE_CACHE_SIZE)
1598 count = PAGE_CACHE_SIZE - sp.s_offset; 1598 count = PAGE_CACHE_SIZE - sp.s_offset;
1599 1599
1600 do { 1600 do {
1601 /* 1601 /*
1602 * splice wants us to copy up to one page at a 1602 * splice wants us to copy up to one page at a
1603 * time. For pagesize > cluster size, this means we 1603 * time. For pagesize > cluster size, this means we
1604 * might enter ocfs2_buffered_write_cluster() more 1604 * might enter ocfs2_buffered_write_cluster() more
1605 * than once, so keep track of our progress here. 1605 * than once, so keep track of our progress here.
1606 */ 1606 */
1607 copied = ocfs2_buffered_write_cluster(sd->file, 1607 copied = ocfs2_buffered_write_cluster(sd->u.file,
1608 (loff_t)sd->pos + total, 1608 (loff_t)sd->pos + total,
1609 count, 1609 count,
1610 ocfs2_map_and_write_splice_data, 1610 ocfs2_map_and_write_splice_data,
1611 &sp); 1611 &sp);
1612 if (copied < 0) { 1612 if (copied < 0) {
1613 mlog_errno(copied); 1613 mlog_errno(copied);
1614 ret = copied; 1614 ret = copied;
1615 goto out; 1615 goto out;
1616 } 1616 }
1617 1617
1618 count -= copied; 1618 count -= copied;
1619 sp.s_offset += copied; 1619 sp.s_offset += copied;
1620 sp.s_buf_offset += copied; 1620 sp.s_buf_offset += copied;
1621 total += copied; 1621 total += copied;
1622 } while (count); 1622 } while (count);
1623 1623
1624 ret = 0; 1624 ret = 0;
1625 out: 1625 out:
1626 1626
1627 return total ? total : ret; 1627 return total ? total : ret;
1628 } 1628 }
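
This actor is where the renamed hook gets exercised: before touching the buffer's page, the actor calls buf->ops->confirm() (formerly ->pin()) and proceeds only on a zero return. A sketch of the skeleton every splice write actor now follows (copy_out() is a hypothetical stand-in for the per-filesystem copy step):

    static int example_actor(struct pipe_inode_info *pipe,
                             struct pipe_buffer *buf,
                             struct splice_desc *sd)
    {
            int ret = buf->ops->confirm(pipe, buf);    /* was ->pin() */

            if (ret)
                    return ret;             /* buffer not usable */

            return copy_out(buf, sd);       /* page contents now valid */
    }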
1629 1629
1630 static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, 1630 static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1631 struct file *out, 1631 struct file *out,
1632 loff_t *ppos, 1632 loff_t *ppos,
1633 size_t len, 1633 size_t len,
1634 unsigned int flags) 1634 unsigned int flags)
1635 { 1635 {
1636 int ret, err; 1636 int ret, err;
1637 struct address_space *mapping = out->f_mapping; 1637 struct address_space *mapping = out->f_mapping;
1638 struct inode *inode = mapping->host; 1638 struct inode *inode = mapping->host;
1639 struct splice_desc sd = { 1639 struct splice_desc sd = {
1640 .total_len = len, 1640 .total_len = len,
1641 .flags = flags, 1641 .flags = flags,
1642 .pos = *ppos, 1642 .pos = *ppos,
1643 .u.file = out, 1643 .u.file = out,
1644 }; 1644 };
1645 1645
1646 ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor); 1646 ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor);
1647 if (ret > 0) { 1647 if (ret > 0) {
1648 *ppos += ret; 1648 *ppos += ret;
1649 1649
1650 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 1650 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
1651 err = generic_osync_inode(inode, mapping, 1651 err = generic_osync_inode(inode, mapping,
1652 OSYNC_METADATA|OSYNC_DATA); 1652 OSYNC_METADATA|OSYNC_DATA);
1653 if (err) 1653 if (err)
1654 ret = err; 1654 ret = err;
1655 } 1655 }
1656 } 1656 }
1657 1657
1658 return ret; 1658 return ret;
1659 } 1659 }
1660 1660
1661 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 1661 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1662 struct file *out, 1662 struct file *out,
1663 loff_t *ppos, 1663 loff_t *ppos,
1664 size_t len, 1664 size_t len,
1665 unsigned int flags) 1665 unsigned int flags)
1666 { 1666 {
1667 int ret; 1667 int ret;
1668 struct inode *inode = out->f_path.dentry->d_inode; 1668 struct inode *inode = out->f_path.dentry->d_inode;
1669 1669
1670 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, 1670 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
1671 (unsigned int)len, 1671 (unsigned int)len,
1672 out->f_path.dentry->d_name.len, 1672 out->f_path.dentry->d_name.len,
1673 out->f_path.dentry->d_name.name); 1673 out->f_path.dentry->d_name.name);
1674 1674
1675 inode_double_lock(inode, pipe->inode); 1675 inode_double_lock(inode, pipe->inode);
1676 1676
1677 ret = ocfs2_rw_lock(inode, 1); 1677 ret = ocfs2_rw_lock(inode, 1);
1678 if (ret < 0) { 1678 if (ret < 0) {
1679 mlog_errno(ret); 1679 mlog_errno(ret);
1680 goto out; 1680 goto out;
1681 } 1681 }
1682 1682
1683 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, 1683 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
1684 NULL); 1684 NULL);
1685 if (ret < 0) { 1685 if (ret < 0) {
1686 mlog_errno(ret); 1686 mlog_errno(ret);
1687 goto out_unlock; 1687 goto out_unlock;
1688 } 1688 }
1689 1689
1690 /* ok, we're done with i_size and alloc work */ 1690 /* ok, we're done with i_size and alloc work */
1691 ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags); 1691 ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);
1692 1692
1693 out_unlock: 1693 out_unlock:
1694 ocfs2_rw_unlock(inode, 1); 1694 ocfs2_rw_unlock(inode, 1);
1695 out: 1695 out:
1696 inode_double_unlock(inode, pipe->inode); 1696 inode_double_unlock(inode, pipe->inode);
1697 1697
1698 mlog_exit(ret); 1698 mlog_exit(ret);
1699 return ret; 1699 return ret;
1700 } 1700 }
1701 1701
1702 static ssize_t ocfs2_file_splice_read(struct file *in, 1702 static ssize_t ocfs2_file_splice_read(struct file *in,
1703 loff_t *ppos, 1703 loff_t *ppos,
1704 struct pipe_inode_info *pipe, 1704 struct pipe_inode_info *pipe,
1705 size_t len, 1705 size_t len,
1706 unsigned int flags) 1706 unsigned int flags)
1707 { 1707 {
1708 int ret = 0; 1708 int ret = 0;
1709 struct inode *inode = in->f_path.dentry->d_inode; 1709 struct inode *inode = in->f_path.dentry->d_inode;
1710 1710
1711 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, 1711 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
1712 (unsigned int)len, 1712 (unsigned int)len,
1713 in->f_path.dentry->d_name.len, 1713 in->f_path.dentry->d_name.len,
1714 in->f_path.dentry->d_name.name); 1714 in->f_path.dentry->d_name.name);
1715 1715
1716 /* 1716 /*
1717 * See the comment in ocfs2_file_aio_read() 1717 * See the comment in ocfs2_file_aio_read()
1718 */ 1718 */
1719 ret = ocfs2_meta_lock(inode, NULL, 0); 1719 ret = ocfs2_meta_lock(inode, NULL, 0);
1720 if (ret < 0) { 1720 if (ret < 0) {
1721 mlog_errno(ret); 1721 mlog_errno(ret);
1722 goto bail; 1722 goto bail;
1723 } 1723 }
1724 ocfs2_meta_unlock(inode, 0); 1724 ocfs2_meta_unlock(inode, 0);
1725 1725
1726 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 1726 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
1727 1727
1728 bail: 1728 bail:
1729 mlog_exit(ret); 1729 mlog_exit(ret);
1730 return ret; 1730 return ret;
1731 } 1731 }
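
ocfs2 only wraps generic_file_splice_read() with cluster locking, so from userspace this path is exercised with the ordinary splice(2) syscall. A self-contained sketch that pulls up to 64KiB from a file into a pipe, error handling abbreviated:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int pfd[2], fd;
	ssize_t n;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0 || pipe(pfd) < 0)
		return 1;
	/* File pages go straight into the pipe; no bounce through user
	 * memory.  The filesystem's .splice_read runs underneath. */
	n = splice(fd, NULL, pfd[1], NULL, 65536, SPLICE_F_MOVE);
	if (n < 0)
		perror("splice");
	else
		printf("spliced %zd bytes into the pipe\n", n);
	return 0;
}
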
1732 1732
1733 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, 1733 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1734 const struct iovec *iov, 1734 const struct iovec *iov,
1735 unsigned long nr_segs, 1735 unsigned long nr_segs,
1736 loff_t pos) 1736 loff_t pos)
1737 { 1737 {
1738 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; 1738 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
1739 struct file *filp = iocb->ki_filp; 1739 struct file *filp = iocb->ki_filp;
1740 struct inode *inode = filp->f_path.dentry->d_inode; 1740 struct inode *inode = filp->f_path.dentry->d_inode;
1741 1741
1742 mlog_entry("(0x%p, %u, '%.*s')\n", filp, 1742 mlog_entry("(0x%p, %u, '%.*s')\n", filp,
1743 (unsigned int)nr_segs, 1743 (unsigned int)nr_segs,
1744 filp->f_path.dentry->d_name.len, 1744 filp->f_path.dentry->d_name.len,
1745 filp->f_path.dentry->d_name.name); 1745 filp->f_path.dentry->d_name.name);
1746 1746
1747 if (!inode) { 1747 if (!inode) {
1748 ret = -EINVAL; 1748 ret = -EINVAL;
1749 mlog_errno(ret); 1749 mlog_errno(ret);
1750 goto bail; 1750 goto bail;
1751 } 1751 }
1752 1752
1753 /* 1753 /*
1754 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 1754 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
1755 * need locks to protect pending reads from racing with truncate. 1755 * need locks to protect pending reads from racing with truncate.
1756 */ 1756 */
1757 if (filp->f_flags & O_DIRECT) { 1757 if (filp->f_flags & O_DIRECT) {
1758 down_read(&inode->i_alloc_sem); 1758 down_read(&inode->i_alloc_sem);
1759 have_alloc_sem = 1; 1759 have_alloc_sem = 1;
1760 1760
1761 ret = ocfs2_rw_lock(inode, 0); 1761 ret = ocfs2_rw_lock(inode, 0);
1762 if (ret < 0) { 1762 if (ret < 0) {
1763 mlog_errno(ret); 1763 mlog_errno(ret);
1764 goto bail; 1764 goto bail;
1765 } 1765 }
1766 rw_level = 0; 1766 rw_level = 0;
1767 /* communicate with ocfs2_dio_end_io */ 1767 /* communicate with ocfs2_dio_end_io */
1768 ocfs2_iocb_set_rw_locked(iocb, rw_level); 1768 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1769 } 1769 }
1770 1770
1771 /* 1771 /*
1772 * We're fine letting folks race truncates and extending 1772 * We're fine letting folks race truncates and extending
1773 * writes with read across the cluster, just like they can 1773 * writes with read across the cluster, just like they can
1774 * locally. Hence no rw_lock during read. 1774 * locally. Hence no rw_lock during read.
1775 * 1775 *
1776 * Take and drop the meta data lock to update inode fields 1776 * Take and drop the meta data lock to update inode fields
1777 * like i_size. This allows the checks down below 1777 * like i_size. This allows the checks down below
1778 * generic_file_aio_read() a chance of actually working. 1778 * generic_file_aio_read() a chance of actually working.
1779 */ 1779 */
1780 ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 1780 ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
1781 if (ret < 0) { 1781 if (ret < 0) {
1782 mlog_errno(ret); 1782 mlog_errno(ret);
1783 goto bail; 1783 goto bail;
1784 } 1784 }
1785 ocfs2_meta_unlock(inode, lock_level); 1785 ocfs2_meta_unlock(inode, lock_level);
1786 1786
1787 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 1787 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
1788 if (ret == -EINVAL) 1788 if (ret == -EINVAL)
1789 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); 1789 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
1790 1790
1791 /* buffered aio wouldn't have proper lock coverage today */ 1791 /* buffered aio wouldn't have proper lock coverage today */
1792 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1792 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
1793 1793
1794 /* see ocfs2_file_aio_write */ 1794 /* see ocfs2_file_aio_write */
1795 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 1795 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1796 rw_level = -1; 1796 rw_level = -1;
1797 have_alloc_sem = 0; 1797 have_alloc_sem = 0;
1798 } 1798 }
1799 1799
1800 bail: 1800 bail:
1801 if (have_alloc_sem) 1801 if (have_alloc_sem)
1802 up_read(&inode->i_alloc_sem); 1802 up_read(&inode->i_alloc_sem);
1803 if (rw_level != -1) 1803 if (rw_level != -1)
1804 ocfs2_rw_unlock(inode, rw_level); 1804 ocfs2_rw_unlock(inode, rw_level);
1805 mlog_exit(ret); 1805 mlog_exit(ret);
1806 1806
1807 return ret; 1807 return ret;
1808 } 1808 }
1809 1809
1810 const struct inode_operations ocfs2_file_iops = { 1810 const struct inode_operations ocfs2_file_iops = {
1811 .setattr = ocfs2_setattr, 1811 .setattr = ocfs2_setattr,
1812 .getattr = ocfs2_getattr, 1812 .getattr = ocfs2_getattr,
1813 .permission = ocfs2_permission, 1813 .permission = ocfs2_permission,
1814 }; 1814 };
1815 1815
1816 const struct inode_operations ocfs2_special_file_iops = { 1816 const struct inode_operations ocfs2_special_file_iops = {
1817 .setattr = ocfs2_setattr, 1817 .setattr = ocfs2_setattr,
1818 .getattr = ocfs2_getattr, 1818 .getattr = ocfs2_getattr,
1819 .permission = ocfs2_permission, 1819 .permission = ocfs2_permission,
1820 }; 1820 };
1821 1821
1822 const struct file_operations ocfs2_fops = { 1822 const struct file_operations ocfs2_fops = {
1823 .read = do_sync_read, 1823 .read = do_sync_read,
1824 .write = do_sync_write, 1824 .write = do_sync_write,
1825 .mmap = ocfs2_mmap, 1825 .mmap = ocfs2_mmap,
1826 .fsync = ocfs2_sync_file, 1826 .fsync = ocfs2_sync_file,
1827 .release = ocfs2_file_release, 1827 .release = ocfs2_file_release,
1828 .open = ocfs2_file_open, 1828 .open = ocfs2_file_open,
1829 .aio_read = ocfs2_file_aio_read, 1829 .aio_read = ocfs2_file_aio_read,
1830 .aio_write = ocfs2_file_aio_write, 1830 .aio_write = ocfs2_file_aio_write,
1831 .ioctl = ocfs2_ioctl, 1831 .ioctl = ocfs2_ioctl,
1832 #ifdef CONFIG_COMPAT 1832 #ifdef CONFIG_COMPAT
1833 .compat_ioctl = ocfs2_compat_ioctl, 1833 .compat_ioctl = ocfs2_compat_ioctl,
1834 #endif 1834 #endif
1835 .splice_read = ocfs2_file_splice_read, 1835 .splice_read = ocfs2_file_splice_read,
1836 .splice_write = ocfs2_file_splice_write, 1836 .splice_write = ocfs2_file_splice_write,
1837 }; 1837 };
1838 1838
1839 const struct file_operations ocfs2_dops = { 1839 const struct file_operations ocfs2_dops = {
1840 .read = generic_read_dir, 1840 .read = generic_read_dir,
1841 .readdir = ocfs2_readdir, 1841 .readdir = ocfs2_readdir,
1842 .fsync = ocfs2_sync_file, 1842 .fsync = ocfs2_sync_file,
1843 .ioctl = ocfs2_ioctl, 1843 .ioctl = ocfs2_ioctl,
1844 #ifdef CONFIG_COMPAT 1844 #ifdef CONFIG_COMPAT
1845 .compat_ioctl = ocfs2_compat_ioctl, 1845 .compat_ioctl = ocfs2_compat_ioctl,
1846 #endif 1846 #endif
1847 }; 1847 };
1848 1848
1 /* 1 /*
fs/pipe.c
2 * linux/fs/pipe.c 2 * linux/fs/pipe.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1999 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1999 Linus Torvalds
5 */ 5 */
6 6
7 #include <linux/mm.h> 7 #include <linux/mm.h>
8 #include <linux/file.h> 8 #include <linux/file.h>
9 #include <linux/poll.h> 9 #include <linux/poll.h>
10 #include <linux/slab.h> 10 #include <linux/slab.h>
11 #include <linux/module.h> 11 #include <linux/module.h>
12 #include <linux/init.h> 12 #include <linux/init.h>
13 #include <linux/fs.h> 13 #include <linux/fs.h>
14 #include <linux/mount.h> 14 #include <linux/mount.h>
15 #include <linux/pipe_fs_i.h> 15 #include <linux/pipe_fs_i.h>
16 #include <linux/uio.h> 16 #include <linux/uio.h>
17 #include <linux/highmem.h> 17 #include <linux/highmem.h>
18 #include <linux/pagemap.h> 18 #include <linux/pagemap.h>
19 #include <linux/audit.h> 19 #include <linux/audit.h>
20 20
21 #include <asm/uaccess.h> 21 #include <asm/uaccess.h>
22 #include <asm/ioctls.h> 22 #include <asm/ioctls.h>
23 23
24 /* 24 /*
25 * We use a start+len construction, which provides full use of the 25 * We use a start+len construction, which provides full use of the
26 * allocated memory. 26 * allocated memory.
27 * -- Florian Coosmann (FGC) 27 * -- Florian Coosmann (FGC)
28 * 28 *
29 * Reads with count = 0 should always return 0. 29 * Reads with count = 0 should always return 0.
30 * -- Julian Bradfield 1999-06-07. 30 * -- Julian Bradfield 1999-06-07.
31 * 31 *
32 * FIFOs and Pipes now generate SIGIO for both readers and writers. 32 * FIFOs and Pipes now generate SIGIO for both readers and writers.
33 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16 33 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
34 * 34 *
35 * pipe_read & write cleanup 35 * pipe_read & write cleanup
36 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 36 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
37 */ 37 */
38 38
39 /* Drop the inode semaphore and wait for a pipe event, atomically */ 39 /* Drop the inode semaphore and wait for a pipe event, atomically */
40 void pipe_wait(struct pipe_inode_info *pipe) 40 void pipe_wait(struct pipe_inode_info *pipe)
41 { 41 {
42 DEFINE_WAIT(wait); 42 DEFINE_WAIT(wait);
43 43
44 /* 44 /*
45 * Pipes are system-local resources, so sleeping on them 45 * Pipes are system-local resources, so sleeping on them
46 * is considered a noninteractive wait: 46 * is considered a noninteractive wait:
47 */ 47 */
48 prepare_to_wait(&pipe->wait, &wait, 48 prepare_to_wait(&pipe->wait, &wait,
49 TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); 49 TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
50 if (pipe->inode) 50 if (pipe->inode)
51 mutex_unlock(&pipe->inode->i_mutex); 51 mutex_unlock(&pipe->inode->i_mutex);
52 schedule(); 52 schedule();
53 finish_wait(&pipe->wait, &wait); 53 finish_wait(&pipe->wait, &wait);
54 if (pipe->inode) 54 if (pipe->inode)
55 mutex_lock(&pipe->inode->i_mutex); 55 mutex_lock(&pipe->inode->i_mutex);
56 } 56 }
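
pipe_wait() drops i_mutex across the sleep and retakes it before returning, so callers must re-check their condition after every wakeup. A sketch of the calling convention, mirroring the read/write loops later in this file:

/* Caller holds inode->i_mutex; pipe_wait() releases it while asleep. */
static int wait_for_buffers(struct pipe_inode_info *pipe)
{
	while (!pipe->nrbufs) {
		if (signal_pending(current))
			return -ERESTARTSYS;
		pipe_wait(pipe);	/* may drop and retake i_mutex */
	}
	return 0;
}
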
57 57
58 static int 58 static int
59 pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, 59 pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
60 int atomic) 60 int atomic)
61 { 61 {
62 unsigned long copy; 62 unsigned long copy;
63 63
64 while (len > 0) { 64 while (len > 0) {
65 while (!iov->iov_len) 65 while (!iov->iov_len)
66 iov++; 66 iov++;
67 copy = min_t(unsigned long, len, iov->iov_len); 67 copy = min_t(unsigned long, len, iov->iov_len);
68 68
69 if (atomic) { 69 if (atomic) {
70 if (__copy_from_user_inatomic(to, iov->iov_base, copy)) 70 if (__copy_from_user_inatomic(to, iov->iov_base, copy))
71 return -EFAULT; 71 return -EFAULT;
72 } else { 72 } else {
73 if (copy_from_user(to, iov->iov_base, copy)) 73 if (copy_from_user(to, iov->iov_base, copy))
74 return -EFAULT; 74 return -EFAULT;
75 } 75 }
76 to += copy; 76 to += copy;
77 len -= copy; 77 len -= copy;
78 iov->iov_base += copy; 78 iov->iov_base += copy;
79 iov->iov_len -= copy; 79 iov->iov_len -= copy;
80 } 80 }
81 return 0; 81 return 0;
82 } 82 }
83 83
84 static int 84 static int
85 pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, 85 pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
86 int atomic) 86 int atomic)
87 { 87 {
88 unsigned long copy; 88 unsigned long copy;
89 89
90 while (len > 0) { 90 while (len > 0) {
91 while (!iov->iov_len) 91 while (!iov->iov_len)
92 iov++; 92 iov++;
93 copy = min_t(unsigned long, len, iov->iov_len); 93 copy = min_t(unsigned long, len, iov->iov_len);
94 94
95 if (atomic) { 95 if (atomic) {
96 if (__copy_to_user_inatomic(iov->iov_base, from, copy)) 96 if (__copy_to_user_inatomic(iov->iov_base, from, copy))
97 return -EFAULT; 97 return -EFAULT;
98 } else { 98 } else {
99 if (copy_to_user(iov->iov_base, from, copy)) 99 if (copy_to_user(iov->iov_base, from, copy))
100 return -EFAULT; 100 return -EFAULT;
101 } 101 }
102 from += copy; 102 from += copy;
103 len -= copy; 103 len -= copy;
104 iov->iov_base += copy; 104 iov->iov_base += copy;
105 iov->iov_len -= copy; 105 iov->iov_len -= copy;
106 } 106 }
107 return 0; 107 return 0;
108 } 108 }
109 109
110 /* 110 /*
111 * Attempt to pre-fault in the user memory, so we can use atomic copies. 111 * Attempt to pre-fault in the user memory, so we can use atomic copies.
112 * Returns the number of bytes not faulted in. 112 * Returns the number of bytes not faulted in.
113 */ 113 */
114 static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) 114 static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
115 { 115 {
116 while (!iov->iov_len) 116 while (!iov->iov_len)
117 iov++; 117 iov++;
118 118
119 while (len > 0) { 119 while (len > 0) {
120 unsigned long this_len; 120 unsigned long this_len;
121 121
122 this_len = min_t(unsigned long, len, iov->iov_len); 122 this_len = min_t(unsigned long, len, iov->iov_len);
123 if (fault_in_pages_writeable(iov->iov_base, this_len)) 123 if (fault_in_pages_writeable(iov->iov_base, this_len))
124 break; 124 break;
125 125
126 len -= this_len; 126 len -= this_len;
127 iov++; 127 iov++;
128 } 128 }
129 129
130 return len; 130 return len;
131 } 131 }
132 132
133 /* 133 /*
134 * Pre-fault in the user memory, so we can use atomic copies. 134 * Pre-fault in the user memory, so we can use atomic copies.
135 */ 135 */
136 static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len) 136 static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
137 { 137 {
138 while (!iov->iov_len) 138 while (!iov->iov_len)
139 iov++; 139 iov++;
140 140
141 while (len > 0) { 141 while (len > 0) {
142 unsigned long this_len; 142 unsigned long this_len;
143 143
144 this_len = min_t(unsigned long, len, iov->iov_len); 144 this_len = min_t(unsigned long, len, iov->iov_len);
145 fault_in_pages_readable(iov->iov_base, this_len); 145 fault_in_pages_readable(iov->iov_base, this_len);
146 len -= this_len; 146 len -= this_len;
147 iov++; 147 iov++;
148 } 148 }
149 } 149 }
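
Together these helpers enable a two-phase copy: prefault the user pages, attempt the atomic (non-sleeping) copy under an atomic kmap, and fall back to the sleeping copy only when a fault slips through anyway. A sketch of the consumer-side pattern, matching the redo:/redo1:/redo2: loops in pipe_read() and pipe_write() below:

static int copy_buf_to_user(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf,
			    struct iovec *iov, size_t chars)
{
	/* Prefault; if every page is in, try the fast atomic copy. */
	int atomic = !iov_fault_in_pages_write(iov, chars);
	void *addr;
	int error;
redo:
	addr = buf->ops->map(pipe, buf, atomic);
	error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
	buf->ops->unmap(pipe, buf, addr);	/* always pair with map */
	if (error && atomic) {
		atomic = 0;		/* retry on the slow, sleeping path */
		goto redo;
	}
	return error;
}
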
150 150
151 static void anon_pipe_buf_release(struct pipe_inode_info *pipe, 151 static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
152 struct pipe_buffer *buf) 152 struct pipe_buffer *buf)
153 { 153 {
154 struct page *page = buf->page; 154 struct page *page = buf->page;
155 155
156 /* 156 /*
157 * If nobody else uses this page, and we don't already have a 157 * If nobody else uses this page, and we don't already have a
158 * temporary page, let's keep track of it as a one-deep 158 * temporary page, let's keep track of it as a one-deep
159 * allocation cache. (Otherwise just release our reference to it) 159 * allocation cache. (Otherwise just release our reference to it)
160 */ 160 */
161 if (page_count(page) == 1 && !pipe->tmp_page) 161 if (page_count(page) == 1 && !pipe->tmp_page)
162 pipe->tmp_page = page; 162 pipe->tmp_page = page;
163 else 163 else
164 page_cache_release(page); 164 page_cache_release(page);
165 } 165 }
166 166
167 void *generic_pipe_buf_map(struct pipe_inode_info *pipe, 167 void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
168 struct pipe_buffer *buf, int atomic) 168 struct pipe_buffer *buf, int atomic)
169 { 169 {
170 if (atomic) { 170 if (atomic) {
171 buf->flags |= PIPE_BUF_FLAG_ATOMIC; 171 buf->flags |= PIPE_BUF_FLAG_ATOMIC;
172 return kmap_atomic(buf->page, KM_USER0); 172 return kmap_atomic(buf->page, KM_USER0);
173 } 173 }
174 174
175 return kmap(buf->page); 175 return kmap(buf->page);
176 } 176 }
177 177
178 void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, 178 void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
179 struct pipe_buffer *buf, void *map_data) 179 struct pipe_buffer *buf, void *map_data)
180 { 180 {
181 if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { 181 if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
182 buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; 182 buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
183 kunmap_atomic(map_data, KM_USER0); 183 kunmap_atomic(map_data, KM_USER0);
184 } else 184 } else
185 kunmap(buf->page); 185 kunmap(buf->page);
186 } 186 }
187 187
188 int generic_pipe_buf_steal(struct pipe_inode_info *pipe, 188 int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
189 struct pipe_buffer *buf) 189 struct pipe_buffer *buf)
190 { 190 {
191 struct page *page = buf->page; 191 struct page *page = buf->page;
192 192
193 if (page_count(page) == 1) { 193 if (page_count(page) == 1) {
194 lock_page(page); 194 lock_page(page);
195 return 0; 195 return 0;
196 } 196 }
197 197
198 return 1; 198 return 1;
199 } 199 }
200 200
201 void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) 201 void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf)
202 { 202 {
203 page_cache_get(buf->page); 203 page_cache_get(buf->page);
204 } 204 }
205 205
206 int generic_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf) 206 int generic_pipe_buf_confirm(struct pipe_inode_info *info,
207 struct pipe_buffer *buf)
207 { 208 {
208 return 0; 209 return 0;
209 } 210 }
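
For anonymous pipe pages the data is always resident, so generic_pipe_buf_confirm() can return 0 unconditionally. The rename matters for buffers whose backing page may not be ready, e.g. page-cache pages queued by splice. A simplified sketch of such a ->confirm, loosely modelled on the page-cache buffer ops in fs/splice.c (not the exact implementation):

static int my_page_cache_buf_confirm(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	if (!PageUptodate(page)) {
		lock_page(page);
		/* Truncated away while queued in the pipe? */
		if (!page->mapping) {
			unlock_page(page);
			return -ENODATA;
		}
		/* Read completed but failed? */
		if (!PageUptodate(page)) {
			unlock_page(page);
			return -EIO;
		}
		unlock_page(page);
	}
	return 0;
}
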
210 211
211 static const struct pipe_buf_operations anon_pipe_buf_ops = { 212 static const struct pipe_buf_operations anon_pipe_buf_ops = {
212 .can_merge = 1, 213 .can_merge = 1,
213 .map = generic_pipe_buf_map, 214 .map = generic_pipe_buf_map,
214 .unmap = generic_pipe_buf_unmap, 215 .unmap = generic_pipe_buf_unmap,
215 .pin = generic_pipe_buf_pin, 216 .confirm = generic_pipe_buf_confirm,
216 .release = anon_pipe_buf_release, 217 .release = anon_pipe_buf_release,
217 .steal = generic_pipe_buf_steal, 218 .steal = generic_pipe_buf_steal,
218 .get = generic_pipe_buf_get, 219 .get = generic_pipe_buf_get,
219 }; 220 };
220 221
221 static ssize_t 222 static ssize_t
222 pipe_read(struct kiocb *iocb, const struct iovec *_iov, 223 pipe_read(struct kiocb *iocb, const struct iovec *_iov,
223 unsigned long nr_segs, loff_t pos) 224 unsigned long nr_segs, loff_t pos)
224 { 225 {
225 struct file *filp = iocb->ki_filp; 226 struct file *filp = iocb->ki_filp;
226 struct inode *inode = filp->f_path.dentry->d_inode; 227 struct inode *inode = filp->f_path.dentry->d_inode;
227 struct pipe_inode_info *pipe; 228 struct pipe_inode_info *pipe;
228 int do_wakeup; 229 int do_wakeup;
229 ssize_t ret; 230 ssize_t ret;
230 struct iovec *iov = (struct iovec *)_iov; 231 struct iovec *iov = (struct iovec *)_iov;
231 size_t total_len; 232 size_t total_len;
232 233
233 total_len = iov_length(iov, nr_segs); 234 total_len = iov_length(iov, nr_segs);
234 /* Null read succeeds. */ 235 /* Null read succeeds. */
235 if (unlikely(total_len == 0)) 236 if (unlikely(total_len == 0))
236 return 0; 237 return 0;
237 238
238 do_wakeup = 0; 239 do_wakeup = 0;
239 ret = 0; 240 ret = 0;
240 mutex_lock(&inode->i_mutex); 241 mutex_lock(&inode->i_mutex);
241 pipe = inode->i_pipe; 242 pipe = inode->i_pipe;
242 for (;;) { 243 for (;;) {
243 int bufs = pipe->nrbufs; 244 int bufs = pipe->nrbufs;
244 if (bufs) { 245 if (bufs) {
245 int curbuf = pipe->curbuf; 246 int curbuf = pipe->curbuf;
246 struct pipe_buffer *buf = pipe->bufs + curbuf; 247 struct pipe_buffer *buf = pipe->bufs + curbuf;
247 const struct pipe_buf_operations *ops = buf->ops; 248 const struct pipe_buf_operations *ops = buf->ops;
248 void *addr; 249 void *addr;
249 size_t chars = buf->len; 250 size_t chars = buf->len;
250 int error, atomic; 251 int error, atomic;
251 252
252 if (chars > total_len) 253 if (chars > total_len)
253 chars = total_len; 254 chars = total_len;
254 255
255 error = ops->pin(pipe, buf); 256 error = ops->confirm(pipe, buf);
256 if (error) { 257 if (error) {
257 if (!ret) 258 if (!ret)
258 error = ret; 259 error = ret;
259 break; 260 break;
260 } 261 }
261 262
262 atomic = !iov_fault_in_pages_write(iov, chars); 263 atomic = !iov_fault_in_pages_write(iov, chars);
263 redo: 264 redo:
264 addr = ops->map(pipe, buf, atomic); 265 addr = ops->map(pipe, buf, atomic);
265 error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); 266 error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
266 ops->unmap(pipe, buf, addr); 267 ops->unmap(pipe, buf, addr);
267 if (unlikely(error)) { 268 if (unlikely(error)) {
268 /* 269 /*
269 * Just retry with the slow path if we failed. 270 * Just retry with the slow path if we failed.
270 */ 271 */
271 if (atomic) { 272 if (atomic) {
272 atomic = 0; 273 atomic = 0;
273 goto redo; 274 goto redo;
274 } 275 }
275 if (!ret) 276 if (!ret)
276 ret = error; 277 ret = error;
277 break; 278 break;
278 } 279 }
279 ret += chars; 280 ret += chars;
280 buf->offset += chars; 281 buf->offset += chars;
281 buf->len -= chars; 282 buf->len -= chars;
282 if (!buf->len) { 283 if (!buf->len) {
283 buf->ops = NULL; 284 buf->ops = NULL;
284 ops->release(pipe, buf); 285 ops->release(pipe, buf);
285 curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); 286 curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
286 pipe->curbuf = curbuf; 287 pipe->curbuf = curbuf;
287 pipe->nrbufs = --bufs; 288 pipe->nrbufs = --bufs;
288 do_wakeup = 1; 289 do_wakeup = 1;
289 } 290 }
290 total_len -= chars; 291 total_len -= chars;
291 if (!total_len) 292 if (!total_len)
292 break; /* common path: read succeeded */ 293 break; /* common path: read succeeded */
293 } 294 }
294 if (bufs) /* More to do? */ 295 if (bufs) /* More to do? */
295 continue; 296 continue;
296 if (!pipe->writers) 297 if (!pipe->writers)
297 break; 298 break;
298 if (!pipe->waiting_writers) { 299 if (!pipe->waiting_writers) {
299 /* syscall merging: Usually we must not sleep 300 /* syscall merging: Usually we must not sleep
300 * if O_NONBLOCK is set, or if we got some data. 301 * if O_NONBLOCK is set, or if we got some data.
301 * But if a writer sleeps in kernel space, then 302 * But if a writer sleeps in kernel space, then
302 * we can wait for that data without violating POSIX. 303 * we can wait for that data without violating POSIX.
303 */ 304 */
304 if (ret) 305 if (ret)
305 break; 306 break;
306 if (filp->f_flags & O_NONBLOCK) { 307 if (filp->f_flags & O_NONBLOCK) {
307 ret = -EAGAIN; 308 ret = -EAGAIN;
308 break; 309 break;
309 } 310 }
310 } 311 }
311 if (signal_pending(current)) { 312 if (signal_pending(current)) {
312 if (!ret) 313 if (!ret)
313 ret = -ERESTARTSYS; 314 ret = -ERESTARTSYS;
314 break; 315 break;
315 } 316 }
316 if (do_wakeup) { 317 if (do_wakeup) {
317 wake_up_interruptible_sync(&pipe->wait); 318 wake_up_interruptible_sync(&pipe->wait);
318 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 319 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
319 } 320 }
320 pipe_wait(pipe); 321 pipe_wait(pipe);
321 } 322 }
322 mutex_unlock(&inode->i_mutex); 323 mutex_unlock(&inode->i_mutex);
323 324
324 /* Signal writers asynchronously that there is more room. */ 325 /* Signal writers asynchronously that there is more room. */
325 if (do_wakeup) { 326 if (do_wakeup) {
326 wake_up_interruptible(&pipe->wait); 327 wake_up_interruptible(&pipe->wait);
327 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 328 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
328 } 329 }
329 if (ret > 0) 330 if (ret > 0)
330 file_accessed(filp); 331 file_accessed(filp);
331 return ret; 332 return ret;
332 } 333 }
333 334
334 static ssize_t 335 static ssize_t
335 pipe_write(struct kiocb *iocb, const struct iovec *_iov, 336 pipe_write(struct kiocb *iocb, const struct iovec *_iov,
336 unsigned long nr_segs, loff_t ppos) 337 unsigned long nr_segs, loff_t ppos)
337 { 338 {
338 struct file *filp = iocb->ki_filp; 339 struct file *filp = iocb->ki_filp;
339 struct inode *inode = filp->f_path.dentry->d_inode; 340 struct inode *inode = filp->f_path.dentry->d_inode;
340 struct pipe_inode_info *pipe; 341 struct pipe_inode_info *pipe;
341 ssize_t ret; 342 ssize_t ret;
342 int do_wakeup; 343 int do_wakeup;
343 struct iovec *iov = (struct iovec *)_iov; 344 struct iovec *iov = (struct iovec *)_iov;
344 size_t total_len; 345 size_t total_len;
345 ssize_t chars; 346 ssize_t chars;
346 347
347 total_len = iov_length(iov, nr_segs); 348 total_len = iov_length(iov, nr_segs);
348 /* Null write succeeds. */ 349 /* Null write succeeds. */
349 if (unlikely(total_len == 0)) 350 if (unlikely(total_len == 0))
350 return 0; 351 return 0;
351 352
352 do_wakeup = 0; 353 do_wakeup = 0;
353 ret = 0; 354 ret = 0;
354 mutex_lock(&inode->i_mutex); 355 mutex_lock(&inode->i_mutex);
355 pipe = inode->i_pipe; 356 pipe = inode->i_pipe;
356 357
357 if (!pipe->readers) { 358 if (!pipe->readers) {
358 send_sig(SIGPIPE, current, 0); 359 send_sig(SIGPIPE, current, 0);
359 ret = -EPIPE; 360 ret = -EPIPE;
360 goto out; 361 goto out;
361 } 362 }
362 363
363 /* We try to merge small writes */ 364 /* We try to merge small writes */
364 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ 365 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
365 if (pipe->nrbufs && chars != 0) { 366 if (pipe->nrbufs && chars != 0) {
366 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & 367 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
367 (PIPE_BUFFERS-1); 368 (PIPE_BUFFERS-1);
368 struct pipe_buffer *buf = pipe->bufs + lastbuf; 369 struct pipe_buffer *buf = pipe->bufs + lastbuf;
369 const struct pipe_buf_operations *ops = buf->ops; 370 const struct pipe_buf_operations *ops = buf->ops;
370 int offset = buf->offset + buf->len; 371 int offset = buf->offset + buf->len;
371 372
372 if (ops->can_merge && offset + chars <= PAGE_SIZE) { 373 if (ops->can_merge && offset + chars <= PAGE_SIZE) {
373 int error, atomic = 1; 374 int error, atomic = 1;
374 void *addr; 375 void *addr;
375 376
376 error = ops->pin(pipe, buf); 377 error = ops->confirm(pipe, buf);
377 if (error) 378 if (error)
378 goto out; 379 goto out;
379 380
380 iov_fault_in_pages_read(iov, chars); 381 iov_fault_in_pages_read(iov, chars);
381 redo1: 382 redo1:
382 addr = ops->map(pipe, buf, atomic); 383 addr = ops->map(pipe, buf, atomic);
383 error = pipe_iov_copy_from_user(offset + addr, iov, 384 error = pipe_iov_copy_from_user(offset + addr, iov,
384 chars, atomic); 385 chars, atomic);
385 ops->unmap(pipe, buf, addr); 386 ops->unmap(pipe, buf, addr);
386 ret = error; 387 ret = error;
387 do_wakeup = 1; 388 do_wakeup = 1;
388 if (error) { 389 if (error) {
389 if (atomic) { 390 if (atomic) {
390 atomic = 0; 391 atomic = 0;
391 goto redo1; 392 goto redo1;
392 } 393 }
393 goto out; 394 goto out;
394 } 395 }
395 buf->len += chars; 396 buf->len += chars;
396 total_len -= chars; 397 total_len -= chars;
397 ret = chars; 398 ret = chars;
398 if (!total_len) 399 if (!total_len)
399 goto out; 400 goto out;
400 } 401 }
401 } 402 }
402 403
403 for (;;) { 404 for (;;) {
404 int bufs; 405 int bufs;
405 406
406 if (!pipe->readers) { 407 if (!pipe->readers) {
407 send_sig(SIGPIPE, current, 0); 408 send_sig(SIGPIPE, current, 0);
408 if (!ret) 409 if (!ret)
409 ret = -EPIPE; 410 ret = -EPIPE;
410 break; 411 break;
411 } 412 }
412 bufs = pipe->nrbufs; 413 bufs = pipe->nrbufs;
413 if (bufs < PIPE_BUFFERS) { 414 if (bufs < PIPE_BUFFERS) {
414 int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); 415 int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
415 struct pipe_buffer *buf = pipe->bufs + newbuf; 416 struct pipe_buffer *buf = pipe->bufs + newbuf;
416 struct page *page = pipe->tmp_page; 417 struct page *page = pipe->tmp_page;
417 char *src; 418 char *src;
418 int error, atomic = 1; 419 int error, atomic = 1;
419 420
420 if (!page) { 421 if (!page) {
421 page = alloc_page(GFP_HIGHUSER); 422 page = alloc_page(GFP_HIGHUSER);
422 if (unlikely(!page)) { 423 if (unlikely(!page)) {
423 ret = ret ? : -ENOMEM; 424 ret = ret ? : -ENOMEM;
424 break; 425 break;
425 } 426 }
426 pipe->tmp_page = page; 427 pipe->tmp_page = page;
427 } 428 }
428 /* Always wake up, even if the copy fails. Otherwise 429 /* Always wake up, even if the copy fails. Otherwise
429 * we lock up (O_NONBLOCK-)readers that sleep due to 430 * we lock up (O_NONBLOCK-)readers that sleep due to
430 * syscall merging. 431 * syscall merging.
431 * FIXME! Is this really true? 432 * FIXME! Is this really true?
432 */ 433 */
433 do_wakeup = 1; 434 do_wakeup = 1;
434 chars = PAGE_SIZE; 435 chars = PAGE_SIZE;
435 if (chars > total_len) 436 if (chars > total_len)
436 chars = total_len; 437 chars = total_len;
437 438
438 iov_fault_in_pages_read(iov, chars); 439 iov_fault_in_pages_read(iov, chars);
439 redo2: 440 redo2:
440 if (atomic) 441 if (atomic)
441 src = kmap_atomic(page, KM_USER0); 442 src = kmap_atomic(page, KM_USER0);
442 else 443 else
443 src = kmap(page); 444 src = kmap(page);
444 445
445 error = pipe_iov_copy_from_user(src, iov, chars, 446 error = pipe_iov_copy_from_user(src, iov, chars,
446 atomic); 447 atomic);
447 if (atomic) 448 if (atomic)
448 kunmap_atomic(src, KM_USER0); 449 kunmap_atomic(src, KM_USER0);
449 else 450 else
450 kunmap(page); 451 kunmap(page);
451 452
452 if (unlikely(error)) { 453 if (unlikely(error)) {
453 if (atomic) { 454 if (atomic) {
454 atomic = 0; 455 atomic = 0;
455 goto redo2; 456 goto redo2;
456 } 457 }
457 if (!ret) 458 if (!ret)
458 ret = error; 459 ret = error;
459 break; 460 break;
460 } 461 }
461 ret += chars; 462 ret += chars;
462 463
463 /* Insert it into the buffer array */ 464 /* Insert it into the buffer array */
464 buf->page = page; 465 buf->page = page;
465 buf->ops = &anon_pipe_buf_ops; 466 buf->ops = &anon_pipe_buf_ops;
466 buf->offset = 0; 467 buf->offset = 0;
467 buf->len = chars; 468 buf->len = chars;
468 pipe->nrbufs = ++bufs; 469 pipe->nrbufs = ++bufs;
469 pipe->tmp_page = NULL; 470 pipe->tmp_page = NULL;
470 471
471 total_len -= chars; 472 total_len -= chars;
472 if (!total_len) 473 if (!total_len)
473 break; 474 break;
474 } 475 }
475 if (bufs < PIPE_BUFFERS) 476 if (bufs < PIPE_BUFFERS)
476 continue; 477 continue;
477 if (filp->f_flags & O_NONBLOCK) { 478 if (filp->f_flags & O_NONBLOCK) {
478 if (!ret) 479 if (!ret)
479 ret = -EAGAIN; 480 ret = -EAGAIN;
480 break; 481 break;
481 } 482 }
482 if (signal_pending(current)) { 483 if (signal_pending(current)) {
483 if (!ret) 484 if (!ret)
484 ret = -ERESTARTSYS; 485 ret = -ERESTARTSYS;
485 break; 486 break;
486 } 487 }
487 if (do_wakeup) { 488 if (do_wakeup) {
488 wake_up_interruptible_sync(&pipe->wait); 489 wake_up_interruptible_sync(&pipe->wait);
489 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 490 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
490 do_wakeup = 0; 491 do_wakeup = 0;
491 } 492 }
492 pipe->waiting_writers++; 493 pipe->waiting_writers++;
493 pipe_wait(pipe); 494 pipe_wait(pipe);
494 pipe->waiting_writers--; 495 pipe->waiting_writers--;
495 } 496 }
496 out: 497 out:
497 mutex_unlock(&inode->i_mutex); 498 mutex_unlock(&inode->i_mutex);
498 if (do_wakeup) { 499 if (do_wakeup) {
499 wake_up_interruptible(&pipe->wait); 500 wake_up_interruptible(&pipe->wait);
500 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 501 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
501 } 502 }
502 if (ret > 0) 503 if (ret > 0)
503 file_update_time(filp); 504 file_update_time(filp);
504 return ret; 505 return ret;
505 } 506 }
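
The merge fast path at the top of pipe_write() is why a stream of small writes does not burn one page per write: the tail of a write is appended to the previous buffer when its ops allow merging and it still fits in the same page. A sketch of just that test, extracted from the code above:

static int can_append_to_last_buf(struct pipe_inode_info *pipe, size_t chars)
{
	int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & (PIPE_BUFFERS - 1);
	struct pipe_buffer *buf = pipe->bufs + lastbuf;

	return pipe->nrbufs && buf->ops->can_merge &&
	       buf->offset + buf->len + chars <= PAGE_SIZE;
}
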
506 507
507 static ssize_t 508 static ssize_t
508 bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) 509 bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
509 { 510 {
510 return -EBADF; 511 return -EBADF;
511 } 512 }
512 513
513 static ssize_t 514 static ssize_t
514 bad_pipe_w(struct file *filp, const char __user *buf, size_t count, 515 bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
515 loff_t *ppos) 516 loff_t *ppos)
516 { 517 {
517 return -EBADF; 518 return -EBADF;
518 } 519 }
519 520
520 static int 521 static int
521 pipe_ioctl(struct inode *pino, struct file *filp, 522 pipe_ioctl(struct inode *pino, struct file *filp,
522 unsigned int cmd, unsigned long arg) 523 unsigned int cmd, unsigned long arg)
523 { 524 {
524 struct inode *inode = filp->f_path.dentry->d_inode; 525 struct inode *inode = filp->f_path.dentry->d_inode;
525 struct pipe_inode_info *pipe; 526 struct pipe_inode_info *pipe;
526 int count, buf, nrbufs; 527 int count, buf, nrbufs;
527 528
528 switch (cmd) { 529 switch (cmd) {
529 case FIONREAD: 530 case FIONREAD:
530 mutex_lock(&inode->i_mutex); 531 mutex_lock(&inode->i_mutex);
531 pipe = inode->i_pipe; 532 pipe = inode->i_pipe;
532 count = 0; 533 count = 0;
533 buf = pipe->curbuf; 534 buf = pipe->curbuf;
534 nrbufs = pipe->nrbufs; 535 nrbufs = pipe->nrbufs;
535 while (--nrbufs >= 0) { 536 while (--nrbufs >= 0) {
536 count += pipe->bufs[buf].len; 537 count += pipe->bufs[buf].len;
537 buf = (buf+1) & (PIPE_BUFFERS-1); 538 buf = (buf+1) & (PIPE_BUFFERS-1);
538 } 539 }
539 mutex_unlock(&inode->i_mutex); 540 mutex_unlock(&inode->i_mutex);
540 541
541 return put_user(count, (int __user *)arg); 542 return put_user(count, (int __user *)arg);
542 default: 543 default:
543 return -EINVAL; 544 return -EINVAL;
544 } 545 }
545 } 546 }
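
FIONREAD is the only pipe ioctl: it walks the buffer ring and totals the queued bytes. A self-contained userspace check:

#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int pfd[2], queued = 0;

	if (pipe(pfd) < 0)
		return 1;
	write(pfd[1], "hello", 5);
	if (ioctl(pfd[0], FIONREAD, &queued) == 0)
		printf("%d bytes queued\n", queued);	/* prints 5 */
	return 0;
}
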
546 547
547 /* No kernel lock held - fine */ 548 /* No kernel lock held - fine */
548 static unsigned int 549 static unsigned int
549 pipe_poll(struct file *filp, poll_table *wait) 550 pipe_poll(struct file *filp, poll_table *wait)
550 { 551 {
551 unsigned int mask; 552 unsigned int mask;
552 struct inode *inode = filp->f_path.dentry->d_inode; 553 struct inode *inode = filp->f_path.dentry->d_inode;
553 struct pipe_inode_info *pipe = inode->i_pipe; 554 struct pipe_inode_info *pipe = inode->i_pipe;
554 int nrbufs; 555 int nrbufs;
555 556
556 poll_wait(filp, &pipe->wait, wait); 557 poll_wait(filp, &pipe->wait, wait);
557 558
558 /* Reading only -- no need for acquiring the semaphore. */ 559 /* Reading only -- no need for acquiring the semaphore. */
559 nrbufs = pipe->nrbufs; 560 nrbufs = pipe->nrbufs;
560 mask = 0; 561 mask = 0;
561 if (filp->f_mode & FMODE_READ) { 562 if (filp->f_mode & FMODE_READ) {
562 mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; 563 mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
563 if (!pipe->writers && filp->f_version != pipe->w_counter) 564 if (!pipe->writers && filp->f_version != pipe->w_counter)
564 mask |= POLLHUP; 565 mask |= POLLHUP;
565 } 566 }
566 567
567 if (filp->f_mode & FMODE_WRITE) { 568 if (filp->f_mode & FMODE_WRITE) {
568 mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; 569 mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
569 /* 570 /*
570 * Most Unices do not set POLLERR for FIFOs but on Linux they 571 * Most Unices do not set POLLERR for FIFOs but on Linux they
571 * behave exactly like pipes for poll(). 572 * behave exactly like pipes for poll().
572 */ 573 */
573 if (!pipe->readers) 574 if (!pipe->readers)
574 mask |= POLLERR; 575 mask |= POLLERR;
575 } 576 }
576 577
577 return mask; 578 return mask;
578 } 579 }
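
The w_counter test above is what turns "all writers gone" into POLLHUP on the read end, observable from userspace:

#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int pfd[2];
	struct pollfd p;

	if (pipe(pfd) < 0)
		return 1;
	close(pfd[1]);			/* last writer goes away */
	p.fd = pfd[0];
	p.events = POLLIN;
	poll(&p, 1, 0);
	printf("POLLHUP set: %d\n", !!(p.revents & POLLHUP));	/* 1 */
	return 0;
}
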
579 580
580 static int 581 static int
581 pipe_release(struct inode *inode, int decr, int decw) 582 pipe_release(struct inode *inode, int decr, int decw)
582 { 583 {
583 struct pipe_inode_info *pipe; 584 struct pipe_inode_info *pipe;
584 585
585 mutex_lock(&inode->i_mutex); 586 mutex_lock(&inode->i_mutex);
586 pipe = inode->i_pipe; 587 pipe = inode->i_pipe;
587 pipe->readers -= decr; 588 pipe->readers -= decr;
588 pipe->writers -= decw; 589 pipe->writers -= decw;
589 590
590 if (!pipe->readers && !pipe->writers) { 591 if (!pipe->readers && !pipe->writers) {
591 free_pipe_info(inode); 592 free_pipe_info(inode);
592 } else { 593 } else {
593 wake_up_interruptible(&pipe->wait); 594 wake_up_interruptible(&pipe->wait);
594 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 595 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
595 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 596 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
596 } 597 }
597 mutex_unlock(&inode->i_mutex); 598 mutex_unlock(&inode->i_mutex);
598 599
599 return 0; 600 return 0;
600 } 601 }
601 602
602 static int 603 static int
603 pipe_read_fasync(int fd, struct file *filp, int on) 604 pipe_read_fasync(int fd, struct file *filp, int on)
604 { 605 {
605 struct inode *inode = filp->f_path.dentry->d_inode; 606 struct inode *inode = filp->f_path.dentry->d_inode;
606 int retval; 607 int retval;
607 608
608 mutex_lock(&inode->i_mutex); 609 mutex_lock(&inode->i_mutex);
609 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); 610 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
610 mutex_unlock(&inode->i_mutex); 611 mutex_unlock(&inode->i_mutex);
611 612
612 if (retval < 0) 613 if (retval < 0)
613 return retval; 614 return retval;
614 615
615 return 0; 616 return 0;
616 } 617 }
617 618
618 619
619 static int 620 static int
620 pipe_write_fasync(int fd, struct file *filp, int on) 621 pipe_write_fasync(int fd, struct file *filp, int on)
621 { 622 {
622 struct inode *inode = filp->f_path.dentry->d_inode; 623 struct inode *inode = filp->f_path.dentry->d_inode;
623 int retval; 624 int retval;
624 625
625 mutex_lock(&inode->i_mutex); 626 mutex_lock(&inode->i_mutex);
626 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); 627 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
627 mutex_unlock(&inode->i_mutex); 628 mutex_unlock(&inode->i_mutex);
628 629
629 if (retval < 0) 630 if (retval < 0)
630 return retval; 631 return retval;
631 632
632 return 0; 633 return 0;
633 } 634 }
634 635
635 636
636 static int 637 static int
637 pipe_rdwr_fasync(int fd, struct file *filp, int on) 638 pipe_rdwr_fasync(int fd, struct file *filp, int on)
638 { 639 {
639 struct inode *inode = filp->f_path.dentry->d_inode; 640 struct inode *inode = filp->f_path.dentry->d_inode;
640 struct pipe_inode_info *pipe = inode->i_pipe; 641 struct pipe_inode_info *pipe = inode->i_pipe;
641 int retval; 642 int retval;
642 643
643 mutex_lock(&inode->i_mutex); 644 mutex_lock(&inode->i_mutex);
644 645
645 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); 646 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
646 647
647 if (retval >= 0) 648 if (retval >= 0)
648 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); 649 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
649 650
650 mutex_unlock(&inode->i_mutex); 651 mutex_unlock(&inode->i_mutex);
651 652
652 if (retval < 0) 653 if (retval < 0)
653 return retval; 654 return retval;
654 655
655 return 0; 656 return 0;
656 } 657 }
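
These fasync hooks feed the kill_fasync() calls in pipe_read() and pipe_write(). From userspace, signal-driven pipe I/O is enabled with F_SETOWN plus O_ASYNC; a sketch (the polling wait at the end is just to keep the demo race-free):

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_sigio;

static void on_sigio(int sig)
{
	got_sigio = 1;
}

int main(void)
{
	int pfd[2], i;

	if (pipe(pfd) < 0)
		return 1;
	signal(SIGIO, on_sigio);
	fcntl(pfd[0], F_SETOWN, getpid());	/* deliver SIGIO to us */
	fcntl(pfd[0], F_SETFL, fcntl(pfd[0], F_GETFL) | O_ASYNC);
	write(pfd[1], "x", 1);			/* kill_fasync() fires */
	for (i = 0; i < 100 && !got_sigio; i++)
		usleep(10000);			/* wait up to ~1s */
	printf("got_sigio=%d\n", got_sigio);
	return 0;
}
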
657 658
658 659
659 static int 660 static int
660 pipe_read_release(struct inode *inode, struct file *filp) 661 pipe_read_release(struct inode *inode, struct file *filp)
661 { 662 {
662 pipe_read_fasync(-1, filp, 0); 663 pipe_read_fasync(-1, filp, 0);
663 return pipe_release(inode, 1, 0); 664 return pipe_release(inode, 1, 0);
664 } 665 }
665 666
666 static int 667 static int
667 pipe_write_release(struct inode *inode, struct file *filp) 668 pipe_write_release(struct inode *inode, struct file *filp)
668 { 669 {
669 pipe_write_fasync(-1, filp, 0); 670 pipe_write_fasync(-1, filp, 0);
670 return pipe_release(inode, 0, 1); 671 return pipe_release(inode, 0, 1);
671 } 672 }
672 673
673 static int 674 static int
674 pipe_rdwr_release(struct inode *inode, struct file *filp) 675 pipe_rdwr_release(struct inode *inode, struct file *filp)
675 { 676 {
676 int decr, decw; 677 int decr, decw;
677 678
678 pipe_rdwr_fasync(-1, filp, 0); 679 pipe_rdwr_fasync(-1, filp, 0);
679 decr = (filp->f_mode & FMODE_READ) != 0; 680 decr = (filp->f_mode & FMODE_READ) != 0;
680 decw = (filp->f_mode & FMODE_WRITE) != 0; 681 decw = (filp->f_mode & FMODE_WRITE) != 0;
681 return pipe_release(inode, decr, decw); 682 return pipe_release(inode, decr, decw);
682 } 683 }
683 684
684 static int 685 static int
685 pipe_read_open(struct inode *inode, struct file *filp) 686 pipe_read_open(struct inode *inode, struct file *filp)
686 { 687 {
687 /* We could have perhaps used atomic_t, but this and friends 688 /* We could have perhaps used atomic_t, but this and friends
688 below are the only places. So it doesn't seem worthwhile. */ 689 below are the only places. So it doesn't seem worthwhile. */
689 mutex_lock(&inode->i_mutex); 690 mutex_lock(&inode->i_mutex);
690 inode->i_pipe->readers++; 691 inode->i_pipe->readers++;
691 mutex_unlock(&inode->i_mutex); 692 mutex_unlock(&inode->i_mutex);
692 693
693 return 0; 694 return 0;
694 } 695 }
695 696
696 static int 697 static int
697 pipe_write_open(struct inode *inode, struct file *filp) 698 pipe_write_open(struct inode *inode, struct file *filp)
698 { 699 {
699 mutex_lock(&inode->i_mutex); 700 mutex_lock(&inode->i_mutex);
700 inode->i_pipe->writers++; 701 inode->i_pipe->writers++;
701 mutex_unlock(&inode->i_mutex); 702 mutex_unlock(&inode->i_mutex);
702 703
703 return 0; 704 return 0;
704 } 705 }
705 706
706 static int 707 static int
707 pipe_rdwr_open(struct inode *inode, struct file *filp) 708 pipe_rdwr_open(struct inode *inode, struct file *filp)
708 { 709 {
709 mutex_lock(&inode->i_mutex); 710 mutex_lock(&inode->i_mutex);
710 if (filp->f_mode & FMODE_READ) 711 if (filp->f_mode & FMODE_READ)
711 inode->i_pipe->readers++; 712 inode->i_pipe->readers++;
712 if (filp->f_mode & FMODE_WRITE) 713 if (filp->f_mode & FMODE_WRITE)
713 inode->i_pipe->writers++; 714 inode->i_pipe->writers++;
714 mutex_unlock(&inode->i_mutex); 715 mutex_unlock(&inode->i_mutex);
715 716
716 return 0; 717 return 0;
717 } 718 }
718 719
719 /* 720 /*
720 * The file_operations structs are not static because they 721 * The file_operations structs are not static because they
721 * are also used in linux/fs/fifo.c to do operations on FIFOs. 722 * are also used in linux/fs/fifo.c to do operations on FIFOs.
722 */ 723 */
723 const struct file_operations read_fifo_fops = { 724 const struct file_operations read_fifo_fops = {
724 .llseek = no_llseek, 725 .llseek = no_llseek,
725 .read = do_sync_read, 726 .read = do_sync_read,
726 .aio_read = pipe_read, 727 .aio_read = pipe_read,
727 .write = bad_pipe_w, 728 .write = bad_pipe_w,
728 .poll = pipe_poll, 729 .poll = pipe_poll,
729 .ioctl = pipe_ioctl, 730 .ioctl = pipe_ioctl,
730 .open = pipe_read_open, 731 .open = pipe_read_open,
731 .release = pipe_read_release, 732 .release = pipe_read_release,
732 .fasync = pipe_read_fasync, 733 .fasync = pipe_read_fasync,
733 }; 734 };
734 735
735 const struct file_operations write_fifo_fops = { 736 const struct file_operations write_fifo_fops = {
736 .llseek = no_llseek, 737 .llseek = no_llseek,
737 .read = bad_pipe_r, 738 .read = bad_pipe_r,
738 .write = do_sync_write, 739 .write = do_sync_write,
739 .aio_write = pipe_write, 740 .aio_write = pipe_write,
740 .poll = pipe_poll, 741 .poll = pipe_poll,
741 .ioctl = pipe_ioctl, 742 .ioctl = pipe_ioctl,
742 .open = pipe_write_open, 743 .open = pipe_write_open,
743 .release = pipe_write_release, 744 .release = pipe_write_release,
744 .fasync = pipe_write_fasync, 745 .fasync = pipe_write_fasync,
745 }; 746 };
746 747
747 const struct file_operations rdwr_fifo_fops = { 748 const struct file_operations rdwr_fifo_fops = {
748 .llseek = no_llseek, 749 .llseek = no_llseek,
749 .read = do_sync_read, 750 .read = do_sync_read,
750 .aio_read = pipe_read, 751 .aio_read = pipe_read,
751 .write = do_sync_write, 752 .write = do_sync_write,
752 .aio_write = pipe_write, 753 .aio_write = pipe_write,
753 .poll = pipe_poll, 754 .poll = pipe_poll,
754 .ioctl = pipe_ioctl, 755 .ioctl = pipe_ioctl,
755 .open = pipe_rdwr_open, 756 .open = pipe_rdwr_open,
756 .release = pipe_rdwr_release, 757 .release = pipe_rdwr_release,
757 .fasync = pipe_rdwr_fasync, 758 .fasync = pipe_rdwr_fasync,
758 }; 759 };
759 760
760 static const struct file_operations read_pipe_fops = { 761 static const struct file_operations read_pipe_fops = {
761 .llseek = no_llseek, 762 .llseek = no_llseek,
762 .read = do_sync_read, 763 .read = do_sync_read,
763 .aio_read = pipe_read, 764 .aio_read = pipe_read,
764 .write = bad_pipe_w, 765 .write = bad_pipe_w,
765 .poll = pipe_poll, 766 .poll = pipe_poll,
766 .ioctl = pipe_ioctl, 767 .ioctl = pipe_ioctl,
767 .open = pipe_read_open, 768 .open = pipe_read_open,
768 .release = pipe_read_release, 769 .release = pipe_read_release,
769 .fasync = pipe_read_fasync, 770 .fasync = pipe_read_fasync,
770 }; 771 };
771 772
772 static const struct file_operations write_pipe_fops = { 773 static const struct file_operations write_pipe_fops = {
773 .llseek = no_llseek, 774 .llseek = no_llseek,
774 .read = bad_pipe_r, 775 .read = bad_pipe_r,
775 .write = do_sync_write, 776 .write = do_sync_write,
776 .aio_write = pipe_write, 777 .aio_write = pipe_write,
777 .poll = pipe_poll, 778 .poll = pipe_poll,
778 .ioctl = pipe_ioctl, 779 .ioctl = pipe_ioctl,
779 .open = pipe_write_open, 780 .open = pipe_write_open,
780 .release = pipe_write_release, 781 .release = pipe_write_release,
781 .fasync = pipe_write_fasync, 782 .fasync = pipe_write_fasync,
782 }; 783 };
783 784
784 static const struct file_operations rdwr_pipe_fops = { 785 static const struct file_operations rdwr_pipe_fops = {
785 .llseek = no_llseek, 786 .llseek = no_llseek,
786 .read = do_sync_read, 787 .read = do_sync_read,
787 .aio_read = pipe_read, 788 .aio_read = pipe_read,
788 .write = do_sync_write, 789 .write = do_sync_write,
789 .aio_write = pipe_write, 790 .aio_write = pipe_write,
790 .poll = pipe_poll, 791 .poll = pipe_poll,
791 .ioctl = pipe_ioctl, 792 .ioctl = pipe_ioctl,
792 .open = pipe_rdwr_open, 793 .open = pipe_rdwr_open,
793 .release = pipe_rdwr_release, 794 .release = pipe_rdwr_release,
794 .fasync = pipe_rdwr_fasync, 795 .fasync = pipe_rdwr_fasync,
795 }; 796 };
796 797
797 struct pipe_inode_info * alloc_pipe_info(struct inode *inode) 798 struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
798 { 799 {
799 struct pipe_inode_info *pipe; 800 struct pipe_inode_info *pipe;
800 801
801 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); 802 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
802 if (pipe) { 803 if (pipe) {
803 init_waitqueue_head(&pipe->wait); 804 init_waitqueue_head(&pipe->wait);
804 pipe->r_counter = pipe->w_counter = 1; 805 pipe->r_counter = pipe->w_counter = 1;
805 pipe->inode = inode; 806 pipe->inode = inode;
806 } 807 }
807 808
808 return pipe; 809 return pipe;
809 } 810 }
810 811
811 void __free_pipe_info(struct pipe_inode_info *pipe) 812 void __free_pipe_info(struct pipe_inode_info *pipe)
812 { 813 {
813 int i; 814 int i;
814 815
815 for (i = 0; i < PIPE_BUFFERS; i++) { 816 for (i = 0; i < PIPE_BUFFERS; i++) {
816 struct pipe_buffer *buf = pipe->bufs + i; 817 struct pipe_buffer *buf = pipe->bufs + i;
817 if (buf->ops) 818 if (buf->ops)
818 buf->ops->release(pipe, buf); 819 buf->ops->release(pipe, buf);
819 } 820 }
820 if (pipe->tmp_page) 821 if (pipe->tmp_page)
821 __free_page(pipe->tmp_page); 822 __free_page(pipe->tmp_page);
822 kfree(pipe); 823 kfree(pipe);
823 } 824 }
824 825
825 void free_pipe_info(struct inode *inode) 826 void free_pipe_info(struct inode *inode)
826 { 827 {
827 __free_pipe_info(inode->i_pipe); 828 __free_pipe_info(inode->i_pipe);
828 inode->i_pipe = NULL; 829 inode->i_pipe = NULL;
829 } 830 }
830 831
831 static struct vfsmount *pipe_mnt __read_mostly; 832 static struct vfsmount *pipe_mnt __read_mostly;
832 static int pipefs_delete_dentry(struct dentry *dentry) 833 static int pipefs_delete_dentry(struct dentry *dentry)
833 { 834 {
834 /* 835 /*
835 * At creation time, we pretended this dentry was hashed 836 * At creation time, we pretended this dentry was hashed
836 * (by clearing DCACHE_UNHASHED bit in d_flags) 837 * (by clearing DCACHE_UNHASHED bit in d_flags)
837 * At delete time, we restore the truth : not hashed. 838 * At delete time, we restore the truth : not hashed.
838 * (so that dput() can proceed correctly) 839 * (so that dput() can proceed correctly)
839 */ 840 */
840 dentry->d_flags |= DCACHE_UNHASHED; 841 dentry->d_flags |= DCACHE_UNHASHED;
841 return 0; 842 return 0;
842 } 843 }
843 844
844 /* 845 /*
845 * pipefs_dname() is called from d_path(). 846 * pipefs_dname() is called from d_path().
846 */ 847 */
847 static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) 848 static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
848 { 849 {
849 return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", 850 return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
850 dentry->d_inode->i_ino); 851 dentry->d_inode->i_ino);
851 } 852 }
852 853
853 static struct dentry_operations pipefs_dentry_operations = { 854 static struct dentry_operations pipefs_dentry_operations = {
854 .d_delete = pipefs_delete_dentry, 855 .d_delete = pipefs_delete_dentry,
855 .d_dname = pipefs_dname, 856 .d_dname = pipefs_dname,
856 }; 857 };
857 858
858 static struct inode * get_pipe_inode(void) 859 static struct inode * get_pipe_inode(void)
859 { 860 {
860 struct inode *inode = new_inode(pipe_mnt->mnt_sb); 861 struct inode *inode = new_inode(pipe_mnt->mnt_sb);
861 struct pipe_inode_info *pipe; 862 struct pipe_inode_info *pipe;
862 863
863 if (!inode) 864 if (!inode)
864 goto fail_inode; 865 goto fail_inode;
865 866
866 pipe = alloc_pipe_info(inode); 867 pipe = alloc_pipe_info(inode);
867 if (!pipe) 868 if (!pipe)
868 goto fail_iput; 869 goto fail_iput;
869 inode->i_pipe = pipe; 870 inode->i_pipe = pipe;
870 871
871 pipe->readers = pipe->writers = 1; 872 pipe->readers = pipe->writers = 1;
872 inode->i_fop = &rdwr_pipe_fops; 873 inode->i_fop = &rdwr_pipe_fops;
873 874
874 /* 875 /*
875 * Mark the inode dirty from the very beginning, 876 * Mark the inode dirty from the very beginning,
876 * that way it will never be moved to the dirty 877 * that way it will never be moved to the dirty
877 * list because "mark_inode_dirty()" will think 878 * list because "mark_inode_dirty()" will think
878 * that it already _is_ on the dirty list. 879 * that it already _is_ on the dirty list.
879 */ 880 */
880 inode->i_state = I_DIRTY; 881 inode->i_state = I_DIRTY;
881 inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; 882 inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
882 inode->i_uid = current->fsuid; 883 inode->i_uid = current->fsuid;
883 inode->i_gid = current->fsgid; 884 inode->i_gid = current->fsgid;
884 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 885 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
885 886
886 return inode; 887 return inode;
887 888
888 fail_iput: 889 fail_iput:
889 iput(inode); 890 iput(inode);
890 891
891 fail_inode: 892 fail_inode:
892 return NULL; 893 return NULL;
893 } 894 }
894 895
895 struct file *create_write_pipe(void) 896 struct file *create_write_pipe(void)
896 { 897 {
897 int err; 898 int err;
898 struct inode *inode; 899 struct inode *inode;
899 struct file *f; 900 struct file *f;
900 struct dentry *dentry; 901 struct dentry *dentry;
901 struct qstr name = { .name = "" }; 902 struct qstr name = { .name = "" };
902 903
903 f = get_empty_filp(); 904 f = get_empty_filp();
904 if (!f) 905 if (!f)
905 return ERR_PTR(-ENFILE); 906 return ERR_PTR(-ENFILE);
906 err = -ENFILE; 907 err = -ENFILE;
907 inode = get_pipe_inode(); 908 inode = get_pipe_inode();
908 if (!inode) 909 if (!inode)
909 goto err_file; 910 goto err_file;
910 911
911 err = -ENOMEM; 912 err = -ENOMEM;
912 dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); 913 dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
913 if (!dentry) 914 if (!dentry)
914 goto err_inode; 915 goto err_inode;
915 916
916 dentry->d_op = &pipefs_dentry_operations; 917 dentry->d_op = &pipefs_dentry_operations;
917 /* 918 /*
918 * We dont want to publish this dentry into global dentry hash table. 919 * We dont want to publish this dentry into global dentry hash table.
919 * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED 920 * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
920 * This permits a working /proc/$pid/fd/XXX on pipes 921 * This permits a working /proc/$pid/fd/XXX on pipes
921 */ 922 */
922 dentry->d_flags &= ~DCACHE_UNHASHED; 923 dentry->d_flags &= ~DCACHE_UNHASHED;
923 d_instantiate(dentry, inode); 924 d_instantiate(dentry, inode);
924 f->f_path.mnt = mntget(pipe_mnt); 925 f->f_path.mnt = mntget(pipe_mnt);
925 f->f_path.dentry = dentry; 926 f->f_path.dentry = dentry;
926 f->f_mapping = inode->i_mapping; 927 f->f_mapping = inode->i_mapping;
927 928
928 f->f_flags = O_WRONLY; 929 f->f_flags = O_WRONLY;
929 f->f_op = &write_pipe_fops; 930 f->f_op = &write_pipe_fops;
930 f->f_mode = FMODE_WRITE; 931 f->f_mode = FMODE_WRITE;
931 f->f_version = 0; 932 f->f_version = 0;
932 933
933 return f; 934 return f;
934 935
935 err_inode: 936 err_inode:
936 free_pipe_info(inode); 937 free_pipe_info(inode);
937 iput(inode); 938 iput(inode);
938 err_file: 939 err_file:
939 put_filp(f); 940 put_filp(f);
940 return ERR_PTR(err); 941 return ERR_PTR(err);
941 } 942 }
942 943
943 void free_write_pipe(struct file *f) 944 void free_write_pipe(struct file *f)
944 { 945 {
945 free_pipe_info(f->f_dentry->d_inode); 946 free_pipe_info(f->f_dentry->d_inode);
946 dput(f->f_path.dentry); 947 dput(f->f_path.dentry);
947 mntput(f->f_path.mnt); 948 mntput(f->f_path.mnt);
948 put_filp(f); 949 put_filp(f);
949 } 950 }
950 951
951 struct file *create_read_pipe(struct file *wrf) 952 struct file *create_read_pipe(struct file *wrf)
952 { 953 {
953 struct file *f = get_empty_filp(); 954 struct file *f = get_empty_filp();
954 if (!f) 955 if (!f)
955 return ERR_PTR(-ENFILE); 956 return ERR_PTR(-ENFILE);
956 957
957 /* Grab pipe from the writer */ 958 /* Grab pipe from the writer */
958 f->f_path.mnt = mntget(wrf->f_path.mnt); 959 f->f_path.mnt = mntget(wrf->f_path.mnt);
959 f->f_path.dentry = dget(wrf->f_path.dentry); 960 f->f_path.dentry = dget(wrf->f_path.dentry);
960 f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping; 961 f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping;
961 962
962 f->f_pos = 0; 963 f->f_pos = 0;
963 f->f_flags = O_RDONLY; 964 f->f_flags = O_RDONLY;
964 f->f_op = &read_pipe_fops; 965 f->f_op = &read_pipe_fops;
965 f->f_mode = FMODE_READ; 966 f->f_mode = FMODE_READ;
966 f->f_version = 0; 967 f->f_version = 0;
967 968
968 return f; 969 return f;
969 } 970 }
970 971
971 int do_pipe(int *fd) 972 int do_pipe(int *fd)
972 { 973 {
973 struct file *fw, *fr; 974 struct file *fw, *fr;
974 int error; 975 int error;
975 int fdw, fdr; 976 int fdw, fdr;
976 977
977 fw = create_write_pipe(); 978 fw = create_write_pipe();
978 if (IS_ERR(fw)) 979 if (IS_ERR(fw))
979 return PTR_ERR(fw); 980 return PTR_ERR(fw);
980 fr = create_read_pipe(fw); 981 fr = create_read_pipe(fw);
981 error = PTR_ERR(fr); 982 error = PTR_ERR(fr);
982 if (IS_ERR(fr)) 983 if (IS_ERR(fr))
983 goto err_write_pipe; 984 goto err_write_pipe;
984 985
985 error = get_unused_fd(); 986 error = get_unused_fd();
986 if (error < 0) 987 if (error < 0)
987 goto err_read_pipe; 988 goto err_read_pipe;
988 fdr = error; 989 fdr = error;
989 990
990 error = get_unused_fd(); 991 error = get_unused_fd();
991 if (error < 0) 992 if (error < 0)
992 goto err_fdr; 993 goto err_fdr;
993 fdw = error; 994 fdw = error;
994 995
995 error = audit_fd_pair(fdr, fdw); 996 error = audit_fd_pair(fdr, fdw);
996 if (error < 0) 997 if (error < 0)
997 goto err_fdw; 998 goto err_fdw;
998 999
999 fd_install(fdr, fr); 1000 fd_install(fdr, fr);
1000 fd_install(fdw, fw); 1001 fd_install(fdw, fw);
1001 fd[0] = fdr; 1002 fd[0] = fdr;
1002 fd[1] = fdw; 1003 fd[1] = fdw;
1003 1004
1004 return 0; 1005 return 0;
1005 1006
1006 err_fdw: 1007 err_fdw:
1007 put_unused_fd(fdw); 1008 put_unused_fd(fdw);
1008 err_fdr: 1009 err_fdr:
1009 put_unused_fd(fdr); 1010 put_unused_fd(fdr);
1010 err_read_pipe: 1011 err_read_pipe:
1011 dput(fr->f_dentry); 1012 dput(fr->f_dentry);
1012 mntput(fr->f_vfsmnt); 1013 mntput(fr->f_vfsmnt);
1013 put_filp(fr); 1014 put_filp(fr);
1014 err_write_pipe: 1015 err_write_pipe:
1015 free_write_pipe(fw); 1016 free_write_pipe(fw);
1016 return error; 1017 return error;
1017 } 1018 }
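
do_pipe() above is the kernel-side worker behind the pipe(2) system call: it creates the write end, clones the read end from it, reserves two descriptors, and only publishes them with fd_install() once nothing can fail. A minimal userspace sketch of the interface this serves (plain POSIX, nothing here is kernel code):

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd[2];              /* fd[0]: read end, fd[1]: write end */
            char buf[16];
            ssize_t n;

            if (pipe(fd) < 0) {     /* lands in do_pipe() in this kernel */
                    perror("pipe");
                    return 1;
            }
            write(fd[1], "hello", 5);
            n = read(fd[0], buf, sizeof(buf));
            printf("read %zd bytes: %.5s\n", n, buf);
            close(fd[0]);
            close(fd[1]);
            return 0;
    }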
1018 1019
1019 /* 1020 /*
1020 * pipefs should _never_ be mounted by userland - too much of a security hassle, 1021 * pipefs should _never_ be mounted by userland - too much of a security hassle,
1021 * no real gain from having the whole whorehouse mounted. So we don't need 1022 * no real gain from having the whole whorehouse mounted. So we don't need
1022 * any operations on the root directory. However, we need a non-trivial 1023 * any operations on the root directory. However, we need a non-trivial
1023 * d_name - pipe: will go nicely and kill the special-casing in procfs. 1024 * d_name - pipe: will go nicely and kill the special-casing in procfs.
1024 */ 1025 */
1025 static int pipefs_get_sb(struct file_system_type *fs_type, 1026 static int pipefs_get_sb(struct file_system_type *fs_type,
1026 int flags, const char *dev_name, void *data, 1027 int flags, const char *dev_name, void *data,
1027 struct vfsmount *mnt) 1028 struct vfsmount *mnt)
1028 { 1029 {
1029 return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt); 1030 return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
1030 } 1031 }
1031 1032
1032 static struct file_system_type pipe_fs_type = { 1033 static struct file_system_type pipe_fs_type = {
1033 .name = "pipefs", 1034 .name = "pipefs",
1034 .get_sb = pipefs_get_sb, 1035 .get_sb = pipefs_get_sb,
1035 .kill_sb = kill_anon_super, 1036 .kill_sb = kill_anon_super,
1036 }; 1037 };
1037 1038
1038 static int __init init_pipe_fs(void) 1039 static int __init init_pipe_fs(void)
1039 { 1040 {
1040 int err = register_filesystem(&pipe_fs_type); 1041 int err = register_filesystem(&pipe_fs_type);
1041 1042
1042 if (!err) { 1043 if (!err) {
1043 pipe_mnt = kern_mount(&pipe_fs_type); 1044 pipe_mnt = kern_mount(&pipe_fs_type);
1044 if (IS_ERR(pipe_mnt)) { 1045 if (IS_ERR(pipe_mnt)) {
1045 err = PTR_ERR(pipe_mnt); 1046 err = PTR_ERR(pipe_mnt);
1046 unregister_filesystem(&pipe_fs_type); 1047 unregister_filesystem(&pipe_fs_type);
1047 } 1048 }
1048 } 1049 }
1049 return err; 1050 return err;
1050 } 1051 }
1051 1052
1052 static void __exit exit_pipe_fs(void) 1053 static void __exit exit_pipe_fs(void)
1053 { 1054 {
1054 unregister_filesystem(&pipe_fs_type); 1055 unregister_filesystem(&pipe_fs_type);
1055 mntput(pipe_mnt); 1056 mntput(pipe_mnt);
1056 } 1057 }
1057 1058
1058 fs_initcall(init_pipe_fs); 1059 fs_initcall(init_pipe_fs);
1059 module_exit(exit_pipe_fs); 1060 module_exit(exit_pipe_fs);
1060 1061
1 /* 1 /*
2 * "splice": joining two ropes together by interweaving their strands. 2 * "splice": joining two ropes together by interweaving their strands.
3 * 3 *
4 * This is the "extended pipe" functionality, where a pipe is used as 4 * This is the "extended pipe" functionality, where a pipe is used as
5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel 5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6 * buffer that you can use to transfer data from one end to the other. 6 * buffer that you can use to transfer data from one end to the other.
7 * 7 *
8 * The traditional unix read/write is extended with a "splice()" operation 8 * The traditional unix read/write is extended with a "splice()" operation
9 * that transfers data buffers to or from a pipe buffer. 9 * that transfers data buffers to or from a pipe buffer.
10 * 10 *
11 * Named by Larry McVoy, original implementation from Linus, extended by 11 * Named by Larry McVoy, original implementation from Linus, extended by
12 * Jens to support splicing to files, network, direct splicing, etc and 12 * Jens to support splicing to files, network, direct splicing, etc and
13 * fixing lots of bugs. 13 * fixing lots of bugs.
14 * 14 *
15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> 15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> 16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> 17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
18 * 18 *
19 */ 19 */
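
As a concrete taste of the model described above, a hedged userspace sketch (assuming splice(2) is available and stdout is redirected to a splice-capable target such as a regular file or socket) that pumps a file through a pipe without copying the data through userspace:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            int fd[2], in;
            ssize_t n;

            if (argc < 2 || (in = open(argv[1], O_RDONLY)) < 0 || pipe(fd) < 0)
                    return 1;

            /* file -> pipe, then pipe -> stdout; the pages ride the pipe */
            while ((n = splice(in, NULL, fd[1], NULL, 65536, 0)) > 0)
                    if (splice(fd[0], NULL, STDOUT_FILENO, NULL, n, 0) < 0)
                            return 1;
            return n < 0;
    }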
20 #include <linux/fs.h> 20 #include <linux/fs.h>
21 #include <linux/file.h> 21 #include <linux/file.h>
22 #include <linux/pagemap.h> 22 #include <linux/pagemap.h>
23 #include <linux/splice.h> 23 #include <linux/splice.h>
24 #include <linux/mm_inline.h> 24 #include <linux/mm_inline.h>
25 #include <linux/swap.h> 25 #include <linux/swap.h>
26 #include <linux/writeback.h> 26 #include <linux/writeback.h>
27 #include <linux/buffer_head.h> 27 #include <linux/buffer_head.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/syscalls.h> 29 #include <linux/syscalls.h>
30 #include <linux/uio.h> 30 #include <linux/uio.h>
31 31
32 /* 32 /*
33 * Attempt to steal a page from a pipe buffer. This should perhaps go into 33 * Attempt to steal a page from a pipe buffer. This should perhaps go into
34 * a vm helper function, it's already simplified quite a bit by the 34 * a vm helper function, it's already simplified quite a bit by the
35 * addition of remove_mapping(). If success is returned, the caller may 35 * addition of remove_mapping(). If success is returned, the caller may
36 * attempt to reuse this page for another destination. 36 * attempt to reuse this page for another destination.
37 */ 37 */
38 static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, 38 static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
39 struct pipe_buffer *buf) 39 struct pipe_buffer *buf)
40 { 40 {
41 struct page *page = buf->page; 41 struct page *page = buf->page;
42 struct address_space *mapping; 42 struct address_space *mapping;
43 43
44 lock_page(page); 44 lock_page(page);
45 45
46 mapping = page_mapping(page); 46 mapping = page_mapping(page);
47 if (mapping) { 47 if (mapping) {
48 WARN_ON(!PageUptodate(page)); 48 WARN_ON(!PageUptodate(page));
49 49
50 /* 50 /*
51 * At least for ext2 with nobh option, we need to wait on 51 * At least for ext2 with nobh option, we need to wait on
52 * writeback completing on this page, since we'll remove it 52 * writeback completing on this page, since we'll remove it
53 * from the pagecache. Otherwise truncate won't wait on the 53 * from the pagecache. Otherwise truncate won't wait on the
54 * page, allowing the disk blocks to be reused by someone else 54 * page, allowing the disk blocks to be reused by someone else
55 * before we actually wrote our data to them. FS corruption 55 * before we actually wrote our data to them. FS corruption
56 * ensues. 56 * ensues.
57 */ 57 */
58 wait_on_page_writeback(page); 58 wait_on_page_writeback(page);
59 59
60 if (PagePrivate(page)) 60 if (PagePrivate(page))
61 try_to_release_page(page, GFP_KERNEL); 61 try_to_release_page(page, GFP_KERNEL);
62 62
63 /* 63 /*
64 * If we succeeded in removing the mapping, set LRU flag 64 * If we succeeded in removing the mapping, set LRU flag
65 * and return good. 65 * and return good.
66 */ 66 */
67 if (remove_mapping(mapping, page)) { 67 if (remove_mapping(mapping, page)) {
68 buf->flags |= PIPE_BUF_FLAG_LRU; 68 buf->flags |= PIPE_BUF_FLAG_LRU;
69 return 0; 69 return 0;
70 } 70 }
71 } 71 }
72 72
73 /* 73 /*
74 * Raced with truncate or failed to remove page from current 74 * Raced with truncate or failed to remove page from current
75 * address space, unlock and return failure. 75 * address space, unlock and return failure.
76 */ 76 */
77 unlock_page(page); 77 unlock_page(page);
78 return 1; 78 return 1;
79 } 79 }
80 80
81 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, 81 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
82 struct pipe_buffer *buf) 82 struct pipe_buffer *buf)
83 { 83 {
84 page_cache_release(buf->page); 84 page_cache_release(buf->page);
85 buf->flags &= ~PIPE_BUF_FLAG_LRU; 85 buf->flags &= ~PIPE_BUF_FLAG_LRU;
86 } 86 }
87 87
88 static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe, 88 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
89 struct pipe_buffer *buf) 89 struct pipe_buffer *buf)
90 { 90 {
91 struct page *page = buf->page; 91 struct page *page = buf->page;
92 int err; 92 int err;
93 93
94 if (!PageUptodate(page)) { 94 if (!PageUptodate(page)) {
95 lock_page(page); 95 lock_page(page);
96 96
97 /* 97 /*
98 * Page got truncated/unhashed. This will cause a 0-byte 98 * Page got truncated/unhashed. This will cause a 0-byte
99 * splice if this is the first page. 99 * splice if this is the first page.
100 */ 100 */
101 if (!page->mapping) { 101 if (!page->mapping) {
102 err = -ENODATA; 102 err = -ENODATA;
103 goto error; 103 goto error;
104 } 104 }
105 105
106 /* 106 /*
107 * Uh oh, read-error from disk. 107 * Uh oh, read-error from disk.
108 */ 108 */
109 if (!PageUptodate(page)) { 109 if (!PageUptodate(page)) {
110 err = -EIO; 110 err = -EIO;
111 goto error; 111 goto error;
112 } 112 }
113 113
114 /* 114 /*
115 * Page is OK after all, we are done. 115 * Page is OK after all, we are done.
116 */ 116 */
117 unlock_page(page); 117 unlock_page(page);
118 } 118 }
119 119
120 return 0; 120 return 0;
121 error: 121 error:
122 unlock_page(page); 122 unlock_page(page);
123 return err; 123 return err;
124 } 124 }
125 125
126 static const struct pipe_buf_operations page_cache_pipe_buf_ops = { 126 static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
127 .can_merge = 0, 127 .can_merge = 0,
128 .map = generic_pipe_buf_map, 128 .map = generic_pipe_buf_map,
129 .unmap = generic_pipe_buf_unmap, 129 .unmap = generic_pipe_buf_unmap,
130 .pin = page_cache_pipe_buf_pin, 130 .confirm = page_cache_pipe_buf_confirm,
131 .release = page_cache_pipe_buf_release, 131 .release = page_cache_pipe_buf_release,
132 .steal = page_cache_pipe_buf_steal, 132 .steal = page_cache_pipe_buf_steal,
133 .get = generic_pipe_buf_get, 133 .get = generic_pipe_buf_get,
134 }; 134 };
135 135
136 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, 136 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
137 struct pipe_buffer *buf) 137 struct pipe_buffer *buf)
138 { 138 {
139 if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) 139 if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
140 return 1; 140 return 1;
141 141
142 buf->flags |= PIPE_BUF_FLAG_LRU; 142 buf->flags |= PIPE_BUF_FLAG_LRU;
143 return generic_pipe_buf_steal(pipe, buf); 143 return generic_pipe_buf_steal(pipe, buf);
144 } 144 }
145 145
146 static const struct pipe_buf_operations user_page_pipe_buf_ops = { 146 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
147 .can_merge = 0, 147 .can_merge = 0,
148 .map = generic_pipe_buf_map, 148 .map = generic_pipe_buf_map,
149 .unmap = generic_pipe_buf_unmap, 149 .unmap = generic_pipe_buf_unmap,
150 .pin = generic_pipe_buf_pin, 150 .confirm = generic_pipe_buf_confirm,
151 .release = page_cache_pipe_buf_release, 151 .release = page_cache_pipe_buf_release,
152 .steal = user_page_pipe_buf_steal, 152 .steal = user_page_pipe_buf_steal,
153 .get = generic_pipe_buf_get, 153 .get = generic_pipe_buf_get,
154 }; 154 };
155 155
156 /** 156 /**
157 * splice_to_pipe - fill passed data into a pipe 157 * splice_to_pipe - fill passed data into a pipe
158 * @pipe: pipe to fill 158 * @pipe: pipe to fill
159 * @spd: data to fill 159 * @spd: data to fill
160 * 160 *
161 * Description: 161 * Description:
162 * @spd contains a map of pages and len/offset tuples, along with 162 * @spd contains a map of pages and len/offset tuples, along with
163 * the struct pipe_buf_operations associated with these pages. This 163 * the struct pipe_buf_operations associated with these pages. This
164 * function will link that data to the pipe. 164 * function will link that data to the pipe.
165 * 165 *
166 */ 166 */
167 ssize_t splice_to_pipe(struct pipe_inode_info *pipe, 167 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
168 struct splice_pipe_desc *spd) 168 struct splice_pipe_desc *spd)
169 { 169 {
170 unsigned int spd_pages = spd->nr_pages; 170 unsigned int spd_pages = spd->nr_pages;
171 int ret, do_wakeup, page_nr; 171 int ret, do_wakeup, page_nr;
172 172
173 ret = 0; 173 ret = 0;
174 do_wakeup = 0; 174 do_wakeup = 0;
175 page_nr = 0; 175 page_nr = 0;
176 176
177 if (pipe->inode) 177 if (pipe->inode)
178 mutex_lock(&pipe->inode->i_mutex); 178 mutex_lock(&pipe->inode->i_mutex);
179 179
180 for (;;) { 180 for (;;) {
181 if (!pipe->readers) { 181 if (!pipe->readers) {
182 send_sig(SIGPIPE, current, 0); 182 send_sig(SIGPIPE, current, 0);
183 if (!ret) 183 if (!ret)
184 ret = -EPIPE; 184 ret = -EPIPE;
185 break; 185 break;
186 } 186 }
187 187
188 if (pipe->nrbufs < PIPE_BUFFERS) { 188 if (pipe->nrbufs < PIPE_BUFFERS) {
189 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); 189 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
190 struct pipe_buffer *buf = pipe->bufs + newbuf; 190 struct pipe_buffer *buf = pipe->bufs + newbuf;
191 191
192 buf->page = spd->pages[page_nr]; 192 buf->page = spd->pages[page_nr];
193 buf->offset = spd->partial[page_nr].offset; 193 buf->offset = spd->partial[page_nr].offset;
194 buf->len = spd->partial[page_nr].len; 194 buf->len = spd->partial[page_nr].len;
195 buf->private = spd->partial[page_nr].private; 195 buf->private = spd->partial[page_nr].private;
196 buf->ops = spd->ops; 196 buf->ops = spd->ops;
197 if (spd->flags & SPLICE_F_GIFT) 197 if (spd->flags & SPLICE_F_GIFT)
198 buf->flags |= PIPE_BUF_FLAG_GIFT; 198 buf->flags |= PIPE_BUF_FLAG_GIFT;
199 199
200 pipe->nrbufs++; 200 pipe->nrbufs++;
201 page_nr++; 201 page_nr++;
202 ret += buf->len; 202 ret += buf->len;
203 203
204 if (pipe->inode) 204 if (pipe->inode)
205 do_wakeup = 1; 205 do_wakeup = 1;
206 206
207 if (!--spd->nr_pages) 207 if (!--spd->nr_pages)
208 break; 208 break;
209 if (pipe->nrbufs < PIPE_BUFFERS) 209 if (pipe->nrbufs < PIPE_BUFFERS)
210 continue; 210 continue;
211 211
212 break; 212 break;
213 } 213 }
214 214
215 if (spd->flags & SPLICE_F_NONBLOCK) { 215 if (spd->flags & SPLICE_F_NONBLOCK) {
216 if (!ret) 216 if (!ret)
217 ret = -EAGAIN; 217 ret = -EAGAIN;
218 break; 218 break;
219 } 219 }
220 220
221 if (signal_pending(current)) { 221 if (signal_pending(current)) {
222 if (!ret) 222 if (!ret)
223 ret = -ERESTARTSYS; 223 ret = -ERESTARTSYS;
224 break; 224 break;
225 } 225 }
226 226
227 if (do_wakeup) { 227 if (do_wakeup) {
228 smp_mb(); 228 smp_mb();
229 if (waitqueue_active(&pipe->wait)) 229 if (waitqueue_active(&pipe->wait))
230 wake_up_interruptible_sync(&pipe->wait); 230 wake_up_interruptible_sync(&pipe->wait);
231 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 231 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
232 do_wakeup = 0; 232 do_wakeup = 0;
233 } 233 }
234 234
235 pipe->waiting_writers++; 235 pipe->waiting_writers++;
236 pipe_wait(pipe); 236 pipe_wait(pipe);
237 pipe->waiting_writers--; 237 pipe->waiting_writers--;
238 } 238 }
239 239
240 if (pipe->inode) { 240 if (pipe->inode) {
241 mutex_unlock(&pipe->inode->i_mutex); 241 mutex_unlock(&pipe->inode->i_mutex);
242 242
243 if (do_wakeup) { 243 if (do_wakeup) {
244 smp_mb(); 244 smp_mb();
245 if (waitqueue_active(&pipe->wait)) 245 if (waitqueue_active(&pipe->wait))
246 wake_up_interruptible(&pipe->wait); 246 wake_up_interruptible(&pipe->wait);
247 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 247 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
248 } 248 }
249 } 249 }
250 250
251 while (page_nr < spd_pages) 251 while (page_nr < spd_pages)
252 page_cache_release(spd->pages[page_nr++]); 252 page_cache_release(spd->pages[page_nr++]);
253 253
254 return ret; 254 return ret;
255 } 255 }
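
A hedged sketch of how a producer feeds splice_to_pipe(): fill a splice_pipe_desc with the pages, their per-page offset/len pairs, and the buf_operations that should govern the resulting pipe buffers (splice_one_page is a hypothetical helper, not a real kernel function; __generic_file_splice_read() below is the real in-tree caller):

    static ssize_t splice_one_page(struct pipe_inode_info *pipe,
                                   struct page *page, unsigned int offset,
                                   unsigned int len, unsigned int flags)
    {
            struct page *pages[1] = { page };
            struct partial_page partial[1] = {
                    { .offset = offset, .len = len },
            };
            struct splice_pipe_desc spd = {
                    .pages          = pages,
                    .partial        = partial,
                    .nr_pages       = 1,
                    .flags          = flags,
                    .ops            = &page_cache_pipe_buf_ops,
            };

            return splice_to_pipe(pipe, &spd);
    }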
256 256
257 static int 257 static int
258 __generic_file_splice_read(struct file *in, loff_t *ppos, 258 __generic_file_splice_read(struct file *in, loff_t *ppos,
259 struct pipe_inode_info *pipe, size_t len, 259 struct pipe_inode_info *pipe, size_t len,
260 unsigned int flags) 260 unsigned int flags)
261 { 261 {
262 struct address_space *mapping = in->f_mapping; 262 struct address_space *mapping = in->f_mapping;
263 unsigned int loff, nr_pages; 263 unsigned int loff, nr_pages;
264 struct page *pages[PIPE_BUFFERS]; 264 struct page *pages[PIPE_BUFFERS];
265 struct partial_page partial[PIPE_BUFFERS]; 265 struct partial_page partial[PIPE_BUFFERS];
266 struct page *page; 266 struct page *page;
267 pgoff_t index, end_index; 267 pgoff_t index, end_index;
268 loff_t isize; 268 loff_t isize;
269 int error, page_nr; 269 int error, page_nr;
270 struct splice_pipe_desc spd = { 270 struct splice_pipe_desc spd = {
271 .pages = pages, 271 .pages = pages,
272 .partial = partial, 272 .partial = partial,
273 .flags = flags, 273 .flags = flags,
274 .ops = &page_cache_pipe_buf_ops, 274 .ops = &page_cache_pipe_buf_ops,
275 }; 275 };
276 276
277 index = *ppos >> PAGE_CACHE_SHIFT; 277 index = *ppos >> PAGE_CACHE_SHIFT;
278 loff = *ppos & ~PAGE_CACHE_MASK; 278 loff = *ppos & ~PAGE_CACHE_MASK;
279 nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 279 nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
280 280
281 if (nr_pages > PIPE_BUFFERS) 281 if (nr_pages > PIPE_BUFFERS)
282 nr_pages = PIPE_BUFFERS; 282 nr_pages = PIPE_BUFFERS;
283 283
284 /* 284 /*
285 * Don't try to second-guess the read-ahead logic; call into 285 * Don't try to second-guess the read-ahead logic; call into
286 * page_cache_readahead() like the page cache reads would do. 286 * page_cache_readahead() like the page cache reads would do.
287 */ 287 */
288 page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages); 288 page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);
289 289
290 /* 290 /*
291 * Lookup the (hopefully) full range of pages we need. 291 * Lookup the (hopefully) full range of pages we need.
292 */ 292 */
293 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); 293 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
294 294
295 /* 295 /*
296 * If find_get_pages_contig() returned fewer pages than we needed, 296 * If find_get_pages_contig() returned fewer pages than we needed,
297 * allocate the rest and fill in the holes. 297 * allocate the rest and fill in the holes.
298 */ 298 */
299 error = 0; 299 error = 0;
300 index += spd.nr_pages; 300 index += spd.nr_pages;
301 while (spd.nr_pages < nr_pages) { 301 while (spd.nr_pages < nr_pages) {
302 /* 302 /*
303 * Page could be there, find_get_pages_contig() breaks on 303 * Page could be there, find_get_pages_contig() breaks on
304 * the first hole. 304 * the first hole.
305 */ 305 */
306 page = find_get_page(mapping, index); 306 page = find_get_page(mapping, index);
307 if (!page) { 307 if (!page) {
308 /* 308 /*
309 * Make sure the read-ahead engine is notified 309 * Make sure the read-ahead engine is notified
310 * about this failure. 310 * about this failure.
311 */ 311 */
312 handle_ra_miss(mapping, &in->f_ra, index); 312 handle_ra_miss(mapping, &in->f_ra, index);
313 313
314 /* 314 /*
315 * page didn't exist, allocate one. 315 * page didn't exist, allocate one.
316 */ 316 */
317 page = page_cache_alloc_cold(mapping); 317 page = page_cache_alloc_cold(mapping);
318 if (!page) 318 if (!page)
319 break; 319 break;
320 320
321 error = add_to_page_cache_lru(page, mapping, index, 321 error = add_to_page_cache_lru(page, mapping, index,
322 GFP_KERNEL); 322 GFP_KERNEL);
323 if (unlikely(error)) { 323 if (unlikely(error)) {
324 page_cache_release(page); 324 page_cache_release(page);
325 if (error == -EEXIST) 325 if (error == -EEXIST)
326 continue; 326 continue;
327 break; 327 break;
328 } 328 }
329 /* 329 /*
330 * add_to_page_cache() locks the page, unlock it 330 * add_to_page_cache() locks the page, unlock it
331 * to avoid convoluting the logic below even more. 331 * to avoid convoluting the logic below even more.
332 */ 332 */
333 unlock_page(page); 333 unlock_page(page);
334 } 334 }
335 335
336 pages[spd.nr_pages++] = page; 336 pages[spd.nr_pages++] = page;
337 index++; 337 index++;
338 } 338 }
339 339
340 /* 340 /*
341 * Now loop over the map and see if we need to start IO on any 341 * Now loop over the map and see if we need to start IO on any
342 * pages, fill in the partial map, etc. 342 * pages, fill in the partial map, etc.
343 */ 343 */
344 index = *ppos >> PAGE_CACHE_SHIFT; 344 index = *ppos >> PAGE_CACHE_SHIFT;
345 nr_pages = spd.nr_pages; 345 nr_pages = spd.nr_pages;
346 spd.nr_pages = 0; 346 spd.nr_pages = 0;
347 for (page_nr = 0; page_nr < nr_pages; page_nr++) { 347 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
348 unsigned int this_len; 348 unsigned int this_len;
349 349
350 if (!len) 350 if (!len)
351 break; 351 break;
352 352
353 /* 353 /*
354 * this_len is the max we'll use from this page 354 * this_len is the max we'll use from this page
355 */ 355 */
356 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); 356 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
357 page = pages[page_nr]; 357 page = pages[page_nr];
358 358
359 /* 359 /*
360 * If the page isn't uptodate, we may need to start IO on it 360 * If the page isn't uptodate, we may need to start IO on it
361 */ 361 */
362 if (!PageUptodate(page)) { 362 if (!PageUptodate(page)) {
363 /* 363 /*
364 * If in nonblock mode then don't block on waiting 364 * If in nonblock mode then don't block on waiting
365 * for an in-flight IO page 365 * for an in-flight IO page
366 */ 366 */
367 if (flags & SPLICE_F_NONBLOCK) { 367 if (flags & SPLICE_F_NONBLOCK) {
368 if (TestSetPageLocked(page)) 368 if (TestSetPageLocked(page))
369 break; 369 break;
370 } else 370 } else
371 lock_page(page); 371 lock_page(page);
372 372
373 /* 373 /*
374 * page was truncated, stop here. If this isn't the 374 * page was truncated, stop here. If this isn't the
375 * first page, we'll just complete what we already 375 * first page, we'll just complete what we already
376 * added. 376 * added.
377 */ 377 */
378 if (!page->mapping) { 378 if (!page->mapping) {
379 unlock_page(page); 379 unlock_page(page);
380 break; 380 break;
381 } 381 }
382 /* 382 /*
383 * page was already under IO and is now done, great 383 * page was already under IO and is now done, great
384 */ 384 */
385 if (PageUptodate(page)) { 385 if (PageUptodate(page)) {
386 unlock_page(page); 386 unlock_page(page);
387 goto fill_it; 387 goto fill_it;
388 } 388 }
389 389
390 /* 390 /*
391 * need to read in the page 391 * need to read in the page
392 */ 392 */
393 error = mapping->a_ops->readpage(in, page); 393 error = mapping->a_ops->readpage(in, page);
394 if (unlikely(error)) { 394 if (unlikely(error)) {
395 /* 395 /*
396 * We really should re-lookup the page here, 396 * We really should re-lookup the page here,
397 * but it complicates things a lot. Instead 397 * but it complicates things a lot. Instead
398 * let's just do what we already stored, and 398 * let's just do what we already stored, and
399 * we'll get it the next time we are called. 399 * we'll get it the next time we are called.
400 */ 400 */
401 if (error == AOP_TRUNCATED_PAGE) 401 if (error == AOP_TRUNCATED_PAGE)
402 error = 0; 402 error = 0;
403 403
404 break; 404 break;
405 } 405 }
406 } 406 }
407 fill_it: 407 fill_it:
408 /* 408 /*
409 * i_size must be checked after PageUptodate. 409 * i_size must be checked after PageUptodate.
410 */ 410 */
411 isize = i_size_read(mapping->host); 411 isize = i_size_read(mapping->host);
412 end_index = (isize - 1) >> PAGE_CACHE_SHIFT; 412 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
413 if (unlikely(!isize || index > end_index)) 413 if (unlikely(!isize || index > end_index))
414 break; 414 break;
415 415
416 /* 416 /*
417 * if this is the last page, see if we need to shrink 417 * if this is the last page, see if we need to shrink
418 * the length and stop 418 * the length and stop
419 */ 419 */
420 if (end_index == index) { 420 if (end_index == index) {
421 unsigned int plen; 421 unsigned int plen;
422 422
423 /* 423 /*
424 * max good bytes in this page 424 * max good bytes in this page
425 */ 425 */
426 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; 426 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
427 if (plen <= loff) 427 if (plen <= loff)
428 break; 428 break;
429 429
430 /* 430 /*
431 * force quit after adding this page 431 * force quit after adding this page
432 */ 432 */
433 this_len = min(this_len, plen - loff); 433 this_len = min(this_len, plen - loff);
434 len = this_len; 434 len = this_len;
435 } 435 }
436 436
437 partial[page_nr].offset = loff; 437 partial[page_nr].offset = loff;
438 partial[page_nr].len = this_len; 438 partial[page_nr].len = this_len;
439 len -= this_len; 439 len -= this_len;
440 loff = 0; 440 loff = 0;
441 spd.nr_pages++; 441 spd.nr_pages++;
442 index++; 442 index++;
443 } 443 }
444 444
445 /* 445 /*
446 * Release any pages at the end, if we quit early. 'page_nr' is how far 446 * Release any pages at the end, if we quit early. 'page_nr' is how far
447 * we got, 'nr_pages' is how many pages are in the map. 447 * we got, 'nr_pages' is how many pages are in the map.
448 */ 448 */
449 while (page_nr < nr_pages) 449 while (page_nr < nr_pages)
450 page_cache_release(pages[page_nr++]); 450 page_cache_release(pages[page_nr++]);
451 451
452 if (spd.nr_pages) 452 if (spd.nr_pages)
453 return splice_to_pipe(pipe, &spd); 453 return splice_to_pipe(pipe, &spd);
454 454
455 return error; 455 return error;
456 } 456 }
457 457
458 /** 458 /**
459 * generic_file_splice_read - splice data from file to a pipe 459 * generic_file_splice_read - splice data from file to a pipe
460 * @in: file to splice from 460 * @in: file to splice from
461 * @ppos: position in @in 461 * @ppos: position in @in
462 * @pipe: pipe to splice to 462 * @pipe: pipe to splice to
463 * @len: number of bytes to splice 463 * @len: number of bytes to splice
464 * @flags: splice modifier flags 464 * @flags: splice modifier flags
465 * 465 *
466 * Description: 466 * Description:
467 * Will read pages from the given file and fill them into a pipe. Can be 467 * Will read pages from the given file and fill them into a pipe. Can be
468 * used as long as the address_space operations for the source implement 468 * used as long as the address_space operations for the source implement
469 * a readpage() hook. 469 * a readpage() hook.
470 * 470 *
471 */ 471 */
472 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, 472 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
473 struct pipe_inode_info *pipe, size_t len, 473 struct pipe_inode_info *pipe, size_t len,
474 unsigned int flags) 474 unsigned int flags)
475 { 475 {
476 ssize_t spliced; 476 ssize_t spliced;
477 int ret; 477 int ret;
478 loff_t isize, left; 478 loff_t isize, left;
479 479
480 isize = i_size_read(in->f_mapping->host); 480 isize = i_size_read(in->f_mapping->host);
481 if (unlikely(*ppos >= isize)) 481 if (unlikely(*ppos >= isize))
482 return 0; 482 return 0;
483 483
484 left = isize - *ppos; 484 left = isize - *ppos;
485 if (unlikely(left < len)) 485 if (unlikely(left < len))
486 len = left; 486 len = left;
487 487
488 ret = 0; 488 ret = 0;
489 spliced = 0; 489 spliced = 0;
490 while (len) { 490 while (len) {
491 ret = __generic_file_splice_read(in, ppos, pipe, len, flags); 491 ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
492 492
493 if (ret < 0) 493 if (ret < 0)
494 break; 494 break;
495 else if (!ret) { 495 else if (!ret) {
496 if (spliced) 496 if (spliced)
497 break; 497 break;
498 if (flags & SPLICE_F_NONBLOCK) { 498 if (flags & SPLICE_F_NONBLOCK) {
499 ret = -EAGAIN; 499 ret = -EAGAIN;
500 break; 500 break;
501 } 501 }
502 } 502 }
503 503
504 *ppos += ret; 504 *ppos += ret;
505 len -= ret; 505 len -= ret;
506 spliced += ret; 506 spliced += ret;
507 } 507 }
508 508
509 if (spliced) 509 if (spliced)
510 return spliced; 510 return spliced;
511 511
512 return ret; 512 return ret;
513 } 513 }
514 514
515 EXPORT_SYMBOL(generic_file_splice_read); 515 EXPORT_SYMBOL(generic_file_splice_read);
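
Filesystems opt into this helper by pointing their file_operations at it. A hedged sketch of the wiring (example_file_operations is illustrative only; the splice_write hook is defined further down in this file):

    static const struct file_operations example_file_operations = {
            .read           = do_sync_read,
            .write          = do_sync_write,
            .splice_read    = generic_file_splice_read,
            .splice_write   = generic_file_splice_write,
    };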
516 516
517 /* 517 /*
518 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' 518 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
519 * using sendpage(). Return the number of bytes sent. 519 * using sendpage(). Return the number of bytes sent.
520 */ 520 */
521 static int pipe_to_sendpage(struct pipe_inode_info *pipe, 521 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
522 struct pipe_buffer *buf, struct splice_desc *sd) 522 struct pipe_buffer *buf, struct splice_desc *sd)
523 { 523 {
524 struct file *file = sd->u.file; 524 struct file *file = sd->u.file;
525 loff_t pos = sd->pos; 525 loff_t pos = sd->pos;
526 int ret, more; 526 int ret, more;
527 527
528 ret = buf->ops->pin(pipe, buf); 528 ret = buf->ops->confirm(pipe, buf);
529 if (!ret) { 529 if (!ret) {
530 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; 530 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
531 531
532 ret = file->f_op->sendpage(file, buf->page, buf->offset, 532 ret = file->f_op->sendpage(file, buf->page, buf->offset,
533 sd->len, &pos, more); 533 sd->len, &pos, more);
534 } 534 }
535 535
536 return ret; 536 return ret;
537 } 537 }
538 538
539 /* 539 /*
540 * This is a little more tricky than the file -> pipe splicing. There are 540 * This is a little more tricky than the file -> pipe splicing. There are
541 * basically three cases: 541 * basically three cases:
542 * 542 *
543 * - Destination page already exists in the address space and there 543 * - Destination page already exists in the address space and there
544 * are users of it. For that case we have no other option than 544 * are users of it. For that case we have no other option than
545 * copying the data. Tough luck. 545 * copying the data. Tough luck.
546 * - Destination page already exists in the address space, but there 546 * - Destination page already exists in the address space, but there
547 * are no users of it. Make sure it's uptodate, then drop it. Fall 547 * are no users of it. Make sure it's uptodate, then drop it. Fall
548 * through to last case. 548 * through to last case.
549 * - Destination page does not exist, we can add the pipe page to 549 * - Destination page does not exist, we can add the pipe page to
550 * the page cache and avoid the copy. 550 * the page cache and avoid the copy.
551 * 551 *
552 * If asked to move pages to the output file (SPLICE_F_MOVE is set in 552 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
553 * sd->flags), we attempt to migrate pages from the pipe to the output 553 * sd->flags), we attempt to migrate pages from the pipe to the output
554 * file address space page cache. This is possible if no one else has 554 * file address space page cache. This is possible if no one else has
555 * the pipe page referenced outside of the pipe and page cache. If 555 * the pipe page referenced outside of the pipe and page cache. If
556 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create 556 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
557 * a new page in the output file page cache and fill/dirty that. 557 * a new page in the output file page cache and fill/dirty that.
558 */ 558 */
559 static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 559 static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
560 struct splice_desc *sd) 560 struct splice_desc *sd)
561 { 561 {
562 struct file *file = sd->u.file; 562 struct file *file = sd->u.file;
563 struct address_space *mapping = file->f_mapping; 563 struct address_space *mapping = file->f_mapping;
564 unsigned int offset, this_len; 564 unsigned int offset, this_len;
565 struct page *page; 565 struct page *page;
566 pgoff_t index; 566 pgoff_t index;
567 int ret; 567 int ret;
568 568
569 /* 569 /*
570 * make sure the data in this buffer is uptodate 570 * make sure the data in this buffer is uptodate
571 */ 571 */
572 ret = buf->ops->pin(pipe, buf); 572 ret = buf->ops->confirm(pipe, buf);
573 if (unlikely(ret)) 573 if (unlikely(ret))
574 return ret; 574 return ret;
575 575
576 index = sd->pos >> PAGE_CACHE_SHIFT; 576 index = sd->pos >> PAGE_CACHE_SHIFT;
577 offset = sd->pos & ~PAGE_CACHE_MASK; 577 offset = sd->pos & ~PAGE_CACHE_MASK;
578 578
579 this_len = sd->len; 579 this_len = sd->len;
580 if (this_len + offset > PAGE_CACHE_SIZE) 580 if (this_len + offset > PAGE_CACHE_SIZE)
581 this_len = PAGE_CACHE_SIZE - offset; 581 this_len = PAGE_CACHE_SIZE - offset;
582 582
583 find_page: 583 find_page:
584 page = find_lock_page(mapping, index); 584 page = find_lock_page(mapping, index);
585 if (!page) { 585 if (!page) {
586 ret = -ENOMEM; 586 ret = -ENOMEM;
587 page = page_cache_alloc_cold(mapping); 587 page = page_cache_alloc_cold(mapping);
588 if (unlikely(!page)) 588 if (unlikely(!page))
589 goto out_ret; 589 goto out_ret;
590 590
591 /* 591 /*
592 * This will also lock the page 592 * This will also lock the page
593 */ 593 */
594 ret = add_to_page_cache_lru(page, mapping, index, 594 ret = add_to_page_cache_lru(page, mapping, index,
595 GFP_KERNEL); 595 GFP_KERNEL);
596 if (unlikely(ret)) 596 if (unlikely(ret))
597 goto out; 597 goto out;
598 } 598 }
599 599
600 ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); 600 ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
601 if (unlikely(ret)) { 601 if (unlikely(ret)) {
602 loff_t isize = i_size_read(mapping->host); 602 loff_t isize = i_size_read(mapping->host);
603 603
604 if (ret != AOP_TRUNCATED_PAGE) 604 if (ret != AOP_TRUNCATED_PAGE)
605 unlock_page(page); 605 unlock_page(page);
606 page_cache_release(page); 606 page_cache_release(page);
607 if (ret == AOP_TRUNCATED_PAGE) 607 if (ret == AOP_TRUNCATED_PAGE)
608 goto find_page; 608 goto find_page;
609 609
610 /* 610 /*
611 * prepare_write() may have instantiated a few blocks 611 * prepare_write() may have instantiated a few blocks
612 * outside i_size. Trim these off again. 612 * outside i_size. Trim these off again.
613 */ 613 */
614 if (sd->pos + this_len > isize) 614 if (sd->pos + this_len > isize)
615 vmtruncate(mapping->host, isize); 615 vmtruncate(mapping->host, isize);
616 616
617 goto out_ret; 617 goto out_ret;
618 } 618 }
619 619
620 if (buf->page != page) { 620 if (buf->page != page) {
621 /* 621 /*
622 * Careful, ->map() uses KM_USER0! 622 * Careful, ->map() uses KM_USER0!
623 */ 623 */
624 char *src = buf->ops->map(pipe, buf, 1); 624 char *src = buf->ops->map(pipe, buf, 1);
625 char *dst = kmap_atomic(page, KM_USER1); 625 char *dst = kmap_atomic(page, KM_USER1);
626 626
627 memcpy(dst + offset, src + buf->offset, this_len); 627 memcpy(dst + offset, src + buf->offset, this_len);
628 flush_dcache_page(page); 628 flush_dcache_page(page);
629 kunmap_atomic(dst, KM_USER1); 629 kunmap_atomic(dst, KM_USER1);
630 buf->ops->unmap(pipe, buf, src); 630 buf->ops->unmap(pipe, buf, src);
631 } 631 }
632 632
633 ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); 633 ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
634 if (ret) { 634 if (ret) {
635 if (ret == AOP_TRUNCATED_PAGE) { 635 if (ret == AOP_TRUNCATED_PAGE) {
636 page_cache_release(page); 636 page_cache_release(page);
637 goto find_page; 637 goto find_page;
638 } 638 }
639 if (ret < 0) 639 if (ret < 0)
640 goto out; 640 goto out;
641 /* 641 /*
642 * A partial write has happened, so 'ret' is already initialized to the 642 * A partial write has happened, so 'ret' is already initialized to the
643 * number of bytes written; there is nothing we have to do here. 643 * number of bytes written; there is nothing we have to do here.
644 */ 644 */
645 } else 645 } else
646 ret = this_len; 646 ret = this_len;
647 /* 647 /*
648 * Return the number of bytes written and mark page as 648 * Return the number of bytes written and mark page as
649 * accessed, we are now done! 649 * accessed, we are now done!
650 */ 650 */
651 mark_page_accessed(page); 651 mark_page_accessed(page);
652 out: 652 out:
653 page_cache_release(page); 653 page_cache_release(page);
654 unlock_page(page); 654 unlock_page(page);
655 out_ret: 655 out_ret:
656 return ret; 656 return ret;
657 } 657 }
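
The move case that pipe_to_file() tries to hit is what SPLICE_F_MOVE (paired with SPLICE_F_GIFT on the producer side) is for. A hedged userspace sketch that can trigger it, assuming len is a multiple of the page size; whether pages are actually moved still depends on the reference counts described above:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/uio.h>
    #include <unistd.h>

    int gift_to_file(int out_fd, size_t len)
    {
            int fd[2];
            void *mem;
            struct iovec iov;

            if (pipe(fd) < 0 || posix_memalign(&mem, 4096, len))
                    return -1;
            memset(mem, 'x', len);

            iov.iov_base = mem;
            iov.iov_len = len;

            /* Gift the page-aligned buffer to the pipe... */
            if (vmsplice(fd[1], &iov, 1, SPLICE_F_GIFT) < 0)
                    return -1;
            /* ...then ask the kernel to move, not copy, it to the file. */
            return splice(fd[0], NULL, out_fd, NULL, len, SPLICE_F_MOVE) < 0 ? -1 : 0;
    }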
658 658
659 /** 659 /**
660 * __splice_from_pipe - splice data from a pipe to given actor 660 * __splice_from_pipe - splice data from a pipe to given actor
661 * @pipe: pipe to splice from 661 * @pipe: pipe to splice from
662 * @sd: information to @actor 662 * @sd: information to @actor
663 * @actor: handler that splices the data 663 * @actor: handler that splices the data
664 * 664 *
665 * Description: 665 * Description:
666 * This function does little more than loop over the pipe and call 666 * This function does little more than loop over the pipe and call
667 * @actor to do the actual moving of a single struct pipe_buffer to 667 * @actor to do the actual moving of a single struct pipe_buffer to
668 * the desired destination. See pipe_to_file, pipe_to_sendpage, or 668 * the desired destination. See pipe_to_file, pipe_to_sendpage, or
669 * pipe_to_user. 669 * pipe_to_user.
670 * 670 *
671 */ 671 */
672 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, 672 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
673 splice_actor *actor) 673 splice_actor *actor)
674 { 674 {
675 int ret, do_wakeup, err; 675 int ret, do_wakeup, err;
676 676
677 ret = 0; 677 ret = 0;
678 do_wakeup = 0; 678 do_wakeup = 0;
679 679
680 for (;;) { 680 for (;;) {
681 if (pipe->nrbufs) { 681 if (pipe->nrbufs) {
682 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; 682 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
683 const struct pipe_buf_operations *ops = buf->ops; 683 const struct pipe_buf_operations *ops = buf->ops;
684 684
685 sd->len = buf->len; 685 sd->len = buf->len;
686 if (sd->len > sd->total_len) 686 if (sd->len > sd->total_len)
687 sd->len = sd->total_len; 687 sd->len = sd->total_len;
688 688
689 err = actor(pipe, buf, sd); 689 err = actor(pipe, buf, sd);
690 if (err <= 0) { 690 if (err <= 0) {
691 if (!ret && err != -ENODATA) 691 if (!ret && err != -ENODATA)
692 ret = err; 692 ret = err;
693 693
694 break; 694 break;
695 } 695 }
696 696
697 ret += err; 697 ret += err;
698 buf->offset += err; 698 buf->offset += err;
699 buf->len -= err; 699 buf->len -= err;
700 700
701 sd->len -= err; 701 sd->len -= err;
702 sd->pos += err; 702 sd->pos += err;
703 sd->total_len -= err; 703 sd->total_len -= err;
704 if (sd->len) 704 if (sd->len)
705 continue; 705 continue;
706 706
707 if (!buf->len) { 707 if (!buf->len) {
708 buf->ops = NULL; 708 buf->ops = NULL;
709 ops->release(pipe, buf); 709 ops->release(pipe, buf);
710 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); 710 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
711 pipe->nrbufs--; 711 pipe->nrbufs--;
712 if (pipe->inode) 712 if (pipe->inode)
713 do_wakeup = 1; 713 do_wakeup = 1;
714 } 714 }
715 715
716 if (!sd->total_len) 716 if (!sd->total_len)
717 break; 717 break;
718 } 718 }
719 719
720 if (pipe->nrbufs) 720 if (pipe->nrbufs)
721 continue; 721 continue;
722 if (!pipe->writers) 722 if (!pipe->writers)
723 break; 723 break;
724 if (!pipe->waiting_writers) { 724 if (!pipe->waiting_writers) {
725 if (ret) 725 if (ret)
726 break; 726 break;
727 } 727 }
728 728
729 if (sd->flags & SPLICE_F_NONBLOCK) { 729 if (sd->flags & SPLICE_F_NONBLOCK) {
730 if (!ret) 730 if (!ret)
731 ret = -EAGAIN; 731 ret = -EAGAIN;
732 break; 732 break;
733 } 733 }
734 734
735 if (signal_pending(current)) { 735 if (signal_pending(current)) {
736 if (!ret) 736 if (!ret)
737 ret = -ERESTARTSYS; 737 ret = -ERESTARTSYS;
738 break; 738 break;
739 } 739 }
740 740
741 if (do_wakeup) { 741 if (do_wakeup) {
742 smp_mb(); 742 smp_mb();
743 if (waitqueue_active(&pipe->wait)) 743 if (waitqueue_active(&pipe->wait))
744 wake_up_interruptible_sync(&pipe->wait); 744 wake_up_interruptible_sync(&pipe->wait);
745 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 745 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
746 do_wakeup = 0; 746 do_wakeup = 0;
747 } 747 }
748 748
749 pipe_wait(pipe); 749 pipe_wait(pipe);
750 } 750 }
751 751
752 if (do_wakeup) { 752 if (do_wakeup) {
753 smp_mb(); 753 smp_mb();
754 if (waitqueue_active(&pipe->wait)) 754 if (waitqueue_active(&pipe->wait))
755 wake_up_interruptible(&pipe->wait); 755 wake_up_interruptible(&pipe->wait);
756 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 756 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
757 } 757 }
758 758
759 return ret; 759 return ret;
760 } 760 }
761 EXPORT_SYMBOL(__splice_from_pipe); 761 EXPORT_SYMBOL(__splice_from_pipe);
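
Every actor follows the same contract as pipe_to_file() and pipe_to_sendpage() above: confirm the buffer before touching its page, then report bytes consumed or an error. A hedged sketch of a custom actor (my_consume_page is a stand-in for whatever the destination does with the data):

    static int pipe_to_example(struct pipe_inode_info *pipe,
                               struct pipe_buffer *buf, struct splice_desc *sd)
    {
            int ret;

            /* Is the page really there, with good contents? */
            ret = buf->ops->confirm(pipe, buf);
            if (unlikely(ret))
                    return ret;

            ret = my_consume_page(buf->page, buf->offset, sd->len);
            return ret ? ret : sd->len;
    }

It would then be driven with __splice_from_pipe(pipe, &sd, pipe_to_example).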
762 762
763 /** 763 /**
764 * splice_from_pipe - splice data from a pipe to a file 764 * splice_from_pipe - splice data from a pipe to a file
765 * @pipe: pipe to splice from 765 * @pipe: pipe to splice from
766 * @out: file to splice to 766 * @out: file to splice to
767 * @ppos: position in @out 767 * @ppos: position in @out
768 * @len: how many bytes to splice 768 * @len: how many bytes to splice
769 * @flags: splice modifier flags 769 * @flags: splice modifier flags
770 * @actor: handler that splices the data 770 * @actor: handler that splices the data
771 * 771 *
772 * Description: 772 * Description:
773 * See __splice_from_pipe. This function locks the input and output inodes, 773 * See __splice_from_pipe. This function locks the input and output inodes,
774 * otherwise it's identical to __splice_from_pipe(). 774 * otherwise it's identical to __splice_from_pipe().
775 * 775 *
776 */ 776 */
777 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, 777 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
778 loff_t *ppos, size_t len, unsigned int flags, 778 loff_t *ppos, size_t len, unsigned int flags,
779 splice_actor *actor) 779 splice_actor *actor)
780 { 780 {
781 ssize_t ret; 781 ssize_t ret;
782 struct inode *inode = out->f_mapping->host; 782 struct inode *inode = out->f_mapping->host;
783 struct splice_desc sd = { 783 struct splice_desc sd = {
784 .total_len = len, 784 .total_len = len,
785 .flags = flags, 785 .flags = flags,
786 .pos = *ppos, 786 .pos = *ppos,
787 .u.file = out, 787 .u.file = out,
788 }; 788 };
789 789
790 /* 790 /*
791 * The actor worker might be calling ->prepare_write and 791 * The actor worker might be calling ->prepare_write and
792 * ->commit_write. Most of the time, these expect i_mutex to 792 * ->commit_write. Most of the time, these expect i_mutex to
793 * be held. Since this may result in an ABBA deadlock with 793 * be held. Since this may result in an ABBA deadlock with
794 * pipe->inode, we have to order lock acquisition here. 794 * pipe->inode, we have to order lock acquisition here.
795 */ 795 */
796 inode_double_lock(inode, pipe->inode); 796 inode_double_lock(inode, pipe->inode);
797 ret = __splice_from_pipe(pipe, &sd, actor); 797 ret = __splice_from_pipe(pipe, &sd, actor);
798 inode_double_unlock(inode, pipe->inode); 798 inode_double_unlock(inode, pipe->inode);
799 799
800 return ret; 800 return ret;
801 } 801 }
802 802
803 /** 803 /**
804 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes 804 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
805 * @pipe: pipe info 805 * @pipe: pipe info
806 * @out: file to write to 806 * @out: file to write to
807 * @ppos: position in @out 807 * @ppos: position in @out
808 * @len: number of bytes to splice 808 * @len: number of bytes to splice
809 * @flags: splice modifier flags 809 * @flags: splice modifier flags
810 * 810 *
811 * Description: 811 * Description:
812 * Will either move or copy pages (determined by @flags options) from 812 * Will either move or copy pages (determined by @flags options) from
813 * the given pipe inode to the given file. The caller is responsible 813 * the given pipe inode to the given file. The caller is responsible
814 * for acquiring i_mutex on both inodes. 814 * for acquiring i_mutex on both inodes.
815 * 815 *
816 */ 816 */
817 ssize_t 817 ssize_t
818 generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, 818 generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
819 loff_t *ppos, size_t len, unsigned int flags) 819 loff_t *ppos, size_t len, unsigned int flags)
820 { 820 {
821 struct address_space *mapping = out->f_mapping; 821 struct address_space *mapping = out->f_mapping;
822 struct inode *inode = mapping->host; 822 struct inode *inode = mapping->host;
823 struct splice_desc sd = { 823 struct splice_desc sd = {
824 .total_len = len, 824 .total_len = len,
825 .flags = flags, 825 .flags = flags,
826 .pos = *ppos, 826 .pos = *ppos,
827 .u.file = out, 827 .u.file = out,
828 }; 828 };
829 ssize_t ret; 829 ssize_t ret;
830 int err; 830 int err;
831 831
832 err = remove_suid(out->f_path.dentry); 832 err = remove_suid(out->f_path.dentry);
833 if (unlikely(err)) 833 if (unlikely(err))
834 return err; 834 return err;
835 835
836 ret = __splice_from_pipe(pipe, &sd, pipe_to_file); 836 ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
837 if (ret > 0) { 837 if (ret > 0) {
838 unsigned long nr_pages; 838 unsigned long nr_pages;
839 839
840 *ppos += ret; 840 *ppos += ret;
841 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 841 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
842 842
843 /* 843 /*
844 * If file or inode is SYNC and we actually wrote some data, 844 * If file or inode is SYNC and we actually wrote some data,
845 * sync it. 845 * sync it.
846 */ 846 */
847 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 847 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
848 err = generic_osync_inode(inode, mapping, 848 err = generic_osync_inode(inode, mapping,
849 OSYNC_METADATA|OSYNC_DATA); 849 OSYNC_METADATA|OSYNC_DATA);
850 850
851 if (err) 851 if (err)
852 ret = err; 852 ret = err;
853 } 853 }
854 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 854 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
855 } 855 }
856 856
857 return ret; 857 return ret;
858 } 858 }
859 859
860 EXPORT_SYMBOL(generic_file_splice_write_nolock); 860 EXPORT_SYMBOL(generic_file_splice_write_nolock);
861 861
862 /** 862 /**
863 * generic_file_splice_write - splice data from a pipe to a file 863 * generic_file_splice_write - splice data from a pipe to a file
864 * @pipe: pipe info 864 * @pipe: pipe info
865 * @out: file to write to 865 * @out: file to write to
866 * @ppos: position in @out 866 * @ppos: position in @out
867 * @len: number of bytes to splice 867 * @len: number of bytes to splice
868 * @flags: splice modifier flags 868 * @flags: splice modifier flags
869 * 869 *
870 * Description: 870 * Description:
871 * Will either move or copy pages (determined by @flags options) from 871 * Will either move or copy pages (determined by @flags options) from
872 * the given pipe inode to the given file. 872 * the given pipe inode to the given file.
873 * 873 *
874 */ 874 */
875 ssize_t 875 ssize_t
876 generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, 876 generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
877 loff_t *ppos, size_t len, unsigned int flags) 877 loff_t *ppos, size_t len, unsigned int flags)
878 { 878 {
879 struct address_space *mapping = out->f_mapping; 879 struct address_space *mapping = out->f_mapping;
880 struct inode *inode = mapping->host; 880 struct inode *inode = mapping->host;
881 ssize_t ret; 881 ssize_t ret;
882 int err; 882 int err;
883 883
884 err = should_remove_suid(out->f_path.dentry); 884 err = should_remove_suid(out->f_path.dentry);
885 if (unlikely(err)) { 885 if (unlikely(err)) {
886 mutex_lock(&inode->i_mutex); 886 mutex_lock(&inode->i_mutex);
887 err = __remove_suid(out->f_path.dentry, err); 887 err = __remove_suid(out->f_path.dentry, err);
888 mutex_unlock(&inode->i_mutex); 888 mutex_unlock(&inode->i_mutex);
889 if (err) 889 if (err)
890 return err; 890 return err;
891 } 891 }
892 892
893 ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); 893 ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
894 if (ret > 0) { 894 if (ret > 0) {
895 unsigned long nr_pages; 895 unsigned long nr_pages;
896 896
897 *ppos += ret; 897 *ppos += ret;
898 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 898 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
899 899
900 /* 900 /*
901 * If file or inode is SYNC and we actually wrote some data, 901 * If file or inode is SYNC and we actually wrote some data,
902 * sync it. 902 * sync it.
903 */ 903 */
904 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 904 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
905 mutex_lock(&inode->i_mutex); 905 mutex_lock(&inode->i_mutex);
906 err = generic_osync_inode(inode, mapping, 906 err = generic_osync_inode(inode, mapping,
907 OSYNC_METADATA|OSYNC_DATA); 907 OSYNC_METADATA|OSYNC_DATA);
908 mutex_unlock(&inode->i_mutex); 908 mutex_unlock(&inode->i_mutex);
909 909
910 if (err) 910 if (err)
911 ret = err; 911 ret = err;
912 } 912 }
913 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 913 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
914 } 914 }
915 915
916 return ret; 916 return ret;
917 } 917 }
918 918
919 EXPORT_SYMBOL(generic_file_splice_write); 919 EXPORT_SYMBOL(generic_file_splice_write);
920 920
921 /** 921 /**
922 * generic_splice_sendpage - splice data from a pipe to a socket 922 * generic_splice_sendpage - splice data from a pipe to a socket
923 * @pipe: pipe to splice from 923 * @pipe: pipe to splice from
924 * @out: socket to write to 924 * @out: socket to write to
925 * @ppos: position in @out 925 * @ppos: position in @out
926 * @len: number of bytes to splice 926 * @len: number of bytes to splice
927 * @flags: splice modifier flags 927 * @flags: splice modifier flags
928 * 928 *
929 * Description: 929 * Description:
930 * Will send @len bytes from the pipe to a network socket. No data copying 930 * Will send @len bytes from the pipe to a network socket. No data copying
931 * is involved. 931 * is involved.
932 * 932 *
933 */ 933 */
934 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, 934 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
935 loff_t *ppos, size_t len, unsigned int flags) 935 loff_t *ppos, size_t len, unsigned int flags)
936 { 936 {
937 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); 937 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
938 } 938 }
939 939
940 EXPORT_SYMBOL(generic_splice_sendpage); 940 EXPORT_SYMBOL(generic_splice_sendpage);
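
Socket files reach this helper through their file_operations. A hedged outline of the hookup (the real table lives in net/socket.c; example_socket_file_ops is illustrative):

    static const struct file_operations example_socket_file_ops = {
            /* read/write/poll and friends elided */
            .sendpage       = sock_sendpage,
            .splice_write   = generic_splice_sendpage,
    };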
941 941
942 /* 942 /*
943 * Attempt to initiate a splice from pipe to file. 943 * Attempt to initiate a splice from pipe to file.
944 */ 944 */
945 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, 945 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
946 loff_t *ppos, size_t len, unsigned int flags) 946 loff_t *ppos, size_t len, unsigned int flags)
947 { 947 {
948 int ret; 948 int ret;
949 949
950 if (unlikely(!out->f_op || !out->f_op->splice_write)) 950 if (unlikely(!out->f_op || !out->f_op->splice_write))
951 return -EINVAL; 951 return -EINVAL;
952 952
953 if (unlikely(!(out->f_mode & FMODE_WRITE))) 953 if (unlikely(!(out->f_mode & FMODE_WRITE)))
954 return -EBADF; 954 return -EBADF;
955 955
956 ret = rw_verify_area(WRITE, out, ppos, len); 956 ret = rw_verify_area(WRITE, out, ppos, len);
957 if (unlikely(ret < 0)) 957 if (unlikely(ret < 0))
958 return ret; 958 return ret;
959 959
960 return out->f_op->splice_write(pipe, out, ppos, len, flags); 960 return out->f_op->splice_write(pipe, out, ppos, len, flags);
961 } 961 }
962 962
963 /* 963 /*
964 * Attempt to initiate a splice from a file to a pipe. 964 * Attempt to initiate a splice from a file to a pipe.
965 */ 965 */
966 static long do_splice_to(struct file *in, loff_t *ppos, 966 static long do_splice_to(struct file *in, loff_t *ppos,
967 struct pipe_inode_info *pipe, size_t len, 967 struct pipe_inode_info *pipe, size_t len,
968 unsigned int flags) 968 unsigned int flags)
969 { 969 {
970 int ret; 970 int ret;
971 971
972 if (unlikely(!in->f_op || !in->f_op->splice_read)) 972 if (unlikely(!in->f_op || !in->f_op->splice_read))
973 return -EINVAL; 973 return -EINVAL;
974 974
975 if (unlikely(!(in->f_mode & FMODE_READ))) 975 if (unlikely(!(in->f_mode & FMODE_READ)))
976 return -EBADF; 976 return -EBADF;
977 977
978 ret = rw_verify_area(READ, in, ppos, len); 978 ret = rw_verify_area(READ, in, ppos, len);
979 if (unlikely(ret < 0)) 979 if (unlikely(ret < 0))
980 return ret; 980 return ret;
981 981
982 return in->f_op->splice_read(in, ppos, pipe, len, flags); 982 return in->f_op->splice_read(in, ppos, pipe, len, flags);
983 } 983 }
984 984
985 /** 985 /**
986 * splice_direct_to_actor - splices data directly between two non-pipes 986 * splice_direct_to_actor - splices data directly between two non-pipes
987 * @in: file to splice from 987 * @in: file to splice from
988 * @sd: actor information on where to splice to 988 * @sd: actor information on where to splice to
989 * @actor: handles the data splicing 989 * @actor: handles the data splicing
990 * 990 *
991 * Description: 991 * Description:
992 * This is a special case helper to splice directly between two 992 * This is a special case helper to splice directly between two
993 * points, without requiring an explicit pipe. Internally an allocated 993 * points, without requiring an explicit pipe. Internally an allocated
994 * pipe is cached in the process, and reused during the life time of 994 * pipe is cached in the process, and reused during the life time of
995 * that process. 995 * that process.
996 * 996 *
997 */ 997 */
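
Before the implementation, a hedged sketch of what an @actor for this helper can look like: it simply drains the internal pipe into the output file (example_direct_actor is illustrative; do_splice_from() is the static helper defined above):

    static int example_direct_actor(struct pipe_inode_info *pipe,
                                    struct splice_desc *sd)
    {
            struct file *file = sd->u.file;

            return do_splice_from(pipe, file, &sd->pos, sd->total_len,
                                  sd->flags);
    }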
998 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, 998 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
999 splice_direct_actor *actor) 999 splice_direct_actor *actor)
1000 { 1000 {
1001 struct pipe_inode_info *pipe; 1001 struct pipe_inode_info *pipe;
1002 long ret, bytes; 1002 long ret, bytes;
1003 umode_t i_mode; 1003 umode_t i_mode;
1004 size_t len; 1004 size_t len;
1005 int i, flags; 1005 int i, flags;
1006 1006
1007 /* 1007 /*
1008 * We require the input to be a regular file, as we don't want to 1008 * We require the input to be a regular file, as we don't want to
1009 * randomly drop data for e.g. socket -> socket splicing. Use the 1009 * randomly drop data for e.g. socket -> socket splicing. Use the
1010 * piped splicing for that! 1010 * piped splicing for that!
1011 */ 1011 */
1012 i_mode = in->f_path.dentry->d_inode->i_mode; 1012 i_mode = in->f_path.dentry->d_inode->i_mode;
1013 if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) 1013 if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
1014 return -EINVAL; 1014 return -EINVAL;
1015 1015
1016 /* 1016 /*
1017 * Neither in nor out is a pipe, set up an internal pipe attached to 1017 * Neither in nor out is a pipe, set up an internal pipe attached to
1018 * 'out' and transfer the wanted data from 'in' to 'out' through that 1018 * 'out' and transfer the wanted data from 'in' to 'out' through that
1019 */ 1019 */
1020 pipe = current->splice_pipe; 1020 pipe = current->splice_pipe;
1021 if (unlikely(!pipe)) { 1021 if (unlikely(!pipe)) {
1022 pipe = alloc_pipe_info(NULL); 1022 pipe = alloc_pipe_info(NULL);
1023 if (!pipe) 1023 if (!pipe)
1024 return -ENOMEM; 1024 return -ENOMEM;
1025 1025
1026 /* 1026 /*
1027 * We don't have an immediate reader, but we'll read the stuff 1027 * We don't have an immediate reader, but we'll read the stuff
1028 * out of the pipe right after the splice_to_pipe(). So set 1028 * out of the pipe right after the splice_to_pipe(). So set
1029 * PIPE_READERS appropriately. 1029 * PIPE_READERS appropriately.
1030 */ 1030 */
1031 pipe->readers = 1; 1031 pipe->readers = 1;
1032 1032
1033 current->splice_pipe = pipe; 1033 current->splice_pipe = pipe;
1034 } 1034 }
1035 1035
1036 /* 1036 /*
1037 * Do the splice. 1037 * Do the splice.
1038 */ 1038 */
1039 ret = 0; 1039 ret = 0;
1040 bytes = 0; 1040 bytes = 0;
1041 len = sd->total_len; 1041 len = sd->total_len;
1042 flags = sd->flags; 1042 flags = sd->flags;
1043 1043
1044 /* 1044 /*
1045 * Don't block on output, we have to drain the direct pipe. 1045 * Don't block on output, we have to drain the direct pipe.
1046 */ 1046 */
1047 sd->flags &= ~SPLICE_F_NONBLOCK; 1047 sd->flags &= ~SPLICE_F_NONBLOCK;
1048 1048
1049 while (len) { 1049 while (len) {
1050 size_t read_len, max_read_len; 1050 size_t read_len, max_read_len;
1051 1051
1052 /* 1052 /*
1053 * Do at most PIPE_BUFFERS pages worth of transfer: 1053 * Do at most PIPE_BUFFERS pages worth of transfer:
1054 */ 1054 */
1055 max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); 1055 max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));
1056 1056
1057 ret = do_splice_to(in, &sd->pos, pipe, max_read_len, flags); 1057 ret = do_splice_to(in, &sd->pos, pipe, max_read_len, flags);
1058 if (unlikely(ret < 0)) 1058 if (unlikely(ret < 0))
1059 goto out_release; 1059 goto out_release;
1060 1060
1061 read_len = ret; 1061 read_len = ret;
1062 sd->total_len = read_len; 1062 sd->total_len = read_len;
1063 1063
1064 /* 1064 /*
1065 * NOTE: nonblocking mode only applies to the input. We 1065 * NOTE: nonblocking mode only applies to the input. We
1066 * must not do the output in nonblocking mode as then we 1066 * must not do the output in nonblocking mode as then we
1067 * could end up with data stuck in the internal pipe: 1067 * could end up with data stuck in the internal pipe:
1068 */ 1068 */
1069 ret = actor(pipe, sd); 1069 ret = actor(pipe, sd);
1070 if (unlikely(ret < 0)) 1070 if (unlikely(ret < 0))
1071 goto out_release; 1071 goto out_release;
1072 1072
1073 bytes += ret; 1073 bytes += ret;
1074 len -= ret; 1074 len -= ret;
1075 1075
1076 /* 1076 /*
1077 * In nonblocking mode, if we got back a short read then 1077 * In nonblocking mode, if we got back a short read then
1078 * that was due to either an IO error or to the 1078 * that was due to either an IO error or to the
1079 * pagecache entry not being there. In the IO error case 1079 * pagecache entry not being there. In the IO error case
1080 * the _next_ splice attempt will produce a clean IO error 1080 * the _next_ splice attempt will produce a clean IO error
1081 * return value (not a short read), so in both cases it's 1081 * return value (not a short read), so in both cases it's
1082 * correct to break out of the loop here: 1082 * correct to break out of the loop here:
1083 */ 1083 */
1084 if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len)) 1084 if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
1085 break; 1085 break;
1086 } 1086 }
1087 1087
1088 pipe->nrbufs = pipe->curbuf = 0; 1088 pipe->nrbufs = pipe->curbuf = 0;
1089 1089
1090 return bytes; 1090 return bytes;
1091 1091
1092 out_release: 1092 out_release:
1093 /* 1093 /*
1094 * If we did an incomplete transfer we must release 1094 * If we did an incomplete transfer we must release
1095 * the pipe buffers in question: 1095 * the pipe buffers in question:
1096 */ 1096 */
1097 for (i = 0; i < PIPE_BUFFERS; i++) { 1097 for (i = 0; i < PIPE_BUFFERS; i++) {
1098 struct pipe_buffer *buf = pipe->bufs + i; 1098 struct pipe_buffer *buf = pipe->bufs + i;
1099 1099
1100 if (buf->ops) { 1100 if (buf->ops) {
1101 buf->ops->release(pipe, buf); 1101 buf->ops->release(pipe, buf);
1102 buf->ops = NULL; 1102 buf->ops = NULL;
1103 } 1103 }
1104 } 1104 }
1105 pipe->nrbufs = pipe->curbuf = 0; 1105 pipe->nrbufs = pipe->curbuf = 0;
1106 1106
1107 /* 1107 /*
1108 * If we transferred some data, return the number of bytes: 1108 * If we transferred some data, return the number of bytes:
1109 */ 1109 */
1110 if (bytes > 0) 1110 if (bytes > 0)
1111 return bytes; 1111 return bytes;
1112 1112
1113 return ret; 1113 return ret;
1114 1114
1115 } 1115 }
1116 EXPORT_SYMBOL(splice_direct_to_actor); 1116 EXPORT_SYMBOL(splice_direct_to_actor);
1117 1117
1118 static int direct_splice_actor(struct pipe_inode_info *pipe, 1118 static int direct_splice_actor(struct pipe_inode_info *pipe,
1119 struct splice_desc *sd) 1119 struct splice_desc *sd)
1120 { 1120 {
1121 struct file *file = sd->u.file; 1121 struct file *file = sd->u.file;
1122 1122
1123 return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); 1123 return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
1124 } 1124 }
1125 1125
1126 /** 1126 /**
1127 * do_splice_direct - splices data directly between two files 1127 * do_splice_direct - splices data directly between two files
1128 * @in: file to splice from 1128 * @in: file to splice from
1129 * @ppos: input file offset 1129 * @ppos: input file offset
1130 * @out: file to splice to 1130 * @out: file to splice to
1131 * @len: number of bytes to splice 1131 * @len: number of bytes to splice
1132 * @flags: splice modifier flags 1132 * @flags: splice modifier flags
1133 * 1133 *
1134 * Description: 1134 * Description:
1135 * For use by do_sendfile(). splice can easily emulate sendfile, but 1135 * For use by do_sendfile(). splice can easily emulate sendfile, but
1136 * doing it in the application would incur an extra system call 1136 * doing it in the application would incur an extra system call
1137 * (splice in + splice out, as compared to just sendfile()). So this helper 1137 * (splice in + splice out, as compared to just sendfile()). So this helper
1138 * can splice directly through a process-private pipe. 1138 * can splice directly through a process-private pipe.
1139 * 1139 *
1140 */ 1140 */
1141 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 1141 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1142 size_t len, unsigned int flags) 1142 size_t len, unsigned int flags)
1143 { 1143 {
1144 struct splice_desc sd = { 1144 struct splice_desc sd = {
1145 .len = len, 1145 .len = len,
1146 .total_len = len, 1146 .total_len = len,
1147 .flags = flags, 1147 .flags = flags,
1148 .pos = *ppos, 1148 .pos = *ppos,
1149 .u.file = out, 1149 .u.file = out,
1150 }; 1150 };
1151 long ret; 1151 long ret;
1152 1152
1153 ret = splice_direct_to_actor(in, &sd, direct_splice_actor); 1153 ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1154 *ppos = sd.pos; 1154 *ppos = sd.pos;
1155 return ret; 1155 return ret;
1156 } 1156 }
1157 1157
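From userspace, the payoff of do_splice_direct() is that sendfile() moves file data in one syscall instead of a splice-in/splice-out pair. A minimal sketch, error handling trimmed:

/* Userspace sketch: copy argv[1] to argv[2] via sendfile(), which the
 * kernel services through do_splice_direct() and its cached pipe.
 */
#include <sys/sendfile.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int in = open(argv[1], O_RDONLY);
	int out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	struct stat st;
	off_t off = 0;

	if (in < 0 || out < 0 || fstat(in, &st) < 0)
		return 1;
	while (off < st.st_size)
		if (sendfile(out, in, &off, st.st_size - off) <= 0)
			return 1;
	return 0;
}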
1158 /* 1158 /*
1159 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same 1159 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1160 * location, so checking ->i_pipe is not enough to verify that this is a 1160 * location, so checking ->i_pipe is not enough to verify that this is a
1161 * pipe. 1161 * pipe.
1162 */ 1162 */
1163 static inline struct pipe_inode_info *pipe_info(struct inode *inode) 1163 static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1164 { 1164 {
1165 if (S_ISFIFO(inode->i_mode)) 1165 if (S_ISFIFO(inode->i_mode))
1166 return inode->i_pipe; 1166 return inode->i_pipe;
1167 1167
1168 return NULL; 1168 return NULL;
1169 } 1169 }
1170 1170
1171 /* 1171 /*
1172 * Determine where to splice to/from. 1172 * Determine where to splice to/from.
1173 */ 1173 */
1174 static long do_splice(struct file *in, loff_t __user *off_in, 1174 static long do_splice(struct file *in, loff_t __user *off_in,
1175 struct file *out, loff_t __user *off_out, 1175 struct file *out, loff_t __user *off_out,
1176 size_t len, unsigned int flags) 1176 size_t len, unsigned int flags)
1177 { 1177 {
1178 struct pipe_inode_info *pipe; 1178 struct pipe_inode_info *pipe;
1179 loff_t offset, *off; 1179 loff_t offset, *off;
1180 long ret; 1180 long ret;
1181 1181
1182 pipe = pipe_info(in->f_path.dentry->d_inode); 1182 pipe = pipe_info(in->f_path.dentry->d_inode);
1183 if (pipe) { 1183 if (pipe) {
1184 if (off_in) 1184 if (off_in)
1185 return -ESPIPE; 1185 return -ESPIPE;
1186 if (off_out) { 1186 if (off_out) {
1187 if (out->f_op->llseek == no_llseek) 1187 if (out->f_op->llseek == no_llseek)
1188 return -EINVAL; 1188 return -EINVAL;
1189 if (copy_from_user(&offset, off_out, sizeof(loff_t))) 1189 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1190 return -EFAULT; 1190 return -EFAULT;
1191 off = &offset; 1191 off = &offset;
1192 } else 1192 } else
1193 off = &out->f_pos; 1193 off = &out->f_pos;
1194 1194
1195 ret = do_splice_from(pipe, out, off, len, flags); 1195 ret = do_splice_from(pipe, out, off, len, flags);
1196 1196
1197 if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) 1197 if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1198 ret = -EFAULT; 1198 ret = -EFAULT;
1199 1199
1200 return ret; 1200 return ret;
1201 } 1201 }
1202 1202
1203 pipe = pipe_info(out->f_path.dentry->d_inode); 1203 pipe = pipe_info(out->f_path.dentry->d_inode);
1204 if (pipe) { 1204 if (pipe) {
1205 if (off_out) 1205 if (off_out)
1206 return -ESPIPE; 1206 return -ESPIPE;
1207 if (off_in) { 1207 if (off_in) {
1208 if (in->f_op->llseek == no_llseek) 1208 if (in->f_op->llseek == no_llseek)
1209 return -EINVAL; 1209 return -EINVAL;
1210 if (copy_from_user(&offset, off_in, sizeof(loff_t))) 1210 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1211 return -EFAULT; 1211 return -EFAULT;
1212 off = &offset; 1212 off = &offset;
1213 } else 1213 } else
1214 off = &in->f_pos; 1214 off = &in->f_pos;
1215 1215
1216 ret = do_splice_to(in, off, pipe, len, flags); 1216 ret = do_splice_to(in, off, pipe, len, flags);
1217 1217
1218 if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) 1218 if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1219 ret = -EFAULT; 1219 ret = -EFAULT;
1220 1220
1221 return ret; 1221 return ret;
1222 } 1222 }
1223 1223
1224 return -EINVAL; 1224 return -EINVAL;
1225 } 1225 }
1226 1226
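Note the asymmetry the code above enforces: an offset pointer is only accepted for the non-pipe side, otherwise the call fails with ESPIPE. A userspace sketch of the legal form:

/* Userspace sketch: splice from a file at an explicit offset into a
 * pipe. off_in addresses the file; off_out must be NULL since the
 * output side is a pipe.
 */
#define _GNU_SOURCE
#include <fcntl.h>

static ssize_t file_to_pipe(int file_fd, loff_t off, int pipe_wr, size_t len)
{
	return splice(file_fd, &off, pipe_wr, NULL, len, 0);
}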
1227 /* 1227 /*
1228 * Map an iov into an array of pages and offset/length tuples. With the 1228 * Map an iov into an array of pages and offset/length tuples. With the
1229 * partial_page structure, we can map several non-contiguous ranges into 1229 * partial_page structure, we can map several non-contiguous ranges into
1230 * our one pages[] map instead of splitting that operation into pieces. 1230 * our one pages[] map instead of splitting that operation into pieces.
1231 * Could easily be exported as a generic helper for other users, in which 1231 * Could easily be exported as a generic helper for other users, in which
1232 * case one would probably want to add a 'max_nr_pages' parameter as well. 1232 * case one would probably want to add a 'max_nr_pages' parameter as well.
1233 */ 1233 */
1234 static int get_iovec_page_array(const struct iovec __user *iov, 1234 static int get_iovec_page_array(const struct iovec __user *iov,
1235 unsigned int nr_vecs, struct page **pages, 1235 unsigned int nr_vecs, struct page **pages,
1236 struct partial_page *partial, int aligned) 1236 struct partial_page *partial, int aligned)
1237 { 1237 {
1238 int buffers = 0, error = 0; 1238 int buffers = 0, error = 0;
1239 1239
1240 /* 1240 /*
1241 * It's ok to take the mmap_sem for reading, even 1241 * It's ok to take the mmap_sem for reading, even
1242 * across a "get_user()". 1242 * across a "get_user()".
1243 */ 1243 */
1244 down_read(&current->mm->mmap_sem); 1244 down_read(&current->mm->mmap_sem);
1245 1245
1246 while (nr_vecs) { 1246 while (nr_vecs) {
1247 unsigned long off, npages; 1247 unsigned long off, npages;
1248 void __user *base; 1248 void __user *base;
1249 size_t len; 1249 size_t len;
1250 int i; 1250 int i;
1251 1251
1252 /* 1252 /*
1253 * Get user address base and length for this iovec. 1253 * Get user address base and length for this iovec.
1254 */ 1254 */
1255 error = get_user(base, &iov->iov_base); 1255 error = get_user(base, &iov->iov_base);
1256 if (unlikely(error)) 1256 if (unlikely(error))
1257 break; 1257 break;
1258 error = get_user(len, &iov->iov_len); 1258 error = get_user(len, &iov->iov_len);
1259 if (unlikely(error)) 1259 if (unlikely(error))
1260 break; 1260 break;
1261 1261
1262 /* 1262 /*
1263 * Sanity check this iovec. 0 read succeeds. 1263 * Sanity check this iovec. 0 read succeeds.
1264 */ 1264 */
1265 if (unlikely(!len)) 1265 if (unlikely(!len))
1266 break; 1266 break;
1267 error = -EFAULT; 1267 error = -EFAULT;
1268 if (unlikely(!base)) 1268 if (unlikely(!base))
1269 break; 1269 break;
1270 1270
1271 /* 1271 /*
1272 * Get this base offset and number of pages, then map 1272 * Get this base offset and number of pages, then map
1273 * in the user pages. 1273 * in the user pages.
1274 */ 1274 */
1275 off = (unsigned long) base & ~PAGE_MASK; 1275 off = (unsigned long) base & ~PAGE_MASK;
1276 1276
1277 /* 1277 /*
1278 * If asked for alignment, the offset must be zero and the 1278 * If asked for alignment, the offset must be zero and the
1279 * length a multiple of PAGE_SIZE. 1279 * length a multiple of PAGE_SIZE.
1280 */ 1280 */
1281 error = -EINVAL; 1281 error = -EINVAL;
1282 if (aligned && (off || len & ~PAGE_MASK)) 1282 if (aligned && (off || len & ~PAGE_MASK))
1283 break; 1283 break;
1284 1284
1285 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1285 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1286 if (npages > PIPE_BUFFERS - buffers) 1286 if (npages > PIPE_BUFFERS - buffers)
1287 npages = PIPE_BUFFERS - buffers; 1287 npages = PIPE_BUFFERS - buffers;
1288 1288
1289 error = get_user_pages(current, current->mm, 1289 error = get_user_pages(current, current->mm,
1290 (unsigned long) base, npages, 0, 0, 1290 (unsigned long) base, npages, 0, 0,
1291 &pages[buffers], NULL); 1291 &pages[buffers], NULL);
1292 1292
1293 if (unlikely(error <= 0)) 1293 if (unlikely(error <= 0))
1294 break; 1294 break;
1295 1295
1296 /* 1296 /*
1297 * Fill this contiguous range into the partial page map. 1297 * Fill this contiguous range into the partial page map.
1298 */ 1298 */
1299 for (i = 0; i < error; i++) { 1299 for (i = 0; i < error; i++) {
1300 const int plen = min_t(size_t, len, PAGE_SIZE - off); 1300 const int plen = min_t(size_t, len, PAGE_SIZE - off);
1301 1301
1302 partial[buffers].offset = off; 1302 partial[buffers].offset = off;
1303 partial[buffers].len = plen; 1303 partial[buffers].len = plen;
1304 1304
1305 off = 0; 1305 off = 0;
1306 len -= plen; 1306 len -= plen;
1307 buffers++; 1307 buffers++;
1308 } 1308 }
1309 1309
1310 /* 1310 /*
1311 * We didn't complete this iov, stop here since it probably 1311 * We didn't complete this iov, stop here since it probably
1312 * means we have to move some of this into a pipe to 1312 * means we have to move some of this into a pipe to
1313 * be able to continue. 1313 * be able to continue.
1314 */ 1314 */
1315 if (len) 1315 if (len)
1316 break; 1316 break;
1317 1317
1318 /* 1318 /*
1319 * Don't continue if we mapped fewer pages than we asked for, 1319 * Don't continue if we mapped fewer pages than we asked for,
1320 * or if we mapped the max number of pages that we have 1320 * or if we mapped the max number of pages that we have
1321 * room for. 1321 * room for.
1322 */ 1322 */
1323 if (error < npages || buffers == PIPE_BUFFERS) 1323 if (error < npages || buffers == PIPE_BUFFERS)
1324 break; 1324 break;
1325 1325
1326 nr_vecs--; 1326 nr_vecs--;
1327 iov++; 1327 iov++;
1328 } 1328 }
1329 1329
1330 up_read(&current->mm->mmap_sem); 1330 up_read(&current->mm->mmap_sem);
1331 1331
1332 if (buffers) 1332 if (buffers)
1333 return buffers; 1333 return buffers;
1334 1334
1335 return error; 1335 return error;
1336 } 1336 }
1337 1337
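The npages computation above rounds the byte range [off, off + len) up to whole pages. A worked standalone example, assuming 4 KiB pages: a 5000-byte iovec starting 256 bytes into a page needs (256 + 5000 + 4095) >> 12 = 2 pages.

/* Standalone illustration of the offset/page-count arithmetic used in
 * get_iovec_page_array(); constants mirror a 4 KiB page size.
 */
#include <stdio.h>

#define EX_PAGE_SHIFT	12
#define EX_PAGE_SIZE	(1UL << EX_PAGE_SHIFT)
#define EX_PAGE_MASK	(~(EX_PAGE_SIZE - 1))

int main(void)
{
	unsigned long base = 0x400100;	/* 256 bytes into a page */
	unsigned long len = 5000;
	unsigned long off = base & ~EX_PAGE_MASK;
	unsigned long npages = (off + len + EX_PAGE_SIZE - 1) >> EX_PAGE_SHIFT;

	printf("off=%lu npages=%lu\n", off, npages);	/* off=256 npages=2 */
	return 0;
}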
1338 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 1338 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1339 struct splice_desc *sd) 1339 struct splice_desc *sd)
1340 { 1340 {
1341 char *src; 1341 char *src;
1342 int ret; 1342 int ret;
1343 1343
1344 ret = buf->ops->pin(pipe, buf); 1344 ret = buf->ops->confirm(pipe, buf);
1345 if (unlikely(ret)) 1345 if (unlikely(ret))
1346 return ret; 1346 return ret;
1347 1347
1348 /* 1348 /*
1349 * See if we can use the atomic maps, by prefaulting in the 1349 * See if we can use the atomic maps, by prefaulting in the
1350 * pages and doing an atomic copy 1350 * pages and doing an atomic copy
1351 */ 1351 */
1352 if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { 1352 if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
1353 src = buf->ops->map(pipe, buf, 1); 1353 src = buf->ops->map(pipe, buf, 1);
1354 ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, 1354 ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
1355 sd->len); 1355 sd->len);
1356 buf->ops->unmap(pipe, buf, src); 1356 buf->ops->unmap(pipe, buf, src);
1357 if (!ret) { 1357 if (!ret) {
1358 ret = sd->len; 1358 ret = sd->len;
1359 goto out; 1359 goto out;
1360 } 1360 }
1361 } 1361 }
1362 1362
1363 /* 1363 /*
1364 * No dice, use slow non-atomic map and copy 1364 * No dice, use slow non-atomic map and copy
1365 */ 1365 */
1366 src = buf->ops->map(pipe, buf, 0); 1366 src = buf->ops->map(pipe, buf, 0);
1367 1367
1368 ret = sd->len; 1368 ret = sd->len;
1369 if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) 1369 if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
1370 ret = -EFAULT; 1370 ret = -EFAULT;
1371 1371
1372 out: 1372 out:
1373 if (ret > 0) 1373 if (ret > 0)
1374 sd->u.userptr += ret; 1374 sd->u.userptr += ret;
1375 buf->ops->unmap(pipe, buf, src); 1375 buf->ops->unmap(pipe, buf, src);
1376 return ret; 1376 return ret;
1377 } 1377 }
1378 1378
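pipe_to_user() shows the intended nesting of the renamed hook: ->confirm() before any ->map(), and every ->map() paired with an ->unmap(). As a stripped-down distillation (hypothetical helper, slow path only):

/* Hypothetical distillation of the ordering above: confirm that the
 * buffer holds good data, then map, copy and unmap.
 */
#include <linux/pipe_fs_i.h>
#include <asm/uaccess.h>

static int copy_one_buf(struct pipe_inode_info *pipe,
			struct pipe_buffer *buf, void __user *to)
{
	char *src;
	int err;

	err = buf->ops->confirm(pipe, buf);
	if (err)
		return err;

	src = buf->ops->map(pipe, buf, 0);	/* non-atomic map */
	err = copy_to_user(to, src + buf->offset, buf->len) ? -EFAULT : 0;
	buf->ops->unmap(pipe, buf, src);
	return err;
}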
1379 /* 1379 /*
1380 * For lack of a better implementation, implement vmsplice() to userspace 1380 * For lack of a better implementation, implement vmsplice() to userspace
1381 * as a simple copy of the pipe's pages to the user iov. 1381 * as a simple copy of the pipe's pages to the user iov.
1382 */ 1382 */
1383 static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, 1383 static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1384 unsigned long nr_segs, unsigned int flags) 1384 unsigned long nr_segs, unsigned int flags)
1385 { 1385 {
1386 struct pipe_inode_info *pipe; 1386 struct pipe_inode_info *pipe;
1387 struct splice_desc sd; 1387 struct splice_desc sd;
1388 ssize_t size; 1388 ssize_t size;
1389 int error; 1389 int error;
1390 long ret; 1390 long ret;
1391 1391
1392 pipe = pipe_info(file->f_path.dentry->d_inode); 1392 pipe = pipe_info(file->f_path.dentry->d_inode);
1393 if (!pipe) 1393 if (!pipe)
1394 return -EBADF; 1394 return -EBADF;
1395 1395
1396 if (pipe->inode) 1396 if (pipe->inode)
1397 mutex_lock(&pipe->inode->i_mutex); 1397 mutex_lock(&pipe->inode->i_mutex);
1398 1398
1399 error = ret = 0; 1399 error = ret = 0;
1400 while (nr_segs) { 1400 while (nr_segs) {
1401 void __user *base; 1401 void __user *base;
1402 size_t len; 1402 size_t len;
1403 1403
1404 /* 1404 /*
1405 * Get user address base and length for this iovec. 1405 * Get user address base and length for this iovec.
1406 */ 1406 */
1407 error = get_user(base, &iov->iov_base); 1407 error = get_user(base, &iov->iov_base);
1408 if (unlikely(error)) 1408 if (unlikely(error))
1409 break; 1409 break;
1410 error = get_user(len, &iov->iov_len); 1410 error = get_user(len, &iov->iov_len);
1411 if (unlikely(error)) 1411 if (unlikely(error))
1412 break; 1412 break;
1413 1413
1414 /* 1414 /*
1415 * Sanity check this iovec. 0 read succeeds. 1415 * Sanity check this iovec. 0 read succeeds.
1416 */ 1416 */
1417 if (unlikely(!len)) 1417 if (unlikely(!len))
1418 break; 1418 break;
1419 if (unlikely(!base)) { 1419 if (unlikely(!base)) {
1420 error = -EFAULT; 1420 error = -EFAULT;
1421 break; 1421 break;
1422 } 1422 }
1423 1423
1424 sd.len = 0; 1424 sd.len = 0;
1425 sd.total_len = len; 1425 sd.total_len = len;
1426 sd.flags = flags; 1426 sd.flags = flags;
1427 sd.u.userptr = base; 1427 sd.u.userptr = base;
1428 sd.pos = 0; 1428 sd.pos = 0;
1429 1429
1430 size = __splice_from_pipe(pipe, &sd, pipe_to_user); 1430 size = __splice_from_pipe(pipe, &sd, pipe_to_user);
1431 if (size < 0) { 1431 if (size < 0) {
1432 if (!ret) 1432 if (!ret)
1433 ret = size; 1433 ret = size;
1434 1434
1435 break; 1435 break;
1436 } 1436 }
1437 1437
1438 ret += size; 1438 ret += size;
1439 1439
1440 if (size < len) 1440 if (size < len)
1441 break; 1441 break;
1442 1442
1443 nr_segs--; 1443 nr_segs--;
1444 iov++; 1444 iov++;
1445 } 1445 }
1446 1446
1447 if (pipe->inode) 1447 if (pipe->inode)
1448 mutex_unlock(&pipe->inode->i_mutex); 1448 mutex_unlock(&pipe->inode->i_mutex);
1449 1449
1450 if (!ret) 1450 if (!ret)
1451 ret = error; 1451 ret = error;
1452 1452
1453 return ret; 1453 return ret;
1454 } 1454 }
1455 1455
1456 /* 1456 /*
1457 * vmsplice splices a user address range into a pipe. It can be thought of 1457 * vmsplice splices a user address range into a pipe. It can be thought of
1458 * as splice-from-memory, where the regular splice is splice-from-file (or 1458 * as splice-from-memory, where the regular splice is splice-from-file (or
1459 * to file). In both cases the output is a pipe, naturally. 1459 * to file). In both cases the output is a pipe, naturally.
1460 */ 1460 */
1461 static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, 1461 static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1462 unsigned long nr_segs, unsigned int flags) 1462 unsigned long nr_segs, unsigned int flags)
1463 { 1463 {
1464 struct pipe_inode_info *pipe; 1464 struct pipe_inode_info *pipe;
1465 struct page *pages[PIPE_BUFFERS]; 1465 struct page *pages[PIPE_BUFFERS];
1466 struct partial_page partial[PIPE_BUFFERS]; 1466 struct partial_page partial[PIPE_BUFFERS];
1467 struct splice_pipe_desc spd = { 1467 struct splice_pipe_desc spd = {
1468 .pages = pages, 1468 .pages = pages,
1469 .partial = partial, 1469 .partial = partial,
1470 .flags = flags, 1470 .flags = flags,
1471 .ops = &user_page_pipe_buf_ops, 1471 .ops = &user_page_pipe_buf_ops,
1472 }; 1472 };
1473 1473
1474 pipe = pipe_info(file->f_path.dentry->d_inode); 1474 pipe = pipe_info(file->f_path.dentry->d_inode);
1475 if (!pipe) 1475 if (!pipe)
1476 return -EBADF; 1476 return -EBADF;
1477 1477
1478 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, 1478 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
1479 flags & SPLICE_F_GIFT); 1479 flags & SPLICE_F_GIFT);
1480 if (spd.nr_pages <= 0) 1480 if (spd.nr_pages <= 0)
1481 return spd.nr_pages; 1481 return spd.nr_pages;
1482 1482
1483 return splice_to_pipe(pipe, &spd); 1483 return splice_to_pipe(pipe, &spd);
1484 } 1484 }
1485 1485
1486 /* 1486 /*
1487 * Note that vmsplice only really supports true splicing _from_ user memory 1487 * Note that vmsplice only really supports true splicing _from_ user memory
1488 * to a pipe, not the other way around. Splicing from user memory is a simple 1488 * to a pipe, not the other way around. Splicing from user memory is a simple
1489 * operation that can be supported without any funky alignment restrictions 1489 * operation that can be supported without any funky alignment restrictions
1490 * or nasty vm tricks. We simply map in the user pages and fill them into 1490 * or nasty vm tricks. We simply map in the user pages and fill them into
1491 * a pipe. The reverse isn't quite as easy, though. There are two possible 1491 * a pipe. The reverse isn't quite as easy, though. There are two possible
1492 * solutions for that: 1492 * solutions for that:
1493 * 1493 *
1494 * - memcpy() the data internally, at which point we might as well just 1494 * - memcpy() the data internally, at which point we might as well just
1495 * do a regular read() on the buffer anyway. 1495 * do a regular read() on the buffer anyway.
1496 * - Lots of nasty vm tricks that are neither fast nor flexible (they 1496 * - Lots of nasty vm tricks that are neither fast nor flexible (they
1497 * impose restrictions on both ends of the pipe). 1497 * impose restrictions on both ends of the pipe).
1498 * 1498 *
1499 * Currently we punt and implement it as a normal copy, see pipe_to_user(). 1499 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1500 * 1500 *
1501 */ 1501 */
1502 asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, 1502 asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
1503 unsigned long nr_segs, unsigned int flags) 1503 unsigned long nr_segs, unsigned int flags)
1504 { 1504 {
1505 struct file *file; 1505 struct file *file;
1506 long error; 1506 long error;
1507 int fput; 1507 int fput;
1508 1508
1509 if (unlikely(nr_segs > UIO_MAXIOV)) 1509 if (unlikely(nr_segs > UIO_MAXIOV))
1510 return -EINVAL; 1510 return -EINVAL;
1511 else if (unlikely(!nr_segs)) 1511 else if (unlikely(!nr_segs))
1512 return 0; 1512 return 0;
1513 1513
1514 error = -EBADF; 1514 error = -EBADF;
1515 file = fget_light(fd, &fput); 1515 file = fget_light(fd, &fput);
1516 if (file) { 1516 if (file) {
1517 if (file->f_mode & FMODE_WRITE) 1517 if (file->f_mode & FMODE_WRITE)
1518 error = vmsplice_to_pipe(file, iov, nr_segs, flags); 1518 error = vmsplice_to_pipe(file, iov, nr_segs, flags);
1519 else if (file->f_mode & FMODE_READ) 1519 else if (file->f_mode & FMODE_READ)
1520 error = vmsplice_to_user(file, iov, nr_segs, flags); 1520 error = vmsplice_to_user(file, iov, nr_segs, flags);
1521 1521
1522 fput_light(file, fput); 1522 fput_light(file, fput);
1523 } 1523 }
1524 1524
1525 return error; 1525 return error;
1526 } 1526 }
1527 1527
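A userspace sketch of the write-side path above: vmsplice() a page-aligned buffer into a pipe, gifting the pages so a downstream consumer may steal them (the alignment requirement comes from the SPLICE_F_GIFT handling in get_iovec_page_array()):

/* Userspace sketch: gift one page-aligned buffer to a pipe. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/uio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int pfd[2];
	struct iovec iov;
	void *buf;

	if (pipe(pfd) < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 'x', 4096);

	iov.iov_base = buf;
	iov.iov_len = 4096;
	return vmsplice(pfd[1], &iov, 1, SPLICE_F_GIFT) == 4096 ? 0 : 1;
}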
1528 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, 1528 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
1529 int fd_out, loff_t __user *off_out, 1529 int fd_out, loff_t __user *off_out,
1530 size_t len, unsigned int flags) 1530 size_t len, unsigned int flags)
1531 { 1531 {
1532 long error; 1532 long error;
1533 struct file *in, *out; 1533 struct file *in, *out;
1534 int fput_in, fput_out; 1534 int fput_in, fput_out;
1535 1535
1536 if (unlikely(!len)) 1536 if (unlikely(!len))
1537 return 0; 1537 return 0;
1538 1538
1539 error = -EBADF; 1539 error = -EBADF;
1540 in = fget_light(fd_in, &fput_in); 1540 in = fget_light(fd_in, &fput_in);
1541 if (in) { 1541 if (in) {
1542 if (in->f_mode & FMODE_READ) { 1542 if (in->f_mode & FMODE_READ) {
1543 out = fget_light(fd_out, &fput_out); 1543 out = fget_light(fd_out, &fput_out);
1544 if (out) { 1544 if (out) {
1545 if (out->f_mode & FMODE_WRITE) 1545 if (out->f_mode & FMODE_WRITE)
1546 error = do_splice(in, off_in, 1546 error = do_splice(in, off_in,
1547 out, off_out, 1547 out, off_out,
1548 len, flags); 1548 len, flags);
1549 fput_light(out, fput_out); 1549 fput_light(out, fput_out);
1550 } 1550 }
1551 } 1551 }
1552 1552
1553 fput_light(in, fput_in); 1553 fput_light(in, fput_in);
1554 } 1554 }
1555 1555
1556 return error; 1556 return error;
1557 } 1557 }
1558 1558
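Putting sys_splice() to work from userspace: the classic pattern relays a file to another descriptor through a pipe, two splice() calls per chunk. A hedged sketch, cleanup on the error paths trimmed:

/* Userspace sketch: relay len bytes from in_fd (a file) to out_fd
 * (e.g. a socket) through a freshly created pipe.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static int relay(int in_fd, int out_fd, size_t len)
{
	int pfd[2];

	if (pipe(pfd) < 0)
		return -1;

	while (len) {
		ssize_t n = splice(in_fd, NULL, pfd[1], NULL, len, 0);
		if (n <= 0)
			break;			/* EOF or error */
		len -= n;
		while (n > 0) {			/* drain the pipe */
			ssize_t m = splice(pfd[0], NULL, out_fd, NULL, n, 0);
			if (m <= 0)
				return -1;
			n -= m;
		}
	}
	close(pfd[0]);
	close(pfd[1]);
	return len ? -1 : 0;
}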
1559 /* 1559 /*
1560 * Make sure there's data to read. Wait for input if we can, otherwise 1560 * Make sure there's data to read. Wait for input if we can, otherwise
1561 * return an appropriate error. 1561 * return an appropriate error.
1562 */ 1562 */
1563 static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1563 static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1564 { 1564 {
1565 int ret; 1565 int ret;
1566 1566
1567 /* 1567 /*
1568 * Check ->nrbufs without the inode lock first. This function 1568 * Check ->nrbufs without the inode lock first. This function
1569 * is speculative anyway, so missing one is ok. 1569 * is speculative anyway, so missing one is ok.
1570 */ 1570 */
1571 if (pipe->nrbufs) 1571 if (pipe->nrbufs)
1572 return 0; 1572 return 0;
1573 1573
1574 ret = 0; 1574 ret = 0;
1575 mutex_lock(&pipe->inode->i_mutex); 1575 mutex_lock(&pipe->inode->i_mutex);
1576 1576
1577 while (!pipe->nrbufs) { 1577 while (!pipe->nrbufs) {
1578 if (signal_pending(current)) { 1578 if (signal_pending(current)) {
1579 ret = -ERESTARTSYS; 1579 ret = -ERESTARTSYS;
1580 break; 1580 break;
1581 } 1581 }
1582 if (!pipe->writers) 1582 if (!pipe->writers)
1583 break; 1583 break;
1584 if (!pipe->waiting_writers) { 1584 if (!pipe->waiting_writers) {
1585 if (flags & SPLICE_F_NONBLOCK) { 1585 if (flags & SPLICE_F_NONBLOCK) {
1586 ret = -EAGAIN; 1586 ret = -EAGAIN;
1587 break; 1587 break;
1588 } 1588 }
1589 } 1589 }
1590 pipe_wait(pipe); 1590 pipe_wait(pipe);
1591 } 1591 }
1592 1592
1593 mutex_unlock(&pipe->inode->i_mutex); 1593 mutex_unlock(&pipe->inode->i_mutex);
1594 return ret; 1594 return ret;
1595 } 1595 }
1596 1596
1597 /* 1597 /*
1598 * Make sure there's writable room. Wait for room if we can, otherwise 1598 * Make sure there's writable room. Wait for room if we can, otherwise
1599 * return an appropriate error. 1599 * return an appropriate error.
1600 */ 1600 */
1601 static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1601 static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1602 { 1602 {
1603 int ret; 1603 int ret;
1604 1604
1605 /* 1605 /*
1606 * Check ->nrbufs without the inode lock first. This function 1606 * Check ->nrbufs without the inode lock first. This function
1607 * is speculative anyway, so missing one is ok. 1607 * is speculative anyway, so missing one is ok.
1608 */ 1608 */
1609 if (pipe->nrbufs < PIPE_BUFFERS) 1609 if (pipe->nrbufs < PIPE_BUFFERS)
1610 return 0; 1610 return 0;
1611 1611
1612 ret = 0; 1612 ret = 0;
1613 mutex_lock(&pipe->inode->i_mutex); 1613 mutex_lock(&pipe->inode->i_mutex);
1614 1614
1615 while (pipe->nrbufs >= PIPE_BUFFERS) { 1615 while (pipe->nrbufs >= PIPE_BUFFERS) {
1616 if (!pipe->readers) { 1616 if (!pipe->readers) {
1617 send_sig(SIGPIPE, current, 0); 1617 send_sig(SIGPIPE, current, 0);
1618 ret = -EPIPE; 1618 ret = -EPIPE;
1619 break; 1619 break;
1620 } 1620 }
1621 if (flags & SPLICE_F_NONBLOCK) { 1621 if (flags & SPLICE_F_NONBLOCK) {
1622 ret = -EAGAIN; 1622 ret = -EAGAIN;
1623 break; 1623 break;
1624 } 1624 }
1625 if (signal_pending(current)) { 1625 if (signal_pending(current)) {
1626 ret = -ERESTARTSYS; 1626 ret = -ERESTARTSYS;
1627 break; 1627 break;
1628 } 1628 }
1629 pipe->waiting_writers++; 1629 pipe->waiting_writers++;
1630 pipe_wait(pipe); 1630 pipe_wait(pipe);
1631 pipe->waiting_writers--; 1631 pipe->waiting_writers--;
1632 } 1632 }
1633 1633
1634 mutex_unlock(&pipe->inode->i_mutex); 1634 mutex_unlock(&pipe->inode->i_mutex);
1635 return ret; 1635 return ret;
1636 } 1636 }
1637 1637
1638 /* 1638 /*
1639 * Link contents of ipipe to opipe. 1639 * Link contents of ipipe to opipe.
1640 */ 1640 */
1641 static int link_pipe(struct pipe_inode_info *ipipe, 1641 static int link_pipe(struct pipe_inode_info *ipipe,
1642 struct pipe_inode_info *opipe, 1642 struct pipe_inode_info *opipe,
1643 size_t len, unsigned int flags) 1643 size_t len, unsigned int flags)
1644 { 1644 {
1645 struct pipe_buffer *ibuf, *obuf; 1645 struct pipe_buffer *ibuf, *obuf;
1646 int ret = 0, i = 0, nbuf; 1646 int ret = 0, i = 0, nbuf;
1647 1647
1648 /* 1648 /*
1649 * Potential ABBA deadlock; work around it by ordering lock 1649 * Potential ABBA deadlock; work around it by ordering lock
1650 * acquisition by inode address. Otherwise two different processes 1650 * acquisition by inode address. Otherwise two different processes
1651 * could deadlock (one doing tee from A -> B, the other from B -> A). 1651 * could deadlock (one doing tee from A -> B, the other from B -> A).
1652 */ 1652 */
1653 inode_double_lock(ipipe->inode, opipe->inode); 1653 inode_double_lock(ipipe->inode, opipe->inode);
1654 1654
1655 do { 1655 do {
1656 if (!opipe->readers) { 1656 if (!opipe->readers) {
1657 send_sig(SIGPIPE, current, 0); 1657 send_sig(SIGPIPE, current, 0);
1658 if (!ret) 1658 if (!ret)
1659 ret = -EPIPE; 1659 ret = -EPIPE;
1660 break; 1660 break;
1661 } 1661 }
1662 1662
1663 /* 1663 /*
1664 * If we have iterated all input buffers or run out of 1664 * If we have iterated all input buffers or run out of
1665 * output room, break. 1665 * output room, break.
1666 */ 1666 */
1667 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) 1667 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
1668 break; 1668 break;
1669 1669
1670 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); 1670 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
1671 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); 1671 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
1672 1672
1673 /* 1673 /*
1674 * Get a reference to this pipe buffer, 1674 * Get a reference to this pipe buffer,
1675 * so we can copy the contents over. 1675 * so we can copy the contents over.
1676 */ 1676 */
1677 ibuf->ops->get(ipipe, ibuf); 1677 ibuf->ops->get(ipipe, ibuf);
1678 1678
1679 obuf = opipe->bufs + nbuf; 1679 obuf = opipe->bufs + nbuf;
1680 *obuf = *ibuf; 1680 *obuf = *ibuf;
1681 1681
1682 /* 1682 /*
1683 * Don't inherit the gift flag, we need to 1683 * Don't inherit the gift flag, we need to
1684 * prevent multiple steals of this page. 1684 * prevent multiple steals of this page.
1685 */ 1685 */
1686 obuf->flags &= ~PIPE_BUF_FLAG_GIFT; 1686 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1687 1687
1688 if (obuf->len > len) 1688 if (obuf->len > len)
1689 obuf->len = len; 1689 obuf->len = len;
1690 1690
1691 opipe->nrbufs++; 1691 opipe->nrbufs++;
1692 ret += obuf->len; 1692 ret += obuf->len;
1693 len -= obuf->len; 1693 len -= obuf->len;
1694 i++; 1694 i++;
1695 } while (len); 1695 } while (len);
1696 1696
1697 inode_double_unlock(ipipe->inode, opipe->inode); 1697 inode_double_unlock(ipipe->inode, opipe->inode);
1698 1698
1699 /* 1699 /*
1700 * If we put data in the output pipe, wake up any potential readers. 1700 * If we put data in the output pipe, wake up any potential readers.
1701 */ 1701 */
1702 if (ret > 0) { 1702 if (ret > 0) {
1703 smp_mb(); 1703 smp_mb();
1704 if (waitqueue_active(&opipe->wait)) 1704 if (waitqueue_active(&opipe->wait))
1705 wake_up_interruptible(&opipe->wait); 1705 wake_up_interruptible(&opipe->wait);
1706 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); 1706 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1707 } 1707 }
1708 1708
1709 return ret; 1709 return ret;
1710 } 1710 }
1711 1711
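The ring indexing above depends on PIPE_BUFFERS being a power of two, so masking with (PIPE_BUFFERS - 1) wraps indices like a modulo at a fraction of the cost. A standalone check:

/* Standalone illustration: power-of-two masking equals modulo. */
#include <assert.h>

#define RING_SIZE 16	/* mirrors PIPE_BUFFERS */

int main(void)
{
	unsigned int curbuf = 14, i;

	for (i = 0; i < RING_SIZE; i++)
		assert(((curbuf + i) & (RING_SIZE - 1)) ==
		       (curbuf + i) % RING_SIZE);
	return 0;
}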
1712 /* 1712 /*
1713 * This is a tee(1) implementation that works on pipes. It doesn't copy 1713 * This is a tee(1) implementation that works on pipes. It doesn't copy
1714 * any data, it simply references the 'in' pages on the 'out' pipe. 1714 * any data, it simply references the 'in' pages on the 'out' pipe.
1715 * The 'flags' used are the SPLICE_F_* variants, currently the only 1715 * The 'flags' used are the SPLICE_F_* variants, currently the only
1716 * applicable one is SPLICE_F_NONBLOCK. 1716 * applicable one is SPLICE_F_NONBLOCK.
1717 */ 1717 */
1718 static long do_tee(struct file *in, struct file *out, size_t len, 1718 static long do_tee(struct file *in, struct file *out, size_t len,
1719 unsigned int flags) 1719 unsigned int flags)
1720 { 1720 {
1721 struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); 1721 struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
1722 struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); 1722 struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
1723 int ret = -EINVAL; 1723 int ret = -EINVAL;
1724 1724
1725 /* 1725 /*
1726 * Duplicate the contents of ipipe to opipe without actually 1726 * Duplicate the contents of ipipe to opipe without actually
1727 * copying the data. 1727 * copying the data.
1728 */ 1728 */
1729 if (ipipe && opipe && ipipe != opipe) { 1729 if (ipipe && opipe && ipipe != opipe) {
1730 /* 1730 /*
1731 * Keep going, unless we encounter an error. The ipipe/opipe 1731 * Keep going, unless we encounter an error. The ipipe/opipe
1732 * ordering doesn't really matter. 1732 * ordering doesn't really matter.
1733 */ 1733 */
1734 ret = link_ipipe_prep(ipipe, flags); 1734 ret = link_ipipe_prep(ipipe, flags);
1735 if (!ret) { 1735 if (!ret) {
1736 ret = link_opipe_prep(opipe, flags); 1736 ret = link_opipe_prep(opipe, flags);
1737 if (!ret) { 1737 if (!ret) {
1738 ret = link_pipe(ipipe, opipe, len, flags); 1738 ret = link_pipe(ipipe, opipe, len, flags);
1739 if (!ret && (flags & SPLICE_F_NONBLOCK)) 1739 if (!ret && (flags & SPLICE_F_NONBLOCK))
1740 ret = -EAGAIN; 1740 ret = -EAGAIN;
1741 } 1741 }
1742 } 1742 }
1743 } 1743 }
1744 1744
1745 return ret; 1745 return ret;
1746 } 1746 }
1747 1747
1748 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) 1748 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
1749 { 1749 {
1750 struct file *in; 1750 struct file *in;
1751 int error, fput_in; 1751 int error, fput_in;
1752 1752
1753 if (unlikely(!len)) 1753 if (unlikely(!len))
1754 return 0; 1754 return 0;
1755 1755
1756 error = -EBADF; 1756 error = -EBADF;
1757 in = fget_light(fdin, &fput_in); 1757 in = fget_light(fdin, &fput_in);
1758 if (in) { 1758 if (in) {
1759 if (in->f_mode & FMODE_READ) { 1759 if (in->f_mode & FMODE_READ) {
1760 int fput_out; 1760 int fput_out;
1761 struct file *out = fget_light(fdout, &fput_out); 1761 struct file *out = fget_light(fdout, &fput_out);
1762 1762
1763 if (out) { 1763 if (out) {
1764 if (out->f_mode & FMODE_WRITE) 1764 if (out->f_mode & FMODE_WRITE)
1765 error = do_tee(in, out, len, flags); 1765 error = do_tee(in, out, len, flags);
1766 fput_light(out, fput_out); 1766 fput_light(out, fput_out);
1767 } 1767 }
1768 } 1768 }
1769 fput_light(in, fput_in); 1769 fput_light(in, fput_in);
1770 } 1770 }
1771 1771
1772 return error; 1772 return error;
1773 } 1773 }
1774 1774
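From userspace, the usual pattern (per the tee(2) man page) is to duplicate stdin's pipe contents to stdout's pipe, then consume them, here by splicing to a file. A hedged sketch:

/* Userspace sketch: tee stdin to stdout (both must be pipes), then
 * drain stdin into argv[1] so the next tee() sees fresh data.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <limits.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	ssize_t n;

	if (fd < 0)
		return 1;
	for (;;) {
		n = tee(STDIN_FILENO, STDOUT_FILENO, INT_MAX, 0);
		if (n <= 0)
			break;		/* 0 means the writer closed */
		while (n > 0) {		/* consume what we duplicated */
			ssize_t m = splice(STDIN_FILENO, NULL, fd, NULL,
					   n, SPLICE_F_MOVE);
			if (m <= 0)
				return 1;
			n -= m;
		}
	}
	return n < 0 ? 1 : 0;
}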
include/linux/pipe_fs_i.h
1 #ifndef _LINUX_PIPE_FS_I_H 1 #ifndef _LINUX_PIPE_FS_I_H
2 #define _LINUX_PIPE_FS_I_H 2 #define _LINUX_PIPE_FS_I_H
3 3
4 #define PIPEFS_MAGIC 0x50495045 4 #define PIPEFS_MAGIC 0x50495045
5 5
6 #define PIPE_BUFFERS (16) 6 #define PIPE_BUFFERS (16)
7 7
8 #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ 8 #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */
9 #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ 9 #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */
10 #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ 10 #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */
11 11
12 struct pipe_buffer { 12 struct pipe_buffer {
13 struct page *page; 13 struct page *page;
14 unsigned int offset, len; 14 unsigned int offset, len;
15 const struct pipe_buf_operations *ops; 15 const struct pipe_buf_operations *ops;
16 unsigned int flags; 16 unsigned int flags;
17 unsigned long private; 17 unsigned long private;
18 }; 18 };
19 19
20 struct pipe_inode_info { 20 struct pipe_inode_info {
21 wait_queue_head_t wait; 21 wait_queue_head_t wait;
22 unsigned int nrbufs, curbuf; 22 unsigned int nrbufs, curbuf;
23 struct page *tmp_page; 23 struct page *tmp_page;
24 unsigned int readers; 24 unsigned int readers;
25 unsigned int writers; 25 unsigned int writers;
26 unsigned int waiting_writers; 26 unsigned int waiting_writers;
27 unsigned int r_counter; 27 unsigned int r_counter;
28 unsigned int w_counter; 28 unsigned int w_counter;
29 struct fasync_struct *fasync_readers; 29 struct fasync_struct *fasync_readers;
30 struct fasync_struct *fasync_writers; 30 struct fasync_struct *fasync_writers;
31 struct inode *inode; 31 struct inode *inode;
32 struct pipe_buffer bufs[PIPE_BUFFERS]; 32 struct pipe_buffer bufs[PIPE_BUFFERS];
33 }; 33 };
34 34
35 /* 35 /*
36 * Note on the nesting of these functions: 36 * Note on the nesting of these functions:
37 * 37 *
38 * ->pin() 38 * ->confirm()
39 * ->steal() 39 * ->steal()
40 * ... 40 * ...
41 * ->map() 41 * ->map()
42 * ... 42 * ...
43 * ->unmap() 43 * ->unmap()
44 * 44 *
45 * That is, ->map() must be called on a pinned buffer, same goes for ->steal(). 45 * That is, ->map() must be called on a confirmed buffer,
46 * same goes for ->steal().
46 */ 47 */
47 struct pipe_buf_operations { 48 struct pipe_buf_operations {
48 int can_merge; 49 int can_merge;
49 void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int); 50 void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int);
50 void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *); 51 void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *);
51 int (*pin)(struct pipe_inode_info *, struct pipe_buffer *); 52 int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);
52 void (*release)(struct pipe_inode_info *, struct pipe_buffer *); 53 void (*release)(struct pipe_inode_info *, struct pipe_buffer *);
53 int (*steal)(struct pipe_inode_info *, struct pipe_buffer *); 54 int (*steal)(struct pipe_inode_info *, struct pipe_buffer *);
54 void (*get)(struct pipe_inode_info *, struct pipe_buffer *); 55 void (*get)(struct pipe_inode_info *, struct pipe_buffer *);
55 }; 56 };
56 57
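Two common shapes of the renamed hook, sketched here as a hedged reconstruction rather than quoted from this hunk: buffers whose pages are always resident and valid can simply return 0, while page-cache-backed buffers must verify that the page survived truncation and its read completed.

/* Hedged reconstruction, not part of this diff: the trivial confirm
 * for always-valid pages, and a page-cache flavour that checks the
 * page still holds good contents.
 */
#include <linux/pipe_fs_i.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

int generic_pipe_buf_confirm(struct pipe_inode_info *pipe,
			     struct pipe_buffer *buf)
{
	return 0;	/* anonymous pipe pages are always valid */
}

static int page_cache_buf_confirm(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err = 0;

	if (!PageUptodate(page)) {
		lock_page(page);
		if (!page->mapping)		/* page was truncated */
			err = -ENODATA;
		else if (!PageUptodate(page))	/* read failed */
			err = -EIO;
		unlock_page(page);
	}
	return err;
}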
57 /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual 58 /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
58 memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ 59 memory allocation, whereas PIPE_BUF makes atomicity guarantees. */
59 #define PIPE_SIZE PAGE_SIZE 60 #define PIPE_SIZE PAGE_SIZE
60 61
61 /* Drop the inode semaphore and wait for a pipe event, atomically */ 62 /* Drop the inode semaphore and wait for a pipe event, atomically */
62 void pipe_wait(struct pipe_inode_info *pipe); 63 void pipe_wait(struct pipe_inode_info *pipe);
63 64
64 struct pipe_inode_info * alloc_pipe_info(struct inode * inode); 65 struct pipe_inode_info * alloc_pipe_info(struct inode * inode);
65 void free_pipe_info(struct inode * inode); 66 void free_pipe_info(struct inode * inode);
66 void __free_pipe_info(struct pipe_inode_info *); 67 void __free_pipe_info(struct pipe_inode_info *);
67 68
68 /* Generic pipe buffer ops functions */ 69 /* Generic pipe buffer ops functions */
69 void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int); 70 void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int);
70 void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *); 71 void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *);
71 void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); 72 void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
72 int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *); 73 int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
73 int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); 74 int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
74 75
75 #endif 76 #endif
76 77
kernel/relay.c
1 /* 1 /*
2 * Public API and common code for kernel->userspace relay file support. 2 * Public API and common code for kernel->userspace relay file support.
3 * 3 *
4 * See Documentation/filesystems/relayfs.txt for an overview of relayfs. 4 * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
5 * 5 *
6 * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp 6 * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
7 * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) 7 * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
8 * 8 *
9 * Moved to kernel/relay.c by Paul Mundt, 2006. 9 * Moved to kernel/relay.c by Paul Mundt, 2006.
10 * November 2006 - CPU hotplug support by Mathieu Desnoyers 10 * November 2006 - CPU hotplug support by Mathieu Desnoyers
11 * (mathieu.desnoyers@polymtl.ca) 11 * (mathieu.desnoyers@polymtl.ca)
12 * 12 *
13 * This file is released under the GPL. 13 * This file is released under the GPL.
14 */ 14 */
15 #include <linux/errno.h> 15 #include <linux/errno.h>
16 #include <linux/stddef.h> 16 #include <linux/stddef.h>
17 #include <linux/slab.h> 17 #include <linux/slab.h>
18 #include <linux/module.h> 18 #include <linux/module.h>
19 #include <linux/string.h> 19 #include <linux/string.h>
20 #include <linux/relay.h> 20 #include <linux/relay.h>
21 #include <linux/vmalloc.h> 21 #include <linux/vmalloc.h>
22 #include <linux/mm.h> 22 #include <linux/mm.h>
23 #include <linux/cpu.h> 23 #include <linux/cpu.h>
24 #include <linux/splice.h> 24 #include <linux/splice.h>
25 25
26 /* list of open channels, for cpu hotplug */ 26 /* list of open channels, for cpu hotplug */
27 static DEFINE_MUTEX(relay_channels_mutex); 27 static DEFINE_MUTEX(relay_channels_mutex);
28 static LIST_HEAD(relay_channels); 28 static LIST_HEAD(relay_channels);
29 29
30 /* 30 /*
31 * close() vm_op implementation for relay file mapping. 31 * close() vm_op implementation for relay file mapping.
32 */ 32 */
33 static void relay_file_mmap_close(struct vm_area_struct *vma) 33 static void relay_file_mmap_close(struct vm_area_struct *vma)
34 { 34 {
35 struct rchan_buf *buf = vma->vm_private_data; 35 struct rchan_buf *buf = vma->vm_private_data;
36 buf->chan->cb->buf_unmapped(buf, vma->vm_file); 36 buf->chan->cb->buf_unmapped(buf, vma->vm_file);
37 } 37 }
38 38
39 /* 39 /*
40 * nopage() vm_op implementation for relay file mapping. 40 * nopage() vm_op implementation for relay file mapping.
41 */ 41 */
42 static struct page *relay_buf_nopage(struct vm_area_struct *vma, 42 static struct page *relay_buf_nopage(struct vm_area_struct *vma,
43 unsigned long address, 43 unsigned long address,
44 int *type) 44 int *type)
45 { 45 {
46 struct page *page; 46 struct page *page;
47 struct rchan_buf *buf = vma->vm_private_data; 47 struct rchan_buf *buf = vma->vm_private_data;
48 unsigned long offset = address - vma->vm_start; 48 unsigned long offset = address - vma->vm_start;
49 49
50 if (address > vma->vm_end) 50 if (address > vma->vm_end)
51 return NOPAGE_SIGBUS; /* Disallow mremap */ 51 return NOPAGE_SIGBUS; /* Disallow mremap */
52 if (!buf) 52 if (!buf)
53 return NOPAGE_OOM; 53 return NOPAGE_OOM;
54 54
55 page = vmalloc_to_page(buf->start + offset); 55 page = vmalloc_to_page(buf->start + offset);
56 if (!page) 56 if (!page)
57 return NOPAGE_OOM; 57 return NOPAGE_OOM;
58 get_page(page); 58 get_page(page);
59 59
60 if (type) 60 if (type)
61 *type = VM_FAULT_MINOR; 61 *type = VM_FAULT_MINOR;
62 62
63 return page; 63 return page;
64 } 64 }
65 65
66 /* 66 /*
67 * vm_ops for relay file mappings. 67 * vm_ops for relay file mappings.
68 */ 68 */
69 static struct vm_operations_struct relay_file_mmap_ops = { 69 static struct vm_operations_struct relay_file_mmap_ops = {
70 .nopage = relay_buf_nopage, 70 .nopage = relay_buf_nopage,
71 .close = relay_file_mmap_close, 71 .close = relay_file_mmap_close,
72 }; 72 };
73 73
74 /** 74 /**
75 * relay_mmap_buf: - mmap channel buffer to process address space 75 * relay_mmap_buf: - mmap channel buffer to process address space
76 * @buf: relay channel buffer 76 * @buf: relay channel buffer
77 * @vma: vm_area_struct describing memory to be mapped 77 * @vma: vm_area_struct describing memory to be mapped
78 * 78 *
79 * Returns 0 if ok, negative on error 79 * Returns 0 if ok, negative on error
80 * 80 *
81 * Caller should already have grabbed mmap_sem. 81 * Caller should already have grabbed mmap_sem.
82 */ 82 */
83 int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) 83 int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
84 { 84 {
85 unsigned long length = vma->vm_end - vma->vm_start; 85 unsigned long length = vma->vm_end - vma->vm_start;
86 struct file *filp = vma->vm_file; 86 struct file *filp = vma->vm_file;
87 87
88 if (!buf) 88 if (!buf)
89 return -EBADF; 89 return -EBADF;
90 90
91 if (length != (unsigned long)buf->chan->alloc_size) 91 if (length != (unsigned long)buf->chan->alloc_size)
92 return -EINVAL; 92 return -EINVAL;
93 93
94 vma->vm_ops = &relay_file_mmap_ops; 94 vma->vm_ops = &relay_file_mmap_ops;
95 vma->vm_private_data = buf; 95 vma->vm_private_data = buf;
96 buf->chan->cb->buf_mapped(buf, filp); 96 buf->chan->cb->buf_mapped(buf, filp);
97 97
98 return 0; 98 return 0;
99 } 99 }
100 100
101 /** 101 /**
102 * relay_alloc_buf - allocate a channel buffer 102 * relay_alloc_buf - allocate a channel buffer
103 * @buf: the buffer struct 103 * @buf: the buffer struct
104 * @size: total size of the buffer 104 * @size: total size of the buffer
105 * 105 *
106 * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The 106 * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The
107 * passed-in size will be page-aligned if it isn't already. 107 * passed-in size will be page-aligned if it isn't already.
108 */ 108 */
109 static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) 109 static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
110 { 110 {
111 void *mem; 111 void *mem;
112 unsigned int i, j, n_pages; 112 unsigned int i, j, n_pages;
113 113
114 *size = PAGE_ALIGN(*size); 114 *size = PAGE_ALIGN(*size);
115 n_pages = *size >> PAGE_SHIFT; 115 n_pages = *size >> PAGE_SHIFT;
116 116
117 buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); 117 buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
118 if (!buf->page_array) 118 if (!buf->page_array)
119 return NULL; 119 return NULL;
120 120
121 for (i = 0; i < n_pages; i++) { 121 for (i = 0; i < n_pages; i++) {
122 buf->page_array[i] = alloc_page(GFP_KERNEL); 122 buf->page_array[i] = alloc_page(GFP_KERNEL);
123 if (unlikely(!buf->page_array[i])) 123 if (unlikely(!buf->page_array[i]))
124 goto depopulate; 124 goto depopulate;
125 set_page_private(buf->page_array[i], (unsigned long)buf); 125 set_page_private(buf->page_array[i], (unsigned long)buf);
126 } 126 }
127 mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL); 127 mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
128 if (!mem) 128 if (!mem)
129 goto depopulate; 129 goto depopulate;
130 130
131 memset(mem, 0, *size); 131 memset(mem, 0, *size);
132 buf->page_count = n_pages; 132 buf->page_count = n_pages;
133 return mem; 133 return mem;
134 134
135 depopulate: 135 depopulate:
136 for (j = 0; j < i; j++) 136 for (j = 0; j < i; j++)
137 __free_page(buf->page_array[j]); 137 __free_page(buf->page_array[j]);
138 kfree(buf->page_array); 138 kfree(buf->page_array);
139 return NULL; 139 return NULL;
140 } 140 }
141 141
142 /** 142 /**
143 * relay_create_buf - allocate and initialize a channel buffer 143 * relay_create_buf - allocate and initialize a channel buffer
144 * @chan: the relay channel 144 * @chan: the relay channel
145 * 145 *
146 * Returns channel buffer if successful, %NULL otherwise. 146 * Returns channel buffer if successful, %NULL otherwise.
147 */ 147 */
148 struct rchan_buf *relay_create_buf(struct rchan *chan) 148 struct rchan_buf *relay_create_buf(struct rchan *chan)
149 { 149 {
150 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); 150 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
151 if (!buf) 151 if (!buf)
152 return NULL; 152 return NULL;
153 153
154 buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); 154 buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
155 if (!buf->padding) 155 if (!buf->padding)
156 goto free_buf; 156 goto free_buf;
157 157
158 buf->start = relay_alloc_buf(buf, &chan->alloc_size); 158 buf->start = relay_alloc_buf(buf, &chan->alloc_size);
159 if (!buf->start) 159 if (!buf->start)
160 goto free_buf; 160 goto free_buf;
161 161
162 buf->chan = chan; 162 buf->chan = chan;
163 kref_get(&buf->chan->kref); 163 kref_get(&buf->chan->kref);
164 return buf; 164 return buf;
165 165
166 free_buf: 166 free_buf:
167 kfree(buf->padding); 167 kfree(buf->padding);
168 kfree(buf); 168 kfree(buf);
169 return NULL; 169 return NULL;
170 } 170 }
171 171
172 /** 172 /**
173 * relay_destroy_channel - free the channel struct 173 * relay_destroy_channel - free the channel struct
174 * @kref: target kernel reference that contains the relay channel 174 * @kref: target kernel reference that contains the relay channel
175 * 175 *
176 * Should only be called from kref_put(). 176 * Should only be called from kref_put().
177 */ 177 */
178 void relay_destroy_channel(struct kref *kref) 178 void relay_destroy_channel(struct kref *kref)
179 { 179 {
180 struct rchan *chan = container_of(kref, struct rchan, kref); 180 struct rchan *chan = container_of(kref, struct rchan, kref);
181 kfree(chan); 181 kfree(chan);
182 } 182 }
183 183
184 /** 184 /**
185 * relay_destroy_buf - destroy an rchan_buf struct and associated buffer 185 * relay_destroy_buf - destroy an rchan_buf struct and associated buffer
186 * @buf: the buffer struct 186 * @buf: the buffer struct
187 */ 187 */
188 void relay_destroy_buf(struct rchan_buf *buf) 188 void relay_destroy_buf(struct rchan_buf *buf)
189 { 189 {
190 struct rchan *chan = buf->chan; 190 struct rchan *chan = buf->chan;
191 unsigned int i; 191 unsigned int i;
192 192
193 if (likely(buf->start)) { 193 if (likely(buf->start)) {
194 vunmap(buf->start); 194 vunmap(buf->start);
195 for (i = 0; i < buf->page_count; i++) 195 for (i = 0; i < buf->page_count; i++)
196 __free_page(buf->page_array[i]); 196 __free_page(buf->page_array[i]);
197 kfree(buf->page_array); 197 kfree(buf->page_array);
198 } 198 }
199 chan->buf[buf->cpu] = NULL; 199 chan->buf[buf->cpu] = NULL;
200 kfree(buf->padding); 200 kfree(buf->padding);
201 kfree(buf); 201 kfree(buf);
202 kref_put(&chan->kref, relay_destroy_channel); 202 kref_put(&chan->kref, relay_destroy_channel);
203 } 203 }
204 204
205 /** 205 /**
206 * relay_remove_buf - remove a channel buffer 206 * relay_remove_buf - remove a channel buffer
207 * @kref: target kernel reference that contains the relay buffer 207 * @kref: target kernel reference that contains the relay buffer
208 * 208 *
209 * Removes the file from the filesystem, which also frees the 209 * Removes the file from the filesystem, which also frees the
210 * rchan_buf struct and the channel buffer. Should only be called from 210 * rchan_buf struct and the channel buffer. Should only be called from
211 * kref_put(). 211 * kref_put().
212 */ 212 */
213 void relay_remove_buf(struct kref *kref) 213 void relay_remove_buf(struct kref *kref)
214 { 214 {
215 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); 215 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
216 buf->chan->cb->remove_buf_file(buf->dentry); 216 buf->chan->cb->remove_buf_file(buf->dentry);
217 relay_destroy_buf(buf); 217 relay_destroy_buf(buf);
218 } 218 }
219 219
220 /** 220 /**
221 * relay_buf_empty - boolean, is the channel buffer empty? 221 * relay_buf_empty - boolean, is the channel buffer empty?
222 * @buf: channel buffer 222 * @buf: channel buffer
223 * 223 *
224 * Returns 1 if the buffer is empty, 0 otherwise. 224 * Returns 1 if the buffer is empty, 0 otherwise.
225 */ 225 */
226 int relay_buf_empty(struct rchan_buf *buf) 226 int relay_buf_empty(struct rchan_buf *buf)
227 { 227 {
228 return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; 228 return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
229 } 229 }
230 EXPORT_SYMBOL_GPL(relay_buf_empty); 230 EXPORT_SYMBOL_GPL(relay_buf_empty);
231 231
232 /** 232 /**
233 * relay_buf_full - boolean, is the channel buffer full? 233 * relay_buf_full - boolean, is the channel buffer full?
234 * @buf: channel buffer 234 * @buf: channel buffer
235 * 235 *
236 * Returns 1 if the buffer is full, 0 otherwise. 236 * Returns 1 if the buffer is full, 0 otherwise.
237 */ 237 */
238 int relay_buf_full(struct rchan_buf *buf) 238 int relay_buf_full(struct rchan_buf *buf)
239 { 239 {
240 size_t ready = buf->subbufs_produced - buf->subbufs_consumed; 240 size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
241 return (ready >= buf->chan->n_subbufs) ? 1 : 0; 241 return (ready >= buf->chan->n_subbufs) ? 1 : 0;
242 } 242 }
243 EXPORT_SYMBOL_GPL(relay_buf_full); 243 EXPORT_SYMBOL_GPL(relay_buf_full);
244 244
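Both counters compared above are free-running; they are never reset on wraparound, and relay_buf_full() leans on unsigned modular arithmetic to keep the difference meaningful across overflow. A minimal user-space sketch of that property, with made-up values:

#include <stdio.h>
#include <stddef.h>

int main(void)
{
        size_t produced = 3;            /* wrapped past SIZE_MAX back to 3 */
        size_t consumed = (size_t)-2;   /* SIZE_MAX - 1 */

        /* prints 5: five sub-buffers produced but not yet consumed */
        printf("ready = %zu\n", produced - consumed);
        return 0;
}
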
245 /* 245 /*
246 * High-level relay kernel API and associated functions. 246 * High-level relay kernel API and associated functions.
247 */ 247 */
248 248
249 /* 249 /*
250 * rchan_callback implementations defining default channel behavior. Used 250 * rchan_callback implementations defining default channel behavior. Used
251 * in place of corresponding NULL values in client callback struct. 251 * in place of corresponding NULL values in client callback struct.
252 */ 252 */
253 253
254 /* 254 /*
255 * subbuf_start() default callback. Allows the switch unless the buffer is full. 255 * subbuf_start() default callback. Allows the switch unless the buffer is full.
256 */ 256 */
257 static int subbuf_start_default_callback(struct rchan_buf *buf, 257 static int subbuf_start_default_callback(struct rchan_buf *buf,
258 void *subbuf, 258 void *subbuf,
259 void *prev_subbuf, 259 void *prev_subbuf,
260 size_t prev_padding) 260 size_t prev_padding)
261 { 261 {
262 if (relay_buf_full(buf)) 262 if (relay_buf_full(buf))
263 return 0; 263 return 0;
264 264
265 return 1; 265 return 1;
266 } 266 }
267 267
268 /* 268 /*
269 * buf_mapped() default callback. Does nothing. 269 * buf_mapped() default callback. Does nothing.
270 */ 270 */
271 static void buf_mapped_default_callback(struct rchan_buf *buf, 271 static void buf_mapped_default_callback(struct rchan_buf *buf,
272 struct file *filp) 272 struct file *filp)
273 { 273 {
274 } 274 }
275 275
276 /* 276 /*
277 * buf_unmapped() default callback. Does nothing. 277 * buf_unmapped() default callback. Does nothing.
278 */ 278 */
279 static void buf_unmapped_default_callback(struct rchan_buf *buf, 279 static void buf_unmapped_default_callback(struct rchan_buf *buf,
280 struct file *filp) 280 struct file *filp)
281 { 281 {
282 } 282 }
283 283
284 /* 284 /*
285 * create_buf_file() default callback. Does nothing. 285 * create_buf_file() default callback. Does nothing.
286 */ 286 */
287 static struct dentry *create_buf_file_default_callback(const char *filename, 287 static struct dentry *create_buf_file_default_callback(const char *filename,
288 struct dentry *parent, 288 struct dentry *parent,
289 int mode, 289 int mode,
290 struct rchan_buf *buf, 290 struct rchan_buf *buf,
291 int *is_global) 291 int *is_global)
292 { 292 {
293 return NULL; 293 return NULL;
294 } 294 }
295 295
296 /* 296 /*
297 * remove_buf_file() default callback. Does nothing. 297 * remove_buf_file() default callback. Does nothing.
298 */ 298 */
299 static int remove_buf_file_default_callback(struct dentry *dentry) 299 static int remove_buf_file_default_callback(struct dentry *dentry)
300 { 300 {
301 return -EINVAL; 301 return -EINVAL;
302 } 302 }
303 303
304 /* relay channel default callbacks */ 304 /* relay channel default callbacks */
305 static struct rchan_callbacks default_channel_callbacks = { 305 static struct rchan_callbacks default_channel_callbacks = {
306 .subbuf_start = subbuf_start_default_callback, 306 .subbuf_start = subbuf_start_default_callback,
307 .buf_mapped = buf_mapped_default_callback, 307 .buf_mapped = buf_mapped_default_callback,
308 .buf_unmapped = buf_unmapped_default_callback, 308 .buf_unmapped = buf_unmapped_default_callback,
309 .create_buf_file = create_buf_file_default_callback, 309 .create_buf_file = create_buf_file_default_callback,
310 .remove_buf_file = remove_buf_file_default_callback, 310 .remove_buf_file = remove_buf_file_default_callback,
311 }; 311 };
312 312
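For orientation, a relay client normally supplies only the file hooks and lets setup_callbacks() below fill in the rest from these defaults. A minimal sketch, assuming debugfs hosts the buffer files; the my_* names are hypothetical:

static struct dentry *my_create_buf_file(const char *filename,
                                         struct dentry *parent,
                                         int mode,
                                         struct rchan_buf *buf,
                                         int *is_global)
{
        /* expose the buffer via the exported relay_file_operations */
        return debugfs_create_file(filename, mode, parent, buf,
                                   &relay_file_operations);
}

static int my_remove_buf_file(struct dentry *dentry)
{
        debugfs_remove(dentry);
        return 0;
}

static struct rchan_callbacks my_callbacks = {
        .create_buf_file        = my_create_buf_file,
        .remove_buf_file        = my_remove_buf_file,
        /* unset hooks are replaced by the defaults above */
};
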
313 /** 313 /**
314 * wakeup_readers - wake up readers waiting on a channel 314 * wakeup_readers - wake up readers waiting on a channel
315 * @data: contains the channel buffer 315 * @data: contains the channel buffer
316 * 316 *
317 * This is the timer function used to defer reader waking. 317 * This is the timer function used to defer reader waking.
318 */ 318 */
319 static void wakeup_readers(unsigned long data) 319 static void wakeup_readers(unsigned long data)
320 { 320 {
321 struct rchan_buf *buf = (struct rchan_buf *)data; 321 struct rchan_buf *buf = (struct rchan_buf *)data;
322 wake_up_interruptible(&buf->read_wait); 322 wake_up_interruptible(&buf->read_wait);
323 } 323 }
324 324
325 /** 325 /**
326 * __relay_reset - reset a channel buffer 326 * __relay_reset - reset a channel buffer
327 * @buf: the channel buffer 327 * @buf: the channel buffer
328 * @init: 1 if this is a first-time initialization 328 * @init: 1 if this is a first-time initialization
329 * 329 *
330 * See relay_reset() for description of effect. 330 * See relay_reset() for description of effect.
331 */ 331 */
332 static void __relay_reset(struct rchan_buf *buf, unsigned int init) 332 static void __relay_reset(struct rchan_buf *buf, unsigned int init)
333 { 333 {
334 size_t i; 334 size_t i;
335 335
336 if (init) { 336 if (init) {
337 init_waitqueue_head(&buf->read_wait); 337 init_waitqueue_head(&buf->read_wait);
338 kref_init(&buf->kref); 338 kref_init(&buf->kref);
339 setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); 339 setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
340 } else 340 } else
341 del_timer_sync(&buf->timer); 341 del_timer_sync(&buf->timer);
342 342
343 buf->subbufs_produced = 0; 343 buf->subbufs_produced = 0;
344 buf->subbufs_consumed = 0; 344 buf->subbufs_consumed = 0;
345 buf->bytes_consumed = 0; 345 buf->bytes_consumed = 0;
346 buf->finalized = 0; 346 buf->finalized = 0;
347 buf->data = buf->start; 347 buf->data = buf->start;
348 buf->offset = 0; 348 buf->offset = 0;
349 349
350 for (i = 0; i < buf->chan->n_subbufs; i++) 350 for (i = 0; i < buf->chan->n_subbufs; i++)
351 buf->padding[i] = 0; 351 buf->padding[i] = 0;
352 352
353 buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0); 353 buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
354 } 354 }
355 355
356 /** 356 /**
357 * relay_reset - reset the channel 357 * relay_reset - reset the channel
358 * @chan: the channel 358 * @chan: the channel
359 * 359 *
360 * This has the effect of erasing all data from all channel buffers 360 * This has the effect of erasing all data from all channel buffers
361 * and restarting the channel in its initial state. The buffers 361 * and restarting the channel in its initial state. The buffers
362 * are not freed, so any mappings are still in effect. 362 * are not freed, so any mappings are still in effect.
363 * 363 *
364 * NOTE. Care should be taken that the channel isn't actually 364 * NOTE. Care should be taken that the channel isn't actually
365 * being used by anything when this call is made. 365 * being used by anything when this call is made.
366 */ 366 */
367 void relay_reset(struct rchan *chan) 367 void relay_reset(struct rchan *chan)
368 { 368 {
369 unsigned int i; 369 unsigned int i;
370 370
371 if (!chan) 371 if (!chan)
372 return; 372 return;
373 373
374 if (chan->is_global && chan->buf[0]) { 374 if (chan->is_global && chan->buf[0]) {
375 __relay_reset(chan->buf[0], 0); 375 __relay_reset(chan->buf[0], 0);
376 return; 376 return;
377 } 377 }
378 378
379 mutex_lock(&relay_channels_mutex); 379 mutex_lock(&relay_channels_mutex);
380 for_each_online_cpu(i) 380 for_each_online_cpu(i)
381 if (chan->buf[i]) 381 if (chan->buf[i])
382 __relay_reset(chan->buf[i], 0); 382 __relay_reset(chan->buf[i], 0);
383 mutex_unlock(&relay_channels_mutex); 383 mutex_unlock(&relay_channels_mutex);
384 } 384 }
385 EXPORT_SYMBOL_GPL(relay_reset); 385 EXPORT_SYMBOL_GPL(relay_reset);
386 386
387 /* 387 /*
388 * relay_open_buf - create a new relay channel buffer 388 * relay_open_buf - create a new relay channel buffer
389 * 389 *
390 * used by relay_open() and CPU hotplug. 390 * used by relay_open() and CPU hotplug.
391 */ 391 */
392 static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) 392 static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
393 { 393 {
394 struct rchan_buf *buf = NULL; 394 struct rchan_buf *buf = NULL;
395 struct dentry *dentry; 395 struct dentry *dentry;
396 char *tmpname; 396 char *tmpname;
397 397
398 if (chan->is_global) 398 if (chan->is_global)
399 return chan->buf[0]; 399 return chan->buf[0];
400 400
401 tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); 401 tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
402 if (!tmpname) 402 if (!tmpname)
403 goto end; 403 goto end;
404 snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); 404 snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
405 405
406 buf = relay_create_buf(chan); 406 buf = relay_create_buf(chan);
407 if (!buf) 407 if (!buf)
408 goto free_name; 408 goto free_name;
409 409
410 buf->cpu = cpu; 410 buf->cpu = cpu;
411 __relay_reset(buf, 1); 411 __relay_reset(buf, 1);
412 412
413 /* Create file in fs */ 413 /* Create file in fs */
414 dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR, 414 dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR,
415 buf, &chan->is_global); 415 buf, &chan->is_global);
416 if (!dentry) 416 if (!dentry)
417 goto free_buf; 417 goto free_buf;
418 418
419 buf->dentry = dentry; 419 buf->dentry = dentry;
420 420
421 if (chan->is_global) { 421 if (chan->is_global) {
422 chan->buf[0] = buf; 422 chan->buf[0] = buf;
423 buf->cpu = 0; 423 buf->cpu = 0;
424 } 424 }
425 425
426 goto free_name; 426 goto free_name;
427 427
428 free_buf: 428 free_buf:
429 relay_destroy_buf(buf); 429 relay_destroy_buf(buf);
430 free_name: 430 free_name:
431 kfree(tmpname); 431 kfree(tmpname);
432 end: 432 end:
433 return buf; 433 return buf;
434 } 434 }
435 435
436 /** 436 /**
437 * relay_close_buf - close a channel buffer 437 * relay_close_buf - close a channel buffer
438 * @buf: channel buffer 438 * @buf: channel buffer
439 * 439 *
440 * Marks the buffer finalized and restores the default callbacks. 440 * Marks the buffer finalized and restores the default callbacks.
441 * The channel buffer and channel buffer data structure are then freed 441 * The channel buffer and channel buffer data structure are then freed
442 * automatically when the last reference is given up. 442 * automatically when the last reference is given up.
443 */ 443 */
444 static void relay_close_buf(struct rchan_buf *buf) 444 static void relay_close_buf(struct rchan_buf *buf)
445 { 445 {
446 buf->finalized = 1; 446 buf->finalized = 1;
447 del_timer_sync(&buf->timer); 447 del_timer_sync(&buf->timer);
448 kref_put(&buf->kref, relay_remove_buf); 448 kref_put(&buf->kref, relay_remove_buf);
449 } 449 }
450 450
451 static void setup_callbacks(struct rchan *chan, 451 static void setup_callbacks(struct rchan *chan,
452 struct rchan_callbacks *cb) 452 struct rchan_callbacks *cb)
453 { 453 {
454 if (!cb) { 454 if (!cb) {
455 chan->cb = &default_channel_callbacks; 455 chan->cb = &default_channel_callbacks;
456 return; 456 return;
457 } 457 }
458 458
459 if (!cb->subbuf_start) 459 if (!cb->subbuf_start)
460 cb->subbuf_start = subbuf_start_default_callback; 460 cb->subbuf_start = subbuf_start_default_callback;
461 if (!cb->buf_mapped) 461 if (!cb->buf_mapped)
462 cb->buf_mapped = buf_mapped_default_callback; 462 cb->buf_mapped = buf_mapped_default_callback;
463 if (!cb->buf_unmapped) 463 if (!cb->buf_unmapped)
464 cb->buf_unmapped = buf_unmapped_default_callback; 464 cb->buf_unmapped = buf_unmapped_default_callback;
465 if (!cb->create_buf_file) 465 if (!cb->create_buf_file)
466 cb->create_buf_file = create_buf_file_default_callback; 466 cb->create_buf_file = create_buf_file_default_callback;
467 if (!cb->remove_buf_file) 467 if (!cb->remove_buf_file)
468 cb->remove_buf_file = remove_buf_file_default_callback; 468 cb->remove_buf_file = remove_buf_file_default_callback;
469 chan->cb = cb; 469 chan->cb = cb;
470 } 470 }
471 471
472 /** 472 /**
473 * relay_hotcpu_callback - CPU hotplug callback 473 * relay_hotcpu_callback - CPU hotplug callback
474 * @nb: notifier block 474 * @nb: notifier block
475 * @action: hotplug action to take 475 * @action: hotplug action to take
476 * @hcpu: CPU number 476 * @hcpu: CPU number
477 * 477 *
478 * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) 478 * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
479 */ 479 */
480 static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, 480 static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
481 unsigned long action, 481 unsigned long action,
482 void *hcpu) 482 void *hcpu)
483 { 483 {
484 unsigned int hotcpu = (unsigned long)hcpu; 484 unsigned int hotcpu = (unsigned long)hcpu;
485 struct rchan *chan; 485 struct rchan *chan;
486 486
487 switch (action) { 487 switch (action) {
488 case CPU_UP_PREPARE: 488 case CPU_UP_PREPARE:
489 case CPU_UP_PREPARE_FROZEN: 489 case CPU_UP_PREPARE_FROZEN:
490 mutex_lock(&relay_channels_mutex); 490 mutex_lock(&relay_channels_mutex);
491 list_for_each_entry(chan, &relay_channels, list) { 491 list_for_each_entry(chan, &relay_channels, list) {
492 if (chan->buf[hotcpu]) 492 if (chan->buf[hotcpu])
493 continue; 493 continue;
494 chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); 494 chan->buf[hotcpu] = relay_open_buf(chan, hotcpu);
495 if (!chan->buf[hotcpu]) { 495 if (!chan->buf[hotcpu]) {
496 printk(KERN_ERR 496 printk(KERN_ERR
497 "relay_hotcpu_callback: cpu %d buffer " 497 "relay_hotcpu_callback: cpu %d buffer "
498 "creation failed\n", hotcpu); 498 "creation failed\n", hotcpu);
499 mutex_unlock(&relay_channels_mutex); 499 mutex_unlock(&relay_channels_mutex);
500 return NOTIFY_BAD; 500 return NOTIFY_BAD;
501 } 501 }
502 } 502 }
503 mutex_unlock(&relay_channels_mutex); 503 mutex_unlock(&relay_channels_mutex);
504 break; 504 break;
505 case CPU_DEAD: 505 case CPU_DEAD:
506 case CPU_DEAD_FROZEN: 506 case CPU_DEAD_FROZEN:
507 /* No need to flush the cpu; it will be flushed upon 507 /* No need to flush the cpu; it will be flushed upon
508 * the final relay_flush() call. */ 508 * the final relay_flush() call. */
509 break; 509 break;
510 } 510 }
511 return NOTIFY_OK; 511 return NOTIFY_OK;
512 } 512 }
513 513
514 /** 514 /**
515 * relay_open - create a new relay channel 515 * relay_open - create a new relay channel
516 * @base_filename: base name of files to create 516 * @base_filename: base name of files to create
517 * @parent: dentry of parent directory, %NULL for root directory 517 * @parent: dentry of parent directory, %NULL for root directory
518 * @subbuf_size: size of sub-buffers 518 * @subbuf_size: size of sub-buffers
519 * @n_subbufs: number of sub-buffers 519 * @n_subbufs: number of sub-buffers
520 * @cb: client callback functions 520 * @cb: client callback functions
521 * @private_data: user-defined data 521 * @private_data: user-defined data
522 * 522 *
523 * Returns channel pointer if successful, %NULL otherwise. 523 * Returns channel pointer if successful, %NULL otherwise.
524 * 524 *
525 * Creates a channel buffer for each cpu using the sizes and 525 * Creates a channel buffer for each cpu using the sizes and
526 * attributes specified. The created channel buffer files 526 * attributes specified. The created channel buffer files
527 * will be named base_filename0...base_filenameN-1. File 527 * will be named base_filename0...base_filenameN-1. File
528 * permissions will be %S_IRUSR. 528 * permissions will be %S_IRUSR.
529 */ 529 */
530 struct rchan *relay_open(const char *base_filename, 530 struct rchan *relay_open(const char *base_filename,
531 struct dentry *parent, 531 struct dentry *parent,
532 size_t subbuf_size, 532 size_t subbuf_size,
533 size_t n_subbufs, 533 size_t n_subbufs,
534 struct rchan_callbacks *cb, 534 struct rchan_callbacks *cb,
535 void *private_data) 535 void *private_data)
536 { 536 {
537 unsigned int i; 537 unsigned int i;
538 struct rchan *chan; 538 struct rchan *chan;
539 if (!base_filename) 539 if (!base_filename)
540 return NULL; 540 return NULL;
541 541
542 if (!(subbuf_size && n_subbufs)) 542 if (!(subbuf_size && n_subbufs))
543 return NULL; 543 return NULL;
544 544
545 chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); 545 chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
546 if (!chan) 546 if (!chan)
547 return NULL; 547 return NULL;
548 548
549 chan->version = RELAYFS_CHANNEL_VERSION; 549 chan->version = RELAYFS_CHANNEL_VERSION;
550 chan->n_subbufs = n_subbufs; 550 chan->n_subbufs = n_subbufs;
551 chan->subbuf_size = subbuf_size; 551 chan->subbuf_size = subbuf_size;
552 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); 552 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
553 chan->parent = parent; 553 chan->parent = parent;
554 chan->private_data = private_data; 554 chan->private_data = private_data;
555 strlcpy(chan->base_filename, base_filename, NAME_MAX); 555 strlcpy(chan->base_filename, base_filename, NAME_MAX);
556 setup_callbacks(chan, cb); 556 setup_callbacks(chan, cb);
557 kref_init(&chan->kref); 557 kref_init(&chan->kref);
558 558
559 mutex_lock(&relay_channels_mutex); 559 mutex_lock(&relay_channels_mutex);
560 for_each_online_cpu(i) { 560 for_each_online_cpu(i) {
561 chan->buf[i] = relay_open_buf(chan, i); 561 chan->buf[i] = relay_open_buf(chan, i);
562 if (!chan->buf[i]) 562 if (!chan->buf[i])
563 goto free_bufs; 563 goto free_bufs;
564 } 564 }
565 list_add(&chan->list, &relay_channels); 565 list_add(&chan->list, &relay_channels);
566 mutex_unlock(&relay_channels_mutex); 566 mutex_unlock(&relay_channels_mutex);
567 567
568 return chan; 568 return chan;
569 569
570 free_bufs: 570 free_bufs:
571 for_each_online_cpu(i) { 571 for_each_online_cpu(i) {
572 if (!chan->buf[i]) 572 if (!chan->buf[i])
573 break; 573 break;
574 relay_close_buf(chan->buf[i]); 574 relay_close_buf(chan->buf[i]);
575 } 575 }
576 576
577 kref_put(&chan->kref, relay_destroy_channel); 577 kref_put(&chan->kref, relay_destroy_channel);
578 mutex_unlock(&relay_channels_mutex); 578 mutex_unlock(&relay_channels_mutex);
579 return NULL; 579 return NULL;
580 } 580 }
581 EXPORT_SYMBOL_GPL(relay_open); 581 EXPORT_SYMBOL_GPL(relay_open);
582 582
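A sketch of the call from a client's init path, reusing the hypothetical my_callbacks above and a previously created debugfs directory my_dir; the sizes are illustrative only:

struct rchan *chan;

/* eight 32KB sub-buffers per cpu; files will be named cpu0...cpuN-1 */
chan = relay_open("cpu", my_dir, 32768, 8, &my_callbacks, NULL);
if (!chan)
        return -ENOMEM;
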
583 /** 583 /**
584 * relay_switch_subbuf - switch to a new sub-buffer 584 * relay_switch_subbuf - switch to a new sub-buffer
585 * @buf: channel buffer 585 * @buf: channel buffer
586 * @length: size of current event 586 * @length: size of current event
587 * 587 *
588 * Returns either the length passed in or 0 if full. 588 * Returns either the length passed in or 0 if full.
589 * 589 *
590 * Performs sub-buffer-switch tasks such as invoking callbacks, 590 * Performs sub-buffer-switch tasks such as invoking callbacks,
591 * updating padding counts, waking up readers, etc. 591 * updating padding counts, waking up readers, etc.
592 */ 592 */
593 size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) 593 size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
594 { 594 {
595 void *old, *new; 595 void *old, *new;
596 size_t old_subbuf, new_subbuf; 596 size_t old_subbuf, new_subbuf;
597 597
598 if (unlikely(length > buf->chan->subbuf_size)) 598 if (unlikely(length > buf->chan->subbuf_size))
599 goto toobig; 599 goto toobig;
600 600
601 if (buf->offset != buf->chan->subbuf_size + 1) { 601 if (buf->offset != buf->chan->subbuf_size + 1) {
602 buf->prev_padding = buf->chan->subbuf_size - buf->offset; 602 buf->prev_padding = buf->chan->subbuf_size - buf->offset;
603 old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; 603 old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
604 buf->padding[old_subbuf] = buf->prev_padding; 604 buf->padding[old_subbuf] = buf->prev_padding;
605 buf->subbufs_produced++; 605 buf->subbufs_produced++;
606 buf->dentry->d_inode->i_size += buf->chan->subbuf_size - 606 buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
607 buf->padding[old_subbuf]; 607 buf->padding[old_subbuf];
608 smp_mb(); 608 smp_mb();
609 if (waitqueue_active(&buf->read_wait)) 609 if (waitqueue_active(&buf->read_wait))
610 /* 610 /*
611 * Calling wake_up_interruptible() from here 611 * Calling wake_up_interruptible() from here
612 * will deadlock if we happen to be logging 612 * will deadlock if we happen to be logging
613 * from the scheduler (trying to re-grab 613 * from the scheduler (trying to re-grab
614 * rq->lock), so defer it. 614 * rq->lock), so defer it.
615 */ 615 */
616 __mod_timer(&buf->timer, jiffies + 1); 616 __mod_timer(&buf->timer, jiffies + 1);
617 } 617 }
618 618
619 old = buf->data; 619 old = buf->data;
620 new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; 620 new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
621 new = buf->start + new_subbuf * buf->chan->subbuf_size; 621 new = buf->start + new_subbuf * buf->chan->subbuf_size;
622 buf->offset = 0; 622 buf->offset = 0;
623 if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) { 623 if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
624 buf->offset = buf->chan->subbuf_size + 1; 624 buf->offset = buf->chan->subbuf_size + 1;
625 return 0; 625 return 0;
626 } 626 }
627 buf->data = new; 627 buf->data = new;
628 buf->padding[new_subbuf] = 0; 628 buf->padding[new_subbuf] = 0;
629 629
630 if (unlikely(length + buf->offset > buf->chan->subbuf_size)) 630 if (unlikely(length + buf->offset > buf->chan->subbuf_size))
631 goto toobig; 631 goto toobig;
632 632
633 return length; 633 return length;
634 634
635 toobig: 635 toobig:
636 buf->chan->last_toobig = length; 636 buf->chan->last_toobig = length;
637 return 0; 637 return 0;
638 } 638 }
639 EXPORT_SYMBOL_GPL(relay_switch_subbuf); 639 EXPORT_SYMBOL_GPL(relay_switch_subbuf);
640 640
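The usual caller is the write fast path in the relay header: when the current sub-buffer cannot hold an event, it asks relay_switch_subbuf() for room and drops the event on a 0 return. A sketch of that idea, not the verbatim relay_reserve() from the header, and ignoring the interrupt disabling the real write path does:

static inline void *my_relay_reserve(struct rchan *chan, size_t length)
{
        struct rchan_buf *buf = chan->buf[smp_processor_id()];
        void *reserved;

        if (buf->offset + length > chan->subbuf_size) {
                length = relay_switch_subbuf(buf, length);
                if (!length)
                        return NULL;    /* full, or event too big: dropped */
        }
        reserved = buf->data + buf->offset;
        buf->offset += length;

        return reserved;
}
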
641 /** 641 /**
642 * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count 642 * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
643 * @chan: the channel 643 * @chan: the channel
644 * @cpu: the cpu associated with the channel buffer to update 644 * @cpu: the cpu associated with the channel buffer to update
645 * @subbufs_consumed: number of sub-buffers to add to current buf's count 645 * @subbufs_consumed: number of sub-buffers to add to current buf's count
646 * 646 *
647 * Adds to the channel buffer's consumed sub-buffer count. 647 * Adds to the channel buffer's consumed sub-buffer count.
648 * subbufs_consumed should be the number of sub-buffers newly consumed, 648 * subbufs_consumed should be the number of sub-buffers newly consumed,
649 * not the total consumed. 649 * not the total consumed.
650 * 650 *
651 * NOTE. Kernel clients don't need to call this function if the channel 651 * NOTE. Kernel clients don't need to call this function if the channel
652 * mode is 'overwrite'. 652 * mode is 'overwrite'.
653 */ 653 */
654 void relay_subbufs_consumed(struct rchan *chan, 654 void relay_subbufs_consumed(struct rchan *chan,
655 unsigned int cpu, 655 unsigned int cpu,
656 size_t subbufs_consumed) 656 size_t subbufs_consumed)
657 { 657 {
658 struct rchan_buf *buf; 658 struct rchan_buf *buf;
659 659
660 if (!chan) 660 if (!chan)
661 return; 661 return;
662 662
663 if (cpu >= NR_CPUS || !chan->buf[cpu]) 663 if (cpu >= NR_CPUS || !chan->buf[cpu])
664 return; 664 return;
665 665
666 buf = chan->buf[cpu]; 666 buf = chan->buf[cpu];
667 buf->subbufs_consumed += subbufs_consumed; 667 buf->subbufs_consumed += subbufs_consumed;
668 if (buf->subbufs_consumed > buf->subbufs_produced) 668 if (buf->subbufs_consumed > buf->subbufs_produced)
669 buf->subbufs_consumed = buf->subbufs_produced; 669 buf->subbufs_consumed = buf->subbufs_produced;
670 } 670 }
671 EXPORT_SYMBOL_GPL(relay_subbufs_consumed); 671 EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
672 672
673 /** 673 /**
674 * relay_close - close the channel 674 * relay_close - close the channel
675 * @chan: the channel 675 * @chan: the channel
676 * 676 *
677 * Closes all channel buffers and frees the channel. 677 * Closes all channel buffers and frees the channel.
678 */ 678 */
679 void relay_close(struct rchan *chan) 679 void relay_close(struct rchan *chan)
680 { 680 {
681 unsigned int i; 681 unsigned int i;
682 682
683 if (!chan) 683 if (!chan)
684 return; 684 return;
685 685
686 mutex_lock(&relay_channels_mutex); 686 mutex_lock(&relay_channels_mutex);
687 if (chan->is_global && chan->buf[0]) 687 if (chan->is_global && chan->buf[0])
688 relay_close_buf(chan->buf[0]); 688 relay_close_buf(chan->buf[0]);
689 else 689 else
690 for_each_possible_cpu(i) 690 for_each_possible_cpu(i)
691 if (chan->buf[i]) 691 if (chan->buf[i])
692 relay_close_buf(chan->buf[i]); 692 relay_close_buf(chan->buf[i]);
693 693
694 if (chan->last_toobig) 694 if (chan->last_toobig)
695 printk(KERN_WARNING "relay: one or more items not logged " 695 printk(KERN_WARNING "relay: one or more items not logged "
696 "[item size (%Zd) > sub-buffer size (%Zd)]\n", 696 "[item size (%Zd) > sub-buffer size (%Zd)]\n",
697 chan->last_toobig, chan->subbuf_size); 697 chan->last_toobig, chan->subbuf_size);
698 698
699 list_del(&chan->list); 699 list_del(&chan->list);
700 kref_put(&chan->kref, relay_destroy_channel); 700 kref_put(&chan->kref, relay_destroy_channel);
701 mutex_unlock(&relay_channels_mutex); 701 mutex_unlock(&relay_channels_mutex);
702 } 702 }
703 EXPORT_SYMBOL_GPL(relay_close); 703 EXPORT_SYMBOL_GPL(relay_close);
704 704
705 /** 705 /**
706 * relay_flush - flush the channel 706 * relay_flush - flush the channel
707 * @chan: the channel 707 * @chan: the channel
708 * 708 *
709 * Flushes all channel buffers, i.e. forces buffer switch. 709 * Flushes all channel buffers, i.e. forces buffer switch.
710 */ 710 */
711 void relay_flush(struct rchan *chan) 711 void relay_flush(struct rchan *chan)
712 { 712 {
713 unsigned int i; 713 unsigned int i;
714 714
715 if (!chan) 715 if (!chan)
716 return; 716 return;
717 717
718 if (chan->is_global && chan->buf[0]) { 718 if (chan->is_global && chan->buf[0]) {
719 relay_switch_subbuf(chan->buf[0], 0); 719 relay_switch_subbuf(chan->buf[0], 0);
720 return; 720 return;
721 } 721 }
722 722
723 mutex_lock(&relay_channels_mutex); 723 mutex_lock(&relay_channels_mutex);
724 for_each_possible_cpu(i) 724 for_each_possible_cpu(i)
725 if (chan->buf[i]) 725 if (chan->buf[i])
726 relay_switch_subbuf(chan->buf[i], 0); 726 relay_switch_subbuf(chan->buf[i], 0);
727 mutex_unlock(&relay_channels_mutex); 727 mutex_unlock(&relay_channels_mutex);
728 } 728 }
729 EXPORT_SYMBOL_GPL(relay_flush); 729 EXPORT_SYMBOL_GPL(relay_flush);
730 730
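Together with relay_close() above, this gives the usual client teardown order; a sketch, again using the hypothetical chan and my_dir from the earlier sketches:

static void my_teardown(void)
{
        relay_flush(chan);      /* force out partially filled sub-buffers */
        relay_close(chan);      /* remove buffer files, drop the channel */
        debugfs_remove(my_dir);
}
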
731 /** 731 /**
732 * relay_file_open - open file op for relay files 732 * relay_file_open - open file op for relay files
733 * @inode: the inode 733 * @inode: the inode
734 * @filp: the file 734 * @filp: the file
735 * 735 *
736 * Increments the channel buffer refcount. 736 * Increments the channel buffer refcount.
737 */ 737 */
738 static int relay_file_open(struct inode *inode, struct file *filp) 738 static int relay_file_open(struct inode *inode, struct file *filp)
739 { 739 {
740 struct rchan_buf *buf = inode->i_private; 740 struct rchan_buf *buf = inode->i_private;
741 kref_get(&buf->kref); 741 kref_get(&buf->kref);
742 filp->private_data = buf; 742 filp->private_data = buf;
743 743
744 return 0; 744 return 0;
745 } 745 }
746 746
747 /** 747 /**
748 * relay_file_mmap - mmap file op for relay files 748 * relay_file_mmap - mmap file op for relay files
749 * @filp: the file 749 * @filp: the file
750 * @vma: the vma describing what to map 750 * @vma: the vma describing what to map
751 * 751 *
752 * Calls upon relay_mmap_buf() to map the file into user space. 752 * Calls upon relay_mmap_buf() to map the file into user space.
753 */ 753 */
754 static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) 754 static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
755 { 755 {
756 struct rchan_buf *buf = filp->private_data; 756 struct rchan_buf *buf = filp->private_data;
757 return relay_mmap_buf(buf, vma); 757 return relay_mmap_buf(buf, vma);
758 } 758 }
759 759
760 /** 760 /**
761 * relay_file_poll - poll file op for relay files 761 * relay_file_poll - poll file op for relay files
762 * @filp: the file 762 * @filp: the file
763 * @wait: poll table 763 * @wait: poll table
764 * 764 *
765 * Poll implementation. 765 * Poll implementation.
766 */ 766 */
767 static unsigned int relay_file_poll(struct file *filp, poll_table *wait) 767 static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
768 { 768 {
769 unsigned int mask = 0; 769 unsigned int mask = 0;
770 struct rchan_buf *buf = filp->private_data; 770 struct rchan_buf *buf = filp->private_data;
771 771
772 if (buf->finalized) 772 if (buf->finalized)
773 return POLLERR; 773 return POLLERR;
774 774
775 if (filp->f_mode & FMODE_READ) { 775 if (filp->f_mode & FMODE_READ) {
776 poll_wait(filp, &buf->read_wait, wait); 776 poll_wait(filp, &buf->read_wait, wait);
777 if (!relay_buf_empty(buf)) 777 if (!relay_buf_empty(buf))
778 mask |= POLLIN | POLLRDNORM; 778 mask |= POLLIN | POLLRDNORM;
779 } 779 }
780 780
781 return mask; 781 return mask;
782 } 782 }
783 783
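From user space the buffer file behaves like any other pollable descriptor. A sketch of a reader draining one per-cpu file; the path is whatever the client's create_buf_file() hook chose:

#include <poll.h>
#include <fcntl.h>
#include <unistd.h>

static int drain(const char *path)
{
        char data[4096];
        struct pollfd pfd = { .events = POLLIN };
        ssize_t n;

        pfd.fd = open(path, O_RDONLY);
        if (pfd.fd < 0)
                return -1;

        /* POLLERR is reported once the buffer has been finalized */
        while (poll(&pfd, 1, -1) > 0 && !(pfd.revents & POLLERR))
                while ((n = read(pfd.fd, data, sizeof(data))) > 0)
                        ;       /* process n bytes of trace data here */

        close(pfd.fd);
        return 0;
}
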
784 /** 784 /**
785 * relay_file_release - release file op for relay files 785 * relay_file_release - release file op for relay files
786 * @inode: the inode 786 * @inode: the inode
787 * @filp: the file 787 * @filp: the file
788 * 788 *
789 * Decrements the channel buffer refcount, as the filesystem is 789 * Decrements the channel buffer refcount, as the filesystem is
790 * no longer using it. 790 * no longer using it.
791 */ 791 */
792 static int relay_file_release(struct inode *inode, struct file *filp) 792 static int relay_file_release(struct inode *inode, struct file *filp)
793 { 793 {
794 struct rchan_buf *buf = filp->private_data; 794 struct rchan_buf *buf = filp->private_data;
795 kref_put(&buf->kref, relay_remove_buf); 795 kref_put(&buf->kref, relay_remove_buf);
796 796
797 return 0; 797 return 0;
798 } 798 }
799 799
800 /* 800 /*
801 * relay_file_read_consume - update the consumed count for the buffer 801 * relay_file_read_consume - update the consumed count for the buffer
802 */ 802 */
803 static void relay_file_read_consume(struct rchan_buf *buf, 803 static void relay_file_read_consume(struct rchan_buf *buf,
804 size_t read_pos, 804 size_t read_pos,
805 size_t bytes_consumed) 805 size_t bytes_consumed)
806 { 806 {
807 size_t subbuf_size = buf->chan->subbuf_size; 807 size_t subbuf_size = buf->chan->subbuf_size;
808 size_t n_subbufs = buf->chan->n_subbufs; 808 size_t n_subbufs = buf->chan->n_subbufs;
809 size_t read_subbuf; 809 size_t read_subbuf;
810 810
811 if (buf->bytes_consumed + bytes_consumed > subbuf_size) { 811 if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
812 relay_subbufs_consumed(buf->chan, buf->cpu, 1); 812 relay_subbufs_consumed(buf->chan, buf->cpu, 1);
813 buf->bytes_consumed = 0; 813 buf->bytes_consumed = 0;
814 } 814 }
815 815
816 buf->bytes_consumed += bytes_consumed; 816 buf->bytes_consumed += bytes_consumed;
817 if (!read_pos) 817 if (!read_pos)
818 read_subbuf = buf->subbufs_consumed % n_subbufs; 818 read_subbuf = buf->subbufs_consumed % n_subbufs;
819 else 819 else
820 read_subbuf = read_pos / buf->chan->subbuf_size; 820 read_subbuf = read_pos / buf->chan->subbuf_size;
821 if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) { 821 if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
822 if ((read_subbuf == buf->subbufs_produced % n_subbufs) && 822 if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
823 (buf->offset == subbuf_size)) 823 (buf->offset == subbuf_size))
824 return; 824 return;
825 relay_subbufs_consumed(buf->chan, buf->cpu, 1); 825 relay_subbufs_consumed(buf->chan, buf->cpu, 1);
826 buf->bytes_consumed = 0; 826 buf->bytes_consumed = 0;
827 } 827 }
828 } 828 }
829 829
830 /* 830 /*
831 * relay_file_read_avail - boolean, are there unconsumed bytes available? 831 * relay_file_read_avail - boolean, are there unconsumed bytes available?
832 */ 832 */
833 static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) 833 static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
834 { 834 {
835 size_t subbuf_size = buf->chan->subbuf_size; 835 size_t subbuf_size = buf->chan->subbuf_size;
836 size_t n_subbufs = buf->chan->n_subbufs; 836 size_t n_subbufs = buf->chan->n_subbufs;
837 size_t produced = buf->subbufs_produced; 837 size_t produced = buf->subbufs_produced;
838 size_t consumed = buf->subbufs_consumed; 838 size_t consumed = buf->subbufs_consumed;
839 839
840 relay_file_read_consume(buf, read_pos, 0); 840 relay_file_read_consume(buf, read_pos, 0);
841 841
842 if (unlikely(buf->offset > subbuf_size)) { 842 if (unlikely(buf->offset > subbuf_size)) {
843 if (produced == consumed) 843 if (produced == consumed)
844 return 0; 844 return 0;
845 return 1; 845 return 1;
846 } 846 }
847 847
848 if (unlikely(produced - consumed >= n_subbufs)) { 848 if (unlikely(produced - consumed >= n_subbufs)) {
849 consumed = produced - n_subbufs + 1; 849 consumed = produced - n_subbufs + 1;
850 buf->subbufs_consumed = consumed; 850 buf->subbufs_consumed = consumed;
851 buf->bytes_consumed = 0; 851 buf->bytes_consumed = 0;
852 } 852 }
853 853
854 produced = (produced % n_subbufs) * subbuf_size + buf->offset; 854 produced = (produced % n_subbufs) * subbuf_size + buf->offset;
855 consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed; 855 consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed;
856 856
857 if (consumed > produced) 857 if (consumed > produced)
858 produced += n_subbufs * subbuf_size; 858 produced += n_subbufs * subbuf_size;
859 859
860 if (consumed == produced) 860 if (consumed == produced)
861 return 0; 861 return 0;
862 862
863 return 1; 863 return 1;
864 } 864 }
865 865
866 /** 866 /**
867 * relay_file_read_subbuf_avail - return bytes available in sub-buffer 867 * relay_file_read_subbuf_avail - return bytes available in sub-buffer
868 * @read_pos: file read position 868 * @read_pos: file read position
869 * @buf: relay channel buffer 869 * @buf: relay channel buffer
870 */ 870 */
871 static size_t relay_file_read_subbuf_avail(size_t read_pos, 871 static size_t relay_file_read_subbuf_avail(size_t read_pos,
872 struct rchan_buf *buf) 872 struct rchan_buf *buf)
873 { 873 {
874 size_t padding, avail = 0; 874 size_t padding, avail = 0;
875 size_t read_subbuf, read_offset, write_subbuf, write_offset; 875 size_t read_subbuf, read_offset, write_subbuf, write_offset;
876 size_t subbuf_size = buf->chan->subbuf_size; 876 size_t subbuf_size = buf->chan->subbuf_size;
877 877
878 write_subbuf = (buf->data - buf->start) / subbuf_size; 878 write_subbuf = (buf->data - buf->start) / subbuf_size;
879 write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset; 879 write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
880 read_subbuf = read_pos / subbuf_size; 880 read_subbuf = read_pos / subbuf_size;
881 read_offset = read_pos % subbuf_size; 881 read_offset = read_pos % subbuf_size;
882 padding = buf->padding[read_subbuf]; 882 padding = buf->padding[read_subbuf];
883 883
884 if (read_subbuf == write_subbuf) { 884 if (read_subbuf == write_subbuf) {
885 if (read_offset + padding < write_offset) 885 if (read_offset + padding < write_offset)
886 avail = write_offset - (read_offset + padding); 886 avail = write_offset - (read_offset + padding);
887 } else 887 } else
888 avail = (subbuf_size - padding) - read_offset; 888 avail = (subbuf_size - padding) - read_offset;
889 889
890 return avail; 890 return avail;
891 } 891 }
892 892
893 /** 893 /**
894 * relay_file_read_start_pos - find the first available byte to read 894 * relay_file_read_start_pos - find the first available byte to read
895 * @read_pos: file read position 895 * @read_pos: file read position
896 * @buf: relay channel buffer 896 * @buf: relay channel buffer
897 * 897 *
898 * If the @read_pos is in the middle of padding, return the 898 * If the @read_pos is in the middle of padding, return the
899 * position of the first actually available byte, otherwise 899 * position of the first actually available byte, otherwise
900 * return the original value. 900 * return the original value.
901 */ 901 */
902 static size_t relay_file_read_start_pos(size_t read_pos, 902 static size_t relay_file_read_start_pos(size_t read_pos,
903 struct rchan_buf *buf) 903 struct rchan_buf *buf)
904 { 904 {
905 size_t read_subbuf, padding, padding_start, padding_end; 905 size_t read_subbuf, padding, padding_start, padding_end;
906 size_t subbuf_size = buf->chan->subbuf_size; 906 size_t subbuf_size = buf->chan->subbuf_size;
907 size_t n_subbufs = buf->chan->n_subbufs; 907 size_t n_subbufs = buf->chan->n_subbufs;
908 size_t consumed = buf->subbufs_consumed % n_subbufs; 908 size_t consumed = buf->subbufs_consumed % n_subbufs;
909 909
910 if (!read_pos) 910 if (!read_pos)
911 read_pos = consumed * subbuf_size + buf->bytes_consumed; 911 read_pos = consumed * subbuf_size + buf->bytes_consumed;
912 read_subbuf = read_pos / subbuf_size; 912 read_subbuf = read_pos / subbuf_size;
913 padding = buf->padding[read_subbuf]; 913 padding = buf->padding[read_subbuf];
914 padding_start = (read_subbuf + 1) * subbuf_size - padding; 914 padding_start = (read_subbuf + 1) * subbuf_size - padding;
915 padding_end = (read_subbuf + 1) * subbuf_size; 915 padding_end = (read_subbuf + 1) * subbuf_size;
916 if (read_pos >= padding_start && read_pos < padding_end) { 916 if (read_pos >= padding_start && read_pos < padding_end) {
917 read_subbuf = (read_subbuf + 1) % n_subbufs; 917 read_subbuf = (read_subbuf + 1) % n_subbufs;
918 read_pos = read_subbuf * subbuf_size; 918 read_pos = read_subbuf * subbuf_size;
919 } 919 }
920 920
921 return read_pos; 921 return read_pos;
922 } 922 }
923 923
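A concrete illustration of the padding skip, with made-up numbers: if subbuf_size is 4096 and padding[0] is 96, the padding of sub-buffer 0 occupies positions 4000..4095; a read_pos of 4032 falls in that window, so it is advanced to 4096, the first byte of sub-buffer 1.
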
924 /** 924 /**
925 * relay_file_read_end_pos - return the new read position 925 * relay_file_read_end_pos - return the new read position
926 * @buf: relay channel buffer 926 * @buf: relay channel buffer
927 * @read_pos: file read position 927 * @read_pos: file read position
928 * @count: number of bytes to be read 928 * @count: number of bytes to be read
929 */ 929 */
930 static size_t relay_file_read_end_pos(struct rchan_buf *buf, 930 static size_t relay_file_read_end_pos(struct rchan_buf *buf,
931 size_t read_pos, 931 size_t read_pos,
932 size_t count) 932 size_t count)
933 { 933 {
934 size_t read_subbuf, padding, end_pos; 934 size_t read_subbuf, padding, end_pos;
935 size_t subbuf_size = buf->chan->subbuf_size; 935 size_t subbuf_size = buf->chan->subbuf_size;
936 size_t n_subbufs = buf->chan->n_subbufs; 936 size_t n_subbufs = buf->chan->n_subbufs;
937 937
938 read_subbuf = read_pos / subbuf_size; 938 read_subbuf = read_pos / subbuf_size;
939 padding = buf->padding[read_subbuf]; 939 padding = buf->padding[read_subbuf];
940 if (read_pos % subbuf_size + count + padding == subbuf_size) 940 if (read_pos % subbuf_size + count + padding == subbuf_size)
941 end_pos = (read_subbuf + 1) * subbuf_size; 941 end_pos = (read_subbuf + 1) * subbuf_size;
942 else 942 else
943 end_pos = read_pos + count; 943 end_pos = read_pos + count;
944 if (end_pos >= subbuf_size * n_subbufs) 944 if (end_pos >= subbuf_size * n_subbufs)
945 end_pos = 0; 945 end_pos = 0;
946 946
947 return end_pos; 947 return end_pos;
948 } 948 }
949 949
950 /* 950 /*
951 * subbuf_read_actor - read up to one subbuf's worth of data 951 * subbuf_read_actor - read up to one subbuf's worth of data
952 */ 952 */
953 static int subbuf_read_actor(size_t read_start, 953 static int subbuf_read_actor(size_t read_start,
954 struct rchan_buf *buf, 954 struct rchan_buf *buf,
955 size_t avail, 955 size_t avail,
956 read_descriptor_t *desc, 956 read_descriptor_t *desc,
957 read_actor_t actor) 957 read_actor_t actor)
958 { 958 {
959 void *from; 959 void *from;
960 int ret = 0; 960 int ret = 0;
961 961
962 from = buf->start + read_start; 962 from = buf->start + read_start;
963 ret = avail; 963 ret = avail;
964 if (copy_to_user(desc->arg.buf, from, avail)) { 964 if (copy_to_user(desc->arg.buf, from, avail)) {
965 desc->error = -EFAULT; 965 desc->error = -EFAULT;
966 ret = 0; 966 ret = 0;
967 } 967 }
968 desc->arg.data += ret; 968 desc->arg.data += ret;
969 desc->written += ret; 969 desc->written += ret;
970 desc->count -= ret; 970 desc->count -= ret;
971 971
972 return ret; 972 return ret;
973 } 973 }
974 974
975 typedef int (*subbuf_actor_t) (size_t read_start, 975 typedef int (*subbuf_actor_t) (size_t read_start,
976 struct rchan_buf *buf, 976 struct rchan_buf *buf,
977 size_t avail, 977 size_t avail,
978 read_descriptor_t *desc, 978 read_descriptor_t *desc,
979 read_actor_t actor); 979 read_actor_t actor);
980 980
981 /* 981 /*
982 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries 982 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
983 */ 983 */
984 static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, 984 static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
985 subbuf_actor_t subbuf_actor, 985 subbuf_actor_t subbuf_actor,
986 read_actor_t actor, 986 read_actor_t actor,
987 read_descriptor_t *desc) 987 read_descriptor_t *desc)
988 { 988 {
989 struct rchan_buf *buf = filp->private_data; 989 struct rchan_buf *buf = filp->private_data;
990 size_t read_start, avail; 990 size_t read_start, avail;
991 int ret; 991 int ret;
992 992
993 if (!desc->count) 993 if (!desc->count)
994 return 0; 994 return 0;
995 995
996 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); 996 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
997 do { 997 do {
998 if (!relay_file_read_avail(buf, *ppos)) 998 if (!relay_file_read_avail(buf, *ppos))
999 break; 999 break;
1000 1000
1001 read_start = relay_file_read_start_pos(*ppos, buf); 1001 read_start = relay_file_read_start_pos(*ppos, buf);
1002 avail = relay_file_read_subbuf_avail(read_start, buf); 1002 avail = relay_file_read_subbuf_avail(read_start, buf);
1003 if (!avail) 1003 if (!avail)
1004 break; 1004 break;
1005 1005
1006 avail = min(desc->count, avail); 1006 avail = min(desc->count, avail);
1007 ret = subbuf_actor(read_start, buf, avail, desc, actor); 1007 ret = subbuf_actor(read_start, buf, avail, desc, actor);
1008 if (desc->error < 0) 1008 if (desc->error < 0)
1009 break; 1009 break;
1010 1010
1011 if (ret) { 1011 if (ret) {
1012 relay_file_read_consume(buf, read_start, ret); 1012 relay_file_read_consume(buf, read_start, ret);
1013 *ppos = relay_file_read_end_pos(buf, read_start, ret); 1013 *ppos = relay_file_read_end_pos(buf, read_start, ret);
1014 } 1014 }
1015 } while (desc->count && ret); 1015 } while (desc->count && ret);
1016 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); 1016 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
1017 1017
1018 return desc->written; 1018 return desc->written;
1019 } 1019 }
1020 1020
1021 static ssize_t relay_file_read(struct file *filp, 1021 static ssize_t relay_file_read(struct file *filp,
1022 char __user *buffer, 1022 char __user *buffer,
1023 size_t count, 1023 size_t count,
1024 loff_t *ppos) 1024 loff_t *ppos)
1025 { 1025 {
1026 read_descriptor_t desc; 1026 read_descriptor_t desc;
1027 desc.written = 0; 1027 desc.written = 0;
1028 desc.count = count; 1028 desc.count = count;
1029 desc.arg.buf = buffer; 1029 desc.arg.buf = buffer;
1030 desc.error = 0; 1030 desc.error = 0;
1031 return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, 1031 return relay_file_read_subbufs(filp, ppos, subbuf_read_actor,
1032 NULL, &desc); 1032 NULL, &desc);
1033 } 1033 }
1034 1034
1035 static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) 1035 static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
1036 { 1036 {
1037 rbuf->bytes_consumed += bytes_consumed; 1037 rbuf->bytes_consumed += bytes_consumed;
1038 1038
1039 if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) { 1039 if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) {
1040 relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1); 1040 relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1);
1041 rbuf->bytes_consumed %= rbuf->chan->subbuf_size; 1041 rbuf->bytes_consumed %= rbuf->chan->subbuf_size;
1042 } 1042 }
1043 } 1043 }
1044 1044
1045 static void relay_pipe_buf_release(struct pipe_inode_info *pipe, 1045 static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
1046 struct pipe_buffer *buf) 1046 struct pipe_buffer *buf)
1047 { 1047 {
1048 struct rchan_buf *rbuf; 1048 struct rchan_buf *rbuf;
1049 1049
1050 rbuf = (struct rchan_buf *)page_private(buf->page); 1050 rbuf = (struct rchan_buf *)page_private(buf->page);
1051 relay_consume_bytes(rbuf, buf->private); 1051 relay_consume_bytes(rbuf, buf->private);
1052 } 1052 }
1053 1053
1054 static struct pipe_buf_operations relay_pipe_buf_ops = { 1054 static struct pipe_buf_operations relay_pipe_buf_ops = {
1055 .can_merge = 0, 1055 .can_merge = 0,
1056 .map = generic_pipe_buf_map, 1056 .map = generic_pipe_buf_map,
1057 .unmap = generic_pipe_buf_unmap, 1057 .unmap = generic_pipe_buf_unmap,
1058 .pin = generic_pipe_buf_pin, 1058 .confirm = generic_pipe_buf_confirm,
1059 .release = relay_pipe_buf_release, 1059 .release = relay_pipe_buf_release,
1060 .steal = generic_pipe_buf_steal, 1060 .steal = generic_pipe_buf_steal,
1061 .get = generic_pipe_buf_get, 1061 .get = generic_pipe_buf_get,
1062 }; 1062 };
1063 1063
1064 /* 1064 /*
1065 * subbuf_splice_actor - splice up to one subbuf's worth of data 1065 * subbuf_splice_actor - splice up to one subbuf's worth of data
1066 */ 1066 */
1067 static int subbuf_splice_actor(struct file *in, 1067 static int subbuf_splice_actor(struct file *in,
1068 loff_t *ppos, 1068 loff_t *ppos,
1069 struct pipe_inode_info *pipe, 1069 struct pipe_inode_info *pipe,
1070 size_t len, 1070 size_t len,
1071 unsigned int flags, 1071 unsigned int flags,
1072 int *nonpad_ret) 1072 int *nonpad_ret)
1073 { 1073 {
1074 unsigned int pidx, poff, total_len, subbuf_pages; int ret; 1074 unsigned int pidx, poff, total_len, subbuf_pages; int ret;
1075 struct rchan_buf *rbuf = in->private_data; 1075 struct rchan_buf *rbuf = in->private_data;
1076 unsigned int subbuf_size = rbuf->chan->subbuf_size; 1076 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1077 size_t read_start = ((size_t)*ppos) % rbuf->chan->alloc_size; 1077 size_t read_start = ((size_t)*ppos) % rbuf->chan->alloc_size;
1078 size_t read_subbuf = read_start / subbuf_size; 1078 size_t read_subbuf = read_start / subbuf_size;
1079 size_t padding = rbuf->padding[read_subbuf]; 1079 size_t padding = rbuf->padding[read_subbuf];
1080 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; 1080 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
1081 struct page *pages[PIPE_BUFFERS]; 1081 struct page *pages[PIPE_BUFFERS];
1082 struct partial_page partial[PIPE_BUFFERS]; 1082 struct partial_page partial[PIPE_BUFFERS];
1083 struct splice_pipe_desc spd = { 1083 struct splice_pipe_desc spd = {
1084 .pages = pages, 1084 .pages = pages,
1085 .nr_pages = 0, 1085 .nr_pages = 0,
1086 .partial = partial, 1086 .partial = partial,
1087 .flags = flags, 1087 .flags = flags,
1088 .ops = &relay_pipe_buf_ops, 1088 .ops = &relay_pipe_buf_ops,
1089 }; 1089 };
1090 1090
1091 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1091 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1092 return 0; 1092 return 0;
1093 1093
1094 /* 1094 /*
1095 * Adjust read len, if longer than what is available 1095 * Adjust read len, if longer than what is available
1096 */ 1096 */
1097 if (len > (subbuf_size - read_start % subbuf_size)) 1097 if (len > (subbuf_size - read_start % subbuf_size))
1098 len = subbuf_size - read_start % subbuf_size; 1098 len = subbuf_size - read_start % subbuf_size;
1099 1099
1100 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; 1100 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
1101 pidx = (read_start / PAGE_SIZE) % subbuf_pages; 1101 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
1102 poff = read_start & ~PAGE_MASK; 1102 poff = read_start & ~PAGE_MASK;
1103 1103
1104 for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) { 1104 for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) {
1105 unsigned int this_len, this_end, private; 1105 unsigned int this_len, this_end, private;
1106 unsigned int cur_pos = read_start + total_len; 1106 unsigned int cur_pos = read_start + total_len;
1107 1107
1108 if (!len) 1108 if (!len)
1109 break; 1109 break;
1110 1110
1111 this_len = min_t(unsigned long, len, PAGE_SIZE - poff); 1111 this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
1112 private = this_len; 1112 private = this_len;
1113 1113
1114 spd.pages[spd.nr_pages] = rbuf->page_array[pidx]; 1114 spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
1115 spd.partial[spd.nr_pages].offset = poff; 1115 spd.partial[spd.nr_pages].offset = poff;
1116 1116
1117 this_end = cur_pos + this_len; 1117 this_end = cur_pos + this_len;
1118 if (this_end >= nonpad_end) { 1118 if (this_end >= nonpad_end) {
1119 this_len = nonpad_end - cur_pos; 1119 this_len = nonpad_end - cur_pos;
1120 private = this_len + padding; 1120 private = this_len + padding;
1121 } 1121 }
1122 spd.partial[spd.nr_pages].len = this_len; 1122 spd.partial[spd.nr_pages].len = this_len;
1123 spd.partial[spd.nr_pages].private = private; 1123 spd.partial[spd.nr_pages].private = private;
1124 1124
1125 len -= this_len; 1125 len -= this_len;
1126 total_len += this_len; 1126 total_len += this_len;
1127 poff = 0; 1127 poff = 0;
1128 pidx = (pidx + 1) % subbuf_pages; 1128 pidx = (pidx + 1) % subbuf_pages;
1129 1129
1130 if (this_end >= nonpad_end) { 1130 if (this_end >= nonpad_end) {
1131 spd.nr_pages++; 1131 spd.nr_pages++;
1132 break; 1132 break;
1133 } 1133 }
1134 } 1134 }
1135 1135
1136 if (!spd.nr_pages) 1136 if (!spd.nr_pages)
1137 return 0; 1137 return 0;
1138 1138
1139 ret = *nonpad_ret = splice_to_pipe(pipe, &spd); 1139 ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
1140 if (ret < 0 || ret < total_len) 1140 if (ret < 0 || ret < total_len)
1141 return ret; 1141 return ret;
1142 1142
1143 if (read_start + ret == nonpad_end) 1143 if (read_start + ret == nonpad_end)
1144 ret += padding; 1144 ret += padding;
1145 1145
1146 return ret; 1146 return ret;
1147 } 1147 }
1148 1148
1149 static ssize_t relay_file_splice_read(struct file *in, 1149 static ssize_t relay_file_splice_read(struct file *in,
1150 loff_t *ppos, 1150 loff_t *ppos,
1151 struct pipe_inode_info *pipe, 1151 struct pipe_inode_info *pipe,
1152 size_t len, 1152 size_t len,
1153 unsigned int flags) 1153 unsigned int flags)
1154 { 1154 {
1155 ssize_t spliced; 1155 ssize_t spliced;
1156 int ret; 1156 int ret;
1157 int nonpad_ret = 0; 1157 int nonpad_ret = 0;
1158 1158
1159 ret = 0; 1159 ret = 0;
1160 spliced = 0; 1160 spliced = 0;
1161 1161
1162 while (len) { 1162 while (len) {
1163 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); 1163 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
1164 if (ret < 0) 1164 if (ret < 0)
1165 break; 1165 break;
1166 else if (!ret) { 1166 else if (!ret) {
1167 if (spliced) 1167 if (spliced)
1168 break; 1168 break;
1169 if (flags & SPLICE_F_NONBLOCK) { 1169 if (flags & SPLICE_F_NONBLOCK) {
1170 ret = -EAGAIN; 1170 ret = -EAGAIN;
1171 break; 1171 break;
1172 } 1172 }
1173 } 1173 }
1174 1174
1175 *ppos += ret; 1175 *ppos += ret;
1176 if (ret > len) 1176 if (ret > len)
1177 len = 0; 1177 len = 0;
1178 else 1178 else
1179 len -= ret; 1179 len -= ret;
1180 spliced += nonpad_ret; 1180 spliced += nonpad_ret;
1181 nonpad_ret = 0; 1181 nonpad_ret = 0;
1182 } 1182 }
1183 1183
1184 if (spliced) 1184 if (spliced)
1185 return spliced; 1185 return spliced;
1186 1186
1187 return ret; 1187 return ret;
1188 } 1188 }
1189 1189
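This splice path is what lets a user-space reader move relay data toward another descriptor without copying it through user memory. A sketch, with a hypothetical buffer-file path and stdout as the destination:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static int splice_out(const char *path)
{
        int pfd[2], in = open(path, O_RDONLY);
        ssize_t n;

        if (in < 0 || pipe(pfd) < 0)
                return -1;

        /* relay file -> pipe -> stdout, no user-space copy; stops
         * once no data is immediately available (EAGAIN) */
        while ((n = splice(in, NULL, pfd[1], NULL, 65536,
                           SPLICE_F_NONBLOCK)) > 0)
                splice(pfd[0], NULL, STDOUT_FILENO, NULL, n, 0);

        close(pfd[0]); close(pfd[1]); close(in);
        return 0;
}
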
1190 const struct file_operations relay_file_operations = { 1190 const struct file_operations relay_file_operations = {
1191 .open = relay_file_open, 1191 .open = relay_file_open,
1192 .poll = relay_file_poll, 1192 .poll = relay_file_poll,
1193 .mmap = relay_file_mmap, 1193 .mmap = relay_file_mmap,
1194 .read = relay_file_read, 1194 .read = relay_file_read,
1195 .llseek = no_llseek, 1195 .llseek = no_llseek,
1196 .release = relay_file_release, 1196 .release = relay_file_release,
1197 .splice_read = relay_file_splice_read, 1197 .splice_read = relay_file_splice_read,
1198 }; 1198 };
1199 EXPORT_SYMBOL_GPL(relay_file_operations); 1199 EXPORT_SYMBOL_GPL(relay_file_operations);
1200 1200
1201 static __init int relay_init(void) 1201 static __init int relay_init(void)
1202 { 1202 {
1203 1203
1204 hotcpu_notifier(relay_hotcpu_callback, 0); 1204 hotcpu_notifier(relay_hotcpu_callback, 0);
1205 return 0; 1205 return 0;
1206 } 1206 }
1207 1207
1208 module_init(relay_init); 1208 module_init(relay_init);
1209 1209