Commit cac36bb06efe4880234524e117e0e712b10b1f16
1 parent d96e6e7164 · Exists in master and in 4 other branches
pipe: change the ->pin() operation to ->confirm()
The name 'pin' was badly chosen; it doesn't pin a pipe buffer in the sense most commonly used in the kernel. So change the name to 'confirm', after debating the issue with Hugh Dickins a bit. A good return from ->confirm() means that the buffer is really there and that its contents are good.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
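For context, a minimal sketch of how a consumer uses the renamed hook, mirroring the lo_splice_actor() hunk in the diff below. The function name example_splice_actor is illustrative only and is not part of this commit:

#include <linux/pipe_fs_i.h>
#include <linux/splice.h>

/*
 * Illustrative splice actor: a zero return from ->confirm() (formerly
 * ->pin()) guarantees the pipe buffer is really there and its contents
 * are good, so only then may the actor touch buf->page.
 */
static int example_splice_actor(struct pipe_inode_info *pipe,
				struct pipe_buffer *buf,
				struct splice_desc *sd)
{
	int ret;

	ret = buf->ops->confirm(pipe, buf);	/* was buf->ops->pin() */
	if (unlikely(ret))
		return ret;

	/* ... consume up to sd->len bytes at buf->page + buf->offset ... */
	return sd->len;
}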
Showing 7 changed files with 22 additions and 20 deletions
drivers/block/loop.c
| 1 | /* | 1 | /* |
| 2 | * linux/drivers/block/loop.c | 2 | * linux/drivers/block/loop.c |
| 3 | * | 3 | * |
| 4 | * Written by Theodore Ts'o, 3/29/93 | 4 | * Written by Theodore Ts'o, 3/29/93 |
| 5 | * | 5 | * |
| 6 | * Copyright 1993 by Theodore Ts'o. Redistribution of this file is | 6 | * Copyright 1993 by Theodore Ts'o. Redistribution of this file is |
| 7 | * permitted under the GNU General Public License. | 7 | * permitted under the GNU General Public License. |
| 8 | * | 8 | * |
| 9 | * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993 | 9 | * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993 |
| 10 | * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996 | 10 | * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996 |
| 11 | * | 11 | * |
| 12 | * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 | 12 | * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 |
| 13 | * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 | 13 | * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 |
| 14 | * | 14 | * |
| 15 | * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997 | 15 | * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997 |
| 16 | * | 16 | * |
| 17 | * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998 | 17 | * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998 |
| 18 | * | 18 | * |
| 19 | * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998 | 19 | * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998 |
| 20 | * | 20 | * |
| 21 | * Loadable modules and other fixes by AK, 1998 | 21 | * Loadable modules and other fixes by AK, 1998 |
| 22 | * | 22 | * |
| 23 | * Make real block number available to downstream transfer functions, enables | 23 | * Make real block number available to downstream transfer functions, enables |
| 24 | * CBC (and relatives) mode encryption requiring unique IVs per data block. | 24 | * CBC (and relatives) mode encryption requiring unique IVs per data block. |
| 25 | * Reed H. Petty, rhp@draper.net | 25 | * Reed H. Petty, rhp@draper.net |
| 26 | * | 26 | * |
| 27 | * Maximum number of loop devices now dynamic via max_loop module parameter. | 27 | * Maximum number of loop devices now dynamic via max_loop module parameter. |
| 28 | * Russell Kroll <rkroll@exploits.org> 19990701 | 28 | * Russell Kroll <rkroll@exploits.org> 19990701 |
| 29 | * | 29 | * |
| 30 | * Maximum number of loop devices when compiled-in now selectable by passing | 30 | * Maximum number of loop devices when compiled-in now selectable by passing |
| 31 | * max_loop=<1-255> to the kernel on boot. | 31 | * max_loop=<1-255> to the kernel on boot. |
| 32 | * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999 | 32 | * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999 |
| 33 | * | 33 | * |
| 34 | * Completely rewrite request handling to be make_request_fn style and | 34 | * Completely rewrite request handling to be make_request_fn style and |
| 35 | * non blocking, pushing work to a helper thread. Lots of fixes from | 35 | * non blocking, pushing work to a helper thread. Lots of fixes from |
| 36 | * Al Viro too. | 36 | * Al Viro too. |
| 37 | * Jens Axboe <axboe@suse.de>, Nov 2000 | 37 | * Jens Axboe <axboe@suse.de>, Nov 2000 |
| 38 | * | 38 | * |
| 39 | * Support up to 256 loop devices | 39 | * Support up to 256 loop devices |
| 40 | * Heinz Mauelshagen <mge@sistina.com>, Feb 2002 | 40 | * Heinz Mauelshagen <mge@sistina.com>, Feb 2002 |
| 41 | * | 41 | * |
| 42 | * Support for falling back on the write file operation when the address space | 42 | * Support for falling back on the write file operation when the address space |
| 43 | * operations prepare_write and/or commit_write are not available on the | 43 | * operations prepare_write and/or commit_write are not available on the |
| 44 | * backing filesystem. | 44 | * backing filesystem. |
| 45 | * Anton Altaparmakov, 16 Feb 2005 | 45 | * Anton Altaparmakov, 16 Feb 2005 |
| 46 | * | 46 | * |
| 47 | * Still To Fix: | 47 | * Still To Fix: |
| 48 | * - Advisory locking is ignored here. | 48 | * - Advisory locking is ignored here. |
| 49 | * - Should use an own CAP_* category instead of CAP_SYS_ADMIN | 49 | * - Should use an own CAP_* category instead of CAP_SYS_ADMIN |
| 50 | * | 50 | * |
| 51 | */ | 51 | */ |
| 52 | 52 | ||
| 53 | #include <linux/module.h> | 53 | #include <linux/module.h> |
| 54 | #include <linux/moduleparam.h> | 54 | #include <linux/moduleparam.h> |
| 55 | #include <linux/sched.h> | 55 | #include <linux/sched.h> |
| 56 | #include <linux/fs.h> | 56 | #include <linux/fs.h> |
| 57 | #include <linux/file.h> | 57 | #include <linux/file.h> |
| 58 | #include <linux/stat.h> | 58 | #include <linux/stat.h> |
| 59 | #include <linux/errno.h> | 59 | #include <linux/errno.h> |
| 60 | #include <linux/major.h> | 60 | #include <linux/major.h> |
| 61 | #include <linux/wait.h> | 61 | #include <linux/wait.h> |
| 62 | #include <linux/blkdev.h> | 62 | #include <linux/blkdev.h> |
| 63 | #include <linux/blkpg.h> | 63 | #include <linux/blkpg.h> |
| 64 | #include <linux/init.h> | 64 | #include <linux/init.h> |
| 65 | #include <linux/smp_lock.h> | 65 | #include <linux/smp_lock.h> |
| 66 | #include <linux/swap.h> | 66 | #include <linux/swap.h> |
| 67 | #include <linux/slab.h> | 67 | #include <linux/slab.h> |
| 68 | #include <linux/loop.h> | 68 | #include <linux/loop.h> |
| 69 | #include <linux/compat.h> | 69 | #include <linux/compat.h> |
| 70 | #include <linux/suspend.h> | 70 | #include <linux/suspend.h> |
| 71 | #include <linux/writeback.h> | 71 | #include <linux/writeback.h> |
| 72 | #include <linux/buffer_head.h> /* for invalidate_bdev() */ | 72 | #include <linux/buffer_head.h> /* for invalidate_bdev() */ |
| 73 | #include <linux/completion.h> | 73 | #include <linux/completion.h> |
| 74 | #include <linux/highmem.h> | 74 | #include <linux/highmem.h> |
| 75 | #include <linux/gfp.h> | 75 | #include <linux/gfp.h> |
| 76 | #include <linux/kthread.h> | 76 | #include <linux/kthread.h> |
| 77 | #include <linux/splice.h> | 77 | #include <linux/splice.h> |
| 78 | 78 | ||
| 79 | #include <asm/uaccess.h> | 79 | #include <asm/uaccess.h> |
| 80 | 80 | ||
| 81 | static LIST_HEAD(loop_devices); | 81 | static LIST_HEAD(loop_devices); |
| 82 | static DEFINE_MUTEX(loop_devices_mutex); | 82 | static DEFINE_MUTEX(loop_devices_mutex); |
| 83 | 83 | ||
| 84 | /* | 84 | /* |
| 85 | * Transfer functions | 85 | * Transfer functions |
| 86 | */ | 86 | */ |
| 87 | static int transfer_none(struct loop_device *lo, int cmd, | 87 | static int transfer_none(struct loop_device *lo, int cmd, |
| 88 | struct page *raw_page, unsigned raw_off, | 88 | struct page *raw_page, unsigned raw_off, |
| 89 | struct page *loop_page, unsigned loop_off, | 89 | struct page *loop_page, unsigned loop_off, |
| 90 | int size, sector_t real_block) | 90 | int size, sector_t real_block) |
| 91 | { | 91 | { |
| 92 | char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off; | 92 | char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off; |
| 93 | char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off; | 93 | char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off; |
| 94 | 94 | ||
| 95 | if (cmd == READ) | 95 | if (cmd == READ) |
| 96 | memcpy(loop_buf, raw_buf, size); | 96 | memcpy(loop_buf, raw_buf, size); |
| 97 | else | 97 | else |
| 98 | memcpy(raw_buf, loop_buf, size); | 98 | memcpy(raw_buf, loop_buf, size); |
| 99 | 99 | ||
| 100 | kunmap_atomic(raw_buf, KM_USER0); | 100 | kunmap_atomic(raw_buf, KM_USER0); |
| 101 | kunmap_atomic(loop_buf, KM_USER1); | 101 | kunmap_atomic(loop_buf, KM_USER1); |
| 102 | cond_resched(); | 102 | cond_resched(); |
| 103 | return 0; | 103 | return 0; |
| 104 | } | 104 | } |
| 105 | 105 | ||
| 106 | static int transfer_xor(struct loop_device *lo, int cmd, | 106 | static int transfer_xor(struct loop_device *lo, int cmd, |
| 107 | struct page *raw_page, unsigned raw_off, | 107 | struct page *raw_page, unsigned raw_off, |
| 108 | struct page *loop_page, unsigned loop_off, | 108 | struct page *loop_page, unsigned loop_off, |
| 109 | int size, sector_t real_block) | 109 | int size, sector_t real_block) |
| 110 | { | 110 | { |
| 111 | char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off; | 111 | char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off; |
| 112 | char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off; | 112 | char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off; |
| 113 | char *in, *out, *key; | 113 | char *in, *out, *key; |
| 114 | int i, keysize; | 114 | int i, keysize; |
| 115 | 115 | ||
| 116 | if (cmd == READ) { | 116 | if (cmd == READ) { |
| 117 | in = raw_buf; | 117 | in = raw_buf; |
| 118 | out = loop_buf; | 118 | out = loop_buf; |
| 119 | } else { | 119 | } else { |
| 120 | in = loop_buf; | 120 | in = loop_buf; |
| 121 | out = raw_buf; | 121 | out = raw_buf; |
| 122 | } | 122 | } |
| 123 | 123 | ||
| 124 | key = lo->lo_encrypt_key; | 124 | key = lo->lo_encrypt_key; |
| 125 | keysize = lo->lo_encrypt_key_size; | 125 | keysize = lo->lo_encrypt_key_size; |
| 126 | for (i = 0; i < size; i++) | 126 | for (i = 0; i < size; i++) |
| 127 | *out++ = *in++ ^ key[(i & 511) % keysize]; | 127 | *out++ = *in++ ^ key[(i & 511) % keysize]; |
| 128 | 128 | ||
| 129 | kunmap_atomic(raw_buf, KM_USER0); | 129 | kunmap_atomic(raw_buf, KM_USER0); |
| 130 | kunmap_atomic(loop_buf, KM_USER1); | 130 | kunmap_atomic(loop_buf, KM_USER1); |
| 131 | cond_resched(); | 131 | cond_resched(); |
| 132 | return 0; | 132 | return 0; |
| 133 | } | 133 | } |
| 134 | 134 | ||
| 135 | static int xor_init(struct loop_device *lo, const struct loop_info64 *info) | 135 | static int xor_init(struct loop_device *lo, const struct loop_info64 *info) |
| 136 | { | 136 | { |
| 137 | if (unlikely(info->lo_encrypt_key_size <= 0)) | 137 | if (unlikely(info->lo_encrypt_key_size <= 0)) |
| 138 | return -EINVAL; | 138 | return -EINVAL; |
| 139 | return 0; | 139 | return 0; |
| 140 | } | 140 | } |
| 141 | 141 | ||
| 142 | static struct loop_func_table none_funcs = { | 142 | static struct loop_func_table none_funcs = { |
| 143 | .number = LO_CRYPT_NONE, | 143 | .number = LO_CRYPT_NONE, |
| 144 | .transfer = transfer_none, | 144 | .transfer = transfer_none, |
| 145 | }; | 145 | }; |
| 146 | 146 | ||
| 147 | static struct loop_func_table xor_funcs = { | 147 | static struct loop_func_table xor_funcs = { |
| 148 | .number = LO_CRYPT_XOR, | 148 | .number = LO_CRYPT_XOR, |
| 149 | .transfer = transfer_xor, | 149 | .transfer = transfer_xor, |
| 150 | .init = xor_init | 150 | .init = xor_init |
| 151 | }; | 151 | }; |
| 152 | 152 | ||
| 153 | /* xfer_funcs[0] is special - its release function is never called */ | 153 | /* xfer_funcs[0] is special - its release function is never called */ |
| 154 | static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = { | 154 | static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = { |
| 155 | &none_funcs, | 155 | &none_funcs, |
| 156 | &xor_funcs | 156 | &xor_funcs |
| 157 | }; | 157 | }; |
| 158 | 158 | ||
| 159 | static loff_t get_loop_size(struct loop_device *lo, struct file *file) | 159 | static loff_t get_loop_size(struct loop_device *lo, struct file *file) |
| 160 | { | 160 | { |
| 161 | loff_t size, offset, loopsize; | 161 | loff_t size, offset, loopsize; |
| 162 | 162 | ||
| 163 | /* Compute loopsize in bytes */ | 163 | /* Compute loopsize in bytes */ |
| 164 | size = i_size_read(file->f_mapping->host); | 164 | size = i_size_read(file->f_mapping->host); |
| 165 | offset = lo->lo_offset; | 165 | offset = lo->lo_offset; |
| 166 | loopsize = size - offset; | 166 | loopsize = size - offset; |
| 167 | if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize) | 167 | if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize) |
| 168 | loopsize = lo->lo_sizelimit; | 168 | loopsize = lo->lo_sizelimit; |
| 169 | 169 | ||
| 170 | /* | 170 | /* |
| 171 | * Unfortunately, if we want to do I/O on the device, | 171 | * Unfortunately, if we want to do I/O on the device, |
| 172 | * the number of 512-byte sectors has to fit into a sector_t. | 172 | * the number of 512-byte sectors has to fit into a sector_t. |
| 173 | */ | 173 | */ |
| 174 | return loopsize >> 9; | 174 | return loopsize >> 9; |
| 175 | } | 175 | } |
| 176 | 176 | ||
| 177 | static int | 177 | static int |
| 178 | figure_loop_size(struct loop_device *lo) | 178 | figure_loop_size(struct loop_device *lo) |
| 179 | { | 179 | { |
| 180 | loff_t size = get_loop_size(lo, lo->lo_backing_file); | 180 | loff_t size = get_loop_size(lo, lo->lo_backing_file); |
| 181 | sector_t x = (sector_t)size; | 181 | sector_t x = (sector_t)size; |
| 182 | 182 | ||
| 183 | if (unlikely((loff_t)x != size)) | 183 | if (unlikely((loff_t)x != size)) |
| 184 | return -EFBIG; | 184 | return -EFBIG; |
| 185 | 185 | ||
| 186 | set_capacity(lo->lo_disk, x); | 186 | set_capacity(lo->lo_disk, x); |
| 187 | return 0; | 187 | return 0; |
| 188 | } | 188 | } |
| 189 | 189 | ||
| 190 | static inline int | 190 | static inline int |
| 191 | lo_do_transfer(struct loop_device *lo, int cmd, | 191 | lo_do_transfer(struct loop_device *lo, int cmd, |
| 192 | struct page *rpage, unsigned roffs, | 192 | struct page *rpage, unsigned roffs, |
| 193 | struct page *lpage, unsigned loffs, | 193 | struct page *lpage, unsigned loffs, |
| 194 | int size, sector_t rblock) | 194 | int size, sector_t rblock) |
| 195 | { | 195 | { |
| 196 | if (unlikely(!lo->transfer)) | 196 | if (unlikely(!lo->transfer)) |
| 197 | return 0; | 197 | return 0; |
| 198 | 198 | ||
| 199 | return lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock); | 199 | return lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock); |
| 200 | } | 200 | } |
| 201 | 201 | ||
| 202 | /** | 202 | /** |
| 203 | * do_lo_send_aops - helper for writing data to a loop device | 203 | * do_lo_send_aops - helper for writing data to a loop device |
| 204 | * | 204 | * |
| 205 | * This is the fast version for backing filesystems which implement the address | 205 | * This is the fast version for backing filesystems which implement the address |
| 206 | * space operations prepare_write and commit_write. | 206 | * space operations prepare_write and commit_write. |
| 207 | */ | 207 | */ |
| 208 | static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, | 208 | static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, |
| 209 | int bsize, loff_t pos, struct page *page) | 209 | int bsize, loff_t pos, struct page *page) |
| 210 | { | 210 | { |
| 211 | struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */ | 211 | struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */ |
| 212 | struct address_space *mapping = file->f_mapping; | 212 | struct address_space *mapping = file->f_mapping; |
| 213 | const struct address_space_operations *aops = mapping->a_ops; | 213 | const struct address_space_operations *aops = mapping->a_ops; |
| 214 | pgoff_t index; | 214 | pgoff_t index; |
| 215 | unsigned offset, bv_offs; | 215 | unsigned offset, bv_offs; |
| 216 | int len, ret; | 216 | int len, ret; |
| 217 | 217 | ||
| 218 | mutex_lock(&mapping->host->i_mutex); | 218 | mutex_lock(&mapping->host->i_mutex); |
| 219 | index = pos >> PAGE_CACHE_SHIFT; | 219 | index = pos >> PAGE_CACHE_SHIFT; |
| 220 | offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1); | 220 | offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1); |
| 221 | bv_offs = bvec->bv_offset; | 221 | bv_offs = bvec->bv_offset; |
| 222 | len = bvec->bv_len; | 222 | len = bvec->bv_len; |
| 223 | while (len > 0) { | 223 | while (len > 0) { |
| 224 | sector_t IV; | 224 | sector_t IV; |
| 225 | unsigned size; | 225 | unsigned size; |
| 226 | int transfer_result; | 226 | int transfer_result; |
| 227 | 227 | ||
| 228 | IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9); | 228 | IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9); |
| 229 | size = PAGE_CACHE_SIZE - offset; | 229 | size = PAGE_CACHE_SIZE - offset; |
| 230 | if (size > len) | 230 | if (size > len) |
| 231 | size = len; | 231 | size = len; |
| 232 | page = grab_cache_page(mapping, index); | 232 | page = grab_cache_page(mapping, index); |
| 233 | if (unlikely(!page)) | 233 | if (unlikely(!page)) |
| 234 | goto fail; | 234 | goto fail; |
| 235 | ret = aops->prepare_write(file, page, offset, | 235 | ret = aops->prepare_write(file, page, offset, |
| 236 | offset + size); | 236 | offset + size); |
| 237 | if (unlikely(ret)) { | 237 | if (unlikely(ret)) { |
| 238 | if (ret == AOP_TRUNCATED_PAGE) { | 238 | if (ret == AOP_TRUNCATED_PAGE) { |
| 239 | page_cache_release(page); | 239 | page_cache_release(page); |
| 240 | continue; | 240 | continue; |
| 241 | } | 241 | } |
| 242 | goto unlock; | 242 | goto unlock; |
| 243 | } | 243 | } |
| 244 | transfer_result = lo_do_transfer(lo, WRITE, page, offset, | 244 | transfer_result = lo_do_transfer(lo, WRITE, page, offset, |
| 245 | bvec->bv_page, bv_offs, size, IV); | 245 | bvec->bv_page, bv_offs, size, IV); |
| 246 | if (unlikely(transfer_result)) { | 246 | if (unlikely(transfer_result)) { |
| 247 | /* | 247 | /* |
| 248 | * The transfer failed, but we still write the data to | 248 | * The transfer failed, but we still write the data to |
| 249 | * keep prepare/commit calls balanced. | 249 | * keep prepare/commit calls balanced. |
| 250 | */ | 250 | */ |
| 251 | printk(KERN_ERR "loop: transfer error block %llu\n", | 251 | printk(KERN_ERR "loop: transfer error block %llu\n", |
| 252 | (unsigned long long)index); | 252 | (unsigned long long)index); |
| 253 | zero_user_page(page, offset, size, KM_USER0); | 253 | zero_user_page(page, offset, size, KM_USER0); |
| 254 | } | 254 | } |
| 255 | flush_dcache_page(page); | 255 | flush_dcache_page(page); |
| 256 | ret = aops->commit_write(file, page, offset, | 256 | ret = aops->commit_write(file, page, offset, |
| 257 | offset + size); | 257 | offset + size); |
| 258 | if (unlikely(ret)) { | 258 | if (unlikely(ret)) { |
| 259 | if (ret == AOP_TRUNCATED_PAGE) { | 259 | if (ret == AOP_TRUNCATED_PAGE) { |
| 260 | page_cache_release(page); | 260 | page_cache_release(page); |
| 261 | continue; | 261 | continue; |
| 262 | } | 262 | } |
| 263 | goto unlock; | 263 | goto unlock; |
| 264 | } | 264 | } |
| 265 | if (unlikely(transfer_result)) | 265 | if (unlikely(transfer_result)) |
| 266 | goto unlock; | 266 | goto unlock; |
| 267 | bv_offs += size; | 267 | bv_offs += size; |
| 268 | len -= size; | 268 | len -= size; |
| 269 | offset = 0; | 269 | offset = 0; |
| 270 | index++; | 270 | index++; |
| 271 | pos += size; | 271 | pos += size; |
| 272 | unlock_page(page); | 272 | unlock_page(page); |
| 273 | page_cache_release(page); | 273 | page_cache_release(page); |
| 274 | } | 274 | } |
| 275 | ret = 0; | 275 | ret = 0; |
| 276 | out: | 276 | out: |
| 277 | mutex_unlock(&mapping->host->i_mutex); | 277 | mutex_unlock(&mapping->host->i_mutex); |
| 278 | return ret; | 278 | return ret; |
| 279 | unlock: | 279 | unlock: |
| 280 | unlock_page(page); | 280 | unlock_page(page); |
| 281 | page_cache_release(page); | 281 | page_cache_release(page); |
| 282 | fail: | 282 | fail: |
| 283 | ret = -1; | 283 | ret = -1; |
| 284 | goto out; | 284 | goto out; |
| 285 | } | 285 | } |
| 286 | 286 | ||
| 287 | /** | 287 | /** |
| 288 | * __do_lo_send_write - helper for writing data to a loop device | 288 | * __do_lo_send_write - helper for writing data to a loop device |
| 289 | * | 289 | * |
| 290 | * This helper just factors out common code between do_lo_send_direct_write() | 290 | * This helper just factors out common code between do_lo_send_direct_write() |
| 291 | * and do_lo_send_write(). | 291 | * and do_lo_send_write(). |
| 292 | */ | 292 | */ |
| 293 | static int __do_lo_send_write(struct file *file, | 293 | static int __do_lo_send_write(struct file *file, |
| 294 | u8 *buf, const int len, loff_t pos) | 294 | u8 *buf, const int len, loff_t pos) |
| 295 | { | 295 | { |
| 296 | ssize_t bw; | 296 | ssize_t bw; |
| 297 | mm_segment_t old_fs = get_fs(); | 297 | mm_segment_t old_fs = get_fs(); |
| 298 | 298 | ||
| 299 | set_fs(get_ds()); | 299 | set_fs(get_ds()); |
| 300 | bw = file->f_op->write(file, buf, len, &pos); | 300 | bw = file->f_op->write(file, buf, len, &pos); |
| 301 | set_fs(old_fs); | 301 | set_fs(old_fs); |
| 302 | if (likely(bw == len)) | 302 | if (likely(bw == len)) |
| 303 | return 0; | 303 | return 0; |
| 304 | printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n", | 304 | printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n", |
| 305 | (unsigned long long)pos, len); | 305 | (unsigned long long)pos, len); |
| 306 | if (bw >= 0) | 306 | if (bw >= 0) |
| 307 | bw = -EIO; | 307 | bw = -EIO; |
| 308 | return bw; | 308 | return bw; |
| 309 | } | 309 | } |
| 310 | 310 | ||
| 311 | /** | 311 | /** |
| 312 | * do_lo_send_direct_write - helper for writing data to a loop device | 312 | * do_lo_send_direct_write - helper for writing data to a loop device |
| 313 | * | 313 | * |
| 314 | * This is the fast, non-transforming version for backing filesystems which do | 314 | * This is the fast, non-transforming version for backing filesystems which do |
| 315 | * not implement the address space operations prepare_write and commit_write. | 315 | * not implement the address space operations prepare_write and commit_write. |
| 316 | * It uses the write file operation which should be present on all writeable | 316 | * It uses the write file operation which should be present on all writeable |
| 317 | * filesystems. | 317 | * filesystems. |
| 318 | */ | 318 | */ |
| 319 | static int do_lo_send_direct_write(struct loop_device *lo, | 319 | static int do_lo_send_direct_write(struct loop_device *lo, |
| 320 | struct bio_vec *bvec, int bsize, loff_t pos, struct page *page) | 320 | struct bio_vec *bvec, int bsize, loff_t pos, struct page *page) |
| 321 | { | 321 | { |
| 322 | ssize_t bw = __do_lo_send_write(lo->lo_backing_file, | 322 | ssize_t bw = __do_lo_send_write(lo->lo_backing_file, |
| 323 | kmap(bvec->bv_page) + bvec->bv_offset, | 323 | kmap(bvec->bv_page) + bvec->bv_offset, |
| 324 | bvec->bv_len, pos); | 324 | bvec->bv_len, pos); |
| 325 | kunmap(bvec->bv_page); | 325 | kunmap(bvec->bv_page); |
| 326 | cond_resched(); | 326 | cond_resched(); |
| 327 | return bw; | 327 | return bw; |
| 328 | } | 328 | } |
| 329 | 329 | ||
| 330 | /** | 330 | /** |
| 331 | * do_lo_send_write - helper for writing data to a loop device | 331 | * do_lo_send_write - helper for writing data to a loop device |
| 332 | * | 332 | * |
| 333 | * This is the slow, transforming version for filesystems which do not | 333 | * This is the slow, transforming version for filesystems which do not |
| 334 | * implement the address space operations prepare_write and commit_write. It | 334 | * implement the address space operations prepare_write and commit_write. It |
| 335 | * uses the write file operation which should be present on all writeable | 335 | * uses the write file operation which should be present on all writeable |
| 336 | * filesystems. | 336 | * filesystems. |
| 337 | * | 337 | * |
| 338 | * Using fops->write is slower than using aops->{prepare,commit}_write in the | 338 | * Using fops->write is slower than using aops->{prepare,commit}_write in the |
| 339 | * transforming case because we need to double buffer the data as we cannot do | 339 | * transforming case because we need to double buffer the data as we cannot do |
| 340 | * the transformations in place as we do not have direct access to the | 340 | * the transformations in place as we do not have direct access to the |
| 341 | * destination pages of the backing file. | 341 | * destination pages of the backing file. |
| 342 | */ | 342 | */ |
| 343 | static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec, | 343 | static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec, |
| 344 | int bsize, loff_t pos, struct page *page) | 344 | int bsize, loff_t pos, struct page *page) |
| 345 | { | 345 | { |
| 346 | int ret = lo_do_transfer(lo, WRITE, page, 0, bvec->bv_page, | 346 | int ret = lo_do_transfer(lo, WRITE, page, 0, bvec->bv_page, |
| 347 | bvec->bv_offset, bvec->bv_len, pos >> 9); | 347 | bvec->bv_offset, bvec->bv_len, pos >> 9); |
| 348 | if (likely(!ret)) | 348 | if (likely(!ret)) |
| 349 | return __do_lo_send_write(lo->lo_backing_file, | 349 | return __do_lo_send_write(lo->lo_backing_file, |
| 350 | page_address(page), bvec->bv_len, | 350 | page_address(page), bvec->bv_len, |
| 351 | pos); | 351 | pos); |
| 352 | printk(KERN_ERR "loop: Transfer error at byte offset %llu, " | 352 | printk(KERN_ERR "loop: Transfer error at byte offset %llu, " |
| 353 | "length %i.\n", (unsigned long long)pos, bvec->bv_len); | 353 | "length %i.\n", (unsigned long long)pos, bvec->bv_len); |
| 354 | if (ret > 0) | 354 | if (ret > 0) |
| 355 | ret = -EIO; | 355 | ret = -EIO; |
| 356 | return ret; | 356 | return ret; |
| 357 | } | 357 | } |
| 358 | 358 | ||
| 359 | static int lo_send(struct loop_device *lo, struct bio *bio, int bsize, | 359 | static int lo_send(struct loop_device *lo, struct bio *bio, int bsize, |
| 360 | loff_t pos) | 360 | loff_t pos) |
| 361 | { | 361 | { |
| 362 | int (*do_lo_send)(struct loop_device *, struct bio_vec *, int, loff_t, | 362 | int (*do_lo_send)(struct loop_device *, struct bio_vec *, int, loff_t, |
| 363 | struct page *page); | 363 | struct page *page); |
| 364 | struct bio_vec *bvec; | 364 | struct bio_vec *bvec; |
| 365 | struct page *page = NULL; | 365 | struct page *page = NULL; |
| 366 | int i, ret = 0; | 366 | int i, ret = 0; |
| 367 | 367 | ||
| 368 | do_lo_send = do_lo_send_aops; | 368 | do_lo_send = do_lo_send_aops; |
| 369 | if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) { | 369 | if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) { |
| 370 | do_lo_send = do_lo_send_direct_write; | 370 | do_lo_send = do_lo_send_direct_write; |
| 371 | if (lo->transfer != transfer_none) { | 371 | if (lo->transfer != transfer_none) { |
| 372 | page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); | 372 | page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); |
| 373 | if (unlikely(!page)) | 373 | if (unlikely(!page)) |
| 374 | goto fail; | 374 | goto fail; |
| 375 | kmap(page); | 375 | kmap(page); |
| 376 | do_lo_send = do_lo_send_write; | 376 | do_lo_send = do_lo_send_write; |
| 377 | } | 377 | } |
| 378 | } | 378 | } |
| 379 | bio_for_each_segment(bvec, bio, i) { | 379 | bio_for_each_segment(bvec, bio, i) { |
| 380 | ret = do_lo_send(lo, bvec, bsize, pos, page); | 380 | ret = do_lo_send(lo, bvec, bsize, pos, page); |
| 381 | if (ret < 0) | 381 | if (ret < 0) |
| 382 | break; | 382 | break; |
| 383 | pos += bvec->bv_len; | 383 | pos += bvec->bv_len; |
| 384 | } | 384 | } |
| 385 | if (page) { | 385 | if (page) { |
| 386 | kunmap(page); | 386 | kunmap(page); |
| 387 | __free_page(page); | 387 | __free_page(page); |
| 388 | } | 388 | } |
| 389 | out: | 389 | out: |
| 390 | return ret; | 390 | return ret; |
| 391 | fail: | 391 | fail: |
| 392 | printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n"); | 392 | printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n"); |
| 393 | ret = -ENOMEM; | 393 | ret = -ENOMEM; |
| 394 | goto out; | 394 | goto out; |
| 395 | } | 395 | } |
| 396 | 396 | ||
| 397 | struct lo_read_data { | 397 | struct lo_read_data { |
| 398 | struct loop_device *lo; | 398 | struct loop_device *lo; |
| 399 | struct page *page; | 399 | struct page *page; |
| 400 | unsigned offset; | 400 | unsigned offset; |
| 401 | int bsize; | 401 | int bsize; |
| 402 | }; | 402 | }; |
| 403 | 403 | ||
| 404 | static int | 404 | static int |
| 405 | lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, | 405 | lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, |
| 406 | struct splice_desc *sd) | 406 | struct splice_desc *sd) |
| 407 | { | 407 | { |
| 408 | struct lo_read_data *p = sd->u.data; | 408 | struct lo_read_data *p = sd->u.data; |
| 409 | struct loop_device *lo = p->lo; | 409 | struct loop_device *lo = p->lo; |
| 410 | struct page *page = buf->page; | 410 | struct page *page = buf->page; |
| 411 | sector_t IV; | 411 | sector_t IV; |
| 412 | size_t size; | 412 | size_t size; |
| 413 | int ret; | 413 | int ret; |
| 414 | 414 | ||
| 415 | ret = buf->ops->pin(pipe, buf); | 415 | ret = buf->ops->confirm(pipe, buf); |
| 416 | if (unlikely(ret)) | 416 | if (unlikely(ret)) |
| 417 | return ret; | 417 | return ret; |
| 418 | 418 | ||
| 419 | IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) + | 419 | IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) + |
| 420 | (buf->offset >> 9); | 420 | (buf->offset >> 9); |
| 421 | size = sd->len; | 421 | size = sd->len; |
| 422 | if (size > p->bsize) | 422 | if (size > p->bsize) |
| 423 | size = p->bsize; | 423 | size = p->bsize; |
| 424 | 424 | ||
| 425 | if (lo_do_transfer(lo, READ, page, buf->offset, p->page, p->offset, size, IV)) { | 425 | if (lo_do_transfer(lo, READ, page, buf->offset, p->page, p->offset, size, IV)) { |
| 426 | printk(KERN_ERR "loop: transfer error block %ld\n", | 426 | printk(KERN_ERR "loop: transfer error block %ld\n", |
| 427 | page->index); | 427 | page->index); |
| 428 | size = -EINVAL; | 428 | size = -EINVAL; |
| 429 | } | 429 | } |
| 430 | 430 | ||
| 431 | flush_dcache_page(p->page); | 431 | flush_dcache_page(p->page); |
| 432 | 432 | ||
| 433 | if (size > 0) | 433 | if (size > 0) |
| 434 | p->offset += size; | 434 | p->offset += size; |
| 435 | 435 | ||
| 436 | return size; | 436 | return size; |
| 437 | } | 437 | } |
| 438 | 438 | ||
| 439 | static int | 439 | static int |
| 440 | lo_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) | 440 | lo_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) |
| 441 | { | 441 | { |
| 442 | return __splice_from_pipe(pipe, sd, lo_splice_actor); | 442 | return __splice_from_pipe(pipe, sd, lo_splice_actor); |
| 443 | } | 443 | } |
| 444 | 444 | ||
| 445 | static int | 445 | static int |
| 446 | do_lo_receive(struct loop_device *lo, | 446 | do_lo_receive(struct loop_device *lo, |
| 447 | struct bio_vec *bvec, int bsize, loff_t pos) | 447 | struct bio_vec *bvec, int bsize, loff_t pos) |
| 448 | { | 448 | { |
| 449 | struct lo_read_data cookie; | 449 | struct lo_read_data cookie; |
| 450 | struct splice_desc sd; | 450 | struct splice_desc sd; |
| 451 | struct file *file; | 451 | struct file *file; |
| 452 | long retval; | 452 | long retval; |
| 453 | 453 | ||
| 454 | cookie.lo = lo; | 454 | cookie.lo = lo; |
| 455 | cookie.page = bvec->bv_page; | 455 | cookie.page = bvec->bv_page; |
| 456 | cookie.offset = bvec->bv_offset; | 456 | cookie.offset = bvec->bv_offset; |
| 457 | cookie.bsize = bsize; | 457 | cookie.bsize = bsize; |
| 458 | 458 | ||
| 459 | sd.len = 0; | 459 | sd.len = 0; |
| 460 | sd.total_len = bvec->bv_len; | 460 | sd.total_len = bvec->bv_len; |
| 461 | sd.flags = 0; | 461 | sd.flags = 0; |
| 462 | sd.pos = pos; | 462 | sd.pos = pos; |
| 463 | sd.u.data = &cookie; | 463 | sd.u.data = &cookie; |
| 464 | 464 | ||
| 465 | file = lo->lo_backing_file; | 465 | file = lo->lo_backing_file; |
| 466 | retval = splice_direct_to_actor(file, &sd, lo_direct_splice_actor); | 466 | retval = splice_direct_to_actor(file, &sd, lo_direct_splice_actor); |
| 467 | 467 | ||
| 468 | if (retval < 0) | 468 | if (retval < 0) |
| 469 | return retval; | 469 | return retval; |
| 470 | 470 | ||
| 471 | return 0; | 471 | return 0; |
| 472 | } | 472 | } |
| 473 | 473 | ||
| 474 | static int | 474 | static int |
| 475 | lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) | 475 | lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) |
| 476 | { | 476 | { |
| 477 | struct bio_vec *bvec; | 477 | struct bio_vec *bvec; |
| 478 | int i, ret = 0; | 478 | int i, ret = 0; |
| 479 | 479 | ||
| 480 | bio_for_each_segment(bvec, bio, i) { | 480 | bio_for_each_segment(bvec, bio, i) { |
| 481 | ret = do_lo_receive(lo, bvec, bsize, pos); | 481 | ret = do_lo_receive(lo, bvec, bsize, pos); |
| 482 | if (ret < 0) | 482 | if (ret < 0) |
| 483 | break; | 483 | break; |
| 484 | pos += bvec->bv_len; | 484 | pos += bvec->bv_len; |
| 485 | } | 485 | } |
| 486 | return ret; | 486 | return ret; |
| 487 | } | 487 | } |
| 488 | 488 | ||
| 489 | static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) | 489 | static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) |
| 490 | { | 490 | { |
| 491 | loff_t pos; | 491 | loff_t pos; |
| 492 | int ret; | 492 | int ret; |
| 493 | 493 | ||
| 494 | pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset; | 494 | pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset; |
| 495 | if (bio_rw(bio) == WRITE) | 495 | if (bio_rw(bio) == WRITE) |
| 496 | ret = lo_send(lo, bio, lo->lo_blocksize, pos); | 496 | ret = lo_send(lo, bio, lo->lo_blocksize, pos); |
| 497 | else | 497 | else |
| 498 | ret = lo_receive(lo, bio, lo->lo_blocksize, pos); | 498 | ret = lo_receive(lo, bio, lo->lo_blocksize, pos); |
| 499 | return ret; | 499 | return ret; |
| 500 | } | 500 | } |
| 501 | 501 | ||
| 502 | /* | 502 | /* |
| 503 | * Add bio to back of pending list | 503 | * Add bio to back of pending list |
| 504 | */ | 504 | */ |
| 505 | static void loop_add_bio(struct loop_device *lo, struct bio *bio) | 505 | static void loop_add_bio(struct loop_device *lo, struct bio *bio) |
| 506 | { | 506 | { |
| 507 | if (lo->lo_biotail) { | 507 | if (lo->lo_biotail) { |
| 508 | lo->lo_biotail->bi_next = bio; | 508 | lo->lo_biotail->bi_next = bio; |
| 509 | lo->lo_biotail = bio; | 509 | lo->lo_biotail = bio; |
| 510 | } else | 510 | } else |
| 511 | lo->lo_bio = lo->lo_biotail = bio; | 511 | lo->lo_bio = lo->lo_biotail = bio; |
| 512 | } | 512 | } |
| 513 | 513 | ||
| 514 | /* | 514 | /* |
| 515 | * Grab first pending buffer | 515 | * Grab first pending buffer |
| 516 | */ | 516 | */ |
| 517 | static struct bio *loop_get_bio(struct loop_device *lo) | 517 | static struct bio *loop_get_bio(struct loop_device *lo) |
| 518 | { | 518 | { |
| 519 | struct bio *bio; | 519 | struct bio *bio; |
| 520 | 520 | ||
| 521 | if ((bio = lo->lo_bio)) { | 521 | if ((bio = lo->lo_bio)) { |
| 522 | if (bio == lo->lo_biotail) | 522 | if (bio == lo->lo_biotail) |
| 523 | lo->lo_biotail = NULL; | 523 | lo->lo_biotail = NULL; |
| 524 | lo->lo_bio = bio->bi_next; | 524 | lo->lo_bio = bio->bi_next; |
| 525 | bio->bi_next = NULL; | 525 | bio->bi_next = NULL; |
| 526 | } | 526 | } |
| 527 | 527 | ||
| 528 | return bio; | 528 | return bio; |
| 529 | } | 529 | } |
| 530 | 530 | ||
| 531 | static int loop_make_request(request_queue_t *q, struct bio *old_bio) | 531 | static int loop_make_request(request_queue_t *q, struct bio *old_bio) |
| 532 | { | 532 | { |
| 533 | struct loop_device *lo = q->queuedata; | 533 | struct loop_device *lo = q->queuedata; |
| 534 | int rw = bio_rw(old_bio); | 534 | int rw = bio_rw(old_bio); |
| 535 | 535 | ||
| 536 | if (rw == READA) | 536 | if (rw == READA) |
| 537 | rw = READ; | 537 | rw = READ; |
| 538 | 538 | ||
| 539 | BUG_ON(!lo || (rw != READ && rw != WRITE)); | 539 | BUG_ON(!lo || (rw != READ && rw != WRITE)); |
| 540 | 540 | ||
| 541 | spin_lock_irq(&lo->lo_lock); | 541 | spin_lock_irq(&lo->lo_lock); |
| 542 | if (lo->lo_state != Lo_bound) | 542 | if (lo->lo_state != Lo_bound) |
| 543 | goto out; | 543 | goto out; |
| 544 | if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) | 544 | if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) |
| 545 | goto out; | 545 | goto out; |
| 546 | loop_add_bio(lo, old_bio); | 546 | loop_add_bio(lo, old_bio); |
| 547 | wake_up(&lo->lo_event); | 547 | wake_up(&lo->lo_event); |
| 548 | spin_unlock_irq(&lo->lo_lock); | 548 | spin_unlock_irq(&lo->lo_lock); |
| 549 | return 0; | 549 | return 0; |
| 550 | 550 | ||
| 551 | out: | 551 | out: |
| 552 | spin_unlock_irq(&lo->lo_lock); | 552 | spin_unlock_irq(&lo->lo_lock); |
| 553 | bio_io_error(old_bio, old_bio->bi_size); | 553 | bio_io_error(old_bio, old_bio->bi_size); |
| 554 | return 0; | 554 | return 0; |
| 555 | } | 555 | } |
| 556 | 556 | ||
| 557 | /* | 557 | /* |
| 558 | * kick off io on the underlying address space | 558 | * kick off io on the underlying address space |
| 559 | */ | 559 | */ |
| 560 | static void loop_unplug(request_queue_t *q) | 560 | static void loop_unplug(request_queue_t *q) |
| 561 | { | 561 | { |
| 562 | struct loop_device *lo = q->queuedata; | 562 | struct loop_device *lo = q->queuedata; |
| 563 | 563 | ||
| 564 | clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags); | 564 | clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags); |
| 565 | blk_run_address_space(lo->lo_backing_file->f_mapping); | 565 | blk_run_address_space(lo->lo_backing_file->f_mapping); |
| 566 | } | 566 | } |
| 567 | 567 | ||
| 568 | struct switch_request { | 568 | struct switch_request { |
| 569 | struct file *file; | 569 | struct file *file; |
| 570 | struct completion wait; | 570 | struct completion wait; |
| 571 | }; | 571 | }; |
| 572 | 572 | ||
| 573 | static void do_loop_switch(struct loop_device *, struct switch_request *); | 573 | static void do_loop_switch(struct loop_device *, struct switch_request *); |
| 574 | 574 | ||
| 575 | static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio) | 575 | static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio) |
| 576 | { | 576 | { |
| 577 | if (unlikely(!bio->bi_bdev)) { | 577 | if (unlikely(!bio->bi_bdev)) { |
| 578 | do_loop_switch(lo, bio->bi_private); | 578 | do_loop_switch(lo, bio->bi_private); |
| 579 | bio_put(bio); | 579 | bio_put(bio); |
| 580 | } else { | 580 | } else { |
| 581 | int ret = do_bio_filebacked(lo, bio); | 581 | int ret = do_bio_filebacked(lo, bio); |
| 582 | bio_endio(bio, bio->bi_size, ret); | 582 | bio_endio(bio, bio->bi_size, ret); |
| 583 | } | 583 | } |
| 584 | } | 584 | } |
| 585 | 585 | ||
| 586 | /* | 586 | /* |
| 587 | * worker thread that handles reads/writes to file backed loop devices, | 587 | * worker thread that handles reads/writes to file backed loop devices, |
| 588 | * to avoid blocking in our make_request_fn. it also does loop decrypting | 588 | * to avoid blocking in our make_request_fn. it also does loop decrypting |
| 589 | * on reads for block backed loop, as that is too heavy to do from | 589 | * on reads for block backed loop, as that is too heavy to do from |
| 590 | * b_end_io context where irqs may be disabled. | 590 | * b_end_io context where irqs may be disabled. |
| 591 | * | 591 | * |
| 592 | * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before | 592 | * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before |
| 593 | * calling kthread_stop(). Therefore once kthread_should_stop() is | 593 | * calling kthread_stop(). Therefore once kthread_should_stop() is |
| 594 | * true, make_request will not place any more requests. Therefore | 594 | * true, make_request will not place any more requests. Therefore |
| 595 | * once kthread_should_stop() is true and lo_bio is NULL, we are | 595 | * once kthread_should_stop() is true and lo_bio is NULL, we are |
| 596 | * done with the loop. | 596 | * done with the loop. |
| 597 | */ | 597 | */ |
| 598 | static int loop_thread(void *data) | 598 | static int loop_thread(void *data) |
| 599 | { | 599 | { |
| 600 | struct loop_device *lo = data; | 600 | struct loop_device *lo = data; |
| 601 | struct bio *bio; | 601 | struct bio *bio; |
| 602 | 602 | ||
| 603 | /* | 603 | /* |
| 604 | * loop can be used in an encrypted device, | 604 | * loop can be used in an encrypted device, |
| 605 | * hence, it mustn't be stopped at all | 605 | * hence, it mustn't be stopped at all |
| 606 | * because it could be indirectly used during suspension | 606 | * because it could be indirectly used during suspension |
| 607 | */ | 607 | */ |
| 608 | current->flags |= PF_NOFREEZE; | 608 | current->flags |= PF_NOFREEZE; |
| 609 | 609 | ||
| 610 | set_user_nice(current, -20); | 610 | set_user_nice(current, -20); |
| 611 | 611 | ||
| 612 | while (!kthread_should_stop() || lo->lo_bio) { | 612 | while (!kthread_should_stop() || lo->lo_bio) { |
| 613 | 613 | ||
| 614 | wait_event_interruptible(lo->lo_event, | 614 | wait_event_interruptible(lo->lo_event, |
| 615 | lo->lo_bio || kthread_should_stop()); | 615 | lo->lo_bio || kthread_should_stop()); |
| 616 | 616 | ||
| 617 | if (!lo->lo_bio) | 617 | if (!lo->lo_bio) |
| 618 | continue; | 618 | continue; |
| 619 | spin_lock_irq(&lo->lo_lock); | 619 | spin_lock_irq(&lo->lo_lock); |
| 620 | bio = loop_get_bio(lo); | 620 | bio = loop_get_bio(lo); |
| 621 | spin_unlock_irq(&lo->lo_lock); | 621 | spin_unlock_irq(&lo->lo_lock); |
| 622 | 622 | ||
| 623 | BUG_ON(!bio); | 623 | BUG_ON(!bio); |
| 624 | loop_handle_bio(lo, bio); | 624 | loop_handle_bio(lo, bio); |
| 625 | } | 625 | } |
| 626 | 626 | ||
| 627 | return 0; | 627 | return 0; |
| 628 | } | 628 | } |
| 629 | 629 | ||
| 630 | /* | 630 | /* |
| 631 | * loop_switch performs the hard work of switching a backing store. | 631 | * loop_switch performs the hard work of switching a backing store. |
| 632 | * First it needs to flush existing IO, it does this by sending a magic | 632 | * First it needs to flush existing IO, it does this by sending a magic |
| 633 | * BIO down the pipe. The completion of this BIO does the actual switch. | 633 | * BIO down the pipe. The completion of this BIO does the actual switch. |
| 634 | */ | 634 | */ |
| 635 | static int loop_switch(struct loop_device *lo, struct file *file) | 635 | static int loop_switch(struct loop_device *lo, struct file *file) |
| 636 | { | 636 | { |
| 637 | struct switch_request w; | 637 | struct switch_request w; |
| 638 | struct bio *bio = bio_alloc(GFP_KERNEL, 1); | 638 | struct bio *bio = bio_alloc(GFP_KERNEL, 1); |
| 639 | if (!bio) | 639 | if (!bio) |
| 640 | return -ENOMEM; | 640 | return -ENOMEM; |
| 641 | init_completion(&w.wait); | 641 | init_completion(&w.wait); |
| 642 | w.file = file; | 642 | w.file = file; |
| 643 | bio->bi_private = &w; | 643 | bio->bi_private = &w; |
| 644 | bio->bi_bdev = NULL; | 644 | bio->bi_bdev = NULL; |
| 645 | loop_make_request(lo->lo_queue, bio); | 645 | loop_make_request(lo->lo_queue, bio); |
| 646 | wait_for_completion(&w.wait); | 646 | wait_for_completion(&w.wait); |
| 647 | return 0; | 647 | return 0; |
| 648 | } | 648 | } |
| 649 | 649 | ||
| 650 | /* | 650 | /* |
| 651 | * Do the actual switch; called from the BIO completion routine | 651 | * Do the actual switch; called from the BIO completion routine |
| 652 | */ | 652 | */ |
| 653 | static void do_loop_switch(struct loop_device *lo, struct switch_request *p) | 653 | static void do_loop_switch(struct loop_device *lo, struct switch_request *p) |
| 654 | { | 654 | { |
| 655 | struct file *file = p->file; | 655 | struct file *file = p->file; |
| 656 | struct file *old_file = lo->lo_backing_file; | 656 | struct file *old_file = lo->lo_backing_file; |
| 657 | struct address_space *mapping = file->f_mapping; | 657 | struct address_space *mapping = file->f_mapping; |
| 658 | 658 | ||
| 659 | mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); | 659 | mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); |
| 660 | lo->lo_backing_file = file; | 660 | lo->lo_backing_file = file; |
| 661 | lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? | 661 | lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? |
| 662 | mapping->host->i_bdev->bd_block_size : PAGE_SIZE; | 662 | mapping->host->i_bdev->bd_block_size : PAGE_SIZE; |
| 663 | lo->old_gfp_mask = mapping_gfp_mask(mapping); | 663 | lo->old_gfp_mask = mapping_gfp_mask(mapping); |
| 664 | mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); | 664 | mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); |
| 665 | complete(&p->wait); | 665 | complete(&p->wait); |
| 666 | } | 666 | } |
| 667 | 667 | ||
| 668 | 668 | ||
| 669 | /* | 669 | /* |
| 670 | * loop_change_fd switched the backing store of a loopback device to | 670 | * loop_change_fd switched the backing store of a loopback device to |
| 671 | * a new file. This is useful for operating system installers to free up | 671 | * a new file. This is useful for operating system installers to free up |
| 672 | * the original file and in High Availability environments to switch to | 672 | * the original file and in High Availability environments to switch to |
| 673 | * an alternative location for the content in case of server meltdown. | 673 | * an alternative location for the content in case of server meltdown. |
| 674 | * This can only work if the loop device is used read-only, and if the | 674 | * This can only work if the loop device is used read-only, and if the |
| 675 | * new backing store is the same size and type as the old backing store. | 675 | * new backing store is the same size and type as the old backing store. |
| 676 | */ | 676 | */ |
| 677 | static int loop_change_fd(struct loop_device *lo, struct file *lo_file, | 677 | static int loop_change_fd(struct loop_device *lo, struct file *lo_file, |
| 678 | struct block_device *bdev, unsigned int arg) | 678 | struct block_device *bdev, unsigned int arg) |
| 679 | { | 679 | { |
| 680 | struct file *file, *old_file; | 680 | struct file *file, *old_file; |
| 681 | struct inode *inode; | 681 | struct inode *inode; |
| 682 | int error; | 682 | int error; |
| 683 | 683 | ||
| 684 | error = -ENXIO; | 684 | error = -ENXIO; |
| 685 | if (lo->lo_state != Lo_bound) | 685 | if (lo->lo_state != Lo_bound) |
| 686 | goto out; | 686 | goto out; |
| 687 | 687 | ||
| 688 | /* the loop device has to be read-only */ | 688 | /* the loop device has to be read-only */ |
| 689 | error = -EINVAL; | 689 | error = -EINVAL; |
| 690 | if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) | 690 | if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) |
| 691 | goto out; | 691 | goto out; |
| 692 | 692 | ||
| 693 | error = -EBADF; | 693 | error = -EBADF; |
| 694 | file = fget(arg); | 694 | file = fget(arg); |
| 695 | if (!file) | 695 | if (!file) |
| 696 | goto out; | 696 | goto out; |
| 697 | 697 | ||
| 698 | inode = file->f_mapping->host; | 698 | inode = file->f_mapping->host; |
| 699 | old_file = lo->lo_backing_file; | 699 | old_file = lo->lo_backing_file; |
| 700 | 700 | ||
| 701 | error = -EINVAL; | 701 | error = -EINVAL; |
| 702 | 702 | ||
| 703 | if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) | 703 | if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) |
| 704 | goto out_putf; | 704 | goto out_putf; |
| 705 | 705 | ||
| 706 | /* new backing store needs to support loop (eg splice_read) */ | 706 | /* new backing store needs to support loop (eg splice_read) */ |
| 707 | if (!inode->i_fop->splice_read) | 707 | if (!inode->i_fop->splice_read) |
| 708 | goto out_putf; | 708 | goto out_putf; |
| 709 | 709 | ||
| 710 | /* size of the new backing store needs to be the same */ | 710 | /* size of the new backing store needs to be the same */ |
| 711 | if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) | 711 | if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) |
| 712 | goto out_putf; | 712 | goto out_putf; |
| 713 | 713 | ||
| 714 | /* and ... switch */ | 714 | /* and ... switch */ |
| 715 | error = loop_switch(lo, file); | 715 | error = loop_switch(lo, file); |
| 716 | if (error) | 716 | if (error) |
| 717 | goto out_putf; | 717 | goto out_putf; |
| 718 | 718 | ||
| 719 | fput(old_file); | 719 | fput(old_file); |
| 720 | return 0; | 720 | return 0; |
| 721 | 721 | ||
| 722 | out_putf: | 722 | out_putf: |
| 723 | fput(file); | 723 | fput(file); |
| 724 | out: | 724 | out: |
| 725 | return error; | 725 | return error; |
| 726 | } | 726 | } |
| 727 | 727 | ||
| 728 | static inline int is_loop_device(struct file *file) | 728 | static inline int is_loop_device(struct file *file) |
| 729 | { | 729 | { |
| 730 | struct inode *i = file->f_mapping->host; | 730 | struct inode *i = file->f_mapping->host; |
| 731 | 731 | ||
| 732 | return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR; | 732 | return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR; |
| 733 | } | 733 | } |
| 734 | 734 | ||
| 735 | static int loop_set_fd(struct loop_device *lo, struct file *lo_file, | 735 | static int loop_set_fd(struct loop_device *lo, struct file *lo_file, |
| 736 | struct block_device *bdev, unsigned int arg) | 736 | struct block_device *bdev, unsigned int arg) |
| 737 | { | 737 | { |
| 738 | struct file *file, *f; | 738 | struct file *file, *f; |
| 739 | struct inode *inode; | 739 | struct inode *inode; |
| 740 | struct address_space *mapping; | 740 | struct address_space *mapping; |
| 741 | unsigned lo_blocksize; | 741 | unsigned lo_blocksize; |
| 742 | int lo_flags = 0; | 742 | int lo_flags = 0; |
| 743 | int error; | 743 | int error; |
| 744 | loff_t size; | 744 | loff_t size; |
| 745 | 745 | ||
| 746 | /* This is safe, since we have a reference from open(). */ | 746 | /* This is safe, since we have a reference from open(). */ |
| 747 | __module_get(THIS_MODULE); | 747 | __module_get(THIS_MODULE); |
| 748 | 748 | ||
| 749 | error = -EBADF; | 749 | error = -EBADF; |
| 750 | file = fget(arg); | 750 | file = fget(arg); |
| 751 | if (!file) | 751 | if (!file) |
| 752 | goto out; | 752 | goto out; |
| 753 | 753 | ||
| 754 | error = -EBUSY; | 754 | error = -EBUSY; |
| 755 | if (lo->lo_state != Lo_unbound) | 755 | if (lo->lo_state != Lo_unbound) |
| 756 | goto out_putf; | 756 | goto out_putf; |
| 757 | 757 | ||
| 758 | /* Avoid recursion */ | 758 | /* Avoid recursion */ |
| 759 | f = file; | 759 | f = file; |
| 760 | while (is_loop_device(f)) { | 760 | while (is_loop_device(f)) { |
| 761 | struct loop_device *l; | 761 | struct loop_device *l; |
| 762 | 762 | ||
| 763 | if (f->f_mapping->host->i_rdev == lo_file->f_mapping->host->i_rdev) | 763 | if (f->f_mapping->host->i_rdev == lo_file->f_mapping->host->i_rdev) |
| 764 | goto out_putf; | 764 | goto out_putf; |
| 765 | 765 | ||
| 766 | l = f->f_mapping->host->i_bdev->bd_disk->private_data; | 766 | l = f->f_mapping->host->i_bdev->bd_disk->private_data; |
| 767 | if (l->lo_state == Lo_unbound) { | 767 | if (l->lo_state == Lo_unbound) { |
| 768 | error = -EINVAL; | 768 | error = -EINVAL; |
| 769 | goto out_putf; | 769 | goto out_putf; |
| 770 | } | 770 | } |
| 771 | f = l->lo_backing_file; | 771 | f = l->lo_backing_file; |
| 772 | } | 772 | } |
| 773 | 773 | ||
| 774 | mapping = file->f_mapping; | 774 | mapping = file->f_mapping; |
| 775 | inode = mapping->host; | 775 | inode = mapping->host; |
| 776 | 776 | ||
| 777 | if (!(file->f_mode & FMODE_WRITE)) | 777 | if (!(file->f_mode & FMODE_WRITE)) |
| 778 | lo_flags |= LO_FLAGS_READ_ONLY; | 778 | lo_flags |= LO_FLAGS_READ_ONLY; |
| 779 | 779 | ||
| 780 | error = -EINVAL; | 780 | error = -EINVAL; |
| 781 | if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) { | 781 | if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) { |
| 782 | const struct address_space_operations *aops = mapping->a_ops; | 782 | const struct address_space_operations *aops = mapping->a_ops; |
| 783 | /* | 783 | /* |
| 784 | * If we can't read - sorry. If we only can't write - well, | 784 | * If we can't read - sorry. If we only can't write - well, |
| 785 | * it's going to be read-only. | 785 | * it's going to be read-only. |
| 786 | */ | 786 | */ |
| 787 | if (!file->f_op->splice_read) | 787 | if (!file->f_op->splice_read) |
| 788 | goto out_putf; | 788 | goto out_putf; |
| 789 | if (aops->prepare_write && aops->commit_write) | 789 | if (aops->prepare_write && aops->commit_write) |
| 790 | lo_flags |= LO_FLAGS_USE_AOPS; | 790 | lo_flags |= LO_FLAGS_USE_AOPS; |
| 791 | if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write) | 791 | if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write) |
| 792 | lo_flags |= LO_FLAGS_READ_ONLY; | 792 | lo_flags |= LO_FLAGS_READ_ONLY; |
| 793 | 793 | ||
| 794 | lo_blocksize = S_ISBLK(inode->i_mode) ? | 794 | lo_blocksize = S_ISBLK(inode->i_mode) ? |
| 795 | inode->i_bdev->bd_block_size : PAGE_SIZE; | 795 | inode->i_bdev->bd_block_size : PAGE_SIZE; |
| 796 | 796 | ||
| 797 | error = 0; | 797 | error = 0; |
| 798 | } else { | 798 | } else { |
| 799 | goto out_putf; | 799 | goto out_putf; |
| 800 | } | 800 | } |
| 801 | 801 | ||
| 802 | size = get_loop_size(lo, file); | 802 | size = get_loop_size(lo, file); |
| 803 | 803 | ||
| 804 | if ((loff_t)(sector_t)size != size) { | 804 | if ((loff_t)(sector_t)size != size) { |
| 805 | error = -EFBIG; | 805 | error = -EFBIG; |
| 806 | goto out_putf; | 806 | goto out_putf; |
| 807 | } | 807 | } |
| 808 | 808 | ||
	if (!(lo_file->f_mode & FMODE_WRITE))
		lo_flags |= LO_FLAGS_READ_ONLY;

	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);

	lo->lo_blocksize = lo_blocksize;
	lo->lo_device = bdev;
	lo->lo_flags = lo_flags;
	lo->lo_backing_file = file;
	lo->transfer = transfer_none;
	lo->ioctl = NULL;
	lo->lo_sizelimit = 0;
	lo->old_gfp_mask = mapping_gfp_mask(mapping);
	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));

	lo->lo_bio = lo->lo_biotail = NULL;

	/*
	 * set queue make_request_fn, and add limits based on lower level
	 * device
	 */
	blk_queue_make_request(lo->lo_queue, loop_make_request);
	lo->lo_queue->queuedata = lo;
	lo->lo_queue->unplug_fn = loop_unplug;

	set_capacity(lo->lo_disk, size);
	bd_set_size(bdev, size << 9);

	set_blocksize(bdev, lo_blocksize);

	lo->lo_thread = kthread_create(loop_thread, lo, "loop%d",
						lo->lo_number);
	if (IS_ERR(lo->lo_thread)) {
		error = PTR_ERR(lo->lo_thread);
		goto out_clr;
	}
	lo->lo_state = Lo_bound;
	wake_up_process(lo->lo_thread);
	return 0;

out_clr:
	lo->lo_thread = NULL;
	lo->lo_device = NULL;
	lo->lo_backing_file = NULL;
	lo->lo_flags = 0;
	set_capacity(lo->lo_disk, 0);
	invalidate_bdev(bdev);
	bd_set_size(bdev, 0);
	mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
	lo->lo_state = Lo_unbound;
out_putf:
	fput(file);
out:
	/* This is safe: open() is still holding a reference. */
	module_put(THIS_MODULE);
	return error;
}

static int
loop_release_xfer(struct loop_device *lo)
{
	int err = 0;
	struct loop_func_table *xfer = lo->lo_encryption;

	if (xfer) {
		if (xfer->release)
			err = xfer->release(lo);
		lo->transfer = NULL;
		lo->lo_encryption = NULL;
		module_put(xfer->owner);
	}
	return err;
}

static int
loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
	       const struct loop_info64 *i)
{
	int err = 0;

	if (xfer) {
		struct module *owner = xfer->owner;

		if (!try_module_get(owner))
			return -EINVAL;
		if (xfer->init)
			err = xfer->init(lo, i);
		if (err)
			module_put(owner);
		else
			lo->lo_encryption = xfer;
	}
	return err;
}

static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
{
	struct file *filp = lo->lo_backing_file;
	gfp_t gfp = lo->old_gfp_mask;

	if (lo->lo_state != Lo_bound)
		return -ENXIO;

	if (lo->lo_refcnt > 1)	/* we needed one fd for the ioctl */
		return -EBUSY;

	if (filp == NULL)
		return -EINVAL;

	spin_lock_irq(&lo->lo_lock);
	lo->lo_state = Lo_rundown;
	spin_unlock_irq(&lo->lo_lock);

	kthread_stop(lo->lo_thread);

	lo->lo_backing_file = NULL;

	loop_release_xfer(lo);
	lo->transfer = NULL;
	lo->ioctl = NULL;
	lo->lo_device = NULL;
	lo->lo_encryption = NULL;
	lo->lo_offset = 0;
	lo->lo_sizelimit = 0;
	lo->lo_encrypt_key_size = 0;
	lo->lo_flags = 0;
	lo->lo_thread = NULL;
	memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
	memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
	memset(lo->lo_file_name, 0, LO_NAME_SIZE);
	invalidate_bdev(bdev);
	set_capacity(lo->lo_disk, 0);
	bd_set_size(bdev, 0);
	mapping_set_gfp_mask(filp->f_mapping, gfp);
	lo->lo_state = Lo_unbound;
	fput(filp);
	/* This is safe: open() is still holding a reference. */
	module_put(THIS_MODULE);
	return 0;
}

static int
loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
{
	int err;
	struct loop_func_table *xfer;

	if (lo->lo_encrypt_key_size && lo->lo_key_owner != current->uid &&
	    !capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (lo->lo_state != Lo_bound)
		return -ENXIO;
	if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
		return -EINVAL;

	err = loop_release_xfer(lo);
	if (err)
		return err;

	if (info->lo_encrypt_type) {
		unsigned int type = info->lo_encrypt_type;

		if (type >= MAX_LO_CRYPT)
			return -EINVAL;
		xfer = xfer_funcs[type];
		if (xfer == NULL)
			return -EINVAL;
	} else
		xfer = NULL;

	err = loop_init_xfer(lo, xfer, info);
	if (err)
		return err;

	if (lo->lo_offset != info->lo_offset ||
	    lo->lo_sizelimit != info->lo_sizelimit) {
		lo->lo_offset = info->lo_offset;
		lo->lo_sizelimit = info->lo_sizelimit;
		if (figure_loop_size(lo))
			return -EFBIG;
	}

	memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
	memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE);
	lo->lo_file_name[LO_NAME_SIZE-1] = 0;
	lo->lo_crypt_name[LO_NAME_SIZE-1] = 0;

	if (!xfer)
		xfer = &none_funcs;
	lo->transfer = xfer->transfer;
	lo->ioctl = xfer->ioctl;

	lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
	lo->lo_init[0] = info->lo_init[0];
	lo->lo_init[1] = info->lo_init[1];
	if (info->lo_encrypt_key_size) {
		memcpy(lo->lo_encrypt_key, info->lo_encrypt_key,
		       info->lo_encrypt_key_size);
		lo->lo_key_owner = current->uid;
	}

	return 0;
}

static int
loop_get_status(struct loop_device *lo, struct loop_info64 *info)
{
	struct file *file = lo->lo_backing_file;
	struct kstat stat;
	int error;

	if (lo->lo_state != Lo_bound)
		return -ENXIO;
	error = vfs_getattr(file->f_path.mnt, file->f_path.dentry, &stat);
	if (error)
		return error;
	memset(info, 0, sizeof(*info));
	info->lo_number = lo->lo_number;
	info->lo_device = huge_encode_dev(stat.dev);
	info->lo_inode = stat.ino;
	info->lo_rdevice = huge_encode_dev(lo->lo_device ? stat.rdev : stat.dev);
	info->lo_offset = lo->lo_offset;
	info->lo_sizelimit = lo->lo_sizelimit;
	info->lo_flags = lo->lo_flags;
	memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE);
	memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE);
	info->lo_encrypt_type =
		lo->lo_encryption ? lo->lo_encryption->number : 0;
	if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) {
		info->lo_encrypt_key_size = lo->lo_encrypt_key_size;
		memcpy(info->lo_encrypt_key, lo->lo_encrypt_key,
		       lo->lo_encrypt_key_size);
	}
	return 0;
}

static void
loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64)
{
	memset(info64, 0, sizeof(*info64));
	info64->lo_number = info->lo_number;
	info64->lo_device = info->lo_device;
	info64->lo_inode = info->lo_inode;
	info64->lo_rdevice = info->lo_rdevice;
	info64->lo_offset = info->lo_offset;
	info64->lo_sizelimit = 0;
	info64->lo_encrypt_type = info->lo_encrypt_type;
	info64->lo_encrypt_key_size = info->lo_encrypt_key_size;
	info64->lo_flags = info->lo_flags;
	info64->lo_init[0] = info->lo_init[0];
	info64->lo_init[1] = info->lo_init[1];
	if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
		memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE);
	else
		memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);
	memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE);
}

static int
loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info)
{
	memset(info, 0, sizeof(*info));
	info->lo_number = info64->lo_number;
	info->lo_device = info64->lo_device;
	info->lo_inode = info64->lo_inode;
	info->lo_rdevice = info64->lo_rdevice;
	info->lo_offset = info64->lo_offset;
	info->lo_encrypt_type = info64->lo_encrypt_type;
	info->lo_encrypt_key_size = info64->lo_encrypt_key_size;
	info->lo_flags = info64->lo_flags;
	info->lo_init[0] = info64->lo_init[0];
	info->lo_init[1] = info64->lo_init[1];
	if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
		memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
	else
		memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);
	memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);

	/* error in case values were truncated */
	if (info->lo_device != info64->lo_device ||
	    info->lo_rdevice != info64->lo_rdevice ||
	    info->lo_inode != info64->lo_inode ||
	    info->lo_offset != info64->lo_offset)
		return -EOVERFLOW;

	return 0;
}

static int
loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg)
{
	struct loop_info info;
	struct loop_info64 info64;

	if (copy_from_user(&info, arg, sizeof (struct loop_info)))
		return -EFAULT;
	loop_info64_from_old(&info, &info64);
	return loop_set_status(lo, &info64);
}

static int
loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg)
{
	struct loop_info64 info64;

	if (copy_from_user(&info64, arg, sizeof (struct loop_info64)))
		return -EFAULT;
	return loop_set_status(lo, &info64);
}

static int
loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) {
	struct loop_info info;
	struct loop_info64 info64;
	int err = 0;

	if (!arg)
		err = -EINVAL;
	if (!err)
		err = loop_get_status(lo, &info64);
	if (!err)
		err = loop_info64_to_old(&info64, &info);
	if (!err && copy_to_user(arg, &info, sizeof(info)))
		err = -EFAULT;

	return err;
}

static int
loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
	struct loop_info64 info64;
	int err = 0;

	if (!arg)
		err = -EINVAL;
	if (!err)
		err = loop_get_status(lo, &info64);
	if (!err && copy_to_user(arg, &info64, sizeof(info64)))
		err = -EFAULT;

	return err;
}

static int lo_ioctl(struct inode * inode, struct file * file,
	unsigned int cmd, unsigned long arg)
{
	struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
	int err;

	mutex_lock(&lo->lo_ctl_mutex);
	switch (cmd) {
	case LOOP_SET_FD:
		err = loop_set_fd(lo, file, inode->i_bdev, arg);
		break;
	case LOOP_CHANGE_FD:
		err = loop_change_fd(lo, file, inode->i_bdev, arg);
		break;
	case LOOP_CLR_FD:
		err = loop_clr_fd(lo, inode->i_bdev);
		break;
	case LOOP_SET_STATUS:
		err = loop_set_status_old(lo, (struct loop_info __user *) arg);
		break;
	case LOOP_GET_STATUS:
		err = loop_get_status_old(lo, (struct loop_info __user *) arg);
		break;
	case LOOP_SET_STATUS64:
		err = loop_set_status64(lo, (struct loop_info64 __user *) arg);
		break;
	case LOOP_GET_STATUS64:
		err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
		break;
	default:
		err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
	}
	mutex_unlock(&lo->lo_ctl_mutex);
	return err;
}
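The switch above is the driver's entire userspace control surface. As a quick illustration (a sketch, not part of this commit), a minimal C program exercising these paths against /dev/loop0 could look as follows; the device path, backing file name, and the bare-bones error handling are assumptions:

/* Hypothetical usage sketch: drives the LOOP_SET_FD, LOOP_SET_STATUS64
 * and LOOP_CLR_FD cases dispatched by lo_ioctl() above. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/loop.h>

int main(void)
{
	int loop_fd = open("/dev/loop0", O_RDWR);	/* device node assumed to exist */
	int file_fd = open("backing.img", O_RDWR);	/* backing file is an assumption */
	struct loop_info64 info;

	if (loop_fd < 0 || file_fd < 0)
		return 1;
	if (ioctl(loop_fd, LOOP_SET_FD, file_fd) < 0)	/* -> loop_set_fd() */
		return 1;

	memset(&info, 0, sizeof(info));
	strncpy((char *)info.lo_file_name, "backing.img", LO_NAME_SIZE - 1);
	ioctl(loop_fd, LOOP_SET_STATUS64, &info);	/* -> loop_set_status64() */

	ioctl(loop_fd, LOOP_CLR_FD, 0);			/* -> loop_clr_fd() */
	close(file_fd);
	close(loop_fd);
	return 0;
}

Note the -EBUSY rule in loop_clr_fd(): the fd used for the ioctl accounts for one reference, so detach only succeeds when nobody else holds the device open.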
#ifdef CONFIG_COMPAT
struct compat_loop_info {
	compat_int_t	lo_number;	/* ioctl r/o */
	compat_dev_t	lo_device;	/* ioctl r/o */
	compat_ulong_t	lo_inode;	/* ioctl r/o */
	compat_dev_t	lo_rdevice;	/* ioctl r/o */
	compat_int_t	lo_offset;
	compat_int_t	lo_encrypt_type;
	compat_int_t	lo_encrypt_key_size;	/* ioctl w/o */
	compat_int_t	lo_flags;	/* ioctl r/o */
	char		lo_name[LO_NAME_SIZE];
	unsigned char	lo_encrypt_key[LO_KEY_SIZE];	/* ioctl w/o */
	compat_ulong_t	lo_init[2];
	char		reserved[4];
};

/*
 * Transfer 32-bit compatibility structure in userspace to 64-bit loop info
 * - noinlined to reduce stack space usage in main part of driver
 */
static noinline int
loop_info64_from_compat(const struct compat_loop_info __user *arg,
			struct loop_info64 *info64)
{
	struct compat_loop_info info;

	if (copy_from_user(&info, arg, sizeof(info)))
		return -EFAULT;

	memset(info64, 0, sizeof(*info64));
	info64->lo_number = info.lo_number;
	info64->lo_device = info.lo_device;
	info64->lo_inode = info.lo_inode;
	info64->lo_rdevice = info.lo_rdevice;
	info64->lo_offset = info.lo_offset;
	info64->lo_sizelimit = 0;
	info64->lo_encrypt_type = info.lo_encrypt_type;
	info64->lo_encrypt_key_size = info.lo_encrypt_key_size;
	info64->lo_flags = info.lo_flags;
	info64->lo_init[0] = info.lo_init[0];
	info64->lo_init[1] = info.lo_init[1];
	if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
		memcpy(info64->lo_crypt_name, info.lo_name, LO_NAME_SIZE);
	else
		memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE);
	memcpy(info64->lo_encrypt_key, info.lo_encrypt_key, LO_KEY_SIZE);
	return 0;
}

/*
 * Transfer 64-bit loop info to 32-bit compatibility structure in userspace
 * - noinlined to reduce stack space usage in main part of driver
 */
static noinline int
loop_info64_to_compat(const struct loop_info64 *info64,
		      struct compat_loop_info __user *arg)
{
	struct compat_loop_info info;

	memset(&info, 0, sizeof(info));
	info.lo_number = info64->lo_number;
	info.lo_device = info64->lo_device;
	info.lo_inode = info64->lo_inode;
	info.lo_rdevice = info64->lo_rdevice;
	info.lo_offset = info64->lo_offset;
	info.lo_encrypt_type = info64->lo_encrypt_type;
	info.lo_encrypt_key_size = info64->lo_encrypt_key_size;
	info.lo_flags = info64->lo_flags;
	info.lo_init[0] = info64->lo_init[0];
	info.lo_init[1] = info64->lo_init[1];
	if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
		memcpy(info.lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
	else
		memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE);
	memcpy(info.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);

	/* error in case values were truncated */
	if (info.lo_device != info64->lo_device ||
	    info.lo_rdevice != info64->lo_rdevice ||
	    info.lo_inode != info64->lo_inode ||
	    info.lo_offset != info64->lo_offset ||
	    info.lo_init[0] != info64->lo_init[0] ||
	    info.lo_init[1] != info64->lo_init[1])
		return -EOVERFLOW;

	if (copy_to_user(arg, &info, sizeof(info)))
		return -EFAULT;
	return 0;
}

static int
loop_set_status_compat(struct loop_device *lo,
		       const struct compat_loop_info __user *arg)
{
	struct loop_info64 info64;
	int ret;

	ret = loop_info64_from_compat(arg, &info64);
	if (ret < 0)
		return ret;
	return loop_set_status(lo, &info64);
}

static int
loop_get_status_compat(struct loop_device *lo,
		       struct compat_loop_info __user *arg)
{
	struct loop_info64 info64;
	int err = 0;

	if (!arg)
		err = -EINVAL;
	if (!err)
		err = loop_get_status(lo, &info64);
	if (!err)
		err = loop_info64_to_compat(&info64, arg);
	return err;
}

static long lo_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = file->f_path.dentry->d_inode;
	struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
	int err;

	lock_kernel();
	switch(cmd) {
	case LOOP_SET_STATUS:
		mutex_lock(&lo->lo_ctl_mutex);
		err = loop_set_status_compat(
			lo, (const struct compat_loop_info __user *) arg);
		mutex_unlock(&lo->lo_ctl_mutex);
		break;
	case LOOP_GET_STATUS:
		mutex_lock(&lo->lo_ctl_mutex);
		err = loop_get_status_compat(
			lo, (struct compat_loop_info __user *) arg);
		mutex_unlock(&lo->lo_ctl_mutex);
		break;
	case LOOP_CLR_FD:
	case LOOP_GET_STATUS64:
	case LOOP_SET_STATUS64:
		arg = (unsigned long) compat_ptr(arg);
	case LOOP_SET_FD:
	case LOOP_CHANGE_FD:
		err = lo_ioctl(inode, file, cmd, arg);
		break;
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	unlock_kernel();
	return err;
}
#endif
static int lo_open(struct inode *inode, struct file *file)
{
	struct loop_device *lo = inode->i_bdev->bd_disk->private_data;

	mutex_lock(&lo->lo_ctl_mutex);
	lo->lo_refcnt++;
	mutex_unlock(&lo->lo_ctl_mutex);

	return 0;
}

static int lo_release(struct inode *inode, struct file *file)
{
	struct loop_device *lo = inode->i_bdev->bd_disk->private_data;

	mutex_lock(&lo->lo_ctl_mutex);
	--lo->lo_refcnt;
	mutex_unlock(&lo->lo_ctl_mutex);

	return 0;
}

static struct block_device_operations lo_fops = {
	.owner =	THIS_MODULE,
	.open =		lo_open,
	.release =	lo_release,
	.ioctl =	lo_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl =	lo_compat_ioctl,
#endif
};

/*
 * And now the modules code and kernel interface.
 */
static int max_loop;
module_param(max_loop, int, 0);
MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);

int loop_register_transfer(struct loop_func_table *funcs)
{
	unsigned int n = funcs->number;

	if (n >= MAX_LO_CRYPT || xfer_funcs[n])
		return -EINVAL;
	xfer_funcs[n] = funcs;
	return 0;
}

int loop_unregister_transfer(int number)
{
	unsigned int n = number;
	struct loop_device *lo;
	struct loop_func_table *xfer;

	if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL)
		return -EINVAL;

	xfer_funcs[n] = NULL;

	list_for_each_entry(lo, &loop_devices, lo_list) {
		mutex_lock(&lo->lo_ctl_mutex);

		if (lo->lo_encryption == xfer)
			loop_release_xfer(lo);

		mutex_unlock(&lo->lo_ctl_mutex);
	}

	return 0;
}

EXPORT_SYMBOL(loop_register_transfer);
EXPORT_SYMBOL(loop_unregister_transfer);
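These two exports are the hook points for external transfer modules such as cryptoloop, which plug an entry into xfer_funcs[]. A minimal sketch of such a module follows (not part of this commit); the callback signature mirrors the transfer callbacks in this file, and the slot number 7 is a hypothetical free index below MAX_LO_CRYPT:

/* Illustrative sketch of a loop transfer module (2.6-era loop API). */
#include <linux/module.h>
#include <linux/loop.h>

#define DEMO_CRYPT_NR	7	/* hypothetical unused slot < MAX_LO_CRYPT */

static int demo_transfer(struct loop_device *lo, int cmd,
			 struct page *raw_page, unsigned raw_off,
			 struct page *loop_page, unsigned loop_off,
			 int size, sector_t real_block)
{
	/* A real module would en/decrypt between the two pages here;
	 * cmd is READ or WRITE, and real_block allows per-sector IVs. */
	return 0;
}

static struct loop_func_table demo_funcs = {
	.number   = DEMO_CRYPT_NR,
	.transfer = demo_transfer,
	.owner    = THIS_MODULE,	/* pinned via loop_init_xfer() */
};

static int __init demo_init(void)
{
	/* Fails with -EINVAL if the slot is out of range or already taken. */
	return loop_register_transfer(&demo_funcs);
}

static void __exit demo_exit(void)
{
	loop_unregister_transfer(DEMO_CRYPT_NR);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Once registered, a LOOP_SET_STATUS with lo_encrypt_type == DEMO_CRYPT_NR selects this table through loop_set_status() above.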
static struct loop_device *loop_alloc(int i)
{
	struct loop_device *lo;
	struct gendisk *disk;

	lo = kzalloc(sizeof(*lo), GFP_KERNEL);
	if (!lo)
		goto out;

	lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
	if (!lo->lo_queue)
		goto out_free_dev;

	disk = lo->lo_disk = alloc_disk(1);
	if (!disk)
		goto out_free_queue;

	mutex_init(&lo->lo_ctl_mutex);
	lo->lo_number = i;
	lo->lo_thread = NULL;
	init_waitqueue_head(&lo->lo_event);
	spin_lock_init(&lo->lo_lock);
	disk->major = LOOP_MAJOR;
	disk->first_minor = i;
	disk->fops = &lo_fops;
	disk->private_data = lo;
	disk->queue = lo->lo_queue;
	sprintf(disk->disk_name, "loop%d", i);
	return lo;

out_free_queue:
	blk_cleanup_queue(lo->lo_queue);
out_free_dev:
	kfree(lo);
out:
	return NULL;
}

static void loop_free(struct loop_device *lo)
{
	blk_cleanup_queue(lo->lo_queue);
	put_disk(lo->lo_disk);
	list_del(&lo->lo_list);
	kfree(lo);
}

static struct loop_device *loop_init_one(int i)
{
	struct loop_device *lo;

	list_for_each_entry(lo, &loop_devices, lo_list) {
		if (lo->lo_number == i)
			return lo;
	}

	lo = loop_alloc(i);
	if (lo) {
		add_disk(lo->lo_disk);
		list_add_tail(&lo->lo_list, &loop_devices);
	}
	return lo;
}

static void loop_del_one(struct loop_device *lo)
{
	del_gendisk(lo->lo_disk);
	loop_free(lo);
}

static struct kobject *loop_probe(dev_t dev, int *part, void *data)
{
	struct loop_device *lo;
	struct kobject *kobj;

	mutex_lock(&loop_devices_mutex);
	lo = loop_init_one(dev & MINORMASK);
	kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM);
	mutex_unlock(&loop_devices_mutex);

	*part = 0;
	return kobj;
}

static int __init loop_init(void)
{
	int i, nr;
	unsigned long range;
	struct loop_device *lo, *next;

	/*
	 * The loop module can now instantiate the underlying device
	 * structure on demand, provided the corresponding dev node
	 * exists.  However, this does not work well with user space
	 * tools that don't know about the feature.  In order not to
	 * break any existing tool, we do the following:
	 *
	 * (1) if max_loop is specified, create that many upfront, and
	 *     this also becomes a hard limit.
	 * (2) if max_loop is not specified, create 8 loop devices on
	 *     module load; users can extend this by creating dev nodes
	 *     themselves and having the kernel instantiate the actual
	 *     device on demand.
	 */
	if (max_loop > 1UL << MINORBITS)
		return -EINVAL;

	if (max_loop) {
		nr = max_loop;
		range = max_loop;
	} else {
		nr = 8;
		range = 1UL << MINORBITS;
	}

	if (register_blkdev(LOOP_MAJOR, "loop"))
		return -EIO;

	for (i = 0; i < nr; i++) {
		lo = loop_alloc(i);
		if (!lo)
			goto Enomem;
		list_add_tail(&lo->lo_list, &loop_devices);
	}

	/* point of no return */

	list_for_each_entry(lo, &loop_devices, lo_list)
		add_disk(lo->lo_disk);

	blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
			    THIS_MODULE, loop_probe, NULL, NULL);

	printk(KERN_INFO "loop: module loaded\n");
	return 0;

Enomem:
	printk(KERN_INFO "loop: out of memory\n");

	list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
		loop_free(lo);

	unregister_blkdev(LOOP_MAJOR, "loop");
	return -ENOMEM;
}

static void __exit loop_exit(void)
{
	unsigned long range;
	struct loop_device *lo, *next;

	range = max_loop ? max_loop : 1UL << MINORBITS;

	list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
		loop_del_one(lo);

	blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
	if (unregister_blkdev(LOOP_MAJOR, "loop"))
		printk(KERN_WARNING "loop: cannot unregister blkdev\n");
}

module_init(loop_init);
module_exit(loop_exit);

#ifndef MODULE
static int __init max_loop_setup(char *str)
{
	max_loop = simple_strtol(str, NULL, 0);
	return 1;
}

__setup("max_loop=", max_loop_setup);
#endif
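Worth noting: max_loop is honored on both load paths shown above. module_param() makes it a module option (modprobe loop max_loop=16), while the __setup() hook, compiled only when the driver is built in, parses the same max_loop=16 string from the kernel command line.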
fs/nfsd/vfs.c
| 1 | #define MSNFS /* HACK HACK */ | 1 | #define MSNFS /* HACK HACK */ |
| 2 | /* | 2 | /* |
| 3 | * linux/fs/nfsd/vfs.c | 3 | * linux/fs/nfsd/vfs.c |
| 4 | * | 4 | * |
| 5 | * File operations used by nfsd. Some of these have been ripped from | 5 | * File operations used by nfsd. Some of these have been ripped from |
| 6 | * other parts of the kernel because they weren't exported, others | 6 | * other parts of the kernel because they weren't exported, others |
| 7 | * are partial duplicates with added or changed functionality. | 7 | * are partial duplicates with added or changed functionality. |
| 8 | * | 8 | * |
| 9 | * Note that several functions dget() the dentry upon which they want | 9 | * Note that several functions dget() the dentry upon which they want |
| 10 | * to act, most notably those that create directory entries. Response | 10 | * to act, most notably those that create directory entries. Response |
| 11 | * dentry's are dput()'d if necessary in the release callback. | 11 | * dentry's are dput()'d if necessary in the release callback. |
| 12 | * So if you notice code paths that apparently fail to dput() the | 12 | * So if you notice code paths that apparently fail to dput() the |
| 13 | * dentry, don't worry--they have been taken care of. | 13 | * dentry, don't worry--they have been taken care of. |
| 14 | * | 14 | * |
| 15 | * Copyright (C) 1995-1999 Olaf Kirch <okir@monad.swb.de> | 15 | * Copyright (C) 1995-1999 Olaf Kirch <okir@monad.swb.de> |
| 16 | * Zerocpy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp> | 16 | * Zerocpy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp> |
| 17 | */ | 17 | */ |
| 18 | 18 | ||
| 19 | #include <linux/string.h> | 19 | #include <linux/string.h> |
| 20 | #include <linux/time.h> | 20 | #include <linux/time.h> |
| 21 | #include <linux/errno.h> | 21 | #include <linux/errno.h> |
| 22 | #include <linux/fs.h> | 22 | #include <linux/fs.h> |
| 23 | #include <linux/file.h> | 23 | #include <linux/file.h> |
| 24 | #include <linux/mount.h> | 24 | #include <linux/mount.h> |
| 25 | #include <linux/major.h> | 25 | #include <linux/major.h> |
| 26 | #include <linux/splice.h> | 26 | #include <linux/splice.h> |
| 27 | #include <linux/proc_fs.h> | 27 | #include <linux/proc_fs.h> |
| 28 | #include <linux/stat.h> | 28 | #include <linux/stat.h> |
| 29 | #include <linux/fcntl.h> | 29 | #include <linux/fcntl.h> |
| 30 | #include <linux/net.h> | 30 | #include <linux/net.h> |
| 31 | #include <linux/unistd.h> | 31 | #include <linux/unistd.h> |
| 32 | #include <linux/slab.h> | 32 | #include <linux/slab.h> |
| 33 | #include <linux/pagemap.h> | 33 | #include <linux/pagemap.h> |
| 34 | #include <linux/in.h> | 34 | #include <linux/in.h> |
| 35 | #include <linux/module.h> | 35 | #include <linux/module.h> |
| 36 | #include <linux/namei.h> | 36 | #include <linux/namei.h> |
| 37 | #include <linux/vfs.h> | 37 | #include <linux/vfs.h> |
| 38 | #include <linux/delay.h> | 38 | #include <linux/delay.h> |
| 39 | #include <linux/sunrpc/svc.h> | 39 | #include <linux/sunrpc/svc.h> |
| 40 | #include <linux/nfsd/nfsd.h> | 40 | #include <linux/nfsd/nfsd.h> |
| 41 | #ifdef CONFIG_NFSD_V3 | 41 | #ifdef CONFIG_NFSD_V3 |
| 42 | #include <linux/nfs3.h> | 42 | #include <linux/nfs3.h> |
| 43 | #include <linux/nfsd/xdr3.h> | 43 | #include <linux/nfsd/xdr3.h> |
| 44 | #endif /* CONFIG_NFSD_V3 */ | 44 | #endif /* CONFIG_NFSD_V3 */ |
| 45 | #include <linux/nfsd/nfsfh.h> | 45 | #include <linux/nfsd/nfsfh.h> |
| 46 | #include <linux/quotaops.h> | 46 | #include <linux/quotaops.h> |
| 47 | #include <linux/fsnotify.h> | 47 | #include <linux/fsnotify.h> |
| 48 | #include <linux/posix_acl.h> | 48 | #include <linux/posix_acl.h> |
| 49 | #include <linux/posix_acl_xattr.h> | 49 | #include <linux/posix_acl_xattr.h> |
| 50 | #include <linux/xattr.h> | 50 | #include <linux/xattr.h> |
| 51 | #ifdef CONFIG_NFSD_V4 | 51 | #ifdef CONFIG_NFSD_V4 |
| 52 | #include <linux/nfs4.h> | 52 | #include <linux/nfs4.h> |
| 53 | #include <linux/nfs4_acl.h> | 53 | #include <linux/nfs4_acl.h> |
| 54 | #include <linux/nfsd_idmap.h> | 54 | #include <linux/nfsd_idmap.h> |
| 55 | #include <linux/security.h> | 55 | #include <linux/security.h> |
| 56 | #endif /* CONFIG_NFSD_V4 */ | 56 | #endif /* CONFIG_NFSD_V4 */ |
| 57 | #include <linux/jhash.h> | 57 | #include <linux/jhash.h> |
| 58 | 58 | ||
| 59 | #include <asm/uaccess.h> | 59 | #include <asm/uaccess.h> |
| 60 | 60 | ||
| 61 | #define NFSDDBG_FACILITY NFSDDBG_FILEOP | 61 | #define NFSDDBG_FACILITY NFSDDBG_FILEOP |
| 62 | 62 | ||
| 63 | 63 | ||
| 64 | /* We must ignore files (but only files) which might have mandatory | 64 | /* We must ignore files (but only files) which might have mandatory |
| 65 | * locks on them because there is no way to know if the accesser has | 65 | * locks on them because there is no way to know if the accesser has |
| 66 | * the lock. | 66 | * the lock. |
| 67 | */ | 67 | */ |
| 68 | #define IS_ISMNDLK(i) (S_ISREG((i)->i_mode) && MANDATORY_LOCK(i)) | 68 | #define IS_ISMNDLK(i) (S_ISREG((i)->i_mode) && MANDATORY_LOCK(i)) |
| 69 | 69 | ||
| 70 | /* | 70 | /* |
| 71 | * This is a cache of readahead params that help us choose the proper | 71 | * This is a cache of readahead params that help us choose the proper |
| 72 | * readahead strategy. Initially, we set all readahead parameters to 0 | 72 | * readahead strategy. Initially, we set all readahead parameters to 0 |
| 73 | * and let the VFS handle things. | 73 | * and let the VFS handle things. |
| 74 | * If you increase the number of cached files very much, you'll need to | 74 | * If you increase the number of cached files very much, you'll need to |
| 75 | * add a hash table here. | 75 | * add a hash table here. |
| 76 | */ | 76 | */ |
| 77 | struct raparms { | 77 | struct raparms { |
| 78 | struct raparms *p_next; | 78 | struct raparms *p_next; |
| 79 | unsigned int p_count; | 79 | unsigned int p_count; |
| 80 | ino_t p_ino; | 80 | ino_t p_ino; |
| 81 | dev_t p_dev; | 81 | dev_t p_dev; |
| 82 | int p_set; | 82 | int p_set; |
| 83 | struct file_ra_state p_ra; | 83 | struct file_ra_state p_ra; |
| 84 | unsigned int p_hindex; | 84 | unsigned int p_hindex; |
| 85 | }; | 85 | }; |
| 86 | 86 | ||
| 87 | struct raparm_hbucket { | 87 | struct raparm_hbucket { |
| 88 | struct raparms *pb_head; | 88 | struct raparms *pb_head; |
| 89 | spinlock_t pb_lock; | 89 | spinlock_t pb_lock; |
| 90 | } ____cacheline_aligned_in_smp; | 90 | } ____cacheline_aligned_in_smp; |
| 91 | 91 | ||
| 92 | static struct raparms * raparml; | 92 | static struct raparms * raparml; |
| 93 | #define RAPARM_HASH_BITS 4 | 93 | #define RAPARM_HASH_BITS 4 |
| 94 | #define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) | 94 | #define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) |
| 95 | #define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) | 95 | #define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) |
| 96 | static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; | 96 | static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; |
| 97 | 97 | ||
| 98 | /* | 98 | /* |
| 99 | * Called from nfsd_lookup and encode_dirent. Check if we have crossed | 99 | * Called from nfsd_lookup and encode_dirent. Check if we have crossed |
| 100 | * a mount point. | 100 | * a mount point. |
| 101 | * Returns -EAGAIN or -ETIMEDOUT leaving *dpp and *expp unchanged, | 101 | * Returns -EAGAIN or -ETIMEDOUT leaving *dpp and *expp unchanged, |
| 102 | * or nfs_ok having possibly changed *dpp and *expp | 102 | * or nfs_ok having possibly changed *dpp and *expp |
| 103 | */ | 103 | */ |
| 104 | int | 104 | int |
| 105 | nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, | 105 | nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, |
| 106 | struct svc_export **expp) | 106 | struct svc_export **expp) |
| 107 | { | 107 | { |
| 108 | struct svc_export *exp = *expp, *exp2 = NULL; | 108 | struct svc_export *exp = *expp, *exp2 = NULL; |
| 109 | struct dentry *dentry = *dpp; | 109 | struct dentry *dentry = *dpp; |
| 110 | struct vfsmount *mnt = mntget(exp->ex_mnt); | 110 | struct vfsmount *mnt = mntget(exp->ex_mnt); |
| 111 | struct dentry *mounts = dget(dentry); | 111 | struct dentry *mounts = dget(dentry); |
| 112 | int err = 0; | 112 | int err = 0; |
| 113 | 113 | ||
| 114 | while (follow_down(&mnt, &mounts) && d_mountpoint(mounts)); | 114 | while (follow_down(&mnt, &mounts) && d_mountpoint(mounts)); |
| 115 | 115 | ||
| 116 | exp2 = exp_get_by_name(exp->ex_client, mnt, mounts, &rqstp->rq_chandle); | 116 | exp2 = exp_get_by_name(exp->ex_client, mnt, mounts, &rqstp->rq_chandle); |
| 117 | if (IS_ERR(exp2)) { | 117 | if (IS_ERR(exp2)) { |
| 118 | err = PTR_ERR(exp2); | 118 | err = PTR_ERR(exp2); |
| 119 | dput(mounts); | 119 | dput(mounts); |
| 120 | mntput(mnt); | 120 | mntput(mnt); |
| 121 | goto out; | 121 | goto out; |
| 122 | } | 122 | } |
| 123 | if (exp2 && ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2))) { | 123 | if (exp2 && ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2))) { |
| 124 | /* successfully crossed mount point */ | 124 | /* successfully crossed mount point */ |
| 125 | exp_put(exp); | 125 | exp_put(exp); |
| 126 | *expp = exp2; | 126 | *expp = exp2; |
| 127 | dput(dentry); | 127 | dput(dentry); |
| 128 | *dpp = mounts; | 128 | *dpp = mounts; |
| 129 | } else { | 129 | } else { |
| 130 | if (exp2) exp_put(exp2); | 130 | if (exp2) exp_put(exp2); |
| 131 | dput(mounts); | 131 | dput(mounts); |
| 132 | } | 132 | } |
| 133 | mntput(mnt); | 133 | mntput(mnt); |
| 134 | out: | 134 | out: |
| 135 | return err; | 135 | return err; |
| 136 | } | 136 | } |
| 137 | 137 | ||
| 138 | /* | 138 | /* |
| 139 | * Look up one component of a pathname. | 139 | * Look up one component of a pathname. |
| 140 | * N.B. After this call _both_ fhp and resfh need an fh_put | 140 | * N.B. After this call _both_ fhp and resfh need an fh_put |
| 141 | * | 141 | * |
| 142 | * If the lookup would cross a mountpoint, and the mounted filesystem | 142 | * If the lookup would cross a mountpoint, and the mounted filesystem |
| 143 | * is exported to the client with NFSEXP_NOHIDE, then the lookup is | 143 | * is exported to the client with NFSEXP_NOHIDE, then the lookup is |
| 144 | * accepted as it stands and the mounted directory is | 144 | * accepted as it stands and the mounted directory is |
| 145 | * returned. Otherwise the covered directory is returned. | 145 | * returned. Otherwise the covered directory is returned. |
| 146 | * NOTE: this mountpoint crossing is not supported properly by all | 146 | * NOTE: this mountpoint crossing is not supported properly by all |
| 147 | * clients and is explicitly disallowed for NFSv3 | 147 | * clients and is explicitly disallowed for NFSv3 |
| 148 | * NeilBrown <neilb@cse.unsw.edu.au> | 148 | * NeilBrown <neilb@cse.unsw.edu.au> |
| 149 | */ | 149 | */ |
| 150 | __be32 | 150 | __be32 |
| 151 | nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, | 151 | nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, |
| 152 | int len, struct svc_fh *resfh) | 152 | int len, struct svc_fh *resfh) |
| 153 | { | 153 | { |
| 154 | struct svc_export *exp; | 154 | struct svc_export *exp; |
| 155 | struct dentry *dparent; | 155 | struct dentry *dparent; |
| 156 | struct dentry *dentry; | 156 | struct dentry *dentry; |
| 157 | __be32 err; | 157 | __be32 err; |
| 158 | int host_err; | 158 | int host_err; |
| 159 | 159 | ||
| 160 | dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); | 160 | dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); |
| 161 | 161 | ||
| 162 | /* Obtain dentry and export. */ | 162 | /* Obtain dentry and export. */ |
| 163 | err = fh_verify(rqstp, fhp, S_IFDIR, MAY_EXEC); | 163 | err = fh_verify(rqstp, fhp, S_IFDIR, MAY_EXEC); |
| 164 | if (err) | 164 | if (err) |
| 165 | return err; | 165 | return err; |
| 166 | 166 | ||
| 167 | dparent = fhp->fh_dentry; | 167 | dparent = fhp->fh_dentry; |
| 168 | exp = fhp->fh_export; | 168 | exp = fhp->fh_export; |
| 169 | exp_get(exp); | 169 | exp_get(exp); |
| 170 | 170 | ||
| 171 | err = nfserr_acces; | 171 | err = nfserr_acces; |
| 172 | 172 | ||
| 173 | /* Lookup the name, but don't follow links */ | 173 | /* Lookup the name, but don't follow links */ |
| 174 | if (isdotent(name, len)) { | 174 | if (isdotent(name, len)) { |
| 175 | if (len==1) | 175 | if (len==1) |
| 176 | dentry = dget(dparent); | 176 | dentry = dget(dparent); |
| 177 | else if (dparent != exp->ex_dentry) { | 177 | else if (dparent != exp->ex_dentry) { |
| 178 | dentry = dget_parent(dparent); | 178 | dentry = dget_parent(dparent); |
| 179 | } else if (!EX_NOHIDE(exp)) | 179 | } else if (!EX_NOHIDE(exp)) |
| 180 | dentry = dget(dparent); /* .. == . just like at / */ | 180 | dentry = dget(dparent); /* .. == . just like at / */ |
| 181 | else { | 181 | else { |
| 182 | /* checking mountpoint crossing is very different when stepping up */ | 182 | /* checking mountpoint crossing is very different when stepping up */ |
| 183 | struct svc_export *exp2 = NULL; | 183 | struct svc_export *exp2 = NULL; |
| 184 | struct dentry *dp; | 184 | struct dentry *dp; |
| 185 | struct vfsmount *mnt = mntget(exp->ex_mnt); | 185 | struct vfsmount *mnt = mntget(exp->ex_mnt); |
| 186 | dentry = dget(dparent); | 186 | dentry = dget(dparent); |
| 187 | while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry)) | 187 | while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry)) |
| 188 | ; | 188 | ; |
| 189 | dp = dget_parent(dentry); | 189 | dp = dget_parent(dentry); |
| 190 | dput(dentry); | 190 | dput(dentry); |
| 191 | dentry = dp; | 191 | dentry = dp; |
| 192 | 192 | ||
| 193 | exp2 = exp_parent(exp->ex_client, mnt, dentry, | 193 | exp2 = exp_parent(exp->ex_client, mnt, dentry, |
| 194 | &rqstp->rq_chandle); | 194 | &rqstp->rq_chandle); |
| 195 | if (IS_ERR(exp2)) { | 195 | if (IS_ERR(exp2)) { |
| 196 | host_err = PTR_ERR(exp2); | 196 | host_err = PTR_ERR(exp2); |
| 197 | dput(dentry); | 197 | dput(dentry); |
| 198 | mntput(mnt); | 198 | mntput(mnt); |
| 199 | goto out_nfserr; | 199 | goto out_nfserr; |
| 200 | } | 200 | } |
| 201 | if (!exp2) { | 201 | if (!exp2) { |
| 202 | dput(dentry); | 202 | dput(dentry); |
| 203 | dentry = dget(dparent); | 203 | dentry = dget(dparent); |
| 204 | } else { | 204 | } else { |
| 205 | exp_put(exp); | 205 | exp_put(exp); |
| 206 | exp = exp2; | 206 | exp = exp2; |
| 207 | } | 207 | } |
| 208 | mntput(mnt); | 208 | mntput(mnt); |
| 209 | } | 209 | } |
| 210 | } else { | 210 | } else { |
| 211 | fh_lock(fhp); | 211 | fh_lock(fhp); |
| 212 | dentry = lookup_one_len(name, dparent, len); | 212 | dentry = lookup_one_len(name, dparent, len); |
| 213 | host_err = PTR_ERR(dentry); | 213 | host_err = PTR_ERR(dentry); |
| 214 | if (IS_ERR(dentry)) | 214 | if (IS_ERR(dentry)) |
| 215 | goto out_nfserr; | 215 | goto out_nfserr; |
| 216 | /* | 216 | /* |
| 217 | * check if we have crossed a mount point ... | 217 | * check if we have crossed a mount point ... |
| 218 | */ | 218 | */ |
| 219 | if (d_mountpoint(dentry)) { | 219 | if (d_mountpoint(dentry)) { |
| 220 | if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { | 220 | if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { |
| 221 | dput(dentry); | 221 | dput(dentry); |
| 222 | goto out_nfserr; | 222 | goto out_nfserr; |
| 223 | } | 223 | } |
| 224 | } | 224 | } |
| 225 | } | 225 | } |
| 226 | /* | 226 | /* |
| 227 | * Note: we compose the file handle now, but as the | 227 | * Note: we compose the file handle now, but as the |
| 228 | * dentry may be negative, it may need to be updated. | 228 | * dentry may be negative, it may need to be updated. |
| 229 | */ | 229 | */ |
| 230 | err = fh_compose(resfh, exp, dentry, fhp); | 230 | err = fh_compose(resfh, exp, dentry, fhp); |
| 231 | if (!err && !dentry->d_inode) | 231 | if (!err && !dentry->d_inode) |
| 232 | err = nfserr_noent; | 232 | err = nfserr_noent; |
| 233 | dput(dentry); | 233 | dput(dentry); |
| 234 | out: | 234 | out: |
| 235 | exp_put(exp); | 235 | exp_put(exp); |
| 236 | return err; | 236 | return err; |
| 237 | 237 | ||
| 238 | out_nfserr: | 238 | out_nfserr: |
| 239 | err = nfserrno(host_err); | 239 | err = nfserrno(host_err); |
| 240 | goto out; | 240 | goto out; |
| 241 | } | 241 | } |
| 242 | 242 | ||
| 243 | /* | 243 | /* |
| 244 | * Set various file attributes. | 244 | * Set various file attributes. |
| 245 | * N.B. After this call fhp needs an fh_put | 245 | * N.B. After this call fhp needs an fh_put |
| 246 | */ | 246 | */ |
| 247 | __be32 | 247 | __be32 |
| 248 | nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, | 248 | nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, |
| 249 | int check_guard, time_t guardtime) | 249 | int check_guard, time_t guardtime) |
| 250 | { | 250 | { |
| 251 | struct dentry *dentry; | 251 | struct dentry *dentry; |
| 252 | struct inode *inode; | 252 | struct inode *inode; |
| 253 | int accmode = MAY_SATTR; | 253 | int accmode = MAY_SATTR; |
| 254 | int ftype = 0; | 254 | int ftype = 0; |
| 255 | int imode; | 255 | int imode; |
| 256 | __be32 err; | 256 | __be32 err; |
| 257 | int host_err; | 257 | int host_err; |
| 258 | int size_change = 0; | 258 | int size_change = 0; |
| 259 | 259 | ||
| 260 | if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) | 260 | if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) |
| 261 | accmode |= MAY_WRITE|MAY_OWNER_OVERRIDE; | 261 | accmode |= MAY_WRITE|MAY_OWNER_OVERRIDE; |
| 262 | if (iap->ia_valid & ATTR_SIZE) | 262 | if (iap->ia_valid & ATTR_SIZE) |
| 263 | ftype = S_IFREG; | 263 | ftype = S_IFREG; |
| 264 | 264 | ||
| 265 | /* Get inode */ | 265 | /* Get inode */ |
| 266 | err = fh_verify(rqstp, fhp, ftype, accmode); | 266 | err = fh_verify(rqstp, fhp, ftype, accmode); |
| 267 | if (err) | 267 | if (err) |
| 268 | goto out; | 268 | goto out; |
| 269 | 269 | ||
| 270 | dentry = fhp->fh_dentry; | 270 | dentry = fhp->fh_dentry; |
| 271 | inode = dentry->d_inode; | 271 | inode = dentry->d_inode; |
| 272 | 272 | ||
| 273 | /* Ignore any mode updates on symlinks */ | 273 | /* Ignore any mode updates on symlinks */ |
| 274 | if (S_ISLNK(inode->i_mode)) | 274 | if (S_ISLNK(inode->i_mode)) |
| 275 | iap->ia_valid &= ~ATTR_MODE; | 275 | iap->ia_valid &= ~ATTR_MODE; |
| 276 | 276 | ||
| 277 | if (!iap->ia_valid) | 277 | if (!iap->ia_valid) |
| 278 | goto out; | 278 | goto out; |
| 279 | 279 | ||
| 280 | /* NFSv2 does not differentiate between "set-[ac]time-to-now" | 280 | /* NFSv2 does not differentiate between "set-[ac]time-to-now" |
| 281 | * which only requires access, and "set-[ac]time-to-X" which | 281 | * which only requires access, and "set-[ac]time-to-X" which |
| 282 | * requires ownership. | 282 | * requires ownership. |
| 283 | * So if it looks like it might be "set both to the same time which | 283 | * So if it looks like it might be "set both to the same time which |
| 284 | * is close to now", and if inode_change_ok fails, then we | 284 | * is close to now", and if inode_change_ok fails, then we |
| 285 | * convert to "set to now" instead of "set to explicit time" | 285 | * convert to "set to now" instead of "set to explicit time" |
| 286 | * | 286 | * |
| 287 | * We only call inode_change_ok as the last test as technically | 287 | * We only call inode_change_ok as the last test as technically |
| 288 | * it is not an interface that we should be using. It is only | 288 | * it is not an interface that we should be using. It is only |
| 289 | * valid if the filesystem does not define its own i_op->setattr. | 289 | * valid if the filesystem does not define its own i_op->setattr. |
| 290 | */ | 290 | */ |
| 291 | #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) | 291 | #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) |
| 292 | #define MAX_TOUCH_TIME_ERROR (30*60) | 292 | #define MAX_TOUCH_TIME_ERROR (30*60) |
| 293 | if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET | 293 | if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET |
| 294 | && iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec | 294 | && iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec |
| 295 | ) { | 295 | ) { |
| 296 | /* Looks probable. Now just make sure time is in the right ballpark. | 296 | /* Looks probable. Now just make sure time is in the right ballpark. |
| 297 | * Solaris, at least, doesn't seem to care what the time request is. | 297 | * Solaris, at least, doesn't seem to care what the time request is. |
| 298 | * We require it be within 30 minutes of now. | 298 | * We require it be within 30 minutes of now. |
| 299 | */ | 299 | */ |
| 300 | time_t delta = iap->ia_atime.tv_sec - get_seconds(); | 300 | time_t delta = iap->ia_atime.tv_sec - get_seconds(); |
| 301 | if (delta<0) delta = -delta; | 301 | if (delta<0) delta = -delta; |
| 302 | if (delta < MAX_TOUCH_TIME_ERROR && | 302 | if (delta < MAX_TOUCH_TIME_ERROR && |
| 303 | inode_change_ok(inode, iap) != 0) { | 303 | inode_change_ok(inode, iap) != 0) { |
| 304 | /* turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME | 304 | /* turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME |
| 305 | * this will cause notify_change to set these times to "now" | 305 | * this will cause notify_change to set these times to "now" |
| 306 | */ | 306 | */ |
| 307 | iap->ia_valid &= ~BOTH_TIME_SET; | 307 | iap->ia_valid &= ~BOTH_TIME_SET; |
| 308 | } | 308 | } |
| 309 | } | 309 | } |
| 310 | 310 | ||
| 311 | /* The size case is special. It changes the file as well as the attributes. */ | 311 | /* The size case is special. It changes the file as well as the attributes. */ |
| 312 | if (iap->ia_valid & ATTR_SIZE) { | 312 | if (iap->ia_valid & ATTR_SIZE) { |
| 313 | if (iap->ia_size < inode->i_size) { | 313 | if (iap->ia_size < inode->i_size) { |
| 314 | err = nfsd_permission(fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE); | 314 | err = nfsd_permission(fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE); |
| 315 | if (err) | 315 | if (err) |
| 316 | goto out; | 316 | goto out; |
| 317 | } | 317 | } |
| 318 | 318 | ||
| 319 | /* | 319 | /* |
| 320 | * If we are changing the size of the file, then | 320 | * If we are changing the size of the file, then |
| 321 | * we need to break all leases. | 321 | * we need to break all leases. |
| 322 | */ | 322 | */ |
| 323 | host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK); | 323 | host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK); |
| 324 | if (host_err == -EWOULDBLOCK) | 324 | if (host_err == -EWOULDBLOCK) |
| 325 | host_err = -ETIMEDOUT; | 325 | host_err = -ETIMEDOUT; |
| 326 | if (host_err) /* ENOMEM or EWOULDBLOCK */ | 326 | if (host_err) /* ENOMEM or EWOULDBLOCK */ |
| 327 | goto out_nfserr; | 327 | goto out_nfserr; |
| 328 | 328 | ||
| 329 | host_err = get_write_access(inode); | 329 | host_err = get_write_access(inode); |
| 330 | if (host_err) | 330 | if (host_err) |
| 331 | goto out_nfserr; | 331 | goto out_nfserr; |
| 332 | 332 | ||
| 333 | size_change = 1; | 333 | size_change = 1; |
| 334 | host_err = locks_verify_truncate(inode, NULL, iap->ia_size); | 334 | host_err = locks_verify_truncate(inode, NULL, iap->ia_size); |
| 335 | if (host_err) { | 335 | if (host_err) { |
| 336 | put_write_access(inode); | 336 | put_write_access(inode); |
| 337 | goto out_nfserr; | 337 | goto out_nfserr; |
| 338 | } | 338 | } |
| 339 | DQUOT_INIT(inode); | 339 | DQUOT_INIT(inode); |
| 340 | } | 340 | } |
| 341 | 341 | ||
| 342 | imode = inode->i_mode; | 342 | imode = inode->i_mode; |
| 343 | if (iap->ia_valid & ATTR_MODE) { | 343 | if (iap->ia_valid & ATTR_MODE) { |
| 344 | iap->ia_mode &= S_IALLUGO; | 344 | iap->ia_mode &= S_IALLUGO; |
| 345 | imode = iap->ia_mode |= (imode & ~S_IALLUGO); | 345 | imode = iap->ia_mode |= (imode & ~S_IALLUGO); |
| 346 | } | 346 | } |
| 347 | 347 | ||
| 348 | /* Revoke setuid/setgid bit on chown/chgrp */ | 348 | /* Revoke setuid/setgid bit on chown/chgrp */ |
| 349 | if ((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) | 349 | if ((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) |
| 350 | iap->ia_valid |= ATTR_KILL_SUID; | 350 | iap->ia_valid |= ATTR_KILL_SUID; |
| 351 | if ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid) | 351 | if ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid) |
| 352 | iap->ia_valid |= ATTR_KILL_SGID; | 352 | iap->ia_valid |= ATTR_KILL_SGID; |
| 353 | 353 | ||
| 354 | /* Change the attributes. */ | 354 | /* Change the attributes. */ |
| 355 | 355 | ||
| 356 | iap->ia_valid |= ATTR_CTIME; | 356 | iap->ia_valid |= ATTR_CTIME; |
| 357 | 357 | ||
| 358 | err = nfserr_notsync; | 358 | err = nfserr_notsync; |
| 359 | if (!check_guard || guardtime == inode->i_ctime.tv_sec) { | 359 | if (!check_guard || guardtime == inode->i_ctime.tv_sec) { |
| 360 | fh_lock(fhp); | 360 | fh_lock(fhp); |
| 361 | host_err = notify_change(dentry, iap); | 361 | host_err = notify_change(dentry, iap); |
| 362 | err = nfserrno(host_err); | 362 | err = nfserrno(host_err); |
| 363 | fh_unlock(fhp); | 363 | fh_unlock(fhp); |
| 364 | } | 364 | } |
| 365 | if (size_change) | 365 | if (size_change) |
| 366 | put_write_access(inode); | 366 | put_write_access(inode); |
| 367 | if (!err) | 367 | if (!err) |
| 368 | if (EX_ISSYNC(fhp->fh_export)) | 368 | if (EX_ISSYNC(fhp->fh_export)) |
| 369 | write_inode_now(inode, 1); | 369 | write_inode_now(inode, 1); |
| 370 | out: | 370 | out: |
| 371 | return err; | 371 | return err; |
| 372 | 372 | ||
| 373 | out_nfserr: | 373 | out_nfserr: |
| 374 | err = nfserrno(host_err); | 374 | err = nfserrno(host_err); |
| 375 | goto out; | 375 | goto out; |
| 376 | } | 376 | } |
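
The "set both times to the same value close to now" heuristic in nfsd_setattr() above is easy to test in isolation. Below is a minimal user-space sketch of just that check, reusing the MAX_TOUCH_TIME_ERROR window from the source; the function name looks_like_touch is an illustrative invention, not anything in nfsd.

    /* Stand-alone sketch of the "close to now" heuristic: a request that
     * sets atime == mtime within 30 minutes of the current time is treated
     * as a plain "touch" when an explicit time-set would be refused. */
    #include <stdio.h>
    #include <time.h>

    #define MAX_TOUCH_TIME_ERROR (30 * 60)   /* same 30-minute ballpark */

    static int looks_like_touch(time_t atime, time_t mtime)
    {
        time_t delta;

        if (atime != mtime)              /* both times must match */
            return 0;
        delta = atime - time(NULL);      /* distance from "now" */
        if (delta < 0)
            delta = -delta;
        return delta < MAX_TOUCH_TIME_ERROR;
    }

    int main(void)
    {
        time_t now = time(NULL);

        printf("now+10s: %d\n", looks_like_touch(now + 10, now + 10));     /* 1 */
        printf("now-2h:  %d\n", looks_like_touch(now - 7200, now - 7200)); /* 0 */
        return 0;
    }
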
| 377 | 377 | ||
| 378 | #if defined(CONFIG_NFSD_V2_ACL) || \ | 378 | #if defined(CONFIG_NFSD_V2_ACL) || \ |
| 379 | defined(CONFIG_NFSD_V3_ACL) || \ | 379 | defined(CONFIG_NFSD_V3_ACL) || \ |
| 380 | defined(CONFIG_NFSD_V4) | 380 | defined(CONFIG_NFSD_V4) |
| 381 | static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) | 381 | static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) |
| 382 | { | 382 | { |
| 383 | ssize_t buflen; | 383 | ssize_t buflen; |
| 384 | 384 | ||
| 385 | buflen = vfs_getxattr(dentry, key, NULL, 0); | 385 | buflen = vfs_getxattr(dentry, key, NULL, 0); |
| 386 | if (buflen <= 0) | 386 | if (buflen <= 0) |
| 387 | return buflen; | 387 | return buflen; |
| 388 | 388 | ||
| 389 | *buf = kmalloc(buflen, GFP_KERNEL); | 389 | *buf = kmalloc(buflen, GFP_KERNEL); |
| 390 | if (!*buf) | 390 | if (!*buf) |
| 391 | return -ENOMEM; | 391 | return -ENOMEM; |
| 392 | 392 | ||
| 393 | return vfs_getxattr(dentry, key, *buf, buflen); | 393 | return vfs_getxattr(dentry, key, *buf, buflen); |
| 394 | } | 394 | } |
| 395 | #endif | 395 | #endif |
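
nfsd_getxattr() above uses a two-pass pattern: a sizing call with a NULL buffer, then an allocation, then the real fetch. The same pattern works verbatim in user space with getxattr(2); here is a hedged, self-contained analog. One caveat the sketch glosses over: if the attribute grows between the two calls, the second getxattr() fails with ERANGE, so a robust caller would retry.

    /* User-space analog of the nfsd_getxattr() two-pass pattern. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/xattr.h>

    static ssize_t read_xattr(const char *path, const char *key, void **buf)
    {
        ssize_t len = getxattr(path, key, NULL, 0);  /* size probe */
        if (len <= 0)
            return len;                              /* absent or error */

        *buf = malloc(len);
        if (!*buf)
            return -1;

        return getxattr(path, key, *buf, len);       /* actual fetch */
    }

    int main(int argc, char **argv)
    {
        void *val = NULL;
        ssize_t n;

        if (argc != 3)
            return 1;
        n = read_xattr(argv[1], argv[2], &val);
        if (n > 0)
            fwrite(val, 1, n, stdout);
        free(val);
        return n < 0;
    }
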
| 396 | 396 | ||
| 397 | #if defined(CONFIG_NFSD_V4) | 397 | #if defined(CONFIG_NFSD_V4) |
| 398 | static int | 398 | static int |
| 399 | set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) | 399 | set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) |
| 400 | { | 400 | { |
| 401 | int len; | 401 | int len; |
| 402 | size_t buflen; | 402 | size_t buflen; |
| 403 | char *buf = NULL; | 403 | char *buf = NULL; |
| 404 | int error = 0; | 404 | int error = 0; |
| 405 | 405 | ||
| 406 | buflen = posix_acl_xattr_size(pacl->a_count); | 406 | buflen = posix_acl_xattr_size(pacl->a_count); |
| 407 | buf = kmalloc(buflen, GFP_KERNEL); | 407 | buf = kmalloc(buflen, GFP_KERNEL); |
| 408 | error = -ENOMEM; | 408 | error = -ENOMEM; |
| 409 | if (buf == NULL) | 409 | if (buf == NULL) |
| 410 | goto out; | 410 | goto out; |
| 411 | 411 | ||
| 412 | len = posix_acl_to_xattr(pacl, buf, buflen); | 412 | len = posix_acl_to_xattr(pacl, buf, buflen); |
| 413 | if (len < 0) { | 413 | if (len < 0) { |
| 414 | error = len; | 414 | error = len; |
| 415 | goto out; | 415 | goto out; |
| 416 | } | 416 | } |
| 417 | 417 | ||
| 418 | error = vfs_setxattr(dentry, key, buf, len, 0); | 418 | error = vfs_setxattr(dentry, key, buf, len, 0); |
| 419 | out: | 419 | out: |
| 420 | kfree(buf); | 420 | kfree(buf); |
| 421 | return error; | 421 | return error; |
| 422 | } | 422 | } |
| 423 | 423 | ||
| 424 | __be32 | 424 | __be32 |
| 425 | nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, | 425 | nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, |
| 426 | struct nfs4_acl *acl) | 426 | struct nfs4_acl *acl) |
| 427 | { | 427 | { |
| 428 | __be32 error; | 428 | __be32 error; |
| 429 | int host_error; | 429 | int host_error; |
| 430 | struct dentry *dentry; | 430 | struct dentry *dentry; |
| 431 | struct inode *inode; | 431 | struct inode *inode; |
| 432 | struct posix_acl *pacl = NULL, *dpacl = NULL; | 432 | struct posix_acl *pacl = NULL, *dpacl = NULL; |
| 433 | unsigned int flags = 0; | 433 | unsigned int flags = 0; |
| 434 | 434 | ||
| 435 | /* Get inode */ | 435 | /* Get inode */ |
| 436 | error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR); | 436 | error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR); |
| 437 | if (error) | 437 | if (error) |
| 438 | goto out; | 438 | goto out; |
| 439 | 439 | ||
| 440 | dentry = fhp->fh_dentry; | 440 | dentry = fhp->fh_dentry; |
| 441 | inode = dentry->d_inode; | 441 | inode = dentry->d_inode; |
| 442 | if (S_ISDIR(inode->i_mode)) | 442 | if (S_ISDIR(inode->i_mode)) |
| 443 | flags = NFS4_ACL_DIR; | 443 | flags = NFS4_ACL_DIR; |
| 444 | 444 | ||
| 445 | host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); | 445 | host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); |
| 446 | if (host_error == -EINVAL) { | 446 | if (host_error == -EINVAL) { |
| 447 | error = nfserr_attrnotsupp; | 447 | error = nfserr_attrnotsupp; |
| 448 | goto out; | 448 | goto out; |
| 449 | } else if (host_error < 0) | 449 | } else if (host_error < 0) |
| 450 | goto out_nfserr; | 450 | goto out_nfserr; |
| 451 | 451 | ||
| 452 | host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); | 452 | host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); |
| 453 | if (host_error < 0) | 453 | if (host_error < 0) |
| 454 | goto out_nfserr; | 454 | goto out_nfserr; |
| 455 | 455 | ||
| 456 | if (S_ISDIR(inode->i_mode)) { | 456 | if (S_ISDIR(inode->i_mode)) { |
| 457 | host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); | 457 | host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); |
| 458 | if (host_error < 0) | 458 | if (host_error < 0) |
| 459 | goto out_nfserr; | 459 | goto out_nfserr; |
| 460 | } | 460 | } |
| 461 | 461 | ||
| 462 | error = nfs_ok; | 462 | error = nfs_ok; |
| 463 | 463 | ||
| 464 | out: | 464 | out: |
| 465 | posix_acl_release(pacl); | 465 | posix_acl_release(pacl); |
| 466 | posix_acl_release(dpacl); | 466 | posix_acl_release(dpacl); |
| 467 | return (error); | 467 | return (error); |
| 468 | out_nfserr: | 468 | out_nfserr: |
| 469 | if (host_error == -EOPNOTSUPP) | 469 | if (host_error == -EOPNOTSUPP) |
| 470 | error = nfserr_attrnotsupp; | 470 | error = nfserr_attrnotsupp; |
| 471 | else | 471 | else |
| 472 | error = nfserrno(host_error); | 472 | error = nfserrno(host_error); |
| 473 | goto out; | 473 | goto out; |
| 474 | } | 474 | } |
| 475 | 475 | ||
| 476 | static struct posix_acl * | 476 | static struct posix_acl * |
| 477 | _get_posix_acl(struct dentry *dentry, char *key) | 477 | _get_posix_acl(struct dentry *dentry, char *key) |
| 478 | { | 478 | { |
| 479 | void *buf = NULL; | 479 | void *buf = NULL; |
| 480 | struct posix_acl *pacl = NULL; | 480 | struct posix_acl *pacl = NULL; |
| 481 | int buflen; | 481 | int buflen; |
| 482 | 482 | ||
| 483 | buflen = nfsd_getxattr(dentry, key, &buf); | 483 | buflen = nfsd_getxattr(dentry, key, &buf); |
| 484 | if (!buflen) | 484 | if (!buflen) |
| 485 | buflen = -ENODATA; | 485 | buflen = -ENODATA; |
| 486 | if (buflen <= 0) | 486 | if (buflen <= 0) |
| 487 | return ERR_PTR(buflen); | 487 | return ERR_PTR(buflen); |
| 488 | 488 | ||
| 489 | pacl = posix_acl_from_xattr(buf, buflen); | 489 | pacl = posix_acl_from_xattr(buf, buflen); |
| 490 | kfree(buf); | 490 | kfree(buf); |
| 491 | return pacl; | 491 | return pacl; |
| 492 | } | 492 | } |
| 493 | 493 | ||
| 494 | int | 494 | int |
| 495 | nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl) | 495 | nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl) |
| 496 | { | 496 | { |
| 497 | struct inode *inode = dentry->d_inode; | 497 | struct inode *inode = dentry->d_inode; |
| 498 | int error = 0; | 498 | int error = 0; |
| 499 | struct posix_acl *pacl = NULL, *dpacl = NULL; | 499 | struct posix_acl *pacl = NULL, *dpacl = NULL; |
| 500 | unsigned int flags = 0; | 500 | unsigned int flags = 0; |
| 501 | 501 | ||
| 502 | pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS); | 502 | pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS); |
| 503 | if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA) | 503 | if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA) |
| 504 | pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); | 504 | pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); |
| 505 | if (IS_ERR(pacl)) { | 505 | if (IS_ERR(pacl)) { |
| 506 | error = PTR_ERR(pacl); | 506 | error = PTR_ERR(pacl); |
| 507 | pacl = NULL; | 507 | pacl = NULL; |
| 508 | goto out; | 508 | goto out; |
| 509 | } | 509 | } |
| 510 | 510 | ||
| 511 | if (S_ISDIR(inode->i_mode)) { | 511 | if (S_ISDIR(inode->i_mode)) { |
| 512 | dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT); | 512 | dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT); |
| 513 | if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA) | 513 | if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA) |
| 514 | dpacl = NULL; | 514 | dpacl = NULL; |
| 515 | else if (IS_ERR(dpacl)) { | 515 | else if (IS_ERR(dpacl)) { |
| 516 | error = PTR_ERR(dpacl); | 516 | error = PTR_ERR(dpacl); |
| 517 | dpacl = NULL; | 517 | dpacl = NULL; |
| 518 | goto out; | 518 | goto out; |
| 519 | } | 519 | } |
| 520 | flags = NFS4_ACL_DIR; | 520 | flags = NFS4_ACL_DIR; |
| 521 | } | 521 | } |
| 522 | 522 | ||
| 523 | *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags); | 523 | *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags); |
| 524 | if (IS_ERR(*acl)) { | 524 | if (IS_ERR(*acl)) { |
| 525 | error = PTR_ERR(*acl); | 525 | error = PTR_ERR(*acl); |
| 526 | *acl = NULL; | 526 | *acl = NULL; |
| 527 | } | 527 | } |
| 528 | out: | 528 | out: |
| 529 | posix_acl_release(pacl); | 529 | posix_acl_release(pacl); |
| 530 | posix_acl_release(dpacl); | 530 | posix_acl_release(dpacl); |
| 531 | return error; | 531 | return error; |
| 532 | } | 532 | } |
| 533 | 533 | ||
| 534 | #endif /* defined(CONFIG_NFS_V4) */ | 534 | #endif /* defined(CONFIG_NFS_V4) */ |
| 535 | 535 | ||
| 536 | #ifdef CONFIG_NFSD_V3 | 536 | #ifdef CONFIG_NFSD_V3 |
| 537 | /* | 537 | /* |
| 538 | * Check server access rights to a file system object | 538 | * Check server access rights to a file system object |
| 539 | */ | 539 | */ |
| 540 | struct accessmap { | 540 | struct accessmap { |
| 541 | u32 access; | 541 | u32 access; |
| 542 | int how; | 542 | int how; |
| 543 | }; | 543 | }; |
| 544 | static struct accessmap nfs3_regaccess[] = { | 544 | static struct accessmap nfs3_regaccess[] = { |
| 545 | { NFS3_ACCESS_READ, MAY_READ }, | 545 | { NFS3_ACCESS_READ, MAY_READ }, |
| 546 | { NFS3_ACCESS_EXECUTE, MAY_EXEC }, | 546 | { NFS3_ACCESS_EXECUTE, MAY_EXEC }, |
| 547 | { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_TRUNC }, | 547 | { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_TRUNC }, |
| 548 | { NFS3_ACCESS_EXTEND, MAY_WRITE }, | 548 | { NFS3_ACCESS_EXTEND, MAY_WRITE }, |
| 549 | 549 | ||
| 550 | { 0, 0 } | 550 | { 0, 0 } |
| 551 | }; | 551 | }; |
| 552 | 552 | ||
| 553 | static struct accessmap nfs3_diraccess[] = { | 553 | static struct accessmap nfs3_diraccess[] = { |
| 554 | { NFS3_ACCESS_READ, MAY_READ }, | 554 | { NFS3_ACCESS_READ, MAY_READ }, |
| 555 | { NFS3_ACCESS_LOOKUP, MAY_EXEC }, | 555 | { NFS3_ACCESS_LOOKUP, MAY_EXEC }, |
| 556 | { NFS3_ACCESS_MODIFY, MAY_EXEC|MAY_WRITE|MAY_TRUNC }, | 556 | { NFS3_ACCESS_MODIFY, MAY_EXEC|MAY_WRITE|MAY_TRUNC }, |
| 557 | { NFS3_ACCESS_EXTEND, MAY_EXEC|MAY_WRITE }, | 557 | { NFS3_ACCESS_EXTEND, MAY_EXEC|MAY_WRITE }, |
| 558 | { NFS3_ACCESS_DELETE, MAY_REMOVE }, | 558 | { NFS3_ACCESS_DELETE, MAY_REMOVE }, |
| 559 | 559 | ||
| 560 | { 0, 0 } | 560 | { 0, 0 } |
| 561 | }; | 561 | }; |
| 562 | 562 | ||
| 563 | static struct accessmap nfs3_anyaccess[] = { | 563 | static struct accessmap nfs3_anyaccess[] = { |
| 564 | /* Some clients, Solaris 2.6 at least, make an access call | 564 | /* Some clients, Solaris 2.6 at least, make an access call |
| 565 | * to the server to check access for things like /dev/null | 565 | * to the server to check access for things like /dev/null |
| 566 | * (which, really, the server doesn't care about). So | 566 | * (which, really, the server doesn't care about). So |
| 567 | * we provide simple access checking for them, looking | 567 | * we provide simple access checking for them, looking |
| 568 | * mainly at mode bits, and we make sure to ignore read-only | 568 | * mainly at mode bits, and we make sure to ignore read-only |
| 569 | * filesystem checks | 569 | * filesystem checks |
| 570 | */ | 570 | */ |
| 571 | { NFS3_ACCESS_READ, MAY_READ }, | 571 | { NFS3_ACCESS_READ, MAY_READ }, |
| 572 | { NFS3_ACCESS_EXECUTE, MAY_EXEC }, | 572 | { NFS3_ACCESS_EXECUTE, MAY_EXEC }, |
| 573 | { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_LOCAL_ACCESS }, | 573 | { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_LOCAL_ACCESS }, |
| 574 | { NFS3_ACCESS_EXTEND, MAY_WRITE|MAY_LOCAL_ACCESS }, | 574 | { NFS3_ACCESS_EXTEND, MAY_WRITE|MAY_LOCAL_ACCESS }, |
| 575 | 575 | ||
| 576 | { 0, 0 } | 576 | { 0, 0 } |
| 577 | }; | 577 | }; |
| 578 | 578 | ||
| 579 | __be32 | 579 | __be32 |
| 580 | nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported) | 580 | nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported) |
| 581 | { | 581 | { |
| 582 | struct accessmap *map; | 582 | struct accessmap *map; |
| 583 | struct svc_export *export; | 583 | struct svc_export *export; |
| 584 | struct dentry *dentry; | 584 | struct dentry *dentry; |
| 585 | u32 query, result = 0, sresult = 0; | 585 | u32 query, result = 0, sresult = 0; |
| 586 | __be32 error; | 586 | __be32 error; |
| 587 | 587 | ||
| 588 | error = fh_verify(rqstp, fhp, 0, MAY_NOP); | 588 | error = fh_verify(rqstp, fhp, 0, MAY_NOP); |
| 589 | if (error) | 589 | if (error) |
| 590 | goto out; | 590 | goto out; |
| 591 | 591 | ||
| 592 | export = fhp->fh_export; | 592 | export = fhp->fh_export; |
| 593 | dentry = fhp->fh_dentry; | 593 | dentry = fhp->fh_dentry; |
| 594 | 594 | ||
| 595 | if (S_ISREG(dentry->d_inode->i_mode)) | 595 | if (S_ISREG(dentry->d_inode->i_mode)) |
| 596 | map = nfs3_regaccess; | 596 | map = nfs3_regaccess; |
| 597 | else if (S_ISDIR(dentry->d_inode->i_mode)) | 597 | else if (S_ISDIR(dentry->d_inode->i_mode)) |
| 598 | map = nfs3_diraccess; | 598 | map = nfs3_diraccess; |
| 599 | else | 599 | else |
| 600 | map = nfs3_anyaccess; | 600 | map = nfs3_anyaccess; |
| 601 | 601 | ||
| 602 | 602 | ||
| 603 | query = *access; | 603 | query = *access; |
| 604 | for (; map->access; map++) { | 604 | for (; map->access; map++) { |
| 605 | if (map->access & query) { | 605 | if (map->access & query) { |
| 606 | __be32 err2; | 606 | __be32 err2; |
| 607 | 607 | ||
| 608 | sresult |= map->access; | 608 | sresult |= map->access; |
| 609 | 609 | ||
| 610 | err2 = nfsd_permission(export, dentry, map->how); | 610 | err2 = nfsd_permission(export, dentry, map->how); |
| 611 | switch (err2) { | 611 | switch (err2) { |
| 612 | case nfs_ok: | 612 | case nfs_ok: |
| 613 | result |= map->access; | 613 | result |= map->access; |
| 614 | break; | 614 | break; |
| 615 | 615 | ||
| 616 | /* the following error codes just mean the access was not allowed, | 616 | /* the following error codes just mean the access was not allowed, |
| 617 | * rather than that an error occurred */ | 617 | * rather than that an error occurred */ |
| 618 | case nfserr_rofs: | 618 | case nfserr_rofs: |
| 619 | case nfserr_acces: | 619 | case nfserr_acces: |
| 620 | case nfserr_perm: | 620 | case nfserr_perm: |
| 621 | /* simply don't "or" in the access bit. */ | 621 | /* simply don't "or" in the access bit. */ |
| 622 | break; | 622 | break; |
| 623 | default: | 623 | default: |
| 624 | error = err2; | 624 | error = err2; |
| 625 | goto out; | 625 | goto out; |
| 626 | } | 626 | } |
| 627 | } | 627 | } |
| 628 | } | 628 | } |
| 629 | *access = result; | 629 | *access = result; |
| 630 | if (supported) | 630 | if (supported) |
| 631 | *supported = sresult; | 631 | *supported = sresult; |
| 632 | 632 | ||
| 633 | out: | 633 | out: |
| 634 | return error; | 634 | return error; |
| 635 | } | 635 | } |
| 636 | #endif /* CONFIG_NFSD_V3 */ | 636 | #endif /* CONFIG_NFSD_V3 */ |
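
The accessmap tables and the walk in nfsd_access() above form a small, reusable idiom: translate protocol-level bits into local permission checks by iterating a {0, 0}-terminated table, recording both which bits were understood (sresult) and which were granted (result). A minimal stand-alone sketch follows; the Q_*/P_* bit values and the check_perm() stub are made up for illustration.

    #include <stdio.h>

    #define Q_READ   0x01            /* hypothetical protocol bits */
    #define Q_WRITE  0x04
    #define P_READ   0x01            /* hypothetical local permission bits */
    #define P_WRITE  0x02

    struct accessmap {
        unsigned int access;          /* protocol bit */
        int how;                      /* local permission bits to check */
    };

    static const struct accessmap regaccess[] = {
        { Q_READ,  P_READ  },
        { Q_WRITE, P_WRITE },
        { 0, 0 }                      /* sentinel terminates the walk */
    };

    /* Pretend permission check: only reads are allowed. */
    static int check_perm(int how)
    {
        return (how & P_WRITE) ? -1 : 0;
    }

    int main(void)
    {
        unsigned int query = Q_READ | Q_WRITE, result = 0, supported = 0;
        const struct accessmap *map;

        for (map = regaccess; map->access; map++) {
            if (!(map->access & query))
                continue;
            supported |= map->access;          /* bit was understood */
            if (check_perm(map->how) == 0)
                result |= map->access;         /* and it is allowed */
        }
        printf("query=%#x result=%#x supported=%#x\n", query, result, supported);
        return 0;
    }
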
| 637 | 637 | ||
| 638 | 638 | ||
| 639 | 639 | ||
| 640 | /* | 640 | /* |
| 641 | * Open an existing file or directory. | 641 | * Open an existing file or directory. |
| 642 | * The access argument indicates the type of open (read/write/lock) | 642 | * The access argument indicates the type of open (read/write/lock) |
| 643 | * N.B. After this call fhp needs an fh_put | 643 | * N.B. After this call fhp needs an fh_put |
| 644 | */ | 644 | */ |
| 645 | __be32 | 645 | __be32 |
| 646 | nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, | 646 | nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, |
| 647 | int access, struct file **filp) | 647 | int access, struct file **filp) |
| 648 | { | 648 | { |
| 649 | struct dentry *dentry; | 649 | struct dentry *dentry; |
| 650 | struct inode *inode; | 650 | struct inode *inode; |
| 651 | int flags = O_RDONLY|O_LARGEFILE; | 651 | int flags = O_RDONLY|O_LARGEFILE; |
| 652 | __be32 err; | 652 | __be32 err; |
| 653 | int host_err; | 653 | int host_err; |
| 654 | 654 | ||
| 655 | /* | 655 | /* |
| 656 | * If we get here, then the client has already done an "open", | 656 | * If we get here, then the client has already done an "open", |
| 657 | * and (hopefully) checked permission - so allow OWNER_OVERRIDE | 657 | * and (hopefully) checked permission - so allow OWNER_OVERRIDE |
| 658 | * in case a chmod has now revoked permission. | 658 | * in case a chmod has now revoked permission. |
| 659 | */ | 659 | */ |
| 660 | err = fh_verify(rqstp, fhp, type, access | MAY_OWNER_OVERRIDE); | 660 | err = fh_verify(rqstp, fhp, type, access | MAY_OWNER_OVERRIDE); |
| 661 | if (err) | 661 | if (err) |
| 662 | goto out; | 662 | goto out; |
| 663 | 663 | ||
| 664 | dentry = fhp->fh_dentry; | 664 | dentry = fhp->fh_dentry; |
| 665 | inode = dentry->d_inode; | 665 | inode = dentry->d_inode; |
| 666 | 666 | ||
| 667 | /* Disallow write access to files with the append-only bit set | 667 | /* Disallow write access to files with the append-only bit set |
| 668 | * or any access when mandatory locking is enabled | 668 | * or any access when mandatory locking is enabled |
| 669 | */ | 669 | */ |
| 670 | err = nfserr_perm; | 670 | err = nfserr_perm; |
| 671 | if (IS_APPEND(inode) && (access & MAY_WRITE)) | 671 | if (IS_APPEND(inode) && (access & MAY_WRITE)) |
| 672 | goto out; | 672 | goto out; |
| 673 | if (IS_ISMNDLK(inode)) | 673 | if (IS_ISMNDLK(inode)) |
| 674 | goto out; | 674 | goto out; |
| 675 | 675 | ||
| 676 | if (!inode->i_fop) | 676 | if (!inode->i_fop) |
| 677 | goto out; | 677 | goto out; |
| 678 | 678 | ||
| 679 | /* | 679 | /* |
| 680 | * Check to see if there are any leases on this file. | 680 | * Check to see if there are any leases on this file. |
| 681 | * This may block while leases are broken. | 681 | * This may block while leases are broken. |
| 682 | */ | 682 | */ |
| 683 | host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0)); | 683 | host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0)); |
| 684 | if (host_err == -EWOULDBLOCK) | 684 | if (host_err == -EWOULDBLOCK) |
| 685 | host_err = -ETIMEDOUT; | 685 | host_err = -ETIMEDOUT; |
| 686 | if (host_err) /* NOMEM or WOULDBLOCK */ | 686 | if (host_err) /* NOMEM or WOULDBLOCK */ |
| 687 | goto out_nfserr; | 687 | goto out_nfserr; |
| 688 | 688 | ||
| 689 | if (access & MAY_WRITE) { | 689 | if (access & MAY_WRITE) { |
| 690 | if (access & MAY_READ) | 690 | if (access & MAY_READ) |
| 691 | flags = O_RDWR|O_LARGEFILE; | 691 | flags = O_RDWR|O_LARGEFILE; |
| 692 | else | 692 | else |
| 693 | flags = O_WRONLY|O_LARGEFILE; | 693 | flags = O_WRONLY|O_LARGEFILE; |
| 694 | 694 | ||
| 695 | DQUOT_INIT(inode); | 695 | DQUOT_INIT(inode); |
| 696 | } | 696 | } |
| 697 | *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_mnt), flags); | 697 | *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_mnt), flags); |
| 698 | if (IS_ERR(*filp)) | 698 | if (IS_ERR(*filp)) |
| 699 | host_err = PTR_ERR(*filp); | 699 | host_err = PTR_ERR(*filp); |
| 700 | out_nfserr: | 700 | out_nfserr: |
| 701 | err = nfserrno(host_err); | 701 | err = nfserrno(host_err); |
| 702 | out: | 702 | out: |
| 703 | return err; | 703 | return err; |
| 704 | } | 704 | } |
| 705 | 705 | ||
| 706 | /* | 706 | /* |
| 707 | * Close a file. | 707 | * Close a file. |
| 708 | */ | 708 | */ |
| 709 | void | 709 | void |
| 710 | nfsd_close(struct file *filp) | 710 | nfsd_close(struct file *filp) |
| 711 | { | 711 | { |
| 712 | fput(filp); | 712 | fput(filp); |
| 713 | } | 713 | } |
| 714 | 714 | ||
| 715 | /* | 715 | /* |
| 716 | * Sync a file | 716 | * Sync a file |
| 717 | * As this calls fsync (not fdatasync) there is no need for a write_inode | 717 | * As this calls fsync (not fdatasync) there is no need for a write_inode |
| 718 | * after it. | 718 | * after it. |
| 719 | */ | 719 | */ |
| 720 | static inline int nfsd_dosync(struct file *filp, struct dentry *dp, | 720 | static inline int nfsd_dosync(struct file *filp, struct dentry *dp, |
| 721 | const struct file_operations *fop) | 721 | const struct file_operations *fop) |
| 722 | { | 722 | { |
| 723 | struct inode *inode = dp->d_inode; | 723 | struct inode *inode = dp->d_inode; |
| 724 | int (*fsync) (struct file *, struct dentry *, int); | 724 | int (*fsync) (struct file *, struct dentry *, int); |
| 725 | int err; | 725 | int err; |
| 726 | 726 | ||
| 727 | err = filemap_fdatawrite(inode->i_mapping); | 727 | err = filemap_fdatawrite(inode->i_mapping); |
| 728 | if (err == 0 && fop && (fsync = fop->fsync)) | 728 | if (err == 0 && fop && (fsync = fop->fsync)) |
| 729 | err = fsync(filp, dp, 0); | 729 | err = fsync(filp, dp, 0); |
| 730 | if (err == 0) | 730 | if (err == 0) |
| 731 | err = filemap_fdatawait(inode->i_mapping); | 731 | err = filemap_fdatawait(inode->i_mapping); |
| 732 | 732 | ||
| 733 | return err; | 733 | return err; |
| 734 | } | 734 | } |
| 735 | 735 | ||
| 736 | 736 | ||
| 737 | static int | 737 | static int |
| 738 | nfsd_sync(struct file *filp) | 738 | nfsd_sync(struct file *filp) |
| 739 | { | 739 | { |
| 740 | int err; | 740 | int err; |
| 741 | struct inode *inode = filp->f_path.dentry->d_inode; | 741 | struct inode *inode = filp->f_path.dentry->d_inode; |
| 742 | dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name); | 742 | dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name); |
| 743 | mutex_lock(&inode->i_mutex); | 743 | mutex_lock(&inode->i_mutex); |
| 744 | err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op); | 744 | err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op); |
| 745 | mutex_unlock(&inode->i_mutex); | 745 | mutex_unlock(&inode->i_mutex); |
| 746 | 746 | ||
| 747 | return err; | 747 | return err; |
| 748 | } | 748 | } |
| 749 | 749 | ||
| 750 | int | 750 | int |
| 751 | nfsd_sync_dir(struct dentry *dp) | 751 | nfsd_sync_dir(struct dentry *dp) |
| 752 | { | 752 | { |
| 753 | return nfsd_dosync(NULL, dp, dp->d_inode->i_fop); | 753 | return nfsd_dosync(NULL, dp, dp->d_inode->i_fop); |
| 754 | } | 754 | } |
| 755 | 755 | ||
| 756 | /* | 756 | /* |
| 757 | * Obtain the readahead parameters for the file | 757 | * Obtain the readahead parameters for the file |
| 758 | * specified by (dev, ino). | 758 | * specified by (dev, ino). |
| 759 | */ | 759 | */ |
| 760 | 760 | ||
| 761 | static inline struct raparms * | 761 | static inline struct raparms * |
| 762 | nfsd_get_raparms(dev_t dev, ino_t ino) | 762 | nfsd_get_raparms(dev_t dev, ino_t ino) |
| 763 | { | 763 | { |
| 764 | struct raparms *ra, **rap, **frap = NULL; | 764 | struct raparms *ra, **rap, **frap = NULL; |
| 765 | int depth = 0; | 765 | int depth = 0; |
| 766 | unsigned int hash; | 766 | unsigned int hash; |
| 767 | struct raparm_hbucket *rab; | 767 | struct raparm_hbucket *rab; |
| 768 | 768 | ||
| 769 | hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; | 769 | hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; |
| 770 | rab = &raparm_hash[hash]; | 770 | rab = &raparm_hash[hash]; |
| 771 | 771 | ||
| 772 | spin_lock(&rab->pb_lock); | 772 | spin_lock(&rab->pb_lock); |
| 773 | for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { | 773 | for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { |
| 774 | if (ra->p_ino == ino && ra->p_dev == dev) | 774 | if (ra->p_ino == ino && ra->p_dev == dev) |
| 775 | goto found; | 775 | goto found; |
| 776 | depth++; | 776 | depth++; |
| 777 | if (ra->p_count == 0) | 777 | if (ra->p_count == 0) |
| 778 | frap = rap; | 778 | frap = rap; |
| 779 | } | 779 | } |
| 780 | depth = nfsdstats.ra_size*11/10; | 780 | depth = nfsdstats.ra_size*11/10; |
| 781 | if (!frap) { | 781 | if (!frap) { |
| 782 | spin_unlock(&rab->pb_lock); | 782 | spin_unlock(&rab->pb_lock); |
| 783 | return NULL; | 783 | return NULL; |
| 784 | } | 784 | } |
| 785 | rap = frap; | 785 | rap = frap; |
| 786 | ra = *frap; | 786 | ra = *frap; |
| 787 | ra->p_dev = dev; | 787 | ra->p_dev = dev; |
| 788 | ra->p_ino = ino; | 788 | ra->p_ino = ino; |
| 789 | ra->p_set = 0; | 789 | ra->p_set = 0; |
| 790 | ra->p_hindex = hash; | 790 | ra->p_hindex = hash; |
| 791 | found: | 791 | found: |
| 792 | if (rap != &rab->pb_head) { | 792 | if (rap != &rab->pb_head) { |
| 793 | *rap = ra->p_next; | 793 | *rap = ra->p_next; |
| 794 | ra->p_next = rab->pb_head; | 794 | ra->p_next = rab->pb_head; |
| 795 | rab->pb_head = ra; | 795 | rab->pb_head = ra; |
| 796 | } | 796 | } |
| 797 | ra->p_count++; | 797 | ra->p_count++; |
| 798 | nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; | 798 | nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; |
| 799 | spin_unlock(&rab->pb_lock); | 799 | spin_unlock(&rab->pb_lock); |
| 800 | return ra; | 800 | return ra; |
| 801 | } | 801 | } |
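
nfsd_get_raparms() above walks its hash bucket through a pointer-to-pointer, remembering the last recyclable entry, and splices the result to the head of the list so hot files stay cheap to find. The following is a reduced user-space sketch of that move-to-front idiom under assumed, simplified types (no locking, no stats); the node layout and keys are illustrative, not the kernel's raparms.

    #include <stdio.h>

    struct node {
        struct node *next;
        int key;
        int count;     /* "in use" refcount; 0 means recyclable */
    };

    static struct node *lookup(struct node **head, int key)
    {
        struct node *n, **np, **free_slot = NULL;

        for (np = head; (n = *np) != NULL; np = &n->next) {
            if (n->key == key)
                goto found;
            if (n->count == 0)
                free_slot = np;        /* remember a recyclable entry */
        }
        if (!free_slot)
            return NULL;               /* bucket full of busy entries */
        np = free_slot;
        n = *np;
        n->key = key;                  /* recycle the free entry */
    found:
        if (np != head) {              /* move-to-front splice */
            *np = n->next;
            n->next = *head;
            *head = n;
        }
        n->count++;
        return n;
    }

    int main(void)
    {
        struct node c = { NULL, 3, 0 }, b = { &c, 2, 1 }, a = { &b, 1, 0 };
        struct node *head = &a;

        lookup(&head, 2);                                 /* hit */
        printf("head key after hit:  %d\n", head->key);   /* 2 */
        lookup(&head, 9);                                 /* miss, recycles */
        printf("head key after miss: %d\n", head->key);   /* 9 */
        return 0;
    }
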
| 802 | 802 | ||
| 803 | /* | 803 | /* |
| 804 | * Grab and keep cached pages associated with a file in the svc_rqst | 804 | * Grab and keep cached pages associated with a file in the svc_rqst |
| 805 | * so that they can be passed to the network sendmsg/sendpage routines | 805 | * so that they can be passed to the network sendmsg/sendpage routines |
| 806 | * directly. They will be released after the sending has completed. | 806 | * directly. They will be released after the sending has completed. |
| 807 | */ | 807 | */ |
| 808 | static int | 808 | static int |
| 809 | nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, | 809 | nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, |
| 810 | struct splice_desc *sd) | 810 | struct splice_desc *sd) |
| 811 | { | 811 | { |
| 812 | struct svc_rqst *rqstp = sd->u.data; | 812 | struct svc_rqst *rqstp = sd->u.data; |
| 813 | struct page **pp = rqstp->rq_respages + rqstp->rq_resused; | 813 | struct page **pp = rqstp->rq_respages + rqstp->rq_resused; |
| 814 | struct page *page = buf->page; | 814 | struct page *page = buf->page; |
| 815 | size_t size; | 815 | size_t size; |
| 816 | int ret; | 816 | int ret; |
| 817 | 817 | ||
| 818 | ret = buf->ops->pin(pipe, buf); | 818 | ret = buf->ops->confirm(pipe, buf); |
| 819 | if (unlikely(ret)) | 819 | if (unlikely(ret)) |
| 820 | return ret; | 820 | return ret; |
| 821 | 821 | ||
| 822 | size = sd->len; | 822 | size = sd->len; |
| 823 | 823 | ||
| 824 | if (rqstp->rq_res.page_len == 0) { | 824 | if (rqstp->rq_res.page_len == 0) { |
| 825 | get_page(page); | 825 | get_page(page); |
| 826 | put_page(*pp); | 826 | put_page(*pp); |
| 827 | *pp = page; | 827 | *pp = page; |
| 828 | rqstp->rq_resused++; | 828 | rqstp->rq_resused++; |
| 829 | rqstp->rq_res.page_base = buf->offset; | 829 | rqstp->rq_res.page_base = buf->offset; |
| 830 | rqstp->rq_res.page_len = size; | 830 | rqstp->rq_res.page_len = size; |
| 831 | } else if (page != pp[-1]) { | 831 | } else if (page != pp[-1]) { |
| 832 | get_page(page); | 832 | get_page(page); |
| 833 | if (*pp) | 833 | if (*pp) |
| 834 | put_page(*pp); | 834 | put_page(*pp); |
| 835 | *pp = page; | 835 | *pp = page; |
| 836 | rqstp->rq_resused++; | 836 | rqstp->rq_resused++; |
| 837 | rqstp->rq_res.page_len += size; | 837 | rqstp->rq_res.page_len += size; |
| 838 | } else | 838 | } else |
| 839 | rqstp->rq_res.page_len += size; | 839 | rqstp->rq_res.page_len += size; |
| 840 | 840 | ||
| 841 | return size; | 841 | return size; |
| 842 | } | 842 | } |
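
The only functional change in this file is in the hunk above: nfsd_splice_actor() now calls buf->ops->confirm() where it previously called ->pin(), and it still propagates any non-zero return before touching the page. A minimal, self-contained C sketch of that calling convention follows; all types here are illustrative stand-ins, not the kernel's pipe structures.

    #include <stdio.h>
    #include <string.h>

    struct buffer;

    struct buf_operations {
        /* 0 on success: the buffer is present and its contents are usable */
        int (*confirm)(struct buffer *buf);
    };

    struct buffer {
        const struct buf_operations *ops;
        char data[32];
        int uptodate;      /* stands in for "page contents are valid" */
    };

    static int demo_confirm(struct buffer *buf)
    {
        return buf->uptodate ? 0 : -5;   /* -EIO if not readable yet */
    }

    static const struct buf_operations demo_ops = { .confirm = demo_confirm };

    static int consume(struct buffer *buf)
    {
        int ret = buf->ops->confirm(buf);   /* same call shape as the hunk */
        if (ret)
            return ret;                     /* propagate, as the actor does */
        printf("payload: %s\n", buf->data);
        return 0;
    }

    int main(void)
    {
        struct buffer buf = { .ops = &demo_ops, .uptodate = 1 };

        strcpy(buf.data, "hello");
        return consume(&buf);
    }
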
| 843 | 843 | ||
| 844 | static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, | 844 | static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, |
| 845 | struct splice_desc *sd) | 845 | struct splice_desc *sd) |
| 846 | { | 846 | { |
| 847 | return __splice_from_pipe(pipe, sd, nfsd_splice_actor); | 847 | return __splice_from_pipe(pipe, sd, nfsd_splice_actor); |
| 848 | } | 848 | } |
| 849 | 849 | ||
| 850 | static __be32 | 850 | static __be32 |
| 851 | nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, | 851 | nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, |
| 852 | loff_t offset, struct kvec *vec, int vlen, unsigned long *count) | 852 | loff_t offset, struct kvec *vec, int vlen, unsigned long *count) |
| 853 | { | 853 | { |
| 854 | struct inode *inode; | 854 | struct inode *inode; |
| 855 | struct raparms *ra; | 855 | struct raparms *ra; |
| 856 | mm_segment_t oldfs; | 856 | mm_segment_t oldfs; |
| 857 | __be32 err; | 857 | __be32 err; |
| 858 | int host_err; | 858 | int host_err; |
| 859 | 859 | ||
| 860 | err = nfserr_perm; | 860 | err = nfserr_perm; |
| 861 | inode = file->f_path.dentry->d_inode; | 861 | inode = file->f_path.dentry->d_inode; |
| 862 | #ifdef MSNFS | 862 | #ifdef MSNFS |
| 863 | if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && | 863 | if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && |
| 864 | (!lock_may_read(inode, offset, *count))) | 864 | (!lock_may_read(inode, offset, *count))) |
| 865 | goto out; | 865 | goto out; |
| 866 | #endif | 866 | #endif |
| 867 | 867 | ||
| 868 | /* Get readahead parameters */ | 868 | /* Get readahead parameters */ |
| 869 | ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); | 869 | ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); |
| 870 | 870 | ||
| 871 | if (ra && ra->p_set) | 871 | if (ra && ra->p_set) |
| 872 | file->f_ra = ra->p_ra; | 872 | file->f_ra = ra->p_ra; |
| 873 | 873 | ||
| 874 | if (file->f_op->splice_read && rqstp->rq_splice_ok) { | 874 | if (file->f_op->splice_read && rqstp->rq_splice_ok) { |
| 875 | struct splice_desc sd = { | 875 | struct splice_desc sd = { |
| 876 | .len = 0, | 876 | .len = 0, |
| 877 | .total_len = *count, | 877 | .total_len = *count, |
| 878 | .pos = offset, | 878 | .pos = offset, |
| 879 | .u.data = rqstp, | 879 | .u.data = rqstp, |
| 880 | }; | 880 | }; |
| 881 | 881 | ||
| 882 | host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); | 882 | host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); |
| 883 | } else { | 883 | } else { |
| 884 | oldfs = get_fs(); | 884 | oldfs = get_fs(); |
| 885 | set_fs(KERNEL_DS); | 885 | set_fs(KERNEL_DS); |
| 886 | host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); | 886 | host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); |
| 887 | set_fs(oldfs); | 887 | set_fs(oldfs); |
| 888 | } | 888 | } |
| 889 | 889 | ||
| 890 | /* Write back readahead params */ | 890 | /* Write back readahead params */ |
| 891 | if (ra) { | 891 | if (ra) { |
| 892 | struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; | 892 | struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; |
| 893 | spin_lock(&rab->pb_lock); | 893 | spin_lock(&rab->pb_lock); |
| 894 | ra->p_ra = file->f_ra; | 894 | ra->p_ra = file->f_ra; |
| 895 | ra->p_set = 1; | 895 | ra->p_set = 1; |
| 896 | ra->p_count--; | 896 | ra->p_count--; |
| 897 | spin_unlock(&rab->pb_lock); | 897 | spin_unlock(&rab->pb_lock); |
| 898 | } | 898 | } |
| 899 | 899 | ||
| 900 | if (host_err >= 0) { | 900 | if (host_err >= 0) { |
| 901 | nfsdstats.io_read += host_err; | 901 | nfsdstats.io_read += host_err; |
| 902 | *count = host_err; | 902 | *count = host_err; |
| 903 | err = 0; | 903 | err = 0; |
| 904 | fsnotify_access(file->f_path.dentry); | 904 | fsnotify_access(file->f_path.dentry); |
| 905 | } else | 905 | } else |
| 906 | err = nfserrno(host_err); | 906 | err = nfserrno(host_err); |
| 907 | out: | 907 | out: |
| 908 | return err; | 908 | return err; |
| 909 | } | 909 | } |
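
The non-splice fallback in nfsd_vfs_read() gathers into a vector with vfs_readv(), flipping the address limit with set_fs(KERNEL_DS) so kernel-side kvecs pass the user-pointer checks. A user-space analog of the scatter read itself is preadv(2): one positional syscall filling several buffers without moving the file offset. A hedged sketch, purely illustrative:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/uio.h>

    int main(int argc, char **argv)
    {
        char head[8], tail[8];
        struct iovec vec[2] = {
            { .iov_base = head, .iov_len = sizeof(head) },
            { .iov_base = tail, .iov_len = sizeof(tail) },
        };
        int fd;
        ssize_t n;

        if (argc != 2)
            return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0)
            return 1;

        /* One syscall fills both buffers starting at offset 0,
         * without moving the file position. */
        n = preadv(fd, vec, 2, 0);
        printf("read %zd bytes\n", n);
        close(fd);
        return 0;
    }
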
| 910 | 910 | ||
| 911 | static void kill_suid(struct dentry *dentry) | 911 | static void kill_suid(struct dentry *dentry) |
| 912 | { | 912 | { |
| 913 | struct iattr ia; | 913 | struct iattr ia; |
| 914 | ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID; | 914 | ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID; |
| 915 | 915 | ||
| 916 | mutex_lock(&dentry->d_inode->i_mutex); | 916 | mutex_lock(&dentry->d_inode->i_mutex); |
| 917 | notify_change(dentry, &ia); | 917 | notify_change(dentry, &ia); |
| 918 | mutex_unlock(&dentry->d_inode->i_mutex); | 918 | mutex_unlock(&dentry->d_inode->i_mutex); |
| 919 | } | 919 | } |
| 920 | 920 | ||
| 921 | static __be32 | 921 | static __be32 |
| 922 | nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, | 922 | nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, |
| 923 | loff_t offset, struct kvec *vec, int vlen, | 923 | loff_t offset, struct kvec *vec, int vlen, |
| 924 | unsigned long cnt, int *stablep) | 924 | unsigned long cnt, int *stablep) |
| 925 | { | 925 | { |
| 926 | struct svc_export *exp; | 926 | struct svc_export *exp; |
| 927 | struct dentry *dentry; | 927 | struct dentry *dentry; |
| 928 | struct inode *inode; | 928 | struct inode *inode; |
| 929 | mm_segment_t oldfs; | 929 | mm_segment_t oldfs; |
| 930 | __be32 err = 0; | 930 | __be32 err = 0; |
| 931 | int host_err; | 931 | int host_err; |
| 932 | int stable = *stablep; | 932 | int stable = *stablep; |
| 933 | 933 | ||
| 934 | #ifdef MSNFS | 934 | #ifdef MSNFS |
| 935 | err = nfserr_perm; | 935 | err = nfserr_perm; |
| 936 | 936 | ||
| 937 | if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && | 937 | if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && |
| 938 | (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) | 938 | (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) |
| 939 | goto out; | 939 | goto out; |
| 940 | #endif | 940 | #endif |
| 941 | 941 | ||
| 942 | dentry = file->f_path.dentry; | 942 | dentry = file->f_path.dentry; |
| 943 | inode = dentry->d_inode; | 943 | inode = dentry->d_inode; |
| 944 | exp = fhp->fh_export; | 944 | exp = fhp->fh_export; |
| 945 | 945 | ||
| 946 | /* | 946 | /* |
| 947 | * Request sync writes if | 947 | * Request sync writes if |
| 948 | * - the sync export option has been set, or | 948 | * - the sync export option has been set, or |
| 949 | * - the client requested O_SYNC behavior (NFSv3 feature), or | 949 | * - the client requested O_SYNC behavior (NFSv3 feature), or |
| 950 | * - the file system doesn't support fsync(). | 950 | * - the file system doesn't support fsync(). |
| 951 | * When gathered writes have been configured for this volume, | 951 | * When gathered writes have been configured for this volume, |
| 952 | * flushing the data to disk is handled separately below. | 952 | * flushing the data to disk is handled separately below. |
| 953 | */ | 953 | */ |
| 954 | 954 | ||
| 955 | if (file->f_op->fsync == 0) {/* COMMIT3 cannot work */ | 955 | if (file->f_op->fsync == 0) {/* COMMIT3 cannot work */ |
| 956 | stable = 2; | 956 | stable = 2; |
| 957 | *stablep = 2; /* FILE_SYNC */ | 957 | *stablep = 2; /* FILE_SYNC */ |
| 958 | } | 958 | } |
| 959 | 959 | ||
| 960 | if (!EX_ISSYNC(exp)) | 960 | if (!EX_ISSYNC(exp)) |
| 961 | stable = 0; | 961 | stable = 0; |
| 962 | if (stable && !EX_WGATHER(exp)) | 962 | if (stable && !EX_WGATHER(exp)) |
| 963 | file->f_flags |= O_SYNC; | 963 | file->f_flags |= O_SYNC; |
| 964 | 964 | ||
| 965 | /* Write the data. */ | 965 | /* Write the data. */ |
| 966 | oldfs = get_fs(); set_fs(KERNEL_DS); | 966 | oldfs = get_fs(); set_fs(KERNEL_DS); |
| 967 | host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); | 967 | host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); |
| 968 | set_fs(oldfs); | 968 | set_fs(oldfs); |
| 969 | if (host_err >= 0) { | 969 | if (host_err >= 0) { |
| 970 | nfsdstats.io_write += cnt; | 970 | nfsdstats.io_write += cnt; |
| 971 | fsnotify_modify(file->f_path.dentry); | 971 | fsnotify_modify(file->f_path.dentry); |
| 972 | } | 972 | } |
| 973 | 973 | ||
| 974 | /* clear setuid/setgid flag after write */ | 974 | /* clear setuid/setgid flag after write */ |
| 975 | if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) | 975 | if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) |
| 976 | kill_suid(dentry); | 976 | kill_suid(dentry); |
| 977 | 977 | ||
| 978 | if (host_err >= 0 && stable) { | 978 | if (host_err >= 0 && stable) { |
| 979 | static ino_t last_ino; | 979 | static ino_t last_ino; |
| 980 | static dev_t last_dev; | 980 | static dev_t last_dev; |
| 981 | 981 | ||
| 982 | /* | 982 | /* |
| 983 | * Gathered writes: If another process is currently | 983 | * Gathered writes: If another process is currently |
| 984 | * writing to the file, there's a high chance | 984 | * writing to the file, there's a high chance |
| 985 | * this is another nfsd (triggered by a bulk write | 985 | * this is another nfsd (triggered by a bulk write |
| 986 | * from a client's biod). Rather than syncing the | 986 | * from a client's biod). Rather than syncing the |
| 987 | * file with each write request, we sleep for 10 msec. | 987 | * file with each write request, we sleep for 10 msec. |
| 988 | * | 988 | * |
| 989 | * I don't know if this roughly approximates | 989 | * I don't know if this roughly approximates |
| 990 | * C. Juszak's idea of gathered writes, but it's a | 990 | * C. Juszak's idea of gathered writes, but it's a |
| 991 | * nice and simple solution (IMHO), and it seems to | 991 | * nice and simple solution (IMHO), and it seems to |
| 992 | * work:-) | 992 | * work:-) |
| 993 | */ | 993 | */ |
| 994 | if (EX_WGATHER(exp)) { | 994 | if (EX_WGATHER(exp)) { |
| 995 | if (atomic_read(&inode->i_writecount) > 1 | 995 | if (atomic_read(&inode->i_writecount) > 1 |
| 996 | || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) { | 996 | || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) { |
| 997 | dprintk("nfsd: write defer %d\n", current->pid); | 997 | dprintk("nfsd: write defer %d\n", current->pid); |
| 998 | msleep(10); | 998 | msleep(10); |
| 999 | dprintk("nfsd: write resume %d\n", current->pid); | 999 | dprintk("nfsd: write resume %d\n", current->pid); |
| 1000 | } | 1000 | } |
| 1001 | 1001 | ||
| 1002 | if (inode->i_state & I_DIRTY) { | 1002 | if (inode->i_state & I_DIRTY) { |
| 1003 | dprintk("nfsd: write sync %d\n", current->pid); | 1003 | dprintk("nfsd: write sync %d\n", current->pid); |
| 1004 | host_err=nfsd_sync(file); | 1004 | host_err=nfsd_sync(file); |
| 1005 | } | 1005 | } |
| 1006 | #if 0 | 1006 | #if 0 |
| 1007 | wake_up(&inode->i_wait); | 1007 | wake_up(&inode->i_wait); |
| 1008 | #endif | 1008 | #endif |
| 1009 | } | 1009 | } |
| 1010 | last_ino = inode->i_ino; | 1010 | last_ino = inode->i_ino; |
| 1011 | last_dev = inode->i_sb->s_dev; | 1011 | last_dev = inode->i_sb->s_dev; |
| 1012 | } | 1012 | } |
| 1013 | 1013 | ||
| 1014 | dprintk("nfsd: write complete host_err=%d\n", host_err); | 1014 | dprintk("nfsd: write complete host_err=%d\n", host_err); |
| 1015 | if (host_err >= 0) | 1015 | if (host_err >= 0) |
| 1016 | err = 0; | 1016 | err = 0; |
| 1017 | else | 1017 | else |
| 1018 | err = nfserrno(host_err); | 1018 | err = nfserrno(host_err); |
| 1019 | out: | 1019 | out: |
| 1020 | return err; | 1020 | return err; |
| 1021 | } | 1021 | } |
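
The write-gathering branch above backs off for 10 ms when another writer is active, then syncs once if the inode is still dirty, so several nfsd writes can share one flush. Below is a deliberately simplified user-space sketch of the same idea, keeping only the back-off-then-flush shape; it has no equivalent of i_writecount or I_DIRTY, and the 10 ms figure simply mirrors the msleep(10) in the code.

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    static int gathered_stable_write(int fd, const void *buf, size_t len,
                                     int maybe_contended)
    {
        ssize_t n = write(fd, buf, len);

        if (n < 0)
            return -1;
        if (maybe_contended)
            usleep(10 * 1000);   /* let other writers join the batch */
        return fsync(fd);        /* one flush covers the whole batch */
    }

    int main(void)
    {
        int fd = open("wgather-demo.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);

        if (fd < 0)
            return 1;
        gathered_stable_write(fd, "hello\n", 6, 1);
        close(fd);
        return 0;
    }
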
| 1022 | 1022 | ||
| 1023 | /* | 1023 | /* |
| 1024 | * Read data from a file. *count must contain the requested read count | 1024 | * Read data from a file. *count must contain the requested read count |
| 1025 | * on entry. On return, *count contains the number of bytes actually read. | 1025 | * on entry. On return, *count contains the number of bytes actually read. |
| 1026 | * N.B. After this call fhp needs an fh_put | 1026 | * N.B. After this call fhp needs an fh_put |
| 1027 | */ | 1027 | */ |
| 1028 | __be32 | 1028 | __be32 |
| 1029 | nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, | 1029 | nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, |
| 1030 | loff_t offset, struct kvec *vec, int vlen, | 1030 | loff_t offset, struct kvec *vec, int vlen, |
| 1031 | unsigned long *count) | 1031 | unsigned long *count) |
| 1032 | { | 1032 | { |
| 1033 | __be32 err; | 1033 | __be32 err; |
| 1034 | 1034 | ||
| 1035 | if (file) { | 1035 | if (file) { |
| 1036 | err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, | 1036 | err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, |
| 1037 | MAY_READ|MAY_OWNER_OVERRIDE); | 1037 | MAY_READ|MAY_OWNER_OVERRIDE); |
| 1038 | if (err) | 1038 | if (err) |
| 1039 | goto out; | 1039 | goto out; |
| 1040 | err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); | 1040 | err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); |
| 1041 | } else { | 1041 | } else { |
| 1042 | err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file); | 1042 | err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file); |
| 1043 | if (err) | 1043 | if (err) |
| 1044 | goto out; | 1044 | goto out; |
| 1045 | err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); | 1045 | err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); |
| 1046 | nfsd_close(file); | 1046 | nfsd_close(file); |
| 1047 | } | 1047 | } |
| 1048 | out: | 1048 | out: |
| 1049 | return err; | 1049 | return err; |
| 1050 | } | 1050 | } |
| 1051 | 1051 | ||
| 1052 | /* | 1052 | /* |
| 1053 | * Write data to a file. | 1053 | * Write data to a file. |
| 1054 | * The stable flag requests synchronous writes. | 1054 | * The stable flag requests synchronous writes. |
| 1055 | * N.B. After this call fhp needs an fh_put | 1055 | * N.B. After this call fhp needs an fh_put |
| 1056 | */ | 1056 | */ |
| 1057 | __be32 | 1057 | __be32 |
| 1058 | nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, | 1058 | nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, |
| 1059 | loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, | 1059 | loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, |
| 1060 | int *stablep) | 1060 | int *stablep) |
| 1061 | { | 1061 | { |
| 1062 | __be32 err = 0; | 1062 | __be32 err = 0; |
| 1063 | 1063 | ||
| 1064 | if (file) { | 1064 | if (file) { |
| 1065 | err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, | 1065 | err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, |
| 1066 | MAY_WRITE|MAY_OWNER_OVERRIDE); | 1066 | MAY_WRITE|MAY_OWNER_OVERRIDE); |
| 1067 | if (err) | 1067 | if (err) |
| 1068 | goto out; | 1068 | goto out; |
| 1069 | err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, | 1069 | err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, |
| 1070 | stablep); | 1070 | stablep); |
| 1071 | } else { | 1071 | } else { |
| 1072 | err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file); | 1072 | err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file); |
| 1073 | if (err) | 1073 | if (err) |
| 1074 | goto out; | 1074 | goto out; |
| 1075 | 1075 | ||
| 1076 | if (cnt) | 1076 | if (cnt) |
| 1077 | err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, | 1077 | err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, |
| 1078 | cnt, stablep); | 1078 | cnt, stablep); |
| 1079 | nfsd_close(file); | 1079 | nfsd_close(file); |
| 1080 | } | 1080 | } |
| 1081 | out: | 1081 | out: |
| 1082 | return err; | 1082 | return err; |
| 1083 | } | 1083 | } |
| 1084 | 1084 | ||
| 1085 | #ifdef CONFIG_NFSD_V3 | 1085 | #ifdef CONFIG_NFSD_V3 |
| 1086 | /* | 1086 | /* |
| 1087 | * Commit all pending writes to stable storage. | 1087 | * Commit all pending writes to stable storage. |
| 1088 | * Strictly speaking, we could sync just the indicated file region here, | 1088 | * Strictly speaking, we could sync just the indicated file region here, |
| 1089 | * but there's currently no way we can ask the VFS to do so. | 1089 | * but there's currently no way we can ask the VFS to do so. |
| 1090 | * | 1090 | * |
| 1091 | * Unfortunately we cannot lock the file to make sure we return full WCC | 1091 | * Unfortunately we cannot lock the file to make sure we return full WCC |
| 1092 | * data to the client, as locking happens lower down in the filesystem. | 1092 | * data to the client, as locking happens lower down in the filesystem. |
| 1093 | */ | 1093 | */ |
| 1094 | __be32 | 1094 | __be32 |
| 1095 | nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, | 1095 | nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, |
| 1096 | loff_t offset, unsigned long count) | 1096 | loff_t offset, unsigned long count) |
| 1097 | { | 1097 | { |
| 1098 | struct file *file; | 1098 | struct file *file; |
| 1099 | __be32 err; | 1099 | __be32 err; |
| 1100 | 1100 | ||
| 1101 | if ((u64)count > ~(u64)offset) | 1101 | if ((u64)count > ~(u64)offset) |
| 1102 | return nfserr_inval; | 1102 | return nfserr_inval; |
| 1103 | 1103 | ||
| 1104 | if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0) | 1104 | if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0) |
| 1105 | return err; | 1105 | return err; |
| 1106 | if (EX_ISSYNC(fhp->fh_export)) { | 1106 | if (EX_ISSYNC(fhp->fh_export)) { |
| 1107 | if (file->f_op && file->f_op->fsync) { | 1107 | if (file->f_op && file->f_op->fsync) { |
| 1108 | err = nfserrno(nfsd_sync(file)); | 1108 | err = nfserrno(nfsd_sync(file)); |
| 1109 | } else { | 1109 | } else { |
| 1110 | err = nfserr_notsupp; | 1110 | err = nfserr_notsupp; |
| 1111 | } | 1111 | } |
| 1112 | } | 1112 | } |
| 1113 | 1113 | ||
| 1114 | nfsd_close(file); | 1114 | nfsd_close(file); |
| 1115 | return err; | 1115 | return err; |
| 1116 | } | 1116 | } |
| 1117 | #endif /* CONFIG_NFSD_V3 */ | 1117 | #endif /* CONFIG_NFSD_V3 */ |
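The "(u64)count > ~(u64)offset" guard at the top of nfsd_commit rejects any byte range whose end would wrap: since offset + ~offset == U64_MAX, ~offset is exactly the largest count that still fits. A small standalone check of that identity, illustrative only:

#include <assert.h>
#include <stdint.h>

/* Mirrors the nfsd_commit guard: true when offset + count would
 * overflow a u64. */
static int range_overflows(uint64_t offset, uint64_t count)
{
	return count > ~offset;
}

int main(void)
{
	assert(!range_overflows(0, UINT64_MAX));   /* exactly fills the space */
	assert(range_overflows(1, UINT64_MAX));    /* one byte past the end */
	assert(!range_overflows(UINT64_MAX, 0));   /* empty range is fine */
	return 0;
}
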
/*
 * Create a file (regular, directory, device, fifo); UNIX sockets
 * not yet implemented.
 * If the response fh has been verified, the parent directory should
 * already be locked. Note that the parent directory is left locked.
 *
 * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
 */
__be32
nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
		char *fname, int flen, struct iattr *iap,
		int type, dev_t rdev, struct svc_fh *resfhp)
{
	struct dentry *dentry, *dchild = NULL;
	struct inode *dirp;
	__be32 err;
	int host_err;

	err = nfserr_perm;
	if (!flen)
		goto out;
	err = nfserr_exist;
	if (isdotent(fname, flen))
		goto out;

	err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
	if (err)
		goto out;

	dentry = fhp->fh_dentry;
	dirp = dentry->d_inode;

	err = nfserr_notdir;
	if (!dirp->i_op || !dirp->i_op->lookup)
		goto out;
	/*
	 * Check whether the response file handle has been verified yet.
	 * If it has, the parent directory should already be locked.
	 */
	if (!resfhp->fh_dentry) {
		/* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
		fh_lock_nested(fhp, I_MUTEX_PARENT);
		dchild = lookup_one_len(fname, dentry, flen);
		host_err = PTR_ERR(dchild);
		if (IS_ERR(dchild))
			goto out_nfserr;
		err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
		if (err)
			goto out;
	} else {
		/* called from nfsd_proc_create */
		dchild = dget(resfhp->fh_dentry);
		if (!fhp->fh_locked) {
			/* not actually possible */
			printk(KERN_ERR
				"nfsd_create: parent %s/%s not locked!\n",
				dentry->d_parent->d_name.name,
				dentry->d_name.name);
			err = nfserr_io;
			goto out;
		}
	}
	/*
	 * Make sure the child dentry is still negative ...
	 */
	err = nfserr_exist;
	if (dchild->d_inode) {
		dprintk("nfsd_create: dentry %s/%s not negative!\n",
			dentry->d_name.name, dchild->d_name.name);
		goto out;
	}

	if (!(iap->ia_valid & ATTR_MODE))
		iap->ia_mode = 0;
	iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;

	/*
	 * Get the dir op function pointer.
	 */
	err = 0;
	switch (type) {
	case S_IFREG:
		host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
		break;
	case S_IFDIR:
		host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
		break;
	case S_IFCHR:
	case S_IFBLK:
	case S_IFIFO:
	case S_IFSOCK:
		host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
		break;
	default:
		printk("nfsd: bad file type %o in nfsd_create\n", type);
		host_err = -EINVAL;
	}
	if (host_err < 0)
		goto out_nfserr;

	if (EX_ISSYNC(fhp->fh_export)) {
		err = nfserrno(nfsd_sync_dir(dentry));
		write_inode_now(dchild->d_inode, 1);
	}

	/* Set file attributes. Mode has already been set and
	 * setting uid/gid works only for root. Irix appears to
	 * send along the gid when it tries to implement setgid
	 * directories via NFS.
	 */
	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
		__be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
		if (err2)
			err = err2;
	}
	/*
	 * Update the file handle to get the new inode info.
	 */
	if (!err)
		err = fh_update(resfhp);
out:
	if (dchild && !IS_ERR(dchild))
		dput(dchild);
	return err;

out_nfserr:
	err = nfserrno(host_err);
	goto out;
}

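nfsd_create trusts the client only for permission bits: "iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type" masks the supplied mode down to the set-id/sticky/rwx bits and ORs in the object type that the switch statement then dispatches on. A userspace illustration of the same merge (S_IALLUGO is kernel-internal, so its bits are spelled out here):

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	mode_t requested = 0644 | S_ISUID;	/* bits sent by the client */
	mode_t type = S_IFDIR;			/* kind of object to create */

	/* S_IALLUGO == S_ISUID|S_ISGID|S_ISVTX|0777 in the kernel */
	mode_t mode = (requested & (S_ISUID | S_ISGID | S_ISVTX | 0777)) | type;

	printf("mode %o, S_ISDIR: %d\n", (unsigned)mode, S_ISDIR(mode));
	return 0;
}
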
#ifdef CONFIG_NFSD_V3
/*
 * NFSv3 version of nfsd_create
 */
__be32
nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
		char *fname, int flen, struct iattr *iap,
		struct svc_fh *resfhp, int createmode, u32 *verifier,
		int *truncp, int *created)
{
	struct dentry *dentry, *dchild = NULL;
	struct inode *dirp;
	__be32 err;
	int host_err;
	__u32 v_mtime = 0, v_atime = 0;

	err = nfserr_perm;
	if (!flen)
		goto out;
	err = nfserr_exist;
	if (isdotent(fname, flen))
		goto out;
	if (!(iap->ia_valid & ATTR_MODE))
		iap->ia_mode = 0;
	err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
	if (err)
		goto out;

	dentry = fhp->fh_dentry;
	dirp = dentry->d_inode;

	/* Get all the sanity checks out of the way before
	 * we lock the parent. */
	err = nfserr_notdir;
	if (!dirp->i_op || !dirp->i_op->lookup)
		goto out;
	fh_lock_nested(fhp, I_MUTEX_PARENT);

	/*
	 * Compose the response file handle.
	 */
	dchild = lookup_one_len(fname, dentry, flen);
	host_err = PTR_ERR(dchild);
	if (IS_ERR(dchild))
		goto out_nfserr;

	err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
	if (err)
		goto out;

	if (createmode == NFS3_CREATE_EXCLUSIVE) {
		/* solaris7 gets confused (bugid 4218508) if these have
		 * the high bit set, so just clear the high bits.
		 */
		v_mtime = verifier[0] & 0x7fffffff;
		v_atime = verifier[1] & 0x7fffffff;
	}

	if (dchild->d_inode) {
		err = 0;

		switch (createmode) {
		case NFS3_CREATE_UNCHECKED:
			if (!S_ISREG(dchild->d_inode->i_mode))
				err = nfserr_exist;
			else if (truncp) {
				/* in nfsv4, we need to treat this case a little
				 * differently. we don't want to truncate the
				 * file now; this would be wrong if the OPEN
				 * fails for some other reason. furthermore,
				 * if the size is nonzero, we should ignore it
				 * according to spec!
				 */
				*truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size;
			} else {
				iap->ia_valid &= ATTR_SIZE;
				goto set_attr;
			}
			break;
		case NFS3_CREATE_EXCLUSIVE:
			if (dchild->d_inode->i_mtime.tv_sec == v_mtime
			    && dchild->d_inode->i_atime.tv_sec == v_atime
			    && dchild->d_inode->i_size == 0)
				break;
			/* fallthru */
		case NFS3_CREATE_GUARDED:
			err = nfserr_exist;
		}
		goto out;
	}

	host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
	if (host_err < 0)
		goto out_nfserr;
	if (created)
		*created = 1;

	if (EX_ISSYNC(fhp->fh_export)) {
		err = nfserrno(nfsd_sync_dir(dentry));
		/* setattr will sync the child (or not) */
	}

	if (createmode == NFS3_CREATE_EXCLUSIVE) {
		/* Cram the verifier into atime/mtime */
		iap->ia_valid = ATTR_MTIME|ATTR_ATIME
			| ATTR_MTIME_SET|ATTR_ATIME_SET;
		/* XXX someone who knows this better please fix it for nsec */
		iap->ia_mtime.tv_sec = v_mtime;
		iap->ia_atime.tv_sec = v_atime;
		iap->ia_mtime.tv_nsec = 0;
		iap->ia_atime.tv_nsec = 0;
	}

	/* Set file attributes.
	 * Irix appears to send along the gid when it tries to
	 * implement setgid directories via NFS. Clear out all that cruft.
	 */
set_attr:
	if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
		__be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
		if (err2)
			err = err2;
	}

	/*
	 * Update the filehandle to get the new inode info.
	 */
	if (!err)
		err = fh_update(resfhp);

out:
	fh_unlock(fhp);
	if (dchild && !IS_ERR(dchild))
		dput(dchild);
	return err;

out_nfserr:
	err = nfserrno(host_err);
	goto out;
}
#endif /* CONFIG_NFSD_V3 */
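The EXCLUSIVE-create path above survives retransmitted CREATE requests by stashing the client's verifier in the new file's atime/mtime seconds (high bits cleared for old Solaris clients): on a replay, matching timestamps on a zero-length file mean the earlier attempt already succeeded. A hedged sketch of the idea, with cram_verifier and same_create as illustrative helpers, not kernel functions:

#include <stdint.h>
#include <stdio.h>

struct stamp {
	uint32_t mtime_sec;
	uint32_t atime_sec;
};

/* Encode the 8-byte verifier into the two timestamp fields. */
static struct stamp cram_verifier(const uint32_t verifier[2])
{
	struct stamp s = {
		.mtime_sec = verifier[0] & 0x7fffffff,
		.atime_sec = verifier[1] & 0x7fffffff,
	};
	return s;
}

/* A replayed CREATE finds the file already there; identical stamps
 * (plus size 0, omitted here) mean "same request, report success". */
static int same_create(struct stamp on_disk, const uint32_t verifier[2])
{
	struct stamp want = cram_verifier(verifier);

	return on_disk.mtime_sec == want.mtime_sec &&
	       on_disk.atime_sec == want.atime_sec;
}

int main(void)
{
	uint32_t v[2] = { 0xdeadbeefu, 0x12345678u };
	struct stamp s = cram_verifier(v);

	printf("replay detected: %d\n", same_create(s, v));
	return 0;
}
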
/*
 * Read a symlink. On entry, *lenp must contain the maximum path length that
 * fits into the buffer. On return, it contains the true length.
 * N.B. After this call fhp needs an fh_put
 */
__be32
nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
{
	struct dentry *dentry;
	struct inode *inode;
	mm_segment_t oldfs;
	__be32 err;
	int host_err;

	err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP);
	if (err)
		goto out;

	dentry = fhp->fh_dentry;
	inode = dentry->d_inode;

	err = nfserr_inval;
	if (!inode->i_op || !inode->i_op->readlink)
		goto out;

	touch_atime(fhp->fh_export->ex_mnt, dentry);
	/* N.B. Why does this call need a get_fs()??
	 * Remove the set_fs and watch the fireworks:-) --okir
	 */

	oldfs = get_fs(); set_fs(KERNEL_DS);
	host_err = inode->i_op->readlink(dentry, buf, *lenp);
	set_fs(oldfs);

	if (host_err < 0)
		goto out_nfserr;
	*lenp = host_err;
	err = 0;
out:
	return err;

out_nfserr:
	err = nfserrno(host_err);
	goto out;
}

/*
 * Create a symlink and look up its inode
 * N.B. After this call _both_ fhp and resfhp need an fh_put
 */
__be32
nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
		char *fname, int flen,
		char *path, int plen,
		struct svc_fh *resfhp,
		struct iattr *iap)
{
	struct dentry *dentry, *dnew;
	__be32 err, cerr;
	int host_err;
	umode_t mode;

	err = nfserr_noent;
	if (!flen || !plen)
		goto out;
	err = nfserr_exist;
	if (isdotent(fname, flen))
		goto out;

	err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
	if (err)
		goto out;
	fh_lock(fhp);
	dentry = fhp->fh_dentry;
	dnew = lookup_one_len(fname, dentry, flen);
	host_err = PTR_ERR(dnew);
	if (IS_ERR(dnew))
		goto out_nfserr;

	mode = S_IALLUGO;
	/* Only the MODE ATTRibute is even vaguely meaningful */
	if (iap && (iap->ia_valid & ATTR_MODE))
		mode = iap->ia_mode & S_IALLUGO;

	if (unlikely(path[plen] != 0)) {
		char *path_alloced = kmalloc(plen + 1, GFP_KERNEL);
		if (path_alloced == NULL)
			host_err = -ENOMEM;
		else {
			strncpy(path_alloced, path, plen);
			path_alloced[plen] = 0;
			host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode);
			kfree(path_alloced);
		}
	} else
		host_err = vfs_symlink(dentry->d_inode, dnew, path, mode);

	if (!host_err) {
		if (EX_ISSYNC(fhp->fh_export))
			host_err = nfsd_sync_dir(dentry);
	}
	err = nfserrno(host_err);
	fh_unlock(fhp);

	cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
	dput(dnew);
	if (err == 0)
		err = cerr;
out:
	return err;

out_nfserr:
	err = nfserrno(host_err);
	goto out;
}

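nfsd_symlink cannot assume the XDR-decoded path is NUL-terminated; when path[plen] != 0 it copies the name into a freshly allocated, terminated buffer before calling vfs_symlink. The same defensive pattern in plain C, illustrative only:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Return a NUL-terminated heap copy of a length-counted string,
 * or NULL on allocation failure. */
static char *terminated_copy(const char *path, size_t plen)
{
	char *buf = malloc(plen + 1);

	if (!buf)
		return NULL;
	memcpy(buf, path, plen);
	buf[plen] = '\0';
	return buf;
}

int main(void)
{
	const char raw[6] = { 't', 'a', 'r', 'g', 'e', 't' };	/* no NUL */
	char *p = terminated_copy(raw, sizeof(raw));

	if (p) {
		printf("%s\n", p);
		free(p);
	}
	return 0;
}
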
/*
 * Create a hardlink
 * N.B. After this call _both_ ffhp and tfhp need an fh_put
 */
__be32
nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
		char *name, int len, struct svc_fh *tfhp)
{
	struct dentry *ddir, *dnew, *dold;
	struct inode *dirp, *dest;
	__be32 err;
	int host_err;

	err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE);
	if (err)
		goto out;
	err = fh_verify(rqstp, tfhp, -S_IFDIR, MAY_NOP);
	if (err)
		goto out;

	err = nfserr_perm;
	if (!len)
		goto out;
	err = nfserr_exist;
	if (isdotent(name, len))
		goto out;

	fh_lock_nested(ffhp, I_MUTEX_PARENT);
	ddir = ffhp->fh_dentry;
	dirp = ddir->d_inode;

	dnew = lookup_one_len(name, ddir, len);
	host_err = PTR_ERR(dnew);
	if (IS_ERR(dnew))
		goto out_nfserr;

	dold = tfhp->fh_dentry;
	dest = dold->d_inode;

	host_err = vfs_link(dold, dirp, dnew);
	if (!host_err) {
		if (EX_ISSYNC(ffhp->fh_export)) {
			err = nfserrno(nfsd_sync_dir(ddir));
			write_inode_now(dest, 1);
		}
		err = 0;
	} else {
		if (host_err == -EXDEV && rqstp->rq_vers == 2)
			err = nfserr_acces;
		else
			err = nfserrno(host_err);
	}

	dput(dnew);
out_unlock:
	fh_unlock(ffhp);
out:
	return err;

out_nfserr:
	err = nfserrno(host_err);
	goto out_unlock;
}

/*
 * Rename a file
 * N.B. After this call _both_ ffhp and tfhp need an fh_put
 */
__be32
nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
		struct svc_fh *tfhp, char *tname, int tlen)
{
	struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap;
	struct inode *fdir, *tdir;
	__be32 err;
	int host_err;

	err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE);
	if (err)
		goto out;
	err = fh_verify(rqstp, tfhp, S_IFDIR, MAY_CREATE);
	if (err)
		goto out;

	fdentry = ffhp->fh_dentry;
	fdir = fdentry->d_inode;

	tdentry = tfhp->fh_dentry;
	tdir = tdentry->d_inode;

	err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
	if (ffhp->fh_export != tfhp->fh_export)
		goto out;

	err = nfserr_perm;
	if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
		goto out;

	/* cannot use fh_lock as we need deadlock protective ordering
	 * so do it by hand */
	trap = lock_rename(tdentry, fdentry);
	ffhp->fh_locked = tfhp->fh_locked = 1;
	fill_pre_wcc(ffhp);
	fill_pre_wcc(tfhp);

	odentry = lookup_one_len(fname, fdentry, flen);
	host_err = PTR_ERR(odentry);
	if (IS_ERR(odentry))
		goto out_nfserr;

	host_err = -ENOENT;
	if (!odentry->d_inode)
		goto out_dput_old;
	host_err = -EINVAL;
	if (odentry == trap)
		goto out_dput_old;

	ndentry = lookup_one_len(tname, tdentry, tlen);
	host_err = PTR_ERR(ndentry);
	if (IS_ERR(ndentry))
		goto out_dput_old;
	host_err = -ENOTEMPTY;
	if (ndentry == trap)
		goto out_dput_new;

#ifdef MSNFS
	if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
	    ((atomic_read(&odentry->d_count) > 1)
	     || (atomic_read(&ndentry->d_count) > 1))) {
		host_err = -EPERM;
	} else
#endif
	host_err = vfs_rename(fdir, odentry, tdir, ndentry);
	if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
		host_err = nfsd_sync_dir(tdentry);
		if (!host_err)
			host_err = nfsd_sync_dir(fdentry);
	}

out_dput_new:
	dput(ndentry);
out_dput_old:
	dput(odentry);
out_nfserr:
	err = nfserrno(host_err);

	/* we cannot rely on fh_unlock on the two filehandles,
	 * as that would do the wrong thing if the two directories
	 * were the same, so again we do it by hand
	 */
	fill_post_wcc(ffhp);
	fill_post_wcc(tfhp);
	unlock_rename(tdentry, fdentry);
	ffhp->fh_locked = tfhp->fh_locked = 0;

out:
	return err;
}

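The "deadlock protective ordering" comment in nfsd_rename is why it calls lock_rename() instead of fh_lock(): a cross-directory rename must hold both parents, and two renames taking them in opposite orders would deadlock, so the kernel imposes one global acquisition order. A toy model of that discipline using pthread mutexes ordered by address; illustrative only, not how lock_rename is actually implemented:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Lock two directory mutexes in a fixed global order so concurrent
 * renames can never hold them in opposite orders (ABBA deadlock). */
static void lock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {				/* same parent: one lock */
		pthread_mutex_lock(a);
		return;
	}
	if ((uintptr_t)a > (uintptr_t)b) {	/* normalize the order */
		pthread_mutex_t *t = a;
		a = b;
		b = t;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

int main(void)
{
	pthread_mutex_t src = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t dst = PTHREAD_MUTEX_INITIALIZER;

	lock_two(&dst, &src);
	puts("both parents held, rename may proceed");
	pthread_mutex_unlock(&src);
	pthread_mutex_unlock(&dst);
	return 0;
}
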
/*
 * Unlink a file or directory
 * N.B. After this call fhp needs an fh_put
 */
__be32
nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
		char *fname, int flen)
{
	struct dentry *dentry, *rdentry;
	struct inode *dirp;
	__be32 err;
	int host_err;

	err = nfserr_acces;
	if (!flen || isdotent(fname, flen))
		goto out;
	err = fh_verify(rqstp, fhp, S_IFDIR, MAY_REMOVE);
	if (err)
		goto out;

	fh_lock_nested(fhp, I_MUTEX_PARENT);
	dentry = fhp->fh_dentry;
	dirp = dentry->d_inode;

	rdentry = lookup_one_len(fname, dentry, flen);
	host_err = PTR_ERR(rdentry);
	if (IS_ERR(rdentry))
		goto out_nfserr;

	if (!rdentry->d_inode) {
		dput(rdentry);
		err = nfserr_noent;
		goto out;
	}

	if (!type)
		type = rdentry->d_inode->i_mode & S_IFMT;

	if (type != S_IFDIR) { /* It's UNLINK */
#ifdef MSNFS
		if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
		    (atomic_read(&rdentry->d_count) > 1)) {
			host_err = -EPERM;
		} else
#endif
		host_err = vfs_unlink(dirp, rdentry);
	} else { /* It's RMDIR */
		host_err = vfs_rmdir(dirp, rdentry);
	}

	dput(rdentry);

	if (host_err)
		goto out_nfserr;
	if (EX_ISSYNC(fhp->fh_export))
		host_err = nfsd_sync_dir(dentry);

out_nfserr:
	err = nfserrno(host_err);
out:
	return err;
}

/*
 * Read entries from a directory.
 * We ignore the NFSv3/4 verifier for now.
 */
__be32
nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
		struct readdir_cd *cdp, filldir_t func)
{
	__be32 err;
	int host_err;
	struct file *file;
	loff_t offset = *offsetp;

	err = nfsd_open(rqstp, fhp, S_IFDIR, MAY_READ, &file);
	if (err)
		goto out;

	offset = vfs_llseek(file, offset, 0);
	if (offset < 0) {
		err = nfserrno((int)offset);
		goto out_close;
	}

	/*
	 * Read the directory entries. This silly loop is necessary because
	 * readdir() is not guaranteed to fill up the entire buffer, but
	 * may choose to do less.
	 */
	do {
		cdp->err = nfserr_eof; /* will be cleared on successful read */
		host_err = vfs_readdir(file, func, cdp);
	} while (host_err >= 0 && cdp->err == nfs_ok);
	if (host_err)
		err = nfserrno(host_err);
	else
		err = cdp->err;
	*offsetp = vfs_llseek(file, 0, 1);

	if (err == nfserr_eof || err == nfserr_toosmall)
		err = nfs_ok; /* can still be found in ->err */
out_close:
	nfsd_close(file);
out:
	return err;
}

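The do/while in nfsd_readdir exists because one vfs_readdir call may stop early without being at end-of-directory, so the caller must loop until the fill callback reports a terminal state. A self-contained model of that retry shape, with fill_some standing in for vfs_readdir plus its callback:

#include <stdio.h>

enum fill_status { FILL_MORE, FILL_DONE };

/* Hypothetical producer: emits entries 0..9 but only three per call,
 * the way readdir() may legitimately "choose to do less". */
static enum fill_status fill_some(int *cursor)
{
	int emitted = 0;

	while (*cursor < 10 && emitted < 3) {
		printf("entry %d\n", (*cursor)++);
		emitted++;
	}
	return *cursor < 10 ? FILL_MORE : FILL_DONE;
}

int main(void)
{
	int cursor = 0;

	while (fill_some(&cursor) == FILL_MORE)
		;	/* cf. the do { vfs_readdir(...) } while loop */
	return 0;
}
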
/*
 * Get file system stats
 * N.B. After this call fhp needs an fh_put
 */
__be32
nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
{
	__be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP);
	if (!err && vfs_statfs(fhp->fh_dentry, stat))
		err = nfserr_io;
	return err;
}

/*
 * Check a user's access permissions to this inode.
 */
__be32
nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
{
	struct inode *inode = dentry->d_inode;
	int err;

	if (acc == MAY_NOP)
		return 0;
#if 0
	dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
		acc,
		(acc & MAY_READ) ?	" read"  : "",
		(acc & MAY_WRITE) ?	" write" : "",
		(acc & MAY_EXEC) ?	" exec"  : "",
		(acc & MAY_SATTR) ?	" sattr" : "",
		(acc & MAY_TRUNC) ?	" trunc" : "",
		(acc & MAY_LOCK) ?	" lock"  : "",
		(acc & MAY_OWNER_OVERRIDE) ? " owneroverride" : "",
		inode->i_mode,
		IS_IMMUTABLE(inode) ?	" immut" : "",
		IS_APPEND(inode) ?	" append" : "",
		IS_RDONLY(inode) ?	" ro" : "");
	dprintk("      owner %d/%d user %d/%d\n",
		inode->i_uid, inode->i_gid, current->fsuid, current->fsgid);
#endif

	/* Normally we reject any write/sattr etc access on a read-only file
	 * system. But if it is IRIX doing a check on write-access for a
	 * device special file, we ignore rofs.
	 */
	if (!(acc & MAY_LOCAL_ACCESS))
		if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
			if (EX_RDONLY(exp) || IS_RDONLY(inode))
				return nfserr_rofs;
			if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
				return nfserr_perm;
		}
	if ((acc & MAY_TRUNC) && IS_APPEND(inode))
		return nfserr_perm;

	if (acc & MAY_LOCK) {
		/* If we cannot rely on authentication in NLM requests,
		 * just allow locks, otherwise require read permission, or
		 * ownership
		 */
		if (exp->ex_flags & NFSEXP_NOAUTHNLM)
			return 0;
		else
			acc = MAY_READ | MAY_OWNER_OVERRIDE;
	}
	/*
	 * The file owner always gets access permission for accesses that
	 * would normally be checked at open time. This is to make
	 * file access work even when the client has done a fchmod(fd, 0).
	 *
	 * However, `cp foo bar' should fail nevertheless when bar is
	 * readonly. A sensible way to do this might be to reject all
	 * attempts to truncate a read-only file, because a creat() call
	 * always implies file truncation.
	 * ... but this isn't really fair. A process may reasonably call
	 * ftruncate on an open file descriptor on a file with perm 000.
	 * We must trust the client to do permission checking - using "ACCESS"
	 * with NFSv3.
	 */
	if ((acc & MAY_OWNER_OVERRIDE) &&
	    inode->i_uid == current->fsuid)
		return 0;

	err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL);

	/* Allow read access to binaries even when mode 111 */
	if (err == -EACCES && S_ISREG(inode->i_mode) &&
	    acc == (MAY_READ | MAY_OWNER_OVERRIDE))
		err = permission(inode, MAY_EXEC, NULL);

	return err ? nfserrno(err) : 0;
}

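The MAY_OWNER_OVERRIDE handling above is what keeps I/O working after a client does fchmod(fd, 0): the owner passes checks that a local open would have done at open time, even though the mode bits now deny everything. A compressed model of just that rule; the structs are stand-ins, not the kernel's:

#include <stdio.h>

struct creds { unsigned int fsuid; };
struct node  { unsigned int uid; unsigned int mode; };

/* Owner override: the file's owner passes open-time-style checks
 * regardless of mode bits; everyone else consults the mode. */
static int may_access(const struct node *n, const struct creds *c,
		      unsigned int want, int owner_override)
{
	if (owner_override && n->uid == c->fsuid)
		return 1;
	return (n->mode & want) == want;
}

int main(void)
{
	struct node file = { .uid = 1000, .mode = 0 };	/* after fchmod(fd, 0) */
	struct creds me = { .fsuid = 1000 };

	printf("owner may still read: %d\n", may_access(&file, &me, 0400, 1));
	return 0;
}
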
| 1871 | void | 1871 | void |
| 1872 | nfsd_racache_shutdown(void) | 1872 | nfsd_racache_shutdown(void) |
| 1873 | { | 1873 | { |
| 1874 | if (!raparml) | 1874 | if (!raparml) |
| 1875 | return; | 1875 | return; |
| 1876 | dprintk("nfsd: freeing readahead buffers.\n"); | 1876 | dprintk("nfsd: freeing readahead buffers.\n"); |
| 1877 | kfree(raparml); | 1877 | kfree(raparml); |
| 1878 | raparml = NULL; | 1878 | raparml = NULL; |
| 1879 | } | 1879 | } |
| 1880 | /* | 1880 | /* |
| 1881 | * Initialize readahead param cache | 1881 | * Initialize readahead param cache |
| 1882 | */ | 1882 | */ |
| 1883 | int | 1883 | int |
| 1884 | nfsd_racache_init(int cache_size) | 1884 | nfsd_racache_init(int cache_size) |
| 1885 | { | 1885 | { |
| 1886 | int i; | 1886 | int i; |
| 1887 | int j = 0; | 1887 | int j = 0; |
| 1888 | int nperbucket; | 1888 | int nperbucket; |
| 1889 | 1889 | ||
| 1890 | 1890 | ||
| 1891 | if (raparml) | 1891 | if (raparml) |
| 1892 | return 0; | 1892 | return 0; |
| 1893 | if (cache_size < 2*RAPARM_HASH_SIZE) | 1893 | if (cache_size < 2*RAPARM_HASH_SIZE) |
| 1894 | cache_size = 2*RAPARM_HASH_SIZE; | 1894 | cache_size = 2*RAPARM_HASH_SIZE; |
| 1895 | raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL); | 1895 | raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL); |
| 1896 | 1896 | ||
| 1897 | if (!raparml) { | 1897 | if (!raparml) { |
| 1898 | printk(KERN_WARNING | 1898 | printk(KERN_WARNING |
| 1899 | "nfsd: Could not allocate memory read-ahead cache.\n"); | 1899 | "nfsd: Could not allocate memory read-ahead cache.\n"); |
| 1900 | return -ENOMEM; | 1900 | return -ENOMEM; |
| 1901 | } | 1901 | } |
| 1902 | 1902 | ||
| 1903 | dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); | 1903 | dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); |
| 1904 | for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) { | 1904 | for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) { |
| 1905 | raparm_hash[i].pb_head = NULL; | 1905 | raparm_hash[i].pb_head = NULL; |
| 1906 | spin_lock_init(&raparm_hash[i].pb_lock); | 1906 | spin_lock_init(&raparm_hash[i].pb_lock); |
| 1907 | } | 1907 | } |
| 1908 | nperbucket = cache_size >> RAPARM_HASH_BITS; | 1908 | nperbucket = cache_size >> RAPARM_HASH_BITS; |
| 1909 | for (i = 0; i < cache_size - 1; i++) { | 1909 | for (i = 0; i < cache_size - 1; i++) { |
| 1910 | if (i % nperbucket == 0) | 1910 | if (i % nperbucket == 0) |
| 1911 | raparm_hash[j++].pb_head = raparml + i; | 1911 | raparm_hash[j++].pb_head = raparml + i; |
| 1912 | if (i % nperbucket < nperbucket-1) | 1912 | if (i % nperbucket < nperbucket-1) |
| 1913 | raparml[i].p_next = raparml + i + 1; | 1913 | raparml[i].p_next = raparml + i + 1; |
| 1914 | } | 1914 | } |
| 1915 | 1915 | ||
| 1916 | nfsdstats.ra_size = cache_size; | 1916 | nfsdstats.ra_size = cache_size; |
| 1917 | return 0; | 1917 | return 0; |
| 1918 | } | 1918 | } |
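The loop above carves the zeroed array into RAPARM_HASH_SIZE singly linked chains of nperbucket entries each: every nperbucket-th entry becomes a bucket head, and entries chain forward within their bucket. A standalone sketch of that indexing, using illustrative hash constants rather than the kernel's:

#include <stdio.h>

#define RAPARM_HASH_BITS 4	/* illustrative, not the kernel's value */
#define RAPARM_HASH_SIZE (1 << RAPARM_HASH_BITS)

int main(void)
{
	int cache_size = 2 * RAPARM_HASH_SIZE;	/* the enforced minimum */
	int nperbucket = cache_size >> RAPARM_HASH_BITS;
	int i, j = 0;

	for (i = 0; i < cache_size - 1; i++) {
		if (i % nperbucket == 0)
			printf("entry %2d heads bucket %d\n", i, j++);
		if (i % nperbucket < nperbucket - 1)
			printf("entry %2d chains to %d\n", i, i + 1);
	}
	return 0;
}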
| 1919 | 1919 | ||
| 1920 | #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) | 1920 | #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) |
| 1921 | struct posix_acl * | 1921 | struct posix_acl * |
| 1922 | nfsd_get_posix_acl(struct svc_fh *fhp, int type) | 1922 | nfsd_get_posix_acl(struct svc_fh *fhp, int type) |
| 1923 | { | 1923 | { |
| 1924 | struct inode *inode = fhp->fh_dentry->d_inode; | 1924 | struct inode *inode = fhp->fh_dentry->d_inode; |
| 1925 | char *name; | 1925 | char *name; |
| 1926 | void *value = NULL; | 1926 | void *value = NULL; |
| 1927 | ssize_t size; | 1927 | ssize_t size; |
| 1928 | struct posix_acl *acl; | 1928 | struct posix_acl *acl; |
| 1929 | 1929 | ||
| 1930 | if (!IS_POSIXACL(inode)) | 1930 | if (!IS_POSIXACL(inode)) |
| 1931 | return ERR_PTR(-EOPNOTSUPP); | 1931 | return ERR_PTR(-EOPNOTSUPP); |
| 1932 | 1932 | ||
| 1933 | switch (type) { | 1933 | switch (type) { |
| 1934 | case ACL_TYPE_ACCESS: | 1934 | case ACL_TYPE_ACCESS: |
| 1935 | name = POSIX_ACL_XATTR_ACCESS; | 1935 | name = POSIX_ACL_XATTR_ACCESS; |
| 1936 | break; | 1936 | break; |
| 1937 | case ACL_TYPE_DEFAULT: | 1937 | case ACL_TYPE_DEFAULT: |
| 1938 | name = POSIX_ACL_XATTR_DEFAULT; | 1938 | name = POSIX_ACL_XATTR_DEFAULT; |
| 1939 | break; | 1939 | break; |
| 1940 | default: | 1940 | default: |
| 1941 | return ERR_PTR(-EOPNOTSUPP); | 1941 | return ERR_PTR(-EOPNOTSUPP); |
| 1942 | } | 1942 | } |
| 1943 | 1943 | ||
| 1944 | size = nfsd_getxattr(fhp->fh_dentry, name, &value); | 1944 | size = nfsd_getxattr(fhp->fh_dentry, name, &value); |
| 1945 | if (size < 0) | 1945 | if (size < 0) |
| 1946 | return ERR_PTR(size); | 1946 | return ERR_PTR(size); |
| 1947 | 1947 | ||
| 1948 | acl = posix_acl_from_xattr(value, size); | 1948 | acl = posix_acl_from_xattr(value, size); |
| 1949 | kfree(value); | 1949 | kfree(value); |
| 1950 | return acl; | 1950 | return acl; |
| 1951 | } | 1951 | } |
| 1952 | 1952 | ||
| 1953 | int | 1953 | int |
| 1954 | nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) | 1954 | nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) |
| 1955 | { | 1955 | { |
| 1956 | struct inode *inode = fhp->fh_dentry->d_inode; | 1956 | struct inode *inode = fhp->fh_dentry->d_inode; |
| 1957 | char *name; | 1957 | char *name; |
| 1958 | void *value = NULL; | 1958 | void *value = NULL; |
| 1959 | size_t size; | 1959 | size_t size; |
| 1960 | int error; | 1960 | int error; |
| 1961 | 1961 | ||
| 1962 | if (!IS_POSIXACL(inode) || !inode->i_op || | 1962 | if (!IS_POSIXACL(inode) || !inode->i_op || |
| 1963 | !inode->i_op->setxattr || !inode->i_op->removexattr) | 1963 | !inode->i_op->setxattr || !inode->i_op->removexattr) |
| 1964 | return -EOPNOTSUPP; | 1964 | return -EOPNOTSUPP; |
| 1965 | switch (type) { | 1965 | switch (type) { |
| 1966 | case ACL_TYPE_ACCESS: | 1966 | case ACL_TYPE_ACCESS: |
| 1967 | name = POSIX_ACL_XATTR_ACCESS; | 1967 | name = POSIX_ACL_XATTR_ACCESS; |
| 1968 | break; | 1968 | break; |
| 1969 | case ACL_TYPE_DEFAULT: | 1969 | case ACL_TYPE_DEFAULT: |
| 1970 | name = POSIX_ACL_XATTR_DEFAULT; | 1970 | name = POSIX_ACL_XATTR_DEFAULT; |
| 1971 | break; | 1971 | break; |
| 1972 | default: | 1972 | default: |
| 1973 | return -EOPNOTSUPP; | 1973 | return -EOPNOTSUPP; |
| 1974 | } | 1974 | } |
| 1975 | 1975 | ||
| 1976 | if (acl && acl->a_count) { | 1976 | if (acl && acl->a_count) { |
| 1977 | size = posix_acl_xattr_size(acl->a_count); | 1977 | size = posix_acl_xattr_size(acl->a_count); |
| 1978 | value = kmalloc(size, GFP_KERNEL); | 1978 | value = kmalloc(size, GFP_KERNEL); |
| 1979 | if (!value) | 1979 | if (!value) |
| 1980 | return -ENOMEM; | 1980 | return -ENOMEM; |
| 1981 | error = posix_acl_to_xattr(acl, value, size); | 1981 | error = posix_acl_to_xattr(acl, value, size); |
| 1982 | if (error < 0) | 1982 | if (error < 0) |
| 1983 | goto getout; | 1983 | goto getout; |
| 1984 | size = error; | 1984 | size = error; |
| 1985 | } else | 1985 | } else |
| 1986 | size = 0; | 1986 | size = 0; |
| 1987 | 1987 | ||
| 1988 | if (size) | 1988 | if (size) |
| 1989 | error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0); | 1989 | error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0); |
| 1990 | else { | 1990 | else { |
| 1991 | if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT) | 1991 | if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT) |
| 1992 | error = 0; | 1992 | error = 0; |
| 1993 | else { | 1993 | else { |
| 1994 | error = vfs_removexattr(fhp->fh_dentry, name); | 1994 | error = vfs_removexattr(fhp->fh_dentry, name); |
| 1995 | if (error == -ENODATA) | 1995 | if (error == -ENODATA) |
| 1996 | error = 0; | 1996 | error = 0; |
| 1997 | } | 1997 | } |
| 1998 | } | 1998 | } |
| 1999 | 1999 | ||
| 2000 | getout: | 2000 | getout: |
| 2001 | kfree(value); | 2001 | kfree(value); |
| 2002 | return error; | 2002 | return error; |
| 2003 | } | 2003 | } |
| 2004 | #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ | 2004 | #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ |
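Both helpers above round-trip ACLs through the xattr layer, packing with posix_acl_to_xattr() on the way in and unpacking with posix_acl_from_xattr() on the way out. A userspace analogue that probes for and fetches the same packed blob with getxattr(2); a sketch with minimal error handling:

#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";
	const char *name = "system.posix_acl_access";
	ssize_t size = getxattr(path, name, NULL, 0);	/* probe the length */
	void *value;

	if (size < 0) {
		perror("getxattr");	/* ENODATA: no ACL beyond the mode bits */
		return 1;
	}
	value = malloc(size);
	if (!value)
		return 1;
	size = getxattr(path, name, value, size);
	printf("%s: %zd bytes of packed ACL\n", name, size);
	free(value);
	return 0;
}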
| 2005 | 2005 |
fs/ocfs2/file.c
| 1 | /* -*- mode: c; c-basic-offset: 8; -*- | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
| 2 | * vim: noexpandtab sw=8 ts=8 sts=0: | 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
| 3 | * | 3 | * |
| 4 | * file.c | 4 | * file.c |
| 5 | * | 5 | * |
| 6 | * File open, close, extend, truncate | 6 | * File open, close, extend, truncate |
| 7 | * | 7 | * |
| 8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | 8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. |
| 9 | * | 9 | * |
| 10 | * This program is free software; you can redistribute it and/or | 10 | * This program is free software; you can redistribute it and/or |
| 11 | * modify it under the terms of the GNU General Public | 11 | * modify it under the terms of the GNU General Public |
| 12 | * License as published by the Free Software Foundation; either | 12 | * License as published by the Free Software Foundation; either |
| 13 | * version 2 of the License, or (at your option) any later version. | 13 | * version 2 of the License, or (at your option) any later version. |
| 14 | * | 14 | * |
| 15 | * This program is distributed in the hope that it will be useful, | 15 | * This program is distributed in the hope that it will be useful, |
| 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 18 | * General Public License for more details. | 18 | * General Public License for more details. |
| 19 | * | 19 | * |
| 20 | * You should have received a copy of the GNU General Public | 20 | * You should have received a copy of the GNU General Public |
| 21 | * License along with this program; if not, write to the | 21 | * License along with this program; if not, write to the |
| 22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
| 23 | * Boston, MA 021110-1307, USA. | 23 | * Boston, MA 021110-1307, USA. |
| 24 | */ | 24 | */ |
| 25 | 25 | ||
| 26 | #include <linux/capability.h> | 26 | #include <linux/capability.h> |
| 27 | #include <linux/fs.h> | 27 | #include <linux/fs.h> |
| 28 | #include <linux/types.h> | 28 | #include <linux/types.h> |
| 29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
| 30 | #include <linux/highmem.h> | 30 | #include <linux/highmem.h> |
| 31 | #include <linux/pagemap.h> | 31 | #include <linux/pagemap.h> |
| 32 | #include <linux/uio.h> | 32 | #include <linux/uio.h> |
| 33 | #include <linux/sched.h> | 33 | #include <linux/sched.h> |
| 34 | #include <linux/splice.h> | 34 | #include <linux/splice.h> |
| 35 | #include <linux/mount.h> | 35 | #include <linux/mount.h> |
| 36 | #include <linux/writeback.h> | 36 | #include <linux/writeback.h> |
| 37 | 37 | ||
| 38 | #define MLOG_MASK_PREFIX ML_INODE | 38 | #define MLOG_MASK_PREFIX ML_INODE |
| 39 | #include <cluster/masklog.h> | 39 | #include <cluster/masklog.h> |
| 40 | 40 | ||
| 41 | #include "ocfs2.h" | 41 | #include "ocfs2.h" |
| 42 | 42 | ||
| 43 | #include "alloc.h" | 43 | #include "alloc.h" |
| 44 | #include "aops.h" | 44 | #include "aops.h" |
| 45 | #include "dir.h" | 45 | #include "dir.h" |
| 46 | #include "dlmglue.h" | 46 | #include "dlmglue.h" |
| 47 | #include "extent_map.h" | 47 | #include "extent_map.h" |
| 48 | #include "file.h" | 48 | #include "file.h" |
| 49 | #include "sysfile.h" | 49 | #include "sysfile.h" |
| 50 | #include "inode.h" | 50 | #include "inode.h" |
| 51 | #include "ioctl.h" | 51 | #include "ioctl.h" |
| 52 | #include "journal.h" | 52 | #include "journal.h" |
| 53 | #include "mmap.h" | 53 | #include "mmap.h" |
| 54 | #include "suballoc.h" | 54 | #include "suballoc.h" |
| 55 | #include "super.h" | 55 | #include "super.h" |
| 56 | 56 | ||
| 57 | #include "buffer_head_io.h" | 57 | #include "buffer_head_io.h" |
| 58 | 58 | ||
| 59 | static int ocfs2_sync_inode(struct inode *inode) | 59 | static int ocfs2_sync_inode(struct inode *inode) |
| 60 | { | 60 | { |
| 61 | filemap_fdatawrite(inode->i_mapping); | 61 | filemap_fdatawrite(inode->i_mapping); |
| 62 | return sync_mapping_buffers(inode->i_mapping); | 62 | return sync_mapping_buffers(inode->i_mapping); |
| 63 | } | 63 | } |
| 64 | 64 | ||
| 65 | static int ocfs2_file_open(struct inode *inode, struct file *file) | 65 | static int ocfs2_file_open(struct inode *inode, struct file *file) |
| 66 | { | 66 | { |
| 67 | int status; | 67 | int status; |
| 68 | int mode = file->f_flags; | 68 | int mode = file->f_flags; |
| 69 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 69 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
| 70 | 70 | ||
| 71 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, | 71 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, |
| 72 | file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); | 72 | file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); |
| 73 | 73 | ||
| 74 | spin_lock(&oi->ip_lock); | 74 | spin_lock(&oi->ip_lock); |
| 75 | 75 | ||
| 76 | /* Check that the inode hasn't been wiped from disk by another | 76 | /* Check that the inode hasn't been wiped from disk by another |
| 77 | * node. If it hasn't then we're safe as long as we hold the | 77 | * node. If it hasn't then we're safe as long as we hold the |
| 78 | * spin lock until our increment of open count. */ | 78 | * spin lock until our increment of open count. */ |
| 79 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { | 79 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { |
| 80 | spin_unlock(&oi->ip_lock); | 80 | spin_unlock(&oi->ip_lock); |
| 81 | 81 | ||
| 82 | status = -ENOENT; | 82 | status = -ENOENT; |
| 83 | goto leave; | 83 | goto leave; |
| 84 | } | 84 | } |
| 85 | 85 | ||
| 86 | if (mode & O_DIRECT) | 86 | if (mode & O_DIRECT) |
| 87 | oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; | 87 | oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; |
| 88 | 88 | ||
| 89 | oi->ip_open_count++; | 89 | oi->ip_open_count++; |
| 90 | spin_unlock(&oi->ip_lock); | 90 | spin_unlock(&oi->ip_lock); |
| 91 | status = 0; | 91 | status = 0; |
| 92 | leave: | 92 | leave: |
| 93 | mlog_exit(status); | 93 | mlog_exit(status); |
| 94 | return status; | 94 | return status; |
| 95 | } | 95 | } |
| 96 | 96 | ||
| 97 | static int ocfs2_file_release(struct inode *inode, struct file *file) | 97 | static int ocfs2_file_release(struct inode *inode, struct file *file) |
| 98 | { | 98 | { |
| 99 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 99 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
| 100 | 100 | ||
| 101 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, | 101 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, |
| 102 | file->f_path.dentry->d_name.len, | 102 | file->f_path.dentry->d_name.len, |
| 103 | file->f_path.dentry->d_name.name); | 103 | file->f_path.dentry->d_name.name); |
| 104 | 104 | ||
| 105 | spin_lock(&oi->ip_lock); | 105 | spin_lock(&oi->ip_lock); |
| 106 | if (!--oi->ip_open_count) | 106 | if (!--oi->ip_open_count) |
| 107 | oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; | 107 | oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; |
| 108 | spin_unlock(&oi->ip_lock); | 108 | spin_unlock(&oi->ip_lock); |
| 109 | 109 | ||
| 110 | mlog_exit(0); | 110 | mlog_exit(0); |
| 111 | 111 | ||
| 112 | return 0; | 112 | return 0; |
| 113 | } | 113 | } |
| 114 | 114 | ||
| 115 | static int ocfs2_sync_file(struct file *file, | 115 | static int ocfs2_sync_file(struct file *file, |
| 116 | struct dentry *dentry, | 116 | struct dentry *dentry, |
| 117 | int datasync) | 117 | int datasync) |
| 118 | { | 118 | { |
| 119 | int err = 0; | 119 | int err = 0; |
| 120 | journal_t *journal; | 120 | journal_t *journal; |
| 121 | struct inode *inode = dentry->d_inode; | 121 | struct inode *inode = dentry->d_inode; |
| 122 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 122 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 123 | 123 | ||
| 124 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, | 124 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, |
| 125 | dentry->d_name.len, dentry->d_name.name); | 125 | dentry->d_name.len, dentry->d_name.name); |
| 126 | 126 | ||
| 127 | err = ocfs2_sync_inode(dentry->d_inode); | 127 | err = ocfs2_sync_inode(dentry->d_inode); |
| 128 | if (err) | 128 | if (err) |
| 129 | goto bail; | 129 | goto bail; |
| 130 | 130 | ||
| 131 | journal = osb->journal->j_journal; | 131 | journal = osb->journal->j_journal; |
| 132 | err = journal_force_commit(journal); | 132 | err = journal_force_commit(journal); |
| 133 | 133 | ||
| 134 | bail: | 134 | bail: |
| 135 | mlog_exit(err); | 135 | mlog_exit(err); |
| 136 | 136 | ||
| 137 | return (err < 0) ? -EIO : 0; | 137 | return (err < 0) ? -EIO : 0; |
| 138 | } | 138 | } |
| 139 | 139 | ||
| 140 | int ocfs2_should_update_atime(struct inode *inode, | 140 | int ocfs2_should_update_atime(struct inode *inode, |
| 141 | struct vfsmount *vfsmnt) | 141 | struct vfsmount *vfsmnt) |
| 142 | { | 142 | { |
| 143 | struct timespec now; | 143 | struct timespec now; |
| 144 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 144 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 145 | 145 | ||
| 146 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) | 146 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) |
| 147 | return 0; | 147 | return 0; |
| 148 | 148 | ||
| 149 | if ((inode->i_flags & S_NOATIME) || | 149 | if ((inode->i_flags & S_NOATIME) || |
| 150 | ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) | 150 | ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) |
| 151 | return 0; | 151 | return 0; |
| 152 | 152 | ||
| 153 | /* | 153 | /* |
| 154 | * We can be called with no vfsmnt structure - NFSD will | 154 | * We can be called with no vfsmnt structure - NFSD will |
| 155 | * sometimes do this. | 155 | * sometimes do this. |
| 156 | * | 156 | * |
| 157 | * Note that our action here is different than touch_atime() - | 157 | * Note that our action here is different than touch_atime() - |
| 158 | * if we can't tell whether this is a noatime mount, then we | 158 | * if we can't tell whether this is a noatime mount, then we |
| 159 | * don't know whether to trust the value of s_atime_quantum. | 159 | * don't know whether to trust the value of s_atime_quantum. |
| 160 | */ | 160 | */ |
| 161 | if (vfsmnt == NULL) | 161 | if (vfsmnt == NULL) |
| 162 | return 0; | 162 | return 0; |
| 163 | 163 | ||
| 164 | if ((vfsmnt->mnt_flags & MNT_NOATIME) || | 164 | if ((vfsmnt->mnt_flags & MNT_NOATIME) || |
| 165 | ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) | 165 | ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) |
| 166 | return 0; | 166 | return 0; |
| 167 | 167 | ||
| 168 | if (vfsmnt->mnt_flags & MNT_RELATIME) { | 168 | if (vfsmnt->mnt_flags & MNT_RELATIME) { |
| 169 | if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || | 169 | if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || |
| 170 | (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) | 170 | (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) |
| 171 | return 1; | 171 | return 1; |
| 172 | 172 | ||
| 173 | return 0; | 173 | return 0; |
| 174 | } | 174 | } |
| 175 | 175 | ||
| 176 | now = CURRENT_TIME; | 176 | now = CURRENT_TIME; |
| 177 | if (now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum) | 177 | if (now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum) |
| 178 | return 0; | 178 | return 0; |
| 179 | else | 179 | else |
| 180 | return 1; | 180 | return 1; |
| 181 | } | 181 | } |
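The MNT_RELATIME branch above updates atime only when it is not newer than both mtime and ctime; otherwise the s_atime_quantum check applies. A minimal userspace restatement of the relatime test, where ts_cmp() is a hypothetical stand-in for the kernel's timespec_compare():

#include <stdio.h>
#include <time.h>

/* Hypothetical stand-in for timespec_compare(): <0, 0 or >0. */
static int ts_cmp(const struct timespec *a, const struct timespec *b)
{
	if (a->tv_sec != b->tv_sec)
		return a->tv_sec < b->tv_sec ? -1 : 1;
	return (int)(a->tv_nsec - b->tv_nsec);
}

static int relatime_needs_update(const struct timespec *at,
				 const struct timespec *mt,
				 const struct timespec *ct)
{
	return ts_cmp(at, mt) <= 0 || ts_cmp(at, ct) <= 0;
}

int main(void)
{
	struct timespec at = { 100, 0 }, mt = { 50, 0 }, ct = { 200, 0 };

	/* atime is newer than mtime but older than ctime: update. */
	printf("update atime: %d\n", relatime_needs_update(&at, &mt, &ct));
	return 0;
}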
| 182 | 182 | ||
| 183 | int ocfs2_update_inode_atime(struct inode *inode, | 183 | int ocfs2_update_inode_atime(struct inode *inode, |
| 184 | struct buffer_head *bh) | 184 | struct buffer_head *bh) |
| 185 | { | 185 | { |
| 186 | int ret; | 186 | int ret; |
| 187 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 187 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 188 | handle_t *handle; | 188 | handle_t *handle; |
| 189 | 189 | ||
| 190 | mlog_entry_void(); | 190 | mlog_entry_void(); |
| 191 | 191 | ||
| 192 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 192 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
| 193 | if (handle == NULL) { | 193 | if (handle == NULL) { |
| 194 | ret = -ENOMEM; | 194 | ret = -ENOMEM; |
| 195 | mlog_errno(ret); | 195 | mlog_errno(ret); |
| 196 | goto out; | 196 | goto out; |
| 197 | } | 197 | } |
| 198 | 198 | ||
| 199 | inode->i_atime = CURRENT_TIME; | 199 | inode->i_atime = CURRENT_TIME; |
| 200 | ret = ocfs2_mark_inode_dirty(handle, inode, bh); | 200 | ret = ocfs2_mark_inode_dirty(handle, inode, bh); |
| 201 | if (ret < 0) | 201 | if (ret < 0) |
| 202 | mlog_errno(ret); | 202 | mlog_errno(ret); |
| 203 | 203 | ||
| 204 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | 204 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); |
| 205 | out: | 205 | out: |
| 206 | mlog_exit(ret); | 206 | mlog_exit(ret); |
| 207 | return ret; | 207 | return ret; |
| 208 | } | 208 | } |
| 209 | 209 | ||
| 210 | static int ocfs2_set_inode_size(handle_t *handle, | 210 | static int ocfs2_set_inode_size(handle_t *handle, |
| 211 | struct inode *inode, | 211 | struct inode *inode, |
| 212 | struct buffer_head *fe_bh, | 212 | struct buffer_head *fe_bh, |
| 213 | u64 new_i_size) | 213 | u64 new_i_size) |
| 214 | { | 214 | { |
| 215 | int status; | 215 | int status; |
| 216 | 216 | ||
| 217 | mlog_entry_void(); | 217 | mlog_entry_void(); |
| 218 | i_size_write(inode, new_i_size); | 218 | i_size_write(inode, new_i_size); |
| 219 | inode->i_blocks = ocfs2_inode_sector_count(inode); | 219 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
| 220 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 220 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
| 221 | 221 | ||
| 222 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | 222 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); |
| 223 | if (status < 0) { | 223 | if (status < 0) { |
| 224 | mlog_errno(status); | 224 | mlog_errno(status); |
| 225 | goto bail; | 225 | goto bail; |
| 226 | } | 226 | } |
| 227 | 227 | ||
| 228 | bail: | 228 | bail: |
| 229 | mlog_exit(status); | 229 | mlog_exit(status); |
| 230 | return status; | 230 | return status; |
| 231 | } | 231 | } |
| 232 | 232 | ||
| 233 | static int ocfs2_simple_size_update(struct inode *inode, | 233 | static int ocfs2_simple_size_update(struct inode *inode, |
| 234 | struct buffer_head *di_bh, | 234 | struct buffer_head *di_bh, |
| 235 | u64 new_i_size) | 235 | u64 new_i_size) |
| 236 | { | 236 | { |
| 237 | int ret; | 237 | int ret; |
| 238 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 238 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 239 | handle_t *handle = NULL; | 239 | handle_t *handle = NULL; |
| 240 | 240 | ||
| 241 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 241 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
| 242 | if (handle == NULL) { | 242 | if (handle == NULL) { |
| 243 | ret = -ENOMEM; | 243 | ret = -ENOMEM; |
| 244 | mlog_errno(ret); | 244 | mlog_errno(ret); |
| 245 | goto out; | 245 | goto out; |
| 246 | } | 246 | } |
| 247 | 247 | ||
| 248 | ret = ocfs2_set_inode_size(handle, inode, di_bh, | 248 | ret = ocfs2_set_inode_size(handle, inode, di_bh, |
| 249 | new_i_size); | 249 | new_i_size); |
| 250 | if (ret < 0) | 250 | if (ret < 0) |
| 251 | mlog_errno(ret); | 251 | mlog_errno(ret); |
| 252 | 252 | ||
| 253 | ocfs2_commit_trans(osb, handle); | 253 | ocfs2_commit_trans(osb, handle); |
| 254 | out: | 254 | out: |
| 255 | return ret; | 255 | return ret; |
| 256 | } | 256 | } |
| 257 | 257 | ||
| 258 | static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, | 258 | static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, |
| 259 | struct inode *inode, | 259 | struct inode *inode, |
| 260 | struct buffer_head *fe_bh, | 260 | struct buffer_head *fe_bh, |
| 261 | u64 new_i_size) | 261 | u64 new_i_size) |
| 262 | { | 262 | { |
| 263 | int status; | 263 | int status; |
| 264 | handle_t *handle; | 264 | handle_t *handle; |
| 265 | struct ocfs2_dinode *di; | 265 | struct ocfs2_dinode *di; |
| 266 | 266 | ||
| 267 | mlog_entry_void(); | 267 | mlog_entry_void(); |
| 268 | 268 | ||
| 269 | /* TODO: This needs to actually orphan the inode in this | 269 | /* TODO: This needs to actually orphan the inode in this |
| 270 | * transaction. */ | 270 | * transaction. */ |
| 271 | 271 | ||
| 272 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 272 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
| 273 | if (IS_ERR(handle)) { | 273 | if (IS_ERR(handle)) { |
| 274 | status = PTR_ERR(handle); | 274 | status = PTR_ERR(handle); |
| 275 | mlog_errno(status); | 275 | mlog_errno(status); |
| 276 | goto out; | 276 | goto out; |
| 277 | } | 277 | } |
| 278 | 278 | ||
| 279 | status = ocfs2_journal_access(handle, inode, fe_bh, | 279 | status = ocfs2_journal_access(handle, inode, fe_bh, |
| 280 | OCFS2_JOURNAL_ACCESS_WRITE); | 280 | OCFS2_JOURNAL_ACCESS_WRITE); |
| 281 | if (status < 0) { | 281 | if (status < 0) { |
| 282 | mlog_errno(status); | 282 | mlog_errno(status); |
| 283 | goto out_commit; | 283 | goto out_commit; |
| 284 | } | 284 | } |
| 285 | 285 | ||
| 286 | /* | 286 | /* |
| 287 | * Do this before setting i_size. | 287 | * Do this before setting i_size. |
| 288 | */ | 288 | */ |
| 289 | status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); | 289 | status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); |
| 290 | if (status) { | 290 | if (status) { |
| 291 | mlog_errno(status); | 291 | mlog_errno(status); |
| 292 | goto out_commit; | 292 | goto out_commit; |
| 293 | } | 293 | } |
| 294 | 294 | ||
| 295 | i_size_write(inode, new_i_size); | 295 | i_size_write(inode, new_i_size); |
| 296 | inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); | 296 | inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); |
| 297 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 297 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
| 298 | 298 | ||
| 299 | di = (struct ocfs2_dinode *) fe_bh->b_data; | 299 | di = (struct ocfs2_dinode *) fe_bh->b_data; |
| 300 | di->i_size = cpu_to_le64(new_i_size); | 300 | di->i_size = cpu_to_le64(new_i_size); |
| 301 | di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); | 301 | di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); |
| 302 | di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); | 302 | di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); |
| 303 | 303 | ||
| 304 | status = ocfs2_journal_dirty(handle, fe_bh); | 304 | status = ocfs2_journal_dirty(handle, fe_bh); |
| 305 | if (status < 0) | 305 | if (status < 0) |
| 306 | mlog_errno(status); | 306 | mlog_errno(status); |
| 307 | 307 | ||
| 308 | out_commit: | 308 | out_commit: |
| 309 | ocfs2_commit_trans(osb, handle); | 309 | ocfs2_commit_trans(osb, handle); |
| 310 | out: | 310 | out: |
| 311 | 311 | ||
| 312 | mlog_exit(status); | 312 | mlog_exit(status); |
| 313 | return status; | 313 | return status; |
| 314 | } | 314 | } |
| 315 | 315 | ||
| 316 | static int ocfs2_truncate_file(struct inode *inode, | 316 | static int ocfs2_truncate_file(struct inode *inode, |
| 317 | struct buffer_head *di_bh, | 317 | struct buffer_head *di_bh, |
| 318 | u64 new_i_size) | 318 | u64 new_i_size) |
| 319 | { | 319 | { |
| 320 | int status = 0; | 320 | int status = 0; |
| 321 | struct ocfs2_dinode *fe = NULL; | 321 | struct ocfs2_dinode *fe = NULL; |
| 322 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 322 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 323 | struct ocfs2_truncate_context *tc = NULL; | 323 | struct ocfs2_truncate_context *tc = NULL; |
| 324 | 324 | ||
| 325 | mlog_entry("(inode = %llu, new_i_size = %llu\n", | 325 | mlog_entry("(inode = %llu, new_i_size = %llu\n", |
| 326 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 326 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
| 327 | (unsigned long long)new_i_size); | 327 | (unsigned long long)new_i_size); |
| 328 | 328 | ||
| 329 | unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); | 329 | unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); |
| 330 | truncate_inode_pages(inode->i_mapping, new_i_size); | 330 | truncate_inode_pages(inode->i_mapping, new_i_size); |
| 331 | 331 | ||
| 332 | fe = (struct ocfs2_dinode *) di_bh->b_data; | 332 | fe = (struct ocfs2_dinode *) di_bh->b_data; |
| 333 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 333 | if (!OCFS2_IS_VALID_DINODE(fe)) { |
| 334 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | 334 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); |
| 335 | status = -EIO; | 335 | status = -EIO; |
| 336 | goto bail; | 336 | goto bail; |
| 337 | } | 337 | } |
| 338 | 338 | ||
| 339 | mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), | 339 | mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), |
| 340 | "Inode %llu, inode i_size = %lld != di " | 340 | "Inode %llu, inode i_size = %lld != di " |
| 341 | "i_size = %llu, i_flags = 0x%x\n", | 341 | "i_size = %llu, i_flags = 0x%x\n", |
| 342 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 342 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
| 343 | i_size_read(inode), | 343 | i_size_read(inode), |
| 344 | (unsigned long long)le64_to_cpu(fe->i_size), | 344 | (unsigned long long)le64_to_cpu(fe->i_size), |
| 345 | le32_to_cpu(fe->i_flags)); | 345 | le32_to_cpu(fe->i_flags)); |
| 346 | 346 | ||
| 347 | if (new_i_size > le64_to_cpu(fe->i_size)) { | 347 | if (new_i_size > le64_to_cpu(fe->i_size)) { |
| 348 | mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", | 348 | mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", |
| 349 | (unsigned long long)le64_to_cpu(fe->i_size), | 349 | (unsigned long long)le64_to_cpu(fe->i_size), |
| 350 | (unsigned long long)new_i_size); | 350 | (unsigned long long)new_i_size); |
| 351 | status = -EINVAL; | 351 | status = -EINVAL; |
| 352 | mlog_errno(status); | 352 | mlog_errno(status); |
| 353 | goto bail; | 353 | goto bail; |
| 354 | } | 354 | } |
| 355 | 355 | ||
| 356 | mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", | 356 | mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", |
| 357 | (unsigned long long)le64_to_cpu(fe->i_blkno), | 357 | (unsigned long long)le64_to_cpu(fe->i_blkno), |
| 358 | (unsigned long long)le64_to_cpu(fe->i_size), | 358 | (unsigned long long)le64_to_cpu(fe->i_size), |
| 359 | (unsigned long long)new_i_size); | 359 | (unsigned long long)new_i_size); |
| 360 | 360 | ||
| 361 | /* let's handle the simple truncate cases before doing any more | 361 | /* let's handle the simple truncate cases before doing any more |
| 362 | * cluster locking. */ | 362 | * cluster locking. */ |
| 363 | if (new_i_size == le64_to_cpu(fe->i_size)) | 363 | if (new_i_size == le64_to_cpu(fe->i_size)) |
| 364 | goto bail; | 364 | goto bail; |
| 365 | 365 | ||
| 366 | /* This forces other nodes to sync and drop their pages. Do | 366 | /* This forces other nodes to sync and drop their pages. Do |
| 367 | * this even if we have a truncate without allocation change - | 367 | * this even if we have a truncate without allocation change - |
| 368 | * ocfs2 cluster sizes can be much greater than page size, so | 368 | * ocfs2 cluster sizes can be much greater than page size, so |
| 369 | * we have to truncate them anyway. */ | 369 | * we have to truncate them anyway. */ |
| 370 | status = ocfs2_data_lock(inode, 1); | 370 | status = ocfs2_data_lock(inode, 1); |
| 371 | if (status < 0) { | 371 | if (status < 0) { |
| 372 | mlog_errno(status); | 372 | mlog_errno(status); |
| 373 | goto bail; | 373 | goto bail; |
| 374 | } | 374 | } |
| 375 | 375 | ||
| 376 | /* alright, we're going to need to do a full blown alloc size | 376 | /* alright, we're going to need to do a full blown alloc size |
| 377 | * change. Orphan the inode so that recovery can complete the | 377 | * change. Orphan the inode so that recovery can complete the |
| 378 | * truncate if necessary. This does the task of marking | 378 | * truncate if necessary. This does the task of marking |
| 379 | * i_size. */ | 379 | * i_size. */ |
| 380 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); | 380 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); |
| 381 | if (status < 0) { | 381 | if (status < 0) { |
| 382 | mlog_errno(status); | 382 | mlog_errno(status); |
| 383 | goto bail_unlock_data; | 383 | goto bail_unlock_data; |
| 384 | } | 384 | } |
| 385 | 385 | ||
| 386 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); | 386 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); |
| 387 | if (status < 0) { | 387 | if (status < 0) { |
| 388 | mlog_errno(status); | 388 | mlog_errno(status); |
| 389 | goto bail_unlock_data; | 389 | goto bail_unlock_data; |
| 390 | } | 390 | } |
| 391 | 391 | ||
| 392 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); | 392 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); |
| 393 | if (status < 0) { | 393 | if (status < 0) { |
| 394 | mlog_errno(status); | 394 | mlog_errno(status); |
| 395 | goto bail_unlock_data; | 395 | goto bail_unlock_data; |
| 396 | } | 396 | } |
| 397 | 397 | ||
| 398 | /* TODO: orphan dir cleanup here. */ | 398 | /* TODO: orphan dir cleanup here. */ |
| 399 | bail_unlock_data: | 399 | bail_unlock_data: |
| 400 | ocfs2_data_unlock(inode, 1); | 400 | ocfs2_data_unlock(inode, 1); |
| 401 | 401 | ||
| 402 | bail: | 402 | bail: |
| 403 | 403 | ||
| 404 | mlog_exit(status); | 404 | mlog_exit(status); |
| 405 | return status; | 405 | return status; |
| 406 | } | 406 | } |
| 407 | 407 | ||
| 408 | /* | 408 | /* |
| 409 | * extend allocation only here. | 409 | * extend allocation only here. |
| 410 | * we'll update all the disk stuff, and oip->alloc_size | 410 | * we'll update all the disk stuff, and oip->alloc_size |
| 411 | * | 411 | * |
| 412 | * expect stuff to be locked, a transaction started and enough data / | 412 | * expect stuff to be locked, a transaction started and enough data / |
| 413 | * metadata reservations in the contexts. | 413 | * metadata reservations in the contexts. |
| 414 | * | 414 | * |
| 415 | * Will return -EAGAIN, and a reason if a restart is needed. | 415 | * Will return -EAGAIN, and a reason if a restart is needed. |
| 416 | * If passed in, *reason will always be set, even in error. | 416 | * If passed in, *reason will always be set, even in error. |
| 417 | */ | 417 | */ |
| 418 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | 418 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, |
| 419 | struct inode *inode, | 419 | struct inode *inode, |
| 420 | u32 *logical_offset, | 420 | u32 *logical_offset, |
| 421 | u32 clusters_to_add, | 421 | u32 clusters_to_add, |
| 422 | struct buffer_head *fe_bh, | 422 | struct buffer_head *fe_bh, |
| 423 | handle_t *handle, | 423 | handle_t *handle, |
| 424 | struct ocfs2_alloc_context *data_ac, | 424 | struct ocfs2_alloc_context *data_ac, |
| 425 | struct ocfs2_alloc_context *meta_ac, | 425 | struct ocfs2_alloc_context *meta_ac, |
| 426 | enum ocfs2_alloc_restarted *reason_ret) | 426 | enum ocfs2_alloc_restarted *reason_ret) |
| 427 | { | 427 | { |
| 428 | int status = 0; | 428 | int status = 0; |
| 429 | int free_extents; | 429 | int free_extents; |
| 430 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; | 430 | struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; |
| 431 | enum ocfs2_alloc_restarted reason = RESTART_NONE; | 431 | enum ocfs2_alloc_restarted reason = RESTART_NONE; |
| 432 | u32 bit_off, num_bits; | 432 | u32 bit_off, num_bits; |
| 433 | u64 block; | 433 | u64 block; |
| 434 | 434 | ||
| 435 | BUG_ON(!clusters_to_add); | 435 | BUG_ON(!clusters_to_add); |
| 436 | 436 | ||
| 437 | free_extents = ocfs2_num_free_extents(osb, inode, fe); | 437 | free_extents = ocfs2_num_free_extents(osb, inode, fe); |
| 438 | if (free_extents < 0) { | 438 | if (free_extents < 0) { |
| 439 | status = free_extents; | 439 | status = free_extents; |
| 440 | mlog_errno(status); | 440 | mlog_errno(status); |
| 441 | goto leave; | 441 | goto leave; |
| 442 | } | 442 | } |
| 443 | 443 | ||
| 444 | /* there are two cases which could cause us to EAGAIN in the | 444 | /* there are two cases which could cause us to EAGAIN in the |
| 445 | * we-need-more-metadata case: | 445 | * we-need-more-metadata case: |
| 446 | * 1) we haven't reserved *any* | 446 | * 1) we haven't reserved *any* |
| 447 | * 2) we are so fragmented, we've needed to add metadata too | 447 | * 2) we are so fragmented, we've needed to add metadata too |
| 448 | * many times. */ | 448 | * many times. */ |
| 449 | if (!free_extents && !meta_ac) { | 449 | if (!free_extents && !meta_ac) { |
| 450 | mlog(0, "we haven't reserved any metadata!\n"); | 450 | mlog(0, "we haven't reserved any metadata!\n"); |
| 451 | status = -EAGAIN; | 451 | status = -EAGAIN; |
| 452 | reason = RESTART_META; | 452 | reason = RESTART_META; |
| 453 | goto leave; | 453 | goto leave; |
| 454 | } else if ((!free_extents) | 454 | } else if ((!free_extents) |
| 455 | && (ocfs2_alloc_context_bits_left(meta_ac) | 455 | && (ocfs2_alloc_context_bits_left(meta_ac) |
| 456 | < ocfs2_extend_meta_needed(fe))) { | 456 | < ocfs2_extend_meta_needed(fe))) { |
| 457 | mlog(0, "filesystem is really fragmented...\n"); | 457 | mlog(0, "filesystem is really fragmented...\n"); |
| 458 | status = -EAGAIN; | 458 | status = -EAGAIN; |
| 459 | reason = RESTART_META; | 459 | reason = RESTART_META; |
| 460 | goto leave; | 460 | goto leave; |
| 461 | } | 461 | } |
| 462 | 462 | ||
| 463 | status = ocfs2_claim_clusters(osb, handle, data_ac, 1, | 463 | status = ocfs2_claim_clusters(osb, handle, data_ac, 1, |
| 464 | &bit_off, &num_bits); | 464 | &bit_off, &num_bits); |
| 465 | if (status < 0) { | 465 | if (status < 0) { |
| 466 | if (status != -ENOSPC) | 466 | if (status != -ENOSPC) |
| 467 | mlog_errno(status); | 467 | mlog_errno(status); |
| 468 | goto leave; | 468 | goto leave; |
| 469 | } | 469 | } |
| 470 | 470 | ||
| 471 | BUG_ON(num_bits > clusters_to_add); | 471 | BUG_ON(num_bits > clusters_to_add); |
| 472 | 472 | ||
| 473 | /* reserve our write early -- insert_extent may update the inode */ | 473 | /* reserve our write early -- insert_extent may update the inode */ |
| 474 | status = ocfs2_journal_access(handle, inode, fe_bh, | 474 | status = ocfs2_journal_access(handle, inode, fe_bh, |
| 475 | OCFS2_JOURNAL_ACCESS_WRITE); | 475 | OCFS2_JOURNAL_ACCESS_WRITE); |
| 476 | if (status < 0) { | 476 | if (status < 0) { |
| 477 | mlog_errno(status); | 477 | mlog_errno(status); |
| 478 | goto leave; | 478 | goto leave; |
| 479 | } | 479 | } |
| 480 | 480 | ||
| 481 | block = ocfs2_clusters_to_blocks(osb->sb, bit_off); | 481 | block = ocfs2_clusters_to_blocks(osb->sb, bit_off); |
| 482 | mlog(0, "Allocating %u clusters at block %u for inode %llu\n", | 482 | mlog(0, "Allocating %u clusters at block %u for inode %llu\n", |
| 483 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); | 483 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); |
| 484 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, | 484 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, |
| 485 | *logical_offset, block, num_bits, | 485 | *logical_offset, block, num_bits, |
| 486 | meta_ac); | 486 | meta_ac); |
| 487 | if (status < 0) { | 487 | if (status < 0) { |
| 488 | mlog_errno(status); | 488 | mlog_errno(status); |
| 489 | goto leave; | 489 | goto leave; |
| 490 | } | 490 | } |
| 491 | 491 | ||
| 492 | status = ocfs2_journal_dirty(handle, fe_bh); | 492 | status = ocfs2_journal_dirty(handle, fe_bh); |
| 493 | if (status < 0) { | 493 | if (status < 0) { |
| 494 | mlog_errno(status); | 494 | mlog_errno(status); |
| 495 | goto leave; | 495 | goto leave; |
| 496 | } | 496 | } |
| 497 | 497 | ||
| 498 | clusters_to_add -= num_bits; | 498 | clusters_to_add -= num_bits; |
| 499 | *logical_offset += num_bits; | 499 | *logical_offset += num_bits; |
| 500 | 500 | ||
| 501 | if (clusters_to_add) { | 501 | if (clusters_to_add) { |
| 502 | mlog(0, "need to alloc once more, clusters = %u, wanted = " | 502 | mlog(0, "need to alloc once more, clusters = %u, wanted = " |
| 503 | "%u\n", fe->i_clusters, clusters_to_add); | 503 | "%u\n", fe->i_clusters, clusters_to_add); |
| 504 | status = -EAGAIN; | 504 | status = -EAGAIN; |
| 505 | reason = RESTART_TRANS; | 505 | reason = RESTART_TRANS; |
| 506 | } | 506 | } |
| 507 | 507 | ||
| 508 | leave: | 508 | leave: |
| 509 | mlog_exit(status); | 509 | mlog_exit(status); |
| 510 | if (reason_ret) | 510 | if (reason_ret) |
| 511 | *reason_ret = reason; | 511 | *reason_ret = reason; |
| 512 | return status; | 512 | return status; |
| 513 | } | 513 | } |
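The -EAGAIN contract above tells the caller why to restart: RESTART_META means more metadata must be reserved before retrying, RESTART_TRANS means the journal handle should be extended and the allocation resumed. A toy caller loop showing the shape of that protocol; do_extend() here is a stand-in, not an ocfs2 entry point:

#include <errno.h>
#include <stdio.h>

enum restart { RESTART_NONE, RESTART_TRANS, RESTART_META };

/* Pretend allocator: hands out at most 4 clusters per call and asks for
 * a transaction restart whenever work remains. */
static int do_extend(unsigned int want, unsigned int *got, enum restart *why)
{
	*got = want < 4 ? want : 4;
	*why = (*got < want) ? RESTART_TRANS : RESTART_NONE;
	return (*why == RESTART_NONE) ? 0 : -EAGAIN;
}

int main(void)
{
	unsigned int want = 10, got;
	enum restart why;
	int ret;

	do {
		ret = do_extend(want, &got, &why);
		if (ret < 0 && ret != -EAGAIN)
			return 1;		/* hard failure */
		want -= got;			/* partial progress counts */
		if (why == RESTART_TRANS)
			printf("extend handle, %u clusters left\n", want);
		else if (why == RESTART_META)
			printf("re-reserve metadata, then retry\n");
	} while (want);

	return 0;
}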
| 514 | 514 | ||
| 515 | /* | 515 | /* |
| 516 | * For a given allocation, determine which allocators will need to be | 516 | * For a given allocation, determine which allocators will need to be |
| 517 | * accessed, and lock them, reserving the appropriate number of bits. | 517 | * accessed, and lock them, reserving the appropriate number of bits. |
| 518 | * | 518 | * |
| 519 | * Called from ocfs2_extend_allocation() for file systems which don't | 519 | * Called from ocfs2_extend_allocation() for file systems which don't |
| 520 | * support holes, and from ocfs2_write() for file systems which | 520 | * support holes, and from ocfs2_write() for file systems which |
| 521 | * understand sparse inodes. | 521 | * understand sparse inodes. |
| 522 | */ | 522 | */ |
| 523 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, | 523 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, |
| 524 | u32 clusters_to_add, | 524 | u32 clusters_to_add, |
| 525 | struct ocfs2_alloc_context **data_ac, | 525 | struct ocfs2_alloc_context **data_ac, |
| 526 | struct ocfs2_alloc_context **meta_ac) | 526 | struct ocfs2_alloc_context **meta_ac) |
| 527 | { | 527 | { |
| 528 | int ret, num_free_extents; | 528 | int ret, num_free_extents; |
| 529 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 529 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 530 | 530 | ||
| 531 | *meta_ac = NULL; | 531 | *meta_ac = NULL; |
| 532 | *data_ac = NULL; | 532 | *data_ac = NULL; |
| 533 | 533 | ||
| 534 | mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " | 534 | mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " |
| 535 | "clusters_to_add = %u\n", | 535 | "clusters_to_add = %u\n", |
| 536 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), | 536 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), |
| 537 | le32_to_cpu(di->i_clusters), clusters_to_add); | 537 | le32_to_cpu(di->i_clusters), clusters_to_add); |
| 538 | 538 | ||
| 539 | num_free_extents = ocfs2_num_free_extents(osb, inode, di); | 539 | num_free_extents = ocfs2_num_free_extents(osb, inode, di); |
| 540 | if (num_free_extents < 0) { | 540 | if (num_free_extents < 0) { |
| 541 | ret = num_free_extents; | 541 | ret = num_free_extents; |
| 542 | mlog_errno(ret); | 542 | mlog_errno(ret); |
| 543 | goto out; | 543 | goto out; |
| 544 | } | 544 | } |
| 545 | 545 | ||
| 546 | /* | 546 | /* |
| 547 | * Sparse allocation file systems need to be more conservative | 547 | * Sparse allocation file systems need to be more conservative |
| 548 | * with reserving room for expansion - the actual allocation | 548 | * with reserving room for expansion - the actual allocation |
| 549 | * happens while we've got a journal handle open so re-taking | 549 | * happens while we've got a journal handle open so re-taking |
| 550 | * a cluster lock (because we ran out of room for another | 550 | * a cluster lock (because we ran out of room for another |
| 551 | * extent) will violate ordering rules. | 551 | * extent) will violate ordering rules. |
| 552 | * | 552 | * |
| 553 | * Most of the time we'll only be seeing this 1 cluster at a time | 553 | * Most of the time we'll only be seeing this 1 cluster at a time |
| 554 | * anyway. | 554 | * anyway. |
| 555 | */ | 555 | */ |
| 556 | if (!num_free_extents || | 556 | if (!num_free_extents || |
| 557 | (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { | 557 | (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { |
| 558 | ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); | 558 | ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); |
| 559 | if (ret < 0) { | 559 | if (ret < 0) { |
| 560 | if (ret != -ENOSPC) | 560 | if (ret != -ENOSPC) |
| 561 | mlog_errno(ret); | 561 | mlog_errno(ret); |
| 562 | goto out; | 562 | goto out; |
| 563 | } | 563 | } |
| 564 | } | 564 | } |
| 565 | 565 | ||
| 566 | ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); | 566 | ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); |
| 567 | if (ret < 0) { | 567 | if (ret < 0) { |
| 568 | if (ret != -ENOSPC) | 568 | if (ret != -ENOSPC) |
| 569 | mlog_errno(ret); | 569 | mlog_errno(ret); |
| 570 | goto out; | 570 | goto out; |
| 571 | } | 571 | } |
| 572 | 572 | ||
| 573 | out: | 573 | out: |
| 574 | if (ret) { | 574 | if (ret) { |
| 575 | if (*meta_ac) { | 575 | if (*meta_ac) { |
| 576 | ocfs2_free_alloc_context(*meta_ac); | 576 | ocfs2_free_alloc_context(*meta_ac); |
| 577 | *meta_ac = NULL; | 577 | *meta_ac = NULL; |
| 578 | } | 578 | } |
| 579 | 579 | ||
| 580 | /* | 580 | /* |
| 581 | * We cannot have an error and a non-null *data_ac. | 581 | * We cannot have an error and a non-null *data_ac. |
| 582 | */ | 582 | */ |
| 583 | } | 583 | } |
| 584 | 584 | ||
| 585 | return ret; | 585 | return ret; |
| 586 | } | 586 | } |
| 587 | 587 | ||
| 588 | static int ocfs2_extend_allocation(struct inode *inode, | 588 | static int ocfs2_extend_allocation(struct inode *inode, |
| 589 | u32 clusters_to_add) | 589 | u32 clusters_to_add) |
| 590 | { | 590 | { |
| 591 | int status = 0; | 591 | int status = 0; |
| 592 | int restart_func = 0; | 592 | int restart_func = 0; |
| 593 | int drop_alloc_sem = 0; | 593 | int drop_alloc_sem = 0; |
| 594 | int credits; | 594 | int credits; |
| 595 | u32 prev_clusters, logical_start; | 595 | u32 prev_clusters, logical_start; |
| 596 | struct buffer_head *bh = NULL; | 596 | struct buffer_head *bh = NULL; |
| 597 | struct ocfs2_dinode *fe = NULL; | 597 | struct ocfs2_dinode *fe = NULL; |
| 598 | handle_t *handle = NULL; | 598 | handle_t *handle = NULL; |
| 599 | struct ocfs2_alloc_context *data_ac = NULL; | 599 | struct ocfs2_alloc_context *data_ac = NULL; |
| 600 | struct ocfs2_alloc_context *meta_ac = NULL; | 600 | struct ocfs2_alloc_context *meta_ac = NULL; |
| 601 | enum ocfs2_alloc_restarted why; | 601 | enum ocfs2_alloc_restarted why; |
| 602 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 602 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 603 | 603 | ||
| 604 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); | 604 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); |
| 605 | 605 | ||
| 606 | /* | 606 | /* |
| 607 | * This function only exists for file systems which don't | 607 | * This function only exists for file systems which don't |
| 608 | * support holes. | 608 | * support holes. |
| 609 | */ | 609 | */ |
| 610 | BUG_ON(ocfs2_sparse_alloc(osb)); | 610 | BUG_ON(ocfs2_sparse_alloc(osb)); |
| 611 | 611 | ||
| 612 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, | 612 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, |
| 613 | OCFS2_BH_CACHED, inode); | 613 | OCFS2_BH_CACHED, inode); |
| 614 | if (status < 0) { | 614 | if (status < 0) { |
| 615 | mlog_errno(status); | 615 | mlog_errno(status); |
| 616 | goto leave; | 616 | goto leave; |
| 617 | } | 617 | } |
| 618 | 618 | ||
| 619 | fe = (struct ocfs2_dinode *) bh->b_data; | 619 | fe = (struct ocfs2_dinode *) bh->b_data; |
| 620 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 620 | if (!OCFS2_IS_VALID_DINODE(fe)) { |
| 621 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); | 621 | OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); |
| 622 | status = -EIO; | 622 | status = -EIO; |
| 623 | goto leave; | 623 | goto leave; |
| 624 | } | 624 | } |
| 625 | 625 | ||
| 626 | logical_start = OCFS2_I(inode)->ip_clusters; | 626 | logical_start = OCFS2_I(inode)->ip_clusters; |
| 627 | 627 | ||
| 628 | restart_all: | 628 | restart_all: |
| 629 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); | 629 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); |
| 630 | 630 | ||
| 631 | /* blocks people in read/write from reading our allocation | 631 | /* blocks people in read/write from reading our allocation |
| 632 | * until we're done changing it. We depend on i_mutex to block | 632 | * until we're done changing it. We depend on i_mutex to block |
| 633 | * other extend/truncate calls while we're here. Ordering wrt | 633 | * other extend/truncate calls while we're here. Ordering wrt |
| 634 | * start_trans is important here -- always do it before! */ | 634 | * start_trans is important here -- always do it before! */ |
| 635 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 635 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
| 636 | drop_alloc_sem = 1; | 636 | drop_alloc_sem = 1; |
| 637 | 637 | ||
| 638 | status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac, | 638 | status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac, |
| 639 | &meta_ac); | 639 | &meta_ac); |
| 640 | if (status) { | 640 | if (status) { |
| 641 | mlog_errno(status); | 641 | mlog_errno(status); |
| 642 | goto leave; | 642 | goto leave; |
| 643 | } | 643 | } |
| 644 | 644 | ||
| 645 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); | 645 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); |
| 646 | handle = ocfs2_start_trans(osb, credits); | 646 | handle = ocfs2_start_trans(osb, credits); |
| 647 | if (IS_ERR(handle)) { | 647 | if (IS_ERR(handle)) { |
| 648 | status = PTR_ERR(handle); | 648 | status = PTR_ERR(handle); |
| 649 | handle = NULL; | 649 | handle = NULL; |
| 650 | mlog_errno(status); | 650 | mlog_errno(status); |
| 651 | goto leave; | 651 | goto leave; |
| 652 | } | 652 | } |
| 653 | 653 | ||
| 654 | restarted_transaction: | 654 | restarted_transaction: |
| 655 | /* reserve a write to the file entry early on - that way if we | 655 | /* reserve a write to the file entry early on - that way if we |
| 656 | * run out of credits in the allocation path, we can still | 656 | * run out of credits in the allocation path, we can still |
| 657 | * update i_size. */ | 657 | * update i_size. */ |
| 658 | status = ocfs2_journal_access(handle, inode, bh, | 658 | status = ocfs2_journal_access(handle, inode, bh, |
| 659 | OCFS2_JOURNAL_ACCESS_WRITE); | 659 | OCFS2_JOURNAL_ACCESS_WRITE); |
| 660 | if (status < 0) { | 660 | if (status < 0) { |
| 661 | mlog_errno(status); | 661 | mlog_errno(status); |
| 662 | goto leave; | 662 | goto leave; |
| 663 | } | 663 | } |
| 664 | 664 | ||
| 665 | prev_clusters = OCFS2_I(inode)->ip_clusters; | 665 | prev_clusters = OCFS2_I(inode)->ip_clusters; |
| 666 | 666 | ||
| 667 | status = ocfs2_do_extend_allocation(osb, | 667 | status = ocfs2_do_extend_allocation(osb, |
| 668 | inode, | 668 | inode, |
| 669 | &logical_start, | 669 | &logical_start, |
| 670 | clusters_to_add, | 670 | clusters_to_add, |
| 671 | bh, | 671 | bh, |
| 672 | handle, | 672 | handle, |
| 673 | data_ac, | 673 | data_ac, |
| 674 | meta_ac, | 674 | meta_ac, |
| 675 | &why); | 675 | &why); |
| 676 | if ((status < 0) && (status != -EAGAIN)) { | 676 | if ((status < 0) && (status != -EAGAIN)) { |
| 677 | if (status != -ENOSPC) | 677 | if (status != -ENOSPC) |
| 678 | mlog_errno(status); | 678 | mlog_errno(status); |
| 679 | goto leave; | 679 | goto leave; |
| 680 | } | 680 | } |
| 681 | 681 | ||
| 682 | status = ocfs2_journal_dirty(handle, bh); | 682 | status = ocfs2_journal_dirty(handle, bh); |
| 683 | if (status < 0) { | 683 | if (status < 0) { |
| 684 | mlog_errno(status); | 684 | mlog_errno(status); |
| 685 | goto leave; | 685 | goto leave; |
| 686 | } | 686 | } |
| 687 | 687 | ||
| 688 | spin_lock(&OCFS2_I(inode)->ip_lock); | 688 | spin_lock(&OCFS2_I(inode)->ip_lock); |
| 689 | clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); | 689 | clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); |
| 690 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 690 | spin_unlock(&OCFS2_I(inode)->ip_lock); |
| 691 | 691 | ||
| 692 | if (why != RESTART_NONE && clusters_to_add) { | 692 | if (why != RESTART_NONE && clusters_to_add) { |
| 693 | if (why == RESTART_META) { | 693 | if (why == RESTART_META) { |
| 694 | mlog(0, "restarting function.\n"); | 694 | mlog(0, "restarting function.\n"); |
| 695 | restart_func = 1; | 695 | restart_func = 1; |
| 696 | } else { | 696 | } else { |
| 697 | BUG_ON(why != RESTART_TRANS); | 697 | BUG_ON(why != RESTART_TRANS); |
| 698 | 698 | ||
| 699 | mlog(0, "restarting transaction.\n"); | 699 | mlog(0, "restarting transaction.\n"); |
| 700 | /* TODO: This can be more intelligent. */ | 700 | /* TODO: This can be more intelligent. */ |
| 701 | credits = ocfs2_calc_extend_credits(osb->sb, | 701 | credits = ocfs2_calc_extend_credits(osb->sb, |
| 702 | fe, | 702 | fe, |
| 703 | clusters_to_add); | 703 | clusters_to_add); |
| 704 | status = ocfs2_extend_trans(handle, credits); | 704 | status = ocfs2_extend_trans(handle, credits); |
| 705 | if (status < 0) { | 705 | if (status < 0) { |
| 706 | /* handle still has to be committed at | 706 | /* handle still has to be committed at |
| 707 | * this point. */ | 707 | * this point. */ |
| 708 | status = -ENOMEM; | 708 | status = -ENOMEM; |
| 709 | mlog_errno(status); | 709 | mlog_errno(status); |
| 710 | goto leave; | 710 | goto leave; |
| 711 | } | 711 | } |
| 712 | goto restarted_transaction; | 712 | goto restarted_transaction; |
| 713 | } | 713 | } |
| 714 | } | 714 | } |
| 715 | 715 | ||
| 716 | mlog(0, "fe: i_clusters = %u, i_size=%llu\n", | 716 | mlog(0, "fe: i_clusters = %u, i_size=%llu\n", |
| 717 | le32_to_cpu(fe->i_clusters), | 717 | le32_to_cpu(fe->i_clusters), |
| 718 | (unsigned long long)le64_to_cpu(fe->i_size)); | 718 | (unsigned long long)le64_to_cpu(fe->i_size)); |
| 719 | mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", | 719 | mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", |
| 720 | OCFS2_I(inode)->ip_clusters, i_size_read(inode)); | 720 | OCFS2_I(inode)->ip_clusters, i_size_read(inode)); |
| 721 | 721 | ||
| 722 | leave: | 722 | leave: |
| 723 | if (drop_alloc_sem) { | 723 | if (drop_alloc_sem) { |
| 724 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | 724 | up_write(&OCFS2_I(inode)->ip_alloc_sem); |
| 725 | drop_alloc_sem = 0; | 725 | drop_alloc_sem = 0; |
| 726 | } | 726 | } |
| 727 | if (handle) { | 727 | if (handle) { |
| 728 | ocfs2_commit_trans(osb, handle); | 728 | ocfs2_commit_trans(osb, handle); |
| 729 | handle = NULL; | 729 | handle = NULL; |
| 730 | } | 730 | } |
| 731 | if (data_ac) { | 731 | if (data_ac) { |
| 732 | ocfs2_free_alloc_context(data_ac); | 732 | ocfs2_free_alloc_context(data_ac); |
| 733 | data_ac = NULL; | 733 | data_ac = NULL; |
| 734 | } | 734 | } |
| 735 | if (meta_ac) { | 735 | if (meta_ac) { |
| 736 | ocfs2_free_alloc_context(meta_ac); | 736 | ocfs2_free_alloc_context(meta_ac); |
| 737 | meta_ac = NULL; | 737 | meta_ac = NULL; |
| 738 | } | 738 | } |
| 739 | if ((!status) && restart_func) { | 739 | if ((!status) && restart_func) { |
| 740 | restart_func = 0; | 740 | restart_func = 0; |
| 741 | goto restart_all; | 741 | goto restart_all; |
| 742 | } | 742 | } |
| 743 | if (bh) { | 743 | if (bh) { |
| 744 | brelse(bh); | 744 | brelse(bh); |
| 745 | bh = NULL; | 745 | bh = NULL; |
| 746 | } | 746 | } |
| 747 | 747 | ||
| 748 | mlog_exit(status); | 748 | mlog_exit(status); |
| 749 | return status; | 749 | return status; |
| 750 | } | 750 | } |
| 751 | 751 | ||
| 752 | /* Some parts of this taken from generic_cont_expand, which turned out | 752 | /* Some parts of this taken from generic_cont_expand, which turned out |
| 753 | * to be too fragile to do exactly what we need without us having to | 753 | * to be too fragile to do exactly what we need without us having to |
| 754 | * worry about recursive locking in ->prepare_write() and | 754 | * worry about recursive locking in ->prepare_write() and |
| 755 | * ->commit_write(). */ | 755 | * ->commit_write(). */ |
| 756 | static int ocfs2_write_zero_page(struct inode *inode, | 756 | static int ocfs2_write_zero_page(struct inode *inode, |
| 757 | u64 size) | 757 | u64 size) |
| 758 | { | 758 | { |
| 759 | struct address_space *mapping = inode->i_mapping; | 759 | struct address_space *mapping = inode->i_mapping; |
| 760 | struct page *page; | 760 | struct page *page; |
| 761 | unsigned long index; | 761 | unsigned long index; |
| 762 | unsigned int offset; | 762 | unsigned int offset; |
| 763 | handle_t *handle = NULL; | 763 | handle_t *handle = NULL; |
| 764 | int ret; | 764 | int ret; |
| 765 | 765 | ||
| 766 | offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ | 766 | offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ |
| 767 | /* ugh. in prepare/commit_write, if from==to==start of block, we | 767 | /* ugh. in prepare/commit_write, if from==to==start of block, we |
| 768 | ** skip the prepare. make sure we never send an offset for the start | 768 | ** skip the prepare. make sure we never send an offset for the start |
| 769 | ** of a block | 769 | ** of a block |
| 770 | */ | 770 | */ |
| 771 | if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { | 771 | if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { |
| 772 | offset++; | 772 | offset++; |
| 773 | } | 773 | } |
| 774 | index = size >> PAGE_CACHE_SHIFT; | 774 | index = size >> PAGE_CACHE_SHIFT; |
| 775 | 775 | ||
| 776 | page = grab_cache_page(mapping, index); | 776 | page = grab_cache_page(mapping, index); |
| 777 | if (!page) { | 777 | if (!page) { |
| 778 | ret = -ENOMEM; | 778 | ret = -ENOMEM; |
| 779 | mlog_errno(ret); | 779 | mlog_errno(ret); |
| 780 | goto out; | 780 | goto out; |
| 781 | } | 781 | } |
| 782 | 782 | ||
| 783 | ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); | 783 | ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); |
| 784 | if (ret < 0) { | 784 | if (ret < 0) { |
| 785 | mlog_errno(ret); | 785 | mlog_errno(ret); |
| 786 | goto out_unlock; | 786 | goto out_unlock; |
| 787 | } | 787 | } |
| 788 | 788 | ||
| 789 | if (ocfs2_should_order_data(inode)) { | 789 | if (ocfs2_should_order_data(inode)) { |
| 790 | handle = ocfs2_start_walk_page_trans(inode, page, offset, | 790 | handle = ocfs2_start_walk_page_trans(inode, page, offset, |
| 791 | offset); | 791 | offset); |
| 792 | if (IS_ERR(handle)) { | 792 | if (IS_ERR(handle)) { |
| 793 | ret = PTR_ERR(handle); | 793 | ret = PTR_ERR(handle); |
| 794 | handle = NULL; | 794 | handle = NULL; |
| 795 | goto out_unlock; | 795 | goto out_unlock; |
| 796 | } | 796 | } |
| 797 | } | 797 | } |
| 798 | 798 | ||
| 799 | /* must not update i_size! */ | 799 | /* must not update i_size! */ |
| 800 | ret = block_commit_write(page, offset, offset); | 800 | ret = block_commit_write(page, offset, offset); |
| 801 | if (ret < 0) | 801 | if (ret < 0) |
| 802 | mlog_errno(ret); | 802 | mlog_errno(ret); |
| 803 | else | 803 | else |
| 804 | ret = 0; | 804 | ret = 0; |
| 805 | 805 | ||
| 806 | if (handle) | 806 | if (handle) |
| 807 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | 807 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); |
| 808 | out_unlock: | 808 | out_unlock: |
| 809 | unlock_page(page); | 809 | unlock_page(page); |
| 810 | page_cache_release(page); | 810 | page_cache_release(page); |
| 811 | out: | 811 | out: |
| 812 | return ret; | 812 | return ret; |
| 813 | } | 813 | } |
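
The within-page offset arithmetic above is easy to get wrong, so here is a minimal userspace sketch of it. The 4K page and 512-byte block sizes are illustrative assumptions, not values read from a live filesystem:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)
#define BLOCKSIZE        512UL

int main(void)
{
        unsigned long long size = 9216;          /* hypothetical target size */
        unsigned long index = size >> PAGE_CACHE_SHIFT;      /* page index  */
        unsigned int offset = size & (PAGE_CACHE_SIZE - 1);  /* within page */

        /* prepare/commit_write skip their work when from == to falls on
         * a block boundary, so nudge the offset off the boundary */
        if ((offset & (BLOCKSIZE - 1)) == 0)
                offset++;

        printf("zero page %lu with from = to = %u\n", index, offset);
        return 0;
}
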
| 814 | 814 | ||
| 815 | static int ocfs2_zero_extend(struct inode *inode, | 815 | static int ocfs2_zero_extend(struct inode *inode, |
| 816 | u64 zero_to_size) | 816 | u64 zero_to_size) |
| 817 | { | 817 | { |
| 818 | int ret = 0; | 818 | int ret = 0; |
| 819 | u64 start_off; | 819 | u64 start_off; |
| 820 | struct super_block *sb = inode->i_sb; | 820 | struct super_block *sb = inode->i_sb; |
| 821 | 821 | ||
| 822 | start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); | 822 | start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); |
| 823 | while (start_off < zero_to_size) { | 823 | while (start_off < zero_to_size) { |
| 824 | ret = ocfs2_write_zero_page(inode, start_off); | 824 | ret = ocfs2_write_zero_page(inode, start_off); |
| 825 | if (ret < 0) { | 825 | if (ret < 0) { |
| 826 | mlog_errno(ret); | 826 | mlog_errno(ret); |
| 827 | goto out; | 827 | goto out; |
| 828 | } | 828 | } |
| 829 | 829 | ||
| 830 | start_off += sb->s_blocksize; | 830 | start_off += sb->s_blocksize; |
| 831 | 831 | ||
| 832 | /* | 832 | /* |
| 833 | * Very large extends have the potential to lock up | 833 | * Very large extends have the potential to lock up |
| 834 | * the cpu for extended periods of time. | 834 | * the cpu for extended periods of time. |
| 835 | */ | 835 | */ |
| 836 | cond_resched(); | 836 | cond_resched(); |
| 837 | } | 837 | } |
| 838 | 838 | ||
| 839 | out: | 839 | out: |
| 840 | return ret; | 840 | return ret; |
| 841 | } | 841 | } |
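
ocfs2_zero_extend() then walks that hole one filesystem block at a time, with a cond_resched() per iteration so huge extends don't monopolize the CPU. A standalone sketch of the walk, using an invented 512-byte block size and made-up sizes:

#include <stdio.h>

#define BLOCKSIZE 512ULL

static unsigned long long align_to_block(unsigned long long bytes)
{
        return (bytes + BLOCKSIZE - 1) & ~(BLOCKSIZE - 1);
}

int main(void)
{
        unsigned long long i_size = 1000, zero_to = 3000;
        unsigned long long off, passes = 0;

        /* one ocfs2_write_zero_page() call per block in the gap */
        for (off = align_to_block(i_size); off < zero_to; off += BLOCKSIZE)
                passes++;

        printf("would zero %llu blocks\n", passes);   /* prints 4 */
        return 0;
}
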
| 842 | 842 | ||
| 843 | /* | 843 | /* |
| 844 | * A tail_to_skip value > 0 indicates that we're being called from | 844 | * A tail_to_skip value > 0 indicates that we're being called from |
| 845 | * ocfs2_file_aio_write(). This has the following implications: | 845 | * ocfs2_file_aio_write(). This has the following implications: |
| 846 | * | 846 | * |
| 847 | * - we don't want to update i_size | 847 | * - we don't want to update i_size |
| 848 | * - di_bh will be NULL, which is fine because it's only used in the | 848 | * - di_bh will be NULL, which is fine because it's only used in the |
| 849 | * case where we want to update i_size. | 849 | * case where we want to update i_size. |
| 850 | * - ocfs2_zero_extend() will then only be filling the hole created | 850 | * - ocfs2_zero_extend() will then only be filling the hole created |
| 851 | * between i_size and the start of the write. | 851 | * between i_size and the start of the write. |
| 852 | */ | 852 | */ |
| 853 | static int ocfs2_extend_file(struct inode *inode, | 853 | static int ocfs2_extend_file(struct inode *inode, |
| 854 | struct buffer_head *di_bh, | 854 | struct buffer_head *di_bh, |
| 855 | u64 new_i_size, | 855 | u64 new_i_size, |
| 856 | size_t tail_to_skip) | 856 | size_t tail_to_skip) |
| 857 | { | 857 | { |
| 858 | int ret = 0; | 858 | int ret = 0; |
| 859 | u32 clusters_to_add = 0; | 859 | u32 clusters_to_add = 0; |
| 860 | 860 | ||
| 861 | BUG_ON(!tail_to_skip && !di_bh); | 861 | BUG_ON(!tail_to_skip && !di_bh); |
| 862 | 862 | ||
| 863 | /* setattr sometimes calls us like this. */ | 863 | /* setattr sometimes calls us like this. */ |
| 864 | if (new_i_size == 0) | 864 | if (new_i_size == 0) |
| 865 | goto out; | 865 | goto out; |
| 866 | 866 | ||
| 867 | if (i_size_read(inode) == new_i_size) | 867 | if (i_size_read(inode) == new_i_size) |
| 868 | goto out; | 868 | goto out; |
| 869 | BUG_ON(new_i_size < i_size_read(inode)); | 869 | BUG_ON(new_i_size < i_size_read(inode)); |
| 870 | 870 | ||
| 871 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { | 871 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { |
| 872 | BUG_ON(tail_to_skip != 0); | 872 | BUG_ON(tail_to_skip != 0); |
| 873 | goto out_update_size; | 873 | goto out_update_size; |
| 874 | } | 874 | } |
| 875 | 875 | ||
| 876 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - | 876 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - |
| 877 | OCFS2_I(inode)->ip_clusters; | 877 | OCFS2_I(inode)->ip_clusters; |
| 878 | 878 | ||
| 879 | /* | 879 | /* |
| 880 | * protect the pages that ocfs2_zero_extend is going to be | 880 | * protect the pages that ocfs2_zero_extend is going to be |
| 881 | * pulling into the page cache. We do this before the | 881 | * pulling into the page cache. We do this before the |
| 882 | * metadata extend so that we don't get into the situation | 882 | * metadata extend so that we don't get into the situation |
| 883 | * where we've extended the metadata but can't get the data | 883 | * where we've extended the metadata but can't get the data |
| 884 | * lock to zero. | 884 | * lock to zero. |
| 885 | */ | 885 | */ |
| 886 | ret = ocfs2_data_lock(inode, 1); | 886 | ret = ocfs2_data_lock(inode, 1); |
| 887 | if (ret < 0) { | 887 | if (ret < 0) { |
| 888 | mlog_errno(ret); | 888 | mlog_errno(ret); |
| 889 | goto out; | 889 | goto out; |
| 890 | } | 890 | } |
| 891 | 891 | ||
| 892 | if (clusters_to_add) { | 892 | if (clusters_to_add) { |
| 893 | ret = ocfs2_extend_allocation(inode, clusters_to_add); | 893 | ret = ocfs2_extend_allocation(inode, clusters_to_add); |
| 894 | if (ret < 0) { | 894 | if (ret < 0) { |
| 895 | mlog_errno(ret); | 895 | mlog_errno(ret); |
| 896 | goto out_unlock; | 896 | goto out_unlock; |
| 897 | } | 897 | } |
| 898 | } | 898 | } |
| 899 | 899 | ||
| 900 | /* | 900 | /* |
| 901 | * Call this even if we don't add any clusters to the tree. We | 901 | * Call this even if we don't add any clusters to the tree. We |
| 902 | * still need to zero the area between the old i_size and the | 902 | * still need to zero the area between the old i_size and the |
| 903 | * new i_size. | 903 | * new i_size. |
| 904 | */ | 904 | */ |
| 905 | ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); | 905 | ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); |
| 906 | if (ret < 0) { | 906 | if (ret < 0) { |
| 907 | mlog_errno(ret); | 907 | mlog_errno(ret); |
| 908 | goto out_unlock; | 908 | goto out_unlock; |
| 909 | } | 909 | } |
| 910 | 910 | ||
| 911 | out_update_size: | 911 | out_update_size: |
| 912 | if (!tail_to_skip) { | 912 | if (!tail_to_skip) { |
| 913 | /* We're being called from ocfs2_setattr() which wants | 913 | /* We're being called from ocfs2_setattr() which wants |
| 914 | * us to update i_size */ | 914 | * us to update i_size */ |
| 915 | ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); | 915 | ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); |
| 916 | if (ret < 0) | 916 | if (ret < 0) |
| 917 | mlog_errno(ret); | 917 | mlog_errno(ret); |
| 918 | } | 918 | } |
| 919 | 919 | ||
| 920 | out_unlock: | 920 | out_unlock: |
| 921 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) | 921 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) |
| 922 | ocfs2_data_unlock(inode, 1); | 922 | ocfs2_data_unlock(inode, 1); |
| 923 | 923 | ||
| 924 | out: | 924 | out: |
| 925 | return ret; | 925 | return ret; |
| 926 | } | 926 | } |
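
The only allocation math in ocfs2_extend_file() is the clusters_to_add line: how many whole clusters the new size needs beyond what the inode already owns. A self-contained sketch of that computation, with made-up inputs (1MB clusters, 3 already allocated):

#include <stdio.h>

static unsigned int clusters_for_bytes(unsigned long long bytes,
                                       unsigned int clustersize_bits)
{
        unsigned long long csize = 1ULL << clustersize_bits;

        /* round up to the next whole cluster */
        return (unsigned int)((bytes + csize - 1) >> clustersize_bits);
}

int main(void)
{
        unsigned int bits = 20;             /* assume 1MB clusters */
        unsigned int ip_clusters = 3;       /* assume 3 clusters allocated */
        unsigned long long new_i_size = 5ULL << 20;  /* extend to 5MB */

        unsigned int to_add = clusters_for_bytes(new_i_size, bits) - ip_clusters;
        printf("need %u more clusters\n", to_add);    /* prints 2 */
        return 0;
}
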
| 927 | 927 | ||
| 928 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) | 928 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) |
| 929 | { | 929 | { |
| 930 | int status = 0, size_change; | 930 | int status = 0, size_change; |
| 931 | struct inode *inode = dentry->d_inode; | 931 | struct inode *inode = dentry->d_inode; |
| 932 | struct super_block *sb = inode->i_sb; | 932 | struct super_block *sb = inode->i_sb; |
| 933 | struct ocfs2_super *osb = OCFS2_SB(sb); | 933 | struct ocfs2_super *osb = OCFS2_SB(sb); |
| 934 | struct buffer_head *bh = NULL; | 934 | struct buffer_head *bh = NULL; |
| 935 | handle_t *handle = NULL; | 935 | handle_t *handle = NULL; |
| 936 | 936 | ||
| 937 | mlog_entry("(0x%p, '%.*s')\n", dentry, | 937 | mlog_entry("(0x%p, '%.*s')\n", dentry, |
| 938 | dentry->d_name.len, dentry->d_name.name); | 938 | dentry->d_name.len, dentry->d_name.name); |
| 939 | 939 | ||
| 940 | if (attr->ia_valid & ATTR_MODE) | 940 | if (attr->ia_valid & ATTR_MODE) |
| 941 | mlog(0, "mode change: %d\n", attr->ia_mode); | 941 | mlog(0, "mode change: %d\n", attr->ia_mode); |
| 942 | if (attr->ia_valid & ATTR_UID) | 942 | if (attr->ia_valid & ATTR_UID) |
| 943 | mlog(0, "uid change: %d\n", attr->ia_uid); | 943 | mlog(0, "uid change: %d\n", attr->ia_uid); |
| 944 | if (attr->ia_valid & ATTR_GID) | 944 | if (attr->ia_valid & ATTR_GID) |
| 945 | mlog(0, "gid change: %d\n", attr->ia_gid); | 945 | mlog(0, "gid change: %d\n", attr->ia_gid); |
| 946 | if (attr->ia_valid & ATTR_SIZE) | 946 | if (attr->ia_valid & ATTR_SIZE) |
| 947 | mlog(0, "size change...\n"); | 947 | mlog(0, "size change...\n"); |
| 948 | if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) | 948 | if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) |
| 949 | mlog(0, "time change...\n"); | 949 | mlog(0, "time change...\n"); |
| 950 | 950 | ||
| 951 | #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ | 951 | #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ |
| 952 | | ATTR_GID | ATTR_UID | ATTR_MODE) | 952 | | ATTR_GID | ATTR_UID | ATTR_MODE) |
| 953 | if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { | 953 | if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { |
| 954 | mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); | 954 | mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); |
| 955 | return 0; | 955 | return 0; |
| 956 | } | 956 | } |
| 957 | 957 | ||
| 958 | status = inode_change_ok(inode, attr); | 958 | status = inode_change_ok(inode, attr); |
| 959 | if (status) | 959 | if (status) |
| 960 | return status; | 960 | return status; |
| 961 | 961 | ||
| 962 | size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; | 962 | size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; |
| 963 | if (size_change) { | 963 | if (size_change) { |
| 964 | status = ocfs2_rw_lock(inode, 1); | 964 | status = ocfs2_rw_lock(inode, 1); |
| 965 | if (status < 0) { | 965 | if (status < 0) { |
| 966 | mlog_errno(status); | 966 | mlog_errno(status); |
| 967 | goto bail; | 967 | goto bail; |
| 968 | } | 968 | } |
| 969 | } | 969 | } |
| 970 | 970 | ||
| 971 | status = ocfs2_meta_lock(inode, &bh, 1); | 971 | status = ocfs2_meta_lock(inode, &bh, 1); |
| 972 | if (status < 0) { | 972 | if (status < 0) { |
| 973 | if (status != -ENOENT) | 973 | if (status != -ENOENT) |
| 974 | mlog_errno(status); | 974 | mlog_errno(status); |
| 975 | goto bail_unlock_rw; | 975 | goto bail_unlock_rw; |
| 976 | } | 976 | } |
| 977 | 977 | ||
| 978 | if (size_change && attr->ia_size != i_size_read(inode)) { | 978 | if (size_change && attr->ia_size != i_size_read(inode)) { |
| 979 | if (i_size_read(inode) > attr->ia_size) | 979 | if (i_size_read(inode) > attr->ia_size) |
| 980 | status = ocfs2_truncate_file(inode, bh, attr->ia_size); | 980 | status = ocfs2_truncate_file(inode, bh, attr->ia_size); |
| 981 | else | 981 | else |
| 982 | status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); | 982 | status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); |
| 983 | if (status < 0) { | 983 | if (status < 0) { |
| 984 | if (status != -ENOSPC) | 984 | if (status != -ENOSPC) |
| 985 | mlog_errno(status); | 985 | mlog_errno(status); |
| 986 | status = -ENOSPC; | 986 | status = -ENOSPC; |
| 987 | goto bail_unlock; | 987 | goto bail_unlock; |
| 988 | } | 988 | } |
| 989 | } | 989 | } |
| 990 | 990 | ||
| 991 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 991 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
| 992 | if (IS_ERR(handle)) { | 992 | if (IS_ERR(handle)) { |
| 993 | status = PTR_ERR(handle); | 993 | status = PTR_ERR(handle); |
| 994 | mlog_errno(status); | 994 | mlog_errno(status); |
| 995 | goto bail_unlock; | 995 | goto bail_unlock; |
| 996 | } | 996 | } |
| 997 | 997 | ||
| 998 | status = inode_setattr(inode, attr); | 998 | status = inode_setattr(inode, attr); |
| 999 | if (status < 0) { | 999 | if (status < 0) { |
| 1000 | mlog_errno(status); | 1000 | mlog_errno(status); |
| 1001 | goto bail_commit; | 1001 | goto bail_commit; |
| 1002 | } | 1002 | } |
| 1003 | 1003 | ||
| 1004 | status = ocfs2_mark_inode_dirty(handle, inode, bh); | 1004 | status = ocfs2_mark_inode_dirty(handle, inode, bh); |
| 1005 | if (status < 0) | 1005 | if (status < 0) |
| 1006 | mlog_errno(status); | 1006 | mlog_errno(status); |
| 1007 | 1007 | ||
| 1008 | bail_commit: | 1008 | bail_commit: |
| 1009 | ocfs2_commit_trans(osb, handle); | 1009 | ocfs2_commit_trans(osb, handle); |
| 1010 | bail_unlock: | 1010 | bail_unlock: |
| 1011 | ocfs2_meta_unlock(inode, 1); | 1011 | ocfs2_meta_unlock(inode, 1); |
| 1012 | bail_unlock_rw: | 1012 | bail_unlock_rw: |
| 1013 | if (size_change) | 1013 | if (size_change) |
| 1014 | ocfs2_rw_unlock(inode, 1); | 1014 | ocfs2_rw_unlock(inode, 1); |
| 1015 | bail: | 1015 | bail: |
| 1016 | if (bh) | 1016 | if (bh) |
| 1017 | brelse(bh); | 1017 | brelse(bh); |
| 1018 | 1018 | ||
| 1019 | mlog_exit(status); | 1019 | mlog_exit(status); |
| 1020 | return status; | 1020 | return status; |
| 1021 | } | 1021 | } |
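
ocfs2_setattr() is a textbook instance of the kernel's goto-unwind error handling: take the rw lock, then the meta lock, then a transaction, and release them in reverse order through a ladder of labels. A generic, runnable illustration of the shape; every name here is a stand-in:

#include <stdio.h>

static int take(const char *what) { printf("take %s\n", what); return 0; }
static void drop(const char *what) { printf("drop %s\n", what); }

static int do_op(void)
{
        int status;

        status = take("rw_lock");
        if (status < 0)
                goto bail;
        status = take("meta_lock");
        if (status < 0)
                goto bail_unlock_rw;
        status = take("transaction");
        if (status < 0)
                goto bail_unlock;

        /* ... the actual update runs here ... */

        drop("transaction");
bail_unlock:
        drop("meta_lock");
bail_unlock_rw:
        drop("rw_lock");
bail:
        return status;
}

int main(void) { return do_op(); }
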
| 1022 | 1022 | ||
| 1023 | int ocfs2_getattr(struct vfsmount *mnt, | 1023 | int ocfs2_getattr(struct vfsmount *mnt, |
| 1024 | struct dentry *dentry, | 1024 | struct dentry *dentry, |
| 1025 | struct kstat *stat) | 1025 | struct kstat *stat) |
| 1026 | { | 1026 | { |
| 1027 | struct inode *inode = dentry->d_inode; | 1027 | struct inode *inode = dentry->d_inode; |
| 1028 | struct super_block *sb = dentry->d_inode->i_sb; | 1028 | struct super_block *sb = dentry->d_inode->i_sb; |
| 1029 | struct ocfs2_super *osb = sb->s_fs_info; | 1029 | struct ocfs2_super *osb = sb->s_fs_info; |
| 1030 | int err; | 1030 | int err; |
| 1031 | 1031 | ||
| 1032 | mlog_entry_void(); | 1032 | mlog_entry_void(); |
| 1033 | 1033 | ||
| 1034 | err = ocfs2_inode_revalidate(dentry); | 1034 | err = ocfs2_inode_revalidate(dentry); |
| 1035 | if (err) { | 1035 | if (err) { |
| 1036 | if (err != -ENOENT) | 1036 | if (err != -ENOENT) |
| 1037 | mlog_errno(err); | 1037 | mlog_errno(err); |
| 1038 | goto bail; | 1038 | goto bail; |
| 1039 | } | 1039 | } |
| 1040 | 1040 | ||
| 1041 | generic_fillattr(inode, stat); | 1041 | generic_fillattr(inode, stat); |
| 1042 | 1042 | ||
| 1043 | /* We set the blksize from the cluster size for performance */ | 1043 | /* We set the blksize from the cluster size for performance */ |
| 1044 | stat->blksize = osb->s_clustersize; | 1044 | stat->blksize = osb->s_clustersize; |
| 1045 | 1045 | ||
| 1046 | bail: | 1046 | bail: |
| 1047 | mlog_exit(err); | 1047 | mlog_exit(err); |
| 1048 | 1048 | ||
| 1049 | return err; | 1049 | return err; |
| 1050 | } | 1050 | } |
| 1051 | 1051 | ||
| 1052 | int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) | 1052 | int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) |
| 1053 | { | 1053 | { |
| 1054 | int ret; | 1054 | int ret; |
| 1055 | 1055 | ||
| 1056 | mlog_entry_void(); | 1056 | mlog_entry_void(); |
| 1057 | 1057 | ||
| 1058 | ret = ocfs2_meta_lock(inode, NULL, 0); | 1058 | ret = ocfs2_meta_lock(inode, NULL, 0); |
| 1059 | if (ret) { | 1059 | if (ret) { |
| 1060 | if (ret != -ENOENT) | 1060 | if (ret != -ENOENT) |
| 1061 | mlog_errno(ret); | 1061 | mlog_errno(ret); |
| 1062 | goto out; | 1062 | goto out; |
| 1063 | } | 1063 | } |
| 1064 | 1064 | ||
| 1065 | ret = generic_permission(inode, mask, NULL); | 1065 | ret = generic_permission(inode, mask, NULL); |
| 1066 | 1066 | ||
| 1067 | ocfs2_meta_unlock(inode, 0); | 1067 | ocfs2_meta_unlock(inode, 0); |
| 1068 | out: | 1068 | out: |
| 1069 | mlog_exit(ret); | 1069 | mlog_exit(ret); |
| 1070 | return ret; | 1070 | return ret; |
| 1071 | } | 1071 | } |
| 1072 | 1072 | ||
| 1073 | static int ocfs2_write_remove_suid(struct inode *inode) | 1073 | static int ocfs2_write_remove_suid(struct inode *inode) |
| 1074 | { | 1074 | { |
| 1075 | int ret; | 1075 | int ret; |
| 1076 | struct buffer_head *bh = NULL; | 1076 | struct buffer_head *bh = NULL; |
| 1077 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 1077 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
| 1078 | handle_t *handle; | 1078 | handle_t *handle; |
| 1079 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1079 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| 1080 | struct ocfs2_dinode *di; | 1080 | struct ocfs2_dinode *di; |
| 1081 | 1081 | ||
| 1082 | mlog_entry("(Inode %llu, mode 0%o)\n", | 1082 | mlog_entry("(Inode %llu, mode 0%o)\n", |
| 1083 | (unsigned long long)oi->ip_blkno, inode->i_mode); | 1083 | (unsigned long long)oi->ip_blkno, inode->i_mode); |
| 1084 | 1084 | ||
| 1085 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 1085 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
| 1086 | if (IS_ERR(handle)) { | 1086 | if (IS_ERR(handle)) { |
| 1087 | ret = PTR_ERR(handle); | 1087 | ret = PTR_ERR(handle); |
| 1088 | mlog_errno(ret); | 1088 | mlog_errno(ret); |
| 1089 | goto out; | 1089 | goto out; |
| 1090 | } | 1090 | } |
| 1091 | 1091 | ||
| 1092 | ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); | 1092 | ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); |
| 1093 | if (ret < 0) { | 1093 | if (ret < 0) { |
| 1094 | mlog_errno(ret); | 1094 | mlog_errno(ret); |
| 1095 | goto out_trans; | 1095 | goto out_trans; |
| 1096 | } | 1096 | } |
| 1097 | 1097 | ||
| 1098 | ret = ocfs2_journal_access(handle, inode, bh, | 1098 | ret = ocfs2_journal_access(handle, inode, bh, |
| 1099 | OCFS2_JOURNAL_ACCESS_WRITE); | 1099 | OCFS2_JOURNAL_ACCESS_WRITE); |
| 1100 | if (ret < 0) { | 1100 | if (ret < 0) { |
| 1101 | mlog_errno(ret); | 1101 | mlog_errno(ret); |
| 1102 | goto out_bh; | 1102 | goto out_bh; |
| 1103 | } | 1103 | } |
| 1104 | 1104 | ||
| 1105 | inode->i_mode &= ~S_ISUID; | 1105 | inode->i_mode &= ~S_ISUID; |
| 1106 | if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) | 1106 | if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) |
| 1107 | inode->i_mode &= ~S_ISGID; | 1107 | inode->i_mode &= ~S_ISGID; |
| 1108 | 1108 | ||
| 1109 | di = (struct ocfs2_dinode *) bh->b_data; | 1109 | di = (struct ocfs2_dinode *) bh->b_data; |
| 1110 | di->i_mode = cpu_to_le16(inode->i_mode); | 1110 | di->i_mode = cpu_to_le16(inode->i_mode); |
| 1111 | 1111 | ||
| 1112 | ret = ocfs2_journal_dirty(handle, bh); | 1112 | ret = ocfs2_journal_dirty(handle, bh); |
| 1113 | if (ret < 0) | 1113 | if (ret < 0) |
| 1114 | mlog_errno(ret); | 1114 | mlog_errno(ret); |
| 1115 | out_bh: | 1115 | out_bh: |
| 1116 | brelse(bh); | 1116 | brelse(bh); |
| 1117 | out_trans: | 1117 | out_trans: |
| 1118 | ocfs2_commit_trans(osb, handle); | 1118 | ocfs2_commit_trans(osb, handle); |
| 1119 | out: | 1119 | out: |
| 1120 | mlog_exit(ret); | 1120 | mlog_exit(ret); |
| 1121 | return ret; | 1121 | return ret; |
| 1122 | } | 1122 | } |
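
The mode surgery in ocfs2_write_remove_suid() follows the standard VFS rule: a write always clears S_ISUID, but clears S_ISGID only when S_IXGRP is also set, because setgid-without-group-execute denotes mandatory locking rather than a setgid executable. A runnable userspace check of that rule:

#include <stdio.h>
#include <sys/stat.h>

static mode_t write_remove_suid(mode_t mode)
{
        mode &= ~S_ISUID;
        if ((mode & S_ISGID) && (mode & S_IXGRP))
                mode &= ~S_ISGID;
        return mode;
}

int main(void)
{
        /* setuid+setgid executable: both bits dropped */
        printf("06755 -> %04o\n", (unsigned)write_remove_suid(06755));
        /* setgid without group-execute (mandatory locking): kept */
        printf("02644 -> %04o\n", (unsigned)write_remove_suid(02644));
        return 0;
}
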
| 1123 | 1123 | ||
| 1124 | /* | 1124 | /* |
| 1125 | * Will look for holes and unwritten extents in the range starting at | 1125 | * Will look for holes and unwritten extents in the range starting at |
| 1126 | * pos for count bytes (inclusive). | 1126 | * pos for count bytes (inclusive). |
| 1127 | */ | 1127 | */ |
| 1128 | static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, | 1128 | static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, |
| 1129 | size_t count) | 1129 | size_t count) |
| 1130 | { | 1130 | { |
| 1131 | int ret = 0; | 1131 | int ret = 0; |
| 1132 | unsigned int extent_flags; | 1132 | unsigned int extent_flags; |
| 1133 | u32 cpos, clusters, extent_len, phys_cpos; | 1133 | u32 cpos, clusters, extent_len, phys_cpos; |
| 1134 | struct super_block *sb = inode->i_sb; | 1134 | struct super_block *sb = inode->i_sb; |
| 1135 | 1135 | ||
| 1136 | cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; | 1136 | cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; |
| 1137 | clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; | 1137 | clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; |
| 1138 | 1138 | ||
| 1139 | while (clusters) { | 1139 | while (clusters) { |
| 1140 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, | 1140 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, |
| 1141 | &extent_flags); | 1141 | &extent_flags); |
| 1142 | if (ret < 0) { | 1142 | if (ret < 0) { |
| 1143 | mlog_errno(ret); | 1143 | mlog_errno(ret); |
| 1144 | goto out; | 1144 | goto out; |
| 1145 | } | 1145 | } |
| 1146 | 1146 | ||
| 1147 | if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { | 1147 | if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { |
| 1148 | ret = 1; | 1148 | ret = 1; |
| 1149 | break; | 1149 | break; |
| 1150 | } | 1150 | } |
| 1151 | 1151 | ||
| 1152 | if (extent_len > clusters) | 1152 | if (extent_len > clusters) |
| 1153 | extent_len = clusters; | 1153 | extent_len = clusters; |
| 1154 | 1154 | ||
| 1155 | clusters -= extent_len; | 1155 | clusters -= extent_len; |
| 1156 | cpos += extent_len; | 1156 | cpos += extent_len; |
| 1157 | } | 1157 | } |
| 1158 | out: | 1158 | out: |
| 1159 | return ret; | 1159 | return ret; |
| 1160 | } | 1160 | } |
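
ocfs2_check_range_for_holes() converts the byte range to a cluster range and walks the extent map, stopping at the first unmapped (phys_cpos == 0) or unwritten extent. The sketch below models just the hole case over a mocked extent table, runnable in userspace; the table contents and the 4K cluster size are invented for illustration:

#include <stdio.h>

#define CLUSTER_BITS 12  /* assume 4K clusters */

struct extent { unsigned int cpos, len, phys; };  /* phys == 0 => hole */

static const struct extent map[] = {
        { 0, 4, 100 },   /* clusters 0-3 mapped */
        { 4, 2, 0 },     /* clusters 4-5 are a hole */
        { 6, 10, 200 },
};

static int range_has_holes(unsigned long long pos, unsigned long long count)
{
        unsigned int cpos = pos >> CLUSTER_BITS;
        unsigned int clusters = ((pos + count + (1ULL << CLUSTER_BITS) - 1)
                                 >> CLUSTER_BITS) - cpos;

        while (clusters) {
                unsigned int i, len = 0, phys = 0;

                /* stand-in for ocfs2_get_clusters() */
                for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
                        if (cpos >= map[i].cpos &&
                            cpos < map[i].cpos + map[i].len) {
                                len = map[i].cpos + map[i].len - cpos;
                                phys = map[i].phys;
                                break;
                        }
                }
                if (phys == 0)
                        return 1;       /* unmapped: found a hole */
                if (len > clusters)
                        len = clusters;
                clusters -= len;
                cpos += len;
        }
        return 0;
}

int main(void)
{
        printf("%d\n", range_has_holes(0, 3ULL << CLUSTER_BITS));  /* 0 */
        printf("%d\n", range_has_holes(0, 6ULL << CLUSTER_BITS));  /* 1 */
        return 0;
}
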
| 1161 | 1161 | ||
| 1162 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, | 1162 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, |
| 1163 | loff_t *ppos, | 1163 | loff_t *ppos, |
| 1164 | size_t count, | 1164 | size_t count, |
| 1165 | int appending, | 1165 | int appending, |
| 1166 | int *direct_io) | 1166 | int *direct_io) |
| 1167 | { | 1167 | { |
| 1168 | int ret = 0, meta_level = appending; | 1168 | int ret = 0, meta_level = appending; |
| 1169 | struct inode *inode = dentry->d_inode; | 1169 | struct inode *inode = dentry->d_inode; |
| 1170 | u32 clusters; | 1170 | u32 clusters; |
| 1171 | loff_t newsize, saved_pos; | 1171 | loff_t newsize, saved_pos; |
| 1172 | 1172 | ||
| 1173 | /* | 1173 | /* |
| 1174 | * We sample i_size under a read level meta lock to see if our write | 1174 | * We sample i_size under a read level meta lock to see if our write |
| 1175 | * is extending the file; if it is, we back off and get a write level | 1175 | * is extending the file; if it is, we back off and get a write level |
| 1176 | * meta lock. | 1176 | * meta lock. |
| 1177 | */ | 1177 | */ |
| 1178 | for(;;) { | 1178 | for(;;) { |
| 1179 | ret = ocfs2_meta_lock(inode, NULL, meta_level); | 1179 | ret = ocfs2_meta_lock(inode, NULL, meta_level); |
| 1180 | if (ret < 0) { | 1180 | if (ret < 0) { |
| 1181 | meta_level = -1; | 1181 | meta_level = -1; |
| 1182 | mlog_errno(ret); | 1182 | mlog_errno(ret); |
| 1183 | goto out; | 1183 | goto out; |
| 1184 | } | 1184 | } |
| 1185 | 1185 | ||
| 1186 | /* Clear suid / sgid if necessary. We do this here | 1186 | /* Clear suid / sgid if necessary. We do this here |
| 1187 | * instead of later in the write path because | 1187 | * instead of later in the write path because |
| 1188 | * remove_suid() calls ->setattr without any hint that | 1188 | * remove_suid() calls ->setattr without any hint that |
| 1189 | * we may have already done our cluster locking. Since | 1189 | * we may have already done our cluster locking. Since |
| 1190 | * ocfs2_setattr() *must* take cluster locks to | 1190 | * ocfs2_setattr() *must* take cluster locks to |
| 1191 | * proceed, this will lead us to recursively lock the | 1191 | * proceed, this will lead us to recursively lock the |
| 1192 | * inode. There's also the dinode i_size state which | 1192 | * inode. There's also the dinode i_size state which |
| 1193 | * can be lost via setattr during extending writes (we | 1193 | * can be lost via setattr during extending writes (we |
| 1194 | * set inode->i_size at the end of a write.) */ | 1194 | * set inode->i_size at the end of a write.) */ |
| 1195 | if (should_remove_suid(dentry)) { | 1195 | if (should_remove_suid(dentry)) { |
| 1196 | if (meta_level == 0) { | 1196 | if (meta_level == 0) { |
| 1197 | ocfs2_meta_unlock(inode, meta_level); | 1197 | ocfs2_meta_unlock(inode, meta_level); |
| 1198 | meta_level = 1; | 1198 | meta_level = 1; |
| 1199 | continue; | 1199 | continue; |
| 1200 | } | 1200 | } |
| 1201 | 1201 | ||
| 1202 | ret = ocfs2_write_remove_suid(inode); | 1202 | ret = ocfs2_write_remove_suid(inode); |
| 1203 | if (ret < 0) { | 1203 | if (ret < 0) { |
| 1204 | mlog_errno(ret); | 1204 | mlog_errno(ret); |
| 1205 | goto out_unlock; | 1205 | goto out_unlock; |
| 1206 | } | 1206 | } |
| 1207 | } | 1207 | } |
| 1208 | 1208 | ||
| 1209 | /* work on a copy of ppos until we're sure that we won't have | 1209 | /* work on a copy of ppos until we're sure that we won't have |
| 1210 | * to recalculate it due to relocking. */ | 1210 | * to recalculate it due to relocking. */ |
| 1211 | if (appending) { | 1211 | if (appending) { |
| 1212 | saved_pos = i_size_read(inode); | 1212 | saved_pos = i_size_read(inode); |
| 1213 | mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); | 1213 | mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); |
| 1214 | } else { | 1214 | } else { |
| 1215 | saved_pos = *ppos; | 1215 | saved_pos = *ppos; |
| 1216 | } | 1216 | } |
| 1217 | 1217 | ||
| 1218 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { | 1218 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { |
| 1219 | loff_t end = saved_pos + count; | 1219 | loff_t end = saved_pos + count; |
| 1220 | 1220 | ||
| 1221 | /* | 1221 | /* |
| 1222 | * Skip the O_DIRECT checks if we don't need | 1222 | * Skip the O_DIRECT checks if we don't need |
| 1223 | * them. | 1223 | * them. |
| 1224 | */ | 1224 | */ |
| 1225 | if (!direct_io || !(*direct_io)) | 1225 | if (!direct_io || !(*direct_io)) |
| 1226 | break; | 1226 | break; |
| 1227 | 1227 | ||
| 1228 | /* | 1228 | /* |
| 1229 | * Allowing concurrent direct writes means | 1229 | * Allowing concurrent direct writes means |
| 1230 | * i_size changes wouldn't be synchronized, so | 1230 | * i_size changes wouldn't be synchronized, so |
| 1231 | * one node could wind up truncating another | 1231 | * one node could wind up truncating another |
| 1232 | * node's writes. | 1232 | * node's writes. |
| 1233 | */ | 1233 | */ |
| 1234 | if (end > i_size_read(inode)) { | 1234 | if (end > i_size_read(inode)) { |
| 1235 | *direct_io = 0; | 1235 | *direct_io = 0; |
| 1236 | break; | 1236 | break; |
| 1237 | } | 1237 | } |
| 1238 | 1238 | ||
| 1239 | /* | 1239 | /* |
| 1240 | * We don't fill holes during direct io, so | 1240 | * We don't fill holes during direct io, so |
| 1241 | * check for them here. If any are found, the | 1241 | * check for them here. If any are found, the |
| 1242 | * caller will have to retake some cluster | 1242 | * caller will have to retake some cluster |
| 1243 | * locks and initiate the io as buffered. | 1243 | * locks and initiate the io as buffered. |
| 1244 | */ | 1244 | */ |
| 1245 | ret = ocfs2_check_range_for_holes(inode, saved_pos, | 1245 | ret = ocfs2_check_range_for_holes(inode, saved_pos, |
| 1246 | count); | 1246 | count); |
| 1247 | if (ret == 1) { | 1247 | if (ret == 1) { |
| 1248 | *direct_io = 0; | 1248 | *direct_io = 0; |
| 1249 | ret = 0; | 1249 | ret = 0; |
| 1250 | } else if (ret < 0) | 1250 | } else if (ret < 0) |
| 1251 | mlog_errno(ret); | 1251 | mlog_errno(ret); |
| 1252 | break; | 1252 | break; |
| 1253 | } | 1253 | } |
| 1254 | 1254 | ||
| 1255 | /* | 1255 | /* |
| 1256 | * The rest of this loop is concerned with legacy file | 1256 | * The rest of this loop is concerned with legacy file |
| 1257 | * systems which don't support sparse files. | 1257 | * systems which don't support sparse files. |
| 1258 | */ | 1258 | */ |
| 1259 | 1259 | ||
| 1260 | newsize = count + saved_pos; | 1260 | newsize = count + saved_pos; |
| 1261 | 1261 | ||
| 1262 | mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", | 1262 | mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", |
| 1263 | (long long) saved_pos, (long long) newsize, | 1263 | (long long) saved_pos, (long long) newsize, |
| 1264 | (long long) i_size_read(inode)); | 1264 | (long long) i_size_read(inode)); |
| 1265 | 1265 | ||
| 1266 | /* No need for a higher level metadata lock if we're | 1266 | /* No need for a higher level metadata lock if we're |
| 1267 | * never going past i_size. */ | 1267 | * never going past i_size. */ |
| 1268 | if (newsize <= i_size_read(inode)) | 1268 | if (newsize <= i_size_read(inode)) |
| 1269 | break; | 1269 | break; |
| 1270 | 1270 | ||
| 1271 | if (meta_level == 0) { | 1271 | if (meta_level == 0) { |
| 1272 | ocfs2_meta_unlock(inode, meta_level); | 1272 | ocfs2_meta_unlock(inode, meta_level); |
| 1273 | meta_level = 1; | 1273 | meta_level = 1; |
| 1274 | continue; | 1274 | continue; |
| 1275 | } | 1275 | } |
| 1276 | 1276 | ||
| 1277 | spin_lock(&OCFS2_I(inode)->ip_lock); | 1277 | spin_lock(&OCFS2_I(inode)->ip_lock); |
| 1278 | clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - | 1278 | clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - |
| 1279 | OCFS2_I(inode)->ip_clusters; | 1279 | OCFS2_I(inode)->ip_clusters; |
| 1280 | spin_unlock(&OCFS2_I(inode)->ip_lock); | 1280 | spin_unlock(&OCFS2_I(inode)->ip_lock); |
| 1281 | 1281 | ||
| 1282 | mlog(0, "Writing at EOF, may need more allocation: " | 1282 | mlog(0, "Writing at EOF, may need more allocation: " |
| 1283 | "i_size = %lld, newsize = %lld, need %u clusters\n", | 1283 | "i_size = %lld, newsize = %lld, need %u clusters\n", |
| 1284 | (long long) i_size_read(inode), (long long) newsize, | 1284 | (long long) i_size_read(inode), (long long) newsize, |
| 1285 | clusters); | 1285 | clusters); |
| 1286 | 1286 | ||
| 1287 | /* We only want to continue the rest of this loop if | 1287 | /* We only want to continue the rest of this loop if |
| 1288 | * our extend will actually require more | 1288 | * our extend will actually require more |
| 1289 | * allocation. */ | 1289 | * allocation. */ |
| 1290 | if (!clusters) | 1290 | if (!clusters) |
| 1291 | break; | 1291 | break; |
| 1292 | 1292 | ||
| 1293 | ret = ocfs2_extend_file(inode, NULL, newsize, count); | 1293 | ret = ocfs2_extend_file(inode, NULL, newsize, count); |
| 1294 | if (ret < 0) { | 1294 | if (ret < 0) { |
| 1295 | if (ret != -ENOSPC) | 1295 | if (ret != -ENOSPC) |
| 1296 | mlog_errno(ret); | 1296 | mlog_errno(ret); |
| 1297 | goto out_unlock; | 1297 | goto out_unlock; |
| 1298 | } | 1298 | } |
| 1299 | break; | 1299 | break; |
| 1300 | } | 1300 | } |
| 1301 | 1301 | ||
| 1302 | if (appending) | 1302 | if (appending) |
| 1303 | *ppos = saved_pos; | 1303 | *ppos = saved_pos; |
| 1304 | 1304 | ||
| 1305 | out_unlock: | 1305 | out_unlock: |
| 1306 | ocfs2_meta_unlock(inode, meta_level); | 1306 | ocfs2_meta_unlock(inode, meta_level); |
| 1307 | 1307 | ||
| 1308 | out: | 1308 | out: |
| 1309 | return ret; | 1309 | return ret; |
| 1310 | } | 1310 | } |
| 1311 | 1311 | ||
| 1312 | static inline void | 1312 | static inline void |
| 1313 | ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | 1313 | ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) |
| 1314 | { | 1314 | { |
| 1315 | const struct iovec *iov = *iovp; | 1315 | const struct iovec *iov = *iovp; |
| 1316 | size_t base = *basep; | 1316 | size_t base = *basep; |
| 1317 | 1317 | ||
| 1318 | do { | 1318 | do { |
| 1319 | int copy = min(bytes, iov->iov_len - base); | 1319 | int copy = min(bytes, iov->iov_len - base); |
| 1320 | 1320 | ||
| 1321 | bytes -= copy; | 1321 | bytes -= copy; |
| 1322 | base += copy; | 1322 | base += copy; |
| 1323 | if (iov->iov_len == base) { | 1323 | if (iov->iov_len == base) { |
| 1324 | iov++; | 1324 | iov++; |
| 1325 | base = 0; | 1325 | base = 0; |
| 1326 | } | 1326 | } |
| 1327 | } while (bytes); | 1327 | } while (bytes); |
| 1328 | *iovp = iov; | 1328 | *iovp = iov; |
| 1329 | *basep = base; | 1329 | *basep = base; |
| 1330 | } | 1330 | } |
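
ocfs2_set_next_iovec() is a cursor advance over an iovec array: consume bytes from the current segment and hop to the next one whenever a segment is exhausted. The same logic as a standalone program, restructured as a while loop so a zero-byte advance is also a no-op:

#include <stdio.h>
#include <sys/uio.h>

static void set_next_iovec(const struct iovec **iovp, size_t *basep,
                           size_t bytes)
{
        const struct iovec *iov = *iovp;
        size_t base = *basep;

        while (bytes) {
                size_t copy = iov->iov_len - base;

                if (copy > bytes)
                        copy = bytes;
                bytes -= copy;
                base += copy;
                if (base == iov->iov_len) {   /* segment consumed */
                        iov++;
                        base = 0;
                }
        }
        *iovp = iov;
        *basep = base;
}

int main(void)
{
        char a[10], b[10];
        struct iovec vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
        const struct iovec *cur = vec;
        size_t base = 0;

        set_next_iovec(&cur, &base, 13);      /* 10 from a, 3 into b */
        printf("segment %ld, offset %zu\n", (long)(cur - vec), base);
        return 0;
}
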
| 1331 | 1331 | ||
| 1332 | static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, | 1332 | static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, |
| 1333 | const struct iovec *cur_iov, | 1333 | const struct iovec *cur_iov, |
| 1334 | size_t iov_offset) | 1334 | size_t iov_offset) |
| 1335 | { | 1335 | { |
| 1336 | int ret; | 1336 | int ret; |
| 1337 | char *buf; | 1337 | char *buf; |
| 1338 | struct page *src_page = NULL; | 1338 | struct page *src_page = NULL; |
| 1339 | 1339 | ||
| 1340 | buf = cur_iov->iov_base + iov_offset; | 1340 | buf = cur_iov->iov_base + iov_offset; |
| 1341 | 1341 | ||
| 1342 | if (!segment_eq(get_fs(), KERNEL_DS)) { | 1342 | if (!segment_eq(get_fs(), KERNEL_DS)) { |
| 1343 | /* | 1343 | /* |
| 1344 | * Pull in the user page. We want to do this outside | 1344 | * Pull in the user page. We want to do this outside |
| 1345 | * of the meta data locks in order to preserve locking | 1345 | * of the meta data locks in order to preserve locking |
| 1346 | * order in case of page fault. | 1346 | * order in case of page fault. |
| 1347 | */ | 1347 | */ |
| 1348 | ret = get_user_pages(current, current->mm, | 1348 | ret = get_user_pages(current, current->mm, |
| 1349 | (unsigned long)buf & PAGE_CACHE_MASK, 1, | 1349 | (unsigned long)buf & PAGE_CACHE_MASK, 1, |
| 1350 | 0, 0, &src_page, NULL); | 1350 | 0, 0, &src_page, NULL); |
| 1351 | if (ret == 1) | 1351 | if (ret == 1) |
| 1352 | bp->b_src_buf = kmap(src_page); | 1352 | bp->b_src_buf = kmap(src_page); |
| 1353 | else | 1353 | else |
| 1354 | src_page = ERR_PTR(-EFAULT); | 1354 | src_page = ERR_PTR(-EFAULT); |
| 1355 | } else { | 1355 | } else { |
| 1356 | bp->b_src_buf = buf; | 1356 | bp->b_src_buf = buf; |
| 1357 | } | 1357 | } |
| 1358 | 1358 | ||
| 1359 | return src_page; | 1359 | return src_page; |
| 1360 | } | 1360 | } |
| 1361 | 1361 | ||
| 1362 | static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, | 1362 | static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, |
| 1363 | struct page *page) | 1363 | struct page *page) |
| 1364 | { | 1364 | { |
| 1365 | if (page) { | 1365 | if (page) { |
| 1366 | kunmap(page); | 1366 | kunmap(page); |
| 1367 | page_cache_release(page); | 1367 | page_cache_release(page); |
| 1368 | } | 1368 | } |
| 1369 | } | 1369 | } |
| 1370 | 1370 | ||
| 1371 | static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, | 1371 | static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, |
| 1372 | const struct iovec *iov, | 1372 | const struct iovec *iov, |
| 1373 | unsigned long nr_segs, | 1373 | unsigned long nr_segs, |
| 1374 | size_t count, | 1374 | size_t count, |
| 1375 | ssize_t o_direct_written) | 1375 | ssize_t o_direct_written) |
| 1376 | { | 1376 | { |
| 1377 | int ret = 0; | 1377 | int ret = 0; |
| 1378 | ssize_t copied, total = 0; | 1378 | ssize_t copied, total = 0; |
| 1379 | size_t iov_offset = 0; | 1379 | size_t iov_offset = 0; |
| 1380 | const struct iovec *cur_iov = iov; | 1380 | const struct iovec *cur_iov = iov; |
| 1381 | struct ocfs2_buffered_write_priv bp; | 1381 | struct ocfs2_buffered_write_priv bp; |
| 1382 | struct page *page; | 1382 | struct page *page; |
| 1383 | 1383 | ||
| 1384 | /* | 1384 | /* |
| 1385 | * handle partial DIO write. Adjust cur_iov if needed. | 1385 | * handle partial DIO write. Adjust cur_iov if needed. |
| 1386 | */ | 1386 | */ |
| 1387 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); | 1387 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); |
| 1388 | 1388 | ||
| 1389 | do { | 1389 | do { |
| 1390 | bp.b_cur_off = iov_offset; | 1390 | bp.b_cur_off = iov_offset; |
| 1391 | bp.b_cur_iov = cur_iov; | 1391 | bp.b_cur_iov = cur_iov; |
| 1392 | 1392 | ||
| 1393 | page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); | 1393 | page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); |
| 1394 | if (IS_ERR(page)) { | 1394 | if (IS_ERR(page)) { |
| 1395 | ret = PTR_ERR(page); | 1395 | ret = PTR_ERR(page); |
| 1396 | goto out; | 1396 | goto out; |
| 1397 | } | 1397 | } |
| 1398 | 1398 | ||
| 1399 | copied = ocfs2_buffered_write_cluster(file, *ppos, count, | 1399 | copied = ocfs2_buffered_write_cluster(file, *ppos, count, |
| 1400 | ocfs2_map_and_write_user_data, | 1400 | ocfs2_map_and_write_user_data, |
| 1401 | &bp); | 1401 | &bp); |
| 1402 | 1402 | ||
| 1403 | ocfs2_put_write_source(&bp, page); | 1403 | ocfs2_put_write_source(&bp, page); |
| 1404 | 1404 | ||
| 1405 | if (copied < 0) { | 1405 | if (copied < 0) { |
| 1406 | mlog_errno(copied); | 1406 | mlog_errno(copied); |
| 1407 | ret = copied; | 1407 | ret = copied; |
| 1408 | goto out; | 1408 | goto out; |
| 1409 | } | 1409 | } |
| 1410 | 1410 | ||
| 1411 | total += copied; | 1411 | total += copied; |
| 1412 | *ppos = *ppos + copied; | 1412 | *ppos = *ppos + copied; |
| 1413 | count -= copied; | 1413 | count -= copied; |
| 1414 | 1414 | ||
| 1415 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); | 1415 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); |
| 1416 | } while(count); | 1416 | } while(count); |
| 1417 | 1417 | ||
| 1418 | out: | 1418 | out: |
| 1419 | return total ? total : ret; | 1419 | return total ? total : ret; |
| 1420 | } | 1420 | } |
| 1421 | 1421 | ||
| 1422 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | 1422 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, |
| 1423 | const struct iovec *iov, | 1423 | const struct iovec *iov, |
| 1424 | unsigned long nr_segs, | 1424 | unsigned long nr_segs, |
| 1425 | loff_t pos) | 1425 | loff_t pos) |
| 1426 | { | 1426 | { |
| 1427 | int ret, direct_io, appending, rw_level, have_alloc_sem = 0; | 1427 | int ret, direct_io, appending, rw_level, have_alloc_sem = 0; |
| 1428 | int can_do_direct, sync = 0; | 1428 | int can_do_direct, sync = 0; |
| 1429 | ssize_t written = 0; | 1429 | ssize_t written = 0; |
| 1430 | size_t ocount; /* original count */ | 1430 | size_t ocount; /* original count */ |
| 1431 | size_t count; /* after file limit checks */ | 1431 | size_t count; /* after file limit checks */ |
| 1432 | loff_t *ppos = &iocb->ki_pos; | 1432 | loff_t *ppos = &iocb->ki_pos; |
| 1433 | struct file *file = iocb->ki_filp; | 1433 | struct file *file = iocb->ki_filp; |
| 1434 | struct inode *inode = file->f_path.dentry->d_inode; | 1434 | struct inode *inode = file->f_path.dentry->d_inode; |
| 1435 | 1435 | ||
| 1436 | mlog_entry("(0x%p, %u, '%.*s')\n", file, | 1436 | mlog_entry("(0x%p, %u, '%.*s')\n", file, |
| 1437 | (unsigned int)nr_segs, | 1437 | (unsigned int)nr_segs, |
| 1438 | file->f_path.dentry->d_name.len, | 1438 | file->f_path.dentry->d_name.len, |
| 1439 | file->f_path.dentry->d_name.name); | 1439 | file->f_path.dentry->d_name.name); |
| 1440 | 1440 | ||
| 1441 | if (iocb->ki_left == 0) | 1441 | if (iocb->ki_left == 0) |
| 1442 | return 0; | 1442 | return 0; |
| 1443 | 1443 | ||
| 1444 | ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); | 1444 | ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); |
| 1445 | if (ret) | 1445 | if (ret) |
| 1446 | return ret; | 1446 | return ret; |
| 1447 | 1447 | ||
| 1448 | count = ocount; | 1448 | count = ocount; |
| 1449 | 1449 | ||
| 1450 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); | 1450 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); |
| 1451 | 1451 | ||
| 1452 | appending = file->f_flags & O_APPEND ? 1 : 0; | 1452 | appending = file->f_flags & O_APPEND ? 1 : 0; |
| 1453 | direct_io = file->f_flags & O_DIRECT ? 1 : 0; | 1453 | direct_io = file->f_flags & O_DIRECT ? 1 : 0; |
| 1454 | 1454 | ||
| 1455 | mutex_lock(&inode->i_mutex); | 1455 | mutex_lock(&inode->i_mutex); |
| 1456 | 1456 | ||
| 1457 | relock: | 1457 | relock: |
| 1458 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ | 1458 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ |
| 1459 | if (direct_io) { | 1459 | if (direct_io) { |
| 1460 | down_read(&inode->i_alloc_sem); | 1460 | down_read(&inode->i_alloc_sem); |
| 1461 | have_alloc_sem = 1; | 1461 | have_alloc_sem = 1; |
| 1462 | } | 1462 | } |
| 1463 | 1463 | ||
| 1464 | /* concurrent O_DIRECT writes are allowed */ | 1464 | /* concurrent O_DIRECT writes are allowed */ |
| 1465 | rw_level = !direct_io; | 1465 | rw_level = !direct_io; |
| 1466 | ret = ocfs2_rw_lock(inode, rw_level); | 1466 | ret = ocfs2_rw_lock(inode, rw_level); |
| 1467 | if (ret < 0) { | 1467 | if (ret < 0) { |
| 1468 | mlog_errno(ret); | 1468 | mlog_errno(ret); |
| 1469 | goto out_sems; | 1469 | goto out_sems; |
| 1470 | } | 1470 | } |
| 1471 | 1471 | ||
| 1472 | can_do_direct = direct_io; | 1472 | can_do_direct = direct_io; |
| 1473 | ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, | 1473 | ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, |
| 1474 | iocb->ki_left, appending, | 1474 | iocb->ki_left, appending, |
| 1475 | &can_do_direct); | 1475 | &can_do_direct); |
| 1476 | if (ret < 0) { | 1476 | if (ret < 0) { |
| 1477 | mlog_errno(ret); | 1477 | mlog_errno(ret); |
| 1478 | goto out; | 1478 | goto out; |
| 1479 | } | 1479 | } |
| 1480 | 1480 | ||
| 1481 | /* | 1481 | /* |
| 1482 | * We can't complete the direct I/O as requested, fall back to | 1482 | * We can't complete the direct I/O as requested, fall back to |
| 1483 | * buffered I/O. | 1483 | * buffered I/O. |
| 1484 | */ | 1484 | */ |
| 1485 | if (direct_io && !can_do_direct) { | 1485 | if (direct_io && !can_do_direct) { |
| 1486 | ocfs2_rw_unlock(inode, rw_level); | 1486 | ocfs2_rw_unlock(inode, rw_level); |
| 1487 | up_read(&inode->i_alloc_sem); | 1487 | up_read(&inode->i_alloc_sem); |
| 1488 | 1488 | ||
| 1489 | have_alloc_sem = 0; | 1489 | have_alloc_sem = 0; |
| 1490 | rw_level = -1; | 1490 | rw_level = -1; |
| 1491 | 1491 | ||
| 1492 | direct_io = 0; | 1492 | direct_io = 0; |
| 1493 | sync = 1; | 1493 | sync = 1; |
| 1494 | goto relock; | 1494 | goto relock; |
| 1495 | } | 1495 | } |
| 1496 | 1496 | ||
| 1497 | if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) | 1497 | if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) |
| 1498 | sync = 1; | 1498 | sync = 1; |
| 1499 | 1499 | ||
| 1500 | /* | 1500 | /* |
| 1501 | * XXX: Is it ok to execute these checks a second time? | 1501 | * XXX: Is it ok to execute these checks a second time? |
| 1502 | */ | 1502 | */ |
| 1503 | ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); | 1503 | ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); |
| 1504 | if (ret) | 1504 | if (ret) |
| 1505 | goto out; | 1505 | goto out; |
| 1506 | 1506 | ||
| 1507 | /* | 1507 | /* |
| 1508 | * Set pos so that sync_page_range_nolock() below understands | 1508 | * Set pos so that sync_page_range_nolock() below understands |
| 1509 | * where to start from. We might've moved it around via the | 1509 | * where to start from. We might've moved it around via the |
| 1510 | * calls above. The range we want to actually sync starts from | 1510 | * calls above. The range we want to actually sync starts from |
| 1511 | * *ppos here. | 1511 | * *ppos here. |
| 1512 | * | 1512 | * |
| 1513 | */ | 1513 | */ |
| 1514 | pos = *ppos; | 1514 | pos = *ppos; |
| 1515 | 1515 | ||
| 1516 | /* communicate with ocfs2_dio_end_io */ | 1516 | /* communicate with ocfs2_dio_end_io */ |
| 1517 | ocfs2_iocb_set_rw_locked(iocb, rw_level); | 1517 | ocfs2_iocb_set_rw_locked(iocb, rw_level); |
| 1518 | 1518 | ||
| 1519 | if (direct_io) { | 1519 | if (direct_io) { |
| 1520 | written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, | 1520 | written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, |
| 1521 | ppos, count, ocount); | 1521 | ppos, count, ocount); |
| 1522 | if (written < 0) { | 1522 | if (written < 0) { |
| 1523 | ret = written; | 1523 | ret = written; |
| 1524 | goto out_dio; | 1524 | goto out_dio; |
| 1525 | } | 1525 | } |
| 1526 | } else { | 1526 | } else { |
| 1527 | written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, | 1527 | written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, |
| 1528 | count, written); | 1528 | count, written); |
| 1529 | if (written < 0) { | 1529 | if (written < 0) { |
| 1530 | ret = written; | 1530 | ret = written; |
| 1531 | if (ret != -EFAULT && ret != -ENOSPC) | 1531 | if (ret != -EFAULT && ret != -ENOSPC) |
| 1532 | mlog_errno(ret); | 1532 | mlog_errno(ret); |
| 1533 | goto out; | 1533 | goto out; |
| 1534 | } | 1534 | } |
| 1535 | } | 1535 | } |
| 1536 | 1536 | ||
| 1537 | out_dio: | 1537 | out_dio: |
| 1538 | /* buffered aio wouldn't have proper lock coverage today */ | 1538 | /* buffered aio wouldn't have proper lock coverage today */ |
| 1539 | BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); | 1539 | BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); |
| 1540 | 1540 | ||
| 1541 | /* | 1541 | /* |
| 1542 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io | 1542 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io |
| 1543 | * function pointer which is called when o_direct io completes so that | 1543 | * function pointer which is called when o_direct io completes so that |
| 1544 | * it can unlock our rw lock. (it's the clustered equivalent of | 1544 | * it can unlock our rw lock. (it's the clustered equivalent of |
| 1545 | * i_alloc_sem; protects truncate from racing with pending ios). | 1545 | * i_alloc_sem; protects truncate from racing with pending ios). |
| 1546 | * Unfortunately there are error cases which call end_io and others | 1546 | * Unfortunately there are error cases which call end_io and others |
| 1547 | * that don't, so we don't have to unlock the rw_lock if either an | 1547 | * that don't, so we don't have to unlock the rw_lock if either an |
| 1548 | * async dio is going to do it in the future or an end_io after an | 1548 | * async dio is going to do it in the future or an end_io after an |
| 1549 | * error has already done it. | 1549 | * error has already done it. |
| 1550 | */ | 1550 | */ |
| 1551 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { | 1551 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { |
| 1552 | rw_level = -1; | 1552 | rw_level = -1; |
| 1553 | have_alloc_sem = 0; | 1553 | have_alloc_sem = 0; |
| 1554 | } | 1554 | } |
| 1555 | 1555 | ||
| 1556 | out: | 1556 | out: |
| 1557 | if (rw_level != -1) | 1557 | if (rw_level != -1) |
| 1558 | ocfs2_rw_unlock(inode, rw_level); | 1558 | ocfs2_rw_unlock(inode, rw_level); |
| 1559 | 1559 | ||
| 1560 | out_sems: | 1560 | out_sems: |
| 1561 | if (have_alloc_sem) | 1561 | if (have_alloc_sem) |
| 1562 | up_read(&inode->i_alloc_sem); | 1562 | up_read(&inode->i_alloc_sem); |
| 1563 | 1563 | ||
| 1564 | if (written > 0 && sync) { | 1564 | if (written > 0 && sync) { |
| 1565 | ssize_t err; | 1565 | ssize_t err; |
| 1566 | 1566 | ||
| 1567 | err = sync_page_range_nolock(inode, file->f_mapping, pos, count); | 1567 | err = sync_page_range_nolock(inode, file->f_mapping, pos, count); |
| 1568 | if (err < 0) | 1568 | if (err < 0) |
| 1569 | written = err; | 1569 | written = err; |
| 1570 | } | 1570 | } |
| 1571 | 1571 | ||
| 1572 | mutex_unlock(&inode->i_mutex); | 1572 | mutex_unlock(&inode->i_mutex); |
| 1573 | 1573 | ||
| 1574 | mlog_exit(ret); | 1574 | mlog_exit(ret); |
| 1575 | return written ? written : ret; | 1575 | return written ? written : ret; |
| 1576 | } | 1576 | } |
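
The control flow worth noticing in ocfs2_file_aio_write() is the relock dance: O_DIRECT writers take the rw lock shared (rw_level = !direct_io), but if ocfs2_prepare_inode_for_write() reports that direct I/O can't proceed (holes in the range, or a size-extending write), everything is dropped and the function retries as a buffered, exclusively locked write. A toy model of that loop; all functions here are stand-ins:

#include <stdio.h>

/* stand-in for ocfs2_prepare_inode_for_write(): pretend the range
 * has holes, which direct I/O can't fill */
static int prepare(int want_direct, int *can_do_direct)
{
        (void)want_direct;
        *can_do_direct = 0;
        return 0;
}

int main(void)
{
        int direct_io = 1, can_do_direct, rw_level;

relock:
        rw_level = !direct_io;  /* shared for O_DIRECT, exclusive otherwise */
        printf("lock rw_level=%d (direct=%d)\n", rw_level, direct_io);

        prepare(direct_io, &can_do_direct);
        if (direct_io && !can_do_direct) {
                printf("unlock, fall back to buffered\n");
                direct_io = 0;
                goto relock;
        }

        printf("issue the %s write\n", direct_io ? "direct" : "buffered");
        return 0;
}
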
| 1577 | 1577 | ||
| 1578 | static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, | 1578 | static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, |
| 1579 | struct pipe_buffer *buf, | 1579 | struct pipe_buffer *buf, |
| 1580 | struct splice_desc *sd) | 1580 | struct splice_desc *sd) |
| 1581 | { | 1581 | { |
| 1582 | int ret, count, total = 0; | 1582 | int ret, count, total = 0; |
| 1583 | ssize_t copied = 0; | 1583 | ssize_t copied = 0; |
| 1584 | struct ocfs2_splice_write_priv sp; | 1584 | struct ocfs2_splice_write_priv sp; |
| 1585 | 1585 | ||
| 1586 | ret = buf->ops->pin(pipe, buf); | 1586 | ret = buf->ops->confirm(pipe, buf); |
| 1587 | if (ret) | 1587 | if (ret) |
| 1588 | goto out; | 1588 | goto out; |
| 1589 | 1589 | ||
| 1590 | sp.s_sd = sd; | 1590 | sp.s_sd = sd; |
| 1591 | sp.s_buf = buf; | 1591 | sp.s_buf = buf; |
| 1592 | sp.s_pipe = pipe; | 1592 | sp.s_pipe = pipe; |
| 1593 | sp.s_offset = sd->pos & ~PAGE_CACHE_MASK; | 1593 | sp.s_offset = sd->pos & ~PAGE_CACHE_MASK; |
| 1594 | sp.s_buf_offset = buf->offset; | 1594 | sp.s_buf_offset = buf->offset; |
| 1595 | 1595 | ||
| 1596 | count = sd->len; | 1596 | count = sd->len; |
| 1597 | if (count + sp.s_offset > PAGE_CACHE_SIZE) | 1597 | if (count + sp.s_offset > PAGE_CACHE_SIZE) |
| 1598 | count = PAGE_CACHE_SIZE - sp.s_offset; | 1598 | count = PAGE_CACHE_SIZE - sp.s_offset; |
| 1599 | 1599 | ||
| 1600 | do { | 1600 | do { |
| 1601 | /* | 1601 | /* |
| 1602 | * splice wants us to copy up to one page at a | 1602 | * splice wants us to copy up to one page at a |
| 1603 | * time. For pagesize > cluster size, this means we | 1603 | * time. For pagesize > cluster size, this means we |
| 1604 | * might enter ocfs2_buffered_write_cluster() more | 1604 | * might enter ocfs2_buffered_write_cluster() more |
| 1605 | * than once, so keep track of our progress here. | 1605 | * than once, so keep track of our progress here. |
| 1606 | */ | 1606 | */ |
| 1607 | copied = ocfs2_buffered_write_cluster(sd->file, | 1607 | copied = ocfs2_buffered_write_cluster(sd->u.file, |
| 1608 | (loff_t)sd->pos + total, | 1608 | (loff_t)sd->pos + total, |
| 1609 | count, | 1609 | count, |
| 1610 | ocfs2_map_and_write_splice_data, | 1610 | ocfs2_map_and_write_splice_data, |
| 1611 | &sp); | 1611 | &sp); |
| 1612 | if (copied < 0) { | 1612 | if (copied < 0) { |
| 1613 | mlog_errno(copied); | 1613 | mlog_errno(copied); |
| 1614 | ret = copied; | 1614 | ret = copied; |
| 1615 | goto out; | 1615 | goto out; |
| 1616 | } | 1616 | } |
| 1617 | 1617 | ||
| 1618 | count -= copied; | 1618 | count -= copied; |
| 1619 | sp.s_offset += copied; | 1619 | sp.s_offset += copied; |
| 1620 | sp.s_buf_offset += copied; | 1620 | sp.s_buf_offset += copied; |
| 1621 | total += copied; | 1621 | total += copied; |
| 1622 | } while (count); | 1622 | } while (count); |
| 1623 | 1623 | ||
| 1624 | ret = 0; | 1624 | ret = 0; |
| 1625 | out: | 1625 | out: |
| 1626 | 1626 | ||
| 1627 | return total ? total : ret; | 1627 | return total ? total : ret; |
| 1628 | } | 1628 | } |
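
This actor is where the commit's rename lands in ocfs2: the first thing a splice consumer does is call the buffer's ->confirm() hook (formerly ->pin()), whose success now explicitly means the page is there and its contents are valid, e.g. readahead I/O on the page has completed. A kernel-style sketch of the consumer pattern under the new name; it is a fragment for illustration, not standalone code:

static int example_consume_buf(struct pipe_inode_info *pipe,
                               struct pipe_buffer *buf)
{
        void *src;
        int ret;

        /* formerly buf->ops->pin(); same semantics, clearer name:
         * a good return means the buffer's contents can be trusted */
        ret = buf->ops->confirm(pipe, buf);
        if (ret)
                return ret;

        src = buf->ops->map(pipe, buf, 0);
        /* ... consume at most buf->len bytes from src + buf->offset ... */
        buf->ops->unmap(pipe, buf, src);
        return 0;
}
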
| 1629 | 1629 | ||
| 1630 | static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, | 1630 | static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, |
| 1631 | struct file *out, | 1631 | struct file *out, |
| 1632 | loff_t *ppos, | 1632 | loff_t *ppos, |
| 1633 | size_t len, | 1633 | size_t len, |
| 1634 | unsigned int flags) | 1634 | unsigned int flags) |
| 1635 | { | 1635 | { |
| 1636 | int ret, err; | 1636 | int ret, err; |
| 1637 | struct address_space *mapping = out->f_mapping; | 1637 | struct address_space *mapping = out->f_mapping; |
| 1638 | struct inode *inode = mapping->host; | 1638 | struct inode *inode = mapping->host; |
| 1639 | struct splice_desc sd = { | 1639 | struct splice_desc sd = { |
| 1640 | .total_len = len, | 1640 | .total_len = len, |
| 1641 | .flags = flags, | 1641 | .flags = flags, |
| 1642 | .pos = *ppos, | 1642 | .pos = *ppos, |
| 1643 | .u.file = out, | 1643 | .u.file = out, |
| 1644 | }; | 1644 | }; |
| 1645 | 1645 | ||
| 1646 | ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor); | 1646 | ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor); |
| 1647 | if (ret > 0) { | 1647 | if (ret > 0) { |
| 1648 | *ppos += ret; | 1648 | *ppos += ret; |
| 1649 | 1649 | ||
| 1650 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { | 1650 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { |
| 1651 | err = generic_osync_inode(inode, mapping, | 1651 | err = generic_osync_inode(inode, mapping, |
| 1652 | OSYNC_METADATA|OSYNC_DATA); | 1652 | OSYNC_METADATA|OSYNC_DATA); |
| 1653 | if (err) | 1653 | if (err) |
| 1654 | ret = err; | 1654 | ret = err; |
| 1655 | } | 1655 | } |
| 1656 | } | 1656 | } |
| 1657 | 1657 | ||
| 1658 | return ret; | 1658 | return ret; |
| 1659 | } | 1659 | } |
| 1660 | 1660 | ||
| 1661 | static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, | 1661 | static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, |
| 1662 | struct file *out, | 1662 | struct file *out, |
| 1663 | loff_t *ppos, | 1663 | loff_t *ppos, |
| 1664 | size_t len, | 1664 | size_t len, |
| 1665 | unsigned int flags) | 1665 | unsigned int flags) |
| 1666 | { | 1666 | { |
| 1667 | int ret; | 1667 | int ret; |
| 1668 | struct inode *inode = out->f_path.dentry->d_inode; | 1668 | struct inode *inode = out->f_path.dentry->d_inode; |
| 1669 | 1669 | ||
| 1670 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, | 1670 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, |
| 1671 | (unsigned int)len, | 1671 | (unsigned int)len, |
| 1672 | out->f_path.dentry->d_name.len, | 1672 | out->f_path.dentry->d_name.len, |
| 1673 | out->f_path.dentry->d_name.name); | 1673 | out->f_path.dentry->d_name.name); |
| 1674 | 1674 | ||
| 1675 | inode_double_lock(inode, pipe->inode); | 1675 | inode_double_lock(inode, pipe->inode); |
| 1676 | 1676 | ||
| 1677 | ret = ocfs2_rw_lock(inode, 1); | 1677 | ret = ocfs2_rw_lock(inode, 1); |
| 1678 | if (ret < 0) { | 1678 | if (ret < 0) { |
| 1679 | mlog_errno(ret); | 1679 | mlog_errno(ret); |
| 1680 | goto out; | 1680 | goto out; |
| 1681 | } | 1681 | } |
| 1682 | 1682 | ||
| 1683 | ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, | 1683 | ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, |
| 1684 | NULL); | 1684 | NULL); |
| 1685 | if (ret < 0) { | 1685 | if (ret < 0) { |
| 1686 | mlog_errno(ret); | 1686 | mlog_errno(ret); |
| 1687 | goto out_unlock; | 1687 | goto out_unlock; |
| 1688 | } | 1688 | } |
| 1689 | 1689 | ||
| 1690 | /* ok, we're done with i_size and alloc work */ | 1690 | /* ok, we're done with i_size and alloc work */ |
| 1691 | ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags); | 1691 | ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags); |
| 1692 | 1692 | ||
| 1693 | out_unlock: | 1693 | out_unlock: |
| 1694 | ocfs2_rw_unlock(inode, 1); | 1694 | ocfs2_rw_unlock(inode, 1); |
| 1695 | out: | 1695 | out: |
| 1696 | inode_double_unlock(inode, pipe->inode); | 1696 | inode_double_unlock(inode, pipe->inode); |
| 1697 | 1697 | ||
| 1698 | mlog_exit(ret); | 1698 | mlog_exit(ret); |
| 1699 | return ret; | 1699 | return ret; |
| 1700 | } | 1700 | } |
| 1701 | 1701 | ||
| 1702 | static ssize_t ocfs2_file_splice_read(struct file *in, | 1702 | static ssize_t ocfs2_file_splice_read(struct file *in, |
| 1703 | loff_t *ppos, | 1703 | loff_t *ppos, |
| 1704 | struct pipe_inode_info *pipe, | 1704 | struct pipe_inode_info *pipe, |
| 1705 | size_t len, | 1705 | size_t len, |
| 1706 | unsigned int flags) | 1706 | unsigned int flags) |
| 1707 | { | 1707 | { |
| 1708 | int ret = 0; | 1708 | int ret = 0; |
| 1709 | struct inode *inode = in->f_path.dentry->d_inode; | 1709 | struct inode *inode = in->f_path.dentry->d_inode; |
| 1710 | 1710 | ||
| 1711 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, | 1711 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, |
| 1712 | (unsigned int)len, | 1712 | (unsigned int)len, |
| 1713 | in->f_path.dentry->d_name.len, | 1713 | in->f_path.dentry->d_name.len, |
| 1714 | in->f_path.dentry->d_name.name); | 1714 | in->f_path.dentry->d_name.name); |
| 1715 | 1715 | ||
| 1716 | /* | 1716 | /* |
| 1717 | * See the comment in ocfs2_file_aio_read() | 1717 | * See the comment in ocfs2_file_aio_read() |
| 1718 | */ | 1718 | */ |
| 1719 | ret = ocfs2_meta_lock(inode, NULL, 0); | 1719 | ret = ocfs2_meta_lock(inode, NULL, 0); |
| 1720 | if (ret < 0) { | 1720 | if (ret < 0) { |
| 1721 | mlog_errno(ret); | 1721 | mlog_errno(ret); |
| 1722 | goto bail; | 1722 | goto bail; |
| 1723 | } | 1723 | } |
| 1724 | ocfs2_meta_unlock(inode, 0); | 1724 | ocfs2_meta_unlock(inode, 0); |
| 1725 | 1725 | ||
| 1726 | ret = generic_file_splice_read(in, ppos, pipe, len, flags); | 1726 | ret = generic_file_splice_read(in, ppos, pipe, len, flags); |
| 1727 | 1727 | ||
| 1728 | bail: | 1728 | bail: |
| 1729 | mlog_exit(ret); | 1729 | mlog_exit(ret); |
| 1730 | return ret; | 1730 | return ret; |
| 1731 | } | 1731 | } |
| 1732 | 1732 | ||
| 1733 | static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, | 1733 | static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, |
| 1734 | const struct iovec *iov, | 1734 | const struct iovec *iov, |
| 1735 | unsigned long nr_segs, | 1735 | unsigned long nr_segs, |
| 1736 | loff_t pos) | 1736 | loff_t pos) |
| 1737 | { | 1737 | { |
| 1738 | int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; | 1738 | int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; |
| 1739 | struct file *filp = iocb->ki_filp; | 1739 | struct file *filp = iocb->ki_filp; |
| 1740 | struct inode *inode = filp->f_path.dentry->d_inode; | 1740 | struct inode *inode = filp->f_path.dentry->d_inode; |
| 1741 | 1741 | ||
| 1742 | mlog_entry("(0x%p, %u, '%.*s')\n", filp, | 1742 | mlog_entry("(0x%p, %u, '%.*s')\n", filp, |
| 1743 | (unsigned int)nr_segs, | 1743 | (unsigned int)nr_segs, |
| 1744 | filp->f_path.dentry->d_name.len, | 1744 | filp->f_path.dentry->d_name.len, |
| 1745 | filp->f_path.dentry->d_name.name); | 1745 | filp->f_path.dentry->d_name.name); |
| 1746 | 1746 | ||
| 1747 | if (!inode) { | 1747 | if (!inode) { |
| 1748 | ret = -EINVAL; | 1748 | ret = -EINVAL; |
| 1749 | mlog_errno(ret); | 1749 | mlog_errno(ret); |
| 1750 | goto bail; | 1750 | goto bail; |
| 1751 | } | 1751 | } |
| 1752 | 1752 | ||
| 1753 | /* | 1753 | /* |
| 1754 | * buffered reads protect themselves in ->readpage(). O_DIRECT reads | 1754 | * buffered reads protect themselves in ->readpage(). O_DIRECT reads |
| 1755 | * need locks to protect pending reads from racing with truncate. | 1755 | * need locks to protect pending reads from racing with truncate. |
| 1756 | */ | 1756 | */ |
| 1757 | if (filp->f_flags & O_DIRECT) { | 1757 | if (filp->f_flags & O_DIRECT) { |
| 1758 | down_read(&inode->i_alloc_sem); | 1758 | down_read(&inode->i_alloc_sem); |
| 1759 | have_alloc_sem = 1; | 1759 | have_alloc_sem = 1; |
| 1760 | 1760 | ||
| 1761 | ret = ocfs2_rw_lock(inode, 0); | 1761 | ret = ocfs2_rw_lock(inode, 0); |
| 1762 | if (ret < 0) { | 1762 | if (ret < 0) { |
| 1763 | mlog_errno(ret); | 1763 | mlog_errno(ret); |
| 1764 | goto bail; | 1764 | goto bail; |
| 1765 | } | 1765 | } |
| 1766 | rw_level = 0; | 1766 | rw_level = 0; |
| 1767 | /* communicate with ocfs2_dio_end_io */ | 1767 | /* communicate with ocfs2_dio_end_io */ |
| 1768 | ocfs2_iocb_set_rw_locked(iocb, rw_level); | 1768 | ocfs2_iocb_set_rw_locked(iocb, rw_level); |
| 1769 | } | 1769 | } |
| 1770 | 1770 | ||
| 1771 | /* | 1771 | /* |
| 1772 | * We're fine letting folks race truncates and extending | 1772 | * We're fine letting folks race truncates and extending |
| 1773 | * writes with read across the cluster, just like they can | 1773 | * writes with read across the cluster, just like they can |
| 1774 | * locally. Hence no rw_lock during read. | 1774 | * locally. Hence no rw_lock during read. |
| 1775 | * | 1775 | * |
| 1776 | * Take and drop the meta data lock to update inode fields | 1776 | * Take and drop the meta data lock to update inode fields |
| 1777 | * like i_size. This allows the checks down below | 1777 | * like i_size. This allows the checks down below |
| 1778 | * generic_file_aio_read() a chance of actually working. | 1778 | * generic_file_aio_read() a chance of actually working. |
| 1779 | */ | 1779 | */ |
| 1780 | ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); | 1780 | ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); |
| 1781 | if (ret < 0) { | 1781 | if (ret < 0) { |
| 1782 | mlog_errno(ret); | 1782 | mlog_errno(ret); |
| 1783 | goto bail; | 1783 | goto bail; |
| 1784 | } | 1784 | } |
| 1785 | ocfs2_meta_unlock(inode, lock_level); | 1785 | ocfs2_meta_unlock(inode, lock_level); |
| 1786 | 1786 | ||
| 1787 | ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); | 1787 | ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); |
| 1788 | if (ret == -EINVAL) | 1788 | if (ret == -EINVAL) |
| 1789 | mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); | 1789 | mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); |
| 1790 | 1790 | ||
| 1791 | /* buffered aio wouldn't have proper lock coverage today */ | 1791 | /* buffered aio wouldn't have proper lock coverage today */ |
| 1792 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); | 1792 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); |
| 1793 | 1793 | ||
| 1794 | /* see ocfs2_file_aio_write */ | 1794 | /* see ocfs2_file_aio_write */ |
| 1795 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { | 1795 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { |
| 1796 | rw_level = -1; | 1796 | rw_level = -1; |
| 1797 | have_alloc_sem = 0; | 1797 | have_alloc_sem = 0; |
| 1798 | } | 1798 | } |
| 1799 | 1799 | ||
| 1800 | bail: | 1800 | bail: |
| 1801 | if (have_alloc_sem) | 1801 | if (have_alloc_sem) |
| 1802 | up_read(&inode->i_alloc_sem); | 1802 | up_read(&inode->i_alloc_sem); |
| 1803 | if (rw_level != -1) | 1803 | if (rw_level != -1) |
| 1804 | ocfs2_rw_unlock(inode, rw_level); | 1804 | ocfs2_rw_unlock(inode, rw_level); |
| 1805 | mlog_exit(ret); | 1805 | mlog_exit(ret); |
| 1806 | 1806 | ||
| 1807 | return ret; | 1807 | return ret; |
| 1808 | } | 1808 | } |
| 1809 | 1809 | ||
| 1810 | const struct inode_operations ocfs2_file_iops = { | 1810 | const struct inode_operations ocfs2_file_iops = { |
| 1811 | .setattr = ocfs2_setattr, | 1811 | .setattr = ocfs2_setattr, |
| 1812 | .getattr = ocfs2_getattr, | 1812 | .getattr = ocfs2_getattr, |
| 1813 | .permission = ocfs2_permission, | 1813 | .permission = ocfs2_permission, |
| 1814 | }; | 1814 | }; |
| 1815 | 1815 | ||
| 1816 | const struct inode_operations ocfs2_special_file_iops = { | 1816 | const struct inode_operations ocfs2_special_file_iops = { |
| 1817 | .setattr = ocfs2_setattr, | 1817 | .setattr = ocfs2_setattr, |
| 1818 | .getattr = ocfs2_getattr, | 1818 | .getattr = ocfs2_getattr, |
| 1819 | .permission = ocfs2_permission, | 1819 | .permission = ocfs2_permission, |
| 1820 | }; | 1820 | }; |
| 1821 | 1821 | ||
| 1822 | const struct file_operations ocfs2_fops = { | 1822 | const struct file_operations ocfs2_fops = { |
| 1823 | .read = do_sync_read, | 1823 | .read = do_sync_read, |
| 1824 | .write = do_sync_write, | 1824 | .write = do_sync_write, |
| 1825 | .mmap = ocfs2_mmap, | 1825 | .mmap = ocfs2_mmap, |
| 1826 | .fsync = ocfs2_sync_file, | 1826 | .fsync = ocfs2_sync_file, |
| 1827 | .release = ocfs2_file_release, | 1827 | .release = ocfs2_file_release, |
| 1828 | .open = ocfs2_file_open, | 1828 | .open = ocfs2_file_open, |
| 1829 | .aio_read = ocfs2_file_aio_read, | 1829 | .aio_read = ocfs2_file_aio_read, |
| 1830 | .aio_write = ocfs2_file_aio_write, | 1830 | .aio_write = ocfs2_file_aio_write, |
| 1831 | .ioctl = ocfs2_ioctl, | 1831 | .ioctl = ocfs2_ioctl, |
| 1832 | #ifdef CONFIG_COMPAT | 1832 | #ifdef CONFIG_COMPAT |
| 1833 | .compat_ioctl = ocfs2_compat_ioctl, | 1833 | .compat_ioctl = ocfs2_compat_ioctl, |
| 1834 | #endif | 1834 | #endif |
| 1835 | .splice_read = ocfs2_file_splice_read, | 1835 | .splice_read = ocfs2_file_splice_read, |
| 1836 | .splice_write = ocfs2_file_splice_write, | 1836 | .splice_write = ocfs2_file_splice_write, |
| 1837 | }; | 1837 | }; |
| 1838 | 1838 | ||
| 1839 | const struct file_operations ocfs2_dops = { | 1839 | const struct file_operations ocfs2_dops = { |
| 1840 | .read = generic_read_dir, | 1840 | .read = generic_read_dir, |
| 1841 | .readdir = ocfs2_readdir, | 1841 | .readdir = ocfs2_readdir, |
| 1842 | .fsync = ocfs2_sync_file, | 1842 | .fsync = ocfs2_sync_file, |
| 1843 | .ioctl = ocfs2_ioctl, | 1843 | .ioctl = ocfs2_ioctl, |
| 1844 | #ifdef CONFIG_COMPAT | 1844 | #ifdef CONFIG_COMPAT |
| 1845 | .compat_ioctl = ocfs2_compat_ioctl, | 1845 | .compat_ioctl = ocfs2_compat_ioctl, |
| 1846 | #endif | 1846 | #endif |
| 1847 | }; | 1847 | }; |
| 1848 | 1848 | ||
fs/pipe.c
| 1 | /* | 1 | /* |
| 2 | * linux/fs/pipe.c | 2 | * linux/fs/pipe.c |
| 3 | * | 3 | * |
| 4 | * Copyright (C) 1991, 1992, 1999 Linus Torvalds | 4 | * Copyright (C) 1991, 1992, 1999 Linus Torvalds |
| 5 | */ | 5 | */ |
| 6 | 6 | ||
| 7 | #include <linux/mm.h> | 7 | #include <linux/mm.h> |
| 8 | #include <linux/file.h> | 8 | #include <linux/file.h> |
| 9 | #include <linux/poll.h> | 9 | #include <linux/poll.h> |
| 10 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
| 11 | #include <linux/module.h> | 11 | #include <linux/module.h> |
| 12 | #include <linux/init.h> | 12 | #include <linux/init.h> |
| 13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
| 14 | #include <linux/mount.h> | 14 | #include <linux/mount.h> |
| 15 | #include <linux/pipe_fs_i.h> | 15 | #include <linux/pipe_fs_i.h> |
| 16 | #include <linux/uio.h> | 16 | #include <linux/uio.h> |
| 17 | #include <linux/highmem.h> | 17 | #include <linux/highmem.h> |
| 18 | #include <linux/pagemap.h> | 18 | #include <linux/pagemap.h> |
| 19 | #include <linux/audit.h> | 19 | #include <linux/audit.h> |
| 20 | 20 | ||
| 21 | #include <asm/uaccess.h> | 21 | #include <asm/uaccess.h> |
| 22 | #include <asm/ioctls.h> | 22 | #include <asm/ioctls.h> |
| 23 | 23 | ||
| 24 | /* | 24 | /* |
| 25 | * We use a start+len construction, which provides full use of the | 25 | * We use a start+len construction, which provides full use of the |
| 26 | * allocated memory. | 26 | * allocated memory. |
| 27 | * -- Florian Coosmann (FGC) | 27 | * -- Florian Coosmann (FGC) |
| 28 | * | 28 | * |
| 29 | * Reads with count = 0 should always return 0. | 29 | * Reads with count = 0 should always return 0. |
| 30 | * -- Julian Bradfield 1999-06-07. | 30 | * -- Julian Bradfield 1999-06-07. |
| 31 | * | 31 | * |
| 32 | * FIFOs and Pipes now generate SIGIO for both readers and writers. | 32 | * FIFOs and Pipes now generate SIGIO for both readers and writers. |
| 33 | * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16 | 33 | * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16 |
| 34 | * | 34 | * |
| 35 | * pipe_read & write cleanup | 35 | * pipe_read & write cleanup |
| 36 | * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 | 36 | * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 |
| 37 | */ | 37 | */ |
| 38 | 38 | ||
| 39 | /* Drop the inode semaphore and wait for a pipe event, atomically */ | 39 | /* Drop the inode semaphore and wait for a pipe event, atomically */ |
| 40 | void pipe_wait(struct pipe_inode_info *pipe) | 40 | void pipe_wait(struct pipe_inode_info *pipe) |
| 41 | { | 41 | { |
| 42 | DEFINE_WAIT(wait); | 42 | DEFINE_WAIT(wait); |
| 43 | 43 | ||
| 44 | /* | 44 | /* |
| 45 | * Pipes are system-local resources, so sleeping on them | 45 | * Pipes are system-local resources, so sleeping on them |
| 46 | * is considered a noninteractive wait: | 46 | * is considered a noninteractive wait: |
| 47 | */ | 47 | */ |
| 48 | prepare_to_wait(&pipe->wait, &wait, | 48 | prepare_to_wait(&pipe->wait, &wait, |
| 49 | TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); | 49 | TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); |
| 50 | if (pipe->inode) | 50 | if (pipe->inode) |
| 51 | mutex_unlock(&pipe->inode->i_mutex); | 51 | mutex_unlock(&pipe->inode->i_mutex); |
| 52 | schedule(); | 52 | schedule(); |
| 53 | finish_wait(&pipe->wait, &wait); | 53 | finish_wait(&pipe->wait, &wait); |
| 54 | if (pipe->inode) | 54 | if (pipe->inode) |
| 55 | mutex_lock(&pipe->inode->i_mutex); | 55 | mutex_lock(&pipe->inode->i_mutex); |
| 56 | } | 56 | } |
| 57 | 57 | ||
| 58 | static int | 58 | static int |
| 59 | pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, | 59 | pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, |
| 60 | int atomic) | 60 | int atomic) |
| 61 | { | 61 | { |
| 62 | unsigned long copy; | 62 | unsigned long copy; |
| 63 | 63 | ||
| 64 | while (len > 0) { | 64 | while (len > 0) { |
| 65 | while (!iov->iov_len) | 65 | while (!iov->iov_len) |
| 66 | iov++; | 66 | iov++; |
| 67 | copy = min_t(unsigned long, len, iov->iov_len); | 67 | copy = min_t(unsigned long, len, iov->iov_len); |
| 68 | 68 | ||
| 69 | if (atomic) { | 69 | if (atomic) { |
| 70 | if (__copy_from_user_inatomic(to, iov->iov_base, copy)) | 70 | if (__copy_from_user_inatomic(to, iov->iov_base, copy)) |
| 71 | return -EFAULT; | 71 | return -EFAULT; |
| 72 | } else { | 72 | } else { |
| 73 | if (copy_from_user(to, iov->iov_base, copy)) | 73 | if (copy_from_user(to, iov->iov_base, copy)) |
| 74 | return -EFAULT; | 74 | return -EFAULT; |
| 75 | } | 75 | } |
| 76 | to += copy; | 76 | to += copy; |
| 77 | len -= copy; | 77 | len -= copy; |
| 78 | iov->iov_base += copy; | 78 | iov->iov_base += copy; |
| 79 | iov->iov_len -= copy; | 79 | iov->iov_len -= copy; |
| 80 | } | 80 | } |
| 81 | return 0; | 81 | return 0; |
| 82 | } | 82 | } |
| 83 | 83 | ||
| 84 | static int | 84 | static int |
| 85 | pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, | 85 | pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, |
| 86 | int atomic) | 86 | int atomic) |
| 87 | { | 87 | { |
| 88 | unsigned long copy; | 88 | unsigned long copy; |
| 89 | 89 | ||
| 90 | while (len > 0) { | 90 | while (len > 0) { |
| 91 | while (!iov->iov_len) | 91 | while (!iov->iov_len) |
| 92 | iov++; | 92 | iov++; |
| 93 | copy = min_t(unsigned long, len, iov->iov_len); | 93 | copy = min_t(unsigned long, len, iov->iov_len); |
| 94 | 94 | ||
| 95 | if (atomic) { | 95 | if (atomic) { |
| 96 | if (__copy_to_user_inatomic(iov->iov_base, from, copy)) | 96 | if (__copy_to_user_inatomic(iov->iov_base, from, copy)) |
| 97 | return -EFAULT; | 97 | return -EFAULT; |
| 98 | } else { | 98 | } else { |
| 99 | if (copy_to_user(iov->iov_base, from, copy)) | 99 | if (copy_to_user(iov->iov_base, from, copy)) |
| 100 | return -EFAULT; | 100 | return -EFAULT; |
| 101 | } | 101 | } |
| 102 | from += copy; | 102 | from += copy; |
| 103 | len -= copy; | 103 | len -= copy; |
| 104 | iov->iov_base += copy; | 104 | iov->iov_base += copy; |
| 105 | iov->iov_len -= copy; | 105 | iov->iov_len -= copy; |
| 106 | } | 106 | } |
| 107 | return 0; | 107 | return 0; |
| 108 | } | 108 | } |
| 109 | 109 | ||
| 110 | /* | 110 | /* |
| 111 | * Attempt to pre-fault in the user memory, so we can use atomic copies. | 111 | * Attempt to pre-fault in the user memory, so we can use atomic copies. |
| 112 | * Returns the number of bytes not faulted in. | 112 | * Returns the number of bytes not faulted in. |
| 113 | */ | 113 | */ |
| 114 | static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) | 114 | static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) |
| 115 | { | 115 | { |
| 116 | while (!iov->iov_len) | 116 | while (!iov->iov_len) |
| 117 | iov++; | 117 | iov++; |
| 118 | 118 | ||
| 119 | while (len > 0) { | 119 | while (len > 0) { |
| 120 | unsigned long this_len; | 120 | unsigned long this_len; |
| 121 | 121 | ||
| 122 | this_len = min_t(unsigned long, len, iov->iov_len); | 122 | this_len = min_t(unsigned long, len, iov->iov_len); |
| 123 | if (fault_in_pages_writeable(iov->iov_base, this_len)) | 123 | if (fault_in_pages_writeable(iov->iov_base, this_len)) |
| 124 | break; | 124 | break; |
| 125 | 125 | ||
| 126 | len -= this_len; | 126 | len -= this_len; |
| 127 | iov++; | 127 | iov++; |
| 128 | } | 128 | } |
| 129 | 129 | ||
| 130 | return len; | 130 | return len; |
| 131 | } | 131 | } |
| 132 | 132 | ||
| 133 | /* | 133 | /* |
| 134 | * Pre-fault in the user memory, so we can use atomic copies. | 134 | * Pre-fault in the user memory, so we can use atomic copies. |
| 135 | */ | 135 | */ |
| 136 | static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len) | 136 | static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len) |
| 137 | { | 137 | { |
| 138 | while (!iov->iov_len) | 138 | while (!iov->iov_len) |
| 139 | iov++; | 139 | iov++; |
| 140 | 140 | ||
| 141 | while (len > 0) { | 141 | while (len > 0) { |
| 142 | unsigned long this_len; | 142 | unsigned long this_len; |
| 143 | 143 | ||
| 144 | this_len = min_t(unsigned long, len, iov->iov_len); | 144 | this_len = min_t(unsigned long, len, iov->iov_len); |
| 145 | fault_in_pages_readable(iov->iov_base, this_len); | 145 | fault_in_pages_readable(iov->iov_base, this_len); |
| 146 | len -= this_len; | 146 | len -= this_len; |
| 147 | iov++; | 147 | iov++; |
| 148 | } | 148 | } |
| 149 | } | 149 | } |
| 150 | 150 | ||
| 151 | static void anon_pipe_buf_release(struct pipe_inode_info *pipe, | 151 | static void anon_pipe_buf_release(struct pipe_inode_info *pipe, |
| 152 | struct pipe_buffer *buf) | 152 | struct pipe_buffer *buf) |
| 153 | { | 153 | { |
| 154 | struct page *page = buf->page; | 154 | struct page *page = buf->page; |
| 155 | 155 | ||
| 156 | /* | 156 | /* |
| 157 | * If nobody else uses this page, and we don't already have a | 157 | * If nobody else uses this page, and we don't already have a |
| 158 | * temporary page, let's keep track of it as a one-deep | 158 | * temporary page, let's keep track of it as a one-deep |
| 159 | * allocation cache. (Otherwise just release our reference to it) | 159 | * allocation cache. (Otherwise just release our reference to it) |
| 160 | */ | 160 | */ |
| 161 | if (page_count(page) == 1 && !pipe->tmp_page) | 161 | if (page_count(page) == 1 && !pipe->tmp_page) |
| 162 | pipe->tmp_page = page; | 162 | pipe->tmp_page = page; |
| 163 | else | 163 | else |
| 164 | page_cache_release(page); | 164 | page_cache_release(page); |
| 165 | } | 165 | } |
| 166 | 166 | ||
| 167 | void *generic_pipe_buf_map(struct pipe_inode_info *pipe, | 167 | void *generic_pipe_buf_map(struct pipe_inode_info *pipe, |
| 168 | struct pipe_buffer *buf, int atomic) | 168 | struct pipe_buffer *buf, int atomic) |
| 169 | { | 169 | { |
| 170 | if (atomic) { | 170 | if (atomic) { |
| 171 | buf->flags |= PIPE_BUF_FLAG_ATOMIC; | 171 | buf->flags |= PIPE_BUF_FLAG_ATOMIC; |
| 172 | return kmap_atomic(buf->page, KM_USER0); | 172 | return kmap_atomic(buf->page, KM_USER0); |
| 173 | } | 173 | } |
| 174 | 174 | ||
| 175 | return kmap(buf->page); | 175 | return kmap(buf->page); |
| 176 | } | 176 | } |
| 177 | 177 | ||
| 178 | void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, | 178 | void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, |
| 179 | struct pipe_buffer *buf, void *map_data) | 179 | struct pipe_buffer *buf, void *map_data) |
| 180 | { | 180 | { |
| 181 | if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { | 181 | if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { |
| 182 | buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; | 182 | buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; |
| 183 | kunmap_atomic(map_data, KM_USER0); | 183 | kunmap_atomic(map_data, KM_USER0); |
| 184 | } else | 184 | } else |
| 185 | kunmap(buf->page); | 185 | kunmap(buf->page); |
| 186 | } | 186 | } |
| 187 | 187 | ||
| 188 | int generic_pipe_buf_steal(struct pipe_inode_info *pipe, | 188 | int generic_pipe_buf_steal(struct pipe_inode_info *pipe, |
| 189 | struct pipe_buffer *buf) | 189 | struct pipe_buffer *buf) |
| 190 | { | 190 | { |
| 191 | struct page *page = buf->page; | 191 | struct page *page = buf->page; |
| 192 | 192 | ||
| 193 | if (page_count(page) == 1) { | 193 | if (page_count(page) == 1) { |
| 194 | lock_page(page); | 194 | lock_page(page); |
| 195 | return 0; | 195 | return 0; |
| 196 | } | 196 | } |
| 197 | 197 | ||
| 198 | return 1; | 198 | return 1; |
| 199 | } | 199 | } |
| 200 | 200 | ||
| 201 | void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) | 201 | void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) |
| 202 | { | 202 | { |
| 203 | page_cache_get(buf->page); | 203 | page_cache_get(buf->page); |
| 204 | } | 204 | } |
| 205 | 205 | ||
| 206 | int generic_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf) | 206 | int generic_pipe_buf_confirm(struct pipe_inode_info *info, |
| 207 | struct pipe_buffer *buf) | ||
| 207 | { | 208 | { |
| 208 | return 0; | 209 | return 0; |
| 209 | } | 210 | } |
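The renamed helper above is intentionally a no-op: an anonymous pipe buffer is always backed by a resident page that pipe_write() filled in completely, so there is nothing left to verify and it can return 0 unconditionally. The operation only does real work for buffer types whose pages may still be in flight when a reader gets to them. As an illustration, a ->confirm() for a page-cache-backed buffer could look roughly like the sketch below (the function name is hypothetical; the checks follow the usual pattern for validating a possibly-in-flight pagecache page):

static int example_page_buf_confirm(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * The page was truncated or invalidated while it sat
		 * in the pipe: there is no data to hand out.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/* Read I/O against the page failed. */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

On success the caller may go ahead and ->map() the page; on any non-zero return the transfer is aborted before the page is ever touched.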
| 210 | 211 | ||
| 211 | static const struct pipe_buf_operations anon_pipe_buf_ops = { | 212 | static const struct pipe_buf_operations anon_pipe_buf_ops = { |
| 212 | .can_merge = 1, | 213 | .can_merge = 1, |
| 213 | .map = generic_pipe_buf_map, | 214 | .map = generic_pipe_buf_map, |
| 214 | .unmap = generic_pipe_buf_unmap, | 215 | .unmap = generic_pipe_buf_unmap, |
| 215 | .pin = generic_pipe_buf_pin, | 216 | .confirm = generic_pipe_buf_confirm, |
| 216 | .release = anon_pipe_buf_release, | 217 | .release = anon_pipe_buf_release, |
| 217 | .steal = generic_pipe_buf_steal, | 218 | .steal = generic_pipe_buf_steal, |
| 218 | .get = generic_pipe_buf_get, | 219 | .get = generic_pipe_buf_get, |
| 219 | }; | 220 | }; |
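After the rename, a buffer type wires up .confirm where it previously wired up .pin; the rest of the table is unchanged. A sketch of what a hypothetical non-anonymous buffer's table might look like, reusing the generic helpers (the example_* names are made up, and example_page_buf_confirm refers to the sketch above):

static void example_buf_release(struct pipe_inode_info *pipe,
				struct pipe_buffer *buf)
{
	page_cache_release(buf->page);	/* drop the buffer's page reference */
}

static const struct pipe_buf_operations example_pipe_buf_ops = {
	.can_merge = 0,				/* never append writes into these pages */
	.map	   = generic_pipe_buf_map,
	.unmap	   = generic_pipe_buf_unmap,
	.confirm   = example_page_buf_confirm,	/* hypothetical, sketched above */
	.release   = example_buf_release,
	.steal	   = generic_pipe_buf_steal,
	.get	   = generic_pipe_buf_get,
};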
| 220 | 221 | ||
| 221 | static ssize_t | 222 | static ssize_t |
| 222 | pipe_read(struct kiocb *iocb, const struct iovec *_iov, | 223 | pipe_read(struct kiocb *iocb, const struct iovec *_iov, |
| 223 | unsigned long nr_segs, loff_t pos) | 224 | unsigned long nr_segs, loff_t pos) |
| 224 | { | 225 | { |
| 225 | struct file *filp = iocb->ki_filp; | 226 | struct file *filp = iocb->ki_filp; |
| 226 | struct inode *inode = filp->f_path.dentry->d_inode; | 227 | struct inode *inode = filp->f_path.dentry->d_inode; |
| 227 | struct pipe_inode_info *pipe; | 228 | struct pipe_inode_info *pipe; |
| 228 | int do_wakeup; | 229 | int do_wakeup; |
| 229 | ssize_t ret; | 230 | ssize_t ret; |
| 230 | struct iovec *iov = (struct iovec *)_iov; | 231 | struct iovec *iov = (struct iovec *)_iov; |
| 231 | size_t total_len; | 232 | size_t total_len; |
| 232 | 233 | ||
| 233 | total_len = iov_length(iov, nr_segs); | 234 | total_len = iov_length(iov, nr_segs); |
| 234 | /* Null read succeeds. */ | 235 | /* Null read succeeds. */ |
| 235 | if (unlikely(total_len == 0)) | 236 | if (unlikely(total_len == 0)) |
| 236 | return 0; | 237 | return 0; |
| 237 | 238 | ||
| 238 | do_wakeup = 0; | 239 | do_wakeup = 0; |
| 239 | ret = 0; | 240 | ret = 0; |
| 240 | mutex_lock(&inode->i_mutex); | 241 | mutex_lock(&inode->i_mutex); |
| 241 | pipe = inode->i_pipe; | 242 | pipe = inode->i_pipe; |
| 242 | for (;;) { | 243 | for (;;) { |
| 243 | int bufs = pipe->nrbufs; | 244 | int bufs = pipe->nrbufs; |
| 244 | if (bufs) { | 245 | if (bufs) { |
| 245 | int curbuf = pipe->curbuf; | 246 | int curbuf = pipe->curbuf; |
| 246 | struct pipe_buffer *buf = pipe->bufs + curbuf; | 247 | struct pipe_buffer *buf = pipe->bufs + curbuf; |
| 247 | const struct pipe_buf_operations *ops = buf->ops; | 248 | const struct pipe_buf_operations *ops = buf->ops; |
| 248 | void *addr; | 249 | void *addr; |
| 249 | size_t chars = buf->len; | 250 | size_t chars = buf->len; |
| 250 | int error, atomic; | 251 | int error, atomic; |
| 251 | 252 | ||
| 252 | if (chars > total_len) | 253 | if (chars > total_len) |
| 253 | chars = total_len; | 254 | chars = total_len; |
| 254 | 255 | ||
| 255 | error = ops->pin(pipe, buf); | 256 | error = ops->confirm(pipe, buf); |
| 256 | if (error) { | 257 | if (error) { |
| 257 | if (!ret) | 258 | if (!ret) |
| 258 | error = ret; | 259 | error = ret; |
| 259 | break; | 260 | break; |
| 260 | } | 261 | } |
| 261 | 262 | ||
| 262 | atomic = !iov_fault_in_pages_write(iov, chars); | 263 | atomic = !iov_fault_in_pages_write(iov, chars); |
| 263 | redo: | 264 | redo: |
| 264 | addr = ops->map(pipe, buf, atomic); | 265 | addr = ops->map(pipe, buf, atomic); |
| 265 | error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); | 266 | error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); |
| 266 | ops->unmap(pipe, buf, addr); | 267 | ops->unmap(pipe, buf, addr); |
| 267 | if (unlikely(error)) { | 268 | if (unlikely(error)) { |
| 268 | /* | 269 | /* |
| 269 | * Just retry with the slow path if we failed. | 270 | * Just retry with the slow path if we failed. |
| 270 | */ | 271 | */ |
| 271 | if (atomic) { | 272 | if (atomic) { |
| 272 | atomic = 0; | 273 | atomic = 0; |
| 273 | goto redo; | 274 | goto redo; |
| 274 | } | 275 | } |
| 275 | if (!ret) | 276 | if (!ret) |
| 276 | ret = error; | 277 | ret = error; |
| 277 | break; | 278 | break; |
| 278 | } | 279 | } |
| 279 | ret += chars; | 280 | ret += chars; |
| 280 | buf->offset += chars; | 281 | buf->offset += chars; |
| 281 | buf->len -= chars; | 282 | buf->len -= chars; |
| 282 | if (!buf->len) { | 283 | if (!buf->len) { |
| 283 | buf->ops = NULL; | 284 | buf->ops = NULL; |
| 284 | ops->release(pipe, buf); | 285 | ops->release(pipe, buf); |
| 285 | curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); | 286 | curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); |
| 286 | pipe->curbuf = curbuf; | 287 | pipe->curbuf = curbuf; |
| 287 | pipe->nrbufs = --bufs; | 288 | pipe->nrbufs = --bufs; |
| 288 | do_wakeup = 1; | 289 | do_wakeup = 1; |
| 289 | } | 290 | } |
| 290 | total_len -= chars; | 291 | total_len -= chars; |
| 291 | if (!total_len) | 292 | if (!total_len) |
| 292 | break; /* common path: read succeeded */ | 293 | break; /* common path: read succeeded */ |
| 293 | } | 294 | } |
| 294 | if (bufs) /* More to do? */ | 295 | if (bufs) /* More to do? */ |
| 295 | continue; | 296 | continue; |
| 296 | if (!pipe->writers) | 297 | if (!pipe->writers) |
| 297 | break; | 298 | break; |
| 298 | if (!pipe->waiting_writers) { | 299 | if (!pipe->waiting_writers) { |
| 299 | /* syscall merging: Usually we must not sleep | 300 | /* syscall merging: Usually we must not sleep |
| 300 | * if O_NONBLOCK is set, or if we got some data. | 301 | * if O_NONBLOCK is set, or if we got some data. |
| 301 | * But if a writer sleeps in kernel space, then | 302 | * But if a writer sleeps in kernel space, then |
| 302 | * we can wait for that data without violating POSIX. | 303 | * we can wait for that data without violating POSIX. |
| 303 | */ | 304 | */ |
| 304 | if (ret) | 305 | if (ret) |
| 305 | break; | 306 | break; |
| 306 | if (filp->f_flags & O_NONBLOCK) { | 307 | if (filp->f_flags & O_NONBLOCK) { |
| 307 | ret = -EAGAIN; | 308 | ret = -EAGAIN; |
| 308 | break; | 309 | break; |
| 309 | } | 310 | } |
| 310 | } | 311 | } |
| 311 | if (signal_pending(current)) { | 312 | if (signal_pending(current)) { |
| 312 | if (!ret) | 313 | if (!ret) |
| 313 | ret = -ERESTARTSYS; | 314 | ret = -ERESTARTSYS; |
| 314 | break; | 315 | break; |
| 315 | } | 316 | } |
| 316 | if (do_wakeup) { | 317 | if (do_wakeup) { |
| 317 | wake_up_interruptible_sync(&pipe->wait); | 318 | wake_up_interruptible_sync(&pipe->wait); |
| 318 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); | 319 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); |
| 319 | } | 320 | } |
| 320 | pipe_wait(pipe); | 321 | pipe_wait(pipe); |
| 321 | } | 322 | } |
| 322 | mutex_unlock(&inode->i_mutex); | 323 | mutex_unlock(&inode->i_mutex); |
| 323 | 324 | ||
| 324 | /* Signal writers asynchronously that there is more room. */ | 325 | /* Signal writers asynchronously that there is more room. */ |
| 325 | if (do_wakeup) { | 326 | if (do_wakeup) { |
| 326 | wake_up_interruptible(&pipe->wait); | 327 | wake_up_interruptible(&pipe->wait); |
| 327 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); | 328 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); |
| 328 | } | 329 | } |
| 329 | if (ret > 0) | 330 | if (ret > 0) |
| 330 | file_accessed(filp); | 331 | file_accessed(filp); |
| 331 | return ret; | 332 | return ret; |
| 332 | } | 333 | } |
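One pre-existing wrinkle in pipe_read() is worth flagging, since this commit renames the call but leaves the behavior alone: in the confirm-failure branch near the top of the copy loop, "if (!ret) error = ret;" overwrites the error code with zero instead of propagating it, so a failed confirm on the first buffer makes the read return 0 rather than an error. The copy-failure branch a few lines below assigns in the other direction ("ret = error;"), which is presumably what was intended here as well. A sketch of the symmetric handling:

	error = ops->confirm(pipe, buf);
	if (error) {
		if (!ret)
			ret = error;	/* propagate the confirm failure */
		break;
	}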
| 333 | 334 | ||
| 334 | static ssize_t | 335 | static ssize_t |
| 335 | pipe_write(struct kiocb *iocb, const struct iovec *_iov, | 336 | pipe_write(struct kiocb *iocb, const struct iovec *_iov, |
| 336 | unsigned long nr_segs, loff_t ppos) | 337 | unsigned long nr_segs, loff_t ppos) |
| 337 | { | 338 | { |
| 338 | struct file *filp = iocb->ki_filp; | 339 | struct file *filp = iocb->ki_filp; |
| 339 | struct inode *inode = filp->f_path.dentry->d_inode; | 340 | struct inode *inode = filp->f_path.dentry->d_inode; |
| 340 | struct pipe_inode_info *pipe; | 341 | struct pipe_inode_info *pipe; |
| 341 | ssize_t ret; | 342 | ssize_t ret; |
| 342 | int do_wakeup; | 343 | int do_wakeup; |
| 343 | struct iovec *iov = (struct iovec *)_iov; | 344 | struct iovec *iov = (struct iovec *)_iov; |
| 344 | size_t total_len; | 345 | size_t total_len; |
| 345 | ssize_t chars; | 346 | ssize_t chars; |
| 346 | 347 | ||
| 347 | total_len = iov_length(iov, nr_segs); | 348 | total_len = iov_length(iov, nr_segs); |
| 348 | /* Null write succeeds. */ | 349 | /* Null write succeeds. */ |
| 349 | if (unlikely(total_len == 0)) | 350 | if (unlikely(total_len == 0)) |
| 350 | return 0; | 351 | return 0; |
| 351 | 352 | ||
| 352 | do_wakeup = 0; | 353 | do_wakeup = 0; |
| 353 | ret = 0; | 354 | ret = 0; |
| 354 | mutex_lock(&inode->i_mutex); | 355 | mutex_lock(&inode->i_mutex); |
| 355 | pipe = inode->i_pipe; | 356 | pipe = inode->i_pipe; |
| 356 | 357 | ||
| 357 | if (!pipe->readers) { | 358 | if (!pipe->readers) { |
| 358 | send_sig(SIGPIPE, current, 0); | 359 | send_sig(SIGPIPE, current, 0); |
| 359 | ret = -EPIPE; | 360 | ret = -EPIPE; |
| 360 | goto out; | 361 | goto out; |
| 361 | } | 362 | } |
| 362 | 363 | ||
| 363 | /* We try to merge small writes */ | 364 | /* We try to merge small writes */ |
| 364 | chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ | 365 | chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ |
| 365 | if (pipe->nrbufs && chars != 0) { | 366 | if (pipe->nrbufs && chars != 0) { |
| 366 | int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & | 367 | int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & |
| 367 | (PIPE_BUFFERS-1); | 368 | (PIPE_BUFFERS-1); |
| 368 | struct pipe_buffer *buf = pipe->bufs + lastbuf; | 369 | struct pipe_buffer *buf = pipe->bufs + lastbuf; |
| 369 | const struct pipe_buf_operations *ops = buf->ops; | 370 | const struct pipe_buf_operations *ops = buf->ops; |
| 370 | int offset = buf->offset + buf->len; | 371 | int offset = buf->offset + buf->len; |
| 371 | 372 | ||
| 372 | if (ops->can_merge && offset + chars <= PAGE_SIZE) { | 373 | if (ops->can_merge && offset + chars <= PAGE_SIZE) { |
| 373 | int error, atomic = 1; | 374 | int error, atomic = 1; |
| 374 | void *addr; | 375 | void *addr; |
| 375 | 376 | ||
| 376 | error = ops->pin(pipe, buf); | 377 | error = ops->confirm(pipe, buf); |
| 377 | if (error) | 378 | if (error) |
| 378 | goto out; | 379 | goto out; |
| 379 | 380 | ||
| 380 | iov_fault_in_pages_read(iov, chars); | 381 | iov_fault_in_pages_read(iov, chars); |
| 381 | redo1: | 382 | redo1: |
| 382 | addr = ops->map(pipe, buf, atomic); | 383 | addr = ops->map(pipe, buf, atomic); |
| 383 | error = pipe_iov_copy_from_user(offset + addr, iov, | 384 | error = pipe_iov_copy_from_user(offset + addr, iov, |
| 384 | chars, atomic); | 385 | chars, atomic); |
| 385 | ops->unmap(pipe, buf, addr); | 386 | ops->unmap(pipe, buf, addr); |
| 386 | ret = error; | 387 | ret = error; |
| 387 | do_wakeup = 1; | 388 | do_wakeup = 1; |
| 388 | if (error) { | 389 | if (error) { |
| 389 | if (atomic) { | 390 | if (atomic) { |
| 390 | atomic = 0; | 391 | atomic = 0; |
| 391 | goto redo1; | 392 | goto redo1; |
| 392 | } | 393 | } |
| 393 | goto out; | 394 | goto out; |
| 394 | } | 395 | } |
| 395 | buf->len += chars; | 396 | buf->len += chars; |
| 396 | total_len -= chars; | 397 | total_len -= chars; |
| 397 | ret = chars; | 398 | ret = chars; |
| 398 | if (!total_len) | 399 | if (!total_len) |
| 399 | goto out; | 400 | goto out; |
| 400 | } | 401 | } |
| 401 | } | 402 | } |
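The second renamed call site sits in pipe_write()'s merge fast path above. The only buffer type in this file with can_merge set is anon_pipe_buf_ops, whose confirm is the no-op helper, so the check cannot actually fail on this path today; it is kept so that every access to a buffer's page goes through the same gate. Reduced to a skeleton (illustrative, with the length/offset bookkeeping elided into a comment):

	if (ops->can_merge && offset + chars <= PAGE_SIZE) {
		/* Never touch buf->page before a successful confirm. */
		error = ops->confirm(pipe, buf);
		if (error)
			goto out;

		addr = ops->map(pipe, buf, atomic);
		error = pipe_iov_copy_from_user(offset + addr, iov,
						chars, atomic);
		ops->unmap(pipe, buf, addr);
		/* on success: buf->len += chars, total_len -= chars, etc. */
	}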
| 402 | 403 | ||
| 403 | for (;;) { | 404 | for (;;) { |
| 404 | int bufs; | 405 | int bufs; |
| 405 | 406 | ||
| 406 | if (!pipe->readers) { | 407 | if (!pipe->readers) { |
| 407 | send_sig(SIGPIPE, current, 0); | 408 | send_sig(SIGPIPE, current, 0); |
| 408 | if (!ret) | 409 | if (!ret) |
| 409 | ret = -EPIPE; | 410 | ret = -EPIPE; |
| 410 | break; | 411 | break; |
| 411 | } | 412 | } |
| 412 | bufs = pipe->nrbufs; | 413 | bufs = pipe->nrbufs; |
| 413 | if (bufs < PIPE_BUFFERS) { | 414 | if (bufs < PIPE_BUFFERS) { |
| 414 | int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); | 415 | int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); |
| 415 | struct pipe_buffer *buf = pipe->bufs + newbuf; | 416 | struct pipe_buffer *buf = pipe->bufs + newbuf; |
| 416 | struct page *page = pipe->tmp_page; | 417 | struct page *page = pipe->tmp_page; |
| 417 | char *src; | 418 | char *src; |
| 418 | int error, atomic = 1; | 419 | int error, atomic = 1; |
| 419 | 420 | ||
| 420 | if (!page) { | 421 | if (!page) { |
| 421 | page = alloc_page(GFP_HIGHUSER); | 422 | page = alloc_page(GFP_HIGHUSER); |
| 422 | if (unlikely(!page)) { | 423 | if (unlikely(!page)) { |
| 423 | ret = ret ? : -ENOMEM; | 424 | ret = ret ? : -ENOMEM; |
| 424 | break; | 425 | break; |
| 425 | } | 426 | } |
| 426 | pipe->tmp_page = page; | 427 | pipe->tmp_page = page; |
| 427 | } | 428 | } |
| 428 | /* Always wake up, even if the copy fails. Otherwise | 429 | /* Always wake up, even if the copy fails. Otherwise |
| 429 | * we lock up (O_NONBLOCK-)readers that sleep due to | 430 | * we lock up (O_NONBLOCK-)readers that sleep due to |
| 430 | * syscall merging. | 431 | * syscall merging. |
| 431 | * FIXME! Is this really true? | 432 | * FIXME! Is this really true? |
| 432 | */ | 433 | */ |
| 433 | do_wakeup = 1; | 434 | do_wakeup = 1; |
| 434 | chars = PAGE_SIZE; | 435 | chars = PAGE_SIZE; |
| 435 | if (chars > total_len) | 436 | if (chars > total_len) |
| 436 | chars = total_len; | 437 | chars = total_len; |
| 437 | 438 | ||
| 438 | iov_fault_in_pages_read(iov, chars); | 439 | iov_fault_in_pages_read(iov, chars); |
| 439 | redo2: | 440 | redo2: |
| 440 | if (atomic) | 441 | if (atomic) |
| 441 | src = kmap_atomic(page, KM_USER0); | 442 | src = kmap_atomic(page, KM_USER0); |
| 442 | else | 443 | else |
| 443 | src = kmap(page); | 444 | src = kmap(page); |
| 444 | 445 | ||
| 445 | error = pipe_iov_copy_from_user(src, iov, chars, | 446 | error = pipe_iov_copy_from_user(src, iov, chars, |
| 446 | atomic); | 447 | atomic); |
| 447 | if (atomic) | 448 | if (atomic) |
| 448 | kunmap_atomic(src, KM_USER0); | 449 | kunmap_atomic(src, KM_USER0); |
| 449 | else | 450 | else |
| 450 | kunmap(page); | 451 | kunmap(page); |
| 451 | 452 | ||
| 452 | if (unlikely(error)) { | 453 | if (unlikely(error)) { |
| 453 | if (atomic) { | 454 | if (atomic) { |
| 454 | atomic = 0; | 455 | atomic = 0; |
| 455 | goto redo2; | 456 | goto redo2; |
| 456 | } | 457 | } |
| 457 | if (!ret) | 458 | if (!ret) |
| 458 | ret = error; | 459 | ret = error; |
| 459 | break; | 460 | break; |
| 460 | } | 461 | } |
| 461 | ret += chars; | 462 | ret += chars; |
| 462 | 463 | ||
| 463 | /* Insert it into the buffer array */ | 464 | /* Insert it into the buffer array */ |
| 464 | buf->page = page; | 465 | buf->page = page; |
| 465 | buf->ops = &anon_pipe_buf_ops; | 466 | buf->ops = &anon_pipe_buf_ops; |
| 466 | buf->offset = 0; | 467 | buf->offset = 0; |
| 467 | buf->len = chars; | 468 | buf->len = chars; |
| 468 | pipe->nrbufs = ++bufs; | 469 | pipe->nrbufs = ++bufs; |
| 469 | pipe->tmp_page = NULL; | 470 | pipe->tmp_page = NULL; |
| 470 | 471 | ||
| 471 | total_len -= chars; | 472 | total_len -= chars; |
| 472 | if (!total_len) | 473 | if (!total_len) |
| 473 | break; | 474 | break; |
| 474 | } | 475 | } |
| 475 | if (bufs < PIPE_BUFFERS) | 476 | if (bufs < PIPE_BUFFERS) |
| 476 | continue; | 477 | continue; |
| 477 | if (filp->f_flags & O_NONBLOCK) { | 478 | if (filp->f_flags & O_NONBLOCK) { |
| 478 | if (!ret) | 479 | if (!ret) |
| 479 | ret = -EAGAIN; | 480 | ret = -EAGAIN; |
| 480 | break; | 481 | break; |
| 481 | } | 482 | } |
| 482 | if (signal_pending(current)) { | 483 | if (signal_pending(current)) { |
| 483 | if (!ret) | 484 | if (!ret) |
| 484 | ret = -ERESTARTSYS; | 485 | ret = -ERESTARTSYS; |
| 485 | break; | 486 | break; |
| 486 | } | 487 | } |
| 487 | if (do_wakeup) { | 488 | if (do_wakeup) { |
| 488 | wake_up_interruptible_sync(&pipe->wait); | 489 | wake_up_interruptible_sync(&pipe->wait); |
| 489 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); | 490 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); |
| 490 | do_wakeup = 0; | 491 | do_wakeup = 0; |
| 491 | } | 492 | } |
| 492 | pipe->waiting_writers++; | 493 | pipe->waiting_writers++; |
| 493 | pipe_wait(pipe); | 494 | pipe_wait(pipe); |
| 494 | pipe->waiting_writers--; | 495 | pipe->waiting_writers--; |
| 495 | } | 496 | } |
| 496 | out: | 497 | out: |
| 497 | mutex_unlock(&inode->i_mutex); | 498 | mutex_unlock(&inode->i_mutex); |
| 498 | if (do_wakeup) { | 499 | if (do_wakeup) { |
| 499 | wake_up_interruptible(&pipe->wait); | 500 | wake_up_interruptible(&pipe->wait); |
| 500 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); | 501 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); |
| 501 | } | 502 | } |
| 502 | if (ret > 0) | 503 | if (ret > 0) |
| 503 | file_update_time(filp); | 504 | file_update_time(filp); |
| 504 | return ret; | 505 | return ret; |
| 505 | } | 506 | } |
| 506 | 507 | ||
| 507 | static ssize_t | 508 | static ssize_t |
| 508 | bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) | 509 | bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) |
| 509 | { | 510 | { |
| 510 | return -EBADF; | 511 | return -EBADF; |
| 511 | } | 512 | } |
| 512 | 513 | ||
| 513 | static ssize_t | 514 | static ssize_t |
| 514 | bad_pipe_w(struct file *filp, const char __user *buf, size_t count, | 515 | bad_pipe_w(struct file *filp, const char __user *buf, size_t count, |
| 515 | loff_t *ppos) | 516 | loff_t *ppos) |
| 516 | { | 517 | { |
| 517 | return -EBADF; | 518 | return -EBADF; |
| 518 | } | 519 | } |
| 519 | 520 | ||
| 520 | static int | 521 | static int |
| 521 | pipe_ioctl(struct inode *pino, struct file *filp, | 522 | pipe_ioctl(struct inode *pino, struct file *filp, |
| 522 | unsigned int cmd, unsigned long arg) | 523 | unsigned int cmd, unsigned long arg) |
| 523 | { | 524 | { |
| 524 | struct inode *inode = filp->f_path.dentry->d_inode; | 525 | struct inode *inode = filp->f_path.dentry->d_inode; |
| 525 | struct pipe_inode_info *pipe; | 526 | struct pipe_inode_info *pipe; |
| 526 | int count, buf, nrbufs; | 527 | int count, buf, nrbufs; |
| 527 | 528 | ||
| 528 | switch (cmd) { | 529 | switch (cmd) { |
| 529 | case FIONREAD: | 530 | case FIONREAD: |
| 530 | mutex_lock(&inode->i_mutex); | 531 | mutex_lock(&inode->i_mutex); |
| 531 | pipe = inode->i_pipe; | 532 | pipe = inode->i_pipe; |
| 532 | count = 0; | 533 | count = 0; |
| 533 | buf = pipe->curbuf; | 534 | buf = pipe->curbuf; |
| 534 | nrbufs = pipe->nrbufs; | 535 | nrbufs = pipe->nrbufs; |
| 535 | while (--nrbufs >= 0) { | 536 | while (--nrbufs >= 0) { |
| 536 | count += pipe->bufs[buf].len; | 537 | count += pipe->bufs[buf].len; |
| 537 | buf = (buf+1) & (PIPE_BUFFERS-1); | 538 | buf = (buf+1) & (PIPE_BUFFERS-1); |
| 538 | } | 539 | } |
| 539 | mutex_unlock(&inode->i_mutex); | 540 | mutex_unlock(&inode->i_mutex); |
| 540 | 541 | ||
| 541 | return put_user(count, (int __user *)arg); | 542 | return put_user(count, (int __user *)arg); |
| 542 | default: | 543 | default: |
| 543 | return -EINVAL; | 544 | return -EINVAL; |
| 544 | } | 545 | } |
| 545 | } | 546 | } |
| 546 | 547 | ||
| 547 | /* No kernel lock held - fine */ | 548 | /* No kernel lock held - fine */ |
| 548 | static unsigned int | 549 | static unsigned int |
| 549 | pipe_poll(struct file *filp, poll_table *wait) | 550 | pipe_poll(struct file *filp, poll_table *wait) |
| 550 | { | 551 | { |
| 551 | unsigned int mask; | 552 | unsigned int mask; |
| 552 | struct inode *inode = filp->f_path.dentry->d_inode; | 553 | struct inode *inode = filp->f_path.dentry->d_inode; |
| 553 | struct pipe_inode_info *pipe = inode->i_pipe; | 554 | struct pipe_inode_info *pipe = inode->i_pipe; |
| 554 | int nrbufs; | 555 | int nrbufs; |
| 555 | 556 | ||
| 556 | poll_wait(filp, &pipe->wait, wait); | 557 | poll_wait(filp, &pipe->wait, wait); |
| 557 | 558 | ||
| 558 | /* Reading only -- no need for acquiring the semaphore. */ | 559 | /* Reading only -- no need for acquiring the semaphore. */ |
| 559 | nrbufs = pipe->nrbufs; | 560 | nrbufs = pipe->nrbufs; |
| 560 | mask = 0; | 561 | mask = 0; |
| 561 | if (filp->f_mode & FMODE_READ) { | 562 | if (filp->f_mode & FMODE_READ) { |
| 562 | mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; | 563 | mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; |
| 563 | if (!pipe->writers && filp->f_version != pipe->w_counter) | 564 | if (!pipe->writers && filp->f_version != pipe->w_counter) |
| 564 | mask |= POLLHUP; | 565 | mask |= POLLHUP; |
| 565 | } | 566 | } |
| 566 | 567 | ||
| 567 | if (filp->f_mode & FMODE_WRITE) { | 568 | if (filp->f_mode & FMODE_WRITE) { |
| 568 | mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; | 569 | mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; |
| 569 | /* | 570 | /* |
| 570 | * Most Unices do not set POLLERR for FIFOs but on Linux they | 571 | * Most Unices do not set POLLERR for FIFOs but on Linux they |
| 571 | * behave exactly like pipes for poll(). | 572 | * behave exactly like pipes for poll(). |
| 572 | */ | 573 | */ |
| 573 | if (!pipe->readers) | 574 | if (!pipe->readers) |
| 574 | mask |= POLLERR; | 575 | mask |= POLLERR; |
| 575 | } | 576 | } |
| 576 | 577 | ||
| 577 | return mask; | 578 | return mask; |
| 578 | } | 579 | } |
| 579 | 580 | ||
| 580 | static int | 581 | static int |
| 581 | pipe_release(struct inode *inode, int decr, int decw) | 582 | pipe_release(struct inode *inode, int decr, int decw) |
| 582 | { | 583 | { |
| 583 | struct pipe_inode_info *pipe; | 584 | struct pipe_inode_info *pipe; |
| 584 | 585 | ||
| 585 | mutex_lock(&inode->i_mutex); | 586 | mutex_lock(&inode->i_mutex); |
| 586 | pipe = inode->i_pipe; | 587 | pipe = inode->i_pipe; |
| 587 | pipe->readers -= decr; | 588 | pipe->readers -= decr; |
| 588 | pipe->writers -= decw; | 589 | pipe->writers -= decw; |
| 589 | 590 | ||
| 590 | if (!pipe->readers && !pipe->writers) { | 591 | if (!pipe->readers && !pipe->writers) { |
| 591 | free_pipe_info(inode); | 592 | free_pipe_info(inode); |
| 592 | } else { | 593 | } else { |
| 593 | wake_up_interruptible(&pipe->wait); | 594 | wake_up_interruptible(&pipe->wait); |
| 594 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); | 595 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); |
| 595 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); | 596 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); |
| 596 | } | 597 | } |
| 597 | mutex_unlock(&inode->i_mutex); | 598 | mutex_unlock(&inode->i_mutex); |
| 598 | 599 | ||
| 599 | return 0; | 600 | return 0; |
| 600 | } | 601 | } |
| 601 | 602 | ||
| 602 | static int | 603 | static int |
| 603 | pipe_read_fasync(int fd, struct file *filp, int on) | 604 | pipe_read_fasync(int fd, struct file *filp, int on) |
| 604 | { | 605 | { |
| 605 | struct inode *inode = filp->f_path.dentry->d_inode; | 606 | struct inode *inode = filp->f_path.dentry->d_inode; |
| 606 | int retval; | 607 | int retval; |
| 607 | 608 | ||
| 608 | mutex_lock(&inode->i_mutex); | 609 | mutex_lock(&inode->i_mutex); |
| 609 | retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); | 610 | retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); |
| 610 | mutex_unlock(&inode->i_mutex); | 611 | mutex_unlock(&inode->i_mutex); |
| 611 | 612 | ||
| 612 | if (retval < 0) | 613 | if (retval < 0) |
| 613 | return retval; | 614 | return retval; |
| 614 | 615 | ||
| 615 | return 0; | 616 | return 0; |
| 616 | } | 617 | } |
| 617 | 618 | ||
| 618 | 619 | ||
| 619 | static int | 620 | static int |
| 620 | pipe_write_fasync(int fd, struct file *filp, int on) | 621 | pipe_write_fasync(int fd, struct file *filp, int on) |
| 621 | { | 622 | { |
| 622 | struct inode *inode = filp->f_path.dentry->d_inode; | 623 | struct inode *inode = filp->f_path.dentry->d_inode; |
| 623 | int retval; | 624 | int retval; |
| 624 | 625 | ||
| 625 | mutex_lock(&inode->i_mutex); | 626 | mutex_lock(&inode->i_mutex); |
| 626 | retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); | 627 | retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); |
| 627 | mutex_unlock(&inode->i_mutex); | 628 | mutex_unlock(&inode->i_mutex); |
| 628 | 629 | ||
| 629 | if (retval < 0) | 630 | if (retval < 0) |
| 630 | return retval; | 631 | return retval; |
| 631 | 632 | ||
| 632 | return 0; | 633 | return 0; |
| 633 | } | 634 | } |
| 634 | 635 | ||
| 635 | 636 | ||
| 636 | static int | 637 | static int |
| 637 | pipe_rdwr_fasync(int fd, struct file *filp, int on) | 638 | pipe_rdwr_fasync(int fd, struct file *filp, int on) |
| 638 | { | 639 | { |
| 639 | struct inode *inode = filp->f_path.dentry->d_inode; | 640 | struct inode *inode = filp->f_path.dentry->d_inode; |
| 640 | struct pipe_inode_info *pipe = inode->i_pipe; | 641 | struct pipe_inode_info *pipe = inode->i_pipe; |
| 641 | int retval; | 642 | int retval; |
| 642 | 643 | ||
| 643 | mutex_lock(&inode->i_mutex); | 644 | mutex_lock(&inode->i_mutex); |
| 644 | 645 | ||
| 645 | retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); | 646 | retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); |
| 646 | 647 | ||
| 647 | if (retval >= 0) | 648 | if (retval >= 0) |
| 648 | retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); | 649 | retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); |
| 649 | 650 | ||
| 650 | mutex_unlock(&inode->i_mutex); | 651 | mutex_unlock(&inode->i_mutex); |
| 651 | 652 | ||
| 652 | if (retval < 0) | 653 | if (retval < 0) |
| 653 | return retval; | 654 | return retval; |
| 654 | 655 | ||
| 655 | return 0; | 656 | return 0; |
| 656 | } | 657 | } |
| 657 | 658 | ||
| 658 | 659 | ||
| 659 | static int | 660 | static int |
| 660 | pipe_read_release(struct inode *inode, struct file *filp) | 661 | pipe_read_release(struct inode *inode, struct file *filp) |
| 661 | { | 662 | { |
| 662 | pipe_read_fasync(-1, filp, 0); | 663 | pipe_read_fasync(-1, filp, 0); |
| 663 | return pipe_release(inode, 1, 0); | 664 | return pipe_release(inode, 1, 0); |
| 664 | } | 665 | } |
| 665 | 666 | ||
| 666 | static int | 667 | static int |
| 667 | pipe_write_release(struct inode *inode, struct file *filp) | 668 | pipe_write_release(struct inode *inode, struct file *filp) |
| 668 | { | 669 | { |
| 669 | pipe_write_fasync(-1, filp, 0); | 670 | pipe_write_fasync(-1, filp, 0); |
| 670 | return pipe_release(inode, 0, 1); | 671 | return pipe_release(inode, 0, 1); |
| 671 | } | 672 | } |
| 672 | 673 | ||
| 673 | static int | 674 | static int |
| 674 | pipe_rdwr_release(struct inode *inode, struct file *filp) | 675 | pipe_rdwr_release(struct inode *inode, struct file *filp) |
| 675 | { | 676 | { |
| 676 | int decr, decw; | 677 | int decr, decw; |
| 677 | 678 | ||
| 678 | pipe_rdwr_fasync(-1, filp, 0); | 679 | pipe_rdwr_fasync(-1, filp, 0); |
| 679 | decr = (filp->f_mode & FMODE_READ) != 0; | 680 | decr = (filp->f_mode & FMODE_READ) != 0; |
| 680 | decw = (filp->f_mode & FMODE_WRITE) != 0; | 681 | decw = (filp->f_mode & FMODE_WRITE) != 0; |
| 681 | return pipe_release(inode, decr, decw); | 682 | return pipe_release(inode, decr, decw); |
| 682 | } | 683 | } |
| 683 | 684 | ||
| 684 | static int | 685 | static int |
| 685 | pipe_read_open(struct inode *inode, struct file *filp) | 686 | pipe_read_open(struct inode *inode, struct file *filp) |
| 686 | { | 687 | { |
| 687 | /* We could have perhaps used atomic_t, but this and friends | 688 | /* We could have perhaps used atomic_t, but this and friends |
| 688 | below are the only places. So it doesn't seem worthwhile. */ | 689 | below are the only places. So it doesn't seem worthwhile. */ |
| 689 | mutex_lock(&inode->i_mutex); | 690 | mutex_lock(&inode->i_mutex); |
| 690 | inode->i_pipe->readers++; | 691 | inode->i_pipe->readers++; |
| 691 | mutex_unlock(&inode->i_mutex); | 692 | mutex_unlock(&inode->i_mutex); |
| 692 | 693 | ||
| 693 | return 0; | 694 | return 0; |
| 694 | } | 695 | } |
| 695 | 696 | ||
| 696 | static int | 697 | static int |
| 697 | pipe_write_open(struct inode *inode, struct file *filp) | 698 | pipe_write_open(struct inode *inode, struct file *filp) |
| 698 | { | 699 | { |
| 699 | mutex_lock(&inode->i_mutex); | 700 | mutex_lock(&inode->i_mutex); |
| 700 | inode->i_pipe->writers++; | 701 | inode->i_pipe->writers++; |
| 701 | mutex_unlock(&inode->i_mutex); | 702 | mutex_unlock(&inode->i_mutex); |
| 702 | 703 | ||
| 703 | return 0; | 704 | return 0; |
| 704 | } | 705 | } |
| 705 | 706 | ||
| 706 | static int | 707 | static int |
| 707 | pipe_rdwr_open(struct inode *inode, struct file *filp) | 708 | pipe_rdwr_open(struct inode *inode, struct file *filp) |
| 708 | { | 709 | { |
| 709 | mutex_lock(&inode->i_mutex); | 710 | mutex_lock(&inode->i_mutex); |
| 710 | if (filp->f_mode & FMODE_READ) | 711 | if (filp->f_mode & FMODE_READ) |
| 711 | inode->i_pipe->readers++; | 712 | inode->i_pipe->readers++; |
| 712 | if (filp->f_mode & FMODE_WRITE) | 713 | if (filp->f_mode & FMODE_WRITE) |
| 713 | inode->i_pipe->writers++; | 714 | inode->i_pipe->writers++; |
| 714 | mutex_unlock(&inode->i_mutex); | 715 | mutex_unlock(&inode->i_mutex); |
| 715 | 716 | ||
| 716 | return 0; | 717 | return 0; |
| 717 | } | 718 | } |
| 718 | 719 | ||
| 719 | /* | 720 | /* |
| 720 | * The file_operations structs are not static because they | 721 | * The file_operations structs are not static because they |
| 721 | * are also used in linux/fs/fifo.c to do operations on FIFOs. | 722 | * are also used in linux/fs/fifo.c to do operations on FIFOs. |
| 722 | */ | 723 | */ |
| 723 | const struct file_operations read_fifo_fops = { | 724 | const struct file_operations read_fifo_fops = { |
| 724 | .llseek = no_llseek, | 725 | .llseek = no_llseek, |
| 725 | .read = do_sync_read, | 726 | .read = do_sync_read, |
| 726 | .aio_read = pipe_read, | 727 | .aio_read = pipe_read, |
| 727 | .write = bad_pipe_w, | 728 | .write = bad_pipe_w, |
| 728 | .poll = pipe_poll, | 729 | .poll = pipe_poll, |
| 729 | .ioctl = pipe_ioctl, | 730 | .ioctl = pipe_ioctl, |
| 730 | .open = pipe_read_open, | 731 | .open = pipe_read_open, |
| 731 | .release = pipe_read_release, | 732 | .release = pipe_read_release, |
| 732 | .fasync = pipe_read_fasync, | 733 | .fasync = pipe_read_fasync, |
| 733 | }; | 734 | }; |
| 734 | 735 | ||
| 735 | const struct file_operations write_fifo_fops = { | 736 | const struct file_operations write_fifo_fops = { |
| 736 | .llseek = no_llseek, | 737 | .llseek = no_llseek, |
| 737 | .read = bad_pipe_r, | 738 | .read = bad_pipe_r, |
| 738 | .write = do_sync_write, | 739 | .write = do_sync_write, |
| 739 | .aio_write = pipe_write, | 740 | .aio_write = pipe_write, |
| 740 | .poll = pipe_poll, | 741 | .poll = pipe_poll, |
| 741 | .ioctl = pipe_ioctl, | 742 | .ioctl = pipe_ioctl, |
| 742 | .open = pipe_write_open, | 743 | .open = pipe_write_open, |
| 743 | .release = pipe_write_release, | 744 | .release = pipe_write_release, |
| 744 | .fasync = pipe_write_fasync, | 745 | .fasync = pipe_write_fasync, |
| 745 | }; | 746 | }; |
| 746 | 747 | ||
| 747 | const struct file_operations rdwr_fifo_fops = { | 748 | const struct file_operations rdwr_fifo_fops = { |
| 748 | .llseek = no_llseek, | 749 | .llseek = no_llseek, |
| 749 | .read = do_sync_read, | 750 | .read = do_sync_read, |
| 750 | .aio_read = pipe_read, | 751 | .aio_read = pipe_read, |
| 751 | .write = do_sync_write, | 752 | .write = do_sync_write, |
| 752 | .aio_write = pipe_write, | 753 | .aio_write = pipe_write, |
| 753 | .poll = pipe_poll, | 754 | .poll = pipe_poll, |
| 754 | .ioctl = pipe_ioctl, | 755 | .ioctl = pipe_ioctl, |
| 755 | .open = pipe_rdwr_open, | 756 | .open = pipe_rdwr_open, |
| 756 | .release = pipe_rdwr_release, | 757 | .release = pipe_rdwr_release, |
| 757 | .fasync = pipe_rdwr_fasync, | 758 | .fasync = pipe_rdwr_fasync, |
| 758 | }; | 759 | }; |
| 759 | 760 | ||
| 760 | static const struct file_operations read_pipe_fops = { | 761 | static const struct file_operations read_pipe_fops = { |
| 761 | .llseek = no_llseek, | 762 | .llseek = no_llseek, |
| 762 | .read = do_sync_read, | 763 | .read = do_sync_read, |
| 763 | .aio_read = pipe_read, | 764 | .aio_read = pipe_read, |
| 764 | .write = bad_pipe_w, | 765 | .write = bad_pipe_w, |
| 765 | .poll = pipe_poll, | 766 | .poll = pipe_poll, |
| 766 | .ioctl = pipe_ioctl, | 767 | .ioctl = pipe_ioctl, |
| 767 | .open = pipe_read_open, | 768 | .open = pipe_read_open, |
| 768 | .release = pipe_read_release, | 769 | .release = pipe_read_release, |
| 769 | .fasync = pipe_read_fasync, | 770 | .fasync = pipe_read_fasync, |
| 770 | }; | 771 | }; |
| 771 | 772 | ||
| 772 | static const struct file_operations write_pipe_fops = { | 773 | static const struct file_operations write_pipe_fops = { |
| 773 | .llseek = no_llseek, | 774 | .llseek = no_llseek, |
| 774 | .read = bad_pipe_r, | 775 | .read = bad_pipe_r, |
| 775 | .write = do_sync_write, | 776 | .write = do_sync_write, |
| 776 | .aio_write = pipe_write, | 777 | .aio_write = pipe_write, |
| 777 | .poll = pipe_poll, | 778 | .poll = pipe_poll, |
| 778 | .ioctl = pipe_ioctl, | 779 | .ioctl = pipe_ioctl, |
| 779 | .open = pipe_write_open, | 780 | .open = pipe_write_open, |
| 780 | .release = pipe_write_release, | 781 | .release = pipe_write_release, |
| 781 | .fasync = pipe_write_fasync, | 782 | .fasync = pipe_write_fasync, |
| 782 | }; | 783 | }; |
| 783 | 784 | ||
| 784 | static const struct file_operations rdwr_pipe_fops = { | 785 | static const struct file_operations rdwr_pipe_fops = { |
| 785 | .llseek = no_llseek, | 786 | .llseek = no_llseek, |
| 786 | .read = do_sync_read, | 787 | .read = do_sync_read, |
| 787 | .aio_read = pipe_read, | 788 | .aio_read = pipe_read, |
| 788 | .write = do_sync_write, | 789 | .write = do_sync_write, |
| 789 | .aio_write = pipe_write, | 790 | .aio_write = pipe_write, |
| 790 | .poll = pipe_poll, | 791 | .poll = pipe_poll, |
| 791 | .ioctl = pipe_ioctl, | 792 | .ioctl = pipe_ioctl, |
| 792 | .open = pipe_rdwr_open, | 793 | .open = pipe_rdwr_open, |
| 793 | .release = pipe_rdwr_release, | 794 | .release = pipe_rdwr_release, |
| 794 | .fasync = pipe_rdwr_fasync, | 795 | .fasync = pipe_rdwr_fasync, |
| 795 | }; | 796 | }; |
| 796 | 797 | ||
| 797 | struct pipe_inode_info * alloc_pipe_info(struct inode *inode) | 798 | struct pipe_inode_info * alloc_pipe_info(struct inode *inode) |
| 798 | { | 799 | { |
| 799 | struct pipe_inode_info *pipe; | 800 | struct pipe_inode_info *pipe; |
| 800 | 801 | ||
| 801 | pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); | 802 | pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); |
| 802 | if (pipe) { | 803 | if (pipe) { |
| 803 | init_waitqueue_head(&pipe->wait); | 804 | init_waitqueue_head(&pipe->wait); |
| 804 | pipe->r_counter = pipe->w_counter = 1; | 805 | pipe->r_counter = pipe->w_counter = 1; |
| 805 | pipe->inode = inode; | 806 | pipe->inode = inode; |
| 806 | } | 807 | } |
| 807 | 808 | ||
| 808 | return pipe; | 809 | return pipe; |
| 809 | } | 810 | } |
| 810 | 811 | ||
| 811 | void __free_pipe_info(struct pipe_inode_info *pipe) | 812 | void __free_pipe_info(struct pipe_inode_info *pipe) |
| 812 | { | 813 | { |
| 813 | int i; | 814 | int i; |
| 814 | 815 | ||
| 815 | for (i = 0; i < PIPE_BUFFERS; i++) { | 816 | for (i = 0; i < PIPE_BUFFERS; i++) { |
| 816 | struct pipe_buffer *buf = pipe->bufs + i; | 817 | struct pipe_buffer *buf = pipe->bufs + i; |
| 817 | if (buf->ops) | 818 | if (buf->ops) |
| 818 | buf->ops->release(pipe, buf); | 819 | buf->ops->release(pipe, buf); |
| 819 | } | 820 | } |
| 820 | if (pipe->tmp_page) | 821 | if (pipe->tmp_page) |
| 821 | __free_page(pipe->tmp_page); | 822 | __free_page(pipe->tmp_page); |
| 822 | kfree(pipe); | 823 | kfree(pipe); |
| 823 | } | 824 | } |
| 824 | 825 | ||
| 825 | void free_pipe_info(struct inode *inode) | 826 | void free_pipe_info(struct inode *inode) |
| 826 | { | 827 | { |
| 827 | __free_pipe_info(inode->i_pipe); | 828 | __free_pipe_info(inode->i_pipe); |
| 828 | inode->i_pipe = NULL; | 829 | inode->i_pipe = NULL; |
| 829 | } | 830 | } |
| 830 | 831 | ||
| 831 | static struct vfsmount *pipe_mnt __read_mostly; | 832 | static struct vfsmount *pipe_mnt __read_mostly; |
| 832 | static int pipefs_delete_dentry(struct dentry *dentry) | 833 | static int pipefs_delete_dentry(struct dentry *dentry) |
| 833 | { | 834 | { |
| 834 | /* | 835 | /* |
| 835 | * At creation time, we pretended this dentry was hashed | 836 | * At creation time, we pretended this dentry was hashed |
| 836 | * (by clearing DCACHE_UNHASHED bit in d_flags) | 837 | * (by clearing DCACHE_UNHASHED bit in d_flags) |
| 837 | * At delete time, we restore the truth : not hashed. | 838 | * At delete time, we restore the truth : not hashed. |
| 838 | * (so that dput() can proceed correctly) | 839 | * (so that dput() can proceed correctly) |
| 839 | */ | 840 | */ |
| 840 | dentry->d_flags |= DCACHE_UNHASHED; | 841 | dentry->d_flags |= DCACHE_UNHASHED; |
| 841 | return 0; | 842 | return 0; |
| 842 | } | 843 | } |
| 843 | 844 | ||
| 844 | /* | 845 | /* |
| 845 | * pipefs_dname() is called from d_path(). | 846 | * pipefs_dname() is called from d_path(). |
| 846 | */ | 847 | */ |
| 847 | static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) | 848 | static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) |
| 848 | { | 849 | { |
| 849 | return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", | 850 | return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", |
| 850 | dentry->d_inode->i_ino); | 851 | dentry->d_inode->i_ino); |
| 851 | } | 852 | } |
| 852 | 853 | ||
| 853 | static struct dentry_operations pipefs_dentry_operations = { | 854 | static struct dentry_operations pipefs_dentry_operations = { |
| 854 | .d_delete = pipefs_delete_dentry, | 855 | .d_delete = pipefs_delete_dentry, |
| 855 | .d_dname = pipefs_dname, | 856 | .d_dname = pipefs_dname, |
| 856 | }; | 857 | }; |
| 857 | 858 | ||
| 858 | static struct inode * get_pipe_inode(void) | 859 | static struct inode * get_pipe_inode(void) |
| 859 | { | 860 | { |
| 860 | struct inode *inode = new_inode(pipe_mnt->mnt_sb); | 861 | struct inode *inode = new_inode(pipe_mnt->mnt_sb); |
| 861 | struct pipe_inode_info *pipe; | 862 | struct pipe_inode_info *pipe; |
| 862 | 863 | ||
| 863 | if (!inode) | 864 | if (!inode) |
| 864 | goto fail_inode; | 865 | goto fail_inode; |
| 865 | 866 | ||
| 866 | pipe = alloc_pipe_info(inode); | 867 | pipe = alloc_pipe_info(inode); |
| 867 | if (!pipe) | 868 | if (!pipe) |
| 868 | goto fail_iput; | 869 | goto fail_iput; |
| 869 | inode->i_pipe = pipe; | 870 | inode->i_pipe = pipe; |
| 870 | 871 | ||
| 871 | pipe->readers = pipe->writers = 1; | 872 | pipe->readers = pipe->writers = 1; |
| 872 | inode->i_fop = &rdwr_pipe_fops; | 873 | inode->i_fop = &rdwr_pipe_fops; |
| 873 | 874 | ||
| 874 | /* | 875 | /* |
| 875 | * Mark the inode dirty from the very beginning, | 876 | * Mark the inode dirty from the very beginning, |
| 876 | * that way it will never be moved to the dirty | 877 | * that way it will never be moved to the dirty |
| 877 | * list because "mark_inode_dirty()" will think | 878 | * list because "mark_inode_dirty()" will think |
| 878 | * that it already _is_ on the dirty list. | 879 | * that it already _is_ on the dirty list. |
| 879 | */ | 880 | */ |
| 880 | inode->i_state = I_DIRTY; | 881 | inode->i_state = I_DIRTY; |
| 881 | inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; | 882 | inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; |
| 882 | inode->i_uid = current->fsuid; | 883 | inode->i_uid = current->fsuid; |
| 883 | inode->i_gid = current->fsgid; | 884 | inode->i_gid = current->fsgid; |
| 884 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 885 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
| 885 | 886 | ||
| 886 | return inode; | 887 | return inode; |
| 887 | 888 | ||
| 888 | fail_iput: | 889 | fail_iput: |
| 889 | iput(inode); | 890 | iput(inode); |
| 890 | 891 | ||
| 891 | fail_inode: | 892 | fail_inode: |
| 892 | return NULL; | 893 | return NULL; |
| 893 | } | 894 | } |
| 894 | 895 | ||
| 895 | struct file *create_write_pipe(void) | 896 | struct file *create_write_pipe(void) |
| 896 | { | 897 | { |
| 897 | int err; | 898 | int err; |
| 898 | struct inode *inode; | 899 | struct inode *inode; |
| 899 | struct file *f; | 900 | struct file *f; |
| 900 | struct dentry *dentry; | 901 | struct dentry *dentry; |
| 901 | struct qstr name = { .name = "" }; | 902 | struct qstr name = { .name = "" }; |
| 902 | 903 | ||
| 903 | f = get_empty_filp(); | 904 | f = get_empty_filp(); |
| 904 | if (!f) | 905 | if (!f) |
| 905 | return ERR_PTR(-ENFILE); | 906 | return ERR_PTR(-ENFILE); |
| 906 | err = -ENFILE; | 907 | err = -ENFILE; |
| 907 | inode = get_pipe_inode(); | 908 | inode = get_pipe_inode(); |
| 908 | if (!inode) | 909 | if (!inode) |
| 909 | goto err_file; | 910 | goto err_file; |
| 910 | 911 | ||
| 911 | err = -ENOMEM; | 912 | err = -ENOMEM; |
| 912 | dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); | 913 | dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); |
| 913 | if (!dentry) | 914 | if (!dentry) |
| 914 | goto err_inode; | 915 | goto err_inode; |
| 915 | 916 | ||
| 916 | dentry->d_op = &pipefs_dentry_operations; | 917 | dentry->d_op = &pipefs_dentry_operations; |
| 917 | /* | 918 | /* |
| 918 | * We don't want to publish this dentry in the global dentry hash table. | 919 | * We don't want to publish this dentry in the global dentry hash table. |
| 919 | * We pretend the dentry is already hashed, by unsetting the DCACHE_UNHASHED | 920 | * We pretend the dentry is already hashed, by unsetting the DCACHE_UNHASHED |
| 920 | * bit. This permits a working /proc/$pid/fd/XXX on pipes. | 921 | * bit. This permits a working /proc/$pid/fd/XXX on pipes. |
| 921 | */ | 922 | */ |
| 922 | dentry->d_flags &= ~DCACHE_UNHASHED; | 923 | dentry->d_flags &= ~DCACHE_UNHASHED; |
| 923 | d_instantiate(dentry, inode); | 924 | d_instantiate(dentry, inode); |
| 924 | f->f_path.mnt = mntget(pipe_mnt); | 925 | f->f_path.mnt = mntget(pipe_mnt); |
| 925 | f->f_path.dentry = dentry; | 926 | f->f_path.dentry = dentry; |
| 926 | f->f_mapping = inode->i_mapping; | 927 | f->f_mapping = inode->i_mapping; |
| 927 | 928 | ||
| 928 | f->f_flags = O_WRONLY; | 929 | f->f_flags = O_WRONLY; |
| 929 | f->f_op = &write_pipe_fops; | 930 | f->f_op = &write_pipe_fops; |
| 930 | f->f_mode = FMODE_WRITE; | 931 | f->f_mode = FMODE_WRITE; |
| 931 | f->f_version = 0; | 932 | f->f_version = 0; |
| 932 | 933 | ||
| 933 | return f; | 934 | return f; |
| 934 | 935 | ||
| 935 | err_inode: | 936 | err_inode: |
| 936 | free_pipe_info(inode); | 937 | free_pipe_info(inode); |
| 937 | iput(inode); | 938 | iput(inode); |
| 938 | err_file: | 939 | err_file: |
| 939 | put_filp(f); | 940 | put_filp(f); |
| 940 | return ERR_PTR(err); | 941 | return ERR_PTR(err); |
| 941 | } | 942 | } |
| 942 | 943 | ||
| 943 | void free_write_pipe(struct file *f) | 944 | void free_write_pipe(struct file *f) |
| 944 | { | 945 | { |
| 945 | free_pipe_info(f->f_dentry->d_inode); | 946 | free_pipe_info(f->f_dentry->d_inode); |
| 946 | dput(f->f_path.dentry); | 947 | dput(f->f_path.dentry); |
| 947 | mntput(f->f_path.mnt); | 948 | mntput(f->f_path.mnt); |
| 948 | put_filp(f); | 949 | put_filp(f); |
| 949 | } | 950 | } |
| 950 | 951 | ||
| 951 | struct file *create_read_pipe(struct file *wrf) | 952 | struct file *create_read_pipe(struct file *wrf) |
| 952 | { | 953 | { |
| 953 | struct file *f = get_empty_filp(); | 954 | struct file *f = get_empty_filp(); |
| 954 | if (!f) | 955 | if (!f) |
| 955 | return ERR_PTR(-ENFILE); | 956 | return ERR_PTR(-ENFILE); |
| 956 | 957 | ||
| 957 | /* Grab pipe from the writer */ | 958 | /* Grab pipe from the writer */ |
| 958 | f->f_path.mnt = mntget(wrf->f_path.mnt); | 959 | f->f_path.mnt = mntget(wrf->f_path.mnt); |
| 959 | f->f_path.dentry = dget(wrf->f_path.dentry); | 960 | f->f_path.dentry = dget(wrf->f_path.dentry); |
| 960 | f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping; | 961 | f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping; |
| 961 | 962 | ||
| 962 | f->f_pos = 0; | 963 | f->f_pos = 0; |
| 963 | f->f_flags = O_RDONLY; | 964 | f->f_flags = O_RDONLY; |
| 964 | f->f_op = &read_pipe_fops; | 965 | f->f_op = &read_pipe_fops; |
| 965 | f->f_mode = FMODE_READ; | 966 | f->f_mode = FMODE_READ; |
| 966 | f->f_version = 0; | 967 | f->f_version = 0; |
| 967 | 968 | ||
| 968 | return f; | 969 | return f; |
| 969 | } | 970 | } |
| 970 | 971 | ||
| 971 | int do_pipe(int *fd) | 972 | int do_pipe(int *fd) |
| 972 | { | 973 | { |
| 973 | struct file *fw, *fr; | 974 | struct file *fw, *fr; |
| 974 | int error; | 975 | int error; |
| 975 | int fdw, fdr; | 976 | int fdw, fdr; |
| 976 | 977 | ||
| 977 | fw = create_write_pipe(); | 978 | fw = create_write_pipe(); |
| 978 | if (IS_ERR(fw)) | 979 | if (IS_ERR(fw)) |
| 979 | return PTR_ERR(fw); | 980 | return PTR_ERR(fw); |
| 980 | fr = create_read_pipe(fw); | 981 | fr = create_read_pipe(fw); |
| 981 | error = PTR_ERR(fr); | 982 | error = PTR_ERR(fr); |
| 982 | if (IS_ERR(fr)) | 983 | if (IS_ERR(fr)) |
| 983 | goto err_write_pipe; | 984 | goto err_write_pipe; |
| 984 | 985 | ||
| 985 | error = get_unused_fd(); | 986 | error = get_unused_fd(); |
| 986 | if (error < 0) | 987 | if (error < 0) |
| 987 | goto err_read_pipe; | 988 | goto err_read_pipe; |
| 988 | fdr = error; | 989 | fdr = error; |
| 989 | 990 | ||
| 990 | error = get_unused_fd(); | 991 | error = get_unused_fd(); |
| 991 | if (error < 0) | 992 | if (error < 0) |
| 992 | goto err_fdr; | 993 | goto err_fdr; |
| 993 | fdw = error; | 994 | fdw = error; |
| 994 | 995 | ||
| 995 | error = audit_fd_pair(fdr, fdw); | 996 | error = audit_fd_pair(fdr, fdw); |
| 996 | if (error < 0) | 997 | if (error < 0) |
| 997 | goto err_fdw; | 998 | goto err_fdw; |
| 998 | 999 | ||
| 999 | fd_install(fdr, fr); | 1000 | fd_install(fdr, fr); |
| 1000 | fd_install(fdw, fw); | 1001 | fd_install(fdw, fw); |
| 1001 | fd[0] = fdr; | 1002 | fd[0] = fdr; |
| 1002 | fd[1] = fdw; | 1003 | fd[1] = fdw; |
| 1003 | 1004 | ||
| 1004 | return 0; | 1005 | return 0; |
| 1005 | 1006 | ||
| 1006 | err_fdw: | 1007 | err_fdw: |
| 1007 | put_unused_fd(fdw); | 1008 | put_unused_fd(fdw); |
| 1008 | err_fdr: | 1009 | err_fdr: |
| 1009 | put_unused_fd(fdr); | 1010 | put_unused_fd(fdr); |
| 1010 | err_read_pipe: | 1011 | err_read_pipe: |
| 1011 | dput(fr->f_dentry); | 1012 | dput(fr->f_dentry); |
| 1012 | mntput(fr->f_vfsmnt); | 1013 | mntput(fr->f_vfsmnt); |
| 1013 | put_filp(fr); | 1014 | put_filp(fr); |
| 1014 | err_write_pipe: | 1015 | err_write_pipe: |
| 1015 | free_write_pipe(fw); | 1016 | free_write_pipe(fw); |
| 1016 | return error; | 1017 | return error; |
| 1017 | } | 1018 | } |
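
do_pipe() is the kernel half of the pipe(2) system call: one inode, two struct file instances, two fresh descriptors. From userland the whole machinery reduces to the familiar pattern below (standalone illustration, not part of this diff):

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd[2];
            char buf[6] = { 0 };

            if (pipe(fd) < 0)                       /* lands in do_pipe() above */
                    return 1;
            if (write(fd[1], "hello", 5) != 5)      /* fd[1] is the write end */
                    return 1;
            if (read(fd[0], buf, 5) != 5)           /* fd[0] is the read end */
                    return 1;
            puts(buf);                              /* prints "hello" */
            close(fd[0]);
            close(fd[1]);
            return 0;
    }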
| 1018 | 1019 | ||
| 1019 | /* | 1020 | /* |
| 1020 | * pipefs should _never_ be mounted by userland - too much of a security hassle, | 1021 | * pipefs should _never_ be mounted by userland - too much of a security hassle, |
| 1021 | * no real gain from having the whole whorehouse mounted. So we don't need | 1022 | * no real gain from having the whole whorehouse mounted. So we don't need |
| 1022 | * any operations on the root directory. However, we need a non-trivial | 1023 | * any operations on the root directory. However, we need a non-trivial |
| 1023 | * d_name - pipe: will go nicely and kill the special-casing in procfs. | 1024 | * d_name - pipe: will go nicely and kill the special-casing in procfs. |
| 1024 | */ | 1025 | */ |
| 1025 | static int pipefs_get_sb(struct file_system_type *fs_type, | 1026 | static int pipefs_get_sb(struct file_system_type *fs_type, |
| 1026 | int flags, const char *dev_name, void *data, | 1027 | int flags, const char *dev_name, void *data, |
| 1027 | struct vfsmount *mnt) | 1028 | struct vfsmount *mnt) |
| 1028 | { | 1029 | { |
| 1029 | return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt); | 1030 | return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt); |
| 1030 | } | 1031 | } |
| 1031 | 1032 | ||
| 1032 | static struct file_system_type pipe_fs_type = { | 1033 | static struct file_system_type pipe_fs_type = { |
| 1033 | .name = "pipefs", | 1034 | .name = "pipefs", |
| 1034 | .get_sb = pipefs_get_sb, | 1035 | .get_sb = pipefs_get_sb, |
| 1035 | .kill_sb = kill_anon_super, | 1036 | .kill_sb = kill_anon_super, |
| 1036 | }; | 1037 | }; |
| 1037 | 1038 | ||
| 1038 | static int __init init_pipe_fs(void) | 1039 | static int __init init_pipe_fs(void) |
| 1039 | { | 1040 | { |
| 1040 | int err = register_filesystem(&pipe_fs_type); | 1041 | int err = register_filesystem(&pipe_fs_type); |
| 1041 | 1042 | ||
| 1042 | if (!err) { | 1043 | if (!err) { |
| 1043 | pipe_mnt = kern_mount(&pipe_fs_type); | 1044 | pipe_mnt = kern_mount(&pipe_fs_type); |
| 1044 | if (IS_ERR(pipe_mnt)) { | 1045 | if (IS_ERR(pipe_mnt)) { |
| 1045 | err = PTR_ERR(pipe_mnt); | 1046 | err = PTR_ERR(pipe_mnt); |
| 1046 | unregister_filesystem(&pipe_fs_type); | 1047 | unregister_filesystem(&pipe_fs_type); |
| 1047 | } | 1048 | } |
| 1048 | } | 1049 | } |
| 1049 | return err; | 1050 | return err; |
| 1050 | } | 1051 | } |
| 1051 | 1052 | ||
| 1052 | static void __exit exit_pipe_fs(void) | 1053 | static void __exit exit_pipe_fs(void) |
| 1053 | { | 1054 | { |
| 1054 | unregister_filesystem(&pipe_fs_type); | 1055 | unregister_filesystem(&pipe_fs_type); |
| 1055 | mntput(pipe_mnt); | 1056 | mntput(pipe_mnt); |
| 1056 | } | 1057 | } |
| 1057 | 1058 | ||
| 1058 | fs_initcall(init_pipe_fs); | 1059 | fs_initcall(init_pipe_fs); |
| 1059 | module_exit(exit_pipe_fs); | 1060 | module_exit(exit_pipe_fs); |
| 1060 | 1061 |
fs/splice.c
| 1 | /* | 1 | /* |
| 2 | * "splice": joining two ropes together by interweaving their strands. | 2 | * "splice": joining two ropes together by interweaving their strands. |
| 3 | * | 3 | * |
| 4 | * This is the "extended pipe" functionality, where a pipe is used as | 4 | * This is the "extended pipe" functionality, where a pipe is used as |
| 5 | * an arbitrary in-memory buffer. Think of a pipe as a small kernel | 5 | * an arbitrary in-memory buffer. Think of a pipe as a small kernel |
| 6 | * buffer that you can use to transfer data from one end to the other. | 6 | * buffer that you can use to transfer data from one end to the other. |
| 7 | * | 7 | * |
| 8 | * The traditional unix read/write is extended with a "splice()" operation | 8 | * The traditional unix read/write is extended with a "splice()" operation |
| 9 | * that transfers data buffers to or from a pipe buffer. | 9 | * that transfers data buffers to or from a pipe buffer. |
| 10 | * | 10 | * |
| 11 | * Named by Larry McVoy, original implementation from Linus, extended by | 11 | * Named by Larry McVoy, original implementation from Linus, extended by |
| 12 | * Jens to support splicing to files, network, direct splicing, etc and | 12 | * Jens to support splicing to files, network, direct splicing, etc and |
| 13 | * fixing lots of bugs. | 13 | * fixing lots of bugs. |
| 14 | * | 14 | * |
| 15 | * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> | 15 | * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> |
| 16 | * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> | 16 | * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> |
| 17 | * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> | 17 | * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> |
| 18 | * | 18 | * |
| 19 | */ | 19 | */ |
| 20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
| 21 | #include <linux/file.h> | 21 | #include <linux/file.h> |
| 22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
| 23 | #include <linux/splice.h> | 23 | #include <linux/splice.h> |
| 24 | #include <linux/mm_inline.h> | 24 | #include <linux/mm_inline.h> |
| 25 | #include <linux/swap.h> | 25 | #include <linux/swap.h> |
| 26 | #include <linux/writeback.h> | 26 | #include <linux/writeback.h> |
| 27 | #include <linux/buffer_head.h> | 27 | #include <linux/buffer_head.h> |
| 28 | #include <linux/module.h> | 28 | #include <linux/module.h> |
| 29 | #include <linux/syscalls.h> | 29 | #include <linux/syscalls.h> |
| 30 | #include <linux/uio.h> | 30 | #include <linux/uio.h> |
| 31 | 31 | ||
| 32 | /* | 32 | /* |
| 33 | * Attempt to steal a page from a pipe buffer. This should perhaps go into | 33 | * Attempt to steal a page from a pipe buffer. This should perhaps go into |
| 34 | * a vm helper function, it's already simplified quite a bit by the | 34 | * a vm helper function, it's already simplified quite a bit by the |
| 35 | * addition of remove_mapping(). If success is returned, the caller may | 35 | * addition of remove_mapping(). If success is returned, the caller may |
| 36 | * attempt to reuse this page for another destination. | 36 | * attempt to reuse this page for another destination. |
| 37 | */ | 37 | */ |
| 38 | static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, | 38 | static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, |
| 39 | struct pipe_buffer *buf) | 39 | struct pipe_buffer *buf) |
| 40 | { | 40 | { |
| 41 | struct page *page = buf->page; | 41 | struct page *page = buf->page; |
| 42 | struct address_space *mapping; | 42 | struct address_space *mapping; |
| 43 | 43 | ||
| 44 | lock_page(page); | 44 | lock_page(page); |
| 45 | 45 | ||
| 46 | mapping = page_mapping(page); | 46 | mapping = page_mapping(page); |
| 47 | if (mapping) { | 47 | if (mapping) { |
| 48 | WARN_ON(!PageUptodate(page)); | 48 | WARN_ON(!PageUptodate(page)); |
| 49 | 49 | ||
| 50 | /* | 50 | /* |
| 51 | * At least for ext2 with nobh option, we need to wait on | 51 | * At least for ext2 with nobh option, we need to wait on |
| 52 | * writeback completing on this page, since we'll remove it | 52 | * writeback completing on this page, since we'll remove it |
| 53 | * from the pagecache. Otherwise truncate won't wait on the | 53 | * from the pagecache. Otherwise truncate won't wait on the |
| 54 | * page, allowing the disk blocks to be reused by someone else | 54 | * page, allowing the disk blocks to be reused by someone else |
| 55 | * before we actually wrote our data to them. fs corruption | 55 | * before we actually wrote our data to them. fs corruption |
| 56 | * ensues. | 56 | * ensues. |
| 57 | */ | 57 | */ |
| 58 | wait_on_page_writeback(page); | 58 | wait_on_page_writeback(page); |
| 59 | 59 | ||
| 60 | if (PagePrivate(page)) | 60 | if (PagePrivate(page)) |
| 61 | try_to_release_page(page, GFP_KERNEL); | 61 | try_to_release_page(page, GFP_KERNEL); |
| 62 | 62 | ||
| 63 | /* | 63 | /* |
| 64 | * If we succeeded in removing the mapping, set LRU flag | 64 | * If we succeeded in removing the mapping, set LRU flag |
| 65 | * and return good. | 65 | * and return good. |
| 66 | */ | 66 | */ |
| 67 | if (remove_mapping(mapping, page)) { | 67 | if (remove_mapping(mapping, page)) { |
| 68 | buf->flags |= PIPE_BUF_FLAG_LRU; | 68 | buf->flags |= PIPE_BUF_FLAG_LRU; |
| 69 | return 0; | 69 | return 0; |
| 70 | } | 70 | } |
| 71 | } | 71 | } |
| 72 | 72 | ||
| 73 | /* | 73 | /* |
| 74 | * Raced with truncate or failed to remove page from current | 74 | * Raced with truncate or failed to remove page from current |
| 75 | * address space, unlock and return failure. | 75 | * address space, unlock and return failure. |
| 76 | */ | 76 | */ |
| 77 | unlock_page(page); | 77 | unlock_page(page); |
| 78 | return 1; | 78 | return 1; |
| 79 | } | 79 | } |
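
Note the inverted return convention here: ->steal() returns 0 on success, with the page still locked and now owned by the caller, and nonzero on failure. A hypothetical caller (name made up) would therefore read:

    static int try_move_page(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
    {
            if (buf->ops->steal(pipe, buf))
                    return -EBUSY;          /* couldn't steal: fall back to copying */

            /* Success: the page is ours and still locked, reuse it. */
            return 0;
    }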
| 80 | 80 | ||
| 81 | static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, | 81 | static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, |
| 82 | struct pipe_buffer *buf) | 82 | struct pipe_buffer *buf) |
| 83 | { | 83 | { |
| 84 | page_cache_release(buf->page); | 84 | page_cache_release(buf->page); |
| 85 | buf->flags &= ~PIPE_BUF_FLAG_LRU; | 85 | buf->flags &= ~PIPE_BUF_FLAG_LRU; |
| 86 | } | 86 | } |
| 87 | 87 | ||
| 88 | static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe, | 88 | static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, |
| 89 | struct pipe_buffer *buf) | 89 | struct pipe_buffer *buf) |
| 90 | { | 90 | { |
| 91 | struct page *page = buf->page; | 91 | struct page *page = buf->page; |
| 92 | int err; | 92 | int err; |
| 93 | 93 | ||
| 94 | if (!PageUptodate(page)) { | 94 | if (!PageUptodate(page)) { |
| 95 | lock_page(page); | 95 | lock_page(page); |
| 96 | 96 | ||
| 97 | /* | 97 | /* |
| 98 | * Page got truncated/unhashed. This will cause a 0-byte | 98 | * Page got truncated/unhashed. This will cause a 0-byte |
| 99 | * splice, if this is the first page. | 99 | * splice, if this is the first page. |
| 100 | */ | 100 | */ |
| 101 | if (!page->mapping) { | 101 | if (!page->mapping) { |
| 102 | err = -ENODATA; | 102 | err = -ENODATA; |
| 103 | goto error; | 103 | goto error; |
| 104 | } | 104 | } |
| 105 | 105 | ||
| 106 | /* | 106 | /* |
| 107 | * Uh oh, read-error from disk. | 107 | * Uh oh, read-error from disk. |
| 108 | */ | 108 | */ |
| 109 | if (!PageUptodate(page)) { | 109 | if (!PageUptodate(page)) { |
| 110 | err = -EIO; | 110 | err = -EIO; |
| 111 | goto error; | 111 | goto error; |
| 112 | } | 112 | } |
| 113 | 113 | ||
| 114 | /* | 114 | /* |
| 115 | * Page is OK after all, we are done. | 115 | * Page is OK after all, we are done. |
| 116 | */ | 116 | */ |
| 117 | unlock_page(page); | 117 | unlock_page(page); |
| 118 | } | 118 | } |
| 119 | 119 | ||
| 120 | return 0; | 120 | return 0; |
| 121 | error: | 121 | error: |
| 122 | unlock_page(page); | 122 | unlock_page(page); |
| 123 | return err; | 123 | return err; |
| 124 | } | 124 | } |
| 125 | 125 | ||
| 126 | static const struct pipe_buf_operations page_cache_pipe_buf_ops = { | 126 | static const struct pipe_buf_operations page_cache_pipe_buf_ops = { |
| 127 | .can_merge = 0, | 127 | .can_merge = 0, |
| 128 | .map = generic_pipe_buf_map, | 128 | .map = generic_pipe_buf_map, |
| 129 | .unmap = generic_pipe_buf_unmap, | 129 | .unmap = generic_pipe_buf_unmap, |
| 130 | .pin = page_cache_pipe_buf_pin, | 130 | .confirm = page_cache_pipe_buf_confirm, |
| 131 | .release = page_cache_pipe_buf_release, | 131 | .release = page_cache_pipe_buf_release, |
| 132 | .steal = page_cache_pipe_buf_steal, | 132 | .steal = page_cache_pipe_buf_steal, |
| 133 | .get = generic_pipe_buf_get, | 133 | .get = generic_pipe_buf_get, |
| 134 | }; | 134 | }; |
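
This is the semantic behind the rename: a good return from ->confirm() means the buffer is really there and its contents are good, which for page-cache buffers can mean waiting out read I/O as above. For buffer types whose data is always resident the hook can be trivial; a sketch modeled on generic_pipe_buf_confirm(), whose real body lives in fs/pipe.c outside this hunk:

    static int always_there_buf_confirm(struct pipe_inode_info *pipe,
                                        struct pipe_buffer *buf)
    {
            /* Nothing to verify: the data cannot go away under us. */
            return 0;
    }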
| 135 | 135 | ||
| 136 | static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, | 136 | static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, |
| 137 | struct pipe_buffer *buf) | 137 | struct pipe_buffer *buf) |
| 138 | { | 138 | { |
| 139 | if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) | 139 | if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) |
| 140 | return 1; | 140 | return 1; |
| 141 | 141 | ||
| 142 | buf->flags |= PIPE_BUF_FLAG_LRU; | 142 | buf->flags |= PIPE_BUF_FLAG_LRU; |
| 143 | return generic_pipe_buf_steal(pipe, buf); | 143 | return generic_pipe_buf_steal(pipe, buf); |
| 144 | } | 144 | } |
| 145 | 145 | ||
| 146 | static const struct pipe_buf_operations user_page_pipe_buf_ops = { | 146 | static const struct pipe_buf_operations user_page_pipe_buf_ops = { |
| 147 | .can_merge = 0, | 147 | .can_merge = 0, |
| 148 | .map = generic_pipe_buf_map, | 148 | .map = generic_pipe_buf_map, |
| 149 | .unmap = generic_pipe_buf_unmap, | 149 | .unmap = generic_pipe_buf_unmap, |
| 150 | .pin = generic_pipe_buf_pin, | 150 | .confirm = generic_pipe_buf_confirm, |
| 151 | .release = page_cache_pipe_buf_release, | 151 | .release = page_cache_pipe_buf_release, |
| 152 | .steal = user_page_pipe_buf_steal, | 152 | .steal = user_page_pipe_buf_steal, |
| 153 | .get = generic_pipe_buf_get, | 153 | .get = generic_pipe_buf_get, |
| 154 | }; | 154 | }; |
| 155 | 155 | ||
| 156 | /** | 156 | /** |
| 157 | * splice_to_pipe - fill passed data into a pipe | 157 | * splice_to_pipe - fill passed data into a pipe |
| 158 | * @pipe: pipe to fill | 158 | * @pipe: pipe to fill |
| 159 | * @spd: data to fill | 159 | * @spd: data to fill |
| 160 | * | 160 | * |
| 161 | * Description: | 161 | * Description: |
| 162 | * @spd contains a map of pages and len/offset tuples, along with | 162 | * @spd contains a map of pages and len/offset tuples, along with |
| 163 | * the struct pipe_buf_operations associated with these pages. This | 163 | * the struct pipe_buf_operations associated with these pages. This |
| 164 | * function will link that data to the pipe. | 164 | * function will link that data to the pipe. |
| 165 | * | 165 | * |
| 166 | */ | 166 | */ |
| 167 | ssize_t splice_to_pipe(struct pipe_inode_info *pipe, | 167 | ssize_t splice_to_pipe(struct pipe_inode_info *pipe, |
| 168 | struct splice_pipe_desc *spd) | 168 | struct splice_pipe_desc *spd) |
| 169 | { | 169 | { |
| 170 | unsigned int spd_pages = spd->nr_pages; | 170 | unsigned int spd_pages = spd->nr_pages; |
| 171 | int ret, do_wakeup, page_nr; | 171 | int ret, do_wakeup, page_nr; |
| 172 | 172 | ||
| 173 | ret = 0; | 173 | ret = 0; |
| 174 | do_wakeup = 0; | 174 | do_wakeup = 0; |
| 175 | page_nr = 0; | 175 | page_nr = 0; |
| 176 | 176 | ||
| 177 | if (pipe->inode) | 177 | if (pipe->inode) |
| 178 | mutex_lock(&pipe->inode->i_mutex); | 178 | mutex_lock(&pipe->inode->i_mutex); |
| 179 | 179 | ||
| 180 | for (;;) { | 180 | for (;;) { |
| 181 | if (!pipe->readers) { | 181 | if (!pipe->readers) { |
| 182 | send_sig(SIGPIPE, current, 0); | 182 | send_sig(SIGPIPE, current, 0); |
| 183 | if (!ret) | 183 | if (!ret) |
| 184 | ret = -EPIPE; | 184 | ret = -EPIPE; |
| 185 | break; | 185 | break; |
| 186 | } | 186 | } |
| 187 | 187 | ||
| 188 | if (pipe->nrbufs < PIPE_BUFFERS) { | 188 | if (pipe->nrbufs < PIPE_BUFFERS) { |
| 189 | int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); | 189 | int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); |
| 190 | struct pipe_buffer *buf = pipe->bufs + newbuf; | 190 | struct pipe_buffer *buf = pipe->bufs + newbuf; |
| 191 | 191 | ||
| 192 | buf->page = spd->pages[page_nr]; | 192 | buf->page = spd->pages[page_nr]; |
| 193 | buf->offset = spd->partial[page_nr].offset; | 193 | buf->offset = spd->partial[page_nr].offset; |
| 194 | buf->len = spd->partial[page_nr].len; | 194 | buf->len = spd->partial[page_nr].len; |
| 195 | buf->private = spd->partial[page_nr].private; | 195 | buf->private = spd->partial[page_nr].private; |
| 196 | buf->ops = spd->ops; | 196 | buf->ops = spd->ops; |
| 197 | if (spd->flags & SPLICE_F_GIFT) | 197 | if (spd->flags & SPLICE_F_GIFT) |
| 198 | buf->flags |= PIPE_BUF_FLAG_GIFT; | 198 | buf->flags |= PIPE_BUF_FLAG_GIFT; |
| 199 | 199 | ||
| 200 | pipe->nrbufs++; | 200 | pipe->nrbufs++; |
| 201 | page_nr++; | 201 | page_nr++; |
| 202 | ret += buf->len; | 202 | ret += buf->len; |
| 203 | 203 | ||
| 204 | if (pipe->inode) | 204 | if (pipe->inode) |
| 205 | do_wakeup = 1; | 205 | do_wakeup = 1; |
| 206 | 206 | ||
| 207 | if (!--spd->nr_pages) | 207 | if (!--spd->nr_pages) |
| 208 | break; | 208 | break; |
| 209 | if (pipe->nrbufs < PIPE_BUFFERS) | 209 | if (pipe->nrbufs < PIPE_BUFFERS) |
| 210 | continue; | 210 | continue; |
| 211 | 211 | ||
| 212 | break; | 212 | break; |
| 213 | } | 213 | } |
| 214 | 214 | ||
| 215 | if (spd->flags & SPLICE_F_NONBLOCK) { | 215 | if (spd->flags & SPLICE_F_NONBLOCK) { |
| 216 | if (!ret) | 216 | if (!ret) |
| 217 | ret = -EAGAIN; | 217 | ret = -EAGAIN; |
| 218 | break; | 218 | break; |
| 219 | } | 219 | } |
| 220 | 220 | ||
| 221 | if (signal_pending(current)) { | 221 | if (signal_pending(current)) { |
| 222 | if (!ret) | 222 | if (!ret) |
| 223 | ret = -ERESTARTSYS; | 223 | ret = -ERESTARTSYS; |
| 224 | break; | 224 | break; |
| 225 | } | 225 | } |
| 226 | 226 | ||
| 227 | if (do_wakeup) { | 227 | if (do_wakeup) { |
| 228 | smp_mb(); | 228 | smp_mb(); |
| 229 | if (waitqueue_active(&pipe->wait)) | 229 | if (waitqueue_active(&pipe->wait)) |
| 230 | wake_up_interruptible_sync(&pipe->wait); | 230 | wake_up_interruptible_sync(&pipe->wait); |
| 231 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); | 231 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); |
| 232 | do_wakeup = 0; | 232 | do_wakeup = 0; |
| 233 | } | 233 | } |
| 234 | 234 | ||
| 235 | pipe->waiting_writers++; | 235 | pipe->waiting_writers++; |
| 236 | pipe_wait(pipe); | 236 | pipe_wait(pipe); |
| 237 | pipe->waiting_writers--; | 237 | pipe->waiting_writers--; |
| 238 | } | 238 | } |
| 239 | 239 | ||
| 240 | if (pipe->inode) { | 240 | if (pipe->inode) { |
| 241 | mutex_unlock(&pipe->inode->i_mutex); | 241 | mutex_unlock(&pipe->inode->i_mutex); |
| 242 | 242 | ||
| 243 | if (do_wakeup) { | 243 | if (do_wakeup) { |
| 244 | smp_mb(); | 244 | smp_mb(); |
| 245 | if (waitqueue_active(&pipe->wait)) | 245 | if (waitqueue_active(&pipe->wait)) |
| 246 | wake_up_interruptible(&pipe->wait); | 246 | wake_up_interruptible(&pipe->wait); |
| 247 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); | 247 | kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); |
| 248 | } | 248 | } |
| 249 | } | 249 | } |
| 250 | 250 | ||
| 251 | while (page_nr < spd_pages) | 251 | while (page_nr < spd_pages) |
| 252 | page_cache_release(spd->pages[page_nr++]); | 252 | page_cache_release(spd->pages[page_nr++]); |
| 253 | 253 | ||
| 254 | return ret; | 254 | return ret; |
| 255 | } | 255 | } |
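
splice_to_pipe() only consumes a pre-built splice_pipe_desc; each producer fills pages[]/partial[] and picks the pipe_buf_operations that describe who owns the pages. A minimal hypothetical producer that pushes a single already-referenced page-cache page:

    static ssize_t splice_one_page(struct pipe_inode_info *pipe,
                                   struct page *page, unsigned int len)
    {
            struct page *pages[] = { page };
            struct partial_page partial[] = {
                    { .offset = 0, .len = len },
            };
            struct splice_pipe_desc spd = {
                    .pages          = pages,
                    .partial        = partial,
                    .nr_pages       = 1,
                    .flags          = 0,
                    .ops            = &page_cache_pipe_buf_ops,
            };

            /* The pipe takes over the caller's page reference. */
            return splice_to_pipe(pipe, &spd);
    }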
| 256 | 256 | ||
| 257 | static int | 257 | static int |
| 258 | __generic_file_splice_read(struct file *in, loff_t *ppos, | 258 | __generic_file_splice_read(struct file *in, loff_t *ppos, |
| 259 | struct pipe_inode_info *pipe, size_t len, | 259 | struct pipe_inode_info *pipe, size_t len, |
| 260 | unsigned int flags) | 260 | unsigned int flags) |
| 261 | { | 261 | { |
| 262 | struct address_space *mapping = in->f_mapping; | 262 | struct address_space *mapping = in->f_mapping; |
| 263 | unsigned int loff, nr_pages; | 263 | unsigned int loff, nr_pages; |
| 264 | struct page *pages[PIPE_BUFFERS]; | 264 | struct page *pages[PIPE_BUFFERS]; |
| 265 | struct partial_page partial[PIPE_BUFFERS]; | 265 | struct partial_page partial[PIPE_BUFFERS]; |
| 266 | struct page *page; | 266 | struct page *page; |
| 267 | pgoff_t index, end_index; | 267 | pgoff_t index, end_index; |
| 268 | loff_t isize; | 268 | loff_t isize; |
| 269 | int error, page_nr; | 269 | int error, page_nr; |
| 270 | struct splice_pipe_desc spd = { | 270 | struct splice_pipe_desc spd = { |
| 271 | .pages = pages, | 271 | .pages = pages, |
| 272 | .partial = partial, | 272 | .partial = partial, |
| 273 | .flags = flags, | 273 | .flags = flags, |
| 274 | .ops = &page_cache_pipe_buf_ops, | 274 | .ops = &page_cache_pipe_buf_ops, |
| 275 | }; | 275 | }; |
| 276 | 276 | ||
| 277 | index = *ppos >> PAGE_CACHE_SHIFT; | 277 | index = *ppos >> PAGE_CACHE_SHIFT; |
| 278 | loff = *ppos & ~PAGE_CACHE_MASK; | 278 | loff = *ppos & ~PAGE_CACHE_MASK; |
| 279 | nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 279 | nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
| 280 | 280 | ||
| 281 | if (nr_pages > PIPE_BUFFERS) | 281 | if (nr_pages > PIPE_BUFFERS) |
| 282 | nr_pages = PIPE_BUFFERS; | 282 | nr_pages = PIPE_BUFFERS; |
| 283 | 283 | ||
| 284 | /* | 284 | /* |
| 285 | * Don't try to second-guess the read-ahead logic, call into | 285 | * Don't try to second-guess the read-ahead logic, call into |
| 286 | * page_cache_readahead() like the page cache reads would do. | 286 | * page_cache_readahead() like the page cache reads would do. |
| 287 | */ | 287 | */ |
| 288 | page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages); | 288 | page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages); |
| 289 | 289 | ||
| 290 | /* | 290 | /* |
| 291 | * Lookup the (hopefully) full range of pages we need. | 291 | * Lookup the (hopefully) full range of pages we need. |
| 292 | */ | 292 | */ |
| 293 | spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); | 293 | spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); |
| 294 | 294 | ||
| 295 | /* | 295 | /* |
| 296 | * If find_get_pages_contig() returned fewer pages than we needed, | 296 | * If find_get_pages_contig() returned fewer pages than we needed, |
| 297 | * allocate the rest and fill in the holes. | 297 | * allocate the rest and fill in the holes. |
| 298 | */ | 298 | */ |
| 299 | error = 0; | 299 | error = 0; |
| 300 | index += spd.nr_pages; | 300 | index += spd.nr_pages; |
| 301 | while (spd.nr_pages < nr_pages) { | 301 | while (spd.nr_pages < nr_pages) { |
| 302 | /* | 302 | /* |
| 303 | * Page could be there, find_get_pages_contig() breaks on | 303 | * Page could be there, find_get_pages_contig() breaks on |
| 304 | * the first hole. | 304 | * the first hole. |
| 305 | */ | 305 | */ |
| 306 | page = find_get_page(mapping, index); | 306 | page = find_get_page(mapping, index); |
| 307 | if (!page) { | 307 | if (!page) { |
| 308 | /* | 308 | /* |
| 309 | * Make sure the read-ahead engine is notified | 309 | * Make sure the read-ahead engine is notified |
| 310 | * about this failure. | 310 | * about this failure. |
| 311 | */ | 311 | */ |
| 312 | handle_ra_miss(mapping, &in->f_ra, index); | 312 | handle_ra_miss(mapping, &in->f_ra, index); |
| 313 | 313 | ||
| 314 | /* | 314 | /* |
| 315 | * page didn't exist, allocate one. | 315 | * page didn't exist, allocate one. |
| 316 | */ | 316 | */ |
| 317 | page = page_cache_alloc_cold(mapping); | 317 | page = page_cache_alloc_cold(mapping); |
| 318 | if (!page) | 318 | if (!page) |
| 319 | break; | 319 | break; |
| 320 | 320 | ||
| 321 | error = add_to_page_cache_lru(page, mapping, index, | 321 | error = add_to_page_cache_lru(page, mapping, index, |
| 322 | GFP_KERNEL); | 322 | GFP_KERNEL); |
| 323 | if (unlikely(error)) { | 323 | if (unlikely(error)) { |
| 324 | page_cache_release(page); | 324 | page_cache_release(page); |
| 325 | if (error == -EEXIST) | 325 | if (error == -EEXIST) |
| 326 | continue; | 326 | continue; |
| 327 | break; | 327 | break; |
| 328 | } | 328 | } |
| 329 | /* | 329 | /* |
| 330 | * add_to_page_cache() locks the page, unlock it | 330 | * add_to_page_cache() locks the page, unlock it |
| 331 | * to avoid convoluting the logic below even more. | 331 | * to avoid convoluting the logic below even more. |
| 332 | */ | 332 | */ |
| 333 | unlock_page(page); | 333 | unlock_page(page); |
| 334 | } | 334 | } |
| 335 | 335 | ||
| 336 | pages[spd.nr_pages++] = page; | 336 | pages[spd.nr_pages++] = page; |
| 337 | index++; | 337 | index++; |
| 338 | } | 338 | } |
| 339 | 339 | ||
| 340 | /* | 340 | /* |
| 341 | * Now loop over the map and see if we need to start IO on any | 341 | * Now loop over the map and see if we need to start IO on any |
| 342 | * pages, fill in the partial map, etc. | 342 | * pages, fill in the partial map, etc. |
| 343 | */ | 343 | */ |
| 344 | index = *ppos >> PAGE_CACHE_SHIFT; | 344 | index = *ppos >> PAGE_CACHE_SHIFT; |
| 345 | nr_pages = spd.nr_pages; | 345 | nr_pages = spd.nr_pages; |
| 346 | spd.nr_pages = 0; | 346 | spd.nr_pages = 0; |
| 347 | for (page_nr = 0; page_nr < nr_pages; page_nr++) { | 347 | for (page_nr = 0; page_nr < nr_pages; page_nr++) { |
| 348 | unsigned int this_len; | 348 | unsigned int this_len; |
| 349 | 349 | ||
| 350 | if (!len) | 350 | if (!len) |
| 351 | break; | 351 | break; |
| 352 | 352 | ||
| 353 | /* | 353 | /* |
| 354 | * this_len is the max we'll use from this page | 354 | * this_len is the max we'll use from this page |
| 355 | */ | 355 | */ |
| 356 | this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); | 356 | this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); |
| 357 | page = pages[page_nr]; | 357 | page = pages[page_nr]; |
| 358 | 358 | ||
| 359 | /* | 359 | /* |
| 360 | * If the page isn't uptodate, we may need to start io on it | 360 | * If the page isn't uptodate, we may need to start io on it |
| 361 | */ | 361 | */ |
| 362 | if (!PageUptodate(page)) { | 362 | if (!PageUptodate(page)) { |
| 363 | /* | 363 | /* |
| 364 | * If in nonblock mode then don't block on waiting | 364 | * If in nonblock mode then don't block on waiting |
| 365 | * for an in-flight io page | 365 | * for an in-flight io page |
| 366 | */ | 366 | */ |
| 367 | if (flags & SPLICE_F_NONBLOCK) { | 367 | if (flags & SPLICE_F_NONBLOCK) { |
| 368 | if (TestSetPageLocked(page)) | 368 | if (TestSetPageLocked(page)) |
| 369 | break; | 369 | break; |
| 370 | } else | 370 | } else |
| 371 | lock_page(page); | 371 | lock_page(page); |
| 372 | 372 | ||
| 373 | /* | 373 | /* |
| 374 | * page was truncated, stop here. if this isn't the | 374 | * page was truncated, stop here. if this isn't the |
| 375 | * first page, we'll just complete what we already | 375 | * first page, we'll just complete what we already |
| 376 | * added | 376 | * added |
| 377 | */ | 377 | */ |
| 378 | if (!page->mapping) { | 378 | if (!page->mapping) { |
| 379 | unlock_page(page); | 379 | unlock_page(page); |
| 380 | break; | 380 | break; |
| 381 | } | 381 | } |
| 382 | /* | 382 | /* |
| 383 | * page was already under io and is now done, great | 383 | * page was already under io and is now done, great |
| 384 | */ | 384 | */ |
| 385 | if (PageUptodate(page)) { | 385 | if (PageUptodate(page)) { |
| 386 | unlock_page(page); | 386 | unlock_page(page); |
| 387 | goto fill_it; | 387 | goto fill_it; |
| 388 | } | 388 | } |
| 389 | 389 | ||
| 390 | /* | 390 | /* |
| 391 | * need to read in the page | 391 | * need to read in the page |
| 392 | */ | 392 | */ |
| 393 | error = mapping->a_ops->readpage(in, page); | 393 | error = mapping->a_ops->readpage(in, page); |
| 394 | if (unlikely(error)) { | 394 | if (unlikely(error)) { |
| 395 | /* | 395 | /* |
| 396 | * We really should re-lookup the page here, | 396 | * We really should re-lookup the page here, |
| 397 | * but it complicates things a lot. Instead | 397 | * but it complicates things a lot. Instead |
| 398 | * let's just do what we already stored, and | 398 | * let's just do what we already stored, and |
| 399 | * we'll get it the next time we are called. | 399 | * we'll get it the next time we are called. |
| 400 | */ | 400 | */ |
| 401 | if (error == AOP_TRUNCATED_PAGE) | 401 | if (error == AOP_TRUNCATED_PAGE) |
| 402 | error = 0; | 402 | error = 0; |
| 403 | 403 | ||
| 404 | break; | 404 | break; |
| 405 | } | 405 | } |
| 406 | } | 406 | } |
| 407 | fill_it: | 407 | fill_it: |
| 408 | /* | 408 | /* |
| 409 | * i_size must be checked after PageUptodate. | 409 | * i_size must be checked after PageUptodate. |
| 410 | */ | 410 | */ |
| 411 | isize = i_size_read(mapping->host); | 411 | isize = i_size_read(mapping->host); |
| 412 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; | 412 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; |
| 413 | if (unlikely(!isize || index > end_index)) | 413 | if (unlikely(!isize || index > end_index)) |
| 414 | break; | 414 | break; |
| 415 | 415 | ||
| 416 | /* | 416 | /* |
| 417 | * if this is the last page, see if we need to shrink | 417 | * if this is the last page, see if we need to shrink |
| 418 | * the length and stop | 418 | * the length and stop |
| 419 | */ | 419 | */ |
| 420 | if (end_index == index) { | 420 | if (end_index == index) { |
| 421 | unsigned int plen; | 421 | unsigned int plen; |
| 422 | 422 | ||
| 423 | /* | 423 | /* |
| 424 | * max good bytes in this page | 424 | * max good bytes in this page |
| 425 | */ | 425 | */ |
| 426 | plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; | 426 | plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; |
| 427 | if (plen <= loff) | 427 | if (plen <= loff) |
| 428 | break; | 428 | break; |
| 429 | 429 | ||
| 430 | /* | 430 | /* |
| 431 | * force quit after adding this page | 431 | * force quit after adding this page |
| 432 | */ | 432 | */ |
| 433 | this_len = min(this_len, plen - loff); | 433 | this_len = min(this_len, plen - loff); |
| 434 | len = this_len; | 434 | len = this_len; |
| 435 | } | 435 | } |
| 436 | 436 | ||
| 437 | partial[page_nr].offset = loff; | 437 | partial[page_nr].offset = loff; |
| 438 | partial[page_nr].len = this_len; | 438 | partial[page_nr].len = this_len; |
| 439 | len -= this_len; | 439 | len -= this_len; |
| 440 | loff = 0; | 440 | loff = 0; |
| 441 | spd.nr_pages++; | 441 | spd.nr_pages++; |
| 442 | index++; | 442 | index++; |
| 443 | } | 443 | } |
| 444 | 444 | ||
| 445 | /* | 445 | /* |
| 446 | * Release any pages at the end, if we quit early. 'page_nr' is how far | 446 | * Release any pages at the end, if we quit early. 'page_nr' is how far |
| 447 | * we got, 'nr_pages' is how many pages are in the map. | 447 | * we got, 'nr_pages' is how many pages are in the map. |
| 448 | */ | 448 | */ |
| 449 | while (page_nr < nr_pages) | 449 | while (page_nr < nr_pages) |
| 450 | page_cache_release(pages[page_nr++]); | 450 | page_cache_release(pages[page_nr++]); |
| 451 | 451 | ||
| 452 | if (spd.nr_pages) | 452 | if (spd.nr_pages) |
| 453 | return splice_to_pipe(pipe, &spd); | 453 | return splice_to_pipe(pipe, &spd); |
| 454 | 454 | ||
| 455 | return error; | 455 | return error; |
| 456 | } | 456 | } |
| 457 | 457 | ||
| 458 | /** | 458 | /** |
| 459 | * generic_file_splice_read - splice data from file to a pipe | 459 | * generic_file_splice_read - splice data from file to a pipe |
| 460 | * @in: file to splice from | 460 | * @in: file to splice from |
| 461 | * @ppos: position in @in | 461 | * @ppos: position in @in |
| 462 | * @pipe: pipe to splice to | 462 | * @pipe: pipe to splice to |
| 463 | * @len: number of bytes to splice | 463 | * @len: number of bytes to splice |
| 464 | * @flags: splice modifier flags | 464 | * @flags: splice modifier flags |
| 465 | * | 465 | * |
| 466 | * Description: | 466 | * Description: |
| 467 | * Will read pages from given file and fill them into a pipe. Can be | 467 | * Will read pages from given file and fill them into a pipe. Can be |
| 468 | * used as long as the address_space operations for the source implements | 468 | * used as long as the address_space operations for the source implements |
| 469 | * a readpage() hook. | 469 | * a readpage() hook. |
| 470 | * | 470 | * |
| 471 | */ | 471 | */ |
| 472 | ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, | 472 | ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, |
| 473 | struct pipe_inode_info *pipe, size_t len, | 473 | struct pipe_inode_info *pipe, size_t len, |
| 474 | unsigned int flags) | 474 | unsigned int flags) |
| 475 | { | 475 | { |
| 476 | ssize_t spliced; | 476 | ssize_t spliced; |
| 477 | int ret; | 477 | int ret; |
| 478 | loff_t isize, left; | 478 | loff_t isize, left; |
| 479 | 479 | ||
| 480 | isize = i_size_read(in->f_mapping->host); | 480 | isize = i_size_read(in->f_mapping->host); |
| 481 | if (unlikely(*ppos >= isize)) | 481 | if (unlikely(*ppos >= isize)) |
| 482 | return 0; | 482 | return 0; |
| 483 | 483 | ||
| 484 | left = isize - *ppos; | 484 | left = isize - *ppos; |
| 485 | if (unlikely(left < len)) | 485 | if (unlikely(left < len)) |
| 486 | len = left; | 486 | len = left; |
| 487 | 487 | ||
| 488 | ret = 0; | 488 | ret = 0; |
| 489 | spliced = 0; | 489 | spliced = 0; |
| 490 | while (len) { | 490 | while (len) { |
| 491 | ret = __generic_file_splice_read(in, ppos, pipe, len, flags); | 491 | ret = __generic_file_splice_read(in, ppos, pipe, len, flags); |
| 492 | 492 | ||
| 493 | if (ret < 0) | 493 | if (ret < 0) |
| 494 | break; | 494 | break; |
| 495 | else if (!ret) { | 495 | else if (!ret) { |
| 496 | if (spliced) | 496 | if (spliced) |
| 497 | break; | 497 | break; |
| 498 | if (flags & SPLICE_F_NONBLOCK) { | 498 | if (flags & SPLICE_F_NONBLOCK) { |
| 499 | ret = -EAGAIN; | 499 | ret = -EAGAIN; |
| 500 | break; | 500 | break; |
| 501 | } | 501 | } |
| 502 | } | 502 | } |
| 503 | 503 | ||
| 504 | *ppos += ret; | 504 | *ppos += ret; |
| 505 | len -= ret; | 505 | len -= ret; |
| 506 | spliced += ret; | 506 | spliced += ret; |
| 507 | } | 507 | } |
| 508 | 508 | ||
| 509 | if (spliced) | 509 | if (spliced) |
| 510 | return spliced; | 510 | return spliced; |
| 511 | 511 | ||
| 512 | return ret; | 512 | return ret; |
| 513 | } | 513 | } |
| 514 | 514 | ||
| 515 | EXPORT_SYMBOL(generic_file_splice_read); | 515 | EXPORT_SYMBOL(generic_file_splice_read); |
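
On the user side, generic_file_splice_read() is what services splice(2) when the input is a regular file. A standalone illustration (assuming glibc's splice() wrapper) that moves file data to stdout through a pipe:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            int in, fd[2];
            ssize_t n;

            if (argc < 2 || (in = open(argv[1], O_RDONLY)) < 0 || pipe(fd) < 0)
                    return 1;

            /* file -> pipe, then pipe -> stdout, 64k at a time */
            while ((n = splice(in, NULL, fd[1], NULL, 65536, SPLICE_F_MOVE)) > 0)
                    if (splice(fd[0], NULL, STDOUT_FILENO, NULL, n,
                               SPLICE_F_MOVE) < 0)
                            return 1;

            return n < 0;
    }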
| 516 | 516 | ||
| 517 | /* | 517 | /* |
| 518 | * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' | 518 | * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' |
| 519 | * using sendpage(). Return the number of bytes sent. | 519 | * using sendpage(). Return the number of bytes sent. |
| 520 | */ | 520 | */ |
| 521 | static int pipe_to_sendpage(struct pipe_inode_info *pipe, | 521 | static int pipe_to_sendpage(struct pipe_inode_info *pipe, |
| 522 | struct pipe_buffer *buf, struct splice_desc *sd) | 522 | struct pipe_buffer *buf, struct splice_desc *sd) |
| 523 | { | 523 | { |
| 524 | struct file *file = sd->u.file; | 524 | struct file *file = sd->u.file; |
| 525 | loff_t pos = sd->pos; | 525 | loff_t pos = sd->pos; |
| 526 | int ret, more; | 526 | int ret, more; |
| 527 | 527 | ||
| 528 | ret = buf->ops->pin(pipe, buf); | 528 | ret = buf->ops->confirm(pipe, buf); |
| 529 | if (!ret) { | 529 | if (!ret) { |
| 530 | more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; | 530 | more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; |
| 531 | 531 | ||
| 532 | ret = file->f_op->sendpage(file, buf->page, buf->offset, | 532 | ret = file->f_op->sendpage(file, buf->page, buf->offset, |
| 533 | sd->len, &pos, more); | 533 | sd->len, &pos, more); |
| 534 | } | 534 | } |
| 535 | 535 | ||
| 536 | return ret; | 536 | return ret; |
| 537 | } | 537 | } |
| 538 | 538 | ||
| 539 | /* | 539 | /* |
| 540 | * This is a little more tricky than the file -> pipe splicing. There are | 540 | * This is a little more tricky than the file -> pipe splicing. There are |
| 541 | * basically three cases: | 541 | * basically three cases: |
| 542 | * | 542 | * |
| 543 | * - Destination page already exists in the address space and there | 543 | * - Destination page already exists in the address space and there |
| 544 | * are users of it. For that case we have no other option than | 544 | * are users of it. For that case we have no other option than |
| 545 | * copying the data. Tough luck. | 545 | * copying the data. Tough luck. |
| 546 | * - Destination page already exists in the address space, but there | 546 | * - Destination page already exists in the address space, but there |
| 547 | * are no users of it. Make sure it's uptodate, then drop it. Fall | 547 | * are no users of it. Make sure it's uptodate, then drop it. Fall |
| 548 | * through to last case. | 548 | * through to last case. |
| 549 | * - Destination page does not exist, we can add the pipe page to | 549 | * - Destination page does not exist, we can add the pipe page to |
| 550 | * the page cache and avoid the copy. | 550 | * the page cache and avoid the copy. |
| 551 | * | 551 | * |
| 552 | * If asked to move pages to the output file (SPLICE_F_MOVE is set in | 552 | * If asked to move pages to the output file (SPLICE_F_MOVE is set in |
| 553 | * sd->flags), we attempt to migrate pages from the pipe to the output | 553 | * sd->flags), we attempt to migrate pages from the pipe to the output |
| 554 | * file address space page cache. This is possible if no one else has | 554 | * file address space page cache. This is possible if no one else has |
| 555 | * the pipe page referenced outside of the pipe and page cache. If | 555 | * the pipe page referenced outside of the pipe and page cache. If |
| 556 | * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create | 556 | * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create |
| 557 | * a new page in the output file page cache and fill/dirty that. | 557 | * a new page in the output file page cache and fill/dirty that. |
| 558 | */ | 558 | */ |
| 559 | static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, | 559 | static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, |
| 560 | struct splice_desc *sd) | 560 | struct splice_desc *sd) |
| 561 | { | 561 | { |
| 562 | struct file *file = sd->u.file; | 562 | struct file *file = sd->u.file; |
| 563 | struct address_space *mapping = file->f_mapping; | 563 | struct address_space *mapping = file->f_mapping; |
| 564 | unsigned int offset, this_len; | 564 | unsigned int offset, this_len; |
| 565 | struct page *page; | 565 | struct page *page; |
| 566 | pgoff_t index; | 566 | pgoff_t index; |
| 567 | int ret; | 567 | int ret; |
| 568 | 568 | ||
| 569 | /* | 569 | /* |
| 570 | * make sure the data in this buffer is uptodate | 570 | * make sure the data in this buffer is uptodate |
| 571 | */ | 571 | */ |
| 572 | ret = buf->ops->pin(pipe, buf); | 572 | ret = buf->ops->confirm(pipe, buf); |
| 573 | if (unlikely(ret)) | 573 | if (unlikely(ret)) |
| 574 | return ret; | 574 | return ret; |
| 575 | 575 | ||
| 576 | index = sd->pos >> PAGE_CACHE_SHIFT; | 576 | index = sd->pos >> PAGE_CACHE_SHIFT; |
| 577 | offset = sd->pos & ~PAGE_CACHE_MASK; | 577 | offset = sd->pos & ~PAGE_CACHE_MASK; |
| 578 | 578 | ||
| 579 | this_len = sd->len; | 579 | this_len = sd->len; |
| 580 | if (this_len + offset > PAGE_CACHE_SIZE) | 580 | if (this_len + offset > PAGE_CACHE_SIZE) |
| 581 | this_len = PAGE_CACHE_SIZE - offset; | 581 | this_len = PAGE_CACHE_SIZE - offset; |
| 582 | 582 | ||
| 583 | find_page: | 583 | find_page: |
| 584 | page = find_lock_page(mapping, index); | 584 | page = find_lock_page(mapping, index); |
| 585 | if (!page) { | 585 | if (!page) { |
| 586 | ret = -ENOMEM; | 586 | ret = -ENOMEM; |
| 587 | page = page_cache_alloc_cold(mapping); | 587 | page = page_cache_alloc_cold(mapping); |
| 588 | if (unlikely(!page)) | 588 | if (unlikely(!page)) |
| 589 | goto out_ret; | 589 | goto out_ret; |
| 590 | 590 | ||
| 591 | /* | 591 | /* |
| 592 | * This will also lock the page | 592 | * This will also lock the page |
| 593 | */ | 593 | */ |
| 594 | ret = add_to_page_cache_lru(page, mapping, index, | 594 | ret = add_to_page_cache_lru(page, mapping, index, |
| 595 | GFP_KERNEL); | 595 | GFP_KERNEL); |
| 596 | if (unlikely(ret)) | 596 | if (unlikely(ret)) |
| 597 | goto out; | 597 | goto out; |
| 598 | } | 598 | } |
| 599 | 599 | ||
| 600 | ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); | 600 | ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); |
| 601 | if (unlikely(ret)) { | 601 | if (unlikely(ret)) { |
| 602 | loff_t isize = i_size_read(mapping->host); | 602 | loff_t isize = i_size_read(mapping->host); |
| 603 | 603 | ||
| 604 | if (ret != AOP_TRUNCATED_PAGE) | 604 | if (ret != AOP_TRUNCATED_PAGE) |
| 605 | unlock_page(page); | 605 | unlock_page(page); |
| 606 | page_cache_release(page); | 606 | page_cache_release(page); |
| 607 | if (ret == AOP_TRUNCATED_PAGE) | 607 | if (ret == AOP_TRUNCATED_PAGE) |
| 608 | goto find_page; | 608 | goto find_page; |
| 609 | 609 | ||
| 610 | /* | 610 | /* |
| 611 | * prepare_write() may have instantiated a few blocks | 611 | * prepare_write() may have instantiated a few blocks |
| 612 | * outside i_size. Trim these off again. | 612 | * outside i_size. Trim these off again. |
| 613 | */ | 613 | */ |
| 614 | if (sd->pos + this_len > isize) | 614 | if (sd->pos + this_len > isize) |
| 615 | vmtruncate(mapping->host, isize); | 615 | vmtruncate(mapping->host, isize); |
| 616 | 616 | ||
| 617 | goto out_ret; | 617 | goto out_ret; |
| 618 | } | 618 | } |
| 619 | 619 | ||
| 620 | if (buf->page != page) { | 620 | if (buf->page != page) { |
| 621 | /* | 621 | /* |
| 622 | * Careful, ->map() uses KM_USER0! | 622 | * Careful, ->map() uses KM_USER0! |
| 623 | */ | 623 | */ |
| 624 | char *src = buf->ops->map(pipe, buf, 1); | 624 | char *src = buf->ops->map(pipe, buf, 1); |
| 625 | char *dst = kmap_atomic(page, KM_USER1); | 625 | char *dst = kmap_atomic(page, KM_USER1); |
| 626 | 626 | ||
| 627 | memcpy(dst + offset, src + buf->offset, this_len); | 627 | memcpy(dst + offset, src + buf->offset, this_len); |
| 628 | flush_dcache_page(page); | 628 | flush_dcache_page(page); |
| 629 | kunmap_atomic(dst, KM_USER1); | 629 | kunmap_atomic(dst, KM_USER1); |
| 630 | buf->ops->unmap(pipe, buf, src); | 630 | buf->ops->unmap(pipe, buf, src); |
| 631 | } | 631 | } |
| 632 | 632 | ||
| 633 | ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); | 633 | ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); |
| 634 | if (ret) { | 634 | if (ret) { |
| 635 | if (ret == AOP_TRUNCATED_PAGE) { | 635 | if (ret == AOP_TRUNCATED_PAGE) { |
| 636 | page_cache_release(page); | 636 | page_cache_release(page); |
| 637 | goto find_page; | 637 | goto find_page; |
| 638 | } | 638 | } |
| 639 | if (ret < 0) | 639 | if (ret < 0) |
| 640 | goto out; | 640 | goto out; |
| 641 | /* | 641 | /* |
| 642 | * A partial write has happened, so 'ret' is already initialized to | 642 | * A partial write has happened, so 'ret' is already initialized to |
| 643 | * the number of bytes written; there is nothing we have to do here. | 643 | * the number of bytes written; there is nothing we have to do here. |
| 644 | */ | 644 | */ |
| 645 | } else | 645 | } else |
| 646 | ret = this_len; | 646 | ret = this_len; |
| 647 | /* | 647 | /* |
| 648 | * Return the number of bytes written and mark page as | 648 | * Return the number of bytes written and mark page as |
| 649 | * accessed, we are now done! | 649 | * accessed, we are now done! |
| 650 | */ | 650 | */ |
| 651 | mark_page_accessed(page); | 651 | mark_page_accessed(page); |
| 652 | out: | 652 | out: |
| 653 | unlock_page(page); | 653 | unlock_page(page); |
| 654 | page_cache_release(page); | 654 | page_cache_release(page); |
| 655 | out_ret: | 655 | out_ret: |
| 656 | return ret; | 656 | return ret; |
| 657 | } | 657 | } |
| 658 | 658 | ||
| 659 | /** | 659 | /** |
| 660 | * __splice_from_pipe - splice data from a pipe to given actor | 660 | * __splice_from_pipe - splice data from a pipe to given actor |
| 661 | * @pipe: pipe to splice from | 661 | * @pipe: pipe to splice from |
| 662 | * @sd: information to @actor | 662 | * @sd: information to @actor |
| 663 | * @actor: handler that splices the data | 663 | * @actor: handler that splices the data |
| 664 | * | 664 | * |
| 665 | * Description: | 665 | * Description: |
| 666 | * This function does little more than loop over the pipe and call | 666 | * This function does little more than loop over the pipe and call |
| 667 | * @actor to do the actual moving of a single struct pipe_buffer to | 667 | * @actor to do the actual moving of a single struct pipe_buffer to |
| 668 | * the desired destination. See pipe_to_file, pipe_to_sendpage, or | 668 | * the desired destination. See pipe_to_file, pipe_to_sendpage, or |
| 669 | * pipe_to_user. | 669 | * pipe_to_user. |
| 670 | * | 670 | * |
| 671 | */ | 671 | */ |
| 672 | ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, | 672 | ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, |
| 673 | splice_actor *actor) | 673 | splice_actor *actor) |
| 674 | { | 674 | { |
| 675 | int ret, do_wakeup, err; | 675 | int ret, do_wakeup, err; |
| 676 | 676 | ||
| 677 | ret = 0; | 677 | ret = 0; |
| 678 | do_wakeup = 0; | 678 | do_wakeup = 0; |
| 679 | 679 | ||
| 680 | for (;;) { | 680 | for (;;) { |
| 681 | if (pipe->nrbufs) { | 681 | if (pipe->nrbufs) { |
| 682 | struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; | 682 | struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; |
| 683 | const struct pipe_buf_operations *ops = buf->ops; | 683 | const struct pipe_buf_operations *ops = buf->ops; |
| 684 | 684 | ||
| 685 | sd->len = buf->len; | 685 | sd->len = buf->len; |
| 686 | if (sd->len > sd->total_len) | 686 | if (sd->len > sd->total_len) |
| 687 | sd->len = sd->total_len; | 687 | sd->len = sd->total_len; |
| 688 | 688 | ||
| 689 | err = actor(pipe, buf, sd); | 689 | err = actor(pipe, buf, sd); |
| 690 | if (err <= 0) { | 690 | if (err <= 0) { |
| 691 | if (!ret && err != -ENODATA) | 691 | if (!ret && err != -ENODATA) |
| 692 | ret = err; | 692 | ret = err; |
| 693 | 693 | ||
| 694 | break; | 694 | break; |
| 695 | } | 695 | } |
| 696 | 696 | ||
| 697 | ret += err; | 697 | ret += err; |
| 698 | buf->offset += err; | 698 | buf->offset += err; |
| 699 | buf->len -= err; | 699 | buf->len -= err; |
| 700 | 700 | ||
| 701 | sd->len -= err; | 701 | sd->len -= err; |
| 702 | sd->pos += err; | 702 | sd->pos += err; |
| 703 | sd->total_len -= err; | 703 | sd->total_len -= err; |
| 704 | if (sd->len) | 704 | if (sd->len) |
| 705 | continue; | 705 | continue; |
| 706 | 706 | ||
| 707 | if (!buf->len) { | 707 | if (!buf->len) { |
| 708 | buf->ops = NULL; | 708 | buf->ops = NULL; |
| 709 | ops->release(pipe, buf); | 709 | ops->release(pipe, buf); |
| 710 | pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); | 710 | pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); |
| 711 | pipe->nrbufs--; | 711 | pipe->nrbufs--; |
| 712 | if (pipe->inode) | 712 | if (pipe->inode) |
| 713 | do_wakeup = 1; | 713 | do_wakeup = 1; |
| 714 | } | 714 | } |
| 715 | 715 | ||
| 716 | if (!sd->total_len) | 716 | if (!sd->total_len) |
| 717 | break; | 717 | break; |
| 718 | } | 718 | } |
| 719 | 719 | ||
| 720 | if (pipe->nrbufs) | 720 | if (pipe->nrbufs) |
| 721 | continue; | 721 | continue; |
| 722 | if (!pipe->writers) | 722 | if (!pipe->writers) |
| 723 | break; | 723 | break; |
| 724 | if (!pipe->waiting_writers) { | 724 | if (!pipe->waiting_writers) { |
| 725 | if (ret) | 725 | if (ret) |
| 726 | break; | 726 | break; |
| 727 | } | 727 | } |
| 728 | 728 | ||
| 729 | if (sd->flags & SPLICE_F_NONBLOCK) { | 729 | if (sd->flags & SPLICE_F_NONBLOCK) { |
| 730 | if (!ret) | 730 | if (!ret) |
| 731 | ret = -EAGAIN; | 731 | ret = -EAGAIN; |
| 732 | break; | 732 | break; |
| 733 | } | 733 | } |
| 734 | 734 | ||
| 735 | if (signal_pending(current)) { | 735 | if (signal_pending(current)) { |
| 736 | if (!ret) | 736 | if (!ret) |
| 737 | ret = -ERESTARTSYS; | 737 | ret = -ERESTARTSYS; |
| 738 | break; | 738 | break; |
| 739 | } | 739 | } |
| 740 | 740 | ||
| 741 | if (do_wakeup) { | 741 | if (do_wakeup) { |
| 742 | smp_mb(); | 742 | smp_mb(); |
| 743 | if (waitqueue_active(&pipe->wait)) | 743 | if (waitqueue_active(&pipe->wait)) |
| 744 | wake_up_interruptible_sync(&pipe->wait); | 744 | wake_up_interruptible_sync(&pipe->wait); |
| 745 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); | 745 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); |
| 746 | do_wakeup = 0; | 746 | do_wakeup = 0; |
| 747 | } | 747 | } |
| 748 | 748 | ||
| 749 | pipe_wait(pipe); | 749 | pipe_wait(pipe); |
| 750 | } | 750 | } |
| 751 | 751 | ||
| 752 | if (do_wakeup) { | 752 | if (do_wakeup) { |
| 753 | smp_mb(); | 753 | smp_mb(); |
| 754 | if (waitqueue_active(&pipe->wait)) | 754 | if (waitqueue_active(&pipe->wait)) |
| 755 | wake_up_interruptible(&pipe->wait); | 755 | wake_up_interruptible(&pipe->wait); |
| 756 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); | 756 | kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); |
| 757 | } | 757 | } |
| 758 | 758 | ||
| 759 | return ret; | 759 | return ret; |
| 760 | } | 760 | } |
| 761 | EXPORT_SYMBOL(__splice_from_pipe); | 761 | EXPORT_SYMBOL(__splice_from_pipe); |
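For reference, the actor contract that __splice_from_pipe() drives is small: consume up to sd->len bytes from the current buffer and return the number of bytes consumed, or a negative errno. A minimal hypothetical actor (count_only_actor is not in the tree, just an illustration of the contract) might look like this:

static int count_only_actor(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	int ret;

	/* a good ->confirm() return means the buffer contents are there */
	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	/* claim the whole chunk as consumed without touching the data */
	return sd->len;
}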
| 762 | 762 | ||
| 763 | /** | 763 | /** |
| 764 | * splice_from_pipe - splice data from a pipe to a file | 764 | * splice_from_pipe - splice data from a pipe to a file |
| 765 | * @pipe: pipe to splice from | 765 | * @pipe: pipe to splice from |
| 766 | * @out: file to splice to | 766 | * @out: file to splice to |
| 767 | * @ppos: position in @out | 767 | * @ppos: position in @out |
| 768 | * @len: how many bytes to splice | 768 | * @len: how many bytes to splice |
| 769 | * @flags: splice modifier flags | 769 | * @flags: splice modifier flags |
| 770 | * @actor: handler that splices the data | 770 | * @actor: handler that splices the data |
| 771 | * | 771 | * |
| 772 | * Description: | 772 | * Description: |
| 773 | * See __splice_from_pipe(). This function locks the input and output inodes; | 773 | * See __splice_from_pipe(). This function locks the input and output inodes; |
| 774 | * otherwise it's identical to __splice_from_pipe(). | 774 | * otherwise it's identical to __splice_from_pipe(). |
| 775 | * | 775 | * |
| 776 | */ | 776 | */ |
| 777 | ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, | 777 | ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, |
| 778 | loff_t *ppos, size_t len, unsigned int flags, | 778 | loff_t *ppos, size_t len, unsigned int flags, |
| 779 | splice_actor *actor) | 779 | splice_actor *actor) |
| 780 | { | 780 | { |
| 781 | ssize_t ret; | 781 | ssize_t ret; |
| 782 | struct inode *inode = out->f_mapping->host; | 782 | struct inode *inode = out->f_mapping->host; |
| 783 | struct splice_desc sd = { | 783 | struct splice_desc sd = { |
| 784 | .total_len = len, | 784 | .total_len = len, |
| 785 | .flags = flags, | 785 | .flags = flags, |
| 786 | .pos = *ppos, | 786 | .pos = *ppos, |
| 787 | .u.file = out, | 787 | .u.file = out, |
| 788 | }; | 788 | }; |
| 789 | 789 | ||
| 790 | /* | 790 | /* |
| 791 | * The actor worker might be calling ->prepare_write and | 791 | * The actor worker might be calling ->prepare_write and |
| 792 | * ->commit_write. Most of the time, these expect i_mutex to | 792 | * ->commit_write. Most of the time, these expect i_mutex to |
| 793 | * be held. Since this may result in an ABBA deadlock with | 793 | * be held. Since this may result in an ABBA deadlock with |
| 794 | * pipe->inode, we have to order lock acquisition here. | 794 | * pipe->inode, we have to order lock acquisition here. |
| 795 | */ | 795 | */ |
| 796 | inode_double_lock(inode, pipe->inode); | 796 | inode_double_lock(inode, pipe->inode); |
| 797 | ret = __splice_from_pipe(pipe, &sd, actor); | 797 | ret = __splice_from_pipe(pipe, &sd, actor); |
| 798 | inode_double_unlock(inode, pipe->inode); | 798 | inode_double_unlock(inode, pipe->inode); |
| 799 | 799 | ||
| 800 | return ret; | 800 | return ret; |
| 801 | } | 801 | } |
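The comment above is the key subtlety: inode_double_lock() avoids the ABBA deadlock by always taking the two locks in a fixed global order, by inode address (the same trick link_pipe() documents further down). A standalone sketch of the idea, not the kernel's actual implementation:

static void double_lock_by_address(struct mutex *m1, struct mutex *m2)
{
	if (m1 > m2) {
		struct mutex *tmp = m1;

		m1 = m2;
		m2 = tmp;
	}

	mutex_lock(m1);
	if (m2 != m1)
		mutex_lock(m2);
}

Because every task orders the pair the same way, two processes splicing A -> B and B -> A can no longer each hold one lock while waiting for the other.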
| 802 | 802 | ||
| 803 | /** | 803 | /** |
| 804 | * generic_file_splice_write_nolock - generic_file_splice_write without mutexes | 804 | * generic_file_splice_write_nolock - generic_file_splice_write without mutexes |
| 805 | * @pipe: pipe info | 805 | * @pipe: pipe info |
| 806 | * @out: file to write to | 806 | * @out: file to write to |
| 807 | * @ppos: position in @out | 807 | * @ppos: position in @out |
| 808 | * @len: number of bytes to splice | 808 | * @len: number of bytes to splice |
| 809 | * @flags: splice modifier flags | 809 | * @flags: splice modifier flags |
| 810 | * | 810 | * |
| 811 | * Description: | 811 | * Description: |
| 812 | * Will either move or copy pages (determined by @flags options) from | 812 | * Will either move or copy pages (determined by @flags options) from |
| 813 | * the given pipe inode to the given file. The caller is responsible | 813 | * the given pipe inode to the given file. The caller is responsible |
| 814 | * for acquiring i_mutex on both inodes. | 814 | * for acquiring i_mutex on both inodes. |
| 815 | * | 815 | * |
| 816 | */ | 816 | */ |
| 817 | ssize_t | 817 | ssize_t |
| 818 | generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, | 818 | generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, |
| 819 | loff_t *ppos, size_t len, unsigned int flags) | 819 | loff_t *ppos, size_t len, unsigned int flags) |
| 820 | { | 820 | { |
| 821 | struct address_space *mapping = out->f_mapping; | 821 | struct address_space *mapping = out->f_mapping; |
| 822 | struct inode *inode = mapping->host; | 822 | struct inode *inode = mapping->host; |
| 823 | struct splice_desc sd = { | 823 | struct splice_desc sd = { |
| 824 | .total_len = len, | 824 | .total_len = len, |
| 825 | .flags = flags, | 825 | .flags = flags, |
| 826 | .pos = *ppos, | 826 | .pos = *ppos, |
| 827 | .u.file = out, | 827 | .u.file = out, |
| 828 | }; | 828 | }; |
| 829 | ssize_t ret; | 829 | ssize_t ret; |
| 830 | int err; | 830 | int err; |
| 831 | 831 | ||
| 832 | err = remove_suid(out->f_path.dentry); | 832 | err = remove_suid(out->f_path.dentry); |
| 833 | if (unlikely(err)) | 833 | if (unlikely(err)) |
| 834 | return err; | 834 | return err; |
| 835 | 835 | ||
| 836 | ret = __splice_from_pipe(pipe, &sd, pipe_to_file); | 836 | ret = __splice_from_pipe(pipe, &sd, pipe_to_file); |
| 837 | if (ret > 0) { | 837 | if (ret > 0) { |
| 838 | unsigned long nr_pages; | 838 | unsigned long nr_pages; |
| 839 | 839 | ||
| 840 | *ppos += ret; | 840 | *ppos += ret; |
| 841 | nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 841 | nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
| 842 | 842 | ||
| 843 | /* | 843 | /* |
| 844 | * If file or inode is SYNC and we actually wrote some data, | 844 | * If file or inode is SYNC and we actually wrote some data, |
| 845 | * sync it. | 845 | * sync it. |
| 846 | */ | 846 | */ |
| 847 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { | 847 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { |
| 848 | err = generic_osync_inode(inode, mapping, | 848 | err = generic_osync_inode(inode, mapping, |
| 849 | OSYNC_METADATA|OSYNC_DATA); | 849 | OSYNC_METADATA|OSYNC_DATA); |
| 850 | 850 | ||
| 851 | if (err) | 851 | if (err) |
| 852 | ret = err; | 852 | ret = err; |
| 853 | } | 853 | } |
| 854 | balance_dirty_pages_ratelimited_nr(mapping, nr_pages); | 854 | balance_dirty_pages_ratelimited_nr(mapping, nr_pages); |
| 855 | } | 855 | } |
| 856 | 856 | ||
| 857 | return ret; | 857 | return ret; |
| 858 | } | 858 | } |
| 859 | 859 | ||
| 860 | EXPORT_SYMBOL(generic_file_splice_write_nolock); | 860 | EXPORT_SYMBOL(generic_file_splice_write_nolock); |
| 861 | 861 | ||
| 862 | /** | 862 | /** |
| 863 | * generic_file_splice_write - splice data from a pipe to a file | 863 | * generic_file_splice_write - splice data from a pipe to a file |
| 864 | * @pipe: pipe info | 864 | * @pipe: pipe info |
| 865 | * @out: file to write to | 865 | * @out: file to write to |
| 866 | * @ppos: position in @out | 866 | * @ppos: position in @out |
| 867 | * @len: number of bytes to splice | 867 | * @len: number of bytes to splice |
| 868 | * @flags: splice modifier flags | 868 | * @flags: splice modifier flags |
| 869 | * | 869 | * |
| 870 | * Description: | 870 | * Description: |
| 871 | * Will either move or copy pages (determined by @flags options) from | 871 | * Will either move or copy pages (determined by @flags options) from |
| 872 | * the given pipe inode to the given file. | 872 | * the given pipe inode to the given file. |
| 873 | * | 873 | * |
| 874 | */ | 874 | */ |
| 875 | ssize_t | 875 | ssize_t |
| 876 | generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, | 876 | generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, |
| 877 | loff_t *ppos, size_t len, unsigned int flags) | 877 | loff_t *ppos, size_t len, unsigned int flags) |
| 878 | { | 878 | { |
| 879 | struct address_space *mapping = out->f_mapping; | 879 | struct address_space *mapping = out->f_mapping; |
| 880 | struct inode *inode = mapping->host; | 880 | struct inode *inode = mapping->host; |
| 881 | ssize_t ret; | 881 | ssize_t ret; |
| 882 | int err; | 882 | int err; |
| 883 | 883 | ||
| 884 | err = should_remove_suid(out->f_path.dentry); | 884 | err = should_remove_suid(out->f_path.dentry); |
| 885 | if (unlikely(err)) { | 885 | if (unlikely(err)) { |
| 886 | mutex_lock(&inode->i_mutex); | 886 | mutex_lock(&inode->i_mutex); |
| 887 | err = __remove_suid(out->f_path.dentry, err); | 887 | err = __remove_suid(out->f_path.dentry, err); |
| 888 | mutex_unlock(&inode->i_mutex); | 888 | mutex_unlock(&inode->i_mutex); |
| 889 | if (err) | 889 | if (err) |
| 890 | return err; | 890 | return err; |
| 891 | } | 891 | } |
| 892 | 892 | ||
| 893 | ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); | 893 | ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); |
| 894 | if (ret > 0) { | 894 | if (ret > 0) { |
| 895 | unsigned long nr_pages; | 895 | unsigned long nr_pages; |
| 896 | 896 | ||
| 897 | *ppos += ret; | 897 | *ppos += ret; |
| 898 | nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 898 | nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
| 899 | 899 | ||
| 900 | /* | 900 | /* |
| 901 | * If file or inode is SYNC and we actually wrote some data, | 901 | * If file or inode is SYNC and we actually wrote some data, |
| 902 | * sync it. | 902 | * sync it. |
| 903 | */ | 903 | */ |
| 904 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { | 904 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { |
| 905 | mutex_lock(&inode->i_mutex); | 905 | mutex_lock(&inode->i_mutex); |
| 906 | err = generic_osync_inode(inode, mapping, | 906 | err = generic_osync_inode(inode, mapping, |
| 907 | OSYNC_METADATA|OSYNC_DATA); | 907 | OSYNC_METADATA|OSYNC_DATA); |
| 908 | mutex_unlock(&inode->i_mutex); | 908 | mutex_unlock(&inode->i_mutex); |
| 909 | 909 | ||
| 910 | if (err) | 910 | if (err) |
| 911 | ret = err; | 911 | ret = err; |
| 912 | } | 912 | } |
| 913 | balance_dirty_pages_ratelimited_nr(mapping, nr_pages); | 913 | balance_dirty_pages_ratelimited_nr(mapping, nr_pages); |
| 914 | } | 914 | } |
| 915 | 915 | ||
| 916 | return ret; | 916 | return ret; |
| 917 | } | 917 | } |
| 918 | 918 | ||
| 919 | EXPORT_SYMBOL(generic_file_splice_write); | 919 | EXPORT_SYMBOL(generic_file_splice_write); |
| 920 | 920 | ||
| 921 | /** | 921 | /** |
| 922 | * generic_splice_sendpage - splice data from a pipe to a socket | 922 | * generic_splice_sendpage - splice data from a pipe to a socket |
| 923 | * @pipe: pipe to splice from | 923 | * @pipe: pipe to splice from |
| 924 | * @out: socket to write to | 924 | * @out: socket to write to |
| 925 | * @ppos: position in @out | 925 | * @ppos: position in @out |
| 926 | * @len: number of bytes to splice | 926 | * @len: number of bytes to splice |
| 927 | * @flags: splice modifier flags | 927 | * @flags: splice modifier flags |
| 928 | * | 928 | * |
| 929 | * Description: | 929 | * Description: |
| 930 | * Will send @len bytes from the pipe to a network socket. No data copying | 930 | * Will send @len bytes from the pipe to a network socket. No data copying |
| 931 | * is involved. | 931 | * is involved. |
| 932 | * | 932 | * |
| 933 | */ | 933 | */ |
| 934 | ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, | 934 | ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, |
| 935 | loff_t *ppos, size_t len, unsigned int flags) | 935 | loff_t *ppos, size_t len, unsigned int flags) |
| 936 | { | 936 | { |
| 937 | return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); | 937 | return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); |
| 938 | } | 938 | } |
| 939 | 939 | ||
| 940 | EXPORT_SYMBOL(generic_splice_sendpage); | 940 | EXPORT_SYMBOL(generic_splice_sendpage); |
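From userspace, this sendpage path is reached by splicing a file through a pipe into a connected socket. A minimal sketch, assuming file_fd and sock_fd are already-open descriptors and trimming error handling:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static ssize_t file_to_socket(int file_fd, int sock_fd, size_t len)
{
	int pfd[2];
	loff_t off = 0;
	ssize_t moved;

	if (pipe(pfd) < 0)
		return -1;

	/* file -> pipe goes through the file's ->splice_read() */
	moved = splice(file_fd, &off, pfd[1], NULL, len, SPLICE_F_MOVE);
	if (moved > 0)
		/* pipe -> socket lands in generic_splice_sendpage() */
		moved = splice(pfd[0], NULL, sock_fd, NULL, moved,
			       SPLICE_F_MOVE);

	close(pfd[0]);
	close(pfd[1]);
	return moved;
}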
| 941 | 941 | ||
| 942 | /* | 942 | /* |
| 943 | * Attempt to initiate a splice from pipe to file. | 943 | * Attempt to initiate a splice from pipe to file. |
| 944 | */ | 944 | */ |
| 945 | static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, | 945 | static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, |
| 946 | loff_t *ppos, size_t len, unsigned int flags) | 946 | loff_t *ppos, size_t len, unsigned int flags) |
| 947 | { | 947 | { |
| 948 | int ret; | 948 | int ret; |
| 949 | 949 | ||
| 950 | if (unlikely(!out->f_op || !out->f_op->splice_write)) | 950 | if (unlikely(!out->f_op || !out->f_op->splice_write)) |
| 951 | return -EINVAL; | 951 | return -EINVAL; |
| 952 | 952 | ||
| 953 | if (unlikely(!(out->f_mode & FMODE_WRITE))) | 953 | if (unlikely(!(out->f_mode & FMODE_WRITE))) |
| 954 | return -EBADF; | 954 | return -EBADF; |
| 955 | 955 | ||
| 956 | ret = rw_verify_area(WRITE, out, ppos, len); | 956 | ret = rw_verify_area(WRITE, out, ppos, len); |
| 957 | if (unlikely(ret < 0)) | 957 | if (unlikely(ret < 0)) |
| 958 | return ret; | 958 | return ret; |
| 959 | 959 | ||
| 960 | return out->f_op->splice_write(pipe, out, ppos, len, flags); | 960 | return out->f_op->splice_write(pipe, out, ppos, len, flags); |
| 961 | } | 961 | } |
| 962 | 962 | ||
| 963 | /* | 963 | /* |
| 964 | * Attempt to initiate a splice from a file to a pipe. | 964 | * Attempt to initiate a splice from a file to a pipe. |
| 965 | */ | 965 | */ |
| 966 | static long do_splice_to(struct file *in, loff_t *ppos, | 966 | static long do_splice_to(struct file *in, loff_t *ppos, |
| 967 | struct pipe_inode_info *pipe, size_t len, | 967 | struct pipe_inode_info *pipe, size_t len, |
| 968 | unsigned int flags) | 968 | unsigned int flags) |
| 969 | { | 969 | { |
| 970 | int ret; | 970 | int ret; |
| 971 | 971 | ||
| 972 | if (unlikely(!in->f_op || !in->f_op->splice_read)) | 972 | if (unlikely(!in->f_op || !in->f_op->splice_read)) |
| 973 | return -EINVAL; | 973 | return -EINVAL; |
| 974 | 974 | ||
| 975 | if (unlikely(!(in->f_mode & FMODE_READ))) | 975 | if (unlikely(!(in->f_mode & FMODE_READ))) |
| 976 | return -EBADF; | 976 | return -EBADF; |
| 977 | 977 | ||
| 978 | ret = rw_verify_area(READ, in, ppos, len); | 978 | ret = rw_verify_area(READ, in, ppos, len); |
| 979 | if (unlikely(ret < 0)) | 979 | if (unlikely(ret < 0)) |
| 980 | return ret; | 980 | return ret; |
| 981 | 981 | ||
| 982 | return in->f_op->splice_read(in, ppos, pipe, len, flags); | 982 | return in->f_op->splice_read(in, ppos, pipe, len, flags); |
| 983 | } | 983 | } |
| 984 | 984 | ||
| 985 | /** | 985 | /** |
| 986 | * splice_direct_to_actor - splices data directly between two non-pipes | 986 | * splice_direct_to_actor - splices data directly between two non-pipes |
| 987 | * @in: file to splice from | 987 | * @in: file to splice from |
| 988 | * @sd: actor information on where to splice to | 988 | * @sd: actor information on where to splice to |
| 989 | * @actor: handles the data splicing | 989 | * @actor: handles the data splicing |
| 990 | * | 990 | * |
| 991 | * Description: | 991 | * Description: |
| 992 | * This is a special case helper to splice directly between two | 992 | * This is a special case helper to splice directly between two |
| 993 | * points, without requiring an explicit pipe. Internally an allocated | 993 | * points, without requiring an explicit pipe. Internally an allocated |
| 994 | * pipe is cached in the process, and reused during the lifetime of | 994 | * pipe is cached in the process, and reused during the lifetime of |
| 995 | * that process. | 995 | * that process. |
| 996 | * | 996 | * |
| 997 | */ | 997 | */ |
| 998 | ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, | 998 | ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, |
| 999 | splice_direct_actor *actor) | 999 | splice_direct_actor *actor) |
| 1000 | { | 1000 | { |
| 1001 | struct pipe_inode_info *pipe; | 1001 | struct pipe_inode_info *pipe; |
| 1002 | long ret, bytes; | 1002 | long ret, bytes; |
| 1003 | umode_t i_mode; | 1003 | umode_t i_mode; |
| 1004 | size_t len; | 1004 | size_t len; |
| 1005 | int i, flags; | 1005 | int i, flags; |
| 1006 | 1006 | ||
| 1007 | /* | 1007 | /* |
| 1008 | * We require the input to be a regular file, as we don't want to | 1008 | * We require the input to be a regular file, as we don't want to |
| 1009 | * randomly drop data for e.g. socket -> socket splicing. Use the | 1009 | * randomly drop data for e.g. socket -> socket splicing. Use the |
| 1010 | * piped splicing for that! | 1010 | * piped splicing for that! |
| 1011 | */ | 1011 | */ |
| 1012 | i_mode = in->f_path.dentry->d_inode->i_mode; | 1012 | i_mode = in->f_path.dentry->d_inode->i_mode; |
| 1013 | if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) | 1013 | if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) |
| 1014 | return -EINVAL; | 1014 | return -EINVAL; |
| 1015 | 1015 | ||
| 1016 | /* | 1016 | /* |
| 1017 | * neither in nor out is a pipe, so set up an internal pipe attached to | 1017 | * neither in nor out is a pipe, so set up an internal pipe attached to |
| 1018 | * 'out' and transfer the wanted data from 'in' to 'out' through that | 1018 | * 'out' and transfer the wanted data from 'in' to 'out' through that |
| 1019 | */ | 1019 | */ |
| 1020 | pipe = current->splice_pipe; | 1020 | pipe = current->splice_pipe; |
| 1021 | if (unlikely(!pipe)) { | 1021 | if (unlikely(!pipe)) { |
| 1022 | pipe = alloc_pipe_info(NULL); | 1022 | pipe = alloc_pipe_info(NULL); |
| 1023 | if (!pipe) | 1023 | if (!pipe) |
| 1024 | return -ENOMEM; | 1024 | return -ENOMEM; |
| 1025 | 1025 | ||
| 1026 | /* | 1026 | /* |
| 1027 | * We don't have an immediate reader, but we'll read the stuff | 1027 | * We don't have an immediate reader, but we'll read the stuff |
| 1028 | * out of the pipe right after the splice_to_pipe(). So set | 1028 | * out of the pipe right after the splice_to_pipe(). So set |
| 1029 | * pipe->readers appropriately. | 1029 | * pipe->readers appropriately. |
| 1030 | */ | 1030 | */ |
| 1031 | pipe->readers = 1; | 1031 | pipe->readers = 1; |
| 1032 | 1032 | ||
| 1033 | current->splice_pipe = pipe; | 1033 | current->splice_pipe = pipe; |
| 1034 | } | 1034 | } |
| 1035 | 1035 | ||
| 1036 | /* | 1036 | /* |
| 1037 | * Do the splice. | 1037 | * Do the splice. |
| 1038 | */ | 1038 | */ |
| 1039 | ret = 0; | 1039 | ret = 0; |
| 1040 | bytes = 0; | 1040 | bytes = 0; |
| 1041 | len = sd->total_len; | 1041 | len = sd->total_len; |
| 1042 | flags = sd->flags; | 1042 | flags = sd->flags; |
| 1043 | 1043 | ||
| 1044 | /* | 1044 | /* |
| 1045 | * Don't block on output, we have to drain the direct pipe. | 1045 | * Don't block on output, we have to drain the direct pipe. |
| 1046 | */ | 1046 | */ |
| 1047 | sd->flags &= ~SPLICE_F_NONBLOCK; | 1047 | sd->flags &= ~SPLICE_F_NONBLOCK; |
| 1048 | 1048 | ||
| 1049 | while (len) { | 1049 | while (len) { |
| 1050 | size_t read_len, max_read_len; | 1050 | size_t read_len, max_read_len; |
| 1051 | 1051 | ||
| 1052 | /* | 1052 | /* |
| 1053 | * Do at most PIPE_BUFFERS pages worth of transfer: | 1053 | * Do at most PIPE_BUFFERS pages worth of transfer: |
| 1054 | */ | 1054 | */ |
| 1055 | max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); | 1055 | max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); |
| 1056 | 1056 | ||
| 1057 | ret = do_splice_to(in, &sd->pos, pipe, max_read_len, flags); | 1057 | ret = do_splice_to(in, &sd->pos, pipe, max_read_len, flags); |
| 1058 | if (unlikely(ret < 0)) | 1058 | if (unlikely(ret < 0)) |
| 1059 | goto out_release; | 1059 | goto out_release; |
| 1060 | 1060 | ||
| 1061 | read_len = ret; | 1061 | read_len = ret; |
| 1062 | sd->total_len = read_len; | 1062 | sd->total_len = read_len; |
| 1063 | 1063 | ||
| 1064 | /* | 1064 | /* |
| 1065 | * NOTE: nonblocking mode only applies to the input. We | 1065 | * NOTE: nonblocking mode only applies to the input. We |
| 1066 | * must not do the output in nonblocking mode as then we | 1066 | * must not do the output in nonblocking mode as then we |
| 1067 | * could get stuck data in the internal pipe: | 1067 | * could get stuck data in the internal pipe: |
| 1068 | */ | 1068 | */ |
| 1069 | ret = actor(pipe, sd); | 1069 | ret = actor(pipe, sd); |
| 1070 | if (unlikely(ret < 0)) | 1070 | if (unlikely(ret < 0)) |
| 1071 | goto out_release; | 1071 | goto out_release; |
| 1072 | 1072 | ||
| 1073 | bytes += ret; | 1073 | bytes += ret; |
| 1074 | len -= ret; | 1074 | len -= ret; |
| 1075 | 1075 | ||
| 1076 | /* | 1076 | /* |
| 1077 | * In nonblocking mode, if we got back a short read then | 1077 | * In nonblocking mode, if we got back a short read then |
| 1078 | * it was due either to an IO error or to the | 1078 | * it was due either to an IO error or to the |
| 1079 | * pagecache entry not being there. In the IO error case | 1079 | * pagecache entry not being there. In the IO error case |
| 1080 | * the _next_ splice attempt will produce a clean IO error | 1080 | * the _next_ splice attempt will produce a clean IO error |
| 1081 | * return value (not a short read), so in both cases it's | 1081 | * return value (not a short read), so in both cases it's |
| 1082 | * correct to break out of the loop here: | 1082 | * correct to break out of the loop here: |
| 1083 | */ | 1083 | */ |
| 1084 | if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len)) | 1084 | if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len)) |
| 1085 | break; | 1085 | break; |
| 1086 | } | 1086 | } |
| 1087 | 1087 | ||
| 1088 | pipe->nrbufs = pipe->curbuf = 0; | 1088 | pipe->nrbufs = pipe->curbuf = 0; |
| 1089 | 1089 | ||
| 1090 | return bytes; | 1090 | return bytes; |
| 1091 | 1091 | ||
| 1092 | out_release: | 1092 | out_release: |
| 1093 | /* | 1093 | /* |
| 1094 | * If we did an incomplete transfer we must release | 1094 | * If we did an incomplete transfer we must release |
| 1095 | * the pipe buffers in question: | 1095 | * the pipe buffers in question: |
| 1096 | */ | 1096 | */ |
| 1097 | for (i = 0; i < PIPE_BUFFERS; i++) { | 1097 | for (i = 0; i < PIPE_BUFFERS; i++) { |
| 1098 | struct pipe_buffer *buf = pipe->bufs + i; | 1098 | struct pipe_buffer *buf = pipe->bufs + i; |
| 1099 | 1099 | ||
| 1100 | if (buf->ops) { | 1100 | if (buf->ops) { |
| 1101 | buf->ops->release(pipe, buf); | 1101 | buf->ops->release(pipe, buf); |
| 1102 | buf->ops = NULL; | 1102 | buf->ops = NULL; |
| 1103 | } | 1103 | } |
| 1104 | } | 1104 | } |
| 1105 | pipe->nrbufs = pipe->curbuf = 0; | 1105 | pipe->nrbufs = pipe->curbuf = 0; |
| 1106 | 1106 | ||
| 1107 | /* | 1107 | /* |
| 1108 | * If we transferred some data, return the number of bytes: | 1108 | * If we transferred some data, return the number of bytes: |
| 1109 | */ | 1109 | */ |
| 1110 | if (bytes > 0) | 1110 | if (bytes > 0) |
| 1111 | return bytes; | 1111 | return bytes; |
| 1112 | 1112 | ||
| 1113 | return ret; | 1113 | return ret; |
| 1114 | 1114 | ||
| 1115 | } | 1115 | } |
| 1116 | EXPORT_SYMBOL(splice_direct_to_actor); | 1116 | EXPORT_SYMBOL(splice_direct_to_actor); |
| 1117 | 1117 | ||
| 1118 | static int direct_splice_actor(struct pipe_inode_info *pipe, | 1118 | static int direct_splice_actor(struct pipe_inode_info *pipe, |
| 1119 | struct splice_desc *sd) | 1119 | struct splice_desc *sd) |
| 1120 | { | 1120 | { |
| 1121 | struct file *file = sd->u.file; | 1121 | struct file *file = sd->u.file; |
| 1122 | 1122 | ||
| 1123 | return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); | 1123 | return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); |
| 1124 | } | 1124 | } |
| 1125 | 1125 | ||
| 1126 | /** | 1126 | /** |
| 1127 | * do_splice_direct - splices data directly between two files | 1127 | * do_splice_direct - splices data directly between two files |
| 1128 | * @in: file to splice from | 1128 | * @in: file to splice from |
| 1129 | * @ppos: input file offset | 1129 | * @ppos: input file offset |
| 1130 | * @out: file to splice to | 1130 | * @out: file to splice to |
| 1131 | * @len: number of bytes to splice | 1131 | * @len: number of bytes to splice |
| 1132 | * @flags: splice modifier flags | 1132 | * @flags: splice modifier flags |
| 1133 | * | 1133 | * |
| 1134 | * Description: | 1134 | * Description: |
| 1135 | * For use by do_sendfile(). splice can easily emulate sendfile, but | 1135 | * For use by do_sendfile(). splice can easily emulate sendfile, but |
| 1136 | * doing it in the application would incur an extra system call | 1136 | * doing it in the application would incur an extra system call |
| 1137 | * (splice in + splice out, as compared to just sendfile()). So this helper | 1137 | * (splice in + splice out, as compared to just sendfile()). So this helper |
| 1138 | * can splice directly through a process-private pipe. | 1138 | * can splice directly through a process-private pipe. |
| 1139 | * | 1139 | * |
| 1140 | */ | 1140 | */ |
| 1141 | long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, | 1141 | long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, |
| 1142 | size_t len, unsigned int flags) | 1142 | size_t len, unsigned int flags) |
| 1143 | { | 1143 | { |
| 1144 | struct splice_desc sd = { | 1144 | struct splice_desc sd = { |
| 1145 | .len = len, | 1145 | .len = len, |
| 1146 | .total_len = len, | 1146 | .total_len = len, |
| 1147 | .flags = flags, | 1147 | .flags = flags, |
| 1148 | .pos = *ppos, | 1148 | .pos = *ppos, |
| 1149 | .u.file = out, | 1149 | .u.file = out, |
| 1150 | }; | 1150 | }; |
| 1151 | long ret; | 1151 | long ret; |
| 1152 | 1152 | ||
| 1153 | ret = splice_direct_to_actor(in, &sd, direct_splice_actor); | 1153 | ret = splice_direct_to_actor(in, &sd, direct_splice_actor); |
| 1154 | *ppos = sd.pos; | 1154 | *ppos = sd.pos; |
| 1155 | return ret; | 1155 | return ret; |
| 1156 | } | 1156 | } |
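In other words, sys_sendfile() can now be serviced by the splice machinery. What do_splice_direct() performs internally corresponds to an ordinary sendfile() loop in userspace; a hypothetical sketch:

#include <sys/sendfile.h>
#include <unistd.h>

static int sendfile_loop(int in_fd, int out_fd, size_t len)
{
	off_t off = 0;

	while (len) {
		/* the kernel drains its process-private pipe per chunk */
		ssize_t ret = sendfile(out_fd, in_fd, &off, len);

		if (ret <= 0)
			return -1;
		len -= ret;
	}
	return 0;
}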
| 1157 | 1157 | ||
| 1158 | /* | 1158 | /* |
| 1159 | * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same | 1159 | * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same |
| 1160 | * location, so checking ->i_pipe is not enough to verify that this is a | 1160 | * location, so checking ->i_pipe is not enough to verify that this is a |
| 1161 | * pipe. | 1161 | * pipe. |
| 1162 | */ | 1162 | */ |
| 1163 | static inline struct pipe_inode_info *pipe_info(struct inode *inode) | 1163 | static inline struct pipe_inode_info *pipe_info(struct inode *inode) |
| 1164 | { | 1164 | { |
| 1165 | if (S_ISFIFO(inode->i_mode)) | 1165 | if (S_ISFIFO(inode->i_mode)) |
| 1166 | return inode->i_pipe; | 1166 | return inode->i_pipe; |
| 1167 | 1167 | ||
| 1168 | return NULL; | 1168 | return NULL; |
| 1169 | } | 1169 | } |
| 1170 | 1170 | ||
| 1171 | /* | 1171 | /* |
| 1172 | * Determine where to splice to/from. | 1172 | * Determine where to splice to/from. |
| 1173 | */ | 1173 | */ |
| 1174 | static long do_splice(struct file *in, loff_t __user *off_in, | 1174 | static long do_splice(struct file *in, loff_t __user *off_in, |
| 1175 | struct file *out, loff_t __user *off_out, | 1175 | struct file *out, loff_t __user *off_out, |
| 1176 | size_t len, unsigned int flags) | 1176 | size_t len, unsigned int flags) |
| 1177 | { | 1177 | { |
| 1178 | struct pipe_inode_info *pipe; | 1178 | struct pipe_inode_info *pipe; |
| 1179 | loff_t offset, *off; | 1179 | loff_t offset, *off; |
| 1180 | long ret; | 1180 | long ret; |
| 1181 | 1181 | ||
| 1182 | pipe = pipe_info(in->f_path.dentry->d_inode); | 1182 | pipe = pipe_info(in->f_path.dentry->d_inode); |
| 1183 | if (pipe) { | 1183 | if (pipe) { |
| 1184 | if (off_in) | 1184 | if (off_in) |
| 1185 | return -ESPIPE; | 1185 | return -ESPIPE; |
| 1186 | if (off_out) { | 1186 | if (off_out) { |
| 1187 | if (out->f_op->llseek == no_llseek) | 1187 | if (out->f_op->llseek == no_llseek) |
| 1188 | return -EINVAL; | 1188 | return -EINVAL; |
| 1189 | if (copy_from_user(&offset, off_out, sizeof(loff_t))) | 1189 | if (copy_from_user(&offset, off_out, sizeof(loff_t))) |
| 1190 | return -EFAULT; | 1190 | return -EFAULT; |
| 1191 | off = &offset; | 1191 | off = &offset; |
| 1192 | } else | 1192 | } else |
| 1193 | off = &out->f_pos; | 1193 | off = &out->f_pos; |
| 1194 | 1194 | ||
| 1195 | ret = do_splice_from(pipe, out, off, len, flags); | 1195 | ret = do_splice_from(pipe, out, off, len, flags); |
| 1196 | 1196 | ||
| 1197 | if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) | 1197 | if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) |
| 1198 | ret = -EFAULT; | 1198 | ret = -EFAULT; |
| 1199 | 1199 | ||
| 1200 | return ret; | 1200 | return ret; |
| 1201 | } | 1201 | } |
| 1202 | 1202 | ||
| 1203 | pipe = pipe_info(out->f_path.dentry->d_inode); | 1203 | pipe = pipe_info(out->f_path.dentry->d_inode); |
| 1204 | if (pipe) { | 1204 | if (pipe) { |
| 1205 | if (off_out) | 1205 | if (off_out) |
| 1206 | return -ESPIPE; | 1206 | return -ESPIPE; |
| 1207 | if (off_in) { | 1207 | if (off_in) { |
| 1208 | if (in->f_op->llseek == no_llseek) | 1208 | if (in->f_op->llseek == no_llseek) |
| 1209 | return -EINVAL; | 1209 | return -EINVAL; |
| 1210 | if (copy_from_user(&offset, off_in, sizeof(loff_t))) | 1210 | if (copy_from_user(&offset, off_in, sizeof(loff_t))) |
| 1211 | return -EFAULT; | 1211 | return -EFAULT; |
| 1212 | off = &offset; | 1212 | off = &offset; |
| 1213 | } else | 1213 | } else |
| 1214 | off = &in->f_pos; | 1214 | off = &in->f_pos; |
| 1215 | 1215 | ||
| 1216 | ret = do_splice_to(in, off, pipe, len, flags); | 1216 | ret = do_splice_to(in, off, pipe, len, flags); |
| 1217 | 1217 | ||
| 1218 | if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) | 1218 | if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) |
| 1219 | ret = -EFAULT; | 1219 | ret = -EFAULT; |
| 1220 | 1220 | ||
| 1221 | return ret; | 1221 | return ret; |
| 1222 | } | 1222 | } |
| 1223 | 1223 | ||
| 1224 | return -EINVAL; | 1224 | return -EINVAL; |
| 1225 | } | 1225 | } |
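The off_in/off_out handling above gives splice(2) its userspace offset semantics: the pipe side may never carry an offset, while the non-pipe side may use a caller-supplied offset instead of its f_pos. A small sketch of both rules, with pipe_rfd and file_fd assumed to be open descriptors:

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static ssize_t offset_rules(int pipe_rfd, int file_fd)
{
	loff_t off = 0;

	/* expected to fail: do_splice() returns -ESPIPE for this */
	if (splice(pipe_rfd, &off, file_fd, NULL, 4096, 0) >= 0 ||
	    errno != ESPIPE)
		return -1;

	/* valid: the file end writes at 'off' without moving f_pos */
	return splice(pipe_rfd, NULL, file_fd, &off, 4096, 0);
}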
| 1226 | 1226 | ||
| 1227 | /* | 1227 | /* |
| 1228 | * Map an iov into an array of pages and offset/length tuples. With the | 1228 | * Map an iov into an array of pages and offset/length tuples. With the |
| 1229 | * partial_page structure, we can map several non-contiguous ranges into | 1229 | * partial_page structure, we can map several non-contiguous ranges into |
| 1230 | * a single pages[] map instead of splitting that operation into pieces. | 1230 | * a single pages[] map instead of splitting that operation into pieces. |
| 1231 | * Could easily be exported as a generic helper for other users, in which | 1231 | * Could easily be exported as a generic helper for other users, in which |
| 1232 | * case one would probably want to add a 'max_nr_pages' parameter as well. | 1232 | * case one would probably want to add a 'max_nr_pages' parameter as well. |
| 1233 | */ | 1233 | */ |
| 1234 | static int get_iovec_page_array(const struct iovec __user *iov, | 1234 | static int get_iovec_page_array(const struct iovec __user *iov, |
| 1235 | unsigned int nr_vecs, struct page **pages, | 1235 | unsigned int nr_vecs, struct page **pages, |
| 1236 | struct partial_page *partial, int aligned) | 1236 | struct partial_page *partial, int aligned) |
| 1237 | { | 1237 | { |
| 1238 | int buffers = 0, error = 0; | 1238 | int buffers = 0, error = 0; |
| 1239 | 1239 | ||
| 1240 | /* | 1240 | /* |
| 1241 | * It's ok to take the mmap_sem for reading, even | 1241 | * It's ok to take the mmap_sem for reading, even |
| 1242 | * across a "get_user()". | 1242 | * across a "get_user()". |
| 1243 | */ | 1243 | */ |
| 1244 | down_read(¤t->mm->mmap_sem); | 1244 | down_read(¤t->mm->mmap_sem); |
| 1245 | 1245 | ||
| 1246 | while (nr_vecs) { | 1246 | while (nr_vecs) { |
| 1247 | unsigned long off, npages; | 1247 | unsigned long off, npages; |
| 1248 | void __user *base; | 1248 | void __user *base; |
| 1249 | size_t len; | 1249 | size_t len; |
| 1250 | int i; | 1250 | int i; |
| 1251 | 1251 | ||
| 1252 | /* | 1252 | /* |
| 1253 | * Get user address base and length for this iovec. | 1253 | * Get user address base and length for this iovec. |
| 1254 | */ | 1254 | */ |
| 1255 | error = get_user(base, &iov->iov_base); | 1255 | error = get_user(base, &iov->iov_base); |
| 1256 | if (unlikely(error)) | 1256 | if (unlikely(error)) |
| 1257 | break; | 1257 | break; |
| 1258 | error = get_user(len, &iov->iov_len); | 1258 | error = get_user(len, &iov->iov_len); |
| 1259 | if (unlikely(error)) | 1259 | if (unlikely(error)) |
| 1260 | break; | 1260 | break; |
| 1261 | 1261 | ||
| 1262 | /* | 1262 | /* |
| 1263 | * Sanity check this iovec. A zero-byte read succeeds. | 1263 | * Sanity check this iovec. A zero-byte read succeeds. |
| 1264 | */ | 1264 | */ |
| 1265 | if (unlikely(!len)) | 1265 | if (unlikely(!len)) |
| 1266 | break; | 1266 | break; |
| 1267 | error = -EFAULT; | 1267 | error = -EFAULT; |
| 1268 | if (unlikely(!base)) | 1268 | if (unlikely(!base)) |
| 1269 | break; | 1269 | break; |
| 1270 | 1270 | ||
| 1271 | /* | 1271 | /* |
| 1272 | * Get this base offset and number of pages, then map | 1272 | * Get this base offset and number of pages, then map |
| 1273 | * in the user pages. | 1273 | * in the user pages. |
| 1274 | */ | 1274 | */ |
| 1275 | off = (unsigned long) base & ~PAGE_MASK; | 1275 | off = (unsigned long) base & ~PAGE_MASK; |
| 1276 | 1276 | ||
| 1277 | /* | 1277 | /* |
| 1278 | * If asked for alignment, the offset must be zero and the | 1278 | * If asked for alignment, the offset must be zero and the |
| 1279 | * length a multiple of the PAGE_SIZE. | 1279 | * length a multiple of the PAGE_SIZE. |
| 1280 | */ | 1280 | */ |
| 1281 | error = -EINVAL; | 1281 | error = -EINVAL; |
| 1282 | if (aligned && (off || len & ~PAGE_MASK)) | 1282 | if (aligned && (off || len & ~PAGE_MASK)) |
| 1283 | break; | 1283 | break; |
| 1284 | 1284 | ||
| 1285 | npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; | 1285 | npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; |
| 1286 | if (npages > PIPE_BUFFERS - buffers) | 1286 | if (npages > PIPE_BUFFERS - buffers) |
| 1287 | npages = PIPE_BUFFERS - buffers; | 1287 | npages = PIPE_BUFFERS - buffers; |
| 1288 | 1288 | ||
| 1289 | error = get_user_pages(current, current->mm, | 1289 | error = get_user_pages(current, current->mm, |
| 1290 | (unsigned long) base, npages, 0, 0, | 1290 | (unsigned long) base, npages, 0, 0, |
| 1291 | &pages[buffers], NULL); | 1291 | &pages[buffers], NULL); |
| 1292 | 1292 | ||
| 1293 | if (unlikely(error <= 0)) | 1293 | if (unlikely(error <= 0)) |
| 1294 | break; | 1294 | break; |
| 1295 | 1295 | ||
| 1296 | /* | 1296 | /* |
| 1297 | * Fill this contiguous range into the partial page map. | 1297 | * Fill this contiguous range into the partial page map. |
| 1298 | */ | 1298 | */ |
| 1299 | for (i = 0; i < error; i++) { | 1299 | for (i = 0; i < error; i++) { |
| 1300 | const int plen = min_t(size_t, len, PAGE_SIZE - off); | 1300 | const int plen = min_t(size_t, len, PAGE_SIZE - off); |
| 1301 | 1301 | ||
| 1302 | partial[buffers].offset = off; | 1302 | partial[buffers].offset = off; |
| 1303 | partial[buffers].len = plen; | 1303 | partial[buffers].len = plen; |
| 1304 | 1304 | ||
| 1305 | off = 0; | 1305 | off = 0; |
| 1306 | len -= plen; | 1306 | len -= plen; |
| 1307 | buffers++; | 1307 | buffers++; |
| 1308 | } | 1308 | } |
| 1309 | 1309 | ||
| 1310 | /* | 1310 | /* |
| 1311 | * We didn't complete this iov, stop here since it probably | 1311 | * We didn't complete this iov, stop here since it probably |
| 1312 | * means we have to move some of this into a pipe to | 1312 | * means we have to move some of this into a pipe to |
| 1313 | * be able to continue. | 1313 | * be able to continue. |
| 1314 | */ | 1314 | */ |
| 1315 | if (len) | 1315 | if (len) |
| 1316 | break; | 1316 | break; |
| 1317 | 1317 | ||
| 1318 | /* | 1318 | /* |
| 1319 | * Don't continue if we mapped fewer pages than we asked for, | 1319 | * Don't continue if we mapped fewer pages than we asked for, |
| 1320 | * or if we mapped the max number of pages that we have | 1320 | * or if we mapped the max number of pages that we have |
| 1321 | * room for. | 1321 | * room for. |
| 1322 | */ | 1322 | */ |
| 1323 | if (error < npages || buffers == PIPE_BUFFERS) | 1323 | if (error < npages || buffers == PIPE_BUFFERS) |
| 1324 | break; | 1324 | break; |
| 1325 | 1325 | ||
| 1326 | nr_vecs--; | 1326 | nr_vecs--; |
| 1327 | iov++; | 1327 | iov++; |
| 1328 | } | 1328 | } |
| 1329 | 1329 | ||
| 1330 | up_read(¤t->mm->mmap_sem); | 1330 | up_read(¤t->mm->mmap_sem); |
| 1331 | 1331 | ||
| 1332 | if (buffers) | 1332 | if (buffers) |
| 1333 | return buffers; | 1333 | return buffers; |
| 1334 | 1334 | ||
| 1335 | return error; | 1335 | return error; |
| 1336 | } | 1336 | } |
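To make the offset/length splitting above concrete: with a 4096-byte page, a 6000-byte iovec whose base sits 291 bytes into its first page covers (291 + 6000 + 4095) >> 12 = 2 pages and yields the tuples (offset 291, len 3805) and (offset 0, len 2195). A standalone sketch of that inner loop:

#include <stdio.h>

#define PAGE_SIZE 4096

int main(void)
{
	unsigned long off = 291;	/* base & ~PAGE_MASK */
	size_t len = 6000;

	while (len) {
		size_t plen = len < PAGE_SIZE - off ? len : PAGE_SIZE - off;

		printf("offset %lu, len %zu\n", off, plen);
		off = 0;		/* later pages start at offset 0 */
		len -= plen;
	}
	return 0;
}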
| 1337 | 1337 | ||
| 1338 | static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, | 1338 | static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, |
| 1339 | struct splice_desc *sd) | 1339 | struct splice_desc *sd) |
| 1340 | { | 1340 | { |
| 1341 | char *src; | 1341 | char *src; |
| 1342 | int ret; | 1342 | int ret; |
| 1343 | 1343 | ||
| 1344 | ret = buf->ops->pin(pipe, buf); | 1344 | ret = buf->ops->confirm(pipe, buf); |
| 1345 | if (unlikely(ret)) | 1345 | if (unlikely(ret)) |
| 1346 | return ret; | 1346 | return ret; |
| 1347 | 1347 | ||
| 1348 | /* | 1348 | /* |
| 1349 | * See if we can use the atomic maps by prefaulting in the | 1349 | * See if we can use the atomic maps by prefaulting in the |
| 1350 | * pages and doing an atomic copy. | 1350 | * pages and doing an atomic copy. |
| 1351 | */ | 1351 | */ |
| 1352 | if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { | 1352 | if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { |
| 1353 | src = buf->ops->map(pipe, buf, 1); | 1353 | src = buf->ops->map(pipe, buf, 1); |
| 1354 | ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, | 1354 | ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, |
| 1355 | sd->len); | 1355 | sd->len); |
| 1356 | buf->ops->unmap(pipe, buf, src); | 1356 | buf->ops->unmap(pipe, buf, src); |
| 1357 | if (!ret) { | 1357 | if (!ret) { |
| 1358 | ret = sd->len; | 1358 | ret = sd->len; |
| 1359 | goto out; | 1359 | goto out; |
| 1360 | } | 1360 | } |
| 1361 | } | 1361 | } |
| 1362 | 1362 | ||
| 1363 | /* | 1363 | /* |
| 1364 | * No dice, use slow non-atomic map and copy | 1364 | * No dice, use slow non-atomic map and copy |
| 1365 | */ | 1365 | */ |
| 1366 | src = buf->ops->map(pipe, buf, 0); | 1366 | src = buf->ops->map(pipe, buf, 0); |
| 1367 | 1367 | ||
| 1368 | ret = sd->len; | 1368 | ret = sd->len; |
| 1369 | if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) | 1369 | if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) |
| 1370 | ret = -EFAULT; | 1370 | ret = -EFAULT; |
| 1371 | 1371 | ||
| 1372 | out: | 1372 | out: |
| 1373 | if (ret > 0) | 1373 | if (ret > 0) |
| 1374 | sd->u.userptr += ret; | 1374 | sd->u.userptr += ret; |
| 1375 | buf->ops->unmap(pipe, buf, src); | 1375 | buf->ops->unmap(pipe, buf, src); |
| 1376 | return ret; | 1376 | return ret; |
| 1377 | } | 1377 | } |
| 1378 | 1378 | ||
| 1379 | /* | 1379 | /* |
| 1380 | * For lack of a better implementation, implement vmsplice() to userspace | 1380 | * For lack of a better implementation, implement vmsplice() to userspace |
| 1381 | * as a simple copy of the pipe's pages to the user iov. | 1381 | * as a simple copy of the pipe's pages to the user iov. |
| 1382 | */ | 1382 | */ |
| 1383 | static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, | 1383 | static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, |
| 1384 | unsigned long nr_segs, unsigned int flags) | 1384 | unsigned long nr_segs, unsigned int flags) |
| 1385 | { | 1385 | { |
| 1386 | struct pipe_inode_info *pipe; | 1386 | struct pipe_inode_info *pipe; |
| 1387 | struct splice_desc sd; | 1387 | struct splice_desc sd; |
| 1388 | ssize_t size; | 1388 | ssize_t size; |
| 1389 | int error; | 1389 | int error; |
| 1390 | long ret; | 1390 | long ret; |
| 1391 | 1391 | ||
| 1392 | pipe = pipe_info(file->f_path.dentry->d_inode); | 1392 | pipe = pipe_info(file->f_path.dentry->d_inode); |
| 1393 | if (!pipe) | 1393 | if (!pipe) |
| 1394 | return -EBADF; | 1394 | return -EBADF; |
| 1395 | 1395 | ||
| 1396 | if (pipe->inode) | 1396 | if (pipe->inode) |
| 1397 | mutex_lock(&pipe->inode->i_mutex); | 1397 | mutex_lock(&pipe->inode->i_mutex); |
| 1398 | 1398 | ||
| 1399 | error = ret = 0; | 1399 | error = ret = 0; |
| 1400 | while (nr_segs) { | 1400 | while (nr_segs) { |
| 1401 | void __user *base; | 1401 | void __user *base; |
| 1402 | size_t len; | 1402 | size_t len; |
| 1403 | 1403 | ||
| 1404 | /* | 1404 | /* |
| 1405 | * Get user address base and length for this iovec. | 1405 | * Get user address base and length for this iovec. |
| 1406 | */ | 1406 | */ |
| 1407 | error = get_user(base, &iov->iov_base); | 1407 | error = get_user(base, &iov->iov_base); |
| 1408 | if (unlikely(error)) | 1408 | if (unlikely(error)) |
| 1409 | break; | 1409 | break; |
| 1410 | error = get_user(len, &iov->iov_len); | 1410 | error = get_user(len, &iov->iov_len); |
| 1411 | if (unlikely(error)) | 1411 | if (unlikely(error)) |
| 1412 | break; | 1412 | break; |
| 1413 | 1413 | ||
| 1414 | /* | 1414 | /* |
| 1415 | * Sanity check this iovec. A zero-byte read succeeds. | 1415 | * Sanity check this iovec. A zero-byte read succeeds. |
| 1416 | */ | 1416 | */ |
| 1417 | if (unlikely(!len)) | 1417 | if (unlikely(!len)) |
| 1418 | break; | 1418 | break; |
| 1419 | if (unlikely(!base)) { | 1419 | if (unlikely(!base)) { |
| 1420 | error = -EFAULT; | 1420 | error = -EFAULT; |
| 1421 | break; | 1421 | break; |
| 1422 | } | 1422 | } |
| 1423 | 1423 | ||
| 1424 | sd.len = 0; | 1424 | sd.len = 0; |
| 1425 | sd.total_len = len; | 1425 | sd.total_len = len; |
| 1426 | sd.flags = flags; | 1426 | sd.flags = flags; |
| 1427 | sd.u.userptr = base; | 1427 | sd.u.userptr = base; |
| 1428 | sd.pos = 0; | 1428 | sd.pos = 0; |
| 1429 | 1429 | ||
| 1430 | size = __splice_from_pipe(pipe, &sd, pipe_to_user); | 1430 | size = __splice_from_pipe(pipe, &sd, pipe_to_user); |
| 1431 | if (size < 0) { | 1431 | if (size < 0) { |
| 1432 | if (!ret) | 1432 | if (!ret) |
| 1433 | ret = size; | 1433 | ret = size; |
| 1434 | 1434 | ||
| 1435 | break; | 1435 | break; |
| 1436 | } | 1436 | } |
| 1437 | 1437 | ||
| 1438 | ret += size; | 1438 | ret += size; |
| 1439 | 1439 | ||
| 1440 | if (size < len) | 1440 | if (size < len) |
| 1441 | break; | 1441 | break; |
| 1442 | 1442 | ||
| 1443 | nr_segs--; | 1443 | nr_segs--; |
| 1444 | iov++; | 1444 | iov++; |
| 1445 | } | 1445 | } |
| 1446 | 1446 | ||
| 1447 | if (pipe->inode) | 1447 | if (pipe->inode) |
| 1448 | mutex_unlock(&pipe->inode->i_mutex); | 1448 | mutex_unlock(&pipe->inode->i_mutex); |
| 1449 | 1449 | ||
| 1450 | if (!ret) | 1450 | if (!ret) |
| 1451 | ret = error; | 1451 | ret = error; |
| 1452 | 1452 | ||
| 1453 | return ret; | 1453 | return ret; |
| 1454 | } | 1454 | } |
| 1455 | 1455 | ||
| 1456 | /* | 1456 | /* |
| 1457 | * vmsplice splices a user address range into a pipe. It can be thought of | 1457 | * vmsplice splices a user address range into a pipe. It can be thought of |
| 1458 | * as splice-from-memory, where the regular splice is splice-from-file (or | 1458 | * as splice-from-memory, where the regular splice is splice-from-file (or |
| 1459 | * to file). In both cases the output is a pipe, naturally. | 1459 | * to file). In both cases the output is a pipe, naturally. |
| 1460 | */ | 1460 | */ |
| 1461 | static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, | 1461 | static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, |
| 1462 | unsigned long nr_segs, unsigned int flags) | 1462 | unsigned long nr_segs, unsigned int flags) |
| 1463 | { | 1463 | { |
| 1464 | struct pipe_inode_info *pipe; | 1464 | struct pipe_inode_info *pipe; |
| 1465 | struct page *pages[PIPE_BUFFERS]; | 1465 | struct page *pages[PIPE_BUFFERS]; |
| 1466 | struct partial_page partial[PIPE_BUFFERS]; | 1466 | struct partial_page partial[PIPE_BUFFERS]; |
| 1467 | struct splice_pipe_desc spd = { | 1467 | struct splice_pipe_desc spd = { |
| 1468 | .pages = pages, | 1468 | .pages = pages, |
| 1469 | .partial = partial, | 1469 | .partial = partial, |
| 1470 | .flags = flags, | 1470 | .flags = flags, |
| 1471 | .ops = &user_page_pipe_buf_ops, | 1471 | .ops = &user_page_pipe_buf_ops, |
| 1472 | }; | 1472 | }; |
| 1473 | 1473 | ||
| 1474 | pipe = pipe_info(file->f_path.dentry->d_inode); | 1474 | pipe = pipe_info(file->f_path.dentry->d_inode); |
| 1475 | if (!pipe) | 1475 | if (!pipe) |
| 1476 | return -EBADF; | 1476 | return -EBADF; |
| 1477 | 1477 | ||
| 1478 | spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, | 1478 | spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, |
| 1479 | flags & SPLICE_F_GIFT); | 1479 | flags & SPLICE_F_GIFT); |
| 1480 | if (spd.nr_pages <= 0) | 1480 | if (spd.nr_pages <= 0) |
| 1481 | return spd.nr_pages; | 1481 | return spd.nr_pages; |
| 1482 | 1482 | ||
| 1483 | return splice_to_pipe(pipe, &spd); | 1483 | return splice_to_pipe(pipe, &spd); |
| 1484 | } | 1484 | } |
| 1485 | 1485 | ||
| 1486 | /* | 1486 | /* |
| 1487 | * Note that vmsplice only really supports true splicing _from_ user memory | 1487 | * Note that vmsplice only really supports true splicing _from_ user memory |
| 1488 | * to a pipe, not the other way around. Splicing from user memory is a simple | 1488 | * to a pipe, not the other way around. Splicing from user memory is a simple |
| 1489 | * operation that can be supported without any funky alignment restrictions | 1489 | * operation that can be supported without any funky alignment restrictions |
| 1490 | * or nasty vm tricks. We simply map in the user pages and fill them into | 1490 | * or nasty vm tricks. We simply map in the user pages and fill them into |
| 1491 | * a pipe. The reverse isn't quite as easy, though. There are two possible | 1491 | * a pipe. The reverse isn't quite as easy, though. There are two possible |
| 1492 | * solutions for that: | 1492 | * solutions for that: |
| 1493 | * | 1493 | * |
| 1494 | * - memcpy() the data internally, at which point we might as well just | 1494 | * - memcpy() the data internally, at which point we might as well just |
| 1495 | * do a regular read() on the buffer anyway. | 1495 | * do a regular read() on the buffer anyway. |
| 1496 | * - Lots of nasty vm tricks that are neither fast nor flexible (they | 1496 | * - Lots of nasty vm tricks that are neither fast nor flexible (they |
| 1497 | * impose restrictions on both ends of the pipe). | 1497 | * impose restrictions on both ends of the pipe). |
| 1498 | * | 1498 | * |
| 1499 | * Currently we punt and implement it as a normal copy, see pipe_to_user(). | 1499 | * Currently we punt and implement it as a normal copy, see pipe_to_user(). |
| 1500 | * | 1500 | * |
| 1501 | */ | 1501 | */ |
| 1502 | asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, | 1502 | asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, |
| 1503 | unsigned long nr_segs, unsigned int flags) | 1503 | unsigned long nr_segs, unsigned int flags) |
| 1504 | { | 1504 | { |
| 1505 | struct file *file; | 1505 | struct file *file; |
| 1506 | long error; | 1506 | long error; |
| 1507 | int fput; | 1507 | int fput; |
| 1508 | 1508 | ||
| 1509 | if (unlikely(nr_segs > UIO_MAXIOV)) | 1509 | if (unlikely(nr_segs > UIO_MAXIOV)) |
| 1510 | return -EINVAL; | 1510 | return -EINVAL; |
| 1511 | else if (unlikely(!nr_segs)) | 1511 | else if (unlikely(!nr_segs)) |
| 1512 | return 0; | 1512 | return 0; |
| 1513 | 1513 | ||
| 1514 | error = -EBADF; | 1514 | error = -EBADF; |
| 1515 | file = fget_light(fd, &fput); | 1515 | file = fget_light(fd, &fput); |
| 1516 | if (file) { | 1516 | if (file) { |
| 1517 | if (file->f_mode & FMODE_WRITE) | 1517 | if (file->f_mode & FMODE_WRITE) |
| 1518 | error = vmsplice_to_pipe(file, iov, nr_segs, flags); | 1518 | error = vmsplice_to_pipe(file, iov, nr_segs, flags); |
| 1519 | else if (file->f_mode & FMODE_READ) | 1519 | else if (file->f_mode & FMODE_READ) |
| 1520 | error = vmsplice_to_user(file, iov, nr_segs, flags); | 1520 | error = vmsplice_to_user(file, iov, nr_segs, flags); |
| 1521 | 1521 | ||
| 1522 | fput_light(file, fput); | 1522 | fput_light(file, fput); |
| 1523 | } | 1523 | } |
| 1524 | 1524 | ||
| 1525 | return error; | 1525 | return error; |
| 1526 | } | 1526 | } |
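The natively supported direction, user memory into a pipe, looks like this from userspace. A minimal sketch; gather_into_pipe, the buffer contents, and pipe_wfd (the write end of a pipe) are hypothetical:

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/uio.h>

static ssize_t gather_into_pipe(int pipe_wfd)
{
	static char hdr[] = "header";
	static char body[] = "payload";
	struct iovec iov[2] = {
		{ .iov_base = hdr,  .iov_len = sizeof(hdr) - 1 },
		{ .iov_base = body, .iov_len = sizeof(body) - 1 },
	};

	/* the FMODE_WRITE end of the pipe takes vmsplice_to_pipe() */
	return vmsplice(pipe_wfd, iov, 2, 0);
}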
| 1527 | 1527 | ||
| 1528 | asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, | 1528 | asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, |
| 1529 | int fd_out, loff_t __user *off_out, | 1529 | int fd_out, loff_t __user *off_out, |
| 1530 | size_t len, unsigned int flags) | 1530 | size_t len, unsigned int flags) |
| 1531 | { | 1531 | { |
| 1532 | long error; | 1532 | long error; |
| 1533 | struct file *in, *out; | 1533 | struct file *in, *out; |
| 1534 | int fput_in, fput_out; | 1534 | int fput_in, fput_out; |
| 1535 | 1535 | ||
| 1536 | if (unlikely(!len)) | 1536 | if (unlikely(!len)) |
| 1537 | return 0; | 1537 | return 0; |
| 1538 | 1538 | ||
| 1539 | error = -EBADF; | 1539 | error = -EBADF; |
| 1540 | in = fget_light(fd_in, &fput_in); | 1540 | in = fget_light(fd_in, &fput_in); |
| 1541 | if (in) { | 1541 | if (in) { |
| 1542 | if (in->f_mode & FMODE_READ) { | 1542 | if (in->f_mode & FMODE_READ) { |
| 1543 | out = fget_light(fd_out, &fput_out); | 1543 | out = fget_light(fd_out, &fput_out); |
| 1544 | if (out) { | 1544 | if (out) { |
| 1545 | if (out->f_mode & FMODE_WRITE) | 1545 | if (out->f_mode & FMODE_WRITE) |
| 1546 | error = do_splice(in, off_in, | 1546 | error = do_splice(in, off_in, |
| 1547 | out, off_out, | 1547 | out, off_out, |
| 1548 | len, flags); | 1548 | len, flags); |
| 1549 | fput_light(out, fput_out); | 1549 | fput_light(out, fput_out); |
| 1550 | } | 1550 | } |
| 1551 | } | 1551 | } |
| 1552 | 1552 | ||
| 1553 | fput_light(in, fput_in); | 1553 | fput_light(in, fput_in); |
| 1554 | } | 1554 | } |
| 1555 | 1555 | ||
| 1556 | return error; | 1556 | return error; |
| 1557 | } | 1557 | } |
| 1558 | 1558 | ||
| 1559 | /* | 1559 | /* |
| 1560 | * Make sure there's data to read. Wait for input if we can, otherwise | 1560 | * Make sure there's data to read. Wait for input if we can, otherwise |
| 1561 | * return an appropriate error. | 1561 | * return an appropriate error. |
| 1562 | */ | 1562 | */ |
| 1563 | static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) | 1563 | static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) |
| 1564 | { | 1564 | { |
| 1565 | int ret; | 1565 | int ret; |
| 1566 | 1566 | ||
| 1567 | /* | 1567 | /* |
| 1568 | * Check ->nrbufs without the inode lock first. This function | 1568 | * Check ->nrbufs without the inode lock first. This function |
| 1569 | * is speculative anyway, so missing one is ok. | 1569 | * is speculative anyway, so missing one is ok. |
| 1570 | */ | 1570 | */ |
| 1571 | if (pipe->nrbufs) | 1571 | if (pipe->nrbufs) |
| 1572 | return 0; | 1572 | return 0; |
| 1573 | 1573 | ||
| 1574 | ret = 0; | 1574 | ret = 0; |
| 1575 | mutex_lock(&pipe->inode->i_mutex); | 1575 | mutex_lock(&pipe->inode->i_mutex); |
| 1576 | 1576 | ||
| 1577 | while (!pipe->nrbufs) { | 1577 | while (!pipe->nrbufs) { |
| 1578 | if (signal_pending(current)) { | 1578 | if (signal_pending(current)) { |
| 1579 | ret = -ERESTARTSYS; | 1579 | ret = -ERESTARTSYS; |
| 1580 | break; | 1580 | break; |
| 1581 | } | 1581 | } |
| 1582 | if (!pipe->writers) | 1582 | if (!pipe->writers) |
| 1583 | break; | 1583 | break; |
| 1584 | if (!pipe->waiting_writers) { | 1584 | if (!pipe->waiting_writers) { |
| 1585 | if (flags & SPLICE_F_NONBLOCK) { | 1585 | if (flags & SPLICE_F_NONBLOCK) { |
| 1586 | ret = -EAGAIN; | 1586 | ret = -EAGAIN; |
| 1587 | break; | 1587 | break; |
| 1588 | } | 1588 | } |
| 1589 | } | 1589 | } |
| 1590 | pipe_wait(pipe); | 1590 | pipe_wait(pipe); |
| 1591 | } | 1591 | } |
| 1592 | 1592 | ||
| 1593 | mutex_unlock(&pipe->inode->i_mutex); | 1593 | mutex_unlock(&pipe->inode->i_mutex); |
| 1594 | return ret; | 1594 | return ret; |
| 1595 | } | 1595 | } |
| 1596 | 1596 | ||
| 1597 | /* | 1597 | /* |
| 1598 | * Make sure there's writable room. Wait for room if we can, otherwise | 1598 | * Make sure there's writable room. Wait for room if we can, otherwise |
| 1599 | * return an appropriate error. | 1599 | * return an appropriate error. |
| 1600 | */ | 1600 | */ |
| 1601 | static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) | 1601 | static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) |
| 1602 | { | 1602 | { |
| 1603 | int ret; | 1603 | int ret; |
| 1604 | 1604 | ||
| 1605 | /* | 1605 | /* |
| 1606 | * Check ->nrbufs without the inode lock first. This function | 1606 | * Check ->nrbufs without the inode lock first. This function |
| 1607 | * is speculative anyway, so missing one is ok. | 1607 | * is speculative anyway, so missing one is ok. |
| 1608 | */ | 1608 | */ |
| 1609 | if (pipe->nrbufs < PIPE_BUFFERS) | 1609 | if (pipe->nrbufs < PIPE_BUFFERS) |
| 1610 | return 0; | 1610 | return 0; |
| 1611 | 1611 | ||
| 1612 | ret = 0; | 1612 | ret = 0; |
| 1613 | mutex_lock(&pipe->inode->i_mutex); | 1613 | mutex_lock(&pipe->inode->i_mutex); |
| 1614 | 1614 | ||
| 1615 | while (pipe->nrbufs >= PIPE_BUFFERS) { | 1615 | while (pipe->nrbufs >= PIPE_BUFFERS) { |
| 1616 | if (!pipe->readers) { | 1616 | if (!pipe->readers) { |
| 1617 | send_sig(SIGPIPE, current, 0); | 1617 | send_sig(SIGPIPE, current, 0); |
| 1618 | ret = -EPIPE; | 1618 | ret = -EPIPE; |
| 1619 | break; | 1619 | break; |
| 1620 | } | 1620 | } |
| 1621 | if (flags & SPLICE_F_NONBLOCK) { | 1621 | if (flags & SPLICE_F_NONBLOCK) { |
| 1622 | ret = -EAGAIN; | 1622 | ret = -EAGAIN; |
| 1623 | break; | 1623 | break; |
| 1624 | } | 1624 | } |
| 1625 | if (signal_pending(current)) { | 1625 | if (signal_pending(current)) { |
| 1626 | ret = -ERESTARTSYS; | 1626 | ret = -ERESTARTSYS; |
| 1627 | break; | 1627 | break; |
| 1628 | } | 1628 | } |
| 1629 | pipe->waiting_writers++; | 1629 | pipe->waiting_writers++; |
| 1630 | pipe_wait(pipe); | 1630 | pipe_wait(pipe); |
| 1631 | pipe->waiting_writers--; | 1631 | pipe->waiting_writers--; |
| 1632 | } | 1632 | } |
| 1633 | 1633 | ||
| 1634 | mutex_unlock(&pipe->inode->i_mutex); | 1634 | mutex_unlock(&pipe->inode->i_mutex); |
| 1635 | return ret; | 1635 | return ret; |
| 1636 | } | 1636 | } |
| 1637 | 1637 | ||
| 1638 | /* | 1638 | /* |
| 1639 | * Link contents of ipipe to opipe. | 1639 | * Link contents of ipipe to opipe. |
| 1640 | */ | 1640 | */ |
| 1641 | static int link_pipe(struct pipe_inode_info *ipipe, | 1641 | static int link_pipe(struct pipe_inode_info *ipipe, |
| 1642 | struct pipe_inode_info *opipe, | 1642 | struct pipe_inode_info *opipe, |
| 1643 | size_t len, unsigned int flags) | 1643 | size_t len, unsigned int flags) |
| 1644 | { | 1644 | { |
| 1645 | struct pipe_buffer *ibuf, *obuf; | 1645 | struct pipe_buffer *ibuf, *obuf; |
| 1646 | int ret = 0, i = 0, nbuf; | 1646 | int ret = 0, i = 0, nbuf; |
| 1647 | 1647 | ||
| 1648 | /* | 1648 | /* |
| 1649 | * Potential ABBA deadlock; work around it by ordering lock | 1649 | * Potential ABBA deadlock; work around it by ordering lock |
| 1650 | * acquisition by inode address. Otherwise two different processes | 1650 | * acquisition by inode address. Otherwise two different processes |
| 1651 | * could deadlock (one doing tee from A -> B, the other from B -> A). | 1651 | * could deadlock (one doing tee from A -> B, the other from B -> A). |
| 1652 | */ | 1652 | */ |
| 1653 | inode_double_lock(ipipe->inode, opipe->inode); | 1653 | inode_double_lock(ipipe->inode, opipe->inode); |
| 1654 | 1654 | ||
| 1655 | do { | 1655 | do { |
| 1656 | if (!opipe->readers) { | 1656 | if (!opipe->readers) { |
| 1657 | send_sig(SIGPIPE, current, 0); | 1657 | send_sig(SIGPIPE, current, 0); |
| 1658 | if (!ret) | 1658 | if (!ret) |
| 1659 | ret = -EPIPE; | 1659 | ret = -EPIPE; |
| 1660 | break; | 1660 | break; |
| 1661 | } | 1661 | } |
| 1662 | 1662 | ||
| 1663 | /* | 1663 | /* |
| 1664 | * If we have iterated all input buffers or run out of | 1664 | * If we have iterated all input buffers or run out of |
| 1665 | * output room, break. | 1665 | * output room, break. |
| 1666 | */ | 1666 | */ |
| 1667 | if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) | 1667 | if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) |
| 1668 | break; | 1668 | break; |
| 1669 | 1669 | ||
| 1670 | ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); | 1670 | ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); |
| 1671 | nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); | 1671 | nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); |
| 1672 | 1672 | ||
| 1673 | /* | 1673 | /* |
| 1674 | * Get a reference to this pipe buffer, | 1674 | * Get a reference to this pipe buffer, |
| 1675 | * so we can copy the contents over. | 1675 | * so we can copy the contents over. |
| 1676 | */ | 1676 | */ |
| 1677 | ibuf->ops->get(ipipe, ibuf); | 1677 | ibuf->ops->get(ipipe, ibuf); |
| 1678 | 1678 | ||
| 1679 | obuf = opipe->bufs + nbuf; | 1679 | obuf = opipe->bufs + nbuf; |
| 1680 | *obuf = *ibuf; | 1680 | *obuf = *ibuf; |
| 1681 | 1681 | ||
| 1682 | /* | 1682 | /* |
| 1683 | * Don't inherit the gift flag; we need to | 1683 | * Don't inherit the gift flag; we need to |
| 1684 | * prevent multiple steals of this page. | 1684 | * prevent multiple steals of this page. |
| 1685 | */ | 1685 | */ |
| 1686 | obuf->flags &= ~PIPE_BUF_FLAG_GIFT; | 1686 | obuf->flags &= ~PIPE_BUF_FLAG_GIFT; |
| 1687 | 1687 | ||
| 1688 | if (obuf->len > len) | 1688 | if (obuf->len > len) |
| 1689 | obuf->len = len; | 1689 | obuf->len = len; |
| 1690 | 1690 | ||
| 1691 | opipe->nrbufs++; | 1691 | opipe->nrbufs++; |
| 1692 | ret += obuf->len; | 1692 | ret += obuf->len; |
| 1693 | len -= obuf->len; | 1693 | len -= obuf->len; |
| 1694 | i++; | 1694 | i++; |
| 1695 | } while (len); | 1695 | } while (len); |
| 1696 | 1696 | ||
| 1697 | inode_double_unlock(ipipe->inode, opipe->inode); | 1697 | inode_double_unlock(ipipe->inode, opipe->inode); |
| 1698 | 1698 | ||
| 1699 | /* | 1699 | /* |
| 1700 | * If we put data in the output pipe, wake up any potential readers. | 1700 | * If we put data in the output pipe, wake up any potential readers. |
| 1701 | */ | 1701 | */ |
| 1702 | if (ret > 0) { | 1702 | if (ret > 0) { |
| 1703 | smp_mb(); | 1703 | smp_mb(); |
| 1704 | if (waitqueue_active(&opipe->wait)) | 1704 | if (waitqueue_active(&opipe->wait)) |
| 1705 | wake_up_interruptible(&opipe->wait); | 1705 | wake_up_interruptible(&opipe->wait); |
| 1706 | kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); | 1706 | kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); |
| 1707 | } | 1707 | } |
| 1708 | 1708 | ||
| 1709 | return ret; | 1709 | return ret; |
| 1710 | } | 1710 | } |
| 1711 | 1711 | ||
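The slot arithmetic in link_pipe() works because PIPE_BUFFERS is a power of two, so masking with (PIPE_BUFFERS - 1) wraps an index around the ring without a modulo. A small illustration with hypothetical values, not part of the patch:

    int curbuf = 14, i = 3;
    /* (14 + 3) & 15 == 1: the combined index wraps past bufs[15] */
    int slot = (curbuf + i) & (PIPE_BUFFERS - 1);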
| 1712 | /* | 1712 | /* |
| 1713 | * This is a tee(1) implementation that works on pipes. It doesn't copy | 1713 | * This is a tee(1) implementation that works on pipes. It doesn't copy |
| 1714 | * any data; it simply references the 'in' pages on the 'out' pipe. | 1714 | * any data; it simply references the 'in' pages on the 'out' pipe. |
| 1715 | * The 'flags' used are the SPLICE_F_* variants, currently the only | 1715 | * The 'flags' used are the SPLICE_F_* variants, currently the only |
| 1716 | * applicable one is SPLICE_F_NONBLOCK. | 1716 | * applicable one is SPLICE_F_NONBLOCK. |
| 1717 | */ | 1717 | */ |
| 1718 | static long do_tee(struct file *in, struct file *out, size_t len, | 1718 | static long do_tee(struct file *in, struct file *out, size_t len, |
| 1719 | unsigned int flags) | 1719 | unsigned int flags) |
| 1720 | { | 1720 | { |
| 1721 | struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); | 1721 | struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); |
| 1722 | struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); | 1722 | struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); |
| 1723 | int ret = -EINVAL; | 1723 | int ret = -EINVAL; |
| 1724 | 1724 | ||
| 1725 | /* | 1725 | /* |
| 1726 | * Duplicate the contents of ipipe to opipe without actually | 1726 | * Duplicate the contents of ipipe to opipe without actually |
| 1727 | * copying the data. | 1727 | * copying the data. |
| 1728 | */ | 1728 | */ |
| 1729 | if (ipipe && opipe && ipipe != opipe) { | 1729 | if (ipipe && opipe && ipipe != opipe) { |
| 1730 | /* | 1730 | /* |
| 1731 | * Keep going, unless we encounter an error. The ipipe/opipe | 1731 | * Keep going, unless we encounter an error. The ipipe/opipe |
| 1732 | * ordering doesn't really matter. | 1732 | * ordering doesn't really matter. |
| 1733 | */ | 1733 | */ |
| 1734 | ret = link_ipipe_prep(ipipe, flags); | 1734 | ret = link_ipipe_prep(ipipe, flags); |
| 1735 | if (!ret) { | 1735 | if (!ret) { |
| 1736 | ret = link_opipe_prep(opipe, flags); | 1736 | ret = link_opipe_prep(opipe, flags); |
| 1737 | if (!ret) { | 1737 | if (!ret) { |
| 1738 | ret = link_pipe(ipipe, opipe, len, flags); | 1738 | ret = link_pipe(ipipe, opipe, len, flags); |
| 1739 | if (!ret && (flags & SPLICE_F_NONBLOCK)) | 1739 | if (!ret && (flags & SPLICE_F_NONBLOCK)) |
| 1740 | ret = -EAGAIN; | 1740 | ret = -EAGAIN; |
| 1741 | } | 1741 | } |
| 1742 | } | 1742 | } |
| 1743 | } | 1743 | } |
| 1744 | 1744 | ||
| 1745 | return ret; | 1745 | return ret; |
| 1746 | } | 1746 | } |
| 1747 | 1747 | ||
| 1748 | asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) | 1748 | asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) |
| 1749 | { | 1749 | { |
| 1750 | struct file *in; | 1750 | struct file *in; |
| 1751 | int error, fput_in; | 1751 | int error, fput_in; |
| 1752 | 1752 | ||
| 1753 | if (unlikely(!len)) | 1753 | if (unlikely(!len)) |
| 1754 | return 0; | 1754 | return 0; |
| 1755 | 1755 | ||
| 1756 | error = -EBADF; | 1756 | error = -EBADF; |
| 1757 | in = fget_light(fdin, &fput_in); | 1757 | in = fget_light(fdin, &fput_in); |
| 1758 | if (in) { | 1758 | if (in) { |
| 1759 | if (in->f_mode & FMODE_READ) { | 1759 | if (in->f_mode & FMODE_READ) { |
| 1760 | int fput_out; | 1760 | int fput_out; |
| 1761 | struct file *out = fget_light(fdout, &fput_out); | 1761 | struct file *out = fget_light(fdout, &fput_out); |
| 1762 | 1762 | ||
| 1763 | if (out) { | 1763 | if (out) { |
| 1764 | if (out->f_mode & FMODE_WRITE) | 1764 | if (out->f_mode & FMODE_WRITE) |
| 1765 | error = do_tee(in, out, len, flags); | 1765 | error = do_tee(in, out, len, flags); |
| 1766 | fput_light(out, fput_out); | 1766 | fput_light(out, fput_out); |
| 1767 | } | 1767 | } |
| 1768 | } | 1768 | } |
| 1769 | fput_light(in, fput_in); | 1769 | fput_light(in, fput_in); |
| 1770 | } | 1770 | } |
| 1771 | 1771 | ||
| 1772 | return error; | 1772 | return error; |
| 1773 | } | 1773 | } |
| 1774 | 1774 |
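For context, a minimal userspace sketch of the syscall wired up above. It assumes stdin and stdout are both pipes and that the glibc tee() wrapper is available; the length is illustrative:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    /* Duplicate up to 64 KiB from the stdin pipe onto the stdout pipe
     * without copying any data. With SPLICE_F_NONBLOCK this can fail
     * with EAGAIN, matching the -EAGAIN paths in the kernel code above. */
    ssize_t n = tee(STDIN_FILENO, STDOUT_FILENO, 65536, SPLICE_F_NONBLOCK);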
include/linux/pipe_fs_i.h
| 1 | #ifndef _LINUX_PIPE_FS_I_H | 1 | #ifndef _LINUX_PIPE_FS_I_H |
| 2 | #define _LINUX_PIPE_FS_I_H | 2 | #define _LINUX_PIPE_FS_I_H |
| 3 | 3 | ||
| 4 | #define PIPEFS_MAGIC 0x50495045 | 4 | #define PIPEFS_MAGIC 0x50495045 |
| 5 | 5 | ||
| 6 | #define PIPE_BUFFERS (16) | 6 | #define PIPE_BUFFERS (16) |
| 7 | 7 | ||
| 8 | #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ | 8 | #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ |
| 9 | #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ | 9 | #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ |
| 10 | #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ | 10 | #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ |
| 11 | 11 | ||
| 12 | struct pipe_buffer { | 12 | struct pipe_buffer { |
| 13 | struct page *page; | 13 | struct page *page; |
| 14 | unsigned int offset, len; | 14 | unsigned int offset, len; |
| 15 | const struct pipe_buf_operations *ops; | 15 | const struct pipe_buf_operations *ops; |
| 16 | unsigned int flags; | 16 | unsigned int flags; |
| 17 | unsigned long private; | 17 | unsigned long private; |
| 18 | }; | 18 | }; |
| 19 | 19 | ||
| 20 | struct pipe_inode_info { | 20 | struct pipe_inode_info { |
| 21 | wait_queue_head_t wait; | 21 | wait_queue_head_t wait; |
| 22 | unsigned int nrbufs, curbuf; | 22 | unsigned int nrbufs, curbuf; |
| 23 | struct page *tmp_page; | 23 | struct page *tmp_page; |
| 24 | unsigned int readers; | 24 | unsigned int readers; |
| 25 | unsigned int writers; | 25 | unsigned int writers; |
| 26 | unsigned int waiting_writers; | 26 | unsigned int waiting_writers; |
| 27 | unsigned int r_counter; | 27 | unsigned int r_counter; |
| 28 | unsigned int w_counter; | 28 | unsigned int w_counter; |
| 29 | struct fasync_struct *fasync_readers; | 29 | struct fasync_struct *fasync_readers; |
| 30 | struct fasync_struct *fasync_writers; | 30 | struct fasync_struct *fasync_writers; |
| 31 | struct inode *inode; | 31 | struct inode *inode; |
| 32 | struct pipe_buffer bufs[PIPE_BUFFERS]; | 32 | struct pipe_buffer bufs[PIPE_BUFFERS]; |
| 33 | }; | 33 | }; |
| 34 | 34 | ||
| 35 | /* | 35 | /* |
| 36 | * Note on the nesting of these functions: | 36 | * Note on the nesting of these functions: |
| 37 | * | 37 | * |
| 38 | * ->pin() | 38 | * ->confirm() |
| 39 | * ->steal() | 39 | * ->steal() |
| 40 | * ... | 40 | * ... |
| 41 | * ->map() | 41 | * ->map() |
| 42 | * ... | 42 | * ... |
| 43 | * ->unmap() | 43 | * ->unmap() |
| 44 | * | 44 | * |
| 45 | * That is, ->map() must be called on a pinned buffer, same goes for ->steal(). | 45 | * That is, ->map() must be called on a confirmed buffer, |
| 46 | * same goes for ->steal(). | ||
| 46 | */ | 47 | */ |
| 47 | struct pipe_buf_operations { | 48 | struct pipe_buf_operations { |
| 48 | int can_merge; | 49 | int can_merge; |
| 49 | void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int); | 50 | void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int); |
| 50 | void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *); | 51 | void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *); |
| 51 | int (*pin)(struct pipe_inode_info *, struct pipe_buffer *); | 52 | int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *); |
| 52 | void (*release)(struct pipe_inode_info *, struct pipe_buffer *); | 53 | void (*release)(struct pipe_inode_info *, struct pipe_buffer *); |
| 53 | int (*steal)(struct pipe_inode_info *, struct pipe_buffer *); | 54 | int (*steal)(struct pipe_inode_info *, struct pipe_buffer *); |
| 54 | void (*get)(struct pipe_inode_info *, struct pipe_buffer *); | 55 | void (*get)(struct pipe_inode_info *, struct pipe_buffer *); |
| 55 | }; | 56 | }; |
| 56 | 57 | ||
| 57 | /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual | 58 | /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual |
| 58 | memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ | 59 | memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ |
| 59 | #define PIPE_SIZE PAGE_SIZE | 60 | #define PIPE_SIZE PAGE_SIZE |
| 60 | 61 | ||
| 61 | /* Drop the inode semaphore and wait for a pipe event, atomically */ | 62 | /* Drop the inode semaphore and wait for a pipe event, atomically */ |
| 62 | void pipe_wait(struct pipe_inode_info *pipe); | 63 | void pipe_wait(struct pipe_inode_info *pipe); |
| 63 | 64 | ||
| 64 | struct pipe_inode_info * alloc_pipe_info(struct inode * inode); | 65 | struct pipe_inode_info * alloc_pipe_info(struct inode * inode); |
| 65 | void free_pipe_info(struct inode * inode); | 66 | void free_pipe_info(struct inode * inode); |
| 66 | void __free_pipe_info(struct pipe_inode_info *); | 67 | void __free_pipe_info(struct pipe_inode_info *); |
| 67 | 68 | ||
| 68 | /* Generic pipe buffer ops functions */ | 69 | /* Generic pipe buffer ops functions */ |
| 69 | void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int); | 70 | void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int); |
| 70 | void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *); | 71 | void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *); |
| 71 | void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); | 72 | void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); |
| 72 | int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *); | 73 | int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); |
| 73 | int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); | 74 | int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); |
| 74 | 75 | ||
| 75 | #endif | 76 | #endif |
| 76 | 77 |
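To make the renamed hook concrete, here is a hypothetical consumer following the nesting rule documented above; consume_buf() is an invented name, not part of this patch:

    /* Sketch only: ->confirm() must succeed before ->map() or ->steal(). */
    static int consume_buf(struct pipe_inode_info *pipe,
                           struct pipe_buffer *buf)
    {
        void *addr;
        int error;

        error = buf->ops->confirm(pipe, buf);    /* formerly ->pin() */
        if (error)
            return error;        /* buffer or its contents aren't good */

        addr = buf->ops->map(pipe, buf, 0);      /* legal: buffer confirmed */
        /* ... read buf->len bytes starting at addr + buf->offset ... */
        buf->ops->unmap(pipe, buf, addr);
        return 0;
    }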
kernel/relay.c
| 1 | /* | 1 | /* |
| 2 | * Public API and common code for kernel->userspace relay file support. | 2 | * Public API and common code for kernel->userspace relay file support. |
| 3 | * | 3 | * |
| 4 | * See Documentation/filesystems/relayfs.txt for an overview of relayfs. | 4 | * See Documentation/filesystems/relayfs.txt for an overview of relayfs. |
| 5 | * | 5 | * |
| 6 | * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp | 6 | * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp |
| 7 | * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) | 7 | * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) |
| 8 | * | 8 | * |
| 9 | * Moved to kernel/relay.c by Paul Mundt, 2006. | 9 | * Moved to kernel/relay.c by Paul Mundt, 2006. |
| 10 | * November 2006 - CPU hotplug support by Mathieu Desnoyers | 10 | * November 2006 - CPU hotplug support by Mathieu Desnoyers |
| 11 | * (mathieu.desnoyers@polymtl.ca) | 11 | * (mathieu.desnoyers@polymtl.ca) |
| 12 | * | 12 | * |
| 13 | * This file is released under the GPL. | 13 | * This file is released under the GPL. |
| 14 | */ | 14 | */ |
| 15 | #include <linux/errno.h> | 15 | #include <linux/errno.h> |
| 16 | #include <linux/stddef.h> | 16 | #include <linux/stddef.h> |
| 17 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
| 18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
| 19 | #include <linux/string.h> | 19 | #include <linux/string.h> |
| 20 | #include <linux/relay.h> | 20 | #include <linux/relay.h> |
| 21 | #include <linux/vmalloc.h> | 21 | #include <linux/vmalloc.h> |
| 22 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
| 23 | #include <linux/cpu.h> | 23 | #include <linux/cpu.h> |
| 24 | #include <linux/splice.h> | 24 | #include <linux/splice.h> |
| 25 | 25 | ||
| 26 | /* list of open channels, for cpu hotplug */ | 26 | /* list of open channels, for cpu hotplug */ |
| 27 | static DEFINE_MUTEX(relay_channels_mutex); | 27 | static DEFINE_MUTEX(relay_channels_mutex); |
| 28 | static LIST_HEAD(relay_channels); | 28 | static LIST_HEAD(relay_channels); |
| 29 | 29 | ||
| 30 | /* | 30 | /* |
| 31 | * close() vm_op implementation for relay file mapping. | 31 | * close() vm_op implementation for relay file mapping. |
| 32 | */ | 32 | */ |
| 33 | static void relay_file_mmap_close(struct vm_area_struct *vma) | 33 | static void relay_file_mmap_close(struct vm_area_struct *vma) |
| 34 | { | 34 | { |
| 35 | struct rchan_buf *buf = vma->vm_private_data; | 35 | struct rchan_buf *buf = vma->vm_private_data; |
| 36 | buf->chan->cb->buf_unmapped(buf, vma->vm_file); | 36 | buf->chan->cb->buf_unmapped(buf, vma->vm_file); |
| 37 | } | 37 | } |
| 38 | 38 | ||
| 39 | /* | 39 | /* |
| 40 | * nopage() vm_op implementation for relay file mapping. | 40 | * nopage() vm_op implementation for relay file mapping. |
| 41 | */ | 41 | */ |
| 42 | static struct page *relay_buf_nopage(struct vm_area_struct *vma, | 42 | static struct page *relay_buf_nopage(struct vm_area_struct *vma, |
| 43 | unsigned long address, | 43 | unsigned long address, |
| 44 | int *type) | 44 | int *type) |
| 45 | { | 45 | { |
| 46 | struct page *page; | 46 | struct page *page; |
| 47 | struct rchan_buf *buf = vma->vm_private_data; | 47 | struct rchan_buf *buf = vma->vm_private_data; |
| 48 | unsigned long offset = address - vma->vm_start; | 48 | unsigned long offset = address - vma->vm_start; |
| 49 | 49 | ||
| 50 | if (address > vma->vm_end) | 50 | if (address > vma->vm_end) |
| 51 | return NOPAGE_SIGBUS; /* Disallow mremap */ | 51 | return NOPAGE_SIGBUS; /* Disallow mremap */ |
| 52 | if (!buf) | 52 | if (!buf) |
| 53 | return NOPAGE_OOM; | 53 | return NOPAGE_OOM; |
| 54 | 54 | ||
| 55 | page = vmalloc_to_page(buf->start + offset); | 55 | page = vmalloc_to_page(buf->start + offset); |
| 56 | if (!page) | 56 | if (!page) |
| 57 | return NOPAGE_OOM; | 57 | return NOPAGE_OOM; |
| 58 | get_page(page); | 58 | get_page(page); |
| 59 | 59 | ||
| 60 | if (type) | 60 | if (type) |
| 61 | *type = VM_FAULT_MINOR; | 61 | *type = VM_FAULT_MINOR; |
| 62 | 62 | ||
| 63 | return page; | 63 | return page; |
| 64 | } | 64 | } |
| 65 | 65 | ||
| 66 | /* | 66 | /* |
| 67 | * vm_ops for relay file mappings. | 67 | * vm_ops for relay file mappings. |
| 68 | */ | 68 | */ |
| 69 | static struct vm_operations_struct relay_file_mmap_ops = { | 69 | static struct vm_operations_struct relay_file_mmap_ops = { |
| 70 | .nopage = relay_buf_nopage, | 70 | .nopage = relay_buf_nopage, |
| 71 | .close = relay_file_mmap_close, | 71 | .close = relay_file_mmap_close, |
| 72 | }; | 72 | }; |
| 73 | 73 | ||
| 74 | /** | 74 | /** |
| 75 | * relay_mmap_buf: - mmap channel buffer to process address space | 75 | * relay_mmap_buf: - mmap channel buffer to process address space |
| 76 | * @buf: relay channel buffer | 76 | * @buf: relay channel buffer |
| 77 | * @vma: vm_area_struct describing memory to be mapped | 77 | * @vma: vm_area_struct describing memory to be mapped |
| 78 | * | 78 | * |
| 79 | * Returns 0 if ok, negative on error | 79 | * Returns 0 if ok, negative on error |
| 80 | * | 80 | * |
| 81 | * Caller should already have grabbed mmap_sem. | 81 | * Caller should already have grabbed mmap_sem. |
| 82 | */ | 82 | */ |
| 83 | int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) | 83 | int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) |
| 84 | { | 84 | { |
| 85 | unsigned long length = vma->vm_end - vma->vm_start; | 85 | unsigned long length = vma->vm_end - vma->vm_start; |
| 86 | struct file *filp = vma->vm_file; | 86 | struct file *filp = vma->vm_file; |
| 87 | 87 | ||
| 88 | if (!buf) | 88 | if (!buf) |
| 89 | return -EBADF; | 89 | return -EBADF; |
| 90 | 90 | ||
| 91 | if (length != (unsigned long)buf->chan->alloc_size) | 91 | if (length != (unsigned long)buf->chan->alloc_size) |
| 92 | return -EINVAL; | 92 | return -EINVAL; |
| 93 | 93 | ||
| 94 | vma->vm_ops = &relay_file_mmap_ops; | 94 | vma->vm_ops = &relay_file_mmap_ops; |
| 95 | vma->vm_private_data = buf; | 95 | vma->vm_private_data = buf; |
| 96 | buf->chan->cb->buf_mapped(buf, filp); | 96 | buf->chan->cb->buf_mapped(buf, filp); |
| 97 | 97 | ||
| 98 | return 0; | 98 | return 0; |
| 99 | } | 99 | } |
| 100 | 100 | ||
| 101 | /** | 101 | /** |
| 102 | * relay_alloc_buf - allocate a channel buffer | 102 | * relay_alloc_buf - allocate a channel buffer |
| 103 | * @buf: the buffer struct | 103 | * @buf: the buffer struct |
| 104 | * @size: total size of the buffer | 104 | * @size: total size of the buffer |
| 105 | * | 105 | * |
| 106 | * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The | 106 | * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The |
| 107 | * passed-in size will be page-aligned if it isn't already. | 107 | * passed-in size will be page-aligned if it isn't already. |
| 108 | */ | 108 | */ |
| 109 | static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) | 109 | static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) |
| 110 | { | 110 | { |
| 111 | void *mem; | 111 | void *mem; |
| 112 | unsigned int i, j, n_pages; | 112 | unsigned int i, j, n_pages; |
| 113 | 113 | ||
| 114 | *size = PAGE_ALIGN(*size); | 114 | *size = PAGE_ALIGN(*size); |
| 115 | n_pages = *size >> PAGE_SHIFT; | 115 | n_pages = *size >> PAGE_SHIFT; |
| 116 | 116 | ||
| 117 | buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); | 117 | buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); |
| 118 | if (!buf->page_array) | 118 | if (!buf->page_array) |
| 119 | return NULL; | 119 | return NULL; |
| 120 | 120 | ||
| 121 | for (i = 0; i < n_pages; i++) { | 121 | for (i = 0; i < n_pages; i++) { |
| 122 | buf->page_array[i] = alloc_page(GFP_KERNEL); | 122 | buf->page_array[i] = alloc_page(GFP_KERNEL); |
| 123 | if (unlikely(!buf->page_array[i])) | 123 | if (unlikely(!buf->page_array[i])) |
| 124 | goto depopulate; | 124 | goto depopulate; |
| 125 | set_page_private(buf->page_array[i], (unsigned long)buf); | 125 | set_page_private(buf->page_array[i], (unsigned long)buf); |
| 126 | } | 126 | } |
| 127 | mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL); | 127 | mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL); |
| 128 | if (!mem) | 128 | if (!mem) |
| 129 | goto depopulate; | 129 | goto depopulate; |
| 130 | 130 | ||
| 131 | memset(mem, 0, *size); | 131 | memset(mem, 0, *size); |
| 132 | buf->page_count = n_pages; | 132 | buf->page_count = n_pages; |
| 133 | return mem; | 133 | return mem; |
| 134 | 134 | ||
| 135 | depopulate: | 135 | depopulate: |
| 136 | for (j = 0; j < i; j++) | 136 | for (j = 0; j < i; j++) |
| 137 | __free_page(buf->page_array[j]); | 137 | __free_page(buf->page_array[j]); |
| 138 | kfree(buf->page_array); | 138 | kfree(buf->page_array); |
| 139 | return NULL; | 139 | return NULL; |
| 140 | } | 140 | } |
| 141 | 141 | ||
| 142 | /** | 142 | /** |
| 143 | * relay_create_buf - allocate and initialize a channel buffer | 143 | * relay_create_buf - allocate and initialize a channel buffer |
| 144 | * @chan: the relay channel | 144 | * @chan: the relay channel |
| 145 | * | 145 | * |
| 146 | * Returns channel buffer if successful, %NULL otherwise. | 146 | * Returns channel buffer if successful, %NULL otherwise. |
| 147 | */ | 147 | */ |
| 148 | struct rchan_buf *relay_create_buf(struct rchan *chan) | 148 | struct rchan_buf *relay_create_buf(struct rchan *chan) |
| 149 | { | 149 | { |
| 150 | struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); | 150 | struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); |
| 151 | if (!buf) | 151 | if (!buf) |
| 152 | return NULL; | 152 | return NULL; |
| 153 | 153 | ||
| 154 | buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); | 154 | buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); |
| 155 | if (!buf->padding) | 155 | if (!buf->padding) |
| 156 | goto free_buf; | 156 | goto free_buf; |
| 157 | 157 | ||
| 158 | buf->start = relay_alloc_buf(buf, &chan->alloc_size); | 158 | buf->start = relay_alloc_buf(buf, &chan->alloc_size); |
| 159 | if (!buf->start) | 159 | if (!buf->start) |
| 160 | goto free_buf; | 160 | goto free_buf; |
| 161 | 161 | ||
| 162 | buf->chan = chan; | 162 | buf->chan = chan; |
| 163 | kref_get(&buf->chan->kref); | 163 | kref_get(&buf->chan->kref); |
| 164 | return buf; | 164 | return buf; |
| 165 | 165 | ||
| 166 | free_buf: | 166 | free_buf: |
| 167 | kfree(buf->padding); | 167 | kfree(buf->padding); |
| 168 | kfree(buf); | 168 | kfree(buf); |
| 169 | return NULL; | 169 | return NULL; |
| 170 | } | 170 | } |
| 171 | 171 | ||
| 172 | /** | 172 | /** |
| 173 | * relay_destroy_channel - free the channel struct | 173 | * relay_destroy_channel - free the channel struct |
| 174 | * @kref: target kernel reference that contains the relay channel | 174 | * @kref: target kernel reference that contains the relay channel |
| 175 | * | 175 | * |
| 176 | * Should only be called from kref_put(). | 176 | * Should only be called from kref_put(). |
| 177 | */ | 177 | */ |
| 178 | void relay_destroy_channel(struct kref *kref) | 178 | void relay_destroy_channel(struct kref *kref) |
| 179 | { | 179 | { |
| 180 | struct rchan *chan = container_of(kref, struct rchan, kref); | 180 | struct rchan *chan = container_of(kref, struct rchan, kref); |
| 181 | kfree(chan); | 181 | kfree(chan); |
| 182 | } | 182 | } |
| 183 | 183 | ||
| 184 | /** | 184 | /** |
| 185 | * relay_destroy_buf - destroy an rchan_buf struct and associated buffer | 185 | * relay_destroy_buf - destroy an rchan_buf struct and associated buffer |
| 186 | * @buf: the buffer struct | 186 | * @buf: the buffer struct |
| 187 | */ | 187 | */ |
| 188 | void relay_destroy_buf(struct rchan_buf *buf) | 188 | void relay_destroy_buf(struct rchan_buf *buf) |
| 189 | { | 189 | { |
| 190 | struct rchan *chan = buf->chan; | 190 | struct rchan *chan = buf->chan; |
| 191 | unsigned int i; | 191 | unsigned int i; |
| 192 | 192 | ||
| 193 | if (likely(buf->start)) { | 193 | if (likely(buf->start)) { |
| 194 | vunmap(buf->start); | 194 | vunmap(buf->start); |
| 195 | for (i = 0; i < buf->page_count; i++) | 195 | for (i = 0; i < buf->page_count; i++) |
| 196 | __free_page(buf->page_array[i]); | 196 | __free_page(buf->page_array[i]); |
| 197 | kfree(buf->page_array); | 197 | kfree(buf->page_array); |
| 198 | } | 198 | } |
| 199 | chan->buf[buf->cpu] = NULL; | 199 | chan->buf[buf->cpu] = NULL; |
| 200 | kfree(buf->padding); | 200 | kfree(buf->padding); |
| 201 | kfree(buf); | 201 | kfree(buf); |
| 202 | kref_put(&chan->kref, relay_destroy_channel); | 202 | kref_put(&chan->kref, relay_destroy_channel); |
| 203 | } | 203 | } |
| 204 | 204 | ||
| 205 | /** | 205 | /** |
| 206 | * relay_remove_buf - remove a channel buffer | 206 | * relay_remove_buf - remove a channel buffer |
| 207 | * @kref: target kernel reference that contains the relay buffer | 207 | * @kref: target kernel reference that contains the relay buffer |
| 208 | * | 208 | * |
| 209 | * Removes the file from the filesystem, which also frees the | 209 | * Removes the file from the filesystem, which also frees the |
| 210 | * rchan_buf struct and the channel buffer. Should only be called from | 210 | * rchan_buf struct and the channel buffer. Should only be called from |
| 211 | * kref_put(). | 211 | * kref_put(). |
| 212 | */ | 212 | */ |
| 213 | void relay_remove_buf(struct kref *kref) | 213 | void relay_remove_buf(struct kref *kref) |
| 214 | { | 214 | { |
| 215 | struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); | 215 | struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); |
| 216 | buf->chan->cb->remove_buf_file(buf->dentry); | 216 | buf->chan->cb->remove_buf_file(buf->dentry); |
| 217 | relay_destroy_buf(buf); | 217 | relay_destroy_buf(buf); |
| 218 | } | 218 | } |
| 219 | 219 | ||
| 220 | /** | 220 | /** |
| 221 | * relay_buf_empty - boolean, is the channel buffer empty? | 221 | * relay_buf_empty - boolean, is the channel buffer empty? |
| 222 | * @buf: channel buffer | 222 | * @buf: channel buffer |
| 223 | * | 223 | * |
| 224 | * Returns 1 if the buffer is empty, 0 otherwise. | 224 | * Returns 1 if the buffer is empty, 0 otherwise. |
| 225 | */ | 225 | */ |
| 226 | int relay_buf_empty(struct rchan_buf *buf) | 226 | int relay_buf_empty(struct rchan_buf *buf) |
| 227 | { | 227 | { |
| 228 | return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; | 228 | return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; |
| 229 | } | 229 | } |
| 230 | EXPORT_SYMBOL_GPL(relay_buf_empty); | 230 | EXPORT_SYMBOL_GPL(relay_buf_empty); |
| 231 | 231 | ||
| 232 | /** | 232 | /** |
| 233 | * relay_buf_full - boolean, is the channel buffer full? | 233 | * relay_buf_full - boolean, is the channel buffer full? |
| 234 | * @buf: channel buffer | 234 | * @buf: channel buffer |
| 235 | * | 235 | * |
| 236 | * Returns 1 if the buffer is full, 0 otherwise. | 236 | * Returns 1 if the buffer is full, 0 otherwise. |
| 237 | */ | 237 | */ |
| 238 | int relay_buf_full(struct rchan_buf *buf) | 238 | int relay_buf_full(struct rchan_buf *buf) |
| 239 | { | 239 | { |
| 240 | size_t ready = buf->subbufs_produced - buf->subbufs_consumed; | 240 | size_t ready = buf->subbufs_produced - buf->subbufs_consumed; |
| 241 | return (ready >= buf->chan->n_subbufs) ? 1 : 0; | 241 | return (ready >= buf->chan->n_subbufs) ? 1 : 0; |
| 242 | } | 242 | } |
| 243 | EXPORT_SYMBOL_GPL(relay_buf_full); | 243 | EXPORT_SYMBOL_GPL(relay_buf_full); |
| 244 | 244 | ||
| 245 | /* | 245 | /* |
| 246 | * High-level relay kernel API and associated functions. | 246 | * High-level relay kernel API and associated functions. |
| 247 | */ | 247 | */ |
| 248 | 248 | ||
| 249 | /* | 249 | /* |
| 250 | * rchan_callback implementations defining default channel behavior. Used | 250 | * rchan_callback implementations defining default channel behavior. Used |
| 251 | * in place of corresponding NULL values in client callback struct. | 251 | * in place of corresponding NULL values in client callback struct. |
| 252 | */ | 252 | */ |
| 253 | 253 | ||
| 254 | /* | 254 | /* |
| 255 | * subbuf_start() default callback. Allows the switch unless the buffer is full. | 255 | * subbuf_start() default callback. Allows the switch unless the buffer is full. |
| 256 | */ | 256 | */ |
| 257 | static int subbuf_start_default_callback(struct rchan_buf *buf, | 257 | static int subbuf_start_default_callback(struct rchan_buf *buf, |
| 258 | void *subbuf, | 258 | void *subbuf, |
| 259 | void *prev_subbuf, | 259 | void *prev_subbuf, |
| 260 | size_t prev_padding) | 260 | size_t prev_padding) |
| 261 | { | 261 | { |
| 262 | if (relay_buf_full(buf)) | 262 | if (relay_buf_full(buf)) |
| 263 | return 0; | 263 | return 0; |
| 264 | 264 | ||
| 265 | return 1; | 265 | return 1; |
| 266 | } | 266 | } |
| 267 | 267 | ||
| 268 | /* | 268 | /* |
| 269 | * buf_mapped() default callback. Does nothing. | 269 | * buf_mapped() default callback. Does nothing. |
| 270 | */ | 270 | */ |
| 271 | static void buf_mapped_default_callback(struct rchan_buf *buf, | 271 | static void buf_mapped_default_callback(struct rchan_buf *buf, |
| 272 | struct file *filp) | 272 | struct file *filp) |
| 273 | { | 273 | { |
| 274 | } | 274 | } |
| 275 | 275 | ||
| 276 | /* | 276 | /* |
| 277 | * buf_unmapped() default callback. Does nothing. | 277 | * buf_unmapped() default callback. Does nothing. |
| 278 | */ | 278 | */ |
| 279 | static void buf_unmapped_default_callback(struct rchan_buf *buf, | 279 | static void buf_unmapped_default_callback(struct rchan_buf *buf, |
| 280 | struct file *filp) | 280 | struct file *filp) |
| 281 | { | 281 | { |
| 282 | } | 282 | } |
| 283 | 283 | ||
| 284 | /* | 284 | /* |
| 285 | * create_buf_file() default callback. Does nothing. | 285 | * create_buf_file() default callback. Does nothing. |
| 286 | */ | 286 | */ |
| 287 | static struct dentry *create_buf_file_default_callback(const char *filename, | 287 | static struct dentry *create_buf_file_default_callback(const char *filename, |
| 288 | struct dentry *parent, | 288 | struct dentry *parent, |
| 289 | int mode, | 289 | int mode, |
| 290 | struct rchan_buf *buf, | 290 | struct rchan_buf *buf, |
| 291 | int *is_global) | 291 | int *is_global) |
| 292 | { | 292 | { |
| 293 | return NULL; | 293 | return NULL; |
| 294 | } | 294 | } |
| 295 | 295 | ||
| 296 | /* | 296 | /* |
| 297 | * remove_buf_file() default callback. Does nothing. | 297 | * remove_buf_file() default callback. Does nothing. |
| 298 | */ | 298 | */ |
| 299 | static int remove_buf_file_default_callback(struct dentry *dentry) | 299 | static int remove_buf_file_default_callback(struct dentry *dentry) |
| 300 | { | 300 | { |
| 301 | return -EINVAL; | 301 | return -EINVAL; |
| 302 | } | 302 | } |
| 303 | 303 | ||
| 304 | /* relay channel default callbacks */ | 304 | /* relay channel default callbacks */ |
| 305 | static struct rchan_callbacks default_channel_callbacks = { | 305 | static struct rchan_callbacks default_channel_callbacks = { |
| 306 | .subbuf_start = subbuf_start_default_callback, | 306 | .subbuf_start = subbuf_start_default_callback, |
| 307 | .buf_mapped = buf_mapped_default_callback, | 307 | .buf_mapped = buf_mapped_default_callback, |
| 308 | .buf_unmapped = buf_unmapped_default_callback, | 308 | .buf_unmapped = buf_unmapped_default_callback, |
| 309 | .create_buf_file = create_buf_file_default_callback, | 309 | .create_buf_file = create_buf_file_default_callback, |
| 310 | .remove_buf_file = remove_buf_file_default_callback, | 310 | .remove_buf_file = remove_buf_file_default_callback, |
| 311 | }; | 311 | }; |
| 312 | 312 | ||
| 313 | /** | 313 | /** |
| 314 | * wakeup_readers - wake up readers waiting on a channel | 314 | * wakeup_readers - wake up readers waiting on a channel |
| 315 | * @data: contains the channel buffer | 315 | * @data: contains the channel buffer |
| 316 | * | 316 | * |
| 317 | * This is the timer function used to defer reader waking. | 317 | * This is the timer function used to defer reader waking. |
| 318 | */ | 318 | */ |
| 319 | static void wakeup_readers(unsigned long data) | 319 | static void wakeup_readers(unsigned long data) |
| 320 | { | 320 | { |
| 321 | struct rchan_buf *buf = (struct rchan_buf *)data; | 321 | struct rchan_buf *buf = (struct rchan_buf *)data; |
| 322 | wake_up_interruptible(&buf->read_wait); | 322 | wake_up_interruptible(&buf->read_wait); |
| 323 | } | 323 | } |
| 324 | 324 | ||
| 325 | /** | 325 | /** |
| 326 | * __relay_reset - reset a channel buffer | 326 | * __relay_reset - reset a channel buffer |
| 327 | * @buf: the channel buffer | 327 | * @buf: the channel buffer |
| 328 | * @init: 1 if this is a first-time initialization | 328 | * @init: 1 if this is a first-time initialization |
| 329 | * | 329 | * |
| 330 | * See relay_reset() for description of effect. | 330 | * See relay_reset() for description of effect. |
| 331 | */ | 331 | */ |
| 332 | static void __relay_reset(struct rchan_buf *buf, unsigned int init) | 332 | static void __relay_reset(struct rchan_buf *buf, unsigned int init) |
| 333 | { | 333 | { |
| 334 | size_t i; | 334 | size_t i; |
| 335 | 335 | ||
| 336 | if (init) { | 336 | if (init) { |
| 337 | init_waitqueue_head(&buf->read_wait); | 337 | init_waitqueue_head(&buf->read_wait); |
| 338 | kref_init(&buf->kref); | 338 | kref_init(&buf->kref); |
| 339 | setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); | 339 | setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); |
| 340 | } else | 340 | } else |
| 341 | del_timer_sync(&buf->timer); | 341 | del_timer_sync(&buf->timer); |
| 342 | 342 | ||
| 343 | buf->subbufs_produced = 0; | 343 | buf->subbufs_produced = 0; |
| 344 | buf->subbufs_consumed = 0; | 344 | buf->subbufs_consumed = 0; |
| 345 | buf->bytes_consumed = 0; | 345 | buf->bytes_consumed = 0; |
| 346 | buf->finalized = 0; | 346 | buf->finalized = 0; |
| 347 | buf->data = buf->start; | 347 | buf->data = buf->start; |
| 348 | buf->offset = 0; | 348 | buf->offset = 0; |
| 349 | 349 | ||
| 350 | for (i = 0; i < buf->chan->n_subbufs; i++) | 350 | for (i = 0; i < buf->chan->n_subbufs; i++) |
| 351 | buf->padding[i] = 0; | 351 | buf->padding[i] = 0; |
| 352 | 352 | ||
| 353 | buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0); | 353 | buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0); |
| 354 | } | 354 | } |
| 355 | 355 | ||
| 356 | /** | 356 | /** |
| 357 | * relay_reset - reset the channel | 357 | * relay_reset - reset the channel |
| 358 | * @chan: the channel | 358 | * @chan: the channel |
| 359 | * | 359 | * |
| 360 | * This has the effect of erasing all data from all channel buffers | 360 | * This has the effect of erasing all data from all channel buffers |
| 361 | * and restarting the channel in its initial state. The buffers | 361 | * and restarting the channel in its initial state. The buffers |
| 362 | * are not freed, so any mappings are still in effect. | 362 | * are not freed, so any mappings are still in effect. |
| 363 | * | 363 | * |
| 364 | * NOTE. Care should be taken that the channel isn't actually | 364 | * NOTE. Care should be taken that the channel isn't actually |
| 365 | * being used by anything when this call is made. | 365 | * being used by anything when this call is made. |
| 366 | */ | 366 | */ |
| 367 | void relay_reset(struct rchan *chan) | 367 | void relay_reset(struct rchan *chan) |
| 368 | { | 368 | { |
| 369 | unsigned int i; | 369 | unsigned int i; |
| 370 | 370 | ||
| 371 | if (!chan) | 371 | if (!chan) |
| 372 | return; | 372 | return; |
| 373 | 373 | ||
| 374 | if (chan->is_global && chan->buf[0]) { | 374 | if (chan->is_global && chan->buf[0]) { |
| 375 | __relay_reset(chan->buf[0], 0); | 375 | __relay_reset(chan->buf[0], 0); |
| 376 | return; | 376 | return; |
| 377 | } | 377 | } |
| 378 | 378 | ||
| 379 | mutex_lock(&relay_channels_mutex); | 379 | mutex_lock(&relay_channels_mutex); |
| 380 | for_each_online_cpu(i) | 380 | for_each_online_cpu(i) |
| 381 | if (chan->buf[i]) | 381 | if (chan->buf[i]) |
| 382 | __relay_reset(chan->buf[i], 0); | 382 | __relay_reset(chan->buf[i], 0); |
| 383 | mutex_unlock(&relay_channels_mutex); | 383 | mutex_unlock(&relay_channels_mutex); |
| 384 | } | 384 | } |
| 385 | EXPORT_SYMBOL_GPL(relay_reset); | 385 | EXPORT_SYMBOL_GPL(relay_reset); |
| 386 | 386 | ||
| 387 | /* | 387 | /* |
| 388 | * relay_open_buf - create a new relay channel buffer | 388 | * relay_open_buf - create a new relay channel buffer |
| 389 | * | 389 | * |
| 390 | * used by relay_open() and CPU hotplug. | 390 | * used by relay_open() and CPU hotplug. |
| 391 | */ | 391 | */ |
| 392 | static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) | 392 | static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) |
| 393 | { | 393 | { |
| 394 | struct rchan_buf *buf = NULL; | 394 | struct rchan_buf *buf = NULL; |
| 395 | struct dentry *dentry; | 395 | struct dentry *dentry; |
| 396 | char *tmpname; | 396 | char *tmpname; |
| 397 | 397 | ||
| 398 | if (chan->is_global) | 398 | if (chan->is_global) |
| 399 | return chan->buf[0]; | 399 | return chan->buf[0]; |
| 400 | 400 | ||
| 401 | tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); | 401 | tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); |
| 402 | if (!tmpname) | 402 | if (!tmpname) |
| 403 | goto end; | 403 | goto end; |
| 404 | snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); | 404 | snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); |
| 405 | 405 | ||
| 406 | buf = relay_create_buf(chan); | 406 | buf = relay_create_buf(chan); |
| 407 | if (!buf) | 407 | if (!buf) |
| 408 | goto free_name; | 408 | goto free_name; |
| 409 | 409 | ||
| 410 | buf->cpu = cpu; | 410 | buf->cpu = cpu; |
| 411 | __relay_reset(buf, 1); | 411 | __relay_reset(buf, 1); |
| 412 | 412 | ||
| 413 | /* Create file in fs */ | 413 | /* Create file in fs */ |
| 414 | dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR, | 414 | dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR, |
| 415 | buf, &chan->is_global); | 415 | buf, &chan->is_global); |
| 416 | if (!dentry) | 416 | if (!dentry) |
| 417 | goto free_buf; | 417 | goto free_buf; |
| 418 | 418 | ||
| 419 | buf->dentry = dentry; | 419 | buf->dentry = dentry; |
| 420 | 420 | ||
| 421 | if (chan->is_global) { | 421 | if (chan->is_global) { |
| 422 | chan->buf[0] = buf; | 422 | chan->buf[0] = buf; |
| 423 | buf->cpu = 0; | 423 | buf->cpu = 0; |
| 424 | } | 424 | } |
| 425 | 425 | ||
| 426 | goto free_name; | 426 | goto free_name; |
| 427 | 427 | ||
| 428 | free_buf: | 428 | free_buf: |
| 429 | relay_destroy_buf(buf); | 429 | relay_destroy_buf(buf); |
| 430 | free_name: | 430 | free_name: |
| 431 | kfree(tmpname); | 431 | kfree(tmpname); |
| 432 | end: | 432 | end: |
| 433 | return buf; | 433 | return buf; |
| 434 | } | 434 | } |
| 435 | 435 | ||
| 436 | /** | 436 | /** |
| 437 | * relay_close_buf - close a channel buffer | 437 | * relay_close_buf - close a channel buffer |
| 438 | * @buf: channel buffer | 438 | * @buf: channel buffer |
| 439 | * | 439 | * |
| 440 | * Marks the buffer finalized and restores the default callbacks. | 440 | * Marks the buffer finalized and restores the default callbacks. |
| 441 | * The channel buffer and channel buffer data structure are then freed | 441 | * The channel buffer and channel buffer data structure are then freed |
| 442 | * automatically when the last reference is given up. | 442 | * automatically when the last reference is given up. |
| 443 | */ | 443 | */ |
| 444 | static void relay_close_buf(struct rchan_buf *buf) | 444 | static void relay_close_buf(struct rchan_buf *buf) |
| 445 | { | 445 | { |
| 446 | buf->finalized = 1; | 446 | buf->finalized = 1; |
| 447 | del_timer_sync(&buf->timer); | 447 | del_timer_sync(&buf->timer); |
| 448 | kref_put(&buf->kref, relay_remove_buf); | 448 | kref_put(&buf->kref, relay_remove_buf); |
| 449 | } | 449 | } |
| 450 | 450 | ||
| 451 | static void setup_callbacks(struct rchan *chan, | 451 | static void setup_callbacks(struct rchan *chan, |
| 452 | struct rchan_callbacks *cb) | 452 | struct rchan_callbacks *cb) |
| 453 | { | 453 | { |
| 454 | if (!cb) { | 454 | if (!cb) { |
| 455 | chan->cb = &default_channel_callbacks; | 455 | chan->cb = &default_channel_callbacks; |
| 456 | return; | 456 | return; |
| 457 | } | 457 | } |
| 458 | 458 | ||
| 459 | if (!cb->subbuf_start) | 459 | if (!cb->subbuf_start) |
| 460 | cb->subbuf_start = subbuf_start_default_callback; | 460 | cb->subbuf_start = subbuf_start_default_callback; |
| 461 | if (!cb->buf_mapped) | 461 | if (!cb->buf_mapped) |
| 462 | cb->buf_mapped = buf_mapped_default_callback; | 462 | cb->buf_mapped = buf_mapped_default_callback; |
| 463 | if (!cb->buf_unmapped) | 463 | if (!cb->buf_unmapped) |
| 464 | cb->buf_unmapped = buf_unmapped_default_callback; | 464 | cb->buf_unmapped = buf_unmapped_default_callback; |
| 465 | if (!cb->create_buf_file) | 465 | if (!cb->create_buf_file) |
| 466 | cb->create_buf_file = create_buf_file_default_callback; | 466 | cb->create_buf_file = create_buf_file_default_callback; |
| 467 | if (!cb->remove_buf_file) | 467 | if (!cb->remove_buf_file) |
| 468 | cb->remove_buf_file = remove_buf_file_default_callback; | 468 | cb->remove_buf_file = remove_buf_file_default_callback; |
| 469 | chan->cb = cb; | 469 | chan->cb = cb; |
| 470 | } | 470 | } |
| 471 | 471 | ||
| 472 | /** | 472 | /** |
| 473 | * relay_hotcpu_callback - CPU hotplug callback | 473 | * relay_hotcpu_callback - CPU hotplug callback |
| 474 | * @nb: notifier block | 474 | * @nb: notifier block |
| 475 | * @action: hotplug action to take | 475 | * @action: hotplug action to take |
| 476 | * @hcpu: CPU number | 476 | * @hcpu: CPU number |
| 477 | * | 477 | * |
| 478 | * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) | 478 | * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) |
| 479 | */ | 479 | */ |
| 480 | static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, | 480 | static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, |
| 481 | unsigned long action, | 481 | unsigned long action, |
| 482 | void *hcpu) | 482 | void *hcpu) |
| 483 | { | 483 | { |
| 484 | unsigned int hotcpu = (unsigned long)hcpu; | 484 | unsigned int hotcpu = (unsigned long)hcpu; |
| 485 | struct rchan *chan; | 485 | struct rchan *chan; |
| 486 | 486 | ||
| 487 | switch (action) { | 487 | switch (action) { |
| 488 | case CPU_UP_PREPARE: | 488 | case CPU_UP_PREPARE: |
| 489 | case CPU_UP_PREPARE_FROZEN: | 489 | case CPU_UP_PREPARE_FROZEN: |
| 490 | mutex_lock(&relay_channels_mutex); | 490 | mutex_lock(&relay_channels_mutex); |
| 491 | list_for_each_entry(chan, &relay_channels, list) { | 491 | list_for_each_entry(chan, &relay_channels, list) { |
| 492 | if (chan->buf[hotcpu]) | 492 | if (chan->buf[hotcpu]) |
| 493 | continue; | 493 | continue; |
| 494 | chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); | 494 | chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); |
| 495 | if (!chan->buf[hotcpu]) { | 495 | if (!chan->buf[hotcpu]) { |
| 496 | printk(KERN_ERR | 496 | printk(KERN_ERR |
| 497 | "relay_hotcpu_callback: cpu %d buffer " | 497 | "relay_hotcpu_callback: cpu %d buffer " |
| 498 | "creation failed\n", hotcpu); | 498 | "creation failed\n", hotcpu); |
| 499 | mutex_unlock(&relay_channels_mutex); | 499 | mutex_unlock(&relay_channels_mutex); |
| 500 | return NOTIFY_BAD; | 500 | return NOTIFY_BAD; |
| 501 | } | 501 | } |
| 502 | } | 502 | } |
| 503 | mutex_unlock(&relay_channels_mutex); | 503 | mutex_unlock(&relay_channels_mutex); |
| 504 | break; | 504 | break; |
| 505 | case CPU_DEAD: | 505 | case CPU_DEAD: |
| 506 | case CPU_DEAD_FROZEN: | 506 | case CPU_DEAD_FROZEN: |
| 507 | /* No need to flush the cpu: it will be flushed upon | 507 | /* No need to flush the cpu: it will be flushed upon |
| 508 | * the final relay_flush() call. */ | 508 | * the final relay_flush() call. */ |
| 509 | break; | 509 | break; |
| 510 | } | 510 | } |
| 511 | return NOTIFY_OK; | 511 | return NOTIFY_OK; |
| 512 | } | 512 | } |
| 513 | 513 | ||
| 514 | /** | 514 | /** |
| 515 | * relay_open - create a new relay channel | 515 | * relay_open - create a new relay channel |
| 516 | * @base_filename: base name of files to create | 516 | * @base_filename: base name of files to create |
| 517 | * @parent: dentry of parent directory, %NULL for root directory | 517 | * @parent: dentry of parent directory, %NULL for root directory |
| 518 | * @subbuf_size: size of sub-buffers | 518 | * @subbuf_size: size of sub-buffers |
| 519 | * @n_subbufs: number of sub-buffers | 519 | * @n_subbufs: number of sub-buffers |
| 520 | * @cb: client callback functions | 520 | * @cb: client callback functions |
| 521 | * @private_data: user-defined data | 521 | * @private_data: user-defined data |
| 522 | * | 522 | * |
| 523 | * Returns channel pointer if successful, %NULL otherwise. | 523 | * Returns channel pointer if successful, %NULL otherwise. |
| 524 | * | 524 | * |
| 525 | * Creates a channel buffer for each cpu using the sizes and | 525 | * Creates a channel buffer for each cpu using the sizes and |
| 526 | * attributes specified. The created channel buffer files | 526 | * attributes specified. The created channel buffer files |
| 527 | * will be named base_filename0...base_filenameN-1. File | 527 | * will be named base_filename0...base_filenameN-1. File |
| 528 | * permissions will be %S_IRUSR. | 528 | * permissions will be %S_IRUSR. |
| 529 | */ | 529 | */ |
| 530 | struct rchan *relay_open(const char *base_filename, | 530 | struct rchan *relay_open(const char *base_filename, |
| 531 | struct dentry *parent, | 531 | struct dentry *parent, |
| 532 | size_t subbuf_size, | 532 | size_t subbuf_size, |
| 533 | size_t n_subbufs, | 533 | size_t n_subbufs, |
| 534 | struct rchan_callbacks *cb, | 534 | struct rchan_callbacks *cb, |
| 535 | void *private_data) | 535 | void *private_data) |
| 536 | { | 536 | { |
| 537 | unsigned int i; | 537 | unsigned int i; |
| 538 | struct rchan *chan; | 538 | struct rchan *chan; |
| 539 | if (!base_filename) | 539 | if (!base_filename) |
| 540 | return NULL; | 540 | return NULL; |
| 541 | 541 | ||
| 542 | if (!(subbuf_size && n_subbufs)) | 542 | if (!(subbuf_size && n_subbufs)) |
| 543 | return NULL; | 543 | return NULL; |
| 544 | 544 | ||
| 545 | chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); | 545 | chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); |
| 546 | if (!chan) | 546 | if (!chan) |
| 547 | return NULL; | 547 | return NULL; |
| 548 | 548 | ||
| 549 | chan->version = RELAYFS_CHANNEL_VERSION; | 549 | chan->version = RELAYFS_CHANNEL_VERSION; |
| 550 | chan->n_subbufs = n_subbufs; | 550 | chan->n_subbufs = n_subbufs; |
| 551 | chan->subbuf_size = subbuf_size; | 551 | chan->subbuf_size = subbuf_size; |
| 552 | chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); | 552 | chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); |
| 553 | chan->parent = parent; | 553 | chan->parent = parent; |
| 554 | chan->private_data = private_data; | 554 | chan->private_data = private_data; |
| 555 | strlcpy(chan->base_filename, base_filename, NAME_MAX); | 555 | strlcpy(chan->base_filename, base_filename, NAME_MAX); |
| 556 | setup_callbacks(chan, cb); | 556 | setup_callbacks(chan, cb); |
| 557 | kref_init(&chan->kref); | 557 | kref_init(&chan->kref); |
| 558 | 558 | ||
| 559 | mutex_lock(&relay_channels_mutex); | 559 | mutex_lock(&relay_channels_mutex); |
| 560 | for_each_online_cpu(i) { | 560 | for_each_online_cpu(i) { |
| 561 | chan->buf[i] = relay_open_buf(chan, i); | 561 | chan->buf[i] = relay_open_buf(chan, i); |
| 562 | if (!chan->buf[i]) | 562 | if (!chan->buf[i]) |
| 563 | goto free_bufs; | 563 | goto free_bufs; |
| 564 | } | 564 | } |
| 565 | list_add(&chan->list, &relay_channels); | 565 | list_add(&chan->list, &relay_channels); |
| 566 | mutex_unlock(&relay_channels_mutex); | 566 | mutex_unlock(&relay_channels_mutex); |
| 567 | 567 | ||
| 568 | return chan; | 568 | return chan; |
| 569 | 569 | ||
| 570 | free_bufs: | 570 | free_bufs: |
| 571 | for_each_online_cpu(i) { | 571 | for_each_online_cpu(i) { |
| 572 | if (!chan->buf[i]) | 572 | if (!chan->buf[i]) |
| 573 | break; | 573 | break; |
| 574 | relay_close_buf(chan->buf[i]); | 574 | relay_close_buf(chan->buf[i]); |
| 575 | } | 575 | } |
| 576 | 576 | ||
| 577 | kref_put(&chan->kref, relay_destroy_channel); | 577 | kref_put(&chan->kref, relay_destroy_channel); |
| 578 | mutex_unlock(&relay_channels_mutex); | 578 | mutex_unlock(&relay_channels_mutex); |
| 579 | return NULL; | 579 | return NULL; |
| 580 | } | 580 | } |
| 581 | EXPORT_SYMBOL_GPL(relay_open); | 581 | EXPORT_SYMBOL_GPL(relay_open); |
| 582 | 582 | ||
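As a usage note, a hypothetical relay_open() client. The defaults above return NULL from create_buf_file(), which makes relay_open_buf() fail, so a real client must supply that callback; the debugfs hookup and all names here are illustrative:

    #include <linux/relay.h>
    #include <linux/debugfs.h>

    /* Back each per-cpu channel buffer with a debugfs file using the
     * stock relay file operations. */
    static struct dentry *my_create_buf_file(const char *filename,
                                             struct dentry *parent, int mode,
                                             struct rchan_buf *buf,
                                             int *is_global)
    {
        return debugfs_create_file(filename, mode, parent, buf,
                                   &relay_file_operations);
    }

    static int my_remove_buf_file(struct dentry *dentry)
    {
        debugfs_remove(dentry);
        return 0;
    }

    static struct rchan_callbacks my_callbacks = {
        .create_buf_file = my_create_buf_file,
        .remove_buf_file = my_remove_buf_file,
    };

    /* Four 16 KiB sub-buffers per cpu; callbacks left NULL fall back to
     * the defaults installed by setup_callbacks() above. */
    struct rchan *chan = relay_open("trace", parent_dir, 16 * 1024, 4,
                                    &my_callbacks, NULL);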
| 583 | /** | 583 | /** |
| 584 | * relay_switch_subbuf - switch to a new sub-buffer | 584 | * relay_switch_subbuf - switch to a new sub-buffer |
| 585 | * @buf: channel buffer | 585 | * @buf: channel buffer |
| 586 | * @length: size of current event | 586 | * @length: size of current event |
| 587 | * | 587 | * |
| 588 | * Returns either the length passed in or 0 if full. | 588 | * Returns either the length passed in or 0 if full. |
| 589 | * | 589 | * |
| 590 | * Performs sub-buffer-switch tasks such as invoking callbacks, | 590 | * Performs sub-buffer-switch tasks such as invoking callbacks, |
| 591 | * updating padding counts, waking up readers, etc. | 591 | * updating padding counts, waking up readers, etc. |
| 592 | */ | 592 | */ |
| 593 | size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) | 593 | size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) |
| 594 | { | 594 | { |
| 595 | void *old, *new; | 595 | void *old, *new; |
| 596 | size_t old_subbuf, new_subbuf; | 596 | size_t old_subbuf, new_subbuf; |
| 597 | 597 | ||
| 598 | if (unlikely(length > buf->chan->subbuf_size)) | 598 | if (unlikely(length > buf->chan->subbuf_size)) |
| 599 | goto toobig; | 599 | goto toobig; |
| 600 | 600 | ||
| 601 | if (buf->offset != buf->chan->subbuf_size + 1) { | 601 | if (buf->offset != buf->chan->subbuf_size + 1) { |
| 602 | buf->prev_padding = buf->chan->subbuf_size - buf->offset; | 602 | buf->prev_padding = buf->chan->subbuf_size - buf->offset; |
| 603 | old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; | 603 | old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; |
| 604 | buf->padding[old_subbuf] = buf->prev_padding; | 604 | buf->padding[old_subbuf] = buf->prev_padding; |
| 605 | buf->subbufs_produced++; | 605 | buf->subbufs_produced++; |
| 606 | buf->dentry->d_inode->i_size += buf->chan->subbuf_size - | 606 | buf->dentry->d_inode->i_size += buf->chan->subbuf_size - |
| 607 | buf->padding[old_subbuf]; | 607 | buf->padding[old_subbuf]; |
| 608 | smp_mb(); | 608 | smp_mb(); |
| 609 | if (waitqueue_active(&buf->read_wait)) | 609 | if (waitqueue_active(&buf->read_wait)) |
| 610 | /* | 610 | /* |
| 611 | * Calling wake_up_interruptible() from here | 611 | * Calling wake_up_interruptible() from here |
| 612 | * will deadlock if we happen to be logging | 612 | * will deadlock if we happen to be logging |
| 613 | * from the scheduler (trying to re-grab | 613 | * from the scheduler (trying to re-grab |
| 614 | * rq->lock), so defer it. | 614 | * rq->lock), so defer it. |
| 615 | */ | 615 | */ |
| 616 | __mod_timer(&buf->timer, jiffies + 1); | 616 | __mod_timer(&buf->timer, jiffies + 1); |
| 617 | } | 617 | } |
| 618 | 618 | ||
| 619 | old = buf->data; | 619 | old = buf->data; |
| 620 | new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; | 620 | new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; |
| 621 | new = buf->start + new_subbuf * buf->chan->subbuf_size; | 621 | new = buf->start + new_subbuf * buf->chan->subbuf_size; |
| 622 | buf->offset = 0; | 622 | buf->offset = 0; |
| 623 | if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) { | 623 | if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) { |
| 624 | buf->offset = buf->chan->subbuf_size + 1; | 624 | buf->offset = buf->chan->subbuf_size + 1; |
| 625 | return 0; | 625 | return 0; |
| 626 | } | 626 | } |
| 627 | buf->data = new; | 627 | buf->data = new; |
| 628 | buf->padding[new_subbuf] = 0; | 628 | buf->padding[new_subbuf] = 0; |
| 629 | 629 | ||
| 630 | if (unlikely(length + buf->offset > buf->chan->subbuf_size)) | 630 | if (unlikely(length + buf->offset > buf->chan->subbuf_size)) |
| 631 | goto toobig; | 631 | goto toobig; |
| 632 | 632 | ||
| 633 | return length; | 633 | return length; |
| 634 | 634 | ||
| 635 | toobig: | 635 | toobig: |
| 636 | buf->chan->last_toobig = length; | 636 | buf->chan->last_toobig = length; |
| 637 | return 0; | 637 | return 0; |
| 638 | } | 638 | } |
| 639 | EXPORT_SYMBOL_GPL(relay_switch_subbuf); | 639 | EXPORT_SYMBOL_GPL(relay_switch_subbuf); |
| 640 | 640 | ||
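A worked example of the padding bookkeeping in relay_switch_subbuf(), with hypothetical numbers:

    size_t subbuf_size = 4096, offset = 3000;    /* state at switch time */
    size_t prev_padding = subbuf_size - offset;  /* 1096 bytes of padding */
    /* i_size then grows by subbuf_size - prev_padding == 3000, i.e. only
     * by the bytes actually logged into the old sub-buffer. */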
| 641 | /** | 641 | /** |
| 642 | * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count | 642 | * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count |
| 643 | * @chan: the channel | 643 | * @chan: the channel |
| 644 | * @cpu: the cpu associated with the channel buffer to update | 644 | * @cpu: the cpu associated with the channel buffer to update |
| 645 | * @subbufs_consumed: number of sub-buffers to add to current buf's count | 645 | * @subbufs_consumed: number of sub-buffers to add to current buf's count |
| 646 | * | 646 | * |
| 647 | * Adds to the channel buffer's consumed sub-buffer count. | 647 | * Adds to the channel buffer's consumed sub-buffer count. |
| 648 | * subbufs_consumed should be the number of sub-buffers newly consumed, | 648 | * subbufs_consumed should be the number of sub-buffers newly consumed, |
| 649 | * not the total consumed. | 649 | * not the total consumed. |
| 650 | * | 650 | * |
| 651 | * NOTE. Kernel clients don't need to call this function if the channel | 651 | * NOTE. Kernel clients don't need to call this function if the channel |
| 652 | * mode is 'overwrite'. | 652 | * mode is 'overwrite'. |
| 653 | */ | 653 | */ |
| 654 | void relay_subbufs_consumed(struct rchan *chan, | 654 | void relay_subbufs_consumed(struct rchan *chan, |
| 655 | unsigned int cpu, | 655 | unsigned int cpu, |
| 656 | size_t subbufs_consumed) | 656 | size_t subbufs_consumed) |
| 657 | { | 657 | { |
| 658 | struct rchan_buf *buf; | 658 | struct rchan_buf *buf; |
| 659 | 659 | ||
| 660 | if (!chan) | 660 | if (!chan) |
| 661 | return; | 661 | return; |
| 662 | 662 | ||
| 663 | if (cpu >= NR_CPUS || !chan->buf[cpu]) | 663 | if (cpu >= NR_CPUS || !chan->buf[cpu]) |
| 664 | return; | 664 | return; |
| 665 | 665 | ||
| 666 | buf = chan->buf[cpu]; | 666 | buf = chan->buf[cpu]; |
| 667 | buf->subbufs_consumed += subbufs_consumed; | 667 | buf->subbufs_consumed += subbufs_consumed; |
| 668 | if (buf->subbufs_consumed > buf->subbufs_produced) | 668 | if (buf->subbufs_consumed > buf->subbufs_produced) |
| 669 | buf->subbufs_consumed = buf->subbufs_produced; | 669 | buf->subbufs_consumed = buf->subbufs_produced; |
| 670 | } | 670 | } |
| 671 | EXPORT_SYMBOL_GPL(relay_subbufs_consumed); | 671 | EXPORT_SYMBOL_GPL(relay_subbufs_consumed); |
| 672 | 672 | ||
| 673 | /** | 673 | /** |
| 674 | * relay_close - close the channel | 674 | * relay_close - close the channel |
| 675 | * @chan: the channel | 675 | * @chan: the channel |
| 676 | * | 676 | * |
| 677 | * Closes all channel buffers and frees the channel. | 677 | * Closes all channel buffers and frees the channel. |
| 678 | */ | 678 | */ |
| 679 | void relay_close(struct rchan *chan) | 679 | void relay_close(struct rchan *chan) |
| 680 | { | 680 | { |
| 681 | unsigned int i; | 681 | unsigned int i; |
| 682 | 682 | ||
| 683 | if (!chan) | 683 | if (!chan) |
| 684 | return; | 684 | return; |
| 685 | 685 | ||
| 686 | mutex_lock(&relay_channels_mutex); | 686 | mutex_lock(&relay_channels_mutex); |
| 687 | if (chan->is_global && chan->buf[0]) | 687 | if (chan->is_global && chan->buf[0]) |
| 688 | relay_close_buf(chan->buf[0]); | 688 | relay_close_buf(chan->buf[0]); |
| 689 | else | 689 | else |
| 690 | for_each_possible_cpu(i) | 690 | for_each_possible_cpu(i) |
| 691 | if (chan->buf[i]) | 691 | if (chan->buf[i]) |
| 692 | relay_close_buf(chan->buf[i]); | 692 | relay_close_buf(chan->buf[i]); |
| 693 | 693 | ||
| 694 | if (chan->last_toobig) | 694 | if (chan->last_toobig) |
| 695 | printk(KERN_WARNING "relay: one or more items not logged " | 695 | printk(KERN_WARNING "relay: one or more items not logged " |
| 696 | "[item size (%Zd) > sub-buffer size (%Zd)]\n", | 696 | "[item size (%Zd) > sub-buffer size (%Zd)]\n", |
| 697 | chan->last_toobig, chan->subbuf_size); | 697 | chan->last_toobig, chan->subbuf_size); |
| 698 | 698 | ||
| 699 | list_del(&chan->list); | 699 | list_del(&chan->list); |
| 700 | kref_put(&chan->kref, relay_destroy_channel); | 700 | kref_put(&chan->kref, relay_destroy_channel); |
| 701 | mutex_unlock(&relay_channels_mutex); | 701 | mutex_unlock(&relay_channels_mutex); |
| 702 | } | 702 | } |
| 703 | EXPORT_SYMBOL_GPL(relay_close); | 703 | EXPORT_SYMBOL_GPL(relay_close); |
| 704 | 704 | ||
| 705 | /** | 705 | /** |
| 706 | * relay_flush - flush the channel | 706 | * relay_flush - flush the channel |
| 707 | * @chan: the channel | 707 | * @chan: the channel |
| 708 | * | 708 | * |
| 709 | * Flushes all channel buffers, i.e. forces a sub-buffer switch. | 709 | * Flushes all channel buffers, i.e. forces a sub-buffer switch. |
| 710 | */ | 710 | */ |
| 711 | void relay_flush(struct rchan *chan) | 711 | void relay_flush(struct rchan *chan) |
| 712 | { | 712 | { |
| 713 | unsigned int i; | 713 | unsigned int i; |
| 714 | 714 | ||
| 715 | if (!chan) | 715 | if (!chan) |
| 716 | return; | 716 | return; |
| 717 | 717 | ||
| 718 | if (chan->is_global && chan->buf[0]) { | 718 | if (chan->is_global && chan->buf[0]) { |
| 719 | relay_switch_subbuf(chan->buf[0], 0); | 719 | relay_switch_subbuf(chan->buf[0], 0); |
| 720 | return; | 720 | return; |
| 721 | } | 721 | } |
| 722 | 722 | ||
| 723 | mutex_lock(&relay_channels_mutex); | 723 | mutex_lock(&relay_channels_mutex); |
| 724 | for_each_possible_cpu(i) | 724 | for_each_possible_cpu(i) |
| 725 | if (chan->buf[i]) | 725 | if (chan->buf[i]) |
| 726 | relay_switch_subbuf(chan->buf[i], 0); | 726 | relay_switch_subbuf(chan->buf[i], 0); |
| 727 | mutex_unlock(&relay_channels_mutex); | 727 | mutex_unlock(&relay_channels_mutex); |
| 728 | } | 728 | } |
| 729 | EXPORT_SYMBOL_GPL(relay_flush); | 729 | EXPORT_SYMBOL_GPL(relay_flush); |
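Taken together with relay_close() above, the usual kernel-side teardown order looks like the following sketch (chan is assumed to come from an earlier relay_open()):

/*
 * Hedged teardown sketch: flush first so a partially filled
 * sub-buffer becomes visible to readers, then close, which drops
 * the channel reference and frees it once all files are released.
 */
static void my_client_shutdown(struct rchan *chan)
{
        relay_flush(chan);
        relay_close(chan);
}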
| 730 | 730 | ||
| 731 | /** | 731 | /** |
| 732 | * relay_file_open - open file op for relay files | 732 | * relay_file_open - open file op for relay files |
| 733 | * @inode: the inode | 733 | * @inode: the inode |
| 734 | * @filp: the file | 734 | * @filp: the file |
| 735 | * | 735 | * |
| 736 | * Increments the channel buffer refcount. | 736 | * Increments the channel buffer refcount. |
| 737 | */ | 737 | */ |
| 738 | static int relay_file_open(struct inode *inode, struct file *filp) | 738 | static int relay_file_open(struct inode *inode, struct file *filp) |
| 739 | { | 739 | { |
| 740 | struct rchan_buf *buf = inode->i_private; | 740 | struct rchan_buf *buf = inode->i_private; |
| 741 | kref_get(&buf->kref); | 741 | kref_get(&buf->kref); |
| 742 | filp->private_data = buf; | 742 | filp->private_data = buf; |
| 743 | 743 | ||
| 744 | return 0; | 744 | return 0; |
| 745 | } | 745 | } |
| 746 | 746 | ||
| 747 | /** | 747 | /** |
| 748 | * relay_file_mmap - mmap file op for relay files | 748 | * relay_file_mmap - mmap file op for relay files |
| 749 | * @filp: the file | 749 | * @filp: the file |
| 750 | * @vma: the vma describing what to map | 750 | * @vma: the vma describing what to map |
| 751 | * | 751 | * |
| 752 | * Calls upon relay_mmap_buf() to map the file into user space. | 752 | * Calls upon relay_mmap_buf() to map the file into user space. |
| 753 | */ | 753 | */ |
| 754 | static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) | 754 | static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) |
| 755 | { | 755 | { |
| 756 | struct rchan_buf *buf = filp->private_data; | 756 | struct rchan_buf *buf = filp->private_data; |
| 757 | return relay_mmap_buf(buf, vma); | 757 | return relay_mmap_buf(buf, vma); |
| 758 | } | 758 | } |
| 759 | 759 | ||
| 760 | /** | 760 | /** |
| 761 | * relay_file_poll - poll file op for relay files | 761 | * relay_file_poll - poll file op for relay files |
| 762 | * @filp: the file | 762 | * @filp: the file |
| 763 | * @wait: poll table | 763 | * @wait: poll table |
| 764 | * | 764 | * |
| 765 | * Poll implementation. | 765 | * Poll implementation. |
| 766 | */ | 766 | */ |
| 767 | static unsigned int relay_file_poll(struct file *filp, poll_table *wait) | 767 | static unsigned int relay_file_poll(struct file *filp, poll_table *wait) |
| 768 | { | 768 | { |
| 769 | unsigned int mask = 0; | 769 | unsigned int mask = 0; |
| 770 | struct rchan_buf *buf = filp->private_data; | 770 | struct rchan_buf *buf = filp->private_data; |
| 771 | 771 | ||
| 772 | if (buf->finalized) | 772 | if (buf->finalized) |
| 773 | return POLLERR; | 773 | return POLLERR; |
| 774 | 774 | ||
| 775 | if (filp->f_mode & FMODE_READ) { | 775 | if (filp->f_mode & FMODE_READ) { |
| 776 | poll_wait(filp, &buf->read_wait, wait); | 776 | poll_wait(filp, &buf->read_wait, wait); |
| 777 | if (!relay_buf_empty(buf)) | 777 | if (!relay_buf_empty(buf)) |
| 778 | mask |= POLLIN | POLLRDNORM; | 778 | mask |= POLLIN | POLLRDNORM; |
| 779 | } | 779 | } |
| 780 | 780 | ||
| 781 | return mask; | 781 | return mask; |
| 782 | } | 782 | } |
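From user space, the poll op above means a relay file can be waited on like any other readable fd. A self-contained sketch; the file path is hypothetical, since relay files are typically created under debugfs by the kernel client:

#include <sys/types.h>
#include <poll.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        struct pollfd pfd;

        pfd.fd = open("/sys/kernel/debug/mychan0", O_RDONLY); /* hypothetical path */
        if (pfd.fd < 0)
                return 1;
        pfd.events = POLLIN;

        /* POLLIN is reported once the channel buffer is non-empty */
        while (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN)) {
                ssize_t n = read(pfd.fd, buf, sizeof(buf));
                if (n <= 0)
                        break;
                /* ... process n bytes ... */
        }
        close(pfd.fd);
        return 0;
}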
| 783 | 783 | ||
| 784 | /** | 784 | /** |
| 785 | * relay_file_release - release file op for relay files | 785 | * relay_file_release - release file op for relay files |
| 786 | * @inode: the inode | 786 | * @inode: the inode |
| 787 | * @filp: the file | 787 | * @filp: the file |
| 788 | * | 788 | * |
| 789 | * Decrements the channel buffer refcount, as the filesystem is | 789 | * Decrements the channel buffer refcount, as the filesystem is |
| 790 | * no longer using it. | 790 | * no longer using it. |
| 791 | */ | 791 | */ |
| 792 | static int relay_file_release(struct inode *inode, struct file *filp) | 792 | static int relay_file_release(struct inode *inode, struct file *filp) |
| 793 | { | 793 | { |
| 794 | struct rchan_buf *buf = filp->private_data; | 794 | struct rchan_buf *buf = filp->private_data; |
| 795 | kref_put(&buf->kref, relay_remove_buf); | 795 | kref_put(&buf->kref, relay_remove_buf); |
| 796 | 796 | ||
| 797 | return 0; | 797 | return 0; |
| 798 | } | 798 | } |
| 799 | 799 | ||
| 800 | /* | 800 | /* |
| 801 | * relay_file_read_consume - update the consumed count for the buffer | 801 | * relay_file_read_consume - update the consumed count for the buffer |
| 802 | */ | 802 | */ |
| 803 | static void relay_file_read_consume(struct rchan_buf *buf, | 803 | static void relay_file_read_consume(struct rchan_buf *buf, |
| 804 | size_t read_pos, | 804 | size_t read_pos, |
| 805 | size_t bytes_consumed) | 805 | size_t bytes_consumed) |
| 806 | { | 806 | { |
| 807 | size_t subbuf_size = buf->chan->subbuf_size; | 807 | size_t subbuf_size = buf->chan->subbuf_size; |
| 808 | size_t n_subbufs = buf->chan->n_subbufs; | 808 | size_t n_subbufs = buf->chan->n_subbufs; |
| 809 | size_t read_subbuf; | 809 | size_t read_subbuf; |
| 810 | 810 | ||
| 811 | if (buf->bytes_consumed + bytes_consumed > subbuf_size) { | 811 | if (buf->bytes_consumed + bytes_consumed > subbuf_size) { |
| 812 | relay_subbufs_consumed(buf->chan, buf->cpu, 1); | 812 | relay_subbufs_consumed(buf->chan, buf->cpu, 1); |
| 813 | buf->bytes_consumed = 0; | 813 | buf->bytes_consumed = 0; |
| 814 | } | 814 | } |
| 815 | 815 | ||
| 816 | buf->bytes_consumed += bytes_consumed; | 816 | buf->bytes_consumed += bytes_consumed; |
| 817 | if (!read_pos) | 817 | if (!read_pos) |
| 818 | read_subbuf = buf->subbufs_consumed % n_subbufs; | 818 | read_subbuf = buf->subbufs_consumed % n_subbufs; |
| 819 | else | 819 | else |
| 820 | read_subbuf = read_pos / buf->chan->subbuf_size; | 820 | read_subbuf = read_pos / buf->chan->subbuf_size; |
| 821 | if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) { | 821 | if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) { |
| 822 | if ((read_subbuf == buf->subbufs_produced % n_subbufs) && | 822 | if ((read_subbuf == buf->subbufs_produced % n_subbufs) && |
| 823 | (buf->offset == subbuf_size)) | 823 | (buf->offset == subbuf_size)) |
| 824 | return; | 824 | return; |
| 825 | relay_subbufs_consumed(buf->chan, buf->cpu, 1); | 825 | relay_subbufs_consumed(buf->chan, buf->cpu, 1); |
| 826 | buf->bytes_consumed = 0; | 826 | buf->bytes_consumed = 0; |
| 827 | } | 827 | } |
| 828 | } | 828 | } |
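A worked example with assumed numbers: with subbuf_size = 4096 and padding[read_subbuf] = 96, reads that bring bytes_consumed up to 4000 make bytes_consumed + padding equal 4096, at which point the whole sub-buffer is counted as consumed via relay_subbufs_consumed() and bytes_consumed resets to zero.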
| 829 | 829 | ||
| 830 | /* | 830 | /* |
| 831 | * relay_file_read_avail - boolean, are there unconsumed bytes available? | 831 | * relay_file_read_avail - boolean, are there unconsumed bytes available? |
| 832 | */ | 832 | */ |
| 833 | static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) | 833 | static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) |
| 834 | { | 834 | { |
| 835 | size_t subbuf_size = buf->chan->subbuf_size; | 835 | size_t subbuf_size = buf->chan->subbuf_size; |
| 836 | size_t n_subbufs = buf->chan->n_subbufs; | 836 | size_t n_subbufs = buf->chan->n_subbufs; |
| 837 | size_t produced = buf->subbufs_produced; | 837 | size_t produced = buf->subbufs_produced; |
| 838 | size_t consumed = buf->subbufs_consumed; | 838 | size_t consumed = buf->subbufs_consumed; |
| 839 | 839 | ||
| 840 | relay_file_read_consume(buf, read_pos, 0); | 840 | relay_file_read_consume(buf, read_pos, 0); |
| 841 | 841 | ||
| 842 | if (unlikely(buf->offset > subbuf_size)) { | 842 | if (unlikely(buf->offset > subbuf_size)) { |
| 843 | if (produced == consumed) | 843 | if (produced == consumed) |
| 844 | return 0; | 844 | return 0; |
| 845 | return 1; | 845 | return 1; |
| 846 | } | 846 | } |
| 847 | 847 | ||
| 848 | if (unlikely(produced - consumed >= n_subbufs)) { | 848 | if (unlikely(produced - consumed >= n_subbufs)) { |
| 849 | consumed = produced - n_subbufs + 1; | 849 | consumed = produced - n_subbufs + 1; |
| 850 | buf->subbufs_consumed = consumed; | 850 | buf->subbufs_consumed = consumed; |
| 851 | buf->bytes_consumed = 0; | 851 | buf->bytes_consumed = 0; |
| 852 | } | 852 | } |
| 853 | 853 | ||
| 854 | produced = (produced % n_subbufs) * subbuf_size + buf->offset; | 854 | produced = (produced % n_subbufs) * subbuf_size + buf->offset; |
| 855 | consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed; | 855 | consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed; |
| 856 | 856 | ||
| 857 | if (consumed > produced) | 857 | if (consumed > produced) |
| 858 | produced += n_subbufs * subbuf_size; | 858 | produced += n_subbufs * subbuf_size; |
| 859 | 859 | ||
| 860 | if (consumed == produced) | 860 | if (consumed == produced) |
| 861 | return 0; | 861 | return 0; |
| 862 | 862 | ||
| 863 | return 1; | 863 | return 1; |
| 864 | } | 864 | } |
| 865 | 865 | ||
| 866 | /** | 866 | /** |
| 867 | * relay_file_read_subbuf_avail - return bytes available in sub-buffer | 867 | * relay_file_read_subbuf_avail - return bytes available in sub-buffer |
| 868 | * @read_pos: file read position | 868 | * @read_pos: file read position |
| 869 | * @buf: relay channel buffer | 869 | * @buf: relay channel buffer |
| 870 | */ | 870 | */ |
| 871 | static size_t relay_file_read_subbuf_avail(size_t read_pos, | 871 | static size_t relay_file_read_subbuf_avail(size_t read_pos, |
| 872 | struct rchan_buf *buf) | 872 | struct rchan_buf *buf) |
| 873 | { | 873 | { |
| 874 | size_t padding, avail = 0; | 874 | size_t padding, avail = 0; |
| 875 | size_t read_subbuf, read_offset, write_subbuf, write_offset; | 875 | size_t read_subbuf, read_offset, write_subbuf, write_offset; |
| 876 | size_t subbuf_size = buf->chan->subbuf_size; | 876 | size_t subbuf_size = buf->chan->subbuf_size; |
| 877 | 877 | ||
| 878 | write_subbuf = (buf->data - buf->start) / subbuf_size; | 878 | write_subbuf = (buf->data - buf->start) / subbuf_size; |
| 879 | write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset; | 879 | write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset; |
| 880 | read_subbuf = read_pos / subbuf_size; | 880 | read_subbuf = read_pos / subbuf_size; |
| 881 | read_offset = read_pos % subbuf_size; | 881 | read_offset = read_pos % subbuf_size; |
| 882 | padding = buf->padding[read_subbuf]; | 882 | padding = buf->padding[read_subbuf]; |
| 883 | 883 | ||
| 884 | if (read_subbuf == write_subbuf) { | 884 | if (read_subbuf == write_subbuf) { |
| 885 | if (read_offset + padding < write_offset) | 885 | if (read_offset + padding < write_offset) |
| 886 | avail = write_offset - (read_offset + padding); | 886 | avail = write_offset - (read_offset + padding); |
| 887 | } else | 887 | } else |
| 888 | avail = (subbuf_size - padding) - read_offset; | 888 | avail = (subbuf_size - padding) - read_offset; |
| 889 | 889 | ||
| 890 | return avail; | 890 | return avail; |
| 891 | } | 891 | } |
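A worked example with assumed numbers: with subbuf_size = 4096, the reader in a different sub-buffer than the writer, padding = 100 and read_offset = 1000, the function returns (4096 - 100) - 1000 = 2996 available bytes in the read sub-buffer.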
| 892 | 892 | ||
| 893 | /** | 893 | /** |
| 894 | * relay_file_read_start_pos - find the first available byte to read | 894 | * relay_file_read_start_pos - find the first available byte to read |
| 895 | * @read_pos: file read position | 895 | * @read_pos: file read position |
| 896 | * @buf: relay channel buffer | 896 | * @buf: relay channel buffer |
| 897 | * | 897 | * |
| 898 | * If the @read_pos is in the middle of padding, return the | 898 | * If the @read_pos is in the middle of padding, return the |
| 899 | * position of the first actually available byte, otherwise | 899 | * position of the first actually available byte, otherwise |
| 900 | * return the original value. | 900 | * return the original value. |
| 901 | */ | 901 | */ |
| 902 | static size_t relay_file_read_start_pos(size_t read_pos, | 902 | static size_t relay_file_read_start_pos(size_t read_pos, |
| 903 | struct rchan_buf *buf) | 903 | struct rchan_buf *buf) |
| 904 | { | 904 | { |
| 905 | size_t read_subbuf, padding, padding_start, padding_end; | 905 | size_t read_subbuf, padding, padding_start, padding_end; |
| 906 | size_t subbuf_size = buf->chan->subbuf_size; | 906 | size_t subbuf_size = buf->chan->subbuf_size; |
| 907 | size_t n_subbufs = buf->chan->n_subbufs; | 907 | size_t n_subbufs = buf->chan->n_subbufs; |
| 908 | size_t consumed = buf->subbufs_consumed % n_subbufs; | 908 | size_t consumed = buf->subbufs_consumed % n_subbufs; |
| 909 | 909 | ||
| 910 | if (!read_pos) | 910 | if (!read_pos) |
| 911 | read_pos = consumed * subbuf_size + buf->bytes_consumed; | 911 | read_pos = consumed * subbuf_size + buf->bytes_consumed; |
| 912 | read_subbuf = read_pos / subbuf_size; | 912 | read_subbuf = read_pos / subbuf_size; |
| 913 | padding = buf->padding[read_subbuf]; | 913 | padding = buf->padding[read_subbuf]; |
| 914 | padding_start = (read_subbuf + 1) * subbuf_size - padding; | 914 | padding_start = (read_subbuf + 1) * subbuf_size - padding; |
| 915 | padding_end = (read_subbuf + 1) * subbuf_size; | 915 | padding_end = (read_subbuf + 1) * subbuf_size; |
| 916 | if (read_pos >= padding_start && read_pos < padding_end) { | 916 | if (read_pos >= padding_start && read_pos < padding_end) { |
| 917 | read_subbuf = (read_subbuf + 1) % n_subbufs; | 917 | read_subbuf = (read_subbuf + 1) % n_subbufs; |
| 918 | read_pos = read_subbuf * subbuf_size; | 918 | read_pos = read_subbuf * subbuf_size; |
| 919 | } | 919 | } |
| 920 | 920 | ||
| 921 | return read_pos; | 921 | return read_pos; |
| 922 | } | 922 | } |
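A worked example with assumed numbers: with subbuf_size = 4096, n_subbufs = 4 and padding[0] = 100, the padding region of sub-buffer 0 is [3996, 4096), so a read_pos of 4000 is moved to 4096, the start of sub-buffer 1, while a read_pos of 3900 is returned unchanged.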
| 923 | 923 | ||
| 924 | /** | 924 | /** |
| 925 | * relay_file_read_end_pos - return the new read position | 925 | * relay_file_read_end_pos - return the new read position |
| 926 | * @read_pos: file read position | 926 | * @read_pos: file read position |
| 927 | * @buf: relay channel buffer | 927 | * @buf: relay channel buffer |
| 928 | * @count: number of bytes to be read | 928 | * @count: number of bytes to be read |
| 929 | */ | 929 | */ |
| 930 | static size_t relay_file_read_end_pos(struct rchan_buf *buf, | 930 | static size_t relay_file_read_end_pos(struct rchan_buf *buf, |
| 931 | size_t read_pos, | 931 | size_t read_pos, |
| 932 | size_t count) | 932 | size_t count) |
| 933 | { | 933 | { |
| 934 | size_t read_subbuf, padding, end_pos; | 934 | size_t read_subbuf, padding, end_pos; |
| 935 | size_t subbuf_size = buf->chan->subbuf_size; | 935 | size_t subbuf_size = buf->chan->subbuf_size; |
| 936 | size_t n_subbufs = buf->chan->n_subbufs; | 936 | size_t n_subbufs = buf->chan->n_subbufs; |
| 937 | 937 | ||
| 938 | read_subbuf = read_pos / subbuf_size; | 938 | read_subbuf = read_pos / subbuf_size; |
| 939 | padding = buf->padding[read_subbuf]; | 939 | padding = buf->padding[read_subbuf]; |
| 940 | if (read_pos % subbuf_size + count + padding == subbuf_size) | 940 | if (read_pos % subbuf_size + count + padding == subbuf_size) |
| 941 | end_pos = (read_subbuf + 1) * subbuf_size; | 941 | end_pos = (read_subbuf + 1) * subbuf_size; |
| 942 | else | 942 | else |
| 943 | end_pos = read_pos + count; | 943 | end_pos = read_pos + count; |
| 944 | if (end_pos >= subbuf_size * n_subbufs) | 944 | if (end_pos >= subbuf_size * n_subbufs) |
| 945 | end_pos = 0; | 945 | end_pos = 0; |
| 946 | 946 | ||
| 947 | return end_pos; | 947 | return end_pos; |
| 948 | } | 948 | } |
| 949 | 949 | ||
| 950 | /* | 950 | /* |
| 951 | * subbuf_read_actor - read up to one subbuf's worth of data | 951 | * subbuf_read_actor - read up to one subbuf's worth of data |
| 952 | */ | 952 | */ |
| 953 | static int subbuf_read_actor(size_t read_start, | 953 | static int subbuf_read_actor(size_t read_start, |
| 954 | struct rchan_buf *buf, | 954 | struct rchan_buf *buf, |
| 955 | size_t avail, | 955 | size_t avail, |
| 956 | read_descriptor_t *desc, | 956 | read_descriptor_t *desc, |
| 957 | read_actor_t actor) | 957 | read_actor_t actor) |
| 958 | { | 958 | { |
| 959 | void *from; | 959 | void *from; |
| 960 | int ret = 0; | 960 | int ret = 0; |
| 961 | 961 | ||
| 962 | from = buf->start + read_start; | 962 | from = buf->start + read_start; |
| 963 | ret = avail; | 963 | ret = avail; |
| 964 | if (copy_to_user(desc->arg.buf, from, avail)) { | 964 | if (copy_to_user(desc->arg.buf, from, avail)) { |
| 965 | desc->error = -EFAULT; | 965 | desc->error = -EFAULT; |
| 966 | ret = 0; | 966 | ret = 0; |
| 967 | } | 967 | } |
| 968 | desc->arg.data += ret; | 968 | desc->arg.data += ret; |
| 969 | desc->written += ret; | 969 | desc->written += ret; |
| 970 | desc->count -= ret; | 970 | desc->count -= ret; |
| 971 | 971 | ||
| 972 | return ret; | 972 | return ret; |
| 973 | } | 973 | } |
| 974 | 974 | ||
| 975 | typedef int (*subbuf_actor_t) (size_t read_start, | 975 | typedef int (*subbuf_actor_t) (size_t read_start, |
| 976 | struct rchan_buf *buf, | 976 | struct rchan_buf *buf, |
| 977 | size_t avail, | 977 | size_t avail, |
| 978 | read_descriptor_t *desc, | 978 | read_descriptor_t *desc, |
| 979 | read_actor_t actor); | 979 | read_actor_t actor); |
| 980 | 980 | ||
| 981 | /* | 981 | /* |
| 982 | * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries | 982 | * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries |
| 983 | */ | 983 | */ |
| 984 | static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, | 984 | static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, |
| 985 | subbuf_actor_t subbuf_actor, | 985 | subbuf_actor_t subbuf_actor, |
| 986 | read_actor_t actor, | 986 | read_actor_t actor, |
| 987 | read_descriptor_t *desc) | 987 | read_descriptor_t *desc) |
| 988 | { | 988 | { |
| 989 | struct rchan_buf *buf = filp->private_data; | 989 | struct rchan_buf *buf = filp->private_data; |
| 990 | size_t read_start, avail; | 990 | size_t read_start, avail; |
| 991 | int ret; | 991 | int ret; |
| 992 | 992 | ||
| 993 | if (!desc->count) | 993 | if (!desc->count) |
| 994 | return 0; | 994 | return 0; |
| 995 | 995 | ||
| 996 | mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); | 996 | mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); |
| 997 | do { | 997 | do { |
| 998 | if (!relay_file_read_avail(buf, *ppos)) | 998 | if (!relay_file_read_avail(buf, *ppos)) |
| 999 | break; | 999 | break; |
| 1000 | 1000 | ||
| 1001 | read_start = relay_file_read_start_pos(*ppos, buf); | 1001 | read_start = relay_file_read_start_pos(*ppos, buf); |
| 1002 | avail = relay_file_read_subbuf_avail(read_start, buf); | 1002 | avail = relay_file_read_subbuf_avail(read_start, buf); |
| 1003 | if (!avail) | 1003 | if (!avail) |
| 1004 | break; | 1004 | break; |
| 1005 | 1005 | ||
| 1006 | avail = min(desc->count, avail); | 1006 | avail = min(desc->count, avail); |
| 1007 | ret = subbuf_actor(read_start, buf, avail, desc, actor); | 1007 | ret = subbuf_actor(read_start, buf, avail, desc, actor); |
| 1008 | if (desc->error < 0) | 1008 | if (desc->error < 0) |
| 1009 | break; | 1009 | break; |
| 1010 | 1010 | ||
| 1011 | if (ret) { | 1011 | if (ret) { |
| 1012 | relay_file_read_consume(buf, read_start, ret); | 1012 | relay_file_read_consume(buf, read_start, ret); |
| 1013 | *ppos = relay_file_read_end_pos(buf, read_start, ret); | 1013 | *ppos = relay_file_read_end_pos(buf, read_start, ret); |
| 1014 | } | 1014 | } |
| 1015 | } while (desc->count && ret); | 1015 | } while (desc->count && ret); |
| 1016 | mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); | 1016 | mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); |
| 1017 | 1017 | ||
| 1018 | return desc->written; | 1018 | return desc->written; |
| 1019 | } | 1019 | } |
| 1020 | 1020 | ||
| 1021 | static ssize_t relay_file_read(struct file *filp, | 1021 | static ssize_t relay_file_read(struct file *filp, |
| 1022 | char __user *buffer, | 1022 | char __user *buffer, |
| 1023 | size_t count, | 1023 | size_t count, |
| 1024 | loff_t *ppos) | 1024 | loff_t *ppos) |
| 1025 | { | 1025 | { |
| 1026 | read_descriptor_t desc; | 1026 | read_descriptor_t desc; |
| 1027 | desc.written = 0; | 1027 | desc.written = 0; |
| 1028 | desc.count = count; | 1028 | desc.count = count; |
| 1029 | desc.arg.buf = buffer; | 1029 | desc.arg.buf = buffer; |
| 1030 | desc.error = 0; | 1030 | desc.error = 0; |
| 1031 | return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, | 1031 | return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, |
| 1032 | NULL, &desc); | 1032 | NULL, &desc); |
| 1033 | } | 1033 | } |
| 1034 | 1034 | ||
| 1035 | static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) | 1035 | static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) |
| 1036 | { | 1036 | { |
| 1037 | rbuf->bytes_consumed += bytes_consumed; | 1037 | rbuf->bytes_consumed += bytes_consumed; |
| 1038 | 1038 | ||
| 1039 | if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) { | 1039 | if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) { |
| 1040 | relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1); | 1040 | relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1); |
| 1041 | rbuf->bytes_consumed %= rbuf->chan->subbuf_size; | 1041 | rbuf->bytes_consumed %= rbuf->chan->subbuf_size; |
| 1042 | } | 1042 | } |
| 1043 | } | 1043 | } |
| 1044 | 1044 | ||
| 1045 | static void relay_pipe_buf_release(struct pipe_inode_info *pipe, | 1045 | static void relay_pipe_buf_release(struct pipe_inode_info *pipe, |
| 1046 | struct pipe_buffer *buf) | 1046 | struct pipe_buffer *buf) |
| 1047 | { | 1047 | { |
| 1048 | struct rchan_buf *rbuf; | 1048 | struct rchan_buf *rbuf; |
| 1049 | 1049 | ||
| 1050 | rbuf = (struct rchan_buf *)page_private(buf->page); | 1050 | rbuf = (struct rchan_buf *)page_private(buf->page); |
| 1051 | relay_consume_bytes(rbuf, buf->private); | 1051 | relay_consume_bytes(rbuf, buf->private); |
| 1052 | } | 1052 | } |
| 1053 | 1053 | ||
| 1054 | static struct pipe_buf_operations relay_pipe_buf_ops = { | 1054 | static struct pipe_buf_operations relay_pipe_buf_ops = { |
| 1055 | .can_merge = 0, | 1055 | .can_merge = 0, |
| 1056 | .map = generic_pipe_buf_map, | 1056 | .map = generic_pipe_buf_map, |
| 1057 | .unmap = generic_pipe_buf_unmap, | 1057 | .unmap = generic_pipe_buf_unmap, |
| 1058 | .pin = generic_pipe_buf_pin, | 1058 | .confirm = generic_pipe_buf_confirm, |
| 1059 | .release = relay_pipe_buf_release, | 1059 | .release = relay_pipe_buf_release, |
| 1060 | .steal = generic_pipe_buf_steal, | 1060 | .steal = generic_pipe_buf_steal, |
| 1061 | .get = generic_pipe_buf_get, | 1061 | .get = generic_pipe_buf_get, |
| 1062 | }; | 1062 | }; |
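The hunk above is the one functional change to this file: the ->pin() slot becomes ->confirm(). A hedged sketch of the caller pattern implied by the renamed hook: a consumer calls ->confirm() and touches the page only on a zero return. Types come from <linux/pipe_fs_i.h>; the helper name is hypothetical.

#include <linux/pipe_fs_i.h>

/*
 * Sketch of a pipe-buffer consumer: ->confirm() checks that the
 * buffer's page is present and its contents valid before the page
 * is mapped and read.
 */
static int use_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
        int error = buf->ops->confirm(pipe, buf);
        if (error)
                return error;

        /* ... map buf->page and consume buf->len bytes at buf->offset ... */
        return 0;
}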
| 1063 | 1063 | ||
| 1064 | /** | 1064 | /** |
| 1065 | * subbuf_splice_actor - splice up to one subbuf's worth of data | 1065 | * subbuf_splice_actor - splice up to one subbuf's worth of data |
| 1066 | */ | 1066 | */ |
| 1067 | static int subbuf_splice_actor(struct file *in, | 1067 | static int subbuf_splice_actor(struct file *in, |
| 1068 | loff_t *ppos, | 1068 | loff_t *ppos, |
| 1069 | struct pipe_inode_info *pipe, | 1069 | struct pipe_inode_info *pipe, |
| 1070 | size_t len, | 1070 | size_t len, |
| 1071 | unsigned int flags, | 1071 | unsigned int flags, |
| 1072 | int *nonpad_ret) | 1072 | int *nonpad_ret) |
| 1073 | { | 1073 | { |
| 1074 | unsigned int pidx, poff, total_len, subbuf_pages, ret; | 1074 | unsigned int pidx, poff, total_len, subbuf_pages, ret; |
| 1075 | struct rchan_buf *rbuf = in->private_data; | 1075 | struct rchan_buf *rbuf = in->private_data; |
| 1076 | unsigned int subbuf_size = rbuf->chan->subbuf_size; | 1076 | unsigned int subbuf_size = rbuf->chan->subbuf_size; |
| 1077 | size_t read_start = ((size_t)*ppos) % rbuf->chan->alloc_size; | 1077 | size_t read_start = ((size_t)*ppos) % rbuf->chan->alloc_size; |
| 1078 | size_t read_subbuf = read_start / subbuf_size; | 1078 | size_t read_subbuf = read_start / subbuf_size; |
| 1079 | size_t padding = rbuf->padding[read_subbuf]; | 1079 | size_t padding = rbuf->padding[read_subbuf]; |
| 1080 | size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; | 1080 | size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; |
| 1081 | struct page *pages[PIPE_BUFFERS]; | 1081 | struct page *pages[PIPE_BUFFERS]; |
| 1082 | struct partial_page partial[PIPE_BUFFERS]; | 1082 | struct partial_page partial[PIPE_BUFFERS]; |
| 1083 | struct splice_pipe_desc spd = { | 1083 | struct splice_pipe_desc spd = { |
| 1084 | .pages = pages, | 1084 | .pages = pages, |
| 1085 | .nr_pages = 0, | 1085 | .nr_pages = 0, |
| 1086 | .partial = partial, | 1086 | .partial = partial, |
| 1087 | .flags = flags, | 1087 | .flags = flags, |
| 1088 | .ops = &relay_pipe_buf_ops, | 1088 | .ops = &relay_pipe_buf_ops, |
| 1089 | }; | 1089 | }; |
| 1090 | 1090 | ||
| 1091 | if (rbuf->subbufs_produced == rbuf->subbufs_consumed) | 1091 | if (rbuf->subbufs_produced == rbuf->subbufs_consumed) |
| 1092 | return 0; | 1092 | return 0; |
| 1093 | 1093 | ||
| 1094 | /* | 1094 | /* |
| 1095 | * Adjust read len, if longer than what is available | 1095 | * Adjust read len, if longer than what is available |
| 1096 | */ | 1096 | */ |
| 1097 | if (len > (subbuf_size - read_start % subbuf_size)) | 1097 | if (len > (subbuf_size - read_start % subbuf_size)) |
| 1098 | len = subbuf_size - read_start % subbuf_size; | 1098 | len = subbuf_size - read_start % subbuf_size; |
| 1099 | 1099 | ||
| 1100 | subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; | 1100 | subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; |
| 1101 | pidx = (read_start / PAGE_SIZE) % subbuf_pages; | 1101 | pidx = (read_start / PAGE_SIZE) % subbuf_pages; |
| 1102 | poff = read_start & ~PAGE_MASK; | 1102 | poff = read_start & ~PAGE_MASK; |
| 1103 | 1103 | ||
| 1104 | for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) { | 1104 | for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) { |
| 1105 | unsigned int this_len, this_end, private; | 1105 | unsigned int this_len, this_end, private; |
| 1106 | unsigned int cur_pos = read_start + total_len; | 1106 | unsigned int cur_pos = read_start + total_len; |
| 1107 | 1107 | ||
| 1108 | if (!len) | 1108 | if (!len) |
| 1109 | break; | 1109 | break; |
| 1110 | 1110 | ||
| 1111 | this_len = min_t(unsigned long, len, PAGE_SIZE - poff); | 1111 | this_len = min_t(unsigned long, len, PAGE_SIZE - poff); |
| 1112 | private = this_len; | 1112 | private = this_len; |
| 1113 | 1113 | ||
| 1114 | spd.pages[spd.nr_pages] = rbuf->page_array[pidx]; | 1114 | spd.pages[spd.nr_pages] = rbuf->page_array[pidx]; |
| 1115 | spd.partial[spd.nr_pages].offset = poff; | 1115 | spd.partial[spd.nr_pages].offset = poff; |
| 1116 | 1116 | ||
| 1117 | this_end = cur_pos + this_len; | 1117 | this_end = cur_pos + this_len; |
| 1118 | if (this_end >= nonpad_end) { | 1118 | if (this_end >= nonpad_end) { |
| 1119 | this_len = nonpad_end - cur_pos; | 1119 | this_len = nonpad_end - cur_pos; |
| 1120 | private = this_len + padding; | 1120 | private = this_len + padding; |
| 1121 | } | 1121 | } |
| 1122 | spd.partial[spd.nr_pages].len = this_len; | 1122 | spd.partial[spd.nr_pages].len = this_len; |
| 1123 | spd.partial[spd.nr_pages].private = private; | 1123 | spd.partial[spd.nr_pages].private = private; |
| 1124 | 1124 | ||
| 1125 | len -= this_len; | 1125 | len -= this_len; |
| 1126 | total_len += this_len; | 1126 | total_len += this_len; |
| 1127 | poff = 0; | 1127 | poff = 0; |
| 1128 | pidx = (pidx + 1) % subbuf_pages; | 1128 | pidx = (pidx + 1) % subbuf_pages; |
| 1129 | 1129 | ||
| 1130 | if (this_end >= nonpad_end) { | 1130 | if (this_end >= nonpad_end) { |
| 1131 | spd.nr_pages++; | 1131 | spd.nr_pages++; |
| 1132 | break; | 1132 | break; |
| 1133 | } | 1133 | } |
| 1134 | } | 1134 | } |
| 1135 | 1135 | ||
| 1136 | if (!spd.nr_pages) | 1136 | if (!spd.nr_pages) |
| 1137 | return 0; | 1137 | return 0; |
| 1138 | 1138 | ||
| 1139 | ret = *nonpad_ret = splice_to_pipe(pipe, &spd); | 1139 | ret = *nonpad_ret = splice_to_pipe(pipe, &spd); |
| 1140 | if (ret < 0 || ret < total_len) | 1140 | if (ret < 0 || ret < total_len) |
| 1141 | return ret; | 1141 | return ret; |
| 1142 | 1142 | ||
| 1143 | if (read_start + ret == nonpad_end) | 1143 | if (read_start + ret == nonpad_end) |
| 1144 | ret += padding; | 1144 | ret += padding; |
| 1145 | 1145 | ||
| 1146 | return ret; | 1146 | return ret; |
| 1147 | } | 1147 | } |
| 1148 | 1148 | ||
| 1149 | static ssize_t relay_file_splice_read(struct file *in, | 1149 | static ssize_t relay_file_splice_read(struct file *in, |
| 1150 | loff_t *ppos, | 1150 | loff_t *ppos, |
| 1151 | struct pipe_inode_info *pipe, | 1151 | struct pipe_inode_info *pipe, |
| 1152 | size_t len, | 1152 | size_t len, |
| 1153 | unsigned int flags) | 1153 | unsigned int flags) |
| 1154 | { | 1154 | { |
| 1155 | ssize_t spliced; | 1155 | ssize_t spliced; |
| 1156 | int ret; | 1156 | int ret; |
| 1157 | int nonpad_ret = 0; | 1157 | int nonpad_ret = 0; |
| 1158 | 1158 | ||
| 1159 | ret = 0; | 1159 | ret = 0; |
| 1160 | spliced = 0; | 1160 | spliced = 0; |
| 1161 | 1161 | ||
| 1162 | while (len) { | 1162 | while (len) { |
| 1163 | ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); | 1163 | ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); |
| 1164 | if (ret < 0) | 1164 | if (ret < 0) |
| 1165 | break; | 1165 | break; |
| 1166 | else if (!ret) { | 1166 | else if (!ret) { |
| 1167 | if (spliced) | 1167 | if (spliced) |
| 1168 | break; | 1168 | break; |
| 1169 | if (flags & SPLICE_F_NONBLOCK) { | 1169 | if (flags & SPLICE_F_NONBLOCK) { |
| 1170 | ret = -EAGAIN; | 1170 | ret = -EAGAIN; |
| 1171 | break; | 1171 | break; |
| 1172 | } | 1172 | } |
| 1173 | } | 1173 | } |
| 1174 | 1174 | ||
| 1175 | *ppos += ret; | 1175 | *ppos += ret; |
| 1176 | if (ret > len) | 1176 | if (ret > len) |
| 1177 | len = 0; | 1177 | len = 0; |
| 1178 | else | 1178 | else |
| 1179 | len -= ret; | 1179 | len -= ret; |
| 1180 | spliced += nonpad_ret; | 1180 | spliced += nonpad_ret; |
| 1181 | nonpad_ret = 0; | 1181 | nonpad_ret = 0; |
| 1182 | } | 1182 | } |
| 1183 | 1183 | ||
| 1184 | if (spliced) | 1184 | if (spliced) |
| 1185 | return spliced; | 1185 | return spliced; |
| 1186 | 1186 | ||
| 1187 | return ret; | 1187 | return ret; |
| 1188 | } | 1188 | } |
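On the user-space side, the splice op above lets relay data move into a pipe without a copy through user memory. A self-contained sketch; the relay file path is hypothetical:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int pfd[2];
        ssize_t n;
        int in = open("/sys/kernel/debug/mychan0", O_RDONLY); /* hypothetical */
        int out = open("trace.out", O_WRONLY | O_CREAT | O_TRUNC, 0644);

        if (in < 0 || out < 0 || pipe(pfd) < 0)
                return 1;

        /* relay file -> pipe -> regular file; blocks until data arrives */
        while ((n = splice(in, NULL, pfd[1], NULL, 65536, SPLICE_F_MOVE)) > 0)
                if (splice(pfd[0], NULL, out, NULL, n, SPLICE_F_MOVE) < 0)
                        break;

        return 0;
}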
| 1189 | 1189 | ||
| 1190 | const struct file_operations relay_file_operations = { | 1190 | const struct file_operations relay_file_operations = { |
| 1191 | .open = relay_file_open, | 1191 | .open = relay_file_open, |
| 1192 | .poll = relay_file_poll, | 1192 | .poll = relay_file_poll, |
| 1193 | .mmap = relay_file_mmap, | 1193 | .mmap = relay_file_mmap, |
| 1194 | .read = relay_file_read, | 1194 | .read = relay_file_read, |
| 1195 | .llseek = no_llseek, | 1195 | .llseek = no_llseek, |
| 1196 | .release = relay_file_release, | 1196 | .release = relay_file_release, |
| 1197 | .splice_read = relay_file_splice_read, | 1197 | .splice_read = relay_file_splice_read, |
| 1198 | }; | 1198 | }; |
| 1199 | EXPORT_SYMBOL_GPL(relay_file_operations); | 1199 | EXPORT_SYMBOL_GPL(relay_file_operations); |
| 1200 | 1200 | ||
| 1201 | static __init int relay_init(void) | 1201 | static __init int relay_init(void) |
| 1202 | { | 1202 | { |
| 1203 | 1203 | ||
| 1204 | hotcpu_notifier(relay_hotcpu_callback, 0); | 1204 | hotcpu_notifier(relay_hotcpu_callback, 0); |
| 1205 | return 0; | 1205 | return 0; |
| 1206 | } | 1206 | } |
| 1207 | 1207 | ||
| 1208 | module_init(relay_init); | 1208 | module_init(relay_init); |
| 1209 | 1209 |