Commit cac36bb06efe4880234524e117e0e712b10b1f16

Authored by Jens Axboe
1 parent d96e6e7164

pipe: change the ->pin() operation to ->confirm()

The name 'pin' was badly chosen; it doesn't pin a pipe buffer
in the sense most commonly used in the kernel. So change the
name to 'confirm', after debating this issue with Hugh
Dickins a bit.

A good return from ->confirm() means that the buffer is really
there, and that the contents are good.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

Showing 7 changed files with 22 additions and 20 deletions
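For context, the renamed hook is used exactly the way lo_splice_actor() in the diff below uses it: a splice actor must call ->confirm() on a pipe buffer before touching its page. A minimal sketch of that calling convention follows, assuming the pipe_buffer/splice_desc interfaces of this kernel; the actor name and body are illustrative, not part of this commit.

	/*
	 * Hedged sketch of the ->confirm() calling pattern, modelled on
	 * lo_splice_actor() below. "example_actor" is a hypothetical name;
	 * only the buf->ops->confirm() step is taken from this commit.
	 */
	static int example_actor(struct pipe_inode_info *pipe,
				 struct pipe_buffer *buf,
				 struct splice_desc *sd)
	{
		int ret;

		/*
		 * A good return means the buffer's page is really there
		 * and its contents are valid.
		 */
		ret = buf->ops->confirm(pipe, buf);
		if (unlikely(ret))
			return ret;

		/*
		 * Now safe to read buf->page, starting at buf->offset,
		 * for up to sd->len bytes.
		 */
		return sd->len;
	}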

drivers/block/loop.c
/*
 * linux/drivers/block/loop.c
 *
 * Written by Theodore Ts'o, 3/29/93
 *
 * Copyright 1993 by Theodore Ts'o. Redistribution of this file is
 * permitted under the GNU General Public License.
 *
 * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993
 * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996
 *
 * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
 * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
 *
 * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997
 *
 * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998
 *
 * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998
 *
 * Loadable modules and other fixes by AK, 1998
 *
 * Make real block number available to downstream transfer functions, enables
 * CBC (and relatives) mode encryption requiring unique IVs per data block.
 * Reed H. Petty, rhp@draper.net
 *
 * Maximum number of loop devices now dynamic via max_loop module parameter.
 * Russell Kroll <rkroll@exploits.org> 19990701
 *
 * Maximum number of loop devices when compiled-in now selectable by passing
 * max_loop=<1-255> to the kernel on boot.
 * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999
 *
 * Completely rewrite request handling to be make_request_fn style and
 * non blocking, pushing work to a helper thread. Lots of fixes from
 * Al Viro too.
 * Jens Axboe <axboe@suse.de>, Nov 2000
 *
 * Support up to 256 loop devices
 * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
 *
 * Support for falling back on the write file operation when the address space
 * operations prepare_write and/or commit_write are not available on the
 * backing filesystem.
 * Anton Altaparmakov, 16 Feb 2005
 *
 * Still To Fix:
 * - Advisory locking is ignored here.
 * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
 *
 */

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/init.h>
#include <linux/smp_lock.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/loop.h>
#include <linux/compat.h>
#include <linux/suspend.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>		/* for invalidate_bdev() */
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/kthread.h>
#include <linux/splice.h>

#include <asm/uaccess.h>

static LIST_HEAD(loop_devices);
static DEFINE_MUTEX(loop_devices_mutex);

/*
 * Transfer functions
 */
static int transfer_none(struct loop_device *lo, int cmd,
			 struct page *raw_page, unsigned raw_off,
			 struct page *loop_page, unsigned loop_off,
			 int size, sector_t real_block)
{
	char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
	char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;

	if (cmd == READ)
		memcpy(loop_buf, raw_buf, size);
	else
		memcpy(raw_buf, loop_buf, size);

	kunmap_atomic(raw_buf, KM_USER0);
	kunmap_atomic(loop_buf, KM_USER1);
	cond_resched();
	return 0;
}

static int transfer_xor(struct loop_device *lo, int cmd,
			struct page *raw_page, unsigned raw_off,
			struct page *loop_page, unsigned loop_off,
			int size, sector_t real_block)
{
	char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
	char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;
	char *in, *out, *key;
	int i, keysize;

	if (cmd == READ) {
		in = raw_buf;
		out = loop_buf;
	} else {
		in = loop_buf;
		out = raw_buf;
	}

	key = lo->lo_encrypt_key;
	keysize = lo->lo_encrypt_key_size;
	for (i = 0; i < size; i++)
		*out++ = *in++ ^ key[(i & 511) % keysize];

	kunmap_atomic(raw_buf, KM_USER0);
	kunmap_atomic(loop_buf, KM_USER1);
	cond_resched();
	return 0;
}

static int xor_init(struct loop_device *lo, const struct loop_info64 *info)
{
	if (unlikely(info->lo_encrypt_key_size <= 0))
		return -EINVAL;
	return 0;
}

static struct loop_func_table none_funcs = {
	.number = LO_CRYPT_NONE,
	.transfer = transfer_none,
};

static struct loop_func_table xor_funcs = {
	.number = LO_CRYPT_XOR,
	.transfer = transfer_xor,
	.init = xor_init
};

/* xfer_funcs[0] is special - its release function is never called */
static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
	&none_funcs,
	&xor_funcs
};

static loff_t get_loop_size(struct loop_device *lo, struct file *file)
{
	loff_t size, offset, loopsize;

	/* Compute loopsize in bytes */
	size = i_size_read(file->f_mapping->host);
	offset = lo->lo_offset;
	loopsize = size - offset;
	if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize)
		loopsize = lo->lo_sizelimit;

	/*
	 * Unfortunately, if we want to do I/O on the device,
	 * the number of 512-byte sectors has to fit into a sector_t.
	 */
	return loopsize >> 9;
}

static int
figure_loop_size(struct loop_device *lo)
{
	loff_t size = get_loop_size(lo, lo->lo_backing_file);
	sector_t x = (sector_t)size;

	if (unlikely((loff_t)x != size))
		return -EFBIG;

	set_capacity(lo->lo_disk, x);
	return 0;
}

static inline int
lo_do_transfer(struct loop_device *lo, int cmd,
	       struct page *rpage, unsigned roffs,
	       struct page *lpage, unsigned loffs,
	       int size, sector_t rblock)
{
	if (unlikely(!lo->transfer))
		return 0;

	return lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock);
}

/**
 * do_lo_send_aops - helper for writing data to a loop device
 *
 * This is the fast version for backing filesystems which implement the address
 * space operations prepare_write and commit_write.
 */
static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
		int bsize, loff_t pos, struct page *page)
{
	struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *aops = mapping->a_ops;
	pgoff_t index;
	unsigned offset, bv_offs;
	int len, ret;

	mutex_lock(&mapping->host->i_mutex);
	index = pos >> PAGE_CACHE_SHIFT;
	offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1);
	bv_offs = bvec->bv_offset;
	len = bvec->bv_len;
	while (len > 0) {
		sector_t IV;
		unsigned size;
		int transfer_result;

		IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
		size = PAGE_CACHE_SIZE - offset;
		if (size > len)
			size = len;
		page = grab_cache_page(mapping, index);
		if (unlikely(!page))
			goto fail;
		ret = aops->prepare_write(file, page, offset,
					  offset + size);
		if (unlikely(ret)) {
			if (ret == AOP_TRUNCATED_PAGE) {
				page_cache_release(page);
				continue;
			}
			goto unlock;
		}
		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
				bvec->bv_page, bv_offs, size, IV);
		if (unlikely(transfer_result)) {
			/*
			 * The transfer failed, but we still write the data to
			 * keep prepare/commit calls balanced.
			 */
			printk(KERN_ERR "loop: transfer error block %llu\n",
			       (unsigned long long)index);
			zero_user_page(page, offset, size, KM_USER0);
		}
		flush_dcache_page(page);
		ret = aops->commit_write(file, page, offset,
					 offset + size);
		if (unlikely(ret)) {
			if (ret == AOP_TRUNCATED_PAGE) {
				page_cache_release(page);
				continue;
			}
			goto unlock;
		}
		if (unlikely(transfer_result))
			goto unlock;
		bv_offs += size;
		len -= size;
		offset = 0;
		index++;
		pos += size;
		unlock_page(page);
		page_cache_release(page);
	}
	ret = 0;
out:
	mutex_unlock(&mapping->host->i_mutex);
	return ret;
unlock:
	unlock_page(page);
	page_cache_release(page);
fail:
	ret = -1;
	goto out;
}

/**
 * __do_lo_send_write - helper for writing data to a loop device
 *
 * This helper just factors out common code between do_lo_send_direct_write()
 * and do_lo_send_write().
 */
static int __do_lo_send_write(struct file *file,
		u8 *buf, const int len, loff_t pos)
{
	ssize_t bw;
	mm_segment_t old_fs = get_fs();

	set_fs(get_ds());
	bw = file->f_op->write(file, buf, len, &pos);
	set_fs(old_fs);
	if (likely(bw == len))
		return 0;
	printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n",
			(unsigned long long)pos, len);
	if (bw >= 0)
		bw = -EIO;
	return bw;
}

/**
 * do_lo_send_direct_write - helper for writing data to a loop device
 *
 * This is the fast, non-transforming version for backing filesystems which do
 * not implement the address space operations prepare_write and commit_write.
 * It uses the write file operation which should be present on all writeable
 * filesystems.
 */
static int do_lo_send_direct_write(struct loop_device *lo,
		struct bio_vec *bvec, int bsize, loff_t pos, struct page *page)
{
	ssize_t bw = __do_lo_send_write(lo->lo_backing_file,
			kmap(bvec->bv_page) + bvec->bv_offset,
			bvec->bv_len, pos);
	kunmap(bvec->bv_page);
	cond_resched();
	return bw;
}

/**
 * do_lo_send_write - helper for writing data to a loop device
 *
 * This is the slow, transforming version for filesystems which do not
 * implement the address space operations prepare_write and commit_write. It
 * uses the write file operation which should be present on all writeable
 * filesystems.
 *
 * Using fops->write is slower than using aops->{prepare,commit}_write in the
 * transforming case because we need to double buffer the data as we cannot do
 * the transformations in place as we do not have direct access to the
 * destination pages of the backing file.
 */
static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
		int bsize, loff_t pos, struct page *page)
{
	int ret = lo_do_transfer(lo, WRITE, page, 0, bvec->bv_page,
			bvec->bv_offset, bvec->bv_len, pos >> 9);
	if (likely(!ret))
		return __do_lo_send_write(lo->lo_backing_file,
				page_address(page), bvec->bv_len,
				pos);
	printk(KERN_ERR "loop: Transfer error at byte offset %llu, "
			"length %i.\n", (unsigned long long)pos, bvec->bv_len);
	if (ret > 0)
		ret = -EIO;
	return ret;
}

static int lo_send(struct loop_device *lo, struct bio *bio, int bsize,
		loff_t pos)
{
	int (*do_lo_send)(struct loop_device *, struct bio_vec *, int, loff_t,
			struct page *page);
	struct bio_vec *bvec;
	struct page *page = NULL;
	int i, ret = 0;

	do_lo_send = do_lo_send_aops;
	if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) {
		do_lo_send = do_lo_send_direct_write;
		if (lo->transfer != transfer_none) {
			page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
			if (unlikely(!page))
				goto fail;
			kmap(page);
			do_lo_send = do_lo_send_write;
		}
	}
	bio_for_each_segment(bvec, bio, i) {
		ret = do_lo_send(lo, bvec, bsize, pos, page);
		if (ret < 0)
			break;
		pos += bvec->bv_len;
	}
	if (page) {
		kunmap(page);
		__free_page(page);
	}
out:
	return ret;
fail:
	printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n");
	ret = -ENOMEM;
	goto out;
}

struct lo_read_data {
	struct loop_device *lo;
	struct page *page;
	unsigned offset;
	int bsize;
};

static int
lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
		struct splice_desc *sd)
{
	struct lo_read_data *p = sd->u.data;
	struct loop_device *lo = p->lo;
	struct page *page = buf->page;
	sector_t IV;
	size_t size;
	int ret;

-	ret = buf->ops->pin(pipe, buf);
+	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) +
							(buf->offset >> 9);
	size = sd->len;
	if (size > p->bsize)
		size = p->bsize;

	if (lo_do_transfer(lo, READ, page, buf->offset, p->page, p->offset, size, IV)) {
		printk(KERN_ERR "loop: transfer error block %ld\n",
		       page->index);
		size = -EINVAL;
	}

	flush_dcache_page(p->page);

	if (size > 0)
		p->offset += size;

	return size;
}

static int
lo_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
	return __splice_from_pipe(pipe, sd, lo_splice_actor);
}

static int
do_lo_receive(struct loop_device *lo,
	      struct bio_vec *bvec, int bsize, loff_t pos)
{
	struct lo_read_data cookie;
	struct splice_desc sd;
	struct file *file;
	long retval;

	cookie.lo = lo;
	cookie.page = bvec->bv_page;
	cookie.offset = bvec->bv_offset;
	cookie.bsize = bsize;

	sd.len = 0;
	sd.total_len = bvec->bv_len;
	sd.flags = 0;
	sd.pos = pos;
	sd.u.data = &cookie;

	file = lo->lo_backing_file;
	retval = splice_direct_to_actor(file, &sd, lo_direct_splice_actor);

	if (retval < 0)
		return retval;

	return 0;
}

static int
lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
{
	struct bio_vec *bvec;
	int i, ret = 0;

	bio_for_each_segment(bvec, bio, i) {
		ret = do_lo_receive(lo, bvec, bsize, pos);
		if (ret < 0)
			break;
		pos += bvec->bv_len;
	}
	return ret;
}

static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
{
	loff_t pos;
	int ret;

	pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
	if (bio_rw(bio) == WRITE)
		ret = lo_send(lo, bio, lo->lo_blocksize, pos);
	else
		ret = lo_receive(lo, bio, lo->lo_blocksize, pos);
	return ret;
}

/*
 * Add bio to back of pending list
 */
static void loop_add_bio(struct loop_device *lo, struct bio *bio)
{
	if (lo->lo_biotail) {
		lo->lo_biotail->bi_next = bio;
		lo->lo_biotail = bio;
	} else
		lo->lo_bio = lo->lo_biotail = bio;
}

/*
 * Grab first pending buffer
 */
static struct bio *loop_get_bio(struct loop_device *lo)
{
	struct bio *bio;

	if ((bio = lo->lo_bio)) {
		if (bio == lo->lo_biotail)
			lo->lo_biotail = NULL;
		lo->lo_bio = bio->bi_next;
		bio->bi_next = NULL;
	}

	return bio;
}

static int loop_make_request(request_queue_t *q, struct bio *old_bio)
{
	struct loop_device *lo = q->queuedata;
	int rw = bio_rw(old_bio);

	if (rw == READA)
		rw = READ;

	BUG_ON(!lo || (rw != READ && rw != WRITE));

	spin_lock_irq(&lo->lo_lock);
	if (lo->lo_state != Lo_bound)
		goto out;
	if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
		goto out;
	loop_add_bio(lo, old_bio);
	wake_up(&lo->lo_event);
	spin_unlock_irq(&lo->lo_lock);
	return 0;

out:
	spin_unlock_irq(&lo->lo_lock);
	bio_io_error(old_bio, old_bio->bi_size);
	return 0;
}

/*
 * kick off io on the underlying address space
 */
static void loop_unplug(request_queue_t *q)
{
	struct loop_device *lo = q->queuedata;

	clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
	blk_run_address_space(lo->lo_backing_file->f_mapping);
}

struct switch_request {
	struct file *file;
	struct completion wait;
};

static void do_loop_switch(struct loop_device *, struct switch_request *);

static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
{
	if (unlikely(!bio->bi_bdev)) {
		do_loop_switch(lo, bio->bi_private);
		bio_put(bio);
	} else {
		int ret = do_bio_filebacked(lo, bio);
		bio_endio(bio, bio->bi_size, ret);
	}
}

/*
 * worker thread that handles reads/writes to file backed loop devices,
 * to avoid blocking in our make_request_fn. it also does loop decrypting
 * on reads for block backed loop, as that is too heavy to do from
 * b_end_io context where irqs may be disabled.
 *
 * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before
 * calling kthread_stop(). Therefore once kthread_should_stop() is
 * true, make_request will not place any more requests. Therefore
 * once kthread_should_stop() is true and lo_bio is NULL, we are
 * done with the loop.
 */
static int loop_thread(void *data)
{
	struct loop_device *lo = data;
	struct bio *bio;

	/*
	 * loop can be used in an encrypted device,
	 * hence, it mustn't be stopped at all
	 * because it could be indirectly used during suspension
	 */
	current->flags |= PF_NOFREEZE;

	set_user_nice(current, -20);

	while (!kthread_should_stop() || lo->lo_bio) {

		wait_event_interruptible(lo->lo_event,
				lo->lo_bio || kthread_should_stop());

		if (!lo->lo_bio)
			continue;
		spin_lock_irq(&lo->lo_lock);
		bio = loop_get_bio(lo);
		spin_unlock_irq(&lo->lo_lock);

		BUG_ON(!bio);
		loop_handle_bio(lo, bio);
	}

	return 0;
}

/*
 * loop_switch performs the hard work of switching a backing store.
 * First it needs to flush existing IO, it does this by sending a magic
 * BIO down the pipe. The completion of this BIO does the actual switch.
 */
static int loop_switch(struct loop_device *lo, struct file *file)
{
	struct switch_request w;
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
	if (!bio)
		return -ENOMEM;
	init_completion(&w.wait);
	w.file = file;
	bio->bi_private = &w;
	bio->bi_bdev = NULL;
	loop_make_request(lo->lo_queue, bio);
	wait_for_completion(&w.wait);
	return 0;
}

/*
 * Do the actual switch; called from the BIO completion routine
 */
static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
{
	struct file *file = p->file;
	struct file *old_file = lo->lo_backing_file;
	struct address_space *mapping = file->f_mapping;

	mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
	lo->lo_backing_file = file;
	lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
		mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
	lo->old_gfp_mask = mapping_gfp_mask(mapping);
	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
	complete(&p->wait);
}


/*
 * loop_change_fd switched the backing store of a loopback device to
 * a new file. This is useful for operating system installers to free up
 * the original file and in High Availability environments to switch to
 * an alternative location for the content in case of server meltdown.
 * This can only work if the loop device is used read-only, and if the
 * new backing store is the same size and type as the old backing store.
 */
static int loop_change_fd(struct loop_device *lo, struct file *lo_file,
		       struct block_device *bdev, unsigned int arg)
{
	struct file *file, *old_file;
	struct inode *inode;
	int error;

	error = -ENXIO;
	if (lo->lo_state != Lo_bound)
		goto out;

	/* the loop device has to be read-only */
	error = -EINVAL;
	if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
		goto out;

	error = -EBADF;
	file = fget(arg);
	if (!file)
		goto out;

	inode = file->f_mapping->host;
	old_file = lo->lo_backing_file;

	error = -EINVAL;

	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
		goto out_putf;

	/* new backing store needs to support loop (eg splice_read) */
	if (!inode->i_fop->splice_read)
		goto out_putf;

	/* size of the new backing store needs to be the same */
	if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
		goto out_putf;

	/* and ... switch */
	error = loop_switch(lo, file);
	if (error)
		goto out_putf;

	fput(old_file);
	return 0;

out_putf:
	fput(file);
out:
	return error;
}

static inline int is_loop_device(struct file *file)
{
	struct inode *i = file->f_mapping->host;

	return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
}

static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
		       struct block_device *bdev, unsigned int arg)
{
	struct file *file, *f;
	struct inode *inode;
	struct address_space *mapping;
	unsigned lo_blocksize;
	int lo_flags = 0;
	int error;
	loff_t size;

	/* This is safe, since we have a reference from open(). */
	__module_get(THIS_MODULE);

	error = -EBADF;
	file = fget(arg);
	if (!file)
		goto out;

	error = -EBUSY;
	if (lo->lo_state != Lo_unbound)
		goto out_putf;

	/* Avoid recursion */
	f = file;
	while (is_loop_device(f)) {
		struct loop_device *l;

		if (f->f_mapping->host->i_rdev == lo_file->f_mapping->host->i_rdev)
			goto out_putf;

		l = f->f_mapping->host->i_bdev->bd_disk->private_data;
		if (l->lo_state == Lo_unbound) {
			error = -EINVAL;
			goto out_putf;
		}
		f = l->lo_backing_file;
	}

	mapping = file->f_mapping;
	inode = mapping->host;

	if (!(file->f_mode & FMODE_WRITE))
		lo_flags |= LO_FLAGS_READ_ONLY;

	error = -EINVAL;
	if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) {
		const struct address_space_operations *aops = mapping->a_ops;
		/*
		 * If we can't read - sorry. If we only can't write - well,
		 * it's going to be read-only.
		 */
		if (!file->f_op->splice_read)
			goto out_putf;
		if (aops->prepare_write && aops->commit_write)
			lo_flags |= LO_FLAGS_USE_AOPS;
		if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
			lo_flags |= LO_FLAGS_READ_ONLY;

		lo_blocksize = S_ISBLK(inode->i_mode) ?
			inode->i_bdev->bd_block_size : PAGE_SIZE;

		error = 0;
	} else {
		goto out_putf;
	}

	size = get_loop_size(lo, file);

	if ((loff_t)(sector_t)size != size) {
		error = -EFBIG;
		goto out_putf;
	}

	if (!(lo_file->f_mode & FMODE_WRITE))
		lo_flags |= LO_FLAGS_READ_ONLY;

	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);

	lo->lo_blocksize = lo_blocksize;
	lo->lo_device = bdev;
	lo->lo_flags = lo_flags;
	lo->lo_backing_file = file;
	lo->transfer = transfer_none;
	lo->ioctl = NULL;
	lo->lo_sizelimit = 0;
	lo->old_gfp_mask = mapping_gfp_mask(mapping);
	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));

	lo->lo_bio = lo->lo_biotail = NULL;

	/*
	 * set queue make_request_fn, and add limits based on lower level
	 * device
	 */
	blk_queue_make_request(lo->lo_queue, loop_make_request);
	lo->lo_queue->queuedata = lo;
	lo->lo_queue->unplug_fn = loop_unplug;

	set_capacity(lo->lo_disk, size);
	bd_set_size(bdev, size << 9);

	set_blocksize(bdev, lo_blocksize);

	lo->lo_thread = kthread_create(loop_thread, lo, "loop%d",
						lo->lo_number);
	if (IS_ERR(lo->lo_thread)) {
		error = PTR_ERR(lo->lo_thread);
		goto out_clr;
	}
	lo->lo_state = Lo_bound;
	wake_up_process(lo->lo_thread);
	return 0;

out_clr:
	lo->lo_thread = NULL;
	lo->lo_device = NULL;
	lo->lo_backing_file = NULL;
	lo->lo_flags = 0;
	set_capacity(lo->lo_disk, 0);
	invalidate_bdev(bdev);
	bd_set_size(bdev, 0);
	mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
	lo->lo_state = Lo_unbound;
out_putf:
	fput(file);
out:
	/* This is safe: open() is still holding a reference. */
	module_put(THIS_MODULE);
	return error;
}

static int
loop_release_xfer(struct loop_device *lo)
{
	int err = 0;
	struct loop_func_table *xfer = lo->lo_encryption;

	if (xfer) {
		if (xfer->release)
			err = xfer->release(lo);
		lo->transfer = NULL;
		lo->lo_encryption = NULL;
		module_put(xfer->owner);
	}
	return err;
}

static int
loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
	       const struct loop_info64 *i)
{
	int err = 0;

	if (xfer) {
		struct module *owner = xfer->owner;

		if (!try_module_get(owner))
			return -EINVAL;
		if (xfer->init)
			err = xfer->init(lo, i);
		if (err)
			module_put(owner);
		else
			lo->lo_encryption = xfer;
	}
	return err;
}

static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
{
	struct file *filp = lo->lo_backing_file;
	gfp_t gfp = lo->old_gfp_mask;

	if (lo->lo_state != Lo_bound)
		return -ENXIO;

	if (lo->lo_refcnt > 1)	/* we needed one fd for the ioctl */
		return -EBUSY;

	if (filp == NULL)
		return -EINVAL;

	spin_lock_irq(&lo->lo_lock);
	lo->lo_state = Lo_rundown;
	spin_unlock_irq(&lo->lo_lock);

	kthread_stop(lo->lo_thread);

	lo->lo_backing_file = NULL;

	loop_release_xfer(lo);
	lo->transfer = NULL;
	lo->ioctl = NULL;
	lo->lo_device = NULL;
	lo->lo_encryption = NULL;
	lo->lo_offset = 0;
	lo->lo_sizelimit = 0;
	lo->lo_encrypt_key_size = 0;
	lo->lo_flags = 0;
	lo->lo_thread = NULL;
	memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
	memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
	memset(lo->lo_file_name, 0, LO_NAME_SIZE);
	invalidate_bdev(bdev);
	set_capacity(lo->lo_disk, 0);
	bd_set_size(bdev, 0);
	mapping_set_gfp_mask(filp->f_mapping, gfp);
	lo->lo_state = Lo_unbound;
	fput(filp);
	/* This is safe: open() is still holding a reference. */
	module_put(THIS_MODULE);
	return 0;
}

static int
loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
{
	int err;
	struct loop_func_table *xfer;

	if (lo->lo_encrypt_key_size && lo->lo_key_owner != current->uid &&
	    !capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (lo->lo_state != Lo_bound)
		return -ENXIO;
	if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
		return -EINVAL;

	err = loop_release_xfer(lo);
	if (err)
		return err;

	if (info->lo_encrypt_type) {
		unsigned int type = info->lo_encrypt_type;

		if (type >= MAX_LO_CRYPT)
			return -EINVAL;
		xfer = xfer_funcs[type];
		if (xfer == NULL)
			return -EINVAL;
	} else
		xfer = NULL;

	err = loop_init_xfer(lo, xfer, info);
	if (err)
		return err;

	if (lo->lo_offset != info->lo_offset ||
	    lo->lo_sizelimit != info->lo_sizelimit) {
		lo->lo_offset = info->lo_offset;
		lo->lo_sizelimit = info->lo_sizelimit;
		if (figure_loop_size(lo))
			return -EFBIG;
	}

	memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
	memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE);
	lo->lo_file_name[LO_NAME_SIZE-1] = 0;
	lo->lo_crypt_name[LO_NAME_SIZE-1] = 0;

	if (!xfer)
		xfer = &none_funcs;
	lo->transfer = xfer->transfer;
	lo->ioctl = xfer->ioctl;

	lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
	lo->lo_init[0] = info->lo_init[0];
	lo->lo_init[1] = info->lo_init[1];
	if (info->lo_encrypt_key_size) {
		memcpy(lo->lo_encrypt_key, info->lo_encrypt_key,
		       info->lo_encrypt_key_size);
		lo->lo_key_owner = current->uid;
	}

	return 0;
}

static int
loop_get_status(struct loop_device *lo, struct loop_info64 *info)
{
	struct file *file = lo->lo_backing_file;
	struct kstat stat;
	int error;

	if (lo->lo_state != Lo_bound)
		return -ENXIO;
	error = vfs_getattr(file->f_path.mnt, file->f_path.dentry, &stat);
	if (error)
		return error;
	memset(info, 0, sizeof(*info));
	info->lo_number = lo->lo_number;
	info->lo_device = huge_encode_dev(stat.dev);
	info->lo_inode = stat.ino;
	info->lo_rdevice = huge_encode_dev(lo->lo_device ? stat.rdev : stat.dev);
	info->lo_offset = lo->lo_offset;
	info->lo_sizelimit = lo->lo_sizelimit;
	info->lo_flags = lo->lo_flags;
	memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE);
	memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE);
	info->lo_encrypt_type =
		lo->lo_encryption ? lo->lo_encryption->number : 0;
	if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) {
		info->lo_encrypt_key_size = lo->lo_encrypt_key_size;
		memcpy(info->lo_encrypt_key, lo->lo_encrypt_key,
		       lo->lo_encrypt_key_size);
	}
	return 0;
}

static void
loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64)
{
	memset(info64, 0, sizeof(*info64));
	info64->lo_number = info->lo_number;
	info64->lo_device = info->lo_device;
	info64->lo_inode = info->lo_inode;
	info64->lo_rdevice = info->lo_rdevice;
	info64->lo_offset = info->lo_offset;
	info64->lo_sizelimit = 0;
	info64->lo_encrypt_type = info->lo_encrypt_type;
	info64->lo_encrypt_key_size = info->lo_encrypt_key_size;
	info64->lo_flags = info->lo_flags;
	info64->lo_init[0] = info->lo_init[0];
	info64->lo_init[1] = info->lo_init[1];
	if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
		memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE);
	else
		memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);
	memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE);
}

static int
loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info)
{
	memset(info, 0, sizeof(*info));
	info->lo_number = info64->lo_number;
	info->lo_device = info64->lo_device;
1073 info->lo_inode = info64->lo_inode; 1073 info->lo_inode = info64->lo_inode;
1074 info->lo_rdevice = info64->lo_rdevice; 1074 info->lo_rdevice = info64->lo_rdevice;
1075 info->lo_offset = info64->lo_offset; 1075 info->lo_offset = info64->lo_offset;
1076 info->lo_encrypt_type = info64->lo_encrypt_type; 1076 info->lo_encrypt_type = info64->lo_encrypt_type;
1077 info->lo_encrypt_key_size = info64->lo_encrypt_key_size; 1077 info->lo_encrypt_key_size = info64->lo_encrypt_key_size;
1078 info->lo_flags = info64->lo_flags; 1078 info->lo_flags = info64->lo_flags;
1079 info->lo_init[0] = info64->lo_init[0]; 1079 info->lo_init[0] = info64->lo_init[0];
1080 info->lo_init[1] = info64->lo_init[1]; 1080 info->lo_init[1] = info64->lo_init[1];
1081 if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI) 1081 if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
1082 memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE); 1082 memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
1083 else 1083 else
1084 memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE); 1084 memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);
1085 memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE); 1085 memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
1086 1086
1087 /* error in case values were truncated */ 1087 /* error in case values were truncated */
1088 if (info->lo_device != info64->lo_device || 1088 if (info->lo_device != info64->lo_device ||
1089 info->lo_rdevice != info64->lo_rdevice || 1089 info->lo_rdevice != info64->lo_rdevice ||
1090 info->lo_inode != info64->lo_inode || 1090 info->lo_inode != info64->lo_inode ||
1091 info->lo_offset != info64->lo_offset) 1091 info->lo_offset != info64->lo_offset)
1092 return -EOVERFLOW; 1092 return -EOVERFLOW;
1093 1093
1094 return 0; 1094 return 0;
1095 } 1095 }
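
The -EOVERFLOW path works by round-tripping each value through the narrower loop_info field and comparing it back against the 64-bit source. The same pattern as a standalone program (the narrowing conversion is implementation-defined in standard C, but wraps on the ABIs Linux targets):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long off64 = 1ULL << 40; /* does not fit in 32 bits */
            int off32 = off64;                     /* narrowing assignment */

            if (off32 != off64)
                    printf("truncated: the driver would return -EOVERFLOW\n");
            return 0;
    }
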
1096 1096
1097 static int 1097 static int
1098 loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg) 1098 loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg)
1099 { 1099 {
1100 struct loop_info info; 1100 struct loop_info info;
1101 struct loop_info64 info64; 1101 struct loop_info64 info64;
1102 1102
1103 if (copy_from_user(&info, arg, sizeof (struct loop_info))) 1103 if (copy_from_user(&info, arg, sizeof (struct loop_info)))
1104 return -EFAULT; 1104 return -EFAULT;
1105 loop_info64_from_old(&info, &info64); 1105 loop_info64_from_old(&info, &info64);
1106 return loop_set_status(lo, &info64); 1106 return loop_set_status(lo, &info64);
1107 } 1107 }
1108 1108
1109 static int 1109 static int
1110 loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg) 1110 loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg)
1111 { 1111 {
1112 struct loop_info64 info64; 1112 struct loop_info64 info64;
1113 1113
1114 if (copy_from_user(&info64, arg, sizeof (struct loop_info64))) 1114 if (copy_from_user(&info64, arg, sizeof (struct loop_info64)))
1115 return -EFAULT; 1115 return -EFAULT;
1116 return loop_set_status(lo, &info64); 1116 return loop_set_status(lo, &info64);
1117 } 1117 }
1118 1118
1119 static int 1119 static int
1120 loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) { 1120 loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) {
1121 struct loop_info info; 1121 struct loop_info info;
1122 struct loop_info64 info64; 1122 struct loop_info64 info64;
1123 int err = 0; 1123 int err = 0;
1124 1124
1125 if (!arg) 1125 if (!arg)
1126 err = -EINVAL; 1126 err = -EINVAL;
1127 if (!err) 1127 if (!err)
1128 err = loop_get_status(lo, &info64); 1128 err = loop_get_status(lo, &info64);
1129 if (!err) 1129 if (!err)
1130 err = loop_info64_to_old(&info64, &info); 1130 err = loop_info64_to_old(&info64, &info);
1131 if (!err && copy_to_user(arg, &info, sizeof(info))) 1131 if (!err && copy_to_user(arg, &info, sizeof(info)))
1132 err = -EFAULT; 1132 err = -EFAULT;
1133 1133
1134 return err; 1134 return err;
1135 } 1135 }
1136 1136
1137 static int 1137 static int
1138 loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { 1138 loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
1139 struct loop_info64 info64; 1139 struct loop_info64 info64;
1140 int err = 0; 1140 int err = 0;
1141 1141
1142 if (!arg) 1142 if (!arg)
1143 err = -EINVAL; 1143 err = -EINVAL;
1144 if (!err) 1144 if (!err)
1145 err = loop_get_status(lo, &info64); 1145 err = loop_get_status(lo, &info64);
1146 if (!err && copy_to_user(arg, &info64, sizeof(info64))) 1146 if (!err && copy_to_user(arg, &info64, sizeof(info64)))
1147 err = -EFAULT; 1147 err = -EFAULT;
1148 1148
1149 return err; 1149 return err;
1150 } 1150 }
1151 1151
1152 static int lo_ioctl(struct inode * inode, struct file * file, 1152 static int lo_ioctl(struct inode * inode, struct file * file,
1153 unsigned int cmd, unsigned long arg) 1153 unsigned int cmd, unsigned long arg)
1154 { 1154 {
1155 struct loop_device *lo = inode->i_bdev->bd_disk->private_data; 1155 struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
1156 int err; 1156 int err;
1157 1157
1158 mutex_lock(&lo->lo_ctl_mutex); 1158 mutex_lock(&lo->lo_ctl_mutex);
1159 switch (cmd) { 1159 switch (cmd) {
1160 case LOOP_SET_FD: 1160 case LOOP_SET_FD:
1161 err = loop_set_fd(lo, file, inode->i_bdev, arg); 1161 err = loop_set_fd(lo, file, inode->i_bdev, arg);
1162 break; 1162 break;
1163 case LOOP_CHANGE_FD: 1163 case LOOP_CHANGE_FD:
1164 err = loop_change_fd(lo, file, inode->i_bdev, arg); 1164 err = loop_change_fd(lo, file, inode->i_bdev, arg);
1165 break; 1165 break;
1166 case LOOP_CLR_FD: 1166 case LOOP_CLR_FD:
1167 err = loop_clr_fd(lo, inode->i_bdev); 1167 err = loop_clr_fd(lo, inode->i_bdev);
1168 break; 1168 break;
1169 case LOOP_SET_STATUS: 1169 case LOOP_SET_STATUS:
1170 err = loop_set_status_old(lo, (struct loop_info __user *) arg); 1170 err = loop_set_status_old(lo, (struct loop_info __user *) arg);
1171 break; 1171 break;
1172 case LOOP_GET_STATUS: 1172 case LOOP_GET_STATUS:
1173 err = loop_get_status_old(lo, (struct loop_info __user *) arg); 1173 err = loop_get_status_old(lo, (struct loop_info __user *) arg);
1174 break; 1174 break;
1175 case LOOP_SET_STATUS64: 1175 case LOOP_SET_STATUS64:
1176 err = loop_set_status64(lo, (struct loop_info64 __user *) arg); 1176 err = loop_set_status64(lo, (struct loop_info64 __user *) arg);
1177 break; 1177 break;
1178 case LOOP_GET_STATUS64: 1178 case LOOP_GET_STATUS64:
1179 err = loop_get_status64(lo, (struct loop_info64 __user *) arg); 1179 err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
1180 break; 1180 break;
1181 default: 1181 default:
1182 err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; 1182 err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
1183 } 1183 }
1184 mutex_unlock(&lo->lo_ctl_mutex); 1184 mutex_unlock(&lo->lo_ctl_mutex);
1185 return err; 1185 return err;
1186 } 1186 }
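
The LOOP_SET_FD and LOOP_CLR_FD cases above are the heart of what losetup does. A hedged userspace sketch of the bind/detach cycle (error handling trimmed; typically requires root):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/loop.h>

    /* Bind a backing file to a loop device, then detach it again. */
    int bind_and_release(const char *loop_dev, const char *backing)
    {
            int lfd = open(loop_dev, O_RDWR);
            int bfd = open(backing, O_RDWR);
            int err = -1;

            if (lfd < 0 || bfd < 0)
                    goto out;
            err = ioctl(lfd, LOOP_SET_FD, bfd);       /* -> loop_set_fd() */
            if (!err)
                    err = ioctl(lfd, LOOP_CLR_FD, 0); /* -> loop_clr_fd() */
    out:
            if (bfd >= 0)
                    close(bfd);     /* the driver keeps its own reference */
            if (lfd >= 0)
                    close(lfd);
            return err;
    }
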
1187 1187
1188 #ifdef CONFIG_COMPAT 1188 #ifdef CONFIG_COMPAT
1189 struct compat_loop_info { 1189 struct compat_loop_info {
1190 compat_int_t lo_number; /* ioctl r/o */ 1190 compat_int_t lo_number; /* ioctl r/o */
1191 compat_dev_t lo_device; /* ioctl r/o */ 1191 compat_dev_t lo_device; /* ioctl r/o */
1192 compat_ulong_t lo_inode; /* ioctl r/o */ 1192 compat_ulong_t lo_inode; /* ioctl r/o */
1193 compat_dev_t lo_rdevice; /* ioctl r/o */ 1193 compat_dev_t lo_rdevice; /* ioctl r/o */
1194 compat_int_t lo_offset; 1194 compat_int_t lo_offset;
1195 compat_int_t lo_encrypt_type; 1195 compat_int_t lo_encrypt_type;
1196 compat_int_t lo_encrypt_key_size; /* ioctl w/o */ 1196 compat_int_t lo_encrypt_key_size; /* ioctl w/o */
1197 compat_int_t lo_flags; /* ioctl r/o */ 1197 compat_int_t lo_flags; /* ioctl r/o */
1198 char lo_name[LO_NAME_SIZE]; 1198 char lo_name[LO_NAME_SIZE];
1199 unsigned char lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */ 1199 unsigned char lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */
1200 compat_ulong_t lo_init[2]; 1200 compat_ulong_t lo_init[2];
1201 char reserved[4]; 1201 char reserved[4];
1202 }; 1202 };
1203 1203
1204 /* 1204 /*
1205 * Transfer 32-bit compatibility structure in userspace to 64-bit loop info 1205 * Transfer 32-bit compatibility structure in userspace to 64-bit loop info
1206 * - noinlined to reduce stack space usage in main part of driver 1206 * - noinlined to reduce stack space usage in main part of driver
1207 */ 1207 */
1208 static noinline int 1208 static noinline int
1209 loop_info64_from_compat(const struct compat_loop_info __user *arg, 1209 loop_info64_from_compat(const struct compat_loop_info __user *arg,
1210 struct loop_info64 *info64) 1210 struct loop_info64 *info64)
1211 { 1211 {
1212 struct compat_loop_info info; 1212 struct compat_loop_info info;
1213 1213
1214 if (copy_from_user(&info, arg, sizeof(info))) 1214 if (copy_from_user(&info, arg, sizeof(info)))
1215 return -EFAULT; 1215 return -EFAULT;
1216 1216
1217 memset(info64, 0, sizeof(*info64)); 1217 memset(info64, 0, sizeof(*info64));
1218 info64->lo_number = info.lo_number; 1218 info64->lo_number = info.lo_number;
1219 info64->lo_device = info.lo_device; 1219 info64->lo_device = info.lo_device;
1220 info64->lo_inode = info.lo_inode; 1220 info64->lo_inode = info.lo_inode;
1221 info64->lo_rdevice = info.lo_rdevice; 1221 info64->lo_rdevice = info.lo_rdevice;
1222 info64->lo_offset = info.lo_offset; 1222 info64->lo_offset = info.lo_offset;
1223 info64->lo_sizelimit = 0; 1223 info64->lo_sizelimit = 0;
1224 info64->lo_encrypt_type = info.lo_encrypt_type; 1224 info64->lo_encrypt_type = info.lo_encrypt_type;
1225 info64->lo_encrypt_key_size = info.lo_encrypt_key_size; 1225 info64->lo_encrypt_key_size = info.lo_encrypt_key_size;
1226 info64->lo_flags = info.lo_flags; 1226 info64->lo_flags = info.lo_flags;
1227 info64->lo_init[0] = info.lo_init[0]; 1227 info64->lo_init[0] = info.lo_init[0];
1228 info64->lo_init[1] = info.lo_init[1]; 1228 info64->lo_init[1] = info.lo_init[1];
1229 if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI) 1229 if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
1230 memcpy(info64->lo_crypt_name, info.lo_name, LO_NAME_SIZE); 1230 memcpy(info64->lo_crypt_name, info.lo_name, LO_NAME_SIZE);
1231 else 1231 else
1232 memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE); 1232 memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE);
1233 memcpy(info64->lo_encrypt_key, info.lo_encrypt_key, LO_KEY_SIZE); 1233 memcpy(info64->lo_encrypt_key, info.lo_encrypt_key, LO_KEY_SIZE);
1234 return 0; 1234 return 0;
1235 } 1235 }
1236 1236
1237 /* 1237 /*
1238 * Transfer 64-bit loop info to 32-bit compatibility structure in userspace 1238 * Transfer 64-bit loop info to 32-bit compatibility structure in userspace
1239 * - noinlined to reduce stack space usage in main part of driver 1239 * - noinlined to reduce stack space usage in main part of driver
1240 */ 1240 */
1241 static noinline int 1241 static noinline int
1242 loop_info64_to_compat(const struct loop_info64 *info64, 1242 loop_info64_to_compat(const struct loop_info64 *info64,
1243 struct compat_loop_info __user *arg) 1243 struct compat_loop_info __user *arg)
1244 { 1244 {
1245 struct compat_loop_info info; 1245 struct compat_loop_info info;
1246 1246
1247 memset(&info, 0, sizeof(info)); 1247 memset(&info, 0, sizeof(info));
1248 info.lo_number = info64->lo_number; 1248 info.lo_number = info64->lo_number;
1249 info.lo_device = info64->lo_device; 1249 info.lo_device = info64->lo_device;
1250 info.lo_inode = info64->lo_inode; 1250 info.lo_inode = info64->lo_inode;
1251 info.lo_rdevice = info64->lo_rdevice; 1251 info.lo_rdevice = info64->lo_rdevice;
1252 info.lo_offset = info64->lo_offset; 1252 info.lo_offset = info64->lo_offset;
1253 info.lo_encrypt_type = info64->lo_encrypt_type; 1253 info.lo_encrypt_type = info64->lo_encrypt_type;
1254 info.lo_encrypt_key_size = info64->lo_encrypt_key_size; 1254 info.lo_encrypt_key_size = info64->lo_encrypt_key_size;
1255 info.lo_flags = info64->lo_flags; 1255 info.lo_flags = info64->lo_flags;
1256 info.lo_init[0] = info64->lo_init[0]; 1256 info.lo_init[0] = info64->lo_init[0];
1257 info.lo_init[1] = info64->lo_init[1]; 1257 info.lo_init[1] = info64->lo_init[1];
1258 if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI) 1258 if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
1259 memcpy(info.lo_name, info64->lo_crypt_name, LO_NAME_SIZE); 1259 memcpy(info.lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
1260 else 1260 else
1261 memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE); 1261 memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE);
1262 memcpy(info.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE); 1262 memcpy(info.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
1263 1263
1264 /* error in case values were truncated */ 1264 /* error in case values were truncated */
1265 if (info.lo_device != info64->lo_device || 1265 if (info.lo_device != info64->lo_device ||
1266 info.lo_rdevice != info64->lo_rdevice || 1266 info.lo_rdevice != info64->lo_rdevice ||
1267 info.lo_inode != info64->lo_inode || 1267 info.lo_inode != info64->lo_inode ||
1268 info.lo_offset != info64->lo_offset || 1268 info.lo_offset != info64->lo_offset ||
1269 info.lo_init[0] != info64->lo_init[0] || 1269 info.lo_init[0] != info64->lo_init[0] ||
1270 info.lo_init[1] != info64->lo_init[1]) 1270 info.lo_init[1] != info64->lo_init[1])
1271 return -EOVERFLOW; 1271 return -EOVERFLOW;
1272 1272
1273 if (copy_to_user(arg, &info, sizeof(info))) 1273 if (copy_to_user(arg, &info, sizeof(info)))
1274 return -EFAULT; 1274 return -EFAULT;
1275 return 0; 1275 return 0;
1276 } 1276 }
1277 1277
1278 static int 1278 static int
1279 loop_set_status_compat(struct loop_device *lo, 1279 loop_set_status_compat(struct loop_device *lo,
1280 const struct compat_loop_info __user *arg) 1280 const struct compat_loop_info __user *arg)
1281 { 1281 {
1282 struct loop_info64 info64; 1282 struct loop_info64 info64;
1283 int ret; 1283 int ret;
1284 1284
1285 ret = loop_info64_from_compat(arg, &info64); 1285 ret = loop_info64_from_compat(arg, &info64);
1286 if (ret < 0) 1286 if (ret < 0)
1287 return ret; 1287 return ret;
1288 return loop_set_status(lo, &info64); 1288 return loop_set_status(lo, &info64);
1289 } 1289 }
1290 1290
1291 static int 1291 static int
1292 loop_get_status_compat(struct loop_device *lo, 1292 loop_get_status_compat(struct loop_device *lo,
1293 struct compat_loop_info __user *arg) 1293 struct compat_loop_info __user *arg)
1294 { 1294 {
1295 struct loop_info64 info64; 1295 struct loop_info64 info64;
1296 int err = 0; 1296 int err = 0;
1297 1297
1298 if (!arg) 1298 if (!arg)
1299 err = -EINVAL; 1299 err = -EINVAL;
1300 if (!err) 1300 if (!err)
1301 err = loop_get_status(lo, &info64); 1301 err = loop_get_status(lo, &info64);
1302 if (!err) 1302 if (!err)
1303 err = loop_info64_to_compat(&info64, arg); 1303 err = loop_info64_to_compat(&info64, arg);
1304 return err; 1304 return err;
1305 } 1305 }
1306 1306
1307 static long lo_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 1307 static long lo_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1308 { 1308 {
1309 struct inode *inode = file->f_path.dentry->d_inode; 1309 struct inode *inode = file->f_path.dentry->d_inode;
1310 struct loop_device *lo = inode->i_bdev->bd_disk->private_data; 1310 struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
1311 int err; 1311 int err;
1312 1312
1313 lock_kernel(); 1313 lock_kernel();
1314 switch(cmd) { 1314 switch(cmd) {
1315 case LOOP_SET_STATUS: 1315 case LOOP_SET_STATUS:
1316 mutex_lock(&lo->lo_ctl_mutex); 1316 mutex_lock(&lo->lo_ctl_mutex);
1317 err = loop_set_status_compat( 1317 err = loop_set_status_compat(
1318 lo, (const struct compat_loop_info __user *) arg); 1318 lo, (const struct compat_loop_info __user *) arg);
1319 mutex_unlock(&lo->lo_ctl_mutex); 1319 mutex_unlock(&lo->lo_ctl_mutex);
1320 break; 1320 break;
1321 case LOOP_GET_STATUS: 1321 case LOOP_GET_STATUS:
1322 mutex_lock(&lo->lo_ctl_mutex); 1322 mutex_lock(&lo->lo_ctl_mutex);
1323 err = loop_get_status_compat( 1323 err = loop_get_status_compat(
1324 lo, (struct compat_loop_info __user *) arg); 1324 lo, (struct compat_loop_info __user *) arg);
1325 mutex_unlock(&lo->lo_ctl_mutex); 1325 mutex_unlock(&lo->lo_ctl_mutex);
1326 break; 1326 break;
1327 case LOOP_CLR_FD: 1327 case LOOP_CLR_FD:
1328 case LOOP_GET_STATUS64: 1328 case LOOP_GET_STATUS64:
1329 case LOOP_SET_STATUS64: 1329 case LOOP_SET_STATUS64:
1330 arg = (unsigned long) compat_ptr(arg); 1330 arg = (unsigned long) compat_ptr(arg);
1331 case LOOP_SET_FD: 1331 case LOOP_SET_FD:
1332 case LOOP_CHANGE_FD: 1332 case LOOP_CHANGE_FD:
1333 err = lo_ioctl(inode, file, cmd, arg); 1333 err = lo_ioctl(inode, file, cmd, arg);
1334 break; 1334 break;
1335 default: 1335 default:
1336 err = -ENOIOCTLCMD; 1336 err = -ENOIOCTLCMD;
1337 break; 1337 break;
1338 } 1338 }
1339 unlock_kernel(); 1339 unlock_kernel();
1340 return err; 1340 return err;
1341 } 1341 }
1342 #endif 1342 #endif
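
Note the deliberate fallthrough in lo_compat_ioctl() above: the pointer-carrying commands first normalize the 32-bit user pointer with compat_ptr() and then fall into the same lo_ioctl() call as the fd-carrying commands, which pass their argument through untouched. A standalone sketch of that dispatch shape (the command values and normalize() are illustrative, not from the driver):

    #include <stdio.h>

    enum { CMD_TAKES_PTR = 1, CMD_TAKES_FD = 2 };

    static unsigned long normalize(unsigned long arg)
    {
            return arg & 0xffffffffUL;      /* stand-in for compat_ptr() */
    }

    static int dispatch(unsigned int cmd, unsigned long arg)
    {
            switch (cmd) {
            case CMD_TAKES_PTR:
                    arg = normalize(arg);
                    /* fall through */
            case CMD_TAKES_FD:
                    printf("native handler sees %#lx\n", arg);
                    return 0;
            default:
                    return -1;
            }
    }

    int main(void)
    {
            return dispatch(CMD_TAKES_PTR, 0xdeadbeefUL);
    }
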
1343 1343
1344 static int lo_open(struct inode *inode, struct file *file) 1344 static int lo_open(struct inode *inode, struct file *file)
1345 { 1345 {
1346 struct loop_device *lo = inode->i_bdev->bd_disk->private_data; 1346 struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
1347 1347
1348 mutex_lock(&lo->lo_ctl_mutex); 1348 mutex_lock(&lo->lo_ctl_mutex);
1349 lo->lo_refcnt++; 1349 lo->lo_refcnt++;
1350 mutex_unlock(&lo->lo_ctl_mutex); 1350 mutex_unlock(&lo->lo_ctl_mutex);
1351 1351
1352 return 0; 1352 return 0;
1353 } 1353 }
1354 1354
1355 static int lo_release(struct inode *inode, struct file *file) 1355 static int lo_release(struct inode *inode, struct file *file)
1356 { 1356 {
1357 struct loop_device *lo = inode->i_bdev->bd_disk->private_data; 1357 struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
1358 1358
1359 mutex_lock(&lo->lo_ctl_mutex); 1359 mutex_lock(&lo->lo_ctl_mutex);
1360 --lo->lo_refcnt; 1360 --lo->lo_refcnt;
1361 mutex_unlock(&lo->lo_ctl_mutex); 1361 mutex_unlock(&lo->lo_ctl_mutex);
1362 1362
1363 return 0; 1363 return 0;
1364 } 1364 }
1365 1365
1366 static struct block_device_operations lo_fops = { 1366 static struct block_device_operations lo_fops = {
1367 .owner = THIS_MODULE, 1367 .owner = THIS_MODULE,
1368 .open = lo_open, 1368 .open = lo_open,
1369 .release = lo_release, 1369 .release = lo_release,
1370 .ioctl = lo_ioctl, 1370 .ioctl = lo_ioctl,
1371 #ifdef CONFIG_COMPAT 1371 #ifdef CONFIG_COMPAT
1372 .compat_ioctl = lo_compat_ioctl, 1372 .compat_ioctl = lo_compat_ioctl,
1373 #endif 1373 #endif
1374 }; 1374 };
1375 1375
1376 /* 1376 /*
1377 * And now the modules code and kernel interface. 1377 * And now the modules code and kernel interface.
1378 */ 1378 */
1379 static int max_loop; 1379 static int max_loop;
1380 module_param(max_loop, int, 0); 1380 module_param(max_loop, int, 0);
1381 MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); 1381 MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
1382 MODULE_LICENSE("GPL"); 1382 MODULE_LICENSE("GPL");
1383 MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); 1383 MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);
1384 1384
1385 int loop_register_transfer(struct loop_func_table *funcs) 1385 int loop_register_transfer(struct loop_func_table *funcs)
1386 { 1386 {
1387 unsigned int n = funcs->number; 1387 unsigned int n = funcs->number;
1388 1388
1389 if (n >= MAX_LO_CRYPT || xfer_funcs[n]) 1389 if (n >= MAX_LO_CRYPT || xfer_funcs[n])
1390 return -EINVAL; 1390 return -EINVAL;
1391 xfer_funcs[n] = funcs; 1391 xfer_funcs[n] = funcs;
1392 return 0; 1392 return 0;
1393 } 1393 }
1394 1394
1395 int loop_unregister_transfer(int number) 1395 int loop_unregister_transfer(int number)
1396 { 1396 {
1397 unsigned int n = number; 1397 unsigned int n = number;
1398 struct loop_device *lo; 1398 struct loop_device *lo;
1399 struct loop_func_table *xfer; 1399 struct loop_func_table *xfer;
1400 1400
1401 if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL) 1401 if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL)
1402 return -EINVAL; 1402 return -EINVAL;
1403 1403
1404 xfer_funcs[n] = NULL; 1404 xfer_funcs[n] = NULL;
1405 1405
1406 list_for_each_entry(lo, &loop_devices, lo_list) { 1406 list_for_each_entry(lo, &loop_devices, lo_list) {
1407 mutex_lock(&lo->lo_ctl_mutex); 1407 mutex_lock(&lo->lo_ctl_mutex);
1408 1408
1409 if (lo->lo_encryption == xfer) 1409 if (lo->lo_encryption == xfer)
1410 loop_release_xfer(lo); 1410 loop_release_xfer(lo);
1411 1411
1412 mutex_unlock(&lo->lo_ctl_mutex); 1412 mutex_unlock(&lo->lo_ctl_mutex);
1413 } 1413 }
1414 1414
1415 return 0; 1415 return 0;
1416 } 1416 }
1417 1417
1418 EXPORT_SYMBOL(loop_register_transfer); 1418 EXPORT_SYMBOL(loop_register_transfer);
1419 EXPORT_SYMBOL(loop_unregister_transfer); 1419 EXPORT_SYMBOL(loop_unregister_transfer);
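
These two exports are the hook used by out-of-tree transfer modules such as cryptoloop. A hedged sketch of a module plugging into the table; the no-op transfer and the slot choice are illustrative only, and a real module must pick a free slot below MAX_LO_CRYPT and actually move the data between the pages:

    #include <linux/module.h>
    #include <linux/loop.h>

    static int noop_transfer(struct loop_device *lo, int cmd,
                             struct page *raw_page, unsigned raw_off,
                             struct page *loop_page, unsigned loop_off,
                             int size, sector_t real_block)
    {
            return 0;       /* a real transfer copies/encrypts the data */
    }

    static struct loop_func_table noop_funcs = {
            .number   = LO_CRYPT_CRYPTOAPI, /* must be an unused slot */
            .transfer = noop_transfer,
            .owner    = THIS_MODULE,
    };

    static int __init noop_init(void)
    {
            return loop_register_transfer(&noop_funcs);
    }

    static void __exit noop_exit(void)
    {
            loop_unregister_transfer(LO_CRYPT_CRYPTOAPI);
    }

    module_init(noop_init);
    module_exit(noop_exit);
    MODULE_LICENSE("GPL");
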
1420 1420
1421 static struct loop_device *loop_alloc(int i) 1421 static struct loop_device *loop_alloc(int i)
1422 { 1422 {
1423 struct loop_device *lo; 1423 struct loop_device *lo;
1424 struct gendisk *disk; 1424 struct gendisk *disk;
1425 1425
1426 lo = kzalloc(sizeof(*lo), GFP_KERNEL); 1426 lo = kzalloc(sizeof(*lo), GFP_KERNEL);
1427 if (!lo) 1427 if (!lo)
1428 goto out; 1428 goto out;
1429 1429
1430 lo->lo_queue = blk_alloc_queue(GFP_KERNEL); 1430 lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
1431 if (!lo->lo_queue) 1431 if (!lo->lo_queue)
1432 goto out_free_dev; 1432 goto out_free_dev;
1433 1433
1434 disk = lo->lo_disk = alloc_disk(1); 1434 disk = lo->lo_disk = alloc_disk(1);
1435 if (!disk) 1435 if (!disk)
1436 goto out_free_queue; 1436 goto out_free_queue;
1437 1437
1438 mutex_init(&lo->lo_ctl_mutex); 1438 mutex_init(&lo->lo_ctl_mutex);
1439 lo->lo_number = i; 1439 lo->lo_number = i;
1440 lo->lo_thread = NULL; 1440 lo->lo_thread = NULL;
1441 init_waitqueue_head(&lo->lo_event); 1441 init_waitqueue_head(&lo->lo_event);
1442 spin_lock_init(&lo->lo_lock); 1442 spin_lock_init(&lo->lo_lock);
1443 disk->major = LOOP_MAJOR; 1443 disk->major = LOOP_MAJOR;
1444 disk->first_minor = i; 1444 disk->first_minor = i;
1445 disk->fops = &lo_fops; 1445 disk->fops = &lo_fops;
1446 disk->private_data = lo; 1446 disk->private_data = lo;
1447 disk->queue = lo->lo_queue; 1447 disk->queue = lo->lo_queue;
1448 sprintf(disk->disk_name, "loop%d", i); 1448 sprintf(disk->disk_name, "loop%d", i);
1449 return lo; 1449 return lo;
1450 1450
1451 out_free_queue: 1451 out_free_queue:
1452 blk_cleanup_queue(lo->lo_queue); 1452 blk_cleanup_queue(lo->lo_queue);
1453 out_free_dev: 1453 out_free_dev:
1454 kfree(lo); 1454 kfree(lo);
1455 out: 1455 out:
1456 return NULL; 1456 return NULL;
1457 } 1457 }
1458 1458
1459 static void loop_free(struct loop_device *lo) 1459 static void loop_free(struct loop_device *lo)
1460 { 1460 {
1461 blk_cleanup_queue(lo->lo_queue); 1461 blk_cleanup_queue(lo->lo_queue);
1462 put_disk(lo->lo_disk); 1462 put_disk(lo->lo_disk);
1463 list_del(&lo->lo_list); 1463 list_del(&lo->lo_list);
1464 kfree(lo); 1464 kfree(lo);
1465 } 1465 }
1466 1466
1467 static struct loop_device *loop_init_one(int i) 1467 static struct loop_device *loop_init_one(int i)
1468 { 1468 {
1469 struct loop_device *lo; 1469 struct loop_device *lo;
1470 1470
1471 list_for_each_entry(lo, &loop_devices, lo_list) { 1471 list_for_each_entry(lo, &loop_devices, lo_list) {
1472 if (lo->lo_number == i) 1472 if (lo->lo_number == i)
1473 return lo; 1473 return lo;
1474 } 1474 }
1475 1475
1476 lo = loop_alloc(i); 1476 lo = loop_alloc(i);
1477 if (lo) { 1477 if (lo) {
1478 add_disk(lo->lo_disk); 1478 add_disk(lo->lo_disk);
1479 list_add_tail(&lo->lo_list, &loop_devices); 1479 list_add_tail(&lo->lo_list, &loop_devices);
1480 } 1480 }
1481 return lo; 1481 return lo;
1482 } 1482 }
1483 1483
1484 static void loop_del_one(struct loop_device *lo) 1484 static void loop_del_one(struct loop_device *lo)
1485 { 1485 {
1486 del_gendisk(lo->lo_disk); 1486 del_gendisk(lo->lo_disk);
1487 loop_free(lo); 1487 loop_free(lo);
1488 } 1488 }
1489 1489
1490 static struct kobject *loop_probe(dev_t dev, int *part, void *data) 1490 static struct kobject *loop_probe(dev_t dev, int *part, void *data)
1491 { 1491 {
1492 struct loop_device *lo; 1492 struct loop_device *lo;
1493 struct kobject *kobj; 1493 struct kobject *kobj;
1494 1494
1495 mutex_lock(&loop_devices_mutex); 1495 mutex_lock(&loop_devices_mutex);
1496 lo = loop_init_one(dev & MINORMASK); 1496 lo = loop_init_one(dev & MINORMASK);
1497 kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM); 1497 kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM);
1498 mutex_unlock(&loop_devices_mutex); 1498 mutex_unlock(&loop_devices_mutex);
1499 1499
1500 *part = 0; 1500 *part = 0;
1501 return kobj; 1501 return kobj;
1502 } 1502 }
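
loop_probe() is what makes the on-demand instantiation described in the comment inside loop_init() below work: opening a loop node whose minor has no device yet lands here via blk_register_region(). A userspace sketch (LOOP_MAJOR is 7; with the default of 8 devices, minor 8 is the first that does not pre-exist):

    #include <fcntl.h>
    #include <sys/stat.h>
    #include <sys/sysmacros.h>
    #include <unistd.h>

    int open_ninth_loop(void)
    {
            /* Creating the node and opening it triggers loop_probe(). */
            mknod("/dev/loop8", S_IFBLK | 0600, makedev(7, 8));
            return open("/dev/loop8", O_RDWR);
    }
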

1503 1503
1504 static int __init loop_init(void) 1504 static int __init loop_init(void)
1505 { 1505 {
1506 int i, nr; 1506 int i, nr;
1507 unsigned long range; 1507 unsigned long range;
1508 struct loop_device *lo, *next; 1508 struct loop_device *lo, *next;
1509 1509
1510 /* 1510 /*
1511 * The loop module can now instantiate the underlying device 1511 * The loop module can now instantiate the underlying device
1512 * structure on demand, provided that a dev node to access it exists. 1512 * structure on demand, provided that a dev node to access it exists.
1513 * However, this will not work well with user space tools that don't 1513 * However, this will not work well with user space tools that don't
1514 * know about this "feature". In order not to break any existing 1514 * know about this "feature". In order not to break any existing
1515 * tools, we do the following: 1515 * tools, we do the following:
1516 * 1516 *
1517 * (1) if max_loop is specified, create that many devices upfront; this 1517 * (1) if max_loop is specified, create that many devices upfront; this
1518 * also becomes a hard limit. 1518 * also becomes a hard limit.
1519 * (2) if max_loop is not specified, create 8 loop devices on module 1519 * (2) if max_loop is not specified, create 8 loop devices on module
1520 * load; users can further extend the set by creating dev nodes 1520 * load; users can further extend the set by creating dev nodes
1521 * themselves and having the kernel automatically instantiate the 1521 * themselves and having the kernel automatically instantiate the
1522 * actual device on demand. 1522 * actual device on demand.
1523 */ 1523 */
1524 if (max_loop > 1UL << MINORBITS) 1524 if (max_loop > 1UL << MINORBITS)
1525 return -EINVAL; 1525 return -EINVAL;
1526 1526
1527 if (max_loop) { 1527 if (max_loop) {
1528 nr = max_loop; 1528 nr = max_loop;
1529 range = max_loop; 1529 range = max_loop;
1530 } else { 1530 } else {
1531 nr = 8; 1531 nr = 8;
1532 range = 1UL << MINORBITS; 1532 range = 1UL << MINORBITS;
1533 } 1533 }
1534 1534
1535 if (register_blkdev(LOOP_MAJOR, "loop")) 1535 if (register_blkdev(LOOP_MAJOR, "loop"))
1536 return -EIO; 1536 return -EIO;
1537 1537
1538 for (i = 0; i < nr; i++) { 1538 for (i = 0; i < nr; i++) {
1539 lo = loop_alloc(i); 1539 lo = loop_alloc(i);
1540 if (!lo) 1540 if (!lo)
1541 goto Enomem; 1541 goto Enomem;
1542 list_add_tail(&lo->lo_list, &loop_devices); 1542 list_add_tail(&lo->lo_list, &loop_devices);
1543 } 1543 }
1544 1544
1545 /* point of no return */ 1545 /* point of no return */
1546 1546
1547 list_for_each_entry(lo, &loop_devices, lo_list) 1547 list_for_each_entry(lo, &loop_devices, lo_list)
1548 add_disk(lo->lo_disk); 1548 add_disk(lo->lo_disk);
1549 1549
1550 blk_register_region(MKDEV(LOOP_MAJOR, 0), range, 1550 blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
1551 THIS_MODULE, loop_probe, NULL, NULL); 1551 THIS_MODULE, loop_probe, NULL, NULL);
1552 1552
1553 printk(KERN_INFO "loop: module loaded\n"); 1553 printk(KERN_INFO "loop: module loaded\n");
1554 return 0; 1554 return 0;
1555 1555
1556 Enomem: 1556 Enomem:
1557 printk(KERN_INFO "loop: out of memory\n"); 1557 printk(KERN_INFO "loop: out of memory\n");
1558 1558
1559 list_for_each_entry_safe(lo, next, &loop_devices, lo_list) 1559 list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
1560 loop_free(lo); 1560 loop_free(lo);
1561 1561
1562 unregister_blkdev(LOOP_MAJOR, "loop"); 1562 unregister_blkdev(LOOP_MAJOR, "loop");
1563 return -ENOMEM; 1563 return -ENOMEM;
1564 } 1564 }
1565 1565
1566 static void __exit loop_exit(void) 1566 static void __exit loop_exit(void)
1567 { 1567 {
1568 unsigned long range; 1568 unsigned long range;
1569 struct loop_device *lo, *next; 1569 struct loop_device *lo, *next;
1570 1570
1571 range = max_loop ? max_loop : 1UL << MINORBITS; 1571 range = max_loop ? max_loop : 1UL << MINORBITS;
1572 1572
1573 list_for_each_entry_safe(lo, next, &loop_devices, lo_list) 1573 list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
1574 loop_del_one(lo); 1574 loop_del_one(lo);
1575 1575
1576 blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); 1576 blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
1577 if (unregister_blkdev(LOOP_MAJOR, "loop")) 1577 if (unregister_blkdev(LOOP_MAJOR, "loop"))
1578 printk(KERN_WARNING "loop: cannot unregister blkdev\n"); 1578 printk(KERN_WARNING "loop: cannot unregister blkdev\n");
1579 } 1579 }
1580 1580
1581 module_init(loop_init); 1581 module_init(loop_init);
1582 module_exit(loop_exit); 1582 module_exit(loop_exit);
1583 1583
1584 #ifndef MODULE 1584 #ifndef MODULE
1585 static int __init max_loop_setup(char *str) 1585 static int __init max_loop_setup(char *str)
1586 { 1586 {
1587 max_loop = simple_strtol(str, NULL, 0); 1587 max_loop = simple_strtol(str, NULL, 0);
1588 return 1; 1588 return 1;
1589 } 1589 }
1590 1590
1591 __setup("max_loop=", max_loop_setup); 1591 __setup("max_loop=", max_loop_setup);
1592 #endif 1592 #endif
1593 1593
fs/nfsd/vfs.c
1 #define MSNFS /* HACK HACK */ 1 #define MSNFS /* HACK HACK */
2 /* 2 /*
3 * linux/fs/nfsd/vfs.c 3 * linux/fs/nfsd/vfs.c
4 * 4 *
5 * File operations used by nfsd. Some of these have been ripped from 5 * File operations used by nfsd. Some of these have been ripped from
6 * other parts of the kernel because they weren't exported, others 6 * other parts of the kernel because they weren't exported, others
7 * are partial duplicates with added or changed functionality. 7 * are partial duplicates with added or changed functionality.
8 * 8 *
9 * Note that several functions dget() the dentry upon which they want 9 * Note that several functions dget() the dentry upon which they want
10 * to act, most notably those that create directory entries. Response 10 * to act, most notably those that create directory entries. Response
11 * dentries are dput()'d if necessary in the release callback. 11 * dentries are dput()'d if necessary in the release callback.
12 * So if you notice code paths that apparently fail to dput() the 12 * So if you notice code paths that apparently fail to dput() the
13 * dentry, don't worry--they have been taken care of. 13 * dentry, don't worry--they have been taken care of.
14 * 14 *
15 * Copyright (C) 1995-1999 Olaf Kirch <okir@monad.swb.de> 15 * Copyright (C) 1995-1999 Olaf Kirch <okir@monad.swb.de>
16 * Zerocopy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp> 16 * Zerocopy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp>
17 */ 17 */
18 18
19 #include <linux/string.h> 19 #include <linux/string.h>
20 #include <linux/time.h> 20 #include <linux/time.h>
21 #include <linux/errno.h> 21 #include <linux/errno.h>
22 #include <linux/fs.h> 22 #include <linux/fs.h>
23 #include <linux/file.h> 23 #include <linux/file.h>
24 #include <linux/mount.h> 24 #include <linux/mount.h>
25 #include <linux/major.h> 25 #include <linux/major.h>
26 #include <linux/splice.h> 26 #include <linux/splice.h>
27 #include <linux/proc_fs.h> 27 #include <linux/proc_fs.h>
28 #include <linux/stat.h> 28 #include <linux/stat.h>
29 #include <linux/fcntl.h> 29 #include <linux/fcntl.h>
30 #include <linux/net.h> 30 #include <linux/net.h>
31 #include <linux/unistd.h> 31 #include <linux/unistd.h>
32 #include <linux/slab.h> 32 #include <linux/slab.h>
33 #include <linux/pagemap.h> 33 #include <linux/pagemap.h>
34 #include <linux/in.h> 34 #include <linux/in.h>
35 #include <linux/module.h> 35 #include <linux/module.h>
36 #include <linux/namei.h> 36 #include <linux/namei.h>
37 #include <linux/vfs.h> 37 #include <linux/vfs.h>
38 #include <linux/delay.h> 38 #include <linux/delay.h>
39 #include <linux/sunrpc/svc.h> 39 #include <linux/sunrpc/svc.h>
40 #include <linux/nfsd/nfsd.h> 40 #include <linux/nfsd/nfsd.h>
41 #ifdef CONFIG_NFSD_V3 41 #ifdef CONFIG_NFSD_V3
42 #include <linux/nfs3.h> 42 #include <linux/nfs3.h>
43 #include <linux/nfsd/xdr3.h> 43 #include <linux/nfsd/xdr3.h>
44 #endif /* CONFIG_NFSD_V3 */ 44 #endif /* CONFIG_NFSD_V3 */
45 #include <linux/nfsd/nfsfh.h> 45 #include <linux/nfsd/nfsfh.h>
46 #include <linux/quotaops.h> 46 #include <linux/quotaops.h>
47 #include <linux/fsnotify.h> 47 #include <linux/fsnotify.h>
48 #include <linux/posix_acl.h> 48 #include <linux/posix_acl.h>
49 #include <linux/posix_acl_xattr.h> 49 #include <linux/posix_acl_xattr.h>
50 #include <linux/xattr.h> 50 #include <linux/xattr.h>
51 #ifdef CONFIG_NFSD_V4 51 #ifdef CONFIG_NFSD_V4
52 #include <linux/nfs4.h> 52 #include <linux/nfs4.h>
53 #include <linux/nfs4_acl.h> 53 #include <linux/nfs4_acl.h>
54 #include <linux/nfsd_idmap.h> 54 #include <linux/nfsd_idmap.h>
55 #include <linux/security.h> 55 #include <linux/security.h>
56 #endif /* CONFIG_NFSD_V4 */ 56 #endif /* CONFIG_NFSD_V4 */
57 #include <linux/jhash.h> 57 #include <linux/jhash.h>
58 58
59 #include <asm/uaccess.h> 59 #include <asm/uaccess.h>
60 60
61 #define NFSDDBG_FACILITY NFSDDBG_FILEOP 61 #define NFSDDBG_FACILITY NFSDDBG_FILEOP
62 62
63 63
64 /* We must ignore files (but only files) which might have mandatory 64 /* We must ignore files (but only files) which might have mandatory
65 * locks on them because there is no way to know if the accessor has 65 * locks on them because there is no way to know if the accessor has
66 * the lock. 66 * the lock.
67 */ 67 */
68 #define IS_ISMNDLK(i) (S_ISREG((i)->i_mode) && MANDATORY_LOCK(i)) 68 #define IS_ISMNDLK(i) (S_ISREG((i)->i_mode) && MANDATORY_LOCK(i))
69 69
70 /* 70 /*
71 * This is a cache of readahead params that help us choose the proper 71 * This is a cache of readahead params that help us choose the proper
72 * readahead strategy. Initially, we set all readahead parameters to 0 72 * readahead strategy. Initially, we set all readahead parameters to 0
73 * and let the VFS handle things. 73 * and let the VFS handle things.
74 * If you increase the number of cached files very much, you'll need to 74 * If you increase the number of cached files very much, you'll need to
75 * add a hash table here. 75 * add a hash table here.
76 */ 76 */
77 struct raparms { 77 struct raparms {
78 struct raparms *p_next; 78 struct raparms *p_next;
79 unsigned int p_count; 79 unsigned int p_count;
80 ino_t p_ino; 80 ino_t p_ino;
81 dev_t p_dev; 81 dev_t p_dev;
82 int p_set; 82 int p_set;
83 struct file_ra_state p_ra; 83 struct file_ra_state p_ra;
84 unsigned int p_hindex; 84 unsigned int p_hindex;
85 }; 85 };
86 86
87 struct raparm_hbucket { 87 struct raparm_hbucket {
88 struct raparms *pb_head; 88 struct raparms *pb_head;
89 spinlock_t pb_lock; 89 spinlock_t pb_lock;
90 } ____cacheline_aligned_in_smp; 90 } ____cacheline_aligned_in_smp;
91 91
92 static struct raparms * raparml; 92 static struct raparms * raparml;
93 #define RAPARM_HASH_BITS 4 93 #define RAPARM_HASH_BITS 4
94 #define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) 94 #define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS)
95 #define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) 95 #define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
96 static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; 96 static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE];
97 97
98 /* 98 /*
99 * Called from nfsd_lookup and encode_dirent. Check if we have crossed 99 * Called from nfsd_lookup and encode_dirent. Check if we have crossed
100 * a mount point. 100 * a mount point.
101 * Returns -EAGAIN or -ETIMEDOUT leaving *dpp and *expp unchanged, 101 * Returns -EAGAIN or -ETIMEDOUT leaving *dpp and *expp unchanged,
102 * or nfs_ok having possibly changed *dpp and *expp 102 * or nfs_ok having possibly changed *dpp and *expp
103 */ 103 */
104 int 104 int
105 nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, 105 nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
106 struct svc_export **expp) 106 struct svc_export **expp)
107 { 107 {
108 struct svc_export *exp = *expp, *exp2 = NULL; 108 struct svc_export *exp = *expp, *exp2 = NULL;
109 struct dentry *dentry = *dpp; 109 struct dentry *dentry = *dpp;
110 struct vfsmount *mnt = mntget(exp->ex_mnt); 110 struct vfsmount *mnt = mntget(exp->ex_mnt);
111 struct dentry *mounts = dget(dentry); 111 struct dentry *mounts = dget(dentry);
112 int err = 0; 112 int err = 0;
113 113
114 while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)); 114 while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
115 115
116 exp2 = exp_get_by_name(exp->ex_client, mnt, mounts, &rqstp->rq_chandle); 116 exp2 = exp_get_by_name(exp->ex_client, mnt, mounts, &rqstp->rq_chandle);
117 if (IS_ERR(exp2)) { 117 if (IS_ERR(exp2)) {
118 err = PTR_ERR(exp2); 118 err = PTR_ERR(exp2);
119 dput(mounts); 119 dput(mounts);
120 mntput(mnt); 120 mntput(mnt);
121 goto out; 121 goto out;
122 } 122 }
123 if (exp2 && ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2))) { 123 if (exp2 && ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2))) {
124 /* successfully crossed mount point */ 124 /* successfully crossed mount point */
125 exp_put(exp); 125 exp_put(exp);
126 *expp = exp2; 126 *expp = exp2;
127 dput(dentry); 127 dput(dentry);
128 *dpp = mounts; 128 *dpp = mounts;
129 } else { 129 } else {
130 if (exp2) exp_put(exp2); 130 if (exp2) exp_put(exp2);
131 dput(mounts); 131 dput(mounts);
132 } 132 }
133 mntput(mnt); 133 mntput(mnt);
134 out: 134 out:
135 return err; 135 return err;
136 } 136 }
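
The follow_down() loop above walks to the top of any stack of mounts on the directory; the crossing is then accepted only under the policy restated below. A standalone sketch (the flag bit is illustrative; EX_NOHIDE() plays the child_nohide role in the real code):

    #define CROSSMNT 0x1    /* stand-in for NFSEXP_CROSSMOUNT */

    /* Crossing is allowed if the parent export opts in for all of its
     * submounts, or the child export is itself marked nohide. */
    static int may_cross(unsigned int parent_flags, int child_nohide)
    {
            return (parent_flags & CROSSMNT) || child_nohide;
    }
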
137 137
138 /* 138 /*
139 * Look up one component of a pathname. 139 * Look up one component of a pathname.
140 * N.B. After this call _both_ fhp and resfh need an fh_put 140 * N.B. After this call _both_ fhp and resfh need an fh_put
141 * 141 *
142 * If the lookup would cross a mountpoint, and the mounted filesystem 142 * If the lookup would cross a mountpoint, and the mounted filesystem
143 * is exported to the client with NFSEXP_NOHIDE, then the lookup is 143 * is exported to the client with NFSEXP_NOHIDE, then the lookup is
144 * accepted as it stands and the mounted directory is 144 * accepted as it stands and the mounted directory is
145 * returned. Otherwise the covered directory is returned. 145 * returned. Otherwise the covered directory is returned.
146 * NOTE: this mountpoint crossing is not supported properly by all 146 * NOTE: this mountpoint crossing is not supported properly by all
147 * clients and is explicitly disallowed for NFSv3 147 * clients and is explicitly disallowed for NFSv3
148 * NeilBrown <neilb@cse.unsw.edu.au> 148 * NeilBrown <neilb@cse.unsw.edu.au>
149 */ 149 */
150 __be32 150 __be32
151 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, 151 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
152 int len, struct svc_fh *resfh) 152 int len, struct svc_fh *resfh)
153 { 153 {
154 struct svc_export *exp; 154 struct svc_export *exp;
155 struct dentry *dparent; 155 struct dentry *dparent;
156 struct dentry *dentry; 156 struct dentry *dentry;
157 __be32 err; 157 __be32 err;
158 int host_err; 158 int host_err;
159 159
160 dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); 160 dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
161 161
162 /* Obtain dentry and export. */ 162 /* Obtain dentry and export. */
163 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_EXEC); 163 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_EXEC);
164 if (err) 164 if (err)
165 return err; 165 return err;
166 166
167 dparent = fhp->fh_dentry; 167 dparent = fhp->fh_dentry;
168 exp = fhp->fh_export; 168 exp = fhp->fh_export;
169 exp_get(exp); 169 exp_get(exp);
170 170
171 err = nfserr_acces; 171 err = nfserr_acces;
172 172
173 /* Lookup the name, but don't follow links */ 173 /* Lookup the name, but don't follow links */
174 if (isdotent(name, len)) { 174 if (isdotent(name, len)) {
175 if (len==1) 175 if (len==1)
176 dentry = dget(dparent); 176 dentry = dget(dparent);
177 else if (dparent != exp->ex_dentry) { 177 else if (dparent != exp->ex_dentry) {
178 dentry = dget_parent(dparent); 178 dentry = dget_parent(dparent);
179 } else if (!EX_NOHIDE(exp)) 179 } else if (!EX_NOHIDE(exp))
180 dentry = dget(dparent); /* .. == . just like at / */ 180 dentry = dget(dparent); /* .. == . just like at / */
181 else { 181 else {
182 /* checking mountpoint crossing is very different when stepping up */ 182 /* checking mountpoint crossing is very different when stepping up */
183 struct svc_export *exp2 = NULL; 183 struct svc_export *exp2 = NULL;
184 struct dentry *dp; 184 struct dentry *dp;
185 struct vfsmount *mnt = mntget(exp->ex_mnt); 185 struct vfsmount *mnt = mntget(exp->ex_mnt);
186 dentry = dget(dparent); 186 dentry = dget(dparent);
187 while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry)) 187 while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry))
188 ; 188 ;
189 dp = dget_parent(dentry); 189 dp = dget_parent(dentry);
190 dput(dentry); 190 dput(dentry);
191 dentry = dp; 191 dentry = dp;
192 192
193 exp2 = exp_parent(exp->ex_client, mnt, dentry, 193 exp2 = exp_parent(exp->ex_client, mnt, dentry,
194 &rqstp->rq_chandle); 194 &rqstp->rq_chandle);
195 if (IS_ERR(exp2)) { 195 if (IS_ERR(exp2)) {
196 host_err = PTR_ERR(exp2); 196 host_err = PTR_ERR(exp2);
197 dput(dentry); 197 dput(dentry);
198 mntput(mnt); 198 mntput(mnt);
199 goto out_nfserr; 199 goto out_nfserr;
200 } 200 }
201 if (!exp2) { 201 if (!exp2) {
202 dput(dentry); 202 dput(dentry);
203 dentry = dget(dparent); 203 dentry = dget(dparent);
204 } else { 204 } else {
205 exp_put(exp); 205 exp_put(exp);
206 exp = exp2; 206 exp = exp2;
207 } 207 }
208 mntput(mnt); 208 mntput(mnt);
209 } 209 }
210 } else { 210 } else {
211 fh_lock(fhp); 211 fh_lock(fhp);
212 dentry = lookup_one_len(name, dparent, len); 212 dentry = lookup_one_len(name, dparent, len);
213 host_err = PTR_ERR(dentry); 213 host_err = PTR_ERR(dentry);
214 if (IS_ERR(dentry)) 214 if (IS_ERR(dentry))
215 goto out_nfserr; 215 goto out_nfserr;
216 /* 216 /*
217 * check if we have crossed a mount point ... 217 * check if we have crossed a mount point ...
218 */ 218 */
219 if (d_mountpoint(dentry)) { 219 if (d_mountpoint(dentry)) {
220 if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { 220 if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
221 dput(dentry); 221 dput(dentry);
222 goto out_nfserr; 222 goto out_nfserr;
223 } 223 }
224 } 224 }
225 } 225 }
226 /* 226 /*
227 * Note: we compose the file handle now, but as the 227 * Note: we compose the file handle now, but as the
228 * dentry may be negative, it may need to be updated. 228 * dentry may be negative, it may need to be updated.
229 */ 229 */
230 err = fh_compose(resfh, exp, dentry, fhp); 230 err = fh_compose(resfh, exp, dentry, fhp);
231 if (!err && !dentry->d_inode) 231 if (!err && !dentry->d_inode)
232 err = nfserr_noent; 232 err = nfserr_noent;
233 dput(dentry); 233 dput(dentry);
234 out: 234 out:
235 exp_put(exp); 235 exp_put(exp);
236 return err; 236 return err;
237 237
238 out_nfserr: 238 out_nfserr:
239 err = nfserrno(host_err); 239 err = nfserrno(host_err);
240 goto out; 240 goto out;
241 } 241 }
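
isdotent(), defined elsewhere in nfsd, gates the special-cased branch above that handles "." and "..". A behavior-equivalent sketch of what it is assumed to test:

    /* True for the names "." and ".." (assumed semantics). */
    static int isdotent_sketch(const char *name, int len)
    {
            return name[0] == '.' &&
                   (len == 1 || (len == 2 && name[1] == '.'));
    }
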
242 242
243 /* 243 /*
244 * Set various file attributes. 244 * Set various file attributes.
245 * N.B. After this call fhp needs an fh_put 245 * N.B. After this call fhp needs an fh_put
246 */ 246 */
247 __be32 247 __be32
248 nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, 248 nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
249 int check_guard, time_t guardtime) 249 int check_guard, time_t guardtime)
250 { 250 {
251 struct dentry *dentry; 251 struct dentry *dentry;
252 struct inode *inode; 252 struct inode *inode;
253 int accmode = MAY_SATTR; 253 int accmode = MAY_SATTR;
254 int ftype = 0; 254 int ftype = 0;
255 int imode; 255 int imode;
256 __be32 err; 256 __be32 err;
257 int host_err; 257 int host_err;
258 int size_change = 0; 258 int size_change = 0;
259 259
260 if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) 260 if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
261 accmode |= MAY_WRITE|MAY_OWNER_OVERRIDE; 261 accmode |= MAY_WRITE|MAY_OWNER_OVERRIDE;
262 if (iap->ia_valid & ATTR_SIZE) 262 if (iap->ia_valid & ATTR_SIZE)
263 ftype = S_IFREG; 263 ftype = S_IFREG;
264 264
265 /* Get inode */ 265 /* Get inode */
266 err = fh_verify(rqstp, fhp, ftype, accmode); 266 err = fh_verify(rqstp, fhp, ftype, accmode);
267 if (err) 267 if (err)
268 goto out; 268 goto out;
269 269
270 dentry = fhp->fh_dentry; 270 dentry = fhp->fh_dentry;
271 inode = dentry->d_inode; 271 inode = dentry->d_inode;
272 272
273 /* Ignore any mode updates on symlinks */ 273 /* Ignore any mode updates on symlinks */
274 if (S_ISLNK(inode->i_mode)) 274 if (S_ISLNK(inode->i_mode))
275 iap->ia_valid &= ~ATTR_MODE; 275 iap->ia_valid &= ~ATTR_MODE;
276 276
277 if (!iap->ia_valid) 277 if (!iap->ia_valid)
278 goto out; 278 goto out;
279 279
280 /* NFSv2 does not differentiate between "set-[ac]time-to-now" 280 /* NFSv2 does not differentiate between "set-[ac]time-to-now"
281 * which only requires access, and "set-[ac]time-to-X" which 281 * which only requires access, and "set-[ac]time-to-X" which
282 * requires ownership. 282 * requires ownership.
283 * So if it looks like it might be "set both to the same time which 283 * So if it looks like it might be "set both to the same time which
284 * is close to now", and if inode_change_ok fails, then we 284 * is close to now", and if inode_change_ok fails, then we
285 * convert to "set to now" instead of "set to explicit time" 285 * convert to "set to now" instead of "set to explicit time"
286 * 286 *
287 * We only call inode_change_ok as the last test as technically 287 * We only call inode_change_ok as the last test as technically
288 * it is not an interface that we should be using. It is only 288 * it is not an interface that we should be using. It is only
289 * valid if the filesystem does not define its own i_op->setattr. 289 * valid if the filesystem does not define its own i_op->setattr.
290 */ 290 */
291 #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) 291 #define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET)
292 #define MAX_TOUCH_TIME_ERROR (30*60) 292 #define MAX_TOUCH_TIME_ERROR (30*60)
293 if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET 293 if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET
294 && iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec 294 && iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec
295 ) { 295 ) {
296 /* Looks probable. Now just make sure time is in the right ballpark. 296 /* Looks probable. Now just make sure time is in the right ballpark.
297 * Solaris, at least, doesn't seem to care what the time request is. 297 * Solaris, at least, doesn't seem to care what the time request is.
298 * We require it be within 30 minutes of now. 298 * We require it be within 30 minutes of now.
299 */ 299 */
300 time_t delta = iap->ia_atime.tv_sec - get_seconds(); 300 time_t delta = iap->ia_atime.tv_sec - get_seconds();
301 if (delta<0) delta = -delta; 301 if (delta<0) delta = -delta;
302 if (delta < MAX_TOUCH_TIME_ERROR && 302 if (delta < MAX_TOUCH_TIME_ERROR &&
303 inode_change_ok(inode, iap) != 0) { 303 inode_change_ok(inode, iap) != 0) {
304 /* turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME 304 /* turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME
305 * this will cause notify_change to set these times to "now" 305 * this will cause notify_change to set these times to "now"
306 */ 306 */
307 iap->ia_valid &= ~BOTH_TIME_SET; 307 iap->ia_valid &= ~BOTH_TIME_SET;
308 } 308 }
309 } 309 }
310 310
311 /* The size case is special. It changes the file as well as the attributes. */ 311 /* The size case is special. It changes the file as well as the attributes. */
312 if (iap->ia_valid & ATTR_SIZE) { 312 if (iap->ia_valid & ATTR_SIZE) {
313 if (iap->ia_size < inode->i_size) { 313 if (iap->ia_size < inode->i_size) {
314 err = nfsd_permission(fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE); 314 err = nfsd_permission(fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE);
315 if (err) 315 if (err)
316 goto out; 316 goto out;
317 } 317 }
318 318
319 /* 319 /*
320 * If we are changing the size of the file, then 320 * If we are changing the size of the file, then
321 * we need to break all leases. 321 * we need to break all leases.
322 */ 322 */
323 host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK); 323 host_err = break_lease(inode, FMODE_WRITE | O_NONBLOCK);
324 if (host_err == -EWOULDBLOCK) 324 if (host_err == -EWOULDBLOCK)
325 host_err = -ETIMEDOUT; 325 host_err = -ETIMEDOUT;
326 if (host_err) /* ENOMEM or EWOULDBLOCK */ 326 if (host_err) /* ENOMEM or EWOULDBLOCK */
327 goto out_nfserr; 327 goto out_nfserr;
328 328
329 host_err = get_write_access(inode); 329 host_err = get_write_access(inode);
330 if (host_err) 330 if (host_err)
331 goto out_nfserr; 331 goto out_nfserr;
332 332
333 size_change = 1; 333 size_change = 1;
334 host_err = locks_verify_truncate(inode, NULL, iap->ia_size); 334 host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
335 if (host_err) { 335 if (host_err) {
336 put_write_access(inode); 336 put_write_access(inode);
337 goto out_nfserr; 337 goto out_nfserr;
338 } 338 }
339 DQUOT_INIT(inode); 339 DQUOT_INIT(inode);
340 } 340 }
341 341
342 imode = inode->i_mode; 342 imode = inode->i_mode;
343 if (iap->ia_valid & ATTR_MODE) { 343 if (iap->ia_valid & ATTR_MODE) {
344 iap->ia_mode &= S_IALLUGO; 344 iap->ia_mode &= S_IALLUGO;
345 imode = iap->ia_mode |= (imode & ~S_IALLUGO); 345 imode = iap->ia_mode |= (imode & ~S_IALLUGO);
346 } 346 }
347 347
348 /* Revoke setuid/setgid bit on chown/chgrp */ 348 /* Revoke setuid/setgid bit on chown/chgrp */
349 if ((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) 349 if ((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid)
350 iap->ia_valid |= ATTR_KILL_SUID; 350 iap->ia_valid |= ATTR_KILL_SUID;
351 if ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid) 351 if ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)
352 iap->ia_valid |= ATTR_KILL_SGID; 352 iap->ia_valid |= ATTR_KILL_SGID;
353 353
354 /* Change the attributes. */ 354 /* Change the attributes. */
355 355
356 iap->ia_valid |= ATTR_CTIME; 356 iap->ia_valid |= ATTR_CTIME;
357 357
358 err = nfserr_notsync; 358 err = nfserr_notsync;
359 if (!check_guard || guardtime == inode->i_ctime.tv_sec) { 359 if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
360 fh_lock(fhp); 360 fh_lock(fhp);
361 host_err = notify_change(dentry, iap); 361 host_err = notify_change(dentry, iap);
362 err = nfserrno(host_err); 362 err = nfserrno(host_err);
363 fh_unlock(fhp); 363 fh_unlock(fhp);
364 } 364 }
365 if (size_change) 365 if (size_change)
366 put_write_access(inode); 366 put_write_access(inode);
367 if (!err) 367 if (!err)
368 if (EX_ISSYNC(fhp->fh_export)) 368 if (EX_ISSYNC(fhp->fh_export))
369 write_inode_now(inode, 1); 369 write_inode_now(inode, 1);
370 out: 370 out:
371 return err; 371 return err;
372 372
373 out_nfserr: 373 out_nfserr:
374 err = nfserrno(host_err); 374 err = nfserrno(host_err);
375 goto out; 375 goto out;
376 } 376 }
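
The BOTH_TIME_SET block above boils down to a small decision function. Restated standalone (perm_check_failed stands in for inode_change_ok() returning non-zero):

    #include <time.h>

    #define MAX_TOUCH_TIME_ERROR (30*60)

    static int treat_as_set_to_now(time_t requested, time_t now,
                                   int perm_check_failed)
    {
            time_t delta = requested - now;

            if (delta < 0)
                    delta = -delta;
            /* Same-time requests within 30 minutes of "now" fall back to
             * "set to now" when the explicit-time check is refused. */
            return delta < MAX_TOUCH_TIME_ERROR && perm_check_failed;
    }
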
377 377
378 #if defined(CONFIG_NFSD_V2_ACL) || \ 378 #if defined(CONFIG_NFSD_V2_ACL) || \
379 defined(CONFIG_NFSD_V3_ACL) || \ 379 defined(CONFIG_NFSD_V3_ACL) || \
380 defined(CONFIG_NFSD_V4) 380 defined(CONFIG_NFSD_V4)
381 static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) 381 static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
382 { 382 {
383 ssize_t buflen; 383 ssize_t buflen;
384 384
385 buflen = vfs_getxattr(dentry, key, NULL, 0); 385 buflen = vfs_getxattr(dentry, key, NULL, 0);
386 if (buflen <= 0) 386 if (buflen <= 0)
387 return buflen; 387 return buflen;
388 388
389 *buf = kmalloc(buflen, GFP_KERNEL); 389 *buf = kmalloc(buflen, GFP_KERNEL);
390 if (!*buf) 390 if (!*buf)
391 return -ENOMEM; 391 return -ENOMEM;
392 392
393 return vfs_getxattr(dentry, key, *buf, buflen); 393 return vfs_getxattr(dentry, key, *buf, buflen);
394 } 394 }
395 #endif 395 #endif
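
nfsd_getxattr() uses the usual probe-then-fetch xattr idiom: query the size with a NULL buffer, allocate, then fetch. Its userspace analog with getxattr(2) looks like this; as in the kernel helper, the attribute can change between the two calls, so the second one may still fail or return a different length:

    #include <stdlib.h>
    #include <sys/xattr.h>

    ssize_t read_xattr(const char *path, const char *key, void **buf)
    {
            ssize_t len = getxattr(path, key, NULL, 0); /* size probe */

            if (len <= 0)
                    return len;
            *buf = malloc(len);
            if (!*buf)
                    return -1;
            return getxattr(path, key, *buf, len);      /* actual fetch */
    }
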
396 396
397 #if defined(CONFIG_NFSD_V4) 397 #if defined(CONFIG_NFSD_V4)
398 static int 398 static int
399 set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key) 399 set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
400 { 400 {
401 int len; 401 int len;
402 size_t buflen; 402 size_t buflen;
403 char *buf = NULL; 403 char *buf = NULL;
404 int error = 0; 404 int error = 0;
405 405
406 buflen = posix_acl_xattr_size(pacl->a_count); 406 buflen = posix_acl_xattr_size(pacl->a_count);
407 buf = kmalloc(buflen, GFP_KERNEL); 407 buf = kmalloc(buflen, GFP_KERNEL);
408 error = -ENOMEM; 408 error = -ENOMEM;
409 if (buf == NULL) 409 if (buf == NULL)
410 goto out; 410 goto out;
411 411
412 len = posix_acl_to_xattr(pacl, buf, buflen); 412 len = posix_acl_to_xattr(pacl, buf, buflen);
413 if (len < 0) { 413 if (len < 0) {
414 error = len; 414 error = len;
415 goto out; 415 goto out;
416 } 416 }
417 417
418 error = vfs_setxattr(dentry, key, buf, len, 0); 418 error = vfs_setxattr(dentry, key, buf, len, 0);
419 out: 419 out:
420 kfree(buf); 420 kfree(buf);
421 return error; 421 return error;
422 } 422 }
423 423
424 __be32 424 __be32
425 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, 425 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
426 struct nfs4_acl *acl) 426 struct nfs4_acl *acl)
427 { 427 {
428 __be32 error; 428 __be32 error;
429 int host_error; 429 int host_error;
430 struct dentry *dentry; 430 struct dentry *dentry;
431 struct inode *inode; 431 struct inode *inode;
432 struct posix_acl *pacl = NULL, *dpacl = NULL; 432 struct posix_acl *pacl = NULL, *dpacl = NULL;
433 unsigned int flags = 0; 433 unsigned int flags = 0;
434 434
435 /* Get inode */ 435 /* Get inode */
436 error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR); 436 error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR);
437 if (error) 437 if (error)
438 goto out; 438 goto out;
439 439
440 dentry = fhp->fh_dentry; 440 dentry = fhp->fh_dentry;
441 inode = dentry->d_inode; 441 inode = dentry->d_inode;
442 if (S_ISDIR(inode->i_mode)) 442 if (S_ISDIR(inode->i_mode))
443 flags = NFS4_ACL_DIR; 443 flags = NFS4_ACL_DIR;
444 444
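/*
 * An NFSv4 ACL is split into up to two POSIX ACLs: the access ACL,
 * plus a default ACL when the object is a directory.
 */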
445 host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags); 445 host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
446 if (host_error == -EINVAL) { 446 if (host_error == -EINVAL) {
447 error = nfserr_attrnotsupp; 447 error = nfserr_attrnotsupp;
448 goto out; 448 goto out;
449 } else if (host_error < 0) 449 } else if (host_error < 0)
450 goto out_nfserr; 450 goto out_nfserr;
451 451
452 host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS); 452 host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
453 if (host_error < 0) 453 if (host_error < 0)
454 goto out_nfserr; 454 goto out_nfserr;
455 455
456 if (S_ISDIR(inode->i_mode)) { 456 if (S_ISDIR(inode->i_mode)) {
457 host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT); 457 host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
458 if (host_error < 0) 458 if (host_error < 0)
459 goto out_nfserr; 459 goto out_nfserr;
460 } 460 }
461 461
462 error = nfs_ok; 462 error = nfs_ok;
463 463
464 out: 464 out:
465 posix_acl_release(pacl); 465 posix_acl_release(pacl);
466 posix_acl_release(dpacl); 466 posix_acl_release(dpacl);
467 return error; 467 return error;
468 out_nfserr: 468 out_nfserr:
469 if (host_error == -EOPNOTSUPP) 469 if (host_error == -EOPNOTSUPP)
470 error = nfserr_attrnotsupp; 470 error = nfserr_attrnotsupp;
471 else 471 else
472 error = nfserrno(host_error); 472 error = nfserrno(host_error);
473 goto out; 473 goto out;
474 } 474 }
475 475
476 static struct posix_acl * 476 static struct posix_acl *
477 _get_posix_acl(struct dentry *dentry, char *key) 477 _get_posix_acl(struct dentry *dentry, char *key)
478 { 478 {
479 void *buf = NULL; 479 void *buf = NULL;
480 struct posix_acl *pacl = NULL; 480 struct posix_acl *pacl = NULL;
481 int buflen; 481 int buflen;
482 482
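/*
 * A zero-length attribute is treated like a missing one: both are
 * reported to the caller as ERR_PTR(-ENODATA).
 */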
483 buflen = nfsd_getxattr(dentry, key, &buf); 483 buflen = nfsd_getxattr(dentry, key, &buf);
484 if (!buflen) 484 if (!buflen)
485 buflen = -ENODATA; 485 buflen = -ENODATA;
486 if (buflen <= 0) 486 if (buflen <= 0)
487 return ERR_PTR(buflen); 487 return ERR_PTR(buflen);
488 488
489 pacl = posix_acl_from_xattr(buf, buflen); 489 pacl = posix_acl_from_xattr(buf, buflen);
490 kfree(buf); 490 kfree(buf);
491 return pacl; 491 return pacl;
492 } 492 }
493 493
494 int 494 int
495 nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl) 495 nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl)
496 { 496 {
497 struct inode *inode = dentry->d_inode; 497 struct inode *inode = dentry->d_inode;
498 int error = 0; 498 int error = 0;
499 struct posix_acl *pacl = NULL, *dpacl = NULL; 499 struct posix_acl *pacl = NULL, *dpacl = NULL;
500 unsigned int flags = 0; 500 unsigned int flags = 0;
501 501
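/*
 * If no access ACL is stored, synthesize one from the mode bits so
 * the file still yields a sensible NFSv4 ACL.
 */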
502 pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS); 502 pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS);
503 if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA) 503 if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA)
504 pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); 504 pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
505 if (IS_ERR(pacl)) { 505 if (IS_ERR(pacl)) {
506 error = PTR_ERR(pacl); 506 error = PTR_ERR(pacl);
507 pacl = NULL; 507 pacl = NULL;
508 goto out; 508 goto out;
509 } 509 }
510 510
511 if (S_ISDIR(inode->i_mode)) { 511 if (S_ISDIR(inode->i_mode)) {
512 dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT); 512 dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT);
513 if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA) 513 if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA)
514 dpacl = NULL; 514 dpacl = NULL;
515 else if (IS_ERR(dpacl)) { 515 else if (IS_ERR(dpacl)) {
516 error = PTR_ERR(dpacl); 516 error = PTR_ERR(dpacl);
517 dpacl = NULL; 517 dpacl = NULL;
518 goto out; 518 goto out;
519 } 519 }
520 flags = NFS4_ACL_DIR; 520 flags = NFS4_ACL_DIR;
521 } 521 }
522 522
523 *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags); 523 *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags);
524 if (IS_ERR(*acl)) { 524 if (IS_ERR(*acl)) {
525 error = PTR_ERR(*acl); 525 error = PTR_ERR(*acl);
526 *acl = NULL; 526 *acl = NULL;
527 } 527 }
528 out: 528 out:
529 posix_acl_release(pacl); 529 posix_acl_release(pacl);
530 posix_acl_release(dpacl); 530 posix_acl_release(dpacl);
531 return error; 531 return error;
532 } 532 }
533 533
534 #endif /* defined(CONFIG_NFSD_V4) */ 534 #endif /* defined(CONFIG_NFSD_V4) */
535 535
536 #ifdef CONFIG_NFSD_V3 536 #ifdef CONFIG_NFSD_V3
537 /* 537 /*
538 * Check server access rights to a file system object 538 * Check server access rights to a file system object
539 */ 539 */
540 struct accessmap { 540 struct accessmap {
541 u32 access; 541 u32 access;
542 int how; 542 int how;
543 }; 543 };
544 static struct accessmap nfs3_regaccess[] = { 544 static struct accessmap nfs3_regaccess[] = {
545 { NFS3_ACCESS_READ, MAY_READ }, 545 { NFS3_ACCESS_READ, MAY_READ },
546 { NFS3_ACCESS_EXECUTE, MAY_EXEC }, 546 { NFS3_ACCESS_EXECUTE, MAY_EXEC },
547 { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_TRUNC }, 547 { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_TRUNC },
548 { NFS3_ACCESS_EXTEND, MAY_WRITE }, 548 { NFS3_ACCESS_EXTEND, MAY_WRITE },
549 549
550 { 0, 0 } 550 { 0, 0 }
551 }; 551 };
552 552
553 static struct accessmap nfs3_diraccess[] = { 553 static struct accessmap nfs3_diraccess[] = {
554 { NFS3_ACCESS_READ, MAY_READ }, 554 { NFS3_ACCESS_READ, MAY_READ },
555 { NFS3_ACCESS_LOOKUP, MAY_EXEC }, 555 { NFS3_ACCESS_LOOKUP, MAY_EXEC },
556 { NFS3_ACCESS_MODIFY, MAY_EXEC|MAY_WRITE|MAY_TRUNC }, 556 { NFS3_ACCESS_MODIFY, MAY_EXEC|MAY_WRITE|MAY_TRUNC },
557 { NFS3_ACCESS_EXTEND, MAY_EXEC|MAY_WRITE }, 557 { NFS3_ACCESS_EXTEND, MAY_EXEC|MAY_WRITE },
558 { NFS3_ACCESS_DELETE, MAY_REMOVE }, 558 { NFS3_ACCESS_DELETE, MAY_REMOVE },
559 559
560 { 0, 0 } 560 { 0, 0 }
561 }; 561 };
562 562
563 static struct accessmap nfs3_anyaccess[] = { 563 static struct accessmap nfs3_anyaccess[] = {
564 /* Some clients - Solaris 2.6 at least - make an access call 564 /* Some clients - Solaris 2.6 at least - make an access call
565 * to the server to check access for things like /dev/null 565 * to the server to check access for things like /dev/null
566 * (which, really, the server doesn't care about). So 566 * (which, really, the server doesn't care about). So
567 * we provide simple access checking for them, looking 567 * we provide simple access checking for them, looking
568 * mainly at mode bits, and we make sure to ignore read-only 568 * mainly at mode bits, and we make sure to ignore read-only
569 * filesystem checks. 569 * filesystem checks.
570 */ 570 */
571 { NFS3_ACCESS_READ, MAY_READ }, 571 { NFS3_ACCESS_READ, MAY_READ },
572 { NFS3_ACCESS_EXECUTE, MAY_EXEC }, 572 { NFS3_ACCESS_EXECUTE, MAY_EXEC },
573 { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_LOCAL_ACCESS }, 573 { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_LOCAL_ACCESS },
574 { NFS3_ACCESS_EXTEND, MAY_WRITE|MAY_LOCAL_ACCESS }, 574 { NFS3_ACCESS_EXTEND, MAY_WRITE|MAY_LOCAL_ACCESS },
575 575
576 { 0, 0 } 576 { 0, 0 }
577 }; 577 };
578 578
579 __be32 579 __be32
580 nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported) 580 nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *supported)
581 { 581 {
582 struct accessmap *map; 582 struct accessmap *map;
583 struct svc_export *export; 583 struct svc_export *export;
584 struct dentry *dentry; 584 struct dentry *dentry;
585 u32 query, result = 0, sresult = 0; 585 u32 query, result = 0, sresult = 0;
586 __be32 error; 586 __be32 error;
587 587
588 error = fh_verify(rqstp, fhp, 0, MAY_NOP); 588 error = fh_verify(rqstp, fhp, 0, MAY_NOP);
589 if (error) 589 if (error)
590 goto out; 590 goto out;
591 591
592 export = fhp->fh_export; 592 export = fhp->fh_export;
593 dentry = fhp->fh_dentry; 593 dentry = fhp->fh_dentry;
594 594
595 if (S_ISREG(dentry->d_inode->i_mode)) 595 if (S_ISREG(dentry->d_inode->i_mode))
596 map = nfs3_regaccess; 596 map = nfs3_regaccess;
597 else if (S_ISDIR(dentry->d_inode->i_mode)) 597 else if (S_ISDIR(dentry->d_inode->i_mode))
598 map = nfs3_diraccess; 598 map = nfs3_diraccess;
599 else 599 else
600 map = nfs3_anyaccess; 600 map = nfs3_anyaccess;
601 601
602 602
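/*
 * For each requested access bit, mark it as supported and probe the
 * real permission. "Not allowed" answers just leave the result bit
 * clear; any other error aborts the whole call.
 */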
603 query = *access; 603 query = *access;
604 for (; map->access; map++) { 604 for (; map->access; map++) {
605 if (map->access & query) { 605 if (map->access & query) {
606 __be32 err2; 606 __be32 err2;
607 607
608 sresult |= map->access; 608 sresult |= map->access;
609 609
610 err2 = nfsd_permission(export, dentry, map->how); 610 err2 = nfsd_permission(export, dentry, map->how);
611 switch (err2) { 611 switch (err2) {
612 case nfs_ok: 612 case nfs_ok:
613 result |= map->access; 613 result |= map->access;
614 break; 614 break;
615 615
616 /* the following error codes just mean the access was not allowed, 616 /* the following error codes just mean the access was not allowed,
617 * rather than that an error occurred */ 617 * rather than that an error occurred */
618 case nfserr_rofs: 618 case nfserr_rofs:
619 case nfserr_acces: 619 case nfserr_acces:
620 case nfserr_perm: 620 case nfserr_perm:
621 /* simply don't "or" in the access bit. */ 621 /* simply don't "or" in the access bit. */
622 break; 622 break;
623 default: 623 default:
624 error = err2; 624 error = err2;
625 goto out; 625 goto out;
626 } 626 }
627 } 627 }
628 } 628 }
629 *access = result; 629 *access = result;
630 if (supported) 630 if (supported)
631 *supported = sresult; 631 *supported = sresult;
632 632
633 out: 633 out:
634 return error; 634 return error;
635 } 635 }
636 #endif /* CONFIG_NFSD_V3 */ 636 #endif /* CONFIG_NFSD_V3 */
637 637
638 638
639 639
640 /* 640 /*
641 * Open an existing file or directory. 641 * Open an existing file or directory.
642 * The access argument indicates the type of open (read/write/lock) 642 * The access argument indicates the type of open (read/write/lock)
643 * N.B. After this call fhp needs an fh_put 643 * N.B. After this call fhp needs an fh_put
644 */ 644 */
645 __be32 645 __be32
646 nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, 646 nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
647 int access, struct file **filp) 647 int access, struct file **filp)
648 { 648 {
649 struct dentry *dentry; 649 struct dentry *dentry;
650 struct inode *inode; 650 struct inode *inode;
651 int flags = O_RDONLY|O_LARGEFILE; 651 int flags = O_RDONLY|O_LARGEFILE;
652 __be32 err; 652 __be32 err;
653 int host_err; 653 int host_err;
654 654
655 /* 655 /*
656 * If we get here, then the client has already done an "open", 656 * If we get here, then the client has already done an "open",
657 * and (hopefully) checked permission - so allow OWNER_OVERRIDE 657 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
658 * in case a chmod has now revoked permission. 658 * in case a chmod has now revoked permission.
659 */ 659 */
660 err = fh_verify(rqstp, fhp, type, access | MAY_OWNER_OVERRIDE); 660 err = fh_verify(rqstp, fhp, type, access | MAY_OWNER_OVERRIDE);
661 if (err) 661 if (err)
662 goto out; 662 goto out;
663 663
664 dentry = fhp->fh_dentry; 664 dentry = fhp->fh_dentry;
665 inode = dentry->d_inode; 665 inode = dentry->d_inode;
666 666
667 /* Disallow write access to files with the append-only bit set 667 /* Disallow write access to files with the append-only bit set
668 * or any access when mandatory locking is enabled 668 * or any access when mandatory locking is enabled
669 */ 669 */
670 err = nfserr_perm; 670 err = nfserr_perm;
671 if (IS_APPEND(inode) && (access & MAY_WRITE)) 671 if (IS_APPEND(inode) && (access & MAY_WRITE))
672 goto out; 672 goto out;
673 if (IS_ISMNDLK(inode)) 673 if (IS_ISMNDLK(inode))
674 goto out; 674 goto out;
675 675
676 if (!inode->i_fop) 676 if (!inode->i_fop)
677 goto out; 677 goto out;
678 678
679 /* 679 /*
680 * Check to see if there are any leases on this file. 680 * Check to see if there are any leases on this file.
681 * This may block while leases are broken. 681 * This may block while leases are broken.
682 */ 682 */
683 host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0)); 683 host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0));
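/*
 * NFS has no way to tell the client "try again later", so a blocked
 * lease break is turned into -ETIMEDOUT, which nfserrno() maps to
 * nfserr_dropit: the request is dropped and the client retransmits.
 */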
684 if (host_err == -EWOULDBLOCK) 684 if (host_err == -EWOULDBLOCK)
685 host_err = -ETIMEDOUT; 685 host_err = -ETIMEDOUT;
686 if (host_err) /* NOMEM or WOULDBLOCK */ 686 if (host_err) /* NOMEM or WOULDBLOCK */
687 goto out_nfserr; 687 goto out_nfserr;
688 688
689 if (access & MAY_WRITE) { 689 if (access & MAY_WRITE) {
690 if (access & MAY_READ) 690 if (access & MAY_READ)
691 flags = O_RDWR|O_LARGEFILE; 691 flags = O_RDWR|O_LARGEFILE;
692 else 692 else
693 flags = O_WRONLY|O_LARGEFILE; 693 flags = O_WRONLY|O_LARGEFILE;
694 694
695 DQUOT_INIT(inode); 695 DQUOT_INIT(inode);
696 } 696 }
697 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_mnt), flags); 697 *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_mnt), flags);
698 if (IS_ERR(*filp)) 698 if (IS_ERR(*filp))
699 host_err = PTR_ERR(*filp); 699 host_err = PTR_ERR(*filp);
700 out_nfserr: 700 out_nfserr:
701 err = nfserrno(host_err); 701 err = nfserrno(host_err);
702 out: 702 out:
703 return err; 703 return err;
704 } 704 }
705 705
706 /* 706 /*
707 * Close a file. 707 * Close a file.
708 */ 708 */
709 void 709 void
710 nfsd_close(struct file *filp) 710 nfsd_close(struct file *filp)
711 { 711 {
712 fput(filp); 712 fput(filp);
713 } 713 }
714 714
715 /* 715 /*
716 * Sync a file 716 * Sync a file
717 * As this calls fsync (not fdatasync) there is no need for a write_inode 717 * As this calls fsync (not fdatasync) there is no need for a write_inode
718 * after it. 718 * after it.
719 */ 719 */
720 static inline int nfsd_dosync(struct file *filp, struct dentry *dp, 720 static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
721 const struct file_operations *fop) 721 const struct file_operations *fop)
722 { 722 {
723 struct inode *inode = dp->d_inode; 723 struct inode *inode = dp->d_inode;
724 int (*fsync) (struct file *, struct dentry *, int); 724 int (*fsync) (struct file *, struct dentry *, int);
725 int err; 725 int err;
726 726
727 err = filemap_fdatawrite(inode->i_mapping); 727 err = filemap_fdatawrite(inode->i_mapping);
728 if (err == 0 && fop && (fsync = fop->fsync)) 728 if (err == 0 && fop && (fsync = fop->fsync))
729 err = fsync(filp, dp, 0); 729 err = fsync(filp, dp, 0);
730 if (err == 0) 730 if (err == 0)
731 err = filemap_fdatawait(inode->i_mapping); 731 err = filemap_fdatawait(inode->i_mapping);
732 732
733 return err; 733 return err;
734 } 734 }
735 735
736 736
737 static int 737 static int
738 nfsd_sync(struct file *filp) 738 nfsd_sync(struct file *filp)
739 { 739 {
740 int err; 740 int err;
741 struct inode *inode = filp->f_path.dentry->d_inode; 741 struct inode *inode = filp->f_path.dentry->d_inode;
742 dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name); 742 dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name);
743 mutex_lock(&inode->i_mutex); 743 mutex_lock(&inode->i_mutex);
744 err = nfsd_dosync(filp, filp->f_path.dentry, filp->f_op); 744 err = nfsd_dosync(filp, filp->f_path.dentry, filp->f_op);
745 mutex_unlock(&inode->i_mutex); 745 mutex_unlock(&inode->i_mutex);
746 746
747 return err; 747 return err;
748 } 748 }
749 749
750 int 750 int
751 nfsd_sync_dir(struct dentry *dp) 751 nfsd_sync_dir(struct dentry *dp)
752 { 752 {
753 return nfsd_dosync(NULL, dp, dp->d_inode->i_fop); 753 return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
754 } 754 }
755 755
756 /* 756 /*
757 * Obtain the readahead parameters for the file 757 * Obtain the readahead parameters for the file
758 * specified by (dev, ino). 758 * specified by (dev, ino).
759 */ 759 */
760 760
761 static inline struct raparms * 761 static inline struct raparms *
762 nfsd_get_raparms(dev_t dev, ino_t ino) 762 nfsd_get_raparms(dev_t dev, ino_t ino)
763 { 763 {
764 struct raparms *ra, **rap, **frap = NULL; 764 struct raparms *ra, **rap, **frap = NULL;
765 int depth = 0; 765 int depth = 0;
766 unsigned int hash; 766 unsigned int hash;
767 struct raparm_hbucket *rab; 767 struct raparm_hbucket *rab;
768 768
769 hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; 769 hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK;
770 rab = &raparm_hash[hash]; 770 rab = &raparm_hash[hash];
771 771
772 spin_lock(&rab->pb_lock); 772 spin_lock(&rab->pb_lock);
773 for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { 773 for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) {
774 if (ra->p_ino == ino && ra->p_dev == dev) 774 if (ra->p_ino == ino && ra->p_dev == dev)
775 goto found; 775 goto found;
776 depth++; 776 depth++;
777 if (ra->p_count == 0) 777 if (ra->p_count == 0)
778 frap = rap; 778 frap = rap;
779 } 779 }
780 depth = nfsdstats.ra_size*11/10; 780 depth = nfsdstats.ra_size*11/10;
781 if (!frap) { 781 if (!frap) {
782 spin_unlock(&rab->pb_lock); 782 spin_unlock(&rab->pb_lock);
783 return NULL; 783 return NULL;
784 } 784 }
785 rap = frap; 785 rap = frap;
786 ra = *frap; 786 ra = *frap;
787 ra->p_dev = dev; 787 ra->p_dev = dev;
788 ra->p_ino = ino; 788 ra->p_ino = ino;
789 ra->p_set = 0; 789 ra->p_set = 0;
790 ra->p_hindex = hash; 790 ra->p_hindex = hash;
791 found: 791 found:
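/* Move the entry to the front of its hash chain, so frequently
 * used files stay cheap to look up. */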
792 if (rap != &rab->pb_head) { 792 if (rap != &rab->pb_head) {
793 *rap = ra->p_next; 793 *rap = ra->p_next;
794 ra->p_next = rab->pb_head; 794 ra->p_next = rab->pb_head;
795 rab->pb_head = ra; 795 rab->pb_head = ra;
796 } 796 }
797 ra->p_count++; 797 ra->p_count++;
798 nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; 798 nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++;
799 spin_unlock(&rab->pb_lock); 799 spin_unlock(&rab->pb_lock);
800 return ra; 800 return ra;
801 } 801 }
802 802
803 /* 803 /*
804 * Grab and keep cached pages associated with a file in the svc_rqst 804 * Grab and keep cached pages associated with a file in the svc_rqst
805 * so that they can be passed to the network sendmsg/sendpage routines 805 * so that they can be passed to the network sendmsg/sendpage routines
806 * directly. They will be released after the sending has completed. 806 * directly. They will be released after the sending has completed.
807 */ 807 */
808 static int 808 static int
809 nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 809 nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
810 struct splice_desc *sd) 810 struct splice_desc *sd)
811 { 811 {
812 struct svc_rqst *rqstp = sd->u.data; 812 struct svc_rqst *rqstp = sd->u.data;
813 struct page **pp = rqstp->rq_respages + rqstp->rq_resused; 813 struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
814 struct page *page = buf->page; 814 struct page *page = buf->page;
815 size_t size; 815 size_t size;
816 int ret; 816 int ret;
817 817
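/*
 * Make sure the pipe buffer really holds uptodate data before we
 * take a reference to its page; a non-zero return means the contents
 * could not be confirmed, and the splice is aborted.
 */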
818 ret = buf->ops->pin(pipe, buf); 818 ret = buf->ops->confirm(pipe, buf);
819 if (unlikely(ret)) 819 if (unlikely(ret))
820 return ret; 820 return ret;
821 821
822 size = sd->len; 822 size = sd->len;
823 823
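/*
 * Three cases: this is the first page of the reply (install the page
 * and start page_len), a page we have not stashed yet (append it),
 * or more data arriving in the page we appended last time (just grow
 * page_len).
 */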
824 if (rqstp->rq_res.page_len == 0) { 824 if (rqstp->rq_res.page_len == 0) {
825 get_page(page); 825 get_page(page);
826 put_page(*pp); 826 put_page(*pp);
827 *pp = page; 827 *pp = page;
828 rqstp->rq_resused++; 828 rqstp->rq_resused++;
829 rqstp->rq_res.page_base = buf->offset; 829 rqstp->rq_res.page_base = buf->offset;
830 rqstp->rq_res.page_len = size; 830 rqstp->rq_res.page_len = size;
831 } else if (page != pp[-1]) { 831 } else if (page != pp[-1]) {
832 get_page(page); 832 get_page(page);
833 if (*pp) 833 if (*pp)
834 put_page(*pp); 834 put_page(*pp);
835 *pp = page; 835 *pp = page;
836 rqstp->rq_resused++; 836 rqstp->rq_resused++;
837 rqstp->rq_res.page_len += size; 837 rqstp->rq_res.page_len += size;
838 } else 838 } else
839 rqstp->rq_res.page_len += size; 839 rqstp->rq_res.page_len += size;
840 840
841 return size; 841 return size;
842 } 842 }
843 843
844 static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, 844 static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
845 struct splice_desc *sd) 845 struct splice_desc *sd)
846 { 846 {
847 return __splice_from_pipe(pipe, sd, nfsd_splice_actor); 847 return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
848 } 848 }
849 849
850 static __be32 850 static __be32
851 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 851 nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
852 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 852 loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
853 { 853 {
854 struct inode *inode; 854 struct inode *inode;
855 struct raparms *ra; 855 struct raparms *ra;
856 mm_segment_t oldfs; 856 mm_segment_t oldfs;
857 __be32 err; 857 __be32 err;
858 int host_err; 858 int host_err;
859 859
860 err = nfserr_perm; 860 err = nfserr_perm;
861 inode = file->f_path.dentry->d_inode; 861 inode = file->f_path.dentry->d_inode;
862 #ifdef MSNFS 862 #ifdef MSNFS
863 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 863 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
864 (!lock_may_read(inode, offset, *count))) 864 (!lock_may_read(inode, offset, *count)))
865 goto out; 865 goto out;
866 #endif 866 #endif
867 867
868 /* Get readahead parameters */ 868 /* Get readahead parameters */
869 ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); 869 ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
870 870
871 if (ra && ra->p_set) 871 if (ra && ra->p_set)
872 file->f_ra = ra->p_ra; 872 file->f_ra = ra->p_ra;
873 873
874 if (file->f_op->splice_read && rqstp->rq_splice_ok) { 874 if (file->f_op->splice_read && rqstp->rq_splice_ok) {
875 struct splice_desc sd = { 875 struct splice_desc sd = {
876 .len = 0, 876 .len = 0,
877 .total_len = *count, 877 .total_len = *count,
878 .pos = offset, 878 .pos = offset,
879 .u.data = rqstp, 879 .u.data = rqstp,
880 }; 880 };
881 881
882 host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); 882 host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
883 } else { 883 } else {
884 oldfs = get_fs(); 884 oldfs = get_fs();
885 set_fs(KERNEL_DS); 885 set_fs(KERNEL_DS);
886 host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset); 886 host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
887 set_fs(oldfs); 887 set_fs(oldfs);
888 } 888 }
889 889
890 /* Write back readahead params */ 890 /* Write back readahead params */
891 if (ra) { 891 if (ra) {
892 struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; 892 struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
893 spin_lock(&rab->pb_lock); 893 spin_lock(&rab->pb_lock);
894 ra->p_ra = file->f_ra; 894 ra->p_ra = file->f_ra;
895 ra->p_set = 1; 895 ra->p_set = 1;
896 ra->p_count--; 896 ra->p_count--;
897 spin_unlock(&rab->pb_lock); 897 spin_unlock(&rab->pb_lock);
898 } 898 }
899 899
900 if (host_err >= 0) { 900 if (host_err >= 0) {
901 nfsdstats.io_read += host_err; 901 nfsdstats.io_read += host_err;
902 *count = host_err; 902 *count = host_err;
903 err = 0; 903 err = 0;
904 fsnotify_access(file->f_path.dentry); 904 fsnotify_access(file->f_path.dentry);
905 } else 905 } else
906 err = nfserrno(host_err); 906 err = nfserrno(host_err);
907 out: 907 out:
908 return err; 908 return err;
909 } 909 }
910 910
911 static void kill_suid(struct dentry *dentry) 911 static void kill_suid(struct dentry *dentry)
912 { 912 {
913 struct iattr ia; 913 struct iattr ia;
914 ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID; 914 ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID;
915 915
916 mutex_lock(&dentry->d_inode->i_mutex); 916 mutex_lock(&dentry->d_inode->i_mutex);
917 notify_change(dentry, &ia); 917 notify_change(dentry, &ia);
918 mutex_unlock(&dentry->d_inode->i_mutex); 918 mutex_unlock(&dentry->d_inode->i_mutex);
919 } 919 }
920 920
921 static __be32 921 static __be32
922 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 922 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
923 loff_t offset, struct kvec *vec, int vlen, 923 loff_t offset, struct kvec *vec, int vlen,
924 unsigned long cnt, int *stablep) 924 unsigned long cnt, int *stablep)
925 { 925 {
926 struct svc_export *exp; 926 struct svc_export *exp;
927 struct dentry *dentry; 927 struct dentry *dentry;
928 struct inode *inode; 928 struct inode *inode;
929 mm_segment_t oldfs; 929 mm_segment_t oldfs;
930 __be32 err = 0; 930 __be32 err = 0;
931 int host_err; 931 int host_err;
932 int stable = *stablep; 932 int stable = *stablep;
933 933
934 #ifdef MSNFS 934 #ifdef MSNFS
935 err = nfserr_perm; 935 err = nfserr_perm;
936 936
937 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 937 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
938 (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) 938 (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt)))
939 goto out; 939 goto out;
940 #endif 940 #endif
941 941
942 dentry = file->f_path.dentry; 942 dentry = file->f_path.dentry;
943 inode = dentry->d_inode; 943 inode = dentry->d_inode;
944 exp = fhp->fh_export; 944 exp = fhp->fh_export;
945 945
946 /* 946 /*
947 * Request sync writes if 947 * Request sync writes if
948 * - the sync export option has been set, or 948 * - the sync export option has been set, or
949 * - the client requested O_SYNC behavior (NFSv3 feature), or 949 * - the client requested O_SYNC behavior (NFSv3 feature), or
950 * - the file system doesn't support fsync(). 950 * - the file system doesn't support fsync().
951 * When gathered writes have been configured for this volume, 951 * When gathered writes have been configured for this volume,
952 * flushing the data to disk is handled separately below. 952 * flushing the data to disk is handled separately below.
953 */ 953 */
954 954
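/*
 * Without an ->fsync method a later COMMIT could not flush anything,
 * so the only safe answer is to make this write FILE_SYNC stable
 * right away.
 */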
955 if (!file->f_op->fsync) { /* COMMIT3 cannot work */ 955 if (!file->f_op->fsync) { /* COMMIT3 cannot work */
956 stable = 2; 956 stable = 2;
957 *stablep = 2; /* FILE_SYNC */ 957 *stablep = 2; /* FILE_SYNC */
958 } 958 }
959 959
960 if (!EX_ISSYNC(exp)) 960 if (!EX_ISSYNC(exp))
961 stable = 0; 961 stable = 0;
962 if (stable && !EX_WGATHER(exp)) 962 if (stable && !EX_WGATHER(exp))
963 file->f_flags |= O_SYNC; 963 file->f_flags |= O_SYNC;
964 964
965 /* Write the data. */ 965 /* Write the data. */
966 oldfs = get_fs(); set_fs(KERNEL_DS); 966 oldfs = get_fs(); set_fs(KERNEL_DS);
967 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); 967 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
968 set_fs(oldfs); 968 set_fs(oldfs);
969 if (host_err >= 0) { 969 if (host_err >= 0) {
970 nfsdstats.io_write += cnt; 970 nfsdstats.io_write += cnt;
971 fsnotify_modify(file->f_path.dentry); 971 fsnotify_modify(file->f_path.dentry);
972 } 972 }
973 973
974 /* clear setuid/setgid flag after write */ 974 /* clear setuid/setgid flag after write */
975 if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) 975 if (host_err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID)))
976 kill_suid(dentry); 976 kill_suid(dentry);
977 977
978 if (host_err >= 0 && stable) { 978 if (host_err >= 0 && stable) {
979 static ino_t last_ino; 979 static ino_t last_ino;
980 static dev_t last_dev; 980 static dev_t last_dev;
981 981
982 /* 982 /*
983 * Gathered writes: If another process is currently 983 * Gathered writes: If another process is currently
984 * writing to the file, there's a high chance 984 * writing to the file, there's a high chance
985 * this is another nfsd (triggered by a bulk write 985 * this is another nfsd (triggered by a bulk write
986 * from a client's biod). Rather than syncing the 986 * from a client's biod). Rather than syncing the
987 * file with each write request, we sleep for 10 msec. 987 * file with each write request, we sleep for 10 msec.
988 * 988 *
989 * I don't know if this roughly approximates 989 * I don't know if this roughly approximates
990 * C. Juszak's idea of gathered writes, but it's a 990 * C. Juszak's idea of gathered writes, but it's a
991 * nice and simple solution (IMHO), and it seems to 991 * nice and simple solution (IMHO), and it seems to
992 * work:-) 992 * work:-)
993 */ 993 */
994 if (EX_WGATHER(exp)) { 994 if (EX_WGATHER(exp)) {
995 if (atomic_read(&inode->i_writecount) > 1 995 if (atomic_read(&inode->i_writecount) > 1
996 || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) { 996 || (last_ino == inode->i_ino && last_dev == inode->i_sb->s_dev)) {
997 dprintk("nfsd: write defer %d\n", current->pid); 997 dprintk("nfsd: write defer %d\n", current->pid);
998 msleep(10); 998 msleep(10);
999 dprintk("nfsd: write resume %d\n", current->pid); 999 dprintk("nfsd: write resume %d\n", current->pid);
1000 } 1000 }
1001 1001
1002 if (inode->i_state & I_DIRTY) { 1002 if (inode->i_state & I_DIRTY) {
1003 dprintk("nfsd: write sync %d\n", current->pid); 1003 dprintk("nfsd: write sync %d\n", current->pid);
1004 host_err = nfsd_sync(file); 1004 host_err = nfsd_sync(file);
1005 } 1005 }
1006 #if 0 1006 #if 0
1007 wake_up(&inode->i_wait); 1007 wake_up(&inode->i_wait);
1008 #endif 1008 #endif
1009 } 1009 }
1010 last_ino = inode->i_ino; 1010 last_ino = inode->i_ino;
1011 last_dev = inode->i_sb->s_dev; 1011 last_dev = inode->i_sb->s_dev;
1012 } 1012 }
1013 1013
1014 dprintk("nfsd: write complete host_err=%d\n", host_err); 1014 dprintk("nfsd: write complete host_err=%d\n", host_err);
1015 if (host_err >= 0) 1015 if (host_err >= 0)
1016 err = 0; 1016 err = 0;
1017 else 1017 else
1018 err = nfserrno(host_err); 1018 err = nfserrno(host_err);
1019 out: 1019 out:
1020 return err; 1020 return err;
1021 } 1021 }
1022 1022
1023 /* 1023 /*
1024 * Read data from a file. count must contain the requested read count 1024 * Read data from a file. count must contain the requested read count
1025 * on entry. On return, *count contains the number of bytes actually read. 1025 * on entry. On return, *count contains the number of bytes actually read.
1026 * N.B. After this call fhp needs an fh_put 1026 * N.B. After this call fhp needs an fh_put
1027 */ 1027 */
1028 __be32 1028 __be32
1029 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1029 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1030 loff_t offset, struct kvec *vec, int vlen, 1030 loff_t offset, struct kvec *vec, int vlen,
1031 unsigned long *count) 1031 unsigned long *count)
1032 { 1032 {
1033 __be32 err; 1033 __be32 err;
1034 1034
1035 if (file) { 1035 if (file) {
1036 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, 1036 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
1037 MAY_READ|MAY_OWNER_OVERRIDE); 1037 MAY_READ|MAY_OWNER_OVERRIDE);
1038 if (err) 1038 if (err)
1039 goto out; 1039 goto out;
1040 err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); 1040 err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
1041 } else { 1041 } else {
1042 err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file); 1042 err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file);
1043 if (err) 1043 if (err)
1044 goto out; 1044 goto out;
1045 err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); 1045 err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
1046 nfsd_close(file); 1046 nfsd_close(file);
1047 } 1047 }
1048 out: 1048 out:
1049 return err; 1049 return err;
1050 } 1050 }
1051 1051
1052 /* 1052 /*
1053 * Write data to a file. 1053 * Write data to a file.
1054 * The stable flag requests synchronous writes. 1054 * The stable flag requests synchronous writes.
1055 * N.B. After this call fhp needs an fh_put 1055 * N.B. After this call fhp needs an fh_put
1056 */ 1056 */
1057 __be32 1057 __be32
1058 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1058 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1059 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, 1059 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt,
1060 int *stablep) 1060 int *stablep)
1061 { 1061 {
1062 __be32 err = 0; 1062 __be32 err = 0;
1063 1063
1064 if (file) { 1064 if (file) {
1065 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry, 1065 err = nfsd_permission(fhp->fh_export, fhp->fh_dentry,
1066 MAY_WRITE|MAY_OWNER_OVERRIDE); 1066 MAY_WRITE|MAY_OWNER_OVERRIDE);
1067 if (err) 1067 if (err)
1068 goto out; 1068 goto out;
1069 err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, 1069 err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
1070 stablep); 1070 stablep);
1071 } else { 1071 } else {
1072 err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file); 1072 err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file);
1073 if (err) 1073 if (err)
1074 goto out; 1074 goto out;
1075 1075
1076 if (cnt) 1076 if (cnt)
1077 err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, 1077 err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen,
1078 cnt, stablep); 1078 cnt, stablep);
1079 nfsd_close(file); 1079 nfsd_close(file);
1080 } 1080 }
1081 out: 1081 out:
1082 return err; 1082 return err;
1083 } 1083 }
1084 1084
1085 #ifdef CONFIG_NFSD_V3 1085 #ifdef CONFIG_NFSD_V3
1086 /* 1086 /*
1087 * Commit all pending writes to stable storage. 1087 * Commit all pending writes to stable storage.
1088 * Strictly speaking, we could sync just the indicated file region here, 1088 * Strictly speaking, we could sync just the indicated file region here,
1089 * but there's currently no way we can ask the VFS to do so. 1089 * but there's currently no way we can ask the VFS to do so.
1090 * 1090 *
1091 * Unfortunately we cannot lock the file to make sure we return full WCC 1091 * Unfortunately we cannot lock the file to make sure we return full WCC
1092 * data to the client, as locking happens lower down in the filesystem. 1092 * data to the client, as locking happens lower down in the filesystem.
1093 */ 1093 */
1094 __be32 1094 __be32
1095 nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, 1095 nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1096 loff_t offset, unsigned long count) 1096 loff_t offset, unsigned long count)
1097 { 1097 {
1098 struct file *file; 1098 struct file *file;
1099 __be32 err; 1099 __be32 err;
1100 1100
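/* Reject offset/count pairs that would wrap a 64-bit file offset. */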
1101 if ((u64)count > ~(u64)offset) 1101 if ((u64)count > ~(u64)offset)
1102 return nfserr_inval; 1102 return nfserr_inval;
1103 1103
1104 if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0) 1104 if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0)
1105 return err; 1105 return err;
1106 if (EX_ISSYNC(fhp->fh_export)) { 1106 if (EX_ISSYNC(fhp->fh_export)) {
1107 if (file->f_op && file->f_op->fsync) { 1107 if (file->f_op && file->f_op->fsync) {
1108 err = nfserrno(nfsd_sync(file)); 1108 err = nfserrno(nfsd_sync(file));
1109 } else { 1109 } else {
1110 err = nfserr_notsupp; 1110 err = nfserr_notsupp;
1111 } 1111 }
1112 } 1112 }
1113 1113
1114 nfsd_close(file); 1114 nfsd_close(file);
1115 return err; 1115 return err;
1116 } 1116 }
1117 #endif /* CONFIG_NFSD_V3 */ 1117 #endif /* CONFIG_NFSD_V3 */
1118 1118
1119 /* 1119 /*
1120 * Create a file (regular, directory, device, fifo); UNIX sockets 1120 * Create a file (regular, directory, device, fifo); UNIX sockets
1121 * not yet implemented. 1121 * not yet implemented.
1122 * If the response fh has been verified, the parent directory should 1122 * If the response fh has been verified, the parent directory should
1123 * already be locked. Note that the parent directory is left locked. 1123 * already be locked. Note that the parent directory is left locked.
1124 * 1124 *
1125 * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp 1125 * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
1126 */ 1126 */
1127 __be32 1127 __be32
1128 nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, 1128 nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1129 char *fname, int flen, struct iattr *iap, 1129 char *fname, int flen, struct iattr *iap,
1130 int type, dev_t rdev, struct svc_fh *resfhp) 1130 int type, dev_t rdev, struct svc_fh *resfhp)
1131 { 1131 {
1132 struct dentry *dentry, *dchild = NULL; 1132 struct dentry *dentry, *dchild = NULL;
1133 struct inode *dirp; 1133 struct inode *dirp;
1134 __be32 err; 1134 __be32 err;
1135 int host_err; 1135 int host_err;
1136 1136
1137 err = nfserr_perm; 1137 err = nfserr_perm;
1138 if (!flen) 1138 if (!flen)
1139 goto out; 1139 goto out;
1140 err = nfserr_exist; 1140 err = nfserr_exist;
1141 if (isdotent(fname, flen)) 1141 if (isdotent(fname, flen))
1142 goto out; 1142 goto out;
1143 1143
1144 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); 1144 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
1145 if (err) 1145 if (err)
1146 goto out; 1146 goto out;
1147 1147
1148 dentry = fhp->fh_dentry; 1148 dentry = fhp->fh_dentry;
1149 dirp = dentry->d_inode; 1149 dirp = dentry->d_inode;
1150 1150
1151 err = nfserr_notdir; 1151 err = nfserr_notdir;
1152 if (!dirp->i_op || !dirp->i_op->lookup) 1152 if (!dirp->i_op || !dirp->i_op->lookup)
1153 goto out; 1153 goto out;
1154 /* 1154 /*
1155 * Check whether the response file handle has been verified yet. 1155 * Check whether the response file handle has been verified yet.
1156 * If it has, the parent directory should already be locked. 1156 * If it has, the parent directory should already be locked.
1157 */ 1157 */
1158 if (!resfhp->fh_dentry) { 1158 if (!resfhp->fh_dentry) {
1159 /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ 1159 /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
1160 fh_lock_nested(fhp, I_MUTEX_PARENT); 1160 fh_lock_nested(fhp, I_MUTEX_PARENT);
1161 dchild = lookup_one_len(fname, dentry, flen); 1161 dchild = lookup_one_len(fname, dentry, flen);
1162 host_err = PTR_ERR(dchild); 1162 host_err = PTR_ERR(dchild);
1163 if (IS_ERR(dchild)) 1163 if (IS_ERR(dchild))
1164 goto out_nfserr; 1164 goto out_nfserr;
1165 err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); 1165 err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
1166 if (err) 1166 if (err)
1167 goto out; 1167 goto out;
1168 } else { 1168 } else {
1169 /* called from nfsd_proc_create */ 1169 /* called from nfsd_proc_create */
1170 dchild = dget(resfhp->fh_dentry); 1170 dchild = dget(resfhp->fh_dentry);
1171 if (!fhp->fh_locked) { 1171 if (!fhp->fh_locked) {
1172 /* not actually possible */ 1172 /* not actually possible */
1173 printk(KERN_ERR 1173 printk(KERN_ERR
1174 "nfsd_create: parent %s/%s not locked!\n", 1174 "nfsd_create: parent %s/%s not locked!\n",
1175 dentry->d_parent->d_name.name, 1175 dentry->d_parent->d_name.name,
1176 dentry->d_name.name); 1176 dentry->d_name.name);
1177 err = nfserr_io; 1177 err = nfserr_io;
1178 goto out; 1178 goto out;
1179 } 1179 }
1180 } 1180 }
1181 /* 1181 /*
1182 * Make sure the child dentry is still negative ... 1182 * Make sure the child dentry is still negative ...
1183 */ 1183 */
1184 err = nfserr_exist; 1184 err = nfserr_exist;
1185 if (dchild->d_inode) { 1185 if (dchild->d_inode) {
1186 dprintk("nfsd_create: dentry %s/%s not negative!\n", 1186 dprintk("nfsd_create: dentry %s/%s not negative!\n",
1187 dentry->d_name.name, dchild->d_name.name); 1187 dentry->d_name.name, dchild->d_name.name);
1188 goto out; 1188 goto out;
1189 } 1189 }
1190 1190
1191 if (!(iap->ia_valid & ATTR_MODE)) 1191 if (!(iap->ia_valid & ATTR_MODE))
1192 iap->ia_mode = 0; 1192 iap->ia_mode = 0;
1193 iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type; 1193 iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;
1194 1194
1195 /* 1195 /*
1196 * Get the dir op function pointer. 1196 * Get the dir op function pointer.
1197 */ 1197 */
1198 err = 0; 1198 err = 0;
1199 switch (type) { 1199 switch (type) {
1200 case S_IFREG: 1200 case S_IFREG:
1201 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1201 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1202 break; 1202 break;
1203 case S_IFDIR: 1203 case S_IFDIR:
1204 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); 1204 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
1205 break; 1205 break;
1206 case S_IFCHR: 1206 case S_IFCHR:
1207 case S_IFBLK: 1207 case S_IFBLK:
1208 case S_IFIFO: 1208 case S_IFIFO:
1209 case S_IFSOCK: 1209 case S_IFSOCK:
1210 host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); 1210 host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
1211 break; 1211 break;
1212 default: 1212 default:
1213 printk("nfsd: bad file type %o in nfsd_create\n", type); 1213 printk("nfsd: bad file type %o in nfsd_create\n", type);
1214 host_err = -EINVAL; 1214 host_err = -EINVAL;
1215 } 1215 }
1216 if (host_err < 0) 1216 if (host_err < 0)
1217 goto out_nfserr; 1217 goto out_nfserr;
1218 1218
1219 if (EX_ISSYNC(fhp->fh_export)) { 1219 if (EX_ISSYNC(fhp->fh_export)) {
1220 err = nfserrno(nfsd_sync_dir(dentry)); 1220 err = nfserrno(nfsd_sync_dir(dentry));
1221 write_inode_now(dchild->d_inode, 1); 1221 write_inode_now(dchild->d_inode, 1);
1222 } 1222 }
1223 1223
1224 1224
1225 /* Set file attributes. Mode has already been set and 1225 /* Set file attributes. Mode has already been set and
1226 * setting uid/gid works only for root. Irix appears to 1226 * setting uid/gid works only for root. Irix appears to
1227 * send along the gid when it tries to implement setgid 1227 * send along the gid when it tries to implement setgid
1228 * directories via NFS. 1228 * directories via NFS.
1229 */ 1229 */
1230 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) { 1230 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
1231 __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); 1231 __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
1232 if (err2) 1232 if (err2)
1233 err = err2; 1233 err = err2;
1234 } 1234 }
1235 /* 1235 /*
1236 * Update the file handle to get the new inode info. 1236 * Update the file handle to get the new inode info.
1237 */ 1237 */
1238 if (!err) 1238 if (!err)
1239 err = fh_update(resfhp); 1239 err = fh_update(resfhp);
1240 out: 1240 out:
1241 if (dchild && !IS_ERR(dchild)) 1241 if (dchild && !IS_ERR(dchild))
1242 dput(dchild); 1242 dput(dchild);
1243 return err; 1243 return err;
1244 1244
1245 out_nfserr: 1245 out_nfserr:
1246 err = nfserrno(host_err); 1246 err = nfserrno(host_err);
1247 goto out; 1247 goto out;
1248 } 1248 }
1249 1249
1250 #ifdef CONFIG_NFSD_V3 1250 #ifdef CONFIG_NFSD_V3
1251 /* 1251 /*
1252 * NFSv3 version of nfsd_create 1252 * NFSv3 version of nfsd_create
1253 */ 1253 */
1254 __be32 1254 __be32
1255 nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, 1255 nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1256 char *fname, int flen, struct iattr *iap, 1256 char *fname, int flen, struct iattr *iap,
1257 struct svc_fh *resfhp, int createmode, u32 *verifier, 1257 struct svc_fh *resfhp, int createmode, u32 *verifier,
1258 int *truncp, int *created) 1258 int *truncp, int *created)
1259 { 1259 {
1260 struct dentry *dentry, *dchild = NULL; 1260 struct dentry *dentry, *dchild = NULL;
1261 struct inode *dirp; 1261 struct inode *dirp;
1262 __be32 err; 1262 __be32 err;
1263 int host_err; 1263 int host_err;
1264 __u32 v_mtime=0, v_atime=0; 1264 __u32 v_mtime=0, v_atime=0;
1265 1265
1266 err = nfserr_perm; 1266 err = nfserr_perm;
1267 if (!flen) 1267 if (!flen)
1268 goto out; 1268 goto out;
1269 err = nfserr_exist; 1269 err = nfserr_exist;
1270 if (isdotent(fname, flen)) 1270 if (isdotent(fname, flen))
1271 goto out; 1271 goto out;
1272 if (!(iap->ia_valid & ATTR_MODE)) 1272 if (!(iap->ia_valid & ATTR_MODE))
1273 iap->ia_mode = 0; 1273 iap->ia_mode = 0;
1274 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); 1274 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
1275 if (err) 1275 if (err)
1276 goto out; 1276 goto out;
1277 1277
1278 dentry = fhp->fh_dentry; 1278 dentry = fhp->fh_dentry;
1279 dirp = dentry->d_inode; 1279 dirp = dentry->d_inode;
1280 1280
1281 /* Get all the sanity checks out of the way before 1281 /* Get all the sanity checks out of the way before
1282 * we lock the parent. */ 1282 * we lock the parent. */
1283 err = nfserr_notdir; 1283 err = nfserr_notdir;
1284 if (!dirp->i_op || !dirp->i_op->lookup) 1284 if (!dirp->i_op || !dirp->i_op->lookup)
1285 goto out; 1285 goto out;
1286 fh_lock_nested(fhp, I_MUTEX_PARENT); 1286 fh_lock_nested(fhp, I_MUTEX_PARENT);
1287 1287
1288 /* 1288 /*
1289 * Compose the response file handle. 1289 * Compose the response file handle.
1290 */ 1290 */
1291 dchild = lookup_one_len(fname, dentry, flen); 1291 dchild = lookup_one_len(fname, dentry, flen);
1292 host_err = PTR_ERR(dchild); 1292 host_err = PTR_ERR(dchild);
1293 if (IS_ERR(dchild)) 1293 if (IS_ERR(dchild))
1294 goto out_nfserr; 1294 goto out_nfserr;
1295 1295
1296 err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); 1296 err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
1297 if (err) 1297 if (err)
1298 goto out; 1298 goto out;
1299 1299
1300 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1300 if (createmode == NFS3_CREATE_EXCLUSIVE) {
1301 /* solaris7 gets confused (bugid 4218508) if these have 1301 /* solaris7 gets confused (bugid 4218508) if these have
1302 * the high bit set, so just clear the high bits. 1302 * the high bit set, so just clear the high bits.
1303 */ 1303 */
1304 v_mtime = verifier[0]&0x7fffffff; 1304 v_mtime = verifier[0]&0x7fffffff;
1305 v_atime = verifier[1]&0x7fffffff; 1305 v_atime = verifier[1]&0x7fffffff;
1306 } 1306 }
1307 1307
1308 if (dchild->d_inode) { 1308 if (dchild->d_inode) {
1309 err = 0; 1309 err = 0;
1310 1310
1311 switch (createmode) { 1311 switch (createmode) {
1312 case NFS3_CREATE_UNCHECKED: 1312 case NFS3_CREATE_UNCHECKED:
1313 if (!S_ISREG(dchild->d_inode->i_mode)) 1313 if (!S_ISREG(dchild->d_inode->i_mode))
1314 err = nfserr_exist; 1314 err = nfserr_exist;
1315 else if (truncp) { 1315 else if (truncp) {
1316 /* in nfsv4, we need to treat this case a little 1316 /* in nfsv4, we need to treat this case a little
1317 * differently. we don't want to truncate the 1317 * differently. we don't want to truncate the
1318 * file now; this would be wrong if the OPEN 1318 * file now; this would be wrong if the OPEN
1319 * fails for some other reason. furthermore, 1319 * fails for some other reason. furthermore,
1320 * if the size is nonzero, we should ignore it 1320 * if the size is nonzero, we should ignore it
1321 * according to spec! 1321 * according to spec!
1322 */ 1322 */
1323 *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size; 1323 *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size;
1324 } 1324 }
1325 else { 1325 else {
1326 iap->ia_valid &= ATTR_SIZE; 1326 iap->ia_valid &= ATTR_SIZE;
1327 goto set_attr; 1327 goto set_attr;
1328 } 1328 }
1329 break; 1329 break;
1330 case NFS3_CREATE_EXCLUSIVE: 1330 case NFS3_CREATE_EXCLUSIVE:
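/*
 * If the timestamps carry our verifier and the file is empty, this
 * is a retransmission of the same exclusive create; report success.
 */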
1331 if (dchild->d_inode->i_mtime.tv_sec == v_mtime 1331 if (dchild->d_inode->i_mtime.tv_sec == v_mtime
1332 && dchild->d_inode->i_atime.tv_sec == v_atime 1332 && dchild->d_inode->i_atime.tv_sec == v_atime
1333 && dchild->d_inode->i_size == 0) 1333 && dchild->d_inode->i_size == 0)
1334 break; 1334 break;
1335 /* fallthru */ 1335 /* fallthru */
1336 case NFS3_CREATE_GUARDED: 1336 case NFS3_CREATE_GUARDED:
1337 err = nfserr_exist; 1337 err = nfserr_exist;
1338 } 1338 }
1339 goto out; 1339 goto out;
1340 } 1340 }
1341 1341
1342 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1342 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1343 if (host_err < 0) 1343 if (host_err < 0)
1344 goto out_nfserr; 1344 goto out_nfserr;
1345 if (created) 1345 if (created)
1346 *created = 1; 1346 *created = 1;
1347 1347
1348 if (EX_ISSYNC(fhp->fh_export)) { 1348 if (EX_ISSYNC(fhp->fh_export)) {
1349 err = nfserrno(nfsd_sync_dir(dentry)); 1349 err = nfserrno(nfsd_sync_dir(dentry));
1350 /* setattr will sync the child (or not) */ 1350 /* setattr will sync the child (or not) */
1351 } 1351 }
1352 1352
1353 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1353 if (createmode == NFS3_CREATE_EXCLUSIVE) {
1354 /* Cram the verifier into atime/mtime */ 1354 /* Cram the verifier into atime/mtime */
1355 iap->ia_valid = ATTR_MTIME|ATTR_ATIME 1355 iap->ia_valid = ATTR_MTIME|ATTR_ATIME
1356 | ATTR_MTIME_SET|ATTR_ATIME_SET; 1356 | ATTR_MTIME_SET|ATTR_ATIME_SET;
1357 /* XXX someone who knows this better please fix it for nsec */ 1357 /* XXX someone who knows this better please fix it for nsec */
1358 iap->ia_mtime.tv_sec = v_mtime; 1358 iap->ia_mtime.tv_sec = v_mtime;
1359 iap->ia_atime.tv_sec = v_atime; 1359 iap->ia_atime.tv_sec = v_atime;
1360 iap->ia_mtime.tv_nsec = 0; 1360 iap->ia_mtime.tv_nsec = 0;
1361 iap->ia_atime.tv_nsec = 0; 1361 iap->ia_atime.tv_nsec = 0;
1362 } 1362 }
1363 1363
1364 /* Set file attributes. 1364 /* Set file attributes.
1365 * Irix appears to send along the gid when it tries to 1365 * Irix appears to send along the gid when it tries to
1366 * implement setgid directories via NFS. Clear out all that cruft. 1366 * implement setgid directories via NFS. Clear out all that cruft.
1367 */ 1367 */
1368 set_attr: 1368 set_attr:
1369 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) { 1369 if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) {
1370 __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); 1370 __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
1371 if (err2) 1371 if (err2)
1372 err = err2; 1372 err = err2;
1373 } 1373 }
1374 1374
1375 /* 1375 /*
1376 * Update the filehandle to get the new inode info. 1376 * Update the filehandle to get the new inode info.
1377 */ 1377 */
1378 if (!err) 1378 if (!err)
1379 err = fh_update(resfhp); 1379 err = fh_update(resfhp);
1380 1380
1381 out: 1381 out:
1382 fh_unlock(fhp); 1382 fh_unlock(fhp);
1383 if (dchild && !IS_ERR(dchild)) 1383 if (dchild && !IS_ERR(dchild))
1384 dput(dchild); 1384 dput(dchild);
1385 return err; 1385 return err;
1386 1386
1387 out_nfserr: 1387 out_nfserr:
1388 err = nfserrno(host_err); 1388 err = nfserrno(host_err);
1389 goto out; 1389 goto out;
1390 } 1390 }
1391 #endif /* CONFIG_NFSD_V3 */ 1391 #endif /* CONFIG_NFSD_V3 */
1392 1392
1393 /* 1393 /*
1394 * Read a symlink. On entry, *lenp must contain the maximum path length that 1394 * Read a symlink. On entry, *lenp must contain the maximum path length that
1395 * fits into the buffer. On return, it contains the true length. 1395 * fits into the buffer. On return, it contains the true length.
1396 * N.B. After this call fhp needs an fh_put 1396 * N.B. After this call fhp needs an fh_put
1397 */ 1397 */
1398 __be32 1398 __be32
1399 nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) 1399 nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
1400 { 1400 {
1401 struct dentry *dentry; 1401 struct dentry *dentry;
1402 struct inode *inode; 1402 struct inode *inode;
1403 mm_segment_t oldfs; 1403 mm_segment_t oldfs;
1404 __be32 err; 1404 __be32 err;
1405 int host_err; 1405 int host_err;
1406 1406
1407 err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP); 1407 err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP);
1408 if (err) 1408 if (err)
1409 goto out; 1409 goto out;
1410 1410
1411 dentry = fhp->fh_dentry; 1411 dentry = fhp->fh_dentry;
1412 inode = dentry->d_inode; 1412 inode = dentry->d_inode;
1413 1413
1414 err = nfserr_inval; 1414 err = nfserr_inval;
1415 if (!inode->i_op || !inode->i_op->readlink) 1415 if (!inode->i_op || !inode->i_op->readlink)
1416 goto out; 1416 goto out;
1417 1417
1418 touch_atime(fhp->fh_export->ex_mnt, dentry); 1418 touch_atime(fhp->fh_export->ex_mnt, dentry);
1419 /* N.B. Why does this call need a get_fs()?? 1419 /* N.B. Why does this call need a get_fs()??
1420 * Remove the set_fs and watch the fireworks:-) --okir 1420 * Remove the set_fs and watch the fireworks:-) --okir
1421 */ 1421 */
1422 1422
1423 oldfs = get_fs(); set_fs(KERNEL_DS); 1423 oldfs = get_fs(); set_fs(KERNEL_DS);
1424 host_err = inode->i_op->readlink(dentry, buf, *lenp); 1424 host_err = inode->i_op->readlink(dentry, buf, *lenp);
1425 set_fs(oldfs); 1425 set_fs(oldfs);
1426 1426
1427 if (host_err < 0) 1427 if (host_err < 0)
1428 goto out_nfserr; 1428 goto out_nfserr;
1429 *lenp = host_err; 1429 *lenp = host_err;
1430 err = 0; 1430 err = 0;
1431 out: 1431 out:
1432 return err; 1432 return err;
1433 1433
1434 out_nfserr: 1434 out_nfserr:
1435 err = nfserrno(host_err); 1435 err = nfserrno(host_err);
1436 goto out; 1436 goto out;
1437 } 1437 }
1438 1438
1439 /* 1439 /*
1440 * Create a symlink and look up its inode 1440 * Create a symlink and look up its inode
1441 * N.B. After this call _both_ fhp and resfhp need an fh_put 1441 * N.B. After this call _both_ fhp and resfhp need an fh_put
1442 */ 1442 */
1443 __be32 1443 __be32
1444 nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, 1444 nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1445 char *fname, int flen, 1445 char *fname, int flen,
1446 char *path, int plen, 1446 char *path, int plen,
1447 struct svc_fh *resfhp, 1447 struct svc_fh *resfhp,
1448 struct iattr *iap) 1448 struct iattr *iap)
1449 { 1449 {
1450 struct dentry *dentry, *dnew; 1450 struct dentry *dentry, *dnew;
1451 __be32 err, cerr; 1451 __be32 err, cerr;
1452 int host_err; 1452 int host_err;
1453 umode_t mode; 1453 umode_t mode;
1454 1454
1455 err = nfserr_noent; 1455 err = nfserr_noent;
1456 if (!flen || !plen) 1456 if (!flen || !plen)
1457 goto out; 1457 goto out;
1458 err = nfserr_exist; 1458 err = nfserr_exist;
1459 if (isdotent(fname, flen)) 1459 if (isdotent(fname, flen))
1460 goto out; 1460 goto out;
1461 1461
1462 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); 1462 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
1463 if (err) 1463 if (err)
1464 goto out; 1464 goto out;
1465 fh_lock(fhp); 1465 fh_lock(fhp);
1466 dentry = fhp->fh_dentry; 1466 dentry = fhp->fh_dentry;
1467 dnew = lookup_one_len(fname, dentry, flen); 1467 dnew = lookup_one_len(fname, dentry, flen);
1468 host_err = PTR_ERR(dnew); 1468 host_err = PTR_ERR(dnew);
1469 if (IS_ERR(dnew)) 1469 if (IS_ERR(dnew))
1470 goto out_nfserr; 1470 goto out_nfserr;
1471 1471
1472 mode = S_IALLUGO; 1472 mode = S_IALLUGO;
1473 /* Only the MODE ATTRibute is even vaguely meaningful */ 1473 /* Only the MODE ATTRibute is even vaguely meaningful */
1474 if (iap && (iap->ia_valid & ATTR_MODE)) 1474 if (iap && (iap->ia_valid & ATTR_MODE))
1475 mode = iap->ia_mode & S_IALLUGO; 1475 mode = iap->ia_mode & S_IALLUGO;
1476 1476
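/*
 * The path that came off the wire is not guaranteed to be NUL
 * terminated; if it isn't, copy it into a terminated buffer before
 * handing it to vfs_symlink().
 */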
1477 if (unlikely(path[plen] != 0)) { 1477 if (unlikely(path[plen] != 0)) {
1478 char *path_alloced = kmalloc(plen+1, GFP_KERNEL); 1478 char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
1479 if (path_alloced == NULL) 1479 if (path_alloced == NULL)
1480 host_err = -ENOMEM; 1480 host_err = -ENOMEM;
1481 else { 1481 else {
1482 strncpy(path_alloced, path, plen); 1482 strncpy(path_alloced, path, plen);
1483 path_alloced[plen] = 0; 1483 path_alloced[plen] = 0;
1484 host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode); 1484 host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode);
1485 kfree(path_alloced); 1485 kfree(path_alloced);
1486 } 1486 }
1487 } else 1487 } else
1488 host_err = vfs_symlink(dentry->d_inode, dnew, path, mode); 1488 host_err = vfs_symlink(dentry->d_inode, dnew, path, mode);
1489 1489
1490 if (!host_err) { 1490 if (!host_err) {
1491 if (EX_ISSYNC(fhp->fh_export)) 1491 if (EX_ISSYNC(fhp->fh_export))
1492 host_err = nfsd_sync_dir(dentry); 1492 host_err = nfsd_sync_dir(dentry);
1493 } 1493 }
1494 err = nfserrno(host_err); 1494 err = nfserrno(host_err);
1495 fh_unlock(fhp); 1495 fh_unlock(fhp);
1496 1496
1497 cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); 1497 cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
1498 dput(dnew); 1498 dput(dnew);
1499 if (err==0) err = cerr; 1499 if (err==0) err = cerr;
1500 out: 1500 out:
1501 return err; 1501 return err;
1502 1502
1503 out_nfserr: 1503 out_nfserr:
1504 err = nfserrno(host_err); 1504 err = nfserrno(host_err);
1505 goto out; 1505 goto out;
1506 } 1506 }
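
A side note on the path handling above: NFS hands nfsd_symlink() a length-counted string that may not be NUL-terminated, so the code copies it into a fresh buffer only when path[plen] is non-zero. A minimal user-space sketch of the same defensive pattern follows; make_terminated() is a hypothetical helper, not a kernel function.

	#include <stdlib.h>
	#include <string.h>

	/* Return a NUL-terminated view of buf[0..len). If buf happens to
	 * be terminated already, use it in place; otherwise allocate a
	 * copy. *allocated tells the caller whether free() is needed
	 * afterwards. Illustrative sketch of the nfsd_symlink() logic. */
	static char *make_terminated(char *buf, size_t len, int *allocated)
	{
		char *copy;

		*allocated = 0;
		if (buf[len] == '\0')		/* already terminated */
			return buf;

		copy = malloc(len + 1);
		if (!copy)
			return NULL;		/* caller maps this to -ENOMEM */
		memcpy(copy, buf, len);
		copy[len] = '\0';
		*allocated = 1;
		return copy;
	}
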
1507 1507
1508 /* 1508 /*
1509 * Create a hardlink 1509 * Create a hardlink
1510 * N.B. After this call _both_ ffhp and tfhp need an fh_put 1510 * N.B. After this call _both_ ffhp and tfhp need an fh_put
1511 */ 1511 */
1512 __be32 1512 __be32
1513 nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, 1513 nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1514 char *name, int len, struct svc_fh *tfhp) 1514 char *name, int len, struct svc_fh *tfhp)
1515 { 1515 {
1516 struct dentry *ddir, *dnew, *dold; 1516 struct dentry *ddir, *dnew, *dold;
1517 struct inode *dirp, *dest; 1517 struct inode *dirp, *dest;
1518 __be32 err; 1518 __be32 err;
1519 int host_err; 1519 int host_err;
1520 1520
1521 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE); 1521 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE);
1522 if (err) 1522 if (err)
1523 goto out; 1523 goto out;
1524 err = fh_verify(rqstp, tfhp, -S_IFDIR, MAY_NOP); 1524 err = fh_verify(rqstp, tfhp, -S_IFDIR, MAY_NOP);
1525 if (err) 1525 if (err)
1526 goto out; 1526 goto out;
1527 1527
1528 err = nfserr_perm; 1528 err = nfserr_perm;
1529 if (!len) 1529 if (!len)
1530 goto out; 1530 goto out;
1531 err = nfserr_exist; 1531 err = nfserr_exist;
1532 if (isdotent(name, len)) 1532 if (isdotent(name, len))
1533 goto out; 1533 goto out;
1534 1534
1535 fh_lock_nested(ffhp, I_MUTEX_PARENT); 1535 fh_lock_nested(ffhp, I_MUTEX_PARENT);
1536 ddir = ffhp->fh_dentry; 1536 ddir = ffhp->fh_dentry;
1537 dirp = ddir->d_inode; 1537 dirp = ddir->d_inode;
1538 1538
1539 dnew = lookup_one_len(name, ddir, len); 1539 dnew = lookup_one_len(name, ddir, len);
1540 host_err = PTR_ERR(dnew); 1540 host_err = PTR_ERR(dnew);
1541 if (IS_ERR(dnew)) 1541 if (IS_ERR(dnew))
1542 goto out_nfserr; 1542 goto out_nfserr;
1543 1543
1544 dold = tfhp->fh_dentry; 1544 dold = tfhp->fh_dentry;
1545 dest = dold->d_inode; 1545 dest = dold->d_inode;
1546 1546
1547 host_err = vfs_link(dold, dirp, dnew); 1547 host_err = vfs_link(dold, dirp, dnew);
1548 if (!host_err) { 1548 if (!host_err) {
1549 if (EX_ISSYNC(ffhp->fh_export)) { 1549 if (EX_ISSYNC(ffhp->fh_export)) {
1550 err = nfserrno(nfsd_sync_dir(ddir)); 1550 err = nfserrno(nfsd_sync_dir(ddir));
1551 write_inode_now(dest, 1); 1551 write_inode_now(dest, 1);
1552 } 1552 }
1553 err = 0; 1553 err = 0;
1554 } else { 1554 } else {
1555 if (host_err == -EXDEV && rqstp->rq_vers == 2) 1555 if (host_err == -EXDEV && rqstp->rq_vers == 2)
1556 err = nfserr_acces; 1556 err = nfserr_acces;
1557 else 1557 else
1558 err = nfserrno(host_err); 1558 err = nfserrno(host_err);
1559 } 1559 }
1560 1560
1561 dput(dnew); 1561 dput(dnew);
1562 out_unlock: 1562 out_unlock:
1563 fh_unlock(ffhp); 1563 fh_unlock(ffhp);
1564 out: 1564 out:
1565 return err; 1565 return err;
1566 1566
1567 out_nfserr: 1567 out_nfserr:
1568 err = nfserrno(host_err); 1568 err = nfserrno(host_err);
1569 goto out_unlock; 1569 goto out_unlock;
1570 } 1570 }
1571 1571
1572 /* 1572 /*
1573 * Rename a file 1573 * Rename a file
1574 * N.B. After this call _both_ ffhp and tfhp need an fh_put 1574 * N.B. After this call _both_ ffhp and tfhp need an fh_put
1575 */ 1575 */
1576 __be32 1576 __be32
1577 nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, 1577 nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1578 struct svc_fh *tfhp, char *tname, int tlen) 1578 struct svc_fh *tfhp, char *tname, int tlen)
1579 { 1579 {
1580 struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; 1580 struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap;
1581 struct inode *fdir, *tdir; 1581 struct inode *fdir, *tdir;
1582 __be32 err; 1582 __be32 err;
1583 int host_err; 1583 int host_err;
1584 1584
1585 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE); 1585 err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE);
1586 if (err) 1586 if (err)
1587 goto out; 1587 goto out;
1588 err = fh_verify(rqstp, tfhp, S_IFDIR, MAY_CREATE); 1588 err = fh_verify(rqstp, tfhp, S_IFDIR, MAY_CREATE);
1589 if (err) 1589 if (err)
1590 goto out; 1590 goto out;
1591 1591
1592 fdentry = ffhp->fh_dentry; 1592 fdentry = ffhp->fh_dentry;
1593 fdir = fdentry->d_inode; 1593 fdir = fdentry->d_inode;
1594 1594
1595 tdentry = tfhp->fh_dentry; 1595 tdentry = tfhp->fh_dentry;
1596 tdir = tdentry->d_inode; 1596 tdir = tdentry->d_inode;
1597 1597
1598 err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev; 1598 err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
1599 if (ffhp->fh_export != tfhp->fh_export) 1599 if (ffhp->fh_export != tfhp->fh_export)
1600 goto out; 1600 goto out;
1601 1601
1602 err = nfserr_perm; 1602 err = nfserr_perm;
1603 if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) 1603 if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
1604 goto out; 1604 goto out;
1605 1605
1606 /* cannot use fh_lock as we need deadlock protective ordering 1606 /* cannot use fh_lock as we need deadlock protective ordering
1607 * so do it by hand */ 1607 * so do it by hand */
1608 trap = lock_rename(tdentry, fdentry); 1608 trap = lock_rename(tdentry, fdentry);
1609 ffhp->fh_locked = tfhp->fh_locked = 1; 1609 ffhp->fh_locked = tfhp->fh_locked = 1;
1610 fill_pre_wcc(ffhp); 1610 fill_pre_wcc(ffhp);
1611 fill_pre_wcc(tfhp); 1611 fill_pre_wcc(tfhp);
1612 1612
1613 odentry = lookup_one_len(fname, fdentry, flen); 1613 odentry = lookup_one_len(fname, fdentry, flen);
1614 host_err = PTR_ERR(odentry); 1614 host_err = PTR_ERR(odentry);
1615 if (IS_ERR(odentry)) 1615 if (IS_ERR(odentry))
1616 goto out_nfserr; 1616 goto out_nfserr;
1617 1617
1618 host_err = -ENOENT; 1618 host_err = -ENOENT;
1619 if (!odentry->d_inode) 1619 if (!odentry->d_inode)
1620 goto out_dput_old; 1620 goto out_dput_old;
1621 host_err = -EINVAL; 1621 host_err = -EINVAL;
1622 if (odentry == trap) 1622 if (odentry == trap)
1623 goto out_dput_old; 1623 goto out_dput_old;
1624 1624
1625 ndentry = lookup_one_len(tname, tdentry, tlen); 1625 ndentry = lookup_one_len(tname, tdentry, tlen);
1626 host_err = PTR_ERR(ndentry); 1626 host_err = PTR_ERR(ndentry);
1627 if (IS_ERR(ndentry)) 1627 if (IS_ERR(ndentry))
1628 goto out_dput_old; 1628 goto out_dput_old;
1629 host_err = -ENOTEMPTY; 1629 host_err = -ENOTEMPTY;
1630 if (ndentry == trap) 1630 if (ndentry == trap)
1631 goto out_dput_new; 1631 goto out_dput_new;
1632 1632
1633 #ifdef MSNFS 1633 #ifdef MSNFS
1634 if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1634 if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
1635 ((atomic_read(&odentry->d_count) > 1) 1635 ((atomic_read(&odentry->d_count) > 1)
1636 || (atomic_read(&ndentry->d_count) > 1))) { 1636 || (atomic_read(&ndentry->d_count) > 1))) {
1637 host_err = -EPERM; 1637 host_err = -EPERM;
1638 } else 1638 } else
1639 #endif 1639 #endif
1640 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1640 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1641 if (!host_err && EX_ISSYNC(tfhp->fh_export)) { 1641 if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
1642 host_err = nfsd_sync_dir(tdentry); 1642 host_err = nfsd_sync_dir(tdentry);
1643 if (!host_err) 1643 if (!host_err)
1644 host_err = nfsd_sync_dir(fdentry); 1644 host_err = nfsd_sync_dir(fdentry);
1645 } 1645 }
1646 1646
1647 out_dput_new: 1647 out_dput_new:
1648 dput(ndentry); 1648 dput(ndentry);
1649 out_dput_old: 1649 out_dput_old:
1650 dput(odentry); 1650 dput(odentry);
1651 out_nfserr: 1651 out_nfserr:
1652 err = nfserrno(host_err); 1652 err = nfserrno(host_err);
1653 1653
1654 /* we cannot rely on fh_unlock on the two filehandles, 1654 /* we cannot rely on fh_unlock on the two filehandles,
1655 * as that would do the wrong thing if the two directories 1655 * as that would do the wrong thing if the two directories
1656 * were the same, so again we do it by hand 1656 * were the same, so again we do it by hand
1657 */ 1657 */
1658 fill_post_wcc(ffhp); 1658 fill_post_wcc(ffhp);
1659 fill_post_wcc(tfhp); 1659 fill_post_wcc(tfhp);
1660 unlock_rename(tdentry, fdentry); 1660 unlock_rename(tdentry, fdentry);
1661 ffhp->fh_locked = tfhp->fh_locked = 0; 1661 ffhp->fh_locked = tfhp->fh_locked = 0;
1662 1662
1663 out: 1663 out:
1664 return err; 1664 return err;
1665 } 1665 }
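
The rename path cannot use fh_lock() because taking two directory mutexes in arbitrary order invites AB-BA deadlock; lock_rename() acquires them in a consistent order and returns the "trap" dentry that the lookups must not resolve to. The ordering trick on its own, as a small pthreads sketch (illustrative only; the kernel version additionally handles the ancestor/descendant cases):

	#include <pthread.h>
	#include <stdint.h>

	/* Acquire two locks in a globally consistent (address) order, so
	 * two threads locking the same pair with swapped arguments
	 * cannot deadlock. */
	static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
	{
		pthread_mutex_t *tmp;

		if (a == b) {			/* same directory: one lock */
			pthread_mutex_lock(a);
			return;
		}
		if ((uintptr_t)a > (uintptr_t)b) {	/* normalize order */
			tmp = a;
			a = b;
			b = tmp;
		}
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	}
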
1666 1666
1667 /* 1667 /*
1668 * Unlink a file or directory 1668 * Unlink a file or directory
1669 * N.B. After this call fhp needs an fh_put 1669 * N.B. After this call fhp needs an fh_put
1670 */ 1670 */
1671 __be32 1671 __be32
1672 nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, 1672 nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1673 char *fname, int flen) 1673 char *fname, int flen)
1674 { 1674 {
1675 struct dentry *dentry, *rdentry; 1675 struct dentry *dentry, *rdentry;
1676 struct inode *dirp; 1676 struct inode *dirp;
1677 __be32 err; 1677 __be32 err;
1678 int host_err; 1678 int host_err;
1679 1679
1680 err = nfserr_acces; 1680 err = nfserr_acces;
1681 if (!flen || isdotent(fname, flen)) 1681 if (!flen || isdotent(fname, flen))
1682 goto out; 1682 goto out;
1683 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_REMOVE); 1683 err = fh_verify(rqstp, fhp, S_IFDIR, MAY_REMOVE);
1684 if (err) 1684 if (err)
1685 goto out; 1685 goto out;
1686 1686
1687 fh_lock_nested(fhp, I_MUTEX_PARENT); 1687 fh_lock_nested(fhp, I_MUTEX_PARENT);
1688 dentry = fhp->fh_dentry; 1688 dentry = fhp->fh_dentry;
1689 dirp = dentry->d_inode; 1689 dirp = dentry->d_inode;
1690 1690
1691 rdentry = lookup_one_len(fname, dentry, flen); 1691 rdentry = lookup_one_len(fname, dentry, flen);
1692 host_err = PTR_ERR(rdentry); 1692 host_err = PTR_ERR(rdentry);
1693 if (IS_ERR(rdentry)) 1693 if (IS_ERR(rdentry))
1694 goto out_nfserr; 1694 goto out_nfserr;
1695 1695
1696 if (!rdentry->d_inode) { 1696 if (!rdentry->d_inode) {
1697 dput(rdentry); 1697 dput(rdentry);
1698 err = nfserr_noent; 1698 err = nfserr_noent;
1699 goto out; 1699 goto out;
1700 } 1700 }
1701 1701
1702 if (!type) 1702 if (!type)
1703 type = rdentry->d_inode->i_mode & S_IFMT; 1703 type = rdentry->d_inode->i_mode & S_IFMT;
1704 1704
1705 if (type != S_IFDIR) { /* It's UNLINK */ 1705 if (type != S_IFDIR) { /* It's UNLINK */
1706 #ifdef MSNFS 1706 #ifdef MSNFS
1707 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1707 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
1708 (atomic_read(&rdentry->d_count) > 1)) { 1708 (atomic_read(&rdentry->d_count) > 1)) {
1709 host_err = -EPERM; 1709 host_err = -EPERM;
1710 } else 1710 } else
1711 #endif 1711 #endif
1712 host_err = vfs_unlink(dirp, rdentry); 1712 host_err = vfs_unlink(dirp, rdentry);
1713 } else { /* It's RMDIR */ 1713 } else { /* It's RMDIR */
1714 host_err = vfs_rmdir(dirp, rdentry); 1714 host_err = vfs_rmdir(dirp, rdentry);
1715 } 1715 }
1716 1716
1717 dput(rdentry); 1717 dput(rdentry);
1718 1718
1719 if (host_err) 1719 if (host_err)
1720 goto out_nfserr; 1720 goto out_nfserr;
1721 if (EX_ISSYNC(fhp->fh_export)) 1721 if (EX_ISSYNC(fhp->fh_export))
1722 host_err = nfsd_sync_dir(dentry); 1722 host_err = nfsd_sync_dir(dentry);
1723 1723
1724 out_nfserr: 1724 out_nfserr:
1725 err = nfserrno(host_err); 1725 err = nfserrno(host_err);
1726 out: 1726 out:
1727 return err; 1727 return err;
1728 } 1728 }
1729 1729
1730 /* 1730 /*
1731 * Read entries from a directory. 1731 * Read entries from a directory.
1732 * We ignore the NFSv3/4 verifier for now. 1732 * We ignore the NFSv3/4 verifier for now.
1733 */ 1733 */
1734 __be32 1734 __be32
1735 nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, 1735 nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
1736 struct readdir_cd *cdp, filldir_t func) 1736 struct readdir_cd *cdp, filldir_t func)
1737 { 1737 {
1738 __be32 err; 1738 __be32 err;
1739 int host_err; 1739 int host_err;
1740 struct file *file; 1740 struct file *file;
1741 loff_t offset = *offsetp; 1741 loff_t offset = *offsetp;
1742 1742
1743 err = nfsd_open(rqstp, fhp, S_IFDIR, MAY_READ, &file); 1743 err = nfsd_open(rqstp, fhp, S_IFDIR, MAY_READ, &file);
1744 if (err) 1744 if (err)
1745 goto out; 1745 goto out;
1746 1746
1747 offset = vfs_llseek(file, offset, 0); 1747 offset = vfs_llseek(file, offset, 0);
1748 if (offset < 0) { 1748 if (offset < 0) {
1749 err = nfserrno((int)offset); 1749 err = nfserrno((int)offset);
1750 goto out_close; 1750 goto out_close;
1751 } 1751 }
1752 1752
1753 /* 1753 /*
1754 * Read the directory entries. This silly loop is necessary because 1754 * Read the directory entries. This silly loop is necessary because
1755 * readdir() is not guaranteed to fill up the entire buffer, but 1755 * readdir() is not guaranteed to fill up the entire buffer, but
1756 * may choose to do less. 1756 * may choose to do less.
1757 */ 1757 */
1758 1758
1759 do { 1759 do {
1760 cdp->err = nfserr_eof; /* will be cleared on successful read */ 1760 cdp->err = nfserr_eof; /* will be cleared on successful read */
1761 host_err = vfs_readdir(file, func, cdp); 1761 host_err = vfs_readdir(file, func, cdp);
1762 } while (host_err >= 0 && cdp->err == nfs_ok); 1762 } while (host_err >= 0 && cdp->err == nfs_ok);
1763 if (host_err) 1763 if (host_err)
1764 err = nfserrno(host_err); 1764 err = nfserrno(host_err);
1765 else 1765 else
1766 err = cdp->err; 1766 err = cdp->err;
1767 *offsetp = vfs_llseek(file, 0, 1); 1767 *offsetp = vfs_llseek(file, 0, 1);
1768 1768
1769 if (err == nfserr_eof || err == nfserr_toosmall) 1769 if (err == nfserr_eof || err == nfserr_toosmall)
1770 err = nfs_ok; /* can still be found in ->err */ 1770 err = nfs_ok; /* can still be found in ->err */
1771 out_close: 1771 out_close:
1772 nfsd_close(file); 1772 nfsd_close(file);
1773 out: 1773 out:
1774 return err; 1774 return err;
1775 } 1775 }
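
The loop here exists because a single vfs_readdir() call may emit fewer entries than the buffer allows; nfsd keeps calling until the fill callback stops reporting nfs_ok or an error comes back. The same drain-until-empty shape with the raw Linux getdents64 syscall, as a user-space, Linux-specific sketch:

	#include <sys/syscall.h>
	#include <unistd.h>

	/* Read a whole directory through getdents64(), which, like
	 * vfs_readdir(), may return fewer bytes than the buffer holds
	 * on any given call. Returns total entry bytes, or -1 on error. */
	static long drain_dir(int fd, char *buf, unsigned long bufsz)
	{
		long n, total = 0;

		for (;;) {
			n = syscall(SYS_getdents64, fd, buf, bufsz);
			if (n < 0)
				return -1;	/* error: see errno */
			if (n == 0)
				return total;	/* end of directory */
			total += n;		/* short read is normal: loop */
		}
	}
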
1776 1776
1777 /* 1777 /*
1778 * Get file system stats 1778 * Get file system stats
1779 * N.B. After this call fhp needs an fh_put 1779 * N.B. After this call fhp needs an fh_put
1780 */ 1780 */
1781 __be32 1781 __be32
1782 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) 1782 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
1783 { 1783 {
1784 __be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP); 1784 __be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP);
1785 if (!err && vfs_statfs(fhp->fh_dentry,stat)) 1785 if (!err && vfs_statfs(fhp->fh_dentry,stat))
1786 err = nfserr_io; 1786 err = nfserr_io;
1787 return err; 1787 return err;
1788 } 1788 }
1789 1789
1790 /* 1790 /*
1791 * Check for a user's access permissions to this inode. 1791 * Check for a user's access permissions to this inode.
1792 */ 1792 */
1793 __be32 1793 __be32
1794 nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc) 1794 nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc)
1795 { 1795 {
1796 struct inode *inode = dentry->d_inode; 1796 struct inode *inode = dentry->d_inode;
1797 int err; 1797 int err;
1798 1798
1799 if (acc == MAY_NOP) 1799 if (acc == MAY_NOP)
1800 return 0; 1800 return 0;
1801 #if 0 1801 #if 0
1802 dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n", 1802 dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
1803 acc, 1803 acc,
1804 (acc & MAY_READ)? " read" : "", 1804 (acc & MAY_READ)? " read" : "",
1805 (acc & MAY_WRITE)? " write" : "", 1805 (acc & MAY_WRITE)? " write" : "",
1806 (acc & MAY_EXEC)? " exec" : "", 1806 (acc & MAY_EXEC)? " exec" : "",
1807 (acc & MAY_SATTR)? " sattr" : "", 1807 (acc & MAY_SATTR)? " sattr" : "",
1808 (acc & MAY_TRUNC)? " trunc" : "", 1808 (acc & MAY_TRUNC)? " trunc" : "",
1809 (acc & MAY_LOCK)? " lock" : "", 1809 (acc & MAY_LOCK)? " lock" : "",
1810 (acc & MAY_OWNER_OVERRIDE)? " owneroverride" : "", 1810 (acc & MAY_OWNER_OVERRIDE)? " owneroverride" : "",
1811 inode->i_mode, 1811 inode->i_mode,
1812 IS_IMMUTABLE(inode)? " immut" : "", 1812 IS_IMMUTABLE(inode)? " immut" : "",
1813 IS_APPEND(inode)? " append" : "", 1813 IS_APPEND(inode)? " append" : "",
1814 IS_RDONLY(inode)? " ro" : ""); 1814 IS_RDONLY(inode)? " ro" : "");
1815 dprintk(" owner %d/%d user %d/%d\n", 1815 dprintk(" owner %d/%d user %d/%d\n",
1816 inode->i_uid, inode->i_gid, current->fsuid, current->fsgid); 1816 inode->i_uid, inode->i_gid, current->fsuid, current->fsgid);
1817 #endif 1817 #endif
1818 1818
1819 /* Normally we reject any write/sattr etc access on a read-only file 1819 /* Normally we reject any write/sattr etc access on a read-only file
1820 * system. But if it is IRIX doing a check on write-access for a 1820 * system. But if it is IRIX doing a check on write-access for a
1821 * device special file, we ignore rofs. 1821 * device special file, we ignore rofs.
1822 */ 1822 */
1823 if (!(acc & MAY_LOCAL_ACCESS)) 1823 if (!(acc & MAY_LOCAL_ACCESS))
1824 if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) { 1824 if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
1825 if (EX_RDONLY(exp) || IS_RDONLY(inode)) 1825 if (EX_RDONLY(exp) || IS_RDONLY(inode))
1826 return nfserr_rofs; 1826 return nfserr_rofs;
1827 if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode)) 1827 if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
1828 return nfserr_perm; 1828 return nfserr_perm;
1829 } 1829 }
1830 if ((acc & MAY_TRUNC) && IS_APPEND(inode)) 1830 if ((acc & MAY_TRUNC) && IS_APPEND(inode))
1831 return nfserr_perm; 1831 return nfserr_perm;
1832 1832
1833 if (acc & MAY_LOCK) { 1833 if (acc & MAY_LOCK) {
1834 /* If we cannot rely on authentication in NLM requests, 1834 /* If we cannot rely on authentication in NLM requests,
1835 * just allow locks, otherwise require read permission, or 1835 * just allow locks, otherwise require read permission, or
1836 * ownership 1836 * ownership
1837 */ 1837 */
1838 if (exp->ex_flags & NFSEXP_NOAUTHNLM) 1838 if (exp->ex_flags & NFSEXP_NOAUTHNLM)
1839 return 0; 1839 return 0;
1840 else 1840 else
1841 acc = MAY_READ | MAY_OWNER_OVERRIDE; 1841 acc = MAY_READ | MAY_OWNER_OVERRIDE;
1842 } 1842 }
1843 /* 1843 /*
1844 * The file owner always gets access permission for accesses that 1844 * The file owner always gets access permission for accesses that
1845 * would normally be checked at open time. This is to make 1845 * would normally be checked at open time. This is to make
1846 * file access work even when the client has done a fchmod(fd, 0). 1846 * file access work even when the client has done a fchmod(fd, 0).
1847 * 1847 *
1848 * However, `cp foo bar' should fail nevertheless when bar is 1848 * However, `cp foo bar' should fail nevertheless when bar is
1849 * readonly. A sensible way to do this might be to reject all 1849 * readonly. A sensible way to do this might be to reject all
1850 * attempts to truncate a read-only file, because a creat() call 1850 * attempts to truncate a read-only file, because a creat() call
1851 * always implies file truncation. 1851 * always implies file truncation.
1852 * ... but this isn't really fair. A process may reasonably call 1852 * ... but this isn't really fair. A process may reasonably call
1853 * ftruncate on an open file descriptor on a file with perm 000. 1853 * ftruncate on an open file descriptor on a file with perm 000.
1854 * We must trust the client to do permission checking - using "ACCESS" 1854 * We must trust the client to do permission checking - using "ACCESS"
1855 * with NFSv3. 1855 * with NFSv3.
1856 */ 1856 */
1857 if ((acc & MAY_OWNER_OVERRIDE) && 1857 if ((acc & MAY_OWNER_OVERRIDE) &&
1858 inode->i_uid == current->fsuid) 1858 inode->i_uid == current->fsuid)
1859 return 0; 1859 return 0;
1860 1860
1861 err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL); 1861 err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL);
1862 1862
1863 /* Allow read access to binaries even when mode 111 */ 1863 /* Allow read access to binaries even when mode 111 */
1864 if (err == -EACCES && S_ISREG(inode->i_mode) && 1864 if (err == -EACCES && S_ISREG(inode->i_mode) &&
1865 acc == (MAY_READ | MAY_OWNER_OVERRIDE)) 1865 acc == (MAY_READ | MAY_OWNER_OVERRIDE))
1866 err = permission(inode, MAY_EXEC, NULL); 1866 err = permission(inode, MAY_EXEC, NULL);
1867 1867
1868 return err? nfserrno(err) : 0; 1868 return err? nfserrno(err) : 0;
1869 } 1869 }
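
The MAY_OWNER_OVERRIDE branch encodes the comment above: access types that would normally be checked once at open() are granted to the file owner outright, so a client that did fchmod(fd, 0) after opening keeps working. Reduced to its bones (field names here are illustrative, not nfsd's):

	#define MAY_READ		0x01
	#define MAY_WRITE		0x02
	#define MAY_OWNER_OVERRIDE	0x10

	struct creds { unsigned int fsuid; };
	struct nodeattr { unsigned int uid; };

	/* Returns nonzero when the mode-bit check can be skipped
	 * entirely: an open-time-style access by the file owner. */
	static int owner_override(const struct nodeattr *n,
				  const struct creds *c, int acc)
	{
		return (acc & MAY_OWNER_OVERRIDE) && n->uid == c->fsuid;
	}
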
1870 1870
1871 void 1871 void
1872 nfsd_racache_shutdown(void) 1872 nfsd_racache_shutdown(void)
1873 { 1873 {
1874 if (!raparml) 1874 if (!raparml)
1875 return; 1875 return;
1876 dprintk("nfsd: freeing readahead buffers.\n"); 1876 dprintk("nfsd: freeing readahead buffers.\n");
1877 kfree(raparml); 1877 kfree(raparml);
1878 raparml = NULL; 1878 raparml = NULL;
1879 } 1879 }
1880 /* 1880 /*
1881 * Initialize readahead param cache 1881 * Initialize readahead param cache
1882 */ 1882 */
1883 int 1883 int
1884 nfsd_racache_init(int cache_size) 1884 nfsd_racache_init(int cache_size)
1885 { 1885 {
1886 int i; 1886 int i;
1887 int j = 0; 1887 int j = 0;
1888 int nperbucket; 1888 int nperbucket;
1889 1889
1890 1890
1891 if (raparml) 1891 if (raparml)
1892 return 0; 1892 return 0;
1893 if (cache_size < 2*RAPARM_HASH_SIZE) 1893 if (cache_size < 2*RAPARM_HASH_SIZE)
1894 cache_size = 2*RAPARM_HASH_SIZE; 1894 cache_size = 2*RAPARM_HASH_SIZE;
1895 raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL); 1895 raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL);
1896 1896
1897 if (!raparml) { 1897 if (!raparml) {
1898 printk(KERN_WARNING 1898 printk(KERN_WARNING
1899 "nfsd: Could not allocate memory read-ahead cache.\n"); 1899 "nfsd: Could not allocate memory read-ahead cache.\n");
1900 return -ENOMEM; 1900 return -ENOMEM;
1901 } 1901 }
1902 1902
1903 dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); 1903 dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
1904 for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) { 1904 for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) {
1905 raparm_hash[i].pb_head = NULL; 1905 raparm_hash[i].pb_head = NULL;
1906 spin_lock_init(&raparm_hash[i].pb_lock); 1906 spin_lock_init(&raparm_hash[i].pb_lock);
1907 } 1907 }
1908 nperbucket = cache_size >> RAPARM_HASH_BITS; 1908 nperbucket = cache_size >> RAPARM_HASH_BITS;
1909 for (i = 0; i < cache_size - 1; i++) { 1909 for (i = 0; i < cache_size - 1; i++) {
1910 if (i % nperbucket == 0) 1910 if (i % nperbucket == 0)
1911 raparm_hash[j++].pb_head = raparml + i; 1911 raparm_hash[j++].pb_head = raparml + i;
1912 if (i % nperbucket < nperbucket-1) 1912 if (i % nperbucket < nperbucket-1)
1913 raparml[i].p_next = raparml + i + 1; 1913 raparml[i].p_next = raparml + i + 1;
1914 } 1914 }
1915 1915
1916 nfsdstats.ra_size = cache_size; 1916 nfsdstats.ra_size = cache_size;
1917 return 0; 1917 return 0;
1918 } 1918 }
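
The init loop above threads one flat kcalloc() array into RAPARM_HASH_SIZE singly linked chains: entry i opens a new bucket whenever i is a multiple of nperbucket, and links to entry i+1 while inside a bucket. A stand-alone sketch of that carving, assuming (as the kernel code effectively does) that n is at least 2 * HASH_SIZE:

	#define HASH_BITS	4
	#define HASH_SIZE	(1 << HASH_BITS)

	struct parm {
		struct parm *next;
	};

	static struct parm *heads[HASH_SIZE];

	/* Carve a flat pool of n entries into HASH_SIZE chains of
	 * roughly n >> HASH_BITS entries each. The bound check on j
	 * guards the case where n is not an exact multiple of the
	 * per-bucket count. */
	static void carve(struct parm *pool, int n)
	{
		int i, j = 0;
		int per = n >> HASH_BITS;

		for (i = 0; i < n - 1; i++) {
			if (i % per == 0 && j < HASH_SIZE)
				heads[j++] = &pool[i];	/* new bucket */
			if (i % per < per - 1)
				pool[i].next = &pool[i + 1];	/* chain */
		}
	}
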
1919 1919
1920 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) 1920 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
1921 struct posix_acl * 1921 struct posix_acl *
1922 nfsd_get_posix_acl(struct svc_fh *fhp, int type) 1922 nfsd_get_posix_acl(struct svc_fh *fhp, int type)
1923 { 1923 {
1924 struct inode *inode = fhp->fh_dentry->d_inode; 1924 struct inode *inode = fhp->fh_dentry->d_inode;
1925 char *name; 1925 char *name;
1926 void *value = NULL; 1926 void *value = NULL;
1927 ssize_t size; 1927 ssize_t size;
1928 struct posix_acl *acl; 1928 struct posix_acl *acl;
1929 1929
1930 if (!IS_POSIXACL(inode)) 1930 if (!IS_POSIXACL(inode))
1931 return ERR_PTR(-EOPNOTSUPP); 1931 return ERR_PTR(-EOPNOTSUPP);
1932 1932
1933 switch (type) { 1933 switch (type) {
1934 case ACL_TYPE_ACCESS: 1934 case ACL_TYPE_ACCESS:
1935 name = POSIX_ACL_XATTR_ACCESS; 1935 name = POSIX_ACL_XATTR_ACCESS;
1936 break; 1936 break;
1937 case ACL_TYPE_DEFAULT: 1937 case ACL_TYPE_DEFAULT:
1938 name = POSIX_ACL_XATTR_DEFAULT; 1938 name = POSIX_ACL_XATTR_DEFAULT;
1939 break; 1939 break;
1940 default: 1940 default:
1941 return ERR_PTR(-EOPNOTSUPP); 1941 return ERR_PTR(-EOPNOTSUPP);
1942 } 1942 }
1943 1943
1944 size = nfsd_getxattr(fhp->fh_dentry, name, &value); 1944 size = nfsd_getxattr(fhp->fh_dentry, name, &value);
1945 if (size < 0) 1945 if (size < 0)
1946 return ERR_PTR(size); 1946 return ERR_PTR(size);
1947 1947
1948 acl = posix_acl_from_xattr(value, size); 1948 acl = posix_acl_from_xattr(value, size);
1949 kfree(value); 1949 kfree(value);
1950 return acl; 1950 return acl;
1951 } 1951 }
1952 1952
1953 int 1953 int
1954 nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) 1954 nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
1955 { 1955 {
1956 struct inode *inode = fhp->fh_dentry->d_inode; 1956 struct inode *inode = fhp->fh_dentry->d_inode;
1957 char *name; 1957 char *name;
1958 void *value = NULL; 1958 void *value = NULL;
1959 size_t size; 1959 size_t size;
1960 int error; 1960 int error;
1961 1961
1962 if (!IS_POSIXACL(inode) || !inode->i_op || 1962 if (!IS_POSIXACL(inode) || !inode->i_op ||
1963 !inode->i_op->setxattr || !inode->i_op->removexattr) 1963 !inode->i_op->setxattr || !inode->i_op->removexattr)
1964 return -EOPNOTSUPP; 1964 return -EOPNOTSUPP;
1965 switch(type) { 1965 switch(type) {
1966 case ACL_TYPE_ACCESS: 1966 case ACL_TYPE_ACCESS:
1967 name = POSIX_ACL_XATTR_ACCESS; 1967 name = POSIX_ACL_XATTR_ACCESS;
1968 break; 1968 break;
1969 case ACL_TYPE_DEFAULT: 1969 case ACL_TYPE_DEFAULT:
1970 name = POSIX_ACL_XATTR_DEFAULT; 1970 name = POSIX_ACL_XATTR_DEFAULT;
1971 break; 1971 break;
1972 default: 1972 default:
1973 return -EOPNOTSUPP; 1973 return -EOPNOTSUPP;
1974 } 1974 }
1975 1975
1976 if (acl && acl->a_count) { 1976 if (acl && acl->a_count) {
1977 size = posix_acl_xattr_size(acl->a_count); 1977 size = posix_acl_xattr_size(acl->a_count);
1978 value = kmalloc(size, GFP_KERNEL); 1978 value = kmalloc(size, GFP_KERNEL);
1979 if (!value) 1979 if (!value)
1980 return -ENOMEM; 1980 return -ENOMEM;
1981 error = posix_acl_to_xattr(acl, value, size); 1981 error = posix_acl_to_xattr(acl, value, size);
1982 if (error < 0) 1982 if (error < 0)
1983 goto getout; 1983 goto getout;
1984 size = error; 1984 size = error;
1985 } else 1985 } else
1986 size = 0; 1986 size = 0;
1987 1987
1988 if (size) 1988 if (size)
1989 error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0); 1989 error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0);
1990 else { 1990 else {
1991 if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT) 1991 if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT)
1992 error = 0; 1992 error = 0;
1993 else { 1993 else {
1994 error = vfs_removexattr(fhp->fh_dentry, name); 1994 error = vfs_removexattr(fhp->fh_dentry, name);
1995 if (error == -ENODATA) 1995 if (error == -ENODATA)
1996 error = 0; 1996 error = 0;
1997 } 1997 }
1998 } 1998 }
1999 1999
2000 getout: 2000 getout:
2001 kfree(value); 2001 kfree(value);
2002 return error; 2002 return error;
2003 } 2003 }
2004 #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ 2004 #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
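
nfsd_set_posix_acl() treats an empty ACL as "remove the attribute" and treats removing an attribute that was never there (-ENODATA) as success, which makes the operation idempotent. The same set-or-remove idiom against the user-space xattr calls, as a Linux-specific sketch:

	#include <errno.h>
	#include <sys/xattr.h>

	/* Set an extended attribute, or remove it when the new value is
	 * empty; removal of something already absent counts as success,
	 * mirroring the ENODATA handling in nfsd_set_posix_acl(). */
	static int set_or_remove(const char *path, const char *name,
				 const void *value, size_t size)
	{
		if (size)
			return setxattr(path, name, value, size, 0);

		if (removexattr(path, name) < 0 && errno != ENODATA)
			return -1;
		return 0;
	}
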
2005 2005
fs/ocfs2/file.c
1 /* -*- mode: c; c-basic-offset: 8; -*- 1 /* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0: 2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 * 3 *
4 * file.c 4 * file.c
5 * 5 *
6 * File open, close, extend, truncate 6 * File open, close, extend, truncate
7 * 7 *
8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either 12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version. 13 * version 2 of the License, or (at your option) any later version.
14 * 14 *
15 * This program is distributed in the hope that it will be useful, 15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details. 18 * General Public License for more details.
19 * 19 *
20 * You should have received a copy of the GNU General Public 20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the 21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA. 23 * Boston, MA 021110-1307, USA.
24 */ 24 */
25 25
26 #include <linux/capability.h> 26 #include <linux/capability.h>
27 #include <linux/fs.h> 27 #include <linux/fs.h>
28 #include <linux/types.h> 28 #include <linux/types.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/highmem.h> 30 #include <linux/highmem.h>
31 #include <linux/pagemap.h> 31 #include <linux/pagemap.h>
32 #include <linux/uio.h> 32 #include <linux/uio.h>
33 #include <linux/sched.h> 33 #include <linux/sched.h>
34 #include <linux/splice.h> 34 #include <linux/splice.h>
35 #include <linux/mount.h> 35 #include <linux/mount.h>
36 #include <linux/writeback.h> 36 #include <linux/writeback.h>
37 37
38 #define MLOG_MASK_PREFIX ML_INODE 38 #define MLOG_MASK_PREFIX ML_INODE
39 #include <cluster/masklog.h> 39 #include <cluster/masklog.h>
40 40
41 #include "ocfs2.h" 41 #include "ocfs2.h"
42 42
43 #include "alloc.h" 43 #include "alloc.h"
44 #include "aops.h" 44 #include "aops.h"
45 #include "dir.h" 45 #include "dir.h"
46 #include "dlmglue.h" 46 #include "dlmglue.h"
47 #include "extent_map.h" 47 #include "extent_map.h"
48 #include "file.h" 48 #include "file.h"
49 #include "sysfile.h" 49 #include "sysfile.h"
50 #include "inode.h" 50 #include "inode.h"
51 #include "ioctl.h" 51 #include "ioctl.h"
52 #include "journal.h" 52 #include "journal.h"
53 #include "mmap.h" 53 #include "mmap.h"
54 #include "suballoc.h" 54 #include "suballoc.h"
55 #include "super.h" 55 #include "super.h"
56 56
57 #include "buffer_head_io.h" 57 #include "buffer_head_io.h"
58 58
59 static int ocfs2_sync_inode(struct inode *inode) 59 static int ocfs2_sync_inode(struct inode *inode)
60 { 60 {
61 filemap_fdatawrite(inode->i_mapping); 61 filemap_fdatawrite(inode->i_mapping);
62 return sync_mapping_buffers(inode->i_mapping); 62 return sync_mapping_buffers(inode->i_mapping);
63 } 63 }
64 64
65 static int ocfs2_file_open(struct inode *inode, struct file *file) 65 static int ocfs2_file_open(struct inode *inode, struct file *file)
66 { 66 {
67 int status; 67 int status;
68 int mode = file->f_flags; 68 int mode = file->f_flags;
69 struct ocfs2_inode_info *oi = OCFS2_I(inode); 69 struct ocfs2_inode_info *oi = OCFS2_I(inode);
70 70
71 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 71 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
72 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 72 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
73 73
74 spin_lock(&oi->ip_lock); 74 spin_lock(&oi->ip_lock);
75 75
76 /* Check that the inode hasn't been wiped from disk by another 76 /* Check that the inode hasn't been wiped from disk by another
77 * node. If it hasn't then we're safe as long as we hold the 77 * node. If it hasn't then we're safe as long as we hold the
78 * spin lock until our increment of open count. */ 78 * spin lock until our increment of open count. */
79 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 79 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
80 spin_unlock(&oi->ip_lock); 80 spin_unlock(&oi->ip_lock);
81 81
82 status = -ENOENT; 82 status = -ENOENT;
83 goto leave; 83 goto leave;
84 } 84 }
85 85
86 if (mode & O_DIRECT) 86 if (mode & O_DIRECT)
87 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; 87 oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT;
88 88
89 oi->ip_open_count++; 89 oi->ip_open_count++;
90 spin_unlock(&oi->ip_lock); 90 spin_unlock(&oi->ip_lock);
91 status = 0; 91 status = 0;
92 leave: 92 leave:
93 mlog_exit(status); 93 mlog_exit(status);
94 return status; 94 return status;
95 } 95 }
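
Note how ocfs2_file_open() performs the OCFS2_INODE_DELETED check and the ip_open_count increment inside one spinlock section, so a concurrent wipe on another node cannot slip between the two. The shape of that pattern in isolation, as a user-space analogue with illustrative names:

	#include <pthread.h>

	struct obj {
		pthread_mutex_t lock;
		int deleted;
		int open_count;
	};

	/* Check-then-increment under a single critical section: if the
	 * check and the increment ran under separate lock acquisitions,
	 * a deletion could land in between and we would pin a dead
	 * object. */
	static int obj_open(struct obj *o)
	{
		int ret = 0;

		pthread_mutex_lock(&o->lock);
		if (o->deleted)
			ret = -1;	/* raced with delete: ENOENT */
		else
			o->open_count++;
		pthread_mutex_unlock(&o->lock);
		return ret;
	}
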
96 96
97 static int ocfs2_file_release(struct inode *inode, struct file *file) 97 static int ocfs2_file_release(struct inode *inode, struct file *file)
98 { 98 {
99 struct ocfs2_inode_info *oi = OCFS2_I(inode); 99 struct ocfs2_inode_info *oi = OCFS2_I(inode);
100 100
101 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 101 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
102 file->f_path.dentry->d_name.len, 102 file->f_path.dentry->d_name.len,
103 file->f_path.dentry->d_name.name); 103 file->f_path.dentry->d_name.name);
104 104
105 spin_lock(&oi->ip_lock); 105 spin_lock(&oi->ip_lock);
106 if (!--oi->ip_open_count) 106 if (!--oi->ip_open_count)
107 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 107 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
108 spin_unlock(&oi->ip_lock); 108 spin_unlock(&oi->ip_lock);
109 109
110 mlog_exit(0); 110 mlog_exit(0);
111 111
112 return 0; 112 return 0;
113 } 113 }
114 114
115 static int ocfs2_sync_file(struct file *file, 115 static int ocfs2_sync_file(struct file *file,
116 struct dentry *dentry, 116 struct dentry *dentry,
117 int datasync) 117 int datasync)
118 { 118 {
119 int err = 0; 119 int err = 0;
120 journal_t *journal; 120 journal_t *journal;
121 struct inode *inode = dentry->d_inode; 121 struct inode *inode = dentry->d_inode;
122 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 122 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
123 123
124 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 124 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
125 dentry->d_name.len, dentry->d_name.name); 125 dentry->d_name.len, dentry->d_name.name);
126 126
127 err = ocfs2_sync_inode(dentry->d_inode); 127 err = ocfs2_sync_inode(dentry->d_inode);
128 if (err) 128 if (err)
129 goto bail; 129 goto bail;
130 130
131 journal = osb->journal->j_journal; 131 journal = osb->journal->j_journal;
132 err = journal_force_commit(journal); 132 err = journal_force_commit(journal);
133 133
134 bail: 134 bail:
135 mlog_exit(err); 135 mlog_exit(err);
136 136
137 return (err < 0) ? -EIO : 0; 137 return (err < 0) ? -EIO : 0;
138 } 138 }
139 139
140 int ocfs2_should_update_atime(struct inode *inode, 140 int ocfs2_should_update_atime(struct inode *inode,
141 struct vfsmount *vfsmnt) 141 struct vfsmount *vfsmnt)
142 { 142 {
143 struct timespec now; 143 struct timespec now;
144 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 144 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
145 145
146 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 146 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
147 return 0; 147 return 0;
148 148
149 if ((inode->i_flags & S_NOATIME) || 149 if ((inode->i_flags & S_NOATIME) ||
150 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) 150 ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)))
151 return 0; 151 return 0;
152 152
153 /* 153 /*
154 * We can be called with no vfsmnt structure - NFSD will 154 * We can be called with no vfsmnt structure - NFSD will
155 * sometimes do this. 155 * sometimes do this.
156 * 156 *
157 * Note that our action here is different from touch_atime() - 157 * Note that our action here is different from touch_atime() -
158 * if we can't tell whether this is a noatime mount, then we 158 * if we can't tell whether this is a noatime mount, then we
159 * don't know whether to trust the value of s_atime_quantum. 159 * don't know whether to trust the value of s_atime_quantum.
160 */ 160 */
161 if (vfsmnt == NULL) 161 if (vfsmnt == NULL)
162 return 0; 162 return 0;
163 163
164 if ((vfsmnt->mnt_flags & MNT_NOATIME) || 164 if ((vfsmnt->mnt_flags & MNT_NOATIME) ||
165 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) 165 ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
166 return 0; 166 return 0;
167 167
168 if (vfsmnt->mnt_flags & MNT_RELATIME) { 168 if (vfsmnt->mnt_flags & MNT_RELATIME) {
169 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || 169 if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) ||
170 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) 170 (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0))
171 return 1; 171 return 1;
172 172
173 return 0; 173 return 0;
174 } 174 }
175 175
176 now = CURRENT_TIME; 176 now = CURRENT_TIME;
177 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) 177 if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum))
178 return 0; 178 return 0;
179 else 179 else
180 return 1; 180 return 1;
181 } 181 }
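
ocfs2_should_update_atime() folds two policies together: under MNT_RELATIME, update atime only when it has fallen behind mtime or ctime; otherwise rate-limit on-disk atime writes to one per s_atime_quantum seconds. The decision on its own, as a seconds-only sketch that ignores the nanosecond field the kernel also compares:

	#include <time.h>

	/* Returns nonzero when an atime update is worth a disk
	 * transaction: relatime updates only when atime has fallen
	 * behind mtime/ctime; otherwise updates are spaced at least
	 * 'quantum' seconds apart. */
	static int should_update_atime(time_t atime, time_t mtime,
				       time_t ctim, int relatime,
				       time_t quantum)
	{
		if (relatime)
			return atime <= mtime || atime <= ctim;

		return (time(NULL) - atime) > quantum;
	}
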
182 182
183 int ocfs2_update_inode_atime(struct inode *inode, 183 int ocfs2_update_inode_atime(struct inode *inode,
184 struct buffer_head *bh) 184 struct buffer_head *bh)
185 { 185 {
186 int ret; 186 int ret;
187 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 187 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
188 handle_t *handle; 188 handle_t *handle;
189 189
190 mlog_entry_void(); 190 mlog_entry_void();
191 191
192 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 192 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
193 if (handle == NULL) { 193 if (handle == NULL) {
194 ret = -ENOMEM; 194 ret = -ENOMEM;
195 mlog_errno(ret); 195 mlog_errno(ret);
196 goto out; 196 goto out;
197 } 197 }
198 198
199 inode->i_atime = CURRENT_TIME; 199 inode->i_atime = CURRENT_TIME;
200 ret = ocfs2_mark_inode_dirty(handle, inode, bh); 200 ret = ocfs2_mark_inode_dirty(handle, inode, bh);
201 if (ret < 0) 201 if (ret < 0)
202 mlog_errno(ret); 202 mlog_errno(ret);
203 203
204 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 204 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
205 out: 205 out:
206 mlog_exit(ret); 206 mlog_exit(ret);
207 return ret; 207 return ret;
208 } 208 }
209 209
210 static int ocfs2_set_inode_size(handle_t *handle, 210 static int ocfs2_set_inode_size(handle_t *handle,
211 struct inode *inode, 211 struct inode *inode,
212 struct buffer_head *fe_bh, 212 struct buffer_head *fe_bh,
213 u64 new_i_size) 213 u64 new_i_size)
214 { 214 {
215 int status; 215 int status;
216 216
217 mlog_entry_void(); 217 mlog_entry_void();
218 i_size_write(inode, new_i_size); 218 i_size_write(inode, new_i_size);
219 inode->i_blocks = ocfs2_inode_sector_count(inode); 219 inode->i_blocks = ocfs2_inode_sector_count(inode);
220 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 220 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
221 221
222 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 222 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
223 if (status < 0) { 223 if (status < 0) {
224 mlog_errno(status); 224 mlog_errno(status);
225 goto bail; 225 goto bail;
226 } 226 }
227 227
228 bail: 228 bail:
229 mlog_exit(status); 229 mlog_exit(status);
230 return status; 230 return status;
231 } 231 }
232 232
233 static int ocfs2_simple_size_update(struct inode *inode, 233 static int ocfs2_simple_size_update(struct inode *inode,
234 struct buffer_head *di_bh, 234 struct buffer_head *di_bh,
235 u64 new_i_size) 235 u64 new_i_size)
236 { 236 {
237 int ret; 237 int ret;
238 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 238 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
239 handle_t *handle = NULL; 239 handle_t *handle = NULL;
240 240
241 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 241 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
242 if (handle == NULL) { 242 if (handle == NULL) {
243 ret = -ENOMEM; 243 ret = -ENOMEM;
244 mlog_errno(ret); 244 mlog_errno(ret);
245 goto out; 245 goto out;
246 } 246 }
247 247
248 ret = ocfs2_set_inode_size(handle, inode, di_bh, 248 ret = ocfs2_set_inode_size(handle, inode, di_bh,
249 new_i_size); 249 new_i_size);
250 if (ret < 0) 250 if (ret < 0)
251 mlog_errno(ret); 251 mlog_errno(ret);
252 252
253 ocfs2_commit_trans(osb, handle); 253 ocfs2_commit_trans(osb, handle);
254 out: 254 out:
255 return ret; 255 return ret;
256 } 256 }
257 257
258 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, 258 static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
259 struct inode *inode, 259 struct inode *inode,
260 struct buffer_head *fe_bh, 260 struct buffer_head *fe_bh,
261 u64 new_i_size) 261 u64 new_i_size)
262 { 262 {
263 int status; 263 int status;
264 handle_t *handle; 264 handle_t *handle;
265 struct ocfs2_dinode *di; 265 struct ocfs2_dinode *di;
266 266
267 mlog_entry_void(); 267 mlog_entry_void();
268 268
269 /* TODO: This needs to actually orphan the inode in this 269 /* TODO: This needs to actually orphan the inode in this
270 * transaction. */ 270 * transaction. */
271 271
272 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 272 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
273 if (IS_ERR(handle)) { 273 if (IS_ERR(handle)) {
274 status = PTR_ERR(handle); 274 status = PTR_ERR(handle);
275 mlog_errno(status); 275 mlog_errno(status);
276 goto out; 276 goto out;
277 } 277 }
278 278
279 status = ocfs2_journal_access(handle, inode, fe_bh, 279 status = ocfs2_journal_access(handle, inode, fe_bh,
280 OCFS2_JOURNAL_ACCESS_WRITE); 280 OCFS2_JOURNAL_ACCESS_WRITE);
281 if (status < 0) { 281 if (status < 0) {
282 mlog_errno(status); 282 mlog_errno(status);
283 goto out_commit; 283 goto out_commit;
284 } 284 }
285 285
286 /* 286 /*
287 * Do this before setting i_size. 287 * Do this before setting i_size.
288 */ 288 */
289 status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); 289 status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
290 if (status) { 290 if (status) {
291 mlog_errno(status); 291 mlog_errno(status);
292 goto out_commit; 292 goto out_commit;
293 } 293 }
294 294
295 i_size_write(inode, new_i_size); 295 i_size_write(inode, new_i_size);
296 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); 296 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
297 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 297 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
298 298
299 di = (struct ocfs2_dinode *) fe_bh->b_data; 299 di = (struct ocfs2_dinode *) fe_bh->b_data;
300 di->i_size = cpu_to_le64(new_i_size); 300 di->i_size = cpu_to_le64(new_i_size);
301 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 301 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
302 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 302 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
303 303
304 status = ocfs2_journal_dirty(handle, fe_bh); 304 status = ocfs2_journal_dirty(handle, fe_bh);
305 if (status < 0) 305 if (status < 0)
306 mlog_errno(status); 306 mlog_errno(status);
307 307
308 out_commit: 308 out_commit:
309 ocfs2_commit_trans(osb, handle); 309 ocfs2_commit_trans(osb, handle);
310 out: 310 out:
311 311
312 mlog_exit(status); 312 mlog_exit(status);
313 return status; 313 return status;
314 } 314 }
315 315
316 static int ocfs2_truncate_file(struct inode *inode, 316 static int ocfs2_truncate_file(struct inode *inode,
317 struct buffer_head *di_bh, 317 struct buffer_head *di_bh,
318 u64 new_i_size) 318 u64 new_i_size)
319 { 319 {
320 int status = 0; 320 int status = 0;
321 struct ocfs2_dinode *fe = NULL; 321 struct ocfs2_dinode *fe = NULL;
322 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 322 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
323 struct ocfs2_truncate_context *tc = NULL; 323 struct ocfs2_truncate_context *tc = NULL;
324 324
325 mlog_entry("(inode = %llu, new_i_size = %llu\n", 325 mlog_entry("(inode = %llu, new_i_size = %llu\n",
326 (unsigned long long)OCFS2_I(inode)->ip_blkno, 326 (unsigned long long)OCFS2_I(inode)->ip_blkno,
327 (unsigned long long)new_i_size); 327 (unsigned long long)new_i_size);
328 328
329 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); 329 unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
330 truncate_inode_pages(inode->i_mapping, new_i_size); 330 truncate_inode_pages(inode->i_mapping, new_i_size);
331 331
332 fe = (struct ocfs2_dinode *) di_bh->b_data; 332 fe = (struct ocfs2_dinode *) di_bh->b_data;
333 if (!OCFS2_IS_VALID_DINODE(fe)) { 333 if (!OCFS2_IS_VALID_DINODE(fe)) {
334 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 334 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
335 status = -EIO; 335 status = -EIO;
336 goto bail; 336 goto bail;
337 } 337 }
338 338
339 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 339 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
340 "Inode %llu, inode i_size = %lld != di " 340 "Inode %llu, inode i_size = %lld != di "
341 "i_size = %llu, i_flags = 0x%x\n", 341 "i_size = %llu, i_flags = 0x%x\n",
342 (unsigned long long)OCFS2_I(inode)->ip_blkno, 342 (unsigned long long)OCFS2_I(inode)->ip_blkno,
343 i_size_read(inode), 343 i_size_read(inode),
344 (unsigned long long)le64_to_cpu(fe->i_size), 344 (unsigned long long)le64_to_cpu(fe->i_size),
345 le32_to_cpu(fe->i_flags)); 345 le32_to_cpu(fe->i_flags));
346 346
347 if (new_i_size > le64_to_cpu(fe->i_size)) { 347 if (new_i_size > le64_to_cpu(fe->i_size)) {
348 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", 348 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
349 (unsigned long long)le64_to_cpu(fe->i_size), 349 (unsigned long long)le64_to_cpu(fe->i_size),
350 (unsigned long long)new_i_size); 350 (unsigned long long)new_i_size);
351 status = -EINVAL; 351 status = -EINVAL;
352 mlog_errno(status); 352 mlog_errno(status);
353 goto bail; 353 goto bail;
354 } 354 }
355 355
356 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", 356 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
357 (unsigned long long)le64_to_cpu(fe->i_blkno), 357 (unsigned long long)le64_to_cpu(fe->i_blkno),
358 (unsigned long long)le64_to_cpu(fe->i_size), 358 (unsigned long long)le64_to_cpu(fe->i_size),
359 (unsigned long long)new_i_size); 359 (unsigned long long)new_i_size);
360 360
361 /* let's handle the simple truncate cases before doing any more 361 /* let's handle the simple truncate cases before doing any more
362 * cluster locking. */ 362 * cluster locking. */
363 if (new_i_size == le64_to_cpu(fe->i_size)) 363 if (new_i_size == le64_to_cpu(fe->i_size))
364 goto bail; 364 goto bail;
365 365
366 /* This forces other nodes to sync and drop their pages. Do 366 /* This forces other nodes to sync and drop their pages. Do
367 * this even if we have a truncate without allocation change - 367 * this even if we have a truncate without allocation change -
368 * ocfs2 cluster sizes can be much greater than page size, so 368 * ocfs2 cluster sizes can be much greater than page size, so
369 * we have to truncate them anyway. */ 369 * we have to truncate them anyway. */
370 status = ocfs2_data_lock(inode, 1); 370 status = ocfs2_data_lock(inode, 1);
371 if (status < 0) { 371 if (status < 0) {
372 mlog_errno(status); 372 mlog_errno(status);
373 goto bail; 373 goto bail;
374 } 374 }
375 375
376 /* alright, we're going to need to do a full blown alloc size 376 /* alright, we're going to need to do a full blown alloc size
377 * change. Orphan the inode so that recovery can complete the 377 * change. Orphan the inode so that recovery can complete the
378 * truncate if necessary. This does the task of marking 378 * truncate if necessary. This does the task of marking
379 * i_size. */ 379 * i_size. */
380 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 380 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
381 if (status < 0) { 381 if (status < 0) {
382 mlog_errno(status); 382 mlog_errno(status);
383 goto bail_unlock_data; 383 goto bail_unlock_data;
384 } 384 }
385 385
386 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 386 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
387 if (status < 0) { 387 if (status < 0) {
388 mlog_errno(status); 388 mlog_errno(status);
389 goto bail_unlock_data; 389 goto bail_unlock_data;
390 } 390 }
391 391
392 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 392 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
393 if (status < 0) { 393 if (status < 0) {
394 mlog_errno(status); 394 mlog_errno(status);
395 goto bail_unlock_data; 395 goto bail_unlock_data;
396 } 396 }
397 397
398 /* TODO: orphan dir cleanup here. */ 398 /* TODO: orphan dir cleanup here. */
399 bail_unlock_data: 399 bail_unlock_data:
400 ocfs2_data_unlock(inode, 1); 400 ocfs2_data_unlock(inode, 1);
401 401
402 bail: 402 bail:
403 403
404 mlog_exit(status); 404 mlog_exit(status);
405 return status; 405 return status;
406 } 406 }
407 407
408 /* 408 /*
409 * extend allocation only here. 409 * extend allocation only here.
410 * we'll update all the disk stuff, and oip->alloc_size 410 * we'll update all the disk stuff, and oip->alloc_size
411 * 411 *
412 * expect stuff to be locked, a transaction started and enough data / 412 * expect stuff to be locked, a transaction started and enough data /
413 * metadata reservations in the contexts. 413 * metadata reservations in the contexts.
414 * 414 *
415 * Will return -EAGAIN, and a reason if a restart is needed. 415 * Will return -EAGAIN, and a reason if a restart is needed.
416 * If passed in, *reason will always be set, even in error. 416 * If passed in, *reason will always be set, even in error.
417 */ 417 */
418 int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 418 int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
419 struct inode *inode, 419 struct inode *inode,
420 u32 *logical_offset, 420 u32 *logical_offset,
421 u32 clusters_to_add, 421 u32 clusters_to_add,
422 struct buffer_head *fe_bh, 422 struct buffer_head *fe_bh,
423 handle_t *handle, 423 handle_t *handle,
424 struct ocfs2_alloc_context *data_ac, 424 struct ocfs2_alloc_context *data_ac,
425 struct ocfs2_alloc_context *meta_ac, 425 struct ocfs2_alloc_context *meta_ac,
426 enum ocfs2_alloc_restarted *reason_ret) 426 enum ocfs2_alloc_restarted *reason_ret)
427 { 427 {
428 int status = 0; 428 int status = 0;
429 int free_extents; 429 int free_extents;
430 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 430 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
431 enum ocfs2_alloc_restarted reason = RESTART_NONE; 431 enum ocfs2_alloc_restarted reason = RESTART_NONE;
432 u32 bit_off, num_bits; 432 u32 bit_off, num_bits;
433 u64 block; 433 u64 block;
434 434
435 BUG_ON(!clusters_to_add); 435 BUG_ON(!clusters_to_add);
436 436
437 free_extents = ocfs2_num_free_extents(osb, inode, fe); 437 free_extents = ocfs2_num_free_extents(osb, inode, fe);
438 if (free_extents < 0) { 438 if (free_extents < 0) {
439 status = free_extents; 439 status = free_extents;
440 mlog_errno(status); 440 mlog_errno(status);
441 goto leave; 441 goto leave;
442 } 442 }
443 443
444 /* there are two cases which could cause us to EAGAIN in the 444 /* there are two cases which could cause us to EAGAIN in the
445 * we-need-more-metadata case: 445 * we-need-more-metadata case:
446 * 1) we haven't reserved *any* 446 * 1) we haven't reserved *any*
447 * 2) we are so fragmented, we've needed to add metadata too 447 * 2) we are so fragmented, we've needed to add metadata too
448 * many times. */ 448 * many times. */
449 if (!free_extents && !meta_ac) { 449 if (!free_extents && !meta_ac) {
450 mlog(0, "we haven't reserved any metadata!\n"); 450 mlog(0, "we haven't reserved any metadata!\n");
451 status = -EAGAIN; 451 status = -EAGAIN;
452 reason = RESTART_META; 452 reason = RESTART_META;
453 goto leave; 453 goto leave;
454 } else if ((!free_extents) 454 } else if ((!free_extents)
455 && (ocfs2_alloc_context_bits_left(meta_ac) 455 && (ocfs2_alloc_context_bits_left(meta_ac)
456 < ocfs2_extend_meta_needed(fe))) { 456 < ocfs2_extend_meta_needed(fe))) {
457 mlog(0, "filesystem is really fragmented...\n"); 457 mlog(0, "filesystem is really fragmented...\n");
458 status = -EAGAIN; 458 status = -EAGAIN;
459 reason = RESTART_META; 459 reason = RESTART_META;
460 goto leave; 460 goto leave;
461 } 461 }
462 462
463 status = ocfs2_claim_clusters(osb, handle, data_ac, 1, 463 status = ocfs2_claim_clusters(osb, handle, data_ac, 1,
464 &bit_off, &num_bits); 464 &bit_off, &num_bits);
465 if (status < 0) { 465 if (status < 0) {
466 if (status != -ENOSPC) 466 if (status != -ENOSPC)
467 mlog_errno(status); 467 mlog_errno(status);
468 goto leave; 468 goto leave;
469 } 469 }
470 470
471 BUG_ON(num_bits > clusters_to_add); 471 BUG_ON(num_bits > clusters_to_add);
472 472
473 /* reserve our write early -- insert_extent may update the inode */ 473 /* reserve our write early -- insert_extent may update the inode */
474 status = ocfs2_journal_access(handle, inode, fe_bh, 474 status = ocfs2_journal_access(handle, inode, fe_bh,
475 OCFS2_JOURNAL_ACCESS_WRITE); 475 OCFS2_JOURNAL_ACCESS_WRITE);
476 if (status < 0) { 476 if (status < 0) {
477 mlog_errno(status); 477 mlog_errno(status);
478 goto leave; 478 goto leave;
479 } 479 }
480 480
481 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 481 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
482 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 482 mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
483 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 483 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
484 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, 484 status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
485 *logical_offset, block, num_bits, 485 *logical_offset, block, num_bits,
486 meta_ac); 486 meta_ac);
487 if (status < 0) { 487 if (status < 0) {
488 mlog_errno(status); 488 mlog_errno(status);
489 goto leave; 489 goto leave;
490 } 490 }
491 491
492 status = ocfs2_journal_dirty(handle, fe_bh); 492 status = ocfs2_journal_dirty(handle, fe_bh);
493 if (status < 0) { 493 if (status < 0) {
494 mlog_errno(status); 494 mlog_errno(status);
495 goto leave; 495 goto leave;
496 } 496 }
497 497
498 clusters_to_add -= num_bits; 498 clusters_to_add -= num_bits;
499 *logical_offset += num_bits; 499 *logical_offset += num_bits;
500 500
501 if (clusters_to_add) { 501 if (clusters_to_add) {
502 mlog(0, "need to alloc once more, clusters = %u, wanted = " 502 mlog(0, "need to alloc once more, clusters = %u, wanted = "
503 "%u\n", fe->i_clusters, clusters_to_add); 503 "%u\n", fe->i_clusters, clusters_to_add);
504 status = -EAGAIN; 504 status = -EAGAIN;
505 reason = RESTART_TRANS; 505 reason = RESTART_TRANS;
506 } 506 }
507 507
508 leave: 508 leave:
509 mlog_exit(status); 509 mlog_exit(status);
510 if (reason_ret) 510 if (reason_ret)
511 *reason_ret = reason; 511 *reason_ret = reason;
512 return status; 512 return status;
513 } 513 }
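
ocfs2_do_extend_allocation() deliberately allocates only what fits in the current transaction and reservation, returning -EAGAIN plus a reason (RESTART_META or RESTART_TRANS) so the caller can restock and call again. The caller-side shape of that protocol as a self-contained toy; the four-cluster cap and the names are invented for illustration:

	#include <errno.h>
	#include <stdio.h>

	enum restart { RESTART_NONE, RESTART_TRANS };

	/* Toy allocator: grants at most 4 "clusters" per call and
	 * signals -EAGAIN/RESTART_TRANS while work remains, mimicking
	 * the protocol used by ocfs2_do_extend_allocation(). */
	static int do_extend(unsigned int *added, unsigned int want,
			     enum restart *why)
	{
		unsigned int got = want > 4 ? 4 : want;

		*added += got;
		*why = (got < want) ? RESTART_TRANS : RESTART_NONE;
		return (got < want) ? -EAGAIN : 0;
	}

	int main(void)
	{
		unsigned int added = 0, want = 10;
		enum restart why;
		int ret;

		do {
			ret = do_extend(&added, want - added, &why);
			if (ret == -EAGAIN && why == RESTART_TRANS)
				printf("restart transaction at %u clusters\n",
				       added);
		} while (ret == -EAGAIN);

		printf("done: %u clusters\n", added);
		return 0;
	}
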
514 514
515 /* 515 /*
516 * For a given allocation, determine which allocators will need to be 516 * For a given allocation, determine which allocators will need to be
517 * accessed, and lock them, reserving the appropriate number of bits. 517 * accessed, and lock them, reserving the appropriate number of bits.
518 * 518 *
519 * Called from ocfs2_extend_allocation() for file systems which don't 519 * Called from ocfs2_extend_allocation() for file systems which don't
520 * support holes, and from ocfs2_write() for file systems which 520 * support holes, and from ocfs2_write() for file systems which
521 * understand sparse inodes. 521 * understand sparse inodes.
522 */ 522 */
523 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 523 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
524 u32 clusters_to_add, 524 u32 clusters_to_add,
525 struct ocfs2_alloc_context **data_ac, 525 struct ocfs2_alloc_context **data_ac,
526 struct ocfs2_alloc_context **meta_ac) 526 struct ocfs2_alloc_context **meta_ac)
527 { 527 {
528 int ret, num_free_extents; 528 int ret, num_free_extents;
529 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 529 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
530 530
531 *meta_ac = NULL; 531 *meta_ac = NULL;
532 *data_ac = NULL; 532 *data_ac = NULL;
533 533
534 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " 534 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
535 "clusters_to_add = %u\n", 535 "clusters_to_add = %u\n",
536 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), 536 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
537 le32_to_cpu(di->i_clusters), clusters_to_add); 537 le32_to_cpu(di->i_clusters), clusters_to_add);
538 538
539 num_free_extents = ocfs2_num_free_extents(osb, inode, di); 539 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
540 if (num_free_extents < 0) { 540 if (num_free_extents < 0) {
541 ret = num_free_extents; 541 ret = num_free_extents;
542 mlog_errno(ret); 542 mlog_errno(ret);
543 goto out; 543 goto out;
544 } 544 }
545 545
546 /* 546 /*
547 * Sparse allocation file systems need to be more conservative 547 * Sparse allocation file systems need to be more conservative
548 * with reserving room for expansion - the actual allocation 548 * with reserving room for expansion - the actual allocation
549 * happens while we've got a journal handle open so re-taking 549 * happens while we've got a journal handle open so re-taking
550 * a cluster lock (because we ran out of room for another 550 * a cluster lock (because we ran out of room for another
551 * extent) will violate ordering rules. 551 * extent) will violate ordering rules.
552 * 552 *
553 * Most of the time we'll only be seeing this 1 cluster at a time 553 * Most of the time we'll only be seeing this 1 cluster at a time
554 * anyway. 554 * anyway.
555 */ 555 */
556 if (!num_free_extents || 556 if (!num_free_extents ||
557 (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { 557 (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
558 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); 558 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
559 if (ret < 0) { 559 if (ret < 0) {
560 if (ret != -ENOSPC) 560 if (ret != -ENOSPC)
561 mlog_errno(ret); 561 mlog_errno(ret);
562 goto out; 562 goto out;
563 } 563 }
564 } 564 }
565 565
566 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); 566 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
567 if (ret < 0) { 567 if (ret < 0) {
568 if (ret != -ENOSPC) 568 if (ret != -ENOSPC)
569 mlog_errno(ret); 569 mlog_errno(ret);
570 goto out; 570 goto out;
571 } 571 }
572 572
573 out: 573 out:
574 if (ret) { 574 if (ret) {
575 if (*meta_ac) { 575 if (*meta_ac) {
576 ocfs2_free_alloc_context(*meta_ac); 576 ocfs2_free_alloc_context(*meta_ac);
577 *meta_ac = NULL; 577 *meta_ac = NULL;
578 } 578 }
579 579
580 /* 580 /*
581 * We cannot have an error and a non-null *data_ac. 581 * We cannot have an error and a non-null *data_ac.
582 */ 582 */
583 } 583 }
584 584
585 return ret; 585 return ret;
586 } 586 }
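
Note the cleanup contract on the error path above: ocfs2_lock_allocators() frees any context it managed to reserve before failing and leaves both out-pointers NULL, so callers never clean up after a failed call. A minimal caller sketch (illustrative only, mirroring how ocfs2_extend_allocation() below consumes it):

    struct ocfs2_alloc_context *data_ac, *meta_ac;
    int ret;

    ret = ocfs2_lock_allocators(inode, di, clusters_to_add,
                                &data_ac, &meta_ac);
    if (ret < 0)
            return ret;                     /* nothing to free on failure */

    /* ... allocate under a journal handle ... */

    if (data_ac)
            ocfs2_free_alloc_context(data_ac);
    if (meta_ac)
            ocfs2_free_alloc_context(meta_ac);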
587 587
588 static int ocfs2_extend_allocation(struct inode *inode, 588 static int ocfs2_extend_allocation(struct inode *inode,
589 u32 clusters_to_add) 589 u32 clusters_to_add)
590 { 590 {
591 int status = 0; 591 int status = 0;
592 int restart_func = 0; 592 int restart_func = 0;
593 int drop_alloc_sem = 0; 593 int drop_alloc_sem = 0;
594 int credits; 594 int credits;
595 u32 prev_clusters, logical_start; 595 u32 prev_clusters, logical_start;
596 struct buffer_head *bh = NULL; 596 struct buffer_head *bh = NULL;
597 struct ocfs2_dinode *fe = NULL; 597 struct ocfs2_dinode *fe = NULL;
598 handle_t *handle = NULL; 598 handle_t *handle = NULL;
599 struct ocfs2_alloc_context *data_ac = NULL; 599 struct ocfs2_alloc_context *data_ac = NULL;
600 struct ocfs2_alloc_context *meta_ac = NULL; 600 struct ocfs2_alloc_context *meta_ac = NULL;
601 enum ocfs2_alloc_restarted why; 601 enum ocfs2_alloc_restarted why;
602 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 602 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
603 603
604 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 604 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
605 605
606 /* 606 /*
607 * This function only exists for file systems which don't 607 * This function only exists for file systems which don't
608 * support holes. 608 * support holes.
609 */ 609 */
610 BUG_ON(ocfs2_sparse_alloc(osb)); 610 BUG_ON(ocfs2_sparse_alloc(osb));
611 611
612 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 612 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
613 OCFS2_BH_CACHED, inode); 613 OCFS2_BH_CACHED, inode);
614 if (status < 0) { 614 if (status < 0) {
615 mlog_errno(status); 615 mlog_errno(status);
616 goto leave; 616 goto leave;
617 } 617 }
618 618
619 fe = (struct ocfs2_dinode *) bh->b_data; 619 fe = (struct ocfs2_dinode *) bh->b_data;
620 if (!OCFS2_IS_VALID_DINODE(fe)) { 620 if (!OCFS2_IS_VALID_DINODE(fe)) {
621 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 621 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
622 status = -EIO; 622 status = -EIO;
623 goto leave; 623 goto leave;
624 } 624 }
625 625
626 logical_start = OCFS2_I(inode)->ip_clusters; 626 logical_start = OCFS2_I(inode)->ip_clusters;
627 627
628 restart_all: 628 restart_all:
629 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 629 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
630 630
631 /* blocks people in read/write from reading our allocation 631 /* blocks people in read/write from reading our allocation
632 * until we're done changing it. We depend on i_mutex to block 632 * until we're done changing it. We depend on i_mutex to block
633 * other extend/truncate calls while we're here. Ordering wrt 633 * other extend/truncate calls while we're here. Ordering wrt
634 * start_trans is important here -- always do it before! */ 634 * start_trans is important here -- always do it before! */
635 down_write(&OCFS2_I(inode)->ip_alloc_sem); 635 down_write(&OCFS2_I(inode)->ip_alloc_sem);
636 drop_alloc_sem = 1; 636 drop_alloc_sem = 1;
637 637
638 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac, 638 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
639 &meta_ac); 639 &meta_ac);
640 if (status) { 640 if (status) {
641 mlog_errno(status); 641 mlog_errno(status);
642 goto leave; 642 goto leave;
643 } 643 }
644 644
645 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 645 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
646 handle = ocfs2_start_trans(osb, credits); 646 handle = ocfs2_start_trans(osb, credits);
647 if (IS_ERR(handle)) { 647 if (IS_ERR(handle)) {
648 status = PTR_ERR(handle); 648 status = PTR_ERR(handle);
649 handle = NULL; 649 handle = NULL;
650 mlog_errno(status); 650 mlog_errno(status);
651 goto leave; 651 goto leave;
652 } 652 }
653 653
654 restarted_transaction: 654 restarted_transaction:
655 /* reserve a write to the file entry early on - that way if we 655 /* reserve a write to the file entry early on - that way if we
656 * run out of credits in the allocation path, we can still 656 * run out of credits in the allocation path, we can still
657 * update i_size. */ 657 * update i_size. */
658 status = ocfs2_journal_access(handle, inode, bh, 658 status = ocfs2_journal_access(handle, inode, bh,
659 OCFS2_JOURNAL_ACCESS_WRITE); 659 OCFS2_JOURNAL_ACCESS_WRITE);
660 if (status < 0) { 660 if (status < 0) {
661 mlog_errno(status); 661 mlog_errno(status);
662 goto leave; 662 goto leave;
663 } 663 }
664 664
665 prev_clusters = OCFS2_I(inode)->ip_clusters; 665 prev_clusters = OCFS2_I(inode)->ip_clusters;
666 666
667 status = ocfs2_do_extend_allocation(osb, 667 status = ocfs2_do_extend_allocation(osb,
668 inode, 668 inode,
669 &logical_start, 669 &logical_start,
670 clusters_to_add, 670 clusters_to_add,
671 bh, 671 bh,
672 handle, 672 handle,
673 data_ac, 673 data_ac,
674 meta_ac, 674 meta_ac,
675 &why); 675 &why);
676 if ((status < 0) && (status != -EAGAIN)) { 676 if ((status < 0) && (status != -EAGAIN)) {
677 if (status != -ENOSPC) 677 if (status != -ENOSPC)
678 mlog_errno(status); 678 mlog_errno(status);
679 goto leave; 679 goto leave;
680 } 680 }
681 681
682 status = ocfs2_journal_dirty(handle, bh); 682 status = ocfs2_journal_dirty(handle, bh);
683 if (status < 0) { 683 if (status < 0) {
684 mlog_errno(status); 684 mlog_errno(status);
685 goto leave; 685 goto leave;
686 } 686 }
687 687
688 spin_lock(&OCFS2_I(inode)->ip_lock); 688 spin_lock(&OCFS2_I(inode)->ip_lock);
689 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); 689 clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
690 spin_unlock(&OCFS2_I(inode)->ip_lock); 690 spin_unlock(&OCFS2_I(inode)->ip_lock);
691 691
692 if (why != RESTART_NONE && clusters_to_add) { 692 if (why != RESTART_NONE && clusters_to_add) {
693 if (why == RESTART_META) { 693 if (why == RESTART_META) {
694 mlog(0, "restarting function.\n"); 694 mlog(0, "restarting function.\n");
695 restart_func = 1; 695 restart_func = 1;
696 } else { 696 } else {
697 BUG_ON(why != RESTART_TRANS); 697 BUG_ON(why != RESTART_TRANS);
698 698
699 mlog(0, "restarting transaction.\n"); 699 mlog(0, "restarting transaction.\n");
700 /* TODO: This can be more intelligent. */ 700 /* TODO: This can be more intelligent. */
701 credits = ocfs2_calc_extend_credits(osb->sb, 701 credits = ocfs2_calc_extend_credits(osb->sb,
702 fe, 702 fe,
703 clusters_to_add); 703 clusters_to_add);
704 status = ocfs2_extend_trans(handle, credits); 704 status = ocfs2_extend_trans(handle, credits);
705 if (status < 0) { 705 if (status < 0) {
706 /* handle still has to be committed at 706 /* handle still has to be committed at
707 * this point. */ 707 * this point. */
708 status = -ENOMEM; 708 status = -ENOMEM;
709 mlog_errno(status); 709 mlog_errno(status);
710 goto leave; 710 goto leave;
711 } 711 }
712 goto restarted_transaction; 712 goto restarted_transaction;
713 } 713 }
714 } 714 }
715 715
716 mlog(0, "fe: i_clusters = %u, i_size=%llu\n", 716 mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
717 le32_to_cpu(fe->i_clusters), 717 le32_to_cpu(fe->i_clusters),
718 (unsigned long long)le64_to_cpu(fe->i_size)); 718 (unsigned long long)le64_to_cpu(fe->i_size));
719 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 719 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
720 OCFS2_I(inode)->ip_clusters, i_size_read(inode)); 720 OCFS2_I(inode)->ip_clusters, i_size_read(inode));
721 721
722 leave: 722 leave:
723 if (drop_alloc_sem) { 723 if (drop_alloc_sem) {
724 up_write(&OCFS2_I(inode)->ip_alloc_sem); 724 up_write(&OCFS2_I(inode)->ip_alloc_sem);
725 drop_alloc_sem = 0; 725 drop_alloc_sem = 0;
726 } 726 }
727 if (handle) { 727 if (handle) {
728 ocfs2_commit_trans(osb, handle); 728 ocfs2_commit_trans(osb, handle);
729 handle = NULL; 729 handle = NULL;
730 } 730 }
731 if (data_ac) { 731 if (data_ac) {
732 ocfs2_free_alloc_context(data_ac); 732 ocfs2_free_alloc_context(data_ac);
733 data_ac = NULL; 733 data_ac = NULL;
734 } 734 }
735 if (meta_ac) { 735 if (meta_ac) {
736 ocfs2_free_alloc_context(meta_ac); 736 ocfs2_free_alloc_context(meta_ac);
737 meta_ac = NULL; 737 meta_ac = NULL;
738 } 738 }
739 if ((!status) && restart_func) { 739 if ((!status) && restart_func) {
740 restart_func = 0; 740 restart_func = 0;
741 goto restart_all; 741 goto restart_all;
742 } 742 }
743 if (bh) { 743 if (bh) {
744 brelse(bh); 744 brelse(bh);
745 bh = NULL; 745 bh = NULL;
746 } 746 }
747 747
748 mlog_exit(status); 748 mlog_exit(status);
749 return status; 749 return status;
750 } 750 }
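
Stripped of the i_size and cluster bookkeeping, the retry logic above reduces to the following shape (same helpers as in the function; a condensed sketch, not a drop-in replacement):

    for (;;) {
            status = ocfs2_do_extend_allocation(osb, inode, &logical_start,
                                                clusters_to_add, bh, handle,
                                                data_ac, meta_ac, &why);
            if (status != -EAGAIN || why != RESTART_TRANS)
                    break;          /* done, RESTART_META, or hard error */
            credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
            if (ocfs2_extend_trans(handle, credits) < 0)
                    break;          /* handle still has to be committed */
    }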
751 751
752 /* Some parts of this taken from generic_cont_expand, which turned out 752 /* Some parts of this taken from generic_cont_expand, which turned out
753 * to be too fragile to do exactly what we need without us having to 753 * to be too fragile to do exactly what we need without us having to
754 * worry about recursive locking in ->prepare_write() and 754 * worry about recursive locking in ->prepare_write() and
755 * ->commit_write(). */ 755 * ->commit_write(). */
756 static int ocfs2_write_zero_page(struct inode *inode, 756 static int ocfs2_write_zero_page(struct inode *inode,
757 u64 size) 757 u64 size)
758 { 758 {
759 struct address_space *mapping = inode->i_mapping; 759 struct address_space *mapping = inode->i_mapping;
760 struct page *page; 760 struct page *page;
761 unsigned long index; 761 unsigned long index;
762 unsigned int offset; 762 unsigned int offset;
763 handle_t *handle = NULL; 763 handle_t *handle = NULL;
764 int ret; 764 int ret;
765 765
766 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ 766 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
767 /* ugh. in prepare/commit_write, if from==to==start of block, we 767 /* ugh. in prepare/commit_write, if from==to==start of block, we
768 ** skip the prepare. make sure we never send an offset for the start 768 ** skip the prepare. make sure we never send an offset for the start
769 ** of a block 769 ** of a block
770 */ 770 */
771 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { 771 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
772 offset++; 772 offset++;
773 } 773 }
774 index = size >> PAGE_CACHE_SHIFT; 774 index = size >> PAGE_CACHE_SHIFT;
775 775
776 page = grab_cache_page(mapping, index); 776 page = grab_cache_page(mapping, index);
777 if (!page) { 777 if (!page) {
778 ret = -ENOMEM; 778 ret = -ENOMEM;
779 mlog_errno(ret); 779 mlog_errno(ret);
780 goto out; 780 goto out;
781 } 781 }
782 782
783 ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); 783 ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
784 if (ret < 0) { 784 if (ret < 0) {
785 mlog_errno(ret); 785 mlog_errno(ret);
786 goto out_unlock; 786 goto out_unlock;
787 } 787 }
788 788
789 if (ocfs2_should_order_data(inode)) { 789 if (ocfs2_should_order_data(inode)) {
790 handle = ocfs2_start_walk_page_trans(inode, page, offset, 790 handle = ocfs2_start_walk_page_trans(inode, page, offset,
791 offset); 791 offset);
792 if (IS_ERR(handle)) { 792 if (IS_ERR(handle)) {
793 ret = PTR_ERR(handle); 793 ret = PTR_ERR(handle);
794 handle = NULL; 794 handle = NULL;
795 goto out_unlock; 795 goto out_unlock;
796 } 796 }
797 } 797 }
798 798
799 /* must not update i_size! */ 799 /* must not update i_size! */
800 ret = block_commit_write(page, offset, offset); 800 ret = block_commit_write(page, offset, offset);
801 if (ret < 0) 801 if (ret < 0)
802 mlog_errno(ret); 802 mlog_errno(ret);
803 else 803 else
804 ret = 0; 804 ret = 0;
805 805
806 if (handle) 806 if (handle)
807 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 807 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
808 out_unlock: 808 out_unlock:
809 unlock_page(page); 809 unlock_page(page);
810 page_cache_release(page); 810 page_cache_release(page);
811 out: 811 out:
812 return ret; 812 return ret;
813 } 813 }
814 814
815 static int ocfs2_zero_extend(struct inode *inode, 815 static int ocfs2_zero_extend(struct inode *inode,
816 u64 zero_to_size) 816 u64 zero_to_size)
817 { 817 {
818 int ret = 0; 818 int ret = 0;
819 u64 start_off; 819 u64 start_off;
820 struct super_block *sb = inode->i_sb; 820 struct super_block *sb = inode->i_sb;
821 821
822 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 822 start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
823 while (start_off < zero_to_size) { 823 while (start_off < zero_to_size) {
824 ret = ocfs2_write_zero_page(inode, start_off); 824 ret = ocfs2_write_zero_page(inode, start_off);
825 if (ret < 0) { 825 if (ret < 0) {
826 mlog_errno(ret); 826 mlog_errno(ret);
827 goto out; 827 goto out;
828 } 828 }
829 829
830 start_off += sb->s_blocksize; 830 start_off += sb->s_blocksize;
831 831
832 /* 832 /*
833 * Very large extends have the potential to lock up 833 * Very large extends have the potential to lock up
834 * the cpu for extended periods of time. 834 * the cpu for extended periods of time.
835 */ 835 */
836 cond_resched(); 836 cond_resched();
837 } 837 }
838 838
839 out: 839 out:
840 return ret; 840 return ret;
841 } 841 }
842 842
843 /* 843 /*
844 * A tail_to_skip value > 0 indicates that we're being called from 844 * A tail_to_skip value > 0 indicates that we're being called from
845 * ocfs2_file_aio_write(). This has the following implications: 845 * ocfs2_file_aio_write(). This has the following implications:
846 * 846 *
847 * - we don't want to update i_size 847 * - we don't want to update i_size
848 * - di_bh will be NULL, which is fine because it's only used in the 848 * - di_bh will be NULL, which is fine because it's only used in the
849 * case where we want to update i_size. 849 * case where we want to update i_size.
850 * - ocfs2_zero_extend() will then only be filling the hole created 850 * - ocfs2_zero_extend() will then only be filling the hole created
851 * between i_size and the start of the write. 851 * between i_size and the start of the write.
852 */ 852 */
853 static int ocfs2_extend_file(struct inode *inode, 853 static int ocfs2_extend_file(struct inode *inode,
854 struct buffer_head *di_bh, 854 struct buffer_head *di_bh,
855 u64 new_i_size, 855 u64 new_i_size,
856 size_t tail_to_skip) 856 size_t tail_to_skip)
857 { 857 {
858 int ret = 0; 858 int ret = 0;
859 u32 clusters_to_add = 0; 859 u32 clusters_to_add = 0;
860 860
861 BUG_ON(!tail_to_skip && !di_bh); 861 BUG_ON(!tail_to_skip && !di_bh);
862 862
863 /* setattr sometimes calls us like this. */ 863 /* setattr sometimes calls us like this. */
864 if (new_i_size == 0) 864 if (new_i_size == 0)
865 goto out; 865 goto out;
866 866
867 if (i_size_read(inode) == new_i_size) 867 if (i_size_read(inode) == new_i_size)
868 goto out; 868 goto out;
869 BUG_ON(new_i_size < i_size_read(inode)); 869 BUG_ON(new_i_size < i_size_read(inode));
870 870
871 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 871 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
872 BUG_ON(tail_to_skip != 0); 872 BUG_ON(tail_to_skip != 0);
873 goto out_update_size; 873 goto out_update_size;
874 } 874 }
875 875
876 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 876 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
877 OCFS2_I(inode)->ip_clusters; 877 OCFS2_I(inode)->ip_clusters;
878 878
879 /* 879 /*
880 * protect the pages that ocfs2_zero_extend is going to be 880 * protect the pages that ocfs2_zero_extend is going to be
881 * pulling into the page cache. We do this before the 881 * pulling into the page cache. We do this before the
882 * metadata extend so that we don't get into the situation 882 * metadata extend so that we don't get into the situation
883 * where we've extended the metadata but can't get the data 883 * where we've extended the metadata but can't get the data
884 * lock to zero. 884 * lock to zero.
885 */ 885 */
886 ret = ocfs2_data_lock(inode, 1); 886 ret = ocfs2_data_lock(inode, 1);
887 if (ret < 0) { 887 if (ret < 0) {
888 mlog_errno(ret); 888 mlog_errno(ret);
889 goto out; 889 goto out;
890 } 890 }
891 891
892 if (clusters_to_add) { 892 if (clusters_to_add) {
893 ret = ocfs2_extend_allocation(inode, clusters_to_add); 893 ret = ocfs2_extend_allocation(inode, clusters_to_add);
894 if (ret < 0) { 894 if (ret < 0) {
895 mlog_errno(ret); 895 mlog_errno(ret);
896 goto out_unlock; 896 goto out_unlock;
897 } 897 }
898 } 898 }
899 899
900 /* 900 /*
901 * Call this even if we don't add any clusters to the tree. We 901 * Call this even if we don't add any clusters to the tree. We
902 * still need to zero the area between the old i_size and the 902 * still need to zero the area between the old i_size and the
903 * new i_size. 903 * new i_size.
904 */ 904 */
905 ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); 905 ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
906 if (ret < 0) { 906 if (ret < 0) {
907 mlog_errno(ret); 907 mlog_errno(ret);
908 goto out_unlock; 908 goto out_unlock;
909 } 909 }
910 910
911 out_update_size: 911 out_update_size:
912 if (!tail_to_skip) { 912 if (!tail_to_skip) {
913 /* We're being called from ocfs2_setattr() which wants 913 /* We're being called from ocfs2_setattr() which wants
914 * us to update i_size */ 914 * us to update i_size */
915 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); 915 ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
916 if (ret < 0) 916 if (ret < 0)
917 mlog_errno(ret); 917 mlog_errno(ret);
918 } 918 }
919 919
920 out_unlock: 920 out_unlock:
921 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 921 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
922 ocfs2_data_unlock(inode, 1); 922 ocfs2_data_unlock(inode, 1);
923 923
924 out: 924 out:
925 return ret; 925 return ret;
926 } 926 }
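
Concretely, the two call modes described in the comment at the top of this function look like this (both forms appear verbatim later in this file):

    /* From ocfs2_setattr(): pass the dinode bh and update i_size. */
    status = ocfs2_extend_file(inode, bh, attr->ia_size, 0);

    /* From the write path: no dinode bh, and skip the tail that the
     * write itself is about to fill in. */
    ret = ocfs2_extend_file(inode, NULL, newsize, count);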
927 927
928 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) 928 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
929 { 929 {
930 int status = 0, size_change; 930 int status = 0, size_change;
931 struct inode *inode = dentry->d_inode; 931 struct inode *inode = dentry->d_inode;
932 struct super_block *sb = inode->i_sb; 932 struct super_block *sb = inode->i_sb;
933 struct ocfs2_super *osb = OCFS2_SB(sb); 933 struct ocfs2_super *osb = OCFS2_SB(sb);
934 struct buffer_head *bh = NULL; 934 struct buffer_head *bh = NULL;
935 handle_t *handle = NULL; 935 handle_t *handle = NULL;
936 936
937 mlog_entry("(0x%p, '%.*s')\n", dentry, 937 mlog_entry("(0x%p, '%.*s')\n", dentry,
938 dentry->d_name.len, dentry->d_name.name); 938 dentry->d_name.len, dentry->d_name.name);
939 939
940 if (attr->ia_valid & ATTR_MODE) 940 if (attr->ia_valid & ATTR_MODE)
941 mlog(0, "mode change: %d\n", attr->ia_mode); 941 mlog(0, "mode change: %d\n", attr->ia_mode);
942 if (attr->ia_valid & ATTR_UID) 942 if (attr->ia_valid & ATTR_UID)
943 mlog(0, "uid change: %d\n", attr->ia_uid); 943 mlog(0, "uid change: %d\n", attr->ia_uid);
944 if (attr->ia_valid & ATTR_GID) 944 if (attr->ia_valid & ATTR_GID)
945 mlog(0, "gid change: %d\n", attr->ia_gid); 945 mlog(0, "gid change: %d\n", attr->ia_gid);
946 if (attr->ia_valid & ATTR_SIZE) 946 if (attr->ia_valid & ATTR_SIZE)
947 mlog(0, "size change...\n"); 947 mlog(0, "size change...\n");
948 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) 948 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
949 mlog(0, "time change...\n"); 949 mlog(0, "time change...\n");
950 950
951 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 951 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
952 | ATTR_GID | ATTR_UID | ATTR_MODE) 952 | ATTR_GID | ATTR_UID | ATTR_MODE)
953 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { 953 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
954 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); 954 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
955 return 0; 955 return 0;
956 } 956 }
957 957
958 status = inode_change_ok(inode, attr); 958 status = inode_change_ok(inode, attr);
959 if (status) 959 if (status)
960 return status; 960 return status;
961 961
962 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; 962 size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE;
963 if (size_change) { 963 if (size_change) {
964 status = ocfs2_rw_lock(inode, 1); 964 status = ocfs2_rw_lock(inode, 1);
965 if (status < 0) { 965 if (status < 0) {
966 mlog_errno(status); 966 mlog_errno(status);
967 goto bail; 967 goto bail;
968 } 968 }
969 } 969 }
970 970
971 status = ocfs2_meta_lock(inode, &bh, 1); 971 status = ocfs2_meta_lock(inode, &bh, 1);
972 if (status < 0) { 972 if (status < 0) {
973 if (status != -ENOENT) 973 if (status != -ENOENT)
974 mlog_errno(status); 974 mlog_errno(status);
975 goto bail_unlock_rw; 975 goto bail_unlock_rw;
976 } 976 }
977 977
978 if (size_change && attr->ia_size != i_size_read(inode)) { 978 if (size_change && attr->ia_size != i_size_read(inode)) {
979 if (i_size_read(inode) > attr->ia_size) 979 if (i_size_read(inode) > attr->ia_size)
980 status = ocfs2_truncate_file(inode, bh, attr->ia_size); 980 status = ocfs2_truncate_file(inode, bh, attr->ia_size);
981 else 981 else
982 status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); 982 status = ocfs2_extend_file(inode, bh, attr->ia_size, 0);
983 if (status < 0) { 983 if (status < 0) {
984 if (status != -ENOSPC) 984 if (status != -ENOSPC)
985 mlog_errno(status); 985 mlog_errno(status);
986 status = -ENOSPC; 986 status = -ENOSPC;
987 goto bail_unlock; 987 goto bail_unlock;
988 } 988 }
989 } 989 }
990 990
991 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 991 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
992 if (IS_ERR(handle)) { 992 if (IS_ERR(handle)) {
993 status = PTR_ERR(handle); 993 status = PTR_ERR(handle);
994 mlog_errno(status); 994 mlog_errno(status);
995 goto bail_unlock; 995 goto bail_unlock;
996 } 996 }
997 997
998 status = inode_setattr(inode, attr); 998 status = inode_setattr(inode, attr);
999 if (status < 0) { 999 if (status < 0) {
1000 mlog_errno(status); 1000 mlog_errno(status);
1001 goto bail_commit; 1001 goto bail_commit;
1002 } 1002 }
1003 1003
1004 status = ocfs2_mark_inode_dirty(handle, inode, bh); 1004 status = ocfs2_mark_inode_dirty(handle, inode, bh);
1005 if (status < 0) 1005 if (status < 0)
1006 mlog_errno(status); 1006 mlog_errno(status);
1007 1007
1008 bail_commit: 1008 bail_commit:
1009 ocfs2_commit_trans(osb, handle); 1009 ocfs2_commit_trans(osb, handle);
1010 bail_unlock: 1010 bail_unlock:
1011 ocfs2_meta_unlock(inode, 1); 1011 ocfs2_meta_unlock(inode, 1);
1012 bail_unlock_rw: 1012 bail_unlock_rw:
1013 if (size_change) 1013 if (size_change)
1014 ocfs2_rw_unlock(inode, 1); 1014 ocfs2_rw_unlock(inode, 1);
1015 bail: 1015 bail:
1016 if (bh) 1016 if (bh)
1017 brelse(bh); 1017 brelse(bh);
1018 1018
1019 mlog_exit(status); 1019 mlog_exit(status);
1020 return status; 1020 return status;
1021 } 1021 }
1022 1022
1023 int ocfs2_getattr(struct vfsmount *mnt, 1023 int ocfs2_getattr(struct vfsmount *mnt,
1024 struct dentry *dentry, 1024 struct dentry *dentry,
1025 struct kstat *stat) 1025 struct kstat *stat)
1026 { 1026 {
1027 struct inode *inode = dentry->d_inode; 1027 struct inode *inode = dentry->d_inode;
1028 struct super_block *sb = dentry->d_inode->i_sb; 1028 struct super_block *sb = dentry->d_inode->i_sb;
1029 struct ocfs2_super *osb = sb->s_fs_info; 1029 struct ocfs2_super *osb = sb->s_fs_info;
1030 int err; 1030 int err;
1031 1031
1032 mlog_entry_void(); 1032 mlog_entry_void();
1033 1033
1034 err = ocfs2_inode_revalidate(dentry); 1034 err = ocfs2_inode_revalidate(dentry);
1035 if (err) { 1035 if (err) {
1036 if (err != -ENOENT) 1036 if (err != -ENOENT)
1037 mlog_errno(err); 1037 mlog_errno(err);
1038 goto bail; 1038 goto bail;
1039 } 1039 }
1040 1040
1041 generic_fillattr(inode, stat); 1041 generic_fillattr(inode, stat);
1042 1042
1043 /* We set the blksize from the cluster size for performance */ 1043 /* We set the blksize from the cluster size for performance */
1044 stat->blksize = osb->s_clustersize; 1044 stat->blksize = osb->s_clustersize;
1045 1045
1046 bail: 1046 bail:
1047 mlog_exit(err); 1047 mlog_exit(err);
1048 1048
1049 return err; 1049 return err;
1050 } 1050 }
1051 1051
1052 int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) 1052 int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
1053 { 1053 {
1054 int ret; 1054 int ret;
1055 1055
1056 mlog_entry_void(); 1056 mlog_entry_void();
1057 1057
1058 ret = ocfs2_meta_lock(inode, NULL, 0); 1058 ret = ocfs2_meta_lock(inode, NULL, 0);
1059 if (ret) { 1059 if (ret) {
1060 if (ret != -ENOENT) 1060 if (ret != -ENOENT)
1061 mlog_errno(ret); 1061 mlog_errno(ret);
1062 goto out; 1062 goto out;
1063 } 1063 }
1064 1064
1065 ret = generic_permission(inode, mask, NULL); 1065 ret = generic_permission(inode, mask, NULL);
1066 1066
1067 ocfs2_meta_unlock(inode, 0); 1067 ocfs2_meta_unlock(inode, 0);
1068 out: 1068 out:
1069 mlog_exit(ret); 1069 mlog_exit(ret);
1070 return ret; 1070 return ret;
1071 } 1071 }
1072 1072
1073 static int ocfs2_write_remove_suid(struct inode *inode) 1073 static int ocfs2_write_remove_suid(struct inode *inode)
1074 { 1074 {
1075 int ret; 1075 int ret;
1076 struct buffer_head *bh = NULL; 1076 struct buffer_head *bh = NULL;
1077 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1077 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1078 handle_t *handle; 1078 handle_t *handle;
1079 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1079 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1080 struct ocfs2_dinode *di; 1080 struct ocfs2_dinode *di;
1081 1081
1082 mlog_entry("(Inode %llu, mode 0%o)\n", 1082 mlog_entry("(Inode %llu, mode 0%o)\n",
1083 (unsigned long long)oi->ip_blkno, inode->i_mode); 1083 (unsigned long long)oi->ip_blkno, inode->i_mode);
1084 1084
1085 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1085 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1086 if (handle == NULL) { 1086 if (handle == NULL) {
1087 ret = -ENOMEM; 1087 ret = -ENOMEM;
1088 mlog_errno(ret); 1088 mlog_errno(ret);
1089 goto out; 1089 goto out;
1090 } 1090 }
1091 1091
1092 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); 1092 ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
1093 if (ret < 0) { 1093 if (ret < 0) {
1094 mlog_errno(ret); 1094 mlog_errno(ret);
1095 goto out_trans; 1095 goto out_trans;
1096 } 1096 }
1097 1097
1098 ret = ocfs2_journal_access(handle, inode, bh, 1098 ret = ocfs2_journal_access(handle, inode, bh,
1099 OCFS2_JOURNAL_ACCESS_WRITE); 1099 OCFS2_JOURNAL_ACCESS_WRITE);
1100 if (ret < 0) { 1100 if (ret < 0) {
1101 mlog_errno(ret); 1101 mlog_errno(ret);
1102 goto out_bh; 1102 goto out_bh;
1103 } 1103 }
1104 1104
1105 inode->i_mode &= ~S_ISUID; 1105 inode->i_mode &= ~S_ISUID;
1106 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) 1106 if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP))
1107 inode->i_mode &= ~S_ISGID; 1107 inode->i_mode &= ~S_ISGID;
1108 1108
1109 di = (struct ocfs2_dinode *) bh->b_data; 1109 di = (struct ocfs2_dinode *) bh->b_data;
1110 di->i_mode = cpu_to_le16(inode->i_mode); 1110 di->i_mode = cpu_to_le16(inode->i_mode);
1111 1111
1112 ret = ocfs2_journal_dirty(handle, bh); 1112 ret = ocfs2_journal_dirty(handle, bh);
1113 if (ret < 0) 1113 if (ret < 0)
1114 mlog_errno(ret); 1114 mlog_errno(ret);
1115 out_bh: 1115 out_bh:
1116 brelse(bh); 1116 brelse(bh);
1117 out_trans: 1117 out_trans:
1118 ocfs2_commit_trans(osb, handle); 1118 ocfs2_commit_trans(osb, handle);
1119 out: 1119 out:
1120 mlog_exit(ret); 1120 mlog_exit(ret);
1121 return ret; 1121 return ret;
1122 } 1122 }
1123 1123
1124 /* 1124 /*
1125 * Will look for holes and unwritten extents in the range starting at 1125 * Will look for holes and unwritten extents in the range starting at
1126 * pos for count bytes (inclusive). 1126 * pos for count bytes (inclusive).
1127 */ 1127 */
1128 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, 1128 static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1129 size_t count) 1129 size_t count)
1130 { 1130 {
1131 int ret = 0; 1131 int ret = 0;
1132 unsigned int extent_flags; 1132 unsigned int extent_flags;
1133 u32 cpos, clusters, extent_len, phys_cpos; 1133 u32 cpos, clusters, extent_len, phys_cpos;
1134 struct super_block *sb = inode->i_sb; 1134 struct super_block *sb = inode->i_sb;
1135 1135
1136 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; 1136 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1137 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; 1137 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1138 1138
1139 while (clusters) { 1139 while (clusters) {
1140 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, 1140 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1141 &extent_flags); 1141 &extent_flags);
1142 if (ret < 0) { 1142 if (ret < 0) {
1143 mlog_errno(ret); 1143 mlog_errno(ret);
1144 goto out; 1144 goto out;
1145 } 1145 }
1146 1146
1147 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { 1147 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1148 ret = 1; 1148 ret = 1;
1149 break; 1149 break;
1150 } 1150 }
1151 1151
1152 if (extent_len > clusters) 1152 if (extent_len > clusters)
1153 extent_len = clusters; 1153 extent_len = clusters;
1154 1154
1155 clusters -= extent_len; 1155 clusters -= extent_len;
1156 cpos += extent_len; 1156 cpos += extent_len;
1157 } 1157 }
1158 out: 1158 out:
1159 return ret; 1159 return ret;
1160 } 1160 }
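
The return convention (negative on error, 1 if any hole or unwritten extent falls in the range, 0 otherwise) is what lets the direct-io path below downgrade to buffered io; a sketch of that consumer:

    ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
    if (ret == 1) {
            *direct_io = 0;         /* holes found: do buffered io instead */
            ret = 0;                /* not an error, just a mode change */
    } else if (ret < 0)
            mlog_errno(ret);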
1161 1161
1162 static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1162 static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1163 loff_t *ppos, 1163 loff_t *ppos,
1164 size_t count, 1164 size_t count,
1165 int appending, 1165 int appending,
1166 int *direct_io) 1166 int *direct_io)
1167 { 1167 {
1168 int ret = 0, meta_level = appending; 1168 int ret = 0, meta_level = appending;
1169 struct inode *inode = dentry->d_inode; 1169 struct inode *inode = dentry->d_inode;
1170 u32 clusters; 1170 u32 clusters;
1171 loff_t newsize, saved_pos; 1171 loff_t newsize, saved_pos;
1172 1172
1173 /* 1173 /*
1174 * We sample i_size under a read level meta lock to see if our write 1174 * We sample i_size under a read level meta lock to see if our write
1175 * is extending the file, if it is we back off and get a write level 1175 * is extending the file, if it is we back off and get a write level
1176 * meta lock. 1176 * meta lock.
1177 */ 1177 */
1178 for(;;) { 1178 for(;;) {
1179 ret = ocfs2_meta_lock(inode, NULL, meta_level); 1179 ret = ocfs2_meta_lock(inode, NULL, meta_level);
1180 if (ret < 0) { 1180 if (ret < 0) {
1181 meta_level = -1; 1181 meta_level = -1;
1182 mlog_errno(ret); 1182 mlog_errno(ret);
1183 goto out; 1183 goto out;
1184 } 1184 }
1185 1185
1186 /* Clear suid / sgid if necessary. We do this here 1186 /* Clear suid / sgid if necessary. We do this here
1187 * instead of later in the write path because 1187 * instead of later in the write path because
1188 * remove_suid() calls ->setattr without any hint that 1188 * remove_suid() calls ->setattr without any hint that
1189 * we may have already done our cluster locking. Since 1189 * we may have already done our cluster locking. Since
1190 * ocfs2_setattr() *must* take cluster locks to 1190 * ocfs2_setattr() *must* take cluster locks to
1191 * proceed, this will lead us to recursively lock the 1191 * proceed, this will lead us to recursively lock the
1192 * inode. There's also the dinode i_size state which 1192 * inode. There's also the dinode i_size state which
1193 * can be lost via setattr during extending writes (we 1193 * can be lost via setattr during extending writes (we
1194 * set inode->i_size at the end of a write.) */ 1194 * set inode->i_size at the end of a write.) */
1195 if (should_remove_suid(dentry)) { 1195 if (should_remove_suid(dentry)) {
1196 if (meta_level == 0) { 1196 if (meta_level == 0) {
1197 ocfs2_meta_unlock(inode, meta_level); 1197 ocfs2_meta_unlock(inode, meta_level);
1198 meta_level = 1; 1198 meta_level = 1;
1199 continue; 1199 continue;
1200 } 1200 }
1201 1201
1202 ret = ocfs2_write_remove_suid(inode); 1202 ret = ocfs2_write_remove_suid(inode);
1203 if (ret < 0) { 1203 if (ret < 0) {
1204 mlog_errno(ret); 1204 mlog_errno(ret);
1205 goto out_unlock; 1205 goto out_unlock;
1206 } 1206 }
1207 } 1207 }
1208 1208
1209 /* work on a copy of ppos until we're sure that we won't have 1209 /* work on a copy of ppos until we're sure that we won't have
1210 * to recalculate it due to relocking. */ 1210 * to recalculate it due to relocking. */
1211 if (appending) { 1211 if (appending) {
1212 saved_pos = i_size_read(inode); 1212 saved_pos = i_size_read(inode);
1213 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); 1213 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
1214 } else { 1214 } else {
1215 saved_pos = *ppos; 1215 saved_pos = *ppos;
1216 } 1216 }
1217 1217
1218 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 1218 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
1219 loff_t end = saved_pos + count; 1219 loff_t end = saved_pos + count;
1220 1220
1221 /* 1221 /*
1222 * Skip the O_DIRECT checks if we don't need 1222 * Skip the O_DIRECT checks if we don't need
1223 * them. 1223 * them.
1224 */ 1224 */
1225 if (!direct_io || !(*direct_io)) 1225 if (!direct_io || !(*direct_io))
1226 break; 1226 break;
1227 1227
1228 /* 1228 /*
1229 * Allowing concurrent direct writes means 1229 * Allowing concurrent direct writes means
1230 * i_size changes wouldn't be synchronized, so 1230 * i_size changes wouldn't be synchronized, so
1231 * one node could wind up truncating another 1231 * one node could wind up truncating another
1232 * node's writes. 1232 * node's writes.
1233 */ 1233 */
1234 if (end > i_size_read(inode)) { 1234 if (end > i_size_read(inode)) {
1235 *direct_io = 0; 1235 *direct_io = 0;
1236 break; 1236 break;
1237 } 1237 }
1238 1238
1239 /* 1239 /*
1240 * We don't fill holes during direct io, so 1240 * We don't fill holes during direct io, so
1241 * check for them here. If any are found, the 1241 * check for them here. If any are found, the
1242 * caller will have to retake some cluster 1242 * caller will have to retake some cluster
1243 * locks and initiate the io as buffered. 1243 * locks and initiate the io as buffered.
1244 */ 1244 */
1245 ret = ocfs2_check_range_for_holes(inode, saved_pos, 1245 ret = ocfs2_check_range_for_holes(inode, saved_pos,
1246 count); 1246 count);
1247 if (ret == 1) { 1247 if (ret == 1) {
1248 *direct_io = 0; 1248 *direct_io = 0;
1249 ret = 0; 1249 ret = 0;
1250 } else if (ret < 0) 1250 } else if (ret < 0)
1251 mlog_errno(ret); 1251 mlog_errno(ret);
1252 break; 1252 break;
1253 } 1253 }
1254 1254
1255 /* 1255 /*
1256 * The rest of this loop is concerned with legacy file 1256 * The rest of this loop is concerned with legacy file
1257 * systems which don't support sparse files. 1257 * systems which don't support sparse files.
1258 */ 1258 */
1259 1259
1260 newsize = count + saved_pos; 1260 newsize = count + saved_pos;
1261 1261
1262 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", 1262 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
1263 (long long) saved_pos, (long long) newsize, 1263 (long long) saved_pos, (long long) newsize,
1264 (long long) i_size_read(inode)); 1264 (long long) i_size_read(inode));
1265 1265
1266 /* No need for a higher level metadata lock if we're 1266 /* No need for a higher level metadata lock if we're
1267 * never going past i_size. */ 1267 * never going past i_size. */
1268 if (newsize <= i_size_read(inode)) 1268 if (newsize <= i_size_read(inode))
1269 break; 1269 break;
1270 1270
1271 if (meta_level == 0) { 1271 if (meta_level == 0) {
1272 ocfs2_meta_unlock(inode, meta_level); 1272 ocfs2_meta_unlock(inode, meta_level);
1273 meta_level = 1; 1273 meta_level = 1;
1274 continue; 1274 continue;
1275 } 1275 }
1276 1276
1277 spin_lock(&OCFS2_I(inode)->ip_lock); 1277 spin_lock(&OCFS2_I(inode)->ip_lock);
1278 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) - 1278 clusters = ocfs2_clusters_for_bytes(inode->i_sb, newsize) -
1279 OCFS2_I(inode)->ip_clusters; 1279 OCFS2_I(inode)->ip_clusters;
1280 spin_unlock(&OCFS2_I(inode)->ip_lock); 1280 spin_unlock(&OCFS2_I(inode)->ip_lock);
1281 1281
1282 mlog(0, "Writing at EOF, may need more allocation: " 1282 mlog(0, "Writing at EOF, may need more allocation: "
1283 "i_size = %lld, newsize = %lld, need %u clusters\n", 1283 "i_size = %lld, newsize = %lld, need %u clusters\n",
1284 (long long) i_size_read(inode), (long long) newsize, 1284 (long long) i_size_read(inode), (long long) newsize,
1285 clusters); 1285 clusters);
1286 1286
1287 /* We only want to continue the rest of this loop if 1287 /* We only want to continue the rest of this loop if
1288 * our extend will actually require more 1288 * our extend will actually require more
1289 * allocation. */ 1289 * allocation. */
1290 if (!clusters) 1290 if (!clusters)
1291 break; 1291 break;
1292 1292
1293 ret = ocfs2_extend_file(inode, NULL, newsize, count); 1293 ret = ocfs2_extend_file(inode, NULL, newsize, count);
1294 if (ret < 0) { 1294 if (ret < 0) {
1295 if (ret != -ENOSPC) 1295 if (ret != -ENOSPC)
1296 mlog_errno(ret); 1296 mlog_errno(ret);
1297 goto out_unlock; 1297 goto out_unlock;
1298 } 1298 }
1299 break; 1299 break;
1300 } 1300 }
1301 1301
1302 if (appending) 1302 if (appending)
1303 *ppos = saved_pos; 1303 *ppos = saved_pos;
1304 1304
1305 out_unlock: 1305 out_unlock:
1306 ocfs2_meta_unlock(inode, meta_level); 1306 ocfs2_meta_unlock(inode, meta_level);
1307 1307
1308 out: 1308 out:
1309 return ret; 1309 return ret;
1310 } 1310 }
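
The for(;;) above is an optimistic lock-upgrade loop: sample state under a read-level meta lock and, only when the write turns out to need it, drop and retake the lock at write level. Reduced to its skeleton (needs_write_level() is a hypothetical predicate standing in for the suid/i_size checks):

    int level = 0;                          /* 0 = read, 1 = write */
    for (;;) {
            ret = ocfs2_meta_lock(inode, NULL, level);
            if (ret < 0)
                    break;
            if (level == 1 || !needs_write_level(inode))
                    break;                  /* lock level is sufficient */
            ocfs2_meta_unlock(inode, level);
            level = 1;                      /* re-sample as a writer */
    }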
1311 1311
1312 static inline void 1312 static inline void
1313 ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) 1313 ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1314 { 1314 {
1315 const struct iovec *iov = *iovp; 1315 const struct iovec *iov = *iovp;
1316 size_t base = *basep; 1316 size_t base = *basep;
1317 1317
1318 do { 1318 do {
1319 int copy = min(bytes, iov->iov_len - base); 1319 int copy = min(bytes, iov->iov_len - base);
1320 1320
1321 bytes -= copy; 1321 bytes -= copy;
1322 base += copy; 1322 base += copy;
1323 if (iov->iov_len == base) { 1323 if (iov->iov_len == base) {
1324 iov++; 1324 iov++;
1325 base = 0; 1325 base = 0;
1326 } 1326 }
1327 } while (bytes); 1327 } while (bytes);
1328 *iovp = iov; 1328 *iovp = iov;
1329 *basep = base; 1329 *basep = base;
1330 } 1330 }
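
The advance arithmetic is easiest to check in isolation. A self-contained userspace toy using the plain struct iovec from <sys/uio.h> (same logic, kernel types dropped):

    #include <stdio.h>
    #include <sys/uio.h>

    /* Consume `bytes` across an iovec array, leaving (*iovp, *basep)
     * pointing at the first unconsumed byte. */
    static void advance(const struct iovec **iovp, size_t *basep, size_t bytes)
    {
            const struct iovec *iov = *iovp;
            size_t base = *basep;

            while (bytes) {
                    size_t copy = iov->iov_len - base;

                    if (copy > bytes)
                            copy = bytes;
                    bytes -= copy;
                    base += copy;
                    if (base == iov->iov_len) {
                            iov++;
                            base = 0;
                    }
            }
            *iovp = iov;
            *basep = base;
    }

    int main(void)
    {
            char a[4], b[8];
            struct iovec vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
            const struct iovec *cur = vec;
            size_t off = 0;

            advance(&cur, &off, 6);         /* all of a[], 2 bytes of b[] */
            printf("segment %td, offset %zu\n", cur - vec, off); /* 1, 2 */
            return 0;
    }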
1331 1331
1332 static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, 1332 static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
1333 const struct iovec *cur_iov, 1333 const struct iovec *cur_iov,
1334 size_t iov_offset) 1334 size_t iov_offset)
1335 { 1335 {
1336 int ret; 1336 int ret;
1337 char *buf; 1337 char *buf;
1338 struct page *src_page = NULL; 1338 struct page *src_page = NULL;
1339 1339
1340 buf = cur_iov->iov_base + iov_offset; 1340 buf = cur_iov->iov_base + iov_offset;
1341 1341
1342 if (!segment_eq(get_fs(), KERNEL_DS)) { 1342 if (!segment_eq(get_fs(), KERNEL_DS)) {
1343 /* 1343 /*
1344 * Pull in the user page. We want to do this outside 1344 * Pull in the user page. We want to do this outside
1345 * of the meta data locks in order to preserve locking 1345 * of the meta data locks in order to preserve locking
1346 * order in case of page fault. 1346 * order in case of page fault.
1347 */ 1347 */
1348 ret = get_user_pages(current, current->mm, 1348 ret = get_user_pages(current, current->mm,
1349 (unsigned long)buf & PAGE_CACHE_MASK, 1, 1349 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1350 0, 0, &src_page, NULL); 1350 0, 0, &src_page, NULL);
1351 if (ret == 1) 1351 if (ret == 1)
1352 bp->b_src_buf = kmap(src_page); 1352 bp->b_src_buf = kmap(src_page);
1353 else 1353 else
1354 src_page = ERR_PTR(-EFAULT); 1354 src_page = ERR_PTR(-EFAULT);
1355 } else { 1355 } else {
1356 bp->b_src_buf = buf; 1356 bp->b_src_buf = buf;
1357 } 1357 }
1358 1358
1359 return src_page; 1359 return src_page;
1360 } 1360 }
1361 1361
1362 static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, 1362 static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
1363 struct page *page) 1363 struct page *page)
1364 { 1364 {
1365 if (page) { 1365 if (page) {
1366 kunmap(page); 1366 kunmap(page);
1367 page_cache_release(page); 1367 page_cache_release(page);
1368 } 1368 }
1369 } 1369 }
1370 1370
1371 static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, 1371 static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1372 const struct iovec *iov, 1372 const struct iovec *iov,
1373 unsigned long nr_segs, 1373 unsigned long nr_segs,
1374 size_t count, 1374 size_t count,
1375 ssize_t o_direct_written) 1375 ssize_t o_direct_written)
1376 { 1376 {
1377 int ret = 0; 1377 int ret = 0;
1378 ssize_t copied, total = 0; 1378 ssize_t copied, total = 0;
1379 size_t iov_offset = 0; 1379 size_t iov_offset = 0;
1380 const struct iovec *cur_iov = iov; 1380 const struct iovec *cur_iov = iov;
1381 struct ocfs2_buffered_write_priv bp; 1381 struct ocfs2_buffered_write_priv bp;
1382 struct page *page; 1382 struct page *page;
1383 1383
1384 /* 1384 /*
1385 * handle partial DIO write. Adjust cur_iov if needed. 1385 * handle partial DIO write. Adjust cur_iov if needed.
1386 */ 1386 */
1387 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); 1387 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1388 1388
1389 do { 1389 do {
1390 bp.b_cur_off = iov_offset; 1390 bp.b_cur_off = iov_offset;
1391 bp.b_cur_iov = cur_iov; 1391 bp.b_cur_iov = cur_iov;
1392 1392
1393 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); 1393 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
1394 if (IS_ERR(page)) { 1394 if (IS_ERR(page)) {
1395 ret = PTR_ERR(page); 1395 ret = PTR_ERR(page);
1396 goto out; 1396 goto out;
1397 } 1397 }
1398 1398
1399 copied = ocfs2_buffered_write_cluster(file, *ppos, count, 1399 copied = ocfs2_buffered_write_cluster(file, *ppos, count,
1400 ocfs2_map_and_write_user_data, 1400 ocfs2_map_and_write_user_data,
1401 &bp); 1401 &bp);
1402 1402
1403 ocfs2_put_write_source(&bp, page); 1403 ocfs2_put_write_source(&bp, page);
1404 1404
1405 if (copied < 0) { 1405 if (copied < 0) {
1406 mlog_errno(copied); 1406 mlog_errno(copied);
1407 ret = copied; 1407 ret = copied;
1408 goto out; 1408 goto out;
1409 } 1409 }
1410 1410
1411 total += copied; 1411 total += copied;
1412 *ppos = *ppos + copied; 1412 *ppos = *ppos + copied;
1413 count -= copied; 1413 count -= copied;
1414 1414
1415 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); 1415 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
1416 } while(count); 1416 } while(count);
1417 1417
1418 out: 1418 out:
1419 return total ? total : ret; 1419 return total ? total : ret;
1420 } 1420 }
1421 1421
1422 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 1422 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1423 const struct iovec *iov, 1423 const struct iovec *iov,
1424 unsigned long nr_segs, 1424 unsigned long nr_segs,
1425 loff_t pos) 1425 loff_t pos)
1426 { 1426 {
1427 int ret, direct_io, appending, rw_level, have_alloc_sem = 0; 1427 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
1428 int can_do_direct, sync = 0; 1428 int can_do_direct, sync = 0;
1429 ssize_t written = 0; 1429 ssize_t written = 0;
1430 size_t ocount; /* original count */ 1430 size_t ocount; /* original count */
1431 size_t count; /* after file limit checks */ 1431 size_t count; /* after file limit checks */
1432 loff_t *ppos = &iocb->ki_pos; 1432 loff_t *ppos = &iocb->ki_pos;
1433 struct file *file = iocb->ki_filp; 1433 struct file *file = iocb->ki_filp;
1434 struct inode *inode = file->f_path.dentry->d_inode; 1434 struct inode *inode = file->f_path.dentry->d_inode;
1435 1435
1436 mlog_entry("(0x%p, %u, '%.*s')\n", file, 1436 mlog_entry("(0x%p, %u, '%.*s')\n", file,
1437 (unsigned int)nr_segs, 1437 (unsigned int)nr_segs,
1438 file->f_path.dentry->d_name.len, 1438 file->f_path.dentry->d_name.len,
1439 file->f_path.dentry->d_name.name); 1439 file->f_path.dentry->d_name.name);
1440 1440
1441 if (iocb->ki_left == 0) 1441 if (iocb->ki_left == 0)
1442 return 0; 1442 return 0;
1443 1443
1444 ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); 1444 ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
1445 if (ret) 1445 if (ret)
1446 return ret; 1446 return ret;
1447 1447
1448 count = ocount; 1448 count = ocount;
1449 1449
1450 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 1450 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1451 1451
1452 appending = file->f_flags & O_APPEND ? 1 : 0; 1452 appending = file->f_flags & O_APPEND ? 1 : 0;
1453 direct_io = file->f_flags & O_DIRECT ? 1 : 0; 1453 direct_io = file->f_flags & O_DIRECT ? 1 : 0;
1454 1454
1455 mutex_lock(&inode->i_mutex); 1455 mutex_lock(&inode->i_mutex);
1456 1456
1457 relock: 1457 relock:
1458 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 1458 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
1459 if (direct_io) { 1459 if (direct_io) {
1460 down_read(&inode->i_alloc_sem); 1460 down_read(&inode->i_alloc_sem);
1461 have_alloc_sem = 1; 1461 have_alloc_sem = 1;
1462 } 1462 }
1463 1463
1464 /* concurrent O_DIRECT writes are allowed */ 1464 /* concurrent O_DIRECT writes are allowed */
1465 rw_level = !direct_io; 1465 rw_level = !direct_io;
1466 ret = ocfs2_rw_lock(inode, rw_level); 1466 ret = ocfs2_rw_lock(inode, rw_level);
1467 if (ret < 0) { 1467 if (ret < 0) {
1468 mlog_errno(ret); 1468 mlog_errno(ret);
1469 goto out_sems; 1469 goto out_sems;
1470 } 1470 }
1471 1471
1472 can_do_direct = direct_io; 1472 can_do_direct = direct_io;
1473 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, 1473 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
1474 iocb->ki_left, appending, 1474 iocb->ki_left, appending,
1475 &can_do_direct); 1475 &can_do_direct);
1476 if (ret < 0) { 1476 if (ret < 0) {
1477 mlog_errno(ret); 1477 mlog_errno(ret);
1478 goto out; 1478 goto out;
1479 } 1479 }
1480 1480
1481 /* 1481 /*
1482 * We can't complete the direct I/O as requested, fall back to 1482 * We can't complete the direct I/O as requested, fall back to
1483 * buffered I/O. 1483 * buffered I/O.
1484 */ 1484 */
1485 if (direct_io && !can_do_direct) { 1485 if (direct_io && !can_do_direct) {
1486 ocfs2_rw_unlock(inode, rw_level); 1486 ocfs2_rw_unlock(inode, rw_level);
1487 up_read(&inode->i_alloc_sem); 1487 up_read(&inode->i_alloc_sem);
1488 1488
1489 have_alloc_sem = 0; 1489 have_alloc_sem = 0;
1490 rw_level = -1; 1490 rw_level = -1;
1491 1491
1492 direct_io = 0; 1492 direct_io = 0;
1493 sync = 1; 1493 sync = 1;
1494 goto relock; 1494 goto relock;
1495 } 1495 }
1496 1496
1497 if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) 1497 if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
1498 sync = 1; 1498 sync = 1;
1499 1499
1500 /* 1500 /*
1501 * XXX: Is it ok to execute these checks a second time? 1501 * XXX: Is it ok to execute these checks a second time?
1502 */ 1502 */
1503 ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); 1503 ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
1504 if (ret) 1504 if (ret)
1505 goto out; 1505 goto out;
1506 1506
1507 /* 1507 /*
1508 * Set pos so that sync_page_range_nolock() below understands 1508 * Set pos so that sync_page_range_nolock() below understands
1509 * where to start from. We might've moved it around via the 1509 * where to start from. We might've moved it around via the
1510 * calls above. The range we want to actually sync starts from 1510 * calls above. The range we want to actually sync starts from
1511 * *ppos here. 1511 * *ppos here.
1512 * 1512 *
1513 */ 1513 */
1514 pos = *ppos; 1514 pos = *ppos;
1515 1515
1516 /* communicate with ocfs2_dio_end_io */ 1516 /* communicate with ocfs2_dio_end_io */
1517 ocfs2_iocb_set_rw_locked(iocb, rw_level); 1517 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1518 1518
1519 if (direct_io) { 1519 if (direct_io) {
1520 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 1520 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
1521 ppos, count, ocount); 1521 ppos, count, ocount);
1522 if (written < 0) { 1522 if (written < 0) {
1523 ret = written; 1523 ret = written;
1524 goto out_dio; 1524 goto out_dio;
1525 } 1525 }
1526 } else { 1526 } else {
1527 written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, 1527 written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
1528 count, written); 1528 count, written);
1529 if (written < 0) { 1529 if (written < 0) {
1530 ret = written; 1530 ret = written;
1531 if (ret != -EFAULT && ret != -ENOSPC) 1531 if (ret != -EFAULT && ret != -ENOSPC)
1532 mlog_errno(ret); 1532 mlog_errno(ret);
1533 goto out; 1533 goto out;
1534 } 1534 }
1535 } 1535 }
1536 1536
1537 out_dio: 1537 out_dio:
1538 /* buffered aio wouldn't have proper lock coverage today */ 1538 /* buffered aio wouldn't have proper lock coverage today */
1539 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 1539 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
1540 1540
1541 /* 1541 /*
1542 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io 1542 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in an ocfs2_dio_end_io
1543 * function pointer which is called when o_direct io completes so that 1543 * function pointer which is called when o_direct io completes so that
1544 * it can unlock our rw lock. (it's the clustered equivalent of 1544 * it can unlock our rw lock. (it's the clustered equivalent of
1545 * i_alloc_sem; protects truncate from racing with pending ios). 1545 * i_alloc_sem; protects truncate from racing with pending ios).
1546 * Unfortunately there are error cases which call end_io and others 1546 * Unfortunately there are error cases which call end_io and others
1547 * that don't. So we don't have to unlock the rw_lock if either an 1547 * that don't. So we don't have to unlock the rw_lock if either an
1548 * async dio is going to do it in the future or an end_io after an 1548 * async dio is going to do it in the future or an end_io after an
1549 * error has already done it. 1549 * error has already done it.
1550 */ 1550 */
1551 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 1551 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1552 rw_level = -1; 1552 rw_level = -1;
1553 have_alloc_sem = 0; 1553 have_alloc_sem = 0;
1554 } 1554 }
1555 1555
1556 out: 1556 out:
1557 if (rw_level != -1) 1557 if (rw_level != -1)
1558 ocfs2_rw_unlock(inode, rw_level); 1558 ocfs2_rw_unlock(inode, rw_level);
1559 1559
1560 out_sems: 1560 out_sems:
1561 if (have_alloc_sem) 1561 if (have_alloc_sem)
1562 up_read(&inode->i_alloc_sem); 1562 up_read(&inode->i_alloc_sem);
1563 1563
1564 if (written > 0 && sync) { 1564 if (written > 0 && sync) {
1565 ssize_t err; 1565 ssize_t err;
1566 1566
1567 err = sync_page_range_nolock(inode, file->f_mapping, pos, count); 1567 err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
1568 if (err < 0) 1568 if (err < 0)
1569 written = err; 1569 written = err;
1570 } 1570 }
1571 1571
1572 mutex_unlock(&inode->i_mutex); 1572 mutex_unlock(&inode->i_mutex);
1573 1573
1574 mlog_exit(ret); 1574 mlog_exit(ret);
1575 return written ? written : ret; 1575 return written ? written : ret;
1576 } 1576 }
1577 1577
1578 static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, 1578 static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
1579 struct pipe_buffer *buf, 1579 struct pipe_buffer *buf,
1580 struct splice_desc *sd) 1580 struct splice_desc *sd)
1581 { 1581 {
1582 int ret, count, total = 0; 1582 int ret, count, total = 0;
1583 ssize_t copied = 0; 1583 ssize_t copied = 0;
1584 struct ocfs2_splice_write_priv sp; 1584 struct ocfs2_splice_write_priv sp;
1585 1585
1586 ret = buf->ops->pin(pipe, buf); 1586 ret = buf->ops->confirm(pipe, buf);
1587 if (ret) 1587 if (ret)
1588 goto out; 1588 goto out;
1589 1589
1590 sp.s_sd = sd; 1590 sp.s_sd = sd;
1591 sp.s_buf = buf; 1591 sp.s_buf = buf;
1592 sp.s_pipe = pipe; 1592 sp.s_pipe = pipe;
1593 sp.s_offset = sd->pos & ~PAGE_CACHE_MASK; 1593 sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
1594 sp.s_buf_offset = buf->offset; 1594 sp.s_buf_offset = buf->offset;
1595 1595
1596 count = sd->len; 1596 count = sd->len;
1597 if (count + sp.s_offset > PAGE_CACHE_SIZE) 1597 if (count + sp.s_offset > PAGE_CACHE_SIZE)
1598 count = PAGE_CACHE_SIZE - sp.s_offset; 1598 count = PAGE_CACHE_SIZE - sp.s_offset;
1599 1599
1600 do { 1600 do {
1601 /* 1601 /*
1602 * splice wants us to copy up to one page at a 1602 * splice wants us to copy up to one page at a
1603 * time. For pagesize > cluster size, this means we 1603 * time. For pagesize > cluster size, this means we
1604 * might enter ocfs2_buffered_write_cluster() more 1604 * might enter ocfs2_buffered_write_cluster() more
1605 * than once, so keep track of our progress here. 1605 * than once, so keep track of our progress here.
1606 */ 1606 */
1607 copied = ocfs2_buffered_write_cluster(sd->file, 1607 copied = ocfs2_buffered_write_cluster(sd->u.file,
1608 (loff_t)sd->pos + total, 1608 (loff_t)sd->pos + total,
1609 count, 1609 count,
1610 ocfs2_map_and_write_splice_data, 1610 ocfs2_map_and_write_splice_data,
1611 &sp); 1611 &sp);
1612 if (copied < 0) { 1612 if (copied < 0) {
1613 mlog_errno(copied); 1613 mlog_errno(copied);
1614 ret = copied; 1614 ret = copied;
1615 goto out; 1615 goto out;
1616 } 1616 }
1617 1617
1618 count -= copied; 1618 count -= copied;
1619 sp.s_offset += copied; 1619 sp.s_offset += copied;
1620 sp.s_buf_offset += copied; 1620 sp.s_buf_offset += copied;
1621 total += copied; 1621 total += copied;
1622 } while (count); 1622 } while (count);
1623 1623
1624 ret = 0; 1624 ret = 0;
1625 out: 1625 out:
1626 1626
1627 return total ? total : ret; 1627 return total ? total : ret;
1628 } 1628 }
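
This actor is where the renamed hook gets exercised: before touching the buffer's page, the actor calls buf->ops->confirm() (formerly ->pin()) and proceeds only on a zero return. A sketch of the skeleton every splice write actor now follows (copy_out() is a hypothetical stand-in for the per-filesystem copy step):

    static int example_actor(struct pipe_inode_info *pipe,
                             struct pipe_buffer *buf,
                             struct splice_desc *sd)
    {
            int ret = buf->ops->confirm(pipe, buf);    /* was ->pin() */

            if (ret)
                    return ret;             /* buffer not usable */

            return copy_out(buf, sd);       /* page contents now valid */
    }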
1629 1629
1630 static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, 1630 static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1631 struct file *out, 1631 struct file *out,
1632 loff_t *ppos, 1632 loff_t *ppos,
1633 size_t len, 1633 size_t len,
1634 unsigned int flags) 1634 unsigned int flags)
1635 { 1635 {
1636 int ret, err; 1636 int ret, err;
1637 struct address_space *mapping = out->f_mapping; 1637 struct address_space *mapping = out->f_mapping;
1638 struct inode *inode = mapping->host; 1638 struct inode *inode = mapping->host;
1639 struct splice_desc sd = { 1639 struct splice_desc sd = {
1640 .total_len = len, 1640 .total_len = len,
1641 .flags = flags, 1641 .flags = flags,
1642 .pos = *ppos, 1642 .pos = *ppos,
1643 .u.file = out, 1643 .u.file = out,
1644 }; 1644 };
1645 1645
1646 ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor); 1646 ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor);
1647 if (ret > 0) { 1647 if (ret > 0) {
1648 *ppos += ret; 1648 *ppos += ret;
1649 1649
1650 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 1650 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
1651 err = generic_osync_inode(inode, mapping, 1651 err = generic_osync_inode(inode, mapping,
1652 OSYNC_METADATA|OSYNC_DATA); 1652 OSYNC_METADATA|OSYNC_DATA);
1653 if (err) 1653 if (err)
1654 ret = err; 1654 ret = err;
1655 } 1655 }
1656 } 1656 }
1657 1657
1658 return ret; 1658 return ret;
1659 } 1659 }
1660 1660
1661 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 1661 static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1662 struct file *out, 1662 struct file *out,
1663 loff_t *ppos, 1663 loff_t *ppos,
1664 size_t len, 1664 size_t len,
1665 unsigned int flags) 1665 unsigned int flags)
1666 { 1666 {
1667 int ret; 1667 int ret;
1668 struct inode *inode = out->f_path.dentry->d_inode; 1668 struct inode *inode = out->f_path.dentry->d_inode;
1669 1669
1670 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, 1670 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
1671 (unsigned int)len, 1671 (unsigned int)len,
1672 out->f_path.dentry->d_name.len, 1672 out->f_path.dentry->d_name.len,
1673 out->f_path.dentry->d_name.name); 1673 out->f_path.dentry->d_name.name);
1674 1674
1675 inode_double_lock(inode, pipe->inode); 1675 inode_double_lock(inode, pipe->inode);
1676 1676
1677 ret = ocfs2_rw_lock(inode, 1); 1677 ret = ocfs2_rw_lock(inode, 1);
1678 if (ret < 0) { 1678 if (ret < 0) {
1679 mlog_errno(ret); 1679 mlog_errno(ret);
1680 goto out; 1680 goto out;
1681 } 1681 }
1682 1682
1683 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, 1683 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
1684 NULL); 1684 NULL);
1685 if (ret < 0) { 1685 if (ret < 0) {
1686 mlog_errno(ret); 1686 mlog_errno(ret);
1687 goto out_unlock; 1687 goto out_unlock;
1688 } 1688 }
1689 1689
1690 /* ok, we're done with i_size and alloc work */ 1690 /* ok, we're done with i_size and alloc work */
1691 ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags); 1691 ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);
1692 1692
1693 out_unlock: 1693 out_unlock:
1694 ocfs2_rw_unlock(inode, 1); 1694 ocfs2_rw_unlock(inode, 1);
1695 out: 1695 out:
1696 inode_double_unlock(inode, pipe->inode); 1696 inode_double_unlock(inode, pipe->inode);
1697 1697
1698 mlog_exit(ret); 1698 mlog_exit(ret);
1699 return ret; 1699 return ret;
1700 } 1700 }
1701 1701
1702 static ssize_t ocfs2_file_splice_read(struct file *in, 1702 static ssize_t ocfs2_file_splice_read(struct file *in,
1703 loff_t *ppos, 1703 loff_t *ppos,
1704 struct pipe_inode_info *pipe, 1704 struct pipe_inode_info *pipe,
1705 size_t len, 1705 size_t len,
1706 unsigned int flags) 1706 unsigned int flags)
1707 { 1707 {
1708 int ret = 0; 1708 int ret = 0;
1709 struct inode *inode = in->f_path.dentry->d_inode; 1709 struct inode *inode = in->f_path.dentry->d_inode;
1710 1710
1711 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, 1711 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
1712 (unsigned int)len, 1712 (unsigned int)len,
1713 in->f_path.dentry->d_name.len, 1713 in->f_path.dentry->d_name.len,
1714 in->f_path.dentry->d_name.name); 1714 in->f_path.dentry->d_name.name);
1715 1715
1716 /* 1716 /*
1717 * See the comment in ocfs2_file_aio_read() 1717 * See the comment in ocfs2_file_aio_read()
1718 */ 1718 */
1719 ret = ocfs2_meta_lock(inode, NULL, 0); 1719 ret = ocfs2_meta_lock(inode, NULL, 0);
1720 if (ret < 0) { 1720 if (ret < 0) {
1721 mlog_errno(ret); 1721 mlog_errno(ret);
1722 goto bail; 1722 goto bail;
1723 } 1723 }
1724 ocfs2_meta_unlock(inode, 0); 1724 ocfs2_meta_unlock(inode, 0);
1725 1725
1726 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 1726 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
1727 1727
1728 bail: 1728 bail:
1729 mlog_exit(ret); 1729 mlog_exit(ret);
1730 return ret; 1730 return ret;
1731 } 1731 }
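
ocfs2 only wraps generic_file_splice_read() with cluster locking, so from userspace this path is exercised with the ordinary splice(2) syscall. A self-contained sketch that pulls up to 64KiB from a file into a pipe, error handling abbreviated:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int pfd[2], fd;
	ssize_t n;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0 || pipe(pfd) < 0)
		return 1;
	/* File pages go straight into the pipe; no bounce through user
	 * memory.  The filesystem's .splice_read runs underneath. */
	n = splice(fd, NULL, pfd[1], NULL, 65536, SPLICE_F_MOVE);
	if (n < 0)
		perror("splice");
	else
		printf("spliced %zd bytes into the pipe\n", n);
	return 0;
}
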
1732 1732
1733 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, 1733 static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1734 const struct iovec *iov, 1734 const struct iovec *iov,
1735 unsigned long nr_segs, 1735 unsigned long nr_segs,
1736 loff_t pos) 1736 loff_t pos)
1737 { 1737 {
1738 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; 1738 int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
1739 struct file *filp = iocb->ki_filp; 1739 struct file *filp = iocb->ki_filp;
1740 struct inode *inode = filp->f_path.dentry->d_inode; 1740 struct inode *inode = filp->f_path.dentry->d_inode;
1741 1741
1742 mlog_entry("(0x%p, %u, '%.*s')\n", filp, 1742 mlog_entry("(0x%p, %u, '%.*s')\n", filp,
1743 (unsigned int)nr_segs, 1743 (unsigned int)nr_segs,
1744 filp->f_path.dentry->d_name.len, 1744 filp->f_path.dentry->d_name.len,
1745 filp->f_path.dentry->d_name.name); 1745 filp->f_path.dentry->d_name.name);
1746 1746
1747 if (!inode) { 1747 if (!inode) {
1748 ret = -EINVAL; 1748 ret = -EINVAL;
1749 mlog_errno(ret); 1749 mlog_errno(ret);
1750 goto bail; 1750 goto bail;
1751 } 1751 }
1752 1752
1753 /* 1753 /*
1754 * buffered reads protect themselves in ->readpage(). O_DIRECT reads 1754 * buffered reads protect themselves in ->readpage(). O_DIRECT reads
1755 * need locks to protect pending reads from racing with truncate. 1755 * need locks to protect pending reads from racing with truncate.
1756 */ 1756 */
1757 if (filp->f_flags & O_DIRECT) { 1757 if (filp->f_flags & O_DIRECT) {
1758 down_read(&inode->i_alloc_sem); 1758 down_read(&inode->i_alloc_sem);
1759 have_alloc_sem = 1; 1759 have_alloc_sem = 1;
1760 1760
1761 ret = ocfs2_rw_lock(inode, 0); 1761 ret = ocfs2_rw_lock(inode, 0);
1762 if (ret < 0) { 1762 if (ret < 0) {
1763 mlog_errno(ret); 1763 mlog_errno(ret);
1764 goto bail; 1764 goto bail;
1765 } 1765 }
1766 rw_level = 0; 1766 rw_level = 0;
1767 /* communicate with ocfs2_dio_end_io */ 1767 /* communicate with ocfs2_dio_end_io */
1768 ocfs2_iocb_set_rw_locked(iocb, rw_level); 1768 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1769 } 1769 }
1770 1770
1771 /* 1771 /*
1772 * We're fine letting folks race truncates and extending 1772 * We're fine letting folks race truncates and extending
1773 * writes with read across the cluster, just like they can 1773 * writes with read across the cluster, just like they can
1774 * locally. Hence no rw_lock during read. 1774 * locally. Hence no rw_lock during read.
1775 * 1775 *
1776 * Take and drop the meta data lock to update inode fields 1776 * Take and drop the meta data lock to update inode fields
1777 * like i_size. This allows the checks down below 1777 * like i_size. This allows the checks down below
1778 * generic_file_aio_read() a chance of actually working. 1778 * generic_file_aio_read() a chance of actually working.
1779 */ 1779 */
1780 ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level); 1780 ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
1781 if (ret < 0) { 1781 if (ret < 0) {
1782 mlog_errno(ret); 1782 mlog_errno(ret);
1783 goto bail; 1783 goto bail;
1784 } 1784 }
1785 ocfs2_meta_unlock(inode, lock_level); 1785 ocfs2_meta_unlock(inode, lock_level);
1786 1786
1787 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 1787 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
1788 if (ret == -EINVAL) 1788 if (ret == -EINVAL)
1789 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); 1789 mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
1790 1790
1791 /* buffered aio wouldn't have proper lock coverage today */ 1791 /* buffered aio wouldn't have proper lock coverage today */
1792 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1792 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
1793 1793
1794 /* see ocfs2_file_aio_write */ 1794 /* see ocfs2_file_aio_write */
1795 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { 1795 if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
1796 rw_level = -1; 1796 rw_level = -1;
1797 have_alloc_sem = 0; 1797 have_alloc_sem = 0;
1798 } 1798 }
1799 1799
1800 bail: 1800 bail:
1801 if (have_alloc_sem) 1801 if (have_alloc_sem)
1802 up_read(&inode->i_alloc_sem); 1802 up_read(&inode->i_alloc_sem);
1803 if (rw_level != -1) 1803 if (rw_level != -1)
1804 ocfs2_rw_unlock(inode, rw_level); 1804 ocfs2_rw_unlock(inode, rw_level);
1805 mlog_exit(ret); 1805 mlog_exit(ret);
1806 1806
1807 return ret; 1807 return ret;
1808 } 1808 }
1809 1809
1810 const struct inode_operations ocfs2_file_iops = { 1810 const struct inode_operations ocfs2_file_iops = {
1811 .setattr = ocfs2_setattr, 1811 .setattr = ocfs2_setattr,
1812 .getattr = ocfs2_getattr, 1812 .getattr = ocfs2_getattr,
1813 .permission = ocfs2_permission, 1813 .permission = ocfs2_permission,
1814 }; 1814 };
1815 1815
1816 const struct inode_operations ocfs2_special_file_iops = { 1816 const struct inode_operations ocfs2_special_file_iops = {
1817 .setattr = ocfs2_setattr, 1817 .setattr = ocfs2_setattr,
1818 .getattr = ocfs2_getattr, 1818 .getattr = ocfs2_getattr,
1819 .permission = ocfs2_permission, 1819 .permission = ocfs2_permission,
1820 }; 1820 };
1821 1821
1822 const struct file_operations ocfs2_fops = { 1822 const struct file_operations ocfs2_fops = {
1823 .read = do_sync_read, 1823 .read = do_sync_read,
1824 .write = do_sync_write, 1824 .write = do_sync_write,
1825 .mmap = ocfs2_mmap, 1825 .mmap = ocfs2_mmap,
1826 .fsync = ocfs2_sync_file, 1826 .fsync = ocfs2_sync_file,
1827 .release = ocfs2_file_release, 1827 .release = ocfs2_file_release,
1828 .open = ocfs2_file_open, 1828 .open = ocfs2_file_open,
1829 .aio_read = ocfs2_file_aio_read, 1829 .aio_read = ocfs2_file_aio_read,
1830 .aio_write = ocfs2_file_aio_write, 1830 .aio_write = ocfs2_file_aio_write,
1831 .ioctl = ocfs2_ioctl, 1831 .ioctl = ocfs2_ioctl,
1832 #ifdef CONFIG_COMPAT 1832 #ifdef CONFIG_COMPAT
1833 .compat_ioctl = ocfs2_compat_ioctl, 1833 .compat_ioctl = ocfs2_compat_ioctl,
1834 #endif 1834 #endif
1835 .splice_read = ocfs2_file_splice_read, 1835 .splice_read = ocfs2_file_splice_read,
1836 .splice_write = ocfs2_file_splice_write, 1836 .splice_write = ocfs2_file_splice_write,
1837 }; 1837 };
1838 1838
1839 const struct file_operations ocfs2_dops = { 1839 const struct file_operations ocfs2_dops = {
1840 .read = generic_read_dir, 1840 .read = generic_read_dir,
1841 .readdir = ocfs2_readdir, 1841 .readdir = ocfs2_readdir,
1842 .fsync = ocfs2_sync_file, 1842 .fsync = ocfs2_sync_file,
1843 .ioctl = ocfs2_ioctl, 1843 .ioctl = ocfs2_ioctl,
1844 #ifdef CONFIG_COMPAT 1844 #ifdef CONFIG_COMPAT
1845 .compat_ioctl = ocfs2_compat_ioctl, 1845 .compat_ioctl = ocfs2_compat_ioctl,
1846 #endif 1846 #endif
1847 }; 1847 };
1848 1848
1 /* 1 /*
fs/pipe.c
2 * linux/fs/pipe.c 2 * linux/fs/pipe.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 1999 Linus Torvalds 4 * Copyright (C) 1991, 1992, 1999 Linus Torvalds
5 */ 5 */
6 6
7 #include <linux/mm.h> 7 #include <linux/mm.h>
8 #include <linux/file.h> 8 #include <linux/file.h>
9 #include <linux/poll.h> 9 #include <linux/poll.h>
10 #include <linux/slab.h> 10 #include <linux/slab.h>
11 #include <linux/module.h> 11 #include <linux/module.h>
12 #include <linux/init.h> 12 #include <linux/init.h>
13 #include <linux/fs.h> 13 #include <linux/fs.h>
14 #include <linux/mount.h> 14 #include <linux/mount.h>
15 #include <linux/pipe_fs_i.h> 15 #include <linux/pipe_fs_i.h>
16 #include <linux/uio.h> 16 #include <linux/uio.h>
17 #include <linux/highmem.h> 17 #include <linux/highmem.h>
18 #include <linux/pagemap.h> 18 #include <linux/pagemap.h>
19 #include <linux/audit.h> 19 #include <linux/audit.h>
20 20
21 #include <asm/uaccess.h> 21 #include <asm/uaccess.h>
22 #include <asm/ioctls.h> 22 #include <asm/ioctls.h>
23 23
24 /* 24 /*
25 * We use a start+len construction, which provides full use of the 25 * We use a start+len construction, which provides full use of the
26 * allocated memory. 26 * allocated memory.
27 * -- Florian Coosmann (FGC) 27 * -- Florian Coosmann (FGC)
28 * 28 *
29 * Reads with count = 0 should always return 0. 29 * Reads with count = 0 should always return 0.
30 * -- Julian Bradfield 1999-06-07. 30 * -- Julian Bradfield 1999-06-07.
31 * 31 *
32 * FIFOs and Pipes now generate SIGIO for both readers and writers. 32 * FIFOs and Pipes now generate SIGIO for both readers and writers.
33 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16 33 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
34 * 34 *
35 * pipe_read & write cleanup 35 * pipe_read & write cleanup
36 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 36 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
37 */ 37 */
38 38
39 /* Drop the inode semaphore and wait for a pipe event, atomically */ 39 /* Drop the inode semaphore and wait for a pipe event, atomically */
40 void pipe_wait(struct pipe_inode_info *pipe) 40 void pipe_wait(struct pipe_inode_info *pipe)
41 { 41 {
42 DEFINE_WAIT(wait); 42 DEFINE_WAIT(wait);
43 43
44 /* 44 /*
45 * Pipes are system-local resources, so sleeping on them 45 * Pipes are system-local resources, so sleeping on them
46 * is considered a noninteractive wait: 46 * is considered a noninteractive wait:
47 */ 47 */
48 prepare_to_wait(&pipe->wait, &wait, 48 prepare_to_wait(&pipe->wait, &wait,
49 TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); 49 TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
50 if (pipe->inode) 50 if (pipe->inode)
51 mutex_unlock(&pipe->inode->i_mutex); 51 mutex_unlock(&pipe->inode->i_mutex);
52 schedule(); 52 schedule();
53 finish_wait(&pipe->wait, &wait); 53 finish_wait(&pipe->wait, &wait);
54 if (pipe->inode) 54 if (pipe->inode)
55 mutex_lock(&pipe->inode->i_mutex); 55 mutex_lock(&pipe->inode->i_mutex);
56 } 56 }
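
pipe_wait() drops i_mutex across the sleep and retakes it before returning, so callers must re-check their condition after every wakeup. A sketch of the calling convention, mirroring the read/write loops later in this file:

/* Caller holds inode->i_mutex; pipe_wait() releases it while asleep. */
static int wait_for_buffers(struct pipe_inode_info *pipe)
{
	while (!pipe->nrbufs) {
		if (signal_pending(current))
			return -ERESTARTSYS;
		pipe_wait(pipe);	/* may drop and retake i_mutex */
	}
	return 0;
}
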
57 57
58 static int 58 static int
59 pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, 59 pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
60 int atomic) 60 int atomic)
61 { 61 {
62 unsigned long copy; 62 unsigned long copy;
63 63
64 while (len > 0) { 64 while (len > 0) {
65 while (!iov->iov_len) 65 while (!iov->iov_len)
66 iov++; 66 iov++;
67 copy = min_t(unsigned long, len, iov->iov_len); 67 copy = min_t(unsigned long, len, iov->iov_len);
68 68
69 if (atomic) { 69 if (atomic) {
70 if (__copy_from_user_inatomic(to, iov->iov_base, copy)) 70 if (__copy_from_user_inatomic(to, iov->iov_base, copy))
71 return -EFAULT; 71 return -EFAULT;
72 } else { 72 } else {
73 if (copy_from_user(to, iov->iov_base, copy)) 73 if (copy_from_user(to, iov->iov_base, copy))
74 return -EFAULT; 74 return -EFAULT;
75 } 75 }
76 to += copy; 76 to += copy;
77 len -= copy; 77 len -= copy;
78 iov->iov_base += copy; 78 iov->iov_base += copy;
79 iov->iov_len -= copy; 79 iov->iov_len -= copy;
80 } 80 }
81 return 0; 81 return 0;
82 } 82 }
83 83
84 static int 84 static int
85 pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, 85 pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
86 int atomic) 86 int atomic)
87 { 87 {
88 unsigned long copy; 88 unsigned long copy;
89 89
90 while (len > 0) { 90 while (len > 0) {
91 while (!iov->iov_len) 91 while (!iov->iov_len)
92 iov++; 92 iov++;
93 copy = min_t(unsigned long, len, iov->iov_len); 93 copy = min_t(unsigned long, len, iov->iov_len);
94 94
95 if (atomic) { 95 if (atomic) {
96 if (__copy_to_user_inatomic(iov->iov_base, from, copy)) 96 if (__copy_to_user_inatomic(iov->iov_base, from, copy))
97 return -EFAULT; 97 return -EFAULT;
98 } else { 98 } else {
99 if (copy_to_user(iov->iov_base, from, copy)) 99 if (copy_to_user(iov->iov_base, from, copy))
100 return -EFAULT; 100 return -EFAULT;
101 } 101 }
102 from += copy; 102 from += copy;
103 len -= copy; 103 len -= copy;
104 iov->iov_base += copy; 104 iov->iov_base += copy;
105 iov->iov_len -= copy; 105 iov->iov_len -= copy;
106 } 106 }
107 return 0; 107 return 0;
108 } 108 }
109 109
110 /* 110 /*
111 * Attempt to pre-fault in the user memory, so we can use atomic copies. 111 * Attempt to pre-fault in the user memory, so we can use atomic copies.
112 * Returns the number of bytes not faulted in. 112 * Returns the number of bytes not faulted in.
113 */ 113 */
114 static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) 114 static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
115 { 115 {
116 while (!iov->iov_len) 116 while (!iov->iov_len)
117 iov++; 117 iov++;
118 118
119 while (len > 0) { 119 while (len > 0) {
120 unsigned long this_len; 120 unsigned long this_len;
121 121
122 this_len = min_t(unsigned long, len, iov->iov_len); 122 this_len = min_t(unsigned long, len, iov->iov_len);
123 if (fault_in_pages_writeable(iov->iov_base, this_len)) 123 if (fault_in_pages_writeable(iov->iov_base, this_len))
124 break; 124 break;
125 125
126 len -= this_len; 126 len -= this_len;
127 iov++; 127 iov++;
128 } 128 }
129 129
130 return len; 130 return len;
131 } 131 }
132 132
133 /* 133 /*
134 * Pre-fault in the user memory, so we can use atomic copies. 134 * Pre-fault in the user memory, so we can use atomic copies.
135 */ 135 */
136 static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len) 136 static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
137 { 137 {
138 while (!iov->iov_len) 138 while (!iov->iov_len)
139 iov++; 139 iov++;
140 140
141 while (len > 0) { 141 while (len > 0) {
142 unsigned long this_len; 142 unsigned long this_len;
143 143
144 this_len = min_t(unsigned long, len, iov->iov_len); 144 this_len = min_t(unsigned long, len, iov->iov_len);
145 fault_in_pages_readable(iov->iov_base, this_len); 145 fault_in_pages_readable(iov->iov_base, this_len);
146 len -= this_len; 146 len -= this_len;
147 iov++; 147 iov++;
148 } 148 }
149 } 149 }
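
Together these helpers enable a two-phase copy: prefault the user pages, attempt the atomic (non-sleeping) copy under an atomic kmap, and fall back to the sleeping copy only when a fault slips through anyway. A sketch of the consumer-side pattern, matching the redo:/redo1:/redo2: loops in pipe_read() and pipe_write() below:

static int copy_buf_to_user(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf,
			    struct iovec *iov, size_t chars)
{
	/* Prefault; if every page is in, try the fast atomic copy. */
	int atomic = !iov_fault_in_pages_write(iov, chars);
	void *addr;
	int error;
redo:
	addr = buf->ops->map(pipe, buf, atomic);
	error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
	buf->ops->unmap(pipe, buf, addr);	/* always pair with map */
	if (error && atomic) {
		atomic = 0;		/* retry on the slow, sleeping path */
		goto redo;
	}
	return error;
}
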
150 150
151 static void anon_pipe_buf_release(struct pipe_inode_info *pipe, 151 static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
152 struct pipe_buffer *buf) 152 struct pipe_buffer *buf)
153 { 153 {
154 struct page *page = buf->page; 154 struct page *page = buf->page;
155 155
156 /* 156 /*
157 * If nobody else uses this page, and we don't already have a 157 * If nobody else uses this page, and we don't already have a
158 * temporary page, let's keep track of it as a one-deep 158 * temporary page, let's keep track of it as a one-deep
159 * allocation cache. (Otherwise just release our reference to it) 159 * allocation cache. (Otherwise just release our reference to it)
160 */ 160 */
161 if (page_count(page) == 1 && !pipe->tmp_page) 161 if (page_count(page) == 1 && !pipe->tmp_page)
162 pipe->tmp_page = page; 162 pipe->tmp_page = page;
163 else 163 else
164 page_cache_release(page); 164 page_cache_release(page);
165 } 165 }
166 166
167 void *generic_pipe_buf_map(struct pipe_inode_info *pipe, 167 void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
168 struct pipe_buffer *buf, int atomic) 168 struct pipe_buffer *buf, int atomic)
169 { 169 {
170 if (atomic) { 170 if (atomic) {
171 buf->flags |= PIPE_BUF_FLAG_ATOMIC; 171 buf->flags |= PIPE_BUF_FLAG_ATOMIC;
172 return kmap_atomic(buf->page, KM_USER0); 172 return kmap_atomic(buf->page, KM_USER0);
173 } 173 }
174 174
175 return kmap(buf->page); 175 return kmap(buf->page);
176 } 176 }
177 177
178 void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, 178 void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
179 struct pipe_buffer *buf, void *map_data) 179 struct pipe_buffer *buf, void *map_data)
180 { 180 {
181 if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { 181 if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
182 buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; 182 buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
183 kunmap_atomic(map_data, KM_USER0); 183 kunmap_atomic(map_data, KM_USER0);
184 } else 184 } else
185 kunmap(buf->page); 185 kunmap(buf->page);
186 } 186 }
187 187
188 int generic_pipe_buf_steal(struct pipe_inode_info *pipe, 188 int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
189 struct pipe_buffer *buf) 189 struct pipe_buffer *buf)
190 { 190 {
191 struct page *page = buf->page; 191 struct page *page = buf->page;
192 192
193 if (page_count(page) == 1) { 193 if (page_count(page) == 1) {
194 lock_page(page); 194 lock_page(page);
195 return 0; 195 return 0;
196 } 196 }
197 197
198 return 1; 198 return 1;
199 } 199 }
200 200
201 void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) 201 void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf)
202 { 202 {
203 page_cache_get(buf->page); 203 page_cache_get(buf->page);
204 } 204 }
205 205
206 int generic_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf) 206 int generic_pipe_buf_confirm(struct pipe_inode_info *info,
207 struct pipe_buffer *buf)
207 { 208 {
208 return 0; 209 return 0;
209 } 210 }
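
For anonymous pipe pages the data is always resident, so generic_pipe_buf_confirm() can return 0 unconditionally. The rename matters for buffers whose backing page may not be ready, e.g. page-cache pages queued by splice. A simplified sketch of such a ->confirm, loosely modelled on the page-cache buffer ops in fs/splice.c (not the exact implementation):

static int my_page_cache_buf_confirm(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	if (!PageUptodate(page)) {
		lock_page(page);
		/* Truncated away while queued in the pipe? */
		if (!page->mapping) {
			unlock_page(page);
			return -ENODATA;
		}
		/* Read completed but failed? */
		if (!PageUptodate(page)) {
			unlock_page(page);
			return -EIO;
		}
		unlock_page(page);
	}
	return 0;
}
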
210 211
211 static const struct pipe_buf_operations anon_pipe_buf_ops = { 212 static const struct pipe_buf_operations anon_pipe_buf_ops = {
212 .can_merge = 1, 213 .can_merge = 1,
213 .map = generic_pipe_buf_map, 214 .map = generic_pipe_buf_map,
214 .unmap = generic_pipe_buf_unmap, 215 .unmap = generic_pipe_buf_unmap,
215 .pin = generic_pipe_buf_pin, 216 .confirm = generic_pipe_buf_confirm,
216 .release = anon_pipe_buf_release, 217 .release = anon_pipe_buf_release,
217 .steal = generic_pipe_buf_steal, 218 .steal = generic_pipe_buf_steal,
218 .get = generic_pipe_buf_get, 219 .get = generic_pipe_buf_get,
219 }; 220 };
220 221
221 static ssize_t 222 static ssize_t
222 pipe_read(struct kiocb *iocb, const struct iovec *_iov, 223 pipe_read(struct kiocb *iocb, const struct iovec *_iov,
223 unsigned long nr_segs, loff_t pos) 224 unsigned long nr_segs, loff_t pos)
224 { 225 {
225 struct file *filp = iocb->ki_filp; 226 struct file *filp = iocb->ki_filp;
226 struct inode *inode = filp->f_path.dentry->d_inode; 227 struct inode *inode = filp->f_path.dentry->d_inode;
227 struct pipe_inode_info *pipe; 228 struct pipe_inode_info *pipe;
228 int do_wakeup; 229 int do_wakeup;
229 ssize_t ret; 230 ssize_t ret;
230 struct iovec *iov = (struct iovec *)_iov; 231 struct iovec *iov = (struct iovec *)_iov;
231 size_t total_len; 232 size_t total_len;
232 233
233 total_len = iov_length(iov, nr_segs); 234 total_len = iov_length(iov, nr_segs);
234 /* Null read succeeds. */ 235 /* Null read succeeds. */
235 if (unlikely(total_len == 0)) 236 if (unlikely(total_len == 0))
236 return 0; 237 return 0;
237 238
238 do_wakeup = 0; 239 do_wakeup = 0;
239 ret = 0; 240 ret = 0;
240 mutex_lock(&inode->i_mutex); 241 mutex_lock(&inode->i_mutex);
241 pipe = inode->i_pipe; 242 pipe = inode->i_pipe;
242 for (;;) { 243 for (;;) {
243 int bufs = pipe->nrbufs; 244 int bufs = pipe->nrbufs;
244 if (bufs) { 245 if (bufs) {
245 int curbuf = pipe->curbuf; 246 int curbuf = pipe->curbuf;
246 struct pipe_buffer *buf = pipe->bufs + curbuf; 247 struct pipe_buffer *buf = pipe->bufs + curbuf;
247 const struct pipe_buf_operations *ops = buf->ops; 248 const struct pipe_buf_operations *ops = buf->ops;
248 void *addr; 249 void *addr;
249 size_t chars = buf->len; 250 size_t chars = buf->len;
250 int error, atomic; 251 int error, atomic;
251 252
252 if (chars > total_len) 253 if (chars > total_len)
253 chars = total_len; 254 chars = total_len;
254 255
255 error = ops->pin(pipe, buf); 256 error = ops->confirm(pipe, buf);
256 if (error) { 257 if (error) {
257 if (!ret) 258 if (!ret)
258 error = ret; 259 error = ret;
259 break; 260 break;
260 } 261 }
261 262
262 atomic = !iov_fault_in_pages_write(iov, chars); 263 atomic = !iov_fault_in_pages_write(iov, chars);
263 redo: 264 redo:
264 addr = ops->map(pipe, buf, atomic); 265 addr = ops->map(pipe, buf, atomic);
265 error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); 266 error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
266 ops->unmap(pipe, buf, addr); 267 ops->unmap(pipe, buf, addr);
267 if (unlikely(error)) { 268 if (unlikely(error)) {
268 /* 269 /*
269 * Just retry with the slow path if we failed. 270 * Just retry with the slow path if we failed.
270 */ 271 */
271 if (atomic) { 272 if (atomic) {
272 atomic = 0; 273 atomic = 0;
273 goto redo; 274 goto redo;
274 } 275 }
275 if (!ret) 276 if (!ret)
276 ret = error; 277 ret = error;
277 break; 278 break;
278 } 279 }
279 ret += chars; 280 ret += chars;
280 buf->offset += chars; 281 buf->offset += chars;
281 buf->len -= chars; 282 buf->len -= chars;
282 if (!buf->len) { 283 if (!buf->len) {
283 buf->ops = NULL; 284 buf->ops = NULL;
284 ops->release(pipe, buf); 285 ops->release(pipe, buf);
285 curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); 286 curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
286 pipe->curbuf = curbuf; 287 pipe->curbuf = curbuf;
287 pipe->nrbufs = --bufs; 288 pipe->nrbufs = --bufs;
288 do_wakeup = 1; 289 do_wakeup = 1;
289 } 290 }
290 total_len -= chars; 291 total_len -= chars;
291 if (!total_len) 292 if (!total_len)
292 break; /* common path: read succeeded */ 293 break; /* common path: read succeeded */
293 } 294 }
294 if (bufs) /* More to do? */ 295 if (bufs) /* More to do? */
295 continue; 296 continue;
296 if (!pipe->writers) 297 if (!pipe->writers)
297 break; 298 break;
298 if (!pipe->waiting_writers) { 299 if (!pipe->waiting_writers) {
299 /* syscall merging: Usually we must not sleep 300 /* syscall merging: Usually we must not sleep
300 * if O_NONBLOCK is set, or if we got some data. 301 * if O_NONBLOCK is set, or if we got some data.
301 * But if a writer sleeps in kernel space, then 302 * But if a writer sleeps in kernel space, then
302 * we can wait for that data without violating POSIX. 303 * we can wait for that data without violating POSIX.
303 */ 304 */
304 if (ret) 305 if (ret)
305 break; 306 break;
306 if (filp->f_flags & O_NONBLOCK) { 307 if (filp->f_flags & O_NONBLOCK) {
307 ret = -EAGAIN; 308 ret = -EAGAIN;
308 break; 309 break;
309 } 310 }
310 } 311 }
311 if (signal_pending(current)) { 312 if (signal_pending(current)) {
312 if (!ret) 313 if (!ret)
313 ret = -ERESTARTSYS; 314 ret = -ERESTARTSYS;
314 break; 315 break;
315 } 316 }
316 if (do_wakeup) { 317 if (do_wakeup) {
317 wake_up_interruptible_sync(&pipe->wait); 318 wake_up_interruptible_sync(&pipe->wait);
318 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 319 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
319 } 320 }
320 pipe_wait(pipe); 321 pipe_wait(pipe);
321 } 322 }
322 mutex_unlock(&inode->i_mutex); 323 mutex_unlock(&inode->i_mutex);
323 324
324 /* Signal writers asynchronously that there is more room. */ 325 /* Signal writers asynchronously that there is more room. */
325 if (do_wakeup) { 326 if (do_wakeup) {
326 wake_up_interruptible(&pipe->wait); 327 wake_up_interruptible(&pipe->wait);
327 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 328 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
328 } 329 }
329 if (ret > 0) 330 if (ret > 0)
330 file_accessed(filp); 331 file_accessed(filp);
331 return ret; 332 return ret;
332 } 333 }
333 334
334 static ssize_t 335 static ssize_t
335 pipe_write(struct kiocb *iocb, const struct iovec *_iov, 336 pipe_write(struct kiocb *iocb, const struct iovec *_iov,
336 unsigned long nr_segs, loff_t ppos) 337 unsigned long nr_segs, loff_t ppos)
337 { 338 {
338 struct file *filp = iocb->ki_filp; 339 struct file *filp = iocb->ki_filp;
339 struct inode *inode = filp->f_path.dentry->d_inode; 340 struct inode *inode = filp->f_path.dentry->d_inode;
340 struct pipe_inode_info *pipe; 341 struct pipe_inode_info *pipe;
341 ssize_t ret; 342 ssize_t ret;
342 int do_wakeup; 343 int do_wakeup;
343 struct iovec *iov = (struct iovec *)_iov; 344 struct iovec *iov = (struct iovec *)_iov;
344 size_t total_len; 345 size_t total_len;
345 ssize_t chars; 346 ssize_t chars;
346 347
347 total_len = iov_length(iov, nr_segs); 348 total_len = iov_length(iov, nr_segs);
348 /* Null write succeeds. */ 349 /* Null write succeeds. */
349 if (unlikely(total_len == 0)) 350 if (unlikely(total_len == 0))
350 return 0; 351 return 0;
351 352
352 do_wakeup = 0; 353 do_wakeup = 0;
353 ret = 0; 354 ret = 0;
354 mutex_lock(&inode->i_mutex); 355 mutex_lock(&inode->i_mutex);
355 pipe = inode->i_pipe; 356 pipe = inode->i_pipe;
356 357
357 if (!pipe->readers) { 358 if (!pipe->readers) {
358 send_sig(SIGPIPE, current, 0); 359 send_sig(SIGPIPE, current, 0);
359 ret = -EPIPE; 360 ret = -EPIPE;
360 goto out; 361 goto out;
361 } 362 }
362 363
363 /* We try to merge small writes */ 364 /* We try to merge small writes */
364 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ 365 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
365 if (pipe->nrbufs && chars != 0) { 366 if (pipe->nrbufs && chars != 0) {
366 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & 367 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
367 (PIPE_BUFFERS-1); 368 (PIPE_BUFFERS-1);
368 struct pipe_buffer *buf = pipe->bufs + lastbuf; 369 struct pipe_buffer *buf = pipe->bufs + lastbuf;
369 const struct pipe_buf_operations *ops = buf->ops; 370 const struct pipe_buf_operations *ops = buf->ops;
370 int offset = buf->offset + buf->len; 371 int offset = buf->offset + buf->len;
371 372
372 if (ops->can_merge && offset + chars <= PAGE_SIZE) { 373 if (ops->can_merge && offset + chars <= PAGE_SIZE) {
373 int error, atomic = 1; 374 int error, atomic = 1;
374 void *addr; 375 void *addr;
375 376
376 error = ops->pin(pipe, buf); 377 error = ops->confirm(pipe, buf);
377 if (error) 378 if (error)
378 goto out; 379 goto out;
379 380
380 iov_fault_in_pages_read(iov, chars); 381 iov_fault_in_pages_read(iov, chars);
381 redo1: 382 redo1:
382 addr = ops->map(pipe, buf, atomic); 383 addr = ops->map(pipe, buf, atomic);
383 error = pipe_iov_copy_from_user(offset + addr, iov, 384 error = pipe_iov_copy_from_user(offset + addr, iov,
384 chars, atomic); 385 chars, atomic);
385 ops->unmap(pipe, buf, addr); 386 ops->unmap(pipe, buf, addr);
386 ret = error; 387 ret = error;
387 do_wakeup = 1; 388 do_wakeup = 1;
388 if (error) { 389 if (error) {
389 if (atomic) { 390 if (atomic) {
390 atomic = 0; 391 atomic = 0;
391 goto redo1; 392 goto redo1;
392 } 393 }
393 goto out; 394 goto out;
394 } 395 }
395 buf->len += chars; 396 buf->len += chars;
396 total_len -= chars; 397 total_len -= chars;
397 ret = chars; 398 ret = chars;
398 if (!total_len) 399 if (!total_len)
399 goto out; 400 goto out;
400 } 401 }
401 } 402 }
402 403
403 for (;;) { 404 for (;;) {
404 int bufs; 405 int bufs;
405 406
406 if (!pipe->readers) { 407 if (!pipe->readers) {
407 send_sig(SIGPIPE, current, 0); 408 send_sig(SIGPIPE, current, 0);
408 if (!ret) 409 if (!ret)
409 ret = -EPIPE; 410 ret = -EPIPE;
410 break; 411 break;
411 } 412 }
412 bufs = pipe->nrbufs; 413 bufs = pipe->nrbufs;
413 if (bufs < PIPE_BUFFERS) { 414 if (bufs < PIPE_BUFFERS) {
414 int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); 415 int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
415 struct pipe_buffer *buf = pipe->bufs + newbuf; 416 struct pipe_buffer *buf = pipe->bufs + newbuf;
416 struct page *page = pipe->tmp_page; 417 struct page *page = pipe->tmp_page;
417 char *src; 418 char *src;
418 int error, atomic = 1; 419 int error, atomic = 1;
419 420
420 if (!page) { 421 if (!page) {
421 page = alloc_page(GFP_HIGHUSER); 422 page = alloc_page(GFP_HIGHUSER);
422 if (unlikely(!page)) { 423 if (unlikely(!page)) {
423 ret = ret ? : -ENOMEM; 424 ret = ret ? : -ENOMEM;
424 break; 425 break;
425 } 426 }
426 pipe->tmp_page = page; 427 pipe->tmp_page = page;
427 } 428 }
428 /* Always wake up, even if the copy fails. Otherwise 429 /* Always wake up, even if the copy fails. Otherwise
429 * we lock up (O_NONBLOCK-)readers that sleep due to 430 * we lock up (O_NONBLOCK-)readers that sleep due to
430 * syscall merging. 431 * syscall merging.
431 * FIXME! Is this really true? 432 * FIXME! Is this really true?
432 */ 433 */
433 do_wakeup = 1; 434 do_wakeup = 1;
434 chars = PAGE_SIZE; 435 chars = PAGE_SIZE;
435 if (chars > total_len) 436 if (chars > total_len)
436 chars = total_len; 437 chars = total_len;
437 438
438 iov_fault_in_pages_read(iov, chars); 439 iov_fault_in_pages_read(iov, chars);
439 redo2: 440 redo2:
440 if (atomic) 441 if (atomic)
441 src = kmap_atomic(page, KM_USER0); 442 src = kmap_atomic(page, KM_USER0);
442 else 443 else
443 src = kmap(page); 444 src = kmap(page);
444 445
445 error = pipe_iov_copy_from_user(src, iov, chars, 446 error = pipe_iov_copy_from_user(src, iov, chars,
446 atomic); 447 atomic);
447 if (atomic) 448 if (atomic)
448 kunmap_atomic(src, KM_USER0); 449 kunmap_atomic(src, KM_USER0);
449 else 450 else
450 kunmap(page); 451 kunmap(page);
451 452
452 if (unlikely(error)) { 453 if (unlikely(error)) {
453 if (atomic) { 454 if (atomic) {
454 atomic = 0; 455 atomic = 0;
455 goto redo2; 456 goto redo2;
456 } 457 }
457 if (!ret) 458 if (!ret)
458 ret = error; 459 ret = error;
459 break; 460 break;
460 } 461 }
461 ret += chars; 462 ret += chars;
462 463
463 /* Insert it into the buffer array */ 464 /* Insert it into the buffer array */
464 buf->page = page; 465 buf->page = page;
465 buf->ops = &anon_pipe_buf_ops; 466 buf->ops = &anon_pipe_buf_ops;
466 buf->offset = 0; 467 buf->offset = 0;
467 buf->len = chars; 468 buf->len = chars;
468 pipe->nrbufs = ++bufs; 469 pipe->nrbufs = ++bufs;
469 pipe->tmp_page = NULL; 470 pipe->tmp_page = NULL;
470 471
471 total_len -= chars; 472 total_len -= chars;
472 if (!total_len) 473 if (!total_len)
473 break; 474 break;
474 } 475 }
475 if (bufs < PIPE_BUFFERS) 476 if (bufs < PIPE_BUFFERS)
476 continue; 477 continue;
477 if (filp->f_flags & O_NONBLOCK) { 478 if (filp->f_flags & O_NONBLOCK) {
478 if (!ret) 479 if (!ret)
479 ret = -EAGAIN; 480 ret = -EAGAIN;
480 break; 481 break;
481 } 482 }
482 if (signal_pending(current)) { 483 if (signal_pending(current)) {
483 if (!ret) 484 if (!ret)
484 ret = -ERESTARTSYS; 485 ret = -ERESTARTSYS;
485 break; 486 break;
486 } 487 }
487 if (do_wakeup) { 488 if (do_wakeup) {
488 wake_up_interruptible_sync(&pipe->wait); 489 wake_up_interruptible_sync(&pipe->wait);
489 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 490 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
490 do_wakeup = 0; 491 do_wakeup = 0;
491 } 492 }
492 pipe->waiting_writers++; 493 pipe->waiting_writers++;
493 pipe_wait(pipe); 494 pipe_wait(pipe);
494 pipe->waiting_writers--; 495 pipe->waiting_writers--;
495 } 496 }
496 out: 497 out:
497 mutex_unlock(&inode->i_mutex); 498 mutex_unlock(&inode->i_mutex);
498 if (do_wakeup) { 499 if (do_wakeup) {
499 wake_up_interruptible(&pipe->wait); 500 wake_up_interruptible(&pipe->wait);
500 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 501 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
501 } 502 }
502 if (ret > 0) 503 if (ret > 0)
503 file_update_time(filp); 504 file_update_time(filp);
504 return ret; 505 return ret;
505 } 506 }
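
The merge fast path at the top of pipe_write() is why a stream of small writes does not burn one page per write: the tail of a write is appended to the previous buffer when its ops allow merging and it still fits in the same page. A sketch of just that test, extracted from the code above:

static int can_append_to_last_buf(struct pipe_inode_info *pipe, size_t chars)
{
	int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & (PIPE_BUFFERS - 1);
	struct pipe_buffer *buf = pipe->bufs + lastbuf;

	return pipe->nrbufs && buf->ops->can_merge &&
	       buf->offset + buf->len + chars <= PAGE_SIZE;
}
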
506 507
507 static ssize_t 508 static ssize_t
508 bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) 509 bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
509 { 510 {
510 return -EBADF; 511 return -EBADF;
511 } 512 }
512 513
513 static ssize_t 514 static ssize_t
514 bad_pipe_w(struct file *filp, const char __user *buf, size_t count, 515 bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
515 loff_t *ppos) 516 loff_t *ppos)
516 { 517 {
517 return -EBADF; 518 return -EBADF;
518 } 519 }
519 520
520 static int 521 static int
521 pipe_ioctl(struct inode *pino, struct file *filp, 522 pipe_ioctl(struct inode *pino, struct file *filp,
522 unsigned int cmd, unsigned long arg) 523 unsigned int cmd, unsigned long arg)
523 { 524 {
524 struct inode *inode = filp->f_path.dentry->d_inode; 525 struct inode *inode = filp->f_path.dentry->d_inode;
525 struct pipe_inode_info *pipe; 526 struct pipe_inode_info *pipe;
526 int count, buf, nrbufs; 527 int count, buf, nrbufs;
527 528
528 switch (cmd) { 529 switch (cmd) {
529 case FIONREAD: 530 case FIONREAD:
530 mutex_lock(&inode->i_mutex); 531 mutex_lock(&inode->i_mutex);
531 pipe = inode->i_pipe; 532 pipe = inode->i_pipe;
532 count = 0; 533 count = 0;
533 buf = pipe->curbuf; 534 buf = pipe->curbuf;
534 nrbufs = pipe->nrbufs; 535 nrbufs = pipe->nrbufs;
535 while (--nrbufs >= 0) { 536 while (--nrbufs >= 0) {
536 count += pipe->bufs[buf].len; 537 count += pipe->bufs[buf].len;
537 buf = (buf+1) & (PIPE_BUFFERS-1); 538 buf = (buf+1) & (PIPE_BUFFERS-1);
538 } 539 }
539 mutex_unlock(&inode->i_mutex); 540 mutex_unlock(&inode->i_mutex);
540 541
541 return put_user(count, (int __user *)arg); 542 return put_user(count, (int __user *)arg);
542 default: 543 default:
543 return -EINVAL; 544 return -EINVAL;
544 } 545 }
545 } 546 }
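
FIONREAD is the only pipe ioctl: it walks the buffer ring and totals the queued bytes. A self-contained userspace check:

#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int pfd[2], queued = 0;

	if (pipe(pfd) < 0)
		return 1;
	write(pfd[1], "hello", 5);
	if (ioctl(pfd[0], FIONREAD, &queued) == 0)
		printf("%d bytes queued\n", queued);	/* prints 5 */
	return 0;
}
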
546 547
547 /* No kernel lock held - fine */ 548 /* No kernel lock held - fine */
548 static unsigned int 549 static unsigned int
549 pipe_poll(struct file *filp, poll_table *wait) 550 pipe_poll(struct file *filp, poll_table *wait)
550 { 551 {
551 unsigned int mask; 552 unsigned int mask;
552 struct inode *inode = filp->f_path.dentry->d_inode; 553 struct inode *inode = filp->f_path.dentry->d_inode;
553 struct pipe_inode_info *pipe = inode->i_pipe; 554 struct pipe_inode_info *pipe = inode->i_pipe;
554 int nrbufs; 555 int nrbufs;
555 556
556 poll_wait(filp, &pipe->wait, wait); 557 poll_wait(filp, &pipe->wait, wait);
557 558
558 /* Reading only -- no need for acquiring the semaphore. */ 559 /* Reading only -- no need for acquiring the semaphore. */
559 nrbufs = pipe->nrbufs; 560 nrbufs = pipe->nrbufs;
560 mask = 0; 561 mask = 0;
561 if (filp->f_mode & FMODE_READ) { 562 if (filp->f_mode & FMODE_READ) {
562 mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; 563 mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
563 if (!pipe->writers && filp->f_version != pipe->w_counter) 564 if (!pipe->writers && filp->f_version != pipe->w_counter)
564 mask |= POLLHUP; 565 mask |= POLLHUP;
565 } 566 }
566 567
567 if (filp->f_mode & FMODE_WRITE) { 568 if (filp->f_mode & FMODE_WRITE) {
568 mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; 569 mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
569 /* 570 /*
570 * Most Unices do not set POLLERR for FIFOs but on Linux they 571 * Most Unices do not set POLLERR for FIFOs but on Linux they
571 * behave exactly like pipes for poll(). 572 * behave exactly like pipes for poll().
572 */ 573 */
573 if (!pipe->readers) 574 if (!pipe->readers)
574 mask |= POLLERR; 575 mask |= POLLERR;
575 } 576 }
576 577
577 return mask; 578 return mask;
578 } 579 }
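
The w_counter test above is what turns "all writers gone" into POLLHUP on the read end, observable from userspace:

#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int pfd[2];
	struct pollfd p;

	if (pipe(pfd) < 0)
		return 1;
	close(pfd[1]);			/* last writer goes away */
	p.fd = pfd[0];
	p.events = POLLIN;
	poll(&p, 1, 0);
	printf("POLLHUP set: %d\n", !!(p.revents & POLLHUP));	/* 1 */
	return 0;
}
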
579 580
580 static int 581 static int
581 pipe_release(struct inode *inode, int decr, int decw) 582 pipe_release(struct inode *inode, int decr, int decw)
582 { 583 {
583 struct pipe_inode_info *pipe; 584 struct pipe_inode_info *pipe;
584 585
585 mutex_lock(&inode->i_mutex); 586 mutex_lock(&inode->i_mutex);
586 pipe = inode->i_pipe; 587 pipe = inode->i_pipe;
587 pipe->readers -= decr; 588 pipe->readers -= decr;
588 pipe->writers -= decw; 589 pipe->writers -= decw;
589 590
590 if (!pipe->readers && !pipe->writers) { 591 if (!pipe->readers && !pipe->writers) {
591 free_pipe_info(inode); 592 free_pipe_info(inode);
592 } else { 593 } else {
593 wake_up_interruptible(&pipe->wait); 594 wake_up_interruptible(&pipe->wait);
594 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 595 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
595 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 596 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
596 } 597 }
597 mutex_unlock(&inode->i_mutex); 598 mutex_unlock(&inode->i_mutex);
598 599
599 return 0; 600 return 0;
600 } 601 }
601 602
602 static int 603 static int
603 pipe_read_fasync(int fd, struct file *filp, int on) 604 pipe_read_fasync(int fd, struct file *filp, int on)
604 { 605 {
605 struct inode *inode = filp->f_path.dentry->d_inode; 606 struct inode *inode = filp->f_path.dentry->d_inode;
606 int retval; 607 int retval;
607 608
608 mutex_lock(&inode->i_mutex); 609 mutex_lock(&inode->i_mutex);
609 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); 610 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
610 mutex_unlock(&inode->i_mutex); 611 mutex_unlock(&inode->i_mutex);
611 612
612 if (retval < 0) 613 if (retval < 0)
613 return retval; 614 return retval;
614 615
615 return 0; 616 return 0;
616 } 617 }
617 618
618 619
619 static int 620 static int
620 pipe_write_fasync(int fd, struct file *filp, int on) 621 pipe_write_fasync(int fd, struct file *filp, int on)
621 { 622 {
622 struct inode *inode = filp->f_path.dentry->d_inode; 623 struct inode *inode = filp->f_path.dentry->d_inode;
623 int retval; 624 int retval;
624 625
625 mutex_lock(&inode->i_mutex); 626 mutex_lock(&inode->i_mutex);
626 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); 627 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
627 mutex_unlock(&inode->i_mutex); 628 mutex_unlock(&inode->i_mutex);
628 629
629 if (retval < 0) 630 if (retval < 0)
630 return retval; 631 return retval;
631 632
632 return 0; 633 return 0;
633 } 634 }
634 635
635 636
636 static int 637 static int
637 pipe_rdwr_fasync(int fd, struct file *filp, int on) 638 pipe_rdwr_fasync(int fd, struct file *filp, int on)
638 { 639 {
639 struct inode *inode = filp->f_path.dentry->d_inode; 640 struct inode *inode = filp->f_path.dentry->d_inode;
640 struct pipe_inode_info *pipe = inode->i_pipe; 641 struct pipe_inode_info *pipe = inode->i_pipe;
641 int retval; 642 int retval;
642 643
643 mutex_lock(&inode->i_mutex); 644 mutex_lock(&inode->i_mutex);
644 645
645 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); 646 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
646 647
647 if (retval >= 0) 648 if (retval >= 0)
648 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); 649 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
649 650
650 mutex_unlock(&inode->i_mutex); 651 mutex_unlock(&inode->i_mutex);
651 652
652 if (retval < 0) 653 if (retval < 0)
653 return retval; 654 return retval;
654 655
655 return 0; 656 return 0;
656 } 657 }
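
These fasync hooks feed the kill_fasync() calls in pipe_read() and pipe_write(). From userspace, signal-driven pipe I/O is enabled with F_SETOWN plus O_ASYNC; a sketch (the polling wait at the end is just to keep the demo race-free):

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_sigio;

static void on_sigio(int sig)
{
	got_sigio = 1;
}

int main(void)
{
	int pfd[2], i;

	if (pipe(pfd) < 0)
		return 1;
	signal(SIGIO, on_sigio);
	fcntl(pfd[0], F_SETOWN, getpid());	/* deliver SIGIO to us */
	fcntl(pfd[0], F_SETFL, fcntl(pfd[0], F_GETFL) | O_ASYNC);
	write(pfd[1], "x", 1);			/* kill_fasync() fires */
	for (i = 0; i < 100 && !got_sigio; i++)
		usleep(10000);			/* wait up to ~1s */
	printf("got_sigio=%d\n", got_sigio);
	return 0;
}
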
657 658
658 659
659 static int 660 static int
660 pipe_read_release(struct inode *inode, struct file *filp) 661 pipe_read_release(struct inode *inode, struct file *filp)
661 { 662 {
662 pipe_read_fasync(-1, filp, 0); 663 pipe_read_fasync(-1, filp, 0);
663 return pipe_release(inode, 1, 0); 664 return pipe_release(inode, 1, 0);
664 } 665 }
665 666
666 static int 667 static int
667 pipe_write_release(struct inode *inode, struct file *filp) 668 pipe_write_release(struct inode *inode, struct file *filp)
668 { 669 {
669 pipe_write_fasync(-1, filp, 0); 670 pipe_write_fasync(-1, filp, 0);
670 return pipe_release(inode, 0, 1); 671 return pipe_release(inode, 0, 1);
671 } 672 }
672 673
673 static int 674 static int
674 pipe_rdwr_release(struct inode *inode, struct file *filp) 675 pipe_rdwr_release(struct inode *inode, struct file *filp)
675 { 676 {
676 int decr, decw; 677 int decr, decw;
677 678
678 pipe_rdwr_fasync(-1, filp, 0); 679 pipe_rdwr_fasync(-1, filp, 0);
679 decr = (filp->f_mode & FMODE_READ) != 0; 680 decr = (filp->f_mode & FMODE_READ) != 0;
680 decw = (filp->f_mode & FMODE_WRITE) != 0; 681 decw = (filp->f_mode & FMODE_WRITE) != 0;
681 return pipe_release(inode, decr, decw); 682 return pipe_release(inode, decr, decw);
682 } 683 }
683 684
684 static int 685 static int
685 pipe_read_open(struct inode *inode, struct file *filp) 686 pipe_read_open(struct inode *inode, struct file *filp)
686 { 687 {
687 /* We could have perhaps used atomic_t, but this and friends 688 /* We could have perhaps used atomic_t, but this and friends
688 below are the only places. So it doesn't seem worthwhile. */ 689 below are the only places. So it doesn't seem worthwhile. */
689 mutex_lock(&inode->i_mutex); 690 mutex_lock(&inode->i_mutex);
690 inode->i_pipe->readers++; 691 inode->i_pipe->readers++;
691 mutex_unlock(&inode->i_mutex); 692 mutex_unlock(&inode->i_mutex);
692 693
693 return 0; 694 return 0;
694 } 695 }
695 696
696 static int 697 static int
697 pipe_write_open(struct inode *inode, struct file *filp) 698 pipe_write_open(struct inode *inode, struct file *filp)
698 { 699 {
699 mutex_lock(&inode->i_mutex); 700 mutex_lock(&inode->i_mutex);
700 inode->i_pipe->writers++; 701 inode->i_pipe->writers++;
701 mutex_unlock(&inode->i_mutex); 702 mutex_unlock(&inode->i_mutex);
702 703
703 return 0; 704 return 0;
704 } 705 }
705 706
706 static int 707 static int
707 pipe_rdwr_open(struct inode *inode, struct file *filp) 708 pipe_rdwr_open(struct inode *inode, struct file *filp)
708 { 709 {
709 mutex_lock(&inode->i_mutex); 710 mutex_lock(&inode->i_mutex);
710 if (filp->f_mode & FMODE_READ) 711 if (filp->f_mode & FMODE_READ)
711 inode->i_pipe->readers++; 712 inode->i_pipe->readers++;
712 if (filp->f_mode & FMODE_WRITE) 713 if (filp->f_mode & FMODE_WRITE)
713 inode->i_pipe->writers++; 714 inode->i_pipe->writers++;
714 mutex_unlock(&inode->i_mutex); 715 mutex_unlock(&inode->i_mutex);
715 716
716 return 0; 717 return 0;
717 } 718 }
718 719
719 /* 720 /*
720 * The file_operations structs are not static because they 721 * The file_operations structs are not static because they
721 * are also used in linux/fs/fifo.c to do operations on FIFOs. 722 * are also used in linux/fs/fifo.c to do operations on FIFOs.
722 */ 723 */
723 const struct file_operations read_fifo_fops = { 724 const struct file_operations read_fifo_fops = {
724 .llseek = no_llseek, 725 .llseek = no_llseek,
725 .read = do_sync_read, 726 .read = do_sync_read,
726 .aio_read = pipe_read, 727 .aio_read = pipe_read,
727 .write = bad_pipe_w, 728 .write = bad_pipe_w,
728 .poll = pipe_poll, 729 .poll = pipe_poll,
729 .ioctl = pipe_ioctl, 730 .ioctl = pipe_ioctl,
730 .open = pipe_read_open, 731 .open = pipe_read_open,
731 .release = pipe_read_release, 732 .release = pipe_read_release,
732 .fasync = pipe_read_fasync, 733 .fasync = pipe_read_fasync,
733 }; 734 };
734 735
735 const struct file_operations write_fifo_fops = { 736 const struct file_operations write_fifo_fops = {
736 .llseek = no_llseek, 737 .llseek = no_llseek,
737 .read = bad_pipe_r, 738 .read = bad_pipe_r,
738 .write = do_sync_write, 739 .write = do_sync_write,
739 .aio_write = pipe_write, 740 .aio_write = pipe_write,
740 .poll = pipe_poll, 741 .poll = pipe_poll,
741 .ioctl = pipe_ioctl, 742 .ioctl = pipe_ioctl,
742 .open = pipe_write_open, 743 .open = pipe_write_open,
743 .release = pipe_write_release, 744 .release = pipe_write_release,
744 .fasync = pipe_write_fasync, 745 .fasync = pipe_write_fasync,
745 }; 746 };
746 747
747 const struct file_operations rdwr_fifo_fops = { 748 const struct file_operations rdwr_fifo_fops = {
748 .llseek = no_llseek, 749 .llseek = no_llseek,
749 .read = do_sync_read, 750 .read = do_sync_read,
750 .aio_read = pipe_read, 751 .aio_read = pipe_read,
751 .write = do_sync_write, 752 .write = do_sync_write,
752 .aio_write = pipe_write, 753 .aio_write = pipe_write,
753 .poll = pipe_poll, 754 .poll = pipe_poll,
754 .ioctl = pipe_ioctl, 755 .ioctl = pipe_ioctl,
755 .open = pipe_rdwr_open, 756 .open = pipe_rdwr_open,
756 .release = pipe_rdwr_release, 757 .release = pipe_rdwr_release,
757 .fasync = pipe_rdwr_fasync, 758 .fasync = pipe_rdwr_fasync,
758 }; 759 };
759 760
760 static const struct file_operations read_pipe_fops = { 761 static const struct file_operations read_pipe_fops = {
761 .llseek = no_llseek, 762 .llseek = no_llseek,
762 .read = do_sync_read, 763 .read = do_sync_read,
763 .aio_read = pipe_read, 764 .aio_read = pipe_read,
764 .write = bad_pipe_w, 765 .write = bad_pipe_w,
765 .poll = pipe_poll, 766 .poll = pipe_poll,
766 .ioctl = pipe_ioctl, 767 .ioctl = pipe_ioctl,
767 .open = pipe_read_open, 768 .open = pipe_read_open,
768 .release = pipe_read_release, 769 .release = pipe_read_release,
769 .fasync = pipe_read_fasync, 770 .fasync = pipe_read_fasync,
770 }; 771 };
771 772
772 static const struct file_operations write_pipe_fops = { 773 static const struct file_operations write_pipe_fops = {
773 .llseek = no_llseek, 774 .llseek = no_llseek,
774 .read = bad_pipe_r, 775 .read = bad_pipe_r,
775 .write = do_sync_write, 776 .write = do_sync_write,
776 .aio_write = pipe_write, 777 .aio_write = pipe_write,
777 .poll = pipe_poll, 778 .poll = pipe_poll,
778 .ioctl = pipe_ioctl, 779 .ioctl = pipe_ioctl,
779 .open = pipe_write_open, 780 .open = pipe_write_open,
780 .release = pipe_write_release, 781 .release = pipe_write_release,
781 .fasync = pipe_write_fasync, 782 .fasync = pipe_write_fasync,
782 }; 783 };
783 784
784 static const struct file_operations rdwr_pipe_fops = { 785 static const struct file_operations rdwr_pipe_fops = {
785 .llseek = no_llseek, 786 .llseek = no_llseek,
786 .read = do_sync_read, 787 .read = do_sync_read,
787 .aio_read = pipe_read, 788 .aio_read = pipe_read,
788 .write = do_sync_write, 789 .write = do_sync_write,
789 .aio_write = pipe_write, 790 .aio_write = pipe_write,
790 .poll = pipe_poll, 791 .poll = pipe_poll,
791 .ioctl = pipe_ioctl, 792 .ioctl = pipe_ioctl,
792 .open = pipe_rdwr_open, 793 .open = pipe_rdwr_open,
793 .release = pipe_rdwr_release, 794 .release = pipe_rdwr_release,
794 .fasync = pipe_rdwr_fasync, 795 .fasync = pipe_rdwr_fasync,
795 }; 796 };
796 797
797 struct pipe_inode_info * alloc_pipe_info(struct inode *inode) 798 struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
798 { 799 {
799 struct pipe_inode_info *pipe; 800 struct pipe_inode_info *pipe;
800 801
801 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); 802 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
802 if (pipe) { 803 if (pipe) {
803 init_waitqueue_head(&pipe->wait); 804 init_waitqueue_head(&pipe->wait);
804 pipe->r_counter = pipe->w_counter = 1; 805 pipe->r_counter = pipe->w_counter = 1;
805 pipe->inode = inode; 806 pipe->inode = inode;
806 } 807 }
807 808
808 return pipe; 809 return pipe;
809 } 810 }
810 811
811 void __free_pipe_info(struct pipe_inode_info *pipe) 812 void __free_pipe_info(struct pipe_inode_info *pipe)
812 { 813 {
813 int i; 814 int i;
814 815
815 for (i = 0; i < PIPE_BUFFERS; i++) { 816 for (i = 0; i < PIPE_BUFFERS; i++) {
816 struct pipe_buffer *buf = pipe->bufs + i; 817 struct pipe_buffer *buf = pipe->bufs + i;
817 if (buf->ops) 818 if (buf->ops)
818 buf->ops->release(pipe, buf); 819 buf->ops->release(pipe, buf);
819 } 820 }
820 if (pipe->tmp_page) 821 if (pipe->tmp_page)
821 __free_page(pipe->tmp_page); 822 __free_page(pipe->tmp_page);
822 kfree(pipe); 823 kfree(pipe);
823 } 824 }
824 825
825 void free_pipe_info(struct inode *inode) 826 void free_pipe_info(struct inode *inode)
826 { 827 {
827 __free_pipe_info(inode->i_pipe); 828 __free_pipe_info(inode->i_pipe);
828 inode->i_pipe = NULL; 829 inode->i_pipe = NULL;
829 } 830 }
830 831
831 static struct vfsmount *pipe_mnt __read_mostly; 832 static struct vfsmount *pipe_mnt __read_mostly;
832 static int pipefs_delete_dentry(struct dentry *dentry) 833 static int pipefs_delete_dentry(struct dentry *dentry)
833 { 834 {
834 /* 835 /*
835 * At creation time, we pretended this dentry was hashed 836 * At creation time, we pretended this dentry was hashed
836 * (by clearing DCACHE_UNHASHED bit in d_flags) 837 * (by clearing DCACHE_UNHASHED bit in d_flags)
837 * At delete time, we restore the truth : not hashed. 838 * At delete time, we restore the truth : not hashed.
838 * (so that dput() can proceed correctly) 839 * (so that dput() can proceed correctly)
839 */ 840 */
840 dentry->d_flags |= DCACHE_UNHASHED; 841 dentry->d_flags |= DCACHE_UNHASHED;
841 return 0; 842 return 0;
842 } 843 }
843 844
844 /* 845 /*
845 * pipefs_dname() is called from d_path(). 846 * pipefs_dname() is called from d_path().
846 */ 847 */
847 static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) 848 static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
848 { 849 {
849 return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", 850 return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
850 dentry->d_inode->i_ino); 851 dentry->d_inode->i_ino);
851 } 852 }
852 853
853 static struct dentry_operations pipefs_dentry_operations = { 854 static struct dentry_operations pipefs_dentry_operations = {
854 .d_delete = pipefs_delete_dentry, 855 .d_delete = pipefs_delete_dentry,
855 .d_dname = pipefs_dname, 856 .d_dname = pipefs_dname,
856 }; 857 };
857 858
858 static struct inode * get_pipe_inode(void) 859 static struct inode * get_pipe_inode(void)
859 { 860 {
860 struct inode *inode = new_inode(pipe_mnt->mnt_sb); 861 struct inode *inode = new_inode(pipe_mnt->mnt_sb);
861 struct pipe_inode_info *pipe; 862 struct pipe_inode_info *pipe;
862 863
863 if (!inode) 864 if (!inode)
864 goto fail_inode; 865 goto fail_inode;
865 866
866 pipe = alloc_pipe_info(inode); 867 pipe = alloc_pipe_info(inode);
867 if (!pipe) 868 if (!pipe)
868 goto fail_iput; 869 goto fail_iput;
869 inode->i_pipe = pipe; 870 inode->i_pipe = pipe;
870 871
871 pipe->readers = pipe->writers = 1; 872 pipe->readers = pipe->writers = 1;
872 inode->i_fop = &rdwr_pipe_fops; 873 inode->i_fop = &rdwr_pipe_fops;
873 874
874 /* 875 /*
875 * Mark the inode dirty from the very beginning, 876 * Mark the inode dirty from the very beginning,
876 * that way it will never be moved to the dirty 877 * that way it will never be moved to the dirty
877 * list because "mark_inode_dirty()" will think 878 * list because "mark_inode_dirty()" will think
878 * that it already _is_ on the dirty list. 879 * that it already _is_ on the dirty list.
879 */ 880 */
880 inode->i_state = I_DIRTY; 881 inode->i_state = I_DIRTY;
881 inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; 882 inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
882 inode->i_uid = current->fsuid; 883 inode->i_uid = current->fsuid;
883 inode->i_gid = current->fsgid; 884 inode->i_gid = current->fsgid;
884 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 885 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
885 886
886 return inode; 887 return inode;
887 888
888 fail_iput: 889 fail_iput:
889 iput(inode); 890 iput(inode);
890 891
891 fail_inode: 892 fail_inode:
892 return NULL; 893 return NULL;
893 } 894 }
894 895
895 struct file *create_write_pipe(void) 896 struct file *create_write_pipe(void)
896 { 897 {
897 int err; 898 int err;
898 struct inode *inode; 899 struct inode *inode;
899 struct file *f; 900 struct file *f;
900 struct dentry *dentry; 901 struct dentry *dentry;
901 struct qstr name = { .name = "" }; 902 struct qstr name = { .name = "" };
902 903
903 f = get_empty_filp(); 904 f = get_empty_filp();
904 if (!f) 905 if (!f)
905 return ERR_PTR(-ENFILE); 906 return ERR_PTR(-ENFILE);
906 err = -ENFILE; 907 err = -ENFILE;
907 inode = get_pipe_inode(); 908 inode = get_pipe_inode();
908 if (!inode) 909 if (!inode)
909 goto err_file; 910 goto err_file;
910 911
911 err = -ENOMEM; 912 err = -ENOMEM;
912 dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name); 913 dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
913 if (!dentry) 914 if (!dentry)
914 goto err_inode; 915 goto err_inode;
915 916
916 dentry->d_op = &pipefs_dentry_operations; 917 dentry->d_op = &pipefs_dentry_operations;
917 /* 918 /*
918 * We dont want to publish this dentry into global dentry hash table. 919 * We dont want to publish this dentry into global dentry hash table.
919 * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED 920 * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
920 * This permits a working /proc/$pid/fd/XXX on pipes 921 * This permits a working /proc/$pid/fd/XXX on pipes
921 */ 922 */
922 dentry->d_flags &= ~DCACHE_UNHASHED; 923 dentry->d_flags &= ~DCACHE_UNHASHED;
923 d_instantiate(dentry, inode); 924 d_instantiate(dentry, inode);
924 f->f_path.mnt = mntget(pipe_mnt); 925 f->f_path.mnt = mntget(pipe_mnt);
925 f->f_path.dentry = dentry; 926 f->f_path.dentry = dentry;
926 f->f_mapping = inode->i_mapping; 927 f->f_mapping = inode->i_mapping;
927 928
928 f->f_flags = O_WRONLY; 929 f->f_flags = O_WRONLY;
929 f->f_op = &write_pipe_fops; 930 f->f_op = &write_pipe_fops;
930 f->f_mode = FMODE_WRITE; 931 f->f_mode = FMODE_WRITE;
931 f->f_version = 0; 932 f->f_version = 0;
932 933
933 return f; 934 return f;
934 935
935 err_inode: 936 err_inode:
936 free_pipe_info(inode); 937 free_pipe_info(inode);
937 iput(inode); 938 iput(inode);
938 err_file: 939 err_file:
939 put_filp(f); 940 put_filp(f);
940 return ERR_PTR(err); 941 return ERR_PTR(err);
941 } 942 }
942 943
943 void free_write_pipe(struct file *f) 944 void free_write_pipe(struct file *f)
944 { 945 {
945 free_pipe_info(f->f_dentry->d_inode); 946 free_pipe_info(f->f_dentry->d_inode);
946 dput(f->f_path.dentry); 947 dput(f->f_path.dentry);
947 mntput(f->f_path.mnt); 948 mntput(f->f_path.mnt);
948 put_filp(f); 949 put_filp(f);
949 } 950 }
950 951
951 struct file *create_read_pipe(struct file *wrf) 952 struct file *create_read_pipe(struct file *wrf)
952 { 953 {
953 struct file *f = get_empty_filp(); 954 struct file *f = get_empty_filp();
954 if (!f) 955 if (!f)
955 return ERR_PTR(-ENFILE); 956 return ERR_PTR(-ENFILE);
956 957
957 /* Grab pipe from the writer */ 958 /* Grab pipe from the writer */
958 f->f_path.mnt = mntget(wrf->f_path.mnt); 959 f->f_path.mnt = mntget(wrf->f_path.mnt);
959 f->f_path.dentry = dget(wrf->f_path.dentry); 960 f->f_path.dentry = dget(wrf->f_path.dentry);
960 f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping; 961 f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping;
961 962
962 f->f_pos = 0; 963 f->f_pos = 0;
963 f->f_flags = O_RDONLY; 964 f->f_flags = O_RDONLY;
964 f->f_op = &read_pipe_fops; 965 f->f_op = &read_pipe_fops;
965 f->f_mode = FMODE_READ; 966 f->f_mode = FMODE_READ;
966 f->f_version = 0; 967 f->f_version = 0;
967 968
968 return f; 969 return f;
969 } 970 }
970 971
971 int do_pipe(int *fd) 972 int do_pipe(int *fd)
972 { 973 {
973 struct file *fw, *fr; 974 struct file *fw, *fr;
974 int error; 975 int error;
975 int fdw, fdr; 976 int fdw, fdr;
976 977
977 fw = create_write_pipe(); 978 fw = create_write_pipe();
978 if (IS_ERR(fw)) 979 if (IS_ERR(fw))
979 return PTR_ERR(fw); 980 return PTR_ERR(fw);
980 fr = create_read_pipe(fw); 981 fr = create_read_pipe(fw);
981 error = PTR_ERR(fr); 982 error = PTR_ERR(fr);
982 if (IS_ERR(fr)) 983 if (IS_ERR(fr))
983 goto err_write_pipe; 984 goto err_write_pipe;
984 985
985 error = get_unused_fd(); 986 error = get_unused_fd();
986 if (error < 0) 987 if (error < 0)
987 goto err_read_pipe; 988 goto err_read_pipe;
988 fdr = error; 989 fdr = error;
989 990
990 error = get_unused_fd(); 991 error = get_unused_fd();
991 if (error < 0) 992 if (error < 0)
992 goto err_fdr; 993 goto err_fdr;
993 fdw = error; 994 fdw = error;
994 995
995 error = audit_fd_pair(fdr, fdw); 996 error = audit_fd_pair(fdr, fdw);
996 if (error < 0) 997 if (error < 0)
997 goto err_fdw; 998 goto err_fdw;
998 999
999 fd_install(fdr, fr); 1000 fd_install(fdr, fr);
1000 fd_install(fdw, fw); 1001 fd_install(fdw, fw);
1001 fd[0] = fdr; 1002 fd[0] = fdr;
1002 fd[1] = fdw; 1003 fd[1] = fdw;
1003 1004
1004 return 0; 1005 return 0;
1005 1006
1006 err_fdw: 1007 err_fdw:
1007 put_unused_fd(fdw); 1008 put_unused_fd(fdw);
1008 err_fdr: 1009 err_fdr:
1009 put_unused_fd(fdr); 1010 put_unused_fd(fdr);
1010 err_read_pipe: 1011 err_read_pipe:
1011 dput(fr->f_dentry); 1012 dput(fr->f_dentry);
1012 mntput(fr->f_vfsmnt); 1013 mntput(fr->f_vfsmnt);
1013 put_filp(fr); 1014 put_filp(fr);
1014 err_write_pipe: 1015 err_write_pipe:
1015 free_write_pipe(fw); 1016 free_write_pipe(fw);
1016 return error; 1017 return error;
1017 } 1018 }
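
do_pipe() above is the kernel-side worker behind the pipe(2) system call: it creates the write end, clones the read end from it, reserves two descriptors, and only publishes them with fd_install() once nothing can fail. A minimal userspace sketch of the interface this serves (plain POSIX, nothing here is kernel code):

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd[2];              /* fd[0]: read end, fd[1]: write end */
            char buf[16];
            ssize_t n;

            if (pipe(fd) < 0) {     /* lands in do_pipe() in this kernel */
                    perror("pipe");
                    return 1;
            }
            write(fd[1], "hello", 5);
            n = read(fd[0], buf, sizeof(buf));
            printf("read %zd bytes: %.5s\n", n, buf);
            close(fd[0]);
            close(fd[1]);
            return 0;
    }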
1018 1019
1019 /* 1020 /*
1020 * pipefs should _never_ be mounted by userland - too much of a security hassle, 1021 * pipefs should _never_ be mounted by userland - too much of a security hassle,
1021 * no real gain from having the whole whorehouse mounted. So we don't need 1022 * no real gain from having the whole whorehouse mounted. So we don't need
1022 * any operations on the root directory. However, we need a non-trivial 1023 * any operations on the root directory. However, we need a non-trivial
1023 * d_name - pipe: will go nicely and kill the special-casing in procfs. 1024 * d_name - pipe: will go nicely and kill the special-casing in procfs.
1024 */ 1025 */
1025 static int pipefs_get_sb(struct file_system_type *fs_type, 1026 static int pipefs_get_sb(struct file_system_type *fs_type,
1026 int flags, const char *dev_name, void *data, 1027 int flags, const char *dev_name, void *data,
1027 struct vfsmount *mnt) 1028 struct vfsmount *mnt)
1028 { 1029 {
1029 return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt); 1030 return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
1030 } 1031 }
1031 1032
1032 static struct file_system_type pipe_fs_type = { 1033 static struct file_system_type pipe_fs_type = {
1033 .name = "pipefs", 1034 .name = "pipefs",
1034 .get_sb = pipefs_get_sb, 1035 .get_sb = pipefs_get_sb,
1035 .kill_sb = kill_anon_super, 1036 .kill_sb = kill_anon_super,
1036 }; 1037 };
1037 1038
1038 static int __init init_pipe_fs(void) 1039 static int __init init_pipe_fs(void)
1039 { 1040 {
1040 int err = register_filesystem(&pipe_fs_type); 1041 int err = register_filesystem(&pipe_fs_type);
1041 1042
1042 if (!err) { 1043 if (!err) {
1043 pipe_mnt = kern_mount(&pipe_fs_type); 1044 pipe_mnt = kern_mount(&pipe_fs_type);
1044 if (IS_ERR(pipe_mnt)) { 1045 if (IS_ERR(pipe_mnt)) {
1045 err = PTR_ERR(pipe_mnt); 1046 err = PTR_ERR(pipe_mnt);
1046 unregister_filesystem(&pipe_fs_type); 1047 unregister_filesystem(&pipe_fs_type);
1047 } 1048 }
1048 } 1049 }
1049 return err; 1050 return err;
1050 } 1051 }
1051 1052
1052 static void __exit exit_pipe_fs(void) 1053 static void __exit exit_pipe_fs(void)
1053 { 1054 {
1054 unregister_filesystem(&pipe_fs_type); 1055 unregister_filesystem(&pipe_fs_type);
1055 mntput(pipe_mnt); 1056 mntput(pipe_mnt);
1056 } 1057 }
1057 1058
1058 fs_initcall(init_pipe_fs); 1059 fs_initcall(init_pipe_fs);
1059 module_exit(exit_pipe_fs); 1060 module_exit(exit_pipe_fs);
1060 1061
1 /* 1 /*
2 * "splice": joining two ropes together by interweaving their strands. 2 * "splice": joining two ropes together by interweaving their strands.
3 * 3 *
4 * This is the "extended pipe" functionality, where a pipe is used as 4 * This is the "extended pipe" functionality, where a pipe is used as
5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel 5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6 * buffer that you can use to transfer data from one end to the other. 6 * buffer that you can use to transfer data from one end to the other.
7 * 7 *
8 * The traditional unix read/write is extended with a "splice()" operation 8 * The traditional unix read/write is extended with a "splice()" operation
9 * that transfers data buffers to or from a pipe buffer. 9 * that transfers data buffers to or from a pipe buffer.
10 * 10 *
11 * Named by Larry McVoy, original implementation from Linus, extended by 11 * Named by Larry McVoy, original implementation from Linus, extended by
12 * Jens to support splicing to files, network, direct splicing, etc and 12 * Jens to support splicing to files, network, direct splicing, etc and
13 * fixing lots of bugs. 13 * fixing lots of bugs.
14 * 14 *
15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> 15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> 16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> 17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
18 * 18 *
19 */ 19 */
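
As a concrete taste of the model described above, a hedged userspace sketch (assuming splice(2) is available and stdout is redirected to a splice-capable target such as a regular file or socket) that pumps a file through a pipe without copying the data through userspace:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
            int fd[2], in;
            ssize_t n;

            if (argc < 2 || (in = open(argv[1], O_RDONLY)) < 0 || pipe(fd) < 0)
                    return 1;

            /* file -> pipe, then pipe -> stdout; the pages ride the pipe */
            while ((n = splice(in, NULL, fd[1], NULL, 65536, 0)) > 0)
                    if (splice(fd[0], NULL, STDOUT_FILENO, NULL, n, 0) < 0)
                            return 1;
            return n < 0;
    }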
20 #include <linux/fs.h> 20 #include <linux/fs.h>
21 #include <linux/file.h> 21 #include <linux/file.h>
22 #include <linux/pagemap.h> 22 #include <linux/pagemap.h>
23 #include <linux/splice.h> 23 #include <linux/splice.h>
24 #include <linux/mm_inline.h> 24 #include <linux/mm_inline.h>
25 #include <linux/swap.h> 25 #include <linux/swap.h>
26 #include <linux/writeback.h> 26 #include <linux/writeback.h>
27 #include <linux/buffer_head.h> 27 #include <linux/buffer_head.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/syscalls.h> 29 #include <linux/syscalls.h>
30 #include <linux/uio.h> 30 #include <linux/uio.h>
31 31
32 /* 32 /*
33 * Attempt to steal a page from a pipe buffer. This should perhaps go into 33 * Attempt to steal a page from a pipe buffer. This should perhaps go into
34 * a vm helper function, it's already simplified quite a bit by the 34 * a vm helper function, it's already simplified quite a bit by the
35 * addition of remove_mapping(). If success is returned, the caller may 35 * addition of remove_mapping(). If success is returned, the caller may
36 * attempt to reuse this page for another destination. 36 * attempt to reuse this page for another destination.
37 */ 37 */
38 static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, 38 static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
39 struct pipe_buffer *buf) 39 struct pipe_buffer *buf)
40 { 40 {
41 struct page *page = buf->page; 41 struct page *page = buf->page;
42 struct address_space *mapping; 42 struct address_space *mapping;
43 43
44 lock_page(page); 44 lock_page(page);
45 45
46 mapping = page_mapping(page); 46 mapping = page_mapping(page);
47 if (mapping) { 47 if (mapping) {
48 WARN_ON(!PageUptodate(page)); 48 WARN_ON(!PageUptodate(page));
49 49
50 /* 50 /*
51 * At least for ext2 with nobh option, we need to wait on 51 * At least for ext2 with nobh option, we need to wait on
52 * writeback completing on this page, since we'll remove it 52 * writeback completing on this page, since we'll remove it
53 * from the pagecache. Otherwise truncate won't wait on the 53 * from the pagecache. Otherwise truncate won't wait on the
54 * page, allowing the disk blocks to be reused by someone else 54 * page, allowing the disk blocks to be reused by someone else
55 * before we actually wrote our data to them. FS corruption 55 * before we actually wrote our data to them. FS corruption
56 * ensues. 56 * ensues.
57 */ 57 */
58 wait_on_page_writeback(page); 58 wait_on_page_writeback(page);
59 59
60 if (PagePrivate(page)) 60 if (PagePrivate(page))
61 try_to_release_page(page, GFP_KERNEL); 61 try_to_release_page(page, GFP_KERNEL);
62 62
63 /* 63 /*
64 * If we succeeded in removing the mapping, set LRU flag 64 * If we succeeded in removing the mapping, set LRU flag
65 * and return good. 65 * and return good.
66 */ 66 */
67 if (remove_mapping(mapping, page)) { 67 if (remove_mapping(mapping, page)) {
68 buf->flags |= PIPE_BUF_FLAG_LRU; 68 buf->flags |= PIPE_BUF_FLAG_LRU;
69 return 0; 69 return 0;
70 } 70 }
71 } 71 }
72 72
73 /* 73 /*
74 * Raced with truncate or failed to remove page from current 74 * Raced with truncate or failed to remove page from current
75 * address space, unlock and return failure. 75 * address space, unlock and return failure.
76 */ 76 */
77 unlock_page(page); 77 unlock_page(page);
78 return 1; 78 return 1;
79 } 79 }
80 80
81 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, 81 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
82 struct pipe_buffer *buf) 82 struct pipe_buffer *buf)
83 { 83 {
84 page_cache_release(buf->page); 84 page_cache_release(buf->page);
85 buf->flags &= ~PIPE_BUF_FLAG_LRU; 85 buf->flags &= ~PIPE_BUF_FLAG_LRU;
86 } 86 }
87 87
88 static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe, 88 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
89 struct pipe_buffer *buf) 89 struct pipe_buffer *buf)
90 { 90 {
91 struct page *page = buf->page; 91 struct page *page = buf->page;
92 int err; 92 int err;
93 93
94 if (!PageUptodate(page)) { 94 if (!PageUptodate(page)) {
95 lock_page(page); 95 lock_page(page);
96 96
97 /* 97 /*
98 * Page got truncated/unhashed. This will cause a 0-byte 98 * Page got truncated/unhashed. This will cause a 0-byte
99 * splice if this is the first page. 99 * splice if this is the first page.
100 */ 100 */
101 if (!page->mapping) { 101 if (!page->mapping) {
102 err = -ENODATA; 102 err = -ENODATA;
103 goto error; 103 goto error;
104 } 104 }
105 105
106 /* 106 /*
107 * Uh oh, read-error from disk. 107 * Uh oh, read-error from disk.
108 */ 108 */
109 if (!PageUptodate(page)) { 109 if (!PageUptodate(page)) {
110 err = -EIO; 110 err = -EIO;
111 goto error; 111 goto error;
112 } 112 }
113 113
114 /* 114 /*
115 * Page is OK after all, we are done. 115 * Page is OK after all, we are done.
116 */ 116 */
117 unlock_page(page); 117 unlock_page(page);
118 } 118 }
119 119
120 return 0; 120 return 0;
121 error: 121 error:
122 unlock_page(page); 122 unlock_page(page);
123 return err; 123 return err;
124 } 124 }
125 125
126 static const struct pipe_buf_operations page_cache_pipe_buf_ops = { 126 static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
127 .can_merge = 0, 127 .can_merge = 0,
128 .map = generic_pipe_buf_map, 128 .map = generic_pipe_buf_map,
129 .unmap = generic_pipe_buf_unmap, 129 .unmap = generic_pipe_buf_unmap,
130 .pin = page_cache_pipe_buf_pin, 130 .confirm = page_cache_pipe_buf_confirm,
131 .release = page_cache_pipe_buf_release, 131 .release = page_cache_pipe_buf_release,
132 .steal = page_cache_pipe_buf_steal, 132 .steal = page_cache_pipe_buf_steal,
133 .get = generic_pipe_buf_get, 133 .get = generic_pipe_buf_get,
134 }; 134 };
135 135
136 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, 136 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
137 struct pipe_buffer *buf) 137 struct pipe_buffer *buf)
138 { 138 {
139 if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) 139 if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
140 return 1; 140 return 1;
141 141
142 buf->flags |= PIPE_BUF_FLAG_LRU; 142 buf->flags |= PIPE_BUF_FLAG_LRU;
143 return generic_pipe_buf_steal(pipe, buf); 143 return generic_pipe_buf_steal(pipe, buf);
144 } 144 }
145 145
146 static const struct pipe_buf_operations user_page_pipe_buf_ops = { 146 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
147 .can_merge = 0, 147 .can_merge = 0,
148 .map = generic_pipe_buf_map, 148 .map = generic_pipe_buf_map,
149 .unmap = generic_pipe_buf_unmap, 149 .unmap = generic_pipe_buf_unmap,
150 .pin = generic_pipe_buf_pin, 150 .confirm = generic_pipe_buf_confirm,
151 .release = page_cache_pipe_buf_release, 151 .release = page_cache_pipe_buf_release,
152 .steal = user_page_pipe_buf_steal, 152 .steal = user_page_pipe_buf_steal,
153 .get = generic_pipe_buf_get, 153 .get = generic_pipe_buf_get,
154 }; 154 };
155 155
156 /** 156 /**
157 * splice_to_pipe - fill passed data into a pipe 157 * splice_to_pipe - fill passed data into a pipe
158 * @pipe: pipe to fill 158 * @pipe: pipe to fill
159 * @spd: data to fill 159 * @spd: data to fill
160 * 160 *
161 * Description: 161 * Description:
162 * @spd contains a map of pages and len/offset tuples, along with 162 * @spd contains a map of pages and len/offset tuples, along with
163 * the struct pipe_buf_operations associated with these pages. This 163 * the struct pipe_buf_operations associated with these pages. This
164 * function will link that data to the pipe. 164 * function will link that data to the pipe.
165 * 165 *
166 */ 166 */
167 ssize_t splice_to_pipe(struct pipe_inode_info *pipe, 167 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
168 struct splice_pipe_desc *spd) 168 struct splice_pipe_desc *spd)
169 { 169 {
170 unsigned int spd_pages = spd->nr_pages; 170 unsigned int spd_pages = spd->nr_pages;
171 int ret, do_wakeup, page_nr; 171 int ret, do_wakeup, page_nr;
172 172
173 ret = 0; 173 ret = 0;
174 do_wakeup = 0; 174 do_wakeup = 0;
175 page_nr = 0; 175 page_nr = 0;
176 176
177 if (pipe->inode) 177 if (pipe->inode)
178 mutex_lock(&pipe->inode->i_mutex); 178 mutex_lock(&pipe->inode->i_mutex);
179 179
180 for (;;) { 180 for (;;) {
181 if (!pipe->readers) { 181 if (!pipe->readers) {
182 send_sig(SIGPIPE, current, 0); 182 send_sig(SIGPIPE, current, 0);
183 if (!ret) 183 if (!ret)
184 ret = -EPIPE; 184 ret = -EPIPE;
185 break; 185 break;
186 } 186 }
187 187
188 if (pipe->nrbufs < PIPE_BUFFERS) { 188 if (pipe->nrbufs < PIPE_BUFFERS) {
189 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); 189 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
190 struct pipe_buffer *buf = pipe->bufs + newbuf; 190 struct pipe_buffer *buf = pipe->bufs + newbuf;
191 191
192 buf->page = spd->pages[page_nr]; 192 buf->page = spd->pages[page_nr];
193 buf->offset = spd->partial[page_nr].offset; 193 buf->offset = spd->partial[page_nr].offset;
194 buf->len = spd->partial[page_nr].len; 194 buf->len = spd->partial[page_nr].len;
195 buf->private = spd->partial[page_nr].private; 195 buf->private = spd->partial[page_nr].private;
196 buf->ops = spd->ops; 196 buf->ops = spd->ops;
197 if (spd->flags & SPLICE_F_GIFT) 197 if (spd->flags & SPLICE_F_GIFT)
198 buf->flags |= PIPE_BUF_FLAG_GIFT; 198 buf->flags |= PIPE_BUF_FLAG_GIFT;
199 199
200 pipe->nrbufs++; 200 pipe->nrbufs++;
201 page_nr++; 201 page_nr++;
202 ret += buf->len; 202 ret += buf->len;
203 203
204 if (pipe->inode) 204 if (pipe->inode)
205 do_wakeup = 1; 205 do_wakeup = 1;
206 206
207 if (!--spd->nr_pages) 207 if (!--spd->nr_pages)
208 break; 208 break;
209 if (pipe->nrbufs < PIPE_BUFFERS) 209 if (pipe->nrbufs < PIPE_BUFFERS)
210 continue; 210 continue;
211 211
212 break; 212 break;
213 } 213 }
214 214
215 if (spd->flags & SPLICE_F_NONBLOCK) { 215 if (spd->flags & SPLICE_F_NONBLOCK) {
216 if (!ret) 216 if (!ret)
217 ret = -EAGAIN; 217 ret = -EAGAIN;
218 break; 218 break;
219 } 219 }
220 220
221 if (signal_pending(current)) { 221 if (signal_pending(current)) {
222 if (!ret) 222 if (!ret)
223 ret = -ERESTARTSYS; 223 ret = -ERESTARTSYS;
224 break; 224 break;
225 } 225 }
226 226
227 if (do_wakeup) { 227 if (do_wakeup) {
228 smp_mb(); 228 smp_mb();
229 if (waitqueue_active(&pipe->wait)) 229 if (waitqueue_active(&pipe->wait))
230 wake_up_interruptible_sync(&pipe->wait); 230 wake_up_interruptible_sync(&pipe->wait);
231 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 231 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
232 do_wakeup = 0; 232 do_wakeup = 0;
233 } 233 }
234 234
235 pipe->waiting_writers++; 235 pipe->waiting_writers++;
236 pipe_wait(pipe); 236 pipe_wait(pipe);
237 pipe->waiting_writers--; 237 pipe->waiting_writers--;
238 } 238 }
239 239
240 if (pipe->inode) { 240 if (pipe->inode) {
241 mutex_unlock(&pipe->inode->i_mutex); 241 mutex_unlock(&pipe->inode->i_mutex);
242 242
243 if (do_wakeup) { 243 if (do_wakeup) {
244 smp_mb(); 244 smp_mb();
245 if (waitqueue_active(&pipe->wait)) 245 if (waitqueue_active(&pipe->wait))
246 wake_up_interruptible(&pipe->wait); 246 wake_up_interruptible(&pipe->wait);
247 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 247 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
248 } 248 }
249 } 249 }
250 250
251 while (page_nr < spd_pages) 251 while (page_nr < spd_pages)
252 page_cache_release(spd->pages[page_nr++]); 252 page_cache_release(spd->pages[page_nr++]);
253 253
254 return ret; 254 return ret;
255 } 255 }
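
A hedged sketch of how a producer feeds splice_to_pipe(): fill a splice_pipe_desc with the pages, their per-page offset/len pairs, and the buf_operations that should govern the resulting pipe buffers (splice_one_page is a hypothetical helper, not a real kernel function; __generic_file_splice_read() below is the real in-tree caller):

    static ssize_t splice_one_page(struct pipe_inode_info *pipe,
                                   struct page *page, unsigned int offset,
                                   unsigned int len, unsigned int flags)
    {
            struct page *pages[1] = { page };
            struct partial_page partial[1] = {
                    { .offset = offset, .len = len },
            };
            struct splice_pipe_desc spd = {
                    .pages          = pages,
                    .partial        = partial,
                    .nr_pages       = 1,
                    .flags          = flags,
                    .ops            = &page_cache_pipe_buf_ops,
            };

            return splice_to_pipe(pipe, &spd);
    }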
256 256
257 static int 257 static int
258 __generic_file_splice_read(struct file *in, loff_t *ppos, 258 __generic_file_splice_read(struct file *in, loff_t *ppos,
259 struct pipe_inode_info *pipe, size_t len, 259 struct pipe_inode_info *pipe, size_t len,
260 unsigned int flags) 260 unsigned int flags)
261 { 261 {
262 struct address_space *mapping = in->f_mapping; 262 struct address_space *mapping = in->f_mapping;
263 unsigned int loff, nr_pages; 263 unsigned int loff, nr_pages;
264 struct page *pages[PIPE_BUFFERS]; 264 struct page *pages[PIPE_BUFFERS];
265 struct partial_page partial[PIPE_BUFFERS]; 265 struct partial_page partial[PIPE_BUFFERS];
266 struct page *page; 266 struct page *page;
267 pgoff_t index, end_index; 267 pgoff_t index, end_index;
268 loff_t isize; 268 loff_t isize;
269 int error, page_nr; 269 int error, page_nr;
270 struct splice_pipe_desc spd = { 270 struct splice_pipe_desc spd = {
271 .pages = pages, 271 .pages = pages,
272 .partial = partial, 272 .partial = partial,
273 .flags = flags, 273 .flags = flags,
274 .ops = &page_cache_pipe_buf_ops, 274 .ops = &page_cache_pipe_buf_ops,
275 }; 275 };
276 276
277 index = *ppos >> PAGE_CACHE_SHIFT; 277 index = *ppos >> PAGE_CACHE_SHIFT;
278 loff = *ppos & ~PAGE_CACHE_MASK; 278 loff = *ppos & ~PAGE_CACHE_MASK;
279 nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 279 nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
280 280
281 if (nr_pages > PIPE_BUFFERS) 281 if (nr_pages > PIPE_BUFFERS)
282 nr_pages = PIPE_BUFFERS; 282 nr_pages = PIPE_BUFFERS;
283 283
284 /* 284 /*
285 * Don't try to second-guess the read-ahead logic; call into 285 * Don't try to second-guess the read-ahead logic; call into
286 * page_cache_readahead() like the page cache reads would do. 286 * page_cache_readahead() like the page cache reads would do.
287 */ 287 */
288 page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages); 288 page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);
289 289
290 /* 290 /*
291 * Lookup the (hopefully) full range of pages we need. 291 * Lookup the (hopefully) full range of pages we need.
292 */ 292 */
293 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); 293 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
294 294
295 /* 295 /*
296 * If find_get_pages_contig() returned fewer pages than we needed, 296 * If find_get_pages_contig() returned fewer pages than we needed,
297 * allocate the rest and fill in the holes. 297 * allocate the rest and fill in the holes.
298 */ 298 */
299 error = 0; 299 error = 0;
300 index += spd.nr_pages; 300 index += spd.nr_pages;
301 while (spd.nr_pages < nr_pages) { 301 while (spd.nr_pages < nr_pages) {
302 /* 302 /*
303 * Page could be there, find_get_pages_contig() breaks on 303 * Page could be there, find_get_pages_contig() breaks on
304 * the first hole. 304 * the first hole.
305 */ 305 */
306 page = find_get_page(mapping, index); 306 page = find_get_page(mapping, index);
307 if (!page) { 307 if (!page) {
308 /* 308 /*
309 * Make sure the read-ahead engine is notified 309 * Make sure the read-ahead engine is notified
310 * about this failure. 310 * about this failure.
311 */ 311 */
312 handle_ra_miss(mapping, &in->f_ra, index); 312 handle_ra_miss(mapping, &in->f_ra, index);
313 313
314 /* 314 /*
315 * page didn't exist, allocate one. 315 * page didn't exist, allocate one.
316 */ 316 */
317 page = page_cache_alloc_cold(mapping); 317 page = page_cache_alloc_cold(mapping);
318 if (!page) 318 if (!page)
319 break; 319 break;
320 320
321 error = add_to_page_cache_lru(page, mapping, index, 321 error = add_to_page_cache_lru(page, mapping, index,
322 GFP_KERNEL); 322 GFP_KERNEL);
323 if (unlikely(error)) { 323 if (unlikely(error)) {
324 page_cache_release(page); 324 page_cache_release(page);
325 if (error == -EEXIST) 325 if (error == -EEXIST)
326 continue; 326 continue;
327 break; 327 break;
328 } 328 }
329 /* 329 /*
330 * add_to_page_cache() locks the page, unlock it 330 * add_to_page_cache() locks the page, unlock it
331 * to avoid convoluting the logic below even more. 331 * to avoid convoluting the logic below even more.
332 */ 332 */
333 unlock_page(page); 333 unlock_page(page);
334 } 334 }
335 335
336 pages[spd.nr_pages++] = page; 336 pages[spd.nr_pages++] = page;
337 index++; 337 index++;
338 } 338 }
339 339
340 /* 340 /*
341 * Now loop over the map and see if we need to start IO on any 341 * Now loop over the map and see if we need to start IO on any
342 * pages, fill in the partial map, etc. 342 * pages, fill in the partial map, etc.
343 */ 343 */
344 index = *ppos >> PAGE_CACHE_SHIFT; 344 index = *ppos >> PAGE_CACHE_SHIFT;
345 nr_pages = spd.nr_pages; 345 nr_pages = spd.nr_pages;
346 spd.nr_pages = 0; 346 spd.nr_pages = 0;
347 for (page_nr = 0; page_nr < nr_pages; page_nr++) { 347 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
348 unsigned int this_len; 348 unsigned int this_len;
349 349
350 if (!len) 350 if (!len)
351 break; 351 break;
352 352
353 /* 353 /*
354 * this_len is the max we'll use from this page 354 * this_len is the max we'll use from this page
355 */ 355 */
356 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); 356 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
357 page = pages[page_nr]; 357 page = pages[page_nr];
358 358
359 /* 359 /*
360 * If the page isn't uptodate, we may need to start IO on it 360 * If the page isn't uptodate, we may need to start IO on it
361 */ 361 */
362 if (!PageUptodate(page)) { 362 if (!PageUptodate(page)) {
363 /* 363 /*
364 * If in nonblock mode then don't block on waiting 364 * If in nonblock mode then don't block on waiting
365 * for an in-flight IO page 365 * for an in-flight IO page
366 */ 366 */
367 if (flags & SPLICE_F_NONBLOCK) { 367 if (flags & SPLICE_F_NONBLOCK) {
368 if (TestSetPageLocked(page)) 368 if (TestSetPageLocked(page))
369 break; 369 break;
370 } else 370 } else
371 lock_page(page); 371 lock_page(page);
372 372
373 /* 373 /*
374 * page was truncated, stop here. If this isn't the 374 * page was truncated, stop here. If this isn't the
375 * first page, we'll just complete what we already 375 * first page, we'll just complete what we already
376 * added. 376 * added.
377 */ 377 */
378 if (!page->mapping) { 378 if (!page->mapping) {
379 unlock_page(page); 379 unlock_page(page);
380 break; 380 break;
381 } 381 }
382 /* 382 /*
383 * page was already under IO and is now done, great 383 * page was already under IO and is now done, great
384 */ 384 */
385 if (PageUptodate(page)) { 385 if (PageUptodate(page)) {
386 unlock_page(page); 386 unlock_page(page);
387 goto fill_it; 387 goto fill_it;
388 } 388 }
389 389
390 /* 390 /*
391 * need to read in the page 391 * need to read in the page
392 */ 392 */
393 error = mapping->a_ops->readpage(in, page); 393 error = mapping->a_ops->readpage(in, page);
394 if (unlikely(error)) { 394 if (unlikely(error)) {
395 /* 395 /*
396 * We really should re-lookup the page here, 396 * We really should re-lookup the page here,
397 * but it complicates things a lot. Instead 397 * but it complicates things a lot. Instead
398 * let's just do what we already stored, and 398 * let's just do what we already stored, and
399 * we'll get it the next time we are called. 399 * we'll get it the next time we are called.
400 */ 400 */
401 if (error == AOP_TRUNCATED_PAGE) 401 if (error == AOP_TRUNCATED_PAGE)
402 error = 0; 402 error = 0;
403 403
404 break; 404 break;
405 } 405 }
406 } 406 }
407 fill_it: 407 fill_it:
408 /* 408 /*
409 * i_size must be checked after PageUptodate. 409 * i_size must be checked after PageUptodate.
410 */ 410 */
411 isize = i_size_read(mapping->host); 411 isize = i_size_read(mapping->host);
412 end_index = (isize - 1) >> PAGE_CACHE_SHIFT; 412 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
413 if (unlikely(!isize || index > end_index)) 413 if (unlikely(!isize || index > end_index))
414 break; 414 break;
415 415
416 /* 416 /*
417 * if this is the last page, see if we need to shrink 417 * if this is the last page, see if we need to shrink
418 * the length and stop 418 * the length and stop
419 */ 419 */
420 if (end_index == index) { 420 if (end_index == index) {
421 unsigned int plen; 421 unsigned int plen;
422 422
423 /* 423 /*
424 * max good bytes in this page 424 * max good bytes in this page
425 */ 425 */
426 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; 426 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
427 if (plen <= loff) 427 if (plen <= loff)
428 break; 428 break;
429 429
430 /* 430 /*
431 * force quit after adding this page 431 * force quit after adding this page
432 */ 432 */
433 this_len = min(this_len, plen - loff); 433 this_len = min(this_len, plen - loff);
434 len = this_len; 434 len = this_len;
435 } 435 }
436 436
437 partial[page_nr].offset = loff; 437 partial[page_nr].offset = loff;
438 partial[page_nr].len = this_len; 438 partial[page_nr].len = this_len;
439 len -= this_len; 439 len -= this_len;
440 loff = 0; 440 loff = 0;
441 spd.nr_pages++; 441 spd.nr_pages++;
442 index++; 442 index++;
443 } 443 }
444 444
445 /* 445 /*
446 * Release any pages at the end, if we quit early. 'page_nr' is how far 446 * Release any pages at the end, if we quit early. 'page_nr' is how far
447 * we got, 'nr_pages' is how many pages are in the map. 447 * we got, 'nr_pages' is how many pages are in the map.
448 */ 448 */
449 while (page_nr < nr_pages) 449 while (page_nr < nr_pages)
450 page_cache_release(pages[page_nr++]); 450 page_cache_release(pages[page_nr++]);
451 451
452 if (spd.nr_pages) 452 if (spd.nr_pages)
453 return splice_to_pipe(pipe, &spd); 453 return splice_to_pipe(pipe, &spd);
454 454
455 return error; 455 return error;
456 } 456 }
457 457
458 /** 458 /**
459 * generic_file_splice_read - splice data from file to a pipe 459 * generic_file_splice_read - splice data from file to a pipe
460 * @in: file to splice from 460 * @in: file to splice from
461 * @ppos: position in @in 461 * @ppos: position in @in
462 * @pipe: pipe to splice to 462 * @pipe: pipe to splice to
463 * @len: number of bytes to splice 463 * @len: number of bytes to splice
464 * @flags: splice modifier flags 464 * @flags: splice modifier flags
465 * 465 *
466 * Description: 466 * Description:
467 * Will read pages from the given file and fill them into a pipe. Can be 467 * Will read pages from the given file and fill them into a pipe. Can be
468 * used as long as the address_space operations for the source implement 468 * used as long as the address_space operations for the source implement
469 * a readpage() hook. 469 * a readpage() hook.
470 * 470 *
471 */ 471 */
472 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, 472 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
473 struct pipe_inode_info *pipe, size_t len, 473 struct pipe_inode_info *pipe, size_t len,
474 unsigned int flags) 474 unsigned int flags)
475 { 475 {
476 ssize_t spliced; 476 ssize_t spliced;
477 int ret; 477 int ret;
478 loff_t isize, left; 478 loff_t isize, left;
479 479
480 isize = i_size_read(in->f_mapping->host); 480 isize = i_size_read(in->f_mapping->host);
481 if (unlikely(*ppos >= isize)) 481 if (unlikely(*ppos >= isize))
482 return 0; 482 return 0;
483 483
484 left = isize - *ppos; 484 left = isize - *ppos;
485 if (unlikely(left < len)) 485 if (unlikely(left < len))
486 len = left; 486 len = left;
487 487
488 ret = 0; 488 ret = 0;
489 spliced = 0; 489 spliced = 0;
490 while (len) { 490 while (len) {
491 ret = __generic_file_splice_read(in, ppos, pipe, len, flags); 491 ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
492 492
493 if (ret < 0) 493 if (ret < 0)
494 break; 494 break;
495 else if (!ret) { 495 else if (!ret) {
496 if (spliced) 496 if (spliced)
497 break; 497 break;
498 if (flags & SPLICE_F_NONBLOCK) { 498 if (flags & SPLICE_F_NONBLOCK) {
499 ret = -EAGAIN; 499 ret = -EAGAIN;
500 break; 500 break;
501 } 501 }
502 } 502 }
503 503
504 *ppos += ret; 504 *ppos += ret;
505 len -= ret; 505 len -= ret;
506 spliced += ret; 506 spliced += ret;
507 } 507 }
508 508
509 if (spliced) 509 if (spliced)
510 return spliced; 510 return spliced;
511 511
512 return ret; 512 return ret;
513 } 513 }
514 514
515 EXPORT_SYMBOL(generic_file_splice_read); 515 EXPORT_SYMBOL(generic_file_splice_read);
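
Filesystems opt into this helper by pointing their file_operations at it. A hedged sketch of the wiring (example_file_operations is illustrative only; the splice_write hook is defined further down in this file):

    static const struct file_operations example_file_operations = {
            .read           = do_sync_read,
            .write          = do_sync_write,
            .splice_read    = generic_file_splice_read,
            .splice_write   = generic_file_splice_write,
    };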
516 516
517 /* 517 /*
518 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' 518 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
519 * using sendpage(). Return the number of bytes sent. 519 * using sendpage(). Return the number of bytes sent.
520 */ 520 */
521 static int pipe_to_sendpage(struct pipe_inode_info *pipe, 521 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
522 struct pipe_buffer *buf, struct splice_desc *sd) 522 struct pipe_buffer *buf, struct splice_desc *sd)
523 { 523 {
524 struct file *file = sd->u.file; 524 struct file *file = sd->u.file;
525 loff_t pos = sd->pos; 525 loff_t pos = sd->pos;
526 int ret, more; 526 int ret, more;
527 527
528 ret = buf->ops->pin(pipe, buf); 528 ret = buf->ops->confirm(pipe, buf);
529 if (!ret) { 529 if (!ret) {
530 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; 530 more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
531 531
532 ret = file->f_op->sendpage(file, buf->page, buf->offset, 532 ret = file->f_op->sendpage(file, buf->page, buf->offset,
533 sd->len, &pos, more); 533 sd->len, &pos, more);
534 } 534 }
535 535
536 return ret; 536 return ret;
537 } 537 }
538 538
539 /* 539 /*
540 * This is a little more tricky than the file -> pipe splicing. There are 540 * This is a little more tricky than the file -> pipe splicing. There are
541 * basically three cases: 541 * basically three cases:
542 * 542 *
543 * - Destination page already exists in the address space and there 543 * - Destination page already exists in the address space and there
544 * are users of it. For that case we have no other option than 544 * are users of it. For that case we have no other option than
545 * copying the data. Tough luck. 545 * copying the data. Tough luck.
546 * - Destination page already exists in the address space, but there 546 * - Destination page already exists in the address space, but there
547 * are no users of it. Make sure it's uptodate, then drop it. Fall 547 * are no users of it. Make sure it's uptodate, then drop it. Fall
548 * through to last case. 548 * through to last case.
549 * - Destination page does not exist, we can add the pipe page to 549 * - Destination page does not exist, we can add the pipe page to
550 * the page cache and avoid the copy. 550 * the page cache and avoid the copy.
551 * 551 *
552 * If asked to move pages to the output file (SPLICE_F_MOVE is set in 552 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
553 * sd->flags), we attempt to migrate pages from the pipe to the output 553 * sd->flags), we attempt to migrate pages from the pipe to the output
554 * file address space page cache. This is possible if no one else has 554 * file address space page cache. This is possible if no one else has
555 * the pipe page referenced outside of the pipe and page cache. If 555 * the pipe page referenced outside of the pipe and page cache. If
556 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create 556 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
557 * a new page in the output file page cache and fill/dirty that. 557 * a new page in the output file page cache and fill/dirty that.
558 */ 558 */
559 static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 559 static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
560 struct splice_desc *sd) 560 struct splice_desc *sd)
561 { 561 {
562 struct file *file = sd->u.file; 562 struct file *file = sd->u.file;
563 struct address_space *mapping = file->f_mapping; 563 struct address_space *mapping = file->f_mapping;
564 unsigned int offset, this_len; 564 unsigned int offset, this_len;
565 struct page *page; 565 struct page *page;
566 pgoff_t index; 566 pgoff_t index;
567 int ret; 567 int ret;
568 568
569 /* 569 /*
570 * make sure the data in this buffer is uptodate 570 * make sure the data in this buffer is uptodate
571 */ 571 */
572 ret = buf->ops->pin(pipe, buf); 572 ret = buf->ops->confirm(pipe, buf);
573 if (unlikely(ret)) 573 if (unlikely(ret))
574 return ret; 574 return ret;
575 575
576 index = sd->pos >> PAGE_CACHE_SHIFT; 576 index = sd->pos >> PAGE_CACHE_SHIFT;
577 offset = sd->pos & ~PAGE_CACHE_MASK; 577 offset = sd->pos & ~PAGE_CACHE_MASK;
578 578
579 this_len = sd->len; 579 this_len = sd->len;
580 if (this_len + offset > PAGE_CACHE_SIZE) 580 if (this_len + offset > PAGE_CACHE_SIZE)
581 this_len = PAGE_CACHE_SIZE - offset; 581 this_len = PAGE_CACHE_SIZE - offset;
582 582
583 find_page: 583 find_page:
584 page = find_lock_page(mapping, index); 584 page = find_lock_page(mapping, index);
585 if (!page) { 585 if (!page) {
586 ret = -ENOMEM; 586 ret = -ENOMEM;
587 page = page_cache_alloc_cold(mapping); 587 page = page_cache_alloc_cold(mapping);
588 if (unlikely(!page)) 588 if (unlikely(!page))
589 goto out_ret; 589 goto out_ret;
590 590
591 /* 591 /*
592 * This will also lock the page 592 * This will also lock the page
593 */ 593 */
594 ret = add_to_page_cache_lru(page, mapping, index, 594 ret = add_to_page_cache_lru(page, mapping, index,
595 GFP_KERNEL); 595 GFP_KERNEL);
596 if (unlikely(ret)) 596 if (unlikely(ret))
597 goto out; 597 goto out;
598 } 598 }
599 599
600 ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); 600 ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
601 if (unlikely(ret)) { 601 if (unlikely(ret)) {
602 loff_t isize = i_size_read(mapping->host); 602 loff_t isize = i_size_read(mapping->host);
603 603
604 if (ret != AOP_TRUNCATED_PAGE) 604 if (ret != AOP_TRUNCATED_PAGE)
605 unlock_page(page); 605 unlock_page(page);
606 page_cache_release(page); 606 page_cache_release(page);
607 if (ret == AOP_TRUNCATED_PAGE) 607 if (ret == AOP_TRUNCATED_PAGE)
608 goto find_page; 608 goto find_page;
609 609
610 /* 610 /*
611 * prepare_write() may have instantiated a few blocks 611 * prepare_write() may have instantiated a few blocks
612 * outside i_size. Trim these off again. 612 * outside i_size. Trim these off again.
613 */ 613 */
614 if (sd->pos + this_len > isize) 614 if (sd->pos + this_len > isize)
615 vmtruncate(mapping->host, isize); 615 vmtruncate(mapping->host, isize);
616 616
617 goto out_ret; 617 goto out_ret;
618 } 618 }
619 619
620 if (buf->page != page) { 620 if (buf->page != page) {
621 /* 621 /*
622 * Careful, ->map() uses KM_USER0! 622 * Careful, ->map() uses KM_USER0!
623 */ 623 */
624 char *src = buf->ops->map(pipe, buf, 1); 624 char *src = buf->ops->map(pipe, buf, 1);
625 char *dst = kmap_atomic(page, KM_USER1); 625 char *dst = kmap_atomic(page, KM_USER1);
626 626
627 memcpy(dst + offset, src + buf->offset, this_len); 627 memcpy(dst + offset, src + buf->offset, this_len);
628 flush_dcache_page(page); 628 flush_dcache_page(page);
629 kunmap_atomic(dst, KM_USER1); 629 kunmap_atomic(dst, KM_USER1);
630 buf->ops->unmap(pipe, buf, src); 630 buf->ops->unmap(pipe, buf, src);
631 } 631 }
632 632
633 ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); 633 ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
634 if (ret) { 634 if (ret) {
635 if (ret == AOP_TRUNCATED_PAGE) { 635 if (ret == AOP_TRUNCATED_PAGE) {
636 page_cache_release(page); 636 page_cache_release(page);
637 goto find_page; 637 goto find_page;
638 } 638 }
639 if (ret < 0) 639 if (ret < 0)
640 goto out; 640 goto out;
641 /* 641 /*
642 * A partial write has happened, so 'ret' is already initialized to the 642 * A partial write has happened, so 'ret' is already initialized to the
643 * number of bytes written; there is nothing we have to do here. 643 * number of bytes written; there is nothing we have to do here.
644 */ 644 */
645 } else 645 } else
646 ret = this_len; 646 ret = this_len;
647 /* 647 /*
648 * Return the number of bytes written and mark page as 648 * Return the number of bytes written and mark page as
649 * accessed, we are now done! 649 * accessed, we are now done!
650 */ 650 */
651 mark_page_accessed(page); 651 mark_page_accessed(page);
652 out: 652 out:
653 page_cache_release(page); 653 page_cache_release(page);
654 unlock_page(page); 654 unlock_page(page);
655 out_ret: 655 out_ret:
656 return ret; 656 return ret;
657 } 657 }
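
The move case that pipe_to_file() tries to hit is what SPLICE_F_MOVE (paired with SPLICE_F_GIFT on the producer side) is for. A hedged userspace sketch that can trigger it, assuming len is a multiple of the page size; whether pages are actually moved still depends on the reference counts described above:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/uio.h>
    #include <unistd.h>

    int gift_to_file(int out_fd, size_t len)
    {
            int fd[2];
            void *mem;
            struct iovec iov;

            if (pipe(fd) < 0 || posix_memalign(&mem, 4096, len))
                    return -1;
            memset(mem, 'x', len);

            iov.iov_base = mem;
            iov.iov_len = len;

            /* Gift the page-aligned buffer to the pipe... */
            if (vmsplice(fd[1], &iov, 1, SPLICE_F_GIFT) < 0)
                    return -1;
            /* ...then ask the kernel to move, not copy, it to the file. */
            return splice(fd[0], NULL, out_fd, NULL, len, SPLICE_F_MOVE) < 0 ? -1 : 0;
    }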
658 658
659 /** 659 /**
660 * __splice_from_pipe - splice data from a pipe to given actor 660 * __splice_from_pipe - splice data from a pipe to given actor
661 * @pipe: pipe to splice from 661 * @pipe: pipe to splice from
662 * @sd: information to @actor 662 * @sd: information to @actor
663 * @actor: handler that splices the data 663 * @actor: handler that splices the data
664 * 664 *
665 * Description: 665 * Description:
666 * This function does little more than loop over the pipe and call 666 * This function does little more than loop over the pipe and call
667 * @actor to do the actual moving of a single struct pipe_buffer to 667 * @actor to do the actual moving of a single struct pipe_buffer to
668 * the desired destination. See pipe_to_file, pipe_to_sendpage, or 668 * the desired destination. See pipe_to_file, pipe_to_sendpage, or
669 * pipe_to_user. 669 * pipe_to_user.
670 * 670 *
671 */ 671 */
672 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, 672 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
673 splice_actor *actor) 673 splice_actor *actor)
674 { 674 {
675 int ret, do_wakeup, err; 675 int ret, do_wakeup, err;
676 676
677 ret = 0; 677 ret = 0;
678 do_wakeup = 0; 678 do_wakeup = 0;
679 679
680 for (;;) { 680 for (;;) {
681 if (pipe->nrbufs) { 681 if (pipe->nrbufs) {
682 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; 682 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
683 const struct pipe_buf_operations *ops = buf->ops; 683 const struct pipe_buf_operations *ops = buf->ops;
684 684
685 sd->len = buf->len; 685 sd->len = buf->len;
686 if (sd->len > sd->total_len) 686 if (sd->len > sd->total_len)
687 sd->len = sd->total_len; 687 sd->len = sd->total_len;
688 688
689 err = actor(pipe, buf, sd); 689 err = actor(pipe, buf, sd);
690 if (err <= 0) { 690 if (err <= 0) {
691 if (!ret && err != -ENODATA) 691 if (!ret && err != -ENODATA)
692 ret = err; 692 ret = err;
693 693
694 break; 694 break;
695 } 695 }
696 696
697 ret += err; 697 ret += err;
698 buf->offset += err; 698 buf->offset += err;
699 buf->len -= err; 699 buf->len -= err;
700 700
701 sd->len -= err; 701 sd->len -= err;
702 sd->pos += err; 702 sd->pos += err;
703 sd->total_len -= err; 703 sd->total_len -= err;
704 if (sd->len) 704 if (sd->len)
705 continue; 705 continue;
706 706
707 if (!buf->len) { 707 if (!buf->len) {
708 buf->ops = NULL; 708 buf->ops = NULL;
709 ops->release(pipe, buf); 709 ops->release(pipe, buf);
710 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); 710 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
711 pipe->nrbufs--; 711 pipe->nrbufs--;
712 if (pipe->inode) 712 if (pipe->inode)
713 do_wakeup = 1; 713 do_wakeup = 1;
714 } 714 }
715 715
716 if (!sd->total_len) 716 if (!sd->total_len)
717 break; 717 break;
718 } 718 }
719 719
720 if (pipe->nrbufs) 720 if (pipe->nrbufs)
721 continue; 721 continue;
722 if (!pipe->writers) 722 if (!pipe->writers)
723 break; 723 break;
724 if (!pipe->waiting_writers) { 724 if (!pipe->waiting_writers) {
725 if (ret) 725 if (ret)
726 break; 726 break;
727 } 727 }
728 728
729 if (sd->flags & SPLICE_F_NONBLOCK) { 729 if (sd->flags & SPLICE_F_NONBLOCK) {
730 if (!ret) 730 if (!ret)
731 ret = -EAGAIN; 731 ret = -EAGAIN;
732 break; 732 break;
733 } 733 }
734 734
735 if (signal_pending(current)) { 735 if (signal_pending(current)) {
736 if (!ret) 736 if (!ret)
737 ret = -ERESTARTSYS; 737 ret = -ERESTARTSYS;
738 break; 738 break;
739 } 739 }
740 740
741 if (do_wakeup) { 741 if (do_wakeup) {
742 smp_mb(); 742 smp_mb();
743 if (waitqueue_active(&pipe->wait)) 743 if (waitqueue_active(&pipe->wait))
744 wake_up_interruptible_sync(&pipe->wait); 744 wake_up_interruptible_sync(&pipe->wait);
745 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 745 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
746 do_wakeup = 0; 746 do_wakeup = 0;
747 } 747 }
748 748
749 pipe_wait(pipe); 749 pipe_wait(pipe);
750 } 750 }
751 751
752 if (do_wakeup) { 752 if (do_wakeup) {
753 smp_mb(); 753 smp_mb();
754 if (waitqueue_active(&pipe->wait)) 754 if (waitqueue_active(&pipe->wait))
755 wake_up_interruptible(&pipe->wait); 755 wake_up_interruptible(&pipe->wait);
756 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 756 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
757 } 757 }
758 758
759 return ret; 759 return ret;
760 } 760 }
761 EXPORT_SYMBOL(__splice_from_pipe); 761 EXPORT_SYMBOL(__splice_from_pipe);
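
Every actor follows the same contract as pipe_to_file() and pipe_to_sendpage() above: confirm the buffer before touching its page, then report bytes consumed or an error. A hedged sketch of a custom actor (my_consume_page is a stand-in for whatever the destination does with the data):

    static int pipe_to_example(struct pipe_inode_info *pipe,
                               struct pipe_buffer *buf, struct splice_desc *sd)
    {
            int ret;

            /* Is the page really there, with good contents? */
            ret = buf->ops->confirm(pipe, buf);
            if (unlikely(ret))
                    return ret;

            ret = my_consume_page(buf->page, buf->offset, sd->len);
            return ret ? ret : sd->len;
    }

It would then be driven with __splice_from_pipe(pipe, &sd, pipe_to_example).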
762 762
763 /** 763 /**
764 * splice_from_pipe - splice data from a pipe to a file 764 * splice_from_pipe - splice data from a pipe to a file
765 * @pipe: pipe to splice from 765 * @pipe: pipe to splice from
766 * @out: file to splice to 766 * @out: file to splice to
767 * @ppos: position in @out 767 * @ppos: position in @out
768 * @len: how many bytes to splice 768 * @len: how many bytes to splice
769 * @flags: splice modifier flags 769 * @flags: splice modifier flags
770 * @actor: handler that splices the data 770 * @actor: handler that splices the data
771 * 771 *
772 * Description: 772 * Description:
773 * See __splice_from_pipe. This function locks the input and output inodes, 773 * See __splice_from_pipe. This function locks the input and output inodes,
774 * otherwise it's identical to __splice_from_pipe(). 774 * otherwise it's identical to __splice_from_pipe().
775 * 775 *
776 */ 776 */
777 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, 777 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
778 loff_t *ppos, size_t len, unsigned int flags, 778 loff_t *ppos, size_t len, unsigned int flags,
779 splice_actor *actor) 779 splice_actor *actor)
780 { 780 {
781 ssize_t ret; 781 ssize_t ret;
782 struct inode *inode = out->f_mapping->host; 782 struct inode *inode = out->f_mapping->host;
783 struct splice_desc sd = { 783 struct splice_desc sd = {
784 .total_len = len, 784 .total_len = len,
785 .flags = flags, 785 .flags = flags,
786 .pos = *ppos, 786 .pos = *ppos,
787 .u.file = out, 787 .u.file = out,
788 }; 788 };
789 789
790 /* 790 /*
791 * The actor worker might be calling ->prepare_write and 791 * The actor worker might be calling ->prepare_write and
792 * ->commit_write. Most of the time, these expect i_mutex to 792 * ->commit_write. Most of the time, these expect i_mutex to
793 * be held. Since this may result in an ABBA deadlock with 793 * be held. Since this may result in an ABBA deadlock with
794 * pipe->inode, we have to order lock acquisition here. 794 * pipe->inode, we have to order lock acquisition here.
795 */ 795 */
796 inode_double_lock(inode, pipe->inode); 796 inode_double_lock(inode, pipe->inode);
797 ret = __splice_from_pipe(pipe, &sd, actor); 797 ret = __splice_from_pipe(pipe, &sd, actor);
798 inode_double_unlock(inode, pipe->inode); 798 inode_double_unlock(inode, pipe->inode);
799 799
800 return ret; 800 return ret;
801 } 801 }
802 802
803 /** 803 /**
804 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes 804 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
805 * @pipe: pipe info 805 * @pipe: pipe info
806 * @out: file to write to 806 * @out: file to write to
807 * @ppos: position in @out 807 * @ppos: position in @out
808 * @len: number of bytes to splice 808 * @len: number of bytes to splice
809 * @flags: splice modifier flags 809 * @flags: splice modifier flags
810 * 810 *
811 * Description: 811 * Description:
812 * Will either move or copy pages (determined by @flags options) from 812 * Will either move or copy pages (determined by @flags options) from
813 * the given pipe inode to the given file. The caller is responsible 813 * the given pipe inode to the given file. The caller is responsible
814 * for acquiring i_mutex on both inodes. 814 * for acquiring i_mutex on both inodes.
815 * 815 *
816 */ 816 */
817 ssize_t 817 ssize_t
818 generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, 818 generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
819 loff_t *ppos, size_t len, unsigned int flags) 819 loff_t *ppos, size_t len, unsigned int flags)
820 { 820 {
821 struct address_space *mapping = out->f_mapping; 821 struct address_space *mapping = out->f_mapping;
822 struct inode *inode = mapping->host; 822 struct inode *inode = mapping->host;
823 struct splice_desc sd = { 823 struct splice_desc sd = {
824 .total_len = len, 824 .total_len = len,
825 .flags = flags, 825 .flags = flags,
826 .pos = *ppos, 826 .pos = *ppos,
827 .u.file = out, 827 .u.file = out,
828 }; 828 };
829 ssize_t ret; 829 ssize_t ret;
830 int err; 830 int err;
831 831
832 err = remove_suid(out->f_path.dentry); 832 err = remove_suid(out->f_path.dentry);
833 if (unlikely(err)) 833 if (unlikely(err))
834 return err; 834 return err;
835 835
836 ret = __splice_from_pipe(pipe, &sd, pipe_to_file); 836 ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
837 if (ret > 0) { 837 if (ret > 0) {
838 unsigned long nr_pages; 838 unsigned long nr_pages;
839 839
840 *ppos += ret; 840 *ppos += ret;
841 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 841 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
842 842
843 /* 843 /*
844 * If file or inode is SYNC and we actually wrote some data, 844 * If file or inode is SYNC and we actually wrote some data,
845 * sync it. 845 * sync it.
846 */ 846 */
847 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 847 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
848 err = generic_osync_inode(inode, mapping, 848 err = generic_osync_inode(inode, mapping,
849 OSYNC_METADATA|OSYNC_DATA); 849 OSYNC_METADATA|OSYNC_DATA);
850 850
851 if (err) 851 if (err)
852 ret = err; 852 ret = err;
853 } 853 }
854 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 854 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
855 } 855 }
856 856
857 return ret; 857 return ret;
858 } 858 }
859 859
860 EXPORT_SYMBOL(generic_file_splice_write_nolock); 860 EXPORT_SYMBOL(generic_file_splice_write_nolock);
861 861
862 /** 862 /**
863 * generic_file_splice_write - splice data from a pipe to a file 863 * generic_file_splice_write - splice data from a pipe to a file
864 * @pipe: pipe info 864 * @pipe: pipe info
865 * @out: file to write to 865 * @out: file to write to
866 * @ppos: position in @out 866 * @ppos: position in @out
867 * @len: number of bytes to splice 867 * @len: number of bytes to splice
868 * @flags: splice modifier flags 868 * @flags: splice modifier flags
869 * 869 *
870 * Description: 870 * Description:
871 * Will either move or copy pages (determined by @flags options) from 871 * Will either move or copy pages (determined by @flags options) from
872 * the given pipe inode to the given file. 872 * the given pipe inode to the given file.
873 * 873 *
874 */ 874 */
875 ssize_t 875 ssize_t
876 generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, 876 generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
877 loff_t *ppos, size_t len, unsigned int flags) 877 loff_t *ppos, size_t len, unsigned int flags)
878 { 878 {
879 struct address_space *mapping = out->f_mapping; 879 struct address_space *mapping = out->f_mapping;
880 struct inode *inode = mapping->host; 880 struct inode *inode = mapping->host;
881 ssize_t ret; 881 ssize_t ret;
882 int err; 882 int err;
883 883
884 err = should_remove_suid(out->f_path.dentry); 884 err = should_remove_suid(out->f_path.dentry);
885 if (unlikely(err)) { 885 if (unlikely(err)) {
886 mutex_lock(&inode->i_mutex); 886 mutex_lock(&inode->i_mutex);
887 err = __remove_suid(out->f_path.dentry, err); 887 err = __remove_suid(out->f_path.dentry, err);
888 mutex_unlock(&inode->i_mutex); 888 mutex_unlock(&inode->i_mutex);
889 if (err) 889 if (err)
890 return err; 890 return err;
891 } 891 }
892 892
893 ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); 893 ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
894 if (ret > 0) { 894 if (ret > 0) {
895 unsigned long nr_pages; 895 unsigned long nr_pages;
896 896
897 *ppos += ret; 897 *ppos += ret;
898 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 898 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
899 899
900 /* 900 /*
901 * If file or inode is SYNC and we actually wrote some data, 901 * If file or inode is SYNC and we actually wrote some data,
902 * sync it. 902 * sync it.
903 */ 903 */
904 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 904 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
905 mutex_lock(&inode->i_mutex); 905 mutex_lock(&inode->i_mutex);
906 err = generic_osync_inode(inode, mapping, 906 err = generic_osync_inode(inode, mapping,
907 OSYNC_METADATA|OSYNC_DATA); 907 OSYNC_METADATA|OSYNC_DATA);
908 mutex_unlock(&inode->i_mutex); 908 mutex_unlock(&inode->i_mutex);
909 909
910 if (err) 910 if (err)
911 ret = err; 911 ret = err;
912 } 912 }
913 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 913 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
914 } 914 }
915 915
916 return ret; 916 return ret;
917 } 917 }
918 918
919 EXPORT_SYMBOL(generic_file_splice_write); 919 EXPORT_SYMBOL(generic_file_splice_write);
920 920
921 /** 921 /**
922 * generic_splice_sendpage - splice data from a pipe to a socket 922 * generic_splice_sendpage - splice data from a pipe to a socket
923 * @pipe: pipe to splice from 923 * @pipe: pipe to splice from
924 * @out: socket to write to 924 * @out: socket to write to
925 * @ppos: position in @out 925 * @ppos: position in @out
926 * @len: number of bytes to splice 926 * @len: number of bytes to splice
927 * @flags: splice modifier flags 927 * @flags: splice modifier flags
928 * 928 *
929 * Description: 929 * Description:
930 * Will send @len bytes from the pipe to a network socket. No data copying 930 * Will send @len bytes from the pipe to a network socket. No data copying
931 * is involved. 931 * is involved.
932 * 932 *
933 */ 933 */
934 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, 934 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
935 loff_t *ppos, size_t len, unsigned int flags) 935 loff_t *ppos, size_t len, unsigned int flags)
936 { 936 {
937 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); 937 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
938 } 938 }
939 939
940 EXPORT_SYMBOL(generic_splice_sendpage); 940 EXPORT_SYMBOL(generic_splice_sendpage);
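
Socket files reach this helper through their file_operations. A hedged outline of the hookup (the real table lives in net/socket.c; example_socket_file_ops is illustrative):

    static const struct file_operations example_socket_file_ops = {
            /* read/write/poll and friends elided */
            .sendpage       = sock_sendpage,
            .splice_write   = generic_splice_sendpage,
    };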
941 941
942 /* 942 /*
943 * Attempt to initiate a splice from pipe to file. 943 * Attempt to initiate a splice from pipe to file.
944 */ 944 */
945 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, 945 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
946 loff_t *ppos, size_t len, unsigned int flags) 946 loff_t *ppos, size_t len, unsigned int flags)
947 { 947 {
948 int ret; 948 int ret;
949 949
950 if (unlikely(!out->f_op || !out->f_op->splice_write)) 950 if (unlikely(!out->f_op || !out->f_op->splice_write))
951 return -EINVAL; 951 return -EINVAL;
952 952
953 if (unlikely(!(out->f_mode & FMODE_WRITE))) 953 if (unlikely(!(out->f_mode & FMODE_WRITE)))
954 return -EBADF; 954 return -EBADF;
955 955
956 ret = rw_verify_area(WRITE, out, ppos, len); 956 ret = rw_verify_area(WRITE, out, ppos, len);
957 if (unlikely(ret < 0)) 957 if (unlikely(ret < 0))
958 return ret; 958 return ret;
959 959
960 return out->f_op->splice_write(pipe, out, ppos, len, flags); 960 return out->f_op->splice_write(pipe, out, ppos, len, flags);
961 } 961 }
962 962
963 /* 963 /*
964 * Attempt to initiate a splice from a file to a pipe. 964 * Attempt to initiate a splice from a file to a pipe.
965 */ 965 */
966 static long do_splice_to(struct file *in, loff_t *ppos, 966 static long do_splice_to(struct file *in, loff_t *ppos,
967 struct pipe_inode_info *pipe, size_t len, 967 struct pipe_inode_info *pipe, size_t len,
968 unsigned int flags) 968 unsigned int flags)
969 { 969 {
970 int ret; 970 int ret;
971 971
972 if (unlikely(!in->f_op || !in->f_op->splice_read)) 972 if (unlikely(!in->f_op || !in->f_op->splice_read))
973 return -EINVAL; 973 return -EINVAL;
974 974
975 if (unlikely(!(in->f_mode & FMODE_READ))) 975 if (unlikely(!(in->f_mode & FMODE_READ)))
976 return -EBADF; 976 return -EBADF;
977 977
978 ret = rw_verify_area(READ, in, ppos, len); 978 ret = rw_verify_area(READ, in, ppos, len);
979 if (unlikely(ret < 0)) 979 if (unlikely(ret < 0))
980 return ret; 980 return ret;
981 981
982 return in->f_op->splice_read(in, ppos, pipe, len, flags); 982 return in->f_op->splice_read(in, ppos, pipe, len, flags);
983 } 983 }
984 984
985 /** 985 /**
986 * splice_direct_to_actor - splices data directly between two non-pipes 986 * splice_direct_to_actor - splices data directly between two non-pipes
987 * @in: file to splice from 987 * @in: file to splice from
988 * @sd: actor information on where to splice to 988 * @sd: actor information on where to splice to
989 * @actor: handles the data splicing 989 * @actor: handles the data splicing
990 * 990 *
991 * Description: 991 * Description:
992 * This is a special case helper to splice directly between two 992 * This is a special case helper to splice directly between two
993 * points, without requiring an explicit pipe. Internally an allocated 993 * points, without requiring an explicit pipe. Internally an allocated
994 * pipe is cached in the process, and reused during the life time of 994 * pipe is cached in the process, and reused during the life time of
995 * that process. 995 * that process.
996 * 996 *
997 */ 997 */
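
Before the implementation, a hedged sketch of what an @actor for this helper can look like: it simply drains the internal pipe into the output file (example_direct_actor is illustrative; do_splice_from() is the static helper defined above):

    static int example_direct_actor(struct pipe_inode_info *pipe,
                                    struct splice_desc *sd)
    {
            struct file *file = sd->u.file;

            return do_splice_from(pipe, file, &sd->pos, sd->total_len,
                                  sd->flags);
    }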
998 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, 998 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
999 splice_direct_actor *actor) 999 splice_direct_actor *actor)
1000 { 1000 {
1001 struct pipe_inode_info *pipe; 1001 struct pipe_inode_info *pipe;
1002 long ret, bytes; 1002 long ret, bytes;
1003 umode_t i_mode; 1003 umode_t i_mode;
1004 size_t len; 1004 size_t len;
1005 int i, flags; 1005 int i, flags;
1006 1006
1007 /* 1007 /*
1008 * We require the input to be a regular file, as we don't want to 1008 * We require the input to be a regular file, as we don't want to
1009 * randomly drop data for e.g. socket -> socket splicing. Use the 1009 * randomly drop data for e.g. socket -> socket splicing. Use the
1010 * piped splicing for that! 1010 * piped splicing for that!
1011 */ 1011 */
1012 i_mode = in->f_path.dentry->d_inode->i_mode; 1012 i_mode = in->f_path.dentry->d_inode->i_mode;
1013 if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) 1013 if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
1014 return -EINVAL; 1014 return -EINVAL;
1015 1015
1016 /* 1016 /*
1017 * Neither in nor out is a pipe, set up an internal pipe attached to 1017 * Neither in nor out is a pipe, set up an internal pipe attached to
1018 * 'out' and transfer the wanted data from 'in' to 'out' through that 1018 * 'out' and transfer the wanted data from 'in' to 'out' through that
1019 */ 1019 */
1020 pipe = current->splice_pipe; 1020 pipe = current->splice_pipe;
1021 if (unlikely(!pipe)) { 1021 if (unlikely(!pipe)) {
1022 pipe = alloc_pipe_info(NULL); 1022 pipe = alloc_pipe_info(NULL);
1023 if (!pipe) 1023 if (!pipe)
1024 return -ENOMEM; 1024 return -ENOMEM;
1025 1025
1026 /* 1026 /*
1027 * We don't have an immediate reader, but we'll read the stuff 1027 * We don't have an immediate reader, but we'll read the stuff
1028 * out of the pipe right after the splice_to_pipe(). So set 1028 * out of the pipe right after the splice_to_pipe(). So set
1029 * PIPE_READERS appropriately. 1029 * PIPE_READERS appropriately.
1030 */ 1030 */
1031 pipe->readers = 1; 1031 pipe->readers = 1;
1032 1032
1033 current->splice_pipe = pipe; 1033 current->splice_pipe = pipe;
1034 } 1034 }
1035 1035
1036 /* 1036 /*
1037 * Do the splice. 1037 * Do the splice.
1038 */ 1038 */
1039 ret = 0; 1039 ret = 0;
1040 bytes = 0; 1040 bytes = 0;
1041 len = sd->total_len; 1041 len = sd->total_len;
1042 flags = sd->flags; 1042 flags = sd->flags;
1043 1043
1044 /* 1044 /*
1045 * Don't block on output, we have to drain the direct pipe. 1045 * Don't block on output, we have to drain the direct pipe.
1046 */ 1046 */
1047 sd->flags &= ~SPLICE_F_NONBLOCK; 1047 sd->flags &= ~SPLICE_F_NONBLOCK;
1048 1048
1049 while (len) { 1049 while (len) {
1050 size_t read_len, max_read_len; 1050 size_t read_len, max_read_len;
1051 1051
1052 /* 1052 /*
1053 * Do at most PIPE_BUFFERS pages worth of transfer: 1053 * Do at most PIPE_BUFFERS pages worth of transfer:
1054 */ 1054 */
1055 max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); 1055 max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));
1056 1056
1057 ret = do_splice_to(in, &sd->pos, pipe, max_read_len, flags); 1057 ret = do_splice_to(in, &sd->pos, pipe, max_read_len, flags);
1058 if (unlikely(ret < 0)) 1058 if (unlikely(ret < 0))
1059 goto out_release; 1059 goto out_release;
1060 1060
1061 read_len = ret; 1061 read_len = ret;
1062 sd->total_len = read_len; 1062 sd->total_len = read_len;
1063 1063
1064 /* 1064 /*
1065 * NOTE: nonblocking mode only applies to the input. We 1065 * NOTE: nonblocking mode only applies to the input. We
1066 * must not do the output in nonblocking mode as then we 1066 * must not do the output in nonblocking mode as then we
1067 * could end up with data stuck in the internal pipe: 1067 * could end up with data stuck in the internal pipe:
1068 */ 1068 */
1069 ret = actor(pipe, sd); 1069 ret = actor(pipe, sd);
1070 if (unlikely(ret < 0)) 1070 if (unlikely(ret < 0))
1071 goto out_release; 1071 goto out_release;
1072 1072
1073 bytes += ret; 1073 bytes += ret;
1074 len -= ret; 1074 len -= ret;
1075 1075
1076 /* 1076 /*
1077 * In nonblocking mode, if we got back a short read then 1077 * In nonblocking mode, if we got back a short read then
1078 * that was due to either an IO error or to the 1078 * that was due to either an IO error or to the
1079 * pagecache entry not being there. In the IO error case 1079 * pagecache entry not being there. In the IO error case
1080 * the _next_ splice attempt will produce a clean IO error 1080 * the _next_ splice attempt will produce a clean IO error
1081 * return value (not a short read), so in both cases it's 1081 * return value (not a short read), so in both cases it's
1082 * correct to break out of the loop here: 1082 * correct to break out of the loop here:
1083 */ 1083 */
1084 if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len)) 1084 if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
1085 break; 1085 break;
1086 } 1086 }
1087 1087
1088 pipe->nrbufs = pipe->curbuf = 0; 1088 pipe->nrbufs = pipe->curbuf = 0;
1089 1089
1090 return bytes; 1090 return bytes;
1091 1091
1092 out_release: 1092 out_release:
1093 /* 1093 /*
1094 * If we did an incomplete transfer we must release 1094 * If we did an incomplete transfer we must release
1095 * the pipe buffers in question: 1095 * the pipe buffers in question:
1096 */ 1096 */
1097 for (i = 0; i < PIPE_BUFFERS; i++) { 1097 for (i = 0; i < PIPE_BUFFERS; i++) {
1098 struct pipe_buffer *buf = pipe->bufs + i; 1098 struct pipe_buffer *buf = pipe->bufs + i;
1099 1099
1100 if (buf->ops) { 1100 if (buf->ops) {
1101 buf->ops->release(pipe, buf); 1101 buf->ops->release(pipe, buf);
1102 buf->ops = NULL; 1102 buf->ops = NULL;
1103 } 1103 }
1104 } 1104 }
1105 pipe->nrbufs = pipe->curbuf = 0; 1105 pipe->nrbufs = pipe->curbuf = 0;
1106 1106
1107 /* 1107 /*
1108 * If we transferred some data, return the number of bytes: 1108 * If we transferred some data, return the number of bytes:
1109 */ 1109 */
1110 if (bytes > 0) 1110 if (bytes > 0)
1111 return bytes; 1111 return bytes;
1112 1112
1113 return ret; 1113 return ret;
1114 1114
1115 } 1115 }
1116 EXPORT_SYMBOL(splice_direct_to_actor); 1116 EXPORT_SYMBOL(splice_direct_to_actor);
1117 1117
1118 static int direct_splice_actor(struct pipe_inode_info *pipe, 1118 static int direct_splice_actor(struct pipe_inode_info *pipe,
1119 struct splice_desc *sd) 1119 struct splice_desc *sd)
1120 { 1120 {
1121 struct file *file = sd->u.file; 1121 struct file *file = sd->u.file;
1122 1122
1123 return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); 1123 return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
1124 } 1124 }
1125 1125
1126 /** 1126 /**
1127 * do_splice_direct - splices data directly between two files 1127 * do_splice_direct - splices data directly between two files
1128 * @in: file to splice from 1128 * @in: file to splice from
1129 * @ppos: input file offset 1129 * @ppos: input file offset
1130 * @out: file to splice to 1130 * @out: file to splice to
1131 * @len: number of bytes to splice 1131 * @len: number of bytes to splice
1132 * @flags: splice modifier flags 1132 * @flags: splice modifier flags
1133 * 1133 *
1134 * Description: 1134 * Description:
1135 * For use by do_sendfile(). splice can easily emulate sendfile, but 1135 * For use by do_sendfile(). splice can easily emulate sendfile, but
1136 * doing it in the application would incur an extra system call 1136 * doing it in the application would incur an extra system call
1137 * (splice in + splice out, as compared to just sendfile()). So this helper 1137 * (splice in + splice out, as compared to just sendfile()). So this helper
1138 * can splice directly through a process-private pipe. 1138 * can splice directly through a process-private pipe.
1139 * 1139 *
1140 */ 1140 */
1141 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 1141 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1142 size_t len, unsigned int flags) 1142 size_t len, unsigned int flags)
1143 { 1143 {
1144 struct splice_desc sd = { 1144 struct splice_desc sd = {
1145 .len = len, 1145 .len = len,
1146 .total_len = len, 1146 .total_len = len,
1147 .flags = flags, 1147 .flags = flags,
1148 .pos = *ppos, 1148 .pos = *ppos,
1149 .u.file = out, 1149 .u.file = out,
1150 }; 1150 };
1151 long ret; 1151 long ret;
1152 1152
1153 ret = splice_direct_to_actor(in, &sd, direct_splice_actor); 1153 ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1154 *ppos = sd.pos; 1154 *ppos = sd.pos;
1155 return ret; 1155 return ret;
1156 } 1156 }
1157 1157
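From userspace, the payoff of do_splice_direct() is that sendfile() moves file data in one syscall instead of a splice-in/splice-out pair. A minimal sketch, error handling trimmed:

/* Userspace sketch: copy argv[1] to argv[2] via sendfile(), which the
 * kernel services through do_splice_direct() and its cached pipe.
 */
#include <sys/sendfile.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int in = open(argv[1], O_RDONLY);
	int out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	struct stat st;
	off_t off = 0;

	if (in < 0 || out < 0 || fstat(in, &st) < 0)
		return 1;
	while (off < st.st_size)
		if (sendfile(out, in, &off, st.st_size - off) <= 0)
			return 1;
	return 0;
}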
1158 /* 1158 /*
1159 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same 1159 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1160 * location, so checking ->i_pipe is not enough to verify that this is a 1160 * location, so checking ->i_pipe is not enough to verify that this is a
1161 * pipe. 1161 * pipe.
1162 */ 1162 */
1163 static inline struct pipe_inode_info *pipe_info(struct inode *inode) 1163 static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1164 { 1164 {
1165 if (S_ISFIFO(inode->i_mode)) 1165 if (S_ISFIFO(inode->i_mode))
1166 return inode->i_pipe; 1166 return inode->i_pipe;
1167 1167
1168 return NULL; 1168 return NULL;
1169 } 1169 }
1170 1170
1171 /* 1171 /*
1172 * Determine where to splice to/from. 1172 * Determine where to splice to/from.
1173 */ 1173 */
1174 static long do_splice(struct file *in, loff_t __user *off_in, 1174 static long do_splice(struct file *in, loff_t __user *off_in,
1175 struct file *out, loff_t __user *off_out, 1175 struct file *out, loff_t __user *off_out,
1176 size_t len, unsigned int flags) 1176 size_t len, unsigned int flags)
1177 { 1177 {
1178 struct pipe_inode_info *pipe; 1178 struct pipe_inode_info *pipe;
1179 loff_t offset, *off; 1179 loff_t offset, *off;
1180 long ret; 1180 long ret;
1181 1181
1182 pipe = pipe_info(in->f_path.dentry->d_inode); 1182 pipe = pipe_info(in->f_path.dentry->d_inode);
1183 if (pipe) { 1183 if (pipe) {
1184 if (off_in) 1184 if (off_in)
1185 return -ESPIPE; 1185 return -ESPIPE;
1186 if (off_out) { 1186 if (off_out) {
1187 if (out->f_op->llseek == no_llseek) 1187 if (out->f_op->llseek == no_llseek)
1188 return -EINVAL; 1188 return -EINVAL;
1189 if (copy_from_user(&offset, off_out, sizeof(loff_t))) 1189 if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1190 return -EFAULT; 1190 return -EFAULT;
1191 off = &offset; 1191 off = &offset;
1192 } else 1192 } else
1193 off = &out->f_pos; 1193 off = &out->f_pos;
1194 1194
1195 ret = do_splice_from(pipe, out, off, len, flags); 1195 ret = do_splice_from(pipe, out, off, len, flags);
1196 1196
1197 if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) 1197 if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1198 ret = -EFAULT; 1198 ret = -EFAULT;
1199 1199
1200 return ret; 1200 return ret;
1201 } 1201 }
1202 1202
1203 pipe = pipe_info(out->f_path.dentry->d_inode); 1203 pipe = pipe_info(out->f_path.dentry->d_inode);
1204 if (pipe) { 1204 if (pipe) {
1205 if (off_out) 1205 if (off_out)
1206 return -ESPIPE; 1206 return -ESPIPE;
1207 if (off_in) { 1207 if (off_in) {
1208 if (in->f_op->llseek == no_llseek) 1208 if (in->f_op->llseek == no_llseek)
1209 return -EINVAL; 1209 return -EINVAL;
1210 if (copy_from_user(&offset, off_in, sizeof(loff_t))) 1210 if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1211 return -EFAULT; 1211 return -EFAULT;
1212 off = &offset; 1212 off = &offset;
1213 } else 1213 } else
1214 off = &in->f_pos; 1214 off = &in->f_pos;
1215 1215
1216 ret = do_splice_to(in, off, pipe, len, flags); 1216 ret = do_splice_to(in, off, pipe, len, flags);
1217 1217
1218 if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) 1218 if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1219 ret = -EFAULT; 1219 ret = -EFAULT;
1220 1220
1221 return ret; 1221 return ret;
1222 } 1222 }
1223 1223
1224 return -EINVAL; 1224 return -EINVAL;
1225 } 1225 }
1226 1226
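Note the asymmetry the code above enforces: an offset pointer is only accepted for the non-pipe side, otherwise the call fails with ESPIPE. A userspace sketch of the legal form:

/* Userspace sketch: splice from a file at an explicit offset into a
 * pipe. off_in addresses the file; off_out must be NULL since the
 * output side is a pipe.
 */
#define _GNU_SOURCE
#include <fcntl.h>

static ssize_t file_to_pipe(int file_fd, loff_t off, int pipe_wr, size_t len)
{
	return splice(file_fd, &off, pipe_wr, NULL, len, 0);
}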
1227 /* 1227 /*
1228 * Map an iov into an array of pages and offset/length tuples. With the 1228 * Map an iov into an array of pages and offset/length tuples. With the
1229 * partial_page structure, we can map several non-contiguous ranges into 1229 * partial_page structure, we can map several non-contiguous ranges into
1230 * our one pages[] map instead of splitting that operation into pieces. 1230 * our one pages[] map instead of splitting that operation into pieces.
1231 * Could easily be exported as a generic helper for other users, in which 1231 * Could easily be exported as a generic helper for other users, in which
1232 * case one would probably want to add a 'max_nr_pages' parameter as well. 1232 * case one would probably want to add a 'max_nr_pages' parameter as well.
1233 */ 1233 */
1234 static int get_iovec_page_array(const struct iovec __user *iov, 1234 static int get_iovec_page_array(const struct iovec __user *iov,
1235 unsigned int nr_vecs, struct page **pages, 1235 unsigned int nr_vecs, struct page **pages,
1236 struct partial_page *partial, int aligned) 1236 struct partial_page *partial, int aligned)
1237 { 1237 {
1238 int buffers = 0, error = 0; 1238 int buffers = 0, error = 0;
1239 1239
1240 /* 1240 /*
1241 * It's ok to take the mmap_sem for reading, even 1241 * It's ok to take the mmap_sem for reading, even
1242 * across a "get_user()". 1242 * across a "get_user()".
1243 */ 1243 */
1244 down_read(&current->mm->mmap_sem); 1244 down_read(&current->mm->mmap_sem);
1245 1245
1246 while (nr_vecs) { 1246 while (nr_vecs) {
1247 unsigned long off, npages; 1247 unsigned long off, npages;
1248 void __user *base; 1248 void __user *base;
1249 size_t len; 1249 size_t len;
1250 int i; 1250 int i;
1251 1251
1252 /* 1252 /*
1253 * Get user address base and length for this iovec. 1253 * Get user address base and length for this iovec.
1254 */ 1254 */
1255 error = get_user(base, &iov->iov_base); 1255 error = get_user(base, &iov->iov_base);
1256 if (unlikely(error)) 1256 if (unlikely(error))
1257 break; 1257 break;
1258 error = get_user(len, &iov->iov_len); 1258 error = get_user(len, &iov->iov_len);
1259 if (unlikely(error)) 1259 if (unlikely(error))
1260 break; 1260 break;
1261 1261
1262 /* 1262 /*
1263 * Sanity check this iovec. 0 read succeeds. 1263 * Sanity check this iovec. 0 read succeeds.
1264 */ 1264 */
1265 if (unlikely(!len)) 1265 if (unlikely(!len))
1266 break; 1266 break;
1267 error = -EFAULT; 1267 error = -EFAULT;
1268 if (unlikely(!base)) 1268 if (unlikely(!base))
1269 break; 1269 break;
1270 1270
1271 /* 1271 /*
1272 * Get this base offset and number of pages, then map 1272 * Get this base offset and number of pages, then map
1273 * in the user pages. 1273 * in the user pages.
1274 */ 1274 */
1275 off = (unsigned long) base & ~PAGE_MASK; 1275 off = (unsigned long) base & ~PAGE_MASK;
1276 1276
1277 /* 1277 /*
1278 * If asked for alignment, the offset must be zero and the 1278 * If asked for alignment, the offset must be zero and the
1279 * length a multiple of PAGE_SIZE. 1279 * length a multiple of PAGE_SIZE.
1280 */ 1280 */
1281 error = -EINVAL; 1281 error = -EINVAL;
1282 if (aligned && (off || len & ~PAGE_MASK)) 1282 if (aligned && (off || len & ~PAGE_MASK))
1283 break; 1283 break;
1284 1284
1285 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1285 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1286 if (npages > PIPE_BUFFERS - buffers) 1286 if (npages > PIPE_BUFFERS - buffers)
1287 npages = PIPE_BUFFERS - buffers; 1287 npages = PIPE_BUFFERS - buffers;
1288 1288
1289 error = get_user_pages(current, current->mm, 1289 error = get_user_pages(current, current->mm,
1290 (unsigned long) base, npages, 0, 0, 1290 (unsigned long) base, npages, 0, 0,
1291 &pages[buffers], NULL); 1291 &pages[buffers], NULL);
1292 1292
1293 if (unlikely(error <= 0)) 1293 if (unlikely(error <= 0))
1294 break; 1294 break;
1295 1295
1296 /* 1296 /*
1297 * Fill this contiguous range into the partial page map. 1297 * Fill this contiguous range into the partial page map.
1298 */ 1298 */
1299 for (i = 0; i < error; i++) { 1299 for (i = 0; i < error; i++) {
1300 const int plen = min_t(size_t, len, PAGE_SIZE - off); 1300 const int plen = min_t(size_t, len, PAGE_SIZE - off);
1301 1301
1302 partial[buffers].offset = off; 1302 partial[buffers].offset = off;
1303 partial[buffers].len = plen; 1303 partial[buffers].len = plen;
1304 1304
1305 off = 0; 1305 off = 0;
1306 len -= plen; 1306 len -= plen;
1307 buffers++; 1307 buffers++;
1308 } 1308 }
1309 1309
1310 /* 1310 /*
1311 * We didn't complete this iov, stop here since it probably 1311 * We didn't complete this iov, stop here since it probably
1312 * means we have to move some of this into a pipe to 1312 * means we have to move some of this into a pipe to
1313 * be able to continue. 1313 * be able to continue.
1314 */ 1314 */
1315 if (len) 1315 if (len)
1316 break; 1316 break;
1317 1317
1318 /* 1318 /*
1319 * Don't continue if we mapped fewer pages than we asked for, 1319 * Don't continue if we mapped fewer pages than we asked for,
1320 * or if we mapped the max number of pages that we have 1320 * or if we mapped the max number of pages that we have
1321 * room for. 1321 * room for.
1322 */ 1322 */
1323 if (error < npages || buffers == PIPE_BUFFERS) 1323 if (error < npages || buffers == PIPE_BUFFERS)
1324 break; 1324 break;
1325 1325
1326 nr_vecs--; 1326 nr_vecs--;
1327 iov++; 1327 iov++;
1328 } 1328 }
1329 1329
1330 up_read(&current->mm->mmap_sem); 1330 up_read(&current->mm->mmap_sem);
1331 1331
1332 if (buffers) 1332 if (buffers)
1333 return buffers; 1333 return buffers;
1334 1334
1335 return error; 1335 return error;
1336 } 1336 }
1337 1337
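The npages computation above rounds the byte range [off, off + len) up to whole pages. A worked standalone example, assuming 4 KiB pages: a 5000-byte iovec starting 256 bytes into a page needs (256 + 5000 + 4095) >> 12 = 2 pages.

/* Standalone illustration of the offset/page-count arithmetic used in
 * get_iovec_page_array(); constants mirror a 4 KiB page size.
 */
#include <stdio.h>

#define EX_PAGE_SHIFT	12
#define EX_PAGE_SIZE	(1UL << EX_PAGE_SHIFT)
#define EX_PAGE_MASK	(~(EX_PAGE_SIZE - 1))

int main(void)
{
	unsigned long base = 0x400100;	/* 256 bytes into a page */
	unsigned long len = 5000;
	unsigned long off = base & ~EX_PAGE_MASK;
	unsigned long npages = (off + len + EX_PAGE_SIZE - 1) >> EX_PAGE_SHIFT;

	printf("off=%lu npages=%lu\n", off, npages);	/* off=256 npages=2 */
	return 0;
}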
1338 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 1338 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1339 struct splice_desc *sd) 1339 struct splice_desc *sd)
1340 { 1340 {
1341 char *src; 1341 char *src;
1342 int ret; 1342 int ret;
1343 1343
1344 ret = buf->ops->pin(pipe, buf); 1344 ret = buf->ops->confirm(pipe, buf);
1345 if (unlikely(ret)) 1345 if (unlikely(ret))
1346 return ret; 1346 return ret;
1347 1347
1348 /* 1348 /*
1349 * See if we can use the atomic maps, by prefaulting in the 1349 * See if we can use the atomic maps, by prefaulting in the
1350 * pages and doing an atomic copy 1350 * pages and doing an atomic copy
1351 */ 1351 */
1352 if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { 1352 if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
1353 src = buf->ops->map(pipe, buf, 1); 1353 src = buf->ops->map(pipe, buf, 1);
1354 ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, 1354 ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
1355 sd->len); 1355 sd->len);
1356 buf->ops->unmap(pipe, buf, src); 1356 buf->ops->unmap(pipe, buf, src);
1357 if (!ret) { 1357 if (!ret) {
1358 ret = sd->len; 1358 ret = sd->len;
1359 goto out; 1359 goto out;
1360 } 1360 }
1361 } 1361 }
1362 1362
1363 /* 1363 /*
1364 * No dice, use slow non-atomic map and copy 1364 * No dice, use slow non-atomic map and copy
1365 */ 1365 */
1366 src = buf->ops->map(pipe, buf, 0); 1366 src = buf->ops->map(pipe, buf, 0);
1367 1367
1368 ret = sd->len; 1368 ret = sd->len;
1369 if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) 1369 if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
1370 ret = -EFAULT; 1370 ret = -EFAULT;
1371 1371
1372 out: 1372 out:
1373 if (ret > 0) 1373 if (ret > 0)
1374 sd->u.userptr += ret; 1374 sd->u.userptr += ret;
1375 buf->ops->unmap(pipe, buf, src); 1375 buf->ops->unmap(pipe, buf, src);
1376 return ret; 1376 return ret;
1377 } 1377 }
1378 1378
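pipe_to_user() shows the intended nesting of the renamed hook: ->confirm() before any ->map(), and every ->map() paired with an ->unmap(). As a stripped-down distillation (hypothetical helper, slow path only):

/* Hypothetical distillation of the ordering above: confirm that the
 * buffer holds good data, then map, copy and unmap.
 */
#include <linux/pipe_fs_i.h>
#include <asm/uaccess.h>

static int copy_one_buf(struct pipe_inode_info *pipe,
			struct pipe_buffer *buf, void __user *to)
{
	char *src;
	int err;

	err = buf->ops->confirm(pipe, buf);
	if (err)
		return err;

	src = buf->ops->map(pipe, buf, 0);	/* non-atomic map */
	err = copy_to_user(to, src + buf->offset, buf->len) ? -EFAULT : 0;
	buf->ops->unmap(pipe, buf, src);
	return err;
}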
1379 /* 1379 /*
1380 * For lack of a better implementation, implement vmsplice() to userspace 1380 * For lack of a better implementation, implement vmsplice() to userspace
1381 * as a simple copy of the pipe's pages to the user iov. 1381 * as a simple copy of the pipe's pages to the user iov.
1382 */ 1382 */
1383 static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, 1383 static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1384 unsigned long nr_segs, unsigned int flags) 1384 unsigned long nr_segs, unsigned int flags)
1385 { 1385 {
1386 struct pipe_inode_info *pipe; 1386 struct pipe_inode_info *pipe;
1387 struct splice_desc sd; 1387 struct splice_desc sd;
1388 ssize_t size; 1388 ssize_t size;
1389 int error; 1389 int error;
1390 long ret; 1390 long ret;
1391 1391
1392 pipe = pipe_info(file->f_path.dentry->d_inode); 1392 pipe = pipe_info(file->f_path.dentry->d_inode);
1393 if (!pipe) 1393 if (!pipe)
1394 return -EBADF; 1394 return -EBADF;
1395 1395
1396 if (pipe->inode) 1396 if (pipe->inode)
1397 mutex_lock(&pipe->inode->i_mutex); 1397 mutex_lock(&pipe->inode->i_mutex);
1398 1398
1399 error = ret = 0; 1399 error = ret = 0;
1400 while (nr_segs) { 1400 while (nr_segs) {
1401 void __user *base; 1401 void __user *base;
1402 size_t len; 1402 size_t len;
1403 1403
1404 /* 1404 /*
1405 * Get user address base and length for this iovec. 1405 * Get user address base and length for this iovec.
1406 */ 1406 */
1407 error = get_user(base, &iov->iov_base); 1407 error = get_user(base, &iov->iov_base);
1408 if (unlikely(error)) 1408 if (unlikely(error))
1409 break; 1409 break;
1410 error = get_user(len, &iov->iov_len); 1410 error = get_user(len, &iov->iov_len);
1411 if (unlikely(error)) 1411 if (unlikely(error))
1412 break; 1412 break;
1413 1413
1414 /* 1414 /*
1415 * Sanity check this iovec. 0 read succeeds. 1415 * Sanity check this iovec. 0 read succeeds.
1416 */ 1416 */
1417 if (unlikely(!len)) 1417 if (unlikely(!len))
1418 break; 1418 break;
1419 if (unlikely(!base)) { 1419 if (unlikely(!base)) {
1420 error = -EFAULT; 1420 error = -EFAULT;
1421 break; 1421 break;
1422 } 1422 }
1423 1423
1424 sd.len = 0; 1424 sd.len = 0;
1425 sd.total_len = len; 1425 sd.total_len = len;
1426 sd.flags = flags; 1426 sd.flags = flags;
1427 sd.u.userptr = base; 1427 sd.u.userptr = base;
1428 sd.pos = 0; 1428 sd.pos = 0;
1429 1429
1430 size = __splice_from_pipe(pipe, &sd, pipe_to_user); 1430 size = __splice_from_pipe(pipe, &sd, pipe_to_user);
1431 if (size < 0) { 1431 if (size < 0) {
1432 if (!ret) 1432 if (!ret)
1433 ret = size; 1433 ret = size;
1434 1434
1435 break; 1435 break;
1436 } 1436 }
1437 1437
1438 ret += size; 1438 ret += size;
1439 1439
1440 if (size < len) 1440 if (size < len)
1441 break; 1441 break;
1442 1442
1443 nr_segs--; 1443 nr_segs--;
1444 iov++; 1444 iov++;
1445 } 1445 }
1446 1446
1447 if (pipe->inode) 1447 if (pipe->inode)
1448 mutex_unlock(&pipe->inode->i_mutex); 1448 mutex_unlock(&pipe->inode->i_mutex);
1449 1449
1450 if (!ret) 1450 if (!ret)
1451 ret = error; 1451 ret = error;
1452 1452
1453 return ret; 1453 return ret;
1454 } 1454 }
1455 1455
1456 /* 1456 /*
1457 * vmsplice splices a user address range into a pipe. It can be thought of 1457 * vmsplice splices a user address range into a pipe. It can be thought of
1458 * as splice-from-memory, where the regular splice is splice-from-file (or 1458 * as splice-from-memory, where the regular splice is splice-from-file (or
1459 * to file). In both cases the output is a pipe, naturally. 1459 * to file). In both cases the output is a pipe, naturally.
1460 */ 1460 */
1461 static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, 1461 static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1462 unsigned long nr_segs, unsigned int flags) 1462 unsigned long nr_segs, unsigned int flags)
1463 { 1463 {
1464 struct pipe_inode_info *pipe; 1464 struct pipe_inode_info *pipe;
1465 struct page *pages[PIPE_BUFFERS]; 1465 struct page *pages[PIPE_BUFFERS];
1466 struct partial_page partial[PIPE_BUFFERS]; 1466 struct partial_page partial[PIPE_BUFFERS];
1467 struct splice_pipe_desc spd = { 1467 struct splice_pipe_desc spd = {
1468 .pages = pages, 1468 .pages = pages,
1469 .partial = partial, 1469 .partial = partial,
1470 .flags = flags, 1470 .flags = flags,
1471 .ops = &user_page_pipe_buf_ops, 1471 .ops = &user_page_pipe_buf_ops,
1472 }; 1472 };
1473 1473
1474 pipe = pipe_info(file->f_path.dentry->d_inode); 1474 pipe = pipe_info(file->f_path.dentry->d_inode);
1475 if (!pipe) 1475 if (!pipe)
1476 return -EBADF; 1476 return -EBADF;
1477 1477
1478 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, 1478 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
1479 flags & SPLICE_F_GIFT); 1479 flags & SPLICE_F_GIFT);
1480 if (spd.nr_pages <= 0) 1480 if (spd.nr_pages <= 0)
1481 return spd.nr_pages; 1481 return spd.nr_pages;
1482 1482
1483 return splice_to_pipe(pipe, &spd); 1483 return splice_to_pipe(pipe, &spd);
1484 } 1484 }
1485 1485
1486 /* 1486 /*
1487 * Note that vmsplice only really supports true splicing _from_ user memory 1487 * Note that vmsplice only really supports true splicing _from_ user memory
1488 * to a pipe, not the other way around. Splicing from user memory is a simple 1488 * to a pipe, not the other way around. Splicing from user memory is a simple
1489 * operation that can be supported without any funky alignment restrictions 1489 * operation that can be supported without any funky alignment restrictions
1490 * or nasty vm tricks. We simply map in the user pages and fill them into 1490 * or nasty vm tricks. We simply map in the user pages and fill them into
1491 * a pipe. The reverse isn't quite as easy, though. There are two possible 1491 * a pipe. The reverse isn't quite as easy, though. There are two possible
1492 * solutions for that: 1492 * solutions for that:
1493 * 1493 *
1494 * - memcpy() the data internally, at which point we might as well just 1494 * - memcpy() the data internally, at which point we might as well just
1495 * do a regular read() on the buffer anyway. 1495 * do a regular read() on the buffer anyway.
1496 * - Lots of nasty vm tricks that are neither fast nor flexible (they 1496 * - Lots of nasty vm tricks that are neither fast nor flexible (they
1497 * impose restrictions on both ends of the pipe). 1497 * impose restrictions on both ends of the pipe).
1498 * 1498 *
1499 * Currently we punt and implement it as a normal copy, see pipe_to_user(). 1499 * Currently we punt and implement it as a normal copy, see pipe_to_user().
1500 * 1500 *
1501 */ 1501 */
1502 asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, 1502 asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
1503 unsigned long nr_segs, unsigned int flags) 1503 unsigned long nr_segs, unsigned int flags)
1504 { 1504 {
1505 struct file *file; 1505 struct file *file;
1506 long error; 1506 long error;
1507 int fput; 1507 int fput;
1508 1508
1509 if (unlikely(nr_segs > UIO_MAXIOV)) 1509 if (unlikely(nr_segs > UIO_MAXIOV))
1510 return -EINVAL; 1510 return -EINVAL;
1511 else if (unlikely(!nr_segs)) 1511 else if (unlikely(!nr_segs))
1512 return 0; 1512 return 0;
1513 1513
1514 error = -EBADF; 1514 error = -EBADF;
1515 file = fget_light(fd, &fput); 1515 file = fget_light(fd, &fput);
1516 if (file) { 1516 if (file) {
1517 if (file->f_mode & FMODE_WRITE) 1517 if (file->f_mode & FMODE_WRITE)
1518 error = vmsplice_to_pipe(file, iov, nr_segs, flags); 1518 error = vmsplice_to_pipe(file, iov, nr_segs, flags);
1519 else if (file->f_mode & FMODE_READ) 1519 else if (file->f_mode & FMODE_READ)
1520 error = vmsplice_to_user(file, iov, nr_segs, flags); 1520 error = vmsplice_to_user(file, iov, nr_segs, flags);
1521 1521
1522 fput_light(file, fput); 1522 fput_light(file, fput);
1523 } 1523 }
1524 1524
1525 return error; 1525 return error;
1526 } 1526 }
1527 1527
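A userspace sketch of the write-side path above: vmsplice() a page-aligned buffer into a pipe, gifting the pages so a downstream consumer may steal them (the alignment requirement comes from the SPLICE_F_GIFT handling in get_iovec_page_array()):

/* Userspace sketch: gift one page-aligned buffer to a pipe. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/uio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int pfd[2];
	struct iovec iov;
	void *buf;

	if (pipe(pfd) < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 'x', 4096);

	iov.iov_base = buf;
	iov.iov_len = 4096;
	return vmsplice(pfd[1], &iov, 1, SPLICE_F_GIFT) == 4096 ? 0 : 1;
}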
1528 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, 1528 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
1529 int fd_out, loff_t __user *off_out, 1529 int fd_out, loff_t __user *off_out,
1530 size_t len, unsigned int flags) 1530 size_t len, unsigned int flags)
1531 { 1531 {
1532 long error; 1532 long error;
1533 struct file *in, *out; 1533 struct file *in, *out;
1534 int fput_in, fput_out; 1534 int fput_in, fput_out;
1535 1535
1536 if (unlikely(!len)) 1536 if (unlikely(!len))
1537 return 0; 1537 return 0;
1538 1538
1539 error = -EBADF; 1539 error = -EBADF;
1540 in = fget_light(fd_in, &fput_in); 1540 in = fget_light(fd_in, &fput_in);
1541 if (in) { 1541 if (in) {
1542 if (in->f_mode & FMODE_READ) { 1542 if (in->f_mode & FMODE_READ) {
1543 out = fget_light(fd_out, &fput_out); 1543 out = fget_light(fd_out, &fput_out);
1544 if (out) { 1544 if (out) {
1545 if (out->f_mode & FMODE_WRITE) 1545 if (out->f_mode & FMODE_WRITE)
1546 error = do_splice(in, off_in, 1546 error = do_splice(in, off_in,
1547 out, off_out, 1547 out, off_out,
1548 len, flags); 1548 len, flags);
1549 fput_light(out, fput_out); 1549 fput_light(out, fput_out);
1550 } 1550 }
1551 } 1551 }
1552 1552
1553 fput_light(in, fput_in); 1553 fput_light(in, fput_in);
1554 } 1554 }
1555 1555
1556 return error; 1556 return error;
1557 } 1557 }
1558 1558
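Putting sys_splice() to work from userspace: the classic pattern relays a file to another descriptor through a pipe, two splice() calls per chunk. A hedged sketch, cleanup on the error paths trimmed:

/* Userspace sketch: relay len bytes from in_fd (a file) to out_fd
 * (e.g. a socket) through a freshly created pipe.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static int relay(int in_fd, int out_fd, size_t len)
{
	int pfd[2];

	if (pipe(pfd) < 0)
		return -1;

	while (len) {
		ssize_t n = splice(in_fd, NULL, pfd[1], NULL, len, 0);
		if (n <= 0)
			break;			/* EOF or error */
		len -= n;
		while (n > 0) {			/* drain the pipe */
			ssize_t m = splice(pfd[0], NULL, out_fd, NULL, n, 0);
			if (m <= 0)
				return -1;
			n -= m;
		}
	}
	close(pfd[0]);
	close(pfd[1]);
	return len ? -1 : 0;
}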
1559 /* 1559 /*
1560 * Make sure there's data to read. Wait for input if we can, otherwise 1560 * Make sure there's data to read. Wait for input if we can, otherwise
1561 * return an appropriate error. 1561 * return an appropriate error.
1562 */ 1562 */
1563 static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1563 static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1564 { 1564 {
1565 int ret; 1565 int ret;
1566 1566
1567 /* 1567 /*
1568 * Check ->nrbufs without the inode lock first. This function 1568 * Check ->nrbufs without the inode lock first. This function
1569 * is speculative anyway, so missing one is ok. 1569 * is speculative anyway, so missing one is ok.
1570 */ 1570 */
1571 if (pipe->nrbufs) 1571 if (pipe->nrbufs)
1572 return 0; 1572 return 0;
1573 1573
1574 ret = 0; 1574 ret = 0;
1575 mutex_lock(&pipe->inode->i_mutex); 1575 mutex_lock(&pipe->inode->i_mutex);
1576 1576
1577 while (!pipe->nrbufs) { 1577 while (!pipe->nrbufs) {
1578 if (signal_pending(current)) { 1578 if (signal_pending(current)) {
1579 ret = -ERESTARTSYS; 1579 ret = -ERESTARTSYS;
1580 break; 1580 break;
1581 } 1581 }
1582 if (!pipe->writers) 1582 if (!pipe->writers)
1583 break; 1583 break;
1584 if (!pipe->waiting_writers) { 1584 if (!pipe->waiting_writers) {
1585 if (flags & SPLICE_F_NONBLOCK) { 1585 if (flags & SPLICE_F_NONBLOCK) {
1586 ret = -EAGAIN; 1586 ret = -EAGAIN;
1587 break; 1587 break;
1588 } 1588 }
1589 } 1589 }
1590 pipe_wait(pipe); 1590 pipe_wait(pipe);
1591 } 1591 }
1592 1592
1593 mutex_unlock(&pipe->inode->i_mutex); 1593 mutex_unlock(&pipe->inode->i_mutex);
1594 return ret; 1594 return ret;
1595 } 1595 }
1596 1596
1597 /* 1597 /*
1598 * Make sure there's writable room. Wait for room if we can, otherwise 1598 * Make sure there's writable room. Wait for room if we can, otherwise
1599 * return an appropriate error. 1599 * return an appropriate error.
1600 */ 1600 */
1601 static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1601 static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1602 { 1602 {
1603 int ret; 1603 int ret;
1604 1604
1605 /* 1605 /*
1606 * Check ->nrbufs without the inode lock first. This function 1606 * Check ->nrbufs without the inode lock first. This function
1607 * is speculative anyway, so missing one is ok. 1607 * is speculative anyway, so missing one is ok.
1608 */ 1608 */
1609 if (pipe->nrbufs < PIPE_BUFFERS) 1609 if (pipe->nrbufs < PIPE_BUFFERS)
1610 return 0; 1610 return 0;
1611 1611
1612 ret = 0; 1612 ret = 0;
1613 mutex_lock(&pipe->inode->i_mutex); 1613 mutex_lock(&pipe->inode->i_mutex);
1614 1614
1615 while (pipe->nrbufs >= PIPE_BUFFERS) { 1615 while (pipe->nrbufs >= PIPE_BUFFERS) {
1616 if (!pipe->readers) { 1616 if (!pipe->readers) {
1617 send_sig(SIGPIPE, current, 0); 1617 send_sig(SIGPIPE, current, 0);
1618 ret = -EPIPE; 1618 ret = -EPIPE;
1619 break; 1619 break;
1620 } 1620 }
1621 if (flags & SPLICE_F_NONBLOCK) { 1621 if (flags & SPLICE_F_NONBLOCK) {
1622 ret = -EAGAIN; 1622 ret = -EAGAIN;
1623 break; 1623 break;
1624 } 1624 }
1625 if (signal_pending(current)) { 1625 if (signal_pending(current)) {
1626 ret = -ERESTARTSYS; 1626 ret = -ERESTARTSYS;
1627 break; 1627 break;
1628 } 1628 }
1629 pipe->waiting_writers++; 1629 pipe->waiting_writers++;
1630 pipe_wait(pipe); 1630 pipe_wait(pipe);
1631 pipe->waiting_writers--; 1631 pipe->waiting_writers--;
1632 } 1632 }
1633 1633
1634 mutex_unlock(&pipe->inode->i_mutex); 1634 mutex_unlock(&pipe->inode->i_mutex);
1635 return ret; 1635 return ret;
1636 } 1636 }
1637 1637
1638 /* 1638 /*
1639 * Link contents of ipipe to opipe. 1639 * Link contents of ipipe to opipe.
1640 */ 1640 */
1641 static int link_pipe(struct pipe_inode_info *ipipe, 1641 static int link_pipe(struct pipe_inode_info *ipipe,
1642 struct pipe_inode_info *opipe, 1642 struct pipe_inode_info *opipe,
1643 size_t len, unsigned int flags) 1643 size_t len, unsigned int flags)
1644 { 1644 {
1645 struct pipe_buffer *ibuf, *obuf; 1645 struct pipe_buffer *ibuf, *obuf;
1646 int ret = 0, i = 0, nbuf; 1646 int ret = 0, i = 0, nbuf;
1647 1647
1648 /* 1648 /*
1649 * Potential ABBA deadlock; work around it by ordering lock 1649 * Potential ABBA deadlock; work around it by ordering lock
1650 * acquisition by inode address. Otherwise two different processes 1650 * acquisition by inode address. Otherwise two different processes
1651 * could deadlock (one doing tee from A -> B, the other from B -> A). 1651 * could deadlock (one doing tee from A -> B, the other from B -> A).
1652 */ 1652 */
1653 inode_double_lock(ipipe->inode, opipe->inode); 1653 inode_double_lock(ipipe->inode, opipe->inode);
1654 1654
1655 do { 1655 do {
1656 if (!opipe->readers) { 1656 if (!opipe->readers) {
1657 send_sig(SIGPIPE, current, 0); 1657 send_sig(SIGPIPE, current, 0);
1658 if (!ret) 1658 if (!ret)
1659 ret = -EPIPE; 1659 ret = -EPIPE;
1660 break; 1660 break;
1661 } 1661 }
1662 1662
1663 /* 1663 /*
1664 * If we have iterated all input buffers or run out of 1664 * If we have iterated all input buffers or run out of
1665 * output room, break. 1665 * output room, break.
1666 */ 1666 */
1667 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) 1667 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
1668 break; 1668 break;
1669 1669
1670 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); 1670 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
1671 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); 1671 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
1672 1672
1673 /* 1673 /*
1674 * Get a reference to this pipe buffer, 1674 * Get a reference to this pipe buffer,
1675 * so we can copy the contents over. 1675 * so we can copy the contents over.
1676 */ 1676 */
1677 ibuf->ops->get(ipipe, ibuf); 1677 ibuf->ops->get(ipipe, ibuf);
1678 1678
1679 obuf = opipe->bufs + nbuf; 1679 obuf = opipe->bufs + nbuf;
1680 *obuf = *ibuf; 1680 *obuf = *ibuf;
1681 1681
1682 /* 1682 /*
1683 * Don't inherit the gift flag, we need to 1683 * Don't inherit the gift flag, we need to
1684 * prevent multiple steals of this page. 1684 * prevent multiple steals of this page.
1685 */ 1685 */
1686 obuf->flags &= ~PIPE_BUF_FLAG_GIFT; 1686 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1687 1687
1688 if (obuf->len > len) 1688 if (obuf->len > len)
1689 obuf->len = len; 1689 obuf->len = len;
1690 1690
1691 opipe->nrbufs++; 1691 opipe->nrbufs++;
1692 ret += obuf->len; 1692 ret += obuf->len;
1693 len -= obuf->len; 1693 len -= obuf->len;
1694 i++; 1694 i++;
1695 } while (len); 1695 } while (len);
1696 1696
1697 inode_double_unlock(ipipe->inode, opipe->inode); 1697 inode_double_unlock(ipipe->inode, opipe->inode);
1698 1698
1699 /* 1699 /*
1700 * If we put data in the output pipe, wake up any potential readers. 1700 * If we put data in the output pipe, wake up any potential readers.
1701 */ 1701 */
1702 if (ret > 0) { 1702 if (ret > 0) {
1703 smp_mb(); 1703 smp_mb();
1704 if (waitqueue_active(&opipe->wait)) 1704 if (waitqueue_active(&opipe->wait))
1705 wake_up_interruptible(&opipe->wait); 1705 wake_up_interruptible(&opipe->wait);
1706 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); 1706 kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1707 } 1707 }
1708 1708
1709 return ret; 1709 return ret;
1710 } 1710 }
1711 1711
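The ring indexing above depends on PIPE_BUFFERS being a power of two, so masking with (PIPE_BUFFERS - 1) wraps indices like a modulo at a fraction of the cost. A standalone check:

/* Standalone illustration: power-of-two masking equals modulo. */
#include <assert.h>

#define RING_SIZE 16	/* mirrors PIPE_BUFFERS */

int main(void)
{
	unsigned int curbuf = 14, i;

	for (i = 0; i < RING_SIZE; i++)
		assert(((curbuf + i) & (RING_SIZE - 1)) ==
		       (curbuf + i) % RING_SIZE);
	return 0;
}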
1712 /* 1712 /*
1713 * This is a tee(1) implementation that works on pipes. It doesn't copy 1713 * This is a tee(1) implementation that works on pipes. It doesn't copy
1714 * any data, it simply references the 'in' pages on the 'out' pipe. 1714 * any data, it simply references the 'in' pages on the 'out' pipe.
1715 * The 'flags' used are the SPLICE_F_* variants, currently the only 1715 * The 'flags' used are the SPLICE_F_* variants, currently the only
1716 * applicable one is SPLICE_F_NONBLOCK. 1716 * applicable one is SPLICE_F_NONBLOCK.
1717 */ 1717 */
1718 static long do_tee(struct file *in, struct file *out, size_t len, 1718 static long do_tee(struct file *in, struct file *out, size_t len,
1719 unsigned int flags) 1719 unsigned int flags)
1720 { 1720 {
1721 struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); 1721 struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
1722 struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); 1722 struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
1723 int ret = -EINVAL; 1723 int ret = -EINVAL;
1724 1724
1725 /* 1725 /*
1726 * Duplicate the contents of ipipe to opipe without actually 1726 * Duplicate the contents of ipipe to opipe without actually
1727 * copying the data. 1727 * copying the data.
1728 */ 1728 */
1729 if (ipipe && opipe && ipipe != opipe) { 1729 if (ipipe && opipe && ipipe != opipe) {
1730 /* 1730 /*
1731 * Keep going, unless we encounter an error. The ipipe/opipe 1731 * Keep going, unless we encounter an error. The ipipe/opipe
1732 * ordering doesn't really matter. 1732 * ordering doesn't really matter.
1733 */ 1733 */
1734 ret = link_ipipe_prep(ipipe, flags); 1734 ret = link_ipipe_prep(ipipe, flags);
1735 if (!ret) { 1735 if (!ret) {
1736 ret = link_opipe_prep(opipe, flags); 1736 ret = link_opipe_prep(opipe, flags);
1737 if (!ret) { 1737 if (!ret) {
1738 ret = link_pipe(ipipe, opipe, len, flags); 1738 ret = link_pipe(ipipe, opipe, len, flags);
1739 if (!ret && (flags & SPLICE_F_NONBLOCK)) 1739 if (!ret && (flags & SPLICE_F_NONBLOCK))
1740 ret = -EAGAIN; 1740 ret = -EAGAIN;
1741 } 1741 }
1742 } 1742 }
1743 } 1743 }
1744 1744
1745 return ret; 1745 return ret;
1746 } 1746 }
1747 1747
1748 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) 1748 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
1749 { 1749 {
1750 struct file *in; 1750 struct file *in;
1751 int error, fput_in; 1751 int error, fput_in;
1752 1752
1753 if (unlikely(!len)) 1753 if (unlikely(!len))
1754 return 0; 1754 return 0;
1755 1755
1756 error = -EBADF; 1756 error = -EBADF;
1757 in = fget_light(fdin, &fput_in); 1757 in = fget_light(fdin, &fput_in);
1758 if (in) { 1758 if (in) {
1759 if (in->f_mode & FMODE_READ) { 1759 if (in->f_mode & FMODE_READ) {
1760 int fput_out; 1760 int fput_out;
1761 struct file *out = fget_light(fdout, &fput_out); 1761 struct file *out = fget_light(fdout, &fput_out);
1762 1762
1763 if (out) { 1763 if (out) {
1764 if (out->f_mode & FMODE_WRITE) 1764 if (out->f_mode & FMODE_WRITE)
1765 error = do_tee(in, out, len, flags); 1765 error = do_tee(in, out, len, flags);
1766 fput_light(out, fput_out); 1766 fput_light(out, fput_out);
1767 } 1767 }
1768 } 1768 }
1769 fput_light(in, fput_in); 1769 fput_light(in, fput_in);
1770 } 1770 }
1771 1771
1772 return error; 1772 return error;
1773 } 1773 }
1774 1774
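From userspace, the usual pattern (per the tee(2) man page) is to duplicate stdin's pipe contents to stdout's pipe, then consume them, here by splicing to a file. A hedged sketch:

/* Userspace sketch: tee stdin to stdout (both must be pipes), then
 * drain stdin into argv[1] so the next tee() sees fresh data.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <limits.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd = open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
	ssize_t n;

	if (fd < 0)
		return 1;
	for (;;) {
		n = tee(STDIN_FILENO, STDOUT_FILENO, INT_MAX, 0);
		if (n <= 0)
			break;		/* 0 means the writer closed */
		while (n > 0) {		/* consume what we duplicated */
			ssize_t m = splice(STDIN_FILENO, NULL, fd, NULL,
					   n, SPLICE_F_MOVE);
			if (m <= 0)
				return 1;
			n -= m;
		}
	}
	return n < 0 ? 1 : 0;
}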
include/linux/pipe_fs_i.h
1 #ifndef _LINUX_PIPE_FS_I_H 1 #ifndef _LINUX_PIPE_FS_I_H
2 #define _LINUX_PIPE_FS_I_H 2 #define _LINUX_PIPE_FS_I_H
3 3
4 #define PIPEFS_MAGIC 0x50495045 4 #define PIPEFS_MAGIC 0x50495045
5 5
6 #define PIPE_BUFFERS (16) 6 #define PIPE_BUFFERS (16)
7 7
8 #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ 8 #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */
9 #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ 9 #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */
10 #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ 10 #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */
11 11
12 struct pipe_buffer { 12 struct pipe_buffer {
13 struct page *page; 13 struct page *page;
14 unsigned int offset, len; 14 unsigned int offset, len;
15 const struct pipe_buf_operations *ops; 15 const struct pipe_buf_operations *ops;
16 unsigned int flags; 16 unsigned int flags;
17 unsigned long private; 17 unsigned long private;
18 }; 18 };
19 19
20 struct pipe_inode_info { 20 struct pipe_inode_info {
21 wait_queue_head_t wait; 21 wait_queue_head_t wait;
22 unsigned int nrbufs, curbuf; 22 unsigned int nrbufs, curbuf;
23 struct page *tmp_page; 23 struct page *tmp_page;
24 unsigned int readers; 24 unsigned int readers;
25 unsigned int writers; 25 unsigned int writers;
26 unsigned int waiting_writers; 26 unsigned int waiting_writers;
27 unsigned int r_counter; 27 unsigned int r_counter;
28 unsigned int w_counter; 28 unsigned int w_counter;
29 struct fasync_struct *fasync_readers; 29 struct fasync_struct *fasync_readers;
30 struct fasync_struct *fasync_writers; 30 struct fasync_struct *fasync_writers;
31 struct inode *inode; 31 struct inode *inode;
32 struct pipe_buffer bufs[PIPE_BUFFERS]; 32 struct pipe_buffer bufs[PIPE_BUFFERS];
33 }; 33 };
34 34
35 /* 35 /*
36 * Note on the nesting of these functions: 36 * Note on the nesting of these functions:
37 * 37 *
38 * ->pin() 38 * ->confirm()
39 * ->steal() 39 * ->steal()
40 * ... 40 * ...
41 * ->map() 41 * ->map()
42 * ... 42 * ...
43 * ->unmap() 43 * ->unmap()
44 * 44 *
45 * That is, ->map() must be called on a pinned buffer, same goes for ->steal(). 45 * That is, ->map() must be called on a confirmed buffer,
46 * same goes for ->steal().
46 */ 47 */
47 struct pipe_buf_operations { 48 struct pipe_buf_operations {
48 int can_merge; 49 int can_merge;
49 void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int); 50 void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int);
50 void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *); 51 void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *);
51 int (*pin)(struct pipe_inode_info *, struct pipe_buffer *); 52 int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);
52 void (*release)(struct pipe_inode_info *, struct pipe_buffer *); 53 void (*release)(struct pipe_inode_info *, struct pipe_buffer *);
53 int (*steal)(struct pipe_inode_info *, struct pipe_buffer *); 54 int (*steal)(struct pipe_inode_info *, struct pipe_buffer *);
54 void (*get)(struct pipe_inode_info *, struct pipe_buffer *); 55 void (*get)(struct pipe_inode_info *, struct pipe_buffer *);
55 }; 56 };
56 57
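Two common shapes of the renamed hook, sketched here as a hedged reconstruction rather than quoted from this hunk: buffers whose pages are always resident and valid can simply return 0, while page-cache-backed buffers must verify that the page survived truncation and its read completed.

/* Hedged reconstruction, not part of this diff: the trivial confirm
 * for always-valid pages, and a page-cache flavour that checks the
 * page still holds good contents.
 */
#include <linux/pipe_fs_i.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

int generic_pipe_buf_confirm(struct pipe_inode_info *pipe,
			     struct pipe_buffer *buf)
{
	return 0;	/* anonymous pipe pages are always valid */
}

static int page_cache_buf_confirm(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err = 0;

	if (!PageUptodate(page)) {
		lock_page(page);
		if (!page->mapping)		/* page was truncated */
			err = -ENODATA;
		else if (!PageUptodate(page))	/* read failed */
			err = -EIO;
		unlock_page(page);
	}
	return err;
}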
57 /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual 58 /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
58 memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ 59 memory allocation, whereas PIPE_BUF makes atomicity guarantees. */
59 #define PIPE_SIZE PAGE_SIZE 60 #define PIPE_SIZE PAGE_SIZE
60 61
61 /* Drop the inode semaphore and wait for a pipe event, atomically */ 62 /* Drop the inode semaphore and wait for a pipe event, atomically */
62 void pipe_wait(struct pipe_inode_info *pipe); 63 void pipe_wait(struct pipe_inode_info *pipe);
63 64
64 struct pipe_inode_info * alloc_pipe_info(struct inode * inode); 65 struct pipe_inode_info * alloc_pipe_info(struct inode * inode);
65 void free_pipe_info(struct inode * inode); 66 void free_pipe_info(struct inode * inode);
66 void __free_pipe_info(struct pipe_inode_info *); 67 void __free_pipe_info(struct pipe_inode_info *);
67 68
68 /* Generic pipe buffer ops functions */ 69 /* Generic pipe buffer ops functions */
69 void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int); 70 void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int);
70 void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *); 71 void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *);
71 void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); 72 void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
72 int generic_pipe_buf_pin(struct pipe_inode_info *, struct pipe_buffer *); 73 int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
73 int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); 74 int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
74 75
75 #endif 76 #endif
76 77
kernel/relay.c
1 /* 1 /*
2 * Public API and common code for kernel->userspace relay file support. 2 * Public API and common code for kernel->userspace relay file support.
3 * 3 *
4 * See Documentation/filesystems/relayfs.txt for an overview of relayfs. 4 * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
5 * 5 *
6 * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp 6 * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
7 * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) 7 * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
8 * 8 *
9 * Moved to kernel/relay.c by Paul Mundt, 2006. 9 * Moved to kernel/relay.c by Paul Mundt, 2006.
10 * November 2006 - CPU hotplug support by Mathieu Desnoyers 10 * November 2006 - CPU hotplug support by Mathieu Desnoyers
11 * (mathieu.desnoyers@polymtl.ca) 11 * (mathieu.desnoyers@polymtl.ca)
12 * 12 *
13 * This file is released under the GPL. 13 * This file is released under the GPL.
14 */ 14 */
15 #include <linux/errno.h> 15 #include <linux/errno.h>
16 #include <linux/stddef.h> 16 #include <linux/stddef.h>
17 #include <linux/slab.h> 17 #include <linux/slab.h>
18 #include <linux/module.h> 18 #include <linux/module.h>
19 #include <linux/string.h> 19 #include <linux/string.h>
20 #include <linux/relay.h> 20 #include <linux/relay.h>
21 #include <linux/vmalloc.h> 21 #include <linux/vmalloc.h>
22 #include <linux/mm.h> 22 #include <linux/mm.h>
23 #include <linux/cpu.h> 23 #include <linux/cpu.h>
24 #include <linux/splice.h> 24 #include <linux/splice.h>
25 25
26 /* list of open channels, for cpu hotplug */ 26 /* list of open channels, for cpu hotplug */
27 static DEFINE_MUTEX(relay_channels_mutex); 27 static DEFINE_MUTEX(relay_channels_mutex);
28 static LIST_HEAD(relay_channels); 28 static LIST_HEAD(relay_channels);
29 29
30 /* 30 /*
31 * close() vm_op implementation for relay file mapping. 31 * close() vm_op implementation for relay file mapping.
32 */ 32 */
33 static void relay_file_mmap_close(struct vm_area_struct *vma) 33 static void relay_file_mmap_close(struct vm_area_struct *vma)
34 { 34 {
35 struct rchan_buf *buf = vma->vm_private_data; 35 struct rchan_buf *buf = vma->vm_private_data;
36 buf->chan->cb->buf_unmapped(buf, vma->vm_file); 36 buf->chan->cb->buf_unmapped(buf, vma->vm_file);
37 } 37 }
38 38
39 /* 39 /*
40 * nopage() vm_op implementation for relay file mapping. 40 * nopage() vm_op implementation for relay file mapping.
41 */ 41 */
42 static struct page *relay_buf_nopage(struct vm_area_struct *vma, 42 static struct page *relay_buf_nopage(struct vm_area_struct *vma,
43 unsigned long address, 43 unsigned long address,
44 int *type) 44 int *type)
45 { 45 {
46 struct page *page; 46 struct page *page;
47 struct rchan_buf *buf = vma->vm_private_data; 47 struct rchan_buf *buf = vma->vm_private_data;
48 unsigned long offset = address - vma->vm_start; 48 unsigned long offset = address - vma->vm_start;
49 49
50 if (address > vma->vm_end) 50 if (address > vma->vm_end)
51 return NOPAGE_SIGBUS; /* Disallow mremap */ 51 return NOPAGE_SIGBUS; /* Disallow mremap */
52 if (!buf) 52 if (!buf)
53 return NOPAGE_OOM; 53 return NOPAGE_OOM;
54 54
55 page = vmalloc_to_page(buf->start + offset); 55 page = vmalloc_to_page(buf->start + offset);
56 if (!page) 56 if (!page)
57 return NOPAGE_OOM; 57 return NOPAGE_OOM;
58 get_page(page); 58 get_page(page);
59 59
60 if (type) 60 if (type)
61 *type = VM_FAULT_MINOR; 61 *type = VM_FAULT_MINOR;
62 62
63 return page; 63 return page;
64 } 64 }
65 65
66 /* 66 /*
67 * vm_ops for relay file mappings. 67 * vm_ops for relay file mappings.
68 */ 68 */
69 static struct vm_operations_struct relay_file_mmap_ops = { 69 static struct vm_operations_struct relay_file_mmap_ops = {
70 .nopage = relay_buf_nopage, 70 .nopage = relay_buf_nopage,
71 .close = relay_file_mmap_close, 71 .close = relay_file_mmap_close,
72 }; 72 };
73 73
74 /** 74 /**
75 * relay_mmap_buf: - mmap channel buffer to process address space 75 * relay_mmap_buf: - mmap channel buffer to process address space
76 * @buf: relay channel buffer 76 * @buf: relay channel buffer
77 * @vma: vm_area_struct describing memory to be mapped 77 * @vma: vm_area_struct describing memory to be mapped
78 * 78 *
79 * Returns 0 if ok, negative on error 79 * Returns 0 if ok, negative on error
80 * 80 *
81 * Caller should already have grabbed mmap_sem. 81 * Caller should already have grabbed mmap_sem.
82 */ 82 */
83 int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) 83 int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
84 { 84 {
85 unsigned long length = vma->vm_end - vma->vm_start; 85 unsigned long length = vma->vm_end - vma->vm_start;
86 struct file *filp = vma->vm_file; 86 struct file *filp = vma->vm_file;
87 87
88 if (!buf) 88 if (!buf)
89 return -EBADF; 89 return -EBADF;
90 90
91 if (length != (unsigned long)buf->chan->alloc_size) 91 if (length != (unsigned long)buf->chan->alloc_size)
92 return -EINVAL; 92 return -EINVAL;
93 93
94 vma->vm_ops = &relay_file_mmap_ops; 94 vma->vm_ops = &relay_file_mmap_ops;
95 vma->vm_private_data = buf; 95 vma->vm_private_data = buf;
96 buf->chan->cb->buf_mapped(buf, filp); 96 buf->chan->cb->buf_mapped(buf, filp);
97 97
98 return 0; 98 return 0;
99 } 99 }
100 100
101 /** 101 /**
102 * relay_alloc_buf - allocate a channel buffer 102 * relay_alloc_buf - allocate a channel buffer
103 * @buf: the buffer struct 103 * @buf: the buffer struct
104 * @size: total size of the buffer 104 * @size: total size of the buffer
105 * 105 *
106 * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The 106 * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The
107 * passed-in size will be page-aligned if it isn't already. 107 * passed-in size will be page-aligned if it isn't already.
108 */ 108 */
109 static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) 109 static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
110 { 110 {
111 void *mem; 111 void *mem;
112 unsigned int i, j, n_pages; 112 unsigned int i, j, n_pages;
113 113
114 *size = PAGE_ALIGN(*size); 114 *size = PAGE_ALIGN(*size);
115 n_pages = *size >> PAGE_SHIFT; 115 n_pages = *size >> PAGE_SHIFT;
116 116
117 buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); 117 buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
118 if (!buf->page_array) 118 if (!buf->page_array)
119 return NULL; 119 return NULL;
120 120
121 for (i = 0; i < n_pages; i++) { 121 for (i = 0; i < n_pages; i++) {
122 buf->page_array[i] = alloc_page(GFP_KERNEL); 122 buf->page_array[i] = alloc_page(GFP_KERNEL);
123 if (unlikely(!buf->page_array[i])) 123 if (unlikely(!buf->page_array[i]))
124 goto depopulate; 124 goto depopulate;
125 set_page_private(buf->page_array[i], (unsigned long)buf); 125 set_page_private(buf->page_array[i], (unsigned long)buf);
126 } 126 }
127 mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL); 127 mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
128 if (!mem) 128 if (!mem)
129 goto depopulate; 129 goto depopulate;
130 130
131 memset(mem, 0, *size); 131 memset(mem, 0, *size);
132 buf->page_count = n_pages; 132 buf->page_count = n_pages;
133 return mem; 133 return mem;
134 134
135 depopulate: 135 depopulate:
136 for (j = 0; j < i; j++) 136 for (j = 0; j < i; j++)
137 __free_page(buf->page_array[j]); 137 __free_page(buf->page_array[j]);
138 kfree(buf->page_array); 138 kfree(buf->page_array);
139 return NULL; 139 return NULL;
140 } 140 }
141 141
142 /** 142 /**
143 * relay_create_buf - allocate and initialize a channel buffer 143 * relay_create_buf - allocate and initialize a channel buffer
144 * @chan: the relay channel 144 * @chan: the relay channel
145 * 145 *
146 * Returns channel buffer if successful, %NULL otherwise. 146 * Returns channel buffer if successful, %NULL otherwise.
147 */ 147 */
148 struct rchan_buf *relay_create_buf(struct rchan *chan) 148 struct rchan_buf *relay_create_buf(struct rchan *chan)
149 { 149 {
150 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); 150 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
151 if (!buf) 151 if (!buf)
152 return NULL; 152 return NULL;
153 153
154 buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); 154 buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
155 if (!buf->padding) 155 if (!buf->padding)
156 goto free_buf; 156 goto free_buf;
157 157
158 buf->start = relay_alloc_buf(buf, &chan->alloc_size); 158 buf->start = relay_alloc_buf(buf, &chan->alloc_size);
159 if (!buf->start) 159 if (!buf->start)
160 goto free_buf; 160 goto free_buf;
161 161
162 buf->chan = chan; 162 buf->chan = chan;
163 kref_get(&buf->chan->kref); 163 kref_get(&buf->chan->kref);
164 return buf; 164 return buf;
165 165
166 free_buf: 166 free_buf:
167 kfree(buf->padding); 167 kfree(buf->padding);
168 kfree(buf); 168 kfree(buf);
169 return NULL; 169 return NULL;
170 } 170 }
171 171
172 /** 172 /**
173 * relay_destroy_channel - free the channel struct 173 * relay_destroy_channel - free the channel struct
174 * @kref: target kernel reference that contains the relay channel 174 * @kref: target kernel reference that contains the relay channel
175 * 175 *
176 * Should only be called from kref_put(). 176 * Should only be called from kref_put().
177 */ 177 */
178 void relay_destroy_channel(struct kref *kref) 178 void relay_destroy_channel(struct kref *kref)
179 { 179 {
180 struct rchan *chan = container_of(kref, struct rchan, kref); 180 struct rchan *chan = container_of(kref, struct rchan, kref);
181 kfree(chan); 181 kfree(chan);
182 } 182 }
183 183
184 /** 184 /**
185 * relay_destroy_buf - destroy an rchan_buf struct and associated buffer 185 * relay_destroy_buf - destroy an rchan_buf struct and associated buffer
186 * @buf: the buffer struct 186 * @buf: the buffer struct
187 */ 187 */
188 void relay_destroy_buf(struct rchan_buf *buf) 188 void relay_destroy_buf(struct rchan_buf *buf)
189 { 189 {
190 struct rchan *chan = buf->chan; 190 struct rchan *chan = buf->chan;
191 unsigned int i; 191 unsigned int i;
192 192
193 if (likely(buf->start)) { 193 if (likely(buf->start)) {
194 vunmap(buf->start); 194 vunmap(buf->start);
195 for (i = 0; i < buf->page_count; i++) 195 for (i = 0; i < buf->page_count; i++)
196 __free_page(buf->page_array[i]); 196 __free_page(buf->page_array[i]);
197 kfree(buf->page_array); 197 kfree(buf->page_array);
198 } 198 }
199 chan->buf[buf->cpu] = NULL; 199 chan->buf[buf->cpu] = NULL;
200 kfree(buf->padding); 200 kfree(buf->padding);
201 kfree(buf); 201 kfree(buf);
202 kref_put(&chan->kref, relay_destroy_channel); 202 kref_put(&chan->kref, relay_destroy_channel);
203 } 203 }
204 204
205 /** 205 /**
206 * relay_remove_buf - remove a channel buffer 206 * relay_remove_buf - remove a channel buffer
207 * @kref: target kernel reference that contains the relay buffer 207 * @kref: target kernel reference that contains the relay buffer
208 * 208 *
209 * Removes the file from the filesystem, which also frees the 209 * Removes the file from the filesystem, which also frees the
210 * rchan_buf struct and the channel buffer. Should only be called from 210 * rchan_buf struct and the channel buffer. Should only be called from
211 * kref_put(). 211 * kref_put().
212 */ 212 */
213 void relay_remove_buf(struct kref *kref) 213 void relay_remove_buf(struct kref *kref)
214 { 214 {
215 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref); 215 struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
216 buf->chan->cb->remove_buf_file(buf->dentry); 216 buf->chan->cb->remove_buf_file(buf->dentry);
217 relay_destroy_buf(buf); 217 relay_destroy_buf(buf);
218 } 218 }
219 219
220 /** 220 /**
221 * relay_buf_empty - boolean, is the channel buffer empty? 221 * relay_buf_empty - boolean, is the channel buffer empty?
222 * @buf: channel buffer 222 * @buf: channel buffer
223 * 223 *
224 * Returns 1 if the buffer is empty, 0 otherwise. 224 * Returns 1 if the buffer is empty, 0 otherwise.
225 */ 225 */
226 int relay_buf_empty(struct rchan_buf *buf) 226 int relay_buf_empty(struct rchan_buf *buf)
227 { 227 {
228 return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1; 228 return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
229 } 229 }
230 EXPORT_SYMBOL_GPL(relay_buf_empty); 230 EXPORT_SYMBOL_GPL(relay_buf_empty);
231 231
232 /** 232 /**
233 * relay_buf_full - boolean, is the channel buffer full? 233 * relay_buf_full - boolean, is the channel buffer full?
234 * @buf: channel buffer 234 * @buf: channel buffer
235 * 235 *
236 * Returns 1 if the buffer is full, 0 otherwise. 236 * Returns 1 if the buffer is full, 0 otherwise.
237 */ 237 */
238 int relay_buf_full(struct rchan_buf *buf) 238 int relay_buf_full(struct rchan_buf *buf)
239 { 239 {
240 size_t ready = buf->subbufs_produced - buf->subbufs_consumed; 240 size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
241 return (ready >= buf->chan->n_subbufs) ? 1 : 0; 241 return (ready >= buf->chan->n_subbufs) ? 1 : 0;
242 } 242 }
243 EXPORT_SYMBOL_GPL(relay_buf_full); 243 EXPORT_SYMBOL_GPL(relay_buf_full);
244 244
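Both counters compared above are free-running; they are never reset on wraparound, and relay_buf_full() leans on unsigned modular arithmetic to keep the difference meaningful across overflow. A minimal user-space sketch of that property, with made-up values:

#include <stdio.h>
#include <stddef.h>

int main(void)
{
        size_t produced = 3;            /* wrapped past SIZE_MAX back to 3 */
        size_t consumed = (size_t)-2;   /* SIZE_MAX - 1 */

        /* prints 5: five sub-buffers produced but not yet consumed */
        printf("ready = %zu\n", produced - consumed);
        return 0;
}
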
245 /* 245 /*
246 * High-level relay kernel API and associated functions. 246 * High-level relay kernel API and associated functions.
247 */ 247 */
248 248
249 /* 249 /*
250 * rchan_callback implementations defining default channel behavior. Used 250 * rchan_callback implementations defining default channel behavior. Used
251 * in place of corresponding NULL values in client callback struct. 251 * in place of corresponding NULL values in client callback struct.
252 */ 252 */
253 253
254 /* 254 /*
255 * subbuf_start() default callback. Allows the switch unless the buffer is full. 255 * subbuf_start() default callback. Allows the switch unless the buffer is full.
256 */ 256 */
257 static int subbuf_start_default_callback(struct rchan_buf *buf, 257 static int subbuf_start_default_callback(struct rchan_buf *buf,
258 void *subbuf, 258 void *subbuf,
259 void *prev_subbuf, 259 void *prev_subbuf,
260 size_t prev_padding) 260 size_t prev_padding)
261 { 261 {
262 if (relay_buf_full(buf)) 262 if (relay_buf_full(buf))
263 return 0; 263 return 0;
264 264
265 return 1; 265 return 1;
266 } 266 }
267 267
268 /* 268 /*
269 * buf_mapped() default callback. Does nothing. 269 * buf_mapped() default callback. Does nothing.
270 */ 270 */
271 static void buf_mapped_default_callback(struct rchan_buf *buf, 271 static void buf_mapped_default_callback(struct rchan_buf *buf,
272 struct file *filp) 272 struct file *filp)
273 { 273 {
274 } 274 }
275 275
276 /* 276 /*
277 * buf_unmapped() default callback. Does nothing. 277 * buf_unmapped() default callback. Does nothing.
278 */ 278 */
279 static void buf_unmapped_default_callback(struct rchan_buf *buf, 279 static void buf_unmapped_default_callback(struct rchan_buf *buf,
280 struct file *filp) 280 struct file *filp)
281 { 281 {
282 } 282 }
283 283
284 /* 284 /*
285 * create_buf_file() default callback. Does nothing. 285 * create_buf_file() default callback. Does nothing.
286 */ 286 */
287 static struct dentry *create_buf_file_default_callback(const char *filename, 287 static struct dentry *create_buf_file_default_callback(const char *filename,
288 struct dentry *parent, 288 struct dentry *parent,
289 int mode, 289 int mode,
290 struct rchan_buf *buf, 290 struct rchan_buf *buf,
291 int *is_global) 291 int *is_global)
292 { 292 {
293 return NULL; 293 return NULL;
294 } 294 }
295 295
296 /* 296 /*
297 * remove_buf_file() default callback. Does nothing. 297 * remove_buf_file() default callback. Does nothing.
298 */ 298 */
299 static int remove_buf_file_default_callback(struct dentry *dentry) 299 static int remove_buf_file_default_callback(struct dentry *dentry)
300 { 300 {
301 return -EINVAL; 301 return -EINVAL;
302 } 302 }
303 303
304 /* relay channel default callbacks */ 304 /* relay channel default callbacks */
305 static struct rchan_callbacks default_channel_callbacks = { 305 static struct rchan_callbacks default_channel_callbacks = {
306 .subbuf_start = subbuf_start_default_callback, 306 .subbuf_start = subbuf_start_default_callback,
307 .buf_mapped = buf_mapped_default_callback, 307 .buf_mapped = buf_mapped_default_callback,
308 .buf_unmapped = buf_unmapped_default_callback, 308 .buf_unmapped = buf_unmapped_default_callback,
309 .create_buf_file = create_buf_file_default_callback, 309 .create_buf_file = create_buf_file_default_callback,
310 .remove_buf_file = remove_buf_file_default_callback, 310 .remove_buf_file = remove_buf_file_default_callback,
311 }; 311 };
312 312
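For orientation, a relay client normally supplies only the file hooks and lets setup_callbacks() below fill in the rest from these defaults. A minimal sketch, assuming debugfs hosts the buffer files; the my_* names are hypothetical:

static struct dentry *my_create_buf_file(const char *filename,
                                         struct dentry *parent,
                                         int mode,
                                         struct rchan_buf *buf,
                                         int *is_global)
{
        /* expose the buffer via the exported relay_file_operations */
        return debugfs_create_file(filename, mode, parent, buf,
                                   &relay_file_operations);
}

static int my_remove_buf_file(struct dentry *dentry)
{
        debugfs_remove(dentry);
        return 0;
}

static struct rchan_callbacks my_callbacks = {
        .create_buf_file        = my_create_buf_file,
        .remove_buf_file        = my_remove_buf_file,
        /* unset hooks are replaced by the defaults above */
};
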
313 /** 313 /**
314 * wakeup_readers - wake up readers waiting on a channel 314 * wakeup_readers - wake up readers waiting on a channel
315 * @data: contains the channel buffer 315 * @data: contains the channel buffer
316 * 316 *
317 * This is the timer function used to defer reader waking. 317 * This is the timer function used to defer reader waking.
318 */ 318 */
319 static void wakeup_readers(unsigned long data) 319 static void wakeup_readers(unsigned long data)
320 { 320 {
321 struct rchan_buf *buf = (struct rchan_buf *)data; 321 struct rchan_buf *buf = (struct rchan_buf *)data;
322 wake_up_interruptible(&buf->read_wait); 322 wake_up_interruptible(&buf->read_wait);
323 } 323 }
324 324
325 /** 325 /**
326 * __relay_reset - reset a channel buffer 326 * __relay_reset - reset a channel buffer
327 * @buf: the channel buffer 327 * @buf: the channel buffer
328 * @init: 1 if this is a first-time initialization 328 * @init: 1 if this is a first-time initialization
329 * 329 *
330 * See relay_reset() for description of effect. 330 * See relay_reset() for description of effect.
331 */ 331 */
332 static void __relay_reset(struct rchan_buf *buf, unsigned int init) 332 static void __relay_reset(struct rchan_buf *buf, unsigned int init)
333 { 333 {
334 size_t i; 334 size_t i;
335 335
336 if (init) { 336 if (init) {
337 init_waitqueue_head(&buf->read_wait); 337 init_waitqueue_head(&buf->read_wait);
338 kref_init(&buf->kref); 338 kref_init(&buf->kref);
339 setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); 339 setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
340 } else 340 } else
341 del_timer_sync(&buf->timer); 341 del_timer_sync(&buf->timer);
342 342
343 buf->subbufs_produced = 0; 343 buf->subbufs_produced = 0;
344 buf->subbufs_consumed = 0; 344 buf->subbufs_consumed = 0;
345 buf->bytes_consumed = 0; 345 buf->bytes_consumed = 0;
346 buf->finalized = 0; 346 buf->finalized = 0;
347 buf->data = buf->start; 347 buf->data = buf->start;
348 buf->offset = 0; 348 buf->offset = 0;
349 349
350 for (i = 0; i < buf->chan->n_subbufs; i++) 350 for (i = 0; i < buf->chan->n_subbufs; i++)
351 buf->padding[i] = 0; 351 buf->padding[i] = 0;
352 352
353 buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0); 353 buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
354 } 354 }
355 355
356 /** 356 /**
357 * relay_reset - reset the channel 357 * relay_reset - reset the channel
358 * @chan: the channel 358 * @chan: the channel
359 * 359 *
360 * This has the effect of erasing all data from all channel buffers 360 * This has the effect of erasing all data from all channel buffers
361 * and restarting the channel in its initial state. The buffers 361 * and restarting the channel in its initial state. The buffers
362 * are not freed, so any mappings are still in effect. 362 * are not freed, so any mappings are still in effect.
363 * 363 *
364 * NOTE. Care should be taken that the channel isn't actually 364 * NOTE. Care should be taken that the channel isn't actually
365 * being used by anything when this call is made. 365 * being used by anything when this call is made.
366 */ 366 */
367 void relay_reset(struct rchan *chan) 367 void relay_reset(struct rchan *chan)
368 { 368 {
369 unsigned int i; 369 unsigned int i;
370 370
371 if (!chan) 371 if (!chan)
372 return; 372 return;
373 373
374 if (chan->is_global && chan->buf[0]) { 374 if (chan->is_global && chan->buf[0]) {
375 __relay_reset(chan->buf[0], 0); 375 __relay_reset(chan->buf[0], 0);
376 return; 376 return;
377 } 377 }
378 378
379 mutex_lock(&relay_channels_mutex); 379 mutex_lock(&relay_channels_mutex);
380 for_each_online_cpu(i) 380 for_each_online_cpu(i)
381 if (chan->buf[i]) 381 if (chan->buf[i])
382 __relay_reset(chan->buf[i], 0); 382 __relay_reset(chan->buf[i], 0);
383 mutex_unlock(&relay_channels_mutex); 383 mutex_unlock(&relay_channels_mutex);
384 } 384 }
385 EXPORT_SYMBOL_GPL(relay_reset); 385 EXPORT_SYMBOL_GPL(relay_reset);
386 386
387 /* 387 /*
388 * relay_open_buf - create a new relay channel buffer 388 * relay_open_buf - create a new relay channel buffer
389 * 389 *
390 * used by relay_open() and CPU hotplug. 390 * used by relay_open() and CPU hotplug.
391 */ 391 */
392 static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) 392 static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
393 { 393 {
394 struct rchan_buf *buf = NULL; 394 struct rchan_buf *buf = NULL;
395 struct dentry *dentry; 395 struct dentry *dentry;
396 char *tmpname; 396 char *tmpname;
397 397
398 if (chan->is_global) 398 if (chan->is_global)
399 return chan->buf[0]; 399 return chan->buf[0];
400 400
401 tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); 401 tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
402 if (!tmpname) 402 if (!tmpname)
403 goto end; 403 goto end;
404 snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); 404 snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
405 405
406 buf = relay_create_buf(chan); 406 buf = relay_create_buf(chan);
407 if (!buf) 407 if (!buf)
408 goto free_name; 408 goto free_name;
409 409
410 buf->cpu = cpu; 410 buf->cpu = cpu;
411 __relay_reset(buf, 1); 411 __relay_reset(buf, 1);
412 412
413 /* Create file in fs */ 413 /* Create file in fs */
414 dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR, 414 dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR,
415 buf, &chan->is_global); 415 buf, &chan->is_global);
416 if (!dentry) 416 if (!dentry)
417 goto free_buf; 417 goto free_buf;
418 418
419 buf->dentry = dentry; 419 buf->dentry = dentry;
420 420
421 if (chan->is_global) { 421 if (chan->is_global) {
422 chan->buf[0] = buf; 422 chan->buf[0] = buf;
423 buf->cpu = 0; 423 buf->cpu = 0;
424 } 424 }
425 425
426 goto free_name; 426 goto free_name;
427 427
428 free_buf: 428 free_buf:
429 relay_destroy_buf(buf); 429 relay_destroy_buf(buf);
430 free_name: 430 free_name:
431 kfree(tmpname); 431 kfree(tmpname);
432 end: 432 end:
433 return buf; 433 return buf;
434 } 434 }
435 435
436 /** 436 /**
437 * relay_close_buf - close a channel buffer 437 * relay_close_buf - close a channel buffer
438 * @buf: channel buffer 438 * @buf: channel buffer
439 * 439 *
440 * Marks the buffer finalized and restores the default callbacks. 440 * Marks the buffer finalized and restores the default callbacks.
441 * The channel buffer and channel buffer data structure are then freed 441 * The channel buffer and channel buffer data structure are then freed
442 * automatically when the last reference is given up. 442 * automatically when the last reference is given up.
443 */ 443 */
444 static void relay_close_buf(struct rchan_buf *buf) 444 static void relay_close_buf(struct rchan_buf *buf)
445 { 445 {
446 buf->finalized = 1; 446 buf->finalized = 1;
447 del_timer_sync(&buf->timer); 447 del_timer_sync(&buf->timer);
448 kref_put(&buf->kref, relay_remove_buf); 448 kref_put(&buf->kref, relay_remove_buf);
449 } 449 }
450 450
451 static void setup_callbacks(struct rchan *chan, 451 static void setup_callbacks(struct rchan *chan,
452 struct rchan_callbacks *cb) 452 struct rchan_callbacks *cb)
453 { 453 {
454 if (!cb) { 454 if (!cb) {
455 chan->cb = &default_channel_callbacks; 455 chan->cb = &default_channel_callbacks;
456 return; 456 return;
457 } 457 }
458 458
459 if (!cb->subbuf_start) 459 if (!cb->subbuf_start)
460 cb->subbuf_start = subbuf_start_default_callback; 460 cb->subbuf_start = subbuf_start_default_callback;
461 if (!cb->buf_mapped) 461 if (!cb->buf_mapped)
462 cb->buf_mapped = buf_mapped_default_callback; 462 cb->buf_mapped = buf_mapped_default_callback;
463 if (!cb->buf_unmapped) 463 if (!cb->buf_unmapped)
464 cb->buf_unmapped = buf_unmapped_default_callback; 464 cb->buf_unmapped = buf_unmapped_default_callback;
465 if (!cb->create_buf_file) 465 if (!cb->create_buf_file)
466 cb->create_buf_file = create_buf_file_default_callback; 466 cb->create_buf_file = create_buf_file_default_callback;
467 if (!cb->remove_buf_file) 467 if (!cb->remove_buf_file)
468 cb->remove_buf_file = remove_buf_file_default_callback; 468 cb->remove_buf_file = remove_buf_file_default_callback;
469 chan->cb = cb; 469 chan->cb = cb;
470 } 470 }
471 471
472 /** 472 /**
473 * relay_hotcpu_callback - CPU hotplug callback 473 * relay_hotcpu_callback - CPU hotplug callback
474 * @nb: notifier block 474 * @nb: notifier block
475 * @action: hotplug action to take 475 * @action: hotplug action to take
476 * @hcpu: CPU number 476 * @hcpu: CPU number
477 * 477 *
478 * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD) 478 * Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
479 */ 479 */
480 static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, 480 static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
481 unsigned long action, 481 unsigned long action,
482 void *hcpu) 482 void *hcpu)
483 { 483 {
484 unsigned int hotcpu = (unsigned long)hcpu; 484 unsigned int hotcpu = (unsigned long)hcpu;
485 struct rchan *chan; 485 struct rchan *chan;
486 486
487 switch (action) { 487 switch (action) {
488 case CPU_UP_PREPARE: 488 case CPU_UP_PREPARE:
489 case CPU_UP_PREPARE_FROZEN: 489 case CPU_UP_PREPARE_FROZEN:
490 mutex_lock(&relay_channels_mutex); 490 mutex_lock(&relay_channels_mutex);
491 list_for_each_entry(chan, &relay_channels, list) { 491 list_for_each_entry(chan, &relay_channels, list) {
492 if (chan->buf[hotcpu]) 492 if (chan->buf[hotcpu])
493 continue; 493 continue;
494 chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); 494 chan->buf[hotcpu] = relay_open_buf(chan, hotcpu);
495 if (!chan->buf[hotcpu]) { 495 if (!chan->buf[hotcpu]) {
496 printk(KERN_ERR 496 printk(KERN_ERR
497 "relay_hotcpu_callback: cpu %d buffer " 497 "relay_hotcpu_callback: cpu %d buffer "
498 "creation failed\n", hotcpu); 498 "creation failed\n", hotcpu);
499 mutex_unlock(&relay_channels_mutex); 499 mutex_unlock(&relay_channels_mutex);
500 return NOTIFY_BAD; 500 return NOTIFY_BAD;
501 } 501 }
502 } 502 }
503 mutex_unlock(&relay_channels_mutex); 503 mutex_unlock(&relay_channels_mutex);
504 break; 504 break;
505 case CPU_DEAD: 505 case CPU_DEAD:
506 case CPU_DEAD_FROZEN: 506 case CPU_DEAD_FROZEN:
507 /* No need to flush the cpu; it will be flushed upon 507 /* No need to flush the cpu; it will be flushed upon
508 * the final relay_flush() call. */ 508 * the final relay_flush() call. */
509 break; 509 break;
510 } 510 }
511 return NOTIFY_OK; 511 return NOTIFY_OK;
512 } 512 }
513 513
514 /** 514 /**
515 * relay_open - create a new relay channel 515 * relay_open - create a new relay channel
516 * @base_filename: base name of files to create 516 * @base_filename: base name of files to create
517 * @parent: dentry of parent directory, %NULL for root directory 517 * @parent: dentry of parent directory, %NULL for root directory
518 * @subbuf_size: size of sub-buffers 518 * @subbuf_size: size of sub-buffers
519 * @n_subbufs: number of sub-buffers 519 * @n_subbufs: number of sub-buffers
520 * @cb: client callback functions 520 * @cb: client callback functions
521 * @private_data: user-defined data 521 * @private_data: user-defined data
522 * 522 *
523 * Returns channel pointer if successful, %NULL otherwise. 523 * Returns channel pointer if successful, %NULL otherwise.
524 * 524 *
525 * Creates a channel buffer for each cpu using the sizes and 525 * Creates a channel buffer for each cpu using the sizes and
526 * attributes specified. The created channel buffer files 526 * attributes specified. The created channel buffer files
527 * will be named base_filename0...base_filenameN-1. File 527 * will be named base_filename0...base_filenameN-1. File
528 * permissions will be %S_IRUSR. 528 * permissions will be %S_IRUSR.
529 */ 529 */
530 struct rchan *relay_open(const char *base_filename, 530 struct rchan *relay_open(const char *base_filename,
531 struct dentry *parent, 531 struct dentry *parent,
532 size_t subbuf_size, 532 size_t subbuf_size,
533 size_t n_subbufs, 533 size_t n_subbufs,
534 struct rchan_callbacks *cb, 534 struct rchan_callbacks *cb,
535 void *private_data) 535 void *private_data)
536 { 536 {
537 unsigned int i; 537 unsigned int i;
538 struct rchan *chan; 538 struct rchan *chan;
539 if (!base_filename) 539 if (!base_filename)
540 return NULL; 540 return NULL;
541 541
542 if (!(subbuf_size && n_subbufs)) 542 if (!(subbuf_size && n_subbufs))
543 return NULL; 543 return NULL;
544 544
545 chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); 545 chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
546 if (!chan) 546 if (!chan)
547 return NULL; 547 return NULL;
548 548
549 chan->version = RELAYFS_CHANNEL_VERSION; 549 chan->version = RELAYFS_CHANNEL_VERSION;
550 chan->n_subbufs = n_subbufs; 550 chan->n_subbufs = n_subbufs;
551 chan->subbuf_size = subbuf_size; 551 chan->subbuf_size = subbuf_size;
552 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); 552 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
553 chan->parent = parent; 553 chan->parent = parent;
554 chan->private_data = private_data; 554 chan->private_data = private_data;
555 strlcpy(chan->base_filename, base_filename, NAME_MAX); 555 strlcpy(chan->base_filename, base_filename, NAME_MAX);
556 setup_callbacks(chan, cb); 556 setup_callbacks(chan, cb);
557 kref_init(&chan->kref); 557 kref_init(&chan->kref);
558 558
559 mutex_lock(&relay_channels_mutex); 559 mutex_lock(&relay_channels_mutex);
560 for_each_online_cpu(i) { 560 for_each_online_cpu(i) {
561 chan->buf[i] = relay_open_buf(chan, i); 561 chan->buf[i] = relay_open_buf(chan, i);
562 if (!chan->buf[i]) 562 if (!chan->buf[i])
563 goto free_bufs; 563 goto free_bufs;
564 } 564 }
565 list_add(&chan->list, &relay_channels); 565 list_add(&chan->list, &relay_channels);
566 mutex_unlock(&relay_channels_mutex); 566 mutex_unlock(&relay_channels_mutex);
567 567
568 return chan; 568 return chan;
569 569
570 free_bufs: 570 free_bufs:
571 for_each_online_cpu(i) { 571 for_each_online_cpu(i) {
572 if (!chan->buf[i]) 572 if (!chan->buf[i])
573 break; 573 break;
574 relay_close_buf(chan->buf[i]); 574 relay_close_buf(chan->buf[i]);
575 } 575 }
576 576
577 kref_put(&chan->kref, relay_destroy_channel); 577 kref_put(&chan->kref, relay_destroy_channel);
578 mutex_unlock(&relay_channels_mutex); 578 mutex_unlock(&relay_channels_mutex);
579 return NULL; 579 return NULL;
580 } 580 }
581 EXPORT_SYMBOL_GPL(relay_open); 581 EXPORT_SYMBOL_GPL(relay_open);
582 582
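A sketch of the call from a client's init path, reusing the hypothetical my_callbacks above and a previously created debugfs directory my_dir; the sizes are illustrative only:

struct rchan *chan;

/* eight 32KB sub-buffers per cpu; files will be named cpu0...cpuN-1 */
chan = relay_open("cpu", my_dir, 32768, 8, &my_callbacks, NULL);
if (!chan)
        return -ENOMEM;
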
583 /** 583 /**
584 * relay_switch_subbuf - switch to a new sub-buffer 584 * relay_switch_subbuf - switch to a new sub-buffer
585 * @buf: channel buffer 585 * @buf: channel buffer
586 * @length: size of current event 586 * @length: size of current event
587 * 587 *
588 * Returns either the length passed in or 0 if full. 588 * Returns either the length passed in or 0 if full.
589 * 589 *
590 * Performs sub-buffer-switch tasks such as invoking callbacks, 590 * Performs sub-buffer-switch tasks such as invoking callbacks,
591 * updating padding counts, waking up readers, etc. 591 * updating padding counts, waking up readers, etc.
592 */ 592 */
593 size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) 593 size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
594 { 594 {
595 void *old, *new; 595 void *old, *new;
596 size_t old_subbuf, new_subbuf; 596 size_t old_subbuf, new_subbuf;
597 597
598 if (unlikely(length > buf->chan->subbuf_size)) 598 if (unlikely(length > buf->chan->subbuf_size))
599 goto toobig; 599 goto toobig;
600 600
601 if (buf->offset != buf->chan->subbuf_size + 1) { 601 if (buf->offset != buf->chan->subbuf_size + 1) {
602 buf->prev_padding = buf->chan->subbuf_size - buf->offset; 602 buf->prev_padding = buf->chan->subbuf_size - buf->offset;
603 old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; 603 old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
604 buf->padding[old_subbuf] = buf->prev_padding; 604 buf->padding[old_subbuf] = buf->prev_padding;
605 buf->subbufs_produced++; 605 buf->subbufs_produced++;
606 buf->dentry->d_inode->i_size += buf->chan->subbuf_size - 606 buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
607 buf->padding[old_subbuf]; 607 buf->padding[old_subbuf];
608 smp_mb(); 608 smp_mb();
609 if (waitqueue_active(&buf->read_wait)) 609 if (waitqueue_active(&buf->read_wait))
610 /* 610 /*
611 * Calling wake_up_interruptible() from here 611 * Calling wake_up_interruptible() from here
612 * will deadlock if we happen to be logging 612 * will deadlock if we happen to be logging
613 * from the scheduler (trying to re-grab 613 * from the scheduler (trying to re-grab
614 * rq->lock), so defer it. 614 * rq->lock), so defer it.
615 */ 615 */
616 __mod_timer(&buf->timer, jiffies + 1); 616 __mod_timer(&buf->timer, jiffies + 1);
617 } 617 }
618 618
619 old = buf->data; 619 old = buf->data;
620 new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; 620 new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
621 new = buf->start + new_subbuf * buf->chan->subbuf_size; 621 new = buf->start + new_subbuf * buf->chan->subbuf_size;
622 buf->offset = 0; 622 buf->offset = 0;
623 if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) { 623 if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
624 buf->offset = buf->chan->subbuf_size + 1; 624 buf->offset = buf->chan->subbuf_size + 1;
625 return 0; 625 return 0;
626 } 626 }
627 buf->data = new; 627 buf->data = new;
628 buf->padding[new_subbuf] = 0; 628 buf->padding[new_subbuf] = 0;
629 629
630 if (unlikely(length + buf->offset > buf->chan->subbuf_size)) 630 if (unlikely(length + buf->offset > buf->chan->subbuf_size))
631 goto toobig; 631 goto toobig;
632 632
633 return length; 633 return length;
634 634
635 toobig: 635 toobig:
636 buf->chan->last_toobig = length; 636 buf->chan->last_toobig = length;
637 return 0; 637 return 0;
638 } 638 }
639 EXPORT_SYMBOL_GPL(relay_switch_subbuf); 639 EXPORT_SYMBOL_GPL(relay_switch_subbuf);
640 640
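The usual caller is the write fast path in the relay header: when the current sub-buffer cannot hold an event, it asks relay_switch_subbuf() for room and drops the event on a 0 return. A sketch of that idea, not the verbatim relay_reserve() from the header, and ignoring the interrupt disabling the real write path does:

static inline void *my_relay_reserve(struct rchan *chan, size_t length)
{
        struct rchan_buf *buf = chan->buf[smp_processor_id()];
        void *reserved;

        if (buf->offset + length > chan->subbuf_size) {
                length = relay_switch_subbuf(buf, length);
                if (!length)
                        return NULL;    /* full, or event too big: dropped */
        }
        reserved = buf->data + buf->offset;
        buf->offset += length;

        return reserved;
}
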
641 /** 641 /**
642 * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count 642 * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count
643 * @chan: the channel 643 * @chan: the channel
644 * @cpu: the cpu associated with the channel buffer to update 644 * @cpu: the cpu associated with the channel buffer to update
645 * @subbufs_consumed: number of sub-buffers to add to current buf's count 645 * @subbufs_consumed: number of sub-buffers to add to current buf's count
646 * 646 *
647 * Adds to the channel buffer's consumed sub-buffer count. 647 * Adds to the channel buffer's consumed sub-buffer count.
648 * subbufs_consumed should be the number of sub-buffers newly consumed, 648 * subbufs_consumed should be the number of sub-buffers newly consumed,
649 * not the total consumed. 649 * not the total consumed.
650 * 650 *
651 * NOTE. Kernel clients don't need to call this function if the channel 651 * NOTE. Kernel clients don't need to call this function if the channel
652 * mode is 'overwrite'. 652 * mode is 'overwrite'.
653 */ 653 */
654 void relay_subbufs_consumed(struct rchan *chan, 654 void relay_subbufs_consumed(struct rchan *chan,
655 unsigned int cpu, 655 unsigned int cpu,
656 size_t subbufs_consumed) 656 size_t subbufs_consumed)
657 { 657 {
658 struct rchan_buf *buf; 658 struct rchan_buf *buf;
659 659
660 if (!chan) 660 if (!chan)
661 return; 661 return;
662 662
663 if (cpu >= NR_CPUS || !chan->buf[cpu]) 663 if (cpu >= NR_CPUS || !chan->buf[cpu])
664 return; 664 return;
665 665
666 buf = chan->buf[cpu]; 666 buf = chan->buf[cpu];
667 buf->subbufs_consumed += subbufs_consumed; 667 buf->subbufs_consumed += subbufs_consumed;
668 if (buf->subbufs_consumed > buf->subbufs_produced) 668 if (buf->subbufs_consumed > buf->subbufs_produced)
669 buf->subbufs_consumed = buf->subbufs_produced; 669 buf->subbufs_consumed = buf->subbufs_produced;
670 } 670 }
671 EXPORT_SYMBOL_GPL(relay_subbufs_consumed); 671 EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
672 672
673 /** 673 /**
674 * relay_close - close the channel 674 * relay_close - close the channel
675 * @chan: the channel 675 * @chan: the channel
676 * 676 *
677 * Closes all channel buffers and frees the channel. 677 * Closes all channel buffers and frees the channel.
678 */ 678 */
679 void relay_close(struct rchan *chan) 679 void relay_close(struct rchan *chan)
680 { 680 {
681 unsigned int i; 681 unsigned int i;
682 682
683 if (!chan) 683 if (!chan)
684 return; 684 return;
685 685
686 mutex_lock(&relay_channels_mutex); 686 mutex_lock(&relay_channels_mutex);
687 if (chan->is_global && chan->buf[0]) 687 if (chan->is_global && chan->buf[0])
688 relay_close_buf(chan->buf[0]); 688 relay_close_buf(chan->buf[0]);
689 else 689 else
690 for_each_possible_cpu(i) 690 for_each_possible_cpu(i)
691 if (chan->buf[i]) 691 if (chan->buf[i])
692 relay_close_buf(chan->buf[i]); 692 relay_close_buf(chan->buf[i]);
693 693
694 if (chan->last_toobig) 694 if (chan->last_toobig)
695 printk(KERN_WARNING "relay: one or more items not logged " 695 printk(KERN_WARNING "relay: one or more items not logged "
696 "[item size (%Zd) > sub-buffer size (%Zd)]\n", 696 "[item size (%Zd) > sub-buffer size (%Zd)]\n",
697 chan->last_toobig, chan->subbuf_size); 697 chan->last_toobig, chan->subbuf_size);
698 698
699 list_del(&chan->list); 699 list_del(&chan->list);
700 kref_put(&chan->kref, relay_destroy_channel); 700 kref_put(&chan->kref, relay_destroy_channel);
701 mutex_unlock(&relay_channels_mutex); 701 mutex_unlock(&relay_channels_mutex);
702 } 702 }
703 EXPORT_SYMBOL_GPL(relay_close); 703 EXPORT_SYMBOL_GPL(relay_close);
704 704
705 /** 705 /**
706 * relay_flush - flush the channel 706 * relay_flush - flush the channel
707 * @chan: the channel 707 * @chan: the channel
708 * 708 *
709 * Flushes all channel buffers, i.e. forces buffer switch. 709 * Flushes all channel buffers, i.e. forces buffer switch.
710 */ 710 */
711 void relay_flush(struct rchan *chan) 711 void relay_flush(struct rchan *chan)
712 { 712 {
713 unsigned int i; 713 unsigned int i;
714 714
715 if (!chan) 715 if (!chan)
716 return; 716 return;
717 717
718 if (chan->is_global && chan->buf[0]) { 718 if (chan->is_global && chan->buf[0]) {
719 relay_switch_subbuf(chan->buf[0], 0); 719 relay_switch_subbuf(chan->buf[0], 0);
720 return; 720 return;
721 } 721 }
722 722
723 mutex_lock(&relay_channels_mutex); 723 mutex_lock(&relay_channels_mutex);
724 for_each_possible_cpu(i) 724 for_each_possible_cpu(i)
725 if (chan->buf[i]) 725 if (chan->buf[i])
726 relay_switch_subbuf(chan->buf[i], 0); 726 relay_switch_subbuf(chan->buf[i], 0);
727 mutex_unlock(&relay_channels_mutex); 727 mutex_unlock(&relay_channels_mutex);
728 } 728 }
729 EXPORT_SYMBOL_GPL(relay_flush); 729 EXPORT_SYMBOL_GPL(relay_flush);
730 730
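Together with relay_close() above, this gives the usual client teardown order; a sketch, again using the hypothetical chan and my_dir from the earlier sketches:

static void my_teardown(void)
{
        relay_flush(chan);      /* force out partially filled sub-buffers */
        relay_close(chan);      /* remove buffer files, drop the channel */
        debugfs_remove(my_dir);
}
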
731 /** 731 /**
732 * relay_file_open - open file op for relay files 732 * relay_file_open - open file op for relay files
733 * @inode: the inode 733 * @inode: the inode
734 * @filp: the file 734 * @filp: the file
735 * 735 *
736 * Increments the channel buffer refcount. 736 * Increments the channel buffer refcount.
737 */ 737 */
738 static int relay_file_open(struct inode *inode, struct file *filp) 738 static int relay_file_open(struct inode *inode, struct file *filp)
739 { 739 {
740 struct rchan_buf *buf = inode->i_private; 740 struct rchan_buf *buf = inode->i_private;
741 kref_get(&buf->kref); 741 kref_get(&buf->kref);
742 filp->private_data = buf; 742 filp->private_data = buf;
743 743
744 return 0; 744 return 0;
745 } 745 }
746 746
747 /** 747 /**
748 * relay_file_mmap - mmap file op for relay files 748 * relay_file_mmap - mmap file op for relay files
749 * @filp: the file 749 * @filp: the file
750 * @vma: the vma describing what to map 750 * @vma: the vma describing what to map
751 * 751 *
752 * Calls upon relay_mmap_buf() to map the file into user space. 752 * Calls upon relay_mmap_buf() to map the file into user space.
753 */ 753 */
754 static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) 754 static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
755 { 755 {
756 struct rchan_buf *buf = filp->private_data; 756 struct rchan_buf *buf = filp->private_data;
757 return relay_mmap_buf(buf, vma); 757 return relay_mmap_buf(buf, vma);
758 } 758 }
759 759
760 /** 760 /**
761 * relay_file_poll - poll file op for relay files 761 * relay_file_poll - poll file op for relay files
762 * @filp: the file 762 * @filp: the file
763 * @wait: poll table 763 * @wait: poll table
764 * 764 *
765 * Poll implementation. 765 * Poll implementation.
766 */ 766 */
767 static unsigned int relay_file_poll(struct file *filp, poll_table *wait) 767 static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
768 { 768 {
769 unsigned int mask = 0; 769 unsigned int mask = 0;
770 struct rchan_buf *buf = filp->private_data; 770 struct rchan_buf *buf = filp->private_data;
771 771
772 if (buf->finalized) 772 if (buf->finalized)
773 return POLLERR; 773 return POLLERR;
774 774
775 if (filp->f_mode & FMODE_READ) { 775 if (filp->f_mode & FMODE_READ) {
776 poll_wait(filp, &buf->read_wait, wait); 776 poll_wait(filp, &buf->read_wait, wait);
777 if (!relay_buf_empty(buf)) 777 if (!relay_buf_empty(buf))
778 mask |= POLLIN | POLLRDNORM; 778 mask |= POLLIN | POLLRDNORM;
779 } 779 }
780 780
781 return mask; 781 return mask;
782 } 782 }
783 783
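From user space the buffer file behaves like any other pollable descriptor. A sketch of a reader draining one per-cpu file; the path is whatever the client's create_buf_file() hook chose:

#include <poll.h>
#include <fcntl.h>
#include <unistd.h>

static int drain(const char *path)
{
        char data[4096];
        struct pollfd pfd = { .events = POLLIN };
        ssize_t n;

        pfd.fd = open(path, O_RDONLY);
        if (pfd.fd < 0)
                return -1;

        /* POLLERR is reported once the buffer has been finalized */
        while (poll(&pfd, 1, -1) > 0 && !(pfd.revents & POLLERR))
                while ((n = read(pfd.fd, data, sizeof(data))) > 0)
                        ;       /* process n bytes of trace data here */

        close(pfd.fd);
        return 0;
}
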
784 /** 784 /**
785 * relay_file_release - release file op for relay files 785 * relay_file_release - release file op for relay files
786 * @inode: the inode 786 * @inode: the inode
787 * @filp: the file 787 * @filp: the file
788 * 788 *
789 * Decrements the channel buffer refcount, as the filesystem is 789 * Decrements the channel buffer refcount, as the filesystem is
790 * no longer using it. 790 * no longer using it.
791 */ 791 */
792 static int relay_file_release(struct inode *inode, struct file *filp) 792 static int relay_file_release(struct inode *inode, struct file *filp)
793 { 793 {
794 struct rchan_buf *buf = filp->private_data; 794 struct rchan_buf *buf = filp->private_data;
795 kref_put(&buf->kref, relay_remove_buf); 795 kref_put(&buf->kref, relay_remove_buf);
796 796
797 return 0; 797 return 0;
798 } 798 }
799 799
800 /* 800 /*
801 * relay_file_read_consume - update the consumed count for the buffer 801 * relay_file_read_consume - update the consumed count for the buffer
802 */ 802 */
803 static void relay_file_read_consume(struct rchan_buf *buf, 803 static void relay_file_read_consume(struct rchan_buf *buf,
804 size_t read_pos, 804 size_t read_pos,
805 size_t bytes_consumed) 805 size_t bytes_consumed)
806 { 806 {
807 size_t subbuf_size = buf->chan->subbuf_size; 807 size_t subbuf_size = buf->chan->subbuf_size;
808 size_t n_subbufs = buf->chan->n_subbufs; 808 size_t n_subbufs = buf->chan->n_subbufs;
809 size_t read_subbuf; 809 size_t read_subbuf;
810 810
811 if (buf->bytes_consumed + bytes_consumed > subbuf_size) { 811 if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
812 relay_subbufs_consumed(buf->chan, buf->cpu, 1); 812 relay_subbufs_consumed(buf->chan, buf->cpu, 1);
813 buf->bytes_consumed = 0; 813 buf->bytes_consumed = 0;
814 } 814 }
815 815
816 buf->bytes_consumed += bytes_consumed; 816 buf->bytes_consumed += bytes_consumed;
817 if (!read_pos) 817 if (!read_pos)
818 read_subbuf = buf->subbufs_consumed % n_subbufs; 818 read_subbuf = buf->subbufs_consumed % n_subbufs;
819 else 819 else
820 read_subbuf = read_pos / buf->chan->subbuf_size; 820 read_subbuf = read_pos / buf->chan->subbuf_size;
821 if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) { 821 if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
822 if ((read_subbuf == buf->subbufs_produced % n_subbufs) && 822 if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
823 (buf->offset == subbuf_size)) 823 (buf->offset == subbuf_size))
824 return; 824 return;
825 relay_subbufs_consumed(buf->chan, buf->cpu, 1); 825 relay_subbufs_consumed(buf->chan, buf->cpu, 1);
826 buf->bytes_consumed = 0; 826 buf->bytes_consumed = 0;
827 } 827 }
828 } 828 }
829 829
830 /* 830 /*
831 * relay_file_read_avail - boolean, are there unconsumed bytes available? 831 * relay_file_read_avail - boolean, are there unconsumed bytes available?
832 */ 832 */
833 static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) 833 static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
834 { 834 {
835 size_t subbuf_size = buf->chan->subbuf_size; 835 size_t subbuf_size = buf->chan->subbuf_size;
836 size_t n_subbufs = buf->chan->n_subbufs; 836 size_t n_subbufs = buf->chan->n_subbufs;
837 size_t produced = buf->subbufs_produced; 837 size_t produced = buf->subbufs_produced;
838 size_t consumed = buf->subbufs_consumed; 838 size_t consumed = buf->subbufs_consumed;
839 839
840 relay_file_read_consume(buf, read_pos, 0); 840 relay_file_read_consume(buf, read_pos, 0);
841 841
842 if (unlikely(buf->offset > subbuf_size)) { 842 if (unlikely(buf->offset > subbuf_size)) {
843 if (produced == consumed) 843 if (produced == consumed)
844 return 0; 844 return 0;
845 return 1; 845 return 1;
846 } 846 }
847 847
848 if (unlikely(produced - consumed >= n_subbufs)) { 848 if (unlikely(produced - consumed >= n_subbufs)) {
849 consumed = produced - n_subbufs + 1; 849 consumed = produced - n_subbufs + 1;
850 buf->subbufs_consumed = consumed; 850 buf->subbufs_consumed = consumed;
851 buf->bytes_consumed = 0; 851 buf->bytes_consumed = 0;
852 } 852 }
853 853
854 produced = (produced % n_subbufs) * subbuf_size + buf->offset; 854 produced = (produced % n_subbufs) * subbuf_size + buf->offset;
855 consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed; 855 consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed;
856 856
857 if (consumed > produced) 857 if (consumed > produced)
858 produced += n_subbufs * subbuf_size; 858 produced += n_subbufs * subbuf_size;
859 859
860 if (consumed == produced) 860 if (consumed == produced)
861 return 0; 861 return 0;
862 862
863 return 1; 863 return 1;
864 } 864 }
865 865
866 /** 866 /**
867 * relay_file_read_subbuf_avail - return bytes available in sub-buffer 867 * relay_file_read_subbuf_avail - return bytes available in sub-buffer
868 * @read_pos: file read position 868 * @read_pos: file read position
869 * @buf: relay channel buffer 869 * @buf: relay channel buffer
870 */ 870 */
871 static size_t relay_file_read_subbuf_avail(size_t read_pos, 871 static size_t relay_file_read_subbuf_avail(size_t read_pos,
872 struct rchan_buf *buf) 872 struct rchan_buf *buf)
873 { 873 {
874 size_t padding, avail = 0; 874 size_t padding, avail = 0;
875 size_t read_subbuf, read_offset, write_subbuf, write_offset; 875 size_t read_subbuf, read_offset, write_subbuf, write_offset;
876 size_t subbuf_size = buf->chan->subbuf_size; 876 size_t subbuf_size = buf->chan->subbuf_size;
877 877
878 write_subbuf = (buf->data - buf->start) / subbuf_size; 878 write_subbuf = (buf->data - buf->start) / subbuf_size;
879 write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset; 879 write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
880 read_subbuf = read_pos / subbuf_size; 880 read_subbuf = read_pos / subbuf_size;
881 read_offset = read_pos % subbuf_size; 881 read_offset = read_pos % subbuf_size;
882 padding = buf->padding[read_subbuf]; 882 padding = buf->padding[read_subbuf];
883 883
884 if (read_subbuf == write_subbuf) { 884 if (read_subbuf == write_subbuf) {
885 if (read_offset + padding < write_offset) 885 if (read_offset + padding < write_offset)
886 avail = write_offset - (read_offset + padding); 886 avail = write_offset - (read_offset + padding);
887 } else 887 } else
888 avail = (subbuf_size - padding) - read_offset; 888 avail = (subbuf_size - padding) - read_offset;
889 889
890 return avail; 890 return avail;
891 } 891 }
892 892
893 /** 893 /**
894 * relay_file_read_start_pos - find the first available byte to read 894 * relay_file_read_start_pos - find the first available byte to read
895 * @read_pos: file read position 895 * @read_pos: file read position
896 * @buf: relay channel buffer 896 * @buf: relay channel buffer
897 * 897 *
898 * If the @read_pos is in the middle of padding, return the 898 * If the @read_pos is in the middle of padding, return the
899 * position of the first actually available byte, otherwise 899 * position of the first actually available byte, otherwise
900 * return the original value. 900 * return the original value.
901 */ 901 */
902 static size_t relay_file_read_start_pos(size_t read_pos, 902 static size_t relay_file_read_start_pos(size_t read_pos,
903 struct rchan_buf *buf) 903 struct rchan_buf *buf)
904 { 904 {
905 size_t read_subbuf, padding, padding_start, padding_end; 905 size_t read_subbuf, padding, padding_start, padding_end;
906 size_t subbuf_size = buf->chan->subbuf_size; 906 size_t subbuf_size = buf->chan->subbuf_size;
907 size_t n_subbufs = buf->chan->n_subbufs; 907 size_t n_subbufs = buf->chan->n_subbufs;
908 size_t consumed = buf->subbufs_consumed % n_subbufs; 908 size_t consumed = buf->subbufs_consumed % n_subbufs;
909 909
910 if (!read_pos) 910 if (!read_pos)
911 read_pos = consumed * subbuf_size + buf->bytes_consumed; 911 read_pos = consumed * subbuf_size + buf->bytes_consumed;
912 read_subbuf = read_pos / subbuf_size; 912 read_subbuf = read_pos / subbuf_size;
913 padding = buf->padding[read_subbuf]; 913 padding = buf->padding[read_subbuf];
914 padding_start = (read_subbuf + 1) * subbuf_size - padding; 914 padding_start = (read_subbuf + 1) * subbuf_size - padding;
915 padding_end = (read_subbuf + 1) * subbuf_size; 915 padding_end = (read_subbuf + 1) * subbuf_size;
916 if (read_pos >= padding_start && read_pos < padding_end) { 916 if (read_pos >= padding_start && read_pos < padding_end) {
917 read_subbuf = (read_subbuf + 1) % n_subbufs; 917 read_subbuf = (read_subbuf + 1) % n_subbufs;
918 read_pos = read_subbuf * subbuf_size; 918 read_pos = read_subbuf * subbuf_size;
919 } 919 }
920 920
921 return read_pos; 921 return read_pos;
922 } 922 }
923 923
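A concrete illustration of the padding skip, with made-up numbers: if subbuf_size is 4096 and padding[0] is 96, the padding of sub-buffer 0 occupies positions 4000..4095; a read_pos of 4032 falls in that window, so it is advanced to 4096, the first byte of sub-buffer 1.
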
924 /** 924 /**
925 * relay_file_read_end_pos - return the new read position 925 * relay_file_read_end_pos - return the new read position
926 * @buf: relay channel buffer 926 * @buf: relay channel buffer
927 * @read_pos: file read position 927 * @read_pos: file read position
928 * @count: number of bytes to be read 928 * @count: number of bytes to be read
929 */ 929 */
930 static size_t relay_file_read_end_pos(struct rchan_buf *buf, 930 static size_t relay_file_read_end_pos(struct rchan_buf *buf,
931 size_t read_pos, 931 size_t read_pos,
932 size_t count) 932 size_t count)
933 { 933 {
934 size_t read_subbuf, padding, end_pos; 934 size_t read_subbuf, padding, end_pos;
935 size_t subbuf_size = buf->chan->subbuf_size; 935 size_t subbuf_size = buf->chan->subbuf_size;
936 size_t n_subbufs = buf->chan->n_subbufs; 936 size_t n_subbufs = buf->chan->n_subbufs;
937 937
938 read_subbuf = read_pos / subbuf_size; 938 read_subbuf = read_pos / subbuf_size;
939 padding = buf->padding[read_subbuf]; 939 padding = buf->padding[read_subbuf];
940 if (read_pos % subbuf_size + count + padding == subbuf_size) 940 if (read_pos % subbuf_size + count + padding == subbuf_size)
941 end_pos = (read_subbuf + 1) * subbuf_size; 941 end_pos = (read_subbuf + 1) * subbuf_size;
942 else 942 else
943 end_pos = read_pos + count; 943 end_pos = read_pos + count;
944 if (end_pos >= subbuf_size * n_subbufs) 944 if (end_pos >= subbuf_size * n_subbufs)
945 end_pos = 0; 945 end_pos = 0;
946 946
947 return end_pos; 947 return end_pos;
948 } 948 }
949 949
950 /* 950 /*
951 * subbuf_read_actor - read up to one subbuf's worth of data 951 * subbuf_read_actor - read up to one subbuf's worth of data
952 */ 952 */
953 static int subbuf_read_actor(size_t read_start, 953 static int subbuf_read_actor(size_t read_start,
954 struct rchan_buf *buf, 954 struct rchan_buf *buf,
955 size_t avail, 955 size_t avail,
956 read_descriptor_t *desc, 956 read_descriptor_t *desc,
957 read_actor_t actor) 957 read_actor_t actor)
958 { 958 {
959 void *from; 959 void *from;
960 int ret = 0; 960 int ret = 0;
961 961
962 from = buf->start + read_start; 962 from = buf->start + read_start;
963 ret = avail; 963 ret = avail;
964 if (copy_to_user(desc->arg.buf, from, avail)) { 964 if (copy_to_user(desc->arg.buf, from, avail)) {
965 desc->error = -EFAULT; 965 desc->error = -EFAULT;
966 ret = 0; 966 ret = 0;
967 } 967 }
968 desc->arg.data += ret; 968 desc->arg.data += ret;
969 desc->written += ret; 969 desc->written += ret;
970 desc->count -= ret; 970 desc->count -= ret;
971 971
972 return ret; 972 return ret;
973 } 973 }
974 974
975 typedef int (*subbuf_actor_t) (size_t read_start, 975 typedef int (*subbuf_actor_t) (size_t read_start,
976 struct rchan_buf *buf, 976 struct rchan_buf *buf,
977 size_t avail, 977 size_t avail,
978 read_descriptor_t *desc, 978 read_descriptor_t *desc,
979 read_actor_t actor); 979 read_actor_t actor);
980 980
981 /* 981 /*
982 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries 982 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
983 */ 983 */
984 static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, 984 static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
985 subbuf_actor_t subbuf_actor, 985 subbuf_actor_t subbuf_actor,
986 read_actor_t actor, 986 read_actor_t actor,
987 read_descriptor_t *desc) 987 read_descriptor_t *desc)
988 { 988 {
989 struct rchan_buf *buf = filp->private_data; 989 struct rchan_buf *buf = filp->private_data;
990 size_t read_start, avail; 990 size_t read_start, avail;
991 int ret; 991 int ret;
992 992
993 if (!desc->count) 993 if (!desc->count)
994 return 0; 994 return 0;
995 995
996 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); 996 mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
997 do { 997 do {
998 if (!relay_file_read_avail(buf, *ppos)) 998 if (!relay_file_read_avail(buf, *ppos))
999 break; 999 break;
1000 1000
1001 read_start = relay_file_read_start_pos(*ppos, buf); 1001 read_start = relay_file_read_start_pos(*ppos, buf);
1002 avail = relay_file_read_subbuf_avail(read_start, buf); 1002 avail = relay_file_read_subbuf_avail(read_start, buf);
1003 if (!avail) 1003 if (!avail)
1004 break; 1004 break;
1005 1005
1006 avail = min(desc->count, avail); 1006 avail = min(desc->count, avail);
1007 ret = subbuf_actor(read_start, buf, avail, desc, actor); 1007 ret = subbuf_actor(read_start, buf, avail, desc, actor);
1008 if (desc->error < 0) 1008 if (desc->error < 0)
1009 break; 1009 break;
1010 1010
1011 if (ret) { 1011 if (ret) {
1012 relay_file_read_consume(buf, read_start, ret); 1012 relay_file_read_consume(buf, read_start, ret);
1013 *ppos = relay_file_read_end_pos(buf, read_start, ret); 1013 *ppos = relay_file_read_end_pos(buf, read_start, ret);
1014 } 1014 }
1015 } while (desc->count && ret); 1015 } while (desc->count && ret);
1016 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); 1016 mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
1017 1017
1018 return desc->written; 1018 return desc->written;
1019 } 1019 }
1020 1020
1021 static ssize_t relay_file_read(struct file *filp, 1021 static ssize_t relay_file_read(struct file *filp,
1022 char __user *buffer, 1022 char __user *buffer,
1023 size_t count, 1023 size_t count,
1024 loff_t *ppos) 1024 loff_t *ppos)
1025 { 1025 {
1026 read_descriptor_t desc; 1026 read_descriptor_t desc;
1027 desc.written = 0; 1027 desc.written = 0;
1028 desc.count = count; 1028 desc.count = count;
1029 desc.arg.buf = buffer; 1029 desc.arg.buf = buffer;
1030 desc.error = 0; 1030 desc.error = 0;
1031 return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, 1031 return relay_file_read_subbufs(filp, ppos, subbuf_read_actor,
1032 NULL, &desc); 1032 NULL, &desc);
1033 } 1033 }
1034 1034
1035 static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) 1035 static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
1036 { 1036 {
1037 rbuf->bytes_consumed += bytes_consumed; 1037 rbuf->bytes_consumed += bytes_consumed;
1038 1038
1039 if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) { 1039 if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) {
1040 relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1); 1040 relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1);
1041 rbuf->bytes_consumed %= rbuf->chan->subbuf_size; 1041 rbuf->bytes_consumed %= rbuf->chan->subbuf_size;
1042 } 1042 }
1043 } 1043 }
1044 1044
1045 static void relay_pipe_buf_release(struct pipe_inode_info *pipe, 1045 static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
1046 struct pipe_buffer *buf) 1046 struct pipe_buffer *buf)
1047 { 1047 {
1048 struct rchan_buf *rbuf; 1048 struct rchan_buf *rbuf;
1049 1049
1050 rbuf = (struct rchan_buf *)page_private(buf->page); 1050 rbuf = (struct rchan_buf *)page_private(buf->page);
1051 relay_consume_bytes(rbuf, buf->private); 1051 relay_consume_bytes(rbuf, buf->private);
1052 } 1052 }
1053 1053
1054 static struct pipe_buf_operations relay_pipe_buf_ops = { 1054 static struct pipe_buf_operations relay_pipe_buf_ops = {
1055 .can_merge = 0, 1055 .can_merge = 0,
1056 .map = generic_pipe_buf_map, 1056 .map = generic_pipe_buf_map,
1057 .unmap = generic_pipe_buf_unmap, 1057 .unmap = generic_pipe_buf_unmap,
1058 .pin = generic_pipe_buf_pin, 1058 .confirm = generic_pipe_buf_confirm,
1059 .release = relay_pipe_buf_release, 1059 .release = relay_pipe_buf_release,
1060 .steal = generic_pipe_buf_steal, 1060 .steal = generic_pipe_buf_steal,
1061 .get = generic_pipe_buf_get, 1061 .get = generic_pipe_buf_get,
1062 }; 1062 };
1063 1063
1064 /* 1064 /*
1065 * subbuf_splice_actor - splice up to one subbuf's worth of data 1065 * subbuf_splice_actor - splice up to one subbuf's worth of data
1066 */ 1066 */
1067 static int subbuf_splice_actor(struct file *in, 1067 static int subbuf_splice_actor(struct file *in,
1068 loff_t *ppos, 1068 loff_t *ppos,
1069 struct pipe_inode_info *pipe, 1069 struct pipe_inode_info *pipe,
1070 size_t len, 1070 size_t len,
1071 unsigned int flags, 1071 unsigned int flags,
1072 int *nonpad_ret) 1072 int *nonpad_ret)
1073 { 1073 {
1074 unsigned int pidx, poff, total_len, subbuf_pages; int ret; 1074 unsigned int pidx, poff, total_len, subbuf_pages; int ret;
1075 struct rchan_buf *rbuf = in->private_data; 1075 struct rchan_buf *rbuf = in->private_data;
1076 unsigned int subbuf_size = rbuf->chan->subbuf_size; 1076 unsigned int subbuf_size = rbuf->chan->subbuf_size;
1077 size_t read_start = ((size_t)*ppos) % rbuf->chan->alloc_size; 1077 size_t read_start = ((size_t)*ppos) % rbuf->chan->alloc_size;
1078 size_t read_subbuf = read_start / subbuf_size; 1078 size_t read_subbuf = read_start / subbuf_size;
1079 size_t padding = rbuf->padding[read_subbuf]; 1079 size_t padding = rbuf->padding[read_subbuf];
1080 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; 1080 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
1081 struct page *pages[PIPE_BUFFERS]; 1081 struct page *pages[PIPE_BUFFERS];
1082 struct partial_page partial[PIPE_BUFFERS]; 1082 struct partial_page partial[PIPE_BUFFERS];
1083 struct splice_pipe_desc spd = { 1083 struct splice_pipe_desc spd = {
1084 .pages = pages, 1084 .pages = pages,
1085 .nr_pages = 0, 1085 .nr_pages = 0,
1086 .partial = partial, 1086 .partial = partial,
1087 .flags = flags, 1087 .flags = flags,
1088 .ops = &relay_pipe_buf_ops, 1088 .ops = &relay_pipe_buf_ops,
1089 }; 1089 };
1090 1090
1091 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1091 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1092 return 0; 1092 return 0;
1093 1093
1094 /* 1094 /*
1095 * Adjust read len, if longer than what is available 1095 * Adjust read len, if longer than what is available
1096 */ 1096 */
1097 if (len > (subbuf_size - read_start % subbuf_size)) 1097 if (len > (subbuf_size - read_start % subbuf_size))
1098 len = subbuf_size - read_start % subbuf_size; 1098 len = subbuf_size - read_start % subbuf_size;
1099 1099
1100 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; 1100 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
1101 pidx = (read_start / PAGE_SIZE) % subbuf_pages; 1101 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
1102 poff = read_start & ~PAGE_MASK; 1102 poff = read_start & ~PAGE_MASK;
1103 1103
1104 for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) { 1104 for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) {
1105 unsigned int this_len, this_end, private; 1105 unsigned int this_len, this_end, private;
1106 unsigned int cur_pos = read_start + total_len; 1106 unsigned int cur_pos = read_start + total_len;
1107 1107
1108 if (!len) 1108 if (!len)
1109 break; 1109 break;
1110 1110
1111 this_len = min_t(unsigned long, len, PAGE_SIZE - poff); 1111 this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
1112 private = this_len; 1112 private = this_len;
1113 1113
1114 spd.pages[spd.nr_pages] = rbuf->page_array[pidx]; 1114 spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
1115 spd.partial[spd.nr_pages].offset = poff; 1115 spd.partial[spd.nr_pages].offset = poff;
1116 1116
1117 this_end = cur_pos + this_len; 1117 this_end = cur_pos + this_len;
1118 if (this_end >= nonpad_end) { 1118 if (this_end >= nonpad_end) {
1119 this_len = nonpad_end - cur_pos; 1119 this_len = nonpad_end - cur_pos;
1120 private = this_len + padding; 1120 private = this_len + padding;
1121 } 1121 }
1122 spd.partial[spd.nr_pages].len = this_len; 1122 spd.partial[spd.nr_pages].len = this_len;
1123 spd.partial[spd.nr_pages].private = private; 1123 spd.partial[spd.nr_pages].private = private;
1124 1124
1125 len -= this_len; 1125 len -= this_len;
1126 total_len += this_len; 1126 total_len += this_len;
1127 poff = 0; 1127 poff = 0;
1128 pidx = (pidx + 1) % subbuf_pages; 1128 pidx = (pidx + 1) % subbuf_pages;
1129 1129
1130 if (this_end >= nonpad_end) { 1130 if (this_end >= nonpad_end) {
1131 spd.nr_pages++; 1131 spd.nr_pages++;
1132 break; 1132 break;
1133 } 1133 }
1134 } 1134 }
1135 1135
1136 if (!spd.nr_pages) 1136 if (!spd.nr_pages)
1137 return 0; 1137 return 0;
1138 1138
1139 ret = *nonpad_ret = splice_to_pipe(pipe, &spd); 1139 ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
1140 if (ret < 0 || ret < total_len) 1140 if (ret < 0 || ret < total_len)
1141 return ret; 1141 return ret;
1142 1142
1143 if (read_start + ret == nonpad_end) 1143 if (read_start + ret == nonpad_end)
1144 ret += padding; 1144 ret += padding;
1145 1145
1146 return ret; 1146 return ret;
1147 } 1147 }
1148 1148
1149 static ssize_t relay_file_splice_read(struct file *in, 1149 static ssize_t relay_file_splice_read(struct file *in,
1150 loff_t *ppos, 1150 loff_t *ppos,
1151 struct pipe_inode_info *pipe, 1151 struct pipe_inode_info *pipe,
1152 size_t len, 1152 size_t len,
1153 unsigned int flags) 1153 unsigned int flags)
1154 { 1154 {
1155 ssize_t spliced; 1155 ssize_t spliced;
1156 int ret; 1156 int ret;
1157 int nonpad_ret = 0; 1157 int nonpad_ret = 0;
1158 1158
1159 ret = 0; 1159 ret = 0;
1160 spliced = 0; 1160 spliced = 0;
1161 1161
1162 while (len) { 1162 while (len) {
1163 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); 1163 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
1164 if (ret < 0) 1164 if (ret < 0)
1165 break; 1165 break;
1166 else if (!ret) { 1166 else if (!ret) {
1167 if (spliced) 1167 if (spliced)
1168 break; 1168 break;
1169 if (flags & SPLICE_F_NONBLOCK) { 1169 if (flags & SPLICE_F_NONBLOCK) {
1170 ret = -EAGAIN; 1170 ret = -EAGAIN;
1171 break; 1171 break;
1172 } 1172 }
1173 } 1173 }
1174 1174
1175 *ppos += ret; 1175 *ppos += ret;
1176 if (ret > len) 1176 if (ret > len)
1177 len = 0; 1177 len = 0;
1178 else 1178 else
1179 len -= ret; 1179 len -= ret;
1180 spliced += nonpad_ret; 1180 spliced += nonpad_ret;
1181 nonpad_ret = 0; 1181 nonpad_ret = 0;
1182 } 1182 }
1183 1183
1184 if (spliced) 1184 if (spliced)
1185 return spliced; 1185 return spliced;
1186 1186
1187 return ret; 1187 return ret;
1188 } 1188 }
1189 1189
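This splice path is what lets a user-space reader move relay data toward another descriptor without copying it through user memory. A sketch, with a hypothetical buffer-file path and stdout as the destination:

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static int splice_out(const char *path)
{
        int pfd[2], in = open(path, O_RDONLY);
        ssize_t n;

        if (in < 0 || pipe(pfd) < 0)
                return -1;

        /* relay file -> pipe -> stdout, no user-space copy; stops
         * once no data is immediately available (EAGAIN) */
        while ((n = splice(in, NULL, pfd[1], NULL, 65536,
                           SPLICE_F_NONBLOCK)) > 0)
                splice(pfd[0], NULL, STDOUT_FILENO, NULL, n, 0);

        close(pfd[0]); close(pfd[1]); close(in);
        return 0;
}
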
1190 const struct file_operations relay_file_operations = { 1190 const struct file_operations relay_file_operations = {
1191 .open = relay_file_open, 1191 .open = relay_file_open,
1192 .poll = relay_file_poll, 1192 .poll = relay_file_poll,
1193 .mmap = relay_file_mmap, 1193 .mmap = relay_file_mmap,
1194 .read = relay_file_read, 1194 .read = relay_file_read,
1195 .llseek = no_llseek, 1195 .llseek = no_llseek,
1196 .release = relay_file_release, 1196 .release = relay_file_release,
1197 .splice_read = relay_file_splice_read, 1197 .splice_read = relay_file_splice_read,
1198 }; 1198 };
1199 EXPORT_SYMBOL_GPL(relay_file_operations); 1199 EXPORT_SYMBOL_GPL(relay_file_operations);
1200 1200
1201 static __init int relay_init(void) 1201 static __init int relay_init(void)
1202 { 1202 {
1203 1203
1204 hotcpu_notifier(relay_hotcpu_callback, 0); 1204 hotcpu_notifier(relay_hotcpu_callback, 0);
1205 return 0; 1205 return 0;
1206 } 1206 }
1207 1207
1208 module_init(relay_init); 1208 module_init(relay_init);
1209 1209