Blame view

fs/aio.c 54.8 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
  /*
   *	An async IO implementation for Linux
   *	Written by Benjamin LaHaise <bcrl@kvack.org>
   *
   *	Implements an efficient asynchronous io interface.
   *
   *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
bfe4037e7   Christoph Hellwig   aio: implement IO...
8
   *	Copyright 2018 Christoph Hellwig.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
9
10
11
   *
   *	See ../COPYING for licensing terms.
   */
caf4167aa   Kent Overstreet   aio: dprintk() ->...
12
  #define pr_fmt(fmt) "%s: " fmt, __func__
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
13
14
15
16
17
  #include <linux/kernel.h>
  #include <linux/init.h>
  #include <linux/errno.h>
  #include <linux/time.h>
  #include <linux/aio_abi.h>
630d9c472   Paul Gortmaker   fs: reduce the us...
18
  #include <linux/export.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
19
  #include <linux/syscalls.h>
b9d128f10   Jens Axboe   block: move bdi/a...
20
  #include <linux/backing-dev.h>
9018ccc45   Christoph Hellwig   aio: add a iocb r...
21
  #include <linux/refcount.h>
027445c37   Badari Pulavarty   [PATCH] Vectorize...
22
  #include <linux/uio.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
23

174cd4b1e   Ingo Molnar   sched/headers: Pr...
24
  #include <linux/sched/signal.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
25
26
27
28
  #include <linux/fs.h>
  #include <linux/file.h>
  #include <linux/mm.h>
  #include <linux/mman.h>
3d2d827f5   Michael S. Tsirkin   mm: move use_mm/u...
29
  #include <linux/mmu_context.h>
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
30
  #include <linux/percpu.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
31
32
33
34
35
36
  #include <linux/slab.h>
  #include <linux/timer.h>
  #include <linux/aio.h>
  #include <linux/highmem.h>
  #include <linux/workqueue.h>
  #include <linux/security.h>
9c3060bed   Davide Libenzi   signal/timer/even...
37
  #include <linux/eventfd.h>
cfb1e33ee   Jeff Moyer   aio: implement re...
38
  #include <linux/blkdev.h>
9d85cba71   Jeff Moyer   aio: fix the comp...
39
  #include <linux/compat.h>
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
40
41
  #include <linux/migrate.h>
  #include <linux/ramfs.h>
723be6e39   Kent Overstreet   aio: percpu ioctx...
42
  #include <linux/percpu-refcount.h>
71ad7490c   Benjamin LaHaise   rework aio migrat...
43
  #include <linux/mount.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
44
45
  
  #include <asm/kmap_types.h>
7c0f6ba68   Linus Torvalds   Replace <asm/uacc...
46
  #include <linux/uaccess.h>
a6136922d   Jeff Moyer   aio: fix spectre ...
47
  #include <linux/nospec.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
48

68d70d03f   Al Viro   constify rw_verif...
49
  #include "internal.h"
f3a2752a4   Christoph Hellwig   aio: simplify KIO...
50
  #define KIOCB_KEY		0
4e179bca6   Kent Overstreet   aio: move private...
51
52
53
54
55
56
  #define AIO_RING_MAGIC			0xa10a10a1
  #define AIO_RING_COMPAT_FEATURES	1
  #define AIO_RING_INCOMPAT_FEATURES	0
  struct aio_ring {
  	unsigned	id;	/* kernel internal index number */
  	unsigned	nr;	/* number of io_events */
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
57
58
  	unsigned	head;	/* Written to by userland or under ring_lock
  				 * mutex by aio_read_events_ring(). */
4e179bca6   Kent Overstreet   aio: move private...
59
60
61
62
63
64
65
66
67
68
69
70
  	unsigned	tail;
  
  	unsigned	magic;
  	unsigned	compat_features;
  	unsigned	incompat_features;
  	unsigned	header_length;	/* size of aio_ring */
  
  
  	struct io_event		io_events[0];
  }; /* 128 bytes + ring size */
  
  #define AIO_RING_PAGES	8
4e179bca6   Kent Overstreet   aio: move private...
71

db446a08c   Benjamin LaHaise   aio: convert the ...
72
  struct kioctx_table {
d0264c01e   Tejun Heo   fs/aio: Use RCU a...
73
74
75
  	struct rcu_head		rcu;
  	unsigned		nr;
  	struct kioctx __rcu	*table[];
db446a08c   Benjamin LaHaise   aio: convert the ...
76
  };
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
77
78
79
  /*
   * Per-CPU part of a kioctx (kioctx.cpu points at it): the percpu version of
   * the context-wide reqs_available counter.
   */
  struct kioctx_cpu {
  	unsigned		reqs_available;
  };
dc48e56d7   Jens Axboe   aio: fix serial d...
80
81
82
83
  /*
   * Waiter state referenced through kioctx.rq_wait.  free_ioctx_reqs()
   * decrements @count and signals @comp once it drops to zero.
   */
  struct ctx_rq_wait {
  	struct completion comp;
  	atomic_t count;
  };
4e179bca6   Kent Overstreet   aio: move private...
84
  struct kioctx {
723be6e39   Kent Overstreet   aio: percpu ioctx...
85
  	struct percpu_ref	users;
36f558890   Kent Overstreet   aio: refcounting ...
86
  	atomic_t		dead;
4e179bca6   Kent Overstreet   aio: move private...
87

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
88
  	struct percpu_ref	reqs;
4e179bca6   Kent Overstreet   aio: move private...
89
  	unsigned long		user_id;
4e179bca6   Kent Overstreet   aio: move private...
90

e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
91
92
93
94
95
96
97
  	struct __percpu kioctx_cpu *cpu;
  
  	/*
  	 * For percpu reqs_available, number of slots we move to/from global
  	 * counter at a time:
  	 */
  	unsigned		req_batch;
3e845ce01   Kent Overstreet   aio: change reqs_...
98
99
100
101
  	/*
  	 * This is what userspace passed to io_setup(), it's not used for
  	 * anything but counting against the global max_reqs quota.
  	 *
58c85dc20   Kent Overstreet   aio: kill struct ...
102
  	 * The real limit is nr_events - 1, which will be larger (see
3e845ce01   Kent Overstreet   aio: change reqs_...
103
104
  	 * aio_setup_ring())
  	 */
4e179bca6   Kent Overstreet   aio: move private...
105
  	unsigned		max_reqs;
58c85dc20   Kent Overstreet   aio: kill struct ...
106
107
  	/* Size of ringbuffer, in units of struct io_event */
  	unsigned		nr_events;
4e179bca6   Kent Overstreet   aio: move private...
108

58c85dc20   Kent Overstreet   aio: kill struct ...
109
110
111
112
113
  	unsigned long		mmap_base;
  	unsigned long		mmap_size;
  
  	struct page		**ring_pages;
  	long			nr_pages;
f729863a8   Tejun Heo   fs/aio: Use rcu_w...
114
  	struct rcu_work		free_rwork;	/* see free_ioctx() */
4e23bcaeb   Kent Overstreet   aio: give shared ...
115

e02ba72aa   Anatol Pomozov   aio: block io_des...
116
117
118
  	/*
  	 * signals when all in-flight requests are done
  	 */
dc48e56d7   Jens Axboe   aio: fix serial d...
119
  	struct ctx_rq_wait	*rq_wait;
e02ba72aa   Anatol Pomozov   aio: block io_des...
120

4e23bcaeb   Kent Overstreet   aio: give shared ...
121
  	struct {
34e83fc61   Kent Overstreet   aio: reqs_active ...
122
123
124
125
126
  		/*
  		 * This counts the number of available slots in the ringbuffer,
  		 * so we avoid overflowing it: it's decremented (if positive)
  		 * when allocating a kiocb and incremented when the resulting
  		 * io_event is pulled off the ringbuffer.
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
127
128
  		 *
  		 * We batch accesses to it with a percpu version.
34e83fc61   Kent Overstreet   aio: reqs_active ...
129
130
  		 */
  		atomic_t	reqs_available;
4e23bcaeb   Kent Overstreet   aio: give shared ...
131
132
133
134
135
136
  	} ____cacheline_aligned_in_smp;
  
  	struct {
  		spinlock_t	ctx_lock;
  		struct list_head active_reqs;	/* used for cancellation */
  	} ____cacheline_aligned_in_smp;
58c85dc20   Kent Overstreet   aio: kill struct ...
137
138
  	struct {
  		struct mutex	ring_lock;
4e23bcaeb   Kent Overstreet   aio: give shared ...
139
140
  		wait_queue_head_t wait;
  	} ____cacheline_aligned_in_smp;
58c85dc20   Kent Overstreet   aio: kill struct ...
141
142
143
  
  	struct {
  		unsigned	tail;
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
144
  		unsigned	completed_events;
58c85dc20   Kent Overstreet   aio: kill struct ...
145
  		spinlock_t	completion_lock;
4e23bcaeb   Kent Overstreet   aio: give shared ...
146
  	} ____cacheline_aligned_in_smp;
58c85dc20   Kent Overstreet   aio: kill struct ...
147
148
  
  	struct page		*internal_pages[AIO_RING_PAGES];
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
149
  	struct file		*aio_ring_file;
db446a08c   Benjamin LaHaise   aio: convert the ...
150
151
  
  	unsigned		id;
4e179bca6   Kent Overstreet   aio: move private...
152
  };
a3c0d439e   Christoph Hellwig   aio: implement IO...
153
154
155
156
157
  /*
   * Per-request state for the "fsync" member of the aio_kiocb union.
   * NOTE(review): the code using these fields is outside this part of the
   * file; presumably @work runs the sync of @file and @datasync selects a
   * data-only sync -- confirm against the submission path.
   */
  struct fsync_iocb {
  	struct work_struct	work;
  	struct file		*file;
  	bool			datasync;
  };
bfe4037e7   Christoph Hellwig   aio: implement IO...
158
159
160
161
162
163
164
165
166
  /*
   * Per-request state for the "poll" member of the aio_kiocb union.
   * NOTE(review): the poll implementation is outside this part of the file;
   * the field roles below are read off the names and types only.
   */
  struct poll_iocb {
  	struct file		*file;
  	struct wait_queue_head	*head;
  	__poll_t		events;
  	bool			woken;
  	bool			cancelled;
  	struct wait_queue_entry	wait;
  	struct work_struct	work;
  };
04b2fa9f8   Christoph Hellwig   fs: split generic...
167
  struct aio_kiocb {
54843f875   Christoph Hellwig   aio: refactor rea...
168
169
  	union {
  		struct kiocb		rw;
a3c0d439e   Christoph Hellwig   aio: implement IO...
170
  		struct fsync_iocb	fsync;
bfe4037e7   Christoph Hellwig   aio: implement IO...
171
  		struct poll_iocb	poll;
54843f875   Christoph Hellwig   aio: refactor rea...
172
  	};
04b2fa9f8   Christoph Hellwig   fs: split generic...
173
174
175
176
177
178
179
180
181
  
  	struct kioctx		*ki_ctx;
  	kiocb_cancel_fn		*ki_cancel;
  
  	struct iocb __user	*ki_user_iocb;	/* user's aiocb */
  	__u64			ki_user_data;	/* user's data for completion */
  
  	struct list_head	ki_list;	/* the aio core uses this
  						 * for cancellation */
9018ccc45   Christoph Hellwig   aio: add a iocb r...
182
  	refcount_t		ki_refcnt;
04b2fa9f8   Christoph Hellwig   fs: split generic...
183
184
185
186
187
188
189
  
  	/*
  	 * If the aio_resfd field of the userspace iocb is not zero,
  	 * this is the underlying eventfd context to deliver events to.
  	 */
  	struct eventfd_ctx	*ki_eventfd;
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
190
  /*------ sysctl variables----*/
d55b5fdaf   Zach Brown   [PATCH] aio: remo...
191
192
193
  static DEFINE_SPINLOCK(aio_nr_lock);
  unsigned long aio_nr;		/* current system wide number of aio requests */
  unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
194
  /*----end sysctl variables---*/
e18b890bb   Christoph Lameter   [PATCH] slab: rem...
195
196
  /* Slab caches for struct aio_kiocb and struct kioctx; created in aio_setup(). */
  static struct kmem_cache	*kiocb_cachep;
  static struct kmem_cache	*kioctx_cachep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
197

71ad7490c   Benjamin LaHaise   rework aio migrat...
198
199
200
201
202
203
204
  /* Kernel-internal mount of the "aio" pseudo filesystem, set in aio_setup(). */
  static struct vfsmount *aio_mnt;
  
  /* Defined further down; declared here because aio_private_file() uses them. */
  static const struct file_operations aio_ring_fops;
  static const struct address_space_operations aio_ctx_aops;
  
  /*
   * aio_private_file
   *	Create the file that backs the ring of @ctx: a new anonymous inode
   *	on the aio mount whose mapping uses aio_ctx_aops, points back at
   *	@ctx through ->private_data and is sized for @nr_pages pages.
   *
   *	Returns the new file, or an ERR_PTR if the inode or the file could
   *	not be allocated.
   */
  static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
  {
  	struct address_space *mapping;
  	struct inode *inode;
  	struct file *file;

  	inode = alloc_anon_inode(aio_mnt->mnt_sb);
  	if (IS_ERR(inode))
  		return ERR_CAST(inode);

  	mapping = inode->i_mapping;
  	mapping->a_ops = &aio_ctx_aops;
  	mapping->private_data = ctx;
  	inode->i_size = PAGE_SIZE * nr_pages;

  	file = alloc_file_pseudo(inode, aio_mnt, "[aio]", O_RDWR,
  				 &aio_ring_fops);
  	if (!IS_ERR(file))
  		return file;

  	/* No file took over the inode, so drop our reference to it. */
  	iput(inode);
  	return file;
  }
  
  static struct dentry *aio_mount(struct file_system_type *fs_type,
  				int flags, const char *dev_name, void *data)
  {
d93aa9d82   Al Viro   new wrapper: allo...
223
  	struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, NULL,
22f6b4d34   Jann Horn   aio: mark AIO pse...
224
225
226
227
228
  					   AIO_RING_MAGIC);
  
  	if (!IS_ERR(root))
  		root->d_sb->s_iflags |= SB_I_NOEXEC;
  	return root;
71ad7490c   Benjamin LaHaise   rework aio migrat...
229
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
230
231
232
233
234
235
  /* aio_setup
   *	Boot-time initialisation: mounts the internal "aio" pseudo file
   *	system and creates the slab caches for aio_kiocb and kioctx.
   *	Panics on failure, as this runs early during the boot sequence
   *	(the caches are created with SLAB_PANIC).
   */
  static int __init aio_setup(void)
  {
  	static struct file_system_type aio_fs = {
  		.name		= "aio",
  		.mount		= aio_mount,
  		.kill_sb	= kill_anon_super,
  	};

  	aio_mnt = kern_mount(&aio_fs);
  	if (IS_ERR(aio_mnt))
  		panic("Failed to create aio fs mount.");

  	kiocb_cachep = KMEM_CACHE(aio_kiocb,
  				  SLAB_HWCACHE_ALIGN | SLAB_PANIC);
  	kioctx_cachep = KMEM_CACHE(kioctx,
  				   SLAB_HWCACHE_ALIGN | SLAB_PANIC);

  	return 0;
  }
  __initcall(aio_setup);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
249

5e9ae2e5d   Benjamin LaHaise   aio: fix use-afte...
250
251
252
  static void put_aio_ring_file(struct kioctx *ctx)
  {
  	struct file *aio_ring_file = ctx->aio_ring_file;
de04e7693   Rasmus Villemoes   fs/aio.c: elimina...
253
  	struct address_space *i_mapping;
5e9ae2e5d   Benjamin LaHaise   aio: fix use-afte...
254
  	if (aio_ring_file) {
450630975   Al Viro   don't open-code f...
255
  		truncate_setsize(file_inode(aio_ring_file), 0);
5e9ae2e5d   Benjamin LaHaise   aio: fix use-afte...
256
257
  
  		/* Prevent further access to the kioctx from migratepages */
450630975   Al Viro   don't open-code f...
258
  		i_mapping = aio_ring_file->f_mapping;
de04e7693   Rasmus Villemoes   fs/aio.c: elimina...
259
260
  		spin_lock(&i_mapping->private_lock);
  		i_mapping->private_data = NULL;
5e9ae2e5d   Benjamin LaHaise   aio: fix use-afte...
261
  		ctx->aio_ring_file = NULL;
de04e7693   Rasmus Villemoes   fs/aio.c: elimina...
262
  		spin_unlock(&i_mapping->private_lock);
5e9ae2e5d   Benjamin LaHaise   aio: fix use-afte...
263
264
265
266
  
  		fput(aio_ring_file);
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
267
268
  static void aio_free_ring(struct kioctx *ctx)
  {
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
269
  	int i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
270

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
271
272
273
274
  	/* Disconnect the kiotx from the ring file.  This prevents future
  	 * accesses to the kioctx from page migration.
  	 */
  	put_aio_ring_file(ctx);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
275
  	for (i = 0; i < ctx->nr_pages; i++) {
8e321fefb   Benjamin LaHaise   aio/migratepages:...
276
  		struct page *page;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
277
278
279
  		pr_debug("pid(%d) [%d] page->count=%d
  ", current->pid, i,
  				page_count(ctx->ring_pages[i]));
8e321fefb   Benjamin LaHaise   aio/migratepages:...
280
281
282
283
284
  		page = ctx->ring_pages[i];
  		if (!page)
  			continue;
  		ctx->ring_pages[i] = NULL;
  		put_page(page);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
285
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
286

ddb8c45ba   Sasha Levin   aio: nullify aio-...
287
  	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
58c85dc20   Kent Overstreet   aio: kill struct ...
288
  		kfree(ctx->ring_pages);
ddb8c45ba   Sasha Levin   aio: nullify aio-...
289
290
  		ctx->ring_pages = NULL;
  	}
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
291
  }
5477e70a6   Oleg Nesterov   mm: move ->mremap...
292
  static int aio_ring_mremap(struct vm_area_struct *vma)
e4a0d3e72   Pavel Emelyanov   aio: Make it poss...
293
  {
5477e70a6   Oleg Nesterov   mm: move ->mremap...
294
  	struct file *file = vma->vm_file;
e4a0d3e72   Pavel Emelyanov   aio: Make it poss...
295
296
  	struct mm_struct *mm = vma->vm_mm;
  	struct kioctx_table *table;
b2edffdd9   Al Viro   fix mremap() vs. ...
297
  	int i, res = -EINVAL;
e4a0d3e72   Pavel Emelyanov   aio: Make it poss...
298
299
300
301
302
303
  
  	spin_lock(&mm->ioctx_lock);
  	rcu_read_lock();
  	table = rcu_dereference(mm->ioctx_table);
  	for (i = 0; i < table->nr; i++) {
  		struct kioctx *ctx;
d0264c01e   Tejun Heo   fs/aio: Use RCU a...
304
  		ctx = rcu_dereference(table->table[i]);
e4a0d3e72   Pavel Emelyanov   aio: Make it poss...
305
  		if (ctx && ctx->aio_ring_file == file) {
b2edffdd9   Al Viro   fix mremap() vs. ...
306
307
308
309
  			if (!atomic_read(&ctx->dead)) {
  				ctx->user_id = ctx->mmap_base = vma->vm_start;
  				res = 0;
  			}
e4a0d3e72   Pavel Emelyanov   aio: Make it poss...
310
311
312
313
314
315
  			break;
  		}
  	}
  
  	rcu_read_unlock();
  	spin_unlock(&mm->ioctx_lock);
b2edffdd9   Al Viro   fix mremap() vs. ...
316
  	return res;
e4a0d3e72   Pavel Emelyanov   aio: Make it poss...
317
  }
5477e70a6   Oleg Nesterov   mm: move ->mremap...
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
  /*
   * VM operations installed on ring mappings by aio_ring_mmap().  ->mremap
   * keeps the context's recorded address up to date; the fault handlers are
   * the generic filemap ones and are only provided when CONFIG_MMU is enabled.
   */
  static const struct vm_operations_struct aio_ring_vm_ops = {
  	.mremap		= aio_ring_mremap,
  #if IS_ENABLED(CONFIG_MMU)
  	.fault		= filemap_fault,
  	.map_pages	= filemap_map_pages,
  	.page_mkwrite	= filemap_page_mkwrite,
  #endif
  };
  
  /*
   * ->mmap handler of the ring file: attach aio_ring_vm_ops to the new
   * mapping and mark it VM_DONTEXPAND.  @file is not used.  Always returns 0.
   */
  static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
  {
  	vma->vm_ops = &aio_ring_vm_ops;
  	vma->vm_flags = vma->vm_flags | VM_DONTEXPAND;

  	return 0;
  }
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
333
334
335
  /*
   * File operations of the ring file created by aio_private_file(); mmap is
   * the only operation provided.
   */
  static const struct file_operations aio_ring_fops = {
  	.mmap = aio_ring_mmap,
  };
0c45355fc   Benjamin LaHaise   aio: fix build wh...
336
  #if IS_ENABLED(CONFIG_MIGRATION)
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
337
338
339
  static int aio_migratepage(struct address_space *mapping, struct page *new,
  			struct page *old, enum migrate_mode mode)
  {
5e9ae2e5d   Benjamin LaHaise   aio: fix use-afte...
340
  	struct kioctx *ctx;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
341
  	unsigned long flags;
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
342
  	pgoff_t idx;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
343
  	int rc;
2916ecc0f   Jérôme Glisse   mm/migrate: new m...
344
345
346
347
348
349
350
  	/*
  	 * We cannot support the _NO_COPY case here, because copy needs to
  	 * happen under the ctx->completion_lock. That does not work with the
  	 * migration workflow of MIGRATE_SYNC_NO_COPY.
  	 */
  	if (mode == MIGRATE_SYNC_NO_COPY)
  		return -EINVAL;
8e321fefb   Benjamin LaHaise   aio/migratepages:...
351
  	rc = 0;
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
352
  	/* mapping->private_lock here protects against the kioctx teardown.  */
8e321fefb   Benjamin LaHaise   aio/migratepages:...
353
354
  	spin_lock(&mapping->private_lock);
  	ctx = mapping->private_data;
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
  	if (!ctx) {
  		rc = -EINVAL;
  		goto out;
  	}
  
  	/* The ring_lock mutex.  The prevents aio_read_events() from writing
  	 * to the ring's head, and prevents page migration from mucking in
  	 * a partially initialized kiotx.
  	 */
  	if (!mutex_trylock(&ctx->ring_lock)) {
  		rc = -EAGAIN;
  		goto out;
  	}
  
  	idx = old->index;
  	if (idx < (pgoff_t)ctx->nr_pages) {
  		/* Make sure the old page hasn't already been changed */
  		if (ctx->ring_pages[idx] != old)
  			rc = -EAGAIN;
8e321fefb   Benjamin LaHaise   aio/migratepages:...
374
375
  	} else
  		rc = -EINVAL;
8e321fefb   Benjamin LaHaise   aio/migratepages:...
376
377
  
  	if (rc != 0)
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
378
  		goto out_unlock;
8e321fefb   Benjamin LaHaise   aio/migratepages:...
379

36bc08cc0   Gu Zheng   fs/aio: Add suppo...
380
381
  	/* Writeback must be complete */
  	BUG_ON(PageWriteback(old));
8e321fefb   Benjamin LaHaise   aio/migratepages:...
382
  	get_page(new);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
383

8e321fefb   Benjamin LaHaise   aio/migratepages:...
384
  	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
385
  	if (rc != MIGRATEPAGE_SUCCESS) {
8e321fefb   Benjamin LaHaise   aio/migratepages:...
386
  		put_page(new);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
387
  		goto out_unlock;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
388
  	}
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
389
390
391
  	/* Take completion_lock to prevent other writes to the ring buffer
  	 * while the old page is copied to the new.  This prevents new
  	 * events from being lost.
5e9ae2e5d   Benjamin LaHaise   aio: fix use-afte...
392
  	 */
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
393
394
395
396
397
  	spin_lock_irqsave(&ctx->completion_lock, flags);
  	migrate_page_copy(new, old);
  	BUG_ON(ctx->ring_pages[idx] != old);
  	ctx->ring_pages[idx] = new;
  	spin_unlock_irqrestore(&ctx->completion_lock, flags);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
398

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
399
400
  	/* The old page is no longer accessible. */
  	put_page(old);
8e321fefb   Benjamin LaHaise   aio/migratepages:...
401

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
402
403
404
405
  out_unlock:
  	mutex_unlock(&ctx->ring_lock);
  out:
  	spin_unlock(&mapping->private_lock);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
406
  	return rc;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
407
  }
0c45355fc   Benjamin LaHaise   aio: fix build wh...
408
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
409

36bc08cc0   Gu Zheng   fs/aio: Add suppo...
410
  static const struct address_space_operations aio_ctx_aops = {
835f252c6   Gu Zheng   aio: fix uncorren...
411
  	.set_page_dirty = __set_page_dirty_no_writeback,
0c45355fc   Benjamin LaHaise   aio: fix build wh...
412
  #if IS_ENABLED(CONFIG_MIGRATION)
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
413
  	.migratepage	= aio_migratepage,
0c45355fc   Benjamin LaHaise   aio: fix build wh...
414
  #endif
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
415
  };
2a8a98673   Mauricio Faria de Oliveira   fs: aio: fix the ...
416
  static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
417
418
  {
  	struct aio_ring *ring;
41003a7bc   Zach Brown   aio: remove retry...
419
  	struct mm_struct *mm = current->mm;
3dc9acb67   Linus Torvalds   aio: clean up and...
420
  	unsigned long size, unused;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
421
  	int nr_pages;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
422
423
  	int i;
  	struct file *file;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
424
425
426
427
428
429
  
  	/* Compensate for the ring buffer's head/tail overlap entry */
  	nr_events += 2;	/* 1 is required, 2 for good luck */
  
  	size = sizeof(struct aio_ring);
  	size += sizeof(struct io_event) * nr_events;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
430

36bc08cc0   Gu Zheng   fs/aio: Add suppo...
431
  	nr_pages = PFN_UP(size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
432
433
  	if (nr_pages < 0)
  		return -EINVAL;
71ad7490c   Benjamin LaHaise   rework aio migrat...
434
  	file = aio_private_file(ctx, nr_pages);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
435
436
  	if (IS_ERR(file)) {
  		ctx->aio_ring_file = NULL;
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
437
  		return -ENOMEM;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
438
  	}
3dc9acb67   Linus Torvalds   aio: clean up and...
439
440
441
442
443
444
445
446
447
448
449
450
451
  	ctx->aio_ring_file = file;
  	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
  			/ sizeof(struct io_event);
  
  	ctx->ring_pages = ctx->internal_pages;
  	if (nr_pages > AIO_RING_PAGES) {
  		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
  					  GFP_KERNEL);
  		if (!ctx->ring_pages) {
  			put_aio_ring_file(ctx);
  			return -ENOMEM;
  		}
  	}
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
452
453
  	for (i = 0; i < nr_pages; i++) {
  		struct page *page;
450630975   Al Viro   don't open-code f...
454
  		page = find_or_create_page(file->f_mapping,
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
455
456
457
458
459
460
461
  					   i, GFP_HIGHUSER | __GFP_ZERO);
  		if (!page)
  			break;
  		pr_debug("pid(%d) page[%d]->count=%d
  ",
  			 current->pid, i, page_count(page));
  		SetPageUptodate(page);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
462
  		unlock_page(page);
3dc9acb67   Linus Torvalds   aio: clean up and...
463
464
  
  		ctx->ring_pages[i] = page;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
465
  	}
3dc9acb67   Linus Torvalds   aio: clean up and...
466
  	ctx->nr_pages = i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
467

3dc9acb67   Linus Torvalds   aio: clean up and...
468
469
  	if (unlikely(i != nr_pages)) {
  		aio_free_ring(ctx);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
470
  		return -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
471
  	}
58c85dc20   Kent Overstreet   aio: kill struct ...
472
473
474
  	ctx->mmap_size = nr_pages * PAGE_SIZE;
  	pr_debug("attempting mmap of %lu bytes
  ", ctx->mmap_size);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
475

013373e8b   Michal Hocko   aio: make aio_set...
476
477
478
479
480
  	if (down_write_killable(&mm->mmap_sem)) {
  		ctx->mmap_size = 0;
  		aio_free_ring(ctx);
  		return -EINTR;
  	}
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
481
482
  	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
  				       PROT_READ | PROT_WRITE,
897ab3e0c   Mike Rapoport   userfaultfd: non-...
483
  				       MAP_SHARED, 0, &unused, NULL);
3dc9acb67   Linus Torvalds   aio: clean up and...
484
  	up_write(&mm->mmap_sem);
58c85dc20   Kent Overstreet   aio: kill struct ...
485
  	if (IS_ERR((void *)ctx->mmap_base)) {
58c85dc20   Kent Overstreet   aio: kill struct ...
486
  		ctx->mmap_size = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
487
  		aio_free_ring(ctx);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
488
  		return -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
489
  	}
58c85dc20   Kent Overstreet   aio: kill struct ...
490
491
  	pr_debug("mmap address: 0x%08lx
  ", ctx->mmap_base);
d6c355c7d   Benjamin LaHaise   aio: fix race in ...
492

58c85dc20   Kent Overstreet   aio: kill struct ...
493
494
  	ctx->user_id = ctx->mmap_base;
  	ctx->nr_events = nr_events; /* trusted copy */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
495

58c85dc20   Kent Overstreet   aio: kill struct ...
496
  	ring = kmap_atomic(ctx->ring_pages[0]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
497
  	ring->nr = nr_events;	/* user copy */
db446a08c   Benjamin LaHaise   aio: convert the ...
498
  	ring->id = ~0U;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
499
500
501
502
503
  	ring->head = ring->tail = 0;
  	ring->magic = AIO_RING_MAGIC;
  	ring->compat_features = AIO_RING_COMPAT_FEATURES;
  	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
  	ring->header_length = sizeof(struct aio_ring);
e8e3c3d66   Cong Wang   fs: remove the se...
504
  	kunmap_atomic(ring);
58c85dc20   Kent Overstreet   aio: kill struct ...
505
  	flush_dcache_page(ctx->ring_pages[0]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
506
507
508
  
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
509
510
511
  /* Number of io_event slots that fit into one full page. */
  #define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
  /* Slots that fit into a page once room for the struct aio_ring header is taken out. */
  #define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
  /* Difference of the two: the slots a page loses to the header. */
  #define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
04b2fa9f8   Christoph Hellwig   fs: split generic...
512
  /*
   * kiocb_set_cancel_fn
   *	Register @cancel as the cancellation callback of the AIO request
   *	embedding @iocb and queue the request on its context's active_reqs
   *	list, both under ctx_lock.  Warns once and does nothing if the
   *	request is already on a list.
   */
  void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
  {
  	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
  	struct kioctx *ctx = aiocb->ki_ctx;
  	unsigned long irqflags;

  	if (WARN_ON_ONCE(!list_empty(&aiocb->ki_list)))
  		return;

  	spin_lock_irqsave(&ctx->ctx_lock, irqflags);
  	list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
  	aiocb->ki_cancel = cancel;
  	spin_unlock_irqrestore(&ctx->ctx_lock, irqflags);
  }
  EXPORT_SYMBOL(kiocb_set_cancel_fn);
a6d7cff47   Tejun Heo   fs/aio: Add expli...
526
527
528
  /*
   * free_ioctx() should be RCU delayed to synchronize against the RCU
   * protected lookup_ioctx() and also needs process context to call
f729863a8   Tejun Heo   fs/aio: Use rcu_w...
529
   * aio_free_ring().  Use rcu_work.
a6d7cff47   Tejun Heo   fs/aio: Add expli...
530
   */
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
531
  static void free_ioctx(struct work_struct *work)
36f558890   Kent Overstreet   aio: refcounting ...
532
  {
f729863a8   Tejun Heo   fs/aio: Use rcu_w...
533
534
  	struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx,
  					  free_rwork);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
535
536
  	pr_debug("freeing %p
  ", ctx);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
537

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
538
  	aio_free_ring(ctx);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
539
  	free_percpu(ctx->cpu);
9a1049da9   Tejun Heo   percpu-refcount: ...
540
541
  	percpu_ref_exit(&ctx->reqs);
  	percpu_ref_exit(&ctx->users);
36f558890   Kent Overstreet   aio: refcounting ...
542
543
  	kmem_cache_free(kioctx_cachep, ctx);
  }
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
544
545
546
  static void free_ioctx_reqs(struct percpu_ref *ref)
  {
  	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
e02ba72aa   Anatol Pomozov   aio: block io_des...
547
  	/* At this point we know that there are no any in-flight requests */
dc48e56d7   Jens Axboe   aio: fix serial d...
548
549
  	if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
  		complete(&ctx->rq_wait->comp);
e02ba72aa   Anatol Pomozov   aio: block io_des...
550

a6d7cff47   Tejun Heo   fs/aio: Add expli...
551
  	/* Synchronize against RCU protected table->table[] dereferences */
f729863a8   Tejun Heo   fs/aio: Use rcu_w...
552
553
  	INIT_RCU_WORK(&ctx->free_rwork, free_ioctx);
  	queue_rcu_work(system_wq, &ctx->free_rwork);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
554
  }
36f558890   Kent Overstreet   aio: refcounting ...
555
556
557
558
559
  /*
   * When this function runs, the kioctx has been removed from the "hash table"
   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
   * now it's safe to cancel any that need to be.
   */
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
560
  static void free_ioctx_users(struct percpu_ref *ref)
36f558890   Kent Overstreet   aio: refcounting ...
561
  {
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
562
  	struct kioctx *ctx = container_of(ref, struct kioctx, users);
04b2fa9f8   Christoph Hellwig   fs: split generic...
563
  	struct aio_kiocb *req;
36f558890   Kent Overstreet   aio: refcounting ...
564
565
566
567
568
  
  	spin_lock_irq(&ctx->ctx_lock);
  
  	while (!list_empty(&ctx->active_reqs)) {
  		req = list_first_entry(&ctx->active_reqs,
04b2fa9f8   Christoph Hellwig   fs: split generic...
569
  				       struct aio_kiocb, ki_list);
888933f8f   Christoph Hellwig   aio: simplify can...
570
  		req->ki_cancel(&req->rw);
4faa99965   Al Viro   fix io_destroy()/...
571
  		list_del_init(&req->ki_list);
36f558890   Kent Overstreet   aio: refcounting ...
572
573
574
  	}
  
  	spin_unlock_irq(&ctx->ctx_lock);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
575
576
  	percpu_ref_kill(&ctx->reqs);
  	percpu_ref_put(&ctx->reqs);
36f558890   Kent Overstreet   aio: refcounting ...
577
  }
db446a08c   Benjamin LaHaise   aio: convert the ...
578
579
580
581
582
583
584
  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
  {
  	unsigned i, new_nr;
  	struct kioctx_table *table, *old;
  	struct aio_ring *ring;
  
  	spin_lock(&mm->ioctx_lock);
855ef0dec   Oleg Nesterov   aio: kill the mis...
585
  	table = rcu_dereference_raw(mm->ioctx_table);
db446a08c   Benjamin LaHaise   aio: convert the ...
586
587
588
589
  
  	while (1) {
  		if (table)
  			for (i = 0; i < table->nr; i++)
d0264c01e   Tejun Heo   fs/aio: Use RCU a...
590
  				if (!rcu_access_pointer(table->table[i])) {
db446a08c   Benjamin LaHaise   aio: convert the ...
591
  					ctx->id = i;
d0264c01e   Tejun Heo   fs/aio: Use RCU a...
592
  					rcu_assign_pointer(table->table[i], ctx);
db446a08c   Benjamin LaHaise   aio: convert the ...
593
  					spin_unlock(&mm->ioctx_lock);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
594
595
596
597
  					/* While kioctx setup is in progress,
  					 * we are protected from page migration
  					 * changes ring_pages by ->ring_lock.
  					 */
db446a08c   Benjamin LaHaise   aio: convert the ...
598
599
600
601
602
603
604
  					ring = kmap_atomic(ctx->ring_pages[0]);
  					ring->id = ctx->id;
  					kunmap_atomic(ring);
  					return 0;
  				}
  
  		new_nr = (table ? table->nr : 1) * 4;
db446a08c   Benjamin LaHaise   aio: convert the ...
605
606
607
608
609
610
611
612
613
614
  		spin_unlock(&mm->ioctx_lock);
  
  		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
  				new_nr, GFP_KERNEL);
  		if (!table)
  			return -ENOMEM;
  
  		table->nr = new_nr;
  
  		spin_lock(&mm->ioctx_lock);
855ef0dec   Oleg Nesterov   aio: kill the mis...
615
  		old = rcu_dereference_raw(mm->ioctx_table);
db446a08c   Benjamin LaHaise   aio: convert the ...
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
  
  		if (!old) {
  			rcu_assign_pointer(mm->ioctx_table, table);
  		} else if (table->nr > old->nr) {
  			memcpy(table->table, old->table,
  			       old->nr * sizeof(struct kioctx *));
  
  			rcu_assign_pointer(mm->ioctx_table, table);
  			kfree_rcu(old, rcu);
  		} else {
  			kfree(table);
  			table = old;
  		}
  	}
  }
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
631
632
633
634
635
636
637
638
639
  /*
   * aio_nr_sub - give @nr requests back to the global aio_nr count.
   *
   * Done under aio_nr_lock.  If the subtraction would wrap below zero a
   * warning is emitted and the counter is reset to 0 instead.
   */
  static void aio_nr_sub(unsigned nr)
  {
  	spin_lock(&aio_nr_lock);
  	if (!WARN_ON(aio_nr - nr > aio_nr))
  		aio_nr -= nr;
  	else
  		aio_nr = 0;
  	spin_unlock(&aio_nr_lock);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
640
641
642
643
644
  /* ioctx_alloc
   *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
   */
  static struct kioctx *ioctx_alloc(unsigned nr_events)
  {
41003a7bc   Zach Brown   aio: remove retry...
645
  	struct mm_struct *mm = current->mm;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
646
  	struct kioctx *ctx;
e23754f88   Al Viro   aio: don't bother...
647
  	int err = -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
648

e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
649
  	/*
2a8a98673   Mauricio Faria de Oliveira   fs: aio: fix the ...
650
651
652
653
654
655
  	 * Store the original nr_events -- what userspace passed to io_setup(),
  	 * for counting against the global limit -- before it changes.
  	 */
  	unsigned int max_reqs = nr_events;
  
  	/*
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
656
657
658
659
660
661
662
663
664
665
  	 * We keep track of the number of available ringbuffer slots, to prevent
  	 * overflow (reqs_available), and we also use percpu counters for this.
  	 *
  	 * So since up to half the slots might be on other cpu's percpu counters
  	 * and unavailable, double nr_events so userspace sees what they
  	 * expected: additionally, we move req_batch slots to/from percpu
  	 * counters at a time, so make sure that isn't 0:
  	 */
  	nr_events = max(nr_events, num_possible_cpus() * 4);
  	nr_events *= 2;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
666
  	/* Prevent overflows */
08397acdd   Al Viro   ioctx_alloc(): re...
667
  	if (nr_events > (0x10000000U / sizeof(struct io_event))) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
668
669
670
671
  		pr_debug("ENOMEM: nr_events too high
  ");
  		return ERR_PTR(-EINVAL);
  	}
2a8a98673   Mauricio Faria de Oliveira   fs: aio: fix the ...
672
  	if (!nr_events || (unsigned long)max_reqs > aio_max_nr)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
673
  		return ERR_PTR(-EAGAIN);
c37622296   Robert P. J. Day   [PATCH] Transform...
674
  	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
675
676
  	if (!ctx)
  		return ERR_PTR(-ENOMEM);
2a8a98673   Mauricio Faria de Oliveira   fs: aio: fix the ...
677
  	ctx->max_reqs = max_reqs;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
678

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
679
  	spin_lock_init(&ctx->ctx_lock);
0460fef2a   Kent Overstreet   aio: use cancella...
680
  	spin_lock_init(&ctx->completion_lock);
58c85dc20   Kent Overstreet   aio: kill struct ...
681
  	mutex_init(&ctx->ring_lock);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
682
683
684
  	/* Protect against page migration throughout kiotx setup by keeping
  	 * the ring_lock mutex held until setup is complete. */
  	mutex_lock(&ctx->ring_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
685
686
687
  	init_waitqueue_head(&ctx->wait);
  
  	INIT_LIST_HEAD(&ctx->active_reqs);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
688

2aad2a86f   Tejun Heo   percpu_ref: add P...
689
  	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
690
  		goto err;
2aad2a86f   Tejun Heo   percpu_ref: add P...
691
  	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
692
  		goto err;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
693
694
  	ctx->cpu = alloc_percpu(struct kioctx_cpu);
  	if (!ctx->cpu)
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
695
  		goto err;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
696

2a8a98673   Mauricio Faria de Oliveira   fs: aio: fix the ...
697
  	err = aio_setup_ring(ctx, nr_events);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
698
  	if (err < 0)
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
699
  		goto err;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
700

34e83fc61   Kent Overstreet   aio: reqs_active ...
701
  	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
702
  	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
6878ea72a   Benjamin LaHaise   aio: be defensive...
703
704
  	if (ctx->req_batch < 1)
  		ctx->req_batch = 1;
34e83fc61   Kent Overstreet   aio: reqs_active ...
705

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
706
  	/* limit the number of system wide aios */
9fa1cb397   Al Viro   aio: aio_nr_lock ...
707
  	spin_lock(&aio_nr_lock);
2a8a98673   Mauricio Faria de Oliveira   fs: aio: fix the ...
708
709
  	if (aio_nr + ctx->max_reqs > aio_max_nr ||
  	    aio_nr + ctx->max_reqs < aio_nr) {
9fa1cb397   Al Viro   aio: aio_nr_lock ...
710
  		spin_unlock(&aio_nr_lock);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
711
  		err = -EAGAIN;
d1b943271   Gu Zheng   aio: clean up aio...
712
  		goto err_ctx;
2dd542b7a   Al Viro   aio: aio_nr decre...
713
714
  	}
  	aio_nr += ctx->max_reqs;
9fa1cb397   Al Viro   aio: aio_nr_lock ...
715
  	spin_unlock(&aio_nr_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
716

1881686f8   Benjamin LaHaise   aio: fix kioctx l...
717
718
  	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
  	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */
723be6e39   Kent Overstreet   aio: percpu ioctx...
719

da90382c2   Benjamin LaHaise   aio: fix error ha...
720
721
  	err = ioctx_add_table(ctx, mm);
  	if (err)
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
722
  		goto err_cleanup;
da90382c2   Benjamin LaHaise   aio: fix error ha...
723

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
724
725
  	/* Release the ring_lock mutex now that all setup is complete. */
  	mutex_unlock(&ctx->ring_lock);
caf4167aa   Kent Overstreet   aio: dprintk() ->...
726
727
  	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x
  ",
58c85dc20   Kent Overstreet   aio: kill struct ...
728
  		 ctx, ctx->user_id, mm, ctx->nr_events);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
729
  	return ctx;
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
730
731
  err_cleanup:
  	aio_nr_sub(ctx->max_reqs);
d1b943271   Gu Zheng   aio: clean up aio...
732
  err_ctx:
deeb8525f   Al Viro   ioctx_alloc(): fi...
733
734
735
  	atomic_set(&ctx->dead, 1);
  	if (ctx->mmap_size)
  		vm_munmap(ctx->mmap_base, ctx->mmap_size);
d1b943271   Gu Zheng   aio: clean up aio...
736
  	aio_free_ring(ctx);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
737
  err:
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
738
  	mutex_unlock(&ctx->ring_lock);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
739
  	free_percpu(ctx->cpu);
9a1049da9   Tejun Heo   percpu-refcount: ...
740
741
  	percpu_ref_exit(&ctx->reqs);
  	percpu_ref_exit(&ctx->users);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
742
  	kmem_cache_free(kioctx_cachep, ctx);
caf4167aa   Kent Overstreet   aio: dprintk() ->...
743
744
  	pr_debug("error allocating ioctx %d
  ", err);
e23754f88   Al Viro   aio: don't bother...
745
  	return ERR_PTR(err);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
746
  }
36f558890   Kent Overstreet   aio: refcounting ...
747
748
749
750
751
  /* kill_ioctx
   *	Cancels all outstanding aio requests on an aio context.  Used
   *	when the processes owning a context have all exited to encourage
   *	the rapid destruction of the kioctx.
   */
fb2d44838   Benjamin LaHaise   aio: report error...
752
  static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
dc48e56d7   Jens Axboe   aio: fix serial d...
753
  		      struct ctx_rq_wait *wait)
36f558890   Kent Overstreet   aio: refcounting ...
754
  {
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
755
  	struct kioctx_table *table;
db446a08c   Benjamin LaHaise   aio: convert the ...
756

b2edffdd9   Al Viro   fix mremap() vs. ...
757
758
759
  	spin_lock(&mm->ioctx_lock);
  	if (atomic_xchg(&ctx->dead, 1)) {
  		spin_unlock(&mm->ioctx_lock);
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
760
  		return -EINVAL;
b2edffdd9   Al Viro   fix mremap() vs. ...
761
  	}
db446a08c   Benjamin LaHaise   aio: convert the ...
762

855ef0dec   Oleg Nesterov   aio: kill the mis...
763
  	table = rcu_dereference_raw(mm->ioctx_table);
d0264c01e   Tejun Heo   fs/aio: Use RCU a...
764
765
  	WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
  	RCU_INIT_POINTER(table->table[ctx->id], NULL);
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
766
  	spin_unlock(&mm->ioctx_lock);
4fcc712f5   Kent Overstreet   aio: fix io_destr...
767

a6d7cff47   Tejun Heo   fs/aio: Add expli...
768
  	/* free_ioctx_reqs() will do the necessary RCU synchronization */
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
769
  	wake_up_all(&ctx->wait);
4fcc712f5   Kent Overstreet   aio: fix io_destr...
770

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
771
772
773
774
775
776
777
778
  	/*
  	 * It'd be more correct to do this in free_ioctx(), after all
  	 * the outstanding kiocbs have finished - but by then io_destroy
  	 * has already returned, so io_setup() could potentially return
  	 * -EAGAIN with no ioctxs actually in use (as far as userspace
  	 *  could tell).
  	 */
  	aio_nr_sub(ctx->max_reqs);
4fcc712f5   Kent Overstreet   aio: fix io_destr...
779

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
780
781
  	if (ctx->mmap_size)
  		vm_munmap(ctx->mmap_base, ctx->mmap_size);
fb2d44838   Benjamin LaHaise   aio: report error...
782

dc48e56d7   Jens Axboe   aio: fix serial d...
783
  	ctx->rq_wait = wait;
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
784
785
  	percpu_ref_kill(&ctx->users);
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
786
  }
36f558890   Kent Overstreet   aio: refcounting ...
787
788
789
790
791
792
793
  /*
   * exit_aio: called when the last user of mm goes away.  At this point, there is
   * no way for any new requests to be submited or any of the io_* syscalls to be
   * called on the context.
   *
   * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
   * them.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
794
   */
fc9b52cd8   Harvey Harrison   fs: remove fastca...
795
  void exit_aio(struct mm_struct *mm)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
796
  {
4b70ac5fd   Oleg Nesterov   aio: change exit_...
797
  	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
dc48e56d7   Jens Axboe   aio: fix serial d...
798
799
  	struct ctx_rq_wait wait;
  	int i, skipped;
db446a08c   Benjamin LaHaise   aio: convert the ...
800

4b70ac5fd   Oleg Nesterov   aio: change exit_...
801
802
  	if (!table)
  		return;
db446a08c   Benjamin LaHaise   aio: convert the ...
803

dc48e56d7   Jens Axboe   aio: fix serial d...
804
805
806
807
  	atomic_set(&wait.count, table->nr);
  	init_completion(&wait.comp);
  
  	skipped = 0;
4b70ac5fd   Oleg Nesterov   aio: change exit_...
808
  	for (i = 0; i < table->nr; ++i) {
d0264c01e   Tejun Heo   fs/aio: Use RCU a...
809
810
  		struct kioctx *ctx =
  			rcu_dereference_protected(table->table[i], true);
abf137dd7   Jens Axboe   aio: make the loo...
811

dc48e56d7   Jens Axboe   aio: fix serial d...
812
813
  		if (!ctx) {
  			skipped++;
4b70ac5fd   Oleg Nesterov   aio: change exit_...
814
  			continue;
dc48e56d7   Jens Axboe   aio: fix serial d...
815
  		}
936af1576   Al Viro   aio: don't bother...
816
  		/*
4b70ac5fd   Oleg Nesterov   aio: change exit_...
817
818
819
820
821
  		 * We don't need to bother with munmap() here - exit_mmap(mm)
  		 * is coming and it'll unmap everything. And we simply can't,
  		 * this is not necessarily our ->mm.
  		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
  		 * that it needs to unmap the area, just set it to 0.
936af1576   Al Viro   aio: don't bother...
822
  		 */
58c85dc20   Kent Overstreet   aio: kill struct ...
823
  		ctx->mmap_size = 0;
dc48e56d7   Jens Axboe   aio: fix serial d...
824
825
  		kill_ioctx(mm, ctx, &wait);
  	}
36f558890   Kent Overstreet   aio: refcounting ...
826

dc48e56d7   Jens Axboe   aio: fix serial d...
827
  	if (!atomic_sub_and_test(skipped, &wait.count)) {
6098b45b3   Gu Zheng   aio: block exit_a...
828
  		/* Wait until all IO for the context are done. */
dc48e56d7   Jens Axboe   aio: fix serial d...
829
  		wait_for_completion(&wait.comp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
830
  	}
4b70ac5fd   Oleg Nesterov   aio: change exit_...
831
832
833
  
  	RCU_INIT_POINTER(mm->ioctx_table, NULL);
  	kfree(table);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
834
  }
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
835
836
837
  static void put_reqs_available(struct kioctx *ctx, unsigned nr)
  {
  	struct kioctx_cpu *kcpu;
263782c1c   Benjamin LaHaise   aio: protect reqs...
838
  	unsigned long flags;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
839

263782c1c   Benjamin LaHaise   aio: protect reqs...
840
  	local_irq_save(flags);
be6fb451a   Benjamin LaHaise   aio: remove no lo...
841
  	kcpu = this_cpu_ptr(ctx->cpu);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
842
  	kcpu->reqs_available += nr;
263782c1c   Benjamin LaHaise   aio: protect reqs...
843

e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
844
845
846
847
  	while (kcpu->reqs_available >= ctx->req_batch * 2) {
  		kcpu->reqs_available -= ctx->req_batch;
  		atomic_add(ctx->req_batch, &ctx->reqs_available);
  	}
263782c1c   Benjamin LaHaise   aio: protect reqs...
848
  	local_irq_restore(flags);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
849
850
851
852
853
854
  }
  
  static bool get_reqs_available(struct kioctx *ctx)
  {
  	struct kioctx_cpu *kcpu;
  	bool ret = false;
263782c1c   Benjamin LaHaise   aio: protect reqs...
855
  	unsigned long flags;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
856

263782c1c   Benjamin LaHaise   aio: protect reqs...
857
  	local_irq_save(flags);
be6fb451a   Benjamin LaHaise   aio: remove no lo...
858
  	kcpu = this_cpu_ptr(ctx->cpu);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
  	if (!kcpu->reqs_available) {
  		int old, avail = atomic_read(&ctx->reqs_available);
  
  		do {
  			if (avail < ctx->req_batch)
  				goto out;
  
  			old = avail;
  			avail = atomic_cmpxchg(&ctx->reqs_available,
  					       avail, avail - ctx->req_batch);
  		} while (avail != old);
  
  		kcpu->reqs_available += ctx->req_batch;
  	}
  
  	ret = true;
  	kcpu->reqs_available--;
  out:
263782c1c   Benjamin LaHaise   aio: protect reqs...
877
  	local_irq_restore(flags);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
878
879
  	return ret;
  }
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
  /* refill_reqs_available
   *	Updates the reqs_available reference counts used for tracking the
   *	number of free slots in the completion ring.  This can be called
   *	from aio_complete() (to optimistically update reqs_available) or
   *	from aio_get_req() (when no request slots are left).  It must be
   *	called holding ctx->completion_lock.
   *
   *	Only completed events beyond those still sitting in the ring between
   *	@head and @tail are handed back through put_reqs_available().
   */
  static void refill_reqs_available(struct kioctx *ctx, unsigned head,
                                    unsigned tail)
  {
  	unsigned in_ring, done;

  	/* Clamp head since userland can write to it. */
  	head %= ctx->nr_events;

  	in_ring = (head <= tail) ? tail - head
  				 : ctx->nr_events - (head - tail);

  	done = ctx->completed_events;

  	/* Nothing to give back unless more completed than is still queued. */
  	if (in_ring >= done)
  		return;

  	done -= in_ring;
  	ctx->completed_events -= done;
  	put_reqs_available(ctx, done);
  }
  
  /* user_refill_reqs_available
   *	Called to refill reqs_available when aio_get_req() finds no request
   *	slot available.  Takes ctx->completion_lock itself.
   */
  static void user_refill_reqs_available(struct kioctx *ctx)
  {
  	struct aio_ring *ring;
  	unsigned head;

  	spin_lock_irq(&ctx->completion_lock);

  	if (!ctx->completed_events)
  		goto unlock;

  	/* Access of ring->head may race with aio_read_events_ring()
  	 * here, but that's okay: whether we read the old version or the
  	 * new version, either will be valid.  The important part is that
  	 * head cannot pass tail since we prevent aio_complete() from
  	 * updating tail by holding ctx->completion_lock.  Even if head is
  	 * invalid, the check against ctx->completed_events in
  	 * refill_reqs_available() will make sure we do the safe/right
  	 * thing.
  	 */
  	ring = kmap_atomic(ctx->ring_pages[0]);
  	head = ring->head;
  	kunmap_atomic(ring);

  	refill_reqs_available(ctx, head, ctx->tail);

  unlock:
  	spin_unlock_irq(&ctx->completion_lock);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
941
  /* aio_get_req
57282d8fd   Kent Overstreet   aio: Kill ki_users
942
943
   *	Allocate a slot for an aio request.
   * Returns NULL if no requests are free.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
944
   */
04b2fa9f8   Christoph Hellwig   fs: split generic...
945
  static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
946
  {
04b2fa9f8   Christoph Hellwig   fs: split generic...
947
  	struct aio_kiocb *req;
a1c8eae75   Kent Overstreet   aio: kill batch a...
948

d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
949
950
951
952
953
  	if (!get_reqs_available(ctx)) {
  		user_refill_reqs_available(ctx);
  		if (!get_reqs_available(ctx))
  			return NULL;
  	}
a1c8eae75   Kent Overstreet   aio: kill batch a...
954

0460fef2a   Kent Overstreet   aio: use cancella...
955
  	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
956
  	if (unlikely(!req))
a1c8eae75   Kent Overstreet   aio: kill batch a...
957
  		goto out_put;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
958

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
959
  	percpu_ref_get(&ctx->reqs);
75321b50a   Christoph Hellwig   aio: sanitize ki_...
960
  	INIT_LIST_HEAD(&req->ki_list);
9018ccc45   Christoph Hellwig   aio: add a iocb r...
961
  	refcount_set(&req->ki_refcnt, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
962
  	req->ki_ctx = ctx;
080d676de   Jeff Moyer   aio: allocate kio...
963
  	return req;
a1c8eae75   Kent Overstreet   aio: kill batch a...
964
  out_put:
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
965
  	put_reqs_available(ctx, 1);
a1c8eae75   Kent Overstreet   aio: kill batch a...
966
  	return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
967
  }
d5470b596   Adrian Bunk   fs/aio.c: make 3 ...
968
  static struct kioctx *lookup_ioctx(unsigned long ctx_id)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
969
  {
db446a08c   Benjamin LaHaise   aio: convert the ...
970
  	struct aio_ring __user *ring  = (void __user *)ctx_id;
abf137dd7   Jens Axboe   aio: make the loo...
971
  	struct mm_struct *mm = current->mm;
65c24491b   Jeff Moyer   aio: lookup_ioctx...
972
  	struct kioctx *ctx, *ret = NULL;
db446a08c   Benjamin LaHaise   aio: convert the ...
973
974
975
976
977
  	struct kioctx_table *table;
  	unsigned id;
  
  	if (get_user(id, &ring->id))
  		return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
978

abf137dd7   Jens Axboe   aio: make the loo...
979
  	rcu_read_lock();
db446a08c   Benjamin LaHaise   aio: convert the ...
980
  	table = rcu_dereference(mm->ioctx_table);
abf137dd7   Jens Axboe   aio: make the loo...
981

db446a08c   Benjamin LaHaise   aio: convert the ...
982
983
  	if (!table || id >= table->nr)
  		goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
984

a6136922d   Jeff Moyer   aio: fix spectre ...
985
  	id = array_index_nospec(id, table->nr);
d0264c01e   Tejun Heo   fs/aio: Use RCU a...
986
  	ctx = rcu_dereference(table->table[id]);
f30d704fe   Benjamin LaHaise   aio: table lookup...
987
  	if (ctx && ctx->user_id == ctx_id) {
baf10564f   Al Viro   aio: fix io_destr...
988
989
  		if (percpu_ref_tryget_live(&ctx->users))
  			ret = ctx;
db446a08c   Benjamin LaHaise   aio: convert the ...
990
991
  	}
  out:
abf137dd7   Jens Axboe   aio: make the loo...
992
  	rcu_read_unlock();
65c24491b   Jeff Moyer   aio: lookup_ioctx...
993
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
994
  }
9018ccc45   Christoph Hellwig   aio: add a iocb r...
995
996
997
998
999
1000
1001
1002
  /*
   * iocb_put - drop a reference on @iocb and free it when it was the last.
   *
   * An iocb whose ki_refcnt is 0 is freed immediately; otherwise the count
   * is decremented and the iocb is freed only when it reaches zero.  Freeing
   * also drops the iocb's reference on its context's ->reqs.
   */
  static inline void iocb_put(struct aio_kiocb *iocb)
  {
  	if (refcount_read(&iocb->ki_refcnt) != 0 &&
  	    !refcount_dec_and_test(&iocb->ki_refcnt))
  		return;

  	percpu_ref_put(&iocb->ki_ctx->reqs);
  	kmem_cache_free(kiocb_cachep, iocb);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1003
1004
  /* aio_complete
   *	Called when the io request on the given iocb is complete.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1005
   */
54843f875   Christoph Hellwig   aio: refactor rea...
1006
  static void aio_complete(struct aio_kiocb *iocb, long res, long res2)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1007
1008
  {
  	struct kioctx	*ctx = iocb->ki_ctx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1009
  	struct aio_ring	*ring;
21b40200c   Kent Overstreet   aio: use flush_dc...
1010
  	struct io_event	*ev_page, *event;
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
1011
  	unsigned tail, pos, head;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1012
  	unsigned long	flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1013

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1014
  	/*
0460fef2a   Kent Overstreet   aio: use cancella...
1015
  	 * Add a completion event to the ring buffer. Must be done holding
4b30f07e7   Tang Chen   aio: fix wrong co...
1016
  	 * ctx->completion_lock to prevent other code from messing with the tail
0460fef2a   Kent Overstreet   aio: use cancella...
1017
1018
1019
  	 * pointer since we might be called from irq context.
  	 */
  	spin_lock_irqsave(&ctx->completion_lock, flags);
58c85dc20   Kent Overstreet   aio: kill struct ...
1020
  	tail = ctx->tail;
21b40200c   Kent Overstreet   aio: use flush_dc...
1021
  	pos = tail + AIO_EVENTS_OFFSET;
58c85dc20   Kent Overstreet   aio: kill struct ...
1022
  	if (++tail >= ctx->nr_events)
4bf69b2a0   Kenneth W Chen   [PATCH] aio: ring...
1023
  		tail = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1024

58c85dc20   Kent Overstreet   aio: kill struct ...
1025
  	ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
21b40200c   Kent Overstreet   aio: use flush_dc...
1026
  	event = ev_page + pos % AIO_EVENTS_PER_PAGE;
04b2fa9f8   Christoph Hellwig   fs: split generic...
1027
  	event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1028
1029
1030
  	event->data = iocb->ki_user_data;
  	event->res = res;
  	event->res2 = res2;
21b40200c   Kent Overstreet   aio: use flush_dc...
1031
  	kunmap_atomic(ev_page);
58c85dc20   Kent Overstreet   aio: kill struct ...
1032
  	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
21b40200c   Kent Overstreet   aio: use flush_dc...
1033
1034
1035
  
  	pr_debug("%p[%u]: %p: %p %Lx %lx %lx
  ",
04b2fa9f8   Christoph Hellwig   fs: split generic...
1036
  		 ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
caf4167aa   Kent Overstreet   aio: dprintk() ->...
1037
  		 res, res2);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1038
1039
1040
1041
1042
  
  	/* after flagging the request as done, we
  	 * must never even look at it again
  	 */
  	smp_wmb();	/* make event visible before updating tail */
58c85dc20   Kent Overstreet   aio: kill struct ...
1043
  	ctx->tail = tail;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1044

58c85dc20   Kent Overstreet   aio: kill struct ...
1045
  	ring = kmap_atomic(ctx->ring_pages[0]);
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
1046
  	head = ring->head;
21b40200c   Kent Overstreet   aio: use flush_dc...
1047
  	ring->tail = tail;
e8e3c3d66   Cong Wang   fs: remove the se...
1048
  	kunmap_atomic(ring);
58c85dc20   Kent Overstreet   aio: kill struct ...
1049
  	flush_dcache_page(ctx->ring_pages[0]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1050

d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
1051
1052
1053
  	ctx->completed_events++;
  	if (ctx->completed_events > 1)
  		refill_reqs_available(ctx, head, tail);
0460fef2a   Kent Overstreet   aio: use cancella...
1054
  	spin_unlock_irqrestore(&ctx->completion_lock, flags);
21b40200c   Kent Overstreet   aio: use flush_dc...
1055
1056
  	pr_debug("added to ring %p at [%u]
  ", iocb, tail);
8d1c98b0b   Davide Libenzi   eventfd/kaio inte...
1057
1058
1059
1060
1061
1062
  
  	/*
  	 * Check if the user asked us to deliver the result through an
  	 * eventfd. The eventfd_signal() function is safe to be called
  	 * from IRQ context.
  	 */
54843f875   Christoph Hellwig   aio: refactor rea...
1063
  	if (iocb->ki_eventfd) {
8d1c98b0b   Davide Libenzi   eventfd/kaio inte...
1064
  		eventfd_signal(iocb->ki_eventfd, 1);
54843f875   Christoph Hellwig   aio: refactor rea...
1065
1066
  		eventfd_ctx_put(iocb->ki_eventfd);
  	}
8d1c98b0b   Davide Libenzi   eventfd/kaio inte...
1067

6cb2a2104   Quentin Barnes   aio: bad AIO race...
1068
1069
1070
1071
1072
1073
1074
  	/*
  	 * We have to order our ring_info tail store above and test
  	 * of the wait list below outside the wait lock.  This is
  	 * like in wake_up_bit() where clearing a bit has to be
  	 * ordered with the unlocked test.
  	 */
  	smp_mb();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1075
1076
  	if (waitqueue_active(&ctx->wait))
  		wake_up(&ctx->wait);
9018ccc45   Christoph Hellwig   aio: add a iocb r...
1077
  	iocb_put(iocb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1078
  }
2be4e7dee   Gu Zheng   aio: fix some com...
1079
  /* aio_read_events_ring
a31ad380b   Kent Overstreet   aio: make aio_rea...
1080
1081
   *	Pull an event off of the ioctx's event ring.  Returns the number of
   *	events fetched
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1082
   */
a31ad380b   Kent Overstreet   aio: make aio_rea...
1083
1084
  static long aio_read_events_ring(struct kioctx *ctx,
  				 struct io_event __user *event, long nr)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1085
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1086
  	struct aio_ring *ring;
5ffac122d   Kent Overstreet   aio: Don't use ct...
1087
  	unsigned head, tail, pos;
a31ad380b   Kent Overstreet   aio: make aio_rea...
1088
1089
  	long ret = 0;
  	int copy_ret;
9c9ce763b   Dave Chinner   aio: annotate aio...
1090
1091
1092
1093
1094
1095
1096
  	/*
  	 * The mutex can block and wake us up and that will cause
  	 * wait_event_interruptible_hrtimeout() to schedule without sleeping
  	 * and repeat. This should be rare enough that it doesn't cause
  	 * peformance issues. See the comment in read_events() for more detail.
  	 */
  	sched_annotate_sleep();
58c85dc20   Kent Overstreet   aio: kill struct ...
1097
  	mutex_lock(&ctx->ring_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1098

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
1099
  	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
58c85dc20   Kent Overstreet   aio: kill struct ...
1100
  	ring = kmap_atomic(ctx->ring_pages[0]);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1101
  	head = ring->head;
5ffac122d   Kent Overstreet   aio: Don't use ct...
1102
  	tail = ring->tail;
a31ad380b   Kent Overstreet   aio: make aio_rea...
1103
  	kunmap_atomic(ring);
2ff396be6   Jeff Moyer   aio: add missing ...
1104
1105
1106
1107
1108
  	/*
  	 * Ensure that once we've read the current tail pointer, that
  	 * we also see the events that were stored up to the tail.
  	 */
  	smp_rmb();
5ffac122d   Kent Overstreet   aio: Don't use ct...
1109
1110
  	pr_debug("h%u t%u m%u
  ", head, tail, ctx->nr_events);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1111

5ffac122d   Kent Overstreet   aio: Don't use ct...
1112
  	if (head == tail)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1113
  		goto out;
edfbbf388   Benjamin LaHaise   aio: fix kernel m...
1114
1115
  	head %= ctx->nr_events;
  	tail %= ctx->nr_events;
a31ad380b   Kent Overstreet   aio: make aio_rea...
1116
1117
1118
1119
  	while (ret < nr) {
  		long avail;
  		struct io_event *ev;
  		struct page *page;
5ffac122d   Kent Overstreet   aio: Don't use ct...
1120
1121
  		avail = (head <= tail ?  tail : ctx->nr_events) - head;
  		if (head == tail)
a31ad380b   Kent Overstreet   aio: make aio_rea...
1122
  			break;
a31ad380b   Kent Overstreet   aio: make aio_rea...
1123
  		pos = head + AIO_EVENTS_OFFSET;
58c85dc20   Kent Overstreet   aio: kill struct ...
1124
  		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
a31ad380b   Kent Overstreet   aio: make aio_rea...
1125
  		pos %= AIO_EVENTS_PER_PAGE;
d2988bd41   Al Viro   aio_read_events_r...
1126
1127
  		avail = min(avail, nr - ret);
  		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
  		ev = kmap(page);
  		copy_ret = copy_to_user(event + ret, ev + pos,
  					sizeof(*ev) * avail);
  		kunmap(page);
  
  		if (unlikely(copy_ret)) {
  			ret = -EFAULT;
  			goto out;
  		}
  
  		ret += avail;
  		head += avail;
58c85dc20   Kent Overstreet   aio: kill struct ...
1140
  		head %= ctx->nr_events;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1141
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1142

58c85dc20   Kent Overstreet   aio: kill struct ...
1143
  	ring = kmap_atomic(ctx->ring_pages[0]);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1144
  	ring->head = head;
91d80a84b   Zhao Hongjiang   aio: fix possible...
1145
  	kunmap_atomic(ring);
58c85dc20   Kent Overstreet   aio: kill struct ...
1146
  	flush_dcache_page(ctx->ring_pages[0]);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1147

5ffac122d   Kent Overstreet   aio: Don't use ct...
1148
1149
  	pr_debug("%li  h%u t%u
  ", ret, head, tail);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1150
  out:
58c85dc20   Kent Overstreet   aio: kill struct ...
1151
  	mutex_unlock(&ctx->ring_lock);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1152

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1153
1154
  	return ret;
  }
a31ad380b   Kent Overstreet   aio: make aio_rea...
1155
1156
  /*
   * aio_read_events - perform one collection step for read_events()
   * @ctx:	context whose completion ring is read
   * @min_nr:	number of events the caller wants before it stops waiting
   * @nr:	capacity of the user buffer, in events
   * @event:	user buffer that receives the events
   * @i:	in/out running total; holds the number of events already copied
   *		on entry and is updated on return
   *
   * Asks aio_read_events_ring() for up to nr - *i further events, stored at
   * event + *i.  A positive result is added to *i.  If ctx->dead is set the
   * step result becomes -EINVAL.  When *i is still zero afterwards, the step
   * result is stored in *i, which is how an error code reaches the caller.
   *
   * Returns true when the caller should stop waiting: either the step failed
   * or at least min_nr events have been collected.
   */
  static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
  			    struct io_event __user *event, long *i)
  {
  	long ret = aio_read_events_ring(ctx, event + *i, nr - *i);

  	if (ret > 0)
  		*i += ret;

  	if (unlikely(atomic_read(&ctx->dead)))
  		ret = -EINVAL;

  	/* nothing collected so far: hand the step result (0 or error) back */
  	if (!*i)
  		*i = ret;

  	return ret < 0 || *i >= min_nr;
  }
a31ad380b   Kent Overstreet   aio: make aio_rea...
1171
  static long read_events(struct kioctx *ctx, long min_nr, long nr,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1172
  			struct io_event __user *event,
fa2e62a54   Deepa Dinamani   io_getevents: Use...
1173
  			ktime_t until)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1174
  {
a31ad380b   Kent Overstreet   aio: make aio_rea...
1175
  	long ret = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1176

a31ad380b   Kent Overstreet   aio: make aio_rea...
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
  	/*
  	 * Note that aio_read_events() is being called as the conditional - i.e.
  	 * we're calling it after prepare_to_wait() has set task state to
  	 * TASK_INTERRUPTIBLE.
  	 *
  	 * But aio_read_events() can block, and if it blocks it's going to flip
  	 * the task state back to TASK_RUNNING.
  	 *
  	 * This should be ok, provided it doesn't flip the state back to
  	 * TASK_RUNNING and return 0 too much - that causes us to spin. That
  	 * will only happen if the mutex_lock() call blocks, and we then find
  	 * the ringbuffer empty. So in practice we should be ok, but it's
  	 * something to be aware of when touching this code.
  	 */
2456e8553   Thomas Gleixner   ktime: Get rid of...
1191
  	if (until == 0)
5f785de58   Fam Zheng   aio: Skip timer f...
1192
1193
1194
1195
1196
  		aio_read_events(ctx, min_nr, nr, event, &ret);
  	else
  		wait_event_interruptible_hrtimeout(ctx->wait,
  				aio_read_events(ctx, min_nr, nr, event, &ret),
  				until);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1197
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1198
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
  /* sys_io_setup:
   *	Create an aio_context capable of receiving at least nr_events.
   *	ctxp must not point to an aio_context that already exists, and
   *	must be initialized to 0 prior to the call.  On successful
   *	creation of the aio_context, *ctxp is filled in with the resulting 
   *	handle.  May fail with -EINVAL if *ctxp is not initialized,
   *	if the specified nr_events exceeds internal limits.  May fail 
   *	with -EAGAIN if the specified nr_events exceeds the user's limit 
   *	of available events.  May fail with -ENOMEM if insufficient kernel
   *	resources are available.  May fail with -EFAULT if an invalid
   *	pointer is passed for ctxp.  Will fail with -ENOSYS if not
   *	implemented.
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1212
  SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
  {
  	struct kioctx *ioctx = NULL;
  	unsigned long ctx;
  	long ret;
  
  	ret = get_user(ctx, ctxp);
  	if (unlikely(ret))
  		goto out;
  
  	ret = -EINVAL;
d55b5fdaf   Zach Brown   [PATCH] aio: remo...
1223
  	if (unlikely(ctx || nr_events == 0)) {
acd88d4e1   Kinglong Mee   fs/aio.c: Remove ...
1224
1225
  		pr_debug("EINVAL: ctx %lu nr_events %u
  ",
d55b5fdaf   Zach Brown   [PATCH] aio: remo...
1226
  		         ctx, nr_events);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1227
1228
1229
1230
1231
1232
1233
  		goto out;
  	}
  
  	ioctx = ioctx_alloc(nr_events);
  	ret = PTR_ERR(ioctx);
  	if (!IS_ERR(ioctx)) {
  		ret = put_user(ioctx->user_id, ctxp);
a2e1859ad   Al Viro   aio: take final p...
1234
  		if (ret)
e02ba72aa   Anatol Pomozov   aio: block io_des...
1235
  			kill_ioctx(current->mm, ioctx, NULL);
723be6e39   Kent Overstreet   aio: percpu ioctx...
1236
  		percpu_ref_put(&ioctx->users);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1237
1238
1239
1240
1241
  	}
  
  out:
  	return ret;
  }
c00d2c7e8   Al Viro   move aio compat t...
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
  #ifdef CONFIG_COMPAT
  /*
   * Compat variant of io_setup: identical flow, except that the context
   * handle is read from and written to a 32-bit user slot.
   *
   * Returns 0 on success, the get_user()/put_user() error on a faulting
   * pointer, -EINVAL if the slot is non-zero or nr_events is 0, or the error
   * encoded in the pointer returned by ioctx_alloc().
   */
  COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p)
  {
  	struct kioctx *ioctx = NULL;
  	unsigned long ctx;
  	long ret;

  	ret = get_user(ctx, ctx32p);
  	if (unlikely(ret))
  		goto out;

  	ret = -EINVAL;
  	if (unlikely(ctx || nr_events == 0)) {
  		pr_debug("EINVAL: ctx %lu nr_events %u\n",
  		         ctx, nr_events);
  		goto out;
  	}

  	ioctx = ioctx_alloc(nr_events);
  	ret = PTR_ERR(ioctx);
  	if (!IS_ERR(ioctx)) {
  		/* truncating is ok because it's a user address */
  		ret = put_user((u32)ioctx->user_id, ctx32p);
  		/* could not report the handle to userspace: tear it down */
  		if (ret)
  			kill_ioctx(current->mm, ioctx, NULL);
  		percpu_ref_put(&ioctx->users);
  	}

  out:
  	return ret;
  }
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1275
1276
1277
  /* sys_io_destroy:
   *	Destroy the aio_context specified.  May cancel any outstanding 
   *	AIOs and block on completion.  Will fail with -ENOSYS if not
642b5123a   Satoru Takeuchi   aio: fix wrong su...
1278
   *	implemented.  May fail with -EINVAL if the context pointed to
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1279
1280
   *	is invalid.
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1281
  SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1282
1283
1284
  {
  	struct kioctx *ioctx = lookup_ioctx(ctx);
  	if (likely(NULL != ioctx)) {
dc48e56d7   Jens Axboe   aio: fix serial d...
1285
  		struct ctx_rq_wait wait;
fb2d44838   Benjamin LaHaise   aio: report error...
1286
  		int ret;
e02ba72aa   Anatol Pomozov   aio: block io_des...
1287

dc48e56d7   Jens Axboe   aio: fix serial d...
1288
1289
  		init_completion(&wait.comp);
  		atomic_set(&wait.count, 1);
e02ba72aa   Anatol Pomozov   aio: block io_des...
1290
1291
1292
1293
  		/* Pass requests_done to kill_ioctx() where it can be set
  		 * in a thread-safe way. If we try to set it here then we have
  		 * a race condition if two io_destroy() called simultaneously.
  		 */
dc48e56d7   Jens Axboe   aio: fix serial d...
1294
  		ret = kill_ioctx(current->mm, ioctx, &wait);
723be6e39   Kent Overstreet   aio: percpu ioctx...
1295
  		percpu_ref_put(&ioctx->users);
e02ba72aa   Anatol Pomozov   aio: block io_des...
1296
1297
1298
1299
1300
  
  		/* Wait until all IO for the context are done. Otherwise kernel
  		 * keep using user-space buffers even if user thinks the context
  		 * is destroyed.
  		 */
fb2d44838   Benjamin LaHaise   aio: report error...
1301
  		if (!ret)
dc48e56d7   Jens Axboe   aio: fix serial d...
1302
  			wait_for_completion(&wait.comp);
e02ba72aa   Anatol Pomozov   aio: block io_des...
1303

fb2d44838   Benjamin LaHaise   aio: report error...
1304
  		return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1305
  	}
acd88d4e1   Kinglong Mee   fs/aio.c: Remove ...
1306
1307
  	pr_debug("EINVAL: invalid context id
  ");
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1308
1309
  	return -EINVAL;
  }
3c96c7f4c   Al Viro   aio: take list re...
1310
1311
1312
1313
1314
1315
1316
1317
1318
  /*
   * Unlink @iocb from the list it sits on via ki_list, holding the owning
   * context's ctx_lock with interrupts saved and disabled for the duration.
   */
  static void aio_remove_iocb(struct aio_kiocb *iocb)
  {
  	unsigned long irq_state;
  	struct kioctx *owner = iocb->ki_ctx;

  	spin_lock_irqsave(&owner->ctx_lock, irq_state);
  	list_del(&iocb->ki_list);
  	spin_unlock_irqrestore(&owner->ctx_lock, irq_state);
  }
54843f875   Christoph Hellwig   aio: refactor rea...
1319
1320
1321
  static void aio_complete_rw(struct kiocb *kiocb, long res, long res2)
  {
  	struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw);
3c96c7f4c   Al Viro   aio: take list re...
1322
1323
  	if (!list_empty_careful(&iocb->ki_list))
  		aio_remove_iocb(iocb);
54843f875   Christoph Hellwig   aio: refactor rea...
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
  	if (kiocb->ki_flags & IOCB_WRITE) {
  		struct inode *inode = file_inode(kiocb->ki_filp);
  
  		/*
  		 * Tell lockdep we inherited freeze protection from submission
  		 * thread.
  		 */
  		if (S_ISREG(inode->i_mode))
  			__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
  		file_end_write(kiocb->ki_filp);
  	}
  
  	fput(kiocb->ki_filp);
  	aio_complete(iocb, res, res2);
  }
  
  static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
  {
  	int ret;
  
  	req->ki_filp = fget(iocb->aio_fildes);
  	if (unlikely(!req->ki_filp))
  		return -EBADF;
  	req->ki_complete = aio_complete_rw;
  	req->ki_pos = iocb->aio_offset;
  	req->ki_flags = iocb_flags(req->ki_filp);
  	if (iocb->aio_flags & IOCB_FLAG_RESFD)
  		req->ki_flags |= IOCB_EVENTFD;
fc28724d6   Adam Manzanares   fs: Convert kiocb...
1352
  	req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp));
d9a08a9e6   Adam Manzanares   fs: Add aio iopri...
1353
1354
1355
1356
1357
1358
1359
1360
  	if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
  		/*
  		 * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then
  		 * aio_reqprio is interpreted as an I/O scheduling
  		 * class and priority.
  		 */
  		ret = ioprio_check_cap(iocb->aio_reqprio);
  		if (ret) {
9a6d9a62e   Adam Manzanares   fs: aio ioprio us...
1361
1362
  			pr_debug("aio ioprio check cap error: %d
  ", ret);
df66ef67c   Jens Axboe   aio: fix failure ...
1363
  			fput(req->ki_filp);
9a6d9a62e   Adam Manzanares   fs: aio ioprio us...
1364
  			return ret;
d9a08a9e6   Adam Manzanares   fs: Add aio iopri...
1365
1366
1367
1368
1369
  		}
  
  		req->ki_ioprio = iocb->aio_reqprio;
  	} else
  		req->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
54843f875   Christoph Hellwig   aio: refactor rea...
1370
1371
1372
1373
1374
  	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
  	if (unlikely(ret))
  		fput(req->ki_filp);
  	return ret;
  }
89319d31d   Christoph Hellwig   fs: remove aio_ru...
1375
1376
  /*
   * aio_setup_rw - build the iov_iter describing the user buffer(s)
   * @rw:	direction (READ or WRITE), handed to the import helpers
   * @iocb:	kernel copy of the user's iocb; aio_buf and aio_nbytes are used
   * @iovec:	in: caller-provided iovec storage; out: see below
   * @vectored:	false to import aio_buf/aio_nbytes as one range, true to pass
   *		them to the iovec import helpers
   * @compat:	use compat_import_iovec() (only when CONFIG_COMPAT is set)
   * @iter:	iterator to initialise
   *
   * In the non-vectored case *iovec is set to NULL after the import, so a
   * later kfree(*iovec) by the caller does nothing.  In the vectored case
   * *iovec is left to import_iovec() / compat_import_iovec().
   *
   * Returns the result of the import helper that was used.
   */
  static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
  		bool vectored, bool compat, struct iov_iter *iter)
  {
  	void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
  	size_t len = iocb->aio_nbytes;

  	if (!vectored) {
  		ssize_t ret = import_single_range(rw, buf, len, *iovec, iter);
  		*iovec = NULL;
  		return ret;
  	}
  #ifdef CONFIG_COMPAT
  	if (compat)
  		return compat_import_iovec(rw, buf, len, UIO_FASTIOV, iovec,
  				iter);
  #endif
  	return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);
  }
9061d14a8   Al Viro   aio: all callers ...
1393
  /*
   * aio_rw_done - act on the return value of ->read_iter()/->write_iter()
   * @req:	the kiocb that was submitted
   * @ret:	value returned by the read/write method
   *
   * -EIOCBQUEUED needs no action here.  Any other value is reported
   * immediately through aio_complete_rw(), with the syscall-restart codes
   * converted to -EINTR first.
   */
  static inline void aio_rw_done(struct kiocb *req, ssize_t ret)
  {
  	switch (ret) {
  	case -EIOCBQUEUED:
  		break;
  	case -ERESTARTSYS:
  	case -ERESTARTNOINTR:
  	case -ERESTARTNOHAND:
  	case -ERESTART_RESTARTBLOCK:
  		/*
  		 * There's no easy way to restart the syscall since other AIO's
  		 * may be already running. Just fail this IO with EINTR.
  		 */
  		ret = -EINTR;
  		/*FALLTHRU*/
  	default:
  		aio_complete_rw(req, ret, 0);
  	}
  }
  
  /*
   * aio_read - submit a (possibly vectored) read
   * @req:	kiocb to use for the request
   * @iocb:	kernel copy of the user's iocb
   * @vectored:	true for the vectored variant
   * @compat:	true when called on behalf of a compat task
   *
   * Returns 0 once the request has been handed to ->read_iter(); its result
   * is then delivered via aio_rw_done().  Returns a negative error if the
   * request could not be set up.  On errors detected after aio_prep_rw()
   * succeeded, the file reference it took is dropped here.
   */
  static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
  		bool compat)
  {
  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
  	struct iov_iter iter;
  	struct file *file;
  	ssize_t ret;

  	ret = aio_prep_rw(req, iocb);
  	if (ret)
  		return ret;
  	file = req->ki_filp;

  	ret = -EBADF;
  	if (unlikely(!(file->f_mode & FMODE_READ)))
  		goto out_fput;
  	ret = -EINVAL;
  	if (unlikely(!file->f_op->read_iter))
  		goto out_fput;

  	ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
  	if (ret)
  		goto out_fput;
  	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
  	if (!ret)
  		aio_rw_done(req, call_read_iter(file, req, &iter));
  	kfree(iovec);
  out_fput:
  	/* ret is only non-zero here if the request was never issued */
  	if (unlikely(ret))
  		fput(file);
  	return ret;
  }
73a7075e3   Kent Overstreet   aio: Kill aio_rw_...
1445

89319d31d   Christoph Hellwig   fs: remove aio_ru...
1446
1447
1448
  /*
   * aio_write - submit a (possibly vectored) write
   * @req:	kiocb to use for the request
   * @iocb:	kernel copy of the user's iocb
   * @vectored:	true for the vectored variant
   * @compat:	true when called on behalf of a compat task
   *
   * Returns 0 once the request has been handed to ->write_iter(); its result
   * is then delivered via aio_rw_done().  Returns a negative error if the
   * request could not be set up.  On errors detected after aio_prep_rw()
   * succeeded, the file reference it took is dropped here.
   */
  static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
  		bool compat)
  {
  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
  	struct iov_iter iter;
  	struct file *file;
  	ssize_t ret;

  	ret = aio_prep_rw(req, iocb);
  	if (ret)
  		return ret;
  	file = req->ki_filp;

  	ret = -EBADF;
  	if (unlikely(!(file->f_mode & FMODE_WRITE)))
  		goto out_fput;
  	ret = -EINVAL;
  	if (unlikely(!file->f_op->write_iter))
  		goto out_fput;

  	ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
  	if (ret)
  		goto out_fput;
  	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
  	if (!ret) {
  		/*
  		 * Open-code file_start_write here to grab freeze protection,
  		 * which will be released by another thread in
  		 * aio_complete_rw().  Fool lockdep by telling it the lock got
  		 * released so that it doesn't complain about the held lock when
  		 * we return to userspace.
  		 */
  		if (S_ISREG(file_inode(file)->i_mode)) {
  			__sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true);
  			__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
  		}
  		/* lets aio_complete_rw() know it must end the write section */
  		req->ki_flags |= IOCB_WRITE;
  		aio_rw_done(req, call_write_iter(file, req, &iter));
  	}
  	kfree(iovec);
  out_fput:
  	/* ret is only non-zero here if the request was never issued */
  	if (unlikely(ret))
  		fput(file);
  	return ret;
  }
a3c0d439e   Christoph Hellwig   aio: implement IO...
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
  /*
   * Work handler for fsync requests: syncs the file, drops the file
   * reference held by the request and reports the vfs_fsync() result
   * through aio_complete().
   */
  static void aio_fsync_work(struct work_struct *work)
  {
  	struct fsync_iocb *sync_req =
  		container_of(work, struct fsync_iocb, work);
  	struct aio_kiocb *owner =
  		container_of(sync_req, struct aio_kiocb, fsync);
  	int err = vfs_fsync(sync_req->file, sync_req->datasync);

  	fput(sync_req->file);
  	aio_complete(owner, err, 0);
  }
  
  /*
   * aio_fsync - queue an fsync/fdatasync request
   * @req:	fsync part of the request
   * @iocb:	kernel copy of the user's iocb
   * @datasync:	value stored in req->datasync and later passed to vfs_fsync()
   *
   * The sync itself runs from aio_fsync_work() on the system workqueue.
   *
   * Returns 0 if the work was scheduled, -EINVAL if any of aio_buf,
   * aio_offset, aio_nbytes or aio_rw_flags is set or the file has no ->fsync
   * method, and -EBADF if the descriptor is invalid.
   */
  static int aio_fsync(struct fsync_iocb *req, struct iocb *iocb, bool datasync)
  {
  	if (unlikely(iocb->aio_buf || iocb->aio_offset || iocb->aio_nbytes ||
  			iocb->aio_rw_flags))
  		return -EINVAL;

  	req->file = fget(iocb->aio_fildes);
  	if (unlikely(!req->file))
  		return -EBADF;
  	if (unlikely(!req->file->f_op->fsync)) {
  		fput(req->file);
  		return -EINVAL;
  	}

  	req->datasync = datasync;
  	INIT_WORK(&req->work, aio_fsync_work);
  	schedule_work(&req->work);
  	return 0;
  }
bfe4037e7   Christoph Hellwig   aio: implement IO...
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
  /*
   * Report a finished poll request: the event mask is converted with
   * mangle_poll() and passed to aio_complete(), then the reference on the
   * polled file is dropped.
   */
  static inline void aio_poll_complete(struct aio_kiocb *iocb, __poll_t mask)
  {
  	/*
  	 * Read the file pointer up front.
  	 * NOTE(review): presumably because iocb must not be touched after
  	 * aio_complete() - confirm against aio_complete().
  	 */
  	struct file *file = iocb->poll.file;
  
  	aio_complete(iocb, mangle_poll(mask), 0);
  	fput(file);
  }
  
  /*
   * Work handler for poll requests.  Unless the request was cancelled, the
   * file is polled again for the requested events.  If nothing is ready and
   * the request is still not cancelled, the wait entry is put back on the
   * wait queue and the handler returns; otherwise the iocb is unlinked from
   * its list and completed with the mask found (0 when cancelled).
   */
  static void aio_poll_complete_work(struct work_struct *work)
  {
  	struct poll_iocb *req = container_of(work, struct poll_iocb, work);
  	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
  	/* only _key is set, so this poll does not register a queue callback */
  	struct poll_table_struct pt = { ._key = req->events };
  	struct kioctx *ctx = iocb->ki_ctx;
  	__poll_t mask = 0;
  
  	if (!READ_ONCE(req->cancelled))
  		mask = vfs_poll(req->file, &pt) & req->events;
  
  	/*
  	 * Note that ->ki_cancel callers also delete iocb from active_reqs after
  	 * calling ->ki_cancel.  We need the ctx_lock roundtrip here to
  	 * synchronize with them.  In the cancellation case the list_del_init
  	 * itself is not actually needed, but harmless so we keep it in to
  	 * avoid further branches in the fast path.
  	 */
  	spin_lock_irq(&ctx->ctx_lock);
  	if (!mask && !READ_ONCE(req->cancelled)) {
  		/* nothing ready yet: go back to waiting on the queue */
  		add_wait_queue(req->head, &req->wait);
  		spin_unlock_irq(&ctx->ctx_lock);
  		return;
  	}
  	list_del_init(&iocb->ki_list);
  	spin_unlock_irq(&ctx->ctx_lock);
  
  	aio_poll_complete(iocb, mask);
  }
  
  /*
   * Cancel callback for poll requests (installed as ki_cancel by aio_poll()).
   * Marks the request cancelled and, if its wait entry is still queued,
   * removes it and schedules the completion work.  Always returns 0.
   *
   * assumes we are called with irqs disabled
   */
  static int aio_poll_cancel(struct kiocb *iocb)
  {
  	/*
  	 * NOTE(review): the kiocb is located through member 'rw' while the
  	 * request is then used through member 'poll' - presumably both share
  	 * storage in struct aio_kiocb; confirm against its definition.
  	 */
  	struct aio_kiocb *aiocb = container_of(iocb, struct aio_kiocb, rw);
  	struct poll_iocb *req = &aiocb->poll;
  
  	spin_lock(&req->head->lock);
  	WRITE_ONCE(req->cancelled, true);
  	if (!list_empty(&req->wait.entry)) {
  		list_del_init(&req->wait.entry);
  		schedule_work(&aiocb->poll.work);
  	}
  	spin_unlock(&req->head->lock);
  
  	return 0;
  }
  
  static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
  		void *key)
  {
  	struct poll_iocb *req = container_of(wait, struct poll_iocb, wait);
e8693bcfa   Christoph Hellwig   aio: allow direct...
1579
  	struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
bfe4037e7   Christoph Hellwig   aio: implement IO...
1580
  	__poll_t mask = key_to_poll(key);
f5e66cdb5   Bart Van Assche   aio: Fix locking ...
1581
  	unsigned long flags;
bfe4037e7   Christoph Hellwig   aio: implement IO...
1582
1583
1584
1585
  
  	req->woken = true;
  
  	/* for instances that support it check for an event match first: */
e8693bcfa   Christoph Hellwig   aio: allow direct...
1586
1587
1588
  	if (mask) {
  		if (!(mask & req->events))
  			return 0;
f5e66cdb5   Bart Van Assche   aio: Fix locking ...
1589
1590
1591
1592
1593
1594
1595
  		/*
  		 * Try to complete the iocb inline if we can. Use
  		 * irqsave/irqrestore because not all filesystems (e.g. fuse)
  		 * call this function with IRQs disabled and because IRQs
  		 * have to be disabled before ctx_lock is obtained.
  		 */
  		if (spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
e8693bcfa   Christoph Hellwig   aio: allow direct...
1596
  			list_del(&iocb->ki_list);
f5e66cdb5   Bart Van Assche   aio: Fix locking ...
1597
  			spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
e8693bcfa   Christoph Hellwig   aio: allow direct...
1598
1599
1600
1601
1602
1603
  
  			list_del_init(&req->wait.entry);
  			aio_poll_complete(iocb, mask);
  			return 1;
  		}
  	}
bfe4037e7   Christoph Hellwig   aio: implement IO...
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
  
  	list_del_init(&req->wait.entry);
  	schedule_work(&req->work);
  	return 1;
  }
  
  /*
   * Poll table handed to vfs_poll() by aio_poll(); aio_poll_queue_proc()
   * recovers this wrapper from the embedded poll_table_struct.
   */
  struct aio_poll_table {
  	struct poll_table_struct	pt;	/* embedded table passed to vfs_poll() */
  	struct aio_kiocb		*iocb;	/* request being set up */
  	int				error;	/* 0 once a wait queue was registered, else -EINVAL */
  };
  
  /*
   * Queueing callback installed as pt._qproc by aio_poll().  Records the wait
   * queue head in the poll request and adds the request's wait entry to it.
   * A second call for the same request is refused by setting pt->error to
   * -EINVAL; the first successful call sets pt->error to 0.
   */
  static void
  aio_poll_queue_proc(struct file *file, struct wait_queue_head *head,
  		struct poll_table_struct *p)
  {
  	struct aio_poll_table *pt = container_of(p, struct aio_poll_table, pt);
  
  	/* multiple wait queues per file are not supported */
  	if (unlikely(pt->iocb->poll.head)) {
  		pt->error = -EINVAL;
  		return;
  	}
  
  	pt->error = 0;
  	pt->iocb->poll.head = head;
  	add_wait_queue(head, &pt->iocb->poll.wait);
  }
  
  /*
   * aio_poll - submit an IOCB_CMD_POLL request
   * @aiocb:	the request
   * @iocb:	kernel copy of the user's iocb; aio_buf carries the event mask
   *
   * Polls the file once with a table that registers aio_poll_wake() on the
   * file's wait queue.  If events are already pending the request is
   * completed right away; otherwise it is put on ctx->active_reqs with
   * aio_poll_cancel() as its cancel handler and completes from the wakeup.
   *
   * Returns 0 if the request was completed or queued, -EINVAL for an event
   * mask wider than 16 bits or non-zero aio_offset/aio_nbytes/aio_rw_flags,
   * -EBADF for an invalid descriptor, or apt.error (-EINVAL when no single
   * wait queue could be registered).
   */
  static ssize_t aio_poll(struct aio_kiocb *aiocb, struct iocb *iocb)
  {
  	struct kioctx *ctx = aiocb->ki_ctx;
  	struct poll_iocb *req = &aiocb->poll;
  	struct aio_poll_table apt;
  	__poll_t mask;
  
  	/* reject any unknown events outside the normal event mask. */
  	if ((u16)iocb->aio_buf != iocb->aio_buf)
  		return -EINVAL;
  	/* reject fields that are not defined for poll */
  	if (iocb->aio_offset || iocb->aio_nbytes || iocb->aio_rw_flags)
  		return -EINVAL;
  
  	INIT_WORK(&req->work, aio_poll_complete_work);
  	/* EPOLLERR and EPOLLHUP are always added to the requested events */
  	req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
  	req->file = fget(iocb->aio_fildes);
  	if (unlikely(!req->file))
  		return -EBADF;
  
  	apt.pt._qproc = aio_poll_queue_proc;
  	apt.pt._key = req->events;
  	apt.iocb = aiocb;
  	apt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */
  
  	/* initialized the list so that we can do list_empty checks */
  	INIT_LIST_HEAD(&req->wait.entry);
  	init_waitqueue_func_entry(&req->wait, aio_poll_wake);
  
  	/* one for removal from waitqueue, one for this function */
  	refcount_set(&aiocb->ki_refcnt, 2);
  
  	/*
  	 * NOTE(review): req->head, req->woken and req->cancelled are read
  	 * below without being set in this function - presumably the request
  	 * arrives zeroed from aio_get_req(); confirm.
  	 */
  	mask = vfs_poll(req->file, &apt.pt) & req->events;
  	if (unlikely(!req->head)) {
  		/* we did not manage to set up a waitqueue, done */
  		goto out;
  	}
  
  	spin_lock_irq(&ctx->ctx_lock);
  	spin_lock(&req->head->lock);
  	if (req->woken) {
  		/* wake_up context handles the rest */
  		mask = 0;
  		apt.error = 0;
  	} else if (mask || apt.error) {
  		/* if we get an error or a mask we are done */
  		WARN_ON_ONCE(list_empty(&req->wait.entry));
  		list_del_init(&req->wait.entry);
  	} else {
  		/* actually waiting for an event */
  		list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
  		aiocb->ki_cancel = aio_poll_cancel;
  	}
  	spin_unlock(&req->head->lock);
  	spin_unlock_irq(&ctx->ctx_lock);
  
  out:
  	if (unlikely(apt.error)) {
  		fput(req->file);
  		return apt.error;
  	}
  
  	if (mask)
  		aio_poll_complete(aiocb, mask);
  	/* drop the reference taken "for this function" above */
  	iocb_put(aiocb);
  	return 0;
  }
d5470b596   Adrian Bunk   fs/aio.c: make 3 ...
1700
  /*
   * io_submit_one - validate and submit a single iocb
   * @ctx:	context the request is submitted to
   * @user_iocb:	user pointer to the iocb
   * @compat:	true when called on behalf of a compat task
   *
   * Copies the iocb from userspace, validates it, allocates a request,
   * optionally attaches an eventfd, writes KIOCB_KEY into the user's aio_key
   * field and dispatches on aio_lio_opcode.
   *
   * Returns 0 if the request was completed or queued.  Returns -EFAULT if
   * the iocb cannot be copied in, -EINVAL for a set reserved field, an
   * overflowing buffer/length or an unknown opcode, -EAGAIN if no request
   * could be allocated, or the error from the eventfd lookup, put_user() or
   * the per-opcode handler.  On any error after allocation the request is
   * released here.
   */
  static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
  			 bool compat)
  {
  	struct aio_kiocb *req;
  	struct iocb iocb;
  	ssize_t ret;

  	if (unlikely(copy_from_user(&iocb, user_iocb, sizeof(iocb))))
  		return -EFAULT;

  	/* enforce forwards compatibility on users */
  	if (unlikely(iocb.aio_reserved2)) {
  		pr_debug("EINVAL: reserve field set\n");
  		return -EINVAL;
  	}

  	/* prevent overflows */
  	if (unlikely(
  	    (iocb.aio_buf != (unsigned long)iocb.aio_buf) ||
  	    (iocb.aio_nbytes != (size_t)iocb.aio_nbytes) ||
  	    ((ssize_t)iocb.aio_nbytes < 0)
  	   )) {
  		pr_debug("EINVAL: overflow check\n");
  		return -EINVAL;
  	}

  	req = aio_get_req(ctx);
  	if (unlikely(!req))
  		return -EAGAIN;

  	if (iocb.aio_flags & IOCB_FLAG_RESFD) {
  		/*
  		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
  		 * instance of the file* now. The file descriptor must be
  		 * an eventfd() fd, and will be signaled for each completed
  		 * event using the eventfd_signal() function.
  		 */
  		req->ki_eventfd = eventfd_ctx_fdget((int) iocb.aio_resfd);
  		if (IS_ERR(req->ki_eventfd)) {
  			ret = PTR_ERR(req->ki_eventfd);
  			/* keep the cleanup path from putting an error pointer */
  			req->ki_eventfd = NULL;
  			goto out_put_req;
  		}
  	}

  	ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
  	if (unlikely(ret)) {
  		pr_debug("EFAULT: aio_key\n");
  		goto out_put_req;
  	}

  	req->ki_user_iocb = user_iocb;
  	req->ki_user_data = iocb.aio_data;

  	switch (iocb.aio_lio_opcode) {
  	case IOCB_CMD_PREAD:
  		ret = aio_read(&req->rw, &iocb, false, compat);
  		break;
  	case IOCB_CMD_PWRITE:
  		ret = aio_write(&req->rw, &iocb, false, compat);
  		break;
  	case IOCB_CMD_PREADV:
  		ret = aio_read(&req->rw, &iocb, true, compat);
  		break;
  	case IOCB_CMD_PWRITEV:
  		ret = aio_write(&req->rw, &iocb, true, compat);
  		break;
  	case IOCB_CMD_FSYNC:
  		ret = aio_fsync(&req->fsync, &iocb, false);
  		break;
  	case IOCB_CMD_FDSYNC:
  		ret = aio_fsync(&req->fsync, &iocb, true);
  		break;
  	case IOCB_CMD_POLL:
  		ret = aio_poll(req, &iocb);
  		break;
  	default:
  		pr_debug("invalid aio operation %d\n", iocb.aio_lio_opcode);
  		ret = -EINVAL;
  		break;
  	}

  	/*
  	 * If ret is 0, we'd either done aio_complete() ourselves or have
  	 * arranged for that to be done asynchronously.  Anything non-zero
  	 * means that we need to destroy req ourselves.
  	 */
  	if (ret)
  		goto out_put_req;
  	return 0;
  out_put_req:
  	put_reqs_available(ctx, 1);
  	percpu_ref_put(&ctx->reqs);
  	if (req->ki_eventfd)
  		eventfd_ctx_put(req->ki_eventfd);
  	kmem_cache_free(kiocb_cachep, req);
  	return ret;
  }
67ba049f9   Al Viro   aio: fold do_io_s...
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
  /* sys_io_submit:
   *	Submit up to nr iocbs, taken from the user array iocbpp, to the
   *	aio_context identified by ctx_id.  nr is clamped to the context's
   *	nr_events before anything is queued.
   *
   *	Returns the number of iocbs queued.  If nothing was queued, the
   *	error that stopped submission is returned instead: -EINVAL if nr
   *	is < 0, if ctx_id does not name a valid aio_context, if the iocb
   *	at *iocbpp[0] is not properly initialized, or if the requested
   *	operation is invalid for the file descriptor in the iocb; -EFAULT
   *	if any of the data structures point to invalid data; -EBADF if the
   *	file descriptor specified in the first iocb is invalid; -EAGAIN if
   *	insufficient resources are available to queue any iocbs.  Returns
   *	0 if nr is 0.  Will fail with -ENOSYS if not implemented.
   */
  SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
  		struct iocb __user * __user *, iocbpp)
  {
  	struct blk_plug plug;
  	struct kioctx *ioctx;
  	long err = 0;
  	int queued = 0;

  	if (unlikely(nr < 0))
  		return -EINVAL;

  	ioctx = lookup_ioctx(ctx_id);
  	if (unlikely(!ioctx)) {
  		pr_debug("EINVAL: invalid context id\n");
  		return -EINVAL;
  	}

  	/* Clamp the batch size to the context's nr_events. */
  	if (nr > ioctx->nr_events)
  		nr = ioctx->nr_events;

  	blk_start_plug(&plug);
  	for (queued = 0; queued < nr; queued++) {
  		struct iocb __user *uiocb;

  		/* Fetch the next user iocb pointer from the array. */
  		if (unlikely(get_user(uiocb, iocbpp + queued))) {
  			err = -EFAULT;
  			break;
  		}

  		err = io_submit_one(ioctx, uiocb, false);
  		if (err)
  			break;
  	}
  	blk_finish_plug(&plug);

  	/* Drop the reference taken by lookup_ioctx(). */
  	percpu_ref_put(&ioctx->users);

  	/* A partial submission reports the count, not the error. */
  	if (queued)
  		return queued;
  	return err;
  }
c00d2c7e8   Al Viro   move aio compat t...
1844
  #ifdef CONFIG_COMPAT
  /*
   * Compat flavour of io_submit: iocbpp is an array of compat_uptr_t
   * values, each converted with compat_ptr() and handed to
   * io_submit_one() with the compat flag set.  Returns the number of
   * iocbs queued, or the error that stopped submission if none were.
   */
  COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
  		       int, nr, compat_uptr_t __user *, iocbpp)
  {
  	struct blk_plug plug;
  	struct kioctx *ioctx;
  	long err = 0;
  	int queued = 0;

  	if (unlikely(nr < 0))
  		return -EINVAL;

  	ioctx = lookup_ioctx(ctx_id);
  	if (unlikely(!ioctx)) {
  		pr_debug("EINVAL: invalid context id\n");
  		return -EINVAL;
  	}

  	/* Clamp the batch size to the context's nr_events. */
  	if (nr > ioctx->nr_events)
  		nr = ioctx->nr_events;

  	blk_start_plug(&plug);
  	for (queued = 0; queued < nr; queued++) {
  		compat_uptr_t uiocb;

  		/* Fetch the next 32-bit user iocb pointer from the array. */
  		if (unlikely(get_user(uiocb, iocbpp + queued))) {
  			err = -EFAULT;
  			break;
  		}

  		err = io_submit_one(ioctx, compat_ptr(uiocb), true);
  		if (err)
  			break;
  	}
  	blk_finish_plug(&plug);

  	/* Drop the reference taken by lookup_ioctx(). */
  	percpu_ref_put(&ioctx->users);

  	/* A partial submission reports the count, not the error. */
  	if (queued)
  		return queued;
  	return err;
  }
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1882
1883
  /* lookup_kiocb
   *	Finds a given iocb for cancellation.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1884
   */
04b2fa9f8   Christoph Hellwig   fs: split generic...
1885
  static struct aio_kiocb *
f3a2752a4   Christoph Hellwig   aio: simplify KIO...
1886
  lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1887
  {
04b2fa9f8   Christoph Hellwig   fs: split generic...
1888
  	struct aio_kiocb *kiocb;
d00689af6   Zach Brown   [PATCH] aio: repl...
1889
1890
  
  	assert_spin_locked(&ctx->ctx_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1891
  	/* TODO: use a hash or array, this sucks. */
04b2fa9f8   Christoph Hellwig   fs: split generic...
1892
1893
  	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
  		if (kiocb->ki_user_iocb == iocb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
  			return kiocb;
  	}
  	return NULL;
  }
  
  /* sys_io_cancel:
   *	Attempts to cancel an iocb previously passed to io_submit.  If
   *	the operation is successfully cancelled, the resulting event is
   *	copied into the memory pointed to by result without being placed
   *	into the completion queue and 0 is returned.  May fail with
   *	-EFAULT if any of the data structures pointed to are invalid.
   *	May fail with -EINVAL if aio_context specified by ctx_id is
   *	invalid.  May fail with -EAGAIN if the iocb specified was not
   *	cancelled.  Will fail with -ENOSYS if not implemented.
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1909
1910
  SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
  		struct io_event __user *, result)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1911
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1912
  	struct kioctx *ctx;
04b2fa9f8   Christoph Hellwig   fs: split generic...
1913
  	struct aio_kiocb *kiocb;
888933f8f   Christoph Hellwig   aio: simplify can...
1914
  	int ret = -EINVAL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1915
  	u32 key;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1916

f3a2752a4   Christoph Hellwig   aio: simplify KIO...
1917
  	if (unlikely(get_user(key, &iocb->aio_key)))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1918
  		return -EFAULT;
f3a2752a4   Christoph Hellwig   aio: simplify KIO...
1919
1920
  	if (unlikely(key != KIOCB_KEY))
  		return -EINVAL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1921
1922
1923
1924
1925
1926
  
  	ctx = lookup_ioctx(ctx_id);
  	if (unlikely(!ctx))
  		return -EINVAL;
  
  	spin_lock_irq(&ctx->ctx_lock);
f3a2752a4   Christoph Hellwig   aio: simplify KIO...
1927
  	kiocb = lookup_kiocb(ctx, iocb);
888933f8f   Christoph Hellwig   aio: simplify can...
1928
1929
1930
1931
  	if (kiocb) {
  		ret = kiocb->ki_cancel(&kiocb->rw);
  		list_del_init(&kiocb->ki_list);
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1932
  	spin_unlock_irq(&ctx->ctx_lock);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1933
  	if (!ret) {
bec68faaf   Kent Overstreet   aio: io_cancel() ...
1934
1935
1936
1937
  		/*
  		 * The result argument is no longer used - the io_event is
  		 * always delivered via the ring buffer. -EINPROGRESS indicates
  		 * cancellation is progress:
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1938
  		 */
bec68faaf   Kent Overstreet   aio: io_cancel() ...
1939
  		ret = -EINPROGRESS;
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1940
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1941

723be6e39   Kent Overstreet   aio: percpu ioctx...
1942
  	percpu_ref_put(&ctx->users);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1943
1944
1945
  
  	return ret;
  }
fa2e62a54   Deepa Dinamani   io_getevents: Use...
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
  /*
   * Common backend for the io_getevents family of syscalls.  Looks up the
   * context named by ctx_id and calls read_events() on it.  A NULL ts
   * means no deadline (KTIME_MAX).  Returns whatever read_events()
   * returns, or -EINVAL if the context is invalid or the min_nr/nr pair
   * is out of range (min_nr negative or greater than nr).
   */
  static long do_io_getevents(aio_context_t ctx_id,
  		long min_nr,
  		long nr,
  		struct io_event __user *events,
  		struct timespec64 *ts)
  {
  	ktime_t until = ts ? timespec64_to_ktime(*ts) : KTIME_MAX;
  	struct kioctx *ioctx = lookup_ioctx(ctx_id);
  	long ret;

  	if (unlikely(!ioctx))
  		return -EINVAL;

  	if (likely(min_nr >= 0 && min_nr <= nr))
  		ret = read_events(ioctx, min_nr, nr, events, until);
  	else
  		ret = -EINVAL;

  	/* Drop the reference taken by lookup_ioctx(). */
  	percpu_ref_put(&ioctx->users);

  	return ret;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1964
1965
  /* io_getevents:
   *	Attempts to read at least min_nr events and up to nr events from
642b5123a   Satoru Takeuchi   aio: fix wrong su...
1966
1967
1968
1969
1970
1971
1972
1973
   *	the completion queue for the aio_context specified by ctx_id. If
   *	it succeeds, the number of read events is returned. May fail with
   *	-EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
   *	out of range, if timeout is out of range.  May fail with -EFAULT
   *	if any of the memory specified is invalid.  May return 0 or
   *	< min_nr if the timeout specified by timeout has elapsed
   *	before sufficient events are available, where timeout == NULL
   *	specifies an infinite timeout. Note that the timeout pointed to by
6900807c6   Jeff Moyer   aio: fix io_getev...
1974
   *	timeout is relative.  Will fail with -ENOSYS if not implemented.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1975
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1976
1977
1978
1979
1980
  SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
  		long, min_nr,
  		long, nr,
  		struct io_event __user *, events,
  		struct timespec __user *, timeout)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1981
  {
fa2e62a54   Deepa Dinamani   io_getevents: Use...
1982
  	struct timespec64	ts;
7a074e96d   Christoph Hellwig   aio: implement io...
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
  	int			ret;
  
  	if (timeout && unlikely(get_timespec64(&ts, timeout)))
  		return -EFAULT;
  
  	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
  	if (!ret && signal_pending(current))
  		ret = -EINTR;
  	return ret;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1993

9ba546c01   Christoph Hellwig   aio: don't expose...
1994
1995
1996
1997
  /*
   * Signal-mask argument of io_pgetevents(), copied in from userspace
   * as a single structure.
   */
  struct __aio_sigset {
  	/* user pointer to the mask to install while waiting; NULL for none */
  	const sigset_t __user	*sigmask;
  	/* size of *sigmask in bytes; io_pgetevents requires sizeof(sigset_t) */
  	size_t		sigsetsize;
  };
7a074e96d   Christoph Hellwig   aio: implement io...
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
  /*
   * io_pgetevents:
   *	Same as io_getevents, but optionally installs the signal mask
   *	described by usig (with SIGKILL and SIGSTOP removed) for the
   *	duration of the wait.  If no signal is pending afterwards the
   *	previous mask is restored right away; otherwise the previous mask
   *	is stashed in current->saved_sigmask and set_restore_sigmask() is
   *	called, and a result of 0 is turned into -ERESTARTNOHAND.
   *	May fail with -EFAULT if timeout, usig or the mask it points to
   *	cannot be read, and with -EINVAL if usig->sigsetsize is not
   *	sizeof(sigset_t).
   */
  SYSCALL_DEFINE6(io_pgetevents,
  		aio_context_t, ctx_id,
  		long, min_nr,
  		long, nr,
  		struct io_event __user *, events,
  		struct timespec __user *, timeout,
  		const struct __aio_sigset __user *, usig)
  {
  	struct __aio_sigset	ksig = { NULL, };
  	sigset_t		ksigmask, sigsaved;
  	struct timespec64	ts;
  	int ret;

  	if (timeout && unlikely(get_timespec64(&ts, timeout)))
  		return -EFAULT;

  	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
  		return -EFAULT;

  	if (ksig.sigmask) {
  		if (ksig.sigsetsize != sizeof(sigset_t))
  			return -EINVAL;
  		if (copy_from_user(&ksigmask, ksig.sigmask, sizeof(ksigmask)))
  			return -EFAULT;

  		/* SIGKILL and SIGSTOP are never blocked. */
  		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
  		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
  	}

  	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);

  	if (!signal_pending(current)) {
  		/* No signal to deliver: put the caller's mask back now. */
  		if (ksig.sigmask)
  			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
  		return ret;
  	}

  	/* Signal pending: hand the old mask over via saved_sigmask. */
  	if (ksig.sigmask) {
  		current->saved_sigmask = sigsaved;
  		set_restore_sigmask();
  	}

  	if (ret == 0)
  		ret = -ERESTARTNOHAND;

  	return ret;
  }
c00d2c7e8   Al Viro   move aio compat t...
2042
2043
2044
2045
2046
2047
2048
2049
  
  #ifdef CONFIG_COMPAT
  /*
   * Compat flavour of io_getevents: the timeout is read with
   * compat_get_timespec64(); everything else is handed to
   * do_io_getevents() unchanged.  Returns -EINTR if no events were read
   * and a signal is pending.
   */
  COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
  		       compat_long_t, min_nr,
  		       compat_long_t, nr,
  		       struct io_event __user *, events,
  		       struct compat_timespec __user *, timeout)
  {
  	struct timespec64 t;
  	struct timespec64 *deadline = NULL;
  	int ret;

  	if (timeout) {
  		if (compat_get_timespec64(&t, timeout))
  			return -EFAULT;
  		deadline = &t;
  	}

  	ret = do_io_getevents(ctx_id, min_nr, nr, events, deadline);

  	/* Nothing read and a signal is waiting: report the interruption. */
  	if (ret == 0 && signal_pending(current))
  		return -EINTR;

  	return ret;
  }
c00d2c7e8   Al Viro   move aio compat t...
2061

7a074e96d   Christoph Hellwig   aio: implement io...
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
  struct __compat_aio_sigset {
  	compat_sigset_t __user	*sigmask;
  	compat_size_t		sigsetsize;
  };
  
  COMPAT_SYSCALL_DEFINE6(io_pgetevents,
  		compat_aio_context_t, ctx_id,
  		compat_long_t, min_nr,
  		compat_long_t, nr,
  		struct io_event __user *, events,
  		struct compat_timespec __user *, timeout,
  		const struct __compat_aio_sigset __user *, usig)
  {
  	struct __compat_aio_sigset ksig = { NULL, };
  	sigset_t ksigmask, sigsaved;
  	struct timespec64 t;
  	int ret;
  
  	if (timeout && compat_get_timespec64(&t, timeout))
  		return -EFAULT;
  
  	if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
  		return -EFAULT;
  
  	if (ksig.sigmask) {
  		if (ksig.sigsetsize != sizeof(compat_sigset_t))
  			return -EINVAL;
  		if (get_compat_sigset(&ksigmask, ksig.sigmask))
c00d2c7e8   Al Viro   move aio compat t...
2090
  			return -EFAULT;
7a074e96d   Christoph Hellwig   aio: implement io...
2091
2092
2093
  		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
  		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
  	}
c00d2c7e8   Al Viro   move aio compat t...
2094

7a074e96d   Christoph Hellwig   aio: implement io...
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
  	ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
  	if (signal_pending(current)) {
  		if (ksig.sigmask) {
  			current->saved_sigmask = sigsaved;
  			set_restore_sigmask();
  		}
  		if (!ret)
  			ret = -ERESTARTNOHAND;
  	} else {
  		if (ksig.sigmask)
  			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
c00d2c7e8   Al Viro   move aio compat t...
2106
  	}
fa2e62a54   Deepa Dinamani   io_getevents: Use...
2107

7a074e96d   Christoph Hellwig   aio: implement io...
2108
  	return ret;
c00d2c7e8   Al Viro   move aio compat t...
2109
2110
  }
  #endif