Blame view

fs/aio.c 43.9 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
  /*
   *	An async IO implementation for Linux
   *	Written by Benjamin LaHaise <bcrl@kvack.org>
   *
   *	Implements an efficient asynchronous io interface.
   *
   *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
   *
   *	See ../COPYING for licensing terms.
   */
caf4167aa   Kent Overstreet   aio: dprintk() ->...
11
  #define pr_fmt(fmt) "%s: " fmt, __func__
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
12
13
14
15
16
  #include <linux/kernel.h>
  #include <linux/init.h>
  #include <linux/errno.h>
  #include <linux/time.h>
  #include <linux/aio_abi.h>
630d9c472   Paul Gortmaker   fs: reduce the us...
17
  #include <linux/export.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
18
  #include <linux/syscalls.h>
b9d128f10   Jens Axboe   block: move bdi/a...
19
  #include <linux/backing-dev.h>
027445c37   Badari Pulavarty   [PATCH] Vectorize...
20
  #include <linux/uio.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
21

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
22
23
24
25
26
  #include <linux/sched.h>
  #include <linux/fs.h>
  #include <linux/file.h>
  #include <linux/mm.h>
  #include <linux/mman.h>
3d2d827f5   Michael S. Tsirkin   mm: move use_mm/u...
27
  #include <linux/mmu_context.h>
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
28
  #include <linux/percpu.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
29
30
31
32
33
34
  #include <linux/slab.h>
  #include <linux/timer.h>
  #include <linux/aio.h>
  #include <linux/highmem.h>
  #include <linux/workqueue.h>
  #include <linux/security.h>
9c3060bed   Davide Libenzi   signal/timer/even...
35
  #include <linux/eventfd.h>
cfb1e33ee   Jeff Moyer   aio: implement re...
36
  #include <linux/blkdev.h>
9d85cba71   Jeff Moyer   aio: fix the comp...
37
  #include <linux/compat.h>
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
38
39
  #include <linux/migrate.h>
  #include <linux/ramfs.h>
723be6e39   Kent Overstreet   aio: percpu ioctx...
40
  #include <linux/percpu-refcount.h>
71ad7490c   Benjamin LaHaise   rework aio migrat...
41
  #include <linux/mount.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
42
43
44
  
  #include <asm/kmap_types.h>
  #include <asm/uaccess.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
45

68d70d03f   Al Viro   constify rw_verif...
46
  #include "internal.h"
4e179bca6   Kent Overstreet   aio: move private...
47
48
49
50
51
52
  #define AIO_RING_MAGIC			0xa10a10a1
  #define AIO_RING_COMPAT_FEATURES	1
  #define AIO_RING_INCOMPAT_FEATURES	0
  struct aio_ring {
  	unsigned	id;	/* kernel internal index number */
  	unsigned	nr;	/* number of io_events */
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
53
54
  	unsigned	head;	/* Written to by userland or under ring_lock
  				 * mutex by aio_read_events_ring(). */
4e179bca6   Kent Overstreet   aio: move private...
55
56
57
58
59
60
61
62
63
64
65
66
  	unsigned	tail;
  
  	unsigned	magic;
  	unsigned	compat_features;
  	unsigned	incompat_features;
  	unsigned	header_length;	/* size of aio_ring */
  
  
  	struct io_event		io_events[0];
  }; /* 128 bytes + ring size */
  
  #define AIO_RING_PAGES	8
4e179bca6   Kent Overstreet   aio: move private...
67

db446a08c   Benjamin LaHaise   aio: convert the ...
68
69
70
71
72
  /*
   * Table of kioctx pointers hanging off mm->ioctx_table.  ioctx_add_table()
   * stores a context in the first NULL slot and, when none is free, swaps in
   * a larger table and releases the old one with kfree_rcu().
   */
  struct kioctx_table {
  	struct rcu_head	rcu;		/* handed to kfree_rcu() when the table is replaced */
  	unsigned	nr;		/* number of slots in table[] */
  	struct kioctx	*table[];	/* a NULL entry is a free slot */
  };
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
73
74
75
  /* Per-CPU part of a kioctx, reached through kioctx.cpu. */
  struct kioctx_cpu {
  	/* per-CPU counterpart of kioctx.reqs_available (see the comments in struct kioctx) */
  	unsigned		reqs_available;
  };
dc48e56d7   Jens Axboe   aio: fix serial d...
76
77
78
79
  /*
   * Waiter attached to a context through kioctx.rq_wait: free_ioctx_reqs()
   * decrements ->count and completes ->comp when the count reaches zero.
   */
  struct ctx_rq_wait {
  	struct completion comp;
  	atomic_t count;
  };
4e179bca6   Kent Overstreet   aio: move private...
80
  struct kioctx {
723be6e39   Kent Overstreet   aio: percpu ioctx...
81
  	struct percpu_ref	users;
36f558890   Kent Overstreet   aio: refcounting ...
82
  	atomic_t		dead;
4e179bca6   Kent Overstreet   aio: move private...
83

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
84
  	struct percpu_ref	reqs;
4e179bca6   Kent Overstreet   aio: move private...
85
  	unsigned long		user_id;
4e179bca6   Kent Overstreet   aio: move private...
86

e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
87
88
89
90
91
92
93
  	struct __percpu kioctx_cpu *cpu;
  
  	/*
  	 * For percpu reqs_available, number of slots we move to/from global
  	 * counter at a time:
  	 */
  	unsigned		req_batch;
3e845ce01   Kent Overstreet   aio: change reqs_...
94
95
96
97
  	/*
  	 * This is what userspace passed to io_setup(), it's not used for
  	 * anything but counting against the global max_reqs quota.
  	 *
58c85dc20   Kent Overstreet   aio: kill struct ...
98
  	 * The real limit is nr_events - 1, which will be larger (see
3e845ce01   Kent Overstreet   aio: change reqs_...
99
100
  	 * aio_setup_ring())
  	 */
4e179bca6   Kent Overstreet   aio: move private...
101
  	unsigned		max_reqs;
58c85dc20   Kent Overstreet   aio: kill struct ...
102
103
  	/* Size of ringbuffer, in units of struct io_event */
  	unsigned		nr_events;
4e179bca6   Kent Overstreet   aio: move private...
104

58c85dc20   Kent Overstreet   aio: kill struct ...
105
106
107
108
109
  	unsigned long		mmap_base;
  	unsigned long		mmap_size;
  
  	struct page		**ring_pages;
  	long			nr_pages;
723be6e39   Kent Overstreet   aio: percpu ioctx...
110
  	struct work_struct	free_work;
4e23bcaeb   Kent Overstreet   aio: give shared ...
111

e02ba72aa   Anatol Pomozov   aio: block io_des...
112
113
114
  	/*
  	 * signals when all in-flight requests are done
  	 */
dc48e56d7   Jens Axboe   aio: fix serial d...
115
  	struct ctx_rq_wait	*rq_wait;
e02ba72aa   Anatol Pomozov   aio: block io_des...
116

4e23bcaeb   Kent Overstreet   aio: give shared ...
117
  	struct {
34e83fc61   Kent Overstreet   aio: reqs_active ...
118
119
120
121
122
  		/*
  		 * This counts the number of available slots in the ringbuffer,
  		 * so we avoid overflowing it: it's decremented (if positive)
  		 * when allocating a kiocb and incremented when the resulting
  		 * io_event is pulled off the ringbuffer.
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
123
124
  		 *
  		 * We batch accesses to it with a percpu version.
34e83fc61   Kent Overstreet   aio: reqs_active ...
125
126
  		 */
  		atomic_t	reqs_available;
4e23bcaeb   Kent Overstreet   aio: give shared ...
127
128
129
130
131
132
  	} ____cacheline_aligned_in_smp;
  
  	struct {
  		spinlock_t	ctx_lock;
  		struct list_head active_reqs;	/* used for cancellation */
  	} ____cacheline_aligned_in_smp;
58c85dc20   Kent Overstreet   aio: kill struct ...
133
134
  	struct {
  		struct mutex	ring_lock;
4e23bcaeb   Kent Overstreet   aio: give shared ...
135
136
  		wait_queue_head_t wait;
  	} ____cacheline_aligned_in_smp;
58c85dc20   Kent Overstreet   aio: kill struct ...
137
138
139
  
  	struct {
  		unsigned	tail;
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
140
  		unsigned	completed_events;
58c85dc20   Kent Overstreet   aio: kill struct ...
141
  		spinlock_t	completion_lock;
4e23bcaeb   Kent Overstreet   aio: give shared ...
142
  	} ____cacheline_aligned_in_smp;
58c85dc20   Kent Overstreet   aio: kill struct ...
143
144
  
  	struct page		*internal_pages[AIO_RING_PAGES];
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
145
  	struct file		*aio_ring_file;
db446a08c   Benjamin LaHaise   aio: convert the ...
146
147
  
  	unsigned		id;
4e179bca6   Kent Overstreet   aio: move private...
148
  };
04b2fa9f8   Christoph Hellwig   fs: split generic...
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
  /*
   * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
   * cancelled or completed (this makes a certain amount of sense because
   * successful cancellation - io_cancel() - does deliver the completion to
   * userspace).
   *
   * And since most things don't implement kiocb cancellation and we'd really like
   * kiocb completion to be lockless when possible, we use ki_cancel to
   * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
   * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
   */
  #define KIOCB_CANCELLED		((void *) (~0ULL))
  
  /*
   * AIO's per-request state, wrapping the generic struct kiocb.  Code that is
   * handed only the kiocb (e.g. kiocb_set_cancel_fn()) gets back to this
   * structure with container_of() on ->common.
   */
  struct aio_kiocb {
  	struct kiocb		common;

  	struct kioctx		*ki_ctx;	/* context the request belongs to */
  	kiocb_cancel_fn		*ki_cancel;	/* cancel callback, or KIOCB_CANCELLED */

  	struct iocb __user	*ki_user_iocb;	/* user's aiocb */
  	__u64			ki_user_data;	/* user's data for completion */

  	struct list_head	ki_list;	/* the aio core uses this
  						 * for cancellation */

  	/*
  	 * If the aio_resfd field of the userspace iocb is not zero,
  	 * this is the underlying eventfd context to deliver events to.
  	 */
  	struct eventfd_ctx	*ki_eventfd;
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
180
  /*------ sysctl variables----*/
d55b5fdaf   Zach Brown   [PATCH] aio: remo...
181
182
183
  /* Held around updates of aio_nr (see aio_nr_sub()). */
  static DEFINE_SPINLOCK(aio_nr_lock);
  unsigned long aio_nr;		/* current system wide number of aio requests */
  unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
184
  /*----end sysctl variables---*/
e18b890bb   Christoph Lameter   [PATCH] slab: rem...
185
186
  /* Slab caches for struct aio_kiocb and struct kioctx, created in aio_setup(). */
  static struct kmem_cache	*kiocb_cachep;
  static struct kmem_cache	*kioctx_cachep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
187

71ad7490c   Benjamin LaHaise   rework aio migrat...
188
189
190
191
192
193
194
195
196
197
198
  /* Kernel-internal mount of the "aio" pseudo filesystem, set up in aio_setup(). */
  static struct vfsmount *aio_mnt;
  
  /* Forward declarations; both tables are defined further down in this file. */
  static const struct file_operations aio_ring_fops;
  static const struct address_space_operations aio_ctx_aops;
  
  /*
   * aio_private_file
   *	Creates the anonymous "[aio]" file on the internal aio mount that
   *	backs @ctx's ring.  The inode's mapping uses aio_ctx_aops, points back
   *	at @ctx through ->private_data, and is sized to @nr_pages pages.
   *
   *	Returns the new file, or an ERR_PTR() on failure.
   */
  static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
  {
  	struct qstr name = QSTR_INIT("[aio]", 5);
  	struct path ring_path;
  	struct inode *inode;
  	struct file *filp;

  	inode = alloc_anon_inode(aio_mnt->mnt_sb);
  	if (IS_ERR(inode))
  		return ERR_CAST(inode);

  	inode->i_mapping->a_ops = &aio_ctx_aops;
  	inode->i_mapping->private_data = ctx;
  	inode->i_size = PAGE_SIZE * nr_pages;

  	ring_path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &name);
  	if (!ring_path.dentry) {
  		/* the inode is not attached to anything yet: drop it by hand */
  		iput(inode);
  		return ERR_PTR(-ENOMEM);
  	}
  	ring_path.mnt = mntget(aio_mnt);

  	d_instantiate(ring_path.dentry, inode);
  	filp = alloc_file(&ring_path, FMODE_READ | FMODE_WRITE, &aio_ring_fops);
  	if (!IS_ERR(filp))
  		filp->f_flags = O_RDWR;
  	else
  		path_put(&ring_path);

  	return filp;
  }
  
  static struct dentry *aio_mount(struct file_system_type *fs_type,
  				int flags, const char *dev_name, void *data)
  {
  	static const struct dentry_operations ops = {
  		.d_dname	= simple_dname,
  	};
22f6b4d34   Jann Horn   aio: mark AIO pse...
230
231
232
233
234
235
  	struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, &ops,
  					   AIO_RING_MAGIC);
  
  	if (!IS_ERR(root))
  		root->d_sb->s_iflags |= SB_I_NOEXEC;
  	return root;
71ad7490c   Benjamin LaHaise   rework aio migrat...
236
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
237
238
239
240
241
242
  /* aio_setup
   *	Mounts the internal "aio" pseudo filesystem and creates the slab
   *	caches used by the aio routines.  Panics on failure (explicitly for
   *	the mount, via SLAB_PANIC for the caches) as this is done early
   *	during the boot sequence.  Always returns 0.
   */
  static int __init aio_setup(void)
  {
  	static struct file_system_type aio_fs = {
  		.name		= "aio",
  		.mount		= aio_mount,
  		.kill_sb	= kill_anon_super,
  	};

  	aio_mnt = kern_mount(&aio_fs);
  	if (IS_ERR(aio_mnt))
  		panic("Failed to create aio fs mount.");

  	kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
  	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);

  	/* the newline must be an escape: a raw line break inside the literal does not compile */
  	pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));

  	return 0;
  }
  __initcall(aio_setup);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
260

5e9ae2e5d   Benjamin LaHaise   aio: fix use-afte...
261
262
263
  static void put_aio_ring_file(struct kioctx *ctx)
  {
  	struct file *aio_ring_file = ctx->aio_ring_file;
de04e7693   Rasmus Villemoes   fs/aio.c: elimina...
264
  	struct address_space *i_mapping;
5e9ae2e5d   Benjamin LaHaise   aio: fix use-afte...
265
266
267
268
  	if (aio_ring_file) {
  		truncate_setsize(aio_ring_file->f_inode, 0);
  
  		/* Prevent further access to the kioctx from migratepages */
de04e7693   Rasmus Villemoes   fs/aio.c: elimina...
269
270
271
  		i_mapping = aio_ring_file->f_inode->i_mapping;
  		spin_lock(&i_mapping->private_lock);
  		i_mapping->private_data = NULL;
5e9ae2e5d   Benjamin LaHaise   aio: fix use-afte...
272
  		ctx->aio_ring_file = NULL;
de04e7693   Rasmus Villemoes   fs/aio.c: elimina...
273
  		spin_unlock(&i_mapping->private_lock);
5e9ae2e5d   Benjamin LaHaise   aio: fix use-afte...
274
275
276
277
  
  		fput(aio_ring_file);
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
278
279
  static void aio_free_ring(struct kioctx *ctx)
  {
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
280
  	int i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
281

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
282
283
284
285
  	/* Disconnect the kiotx from the ring file.  This prevents future
  	 * accesses to the kioctx from page migration.
  	 */
  	put_aio_ring_file(ctx);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
286
  	for (i = 0; i < ctx->nr_pages; i++) {
8e321fefb   Benjamin LaHaise   aio/migratepages:...
287
  		struct page *page;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
288
289
290
  		pr_debug("pid(%d) [%d] page->count=%d
  ", current->pid, i,
  				page_count(ctx->ring_pages[i]));
8e321fefb   Benjamin LaHaise   aio/migratepages:...
291
292
293
294
295
  		page = ctx->ring_pages[i];
  		if (!page)
  			continue;
  		ctx->ring_pages[i] = NULL;
  		put_page(page);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
296
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
297

ddb8c45ba   Sasha Levin   aio: nullify aio-...
298
  	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
58c85dc20   Kent Overstreet   aio: kill struct ...
299
  		kfree(ctx->ring_pages);
ddb8c45ba   Sasha Levin   aio: nullify aio-...
300
301
  		ctx->ring_pages = NULL;
  	}
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
302
  }
5477e70a6   Oleg Nesterov   mm: move ->mremap...
303
  static int aio_ring_mremap(struct vm_area_struct *vma)
e4a0d3e72   Pavel Emelyanov   aio: Make it poss...
304
  {
5477e70a6   Oleg Nesterov   mm: move ->mremap...
305
  	struct file *file = vma->vm_file;
e4a0d3e72   Pavel Emelyanov   aio: Make it poss...
306
307
  	struct mm_struct *mm = vma->vm_mm;
  	struct kioctx_table *table;
b2edffdd9   Al Viro   fix mremap() vs. ...
308
  	int i, res = -EINVAL;
e4a0d3e72   Pavel Emelyanov   aio: Make it poss...
309
310
311
312
313
314
315
316
317
  
  	spin_lock(&mm->ioctx_lock);
  	rcu_read_lock();
  	table = rcu_dereference(mm->ioctx_table);
  	for (i = 0; i < table->nr; i++) {
  		struct kioctx *ctx;
  
  		ctx = table->table[i];
  		if (ctx && ctx->aio_ring_file == file) {
b2edffdd9   Al Viro   fix mremap() vs. ...
318
319
320
321
  			if (!atomic_read(&ctx->dead)) {
  				ctx->user_id = ctx->mmap_base = vma->vm_start;
  				res = 0;
  			}
e4a0d3e72   Pavel Emelyanov   aio: Make it poss...
322
323
324
325
326
327
  			break;
  		}
  	}
  
  	rcu_read_unlock();
  	spin_unlock(&mm->ioctx_lock);
b2edffdd9   Al Viro   fix mremap() vs. ...
328
  	return res;
e4a0d3e72   Pavel Emelyanov   aio: Make it poss...
329
  }
5477e70a6   Oleg Nesterov   mm: move ->mremap...
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
  /*
   * VMA operations installed on ring mappings by aio_ring_mmap().  mremap is
   * handled by aio_ring_mremap(); the generic filemap handlers are only wired
   * up when CONFIG_MMU is enabled.
   */
  static const struct vm_operations_struct aio_ring_vm_ops = {
  	.mremap		= aio_ring_mremap,
  #if IS_ENABLED(CONFIG_MMU)
  	.fault		= filemap_fault,
  	.map_pages	= filemap_map_pages,
  	.page_mkwrite	= filemap_page_mkwrite,
  #endif
  };
  
  /*
   * aio_ring_mmap
   *	->mmap handler of the ring file: installs aio_ring_vm_ops on @vma and
   *	marks it VM_DONTEXPAND.  Always returns 0.
   */
  static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
  {
  	vma->vm_ops = &aio_ring_vm_ops;
  	vma->vm_flags |= VM_DONTEXPAND;

  	return 0;
  }
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
345
346
347
  /* File operations of the ring file built by aio_private_file(); only mmap is provided. */
  static const struct file_operations aio_ring_fops = {
  	.mmap = aio_ring_mmap,
  };
0c45355fc   Benjamin LaHaise   aio: fix build wh...
348
  #if IS_ENABLED(CONFIG_MIGRATION)
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
349
350
351
  static int aio_migratepage(struct address_space *mapping, struct page *new,
  			struct page *old, enum migrate_mode mode)
  {
5e9ae2e5d   Benjamin LaHaise   aio: fix use-afte...
352
  	struct kioctx *ctx;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
353
  	unsigned long flags;
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
354
  	pgoff_t idx;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
355
  	int rc;
8e321fefb   Benjamin LaHaise   aio/migratepages:...
356
  	rc = 0;
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
357
  	/* mapping->private_lock here protects against the kioctx teardown.  */
8e321fefb   Benjamin LaHaise   aio/migratepages:...
358
359
  	spin_lock(&mapping->private_lock);
  	ctx = mapping->private_data;
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
  	if (!ctx) {
  		rc = -EINVAL;
  		goto out;
  	}
  
  	/* The ring_lock mutex.  The prevents aio_read_events() from writing
  	 * to the ring's head, and prevents page migration from mucking in
  	 * a partially initialized kiotx.
  	 */
  	if (!mutex_trylock(&ctx->ring_lock)) {
  		rc = -EAGAIN;
  		goto out;
  	}
  
  	idx = old->index;
  	if (idx < (pgoff_t)ctx->nr_pages) {
  		/* Make sure the old page hasn't already been changed */
  		if (ctx->ring_pages[idx] != old)
  			rc = -EAGAIN;
8e321fefb   Benjamin LaHaise   aio/migratepages:...
379
380
  	} else
  		rc = -EINVAL;
8e321fefb   Benjamin LaHaise   aio/migratepages:...
381
382
  
  	if (rc != 0)
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
383
  		goto out_unlock;
8e321fefb   Benjamin LaHaise   aio/migratepages:...
384

36bc08cc0   Gu Zheng   fs/aio: Add suppo...
385
386
  	/* Writeback must be complete */
  	BUG_ON(PageWriteback(old));
8e321fefb   Benjamin LaHaise   aio/migratepages:...
387
  	get_page(new);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
388

8e321fefb   Benjamin LaHaise   aio/migratepages:...
389
  	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
390
  	if (rc != MIGRATEPAGE_SUCCESS) {
8e321fefb   Benjamin LaHaise   aio/migratepages:...
391
  		put_page(new);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
392
  		goto out_unlock;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
393
  	}
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
394
395
396
  	/* Take completion_lock to prevent other writes to the ring buffer
  	 * while the old page is copied to the new.  This prevents new
  	 * events from being lost.
5e9ae2e5d   Benjamin LaHaise   aio: fix use-afte...
397
  	 */
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
398
399
400
401
402
  	spin_lock_irqsave(&ctx->completion_lock, flags);
  	migrate_page_copy(new, old);
  	BUG_ON(ctx->ring_pages[idx] != old);
  	ctx->ring_pages[idx] = new;
  	spin_unlock_irqrestore(&ctx->completion_lock, flags);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
403

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
404
405
  	/* The old page is no longer accessible. */
  	put_page(old);
8e321fefb   Benjamin LaHaise   aio/migratepages:...
406

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
407
408
409
410
  out_unlock:
  	mutex_unlock(&ctx->ring_lock);
  out:
  	spin_unlock(&mapping->private_lock);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
411
  	return rc;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
412
  }
0c45355fc   Benjamin LaHaise   aio: fix build wh...
413
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
414

36bc08cc0   Gu Zheng   fs/aio: Add suppo...
415
  static const struct address_space_operations aio_ctx_aops = {
835f252c6   Gu Zheng   aio: fix uncorren...
416
  	.set_page_dirty = __set_page_dirty_no_writeback,
0c45355fc   Benjamin LaHaise   aio: fix build wh...
417
  #if IS_ENABLED(CONFIG_MIGRATION)
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
418
  	.migratepage	= aio_migratepage,
0c45355fc   Benjamin LaHaise   aio: fix build wh...
419
  #endif
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
420
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
421
422
423
  static int aio_setup_ring(struct kioctx *ctx)
  {
  	struct aio_ring *ring;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
424
  	unsigned nr_events = ctx->max_reqs;
41003a7bc   Zach Brown   aio: remove retry...
425
  	struct mm_struct *mm = current->mm;
3dc9acb67   Linus Torvalds   aio: clean up and...
426
  	unsigned long size, unused;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
427
  	int nr_pages;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
428
429
  	int i;
  	struct file *file;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
430
431
432
433
434
435
  
  	/* Compensate for the ring buffer's head/tail overlap entry */
  	nr_events += 2;	/* 1 is required, 2 for good luck */
  
  	size = sizeof(struct aio_ring);
  	size += sizeof(struct io_event) * nr_events;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
436

36bc08cc0   Gu Zheng   fs/aio: Add suppo...
437
  	nr_pages = PFN_UP(size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
438
439
  	if (nr_pages < 0)
  		return -EINVAL;
71ad7490c   Benjamin LaHaise   rework aio migrat...
440
  	file = aio_private_file(ctx, nr_pages);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
441
442
  	if (IS_ERR(file)) {
  		ctx->aio_ring_file = NULL;
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
443
  		return -ENOMEM;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
444
  	}
3dc9acb67   Linus Torvalds   aio: clean up and...
445
446
447
448
449
450
451
452
453
454
455
456
457
  	ctx->aio_ring_file = file;
  	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
  			/ sizeof(struct io_event);
  
  	ctx->ring_pages = ctx->internal_pages;
  	if (nr_pages > AIO_RING_PAGES) {
  		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
  					  GFP_KERNEL);
  		if (!ctx->ring_pages) {
  			put_aio_ring_file(ctx);
  			return -ENOMEM;
  		}
  	}
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
458
459
460
461
462
463
464
465
466
467
  	for (i = 0; i < nr_pages; i++) {
  		struct page *page;
  		page = find_or_create_page(file->f_inode->i_mapping,
  					   i, GFP_HIGHUSER | __GFP_ZERO);
  		if (!page)
  			break;
  		pr_debug("pid(%d) page[%d]->count=%d
  ",
  			 current->pid, i, page_count(page));
  		SetPageUptodate(page);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
468
  		unlock_page(page);
3dc9acb67   Linus Torvalds   aio: clean up and...
469
470
  
  		ctx->ring_pages[i] = page;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
471
  	}
3dc9acb67   Linus Torvalds   aio: clean up and...
472
  	ctx->nr_pages = i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
473

3dc9acb67   Linus Torvalds   aio: clean up and...
474
475
  	if (unlikely(i != nr_pages)) {
  		aio_free_ring(ctx);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
476
  		return -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
477
  	}
58c85dc20   Kent Overstreet   aio: kill struct ...
478
479
480
  	ctx->mmap_size = nr_pages * PAGE_SIZE;
  	pr_debug("attempting mmap of %lu bytes
  ", ctx->mmap_size);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
481

013373e8b   Michal Hocko   aio: make aio_set...
482
483
484
485
486
  	if (down_write_killable(&mm->mmap_sem)) {
  		ctx->mmap_size = 0;
  		aio_free_ring(ctx);
  		return -EINTR;
  	}
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
487
488
  	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
  				       PROT_READ | PROT_WRITE,
3dc9acb67   Linus Torvalds   aio: clean up and...
489
490
  				       MAP_SHARED, 0, &unused);
  	up_write(&mm->mmap_sem);
58c85dc20   Kent Overstreet   aio: kill struct ...
491
  	if (IS_ERR((void *)ctx->mmap_base)) {
58c85dc20   Kent Overstreet   aio: kill struct ...
492
  		ctx->mmap_size = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
493
  		aio_free_ring(ctx);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
494
  		return -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
495
  	}
58c85dc20   Kent Overstreet   aio: kill struct ...
496
497
  	pr_debug("mmap address: 0x%08lx
  ", ctx->mmap_base);
d6c355c7d   Benjamin LaHaise   aio: fix race in ...
498

58c85dc20   Kent Overstreet   aio: kill struct ...
499
500
  	ctx->user_id = ctx->mmap_base;
  	ctx->nr_events = nr_events; /* trusted copy */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
501

58c85dc20   Kent Overstreet   aio: kill struct ...
502
  	ring = kmap_atomic(ctx->ring_pages[0]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
503
  	ring->nr = nr_events;	/* user copy */
db446a08c   Benjamin LaHaise   aio: convert the ...
504
  	ring->id = ~0U;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
505
506
507
508
509
  	ring->head = ring->tail = 0;
  	ring->magic = AIO_RING_MAGIC;
  	ring->compat_features = AIO_RING_COMPAT_FEATURES;
  	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
  	ring->header_length = sizeof(struct aio_ring);
e8e3c3d66   Cong Wang   fs: remove the se...
510
  	kunmap_atomic(ring);
58c85dc20   Kent Overstreet   aio: kill struct ...
511
  	flush_dcache_page(ctx->ring_pages[0]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
512
513
514
  
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
515
516
517
  /* io_event slots that fit in one whole page */
  #define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
  /* io_event slots that fit in a page once the struct aio_ring header is subtracted */
  #define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
  /* difference between the two counts above */
  #define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
04b2fa9f8   Christoph Hellwig   fs: split generic...
518
  /*
   * kiocb_set_cancel_fn
   *	Registers @cancel as the cancellation callback of the AIO request
   *	that embeds @iocb.  Under the context's ctx_lock the request is put
   *	on ctx->active_reqs (unless ki_list.next is already set) and its
   *	ki_cancel pointer is updated.
   */
  void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
  {
  	struct aio_kiocb *aio_req = container_of(iocb, struct aio_kiocb, common);
  	struct kioctx *owner = aio_req->ki_ctx;
  	unsigned long irq_flags;

  	spin_lock_irqsave(&owner->ctx_lock, irq_flags);

  	/* a NULL ->next means the request has not been put on a list yet */
  	if (aio_req->ki_list.next == NULL)
  		list_add(&aio_req->ki_list, &owner->active_reqs);

  	aio_req->ki_cancel = cancel;

  	spin_unlock_irqrestore(&owner->ctx_lock, irq_flags);
  }
  EXPORT_SYMBOL(kiocb_set_cancel_fn);
04b2fa9f8   Christoph Hellwig   fs: split generic...
534
  static int kiocb_cancel(struct aio_kiocb *kiocb)
906b973cf   Kent Overstreet   aio: add kiocb_ca...
535
  {
0460fef2a   Kent Overstreet   aio: use cancella...
536
  	kiocb_cancel_fn *old, *cancel;
906b973cf   Kent Overstreet   aio: add kiocb_ca...
537

0460fef2a   Kent Overstreet   aio: use cancella...
538
539
540
541
542
543
544
545
  	/*
  	 * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
  	 * actually has a cancel function, hence the cmpxchg()
  	 */
  
  	cancel = ACCESS_ONCE(kiocb->ki_cancel);
  	do {
  		if (!cancel || cancel == KIOCB_CANCELLED)
57282d8fd   Kent Overstreet   aio: Kill ki_users
546
  			return -EINVAL;
906b973cf   Kent Overstreet   aio: add kiocb_ca...
547

0460fef2a   Kent Overstreet   aio: use cancella...
548
549
550
  		old = cancel;
  		cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
  	} while (cancel != old);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
551

04b2fa9f8   Christoph Hellwig   fs: split generic...
552
  	return cancel(&kiocb->common);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
553
  }
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
554
  static void free_ioctx(struct work_struct *work)
36f558890   Kent Overstreet   aio: refcounting ...
555
  {
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
556
  	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
557

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
558
559
  	pr_debug("freeing %p
  ", ctx);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
560

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
561
  	aio_free_ring(ctx);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
562
  	free_percpu(ctx->cpu);
9a1049da9   Tejun Heo   percpu-refcount: ...
563
564
  	percpu_ref_exit(&ctx->reqs);
  	percpu_ref_exit(&ctx->users);
36f558890   Kent Overstreet   aio: refcounting ...
565
566
  	kmem_cache_free(kioctx_cachep, ctx);
  }
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
567
568
569
  static void free_ioctx_reqs(struct percpu_ref *ref)
  {
  	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
e02ba72aa   Anatol Pomozov   aio: block io_des...
570
  	/* At this point we know that there are no any in-flight requests */
dc48e56d7   Jens Axboe   aio: fix serial d...
571
572
  	if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
  		complete(&ctx->rq_wait->comp);
e02ba72aa   Anatol Pomozov   aio: block io_des...
573

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
574
575
576
  	INIT_WORK(&ctx->free_work, free_ioctx);
  	schedule_work(&ctx->free_work);
  }
36f558890   Kent Overstreet   aio: refcounting ...
577
578
579
580
581
  /*
   * When this function runs, the kioctx has been removed from the "hash table"
   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
   * now it's safe to cancel any that need to be.
   */
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
582
  static void free_ioctx_users(struct percpu_ref *ref)
36f558890   Kent Overstreet   aio: refcounting ...
583
  {
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
584
  	struct kioctx *ctx = container_of(ref, struct kioctx, users);
04b2fa9f8   Christoph Hellwig   fs: split generic...
585
  	struct aio_kiocb *req;
36f558890   Kent Overstreet   aio: refcounting ...
586
587
588
589
590
  
  	spin_lock_irq(&ctx->ctx_lock);
  
  	while (!list_empty(&ctx->active_reqs)) {
  		req = list_first_entry(&ctx->active_reqs,
04b2fa9f8   Christoph Hellwig   fs: split generic...
591
  				       struct aio_kiocb, ki_list);
36f558890   Kent Overstreet   aio: refcounting ...
592
593
  
  		list_del_init(&req->ki_list);
d52a8f9ea   Fabian Frederick   fs/aio.c: Remove ...
594
  		kiocb_cancel(req);
36f558890   Kent Overstreet   aio: refcounting ...
595
596
597
  	}
  
  	spin_unlock_irq(&ctx->ctx_lock);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
598
599
  	percpu_ref_kill(&ctx->reqs);
  	percpu_ref_put(&ctx->reqs);
36f558890   Kent Overstreet   aio: refcounting ...
600
  }
db446a08c   Benjamin LaHaise   aio: convert the ...
601
602
603
604
605
606
607
  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
  {
  	unsigned i, new_nr;
  	struct kioctx_table *table, *old;
  	struct aio_ring *ring;
  
  	spin_lock(&mm->ioctx_lock);
855ef0dec   Oleg Nesterov   aio: kill the mis...
608
  	table = rcu_dereference_raw(mm->ioctx_table);
db446a08c   Benjamin LaHaise   aio: convert the ...
609
610
611
612
613
614
615
616
  
  	while (1) {
  		if (table)
  			for (i = 0; i < table->nr; i++)
  				if (!table->table[i]) {
  					ctx->id = i;
  					table->table[i] = ctx;
  					spin_unlock(&mm->ioctx_lock);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
617
618
619
620
  					/* While kioctx setup is in progress,
  					 * we are protected from page migration
  					 * changes ring_pages by ->ring_lock.
  					 */
db446a08c   Benjamin LaHaise   aio: convert the ...
621
622
623
624
625
626
627
  					ring = kmap_atomic(ctx->ring_pages[0]);
  					ring->id = ctx->id;
  					kunmap_atomic(ring);
  					return 0;
  				}
  
  		new_nr = (table ? table->nr : 1) * 4;
db446a08c   Benjamin LaHaise   aio: convert the ...
628
629
630
631
632
633
634
635
636
637
  		spin_unlock(&mm->ioctx_lock);
  
  		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
  				new_nr, GFP_KERNEL);
  		if (!table)
  			return -ENOMEM;
  
  		table->nr = new_nr;
  
  		spin_lock(&mm->ioctx_lock);
855ef0dec   Oleg Nesterov   aio: kill the mis...
638
  		old = rcu_dereference_raw(mm->ioctx_table);
db446a08c   Benjamin LaHaise   aio: convert the ...
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
  
  		if (!old) {
  			rcu_assign_pointer(mm->ioctx_table, table);
  		} else if (table->nr > old->nr) {
  			memcpy(table->table, old->table,
  			       old->nr * sizeof(struct kioctx *));
  
  			rcu_assign_pointer(mm->ioctx_table, table);
  			kfree_rcu(old, rcu);
  		} else {
  			kfree(table);
  			table = old;
  		}
  	}
  }
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
654
655
656
657
658
659
660
661
662
  /* aio_nr_sub
   *	Subtracts nr from the global aio_nr counter while holding
   *	aio_nr_lock.  If the difference compares greater than the current
   *	value (i.e. the subtraction did not make the counter smaller), a
   *	warning is emitted and the counter is reset to 0 instead.
   */
  static void aio_nr_sub(unsigned nr)
  {
  	spin_lock(&aio_nr_lock);
  	if (!WARN_ON(aio_nr - nr > aio_nr))
  		aio_nr -= nr;
  	else
  		aio_nr = 0;
  	spin_unlock(&aio_nr_lock);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
663
664
665
666
667
  /* ioctx_alloc
   *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
   */
  static struct kioctx *ioctx_alloc(unsigned nr_events)
  {
41003a7bc   Zach Brown   aio: remove retry...
668
  	struct mm_struct *mm = current->mm;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
669
  	struct kioctx *ctx;
e23754f88   Al Viro   aio: don't bother...
670
  	int err = -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
671

e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
672
673
674
675
676
677
678
679
680
681
682
  	/*
  	 * We keep track of the number of available ringbuffer slots, to prevent
  	 * overflow (reqs_available), and we also use percpu counters for this.
  	 *
  	 * So since up to half the slots might be on other cpu's percpu counters
  	 * and unavailable, double nr_events so userspace sees what they
  	 * expected: additionally, we move req_batch slots to/from percpu
  	 * counters at a time, so make sure that isn't 0:
  	 */
  	nr_events = max(nr_events, num_possible_cpus() * 4);
  	nr_events *= 2;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
683
  	/* Prevent overflows */
08397acdd   Al Viro   ioctx_alloc(): re...
684
  	if (nr_events > (0x10000000U / sizeof(struct io_event))) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
685
686
687
688
  		pr_debug("ENOMEM: nr_events too high
  ");
  		return ERR_PTR(-EINVAL);
  	}
4cd81c3df   Benjamin LaHaise   aio: double aio_m...
689
  	if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
690
  		return ERR_PTR(-EAGAIN);
c37622296   Robert P. J. Day   [PATCH] Transform...
691
  	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
692
693
  	if (!ctx)
  		return ERR_PTR(-ENOMEM);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
694
  	ctx->max_reqs = nr_events;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
695

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
696
  	spin_lock_init(&ctx->ctx_lock);
0460fef2a   Kent Overstreet   aio: use cancella...
697
  	spin_lock_init(&ctx->completion_lock);
58c85dc20   Kent Overstreet   aio: kill struct ...
698
  	mutex_init(&ctx->ring_lock);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
699
700
701
  	/* Protect against page migration throughout kiotx setup by keeping
  	 * the ring_lock mutex held until setup is complete. */
  	mutex_lock(&ctx->ring_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
702
703
704
  	init_waitqueue_head(&ctx->wait);
  
  	INIT_LIST_HEAD(&ctx->active_reqs);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
705

2aad2a86f   Tejun Heo   percpu_ref: add P...
706
  	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
707
  		goto err;
2aad2a86f   Tejun Heo   percpu_ref: add P...
708
  	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
709
  		goto err;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
710
711
  	ctx->cpu = alloc_percpu(struct kioctx_cpu);
  	if (!ctx->cpu)
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
712
  		goto err;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
713

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
714
715
  	err = aio_setup_ring(ctx);
  	if (err < 0)
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
716
  		goto err;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
717

34e83fc61   Kent Overstreet   aio: reqs_active ...
718
  	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
719
  	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
6878ea72a   Benjamin LaHaise   aio: be defensive...
720
721
  	if (ctx->req_batch < 1)
  		ctx->req_batch = 1;
34e83fc61   Kent Overstreet   aio: reqs_active ...
722

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
723
  	/* limit the number of system wide aios */
9fa1cb397   Al Viro   aio: aio_nr_lock ...
724
  	spin_lock(&aio_nr_lock);
4cd81c3df   Benjamin LaHaise   aio: double aio_m...
725
  	if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
2dd542b7a   Al Viro   aio: aio_nr decre...
726
  	    aio_nr + nr_events < aio_nr) {
9fa1cb397   Al Viro   aio: aio_nr_lock ...
727
  		spin_unlock(&aio_nr_lock);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
728
  		err = -EAGAIN;
d1b943271   Gu Zheng   aio: clean up aio...
729
  		goto err_ctx;
2dd542b7a   Al Viro   aio: aio_nr decre...
730
731
  	}
  	aio_nr += ctx->max_reqs;
9fa1cb397   Al Viro   aio: aio_nr_lock ...
732
  	spin_unlock(&aio_nr_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
733

1881686f8   Benjamin LaHaise   aio: fix kioctx l...
734
735
  	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
  	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */
723be6e39   Kent Overstreet   aio: percpu ioctx...
736

da90382c2   Benjamin LaHaise   aio: fix error ha...
737
738
  	err = ioctx_add_table(ctx, mm);
  	if (err)
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
739
  		goto err_cleanup;
da90382c2   Benjamin LaHaise   aio: fix error ha...
740

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
741
742
  	/* Release the ring_lock mutex now that all setup is complete. */
  	mutex_unlock(&ctx->ring_lock);
caf4167aa   Kent Overstreet   aio: dprintk() ->...
743
744
  	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x
  ",
58c85dc20   Kent Overstreet   aio: kill struct ...
745
  		 ctx, ctx->user_id, mm, ctx->nr_events);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
746
  	return ctx;
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
747
748
  err_cleanup:
  	aio_nr_sub(ctx->max_reqs);
d1b943271   Gu Zheng   aio: clean up aio...
749
  err_ctx:
deeb8525f   Al Viro   ioctx_alloc(): fi...
750
751
752
  	atomic_set(&ctx->dead, 1);
  	if (ctx->mmap_size)
  		vm_munmap(ctx->mmap_base, ctx->mmap_size);
d1b943271   Gu Zheng   aio: clean up aio...
753
  	aio_free_ring(ctx);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
754
  err:
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
755
  	mutex_unlock(&ctx->ring_lock);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
756
  	free_percpu(ctx->cpu);
9a1049da9   Tejun Heo   percpu-refcount: ...
757
758
  	percpu_ref_exit(&ctx->reqs);
  	percpu_ref_exit(&ctx->users);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
759
  	kmem_cache_free(kioctx_cachep, ctx);
caf4167aa   Kent Overstreet   aio: dprintk() ->...
760
761
  	pr_debug("error allocating ioctx %d
  ", err);
e23754f88   Al Viro   aio: don't bother...
762
  	return ERR_PTR(err);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
763
  }
36f558890   Kent Overstreet   aio: refcounting ...
764
765
766
767
768
  /* kill_ioctx
   *	Cancels all outstanding aio requests on an aio context.  Used
   *	when the processes owning a context have all exited to encourage
   *	the rapid destruction of the kioctx.
   */
fb2d44838   Benjamin LaHaise   aio: report error...
769
  static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
dc48e56d7   Jens Axboe   aio: fix serial d...
770
  		      struct ctx_rq_wait *wait)
36f558890   Kent Overstreet   aio: refcounting ...
771
  {
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
772
  	struct kioctx_table *table;
db446a08c   Benjamin LaHaise   aio: convert the ...
773

b2edffdd9   Al Viro   fix mremap() vs. ...
774
775
776
  	spin_lock(&mm->ioctx_lock);
  	if (atomic_xchg(&ctx->dead, 1)) {
  		spin_unlock(&mm->ioctx_lock);
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
777
  		return -EINVAL;
b2edffdd9   Al Viro   fix mremap() vs. ...
778
  	}
db446a08c   Benjamin LaHaise   aio: convert the ...
779

855ef0dec   Oleg Nesterov   aio: kill the mis...
780
  	table = rcu_dereference_raw(mm->ioctx_table);
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
781
782
  	WARN_ON(ctx != table->table[ctx->id]);
  	table->table[ctx->id] = NULL;
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
783
  	spin_unlock(&mm->ioctx_lock);
4fcc712f5   Kent Overstreet   aio: fix io_destr...
784

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
785
786
  	/* percpu_ref_kill() will do the necessary call_rcu() */
  	wake_up_all(&ctx->wait);
4fcc712f5   Kent Overstreet   aio: fix io_destr...
787

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
788
789
790
791
792
793
794
795
  	/*
  	 * It'd be more correct to do this in free_ioctx(), after all
  	 * the outstanding kiocbs have finished - but by then io_destroy
  	 * has already returned, so io_setup() could potentially return
  	 * -EAGAIN with no ioctxs actually in use (as far as userspace
  	 *  could tell).
  	 */
  	aio_nr_sub(ctx->max_reqs);
4fcc712f5   Kent Overstreet   aio: fix io_destr...
796

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
797
798
  	if (ctx->mmap_size)
  		vm_munmap(ctx->mmap_base, ctx->mmap_size);
fb2d44838   Benjamin LaHaise   aio: report error...
799

dc48e56d7   Jens Axboe   aio: fix serial d...
800
  	ctx->rq_wait = wait;
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
801
802
  	percpu_ref_kill(&ctx->users);
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
803
  }
36f558890   Kent Overstreet   aio: refcounting ...
804
805
806
807
808
809
810
  /*
   * exit_aio: called when the last user of mm goes away.  At this point, there is
   * no way for any new requests to be submited or any of the io_* syscalls to be
   * called on the context.
   *
   * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
   * them.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
811
   */
fc9b52cd8   Harvey Harrison   fs: remove fastca...
812
  void exit_aio(struct mm_struct *mm)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
813
  {
4b70ac5fd   Oleg Nesterov   aio: change exit_...
814
  	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
dc48e56d7   Jens Axboe   aio: fix serial d...
815
816
  	struct ctx_rq_wait wait;
  	int i, skipped;
db446a08c   Benjamin LaHaise   aio: convert the ...
817

4b70ac5fd   Oleg Nesterov   aio: change exit_...
818
819
  	if (!table)
  		return;
db446a08c   Benjamin LaHaise   aio: convert the ...
820

dc48e56d7   Jens Axboe   aio: fix serial d...
821
822
823
824
  	atomic_set(&wait.count, table->nr);
  	init_completion(&wait.comp);
  
  	skipped = 0;
4b70ac5fd   Oleg Nesterov   aio: change exit_...
825
826
  	for (i = 0; i < table->nr; ++i) {
  		struct kioctx *ctx = table->table[i];
abf137dd7   Jens Axboe   aio: make the loo...
827

dc48e56d7   Jens Axboe   aio: fix serial d...
828
829
  		if (!ctx) {
  			skipped++;
4b70ac5fd   Oleg Nesterov   aio: change exit_...
830
  			continue;
dc48e56d7   Jens Axboe   aio: fix serial d...
831
  		}
936af1576   Al Viro   aio: don't bother...
832
  		/*
4b70ac5fd   Oleg Nesterov   aio: change exit_...
833
834
835
836
837
  		 * We don't need to bother with munmap() here - exit_mmap(mm)
  		 * is coming and it'll unmap everything. And we simply can't,
  		 * this is not necessarily our ->mm.
  		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
  		 * that it needs to unmap the area, just set it to 0.
936af1576   Al Viro   aio: don't bother...
838
  		 */
58c85dc20   Kent Overstreet   aio: kill struct ...
839
  		ctx->mmap_size = 0;
dc48e56d7   Jens Axboe   aio: fix serial d...
840
841
  		kill_ioctx(mm, ctx, &wait);
  	}
36f558890   Kent Overstreet   aio: refcounting ...
842

dc48e56d7   Jens Axboe   aio: fix serial d...
843
  	if (!atomic_sub_and_test(skipped, &wait.count)) {
6098b45b3   Gu Zheng   aio: block exit_a...
844
  		/* Wait until all IO for the context are done. */
dc48e56d7   Jens Axboe   aio: fix serial d...
845
  		wait_for_completion(&wait.comp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
846
  	}
4b70ac5fd   Oleg Nesterov   aio: change exit_...
847
848
849
  
  	RCU_INIT_POINTER(mm->ioctx_table, NULL);
  	kfree(table);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
850
  }
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
851
852
853
  static void put_reqs_available(struct kioctx *ctx, unsigned nr)
  {
  	struct kioctx_cpu *kcpu;
263782c1c   Benjamin LaHaise   aio: protect reqs...
854
  	unsigned long flags;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
855

263782c1c   Benjamin LaHaise   aio: protect reqs...
856
  	local_irq_save(flags);
be6fb451a   Benjamin LaHaise   aio: remove no lo...
857
  	kcpu = this_cpu_ptr(ctx->cpu);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
858
  	kcpu->reqs_available += nr;
263782c1c   Benjamin LaHaise   aio: protect reqs...
859

e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
860
861
862
863
  	while (kcpu->reqs_available >= ctx->req_batch * 2) {
  		kcpu->reqs_available -= ctx->req_batch;
  		atomic_add(ctx->req_batch, &ctx->reqs_available);
  	}
263782c1c   Benjamin LaHaise   aio: protect reqs...
864
  	local_irq_restore(flags);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
865
866
867
868
869
870
  }
  
  static bool get_reqs_available(struct kioctx *ctx)
  {
  	struct kioctx_cpu *kcpu;
  	bool ret = false;
263782c1c   Benjamin LaHaise   aio: protect reqs...
871
  	unsigned long flags;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
872

263782c1c   Benjamin LaHaise   aio: protect reqs...
873
  	local_irq_save(flags);
be6fb451a   Benjamin LaHaise   aio: remove no lo...
874
  	kcpu = this_cpu_ptr(ctx->cpu);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
  	if (!kcpu->reqs_available) {
  		int old, avail = atomic_read(&ctx->reqs_available);
  
  		do {
  			if (avail < ctx->req_batch)
  				goto out;
  
  			old = avail;
  			avail = atomic_cmpxchg(&ctx->reqs_available,
  					       avail, avail - ctx->req_batch);
  		} while (avail != old);
  
  		kcpu->reqs_available += ctx->req_batch;
  	}
  
  	ret = true;
  	kcpu->reqs_available--;
  out:
263782c1c   Benjamin LaHaise   aio: protect reqs...
893
  	local_irq_restore(flags);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
894
895
  	return ret;
  }
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
  /* refill_reqs_available
   *	Returns request slots to the reqs_available accounting once the
   *	matching completion events are no longer in the ring.  The number of
   *	events currently in the ring is derived from head and tail; any part
   *	of ctx->completed_events exceeding it is subtracted from
   *	ctx->completed_events and handed to put_reqs_available().
   *	This can be called from aio_complete() (to optimistically update
   *	reqs_available) or from aio_get_req() (the we're out of events case).
   *	It must be called holding ctx->completion_lock.
   */
  static void refill_reqs_available(struct kioctx *ctx, unsigned head,
                                    unsigned tail)
  {
  	unsigned in_ring, completed;

  	/* Clamp head since userland can write to it. */
  	head %= ctx->nr_events;

  	in_ring = (head <= tail) ? tail - head
  				 : ctx->nr_events - (head - tail);

  	completed = ctx->completed_events;
  	if (in_ring >= completed)
  		return;		/* everything completed is still in the ring */

  	completed -= in_ring;
  	ctx->completed_events -= completed;
  	put_reqs_available(ctx, completed);
  }
  
  /* user_refill_reqs_available
   *	Called to refill reqs_available when aio_get_req() finds no free
   *	slot.  If any completed events are pending, reads the ring head from
   *	the ring header page and calls refill_reqs_available(), all under
   *	ctx->completion_lock.
   */
  static void user_refill_reqs_available(struct kioctx *ctx)
  {
  	struct aio_ring *ring;
  	unsigned head;

  	spin_lock_irq(&ctx->completion_lock);
  	if (!ctx->completed_events)
  		goto unlock;

  	/* Reading ring->head may race with aio_read_events_ring() here,
  	 * but that's okay: whether we see the old or the new value,
  	 * either one is valid.  The important part is that head cannot
  	 * pass tail since we prevent aio_complete() from updating tail
  	 * by holding ctx->completion_lock.  Even if head is invalid, the
  	 * check against ctx->completed_events in
  	 * refill_reqs_available() makes sure we do the safe/right thing.
  	 */
  	ring = kmap_atomic(ctx->ring_pages[0]);
  	head = ring->head;
  	kunmap_atomic(ring);

  	refill_reqs_available(ctx, head, ctx->tail);
  unlock:
  	spin_unlock_irq(&ctx->completion_lock);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
957
  /* aio_get_req
57282d8fd   Kent Overstreet   aio: Kill ki_users
958
959
   *	Allocate a slot for an aio request.
   * Returns NULL if no requests are free.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
960
   */
04b2fa9f8   Christoph Hellwig   fs: split generic...
961
  static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
962
  {
04b2fa9f8   Christoph Hellwig   fs: split generic...
963
  	struct aio_kiocb *req;
a1c8eae75   Kent Overstreet   aio: kill batch a...
964

d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
965
966
967
968
969
  	if (!get_reqs_available(ctx)) {
  		user_refill_reqs_available(ctx);
  		if (!get_reqs_available(ctx))
  			return NULL;
  	}
a1c8eae75   Kent Overstreet   aio: kill batch a...
970

0460fef2a   Kent Overstreet   aio: use cancella...
971
  	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
972
  	if (unlikely(!req))
a1c8eae75   Kent Overstreet   aio: kill batch a...
973
  		goto out_put;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
974

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
975
  	percpu_ref_get(&ctx->reqs);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
976
  	req->ki_ctx = ctx;
080d676de   Jeff Moyer   aio: allocate kio...
977
  	return req;
a1c8eae75   Kent Overstreet   aio: kill batch a...
978
  out_put:
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
979
  	put_reqs_available(ctx, 1);
a1c8eae75   Kent Overstreet   aio: kill batch a...
980
  	return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
981
  }
04b2fa9f8   Christoph Hellwig   fs: split generic...
982
  static void kiocb_free(struct aio_kiocb *req)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
983
  {
04b2fa9f8   Christoph Hellwig   fs: split generic...
984
985
  	if (req->common.ki_filp)
  		fput(req->common.ki_filp);
133890103   Davide Libenzi   eventfd: revised ...
986
987
  	if (req->ki_eventfd != NULL)
  		eventfd_ctx_put(req->ki_eventfd);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
988
  	kmem_cache_free(kiocb_cachep, req);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
989
  }
d5470b596   Adrian Bunk   fs/aio.c: make 3 ...
990
  static struct kioctx *lookup_ioctx(unsigned long ctx_id)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
991
  {
db446a08c   Benjamin LaHaise   aio: convert the ...
992
  	struct aio_ring __user *ring  = (void __user *)ctx_id;
abf137dd7   Jens Axboe   aio: make the loo...
993
  	struct mm_struct *mm = current->mm;
65c24491b   Jeff Moyer   aio: lookup_ioctx...
994
  	struct kioctx *ctx, *ret = NULL;
db446a08c   Benjamin LaHaise   aio: convert the ...
995
996
997
998
999
  	struct kioctx_table *table;
  	unsigned id;
  
  	if (get_user(id, &ring->id))
  		return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1000

abf137dd7   Jens Axboe   aio: make the loo...
1001
  	rcu_read_lock();
db446a08c   Benjamin LaHaise   aio: convert the ...
1002
  	table = rcu_dereference(mm->ioctx_table);
abf137dd7   Jens Axboe   aio: make the loo...
1003

db446a08c   Benjamin LaHaise   aio: convert the ...
1004
1005
  	if (!table || id >= table->nr)
  		goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1006

db446a08c   Benjamin LaHaise   aio: convert the ...
1007
  	ctx = table->table[id];
f30d704fe   Benjamin LaHaise   aio: table lookup...
1008
  	if (ctx && ctx->user_id == ctx_id) {
db446a08c   Benjamin LaHaise   aio: convert the ...
1009
1010
1011
1012
  		percpu_ref_get(&ctx->users);
  		ret = ctx;
  	}
  out:
abf137dd7   Jens Axboe   aio: make the loo...
1013
  	rcu_read_unlock();
65c24491b   Jeff Moyer   aio: lookup_ioctx...
1014
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1015
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1016
1017
  /* aio_complete
   *	Called when the io request on the given iocb is complete.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1018
   */
04b2fa9f8   Christoph Hellwig   fs: split generic...
1019
  static void aio_complete(struct kiocb *kiocb, long res, long res2)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1020
  {
04b2fa9f8   Christoph Hellwig   fs: split generic...
1021
  	struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1022
  	struct kioctx	*ctx = iocb->ki_ctx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1023
  	struct aio_ring	*ring;
21b40200c   Kent Overstreet   aio: use flush_dc...
1024
  	struct io_event	*ev_page, *event;
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
1025
  	unsigned tail, pos, head;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1026
  	unsigned long	flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1027

70fe2f481   Jan Kara   aio: fix freeze p...
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
  	if (kiocb->ki_flags & IOCB_WRITE) {
  		struct file *file = kiocb->ki_filp;
  
  		/*
  		 * Tell lockdep we inherited freeze protection from submission
  		 * thread.
  		 */
  		__sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE);
  		file_end_write(file);
  	}
20dcae324   Zach Brown   [PATCH] aio: remo...
1038
1039
1040
1041
1042
1043
  	/*
  	 * Special case handling for sync iocbs:
  	 *  - events go directly into the iocb for fast handling
  	 *  - the sync task with the iocb in its stack holds the single iocb
  	 *    ref, no other paths have a way to get another ref
  	 *  - the sync task helpfully left a reference to itself in the iocb
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1044
  	 */
04b2fa9f8   Christoph Hellwig   fs: split generic...
1045
  	BUG_ON(is_sync_kiocb(kiocb));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1046

0460fef2a   Kent Overstreet   aio: use cancella...
1047
1048
1049
1050
1051
1052
1053
  	if (iocb->ki_list.next) {
  		unsigned long flags;
  
  		spin_lock_irqsave(&ctx->ctx_lock, flags);
  		list_del(&iocb->ki_list);
  		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
  	}
11599ebac   Kent Overstreet   aio: make aio_put...
1054

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1055
  	/*
0460fef2a   Kent Overstreet   aio: use cancella...
1056
  	 * Add a completion event to the ring buffer. Must be done holding
4b30f07e7   Tang Chen   aio: fix wrong co...
1057
  	 * ctx->completion_lock to prevent other code from messing with the tail
0460fef2a   Kent Overstreet   aio: use cancella...
1058
1059
1060
  	 * pointer since we might be called from irq context.
  	 */
  	spin_lock_irqsave(&ctx->completion_lock, flags);
58c85dc20   Kent Overstreet   aio: kill struct ...
1061
  	tail = ctx->tail;
21b40200c   Kent Overstreet   aio: use flush_dc...
1062
  	pos = tail + AIO_EVENTS_OFFSET;
58c85dc20   Kent Overstreet   aio: kill struct ...
1063
  	if (++tail >= ctx->nr_events)
4bf69b2a0   Kenneth W Chen   [PATCH] aio: ring...
1064
  		tail = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1065

58c85dc20   Kent Overstreet   aio: kill struct ...
1066
  	ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
21b40200c   Kent Overstreet   aio: use flush_dc...
1067
  	event = ev_page + pos % AIO_EVENTS_PER_PAGE;
04b2fa9f8   Christoph Hellwig   fs: split generic...
1068
  	event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1069
1070
1071
  	event->data = iocb->ki_user_data;
  	event->res = res;
  	event->res2 = res2;
21b40200c   Kent Overstreet   aio: use flush_dc...
1072
  	kunmap_atomic(ev_page);
58c85dc20   Kent Overstreet   aio: kill struct ...
1073
  	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
21b40200c   Kent Overstreet   aio: use flush_dc...
1074
1075
1076
  
  	pr_debug("%p[%u]: %p: %p %Lx %lx %lx
  ",
04b2fa9f8   Christoph Hellwig   fs: split generic...
1077
  		 ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
caf4167aa   Kent Overstreet   aio: dprintk() ->...
1078
  		 res, res2);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1079
1080
1081
1082
1083
  
  	/* after flagging the request as done, we
  	 * must never even look at it again
  	 */
  	smp_wmb();	/* make event visible before updating tail */
58c85dc20   Kent Overstreet   aio: kill struct ...
1084
  	ctx->tail = tail;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1085

58c85dc20   Kent Overstreet   aio: kill struct ...
1086
  	ring = kmap_atomic(ctx->ring_pages[0]);
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
1087
  	head = ring->head;
21b40200c   Kent Overstreet   aio: use flush_dc...
1088
  	ring->tail = tail;
e8e3c3d66   Cong Wang   fs: remove the se...
1089
  	kunmap_atomic(ring);
58c85dc20   Kent Overstreet   aio: kill struct ...
1090
  	flush_dcache_page(ctx->ring_pages[0]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1091

d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
1092
1093
1094
  	ctx->completed_events++;
  	if (ctx->completed_events > 1)
  		refill_reqs_available(ctx, head, tail);
0460fef2a   Kent Overstreet   aio: use cancella...
1095
  	spin_unlock_irqrestore(&ctx->completion_lock, flags);
21b40200c   Kent Overstreet   aio: use flush_dc...
1096
1097
  	pr_debug("added to ring %p at [%u]
  ", iocb, tail);
8d1c98b0b   Davide Libenzi   eventfd/kaio inte...
1098
1099
1100
1101
1102
1103
  
  	/*
  	 * Check if the user asked us to deliver the result through an
  	 * eventfd. The eventfd_signal() function is safe to be called
  	 * from IRQ context.
  	 */
87c3a86e1   Davide Libenzi   eventfd: remove f...
1104
  	if (iocb->ki_eventfd != NULL)
8d1c98b0b   Davide Libenzi   eventfd/kaio inte...
1105
  		eventfd_signal(iocb->ki_eventfd, 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1106
  	/* everything turned out well, dispose of the aiocb. */
57282d8fd   Kent Overstreet   aio: Kill ki_users
1107
  	kiocb_free(iocb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1108

6cb2a2104   Quentin Barnes   aio: bad AIO race...
1109
1110
1111
1112
1113
1114
1115
  	/*
  	 * We have to order our ring_info tail store above and test
  	 * of the wait list below outside the wait lock.  This is
  	 * like in wake_up_bit() where clearing a bit has to be
  	 * ordered with the unlocked test.
  	 */
  	smp_mb();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1116
1117
  	if (waitqueue_active(&ctx->wait))
  		wake_up(&ctx->wait);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
1118
  	percpu_ref_put(&ctx->reqs);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1119
  }
2be4e7dee   Gu Zheng   aio: fix some com...
1120
  /* aio_read_events_ring
a31ad380b   Kent Overstreet   aio: make aio_rea...
1121
1122
   *	Pull an event off of the ioctx's event ring.  Returns the number of
   *	events fetched
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1123
   */
a31ad380b   Kent Overstreet   aio: make aio_rea...
1124
1125
  static long aio_read_events_ring(struct kioctx *ctx,
  				 struct io_event __user *event, long nr)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1126
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1127
  	struct aio_ring *ring;
5ffac122d   Kent Overstreet   aio: Don't use ct...
1128
  	unsigned head, tail, pos;
a31ad380b   Kent Overstreet   aio: make aio_rea...
1129
1130
  	long ret = 0;
  	int copy_ret;
9c9ce763b   Dave Chinner   aio: annotate aio...
1131
1132
1133
1134
1135
1136
1137
  	/*
  	 * The mutex can block and wake us up and that will cause
  	 * wait_event_interruptible_hrtimeout() to schedule without sleeping
  	 * and repeat. This should be rare enough that it doesn't cause
  	 * peformance issues. See the comment in read_events() for more detail.
  	 */
  	sched_annotate_sleep();
58c85dc20   Kent Overstreet   aio: kill struct ...
1138
  	mutex_lock(&ctx->ring_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1139

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
1140
  	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
58c85dc20   Kent Overstreet   aio: kill struct ...
1141
  	ring = kmap_atomic(ctx->ring_pages[0]);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1142
  	head = ring->head;
5ffac122d   Kent Overstreet   aio: Don't use ct...
1143
  	tail = ring->tail;
a31ad380b   Kent Overstreet   aio: make aio_rea...
1144
  	kunmap_atomic(ring);
2ff396be6   Jeff Moyer   aio: add missing ...
1145
1146
1147
1148
1149
  	/*
  	 * Ensure that once we've read the current tail pointer, that
  	 * we also see the events that were stored up to the tail.
  	 */
  	smp_rmb();
5ffac122d   Kent Overstreet   aio: Don't use ct...
1150
1151
  	pr_debug("h%u t%u m%u
  ", head, tail, ctx->nr_events);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1152

5ffac122d   Kent Overstreet   aio: Don't use ct...
1153
  	if (head == tail)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1154
  		goto out;
edfbbf388   Benjamin LaHaise   aio: fix kernel m...
1155
1156
  	head %= ctx->nr_events;
  	tail %= ctx->nr_events;
a31ad380b   Kent Overstreet   aio: make aio_rea...
1157
1158
1159
1160
  	while (ret < nr) {
  		long avail;
  		struct io_event *ev;
  		struct page *page;
5ffac122d   Kent Overstreet   aio: Don't use ct...
1161
1162
  		avail = (head <= tail ?  tail : ctx->nr_events) - head;
  		if (head == tail)
a31ad380b   Kent Overstreet   aio: make aio_rea...
1163
1164
1165
1166
1167
1168
1169
  			break;
  
  		avail = min(avail, nr - ret);
  		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
  			    ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
  
  		pos = head + AIO_EVENTS_OFFSET;
58c85dc20   Kent Overstreet   aio: kill struct ...
1170
  		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
a31ad380b   Kent Overstreet   aio: make aio_rea...
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
  		pos %= AIO_EVENTS_PER_PAGE;
  
  		ev = kmap(page);
  		copy_ret = copy_to_user(event + ret, ev + pos,
  					sizeof(*ev) * avail);
  		kunmap(page);
  
  		if (unlikely(copy_ret)) {
  			ret = -EFAULT;
  			goto out;
  		}
  
  		ret += avail;
  		head += avail;
58c85dc20   Kent Overstreet   aio: kill struct ...
1185
  		head %= ctx->nr_events;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1186
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1187

58c85dc20   Kent Overstreet   aio: kill struct ...
1188
  	ring = kmap_atomic(ctx->ring_pages[0]);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1189
  	ring->head = head;
91d80a84b   Zhao Hongjiang   aio: fix possible...
1190
  	kunmap_atomic(ring);
58c85dc20   Kent Overstreet   aio: kill struct ...
1191
  	flush_dcache_page(ctx->ring_pages[0]);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1192

5ffac122d   Kent Overstreet   aio: Don't use ct...
1193
1194
  	pr_debug("%li  h%u t%u
  ", ret, head, tail);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1195
  out:
58c85dc20   Kent Overstreet   aio: kill struct ...
1196
  	mutex_unlock(&ctx->ring_lock);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1197

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1198
1199
  	return ret;
  }
a31ad380b   Kent Overstreet   aio: make aio_rea...
1200
1201
  /*
   * aio_read_events:
   *	One harvesting pass over the completion ring, used as the wait
   *	condition by read_events().
   *
   *	*i is the running count of events already copied to @event; each pass
   *	asks the ring for at most (@nr - *i) further events, stored starting
   *	at @event + *i.  If nothing has been copied so far, *i takes the
   *	status of this pass instead (0, or a negative errno).
   *
   *	A context marked dead turns the status into -EINVAL, although events
   *	copied by this pass are still counted in *i.
   *
   *	Returns true when the caller should stop waiting: either an error
   *	occurred or at least @min_nr events have been collected.
   */
  static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
  			    struct io_event __user *event, long *i)
  {
  	long status = aio_read_events_ring(ctx, event + *i, nr - *i);

  	if (status > 0)
  		*i += status;
  	if (unlikely(atomic_read(&ctx->dead)))
  		status = -EINVAL;
  	/* Nothing harvested yet: hand the status back through *i. */
  	if (*i == 0)
  		*i = status;

  	if (status < 0)
  		return true;
  	return *i >= min_nr;
  }
a31ad380b   Kent Overstreet   aio: make aio_rea...
1216
  /*
   * read_events:
   *	Collect between @min_nr and @nr completion events from @ctx into the
   *	user array @event, waiting up to the relative @timeout (NULL means
   *	wait without a deadline).
   *
   *	Returns the number of events copied, or a negative errno: -EFAULT if
   *	@timeout cannot be read, -EINTR if no events were collected and a
   *	signal is pending, or whatever error aio_read_events() reported.
   */
  static long read_events(struct kioctx *ctx, long min_nr, long nr,
  			struct io_event __user *event,
  			struct timespec __user *timeout)
  {
  	ktime_t until = { .tv64 = KTIME_MAX };
  	long nread = 0;

  	if (timeout) {
  		struct timespec rel;

  		if (unlikely(copy_from_user(&rel, timeout, sizeof(rel))))
  			return -EFAULT;
  		until = timespec_to_ktime(rel);
  	}

  	/*
  	 * aio_read_events() serves as the wait condition below, which means
  	 * it is evaluated after prepare_to_wait() has already put the task
  	 * into TASK_INTERRUPTIBLE.
  	 *
  	 * aio_read_events() may itself block, and blocking resets the task
  	 * state to TASK_RUNNING.  That is tolerable as long as it does not
  	 * keep resetting the state and then returning 0, because that makes
  	 * this wait spin.  It can only happen when the mutex_lock() call
  	 * blocks and the ring buffer then turns out to be empty, so in
  	 * practice this is fine - but keep it in mind when changing this code.
  	 */
  	if (until.tv64 != 0)
  		wait_event_interruptible_hrtimeout(ctx->wait,
  				aio_read_events(ctx, min_nr, nr, event, &nread),
  				until);
  	else
  		/* Zero timeout: a single non-waiting pass, no timer needed. */
  		aio_read_events(ctx, min_nr, nr, event, &nread);

  	if (nread == 0 && signal_pending(current))
  		return -EINTR;

  	return nread;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
  /* sys_io_setup:
   *	Create an aio_context capable of receiving at least nr_events.
   *	ctxp must not point to an aio_context that already exists, and
   *	must be initialized to 0 prior to the call.  On successful
   *	creation of the aio_context, *ctxp is filled in with the resulting 
   *	handle.  May fail with -EINVAL if *ctxp is not initialized,
   *	if the specified nr_events exceeds internal limits.  May fail 
   *	with -EAGAIN if the specified nr_events exceeds the user's limit 
   *	of available events.  May fail with -ENOMEM if insufficient kernel
   *	resources are available.  May fail with -EFAULT if an invalid
   *	pointer is passed for ctxp.  Will fail with -ENOSYS if not
   *	implemented.
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1270
  SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
  {
  	struct kioctx *ioctx = NULL;
  	unsigned long ctx;
  	long ret;
  
  	ret = get_user(ctx, ctxp);
  	if (unlikely(ret))
  		goto out;
  
  	ret = -EINVAL;
d55b5fdaf   Zach Brown   [PATCH] aio: remo...
1281
  	if (unlikely(ctx || nr_events == 0)) {
acd88d4e1   Kinglong Mee   fs/aio.c: Remove ...
1282
1283
  		pr_debug("EINVAL: ctx %lu nr_events %u
  ",
d55b5fdaf   Zach Brown   [PATCH] aio: remo...
1284
  		         ctx, nr_events);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1285
1286
1287
1288
1289
1290
1291
  		goto out;
  	}
  
  	ioctx = ioctx_alloc(nr_events);
  	ret = PTR_ERR(ioctx);
  	if (!IS_ERR(ioctx)) {
  		ret = put_user(ioctx->user_id, ctxp);
a2e1859ad   Al Viro   aio: take final p...
1292
  		if (ret)
e02ba72aa   Anatol Pomozov   aio: block io_des...
1293
  			kill_ioctx(current->mm, ioctx, NULL);
723be6e39   Kent Overstreet   aio: percpu ioctx...
1294
  		percpu_ref_put(&ioctx->users);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1295
1296
1297
1298
1299
1300
1301
1302
1303
  	}
  
  out:
  	return ret;
  }
  
  /* sys_io_destroy:
   *	Destroy the aio_context specified.  May cancel any outstanding 
   *	AIOs and block on completion.  Will fail with -ENOSYS if not
642b5123a   Satoru Takeuchi   aio: fix wrong su...
1304
   *	implemented.  May fail with -EINVAL if the context pointed to
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1305
1306
   *	is invalid.
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1307
  SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1308
1309
1310
  {
  	struct kioctx *ioctx = lookup_ioctx(ctx);
  	if (likely(NULL != ioctx)) {
dc48e56d7   Jens Axboe   aio: fix serial d...
1311
  		struct ctx_rq_wait wait;
fb2d44838   Benjamin LaHaise   aio: report error...
1312
  		int ret;
e02ba72aa   Anatol Pomozov   aio: block io_des...
1313

dc48e56d7   Jens Axboe   aio: fix serial d...
1314
1315
  		init_completion(&wait.comp);
  		atomic_set(&wait.count, 1);
e02ba72aa   Anatol Pomozov   aio: block io_des...
1316
1317
1318
1319
  		/* Pass requests_done to kill_ioctx() where it can be set
  		 * in a thread-safe way. If we try to set it here then we have
  		 * a race condition if two io_destroy() called simultaneously.
  		 */
dc48e56d7   Jens Axboe   aio: fix serial d...
1320
  		ret = kill_ioctx(current->mm, ioctx, &wait);
723be6e39   Kent Overstreet   aio: percpu ioctx...
1321
  		percpu_ref_put(&ioctx->users);
e02ba72aa   Anatol Pomozov   aio: block io_des...
1322
1323
1324
1325
1326
  
  		/* Wait until all IO for the context are done. Otherwise kernel
  		 * keep using user-space buffers even if user thinks the context
  		 * is destroyed.
  		 */
fb2d44838   Benjamin LaHaise   aio: report error...
1327
  		if (!ret)
dc48e56d7   Jens Axboe   aio: fix serial d...
1328
  			wait_for_completion(&wait.comp);
e02ba72aa   Anatol Pomozov   aio: block io_des...
1329

fb2d44838   Benjamin LaHaise   aio: report error...
1330
  		return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1331
  	}
acd88d4e1   Kinglong Mee   fs/aio.c: Remove ...
1332
1333
  	pr_debug("EINVAL: invalid context id
  ");
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1334
1335
  	return -EINVAL;
  }
89319d31d   Christoph Hellwig   fs: remove aio_ru...
1336
1337
  /*
   * aio_setup_rw:
   *	Build @iter from the user buffer described by @iocb for a transfer in
   *	direction @rw.
   *
   *	Non-vectored requests treat aio_buf/aio_nbytes as one contiguous
   *	range; *@iovec is used as scratch space for the import and is then
   *	set to NULL.  Vectored requests treat aio_buf as an iovec array of
   *	aio_nbytes entries and import it with import_iovec(), or with
   *	compat_import_iovec() when @compat is set and CONFIG_COMPAT is
   *	enabled.
   *
   *	Returns the result of the import helper that was used.
   */
  static int aio_setup_rw(int rw, struct iocb *iocb, struct iovec **iovec,
  		bool vectored, bool compat, struct iov_iter *iter)
  {
  	void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
  	size_t len = iocb->aio_nbytes;
  	ssize_t rc;

  	if (vectored) {
  #ifdef CONFIG_COMPAT
  		if (compat)
  			return compat_import_iovec(rw, buf, len, UIO_FASTIOV,
  					iovec, iter);
  #endif
  		return import_iovec(rw, buf, len, UIO_FASTIOV, iovec, iter);
  	}

  	rc = import_single_range(rw, buf, len, *iovec, iter);
  	/* Clear the pointer so the caller's kfree(iovec) is a no-op. */
  	*iovec = NULL;
  	return rc;
  }
89319d31d   Christoph Hellwig   fs: remove aio_ru...
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
  /*
   * aio_ret:
   *	Post-process the value returned by a ->read_iter()/->write_iter()
   *	call made on behalf of @req.
   *
   *	-EIOCBQUEUED is passed straight back and nothing is completed here.
   *	Every other value completes the request via aio_complete() and makes
   *	this function return 0.  The syscall-restart codes are converted to
   *	-EINTR first.
   */
  static inline ssize_t aio_ret(struct kiocb *req, ssize_t ret)
  {
  	if (ret == -EIOCBQUEUED)
  		return ret;

  	/*
  	 * Restarting the syscall is not practical because other AIOs may
  	 * already be in flight, so fail just this IO with EINTR.
  	 */
  	if (ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
  	    ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)
  		ret = -EINTR;

  	aio_complete(req, ret, 0);
  	return 0;
  }
  
  /*
   * aio_read:
   *	Issue the read described by @iocb on the file attached to @req.
   *	@vectored selects iovec-array handling of the user buffer and @compat
   *	selects the compat iovec layout.
   *
   *	Returns -EBADF if the file is not open for reading, -EINVAL if it has
   *	no ->read_iter method, an error from buffer setup or
   *	rw_verify_area(), or otherwise the value produced by aio_ret().
   */
  static ssize_t aio_read(struct kiocb *req, struct iocb *iocb, bool vectored,
  		bool compat)
  {
  	struct file *file = req->ki_filp;
  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
  	struct iov_iter iter;
  	ssize_t ret;

  	if (unlikely(!(file->f_mode & FMODE_READ)))
  		return -EBADF;
  	if (unlikely(!file->f_op->read_iter))
  		return -EINVAL;

  	ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
  	if (ret)
  		return ret;

  	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
  	if (ret)
  		goto out_free;

  	ret = aio_ret(req, file->f_op->read_iter(req, &iter));

  out_free:
  	kfree(iovec);
  	return ret;
  }
73a7075e3   Kent Overstreet   aio: Kill aio_rw_...
1397

89319d31d   Christoph Hellwig   fs: remove aio_ru...
1398
1399
1400
1401
1402
1403
1404
  /*
   * aio_write:
   *	Issue the write described by @iocb on the file attached to @req.
   *	@vectored selects iovec-array handling of the user buffer and @compat
   *	selects the compat iovec layout.
   *
   *	Returns -EBADF if the file is not open for writing, -EINVAL if it has
   *	no ->write_iter method, an error from buffer setup or
   *	rw_verify_area(), or otherwise the value produced by aio_ret().
   */
  static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
  		bool compat)
  {
  	struct file *file = req->ki_filp;
  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
  	struct iov_iter iter;
  	ssize_t ret;

  	if (unlikely(!(file->f_mode & FMODE_WRITE)))
  		return -EBADF;
  	if (unlikely(!file->f_op->write_iter))
  		return -EINVAL;

  	ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
  	if (ret)
  		return ret;

  	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
  	if (ret)
  		goto out_free;

  	/* Flag the request as a write before freeze protection is taken. */
  	req->ki_flags |= IOCB_WRITE;
  	file_start_write(file);
  	ret = aio_ret(req, file->f_op->write_iter(req, &iter));
  	/*
  	 * Freeze protection is dropped in aio_complete().  Tell lockdep the
  	 * lock has been released already so it does not complain about a
  	 * held lock when we return to userspace.
  	 *
  	 * NOTE(review): this release is unconditional; confirm it is balanced
  	 * for files on which file_start_write() takes no freeze protection.
  	 */
  	__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);

  out_free:
  	kfree(iovec);
  	return ret;
  }
d5470b596   Adrian Bunk   fs/aio.c: make 3 ...
1429
  /*
   * io_submit_one:
   *	Validate and submit a single request on @ctx.  @iocb is the kernel
   *	copy of the user control block at @user_iocb; @compat is forwarded to
   *	the read/write helpers for iovec decoding.
   *
   *	Returns 0 when the request was queued or has already completed.
   *	Otherwise returns a negative errno: -EINVAL for bad fields or an
   *	unknown opcode, -EAGAIN when no request could be allocated, -EBADF
   *	for a bad file descriptor, or the error from the eventfd lookup, the
   *	aio_key store or the read/write helper.  On those error paths any
   *	request that was allocated is released again.
   */
  static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
  			 struct iocb *iocb, bool compat)
  {
  	struct aio_kiocb *req;
  	struct kiocb *kiocb;
  	struct file *file;
  	ssize_t ret;

  	/* Reserved fields must be zero, for forwards compatibility. */
  	if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
  		pr_debug("EINVAL: reserve field set\n");
  		return -EINVAL;
  	}

  	/*
  	 * Reject buffer addresses and lengths that do not survive conversion
  	 * to the native types, or whose length is negative as an ssize_t.
  	 */
  	if (unlikely(iocb->aio_buf != (unsigned long)iocb->aio_buf ||
  		     iocb->aio_nbytes != (size_t)iocb->aio_nbytes ||
  		     (ssize_t)iocb->aio_nbytes < 0)) {
  		pr_debug("EINVAL: overflow check\n");
  		return -EINVAL;
  	}

  	req = aio_get_req(ctx);
  	if (unlikely(!req))
  		return -EAGAIN;
  	kiocb = &req->common;

  	file = fget(iocb->aio_fildes);
  	kiocb->ki_filp = file;
  	if (unlikely(!file)) {
  		ret = -EBADF;
  		goto out_put_req;
  	}
  	kiocb->ki_pos = iocb->aio_offset;
  	kiocb->ki_complete = aio_complete;
  	kiocb->ki_flags = iocb_flags(file);

  	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
  		/*
  		 * IOCB_FLAG_RESFD asks for completion notification through
  		 * an eventfd.  Resolve aio_resfd now; it must refer to an
  		 * eventfd() descriptor, which will be signaled for each
  		 * completed event using eventfd_signal().
  		 */
  		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
  		if (IS_ERR(req->ki_eventfd)) {
  			ret = PTR_ERR(req->ki_eventfd);
  			req->ki_eventfd = NULL;
  			goto out_put_req;
  		}

  		kiocb->ki_flags |= IOCB_EVENTFD;
  	}

  	ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
  	if (unlikely(ret)) {
  		pr_debug("EFAULT: aio_key\n");
  		goto out_put_req;
  	}

  	req->ki_user_iocb = user_iocb;
  	req->ki_user_data = iocb->aio_data;

  	/*
  	 * Take an extra file reference for the duration of the submission,
  	 * presumably so the file stays valid here even if the request
  	 * completes and drops its own reference before the helper returns.
  	 */
  	get_file(file);
  	switch (iocb->aio_lio_opcode) {
  	case IOCB_CMD_PREAD:
  		ret = aio_read(kiocb, iocb, false, compat);
  		break;
  	case IOCB_CMD_PWRITE:
  		ret = aio_write(kiocb, iocb, false, compat);
  		break;
  	case IOCB_CMD_PREADV:
  		ret = aio_read(kiocb, iocb, true, compat);
  		break;
  	case IOCB_CMD_PWRITEV:
  		ret = aio_write(kiocb, iocb, true, compat);
  		break;
  	default:
  		pr_debug("invalid aio operation %d\n", iocb->aio_lio_opcode);
  		ret = -EINVAL;
  		break;
  	}
  	fput(file);

  	/* Both "completed" and "queued" count as a successful submission. */
  	if (!ret || ret == -EIOCBQUEUED)
  		return 0;

  out_put_req:
  	/* Release the request slot, the ctx->reqs reference and the request. */
  	put_reqs_available(ctx, 1);
  	percpu_ref_put(&ctx->reqs);
  	kiocb_free(req);
  	return ret;
  }
9d85cba71   Jeff Moyer   aio: fix the comp...
1523
1524
  /*
   * do_io_submit:
   *	Common body of io_submit.  Walks the user array @iocbpp of @nr iocb
   *	pointers and submits each one on the context named by @ctx_id,
   *	stopping at the first failure.  @compat is passed through to
   *	io_submit_one().
   *
   *	Returns the number of iocbs submitted if at least one was accepted.
   *	Otherwise returns the first error, or 0 when @nr is 0.  Fails with
   *	-EINVAL for a negative @nr or an unknown context and with -EFAULT
   *	when the pointer array or an iocb cannot be read.
   */
  long do_io_submit(aio_context_t ctx_id, long nr,
  		  struct iocb __user *__user *iocbpp, bool compat)
  {
  	struct kioctx *ctx;
  	struct blk_plug plug;
  	long ret = 0;
  	int i = 0;

  	if (unlikely(nr < 0))
  		return -EINVAL;

  	/* Clamp nr so that nr * sizeof(*iocbpp) below cannot overflow. */
  	if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
  		nr = LONG_MAX/sizeof(*iocbpp);

  	if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
  		return -EFAULT;

  	ctx = lookup_ioctx(ctx_id);
  	if (unlikely(!ctx)) {
  		pr_debug("EINVAL: invalid context id\n");
  		return -EINVAL;
  	}

  	blk_start_plug(&plug);

  	/*
  	 * Submission stops at the first iocb that fails.  Earlier iocbs stay
  	 * submitted and are reported through the return value.
  	 */
  	while (i < nr) {
  		struct iocb __user *user_iocb;
  		struct iocb kcopy;

  		if (unlikely(__get_user(user_iocb, iocbpp + i))) {
  			ret = -EFAULT;
  			break;
  		}

  		if (unlikely(copy_from_user(&kcopy, user_iocb, sizeof(kcopy)))) {
  			ret = -EFAULT;
  			break;
  		}

  		ret = io_submit_one(ctx, user_iocb, &kcopy, compat);
  		if (ret)
  			break;

  		i++;
  	}
  	blk_finish_plug(&plug);

  	percpu_ref_put(&ctx->users);

  	if (i)
  		return i;
  	return ret;
  }
9d85cba71   Jeff Moyer   aio: fix the comp...
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
  /* sys_io_submit:
   *	Queue the nr iocbs pointed to by iocbpp for processing and return
   *	the number of iocbs queued.  Will return 0 if nr is 0.
   *
   *	May return -EINVAL if the aio_context specified by ctx_id is
   *	invalid, if nr is < 0, if the iocb at *iocbpp[0] is not properly
   *	initialized, or if the operation specified is invalid for the file
   *	descriptor in the iocb.  May fail with -EFAULT if any of the data
   *	structures point to invalid data.  May fail with -EBADF if the file
   *	descriptor specified in the first iocb is invalid.  May fail with
   *	-EAGAIN if insufficient resources are available to queue any iocbs.
   *	Will fail with -ENOSYS if not implemented.
   */
  SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
  		struct iocb __user * __user *, iocbpp)
  {
  	/* Native entry point: iocbs are not in compat layout. */
  	return do_io_submit(ctx_id, nr, iocbpp, false);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1588
1589
  /* lookup_kiocb
   *	Finds a given iocb for cancellation.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1590
   */
04b2fa9f8   Christoph Hellwig   fs: split generic...
1591
1592
  static struct aio_kiocb *
  lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1593
  {
04b2fa9f8   Christoph Hellwig   fs: split generic...
1594
  	struct aio_kiocb *kiocb;
d00689af6   Zach Brown   [PATCH] aio: repl...
1595
1596
  
  	assert_spin_locked(&ctx->ctx_lock);
8a6608907   Kent Overstreet   aio: kill ki_key
1597
1598
  	if (key != KIOCB_KEY)
  		return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1599
  	/* TODO: use a hash or array, this sucks. */
04b2fa9f8   Christoph Hellwig   fs: split generic...
1600
1601
  	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
  		if (kiocb->ki_user_iocb == iocb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
  			return kiocb;
  	}
  	return NULL;
  }
  
  /* sys_io_cancel:
   *	Attempts to cancel an iocb previously passed to io_submit.  If
   *	the operation is successfully cancelled, the resulting event is
   *	copied into the memory pointed to by result without being placed
   *	into the completion queue and 0 is returned.  May fail with
   *	-EFAULT if any of the data structures pointed to are invalid.
   *	May fail with -EINVAL if aio_context specified by ctx_id is
   *	invalid.  May fail with -EAGAIN if the iocb specified was not
   *	cancelled.  Will fail with -ENOSYS if not implemented.
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1617
1618
  SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
  		struct io_event __user *, result)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1619
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1620
  	struct kioctx *ctx;
04b2fa9f8   Christoph Hellwig   fs: split generic...
1621
  	struct aio_kiocb *kiocb;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
  	u32 key;
  	int ret;
  
  	ret = get_user(key, &iocb->aio_key);
  	if (unlikely(ret))
  		return -EFAULT;
  
  	ctx = lookup_ioctx(ctx_id);
  	if (unlikely(!ctx))
  		return -EINVAL;
  
  	spin_lock_irq(&ctx->ctx_lock);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1634

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1635
  	kiocb = lookup_kiocb(ctx, iocb, key);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1636
  	if (kiocb)
d52a8f9ea   Fabian Frederick   fs/aio.c: Remove ...
1637
  		ret = kiocb_cancel(kiocb);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1638
1639
  	else
  		ret = -EINVAL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1640
  	spin_unlock_irq(&ctx->ctx_lock);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1641
  	if (!ret) {
bec68faaf   Kent Overstreet   aio: io_cancel() ...
1642
1643
1644
1645
  		/*
  		 * The result argument is no longer used - the io_event is
  		 * always delivered via the ring buffer. -EINPROGRESS indicates
  		 * cancellation is progress:
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1646
  		 */
bec68faaf   Kent Overstreet   aio: io_cancel() ...
1647
  		ret = -EINPROGRESS;
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1648
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1649

723be6e39   Kent Overstreet   aio: percpu ioctx...
1650
  	percpu_ref_put(&ctx->users);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1651
1652
1653
1654
1655
1656
  
  	return ret;
  }
  
  /* io_getevents:
   *	Attempts to read at least min_nr events and up to nr events from
642b5123a   Satoru Takeuchi   aio: fix wrong su...
1657
1658
1659
1660
1661
1662
1663
1664
   *	the completion queue for the aio_context specified by ctx_id. If
   *	it succeeds, the number of read events is returned. May fail with
   *	-EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
   *	out of range, if timeout is out of range.  May fail with -EFAULT
   *	if any of the memory specified is invalid.  May return 0 or
   *	< min_nr if the timeout specified by timeout has elapsed
   *	before sufficient events are available, where timeout == NULL
   *	specifies an infinite timeout. Note that the timeout pointed to by
6900807c6   Jeff Moyer   aio: fix io_getev...
1665
   *	timeout is relative.  Will fail with -ENOSYS if not implemented.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1666
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1667
1668
1669
1670
1671
  SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
  		long, min_nr,
  		long, nr,
  		struct io_event __user *, events,
  		struct timespec __user *, timeout)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1672
1673
1674
1675
1676
  {
  	struct kioctx *ioctx = lookup_ioctx(ctx_id);
  	long ret = -EINVAL;
  
  	if (likely(ioctx)) {
2e4102559   Namhyung Kim   aio: remove unnec...
1677
  		if (likely(min_nr <= nr && min_nr >= 0))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1678
  			ret = read_events(ioctx, min_nr, nr, events, timeout);
723be6e39   Kent Overstreet   aio: percpu ioctx...
1679
  		percpu_ref_put(&ioctx->users);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1680
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1681
1682
  	return ret;
  }