Blame view

fs/aio.c 42.8 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
  /*
   *	An async IO implementation for Linux
   *	Written by Benjamin LaHaise <bcrl@kvack.org>
   *
   *	Implements an efficient asynchronous io interface.
   *
   *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
   *
   *	See ../COPYING for licensing terms.
   */
caf4167aa   Kent Overstreet   aio: dprintk() ->...
11
  #define pr_fmt(fmt) "%s: " fmt, __func__
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
12
13
14
15
16
  #include <linux/kernel.h>
  #include <linux/init.h>
  #include <linux/errno.h>
  #include <linux/time.h>
  #include <linux/aio_abi.h>
630d9c472   Paul Gortmaker   fs: reduce the us...
17
  #include <linux/export.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
18
  #include <linux/syscalls.h>
b9d128f10   Jens Axboe   block: move bdi/a...
19
  #include <linux/backing-dev.h>
027445c37   Badari Pulavarty   [PATCH] Vectorize...
20
  #include <linux/uio.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
21

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
22
23
24
25
26
  #include <linux/sched.h>
  #include <linux/fs.h>
  #include <linux/file.h>
  #include <linux/mm.h>
  #include <linux/mman.h>
3d2d827f5   Michael S. Tsirkin   mm: move use_mm/u...
27
  #include <linux/mmu_context.h>
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
28
  #include <linux/percpu.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
29
30
31
32
33
34
  #include <linux/slab.h>
  #include <linux/timer.h>
  #include <linux/aio.h>
  #include <linux/highmem.h>
  #include <linux/workqueue.h>
  #include <linux/security.h>
9c3060bed   Davide Libenzi   signal/timer/even...
35
  #include <linux/eventfd.h>
cfb1e33ee   Jeff Moyer   aio: implement re...
36
  #include <linux/blkdev.h>
9d85cba71   Jeff Moyer   aio: fix the comp...
37
  #include <linux/compat.h>
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
38
39
  #include <linux/migrate.h>
  #include <linux/ramfs.h>
723be6e39   Kent Overstreet   aio: percpu ioctx...
40
  #include <linux/percpu-refcount.h>
71ad7490c   Benjamin LaHaise   rework aio migrat...
41
  #include <linux/mount.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
42
43
44
  
  #include <asm/kmap_types.h>
  #include <asm/uaccess.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
45

68d70d03f   Al Viro   constify rw_verif...
46
  #include "internal.h"
4e179bca6   Kent Overstreet   aio: move private...
47
48
49
50
51
52
  #define AIO_RING_MAGIC			0xa10a10a1
  #define AIO_RING_COMPAT_FEATURES	1
  #define AIO_RING_INCOMPAT_FEATURES	0
  struct aio_ring {
  	unsigned	id;	/* kernel internal index number */
  	unsigned	nr;	/* number of io_events */
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
53
54
  	unsigned	head;	/* Written to by userland or under ring_lock
  				 * mutex by aio_read_events_ring(). */
4e179bca6   Kent Overstreet   aio: move private...
55
56
57
58
59
60
61
62
63
64
65
66
  	unsigned	tail;
  
  	unsigned	magic;
  	unsigned	compat_features;
  	unsigned	incompat_features;
  	unsigned	header_length;	/* size of aio_ring */
  
  
  	struct io_event		io_events[0];
  }; /* 128 bytes + ring size */
  
  #define AIO_RING_PAGES	8
4e179bca6   Kent Overstreet   aio: move private...
67

db446a08c   Benjamin LaHaise   aio: convert the ...
68
69
70
71
72
  /*
   * Table of aio contexts reached through mm->ioctx_table.
   * ioctx_add_table() stores a context in the first NULL slot; when none is
   * free it installs a larger copy and releases the old table with kfree_rcu().
   */
  struct kioctx_table {
  	struct rcu_head	rcu;	/* used by kfree_rcu() when the table is replaced */
  	unsigned	nr;	/* number of slots in table[] */
  	struct kioctx	*table[];	/* NULL entries are free slots */
  };
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
73
74
75
  /* Per-cpu part of a kioctx; ioctx_alloc() allocates it with alloc_percpu(). */
  struct kioctx_cpu {
  	unsigned		reqs_available;	/* percpu counterpart of kioctx.reqs_available */
  };
4e179bca6   Kent Overstreet   aio: move private...
76
  struct kioctx {
723be6e39   Kent Overstreet   aio: percpu ioctx...
77
  	struct percpu_ref	users;
36f558890   Kent Overstreet   aio: refcounting ...
78
  	atomic_t		dead;
4e179bca6   Kent Overstreet   aio: move private...
79

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
80
  	struct percpu_ref	reqs;
4e179bca6   Kent Overstreet   aio: move private...
81
  	unsigned long		user_id;
4e179bca6   Kent Overstreet   aio: move private...
82

e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
83
84
85
86
87
88
89
  	struct __percpu kioctx_cpu *cpu;
  
  	/*
  	 * For percpu reqs_available, number of slots we move to/from global
  	 * counter at a time:
  	 */
  	unsigned		req_batch;
3e845ce01   Kent Overstreet   aio: change reqs_...
90
91
92
93
  	/*
  	 * This is what userspace passed to io_setup(), it's not used for
  	 * anything but counting against the global max_reqs quota.
  	 *
58c85dc20   Kent Overstreet   aio: kill struct ...
94
  	 * The real limit is nr_events - 1, which will be larger (see
3e845ce01   Kent Overstreet   aio: change reqs_...
95
96
  	 * aio_setup_ring())
  	 */
4e179bca6   Kent Overstreet   aio: move private...
97
  	unsigned		max_reqs;
58c85dc20   Kent Overstreet   aio: kill struct ...
98
99
  	/* Size of ringbuffer, in units of struct io_event */
  	unsigned		nr_events;
4e179bca6   Kent Overstreet   aio: move private...
100

58c85dc20   Kent Overstreet   aio: kill struct ...
101
102
103
104
105
  	unsigned long		mmap_base;
  	unsigned long		mmap_size;
  
  	struct page		**ring_pages;
  	long			nr_pages;
723be6e39   Kent Overstreet   aio: percpu ioctx...
106
  	struct work_struct	free_work;
4e23bcaeb   Kent Overstreet   aio: give shared ...
107

e02ba72aa   Anatol Pomozov   aio: block io_des...
108
109
110
111
  	/*
  	 * signals when all in-flight requests are done
  	 */
  	struct completion *requests_done;
4e23bcaeb   Kent Overstreet   aio: give shared ...
112
  	struct {
34e83fc61   Kent Overstreet   aio: reqs_active ...
113
114
115
116
117
  		/*
  		 * This counts the number of available slots in the ringbuffer,
  		 * so we avoid overflowing it: it's decremented (if positive)
  		 * when allocating a kiocb and incremented when the resulting
  		 * io_event is pulled off the ringbuffer.
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
118
119
  		 *
  		 * We batch accesses to it with a percpu version.
34e83fc61   Kent Overstreet   aio: reqs_active ...
120
121
  		 */
  		atomic_t	reqs_available;
4e23bcaeb   Kent Overstreet   aio: give shared ...
122
123
124
125
126
127
  	} ____cacheline_aligned_in_smp;
  
  	struct {
  		spinlock_t	ctx_lock;
  		struct list_head active_reqs;	/* used for cancellation */
  	} ____cacheline_aligned_in_smp;
58c85dc20   Kent Overstreet   aio: kill struct ...
128
129
  	struct {
  		struct mutex	ring_lock;
4e23bcaeb   Kent Overstreet   aio: give shared ...
130
131
  		wait_queue_head_t wait;
  	} ____cacheline_aligned_in_smp;
58c85dc20   Kent Overstreet   aio: kill struct ...
132
133
134
  
  	struct {
  		unsigned	tail;
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
135
  		unsigned	completed_events;
58c85dc20   Kent Overstreet   aio: kill struct ...
136
  		spinlock_t	completion_lock;
4e23bcaeb   Kent Overstreet   aio: give shared ...
137
  	} ____cacheline_aligned_in_smp;
58c85dc20   Kent Overstreet   aio: kill struct ...
138
139
  
  	struct page		*internal_pages[AIO_RING_PAGES];
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
140
  	struct file		*aio_ring_file;
db446a08c   Benjamin LaHaise   aio: convert the ...
141
142
  
  	unsigned		id;
4e179bca6   Kent Overstreet   aio: move private...
143
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
144
  /*------ sysctl variables----*/
d55b5fdaf   Zach Brown   [PATCH] aio: remo...
145
146
147
  /* Held around updates of aio_nr (see aio_nr_sub()). */
  static DEFINE_SPINLOCK(aio_nr_lock);
  unsigned long aio_nr;		/* current system wide number of aio requests */
  unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
148
  /*----end sysctl variables---*/
e18b890bb   Christoph Lameter   [PATCH] slab: rem...
149
150
  /* Slab caches for struct kiocb and struct kioctx, created in aio_setup(). */
  static struct kmem_cache	*kiocb_cachep;
  static struct kmem_cache	*kioctx_cachep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
151

71ad7490c   Benjamin LaHaise   rework aio migrat...
152
153
154
155
  /* Kernel-internal mount of the "aio" filesystem, created in aio_setup(). */
  static struct vfsmount *aio_mnt;
  
  /* Forward declarations; both tables are defined further down in this file. */
  static const struct file_operations aio_ring_fops;
  static const struct address_space_operations aio_ctx_aops;
835f252c6   Gu Zheng   aio: fix uncorren...
156
157
158
159
160
161
162
163
  /* Backing dev info for aio fs.
   * -no dirty page accounting or writeback happens
   *
   * aio_private_file() installs it on every ring file's mapping, and
   * aio_setup() initialises it with bdi_init().
   */
  static struct backing_dev_info aio_fs_backing_dev_info = {
  	.name           = "aiofs",
  	.state          = 0,
  	.capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
  };
71ad7490c   Benjamin LaHaise   rework aio migrat...
164
165
166
167
168
169
  /* aio_private_file
   *	Creates the "[aio]" file that backs a context's ring: an anonymous
   *	inode on the aio mount whose mapping points back at @ctx and whose
   *	size is @nr_pages pages.  Returns the file, or an ERR_PTR on failure.
   */
  static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
  {
  	struct qstr name = QSTR_INIT("[aio]", 5);
  	struct address_space *mapping;
  	struct inode *inode;
  	struct file *file;
  	struct path path;

  	inode = alloc_anon_inode(aio_mnt->mnt_sb);
  	if (IS_ERR(inode))
  		return ERR_CAST(inode);

  	mapping = inode->i_mapping;
  	mapping->a_ops = &aio_ctx_aops;
  	mapping->private_data = ctx;
  	mapping->backing_dev_info = &aio_fs_backing_dev_info;
  	inode->i_size = PAGE_SIZE * nr_pages;

  	path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &name);
  	if (!path.dentry) {
  		/* No dentry holds the inode yet, so drop it directly. */
  		iput(inode);
  		return ERR_PTR(-ENOMEM);
  	}
  	path.mnt = mntget(aio_mnt);

  	d_instantiate(path.dentry, inode);
  	file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &aio_ring_fops);
  	if (!IS_ERR(file))
  		file->f_flags = O_RDWR;
  	else
  		path_put(&path);

  	/* Either the new file or the ERR_PTR from alloc_file(). */
  	return file;
  }
  
  static struct dentry *aio_mount(struct file_system_type *fs_type,
  				int flags, const char *dev_name, void *data)
  {
  	static const struct dentry_operations ops = {
  		.d_dname	= simple_dname,
  	};
8dc4379e1   Gu Zheng   aio: use the macr...
202
  	return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
71ad7490c   Benjamin LaHaise   rework aio migrat...
203
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
204
205
206
207
208
209
  /* aio_setup
   *	Boot-time initialisation: mounts the internal aio filesystem, sets up
   *	its backing dev info and creates the kiocb/kioctx slab caches.  Any
   *	failure panics, as this runs early during the boot sequence.
   */
  static int __init aio_setup(void)
  {
  	static struct file_system_type aio_fs_type = {
  		.name		= "aio",
  		.mount		= aio_mount,
  		.kill_sb	= kill_anon_super,
  	};

  	aio_mnt = kern_mount(&aio_fs_type);
  	if (IS_ERR(aio_mnt))
  		panic("Failed to create aio fs mount.");

  	if (bdi_init(&aio_fs_backing_dev_info) != 0)
  		panic("Failed to init aio fs backing dev info.");

  	/* SLAB_PANIC: cache creation failure panics instead of returning. */
  	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
  	kioctx_cachep = KMEM_CACHE(kioctx, SLAB_HWCACHE_ALIGN | SLAB_PANIC);

  	pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));

  	return 0;
  }
  __initcall(aio_setup);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
229

5e9ae2e5d   Benjamin LaHaise   aio: fix use-afte...
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
  static void put_aio_ring_file(struct kioctx *ctx)
  {
  	struct file *aio_ring_file = ctx->aio_ring_file;
  	if (aio_ring_file) {
  		truncate_setsize(aio_ring_file->f_inode, 0);
  
  		/* Prevent further access to the kioctx from migratepages */
  		spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock);
  		aio_ring_file->f_inode->i_mapping->private_data = NULL;
  		ctx->aio_ring_file = NULL;
  		spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock);
  
  		fput(aio_ring_file);
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
245
246
  static void aio_free_ring(struct kioctx *ctx)
  {
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
247
  	int i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
248

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
249
250
251
252
  	/* Disconnect the kiotx from the ring file.  This prevents future
  	 * accesses to the kioctx from page migration.
  	 */
  	put_aio_ring_file(ctx);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
253
  	for (i = 0; i < ctx->nr_pages; i++) {
8e321fefb   Benjamin LaHaise   aio/migratepages:...
254
  		struct page *page;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
255
256
257
  		pr_debug("pid(%d) [%d] page->count=%d
  ", current->pid, i,
  				page_count(ctx->ring_pages[i]));
8e321fefb   Benjamin LaHaise   aio/migratepages:...
258
259
260
261
262
  		page = ctx->ring_pages[i];
  		if (!page)
  			continue;
  		ctx->ring_pages[i] = NULL;
  		put_page(page);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
263
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
264

ddb8c45ba   Sasha Levin   aio: nullify aio-...
265
  	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
58c85dc20   Kent Overstreet   aio: kill struct ...
266
  		kfree(ctx->ring_pages);
ddb8c45ba   Sasha Levin   aio: nullify aio-...
267
268
  		ctx->ring_pages = NULL;
  	}
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
269
270
271
272
  }
  
  static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
  {
e4a0d3e72   Pavel Emelyanov   aio: Make it poss...
273
  	vma->vm_flags |= VM_DONTEXPAND;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
274
275
276
  	vma->vm_ops = &generic_file_vm_ops;
  	return 0;
  }
e4a0d3e72   Pavel Emelyanov   aio: Make it poss...
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
  /*
   * ->mremap for the ring file: find the context in vma->vm_mm whose ring
   * file is @file and record the mapping's new start address as both its
   * mmap_base and its user_id.
   */
  static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)
  {
  	struct mm_struct *mm = vma->vm_mm;
  	struct kioctx_table *table;
  	unsigned i;

  	spin_lock(&mm->ioctx_lock);
  	rcu_read_lock();
  	table = rcu_dereference(mm->ioctx_table);

  	/*
  	 * mm->ioctx_table stays NULL until ioctx_add_table() installs the
  	 * first table, so an mm can have the ring mapped without owning a
  	 * table (presumably e.g. a forked child that inherited the mapping).
  	 * There is nothing to update in that case.
  	 */
  	if (!table)
  		goto out_unlock;

  	for (i = 0; i < table->nr; i++) {
  		struct kioctx *ctx;

  		ctx = table->table[i];
  		if (ctx && ctx->aio_ring_file == file) {
  			ctx->user_id = ctx->mmap_base = vma->vm_start;
  			break;
  		}
  	}

  out_unlock:
  	rcu_read_unlock();
  	spin_unlock(&mm->ioctx_lock);
  }
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
299
300
  static const struct file_operations aio_ring_fops = {
  	.mmap = aio_ring_mmap,
e4a0d3e72   Pavel Emelyanov   aio: Make it poss...
301
  	.mremap = aio_ring_remap,
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
302
  };
0c45355fc   Benjamin LaHaise   aio: fix build wh...
303
  #if IS_ENABLED(CONFIG_MIGRATION)
  /*
   * aio_migratepage
   *	->migratepage callback for ring pages: replaces @old with @new in the
   *	owning context's ring_pages[].  Returns MIGRATEPAGE_SUCCESS, -EINVAL
   *	when the mapping has no context or the page index is out of range,
   *	-EAGAIN when the ring is busy or the slot no longer holds @old, or the
   *	error from migrate_page_move_mapping().
   */
  static int aio_migratepage(struct address_space *mapping, struct page *new,
  			struct page *old, enum migrate_mode mode)
  {
  	struct kioctx *ctx;
  	unsigned long irqflags;
  	pgoff_t slot;
  	int ret = 0;

  	/* mapping->private_lock here protects against the kioctx teardown.  */
  	spin_lock(&mapping->private_lock);
  	ctx = mapping->private_data;
  	if (!ctx) {
  		ret = -EINVAL;
  		goto out;
  	}

  	/* The ring_lock mutex.  This prevents aio_read_events() from writing
  	 * to the ring's head, and prevents page migration from mucking in
  	 * a partially initialized kioctx.
  	 */
  	if (!mutex_trylock(&ctx->ring_lock)) {
  		ret = -EAGAIN;
  		goto out;
  	}

  	slot = old->index;
  	if (slot >= (pgoff_t)ctx->nr_pages) {
  		ret = -EINVAL;
  		goto out_unlock;
  	}

  	/* Make sure the old page hasn't already been changed */
  	if (ctx->ring_pages[slot] != old) {
  		ret = -EAGAIN;
  		goto out_unlock;
  	}

  	/* Writeback must be complete */
  	BUG_ON(PageWriteback(old));
  	get_page(new);

  	ret = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
  	if (ret != MIGRATEPAGE_SUCCESS) {
  		put_page(new);
  		goto out_unlock;
  	}

  	/* Take completion_lock to prevent other writes to the ring buffer
  	 * while the old page is copied to the new.  This prevents new
  	 * events from being lost.
  	 */
  	spin_lock_irqsave(&ctx->completion_lock, irqflags);
  	migrate_page_copy(new, old);
  	BUG_ON(ctx->ring_pages[slot] != old);
  	ctx->ring_pages[slot] = new;
  	spin_unlock_irqrestore(&ctx->completion_lock, irqflags);

  	/* The old page is no longer accessible. */
  	put_page(old);

  out_unlock:
  	mutex_unlock(&ctx->ring_lock);
  out:
  	spin_unlock(&mapping->private_lock);
  	return ret;
  }
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
369

36bc08cc0   Gu Zheng   fs/aio: Add suppo...
370
  static const struct address_space_operations aio_ctx_aops = {
835f252c6   Gu Zheng   aio: fix uncorren...
371
  	.set_page_dirty = __set_page_dirty_no_writeback,
0c45355fc   Benjamin LaHaise   aio: fix build wh...
372
  #if IS_ENABLED(CONFIG_MIGRATION)
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
373
  	.migratepage	= aio_migratepage,
0c45355fc   Benjamin LaHaise   aio: fix build wh...
374
  #endif
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
375
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
376
377
378
  static int aio_setup_ring(struct kioctx *ctx)
  {
  	struct aio_ring *ring;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
379
  	unsigned nr_events = ctx->max_reqs;
41003a7bc   Zach Brown   aio: remove retry...
380
  	struct mm_struct *mm = current->mm;
3dc9acb67   Linus Torvalds   aio: clean up and...
381
  	unsigned long size, unused;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
382
  	int nr_pages;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
383
384
  	int i;
  	struct file *file;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
385
386
387
388
389
390
  
  	/* Compensate for the ring buffer's head/tail overlap entry */
  	nr_events += 2;	/* 1 is required, 2 for good luck */
  
  	size = sizeof(struct aio_ring);
  	size += sizeof(struct io_event) * nr_events;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
391

36bc08cc0   Gu Zheng   fs/aio: Add suppo...
392
  	nr_pages = PFN_UP(size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
393
394
  	if (nr_pages < 0)
  		return -EINVAL;
71ad7490c   Benjamin LaHaise   rework aio migrat...
395
  	file = aio_private_file(ctx, nr_pages);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
396
397
  	if (IS_ERR(file)) {
  		ctx->aio_ring_file = NULL;
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
398
  		return -ENOMEM;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
399
  	}
3dc9acb67   Linus Torvalds   aio: clean up and...
400
401
402
403
404
405
406
407
408
409
410
411
412
  	ctx->aio_ring_file = file;
  	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
  			/ sizeof(struct io_event);
  
  	ctx->ring_pages = ctx->internal_pages;
  	if (nr_pages > AIO_RING_PAGES) {
  		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
  					  GFP_KERNEL);
  		if (!ctx->ring_pages) {
  			put_aio_ring_file(ctx);
  			return -ENOMEM;
  		}
  	}
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
413
414
415
416
417
418
419
420
421
422
  	for (i = 0; i < nr_pages; i++) {
  		struct page *page;
  		page = find_or_create_page(file->f_inode->i_mapping,
  					   i, GFP_HIGHUSER | __GFP_ZERO);
  		if (!page)
  			break;
  		pr_debug("pid(%d) page[%d]->count=%d
  ",
  			 current->pid, i, page_count(page));
  		SetPageUptodate(page);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
423
  		unlock_page(page);
3dc9acb67   Linus Torvalds   aio: clean up and...
424
425
  
  		ctx->ring_pages[i] = page;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
426
  	}
3dc9acb67   Linus Torvalds   aio: clean up and...
427
  	ctx->nr_pages = i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
428

3dc9acb67   Linus Torvalds   aio: clean up and...
429
430
  	if (unlikely(i != nr_pages)) {
  		aio_free_ring(ctx);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
431
  		return -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
432
  	}
58c85dc20   Kent Overstreet   aio: kill struct ...
433
434
435
  	ctx->mmap_size = nr_pages * PAGE_SIZE;
  	pr_debug("attempting mmap of %lu bytes
  ", ctx->mmap_size);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
436

41003a7bc   Zach Brown   aio: remove retry...
437
  	down_write(&mm->mmap_sem);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
438
439
  	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
  				       PROT_READ | PROT_WRITE,
3dc9acb67   Linus Torvalds   aio: clean up and...
440
441
  				       MAP_SHARED, 0, &unused);
  	up_write(&mm->mmap_sem);
58c85dc20   Kent Overstreet   aio: kill struct ...
442
  	if (IS_ERR((void *)ctx->mmap_base)) {
58c85dc20   Kent Overstreet   aio: kill struct ...
443
  		ctx->mmap_size = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
444
  		aio_free_ring(ctx);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
445
  		return -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
446
  	}
58c85dc20   Kent Overstreet   aio: kill struct ...
447
448
  	pr_debug("mmap address: 0x%08lx
  ", ctx->mmap_base);
d6c355c7d   Benjamin LaHaise   aio: fix race in ...
449

58c85dc20   Kent Overstreet   aio: kill struct ...
450
451
  	ctx->user_id = ctx->mmap_base;
  	ctx->nr_events = nr_events; /* trusted copy */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
452

58c85dc20   Kent Overstreet   aio: kill struct ...
453
  	ring = kmap_atomic(ctx->ring_pages[0]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
454
  	ring->nr = nr_events;	/* user copy */
db446a08c   Benjamin LaHaise   aio: convert the ...
455
  	ring->id = ~0U;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
456
457
458
459
460
  	ring->head = ring->tail = 0;
  	ring->magic = AIO_RING_MAGIC;
  	ring->compat_features = AIO_RING_COMPAT_FEATURES;
  	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
  	ring->header_length = sizeof(struct aio_ring);
e8e3c3d66   Cong Wang   fs: remove the se...
461
  	kunmap_atomic(ring);
58c85dc20   Kent Overstreet   aio: kill struct ...
462
  	flush_dcache_page(ctx->ring_pages[0]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
463
464
465
  
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
466
467
468
  /* Number of io_event slots that fit in one whole page. */
  #define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
  /* Slots that fit in a page once room for struct aio_ring is subtracted. */
  #define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
  /* Difference between the two: slots given up to the header. */
  #define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
0460fef2a   Kent Overstreet   aio: use cancella...
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
  /*
   * kiocb_set_cancel_fn
   *	Registers @cancel as the cancellation callback of @req and, if @req
   *	is not linked yet, adds it to its context's active_reqs list.  Both
   *	steps happen under ctx_lock with interrupts disabled.
   */
  void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
  {
  	struct kioctx *ctx = req->ki_ctx;
  	unsigned long irqflags;

  	spin_lock_irqsave(&ctx->ctx_lock, irqflags);

  	/* A NULL ->next means the request has not been put on a list yet. */
  	if (req->ki_list.next == NULL)
  		list_add(&req->ki_list, &ctx->active_reqs);
  	req->ki_cancel = cancel;

  	spin_unlock_irqrestore(&ctx->ctx_lock, irqflags);
  }
  EXPORT_SYMBOL(kiocb_set_cancel_fn);
d52a8f9ea   Fabian Frederick   fs/aio.c: Remove ...
484
  static int kiocb_cancel(struct kiocb *kiocb)
906b973cf   Kent Overstreet   aio: add kiocb_ca...
485
  {
0460fef2a   Kent Overstreet   aio: use cancella...
486
  	kiocb_cancel_fn *old, *cancel;
906b973cf   Kent Overstreet   aio: add kiocb_ca...
487

0460fef2a   Kent Overstreet   aio: use cancella...
488
489
490
491
492
493
494
495
  	/*
  	 * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
  	 * actually has a cancel function, hence the cmpxchg()
  	 */
  
  	cancel = ACCESS_ONCE(kiocb->ki_cancel);
  	do {
  		if (!cancel || cancel == KIOCB_CANCELLED)
57282d8fd   Kent Overstreet   aio: Kill ki_users
496
  			return -EINVAL;
906b973cf   Kent Overstreet   aio: add kiocb_ca...
497

0460fef2a   Kent Overstreet   aio: use cancella...
498
499
500
  		old = cancel;
  		cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
  	} while (cancel != old);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
501

57282d8fd   Kent Overstreet   aio: Kill ki_users
502
  	return cancel(kiocb);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
503
  }
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
504
  static void free_ioctx(struct work_struct *work)
36f558890   Kent Overstreet   aio: refcounting ...
505
  {
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
506
  	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
507

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
508
509
  	pr_debug("freeing %p
  ", ctx);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
510

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
511
  	aio_free_ring(ctx);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
512
  	free_percpu(ctx->cpu);
9a1049da9   Tejun Heo   percpu-refcount: ...
513
514
  	percpu_ref_exit(&ctx->reqs);
  	percpu_ref_exit(&ctx->users);
36f558890   Kent Overstreet   aio: refcounting ...
515
516
  	kmem_cache_free(kioctx_cachep, ctx);
  }
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
517
518
519
  static void free_ioctx_reqs(struct percpu_ref *ref)
  {
  	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
e02ba72aa   Anatol Pomozov   aio: block io_des...
520
521
522
  	/* At this point we know that there are no any in-flight requests */
  	if (ctx->requests_done)
  		complete(ctx->requests_done);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
523
524
525
  	INIT_WORK(&ctx->free_work, free_ioctx);
  	schedule_work(&ctx->free_work);
  }
36f558890   Kent Overstreet   aio: refcounting ...
526
527
528
529
530
  /*
   * When this function runs, the kioctx has been removed from the "hash table"
   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
   * now it's safe to cancel any that need to be.
   */
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
531
  static void free_ioctx_users(struct percpu_ref *ref)
36f558890   Kent Overstreet   aio: refcounting ...
532
  {
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
533
  	struct kioctx *ctx = container_of(ref, struct kioctx, users);
36f558890   Kent Overstreet   aio: refcounting ...
534
535
536
537
538
539
540
541
542
  	struct kiocb *req;
  
  	spin_lock_irq(&ctx->ctx_lock);
  
  	while (!list_empty(&ctx->active_reqs)) {
  		req = list_first_entry(&ctx->active_reqs,
  				       struct kiocb, ki_list);
  
  		list_del_init(&req->ki_list);
d52a8f9ea   Fabian Frederick   fs/aio.c: Remove ...
543
  		kiocb_cancel(req);
36f558890   Kent Overstreet   aio: refcounting ...
544
545
546
  	}
  
  	spin_unlock_irq(&ctx->ctx_lock);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
547
548
  	percpu_ref_kill(&ctx->reqs);
  	percpu_ref_put(&ctx->reqs);
36f558890   Kent Overstreet   aio: refcounting ...
549
  }
db446a08c   Benjamin LaHaise   aio: convert the ...
550
551
552
553
554
555
556
  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
  {
  	unsigned i, new_nr;
  	struct kioctx_table *table, *old;
  	struct aio_ring *ring;
  
  	spin_lock(&mm->ioctx_lock);
855ef0dec   Oleg Nesterov   aio: kill the mis...
557
  	table = rcu_dereference_raw(mm->ioctx_table);
db446a08c   Benjamin LaHaise   aio: convert the ...
558
559
560
561
562
563
564
565
  
  	while (1) {
  		if (table)
  			for (i = 0; i < table->nr; i++)
  				if (!table->table[i]) {
  					ctx->id = i;
  					table->table[i] = ctx;
  					spin_unlock(&mm->ioctx_lock);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
566
567
568
569
  					/* While kioctx setup is in progress,
  					 * we are protected from page migration
  					 * changes ring_pages by ->ring_lock.
  					 */
db446a08c   Benjamin LaHaise   aio: convert the ...
570
571
572
573
574
575
576
  					ring = kmap_atomic(ctx->ring_pages[0]);
  					ring->id = ctx->id;
  					kunmap_atomic(ring);
  					return 0;
  				}
  
  		new_nr = (table ? table->nr : 1) * 4;
db446a08c   Benjamin LaHaise   aio: convert the ...
577
578
579
580
581
582
583
584
585
586
  		spin_unlock(&mm->ioctx_lock);
  
  		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
  				new_nr, GFP_KERNEL);
  		if (!table)
  			return -ENOMEM;
  
  		table->nr = new_nr;
  
  		spin_lock(&mm->ioctx_lock);
855ef0dec   Oleg Nesterov   aio: kill the mis...
587
  		old = rcu_dereference_raw(mm->ioctx_table);
db446a08c   Benjamin LaHaise   aio: convert the ...
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
  
  		if (!old) {
  			rcu_assign_pointer(mm->ioctx_table, table);
  		} else if (table->nr > old->nr) {
  			memcpy(table->table, old->table,
  			       old->nr * sizeof(struct kioctx *));
  
  			rcu_assign_pointer(mm->ioctx_table, table);
  			kfree_rcu(old, rcu);
  		} else {
  			kfree(table);
  			table = old;
  		}
  	}
  }
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
603
604
605
606
607
608
609
610
611
  /*
   * Subtract @nr from the system wide request count under aio_nr_lock.
   * A subtraction that would underflow triggers a warning and clamps the
   * count to zero instead.
   */
  static void aio_nr_sub(unsigned nr)
  {
  	spin_lock(&aio_nr_lock);
  	/* For unsigned values, "aio_nr - nr wraps" is the same as nr > aio_nr. */
  	if (WARN_ON(nr > aio_nr))
  		aio_nr = 0;
  	else
  		aio_nr = aio_nr - nr;
  	spin_unlock(&aio_nr_lock);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
612
613
614
615
616
  /* ioctx_alloc
   *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
   */
  static struct kioctx *ioctx_alloc(unsigned nr_events)
  {
41003a7bc   Zach Brown   aio: remove retry...
617
  	struct mm_struct *mm = current->mm;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
618
  	struct kioctx *ctx;
e23754f88   Al Viro   aio: don't bother...
619
  	int err = -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
620

e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
621
622
623
624
625
626
627
628
629
630
631
  	/*
  	 * We keep track of the number of available ringbuffer slots, to prevent
  	 * overflow (reqs_available), and we also use percpu counters for this.
  	 *
  	 * So since up to half the slots might be on other cpu's percpu counters
  	 * and unavailable, double nr_events so userspace sees what they
  	 * expected: additionally, we move req_batch slots to/from percpu
  	 * counters at a time, so make sure that isn't 0:
  	 */
  	nr_events = max(nr_events, num_possible_cpus() * 4);
  	nr_events *= 2;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
632
633
634
635
636
637
638
  	/* Prevent overflows */
  	if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
  	    (nr_events > (0x10000000U / sizeof(struct kiocb)))) {
  		pr_debug("ENOMEM: nr_events too high
  ");
  		return ERR_PTR(-EINVAL);
  	}
4cd81c3df   Benjamin LaHaise   aio: double aio_m...
639
  	if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
640
  		return ERR_PTR(-EAGAIN);
c37622296   Robert P. J. Day   [PATCH] Transform...
641
  	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
642
643
  	if (!ctx)
  		return ERR_PTR(-ENOMEM);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
644
  	ctx->max_reqs = nr_events;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
645

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
646
  	spin_lock_init(&ctx->ctx_lock);
0460fef2a   Kent Overstreet   aio: use cancella...
647
  	spin_lock_init(&ctx->completion_lock);
58c85dc20   Kent Overstreet   aio: kill struct ...
648
  	mutex_init(&ctx->ring_lock);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
649
650
651
  	/* Protect against page migration throughout kiotx setup by keeping
  	 * the ring_lock mutex held until setup is complete. */
  	mutex_lock(&ctx->ring_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
652
653
654
  	init_waitqueue_head(&ctx->wait);
  
  	INIT_LIST_HEAD(&ctx->active_reqs);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
655

2aad2a86f   Tejun Heo   percpu_ref: add P...
656
  	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
657
  		goto err;
2aad2a86f   Tejun Heo   percpu_ref: add P...
658
  	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
659
  		goto err;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
660
661
  	ctx->cpu = alloc_percpu(struct kioctx_cpu);
  	if (!ctx->cpu)
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
662
  		goto err;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
663

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
664
665
  	err = aio_setup_ring(ctx);
  	if (err < 0)
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
666
  		goto err;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
667

34e83fc61   Kent Overstreet   aio: reqs_active ...
668
  	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
669
  	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
6878ea72a   Benjamin LaHaise   aio: be defensive...
670
671
  	if (ctx->req_batch < 1)
  		ctx->req_batch = 1;
34e83fc61   Kent Overstreet   aio: reqs_active ...
672

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
673
  	/* limit the number of system wide aios */
9fa1cb397   Al Viro   aio: aio_nr_lock ...
674
  	spin_lock(&aio_nr_lock);
4cd81c3df   Benjamin LaHaise   aio: double aio_m...
675
  	if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
2dd542b7a   Al Viro   aio: aio_nr decre...
676
  	    aio_nr + nr_events < aio_nr) {
9fa1cb397   Al Viro   aio: aio_nr_lock ...
677
  		spin_unlock(&aio_nr_lock);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
678
  		err = -EAGAIN;
d1b943271   Gu Zheng   aio: clean up aio...
679
  		goto err_ctx;
2dd542b7a   Al Viro   aio: aio_nr decre...
680
681
  	}
  	aio_nr += ctx->max_reqs;
9fa1cb397   Al Viro   aio: aio_nr_lock ...
682
  	spin_unlock(&aio_nr_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
683

1881686f8   Benjamin LaHaise   aio: fix kioctx l...
684
685
  	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
  	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */
723be6e39   Kent Overstreet   aio: percpu ioctx...
686

da90382c2   Benjamin LaHaise   aio: fix error ha...
687
688
  	err = ioctx_add_table(ctx, mm);
  	if (err)
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
689
  		goto err_cleanup;
da90382c2   Benjamin LaHaise   aio: fix error ha...
690

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
691
692
  	/* Release the ring_lock mutex now that all setup is complete. */
  	mutex_unlock(&ctx->ring_lock);
caf4167aa   Kent Overstreet   aio: dprintk() ->...
693
694
  	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x
  ",
58c85dc20   Kent Overstreet   aio: kill struct ...
695
  		 ctx, ctx->user_id, mm, ctx->nr_events);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
696
  	return ctx;
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
697
698
  err_cleanup:
  	aio_nr_sub(ctx->max_reqs);
d1b943271   Gu Zheng   aio: clean up aio...
699
700
  err_ctx:
  	aio_free_ring(ctx);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
701
  err:
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
702
  	mutex_unlock(&ctx->ring_lock);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
703
  	free_percpu(ctx->cpu);
9a1049da9   Tejun Heo   percpu-refcount: ...
704
705
  	percpu_ref_exit(&ctx->reqs);
  	percpu_ref_exit(&ctx->users);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
706
  	kmem_cache_free(kioctx_cachep, ctx);
caf4167aa   Kent Overstreet   aio: dprintk() ->...
707
708
  	pr_debug("error allocating ioctx %d
  ", err);
e23754f88   Al Viro   aio: don't bother...
709
  	return ERR_PTR(err);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
710
  }
36f558890   Kent Overstreet   aio: refcounting ...
711
712
713
714
715
  /* kill_ioctx
   *	Cancels all outstanding aio requests on an aio context.  Used
   *	when the processes owning a context have all exited to encourage
   *	the rapid destruction of the kioctx.
   */
fb2d44838   Benjamin LaHaise   aio: report error...
716
  static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
e02ba72aa   Anatol Pomozov   aio: block io_des...
717
  		struct completion *requests_done)
36f558890   Kent Overstreet   aio: refcounting ...
718
  {
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
719
  	struct kioctx_table *table;
db446a08c   Benjamin LaHaise   aio: convert the ...
720

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
721
722
  	if (atomic_xchg(&ctx->dead, 1))
  		return -EINVAL;
db446a08c   Benjamin LaHaise   aio: convert the ...
723

db446a08c   Benjamin LaHaise   aio: convert the ...
724

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
725
  	spin_lock(&mm->ioctx_lock);
855ef0dec   Oleg Nesterov   aio: kill the mis...
726
  	table = rcu_dereference_raw(mm->ioctx_table);
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
727
728
  	WARN_ON(ctx != table->table[ctx->id]);
  	table->table[ctx->id] = NULL;
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
729
  	spin_unlock(&mm->ioctx_lock);
4fcc712f5   Kent Overstreet   aio: fix io_destr...
730

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
731
732
  	/* percpu_ref_kill() will do the necessary call_rcu() */
  	wake_up_all(&ctx->wait);
4fcc712f5   Kent Overstreet   aio: fix io_destr...
733

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
734
735
736
737
738
739
740
741
  	/*
  	 * It'd be more correct to do this in free_ioctx(), after all
  	 * the outstanding kiocbs have finished - but by then io_destroy
  	 * has already returned, so io_setup() could potentially return
  	 * -EAGAIN with no ioctxs actually in use (as far as userspace
  	 *  could tell).
  	 */
  	aio_nr_sub(ctx->max_reqs);
4fcc712f5   Kent Overstreet   aio: fix io_destr...
742

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
743
744
  	if (ctx->mmap_size)
  		vm_munmap(ctx->mmap_base, ctx->mmap_size);
fb2d44838   Benjamin LaHaise   aio: report error...
745

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
746
747
748
  	ctx->requests_done = requests_done;
  	percpu_ref_kill(&ctx->users);
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
749
750
751
752
753
  }
  
  /* wait_on_sync_kiocb:
   *	Waits on the given sync kiocb to complete.
   */
57282d8fd   Kent Overstreet   aio: Kill ki_users
754
  ssize_t wait_on_sync_kiocb(struct kiocb *req)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
755
  {
57282d8fd   Kent Overstreet   aio: Kill ki_users
756
  	while (!req->ki_ctx) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
757
  		set_current_state(TASK_UNINTERRUPTIBLE);
57282d8fd   Kent Overstreet   aio: Kill ki_users
758
  		if (req->ki_ctx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
759
  			break;
41d10da37   Jeff Moyer   aio: account I/O ...
760
  		io_schedule();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
761
762
  	}
  	__set_current_state(TASK_RUNNING);
57282d8fd   Kent Overstreet   aio: Kill ki_users
763
  	return req->ki_user_data;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
764
  }
385773e04   H Hartley Sweeten   aio.c: move EXPOR...
765
  EXPORT_SYMBOL(wait_on_sync_kiocb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
766

36f558890   Kent Overstreet   aio: refcounting ...
767
768
769
770
771
772
773
  /*
   * exit_aio: called when the last user of mm goes away.  At this point, there is
   * no way for any new requests to be submited or any of the io_* syscalls to be
   * called on the context.
   *
   * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
   * them.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
774
   */
fc9b52cd8   Harvey Harrison   fs: remove fastca...
775
  void exit_aio(struct mm_struct *mm)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
776
  {
4b70ac5fd   Oleg Nesterov   aio: change exit_...
777
778
  	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
  	int i;
db446a08c   Benjamin LaHaise   aio: convert the ...
779

4b70ac5fd   Oleg Nesterov   aio: change exit_...
780
781
  	if (!table)
  		return;
db446a08c   Benjamin LaHaise   aio: convert the ...
782

4b70ac5fd   Oleg Nesterov   aio: change exit_...
783
784
  	for (i = 0; i < table->nr; ++i) {
  		struct kioctx *ctx = table->table[i];
6098b45b3   Gu Zheng   aio: block exit_a...
785
786
  		struct completion requests_done =
  			COMPLETION_INITIALIZER_ONSTACK(requests_done);
abf137dd7   Jens Axboe   aio: make the loo...
787

4b70ac5fd   Oleg Nesterov   aio: change exit_...
788
789
  		if (!ctx)
  			continue;
936af1576   Al Viro   aio: don't bother...
790
  		/*
4b70ac5fd   Oleg Nesterov   aio: change exit_...
791
792
793
794
795
  		 * We don't need to bother with munmap() here - exit_mmap(mm)
  		 * is coming and it'll unmap everything. And we simply can't,
  		 * this is not necessarily our ->mm.
  		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
  		 * that it needs to unmap the area, just set it to 0.
936af1576   Al Viro   aio: don't bother...
796
  		 */
58c85dc20   Kent Overstreet   aio: kill struct ...
797
  		ctx->mmap_size = 0;
6098b45b3   Gu Zheng   aio: block exit_a...
798
  		kill_ioctx(mm, ctx, &requests_done);
36f558890   Kent Overstreet   aio: refcounting ...
799

6098b45b3   Gu Zheng   aio: block exit_a...
800
801
  		/* Wait until all IO for the context are done. */
  		wait_for_completion(&requests_done);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
802
  	}
4b70ac5fd   Oleg Nesterov   aio: change exit_...
803
804
805
  
  	RCU_INIT_POINTER(mm->ioctx_table, NULL);
  	kfree(table);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
806
  }
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
807
808
809
  static void put_reqs_available(struct kioctx *ctx, unsigned nr)
  {
  	struct kioctx_cpu *kcpu;
263782c1c   Benjamin LaHaise   aio: protect reqs...
810
  	unsigned long flags;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
811

263782c1c   Benjamin LaHaise   aio: protect reqs...
812
  	local_irq_save(flags);
be6fb451a   Benjamin LaHaise   aio: remove no lo...
813
  	kcpu = this_cpu_ptr(ctx->cpu);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
814
  	kcpu->reqs_available += nr;
263782c1c   Benjamin LaHaise   aio: protect reqs...
815

e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
816
817
818
819
  	while (kcpu->reqs_available >= ctx->req_batch * 2) {
  		kcpu->reqs_available -= ctx->req_batch;
  		atomic_add(ctx->req_batch, &ctx->reqs_available);
  	}
263782c1c   Benjamin LaHaise   aio: protect reqs...
820
  	local_irq_restore(flags);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
821
822
823
824
825
826
  }
  
  static bool get_reqs_available(struct kioctx *ctx)
  {
  	struct kioctx_cpu *kcpu;
  	bool ret = false;
263782c1c   Benjamin LaHaise   aio: protect reqs...
827
  	unsigned long flags;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
828

263782c1c   Benjamin LaHaise   aio: protect reqs...
829
  	local_irq_save(flags);
be6fb451a   Benjamin LaHaise   aio: remove no lo...
830
  	kcpu = this_cpu_ptr(ctx->cpu);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
  	if (!kcpu->reqs_available) {
  		int old, avail = atomic_read(&ctx->reqs_available);
  
  		do {
  			if (avail < ctx->req_batch)
  				goto out;
  
  			old = avail;
  			avail = atomic_cmpxchg(&ctx->reqs_available,
  					       avail, avail - ctx->req_batch);
  		} while (avail != old);
  
  		kcpu->reqs_available += ctx->req_batch;
  	}
  
  	ret = true;
  	kcpu->reqs_available--;
  out:
263782c1c   Benjamin LaHaise   aio: protect reqs...
849
  	local_irq_restore(flags);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
850
851
  	return ret;
  }
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
  /* refill_reqs_available
   *	Updates the reqs_available reference counts used for tracking the
   *	number of free slots in the completion ring.  This can be called
   *	from aio_complete() (to optimistically update reqs_available) or
   *	from aio_get_req() (when we have run out of events).  It must be
   *	called holding ctx->completion_lock.
   *
   *	head and tail are ring indices; the number of events still sitting
   *	in the ring is subtracted from ctx->completed_events and only the
   *	remainder is handed back through put_reqs_available().
   */
  static void refill_reqs_available(struct kioctx *ctx, unsigned head,
                                    unsigned tail)
  {
  	unsigned in_ring, reclaim;

  	/* Clamp head since userland can write to it. */
  	head %= ctx->nr_events;
  	in_ring = (head <= tail) ? tail - head
  				 : ctx->nr_events - (head - tail);

  	/* Only completions that have already left the ring free a slot. */
  	reclaim = ctx->completed_events;
  	if (reclaim <= in_ring)
  		return;
  	reclaim -= in_ring;

  	ctx->completed_events -= reclaim;
  	put_reqs_available(ctx, reclaim);
  }
  
  /* user_refill_reqs_available
   *	Called to refill reqs_available when aio_get_req() finds no free
   *	space in the completion ring.  Does nothing beyond taking and
   *	dropping ctx->completion_lock when ctx->completed_events is zero.
   */
  static void user_refill_reqs_available(struct kioctx *ctx)
  {
  	struct aio_ring *ring;
  	unsigned head;

  	spin_lock_irq(&ctx->completion_lock);
  	if (!ctx->completed_events)
  		goto unlock;

  	/* Reading ring->head here may race with aio_read_events_ring(),
  	 * but that is fine: whether we see the old value or the new one,
  	 * either is valid.  What matters is that head cannot pass tail,
  	 * because holding ctx->completion_lock keeps aio_complete() from
  	 * updating tail.  Even if head is invalid, the check against
  	 * ctx->completed_events in refill_reqs_available() makes sure we
  	 * do the safe/right thing.
  	 */
  	ring = kmap_atomic(ctx->ring_pages[0]);
  	head = ring->head;
  	kunmap_atomic(ring);

  	refill_reqs_available(ctx, head, ctx->tail);

  unlock:
  	spin_unlock_irq(&ctx->completion_lock);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
913
  /* aio_get_req
57282d8fd   Kent Overstreet   aio: Kill ki_users
914
915
   *	Allocate a slot for an aio request.
   * Returns NULL if no requests are free.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
916
   */
a1c8eae75   Kent Overstreet   aio: kill batch a...
917
  static inline struct kiocb *aio_get_req(struct kioctx *ctx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
918
  {
a1c8eae75   Kent Overstreet   aio: kill batch a...
919
  	struct kiocb *req;
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
920
921
922
923
924
  	if (!get_reqs_available(ctx)) {
  		user_refill_reqs_available(ctx);
  		if (!get_reqs_available(ctx))
  			return NULL;
  	}
a1c8eae75   Kent Overstreet   aio: kill batch a...
925

0460fef2a   Kent Overstreet   aio: use cancella...
926
  	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
927
  	if (unlikely(!req))
a1c8eae75   Kent Overstreet   aio: kill batch a...
928
  		goto out_put;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
929

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
930
  	percpu_ref_get(&ctx->reqs);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
931
  	req->ki_ctx = ctx;
080d676de   Jeff Moyer   aio: allocate kio...
932
  	return req;
a1c8eae75   Kent Overstreet   aio: kill batch a...
933
  out_put:
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
934
  	put_reqs_available(ctx, 1);
a1c8eae75   Kent Overstreet   aio: kill batch a...
935
  	return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
936
  }
11599ebac   Kent Overstreet   aio: make aio_put...
937
  static void kiocb_free(struct kiocb *req)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
938
  {
1d98ebfcc   Kent Overstreet   aio: do fget() af...
939
940
  	if (req->ki_filp)
  		fput(req->ki_filp);
133890103   Davide Libenzi   eventfd: revised ...
941
942
  	if (req->ki_eventfd != NULL)
  		eventfd_ctx_put(req->ki_eventfd);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
943
  	kmem_cache_free(kiocb_cachep, req);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
944
  }
d5470b596   Adrian Bunk   fs/aio.c: make 3 ...
945
  static struct kioctx *lookup_ioctx(unsigned long ctx_id)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
946
  {
db446a08c   Benjamin LaHaise   aio: convert the ...
947
  	struct aio_ring __user *ring  = (void __user *)ctx_id;
abf137dd7   Jens Axboe   aio: make the loo...
948
  	struct mm_struct *mm = current->mm;
65c24491b   Jeff Moyer   aio: lookup_ioctx...
949
  	struct kioctx *ctx, *ret = NULL;
db446a08c   Benjamin LaHaise   aio: convert the ...
950
951
952
953
954
  	struct kioctx_table *table;
  	unsigned id;
  
  	if (get_user(id, &ring->id))
  		return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
955

abf137dd7   Jens Axboe   aio: make the loo...
956
  	rcu_read_lock();
db446a08c   Benjamin LaHaise   aio: convert the ...
957
  	table = rcu_dereference(mm->ioctx_table);
abf137dd7   Jens Axboe   aio: make the loo...
958

db446a08c   Benjamin LaHaise   aio: convert the ...
959
960
  	if (!table || id >= table->nr)
  		goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
961

db446a08c   Benjamin LaHaise   aio: convert the ...
962
  	ctx = table->table[id];
f30d704fe   Benjamin LaHaise   aio: table lookup...
963
  	if (ctx && ctx->user_id == ctx_id) {
db446a08c   Benjamin LaHaise   aio: convert the ...
964
965
966
967
  		percpu_ref_get(&ctx->users);
  		ret = ctx;
  	}
  out:
abf137dd7   Jens Axboe   aio: make the loo...
968
  	rcu_read_unlock();
65c24491b   Jeff Moyer   aio: lookup_ioctx...
969
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
970
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
971
972
  /* aio_complete
   *	Called when the io request on the given iocb is complete.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
973
   */
2d68449e8   Kent Overstreet   aio: kill return ...
974
  void aio_complete(struct kiocb *iocb, long res, long res2)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
975
976
  {
  	struct kioctx	*ctx = iocb->ki_ctx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
977
  	struct aio_ring	*ring;
21b40200c   Kent Overstreet   aio: use flush_dc...
978
  	struct io_event	*ev_page, *event;
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
979
  	unsigned tail, pos, head;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
980
  	unsigned long	flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
981

20dcae324   Zach Brown   [PATCH] aio: remo...
982
983
984
985
986
987
  	/*
  	 * Special case handling for sync iocbs:
  	 *  - events go directly into the iocb for fast handling
  	 *  - the sync task with the iocb in its stack holds the single iocb
  	 *    ref, no other paths have a way to get another ref
  	 *  - the sync task helpfully left a reference to itself in the iocb
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
988
989
  	 */
  	if (is_sync_kiocb(iocb)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
990
  		iocb->ki_user_data = res;
57282d8fd   Kent Overstreet   aio: Kill ki_users
991
992
  		smp_wmb();
  		iocb->ki_ctx = ERR_PTR(-EXDEV);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
993
  		wake_up_process(iocb->ki_obj.tsk);
2d68449e8   Kent Overstreet   aio: kill return ...
994
  		return;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
995
  	}
0460fef2a   Kent Overstreet   aio: use cancella...
996
997
998
999
1000
1001
1002
  	if (iocb->ki_list.next) {
  		unsigned long flags;
  
  		spin_lock_irqsave(&ctx->ctx_lock, flags);
  		list_del(&iocb->ki_list);
  		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
  	}
11599ebac   Kent Overstreet   aio: make aio_put...
1003

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1004
  	/*
0460fef2a   Kent Overstreet   aio: use cancella...
1005
  	 * Add a completion event to the ring buffer. Must be done holding
4b30f07e7   Tang Chen   aio: fix wrong co...
1006
  	 * ctx->completion_lock to prevent other code from messing with the tail
0460fef2a   Kent Overstreet   aio: use cancella...
1007
1008
1009
  	 * pointer since we might be called from irq context.
  	 */
  	spin_lock_irqsave(&ctx->completion_lock, flags);
58c85dc20   Kent Overstreet   aio: kill struct ...
1010
  	tail = ctx->tail;
21b40200c   Kent Overstreet   aio: use flush_dc...
1011
  	pos = tail + AIO_EVENTS_OFFSET;
58c85dc20   Kent Overstreet   aio: kill struct ...
1012
  	if (++tail >= ctx->nr_events)
4bf69b2a0   Kenneth W Chen   [PATCH] aio: ring...
1013
  		tail = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1014

58c85dc20   Kent Overstreet   aio: kill struct ...
1015
  	ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
21b40200c   Kent Overstreet   aio: use flush_dc...
1016
  	event = ev_page + pos % AIO_EVENTS_PER_PAGE;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1017
1018
1019
1020
  	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
  	event->data = iocb->ki_user_data;
  	event->res = res;
  	event->res2 = res2;
21b40200c   Kent Overstreet   aio: use flush_dc...
1021
  	kunmap_atomic(ev_page);
58c85dc20   Kent Overstreet   aio: kill struct ...
1022
  	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
21b40200c   Kent Overstreet   aio: use flush_dc...
1023
1024
1025
  
  	pr_debug("%p[%u]: %p: %p %Lx %lx %lx
  ",
caf4167aa   Kent Overstreet   aio: dprintk() ->...
1026
1027
  		 ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
  		 res, res2);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1028
1029
1030
1031
1032
  
  	/* after flagging the request as done, we
  	 * must never even look at it again
  	 */
  	smp_wmb();	/* make event visible before updating tail */
58c85dc20   Kent Overstreet   aio: kill struct ...
1033
  	ctx->tail = tail;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1034

58c85dc20   Kent Overstreet   aio: kill struct ...
1035
  	ring = kmap_atomic(ctx->ring_pages[0]);
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
1036
  	head = ring->head;
21b40200c   Kent Overstreet   aio: use flush_dc...
1037
  	ring->tail = tail;
e8e3c3d66   Cong Wang   fs: remove the se...
1038
  	kunmap_atomic(ring);
58c85dc20   Kent Overstreet   aio: kill struct ...
1039
  	flush_dcache_page(ctx->ring_pages[0]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1040

d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
1041
1042
1043
  	ctx->completed_events++;
  	if (ctx->completed_events > 1)
  		refill_reqs_available(ctx, head, tail);
0460fef2a   Kent Overstreet   aio: use cancella...
1044
  	spin_unlock_irqrestore(&ctx->completion_lock, flags);
21b40200c   Kent Overstreet   aio: use flush_dc...
1045
1046
  	pr_debug("added to ring %p at [%u]
  ", iocb, tail);
8d1c98b0b   Davide Libenzi   eventfd/kaio inte...
1047
1048
1049
1050
1051
1052
  
  	/*
  	 * Check if the user asked us to deliver the result through an
  	 * eventfd. The eventfd_signal() function is safe to be called
  	 * from IRQ context.
  	 */
87c3a86e1   Davide Libenzi   eventfd: remove f...
1053
  	if (iocb->ki_eventfd != NULL)
8d1c98b0b   Davide Libenzi   eventfd/kaio inte...
1054
  		eventfd_signal(iocb->ki_eventfd, 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1055
  	/* everything turned out well, dispose of the aiocb. */
57282d8fd   Kent Overstreet   aio: Kill ki_users
1056
  	kiocb_free(iocb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1057

6cb2a2104   Quentin Barnes   aio: bad AIO race...
1058
1059
1060
1061
1062
1063
1064
  	/*
  	 * We have to order our ring_info tail store above and test
  	 * of the wait list below outside the wait lock.  This is
  	 * like in wake_up_bit() where clearing a bit has to be
  	 * ordered with the unlocked test.
  	 */
  	smp_mb();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1065
1066
  	if (waitqueue_active(&ctx->wait))
  		wake_up(&ctx->wait);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
1067
  	percpu_ref_put(&ctx->reqs);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1068
  }
385773e04   H Hartley Sweeten   aio.c: move EXPOR...
1069
  EXPORT_SYMBOL(aio_complete);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1070

2be4e7dee   Gu Zheng   aio: fix some com...
1071
  /* aio_read_events_ring
a31ad380b   Kent Overstreet   aio: make aio_rea...
1072
1073
   *	Pull an event off of the ioctx's event ring.  Returns the number of
   *	events fetched
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1074
   */
a31ad380b   Kent Overstreet   aio: make aio_rea...
1075
1076
  static long aio_read_events_ring(struct kioctx *ctx,
  				 struct io_event __user *event, long nr)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1077
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1078
  	struct aio_ring *ring;
5ffac122d   Kent Overstreet   aio: Don't use ct...
1079
  	unsigned head, tail, pos;
a31ad380b   Kent Overstreet   aio: make aio_rea...
1080
1081
  	long ret = 0;
  	int copy_ret;
58c85dc20   Kent Overstreet   aio: kill struct ...
1082
  	mutex_lock(&ctx->ring_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1083

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
1084
  	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
58c85dc20   Kent Overstreet   aio: kill struct ...
1085
  	ring = kmap_atomic(ctx->ring_pages[0]);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1086
  	head = ring->head;
5ffac122d   Kent Overstreet   aio: Don't use ct...
1087
  	tail = ring->tail;
a31ad380b   Kent Overstreet   aio: make aio_rea...
1088
  	kunmap_atomic(ring);
2ff396be6   Jeff Moyer   aio: add missing ...
1089
1090
1091
1092
1093
  	/*
  	 * Ensure that once we've read the current tail pointer, that
  	 * we also see the events that were stored up to the tail.
  	 */
  	smp_rmb();
5ffac122d   Kent Overstreet   aio: Don't use ct...
1094
1095
  	pr_debug("h%u t%u m%u
  ", head, tail, ctx->nr_events);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1096

5ffac122d   Kent Overstreet   aio: Don't use ct...
1097
  	if (head == tail)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1098
  		goto out;
edfbbf388   Benjamin LaHaise   aio: fix kernel m...
1099
1100
  	head %= ctx->nr_events;
  	tail %= ctx->nr_events;
a31ad380b   Kent Overstreet   aio: make aio_rea...
1101
1102
1103
1104
  	while (ret < nr) {
  		long avail;
  		struct io_event *ev;
  		struct page *page;
5ffac122d   Kent Overstreet   aio: Don't use ct...
1105
1106
  		avail = (head <= tail ?  tail : ctx->nr_events) - head;
  		if (head == tail)
a31ad380b   Kent Overstreet   aio: make aio_rea...
1107
1108
1109
1110
1111
1112
1113
  			break;
  
  		avail = min(avail, nr - ret);
  		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
  			    ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
  
  		pos = head + AIO_EVENTS_OFFSET;
58c85dc20   Kent Overstreet   aio: kill struct ...
1114
  		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
a31ad380b   Kent Overstreet   aio: make aio_rea...
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
  		pos %= AIO_EVENTS_PER_PAGE;
  
  		ev = kmap(page);
  		copy_ret = copy_to_user(event + ret, ev + pos,
  					sizeof(*ev) * avail);
  		kunmap(page);
  
  		if (unlikely(copy_ret)) {
  			ret = -EFAULT;
  			goto out;
  		}
  
  		ret += avail;
  		head += avail;
58c85dc20   Kent Overstreet   aio: kill struct ...
1129
  		head %= ctx->nr_events;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1130
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1131

58c85dc20   Kent Overstreet   aio: kill struct ...
1132
  	ring = kmap_atomic(ctx->ring_pages[0]);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1133
  	ring->head = head;
91d80a84b   Zhao Hongjiang   aio: fix possible...
1134
  	kunmap_atomic(ring);
58c85dc20   Kent Overstreet   aio: kill struct ...
1135
  	flush_dcache_page(ctx->ring_pages[0]);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1136

5ffac122d   Kent Overstreet   aio: Don't use ct...
1137
1138
  	pr_debug("%li  h%u t%u
  ", ret, head, tail);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1139
  out:
58c85dc20   Kent Overstreet   aio: kill struct ...
1140
  	mutex_unlock(&ctx->ring_lock);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1141

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1142
1143
  	return ret;
  }
a31ad380b   Kent Overstreet   aio: make aio_rea...
1144
1145
  /* aio_read_events
   *	One attempt at reading events: fetches into event[*i..nr) and adds
   *	the number fetched to *i.  If the context is dead the result is
   *	forced to -EINVAL.  When *i is still zero it is overwritten with
   *	the (possibly negative) result.
   *	Returns true when an error occurred or at least min_nr events have
   *	been collected in *i.
   */
  static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
  			    struct io_event __user *event, long *i)
  {
  	long got = aio_read_events_ring(ctx, event + *i, nr - *i);

  	if (got > 0)
  		*i += got;

  	if (unlikely(atomic_read(&ctx->dead)))
  		got = -EINVAL;

  	/* nothing collected so far: report the status through *i */
  	if (*i == 0)
  		*i = got;

  	if (got < 0)
  		return true;
  	return *i >= min_nr;
  }
a31ad380b   Kent Overstreet   aio: make aio_rea...
1160
  /* read_events
   *	Collects between min_nr and nr events from ctx into the user
   *	buffer event.  A NULL timeout leaves the deadline at KTIME_MAX; a
   *	timeout that converts to zero results in a single attempt without
   *	sleeping.
   *	Returns the value left by aio_read_events(), -EFAULT if timeout
   *	cannot be copied from userspace, or -EINTR if nothing was read and
   *	a signal is pending.
   */
  static long read_events(struct kioctx *ctx, long min_nr, long nr,
  			struct io_event __user *event,
  			struct timespec __user *timeout)
  {
  	ktime_t deadline = { .tv64 = KTIME_MAX };
  	long nread = 0;

  	if (timeout) {
  		struct timespec	user_ts;

  		if (unlikely(copy_from_user(&user_ts, timeout, sizeof(user_ts))))
  			return -EFAULT;

  		deadline = timespec_to_ktime(user_ts);
  	}

  	/*
  	 * Note that aio_read_events() is used as the wait condition - i.e.
  	 * it runs after prepare_to_wait() has set the task state to
  	 * TASK_INTERRUPTIBLE.
  	 *
  	 * But aio_read_events() can block, and if it blocks it's going to flip
  	 * the task state back to TASK_RUNNING.
  	 *
  	 * This should be ok, provided it doesn't flip the state back to
  	 * TASK_RUNNING and return 0 too much - that causes us to spin. That
  	 * will only happen if the mutex_lock() call blocks, and we then find
  	 * the ringbuffer empty. So in practice we should be ok, but it's
  	 * something to be aware of when touching this code.
  	 */
  	if (deadline.tv64 != 0)
  		wait_event_interruptible_hrtimeout(ctx->wait,
  				aio_read_events(ctx, min_nr, nr, event, &nread),
  				deadline);
  	else
  		aio_read_events(ctx, min_nr, nr, event, &nread);

  	if (nread == 0 && signal_pending(current))
  		nread = -EINTR;

  	return nread;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
  /* sys_io_setup:
   *	Create an aio_context capable of receiving at least nr_events.
   *	ctxp must not point to an aio_context that already exists, and
   *	must be initialized to 0 prior to the call.  On successful
   *	creation of the aio_context, *ctxp is filled in with the resulting 
   *	handle.  May fail with -EINVAL if *ctxp is not initialized,
   *	if the specified nr_events exceeds internal limits.  May fail 
   *	with -EAGAIN if the specified nr_events exceeds the user's limit 
   *	of available events.  May fail with -ENOMEM if insufficient kernel
   *	resources are available.  May fail with -EFAULT if an invalid
   *	pointer is passed for ctxp.  Will fail with -ENOSYS if not
   *	implemented.
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1214
  SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
  {
  	struct kioctx *ioctx = NULL;
  	unsigned long ctx;
  	long ret;
  
  	ret = get_user(ctx, ctxp);
  	if (unlikely(ret))
  		goto out;
  
  	ret = -EINVAL;
d55b5fdaf   Zach Brown   [PATCH] aio: remo...
1225
1226
1227
1228
  	if (unlikely(ctx || nr_events == 0)) {
  		pr_debug("EINVAL: io_setup: ctx %lu nr_events %u
  ",
  		         ctx, nr_events);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1229
1230
1231
1232
1233
1234
1235
  		goto out;
  	}
  
  	ioctx = ioctx_alloc(nr_events);
  	ret = PTR_ERR(ioctx);
  	if (!IS_ERR(ioctx)) {
  		ret = put_user(ioctx->user_id, ctxp);
a2e1859ad   Al Viro   aio: take final p...
1236
  		if (ret)
e02ba72aa   Anatol Pomozov   aio: block io_des...
1237
  			kill_ioctx(current->mm, ioctx, NULL);
723be6e39   Kent Overstreet   aio: percpu ioctx...
1238
  		percpu_ref_put(&ioctx->users);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1239
1240
1241
1242
1243
1244
1245
1246
1247
  	}
  
  out:
  	return ret;
  }
  
  /* sys_io_destroy:
   *	Destroy the aio_context specified.  May cancel any outstanding 
   *	AIOs and block on completion.  Will fail with -ENOSYS if not
642b5123a   Satoru Takeuchi   aio: fix wrong su...
1248
   *	implemented.  May fail with -EINVAL if the context pointed to
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1249
1250
   *	is invalid.
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1251
  SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1252
1253
1254
  {
  	struct kioctx *ioctx = lookup_ioctx(ctx);
  	if (likely(NULL != ioctx)) {
e02ba72aa   Anatol Pomozov   aio: block io_des...
1255
1256
  		struct completion requests_done =
  			COMPLETION_INITIALIZER_ONSTACK(requests_done);
fb2d44838   Benjamin LaHaise   aio: report error...
1257
  		int ret;
e02ba72aa   Anatol Pomozov   aio: block io_des...
1258
1259
1260
1261
1262
  
  		/* Pass requests_done to kill_ioctx() where it can be set
  		 * in a thread-safe way. If we try to set it here then we have
  		 * a race condition if two io_destroy() called simultaneously.
  		 */
fb2d44838   Benjamin LaHaise   aio: report error...
1263
  		ret = kill_ioctx(current->mm, ioctx, &requests_done);
723be6e39   Kent Overstreet   aio: percpu ioctx...
1264
  		percpu_ref_put(&ioctx->users);
e02ba72aa   Anatol Pomozov   aio: block io_des...
1265
1266
1267
1268
1269
  
  		/* Wait until all IO for the context are done. Otherwise kernel
  		 * keep using user-space buffers even if user thinks the context
  		 * is destroyed.
  		 */
fb2d44838   Benjamin LaHaise   aio: report error...
1270
1271
  		if (!ret)
  			wait_for_completion(&requests_done);
e02ba72aa   Anatol Pomozov   aio: block io_des...
1272

fb2d44838   Benjamin LaHaise   aio: report error...
1273
  		return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1274
1275
1276
1277
1278
  	}
  	pr_debug("EINVAL: io_destroy: invalid context id
  ");
  	return -EINVAL;
  }
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1279
1280
  /*
   * Signature of the iovec-based f_op->aio_read / f_op->aio_write methods.
   * aio_run_iocb() calls them with (kiocb, iovec array, nr_segs, position).
   */
  typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
  			    unsigned long, loff_t);
293bc9822   Al Viro   new methods: ->re...
1281
  /*
   * Signature of the iov_iter-based f_op->read_iter / f_op->write_iter
   * methods; aio_run_iocb() uses these in preference to the aio_read /
   * aio_write methods whenever the file provides one.
   */
  typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1282

8bc92afcf   Kent Overstreet   aio: Kill unneede...
1283
1284
1285
1286
1287
  /*
   * aio_setup_vectored_rw:
   *	Prepares a vectored request.  On entry kiocb->ki_nbytes holds the
   *	number of segments in the user iovec array at buf; that count is
   *	stored in *nr_segs and the array is imported with
   *	rw_copy_check_uvector(), or with the compat variant when compat is
   *	set and CONFIG_COMPAT is enabled.  *iovec is offered to the helper
   *	as a UIO_FASTIOV-entry buffer and iovec is passed as its output
   *	pointer, so the caller must compare *iovec afterwards.
   *
   *	Returns 0 after storing the helper's non-negative result in
   *	kiocb->ki_nbytes, or the helper's negative error code.
   */
  static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
  				     int rw, char __user *buf,
  				     unsigned long *nr_segs,
  				     struct iovec **iovec,
  				     bool compat)
  {
  	ssize_t total_bytes;

  	/* Until the import succeeds, ki_nbytes is a segment count. */
  	*nr_segs = kiocb->ki_nbytes;

  #ifdef CONFIG_COMPAT
  	if (compat) {
  		total_bytes = compat_rw_copy_check_uvector(rw,
  				(struct compat_iovec __user *)buf,
  				*nr_segs, UIO_FASTIOV, *iovec, iovec);
  	} else
  #endif
  	{
  		total_bytes = rw_copy_check_uvector(rw,
  				(struct iovec __user *)buf,
  				*nr_segs, UIO_FASTIOV, *iovec, iovec);
  	}

  	if (total_bytes >= 0) {
  		/* ki_nbytes now reflects bytes instead of segments. */
  		kiocb->ki_nbytes = total_bytes;
  		return 0;
  	}

  	return total_bytes;
  }
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1309
1310
1311
1312
  /*
   * aio_setup_single_vector:
   *	Prepares a non-vectored request by describing the user buffer
   *	buf / kiocb->ki_nbytes as a one-element iovec.
   *
   *	Returns 0 with *iovec filled in and *nr_segs set to 1, or -EFAULT
   *	if access_ok() rejects the buffer.
   */
  static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
  				       int rw, char __user *buf,
  				       unsigned long *nr_segs,
  				       struct iovec *iovec)
  {
  	/* The access_ok() type is the logical inverse of the rw direction. */
  	const int verify_type = !rw;

  	if (likely(access_ok(verify_type, buf, kiocb->ki_nbytes))) {
  		*nr_segs = 1;
  		iovec->iov_len = kiocb->ki_nbytes;
  		iovec->iov_base = buf;
  		return 0;
  	}

  	return -EFAULT;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1322
  /*
2be4e7dee   Gu Zheng   aio: fix some com...
1323
1324
   * aio_run_iocb:
   *	Performs the initial checks and io submission.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1325
   */
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1326
1327
  static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
  			    char __user *buf, bool compat)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1328
  {
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1329
1330
  	struct file *file = req->ki_filp;
  	ssize_t ret;
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1331
  	unsigned long nr_segs;
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1332
1333
1334
  	int rw;
  	fmode_t mode;
  	aio_rw_op *rw_op;
293bc9822   Al Viro   new methods: ->re...
1335
  	rw_iter_op *iter_op;
00fefb9cf   Gu Zheng   aio: use iovec ar...
1336
  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
293bc9822   Al Viro   new methods: ->re...
1337
  	struct iov_iter iter;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1338

8bc92afcf   Kent Overstreet   aio: Kill unneede...
1339
  	switch (opcode) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1340
  	case IOCB_CMD_PREAD:
eed4e51fb   Badari Pulavarty   [PATCH] Add vecto...
1341
  	case IOCB_CMD_PREADV:
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1342
1343
1344
  		mode	= FMODE_READ;
  		rw	= READ;
  		rw_op	= file->f_op->aio_read;
293bc9822   Al Viro   new methods: ->re...
1345
  		iter_op	= file->f_op->read_iter;
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1346
1347
1348
  		goto rw_common;
  
  	case IOCB_CMD_PWRITE:
eed4e51fb   Badari Pulavarty   [PATCH] Add vecto...
1349
  	case IOCB_CMD_PWRITEV:
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1350
1351
1352
  		mode	= FMODE_WRITE;
  		rw	= WRITE;
  		rw_op	= file->f_op->aio_write;
293bc9822   Al Viro   new methods: ->re...
1353
  		iter_op	= file->f_op->write_iter;
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1354
1355
1356
1357
  		goto rw_common;
  rw_common:
  		if (unlikely(!(file->f_mode & mode)))
  			return -EBADF;
293bc9822   Al Viro   new methods: ->re...
1358
  		if (!rw_op && !iter_op)
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1359
  			return -EINVAL;
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1360
1361
1362
1363
1364
1365
  		ret = (opcode == IOCB_CMD_PREADV ||
  		       opcode == IOCB_CMD_PWRITEV)
  			? aio_setup_vectored_rw(req, rw, buf, &nr_segs,
  						&iovec, compat)
  			: aio_setup_single_vector(req, rw, buf, &nr_segs,
  						  iovec);
754320d6e   Leon Yu   aio: fix potentia...
1366
1367
  		if (!ret)
  			ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1368
  		if (ret < 0) {
00fefb9cf   Gu Zheng   aio: use iovec ar...
1369
  			if (iovec != inline_vecs)
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1370
  				kfree(iovec);
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1371
  			return ret;
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1372
  		}
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1373
1374
  
  		req->ki_nbytes = ret;
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1375

73a7075e3   Kent Overstreet   aio: Kill aio_rw_...
1376
1377
1378
1379
1380
1381
1382
1383
1384
  		/* XXX: move/kill - rw_verify_area()? */
  		/* This matches the pread()/pwrite() logic */
  		if (req->ki_pos < 0) {
  			ret = -EINVAL;
  			break;
  		}
  
  		if (rw == WRITE)
  			file_start_write(file);
293bc9822   Al Viro   new methods: ->re...
1385
1386
1387
1388
1389
1390
  		if (iter_op) {
  			iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);
  			ret = iter_op(req, &iter);
  		} else {
  			ret = rw_op(req, iovec, nr_segs, req->ki_pos);
  		}
73a7075e3   Kent Overstreet   aio: Kill aio_rw_...
1391
1392
1393
  
  		if (rw == WRITE)
  			file_end_write(file);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1394
  		break;
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1395

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1396
  	case IOCB_CMD_FDSYNC:
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1397
1398
1399
1400
  		if (!file->f_op->aio_fsync)
  			return -EINVAL;
  
  		ret = file->f_op->aio_fsync(req, 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1401
  		break;
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1402

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1403
  	case IOCB_CMD_FSYNC:
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1404
1405
1406
1407
  		if (!file->f_op->aio_fsync)
  			return -EINVAL;
  
  		ret = file->f_op->aio_fsync(req, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1408
  		break;
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1409

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1410
  	default:
caf4167aa   Kent Overstreet   aio: dprintk() ->...
1411
1412
  		pr_debug("EINVAL: no operation provided
  ");
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1413
  		return -EINVAL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1414
  	}
00fefb9cf   Gu Zheng   aio: use iovec ar...
1415
  	if (iovec != inline_vecs)
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1416
  		kfree(iovec);
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
  	if (ret != -EIOCBQUEUED) {
  		/*
  		 * There's no easy way to restart the syscall since other AIO's
  		 * may be already running. Just fail this IO with EINTR.
  		 */
  		if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
  			     ret == -ERESTARTNOHAND ||
  			     ret == -ERESTART_RESTARTBLOCK))
  			ret = -EINTR;
  		aio_complete(req, ret, 0);
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1428
1429
1430
  
  	return 0;
  }
d5470b596   Adrian Bunk   fs/aio.c: make 3 ...
1431
  /*
   * io_submit_one:
   *	Validates and submits a single iocb.  iocb is the kernel copy of the
   *	user structure at user_iocb; compat is forwarded to aio_run_iocb().
   *
   *	Returns 0 once aio_run_iocb() has accepted the request.  Returns
   *	-EINVAL if a reserved field is set or aio_buf / aio_nbytes fail the
   *	overflow checks, -EAGAIN if aio_get_req() yields no request, -EBADF
   *	if aio_fildes is not an open file, or the error from
   *	eventfd_ctx_fdget(), put_user() or aio_run_iocb().
   */
  static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
  			 struct iocb *iocb, bool compat)
  {
  	struct kiocb *req;
  	ssize_t ret;

  	/* enforce forwards compatibility on users */
  	if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
  		pr_debug("EINVAL: reserve field set\n");
  		return -EINVAL;
  	}

  	/*
  	 * prevent overflows: aio_buf and aio_nbytes must survive the
  	 * conversion to unsigned long / size_t unchanged, and aio_nbytes
  	 * must not be negative when viewed as ssize_t.
  	 */
  	if (unlikely(
  	    (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
  	    (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
  	    ((ssize_t)iocb->aio_nbytes < 0)
  	   )) {
  		pr_debug("EINVAL: io_submit: overflow check\n");
  		return -EINVAL;
  	}

  	req = aio_get_req(ctx);
  	if (unlikely(!req))
  		return -EAGAIN;

  	req->ki_filp = fget(iocb->aio_fildes);
  	if (unlikely(!req->ki_filp)) {
  		ret = -EBADF;
  		goto out_put_req;
  	}

  	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
  		/*
  		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
  		 * instance of the file* now. The file descriptor must be
  		 * an eventfd() fd, and will be signaled for each completed
  		 * event using the eventfd_signal() function.
  		 */
  		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
  		if (IS_ERR(req->ki_eventfd)) {
  			ret = PTR_ERR(req->ki_eventfd);
  			/* Do not leave an ERR_PTR behind for kiocb_free(). */
  			req->ki_eventfd = NULL;
  			goto out_put_req;
  		}
  	}

  	/* Stamp the user iocb with the key that lookup_kiocb() checks. */
  	ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
  	if (unlikely(ret)) {
  		pr_debug("EFAULT: aio_key\n");
  		goto out_put_req;
  	}

  	req->ki_obj.user = user_iocb;
  	req->ki_user_data = iocb->aio_data;
  	req->ki_pos = iocb->aio_offset;
  	req->ki_nbytes = iocb->aio_nbytes;

  	ret = aio_run_iocb(req, iocb->aio_lio_opcode,
  			   (char __user *)(unsigned long)iocb->aio_buf,
  			   compat);
  	if (ret)
  		goto out_put_req;

  	return 0;
  out_put_req:
  	/*
  	 * Undo aio_get_req(): give the request slot back, drop the
  	 * ctx->reqs reference and free the kiocb.  kiocb_free() is defined
  	 * outside this view - presumably it also releases ki_filp and
  	 * ki_eventfd; confirm there.
  	 */
  	put_reqs_available(ctx, 1);
  	percpu_ref_put(&ctx->reqs);
  	kiocb_free(req);
  	return ret;
  }
9d85cba71   Jeff Moyer   aio: fix the comp...
1503
1504
  /*
   * do_io_submit:
   *	Common implementation behind io_submit().  Walks the user array
   *	iocbpp of nr user iocb pointers, copies each iocb into the kernel
   *	and hands it to io_submit_one(); the loop stops at the first
   *	failure.  compat is passed through to io_submit_one().
   *
   *	Returns the number of iocbs submitted when at least one was;
   *	otherwise the error of the first failure, or 0 when nr is 0.
   *	Returns -EINVAL if nr is negative or ctx_id does not name a
   *	context, and -EFAULT if the pointer array fails access_ok().
   */
  long do_io_submit(aio_context_t ctx_id, long nr,
  		  struct iocb __user *__user *iocbpp, bool compat)
  {
  	struct kioctx *ctx;
  	long ret = 0;
  	/*
  	 * NOTE(review): i is an int while nr is a long; this relies on far
  	 * fewer than INT_MAX submissions succeeding in one call - confirm.
  	 */
  	int i = 0;
  	struct blk_plug plug;

  	if (unlikely(nr < 0))
  		return -EINVAL;

  	/* Clamp nr so that nr * sizeof(*iocbpp) below cannot overflow. */
  	if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
  		nr = LONG_MAX/sizeof(*iocbpp);

  	if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
  		return -EFAULT;

  	ctx = lookup_ioctx(ctx_id);
  	if (unlikely(!ctx)) {
  		pr_debug("EINVAL: invalid context id\n");
  		return -EINVAL;
  	}

  	blk_start_plug(&plug);

  	/*
  	 * AKPM: should this return a partial result if some of the IOs were
  	 * successfully submitted?
  	 */
  	for (i=0; i<nr; i++) {
  		struct iocb __user *user_iocb;
  		struct iocb tmp;

  		if (unlikely(__get_user(user_iocb, iocbpp + i))) {
  			ret = -EFAULT;
  			break;
  		}

  		if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {
  			ret = -EFAULT;
  			break;
  		}

  		ret = io_submit_one(ctx, user_iocb, &tmp, compat);
  		if (ret)
  			break;
  	}
  	blk_finish_plug(&plug);

  	/* Drop the reference obtained through lookup_ioctx(). */
  	percpu_ref_put(&ctx->users);
  	/* An error is only reported when nothing at all was submitted. */
  	return i ? i : ret;
  }
9d85cba71   Jeff Moyer   aio: fix the comp...
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
  /* sys_io_submit:
   *	Queues the nr iocbs referenced by the pointer array iocbpp for
   *	processing and returns how many were queued.  Returns 0 if nr
   *	is 0.  May return -EINVAL if the aio_context named by ctx_id is
   *	invalid, if nr is < 0, if the iocb at *iocbpp[0] is not properly
   *	initialized, or if the requested operation is invalid for the
   *	file descriptor in the iocb.  May fail with -EFAULT if any of
   *	the data structures point to invalid data, with -EBADF if the
   *	file descriptor in the first iocb is invalid, and with -EAGAIN
   *	if insufficient resources are available to queue any iocbs.
   *	Will fail with -ENOSYS if not implemented.
   */
  SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
  		struct iocb __user * __user *, iocbpp)
  {
  	/* This entry point always submits with compat handling disabled. */
  	const bool compat = false;

  	return do_io_submit(ctx_id, nr, iocbpp, compat);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1568
1569
  /* lookup_kiocb
   *	Finds a given iocb for cancellation.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1570
   */
25ee7e383   Adrian Bunk   [PATCH] fs/aio.c:...
1571
1572
  static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
  				  u32 key)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1573
1574
  {
  	struct list_head *pos;
d00689af6   Zach Brown   [PATCH] aio: repl...
1575
1576
  
  	assert_spin_locked(&ctx->ctx_lock);
8a6608907   Kent Overstreet   aio: kill ki_key
1577
1578
  	if (key != KIOCB_KEY)
  		return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1579
1580
1581
  	/* TODO: use a hash or array, this sucks. */
  	list_for_each(pos, &ctx->active_reqs) {
  		struct kiocb *kiocb = list_kiocb(pos);
8a6608907   Kent Overstreet   aio: kill ki_key
1582
  		if (kiocb->ki_obj.user == iocb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
  			return kiocb;
  	}
  	return NULL;
  }
  
  /* sys_io_cancel:
   *	Attempts to cancel an iocb previously passed to io_submit.  If
   *	the operation is successfully cancelled, the resulting event is
   *	copied into the memory pointed to by result without being placed
   *	into the completion queue and 0 is returned.  May fail with
   *	-EFAULT if any of the data structures pointed to are invalid.
   *	May fail with -EINVAL if aio_context specified by ctx_id is
   *	invalid.  May fail with -EAGAIN if the iocb specified was not
   *	cancelled.  Will fail with -ENOSYS if not implemented.
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1598
1599
  SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
  		struct io_event __user *, result)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1600
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
  	struct kioctx *ctx;
  	struct kiocb *kiocb;
  	u32 key;
  	int ret;
  
  	ret = get_user(key, &iocb->aio_key);
  	if (unlikely(ret))
  		return -EFAULT;
  
  	ctx = lookup_ioctx(ctx_id);
  	if (unlikely(!ctx))
  		return -EINVAL;
  
  	spin_lock_irq(&ctx->ctx_lock);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1615

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1616
  	kiocb = lookup_kiocb(ctx, iocb, key);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1617
  	if (kiocb)
d52a8f9ea   Fabian Frederick   fs/aio.c: Remove ...
1618
  		ret = kiocb_cancel(kiocb);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1619
1620
  	else
  		ret = -EINVAL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1621
  	spin_unlock_irq(&ctx->ctx_lock);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1622
  	if (!ret) {
bec68faaf   Kent Overstreet   aio: io_cancel() ...
1623
1624
1625
1626
  		/*
  		 * The result argument is no longer used - the io_event is
  		 * always delivered via the ring buffer. -EINPROGRESS indicates
  		 * cancellation is progress:
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1627
  		 */
bec68faaf   Kent Overstreet   aio: io_cancel() ...
1628
  		ret = -EINPROGRESS;
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1629
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1630

723be6e39   Kent Overstreet   aio: percpu ioctx...
1631
  	percpu_ref_put(&ctx->users);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1632
1633
1634
1635
1636
1637
  
  	return ret;
  }
  
  /* io_getevents:
   *	Attempts to read at least min_nr events and up to nr events from
642b5123a   Satoru Takeuchi   aio: fix wrong su...
1638
1639
1640
1641
1642
1643
1644
1645
   *	the completion queue for the aio_context specified by ctx_id. If
   *	it succeeds, the number of read events is returned. May fail with
   *	-EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
   *	out of range, if timeout is out of range.  May fail with -EFAULT
   *	if any of the memory specified is invalid.  May return 0 or
   *	< min_nr if the timeout specified by timeout has elapsed
   *	before sufficient events are available, where timeout == NULL
   *	specifies an infinite timeout. Note that the timeout pointed to by
6900807c6   Jeff Moyer   aio: fix io_getev...
1646
   *	timeout is relative.  Will fail with -ENOSYS if not implemented.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1647
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1648
1649
1650
1651
1652
  SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
  		long, min_nr,
  		long, nr,
  		struct io_event __user *, events,
  		struct timespec __user *, timeout)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1653
1654
1655
1656
1657
  {
  	struct kioctx *ioctx = lookup_ioctx(ctx_id);
  	long ret = -EINVAL;
  
  	if (likely(ioctx)) {
2e4102559   Namhyung Kim   aio: remove unnec...
1658
  		if (likely(min_nr <= nr && min_nr >= 0))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1659
  			ret = read_events(ioctx, min_nr, nr, events, timeout);
723be6e39   Kent Overstreet   aio: percpu ioctx...
1660
  		percpu_ref_put(&ioctx->users);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1661
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1662
1663
  	return ret;
  }