Blame view

fs/aio.c 42.2 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
8
9
10
  /*
   *	An async IO implementation for Linux
   *	Written by Benjamin LaHaise <bcrl@kvack.org>
   *
   *	Implements an efficient asynchronous io interface.
   *
   *	Copyright 2000, 2001, 2002 Red Hat, Inc.  All Rights Reserved.
   *
   *	See ../COPYING for licensing terms.
   */
caf4167aa   Kent Overstreet   aio: dprintk() ->...
11
  #define pr_fmt(fmt) "%s: " fmt, __func__
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
12
13
14
15
16
  #include <linux/kernel.h>
  #include <linux/init.h>
  #include <linux/errno.h>
  #include <linux/time.h>
  #include <linux/aio_abi.h>
630d9c472   Paul Gortmaker   fs: reduce the us...
17
  #include <linux/export.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
18
  #include <linux/syscalls.h>
b9d128f10   Jens Axboe   block: move bdi/a...
19
  #include <linux/backing-dev.h>
027445c37   Badari Pulavarty   [PATCH] Vectorize...
20
  #include <linux/uio.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
21

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
22
23
24
25
26
  #include <linux/sched.h>
  #include <linux/fs.h>
  #include <linux/file.h>
  #include <linux/mm.h>
  #include <linux/mman.h>
3d2d827f5   Michael S. Tsirkin   mm: move use_mm/u...
27
  #include <linux/mmu_context.h>
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
28
  #include <linux/percpu.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
29
30
31
32
33
34
  #include <linux/slab.h>
  #include <linux/timer.h>
  #include <linux/aio.h>
  #include <linux/highmem.h>
  #include <linux/workqueue.h>
  #include <linux/security.h>
9c3060bed   Davide Libenzi   signal/timer/even...
35
  #include <linux/eventfd.h>
cfb1e33ee   Jeff Moyer   aio: implement re...
36
  #include <linux/blkdev.h>
9d85cba71   Jeff Moyer   aio: fix the comp...
37
  #include <linux/compat.h>
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
38
39
  #include <linux/migrate.h>
  #include <linux/ramfs.h>
723be6e39   Kent Overstreet   aio: percpu ioctx...
40
  #include <linux/percpu-refcount.h>
71ad7490c   Benjamin LaHaise   rework aio migrat...
41
  #include <linux/mount.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
42
43
44
  
  #include <asm/kmap_types.h>
  #include <asm/uaccess.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
45

68d70d03f   Al Viro   constify rw_verif...
46
  #include "internal.h"
4e179bca6   Kent Overstreet   aio: move private...
47
48
49
50
51
52
  #define AIO_RING_MAGIC			0xa10a10a1
  #define AIO_RING_COMPAT_FEATURES	1
  #define AIO_RING_INCOMPAT_FEATURES	0
  /*
   * Header found at the start of the first ring page; the io_event slots
   * follow it directly.  aio_setup_ring() fills these fields in through a
   * kernel mapping of ring_pages[0], and the same pages are mmap()ed into
   * the owning process.
   */
  struct aio_ring {
  	unsigned	id;	/* kernel internal index number */
  	unsigned	nr;	/* number of io_events */
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
53
54
  	unsigned	head;	/* Written to by userland or under ring_lock
  				 * mutex by aio_read_events_ring(). */
4e179bca6   Kent Overstreet   aio: move private...
55
56
57
58
59
60
61
62
63
64
65
66
  	unsigned	tail;
  
  	unsigned	magic;	/* set to AIO_RING_MAGIC at setup */
  	unsigned	compat_features;	/* set to AIO_RING_COMPAT_FEATURES */
  	unsigned	incompat_features;	/* set to AIO_RING_INCOMPAT_FEATURES */
  	unsigned	header_length;	/* size of aio_ring */
  
  
  	struct io_event		io_events[0];
  }; /* 128 bytes + ring size -- NOTE(review): only eight unsigned fields are
      * visible in this header; confirm the "128 bytes" figure. */
  
  #define AIO_RING_PAGES	8
4e179bca6   Kent Overstreet   aio: move private...
67

db446a08c   Benjamin LaHaise   aio: convert the ...
68
69
70
71
72
  /*
   * Per-mm table of aio contexts, published through mm->ioctx_table.
   * ioctx_add_table() replaces it with a larger copy when every slot is in
   * use and frees the old one with kfree_rcu().
   */
  struct kioctx_table {
  	struct rcu_head	rcu;		/* used by kfree_rcu() on the old table */
  	unsigned	nr;		/* number of entries in table[] */
  	struct kioctx	*table[];	/* NULL entry == free slot */
  };
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
73
74
75
  /*
   * Per-cpu part of a kioctx (allocated with alloc_percpu() in
   * ioctx_alloc()).
   */
  struct kioctx_cpu {
  	/* Per-cpu share of ring slots; see the reqs_available / req_batch
  	 * comments in struct kioctx. */
  	unsigned		reqs_available;
  };
4e179bca6   Kent Overstreet   aio: move private...
76
  /*
   * Kernel-side state of one aio context: reference counts, the ring buffer
   * pages and their user mapping, and the locks protecting them.
   */
  struct kioctx {
723be6e39   Kent Overstreet   aio: percpu ioctx...
77
  	/* Reference count; its release callback is free_ioctx_users(). */
  	struct percpu_ref	users;
36f558890   Kent Overstreet   aio: refcounting ...
78
  	/* NOTE(review): presumably marks a context being torn down; its
  	 * users are not visible in this part of the file. */
  	atomic_t		dead;
4e179bca6   Kent Overstreet   aio: move private...
79

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
80
  	/* Reference count for in-flight requests; its release callback is
  	 * free_ioctx_reqs(), and free_ioctx_users() kills it. */
  	struct percpu_ref	reqs;
4e179bca6   Kent Overstreet   aio: move private...
81
  	/* Set to mmap_base by aio_setup_ring(). */
  	unsigned long		user_id;
4e179bca6   Kent Overstreet   aio: move private...
82

e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
83
84
85
86
87
88
89
  	/* Per-cpu data, allocated in ioctx_alloc() and released in
  	 * free_ioctx(). */
  	struct __percpu kioctx_cpu *cpu;
  
  	/*
  	 * For percpu reqs_available, number of slots we move to/from global
  	 * counter at a time:
  	 */
  	unsigned		req_batch;
3e845ce01   Kent Overstreet   aio: change reqs_...
90
91
92
93
  	/*
  	 * This is what userspace passed to io_setup(), it's not used for
  	 * anything but counting against the global max_reqs quota.
  	 *
58c85dc20   Kent Overstreet   aio: kill struct ...
94
  	 * The real limit is nr_events - 1, which will be larger (see
3e845ce01   Kent Overstreet   aio: change reqs_...
95
96
  	 * aio_setup_ring())
  	 */
4e179bca6   Kent Overstreet   aio: move private...
97
  	unsigned		max_reqs;
58c85dc20   Kent Overstreet   aio: kill struct ...
98
99
  	/* Size of ringbuffer, in units of struct io_event */
  	unsigned		nr_events;
4e179bca6   Kent Overstreet   aio: move private...
100

58c85dc20   Kent Overstreet   aio: kill struct ...
101
102
103
104
105
  	/* User address and byte length of the ring mapping created by
  	 * aio_setup_ring(). */
  	unsigned long		mmap_base;
  	unsigned long		mmap_size;
  
  	/* Ring pages: points at internal_pages, or at a kcalloc()ed array
  	 * when more than AIO_RING_PAGES pages are needed.  nr_pages is the
  	 * number of populated entries. */
  	struct page		**ring_pages;
  	long			nr_pages;
723be6e39   Kent Overstreet   aio: percpu ioctx...
106
  	/* Work item that free_ioctx_reqs() schedules to run free_ioctx(). */
  	struct work_struct	free_work;
4e23bcaeb   Kent Overstreet   aio: give shared ...
107

e02ba72aa   Anatol Pomozov   aio: block io_des...
108
109
110
111
  	/*
  	 * signals when all in-flight requests are done
  	 */
  	struct completion *requests_done;
4e23bcaeb   Kent Overstreet   aio: give shared ...
112
  	struct {
34e83fc61   Kent Overstreet   aio: reqs_active ...
113
114
115
116
117
  		/*
  		 * This counts the number of available slots in the ringbuffer,
  		 * so we avoid overflowing it: it's decremented (if positive)
  		 * when allocating a kiocb and incremented when the resulting
  		 * io_event is pulled off the ringbuffer.
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
118
119
  		 *
  		 * We batch accesses to it with a percpu version.
34e83fc61   Kent Overstreet   aio: reqs_active ...
120
121
  		 */
  		atomic_t	reqs_available;
4e23bcaeb   Kent Overstreet   aio: give shared ...
122
123
124
125
126
127
  	} ____cacheline_aligned_in_smp;
  
  	struct {
  		/* ctx_lock is held while active_reqs is modified. */
  		spinlock_t	ctx_lock;
  		struct list_head active_reqs;	/* used for cancellation */
  	} ____cacheline_aligned_in_smp;
58c85dc20   Kent Overstreet   aio: kill struct ...
128
129
  	struct {
  		/* Held for the whole of context setup; page migration only
  		 * proceeds if it can trylock it. */
  		struct mutex	ring_lock;
4e23bcaeb   Kent Overstreet   aio: give shared ...
130
131
  		wait_queue_head_t wait;
  	} ____cacheline_aligned_in_smp;
58c85dc20   Kent Overstreet   aio: kill struct ...
132
133
134
  
  	struct {
  		unsigned	tail;
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
135
  		unsigned	completed_events;
58c85dc20   Kent Overstreet   aio: kill struct ...
136
  		/* Taken by aio_migratepage() while a ring page is copied. */
  		spinlock_t	completion_lock;
4e23bcaeb   Kent Overstreet   aio: give shared ...
137
  	} ____cacheline_aligned_in_smp;
58c85dc20   Kent Overstreet   aio: kill struct ...
138
139
  
  	/* Inline storage for ring_pages when the ring is small enough. */
  	struct page		*internal_pages[AIO_RING_PAGES];
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
140
  	/* File backing the ring pages; cleared by put_aio_ring_file(). */
  	struct file		*aio_ring_file;
db446a08c   Benjamin LaHaise   aio: convert the ...
141
142
  
  	/* Slot index in the mm's kioctx_table; also written to ring->id. */
  	unsigned		id;
4e179bca6   Kent Overstreet   aio: move private...
143
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
144
  /*------ sysctl variables----*/
d55b5fdaf   Zach Brown   [PATCH] aio: remo...
145
146
147
  static DEFINE_SPINLOCK(aio_nr_lock);
  unsigned long aio_nr;		/* current system wide number of aio requests */
  unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
148
  /*----end sysctl variables---*/
e18b890bb   Christoph Lameter   [PATCH] slab: rem...
149
150
  static struct kmem_cache	*kiocb_cachep;
  static struct kmem_cache	*kioctx_cachep;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
151

71ad7490c   Benjamin LaHaise   rework aio migrat...
152
153
154
155
  static struct vfsmount *aio_mnt;
  
  static const struct file_operations aio_ring_fops;
  static const struct address_space_operations aio_ctx_aops;
835f252c6   Gu Zheng   aio: fix uncorren...
156
157
158
159
160
161
162
163
  /*
   * Backing dev info for the aio pseudo filesystem.
   *  - no dirty page accounting or writeback happens
   * aio_private_file() attaches it to every ring inode's mapping and
   * aio_setup() initialises it with bdi_init().
   */
  static struct backing_dev_info aio_fs_backing_dev_info = {
  	.name           = "aiofs",
  	.state          = 0,
  	.capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
  };
71ad7490c   Benjamin LaHaise   rework aio migrat...
164
165
166
167
168
169
  /*
   * aio_private_file
   *	Create the anonymous "[aio]" file that backs a context's ring pages.
   *	The inode's mapping is wired to aio_ctx_aops, points back at @ctx via
   *	private_data, and is sized to @nr_pages pages.
   *
   *	Returns the new file (opened O_RDWR) or an ERR_PTR: the error from
   *	alloc_anon_inode()/alloc_file(), or -ENOMEM if no dentry could be
   *	allocated.
   */
  static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
  {
  	struct qstr name = QSTR_INIT("[aio]", 5);
  	struct address_space *mapping;
  	struct inode *inode;
  	struct path path;
  	struct file *file;

  	inode = alloc_anon_inode(aio_mnt->mnt_sb);
  	if (IS_ERR(inode))
  		return ERR_CAST(inode);

  	mapping = inode->i_mapping;
  	mapping->a_ops = &aio_ctx_aops;
  	mapping->private_data = ctx;
  	mapping->backing_dev_info = &aio_fs_backing_dev_info;
  	inode->i_size = PAGE_SIZE * nr_pages;

  	path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &name);
  	if (!path.dentry) {
  		/* No dentry owns the inode yet, so drop it directly. */
  		iput(inode);
  		return ERR_PTR(-ENOMEM);
  	}
  	path.mnt = mntget(aio_mnt);
  	d_instantiate(path.dentry, inode);

  	file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &aio_ring_fops);
  	if (IS_ERR(file))
  		path_put(&path);
  	else
  		file->f_flags = O_RDWR;

  	return file;
  }
  
  static struct dentry *aio_mount(struct file_system_type *fs_type,
  				int flags, const char *dev_name, void *data)
  {
  	static const struct dentry_operations ops = {
  		.d_dname	= simple_dname,
  	};
8dc4379e1   Gu Zheng   aio: use the macr...
202
  	return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
71ad7490c   Benjamin LaHaise   rework aio migrat...
203
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
204
205
206
207
208
209
  /* aio_setup
   *	Boot-time initialisation for aio: mounts the internal "aio" pseudo
   *	filesystem, initialises its backing_dev_info and creates the slab
   *	caches for kiocb and kioctx.  Every failure panics, as this is done
   *	early during the boot sequence (the caches are created with
   *	SLAB_PANIC).
   *
   *	Always returns 0.
   */
  static int __init aio_setup(void)
  {
  	static struct file_system_type aio_fs = {
  		.name		= "aio",
  		.mount		= aio_mount,
  		.kill_sb	= kill_anon_super,
  	};

  	aio_mnt = kern_mount(&aio_fs);
  	if (IS_ERR(aio_mnt))
  		panic("Failed to create aio fs mount.");

  	if (bdi_init(&aio_fs_backing_dev_info))
  		panic("Failed to init aio fs backing dev info.");

  	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
  	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);

  	/* Newline is written as an escape so the literal stays on one line. */
  	pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));

  	return 0;
  }
385773e04   H Hartley Sweeten   aio.c: move EXPOR...
228
  __initcall(aio_setup);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
229

5e9ae2e5d   Benjamin LaHaise   aio: fix use-afte...
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
  static void put_aio_ring_file(struct kioctx *ctx)
  {
  	struct file *aio_ring_file = ctx->aio_ring_file;
  	if (aio_ring_file) {
  		truncate_setsize(aio_ring_file->f_inode, 0);
  
  		/* Prevent further access to the kioctx from migratepages */
  		spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock);
  		aio_ring_file->f_inode->i_mapping->private_data = NULL;
  		ctx->aio_ring_file = NULL;
  		spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock);
  
  		fput(aio_ring_file);
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
245
246
  static void aio_free_ring(struct kioctx *ctx)
  {
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
247
  	int i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
248

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
249
250
251
252
  	/* Disconnect the kiotx from the ring file.  This prevents future
  	 * accesses to the kioctx from page migration.
  	 */
  	put_aio_ring_file(ctx);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
253
  	for (i = 0; i < ctx->nr_pages; i++) {
8e321fefb   Benjamin LaHaise   aio/migratepages:...
254
  		struct page *page;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
255
256
257
  		pr_debug("pid(%d) [%d] page->count=%d
  ", current->pid, i,
  				page_count(ctx->ring_pages[i]));
8e321fefb   Benjamin LaHaise   aio/migratepages:...
258
259
260
261
262
  		page = ctx->ring_pages[i];
  		if (!page)
  			continue;
  		ctx->ring_pages[i] = NULL;
  		put_page(page);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
263
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
264

ddb8c45ba   Sasha Levin   aio: nullify aio-...
265
  	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
58c85dc20   Kent Overstreet   aio: kill struct ...
266
  		kfree(ctx->ring_pages);
ddb8c45ba   Sasha Levin   aio: nullify aio-...
267
268
  		ctx->ring_pages = NULL;
  	}
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
269
270
271
272
273
274
275
276
277
278
279
  }
  
  /*
   * ->mmap() handler for the ring file: installs generic_file_vm_ops on the
   * VMA and reports success.
   */
  static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
  {
  	vma->vm_ops = &generic_file_vm_ops;
  	return 0;
  }
  
  /* File operations of the ring file; only mmap is provided. */
  static const struct file_operations aio_ring_fops = {
  	.mmap = aio_ring_mmap,
  };
0c45355fc   Benjamin LaHaise   aio: fix build wh...
280
  #if IS_ENABLED(CONFIG_MIGRATION)
  /*
   * aio_migratepage
   *	->migratepage() handler for ring pages: moves ring page @old to @new
   *	and updates ctx->ring_pages[] to match.
   *
   *	Returns the result of migrate_page_move_mapping() when the move was
   *	attempted, -EINVAL if the mapping no longer has a kioctx or the page
   *	index is outside the ring, and -EAGAIN if ring_lock is contended or
   *	the slot no longer holds @old.
   */
  static int aio_migratepage(struct address_space *mapping, struct page *new,
  			struct page *old, enum migrate_mode mode)
  {
  	struct kioctx *ctx;
  	unsigned long irqflags;
  	pgoff_t idx;
  	int rc = 0;

  	/* mapping->private_lock here protects against the kioctx teardown.  */
  	spin_lock(&mapping->private_lock);
  	ctx = mapping->private_data;
  	if (!ctx) {
  		rc = -EINVAL;
  		goto out;
  	}

  	/* Take the ring_lock mutex.  This prevents aio_read_events() from
  	 * writing to the ring's head, and prevents page migration from
  	 * mucking in a partially initialized kioctx.
  	 */
  	if (!mutex_trylock(&ctx->ring_lock)) {
  		rc = -EAGAIN;
  		goto out;
  	}

  	idx = old->index;
  	if (idx >= (pgoff_t)ctx->nr_pages) {
  		rc = -EINVAL;
  		goto out_unlock;
  	}

  	/* Make sure the old page hasn't already been changed */
  	if (ctx->ring_pages[idx] != old) {
  		rc = -EAGAIN;
  		goto out_unlock;
  	}

  	/* Writeback must be complete */
  	BUG_ON(PageWriteback(old));
  	get_page(new);

  	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
  	if (rc != MIGRATEPAGE_SUCCESS) {
  		put_page(new);
  		goto out_unlock;
  	}

  	/* Take completion_lock to prevent other writes to the ring buffer
  	 * while the old page is copied to the new.  This prevents new
  	 * events from being lost.
  	 */
  	spin_lock_irqsave(&ctx->completion_lock, irqflags);
  	migrate_page_copy(new, old);
  	BUG_ON(ctx->ring_pages[idx] != old);
  	ctx->ring_pages[idx] = new;
  	spin_unlock_irqrestore(&ctx->completion_lock, irqflags);

  	/* The old page is no longer accessible. */
  	put_page(old);

  out_unlock:
  	mutex_unlock(&ctx->ring_lock);
  out:
  	spin_unlock(&mapping->private_lock);
  	return rc;
  }
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
346

36bc08cc0   Gu Zheng   fs/aio: Add suppo...
347
  /*
   * Address space operations installed on every ring inode's mapping by
   * aio_private_file().
   */
  static const struct address_space_operations aio_ctx_aops = {
835f252c6   Gu Zheng   aio: fix uncorren...
348
  	.set_page_dirty = __set_page_dirty_no_writeback,
0c45355fc   Benjamin LaHaise   aio: fix build wh...
349
  #if IS_ENABLED(CONFIG_MIGRATION)
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
350
  	/* Only present when page migration is configured in. */
  	.migratepage	= aio_migratepage,
0c45355fc   Benjamin LaHaise   aio: fix build wh...
351
  #endif
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
352
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
353
354
355
  static int aio_setup_ring(struct kioctx *ctx)
  {
  	struct aio_ring *ring;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
356
  	unsigned nr_events = ctx->max_reqs;
41003a7bc   Zach Brown   aio: remove retry...
357
  	struct mm_struct *mm = current->mm;
3dc9acb67   Linus Torvalds   aio: clean up and...
358
  	unsigned long size, unused;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
359
  	int nr_pages;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
360
361
  	int i;
  	struct file *file;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
362
363
364
365
366
367
  
  	/* Compensate for the ring buffer's head/tail overlap entry */
  	nr_events += 2;	/* 1 is required, 2 for good luck */
  
  	size = sizeof(struct aio_ring);
  	size += sizeof(struct io_event) * nr_events;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
368

36bc08cc0   Gu Zheng   fs/aio: Add suppo...
369
  	nr_pages = PFN_UP(size);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
370
371
  	if (nr_pages < 0)
  		return -EINVAL;
71ad7490c   Benjamin LaHaise   rework aio migrat...
372
  	file = aio_private_file(ctx, nr_pages);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
373
374
  	if (IS_ERR(file)) {
  		ctx->aio_ring_file = NULL;
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
375
  		return -ENOMEM;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
376
  	}
3dc9acb67   Linus Torvalds   aio: clean up and...
377
378
379
380
381
382
383
384
385
386
387
388
389
  	ctx->aio_ring_file = file;
  	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
  			/ sizeof(struct io_event);
  
  	ctx->ring_pages = ctx->internal_pages;
  	if (nr_pages > AIO_RING_PAGES) {
  		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
  					  GFP_KERNEL);
  		if (!ctx->ring_pages) {
  			put_aio_ring_file(ctx);
  			return -ENOMEM;
  		}
  	}
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
390
391
392
393
394
395
396
397
398
399
  	for (i = 0; i < nr_pages; i++) {
  		struct page *page;
  		page = find_or_create_page(file->f_inode->i_mapping,
  					   i, GFP_HIGHUSER | __GFP_ZERO);
  		if (!page)
  			break;
  		pr_debug("pid(%d) page[%d]->count=%d
  ",
  			 current->pid, i, page_count(page));
  		SetPageUptodate(page);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
400
  		unlock_page(page);
3dc9acb67   Linus Torvalds   aio: clean up and...
401
402
  
  		ctx->ring_pages[i] = page;
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
403
  	}
3dc9acb67   Linus Torvalds   aio: clean up and...
404
  	ctx->nr_pages = i;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
405

3dc9acb67   Linus Torvalds   aio: clean up and...
406
407
  	if (unlikely(i != nr_pages)) {
  		aio_free_ring(ctx);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
408
  		return -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
409
  	}
58c85dc20   Kent Overstreet   aio: kill struct ...
410
411
412
  	ctx->mmap_size = nr_pages * PAGE_SIZE;
  	pr_debug("attempting mmap of %lu bytes
  ", ctx->mmap_size);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
413

41003a7bc   Zach Brown   aio: remove retry...
414
  	down_write(&mm->mmap_sem);
36bc08cc0   Gu Zheng   fs/aio: Add suppo...
415
416
  	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
  				       PROT_READ | PROT_WRITE,
3dc9acb67   Linus Torvalds   aio: clean up and...
417
418
  				       MAP_SHARED, 0, &unused);
  	up_write(&mm->mmap_sem);
58c85dc20   Kent Overstreet   aio: kill struct ...
419
  	if (IS_ERR((void *)ctx->mmap_base)) {
58c85dc20   Kent Overstreet   aio: kill struct ...
420
  		ctx->mmap_size = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
421
  		aio_free_ring(ctx);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
422
  		return -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
423
  	}
58c85dc20   Kent Overstreet   aio: kill struct ...
424
425
  	pr_debug("mmap address: 0x%08lx
  ", ctx->mmap_base);
d6c355c7d   Benjamin LaHaise   aio: fix race in ...
426

58c85dc20   Kent Overstreet   aio: kill struct ...
427
428
  	ctx->user_id = ctx->mmap_base;
  	ctx->nr_events = nr_events; /* trusted copy */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
429

58c85dc20   Kent Overstreet   aio: kill struct ...
430
  	ring = kmap_atomic(ctx->ring_pages[0]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
431
  	ring->nr = nr_events;	/* user copy */
db446a08c   Benjamin LaHaise   aio: convert the ...
432
  	ring->id = ~0U;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
433
434
435
436
437
  	ring->head = ring->tail = 0;
  	ring->magic = AIO_RING_MAGIC;
  	ring->compat_features = AIO_RING_COMPAT_FEATURES;
  	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
  	ring->header_length = sizeof(struct aio_ring);
e8e3c3d66   Cong Wang   fs: remove the se...
438
  	kunmap_atomic(ring);
58c85dc20   Kent Overstreet   aio: kill struct ...
439
  	flush_dcache_page(ctx->ring_pages[0]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
440
441
442
  
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
443
444
445
  #define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
  #define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
  #define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
0460fef2a   Kent Overstreet   aio: use cancella...
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
  /*
   * kiocb_set_cancel_fn
   *	Register @cancel as the cancellation callback of @req.  Under
   *	ctx_lock (interrupts disabled) the request is linked onto its
   *	context's active_reqs list, unless it is already linked, and
   *	ki_cancel is set.
   */
  void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
  {
  	struct kioctx *ioctx = req->ki_ctx;
  	unsigned long irqflags;

  	spin_lock_irqsave(&ioctx->ctx_lock, irqflags);

  	/* A NULL ->next is treated as "not on the list yet". */
  	if (req->ki_list.next == NULL)
  		list_add(&req->ki_list, &ioctx->active_reqs);
  	req->ki_cancel = cancel;

  	spin_unlock_irqrestore(&ioctx->ctx_lock, irqflags);
  }
  EXPORT_SYMBOL(kiocb_set_cancel_fn);
d52a8f9ea   Fabian Frederick   fs/aio.c: Remove ...
461
  static int kiocb_cancel(struct kiocb *kiocb)
906b973cf   Kent Overstreet   aio: add kiocb_ca...
462
  {
0460fef2a   Kent Overstreet   aio: use cancella...
463
  	kiocb_cancel_fn *old, *cancel;
906b973cf   Kent Overstreet   aio: add kiocb_ca...
464

0460fef2a   Kent Overstreet   aio: use cancella...
465
466
467
468
469
470
471
472
  	/*
  	 * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
  	 * actually has a cancel function, hence the cmpxchg()
  	 */
  
  	cancel = ACCESS_ONCE(kiocb->ki_cancel);
  	do {
  		if (!cancel || cancel == KIOCB_CANCELLED)
57282d8fd   Kent Overstreet   aio: Kill ki_users
473
  			return -EINVAL;
906b973cf   Kent Overstreet   aio: add kiocb_ca...
474

0460fef2a   Kent Overstreet   aio: use cancella...
475
476
477
  		old = cancel;
  		cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
  	} while (cancel != old);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
478

57282d8fd   Kent Overstreet   aio: Kill ki_users
479
  	return cancel(kiocb);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
480
  }
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
481
  static void free_ioctx(struct work_struct *work)
36f558890   Kent Overstreet   aio: refcounting ...
482
  {
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
483
  	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
484

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
485
486
  	pr_debug("freeing %p
  ", ctx);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
487

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
488
  	aio_free_ring(ctx);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
489
  	free_percpu(ctx->cpu);
9a1049da9   Tejun Heo   percpu-refcount: ...
490
491
  	percpu_ref_exit(&ctx->reqs);
  	percpu_ref_exit(&ctx->users);
36f558890   Kent Overstreet   aio: refcounting ...
492
493
  	kmem_cache_free(kioctx_cachep, ctx);
  }
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
494
495
496
  static void free_ioctx_reqs(struct percpu_ref *ref)
  {
  	struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
e02ba72aa   Anatol Pomozov   aio: block io_des...
497
498
499
  	/* At this point we know that there are no any in-flight requests */
  	if (ctx->requests_done)
  		complete(ctx->requests_done);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
500
501
502
  	INIT_WORK(&ctx->free_work, free_ioctx);
  	schedule_work(&ctx->free_work);
  }
36f558890   Kent Overstreet   aio: refcounting ...
503
504
505
506
507
  /*
   * When this function runs, the kioctx has been removed from the "hash table"
   * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
   * now it's safe to cancel any that need to be.
   */
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
508
  static void free_ioctx_users(struct percpu_ref *ref)
36f558890   Kent Overstreet   aio: refcounting ...
509
  {
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
510
  	struct kioctx *ctx = container_of(ref, struct kioctx, users);
36f558890   Kent Overstreet   aio: refcounting ...
511
512
513
514
515
516
517
518
519
  	struct kiocb *req;
  
  	spin_lock_irq(&ctx->ctx_lock);
  
  	while (!list_empty(&ctx->active_reqs)) {
  		req = list_first_entry(&ctx->active_reqs,
  				       struct kiocb, ki_list);
  
  		list_del_init(&req->ki_list);
d52a8f9ea   Fabian Frederick   fs/aio.c: Remove ...
520
  		kiocb_cancel(req);
36f558890   Kent Overstreet   aio: refcounting ...
521
522
523
  	}
  
  	spin_unlock_irq(&ctx->ctx_lock);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
524
525
  	percpu_ref_kill(&ctx->reqs);
  	percpu_ref_put(&ctx->reqs);
36f558890   Kent Overstreet   aio: refcounting ...
526
  }
db446a08c   Benjamin LaHaise   aio: convert the ...
527
528
529
530
531
532
533
  static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
  {
  	unsigned i, new_nr;
  	struct kioctx_table *table, *old;
  	struct aio_ring *ring;
  
  	spin_lock(&mm->ioctx_lock);
855ef0dec   Oleg Nesterov   aio: kill the mis...
534
  	table = rcu_dereference_raw(mm->ioctx_table);
db446a08c   Benjamin LaHaise   aio: convert the ...
535
536
537
538
539
540
541
542
  
  	while (1) {
  		if (table)
  			for (i = 0; i < table->nr; i++)
  				if (!table->table[i]) {
  					ctx->id = i;
  					table->table[i] = ctx;
  					spin_unlock(&mm->ioctx_lock);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
543
544
545
546
  					/* While kioctx setup is in progress,
  					 * we are protected from page migration
  					 * changes ring_pages by ->ring_lock.
  					 */
db446a08c   Benjamin LaHaise   aio: convert the ...
547
548
549
550
551
552
553
  					ring = kmap_atomic(ctx->ring_pages[0]);
  					ring->id = ctx->id;
  					kunmap_atomic(ring);
  					return 0;
  				}
  
  		new_nr = (table ? table->nr : 1) * 4;
db446a08c   Benjamin LaHaise   aio: convert the ...
554
555
556
557
558
559
560
561
562
563
  		spin_unlock(&mm->ioctx_lock);
  
  		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
  				new_nr, GFP_KERNEL);
  		if (!table)
  			return -ENOMEM;
  
  		table->nr = new_nr;
  
  		spin_lock(&mm->ioctx_lock);
855ef0dec   Oleg Nesterov   aio: kill the mis...
564
  		old = rcu_dereference_raw(mm->ioctx_table);
db446a08c   Benjamin LaHaise   aio: convert the ...
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
  
  		if (!old) {
  			rcu_assign_pointer(mm->ioctx_table, table);
  		} else if (table->nr > old->nr) {
  			memcpy(table->table, old->table,
  			       old->nr * sizeof(struct kioctx *));
  
  			rcu_assign_pointer(mm->ioctx_table, table);
  			kfree_rcu(old, rcu);
  		} else {
  			kfree(table);
  			table = old;
  		}
  	}
  }
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
580
581
582
583
584
585
586
587
588
  /*
   * aio_nr_sub
   *	Subtract @nr from the system-wide aio_nr counter under aio_nr_lock.
   *	If the subtraction would wrap below zero, a warning is raised and
   *	the counter is reset to 0 instead.
   */
  static void aio_nr_sub(unsigned nr)
  {
  	spin_lock(&aio_nr_lock);
  	aio_nr = WARN_ON(aio_nr - nr > aio_nr) ? 0 : aio_nr - nr;
  	spin_unlock(&aio_nr_lock);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
589
590
591
592
593
  /* ioctx_alloc
   *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
   */
  static struct kioctx *ioctx_alloc(unsigned nr_events)
  {
41003a7bc   Zach Brown   aio: remove retry...
594
  	struct mm_struct *mm = current->mm;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
595
  	struct kioctx *ctx;
e23754f88   Al Viro   aio: don't bother...
596
  	int err = -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
597

e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
598
599
600
601
602
603
604
605
606
607
608
  	/*
  	 * We keep track of the number of available ringbuffer slots, to prevent
  	 * overflow (reqs_available), and we also use percpu counters for this.
  	 *
  	 * So since up to half the slots might be on other cpu's percpu counters
  	 * and unavailable, double nr_events so userspace sees what they
  	 * expected: additionally, we move req_batch slots to/from percpu
  	 * counters at a time, so make sure that isn't 0:
  	 */
  	nr_events = max(nr_events, num_possible_cpus() * 4);
  	nr_events *= 2;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
609
610
611
612
613
614
615
  	/* Prevent overflows */
  	if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
  	    (nr_events > (0x10000000U / sizeof(struct kiocb)))) {
  		pr_debug("ENOMEM: nr_events too high
  ");
  		return ERR_PTR(-EINVAL);
  	}
4cd81c3df   Benjamin LaHaise   aio: double aio_m...
616
  	if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
617
  		return ERR_PTR(-EAGAIN);
c37622296   Robert P. J. Day   [PATCH] Transform...
618
  	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
619
620
  	if (!ctx)
  		return ERR_PTR(-ENOMEM);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
621
  	ctx->max_reqs = nr_events;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
622

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
623
  	spin_lock_init(&ctx->ctx_lock);
0460fef2a   Kent Overstreet   aio: use cancella...
624
  	spin_lock_init(&ctx->completion_lock);
58c85dc20   Kent Overstreet   aio: kill struct ...
625
  	mutex_init(&ctx->ring_lock);
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
626
627
628
  	/* Protect against page migration throughout kioctx setup by keeping
  	 * the ring_lock mutex held until setup is complete. */
  	mutex_lock(&ctx->ring_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
629
630
631
  	init_waitqueue_head(&ctx->wait);
  
  	INIT_LIST_HEAD(&ctx->active_reqs);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
632

2aad2a86f   Tejun Heo   percpu_ref: add P...
633
  	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
634
  		goto err;
2aad2a86f   Tejun Heo   percpu_ref: add P...
635
  	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
636
  		goto err;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
637
638
  	ctx->cpu = alloc_percpu(struct kioctx_cpu);
  	if (!ctx->cpu)
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
639
  		goto err;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
640

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
641
642
  	err = aio_setup_ring(ctx);
  	if (err < 0)
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
643
  		goto err;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
644

34e83fc61   Kent Overstreet   aio: reqs_active ...
645
  	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
646
  	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
6878ea72a   Benjamin LaHaise   aio: be defensive...
647
648
  	if (ctx->req_batch < 1)
  		ctx->req_batch = 1;
34e83fc61   Kent Overstreet   aio: reqs_active ...
649

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
650
  	/* limit the number of system wide aios */
9fa1cb397   Al Viro   aio: aio_nr_lock ...
651
  	spin_lock(&aio_nr_lock);
4cd81c3df   Benjamin LaHaise   aio: double aio_m...
652
  	if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
2dd542b7a   Al Viro   aio: aio_nr decre...
653
  	    aio_nr + nr_events < aio_nr) {
9fa1cb397   Al Viro   aio: aio_nr_lock ...
654
  		spin_unlock(&aio_nr_lock);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
655
  		err = -EAGAIN;
d1b943271   Gu Zheng   aio: clean up aio...
656
  		goto err_ctx;
2dd542b7a   Al Viro   aio: aio_nr decre...
657
658
  	}
  	aio_nr += ctx->max_reqs;
9fa1cb397   Al Viro   aio: aio_nr_lock ...
659
  	spin_unlock(&aio_nr_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
660

1881686f8   Benjamin LaHaise   aio: fix kioctx l...
661
662
  	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
  	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */
723be6e39   Kent Overstreet   aio: percpu ioctx...
663

da90382c2   Benjamin LaHaise   aio: fix error ha...
664
665
  	err = ioctx_add_table(ctx, mm);
  	if (err)
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
666
  		goto err_cleanup;
da90382c2   Benjamin LaHaise   aio: fix error ha...
667

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
668
669
  	/* Release the ring_lock mutex now that all setup is complete. */
  	mutex_unlock(&ctx->ring_lock);
caf4167aa   Kent Overstreet   aio: dprintk() ->...
670
671
  	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x
  ",
58c85dc20   Kent Overstreet   aio: kill struct ...
672
  		 ctx, ctx->user_id, mm, ctx->nr_events);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
673
  	return ctx;
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
674
675
  err_cleanup:
  	aio_nr_sub(ctx->max_reqs);
d1b943271   Gu Zheng   aio: clean up aio...
676
677
  err_ctx:
  	aio_free_ring(ctx);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
678
  err:
fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
679
  	mutex_unlock(&ctx->ring_lock);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
680
  	free_percpu(ctx->cpu);
9a1049da9   Tejun Heo   percpu-refcount: ...
681
682
  	percpu_ref_exit(&ctx->reqs);
  	percpu_ref_exit(&ctx->users);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
683
  	kmem_cache_free(kioctx_cachep, ctx);
caf4167aa   Kent Overstreet   aio: dprintk() ->...
684
685
  	pr_debug("error allocating ioctx %d
  ", err);
e23754f88   Al Viro   aio: don't bother...
686
  	return ERR_PTR(err);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
687
  }
36f558890   Kent Overstreet   aio: refcounting ...
688
689
690
691
692
  /* kill_ioctx
   *	Cancels all outstanding aio requests on an aio context.  Used
   *	when the processes owning a context have all exited to encourage
   *	the rapid destruction of the kioctx.
   */
fb2d44838   Benjamin LaHaise   aio: report error...
693
  static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
e02ba72aa   Anatol Pomozov   aio: block io_des...
694
  		struct completion *requests_done)
36f558890   Kent Overstreet   aio: refcounting ...
695
  {
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
696
  	struct kioctx_table *table;
db446a08c   Benjamin LaHaise   aio: convert the ...
697

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
698
699
  	if (atomic_xchg(&ctx->dead, 1))
  		return -EINVAL;
db446a08c   Benjamin LaHaise   aio: convert the ...
700

db446a08c   Benjamin LaHaise   aio: convert the ...
701

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
702
  	spin_lock(&mm->ioctx_lock);
855ef0dec   Oleg Nesterov   aio: kill the mis...
703
  	table = rcu_dereference_raw(mm->ioctx_table);
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
704
705
  	WARN_ON(ctx != table->table[ctx->id]);
  	table->table[ctx->id] = NULL;
fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
706
  	spin_unlock(&mm->ioctx_lock);
4fcc712f5   Kent Overstreet   aio: fix io_destr...
707

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
708
709
  	/* percpu_ref_kill() will do the necessary call_rcu() */
  	wake_up_all(&ctx->wait);
4fcc712f5   Kent Overstreet   aio: fix io_destr...
710

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
711
712
713
714
715
716
717
718
  	/*
  	 * It'd be more correct to do this in free_ioctx(), after all
  	 * the outstanding kiocbs have finished - but by then io_destroy
  	 * has already returned, so io_setup() could potentially return
  	 * -EAGAIN with no ioctxs actually in use (as far as userspace
  	 *  could tell).
  	 */
  	aio_nr_sub(ctx->max_reqs);
4fcc712f5   Kent Overstreet   aio: fix io_destr...
719

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
720
721
  	if (ctx->mmap_size)
  		vm_munmap(ctx->mmap_base, ctx->mmap_size);
fb2d44838   Benjamin LaHaise   aio: report error...
722

fa88b6f88   Benjamin LaHaise   aio: cleanup: fla...
723
724
725
  	ctx->requests_done = requests_done;
  	percpu_ref_kill(&ctx->users);
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
726
727
728
729
730
  }
  
  /* wait_on_sync_kiocb:
   *	Waits on the given sync kiocb to complete.
   */
57282d8fd   Kent Overstreet   aio: Kill ki_users
731
  ssize_t wait_on_sync_kiocb(struct kiocb *req)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
732
  {
57282d8fd   Kent Overstreet   aio: Kill ki_users
733
  	while (!req->ki_ctx) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
734
  		set_current_state(TASK_UNINTERRUPTIBLE);
57282d8fd   Kent Overstreet   aio: Kill ki_users
735
  		if (req->ki_ctx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
736
  			break;
41d10da37   Jeff Moyer   aio: account I/O ...
737
  		io_schedule();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
738
739
  	}
  	__set_current_state(TASK_RUNNING);
57282d8fd   Kent Overstreet   aio: Kill ki_users
740
  	return req->ki_user_data;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
741
  }
385773e04   H Hartley Sweeten   aio.c: move EXPOR...
742
  EXPORT_SYMBOL(wait_on_sync_kiocb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
743

36f558890   Kent Overstreet   aio: refcounting ...
744
745
746
747
748
749
750
  /*
   * exit_aio: called when the last user of mm goes away.  At this point, there is
   * no way for any new requests to be submited or any of the io_* syscalls to be
   * called on the context.
   *
   * There may be outstanding kiocbs, but free_ioctx() will explicitly wait on
   * them.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
751
   */
fc9b52cd8   Harvey Harrison   fs: remove fastca...
752
  void exit_aio(struct mm_struct *mm)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
753
  {
4b70ac5fd   Oleg Nesterov   aio: change exit_...
754
755
  	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
  	int i;
db446a08c   Benjamin LaHaise   aio: convert the ...
756

4b70ac5fd   Oleg Nesterov   aio: change exit_...
757
758
  	if (!table)
  		return;
db446a08c   Benjamin LaHaise   aio: convert the ...
759

4b70ac5fd   Oleg Nesterov   aio: change exit_...
760
761
  	for (i = 0; i < table->nr; ++i) {
  		struct kioctx *ctx = table->table[i];
6098b45b3   Gu Zheng   aio: block exit_a...
762
763
  		struct completion requests_done =
  			COMPLETION_INITIALIZER_ONSTACK(requests_done);
abf137dd7   Jens Axboe   aio: make the loo...
764

4b70ac5fd   Oleg Nesterov   aio: change exit_...
765
766
  		if (!ctx)
  			continue;
936af1576   Al Viro   aio: don't bother...
767
  		/*
4b70ac5fd   Oleg Nesterov   aio: change exit_...
768
769
770
771
772
  		 * We don't need to bother with munmap() here - exit_mmap(mm)
  		 * is coming and it'll unmap everything. And we simply can't,
  		 * this is not necessarily our ->mm.
  		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
  		 * that it needs to unmap the area, just set it to 0.
936af1576   Al Viro   aio: don't bother...
773
  		 */
58c85dc20   Kent Overstreet   aio: kill struct ...
774
  		ctx->mmap_size = 0;
6098b45b3   Gu Zheng   aio: block exit_a...
775
  		kill_ioctx(mm, ctx, &requests_done);
36f558890   Kent Overstreet   aio: refcounting ...
776

6098b45b3   Gu Zheng   aio: block exit_a...
777
778
  		/* Wait until all IO for the context are done. */
  		wait_for_completion(&requests_done);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
779
  	}
4b70ac5fd   Oleg Nesterov   aio: change exit_...
780
781
782
  
  	RCU_INIT_POINTER(mm->ioctx_table, NULL);
  	kfree(table);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
783
  }
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
784
785
786
  static void put_reqs_available(struct kioctx *ctx, unsigned nr)
  {
  	struct kioctx_cpu *kcpu;
263782c1c   Benjamin LaHaise   aio: protect reqs...
787
  	unsigned long flags;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
788

263782c1c   Benjamin LaHaise   aio: protect reqs...
789
  	local_irq_save(flags);
be6fb451a   Benjamin LaHaise   aio: remove no lo...
790
  	kcpu = this_cpu_ptr(ctx->cpu);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
791
  	kcpu->reqs_available += nr;
263782c1c   Benjamin LaHaise   aio: protect reqs...
792

e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
793
794
795
796
  	while (kcpu->reqs_available >= ctx->req_batch * 2) {
  		kcpu->reqs_available -= ctx->req_batch;
  		atomic_add(ctx->req_batch, &ctx->reqs_available);
  	}
263782c1c   Benjamin LaHaise   aio: protect reqs...
797
  	local_irq_restore(flags);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
798
799
800
801
802
803
  }
  
  static bool get_reqs_available(struct kioctx *ctx)
  {
  	struct kioctx_cpu *kcpu;
  	bool ret = false;
263782c1c   Benjamin LaHaise   aio: protect reqs...
804
  	unsigned long flags;
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
805

263782c1c   Benjamin LaHaise   aio: protect reqs...
806
  	local_irq_save(flags);
be6fb451a   Benjamin LaHaise   aio: remove no lo...
807
  	kcpu = this_cpu_ptr(ctx->cpu);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
  	if (!kcpu->reqs_available) {
  		int old, avail = atomic_read(&ctx->reqs_available);
  
  		do {
  			if (avail < ctx->req_batch)
  				goto out;
  
  			old = avail;
  			avail = atomic_cmpxchg(&ctx->reqs_available,
  					       avail, avail - ctx->req_batch);
  		} while (avail != old);
  
  		kcpu->reqs_available += ctx->req_batch;
  	}
  
  	ret = true;
  	kcpu->reqs_available--;
  out:
263782c1c   Benjamin LaHaise   aio: protect reqs...
826
  	local_irq_restore(flags);
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
827
828
  	return ret;
  }
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
  /* refill_reqs_available
   *	Hands back to reqs_available those completed events that no longer
   *	occupy a slot in the completion ring, i.e. ctx->completed_events
   *	minus the number of events currently between @head and @tail.
   *	Called from aio_complete() (optimistic update) and, through
   *	user_refill_reqs_available(), from aio_get_req() when it runs out
   *	of slots.  Must be called holding ctx->completion_lock.
   */
  static void refill_reqs_available(struct kioctx *ctx, unsigned head,
                                    unsigned tail)
  {
  	unsigned in_ring, reclaim;
  
  	/* head lives in memory userland can write to, so clamp it. */
  	head %= ctx->nr_events;
  
  	in_ring = (head <= tail) ? tail - head
  				 : ctx->nr_events - (head - tail);
  
  	reclaim = ctx->completed_events;
  	if (reclaim <= in_ring)
  		return;		/* everything completed is still in the ring */
  	reclaim -= in_ring;
  
  	ctx->completed_events -= reclaim;
  	put_reqs_available(ctx, reclaim);
  }
  
  /* user_refill_reqs_available
   *	Called by aio_get_req() when no request slot could be obtained:
   *	re-reads the ring head and lets refill_reqs_available() give back
   *	the slots of events that have already left the ring.
   */
  static void user_refill_reqs_available(struct kioctx *ctx)
  {
  	struct aio_ring *ring;
  	unsigned head;
  
  	spin_lock_irq(&ctx->completion_lock);
  
  	if (!ctx->completed_events)
  		goto unlock;
  
  	/*
  	 * Reading ring->head may race with aio_read_events_ring(), but that
  	 * is fine: we see either the old value or the new one, and both are
  	 * valid.  What matters is that head cannot pass tail, because
  	 * holding ctx->completion_lock keeps aio_complete() from updating
  	 * tail.  Even if head is invalid, the check against
  	 * ctx->completed_events in refill_reqs_available() makes sure we do
  	 * the safe/right thing.
  	 */
  	ring = kmap_atomic(ctx->ring_pages[0]);
  	head = ring->head;
  	kunmap_atomic(ring);
  
  	refill_reqs_available(ctx, head, ctx->tail);
  
  unlock:
  	spin_unlock_irq(&ctx->completion_lock);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
890
  /* aio_get_req
57282d8fd   Kent Overstreet   aio: Kill ki_users
891
892
   *	Allocate a slot for an aio request.
   * Returns NULL if no requests are free.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
893
   */
a1c8eae75   Kent Overstreet   aio: kill batch a...
894
  static inline struct kiocb *aio_get_req(struct kioctx *ctx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
895
  {
a1c8eae75   Kent Overstreet   aio: kill batch a...
896
  	struct kiocb *req;
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
897
898
899
900
901
  	if (!get_reqs_available(ctx)) {
  		user_refill_reqs_available(ctx);
  		if (!get_reqs_available(ctx))
  			return NULL;
  	}
a1c8eae75   Kent Overstreet   aio: kill batch a...
902

0460fef2a   Kent Overstreet   aio: use cancella...
903
  	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
904
  	if (unlikely(!req))
a1c8eae75   Kent Overstreet   aio: kill batch a...
905
  		goto out_put;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
906

e34ecee2a   Kent Overstreet   aio: Fix a trinit...
907
  	percpu_ref_get(&ctx->reqs);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
908
  	req->ki_ctx = ctx;
080d676de   Jeff Moyer   aio: allocate kio...
909
  	return req;
a1c8eae75   Kent Overstreet   aio: kill batch a...
910
  out_put:
e1bdd5f27   Kent Overstreet   aio: percpu reqs_...
911
  	put_reqs_available(ctx, 1);
a1c8eae75   Kent Overstreet   aio: kill batch a...
912
  	return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
913
  }
11599ebac   Kent Overstreet   aio: make aio_put...
914
  static void kiocb_free(struct kiocb *req)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
915
  {
1d98ebfcc   Kent Overstreet   aio: do fget() af...
916
917
  	if (req->ki_filp)
  		fput(req->ki_filp);
133890103   Davide Libenzi   eventfd: revised ...
918
919
  	if (req->ki_eventfd != NULL)
  		eventfd_ctx_put(req->ki_eventfd);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
920
  	kmem_cache_free(kiocb_cachep, req);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
921
  }
d5470b596   Adrian Bunk   fs/aio.c: make 3 ...
922
  static struct kioctx *lookup_ioctx(unsigned long ctx_id)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
923
  {
db446a08c   Benjamin LaHaise   aio: convert the ...
924
  	struct aio_ring __user *ring  = (void __user *)ctx_id;
abf137dd7   Jens Axboe   aio: make the loo...
925
  	struct mm_struct *mm = current->mm;
65c24491b   Jeff Moyer   aio: lookup_ioctx...
926
  	struct kioctx *ctx, *ret = NULL;
db446a08c   Benjamin LaHaise   aio: convert the ...
927
928
929
930
931
  	struct kioctx_table *table;
  	unsigned id;
  
  	if (get_user(id, &ring->id))
  		return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
932

abf137dd7   Jens Axboe   aio: make the loo...
933
  	rcu_read_lock();
db446a08c   Benjamin LaHaise   aio: convert the ...
934
  	table = rcu_dereference(mm->ioctx_table);
abf137dd7   Jens Axboe   aio: make the loo...
935

db446a08c   Benjamin LaHaise   aio: convert the ...
936
937
  	if (!table || id >= table->nr)
  		goto out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
938

db446a08c   Benjamin LaHaise   aio: convert the ...
939
  	ctx = table->table[id];
f30d704fe   Benjamin LaHaise   aio: table lookup...
940
  	if (ctx && ctx->user_id == ctx_id) {
db446a08c   Benjamin LaHaise   aio: convert the ...
941
942
943
944
  		percpu_ref_get(&ctx->users);
  		ret = ctx;
  	}
  out:
abf137dd7   Jens Axboe   aio: make the loo...
945
  	rcu_read_unlock();
65c24491b   Jeff Moyer   aio: lookup_ioctx...
946
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
947
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
948
949
  /* aio_complete
   *	Called when the io request on the given iocb is complete.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
950
   */
2d68449e8   Kent Overstreet   aio: kill return ...
951
  void aio_complete(struct kiocb *iocb, long res, long res2)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
952
953
  {
  	struct kioctx	*ctx = iocb->ki_ctx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
954
  	struct aio_ring	*ring;
21b40200c   Kent Overstreet   aio: use flush_dc...
955
  	struct io_event	*ev_page, *event;
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
956
  	unsigned tail, pos, head;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
957
  	unsigned long	flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
958

20dcae324   Zach Brown   [PATCH] aio: remo...
959
960
961
962
963
964
  	/*
  	 * Special case handling for sync iocbs:
  	 *  - events go directly into the iocb for fast handling
  	 *  - the sync task with the iocb in its stack holds the single iocb
  	 *    ref, no other paths have a way to get another ref
  	 *  - the sync task helpfully left a reference to itself in the iocb
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
965
966
  	 */
  	if (is_sync_kiocb(iocb)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
967
  		iocb->ki_user_data = res;
57282d8fd   Kent Overstreet   aio: Kill ki_users
968
969
  		smp_wmb();
  		iocb->ki_ctx = ERR_PTR(-EXDEV);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
970
  		wake_up_process(iocb->ki_obj.tsk);
2d68449e8   Kent Overstreet   aio: kill return ...
971
  		return;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
972
  	}
0460fef2a   Kent Overstreet   aio: use cancella...
973
974
975
976
977
978
979
  	if (iocb->ki_list.next) {
  		unsigned long flags;
  
  		spin_lock_irqsave(&ctx->ctx_lock, flags);
  		list_del(&iocb->ki_list);
  		spin_unlock_irqrestore(&ctx->ctx_lock, flags);
  	}
11599ebac   Kent Overstreet   aio: make aio_put...
980

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
981
  	/*
0460fef2a   Kent Overstreet   aio: use cancella...
982
  	 * Add a completion event to the ring buffer. Must be done holding
4b30f07e7   Tang Chen   aio: fix wrong co...
983
  	 * ctx->completion_lock to prevent other code from messing with the tail
0460fef2a   Kent Overstreet   aio: use cancella...
984
985
986
  	 * pointer since we might be called from irq context.
  	 */
  	spin_lock_irqsave(&ctx->completion_lock, flags);
58c85dc20   Kent Overstreet   aio: kill struct ...
987
  	tail = ctx->tail;
21b40200c   Kent Overstreet   aio: use flush_dc...
988
  	pos = tail + AIO_EVENTS_OFFSET;
58c85dc20   Kent Overstreet   aio: kill struct ...
989
  	if (++tail >= ctx->nr_events)
4bf69b2a0   Kenneth W Chen   [PATCH] aio: ring...
990
  		tail = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
991

58c85dc20   Kent Overstreet   aio: kill struct ...
992
  	ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
21b40200c   Kent Overstreet   aio: use flush_dc...
993
  	event = ev_page + pos % AIO_EVENTS_PER_PAGE;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
994
995
996
997
  	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
  	event->data = iocb->ki_user_data;
  	event->res = res;
  	event->res2 = res2;
21b40200c   Kent Overstreet   aio: use flush_dc...
998
  	kunmap_atomic(ev_page);
58c85dc20   Kent Overstreet   aio: kill struct ...
999
  	flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
21b40200c   Kent Overstreet   aio: use flush_dc...
1000
1001
1002
  
  	pr_debug("%p[%u]: %p: %p %Lx %lx %lx
  ",
caf4167aa   Kent Overstreet   aio: dprintk() ->...
1003
1004
  		 ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
  		 res, res2);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1005
1006
1007
1008
1009
  
  	/* after flagging the request as done, we
  	 * must never even look at it again
  	 */
  	smp_wmb();	/* make event visible before updating tail */
58c85dc20   Kent Overstreet   aio: kill struct ...
1010
  	ctx->tail = tail;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1011

58c85dc20   Kent Overstreet   aio: kill struct ...
1012
  	ring = kmap_atomic(ctx->ring_pages[0]);
d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
1013
  	head = ring->head;
21b40200c   Kent Overstreet   aio: use flush_dc...
1014
  	ring->tail = tail;
e8e3c3d66   Cong Wang   fs: remove the se...
1015
  	kunmap_atomic(ring);
58c85dc20   Kent Overstreet   aio: kill struct ...
1016
  	flush_dcache_page(ctx->ring_pages[0]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1017

d856f32a8   Benjamin LaHaise   aio: fix reqs_ava...
1018
1019
1020
  	ctx->completed_events++;
  	if (ctx->completed_events > 1)
  		refill_reqs_available(ctx, head, tail);
0460fef2a   Kent Overstreet   aio: use cancella...
1021
  	spin_unlock_irqrestore(&ctx->completion_lock, flags);
21b40200c   Kent Overstreet   aio: use flush_dc...
1022
1023
  	pr_debug("added to ring %p at [%u]
  ", iocb, tail);
8d1c98b0b   Davide Libenzi   eventfd/kaio inte...
1024
1025
1026
1027
1028
1029
  
  	/*
  	 * Check if the user asked us to deliver the result through an
  	 * eventfd. The eventfd_signal() function is safe to be called
  	 * from IRQ context.
  	 */
87c3a86e1   Davide Libenzi   eventfd: remove f...
1030
  	if (iocb->ki_eventfd != NULL)
8d1c98b0b   Davide Libenzi   eventfd/kaio inte...
1031
  		eventfd_signal(iocb->ki_eventfd, 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1032
  	/* everything turned out well, dispose of the aiocb. */
57282d8fd   Kent Overstreet   aio: Kill ki_users
1033
  	kiocb_free(iocb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1034

6cb2a2104   Quentin Barnes   aio: bad AIO race...
1035
1036
1037
1038
1039
1040
1041
  	/*
  	 * We have to order our ring_info tail store above and test
  	 * of the wait list below outside the wait lock.  This is
  	 * like in wake_up_bit() where clearing a bit has to be
  	 * ordered with the unlocked test.
  	 */
  	smp_mb();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1042
1043
  	if (waitqueue_active(&ctx->wait))
  		wake_up(&ctx->wait);
e34ecee2a   Kent Overstreet   aio: Fix a trinit...
1044
  	percpu_ref_put(&ctx->reqs);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1045
  }
385773e04   H Hartley Sweeten   aio.c: move EXPOR...
1046
  EXPORT_SYMBOL(aio_complete);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1047

2be4e7dee   Gu Zheng   aio: fix some com...
1048
  /* aio_read_events_ring
a31ad380b   Kent Overstreet   aio: make aio_rea...
1049
1050
   *	Pull an event off of the ioctx's event ring.  Returns the number of
   *	events fetched
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1051
   */
a31ad380b   Kent Overstreet   aio: make aio_rea...
1052
1053
  static long aio_read_events_ring(struct kioctx *ctx,
  				 struct io_event __user *event, long nr)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1054
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1055
  	struct aio_ring *ring;
5ffac122d   Kent Overstreet   aio: Don't use ct...
1056
  	unsigned head, tail, pos;
a31ad380b   Kent Overstreet   aio: make aio_rea...
1057
1058
  	long ret = 0;
  	int copy_ret;
58c85dc20   Kent Overstreet   aio: kill struct ...
1059
  	mutex_lock(&ctx->ring_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1060

fa8a53c39   Benjamin LaHaise   aio: v4 ensure ac...
1061
  	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
58c85dc20   Kent Overstreet   aio: kill struct ...
1062
  	ring = kmap_atomic(ctx->ring_pages[0]);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1063
  	head = ring->head;
5ffac122d   Kent Overstreet   aio: Don't use ct...
1064
  	tail = ring->tail;
a31ad380b   Kent Overstreet   aio: make aio_rea...
1065
  	kunmap_atomic(ring);
2ff396be6   Jeff Moyer   aio: add missing ...
1066
1067
1068
1069
1070
  	/*
  	 * Ensure that once we've read the current tail pointer, that
  	 * we also see the events that were stored up to the tail.
  	 */
  	smp_rmb();
5ffac122d   Kent Overstreet   aio: Don't use ct...
1071
1072
  	pr_debug("h%u t%u m%u
  ", head, tail, ctx->nr_events);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1073

5ffac122d   Kent Overstreet   aio: Don't use ct...
1074
  	if (head == tail)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1075
  		goto out;
edfbbf388   Benjamin LaHaise   aio: fix kernel m...
1076
1077
  	head %= ctx->nr_events;
  	tail %= ctx->nr_events;
a31ad380b   Kent Overstreet   aio: make aio_rea...
1078
1079
1080
1081
  	while (ret < nr) {
  		long avail;
  		struct io_event *ev;
  		struct page *page;
5ffac122d   Kent Overstreet   aio: Don't use ct...
1082
1083
  		avail = (head <= tail ?  tail : ctx->nr_events) - head;
  		if (head == tail)
a31ad380b   Kent Overstreet   aio: make aio_rea...
1084
1085
1086
1087
1088
1089
1090
  			break;
  
  		avail = min(avail, nr - ret);
  		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
  			    ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
  
  		pos = head + AIO_EVENTS_OFFSET;
58c85dc20   Kent Overstreet   aio: kill struct ...
1091
  		page = ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE];
a31ad380b   Kent Overstreet   aio: make aio_rea...
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
  		pos %= AIO_EVENTS_PER_PAGE;
  
  		ev = kmap(page);
  		copy_ret = copy_to_user(event + ret, ev + pos,
  					sizeof(*ev) * avail);
  		kunmap(page);
  
  		if (unlikely(copy_ret)) {
  			ret = -EFAULT;
  			goto out;
  		}
  
  		ret += avail;
  		head += avail;
58c85dc20   Kent Overstreet   aio: kill struct ...
1106
  		head %= ctx->nr_events;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1107
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1108

58c85dc20   Kent Overstreet   aio: kill struct ...
1109
  	ring = kmap_atomic(ctx->ring_pages[0]);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1110
  	ring->head = head;
91d80a84b   Zhao Hongjiang   aio: fix possible...
1111
  	kunmap_atomic(ring);
58c85dc20   Kent Overstreet   aio: kill struct ...
1112
  	flush_dcache_page(ctx->ring_pages[0]);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1113

5ffac122d   Kent Overstreet   aio: Don't use ct...
1114
1115
  	pr_debug("%li  h%u t%u
  ", ret, head, tail);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1116
  out:
58c85dc20   Kent Overstreet   aio: kill struct ...
1117
  	mutex_unlock(&ctx->ring_lock);
a31ad380b   Kent Overstreet   aio: make aio_rea...
1118

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1119
1120
  	return ret;
  }
a31ad380b   Kent Overstreet   aio: make aio_rea...
1121
1122
  /* aio_read_events
   *	Wait condition used by read_events(): fetches more events into
   *	@event, starting at offset *@i, and accumulates the running total
   *	in *@i.  If nothing has been collected so far, *@i is set to the
   *	last result instead (0, or a negative error).  A dead context
   *	turns the result into -EINVAL.
   *
   *	Returns true when the wait should end: an error occurred or at
   *	least @min_nr events have been collected.
   */
  static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
  			    struct io_event __user *event, long *i)
  {
  	long got = aio_read_events_ring(ctx, event + *i, nr - *i);
  
  	if (got > 0)
  		*i += got;
  
  	if (unlikely(atomic_read(&ctx->dead)))
  		got = -EINVAL;
  
  	/* Nothing collected yet: report the latest status to the caller. */
  	if (*i == 0)
  		*i = got;
  
  	if (got < 0)
  		return true;
  
  	return *i >= min_nr;
  }
a31ad380b   Kent Overstreet   aio: make aio_rea...
1137
  /* read_events
   *	Collects between @min_nr and @nr events from @ctx into the user
   *	buffer @event, sleeping interruptibly on ctx->wait until enough
   *	have arrived, an error occurs, or the optional @timeout (a user
   *	timespec) expires.  Without a timeout the wait is bounded only by
   *	KTIME_MAX.
   *
   *	Returns the value accumulated by aio_read_events() (an event count
   *	or a negative error), -EFAULT if @timeout cannot be read, or -EINTR
   *	if nothing was collected and a signal is pending.
   */
  static long read_events(struct kioctx *ctx, long min_nr, long nr,
  			struct io_event __user *event,
  			struct timespec __user *timeout)
  {
  	ktime_t deadline = { .tv64 = KTIME_MAX };
  	long collected = 0;
  
  	if (timeout != NULL) {
  		struct timespec user_ts;
  
  		if (unlikely(copy_from_user(&user_ts, timeout,
  					    sizeof(user_ts))))
  			return -EFAULT;
  
  		deadline = timespec_to_ktime(user_ts);
  	}
  
  	/*
  	 * aio_read_events() serves as the wait condition, so it runs after
  	 * prepare_to_wait() has put the task into TASK_INTERRUPTIBLE.
  	 *
  	 * It can block, though, and when it does the task state goes back
  	 * to TASK_RUNNING.  That is acceptable as long as it does not
  	 * return 0 in that state too often, since each such round makes us
  	 * spin.  It only happens when the mutex_lock() call blocks and the
  	 * ringbuffer then turns out to be empty, so in practice we should
  	 * be fine - but keep it in mind when touching this code.
  	 */
  	wait_event_interruptible_hrtimeout(ctx->wait,
  			aio_read_events(ctx, min_nr, nr, event, &collected),
  			deadline);
  
  	if (collected == 0 && signal_pending(current))
  		return -EINTR;
  
  	return collected;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
  /* sys_io_setup:
   *	Create an aio_context capable of receiving at least nr_events.
   *	ctxp must not point to an aio_context that already exists, and
   *	must be initialized to 0 prior to the call.  On successful
   *	creation of the aio_context, *ctxp is filled in with the resulting 
   *	handle.  May fail with -EINVAL if *ctxp is not initialized,
   *	if the specified nr_events exceeds internal limits.  May fail 
   *	with -EAGAIN if the specified nr_events exceeds the user's limit 
   *	of available events.  May fail with -ENOMEM if insufficient kernel
   *	resources are available.  May fail with -EFAULT if an invalid
   *	pointer is passed for ctxp.  Will fail with -ENOSYS if not
   *	implemented.
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1187
  SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
  {
  	struct kioctx *ioctx = NULL;
  	unsigned long ctx;
  	long ret;
  
  	ret = get_user(ctx, ctxp);
  	if (unlikely(ret))
  		goto out;
  
  	ret = -EINVAL;
d55b5fdaf   Zach Brown   [PATCH] aio: remo...
1198
1199
1200
1201
  	if (unlikely(ctx || nr_events == 0)) {
  		pr_debug("EINVAL: io_setup: ctx %lu nr_events %u
  ",
  		         ctx, nr_events);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1202
1203
1204
1205
1206
1207
1208
  		goto out;
  	}
  
  	ioctx = ioctx_alloc(nr_events);
  	ret = PTR_ERR(ioctx);
  	if (!IS_ERR(ioctx)) {
  		ret = put_user(ioctx->user_id, ctxp);
a2e1859ad   Al Viro   aio: take final p...
1209
  		if (ret)
e02ba72aa   Anatol Pomozov   aio: block io_des...
1210
  			kill_ioctx(current->mm, ioctx, NULL);
723be6e39   Kent Overstreet   aio: percpu ioctx...
1211
  		percpu_ref_put(&ioctx->users);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1212
1213
1214
1215
1216
1217
1218
1219
1220
  	}
  
  out:
  	return ret;
  }
  
  /* sys_io_destroy:
   *	Destroy the aio_context specified.  May cancel any outstanding 
   *	AIOs and block on completion.  Will fail with -ENOSYS if not
642b5123a   Satoru Takeuchi   aio: fix wrong su...
1221
   *	implemented.  May fail with -EINVAL if the context pointed to
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1222
1223
   *	is invalid.
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1224
  SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1225
1226
1227
  {
  	struct kioctx *ioctx = lookup_ioctx(ctx);
  	if (likely(NULL != ioctx)) {
e02ba72aa   Anatol Pomozov   aio: block io_des...
1228
1229
  		struct completion requests_done =
  			COMPLETION_INITIALIZER_ONSTACK(requests_done);
fb2d44838   Benjamin LaHaise   aio: report error...
1230
  		int ret;
e02ba72aa   Anatol Pomozov   aio: block io_des...
1231
1232
1233
1234
1235
  
  		/* Pass requests_done to kill_ioctx() where it can be set
  		 * in a thread-safe way. If we try to set it here then we have
  		 * a race condition if two io_destroy() called simultaneously.
  		 */
fb2d44838   Benjamin LaHaise   aio: report error...
1236
  		ret = kill_ioctx(current->mm, ioctx, &requests_done);
723be6e39   Kent Overstreet   aio: percpu ioctx...
1237
  		percpu_ref_put(&ioctx->users);
e02ba72aa   Anatol Pomozov   aio: block io_des...
1238
1239
1240
1241
1242
  
  		/* Wait until all IO for the context are done. Otherwise kernel
  		 * keep using user-space buffers even if user thinks the context
  		 * is destroyed.
  		 */
fb2d44838   Benjamin LaHaise   aio: report error...
1243
1244
  		if (!ret)
  			wait_for_completion(&requests_done);
e02ba72aa   Anatol Pomozov   aio: block io_des...
1245

fb2d44838   Benjamin LaHaise   aio: report error...
1246
  		return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1247
1248
1249
1250
1251
  	}
  	pr_debug("EINVAL: io_destroy: invalid context id
  ");
  	return -EINVAL;
  }
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1252
1253
  /*
   * Signature shared by the ->aio_read and ->aio_write file operations:
   * request, iovec array, number of segments and file position.
   */
  typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
  			    unsigned long, loff_t);
293bc9822   Al Viro   new methods: ->re...
1254
  /* Signature shared by the ->read_iter and ->write_iter file operations. */
  typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1255

8bc92afcf   Kent Overstreet   aio: Kill unneede...
1256
1257
1258
1259
1260
  /*
   * Import a user-supplied iovec array for a vectored request.
   *
   * On entry kiocb->ki_nbytes holds the number of segments; that count is
   * stored in *nr_segs and handed to the uvector copy/check helper (the
   * compat variant when @compat is set and CONFIG_COMPAT is enabled).
   * On success kiocb->ki_nbytes is overwritten with the helper's
   * non-negative result and 0 is returned; a negative helper result is
   * returned unchanged.  *iovec may be updated by the helper.
   */
  static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
  				     int rw, char __user *buf,
  				     unsigned long *nr_segs,
  				     struct iovec **iovec,
  				     bool compat)
  {
  	unsigned long seg_count = kiocb->ki_nbytes;
  	ssize_t total;
  
  	*nr_segs = seg_count;
  
  #ifdef CONFIG_COMPAT
  	if (compat) {
  		total = compat_rw_copy_check_uvector(rw,
  				(struct compat_iovec __user *)buf,
  				seg_count, UIO_FASTIOV, *iovec, iovec);
  	} else
  #endif
  	{
  		total = rw_copy_check_uvector(rw,
  				(struct iovec __user *)buf,
  				seg_count, UIO_FASTIOV, *iovec, iovec);
  	}
  
  	if (total >= 0) {
  		/* ki_nbytes now reflects bytes instead of segments */
  		kiocb->ki_nbytes = total;
  		return 0;
  	}
  
  	return total;
  }
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1282
1283
1284
1285
  /*
   * Describe a plain (non-vectored) user buffer as a one-element iovec.
   *
   * The buffer is checked with access_ok() using !rw as the access type
   * and kiocb->ki_nbytes as the length.  Returns -EFAULT if that check
   * fails; otherwise fills *iovec, sets *nr_segs to 1 and returns 0.
   */
  static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
  				       int rw, char __user *buf,
  				       unsigned long *nr_segs,
  				       struct iovec *iovec)
  {
  	if (likely(access_ok(!rw, buf, kiocb->ki_nbytes))) {
  		*nr_segs = 1;
  		iovec->iov_len = kiocb->ki_nbytes;
  		iovec->iov_base = buf;
  		return 0;
  	}
  
  	return -EFAULT;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1295
  /*
2be4e7dee   Gu Zheng   aio: fix some com...
1296
1297
   * aio_run_iocb:
   *	Performs the initial checks and io submission.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1298
   */
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1299
1300
  static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
  			    char __user *buf, bool compat)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1301
  {
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1302
1303
  	struct file *file = req->ki_filp;
  	ssize_t ret;
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1304
  	unsigned long nr_segs;
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1305
1306
1307
  	int rw;
  	fmode_t mode;
  	aio_rw_op *rw_op;
293bc9822   Al Viro   new methods: ->re...
1308
  	rw_iter_op *iter_op;
00fefb9cf   Gu Zheng   aio: use iovec ar...
1309
  	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
293bc9822   Al Viro   new methods: ->re...
1310
  	struct iov_iter iter;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1311

8bc92afcf   Kent Overstreet   aio: Kill unneede...
1312
  	switch (opcode) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1313
  	case IOCB_CMD_PREAD:
eed4e51fb   Badari Pulavarty   [PATCH] Add vecto...
1314
  	case IOCB_CMD_PREADV:
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1315
1316
1317
  		mode	= FMODE_READ;
  		rw	= READ;
  		rw_op	= file->f_op->aio_read;
293bc9822   Al Viro   new methods: ->re...
1318
  		iter_op	= file->f_op->read_iter;
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1319
1320
1321
  		goto rw_common;
  
  	case IOCB_CMD_PWRITE:
eed4e51fb   Badari Pulavarty   [PATCH] Add vecto...
1322
  	case IOCB_CMD_PWRITEV:
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1323
1324
1325
  		mode	= FMODE_WRITE;
  		rw	= WRITE;
  		rw_op	= file->f_op->aio_write;
293bc9822   Al Viro   new methods: ->re...
1326
  		iter_op	= file->f_op->write_iter;
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1327
1328
1329
1330
  		goto rw_common;
  rw_common:
  		if (unlikely(!(file->f_mode & mode)))
  			return -EBADF;
293bc9822   Al Viro   new methods: ->re...
1331
  		if (!rw_op && !iter_op)
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1332
  			return -EINVAL;
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1333
1334
1335
1336
1337
1338
  		ret = (opcode == IOCB_CMD_PREADV ||
  		       opcode == IOCB_CMD_PWRITEV)
  			? aio_setup_vectored_rw(req, rw, buf, &nr_segs,
  						&iovec, compat)
  			: aio_setup_single_vector(req, rw, buf, &nr_segs,
  						  iovec);
754320d6e   Leon Yu   aio: fix potentia...
1339
1340
  		if (!ret)
  			ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1341
  		if (ret < 0) {
00fefb9cf   Gu Zheng   aio: use iovec ar...
1342
  			if (iovec != inline_vecs)
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1343
  				kfree(iovec);
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1344
  			return ret;
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1345
  		}
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1346
1347
  
  		req->ki_nbytes = ret;
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1348

73a7075e3   Kent Overstreet   aio: Kill aio_rw_...
1349
1350
1351
1352
1353
1354
1355
1356
1357
  		/* XXX: move/kill - rw_verify_area()? */
  		/* This matches the pread()/pwrite() logic */
  		if (req->ki_pos < 0) {
  			ret = -EINVAL;
  			break;
  		}
  
  		if (rw == WRITE)
  			file_start_write(file);
293bc9822   Al Viro   new methods: ->re...
1358
1359
1360
1361
1362
1363
  		if (iter_op) {
  			iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);
  			ret = iter_op(req, &iter);
  		} else {
  			ret = rw_op(req, iovec, nr_segs, req->ki_pos);
  		}
73a7075e3   Kent Overstreet   aio: Kill aio_rw_...
1364
1365
1366
  
  		if (rw == WRITE)
  			file_end_write(file);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1367
  		break;
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1368

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1369
  	case IOCB_CMD_FDSYNC:
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1370
1371
1372
1373
  		if (!file->f_op->aio_fsync)
  			return -EINVAL;
  
  		ret = file->f_op->aio_fsync(req, 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1374
  		break;
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1375

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1376
  	case IOCB_CMD_FSYNC:
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1377
1378
1379
1380
  		if (!file->f_op->aio_fsync)
  			return -EINVAL;
  
  		ret = file->f_op->aio_fsync(req, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1381
  		break;
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1382

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1383
  	default:
caf4167aa   Kent Overstreet   aio: dprintk() ->...
1384
1385
  		pr_debug("EINVAL: no operation provided
  ");
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1386
  		return -EINVAL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1387
  	}
00fefb9cf   Gu Zheng   aio: use iovec ar...
1388
  	if (iovec != inline_vecs)
8bc92afcf   Kent Overstreet   aio: Kill unneede...
1389
  		kfree(iovec);
41ef4eb8e   Kent Overstreet   aio: kill ki_retry
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
  	if (ret != -EIOCBQUEUED) {
  		/*
  		 * There's no easy way to restart the syscall since other AIO's
  		 * may be already running. Just fail this IO with EINTR.
  		 */
  		if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
  			     ret == -ERESTARTNOHAND ||
  			     ret == -ERESTART_RESTARTBLOCK))
  			ret = -EINTR;
  		aio_complete(req, ret, 0);
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1401
1402
1403
  
  	return 0;
  }
d5470b596   Adrian Bunk   fs/aio.c: make 3 ...
1404
  /*
   * io_submit_one:
   *	Validate one iocb and hand it to aio_run_iocb().
   *
   *	@ctx:       context the request is submitted to.
   *	@user_iocb: userspace address of the iocb; KIOCB_KEY is written to
   *	            its aio_key field and the address is recorded in the
   *	            request.
   *	@iocb:      kernel copy of *user_iocb.
   *	@compat:    passed through to aio_run_iocb().
   *
   *	Returns 0 if aio_run_iocb() accepted the request.  Otherwise
   *	returns a negative errno: -EINVAL for set reserved fields or
   *	values that do not fit the native types, -EAGAIN if no request
   *	could be obtained from aio_get_req(), -EBADF if aio_fildes is not
   *	an open file, or the error from eventfd_ctx_fdget(), put_user() or
   *	aio_run_iocb().  On any failure after aio_get_req() the request is
   *	released again via the out_put_req path.
   */
  static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
  			 struct iocb *iocb, bool compat)
  {
  	struct kiocb *req;
  	ssize_t ret;
  
  	/* enforce forwards compatibility on users */
  	if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
  		pr_debug("EINVAL: reserve field set\n");
  		return -EINVAL;
  	}
  
  	/* prevent overflows */
  	if (unlikely(
  	    (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
  	    (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
  	    ((ssize_t)iocb->aio_nbytes < 0)
  	   )) {
  		pr_debug("EINVAL: io_submit: overflow check\n");
  		return -EINVAL;
  	}
  
  	req = aio_get_req(ctx);
  	if (unlikely(!req))
  		return -EAGAIN;
  
  	req->ki_filp = fget(iocb->aio_fildes);
  	if (unlikely(!req->ki_filp)) {
  		ret = -EBADF;
  		goto out_put_req;
  	}
  
  	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
  		/*
  		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
  		 * instance of the file* now. The file descriptor must be
  		 * an eventfd() fd, and will be signaled for each completed
  		 * event using the eventfd_signal() function.
  		 */
  		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
  		if (IS_ERR(req->ki_eventfd)) {
  			ret = PTR_ERR(req->ki_eventfd);
  			/* Do not leave an ERR_PTR behind for the cleanup path. */
  			req->ki_eventfd = NULL;
  			goto out_put_req;
  		}
  	}
  
  	ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
  	if (unlikely(ret)) {
  		pr_debug("EFAULT: aio_key\n");
  		goto out_put_req;
  	}
  
  	req->ki_obj.user = user_iocb;
  	req->ki_user_data = iocb->aio_data;
  	req->ki_pos = iocb->aio_offset;
  	req->ki_nbytes = iocb->aio_nbytes;
  
  	ret = aio_run_iocb(req, iocb->aio_lio_opcode,
  			   (char __user *)(unsigned long)iocb->aio_buf,
  			   compat);
  	if (ret)
  		goto out_put_req;
  
  	return 0;
  out_put_req:
  	/* NOTE(review): presumably undoes what aio_get_req() acquired - confirm. */
  	put_reqs_available(ctx, 1);
  	percpu_ref_put(&ctx->reqs);
  	kiocb_free(req);
  	return ret;
  }
9d85cba71   Jeff Moyer   aio: fix the comp...
1476
1477
  /*
   * do_io_submit:
   *	Common implementation behind io_submit().
   *
   *	@ctx_id: handle of the target aio context.
   *	@nr:     number of entries in @iocbpp; values above
   *	         LONG_MAX/sizeof(*iocbpp) are clamped to that bound.
   *	@iocbpp: user array of user pointers to iocbs.
   *	@compat: passed through to io_submit_one().
   *
   *	Submission stops at the first iocb that fails.  Returns the number
   *	of iocbs submitted if that is non-zero; otherwise the error of the
   *	first iocb, or 0 when @nr is 0.  Returns -EINVAL for a negative
   *	@nr or an unknown @ctx_id, and -EFAULT if @iocbpp fails access_ok().
   */
  long do_io_submit(aio_context_t ctx_id, long nr,
  		  struct iocb __user *__user *iocbpp, bool compat)
  {
  	struct kioctx *ctx;
  	long ret = 0;
  	/* Same width as nr, so the counter cannot overflow before reaching it. */
  	long i = 0;
  	struct blk_plug plug;
  
  	if (unlikely(nr < 0))
  		return -EINVAL;
  
  	/* Clamp nr so that nr * sizeof(*iocbpp) below stays within LONG_MAX. */
  	if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
  		nr = LONG_MAX/sizeof(*iocbpp);
  
  	if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
  		return -EFAULT;
  
  	ctx = lookup_ioctx(ctx_id);
  	if (unlikely(!ctx)) {
  		pr_debug("EINVAL: invalid context id\n");
  		return -EINVAL;
  	}
  
  	blk_start_plug(&plug);
  
  	/*
  	 * AKPM: should this return a partial result if some of the IOs were
  	 * successfully submitted?
  	 */
  	for (i = 0; i < nr; i++) {
  		struct iocb __user *user_iocb;
  		struct iocb tmp;
  
  		/* The whole array passed access_ok() above, hence __get_user(). */
  		if (unlikely(__get_user(user_iocb, iocbpp + i))) {
  			ret = -EFAULT;
  			break;
  		}
  
  		if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {
  			ret = -EFAULT;
  			break;
  		}
  
  		ret = io_submit_one(ctx, user_iocb, &tmp, compat);
  		if (ret)
  			break;
  	}
  	blk_finish_plug(&plug);
  
  	percpu_ref_put(&ctx->users);
  	return i ? i : ret;
  }
9d85cba71   Jeff Moyer   aio: fix the comp...
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
  /* sys_io_submit:
   *	Queue the nr iocbs pointed to by iocbpp for processing and return
   *	the number of iocbs queued.  Will return 0 if nr is 0.
   *
   *	May return -EINVAL if the aio_context specified by ctx_id is
   *	invalid, if nr is < 0, if the iocb at *iocbpp[0] is not properly
   *	initialized, or if the operation specified is invalid for the file
   *	descriptor in the iocb.  May fail with -EFAULT if any of the data
   *	structures point to invalid data.  May fail with -EBADF if the
   *	file descriptor specified in the first iocb is invalid.  May fail
   *	with -EAGAIN if insufficient resources are available to queue any
   *	iocbs.  Will fail with -ENOSYS if not implemented.
   *
   *	This is the native entry point: it calls do_io_submit() with the
   *	compat flag cleared.
   */
  SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
  		struct iocb __user * __user *, iocbpp)
  {
  	return do_io_submit(ctx_id, nr, iocbpp, false);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1541
1542
  /* lookup_kiocb
   *	Finds a given iocb for cancellation.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1543
   */
25ee7e383   Adrian Bunk   [PATCH] fs/aio.c:...
1544
1545
  static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
  				  u32 key)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1546
1547
  {
  	struct list_head *pos;
d00689af6   Zach Brown   [PATCH] aio: repl...
1548
1549
  
  	assert_spin_locked(&ctx->ctx_lock);
8a6608907   Kent Overstreet   aio: kill ki_key
1550
1551
  	if (key != KIOCB_KEY)
  		return NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1552
1553
1554
  	/* TODO: use a hash or array, this sucks. */
  	list_for_each(pos, &ctx->active_reqs) {
  		struct kiocb *kiocb = list_kiocb(pos);
8a6608907   Kent Overstreet   aio: kill ki_key
1555
  		if (kiocb->ki_obj.user == iocb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
  			return kiocb;
  	}
  	return NULL;
  }
  
  /* sys_io_cancel:
   *	Attempts to cancel an iocb previously passed to io_submit.  If
   *	the operation is successfully cancelled, the resulting event is
   *	copied into the memory pointed to by result without being placed
   *	into the completion queue and 0 is returned.  May fail with
   *	-EFAULT if any of the data structures pointed to are invalid.
   *	May fail with -EINVAL if aio_context specified by ctx_id is
   *	invalid.  May fail with -EAGAIN if the iocb specified was not
   *	cancelled.  Will fail with -ENOSYS if not implemented.
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1571
1572
  SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
  		struct io_event __user *, result)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1573
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
  	struct kioctx *ctx;
  	struct kiocb *kiocb;
  	u32 key;
  	int ret;
  
  	ret = get_user(key, &iocb->aio_key);
  	if (unlikely(ret))
  		return -EFAULT;
  
  	ctx = lookup_ioctx(ctx_id);
  	if (unlikely(!ctx))
  		return -EINVAL;
  
  	spin_lock_irq(&ctx->ctx_lock);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1588

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1589
  	kiocb = lookup_kiocb(ctx, iocb, key);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1590
  	if (kiocb)
d52a8f9ea   Fabian Frederick   fs/aio.c: Remove ...
1591
  		ret = kiocb_cancel(kiocb);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1592
1593
  	else
  		ret = -EINVAL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1594
  	spin_unlock_irq(&ctx->ctx_lock);
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1595
  	if (!ret) {
bec68faaf   Kent Overstreet   aio: io_cancel() ...
1596
1597
1598
1599
  		/*
  		 * The result argument is no longer used - the io_event is
  		 * always delivered via the ring buffer. -EINPROGRESS indicates
  		 * cancellation is progress:
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1600
  		 */
bec68faaf   Kent Overstreet   aio: io_cancel() ...
1601
  		ret = -EINPROGRESS;
906b973cf   Kent Overstreet   aio: add kiocb_ca...
1602
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1603

723be6e39   Kent Overstreet   aio: percpu ioctx...
1604
  	percpu_ref_put(&ctx->users);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1605
1606
1607
1608
1609
1610
  
  	return ret;
  }
  
  /* io_getevents:
   *	Attempts to read at least min_nr events and up to nr events from
642b5123a   Satoru Takeuchi   aio: fix wrong su...
1611
1612
1613
1614
1615
1616
1617
1618
   *	the completion queue for the aio_context specified by ctx_id. If
   *	it succeeds, the number of read events is returned. May fail with
   *	-EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
   *	out of range, if timeout is out of range.  May fail with -EFAULT
   *	if any of the memory specified is invalid.  May return 0 or
   *	< min_nr if the timeout specified by timeout has elapsed
   *	before sufficient events are available, where timeout == NULL
   *	specifies an infinite timeout. Note that the timeout pointed to by
6900807c6   Jeff Moyer   aio: fix io_getev...
1619
   *	timeout is relative.  Will fail with -ENOSYS if not implemented.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1620
   */
002c8976e   Heiko Carstens   [CVE-2009-0029] S...
1621
1622
1623
1624
1625
  SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
  		long, min_nr,
  		long, nr,
  		struct io_event __user *, events,
  		struct timespec __user *, timeout)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1626
1627
1628
1629
1630
  {
  	struct kioctx *ioctx = lookup_ioctx(ctx_id);
  	long ret = -EINVAL;
  
  	if (likely(ioctx)) {
2e4102559   Namhyung Kim   aio: remove unnec...
1631
  		if (likely(min_nr <= nr && min_nr >= 0))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1632
  			ret = read_events(ioctx, min_nr, nr, events, timeout);
723be6e39   Kent Overstreet   aio: percpu ioctx...
1633
  		percpu_ref_put(&ioctx->users);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1634
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1635
1636
  	return ret;
  }