Commit a0c42bac79731276c9b2f28d54f9e658fcf843a2

Authored by Jan Kara
Committed by Linus Torvalds
1 parent d1908362ae

aio: do not return ERESTARTSYS as a result of AIO

OCFS2 can return ERESTARTSYS from its write function when the process is
signalled while waiting for a cluster lock (and the filesystem is mounted
with the intr mount option).  Generally, it seems reasonable to allow
filesystems to return this error code from their IO functions.  Since we
must not leak ERESTARTSYS (and similar error codes) to userspace as the
result of an AIO operation, we have to convert it to EINTR inside the AIO
code (restarting the syscall isn't really an option because other AIOs may
already have been submitted by the same io_submit syscall).

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Zach Brown <zach.brown@oracle.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 9 additions and 1 deletion (inline diff)
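
For quick reference, the changed region of aio_run_iocb() as it reads after
this patch, reconstructed from the inline diff below (context trimmed):

	ret = retry(iocb);

	if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
		/*
		 * There's no easy way to restart the syscall since other AIO's
		 * may be already running. Just fail this IO with EINTR.
		 */
		if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
			     ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
			ret = -EINTR;
		aio_complete(iocb, ret, 0);
	}
out:
	spin_lock_irq(&ctx->ctx_lock);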

1 /* 1 /*
2 * An async IO implementation for Linux 2 * An async IO implementation for Linux
3 * Written by Benjamin LaHaise <bcrl@kvack.org> 3 * Written by Benjamin LaHaise <bcrl@kvack.org>
4 * 4 *
5 * Implements an efficient asynchronous io interface. 5 * Implements an efficient asynchronous io interface.
6 * 6 *
7 * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. 7 * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved.
8 * 8 *
9 * See ../COPYING for licensing terms. 9 * See ../COPYING for licensing terms.
10 */ 10 */
11 #include <linux/kernel.h> 11 #include <linux/kernel.h>
12 #include <linux/init.h> 12 #include <linux/init.h>
13 #include <linux/errno.h> 13 #include <linux/errno.h>
14 #include <linux/time.h> 14 #include <linux/time.h>
15 #include <linux/aio_abi.h> 15 #include <linux/aio_abi.h>
16 #include <linux/module.h> 16 #include <linux/module.h>
17 #include <linux/syscalls.h> 17 #include <linux/syscalls.h>
18 #include <linux/backing-dev.h> 18 #include <linux/backing-dev.h>
19 #include <linux/uio.h> 19 #include <linux/uio.h>
20 20
21 #define DEBUG 0 21 #define DEBUG 0
22 22
23 #include <linux/sched.h> 23 #include <linux/sched.h>
24 #include <linux/fs.h> 24 #include <linux/fs.h>
25 #include <linux/file.h> 25 #include <linux/file.h>
26 #include <linux/mm.h> 26 #include <linux/mm.h>
27 #include <linux/mman.h> 27 #include <linux/mman.h>
28 #include <linux/mmu_context.h> 28 #include <linux/mmu_context.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/timer.h> 30 #include <linux/timer.h>
31 #include <linux/aio.h> 31 #include <linux/aio.h>
32 #include <linux/highmem.h> 32 #include <linux/highmem.h>
33 #include <linux/workqueue.h> 33 #include <linux/workqueue.h>
34 #include <linux/security.h> 34 #include <linux/security.h>
35 #include <linux/eventfd.h> 35 #include <linux/eventfd.h>
36 #include <linux/blkdev.h> 36 #include <linux/blkdev.h>
37 #include <linux/mempool.h> 37 #include <linux/mempool.h>
38 #include <linux/hash.h> 38 #include <linux/hash.h>
39 #include <linux/compat.h> 39 #include <linux/compat.h>
40 40
41 #include <asm/kmap_types.h> 41 #include <asm/kmap_types.h>
42 #include <asm/uaccess.h> 42 #include <asm/uaccess.h>
43 43
44 #if DEBUG > 1 44 #if DEBUG > 1
45 #define dprintk printk 45 #define dprintk printk
46 #else 46 #else
47 #define dprintk(x...) do { ; } while (0) 47 #define dprintk(x...) do { ; } while (0)
48 #endif 48 #endif
49 49
50 /*------ sysctl variables----*/ 50 /*------ sysctl variables----*/
51 static DEFINE_SPINLOCK(aio_nr_lock); 51 static DEFINE_SPINLOCK(aio_nr_lock);
52 unsigned long aio_nr; /* current system wide number of aio requests */ 52 unsigned long aio_nr; /* current system wide number of aio requests */
53 unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ 53 unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
54 /*----end sysctl variables---*/ 54 /*----end sysctl variables---*/
55 55
56 static struct kmem_cache *kiocb_cachep; 56 static struct kmem_cache *kiocb_cachep;
57 static struct kmem_cache *kioctx_cachep; 57 static struct kmem_cache *kioctx_cachep;
58 58
59 static struct workqueue_struct *aio_wq; 59 static struct workqueue_struct *aio_wq;
60 60
61 /* Used for rare fput completion. */ 61 /* Used for rare fput completion. */
62 static void aio_fput_routine(struct work_struct *); 62 static void aio_fput_routine(struct work_struct *);
63 static DECLARE_WORK(fput_work, aio_fput_routine); 63 static DECLARE_WORK(fput_work, aio_fput_routine);
64 64
65 static DEFINE_SPINLOCK(fput_lock); 65 static DEFINE_SPINLOCK(fput_lock);
66 static LIST_HEAD(fput_head); 66 static LIST_HEAD(fput_head);
67 67
68 #define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */ 68 #define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */
69 #define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS) 69 #define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS)
70 struct aio_batch_entry { 70 struct aio_batch_entry {
71 struct hlist_node list; 71 struct hlist_node list;
72 struct address_space *mapping; 72 struct address_space *mapping;
73 }; 73 };
74 mempool_t *abe_pool; 74 mempool_t *abe_pool;
75 75
76 static void aio_kick_handler(struct work_struct *); 76 static void aio_kick_handler(struct work_struct *);
77 static void aio_queue_work(struct kioctx *); 77 static void aio_queue_work(struct kioctx *);
78 78
79 /* aio_setup 79 /* aio_setup
80 * Creates the slab caches used by the aio routines, panic on 80 * Creates the slab caches used by the aio routines, panic on
81 * failure as this is done early during the boot sequence. 81 * failure as this is done early during the boot sequence.
82 */ 82 */
83 static int __init aio_setup(void) 83 static int __init aio_setup(void)
84 { 84 {
85 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 85 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
86 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); 86 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
87 87
88 aio_wq = create_workqueue("aio"); 88 aio_wq = create_workqueue("aio");
89 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); 89 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
90 BUG_ON(!abe_pool); 90 BUG_ON(!abe_pool);
91 91
92 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); 92 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
93 93
94 return 0; 94 return 0;
95 } 95 }
96 __initcall(aio_setup); 96 __initcall(aio_setup);
97 97
98 static void aio_free_ring(struct kioctx *ctx) 98 static void aio_free_ring(struct kioctx *ctx)
99 { 99 {
100 struct aio_ring_info *info = &ctx->ring_info; 100 struct aio_ring_info *info = &ctx->ring_info;
101 long i; 101 long i;
102 102
103 for (i=0; i<info->nr_pages; i++) 103 for (i=0; i<info->nr_pages; i++)
104 put_page(info->ring_pages[i]); 104 put_page(info->ring_pages[i]);
105 105
106 if (info->mmap_size) { 106 if (info->mmap_size) {
107 down_write(&ctx->mm->mmap_sem); 107 down_write(&ctx->mm->mmap_sem);
108 do_munmap(ctx->mm, info->mmap_base, info->mmap_size); 108 do_munmap(ctx->mm, info->mmap_base, info->mmap_size);
109 up_write(&ctx->mm->mmap_sem); 109 up_write(&ctx->mm->mmap_sem);
110 } 110 }
111 111
112 if (info->ring_pages && info->ring_pages != info->internal_pages) 112 if (info->ring_pages && info->ring_pages != info->internal_pages)
113 kfree(info->ring_pages); 113 kfree(info->ring_pages);
114 info->ring_pages = NULL; 114 info->ring_pages = NULL;
115 info->nr = 0; 115 info->nr = 0;
116 } 116 }
117 117
118 static int aio_setup_ring(struct kioctx *ctx) 118 static int aio_setup_ring(struct kioctx *ctx)
119 { 119 {
120 struct aio_ring *ring; 120 struct aio_ring *ring;
121 struct aio_ring_info *info = &ctx->ring_info; 121 struct aio_ring_info *info = &ctx->ring_info;
122 unsigned nr_events = ctx->max_reqs; 122 unsigned nr_events = ctx->max_reqs;
123 unsigned long size; 123 unsigned long size;
124 int nr_pages; 124 int nr_pages;
125 125
126 /* Compensate for the ring buffer's head/tail overlap entry */ 126 /* Compensate for the ring buffer's head/tail overlap entry */
127 nr_events += 2; /* 1 is required, 2 for good luck */ 127 nr_events += 2; /* 1 is required, 2 for good luck */
128 128
129 size = sizeof(struct aio_ring); 129 size = sizeof(struct aio_ring);
130 size += sizeof(struct io_event) * nr_events; 130 size += sizeof(struct io_event) * nr_events;
131 nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; 131 nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
132 132
133 if (nr_pages < 0) 133 if (nr_pages < 0)
134 return -EINVAL; 134 return -EINVAL;
135 135
136 nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); 136 nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
137 137
138 info->nr = 0; 138 info->nr = 0;
139 info->ring_pages = info->internal_pages; 139 info->ring_pages = info->internal_pages;
140 if (nr_pages > AIO_RING_PAGES) { 140 if (nr_pages > AIO_RING_PAGES) {
141 info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); 141 info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
142 if (!info->ring_pages) 142 if (!info->ring_pages)
143 return -ENOMEM; 143 return -ENOMEM;
144 } 144 }
145 145
146 info->mmap_size = nr_pages * PAGE_SIZE; 146 info->mmap_size = nr_pages * PAGE_SIZE;
147 dprintk("attempting mmap of %lu bytes\n", info->mmap_size); 147 dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
148 down_write(&ctx->mm->mmap_sem); 148 down_write(&ctx->mm->mmap_sem);
149 info->mmap_base = do_mmap(NULL, 0, info->mmap_size, 149 info->mmap_base = do_mmap(NULL, 0, info->mmap_size,
150 PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, 150 PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE,
151 0); 151 0);
152 if (IS_ERR((void *)info->mmap_base)) { 152 if (IS_ERR((void *)info->mmap_base)) {
153 up_write(&ctx->mm->mmap_sem); 153 up_write(&ctx->mm->mmap_sem);
154 info->mmap_size = 0; 154 info->mmap_size = 0;
155 aio_free_ring(ctx); 155 aio_free_ring(ctx);
156 return -EAGAIN; 156 return -EAGAIN;
157 } 157 }
158 158
159 dprintk("mmap address: 0x%08lx\n", info->mmap_base); 159 dprintk("mmap address: 0x%08lx\n", info->mmap_base);
160 info->nr_pages = get_user_pages(current, ctx->mm, 160 info->nr_pages = get_user_pages(current, ctx->mm,
161 info->mmap_base, nr_pages, 161 info->mmap_base, nr_pages,
162 1, 0, info->ring_pages, NULL); 162 1, 0, info->ring_pages, NULL);
163 up_write(&ctx->mm->mmap_sem); 163 up_write(&ctx->mm->mmap_sem);
164 164
165 if (unlikely(info->nr_pages != nr_pages)) { 165 if (unlikely(info->nr_pages != nr_pages)) {
166 aio_free_ring(ctx); 166 aio_free_ring(ctx);
167 return -EAGAIN; 167 return -EAGAIN;
168 } 168 }
169 169
170 ctx->user_id = info->mmap_base; 170 ctx->user_id = info->mmap_base;
171 171
172 info->nr = nr_events; /* trusted copy */ 172 info->nr = nr_events; /* trusted copy */
173 173
174 ring = kmap_atomic(info->ring_pages[0], KM_USER0); 174 ring = kmap_atomic(info->ring_pages[0], KM_USER0);
175 ring->nr = nr_events; /* user copy */ 175 ring->nr = nr_events; /* user copy */
176 ring->id = ctx->user_id; 176 ring->id = ctx->user_id;
177 ring->head = ring->tail = 0; 177 ring->head = ring->tail = 0;
178 ring->magic = AIO_RING_MAGIC; 178 ring->magic = AIO_RING_MAGIC;
179 ring->compat_features = AIO_RING_COMPAT_FEATURES; 179 ring->compat_features = AIO_RING_COMPAT_FEATURES;
180 ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; 180 ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
181 ring->header_length = sizeof(struct aio_ring); 181 ring->header_length = sizeof(struct aio_ring);
182 kunmap_atomic(ring, KM_USER0); 182 kunmap_atomic(ring, KM_USER0);
183 183
184 return 0; 184 return 0;
185 } 185 }
186 186
187 187
188 /* aio_ring_event: returns a pointer to the event at the given index from 188 /* aio_ring_event: returns a pointer to the event at the given index from
189 * kmap_atomic(, km). Release the pointer with put_aio_ring_event(); 189 * kmap_atomic(, km). Release the pointer with put_aio_ring_event();
190 */ 190 */
191 #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) 191 #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event))
192 #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) 192 #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
193 #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) 193 #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
194 194
195 #define aio_ring_event(info, nr, km) ({ \ 195 #define aio_ring_event(info, nr, km) ({ \
196 unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ 196 unsigned pos = (nr) + AIO_EVENTS_OFFSET; \
197 struct io_event *__event; \ 197 struct io_event *__event; \
198 __event = kmap_atomic( \ 198 __event = kmap_atomic( \
199 (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ 199 (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \
200 __event += pos % AIO_EVENTS_PER_PAGE; \ 200 __event += pos % AIO_EVENTS_PER_PAGE; \
201 __event; \ 201 __event; \
202 }) 202 })
203 203
204 #define put_aio_ring_event(event, km) do { \ 204 #define put_aio_ring_event(event, km) do { \
205 struct io_event *__event = (event); \ 205 struct io_event *__event = (event); \
206 (void)__event; \ 206 (void)__event; \
207 kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ 207 kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
208 } while(0) 208 } while(0)
209 209
210 static void ctx_rcu_free(struct rcu_head *head) 210 static void ctx_rcu_free(struct rcu_head *head)
211 { 211 {
212 struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); 212 struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
213 unsigned nr_events = ctx->max_reqs; 213 unsigned nr_events = ctx->max_reqs;
214 214
215 kmem_cache_free(kioctx_cachep, ctx); 215 kmem_cache_free(kioctx_cachep, ctx);
216 216
217 if (nr_events) { 217 if (nr_events) {
218 spin_lock(&aio_nr_lock); 218 spin_lock(&aio_nr_lock);
219 BUG_ON(aio_nr - nr_events > aio_nr); 219 BUG_ON(aio_nr - nr_events > aio_nr);
220 aio_nr -= nr_events; 220 aio_nr -= nr_events;
221 spin_unlock(&aio_nr_lock); 221 spin_unlock(&aio_nr_lock);
222 } 222 }
223 } 223 }
224 224
225 /* __put_ioctx 225 /* __put_ioctx
226 * Called when the last user of an aio context has gone away, 226 * Called when the last user of an aio context has gone away,
227 * and the struct needs to be freed. 227 * and the struct needs to be freed.
228 */ 228 */
229 static void __put_ioctx(struct kioctx *ctx) 229 static void __put_ioctx(struct kioctx *ctx)
230 { 230 {
231 BUG_ON(ctx->reqs_active); 231 BUG_ON(ctx->reqs_active);
232 232
233 cancel_delayed_work(&ctx->wq); 233 cancel_delayed_work(&ctx->wq);
234 cancel_work_sync(&ctx->wq.work); 234 cancel_work_sync(&ctx->wq.work);
235 aio_free_ring(ctx); 235 aio_free_ring(ctx);
236 mmdrop(ctx->mm); 236 mmdrop(ctx->mm);
237 ctx->mm = NULL; 237 ctx->mm = NULL;
238 pr_debug("__put_ioctx: freeing %p\n", ctx); 238 pr_debug("__put_ioctx: freeing %p\n", ctx);
239 call_rcu(&ctx->rcu_head, ctx_rcu_free); 239 call_rcu(&ctx->rcu_head, ctx_rcu_free);
240 } 240 }
241 241
242 #define get_ioctx(kioctx) do { \ 242 #define get_ioctx(kioctx) do { \
243 BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ 243 BUG_ON(atomic_read(&(kioctx)->users) <= 0); \
244 atomic_inc(&(kioctx)->users); \ 244 atomic_inc(&(kioctx)->users); \
245 } while (0) 245 } while (0)
246 #define put_ioctx(kioctx) do { \ 246 #define put_ioctx(kioctx) do { \
247 BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ 247 BUG_ON(atomic_read(&(kioctx)->users) <= 0); \
248 if (unlikely(atomic_dec_and_test(&(kioctx)->users))) \ 248 if (unlikely(atomic_dec_and_test(&(kioctx)->users))) \
249 __put_ioctx(kioctx); \ 249 __put_ioctx(kioctx); \
250 } while (0) 250 } while (0)
251 251
252 /* ioctx_alloc 252 /* ioctx_alloc
253 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. 253 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
254 */ 254 */
255 static struct kioctx *ioctx_alloc(unsigned nr_events) 255 static struct kioctx *ioctx_alloc(unsigned nr_events)
256 { 256 {
257 struct mm_struct *mm; 257 struct mm_struct *mm;
258 struct kioctx *ctx; 258 struct kioctx *ctx;
259 int did_sync = 0; 259 int did_sync = 0;
260 260
261 /* Prevent overflows */ 261 /* Prevent overflows */
262 if ((nr_events > (0x10000000U / sizeof(struct io_event))) || 262 if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
263 (nr_events > (0x10000000U / sizeof(struct kiocb)))) { 263 (nr_events > (0x10000000U / sizeof(struct kiocb)))) {
264 pr_debug("ENOMEM: nr_events too high\n"); 264 pr_debug("ENOMEM: nr_events too high\n");
265 return ERR_PTR(-EINVAL); 265 return ERR_PTR(-EINVAL);
266 } 266 }
267 267
268 if ((unsigned long)nr_events > aio_max_nr) 268 if ((unsigned long)nr_events > aio_max_nr)
269 return ERR_PTR(-EAGAIN); 269 return ERR_PTR(-EAGAIN);
270 270
271 ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); 271 ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
272 if (!ctx) 272 if (!ctx)
273 return ERR_PTR(-ENOMEM); 273 return ERR_PTR(-ENOMEM);
274 274
275 ctx->max_reqs = nr_events; 275 ctx->max_reqs = nr_events;
276 mm = ctx->mm = current->mm; 276 mm = ctx->mm = current->mm;
277 atomic_inc(&mm->mm_count); 277 atomic_inc(&mm->mm_count);
278 278
279 atomic_set(&ctx->users, 1); 279 atomic_set(&ctx->users, 1);
280 spin_lock_init(&ctx->ctx_lock); 280 spin_lock_init(&ctx->ctx_lock);
281 spin_lock_init(&ctx->ring_info.ring_lock); 281 spin_lock_init(&ctx->ring_info.ring_lock);
282 init_waitqueue_head(&ctx->wait); 282 init_waitqueue_head(&ctx->wait);
283 283
284 INIT_LIST_HEAD(&ctx->active_reqs); 284 INIT_LIST_HEAD(&ctx->active_reqs);
285 INIT_LIST_HEAD(&ctx->run_list); 285 INIT_LIST_HEAD(&ctx->run_list);
286 INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler); 286 INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler);
287 287
288 if (aio_setup_ring(ctx) < 0) 288 if (aio_setup_ring(ctx) < 0)
289 goto out_freectx; 289 goto out_freectx;
290 290
291 /* limit the number of system wide aios */ 291 /* limit the number of system wide aios */
292 do { 292 do {
293 spin_lock_bh(&aio_nr_lock); 293 spin_lock_bh(&aio_nr_lock);
294 if (aio_nr + nr_events > aio_max_nr || 294 if (aio_nr + nr_events > aio_max_nr ||
295 aio_nr + nr_events < aio_nr) 295 aio_nr + nr_events < aio_nr)
296 ctx->max_reqs = 0; 296 ctx->max_reqs = 0;
297 else 297 else
298 aio_nr += ctx->max_reqs; 298 aio_nr += ctx->max_reqs;
299 spin_unlock_bh(&aio_nr_lock); 299 spin_unlock_bh(&aio_nr_lock);
300 if (ctx->max_reqs || did_sync) 300 if (ctx->max_reqs || did_sync)
301 break; 301 break;
302 302
303 /* wait for rcu callbacks to have completed before giving up */ 303 /* wait for rcu callbacks to have completed before giving up */
304 synchronize_rcu(); 304 synchronize_rcu();
305 did_sync = 1; 305 did_sync = 1;
306 ctx->max_reqs = nr_events; 306 ctx->max_reqs = nr_events;
307 } while (1); 307 } while (1);
308 308
309 if (ctx->max_reqs == 0) 309 if (ctx->max_reqs == 0)
310 goto out_cleanup; 310 goto out_cleanup;
311 311
312 /* now link into global list. */ 312 /* now link into global list. */
313 spin_lock(&mm->ioctx_lock); 313 spin_lock(&mm->ioctx_lock);
314 hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); 314 hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
315 spin_unlock(&mm->ioctx_lock); 315 spin_unlock(&mm->ioctx_lock);
316 316
317 dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", 317 dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
318 ctx, ctx->user_id, current->mm, ctx->ring_info.nr); 318 ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
319 return ctx; 319 return ctx;
320 320
321 out_cleanup: 321 out_cleanup:
322 __put_ioctx(ctx); 322 __put_ioctx(ctx);
323 return ERR_PTR(-EAGAIN); 323 return ERR_PTR(-EAGAIN);
324 324
325 out_freectx: 325 out_freectx:
326 mmdrop(mm); 326 mmdrop(mm);
327 kmem_cache_free(kioctx_cachep, ctx); 327 kmem_cache_free(kioctx_cachep, ctx);
328 ctx = ERR_PTR(-ENOMEM); 328 ctx = ERR_PTR(-ENOMEM);
329 329
330 dprintk("aio: error allocating ioctx %p\n", ctx); 330 dprintk("aio: error allocating ioctx %p\n", ctx);
331 return ctx; 331 return ctx;
332 } 332 }
333 333
334 /* aio_cancel_all 334 /* aio_cancel_all
335 * Cancels all outstanding aio requests on an aio context. Used 335 * Cancels all outstanding aio requests on an aio context. Used
336 * when the processes owning a context have all exited to encourage 336 * when the processes owning a context have all exited to encourage
337 * the rapid destruction of the kioctx. 337 * the rapid destruction of the kioctx.
338 */ 338 */
339 static void aio_cancel_all(struct kioctx *ctx) 339 static void aio_cancel_all(struct kioctx *ctx)
340 { 340 {
341 int (*cancel)(struct kiocb *, struct io_event *); 341 int (*cancel)(struct kiocb *, struct io_event *);
342 struct io_event res; 342 struct io_event res;
343 spin_lock_irq(&ctx->ctx_lock); 343 spin_lock_irq(&ctx->ctx_lock);
344 ctx->dead = 1; 344 ctx->dead = 1;
345 while (!list_empty(&ctx->active_reqs)) { 345 while (!list_empty(&ctx->active_reqs)) {
346 struct list_head *pos = ctx->active_reqs.next; 346 struct list_head *pos = ctx->active_reqs.next;
347 struct kiocb *iocb = list_kiocb(pos); 347 struct kiocb *iocb = list_kiocb(pos);
348 list_del_init(&iocb->ki_list); 348 list_del_init(&iocb->ki_list);
349 cancel = iocb->ki_cancel; 349 cancel = iocb->ki_cancel;
350 kiocbSetCancelled(iocb); 350 kiocbSetCancelled(iocb);
351 if (cancel) { 351 if (cancel) {
352 iocb->ki_users++; 352 iocb->ki_users++;
353 spin_unlock_irq(&ctx->ctx_lock); 353 spin_unlock_irq(&ctx->ctx_lock);
354 cancel(iocb, &res); 354 cancel(iocb, &res);
355 spin_lock_irq(&ctx->ctx_lock); 355 spin_lock_irq(&ctx->ctx_lock);
356 } 356 }
357 } 357 }
358 spin_unlock_irq(&ctx->ctx_lock); 358 spin_unlock_irq(&ctx->ctx_lock);
359 } 359 }
360 360
361 static void wait_for_all_aios(struct kioctx *ctx) 361 static void wait_for_all_aios(struct kioctx *ctx)
362 { 362 {
363 struct task_struct *tsk = current; 363 struct task_struct *tsk = current;
364 DECLARE_WAITQUEUE(wait, tsk); 364 DECLARE_WAITQUEUE(wait, tsk);
365 365
366 spin_lock_irq(&ctx->ctx_lock); 366 spin_lock_irq(&ctx->ctx_lock);
367 if (!ctx->reqs_active) 367 if (!ctx->reqs_active)
368 goto out; 368 goto out;
369 369
370 add_wait_queue(&ctx->wait, &wait); 370 add_wait_queue(&ctx->wait, &wait);
371 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 371 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
372 while (ctx->reqs_active) { 372 while (ctx->reqs_active) {
373 spin_unlock_irq(&ctx->ctx_lock); 373 spin_unlock_irq(&ctx->ctx_lock);
374 io_schedule(); 374 io_schedule();
375 set_task_state(tsk, TASK_UNINTERRUPTIBLE); 375 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
376 spin_lock_irq(&ctx->ctx_lock); 376 spin_lock_irq(&ctx->ctx_lock);
377 } 377 }
378 __set_task_state(tsk, TASK_RUNNING); 378 __set_task_state(tsk, TASK_RUNNING);
379 remove_wait_queue(&ctx->wait, &wait); 379 remove_wait_queue(&ctx->wait, &wait);
380 380
381 out: 381 out:
382 spin_unlock_irq(&ctx->ctx_lock); 382 spin_unlock_irq(&ctx->ctx_lock);
383 } 383 }
384 384
385 /* wait_on_sync_kiocb: 385 /* wait_on_sync_kiocb:
386 * Waits on the given sync kiocb to complete. 386 * Waits on the given sync kiocb to complete.
387 */ 387 */
388 ssize_t wait_on_sync_kiocb(struct kiocb *iocb) 388 ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
389 { 389 {
390 while (iocb->ki_users) { 390 while (iocb->ki_users) {
391 set_current_state(TASK_UNINTERRUPTIBLE); 391 set_current_state(TASK_UNINTERRUPTIBLE);
392 if (!iocb->ki_users) 392 if (!iocb->ki_users)
393 break; 393 break;
394 io_schedule(); 394 io_schedule();
395 } 395 }
396 __set_current_state(TASK_RUNNING); 396 __set_current_state(TASK_RUNNING);
397 return iocb->ki_user_data; 397 return iocb->ki_user_data;
398 } 398 }
399 EXPORT_SYMBOL(wait_on_sync_kiocb); 399 EXPORT_SYMBOL(wait_on_sync_kiocb);
400 400
401 /* exit_aio: called when the last user of mm goes away. At this point, 401 /* exit_aio: called when the last user of mm goes away. At this point,
402 * there is no way for any new requests to be submited or any of the 402 * there is no way for any new requests to be submited or any of the
403 * io_* syscalls to be called on the context. However, there may be 403 * io_* syscalls to be called on the context. However, there may be
404 * outstanding requests which hold references to the context; as they 404 * outstanding requests which hold references to the context; as they
405 * go away, they will call put_ioctx and release any pinned memory 405 * go away, they will call put_ioctx and release any pinned memory
406 * associated with the request (held via struct page * references). 406 * associated with the request (held via struct page * references).
407 */ 407 */
408 void exit_aio(struct mm_struct *mm) 408 void exit_aio(struct mm_struct *mm)
409 { 409 {
410 struct kioctx *ctx; 410 struct kioctx *ctx;
411 411
412 while (!hlist_empty(&mm->ioctx_list)) { 412 while (!hlist_empty(&mm->ioctx_list)) {
413 ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); 413 ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
414 hlist_del_rcu(&ctx->list); 414 hlist_del_rcu(&ctx->list);
415 415
416 aio_cancel_all(ctx); 416 aio_cancel_all(ctx);
417 417
418 wait_for_all_aios(ctx); 418 wait_for_all_aios(ctx);
419 /* 419 /*
420 * Ensure we don't leave the ctx on the aio_wq 420 * Ensure we don't leave the ctx on the aio_wq
421 */ 421 */
422 cancel_work_sync(&ctx->wq.work); 422 cancel_work_sync(&ctx->wq.work);
423 423
424 if (1 != atomic_read(&ctx->users)) 424 if (1 != atomic_read(&ctx->users))
425 printk(KERN_DEBUG 425 printk(KERN_DEBUG
426 "exit_aio:ioctx still alive: %d %d %d\n", 426 "exit_aio:ioctx still alive: %d %d %d\n",
427 atomic_read(&ctx->users), ctx->dead, 427 atomic_read(&ctx->users), ctx->dead,
428 ctx->reqs_active); 428 ctx->reqs_active);
429 put_ioctx(ctx); 429 put_ioctx(ctx);
430 } 430 }
431 } 431 }
432 432
433 /* aio_get_req 433 /* aio_get_req
434 * Allocate a slot for an aio request. Increments the users count 434 * Allocate a slot for an aio request. Increments the users count
435 * of the kioctx so that the kioctx stays around until all requests are 435 * of the kioctx so that the kioctx stays around until all requests are
436 * complete. Returns NULL if no requests are free. 436 * complete. Returns NULL if no requests are free.
437 * 437 *
438 * Returns with kiocb->users set to 2. The io submit code path holds 438 * Returns with kiocb->users set to 2. The io submit code path holds
439 * an extra reference while submitting the i/o. 439 * an extra reference while submitting the i/o.
440 * This prevents races between the aio code path referencing the 440 * This prevents races between the aio code path referencing the
441 * req (after submitting it) and aio_complete() freeing the req. 441 * req (after submitting it) and aio_complete() freeing the req.
442 */ 442 */
443 static struct kiocb *__aio_get_req(struct kioctx *ctx) 443 static struct kiocb *__aio_get_req(struct kioctx *ctx)
444 { 444 {
445 struct kiocb *req = NULL; 445 struct kiocb *req = NULL;
446 struct aio_ring *ring; 446 struct aio_ring *ring;
447 int okay = 0; 447 int okay = 0;
448 448
449 req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); 449 req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
450 if (unlikely(!req)) 450 if (unlikely(!req))
451 return NULL; 451 return NULL;
452 452
453 req->ki_flags = 0; 453 req->ki_flags = 0;
454 req->ki_users = 2; 454 req->ki_users = 2;
455 req->ki_key = 0; 455 req->ki_key = 0;
456 req->ki_ctx = ctx; 456 req->ki_ctx = ctx;
457 req->ki_cancel = NULL; 457 req->ki_cancel = NULL;
458 req->ki_retry = NULL; 458 req->ki_retry = NULL;
459 req->ki_dtor = NULL; 459 req->ki_dtor = NULL;
460 req->private = NULL; 460 req->private = NULL;
461 req->ki_iovec = NULL; 461 req->ki_iovec = NULL;
462 INIT_LIST_HEAD(&req->ki_run_list); 462 INIT_LIST_HEAD(&req->ki_run_list);
463 req->ki_eventfd = NULL; 463 req->ki_eventfd = NULL;
464 464
465 /* Check if the completion queue has enough free space to 465 /* Check if the completion queue has enough free space to
466 * accept an event from this io. 466 * accept an event from this io.
467 */ 467 */
468 spin_lock_irq(&ctx->ctx_lock); 468 spin_lock_irq(&ctx->ctx_lock);
469 ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); 469 ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0);
470 if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { 470 if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) {
471 list_add(&req->ki_list, &ctx->active_reqs); 471 list_add(&req->ki_list, &ctx->active_reqs);
472 ctx->reqs_active++; 472 ctx->reqs_active++;
473 okay = 1; 473 okay = 1;
474 } 474 }
475 kunmap_atomic(ring, KM_USER0); 475 kunmap_atomic(ring, KM_USER0);
476 spin_unlock_irq(&ctx->ctx_lock); 476 spin_unlock_irq(&ctx->ctx_lock);
477 477
478 if (!okay) { 478 if (!okay) {
479 kmem_cache_free(kiocb_cachep, req); 479 kmem_cache_free(kiocb_cachep, req);
480 req = NULL; 480 req = NULL;
481 } 481 }
482 482
483 return req; 483 return req;
484 } 484 }
485 485
486 static inline struct kiocb *aio_get_req(struct kioctx *ctx) 486 static inline struct kiocb *aio_get_req(struct kioctx *ctx)
487 { 487 {
488 struct kiocb *req; 488 struct kiocb *req;
489 /* Handle a potential starvation case -- should be exceedingly rare as 489 /* Handle a potential starvation case -- should be exceedingly rare as
490 * requests will be stuck on fput_head only if the aio_fput_routine is 490 * requests will be stuck on fput_head only if the aio_fput_routine is
491 * delayed and the requests were the last user of the struct file. 491 * delayed and the requests were the last user of the struct file.
492 */ 492 */
493 req = __aio_get_req(ctx); 493 req = __aio_get_req(ctx);
494 if (unlikely(NULL == req)) { 494 if (unlikely(NULL == req)) {
495 aio_fput_routine(NULL); 495 aio_fput_routine(NULL);
496 req = __aio_get_req(ctx); 496 req = __aio_get_req(ctx);
497 } 497 }
498 return req; 498 return req;
499 } 499 }
500 500
501 static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) 501 static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
502 { 502 {
503 assert_spin_locked(&ctx->ctx_lock); 503 assert_spin_locked(&ctx->ctx_lock);
504 504
505 if (req->ki_eventfd != NULL) 505 if (req->ki_eventfd != NULL)
506 eventfd_ctx_put(req->ki_eventfd); 506 eventfd_ctx_put(req->ki_eventfd);
507 if (req->ki_dtor) 507 if (req->ki_dtor)
508 req->ki_dtor(req); 508 req->ki_dtor(req);
509 if (req->ki_iovec != &req->ki_inline_vec) 509 if (req->ki_iovec != &req->ki_inline_vec)
510 kfree(req->ki_iovec); 510 kfree(req->ki_iovec);
511 kmem_cache_free(kiocb_cachep, req); 511 kmem_cache_free(kiocb_cachep, req);
512 ctx->reqs_active--; 512 ctx->reqs_active--;
513 513
514 if (unlikely(!ctx->reqs_active && ctx->dead)) 514 if (unlikely(!ctx->reqs_active && ctx->dead))
515 wake_up(&ctx->wait); 515 wake_up(&ctx->wait);
516 } 516 }
517 517
518 static void aio_fput_routine(struct work_struct *data) 518 static void aio_fput_routine(struct work_struct *data)
519 { 519 {
520 spin_lock_irq(&fput_lock); 520 spin_lock_irq(&fput_lock);
521 while (likely(!list_empty(&fput_head))) { 521 while (likely(!list_empty(&fput_head))) {
522 struct kiocb *req = list_kiocb(fput_head.next); 522 struct kiocb *req = list_kiocb(fput_head.next);
523 struct kioctx *ctx = req->ki_ctx; 523 struct kioctx *ctx = req->ki_ctx;
524 524
525 list_del(&req->ki_list); 525 list_del(&req->ki_list);
526 spin_unlock_irq(&fput_lock); 526 spin_unlock_irq(&fput_lock);
527 527
528 /* Complete the fput(s) */ 528 /* Complete the fput(s) */
529 if (req->ki_filp != NULL) 529 if (req->ki_filp != NULL)
530 fput(req->ki_filp); 530 fput(req->ki_filp);
531 531
532 /* Link the iocb into the context's free list */ 532 /* Link the iocb into the context's free list */
533 spin_lock_irq(&ctx->ctx_lock); 533 spin_lock_irq(&ctx->ctx_lock);
534 really_put_req(ctx, req); 534 really_put_req(ctx, req);
535 spin_unlock_irq(&ctx->ctx_lock); 535 spin_unlock_irq(&ctx->ctx_lock);
536 536
537 put_ioctx(ctx); 537 put_ioctx(ctx);
538 spin_lock_irq(&fput_lock); 538 spin_lock_irq(&fput_lock);
539 } 539 }
540 spin_unlock_irq(&fput_lock); 540 spin_unlock_irq(&fput_lock);
541 } 541 }
542 542
543 /* __aio_put_req 543 /* __aio_put_req
544 * Returns true if this put was the last user of the request. 544 * Returns true if this put was the last user of the request.
545 */ 545 */
546 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) 546 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
547 { 547 {
548 dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", 548 dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
549 req, atomic_long_read(&req->ki_filp->f_count)); 549 req, atomic_long_read(&req->ki_filp->f_count));
550 550
551 assert_spin_locked(&ctx->ctx_lock); 551 assert_spin_locked(&ctx->ctx_lock);
552 552
553 req->ki_users--; 553 req->ki_users--;
554 BUG_ON(req->ki_users < 0); 554 BUG_ON(req->ki_users < 0);
555 if (likely(req->ki_users)) 555 if (likely(req->ki_users))
556 return 0; 556 return 0;
557 list_del(&req->ki_list); /* remove from active_reqs */ 557 list_del(&req->ki_list); /* remove from active_reqs */
558 req->ki_cancel = NULL; 558 req->ki_cancel = NULL;
559 req->ki_retry = NULL; 559 req->ki_retry = NULL;
560 560
561 /* 561 /*
562 * Try to optimize the aio and eventfd file* puts, by avoiding to 562 * Try to optimize the aio and eventfd file* puts, by avoiding to
563 * schedule work in case it is not final fput() time. In normal cases, 563 * schedule work in case it is not final fput() time. In normal cases,
564 * we would not be holding the last reference to the file*, so 564 * we would not be holding the last reference to the file*, so
565 * this function will be executed w/out any aio kthread wakeup. 565 * this function will be executed w/out any aio kthread wakeup.
566 */ 566 */
567 if (unlikely(!fput_atomic(req->ki_filp))) { 567 if (unlikely(!fput_atomic(req->ki_filp))) {
568 get_ioctx(ctx); 568 get_ioctx(ctx);
569 spin_lock(&fput_lock); 569 spin_lock(&fput_lock);
570 list_add(&req->ki_list, &fput_head); 570 list_add(&req->ki_list, &fput_head);
571 spin_unlock(&fput_lock); 571 spin_unlock(&fput_lock);
572 queue_work(aio_wq, &fput_work); 572 queue_work(aio_wq, &fput_work);
573 } else { 573 } else {
574 req->ki_filp = NULL; 574 req->ki_filp = NULL;
575 really_put_req(ctx, req); 575 really_put_req(ctx, req);
576 } 576 }
577 return 1; 577 return 1;
578 } 578 }
579 579
580 /* aio_put_req 580 /* aio_put_req
581 * Returns true if this put was the last user of the kiocb, 581 * Returns true if this put was the last user of the kiocb,
582 * false if the request is still in use. 582 * false if the request is still in use.
583 */ 583 */
584 int aio_put_req(struct kiocb *req) 584 int aio_put_req(struct kiocb *req)
585 { 585 {
586 struct kioctx *ctx = req->ki_ctx; 586 struct kioctx *ctx = req->ki_ctx;
587 int ret; 587 int ret;
588 spin_lock_irq(&ctx->ctx_lock); 588 spin_lock_irq(&ctx->ctx_lock);
589 ret = __aio_put_req(ctx, req); 589 ret = __aio_put_req(ctx, req);
590 spin_unlock_irq(&ctx->ctx_lock); 590 spin_unlock_irq(&ctx->ctx_lock);
591 return ret; 591 return ret;
592 } 592 }
593 EXPORT_SYMBOL(aio_put_req); 593 EXPORT_SYMBOL(aio_put_req);
594 594
595 static struct kioctx *lookup_ioctx(unsigned long ctx_id) 595 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
596 { 596 {
597 struct mm_struct *mm = current->mm; 597 struct mm_struct *mm = current->mm;
598 struct kioctx *ctx, *ret = NULL; 598 struct kioctx *ctx, *ret = NULL;
599 struct hlist_node *n; 599 struct hlist_node *n;
600 600
601 rcu_read_lock(); 601 rcu_read_lock();
602 602
603 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { 603 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
604 if (ctx->user_id == ctx_id && !ctx->dead) { 604 if (ctx->user_id == ctx_id && !ctx->dead) {
605 get_ioctx(ctx); 605 get_ioctx(ctx);
606 ret = ctx; 606 ret = ctx;
607 break; 607 break;
608 } 608 }
609 } 609 }
610 610
611 rcu_read_unlock(); 611 rcu_read_unlock();
612 return ret; 612 return ret;
613 } 613 }
614 614
615 /* 615 /*
616 * Queue up a kiocb to be retried. Assumes that the kiocb 616 * Queue up a kiocb to be retried. Assumes that the kiocb
617 * has already been marked as kicked, and places it on 617 * has already been marked as kicked, and places it on
618 * the retry run list for the corresponding ioctx, if it 618 * the retry run list for the corresponding ioctx, if it
619 * isn't already queued. Returns 1 if it actually queued 619 * isn't already queued. Returns 1 if it actually queued
620 * the kiocb (to tell the caller to activate the work 620 * the kiocb (to tell the caller to activate the work
621 * queue to process it), or 0, if it found that it was 621 * queue to process it), or 0, if it found that it was
622 * already queued. 622 * already queued.
623 */ 623 */
624 static inline int __queue_kicked_iocb(struct kiocb *iocb) 624 static inline int __queue_kicked_iocb(struct kiocb *iocb)
625 { 625 {
626 struct kioctx *ctx = iocb->ki_ctx; 626 struct kioctx *ctx = iocb->ki_ctx;
627 627
628 assert_spin_locked(&ctx->ctx_lock); 628 assert_spin_locked(&ctx->ctx_lock);
629 629
630 if (list_empty(&iocb->ki_run_list)) { 630 if (list_empty(&iocb->ki_run_list)) {
631 list_add_tail(&iocb->ki_run_list, 631 list_add_tail(&iocb->ki_run_list,
632 &ctx->run_list); 632 &ctx->run_list);
633 return 1; 633 return 1;
634 } 634 }
635 return 0; 635 return 0;
636 } 636 }
637 637
638 /* aio_run_iocb 638 /* aio_run_iocb
639 * This is the core aio execution routine. It is 639 * This is the core aio execution routine. It is
640 * invoked both for initial i/o submission and 640 * invoked both for initial i/o submission and
641 * subsequent retries via the aio_kick_handler. 641 * subsequent retries via the aio_kick_handler.
642 * Expects to be invoked with iocb->ki_ctx->lock 642 * Expects to be invoked with iocb->ki_ctx->lock
643 * already held. The lock is released and reacquired 643 * already held. The lock is released and reacquired
644 * as needed during processing. 644 * as needed during processing.
645 * 645 *
646 * Calls the iocb retry method (already setup for the 646 * Calls the iocb retry method (already setup for the
647 * iocb on initial submission) for operation specific 647 * iocb on initial submission) for operation specific
648 * handling, but takes care of most of common retry 648 * handling, but takes care of most of common retry
649 * execution details for a given iocb. The retry method 649 * execution details for a given iocb. The retry method
650 * needs to be non-blocking as far as possible, to avoid 650 * needs to be non-blocking as far as possible, to avoid
651 * holding up other iocbs waiting to be serviced by the 651 * holding up other iocbs waiting to be serviced by the
652 * retry kernel thread. 652 * retry kernel thread.
653 * 653 *
654 * The trickier parts in this code have to do with 654 * The trickier parts in this code have to do with
655 * ensuring that only one retry instance is in progress 655 * ensuring that only one retry instance is in progress
656 * for a given iocb at any time. Providing that guarantee 656 * for a given iocb at any time. Providing that guarantee
657 * simplifies the coding of individual aio operations as 657 * simplifies the coding of individual aio operations as
658 * it avoids various potential races. 658 * it avoids various potential races.
659 */ 659 */
660 static ssize_t aio_run_iocb(struct kiocb *iocb) 660 static ssize_t aio_run_iocb(struct kiocb *iocb)
661 { 661 {
662 struct kioctx *ctx = iocb->ki_ctx; 662 struct kioctx *ctx = iocb->ki_ctx;
663 ssize_t (*retry)(struct kiocb *); 663 ssize_t (*retry)(struct kiocb *);
664 ssize_t ret; 664 ssize_t ret;
665 665
666 if (!(retry = iocb->ki_retry)) { 666 if (!(retry = iocb->ki_retry)) {
667 printk("aio_run_iocb: iocb->ki_retry = NULL\n"); 667 printk("aio_run_iocb: iocb->ki_retry = NULL\n");
668 return 0; 668 return 0;
669 } 669 }
670 670
671 /* 671 /*
672 * We don't want the next retry iteration for this 672 * We don't want the next retry iteration for this
673 * operation to start until this one has returned and 673 * operation to start until this one has returned and
674 * updated the iocb state. However, wait_queue functions 674 * updated the iocb state. However, wait_queue functions
675 * can trigger a kick_iocb from interrupt context in the 675 * can trigger a kick_iocb from interrupt context in the
676 * meantime, indicating that data is available for the next 676 * meantime, indicating that data is available for the next
677 * iteration. We want to remember that and enable the 677 * iteration. We want to remember that and enable the
678 * next retry iteration _after_ we are through with 678 * next retry iteration _after_ we are through with
679 * this one. 679 * this one.
680 * 680 *
681 * So, in order to be able to register a "kick", but 681 * So, in order to be able to register a "kick", but
682 * prevent it from being queued now, we clear the kick 682 * prevent it from being queued now, we clear the kick
683 * flag, but make the kick code *think* that the iocb is 683 * flag, but make the kick code *think* that the iocb is
684 * still on the run list until we are actually done. 684 * still on the run list until we are actually done.
685 * When we are done with this iteration, we check if 685 * When we are done with this iteration, we check if
686 * the iocb was kicked in the meantime and if so, queue 686 * the iocb was kicked in the meantime and if so, queue
687 * it up afresh. 687 * it up afresh.
688 */ 688 */
689 689
690 kiocbClearKicked(iocb); 690 kiocbClearKicked(iocb);
691 691
692 /* 692 /*
693 * This is so that aio_complete knows it doesn't need to 693 * This is so that aio_complete knows it doesn't need to
694 * pull the iocb off the run list (We can't just call 694 * pull the iocb off the run list (We can't just call
695 * INIT_LIST_HEAD because we don't want a kick_iocb to 695 * INIT_LIST_HEAD because we don't want a kick_iocb to
696 * queue this on the run list yet) 696 * queue this on the run list yet)
697 */ 697 */
698 iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; 698 iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL;
699 spin_unlock_irq(&ctx->ctx_lock); 699 spin_unlock_irq(&ctx->ctx_lock);
700 700
701 /* Quit retrying if the i/o has been cancelled */ 701 /* Quit retrying if the i/o has been cancelled */
702 if (kiocbIsCancelled(iocb)) { 702 if (kiocbIsCancelled(iocb)) {
703 ret = -EINTR; 703 ret = -EINTR;
704 aio_complete(iocb, ret, 0); 704 aio_complete(iocb, ret, 0);
705 /* must not access the iocb after this */ 705 /* must not access the iocb after this */
706 goto out; 706 goto out;
707 } 707 }
708 708
709 /* 709 /*
710 * Now we are all set to call the retry method in async 710 * Now we are all set to call the retry method in async
711 * context. 711 * context.
712 */ 712 */
713 ret = retry(iocb); 713 ret = retry(iocb);
714 714
715 if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) 715 if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
716 /*
717 * There's no easy way to restart the syscall since other AIO's
718 * may be already running. Just fail this IO with EINTR.
719 */
720 if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
721 ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK))
722 ret = -EINTR;
716 aio_complete(iocb, ret, 0); 723 aio_complete(iocb, ret, 0);
724 }
717 out: 725 out:
718 spin_lock_irq(&ctx->ctx_lock); 726 spin_lock_irq(&ctx->ctx_lock);
719 727
720 if (-EIOCBRETRY == ret) { 728 if (-EIOCBRETRY == ret) {
721 /* 729 /*
722 * OK, now that we are done with this iteration 730 * OK, now that we are done with this iteration
723 * and know that there is more left to go, 731 * and know that there is more left to go,
724 * this is where we let go so that a subsequent 732 * this is where we let go so that a subsequent
725 * "kick" can start the next iteration 733 * "kick" can start the next iteration
726 */ 734 */
727 735
728 /* will make __queue_kicked_iocb succeed from here on */ 736 /* will make __queue_kicked_iocb succeed from here on */
729 INIT_LIST_HEAD(&iocb->ki_run_list); 737 INIT_LIST_HEAD(&iocb->ki_run_list);
730 /* we must queue the next iteration ourselves, if it 738 /* we must queue the next iteration ourselves, if it
731 * has already been kicked */ 739 * has already been kicked */
732 if (kiocbIsKicked(iocb)) { 740 if (kiocbIsKicked(iocb)) {
733 __queue_kicked_iocb(iocb); 741 __queue_kicked_iocb(iocb);
734 742
735 /* 743 /*
736 * __queue_kicked_iocb will always return 1 here, because 744 * __queue_kicked_iocb will always return 1 here, because
737 * iocb->ki_run_list is empty at this point so it should 745 * iocb->ki_run_list is empty at this point so it should
738 * be safe to unconditionally queue the context into the 746 * be safe to unconditionally queue the context into the
739 * work queue. 747 * work queue.
740 */ 748 */
741 aio_queue_work(ctx); 749 aio_queue_work(ctx);
742 } 750 }
743 } 751 }
744 return ret; 752 return ret;
745 } 753 }
746 754
747 /* 755 /*
748 * __aio_run_iocbs: 756 * __aio_run_iocbs:
749 * Process all pending retries queued on the ioctx 757 * Process all pending retries queued on the ioctx
750 * run list. 758 * run list.
751 * Assumes it is operating within the aio issuer's mm 759 * Assumes it is operating within the aio issuer's mm
752 * context. 760 * context.
753 */ 761 */
754 static int __aio_run_iocbs(struct kioctx *ctx) 762 static int __aio_run_iocbs(struct kioctx *ctx)
755 { 763 {
756 struct kiocb *iocb; 764 struct kiocb *iocb;
757 struct list_head run_list; 765 struct list_head run_list;
758 766
759 assert_spin_locked(&ctx->ctx_lock); 767 assert_spin_locked(&ctx->ctx_lock);
760 768
761 list_replace_init(&ctx->run_list, &run_list); 769 list_replace_init(&ctx->run_list, &run_list);
762 while (!list_empty(&run_list)) { 770 while (!list_empty(&run_list)) {
763 iocb = list_entry(run_list.next, struct kiocb, 771 iocb = list_entry(run_list.next, struct kiocb,
764 ki_run_list); 772 ki_run_list);
765 list_del(&iocb->ki_run_list); 773 list_del(&iocb->ki_run_list);
766 /* 774 /*
767 * Hold an extra reference while retrying i/o. 775 * Hold an extra reference while retrying i/o.
768 */ 776 */
769 iocb->ki_users++; /* grab extra reference */ 777 iocb->ki_users++; /* grab extra reference */
770 aio_run_iocb(iocb); 778 aio_run_iocb(iocb);
771 __aio_put_req(ctx, iocb); 779 __aio_put_req(ctx, iocb);
772 } 780 }
773 if (!list_empty(&ctx->run_list)) 781 if (!list_empty(&ctx->run_list))
774 return 1; 782 return 1;
775 return 0; 783 return 0;
776 } 784 }
777 785
778 static void aio_queue_work(struct kioctx * ctx) 786 static void aio_queue_work(struct kioctx * ctx)
779 { 787 {
780 unsigned long timeout; 788 unsigned long timeout;
781 /* 789 /*
782 * if someone is waiting, get the work started right 790 * if someone is waiting, get the work started right
783 * away, otherwise, use a longer delay 791 * away, otherwise, use a longer delay
784 */ 792 */
785 smp_mb(); 793 smp_mb();
786 if (waitqueue_active(&ctx->wait)) 794 if (waitqueue_active(&ctx->wait))
787 timeout = 1; 795 timeout = 1;
788 else 796 else
789 timeout = HZ/10; 797 timeout = HZ/10;
790 queue_delayed_work(aio_wq, &ctx->wq, timeout); 798 queue_delayed_work(aio_wq, &ctx->wq, timeout);
791 } 799 }
792 800
793 801
794 /* 802 /*
795 * aio_run_iocbs: 803 * aio_run_iocbs:
796 * Process all pending retries queued on the ioctx 804 * Process all pending retries queued on the ioctx
797 * run list. 805 * run list.
798 * Assumes it is operating within the aio issuer's mm 806 * Assumes it is operating within the aio issuer's mm
799 * context. 807 * context.
800 */ 808 */
801 static inline void aio_run_iocbs(struct kioctx *ctx) 809 static inline void aio_run_iocbs(struct kioctx *ctx)
802 { 810 {
803 int requeue; 811 int requeue;
804 812
805 spin_lock_irq(&ctx->ctx_lock); 813 spin_lock_irq(&ctx->ctx_lock);
806 814
807 requeue = __aio_run_iocbs(ctx); 815 requeue = __aio_run_iocbs(ctx);
808 spin_unlock_irq(&ctx->ctx_lock); 816 spin_unlock_irq(&ctx->ctx_lock);
809 if (requeue) 817 if (requeue)
810 aio_queue_work(ctx); 818 aio_queue_work(ctx);
811 } 819 }
812 820
813 /* 821 /*
814 * just like aio_run_iocbs, but keeps running them until 822 * just like aio_run_iocbs, but keeps running them until
815 * the list stays empty 823 * the list stays empty
816 */ 824 */
817 static inline void aio_run_all_iocbs(struct kioctx *ctx) 825 static inline void aio_run_all_iocbs(struct kioctx *ctx)
818 { 826 {
819 spin_lock_irq(&ctx->ctx_lock); 827 spin_lock_irq(&ctx->ctx_lock);
820 while (__aio_run_iocbs(ctx)) 828 while (__aio_run_iocbs(ctx))
821 ; 829 ;
822 spin_unlock_irq(&ctx->ctx_lock); 830 spin_unlock_irq(&ctx->ctx_lock);
823 } 831 }
824 832
825 /* 833 /*
826 * aio_kick_handler: 834 * aio_kick_handler:
827 * Work queue handler triggered to process pending 835 * Work queue handler triggered to process pending
828 * retries on an ioctx. Takes on the aio issuer's 836 * retries on an ioctx. Takes on the aio issuer's
829 * mm context before running the iocbs, so that 837 * mm context before running the iocbs, so that
830 * copy_xxx_user operates on the issuer's address 838 * copy_xxx_user operates on the issuer's address
831 * space. 839 * space.
832 * Run on aiod's context. 840 * Run on aiod's context.
833 */ 841 */
834 static void aio_kick_handler(struct work_struct *work) 842 static void aio_kick_handler(struct work_struct *work)
835 { 843 {
836 struct kioctx *ctx = container_of(work, struct kioctx, wq.work); 844 struct kioctx *ctx = container_of(work, struct kioctx, wq.work);
837 mm_segment_t oldfs = get_fs(); 845 mm_segment_t oldfs = get_fs();
838 struct mm_struct *mm; 846 struct mm_struct *mm;
839 int requeue; 847 int requeue;
840 848
841 set_fs(USER_DS); 849 set_fs(USER_DS);
842 use_mm(ctx->mm); 850 use_mm(ctx->mm);
843 spin_lock_irq(&ctx->ctx_lock); 851 spin_lock_irq(&ctx->ctx_lock);
844 requeue =__aio_run_iocbs(ctx); 852 requeue =__aio_run_iocbs(ctx);
845 mm = ctx->mm; 853 mm = ctx->mm;
846 spin_unlock_irq(&ctx->ctx_lock); 854 spin_unlock_irq(&ctx->ctx_lock);
847 unuse_mm(mm); 855 unuse_mm(mm);
848 set_fs(oldfs); 856 set_fs(oldfs);
849 /* 857 /*
850 * we're in a worker thread already, don't use queue_delayed_work, 858 * we're in a worker thread already, don't use queue_delayed_work,
851 */ 859 */
852 if (requeue) 860 if (requeue)
853 queue_delayed_work(aio_wq, &ctx->wq, 0); 861 queue_delayed_work(aio_wq, &ctx->wq, 0);
854 } 862 }
855 863
856 864
857 /* 865 /*
858 * Called by kick_iocb to queue the kiocb for retry 866 * Called by kick_iocb to queue the kiocb for retry
859 * and if required activate the aio work queue to process 867 * and if required activate the aio work queue to process
860 * it 868 * it
861 */ 869 */
862 static void try_queue_kicked_iocb(struct kiocb *iocb) 870 static void try_queue_kicked_iocb(struct kiocb *iocb)
863 { 871 {
864 struct kioctx *ctx = iocb->ki_ctx; 872 struct kioctx *ctx = iocb->ki_ctx;
865 unsigned long flags; 873 unsigned long flags;
866 int run = 0; 874 int run = 0;
867 875
868 spin_lock_irqsave(&ctx->ctx_lock, flags); 876 spin_lock_irqsave(&ctx->ctx_lock, flags);
869 /* set this inside the lock so that we can't race with aio_run_iocb() 877 /* set this inside the lock so that we can't race with aio_run_iocb()
870 * testing it and putting the iocb on the run list under the lock */ 878 * testing it and putting the iocb on the run list under the lock */
871 if (!kiocbTryKick(iocb)) 879 if (!kiocbTryKick(iocb))
872 run = __queue_kicked_iocb(iocb); 880 run = __queue_kicked_iocb(iocb);
873 spin_unlock_irqrestore(&ctx->ctx_lock, flags); 881 spin_unlock_irqrestore(&ctx->ctx_lock, flags);
874 if (run) 882 if (run)
875 aio_queue_work(ctx); 883 aio_queue_work(ctx);
876 } 884 }
877 885
878 /* 886 /*
879 * kick_iocb: 887 * kick_iocb:
880 * Called typically from a wait queue callback context 888 * Called typically from a wait queue callback context
881 * to trigger a retry of the iocb. 889 * to trigger a retry of the iocb.
882 * The retry is usually executed by aio workqueue 890 * The retry is usually executed by aio workqueue
883 * threads (See aio_kick_handler). 891 * threads (See aio_kick_handler).
884 */ 892 */
885 void kick_iocb(struct kiocb *iocb) 893 void kick_iocb(struct kiocb *iocb)
886 { 894 {
887 /* sync iocbs are easy: they can only ever be executing from a 895 /* sync iocbs are easy: they can only ever be executing from a
888 * single context. */ 896 * single context. */
889 if (is_sync_kiocb(iocb)) { 897 if (is_sync_kiocb(iocb)) {
890 kiocbSetKicked(iocb); 898 kiocbSetKicked(iocb);
891 wake_up_process(iocb->ki_obj.tsk); 899 wake_up_process(iocb->ki_obj.tsk);
892 return; 900 return;
893 } 901 }
894 902
895 try_queue_kicked_iocb(iocb); 903 try_queue_kicked_iocb(iocb);
896 } 904 }
897 EXPORT_SYMBOL(kick_iocb); 905 EXPORT_SYMBOL(kick_iocb);
898 906
899 /* aio_complete 907 /* aio_complete
900 * Called when the io request on the given iocb is complete. 908 * Called when the io request on the given iocb is complete.
901 * Returns true if this is the last user of the request. The 909 * Returns true if this is the last user of the request. The
902 * only other user of the request can be the cancellation code. 910 * only other user of the request can be the cancellation code.
903 */ 911 */
904 int aio_complete(struct kiocb *iocb, long res, long res2) 912 int aio_complete(struct kiocb *iocb, long res, long res2)
905 { 913 {
906 struct kioctx *ctx = iocb->ki_ctx; 914 struct kioctx *ctx = iocb->ki_ctx;
907 struct aio_ring_info *info; 915 struct aio_ring_info *info;
908 struct aio_ring *ring; 916 struct aio_ring *ring;
909 struct io_event *event; 917 struct io_event *event;
910 unsigned long flags; 918 unsigned long flags;
911 unsigned long tail; 919 unsigned long tail;
912 int ret; 920 int ret;
913 921
914 /* 922 /*
915 * Special case handling for sync iocbs: 923 * Special case handling for sync iocbs:
916 * - events go directly into the iocb for fast handling 924 * - events go directly into the iocb for fast handling
917 * - the sync task with the iocb in its stack holds the single iocb 925 * - the sync task with the iocb in its stack holds the single iocb
918 * ref, no other paths have a way to get another ref 926 * ref, no other paths have a way to get another ref
919 * - the sync task helpfully left a reference to itself in the iocb 927 * - the sync task helpfully left a reference to itself in the iocb
920 */ 928 */
921 if (is_sync_kiocb(iocb)) { 929 if (is_sync_kiocb(iocb)) {
922 BUG_ON(iocb->ki_users != 1); 930 BUG_ON(iocb->ki_users != 1);
923 iocb->ki_user_data = res; 931 iocb->ki_user_data = res;
924 iocb->ki_users = 0; 932 iocb->ki_users = 0;
925 wake_up_process(iocb->ki_obj.tsk); 933 wake_up_process(iocb->ki_obj.tsk);
926 return 1; 934 return 1;
927 } 935 }
928 936
929 info = &ctx->ring_info; 937 info = &ctx->ring_info;
930 938
931 /* add a completion event to the ring buffer. 939 /* add a completion event to the ring buffer.
932 * must be done holding ctx->ctx_lock to prevent 940 * must be done holding ctx->ctx_lock to prevent
933 * other code from messing with the tail 941 * other code from messing with the tail
934 * pointer since we might be called from irq 942 * pointer since we might be called from irq
935 * context. 943 * context.
936 */ 944 */
937 spin_lock_irqsave(&ctx->ctx_lock, flags); 945 spin_lock_irqsave(&ctx->ctx_lock, flags);
938 946
939 if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) 947 if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
940 list_del_init(&iocb->ki_run_list); 948 list_del_init(&iocb->ki_run_list);
941 949
942 /* 950 /*
943 * cancelled requests don't get events, userland was given one 951 * cancelled requests don't get events, userland was given one
944 * when the event got cancelled. 952 * when the event got cancelled.
945 */ 953 */
946 if (kiocbIsCancelled(iocb)) 954 if (kiocbIsCancelled(iocb))
947 goto put_rq; 955 goto put_rq;
948 956
949 ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); 957 ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);
950 958
951 tail = info->tail; 959 tail = info->tail;
952 event = aio_ring_event(info, tail, KM_IRQ0); 960 event = aio_ring_event(info, tail, KM_IRQ0);
953 if (++tail >= info->nr) 961 if (++tail >= info->nr)
954 tail = 0; 962 tail = 0;
955 963
956 event->obj = (u64)(unsigned long)iocb->ki_obj.user; 964 event->obj = (u64)(unsigned long)iocb->ki_obj.user;
957 event->data = iocb->ki_user_data; 965 event->data = iocb->ki_user_data;
958 event->res = res; 966 event->res = res;
959 event->res2 = res2; 967 event->res2 = res2;
960 968
961 dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", 969 dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
962 ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, 970 ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
963 res, res2); 971 res, res2);
964 972
965 /* after flagging the request as done, we 973 /* after flagging the request as done, we
966 * must never even look at it again 974 * must never even look at it again
967 */ 975 */
968 smp_wmb(); /* make event visible before updating tail */ 976 smp_wmb(); /* make event visible before updating tail */
969 977
970 info->tail = tail; 978 info->tail = tail;
971 ring->tail = tail; 979 ring->tail = tail;
972 980
973 put_aio_ring_event(event, KM_IRQ0); 981 put_aio_ring_event(event, KM_IRQ0);
974 kunmap_atomic(ring, KM_IRQ1); 982 kunmap_atomic(ring, KM_IRQ1);
975 983
976 pr_debug("added to ring %p at [%lu]\n", iocb, tail); 984 pr_debug("added to ring %p at [%lu]\n", iocb, tail);
977 985
978 /* 986 /*
979 * Check if the user asked us to deliver the result through an 987 * Check if the user asked us to deliver the result through an
980 * eventfd. The eventfd_signal() function is safe to be called 988 * eventfd. The eventfd_signal() function is safe to be called
981 * from IRQ context. 989 * from IRQ context.
982 */ 990 */
983 if (iocb->ki_eventfd != NULL) 991 if (iocb->ki_eventfd != NULL)
984 eventfd_signal(iocb->ki_eventfd, 1); 992 eventfd_signal(iocb->ki_eventfd, 1);
985 993
986 put_rq: 994 put_rq:
987 /* everything turned out well, dispose of the aiocb. */ 995 /* everything turned out well, dispose of the aiocb. */
988 ret = __aio_put_req(ctx, iocb); 996 ret = __aio_put_req(ctx, iocb);
989 997
990 /* 998 /*
991 * We have to order our ring_info tail store above and test 999 * We have to order our ring_info tail store above and test
992 * of the wait list below outside the wait lock. This is 1000 * of the wait list below outside the wait lock. This is
993 * like in wake_up_bit() where clearing a bit has to be 1001 * like in wake_up_bit() where clearing a bit has to be
994 * ordered with the unlocked test. 1002 * ordered with the unlocked test.
995 */ 1003 */
996 smp_mb(); 1004 smp_mb();
997 1005
998 if (waitqueue_active(&ctx->wait)) 1006 if (waitqueue_active(&ctx->wait))
999 wake_up(&ctx->wait); 1007 wake_up(&ctx->wait);
1000 1008
1001 spin_unlock_irqrestore(&ctx->ctx_lock, flags); 1009 spin_unlock_irqrestore(&ctx->ctx_lock, flags);
1002 return ret; 1010 return ret;
1003 } 1011 }
1004 EXPORT_SYMBOL(aio_complete); 1012 EXPORT_SYMBOL(aio_complete);
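
aio_complete() is exported for filesystems and drivers whose aio_read/aio_write handlers queue the request (returning -EIOCBQUEUED) and report the final result later from a completion path. A minimal sketch of that shape, assuming a hypothetical driver-private request type (my_request and my_irq_done are made-up names, not part of this file):

struct my_request {
	struct kiocb	*iocb;		/* saved when the request was queued */
	ssize_t		result;		/* byte count or -errno */
};

/* Illustrative sketch only: called from the driver's completion path. */
static void my_irq_done(struct my_request *rq)
{
	/* safe from irq context: aio_complete() takes ctx_lock with irqsave */
	aio_complete(rq->iocb, rq->result, 0);
}
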
1005 1013
1006 /* aio_read_evt 1014 /* aio_read_evt
1007 * Pull an event off of the ioctx's event ring. Returns the number of 1015 * Pull an event off of the ioctx's event ring. Returns the number of
1008 * events fetched (0 or 1 ;-) 1016 * events fetched (0 or 1 ;-)
1009 * FIXME: make this use cmpxchg. 1017 * FIXME: make this use cmpxchg.
1010 * TODO: make the ringbuffer user mmap()able (requires FIXME). 1018 * TODO: make the ringbuffer user mmap()able (requires FIXME).
1011 */ 1019 */
1012 static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) 1020 static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
1013 { 1021 {
1014 struct aio_ring_info *info = &ioctx->ring_info; 1022 struct aio_ring_info *info = &ioctx->ring_info;
1015 struct aio_ring *ring; 1023 struct aio_ring *ring;
1016 unsigned long head; 1024 unsigned long head;
1017 int ret = 0; 1025 int ret = 0;
1018 1026
1019 ring = kmap_atomic(info->ring_pages[0], KM_USER0); 1027 ring = kmap_atomic(info->ring_pages[0], KM_USER0);
1020 dprintk("in aio_read_evt h%lu t%lu m%lu\n", 1028 dprintk("in aio_read_evt h%lu t%lu m%lu\n",
1021 (unsigned long)ring->head, (unsigned long)ring->tail, 1029 (unsigned long)ring->head, (unsigned long)ring->tail,
1022 (unsigned long)ring->nr); 1030 (unsigned long)ring->nr);
1023 1031
1024 if (ring->head == ring->tail) 1032 if (ring->head == ring->tail)
1025 goto out; 1033 goto out;
1026 1034
1027 spin_lock(&info->ring_lock); 1035 spin_lock(&info->ring_lock);
1028 1036
1029 head = ring->head % info->nr; 1037 head = ring->head % info->nr;
1030 if (head != ring->tail) { 1038 if (head != ring->tail) {
1031 struct io_event *evp = aio_ring_event(info, head, KM_USER1); 1039 struct io_event *evp = aio_ring_event(info, head, KM_USER1);
1032 *ent = *evp; 1040 *ent = *evp;
1033 head = (head + 1) % info->nr; 1041 head = (head + 1) % info->nr;
1034 smp_mb(); /* finish reading the event before updating the head */ 1042 smp_mb(); /* finish reading the event before updating the head */
1035 ring->head = head; 1043 ring->head = head;
1036 ret = 1; 1044 ret = 1;
1037 put_aio_ring_event(evp, KM_USER1); 1045 put_aio_ring_event(evp, KM_USER1);
1038 } 1046 }
1039 spin_unlock(&info->ring_lock); 1047 spin_unlock(&info->ring_lock);
1040 1048
1041 out: 1049 out:
1042 kunmap_atomic(ring, KM_USER0); 1050 kunmap_atomic(ring, KM_USER0);
1043 dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, 1051 dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
1044 (unsigned long)ring->head, (unsigned long)ring->tail); 1052 (unsigned long)ring->head, (unsigned long)ring->tail);
1045 return ret; 1053 return ret;
1046 } 1054 }
1047 1055
1048 struct aio_timeout { 1056 struct aio_timeout {
1049 struct timer_list timer; 1057 struct timer_list timer;
1050 int timed_out; 1058 int timed_out;
1051 struct task_struct *p; 1059 struct task_struct *p;
1052 }; 1060 };
1053 1061
1054 static void timeout_func(unsigned long data) 1062 static void timeout_func(unsigned long data)
1055 { 1063 {
1056 struct aio_timeout *to = (struct aio_timeout *)data; 1064 struct aio_timeout *to = (struct aio_timeout *)data;
1057 1065
1058 to->timed_out = 1; 1066 to->timed_out = 1;
1059 wake_up_process(to->p); 1067 wake_up_process(to->p);
1060 } 1068 }
1061 1069
1062 static inline void init_timeout(struct aio_timeout *to) 1070 static inline void init_timeout(struct aio_timeout *to)
1063 { 1071 {
1064 setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to); 1072 setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to);
1065 to->timed_out = 0; 1073 to->timed_out = 0;
1066 to->p = current; 1074 to->p = current;
1067 } 1075 }
1068 1076
1069 static inline void set_timeout(long start_jiffies, struct aio_timeout *to, 1077 static inline void set_timeout(long start_jiffies, struct aio_timeout *to,
1070 const struct timespec *ts) 1078 const struct timespec *ts)
1071 { 1079 {
1072 to->timer.expires = start_jiffies + timespec_to_jiffies(ts); 1080 to->timer.expires = start_jiffies + timespec_to_jiffies(ts);
1073 if (time_after(to->timer.expires, jiffies)) 1081 if (time_after(to->timer.expires, jiffies))
1074 add_timer(&to->timer); 1082 add_timer(&to->timer);
1075 else 1083 else
1076 to->timed_out = 1; 1084 to->timed_out = 1;
1077 } 1085 }
1078 1086
1079 static inline void clear_timeout(struct aio_timeout *to) 1087 static inline void clear_timeout(struct aio_timeout *to)
1080 { 1088 {
1081 del_singleshot_timer_sync(&to->timer); 1089 del_singleshot_timer_sync(&to->timer);
1082 } 1090 }
1083 1091
1084 static int read_events(struct kioctx *ctx, 1092 static int read_events(struct kioctx *ctx,
1085 long min_nr, long nr, 1093 long min_nr, long nr,
1086 struct io_event __user *event, 1094 struct io_event __user *event,
1087 struct timespec __user *timeout) 1095 struct timespec __user *timeout)
1088 { 1096 {
1089 long start_jiffies = jiffies; 1097 long start_jiffies = jiffies;
1090 struct task_struct *tsk = current; 1098 struct task_struct *tsk = current;
1091 DECLARE_WAITQUEUE(wait, tsk); 1099 DECLARE_WAITQUEUE(wait, tsk);
1092 int ret; 1100 int ret;
1093 int i = 0; 1101 int i = 0;
1094 struct io_event ent; 1102 struct io_event ent;
1095 struct aio_timeout to; 1103 struct aio_timeout to;
1096 int retry = 0; 1104 int retry = 0;
1097 1105
1098 /* needed to zero any padding within an entry (there shouldn't be 1106 /* needed to zero any padding within an entry (there shouldn't be
1099 * any, but C is fun!) 1107 * any, but C is fun!)
1100 */ 1108 */
1101 memset(&ent, 0, sizeof(ent)); 1109 memset(&ent, 0, sizeof(ent));
1102 retry: 1110 retry:
1103 ret = 0; 1111 ret = 0;
1104 while (likely(i < nr)) { 1112 while (likely(i < nr)) {
1105 ret = aio_read_evt(ctx, &ent); 1113 ret = aio_read_evt(ctx, &ent);
1106 if (unlikely(ret <= 0)) 1114 if (unlikely(ret <= 0))
1107 break; 1115 break;
1108 1116
1109 dprintk("read event: %Lx %Lx %Lx %Lx\n", 1117 dprintk("read event: %Lx %Lx %Lx %Lx\n",
1110 ent.data, ent.obj, ent.res, ent.res2); 1118 ent.data, ent.obj, ent.res, ent.res2);
1111 1119
1112 /* Could we split the check in two? */ 1120 /* Could we split the check in two? */
1113 ret = -EFAULT; 1121 ret = -EFAULT;
1114 if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { 1122 if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
1115 dprintk("aio: lost an event due to EFAULT.\n"); 1123 dprintk("aio: lost an event due to EFAULT.\n");
1116 break; 1124 break;
1117 } 1125 }
1118 ret = 0; 1126 ret = 0;
1119 1127
1120 /* Good, event copied to userland, update counts. */ 1128 /* Good, event copied to userland, update counts. */
1121 event ++; 1129 event ++;
1122 i ++; 1130 i ++;
1123 } 1131 }
1124 1132
1125 if (min_nr <= i) 1133 if (min_nr <= i)
1126 return i; 1134 return i;
1127 if (ret) 1135 if (ret)
1128 return ret; 1136 return ret;
1129 1137
1130 /* End fast path */ 1138 /* End fast path */
1131 1139
1132 /* racy check, but it gets redone */ 1140 /* racy check, but it gets redone */
1133 if (!retry && unlikely(!list_empty(&ctx->run_list))) { 1141 if (!retry && unlikely(!list_empty(&ctx->run_list))) {
1134 retry = 1; 1142 retry = 1;
1135 aio_run_all_iocbs(ctx); 1143 aio_run_all_iocbs(ctx);
1136 goto retry; 1144 goto retry;
1137 } 1145 }
1138 1146
1139 init_timeout(&to); 1147 init_timeout(&to);
1140 if (timeout) { 1148 if (timeout) {
1141 struct timespec ts; 1149 struct timespec ts;
1142 ret = -EFAULT; 1150 ret = -EFAULT;
1143 if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) 1151 if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
1144 goto out; 1152 goto out;
1145 1153
1146 set_timeout(start_jiffies, &to, &ts); 1154 set_timeout(start_jiffies, &to, &ts);
1147 } 1155 }
1148 1156
1149 while (likely(i < nr)) { 1157 while (likely(i < nr)) {
1150 add_wait_queue_exclusive(&ctx->wait, &wait); 1158 add_wait_queue_exclusive(&ctx->wait, &wait);
1151 do { 1159 do {
1152 set_task_state(tsk, TASK_INTERRUPTIBLE); 1160 set_task_state(tsk, TASK_INTERRUPTIBLE);
1153 ret = aio_read_evt(ctx, &ent); 1161 ret = aio_read_evt(ctx, &ent);
1154 if (ret) 1162 if (ret)
1155 break; 1163 break;
1156 if (min_nr <= i) 1164 if (min_nr <= i)
1157 break; 1165 break;
1158 if (unlikely(ctx->dead)) { 1166 if (unlikely(ctx->dead)) {
1159 ret = -EINVAL; 1167 ret = -EINVAL;
1160 break; 1168 break;
1161 } 1169 }
1162 if (to.timed_out) /* Only check after read evt */ 1170 if (to.timed_out) /* Only check after read evt */
1163 break; 1171 break;
1164 /* Try to only show up in io wait if there are ops 1172 /* Try to only show up in io wait if there are ops
1165 * in flight */ 1173 * in flight */
1166 if (ctx->reqs_active) 1174 if (ctx->reqs_active)
1167 io_schedule(); 1175 io_schedule();
1168 else 1176 else
1169 schedule(); 1177 schedule();
1170 if (signal_pending(tsk)) { 1178 if (signal_pending(tsk)) {
1171 ret = -EINTR; 1179 ret = -EINTR;
1172 break; 1180 break;
1173 } 1181 }
1174 /*ret = aio_read_evt(ctx, &ent);*/ 1182 /*ret = aio_read_evt(ctx, &ent);*/
1175 } while (1) ; 1183 } while (1) ;
1176 1184
1177 set_task_state(tsk, TASK_RUNNING); 1185 set_task_state(tsk, TASK_RUNNING);
1178 remove_wait_queue(&ctx->wait, &wait); 1186 remove_wait_queue(&ctx->wait, &wait);
1179 1187
1180 if (unlikely(ret <= 0)) 1188 if (unlikely(ret <= 0))
1181 break; 1189 break;
1182 1190
1183 ret = -EFAULT; 1191 ret = -EFAULT;
1184 if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { 1192 if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
1185 dprintk("aio: lost an event due to EFAULT.\n"); 1193 dprintk("aio: lost an event due to EFAULT.\n");
1186 break; 1194 break;
1187 } 1195 }
1188 1196
1189 /* Good, event copied to userland, update counts. */ 1197 /* Good, event copied to userland, update counts. */
1190 event ++; 1198 event ++;
1191 i ++; 1199 i ++;
1192 } 1200 }
1193 1201
1194 if (timeout) 1202 if (timeout)
1195 clear_timeout(&to); 1203 clear_timeout(&to);
1196 out: 1204 out:
1197 destroy_timer_on_stack(&to.timer); 1205 destroy_timer_on_stack(&to.timer);
1198 return i ? i : ret; 1206 return i ? i : ret;
1199 } 1207 }
1200 1208
1201 /* Take an ioctx and remove it from the list of ioctx's. Protects 1209 /* Take an ioctx and remove it from the list of ioctx's. Protects
1202 * against races with itself via ->dead. 1210 * against races with itself via ->dead.
1203 */ 1211 */
1204 static void io_destroy(struct kioctx *ioctx) 1212 static void io_destroy(struct kioctx *ioctx)
1205 { 1213 {
1206 struct mm_struct *mm = current->mm; 1214 struct mm_struct *mm = current->mm;
1207 int was_dead; 1215 int was_dead;
1208 1216
1209 /* delete the entry from the list if someone else hasn't already */ 1217 /* delete the entry from the list if someone else hasn't already */
1210 spin_lock(&mm->ioctx_lock); 1218 spin_lock(&mm->ioctx_lock);
1211 was_dead = ioctx->dead; 1219 was_dead = ioctx->dead;
1212 ioctx->dead = 1; 1220 ioctx->dead = 1;
1213 hlist_del_rcu(&ioctx->list); 1221 hlist_del_rcu(&ioctx->list);
1214 spin_unlock(&mm->ioctx_lock); 1222 spin_unlock(&mm->ioctx_lock);
1215 1223
1216 dprintk("aio_release(%p)\n", ioctx); 1224 dprintk("aio_release(%p)\n", ioctx);
1217 if (likely(!was_dead)) 1225 if (likely(!was_dead))
1218 put_ioctx(ioctx); /* twice for the list */ 1226 put_ioctx(ioctx); /* twice for the list */
1219 1227
1220 aio_cancel_all(ioctx); 1228 aio_cancel_all(ioctx);
1221 wait_for_all_aios(ioctx); 1229 wait_for_all_aios(ioctx);
1222 1230
1223 /* 1231 /*
1224 * Wake up any waiters. The setting of ctx->dead must be seen 1232 * Wake up any waiters. The setting of ctx->dead must be seen
1225 * by other CPUs at this point. Right now, we rely on the 1233 * by other CPUs at this point. Right now, we rely on the
1226 * locking done by the above calls to ensure this consistency. 1234 * locking done by the above calls to ensure this consistency.
1227 */ 1235 */
1228 wake_up(&ioctx->wait); 1236 wake_up(&ioctx->wait);
1229 put_ioctx(ioctx); /* once for the lookup */ 1237 put_ioctx(ioctx); /* once for the lookup */
1230 } 1238 }
1231 1239
1232 /* sys_io_setup: 1240 /* sys_io_setup:
1233 * Create an aio_context capable of receiving at least nr_events. 1241 * Create an aio_context capable of receiving at least nr_events.
1234 * ctxp must not point to an aio_context that already exists, and 1242 * ctxp must not point to an aio_context that already exists, and
1235 * must be initialized to 0 prior to the call. On successful 1243 * must be initialized to 0 prior to the call. On successful
1236 * creation of the aio_context, *ctxp is filled in with the resulting 1244 * creation of the aio_context, *ctxp is filled in with the resulting
1237 * handle. May fail with -EINVAL if *ctxp is not initialized, 1245 * handle. May fail with -EINVAL if *ctxp is not initialized,
1238 * if the specified nr_events exceeds internal limits. May fail 1246 * if the specified nr_events exceeds internal limits. May fail
1239 * with -EAGAIN if the specified nr_events exceeds the user's limit 1247 * with -EAGAIN if the specified nr_events exceeds the user's limit
1240 * of available events. May fail with -ENOMEM if insufficient kernel 1248 * of available events. May fail with -ENOMEM if insufficient kernel
1241 * resources are available. May fail with -EFAULT if an invalid 1249 * resources are available. May fail with -EFAULT if an invalid
1242 * pointer is passed for ctxp. Will fail with -ENOSYS if not 1250 * pointer is passed for ctxp. Will fail with -ENOSYS if not
1243 * implemented. 1251 * implemented.
1244 */ 1252 */
1245 SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) 1253 SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
1246 { 1254 {
1247 struct kioctx *ioctx = NULL; 1255 struct kioctx *ioctx = NULL;
1248 unsigned long ctx; 1256 unsigned long ctx;
1249 long ret; 1257 long ret;
1250 1258
1251 ret = get_user(ctx, ctxp); 1259 ret = get_user(ctx, ctxp);
1252 if (unlikely(ret)) 1260 if (unlikely(ret))
1253 goto out; 1261 goto out;
1254 1262
1255 ret = -EINVAL; 1263 ret = -EINVAL;
1256 if (unlikely(ctx || nr_events == 0)) { 1264 if (unlikely(ctx || nr_events == 0)) {
1257 pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n", 1265 pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
1258 ctx, nr_events); 1266 ctx, nr_events);
1259 goto out; 1267 goto out;
1260 } 1268 }
1261 1269
1262 ioctx = ioctx_alloc(nr_events); 1270 ioctx = ioctx_alloc(nr_events);
1263 ret = PTR_ERR(ioctx); 1271 ret = PTR_ERR(ioctx);
1264 if (!IS_ERR(ioctx)) { 1272 if (!IS_ERR(ioctx)) {
1265 ret = put_user(ioctx->user_id, ctxp); 1273 ret = put_user(ioctx->user_id, ctxp);
1266 if (!ret) 1274 if (!ret)
1267 return 0; 1275 return 0;
1268 1276
1269 get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */ 1277 get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */
1270 io_destroy(ioctx); 1278 io_destroy(ioctx);
1271 } 1279 }
1272 1280
1273 out: 1281 out:
1274 return ret; 1282 return ret;
1275 } 1283 }
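
For reference, a minimal userspace sketch of the contract documented above, assuming raw syscalls via syscall(2): the handle must start as zero, nr_events must be non-zero, and the context is torn down with io_destroy(). The nr_events value of 128 is an arbitrary illustration.

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

int main(void)
{
	aio_context_t ctx = 0;	/* must start as 0, or io_setup() returns -EINVAL */

	if (syscall(__NR_io_setup, 128, &ctx) < 0) {	/* room for at least 128 events */
		perror("io_setup");
		return 1;
	}

	/* ... io_submit()/io_getevents() would go here ... */

	syscall(__NR_io_destroy, ctx);	/* cancels outstanding AIO, blocks on completion */
	return 0;
}
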
1276 1284
1277 /* sys_io_destroy: 1285 /* sys_io_destroy:
1278 * Destroy the aio_context specified. May cancel any outstanding 1286 * Destroy the aio_context specified. May cancel any outstanding
1279 * AIOs and block on completion. Will fail with -ENOSYS if not 1287 * AIOs and block on completion. Will fail with -ENOSYS if not
1280 * implemented. May fail with -EINVAL if the context pointed to 1288 * implemented. May fail with -EINVAL if the context pointed to
1281 * is invalid. 1289 * is invalid.
1282 */ 1290 */
1283 SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) 1291 SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
1284 { 1292 {
1285 struct kioctx *ioctx = lookup_ioctx(ctx); 1293 struct kioctx *ioctx = lookup_ioctx(ctx);
1286 if (likely(NULL != ioctx)) { 1294 if (likely(NULL != ioctx)) {
1287 io_destroy(ioctx); 1295 io_destroy(ioctx);
1288 return 0; 1296 return 0;
1289 } 1297 }
1290 pr_debug("EINVAL: io_destroy: invalid context id\n"); 1298 pr_debug("EINVAL: io_destroy: invalid context id\n");
1291 return -EINVAL; 1299 return -EINVAL;
1292 } 1300 }
1293 1301
1294 static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) 1302 static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
1295 { 1303 {
1296 struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg]; 1304 struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg];
1297 1305
1298 BUG_ON(ret <= 0); 1306 BUG_ON(ret <= 0);
1299 1307
1300 while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) { 1308 while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) {
1301 ssize_t this = min((ssize_t)iov->iov_len, ret); 1309 ssize_t this = min((ssize_t)iov->iov_len, ret);
1302 iov->iov_base += this; 1310 iov->iov_base += this;
1303 iov->iov_len -= this; 1311 iov->iov_len -= this;
1304 iocb->ki_left -= this; 1312 iocb->ki_left -= this;
1305 ret -= this; 1313 ret -= this;
1306 if (iov->iov_len == 0) { 1314 if (iov->iov_len == 0) {
1307 iocb->ki_cur_seg++; 1315 iocb->ki_cur_seg++;
1308 iov++; 1316 iov++;
1309 } 1317 }
1310 } 1318 }
1311 1319
1312 /* the caller should not have done more io than what fit in 1320 /* the caller should not have done more io than what fit in
1313 * the remaining iovecs */ 1321 * the remaining iovecs */
1314 BUG_ON(ret > 0 && iocb->ki_left == 0); 1322 BUG_ON(ret > 0 && iocb->ki_left == 0);
1315 } 1323 }
1316 1324
1317 static ssize_t aio_rw_vect_retry(struct kiocb *iocb) 1325 static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
1318 { 1326 {
1319 struct file *file = iocb->ki_filp; 1327 struct file *file = iocb->ki_filp;
1320 struct address_space *mapping = file->f_mapping; 1328 struct address_space *mapping = file->f_mapping;
1321 struct inode *inode = mapping->host; 1329 struct inode *inode = mapping->host;
1322 ssize_t (*rw_op)(struct kiocb *, const struct iovec *, 1330 ssize_t (*rw_op)(struct kiocb *, const struct iovec *,
1323 unsigned long, loff_t); 1331 unsigned long, loff_t);
1324 ssize_t ret = 0; 1332 ssize_t ret = 0;
1325 unsigned short opcode; 1333 unsigned short opcode;
1326 1334
1327 if ((iocb->ki_opcode == IOCB_CMD_PREADV) || 1335 if ((iocb->ki_opcode == IOCB_CMD_PREADV) ||
1328 (iocb->ki_opcode == IOCB_CMD_PREAD)) { 1336 (iocb->ki_opcode == IOCB_CMD_PREAD)) {
1329 rw_op = file->f_op->aio_read; 1337 rw_op = file->f_op->aio_read;
1330 opcode = IOCB_CMD_PREADV; 1338 opcode = IOCB_CMD_PREADV;
1331 } else { 1339 } else {
1332 rw_op = file->f_op->aio_write; 1340 rw_op = file->f_op->aio_write;
1333 opcode = IOCB_CMD_PWRITEV; 1341 opcode = IOCB_CMD_PWRITEV;
1334 } 1342 }
1335 1343
1336 /* This matches the pread()/pwrite() logic */ 1344 /* This matches the pread()/pwrite() logic */
1337 if (iocb->ki_pos < 0) 1345 if (iocb->ki_pos < 0)
1338 return -EINVAL; 1346 return -EINVAL;
1339 1347
1340 do { 1348 do {
1341 ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], 1349 ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
1342 iocb->ki_nr_segs - iocb->ki_cur_seg, 1350 iocb->ki_nr_segs - iocb->ki_cur_seg,
1343 iocb->ki_pos); 1351 iocb->ki_pos);
1344 if (ret > 0) 1352 if (ret > 0)
1345 aio_advance_iovec(iocb, ret); 1353 aio_advance_iovec(iocb, ret);
1346 1354
1347 /* retry all partial writes. retry partial reads as long as it's a 1355 /* retry all partial writes. retry partial reads as long as it's a
1348 * regular file. */ 1356 * regular file. */
1349 } while (ret > 0 && iocb->ki_left > 0 && 1357 } while (ret > 0 && iocb->ki_left > 0 &&
1350 (opcode == IOCB_CMD_PWRITEV || 1358 (opcode == IOCB_CMD_PWRITEV ||
1351 (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); 1359 (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
1352 1360
1353 /* This means we must have transferred all that we could */ 1361 /* This means we must have transferred all that we could */
1354 /* No need to retry anymore */ 1362 /* No need to retry anymore */
1355 if ((ret == 0) || (iocb->ki_left == 0)) 1363 if ((ret == 0) || (iocb->ki_left == 0))
1356 ret = iocb->ki_nbytes - iocb->ki_left; 1364 ret = iocb->ki_nbytes - iocb->ki_left;
1357 1365
1358 /* If we managed to write some out we return that, rather than 1366 /* If we managed to write some out we return that, rather than
1359 * the eventual error. */ 1367 * the eventual error. */
1360 if (opcode == IOCB_CMD_PWRITEV 1368 if (opcode == IOCB_CMD_PWRITEV
1361 && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY 1369 && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY
1362 && iocb->ki_nbytes - iocb->ki_left) 1370 && iocb->ki_nbytes - iocb->ki_left)
1363 ret = iocb->ki_nbytes - iocb->ki_left; 1371 ret = iocb->ki_nbytes - iocb->ki_left;
1364 1372
1365 return ret; 1373 return ret;
1366 } 1374 }
1367 1375
1368 static ssize_t aio_fdsync(struct kiocb *iocb) 1376 static ssize_t aio_fdsync(struct kiocb *iocb)
1369 { 1377 {
1370 struct file *file = iocb->ki_filp; 1378 struct file *file = iocb->ki_filp;
1371 ssize_t ret = -EINVAL; 1379 ssize_t ret = -EINVAL;
1372 1380
1373 if (file->f_op->aio_fsync) 1381 if (file->f_op->aio_fsync)
1374 ret = file->f_op->aio_fsync(iocb, 1); 1382 ret = file->f_op->aio_fsync(iocb, 1);
1375 return ret; 1383 return ret;
1376 } 1384 }
1377 1385
1378 static ssize_t aio_fsync(struct kiocb *iocb) 1386 static ssize_t aio_fsync(struct kiocb *iocb)
1379 { 1387 {
1380 struct file *file = iocb->ki_filp; 1388 struct file *file = iocb->ki_filp;
1381 ssize_t ret = -EINVAL; 1389 ssize_t ret = -EINVAL;
1382 1390
1383 if (file->f_op->aio_fsync) 1391 if (file->f_op->aio_fsync)
1384 ret = file->f_op->aio_fsync(iocb, 0); 1392 ret = file->f_op->aio_fsync(iocb, 0);
1385 return ret; 1393 return ret;
1386 } 1394 }
1387 1395
1388 static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) 1396 static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
1389 { 1397 {
1390 ssize_t ret; 1398 ssize_t ret;
1391 1399
1392 #ifdef CONFIG_COMPAT 1400 #ifdef CONFIG_COMPAT
1393 if (compat) 1401 if (compat)
1394 ret = compat_rw_copy_check_uvector(type, 1402 ret = compat_rw_copy_check_uvector(type,
1395 (struct compat_iovec __user *)kiocb->ki_buf, 1403 (struct compat_iovec __user *)kiocb->ki_buf,
1396 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, 1404 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1397 &kiocb->ki_iovec); 1405 &kiocb->ki_iovec);
1398 else 1406 else
1399 #endif 1407 #endif
1400 ret = rw_copy_check_uvector(type, 1408 ret = rw_copy_check_uvector(type,
1401 (struct iovec __user *)kiocb->ki_buf, 1409 (struct iovec __user *)kiocb->ki_buf,
1402 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, 1410 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1403 &kiocb->ki_iovec); 1411 &kiocb->ki_iovec);
1404 if (ret < 0) 1412 if (ret < 0)
1405 goto out; 1413 goto out;
1406 1414
1407 kiocb->ki_nr_segs = kiocb->ki_nbytes; 1415 kiocb->ki_nr_segs = kiocb->ki_nbytes;
1408 kiocb->ki_cur_seg = 0; 1416 kiocb->ki_cur_seg = 0;
1409 /* ki_nbytes/left now reflect bytes instead of segs */ 1417 /* ki_nbytes/left now reflect bytes instead of segs */
1410 kiocb->ki_nbytes = ret; 1418 kiocb->ki_nbytes = ret;
1411 kiocb->ki_left = ret; 1419 kiocb->ki_left = ret;
1412 1420
1413 ret = 0; 1421 ret = 0;
1414 out: 1422 out:
1415 return ret; 1423 return ret;
1416 } 1424 }
1417 1425
1418 static ssize_t aio_setup_single_vector(struct kiocb *kiocb) 1426 static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
1419 { 1427 {
1420 kiocb->ki_iovec = &kiocb->ki_inline_vec; 1428 kiocb->ki_iovec = &kiocb->ki_inline_vec;
1421 kiocb->ki_iovec->iov_base = kiocb->ki_buf; 1429 kiocb->ki_iovec->iov_base = kiocb->ki_buf;
1422 kiocb->ki_iovec->iov_len = kiocb->ki_left; 1430 kiocb->ki_iovec->iov_len = kiocb->ki_left;
1423 kiocb->ki_nr_segs = 1; 1431 kiocb->ki_nr_segs = 1;
1424 kiocb->ki_cur_seg = 0; 1432 kiocb->ki_cur_seg = 0;
1425 return 0; 1433 return 0;
1426 } 1434 }
1427 1435
1428 /* 1436 /*
1429 * aio_setup_iocb: 1437 * aio_setup_iocb:
1430 * Performs the initial checks and aio retry method 1438 * Performs the initial checks and aio retry method
1431 * setup for the kiocb at the time of io submission. 1439 * setup for the kiocb at the time of io submission.
1432 */ 1440 */
1433 static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) 1441 static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
1434 { 1442 {
1435 struct file *file = kiocb->ki_filp; 1443 struct file *file = kiocb->ki_filp;
1436 ssize_t ret = 0; 1444 ssize_t ret = 0;
1437 1445
1438 switch (kiocb->ki_opcode) { 1446 switch (kiocb->ki_opcode) {
1439 case IOCB_CMD_PREAD: 1447 case IOCB_CMD_PREAD:
1440 ret = -EBADF; 1448 ret = -EBADF;
1441 if (unlikely(!(file->f_mode & FMODE_READ))) 1449 if (unlikely(!(file->f_mode & FMODE_READ)))
1442 break; 1450 break;
1443 ret = -EFAULT; 1451 ret = -EFAULT;
1444 if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, 1452 if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf,
1445 kiocb->ki_left))) 1453 kiocb->ki_left)))
1446 break; 1454 break;
1447 ret = security_file_permission(file, MAY_READ); 1455 ret = security_file_permission(file, MAY_READ);
1448 if (unlikely(ret)) 1456 if (unlikely(ret))
1449 break; 1457 break;
1450 ret = aio_setup_single_vector(kiocb); 1458 ret = aio_setup_single_vector(kiocb);
1451 if (ret) 1459 if (ret)
1452 break; 1460 break;
1453 ret = -EINVAL; 1461 ret = -EINVAL;
1454 if (file->f_op->aio_read) 1462 if (file->f_op->aio_read)
1455 kiocb->ki_retry = aio_rw_vect_retry; 1463 kiocb->ki_retry = aio_rw_vect_retry;
1456 break; 1464 break;
1457 case IOCB_CMD_PWRITE: 1465 case IOCB_CMD_PWRITE:
1458 ret = -EBADF; 1466 ret = -EBADF;
1459 if (unlikely(!(file->f_mode & FMODE_WRITE))) 1467 if (unlikely(!(file->f_mode & FMODE_WRITE)))
1460 break; 1468 break;
1461 ret = -EFAULT; 1469 ret = -EFAULT;
1462 if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, 1470 if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf,
1463 kiocb->ki_left))) 1471 kiocb->ki_left)))
1464 break; 1472 break;
1465 ret = security_file_permission(file, MAY_WRITE); 1473 ret = security_file_permission(file, MAY_WRITE);
1466 if (unlikely(ret)) 1474 if (unlikely(ret))
1467 break; 1475 break;
1468 ret = aio_setup_single_vector(kiocb); 1476 ret = aio_setup_single_vector(kiocb);
1469 if (ret) 1477 if (ret)
1470 break; 1478 break;
1471 ret = -EINVAL; 1479 ret = -EINVAL;
1472 if (file->f_op->aio_write) 1480 if (file->f_op->aio_write)
1473 kiocb->ki_retry = aio_rw_vect_retry; 1481 kiocb->ki_retry = aio_rw_vect_retry;
1474 break; 1482 break;
1475 case IOCB_CMD_PREADV: 1483 case IOCB_CMD_PREADV:
1476 ret = -EBADF; 1484 ret = -EBADF;
1477 if (unlikely(!(file->f_mode & FMODE_READ))) 1485 if (unlikely(!(file->f_mode & FMODE_READ)))
1478 break; 1486 break;
1479 ret = security_file_permission(file, MAY_READ); 1487 ret = security_file_permission(file, MAY_READ);
1480 if (unlikely(ret)) 1488 if (unlikely(ret))
1481 break; 1489 break;
1482 ret = aio_setup_vectored_rw(READ, kiocb, compat); 1490 ret = aio_setup_vectored_rw(READ, kiocb, compat);
1483 if (ret) 1491 if (ret)
1484 break; 1492 break;
1485 ret = -EINVAL; 1493 ret = -EINVAL;
1486 if (file->f_op->aio_read) 1494 if (file->f_op->aio_read)
1487 kiocb->ki_retry = aio_rw_vect_retry; 1495 kiocb->ki_retry = aio_rw_vect_retry;
1488 break; 1496 break;
1489 case IOCB_CMD_PWRITEV: 1497 case IOCB_CMD_PWRITEV:
1490 ret = -EBADF; 1498 ret = -EBADF;
1491 if (unlikely(!(file->f_mode & FMODE_WRITE))) 1499 if (unlikely(!(file->f_mode & FMODE_WRITE)))
1492 break; 1500 break;
1493 ret = security_file_permission(file, MAY_WRITE); 1501 ret = security_file_permission(file, MAY_WRITE);
1494 if (unlikely(ret)) 1502 if (unlikely(ret))
1495 break; 1503 break;
1496 ret = aio_setup_vectored_rw(WRITE, kiocb, compat); 1504 ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
1497 if (ret) 1505 if (ret)
1498 break; 1506 break;
1499 ret = -EINVAL; 1507 ret = -EINVAL;
1500 if (file->f_op->aio_write) 1508 if (file->f_op->aio_write)
1501 kiocb->ki_retry = aio_rw_vect_retry; 1509 kiocb->ki_retry = aio_rw_vect_retry;
1502 break; 1510 break;
1503 case IOCB_CMD_FDSYNC: 1511 case IOCB_CMD_FDSYNC:
1504 ret = -EINVAL; 1512 ret = -EINVAL;
1505 if (file->f_op->aio_fsync) 1513 if (file->f_op->aio_fsync)
1506 kiocb->ki_retry = aio_fdsync; 1514 kiocb->ki_retry = aio_fdsync;
1507 break; 1515 break;
1508 case IOCB_CMD_FSYNC: 1516 case IOCB_CMD_FSYNC:
1509 ret = -EINVAL; 1517 ret = -EINVAL;
1510 if (file->f_op->aio_fsync) 1518 if (file->f_op->aio_fsync)
1511 kiocb->ki_retry = aio_fsync; 1519 kiocb->ki_retry = aio_fsync;
1512 break; 1520 break;
1513 default: 1521 default:
1514 dprintk("EINVAL: io_submit: no operation provided\n"); 1522 dprintk("EINVAL: io_submit: no operation provided\n");
1515 ret = -EINVAL; 1523 ret = -EINVAL;
1516 } 1524 }
1517 1525
1518 if (!kiocb->ki_retry) 1526 if (!kiocb->ki_retry)
1519 return ret; 1527 return ret;
1520 1528
1521 return 0; 1529 return 0;
1522 } 1530 }
1523 1531
1524 static void aio_batch_add(struct address_space *mapping, 1532 static void aio_batch_add(struct address_space *mapping,
1525 struct hlist_head *batch_hash) 1533 struct hlist_head *batch_hash)
1526 { 1534 {
1527 struct aio_batch_entry *abe; 1535 struct aio_batch_entry *abe;
1528 struct hlist_node *pos; 1536 struct hlist_node *pos;
1529 unsigned bucket; 1537 unsigned bucket;
1530 1538
1531 bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS); 1539 bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
1532 hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) { 1540 hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
1533 if (abe->mapping == mapping) 1541 if (abe->mapping == mapping)
1534 return; 1542 return;
1535 } 1543 }
1536 1544
1537 abe = mempool_alloc(abe_pool, GFP_KERNEL); 1545 abe = mempool_alloc(abe_pool, GFP_KERNEL);
1538 BUG_ON(!igrab(mapping->host)); 1546 BUG_ON(!igrab(mapping->host));
1539 abe->mapping = mapping; 1547 abe->mapping = mapping;
1540 hlist_add_head(&abe->list, &batch_hash[bucket]); 1548 hlist_add_head(&abe->list, &batch_hash[bucket]);
1541 return; 1549 return;
1542 } 1550 }
1543 1551
1544 static void aio_batch_free(struct hlist_head *batch_hash) 1552 static void aio_batch_free(struct hlist_head *batch_hash)
1545 { 1553 {
1546 struct aio_batch_entry *abe; 1554 struct aio_batch_entry *abe;
1547 struct hlist_node *pos, *n; 1555 struct hlist_node *pos, *n;
1548 int i; 1556 int i;
1549 1557
1550 for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) { 1558 for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
1551 hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) { 1559 hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
1552 blk_run_address_space(abe->mapping); 1560 blk_run_address_space(abe->mapping);
1553 iput(abe->mapping->host); 1561 iput(abe->mapping->host);
1554 hlist_del(&abe->list); 1562 hlist_del(&abe->list);
1555 mempool_free(abe, abe_pool); 1563 mempool_free(abe, abe_pool);
1556 } 1564 }
1557 } 1565 }
1558 } 1566 }
1559 1567
1560 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1568 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1561 struct iocb *iocb, struct hlist_head *batch_hash, 1569 struct iocb *iocb, struct hlist_head *batch_hash,
1562 bool compat) 1570 bool compat)
1563 { 1571 {
1564 struct kiocb *req; 1572 struct kiocb *req;
1565 struct file *file; 1573 struct file *file;
1566 ssize_t ret; 1574 ssize_t ret;
1567 1575
1568 /* enforce forwards compatibility on users */ 1576 /* enforce forwards compatibility on users */
1569 if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) { 1577 if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
1570 pr_debug("EINVAL: io_submit: reserve field set\n"); 1578 pr_debug("EINVAL: io_submit: reserve field set\n");
1571 return -EINVAL; 1579 return -EINVAL;
1572 } 1580 }
1573 1581
1574 /* prevent overflows */ 1582 /* prevent overflows */
1575 if (unlikely( 1583 if (unlikely(
1576 (iocb->aio_buf != (unsigned long)iocb->aio_buf) || 1584 (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
1577 (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) || 1585 (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
1578 ((ssize_t)iocb->aio_nbytes < 0) 1586 ((ssize_t)iocb->aio_nbytes < 0)
1579 )) { 1587 )) {
1580 pr_debug("EINVAL: io_submit: overflow check\n"); 1588 pr_debug("EINVAL: io_submit: overflow check\n");
1581 return -EINVAL; 1589 return -EINVAL;
1582 } 1590 }
1583 1591
1584 file = fget(iocb->aio_fildes); 1592 file = fget(iocb->aio_fildes);
1585 if (unlikely(!file)) 1593 if (unlikely(!file))
1586 return -EBADF; 1594 return -EBADF;
1587 1595
1588 req = aio_get_req(ctx); /* returns with 2 references to req */ 1596 req = aio_get_req(ctx); /* returns with 2 references to req */
1589 if (unlikely(!req)) { 1597 if (unlikely(!req)) {
1590 fput(file); 1598 fput(file);
1591 return -EAGAIN; 1599 return -EAGAIN;
1592 } 1600 }
1593 req->ki_filp = file; 1601 req->ki_filp = file;
1594 if (iocb->aio_flags & IOCB_FLAG_RESFD) { 1602 if (iocb->aio_flags & IOCB_FLAG_RESFD) {
1595 /* 1603 /*
1596 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an 1604 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
1597 * instance of the file* now. The file descriptor must be 1605 * instance of the file* now. The file descriptor must be
1598 * an eventfd() fd, and will be signaled for each completed 1606 * an eventfd() fd, and will be signaled for each completed
1599 * event using the eventfd_signal() function. 1607 * event using the eventfd_signal() function.
1600 */ 1608 */
1601 req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); 1609 req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
1602 if (IS_ERR(req->ki_eventfd)) { 1610 if (IS_ERR(req->ki_eventfd)) {
1603 ret = PTR_ERR(req->ki_eventfd); 1611 ret = PTR_ERR(req->ki_eventfd);
1604 req->ki_eventfd = NULL; 1612 req->ki_eventfd = NULL;
1605 goto out_put_req; 1613 goto out_put_req;
1606 } 1614 }
1607 } 1615 }
1608 1616
1609 ret = put_user(req->ki_key, &user_iocb->aio_key); 1617 ret = put_user(req->ki_key, &user_iocb->aio_key);
1610 if (unlikely(ret)) { 1618 if (unlikely(ret)) {
1611 dprintk("EFAULT: aio_key\n"); 1619 dprintk("EFAULT: aio_key\n");
1612 goto out_put_req; 1620 goto out_put_req;
1613 } 1621 }
1614 1622
1615 req->ki_obj.user = user_iocb; 1623 req->ki_obj.user = user_iocb;
1616 req->ki_user_data = iocb->aio_data; 1624 req->ki_user_data = iocb->aio_data;
1617 req->ki_pos = iocb->aio_offset; 1625 req->ki_pos = iocb->aio_offset;
1618 1626
1619 req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf; 1627 req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf;
1620 req->ki_left = req->ki_nbytes = iocb->aio_nbytes; 1628 req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
1621 req->ki_opcode = iocb->aio_lio_opcode; 1629 req->ki_opcode = iocb->aio_lio_opcode;
1622 1630
1623 ret = aio_setup_iocb(req, compat); 1631 ret = aio_setup_iocb(req, compat);
1624 1632
1625 if (ret) 1633 if (ret)
1626 goto out_put_req; 1634 goto out_put_req;
1627 1635
1628 spin_lock_irq(&ctx->ctx_lock); 1636 spin_lock_irq(&ctx->ctx_lock);
1629 aio_run_iocb(req); 1637 aio_run_iocb(req);
1630 if (!list_empty(&ctx->run_list)) { 1638 if (!list_empty(&ctx->run_list)) {
1631 /* drain the run list */ 1639 /* drain the run list */
1632 while (__aio_run_iocbs(ctx)) 1640 while (__aio_run_iocbs(ctx))
1633 ; 1641 ;
1634 } 1642 }
1635 spin_unlock_irq(&ctx->ctx_lock); 1643 spin_unlock_irq(&ctx->ctx_lock);
1636 if (req->ki_opcode == IOCB_CMD_PREAD || 1644 if (req->ki_opcode == IOCB_CMD_PREAD ||
1637 req->ki_opcode == IOCB_CMD_PREADV || 1645 req->ki_opcode == IOCB_CMD_PREADV ||
1638 req->ki_opcode == IOCB_CMD_PWRITE || 1646 req->ki_opcode == IOCB_CMD_PWRITE ||
1639 req->ki_opcode == IOCB_CMD_PWRITEV) 1647 req->ki_opcode == IOCB_CMD_PWRITEV)
1640 aio_batch_add(file->f_mapping, batch_hash); 1648 aio_batch_add(file->f_mapping, batch_hash);
1641 1649
1642 aio_put_req(req); /* drop extra ref to req */ 1650 aio_put_req(req); /* drop extra ref to req */
1643 return 0; 1651 return 0;
1644 1652
1645 out_put_req: 1653 out_put_req:
1646 aio_put_req(req); /* drop extra ref to req */ 1654 aio_put_req(req); /* drop extra ref to req */
1647 aio_put_req(req); /* drop i/o ref to req */ 1655 aio_put_req(req); /* drop i/o ref to req */
1648 return ret; 1656 return ret;
1649 } 1657 }
1650 1658
1651 long do_io_submit(aio_context_t ctx_id, long nr, 1659 long do_io_submit(aio_context_t ctx_id, long nr,
1652 struct iocb __user *__user *iocbpp, bool compat) 1660 struct iocb __user *__user *iocbpp, bool compat)
1653 { 1661 {
1654 struct kioctx *ctx; 1662 struct kioctx *ctx;
1655 long ret = 0; 1663 long ret = 0;
1656 int i; 1664 int i;
1657 struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, }; 1665 struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
1658 1666
1659 if (unlikely(nr < 0)) 1667 if (unlikely(nr < 0))
1660 return -EINVAL; 1668 return -EINVAL;
1661 1669
1662 if (unlikely(nr > LONG_MAX/sizeof(*iocbpp))) 1670 if (unlikely(nr > LONG_MAX/sizeof(*iocbpp)))
1663 nr = LONG_MAX/sizeof(*iocbpp); 1671 nr = LONG_MAX/sizeof(*iocbpp);
1664 1672
1665 if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) 1673 if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp)))))
1666 return -EFAULT; 1674 return -EFAULT;
1667 1675
1668 ctx = lookup_ioctx(ctx_id); 1676 ctx = lookup_ioctx(ctx_id);
1669 if (unlikely(!ctx)) { 1677 if (unlikely(!ctx)) {
1670 pr_debug("EINVAL: io_submit: invalid context id\n"); 1678 pr_debug("EINVAL: io_submit: invalid context id\n");
1671 return -EINVAL; 1679 return -EINVAL;
1672 } 1680 }
1673 1681
1674 /* 1682 /*
1675 * AKPM: should this return a partial result if some of the IOs were 1683 * AKPM: should this return a partial result if some of the IOs were
1676 * successfully submitted? 1684 * successfully submitted?
1677 */ 1685 */
1678 for (i=0; i<nr; i++) { 1686 for (i=0; i<nr; i++) {
1679 struct iocb __user *user_iocb; 1687 struct iocb __user *user_iocb;
1680 struct iocb tmp; 1688 struct iocb tmp;
1681 1689
1682 if (unlikely(__get_user(user_iocb, iocbpp + i))) { 1690 if (unlikely(__get_user(user_iocb, iocbpp + i))) {
1683 ret = -EFAULT; 1691 ret = -EFAULT;
1684 break; 1692 break;
1685 } 1693 }
1686 1694
1687 if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) { 1695 if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {
1688 ret = -EFAULT; 1696 ret = -EFAULT;
1689 break; 1697 break;
1690 } 1698 }
1691 1699
1692 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat); 1700 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
1693 if (ret) 1701 if (ret)
1694 break; 1702 break;
1695 } 1703 }
1696 aio_batch_free(batch_hash); 1704 aio_batch_free(batch_hash);
1697 1705
1698 put_ioctx(ctx); 1706 put_ioctx(ctx);
1699 return i ? i : ret; 1707 return i ? i : ret;
1700 } 1708 }
1701 1709
1702 /* sys_io_submit: 1710 /* sys_io_submit:
1703 * Queue the nr iocbs pointed to by iocbpp for processing. Returns 1711 * Queue the nr iocbs pointed to by iocbpp for processing. Returns
1704 * the number of iocbs queued. May return -EINVAL if the aio_context 1712 * the number of iocbs queued. May return -EINVAL if the aio_context
1705 * specified by ctx_id is invalid, if nr is < 0, if the iocb at 1713 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
1706 * *iocbpp[0] is not properly initialized, if the operation specified 1714 * *iocbpp[0] is not properly initialized, if the operation specified
1707 * is invalid for the file descriptor in the iocb. May fail with 1715 * is invalid for the file descriptor in the iocb. May fail with
1708 * -EFAULT if any of the data structures point to invalid data. May 1716 * -EFAULT if any of the data structures point to invalid data. May
1709 * fail with -EBADF if the file descriptor specified in the first 1717 * fail with -EBADF if the file descriptor specified in the first
1710 * iocb is invalid. May fail with -EAGAIN if insufficient resources 1718 * iocb is invalid. May fail with -EAGAIN if insufficient resources
1711 * are available to queue any iocbs. Will return 0 if nr is 0. Will 1719 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1712 * fail with -ENOSYS if not implemented. 1720 * fail with -ENOSYS if not implemented.
1713 */ 1721 */
1714 SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, 1722 SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1715 struct iocb __user * __user *, iocbpp) 1723 struct iocb __user * __user *, iocbpp)
1716 { 1724 {
1717 return do_io_submit(ctx_id, nr, iocbpp, 0); 1725 return do_io_submit(ctx_id, nr, iocbpp, 0);
1718 } 1726 }
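
For reference, a hedged userspace sketch of one submission against the checks above: the iocb is zeroed so aio_reserved1/aio_reserved2 pass the forwards-compatibility test, and aio_data is an opaque cookie echoed back in io_event.data. The helper name submit_one_read and the cookie value are illustrative assumptions; the ctx and fd are assumed to come from io_setup() and open().

#define _GNU_SOURCE
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

/* Submit a single asynchronous read; returns 1, 0, or -1 with errno set. */
static long submit_one_read(aio_context_t ctx, int fd, void *buf, size_t len)
{
	struct iocb cb;
	struct iocb *cbs[1] = { &cb };

	memset(&cb, 0, sizeof(cb));		/* reserved fields must be zero */
	cb.aio_lio_opcode = IOCB_CMD_PREAD;
	cb.aio_fildes = fd;
	cb.aio_buf = (uint64_t)(uintptr_t)buf;
	cb.aio_nbytes = len;
	cb.aio_offset = 0;
	cb.aio_data = 0x1234;			/* opaque cookie, echoed in io_event.data */

	return syscall(__NR_io_submit, ctx, 1, cbs);
}
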
1719 1727
1720 /* lookup_kiocb 1728 /* lookup_kiocb
1721 * Finds a given iocb for cancellation. 1729 * Finds a given iocb for cancellation.
1722 */ 1730 */
1723 static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, 1731 static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
1724 u32 key) 1732 u32 key)
1725 { 1733 {
1726 struct list_head *pos; 1734 struct list_head *pos;
1727 1735
1728 assert_spin_locked(&ctx->ctx_lock); 1736 assert_spin_locked(&ctx->ctx_lock);
1729 1737
1730 /* TODO: use a hash or array, this sucks. */ 1738 /* TODO: use a hash or array, this sucks. */
1731 list_for_each(pos, &ctx->active_reqs) { 1739 list_for_each(pos, &ctx->active_reqs) {
1732 struct kiocb *kiocb = list_kiocb(pos); 1740 struct kiocb *kiocb = list_kiocb(pos);
1733 if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key) 1741 if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key)
1734 return kiocb; 1742 return kiocb;
1735 } 1743 }
1736 return NULL; 1744 return NULL;
1737 } 1745 }
1738 1746
1739 /* sys_io_cancel: 1747 /* sys_io_cancel:
1740 * Attempts to cancel an iocb previously passed to io_submit. If 1748 * Attempts to cancel an iocb previously passed to io_submit. If
1741 * the operation is successfully cancelled, the resulting event is 1749 * the operation is successfully cancelled, the resulting event is
1742 * copied into the memory pointed to by result without being placed 1750 * copied into the memory pointed to by result without being placed
1743 * into the completion queue and 0 is returned. May fail with 1751 * into the completion queue and 0 is returned. May fail with
1744 * -EFAULT if any of the data structures pointed to are invalid. 1752 * -EFAULT if any of the data structures pointed to are invalid.
1745 * May fail with -EINVAL if aio_context specified by ctx_id is 1753 * May fail with -EINVAL if aio_context specified by ctx_id is
1746 * invalid. May fail with -EAGAIN if the iocb specified was not 1754 * invalid. May fail with -EAGAIN if the iocb specified was not
1747 * cancelled. Will fail with -ENOSYS if not implemented. 1755 * cancelled. Will fail with -ENOSYS if not implemented.
1748 */ 1756 */
1749 SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, 1757 SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
1750 struct io_event __user *, result) 1758 struct io_event __user *, result)
1751 { 1759 {
1752 int (*cancel)(struct kiocb *iocb, struct io_event *res); 1760 int (*cancel)(struct kiocb *iocb, struct io_event *res);
1753 struct kioctx *ctx; 1761 struct kioctx *ctx;
1754 struct kiocb *kiocb; 1762 struct kiocb *kiocb;
1755 u32 key; 1763 u32 key;
1756 int ret; 1764 int ret;
1757 1765
1758 ret = get_user(key, &iocb->aio_key); 1766 ret = get_user(key, &iocb->aio_key);
1759 if (unlikely(ret)) 1767 if (unlikely(ret))
1760 return -EFAULT; 1768 return -EFAULT;
1761 1769
1762 ctx = lookup_ioctx(ctx_id); 1770 ctx = lookup_ioctx(ctx_id);
1763 if (unlikely(!ctx)) 1771 if (unlikely(!ctx))
1764 return -EINVAL; 1772 return -EINVAL;
1765 1773
1766 spin_lock_irq(&ctx->ctx_lock); 1774 spin_lock_irq(&ctx->ctx_lock);
1767 ret = -EAGAIN; 1775 ret = -EAGAIN;
1768 kiocb = lookup_kiocb(ctx, iocb, key); 1776 kiocb = lookup_kiocb(ctx, iocb, key);
1769 if (kiocb && kiocb->ki_cancel) { 1777 if (kiocb && kiocb->ki_cancel) {
1770 cancel = kiocb->ki_cancel; 1778 cancel = kiocb->ki_cancel;
1771 kiocb->ki_users ++; 1779 kiocb->ki_users ++;
1772 kiocbSetCancelled(kiocb); 1780 kiocbSetCancelled(kiocb);
1773 } else 1781 } else
1774 cancel = NULL; 1782 cancel = NULL;
1775 spin_unlock_irq(&ctx->ctx_lock); 1783 spin_unlock_irq(&ctx->ctx_lock);
1776 1784
1777 if (NULL != cancel) { 1785 if (NULL != cancel) {
1778 struct io_event tmp; 1786 struct io_event tmp;
1779 pr_debug("calling cancel\n"); 1787 pr_debug("calling cancel\n");
1780 memset(&tmp, 0, sizeof(tmp)); 1788 memset(&tmp, 0, sizeof(tmp));
1781 tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; 1789 tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user;
1782 tmp.data = kiocb->ki_user_data; 1790 tmp.data = kiocb->ki_user_data;
1783 ret = cancel(kiocb, &tmp); 1791 ret = cancel(kiocb, &tmp);
1784 if (!ret) { 1792 if (!ret) {
1785 /* Cancellation succeeded -- copy the result 1793 /* Cancellation succeeded -- copy the result
1786 * into the user's buffer. 1794 * into the user's buffer.
1787 */ 1795 */
1788 if (copy_to_user(result, &tmp, sizeof(tmp))) 1796 if (copy_to_user(result, &tmp, sizeof(tmp)))
1789 ret = -EFAULT; 1797 ret = -EFAULT;
1790 } 1798 }
1791 } else 1799 } else
1792 ret = -EINVAL; 1800 ret = -EINVAL;
1793 1801
1794 put_ioctx(ctx); 1802 put_ioctx(ctx);
1795 1803
1796 return ret; 1804 return ret;
1797 } 1805 }
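
A hedged sketch of the cancellation side documented above: the caller must pass the same struct iocb pointer it submitted, since the kernel re-reads aio_key from it and lookup_kiocb() matches on that user pointer; on success the event lands in *result rather than the completion ring. The helper name cancel_one is an assumption.

#define _GNU_SOURCE
#include <errno.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

/* Try to cancel a previously submitted iocb; 0 on success, -errno otherwise. */
static int cancel_one(aio_context_t ctx, struct iocb *cb)
{
	struct io_event ev;

	if (syscall(__NR_io_cancel, ctx, cb, &ev) == 0)
		return 0;	/* cancelled: the result was delivered in ev, not the ring */
	return -errno;		/* typically -EAGAIN: the request could not be cancelled */
}
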
1798 1806
1799 /* io_getevents: 1807 /* io_getevents:
1800 * Attempts to read at least min_nr events and up to nr events from 1808 * Attempts to read at least min_nr events and up to nr events from
1801 * the completion queue for the aio_context specified by ctx_id. If 1809 * the completion queue for the aio_context specified by ctx_id. If
1802 * it succeeds, the number of read events is returned. May fail with 1810 * it succeeds, the number of read events is returned. May fail with
1803 * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is 1811 * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is
1804 * out of range, if timeout is out of range. May fail with -EFAULT 1812 * out of range, if timeout is out of range. May fail with -EFAULT
1805 * if any of the memory specified is invalid. May return 0 or 1813 * if any of the memory specified is invalid. May return 0 or
1806 * < min_nr if the timeout specified by timeout has elapsed 1814 * < min_nr if the timeout specified by timeout has elapsed
1807 * before sufficient events are available, where timeout == NULL 1815 * before sufficient events are available, where timeout == NULL
1808 * specifies an infinite timeout. Note that the timeout pointed to by 1816 * specifies an infinite timeout. Note that the timeout pointed to by
1809 * timeout is relative and will be updated if not NULL and the 1817 * timeout is relative and will be updated if not NULL and the
1810 * operation blocks. Will fail with -ENOSYS if not implemented. 1818 * operation blocks. Will fail with -ENOSYS if not implemented.
1811 */ 1819 */
1812 SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, 1820 SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
1813 long, min_nr, 1821 long, min_nr,
1814 long, nr, 1822 long, nr,
1815 struct io_event __user *, events, 1823 struct io_event __user *, events,
1816 struct timespec __user *, timeout) 1824 struct timespec __user *, timeout)
1817 { 1825 {
1818 struct kioctx *ioctx = lookup_ioctx(ctx_id); 1826 struct kioctx *ioctx = lookup_ioctx(ctx_id);
1819 long ret = -EINVAL; 1827 long ret = -EINVAL;
1820 1828
1821 if (likely(ioctx)) { 1829 if (likely(ioctx)) {
1822 if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0)) 1830 if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0))
1823 ret = read_events(ioctx, min_nr, nr, events, timeout); 1831 ret = read_events(ioctx, min_nr, nr, events, timeout);
1824 put_ioctx(ioctx); 1832 put_ioctx(ioctx);
1825 } 1833 }
1826 1834
1827 asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout); 1835 asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout);
1828 return ret; 1836 return ret;
1829 } 1837 }
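
Finally, a hedged userspace sketch of reaping completions per the io_getevents() documentation above: min_nr of 1, up to 8 events, and a relative 5-second timeout; a return of 0 means the timeout expired before min_nr events arrived. The helper name reap and the array size are illustrative assumptions.

#define _GNU_SOURCE
#include <stdio.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>

/* Wait up to 5 seconds for at least one completion and print what arrived. */
static void reap(aio_context_t ctx)
{
	struct io_event events[8];
	struct timespec ts = { 5, 0 };	/* relative 5 second timeout */
	long n, i;

	n = syscall(__NR_io_getevents, ctx, 1, 8, events, &ts);
	for (i = 0; i < n; i++)
		printf("cookie %#llx res %lld res2 %lld\n",
		       (unsigned long long)events[i].data,
		       (long long)events[i].res,
		       (long long)events[i].res2);
}
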
1830 1838