Commit a0c42bac79731276c9b2f28d54f9e658fcf843a2
Committed by
Linus Torvalds
1 parent
d1908362ae
Exists in
master
and in
4 other branches
aio: do not return ERESTARTSYS as a result of AIO
OCFS2 can return ERESTARTSYS from its write function when the process is signalled while waiting for a cluster lock (and the filesystem is mounted with the intr mount option). Generally, it seems reasonable to allow filesystems to return this error code from their IO functions. As we must not leak ERESTARTSYS (and similar error codes) to userspace as a result of an AIO operation, we have to properly convert it to EINTR inside the AIO code (restarting the syscall isn't really an option because other AIOs could have already been submitted by the same io_submit syscall).

Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Zach Brown <zach.brown@oracle.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 1 changed file with 9 additions and 1 deletion Inline Diff
fs/aio.c
1 | /* | 1 | /* |
2 | * An async IO implementation for Linux | 2 | * An async IO implementation for Linux |
3 | * Written by Benjamin LaHaise <bcrl@kvack.org> | 3 | * Written by Benjamin LaHaise <bcrl@kvack.org> |
4 | * | 4 | * |
5 | * Implements an efficient asynchronous io interface. | 5 | * Implements an efficient asynchronous io interface. |
6 | * | 6 | * |
7 | * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. | 7 | * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. |
8 | * | 8 | * |
9 | * See ../COPYING for licensing terms. | 9 | * See ../COPYING for licensing terms. |
10 | */ | 10 | */ |
11 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
12 | #include <linux/init.h> | 12 | #include <linux/init.h> |
13 | #include <linux/errno.h> | 13 | #include <linux/errno.h> |
14 | #include <linux/time.h> | 14 | #include <linux/time.h> |
15 | #include <linux/aio_abi.h> | 15 | #include <linux/aio_abi.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/syscalls.h> | 17 | #include <linux/syscalls.h> |
18 | #include <linux/backing-dev.h> | 18 | #include <linux/backing-dev.h> |
19 | #include <linux/uio.h> | 19 | #include <linux/uio.h> |
20 | 20 | ||
21 | #define DEBUG 0 | 21 | #define DEBUG 0 |
22 | 22 | ||
23 | #include <linux/sched.h> | 23 | #include <linux/sched.h> |
24 | #include <linux/fs.h> | 24 | #include <linux/fs.h> |
25 | #include <linux/file.h> | 25 | #include <linux/file.h> |
26 | #include <linux/mm.h> | 26 | #include <linux/mm.h> |
27 | #include <linux/mman.h> | 27 | #include <linux/mman.h> |
28 | #include <linux/mmu_context.h> | 28 | #include <linux/mmu_context.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/timer.h> | 30 | #include <linux/timer.h> |
31 | #include <linux/aio.h> | 31 | #include <linux/aio.h> |
32 | #include <linux/highmem.h> | 32 | #include <linux/highmem.h> |
33 | #include <linux/workqueue.h> | 33 | #include <linux/workqueue.h> |
34 | #include <linux/security.h> | 34 | #include <linux/security.h> |
35 | #include <linux/eventfd.h> | 35 | #include <linux/eventfd.h> |
36 | #include <linux/blkdev.h> | 36 | #include <linux/blkdev.h> |
37 | #include <linux/mempool.h> | 37 | #include <linux/mempool.h> |
38 | #include <linux/hash.h> | 38 | #include <linux/hash.h> |
39 | #include <linux/compat.h> | 39 | #include <linux/compat.h> |
40 | 40 | ||
41 | #include <asm/kmap_types.h> | 41 | #include <asm/kmap_types.h> |
42 | #include <asm/uaccess.h> | 42 | #include <asm/uaccess.h> |
43 | 43 | ||
/*
 * dprintk: verbose tracing.  Expands to printk only when DEBUG is raised
 * above 1; otherwise it is an empty statement and its arguments vanish.
 */
#if DEBUG <= 1
#define dprintk(x...)	do { ; } while (0)
#else
#define dprintk		printk
#endif
49 | 49 | ||
50 | /*------ sysctl variables----*/ | 50 | /*------ sysctl variables----*/ |
51 | static DEFINE_SPINLOCK(aio_nr_lock); | 51 | static DEFINE_SPINLOCK(aio_nr_lock); |
52 | unsigned long aio_nr; /* current system wide number of aio requests */ | 52 | unsigned long aio_nr; /* current system wide number of aio requests */ |
53 | unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ | 53 | unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ |
54 | /*----end sysctl variables---*/ | 54 | /*----end sysctl variables---*/ |
55 | 55 | ||
56 | static struct kmem_cache *kiocb_cachep; | 56 | static struct kmem_cache *kiocb_cachep; |
57 | static struct kmem_cache *kioctx_cachep; | 57 | static struct kmem_cache *kioctx_cachep; |
58 | 58 | ||
59 | static struct workqueue_struct *aio_wq; | 59 | static struct workqueue_struct *aio_wq; |
60 | 60 | ||
61 | /* Used for rare fput completion. */ | 61 | /* Used for rare fput completion. */ |
62 | static void aio_fput_routine(struct work_struct *); | 62 | static void aio_fput_routine(struct work_struct *); |
63 | static DECLARE_WORK(fput_work, aio_fput_routine); | 63 | static DECLARE_WORK(fput_work, aio_fput_routine); |
64 | 64 | ||
65 | static DEFINE_SPINLOCK(fput_lock); | 65 | static DEFINE_SPINLOCK(fput_lock); |
66 | static LIST_HEAD(fput_head); | 66 | static LIST_HEAD(fput_head); |
67 | 67 | ||
68 | #define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */ | 68 | #define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */ |
69 | #define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS) | 69 | #define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS) |
70 | struct aio_batch_entry { | 70 | struct aio_batch_entry { |
71 | struct hlist_node list; | 71 | struct hlist_node list; |
72 | struct address_space *mapping; | 72 | struct address_space *mapping; |
73 | }; | 73 | }; |
74 | mempool_t *abe_pool; | 74 | mempool_t *abe_pool; |
75 | 75 | ||
76 | static void aio_kick_handler(struct work_struct *); | 76 | static void aio_kick_handler(struct work_struct *); |
77 | static void aio_queue_work(struct kioctx *); | 77 | static void aio_queue_work(struct kioctx *); |
78 | 78 | ||
79 | /* aio_setup | 79 | /* aio_setup |
80 | * Creates the slab caches used by the aio routines, panic on | 80 | * Creates the slab caches used by the aio routines, panic on |
81 | * failure as this is done early during the boot sequence. | 81 | * failure as this is done early during the boot sequence. |
82 | */ | 82 | */ |
83 | static int __init aio_setup(void) | 83 | static int __init aio_setup(void) |
84 | { | 84 | { |
85 | kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); | 85 | kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); |
86 | kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); | 86 | kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); |
87 | 87 | ||
88 | aio_wq = create_workqueue("aio"); | 88 | aio_wq = create_workqueue("aio"); |
89 | abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); | 89 | abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); |
90 | BUG_ON(!abe_pool); | 90 | BUG_ON(!abe_pool); |
91 | 91 | ||
92 | pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); | 92 | pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); |
93 | 93 | ||
94 | return 0; | 94 | return 0; |
95 | } | 95 | } |
96 | __initcall(aio_setup); | 96 | __initcall(aio_setup); |
97 | 97 | ||
98 | static void aio_free_ring(struct kioctx *ctx) | 98 | static void aio_free_ring(struct kioctx *ctx) |
99 | { | 99 | { |
100 | struct aio_ring_info *info = &ctx->ring_info; | 100 | struct aio_ring_info *info = &ctx->ring_info; |
101 | long i; | 101 | long i; |
102 | 102 | ||
103 | for (i=0; i<info->nr_pages; i++) | 103 | for (i=0; i<info->nr_pages; i++) |
104 | put_page(info->ring_pages[i]); | 104 | put_page(info->ring_pages[i]); |
105 | 105 | ||
106 | if (info->mmap_size) { | 106 | if (info->mmap_size) { |
107 | down_write(&ctx->mm->mmap_sem); | 107 | down_write(&ctx->mm->mmap_sem); |
108 | do_munmap(ctx->mm, info->mmap_base, info->mmap_size); | 108 | do_munmap(ctx->mm, info->mmap_base, info->mmap_size); |
109 | up_write(&ctx->mm->mmap_sem); | 109 | up_write(&ctx->mm->mmap_sem); |
110 | } | 110 | } |
111 | 111 | ||
112 | if (info->ring_pages && info->ring_pages != info->internal_pages) | 112 | if (info->ring_pages && info->ring_pages != info->internal_pages) |
113 | kfree(info->ring_pages); | 113 | kfree(info->ring_pages); |
114 | info->ring_pages = NULL; | 114 | info->ring_pages = NULL; |
115 | info->nr = 0; | 115 | info->nr = 0; |
116 | } | 116 | } |
117 | 117 | ||
118 | static int aio_setup_ring(struct kioctx *ctx) | 118 | static int aio_setup_ring(struct kioctx *ctx) |
119 | { | 119 | { |
120 | struct aio_ring *ring; | 120 | struct aio_ring *ring; |
121 | struct aio_ring_info *info = &ctx->ring_info; | 121 | struct aio_ring_info *info = &ctx->ring_info; |
122 | unsigned nr_events = ctx->max_reqs; | 122 | unsigned nr_events = ctx->max_reqs; |
123 | unsigned long size; | 123 | unsigned long size; |
124 | int nr_pages; | 124 | int nr_pages; |
125 | 125 | ||
126 | /* Compensate for the ring buffer's head/tail overlap entry */ | 126 | /* Compensate for the ring buffer's head/tail overlap entry */ |
127 | nr_events += 2; /* 1 is required, 2 for good luck */ | 127 | nr_events += 2; /* 1 is required, 2 for good luck */ |
128 | 128 | ||
129 | size = sizeof(struct aio_ring); | 129 | size = sizeof(struct aio_ring); |
130 | size += sizeof(struct io_event) * nr_events; | 130 | size += sizeof(struct io_event) * nr_events; |
131 | nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; | 131 | nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; |
132 | 132 | ||
133 | if (nr_pages < 0) | 133 | if (nr_pages < 0) |
134 | return -EINVAL; | 134 | return -EINVAL; |
135 | 135 | ||
136 | nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); | 136 | nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); |
137 | 137 | ||
138 | info->nr = 0; | 138 | info->nr = 0; |
139 | info->ring_pages = info->internal_pages; | 139 | info->ring_pages = info->internal_pages; |
140 | if (nr_pages > AIO_RING_PAGES) { | 140 | if (nr_pages > AIO_RING_PAGES) { |
141 | info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); | 141 | info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); |
142 | if (!info->ring_pages) | 142 | if (!info->ring_pages) |
143 | return -ENOMEM; | 143 | return -ENOMEM; |
144 | } | 144 | } |
145 | 145 | ||
146 | info->mmap_size = nr_pages * PAGE_SIZE; | 146 | info->mmap_size = nr_pages * PAGE_SIZE; |
147 | dprintk("attempting mmap of %lu bytes\n", info->mmap_size); | 147 | dprintk("attempting mmap of %lu bytes\n", info->mmap_size); |
148 | down_write(&ctx->mm->mmap_sem); | 148 | down_write(&ctx->mm->mmap_sem); |
149 | info->mmap_base = do_mmap(NULL, 0, info->mmap_size, | 149 | info->mmap_base = do_mmap(NULL, 0, info->mmap_size, |
150 | PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, | 150 | PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, |
151 | 0); | 151 | 0); |
152 | if (IS_ERR((void *)info->mmap_base)) { | 152 | if (IS_ERR((void *)info->mmap_base)) { |
153 | up_write(&ctx->mm->mmap_sem); | 153 | up_write(&ctx->mm->mmap_sem); |
154 | info->mmap_size = 0; | 154 | info->mmap_size = 0; |
155 | aio_free_ring(ctx); | 155 | aio_free_ring(ctx); |
156 | return -EAGAIN; | 156 | return -EAGAIN; |
157 | } | 157 | } |
158 | 158 | ||
159 | dprintk("mmap address: 0x%08lx\n", info->mmap_base); | 159 | dprintk("mmap address: 0x%08lx\n", info->mmap_base); |
160 | info->nr_pages = get_user_pages(current, ctx->mm, | 160 | info->nr_pages = get_user_pages(current, ctx->mm, |
161 | info->mmap_base, nr_pages, | 161 | info->mmap_base, nr_pages, |
162 | 1, 0, info->ring_pages, NULL); | 162 | 1, 0, info->ring_pages, NULL); |
163 | up_write(&ctx->mm->mmap_sem); | 163 | up_write(&ctx->mm->mmap_sem); |
164 | 164 | ||
165 | if (unlikely(info->nr_pages != nr_pages)) { | 165 | if (unlikely(info->nr_pages != nr_pages)) { |
166 | aio_free_ring(ctx); | 166 | aio_free_ring(ctx); |
167 | return -EAGAIN; | 167 | return -EAGAIN; |
168 | } | 168 | } |
169 | 169 | ||
170 | ctx->user_id = info->mmap_base; | 170 | ctx->user_id = info->mmap_base; |
171 | 171 | ||
172 | info->nr = nr_events; /* trusted copy */ | 172 | info->nr = nr_events; /* trusted copy */ |
173 | 173 | ||
174 | ring = kmap_atomic(info->ring_pages[0], KM_USER0); | 174 | ring = kmap_atomic(info->ring_pages[0], KM_USER0); |
175 | ring->nr = nr_events; /* user copy */ | 175 | ring->nr = nr_events; /* user copy */ |
176 | ring->id = ctx->user_id; | 176 | ring->id = ctx->user_id; |
177 | ring->head = ring->tail = 0; | 177 | ring->head = ring->tail = 0; |
178 | ring->magic = AIO_RING_MAGIC; | 178 | ring->magic = AIO_RING_MAGIC; |
179 | ring->compat_features = AIO_RING_COMPAT_FEATURES; | 179 | ring->compat_features = AIO_RING_COMPAT_FEATURES; |
180 | ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; | 180 | ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; |
181 | ring->header_length = sizeof(struct aio_ring); | 181 | ring->header_length = sizeof(struct aio_ring); |
182 | kunmap_atomic(ring, KM_USER0); | 182 | kunmap_atomic(ring, KM_USER0); |
183 | 183 | ||
184 | return 0; | 184 | return 0; |
185 | } | 185 | } |
186 | 186 | ||
187 | 187 | ||
/* aio_ring_event: returns a pointer to the event at the given index from
 * kmap_atomic(, km).  Release the pointer with put_aio_ring_event();
 *
 * The first ring page starts with a struct aio_ring header and so holds
 * fewer events than the others; AIO_EVENTS_OFFSET shifts the index so a
 * plain divide/modulo by AIO_EVENTS_PER_PAGE lands on the right page and
 * slot.
 *
 * Macro-local variables carry a double-underscore prefix so that passing
 * an argument named "pos" cannot turn the declaration into a
 * self-initialisation.
 */
#define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / sizeof(struct io_event))
#define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)

#define aio_ring_event(info, nr, km) ({					\
	unsigned __pos = (nr) + AIO_EVENTS_OFFSET;			\
	struct io_event *__event;					\
	__event = kmap_atomic(						\
			(info)->ring_pages[__pos / AIO_EVENTS_PER_PAGE], km); \
	__event += __pos % AIO_EVENTS_PER_PAGE;				\
	__event;							\
})

/* Unmaps the page containing an event obtained from aio_ring_event(). */
#define put_aio_ring_event(event, km) do {	\
	struct io_event *__event = (event);	\
	(void)__event;				\
	kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
} while(0)
209 | 209 | ||
210 | static void ctx_rcu_free(struct rcu_head *head) | 210 | static void ctx_rcu_free(struct rcu_head *head) |
211 | { | 211 | { |
212 | struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); | 212 | struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); |
213 | unsigned nr_events = ctx->max_reqs; | 213 | unsigned nr_events = ctx->max_reqs; |
214 | 214 | ||
215 | kmem_cache_free(kioctx_cachep, ctx); | 215 | kmem_cache_free(kioctx_cachep, ctx); |
216 | 216 | ||
217 | if (nr_events) { | 217 | if (nr_events) { |
218 | spin_lock(&aio_nr_lock); | 218 | spin_lock(&aio_nr_lock); |
219 | BUG_ON(aio_nr - nr_events > aio_nr); | 219 | BUG_ON(aio_nr - nr_events > aio_nr); |
220 | aio_nr -= nr_events; | 220 | aio_nr -= nr_events; |
221 | spin_unlock(&aio_nr_lock); | 221 | spin_unlock(&aio_nr_lock); |
222 | } | 222 | } |
223 | } | 223 | } |
224 | 224 | ||
225 | /* __put_ioctx | 225 | /* __put_ioctx |
226 | * Called when the last user of an aio context has gone away, | 226 | * Called when the last user of an aio context has gone away, |
227 | * and the struct needs to be freed. | 227 | * and the struct needs to be freed. |
228 | */ | 228 | */ |
229 | static void __put_ioctx(struct kioctx *ctx) | 229 | static void __put_ioctx(struct kioctx *ctx) |
230 | { | 230 | { |
231 | BUG_ON(ctx->reqs_active); | 231 | BUG_ON(ctx->reqs_active); |
232 | 232 | ||
233 | cancel_delayed_work(&ctx->wq); | 233 | cancel_delayed_work(&ctx->wq); |
234 | cancel_work_sync(&ctx->wq.work); | 234 | cancel_work_sync(&ctx->wq.work); |
235 | aio_free_ring(ctx); | 235 | aio_free_ring(ctx); |
236 | mmdrop(ctx->mm); | 236 | mmdrop(ctx->mm); |
237 | ctx->mm = NULL; | 237 | ctx->mm = NULL; |
238 | pr_debug("__put_ioctx: freeing %p\n", ctx); | 238 | pr_debug("__put_ioctx: freeing %p\n", ctx); |
239 | call_rcu(&ctx->rcu_head, ctx_rcu_free); | 239 | call_rcu(&ctx->rcu_head, ctx_rcu_free); |
240 | } | 240 | } |
241 | 241 | ||
/*
 * get_ioctx/put_ioctx: take and drop a reference on a kioctx.  Both BUG
 * if the user count is not positive on entry; put_ioctx tears the context
 * down through __put_ioctx() when it drops the last reference.
 */
#define get_ioctx(kioctx)						\
	do {								\
		BUG_ON(atomic_read(&(kioctx)->users) <= 0);		\
		atomic_inc(&(kioctx)->users);				\
	} while (0)

#define put_ioctx(kioctx)						\
	do {								\
		BUG_ON(atomic_read(&(kioctx)->users) <= 0);		\
		if (unlikely(atomic_dec_and_test(&(kioctx)->users)))	\
			__put_ioctx(kioctx);				\
	} while (0)
251 | 251 | ||
252 | /* ioctx_alloc | 252 | /* ioctx_alloc |
253 | * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. | 253 | * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. |
254 | */ | 254 | */ |
255 | static struct kioctx *ioctx_alloc(unsigned nr_events) | 255 | static struct kioctx *ioctx_alloc(unsigned nr_events) |
256 | { | 256 | { |
257 | struct mm_struct *mm; | 257 | struct mm_struct *mm; |
258 | struct kioctx *ctx; | 258 | struct kioctx *ctx; |
259 | int did_sync = 0; | 259 | int did_sync = 0; |
260 | 260 | ||
261 | /* Prevent overflows */ | 261 | /* Prevent overflows */ |
262 | if ((nr_events > (0x10000000U / sizeof(struct io_event))) || | 262 | if ((nr_events > (0x10000000U / sizeof(struct io_event))) || |
263 | (nr_events > (0x10000000U / sizeof(struct kiocb)))) { | 263 | (nr_events > (0x10000000U / sizeof(struct kiocb)))) { |
264 | pr_debug("ENOMEM: nr_events too high\n"); | 264 | pr_debug("ENOMEM: nr_events too high\n"); |
265 | return ERR_PTR(-EINVAL); | 265 | return ERR_PTR(-EINVAL); |
266 | } | 266 | } |
267 | 267 | ||
268 | if ((unsigned long)nr_events > aio_max_nr) | 268 | if ((unsigned long)nr_events > aio_max_nr) |
269 | return ERR_PTR(-EAGAIN); | 269 | return ERR_PTR(-EAGAIN); |
270 | 270 | ||
271 | ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); | 271 | ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); |
272 | if (!ctx) | 272 | if (!ctx) |
273 | return ERR_PTR(-ENOMEM); | 273 | return ERR_PTR(-ENOMEM); |
274 | 274 | ||
275 | ctx->max_reqs = nr_events; | 275 | ctx->max_reqs = nr_events; |
276 | mm = ctx->mm = current->mm; | 276 | mm = ctx->mm = current->mm; |
277 | atomic_inc(&mm->mm_count); | 277 | atomic_inc(&mm->mm_count); |
278 | 278 | ||
279 | atomic_set(&ctx->users, 1); | 279 | atomic_set(&ctx->users, 1); |
280 | spin_lock_init(&ctx->ctx_lock); | 280 | spin_lock_init(&ctx->ctx_lock); |
281 | spin_lock_init(&ctx->ring_info.ring_lock); | 281 | spin_lock_init(&ctx->ring_info.ring_lock); |
282 | init_waitqueue_head(&ctx->wait); | 282 | init_waitqueue_head(&ctx->wait); |
283 | 283 | ||
284 | INIT_LIST_HEAD(&ctx->active_reqs); | 284 | INIT_LIST_HEAD(&ctx->active_reqs); |
285 | INIT_LIST_HEAD(&ctx->run_list); | 285 | INIT_LIST_HEAD(&ctx->run_list); |
286 | INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler); | 286 | INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler); |
287 | 287 | ||
288 | if (aio_setup_ring(ctx) < 0) | 288 | if (aio_setup_ring(ctx) < 0) |
289 | goto out_freectx; | 289 | goto out_freectx; |
290 | 290 | ||
291 | /* limit the number of system wide aios */ | 291 | /* limit the number of system wide aios */ |
292 | do { | 292 | do { |
293 | spin_lock_bh(&aio_nr_lock); | 293 | spin_lock_bh(&aio_nr_lock); |
294 | if (aio_nr + nr_events > aio_max_nr || | 294 | if (aio_nr + nr_events > aio_max_nr || |
295 | aio_nr + nr_events < aio_nr) | 295 | aio_nr + nr_events < aio_nr) |
296 | ctx->max_reqs = 0; | 296 | ctx->max_reqs = 0; |
297 | else | 297 | else |
298 | aio_nr += ctx->max_reqs; | 298 | aio_nr += ctx->max_reqs; |
299 | spin_unlock_bh(&aio_nr_lock); | 299 | spin_unlock_bh(&aio_nr_lock); |
300 | if (ctx->max_reqs || did_sync) | 300 | if (ctx->max_reqs || did_sync) |
301 | break; | 301 | break; |
302 | 302 | ||
303 | /* wait for rcu callbacks to have completed before giving up */ | 303 | /* wait for rcu callbacks to have completed before giving up */ |
304 | synchronize_rcu(); | 304 | synchronize_rcu(); |
305 | did_sync = 1; | 305 | did_sync = 1; |
306 | ctx->max_reqs = nr_events; | 306 | ctx->max_reqs = nr_events; |
307 | } while (1); | 307 | } while (1); |
308 | 308 | ||
309 | if (ctx->max_reqs == 0) | 309 | if (ctx->max_reqs == 0) |
310 | goto out_cleanup; | 310 | goto out_cleanup; |
311 | 311 | ||
312 | /* now link into global list. */ | 312 | /* now link into global list. */ |
313 | spin_lock(&mm->ioctx_lock); | 313 | spin_lock(&mm->ioctx_lock); |
314 | hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); | 314 | hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); |
315 | spin_unlock(&mm->ioctx_lock); | 315 | spin_unlock(&mm->ioctx_lock); |
316 | 316 | ||
317 | dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", | 317 | dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", |
318 | ctx, ctx->user_id, current->mm, ctx->ring_info.nr); | 318 | ctx, ctx->user_id, current->mm, ctx->ring_info.nr); |
319 | return ctx; | 319 | return ctx; |
320 | 320 | ||
321 | out_cleanup: | 321 | out_cleanup: |
322 | __put_ioctx(ctx); | 322 | __put_ioctx(ctx); |
323 | return ERR_PTR(-EAGAIN); | 323 | return ERR_PTR(-EAGAIN); |
324 | 324 | ||
325 | out_freectx: | 325 | out_freectx: |
326 | mmdrop(mm); | 326 | mmdrop(mm); |
327 | kmem_cache_free(kioctx_cachep, ctx); | 327 | kmem_cache_free(kioctx_cachep, ctx); |
328 | ctx = ERR_PTR(-ENOMEM); | 328 | ctx = ERR_PTR(-ENOMEM); |
329 | 329 | ||
330 | dprintk("aio: error allocating ioctx %p\n", ctx); | 330 | dprintk("aio: error allocating ioctx %p\n", ctx); |
331 | return ctx; | 331 | return ctx; |
332 | } | 332 | } |
333 | 333 | ||
334 | /* aio_cancel_all | 334 | /* aio_cancel_all |
335 | * Cancels all outstanding aio requests on an aio context. Used | 335 | * Cancels all outstanding aio requests on an aio context. Used |
336 | * when the processes owning a context have all exited to encourage | 336 | * when the processes owning a context have all exited to encourage |
337 | * the rapid destruction of the kioctx. | 337 | * the rapid destruction of the kioctx. |
338 | */ | 338 | */ |
339 | static void aio_cancel_all(struct kioctx *ctx) | 339 | static void aio_cancel_all(struct kioctx *ctx) |
340 | { | 340 | { |
341 | int (*cancel)(struct kiocb *, struct io_event *); | 341 | int (*cancel)(struct kiocb *, struct io_event *); |
342 | struct io_event res; | 342 | struct io_event res; |
343 | spin_lock_irq(&ctx->ctx_lock); | 343 | spin_lock_irq(&ctx->ctx_lock); |
344 | ctx->dead = 1; | 344 | ctx->dead = 1; |
345 | while (!list_empty(&ctx->active_reqs)) { | 345 | while (!list_empty(&ctx->active_reqs)) { |
346 | struct list_head *pos = ctx->active_reqs.next; | 346 | struct list_head *pos = ctx->active_reqs.next; |
347 | struct kiocb *iocb = list_kiocb(pos); | 347 | struct kiocb *iocb = list_kiocb(pos); |
348 | list_del_init(&iocb->ki_list); | 348 | list_del_init(&iocb->ki_list); |
349 | cancel = iocb->ki_cancel; | 349 | cancel = iocb->ki_cancel; |
350 | kiocbSetCancelled(iocb); | 350 | kiocbSetCancelled(iocb); |
351 | if (cancel) { | 351 | if (cancel) { |
352 | iocb->ki_users++; | 352 | iocb->ki_users++; |
353 | spin_unlock_irq(&ctx->ctx_lock); | 353 | spin_unlock_irq(&ctx->ctx_lock); |
354 | cancel(iocb, &res); | 354 | cancel(iocb, &res); |
355 | spin_lock_irq(&ctx->ctx_lock); | 355 | spin_lock_irq(&ctx->ctx_lock); |
356 | } | 356 | } |
357 | } | 357 | } |
358 | spin_unlock_irq(&ctx->ctx_lock); | 358 | spin_unlock_irq(&ctx->ctx_lock); |
359 | } | 359 | } |
360 | 360 | ||
361 | static void wait_for_all_aios(struct kioctx *ctx) | 361 | static void wait_for_all_aios(struct kioctx *ctx) |
362 | { | 362 | { |
363 | struct task_struct *tsk = current; | 363 | struct task_struct *tsk = current; |
364 | DECLARE_WAITQUEUE(wait, tsk); | 364 | DECLARE_WAITQUEUE(wait, tsk); |
365 | 365 | ||
366 | spin_lock_irq(&ctx->ctx_lock); | 366 | spin_lock_irq(&ctx->ctx_lock); |
367 | if (!ctx->reqs_active) | 367 | if (!ctx->reqs_active) |
368 | goto out; | 368 | goto out; |
369 | 369 | ||
370 | add_wait_queue(&ctx->wait, &wait); | 370 | add_wait_queue(&ctx->wait, &wait); |
371 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 371 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); |
372 | while (ctx->reqs_active) { | 372 | while (ctx->reqs_active) { |
373 | spin_unlock_irq(&ctx->ctx_lock); | 373 | spin_unlock_irq(&ctx->ctx_lock); |
374 | io_schedule(); | 374 | io_schedule(); |
375 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 375 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); |
376 | spin_lock_irq(&ctx->ctx_lock); | 376 | spin_lock_irq(&ctx->ctx_lock); |
377 | } | 377 | } |
378 | __set_task_state(tsk, TASK_RUNNING); | 378 | __set_task_state(tsk, TASK_RUNNING); |
379 | remove_wait_queue(&ctx->wait, &wait); | 379 | remove_wait_queue(&ctx->wait, &wait); |
380 | 380 | ||
381 | out: | 381 | out: |
382 | spin_unlock_irq(&ctx->ctx_lock); | 382 | spin_unlock_irq(&ctx->ctx_lock); |
383 | } | 383 | } |
384 | 384 | ||
385 | /* wait_on_sync_kiocb: | 385 | /* wait_on_sync_kiocb: |
386 | * Waits on the given sync kiocb to complete. | 386 | * Waits on the given sync kiocb to complete. |
387 | */ | 387 | */ |
388 | ssize_t wait_on_sync_kiocb(struct kiocb *iocb) | 388 | ssize_t wait_on_sync_kiocb(struct kiocb *iocb) |
389 | { | 389 | { |
390 | while (iocb->ki_users) { | 390 | while (iocb->ki_users) { |
391 | set_current_state(TASK_UNINTERRUPTIBLE); | 391 | set_current_state(TASK_UNINTERRUPTIBLE); |
392 | if (!iocb->ki_users) | 392 | if (!iocb->ki_users) |
393 | break; | 393 | break; |
394 | io_schedule(); | 394 | io_schedule(); |
395 | } | 395 | } |
396 | __set_current_state(TASK_RUNNING); | 396 | __set_current_state(TASK_RUNNING); |
397 | return iocb->ki_user_data; | 397 | return iocb->ki_user_data; |
398 | } | 398 | } |
399 | EXPORT_SYMBOL(wait_on_sync_kiocb); | 399 | EXPORT_SYMBOL(wait_on_sync_kiocb); |
400 | 400 | ||
401 | /* exit_aio: called when the last user of mm goes away. At this point, | 401 | /* exit_aio: called when the last user of mm goes away. At this point, |
402 | * there is no way for any new requests to be submited or any of the | 402 | * there is no way for any new requests to be submited or any of the |
403 | * io_* syscalls to be called on the context. However, there may be | 403 | * io_* syscalls to be called on the context. However, there may be |
404 | * outstanding requests which hold references to the context; as they | 404 | * outstanding requests which hold references to the context; as they |
405 | * go away, they will call put_ioctx and release any pinned memory | 405 | * go away, they will call put_ioctx and release any pinned memory |
406 | * associated with the request (held via struct page * references). | 406 | * associated with the request (held via struct page * references). |
407 | */ | 407 | */ |
408 | void exit_aio(struct mm_struct *mm) | 408 | void exit_aio(struct mm_struct *mm) |
409 | { | 409 | { |
410 | struct kioctx *ctx; | 410 | struct kioctx *ctx; |
411 | 411 | ||
412 | while (!hlist_empty(&mm->ioctx_list)) { | 412 | while (!hlist_empty(&mm->ioctx_list)) { |
413 | ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); | 413 | ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list); |
414 | hlist_del_rcu(&ctx->list); | 414 | hlist_del_rcu(&ctx->list); |
415 | 415 | ||
416 | aio_cancel_all(ctx); | 416 | aio_cancel_all(ctx); |
417 | 417 | ||
418 | wait_for_all_aios(ctx); | 418 | wait_for_all_aios(ctx); |
419 | /* | 419 | /* |
420 | * Ensure we don't leave the ctx on the aio_wq | 420 | * Ensure we don't leave the ctx on the aio_wq |
421 | */ | 421 | */ |
422 | cancel_work_sync(&ctx->wq.work); | 422 | cancel_work_sync(&ctx->wq.work); |
423 | 423 | ||
424 | if (1 != atomic_read(&ctx->users)) | 424 | if (1 != atomic_read(&ctx->users)) |
425 | printk(KERN_DEBUG | 425 | printk(KERN_DEBUG |
426 | "exit_aio:ioctx still alive: %d %d %d\n", | 426 | "exit_aio:ioctx still alive: %d %d %d\n", |
427 | atomic_read(&ctx->users), ctx->dead, | 427 | atomic_read(&ctx->users), ctx->dead, |
428 | ctx->reqs_active); | 428 | ctx->reqs_active); |
429 | put_ioctx(ctx); | 429 | put_ioctx(ctx); |
430 | } | 430 | } |
431 | } | 431 | } |
432 | 432 | ||
433 | /* aio_get_req | 433 | /* aio_get_req |
434 | * Allocate a slot for an aio request. Increments the users count | 434 | * Allocate a slot for an aio request. Increments the users count |
435 | * of the kioctx so that the kioctx stays around until all requests are | 435 | * of the kioctx so that the kioctx stays around until all requests are |
436 | * complete. Returns NULL if no requests are free. | 436 | * complete. Returns NULL if no requests are free. |
437 | * | 437 | * |
438 | * Returns with kiocb->users set to 2. The io submit code path holds | 438 | * Returns with kiocb->users set to 2. The io submit code path holds |
439 | * an extra reference while submitting the i/o. | 439 | * an extra reference while submitting the i/o. |
440 | * This prevents races between the aio code path referencing the | 440 | * This prevents races between the aio code path referencing the |
441 | * req (after submitting it) and aio_complete() freeing the req. | 441 | * req (after submitting it) and aio_complete() freeing the req. |
442 | */ | 442 | */ |
443 | static struct kiocb *__aio_get_req(struct kioctx *ctx) | 443 | static struct kiocb *__aio_get_req(struct kioctx *ctx) |
444 | { | 444 | { |
445 | struct kiocb *req = NULL; | 445 | struct kiocb *req = NULL; |
446 | struct aio_ring *ring; | 446 | struct aio_ring *ring; |
447 | int okay = 0; | 447 | int okay = 0; |
448 | 448 | ||
449 | req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); | 449 | req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); |
450 | if (unlikely(!req)) | 450 | if (unlikely(!req)) |
451 | return NULL; | 451 | return NULL; |
452 | 452 | ||
453 | req->ki_flags = 0; | 453 | req->ki_flags = 0; |
454 | req->ki_users = 2; | 454 | req->ki_users = 2; |
455 | req->ki_key = 0; | 455 | req->ki_key = 0; |
456 | req->ki_ctx = ctx; | 456 | req->ki_ctx = ctx; |
457 | req->ki_cancel = NULL; | 457 | req->ki_cancel = NULL; |
458 | req->ki_retry = NULL; | 458 | req->ki_retry = NULL; |
459 | req->ki_dtor = NULL; | 459 | req->ki_dtor = NULL; |
460 | req->private = NULL; | 460 | req->private = NULL; |
461 | req->ki_iovec = NULL; | 461 | req->ki_iovec = NULL; |
462 | INIT_LIST_HEAD(&req->ki_run_list); | 462 | INIT_LIST_HEAD(&req->ki_run_list); |
463 | req->ki_eventfd = NULL; | 463 | req->ki_eventfd = NULL; |
464 | 464 | ||
465 | /* Check if the completion queue has enough free space to | 465 | /* Check if the completion queue has enough free space to |
466 | * accept an event from this io. | 466 | * accept an event from this io. |
467 | */ | 467 | */ |
468 | spin_lock_irq(&ctx->ctx_lock); | 468 | spin_lock_irq(&ctx->ctx_lock); |
469 | ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); | 469 | ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); |
470 | if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { | 470 | if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { |
471 | list_add(&req->ki_list, &ctx->active_reqs); | 471 | list_add(&req->ki_list, &ctx->active_reqs); |
472 | ctx->reqs_active++; | 472 | ctx->reqs_active++; |
473 | okay = 1; | 473 | okay = 1; |
474 | } | 474 | } |
475 | kunmap_atomic(ring, KM_USER0); | 475 | kunmap_atomic(ring, KM_USER0); |
476 | spin_unlock_irq(&ctx->ctx_lock); | 476 | spin_unlock_irq(&ctx->ctx_lock); |
477 | 477 | ||
478 | if (!okay) { | 478 | if (!okay) { |
479 | kmem_cache_free(kiocb_cachep, req); | 479 | kmem_cache_free(kiocb_cachep, req); |
480 | req = NULL; | 480 | req = NULL; |
481 | } | 481 | } |
482 | 482 | ||
483 | return req; | 483 | return req; |
484 | } | 484 | } |
485 | 485 | ||
486 | static inline struct kiocb *aio_get_req(struct kioctx *ctx) | 486 | static inline struct kiocb *aio_get_req(struct kioctx *ctx) |
487 | { | 487 | { |
488 | struct kiocb *req; | 488 | struct kiocb *req; |
489 | /* Handle a potential starvation case -- should be exceedingly rare as | 489 | /* Handle a potential starvation case -- should be exceedingly rare as |
490 | * requests will be stuck on fput_head only if the aio_fput_routine is | 490 | * requests will be stuck on fput_head only if the aio_fput_routine is |
491 | * delayed and the requests were the last user of the struct file. | 491 | * delayed and the requests were the last user of the struct file. |
492 | */ | 492 | */ |
493 | req = __aio_get_req(ctx); | 493 | req = __aio_get_req(ctx); |
494 | if (unlikely(NULL == req)) { | 494 | if (unlikely(NULL == req)) { |
495 | aio_fput_routine(NULL); | 495 | aio_fput_routine(NULL); |
496 | req = __aio_get_req(ctx); | 496 | req = __aio_get_req(ctx); |
497 | } | 497 | } |
498 | return req; | 498 | return req; |
499 | } | 499 | } |
500 | 500 | ||
501 | static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) | 501 | static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) |
502 | { | 502 | { |
503 | assert_spin_locked(&ctx->ctx_lock); | 503 | assert_spin_locked(&ctx->ctx_lock); |
504 | 504 | ||
505 | if (req->ki_eventfd != NULL) | 505 | if (req->ki_eventfd != NULL) |
506 | eventfd_ctx_put(req->ki_eventfd); | 506 | eventfd_ctx_put(req->ki_eventfd); |
507 | if (req->ki_dtor) | 507 | if (req->ki_dtor) |
508 | req->ki_dtor(req); | 508 | req->ki_dtor(req); |
509 | if (req->ki_iovec != &req->ki_inline_vec) | 509 | if (req->ki_iovec != &req->ki_inline_vec) |
510 | kfree(req->ki_iovec); | 510 | kfree(req->ki_iovec); |
511 | kmem_cache_free(kiocb_cachep, req); | 511 | kmem_cache_free(kiocb_cachep, req); |
512 | ctx->reqs_active--; | 512 | ctx->reqs_active--; |
513 | 513 | ||
514 | if (unlikely(!ctx->reqs_active && ctx->dead)) | 514 | if (unlikely(!ctx->reqs_active && ctx->dead)) |
515 | wake_up(&ctx->wait); | 515 | wake_up(&ctx->wait); |
516 | } | 516 | } |
517 | 517 | ||
518 | static void aio_fput_routine(struct work_struct *data) | 518 | static void aio_fput_routine(struct work_struct *data) |
519 | { | 519 | { |
520 | spin_lock_irq(&fput_lock); | 520 | spin_lock_irq(&fput_lock); |
521 | while (likely(!list_empty(&fput_head))) { | 521 | while (likely(!list_empty(&fput_head))) { |
522 | struct kiocb *req = list_kiocb(fput_head.next); | 522 | struct kiocb *req = list_kiocb(fput_head.next); |
523 | struct kioctx *ctx = req->ki_ctx; | 523 | struct kioctx *ctx = req->ki_ctx; |
524 | 524 | ||
525 | list_del(&req->ki_list); | 525 | list_del(&req->ki_list); |
526 | spin_unlock_irq(&fput_lock); | 526 | spin_unlock_irq(&fput_lock); |
527 | 527 | ||
528 | /* Complete the fput(s) */ | 528 | /* Complete the fput(s) */ |
529 | if (req->ki_filp != NULL) | 529 | if (req->ki_filp != NULL) |
530 | fput(req->ki_filp); | 530 | fput(req->ki_filp); |
531 | 531 | ||
532 | /* Link the iocb into the context's free list */ | 532 | /* Link the iocb into the context's free list */ |
533 | spin_lock_irq(&ctx->ctx_lock); | 533 | spin_lock_irq(&ctx->ctx_lock); |
534 | really_put_req(ctx, req); | 534 | really_put_req(ctx, req); |
535 | spin_unlock_irq(&ctx->ctx_lock); | 535 | spin_unlock_irq(&ctx->ctx_lock); |
536 | 536 | ||
537 | put_ioctx(ctx); | 537 | put_ioctx(ctx); |
538 | spin_lock_irq(&fput_lock); | 538 | spin_lock_irq(&fput_lock); |
539 | } | 539 | } |
540 | spin_unlock_irq(&fput_lock); | 540 | spin_unlock_irq(&fput_lock); |
541 | } | 541 | } |
542 | 542 | ||
543 | /* __aio_put_req | 543 | /* __aio_put_req |
544 | * Returns true if this put was the last user of the request. | 544 | * Returns true if this put was the last user of the request. |
545 | */ | 545 | */ |
546 | static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) | 546 | static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) |
547 | { | 547 | { |
548 | dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", | 548 | dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", |
549 | req, atomic_long_read(&req->ki_filp->f_count)); | 549 | req, atomic_long_read(&req->ki_filp->f_count)); |
550 | 550 | ||
551 | assert_spin_locked(&ctx->ctx_lock); | 551 | assert_spin_locked(&ctx->ctx_lock); |
552 | 552 | ||
553 | req->ki_users--; | 553 | req->ki_users--; |
554 | BUG_ON(req->ki_users < 0); | 554 | BUG_ON(req->ki_users < 0); |
555 | if (likely(req->ki_users)) | 555 | if (likely(req->ki_users)) |
556 | return 0; | 556 | return 0; |
557 | list_del(&req->ki_list); /* remove from active_reqs */ | 557 | list_del(&req->ki_list); /* remove from active_reqs */ |
558 | req->ki_cancel = NULL; | 558 | req->ki_cancel = NULL; |
559 | req->ki_retry = NULL; | 559 | req->ki_retry = NULL; |
560 | 560 | ||
561 | /* | 561 | /* |
562 | * Try to optimize the aio and eventfd file* puts, by avoiding to | 562 | * Try to optimize the aio and eventfd file* puts, by avoiding to |
563 | * schedule work in case it is not final fput() time. In normal cases, | 563 | * schedule work in case it is not final fput() time. In normal cases, |
564 | * we would not be holding the last reference to the file*, so | 564 | * we would not be holding the last reference to the file*, so |
565 | * this function will be executed w/out any aio kthread wakeup. | 565 | * this function will be executed w/out any aio kthread wakeup. |
566 | */ | 566 | */ |
567 | if (unlikely(!fput_atomic(req->ki_filp))) { | 567 | if (unlikely(!fput_atomic(req->ki_filp))) { |
568 | get_ioctx(ctx); | 568 | get_ioctx(ctx); |
569 | spin_lock(&fput_lock); | 569 | spin_lock(&fput_lock); |
570 | list_add(&req->ki_list, &fput_head); | 570 | list_add(&req->ki_list, &fput_head); |
571 | spin_unlock(&fput_lock); | 571 | spin_unlock(&fput_lock); |
572 | queue_work(aio_wq, &fput_work); | 572 | queue_work(aio_wq, &fput_work); |
573 | } else { | 573 | } else { |
574 | req->ki_filp = NULL; | 574 | req->ki_filp = NULL; |
575 | really_put_req(ctx, req); | 575 | really_put_req(ctx, req); |
576 | } | 576 | } |
577 | return 1; | 577 | return 1; |
578 | } | 578 | } |
579 | 579 | ||
580 | /* aio_put_req | 580 | /* aio_put_req |
581 | * Returns true if this put was the last user of the kiocb, | 581 | * Returns true if this put was the last user of the kiocb, |
582 | * false if the request is still in use. | 582 | * false if the request is still in use. |
583 | */ | 583 | */ |
584 | int aio_put_req(struct kiocb *req) | 584 | int aio_put_req(struct kiocb *req) |
585 | { | 585 | { |
586 | struct kioctx *ctx = req->ki_ctx; | 586 | struct kioctx *ctx = req->ki_ctx; |
587 | int ret; | 587 | int ret; |
588 | spin_lock_irq(&ctx->ctx_lock); | 588 | spin_lock_irq(&ctx->ctx_lock); |
589 | ret = __aio_put_req(ctx, req); | 589 | ret = __aio_put_req(ctx, req); |
590 | spin_unlock_irq(&ctx->ctx_lock); | 590 | spin_unlock_irq(&ctx->ctx_lock); |
591 | return ret; | 591 | return ret; |
592 | } | 592 | } |
593 | EXPORT_SYMBOL(aio_put_req); | 593 | EXPORT_SYMBOL(aio_put_req); |
594 | 594 | ||
595 | static struct kioctx *lookup_ioctx(unsigned long ctx_id) | 595 | static struct kioctx *lookup_ioctx(unsigned long ctx_id) |
596 | { | 596 | { |
597 | struct mm_struct *mm = current->mm; | 597 | struct mm_struct *mm = current->mm; |
598 | struct kioctx *ctx, *ret = NULL; | 598 | struct kioctx *ctx, *ret = NULL; |
599 | struct hlist_node *n; | 599 | struct hlist_node *n; |
600 | 600 | ||
601 | rcu_read_lock(); | 601 | rcu_read_lock(); |
602 | 602 | ||
603 | hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { | 603 | hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { |
604 | if (ctx->user_id == ctx_id && !ctx->dead) { | 604 | if (ctx->user_id == ctx_id && !ctx->dead) { |
605 | get_ioctx(ctx); | 605 | get_ioctx(ctx); |
606 | ret = ctx; | 606 | ret = ctx; |
607 | break; | 607 | break; |
608 | } | 608 | } |
609 | } | 609 | } |
610 | 610 | ||
611 | rcu_read_unlock(); | 611 | rcu_read_unlock(); |
612 | return ret; | 612 | return ret; |
613 | } | 613 | } |
614 | 614 | ||
615 | /* | 615 | /* |
616 | * Queue up a kiocb to be retried. Assumes that the kiocb | 616 | * Queue up a kiocb to be retried. Assumes that the kiocb |
617 | * has already been marked as kicked, and places it on | 617 | * has already been marked as kicked, and places it on |
618 | * the retry run list for the corresponding ioctx, if it | 618 | * the retry run list for the corresponding ioctx, if it |
619 | * isn't already queued. Returns 1 if it actually queued | 619 | * isn't already queued. Returns 1 if it actually queued |
620 | * the kiocb (to tell the caller to activate the work | 620 | * the kiocb (to tell the caller to activate the work |
621 | * queue to process it), or 0, if it found that it was | 621 | * queue to process it), or 0, if it found that it was |
622 | * already queued. | 622 | * already queued. |
623 | */ | 623 | */ |
624 | static inline int __queue_kicked_iocb(struct kiocb *iocb) | 624 | static inline int __queue_kicked_iocb(struct kiocb *iocb) |
625 | { | 625 | { |
626 | struct kioctx *ctx = iocb->ki_ctx; | 626 | struct kioctx *ctx = iocb->ki_ctx; |
627 | 627 | ||
628 | assert_spin_locked(&ctx->ctx_lock); | 628 | assert_spin_locked(&ctx->ctx_lock); |
629 | 629 | ||
630 | if (list_empty(&iocb->ki_run_list)) { | 630 | if (list_empty(&iocb->ki_run_list)) { |
631 | list_add_tail(&iocb->ki_run_list, | 631 | list_add_tail(&iocb->ki_run_list, |
632 | &ctx->run_list); | 632 | &ctx->run_list); |
633 | return 1; | 633 | return 1; |
634 | } | 634 | } |
635 | return 0; | 635 | return 0; |
636 | } | 636 | } |
637 | 637 | ||
638 | /* aio_run_iocb | 638 | /* aio_run_iocb |
639 | * This is the core aio execution routine. It is | 639 | * This is the core aio execution routine. It is |
640 | * invoked both for initial i/o submission and | 640 | * invoked both for initial i/o submission and |
641 | * subsequent retries via the aio_kick_handler. | 641 | * subsequent retries via the aio_kick_handler. |
642 | * Expects to be invoked with iocb->ki_ctx->lock | 642 | * Expects to be invoked with iocb->ki_ctx->lock |
643 | * already held. The lock is released and reacquired | 643 | * already held. The lock is released and reacquired |
644 | * as needed during processing. | 644 | * as needed during processing. |
645 | * | 645 | * |
646 | * Calls the iocb retry method (already setup for the | 646 | * Calls the iocb retry method (already setup for the |
647 | * iocb on initial submission) for operation specific | 647 | * iocb on initial submission) for operation specific |
648 | * handling, but takes care of most of common retry | 648 | * handling, but takes care of most of common retry |
649 | * execution details for a given iocb. The retry method | 649 | * execution details for a given iocb. The retry method |
650 | * needs to be non-blocking as far as possible, to avoid | 650 | * needs to be non-blocking as far as possible, to avoid |
651 | * holding up other iocbs waiting to be serviced by the | 651 | * holding up other iocbs waiting to be serviced by the |
652 | * retry kernel thread. | 652 | * retry kernel thread. |
653 | * | 653 | * |
654 | * The trickier parts in this code have to do with | 654 | * The trickier parts in this code have to do with |
655 | * ensuring that only one retry instance is in progress | 655 | * ensuring that only one retry instance is in progress |
656 | * for a given iocb at any time. Providing that guarantee | 656 | * for a given iocb at any time. Providing that guarantee |
657 | * simplifies the coding of individual aio operations as | 657 | * simplifies the coding of individual aio operations as |
658 | * it avoids various potential races. | 658 | * it avoids various potential races. |
659 | */ | 659 | */ |
660 | static ssize_t aio_run_iocb(struct kiocb *iocb) | 660 | static ssize_t aio_run_iocb(struct kiocb *iocb) |
661 | { | 661 | { |
662 | struct kioctx *ctx = iocb->ki_ctx; | 662 | struct kioctx *ctx = iocb->ki_ctx; |
663 | ssize_t (*retry)(struct kiocb *); | 663 | ssize_t (*retry)(struct kiocb *); |
664 | ssize_t ret; | 664 | ssize_t ret; |
665 | 665 | ||
666 | if (!(retry = iocb->ki_retry)) { | 666 | if (!(retry = iocb->ki_retry)) { |
667 | printk("aio_run_iocb: iocb->ki_retry = NULL\n"); | 667 | printk("aio_run_iocb: iocb->ki_retry = NULL\n"); |
668 | return 0; | 668 | return 0; |
669 | } | 669 | } |
670 | 670 | ||
671 | /* | 671 | /* |
672 | * We don't want the next retry iteration for this | 672 | * We don't want the next retry iteration for this |
673 | * operation to start until this one has returned and | 673 | * operation to start until this one has returned and |
674 | * updated the iocb state. However, wait_queue functions | 674 | * updated the iocb state. However, wait_queue functions |
675 | * can trigger a kick_iocb from interrupt context in the | 675 | * can trigger a kick_iocb from interrupt context in the |
676 | * meantime, indicating that data is available for the next | 676 | * meantime, indicating that data is available for the next |
677 | * iteration. We want to remember that and enable the | 677 | * iteration. We want to remember that and enable the |
678 | * next retry iteration _after_ we are through with | 678 | * next retry iteration _after_ we are through with |
679 | * this one. | 679 | * this one. |
680 | * | 680 | * |
681 | * So, in order to be able to register a "kick", but | 681 | * So, in order to be able to register a "kick", but |
682 | * prevent it from being queued now, we clear the kick | 682 | * prevent it from being queued now, we clear the kick |
683 | * flag, but make the kick code *think* that the iocb is | 683 | * flag, but make the kick code *think* that the iocb is |
684 | * still on the run list until we are actually done. | 684 | * still on the run list until we are actually done. |
685 | * When we are done with this iteration, we check if | 685 | * When we are done with this iteration, we check if |
686 | * the iocb was kicked in the meantime and if so, queue | 686 | * the iocb was kicked in the meantime and if so, queue |
687 | * it up afresh. | 687 | * it up afresh. |
688 | */ | 688 | */ |
689 | 689 | ||
690 | kiocbClearKicked(iocb); | 690 | kiocbClearKicked(iocb); |
691 | 691 | ||
692 | /* | 692 | /* |
693 | * This is so that aio_complete knows it doesn't need to | 693 | * This is so that aio_complete knows it doesn't need to |
694 | * pull the iocb off the run list (We can't just call | 694 | * pull the iocb off the run list (We can't just call |
695 | * INIT_LIST_HEAD because we don't want a kick_iocb to | 695 | * INIT_LIST_HEAD because we don't want a kick_iocb to |
696 | * queue this on the run list yet) | 696 | * queue this on the run list yet) |
697 | */ | 697 | */ |
698 | iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; | 698 | iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; |
699 | spin_unlock_irq(&ctx->ctx_lock); | 699 | spin_unlock_irq(&ctx->ctx_lock); |
700 | 700 | ||
701 | /* Quit retrying if the i/o has been cancelled */ | 701 | /* Quit retrying if the i/o has been cancelled */ |
702 | if (kiocbIsCancelled(iocb)) { | 702 | if (kiocbIsCancelled(iocb)) { |
703 | ret = -EINTR; | 703 | ret = -EINTR; |
704 | aio_complete(iocb, ret, 0); | 704 | aio_complete(iocb, ret, 0); |
705 | /* must not access the iocb after this */ | 705 | /* must not access the iocb after this */ |
706 | goto out; | 706 | goto out; |
707 | } | 707 | } |
708 | 708 | ||
709 | /* | 709 | /* |
710 | * Now we are all set to call the retry method in async | 710 | * Now we are all set to call the retry method in async |
711 | * context. | 711 | * context. |
712 | */ | 712 | */ |
713 | ret = retry(iocb); | 713 | ret = retry(iocb); |
714 | 714 | ||
715 | if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) | 715 | if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { |
716 | /* | ||
717 | * There's no easy way to restart the syscall since other AIO's | ||
718 | * may be already running. Just fail this IO with EINTR. | ||
719 | */ | ||
720 | if (unlikely(ret == -ERESTARTSYS || ret == -ERESTARTNOINTR || | ||
721 | ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)) | ||
722 | ret = -EINTR; | ||
716 | aio_complete(iocb, ret, 0); | 723 | aio_complete(iocb, ret, 0); |
724 | } | ||
717 | out: | 725 | out: |
718 | spin_lock_irq(&ctx->ctx_lock); | 726 | spin_lock_irq(&ctx->ctx_lock); |
719 | 727 | ||
720 | if (-EIOCBRETRY == ret) { | 728 | if (-EIOCBRETRY == ret) { |
721 | /* | 729 | /* |
722 | * OK, now that we are done with this iteration | 730 | * OK, now that we are done with this iteration |
723 | * and know that there is more left to go, | 731 | * and know that there is more left to go, |
724 | * this is where we let go so that a subsequent | 732 | * this is where we let go so that a subsequent |
725 | * "kick" can start the next iteration | 733 | * "kick" can start the next iteration |
726 | */ | 734 | */ |
727 | 735 | ||
728 | /* will make __queue_kicked_iocb succeed from here on */ | 736 | /* will make __queue_kicked_iocb succeed from here on */ |
729 | INIT_LIST_HEAD(&iocb->ki_run_list); | 737 | INIT_LIST_HEAD(&iocb->ki_run_list); |
730 | /* we must queue the next iteration ourselves, if it | 738 | /* we must queue the next iteration ourselves, if it |
731 | * has already been kicked */ | 739 | * has already been kicked */ |
732 | if (kiocbIsKicked(iocb)) { | 740 | if (kiocbIsKicked(iocb)) { |
733 | __queue_kicked_iocb(iocb); | 741 | __queue_kicked_iocb(iocb); |
734 | 742 | ||
735 | /* | 743 | /* |
736 | * __queue_kicked_iocb will always return 1 here, because | 744 | * __queue_kicked_iocb will always return 1 here, because |
737 | * iocb->ki_run_list is empty at this point so it should | 745 | * iocb->ki_run_list is empty at this point so it should |
738 | * be safe to unconditionally queue the context into the | 746 | * be safe to unconditionally queue the context into the |
739 | * work queue. | 747 | * work queue. |
740 | */ | 748 | */ |
741 | aio_queue_work(ctx); | 749 | aio_queue_work(ctx); |
742 | } | 750 | } |
743 | } | 751 | } |
744 | return ret; | 752 | return ret; |
745 | } | 753 | } |
746 | 754 | ||
747 | /* | 755 | /* |
748 | * __aio_run_iocbs: | 756 | * __aio_run_iocbs: |
749 | * Process all pending retries queued on the ioctx | 757 | * Process all pending retries queued on the ioctx |
750 | * run list. | 758 | * run list. |
751 | * Assumes it is operating within the aio issuer's mm | 759 | * Assumes it is operating within the aio issuer's mm |
752 | * context. | 760 | * context. |
753 | */ | 761 | */ |
754 | static int __aio_run_iocbs(struct kioctx *ctx) | 762 | static int __aio_run_iocbs(struct kioctx *ctx) |
755 | { | 763 | { |
756 | struct kiocb *iocb; | 764 | struct kiocb *iocb; |
757 | struct list_head run_list; | 765 | struct list_head run_list; |
758 | 766 | ||
759 | assert_spin_locked(&ctx->ctx_lock); | 767 | assert_spin_locked(&ctx->ctx_lock); |
760 | 768 | ||
761 | list_replace_init(&ctx->run_list, &run_list); | 769 | list_replace_init(&ctx->run_list, &run_list); |
762 | while (!list_empty(&run_list)) { | 770 | while (!list_empty(&run_list)) { |
763 | iocb = list_entry(run_list.next, struct kiocb, | 771 | iocb = list_entry(run_list.next, struct kiocb, |
764 | ki_run_list); | 772 | ki_run_list); |
765 | list_del(&iocb->ki_run_list); | 773 | list_del(&iocb->ki_run_list); |
766 | /* | 774 | /* |
767 | * Hold an extra reference while retrying i/o. | 775 | * Hold an extra reference while retrying i/o. |
768 | */ | 776 | */ |
769 | iocb->ki_users++; /* grab extra reference */ | 777 | iocb->ki_users++; /* grab extra reference */ |
770 | aio_run_iocb(iocb); | 778 | aio_run_iocb(iocb); |
771 | __aio_put_req(ctx, iocb); | 779 | __aio_put_req(ctx, iocb); |
772 | } | 780 | } |
773 | if (!list_empty(&ctx->run_list)) | 781 | if (!list_empty(&ctx->run_list)) |
774 | return 1; | 782 | return 1; |
775 | return 0; | 783 | return 0; |
776 | } | 784 | } |
777 | 785 | ||
778 | static void aio_queue_work(struct kioctx * ctx) | 786 | static void aio_queue_work(struct kioctx * ctx) |
779 | { | 787 | { |
780 | unsigned long timeout; | 788 | unsigned long timeout; |
781 | /* | 789 | /* |
782 | * if someone is waiting, get the work started right | 790 | * if someone is waiting, get the work started right |
783 | * away, otherwise, use a longer delay | 791 | * away, otherwise, use a longer delay |
784 | */ | 792 | */ |
785 | smp_mb(); | 793 | smp_mb(); |
786 | if (waitqueue_active(&ctx->wait)) | 794 | if (waitqueue_active(&ctx->wait)) |
787 | timeout = 1; | 795 | timeout = 1; |
788 | else | 796 | else |
789 | timeout = HZ/10; | 797 | timeout = HZ/10; |
790 | queue_delayed_work(aio_wq, &ctx->wq, timeout); | 798 | queue_delayed_work(aio_wq, &ctx->wq, timeout); |
791 | } | 799 | } |
792 | 800 | ||
793 | 801 | ||
794 | /* | 802 | /* |
795 | * aio_run_iocbs: | 803 | * aio_run_iocbs: |
796 | * Process all pending retries queued on the ioctx | 804 | * Process all pending retries queued on the ioctx |
797 | * run list. | 805 | * run list. |
798 | * Assumes it is operating within the aio issuer's mm | 806 | * Assumes it is operating within the aio issuer's mm |
799 | * context. | 807 | * context. |
800 | */ | 808 | */ |
801 | static inline void aio_run_iocbs(struct kioctx *ctx) | 809 | static inline void aio_run_iocbs(struct kioctx *ctx) |
802 | { | 810 | { |
803 | int requeue; | 811 | int requeue; |
804 | 812 | ||
805 | spin_lock_irq(&ctx->ctx_lock); | 813 | spin_lock_irq(&ctx->ctx_lock); |
806 | 814 | ||
807 | requeue = __aio_run_iocbs(ctx); | 815 | requeue = __aio_run_iocbs(ctx); |
808 | spin_unlock_irq(&ctx->ctx_lock); | 816 | spin_unlock_irq(&ctx->ctx_lock); |
809 | if (requeue) | 817 | if (requeue) |
810 | aio_queue_work(ctx); | 818 | aio_queue_work(ctx); |
811 | } | 819 | } |
812 | 820 | ||
813 | /* | 821 | /* |
814 | * just like aio_run_iocbs, but keeps running them until | 822 | * just like aio_run_iocbs, but keeps running them until |
815 | * the list stays empty | 823 | * the list stays empty |
816 | */ | 824 | */ |
817 | static inline void aio_run_all_iocbs(struct kioctx *ctx) | 825 | static inline void aio_run_all_iocbs(struct kioctx *ctx) |
818 | { | 826 | { |
819 | spin_lock_irq(&ctx->ctx_lock); | 827 | spin_lock_irq(&ctx->ctx_lock); |
820 | while (__aio_run_iocbs(ctx)) | 828 | while (__aio_run_iocbs(ctx)) |
821 | ; | 829 | ; |
822 | spin_unlock_irq(&ctx->ctx_lock); | 830 | spin_unlock_irq(&ctx->ctx_lock); |
823 | } | 831 | } |
824 | 832 | ||
825 | /* | 833 | /* |
826 | * aio_kick_handler: | 834 | * aio_kick_handler: |
827 | * Work queue handler triggered to process pending | 835 | * Work queue handler triggered to process pending |
828 | * retries on an ioctx. Takes on the aio issuer's | 836 | * retries on an ioctx. Takes on the aio issuer's |
829 | * mm context before running the iocbs, so that | 837 | * mm context before running the iocbs, so that |
830 | * copy_xxx_user operates on the issuer's address | 838 | * copy_xxx_user operates on the issuer's address |
831 | * space. | 839 | * space. |
832 | * Run on aiod's context. | 840 | * Run on aiod's context. |
833 | */ | 841 | */ |
834 | static void aio_kick_handler(struct work_struct *work) | 842 | static void aio_kick_handler(struct work_struct *work) |
835 | { | 843 | { |
836 | struct kioctx *ctx = container_of(work, struct kioctx, wq.work); | 844 | struct kioctx *ctx = container_of(work, struct kioctx, wq.work); |
837 | mm_segment_t oldfs = get_fs(); | 845 | mm_segment_t oldfs = get_fs(); |
838 | struct mm_struct *mm; | 846 | struct mm_struct *mm; |
839 | int requeue; | 847 | int requeue; |
840 | 848 | ||
841 | set_fs(USER_DS); | 849 | set_fs(USER_DS); |
842 | use_mm(ctx->mm); | 850 | use_mm(ctx->mm); |
843 | spin_lock_irq(&ctx->ctx_lock); | 851 | spin_lock_irq(&ctx->ctx_lock); |
844 | requeue =__aio_run_iocbs(ctx); | 852 | requeue =__aio_run_iocbs(ctx); |
845 | mm = ctx->mm; | 853 | mm = ctx->mm; |
846 | spin_unlock_irq(&ctx->ctx_lock); | 854 | spin_unlock_irq(&ctx->ctx_lock); |
847 | unuse_mm(mm); | 855 | unuse_mm(mm); |
848 | set_fs(oldfs); | 856 | set_fs(oldfs); |
849 | /* | 857 | /* |
850 | * we're in a worker thread already, don't use queue_delayed_work, | 858 | * we're in a worker thread already, don't use queue_delayed_work, |
851 | */ | 859 | */ |
852 | if (requeue) | 860 | if (requeue) |
853 | queue_delayed_work(aio_wq, &ctx->wq, 0); | 861 | queue_delayed_work(aio_wq, &ctx->wq, 0); |
854 | } | 862 | } |
855 | 863 | ||
856 | 864 | ||
857 | /* | 865 | /* |
858 | * Called by kick_iocb to queue the kiocb for retry | 866 | * Called by kick_iocb to queue the kiocb for retry |
859 | * and if required activate the aio work queue to process | 867 | * and if required activate the aio work queue to process |
860 | * it | 868 | * it |
861 | */ | 869 | */ |
862 | static void try_queue_kicked_iocb(struct kiocb *iocb) | 870 | static void try_queue_kicked_iocb(struct kiocb *iocb) |
863 | { | 871 | { |
864 | struct kioctx *ctx = iocb->ki_ctx; | 872 | struct kioctx *ctx = iocb->ki_ctx; |
865 | unsigned long flags; | 873 | unsigned long flags; |
866 | int run = 0; | 874 | int run = 0; |
867 | 875 | ||
868 | spin_lock_irqsave(&ctx->ctx_lock, flags); | 876 | spin_lock_irqsave(&ctx->ctx_lock, flags); |
869 | /* set this inside the lock so that we can't race with aio_run_iocb() | 877 | /* set this inside the lock so that we can't race with aio_run_iocb() |
870 | * testing it and putting the iocb on the run list under the lock */ | 878 | * testing it and putting the iocb on the run list under the lock */ |
871 | if (!kiocbTryKick(iocb)) | 879 | if (!kiocbTryKick(iocb)) |
872 | run = __queue_kicked_iocb(iocb); | 880 | run = __queue_kicked_iocb(iocb); |
873 | spin_unlock_irqrestore(&ctx->ctx_lock, flags); | 881 | spin_unlock_irqrestore(&ctx->ctx_lock, flags); |
874 | if (run) | 882 | if (run) |
875 | aio_queue_work(ctx); | 883 | aio_queue_work(ctx); |
876 | } | 884 | } |
877 | 885 | ||
878 | /* | 886 | /* |
879 | * kick_iocb: | 887 | * kick_iocb: |
880 | * Called typically from a wait queue callback context | 888 | * Called typically from a wait queue callback context |
881 | * to trigger a retry of the iocb. | 889 | * to trigger a retry of the iocb. |
882 | * The retry is usually executed by aio workqueue | 890 | * The retry is usually executed by aio workqueue |
883 | * threads (See aio_kick_handler). | 891 | * threads (See aio_kick_handler). |
884 | */ | 892 | */ |
885 | void kick_iocb(struct kiocb *iocb) | 893 | void kick_iocb(struct kiocb *iocb) |
886 | { | 894 | { |
887 | /* sync iocbs are easy: they can only ever be executing from a | 895 | /* sync iocbs are easy: they can only ever be executing from a |
888 | * single context. */ | 896 | * single context. */ |
889 | if (is_sync_kiocb(iocb)) { | 897 | if (is_sync_kiocb(iocb)) { |
890 | kiocbSetKicked(iocb); | 898 | kiocbSetKicked(iocb); |
891 | wake_up_process(iocb->ki_obj.tsk); | 899 | wake_up_process(iocb->ki_obj.tsk); |
892 | return; | 900 | return; |
893 | } | 901 | } |
894 | 902 | ||
895 | try_queue_kicked_iocb(iocb); | 903 | try_queue_kicked_iocb(iocb); |
896 | } | 904 | } |
897 | EXPORT_SYMBOL(kick_iocb); | 905 | EXPORT_SYMBOL(kick_iocb); |
898 | 906 | ||
899 | /* aio_complete | 907 | /* aio_complete |
900 | * Called when the io request on the given iocb is complete. | 908 | * Called when the io request on the given iocb is complete. |
901 | * Returns true if this is the last user of the request. The | 909 | * Returns true if this is the last user of the request. The |
902 | * only other user of the request can be the cancellation code. | 910 | * only other user of the request can be the cancellation code. |
903 | */ | 911 | */ |
904 | int aio_complete(struct kiocb *iocb, long res, long res2) | 912 | int aio_complete(struct kiocb *iocb, long res, long res2) |
905 | { | 913 | { |
906 | struct kioctx *ctx = iocb->ki_ctx; | 914 | struct kioctx *ctx = iocb->ki_ctx; |
907 | struct aio_ring_info *info; | 915 | struct aio_ring_info *info; |
908 | struct aio_ring *ring; | 916 | struct aio_ring *ring; |
909 | struct io_event *event; | 917 | struct io_event *event; |
910 | unsigned long flags; | 918 | unsigned long flags; |
911 | unsigned long tail; | 919 | unsigned long tail; |
912 | int ret; | 920 | int ret; |
913 | 921 | ||
914 | /* | 922 | /* |
915 | * Special case handling for sync iocbs: | 923 | * Special case handling for sync iocbs: |
916 | * - events go directly into the iocb for fast handling | 924 | * - events go directly into the iocb for fast handling |
917 | * - the sync task with the iocb in its stack holds the single iocb | 925 | * - the sync task with the iocb in its stack holds the single iocb |
918 | * ref, no other paths have a way to get another ref | 926 | * ref, no other paths have a way to get another ref |
919 | * - the sync task helpfully left a reference to itself in the iocb | 927 | * - the sync task helpfully left a reference to itself in the iocb |
920 | */ | 928 | */ |
921 | if (is_sync_kiocb(iocb)) { | 929 | if (is_sync_kiocb(iocb)) { |
922 | BUG_ON(iocb->ki_users != 1); | 930 | BUG_ON(iocb->ki_users != 1); |
923 | iocb->ki_user_data = res; | 931 | iocb->ki_user_data = res; |
924 | iocb->ki_users = 0; | 932 | iocb->ki_users = 0; |
925 | wake_up_process(iocb->ki_obj.tsk); | 933 | wake_up_process(iocb->ki_obj.tsk); |
926 | return 1; | 934 | return 1; |
927 | } | 935 | } |
928 | 936 | ||
929 | info = &ctx->ring_info; | 937 | info = &ctx->ring_info; |
930 | 938 | ||
931 | /* add a completion event to the ring buffer. | 939 | /* add a completion event to the ring buffer. |
932 | * must be done holding ctx->ctx_lock to prevent | 940 | * must be done holding ctx->ctx_lock to prevent |
933 | * other code from messing with the tail | 941 | * other code from messing with the tail |
934 | * pointer since we might be called from irq | 942 | * pointer since we might be called from irq |
935 | * context. | 943 | * context. |
936 | */ | 944 | */ |
937 | spin_lock_irqsave(&ctx->ctx_lock, flags); | 945 | spin_lock_irqsave(&ctx->ctx_lock, flags); |
938 | 946 | ||
939 | if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) | 947 | if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list)) |
940 | list_del_init(&iocb->ki_run_list); | 948 | list_del_init(&iocb->ki_run_list); |
941 | 949 | ||
942 | /* | 950 | /* |
943 | * cancelled requests don't get events, userland was given one | 951 | * cancelled requests don't get events, userland was given one |
944 | * when the event got cancelled. | 952 | * when the event got cancelled. |
945 | */ | 953 | */ |
946 | if (kiocbIsCancelled(iocb)) | 954 | if (kiocbIsCancelled(iocb)) |
947 | goto put_rq; | 955 | goto put_rq; |
948 | 956 | ||
949 | ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); | 957 | ring = kmap_atomic(info->ring_pages[0], KM_IRQ1); |
950 | 958 | ||
951 | tail = info->tail; | 959 | tail = info->tail; |
952 | event = aio_ring_event(info, tail, KM_IRQ0); | 960 | event = aio_ring_event(info, tail, KM_IRQ0); |
953 | if (++tail >= info->nr) | 961 | if (++tail >= info->nr) |
954 | tail = 0; | 962 | tail = 0; |
955 | 963 | ||
956 | event->obj = (u64)(unsigned long)iocb->ki_obj.user; | 964 | event->obj = (u64)(unsigned long)iocb->ki_obj.user; |
957 | event->data = iocb->ki_user_data; | 965 | event->data = iocb->ki_user_data; |
958 | event->res = res; | 966 | event->res = res; |
959 | event->res2 = res2; | 967 | event->res2 = res2; |
960 | 968 | ||
961 | dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", | 969 | dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n", |
962 | ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, | 970 | ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, |
963 | res, res2); | 971 | res, res2); |
964 | 972 | ||
965 | /* after flagging the request as done, we | 973 | /* after flagging the request as done, we |
966 | * must never even look at it again | 974 | * must never even look at it again |
967 | */ | 975 | */ |
968 | smp_wmb(); /* make event visible before updating tail */ | 976 | smp_wmb(); /* make event visible before updating tail */ |
969 | 977 | ||
970 | info->tail = tail; | 978 | info->tail = tail; |
971 | ring->tail = tail; | 979 | ring->tail = tail; |
972 | 980 | ||
973 | put_aio_ring_event(event, KM_IRQ0); | 981 | put_aio_ring_event(event, KM_IRQ0); |
974 | kunmap_atomic(ring, KM_IRQ1); | 982 | kunmap_atomic(ring, KM_IRQ1); |
975 | 983 | ||
976 | pr_debug("added to ring %p at [%lu]\n", iocb, tail); | 984 | pr_debug("added to ring %p at [%lu]\n", iocb, tail); |
977 | 985 | ||
978 | /* | 986 | /* |
979 | * Check if the user asked us to deliver the result through an | 987 | * Check if the user asked us to deliver the result through an |
980 | * eventfd. The eventfd_signal() function is safe to be called | 988 | * eventfd. The eventfd_signal() function is safe to be called |
981 | * from IRQ context. | 989 | * from IRQ context. |
982 | */ | 990 | */ |
983 | if (iocb->ki_eventfd != NULL) | 991 | if (iocb->ki_eventfd != NULL) |
984 | eventfd_signal(iocb->ki_eventfd, 1); | 992 | eventfd_signal(iocb->ki_eventfd, 1); |
985 | 993 | ||
986 | put_rq: | 994 | put_rq: |
987 | /* everything turned out well, dispose of the aiocb. */ | 995 | /* everything turned out well, dispose of the aiocb. */ |
988 | ret = __aio_put_req(ctx, iocb); | 996 | ret = __aio_put_req(ctx, iocb); |
989 | 997 | ||
990 | /* | 998 | /* |
991 | * We have to order our ring_info tail store above and test | 999 | * We have to order our ring_info tail store above and test |
992 | * of the wait list below outside the wait lock. This is | 1000 | * of the wait list below outside the wait lock. This is |
993 | * like in wake_up_bit() where clearing a bit has to be | 1001 | * like in wake_up_bit() where clearing a bit has to be |
994 | * ordered with the unlocked test. | 1002 | * ordered with the unlocked test. |
995 | */ | 1003 | */ |
996 | smp_mb(); | 1004 | smp_mb(); |
997 | 1005 | ||
998 | if (waitqueue_active(&ctx->wait)) | 1006 | if (waitqueue_active(&ctx->wait)) |
999 | wake_up(&ctx->wait); | 1007 | wake_up(&ctx->wait); |
1000 | 1008 | ||
1001 | spin_unlock_irqrestore(&ctx->ctx_lock, flags); | 1009 | spin_unlock_irqrestore(&ctx->ctx_lock, flags); |
1002 | return ret; | 1010 | return ret; |
1003 | } | 1011 | } |
1004 | EXPORT_SYMBOL(aio_complete); | 1012 | EXPORT_SYMBOL(aio_complete); |
1005 | 1013 | ||
1006 | /* aio_read_evt | 1014 | /* aio_read_evt |
1007 | * Pull an event off of the ioctx's event ring. Returns the number of | 1015 | * Pull an event off of the ioctx's event ring. Returns the number of |
1008 | * events fetched (0 or 1 ;-) | 1016 | * events fetched (0 or 1 ;-) |
1009 | * FIXME: make this use cmpxchg. | 1017 | * FIXME: make this use cmpxchg. |
1010 | * TODO: make the ringbuffer user mmap()able (requires FIXME). | 1018 | * TODO: make the ringbuffer user mmap()able (requires FIXME). |
1011 | */ | 1019 | */ |
1012 | static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) | 1020 | static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) |
1013 | { | 1021 | { |
1014 | struct aio_ring_info *info = &ioctx->ring_info; | 1022 | struct aio_ring_info *info = &ioctx->ring_info; |
1015 | struct aio_ring *ring; | 1023 | struct aio_ring *ring; |
1016 | unsigned long head; | 1024 | unsigned long head; |
1017 | int ret = 0; | 1025 | int ret = 0; |
1018 | 1026 | ||
1019 | ring = kmap_atomic(info->ring_pages[0], KM_USER0); | 1027 | ring = kmap_atomic(info->ring_pages[0], KM_USER0); |
1020 | dprintk("in aio_read_evt h%lu t%lu m%lu\n", | 1028 | dprintk("in aio_read_evt h%lu t%lu m%lu\n", |
1021 | (unsigned long)ring->head, (unsigned long)ring->tail, | 1029 | (unsigned long)ring->head, (unsigned long)ring->tail, |
1022 | (unsigned long)ring->nr); | 1030 | (unsigned long)ring->nr); |
1023 | 1031 | ||
1024 | if (ring->head == ring->tail) | 1032 | if (ring->head == ring->tail) |
1025 | goto out; | 1033 | goto out; |
1026 | 1034 | ||
1027 | spin_lock(&info->ring_lock); | 1035 | spin_lock(&info->ring_lock); |
1028 | 1036 | ||
1029 | head = ring->head % info->nr; | 1037 | head = ring->head % info->nr; |
1030 | if (head != ring->tail) { | 1038 | if (head != ring->tail) { |
1031 | struct io_event *evp = aio_ring_event(info, head, KM_USER1); | 1039 | struct io_event *evp = aio_ring_event(info, head, KM_USER1); |
1032 | *ent = *evp; | 1040 | *ent = *evp; |
1033 | head = (head + 1) % info->nr; | 1041 | head = (head + 1) % info->nr; |
1034 | smp_mb(); /* finish reading the event before updatng the head */ | 1042 | smp_mb(); /* finish reading the event before updatng the head */ |
1035 | ring->head = head; | 1043 | ring->head = head; |
1036 | ret = 1; | 1044 | ret = 1; |
1037 | put_aio_ring_event(evp, KM_USER1); | 1045 | put_aio_ring_event(evp, KM_USER1); |
1038 | } | 1046 | } |
1039 | spin_unlock(&info->ring_lock); | 1047 | spin_unlock(&info->ring_lock); |
1040 | 1048 | ||
1041 | out: | 1049 | out: |
1042 | kunmap_atomic(ring, KM_USER0); | 1050 | kunmap_atomic(ring, KM_USER0); |
1043 | dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, | 1051 | dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret, |
1044 | (unsigned long)ring->head, (unsigned long)ring->tail); | 1052 | (unsigned long)ring->head, (unsigned long)ring->tail); |
1045 | return ret; | 1053 | return ret; |
1046 | } | 1054 | } |
1047 | 1055 | ||
1048 | struct aio_timeout { | 1056 | struct aio_timeout { |
1049 | struct timer_list timer; | 1057 | struct timer_list timer; |
1050 | int timed_out; | 1058 | int timed_out; |
1051 | struct task_struct *p; | 1059 | struct task_struct *p; |
1052 | }; | 1060 | }; |
1053 | 1061 | ||
1054 | static void timeout_func(unsigned long data) | 1062 | static void timeout_func(unsigned long data) |
1055 | { | 1063 | { |
1056 | struct aio_timeout *to = (struct aio_timeout *)data; | 1064 | struct aio_timeout *to = (struct aio_timeout *)data; |
1057 | 1065 | ||
1058 | to->timed_out = 1; | 1066 | to->timed_out = 1; |
1059 | wake_up_process(to->p); | 1067 | wake_up_process(to->p); |
1060 | } | 1068 | } |
1061 | 1069 | ||
1062 | static inline void init_timeout(struct aio_timeout *to) | 1070 | static inline void init_timeout(struct aio_timeout *to) |
1063 | { | 1071 | { |
1064 | setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to); | 1072 | setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to); |
1065 | to->timed_out = 0; | 1073 | to->timed_out = 0; |
1066 | to->p = current; | 1074 | to->p = current; |
1067 | } | 1075 | } |
1068 | 1076 | ||
1069 | static inline void set_timeout(long start_jiffies, struct aio_timeout *to, | 1077 | static inline void set_timeout(long start_jiffies, struct aio_timeout *to, |
1070 | const struct timespec *ts) | 1078 | const struct timespec *ts) |
1071 | { | 1079 | { |
1072 | to->timer.expires = start_jiffies + timespec_to_jiffies(ts); | 1080 | to->timer.expires = start_jiffies + timespec_to_jiffies(ts); |
1073 | if (time_after(to->timer.expires, jiffies)) | 1081 | if (time_after(to->timer.expires, jiffies)) |
1074 | add_timer(&to->timer); | 1082 | add_timer(&to->timer); |
1075 | else | 1083 | else |
1076 | to->timed_out = 1; | 1084 | to->timed_out = 1; |
1077 | } | 1085 | } |
1078 | 1086 | ||
1079 | static inline void clear_timeout(struct aio_timeout *to) | 1087 | static inline void clear_timeout(struct aio_timeout *to) |
1080 | { | 1088 | { |
1081 | del_singleshot_timer_sync(&to->timer); | 1089 | del_singleshot_timer_sync(&to->timer); |
1082 | } | 1090 | } |
1083 | 1091 | ||
1084 | static int read_events(struct kioctx *ctx, | 1092 | static int read_events(struct kioctx *ctx, |
1085 | long min_nr, long nr, | 1093 | long min_nr, long nr, |
1086 | struct io_event __user *event, | 1094 | struct io_event __user *event, |
1087 | struct timespec __user *timeout) | 1095 | struct timespec __user *timeout) |
1088 | { | 1096 | { |
1089 | long start_jiffies = jiffies; | 1097 | long start_jiffies = jiffies; |
1090 | struct task_struct *tsk = current; | 1098 | struct task_struct *tsk = current; |
1091 | DECLARE_WAITQUEUE(wait, tsk); | 1099 | DECLARE_WAITQUEUE(wait, tsk); |
1092 | int ret; | 1100 | int ret; |
1093 | int i = 0; | 1101 | int i = 0; |
1094 | struct io_event ent; | 1102 | struct io_event ent; |
1095 | struct aio_timeout to; | 1103 | struct aio_timeout to; |
1096 | int retry = 0; | 1104 | int retry = 0; |
1097 | 1105 | ||
1098 | /* needed to zero any padding within an entry (there shouldn't be | 1106 | /* needed to zero any padding within an entry (there shouldn't be |
1099 | * any, but C is fun! | 1107 | * any, but C is fun! |
1100 | */ | 1108 | */ |
1101 | memset(&ent, 0, sizeof(ent)); | 1109 | memset(&ent, 0, sizeof(ent)); |
1102 | retry: | 1110 | retry: |
1103 | ret = 0; | 1111 | ret = 0; |
1104 | while (likely(i < nr)) { | 1112 | while (likely(i < nr)) { |
1105 | ret = aio_read_evt(ctx, &ent); | 1113 | ret = aio_read_evt(ctx, &ent); |
1106 | if (unlikely(ret <= 0)) | 1114 | if (unlikely(ret <= 0)) |
1107 | break; | 1115 | break; |
1108 | 1116 | ||
1109 | dprintk("read event: %Lx %Lx %Lx %Lx\n", | 1117 | dprintk("read event: %Lx %Lx %Lx %Lx\n", |
1110 | ent.data, ent.obj, ent.res, ent.res2); | 1118 | ent.data, ent.obj, ent.res, ent.res2); |
1111 | 1119 | ||
1112 | /* Could we split the check in two? */ | 1120 | /* Could we split the check in two? */ |
1113 | ret = -EFAULT; | 1121 | ret = -EFAULT; |
1114 | if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { | 1122 | if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { |
1115 | dprintk("aio: lost an event due to EFAULT.\n"); | 1123 | dprintk("aio: lost an event due to EFAULT.\n"); |
1116 | break; | 1124 | break; |
1117 | } | 1125 | } |
1118 | ret = 0; | 1126 | ret = 0; |
1119 | 1127 | ||
1120 | /* Good, event copied to userland, update counts. */ | 1128 | /* Good, event copied to userland, update counts. */ |
1121 | event ++; | 1129 | event ++; |
1122 | i ++; | 1130 | i ++; |
1123 | } | 1131 | } |
1124 | 1132 | ||
1125 | if (min_nr <= i) | 1133 | if (min_nr <= i) |
1126 | return i; | 1134 | return i; |
1127 | if (ret) | 1135 | if (ret) |
1128 | return ret; | 1136 | return ret; |
1129 | 1137 | ||
1130 | /* End fast path */ | 1138 | /* End fast path */ |
1131 | 1139 | ||
1132 | /* racey check, but it gets redone */ | 1140 | /* racey check, but it gets redone */ |
1133 | if (!retry && unlikely(!list_empty(&ctx->run_list))) { | 1141 | if (!retry && unlikely(!list_empty(&ctx->run_list))) { |
1134 | retry = 1; | 1142 | retry = 1; |
1135 | aio_run_all_iocbs(ctx); | 1143 | aio_run_all_iocbs(ctx); |
1136 | goto retry; | 1144 | goto retry; |
1137 | } | 1145 | } |
1138 | 1146 | ||
1139 | init_timeout(&to); | 1147 | init_timeout(&to); |
1140 | if (timeout) { | 1148 | if (timeout) { |
1141 | struct timespec ts; | 1149 | struct timespec ts; |
1142 | ret = -EFAULT; | 1150 | ret = -EFAULT; |
1143 | if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) | 1151 | if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) |
1144 | goto out; | 1152 | goto out; |
1145 | 1153 | ||
1146 | set_timeout(start_jiffies, &to, &ts); | 1154 | set_timeout(start_jiffies, &to, &ts); |
1147 | } | 1155 | } |
1148 | 1156 | ||
1149 | while (likely(i < nr)) { | 1157 | while (likely(i < nr)) { |
1150 | add_wait_queue_exclusive(&ctx->wait, &wait); | 1158 | add_wait_queue_exclusive(&ctx->wait, &wait); |
1151 | do { | 1159 | do { |
1152 | set_task_state(tsk, TASK_INTERRUPTIBLE); | 1160 | set_task_state(tsk, TASK_INTERRUPTIBLE); |
1153 | ret = aio_read_evt(ctx, &ent); | 1161 | ret = aio_read_evt(ctx, &ent); |
1154 | if (ret) | 1162 | if (ret) |
1155 | break; | 1163 | break; |
1156 | if (min_nr <= i) | 1164 | if (min_nr <= i) |
1157 | break; | 1165 | break; |
1158 | if (unlikely(ctx->dead)) { | 1166 | if (unlikely(ctx->dead)) { |
1159 | ret = -EINVAL; | 1167 | ret = -EINVAL; |
1160 | break; | 1168 | break; |
1161 | } | 1169 | } |
1162 | if (to.timed_out) /* Only check after read evt */ | 1170 | if (to.timed_out) /* Only check after read evt */ |
1163 | break; | 1171 | break; |
1164 | /* Try to only show up in io wait if there are ops | 1172 | /* Try to only show up in io wait if there are ops |
1165 | * in flight */ | 1173 | * in flight */ |
1166 | if (ctx->reqs_active) | 1174 | if (ctx->reqs_active) |
1167 | io_schedule(); | 1175 | io_schedule(); |
1168 | else | 1176 | else |
1169 | schedule(); | 1177 | schedule(); |
1170 | if (signal_pending(tsk)) { | 1178 | if (signal_pending(tsk)) { |
1171 | ret = -EINTR; | 1179 | ret = -EINTR; |
1172 | break; | 1180 | break; |
1173 | } | 1181 | } |
1174 | /*ret = aio_read_evt(ctx, &ent);*/ | 1182 | /*ret = aio_read_evt(ctx, &ent);*/ |
1175 | } while (1) ; | 1183 | } while (1) ; |
1176 | 1184 | ||
1177 | set_task_state(tsk, TASK_RUNNING); | 1185 | set_task_state(tsk, TASK_RUNNING); |
1178 | remove_wait_queue(&ctx->wait, &wait); | 1186 | remove_wait_queue(&ctx->wait, &wait); |
1179 | 1187 | ||
1180 | if (unlikely(ret <= 0)) | 1188 | if (unlikely(ret <= 0)) |
1181 | break; | 1189 | break; |
1182 | 1190 | ||
1183 | ret = -EFAULT; | 1191 | ret = -EFAULT; |
1184 | if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { | 1192 | if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) { |
1185 | dprintk("aio: lost an event due to EFAULT.\n"); | 1193 | dprintk("aio: lost an event due to EFAULT.\n"); |
1186 | break; | 1194 | break; |
1187 | } | 1195 | } |
1188 | 1196 | ||
1189 | /* Good, event copied to userland, update counts. */ | 1197 | /* Good, event copied to userland, update counts. */ |
1190 | event ++; | 1198 | event ++; |
1191 | i ++; | 1199 | i ++; |
1192 | } | 1200 | } |
1193 | 1201 | ||
1194 | if (timeout) | 1202 | if (timeout) |
1195 | clear_timeout(&to); | 1203 | clear_timeout(&to); |
1196 | out: | 1204 | out: |
1197 | destroy_timer_on_stack(&to.timer); | 1205 | destroy_timer_on_stack(&to.timer); |
1198 | return i ? i : ret; | 1206 | return i ? i : ret; |
1199 | } | 1207 | } |
1200 | 1208 | ||
1201 | /* Take an ioctx and remove it from the list of ioctx's. Protects | 1209 | /* Take an ioctx and remove it from the list of ioctx's. Protects |
1202 | * against races with itself via ->dead. | 1210 | * against races with itself via ->dead. |
1203 | */ | 1211 | */ |
1204 | static void io_destroy(struct kioctx *ioctx) | 1212 | static void io_destroy(struct kioctx *ioctx) |
1205 | { | 1213 | { |
1206 | struct mm_struct *mm = current->mm; | 1214 | struct mm_struct *mm = current->mm; |
1207 | int was_dead; | 1215 | int was_dead; |
1208 | 1216 | ||
1209 | /* delete the entry from the list is someone else hasn't already */ | 1217 | /* delete the entry from the list is someone else hasn't already */ |
1210 | spin_lock(&mm->ioctx_lock); | 1218 | spin_lock(&mm->ioctx_lock); |
1211 | was_dead = ioctx->dead; | 1219 | was_dead = ioctx->dead; |
1212 | ioctx->dead = 1; | 1220 | ioctx->dead = 1; |
1213 | hlist_del_rcu(&ioctx->list); | 1221 | hlist_del_rcu(&ioctx->list); |
1214 | spin_unlock(&mm->ioctx_lock); | 1222 | spin_unlock(&mm->ioctx_lock); |
1215 | 1223 | ||
1216 | dprintk("aio_release(%p)\n", ioctx); | 1224 | dprintk("aio_release(%p)\n", ioctx); |
1217 | if (likely(!was_dead)) | 1225 | if (likely(!was_dead)) |
1218 | put_ioctx(ioctx); /* twice for the list */ | 1226 | put_ioctx(ioctx); /* twice for the list */ |
1219 | 1227 | ||
1220 | aio_cancel_all(ioctx); | 1228 | aio_cancel_all(ioctx); |
1221 | wait_for_all_aios(ioctx); | 1229 | wait_for_all_aios(ioctx); |
1222 | 1230 | ||
1223 | /* | 1231 | /* |
1224 | * Wake up any waiters. The setting of ctx->dead must be seen | 1232 | * Wake up any waiters. The setting of ctx->dead must be seen |
1225 | * by other CPUs at this point. Right now, we rely on the | 1233 | * by other CPUs at this point. Right now, we rely on the |
1226 | * locking done by the above calls to ensure this consistency. | 1234 | * locking done by the above calls to ensure this consistency. |
1227 | */ | 1235 | */ |
1228 | wake_up(&ioctx->wait); | 1236 | wake_up(&ioctx->wait); |
1229 | put_ioctx(ioctx); /* once for the lookup */ | 1237 | put_ioctx(ioctx); /* once for the lookup */ |
1230 | } | 1238 | } |
1231 | 1239 | ||
1232 | /* sys_io_setup: | 1240 | /* sys_io_setup: |
1233 | * Create an aio_context capable of receiving at least nr_events. | 1241 | * Create an aio_context capable of receiving at least nr_events. |
1234 | * ctxp must not point to an aio_context that already exists, and | 1242 | * ctxp must not point to an aio_context that already exists, and |
1235 | * must be initialized to 0 prior to the call. On successful | 1243 | * must be initialized to 0 prior to the call. On successful |
1236 | * creation of the aio_context, *ctxp is filled in with the resulting | 1244 | * creation of the aio_context, *ctxp is filled in with the resulting |
1237 | * handle. May fail with -EINVAL if *ctxp is not initialized, | 1245 | * handle. May fail with -EINVAL if *ctxp is not initialized, |
1238 | * if the specified nr_events exceeds internal limits. May fail | 1246 | * if the specified nr_events exceeds internal limits. May fail |
1239 | * with -EAGAIN if the specified nr_events exceeds the user's limit | 1247 | * with -EAGAIN if the specified nr_events exceeds the user's limit |
1240 | * of available events. May fail with -ENOMEM if insufficient kernel | 1248 | * of available events. May fail with -ENOMEM if insufficient kernel |
1241 | * resources are available. May fail with -EFAULT if an invalid | 1249 | * resources are available. May fail with -EFAULT if an invalid |
1242 | * pointer is passed for ctxp. Will fail with -ENOSYS if not | 1250 | * pointer is passed for ctxp. Will fail with -ENOSYS if not |
1243 | * implemented. | 1251 | * implemented. |
1244 | */ | 1252 | */ |
1245 | SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) | 1253 | SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) |
1246 | { | 1254 | { |
1247 | struct kioctx *ioctx = NULL; | 1255 | struct kioctx *ioctx = NULL; |
1248 | unsigned long ctx; | 1256 | unsigned long ctx; |
1249 | long ret; | 1257 | long ret; |
1250 | 1258 | ||
1251 | ret = get_user(ctx, ctxp); | 1259 | ret = get_user(ctx, ctxp); |
1252 | if (unlikely(ret)) | 1260 | if (unlikely(ret)) |
1253 | goto out; | 1261 | goto out; |
1254 | 1262 | ||
1255 | ret = -EINVAL; | 1263 | ret = -EINVAL; |
1256 | if (unlikely(ctx || nr_events == 0)) { | 1264 | if (unlikely(ctx || nr_events == 0)) { |
1257 | pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n", | 1265 | pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n", |
1258 | ctx, nr_events); | 1266 | ctx, nr_events); |
1259 | goto out; | 1267 | goto out; |
1260 | } | 1268 | } |
1261 | 1269 | ||
1262 | ioctx = ioctx_alloc(nr_events); | 1270 | ioctx = ioctx_alloc(nr_events); |
1263 | ret = PTR_ERR(ioctx); | 1271 | ret = PTR_ERR(ioctx); |
1264 | if (!IS_ERR(ioctx)) { | 1272 | if (!IS_ERR(ioctx)) { |
1265 | ret = put_user(ioctx->user_id, ctxp); | 1273 | ret = put_user(ioctx->user_id, ctxp); |
1266 | if (!ret) | 1274 | if (!ret) |
1267 | return 0; | 1275 | return 0; |
1268 | 1276 | ||
1269 | get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */ | 1277 | get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */ |
1270 | io_destroy(ioctx); | 1278 | io_destroy(ioctx); |
1271 | } | 1279 | } |
1272 | 1280 | ||
1273 | out: | 1281 | out: |
1274 | return ret; | 1282 | return ret; |
1275 | } | 1283 | } |
1276 | 1284 | ||
1277 | /* sys_io_destroy: | 1285 | /* sys_io_destroy: |
1278 | * Destroy the aio_context specified. May cancel any outstanding | 1286 | * Destroy the aio_context specified. May cancel any outstanding |
1279 | * AIOs and block on completion. Will fail with -ENOSYS if not | 1287 | * AIOs and block on completion. Will fail with -ENOSYS if not |
1280 | * implemented. May fail with -EINVAL if the context pointed to | 1288 | * implemented. May fail with -EINVAL if the context pointed to |
1281 | * is invalid. | 1289 | * is invalid. |
1282 | */ | 1290 | */ |
1283 | SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) | 1291 | SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) |
1284 | { | 1292 | { |
1285 | struct kioctx *ioctx = lookup_ioctx(ctx); | 1293 | struct kioctx *ioctx = lookup_ioctx(ctx); |
1286 | if (likely(NULL != ioctx)) { | 1294 | if (likely(NULL != ioctx)) { |
1287 | io_destroy(ioctx); | 1295 | io_destroy(ioctx); |
1288 | return 0; | 1296 | return 0; |
1289 | } | 1297 | } |
1290 | pr_debug("EINVAL: io_destroy: invalid context id\n"); | 1298 | pr_debug("EINVAL: io_destroy: invalid context id\n"); |
1291 | return -EINVAL; | 1299 | return -EINVAL; |
1292 | } | 1300 | } |
1293 | 1301 | ||
1294 | static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) | 1302 | static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) |
1295 | { | 1303 | { |
1296 | struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg]; | 1304 | struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg]; |
1297 | 1305 | ||
1298 | BUG_ON(ret <= 0); | 1306 | BUG_ON(ret <= 0); |
1299 | 1307 | ||
1300 | while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) { | 1308 | while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) { |
1301 | ssize_t this = min((ssize_t)iov->iov_len, ret); | 1309 | ssize_t this = min((ssize_t)iov->iov_len, ret); |
1302 | iov->iov_base += this; | 1310 | iov->iov_base += this; |
1303 | iov->iov_len -= this; | 1311 | iov->iov_len -= this; |
1304 | iocb->ki_left -= this; | 1312 | iocb->ki_left -= this; |
1305 | ret -= this; | 1313 | ret -= this; |
1306 | if (iov->iov_len == 0) { | 1314 | if (iov->iov_len == 0) { |
1307 | iocb->ki_cur_seg++; | 1315 | iocb->ki_cur_seg++; |
1308 | iov++; | 1316 | iov++; |
1309 | } | 1317 | } |
1310 | } | 1318 | } |
1311 | 1319 | ||
1312 | /* the caller should not have done more io than what fit in | 1320 | /* the caller should not have done more io than what fit in |
1313 | * the remaining iovecs */ | 1321 | * the remaining iovecs */ |
1314 | BUG_ON(ret > 0 && iocb->ki_left == 0); | 1322 | BUG_ON(ret > 0 && iocb->ki_left == 0); |
1315 | } | 1323 | } |
1316 | 1324 | ||
1317 | static ssize_t aio_rw_vect_retry(struct kiocb *iocb) | 1325 | static ssize_t aio_rw_vect_retry(struct kiocb *iocb) |
1318 | { | 1326 | { |
1319 | struct file *file = iocb->ki_filp; | 1327 | struct file *file = iocb->ki_filp; |
1320 | struct address_space *mapping = file->f_mapping; | 1328 | struct address_space *mapping = file->f_mapping; |
1321 | struct inode *inode = mapping->host; | 1329 | struct inode *inode = mapping->host; |
1322 | ssize_t (*rw_op)(struct kiocb *, const struct iovec *, | 1330 | ssize_t (*rw_op)(struct kiocb *, const struct iovec *, |
1323 | unsigned long, loff_t); | 1331 | unsigned long, loff_t); |
1324 | ssize_t ret = 0; | 1332 | ssize_t ret = 0; |
1325 | unsigned short opcode; | 1333 | unsigned short opcode; |
1326 | 1334 | ||
1327 | if ((iocb->ki_opcode == IOCB_CMD_PREADV) || | 1335 | if ((iocb->ki_opcode == IOCB_CMD_PREADV) || |
1328 | (iocb->ki_opcode == IOCB_CMD_PREAD)) { | 1336 | (iocb->ki_opcode == IOCB_CMD_PREAD)) { |
1329 | rw_op = file->f_op->aio_read; | 1337 | rw_op = file->f_op->aio_read; |
1330 | opcode = IOCB_CMD_PREADV; | 1338 | opcode = IOCB_CMD_PREADV; |
1331 | } else { | 1339 | } else { |
1332 | rw_op = file->f_op->aio_write; | 1340 | rw_op = file->f_op->aio_write; |
1333 | opcode = IOCB_CMD_PWRITEV; | 1341 | opcode = IOCB_CMD_PWRITEV; |
1334 | } | 1342 | } |
1335 | 1343 | ||
1336 | /* This matches the pread()/pwrite() logic */ | 1344 | /* This matches the pread()/pwrite() logic */ |
1337 | if (iocb->ki_pos < 0) | 1345 | if (iocb->ki_pos < 0) |
1338 | return -EINVAL; | 1346 | return -EINVAL; |
1339 | 1347 | ||
1340 | do { | 1348 | do { |
1341 | ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], | 1349 | ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], |
1342 | iocb->ki_nr_segs - iocb->ki_cur_seg, | 1350 | iocb->ki_nr_segs - iocb->ki_cur_seg, |
1343 | iocb->ki_pos); | 1351 | iocb->ki_pos); |
1344 | if (ret > 0) | 1352 | if (ret > 0) |
1345 | aio_advance_iovec(iocb, ret); | 1353 | aio_advance_iovec(iocb, ret); |
1346 | 1354 | ||
1347 | /* retry all partial writes. retry partial reads as long as its a | 1355 | /* retry all partial writes. retry partial reads as long as its a |
1348 | * regular file. */ | 1356 | * regular file. */ |
1349 | } while (ret > 0 && iocb->ki_left > 0 && | 1357 | } while (ret > 0 && iocb->ki_left > 0 && |
1350 | (opcode == IOCB_CMD_PWRITEV || | 1358 | (opcode == IOCB_CMD_PWRITEV || |
1351 | (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); | 1359 | (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); |
1352 | 1360 | ||
1353 | /* This means we must have transferred all that we could */ | 1361 | /* This means we must have transferred all that we could */ |
1354 | /* No need to retry anymore */ | 1362 | /* No need to retry anymore */ |
1355 | if ((ret == 0) || (iocb->ki_left == 0)) | 1363 | if ((ret == 0) || (iocb->ki_left == 0)) |
1356 | ret = iocb->ki_nbytes - iocb->ki_left; | 1364 | ret = iocb->ki_nbytes - iocb->ki_left; |
1357 | 1365 | ||
1358 | /* If we managed to write some out we return that, rather than | 1366 | /* If we managed to write some out we return that, rather than |
1359 | * the eventual error. */ | 1367 | * the eventual error. */ |
1360 | if (opcode == IOCB_CMD_PWRITEV | 1368 | if (opcode == IOCB_CMD_PWRITEV |
1361 | && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY | 1369 | && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY |
1362 | && iocb->ki_nbytes - iocb->ki_left) | 1370 | && iocb->ki_nbytes - iocb->ki_left) |
1363 | ret = iocb->ki_nbytes - iocb->ki_left; | 1371 | ret = iocb->ki_nbytes - iocb->ki_left; |
1364 | 1372 | ||
1365 | return ret; | 1373 | return ret; |
1366 | } | 1374 | } |
1367 | 1375 | ||
1368 | static ssize_t aio_fdsync(struct kiocb *iocb) | 1376 | static ssize_t aio_fdsync(struct kiocb *iocb) |
1369 | { | 1377 | { |
1370 | struct file *file = iocb->ki_filp; | 1378 | struct file *file = iocb->ki_filp; |
1371 | ssize_t ret = -EINVAL; | 1379 | ssize_t ret = -EINVAL; |
1372 | 1380 | ||
1373 | if (file->f_op->aio_fsync) | 1381 | if (file->f_op->aio_fsync) |
1374 | ret = file->f_op->aio_fsync(iocb, 1); | 1382 | ret = file->f_op->aio_fsync(iocb, 1); |
1375 | return ret; | 1383 | return ret; |
1376 | } | 1384 | } |
1377 | 1385 | ||
1378 | static ssize_t aio_fsync(struct kiocb *iocb) | 1386 | static ssize_t aio_fsync(struct kiocb *iocb) |
1379 | { | 1387 | { |
1380 | struct file *file = iocb->ki_filp; | 1388 | struct file *file = iocb->ki_filp; |
1381 | ssize_t ret = -EINVAL; | 1389 | ssize_t ret = -EINVAL; |
1382 | 1390 | ||
1383 | if (file->f_op->aio_fsync) | 1391 | if (file->f_op->aio_fsync) |
1384 | ret = file->f_op->aio_fsync(iocb, 0); | 1392 | ret = file->f_op->aio_fsync(iocb, 0); |
1385 | return ret; | 1393 | return ret; |
1386 | } | 1394 | } |
1387 | 1395 | ||
1388 | static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) | 1396 | static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) |
1389 | { | 1397 | { |
1390 | ssize_t ret; | 1398 | ssize_t ret; |
1391 | 1399 | ||
1392 | #ifdef CONFIG_COMPAT | 1400 | #ifdef CONFIG_COMPAT |
1393 | if (compat) | 1401 | if (compat) |
1394 | ret = compat_rw_copy_check_uvector(type, | 1402 | ret = compat_rw_copy_check_uvector(type, |
1395 | (struct compat_iovec __user *)kiocb->ki_buf, | 1403 | (struct compat_iovec __user *)kiocb->ki_buf, |
1396 | kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, | 1404 | kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, |
1397 | &kiocb->ki_iovec); | 1405 | &kiocb->ki_iovec); |
1398 | else | 1406 | else |
1399 | #endif | 1407 | #endif |
1400 | ret = rw_copy_check_uvector(type, | 1408 | ret = rw_copy_check_uvector(type, |
1401 | (struct iovec __user *)kiocb->ki_buf, | 1409 | (struct iovec __user *)kiocb->ki_buf, |
1402 | kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, | 1410 | kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, |
1403 | &kiocb->ki_iovec); | 1411 | &kiocb->ki_iovec); |
1404 | if (ret < 0) | 1412 | if (ret < 0) |
1405 | goto out; | 1413 | goto out; |
1406 | 1414 | ||
1407 | kiocb->ki_nr_segs = kiocb->ki_nbytes; | 1415 | kiocb->ki_nr_segs = kiocb->ki_nbytes; |
1408 | kiocb->ki_cur_seg = 0; | 1416 | kiocb->ki_cur_seg = 0; |
1409 | /* ki_nbytes/left now reflect bytes instead of segs */ | 1417 | /* ki_nbytes/left now reflect bytes instead of segs */ |
1410 | kiocb->ki_nbytes = ret; | 1418 | kiocb->ki_nbytes = ret; |
1411 | kiocb->ki_left = ret; | 1419 | kiocb->ki_left = ret; |
1412 | 1420 | ||
1413 | ret = 0; | 1421 | ret = 0; |
1414 | out: | 1422 | out: |
1415 | return ret; | 1423 | return ret; |
1416 | } | 1424 | } |
1417 | 1425 | ||
1418 | static ssize_t aio_setup_single_vector(struct kiocb *kiocb) | 1426 | static ssize_t aio_setup_single_vector(struct kiocb *kiocb) |
1419 | { | 1427 | { |
1420 | kiocb->ki_iovec = &kiocb->ki_inline_vec; | 1428 | kiocb->ki_iovec = &kiocb->ki_inline_vec; |
1421 | kiocb->ki_iovec->iov_base = kiocb->ki_buf; | 1429 | kiocb->ki_iovec->iov_base = kiocb->ki_buf; |
1422 | kiocb->ki_iovec->iov_len = kiocb->ki_left; | 1430 | kiocb->ki_iovec->iov_len = kiocb->ki_left; |
1423 | kiocb->ki_nr_segs = 1; | 1431 | kiocb->ki_nr_segs = 1; |
1424 | kiocb->ki_cur_seg = 0; | 1432 | kiocb->ki_cur_seg = 0; |
1425 | return 0; | 1433 | return 0; |
1426 | } | 1434 | } |
1427 | 1435 | ||
1428 | /* | 1436 | /* |
1429 | * aio_setup_iocb: | 1437 | * aio_setup_iocb: |
1430 | * Performs the initial checks and aio retry method | 1438 | * Performs the initial checks and aio retry method |
1431 | * setup for the kiocb at the time of io submission. | 1439 | * setup for the kiocb at the time of io submission. |
1432 | */ | 1440 | */ |
1433 | static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) | 1441 | static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat) |
1434 | { | 1442 | { |
1435 | struct file *file = kiocb->ki_filp; | 1443 | struct file *file = kiocb->ki_filp; |
1436 | ssize_t ret = 0; | 1444 | ssize_t ret = 0; |
1437 | 1445 | ||
1438 | switch (kiocb->ki_opcode) { | 1446 | switch (kiocb->ki_opcode) { |
1439 | case IOCB_CMD_PREAD: | 1447 | case IOCB_CMD_PREAD: |
1440 | ret = -EBADF; | 1448 | ret = -EBADF; |
1441 | if (unlikely(!(file->f_mode & FMODE_READ))) | 1449 | if (unlikely(!(file->f_mode & FMODE_READ))) |
1442 | break; | 1450 | break; |
1443 | ret = -EFAULT; | 1451 | ret = -EFAULT; |
1444 | if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, | 1452 | if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf, |
1445 | kiocb->ki_left))) | 1453 | kiocb->ki_left))) |
1446 | break; | 1454 | break; |
1447 | ret = security_file_permission(file, MAY_READ); | 1455 | ret = security_file_permission(file, MAY_READ); |
1448 | if (unlikely(ret)) | 1456 | if (unlikely(ret)) |
1449 | break; | 1457 | break; |
1450 | ret = aio_setup_single_vector(kiocb); | 1458 | ret = aio_setup_single_vector(kiocb); |
1451 | if (ret) | 1459 | if (ret) |
1452 | break; | 1460 | break; |
1453 | ret = -EINVAL; | 1461 | ret = -EINVAL; |
1454 | if (file->f_op->aio_read) | 1462 | if (file->f_op->aio_read) |
1455 | kiocb->ki_retry = aio_rw_vect_retry; | 1463 | kiocb->ki_retry = aio_rw_vect_retry; |
1456 | break; | 1464 | break; |
1457 | case IOCB_CMD_PWRITE: | 1465 | case IOCB_CMD_PWRITE: |
1458 | ret = -EBADF; | 1466 | ret = -EBADF; |
1459 | if (unlikely(!(file->f_mode & FMODE_WRITE))) | 1467 | if (unlikely(!(file->f_mode & FMODE_WRITE))) |
1460 | break; | 1468 | break; |
1461 | ret = -EFAULT; | 1469 | ret = -EFAULT; |
1462 | if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, | 1470 | if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf, |
1463 | kiocb->ki_left))) | 1471 | kiocb->ki_left))) |
1464 | break; | 1472 | break; |
1465 | ret = security_file_permission(file, MAY_WRITE); | 1473 | ret = security_file_permission(file, MAY_WRITE); |
1466 | if (unlikely(ret)) | 1474 | if (unlikely(ret)) |
1467 | break; | 1475 | break; |
1468 | ret = aio_setup_single_vector(kiocb); | 1476 | ret = aio_setup_single_vector(kiocb); |
1469 | if (ret) | 1477 | if (ret) |
1470 | break; | 1478 | break; |
1471 | ret = -EINVAL; | 1479 | ret = -EINVAL; |
1472 | if (file->f_op->aio_write) | 1480 | if (file->f_op->aio_write) |
1473 | kiocb->ki_retry = aio_rw_vect_retry; | 1481 | kiocb->ki_retry = aio_rw_vect_retry; |
1474 | break; | 1482 | break; |
1475 | case IOCB_CMD_PREADV: | 1483 | case IOCB_CMD_PREADV: |
1476 | ret = -EBADF; | 1484 | ret = -EBADF; |
1477 | if (unlikely(!(file->f_mode & FMODE_READ))) | 1485 | if (unlikely(!(file->f_mode & FMODE_READ))) |
1478 | break; | 1486 | break; |
1479 | ret = security_file_permission(file, MAY_READ); | 1487 | ret = security_file_permission(file, MAY_READ); |
1480 | if (unlikely(ret)) | 1488 | if (unlikely(ret)) |
1481 | break; | 1489 | break; |
1482 | ret = aio_setup_vectored_rw(READ, kiocb, compat); | 1490 | ret = aio_setup_vectored_rw(READ, kiocb, compat); |
1483 | if (ret) | 1491 | if (ret) |
1484 | break; | 1492 | break; |
1485 | ret = -EINVAL; | 1493 | ret = -EINVAL; |
1486 | if (file->f_op->aio_read) | 1494 | if (file->f_op->aio_read) |
1487 | kiocb->ki_retry = aio_rw_vect_retry; | 1495 | kiocb->ki_retry = aio_rw_vect_retry; |
1488 | break; | 1496 | break; |
1489 | case IOCB_CMD_PWRITEV: | 1497 | case IOCB_CMD_PWRITEV: |
1490 | ret = -EBADF; | 1498 | ret = -EBADF; |
1491 | if (unlikely(!(file->f_mode & FMODE_WRITE))) | 1499 | if (unlikely(!(file->f_mode & FMODE_WRITE))) |
1492 | break; | 1500 | break; |
1493 | ret = security_file_permission(file, MAY_WRITE); | 1501 | ret = security_file_permission(file, MAY_WRITE); |
1494 | if (unlikely(ret)) | 1502 | if (unlikely(ret)) |
1495 | break; | 1503 | break; |
1496 | ret = aio_setup_vectored_rw(WRITE, kiocb, compat); | 1504 | ret = aio_setup_vectored_rw(WRITE, kiocb, compat); |
1497 | if (ret) | 1505 | if (ret) |
1498 | break; | 1506 | break; |
1499 | ret = -EINVAL; | 1507 | ret = -EINVAL; |
1500 | if (file->f_op->aio_write) | 1508 | if (file->f_op->aio_write) |
1501 | kiocb->ki_retry = aio_rw_vect_retry; | 1509 | kiocb->ki_retry = aio_rw_vect_retry; |
1502 | break; | 1510 | break; |
1503 | case IOCB_CMD_FDSYNC: | 1511 | case IOCB_CMD_FDSYNC: |
1504 | ret = -EINVAL; | 1512 | ret = -EINVAL; |
1505 | if (file->f_op->aio_fsync) | 1513 | if (file->f_op->aio_fsync) |
1506 | kiocb->ki_retry = aio_fdsync; | 1514 | kiocb->ki_retry = aio_fdsync; |
1507 | break; | 1515 | break; |
1508 | case IOCB_CMD_FSYNC: | 1516 | case IOCB_CMD_FSYNC: |
1509 | ret = -EINVAL; | 1517 | ret = -EINVAL; |
1510 | if (file->f_op->aio_fsync) | 1518 | if (file->f_op->aio_fsync) |
1511 | kiocb->ki_retry = aio_fsync; | 1519 | kiocb->ki_retry = aio_fsync; |
1512 | break; | 1520 | break; |
1513 | default: | 1521 | default: |
1514 | dprintk("EINVAL: io_submit: no operation provided\n"); | 1522 | dprintk("EINVAL: io_submit: no operation provided\n"); |
1515 | ret = -EINVAL; | 1523 | ret = -EINVAL; |
1516 | } | 1524 | } |
1517 | 1525 | ||
1518 | if (!kiocb->ki_retry) | 1526 | if (!kiocb->ki_retry) |
1519 | return ret; | 1527 | return ret; |
1520 | 1528 | ||
1521 | return 0; | 1529 | return 0; |
1522 | } | 1530 | } |
1523 | 1531 | ||
1524 | static void aio_batch_add(struct address_space *mapping, | 1532 | static void aio_batch_add(struct address_space *mapping, |
1525 | struct hlist_head *batch_hash) | 1533 | struct hlist_head *batch_hash) |
1526 | { | 1534 | { |
1527 | struct aio_batch_entry *abe; | 1535 | struct aio_batch_entry *abe; |
1528 | struct hlist_node *pos; | 1536 | struct hlist_node *pos; |
1529 | unsigned bucket; | 1537 | unsigned bucket; |
1530 | 1538 | ||
1531 | bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS); | 1539 | bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS); |
1532 | hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) { | 1540 | hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) { |
1533 | if (abe->mapping == mapping) | 1541 | if (abe->mapping == mapping) |
1534 | return; | 1542 | return; |
1535 | } | 1543 | } |
1536 | 1544 | ||
1537 | abe = mempool_alloc(abe_pool, GFP_KERNEL); | 1545 | abe = mempool_alloc(abe_pool, GFP_KERNEL); |
1538 | BUG_ON(!igrab(mapping->host)); | 1546 | BUG_ON(!igrab(mapping->host)); |
1539 | abe->mapping = mapping; | 1547 | abe->mapping = mapping; |
1540 | hlist_add_head(&abe->list, &batch_hash[bucket]); | 1548 | hlist_add_head(&abe->list, &batch_hash[bucket]); |
1541 | return; | 1549 | return; |
1542 | } | 1550 | } |
1543 | 1551 | ||
1544 | static void aio_batch_free(struct hlist_head *batch_hash) | 1552 | static void aio_batch_free(struct hlist_head *batch_hash) |
1545 | { | 1553 | { |
1546 | struct aio_batch_entry *abe; | 1554 | struct aio_batch_entry *abe; |
1547 | struct hlist_node *pos, *n; | 1555 | struct hlist_node *pos, *n; |
1548 | int i; | 1556 | int i; |
1549 | 1557 | ||
1550 | for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) { | 1558 | for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) { |
1551 | hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) { | 1559 | hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) { |
1552 | blk_run_address_space(abe->mapping); | 1560 | blk_run_address_space(abe->mapping); |
1553 | iput(abe->mapping->host); | 1561 | iput(abe->mapping->host); |
1554 | hlist_del(&abe->list); | 1562 | hlist_del(&abe->list); |
1555 | mempool_free(abe, abe_pool); | 1563 | mempool_free(abe, abe_pool); |
1556 | } | 1564 | } |
1557 | } | 1565 | } |
1558 | } | 1566 | } |
1559 | 1567 | ||
/*
 * io_submit_one:
 *	Validate one user-supplied iocb, build a kiocb for it and start it.
 *
 *	@ctx:		aio context the request is allocated from and run on
 *	@user_iocb:	user-space address of the iocb; receives the request
 *			key and is remembered in the kiocb
 *	@iocb:		kernel copy of the iocb contents
 *	@batch_hash:	batch hash that the file's mapping is added to for
 *			read/write opcodes
 *	@compat:	forwarded to aio_setup_iocb()
 *
 *	Returns 0 on success or a negative errno.
 */
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
			 struct iocb *iocb, struct hlist_head *batch_hash,
			 bool compat)
{
	struct kiocb *req;
	struct file *file;
	ssize_t ret;

	/* enforce forwards compatibility on users */
	if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
		pr_debug("EINVAL: io_submit: reserve field set\n");
		return -EINVAL;
	}

	/*
	 * prevent overflows: the buffer address and byte count must survive
	 * the round trip through the native unsigned long / size_t types,
	 * and the count must not turn negative when viewed as ssize_t.
	 */
	if (unlikely(
	    (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
	    (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
	    ((ssize_t)iocb->aio_nbytes < 0)
	   )) {
		pr_debug("EINVAL: io_submit: overflow check\n");
		return -EINVAL;
	}

	file = fget(iocb->aio_fildes);
	if (unlikely(!file))
		return -EBADF;

	req = aio_get_req(ctx);		/* returns with 2 references to req */
	if (unlikely(!req)) {
		/* No kiocb owns the file yet, so drop the reference here. */
		fput(file);
		return -EAGAIN;
	}
	/*
	 * From here on error paths only put the req, never the file.
	 * NOTE(review): presumably the final aio_put_req() releases
	 * ki_filp -- confirm in aio_put_req().
	 */
	req->ki_filp = file;
	if (iocb->aio_flags & IOCB_FLAG_RESFD) {
		/*
		 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
		 * instance of the file* now. The file descriptor must be
		 * an eventfd() fd, and will be signaled for each completed
		 * event using the eventfd_signal() function.
		 */
		req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd);
		if (IS_ERR(req->ki_eventfd)) {
			ret = PTR_ERR(req->ki_eventfd);
			/* Do not leave an ERR_PTR value behind in the kiocb. */
			req->ki_eventfd = NULL;
			goto out_put_req;
		}
	}

	/* Publish the request key in the user's iocb. */
	ret = put_user(req->ki_key, &user_iocb->aio_key);
	if (unlikely(ret)) {
		dprintk("EFAULT: aio_key\n");
		goto out_put_req;
	}

	req->ki_obj.user = user_iocb;
	req->ki_user_data = iocb->aio_data;
	req->ki_pos = iocb->aio_offset;

	req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf;
	req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
	req->ki_opcode = iocb->aio_lio_opcode;

	ret = aio_setup_iocb(req, compat);

	if (ret)
		goto out_put_req;

	/* Run the new request, then anything else queued on the context. */
	spin_lock_irq(&ctx->ctx_lock);
	aio_run_iocb(req);
	if (!list_empty(&ctx->run_list)) {
		/* drain the run list */
		while (__aio_run_iocbs(ctx))
			;
	}
	spin_unlock_irq(&ctx->ctx_lock);
	/* Only the data-transfer opcodes register the mapping in the batch. */
	if (req->ki_opcode == IOCB_CMD_PREAD ||
	    req->ki_opcode == IOCB_CMD_PREADV ||
	    req->ki_opcode == IOCB_CMD_PWRITE ||
	    req->ki_opcode == IOCB_CMD_PWRITEV)
		aio_batch_add(file->f_mapping, batch_hash);

	aio_put_req(req);	/* drop extra ref to req */
	return 0;

out_put_req:
	aio_put_req(req);	/* drop extra ref to req */
	aio_put_req(req);	/* drop i/o ref to req */
	return ret;
}
1650 | 1658 | ||
1651 | long do_io_submit(aio_context_t ctx_id, long nr, | 1659 | long do_io_submit(aio_context_t ctx_id, long nr, |
1652 | struct iocb __user *__user *iocbpp, bool compat) | 1660 | struct iocb __user *__user *iocbpp, bool compat) |
1653 | { | 1661 | { |
1654 | struct kioctx *ctx; | 1662 | struct kioctx *ctx; |
1655 | long ret = 0; | 1663 | long ret = 0; |
1656 | int i; | 1664 | int i; |
1657 | struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, }; | 1665 | struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, }; |
1658 | 1666 | ||
1659 | if (unlikely(nr < 0)) | 1667 | if (unlikely(nr < 0)) |
1660 | return -EINVAL; | 1668 | return -EINVAL; |
1661 | 1669 | ||
1662 | if (unlikely(nr > LONG_MAX/sizeof(*iocbpp))) | 1670 | if (unlikely(nr > LONG_MAX/sizeof(*iocbpp))) |
1663 | nr = LONG_MAX/sizeof(*iocbpp); | 1671 | nr = LONG_MAX/sizeof(*iocbpp); |
1664 | 1672 | ||
1665 | if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) | 1673 | if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr*sizeof(*iocbpp))))) |
1666 | return -EFAULT; | 1674 | return -EFAULT; |
1667 | 1675 | ||
1668 | ctx = lookup_ioctx(ctx_id); | 1676 | ctx = lookup_ioctx(ctx_id); |
1669 | if (unlikely(!ctx)) { | 1677 | if (unlikely(!ctx)) { |
1670 | pr_debug("EINVAL: io_submit: invalid context id\n"); | 1678 | pr_debug("EINVAL: io_submit: invalid context id\n"); |
1671 | return -EINVAL; | 1679 | return -EINVAL; |
1672 | } | 1680 | } |
1673 | 1681 | ||
1674 | /* | 1682 | /* |
1675 | * AKPM: should this return a partial result if some of the IOs were | 1683 | * AKPM: should this return a partial result if some of the IOs were |
1676 | * successfully submitted? | 1684 | * successfully submitted? |
1677 | */ | 1685 | */ |
1678 | for (i=0; i<nr; i++) { | 1686 | for (i=0; i<nr; i++) { |
1679 | struct iocb __user *user_iocb; | 1687 | struct iocb __user *user_iocb; |
1680 | struct iocb tmp; | 1688 | struct iocb tmp; |
1681 | 1689 | ||
1682 | if (unlikely(__get_user(user_iocb, iocbpp + i))) { | 1690 | if (unlikely(__get_user(user_iocb, iocbpp + i))) { |
1683 | ret = -EFAULT; | 1691 | ret = -EFAULT; |
1684 | break; | 1692 | break; |
1685 | } | 1693 | } |
1686 | 1694 | ||
1687 | if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) { | 1695 | if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) { |
1688 | ret = -EFAULT; | 1696 | ret = -EFAULT; |
1689 | break; | 1697 | break; |
1690 | } | 1698 | } |
1691 | 1699 | ||
1692 | ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat); | 1700 | ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat); |
1693 | if (ret) | 1701 | if (ret) |
1694 | break; | 1702 | break; |
1695 | } | 1703 | } |
1696 | aio_batch_free(batch_hash); | 1704 | aio_batch_free(batch_hash); |
1697 | 1705 | ||
1698 | put_ioctx(ctx); | 1706 | put_ioctx(ctx); |
1699 | return i ? i : ret; | 1707 | return i ? i : ret; |
1700 | } | 1708 | } |
1701 | 1709 | ||
/* sys_io_submit:
 *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
 *	the number of iocbs queued.  May return -EINVAL if the aio_context
 *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
 *	*iocbpp[0] is not properly initialized, if the operation specified
 *	is invalid for the file descriptor in the iocb.  May fail with
 *	-EFAULT if any of the data structures point to invalid data.  May
 *	fail with -EBADF if the file descriptor specified in the first
 *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
 *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
 *	fail with -ENOSYS if not implemented.
 *
 *	This entry point only forwards its arguments to do_io_submit(),
 *	passing 0 for the compat flag.
 */
SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
		struct iocb __user * __user *, iocbpp)
{
	return do_io_submit(ctx_id, nr, iocbpp, 0);
}
1719 | 1727 | ||
1720 | /* lookup_kiocb | 1728 | /* lookup_kiocb |
1721 | * Finds a given iocb for cancellation. | 1729 | * Finds a given iocb for cancellation. |
1722 | */ | 1730 | */ |
1723 | static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, | 1731 | static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, |
1724 | u32 key) | 1732 | u32 key) |
1725 | { | 1733 | { |
1726 | struct list_head *pos; | 1734 | struct list_head *pos; |
1727 | 1735 | ||
1728 | assert_spin_locked(&ctx->ctx_lock); | 1736 | assert_spin_locked(&ctx->ctx_lock); |
1729 | 1737 | ||
1730 | /* TODO: use a hash or array, this sucks. */ | 1738 | /* TODO: use a hash or array, this sucks. */ |
1731 | list_for_each(pos, &ctx->active_reqs) { | 1739 | list_for_each(pos, &ctx->active_reqs) { |
1732 | struct kiocb *kiocb = list_kiocb(pos); | 1740 | struct kiocb *kiocb = list_kiocb(pos); |
1733 | if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key) | 1741 | if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key) |
1734 | return kiocb; | 1742 | return kiocb; |
1735 | } | 1743 | } |
1736 | return NULL; | 1744 | return NULL; |
1737 | } | 1745 | } |
1738 | 1746 | ||
1739 | /* sys_io_cancel: | 1747 | /* sys_io_cancel: |
1740 | * Attempts to cancel an iocb previously passed to io_submit. If | 1748 | * Attempts to cancel an iocb previously passed to io_submit. If |
1741 | * the operation is successfully cancelled, the resulting event is | 1749 | * the operation is successfully cancelled, the resulting event is |
1742 | * copied into the memory pointed to by result without being placed | 1750 | * copied into the memory pointed to by result without being placed |
1743 | * into the completion queue and 0 is returned. May fail with | 1751 | * into the completion queue and 0 is returned. May fail with |
1744 | * -EFAULT if any of the data structures pointed to are invalid. | 1752 | * -EFAULT if any of the data structures pointed to are invalid. |
1745 | * May fail with -EINVAL if aio_context specified by ctx_id is | 1753 | * May fail with -EINVAL if aio_context specified by ctx_id is |
1746 | * invalid. May fail with -EAGAIN if the iocb specified was not | 1754 | * invalid. May fail with -EAGAIN if the iocb specified was not |
1747 | * cancelled. Will fail with -ENOSYS if not implemented. | 1755 | * cancelled. Will fail with -ENOSYS if not implemented. |
1748 | */ | 1756 | */ |
1749 | SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, | 1757 | SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, |
1750 | struct io_event __user *, result) | 1758 | struct io_event __user *, result) |
1751 | { | 1759 | { |
1752 | int (*cancel)(struct kiocb *iocb, struct io_event *res); | 1760 | int (*cancel)(struct kiocb *iocb, struct io_event *res); |
1753 | struct kioctx *ctx; | 1761 | struct kioctx *ctx; |
1754 | struct kiocb *kiocb; | 1762 | struct kiocb *kiocb; |
1755 | u32 key; | 1763 | u32 key; |
1756 | int ret; | 1764 | int ret; |
1757 | 1765 | ||
1758 | ret = get_user(key, &iocb->aio_key); | 1766 | ret = get_user(key, &iocb->aio_key); |
1759 | if (unlikely(ret)) | 1767 | if (unlikely(ret)) |
1760 | return -EFAULT; | 1768 | return -EFAULT; |
1761 | 1769 | ||
1762 | ctx = lookup_ioctx(ctx_id); | 1770 | ctx = lookup_ioctx(ctx_id); |
1763 | if (unlikely(!ctx)) | 1771 | if (unlikely(!ctx)) |
1764 | return -EINVAL; | 1772 | return -EINVAL; |
1765 | 1773 | ||
1766 | spin_lock_irq(&ctx->ctx_lock); | 1774 | spin_lock_irq(&ctx->ctx_lock); |
1767 | ret = -EAGAIN; | 1775 | ret = -EAGAIN; |
1768 | kiocb = lookup_kiocb(ctx, iocb, key); | 1776 | kiocb = lookup_kiocb(ctx, iocb, key); |
1769 | if (kiocb && kiocb->ki_cancel) { | 1777 | if (kiocb && kiocb->ki_cancel) { |
1770 | cancel = kiocb->ki_cancel; | 1778 | cancel = kiocb->ki_cancel; |
1771 | kiocb->ki_users ++; | 1779 | kiocb->ki_users ++; |
1772 | kiocbSetCancelled(kiocb); | 1780 | kiocbSetCancelled(kiocb); |
1773 | } else | 1781 | } else |
1774 | cancel = NULL; | 1782 | cancel = NULL; |
1775 | spin_unlock_irq(&ctx->ctx_lock); | 1783 | spin_unlock_irq(&ctx->ctx_lock); |
1776 | 1784 | ||
1777 | if (NULL != cancel) { | 1785 | if (NULL != cancel) { |
1778 | struct io_event tmp; | 1786 | struct io_event tmp; |
1779 | pr_debug("calling cancel\n"); | 1787 | pr_debug("calling cancel\n"); |
1780 | memset(&tmp, 0, sizeof(tmp)); | 1788 | memset(&tmp, 0, sizeof(tmp)); |
1781 | tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; | 1789 | tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; |
1782 | tmp.data = kiocb->ki_user_data; | 1790 | tmp.data = kiocb->ki_user_data; |
1783 | ret = cancel(kiocb, &tmp); | 1791 | ret = cancel(kiocb, &tmp); |
1784 | if (!ret) { | 1792 | if (!ret) { |
1785 | /* Cancellation succeeded -- copy the result | 1793 | /* Cancellation succeeded -- copy the result |
1786 | * into the user's buffer. | 1794 | * into the user's buffer. |
1787 | */ | 1795 | */ |
1788 | if (copy_to_user(result, &tmp, sizeof(tmp))) | 1796 | if (copy_to_user(result, &tmp, sizeof(tmp))) |
1789 | ret = -EFAULT; | 1797 | ret = -EFAULT; |
1790 | } | 1798 | } |
1791 | } else | 1799 | } else |
1792 | ret = -EINVAL; | 1800 | ret = -EINVAL; |
1793 | 1801 | ||
1794 | put_ioctx(ctx); | 1802 | put_ioctx(ctx); |
1795 | 1803 | ||
1796 | return ret; | 1804 | return ret; |
1797 | } | 1805 | } |
1798 | 1806 | ||
1799 | /* io_getevents: | 1807 | /* io_getevents: |
1800 | * Attempts to read at least min_nr events and up to nr events from | 1808 | * Attempts to read at least min_nr events and up to nr events from |
1801 | * the completion queue for the aio_context specified by ctx_id. If | 1809 | * the completion queue for the aio_context specified by ctx_id. If |
1802 | * it succeeds, the number of read events is returned. May fail with | 1810 | * it succeeds, the number of read events is returned. May fail with |
1803 | * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is | 1811 | * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is |
1804 | * out of range, if timeout is out of range. May fail with -EFAULT | 1812 | * out of range, if timeout is out of range. May fail with -EFAULT |
1805 | * if any of the memory specified is invalid. May return 0 or | 1813 | * if any of the memory specified is invalid. May return 0 or |
1806 | * < min_nr if the timeout specified by timeout has elapsed | 1814 | * < min_nr if the timeout specified by timeout has elapsed |
1807 | * before sufficient events are available, where timeout == NULL | 1815 | * before sufficient events are available, where timeout == NULL |
1808 | * specifies an infinite timeout. Note that the timeout pointed to by | 1816 | * specifies an infinite timeout. Note that the timeout pointed to by |
1809 | * timeout is relative and will be updated if not NULL and the | 1817 | * timeout is relative and will be updated if not NULL and the |
1810 | * operation blocks. Will fail with -ENOSYS if not implemented. | 1818 | * operation blocks. Will fail with -ENOSYS if not implemented. |
1811 | */ | 1819 | */ |
1812 | SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, | 1820 | SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, |
1813 | long, min_nr, | 1821 | long, min_nr, |
1814 | long, nr, | 1822 | long, nr, |
1815 | struct io_event __user *, events, | 1823 | struct io_event __user *, events, |
1816 | struct timespec __user *, timeout) | 1824 | struct timespec __user *, timeout) |
1817 | { | 1825 | { |
1818 | struct kioctx *ioctx = lookup_ioctx(ctx_id); | 1826 | struct kioctx *ioctx = lookup_ioctx(ctx_id); |
1819 | long ret = -EINVAL; | 1827 | long ret = -EINVAL; |
1820 | 1828 | ||
1821 | if (likely(ioctx)) { | 1829 | if (likely(ioctx)) { |
1822 | if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0)) | 1830 | if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0)) |
1823 | ret = read_events(ioctx, min_nr, nr, events, timeout); | 1831 | ret = read_events(ioctx, min_nr, nr, events, timeout); |
1824 | put_ioctx(ioctx); | 1832 | put_ioctx(ioctx); |
1825 | } | 1833 | } |
1826 | 1834 | ||
1827 | asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout); | 1835 | asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout); |
1828 | return ret; | 1836 | return ret; |
1829 | } | 1837 | } |
1830 | 1838 |