Commit 598af051a79d05b751fe793f1fe09fcf74763e02
Committed by Linus Torvalds
1 parent 54a0151041
asmlinkage_protect sys_io_getevents
Use asmlinkage_protect in sys_io_getevents, because GCC for i386 with CONFIG_FRAME_POINTER=n can decide to clobber an argument word on the stack, i.e. the user struct pt_regs. Here the problem is not a tail call, but just the compiler's use of the stack when it inlines and optimizes the body of the called function. This seems to avoid it.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
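For context: asmlinkage_protect(n, ret, args...) is a macro (introduced by the parent commit) that emits an empty asm statement naming the syscall's n stack argument slots as live inputs, so the optimizer cannot reuse those slots — which alias the user's struct pt_regs on i386 — as scratch space. The single added line lands at the end of sys_io_getevents; that hunk falls outside the portion of the diff reproduced below, so what follows is a sketch of the patched function reconstructed from the commit message and the same-era fs/aio.c, not part of the rendered diff:

asmlinkage long sys_io_getevents(aio_context_t ctx_id,
				 long min_nr,
				 long nr,
				 struct io_event __user *events,
				 struct timespec __user *timeout)
{
	struct kioctx *ioctx = lookup_ioctx(ctx_id);
	long ret = -EINVAL;

	if (likely(ioctx)) {
		if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0))
			ret = read_events(ioctx, min_nr, nr, events, timeout);
		put_ioctx(ioctx);
	}

	/* Keep the five argument words on the stack (the user's struct
	 * pt_regs on i386) alive so GCC cannot clobber them as spill space. */
	asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout);
	return ret;
}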
Showing 1 changed file with 1 addition and 0 deletions
fs/aio.c
1 | /* | 1 | /* |
2 | * An async IO implementation for Linux | 2 | * An async IO implementation for Linux |
3 | * Written by Benjamin LaHaise <bcrl@kvack.org> | 3 | * Written by Benjamin LaHaise <bcrl@kvack.org> |
4 | * | 4 | * |
5 | * Implements an efficient asynchronous io interface. | 5 | * Implements an efficient asynchronous io interface. |
6 | * | 6 | * |
7 | * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. | 7 | * Copyright 2000, 2001, 2002 Red Hat, Inc. All Rights Reserved. |
8 | * | 8 | * |
9 | * See ../COPYING for licensing terms. | 9 | * See ../COPYING for licensing terms. |
10 | */ | 10 | */ |
11 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
12 | #include <linux/init.h> | 12 | #include <linux/init.h> |
13 | #include <linux/errno.h> | 13 | #include <linux/errno.h> |
14 | #include <linux/time.h> | 14 | #include <linux/time.h> |
15 | #include <linux/aio_abi.h> | 15 | #include <linux/aio_abi.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/syscalls.h> | 17 | #include <linux/syscalls.h> |
18 | #include <linux/uio.h> | 18 | #include <linux/uio.h> |
19 | 19 | ||
20 | #define DEBUG 0 | 20 | #define DEBUG 0 |
21 | 21 | ||
22 | #include <linux/sched.h> | 22 | #include <linux/sched.h> |
23 | #include <linux/fs.h> | 23 | #include <linux/fs.h> |
24 | #include <linux/file.h> | 24 | #include <linux/file.h> |
25 | #include <linux/mm.h> | 25 | #include <linux/mm.h> |
26 | #include <linux/mman.h> | 26 | #include <linux/mman.h> |
27 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
28 | #include <linux/timer.h> | 28 | #include <linux/timer.h> |
29 | #include <linux/aio.h> | 29 | #include <linux/aio.h> |
30 | #include <linux/highmem.h> | 30 | #include <linux/highmem.h> |
31 | #include <linux/workqueue.h> | 31 | #include <linux/workqueue.h> |
32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
33 | #include <linux/eventfd.h> | 33 | #include <linux/eventfd.h> |
34 | 34 | ||
35 | #include <asm/kmap_types.h> | 35 | #include <asm/kmap_types.h> |
36 | #include <asm/uaccess.h> | 36 | #include <asm/uaccess.h> |
37 | #include <asm/mmu_context.h> | 37 | #include <asm/mmu_context.h> |
38 | 38 | ||
39 | #if DEBUG > 1 | 39 | #if DEBUG > 1 |
40 | #define dprintk printk | 40 | #define dprintk printk |
41 | #else | 41 | #else |
42 | #define dprintk(x...) do { ; } while (0) | 42 | #define dprintk(x...) do { ; } while (0) |
43 | #endif | 43 | #endif |
44 | 44 | ||
45 | /*------ sysctl variables----*/ | 45 | /*------ sysctl variables----*/ |
46 | static DEFINE_SPINLOCK(aio_nr_lock); | 46 | static DEFINE_SPINLOCK(aio_nr_lock); |
47 | unsigned long aio_nr; /* current system wide number of aio requests */ | 47 | unsigned long aio_nr; /* current system wide number of aio requests */ |
48 | unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ | 48 | unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ |
49 | /*----end sysctl variables---*/ | 49 | /*----end sysctl variables---*/ |
50 | 50 | ||
51 | static struct kmem_cache *kiocb_cachep; | 51 | static struct kmem_cache *kiocb_cachep; |
52 | static struct kmem_cache *kioctx_cachep; | 52 | static struct kmem_cache *kioctx_cachep; |
53 | 53 | ||
54 | static struct workqueue_struct *aio_wq; | 54 | static struct workqueue_struct *aio_wq; |
55 | 55 | ||
56 | /* Used for rare fput completion. */ | 56 | /* Used for rare fput completion. */ |
57 | static void aio_fput_routine(struct work_struct *); | 57 | static void aio_fput_routine(struct work_struct *); |
58 | static DECLARE_WORK(fput_work, aio_fput_routine); | 58 | static DECLARE_WORK(fput_work, aio_fput_routine); |
59 | 59 | ||
60 | static DEFINE_SPINLOCK(fput_lock); | 60 | static DEFINE_SPINLOCK(fput_lock); |
61 | static LIST_HEAD(fput_head); | 61 | static LIST_HEAD(fput_head); |
62 | 62 | ||
63 | static void aio_kick_handler(struct work_struct *); | 63 | static void aio_kick_handler(struct work_struct *); |
64 | static void aio_queue_work(struct kioctx *); | 64 | static void aio_queue_work(struct kioctx *); |
65 | 65 | ||
66 | /* aio_setup | 66 | /* aio_setup |
67 | * Creates the slab caches used by the aio routines, panic on | 67 | * Creates the slab caches used by the aio routines, panic on |
68 | * failure as this is done early during the boot sequence. | 68 | * failure as this is done early during the boot sequence. |
69 | */ | 69 | */ |
70 | static int __init aio_setup(void) | 70 | static int __init aio_setup(void) |
71 | { | 71 | { |
72 | kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); | 72 | kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); |
73 | kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); | 73 | kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); |
74 | 74 | ||
75 | aio_wq = create_workqueue("aio"); | 75 | aio_wq = create_workqueue("aio"); |
76 | 76 | ||
77 | pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); | 77 | pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); |
78 | 78 | ||
79 | return 0; | 79 | return 0; |
80 | } | 80 | } |
81 | 81 | ||
82 | static void aio_free_ring(struct kioctx *ctx) | 82 | static void aio_free_ring(struct kioctx *ctx) |
83 | { | 83 | { |
84 | struct aio_ring_info *info = &ctx->ring_info; | 84 | struct aio_ring_info *info = &ctx->ring_info; |
85 | long i; | 85 | long i; |
86 | 86 | ||
87 | for (i=0; i<info->nr_pages; i++) | 87 | for (i=0; i<info->nr_pages; i++) |
88 | put_page(info->ring_pages[i]); | 88 | put_page(info->ring_pages[i]); |
89 | 89 | ||
90 | if (info->mmap_size) { | 90 | if (info->mmap_size) { |
91 | down_write(&ctx->mm->mmap_sem); | 91 | down_write(&ctx->mm->mmap_sem); |
92 | do_munmap(ctx->mm, info->mmap_base, info->mmap_size); | 92 | do_munmap(ctx->mm, info->mmap_base, info->mmap_size); |
93 | up_write(&ctx->mm->mmap_sem); | 93 | up_write(&ctx->mm->mmap_sem); |
94 | } | 94 | } |
95 | 95 | ||
96 | if (info->ring_pages && info->ring_pages != info->internal_pages) | 96 | if (info->ring_pages && info->ring_pages != info->internal_pages) |
97 | kfree(info->ring_pages); | 97 | kfree(info->ring_pages); |
98 | info->ring_pages = NULL; | 98 | info->ring_pages = NULL; |
99 | info->nr = 0; | 99 | info->nr = 0; |
100 | } | 100 | } |
101 | 101 | ||
102 | static int aio_setup_ring(struct kioctx *ctx) | 102 | static int aio_setup_ring(struct kioctx *ctx) |
103 | { | 103 | { |
104 | struct aio_ring *ring; | 104 | struct aio_ring *ring; |
105 | struct aio_ring_info *info = &ctx->ring_info; | 105 | struct aio_ring_info *info = &ctx->ring_info; |
106 | unsigned nr_events = ctx->max_reqs; | 106 | unsigned nr_events = ctx->max_reqs; |
107 | unsigned long size; | 107 | unsigned long size; |
108 | int nr_pages; | 108 | int nr_pages; |
109 | 109 | ||
110 | /* Compensate for the ring buffer's head/tail overlap entry */ | 110 | /* Compensate for the ring buffer's head/tail overlap entry */ |
111 | nr_events += 2; /* 1 is required, 2 for good luck */ | 111 | nr_events += 2; /* 1 is required, 2 for good luck */ |
112 | 112 | ||
113 | size = sizeof(struct aio_ring); | 113 | size = sizeof(struct aio_ring); |
114 | size += sizeof(struct io_event) * nr_events; | 114 | size += sizeof(struct io_event) * nr_events; |
115 | nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; | 115 | nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; |
116 | 116 | ||
117 | if (nr_pages < 0) | 117 | if (nr_pages < 0) |
118 | return -EINVAL; | 118 | return -EINVAL; |
119 | 119 | ||
120 | nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); | 120 | nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); |
121 | 121 | ||
122 | info->nr = 0; | 122 | info->nr = 0; |
123 | info->ring_pages = info->internal_pages; | 123 | info->ring_pages = info->internal_pages; |
124 | if (nr_pages > AIO_RING_PAGES) { | 124 | if (nr_pages > AIO_RING_PAGES) { |
125 | info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); | 125 | info->ring_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); |
126 | if (!info->ring_pages) | 126 | if (!info->ring_pages) |
127 | return -ENOMEM; | 127 | return -ENOMEM; |
128 | } | 128 | } |
129 | 129 | ||
130 | info->mmap_size = nr_pages * PAGE_SIZE; | 130 | info->mmap_size = nr_pages * PAGE_SIZE; |
131 | dprintk("attempting mmap of %lu bytes\n", info->mmap_size); | 131 | dprintk("attempting mmap of %lu bytes\n", info->mmap_size); |
132 | down_write(&ctx->mm->mmap_sem); | 132 | down_write(&ctx->mm->mmap_sem); |
133 | info->mmap_base = do_mmap(NULL, 0, info->mmap_size, | 133 | info->mmap_base = do_mmap(NULL, 0, info->mmap_size, |
134 | PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, | 134 | PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, |
135 | 0); | 135 | 0); |
136 | if (IS_ERR((void *)info->mmap_base)) { | 136 | if (IS_ERR((void *)info->mmap_base)) { |
137 | up_write(&ctx->mm->mmap_sem); | 137 | up_write(&ctx->mm->mmap_sem); |
138 | info->mmap_size = 0; | 138 | info->mmap_size = 0; |
139 | aio_free_ring(ctx); | 139 | aio_free_ring(ctx); |
140 | return -EAGAIN; | 140 | return -EAGAIN; |
141 | } | 141 | } |
142 | 142 | ||
143 | dprintk("mmap address: 0x%08lx\n", info->mmap_base); | 143 | dprintk("mmap address: 0x%08lx\n", info->mmap_base); |
144 | info->nr_pages = get_user_pages(current, ctx->mm, | 144 | info->nr_pages = get_user_pages(current, ctx->mm, |
145 | info->mmap_base, nr_pages, | 145 | info->mmap_base, nr_pages, |
146 | 1, 0, info->ring_pages, NULL); | 146 | 1, 0, info->ring_pages, NULL); |
147 | up_write(&ctx->mm->mmap_sem); | 147 | up_write(&ctx->mm->mmap_sem); |
148 | 148 | ||
149 | if (unlikely(info->nr_pages != nr_pages)) { | 149 | if (unlikely(info->nr_pages != nr_pages)) { |
150 | aio_free_ring(ctx); | 150 | aio_free_ring(ctx); |
151 | return -EAGAIN; | 151 | return -EAGAIN; |
152 | } | 152 | } |
153 | 153 | ||
154 | ctx->user_id = info->mmap_base; | 154 | ctx->user_id = info->mmap_base; |
155 | 155 | ||
156 | info->nr = nr_events; /* trusted copy */ | 156 | info->nr = nr_events; /* trusted copy */ |
157 | 157 | ||
158 | ring = kmap_atomic(info->ring_pages[0], KM_USER0); | 158 | ring = kmap_atomic(info->ring_pages[0], KM_USER0); |
159 | ring->nr = nr_events; /* user copy */ | 159 | ring->nr = nr_events; /* user copy */ |
160 | ring->id = ctx->user_id; | 160 | ring->id = ctx->user_id; |
161 | ring->head = ring->tail = 0; | 161 | ring->head = ring->tail = 0; |
162 | ring->magic = AIO_RING_MAGIC; | 162 | ring->magic = AIO_RING_MAGIC; |
163 | ring->compat_features = AIO_RING_COMPAT_FEATURES; | 163 | ring->compat_features = AIO_RING_COMPAT_FEATURES; |
164 | ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; | 164 | ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; |
165 | ring->header_length = sizeof(struct aio_ring); | 165 | ring->header_length = sizeof(struct aio_ring); |
166 | kunmap_atomic(ring, KM_USER0); | 166 | kunmap_atomic(ring, KM_USER0); |
167 | 167 | ||
168 | return 0; | 168 | return 0; |
169 | } | 169 | } |
170 | 170 | ||
171 | 171 | ||
172 | /* aio_ring_event: returns a pointer to the event at the given index from | 172 | /* aio_ring_event: returns a pointer to the event at the given index from |
173 | * kmap_atomic(, km). Release the pointer with put_aio_ring_event(); | 173 | * kmap_atomic(, km). Release the pointer with put_aio_ring_event(); |
174 | */ | 174 | */ |
175 | #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) | 175 | #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / sizeof(struct io_event)) |
176 | #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) | 176 | #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) |
177 | #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) | 177 | #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) |
178 | 178 | ||
179 | #define aio_ring_event(info, nr, km) ({ \ | 179 | #define aio_ring_event(info, nr, km) ({ \ |
180 | unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ | 180 | unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ |
181 | struct io_event *__event; \ | 181 | struct io_event *__event; \ |
182 | __event = kmap_atomic( \ | 182 | __event = kmap_atomic( \ |
183 | (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ | 183 | (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ |
184 | __event += pos % AIO_EVENTS_PER_PAGE; \ | 184 | __event += pos % AIO_EVENTS_PER_PAGE; \ |
185 | __event; \ | 185 | __event; \ |
186 | }) | 186 | }) |
187 | 187 | ||
188 | #define put_aio_ring_event(event, km) do { \ | 188 | #define put_aio_ring_event(event, km) do { \ |
189 | struct io_event *__event = (event); \ | 189 | struct io_event *__event = (event); \ |
190 | (void)__event; \ | 190 | (void)__event; \ |
191 | kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ | 191 | kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ |
192 | } while(0) | 192 | } while(0) |
193 | 193 | ||
194 | /* ioctx_alloc | 194 | /* ioctx_alloc |
195 | * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. | 195 | * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. |
196 | */ | 196 | */ |
197 | static struct kioctx *ioctx_alloc(unsigned nr_events) | 197 | static struct kioctx *ioctx_alloc(unsigned nr_events) |
198 | { | 198 | { |
199 | struct mm_struct *mm; | 199 | struct mm_struct *mm; |
200 | struct kioctx *ctx; | 200 | struct kioctx *ctx; |
201 | 201 | ||
202 | /* Prevent overflows */ | 202 | /* Prevent overflows */ |
203 | if ((nr_events > (0x10000000U / sizeof(struct io_event))) || | 203 | if ((nr_events > (0x10000000U / sizeof(struct io_event))) || |
204 | (nr_events > (0x10000000U / sizeof(struct kiocb)))) { | 204 | (nr_events > (0x10000000U / sizeof(struct kiocb)))) { |
205 | pr_debug("ENOMEM: nr_events too high\n"); | 205 | pr_debug("ENOMEM: nr_events too high\n"); |
206 | return ERR_PTR(-EINVAL); | 206 | return ERR_PTR(-EINVAL); |
207 | } | 207 | } |
208 | 208 | ||
209 | if ((unsigned long)nr_events > aio_max_nr) | 209 | if ((unsigned long)nr_events > aio_max_nr) |
210 | return ERR_PTR(-EAGAIN); | 210 | return ERR_PTR(-EAGAIN); |
211 | 211 | ||
212 | ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); | 212 | ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); |
213 | if (!ctx) | 213 | if (!ctx) |
214 | return ERR_PTR(-ENOMEM); | 214 | return ERR_PTR(-ENOMEM); |
215 | 215 | ||
216 | ctx->max_reqs = nr_events; | 216 | ctx->max_reqs = nr_events; |
217 | mm = ctx->mm = current->mm; | 217 | mm = ctx->mm = current->mm; |
218 | atomic_inc(&mm->mm_count); | 218 | atomic_inc(&mm->mm_count); |
219 | 219 | ||
220 | atomic_set(&ctx->users, 1); | 220 | atomic_set(&ctx->users, 1); |
221 | spin_lock_init(&ctx->ctx_lock); | 221 | spin_lock_init(&ctx->ctx_lock); |
222 | spin_lock_init(&ctx->ring_info.ring_lock); | 222 | spin_lock_init(&ctx->ring_info.ring_lock); |
223 | init_waitqueue_head(&ctx->wait); | 223 | init_waitqueue_head(&ctx->wait); |
224 | 224 | ||
225 | INIT_LIST_HEAD(&ctx->active_reqs); | 225 | INIT_LIST_HEAD(&ctx->active_reqs); |
226 | INIT_LIST_HEAD(&ctx->run_list); | 226 | INIT_LIST_HEAD(&ctx->run_list); |
227 | INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler); | 227 | INIT_DELAYED_WORK(&ctx->wq, aio_kick_handler); |
228 | 228 | ||
229 | if (aio_setup_ring(ctx) < 0) | 229 | if (aio_setup_ring(ctx) < 0) |
230 | goto out_freectx; | 230 | goto out_freectx; |
231 | 231 | ||
232 | /* limit the number of system wide aios */ | 232 | /* limit the number of system wide aios */ |
233 | spin_lock(&aio_nr_lock); | 233 | spin_lock(&aio_nr_lock); |
234 | if (aio_nr + ctx->max_reqs > aio_max_nr || | 234 | if (aio_nr + ctx->max_reqs > aio_max_nr || |
235 | aio_nr + ctx->max_reqs < aio_nr) | 235 | aio_nr + ctx->max_reqs < aio_nr) |
236 | ctx->max_reqs = 0; | 236 | ctx->max_reqs = 0; |
237 | else | 237 | else |
238 | aio_nr += ctx->max_reqs; | 238 | aio_nr += ctx->max_reqs; |
239 | spin_unlock(&aio_nr_lock); | 239 | spin_unlock(&aio_nr_lock); |
240 | if (ctx->max_reqs == 0) | 240 | if (ctx->max_reqs == 0) |
241 | goto out_cleanup; | 241 | goto out_cleanup; |
242 | 242 | ||
243 | /* now link into global list. kludge. FIXME */ | 243 | /* now link into global list. kludge. FIXME */ |
244 | write_lock(&mm->ioctx_list_lock); | 244 | write_lock(&mm->ioctx_list_lock); |
245 | ctx->next = mm->ioctx_list; | 245 | ctx->next = mm->ioctx_list; |
246 | mm->ioctx_list = ctx; | 246 | mm->ioctx_list = ctx; |
247 | write_unlock(&mm->ioctx_list_lock); | 247 | write_unlock(&mm->ioctx_list_lock); |
248 | 248 | ||
249 | dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", | 249 | dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", |
250 | ctx, ctx->user_id, current->mm, ctx->ring_info.nr); | 250 | ctx, ctx->user_id, current->mm, ctx->ring_info.nr); |
251 | return ctx; | 251 | return ctx; |
252 | 252 | ||
253 | out_cleanup: | 253 | out_cleanup: |
254 | __put_ioctx(ctx); | 254 | __put_ioctx(ctx); |
255 | return ERR_PTR(-EAGAIN); | 255 | return ERR_PTR(-EAGAIN); |
256 | 256 | ||
257 | out_freectx: | 257 | out_freectx: |
258 | mmdrop(mm); | 258 | mmdrop(mm); |
259 | kmem_cache_free(kioctx_cachep, ctx); | 259 | kmem_cache_free(kioctx_cachep, ctx); |
260 | ctx = ERR_PTR(-ENOMEM); | 260 | ctx = ERR_PTR(-ENOMEM); |
261 | 261 | ||
262 | dprintk("aio: error allocating ioctx %p\n", ctx); | 262 | dprintk("aio: error allocating ioctx %p\n", ctx); |
263 | return ctx; | 263 | return ctx; |
264 | } | 264 | } |
265 | 265 | ||
266 | /* aio_cancel_all | 266 | /* aio_cancel_all |
267 | * Cancels all outstanding aio requests on an aio context. Used | 267 | * Cancels all outstanding aio requests on an aio context. Used |
268 | * when the processes owning a context have all exited to encourage | 268 | * when the processes owning a context have all exited to encourage |
269 | * the rapid destruction of the kioctx. | 269 | * the rapid destruction of the kioctx. |
270 | */ | 270 | */ |
271 | static void aio_cancel_all(struct kioctx *ctx) | 271 | static void aio_cancel_all(struct kioctx *ctx) |
272 | { | 272 | { |
273 | int (*cancel)(struct kiocb *, struct io_event *); | 273 | int (*cancel)(struct kiocb *, struct io_event *); |
274 | struct io_event res; | 274 | struct io_event res; |
275 | spin_lock_irq(&ctx->ctx_lock); | 275 | spin_lock_irq(&ctx->ctx_lock); |
276 | ctx->dead = 1; | 276 | ctx->dead = 1; |
277 | while (!list_empty(&ctx->active_reqs)) { | 277 | while (!list_empty(&ctx->active_reqs)) { |
278 | struct list_head *pos = ctx->active_reqs.next; | 278 | struct list_head *pos = ctx->active_reqs.next; |
279 | struct kiocb *iocb = list_kiocb(pos); | 279 | struct kiocb *iocb = list_kiocb(pos); |
280 | list_del_init(&iocb->ki_list); | 280 | list_del_init(&iocb->ki_list); |
281 | cancel = iocb->ki_cancel; | 281 | cancel = iocb->ki_cancel; |
282 | kiocbSetCancelled(iocb); | 282 | kiocbSetCancelled(iocb); |
283 | if (cancel) { | 283 | if (cancel) { |
284 | iocb->ki_users++; | 284 | iocb->ki_users++; |
285 | spin_unlock_irq(&ctx->ctx_lock); | 285 | spin_unlock_irq(&ctx->ctx_lock); |
286 | cancel(iocb, &res); | 286 | cancel(iocb, &res); |
287 | spin_lock_irq(&ctx->ctx_lock); | 287 | spin_lock_irq(&ctx->ctx_lock); |
288 | } | 288 | } |
289 | } | 289 | } |
290 | spin_unlock_irq(&ctx->ctx_lock); | 290 | spin_unlock_irq(&ctx->ctx_lock); |
291 | } | 291 | } |
292 | 292 | ||
293 | static void wait_for_all_aios(struct kioctx *ctx) | 293 | static void wait_for_all_aios(struct kioctx *ctx) |
294 | { | 294 | { |
295 | struct task_struct *tsk = current; | 295 | struct task_struct *tsk = current; |
296 | DECLARE_WAITQUEUE(wait, tsk); | 296 | DECLARE_WAITQUEUE(wait, tsk); |
297 | 297 | ||
298 | spin_lock_irq(&ctx->ctx_lock); | 298 | spin_lock_irq(&ctx->ctx_lock); |
299 | if (!ctx->reqs_active) | 299 | if (!ctx->reqs_active) |
300 | goto out; | 300 | goto out; |
301 | 301 | ||
302 | add_wait_queue(&ctx->wait, &wait); | 302 | add_wait_queue(&ctx->wait, &wait); |
303 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 303 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); |
304 | while (ctx->reqs_active) { | 304 | while (ctx->reqs_active) { |
305 | spin_unlock_irq(&ctx->ctx_lock); | 305 | spin_unlock_irq(&ctx->ctx_lock); |
306 | io_schedule(); | 306 | io_schedule(); |
307 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); | 307 | set_task_state(tsk, TASK_UNINTERRUPTIBLE); |
308 | spin_lock_irq(&ctx->ctx_lock); | 308 | spin_lock_irq(&ctx->ctx_lock); |
309 | } | 309 | } |
310 | __set_task_state(tsk, TASK_RUNNING); | 310 | __set_task_state(tsk, TASK_RUNNING); |
311 | remove_wait_queue(&ctx->wait, &wait); | 311 | remove_wait_queue(&ctx->wait, &wait); |
312 | 312 | ||
313 | out: | 313 | out: |
314 | spin_unlock_irq(&ctx->ctx_lock); | 314 | spin_unlock_irq(&ctx->ctx_lock); |
315 | } | 315 | } |
316 | 316 | ||
317 | /* wait_on_sync_kiocb: | 317 | /* wait_on_sync_kiocb: |
318 | * Waits on the given sync kiocb to complete. | 318 | * Waits on the given sync kiocb to complete. |
319 | */ | 319 | */ |
320 | ssize_t wait_on_sync_kiocb(struct kiocb *iocb) | 320 | ssize_t wait_on_sync_kiocb(struct kiocb *iocb) |
321 | { | 321 | { |
322 | while (iocb->ki_users) { | 322 | while (iocb->ki_users) { |
323 | set_current_state(TASK_UNINTERRUPTIBLE); | 323 | set_current_state(TASK_UNINTERRUPTIBLE); |
324 | if (!iocb->ki_users) | 324 | if (!iocb->ki_users) |
325 | break; | 325 | break; |
326 | io_schedule(); | 326 | io_schedule(); |
327 | } | 327 | } |
328 | __set_current_state(TASK_RUNNING); | 328 | __set_current_state(TASK_RUNNING); |
329 | return iocb->ki_user_data; | 329 | return iocb->ki_user_data; |
330 | } | 330 | } |
331 | 331 | ||
332 | /* exit_aio: called when the last user of mm goes away. At this point, | 332 | /* exit_aio: called when the last user of mm goes away. At this point, |
333 | * there is no way for any new requests to be submited or any of the | 333 | * there is no way for any new requests to be submited or any of the |
334 | * io_* syscalls to be called on the context. However, there may be | 334 | * io_* syscalls to be called on the context. However, there may be |
335 | * outstanding requests which hold references to the context; as they | 335 | * outstanding requests which hold references to the context; as they |
336 | * go away, they will call put_ioctx and release any pinned memory | 336 | * go away, they will call put_ioctx and release any pinned memory |
337 | * associated with the request (held via struct page * references). | 337 | * associated with the request (held via struct page * references). |
338 | */ | 338 | */ |
339 | void exit_aio(struct mm_struct *mm) | 339 | void exit_aio(struct mm_struct *mm) |
340 | { | 340 | { |
341 | struct kioctx *ctx = mm->ioctx_list; | 341 | struct kioctx *ctx = mm->ioctx_list; |
342 | mm->ioctx_list = NULL; | 342 | mm->ioctx_list = NULL; |
343 | while (ctx) { | 343 | while (ctx) { |
344 | struct kioctx *next = ctx->next; | 344 | struct kioctx *next = ctx->next; |
345 | ctx->next = NULL; | 345 | ctx->next = NULL; |
346 | aio_cancel_all(ctx); | 346 | aio_cancel_all(ctx); |
347 | 347 | ||
348 | wait_for_all_aios(ctx); | 348 | wait_for_all_aios(ctx); |
349 | /* | 349 | /* |
350 | * Ensure we don't leave the ctx on the aio_wq | 350 | * Ensure we don't leave the ctx on the aio_wq |
351 | */ | 351 | */ |
352 | cancel_work_sync(&ctx->wq.work); | 352 | cancel_work_sync(&ctx->wq.work); |
353 | 353 | ||
354 | if (1 != atomic_read(&ctx->users)) | 354 | if (1 != atomic_read(&ctx->users)) |
355 | printk(KERN_DEBUG | 355 | printk(KERN_DEBUG |
356 | "exit_aio:ioctx still alive: %d %d %d\n", | 356 | "exit_aio:ioctx still alive: %d %d %d\n", |
357 | atomic_read(&ctx->users), ctx->dead, | 357 | atomic_read(&ctx->users), ctx->dead, |
358 | ctx->reqs_active); | 358 | ctx->reqs_active); |
359 | put_ioctx(ctx); | 359 | put_ioctx(ctx); |
360 | ctx = next; | 360 | ctx = next; |
361 | } | 361 | } |
362 | } | 362 | } |
363 | 363 | ||
364 | /* __put_ioctx | 364 | /* __put_ioctx |
365 | * Called when the last user of an aio context has gone away, | 365 | * Called when the last user of an aio context has gone away, |
366 | * and the struct needs to be freed. | 366 | * and the struct needs to be freed. |
367 | */ | 367 | */ |
368 | void __put_ioctx(struct kioctx *ctx) | 368 | void __put_ioctx(struct kioctx *ctx) |
369 | { | 369 | { |
370 | unsigned nr_events = ctx->max_reqs; | 370 | unsigned nr_events = ctx->max_reqs; |
371 | 371 | ||
372 | BUG_ON(ctx->reqs_active); | 372 | BUG_ON(ctx->reqs_active); |
373 | 373 | ||
374 | cancel_delayed_work(&ctx->wq); | 374 | cancel_delayed_work(&ctx->wq); |
375 | cancel_work_sync(&ctx->wq.work); | 375 | cancel_work_sync(&ctx->wq.work); |
376 | aio_free_ring(ctx); | 376 | aio_free_ring(ctx); |
377 | mmdrop(ctx->mm); | 377 | mmdrop(ctx->mm); |
378 | ctx->mm = NULL; | 378 | ctx->mm = NULL; |
379 | pr_debug("__put_ioctx: freeing %p\n", ctx); | 379 | pr_debug("__put_ioctx: freeing %p\n", ctx); |
380 | kmem_cache_free(kioctx_cachep, ctx); | 380 | kmem_cache_free(kioctx_cachep, ctx); |
381 | 381 | ||
382 | if (nr_events) { | 382 | if (nr_events) { |
383 | spin_lock(&aio_nr_lock); | 383 | spin_lock(&aio_nr_lock); |
384 | BUG_ON(aio_nr - nr_events > aio_nr); | 384 | BUG_ON(aio_nr - nr_events > aio_nr); |
385 | aio_nr -= nr_events; | 385 | aio_nr -= nr_events; |
386 | spin_unlock(&aio_nr_lock); | 386 | spin_unlock(&aio_nr_lock); |
387 | } | 387 | } |
388 | } | 388 | } |
389 | 389 | ||
390 | /* aio_get_req | 390 | /* aio_get_req |
391 | * Allocate a slot for an aio request. Increments the users count | 391 | * Allocate a slot for an aio request. Increments the users count |
392 | * of the kioctx so that the kioctx stays around until all requests are | 392 | * of the kioctx so that the kioctx stays around until all requests are |
393 | * complete. Returns NULL if no requests are free. | 393 | * complete. Returns NULL if no requests are free. |
394 | * | 394 | * |
395 | * Returns with kiocb->users set to 2. The io submit code path holds | 395 | * Returns with kiocb->users set to 2. The io submit code path holds |
396 | * an extra reference while submitting the i/o. | 396 | * an extra reference while submitting the i/o. |
397 | * This prevents races between the aio code path referencing the | 397 | * This prevents races between the aio code path referencing the |
398 | * req (after submitting it) and aio_complete() freeing the req. | 398 | * req (after submitting it) and aio_complete() freeing the req. |
399 | */ | 399 | */ |
400 | static struct kiocb *__aio_get_req(struct kioctx *ctx) | 400 | static struct kiocb *__aio_get_req(struct kioctx *ctx) |
401 | { | 401 | { |
402 | struct kiocb *req = NULL; | 402 | struct kiocb *req = NULL; |
403 | struct aio_ring *ring; | 403 | struct aio_ring *ring; |
404 | int okay = 0; | 404 | int okay = 0; |
405 | 405 | ||
406 | req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); | 406 | req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL); |
407 | if (unlikely(!req)) | 407 | if (unlikely(!req)) |
408 | return NULL; | 408 | return NULL; |
409 | 409 | ||
410 | req->ki_flags = 0; | 410 | req->ki_flags = 0; |
411 | req->ki_users = 2; | 411 | req->ki_users = 2; |
412 | req->ki_key = 0; | 412 | req->ki_key = 0; |
413 | req->ki_ctx = ctx; | 413 | req->ki_ctx = ctx; |
414 | req->ki_cancel = NULL; | 414 | req->ki_cancel = NULL; |
415 | req->ki_retry = NULL; | 415 | req->ki_retry = NULL; |
416 | req->ki_dtor = NULL; | 416 | req->ki_dtor = NULL; |
417 | req->private = NULL; | 417 | req->private = NULL; |
418 | req->ki_iovec = NULL; | 418 | req->ki_iovec = NULL; |
419 | INIT_LIST_HEAD(&req->ki_run_list); | 419 | INIT_LIST_HEAD(&req->ki_run_list); |
420 | req->ki_eventfd = ERR_PTR(-EINVAL); | 420 | req->ki_eventfd = ERR_PTR(-EINVAL); |
421 | 421 | ||
422 | /* Check if the completion queue has enough free space to | 422 | /* Check if the completion queue has enough free space to |
423 | * accept an event from this io. | 423 | * accept an event from this io. |
424 | */ | 424 | */ |
425 | spin_lock_irq(&ctx->ctx_lock); | 425 | spin_lock_irq(&ctx->ctx_lock); |
426 | ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); | 426 | ring = kmap_atomic(ctx->ring_info.ring_pages[0], KM_USER0); |
427 | if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { | 427 | if (ctx->reqs_active < aio_ring_avail(&ctx->ring_info, ring)) { |
428 | list_add(&req->ki_list, &ctx->active_reqs); | 428 | list_add(&req->ki_list, &ctx->active_reqs); |
429 | ctx->reqs_active++; | 429 | ctx->reqs_active++; |
430 | okay = 1; | 430 | okay = 1; |
431 | } | 431 | } |
432 | kunmap_atomic(ring, KM_USER0); | 432 | kunmap_atomic(ring, KM_USER0); |
433 | spin_unlock_irq(&ctx->ctx_lock); | 433 | spin_unlock_irq(&ctx->ctx_lock); |
434 | 434 | ||
435 | if (!okay) { | 435 | if (!okay) { |
436 | kmem_cache_free(kiocb_cachep, req); | 436 | kmem_cache_free(kiocb_cachep, req); |
437 | req = NULL; | 437 | req = NULL; |
438 | } | 438 | } |
439 | 439 | ||
440 | return req; | 440 | return req; |
441 | } | 441 | } |
442 | 442 | ||
443 | static inline struct kiocb *aio_get_req(struct kioctx *ctx) | 443 | static inline struct kiocb *aio_get_req(struct kioctx *ctx) |
444 | { | 444 | { |
445 | struct kiocb *req; | 445 | struct kiocb *req; |
446 | /* Handle a potential starvation case -- should be exceedingly rare as | 446 | /* Handle a potential starvation case -- should be exceedingly rare as |
447 | * requests will be stuck on fput_head only if the aio_fput_routine is | 447 | * requests will be stuck on fput_head only if the aio_fput_routine is |
448 | * delayed and the requests were the last user of the struct file. | 448 | * delayed and the requests were the last user of the struct file. |
449 | */ | 449 | */ |
450 | req = __aio_get_req(ctx); | 450 | req = __aio_get_req(ctx); |
451 | if (unlikely(NULL == req)) { | 451 | if (unlikely(NULL == req)) { |
452 | aio_fput_routine(NULL); | 452 | aio_fput_routine(NULL); |
453 | req = __aio_get_req(ctx); | 453 | req = __aio_get_req(ctx); |
454 | } | 454 | } |
455 | return req; | 455 | return req; |
456 | } | 456 | } |
457 | 457 | ||
458 | static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) | 458 | static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) |
459 | { | 459 | { |
460 | assert_spin_locked(&ctx->ctx_lock); | 460 | assert_spin_locked(&ctx->ctx_lock); |
461 | 461 | ||
462 | if (!IS_ERR(req->ki_eventfd)) | 462 | if (!IS_ERR(req->ki_eventfd)) |
463 | fput(req->ki_eventfd); | 463 | fput(req->ki_eventfd); |
464 | if (req->ki_dtor) | 464 | if (req->ki_dtor) |
465 | req->ki_dtor(req); | 465 | req->ki_dtor(req); |
466 | if (req->ki_iovec != &req->ki_inline_vec) | 466 | if (req->ki_iovec != &req->ki_inline_vec) |
467 | kfree(req->ki_iovec); | 467 | kfree(req->ki_iovec); |
468 | kmem_cache_free(kiocb_cachep, req); | 468 | kmem_cache_free(kiocb_cachep, req); |
469 | ctx->reqs_active--; | 469 | ctx->reqs_active--; |
470 | 470 | ||
471 | if (unlikely(!ctx->reqs_active && ctx->dead)) | 471 | if (unlikely(!ctx->reqs_active && ctx->dead)) |
472 | wake_up(&ctx->wait); | 472 | wake_up(&ctx->wait); |
473 | } | 473 | } |
474 | 474 | ||
475 | static void aio_fput_routine(struct work_struct *data) | 475 | static void aio_fput_routine(struct work_struct *data) |
476 | { | 476 | { |
477 | spin_lock_irq(&fput_lock); | 477 | spin_lock_irq(&fput_lock); |
478 | while (likely(!list_empty(&fput_head))) { | 478 | while (likely(!list_empty(&fput_head))) { |
479 | struct kiocb *req = list_kiocb(fput_head.next); | 479 | struct kiocb *req = list_kiocb(fput_head.next); |
480 | struct kioctx *ctx = req->ki_ctx; | 480 | struct kioctx *ctx = req->ki_ctx; |
481 | 481 | ||
482 | list_del(&req->ki_list); | 482 | list_del(&req->ki_list); |
483 | spin_unlock_irq(&fput_lock); | 483 | spin_unlock_irq(&fput_lock); |
484 | 484 | ||
485 | /* Complete the fput */ | 485 | /* Complete the fput */ |
486 | __fput(req->ki_filp); | 486 | __fput(req->ki_filp); |
487 | 487 | ||
488 | /* Link the iocb into the context's free list */ | 488 | /* Link the iocb into the context's free list */ |
489 | spin_lock_irq(&ctx->ctx_lock); | 489 | spin_lock_irq(&ctx->ctx_lock); |
490 | really_put_req(ctx, req); | 490 | really_put_req(ctx, req); |
491 | spin_unlock_irq(&ctx->ctx_lock); | 491 | spin_unlock_irq(&ctx->ctx_lock); |
492 | 492 | ||
493 | put_ioctx(ctx); | 493 | put_ioctx(ctx); |
494 | spin_lock_irq(&fput_lock); | 494 | spin_lock_irq(&fput_lock); |
495 | } | 495 | } |
496 | spin_unlock_irq(&fput_lock); | 496 | spin_unlock_irq(&fput_lock); |
497 | } | 497 | } |
498 | 498 | ||
499 | /* __aio_put_req | 499 | /* __aio_put_req |
500 | * Returns true if this put was the last user of the request. | 500 | * Returns true if this put was the last user of the request. |
501 | */ | 501 | */ |
502 | static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) | 502 | static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) |
503 | { | 503 | { |
504 | dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n", | 504 | dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n", |
505 | req, atomic_read(&req->ki_filp->f_count)); | 505 | req, atomic_read(&req->ki_filp->f_count)); |
506 | 506 | ||
507 | assert_spin_locked(&ctx->ctx_lock); | 507 | assert_spin_locked(&ctx->ctx_lock); |
508 | 508 | ||
509 | req->ki_users --; | 509 | req->ki_users --; |
510 | BUG_ON(req->ki_users < 0); | 510 | BUG_ON(req->ki_users < 0); |
511 | if (likely(req->ki_users)) | 511 | if (likely(req->ki_users)) |
512 | return 0; | 512 | return 0; |
513 | list_del(&req->ki_list); /* remove from active_reqs */ | 513 | list_del(&req->ki_list); /* remove from active_reqs */ |
514 | req->ki_cancel = NULL; | 514 | req->ki_cancel = NULL; |
515 | req->ki_retry = NULL; | 515 | req->ki_retry = NULL; |
516 | 516 | ||
517 | /* Must be done under the lock to serialise against cancellation. | 517 | /* Must be done under the lock to serialise against cancellation. |
518 | * Call this aio_fput as it duplicates fput via the fput_work. | 518 | * Call this aio_fput as it duplicates fput via the fput_work. |
519 | */ | 519 | */ |
520 | if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) { | 520 | if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) { |
521 | get_ioctx(ctx); | 521 | get_ioctx(ctx); |
522 | spin_lock(&fput_lock); | 522 | spin_lock(&fput_lock); |
523 | list_add(&req->ki_list, &fput_head); | 523 | list_add(&req->ki_list, &fput_head); |
524 | spin_unlock(&fput_lock); | 524 | spin_unlock(&fput_lock); |
525 | queue_work(aio_wq, &fput_work); | 525 | queue_work(aio_wq, &fput_work); |
526 | } else | 526 | } else |
527 | really_put_req(ctx, req); | 527 | really_put_req(ctx, req); |
528 | return 1; | 528 | return 1; |
529 | } | 529 | } |
530 | 530 | ||
531 | /* aio_put_req | 531 | /* aio_put_req |
532 | * Returns true if this put was the last user of the kiocb, | 532 | * Returns true if this put was the last user of the kiocb, |
533 | * false if the request is still in use. | 533 | * false if the request is still in use. |
534 | */ | 534 | */ |
535 | int aio_put_req(struct kiocb *req) | 535 | int aio_put_req(struct kiocb *req) |
536 | { | 536 | { |
537 | struct kioctx *ctx = req->ki_ctx; | 537 | struct kioctx *ctx = req->ki_ctx; |
538 | int ret; | 538 | int ret; |
539 | spin_lock_irq(&ctx->ctx_lock); | 539 | spin_lock_irq(&ctx->ctx_lock); |
540 | ret = __aio_put_req(ctx, req); | 540 | ret = __aio_put_req(ctx, req); |
541 | spin_unlock_irq(&ctx->ctx_lock); | 541 | spin_unlock_irq(&ctx->ctx_lock); |
542 | return ret; | 542 | return ret; |
543 | } | 543 | } |
544 | 544 | ||
545 | /* Lookup an ioctx id. ioctx_list is lockless for reads. | 545 | /* Lookup an ioctx id. ioctx_list is lockless for reads. |
546 | * FIXME: this is O(n) and is only suitable for development. | 546 | * FIXME: this is O(n) and is only suitable for development. |
547 | */ | 547 | */ |
548 | struct kioctx *lookup_ioctx(unsigned long ctx_id) | 548 | struct kioctx *lookup_ioctx(unsigned long ctx_id) |
549 | { | 549 | { |
550 | struct kioctx *ioctx; | 550 | struct kioctx *ioctx; |
551 | struct mm_struct *mm; | 551 | struct mm_struct *mm; |
552 | 552 | ||
553 | mm = current->mm; | 553 | mm = current->mm; |
554 | read_lock(&mm->ioctx_list_lock); | 554 | read_lock(&mm->ioctx_list_lock); |
555 | for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next) | 555 | for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next) |
556 | if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) { | 556 | if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) { |
557 | get_ioctx(ioctx); | 557 | get_ioctx(ioctx); |
558 | break; | 558 | break; |
559 | } | 559 | } |
560 | read_unlock(&mm->ioctx_list_lock); | 560 | read_unlock(&mm->ioctx_list_lock); |
561 | 561 | ||
562 | return ioctx; | 562 | return ioctx; |
563 | } | 563 | } |
564 | 564 | ||
565 | /* | 565 | /* |
566 | * use_mm | 566 | * use_mm |
567 | * Makes the calling kernel thread take on the specified | 567 | * Makes the calling kernel thread take on the specified |
568 | * mm context. | 568 | * mm context. |
569 | * Called by the retry thread execute retries within the | 569 | * Called by the retry thread execute retries within the |
570 | * iocb issuer's mm context, so that copy_from/to_user | 570 | * iocb issuer's mm context, so that copy_from/to_user |
571 | * operations work seamlessly for aio. | 571 | * operations work seamlessly for aio. |
572 | * (Note: this routine is intended to be called only | 572 | * (Note: this routine is intended to be called only |
573 | * from a kernel thread context) | 573 | * from a kernel thread context) |
574 | */ | 574 | */ |
575 | static void use_mm(struct mm_struct *mm) | 575 | static void use_mm(struct mm_struct *mm) |
576 | { | 576 | { |
577 | struct mm_struct *active_mm; | 577 | struct mm_struct *active_mm; |
578 | struct task_struct *tsk = current; | 578 | struct task_struct *tsk = current; |
579 | 579 | ||
580 | task_lock(tsk); | 580 | task_lock(tsk); |
581 | tsk->flags |= PF_BORROWED_MM; | 581 | tsk->flags |= PF_BORROWED_MM; |
582 | active_mm = tsk->active_mm; | 582 | active_mm = tsk->active_mm; |
583 | atomic_inc(&mm->mm_count); | 583 | atomic_inc(&mm->mm_count); |
584 | tsk->mm = mm; | 584 | tsk->mm = mm; |
585 | tsk->active_mm = mm; | 585 | tsk->active_mm = mm; |
586 | /* | 586 | /* |
587 | * Note that on UML this *requires* PF_BORROWED_MM to be set, otherwise | 587 | * Note that on UML this *requires* PF_BORROWED_MM to be set, otherwise |
588 | * it won't work. Update it accordingly if you change it here | 588 | * it won't work. Update it accordingly if you change it here |
589 | */ | 589 | */ |
590 | switch_mm(active_mm, mm, tsk); | 590 | switch_mm(active_mm, mm, tsk); |
591 | task_unlock(tsk); | 591 | task_unlock(tsk); |
592 | 592 | ||
593 | mmdrop(active_mm); | 593 | mmdrop(active_mm); |
594 | } | 594 | } |
595 | 595 | ||
596 | /* | 596 | /* |
597 | * unuse_mm | 597 | * unuse_mm |
598 | * Reverses the effect of use_mm, i.e. releases the | 598 | * Reverses the effect of use_mm, i.e. releases the |
599 | * specified mm context which was earlier taken on | 599 | * specified mm context which was earlier taken on |
600 | * by the calling kernel thread | 600 | * by the calling kernel thread |
601 | * (Note: this routine is intended to be called only | 601 | * (Note: this routine is intended to be called only |
602 | * from a kernel thread context) | 602 | * from a kernel thread context) |
603 | */ | 603 | */ |
604 | static void unuse_mm(struct mm_struct *mm) | 604 | static void unuse_mm(struct mm_struct *mm) |
605 | { | 605 | { |
606 | struct task_struct *tsk = current; | 606 | struct task_struct *tsk = current; |
607 | 607 | ||
608 | task_lock(tsk); | 608 | task_lock(tsk); |
609 | tsk->flags &= ~PF_BORROWED_MM; | 609 | tsk->flags &= ~PF_BORROWED_MM; |
610 | tsk->mm = NULL; | 610 | tsk->mm = NULL; |
611 | /* active_mm is still 'mm' */ | 611 | /* active_mm is still 'mm' */ |
612 | enter_lazy_tlb(mm, tsk); | 612 | enter_lazy_tlb(mm, tsk); |
613 | task_unlock(tsk); | 613 | task_unlock(tsk); |
614 | } | 614 | } |
615 | 615 | ||
616 | /* | 616 | /* |
617 | * Queue up a kiocb to be retried. Assumes that the kiocb | 617 | * Queue up a kiocb to be retried. Assumes that the kiocb |
618 | * has already been marked as kicked, and places it on | 618 | * has already been marked as kicked, and places it on |
619 | * the retry run list for the corresponding ioctx, if it | 619 | * the retry run list for the corresponding ioctx, if it |
620 | * isn't already queued. Returns 1 if it actually queued | 620 | * isn't already queued. Returns 1 if it actually queued |
621 | * the kiocb (to tell the caller to activate the work | 621 | * the kiocb (to tell the caller to activate the work |
622 | * queue to process it), or 0, if it found that it was | 622 | * queue to process it), or 0, if it found that it was |
623 | * already queued. | 623 | * already queued. |
624 | */ | 624 | */ |
625 | static inline int __queue_kicked_iocb(struct kiocb *iocb) | 625 | static inline int __queue_kicked_iocb(struct kiocb *iocb) |
626 | { | 626 | { |
627 | struct kioctx *ctx = iocb->ki_ctx; | 627 | struct kioctx *ctx = iocb->ki_ctx; |
628 | 628 | ||
629 | assert_spin_locked(&ctx->ctx_lock); | 629 | assert_spin_locked(&ctx->ctx_lock); |
630 | 630 | ||
631 | if (list_empty(&iocb->ki_run_list)) { | 631 | if (list_empty(&iocb->ki_run_list)) { |
632 | list_add_tail(&iocb->ki_run_list, | 632 | list_add_tail(&iocb->ki_run_list, |
633 | &ctx->run_list); | 633 | &ctx->run_list); |
634 | return 1; | 634 | return 1; |
635 | } | 635 | } |
636 | return 0; | 636 | return 0; |
637 | } | 637 | } |
638 | 638 | ||
639 | /* aio_run_iocb | 639 | /* aio_run_iocb |
640 | * This is the core aio execution routine. It is | 640 | * This is the core aio execution routine. It is |
641 | * invoked both for initial i/o submission and | 641 | * invoked both for initial i/o submission and |
642 | * subsequent retries via the aio_kick_handler. | 642 | * subsequent retries via the aio_kick_handler. |
643 | * Expects to be invoked with iocb->ki_ctx->lock | 643 | * Expects to be invoked with iocb->ki_ctx->lock |
644 | * already held. The lock is released and reacquired | 644 | * already held. The lock is released and reacquired |
645 | * as needed during processing. | 645 | * as needed during processing. |
646 | * | 646 | * |
647 | * Calls the iocb retry method (already setup for the | 647 | * Calls the iocb retry method (already setup for the |
648 | * iocb on initial submission) for operation specific | 648 | * iocb on initial submission) for operation specific |
649 | * handling, but takes care of most of common retry | 649 | * handling, but takes care of most of common retry |
650 | * execution details for a given iocb. The retry method | 650 | * execution details for a given iocb. The retry method |
651 | * needs to be non-blocking as far as possible, to avoid | 651 | * needs to be non-blocking as far as possible, to avoid |
652 | * holding up other iocbs waiting to be serviced by the | 652 | * holding up other iocbs waiting to be serviced by the |
653 | * retry kernel thread. | 653 | * retry kernel thread. |
654 | * | 654 | * |
655 | * The trickier parts in this code have to do with | 655 | * The trickier parts in this code have to do with |
656 | * ensuring that only one retry instance is in progress | 656 | * ensuring that only one retry instance is in progress |
657 | * for a given iocb at any time. Providing that guarantee | 657 | * for a given iocb at any time. Providing that guarantee |
658 | * simplifies the coding of individual aio operations as | 658 | * simplifies the coding of individual aio operations as |
659 | * it avoids various potential races. | 659 | * it avoids various potential races. |
660 | */ | 660 | */ |
661 | static ssize_t aio_run_iocb(struct kiocb *iocb) | 661 | static ssize_t aio_run_iocb(struct kiocb *iocb) |
662 | { | 662 | { |
663 | struct kioctx *ctx = iocb->ki_ctx; | 663 | struct kioctx *ctx = iocb->ki_ctx; |
664 | ssize_t (*retry)(struct kiocb *); | 664 | ssize_t (*retry)(struct kiocb *); |
665 | ssize_t ret; | 665 | ssize_t ret; |
666 | 666 | ||
667 | if (!(retry = iocb->ki_retry)) { | 667 | if (!(retry = iocb->ki_retry)) { |
668 | printk("aio_run_iocb: iocb->ki_retry = NULL\n"); | 668 | printk("aio_run_iocb: iocb->ki_retry = NULL\n"); |
669 | return 0; | 669 | return 0; |
670 | } | 670 | } |
671 | 671 | ||
672 | /* | 672 | /* |
673 | * We don't want the next retry iteration for this | 673 | * We don't want the next retry iteration for this |
674 | * operation to start until this one has returned and | 674 | * operation to start until this one has returned and |
675 | * updated the iocb state. However, wait_queue functions | 675 | * updated the iocb state. However, wait_queue functions |
676 | * can trigger a kick_iocb from interrupt context in the | 676 | * can trigger a kick_iocb from interrupt context in the |
677 | * meantime, indicating that data is available for the next | 677 | * meantime, indicating that data is available for the next |
678 | * iteration. We want to remember that and enable the | 678 | * iteration. We want to remember that and enable the |
679 | * next retry iteration _after_ we are through with | 679 | * next retry iteration _after_ we are through with |
680 | * this one. | 680 | * this one. |
681 | * | 681 | * |
682 | * So, in order to be able to register a "kick", but | 682 | * So, in order to be able to register a "kick", but |
683 | * prevent it from being queued now, we clear the kick | 683 | * prevent it from being queued now, we clear the kick |
684 | * flag, but make the kick code *think* that the iocb is | 684 | * flag, but make the kick code *think* that the iocb is |
685 | * still on the run list until we are actually done. | 685 | * still on the run list until we are actually done. |
686 | * When we are done with this iteration, we check if | 686 | * When we are done with this iteration, we check if |
687 | * the iocb was kicked in the meantime and if so, queue | 687 | * the iocb was kicked in the meantime and if so, queue |
688 | * it up afresh. | 688 | * it up afresh. |
689 | */ | 689 | */ |
690 | 690 | ||
691 | kiocbClearKicked(iocb); | 691 | kiocbClearKicked(iocb); |
692 | 692 | ||
693 | /* | 693 | /* |
694 | * This is so that aio_complete knows it doesn't need to | 694 | * This is so that aio_complete knows it doesn't need to |
695 | * pull the iocb off the run list (We can't just call | 695 | * pull the iocb off the run list (We can't just call |
696 | * INIT_LIST_HEAD because we don't want a kick_iocb to | 696 | * INIT_LIST_HEAD because we don't want a kick_iocb to |
697 | * queue this on the run list yet) | 697 | * queue this on the run list yet) |
698 | */ | 698 | */ |
699 | iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; | 699 | iocb->ki_run_list.next = iocb->ki_run_list.prev = NULL; |
700 | spin_unlock_irq(&ctx->ctx_lock); | 700 | spin_unlock_irq(&ctx->ctx_lock); |
701 | 701 | ||
702 | /* Quit retrying if the i/o has been cancelled */ | 702 | /* Quit retrying if the i/o has been cancelled */ |
703 | if (kiocbIsCancelled(iocb)) { | 703 | if (kiocbIsCancelled(iocb)) { |
704 | ret = -EINTR; | 704 | ret = -EINTR; |
705 | aio_complete(iocb, ret, 0); | 705 | aio_complete(iocb, ret, 0); |
706 | /* must not access the iocb after this */ | 706 | /* must not access the iocb after this */ |
707 | goto out; | 707 | goto out; |
708 | } | 708 | } |
709 | 709 | ||
710 | /* | 710 | /* |
711 | * Now we are all set to call the retry method in async | 711 | * Now we are all set to call the retry method in async |
712 | * context. | 712 | * context. |
713 | */ | 713 | */ |
714 | ret = retry(iocb); | 714 | ret = retry(iocb); |
715 | 715 | ||
716 | if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { | 716 | if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) { |
717 | BUG_ON(!list_empty(&iocb->ki_wait.task_list)); | 717 | BUG_ON(!list_empty(&iocb->ki_wait.task_list)); |
718 | aio_complete(iocb, ret, 0); | 718 | aio_complete(iocb, ret, 0); |
719 | } | 719 | } |
720 | out: | 720 | out: |
721 | spin_lock_irq(&ctx->ctx_lock); | 721 | spin_lock_irq(&ctx->ctx_lock); |
722 | 722 | ||
723 | if (-EIOCBRETRY == ret) { | 723 | if (-EIOCBRETRY == ret) { |
724 | /* | 724 | /* |
725 | * OK, now that we are done with this iteration | 725 | * OK, now that we are done with this iteration |
726 | * and know that there is more left to go, | 726 | * and know that there is more left to go, |
727 | * this is where we let go so that a subsequent | 727 | * this is where we let go so that a subsequent |
728 | * "kick" can start the next iteration | 728 | * "kick" can start the next iteration |
729 | */ | 729 | */ |
730 | 730 | ||
731 | /* will make __queue_kicked_iocb succeed from here on */ | 731 | /* will make __queue_kicked_iocb succeed from here on */ |
732 | INIT_LIST_HEAD(&iocb->ki_run_list); | 732 | INIT_LIST_HEAD(&iocb->ki_run_list); |
733 | /* we must queue the next iteration ourselves, if it | 733 | /* we must queue the next iteration ourselves, if it |
734 | * has already been kicked */ | 734 | * has already been kicked */ |
735 | if (kiocbIsKicked(iocb)) { | 735 | if (kiocbIsKicked(iocb)) { |
736 | __queue_kicked_iocb(iocb); | 736 | __queue_kicked_iocb(iocb); |
737 | 737 | ||
738 | /* | 738 | /* |
739 | * __queue_kicked_iocb will always return 1 here, because | 739 | * __queue_kicked_iocb will always return 1 here, because |
740 | * iocb->ki_run_list is empty at this point so it should | 740 | * iocb->ki_run_list is empty at this point so it should |
741 | * be safe to unconditionally queue the context into the | 741 | * be safe to unconditionally queue the context into the |
742 | * work queue. | 742 | * work queue. |
743 | */ | 743 | */ |
744 | aio_queue_work(ctx); | 744 | aio_queue_work(ctx); |
745 | } | 745 | } |
746 | } | 746 | } |
747 | return ret; | 747 | return ret; |
748 | } | 748 | } |
749 | 749 | ||
750 | /* | 750 | /* |
751 | * __aio_run_iocbs: | 751 | * __aio_run_iocbs: |
752 | * Process all pending retries queued on the ioctx | 752 | * Process all pending retries queued on the ioctx |
753 | * run list. | 753 | * run list. |
754 | * Assumes it is operating within the aio issuer's mm | 754 | * Assumes it is operating within the aio issuer's mm |
755 | * context. | 755 | * context. |
756 | */ | 756 | */ |
757 | static int __aio_run_iocbs(struct kioctx *ctx) | 757 | static int __aio_run_iocbs(struct kioctx *ctx) |
758 | { | 758 | { |
759 | struct kiocb *iocb; | 759 | struct kiocb *iocb; |
760 | struct list_head run_list; | 760 | struct list_head run_list; |
761 | 761 | ||
762 | assert_spin_locked(&ctx->ctx_lock); | 762 | assert_spin_locked(&ctx->ctx_lock); |
763 | 763 | ||
764 | list_replace_init(&ctx->run_list, &run_list); | 764 | list_replace_init(&ctx->run_list, &run_list); |
765 | while (!list_empty(&run_list)) { | 765 | while (!list_empty(&run_list)) { |
766 | iocb = list_entry(run_list.next, struct kiocb, | 766 | iocb = list_entry(run_list.next, struct kiocb, |
767 | ki_run_list); | 767 | ki_run_list); |
768 | list_del(&iocb->ki_run_list); | 768 | list_del(&iocb->ki_run_list); |
769 | /* | 769 | /* |
770 | * Hold an extra reference while retrying i/o. | 770 | * Hold an extra reference while retrying i/o. |
771 | */ | 771 | */ |
772 | iocb->ki_users++; /* grab extra reference */ | 772 | iocb->ki_users++; /* grab extra reference */ |
773 | aio_run_iocb(iocb); | 773 | aio_run_iocb(iocb); |
774 | __aio_put_req(ctx, iocb); | 774 | __aio_put_req(ctx, iocb); |
775 | } | 775 | } |
776 | if (!list_empty(&ctx->run_list)) | 776 | if (!list_empty(&ctx->run_list)) |
777 | return 1; | 777 | return 1; |
778 | return 0; | 778 | return 0; |
779 | } | 779 | } |
780 | 780 | ||
781 | static void aio_queue_work(struct kioctx * ctx) | 781 | static void aio_queue_work(struct kioctx * ctx) |
782 | { | 782 | { |
783 | unsigned long timeout; | 783 | unsigned long timeout; |
784 | /* | 784 | /* |
785 | * if someone is waiting, get the work started right | 785 | * if someone is waiting, get the work started right |
786 | * away, otherwise, use a longer delay | 786 | * away, otherwise, use a longer delay |
787 | */ | 787 | */ |
788 | smp_mb(); | 788 | smp_mb(); |
789 | if (waitqueue_active(&ctx->wait)) | 789 | if (waitqueue_active(&ctx->wait)) |
790 | timeout = 1; | 790 | timeout = 1; |
791 | else | 791 | else |
792 | timeout = HZ/10; | 792 | timeout = HZ/10; |
793 | queue_delayed_work(aio_wq, &ctx->wq, timeout); | 793 | queue_delayed_work(aio_wq, &ctx->wq, timeout); |
794 | } | 794 | } |
795 | 795 | ||
796 | 796 | ||
797 | /* | 797 | /* |
798 | * aio_run_iocbs: | 798 | * aio_run_iocbs: |
 * Process all pending retries queued on the ioctx
 * run list.
 * Assumes it is operating within the aio issuer's mm
 * context.
 */
static inline void aio_run_iocbs(struct kioctx *ctx)
{
        int requeue;

        spin_lock_irq(&ctx->ctx_lock);

        requeue = __aio_run_iocbs(ctx);
        spin_unlock_irq(&ctx->ctx_lock);
        if (requeue)
                aio_queue_work(ctx);
}

/*
 * just like aio_run_iocbs, but keeps running them until
 * the list stays empty
 */
static inline void aio_run_all_iocbs(struct kioctx *ctx)
{
        spin_lock_irq(&ctx->ctx_lock);
        while (__aio_run_iocbs(ctx))
                ;
        spin_unlock_irq(&ctx->ctx_lock);
}

/*
 * aio_kick_handler:
 * Work queue handler triggered to process pending
 * retries on an ioctx. Takes on the aio issuer's
 * mm context before running the iocbs, so that
 * copy_xxx_user operates on the issuer's address
 * space.
 * Run on aiod's context.
 */
static void aio_kick_handler(struct work_struct *work)
{
        struct kioctx *ctx = container_of(work, struct kioctx, wq.work);
        mm_segment_t oldfs = get_fs();
        struct mm_struct *mm;
        int requeue;

        set_fs(USER_DS);
        use_mm(ctx->mm);
        spin_lock_irq(&ctx->ctx_lock);
        requeue = __aio_run_iocbs(ctx);
        mm = ctx->mm;
        spin_unlock_irq(&ctx->ctx_lock);
        unuse_mm(mm);
        set_fs(oldfs);
        /*
         * we're in a worker thread already; requeue immediately rather
         * than going through aio_queue_work() and its delay heuristic
         */
        if (requeue)
                queue_delayed_work(aio_wq, &ctx->wq, 0);
}


/*
 * Called by kick_iocb to queue the kiocb for retry
 * and if required activate the aio work queue to process
 * it
 */
static void try_queue_kicked_iocb(struct kiocb *iocb)
{
        struct kioctx *ctx = iocb->ki_ctx;
        unsigned long flags;
        int run = 0;

        /* We're supposed to be the only path putting the iocb back on the run
         * list. If we find that the iocb is *back* on a wait queue already
         * then a retry has happened before we could queue the iocb. This also
         * means that the retry could have completed and freed our iocb, no
         * good. */
        BUG_ON(!list_empty(&iocb->ki_wait.task_list));

        spin_lock_irqsave(&ctx->ctx_lock, flags);
        /* set this inside the lock so that we can't race with aio_run_iocb()
         * testing it and putting the iocb on the run list under the lock */
        if (!kiocbTryKick(iocb))
                run = __queue_kicked_iocb(iocb);
        spin_unlock_irqrestore(&ctx->ctx_lock, flags);
        if (run)
                aio_queue_work(ctx);
}
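
/*
 * [Editorial aside -- not part of fs/aio.c.]  The kiocbTryKick() test
 * above is a test-and-set: however many wakeups arrive while a retry is
 * pending, only the first one queues work.  A minimal userspace sketch
 * of the same coalescing pattern, with a C11 atomic flag standing in for
 * the kicked bit; kick(), run() and queued_runs are invented names, not
 * kernel API.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_bool kicked;      /* plays the role of the iocb's kicked bit */
static int queued_runs;         /* how many times a retry run was queued   */

/* called on every wakeup; only the first kick after a run queues work,
 * just like the !kiocbTryKick(iocb) test above */
static void kick(void)
{
        if (!atomic_exchange(&kicked, true))
                queued_runs++;
}

/* the worker clears the flag before retrying, so a kick that arrives
 * during the retry queues a fresh run instead of being lost */
static void run(void)
{
        atomic_store(&kicked, false);
        /* ... retry the operation here ... */
}

int main(void)
{
        kick(); kick(); kick();  /* three kicks before the worker runs ... */
        run();
        kick();                  /* ... and one more afterwards */
        printf("4 kicks coalesced into %d queued runs\n", queued_runs);
        return 0;
}
/* [End of aside.] */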

/*
 * kick_iocb:
 * Called typically from a wait queue callback context
 * (aio_wake_function) to trigger a retry of the iocb.
 * The retry is usually executed by aio workqueue
 * threads (See aio_kick_handler).
 */
void kick_iocb(struct kiocb *iocb)
{
        /* sync iocbs are easy: they can only ever be executing from a
         * single context. */
        if (is_sync_kiocb(iocb)) {
                kiocbSetKicked(iocb);
                wake_up_process(iocb->ki_obj.tsk);
                return;
        }

        try_queue_kicked_iocb(iocb);
}
EXPORT_SYMBOL(kick_iocb);

/* aio_complete
 * Called when the io request on the given iocb is complete.
 * Returns true if this is the last user of the request. The
 * only other user of the request can be the cancellation code.
 */
int aio_complete(struct kiocb *iocb, long res, long res2)
{
        struct kioctx *ctx = iocb->ki_ctx;
        struct aio_ring_info *info;
        struct aio_ring *ring;
        struct io_event *event;
        unsigned long flags;
        unsigned long tail;
        int ret;

        /*
         * Special case handling for sync iocbs:
         * - events go directly into the iocb for fast handling
         * - the sync task with the iocb in its stack holds the single iocb
         *   ref, no other paths have a way to get another ref
         * - the sync task helpfully left a reference to itself in the iocb
         */
        if (is_sync_kiocb(iocb)) {
                BUG_ON(iocb->ki_users != 1);
                iocb->ki_user_data = res;
                iocb->ki_users = 0;
                wake_up_process(iocb->ki_obj.tsk);
                return 1;
        }

        /*
         * Check if the user asked us to deliver the result through an
         * eventfd. The eventfd_signal() function is safe to be called
         * from IRQ context.
         */
        if (!IS_ERR(iocb->ki_eventfd))
                eventfd_signal(iocb->ki_eventfd, 1);

        info = &ctx->ring_info;

        /* add a completion event to the ring buffer.
         * must be done holding ctx->ctx_lock to prevent
         * other code from messing with the tail
         * pointer since we might be called from irq
         * context.
         */
        spin_lock_irqsave(&ctx->ctx_lock, flags);

        if (iocb->ki_run_list.prev && !list_empty(&iocb->ki_run_list))
                list_del_init(&iocb->ki_run_list);

        /*
         * cancelled requests don't get events, userland was given one
         * when the request got cancelled.
         */
        if (kiocbIsCancelled(iocb))
                goto put_rq;

        ring = kmap_atomic(info->ring_pages[0], KM_IRQ1);

        tail = info->tail;
        event = aio_ring_event(info, tail, KM_IRQ0);
        if (++tail >= info->nr)
                tail = 0;

        event->obj = (u64)(unsigned long)iocb->ki_obj.user;
        event->data = iocb->ki_user_data;
        event->res = res;
        event->res2 = res2;

        dprintk("aio_complete: %p[%lu]: %p: %p %Lx %lx %lx\n",
                ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
                res, res2);

        /* after flagging the request as done, we
         * must never even look at it again
         */
        smp_wmb();      /* make event visible before updating tail */

        info->tail = tail;
        ring->tail = tail;

        put_aio_ring_event(event, KM_IRQ0);
        kunmap_atomic(ring, KM_IRQ1);

        pr_debug("added to ring %p at [%lu]\n", iocb, tail);
put_rq:
        /* everything turned out well, dispose of the aiocb. */
        ret = __aio_put_req(ctx, iocb);

        /*
         * We have to order our ring_info tail store above and test
         * of the wait list below outside the wait lock. This is
         * like in wake_up_bit() where clearing a bit has to be
         * ordered with the unlocked test.
         */
        smp_mb();

        if (waitqueue_active(&ctx->wait))
                wake_up(&ctx->wait);

        spin_unlock_irqrestore(&ctx->ctx_lock, flags);
        return ret;
}

/* aio_read_evt
 * Pull an event off of the ioctx's event ring. Returns the number of
 * events fetched (0 or 1 ;-)
 * FIXME: make this use cmpxchg.
 * TODO: make the ringbuffer user mmap()able (requires FIXME).
 */
static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
{
        struct aio_ring_info *info = &ioctx->ring_info;
        struct aio_ring *ring;
        unsigned long head;
        int ret = 0;

        ring = kmap_atomic(info->ring_pages[0], KM_USER0);
        dprintk("in aio_read_evt h%lu t%lu m%lu\n",
                (unsigned long)ring->head, (unsigned long)ring->tail,
                (unsigned long)ring->nr);

        if (ring->head == ring->tail)
                goto out;

        spin_lock(&info->ring_lock);

        head = ring->head % info->nr;
        if (head != ring->tail) {
                struct io_event *evp = aio_ring_event(info, head, KM_USER1);
                *ent = *evp;
                head = (head + 1) % info->nr;
                smp_mb(); /* finish reading the event before updating the head */
                ring->head = head;
                ret = 1;
                put_aio_ring_event(evp, KM_USER1);
        }
        spin_unlock(&info->ring_lock);

out:
        kunmap_atomic(ring, KM_USER0);
        dprintk("leaving aio_read_evt: %d h%lu t%lu\n", ret,
                (unsigned long)ring->head, (unsigned long)ring->tail);
        return ret;
}
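
/*
 * [Editorial aside -- not part of fs/aio.c.]  aio_complete() and
 * aio_read_evt() form a classic producer/consumer ring: the producer
 * fills a slot and only then publishes the new tail (the smp_wmb()),
 * while the consumer copies the slot out and only then retires it by
 * moving head (the smp_mb()).  A self-contained single-producer/
 * single-consumer sketch of the same protocol using C11 acquire/release;
 * the kernel additionally serializes producers with ctx_lock and
 * consumers with ring_lock, which this sketch assumes away.  RING_NR,
 * ring_put() and ring_get() are invented names.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define RING_NR 8

struct ring {
        long slots[RING_NR];
        atomic_uint head;       /* next slot to read, consumer-owned */
        atomic_uint tail;       /* next free slot, producer-owned    */
};

/* producer, as in aio_complete(): fill the slot, then publish the tail;
 * the release store stands in for smp_wmb() before ring->tail = tail */
static bool ring_put(struct ring *r, long ev)
{
        unsigned t = atomic_load_explicit(&r->tail, memory_order_relaxed);
        unsigned next = (t + 1) % RING_NR;

        if (next == atomic_load_explicit(&r->head, memory_order_acquire))
                return false;                           /* ring full */
        r->slots[t] = ev;
        atomic_store_explicit(&r->tail, next, memory_order_release);
        return true;
}

/* consumer, as in aio_read_evt(): copy the event out, then retire the
 * slot by moving head; the acquire load pairs with the producer's release */
static bool ring_get(struct ring *r, long *ev)
{
        unsigned h = atomic_load_explicit(&r->head, memory_order_relaxed);

        if (h == atomic_load_explicit(&r->tail, memory_order_acquire))
                return false;                           /* ring empty */
        *ev = r->slots[h];
        atomic_store_explicit(&r->head, (h + 1) % RING_NR,
                              memory_order_release);
        return true;
}

int main(void)
{
        struct ring r = { { 0 } };
        long ev;

        ring_put(&r, 42);
        if (ring_get(&r, &ev))
                printf("event %ld\n", ev);
        return 0;
}
/* [End of aside.] */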

struct aio_timeout {
        struct timer_list timer;
        int timed_out;
        struct task_struct *p;
};

static void timeout_func(unsigned long data)
{
        struct aio_timeout *to = (struct aio_timeout *)data;

        to->timed_out = 1;
        wake_up_process(to->p);
}

static inline void init_timeout(struct aio_timeout *to)
{
        init_timer(&to->timer);
        to->timer.data = (unsigned long)to;
        to->timer.function = timeout_func;
        to->timed_out = 0;
        to->p = current;
}

static inline void set_timeout(long start_jiffies, struct aio_timeout *to,
                               const struct timespec *ts)
{
        to->timer.expires = start_jiffies + timespec_to_jiffies(ts);
        if (time_after(to->timer.expires, jiffies))
                add_timer(&to->timer);
        else
                to->timed_out = 1;
}

static inline void clear_timeout(struct aio_timeout *to)
{
        del_singleshot_timer_sync(&to->timer);
}

static int read_events(struct kioctx *ctx,
                       long min_nr, long nr,
                       struct io_event __user *event,
                       struct timespec __user *timeout)
{
        long start_jiffies = jiffies;
        struct task_struct *tsk = current;
        DECLARE_WAITQUEUE(wait, tsk);
        int ret;
        int i = 0;
        struct io_event ent;
        struct aio_timeout to;
        int retry = 0;

        /* needed to zero any padding within an entry (there shouldn't be
         * any, but C is fun!)
         */
        memset(&ent, 0, sizeof(ent));
retry:
        ret = 0;
        while (likely(i < nr)) {
                ret = aio_read_evt(ctx, &ent);
                if (unlikely(ret <= 0))
                        break;

                dprintk("read event: %Lx %Lx %Lx %Lx\n",
                        ent.data, ent.obj, ent.res, ent.res2);

                /* Could we split the check in two? */
                ret = -EFAULT;
                if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
                        dprintk("aio: lost an event due to EFAULT.\n");
                        break;
                }
                ret = 0;

                /* Good, event copied to userland, update counts. */
                event++;
                i++;
        }

        if (min_nr <= i)
                return i;
        if (ret)
                return ret;

        /* End fast path */

        /* racy check, but it gets redone */
        if (!retry && unlikely(!list_empty(&ctx->run_list))) {
                retry = 1;
                aio_run_all_iocbs(ctx);
                goto retry;
        }

        init_timeout(&to);
        if (timeout) {
                struct timespec ts;
                ret = -EFAULT;
                if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
                        goto out;

                set_timeout(start_jiffies, &to, &ts);
        }

        while (likely(i < nr)) {
                add_wait_queue_exclusive(&ctx->wait, &wait);
                do {
                        set_task_state(tsk, TASK_INTERRUPTIBLE);
                        ret = aio_read_evt(ctx, &ent);
                        if (ret)
                                break;
                        if (min_nr <= i)
                                break;
                        ret = 0;
                        if (to.timed_out) /* Only check after read evt */
                                break;
                        /* Try to only show up in io wait if there are ops
                         * in flight */
                        if (ctx->reqs_active)
                                io_schedule();
                        else
                                schedule();
                        if (signal_pending(tsk)) {
                                ret = -EINTR;
                                break;
                        }
                        /*ret = aio_read_evt(ctx, &ent);*/
                } while (1);

                set_task_state(tsk, TASK_RUNNING);
                remove_wait_queue(&ctx->wait, &wait);

                if (unlikely(ret <= 0))
                        break;

                ret = -EFAULT;
                if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
                        dprintk("aio: lost an event due to EFAULT.\n");
                        break;
                }

                /* Good, event copied to userland, update counts. */
                event++;
                i++;
        }

        if (timeout)
                clear_timeout(&to);
out:
        return i ? i : ret;
}
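
/*
 * [Editorial aside -- not part of fs/aio.c.]  On the slow path above, a
 * caller with no completions sleeps until set_timeout()'s timer fires,
 * then gets back however many events were copied -- possibly zero, which
 * is not an error.  A small demonstration from userspace, assuming raw
 * syscall(2) access and <linux/aio_abi.h>; error handling is trimmed.
 */
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        aio_context_t ctx = 0;
        struct io_event ev;
        struct timespec ts = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
        long n;

        if (syscall(SYS_io_setup, 8, &ctx) < 0)
                return 1;
        /* nothing was submitted: read_events() sleeps until the 100ms
         * timer fires, then returns 0 rather than an error */
        n = syscall(SYS_io_getevents, ctx, 1, 1, &ev, &ts);
        printf("got %ld events\n", n);          /* prints 0 after ~100ms */
        syscall(SYS_io_destroy, ctx);
        return 0;
}
/* [End of aside.] */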

/* Take an ioctx and remove it from the list of ioctx's. Protects
 * against races with itself via ->dead.
 */
static void io_destroy(struct kioctx *ioctx)
{
        struct mm_struct *mm = current->mm;
        struct kioctx **tmp;
        int was_dead;

        /* delete the entry from the list if someone else hasn't already */
        write_lock(&mm->ioctx_list_lock);
        was_dead = ioctx->dead;
        ioctx->dead = 1;
        for (tmp = &mm->ioctx_list; *tmp && *tmp != ioctx;
             tmp = &(*tmp)->next)
                ;
        if (*tmp)
                *tmp = ioctx->next;
        write_unlock(&mm->ioctx_list_lock);

        dprintk("aio_release(%p)\n", ioctx);
        if (likely(!was_dead))
                put_ioctx(ioctx);       /* twice for the list */

        aio_cancel_all(ioctx);
        wait_for_all_aios(ioctx);
        put_ioctx(ioctx);       /* once for the lookup */
}

/* sys_io_setup:
 * Create an aio_context capable of receiving at least nr_events.
 * ctxp must not point to an aio_context that already exists, and
 * must be initialized to 0 prior to the call. On successful
 * creation of the aio_context, *ctxp is filled in with the resulting
 * handle. May fail with -EINVAL if *ctxp is not initialized, or if
 * the specified nr_events exceeds internal limits. May fail
 * with -EAGAIN if the specified nr_events exceeds the user's limit
 * of available events. May fail with -ENOMEM if insufficient kernel
 * resources are available. May fail with -EFAULT if an invalid
 * pointer is passed for ctxp. Will fail with -ENOSYS if not
 * implemented.
 */
asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp)
{
        struct kioctx *ioctx = NULL;
        unsigned long ctx;
        long ret;

        ret = get_user(ctx, ctxp);
        if (unlikely(ret))
                goto out;

        ret = -EINVAL;
        if (unlikely(ctx || nr_events == 0)) {
                pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
                         ctx, nr_events);
                goto out;
        }

        ioctx = ioctx_alloc(nr_events);
        ret = PTR_ERR(ioctx);
        if (!IS_ERR(ioctx)) {
                ret = put_user(ioctx->user_id, ctxp);
                if (!ret)
                        return 0;

                get_ioctx(ioctx); /* io_destroy() expects us to hold a ref */
                io_destroy(ioctx);
        }

out:
        return ret;
}
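
/*
 * [Editorial aside -- not part of fs/aio.c.]  What the *ctxp handshake
 * looks like from userspace: the context word must start out zero, and
 * the kernel fills in the handle on success.  A minimal end-to-end
 * sketch -- setup, one async write, reap, destroy -- using raw syscalls;
 * /tmp/aio-demo is an arbitrary path and error handling is trimmed.
 */
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        aio_context_t ctx = 0;                  /* must be 0 before io_setup */
        struct iocb cb, *cbs[1] = { &cb };
        struct io_event ev;
        char buf[] = "hello from aio\n";
        int fd = open("/tmp/aio-demo", O_WRONLY | O_CREAT | O_TRUNC, 0600);

        if (fd < 0 || syscall(SYS_io_setup, 128, &ctx) < 0)
                return 1;

        memset(&cb, 0, sizeof(cb));             /* reserved fields must be 0 */
        cb.aio_lio_opcode = IOCB_CMD_PWRITE;
        cb.aio_fildes = fd;
        cb.aio_buf = (uintptr_t)buf;
        cb.aio_nbytes = sizeof(buf) - 1;

        if (syscall(SYS_io_submit, ctx, 1, cbs) != 1)
                return 1;
        if (syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) == 1)
                printf("res=%lld\n", (long long)ev.res);   /* bytes written */

        syscall(SYS_io_destroy, ctx);
        close(fd);
        return 0;
}
/* [End of aside.] */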

/* sys_io_destroy:
 * Destroy the aio_context specified. May cancel any outstanding
 * AIOs and block on completion. Will fail with -ENOSYS if not
 * implemented. May fail with -EINVAL if the context specified
 * is invalid.
 */
asmlinkage long sys_io_destroy(aio_context_t ctx)
{
        struct kioctx *ioctx = lookup_ioctx(ctx);
        if (likely(NULL != ioctx)) {
                io_destroy(ioctx);
                return 0;
        }
        pr_debug("EINVAL: io_destroy: invalid context id\n");
        return -EINVAL;
}

static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
{
        struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg];

        BUG_ON(ret <= 0);

        while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) {
                ssize_t this = min((ssize_t)iov->iov_len, ret);
                iov->iov_base += this;
                iov->iov_len -= this;
                iocb->ki_left -= this;
                ret -= this;
                if (iov->iov_len == 0) {
                        iocb->ki_cur_seg++;
                        iov++;
                }
        }

        /* the caller should not have done more io than what fit in
         * the remaining iovecs */
        BUG_ON(ret > 0 && iocb->ki_left == 0);
}
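
/*
 * [Editorial aside -- not part of fs/aio.c.]  The advance loop above is
 * easy to check in isolation.  A userspace transcription with the kiocb
 * fields replaced by an (iov pointer, segment count) pair; advance_iovec()
 * is an invented name and the ki_left bookkeeping is dropped for brevity.
 */
#include <sys/uio.h>
#include <stdio.h>

/* walk 'done' consumed bytes forward through the iovec array, exactly
 * as aio_advance_iovec() does with ki_cur_seg/ki_nr_segs */
static void advance_iovec(struct iovec **iovp, int *nr_segs, ssize_t done)
{
        struct iovec *iov = *iovp;

        while (*nr_segs > 0 && done > 0) {
                ssize_t this = (ssize_t)iov->iov_len < done
                                ? (ssize_t)iov->iov_len : done;
                iov->iov_base = (char *)iov->iov_base + this;
                iov->iov_len -= this;
                done -= this;
                if (iov->iov_len == 0) {
                        iov++;
                        (*nr_segs)--;
                }
        }
        *iovp = iov;
}

int main(void)
{
        char a[4], b[8];
        struct iovec vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
        struct iovec *iov = vec;
        int nr = 2;

        advance_iovec(&iov, &nr, 6);    /* a partial transfer of 6 bytes */
        /* first segment fully consumed; 6 of 8 bytes remain in the second */
        printf("segs left %d, next len %zu\n", nr, iov->iov_len);
        return 0;
}
/* [End of aside.] */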

static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
{
        struct file *file = iocb->ki_filp;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        ssize_t (*rw_op)(struct kiocb *, const struct iovec *,
                         unsigned long, loff_t);
        ssize_t ret = 0;
        unsigned short opcode;

        if ((iocb->ki_opcode == IOCB_CMD_PREADV) ||
            (iocb->ki_opcode == IOCB_CMD_PREAD)) {
                rw_op = file->f_op->aio_read;
                opcode = IOCB_CMD_PREADV;
        } else {
                rw_op = file->f_op->aio_write;
                opcode = IOCB_CMD_PWRITEV;
        }

        /* This matches the pread()/pwrite() logic */
        if (iocb->ki_pos < 0)
                return -EINVAL;

        do {
                ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
                            iocb->ki_nr_segs - iocb->ki_cur_seg,
                            iocb->ki_pos);
                if (ret > 0)
                        aio_advance_iovec(iocb, ret);

        /* retry all partial writes. retry partial reads as long as it's a
         * regular file. */
        } while (ret > 0 && iocb->ki_left > 0 &&
                 (opcode == IOCB_CMD_PWRITEV ||
                  (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));

        /* This means we must have transferred all that we could */
        /* No need to retry anymore */
        if ((ret == 0) || (iocb->ki_left == 0))
                ret = iocb->ki_nbytes - iocb->ki_left;

        /* If we managed to write some out we return that, rather than
         * the eventual error. */
        if (opcode == IOCB_CMD_PWRITEV
            && ret < 0 && ret != -EIOCBQUEUED && ret != -EIOCBRETRY
            && iocb->ki_nbytes - iocb->ki_left)
                ret = iocb->ki_nbytes - iocb->ki_left;

        return ret;
}

static ssize_t aio_fdsync(struct kiocb *iocb)
{
        struct file *file = iocb->ki_filp;
        ssize_t ret = -EINVAL;

        if (file->f_op->aio_fsync)
                ret = file->f_op->aio_fsync(iocb, 1);
        return ret;
}

static ssize_t aio_fsync(struct kiocb *iocb)
{
        struct file *file = iocb->ki_filp;
        ssize_t ret = -EINVAL;

        if (file->f_op->aio_fsync)
                ret = file->f_op->aio_fsync(iocb, 0);
        return ret;
}

static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb)
{
        ssize_t ret;

        ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf,
                                    kiocb->ki_nbytes, 1,
                                    &kiocb->ki_inline_vec, &kiocb->ki_iovec);
        if (ret < 0)
                goto out;

        kiocb->ki_nr_segs = kiocb->ki_nbytes;
        kiocb->ki_cur_seg = 0;
        /* ki_nbytes/left now reflect bytes instead of segs */
        kiocb->ki_nbytes = ret;
        kiocb->ki_left = ret;

        ret = 0;
out:
        return ret;
}

static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
{
        kiocb->ki_iovec = &kiocb->ki_inline_vec;
        kiocb->ki_iovec->iov_base = kiocb->ki_buf;
        kiocb->ki_iovec->iov_len = kiocb->ki_left;
        kiocb->ki_nr_segs = 1;
        kiocb->ki_cur_seg = 0;
        return 0;
}

/*
 * aio_setup_iocb:
 * Performs the initial checks and aio retry method
 * setup for the kiocb at the time of io submission.
 */
static ssize_t aio_setup_iocb(struct kiocb *kiocb)
{
        struct file *file = kiocb->ki_filp;
        ssize_t ret = 0;

        switch (kiocb->ki_opcode) {
        case IOCB_CMD_PREAD:
                ret = -EBADF;
                if (unlikely(!(file->f_mode & FMODE_READ)))
                        break;
                ret = -EFAULT;
                if (unlikely(!access_ok(VERIFY_WRITE, kiocb->ki_buf,
                                        kiocb->ki_left)))
                        break;
                ret = security_file_permission(file, MAY_READ);
                if (unlikely(ret))
                        break;
                ret = aio_setup_single_vector(kiocb);
                if (ret)
                        break;
                ret = -EINVAL;
                if (file->f_op->aio_read)
                        kiocb->ki_retry = aio_rw_vect_retry;
                break;
        case IOCB_CMD_PWRITE:
                ret = -EBADF;
                if (unlikely(!(file->f_mode & FMODE_WRITE)))
                        break;
                ret = -EFAULT;
                if (unlikely(!access_ok(VERIFY_READ, kiocb->ki_buf,
                                        kiocb->ki_left)))
                        break;
                ret = security_file_permission(file, MAY_WRITE);
                if (unlikely(ret))
                        break;
                ret = aio_setup_single_vector(kiocb);
                if (ret)
                        break;
                ret = -EINVAL;
                if (file->f_op->aio_write)
                        kiocb->ki_retry = aio_rw_vect_retry;
                break;
        case IOCB_CMD_PREADV:
                ret = -EBADF;
                if (unlikely(!(file->f_mode & FMODE_READ)))
                        break;
                ret = security_file_permission(file, MAY_READ);
                if (unlikely(ret))
                        break;
                ret = aio_setup_vectored_rw(READ, kiocb);
                if (ret)
                        break;
                ret = -EINVAL;
                if (file->f_op->aio_read)
                        kiocb->ki_retry = aio_rw_vect_retry;
                break;
        case IOCB_CMD_PWRITEV:
                ret = -EBADF;
                if (unlikely(!(file->f_mode & FMODE_WRITE)))
                        break;
                ret = security_file_permission(file, MAY_WRITE);
                if (unlikely(ret))
                        break;
                ret = aio_setup_vectored_rw(WRITE, kiocb);
                if (ret)
                        break;
                ret = -EINVAL;
                if (file->f_op->aio_write)
                        kiocb->ki_retry = aio_rw_vect_retry;
                break;
        case IOCB_CMD_FDSYNC:
                ret = -EINVAL;
                if (file->f_op->aio_fsync)
                        kiocb->ki_retry = aio_fdsync;
                break;
        case IOCB_CMD_FSYNC:
                ret = -EINVAL;
                if (file->f_op->aio_fsync)
                        kiocb->ki_retry = aio_fsync;
                break;
        default:
                dprintk("EINVAL: io_submit: no operation provided\n");
                ret = -EINVAL;
        }

        if (!kiocb->ki_retry)
                return ret;

        return 0;
}

/*
 * aio_wake_function:
 * wait queue callback function for aio notification;
 * simply triggers a retry of the operation via kick_iocb.
 *
 * This callback is specified in the wait queue entry in
 * a kiocb.
 *
 * Note:
 * This routine is executed with the wait queue lock held.
 * Since kick_iocb acquires iocb->ctx->ctx_lock, it nests
 * the ioctx lock inside the wait queue lock. This is safe
 * because this callback isn't used for wait queues which
 * are nested inside ioctx lock (i.e. ctx->wait)
 */
static int aio_wake_function(wait_queue_t *wait, unsigned mode,
                             int sync, void *key)
{
        struct kiocb *iocb = container_of(wait, struct kiocb, ki_wait);

        list_del_init(&wait->task_list);
        kick_iocb(iocb);
        return 1;
}

int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
                  struct iocb *iocb)
{
        struct kiocb *req;
        struct file *file;
        ssize_t ret;

        /* enforce forwards compatibility on users */
        if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
                pr_debug("EINVAL: io_submit: reserve field set\n");
                return -EINVAL;
        }

        /* prevent overflows */
        if (unlikely(
            (iocb->aio_buf != (unsigned long)iocb->aio_buf) ||
            (iocb->aio_nbytes != (size_t)iocb->aio_nbytes) ||
            ((ssize_t)iocb->aio_nbytes < 0)
           )) {
                pr_debug("EINVAL: io_submit: overflow check\n");
                return -EINVAL;
        }

        file = fget(iocb->aio_fildes);
        if (unlikely(!file))
                return -EBADF;

        req = aio_get_req(ctx);         /* returns with 2 references to req */
        if (unlikely(!req)) {
                fput(file);
                return -EAGAIN;
        }
        req->ki_filp = file;
        if (iocb->aio_flags & IOCB_FLAG_RESFD) {
                /*
                 * If the IOCB_FLAG_RESFD flag of aio_flags is set, get an
                 * instance of the file* now. The file descriptor must be
                 * an eventfd() fd, and will be signaled for each completed
                 * event using the eventfd_signal() function.
                 */
                req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
                if (unlikely(IS_ERR(req->ki_eventfd))) {
                        ret = PTR_ERR(req->ki_eventfd);
                        goto out_put_req;
                }
        }

        ret = put_user(req->ki_key, &user_iocb->aio_key);
        if (unlikely(ret)) {
                dprintk("EFAULT: aio_key\n");
                goto out_put_req;
        }

        req->ki_obj.user = user_iocb;
        req->ki_user_data = iocb->aio_data;
        req->ki_pos = iocb->aio_offset;

        req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf;
        req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
        req->ki_opcode = iocb->aio_lio_opcode;
        init_waitqueue_func_entry(&req->ki_wait, aio_wake_function);
        INIT_LIST_HEAD(&req->ki_wait.task_list);

        ret = aio_setup_iocb(req);
        if (ret)
                goto out_put_req;

        spin_lock_irq(&ctx->ctx_lock);
        aio_run_iocb(req);
        if (!list_empty(&ctx->run_list)) {
                /* drain the run list */
                while (__aio_run_iocbs(ctx))
                        ;
        }
        spin_unlock_irq(&ctx->ctx_lock);
        aio_put_req(req);       /* drop extra ref to req */
        return 0;

out_put_req:
        aio_put_req(req);       /* drop extra ref to req */
        aio_put_req(req);       /* drop i/o ref to req */
        return ret;
}
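
/*
 * [Editorial aside -- not part of fs/aio.c.]  The IOCB_FLAG_RESFD branch
 * above is the kernel half of eventfd-based completion notification; the
 * userspace half looks roughly like this.  Assumes eventfd(2) and raw
 * syscall access; /tmp/aio-resfd-demo is an arbitrary path and error
 * handling is trimmed.
 */
#include <linux/aio_abi.h>
#include <sys/eventfd.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        aio_context_t ctx = 0;
        struct iocb cb, *cbs[1] = { &cb };
        struct io_event ev;
        uint64_t count;
        char buf[] = "resfd demo\n";
        int efd = eventfd(0, 0);
        int fd = open("/tmp/aio-resfd-demo", O_WRONLY | O_CREAT | O_TRUNC, 0600);

        if (efd < 0 || fd < 0 || syscall(SYS_io_setup, 8, &ctx) < 0)
                return 1;

        memset(&cb, 0, sizeof(cb));
        cb.aio_lio_opcode = IOCB_CMD_PWRITE;
        cb.aio_fildes = fd;
        cb.aio_buf = (uintptr_t)buf;
        cb.aio_nbytes = sizeof(buf) - 1;
        cb.aio_flags = IOCB_FLAG_RESFD; /* ask aio_complete() to signal ... */
        cb.aio_resfd = efd;             /* ... this eventfd on completion   */

        if (syscall(SYS_io_submit, ctx, 1, cbs) != 1)
                return 1;
        if (read(efd, &count, sizeof(count)) == sizeof(count))
                printf("eventfd count %llu\n", (unsigned long long)count);
        syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL);  /* reap the event */
        syscall(SYS_io_destroy, ctx);
        return 0;
}
/* [End of aside.] */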

/* sys_io_submit:
 * Queue the nr iocbs pointed to by iocbpp for processing. Returns
 * the number of iocbs queued. May return -EINVAL if the aio_context
 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
 * *iocbpp[0] is not properly initialized, or if the operation specified
 * is invalid for the file descriptor in the iocb. May fail with
 * -EFAULT if any of the data structures point to invalid data. May
 * fail with -EBADF if the file descriptor specified in the first
 * iocb is invalid. May fail with -EAGAIN if insufficient resources
 * are available to queue any iocbs. Will return 0 if nr is 0. Will
 * fail with -ENOSYS if not implemented.
 */
asmlinkage long sys_io_submit(aio_context_t ctx_id, long nr,
                              struct iocb __user * __user *iocbpp)
{
        struct kioctx *ctx;
        long ret = 0;
        int i;

        if (unlikely(nr < 0))
                return -EINVAL;

        if (unlikely(!access_ok(VERIFY_READ, iocbpp, (nr * sizeof(*iocbpp)))))
                return -EFAULT;

        ctx = lookup_ioctx(ctx_id);
        if (unlikely(!ctx)) {
                pr_debug("EINVAL: io_submit: invalid context id\n");
                return -EINVAL;
        }

        /*
         * AKPM: should this return a partial result if some of the IOs were
         * successfully submitted?
         */
        for (i = 0; i < nr; i++) {
                struct iocb __user *user_iocb;
                struct iocb tmp;

                if (unlikely(__get_user(user_iocb, iocbpp + i))) {
                        ret = -EFAULT;
                        break;
                }

                if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {
                        ret = -EFAULT;
                        break;
                }

                ret = io_submit_one(ctx, user_iocb, &tmp);
                if (ret)
                        break;
        }

        put_ioctx(ctx);
        return i ? i : ret;
}
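
/*
 * [Editorial aside -- not part of fs/aio.c.]  The code above does answer
 * the AKPM question, partially: "return i ? i : ret" reports how many
 * iocbs were queued before the first failure, and surfaces the error
 * itself only if nothing was queued.  A sketch that provokes this with a
 * bad fd in the second slot; the path is arbitrary, error handling trimmed.
 */
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        aio_context_t ctx = 0;
        struct iocb good, bad, *cbs[2] = { &good, &bad };
        char buf[] = "ok\n";
        int fd = open("/tmp/aio-submit-demo", O_WRONLY | O_CREAT | O_TRUNC, 0600);
        long n;

        if (fd < 0 || syscall(SYS_io_setup, 8, &ctx) < 0)
                return 1;

        memset(&good, 0, sizeof(good));
        good.aio_lio_opcode = IOCB_CMD_PWRITE;
        good.aio_fildes = fd;
        good.aio_buf = (uintptr_t)buf;
        good.aio_nbytes = sizeof(buf) - 1;

        bad = good;
        bad.aio_fildes = -1;            /* fget() fails -> io_submit_one()
                                         * returns -EBADF for this slot */

        n = syscall(SYS_io_submit, ctx, 2, cbs);
        printf("submitted %ld of 2\n", n);      /* prints 1: the count wins
                                                 * over the error */
        syscall(SYS_io_destroy, ctx);
        return 0;
}
/* [End of aside.] */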

/* lookup_kiocb
 * Finds a given iocb for cancellation.
 */
static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
                                  u32 key)
{
        struct list_head *pos;

        assert_spin_locked(&ctx->ctx_lock);

        /* TODO: use a hash or array, this sucks. */
        list_for_each(pos, &ctx->active_reqs) {
                struct kiocb *kiocb = list_kiocb(pos);
                if (kiocb->ki_obj.user == iocb && kiocb->ki_key == key)
                        return kiocb;
        }
        return NULL;
}
1705 | 1705 | ||
1706 | /* sys_io_cancel: | 1706 | /* sys_io_cancel: |
1707 | * Attempts to cancel an iocb previously passed to io_submit. If | 1707 | * Attempts to cancel an iocb previously passed to io_submit. If |
1708 | * the operation is successfully cancelled, the resulting event is | 1708 | * the operation is successfully cancelled, the resulting event is |
1709 | * copied into the memory pointed to by result without being placed | 1709 | * copied into the memory pointed to by result without being placed |
1710 | * into the completion queue and 0 is returned. May fail with | 1710 | * into the completion queue and 0 is returned. May fail with |
1711 | * -EFAULT if any of the data structures pointed to are invalid. | 1711 | * -EFAULT if any of the data structures pointed to are invalid. |
1712 | * May fail with -EINVAL if aio_context specified by ctx_id is | 1712 | * May fail with -EINVAL if aio_context specified by ctx_id is |
1713 | * invalid. May fail with -EAGAIN if the iocb specified was not | 1713 | * invalid. May fail with -EAGAIN if the iocb specified was not |
1714 | * cancelled. Will fail with -ENOSYS if not implemented. | 1714 | * cancelled. Will fail with -ENOSYS if not implemented. |
1715 | */ | 1715 | */ |
1716 | asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb, | 1716 | asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb, |
1717 | struct io_event __user *result) | 1717 | struct io_event __user *result) |
1718 | { | 1718 | { |
1719 | int (*cancel)(struct kiocb *iocb, struct io_event *res); | 1719 | int (*cancel)(struct kiocb *iocb, struct io_event *res); |
1720 | struct kioctx *ctx; | 1720 | struct kioctx *ctx; |
1721 | struct kiocb *kiocb; | 1721 | struct kiocb *kiocb; |
1722 | u32 key; | 1722 | u32 key; |
1723 | int ret; | 1723 | int ret; |
1724 | 1724 | ||
1725 | ret = get_user(key, &iocb->aio_key); | 1725 | ret = get_user(key, &iocb->aio_key); |
1726 | if (unlikely(ret)) | 1726 | if (unlikely(ret)) |
1727 | return -EFAULT; | 1727 | return -EFAULT; |
1728 | 1728 | ||
1729 | ctx = lookup_ioctx(ctx_id); | 1729 | ctx = lookup_ioctx(ctx_id); |
1730 | if (unlikely(!ctx)) | 1730 | if (unlikely(!ctx)) |
1731 | return -EINVAL; | 1731 | return -EINVAL; |
1732 | 1732 | ||
1733 | spin_lock_irq(&ctx->ctx_lock); | 1733 | spin_lock_irq(&ctx->ctx_lock); |
1734 | ret = -EAGAIN; | 1734 | ret = -EAGAIN; |
1735 | kiocb = lookup_kiocb(ctx, iocb, key); | 1735 | kiocb = lookup_kiocb(ctx, iocb, key); |
1736 | if (kiocb && kiocb->ki_cancel) { | 1736 | if (kiocb && kiocb->ki_cancel) { |
1737 | cancel = kiocb->ki_cancel; | 1737 | cancel = kiocb->ki_cancel; |
1738 | kiocb->ki_users++; | 1738 | kiocb->ki_users++; |
1739 | kiocbSetCancelled(kiocb); | 1739 | kiocbSetCancelled(kiocb); |
1740 | } else | 1740 | } else |
1741 | cancel = NULL; | 1741 | cancel = NULL; |
1742 | spin_unlock_irq(&ctx->ctx_lock); | 1742 | spin_unlock_irq(&ctx->ctx_lock); |
1743 | 1743 | ||
1744 | if (cancel) { | 1744 | if (cancel) { |
1745 | struct io_event tmp; | 1745 | struct io_event tmp; |
1746 | pr_debug("calling cancel\n"); | 1746 | pr_debug("calling cancel\n"); |
1747 | memset(&tmp, 0, sizeof(tmp)); | 1747 | memset(&tmp, 0, sizeof(tmp)); |
1748 | tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; | 1748 | tmp.obj = (u64)(unsigned long)kiocb->ki_obj.user; |
1749 | tmp.data = kiocb->ki_user_data; | 1749 | tmp.data = kiocb->ki_user_data; |
1750 | ret = cancel(kiocb, &tmp); | 1750 | ret = cancel(kiocb, &tmp); |
1751 | if (!ret) { | 1751 | if (!ret) { |
1752 | /* Cancellation succeeded -- copy the result | 1752 | /* Cancellation succeeded -- copy the result |
1753 | * into the user's buffer. | 1753 | * into the user's buffer. |
1754 | */ | 1754 | */ |
1755 | if (copy_to_user(result, &tmp, sizeof(tmp))) | 1755 | if (copy_to_user(result, &tmp, sizeof(tmp))) |
1756 | ret = -EFAULT; | 1756 | ret = -EFAULT; |
1757 | } | 1757 | } |
1758 | } else | 1758 | } else |
1759 | ret = -EINVAL; | 1759 | ret = -EINVAL; |
1760 | 1760 | ||
1761 | put_ioctx(ctx); | 1761 | put_ioctx(ctx); |
1762 | 1762 | ||
1763 | return ret; | 1763 | return ret; |
1764 | } | 1764 | } |
1765 | 1765 | ||
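The -EAGAIN contract documented above matters to callers: a request that has already completed, or whose ki_cancel method is unset, cannot be cancelled, and its completion event still arrives through the ring. A userspace sketch of handling both outcomes -- try_cancel() is a hypothetical helper over the raw syscall, not part of this commit:

#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>

static int try_cancel(aio_context_t ctx, struct iocb *cb)
{
	struct io_event ev;

	if (syscall(__NR_io_cancel, ctx, cb, &ev) == 0) {
		/* Cancelled: ev holds the synthesized completion; no
		 * event for this iocb will appear in the ring. */
		printf("cancelled, data=%llu\n", (unsigned long long)ev.data);
		return 0;
	}
	if (errno == EAGAIN)
		fprintf(stderr, "not cancelled; reap it via io_getevents\n");
	return -1;
}
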
1766 | /* io_getevents: | 1766 | /* io_getevents: |
1767 | * Attempts to read at least min_nr events and up to nr events from | 1767 | * Attempts to read at least min_nr events and up to nr events from |
1768 | * the completion queue for the aio_context specified by ctx_id. May | 1768 | * the completion queue for the aio_context specified by ctx_id. May |
1769 | * fail with -EINVAL if ctx_id is invalid, if min_nr is out of range, | 1769 | * fail with -EINVAL if ctx_id is invalid, if min_nr is out of range, |
1770 |  * if nr is out of range, or if when is out of range. May fail | 1770 |  * if nr is out of range, or if when is out of range. May fail |
1771 |  * with -EFAULT if any of the memory specified is invalid. May return | 1771 |  * with -EFAULT if any of the memory specified is invalid. May return |
1772 | * 0 or < min_nr if no events are available and the timeout specified | 1772 | * 0 or < min_nr if no events are available and the timeout specified |
1773 | * by when has elapsed, where when == NULL specifies an infinite | 1773 | * by when has elapsed, where when == NULL specifies an infinite |
1774 | * timeout. Note that the timeout pointed to by when is relative and | 1774 | * timeout. Note that the timeout pointed to by when is relative and |
1775 | * will be updated if not NULL and the operation blocks. Will fail | 1775 | * will be updated if not NULL and the operation blocks. Will fail |
1776 | * with -ENOSYS if not implemented. | 1776 | * with -ENOSYS if not implemented. |
1777 | */ | 1777 | */ |
1778 | asmlinkage long sys_io_getevents(aio_context_t ctx_id, | 1778 | asmlinkage long sys_io_getevents(aio_context_t ctx_id, |
1779 | long min_nr, | 1779 | long min_nr, |
1780 | long nr, | 1780 | long nr, |
1781 | struct io_event __user *events, | 1781 | struct io_event __user *events, |
1782 | struct timespec __user *timeout) | 1782 | struct timespec __user *timeout) |
1783 | { | 1783 | { |
1784 | struct kioctx *ioctx = lookup_ioctx(ctx_id); | 1784 | struct kioctx *ioctx = lookup_ioctx(ctx_id); |
1785 | long ret = -EINVAL; | 1785 | long ret = -EINVAL; |
1786 | 1786 | ||
1787 | if (likely(ioctx)) { | 1787 | if (likely(ioctx)) { |
1788 | if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0)) | 1788 | if (likely(min_nr <= nr && min_nr >= 0 && nr >= 0)) |
1789 | ret = read_events(ioctx, min_nr, nr, events, timeout); | 1789 | ret = read_events(ioctx, min_nr, nr, events, timeout); |
1790 | put_ioctx(ioctx); | 1790 | put_ioctx(ioctx); |
1791 | } | 1791 | } |
1792 | 1792 | ||
1793 | asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout); | ||
1793 | return ret; | 1794 | return ret; |
1794 | } | 1795 | } |
1795 | 1796 | ||
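The asmlinkage_protect(5, ...) line above is the whole of this commit. On i386 an asmlinkage syscall's arguments live in the caller's stack frame, overlaying the saved user registers in struct pt_regs, and with CONFIG_FRAME_POINTER=n a GCC that has inlined and optimized read_events() may reuse those argument slots as scratch space, clobbering the user's registers. The macro defeats this by naming ret and every argument in an empty asm, keeping the slots live to the end of the function. The i386 definition looks roughly like this (paraphrased from the macro introduced by the parent commit; the exact text may differ):

/* An empty asm that consumes ret and each stack-passed argument, so
 * GCC must treat their stack slots as live and cannot spill into
 * them or turn the call into a tail call. */
#define asmlinkage_protect(n, ret, args...) \
	__asmlinkage_protect##n(ret, ##args)
#define __asmlinkage_protect_n(ret, args...) \
	__asm__ __volatile__ ("" : "=r" (ret) : "0" (ret), ##args)
#define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \
	__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
			       "g" (arg4), "g" (arg5))

On x86-64 the arguments arrive in registers, so nothing needs protecting there; architectures that define no such macro get a do-nothing fallback from the generic linkage header.
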
1796 | __initcall(aio_setup); | 1797 | __initcall(aio_setup); |
1797 | 1798 | ||
1798 | EXPORT_SYMBOL(aio_complete); | 1799 | EXPORT_SYMBOL(aio_complete); |
1799 | EXPORT_SYMBOL(aio_put_req); | 1800 | EXPORT_SYMBOL(aio_put_req); |
1800 | EXPORT_SYMBOL(wait_on_sync_kiocb); | 1801 | EXPORT_SYMBOL(wait_on_sync_kiocb); |
1801 | 1802 |
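
To see the io_getevents semantics documented above from the caller's side -- at least min_nr events unless the relative timeout in when expires first, with a NULL when meaning wait indefinitely -- here is a minimal standalone sketch using raw syscalls (illustrative only; not part of this commit):

#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	aio_context_t ctx = 0;
	struct io_event events[8];
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };	/* relative */
	long n;

	if (syscall(__NR_io_setup, 128, &ctx) < 0)
		return 1;

	/* Wait up to 1s for at least one event; returns 0 on timeout
	 * since nothing has been submitted on this fresh context. */
	n = syscall(__NR_io_getevents, ctx, 1, 8, events, &ts);
	printf("got %ld events\n", n);

	syscall(__NR_io_destroy, ctx);
	return 0;
}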