Commit 54c807e71d5ac59dee56c685f2b66e27cd54c475

Authored by Jan Kara
Committed by Al Viro
1 parent da2d8455ed

fs: Fix possible use-after-free with AIO

Running AIO pins the inode in memory via the file reference. Once the AIO
is completed using aio_complete(), the file reference is put and the inode
can be freed from memory. So we have to be sure that calling aio_complete()
is the last thing we do with the inode.

CC: Christoph Hellwig <hch@infradead.org>
CC: Jens Axboe <axboe@kernel.dk>
CC: Jeff Moyer <jmoyer@redhat.com>
CC: stable@vger.kernel.org
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Showing 1 changed file with 1 addition and 1 deletion
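
The change itself is a one-line move inside dio_complete(): inode_dio_done()
touches the inode, so it must run before aio_complete(), which puts the file
reference that pins the inode in memory. A condensed before/after view of the
hunk (taken from the inline diff below; surrounding code omitted):

    /* before: aio_complete() can let the inode be freed, then inode_dio_done() touches it */
    } else {
            if (is_async)
                    aio_complete(dio->iocb, ret, 0);
            inode_dio_done(dio->inode);
    }

    /* after: finish all work on the inode first, call aio_complete() last */
    } else {
            inode_dio_done(dio->inode);
            if (is_async)
                    aio_complete(dio->iocb, ret, 0);
    }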

1 /* 1 /*
2 * fs/direct-io.c 2 * fs/direct-io.c
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * 5 *
6 * O_DIRECT 6 * O_DIRECT
7 * 7 *
8 * 04Jul2002 Andrew Morton 8 * 04Jul2002 Andrew Morton
9 * Initial version 9 * Initial version
10 * 11Sep2002 janetinc@us.ibm.com 10 * 11Sep2002 janetinc@us.ibm.com
11 * added readv/writev support. 11 * added readv/writev support.
12 * 29Oct2002 Andrew Morton 12 * 29Oct2002 Andrew Morton
13 * rewrote bio_add_page() support. 13 * rewrote bio_add_page() support.
14 * 30Oct2002 pbadari@us.ibm.com 14 * 30Oct2002 pbadari@us.ibm.com
15 * added support for non-aligned IO. 15 * added support for non-aligned IO.
16 * 06Nov2002 pbadari@us.ibm.com 16 * 06Nov2002 pbadari@us.ibm.com
17 * added asynchronous IO support. 17 * added asynchronous IO support.
18 * 21Jul2003 nathans@sgi.com 18 * 21Jul2003 nathans@sgi.com
19 * added IO completion notifier. 19 * added IO completion notifier.
20 */ 20 */
21 21
22 #include <linux/kernel.h> 22 #include <linux/kernel.h>
23 #include <linux/module.h> 23 #include <linux/module.h>
24 #include <linux/types.h> 24 #include <linux/types.h>
25 #include <linux/fs.h> 25 #include <linux/fs.h>
26 #include <linux/mm.h> 26 #include <linux/mm.h>
27 #include <linux/slab.h> 27 #include <linux/slab.h>
28 #include <linux/highmem.h> 28 #include <linux/highmem.h>
29 #include <linux/pagemap.h> 29 #include <linux/pagemap.h>
30 #include <linux/task_io_accounting_ops.h> 30 #include <linux/task_io_accounting_ops.h>
31 #include <linux/bio.h> 31 #include <linux/bio.h>
32 #include <linux/wait.h> 32 #include <linux/wait.h>
33 #include <linux/err.h> 33 #include <linux/err.h>
34 #include <linux/blkdev.h> 34 #include <linux/blkdev.h>
35 #include <linux/buffer_head.h> 35 #include <linux/buffer_head.h>
36 #include <linux/rwsem.h> 36 #include <linux/rwsem.h>
37 #include <linux/uio.h> 37 #include <linux/uio.h>
38 #include <linux/atomic.h> 38 #include <linux/atomic.h>
39 #include <linux/prefetch.h> 39 #include <linux/prefetch.h>
40 40
41 /* 41 /*
42 * How many user pages to map in one call to get_user_pages(). This determines 42 * How many user pages to map in one call to get_user_pages(). This determines
43 * the size of a structure in the slab cache 43 * the size of a structure in the slab cache
44 */ 44 */
45 #define DIO_PAGES 64 45 #define DIO_PAGES 64
46 46
47 /* 47 /*
48 * This code generally works in units of "dio_blocks". A dio_block is 48 * This code generally works in units of "dio_blocks". A dio_block is
49 * somewhere between the hard sector size and the filesystem block size. it 49 * somewhere between the hard sector size and the filesystem block size. it
50 * is determined on a per-invocation basis. When talking to the filesystem 50 * is determined on a per-invocation basis. When talking to the filesystem
51 * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity 51 * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity
52 * down by dio->blkfactor. Similarly, fs-blocksize quantities are converted 52 * down by dio->blkfactor. Similarly, fs-blocksize quantities are converted
53 * to bio_block quantities by shifting left by blkfactor. 53 * to bio_block quantities by shifting left by blkfactor.
54 * 54 *
55 * If blkfactor is zero then the user's request was aligned to the filesystem's 55 * If blkfactor is zero then the user's request was aligned to the filesystem's
56 * blocksize. 56 * blocksize.
57 */ 57 */
58 58
59 /* dio_state only used in the submission path */ 59 /* dio_state only used in the submission path */
60 60
61 struct dio_submit { 61 struct dio_submit {
62 struct bio *bio; /* bio under assembly */ 62 struct bio *bio; /* bio under assembly */
63 unsigned blkbits; /* doesn't change */ 63 unsigned blkbits; /* doesn't change */
64 unsigned blkfactor; /* When we're using an alignment which 64 unsigned blkfactor; /* When we're using an alignment which
65 is finer than the filesystem's soft 65 is finer than the filesystem's soft
66 blocksize, this specifies how much 66 blocksize, this specifies how much
67 finer. blkfactor=2 means 1/4-block 67 finer. blkfactor=2 means 1/4-block
68 alignment. Does not change */ 68 alignment. Does not change */
69 unsigned start_zero_done; /* flag: sub-blocksize zeroing has 69 unsigned start_zero_done; /* flag: sub-blocksize zeroing has
70 been performed at the start of a 70 been performed at the start of a
71 write */ 71 write */
72 int pages_in_io; /* approximate total IO pages */ 72 int pages_in_io; /* approximate total IO pages */
73 size_t size; /* total request size (doesn't change)*/ 73 size_t size; /* total request size (doesn't change)*/
74 sector_t block_in_file; /* Current offset into the underlying 74 sector_t block_in_file; /* Current offset into the underlying
75 file in dio_block units. */ 75 file in dio_block units. */
76 unsigned blocks_available; /* At block_in_file. changes */ 76 unsigned blocks_available; /* At block_in_file. changes */
77 int reap_counter; /* rate limit reaping */ 77 int reap_counter; /* rate limit reaping */
78 sector_t final_block_in_request;/* doesn't change */ 78 sector_t final_block_in_request;/* doesn't change */
79 unsigned first_block_in_page; /* doesn't change, Used only once */ 79 unsigned first_block_in_page; /* doesn't change, Used only once */
80 int boundary; /* prev block is at a boundary */ 80 int boundary; /* prev block is at a boundary */
81 get_block_t *get_block; /* block mapping function */ 81 get_block_t *get_block; /* block mapping function */
82 dio_submit_t *submit_io; /* IO submition function */ 82 dio_submit_t *submit_io; /* IO submition function */
83 83
84 loff_t logical_offset_in_bio; /* current first logical block in bio */ 84 loff_t logical_offset_in_bio; /* current first logical block in bio */
85 sector_t final_block_in_bio; /* current final block in bio + 1 */ 85 sector_t final_block_in_bio; /* current final block in bio + 1 */
86 sector_t next_block_for_io; /* next block to be put under IO, 86 sector_t next_block_for_io; /* next block to be put under IO,
87 in dio_blocks units */ 87 in dio_blocks units */
88 88
89 /* 89 /*
90 * Deferred addition of a page to the dio. These variables are 90 * Deferred addition of a page to the dio. These variables are
91 * private to dio_send_cur_page(), submit_page_section() and 91 * private to dio_send_cur_page(), submit_page_section() and
92 * dio_bio_add_page(). 92 * dio_bio_add_page().
93 */ 93 */
94 struct page *cur_page; /* The page */ 94 struct page *cur_page; /* The page */
95 unsigned cur_page_offset; /* Offset into it, in bytes */ 95 unsigned cur_page_offset; /* Offset into it, in bytes */
96 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ 96 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
97 sector_t cur_page_block; /* Where it starts */ 97 sector_t cur_page_block; /* Where it starts */
98 loff_t cur_page_fs_offset; /* Offset in file */ 98 loff_t cur_page_fs_offset; /* Offset in file */
99 99
100 /* 100 /*
101 * Page fetching state. These variables belong to dio_refill_pages(). 101 * Page fetching state. These variables belong to dio_refill_pages().
102 */ 102 */
103 int curr_page; /* changes */ 103 int curr_page; /* changes */
104 int total_pages; /* doesn't change */ 104 int total_pages; /* doesn't change */
105 unsigned long curr_user_address;/* changes */ 105 unsigned long curr_user_address;/* changes */
106 106
107 /* 107 /*
108 * Page queue. These variables belong to dio_refill_pages() and 108 * Page queue. These variables belong to dio_refill_pages() and
109 * dio_get_page(). 109 * dio_get_page().
110 */ 110 */
111 unsigned head; /* next page to process */ 111 unsigned head; /* next page to process */
112 unsigned tail; /* last valid page + 1 */ 112 unsigned tail; /* last valid page + 1 */
113 }; 113 };
114 114
115 /* dio_state communicated between submission path and end_io */ 115 /* dio_state communicated between submission path and end_io */
116 struct dio { 116 struct dio {
117 int flags; /* doesn't change */ 117 int flags; /* doesn't change */
118 int rw; 118 int rw;
119 struct inode *inode; 119 struct inode *inode;
120 loff_t i_size; /* i_size when submitted */ 120 loff_t i_size; /* i_size when submitted */
121 dio_iodone_t *end_io; /* IO completion function */ 121 dio_iodone_t *end_io; /* IO completion function */
122 122
123 void *private; /* copy from map_bh.b_private */ 123 void *private; /* copy from map_bh.b_private */
124 124
125 /* BIO completion state */ 125 /* BIO completion state */
126 spinlock_t bio_lock; /* protects BIO fields below */ 126 spinlock_t bio_lock; /* protects BIO fields below */
127 int page_errors; /* errno from get_user_pages() */ 127 int page_errors; /* errno from get_user_pages() */
128 int is_async; /* is IO async ? */ 128 int is_async; /* is IO async ? */
129 int io_error; /* IO error in completion path */ 129 int io_error; /* IO error in completion path */
130 unsigned long refcount; /* direct_io_worker() and bios */ 130 unsigned long refcount; /* direct_io_worker() and bios */
131 struct bio *bio_list; /* singly linked via bi_private */ 131 struct bio *bio_list; /* singly linked via bi_private */
132 struct task_struct *waiter; /* waiting task (NULL if none) */ 132 struct task_struct *waiter; /* waiting task (NULL if none) */
133 133
134 /* AIO related stuff */ 134 /* AIO related stuff */
135 struct kiocb *iocb; /* kiocb */ 135 struct kiocb *iocb; /* kiocb */
136 ssize_t result; /* IO result */ 136 ssize_t result; /* IO result */
137 137
138 /* 138 /*
139 * pages[] (and any fields placed after it) are not zeroed out at 139 * pages[] (and any fields placed after it) are not zeroed out at
140 * allocation time. Don't add new fields after pages[] unless you 140 * allocation time. Don't add new fields after pages[] unless you
141 * wish that they not be zeroed. 141 * wish that they not be zeroed.
142 */ 142 */
143 struct page *pages[DIO_PAGES]; /* page buffer */ 143 struct page *pages[DIO_PAGES]; /* page buffer */
144 } ____cacheline_aligned_in_smp; 144 } ____cacheline_aligned_in_smp;
145 145
146 static struct kmem_cache *dio_cache __read_mostly; 146 static struct kmem_cache *dio_cache __read_mostly;
147 147
148 /* 148 /*
149 * How many pages are in the queue? 149 * How many pages are in the queue?
150 */ 150 */
151 static inline unsigned dio_pages_present(struct dio_submit *sdio) 151 static inline unsigned dio_pages_present(struct dio_submit *sdio)
152 { 152 {
153 return sdio->tail - sdio->head; 153 return sdio->tail - sdio->head;
154 } 154 }
155 155
156 /* 156 /*
157 * Go grab and pin some userspace pages. Typically we'll get 64 at a time. 157 * Go grab and pin some userspace pages. Typically we'll get 64 at a time.
158 */ 158 */
159 static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) 159 static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
160 { 160 {
161 int ret; 161 int ret;
162 int nr_pages; 162 int nr_pages;
163 163
164 nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); 164 nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES);
165 ret = get_user_pages_fast( 165 ret = get_user_pages_fast(
166 sdio->curr_user_address, /* Where from? */ 166 sdio->curr_user_address, /* Where from? */
167 nr_pages, /* How many pages? */ 167 nr_pages, /* How many pages? */
168 dio->rw == READ, /* Write to memory? */ 168 dio->rw == READ, /* Write to memory? */
169 &dio->pages[0]); /* Put results here */ 169 &dio->pages[0]); /* Put results here */
170 170
171 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { 171 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
172 struct page *page = ZERO_PAGE(0); 172 struct page *page = ZERO_PAGE(0);
173 /* 173 /*
174 * A memory fault, but the filesystem has some outstanding 174 * A memory fault, but the filesystem has some outstanding
175 * mapped blocks. We need to use those blocks up to avoid 175 * mapped blocks. We need to use those blocks up to avoid
176 * leaking stale data in the file. 176 * leaking stale data in the file.
177 */ 177 */
178 if (dio->page_errors == 0) 178 if (dio->page_errors == 0)
179 dio->page_errors = ret; 179 dio->page_errors = ret;
180 page_cache_get(page); 180 page_cache_get(page);
181 dio->pages[0] = page; 181 dio->pages[0] = page;
182 sdio->head = 0; 182 sdio->head = 0;
183 sdio->tail = 1; 183 sdio->tail = 1;
184 ret = 0; 184 ret = 0;
185 goto out; 185 goto out;
186 } 186 }
187 187
188 if (ret >= 0) { 188 if (ret >= 0) {
189 sdio->curr_user_address += ret * PAGE_SIZE; 189 sdio->curr_user_address += ret * PAGE_SIZE;
190 sdio->curr_page += ret; 190 sdio->curr_page += ret;
191 sdio->head = 0; 191 sdio->head = 0;
192 sdio->tail = ret; 192 sdio->tail = ret;
193 ret = 0; 193 ret = 0;
194 } 194 }
195 out: 195 out:
196 return ret; 196 return ret;
197 } 197 }
198 198
199 /* 199 /*
200 * Get another userspace page. Returns an ERR_PTR on error. Pages are 200 * Get another userspace page. Returns an ERR_PTR on error. Pages are
201 * buffered inside the dio so that we can call get_user_pages() against a 201 * buffered inside the dio so that we can call get_user_pages() against a
202 * decent number of pages, less frequently. To provide nicer use of the 202 * decent number of pages, less frequently. To provide nicer use of the
203 * L1 cache. 203 * L1 cache.
204 */ 204 */
205 static inline struct page *dio_get_page(struct dio *dio, 205 static inline struct page *dio_get_page(struct dio *dio,
206 struct dio_submit *sdio) 206 struct dio_submit *sdio)
207 { 207 {
208 if (dio_pages_present(sdio) == 0) { 208 if (dio_pages_present(sdio) == 0) {
209 int ret; 209 int ret;
210 210
211 ret = dio_refill_pages(dio, sdio); 211 ret = dio_refill_pages(dio, sdio);
212 if (ret) 212 if (ret)
213 return ERR_PTR(ret); 213 return ERR_PTR(ret);
214 BUG_ON(dio_pages_present(sdio) == 0); 214 BUG_ON(dio_pages_present(sdio) == 0);
215 } 215 }
216 return dio->pages[sdio->head++]; 216 return dio->pages[sdio->head++];
217 } 217 }
218 218
219 /** 219 /**
220 * dio_complete() - called when all DIO BIO I/O has been completed 220 * dio_complete() - called when all DIO BIO I/O has been completed
221 * @offset: the byte offset in the file of the completed operation 221 * @offset: the byte offset in the file of the completed operation
222 * 222 *
223 * This releases locks as dictated by the locking type, lets interested parties 223 * This releases locks as dictated by the locking type, lets interested parties
224 * know that a DIO operation has completed, and calculates the resulting return 224 * know that a DIO operation has completed, and calculates the resulting return
225 * code for the operation. 225 * code for the operation.
226 * 226 *
227 * It lets the filesystem know if it registered an interest earlier via 227 * It lets the filesystem know if it registered an interest earlier via
228 * get_block. Pass the private field of the map buffer_head so that 228 * get_block. Pass the private field of the map buffer_head so that
229 * filesystems can use it to hold additional state between get_block calls and 229 * filesystems can use it to hold additional state between get_block calls and
230 * dio_complete. 230 * dio_complete.
231 */ 231 */
232 static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async) 232 static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
233 { 233 {
234 ssize_t transferred = 0; 234 ssize_t transferred = 0;
235 235
236 /* 236 /*
237 * AIO submission can race with bio completion to get here while 237 * AIO submission can race with bio completion to get here while
238 * expecting to have the last io completed by bio completion. 238 * expecting to have the last io completed by bio completion.
239 * In that case -EIOCBQUEUED is in fact not an error we want 239 * In that case -EIOCBQUEUED is in fact not an error we want
240 * to preserve through this call. 240 * to preserve through this call.
241 */ 241 */
242 if (ret == -EIOCBQUEUED) 242 if (ret == -EIOCBQUEUED)
243 ret = 0; 243 ret = 0;
244 244
245 if (dio->result) { 245 if (dio->result) {
246 transferred = dio->result; 246 transferred = dio->result;
247 247
248 /* Check for short read case */ 248 /* Check for short read case */
249 if ((dio->rw == READ) && ((offset + transferred) > dio->i_size)) 249 if ((dio->rw == READ) && ((offset + transferred) > dio->i_size))
250 transferred = dio->i_size - offset; 250 transferred = dio->i_size - offset;
251 } 251 }
252 252
253 if (ret == 0) 253 if (ret == 0)
254 ret = dio->page_errors; 254 ret = dio->page_errors;
255 if (ret == 0) 255 if (ret == 0)
256 ret = dio->io_error; 256 ret = dio->io_error;
257 if (ret == 0) 257 if (ret == 0)
258 ret = transferred; 258 ret = transferred;
259 259
260 if (dio->end_io && dio->result) { 260 if (dio->end_io && dio->result) {
261 dio->end_io(dio->iocb, offset, transferred, 261 dio->end_io(dio->iocb, offset, transferred,
262 dio->private, ret, is_async); 262 dio->private, ret, is_async);
263 } else { 263 } else {
264 inode_dio_done(dio->inode);
264 if (is_async) 265 if (is_async)
265 aio_complete(dio->iocb, ret, 0); 266 aio_complete(dio->iocb, ret, 0);
266 inode_dio_done(dio->inode);
267 } 267 }
268 268
269 return ret; 269 return ret;
270 } 270 }
271 271
272 static int dio_bio_complete(struct dio *dio, struct bio *bio); 272 static int dio_bio_complete(struct dio *dio, struct bio *bio);
273 /* 273 /*
274 * Asynchronous IO callback. 274 * Asynchronous IO callback.
275 */ 275 */
276 static void dio_bio_end_aio(struct bio *bio, int error) 276 static void dio_bio_end_aio(struct bio *bio, int error)
277 { 277 {
278 struct dio *dio = bio->bi_private; 278 struct dio *dio = bio->bi_private;
279 unsigned long remaining; 279 unsigned long remaining;
280 unsigned long flags; 280 unsigned long flags;
281 281
282 /* cleanup the bio */ 282 /* cleanup the bio */
283 dio_bio_complete(dio, bio); 283 dio_bio_complete(dio, bio);
284 284
285 spin_lock_irqsave(&dio->bio_lock, flags); 285 spin_lock_irqsave(&dio->bio_lock, flags);
286 remaining = --dio->refcount; 286 remaining = --dio->refcount;
287 if (remaining == 1 && dio->waiter) 287 if (remaining == 1 && dio->waiter)
288 wake_up_process(dio->waiter); 288 wake_up_process(dio->waiter);
289 spin_unlock_irqrestore(&dio->bio_lock, flags); 289 spin_unlock_irqrestore(&dio->bio_lock, flags);
290 290
291 if (remaining == 0) { 291 if (remaining == 0) {
292 dio_complete(dio, dio->iocb->ki_pos, 0, true); 292 dio_complete(dio, dio->iocb->ki_pos, 0, true);
293 kmem_cache_free(dio_cache, dio); 293 kmem_cache_free(dio_cache, dio);
294 } 294 }
295 } 295 }
296 296
297 /* 297 /*
298 * The BIO completion handler simply queues the BIO up for the process-context 298 * The BIO completion handler simply queues the BIO up for the process-context
299 * handler. 299 * handler.
300 * 300 *
301 * During I/O bi_private points at the dio. After I/O, bi_private is used to 301 * During I/O bi_private points at the dio. After I/O, bi_private is used to
302 * implement a singly-linked list of completed BIOs, at dio->bio_list. 302 * implement a singly-linked list of completed BIOs, at dio->bio_list.
303 */ 303 */
304 static void dio_bio_end_io(struct bio *bio, int error) 304 static void dio_bio_end_io(struct bio *bio, int error)
305 { 305 {
306 struct dio *dio = bio->bi_private; 306 struct dio *dio = bio->bi_private;
307 unsigned long flags; 307 unsigned long flags;
308 308
309 spin_lock_irqsave(&dio->bio_lock, flags); 309 spin_lock_irqsave(&dio->bio_lock, flags);
310 bio->bi_private = dio->bio_list; 310 bio->bi_private = dio->bio_list;
311 dio->bio_list = bio; 311 dio->bio_list = bio;
312 if (--dio->refcount == 1 && dio->waiter) 312 if (--dio->refcount == 1 && dio->waiter)
313 wake_up_process(dio->waiter); 313 wake_up_process(dio->waiter);
314 spin_unlock_irqrestore(&dio->bio_lock, flags); 314 spin_unlock_irqrestore(&dio->bio_lock, flags);
315 } 315 }
316 316
317 /** 317 /**
318 * dio_end_io - handle the end io action for the given bio 318 * dio_end_io - handle the end io action for the given bio
319 * @bio: The direct io bio thats being completed 319 * @bio: The direct io bio thats being completed
320 * @error: Error if there was one 320 * @error: Error if there was one
321 * 321 *
322 * This is meant to be called by any filesystem that uses their own dio_submit_t 322 * This is meant to be called by any filesystem that uses their own dio_submit_t
323 * so that the DIO specific endio actions are dealt with after the filesystem 323 * so that the DIO specific endio actions are dealt with after the filesystem
324 * has done it's completion work. 324 * has done it's completion work.
325 */ 325 */
326 void dio_end_io(struct bio *bio, int error) 326 void dio_end_io(struct bio *bio, int error)
327 { 327 {
328 struct dio *dio = bio->bi_private; 328 struct dio *dio = bio->bi_private;
329 329
330 if (dio->is_async) 330 if (dio->is_async)
331 dio_bio_end_aio(bio, error); 331 dio_bio_end_aio(bio, error);
332 else 332 else
333 dio_bio_end_io(bio, error); 333 dio_bio_end_io(bio, error);
334 } 334 }
335 EXPORT_SYMBOL_GPL(dio_end_io); 335 EXPORT_SYMBOL_GPL(dio_end_io);
336 336
337 static inline void 337 static inline void
338 dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, 338 dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
339 struct block_device *bdev, 339 struct block_device *bdev,
340 sector_t first_sector, int nr_vecs) 340 sector_t first_sector, int nr_vecs)
341 { 341 {
342 struct bio *bio; 342 struct bio *bio;
343 343
344 /* 344 /*
345 * bio_alloc() is guaranteed to return a bio when called with 345 * bio_alloc() is guaranteed to return a bio when called with
346 * __GFP_WAIT and we request a valid number of vectors. 346 * __GFP_WAIT and we request a valid number of vectors.
347 */ 347 */
348 bio = bio_alloc(GFP_KERNEL, nr_vecs); 348 bio = bio_alloc(GFP_KERNEL, nr_vecs);
349 349
350 bio->bi_bdev = bdev; 350 bio->bi_bdev = bdev;
351 bio->bi_sector = first_sector; 351 bio->bi_sector = first_sector;
352 if (dio->is_async) 352 if (dio->is_async)
353 bio->bi_end_io = dio_bio_end_aio; 353 bio->bi_end_io = dio_bio_end_aio;
354 else 354 else
355 bio->bi_end_io = dio_bio_end_io; 355 bio->bi_end_io = dio_bio_end_io;
356 356
357 sdio->bio = bio; 357 sdio->bio = bio;
358 sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; 358 sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
359 } 359 }
360 360
361 /* 361 /*
362 * In the AIO read case we speculatively dirty the pages before starting IO. 362 * In the AIO read case we speculatively dirty the pages before starting IO.
363 * During IO completion, any of these pages which happen to have been written 363 * During IO completion, any of these pages which happen to have been written
364 * back will be redirtied by bio_check_pages_dirty(). 364 * back will be redirtied by bio_check_pages_dirty().
365 * 365 *
366 * bios hold a dio reference between submit_bio and ->end_io. 366 * bios hold a dio reference between submit_bio and ->end_io.
367 */ 367 */
368 static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) 368 static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
369 { 369 {
370 struct bio *bio = sdio->bio; 370 struct bio *bio = sdio->bio;
371 unsigned long flags; 371 unsigned long flags;
372 372
373 bio->bi_private = dio; 373 bio->bi_private = dio;
374 374
375 spin_lock_irqsave(&dio->bio_lock, flags); 375 spin_lock_irqsave(&dio->bio_lock, flags);
376 dio->refcount++; 376 dio->refcount++;
377 spin_unlock_irqrestore(&dio->bio_lock, flags); 377 spin_unlock_irqrestore(&dio->bio_lock, flags);
378 378
379 if (dio->is_async && dio->rw == READ) 379 if (dio->is_async && dio->rw == READ)
380 bio_set_pages_dirty(bio); 380 bio_set_pages_dirty(bio);
381 381
382 if (sdio->submit_io) 382 if (sdio->submit_io)
383 sdio->submit_io(dio->rw, bio, dio->inode, 383 sdio->submit_io(dio->rw, bio, dio->inode,
384 sdio->logical_offset_in_bio); 384 sdio->logical_offset_in_bio);
385 else 385 else
386 submit_bio(dio->rw, bio); 386 submit_bio(dio->rw, bio);
387 387
388 sdio->bio = NULL; 388 sdio->bio = NULL;
389 sdio->boundary = 0; 389 sdio->boundary = 0;
390 sdio->logical_offset_in_bio = 0; 390 sdio->logical_offset_in_bio = 0;
391 } 391 }
392 392
393 /* 393 /*
394 * Release any resources in case of a failure 394 * Release any resources in case of a failure
395 */ 395 */
396 static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) 396 static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
397 { 397 {
398 while (dio_pages_present(sdio)) 398 while (dio_pages_present(sdio))
399 page_cache_release(dio_get_page(dio, sdio)); 399 page_cache_release(dio_get_page(dio, sdio));
400 } 400 }
401 401
402 /* 402 /*
403 * Wait for the next BIO to complete. Remove it and return it. NULL is 403 * Wait for the next BIO to complete. Remove it and return it. NULL is
404 * returned once all BIOs have been completed. This must only be called once 404 * returned once all BIOs have been completed. This must only be called once
405 * all bios have been issued so that dio->refcount can only decrease. This 405 * all bios have been issued so that dio->refcount can only decrease. This
406 * requires that that the caller hold a reference on the dio. 406 * requires that that the caller hold a reference on the dio.
407 */ 407 */
408 static struct bio *dio_await_one(struct dio *dio) 408 static struct bio *dio_await_one(struct dio *dio)
409 { 409 {
410 unsigned long flags; 410 unsigned long flags;
411 struct bio *bio = NULL; 411 struct bio *bio = NULL;
412 412
413 spin_lock_irqsave(&dio->bio_lock, flags); 413 spin_lock_irqsave(&dio->bio_lock, flags);
414 414
415 /* 415 /*
416 * Wait as long as the list is empty and there are bios in flight. bio 416 * Wait as long as the list is empty and there are bios in flight. bio
417 * completion drops the count, maybe adds to the list, and wakes while 417 * completion drops the count, maybe adds to the list, and wakes while
418 * holding the bio_lock so we don't need set_current_state()'s barrier 418 * holding the bio_lock so we don't need set_current_state()'s barrier
419 * and can call it after testing our condition. 419 * and can call it after testing our condition.
420 */ 420 */
421 while (dio->refcount > 1 && dio->bio_list == NULL) { 421 while (dio->refcount > 1 && dio->bio_list == NULL) {
422 __set_current_state(TASK_UNINTERRUPTIBLE); 422 __set_current_state(TASK_UNINTERRUPTIBLE);
423 dio->waiter = current; 423 dio->waiter = current;
424 spin_unlock_irqrestore(&dio->bio_lock, flags); 424 spin_unlock_irqrestore(&dio->bio_lock, flags);
425 io_schedule(); 425 io_schedule();
426 /* wake up sets us TASK_RUNNING */ 426 /* wake up sets us TASK_RUNNING */
427 spin_lock_irqsave(&dio->bio_lock, flags); 427 spin_lock_irqsave(&dio->bio_lock, flags);
428 dio->waiter = NULL; 428 dio->waiter = NULL;
429 } 429 }
430 if (dio->bio_list) { 430 if (dio->bio_list) {
431 bio = dio->bio_list; 431 bio = dio->bio_list;
432 dio->bio_list = bio->bi_private; 432 dio->bio_list = bio->bi_private;
433 } 433 }
434 spin_unlock_irqrestore(&dio->bio_lock, flags); 434 spin_unlock_irqrestore(&dio->bio_lock, flags);
435 return bio; 435 return bio;
436 } 436 }
437 437
438 /* 438 /*
439 * Process one completed BIO. No locks are held. 439 * Process one completed BIO. No locks are held.
440 */ 440 */
441 static int dio_bio_complete(struct dio *dio, struct bio *bio) 441 static int dio_bio_complete(struct dio *dio, struct bio *bio)
442 { 442 {
443 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 443 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
444 struct bio_vec *bvec = bio->bi_io_vec; 444 struct bio_vec *bvec = bio->bi_io_vec;
445 int page_no; 445 int page_no;
446 446
447 if (!uptodate) 447 if (!uptodate)
448 dio->io_error = -EIO; 448 dio->io_error = -EIO;
449 449
450 if (dio->is_async && dio->rw == READ) { 450 if (dio->is_async && dio->rw == READ) {
451 bio_check_pages_dirty(bio); /* transfers ownership */ 451 bio_check_pages_dirty(bio); /* transfers ownership */
452 } else { 452 } else {
453 for (page_no = 0; page_no < bio->bi_vcnt; page_no++) { 453 for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
454 struct page *page = bvec[page_no].bv_page; 454 struct page *page = bvec[page_no].bv_page;
455 455
456 if (dio->rw == READ && !PageCompound(page)) 456 if (dio->rw == READ && !PageCompound(page))
457 set_page_dirty_lock(page); 457 set_page_dirty_lock(page);
458 page_cache_release(page); 458 page_cache_release(page);
459 } 459 }
460 bio_put(bio); 460 bio_put(bio);
461 } 461 }
462 return uptodate ? 0 : -EIO; 462 return uptodate ? 0 : -EIO;
463 } 463 }
464 464
465 /* 465 /*
466 * Wait on and process all in-flight BIOs. This must only be called once 466 * Wait on and process all in-flight BIOs. This must only be called once
467 * all bios have been issued so that the refcount can only decrease. 467 * all bios have been issued so that the refcount can only decrease.
468 * This just waits for all bios to make it through dio_bio_complete. IO 468 * This just waits for all bios to make it through dio_bio_complete. IO
469 * errors are propagated through dio->io_error and should be propagated via 469 * errors are propagated through dio->io_error and should be propagated via
470 * dio_complete(). 470 * dio_complete().
471 */ 471 */
472 static void dio_await_completion(struct dio *dio) 472 static void dio_await_completion(struct dio *dio)
473 { 473 {
474 struct bio *bio; 474 struct bio *bio;
475 do { 475 do {
476 bio = dio_await_one(dio); 476 bio = dio_await_one(dio);
477 if (bio) 477 if (bio)
478 dio_bio_complete(dio, bio); 478 dio_bio_complete(dio, bio);
479 } while (bio); 479 } while (bio);
480 } 480 }
481 481
482 /* 482 /*
483 * A really large O_DIRECT read or write can generate a lot of BIOs. So 483 * A really large O_DIRECT read or write can generate a lot of BIOs. So
484 * to keep the memory consumption sane we periodically reap any completed BIOs 484 * to keep the memory consumption sane we periodically reap any completed BIOs
485 * during the BIO generation phase. 485 * during the BIO generation phase.
486 * 486 *
487 * This also helps to limit the peak amount of pinned userspace memory. 487 * This also helps to limit the peak amount of pinned userspace memory.
488 */ 488 */
489 static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) 489 static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
490 { 490 {
491 int ret = 0; 491 int ret = 0;
492 492
493 if (sdio->reap_counter++ >= 64) { 493 if (sdio->reap_counter++ >= 64) {
494 while (dio->bio_list) { 494 while (dio->bio_list) {
495 unsigned long flags; 495 unsigned long flags;
496 struct bio *bio; 496 struct bio *bio;
497 int ret2; 497 int ret2;
498 498
499 spin_lock_irqsave(&dio->bio_lock, flags); 499 spin_lock_irqsave(&dio->bio_lock, flags);
500 bio = dio->bio_list; 500 bio = dio->bio_list;
501 dio->bio_list = bio->bi_private; 501 dio->bio_list = bio->bi_private;
502 spin_unlock_irqrestore(&dio->bio_lock, flags); 502 spin_unlock_irqrestore(&dio->bio_lock, flags);
503 ret2 = dio_bio_complete(dio, bio); 503 ret2 = dio_bio_complete(dio, bio);
504 if (ret == 0) 504 if (ret == 0)
505 ret = ret2; 505 ret = ret2;
506 } 506 }
507 sdio->reap_counter = 0; 507 sdio->reap_counter = 0;
508 } 508 }
509 return ret; 509 return ret;
510 } 510 }
511 511
512 /* 512 /*
513 * Call into the fs to map some more disk blocks. We record the current number 513 * Call into the fs to map some more disk blocks. We record the current number
514 * of available blocks at sdio->blocks_available. These are in units of the 514 * of available blocks at sdio->blocks_available. These are in units of the
515 * fs blocksize, (1 << inode->i_blkbits). 515 * fs blocksize, (1 << inode->i_blkbits).
516 * 516 *
517 * The fs is allowed to map lots of blocks at once. If it wants to do that, 517 * The fs is allowed to map lots of blocks at once. If it wants to do that,
518 * it uses the passed inode-relative block number as the file offset, as usual. 518 * it uses the passed inode-relative block number as the file offset, as usual.
519 * 519 *
520 * get_block() is passed the number of i_blkbits-sized blocks which direct_io 520 * get_block() is passed the number of i_blkbits-sized blocks which direct_io
521 * has remaining to do. The fs should not map more than this number of blocks. 521 * has remaining to do. The fs should not map more than this number of blocks.
522 * 522 *
523 * If the fs has mapped a lot of blocks, it should populate bh->b_size to 523 * If the fs has mapped a lot of blocks, it should populate bh->b_size to
524 * indicate how much contiguous disk space has been made available at 524 * indicate how much contiguous disk space has been made available at
525 * bh->b_blocknr. 525 * bh->b_blocknr.
526 * 526 *
527 * If *any* of the mapped blocks are new, then the fs must set buffer_new(). 527 * If *any* of the mapped blocks are new, then the fs must set buffer_new().
528 * This isn't very efficient... 528 * This isn't very efficient...
529 * 529 *
530 * In the case of filesystem holes: the fs may return an arbitrarily-large 530 * In the case of filesystem holes: the fs may return an arbitrarily-large
531 * hole by returning an appropriate value in b_size and by clearing 531 * hole by returning an appropriate value in b_size and by clearing
532 * buffer_mapped(). However the direct-io code will only process holes one 532 * buffer_mapped(). However the direct-io code will only process holes one
533 * block at a time - it will repeatedly call get_block() as it walks the hole. 533 * block at a time - it will repeatedly call get_block() as it walks the hole.
534 */ 534 */
535 static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, 535 static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
536 struct buffer_head *map_bh) 536 struct buffer_head *map_bh)
537 { 537 {
538 int ret; 538 int ret;
539 sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ 539 sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
540 sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ 540 sector_t fs_endblk; /* Into file, in filesystem-sized blocks */
541 unsigned long fs_count; /* Number of filesystem-sized blocks */ 541 unsigned long fs_count; /* Number of filesystem-sized blocks */
542 int create; 542 int create;
543 unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor; 543 unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
544 544
545 /* 545 /*
546 * If there was a memory error and we've overwritten all the 546 * If there was a memory error and we've overwritten all the
547 * mapped blocks then we can now return that memory error 547 * mapped blocks then we can now return that memory error
548 */ 548 */
549 ret = dio->page_errors; 549 ret = dio->page_errors;
550 if (ret == 0) { 550 if (ret == 0) {
551 BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); 551 BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
552 fs_startblk = sdio->block_in_file >> sdio->blkfactor; 552 fs_startblk = sdio->block_in_file >> sdio->blkfactor;
553 fs_endblk = (sdio->final_block_in_request - 1) >> 553 fs_endblk = (sdio->final_block_in_request - 1) >>
554 sdio->blkfactor; 554 sdio->blkfactor;
555 fs_count = fs_endblk - fs_startblk + 1; 555 fs_count = fs_endblk - fs_startblk + 1;
556 556
557 map_bh->b_state = 0; 557 map_bh->b_state = 0;
558 map_bh->b_size = fs_count << i_blkbits; 558 map_bh->b_size = fs_count << i_blkbits;
559 559
560 /* 560 /*
561 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we 561 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
562 * forbid block creations: only overwrites are permitted. 562 * forbid block creations: only overwrites are permitted.
563 * We will return early to the caller once we see an 563 * We will return early to the caller once we see an
564 * unmapped buffer head returned, and the caller will fall 564 * unmapped buffer head returned, and the caller will fall
565 * back to buffered I/O. 565 * back to buffered I/O.
566 * 566 *
567 * Otherwise the decision is left to the get_blocks method, 567 * Otherwise the decision is left to the get_blocks method,
568 * which may decide to handle it or also return an unmapped 568 * which may decide to handle it or also return an unmapped
569 * buffer head. 569 * buffer head.
570 */ 570 */
571 create = dio->rw & WRITE; 571 create = dio->rw & WRITE;
572 if (dio->flags & DIO_SKIP_HOLES) { 572 if (dio->flags & DIO_SKIP_HOLES) {
573 if (sdio->block_in_file < (i_size_read(dio->inode) >> 573 if (sdio->block_in_file < (i_size_read(dio->inode) >>
574 sdio->blkbits)) 574 sdio->blkbits))
575 create = 0; 575 create = 0;
576 } 576 }
577 577
578 ret = (*sdio->get_block)(dio->inode, fs_startblk, 578 ret = (*sdio->get_block)(dio->inode, fs_startblk,
579 map_bh, create); 579 map_bh, create);
580 580
581 /* Store for completion */ 581 /* Store for completion */
582 dio->private = map_bh->b_private; 582 dio->private = map_bh->b_private;
583 } 583 }
584 return ret; 584 return ret;
585 } 585 }
586 586
587 /* 587 /*
588 * There is no bio. Make one now. 588 * There is no bio. Make one now.
589 */ 589 */
590 static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, 590 static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
591 sector_t start_sector, struct buffer_head *map_bh) 591 sector_t start_sector, struct buffer_head *map_bh)
592 { 592 {
593 sector_t sector; 593 sector_t sector;
594 int ret, nr_pages; 594 int ret, nr_pages;
595 595
596 ret = dio_bio_reap(dio, sdio); 596 ret = dio_bio_reap(dio, sdio);
597 if (ret) 597 if (ret)
598 goto out; 598 goto out;
599 sector = start_sector << (sdio->blkbits - 9); 599 sector = start_sector << (sdio->blkbits - 9);
600 nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); 600 nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
601 nr_pages = min(nr_pages, BIO_MAX_PAGES); 601 nr_pages = min(nr_pages, BIO_MAX_PAGES);
602 BUG_ON(nr_pages <= 0); 602 BUG_ON(nr_pages <= 0);
603 dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); 603 dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
604 sdio->boundary = 0; 604 sdio->boundary = 0;
605 out: 605 out:
606 return ret; 606 return ret;
607 } 607 }
608 608
609 /* 609 /*
610 * Attempt to put the current chunk of 'cur_page' into the current BIO. If 610 * Attempt to put the current chunk of 'cur_page' into the current BIO. If
611 * that was successful then update final_block_in_bio and take a ref against 611 * that was successful then update final_block_in_bio and take a ref against
612 * the just-added page. 612 * the just-added page.
613 * 613 *
614 * Return zero on success. Non-zero means the caller needs to start a new BIO. 614 * Return zero on success. Non-zero means the caller needs to start a new BIO.
615 */ 615 */
616 static inline int dio_bio_add_page(struct dio_submit *sdio) 616 static inline int dio_bio_add_page(struct dio_submit *sdio)
617 { 617 {
618 int ret; 618 int ret;
619 619
620 ret = bio_add_page(sdio->bio, sdio->cur_page, 620 ret = bio_add_page(sdio->bio, sdio->cur_page,
621 sdio->cur_page_len, sdio->cur_page_offset); 621 sdio->cur_page_len, sdio->cur_page_offset);
622 if (ret == sdio->cur_page_len) { 622 if (ret == sdio->cur_page_len) {
623 /* 623 /*
624 * Decrement count only, if we are done with this page 624 * Decrement count only, if we are done with this page
625 */ 625 */
626 if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) 626 if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
627 sdio->pages_in_io--; 627 sdio->pages_in_io--;
628 page_cache_get(sdio->cur_page); 628 page_cache_get(sdio->cur_page);
629 sdio->final_block_in_bio = sdio->cur_page_block + 629 sdio->final_block_in_bio = sdio->cur_page_block +
630 (sdio->cur_page_len >> sdio->blkbits); 630 (sdio->cur_page_len >> sdio->blkbits);
631 ret = 0; 631 ret = 0;
632 } else { 632 } else {
633 ret = 1; 633 ret = 1;
634 } 634 }
635 return ret; 635 return ret;
636 } 636 }
637 637
638 /* 638 /*
639 * Put cur_page under IO. The section of cur_page which is described by 639 * Put cur_page under IO. The section of cur_page which is described by
640 * cur_page_offset,cur_page_len is put into a BIO. The section of cur_page 640 * cur_page_offset,cur_page_len is put into a BIO. The section of cur_page
641 * starts on-disk at cur_page_block. 641 * starts on-disk at cur_page_block.
642 * 642 *
643 * We take a ref against the page here (on behalf of its presence in the bio). 643 * We take a ref against the page here (on behalf of its presence in the bio).
644 * 644 *
645 * The caller of this function is responsible for removing cur_page from the 645 * The caller of this function is responsible for removing cur_page from the
646 * dio, and for dropping the refcount which came from that presence. 646 * dio, and for dropping the refcount which came from that presence.
647 */ 647 */
648 static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, 648 static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
649 struct buffer_head *map_bh) 649 struct buffer_head *map_bh)
650 { 650 {
651 int ret = 0; 651 int ret = 0;
652 652
653 if (sdio->bio) { 653 if (sdio->bio) {
654 loff_t cur_offset = sdio->cur_page_fs_offset; 654 loff_t cur_offset = sdio->cur_page_fs_offset;
655 loff_t bio_next_offset = sdio->logical_offset_in_bio + 655 loff_t bio_next_offset = sdio->logical_offset_in_bio +
656 sdio->bio->bi_size; 656 sdio->bio->bi_size;
657 657
658 /* 658 /*
659 * See whether this new request is contiguous with the old. 659 * See whether this new request is contiguous with the old.
660 * 660 *
661 * Btrfs cannot handle having logically non-contiguous requests 661 * Btrfs cannot handle having logically non-contiguous requests
662 * submitted. For example if you have 662 * submitted. For example if you have
663 * 663 *
664 * Logical: [0-4095][HOLE][8192-12287] 664 * Logical: [0-4095][HOLE][8192-12287]
665 * Physical: [0-4095] [4096-8191] 665 * Physical: [0-4095] [4096-8191]
666 * 666 *
667 * We cannot submit those pages together as one BIO. So if our 667 * We cannot submit those pages together as one BIO. So if our
668 * current logical offset in the file does not equal what would 668 * current logical offset in the file does not equal what would
669 * be the next logical offset in the bio, submit the bio we 669 * be the next logical offset in the bio, submit the bio we
670 * have. 670 * have.
671 */ 671 */
672 if (sdio->final_block_in_bio != sdio->cur_page_block || 672 if (sdio->final_block_in_bio != sdio->cur_page_block ||
673 cur_offset != bio_next_offset) 673 cur_offset != bio_next_offset)
674 dio_bio_submit(dio, sdio); 674 dio_bio_submit(dio, sdio);
675 /* 675 /*
676 * Submit now if the underlying fs is about to perform a 676 * Submit now if the underlying fs is about to perform a
677 * metadata read 677 * metadata read
678 */ 678 */
679 else if (sdio->boundary) 679 else if (sdio->boundary)
680 dio_bio_submit(dio, sdio); 680 dio_bio_submit(dio, sdio);
681 } 681 }
682 682
683 if (sdio->bio == NULL) { 683 if (sdio->bio == NULL) {
684 ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); 684 ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
685 if (ret) 685 if (ret)
686 goto out; 686 goto out;
687 } 687 }
688 688
689 if (dio_bio_add_page(sdio) != 0) { 689 if (dio_bio_add_page(sdio) != 0) {
690 dio_bio_submit(dio, sdio); 690 dio_bio_submit(dio, sdio);
691 ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); 691 ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
692 if (ret == 0) { 692 if (ret == 0) {
693 ret = dio_bio_add_page(sdio); 693 ret = dio_bio_add_page(sdio);
694 BUG_ON(ret != 0); 694 BUG_ON(ret != 0);
695 } 695 }
696 } 696 }
697 out: 697 out:
698 return ret; 698 return ret;
699 } 699 }
700 700
701 /* 701 /*
702 * An autonomous function to put a chunk of a page under deferred IO. 702 * An autonomous function to put a chunk of a page under deferred IO.
703 * 703 *
704 * The caller doesn't actually know (or care) whether this piece of page is in 704 * The caller doesn't actually know (or care) whether this piece of page is in
705 * a BIO, or is under IO or whatever. We just take care of all possible 705 * a BIO, or is under IO or whatever. We just take care of all possible
706 * situations here. The separation between the logic of do_direct_IO() and 706 * situations here. The separation between the logic of do_direct_IO() and
707 * that of submit_page_section() is important for clarity. Please don't break. 707 * that of submit_page_section() is important for clarity. Please don't break.
708 * 708 *
709 * The chunk of page starts on-disk at blocknr. 709 * The chunk of page starts on-disk at blocknr.
710 * 710 *
711 * We perform deferred IO, by recording the last-submitted page inside our 711 * We perform deferred IO, by recording the last-submitted page inside our
712 * private part of the dio structure. If possible, we just expand the IO 712 * private part of the dio structure. If possible, we just expand the IO
713 * across that page here. 713 * across that page here.
714 * 714 *
715 * If that doesn't work out then we put the old page into the bio and add this 715 * If that doesn't work out then we put the old page into the bio and add this
716 * page to the dio instead. 716 * page to the dio instead.
717 */ 717 */
718 static inline int 718 static inline int
719 submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, 719 submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
720 unsigned offset, unsigned len, sector_t blocknr, 720 unsigned offset, unsigned len, sector_t blocknr,
721 struct buffer_head *map_bh) 721 struct buffer_head *map_bh)
722 { 722 {
723 int ret = 0; 723 int ret = 0;
724 724
725 if (dio->rw & WRITE) { 725 if (dio->rw & WRITE) {
726 /* 726 /*
727 * Read accounting is performed in submit_bio() 727 * Read accounting is performed in submit_bio()
728 */ 728 */
729 task_io_account_write(len); 729 task_io_account_write(len);
730 } 730 }
731 731
732 /* 732 /*
733 * Can we just grow the current page's presence in the dio? 733 * Can we just grow the current page's presence in the dio?
734 */ 734 */
735 if (sdio->cur_page == page && 735 if (sdio->cur_page == page &&
736 sdio->cur_page_offset + sdio->cur_page_len == offset && 736 sdio->cur_page_offset + sdio->cur_page_len == offset &&
737 sdio->cur_page_block + 737 sdio->cur_page_block +
738 (sdio->cur_page_len >> sdio->blkbits) == blocknr) { 738 (sdio->cur_page_len >> sdio->blkbits) == blocknr) {
739 sdio->cur_page_len += len; 739 sdio->cur_page_len += len;
740 740
741 /* 741 /*
742 * If sdio->boundary then we want to schedule the IO now to 742 * If sdio->boundary then we want to schedule the IO now to
743 * avoid metadata seeks. 743 * avoid metadata seeks.
744 */ 744 */
745 if (sdio->boundary) { 745 if (sdio->boundary) {
746 ret = dio_send_cur_page(dio, sdio, map_bh); 746 ret = dio_send_cur_page(dio, sdio, map_bh);
747 page_cache_release(sdio->cur_page); 747 page_cache_release(sdio->cur_page);
748 sdio->cur_page = NULL; 748 sdio->cur_page = NULL;
749 } 749 }
750 goto out; 750 goto out;
751 } 751 }
752 752
753 /* 753 /*
754 * If there's a deferred page already there then send it. 754 * If there's a deferred page already there then send it.
755 */ 755 */
756 if (sdio->cur_page) { 756 if (sdio->cur_page) {
757 ret = dio_send_cur_page(dio, sdio, map_bh); 757 ret = dio_send_cur_page(dio, sdio, map_bh);
758 page_cache_release(sdio->cur_page); 758 page_cache_release(sdio->cur_page);
759 sdio->cur_page = NULL; 759 sdio->cur_page = NULL;
760 if (ret) 760 if (ret)
761 goto out; 761 goto out;
762 } 762 }
763 763
764 page_cache_get(page); /* It is in dio */ 764 page_cache_get(page); /* It is in dio */
765 sdio->cur_page = page; 765 sdio->cur_page = page;
766 sdio->cur_page_offset = offset; 766 sdio->cur_page_offset = offset;
767 sdio->cur_page_len = len; 767 sdio->cur_page_len = len;
768 sdio->cur_page_block = blocknr; 768 sdio->cur_page_block = blocknr;
769 sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; 769 sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
770 out: 770 out:
771 return ret; 771 return ret;
772 } 772 }
773 773
774 /* 774 /*
775 * Clean any dirty buffers in the blockdev mapping which alias newly-created 775 * Clean any dirty buffers in the blockdev mapping which alias newly-created
776 * file blocks. Only called for S_ISREG files - blockdevs do not set 776 * file blocks. Only called for S_ISREG files - blockdevs do not set
777 * buffer_new 777 * buffer_new
778 */ 778 */
779 static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) 779 static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh)
780 { 780 {
781 unsigned i; 781 unsigned i;
782 unsigned nblocks; 782 unsigned nblocks;
783 783
784 nblocks = map_bh->b_size >> dio->inode->i_blkbits; 784 nblocks = map_bh->b_size >> dio->inode->i_blkbits;
785 785
786 for (i = 0; i < nblocks; i++) { 786 for (i = 0; i < nblocks; i++) {
787 unmap_underlying_metadata(map_bh->b_bdev, 787 unmap_underlying_metadata(map_bh->b_bdev,
788 map_bh->b_blocknr + i); 788 map_bh->b_blocknr + i);
789 } 789 }
790 } 790 }
791 791
792 /* 792 /*
793 * If we are not writing the entire block and get_block() allocated 793 * If we are not writing the entire block and get_block() allocated
794 * the block for us, we need to fill-in the unused portion of the 794 * the block for us, we need to fill-in the unused portion of the
795 * block with zeros. This happens only if user-buffer, fileoffset or 795 * block with zeros. This happens only if user-buffer, fileoffset or
796 * io length is not filesystem block-size multiple. 796 * io length is not filesystem block-size multiple.
797 * 797 *
798 * `end' is zero if we're doing the start of the IO, 1 at the end of the 798 * `end' is zero if we're doing the start of the IO, 1 at the end of the
799 * IO. 799 * IO.
800 */ 800 */
801 static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, 801 static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio,
802 int end, struct buffer_head *map_bh) 802 int end, struct buffer_head *map_bh)
803 { 803 {
804 unsigned dio_blocks_per_fs_block; 804 unsigned dio_blocks_per_fs_block;
805 unsigned this_chunk_blocks; /* In dio_blocks */ 805 unsigned this_chunk_blocks; /* In dio_blocks */
806 unsigned this_chunk_bytes; 806 unsigned this_chunk_bytes;
807 struct page *page; 807 struct page *page;
808 808
809 sdio->start_zero_done = 1; 809 sdio->start_zero_done = 1;
810 if (!sdio->blkfactor || !buffer_new(map_bh)) 810 if (!sdio->blkfactor || !buffer_new(map_bh))
811 return; 811 return;
812 812
813 dio_blocks_per_fs_block = 1 << sdio->blkfactor; 813 dio_blocks_per_fs_block = 1 << sdio->blkfactor;
814 this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1); 814 this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1);
815 815
816 if (!this_chunk_blocks) 816 if (!this_chunk_blocks)
817 return; 817 return;
818 818
819 /* 819 /*
820 * We need to zero out part of an fs block. It is either at the 820 * We need to zero out part of an fs block. It is either at the
821 * beginning or the end of the fs block. 821 * beginning or the end of the fs block.
822 */ 822 */
823 if (end) 823 if (end)
824 this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; 824 this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
825 825
826 this_chunk_bytes = this_chunk_blocks << sdio->blkbits; 826 this_chunk_bytes = this_chunk_blocks << sdio->blkbits;
827 827
828 page = ZERO_PAGE(0); 828 page = ZERO_PAGE(0);
829 if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes, 829 if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes,
830 sdio->next_block_for_io, map_bh)) 830 sdio->next_block_for_io, map_bh))
831 return; 831 return;
832 832
833 sdio->next_block_for_io += this_chunk_blocks; 833 sdio->next_block_for_io += this_chunk_blocks;
834 } 834 }
835 835
836 /* 836 /*
837 * Walk the user pages, and the file, mapping blocks to disk and generating 837 * Walk the user pages, and the file, mapping blocks to disk and generating
838 * a sequence of (page,offset,len,block) mappings. These mappings are injected 838 * a sequence of (page,offset,len,block) mappings. These mappings are injected
839 * into submit_page_section(), which takes care of the next stage of submission 839 * into submit_page_section(), which takes care of the next stage of submission
840 * 840 *
841 * Direct IO against a blockdev is different from a file. Because we can 841 * Direct IO against a blockdev is different from a file. Because we can
842 * happily perform page-sized but 512-byte aligned IOs. It is important that 842 * happily perform page-sized but 512-byte aligned IOs. It is important that
843 * blockdev IO be able to have fine alignment and large sizes. 843 * blockdev IO be able to have fine alignment and large sizes.
844 * 844 *
845 * So what we do is to permit the ->get_block function to populate bh.b_size 845 * So what we do is to permit the ->get_block function to populate bh.b_size
846 * with the size of IO which is permitted at this offset and this i_blkbits. 846 * with the size of IO which is permitted at this offset and this i_blkbits.
847 * 847 *
848 * For best results, the blockdev should be set up with 512-byte i_blkbits and 848 * For best results, the blockdev should be set up with 512-byte i_blkbits and
849 * it should set b_size to PAGE_SIZE or more inside get_block(). This gives 849 * it should set b_size to PAGE_SIZE or more inside get_block(). This gives
850 * fine alignment but still allows this function to work in PAGE_SIZE units. 850 * fine alignment but still allows this function to work in PAGE_SIZE units.
851 */ 851 */
852 static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, 852 static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
853 struct buffer_head *map_bh) 853 struct buffer_head *map_bh)
854 { 854 {
855 const unsigned blkbits = sdio->blkbits; 855 const unsigned blkbits = sdio->blkbits;
856 const unsigned blocks_per_page = PAGE_SIZE >> blkbits; 856 const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
857 struct page *page; 857 struct page *page;
858 unsigned block_in_page; 858 unsigned block_in_page;
859 int ret = 0; 859 int ret = 0;
860 860
861 /* The I/O can start at any block offset within the first page */ 861 /* The I/O can start at any block offset within the first page */
862 block_in_page = sdio->first_block_in_page; 862 block_in_page = sdio->first_block_in_page;
863 863
864 while (sdio->block_in_file < sdio->final_block_in_request) { 864 while (sdio->block_in_file < sdio->final_block_in_request) {
865 page = dio_get_page(dio, sdio); 865 page = dio_get_page(dio, sdio);
866 if (IS_ERR(page)) { 866 if (IS_ERR(page)) {
867 ret = PTR_ERR(page); 867 ret = PTR_ERR(page);
868 goto out; 868 goto out;
869 } 869 }
870 870
871 while (block_in_page < blocks_per_page) { 871 while (block_in_page < blocks_per_page) {
872 unsigned offset_in_page = block_in_page << blkbits; 872 unsigned offset_in_page = block_in_page << blkbits;
873 unsigned this_chunk_bytes; /* # of bytes mapped */ 873 unsigned this_chunk_bytes; /* # of bytes mapped */
874 unsigned this_chunk_blocks; /* # of blocks */ 874 unsigned this_chunk_blocks; /* # of blocks */
875 unsigned u; 875 unsigned u;
876 876
877 if (sdio->blocks_available == 0) { 877 if (sdio->blocks_available == 0) {
878 /* 878 /*
879 * Need to go and map some more disk 879 * Need to go and map some more disk
880 */ 880 */
881 unsigned long blkmask; 881 unsigned long blkmask;
882 unsigned long dio_remainder; 882 unsigned long dio_remainder;
883 883
884 ret = get_more_blocks(dio, sdio, map_bh); 884 ret = get_more_blocks(dio, sdio, map_bh);
885 if (ret) { 885 if (ret) {
886 page_cache_release(page); 886 page_cache_release(page);
887 goto out; 887 goto out;
888 } 888 }
889 if (!buffer_mapped(map_bh)) 889 if (!buffer_mapped(map_bh))
890 goto do_holes; 890 goto do_holes;
891 891
892 sdio->blocks_available = 892 sdio->blocks_available =
893 map_bh->b_size >> sdio->blkbits; 893 map_bh->b_size >> sdio->blkbits;
894 sdio->next_block_for_io = 894 sdio->next_block_for_io =
895 map_bh->b_blocknr << sdio->blkfactor; 895 map_bh->b_blocknr << sdio->blkfactor;
896 if (buffer_new(map_bh)) 896 if (buffer_new(map_bh))
897 clean_blockdev_aliases(dio, map_bh); 897 clean_blockdev_aliases(dio, map_bh);
898 898
899 if (!sdio->blkfactor) 899 if (!sdio->blkfactor)
900 goto do_holes; 900 goto do_holes;
901 901
902 blkmask = (1 << sdio->blkfactor) - 1; 902 blkmask = (1 << sdio->blkfactor) - 1;
903 dio_remainder = (sdio->block_in_file & blkmask); 903 dio_remainder = (sdio->block_in_file & blkmask);
904 904
905 /* 905 /*
906 * If we are at the start of IO and that IO 906 * If we are at the start of IO and that IO
907 * starts partway into a fs-block, 907 * starts partway into a fs-block,
908 * dio_remainder will be non-zero. If the IO 908 * dio_remainder will be non-zero. If the IO
909 * is a read then we can simply advance the IO 909 * is a read then we can simply advance the IO
910 * cursor to the first block which is to be 910 * cursor to the first block which is to be
911 * read. But if the IO is a write and the 911 * read. But if the IO is a write and the
912 * block was newly allocated we cannot do that; 912 * block was newly allocated we cannot do that;
913 * the start of the fs block must be zeroed out 913 * the start of the fs block must be zeroed out
914 * on-disk 914 * on-disk
915 */ 915 */
916 if (!buffer_new(map_bh)) 916 if (!buffer_new(map_bh))
917 sdio->next_block_for_io += dio_remainder; 917 sdio->next_block_for_io += dio_remainder;
918 sdio->blocks_available -= dio_remainder; 918 sdio->blocks_available -= dio_remainder;
919 } 919 }
920 do_holes: 920 do_holes:
921 /* Handle holes */ 921 /* Handle holes */
922 if (!buffer_mapped(map_bh)) { 922 if (!buffer_mapped(map_bh)) {
923 loff_t i_size_aligned; 923 loff_t i_size_aligned;
924 924
925 /* AKPM: eargh, -ENOTBLK is a hack */ 925 /* AKPM: eargh, -ENOTBLK is a hack */
926 if (dio->rw & WRITE) { 926 if (dio->rw & WRITE) {
927 page_cache_release(page); 927 page_cache_release(page);
928 return -ENOTBLK; 928 return -ENOTBLK;
929 } 929 }
930 930
931 /* 931 /*
932 * Be sure to account for a partial block as the 932 * Be sure to account for a partial block as the
933 * last block in the file 933 * last block in the file
934 */ 934 */
935 i_size_aligned = ALIGN(i_size_read(dio->inode), 935 i_size_aligned = ALIGN(i_size_read(dio->inode),
936 1 << blkbits); 936 1 << blkbits);
937 if (sdio->block_in_file >= 937 if (sdio->block_in_file >=
938 i_size_aligned >> blkbits) { 938 i_size_aligned >> blkbits) {
939 /* We hit eof */ 939 /* We hit eof */
940 page_cache_release(page); 940 page_cache_release(page);
941 goto out; 941 goto out;
942 } 942 }
943 zero_user(page, block_in_page << blkbits, 943 zero_user(page, block_in_page << blkbits,
944 1 << blkbits); 944 1 << blkbits);
945 sdio->block_in_file++; 945 sdio->block_in_file++;
946 block_in_page++; 946 block_in_page++;
947 goto next_block; 947 goto next_block;
948 } 948 }
949 949
950 /* 950 /*
951 * If we're performing IO which has an alignment which 951 * If we're performing IO which has an alignment which
952 * is finer than the underlying fs, go check to see if 952 * is finer than the underlying fs, go check to see if
953 * we must zero out the start of this block. 953 * we must zero out the start of this block.
954 */ 954 */
955 if (unlikely(sdio->blkfactor && !sdio->start_zero_done)) 955 if (unlikely(sdio->blkfactor && !sdio->start_zero_done))
956 dio_zero_block(dio, sdio, 0, map_bh); 956 dio_zero_block(dio, sdio, 0, map_bh);
957 957
958 /* 958 /*
959 * Work out, in this_chunk_blocks, how much disk we 959 * Work out, in this_chunk_blocks, how much disk we
960 * can add to this page 960 * can add to this page
961 */ 961 */
962 this_chunk_blocks = sdio->blocks_available; 962 this_chunk_blocks = sdio->blocks_available;
963 u = (PAGE_SIZE - offset_in_page) >> blkbits; 963 u = (PAGE_SIZE - offset_in_page) >> blkbits;
964 if (this_chunk_blocks > u) 964 if (this_chunk_blocks > u)
965 this_chunk_blocks = u; 965 this_chunk_blocks = u;
966 u = sdio->final_block_in_request - sdio->block_in_file; 966 u = sdio->final_block_in_request - sdio->block_in_file;
967 if (this_chunk_blocks > u) 967 if (this_chunk_blocks > u)
968 this_chunk_blocks = u; 968 this_chunk_blocks = u;
969 this_chunk_bytes = this_chunk_blocks << blkbits; 969 this_chunk_bytes = this_chunk_blocks << blkbits;
970 BUG_ON(this_chunk_bytes == 0); 970 BUG_ON(this_chunk_bytes == 0);
971 971
972 sdio->boundary = buffer_boundary(map_bh); 972 sdio->boundary = buffer_boundary(map_bh);
973 ret = submit_page_section(dio, sdio, page, 973 ret = submit_page_section(dio, sdio, page,
974 offset_in_page, 974 offset_in_page,
975 this_chunk_bytes, 975 this_chunk_bytes,
976 sdio->next_block_for_io, 976 sdio->next_block_for_io,
977 map_bh); 977 map_bh);
978 if (ret) { 978 if (ret) {
979 page_cache_release(page); 979 page_cache_release(page);
980 goto out; 980 goto out;
981 } 981 }
982 sdio->next_block_for_io += this_chunk_blocks; 982 sdio->next_block_for_io += this_chunk_blocks;
983 983
984 sdio->block_in_file += this_chunk_blocks; 984 sdio->block_in_file += this_chunk_blocks;
985 block_in_page += this_chunk_blocks; 985 block_in_page += this_chunk_blocks;
986 sdio->blocks_available -= this_chunk_blocks; 986 sdio->blocks_available -= this_chunk_blocks;
987 next_block: 987 next_block:
988 BUG_ON(sdio->block_in_file > sdio->final_block_in_request); 988 BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
989 if (sdio->block_in_file == sdio->final_block_in_request) 989 if (sdio->block_in_file == sdio->final_block_in_request)
990 break; 990 break;
991 } 991 }
992 992
993 /* Drop the ref which was taken in get_user_pages() */ 993 /* Drop the ref which was taken in get_user_pages() */
994 page_cache_release(page); 994 page_cache_release(page);
995 block_in_page = 0; 995 block_in_page = 0;
996 } 996 }
997 out: 997 out:
998 return ret; 998 return ret;
999 } 999 }
1000 1000
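The sub-block bookkeeping near the top of the loop above is easier to follow with concrete numbers, chosen purely for illustration: with 4 KiB filesystem blocks (i_blkbits = 12) and 512-byte dio blocks (blkbits = 9), blkfactor = 3 and blkmask = 0x7. A read starting at file offset 1536 has block_in_file = 3, so dio_remainder = 3; get_more_blocks() maps the whole 4 KiB block (blocks_available = 8), next_block_for_io is advanced by 3 sub-blocks to the sector actually being read, and blocks_available drops to 5. For a write into a freshly allocated (buffer_new) block the cursor is deliberately not advanced, because the leading sub-blocks of the fs block still have to be zeroed on disk; dio_zero_block() takes care of that.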
1001 static inline int drop_refcount(struct dio *dio) 1001 static inline int drop_refcount(struct dio *dio)
1002 { 1002 {
1003 int ret2; 1003 int ret2;
1004 unsigned long flags; 1004 unsigned long flags;
1005 1005
1006 /* 1006 /*
1007 * Sync will always be dropping the final ref and completing the 1007 * Sync will always be dropping the final ref and completing the
1008 * operation. AIO can do so as well, either for a broken operation as 1008 * operation. AIO can do so as well, either for a broken operation as
1009 * described above or if all the bios race to complete before we get here. 1009 * described above or if all the bios race to complete before we get here.
1010 * In that case dio_complete() translates the EIOCBQUEUED into the proper 1010 * In that case dio_complete() translates the EIOCBQUEUED into the proper
1011 * return code that the caller will hand to aio_complete(). 1011 * return code that the caller will hand to aio_complete().
1012 * 1012 *
1013 * This is managed by the bio_lock instead of being an atomic_t so that 1013 * This is managed by the bio_lock instead of being an atomic_t so that
1014 * completion paths can drop their ref and use the remaining count to 1014 * completion paths can drop their ref and use the remaining count to
1015 * decide to wake the submission path atomically. 1015 * decide to wake the submission path atomically.
1016 */ 1016 */
1017 spin_lock_irqsave(&dio->bio_lock, flags); 1017 spin_lock_irqsave(&dio->bio_lock, flags);
1018 ret2 = --dio->refcount; 1018 ret2 = --dio->refcount;
1019 spin_unlock_irqrestore(&dio->bio_lock, flags); 1019 spin_unlock_irqrestore(&dio->bio_lock, flags);
1020 return ret2; 1020 return ret2;
1021 } 1021 }
1022 1022
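The design described in the comment above is what lets a bio completion path drop its reference and test for a sleeping submitter in one atomic step. Below is a minimal sketch of that completion-side counterpart, assuming the dio->waiter field used elsewhere in this file; the function name example_dio_bio_done is hypothetical, not part of this diff.

    static void example_dio_bio_done(struct dio *dio)
    {
            unsigned long flags;

            spin_lock_irqsave(&dio->bio_lock, flags);
            /*
             * Dropping our ref and checking "is the submitter the only
             * holder left, and is it sleeping?" happen under the same lock,
             * so the wakeup cannot be missed.
             */
            if (--dio->refcount == 1 && dio->waiter)
                    wake_up_process(dio->waiter);
            spin_unlock_irqrestore(&dio->bio_lock, flags);
    }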
1023 /* 1023 /*
1024 * This is a library function for use by filesystem drivers. 1024 * This is a library function for use by filesystem drivers.
1025 * 1025 *
1026 * The locking rules are governed by the flags parameter: 1026 * The locking rules are governed by the flags parameter:
1027 * - if the flags value contains DIO_LOCKING we use a fancy locking 1027 * - if the flags value contains DIO_LOCKING we use a fancy locking
1028 * scheme for dumb filesystems. 1028 * scheme for dumb filesystems.
1029 * For writes this function is called under i_mutex and returns with 1029 * For writes this function is called under i_mutex and returns with
1030 * i_mutex held; for reads, i_mutex is not held on entry, but it is 1030 * i_mutex held; for reads, i_mutex is not held on entry, but it is
1031 * taken and dropped again before returning. 1031 * taken and dropped again before returning.
1032 * - if the flags value does NOT contain DIO_LOCKING we don't use any 1032 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1033 * internal locking but rather rely on the filesystem to synchronize 1033 * internal locking but rather rely on the filesystem to synchronize
1034 * direct I/O reads/writes versus each other and truncate. 1034 * direct I/O reads/writes versus each other and truncate.
1035 * 1035 *
1036 * To help with locking against truncate we increment the i_dio_count 1036 * To help with locking against truncate we increment the i_dio_count
1037 * counter before starting direct I/O, and decrement it once we are done. 1037 * counter before starting direct I/O, and decrement it once we are done.
1038 * Truncate can wait for it to reach zero to provide exclusion. It is 1038 * Truncate can wait for it to reach zero to provide exclusion. It is
1039 * expected that filesystems provide exclusion between new direct I/O 1039 * expected that filesystems provide exclusion between new direct I/O
1040 * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, 1040 * and truncates. For DIO_LOCKING filesystems this is done by i_mutex,
1041 * but other filesystems need to take care of this on their own. 1041 * but other filesystems need to take care of this on their own.
1042 * 1042 *
1043 * NOTE: if you pass "sdio" to anything by pointer make sure that function 1043 * NOTE: if you pass "sdio" to anything by pointer make sure that function
1044 * is always inlined. Otherwise gcc is unable to split the structure into 1044 * is always inlined. Otherwise gcc is unable to split the structure into
1045 * individual fields and will generate much worse code. This is important 1045 * individual fields and will generate much worse code. This is important
1046 * for the whole file. 1046 * for the whole file.
1047 */ 1047 */
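On the truncate side, the i_dio_count scheme described above is typically consumed through inode_dio_wait(), which sleeps until every direct I/O started before that point has dropped its count. A minimal sketch of how a filesystem's truncate path could use it; myfs_truncate and the surrounding details are assumptions for illustration.

    static void myfs_truncate(struct inode *inode, loff_t newsize)
    {
            /* A DIO_LOCKING filesystem already holds i_mutex at this point. */
            inode_dio_wait(inode);          /* wait out in-flight direct I/O */
            truncate_setsize(inode, newsize);
            /* ... free the now-unused blocks ... */
    }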
1048 static inline ssize_t 1048 static inline ssize_t
1049 do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1049 do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1050 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1050 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1051 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1051 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1052 dio_submit_t submit_io, int flags) 1052 dio_submit_t submit_io, int flags)
1053 { 1053 {
1054 int seg; 1054 int seg;
1055 size_t size; 1055 size_t size;
1056 unsigned long addr; 1056 unsigned long addr;
1057 unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); 1057 unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
1058 unsigned blkbits = i_blkbits; 1058 unsigned blkbits = i_blkbits;
1059 unsigned blocksize_mask = (1 << blkbits) - 1; 1059 unsigned blocksize_mask = (1 << blkbits) - 1;
1060 ssize_t retval = -EINVAL; 1060 ssize_t retval = -EINVAL;
1061 loff_t end = offset; 1061 loff_t end = offset;
1062 struct dio *dio; 1062 struct dio *dio;
1063 struct dio_submit sdio = { 0, }; 1063 struct dio_submit sdio = { 0, };
1064 unsigned long user_addr; 1064 unsigned long user_addr;
1065 size_t bytes; 1065 size_t bytes;
1066 struct buffer_head map_bh = { 0, }; 1066 struct buffer_head map_bh = { 0, };
1067 struct blk_plug plug; 1067 struct blk_plug plug;
1068 1068
1069 if (rw & WRITE) 1069 if (rw & WRITE)
1070 rw = WRITE_ODIRECT; 1070 rw = WRITE_ODIRECT;
1071 1071
1072 /* 1072 /*
1073 * Avoid references to bdev if not absolutely needed to give 1073 * Avoid references to bdev if not absolutely needed to give
1074 * the early prefetch in the caller enough time. 1074 * the early prefetch in the caller enough time.
1075 */ 1075 */
1076 1076
1077 if (offset & blocksize_mask) { 1077 if (offset & blocksize_mask) {
1078 if (bdev) 1078 if (bdev)
1079 blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1079 blkbits = blksize_bits(bdev_logical_block_size(bdev));
1080 blocksize_mask = (1 << blkbits) - 1; 1080 blocksize_mask = (1 << blkbits) - 1;
1081 if (offset & blocksize_mask) 1081 if (offset & blocksize_mask)
1082 goto out; 1082 goto out;
1083 } 1083 }
1084 1084
1085 /* Check the memory alignment. Blocks cannot straddle pages */ 1085 /* Check the memory alignment. Blocks cannot straddle pages */
1086 for (seg = 0; seg < nr_segs; seg++) { 1086 for (seg = 0; seg < nr_segs; seg++) {
1087 addr = (unsigned long)iov[seg].iov_base; 1087 addr = (unsigned long)iov[seg].iov_base;
1088 size = iov[seg].iov_len; 1088 size = iov[seg].iov_len;
1089 end += size; 1089 end += size;
1090 if (unlikely((addr & blocksize_mask) || 1090 if (unlikely((addr & blocksize_mask) ||
1091 (size & blocksize_mask))) { 1091 (size & blocksize_mask))) {
1092 if (bdev) 1092 if (bdev)
1093 blkbits = blksize_bits( 1093 blkbits = blksize_bits(
1094 bdev_logical_block_size(bdev)); 1094 bdev_logical_block_size(bdev));
1095 blocksize_mask = (1 << blkbits) - 1; 1095 blocksize_mask = (1 << blkbits) - 1;
1096 if ((addr & blocksize_mask) || (size & blocksize_mask)) 1096 if ((addr & blocksize_mask) || (size & blocksize_mask))
1097 goto out; 1097 goto out;
1098 } 1098 }
1099 } 1099 }
1100 1100
1101 /* watch out for a 0 len io from a tricksy fs */ 1101 /* watch out for a 0 len io from a tricksy fs */
1102 if (rw == READ && end == offset) 1102 if (rw == READ && end == offset)
1103 return 0; 1103 return 0;
1104 1104
1105 dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); 1105 dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
1106 retval = -ENOMEM; 1106 retval = -ENOMEM;
1107 if (!dio) 1107 if (!dio)
1108 goto out; 1108 goto out;
1109 /* 1109 /*
1110 * Believe it or not, zeroing out the page array caused a .5% 1110 * Believe it or not, zeroing out the page array caused a .5%
1111 * performance regression in a database benchmark. So, we take 1111 * performance regression in a database benchmark. So, we take
1112 * care to only zero out what's needed. 1112 * care to only zero out what's needed.
1113 */ 1113 */
1114 memset(dio, 0, offsetof(struct dio, pages)); 1114 memset(dio, 0, offsetof(struct dio, pages));
1115 1115
1116 dio->flags = flags; 1116 dio->flags = flags;
1117 if (dio->flags & DIO_LOCKING) { 1117 if (dio->flags & DIO_LOCKING) {
1118 if (rw == READ) { 1118 if (rw == READ) {
1119 struct address_space *mapping = 1119 struct address_space *mapping =
1120 iocb->ki_filp->f_mapping; 1120 iocb->ki_filp->f_mapping;
1121 1121
1122 /* will be released by direct_io_worker */ 1122 /* will be released by direct_io_worker */
1123 mutex_lock(&inode->i_mutex); 1123 mutex_lock(&inode->i_mutex);
1124 1124
1125 retval = filemap_write_and_wait_range(mapping, offset, 1125 retval = filemap_write_and_wait_range(mapping, offset,
1126 end - 1); 1126 end - 1);
1127 if (retval) { 1127 if (retval) {
1128 mutex_unlock(&inode->i_mutex); 1128 mutex_unlock(&inode->i_mutex);
1129 kmem_cache_free(dio_cache, dio); 1129 kmem_cache_free(dio_cache, dio);
1130 goto out; 1130 goto out;
1131 } 1131 }
1132 } 1132 }
1133 } 1133 }
1134 1134
1135 /* 1135 /*
1136 * Will be decremented at I/O completion time. 1136 * Will be decremented at I/O completion time.
1137 */ 1137 */
1138 atomic_inc(&inode->i_dio_count); 1138 atomic_inc(&inode->i_dio_count);
1139 1139
1140 /* 1140 /*
1141 * For file-extending writes, updating i_size before data 1141 * For file-extending writes, updating i_size before data
1142 * writeouts complete can expose uninitialized blocks. So 1142 * writeouts complete can expose uninitialized blocks. So
1143 * even for AIO, we need to wait for i/o to complete before 1143 * even for AIO, we need to wait for i/o to complete before
1144 * returning in this case. 1144 * returning in this case.
1145 */ 1145 */
1146 dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && 1146 dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
1147 (end > i_size_read(inode))); 1147 (end > i_size_read(inode)));
1148 1148
1149 retval = 0; 1149 retval = 0;
1150 1150
1151 dio->inode = inode; 1151 dio->inode = inode;
1152 dio->rw = rw; 1152 dio->rw = rw;
1153 sdio.blkbits = blkbits; 1153 sdio.blkbits = blkbits;
1154 sdio.blkfactor = i_blkbits - blkbits; 1154 sdio.blkfactor = i_blkbits - blkbits;
1155 sdio.block_in_file = offset >> blkbits; 1155 sdio.block_in_file = offset >> blkbits;
1156 1156
1157 sdio.get_block = get_block; 1157 sdio.get_block = get_block;
1158 dio->end_io = end_io; 1158 dio->end_io = end_io;
1159 sdio.submit_io = submit_io; 1159 sdio.submit_io = submit_io;
1160 sdio.final_block_in_bio = -1; 1160 sdio.final_block_in_bio = -1;
1161 sdio.next_block_for_io = -1; 1161 sdio.next_block_for_io = -1;
1162 1162
1163 dio->iocb = iocb; 1163 dio->iocb = iocb;
1164 dio->i_size = i_size_read(inode); 1164 dio->i_size = i_size_read(inode);
1165 1165
1166 spin_lock_init(&dio->bio_lock); 1166 spin_lock_init(&dio->bio_lock);
1167 dio->refcount = 1; 1167 dio->refcount = 1;
1168 1168
1169 /* 1169 /*
1170 * In case of non-aligned buffers, we may need 2 more 1170 * In case of non-aligned buffers, we may need 2 more
1171 * pages since we need to zero out first and last block. 1171 * pages since we need to zero out first and last block.
1172 */ 1172 */
1173 if (unlikely(sdio.blkfactor)) 1173 if (unlikely(sdio.blkfactor))
1174 sdio.pages_in_io = 2; 1174 sdio.pages_in_io = 2;
1175 1175
1176 for (seg = 0; seg < nr_segs; seg++) { 1176 for (seg = 0; seg < nr_segs; seg++) {
1177 user_addr = (unsigned long)iov[seg].iov_base; 1177 user_addr = (unsigned long)iov[seg].iov_base;
1178 sdio.pages_in_io += 1178 sdio.pages_in_io +=
1179 ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) / 1179 ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
1180 PAGE_SIZE - user_addr / PAGE_SIZE); 1180 PAGE_SIZE - user_addr / PAGE_SIZE);
1181 } 1181 }
1182 1182
1183 blk_start_plug(&plug); 1183 blk_start_plug(&plug);
1184 1184
1185 for (seg = 0; seg < nr_segs; seg++) { 1185 for (seg = 0; seg < nr_segs; seg++) {
1186 user_addr = (unsigned long)iov[seg].iov_base; 1186 user_addr = (unsigned long)iov[seg].iov_base;
1187 sdio.size += bytes = iov[seg].iov_len; 1187 sdio.size += bytes = iov[seg].iov_len;
1188 1188
1189 /* Index into the first page of the first block */ 1189 /* Index into the first page of the first block */
1190 sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; 1190 sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
1191 sdio.final_block_in_request = sdio.block_in_file + 1191 sdio.final_block_in_request = sdio.block_in_file +
1192 (bytes >> blkbits); 1192 (bytes >> blkbits);
1193 /* Page fetching state */ 1193 /* Page fetching state */
1194 sdio.head = 0; 1194 sdio.head = 0;
1195 sdio.tail = 0; 1195 sdio.tail = 0;
1196 sdio.curr_page = 0; 1196 sdio.curr_page = 0;
1197 1197
1198 sdio.total_pages = 0; 1198 sdio.total_pages = 0;
1199 if (user_addr & (PAGE_SIZE-1)) { 1199 if (user_addr & (PAGE_SIZE-1)) {
1200 sdio.total_pages++; 1200 sdio.total_pages++;
1201 bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); 1201 bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
1202 } 1202 }
1203 sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; 1203 sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
1204 sdio.curr_user_address = user_addr; 1204 sdio.curr_user_address = user_addr;
1205 1205
1206 retval = do_direct_IO(dio, &sdio, &map_bh); 1206 retval = do_direct_IO(dio, &sdio, &map_bh);
1207 1207
1208 dio->result += iov[seg].iov_len - 1208 dio->result += iov[seg].iov_len -
1209 ((sdio.final_block_in_request - sdio.block_in_file) << 1209 ((sdio.final_block_in_request - sdio.block_in_file) <<
1210 blkbits); 1210 blkbits);
1211 1211
1212 if (retval) { 1212 if (retval) {
1213 dio_cleanup(dio, &sdio); 1213 dio_cleanup(dio, &sdio);
1214 break; 1214 break;
1215 } 1215 }
1216 } /* end iovec loop */ 1216 } /* end iovec loop */
1217 1217
1218 if (retval == -ENOTBLK) { 1218 if (retval == -ENOTBLK) {
1219 /* 1219 /*
1220 * The remaining part of the request will be 1220 * The remaining part of the request will be
1221 * handled by buffered I/O when we return 1221 * handled by buffered I/O when we return
1222 */ 1222 */
1223 retval = 0; 1223 retval = 0;
1224 } 1224 }
1225 /* 1225 /*
1226 * There may be some unwritten disk at the end of a part-written 1226 * There may be some unwritten disk at the end of a part-written
1227 * fs-block-sized block. Go zero that now. 1227 * fs-block-sized block. Go zero that now.
1228 */ 1228 */
1229 dio_zero_block(dio, &sdio, 1, &map_bh); 1229 dio_zero_block(dio, &sdio, 1, &map_bh);
1230 1230
1231 if (sdio.cur_page) { 1231 if (sdio.cur_page) {
1232 ssize_t ret2; 1232 ssize_t ret2;
1233 1233
1234 ret2 = dio_send_cur_page(dio, &sdio, &map_bh); 1234 ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
1235 if (retval == 0) 1235 if (retval == 0)
1236 retval = ret2; 1236 retval = ret2;
1237 page_cache_release(sdio.cur_page); 1237 page_cache_release(sdio.cur_page);
1238 sdio.cur_page = NULL; 1238 sdio.cur_page = NULL;
1239 } 1239 }
1240 if (sdio.bio) 1240 if (sdio.bio)
1241 dio_bio_submit(dio, &sdio); 1241 dio_bio_submit(dio, &sdio);
1242 1242
1243 blk_finish_plug(&plug); 1243 blk_finish_plug(&plug);
1244 1244
1245 /* 1245 /*
1246 * It is possible that we return a short IO due to end of file. 1246 * It is possible that we return a short IO due to end of file.
1247 * In that case we need to release all the pages we got hold of. 1247 * In that case we need to release all the pages we got hold of.
1248 */ 1248 */
1249 dio_cleanup(dio, &sdio); 1249 dio_cleanup(dio, &sdio);
1250 1250
1251 /* 1251 /*
1252 * All block lookups have been performed. For READ requests 1252 * All block lookups have been performed. For READ requests
1253 * we can let i_mutex go now that it's achieved its purpose 1253 * we can let i_mutex go now that it's achieved its purpose
1254 * of protecting us from looking up uninitialized blocks. 1254 * of protecting us from looking up uninitialized blocks.
1255 */ 1255 */
1256 if (rw == READ && (dio->flags & DIO_LOCKING)) 1256 if (rw == READ && (dio->flags & DIO_LOCKING))
1257 mutex_unlock(&dio->inode->i_mutex); 1257 mutex_unlock(&dio->inode->i_mutex);
1258 1258
1259 /* 1259 /*
1260 * The only time we want to leave bios in flight is when a successful 1260 * The only time we want to leave bios in flight is when a successful
1261 * partial aio read or full aio write has been set up. In that case 1261 * partial aio read or full aio write has been set up. In that case
1262 * bio completion will call aio_complete. The only time it's safe to 1262 * bio completion will call aio_complete. The only time it's safe to
1263 * call aio_complete is when we return -EIOCBQUEUED, so we key on that. 1263 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
1264 * This had *better* be the only place that raises -EIOCBQUEUED. 1264 * This had *better* be the only place that raises -EIOCBQUEUED.
1265 */ 1265 */
1266 BUG_ON(retval == -EIOCBQUEUED); 1266 BUG_ON(retval == -EIOCBQUEUED);
1267 if (dio->is_async && retval == 0 && dio->result && 1267 if (dio->is_async && retval == 0 && dio->result &&
1268 ((rw == READ) || (dio->result == sdio.size))) 1268 ((rw == READ) || (dio->result == sdio.size)))
1269 retval = -EIOCBQUEUED; 1269 retval = -EIOCBQUEUED;
1270 1270
1271 if (retval != -EIOCBQUEUED) 1271 if (retval != -EIOCBQUEUED)
1272 dio_await_completion(dio); 1272 dio_await_completion(dio);
1273 1273
1274 if (drop_refcount(dio) == 0) { 1274 if (drop_refcount(dio) == 0) {
1275 retval = dio_complete(dio, offset, retval, false); 1275 retval = dio_complete(dio, offset, retval, false);
1276 kmem_cache_free(dio_cache, dio); 1276 kmem_cache_free(dio_cache, dio);
1277 } else 1277 } else
1278 BUG_ON(retval != -EIOCBQUEUED); 1278 BUG_ON(retval != -EIOCBQUEUED);
1279 1279
1280 out: 1280 out:
1281 return retval; 1281 return retval;
1282 } 1282 }
1283 1283
1284 ssize_t 1284 ssize_t
1285 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1285 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1286 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1286 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1287 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1287 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1288 dio_submit_t submit_io, int flags) 1288 dio_submit_t submit_io, int flags)
1289 { 1289 {
1290 /* 1290 /*
1291 * The block device state is needed in the end to finally 1291 * The block device state is needed in the end to finally
1292 * submit everything. Since it's likely to be cache cold, 1292 * submit everything. Since it's likely to be cache cold,
1293 * prefetch it here as the first thing to hide some of the 1293 * prefetch it here as the first thing to hide some of the
1294 * latency. 1294 * latency.
1295 * 1295 *
1296 * Attempt to prefetch the pieces we likely need later. 1296 * Attempt to prefetch the pieces we likely need later.
1297 */ 1297 */
1298 prefetch(&bdev->bd_disk->part_tbl); 1298 prefetch(&bdev->bd_disk->part_tbl);
1299 prefetch(bdev->bd_queue); 1299 prefetch(bdev->bd_queue);
1300 prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); 1300 prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
1301 1301
1302 return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, 1302 return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
1303 nr_segs, get_block, end_io, 1303 nr_segs, get_block, end_io,
1304 submit_io, flags); 1304 submit_io, flags);
1305 } 1305 }
1306 1306
1307 EXPORT_SYMBOL(__blockdev_direct_IO); 1307 EXPORT_SYMBOL(__blockdev_direct_IO);
1308 1308
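For reference, a filesystem normally reaches this helper from its ->direct_IO address_space operation. The sketch below uses myfs_direct_IO and myfs_get_block as assumed names; passing NULL for end_io and submit_io selects the default completion and submission behaviour, and DIO_LOCKING requests the i_mutex scheme described earlier.

    static ssize_t myfs_direct_IO(int rw, struct kiocb *iocb,
                                  const struct iovec *iov, loff_t offset,
                                  unsigned long nr_segs)
    {
            struct inode *inode = iocb->ki_filp->f_mapping->host;

            return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
                                        iov, offset, nr_segs, myfs_get_block,
                                        NULL, NULL, DIO_LOCKING);
    }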
1309 static __init int dio_init(void) 1309 static __init int dio_init(void)
1310 { 1310 {
1311 dio_cache = KMEM_CACHE(dio, SLAB_PANIC); 1311 dio_cache = KMEM_CACHE(dio, SLAB_PANIC);
1312 return 0; 1312 return 0;
1313 } 1313 }
1314 module_init(dio_init) 1314 module_init(dio_init)