Commit f362f98e7c445643d27c610bb7a86b79727b592e
Exists in
master
and in
20 other branches
Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/hch/vfs-queue
* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/hch/vfs-queue: (21 commits)
  leases: fix write-open/read-lease race
  nfs: drop unnecessary locking in llseek
  ext4: replace cut'n'pasted llseek code with generic_file_llseek_size
  vfs: add generic_file_llseek_size
  vfs: do (nearly) lockless generic_file_llseek
  direct-io: merge direct_io_walker into __blockdev_direct_IO
  direct-io: inline the complete submission path
  direct-io: separate map_bh from dio
  direct-io: use a slab cache for struct dio
  direct-io: rearrange fields in dio/dio_submit to avoid holes
  direct-io: fix a wrong comment
  direct-io: separate fields only used in the submission path from struct dio
  vfs: fix spinning prevention in prune_icache_sb
  vfs: add a comment to inode_permission()
  vfs: pass all mask flags check_acl and posix_acl_permission
  vfs: add hex format for MAY_* flag values
  vfs: indicate that the permission functions take all the MAY_* flags
  compat: sync compat_stats with statfs.
  vfs: add "device" tag to /proc/self/mountstats
  cleanup: vfs: small comment fix for block_invalidatepage
  ...

Fix up trivial conflict in fs/gfs2/file.c (llseek changes)
Showing 22 changed files Side-by-side Diff
- arch/mips/include/asm/compat.h
- arch/parisc/include/asm/compat.h
- arch/powerpc/include/asm/compat.h
- arch/s390/include/asm/compat.h
- arch/sparc/include/asm/compat.h
- arch/x86/include/asm/compat.h
- fs/btrfs/file.c
- fs/buffer.c
- fs/cifs/cifsfs.c
- fs/compat.c
- fs/direct-io.c
- fs/ext4/file.c
- fs/gfs2/file.c
- fs/inode.c
- fs/namei.c
- fs/namespace.c
- fs/nfs/file.c
- fs/open.c
- fs/posix_acl.c
- fs/read_write.c
- include/linux/fs.h
- mm/filemap.c
arch/mips/include/asm/compat.h
arch/parisc/include/asm/compat.h
arch/powerpc/include/asm/compat.h
arch/s390/include/asm/compat.h
arch/sparc/include/asm/compat.h
arch/x86/include/asm/compat.h
fs/btrfs/file.c
... | ... | @@ -1821,7 +1821,7 @@ |
1821 | 1821 | switch (origin) { |
1822 | 1822 | case SEEK_END: |
1823 | 1823 | case SEEK_CUR: |
1824 | - offset = generic_file_llseek_unlocked(file, offset, origin); | |
1824 | + offset = generic_file_llseek(file, offset, origin); | |
1825 | 1825 | goto out; |
1826 | 1826 | case SEEK_DATA: |
1827 | 1827 | case SEEK_HOLE: |
fs/buffer.c
... | ... | @@ -1470,13 +1470,13 @@ |
1470 | 1470 | } |
1471 | 1471 | |
1472 | 1472 | /** |
1473 | - * block_invalidatepage - invalidate part of all of a buffer-backed page | |
1473 | + * block_invalidatepage - invalidate part or all of a buffer-backed page | |
1474 | 1474 | * |
1475 | 1475 | * @page: the page which is affected |
1476 | 1476 | * @offset: the index of the truncation point |
1477 | 1477 | * |
1478 | 1478 | * block_invalidatepage() is called when all or part of the page has become |
1479 | - * invalidatedby a truncate operation. | |
1479 | + * invalidated by a truncate operation. | |
1480 | 1480 | * |
1481 | 1481 | * block_invalidatepage() does not have to release all buffers, but it must |
1482 | 1482 | * ensure that no dirty buffer is left outside @offset and that no I/O |
fs/cifs/cifsfs.c
... | ... | @@ -730,7 +730,7 @@ |
730 | 730 | if (rc < 0) |
731 | 731 | return (loff_t)rc; |
732 | 732 | } |
733 | - return generic_file_llseek_unlocked(file, offset, origin); | |
733 | + return generic_file_llseek(file, offset, origin); | |
734 | 734 | } |
735 | 735 | |
736 | 736 | static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) |
fs/compat.c
... | ... | @@ -246,11 +246,8 @@ |
246 | 246 | __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || |
247 | 247 | __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || |
248 | 248 | __put_user(kbuf->f_frsize, &ubuf->f_frsize) || |
249 | - __put_user(0, &ubuf->f_spare[0]) || | |
250 | - __put_user(0, &ubuf->f_spare[1]) || | |
251 | - __put_user(0, &ubuf->f_spare[2]) || | |
252 | - __put_user(0, &ubuf->f_spare[3]) || | |
253 | - __put_user(0, &ubuf->f_spare[4])) | |
249 | + __put_user(kbuf->f_flags, &ubuf->f_flags) || | |
250 | + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) | |
254 | 251 | return -EFAULT; |
255 | 252 | return 0; |
256 | 253 | } |
fs/direct-io.c
Changes suppressed. Click to show
... | ... | @@ -39,7 +39,7 @@ |
39 | 39 | |
40 | 40 | /* |
41 | 41 | * How many user pages to map in one call to get_user_pages(). This determines |
42 | - * the size of a structure on the stack. | |
42 | + * the size of a structure in the slab cache | |
43 | 43 | */ |
44 | 44 | #define DIO_PAGES 64 |
45 | 45 | |
46 | 46 | |
... | ... | @@ -55,13 +55,10 @@ |
55 | 55 | * blocksize. |
56 | 56 | */ |
57 | 57 | |
58 | -struct dio { | |
59 | - /* BIO submission state */ | |
58 | +/* dio_state only used in the submission path */ | |
59 | + | |
60 | +struct dio_submit { | |
60 | 61 | struct bio *bio; /* bio under assembly */ |
61 | - struct inode *inode; | |
62 | - int rw; | |
63 | - loff_t i_size; /* i_size when submitted */ | |
64 | - int flags; /* doesn't change */ | |
65 | 62 | unsigned blkbits; /* doesn't change */ |
66 | 63 | unsigned blkfactor; /* When we're using an alignment which |
67 | 64 | is finer than the filesystem's soft |
68 | 65 | |
69 | 66 | |
70 | 67 | |
71 | 68 | |
... | ... | @@ -76,18 +73,17 @@ |
76 | 73 | sector_t block_in_file; /* Current offset into the underlying |
77 | 74 | file in dio_block units. */ |
78 | 75 | unsigned blocks_available; /* At block_in_file. changes */ |
76 | + int reap_counter; /* rate limit reaping */ | |
79 | 77 | sector_t final_block_in_request;/* doesn't change */ |
80 | 78 | unsigned first_block_in_page; /* doesn't change, Used only once */ |
81 | 79 | int boundary; /* prev block is at a boundary */ |
82 | - int reap_counter; /* rate limit reaping */ | |
83 | 80 | get_block_t *get_block; /* block mapping function */ |
84 | - dio_iodone_t *end_io; /* IO completion function */ | |
85 | 81 | dio_submit_t *submit_io; /* IO submition function */ |
82 | + | |
86 | 83 | loff_t logical_offset_in_bio; /* current first logical block in bio */ |
87 | 84 | sector_t final_block_in_bio; /* current final block in bio + 1 */ |
88 | 85 | sector_t next_block_for_io; /* next block to be put under IO, |
89 | 86 | in dio_blocks units */ |
90 | - struct buffer_head map_bh; /* last get_block() result */ | |
91 | 87 | |
92 | 88 | /* |
93 | 89 | * Deferred addition of a page to the dio. These variables are |
... | ... | @@ -100,18 +96,6 @@ |
100 | 96 | sector_t cur_page_block; /* Where it starts */ |
101 | 97 | loff_t cur_page_fs_offset; /* Offset in file */ |
102 | 98 | |
103 | - /* BIO completion state */ | |
104 | - spinlock_t bio_lock; /* protects BIO fields below */ | |
105 | - unsigned long refcount; /* direct_io_worker() and bios */ | |
106 | - struct bio *bio_list; /* singly linked via bi_private */ | |
107 | - struct task_struct *waiter; /* waiting task (NULL if none) */ | |
108 | - | |
109 | - /* AIO related stuff */ | |
110 | - struct kiocb *iocb; /* kiocb */ | |
111 | - int is_async; /* is IO async ? */ | |
112 | - int io_error; /* IO error in completion path */ | |
113 | - ssize_t result; /* IO result */ | |
114 | - | |
115 | 99 | /* |
116 | 100 | * Page fetching state. These variables belong to dio_refill_pages(). |
117 | 101 | */ |
118 | 102 | |
119 | 103 | |
120 | 104 | |
121 | 105 | |
... | ... | @@ -125,16 +109,41 @@ |
125 | 109 | */ |
126 | 110 | unsigned head; /* next page to process */ |
127 | 111 | unsigned tail; /* last valid page + 1 */ |
112 | +}; | |
113 | + | |
114 | +/* dio_state communicated between submission path and end_io */ | |
115 | +struct dio { | |
116 | + int flags; /* doesn't change */ | |
117 | + int rw; | |
118 | + struct inode *inode; | |
119 | + loff_t i_size; /* i_size when submitted */ | |
120 | + dio_iodone_t *end_io; /* IO completion function */ | |
121 | + | |
122 | + void *private; /* copy from map_bh.b_private */ | |
123 | + | |
124 | + /* BIO completion state */ | |
125 | + spinlock_t bio_lock; /* protects BIO fields below */ | |
128 | 126 | int page_errors; /* errno from get_user_pages() */ |
127 | + int is_async; /* is IO async ? */ | |
128 | + int io_error; /* IO error in completion path */ | |
129 | + unsigned long refcount; /* direct_io_worker() and bios */ | |
130 | + struct bio *bio_list; /* singly linked via bi_private */ | |
131 | + struct task_struct *waiter; /* waiting task (NULL if none) */ | |
129 | 132 | |
133 | + /* AIO related stuff */ | |
134 | + struct kiocb *iocb; /* kiocb */ | |
135 | + ssize_t result; /* IO result */ | |
136 | + | |
130 | 137 | /* |
131 | 138 | * pages[] (and any fields placed after it) are not zeroed out at |
132 | 139 | * allocation time. Don't add new fields after pages[] unless you |
133 | 140 | * wish that they not be zeroed. |
134 | 141 | */ |
135 | 142 | struct page *pages[DIO_PAGES]; /* page buffer */ |
136 | -}; | |
143 | +} ____cacheline_aligned_in_smp; | |
137 | 144 | |
145 | +static struct kmem_cache *dio_cache __read_mostly; | |
146 | + | |
138 | 147 | static void __inode_dio_wait(struct inode *inode) |
139 | 148 | { |
140 | 149 | wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP); |
141 | 150 | |
142 | 151 | |
143 | 152 | |
144 | 153 | |
145 | 154 | |
... | ... | @@ -182,27 +191,27 @@ |
182 | 191 | /* |
183 | 192 | * How many pages are in the queue? |
184 | 193 | */ |
185 | -static inline unsigned dio_pages_present(struct dio *dio) | |
194 | +static inline unsigned dio_pages_present(struct dio_submit *sdio) | |
186 | 195 | { |
187 | - return dio->tail - dio->head; | |
196 | + return sdio->tail - sdio->head; | |
188 | 197 | } |
189 | 198 | |
190 | 199 | /* |
191 | 200 | * Go grab and pin some userspace pages. Typically we'll get 64 at a time. |
192 | 201 | */ |
193 | -static int dio_refill_pages(struct dio *dio) | |
202 | +static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio) | |
194 | 203 | { |
195 | 204 | int ret; |
196 | 205 | int nr_pages; |
197 | 206 | |
198 | - nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES); | |
207 | + nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES); | |
199 | 208 | ret = get_user_pages_fast( |
200 | - dio->curr_user_address, /* Where from? */ | |
209 | + sdio->curr_user_address, /* Where from? */ | |
201 | 210 | nr_pages, /* How many pages? */ |
202 | 211 | dio->rw == READ, /* Write to memory? */ |
203 | 212 | &dio->pages[0]); /* Put results here */ |
204 | 213 | |
205 | - if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) { | |
214 | + if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { | |
206 | 215 | struct page *page = ZERO_PAGE(0); |
207 | 216 | /* |
208 | 217 | * A memory fault, but the filesystem has some outstanding |
209 | 218 | |
... | ... | @@ -213,17 +222,17 @@ |
213 | 222 | dio->page_errors = ret; |
214 | 223 | page_cache_get(page); |
215 | 224 | dio->pages[0] = page; |
216 | - dio->head = 0; | |
217 | - dio->tail = 1; | |
225 | + sdio->head = 0; | |
226 | + sdio->tail = 1; | |
218 | 227 | ret = 0; |
219 | 228 | goto out; |
220 | 229 | } |
221 | 230 | |
222 | 231 | if (ret >= 0) { |
223 | - dio->curr_user_address += ret * PAGE_SIZE; | |
224 | - dio->curr_page += ret; | |
225 | - dio->head = 0; | |
226 | - dio->tail = ret; | |
232 | + sdio->curr_user_address += ret * PAGE_SIZE; | |
233 | + sdio->curr_page += ret; | |
234 | + sdio->head = 0; | |
235 | + sdio->tail = ret; | |
227 | 236 | ret = 0; |
228 | 237 | } |
229 | 238 | out: |
230 | 239 | |
231 | 240 | |
232 | 241 | |
233 | 242 | |
... | ... | @@ -236,17 +245,18 @@ |
236 | 245 | * decent number of pages, less frequently. To provide nicer use of the |
237 | 246 | * L1 cache. |
238 | 247 | */ |
239 | -static struct page *dio_get_page(struct dio *dio) | |
248 | +static inline struct page *dio_get_page(struct dio *dio, | |
249 | + struct dio_submit *sdio) | |
240 | 250 | { |
241 | - if (dio_pages_present(dio) == 0) { | |
251 | + if (dio_pages_present(sdio) == 0) { | |
242 | 252 | int ret; |
243 | 253 | |
244 | - ret = dio_refill_pages(dio); | |
254 | + ret = dio_refill_pages(dio, sdio); | |
245 | 255 | if (ret) |
246 | 256 | return ERR_PTR(ret); |
247 | - BUG_ON(dio_pages_present(dio) == 0); | |
257 | + BUG_ON(dio_pages_present(sdio) == 0); | |
248 | 258 | } |
249 | - return dio->pages[dio->head++]; | |
259 | + return dio->pages[sdio->head++]; | |
250 | 260 | } |
251 | 261 | |
252 | 262 | /** |
... | ... | @@ -292,7 +302,7 @@ |
292 | 302 | |
293 | 303 | if (dio->end_io && dio->result) { |
294 | 304 | dio->end_io(dio->iocb, offset, transferred, |
295 | - dio->map_bh.b_private, ret, is_async); | |
305 | + dio->private, ret, is_async); | |
296 | 306 | } else { |
297 | 307 | if (is_async) |
298 | 308 | aio_complete(dio->iocb, ret, 0); |
... | ... | @@ -323,7 +333,7 @@ |
323 | 333 | |
324 | 334 | if (remaining == 0) { |
325 | 335 | dio_complete(dio, dio->iocb->ki_pos, 0, true); |
326 | - kfree(dio); | |
336 | + kmem_cache_free(dio_cache, dio); | |
327 | 337 | } |
328 | 338 | } |
329 | 339 | |
... | ... | @@ -367,9 +377,10 @@ |
367 | 377 | } |
368 | 378 | EXPORT_SYMBOL_GPL(dio_end_io); |
369 | 379 | |
370 | -static void | |
371 | -dio_bio_alloc(struct dio *dio, struct block_device *bdev, | |
372 | - sector_t first_sector, int nr_vecs) | |
380 | +static inline void | |
381 | +dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, | |
382 | + struct block_device *bdev, | |
383 | + sector_t first_sector, int nr_vecs) | |
373 | 384 | { |
374 | 385 | struct bio *bio; |
375 | 386 | |
... | ... | @@ -386,8 +397,8 @@ |
386 | 397 | else |
387 | 398 | bio->bi_end_io = dio_bio_end_io; |
388 | 399 | |
389 | - dio->bio = bio; | |
390 | - dio->logical_offset_in_bio = dio->cur_page_fs_offset; | |
400 | + sdio->bio = bio; | |
401 | + sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; | |
391 | 402 | } |
392 | 403 | |
393 | 404 | /* |
394 | 405 | |
... | ... | @@ -397,9 +408,9 @@ |
397 | 408 | * |
398 | 409 | * bios hold a dio reference between submit_bio and ->end_io. |
399 | 410 | */ |
400 | -static void dio_bio_submit(struct dio *dio) | |
411 | +static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) | |
401 | 412 | { |
402 | - struct bio *bio = dio->bio; | |
413 | + struct bio *bio = sdio->bio; | |
403 | 414 | unsigned long flags; |
404 | 415 | |
405 | 416 | bio->bi_private = dio; |
406 | 417 | |
407 | 418 | |
408 | 419 | |
... | ... | @@ -411,24 +422,24 @@ |
411 | 422 | if (dio->is_async && dio->rw == READ) |
412 | 423 | bio_set_pages_dirty(bio); |
413 | 424 | |
414 | - if (dio->submit_io) | |
415 | - dio->submit_io(dio->rw, bio, dio->inode, | |
416 | - dio->logical_offset_in_bio); | |
425 | + if (sdio->submit_io) | |
426 | + sdio->submit_io(dio->rw, bio, dio->inode, | |
427 | + sdio->logical_offset_in_bio); | |
417 | 428 | else |
418 | 429 | submit_bio(dio->rw, bio); |
419 | 430 | |
420 | - dio->bio = NULL; | |
421 | - dio->boundary = 0; | |
422 | - dio->logical_offset_in_bio = 0; | |
431 | + sdio->bio = NULL; | |
432 | + sdio->boundary = 0; | |
433 | + sdio->logical_offset_in_bio = 0; | |
423 | 434 | } |
424 | 435 | |
425 | 436 | /* |
426 | 437 | * Release any resources in case of a failure |
427 | 438 | */ |
428 | -static void dio_cleanup(struct dio *dio) | |
439 | +static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) | |
429 | 440 | { |
430 | - while (dio_pages_present(dio)) | |
431 | - page_cache_release(dio_get_page(dio)); | |
441 | + while (dio_pages_present(sdio)) | |
442 | + page_cache_release(dio_get_page(dio, sdio)); | |
432 | 443 | } |
433 | 444 | |
434 | 445 | /* |
435 | 446 | |
... | ... | @@ -518,11 +529,11 @@ |
518 | 529 | * |
519 | 530 | * This also helps to limit the peak amount of pinned userspace memory. |
520 | 531 | */ |
521 | -static int dio_bio_reap(struct dio *dio) | |
532 | +static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) | |
522 | 533 | { |
523 | 534 | int ret = 0; |
524 | 535 | |
525 | - if (dio->reap_counter++ >= 64) { | |
536 | + if (sdio->reap_counter++ >= 64) { | |
526 | 537 | while (dio->bio_list) { |
527 | 538 | unsigned long flags; |
528 | 539 | struct bio *bio; |
529 | 540 | |
... | ... | @@ -536,14 +547,14 @@ |
536 | 547 | if (ret == 0) |
537 | 548 | ret = ret2; |
538 | 549 | } |
539 | - dio->reap_counter = 0; | |
550 | + sdio->reap_counter = 0; | |
540 | 551 | } |
541 | 552 | return ret; |
542 | 553 | } |
543 | 554 | |
544 | 555 | /* |
545 | 556 | * Call into the fs to map some more disk blocks. We record the current number |
546 | - * of available blocks at dio->blocks_available. These are in units of the | |
557 | + * of available blocks at sdio->blocks_available. These are in units of the | |
547 | 558 | * fs blocksize, (1 << inode->i_blkbits). |
548 | 559 | * |
549 | 560 | * The fs is allowed to map lots of blocks at once. If it wants to do that, |
550 | 561 | |
... | ... | @@ -564,10 +575,10 @@ |
564 | 575 | * buffer_mapped(). However the direct-io code will only process holes one |
565 | 576 | * block at a time - it will repeatedly call get_block() as it walks the hole. |
566 | 577 | */ |
567 | -static int get_more_blocks(struct dio *dio) | |
578 | +static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, | |
579 | + struct buffer_head *map_bh) | |
568 | 580 | { |
569 | 581 | int ret; |
570 | - struct buffer_head *map_bh = &dio->map_bh; | |
571 | 582 | sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ |
572 | 583 | unsigned long fs_count; /* Number of filesystem-sized blocks */ |
573 | 584 | unsigned long dio_count;/* Number of dio_block-sized blocks */ |
... | ... | @@ -580,11 +591,11 @@ |
580 | 591 | */ |
581 | 592 | ret = dio->page_errors; |
582 | 593 | if (ret == 0) { |
583 | - BUG_ON(dio->block_in_file >= dio->final_block_in_request); | |
584 | - fs_startblk = dio->block_in_file >> dio->blkfactor; | |
585 | - dio_count = dio->final_block_in_request - dio->block_in_file; | |
586 | - fs_count = dio_count >> dio->blkfactor; | |
587 | - blkmask = (1 << dio->blkfactor) - 1; | |
594 | + BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); | |
595 | + fs_startblk = sdio->block_in_file >> sdio->blkfactor; | |
596 | + dio_count = sdio->final_block_in_request - sdio->block_in_file; | |
597 | + fs_count = dio_count >> sdio->blkfactor; | |
598 | + blkmask = (1 << sdio->blkfactor) - 1; | |
588 | 599 | if (dio_count & blkmask) |
589 | 600 | fs_count++; |
590 | 601 | |
591 | 602 | |
592 | 603 | |
... | ... | @@ -604,13 +615,16 @@ |
604 | 615 | */ |
605 | 616 | create = dio->rw & WRITE; |
606 | 617 | if (dio->flags & DIO_SKIP_HOLES) { |
607 | - if (dio->block_in_file < (i_size_read(dio->inode) >> | |
608 | - dio->blkbits)) | |
618 | + if (sdio->block_in_file < (i_size_read(dio->inode) >> | |
619 | + sdio->blkbits)) | |
609 | 620 | create = 0; |
610 | 621 | } |
611 | 622 | |
612 | - ret = (*dio->get_block)(dio->inode, fs_startblk, | |
623 | + ret = (*sdio->get_block)(dio->inode, fs_startblk, | |
613 | 624 | map_bh, create); |
625 | + | |
626 | + /* Store for completion */ | |
627 | + dio->private = map_bh->b_private; | |
614 | 628 | } |
615 | 629 | return ret; |
616 | 630 | } |
617 | 631 | |
618 | 632 | |
619 | 633 | |
... | ... | @@ -618,20 +632,21 @@ |
618 | 632 | /* |
619 | 633 | * There is no bio. Make one now. |
620 | 634 | */ |
621 | -static int dio_new_bio(struct dio *dio, sector_t start_sector) | |
635 | +static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, | |
636 | + sector_t start_sector, struct buffer_head *map_bh) | |
622 | 637 | { |
623 | 638 | sector_t sector; |
624 | 639 | int ret, nr_pages; |
625 | 640 | |
626 | - ret = dio_bio_reap(dio); | |
641 | + ret = dio_bio_reap(dio, sdio); | |
627 | 642 | if (ret) |
628 | 643 | goto out; |
629 | - sector = start_sector << (dio->blkbits - 9); | |
630 | - nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev)); | |
644 | + sector = start_sector << (sdio->blkbits - 9); | |
645 | + nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev)); | |
631 | 646 | nr_pages = min(nr_pages, BIO_MAX_PAGES); |
632 | 647 | BUG_ON(nr_pages <= 0); |
633 | - dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages); | |
634 | - dio->boundary = 0; | |
648 | + dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); | |
649 | + sdio->boundary = 0; | |
635 | 650 | out: |
636 | 651 | return ret; |
637 | 652 | } |
638 | 653 | |
639 | 654 | |
... | ... | @@ -643,21 +658,21 @@ |
643 | 658 | * |
644 | 659 | * Return zero on success. Non-zero means the caller needs to start a new BIO. |
645 | 660 | */ |
646 | -static int dio_bio_add_page(struct dio *dio) | |
661 | +static inline int dio_bio_add_page(struct dio_submit *sdio) | |
647 | 662 | { |
648 | 663 | int ret; |
649 | 664 | |
650 | - ret = bio_add_page(dio->bio, dio->cur_page, | |
651 | - dio->cur_page_len, dio->cur_page_offset); | |
652 | - if (ret == dio->cur_page_len) { | |
665 | + ret = bio_add_page(sdio->bio, sdio->cur_page, | |
666 | + sdio->cur_page_len, sdio->cur_page_offset); | |
667 | + if (ret == sdio->cur_page_len) { | |
653 | 668 | /* |
654 | 669 | * Decrement count only, if we are done with this page |
655 | 670 | */ |
656 | - if ((dio->cur_page_len + dio->cur_page_offset) == PAGE_SIZE) | |
657 | - dio->pages_in_io--; | |
658 | - page_cache_get(dio->cur_page); | |
659 | - dio->final_block_in_bio = dio->cur_page_block + | |
660 | - (dio->cur_page_len >> dio->blkbits); | |
671 | + if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE) | |
672 | + sdio->pages_in_io--; | |
673 | + page_cache_get(sdio->cur_page); | |
674 | + sdio->final_block_in_bio = sdio->cur_page_block + | |
675 | + (sdio->cur_page_len >> sdio->blkbits); | |
661 | 676 | ret = 0; |
662 | 677 | } else { |
663 | 678 | ret = 1; |
664 | 679 | |
... | ... | @@ -675,14 +690,15 @@ |
675 | 690 | * The caller of this function is responsible for removing cur_page from the |
676 | 691 | * dio, and for dropping the refcount which came from that presence. |
677 | 692 | */ |
678 | -static int dio_send_cur_page(struct dio *dio) | |
693 | +static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio, | |
694 | + struct buffer_head *map_bh) | |
679 | 695 | { |
680 | 696 | int ret = 0; |
681 | 697 | |
682 | - if (dio->bio) { | |
683 | - loff_t cur_offset = dio->cur_page_fs_offset; | |
684 | - loff_t bio_next_offset = dio->logical_offset_in_bio + | |
685 | - dio->bio->bi_size; | |
698 | + if (sdio->bio) { | |
699 | + loff_t cur_offset = sdio->cur_page_fs_offset; | |
700 | + loff_t bio_next_offset = sdio->logical_offset_in_bio + | |
701 | + sdio->bio->bi_size; | |
686 | 702 | |
687 | 703 | /* |
688 | 704 | * See whether this new request is contiguous with the old. |
689 | 705 | |
690 | 706 | |
691 | 707 | |
692 | 708 | |
693 | 709 | |
... | ... | @@ -698,28 +714,28 @@ |
698 | 714 | * be the next logical offset in the bio, submit the bio we |
699 | 715 | * have. |
700 | 716 | */ |
701 | - if (dio->final_block_in_bio != dio->cur_page_block || | |
717 | + if (sdio->final_block_in_bio != sdio->cur_page_block || | |
702 | 718 | cur_offset != bio_next_offset) |
703 | - dio_bio_submit(dio); | |
719 | + dio_bio_submit(dio, sdio); | |
704 | 720 | /* |
705 | 721 | * Submit now if the underlying fs is about to perform a |
706 | 722 | * metadata read |
707 | 723 | */ |
708 | - else if (dio->boundary) | |
709 | - dio_bio_submit(dio); | |
724 | + else if (sdio->boundary) | |
725 | + dio_bio_submit(dio, sdio); | |
710 | 726 | } |
711 | 727 | |
712 | - if (dio->bio == NULL) { | |
713 | - ret = dio_new_bio(dio, dio->cur_page_block); | |
728 | + if (sdio->bio == NULL) { | |
729 | + ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); | |
714 | 730 | if (ret) |
715 | 731 | goto out; |
716 | 732 | } |
717 | 733 | |
718 | - if (dio_bio_add_page(dio) != 0) { | |
719 | - dio_bio_submit(dio); | |
720 | - ret = dio_new_bio(dio, dio->cur_page_block); | |
734 | + if (dio_bio_add_page(sdio) != 0) { | |
735 | + dio_bio_submit(dio, sdio); | |
736 | + ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh); | |
721 | 737 | if (ret == 0) { |
722 | - ret = dio_bio_add_page(dio); | |
738 | + ret = dio_bio_add_page(sdio); | |
723 | 739 | BUG_ON(ret != 0); |
724 | 740 | } |
725 | 741 | } |
... | ... | @@ -744,9 +760,10 @@ |
744 | 760 | * If that doesn't work out then we put the old page into the bio and add this |
745 | 761 | * page to the dio instead. |
746 | 762 | */ |
747 | -static int | |
748 | -submit_page_section(struct dio *dio, struct page *page, | |
749 | - unsigned offset, unsigned len, sector_t blocknr) | |
763 | +static inline int | |
764 | +submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, | |
765 | + unsigned offset, unsigned len, sector_t blocknr, | |
766 | + struct buffer_head *map_bh) | |
750 | 767 | { |
751 | 768 | int ret = 0; |
752 | 769 | |
753 | 770 | |
754 | 771 | |
... | ... | @@ -760,20 +777,20 @@ |
760 | 777 | /* |
761 | 778 | * Can we just grow the current page's presence in the dio? |
762 | 779 | */ |
763 | - if ( (dio->cur_page == page) && | |
764 | - (dio->cur_page_offset + dio->cur_page_len == offset) && | |
765 | - (dio->cur_page_block + | |
766 | - (dio->cur_page_len >> dio->blkbits) == blocknr)) { | |
767 | - dio->cur_page_len += len; | |
780 | + if (sdio->cur_page == page && | |
781 | + sdio->cur_page_offset + sdio->cur_page_len == offset && | |
782 | + sdio->cur_page_block + | |
783 | + (sdio->cur_page_len >> sdio->blkbits) == blocknr) { | |
784 | + sdio->cur_page_len += len; | |
768 | 785 | |
769 | 786 | /* |
770 | - * If dio->boundary then we want to schedule the IO now to | |
787 | + * If sdio->boundary then we want to schedule the IO now to | |
771 | 788 | * avoid metadata seeks. |
772 | 789 | */ |
773 | - if (dio->boundary) { | |
774 | - ret = dio_send_cur_page(dio); | |
775 | - page_cache_release(dio->cur_page); | |
776 | - dio->cur_page = NULL; | |
790 | + if (sdio->boundary) { | |
791 | + ret = dio_send_cur_page(dio, sdio, map_bh); | |
792 | + page_cache_release(sdio->cur_page); | |
793 | + sdio->cur_page = NULL; | |
777 | 794 | } |
778 | 795 | goto out; |
779 | 796 | } |
780 | 797 | |
... | ... | @@ -781,20 +798,20 @@ |
781 | 798 | /* |
782 | 799 | * If there's a deferred page already there then send it. |
783 | 800 | */ |
784 | - if (dio->cur_page) { | |
785 | - ret = dio_send_cur_page(dio); | |
786 | - page_cache_release(dio->cur_page); | |
787 | - dio->cur_page = NULL; | |
801 | + if (sdio->cur_page) { | |
802 | + ret = dio_send_cur_page(dio, sdio, map_bh); | |
803 | + page_cache_release(sdio->cur_page); | |
804 | + sdio->cur_page = NULL; | |
788 | 805 | if (ret) |
789 | 806 | goto out; |
790 | 807 | } |
791 | 808 | |
792 | 809 | page_cache_get(page); /* It is in dio */ |
793 | - dio->cur_page = page; | |
794 | - dio->cur_page_offset = offset; | |
795 | - dio->cur_page_len = len; | |
796 | - dio->cur_page_block = blocknr; | |
797 | - dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits; | |
810 | + sdio->cur_page = page; | |
811 | + sdio->cur_page_offset = offset; | |
812 | + sdio->cur_page_len = len; | |
813 | + sdio->cur_page_block = blocknr; | |
814 | + sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits; | |
798 | 815 | out: |
799 | 816 | return ret; |
800 | 817 | } |
801 | 818 | |
802 | 819 | |
... | ... | @@ -804,16 +821,16 @@ |
804 | 821 | * file blocks. Only called for S_ISREG files - blockdevs do not set |
805 | 822 | * buffer_new |
806 | 823 | */ |
807 | -static void clean_blockdev_aliases(struct dio *dio) | |
824 | +static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) | |
808 | 825 | { |
809 | 826 | unsigned i; |
810 | 827 | unsigned nblocks; |
811 | 828 | |
812 | - nblocks = dio->map_bh.b_size >> dio->inode->i_blkbits; | |
829 | + nblocks = map_bh->b_size >> dio->inode->i_blkbits; | |
813 | 830 | |
814 | 831 | for (i = 0; i < nblocks; i++) { |
815 | - unmap_underlying_metadata(dio->map_bh.b_bdev, | |
816 | - dio->map_bh.b_blocknr + i); | |
832 | + unmap_underlying_metadata(map_bh->b_bdev, | |
833 | + map_bh->b_blocknr + i); | |
817 | 834 | } |
818 | 835 | } |
819 | 836 | |
820 | 837 | |
821 | 838 | |
... | ... | @@ -826,19 +843,20 @@ |
826 | 843 | * `end' is zero if we're doing the start of the IO, 1 at the end of the |
827 | 844 | * IO. |
828 | 845 | */ |
829 | -static void dio_zero_block(struct dio *dio, int end) | |
846 | +static inline void dio_zero_block(struct dio *dio, struct dio_submit *sdio, | |
847 | + int end, struct buffer_head *map_bh) | |
830 | 848 | { |
831 | 849 | unsigned dio_blocks_per_fs_block; |
832 | 850 | unsigned this_chunk_blocks; /* In dio_blocks */ |
833 | 851 | unsigned this_chunk_bytes; |
834 | 852 | struct page *page; |
835 | 853 | |
836 | - dio->start_zero_done = 1; | |
837 | - if (!dio->blkfactor || !buffer_new(&dio->map_bh)) | |
854 | + sdio->start_zero_done = 1; | |
855 | + if (!sdio->blkfactor || !buffer_new(map_bh)) | |
838 | 856 | return; |
839 | 857 | |
840 | - dio_blocks_per_fs_block = 1 << dio->blkfactor; | |
841 | - this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1); | |
858 | + dio_blocks_per_fs_block = 1 << sdio->blkfactor; | |
859 | + this_chunk_blocks = sdio->block_in_file & (dio_blocks_per_fs_block - 1); | |
842 | 860 | |
843 | 861 | if (!this_chunk_blocks) |
844 | 862 | return; |
845 | 863 | |
846 | 864 | |
... | ... | @@ -850,14 +868,14 @@ |
850 | 868 | if (end) |
851 | 869 | this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; |
852 | 870 | |
853 | - this_chunk_bytes = this_chunk_blocks << dio->blkbits; | |
871 | + this_chunk_bytes = this_chunk_blocks << sdio->blkbits; | |
854 | 872 | |
855 | 873 | page = ZERO_PAGE(0); |
856 | - if (submit_page_section(dio, page, 0, this_chunk_bytes, | |
857 | - dio->next_block_for_io)) | |
874 | + if (submit_page_section(dio, sdio, page, 0, this_chunk_bytes, | |
875 | + sdio->next_block_for_io, map_bh)) | |
858 | 876 | return; |
859 | 877 | |
860 | - dio->next_block_for_io += this_chunk_blocks; | |
878 | + sdio->next_block_for_io += this_chunk_blocks; | |
861 | 879 | } |
862 | 880 | |
863 | 881 | /* |
864 | 882 | |
865 | 883 | |
866 | 884 | |
867 | 885 | |
... | ... | @@ -876,20 +894,20 @@ |
876 | 894 | * it should set b_size to PAGE_SIZE or more inside get_block(). This gives |
877 | 895 | * fine alignment but still allows this function to work in PAGE_SIZE units. |
878 | 896 | */ |
879 | -static int do_direct_IO(struct dio *dio) | |
897 | +static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, | |
898 | + struct buffer_head *map_bh) | |
880 | 899 | { |
881 | - const unsigned blkbits = dio->blkbits; | |
900 | + const unsigned blkbits = sdio->blkbits; | |
882 | 901 | const unsigned blocks_per_page = PAGE_SIZE >> blkbits; |
883 | 902 | struct page *page; |
884 | 903 | unsigned block_in_page; |
885 | - struct buffer_head *map_bh = &dio->map_bh; | |
886 | 904 | int ret = 0; |
887 | 905 | |
888 | 906 | /* The I/O can start at any block offset within the first page */ |
889 | - block_in_page = dio->first_block_in_page; | |
907 | + block_in_page = sdio->first_block_in_page; | |
890 | 908 | |
891 | - while (dio->block_in_file < dio->final_block_in_request) { | |
892 | - page = dio_get_page(dio); | |
909 | + while (sdio->block_in_file < sdio->final_block_in_request) { | |
910 | + page = dio_get_page(dio, sdio); | |
893 | 911 | if (IS_ERR(page)) { |
894 | 912 | ret = PTR_ERR(page); |
895 | 913 | goto out; |
896 | 914 | |
... | ... | @@ -901,14 +919,14 @@ |
901 | 919 | unsigned this_chunk_blocks; /* # of blocks */ |
902 | 920 | unsigned u; |
903 | 921 | |
904 | - if (dio->blocks_available == 0) { | |
922 | + if (sdio->blocks_available == 0) { | |
905 | 923 | /* |
906 | 924 | * Need to go and map some more disk |
907 | 925 | */ |
908 | 926 | unsigned long blkmask; |
909 | 927 | unsigned long dio_remainder; |
910 | 928 | |
911 | - ret = get_more_blocks(dio); | |
929 | + ret = get_more_blocks(dio, sdio, map_bh); | |
912 | 930 | if (ret) { |
913 | 931 | page_cache_release(page); |
914 | 932 | goto out; |
915 | 933 | |
916 | 934 | |
917 | 935 | |
... | ... | @@ -916,18 +934,18 @@ |
916 | 934 | if (!buffer_mapped(map_bh)) |
917 | 935 | goto do_holes; |
918 | 936 | |
919 | - dio->blocks_available = | |
920 | - map_bh->b_size >> dio->blkbits; | |
921 | - dio->next_block_for_io = | |
922 | - map_bh->b_blocknr << dio->blkfactor; | |
937 | + sdio->blocks_available = | |
938 | + map_bh->b_size >> sdio->blkbits; | |
939 | + sdio->next_block_for_io = | |
940 | + map_bh->b_blocknr << sdio->blkfactor; | |
923 | 941 | if (buffer_new(map_bh)) |
924 | - clean_blockdev_aliases(dio); | |
942 | + clean_blockdev_aliases(dio, map_bh); | |
925 | 943 | |
926 | - if (!dio->blkfactor) | |
944 | + if (!sdio->blkfactor) | |
927 | 945 | goto do_holes; |
928 | 946 | |
929 | - blkmask = (1 << dio->blkfactor) - 1; | |
930 | - dio_remainder = (dio->block_in_file & blkmask); | |
947 | + blkmask = (1 << sdio->blkfactor) - 1; | |
948 | + dio_remainder = (sdio->block_in_file & blkmask); | |
931 | 949 | |
932 | 950 | /* |
933 | 951 | * If we are at the start of IO and that IO |
... | ... | @@ -941,8 +959,8 @@ |
941 | 959 | * on-disk |
942 | 960 | */ |
943 | 961 | if (!buffer_new(map_bh)) |
944 | - dio->next_block_for_io += dio_remainder; | |
945 | - dio->blocks_available -= dio_remainder; | |
962 | + sdio->next_block_for_io += dio_remainder; | |
963 | + sdio->blocks_available -= dio_remainder; | |
946 | 964 | } |
947 | 965 | do_holes: |
948 | 966 | /* Handle holes */ |
... | ... | @@ -961,7 +979,7 @@ |
961 | 979 | */ |
962 | 980 | i_size_aligned = ALIGN(i_size_read(dio->inode), |
963 | 981 | 1 << blkbits); |
964 | - if (dio->block_in_file >= | |
982 | + if (sdio->block_in_file >= | |
965 | 983 | i_size_aligned >> blkbits) { |
966 | 984 | /* We hit eof */ |
967 | 985 | page_cache_release(page); |
... | ... | @@ -969,7 +987,7 @@ |
969 | 987 | } |
970 | 988 | zero_user(page, block_in_page << blkbits, |
971 | 989 | 1 << blkbits); |
972 | - dio->block_in_file++; | |
990 | + sdio->block_in_file++; | |
973 | 991 | block_in_page++; |
974 | 992 | goto next_block; |
975 | 993 | } |
976 | 994 | |
977 | 995 | |
978 | 996 | |
979 | 997 | |
980 | 998 | |
981 | 999 | |
982 | 1000 | |
... | ... | @@ -979,38 +997,41 @@ |
979 | 997 | * is finer than the underlying fs, go check to see if |
980 | 998 | * we must zero out the start of this block. |
981 | 999 | */ |
982 | - if (unlikely(dio->blkfactor && !dio->start_zero_done)) | |
983 | - dio_zero_block(dio, 0); | |
1000 | + if (unlikely(sdio->blkfactor && !sdio->start_zero_done)) | |
1001 | + dio_zero_block(dio, sdio, 0, map_bh); | |
984 | 1002 | |
985 | 1003 | /* |
986 | 1004 | * Work out, in this_chunk_blocks, how much disk we |
987 | 1005 | * can add to this page |
988 | 1006 | */ |
989 | - this_chunk_blocks = dio->blocks_available; | |
1007 | + this_chunk_blocks = sdio->blocks_available; | |
990 | 1008 | u = (PAGE_SIZE - offset_in_page) >> blkbits; |
991 | 1009 | if (this_chunk_blocks > u) |
992 | 1010 | this_chunk_blocks = u; |
993 | - u = dio->final_block_in_request - dio->block_in_file; | |
1011 | + u = sdio->final_block_in_request - sdio->block_in_file; | |
994 | 1012 | if (this_chunk_blocks > u) |
995 | 1013 | this_chunk_blocks = u; |
996 | 1014 | this_chunk_bytes = this_chunk_blocks << blkbits; |
997 | 1015 | BUG_ON(this_chunk_bytes == 0); |
998 | 1016 | |
999 | - dio->boundary = buffer_boundary(map_bh); | |
1000 | - ret = submit_page_section(dio, page, offset_in_page, | |
1001 | - this_chunk_bytes, dio->next_block_for_io); | |
1017 | + sdio->boundary = buffer_boundary(map_bh); | |
1018 | + ret = submit_page_section(dio, sdio, page, | |
1019 | + offset_in_page, | |
1020 | + this_chunk_bytes, | |
1021 | + sdio->next_block_for_io, | |
1022 | + map_bh); | |
1002 | 1023 | if (ret) { |
1003 | 1024 | page_cache_release(page); |
1004 | 1025 | goto out; |
1005 | 1026 | } |
1006 | - dio->next_block_for_io += this_chunk_blocks; | |
1027 | + sdio->next_block_for_io += this_chunk_blocks; | |
1007 | 1028 | |
1008 | - dio->block_in_file += this_chunk_blocks; | |
1029 | + sdio->block_in_file += this_chunk_blocks; | |
1009 | 1030 | block_in_page += this_chunk_blocks; |
1010 | - dio->blocks_available -= this_chunk_blocks; | |
1031 | + sdio->blocks_available -= this_chunk_blocks; | |
1011 | 1032 | next_block: |
1012 | - BUG_ON(dio->block_in_file > dio->final_block_in_request); | |
1013 | - if (dio->block_in_file == dio->final_block_in_request) | |
1033 | + BUG_ON(sdio->block_in_file > sdio->final_block_in_request); | |
1034 | + if (sdio->block_in_file == sdio->final_block_in_request) | |
1014 | 1035 | break; |
1015 | 1036 | } |
1016 | 1037 | |
1017 | 1038 | |
1018 | 1039 | |
1019 | 1040 | |
1020 | 1041 | |
... | ... | @@ -1022,137 +1043,12 @@ |
1022 | 1043 | return ret; |
1023 | 1044 | } |
1024 | 1045 | |
1025 | -static ssize_t | |
1026 | -direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, | |
1027 | - const struct iovec *iov, loff_t offset, unsigned long nr_segs, | |
1028 | - unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, | |
1029 | - dio_submit_t submit_io, struct dio *dio) | |
1046 | +static inline int drop_refcount(struct dio *dio) | |
1030 | 1047 | { |
1031 | - unsigned long user_addr; | |
1048 | + int ret2; | |
1032 | 1049 | unsigned long flags; |
1033 | - int seg; | |
1034 | - ssize_t ret = 0; | |
1035 | - ssize_t ret2; | |
1036 | - size_t bytes; | |
1037 | 1050 | |
1038 | - dio->inode = inode; | |
1039 | - dio->rw = rw; | |
1040 | - dio->blkbits = blkbits; | |
1041 | - dio->blkfactor = inode->i_blkbits - blkbits; | |
1042 | - dio->block_in_file = offset >> blkbits; | |
1043 | - | |
1044 | - dio->get_block = get_block; | |
1045 | - dio->end_io = end_io; | |
1046 | - dio->submit_io = submit_io; | |
1047 | - dio->final_block_in_bio = -1; | |
1048 | - dio->next_block_for_io = -1; | |
1049 | - | |
1050 | - dio->iocb = iocb; | |
1051 | - dio->i_size = i_size_read(inode); | |
1052 | - | |
1053 | - spin_lock_init(&dio->bio_lock); | |
1054 | - dio->refcount = 1; | |
1055 | - | |
1056 | 1051 | /* |
1057 | - * In case of non-aligned buffers, we may need 2 more | |
1058 | - * pages since we need to zero out first and last block. | |
1059 | - */ | |
1060 | - if (unlikely(dio->blkfactor)) | |
1061 | - dio->pages_in_io = 2; | |
1062 | - | |
1063 | - for (seg = 0; seg < nr_segs; seg++) { | |
1064 | - user_addr = (unsigned long)iov[seg].iov_base; | |
1065 | - dio->pages_in_io += | |
1066 | - ((user_addr+iov[seg].iov_len +PAGE_SIZE-1)/PAGE_SIZE | |
1067 | - - user_addr/PAGE_SIZE); | |
1068 | - } | |
1069 | - | |
1070 | - for (seg = 0; seg < nr_segs; seg++) { | |
1071 | - user_addr = (unsigned long)iov[seg].iov_base; | |
1072 | - dio->size += bytes = iov[seg].iov_len; | |
1073 | - | |
1074 | - /* Index into the first page of the first block */ | |
1075 | - dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; | |
1076 | - dio->final_block_in_request = dio->block_in_file + | |
1077 | - (bytes >> blkbits); | |
1078 | - /* Page fetching state */ | |
1079 | - dio->head = 0; | |
1080 | - dio->tail = 0; | |
1081 | - dio->curr_page = 0; | |
1082 | - | |
1083 | - dio->total_pages = 0; | |
1084 | - if (user_addr & (PAGE_SIZE-1)) { | |
1085 | - dio->total_pages++; | |
1086 | - bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); | |
1087 | - } | |
1088 | - dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; | |
1089 | - dio->curr_user_address = user_addr; | |
1090 | - | |
1091 | - ret = do_direct_IO(dio); | |
1092 | - | |
1093 | - dio->result += iov[seg].iov_len - | |
1094 | - ((dio->final_block_in_request - dio->block_in_file) << | |
1095 | - blkbits); | |
1096 | - | |
1097 | - if (ret) { | |
1098 | - dio_cleanup(dio); | |
1099 | - break; | |
1100 | - } | |
1101 | - } /* end iovec loop */ | |
1102 | - | |
1103 | - if (ret == -ENOTBLK) { | |
1104 | - /* | |
1105 | - * The remaining part of the request will be | |
1106 | - * be handled by buffered I/O when we return | |
1107 | - */ | |
1108 | - ret = 0; | |
1109 | - } | |
1110 | - /* | |
1111 | - * There may be some unwritten disk at the end of a part-written | |
1112 | - * fs-block-sized block. Go zero that now. | |
1113 | - */ | |
1114 | - dio_zero_block(dio, 1); | |
1115 | - | |
1116 | - if (dio->cur_page) { | |
1117 | - ret2 = dio_send_cur_page(dio); | |
1118 | - if (ret == 0) | |
1119 | - ret = ret2; | |
1120 | - page_cache_release(dio->cur_page); | |
1121 | - dio->cur_page = NULL; | |
1122 | - } | |
1123 | - if (dio->bio) | |
1124 | - dio_bio_submit(dio); | |
1125 | - | |
1126 | - /* | |
1127 | - * It is possible that, we return short IO due to end of file. | |
1128 | - * In that case, we need to release all the pages we got hold on. | |
1129 | - */ | |
1130 | - dio_cleanup(dio); | |
1131 | - | |
1132 | - /* | |
1133 | - * All block lookups have been performed. For READ requests | |
1134 | - * we can let i_mutex go now that its achieved its purpose | |
1135 | - * of protecting us from looking up uninitialized blocks. | |
1136 | - */ | |
1137 | - if (rw == READ && (dio->flags & DIO_LOCKING)) | |
1138 | - mutex_unlock(&dio->inode->i_mutex); | |
1139 | - | |
1140 | - /* | |
1141 | - * The only time we want to leave bios in flight is when a successful | |
1142 | - * partial aio read or full aio write have been setup. In that case | |
1143 | - * bio completion will call aio_complete. The only time it's safe to | |
1144 | - * call aio_complete is when we return -EIOCBQUEUED, so we key on that. | |
1145 | - * This had *better* be the only place that raises -EIOCBQUEUED. | |
1146 | - */ | |
1147 | - BUG_ON(ret == -EIOCBQUEUED); | |
1148 | - if (dio->is_async && ret == 0 && dio->result && | |
1149 | - ((rw & READ) || (dio->result == dio->size))) | |
1150 | - ret = -EIOCBQUEUED; | |
1151 | - | |
1152 | - if (ret != -EIOCBQUEUED) | |
1153 | - dio_await_completion(dio); | |
1154 | - | |
1155 | - /* | |
1156 | 1052 | * Sync will always be dropping the final ref and completing the |
1157 | 1053 | * operation. AIO can if it was a broken operation described above or |
1158 | 1054 | * in fact if all the bios race to complete before we get here. In |
... | ... | @@ -1166,14 +1062,7 @@ |
1166 | 1062 | spin_lock_irqsave(&dio->bio_lock, flags); |
1167 | 1063 | ret2 = --dio->refcount; |
1168 | 1064 | spin_unlock_irqrestore(&dio->bio_lock, flags); |
1169 | - | |
1170 | - if (ret2 == 0) { | |
1171 | - ret = dio_complete(dio, offset, ret, false); | |
1172 | - kfree(dio); | |
1173 | - } else | |
1174 | - BUG_ON(ret != -EIOCBQUEUED); | |
1175 | - | |
1176 | - return ret; | |
1065 | + return ret2; | |
1177 | 1066 | } |
1178 | 1067 | |
1179 | 1068 | /* |
... | ... | @@ -1195,6 +1084,11 @@ |
1195 | 1084 | * expected that filesystem provide exclusion between new direct I/O |
1196 | 1085 | * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, |
1197 | 1086 | * but other filesystems need to take care of this on their own. |
1087 | + * | |
1088 | + * NOTE: if you pass "sdio" to anything by pointer make sure that function | |
1089 | + * is always inlined. Otherwise gcc is unable to split the structure into | |
1090 | + * individual fields and will generate much worse code. This is important | |
1091 | + * for the whole file. | |
1198 | 1092 | */ |
1199 | 1093 | ssize_t |
1200 | 1094 | __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, |
... | ... | @@ -1211,6 +1105,10 @@ |
1211 | 1105 | ssize_t retval = -EINVAL; |
1212 | 1106 | loff_t end = offset; |
1213 | 1107 | struct dio *dio; |
1108 | + struct dio_submit sdio = { 0, }; | |
1109 | + unsigned long user_addr; | |
1110 | + size_t bytes; | |
1111 | + struct buffer_head map_bh = { 0, }; | |
1214 | 1112 | |
1215 | 1113 | if (rw & WRITE) |
1216 | 1114 | rw = WRITE_ODIRECT; |
... | ... | @@ -1244,7 +1142,7 @@ |
1244 | 1142 | if (rw == READ && end == offset) |
1245 | 1143 | return 0; |
1246 | 1144 | |
1247 | - dio = kmalloc(sizeof(*dio), GFP_KERNEL); | |
1145 | + dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); | |
1248 | 1146 | retval = -ENOMEM; |
1249 | 1147 | if (!dio) |
1250 | 1148 | goto out; |
... | ... | @@ -1268,7 +1166,7 @@ |
1268 | 1166 | end - 1); |
1269 | 1167 | if (retval) { |
1270 | 1168 | mutex_unlock(&inode->i_mutex); |
1271 | - kfree(dio); | |
1169 | + kmem_cache_free(dio_cache, dio); | |
1272 | 1170 | goto out; |
1273 | 1171 | } |
1274 | 1172 | } |
1275 | 1173 | |
1276 | 1174 | |
... | ... | @@ -1288,12 +1186,142 @@ |
1288 | 1186 | dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) && |
1289 | 1187 | (end > i_size_read(inode))); |
1290 | 1188 | |
1291 | - retval = direct_io_worker(rw, iocb, inode, iov, offset, | |
1292 | - nr_segs, blkbits, get_block, end_io, | |
1293 | - submit_io, dio); | |
1189 | + retval = 0; | |
1294 | 1190 | |
1191 | + dio->inode = inode; | |
1192 | + dio->rw = rw; | |
1193 | + sdio.blkbits = blkbits; | |
1194 | + sdio.blkfactor = inode->i_blkbits - blkbits; | |
1195 | + sdio.block_in_file = offset >> blkbits; | |
1196 | + | |
1197 | + sdio.get_block = get_block; | |
1198 | + dio->end_io = end_io; | |
1199 | + sdio.submit_io = submit_io; | |
1200 | + sdio.final_block_in_bio = -1; | |
1201 | + sdio.next_block_for_io = -1; | |
1202 | + | |
1203 | + dio->iocb = iocb; | |
1204 | + dio->i_size = i_size_read(inode); | |
1205 | + | |
1206 | + spin_lock_init(&dio->bio_lock); | |
1207 | + dio->refcount = 1; | |
1208 | + | |
1209 | + /* | |
1210 | + * In case of non-aligned buffers, we may need 2 more | |
1211 | + * pages since we need to zero out first and last block. | |
1212 | + */ | |
1213 | + if (unlikely(sdio.blkfactor)) | |
1214 | + sdio.pages_in_io = 2; | |
1215 | + | |
1216 | + for (seg = 0; seg < nr_segs; seg++) { | |
1217 | + user_addr = (unsigned long)iov[seg].iov_base; | |
1218 | + sdio.pages_in_io += | |
1219 | + ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) / | |
1220 | + PAGE_SIZE - user_addr / PAGE_SIZE); | |
1221 | + } | |
1222 | + | |
1223 | + for (seg = 0; seg < nr_segs; seg++) { | |
1224 | + user_addr = (unsigned long)iov[seg].iov_base; | |
1225 | + sdio.size += bytes = iov[seg].iov_len; | |
1226 | + | |
1227 | + /* Index into the first page of the first block */ | |
1228 | + sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits; | |
1229 | + sdio.final_block_in_request = sdio.block_in_file + | |
1230 | + (bytes >> blkbits); | |
1231 | + /* Page fetching state */ | |
1232 | + sdio.head = 0; | |
1233 | + sdio.tail = 0; | |
1234 | + sdio.curr_page = 0; | |
1235 | + | |
1236 | + sdio.total_pages = 0; | |
1237 | + if (user_addr & (PAGE_SIZE-1)) { | |
1238 | + sdio.total_pages++; | |
1239 | + bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1)); | |
1240 | + } | |
1241 | + sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE; | |
1242 | + sdio.curr_user_address = user_addr; | |
1243 | + | |
1244 | + retval = do_direct_IO(dio, &sdio, &map_bh); | |
1245 | + | |
1246 | + dio->result += iov[seg].iov_len - | |
1247 | + ((sdio.final_block_in_request - sdio.block_in_file) << | |
1248 | + blkbits); | |
1249 | + | |
1250 | + if (retval) { | |
1251 | + dio_cleanup(dio, &sdio); | |
1252 | + break; | |
1253 | + } | |
1254 | + } /* end iovec loop */ | |
1255 | + | |
1256 | + if (retval == -ENOTBLK) { | |
1257 | + /* | |
1258 | + * The remaining part of the request will be | |
1259 | + * be handled by buffered I/O when we return | |
1260 | + */ | |
1261 | + retval = 0; | |
1262 | + } | |
1263 | + /* | |
1264 | + * There may be some unwritten disk at the end of a part-written | |
1265 | + * fs-block-sized block. Go zero that now. | |
1266 | + */ | |
1267 | + dio_zero_block(dio, &sdio, 1, &map_bh); | |
1268 | + | |
1269 | + if (sdio.cur_page) { | |
1270 | + ssize_t ret2; | |
1271 | + | |
1272 | + ret2 = dio_send_cur_page(dio, &sdio, &map_bh); | |
1273 | + if (retval == 0) | |
1274 | + retval = ret2; | |
1275 | + page_cache_release(sdio.cur_page); | |
1276 | + sdio.cur_page = NULL; | |
1277 | + } | |
1278 | + if (sdio.bio) | |
1279 | + dio_bio_submit(dio, &sdio); | |
1280 | + | |
1281 | + /* | |
1282 | + * It is possible that, we return short IO due to end of file. | |
1283 | + * In that case, we need to release all the pages we got hold on. | |
1284 | + */ | |
1285 | + dio_cleanup(dio, &sdio); | |
1286 | + | |
1287 | + /* | |
1288 | + * All block lookups have been performed. For READ requests | |
1289 | + * we can let i_mutex go now that its achieved its purpose | |
1290 | + * of protecting us from looking up uninitialized blocks. | |
1291 | + */ | |
1292 | + if (rw == READ && (dio->flags & DIO_LOCKING)) | |
1293 | + mutex_unlock(&dio->inode->i_mutex); | |
1294 | + | |
1295 | + /* | |
1296 | + * The only time we want to leave bios in flight is when a successful | |
1297 | + * partial aio read or full aio write have been setup. In that case | |
1298 | + * bio completion will call aio_complete. The only time it's safe to | |
1299 | + * call aio_complete is when we return -EIOCBQUEUED, so we key on that. | |
1300 | + * This had *better* be the only place that raises -EIOCBQUEUED. | |
1301 | + */ | |
1302 | + BUG_ON(retval == -EIOCBQUEUED); | |
1303 | + if (dio->is_async && retval == 0 && dio->result && | |
1304 | + ((rw & READ) || (dio->result == sdio.size))) | |
1305 | + retval = -EIOCBQUEUED; | |
1306 | + | |
1307 | + if (retval != -EIOCBQUEUED) | |
1308 | + dio_await_completion(dio); | |
1309 | + | |
1310 | + if (drop_refcount(dio) == 0) { | |
1311 | + retval = dio_complete(dio, offset, retval, false); | |
1312 | + kmem_cache_free(dio_cache, dio); | |
1313 | + } else | |
1314 | + BUG_ON(retval != -EIOCBQUEUED); | |
1315 | + | |
1295 | 1316 | out: |
1296 | 1317 | return retval; |
1297 | 1318 | } |
1298 | 1319 | EXPORT_SYMBOL(__blockdev_direct_IO); |
1320 | + | |
1321 | +static __init int dio_init(void) | |
1322 | +{ | |
1323 | + dio_cache = KMEM_CACHE(dio, SLAB_PANIC); | |
1324 | + return 0; | |
1325 | +} | |
1326 | +module_init(dio_init) |
fs/ext4/file.c
... | ... | @@ -224,53 +224,8 @@ |
224 | 224 | maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; |
225 | 225 | else |
226 | 226 | maxbytes = inode->i_sb->s_maxbytes; |
227 | - mutex_lock(&inode->i_mutex); | |
228 | - switch (origin) { | |
229 | - case SEEK_END: | |
230 | - offset += inode->i_size; | |
231 | - break; | |
232 | - case SEEK_CUR: | |
233 | - if (offset == 0) { | |
234 | - mutex_unlock(&inode->i_mutex); | |
235 | - return file->f_pos; | |
236 | - } | |
237 | - offset += file->f_pos; | |
238 | - break; | |
239 | - case SEEK_DATA: | |
240 | - /* | |
241 | - * In the generic case the entire file is data, so as long as | |
242 | - * offset isn't at the end of the file then the offset is data. | |
243 | - */ | |
244 | - if (offset >= inode->i_size) { | |
245 | - mutex_unlock(&inode->i_mutex); | |
246 | - return -ENXIO; | |
247 | - } | |
248 | - break; | |
249 | - case SEEK_HOLE: | |
250 | - /* | |
251 | - * There is a virtual hole at the end of the file, so as long as | |
252 | - * offset isn't i_size or larger, return i_size. | |
253 | - */ | |
254 | - if (offset >= inode->i_size) { | |
255 | - mutex_unlock(&inode->i_mutex); | |
256 | - return -ENXIO; | |
257 | - } | |
258 | - offset = inode->i_size; | |
259 | - break; | |
260 | - } | |
261 | 227 | |
262 | - if (offset < 0 || offset > maxbytes) { | |
263 | - mutex_unlock(&inode->i_mutex); | |
264 | - return -EINVAL; | |
265 | - } | |
266 | - | |
267 | - if (offset != file->f_pos) { | |
268 | - file->f_pos = offset; | |
269 | - file->f_version = 0; | |
270 | - } | |
271 | - mutex_unlock(&inode->i_mutex); | |
272 | - | |
273 | - return offset; | |
228 | + return generic_file_llseek_size(file, offset, origin, maxbytes); | |
274 | 229 | } |
275 | 230 | |
276 | 231 | const struct file_operations ext4_file_operations = { |
fs/gfs2/file.c
... | ... | @@ -66,13 +66,13 @@ |
66 | 66 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, |
67 | 67 | &i_gh); |
68 | 68 | if (!error) { |
69 | - error = generic_file_llseek_unlocked(file, offset, origin); | |
69 | + error = generic_file_llseek(file, offset, origin); | |
70 | 70 | gfs2_glock_dq_uninit(&i_gh); |
71 | 71 | } |
72 | 72 | break; |
73 | 73 | case SEEK_CUR: |
74 | 74 | case SEEK_SET: |
75 | - error = generic_file_llseek_unlocked(file, offset, origin); | |
75 | + error = generic_file_llseek(file, offset, origin); | |
76 | 76 | break; |
77 | 77 | default: |
78 | 78 | error = -EINVAL; |
fs/inode.c
fs/namei.c
... | ... | @@ -221,14 +221,12 @@ |
221 | 221 | } |
222 | 222 | |
223 | 223 | /* |
224 | - * This does basic POSIX ACL permission checking | |
224 | + * This does the basic permission checking | |
225 | 225 | */ |
226 | 226 | static int acl_permission_check(struct inode *inode, int mask) |
227 | 227 | { |
228 | 228 | unsigned int mode = inode->i_mode; |
229 | 229 | |
230 | - mask &= MAY_READ | MAY_WRITE | MAY_EXEC | MAY_NOT_BLOCK; | |
231 | - | |
232 | 230 | if (current_user_ns() != inode_userns(inode)) |
233 | 231 | goto other_perms; |
234 | 232 | |
... | ... | @@ -257,7 +255,7 @@ |
257 | 255 | /** |
258 | 256 | * generic_permission - check for access rights on a Posix-like filesystem |
259 | 257 | * @inode: inode to check access rights for |
260 | - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) | |
258 | + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) | |
261 | 259 | * |
262 | 260 | * Used to check for read/write/execute permissions on a file. |
263 | 261 | * We use "fsuid" for this, letting us set arbitrary permissions |
... | ... | @@ -273,7 +271,7 @@ |
273 | 271 | int ret; |
274 | 272 | |
275 | 273 | /* |
276 | - * Do the basic POSIX ACL permission checks. | |
274 | + * Do the basic permission checks. | |
277 | 275 | */ |
278 | 276 | ret = acl_permission_check(inode, mask); |
279 | 277 | if (ret != -EACCES) |
280 | 278 | |
... | ... | @@ -331,12 +329,14 @@ |
331 | 329 | /** |
332 | 330 | * inode_permission - check for access rights to a given inode |
333 | 331 | * @inode: inode to check permission on |
334 | - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) | |
332 | + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...) | |
335 | 333 | * |
336 | 334 | * Used to check for read/write/execute permissions on an inode. |
337 | 335 | * We use "fsuid" for this, letting us set arbitrary permissions |
338 | 336 | * for filesystem access without changing the "normal" uids which |
339 | 337 | * are used for other things. |
338 | + * | |
339 | + * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. | |
340 | 340 | */ |
341 | 341 | int inode_permission(struct inode *inode, int mask) |
342 | 342 | { |
... | ... | @@ -2035,10 +2035,7 @@ |
2035 | 2035 | if (flag & O_NOATIME && !inode_owner_or_capable(inode)) |
2036 | 2036 | return -EPERM; |
2037 | 2037 | |
2038 | - /* | |
2039 | - * Ensure there are no outstanding leases on the file. | |
2040 | - */ | |
2041 | - return break_lease(inode, flag); | |
2038 | + return 0; | |
2042 | 2039 | } |
2043 | 2040 | |
2044 | 2041 | static int handle_truncate(struct file *filp) |
fs/namespace.c
fs/nfs/file.c
... | ... | @@ -180,8 +180,6 @@ |
180 | 180 | |
181 | 181 | static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) |
182 | 182 | { |
183 | - loff_t loff; | |
184 | - | |
185 | 183 | dprintk("NFS: llseek file(%s/%s, %lld, %d)\n", |
186 | 184 | filp->f_path.dentry->d_parent->d_name.name, |
187 | 185 | filp->f_path.dentry->d_name.name, |
188 | 186 | |
... | ... | @@ -197,13 +195,9 @@ |
197 | 195 | int retval = nfs_revalidate_file_size(inode, filp); |
198 | 196 | if (retval < 0) |
199 | 197 | return (loff_t)retval; |
198 | + } | |
200 | 199 | |
201 | - spin_lock(&inode->i_lock); | |
202 | - loff = generic_file_llseek_unlocked(filp, offset, origin); | |
203 | - spin_unlock(&inode->i_lock); | |
204 | - } else | |
205 | - loff = generic_file_llseek_unlocked(filp, offset, origin); | |
206 | - return loff; | |
200 | + return generic_file_llseek(filp, offset, origin); | |
207 | 201 | } |
208 | 202 | |
209 | 203 | /* |
fs/open.c
fs/posix_acl.c
fs/read_write.c
... | ... | @@ -35,23 +35,45 @@ |
35 | 35 | return file->f_mode & FMODE_UNSIGNED_OFFSET; |
36 | 36 | } |
37 | 37 | |
38 | +static loff_t lseek_execute(struct file *file, struct inode *inode, | |
39 | + loff_t offset, loff_t maxsize) | |
40 | +{ | |
41 | + if (offset < 0 && !unsigned_offsets(file)) | |
42 | + return -EINVAL; | |
43 | + if (offset > maxsize) | |
44 | + return -EINVAL; | |
45 | + | |
46 | + if (offset != file->f_pos) { | |
47 | + file->f_pos = offset; | |
48 | + file->f_version = 0; | |
49 | + } | |
50 | + return offset; | |
51 | +} | |
52 | + | |
38 | 53 | /** |
39 | - * generic_file_llseek_unlocked - lockless generic llseek implementation | |
54 | + * generic_file_llseek_size - generic llseek implementation for regular files | |
40 | 55 | * @file: file structure to seek on |
41 | 56 | * @offset: file offset to seek to |
42 | 57 | * @origin: type of seek |
58 | + * @size: max size of file system | |
43 | 59 | * |
44 | - * Updates the file offset to the value specified by @offset and @origin. | |
45 | - * Locking must be provided by the caller. | |
60 | + * This is a variant of generic_file_llseek that allows passing in a custom | |
61 | + * file size. | |
62 | + * | |
63 | + * Synchronization: | |
64 | + * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) | |
65 | + * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. | |
66 | + * read/writes behave like SEEK_SET against seeks. | |
46 | 67 | */ |
47 | 68 | loff_t |
48 | -generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) | |
69 | +generic_file_llseek_size(struct file *file, loff_t offset, int origin, | |
70 | + loff_t maxsize) | |
49 | 71 | { |
50 | 72 | struct inode *inode = file->f_mapping->host; |
51 | 73 | |
52 | 74 | switch (origin) { |
53 | 75 | case SEEK_END: |
54 | - offset += inode->i_size; | |
76 | + offset += i_size_read(inode); | |
55 | 77 | break; |
56 | 78 | case SEEK_CUR: |
57 | 79 | /* |
58 | 80 | |
... | ... | @@ -62,14 +84,22 @@ |
62 | 84 | */ |
63 | 85 | if (offset == 0) |
64 | 86 | return file->f_pos; |
65 | - offset += file->f_pos; | |
66 | - break; | |
87 | + /* | |
88 | + * f_lock protects against read/modify/write race with other | |
89 | + * SEEK_CURs. Note that parallel writes and reads behave | |
90 | + * like SEEK_SET. | |
91 | + */ | |
92 | + spin_lock(&file->f_lock); | |
93 | + offset = lseek_execute(file, inode, file->f_pos + offset, | |
94 | + maxsize); | |
95 | + spin_unlock(&file->f_lock); | |
96 | + return offset; | |
67 | 97 | case SEEK_DATA: |
68 | 98 | /* |
69 | 99 | * In the generic case the entire file is data, so as long as |
70 | 100 | * offset isn't at the end of the file then the offset is data. |
71 | 101 | */ |
72 | - if (offset >= inode->i_size) | |
102 | + if (offset >= i_size_read(inode)) | |
73 | 103 | return -ENXIO; |
74 | 104 | break; |
75 | 105 | case SEEK_HOLE: |
76 | 106 | |
77 | 107 | |
78 | 108 | |
... | ... | @@ -77,26 +107,15 @@ |
77 | 107 | * There is a virtual hole at the end of the file, so as long as |
78 | 108 | * offset isn't i_size or larger, return i_size. |
79 | 109 | */ |
80 | - if (offset >= inode->i_size) | |
110 | + if (offset >= i_size_read(inode)) | |
81 | 111 | return -ENXIO; |
82 | - offset = inode->i_size; | |
112 | + offset = i_size_read(inode); | |
83 | 113 | break; |
84 | 114 | } |
85 | 115 | |
86 | - if (offset < 0 && !unsigned_offsets(file)) | |
87 | - return -EINVAL; | |
88 | - if (offset > inode->i_sb->s_maxbytes) | |
89 | - return -EINVAL; | |
90 | - | |
91 | - /* Special lock needed here? */ | |
92 | - if (offset != file->f_pos) { | |
93 | - file->f_pos = offset; | |
94 | - file->f_version = 0; | |
95 | - } | |
96 | - | |
97 | - return offset; | |
116 | + return lseek_execute(file, inode, offset, maxsize); | |
98 | 117 | } |
99 | -EXPORT_SYMBOL(generic_file_llseek_unlocked); | |
118 | +EXPORT_SYMBOL(generic_file_llseek_size); | |
100 | 119 | |
101 | 120 | /** |
102 | 121 | * generic_file_llseek - generic llseek implementation for regular files |
103 | 122 | |
... | ... | @@ -110,13 +129,10 @@ |
110 | 129 | */ |
111 | 130 | loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) |
112 | 131 | { |
113 | - loff_t rval; | |
132 | + struct inode *inode = file->f_mapping->host; | |
114 | 133 | |
115 | - mutex_lock(&file->f_dentry->d_inode->i_mutex); | |
116 | - rval = generic_file_llseek_unlocked(file, offset, origin); | |
117 | - mutex_unlock(&file->f_dentry->d_inode->i_mutex); | |
118 | - | |
119 | - return rval; | |
134 | + return generic_file_llseek_size(file, offset, origin, | |
135 | + inode->i_sb->s_maxbytes); | |
120 | 136 | } |
121 | 137 | EXPORT_SYMBOL(generic_file_llseek); |
122 | 138 |
include/linux/fs.h
... | ... | @@ -58,14 +58,15 @@ |
58 | 58 | |
59 | 59 | #define NR_FILE 8192 /* this can well be larger on a larger system */ |
60 | 60 | |
61 | -#define MAY_EXEC 1 | |
62 | -#define MAY_WRITE 2 | |
63 | -#define MAY_READ 4 | |
64 | -#define MAY_APPEND 8 | |
65 | -#define MAY_ACCESS 16 | |
66 | -#define MAY_OPEN 32 | |
67 | -#define MAY_CHDIR 64 | |
68 | -#define MAY_NOT_BLOCK 128 /* called from RCU mode, don't block */ | |
61 | +#define MAY_EXEC 0x00000001 | |
62 | +#define MAY_WRITE 0x00000002 | |
63 | +#define MAY_READ 0x00000004 | |
64 | +#define MAY_APPEND 0x00000008 | |
65 | +#define MAY_ACCESS 0x00000010 | |
66 | +#define MAY_OPEN 0x00000020 | |
67 | +#define MAY_CHDIR 0x00000040 | |
68 | +/* called from RCU mode, don't block */ | |
69 | +#define MAY_NOT_BLOCK 0x00000080 | |
69 | 70 | |
70 | 71 | /* |
71 | 72 | * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond |
... | ... | @@ -963,7 +964,12 @@ |
963 | 964 | #define f_dentry f_path.dentry |
964 | 965 | #define f_vfsmnt f_path.mnt |
965 | 966 | const struct file_operations *f_op; |
966 | - spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */ | |
967 | + | |
968 | + /* | |
969 | + * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR. | |
970 | + * Must not be taken from IRQ context. | |
971 | + */ | |
972 | + spinlock_t f_lock; | |
967 | 973 | #ifdef CONFIG_SMP |
968 | 974 | int f_sb_list_cpu; |
969 | 975 | #endif |
... | ... | @@ -2401,8 +2407,8 @@ |
2401 | 2407 | extern loff_t noop_llseek(struct file *file, loff_t offset, int origin); |
2402 | 2408 | extern loff_t no_llseek(struct file *file, loff_t offset, int origin); |
2403 | 2409 | extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); |
2404 | -extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset, | |
2405 | - int origin); | |
2410 | +extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, | |
2411 | + int origin, loff_t maxsize); | |
2406 | 2412 | extern int generic_file_open(struct inode * inode, struct file * filp); |
2407 | 2413 | extern int nonseekable_open(struct inode * inode, struct file * filp); |
2408 | 2414 |
mm/filemap.c
... | ... | @@ -2115,6 +2115,7 @@ |
2115 | 2115 | } else { |
2116 | 2116 | const struct iovec *iov = i->iov; |
2117 | 2117 | size_t base = i->iov_offset; |
2118 | + unsigned long nr_segs = i->nr_segs; | |
2118 | 2119 | |
2119 | 2120 | /* |
2120 | 2121 | * The !iov->iov_len check ensures we skip over unlikely |
2121 | 2122 | |
... | ... | @@ -2130,11 +2131,13 @@ |
2130 | 2131 | base += copy; |
2131 | 2132 | if (iov->iov_len == base) { |
2132 | 2133 | iov++; |
2134 | + nr_segs--; | |
2133 | 2135 | base = 0; |
2134 | 2136 | } |
2135 | 2137 | } |
2136 | 2138 | i->iov = iov; |
2137 | 2139 | i->iov_offset = base; |
2140 | + i->nr_segs = nr_segs; | |
2138 | 2141 | } |
2139 | 2142 | } |
2140 | 2143 | EXPORT_SYMBOL(iov_iter_advance); |