Commit 7c5465d6ccd759caa959828e2add5603518dafc4
Committed by Trond Myklebust
1 parent: c0411a94a8
Exists in master and 6 other branches
pnfsblock: alloc short extent before submit bio
As discussed earlier, it is better for the block client to allocate memory for tracking extent state before submitting the bio. This patch does so by allocating a short extent for every INVALID extent touched by the write pagelist, and for every zeroing page we create, saving them in the layout header. Then, in end_io, we can simply use them to create commit list items and avoid memory allocation there.

Signed-off-by: Peng Tao <peng_tao@emc.com>
Signed-off-by: Benny Halevy <bhalevy@tonian.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
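To make the allocation lifecycle concrete, here is a minimal standalone sketch of the pattern the patch introduces. It is a simplified model, not the kernel code: `struct extent_pool`, `pool_push()`, `pool_pop()`, and `pool_free_n()` are hypothetical stand-ins for the `im_extents` list and the `bl_push_one_short_extent()`, `bl_pop_one_short_extent()`, and `bl_free_short_extents()` helpers added below.

```c
/* Simplified model of the pre-allocation pattern introduced by this
 * patch; pool_push/pool_pop/pool_free_n are hypothetical stand-ins for
 * bl_push_one_short_extent/bl_pop_one_short_extent/bl_free_short_extents.
 */
#include <stdlib.h>

struct short_extent {
	struct short_extent *next;
	/* offset/length/device fields omitted in this sketch */
};

struct extent_pool {
	struct short_extent *head;	/* models marks->im_extents */
};

/* Before submitting a bio: allocate while allocation is still safe. */
static int pool_push(struct extent_pool *p)
{
	struct short_extent *se = malloc(sizeof(*se));

	if (!se)
		return -1;	/* caller falls back (e.g. write through MDS) */
	se->next = p->head;
	p->head = se;
	return 0;
}

/* In the completion path: consume a pre-allocated entry, never allocate. */
static struct short_extent *pool_pop(struct extent_pool *p)
{
	struct short_extent *se = p->head;

	if (se)
		p->head = se->next;
	return se;
}

/* On error: release the entries that were reserved but never consumed. */
static void pool_free_n(struct extent_pool *p, int n)
{
	struct short_extent *se;

	while (n-- > 0 && (se = pool_pop(p)) != NULL)
		free(se);
}
```

The point of the split is context: pushes happen in `bl_write_pagelist()`, where `GFP_NOFS` allocation is safe, while pops happen on the completion side, where the commit message notes allocation should be avoided.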
Showing 3 changed files with 131 additions and 37 deletions
fs/nfs/blocklayout/blocklayout.c
... | ... | @@ -90,8 +90,9 @@ |
90 | 90 | */ |
91 | 91 | struct parallel_io { |
92 | 92 | struct kref refcnt; |
93 | - void (*pnfs_callback) (void *data); | |
93 | + void (*pnfs_callback) (void *data, int num_se); | |
94 | 94 | void *data; |
95 | + int bse_count; | |
95 | 96 | }; |
96 | 97 | |
97 | 98 | static inline struct parallel_io *alloc_parallel(void *data) |
... | ... | @@ -102,6 +103,7 @@ |
102 | 103 | if (rv) { |
103 | 104 | rv->data = data; |
104 | 105 | kref_init(&rv->refcnt); |
106 | + rv->bse_count = 0; | |
105 | 107 | } |
106 | 108 | return rv; |
107 | 109 | } |
... | ... | @@ -116,7 +118,7 @@ |
116 | 118 | struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); |
117 | 119 | |
118 | 120 | dprintk("%s enter\n", __func__); |
119 | - p->pnfs_callback(p->data); | |
121 | + p->pnfs_callback(p->data, p->bse_count); | |
120 | 122 | kfree(p); |
121 | 123 | } |
122 | 124 | |
... | ... | @@ -216,7 +218,7 @@ |
216 | 218 | } |
217 | 219 | |
218 | 220 | static void |
219 | -bl_end_par_io_read(void *data) | |
221 | +bl_end_par_io_read(void *data, int unused) | |
220 | 222 | { |
221 | 223 | struct nfs_read_data *rdata = data; |
222 | 224 | |
... | ... | @@ -317,6 +319,7 @@ |
317 | 319 | { |
318 | 320 | sector_t isect, end; |
319 | 321 | struct pnfs_block_extent *be; |
322 | + struct pnfs_block_short_extent *se; | |
320 | 323 | |
321 | 324 | dprintk("%s(%llu, %u)\n", __func__, offset, count); |
322 | 325 | if (count == 0) |
... | ... | @@ -329,8 +332,11 @@ |
329 | 332 | be = bl_find_get_extent(bl, isect, NULL); |
330 | 333 | BUG_ON(!be); /* FIXME */ |
331 | 334 | len = min(end, be->be_f_offset + be->be_length) - isect; |
332 | - if (be->be_state == PNFS_BLOCK_INVALID_DATA) | |
333 | - bl_mark_for_commit(be, isect, len); /* What if fails? */ | |
335 | + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | |
336 | + se = bl_pop_one_short_extent(be->be_inval); | |
337 | + BUG_ON(!se); | |
338 | + bl_mark_for_commit(be, isect, len, se); | |
339 | + } | |
334 | 340 | isect += len; |
335 | 341 | bl_put_extent(be); |
336 | 342 | } |
... | ... | @@ -352,7 +358,8 @@ |
352 | 358 | end_page_writeback(page); |
353 | 359 | page_cache_release(page); |
354 | 360 | } while (bvec >= bio->bi_io_vec); |
355 | - if (!uptodate) { | |
361 | + | |
362 | + if (unlikely(!uptodate)) { | |
356 | 363 | if (!wdata->pnfs_error) |
357 | 364 | wdata->pnfs_error = -EIO; |
358 | 365 | pnfs_set_lo_fail(wdata->lseg); |
... | ... | @@ -361,7 +368,6 @@ |
361 | 368 | put_parallel(par); |
362 | 369 | } |
363 | 370 | |
364 | -/* This is basically copied from mpage_end_io_read */ | |
365 | 371 | static void bl_end_io_write(struct bio *bio, int err) |
366 | 372 | { |
367 | 373 | struct parallel_io *par = bio->bi_private; |
... | ... | @@ -387,7 +393,7 @@ |
387 | 393 | dprintk("%s enter\n", __func__); |
388 | 394 | task = container_of(work, struct rpc_task, u.tk_work); |
389 | 395 | wdata = container_of(task, struct nfs_write_data, task); |
390 | - if (!wdata->pnfs_error) { | |
396 | + if (likely(!wdata->pnfs_error)) { | |
391 | 397 | /* Marks for LAYOUTCOMMIT */ |
392 | 398 | mark_extents_written(BLK_LSEG2EXT(wdata->lseg), |
393 | 399 | wdata->args.offset, wdata->args.count); |
394 | 400 | |
... | ... | @@ -396,10 +402,15 @@ |
396 | 402 | } |
397 | 403 | |
398 | 404 | /* Called when last of bios associated with a bl_write_pagelist call finishes */ |
399 | -static void bl_end_par_io_write(void *data) | |
405 | +static void bl_end_par_io_write(void *data, int num_se) | |
400 | 406 | { |
401 | 407 | struct nfs_write_data *wdata = data; |
402 | 408 | |
409 | + if (unlikely(wdata->pnfs_error)) { | |
410 | + bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval, | |
411 | + num_se); | |
412 | + } | |
413 | + | |
403 | 414 | wdata->task.tk_status = wdata->pnfs_error; |
404 | 415 | wdata->verf.committed = NFS_FILE_SYNC; |
405 | 416 | INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); |
... | ... | @@ -552,7 +563,7 @@ |
552 | 563 | */ |
553 | 564 | par = alloc_parallel(wdata); |
554 | 565 | if (!par) |
555 | - return PNFS_NOT_ATTEMPTED; | |
566 | + goto out_mds; | |
556 | 567 | par->pnfs_callback = bl_end_par_io_write; |
557 | 568 | /* At this point, have to be more careful with error handling */ |
558 | 569 | |
559 | 570 | |
... | ... | @@ -560,12 +571,15 @@ |
560 | 571 | be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); |
561 | 572 | if (!be || !is_writable(be, isect)) { |
562 | 573 | dprintk("%s no matching extents!\n", __func__); |
563 | - wdata->pnfs_error = -EINVAL; | |
564 | - goto out; | |
574 | + goto out_mds; | |
565 | 575 | } |
566 | 576 | |
567 | 577 | /* First page inside INVALID extent */ |
568 | 578 | if (be->be_state == PNFS_BLOCK_INVALID_DATA) { |
579 | + if (likely(!bl_push_one_short_extent(be->be_inval))) | |
580 | + par->bse_count++; | |
581 | + else | |
582 | + goto out_mds; | |
569 | 583 | temp = offset >> PAGE_CACHE_SHIFT; |
570 | 584 | npg_zero = do_div(temp, npg_per_block); |
571 | 585 | isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & |
... | ... | @@ -603,6 +617,19 @@ |
603 | 617 | wdata->pnfs_error = ret; |
604 | 618 | goto out; |
605 | 619 | } |
620 | + if (likely(!bl_push_one_short_extent(be->be_inval))) | |
621 | + par->bse_count++; | |
622 | + else { | |
623 | + end_page_writeback(page); | |
624 | + page_cache_release(page); | |
625 | + wdata->pnfs_error = -ENOMEM; | |
626 | + goto out; | |
627 | + } | |
628 | + /* FIXME: This should be done in bi_end_io */ | |
629 | + mark_extents_written(BLK_LSEG2EXT(wdata->lseg), | |
630 | + page->index << PAGE_CACHE_SHIFT, | |
631 | + PAGE_CACHE_SIZE); | |
632 | + | |
606 | 633 | bio = bl_add_page_to_bio(bio, npg_zero, WRITE, |
607 | 634 | isect, page, be, |
608 | 635 | bl_end_io_write_zero, par); |
... | ... | @@ -611,10 +638,6 @@ |
611 | 638 | bio = NULL; |
612 | 639 | goto out; |
613 | 640 | } |
614 | - /* FIXME: This should be done in bi_end_io */ | |
615 | - mark_extents_written(BLK_LSEG2EXT(wdata->lseg), | |
616 | - page->index << PAGE_CACHE_SHIFT, | |
617 | - PAGE_CACHE_SIZE); | |
618 | 641 | next_page: |
619 | 642 | isect += PAGE_CACHE_SECTORS; |
620 | 643 | extent_length -= PAGE_CACHE_SECTORS; |
... | ... | @@ -638,6 +661,15 @@ |
638 | 661 | wdata->pnfs_error = -EINVAL; |
639 | 662 | goto out; |
640 | 663 | } |
664 | + if (be->be_state == PNFS_BLOCK_INVALID_DATA) { | |
665 | + if (likely(!bl_push_one_short_extent( | |
666 | + be->be_inval))) | |
667 | + par->bse_count++; | |
668 | + else { | |
669 | + wdata->pnfs_error = -ENOMEM; | |
670 | + goto out; | |
671 | + } | |
672 | + } | |
641 | 673 | extent_length = be->be_length - |
642 | 674 | (isect - be->be_f_offset); |
643 | 675 | } |
... | ... | @@ -685,6 +717,10 @@ |
685 | 717 | bl_submit_bio(WRITE, bio); |
686 | 718 | put_parallel(par); |
687 | 719 | return PNFS_ATTEMPTED; |
720 | +out_mds: | |
721 | + bl_put_extent(be); | |
722 | + kfree(par); | |
723 | + return PNFS_NOT_ATTEMPTED; | |
688 | 724 | } |
689 | 725 | |
690 | 726 | /* FIXME - range ignored */ |
691 | 727 | |
... | ... | @@ -711,10 +747,16 @@ |
711 | 747 | release_inval_marks(struct pnfs_inval_markings *marks) |
712 | 748 | { |
713 | 749 | struct pnfs_inval_tracking *pos, *temp; |
750 | + struct pnfs_block_short_extent *se, *stemp; | |
714 | 751 | |
715 | 752 | list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { |
716 | 753 | list_del(&pos->it_link); |
717 | 754 | kfree(pos); |
755 | + } | |
756 | + | |
757 | + list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) { | |
758 | + list_del(&se->bse_node); | |
759 | + kfree(se); | |
718 | 760 | } |
719 | 761 | return; |
720 | 762 | } |
fs/nfs/blocklayout/blocklayout.h
... | ... | @@ -70,6 +70,7 @@ |
70 | 70 | spinlock_t im_lock; |
71 | 71 | struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ |
72 | 72 | sector_t im_block_size; /* Server blocksize in sectors */ |
73 | + struct list_head im_extents; /* Short extents for INVAL->RW conversion */ | |
73 | 74 | }; |
74 | 75 | |
75 | 76 | struct pnfs_inval_tracking { |
... | ... | @@ -105,6 +106,7 @@ |
105 | 106 | { |
106 | 107 | spin_lock_init(&marks->im_lock); |
107 | 108 | INIT_LIST_HEAD(&marks->im_tree.mtt_stub); |
109 | + INIT_LIST_HEAD(&marks->im_extents); | |
108 | 110 | marks->im_block_size = blocksize; |
109 | 111 | marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, |
110 | 112 | blocksize); |
... | ... | @@ -199,7 +201,12 @@ |
199 | 201 | int bl_add_merge_extent(struct pnfs_block_layout *bl, |
200 | 202 | struct pnfs_block_extent *new); |
201 | 203 | int bl_mark_for_commit(struct pnfs_block_extent *be, |
202 | - sector_t offset, sector_t length); | |
204 | + sector_t offset, sector_t length, | |
205 | + struct pnfs_block_short_extent *new); | |
206 | +int bl_push_one_short_extent(struct pnfs_inval_markings *marks); | |
207 | +struct pnfs_block_short_extent * | |
208 | +bl_pop_one_short_extent(struct pnfs_inval_markings *marks); | |
209 | +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free); | |
203 | 210 | |
204 | 211 | #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ |
fs/nfs/blocklayout/extents.c
... | ... | @@ -157,10 +157,10 @@ |
157 | 157 | goto out_cleanup; |
158 | 158 | } |
159 | 159 | |
160 | - spin_lock(&marks->im_lock); | |
160 | + spin_lock_bh(&marks->im_lock); | |
161 | 161 | for (s = start; s < end; s += tree->mtt_step_size) |
162 | 162 | used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); |
163 | - spin_unlock(&marks->im_lock); | |
163 | + spin_unlock_bh(&marks->im_lock); | |
164 | 164 | |
165 | 165 | status = 0; |
166 | 166 | |
167 | 167 | |
... | ... | @@ -179,9 +179,9 @@ |
179 | 179 | { |
180 | 180 | int rv; |
181 | 181 | |
182 | - spin_lock(&marks->im_lock); | |
182 | + spin_lock_bh(&marks->im_lock); | |
183 | 183 | rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); |
184 | - spin_unlock(&marks->im_lock); | |
184 | + spin_unlock_bh(&marks->im_lock); | |
185 | 185 | return rv; |
186 | 186 | } |
187 | 187 | |
188 | 188 | |
... | ... | @@ -221,9 +221,9 @@ |
221 | 221 | { |
222 | 222 | int rv; |
223 | 223 | |
224 | - spin_lock(&marks->im_lock); | |
224 | + spin_lock_bh(&marks->im_lock); | |
225 | 225 | rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); |
226 | - spin_unlock(&marks->im_lock); | |
226 | + spin_unlock_bh(&marks->im_lock); | |
227 | 227 | return rv; |
228 | 228 | } |
229 | 229 | |
230 | 230 | |
231 | 231 | |
... | ... | @@ -244,15 +244,15 @@ |
244 | 244 | if (_preload_range(marks, start, end - start)) |
245 | 245 | goto outerr; |
246 | 246 | |
247 | - spin_lock(&marks->im_lock); | |
247 | + spin_lock_bh(&marks->im_lock); | |
248 | 248 | if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) |
249 | 249 | goto out_unlock; |
250 | - spin_unlock(&marks->im_lock); | |
250 | + spin_unlock_bh(&marks->im_lock); | |
251 | 251 | |
252 | 252 | return 0; |
253 | 253 | |
254 | 254 | out_unlock: |
255 | - spin_unlock(&marks->im_lock); | |
255 | + spin_unlock_bh(&marks->im_lock); | |
256 | 256 | outerr: |
257 | 257 | return -ENOMEM; |
258 | 258 | } |
259 | 259 | |
... | ... | @@ -267,9 +267,9 @@ |
267 | 267 | |
268 | 268 | dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, |
269 | 269 | (u64)offset, (u64)length); |
270 | - spin_lock(&marks->im_lock); | |
270 | + spin_lock_bh(&marks->im_lock); | |
271 | 271 | status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); |
272 | - spin_unlock(&marks->im_lock); | |
272 | + spin_unlock_bh(&marks->im_lock); | |
273 | 273 | return status; |
274 | 274 | } |
275 | 275 | |
276 | 276 | |
277 | 277 | |
278 | 278 | |
... | ... | @@ -369,20 +369,18 @@ |
369 | 369 | |
370 | 370 | /* Note the range described by offset, length is guaranteed to be contained |
371 | 371 | * within be. |
372 | + * new will be freed, either by this function or add_to_commitlist if they | |
373 | + * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist. | |
372 | 374 | */ |
373 | 375 | int bl_mark_for_commit(struct pnfs_block_extent *be, |
374 | - sector_t offset, sector_t length) | |
376 | + sector_t offset, sector_t length, | |
377 | + struct pnfs_block_short_extent *new) | |
375 | 378 | { |
376 | 379 | sector_t new_end, end = offset + length; |
377 | - struct pnfs_block_short_extent *new; | |
378 | 380 | struct pnfs_block_layout *bl = container_of(be->be_inval, |
379 | 381 | struct pnfs_block_layout, |
380 | 382 | bl_inval); |
381 | 383 | |
382 | - new = kmalloc(sizeof(*new), GFP_NOFS); | |
383 | - if (!new) | |
384 | - return -ENOMEM; | |
385 | - | |
386 | 384 | mark_written_sectors(be->be_inval, offset, length); |
387 | 385 | /* We want to add the range to commit list, but it must be |
388 | 386 | * block-normalized, and verified that the normalized range has |
... | ... | @@ -412,9 +410,6 @@ |
412 | 410 | new->bse_mdev = be->be_mdev; |
413 | 411 | |
414 | 412 | spin_lock(&bl->bl_ext_lock); |
415 | - /* new will be freed, either by add_to_commitlist if it decides not | |
416 | - * to use it, or after LAYOUTCOMMIT uses it in the commitlist. | |
417 | - */ | |
418 | 413 | add_to_commitlist(bl, new); |
419 | 414 | spin_unlock(&bl->bl_ext_lock); |
420 | 415 | return 0; |
... | ... | @@ -861,5 +856,55 @@ |
861 | 856 | spin_unlock(&bl->bl_ext_lock); |
862 | 857 | } |
863 | 858 | } |
859 | +} | |
860 | + | |
861 | +int bl_push_one_short_extent(struct pnfs_inval_markings *marks) | |
862 | +{ | |
863 | + struct pnfs_block_short_extent *new; | |
864 | + | |
865 | + new = kmalloc(sizeof(*new), GFP_NOFS); | |
866 | + if (unlikely(!new)) | |
867 | + return -ENOMEM; | |
868 | + | |
869 | + spin_lock_bh(&marks->im_lock); | |
870 | + list_add(&new->bse_node, &marks->im_extents); | |
871 | + spin_unlock_bh(&marks->im_lock); | |
872 | + | |
873 | + return 0; | |
874 | +} | |
875 | + | |
876 | +struct pnfs_block_short_extent * | |
877 | +bl_pop_one_short_extent(struct pnfs_inval_markings *marks) | |
878 | +{ | |
879 | + struct pnfs_block_short_extent *rv = NULL; | |
880 | + | |
881 | + spin_lock_bh(&marks->im_lock); | |
882 | + if (!list_empty(&marks->im_extents)) { | |
883 | + rv = list_entry((&marks->im_extents)->next, | |
884 | + struct pnfs_block_short_extent, bse_node); | |
885 | + list_del_init(&rv->bse_node); | |
886 | + } | |
887 | + spin_unlock_bh(&marks->im_lock); | |
888 | + | |
889 | + return rv; | |
890 | +} | |
891 | + | |
892 | +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free) | |
893 | +{ | |
894 | + struct pnfs_block_short_extent *se = NULL, *tmp; | |
895 | + | |
896 | + if (num_to_free <= 0) | |
897 | + return; | |
898 | + | |
899 | + spin_lock(&marks->im_lock); | |
900 | + list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) { | |
901 | + list_del(&se->bse_node); | |
902 | + kfree(se); | |
903 | + if (--num_to_free == 0) | |
904 | + break; | |
905 | + } | |
906 | + spin_unlock(&marks->im_lock); | |
907 | + | |
908 | + BUG_ON(num_to_free > 0); | |
864 | 909 | } |
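A note on the locking changes in extents.c: the `spin_lock` to `spin_lock_bh` conversions are presumably needed because `im_lock` can now be taken from the bio completion path (`bl_end_par_io_write()` calling `bl_free_short_extents()`), so process-context holders must keep bottom halves disabled to avoid deadlock. Callers of the three new helpers follow a reserve/consume/rollback discipline; the sketch below illustrates it, reusing the hypothetical pool helpers from the sketch after the commit message (again a simplified model, not the kernel code):

```c
/* Illustrative reserve/consume/rollback driver, reusing the
 * hypothetical pool_push()/pool_free_n() helpers sketched earlier.
 * Error handling is condensed; the bio plumbing is omitted.
 */
static int write_pagelist_model(struct extent_pool *pool, int npages)
{
	int reserved = 0;

	/* Reserve: one entry per page that needs commit tracking, done
	 * up front while allocation is still permitted. */
	while (reserved < npages) {
		if (pool_push(pool))
			goto rollback;	/* e.g. fall back to the MDS */
		reserved++;
	}

	/* ... submit bios; each completion consumes entries with
	 * pool_pop() instead of allocating ... */
	return 0;

rollback:
	/* Error during setup: free only the entries reserved here that
	 * will never be consumed. */
	pool_free_n(pool, reserved);
	return -1;
}
```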