Commit 7c5465d6ccd759caa959828e2add5603518dafc4

Authored by Peng Tao
Committed by Trond Myklebust
1 parent c0411a94a8

pnfsblock: alloc short extent before submit bio

As discussed earlier, it is better for the block client to allocate the memory used
to track extent state before submitting a bio. This patch does so by allocating a
short_extent for every INVALID extent touched by the write pagelist and for
every zeroing page we create, saving them in the layout header. Then in end_io we
can simply use them to build the commit list and avoid memory allocation there.

Signed-off-by: Peng Tao <peng_tao@emc.com>
Signed-off-by: Benny Halevy <bhalevy@tonian.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
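
Editorially, the heart of the patch is a reserve/consume/rollback pattern: memory for
commit-list tracking is reserved with bl_push_one_short_extent() at submission time,
while failing over to the MDS (out_mds) is still possible, and counted in par->bse_count;
the bio completion path then consumes reservations with bl_pop_one_short_extent()
instead of allocating, and bl_free_short_extents() rolls back unused reservations on
error. Below is a minimal userspace C sketch of that pattern, not the kernel code: a
pthread mutex stands in for im_lock (taken with spin_lock_bh in the patch), the
short_extent here carries no real extent fields, and the main() driver is purely
illustrative.

	/*
	 * Minimal userspace model of the reserve/consume/rollback pattern in
	 * this patch.  Assumptions: a pthread mutex replaces
	 * spin_lock_bh(&marks->im_lock), and the list is a simple singly
	 * linked stack rather than a kernel list_head.
	 */
	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct short_extent {
		struct short_extent *next;
	};

	struct inval_markings {
		pthread_mutex_t lock;		/* models marks->im_lock */
		struct short_extent *extents;	/* models marks->im_extents */
	};

	/* Models bl_push_one_short_extent(): the only step that can fail,
	 * and it runs before any bio is submitted. */
	static int push_one_short_extent(struct inval_markings *m)
	{
		struct short_extent *new = malloc(sizeof(*new));

		if (!new)
			return -1;
		pthread_mutex_lock(&m->lock);
		new->next = m->extents;
		m->extents = new;
		pthread_mutex_unlock(&m->lock);
		return 0;
	}

	/* Models bl_pop_one_short_extent(): never allocates, so it is safe
	 * to call from a completion path that is not allowed to fail. */
	static struct short_extent *pop_one_short_extent(struct inval_markings *m)
	{
		struct short_extent *se;

		pthread_mutex_lock(&m->lock);
		se = m->extents;
		if (se)
			m->extents = se->next;
		pthread_mutex_unlock(&m->lock);
		return se;
	}

	/* Models bl_free_short_extents(): releases up to num unused
	 * reservations, as the bl_end_par_io_write() error path does. */
	static void free_short_extents(struct inval_markings *m, int num)
	{
		struct short_extent *se;

		while (num-- > 0 && (se = pop_one_short_extent(m)) != NULL)
			free(se);
	}

	int main(void)
	{
		struct inval_markings m = { PTHREAD_MUTEX_INITIALIZER, NULL };
		int bse_count = 0;	/* models par->bse_count */

		/* Submission: one reservation per INVALID extent/zeroing page. */
		for (int i = 0; i < 4; i++)
			if (push_one_short_extent(&m) == 0)
				bse_count++;

		/* Completion: consume a reservation; no allocation, no failure. */
		struct short_extent *se = pop_one_short_extent(&m);
		printf("consumed one reservation (%p), %d were made\n",
		       (void *)se, bse_count);
		free(se);

		/* Error path: roll back reservations that were never consumed. */
		free_short_extents(&m, bse_count - 1);
		return 0;
	}

The point the sketch makes is the same one the patch makes: pop and free can never
fail, so the completion path needs no allocation error handling; any failure is pushed
back to submission time, where falling back to the MDS is still an option.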

Showing 3 changed files with 131 additions and 37 deletions

fs/nfs/blocklayout/blocklayout.c
... ... @@ -90,8 +90,9 @@
90 90 */
91 91 struct parallel_io {
92 92 struct kref refcnt;
93   - void (*pnfs_callback) (void *data);
  93 + void (*pnfs_callback) (void *data, int num_se);
94 94 void *data;
  95 + int bse_count;
95 96 };
96 97  
97 98 static inline struct parallel_io *alloc_parallel(void *data)
... ... @@ -102,6 +103,7 @@
102 103 if (rv) {
103 104 rv->data = data;
104 105 kref_init(&rv->refcnt);
  106 + rv->bse_count = 0;
105 107 }
106 108 return rv;
107 109 }
... ... @@ -116,7 +118,7 @@
116 118 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
117 119  
118 120 dprintk("%s enter\n", __func__);
119   - p->pnfs_callback(p->data);
  121 + p->pnfs_callback(p->data, p->bse_count);
120 122 kfree(p);
121 123 }
122 124  
... ... @@ -216,7 +218,7 @@
216 218 }
217 219  
218 220 static void
219   -bl_end_par_io_read(void *data)
  221 +bl_end_par_io_read(void *data, int unused)
220 222 {
221 223 struct nfs_read_data *rdata = data;
222 224  
... ... @@ -317,6 +319,7 @@
317 319 {
318 320 sector_t isect, end;
319 321 struct pnfs_block_extent *be;
  322 + struct pnfs_block_short_extent *se;
320 323  
321 324 dprintk("%s(%llu, %u)\n", __func__, offset, count);
322 325 if (count == 0)
... ... @@ -329,8 +332,11 @@
329 332 be = bl_find_get_extent(bl, isect, NULL);
330 333 BUG_ON(!be); /* FIXME */
331 334 len = min(end, be->be_f_offset + be->be_length) - isect;
332   - if (be->be_state == PNFS_BLOCK_INVALID_DATA)
333   - bl_mark_for_commit(be, isect, len); /* What if fails? */
  335 + if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
  336 + se = bl_pop_one_short_extent(be->be_inval);
  337 + BUG_ON(!se);
  338 + bl_mark_for_commit(be, isect, len, se);
  339 + }
334 340 isect += len;
335 341 bl_put_extent(be);
336 342 }
... ... @@ -352,7 +358,8 @@
352 358 end_page_writeback(page);
353 359 page_cache_release(page);
354 360 } while (bvec >= bio->bi_io_vec);
355   - if (!uptodate) {
  361 +
  362 + if (unlikely(!uptodate)) {
356 363 if (!wdata->pnfs_error)
357 364 wdata->pnfs_error = -EIO;
358 365 pnfs_set_lo_fail(wdata->lseg);
... ... @@ -361,7 +368,6 @@
361 368 put_parallel(par);
362 369 }
363 370  
364   -/* This is basically copied from mpage_end_io_read */
365 371 static void bl_end_io_write(struct bio *bio, int err)
366 372 {
367 373 struct parallel_io *par = bio->bi_private;
... ... @@ -387,7 +393,7 @@
387 393 dprintk("%s enter\n", __func__);
388 394 task = container_of(work, struct rpc_task, u.tk_work);
389 395 wdata = container_of(task, struct nfs_write_data, task);
390   - if (!wdata->pnfs_error) {
  396 + if (likely(!wdata->pnfs_error)) {
391 397 /* Marks for LAYOUTCOMMIT */
392 398 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
393 399 wdata->args.offset, wdata->args.count);
... ... @@ -396,10 +402,15 @@
396 402 }
397 403  
398 404 /* Called when last of bios associated with a bl_write_pagelist call finishes */
399   -static void bl_end_par_io_write(void *data)
  405 +static void bl_end_par_io_write(void *data, int num_se)
400 406 {
401 407 struct nfs_write_data *wdata = data;
402 408  
  409 + if (unlikely(wdata->pnfs_error)) {
  410 + bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
  411 + num_se);
  412 + }
  413 +
403 414 wdata->task.tk_status = wdata->pnfs_error;
404 415 wdata->verf.committed = NFS_FILE_SYNC;
405 416 INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
... ... @@ -552,7 +563,7 @@
552 563 */
553 564 par = alloc_parallel(wdata);
554 565 if (!par)
555   - return PNFS_NOT_ATTEMPTED;
  566 + goto out_mds;
556 567 par->pnfs_callback = bl_end_par_io_write;
557 568 /* At this point, have to be more careful with error handling */
558 569  
... ... @@ -560,12 +571,15 @@
560 571 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
561 572 if (!be || !is_writable(be, isect)) {
562 573 dprintk("%s no matching extents!\n", __func__);
563   - wdata->pnfs_error = -EINVAL;
564   - goto out;
  574 + goto out_mds;
565 575 }
566 576  
567 577 /* First page inside INVALID extent */
568 578 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
  579 + if (likely(!bl_push_one_short_extent(be->be_inval)))
  580 + par->bse_count++;
  581 + else
  582 + goto out_mds;
569 583 temp = offset >> PAGE_CACHE_SHIFT;
570 584 npg_zero = do_div(temp, npg_per_block);
571 585 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
... ... @@ -603,6 +617,19 @@
603 617 wdata->pnfs_error = ret;
604 618 goto out;
605 619 }
  620 + if (likely(!bl_push_one_short_extent(be->be_inval)))
  621 + par->bse_count++;
  622 + else {
  623 + end_page_writeback(page);
  624 + page_cache_release(page);
  625 + wdata->pnfs_error = -ENOMEM;
  626 + goto out;
  627 + }
  628 + /* FIXME: This should be done in bi_end_io */
  629 + mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
  630 + page->index << PAGE_CACHE_SHIFT,
  631 + PAGE_CACHE_SIZE);
  632 +
606 633 bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
607 634 isect, page, be,
608 635 bl_end_io_write_zero, par);
... ... @@ -611,10 +638,6 @@
611 638 bio = NULL;
612 639 goto out;
613 640 }
614   - /* FIXME: This should be done in bi_end_io */
615   - mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
616   - page->index << PAGE_CACHE_SHIFT,
617   - PAGE_CACHE_SIZE);
618 641 next_page:
619 642 isect += PAGE_CACHE_SECTORS;
620 643 extent_length -= PAGE_CACHE_SECTORS;
... ... @@ -638,6 +661,15 @@
638 661 wdata->pnfs_error = -EINVAL;
639 662 goto out;
640 663 }
  664 + if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
  665 + if (likely(!bl_push_one_short_extent(
  666 + be->be_inval)))
  667 + par->bse_count++;
  668 + else {
  669 + wdata->pnfs_error = -ENOMEM;
  670 + goto out;
  671 + }
  672 + }
641 673 extent_length = be->be_length -
642 674 (isect - be->be_f_offset);
643 675 }
... ... @@ -685,6 +717,10 @@
685 717 bl_submit_bio(WRITE, bio);
686 718 put_parallel(par);
687 719 return PNFS_ATTEMPTED;
  720 +out_mds:
  721 + bl_put_extent(be);
  722 + kfree(par);
  723 + return PNFS_NOT_ATTEMPTED;
688 724 }
689 725  
690 726 /* FIXME - range ignored */
... ... @@ -711,10 +747,16 @@
711 747 release_inval_marks(struct pnfs_inval_markings *marks)
712 748 {
713 749 struct pnfs_inval_tracking *pos, *temp;
  750 + struct pnfs_block_short_extent *se, *stemp;
714 751  
715 752 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
716 753 list_del(&pos->it_link);
717 754 kfree(pos);
  755 + }
  756 +
  757 + list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
  758 + list_del(&se->bse_node);
  759 + kfree(se);
718 760 }
719 761 return;
720 762 }
fs/nfs/blocklayout/blocklayout.h
... ... @@ -70,6 +70,7 @@
70 70 spinlock_t im_lock;
71 71 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
72 72 sector_t im_block_size; /* Server blocksize in sectors */
  73 + struct list_head im_extents; /* Short extents for INVAL->RW conversion */
73 74 };
74 75  
75 76 struct pnfs_inval_tracking {
... ... @@ -105,6 +106,7 @@
105 106 {
106 107 spin_lock_init(&marks->im_lock);
107 108 INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
  109 + INIT_LIST_HEAD(&marks->im_extents);
108 110 marks->im_block_size = blocksize;
109 111 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
110 112 blocksize);
... ... @@ -199,7 +201,12 @@
199 201 int bl_add_merge_extent(struct pnfs_block_layout *bl,
200 202 struct pnfs_block_extent *new);
201 203 int bl_mark_for_commit(struct pnfs_block_extent *be,
202   - sector_t offset, sector_t length);
  204 + sector_t offset, sector_t length,
  205 + struct pnfs_block_short_extent *new);
  206 +int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
  207 +struct pnfs_block_short_extent *
  208 +bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
  209 +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
203 210  
204 211 #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
fs/nfs/blocklayout/extents.c
... ... @@ -157,10 +157,10 @@
157 157 goto out_cleanup;
158 158 }
159 159  
160   - spin_lock(&marks->im_lock);
  160 + spin_lock_bh(&marks->im_lock);
161 161 for (s = start; s < end; s += tree->mtt_step_size)
162 162 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
163   - spin_unlock(&marks->im_lock);
  163 + spin_unlock_bh(&marks->im_lock);
164 164  
165 165 status = 0;
166 166  
... ... @@ -179,9 +179,9 @@
179 179 {
180 180 int rv;
181 181  
182   - spin_lock(&marks->im_lock);
  182 + spin_lock_bh(&marks->im_lock);
183 183 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
184   - spin_unlock(&marks->im_lock);
  184 + spin_unlock_bh(&marks->im_lock);
185 185 return rv;
186 186 }
187 187  
... ... @@ -221,9 +221,9 @@
221 221 {
222 222 int rv;
223 223  
224   - spin_lock(&marks->im_lock);
  224 + spin_lock_bh(&marks->im_lock);
225 225 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
226   - spin_unlock(&marks->im_lock);
  226 + spin_unlock_bh(&marks->im_lock);
227 227 return rv;
228 228 }
229 229  
... ... @@ -244,15 +244,15 @@
244 244 if (_preload_range(marks, start, end - start))
245 245 goto outerr;
246 246  
247   - spin_lock(&marks->im_lock);
  247 + spin_lock_bh(&marks->im_lock);
248 248 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
249 249 goto out_unlock;
250   - spin_unlock(&marks->im_lock);
  250 + spin_unlock_bh(&marks->im_lock);
251 251  
252 252 return 0;
253 253  
254 254 out_unlock:
255   - spin_unlock(&marks->im_lock);
  255 + spin_unlock_bh(&marks->im_lock);
256 256 outerr:
257 257 return -ENOMEM;
258 258 }
... ... @@ -267,9 +267,9 @@
267 267  
268 268 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
269 269 (u64)offset, (u64)length);
270   - spin_lock(&marks->im_lock);
  270 + spin_lock_bh(&marks->im_lock);
271 271 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
272   - spin_unlock(&marks->im_lock);
  272 + spin_unlock_bh(&marks->im_lock);
273 273 return status;
274 274 }
275 275  
... ... @@ -369,20 +369,18 @@
369 369  
370 370 /* Note the range described by offset, length is guaranteed to be contained
371 371 * within be.
  372 + * new will be freed, either by this function or add_to_commitlist if they
  373 + * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
372 374 */
373 375 int bl_mark_for_commit(struct pnfs_block_extent *be,
374   - sector_t offset, sector_t length)
  376 + sector_t offset, sector_t length,
  377 + struct pnfs_block_short_extent *new)
375 378 {
376 379 sector_t new_end, end = offset + length;
377   - struct pnfs_block_short_extent *new;
378 380 struct pnfs_block_layout *bl = container_of(be->be_inval,
379 381 struct pnfs_block_layout,
380 382 bl_inval);
381 383  
382   - new = kmalloc(sizeof(*new), GFP_NOFS);
383   - if (!new)
384   - return -ENOMEM;
385   -
386 384 mark_written_sectors(be->be_inval, offset, length);
387 385 /* We want to add the range to commit list, but it must be
388 386 * block-normalized, and verified that the normalized range has
... ... @@ -412,9 +410,6 @@
412 410 new->bse_mdev = be->be_mdev;
413 411  
414 412 spin_lock(&bl->bl_ext_lock);
415   - /* new will be freed, either by add_to_commitlist if it decides not
416   - * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
417   - */
418 413 add_to_commitlist(bl, new);
419 414 spin_unlock(&bl->bl_ext_lock);
420 415 return 0;
... ... @@ -861,5 +856,55 @@
861 856 spin_unlock(&bl->bl_ext_lock);
862 857 }
863 858 }
  859 +}
  860 +
  861 +int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
  862 +{
  863 + struct pnfs_block_short_extent *new;
  864 +
  865 + new = kmalloc(sizeof(*new), GFP_NOFS);
  866 + if (unlikely(!new))
  867 + return -ENOMEM;
  868 +
  869 + spin_lock_bh(&marks->im_lock);
  870 + list_add(&new->bse_node, &marks->im_extents);
  871 + spin_unlock_bh(&marks->im_lock);
  872 +
  873 + return 0;
  874 +}
  875 +
  876 +struct pnfs_block_short_extent *
  877 +bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
  878 +{
  879 + struct pnfs_block_short_extent *rv = NULL;
  880 +
  881 + spin_lock_bh(&marks->im_lock);
  882 + if (!list_empty(&marks->im_extents)) {
  883 + rv = list_entry((&marks->im_extents)->next,
  884 + struct pnfs_block_short_extent, bse_node);
  885 + list_del_init(&rv->bse_node);
  886 + }
  887 + spin_unlock_bh(&marks->im_lock);
  888 +
  889 + return rv;
  890 +}
  891 +
  892 +void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
  893 +{
  894 + struct pnfs_block_short_extent *se = NULL, *tmp;
  895 +
  896 + if (num_to_free <= 0)
  897 + return;
  898 +
  899 + spin_lock(&marks->im_lock);
  900 + list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
  901 + list_del(&se->bse_node);
  902 + kfree(se);
  903 + if (--num_to_free == 0)
  904 + break;
  905 + }
  906 + spin_unlock(&marks->im_lock);
  907 +
  908 + BUG_ON(num_to_free > 0);
864 909 }