Commit f1f8cc94651738b418ba54c039df536303b91704

Authored by Tejun Heo
Committed by Jens Axboe
1 parent 9b84cacd01

block, cfq: move icq creation and rq->elv.icq association to block core

The block layer now knows everything necessary to create icq's and
associate them with requests.  Move ioc_create_icq() to blk-ioc.c and
update get_request() so that, if elevator_type->icq_size is set,
requests are automatically associated with their matching icq's before
elv_set_request().  The io_context reference is also managed by the
block core on request alloc/free.
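
In short, get_request() now looks up or creates the icq itself and
hands it to blk_alloc_request(), which sets rq->elv.icq before calling
elv_set_request() and takes the io_context reference.  A condensed
sketch of that path (accounting and error handling trimmed; the real
code is in the blk-core.c hunks below):

    spin_lock_irq(q->queue_lock);
    if (blk_rq_should_init_elevator(bio) &&
        !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
            rw_flags |= REQ_ELVPRIV;
            rl->elvpriv++;
            /* look up an existing icq while still holding queue_lock */
            if (et->icq_cache && ioc)
                    icq = ioc_lookup_icq(ioc, q);
    }
    spin_unlock_irq(q->queue_lock);

    /* create the icq outside queue_lock if it doesn't exist yet */
    if (unlikely(et->icq_cache && !icq))
            icq = ioc_create_icq(q, gfp_mask);

    /* rq->elv.icq is set before elv_set_request() in here and the
     * matching put_io_context() happens in blk_free_request() */
    if (likely(!et->icq_cache || icq))
            rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);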

* Only the ioprio/cgroup changed-notification handling remains from
  cfq_get_cic().  It is collapsed into cfq_set_request().

* This removes queue kicking on icq allocation failure (for now).  As
  icq allocation failure is rare and the only effect queue kicking
  achieved was possibly accelerating queue processing, this change
  shouldn't be noticeable.

  There is a larger underlying problem.  Unlike request allocation,
  icq allocation is not guaranteed to eventually succeed after
  retries.  The number of icq's is unbounded, so a mempool can't be
  the solution either.  This effectively adds an allocation dependency
  to the memory free path and thus the possibility of deadlock.

  This usually wouldn't happen because icq allocation is not a hot
  path and, even when the condition triggers, it's highly unlikely
  that none of the writeback workers already has an icq.

  However, this is still possible, especially if the elevator is being
  switched under high memory pressure, so we'd better get it fixed.
  Probably the only solution is to bypass the elevator and append to
  the dispatch queue on any elevator allocation failure (a rough
  sketch of that idea follows this list).

* Comment added to explain how icq's are managed and synchronized.
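
A rough, purely illustrative sketch of that bypass idea (not
implemented by this commit; unwinding the elvpriv accounting is
glossed over):

    /* hypothetical fallback: if icq allocation fails, give up on
     * elevator private data so the request bypasses the elevator and
     * goes straight to the dispatch queue instead of waiting on (and
     * possibly deadlocking with) memory reclaim */
    if (unlikely(et->icq_cache && !icq)) {
            icq = ioc_create_icq(q, gfp_mask);
            if (!icq) {
                    rw_flags &= ~REQ_ELVPRIV;
                    /* rl->elvpriv would also need to be decremented
                     * again under queue_lock */
            }
    }
    rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);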

This completes the cleanup of the io_context interface.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Showing 6 changed files with 173 additions and 136 deletions

block/blk-core.c
... ... @@ -640,13 +640,18 @@
640 640  
641 641 static inline void blk_free_request(struct request_queue *q, struct request *rq)
642 642 {
643   - if (rq->cmd_flags & REQ_ELVPRIV)
  643 + if (rq->cmd_flags & REQ_ELVPRIV) {
644 644 elv_put_request(q, rq);
  645 + if (rq->elv.icq)
  646 + put_io_context(rq->elv.icq->ioc, q);
  647 + }
  648 +
645 649 mempool_free(rq, q->rq.rq_pool);
646 650 }
647 651  
648 652 static struct request *
649   -blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask)
  653 +blk_alloc_request(struct request_queue *q, struct io_cq *icq,
  654 + unsigned int flags, gfp_t gfp_mask)
650 655 {
651 656 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
652 657  
... ... @@ -657,10 +662,15 @@
657 662  
658 663 rq->cmd_flags = flags | REQ_ALLOCED;
659 664  
660   - if ((flags & REQ_ELVPRIV) &&
661   - unlikely(elv_set_request(q, rq, gfp_mask))) {
662   - mempool_free(rq, q->rq.rq_pool);
663   - return NULL;
  665 + if (flags & REQ_ELVPRIV) {
  666 + rq->elv.icq = icq;
  667 + if (unlikely(elv_set_request(q, rq, gfp_mask))) {
  668 + mempool_free(rq, q->rq.rq_pool);
  669 + return NULL;
  670 + }
  671 + /* @rq->elv.icq holds on to io_context until @rq is freed */
  672 + if (icq)
  673 + get_io_context(icq->ioc);
664 674 }
665 675  
666 676 return rq;
667 677  
668 678  
... ... @@ -772,11 +782,14 @@
772 782 {
773 783 struct request *rq = NULL;
774 784 struct request_list *rl = &q->rq;
  785 + struct elevator_type *et;
775 786 struct io_context *ioc;
  787 + struct io_cq *icq = NULL;
776 788 const bool is_sync = rw_is_sync(rw_flags) != 0;
777 789 bool retried = false;
778 790 int may_queue;
779 791 retry:
  792 + et = q->elevator->type;
780 793 ioc = current->io_context;
781 794  
782 795 if (unlikely(blk_queue_dead(q)))
783 796  
784 797  
... ... @@ -837,17 +850,36 @@
837 850 rl->count[is_sync]++;
838 851 rl->starved[is_sync] = 0;
839 852  
  853 + /*
  854 + * Decide whether the new request will be managed by elevator. If
  855 + * so, mark @rw_flags and increment elvpriv. Non-zero elvpriv will
  856 + * prevent the current elevator from being destroyed until the new
  857 + * request is freed. This guarantees icq's won't be destroyed and
  858 + * makes creating new ones safe.
  859 + *
  860 + * Also, lookup icq while holding queue_lock. If it doesn't exist,
  861 + * it will be created after releasing queue_lock.
  862 + */
840 863 if (blk_rq_should_init_elevator(bio) &&
841 864 !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
842 865 rw_flags |= REQ_ELVPRIV;
843 866 rl->elvpriv++;
  867 + if (et->icq_cache && ioc)
  868 + icq = ioc_lookup_icq(ioc, q);
844 869 }
845 870  
846 871 if (blk_queue_io_stat(q))
847 872 rw_flags |= REQ_IO_STAT;
848 873 spin_unlock_irq(q->queue_lock);
849 874  
850   - rq = blk_alloc_request(q, rw_flags, gfp_mask);
  875 + /* create icq if missing */
  876 + if (unlikely(et->icq_cache && !icq))
  877 + icq = ioc_create_icq(q, gfp_mask);
  878 +
  879 + /* rqs are guaranteed to have icq on elv_set_request() if requested */
  880 + if (likely(!et->icq_cache || icq))
  881 + rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
  882 +
851 883 if (unlikely(!rq)) {
852 884 /*
853 885 * Allocation failed presumably due to memory. Undo anything
block/blk-ioc.c
... ... @@ -289,7 +289,6 @@
289 289 kmem_cache_free(iocontext_cachep, ioc);
290 290 task_unlock(task);
291 291 }
292   -EXPORT_SYMBOL(create_io_context_slowpath);
293 292  
294 293 /**
295 294 * get_task_io_context - get io_context of a task
... ... @@ -361,6 +360,65 @@
361 360 return icq;
362 361 }
363 362 EXPORT_SYMBOL(ioc_lookup_icq);
  363 +
  364 +/**
  365 + * ioc_create_icq - create and link io_cq
  366 + * @q: request_queue of interest
  367 + * @gfp_mask: allocation mask
  368 + *
  369 + * Make sure io_cq linking %current->io_context and @q exists. If either
  370 + * io_context and/or icq don't exist, they will be created using @gfp_mask.
  371 + *
  372 + * The caller is responsible for ensuring @ioc won't go away and @q is
  373 + * alive and will stay alive until this function returns.
  374 + */
  375 +struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
  376 +{
  377 + struct elevator_type *et = q->elevator->type;
  378 + struct io_context *ioc;
  379 + struct io_cq *icq;
  380 +
  381 + /* allocate stuff */
  382 + ioc = create_io_context(current, gfp_mask, q->node);
  383 + if (!ioc)
  384 + return NULL;
  385 +
  386 + icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
  387 + q->node);
  388 + if (!icq)
  389 + return NULL;
  390 +
  391 + if (radix_tree_preload(gfp_mask) < 0) {
  392 + kmem_cache_free(et->icq_cache, icq);
  393 + return NULL;
  394 + }
  395 +
  396 + icq->ioc = ioc;
  397 + icq->q = q;
  398 + INIT_LIST_HEAD(&icq->q_node);
  399 + INIT_HLIST_NODE(&icq->ioc_node);
  400 +
  401 + /* lock both q and ioc and try to link @icq */
  402 + spin_lock_irq(q->queue_lock);
  403 + spin_lock(&ioc->lock);
  404 +
  405 + if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
  406 + hlist_add_head(&icq->ioc_node, &ioc->icq_list);
  407 + list_add(&icq->q_node, &q->icq_list);
  408 + if (et->ops.elevator_init_icq_fn)
  409 + et->ops.elevator_init_icq_fn(icq);
  410 + } else {
  411 + kmem_cache_free(et->icq_cache, icq);
  412 + icq = ioc_lookup_icq(ioc, q);
  413 + if (!icq)
  414 + printk(KERN_ERR "cfq: icq link failed!\n");
  415 + }
  416 +
  417 + spin_unlock(&ioc->lock);
  418 + spin_unlock_irq(q->queue_lock);
  419 + radix_tree_preload_end();
  420 + return icq;
  421 +}
364 422  
365 423 void ioc_set_changed(struct io_context *ioc, int which)
366 424 {
block/blk.h
... ... @@ -200,6 +200,7 @@
200 200 */
201 201 void get_io_context(struct io_context *ioc);
202 202 struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
  203 +struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask);
203 204 void ioc_clear_queue(struct request_queue *q);
204 205  
205 206 void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask,
block/cfq-iosched.c
... ... @@ -2935,117 +2935,6 @@
2935 2935 return cfqq;
2936 2936 }
2937 2937  
2938   -/**
2939   - * ioc_create_icq - create and link io_cq
2940   - * @q: request_queue of interest
2941   - * @gfp_mask: allocation mask
2942   - *
2943   - * Make sure io_cq linking %current->io_context and @q exists. If either
2944   - * io_context and/or icq don't exist, they will be created using @gfp_mask.
2945   - *
2946   - * The caller is responsible for ensuring @ioc won't go away and @q is
2947   - * alive and will stay alive until this function returns.
2948   - */
2949   -static struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
2950   -{
2951   - struct elevator_type *et = q->elevator->type;
2952   - struct io_context *ioc;
2953   - struct io_cq *icq;
2954   -
2955   - /* allocate stuff */
2956   - ioc = create_io_context(current, gfp_mask, q->node);
2957   - if (!ioc)
2958   - return NULL;
2959   -
2960   - icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
2961   - q->node);
2962   - if (!icq)
2963   - return NULL;
2964   -
2965   - if (radix_tree_preload(gfp_mask) < 0) {
2966   - kmem_cache_free(et->icq_cache, icq);
2967   - return NULL;
2968   - }
2969   -
2970   - icq->ioc = ioc;
2971   - icq->q = q;
2972   - INIT_LIST_HEAD(&icq->q_node);
2973   - INIT_HLIST_NODE(&icq->ioc_node);
2974   -
2975   - /* lock both q and ioc and try to link @icq */
2976   - spin_lock_irq(q->queue_lock);
2977   - spin_lock(&ioc->lock);
2978   -
2979   - if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
2980   - hlist_add_head(&icq->ioc_node, &ioc->icq_list);
2981   - list_add(&icq->q_node, &q->icq_list);
2982   - if (et->ops.elevator_init_icq_fn)
2983   - et->ops.elevator_init_icq_fn(icq);
2984   - } else {
2985   - kmem_cache_free(et->icq_cache, icq);
2986   - icq = ioc_lookup_icq(ioc, q);
2987   - if (!icq)
2988   - printk(KERN_ERR "cfq: icq link failed!\n");
2989   - }
2990   -
2991   - spin_unlock(&ioc->lock);
2992   - spin_unlock_irq(q->queue_lock);
2993   - radix_tree_preload_end();
2994   - return icq;
2995   -}
2996   -
2997   -/**
2998   - * cfq_get_cic - acquire cfq_io_cq and bump refcnt on io_context
2999   - * @cfqd: cfqd to setup cic for
3000   - * @gfp_mask: allocation mask
3001   - *
3002   - * Return cfq_io_cq associating @cfqd and %current->io_context and
3003   - * bump refcnt on io_context. If ioc or cic doesn't exist, they're created
3004   - * using @gfp_mask.
3005   - *
3006   - * Must be called under queue_lock which may be released and re-acquired.
3007   - * This function also may sleep depending on @gfp_mask.
3008   - */
3009   -static struct cfq_io_cq *cfq_get_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
3010   -{
3011   - struct request_queue *q = cfqd->queue;
3012   - struct cfq_io_cq *cic = NULL;
3013   - struct io_context *ioc;
3014   -
3015   - lockdep_assert_held(q->queue_lock);
3016   -
3017   - while (true) {
3018   - /* fast path */
3019   - ioc = current->io_context;
3020   - if (likely(ioc)) {
3021   - cic = cfq_cic_lookup(cfqd, ioc);
3022   - if (likely(cic))
3023   - break;
3024   - }
3025   -
3026   - /* slow path - unlock, create missing ones and retry */
3027   - spin_unlock_irq(q->queue_lock);
3028   - cic = icq_to_cic(ioc_create_icq(q, gfp_mask));
3029   - spin_lock_irq(q->queue_lock);
3030   - if (!cic)
3031   - return NULL;
3032   - }
3033   -
3034   - /* bump @ioc's refcnt and handle changed notifications */
3035   - get_io_context(ioc);
3036   -
3037   - if (unlikely(cic->icq.changed)) {
3038   - if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed))
3039   - changed_ioprio(cic);
3040   -#ifdef CONFIG_CFQ_GROUP_IOSCHED
3041   - if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed))
3042   - changed_cgroup(cic);
3043   -#endif
3044   - }
3045   -
3046   - return cic;
3047   -}
3048   -
3049 2938 static void
3050 2939 __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
3051 2940 {
... ... @@ -3524,8 +3413,6 @@
3524 3413 BUG_ON(!cfqq->allocated[rw]);
3525 3414 cfqq->allocated[rw]--;
3526 3415  
3527   - put_io_context(RQ_CIC(rq)->icq.ioc, cfqq->cfqd->queue);
3528   -
3529 3416 /* Put down rq reference on cfqg */
3530 3417 cfq_put_cfqg(RQ_CFQG(rq));
3531 3418 rq->elv.priv[0] = NULL;
... ... @@ -3574,7 +3461,7 @@
3574 3461 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
3575 3462 {
3576 3463 struct cfq_data *cfqd = q->elevator->elevator_data;
3577   - struct cfq_io_cq *cic;
  3464 + struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
3578 3465 const int rw = rq_data_dir(rq);
3579 3466 const bool is_sync = rq_is_sync(rq);
3580 3467 struct cfq_queue *cfqq;
3581 3468  
... ... @@ -3582,10 +3469,17 @@
3582 3469 might_sleep_if(gfp_mask & __GFP_WAIT);
3583 3470  
3584 3471 spin_lock_irq(q->queue_lock);
3585   - cic = cfq_get_cic(cfqd, gfp_mask);
3586   - if (!cic)
3587   - goto queue_fail;
3588 3472  
  3473 + /* handle changed notifications */
  3474 + if (unlikely(cic->icq.changed)) {
  3475 + if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed))
  3476 + changed_ioprio(cic);
  3477 +#ifdef CONFIG_CFQ_GROUP_IOSCHED
  3478 + if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed))
  3479 + changed_cgroup(cic);
  3480 +#endif
  3481 + }
  3482 +
3589 3483 new_queue:
3590 3484 cfqq = cic_to_cfqq(cic, is_sync);
3591 3485 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3592 3486  
... ... @@ -3615,17 +3509,10 @@
3615 3509 cfqq->allocated[rw]++;
3616 3510  
3617 3511 cfqq->ref++;
3618   - rq->elv.icq = &cic->icq;
3619 3512 rq->elv.priv[0] = cfqq;
3620 3513 rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg);
3621 3514 spin_unlock_irq(q->queue_lock);
3622 3515 return 0;
3623   -
3624   -queue_fail:
3625   - cfq_schedule_dispatch(cfqd);
3626   - spin_unlock_irq(q->queue_lock);
3627   - cfq_log(cfqd, "set_request fail");
3628   - return 1;
3629 3516 }
3630 3517  
3631 3518 static void cfq_kick_queue(struct work_struct *work)
include/linux/elevator.h
... ... @@ -60,8 +60,8 @@
60 60 elevator_request_list_fn *elevator_former_req_fn;
61 61 elevator_request_list_fn *elevator_latter_req_fn;
62 62  
63   - elevator_init_icq_fn *elevator_init_icq_fn;
64   - elevator_exit_icq_fn *elevator_exit_icq_fn;
  63 + elevator_init_icq_fn *elevator_init_icq_fn; /* see iocontext.h */
  64 + elevator_exit_icq_fn *elevator_exit_icq_fn; /* ditto */
65 65  
66 66 elevator_set_req_fn *elevator_set_req_fn;
67 67 elevator_put_req_fn *elevator_put_req_fn;
... ... @@ -90,8 +90,8 @@
90 90  
91 91 /* fields provided by elevator implementation */
92 92 struct elevator_ops ops;
93   - size_t icq_size;
94   - size_t icq_align;
  93 + size_t icq_size; /* see iocontext.h */
  94 + size_t icq_align; /* ditto */
95 95 struct elv_fs_entry *elevator_attrs;
96 96 char elevator_name[ELV_NAME_MAX];
97 97 struct module *elevator_owner;
include/linux/iocontext.h
... ... @@ -10,6 +10,65 @@
10 10 ICQ_CGROUP_CHANGED,
11 11 };
12 12  
  13 +/*
  14 + * An io_cq (icq) is association between an io_context (ioc) and a
  15 + * request_queue (q). This is used by elevators which need to track
  16 + * information per ioc - q pair.
  17 + *
  18 + * Elevator can request use of icq by setting elevator_type->icq_size and
  19 + * ->icq_align. Both size and align must be larger than that of struct
  20 + * io_cq and elevator can use the tail area for private information. The
  21 + * recommended way to do this is defining a struct which contains io_cq as
  22 + * the first member followed by private members and using its size and
  23 + * align. For example,
  24 + *
  25 + * struct snail_io_cq {
  26 + * struct io_cq icq;
  27 + * int poke_snail;
  28 + * int feed_snail;
  29 + * };
  30 + *
  31 + * struct elevator_type snail_elv_type {
  32 + * .ops = { ... },
  33 + * .icq_size = sizeof(struct snail_io_cq),
  34 + * .icq_align = __alignof__(struct snail_io_cq),
  35 + * ...
  36 + * };
  37 + *
  38 + * If icq_size is set, block core will manage icq's. All requests will
  39 + * have its ->elv.icq field set before elevator_ops->elevator_set_req_fn()
  40 + * is called and be holding a reference to the associated io_context.
  41 + *
  42 + * Whenever a new icq is created, elevator_ops->elevator_init_icq_fn() is
  43 + * called and, on destruction, ->elevator_exit_icq_fn(). Both functions
  44 + * are called with both the associated io_context and queue locks held.
  45 + *
  46 + * Elevator is allowed to lookup icq using ioc_lookup_icq() while holding
  47 + * queue lock but the returned icq is valid only until the queue lock is
  48 + * released. Elevators can not and should not try to create or destroy
  49 + * icq's.
  50 + *
  51 + * As icq's are linked from both ioc and q, the locking rules are a bit
  52 + * complex.
  53 + *
  54 + * - ioc lock nests inside q lock.
  55 + *
  56 + * - ioc->icq_list and icq->ioc_node are protected by ioc lock.
  57 + * q->icq_list and icq->q_node by q lock.
  58 + *
  59 + * - ioc->icq_tree and ioc->icq_hint are protected by ioc lock, while icq
  60 + * itself is protected by q lock. However, both the indexes and icq
  61 + * itself are also RCU managed and lookup can be performed holding only
  62 + * the q lock.
  63 + *
  64 + * - icq's are not reference counted. They are destroyed when either the
  65 + * ioc or q goes away. Each request with icq set holds an extra
  66 + * reference to ioc to ensure it stays until the request is completed.
  67 + *
  68 + * - Linking and unlinking icq's are performed while holding both ioc and q
  69 + * locks. Due to the lock ordering, q exit is simple but ioc exit
  70 + * requires reverse-order double lock dance.
  71 + */
13 72 struct io_cq {
14 73 struct request_queue *q;
15 74 struct io_context *ioc;