Commit b3c9dd182ed3bdcdaf0e42625a35924b0497afdc

Authored by Linus Torvalds

Merge branch 'for-3.3/core' of git://git.kernel.dk/linux-block

* 'for-3.3/core' of git://git.kernel.dk/linux-block: (37 commits)
  Revert "block: recursive merge requests"
  block: Stop using macro stubs for the bio data integrity calls
  blockdev: convert some macros to static inlines
  fs: remove unneeded plug in mpage_readpages()
  block: Add BLKROTATIONAL ioctl
  block: Introduce blk_set_stacking_limits function
  block: remove WARN_ON_ONCE() in exit_io_context()
  block: an exiting task should be allowed to create io_context
  block: ioc_cgroup_changed() needs to be exported
  block: recursive merge requests
  block, cfq: fix empty queue crash caused by request merge
  block, cfq: move icq creation and rq->elv.icq association to block core
  block, cfq: restructure io_cq creation path for io_context interface cleanup
  block, cfq: move io_cq exit/release to blk-ioc.c
  block, cfq: move icq cache management to block core
  block, cfq: move io_cq lookup to blk-ioc.c
  block, cfq: move cfqd->icq_list to request_queue and add request->elv.icq
  block, cfq: reorganize cfq_io_context into generic and cfq specific parts
  block: remove elevator_queue->ops
  block: reorder elevator switch sequence
  ...

Fix up conflicts in:
 - block/blk-cgroup.c
	Switch from can_attach_task to can_attach
 - block/cfq-iosched.c
	conflict with now removed cic index changes (we now use q->id instead)

28 changed files (side-by-side diff excerpt below):

block/blk-cgroup.c
... ... @@ -1655,11 +1655,12 @@
1655 1655 struct io_context *ioc;
1656 1656  
1657 1657 cgroup_taskset_for_each(task, cgrp, tset) {
1658   - task_lock(task);
1659   - ioc = task->io_context;
1660   - if (ioc)
1661   - ioc->cgroup_changed = 1;
1662   - task_unlock(task);
  1658 + /* we don't lose anything even if ioc allocation fails */
  1659 + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
  1660 + if (ioc) {
  1661 + ioc_cgroup_changed(ioc);
  1662 + put_io_context(ioc, NULL);
  1663 + }
1663 1664 }
1664 1665 }
1665 1666  
block/blk-core.c
... ... @@ -39,6 +39,8 @@
39 39 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
40 40 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
41 41  
  42 +DEFINE_IDA(blk_queue_ida);
  43 +
42 44 /*
43 45 * For the allocated request tables
44 46 */
... ... @@ -358,7 +360,8 @@
358 360 void blk_drain_queue(struct request_queue *q, bool drain_all)
359 361 {
360 362 while (true) {
361   - int nr_rqs;
  363 + bool drain = false;
  364 + int i;
362 365  
363 366 spin_lock_irq(q->queue_lock);
364 367  
... ... @@ -375,14 +378,25 @@
375 378 if (!list_empty(&q->queue_head))
376 379 __blk_run_queue(q);
377 380  
378   - if (drain_all)
379   - nr_rqs = q->rq.count[0] + q->rq.count[1];
380   - else
381   - nr_rqs = q->rq.elvpriv;
  381 + drain |= q->rq.elvpriv;
382 382  
  383 + /*
  384 + * Unfortunately, requests are queued at and tracked from
  385 + * multiple places and there's no single counter which can
  386 + * be drained. Check all the queues and counters.
  387 + */
  388 + if (drain_all) {
  389 + drain |= !list_empty(&q->queue_head);
  390 + for (i = 0; i < 2; i++) {
  391 + drain |= q->rq.count[i];
  392 + drain |= q->in_flight[i];
  393 + drain |= !list_empty(&q->flush_queue[i]);
  394 + }
  395 + }
  396 +
383 397 spin_unlock_irq(q->queue_lock);
384 398  
385   - if (!nr_rqs)
  399 + if (!drain)
386 400 break;
387 401 msleep(10);
388 402 }
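
The loop above has to poll several independent counters because pending work is tracked in multiple places. As a point of reference, here is a minimal sketch of how a teardown path could drive it once the queue has been marked dead; my_cleanup_queue() is an illustrative name, blk_drain_queue() itself is block-internal, and the real cleanup path does additional work:

	/* Illustrative only: refuse new requests, then wait for the drain. */
	static void my_cleanup_queue(struct request_queue *q)
	{
		spin_lock_irq(q->queue_lock);
		queue_flag_set(QUEUE_FLAG_DEAD, q);	/* blk_queue_dead(q) is now true */
		spin_unlock_irq(q->queue_lock);

		/* drain_all == true: also waits on rq counts, in-flight and flush queues */
		blk_drain_queue(q, true);
	}
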
... ... @@ -469,6 +483,10 @@
469 483 if (!q)
470 484 return NULL;
471 485  
  486 + q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
  487 + if (q->id < 0)
  488 + goto fail_q;
  489 +
472 490 q->backing_dev_info.ra_pages =
473 491 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
474 492 q->backing_dev_info.state = 0;
... ... @@ -477,20 +495,17 @@
477 495 q->node = node_id;
478 496  
479 497 err = bdi_init(&q->backing_dev_info);
480   - if (err) {
481   - kmem_cache_free(blk_requestq_cachep, q);
482   - return NULL;
483   - }
  498 + if (err)
  499 + goto fail_id;
484 500  
485   - if (blk_throtl_init(q)) {
486   - kmem_cache_free(blk_requestq_cachep, q);
487   - return NULL;
488   - }
  501 + if (blk_throtl_init(q))
  502 + goto fail_id;
489 503  
490 504 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
491 505 laptop_mode_timer_fn, (unsigned long) q);
492 506 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
493 507 INIT_LIST_HEAD(&q->timeout_list);
  508 + INIT_LIST_HEAD(&q->icq_list);
494 509 INIT_LIST_HEAD(&q->flush_queue[0]);
495 510 INIT_LIST_HEAD(&q->flush_queue[1]);
496 511 INIT_LIST_HEAD(&q->flush_data_in_flight);
... ... @@ -508,6 +523,12 @@
508 523 q->queue_lock = &q->__queue_lock;
509 524  
510 525 return q;
  526 +
  527 +fail_id:
  528 + ida_simple_remove(&blk_queue_ida, q->id);
  529 +fail_q:
  530 + kmem_cache_free(blk_requestq_cachep, q);
  531 + return NULL;
511 532 }
512 533 EXPORT_SYMBOL(blk_alloc_queue_node);
513 534  
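
The id allocated above comes from the new blk_queue_ida and is what the icq code further down keys its radix trees on, replacing cfq's old cic index as noted in the conflict fix-up. A minimal sketch of the ida_simple_get()/ida_simple_remove() pairing, with illustrative my_* names; in the real tree the release belongs in the queue teardown path:

	static DEFINE_IDA(my_ida);		/* mirrors blk_queue_ida */

	static int my_assign_id(void)
	{
		/* smallest free id >= 0, or a negative errno on failure */
		return ida_simple_get(&my_ida, 0, 0, GFP_KERNEL);
	}

	static void my_release_id(int id)
	{
		ida_simple_remove(&my_ida, id);
	}
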
... ... @@ -605,26 +626,31 @@
605 626 }
606 627 EXPORT_SYMBOL(blk_init_allocated_queue);
607 628  
608   -int blk_get_queue(struct request_queue *q)
  629 +bool blk_get_queue(struct request_queue *q)
609 630 {
610   - if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
611   - kobject_get(&q->kobj);
612   - return 0;
  631 + if (likely(!blk_queue_dead(q))) {
  632 + __blk_get_queue(q);
  633 + return true;
613 634 }
614 635  
615   - return 1;
  636 + return false;
616 637 }
617 638 EXPORT_SYMBOL(blk_get_queue);
618 639  
619 640 static inline void blk_free_request(struct request_queue *q, struct request *rq)
620 641 {
621   - if (rq->cmd_flags & REQ_ELVPRIV)
  642 + if (rq->cmd_flags & REQ_ELVPRIV) {
622 643 elv_put_request(q, rq);
  644 + if (rq->elv.icq)
  645 + put_io_context(rq->elv.icq->ioc, q);
  646 + }
  647 +
623 648 mempool_free(rq, q->rq.rq_pool);
624 649 }
625 650  
626 651 static struct request *
627   -blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask)
  652 +blk_alloc_request(struct request_queue *q, struct io_cq *icq,
  653 + unsigned int flags, gfp_t gfp_mask)
628 654 {
629 655 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
630 656  
... ... @@ -635,10 +661,15 @@
635 661  
636 662 rq->cmd_flags = flags | REQ_ALLOCED;
637 663  
638   - if ((flags & REQ_ELVPRIV) &&
639   - unlikely(elv_set_request(q, rq, gfp_mask))) {
640   - mempool_free(rq, q->rq.rq_pool);
641   - return NULL;
  664 + if (flags & REQ_ELVPRIV) {
  665 + rq->elv.icq = icq;
  666 + if (unlikely(elv_set_request(q, rq, gfp_mask))) {
  667 + mempool_free(rq, q->rq.rq_pool);
  668 + return NULL;
  669 + }
  670 + /* @rq->elv.icq holds on to io_context until @rq is freed */
  671 + if (icq)
  672 + get_io_context(icq->ioc);
642 673 }
643 674  
644 675 return rq;
... ... @@ -750,11 +781,17 @@
750 781 {
751 782 struct request *rq = NULL;
752 783 struct request_list *rl = &q->rq;
753   - struct io_context *ioc = NULL;
  784 + struct elevator_type *et;
  785 + struct io_context *ioc;
  786 + struct io_cq *icq = NULL;
754 787 const bool is_sync = rw_is_sync(rw_flags) != 0;
  788 + bool retried = false;
755 789 int may_queue;
  790 +retry:
  791 + et = q->elevator->type;
  792 + ioc = current->io_context;
756 793  
757   - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
  794 + if (unlikely(blk_queue_dead(q)))
758 795 return NULL;
759 796  
760 797 may_queue = elv_may_queue(q, rw_flags);
... ... @@ -763,8 +800,21 @@
763 800  
764 801 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
765 802 if (rl->count[is_sync]+1 >= q->nr_requests) {
766   - ioc = current_io_context(GFP_ATOMIC, q->node);
767 803 /*
  804 + * We want ioc to record batching state. If it's
  805 + * not already there, creating a new one requires
  806 + * dropping queue_lock, which in turn requires
  807 + * retesting conditions to avoid queue hang.
  808 + */
  809 + if (!ioc && !retried) {
  810 + spin_unlock_irq(q->queue_lock);
  811 + create_io_context(current, gfp_mask, q->node);
  812 + spin_lock_irq(q->queue_lock);
  813 + retried = true;
  814 + goto retry;
  815 + }
  816 +
  817 + /*
768 818 * The queue will fill after this allocation, so set
769 819 * it as full, and mark this process as "batching".
770 820 * This process will be allowed to complete a batch of
... ... @@ -799,17 +849,36 @@
799 849 rl->count[is_sync]++;
800 850 rl->starved[is_sync] = 0;
801 851  
  852 + /*
  853 + * Decide whether the new request will be managed by elevator. If
  854 + * so, mark @rw_flags and increment elvpriv. Non-zero elvpriv will
  855 + * prevent the current elevator from being destroyed until the new
  856 + * request is freed. This guarantees icq's won't be destroyed and
  857 + * makes creating new ones safe.
  858 + *
  859 + * Also, lookup icq while holding queue_lock. If it doesn't exist,
  860 + * it will be created after releasing queue_lock.
  861 + */
802 862 if (blk_rq_should_init_elevator(bio) &&
803 863 !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
804 864 rw_flags |= REQ_ELVPRIV;
805 865 rl->elvpriv++;
  866 + if (et->icq_cache && ioc)
  867 + icq = ioc_lookup_icq(ioc, q);
806 868 }
807 869  
808 870 if (blk_queue_io_stat(q))
809 871 rw_flags |= REQ_IO_STAT;
810 872 spin_unlock_irq(q->queue_lock);
811 873  
812   - rq = blk_alloc_request(q, rw_flags, gfp_mask);
  874 + /* create icq if missing */
  875 + if (unlikely(et->icq_cache && !icq))
  876 + icq = ioc_create_icq(q, gfp_mask);
  877 +
  878 + /* rqs are guaranteed to have icq on elv_set_request() if requested */
  879 + if (likely(!et->icq_cache || icq))
  880 + rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
  881 +
813 882 if (unlikely(!rq)) {
814 883 /*
815 884 * Allocation failed presumably due to memory. Undo anything
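
Pulling the icq handling from the get_request() hunks above into one place, here is a condensed sketch of the fast and slow paths for an elevator that declares an icq_cache (cfq in this series); my_get_icq() is an illustrative helper, not a function from the patch:

	/* Find or create the io_cq linking %current->io_context and @q. */
	static struct io_cq *my_get_icq(struct request_queue *q, gfp_t gfp_mask)
	{
		struct elevator_type *et = q->elevator->type;
		struct io_context *ioc = current->io_context;
		struct io_cq *icq = NULL;

		/* fast path: look up an existing icq while holding queue_lock */
		spin_lock_irq(q->queue_lock);
		if (et->icq_cache && ioc)
			icq = ioc_lookup_icq(ioc, q);
		spin_unlock_irq(q->queue_lock);

		/* slow path: allocate and link a new icq without queue_lock */
		if (et->icq_cache && !icq)
			icq = ioc_create_icq(q, gfp_mask);

		return icq;	/* NULL means the caller must bail out */
	}
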
... ... @@ -871,10 +940,9 @@
871 940 rq = get_request(q, rw_flags, bio, GFP_NOIO);
872 941 while (!rq) {
873 942 DEFINE_WAIT(wait);
874   - struct io_context *ioc;
875 943 struct request_list *rl = &q->rq;
876 944  
877   - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
  945 + if (unlikely(blk_queue_dead(q)))
878 946 return NULL;
879 947  
880 948 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
... ... @@ -891,8 +959,8 @@
891 959 * up to a big batch of them for a small period time.
892 960 * See ioc_batching, ioc_set_batching
893 961 */
894   - ioc = current_io_context(GFP_NOIO, q->node);
895   - ioc_set_batching(q, ioc);
  962 + create_io_context(current, GFP_NOIO, q->node);
  963 + ioc_set_batching(q, current->io_context);
896 964  
897 965 spin_lock_irq(q->queue_lock);
898 966 finish_wait(&rl->wait[is_sync], &wait);
... ... @@ -1009,54 +1077,6 @@
1009 1077 __elv_add_request(q, rq, where);
1010 1078 }
1011 1079  
1012   -/**
1013   - * blk_insert_request - insert a special request into a request queue
1014   - * @q: request queue where request should be inserted
1015   - * @rq: request to be inserted
1016   - * @at_head: insert request at head or tail of queue
1017   - * @data: private data
1018   - *
1019   - * Description:
1020   - * Many block devices need to execute commands asynchronously, so they don't
1021   - * block the whole kernel from preemption during request execution. This is
1022   - * accomplished normally by inserting aritficial requests tagged as
1023   - * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them
1024   - * be scheduled for actual execution by the request queue.
1025   - *
1026   - * We have the option of inserting the head or the tail of the queue.
1027   - * Typically we use the tail for new ioctls and so forth. We use the head
1028   - * of the queue for things like a QUEUE_FULL message from a device, or a
1029   - * host that is unable to accept a particular command.
1030   - */
1031   -void blk_insert_request(struct request_queue *q, struct request *rq,
1032   - int at_head, void *data)
1033   -{
1034   - int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
1035   - unsigned long flags;
1036   -
1037   - /*
1038   - * tell I/O scheduler that this isn't a regular read/write (ie it
1039   - * must not attempt merges on this) and that it acts as a soft
1040   - * barrier
1041   - */
1042   - rq->cmd_type = REQ_TYPE_SPECIAL;
1043   -
1044   - rq->special = data;
1045   -
1046   - spin_lock_irqsave(q->queue_lock, flags);
1047   -
1048   - /*
1049   - * If command is tagged, release the tag
1050   - */
1051   - if (blk_rq_tagged(rq))
1052   - blk_queue_end_tag(q, rq);
1053   -
1054   - add_acct_request(q, rq, where);
1055   - __blk_run_queue(q);
1056   - spin_unlock_irqrestore(q->queue_lock, flags);
1057   -}
1058   -EXPORT_SYMBOL(blk_insert_request);
1059   -
1060 1080 static void part_round_stats_single(int cpu, struct hd_struct *part,
1061 1081 unsigned long now)
1062 1082 {
... ... @@ -1766,6 +1786,10 @@
1766 1786 return -EIO;
1767 1787  
1768 1788 spin_lock_irqsave(q->queue_lock, flags);
  1789 + if (unlikely(blk_queue_dead(q))) {
  1790 + spin_unlock_irqrestore(q->queue_lock, flags);
  1791 + return -ENODEV;
  1792 + }
1769 1793  
1770 1794 /*
1771 1795 * Submitting request must be dequeued before calling this function
... ... @@ -2740,6 +2764,14 @@
2740 2764 trace_block_unplug(q, depth, !from_schedule);
2741 2765  
2742 2766 /*
  2767 + * Don't mess with dead queue.
  2768 + */
  2769 + if (unlikely(blk_queue_dead(q))) {
  2770 + spin_unlock(q->queue_lock);
  2771 + return;
  2772 + }
  2773 +
  2774 + /*
2743 2775 * If we are punting this to kblockd, then we can safely drop
2744 2776 * the queue_lock before waking kblockd (which needs to take
2745 2777 * this lock).
... ... @@ -2815,6 +2847,15 @@
2815 2847 depth = 0;
2816 2848 spin_lock(q->queue_lock);
2817 2849 }
  2850 +
  2851 + /*
  2852 + * Short-circuit if @q is dead
  2853 + */
  2854 + if (unlikely(blk_queue_dead(q))) {
  2855 + __blk_end_request_all(rq, -ENODEV);
  2856 + continue;
  2857 + }
  2858 +
2818 2859 /*
2819 2860 * rq is already accounted, so use raw insert
2820 2861 */
block/blk-exec.c
... ... @@ -50,7 +50,11 @@
50 50 {
51 51 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
52 52  
53   - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
  53 + WARN_ON(irqs_disabled());
  54 + spin_lock_irq(q->queue_lock);
  55 +
  56 + if (unlikely(blk_queue_dead(q))) {
  57 + spin_unlock_irq(q->queue_lock);
54 58 rq->errors = -ENXIO;
55 59 if (rq->end_io)
56 60 rq->end_io(rq, rq->errors);
... ... @@ -59,8 +63,6 @@
59 63  
60 64 rq->rq_disk = bd_disk;
61 65 rq->end_io = done;
62   - WARN_ON(irqs_disabled());
63   - spin_lock_irq(q->queue_lock);
64 66 __elv_add_request(q, rq, where);
65 67 __blk_run_queue(q);
66 68 /* the queue is stopped so it won't be run */
block/blk-ioc.c
... ... @@ -16,53 +16,214 @@
16 16 */
17 17 static struct kmem_cache *iocontext_cachep;
18 18  
19   -static void cfq_dtor(struct io_context *ioc)
  19 +/**
  20 + * get_io_context - increment reference count to io_context
  21 + * @ioc: io_context to get
  22 + *
  23 + * Increment reference count to @ioc.
  24 + */
  25 +void get_io_context(struct io_context *ioc)
20 26 {
21   - if (!hlist_empty(&ioc->cic_list)) {
22   - struct cfq_io_context *cic;
  27 + BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
  28 + atomic_long_inc(&ioc->refcount);
  29 +}
  30 +EXPORT_SYMBOL(get_io_context);
23 31  
24   - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
25   - cic_list);
26   - cic->dtor(ioc);
  32 +/*
  33 + * Releasing ioc may nest into another put_io_context() leading to nested
  34 + * fast path release. As the ioc's can't be the same, this is okay but
  35 + * makes lockdep whine. Keep track of nesting and use it as subclass.
  36 + */
  37 +#ifdef CONFIG_LOCKDEP
  38 +#define ioc_release_depth(q) ((q) ? (q)->ioc_release_depth : 0)
  39 +#define ioc_release_depth_inc(q) (q)->ioc_release_depth++
  40 +#define ioc_release_depth_dec(q) (q)->ioc_release_depth--
  41 +#else
  42 +#define ioc_release_depth(q) 0
  43 +#define ioc_release_depth_inc(q) do { } while (0)
  44 +#define ioc_release_depth_dec(q) do { } while (0)
  45 +#endif
  46 +
  47 +static void icq_free_icq_rcu(struct rcu_head *head)
  48 +{
  49 + struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
  50 +
  51 + kmem_cache_free(icq->__rcu_icq_cache, icq);
  52 +}
  53 +
  54 +/*
  55 + * Exit and free an icq. Called with both ioc and q locked.
  56 + */
  57 +static void ioc_exit_icq(struct io_cq *icq)
  58 +{
  59 + struct io_context *ioc = icq->ioc;
  60 + struct request_queue *q = icq->q;
  61 + struct elevator_type *et = q->elevator->type;
  62 +
  63 + lockdep_assert_held(&ioc->lock);
  64 + lockdep_assert_held(q->queue_lock);
  65 +
  66 + radix_tree_delete(&ioc->icq_tree, icq->q->id);
  67 + hlist_del_init(&icq->ioc_node);
  68 + list_del_init(&icq->q_node);
  69 +
  70 + /*
  71 + * Both setting lookup hint to and clearing it from @icq are done
  72 + * under queue_lock. If it's not pointing to @icq now, it never
  73 + * will. Hint assignment itself can race safely.
  74 + */
  75 + if (rcu_dereference_raw(ioc->icq_hint) == icq)
  76 + rcu_assign_pointer(ioc->icq_hint, NULL);
  77 +
  78 + if (et->ops.elevator_exit_icq_fn) {
  79 + ioc_release_depth_inc(q);
  80 + et->ops.elevator_exit_icq_fn(icq);
  81 + ioc_release_depth_dec(q);
27 82 }
  83 +
  84 + /*
  85 + * @icq->q might have gone away by the time RCU callback runs
  86 + * making it impossible to determine icq_cache. Record it in @icq.
  87 + */
  88 + icq->__rcu_icq_cache = et->icq_cache;
  89 + call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
28 90 }
29 91  
30 92 /*
31   - * IO Context helper functions. put_io_context() returns 1 if there are no
32   - * more users of this io context, 0 otherwise.
  93 + * Slow path for ioc release in put_io_context(). Performs double-lock
  94 + * dancing to unlink all icq's and then frees ioc.
33 95 */
34   -int put_io_context(struct io_context *ioc)
  96 +static void ioc_release_fn(struct work_struct *work)
35 97 {
36   - if (ioc == NULL)
37   - return 1;
  98 + struct io_context *ioc = container_of(work, struct io_context,
  99 + release_work);
  100 + struct request_queue *last_q = NULL;
38 101  
39   - BUG_ON(atomic_long_read(&ioc->refcount) == 0);
  102 + spin_lock_irq(&ioc->lock);
40 103  
41   - if (atomic_long_dec_and_test(&ioc->refcount)) {
42   - rcu_read_lock();
43   - cfq_dtor(ioc);
44   - rcu_read_unlock();
  104 + while (!hlist_empty(&ioc->icq_list)) {
  105 + struct io_cq *icq = hlist_entry(ioc->icq_list.first,
  106 + struct io_cq, ioc_node);
  107 + struct request_queue *this_q = icq->q;
45 108  
46   - kmem_cache_free(iocontext_cachep, ioc);
47   - return 1;
  109 + if (this_q != last_q) {
  110 + /*
  111 + * Need to switch to @this_q. Once we release
  112 + * @ioc->lock, it can go away along with @cic.
  113 + * Hold on to it.
  114 + */
  115 + __blk_get_queue(this_q);
  116 +
  117 + /*
  118 + * blk_put_queue() might sleep thanks to kobject
  119 + * idiocy. Always release both locks, put and
  120 + * restart.
  121 + */
  122 + if (last_q) {
  123 + spin_unlock(last_q->queue_lock);
  124 + spin_unlock_irq(&ioc->lock);
  125 + blk_put_queue(last_q);
  126 + } else {
  127 + spin_unlock_irq(&ioc->lock);
  128 + }
  129 +
  130 + last_q = this_q;
  131 + spin_lock_irq(this_q->queue_lock);
  132 + spin_lock(&ioc->lock);
  133 + continue;
  134 + }
  135 + ioc_exit_icq(icq);
48 136 }
49   - return 0;
  137 +
  138 + if (last_q) {
  139 + spin_unlock(last_q->queue_lock);
  140 + spin_unlock_irq(&ioc->lock);
  141 + blk_put_queue(last_q);
  142 + } else {
  143 + spin_unlock_irq(&ioc->lock);
  144 + }
  145 +
  146 + kmem_cache_free(iocontext_cachep, ioc);
50 147 }
51   -EXPORT_SYMBOL(put_io_context);
52 148  
53   -static void cfq_exit(struct io_context *ioc)
  149 +/**
  150 + * put_io_context - put a reference of io_context
  151 + * @ioc: io_context to put
  152 + * @locked_q: request_queue the caller is holding queue_lock of (hint)
  153 + *
  154 + * Decrement reference count of @ioc and release it if the count reaches
  155 + * zero. If the caller is holding queue_lock of a queue, it can indicate
  156 + * that with @locked_q. This is an optimization hint and the caller is
  157 + * allowed to pass in %NULL even when it's holding a queue_lock.
  158 + */
  159 +void put_io_context(struct io_context *ioc, struct request_queue *locked_q)
54 160 {
55   - rcu_read_lock();
  161 + struct request_queue *last_q = locked_q;
  162 + unsigned long flags;
56 163  
57   - if (!hlist_empty(&ioc->cic_list)) {
58   - struct cfq_io_context *cic;
  164 + if (ioc == NULL)
  165 + return;
59 166  
60   - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
61   - cic_list);
62   - cic->exit(ioc);
  167 + BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
  168 + if (locked_q)
  169 + lockdep_assert_held(locked_q->queue_lock);
  170 +
  171 + if (!atomic_long_dec_and_test(&ioc->refcount))
  172 + return;
  173 +
  174 + /*
  175 + * Destroy @ioc. This is a bit messy because icq's are chained
  176 + * from both ioc and queue, and ioc->lock nests inside queue_lock.
  177 + * The inner ioc->lock should be held to walk our icq_list and then
  178 + * for each icq the outer matching queue_lock should be grabbed.
  179 + * ie. We need to do reverse-order double lock dancing.
  180 + *
  181 + * Another twist is that we are often called with one of the
  182 + * matching queue_locks held as indicated by @locked_q, which
  183 + * prevents performing double-lock dance for other queues.
  184 + *
  185 + * So, we do it in two stages. The fast path uses the queue_lock
  186 + * the caller is holding and, if other queues need to be accessed,
  187 + * uses trylock to avoid introducing locking dependency. This can
  188 + * handle most cases, especially if @ioc was performing IO on only
  189 + * single device.
  190 + *
  191 + * If trylock doesn't cut it, we defer to @ioc->release_work which
  192 + * can do all the double-locking dancing.
  193 + */
  194 + spin_lock_irqsave_nested(&ioc->lock, flags,
  195 + ioc_release_depth(locked_q));
  196 +
  197 + while (!hlist_empty(&ioc->icq_list)) {
  198 + struct io_cq *icq = hlist_entry(ioc->icq_list.first,
  199 + struct io_cq, ioc_node);
  200 + struct request_queue *this_q = icq->q;
  201 +
  202 + if (this_q != last_q) {
  203 + if (last_q && last_q != locked_q)
  204 + spin_unlock(last_q->queue_lock);
  205 + last_q = NULL;
  206 +
  207 + if (!spin_trylock(this_q->queue_lock))
  208 + break;
  209 + last_q = this_q;
  210 + continue;
  211 + }
  212 + ioc_exit_icq(icq);
63 213 }
64   - rcu_read_unlock();
  214 +
  215 + if (last_q && last_q != locked_q)
  216 + spin_unlock(last_q->queue_lock);
  217 +
  218 + spin_unlock_irqrestore(&ioc->lock, flags);
  219 +
  220 + /* if no icq is left, we're done; otherwise, kick release_work */
  221 + if (hlist_empty(&ioc->icq_list))
  222 + kmem_cache_free(iocontext_cachep, ioc);
  223 + else
  224 + schedule_work(&ioc->release_work);
65 225 }
  226 +EXPORT_SYMBOL(put_io_context);
66 227  
67 228 /* Called by the exiting task */
68 229 void exit_io_context(struct task_struct *task)
... ... @@ -74,86 +235,240 @@
74 235 task->io_context = NULL;
75 236 task_unlock(task);
76 237  
77   - if (atomic_dec_and_test(&ioc->nr_tasks))
78   - cfq_exit(ioc);
  238 + atomic_dec(&ioc->nr_tasks);
  239 + put_io_context(ioc, NULL);
  240 +}
79 241  
80   - put_io_context(ioc);
  242 +/**
  243 + * ioc_clear_queue - break any ioc association with the specified queue
  244 + * @q: request_queue being cleared
  245 + *
  246 + * Walk @q->icq_list and exit all io_cq's. Must be called with @q locked.
  247 + */
  248 +void ioc_clear_queue(struct request_queue *q)
  249 +{
  250 + lockdep_assert_held(q->queue_lock);
  251 +
  252 + while (!list_empty(&q->icq_list)) {
  253 + struct io_cq *icq = list_entry(q->icq_list.next,
  254 + struct io_cq, q_node);
  255 + struct io_context *ioc = icq->ioc;
  256 +
  257 + spin_lock(&ioc->lock);
  258 + ioc_exit_icq(icq);
  259 + spin_unlock(&ioc->lock);
  260 + }
81 261 }
82 262  
83   -struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
  263 +void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
  264 + int node)
84 265 {
85 266 struct io_context *ioc;
86 267  
87   - ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
88   - if (ioc) {
89   - atomic_long_set(&ioc->refcount, 1);
90   - atomic_set(&ioc->nr_tasks, 1);
91   - spin_lock_init(&ioc->lock);
92   - ioc->ioprio_changed = 0;
93   - ioc->ioprio = 0;
94   - ioc->last_waited = 0; /* doesn't matter... */
95   - ioc->nr_batch_requests = 0; /* because this is 0 */
96   - INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
97   - INIT_HLIST_HEAD(&ioc->cic_list);
98   - ioc->ioc_data = NULL;
99   -#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
100   - ioc->cgroup_changed = 0;
101   -#endif
102   - }
  268 + ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
  269 + node);
  270 + if (unlikely(!ioc))
  271 + return;
103 272  
104   - return ioc;
  273 + /* initialize */
  274 + atomic_long_set(&ioc->refcount, 1);
  275 + atomic_set(&ioc->nr_tasks, 1);
  276 + spin_lock_init(&ioc->lock);
  277 + INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
  278 + INIT_HLIST_HEAD(&ioc->icq_list);
  279 + INIT_WORK(&ioc->release_work, ioc_release_fn);
  280 +
  281 + /*
  282 + * Try to install. ioc shouldn't be installed if someone else
  283 + * already did or @task, which isn't %current, is exiting. Note
  284 + * that we need to allow ioc creation on exiting %current as exit
  285 + * path may issue IOs from e.g. exit_files(). The exit path is
  286 + * responsible for not issuing IO after exit_io_context().
  287 + */
  288 + task_lock(task);
  289 + if (!task->io_context &&
  290 + (task == current || !(task->flags & PF_EXITING)))
  291 + task->io_context = ioc;
  292 + else
  293 + kmem_cache_free(iocontext_cachep, ioc);
  294 + task_unlock(task);
105 295 }
106 296  
107   -/*
108   - * If the current task has no IO context then create one and initialise it.
109   - * Otherwise, return its existing IO context.
  297 +/**
  298 + * get_task_io_context - get io_context of a task
  299 + * @task: task of interest
  300 + * @gfp_flags: allocation flags, used if allocation is necessary
  301 + * @node: allocation node, used if allocation is necessary
110 302 *
111   - * This returned IO context doesn't have a specifically elevated refcount,
112   - * but since the current task itself holds a reference, the context can be
113   - * used in general code, so long as it stays within `current` context.
  303 + * Return io_context of @task. If it doesn't exist, it is created with
  304 + * @gfp_flags and @node. The returned io_context has its reference count
  305 + * incremented.
  306 + *
  307 + * This function always goes through task_lock() and it's better to use
  308 + * %current->io_context + get_io_context() for %current.
114 309 */
115   -struct io_context *current_io_context(gfp_t gfp_flags, int node)
  310 +struct io_context *get_task_io_context(struct task_struct *task,
  311 + gfp_t gfp_flags, int node)
116 312 {
117   - struct task_struct *tsk = current;
118   - struct io_context *ret;
  313 + struct io_context *ioc;
119 314  
120   - ret = tsk->io_context;
121   - if (likely(ret))
122   - return ret;
  315 + might_sleep_if(gfp_flags & __GFP_WAIT);
123 316  
124   - ret = alloc_io_context(gfp_flags, node);
125   - if (ret) {
126   - /* make sure set_task_ioprio() sees the settings above */
127   - smp_wmb();
128   - tsk->io_context = ret;
129   - }
  317 + do {
  318 + task_lock(task);
  319 + ioc = task->io_context;
  320 + if (likely(ioc)) {
  321 + get_io_context(ioc);
  322 + task_unlock(task);
  323 + return ioc;
  324 + }
  325 + task_unlock(task);
  326 + } while (create_io_context(task, gfp_flags, node));
130 327  
131   - return ret;
  328 + return NULL;
132 329 }
  330 +EXPORT_SYMBOL(get_task_io_context);
133 331  
134   -/*
135   - * If the current task has no IO context then create one and initialise it.
136   - * If it does have a context, take a ref on it.
  332 +/**
  333 + * ioc_lookup_icq - lookup io_cq from ioc
  334 + * @ioc: the associated io_context
  335 + * @q: the associated request_queue
137 336 *
138   - * This is always called in the context of the task which submitted the I/O.
  337 + * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
  338 + * with @q->queue_lock held.
139 339 */
140   -struct io_context *get_io_context(gfp_t gfp_flags, int node)
  340 +struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
141 341 {
142   - struct io_context *ioc = NULL;
  342 + struct io_cq *icq;
143 343  
  344 + lockdep_assert_held(q->queue_lock);
  345 +
144 346 /*
145   - * Check for unlikely race with exiting task. ioc ref count is
146   - * zero when ioc is being detached.
  347 + * icq's are indexed from @ioc using radix tree and hint pointer,
  348 + * both of which are protected with RCU. All removals are done
  349 + * holding both q and ioc locks, and we're holding q lock - if we
  350 + * find a icq which points to us, it's guaranteed to be valid.
147 351 */
148   - do {
149   - ioc = current_io_context(gfp_flags, node);
150   - if (unlikely(!ioc))
151   - break;
152   - } while (!atomic_long_inc_not_zero(&ioc->refcount));
  352 + rcu_read_lock();
  353 + icq = rcu_dereference(ioc->icq_hint);
  354 + if (icq && icq->q == q)
  355 + goto out;
153 356  
154   - return ioc;
  357 + icq = radix_tree_lookup(&ioc->icq_tree, q->id);
  358 + if (icq && icq->q == q)
  359 + rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */
  360 + else
  361 + icq = NULL;
  362 +out:
  363 + rcu_read_unlock();
  364 + return icq;
155 365 }
156   -EXPORT_SYMBOL(get_io_context);
  366 +EXPORT_SYMBOL(ioc_lookup_icq);
  367 +
  368 +/**
  369 + * ioc_create_icq - create and link io_cq
  370 + * @q: request_queue of interest
  371 + * @gfp_mask: allocation mask
  372 + *
  373 + * Make sure io_cq linking %current->io_context and @q exists. If either
  374 + * io_context and/or icq don't exist, they will be created using @gfp_mask.
  375 + *
  376 + * The caller is responsible for ensuring @ioc won't go away and @q is
  377 + * alive and will stay alive until this function returns.
  378 + */
  379 +struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
  380 +{
  381 + struct elevator_type *et = q->elevator->type;
  382 + struct io_context *ioc;
  383 + struct io_cq *icq;
  384 +
  385 + /* allocate stuff */
  386 + ioc = create_io_context(current, gfp_mask, q->node);
  387 + if (!ioc)
  388 + return NULL;
  389 +
  390 + icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
  391 + q->node);
  392 + if (!icq)
  393 + return NULL;
  394 +
  395 + if (radix_tree_preload(gfp_mask) < 0) {
  396 + kmem_cache_free(et->icq_cache, icq);
  397 + return NULL;
  398 + }
  399 +
  400 + icq->ioc = ioc;
  401 + icq->q = q;
  402 + INIT_LIST_HEAD(&icq->q_node);
  403 + INIT_HLIST_NODE(&icq->ioc_node);
  404 +
  405 + /* lock both q and ioc and try to link @icq */
  406 + spin_lock_irq(q->queue_lock);
  407 + spin_lock(&ioc->lock);
  408 +
  409 + if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
  410 + hlist_add_head(&icq->ioc_node, &ioc->icq_list);
  411 + list_add(&icq->q_node, &q->icq_list);
  412 + if (et->ops.elevator_init_icq_fn)
  413 + et->ops.elevator_init_icq_fn(icq);
  414 + } else {
  415 + kmem_cache_free(et->icq_cache, icq);
  416 + icq = ioc_lookup_icq(ioc, q);
  417 + if (!icq)
  418 + printk(KERN_ERR "cfq: icq link failed!\n");
  419 + }
  420 +
  421 + spin_unlock(&ioc->lock);
  422 + spin_unlock_irq(q->queue_lock);
  423 + radix_tree_preload_end();
  424 + return icq;
  425 +}
  426 +
  427 +void ioc_set_changed(struct io_context *ioc, int which)
  428 +{
  429 + struct io_cq *icq;
  430 + struct hlist_node *n;
  431 +
  432 + hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node)
  433 + set_bit(which, &icq->changed);
  434 +}
  435 +
  436 +/**
  437 + * ioc_ioprio_changed - notify ioprio change
  438 + * @ioc: io_context of interest
  439 + * @ioprio: new ioprio
  440 + *
  441 + * @ioc's ioprio has changed to @ioprio. Set %ICQ_IOPRIO_CHANGED for all
  442 + * icq's. iosched is responsible for checking the bit and applying it on
  443 + * request issue path.
  444 + */
  445 +void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
  446 +{
  447 + unsigned long flags;
  448 +
  449 + spin_lock_irqsave(&ioc->lock, flags);
  450 + ioc->ioprio = ioprio;
  451 + ioc_set_changed(ioc, ICQ_IOPRIO_CHANGED);
  452 + spin_unlock_irqrestore(&ioc->lock, flags);
  453 +}
  454 +
  455 +/**
  456 + * ioc_cgroup_changed - notify cgroup change
  457 + * @ioc: io_context of interest
  458 + *
  459 + * @ioc's cgroup has changed. Set %ICQ_CGROUP_CHANGED for all icq's.
  460 + * iosched is responsible for checking the bit and applying it on request
  461 + * issue path.
  462 + */
  463 +void ioc_cgroup_changed(struct io_context *ioc)
  464 +{
  465 + unsigned long flags;
  466 +
  467 + spin_lock_irqsave(&ioc->lock, flags);
  468 + ioc_set_changed(ioc, ICQ_CGROUP_CHANGED);
  469 + spin_unlock_irqrestore(&ioc->lock, flags);
  470 +}
  471 +EXPORT_SYMBOL(ioc_cgroup_changed);
157 472  
158 473 static int __init blk_ioc_init(void)
159 474 {
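
With the old cic dtor/exit hooks gone, external users go through get_task_io_context()/put_io_context() plus the *_changed() notifiers, as in the blk-cgroup.c hunk at the top. A sketch of an ioprio-side caller in the same style; my_notify_ioprio() is an illustrative wrapper, and only the get_task_io_context()/ioc_ioprio_changed()/put_io_context() calls come from this diff:

	/* Illustrative: flag @task's icq's that its ioprio became @ioprio. */
	static void my_notify_ioprio(struct task_struct *task, int ioprio)
	{
		struct io_context *ioc;

		/* GFP_ATOMIC: allocation may fail, in which case there is nothing to flag */
		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
		if (ioc) {
			ioc_ioprio_changed(ioc, ioprio);
			put_io_context(ioc, NULL);	/* no queue_lock held here */
		}
	}
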
block/blk-settings.c
... ... @@ -104,9 +104,7 @@
104 104 * @lim: the queue_limits structure to reset
105 105 *
106 106 * Description:
107   - * Returns a queue_limit struct to its default state. Can be used by
108   - * stacking drivers like DM that stage table swaps and reuse an
109   - * existing device queue.
  107 + * Returns a queue_limit struct to its default state.
110 108 */
111 109 void blk_set_default_limits(struct queue_limits *lim)
112 110 {
... ... @@ -114,13 +112,12 @@
114 112 lim->max_integrity_segments = 0;
115 113 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
116 114 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
117   - lim->max_sectors = BLK_DEF_MAX_SECTORS;
118   - lim->max_hw_sectors = INT_MAX;
  115 + lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
119 116 lim->max_discard_sectors = 0;
120 117 lim->discard_granularity = 0;
121 118 lim->discard_alignment = 0;
122 119 lim->discard_misaligned = 0;
123   - lim->discard_zeroes_data = 1;
  120 + lim->discard_zeroes_data = 0;
124 121 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
125 122 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
126 123 lim->alignment_offset = 0;
... ... @@ -131,6 +128,27 @@
131 128 EXPORT_SYMBOL(blk_set_default_limits);
132 129  
133 130 /**
  131 + * blk_set_stacking_limits - set default limits for stacking devices
  132 + * @lim: the queue_limits structure to reset
  133 + *
  134 + * Description:
  135 + * Returns a queue_limit struct to its default state. Should be used
  136 + * by stacking drivers like DM that have no internal limits.
  137 + */
  138 +void blk_set_stacking_limits(struct queue_limits *lim)
  139 +{
  140 + blk_set_default_limits(lim);
  141 +
  142 + /* Inherit limits from component devices */
  143 + lim->discard_zeroes_data = 1;
  144 + lim->max_segments = USHRT_MAX;
  145 + lim->max_hw_sectors = UINT_MAX;
  146 +
  147 + lim->max_sectors = BLK_DEF_MAX_SECTORS;
  148 +}
  149 +EXPORT_SYMBOL(blk_set_stacking_limits);
  150 +
  151 +/**
134 152 * blk_queue_make_request - define an alternate make_request function for a device
135 153 * @q: the request queue for the device to be affected
136 154 * @mfn: the alternate make_request function
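
blk_set_stacking_limits() deliberately starts a stacking driver wide open so that stacking in component limits can only tighten them. A rough sketch of how such a driver might use it together with the existing bdev_stack_limits() helper (which is not part of this diff); my_stacked_setup() and the parts[] array are illustrative assumptions:

	/* Derive a stacked queue's limits from its component block devices. */
	static void my_stacked_setup(struct request_queue *q,
				     struct block_device **parts, int nr)
	{
		struct queue_limits lim;
		int i;

		blk_set_stacking_limits(&lim);		/* permissive starting point */

		for (i = 0; i < nr; i++)		/* shrink to the common subset */
			if (bdev_stack_limits(&lim, parts[i], 0))
				pr_warn("component %d is misaligned\n", i);

		q->limits = lim;			/* publish the combined limits */
	}
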
... ... @@ -165,8 +183,6 @@
165 183 q->nr_batching = BLK_BATCH_REQ;
166 184  
167 185 blk_set_default_limits(&q->limits);
168   - blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
169   - q->limits.discard_zeroes_data = 0;
170 186  
171 187 /*
172 188 * by default assume old behaviour and bounce for any highmem page
block/blk-sysfs.c
... ... @@ -425,7 +425,7 @@
425 425 if (!entry->show)
426 426 return -EIO;
427 427 mutex_lock(&q->sysfs_lock);
428   - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
  428 + if (blk_queue_dead(q)) {
429 429 mutex_unlock(&q->sysfs_lock);
430 430 return -ENOENT;
431 431 }
... ... @@ -447,7 +447,7 @@
447 447  
448 448 q = container_of(kobj, struct request_queue, kobj);
449 449 mutex_lock(&q->sysfs_lock);
450   - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
  450 + if (blk_queue_dead(q)) {
451 451 mutex_unlock(&q->sysfs_lock);
452 452 return -ENOENT;
453 453 }
... ... @@ -479,8 +479,12 @@
479 479  
480 480 blk_sync_queue(q);
481 481  
482   - if (q->elevator)
  482 + if (q->elevator) {
  483 + spin_lock_irq(q->queue_lock);
  484 + ioc_clear_queue(q);