Commit b3c9dd182ed3bdcdaf0e42625a35924b0497afdc

Authored by Linus Torvalds

Merge branch 'for-3.3/core' of git://git.kernel.dk/linux-block

* 'for-3.3/core' of git://git.kernel.dk/linux-block: (37 commits)
  Revert "block: recursive merge requests"
  block: Stop using macro stubs for the bio data integrity calls
  blockdev: convert some macros to static inlines
  fs: remove unneeded plug in mpage_readpages()
  block: Add BLKROTATIONAL ioctl
  block: Introduce blk_set_stacking_limits function
  block: remove WARN_ON_ONCE() in exit_io_context()
  block: an exiting task should be allowed to create io_context
  block: ioc_cgroup_changed() needs to be exported
  block: recursive merge requests
  block, cfq: fix empty queue crash caused by request merge
  block, cfq: move icq creation and rq->elv.icq association to block core
  block, cfq: restructure io_cq creation path for io_context interface cleanup
  block, cfq: move io_cq exit/release to blk-ioc.c
  block, cfq: move icq cache management to block core
  block, cfq: move io_cq lookup to blk-ioc.c
  block, cfq: move cfqd->icq_list to request_queue and add request->elv.icq
  block, cfq: reorganize cfq_io_context into generic and cfq specific parts
  block: remove elevator_queue->ops
  block: reorder elevator switch sequence
  ...

Fix up conflicts in:
 - block/blk-cgroup.c
	Switch from can_attach_task to can_attach
 - block/cfq-iosched.c
	conflict with now removed cic index changes (we now use q->id instead)

Showing 28 changed files

block/blk-cgroup.c
... ... @@ -1655,11 +1655,12 @@
1655 1655 struct io_context *ioc;
1656 1656  
1657 1657 cgroup_taskset_for_each(task, cgrp, tset) {
1658   - task_lock(task);
1659   - ioc = task->io_context;
1660   - if (ioc)
1661   - ioc->cgroup_changed = 1;
1662   - task_unlock(task);
  1658 + /* we don't lose anything even if ioc allocation fails */
  1659 + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
  1660 + if (ioc) {
  1661 + ioc_cgroup_changed(ioc);
  1662 + put_io_context(ioc, NULL);
  1663 + }
1663 1664 }
1664 1665 }
1665 1666  
block/blk-core.c
... ... @@ -39,6 +39,8 @@
39 39 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
40 40 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
41 41  
  42 +DEFINE_IDA(blk_queue_ida);
  43 +
42 44 /*
43 45 * For the allocated request tables
44 46 */
... ... @@ -358,7 +360,8 @@
358 360 void blk_drain_queue(struct request_queue *q, bool drain_all)
359 361 {
360 362 while (true) {
361   - int nr_rqs;
  363 + bool drain = false;
  364 + int i;
362 365  
363 366 spin_lock_irq(q->queue_lock);
364 367  
365 368  
366 369  
... ... @@ -375,14 +378,25 @@
375 378 if (!list_empty(&q->queue_head))
376 379 __blk_run_queue(q);
377 380  
378   - if (drain_all)
379   - nr_rqs = q->rq.count[0] + q->rq.count[1];
380   - else
381   - nr_rqs = q->rq.elvpriv;
  381 + drain |= q->rq.elvpriv;
382 382  
  383 + /*
  384 + * Unfortunately, requests are queued at and tracked from
  385 + * multiple places and there's no single counter which can
  386 + * be drained. Check all the queues and counters.
  387 + */
  388 + if (drain_all) {
  389 + drain |= !list_empty(&q->queue_head);
  390 + for (i = 0; i < 2; i++) {
  391 + drain |= q->rq.count[i];
  392 + drain |= q->in_flight[i];
  393 + drain |= !list_empty(&q->flush_queue[i]);
  394 + }
  395 + }
  396 +
383 397 spin_unlock_irq(q->queue_lock);
384 398  
385   - if (!nr_rqs)
  399 + if (!drain)
386 400 break;
387 401 msleep(10);
388 402 }
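
The drain logic above boils down to OR-ing every request counter into a single boolean under the queue lock and polling until all of them read zero. A minimal userspace sketch of that polling pattern, using a pthread mutex in place of queue_lock and made-up names (fake_queue, rq_count, ...):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

struct fake_queue {
	pthread_mutex_t lock;
	int elvpriv;
	int rq_count[2];
	int in_flight[2];
};

static void drain_queue(struct fake_queue *q, bool drain_all)
{
	while (true) {
		bool drain = false;
		int i;

		pthread_mutex_lock(&q->lock);

		drain |= q->elvpriv;
		if (drain_all) {
			/* no single counter exists; OR them all in */
			for (i = 0; i < 2; i++) {
				drain |= q->rq_count[i];
				drain |= q->in_flight[i];
			}
		}

		pthread_mutex_unlock(&q->lock);

		if (!drain)
			break;

		/* roughly msleep(10) */
		nanosleep(&(struct timespec){ .tv_nsec = 10 * 1000 * 1000 }, NULL);
	}
}

int main(void)
{
	struct fake_queue q = { .lock = PTHREAD_MUTEX_INITIALIZER };

	drain_queue(&q, true);
	printf("drained\n");
	return 0;
}
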
... ... @@ -469,6 +483,10 @@
469 483 if (!q)
470 484 return NULL;
471 485  
  486 + q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
  487 + if (q->id < 0)
  488 + goto fail_q;
  489 +
472 490 q->backing_dev_info.ra_pages =
473 491 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
474 492 q->backing_dev_info.state = 0;
475 493  
476 494  
... ... @@ -477,20 +495,17 @@
477 495 q->node = node_id;
478 496  
479 497 err = bdi_init(&q->backing_dev_info);
480   - if (err) {
481   - kmem_cache_free(blk_requestq_cachep, q);
482   - return NULL;
483   - }
  498 + if (err)
  499 + goto fail_id;
484 500  
485   - if (blk_throtl_init(q)) {
486   - kmem_cache_free(blk_requestq_cachep, q);
487   - return NULL;
488   - }
  501 + if (blk_throtl_init(q))
  502 + goto fail_id;
489 503  
490 504 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
491 505 laptop_mode_timer_fn, (unsigned long) q);
492 506 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
493 507 INIT_LIST_HEAD(&q->timeout_list);
  508 + INIT_LIST_HEAD(&q->icq_list);
494 509 INIT_LIST_HEAD(&q->flush_queue[0]);
495 510 INIT_LIST_HEAD(&q->flush_queue[1]);
496 511 INIT_LIST_HEAD(&q->flush_data_in_flight);
... ... @@ -508,6 +523,12 @@
508 523 q->queue_lock = &q->__queue_lock;
509 524  
510 525 return q;
  526 +
  527 +fail_id:
  528 + ida_simple_remove(&blk_queue_ida, q->id);
  529 +fail_q:
  530 + kmem_cache_free(blk_requestq_cachep, q);
  531 + return NULL;
511 532 }
512 533 EXPORT_SYMBOL(blk_alloc_queue_node);
513 534  
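
The new fail_id/fail_q labels follow the usual kernel error-unwind idiom: allocate in order, release in reverse order on failure, with the IDA id freed exactly once. A hedged sketch of the same pattern outside the block layer (my_ida, my_obj and the 128-byte buffer are made-up illustration, not kernel symbols; a fragment, not a buildable module):

#include <linux/idr.h>
#include <linux/slab.h>

static DEFINE_IDA(my_ida);

struct my_obj {
	int id;
	void *data;
};

static struct my_obj *my_obj_alloc(void)
{
	struct my_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (!obj)
		return NULL;

	obj->id = ida_simple_get(&my_ida, 0, 0, GFP_KERNEL);
	if (obj->id < 0)
		goto fail_obj;

	obj->data = kzalloc(128, GFP_KERNEL);
	if (!obj->data)
		goto fail_id;

	return obj;

fail_id:
	ida_simple_remove(&my_ida, obj->id);
fail_obj:
	kfree(obj);
	return NULL;
}

static void my_obj_free(struct my_obj *obj)
{
	kfree(obj->data);
	ida_simple_remove(&my_ida, obj->id);
	kfree(obj);
}
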
... ... @@ -605,26 +626,31 @@
605 626 }
606 627 EXPORT_SYMBOL(blk_init_allocated_queue);
607 628  
608   -int blk_get_queue(struct request_queue *q)
  629 +bool blk_get_queue(struct request_queue *q)
609 630 {
610   - if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
611   - kobject_get(&q->kobj);
612   - return 0;
  631 + if (likely(!blk_queue_dead(q))) {
  632 + __blk_get_queue(q);
  633 + return true;
613 634 }
614 635  
615   - return 1;
  636 + return false;
616 637 }
617 638 EXPORT_SYMBOL(blk_get_queue);
618 639  
619 640 static inline void blk_free_request(struct request_queue *q, struct request *rq)
620 641 {
621   - if (rq->cmd_flags & REQ_ELVPRIV)
  642 + if (rq->cmd_flags & REQ_ELVPRIV) {
622 643 elv_put_request(q, rq);
  644 + if (rq->elv.icq)
  645 + put_io_context(rq->elv.icq->ioc, q);
  646 + }
  647 +
623 648 mempool_free(rq, q->rq.rq_pool);
624 649 }
625 650  
626 651 static struct request *
627   -blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask)
  652 +blk_alloc_request(struct request_queue *q, struct io_cq *icq,
  653 + unsigned int flags, gfp_t gfp_mask)
628 654 {
629 655 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
630 656  
... ... @@ -635,10 +661,15 @@
635 661  
636 662 rq->cmd_flags = flags | REQ_ALLOCED;
637 663  
638   - if ((flags & REQ_ELVPRIV) &&
639   - unlikely(elv_set_request(q, rq, gfp_mask))) {
640   - mempool_free(rq, q->rq.rq_pool);
641   - return NULL;
  664 + if (flags & REQ_ELVPRIV) {
  665 + rq->elv.icq = icq;
  666 + if (unlikely(elv_set_request(q, rq, gfp_mask))) {
  667 + mempool_free(rq, q->rq.rq_pool);
  668 + return NULL;
  669 + }
  670 + /* @rq->elv.icq holds on to io_context until @rq is freed */
  671 + if (icq)
  672 + get_io_context(icq->ioc);
642 673 }
643 674  
644 675 return rq;
645 676  
646 677  
647 678  
... ... @@ -750,11 +781,17 @@
750 781 {
751 782 struct request *rq = NULL;
752 783 struct request_list *rl = &q->rq;
753   - struct io_context *ioc = NULL;
  784 + struct elevator_type *et;
  785 + struct io_context *ioc;
  786 + struct io_cq *icq = NULL;
754 787 const bool is_sync = rw_is_sync(rw_flags) != 0;
  788 + bool retried = false;
755 789 int may_queue;
  790 +retry:
  791 + et = q->elevator->type;
  792 + ioc = current->io_context;
756 793  
757   - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
  794 + if (unlikely(blk_queue_dead(q)))
758 795 return NULL;
759 796  
760 797 may_queue = elv_may_queue(q, rw_flags);
761 798  
... ... @@ -763,8 +800,21 @@
763 800  
764 801 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
765 802 if (rl->count[is_sync]+1 >= q->nr_requests) {
766   - ioc = current_io_context(GFP_ATOMIC, q->node);
767 803 /*
  804 + * We want ioc to record batching state. If it's
  805 + * not already there, creating a new one requires
  806 + * dropping queue_lock, which in turn requires
  807 + * retesting conditions to avoid queue hang.
  808 + */
  809 + if (!ioc && !retried) {
  810 + spin_unlock_irq(q->queue_lock);
  811 + create_io_context(current, gfp_mask, q->node);
  812 + spin_lock_irq(q->queue_lock);
  813 + retried = true;
  814 + goto retry;
  815 + }
  816 +
  817 + /*
768 818 * The queue will fill after this allocation, so set
769 819 * it as full, and mark this process as "batching".
770 820 * This process will be allowed to complete a batch of
771 821  
772 822  
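
The retry label above exists because creating an io_context needs task_lock(), which is IRQ-unsafe, so queue_lock has to be dropped first and every condition re-checked afterwards. A userspace sketch of that drop-lock/allocate/re-lock/retry shape (queue_lock, ctx and get_resource() are illustrative names only):

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static void *ctx;	/* lazily created, normally by another path */

static int get_resource(void)
{
	bool retried = false;
	int ret;

retry:
	pthread_mutex_lock(&queue_lock);

	if (!ctx && !retried) {
		/* allocation is not allowed with the lock held */
		pthread_mutex_unlock(&queue_lock);
		ctx = malloc(64);
		retried = true;
		/* state may have changed while unlocked: retest everything */
		goto retry;
	}

	ret = ctx ? 0 : -1;
	pthread_mutex_unlock(&queue_lock);
	return ret;
}

int main(void)
{
	return get_resource() ? 1 : 0;
}
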
... ... @@ -799,17 +849,36 @@
799 849 rl->count[is_sync]++;
800 850 rl->starved[is_sync] = 0;
801 851  
  852 + /*
  853 + * Decide whether the new request will be managed by elevator. If
  854 + * so, mark @rw_flags and increment elvpriv. Non-zero elvpriv will
  855 + * prevent the current elevator from being destroyed until the new
  856 + * request is freed. This guarantees icq's won't be destroyed and
  857 + * makes creating new ones safe.
  858 + *
  859 + * Also, lookup icq while holding queue_lock. If it doesn't exist,
  860 + * it will be created after releasing queue_lock.
  861 + */
802 862 if (blk_rq_should_init_elevator(bio) &&
803 863 !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
804 864 rw_flags |= REQ_ELVPRIV;
805 865 rl->elvpriv++;
  866 + if (et->icq_cache && ioc)
  867 + icq = ioc_lookup_icq(ioc, q);
806 868 }
807 869  
808 870 if (blk_queue_io_stat(q))
809 871 rw_flags |= REQ_IO_STAT;
810 872 spin_unlock_irq(q->queue_lock);
811 873  
812   - rq = blk_alloc_request(q, rw_flags, gfp_mask);
  874 + /* create icq if missing */
  875 + if (unlikely(et->icq_cache && !icq))
  876 + icq = ioc_create_icq(q, gfp_mask);
  877 +
  878 + /* rqs are guaranteed to have icq on elv_set_request() if requested */
  879 + if (likely(!et->icq_cache || icq))
  880 + rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
  881 +
813 882 if (unlikely(!rq)) {
814 883 /*
815 884 * Allocation failed presumably due to memory. Undo anything
816 885  
... ... @@ -871,10 +940,9 @@
871 940 rq = get_request(q, rw_flags, bio, GFP_NOIO);
872 941 while (!rq) {
873 942 DEFINE_WAIT(wait);
874   - struct io_context *ioc;
875 943 struct request_list *rl = &q->rq;
876 944  
877   - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
  945 + if (unlikely(blk_queue_dead(q)))
878 946 return NULL;
879 947  
880 948 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
... ... @@ -891,8 +959,8 @@
891 959 * up to a big batch of them for a small period time.
892 960 * See ioc_batching, ioc_set_batching
893 961 */
894   - ioc = current_io_context(GFP_NOIO, q->node);
895   - ioc_set_batching(q, ioc);
  962 + create_io_context(current, GFP_NOIO, q->node);
  963 + ioc_set_batching(q, current->io_context);
896 964  
897 965 spin_lock_irq(q->queue_lock);
898 966 finish_wait(&rl->wait[is_sync], &wait);
... ... @@ -1009,54 +1077,6 @@
1009 1077 __elv_add_request(q, rq, where);
1010 1078 }
1011 1079  
1012   -/**
1013   - * blk_insert_request - insert a special request into a request queue
1014   - * @q: request queue where request should be inserted
1015   - * @rq: request to be inserted
1016   - * @at_head: insert request at head or tail of queue
1017   - * @data: private data
1018   - *
1019   - * Description:
1020   - * Many block devices need to execute commands asynchronously, so they don't
1021   - * block the whole kernel from preemption during request execution. This is
1022   - * accomplished normally by inserting aritficial requests tagged as
1023   - * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them
1024   - * be scheduled for actual execution by the request queue.
1025   - *
1026   - * We have the option of inserting the head or the tail of the queue.
1027   - * Typically we use the tail for new ioctls and so forth. We use the head
1028   - * of the queue for things like a QUEUE_FULL message from a device, or a
1029   - * host that is unable to accept a particular command.
1030   - */
1031   -void blk_insert_request(struct request_queue *q, struct request *rq,
1032   - int at_head, void *data)
1033   -{
1034   - int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
1035   - unsigned long flags;
1036   -
1037   - /*
1038   - * tell I/O scheduler that this isn't a regular read/write (ie it
1039   - * must not attempt merges on this) and that it acts as a soft
1040   - * barrier
1041   - */
1042   - rq->cmd_type = REQ_TYPE_SPECIAL;
1043   -
1044   - rq->special = data;
1045   -
1046   - spin_lock_irqsave(q->queue_lock, flags);
1047   -
1048   - /*
1049   - * If command is tagged, release the tag
1050   - */
1051   - if (blk_rq_tagged(rq))
1052   - blk_queue_end_tag(q, rq);
1053   -
1054   - add_acct_request(q, rq, where);
1055   - __blk_run_queue(q);
1056   - spin_unlock_irqrestore(q->queue_lock, flags);
1057   -}
1058   -EXPORT_SYMBOL(blk_insert_request);
1059   -
1060 1080 static void part_round_stats_single(int cpu, struct hd_struct *part,
1061 1081 unsigned long now)
1062 1082 {
... ... @@ -1766,6 +1786,10 @@
1766 1786 return -EIO;
1767 1787  
1768 1788 spin_lock_irqsave(q->queue_lock, flags);
  1789 + if (unlikely(blk_queue_dead(q))) {
  1790 + spin_unlock_irqrestore(q->queue_lock, flags);
  1791 + return -ENODEV;
  1792 + }
1769 1793  
1770 1794 /*
1771 1795 * Submitting request must be dequeued before calling this function
... ... @@ -2740,6 +2764,14 @@
2740 2764 trace_block_unplug(q, depth, !from_schedule);
2741 2765  
2742 2766 /*
  2767 + * Don't mess with dead queue.
  2768 + */
  2769 + if (unlikely(blk_queue_dead(q))) {
  2770 + spin_unlock(q->queue_lock);
  2771 + return;
  2772 + }
  2773 +
  2774 + /*
2743 2775 * If we are punting this to kblockd, then we can safely drop
2744 2776 * the queue_lock before waking kblockd (which needs to take
2745 2777 * this lock).
... ... @@ -2815,6 +2847,15 @@
2815 2847 depth = 0;
2816 2848 spin_lock(q->queue_lock);
2817 2849 }
  2850 +
  2851 + /*
  2852 + * Short-circuit if @q is dead
  2853 + */
  2854 + if (unlikely(blk_queue_dead(q))) {
  2855 + __blk_end_request_all(rq, -ENODEV);
  2856 + continue;
  2857 + }
  2858 +
2818 2859 /*
2819 2860 * rq is already accounted, so use raw insert
2820 2861 */
block/blk-exec.c
... ... @@ -50,7 +50,11 @@
50 50 {
51 51 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
52 52  
53   - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
  53 + WARN_ON(irqs_disabled());
  54 + spin_lock_irq(q->queue_lock);
  55 +
  56 + if (unlikely(blk_queue_dead(q))) {
  57 + spin_unlock_irq(q->queue_lock);
54 58 rq->errors = -ENXIO;
55 59 if (rq->end_io)
56 60 rq->end_io(rq, rq->errors);
... ... @@ -59,8 +63,6 @@
59 63  
60 64 rq->rq_disk = bd_disk;
61 65 rq->end_io = done;
62   - WARN_ON(irqs_disabled());
63   - spin_lock_irq(q->queue_lock);
64 66 __elv_add_request(q, rq, where);
65 67 __blk_run_queue(q);
66 68 /* the queue is stopped so it won't be run */
block/blk-ioc.c
... ... @@ -16,53 +16,214 @@
16 16 */
17 17 static struct kmem_cache *iocontext_cachep;
18 18  
19   -static void cfq_dtor(struct io_context *ioc)
  19 +/**
  20 + * get_io_context - increment reference count to io_context
  21 + * @ioc: io_context to get
  22 + *
  23 + * Increment reference count to @ioc.
  24 + */
  25 +void get_io_context(struct io_context *ioc)
20 26 {
21   - if (!hlist_empty(&ioc->cic_list)) {
22   - struct cfq_io_context *cic;
  27 + BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
  28 + atomic_long_inc(&ioc->refcount);
  29 +}
  30 +EXPORT_SYMBOL(get_io_context);
23 31  
24   - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
25   - cic_list);
26   - cic->dtor(ioc);
  32 +/*
  33 + * Releasing ioc may nest into another put_io_context() leading to nested
  34 + * fast path release. As the ioc's can't be the same, this is okay but
  35 + * makes lockdep whine. Keep track of nesting and use it as subclass.
  36 + */
  37 +#ifdef CONFIG_LOCKDEP
  38 +#define ioc_release_depth(q) ((q) ? (q)->ioc_release_depth : 0)
  39 +#define ioc_release_depth_inc(q) (q)->ioc_release_depth++
  40 +#define ioc_release_depth_dec(q) (q)->ioc_release_depth--
  41 +#else
  42 +#define ioc_release_depth(q) 0
  43 +#define ioc_release_depth_inc(q) do { } while (0)
  44 +#define ioc_release_depth_dec(q) do { } while (0)
  45 +#endif
  46 +
  47 +static void icq_free_icq_rcu(struct rcu_head *head)
  48 +{
  49 + struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
  50 +
  51 + kmem_cache_free(icq->__rcu_icq_cache, icq);
  52 +}
  53 +
  54 +/*
  55 + * Exit and free an icq. Called with both ioc and q locked.
  56 + */
  57 +static void ioc_exit_icq(struct io_cq *icq)
  58 +{
  59 + struct io_context *ioc = icq->ioc;
  60 + struct request_queue *q = icq->q;
  61 + struct elevator_type *et = q->elevator->type;
  62 +
  63 + lockdep_assert_held(&ioc->lock);
  64 + lockdep_assert_held(q->queue_lock);
  65 +
  66 + radix_tree_delete(&ioc->icq_tree, icq->q->id);
  67 + hlist_del_init(&icq->ioc_node);
  68 + list_del_init(&icq->q_node);
  69 +
  70 + /*
  71 + * Both setting lookup hint to and clearing it from @icq are done
  72 + * under queue_lock. If it's not pointing to @icq now, it never
  73 + * will. Hint assignment itself can race safely.
  74 + */
  75 + if (rcu_dereference_raw(ioc->icq_hint) == icq)
  76 + rcu_assign_pointer(ioc->icq_hint, NULL);
  77 +
  78 + if (et->ops.elevator_exit_icq_fn) {
  79 + ioc_release_depth_inc(q);
  80 + et->ops.elevator_exit_icq_fn(icq);
  81 + ioc_release_depth_dec(q);
27 82 }
  83 +
  84 + /*
  85 + * @icq->q might have gone away by the time RCU callback runs
  86 + * making it impossible to determine icq_cache. Record it in @icq.
  87 + */
  88 + icq->__rcu_icq_cache = et->icq_cache;
  89 + call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
28 90 }
29 91  
30 92 /*
31   - * IO Context helper functions. put_io_context() returns 1 if there are no
32   - * more users of this io context, 0 otherwise.
  93 + * Slow path for ioc release in put_io_context(). Performs double-lock
  94 + * dancing to unlink all icq's and then frees ioc.
33 95 */
34   -int put_io_context(struct io_context *ioc)
  96 +static void ioc_release_fn(struct work_struct *work)
35 97 {
36   - if (ioc == NULL)
37   - return 1;
  98 + struct io_context *ioc = container_of(work, struct io_context,
  99 + release_work);
  100 + struct request_queue *last_q = NULL;
38 101  
39   - BUG_ON(atomic_long_read(&ioc->refcount) == 0);
  102 + spin_lock_irq(&ioc->lock);
40 103  
41   - if (atomic_long_dec_and_test(&ioc->refcount)) {
42   - rcu_read_lock();
43   - cfq_dtor(ioc);
44   - rcu_read_unlock();
  104 + while (!hlist_empty(&ioc->icq_list)) {
  105 + struct io_cq *icq = hlist_entry(ioc->icq_list.first,
  106 + struct io_cq, ioc_node);
  107 + struct request_queue *this_q = icq->q;
45 108  
46   - kmem_cache_free(iocontext_cachep, ioc);
47   - return 1;
  109 + if (this_q != last_q) {
  110 + /*
  111 + * Need to switch to @this_q. Once we release
  112 + * @ioc->lock, it can go away along with @cic.
  113 + * Hold on to it.
  114 + */
  115 + __blk_get_queue(this_q);
  116 +
  117 + /*
  118 + * blk_put_queue() might sleep thanks to kobject
  119 + * idiocy. Always release both locks, put and
  120 + * restart.
  121 + */
  122 + if (last_q) {
  123 + spin_unlock(last_q->queue_lock);
  124 + spin_unlock_irq(&ioc->lock);
  125 + blk_put_queue(last_q);
  126 + } else {
  127 + spin_unlock_irq(&ioc->lock);
  128 + }
  129 +
  130 + last_q = this_q;
  131 + spin_lock_irq(this_q->queue_lock);
  132 + spin_lock(&ioc->lock);
  133 + continue;
  134 + }
  135 + ioc_exit_icq(icq);
48 136 }
49   - return 0;
  137 +
  138 + if (last_q) {
  139 + spin_unlock(last_q->queue_lock);
  140 + spin_unlock_irq(&ioc->lock);
  141 + blk_put_queue(last_q);
  142 + } else {
  143 + spin_unlock_irq(&ioc->lock);
  144 + }
  145 +
  146 + kmem_cache_free(iocontext_cachep, ioc);
50 147 }
51   -EXPORT_SYMBOL(put_io_context);
52 148  
53   -static void cfq_exit(struct io_context *ioc)
  149 +/**
  150 + * put_io_context - put a reference of io_context
  151 + * @ioc: io_context to put
  152 + * @locked_q: request_queue the caller is holding queue_lock of (hint)
  153 + *
  154 + * Decrement reference count of @ioc and release it if the count reaches
  155 + * zero. If the caller is holding queue_lock of a queue, it can indicate
  156 + * that with @locked_q. This is an optimization hint and the caller is
  157 + * allowed to pass in %NULL even when it's holding a queue_lock.
  158 + */
  159 +void put_io_context(struct io_context *ioc, struct request_queue *locked_q)
54 160 {
55   - rcu_read_lock();
  161 + struct request_queue *last_q = locked_q;
  162 + unsigned long flags;
56 163  
57   - if (!hlist_empty(&ioc->cic_list)) {
58   - struct cfq_io_context *cic;
  164 + if (ioc == NULL)
  165 + return;
59 166  
60   - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
61   - cic_list);
62   - cic->exit(ioc);
  167 + BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
  168 + if (locked_q)
  169 + lockdep_assert_held(locked_q->queue_lock);
  170 +
  171 + if (!atomic_long_dec_and_test(&ioc->refcount))
  172 + return;
  173 +
  174 + /*
  175 + * Destroy @ioc. This is a bit messy because icq's are chained
  176 + * from both ioc and queue, and ioc->lock nests inside queue_lock.
  177 + * The inner ioc->lock should be held to walk our icq_list and then
  178 + * for each icq the outer matching queue_lock should be grabbed.
  179 + * ie. We need to do reverse-order double lock dancing.
  180 + *
  181 + * Another twist is that we are often called with one of the
  182 + * matching queue_locks held as indicated by @locked_q, which
  183 + * prevents performing double-lock dance for other queues.
  184 + *
  185 + * So, we do it in two stages. The fast path uses the queue_lock
  186 + * the caller is holding and, if other queues need to be accessed,
  187 + * uses trylock to avoid introducing locking dependency. This can
  188 + * handle most cases, especially if @ioc was performing IO on only
  189 + * single device.
  190 + *
  191 + * If trylock doesn't cut it, we defer to @ioc->release_work which
  192 + * can do all the double-locking dancing.
  193 + */
  194 + spin_lock_irqsave_nested(&ioc->lock, flags,
  195 + ioc_release_depth(locked_q));
  196 +
  197 + while (!hlist_empty(&ioc->icq_list)) {
  198 + struct io_cq *icq = hlist_entry(ioc->icq_list.first,
  199 + struct io_cq, ioc_node);
  200 + struct request_queue *this_q = icq->q;
  201 +
  202 + if (this_q != last_q) {
  203 + if (last_q && last_q != locked_q)
  204 + spin_unlock(last_q->queue_lock);
  205 + last_q = NULL;
  206 +
  207 + if (!spin_trylock(this_q->queue_lock))
  208 + break;
  209 + last_q = this_q;
  210 + continue;
  211 + }
  212 + ioc_exit_icq(icq);
63 213 }
64   - rcu_read_unlock();
  214 +
  215 + if (last_q && last_q != locked_q)
  216 + spin_unlock(last_q->queue_lock);
  217 +
  218 + spin_unlock_irqrestore(&ioc->lock, flags);
  219 +
  220 + /* if no icq is left, we're done; otherwise, kick release_work */
  221 + if (hlist_empty(&ioc->icq_list))
  222 + kmem_cache_free(iocontext_cachep, ioc);
  223 + else
  224 + schedule_work(&ioc->release_work);
65 225 }
  226 +EXPORT_SYMBOL(put_io_context);
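
put_io_context() above is built around a two-stage teardown: take the second lock opportunistically with trylock on the fast path, and hand anything that cannot be torn down that way to release_work, which is free to take the locks in the expensive but safe order. A compact userspace sketch of that trylock-or-defer shape (ioc_lock, queue_lock and deferred_release() are illustrative stand-ins, not the kernel objects):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t ioc_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

/* stands in for schedule_work(&ioc->release_work) */
static void deferred_release(void)
{
	printf("deferred to the slow path\n");
}

static void release_fast_path(void)
{
	bool done = false;

	pthread_mutex_lock(&ioc_lock);
	if (pthread_mutex_trylock(&queue_lock) == 0) {
		/* both locks held without risking a lock-order inversion */
		done = true;
		pthread_mutex_unlock(&queue_lock);
	}
	pthread_mutex_unlock(&ioc_lock);

	if (!done)
		deferred_release();
}

int main(void)
{
	release_fast_path();
	return 0;
}
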
66 227  
67 228 /* Called by the exiting task */
68 229 void exit_io_context(struct task_struct *task)
69 230  
... ... @@ -74,86 +235,240 @@
74 235 task->io_context = NULL;
75 236 task_unlock(task);
76 237  
77   - if (atomic_dec_and_test(&ioc->nr_tasks))
78   - cfq_exit(ioc);
  238 + atomic_dec(&ioc->nr_tasks);
  239 + put_io_context(ioc, NULL);
  240 +}
79 241  
80   - put_io_context(ioc);
  242 +/**
  243 + * ioc_clear_queue - break any ioc association with the specified queue
  244 + * @q: request_queue being cleared
  245 + *
  246 + * Walk @q->icq_list and exit all io_cq's. Must be called with @q locked.
  247 + */
  248 +void ioc_clear_queue(struct request_queue *q)
  249 +{
  250 + lockdep_assert_held(q->queue_lock);
  251 +
  252 + while (!list_empty(&q->icq_list)) {
  253 + struct io_cq *icq = list_entry(q->icq_list.next,
  254 + struct io_cq, q_node);
  255 + struct io_context *ioc = icq->ioc;
  256 +
  257 + spin_lock(&ioc->lock);
  258 + ioc_exit_icq(icq);
  259 + spin_unlock(&ioc->lock);
  260 + }
81 261 }
82 262  
83   -struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
  263 +void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
  264 + int node)
84 265 {
85 266 struct io_context *ioc;
86 267  
87   - ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
88   - if (ioc) {
89   - atomic_long_set(&ioc->refcount, 1);
90   - atomic_set(&ioc->nr_tasks, 1);
91   - spin_lock_init(&ioc->lock);
92   - ioc->ioprio_changed = 0;
93   - ioc->ioprio = 0;
94   - ioc->last_waited = 0; /* doesn't matter... */
95   - ioc->nr_batch_requests = 0; /* because this is 0 */
96   - INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
97   - INIT_HLIST_HEAD(&ioc->cic_list);
98   - ioc->ioc_data = NULL;
99   -#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
100   - ioc->cgroup_changed = 0;
101   -#endif
102   - }
  268 + ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
  269 + node);
  270 + if (unlikely(!ioc))
  271 + return;
103 272  
104   - return ioc;
  273 + /* initialize */
  274 + atomic_long_set(&ioc->refcount, 1);
  275 + atomic_set(&ioc->nr_tasks, 1);
  276 + spin_lock_init(&ioc->lock);
  277 + INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
  278 + INIT_HLIST_HEAD(&ioc->icq_list);
  279 + INIT_WORK(&ioc->release_work, ioc_release_fn);
  280 +
  281 + /*
  282 + * Try to install. ioc shouldn't be installed if someone else
  283 + * already did or @task, which isn't %current, is exiting. Note
  284 + * that we need to allow ioc creation on exiting %current as exit
  285 + * path may issue IOs from e.g. exit_files(). The exit path is
  286 + * responsible for not issuing IO after exit_io_context().
  287 + */
  288 + task_lock(task);
  289 + if (!task->io_context &&
  290 + (task == current || !(task->flags & PF_EXITING)))
  291 + task->io_context = ioc;
  292 + else
  293 + kmem_cache_free(iocontext_cachep, ioc);
  294 + task_unlock(task);
105 295 }
106 296  
107   -/*
108   - * If the current task has no IO context then create one and initialise it.
109   - * Otherwise, return its existing IO context.
  297 +/**
  298 + * get_task_io_context - get io_context of a task
  299 + * @task: task of interest
  300 + * @gfp_flags: allocation flags, used if allocation is necessary
  301 + * @node: allocation node, used if allocation is necessary
110 302 *
111   - * This returned IO context doesn't have a specifically elevated refcount,
112   - * but since the current task itself holds a reference, the context can be
113   - * used in general code, so long as it stays within `current` context.
  303 + * Return io_context of @task. If it doesn't exist, it is created with
  304 + * @gfp_flags and @node. The returned io_context has its reference count
  305 + * incremented.
  306 + *
  307 + * This function always goes through task_lock() and it's better to use
  308 + * %current->io_context + get_io_context() for %current.
114 309 */
115   -struct io_context *current_io_context(gfp_t gfp_flags, int node)
  310 +struct io_context *get_task_io_context(struct task_struct *task,
  311 + gfp_t gfp_flags, int node)
116 312 {
117   - struct task_struct *tsk = current;
118   - struct io_context *ret;
  313 + struct io_context *ioc;
119 314  
120   - ret = tsk->io_context;
121   - if (likely(ret))
122   - return ret;
  315 + might_sleep_if(gfp_flags & __GFP_WAIT);
123 316  
124   - ret = alloc_io_context(gfp_flags, node);
125   - if (ret) {
126   - /* make sure set_task_ioprio() sees the settings above */
127   - smp_wmb();
128   - tsk->io_context = ret;
129   - }
  317 + do {
  318 + task_lock(task);
  319 + ioc = task->io_context;
  320 + if (likely(ioc)) {
  321 + get_io_context(ioc);
  322 + task_unlock(task);
  323 + return ioc;
  324 + }
  325 + task_unlock(task);
  326 + } while (create_io_context(task, gfp_flags, node));
130 327  
131   - return ret;
  328 + return NULL;
132 329 }
  330 +EXPORT_SYMBOL(get_task_io_context);
133 331  
134   -/*
135   - * If the current task has no IO context then create one and initialise it.
136   - * If it does have a context, take a ref on it.
  332 +/**
  333 + * ioc_lookup_icq - lookup io_cq from ioc
  334 + * @ioc: the associated io_context
  335 + * @q: the associated request_queue
137 336 *
138   - * This is always called in the context of the task which submitted the I/O.
  337 + * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
  338 + * with @q->queue_lock held.
139 339 */
140   -struct io_context *get_io_context(gfp_t gfp_flags, int node)
  340 +struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
141 341 {
142   - struct io_context *ioc = NULL;
  342 + struct io_cq *icq;
143 343  
  344 + lockdep_assert_held(q->queue_lock);
  345 +
144 346 /*
145   - * Check for unlikely race with exiting task. ioc ref count is
146   - * zero when ioc is being detached.
  347 + * icq's are indexed from @ioc using radix tree and hint pointer,
  348 + * both of which are protected with RCU. All removals are done
  349 + * holding both q and ioc locks, and we're holding q lock - if we
  350 + * find a icq which points to us, it's guaranteed to be valid.
147 351 */
148   - do {
149   - ioc = current_io_context(gfp_flags, node);
150   - if (unlikely(!ioc))
151   - break;
152   - } while (!atomic_long_inc_not_zero(&ioc->refcount));
  352 + rcu_read_lock();
  353 + icq = rcu_dereference(ioc->icq_hint);
  354 + if (icq && icq->q == q)
  355 + goto out;
153 356  
154   - return ioc;
  357 + icq = radix_tree_lookup(&ioc->icq_tree, q->id);
  358 + if (icq && icq->q == q)
  359 + rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */
  360 + else
  361 + icq = NULL;
  362 +out:
  363 + rcu_read_unlock();
  364 + return icq;
155 365 }
156   -EXPORT_SYMBOL(get_io_context);
  366 +EXPORT_SYMBOL(ioc_lookup_icq);
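
ioc_lookup_icq() pairs a slow radix-tree lookup with a racy "last hit" hint that is only a performance optimization, never a correctness requirement, because the hint is validated before use. A small sketch of that idea with a linear list standing in for the radix tree and no RCU modelled (entry, hint and lookup() are made-up names):

#include <stddef.h>

struct entry {
	int key;
	struct entry *next;
};

static struct entry *hint;	/* last successful lookup, may be stale */

static struct entry *slow_lookup(struct entry *head, int key)
{
	for (; head; head = head->next)
		if (head->key == key)
			return head;
	return NULL;
}

static struct entry *lookup(struct entry *head, int key)
{
	struct entry *e = hint;

	/* fast path: a wrong or stale hint only costs the slow lookup */
	if (e && e->key == key)
		return e;

	e = slow_lookup(head, key);
	if (e)
		hint = e;	/* refresh; the real code relies on
				 * queue_lock + RCU to make this safe */
	return e;
}

int main(void)
{
	struct entry a = { .key = 1 };

	return lookup(&a, 1) == &a ? 0 : 1;
}
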
  367 +
  368 +/**
  369 + * ioc_create_icq - create and link io_cq
  370 + * @q: request_queue of interest
  371 + * @gfp_mask: allocation mask
  372 + *
  373 + * Make sure io_cq linking %current->io_context and @q exists. If either
  374 + * io_context and/or icq don't exist, they will be created using @gfp_mask.
  375 + *
  376 + * The caller is responsible for ensuring @ioc won't go away and @q is
  377 + * alive and will stay alive until this function returns.
  378 + */
  379 +struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
  380 +{
  381 + struct elevator_type *et = q->elevator->type;
  382 + struct io_context *ioc;
  383 + struct io_cq *icq;
  384 +
  385 + /* allocate stuff */
  386 + ioc = create_io_context(current, gfp_mask, q->node);
  387 + if (!ioc)
  388 + return NULL;
  389 +
  390 + icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
  391 + q->node);
  392 + if (!icq)
  393 + return NULL;
  394 +
  395 + if (radix_tree_preload(gfp_mask) < 0) {
  396 + kmem_cache_free(et->icq_cache, icq);
  397 + return NULL;
  398 + }
  399 +
  400 + icq->ioc = ioc;
  401 + icq->q = q;
  402 + INIT_LIST_HEAD(&icq->q_node);
  403 + INIT_HLIST_NODE(&icq->ioc_node);
  404 +
  405 + /* lock both q and ioc and try to link @icq */
  406 + spin_lock_irq(q->queue_lock);
  407 + spin_lock(&ioc->lock);
  408 +
  409 + if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
  410 + hlist_add_head(&icq->ioc_node, &ioc->icq_list);
  411 + list_add(&icq->q_node, &q->icq_list);
  412 + if (et->ops.elevator_init_icq_fn)
  413 + et->ops.elevator_init_icq_fn(icq);
  414 + } else {
  415 + kmem_cache_free(et->icq_cache, icq);
  416 + icq = ioc_lookup_icq(ioc, q);
  417 + if (!icq)
  418 + printk(KERN_ERR "cfq: icq link failed!\n");
  419 + }
  420 +
  421 + spin_unlock(&ioc->lock);
  422 + spin_unlock_irq(q->queue_lock);
  423 + radix_tree_preload_end();
  424 + return icq;
  425 +}
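
ioc_create_icq() allocates the icq and preloads the radix tree with no locks held, then takes queue_lock and ioc->lock only for the actual insert; radix_tree_preload() is what lets the insert run without sleeping, and a lost race is handled by freeing our copy and looking up the winner. A userspace sketch of that allocate-unlocked/install-under-lock pattern, with a single slot standing in for the radix tree (obj, slot and get_or_create() are illustrative names):

#include <pthread.h>
#include <stdlib.h>

struct obj {
	int id;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *slot;	/* one-entry "index" */

static struct obj *get_or_create(void)
{
	/* allocate with no locks held, as the real code must */
	struct obj *new = malloc(sizeof(*new));
	struct obj *ret;

	if (!new)
		return NULL;

	pthread_mutex_lock(&lock);
	if (!slot) {
		slot = new;	/* we won the race: install ours */
		ret = new;
	} else {
		free(new);	/* someone beat us: use the existing one */
		ret = slot;
	}
	pthread_mutex_unlock(&lock);

	return ret;
}

int main(void)
{
	return get_or_create() ? 0 : 1;
}
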
  426 +
  427 +void ioc_set_changed(struct io_context *ioc, int which)
  428 +{
  429 + struct io_cq *icq;
  430 + struct hlist_node *n;
  431 +
  432 + hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node)
  433 + set_bit(which, &icq->changed);
  434 +}
  435 +
  436 +/**
  437 + * ioc_ioprio_changed - notify ioprio change
  438 + * @ioc: io_context of interest
  439 + * @ioprio: new ioprio
  440 + *
  441 + * @ioc's ioprio has changed to @ioprio. Set %ICQ_IOPRIO_CHANGED for all
  442 + * icq's. iosched is responsible for checking the bit and applying it on
  443 + * request issue path.
  444 + */
  445 +void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
  446 +{
  447 + unsigned long flags;
  448 +
  449 + spin_lock_irqsave(&ioc->lock, flags);
  450 + ioc->ioprio = ioprio;
  451 + ioc_set_changed(ioc, ICQ_IOPRIO_CHANGED);
  452 + spin_unlock_irqrestore(&ioc->lock, flags);
  453 +}
  454 +
  455 +/**
  456 + * ioc_cgroup_changed - notify cgroup change
  457 + * @ioc: io_context of interest
  458 + *
  459 + * @ioc's cgroup has changed. Set %ICQ_CGROUP_CHANGED for all icq's.
  460 + * iosched is responsible for checking the bit and applying it on request
  461 + * issue path.
  462 + */
  463 +void ioc_cgroup_changed(struct io_context *ioc)
  464 +{
  465 + unsigned long flags;
  466 +
  467 + spin_lock_irqsave(&ioc->lock, flags);
  468 + ioc_set_changed(ioc, ICQ_CGROUP_CHANGED);
  469 + spin_unlock_irqrestore(&ioc->lock, flags);
  470 +}
  471 +EXPORT_SYMBOL(ioc_cgroup_changed);
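
The two helpers above only set per-icq flag bits; the elevator is expected to notice and act on them when the next request is set up. The consumer side is not part of this hunk, so the following is a hedged sketch of what it might look like (my_elv_check_changed() is hypothetical; it assumes icq->changed is the flags word written by ioc_set_changed() above):

#include <linux/bitops.h>
#include <linux/iocontext.h>

static void my_elv_check_changed(struct io_cq *icq)
{
	if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &icq->changed)) {
		/* re-read icq->ioc->ioprio and reinitialize priorities */
	}

	if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &icq->changed)) {
		/* drop cached per-cgroup queues so they are re-created
		 * against the new group */
	}
}
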
157 472  
158 473 static int __init blk_ioc_init(void)
159 474 {
block/blk-settings.c
... ... @@ -104,9 +104,7 @@
104 104 * @lim: the queue_limits structure to reset
105 105 *
106 106 * Description:
107   - * Returns a queue_limit struct to its default state. Can be used by
108   - * stacking drivers like DM that stage table swaps and reuse an
109   - * existing device queue.
  107 + * Returns a queue_limit struct to its default state.
110 108 */
111 109 void blk_set_default_limits(struct queue_limits *lim)
112 110 {
113 111  
... ... @@ -114,13 +112,12 @@
114 112 lim->max_integrity_segments = 0;
115 113 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
116 114 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
117   - lim->max_sectors = BLK_DEF_MAX_SECTORS;
118   - lim->max_hw_sectors = INT_MAX;
  115 + lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
119 116 lim->max_discard_sectors = 0;
120 117 lim->discard_granularity = 0;
121 118 lim->discard_alignment = 0;
122 119 lim->discard_misaligned = 0;
123   - lim->discard_zeroes_data = 1;
  120 + lim->discard_zeroes_data = 0;
124 121 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
125 122 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
126 123 lim->alignment_offset = 0;
... ... @@ -131,6 +128,27 @@
131 128 EXPORT_SYMBOL(blk_set_default_limits);
132 129  
133 130 /**
  131 + * blk_set_stacking_limits - set default limits for stacking devices
  132 + * @lim: the queue_limits structure to reset
  133 + *
  134 + * Description:
  135 + * Returns a queue_limit struct to its default state. Should be used
  136 + * by stacking drivers like DM that have no internal limits.
  137 + */
  138 +void blk_set_stacking_limits(struct queue_limits *lim)
  139 +{
  140 + blk_set_default_limits(lim);
  141 +
  142 + /* Inherit limits from component devices */
  143 + lim->discard_zeroes_data = 1;
  144 + lim->max_segments = USHRT_MAX;
  145 + lim->max_hw_sectors = UINT_MAX;
  146 +
  147 + lim->max_sectors = BLK_DEF_MAX_SECTORS;
  148 +}
  149 +EXPORT_SYMBOL(blk_set_stacking_limits);
  150 +
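
blk_set_stacking_limits() deliberately starts from permissive values so that stacking them against real component devices can only tighten the result. A hedged sketch of how a stacking driver might combine it with blk_stack_limits() (my_build_stacked_limits() is hypothetical and ignores the component's start offset for brevity):

#include <linux/blkdev.h>
#include <linux/kernel.h>

static void my_build_stacked_limits(struct queue_limits *limits,
				    struct block_device *bdev)
{
	/* permissive defaults meant to be narrowed by real devices */
	blk_set_stacking_limits(limits);

	/* tighten against one component device; repeat per device.
	 * Real stacking code passes the component's start sector
	 * instead of 0. */
	if (blk_stack_limits(limits, &bdev_get_queue(bdev)->limits, 0) < 0)
		pr_warn("stacked limits are misaligned\n");
}
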
  151 +/**
134 152 * blk_queue_make_request - define an alternate make_request function for a device
135 153 * @q: the request queue for the device to be affected
136 154 * @mfn: the alternate make_request function
... ... @@ -165,8 +183,6 @@
165 183 q->nr_batching = BLK_BATCH_REQ;
166 184  
167 185 blk_set_default_limits(&q->limits);
168   - blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
169   - q->limits.discard_zeroes_data = 0;
170 186  
171 187 /*
172 188 * by default assume old behaviour and bounce for any highmem page
block/blk-sysfs.c
... ... @@ -425,7 +425,7 @@
425 425 if (!entry->show)
426 426 return -EIO;
427 427 mutex_lock(&q->sysfs_lock);
428   - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
  428 + if (blk_queue_dead(q)) {
429 429 mutex_unlock(&q->sysfs_lock);
430 430 return -ENOENT;
431 431 }
... ... @@ -447,7 +447,7 @@
447 447  
448 448 q = container_of(kobj, struct request_queue, kobj);
449 449 mutex_lock(&q->sysfs_lock);
450   - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
  450 + if (blk_queue_dead(q)) {
451 451 mutex_unlock(&q->sysfs_lock);
452 452 return -ENOENT;
453 453 }
454 454  
... ... @@ -479,8 +479,12 @@
479 479  
480 480 blk_sync_queue(q);
481 481  
482   - if (q->elevator)
  482 + if (q->elevator) {
  483 + spin_lock_irq(q->queue_lock);
  484 + ioc_clear_queue(q);
  485 + spin_unlock_irq(q->queue_lock);
483 486 elevator_exit(q->elevator);
  487 + }
484 488  
485 489 blk_throtl_exit(q);
486 490  
... ... @@ -494,6 +498,8 @@
494 498 blk_trace_shutdown(q);
495 499  
496 500 bdi_destroy(&q->backing_dev_info);
  501 +
  502 + ida_simple_remove(&blk_queue_ida, q->id);
497 503 kmem_cache_free(blk_requestq_cachep, q);
498 504 }
499 505  
block/blk-throttle.c
... ... @@ -310,7 +310,7 @@
310 310 struct request_queue *q = td->queue;
311 311  
312 312 /* no throttling for dead queue */
313   - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
  313 + if (unlikely(blk_queue_dead(q)))
314 314 return NULL;
315 315  
316 316 rcu_read_lock();
... ... @@ -335,7 +335,7 @@
335 335 spin_lock_irq(q->queue_lock);
336 336  
337 337 /* Make sure @q is still alive */
338   - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
  338 + if (unlikely(blk_queue_dead(q))) {
339 339 kfree(tg);
340 340 return NULL;
341 341 }
block/blk.h
1 1 #ifndef BLK_INTERNAL_H
2 2 #define BLK_INTERNAL_H
3 3  
  4 +#include <linux/idr.h>
  5 +
4 6 /* Amount of time in which a process may batch requests */
5 7 #define BLK_BATCH_TIME (HZ/50UL)
6 8  
7 9  
... ... @@ -9,7 +11,13 @@
9 11  
10 12 extern struct kmem_cache *blk_requestq_cachep;
11 13 extern struct kobj_type blk_queue_ktype;
  14 +extern struct ida blk_queue_ida;
12 15  
  16 +static inline void __blk_get_queue(struct request_queue *q)
  17 +{
  18 + kobject_get(&q->kobj);
  19 +}
  20 +
13 21 void init_request_from_bio(struct request *req, struct bio *bio);
14 22 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
15 23 struct bio *bio);
... ... @@ -85,8 +93,8 @@
85 93 q->flush_queue_delayed = 1;
86 94 return NULL;
87 95 }
88   - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags) ||
89   - !q->elevator->ops->elevator_dispatch_fn(q, 0))
  96 + if (unlikely(blk_queue_dead(q)) ||
  97 + !q->elevator->type->ops.elevator_dispatch_fn(q, 0))
90 98 return NULL;
91 99 }
92 100 }
93 101  
... ... @@ -95,16 +103,16 @@
95 103 {
96 104 struct elevator_queue *e = q->elevator;
97 105  
98   - if (e->ops->elevator_activate_req_fn)
99   - e->ops->elevator_activate_req_fn(q, rq);
  106 + if (e->type->ops.elevator_activate_req_fn)
  107 + e->type->ops.elevator_activate_req_fn(q, rq);
100 108 }
101 109  
102 110 static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
103 111 {
104 112 struct elevator_queue *e = q->elevator;
105 113  
106   - if (e->ops->elevator_deactivate_req_fn)
107   - e->ops->elevator_deactivate_req_fn(q, rq);
  114 + if (e->type->ops.elevator_deactivate_req_fn)
  115 + e->type->ops.elevator_deactivate_req_fn(q, rq);
108 116 }
109 117  
110 118 #ifdef CONFIG_FAIL_IO_TIMEOUT
... ... @@ -119,8 +127,6 @@
119 127 }
120 128 #endif
121 129  
122   -struct io_context *current_io_context(gfp_t gfp_flags, int node);
123   -
124 130 int ll_back_merge_fn(struct request_queue *q, struct request *req,
125 131 struct bio *bio);
126 132 int ll_front_merge_fn(struct request_queue *q, struct request *req,
... ... @@ -189,6 +195,42 @@
189 195 (rq->cmd_flags & REQ_DISCARD));
190 196 }
191 197  
  198 +/*
  199 + * Internal io_context interface
  200 + */
  201 +void get_io_context(struct io_context *ioc);
  202 +struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
  203 +struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask);
  204 +void ioc_clear_queue(struct request_queue *q);
  205 +
  206 +void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask,
  207 + int node);
  208 +
  209 +/**
  210 + * create_io_context - try to create task->io_context
  211 + * @task: target task
  212 + * @gfp_mask: allocation mask
  213 + * @node: allocation node
  214 + *
  215 + * If @task->io_context is %NULL, allocate a new io_context and install it.
  216 + * Returns the current @task->io_context which may be %NULL if allocation
  217 + * failed.
  218 + *
  219 + * Note that this function can't be called with IRQ disabled because
  220 + * task_lock which protects @task->io_context is IRQ-unsafe.
  221 + */
  222 +static inline struct io_context *create_io_context(struct task_struct *task,
  223 + gfp_t gfp_mask, int node)
  224 +{
  225 + WARN_ON_ONCE(irqs_disabled());
  226 + if (unlikely(!task->io_context))
  227 + create_io_context_slowpath(task, gfp_mask, node);
  228 + return task->io_context;
  229 +}
  230 +
  231 +/*
  232 + * Internal throttling interface
  233 + */
192 234 #ifdef CONFIG_BLK_DEV_THROTTLING
193 235 extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
194 236 extern void blk_throtl_drain(struct request_queue *q);
block/bsg.c
... ... @@ -769,12 +769,10 @@
769 769 struct file *file)
770 770 {
771 771 struct bsg_device *bd;
772   - int ret;
773 772 #ifdef BSG_DEBUG
774 773 unsigned char buf[32];
775 774 #endif
776   - ret = blk_get_queue(rq);
777   - if (ret)
  775 + if (!blk_get_queue(rq))
778 776 return ERR_PTR(-ENXIO);
779 777  
780 778 bd = bsg_alloc_device();
block/cfq-iosched.c
... ... @@ -14,6 +14,7 @@
14 14 #include <linux/rbtree.h>
15 15 #include <linux/ioprio.h>
16 16 #include <linux/blktrace_api.h>
  17 +#include "blk.h"
17 18 #include "cfq.h"
18 19  
19 20 /*
20 21  
21 22  
... ... @@ -53,21 +54,12 @@
53 54 #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
54 55 #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
55 56  
56   -#define RQ_CIC(rq) \
57   - ((struct cfq_io_context *) (rq)->elevator_private[0])
58   -#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1])
59   -#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2])
  57 +#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq)
  58 +#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0])
  59 +#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1])
60 60  
61 61 static struct kmem_cache *cfq_pool;
62   -static struct kmem_cache *cfq_ioc_pool;
63 62  
64   -static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
65   -static struct completion *ioc_gone;
66   -static DEFINE_SPINLOCK(ioc_gone_lock);
67   -
68   -static DEFINE_SPINLOCK(cic_index_lock);
69   -static DEFINE_IDA(cic_index_ida);
70   -
71 63 #define CFQ_PRIO_LISTS IOPRIO_BE_NR
72 64 #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
73 65 #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
... ... @@ -75,6 +67,14 @@
75 67 #define sample_valid(samples) ((samples) > 80)
76 68 #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
77 69  
  70 +struct cfq_ttime {
  71 + unsigned long last_end_request;
  72 +
  73 + unsigned long ttime_total;
  74 + unsigned long ttime_samples;
  75 + unsigned long ttime_mean;
  76 +};
  77 +
78 78 /*
79 79 * Most of our rbtree usage is for sorting with min extraction, so
80 80 * if we cache the leftmost node we don't have to walk down the tree
... ... @@ -216,6 +216,12 @@
216 216 struct cfq_ttime ttime;
217 217 };
218 218  
  219 +struct cfq_io_cq {
  220 + struct io_cq icq; /* must be the first member */
  221 + struct cfq_queue *cfqq[2];
  222 + struct cfq_ttime ttime;
  223 +};
  224 +
219 225 /*
220 226 * Per block device queue structure
221 227 */
... ... @@ -267,7 +273,7 @@
267 273 struct work_struct unplug_work;
268 274  
269 275 struct cfq_queue *active_queue;
270   - struct cfq_io_context *active_cic;
  276 + struct cfq_io_cq *active_cic;
271 277  
272 278 /*
273 279 * async queue for each priority case
... ... @@ -290,9 +296,6 @@
290 296 unsigned int cfq_group_idle;
291 297 unsigned int cfq_latency;
292 298  
293   - unsigned int cic_index;
294   - struct list_head cic_list;
295   -
296 299 /*
297 300 * Fallback dummy cfqq for extreme OOM conditions
298 301 */
299 302  
300 303  
301 304  
302 305  
303 306  
304 307  
305 308  
306 309  
307 310  
... ... @@ -464,37 +467,35 @@
464 467 static void cfq_dispatch_insert(struct request_queue *, struct request *);
465 468 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
466 469 struct io_context *, gfp_t);
467   -static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
468   - struct io_context *);
469 470  
470   -static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
471   - bool is_sync)
  471 +static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
472 472 {
473   - return cic->cfqq[is_sync];
  473 + /* cic->icq is the first member, %NULL will convert to %NULL */
  474 + return container_of(icq, struct cfq_io_cq, icq);
474 475 }
475 476  
476   -static inline void cic_set_cfqq(struct cfq_io_context *cic,
477   - struct cfq_queue *cfqq, bool is_sync)
  477 +static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
  478 + struct io_context *ioc)
478 479 {
479   - cic->cfqq[is_sync] = cfqq;
  480 + if (ioc)
  481 + return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
  482 + return NULL;
480 483 }
481 484  
482   -#define CIC_DEAD_KEY 1ul
483   -#define CIC_DEAD_INDEX_SHIFT 1
484   -
485   -static inline void *cfqd_dead_key(struct cfq_data *cfqd)
  485 +static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
486 486 {
487   - return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
  487 + return cic->cfqq[is_sync];
488 488 }
489 489  
490   -static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
  490 +static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
  491 + bool is_sync)
491 492 {
492   - struct cfq_data *cfqd = cic->key;
  493 + cic->cfqq[is_sync] = cfqq;
  494 +}
493 495  
494   - if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
495   - return NULL;
496   -
497   - return cfqd;
  496 +static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
  497 +{
  498 + return cic->icq.q->elevator->elevator_data;
498 499 }
499 500  
500 501 /*
... ... @@ -1561,7 +1562,7 @@
1561 1562 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
1562 1563 {
1563 1564 struct task_struct *tsk = current;
1564   - struct cfq_io_context *cic;
  1565 + struct cfq_io_cq *cic;
1565 1566 struct cfq_queue *cfqq;
1566 1567  
1567 1568 cic = cfq_cic_lookup(cfqd, tsk->io_context);
... ... @@ -1687,7 +1688,7 @@
1687 1688 struct bio *bio)
1688 1689 {
1689 1690 struct cfq_data *cfqd = q->elevator->elevator_data;
1690   - struct cfq_io_context *cic;
  1691 + struct cfq_io_cq *cic;
1691 1692 struct cfq_queue *cfqq;
1692 1693  
1693 1694 /*
1694 1695  
... ... @@ -1697,12 +1698,19 @@
1697 1698 return false;
1698 1699  
1699 1700 /*
1700   - * Lookup the cfqq that this bio will be queued with. Allow
1701   - * merge only if rq is queued there.
  1701 + * Lookup the cfqq that this bio will be queued with and allow
  1702 + * merge only if rq is queued there. This function can be called
  1703 + * from plug merge without queue_lock. In such cases, ioc of @rq
  1704 + * and %current are guaranteed to be equal. Avoid lookup which
  1705 + * requires queue_lock by using @rq's cic.
1702 1706 */
1703   - cic = cfq_cic_lookup(cfqd, current->io_context);
1704   - if (!cic)
1705   - return false;
  1707 + if (current->io_context == RQ_CIC(rq)->icq.ioc) {
  1708 + cic = RQ_CIC(rq);
  1709 + } else {
  1710 + cic = cfq_cic_lookup(cfqd, current->io_context);
  1711 + if (!cic)
  1712 + return false;
  1713 + }
1706 1714  
1707 1715 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1708 1716 return cfqq == RQ_CFQQ(rq);
... ... @@ -1786,7 +1794,7 @@
1786 1794 cfqd->active_queue = NULL;
1787 1795  
1788 1796 if (cfqd->active_cic) {
1789   - put_io_context(cfqd->active_cic->ioc);
  1797 + put_io_context(cfqd->active_cic->icq.ioc, cfqd->queue);
1790 1798 cfqd->active_cic = NULL;
1791 1799 }
1792 1800 }
... ... @@ -2006,7 +2014,7 @@
2006 2014 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
2007 2015 {
2008 2016 struct cfq_queue *cfqq = cfqd->active_queue;
2009   - struct cfq_io_context *cic;
  2017 + struct cfq_io_cq *cic;
2010 2018 unsigned long sl, group_idle = 0;
2011 2019  
2012 2020 /*
... ... @@ -2041,7 +2049,7 @@
2041 2049 * task has exited, don't wait
2042 2050 */
2043 2051 cic = cfqd->active_cic;
2044   - if (!cic || !atomic_read(&cic->ioc->nr_tasks))
  2052 + if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks))
2045 2053 return;
2046 2054  
2047 2055 /*
2048 2056  
... ... @@ -2592,9 +2600,9 @@
2592 2600 cfq_dispatch_insert(cfqd->queue, rq);
2593 2601  
2594 2602 if (!cfqd->active_cic) {
2595   - struct cfq_io_context *cic = RQ_CIC(rq);
  2603 + struct cfq_io_cq *cic = RQ_CIC(rq);
2596 2604  
2597   - atomic_long_inc(&cic->ioc->refcount);
  2605 + atomic_long_inc(&cic->icq.ioc->refcount);
2598 2606 cfqd->active_cic = cic;
2599 2607 }
2600 2608  
... ... @@ -2677,84 +2685,6 @@
2677 2685 cfq_put_cfqg(cfqg);
2678 2686 }
2679 2687  
2680   -/*
2681   - * Call func for each cic attached to this ioc.
2682   - */
2683   -static void
2684   -call_for_each_cic(struct io_context *ioc,
2685   - void (*func)(struct io_context *, struct cfq_io_context *))
2686   -{
2687   - struct cfq_io_context *cic;
2688   - struct hlist_node *n;
2689   -
2690   - rcu_read_lock();
2691   -
2692   - hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
2693   - func(ioc, cic);
2694   -
2695   - rcu_read_unlock();
2696   -}
2697   -
2698   -static void cfq_cic_free_rcu(struct rcu_head *head)
2699   -{
2700   - struct cfq_io_context *cic;
2701   -
2702   - cic = container_of(head, struct cfq_io_context, rcu_head);
2703   -
2704   - kmem_cache_free(cfq_ioc_pool, cic);
2705   - elv_ioc_count_dec(cfq_ioc_count);
2706   -
2707   - if (ioc_gone) {
2708   - /*
2709   - * CFQ scheduler is exiting, grab exit lock and check
2710   - * the pending io context count. If it hits zero,
2711   - * complete ioc_gone and set it back to NULL
2712   - */
2713   - spin_lock(&ioc_gone_lock);
2714   - if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
2715   - complete(ioc_gone);
2716   - ioc_gone = NULL;
2717   - }
2718   - spin_unlock(&ioc_gone_lock);
2719   - }
2720   -}
2721   -
2722   -static void cfq_cic_free(struct cfq_io_context *cic)
2723   -{
2724   - call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
2725   -}
2726   -
2727   -static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
2728   -{
2729   - unsigned long flags;
2730   - unsigned long dead_key = (unsigned long) cic->key;
2731   -
2732   - BUG_ON(!(dead_key & CIC_DEAD_KEY));
2733   -
2734   - spin_lock_irqsave(&ioc->lock, flags);
2735   - radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
2736   - hlist_del_rcu(&cic->cic_list);
2737   - spin_unlock_irqrestore(&ioc->lock, flags);
2738   -
2739   - cfq_cic_free(cic);
2740   -}
2741   -
2742   -/*
2743   - * Must be called with rcu_read_lock() held or preemption otherwise disabled.
2744   - * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
2745   - * and ->trim() which is called with the task lock held
2746   - */
2747   -static void cfq_free_io_context(struct io_context *ioc)
2748   -{
2749   - /*
2750   - * ioc->refcount is zero here, or we are called from elv_unregister(),
2751   - * so no more cic's are allowed to be linked into this ioc. So it
2752   - * should be ok to iterate over the known list, we will see all cic's
2753   - * since no new ones are added.
2754   - */
2755   - call_for_each_cic(ioc, cic_free_func);
2756   -}
2757   -
2758 2688 static void cfq_put_cooperator(struct cfq_queue *cfqq)
2759 2689 {
2760 2690 struct cfq_queue *__cfqq, *next;
2761 2691  
2762 2692  
2763 2693  
2764 2694  
... ... @@ -2788,28 +2718,18 @@
2788 2718 cfq_put_queue(cfqq);
2789 2719 }
2790 2720  
2791   -static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
2792   - struct cfq_io_context *cic)
  2721 +static void cfq_init_icq(struct io_cq *icq)
2793 2722 {
2794   - struct io_context *ioc = cic->ioc;
  2723 + struct cfq_io_cq *cic = icq_to_cic(icq);
2795 2724  
2796   - list_del_init(&cic->queue_list);
  2725 + cic->ttime.last_end_request = jiffies;
  2726 +}
2797 2727  
2798   - /*
2799   - * Make sure dead mark is seen for dead queues
2800   - */
2801   - smp_wmb();
2802   - cic->key = cfqd_dead_key(cfqd);
  2728 +static void cfq_exit_icq(struct io_cq *icq)
  2729 +{
  2730 + struct cfq_io_cq *cic = icq_to_cic(icq);
  2731 + struct cfq_data *cfqd = cic_to_cfqd(cic);
2803 2732  
2804   - rcu_read_lock();
2805   - if (rcu_dereference(ioc->ioc_data) == cic) {
2806   - rcu_read_unlock();
2807   - spin_lock(&ioc->lock);
2808   - rcu_assign_pointer(ioc->ioc_data, NULL);
2809   - spin_unlock(&ioc->lock);
2810   - } else
2811   - rcu_read_unlock();
2812   -
2813 2733 if (cic->cfqq[BLK_RW_ASYNC]) {
2814 2734 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
2815 2735 cic->cfqq[BLK_RW_ASYNC] = NULL;
... ... @@ -2821,57 +2741,6 @@
2821 2741 }
2822 2742 }
2823 2743  
2824   -static void cfq_exit_single_io_context(struct io_context *ioc,
2825   - struct cfq_io_context *cic)
2826   -{
2827   - struct cfq_data *cfqd = cic_to_cfqd(cic);
2828   -
2829   - if (cfqd) {
2830   - struct request_queue *q = cfqd->queue;
2831   - unsigned long flags;
2832   -
2833   - spin_lock_irqsave(q->queue_lock, flags);
2834   -
2835   - /*
2836   - * Ensure we get a fresh copy of the ->key to prevent
2837   - * race between exiting task and queue
2838   - */
2839   - smp_read_barrier_depends();
2840   - if (cic->key == cfqd)
2841   - __cfq_exit_single_io_context(cfqd, cic);
2842   -
2843   - spin_unlock_irqrestore(q->queue_lock, flags);
2844   - }
2845   -}
2846   -
2847   -/*
2848   - * The process that ioc belongs to has exited, we need to clean up
2849   - * and put the internal structures we have that belongs to that process.
2850   - */
2851   -static void cfq_exit_io_context(struct io_context *ioc)
2852   -{
2853   - call_for_each_cic(ioc, cfq_exit_single_io_context);
2854   -}
2855   -
2856   -static struct cfq_io_context *
2857   -cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
2858   -{
2859   - struct cfq_io_context *cic;
2860   -
2861   - cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
2862   - cfqd->queue->node);
2863   - if (cic) {
2864   - cic->ttime.last_end_request = jiffies;
2865   - INIT_LIST_HEAD(&cic->queue_list);
2866   - INIT_HLIST_NODE(&cic->cic_list);
2867   - cic->dtor = cfq_free_io_context;
2868   - cic->exit = cfq_exit_io_context;
2869   - elv_ioc_count_inc(cfq_ioc_count);
2870   - }
2871   -
2872   - return cic;
2873   -}
2874   -
2875 2744 static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2876 2745 {
2877 2746 struct task_struct *tsk = current;
2878 2747  
2879 2748  
2880 2749  
... ... @@ -2914,21 +2783,18 @@
2914 2783 cfq_clear_cfqq_prio_changed(cfqq);
2915 2784 }
2916 2785  
2917   -static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
  2786 +static void changed_ioprio(struct cfq_io_cq *cic)
2918 2787 {
2919 2788 struct cfq_data *cfqd = cic_to_cfqd(cic);
2920 2789 struct cfq_queue *cfqq;
2921   - unsigned long flags;
2922 2790  
2923 2791 if (unlikely(!cfqd))
2924 2792 return;
2925 2793  
2926   - spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2927   -
2928 2794 cfqq = cic->cfqq[BLK_RW_ASYNC];
2929 2795 if (cfqq) {
2930 2796 struct cfq_queue *new_cfqq;
2931   - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
  2797 + new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc,
2932 2798 GFP_ATOMIC);
2933 2799 if (new_cfqq) {
2934 2800 cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
2935 2801  
... ... @@ -2939,16 +2805,8 @@
2939 2805 cfqq = cic->cfqq[BLK_RW_SYNC];
2940 2806 if (cfqq)
2941 2807 cfq_mark_cfqq_prio_changed(cfqq);
2942   -
2943   - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2944 2808 }
2945 2809  
2946   -static void cfq_ioc_set_ioprio(struct io_context *ioc)
2947   -{
2948   - call_for_each_cic(ioc, changed_ioprio);
2949   - ioc->ioprio_changed = 0;
2950   -}
2951   -
2952 2810 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2953 2811 pid_t pid, bool is_sync)
2954 2812 {
2955 2813  
... ... @@ -2970,11 +2828,10 @@
2970 2828 }
2971 2829  
2972 2830 #ifdef CONFIG_CFQ_GROUP_IOSCHED
2973   -static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
  2831 +static void changed_cgroup(struct cfq_io_cq *cic)
2974 2832 {
2975 2833 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2976 2834 struct cfq_data *cfqd = cic_to_cfqd(cic);
2977   - unsigned long flags;
2978 2835 struct request_queue *q;
2979 2836  
2980 2837 if (unlikely(!cfqd))
... ... @@ -2982,8 +2839,6 @@
2982 2839  
2983 2840 q = cfqd->queue;
2984 2841  
2985   - spin_lock_irqsave(q->queue_lock, flags);
2986   -
2987 2842 if (sync_cfqq) {
2988 2843 /*
2989 2844 * Drop reference to sync queue. A new sync queue will be
2990 2845  
... ... @@ -2993,15 +2848,7 @@
2993 2848 cic_set_cfqq(cic, NULL, 1);
2994 2849 cfq_put_queue(sync_cfqq);
2995 2850 }
2996   -
2997   - spin_unlock_irqrestore(q->queue_lock, flags);
2998 2851 }
2999   -
3000   -static void cfq_ioc_set_cgroup(struct io_context *ioc)
3001   -{
3002   - call_for_each_cic(ioc, changed_cgroup);
3003   - ioc->cgroup_changed = 0;
3004   -}
3005 2852 #endif /* CONFIG_CFQ_GROUP_IOSCHED */
3006 2853  
3007 2854 static struct cfq_queue *
... ... @@ -3009,7 +2856,7 @@
3009 2856 struct io_context *ioc, gfp_t gfp_mask)
3010 2857 {
3011 2858 struct cfq_queue *cfqq, *new_cfqq = NULL;
3012   - struct cfq_io_context *cic;
  2859 + struct cfq_io_cq *cic;
3013 2860 struct cfq_group *cfqg;
3014 2861  
3015 2862 retry:
3016 2863  
... ... @@ -3100,161 +2947,7 @@
3100 2947 return cfqq;
3101 2948 }
3102 2949  
3103   -/*
3104   - * We drop cfq io contexts lazily, so we may find a dead one.
3105   - */
3106 2950 static void
3107   -cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
3108   - struct cfq_io_context *cic)
3109   -{
3110   - unsigned long flags;
3111   -
3112   - WARN_ON(!list_empty(&cic->queue_list));
3113   - BUG_ON(cic->key != cfqd_dead_key(cfqd));
3114   -
3115   - spin_lock_irqsave(&ioc->lock, flags);
3116   -
3117   - BUG_ON(rcu_dereference_check(ioc->ioc_data,
3118   - lockdep_is_held(&ioc->lock)) == cic);
3119   -
3120   - radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
3121   - hlist_del_rcu(&cic->cic_list);
3122   - spin_unlock_irqrestore(&ioc->lock, flags);
3123   -
3124   - cfq_cic_free(cic);
3125   -}
3126   -
3127   -static struct cfq_io_context *
3128   -cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
3129   -{
3130   - struct cfq_io_context *cic;
3131   - unsigned long flags;
3132   -
3133   - if (unlikely(!ioc))
3134   - return NULL;
3135   -
3136   - rcu_read_lock();
3137   -
3138   - /*
3139   - * we maintain a last-hit cache, to avoid browsing over the tree
3140   - */
3141   - cic = rcu_dereference(ioc->ioc_data);
3142   - if (cic && cic->key == cfqd) {
3143   - rcu_read_unlock();
3144   - return cic;
3145   - }
3146   -
3147   - do {
3148   - cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
3149   - rcu_read_unlock();
3150   - if (!cic)
3151   - break;
3152   - if (unlikely(cic->key != cfqd)) {
3153   - cfq_drop_dead_cic(cfqd, ioc, cic);
3154   - rcu_read_lock();
3155   - continue;
3156   - }
3157   -
3158   - spin_lock_irqsave(&ioc->lock, flags);
3159   - rcu_assign_pointer(ioc->ioc_data, cic);
3160   - spin_unlock_irqrestore(&ioc->lock, flags);
3161   - break;
3162   - } while (1);
3163   -
3164   - return cic;
3165   -}
3166   -
3167   -/*
3168   - * Add cic into ioc, using cfqd as the search key. This enables us to lookup
3169   - * the process specific cfq io context when entered from the block layer.
3170   - * Also adds the cic to a per-cfqd list, used when this queue is removed.
3171   - */
3172   -static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
3173   - struct cfq_io_context *cic, gfp_t gfp_mask)
3174   -{
3175   - unsigned long flags;
3176   - int ret;
3177   -
3178   - ret = radix_tree_preload(gfp_mask);
3179   - if (!ret) {
3180   - cic->ioc = ioc;
3181   - cic->key = cfqd;
3182   -
3183   - spin_lock_irqsave(&ioc->lock, flags);
3184   - ret = radix_tree_insert(&ioc->radix_root,
3185   - cfqd->cic_index, cic);
3186   - if (!ret)
3187   - hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
3188   - spin_unlock_irqrestore(&ioc->lock, flags);
3189   -
3190   - radix_tree_preload_end();
3191   -
3192   - if (!ret) {
3193   - spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3194   - list_add(&cic->queue_list, &cfqd->cic_list);
3195   - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3196   - }
3197   - }
3198   -
3199   - if (ret && ret != -EEXIST)
3200   - printk(KERN_ERR "cfq: cic link failed!\n");
3201   -
3202   - return ret;
3203   -}
3204   -
3205   -/*
3206   - * Setup general io context and cfq io context. There can be several cfq
3207   - * io contexts per general io context, if this process is doing io to more
3208   - * than one device managed by cfq.
3209   - */
3210   -static struct cfq_io_context *
3211   -cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
3212   -{
3213   - struct io_context *ioc = NULL;
3214   - struct cfq_io_context *cic;
3215   - int ret;
3216   -
3217   - might_sleep_if(gfp_mask & __GFP_WAIT);
3218   -
3219   - ioc = get_io_context(gfp_mask, cfqd->queue->node);
3220   - if (!ioc)
3221   - return NULL;
3222   -
3223   -retry:
3224   - cic = cfq_cic_lookup(cfqd, ioc);
3225   - if (cic)
3226   - goto out;
3227   -
3228   - cic = cfq_alloc_io_context(cfqd, gfp_mask);
3229   - if (cic == NULL)
3230   - goto err;
3231   -
3232   - ret = cfq_cic_link(cfqd, ioc, cic, gfp_mask);
3233   - if (ret == -EEXIST) {
3234   - /* someone has linked cic to ioc already */
3235   - cfq_cic_free(cic);
3236   - goto retry;
3237   - } else if (ret)
3238   - goto err_free;
3239   -
3240   -out:
3241   - smp_read_barrier_depends();
3242   - if (unlikely(ioc->ioprio_changed))
3243   - cfq_ioc_set_ioprio(ioc);
3244   -
3245   -#ifdef CONFIG_CFQ_GROUP_IOSCHED
3246   - if (unlikely(ioc->cgroup_changed))
3247   - cfq_ioc_set_cgroup(ioc);
3248   -#endif
3249   - return cic;
3250   -err_free:
3251   - cfq_cic_free(cic);
3252   -err:
3253   - put_io_context(ioc);
3254   - return NULL;
3255   -}
3256   -
3257   -static void
3258 2951 __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
3259 2952 {
3260 2953 unsigned long elapsed = jiffies - ttime->last_end_request;
... ... @@ -3267,7 +2960,7 @@
3267 2960  
3268 2961 static void
3269 2962 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3270   - struct cfq_io_context *cic)
  2963 + struct cfq_io_cq *cic)
3271 2964 {
3272 2965 if (cfq_cfqq_sync(cfqq)) {
3273 2966 __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
... ... @@ -3305,7 +2998,7 @@
3305 2998 */
3306 2999 static void
3307 3000 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3308   - struct cfq_io_context *cic)
  3001 + struct cfq_io_cq *cic)
3309 3002 {
3310 3003 int old_idle, enable_idle;
3311 3004  
... ... @@ -3322,8 +3015,9 @@
3322 3015  
3323 3016 if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
3324 3017 enable_idle = 0;
3325   - else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3326   - (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
  3018 + else if (!atomic_read(&cic->icq.ioc->nr_tasks) ||
  3019 + !cfqd->cfq_slice_idle ||
  3020 + (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3327 3021 enable_idle = 0;
3328 3022 else if (sample_valid(cic->ttime.ttime_samples)) {
3329 3023 if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
... ... @@ -3455,7 +3149,7 @@
3455 3149 cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3456 3150 struct request *rq)
3457 3151 {
3458   - struct cfq_io_context *cic = RQ_CIC(rq);
  3152 + struct cfq_io_cq *cic = RQ_CIC(rq);
3459 3153  
3460 3154 cfqd->rq_queued++;
3461 3155 if (rq->cmd_flags & REQ_PRIO)
... ... @@ -3508,7 +3202,7 @@
3508 3202 struct cfq_queue *cfqq = RQ_CFQQ(rq);
3509 3203  
3510 3204 cfq_log_cfqq(cfqd, cfqq, "insert_request");
3511   - cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
  3205 + cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc);
3512 3206  
3513 3207 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3514 3208 list_add_tail(&rq->queuelist, &cfqq->fifo);
... ... @@ -3558,7 +3252,7 @@
3558 3252  
3559 3253 static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3560 3254 {
3561   - struct cfq_io_context *cic = cfqd->active_cic;
  3255 + struct cfq_io_cq *cic = cfqd->active_cic;
3562 3256  
3563 3257 /* If the queue already has requests, don't wait */
3564 3258 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
... ... @@ -3695,7 +3389,7 @@
3695 3389 {
3696 3390 struct cfq_data *cfqd = q->elevator->elevator_data;
3697 3391 struct task_struct *tsk = current;
3698   - struct cfq_io_context *cic;
  3392 + struct cfq_io_cq *cic;
3699 3393 struct cfq_queue *cfqq;
3700 3394  
3701 3395 /*
... ... @@ -3710,7 +3404,7 @@
3710 3404  
3711 3405 cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3712 3406 if (cfqq) {
3713   - cfq_init_prio_data(cfqq, cic->ioc);
  3407 + cfq_init_prio_data(cfqq, cic->icq.ioc);
3714 3408  
3715 3409 return __cfq_may_queue(cfqq);
3716 3410 }
3717 3411  
3718 3412  
... ... @@ -3731,21 +3425,17 @@
3731 3425 BUG_ON(!cfqq->allocated[rw]);
3732 3426 cfqq->allocated[rw]--;
3733 3427  
3734   - put_io_context(RQ_CIC(rq)->ioc);
3735   -
3736   - rq->elevator_private[0] = NULL;
3737   - rq->elevator_private[1] = NULL;
3738   -
3739 3428 /* Put down rq reference on cfqg */
3740 3429 cfq_put_cfqg(RQ_CFQG(rq));
3741   - rq->elevator_private[2] = NULL;
  3430 + rq->elv.priv[0] = NULL;
  3431 + rq->elv.priv[1] = NULL;
3742 3432  
3743 3433 cfq_put_queue(cfqq);
3744 3434 }
3745 3435 }
3746 3436  
3747 3437 static struct cfq_queue *
3748   -cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
  3438 +cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
3749 3439 struct cfq_queue *cfqq)
3750 3440 {
3751 3441 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
... ... @@ -3760,7 +3450,7 @@
3760 3450 * was the last process referring to said cfqq.
3761 3451 */
3762 3452 static struct cfq_queue *
3763   -split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
  3453 +split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
3764 3454 {
3765 3455 if (cfqq_process_refs(cfqq) == 1) {
3766 3456 cfqq->pid = current->pid;
3767 3457  
3768 3458  
3769 3459  
3770 3460  
3771 3461  
... ... @@ -3783,25 +3473,29 @@
3783 3473 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
3784 3474 {
3785 3475 struct cfq_data *cfqd = q->elevator->elevator_data;
3786   - struct cfq_io_context *cic;
  3476 + struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
3787 3477 const int rw = rq_data_dir(rq);
3788 3478 const bool is_sync = rq_is_sync(rq);
3789 3479 struct cfq_queue *cfqq;
3790   - unsigned long flags;
3791 3480  
3792 3481 might_sleep_if(gfp_mask & __GFP_WAIT);
3793 3482  
3794   - cic = cfq_get_io_context(cfqd, gfp_mask);
  3483 + spin_lock_irq(q->queue_lock);
3795 3484  
3796   - spin_lock_irqsave(q->queue_lock, flags);
  3485 + /* handle changed notifications */
  3486 + if (unlikely(cic->icq.changed)) {
  3487 + if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed))
  3488 + changed_ioprio(cic);
  3489 +#ifdef CONFIG_CFQ_GROUP_IOSCHED
  3490 + if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed))
  3491 + changed_cgroup(cic);
  3492 +#endif
  3493 + }
3797 3494  
3798   - if (!cic)
3799   - goto queue_fail;
3800   -
3801 3495 new_queue:
3802 3496 cfqq = cic_to_cfqq(cic, is_sync);
3803 3497 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3804   - cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
  3498 + cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask);
3805 3499 cic_set_cfqq(cic, cfqq, is_sync);
3806 3500 } else {
3807 3501 /*
3808 3502  
... ... @@ -3827,17 +3521,10 @@
3827 3521 cfqq->allocated[rw]++;
3828 3522  
3829 3523 cfqq->ref++;
3830   - rq->elevator_private[0] = cic;
3831   - rq->elevator_private[1] = cfqq;
3832   - rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
3833   - spin_unlock_irqrestore(q->queue_lock, flags);
  3524 + rq->elv.priv[0] = cfqq;
  3525 + rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg);
  3526 + spin_unlock_irq(q->queue_lock);
3834 3527 return 0;
3835   -
3836   -queue_fail:
3837   - cfq_schedule_dispatch(cfqd);
3838   - spin_unlock_irqrestore(q->queue_lock, flags);
3839   - cfq_log(cfqd, "set_request fail");
3840   - return 1;
3841 3528 }
3842 3529  
3843 3530 static void cfq_kick_queue(struct work_struct *work)
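Instead of walking every cfq_io_context on the ioc the way the removed cfq_ioc_set_ioprio()/cfq_ioc_set_cgroup() helpers did, cfq now consumes the notification lazily in cfq_set_request() above via the per-icq changed bits. Those bits are set by the new ioc_ioprio_changed()/ioc_cgroup_changed() helpers; a rough sketch of their shape (the real implementation lives in block/blk-ioc.c, which is not part of this diff):

void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
{
        struct hlist_node *n;
        unsigned long flags;
        struct io_cq *icq;

        spin_lock_irqsave(&ioc->lock, flags);
        ioc->ioprio = ioprio;
        /* flag every io_cq; each elevator picks it up on its next set_req */
        hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node)
                set_bit(ICQ_IOPRIO_CHANGED, &icq->changed);
        spin_unlock_irqrestore(&ioc->lock, flags);
}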
... ... @@ -3941,14 +3628,6 @@
3941 3628 if (cfqd->active_queue)
3942 3629 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3943 3630  
3944   - while (!list_empty(&cfqd->cic_list)) {
3945   - struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
3946   - struct cfq_io_context,
3947   - queue_list);
3948   -
3949   - __cfq_exit_single_io_context(cfqd, cic);
3950   - }
3951   -
3952 3631 cfq_put_async_queues(cfqd);
3953 3632 cfq_release_cfq_groups(cfqd);
3954 3633  
... ... @@ -3963,10 +3642,6 @@
3963 3642  
3964 3643 cfq_shutdown_timer_wq(cfqd);
3965 3644  
3966   - spin_lock(&cic_index_lock);
3967   - ida_remove(&cic_index_ida, cfqd->cic_index);
3968   - spin_unlock(&cic_index_lock);
3969   -
3970 3645 /*
3971 3646 * Wait for cfqg->blkg->key accessors to exit their grace periods.
3972 3647 * Do this wait only if there are other unlinked groups out
... ... @@ -3988,24 +3663,6 @@
3988 3663 kfree(cfqd);
3989 3664 }
3990 3665  
3991   -static int cfq_alloc_cic_index(void)
3992   -{
3993   - int index, error;
3994   -
3995   - do {
3996   - if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
3997   - return -ENOMEM;
3998   -
3999   - spin_lock(&cic_index_lock);
4000   - error = ida_get_new(&cic_index_ida, &index);
4001   - spin_unlock(&cic_index_lock);
4002   - if (error && error != -EAGAIN)
4003   - return error;
4004   - } while (error);
4005   -
4006   - return index;
4007   -}
4008   -
4009 3666 static void *cfq_init_queue(struct request_queue *q)
4010 3667 {
4011 3668 struct cfq_data *cfqd;
4012 3669  
4013 3670  
4014 3671  
... ... @@ -4013,24 +3670,10 @@
4013 3670 struct cfq_group *cfqg;
4014 3671 struct cfq_rb_root *st;
4015 3672  
4016   - i = cfq_alloc_cic_index();
4017   - if (i < 0)
4018   - return NULL;
4019   -
4020 3673 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
4021   - if (!cfqd) {
4022   - spin_lock(&cic_index_lock);
4023   - ida_remove(&cic_index_ida, i);
4024   - spin_unlock(&cic_index_lock);
  3674 + if (!cfqd)
4025 3675 return NULL;
4026   - }
4027 3676  
4028   - /*
4029   - * Don't need take queue_lock in the routine, since we are
4030   - * initializing the ioscheduler, and nobody is using cfqd
4031   - */
4032   - cfqd->cic_index = i;
4033   -
4034 3677 /* Init root service tree */
4035 3678 cfqd->grp_service_tree = CFQ_RB_ROOT;
4036 3679  
... ... @@ -4055,11 +3698,6 @@
4055 3698  
4056 3699 if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
4057 3700 kfree(cfqg);
4058   -
4059   - spin_lock(&cic_index_lock);
4060   - ida_remove(&cic_index_ida, cfqd->cic_index);
4061   - spin_unlock(&cic_index_lock);
4062   -
4063 3701 kfree(cfqd);
4064 3702 return NULL;
4065 3703 }
... ... @@ -4091,8 +3729,6 @@
4091 3729 cfqd->oom_cfqq.ref++;
4092 3730 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
4093 3731  
4094   - INIT_LIST_HEAD(&cfqd->cic_list);
4095   -
4096 3732 cfqd->queue = q;
4097 3733  
4098 3734 init_timer(&cfqd->idle_slice_timer);
... ... @@ -4121,34 +3757,6 @@
4121 3757 return cfqd;
4122 3758 }
4123 3759  
4124   -static void cfq_slab_kill(void)
4125   -{
4126   - /*
4127   - * Caller already ensured that pending RCU callbacks are completed,
4128   - * so we should have no busy allocations at this point.
4129   - */
4130   - if (cfq_pool)
4131   - kmem_cache_destroy(cfq_pool);
4132   - if (cfq_ioc_pool)
4133   - kmem_cache_destroy(cfq_ioc_pool);
4134   -}
4135   -
4136   -static int __init cfq_slab_setup(void)
4137   -{
4138   - cfq_pool = KMEM_CACHE(cfq_queue, 0);
4139   - if (!cfq_pool)
4140   - goto fail;
4141   -
4142   - cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
4143   - if (!cfq_ioc_pool)
4144   - goto fail;
4145   -
4146   - return 0;
4147   -fail:
4148   - cfq_slab_kill();
4149   - return -ENOMEM;
4150   -}
4151   -
4152 3760 /*
4153 3761 * sysfs parts below -->
4154 3762 */
4155 3763  
4156 3764  
4157 3765  
... ... @@ -4254,15 +3862,18 @@
4254 3862 .elevator_completed_req_fn = cfq_completed_request,
4255 3863 .elevator_former_req_fn = elv_rb_former_request,
4256 3864 .elevator_latter_req_fn = elv_rb_latter_request,
  3865 + .elevator_init_icq_fn = cfq_init_icq,
  3866 + .elevator_exit_icq_fn = cfq_exit_icq,
4257 3867 .elevator_set_req_fn = cfq_set_request,
4258 3868 .elevator_put_req_fn = cfq_put_request,
4259 3869 .elevator_may_queue_fn = cfq_may_queue,
4260 3870 .elevator_init_fn = cfq_init_queue,
4261 3871 .elevator_exit_fn = cfq_exit_queue,
4262   - .trim = cfq_free_io_context,
4263 3872 },
  3873 + .icq_size = sizeof(struct cfq_io_cq),
  3874 + .icq_align = __alignof__(struct cfq_io_cq),
4264 3875 .elevator_attrs = cfq_attrs,
4265   - .elevator_name = "cfq",
  3876 + .elevator_name = "cfq",
4266 3877 .elevator_owner = THIS_MODULE,
4267 3878 };
4268 3879  
... ... @@ -4280,6 +3891,8 @@
4280 3891  
4281 3892 static int __init cfq_init(void)
4282 3893 {
  3894 + int ret;
  3895 +
4283 3896 /*
4284 3897 * could be 0 on HZ < 1000 setups
4285 3898 */
4286 3899  
... ... @@ -4294,10 +3907,16 @@
4294 3907 #else
4295 3908 cfq_group_idle = 0;
4296 3909 #endif
4297   - if (cfq_slab_setup())
  3910 + cfq_pool = KMEM_CACHE(cfq_queue, 0);
  3911 + if (!cfq_pool)
4298 3912 return -ENOMEM;
4299 3913  
4300   - elv_register(&iosched_cfq);
  3914 + ret = elv_register(&iosched_cfq);
  3915 + if (ret) {
  3916 + kmem_cache_destroy(cfq_pool);
  3917 + return ret;
  3918 + }
  3919 +
4301 3920 blkio_policy_register(&blkio_policy_cfq);
4302 3921  
4303 3922 return 0;
4304 3923  
... ... @@ -4305,21 +3924,9 @@
4305 3924  
4306 3925 static void __exit cfq_exit(void)
4307 3926 {
4308   - DECLARE_COMPLETION_ONSTACK(all_gone);
4309 3927 blkio_policy_unregister(&blkio_policy_cfq);
4310 3928 elv_unregister(&iosched_cfq);
4311   - ioc_gone = &all_gone;
4312   - /* ioc_gone's update must be visible before reading ioc_count */
4313   - smp_wmb();
4314   -
4315   - /*
4316   - * this also protects us from entering cfq_slab_kill() with
4317   - * pending RCU callbacks
4318   - */
4319   - if (elv_ioc_count_read(cfq_ioc_count))
4320   - wait_for_completion(&all_gone);
4321   - ida_destroy(&cic_index_ida);
4322   - cfq_slab_kill();
  3929 + kmem_cache_destroy(cfq_pool);
4323 3930 }
4324 3931  
4325 3932 module_init(cfq_init);
block/compat_ioctl.c
... ... @@ -719,6 +719,9 @@
719 719 case BLKSECTGET:
720 720 return compat_put_ushort(arg,
721 721 queue_max_sectors(bdev_get_queue(bdev)));
  722 + case BLKROTATIONAL:
  723 + return compat_put_ushort(arg,
  724 + !blk_queue_nonrot(bdev_get_queue(bdev)));
722 725 case BLKRASET: /* compatible, but no compat_ptr (!) */
723 726 case BLKFRASET:
724 727 if (!capable(CAP_SYS_ADMIN))
block/deadline-iosched.c
... ... @@ -448,9 +448,7 @@
448 448  
449 449 static int __init deadline_init(void)
450 450 {
451   - elv_register(&iosched_deadline);
452   -
453   - return 0;
  451 + return elv_register(&iosched_deadline);
454 452 }
455 453  
456 454 static void __exit deadline_exit(void)
block/elevator.c
... ... @@ -61,8 +61,8 @@
61 61 struct request_queue *q = rq->q;
62 62 struct elevator_queue *e = q->elevator;
63 63  
64   - if (e->ops->elevator_allow_merge_fn)
65   - return e->ops->elevator_allow_merge_fn(q, rq, bio);
  64 + if (e->type->ops.elevator_allow_merge_fn)
  65 + return e->type->ops.elevator_allow_merge_fn(q, rq, bio);
66 66  
67 67 return 1;
68 68 }
69 69  
70 70  
... ... @@ -168,19 +168,15 @@
168 168 return e;
169 169 }
170 170  
171   -static void *elevator_init_queue(struct request_queue *q,
172   - struct elevator_queue *eq)
  171 +static int elevator_init_queue(struct request_queue *q,
  172 + struct elevator_queue *eq)
173 173 {
174   - return eq->ops->elevator_init_fn(q);
  174 + eq->elevator_data = eq->type->ops.elevator_init_fn(q);
  175 + if (eq->elevator_data)
  176 + return 0;
  177 + return -ENOMEM;
175 178 }
176 179  
177   -static void elevator_attach(struct request_queue *q, struct elevator_queue *eq,
178   - void *data)
179   -{
180   - q->elevator = eq;
181   - eq->elevator_data = data;
182   -}
183   -
184 180 static char chosen_elevator[ELV_NAME_MAX];
185 181  
186 182 static int __init elevator_setup(char *str)
... ... @@ -207,8 +203,7 @@
207 203 if (unlikely(!eq))
208 204 goto err;
209 205  
210   - eq->ops = &e->ops;
211   - eq->elevator_type = e;
  206 + eq->type = e;
212 207 kobject_init(&eq->kobj, &elv_ktype);
213 208 mutex_init(&eq->sysfs_lock);
214 209  
... ... @@ -232,7 +227,7 @@
232 227 struct elevator_queue *e;
233 228  
234 229 e = container_of(kobj, struct elevator_queue, kobj);
235   - elevator_put(e->elevator_type);
  230 + elevator_put(e->type);
236 231 kfree(e->hash);
237 232 kfree(e);
238 233 }
... ... @@ -241,7 +236,7 @@
241 236 {
242 237 struct elevator_type *e = NULL;
243 238 struct elevator_queue *eq;
244   - void *data;
  239 + int err;
245 240  
246 241 if (unlikely(q->elevator))
247 242 return 0;
248 243  
249 244  
... ... @@ -278,13 +273,13 @@
278 273 if (!eq)
279 274 return -ENOMEM;
280 275  
281   - data = elevator_init_queue(q, eq);
282   - if (!data) {
  276 + err = elevator_init_queue(q, eq);
  277 + if (err) {
283 278 kobject_put(&eq->kobj);
284   - return -ENOMEM;
  279 + return err;
285 280 }
286 281  
287   - elevator_attach(q, eq, data);
  282 + q->elevator = eq;
288 283 return 0;
289 284 }
290 285 EXPORT_SYMBOL(elevator_init);
... ... @@ -292,9 +287,8 @@
292 287 void elevator_exit(struct elevator_queue *e)
293 288 {
294 289 mutex_lock(&e->sysfs_lock);
295   - if (e->ops->elevator_exit_fn)
296   - e->ops->elevator_exit_fn(e);
297   - e->ops = NULL;
  290 + if (e->type->ops.elevator_exit_fn)
  291 + e->type->ops.elevator_exit_fn(e);
298 292 mutex_unlock(&e->sysfs_lock);
299 293  
300 294 kobject_put(&e->kobj);
... ... @@ -504,8 +498,8 @@
504 498 return ELEVATOR_BACK_MERGE;
505 499 }
506 500  
507   - if (e->ops->elevator_merge_fn)
508   - return e->ops->elevator_merge_fn(q, req, bio);
  501 + if (e->type->ops.elevator_merge_fn)
  502 + return e->type->ops.elevator_merge_fn(q, req, bio);
509 503  
510 504 return ELEVATOR_NO_MERGE;
511 505 }
... ... @@ -548,8 +542,8 @@
548 542 {
549 543 struct elevator_queue *e = q->elevator;
550 544  
551   - if (e->ops->elevator_merged_fn)
552   - e->ops->elevator_merged_fn(q, rq, type);
  545 + if (e->type->ops.elevator_merged_fn)
  546 + e->type->ops.elevator_merged_fn(q, rq, type);
553 547  
554 548 if (type == ELEVATOR_BACK_MERGE)
555 549 elv_rqhash_reposition(q, rq);
... ... @@ -563,8 +557,8 @@
563 557 struct elevator_queue *e = q->elevator;
564 558 const int next_sorted = next->cmd_flags & REQ_SORTED;
565 559  
566   - if (next_sorted && e->ops->elevator_merge_req_fn)
567   - e->ops->elevator_merge_req_fn(q, rq, next);
  560 + if (next_sorted && e->type->ops.elevator_merge_req_fn)
  561 + e->type->ops.elevator_merge_req_fn(q, rq, next);
568 562  
569 563 elv_rqhash_reposition(q, rq);
570 564  
... ... @@ -581,8 +575,8 @@
581 575 {
582 576 struct elevator_queue *e = q->elevator;
583 577  
584   - if (e->ops->elevator_bio_merged_fn)
585   - e->ops->elevator_bio_merged_fn(q, rq, bio);
  578 + if (e->type->ops.elevator_bio_merged_fn)
  579 + e->type->ops.elevator_bio_merged_fn(q, rq, bio);
586 580 }
587 581  
588 582 void elv_requeue_request(struct request_queue *q, struct request *rq)
589 583  
... ... @@ -608,12 +602,12 @@
608 602  
609 603 lockdep_assert_held(q->queue_lock);
610 604  
611   - while (q->elevator->ops->elevator_dispatch_fn(q, 1))
  605 + while (q->elevator->type->ops.elevator_dispatch_fn(q, 1))
612 606 ;
613 607 if (q->nr_sorted && printed++ < 10) {
614 608 printk(KERN_ERR "%s: forced dispatching is broken "
615 609 "(nr_sorted=%u), please report this\n",
616   - q->elevator->elevator_type->elevator_name, q->nr_sorted);
  610 + q->elevator->type->elevator_name, q->nr_sorted);
617 611 }
618 612 }
619 613  
... ... @@ -702,7 +696,7 @@
702 696 * rq cannot be accessed after calling
703 697 * elevator_add_req_fn.
704 698 */
705   - q->elevator->ops->elevator_add_req_fn(q, rq);
  699 + q->elevator->type->ops.elevator_add_req_fn(q, rq);
706 700 break;
707 701  
708 702 case ELEVATOR_INSERT_FLUSH:
... ... @@ -731,8 +725,8 @@
731 725 {
732 726 struct elevator_queue *e = q->elevator;
733 727  
734   - if (e->ops->elevator_latter_req_fn)
735   - return e->ops->elevator_latter_req_fn(q, rq);
  728 + if (e->type->ops.elevator_latter_req_fn)
  729 + return e->type->ops.elevator_latter_req_fn(q, rq);
736 730 return NULL;
737 731 }
738 732  
... ... @@ -740,8 +734,8 @@
740 734 {
741 735 struct elevator_queue *e = q->elevator;
742 736  
743   - if (e->ops->elevator_former_req_fn)
744   - return e->ops->elevator_former_req_fn(q, rq);
  737 + if (e->type->ops.elevator_former_req_fn)
  738 + return e->type->ops.elevator_former_req_fn(q, rq);
745 739 return NULL;
746 740 }
747 741  
... ... @@ -749,10 +743,8 @@
749 743 {
750 744 struct elevator_queue *e = q->elevator;
751 745  
752   - if (e->ops->elevator_set_req_fn)
753   - return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
754   -
755   - rq->elevator_private[0] = NULL;
  746 + if (e->type->ops.elevator_set_req_fn)
  747 + return e->type->ops.elevator_set_req_fn(q, rq, gfp_mask);
756 748 return 0;
757 749 }
758 750  
759 751  
... ... @@ -760,16 +752,16 @@
760 752 {
761 753 struct elevator_queue *e = q->elevator;
762 754  
763   - if (e->ops->elevator_put_req_fn)
764   - e->ops->elevator_put_req_fn(rq);
  755 + if (e->type->ops.elevator_put_req_fn)
  756 + e->type->ops.elevator_put_req_fn(rq);
765 757 }
766 758  
767 759 int elv_may_queue(struct request_queue *q, int rw)
768 760 {
769 761 struct elevator_queue *e = q->elevator;
770 762  
771   - if (e->ops->elevator_may_queue_fn)
772   - return e->ops->elevator_may_queue_fn(q, rw);
  763 + if (e->type->ops.elevator_may_queue_fn)
  764 + return e->type->ops.elevator_may_queue_fn(q, rw);
773 765  
774 766 return ELV_MQUEUE_MAY;
775 767 }
... ... @@ -804,8 +796,8 @@
804 796 if (blk_account_rq(rq)) {
805 797 q->in_flight[rq_is_sync(rq)]--;
806 798 if ((rq->cmd_flags & REQ_SORTED) &&
807   - e->ops->elevator_completed_req_fn)
808   - e->ops->elevator_completed_req_fn(q, rq);
  799 + e->type->ops.elevator_completed_req_fn)
  800 + e->type->ops.elevator_completed_req_fn(q, rq);
809 801 }
810 802 }
811 803  
... ... @@ -823,7 +815,7 @@
823 815  
824 816 e = container_of(kobj, struct elevator_queue, kobj);
825 817 mutex_lock(&e->sysfs_lock);
826   - error = e->ops ? entry->show(e, page) : -ENOENT;
  818 + error = e->type ? entry->show(e, page) : -ENOENT;
827 819 mutex_unlock(&e->sysfs_lock);
828 820 return error;
829 821 }
... ... @@ -841,7 +833,7 @@
841 833  
842 834 e = container_of(kobj, struct elevator_queue, kobj);
843 835 mutex_lock(&e->sysfs_lock);
844   - error = e->ops ? entry->store(e, page, length) : -ENOENT;
  836 + error = e->type ? entry->store(e, page, length) : -ENOENT;
845 837 mutex_unlock(&e->sysfs_lock);
846 838 return error;
847 839 }
848 840  
849 841  
... ... @@ -856,14 +848,13 @@
856 848 .release = elevator_release,
857 849 };
858 850  
859   -int elv_register_queue(struct request_queue *q)
  851 +int __elv_register_queue(struct request_queue *q, struct elevator_queue *e)
860 852 {
861   - struct elevator_queue *e = q->elevator;
862 853 int error;
863 854  
864 855 error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
865 856 if (!error) {
866   - struct elv_fs_entry *attr = e->elevator_type->elevator_attrs;
  857 + struct elv_fs_entry *attr = e->type->elevator_attrs;
867 858 if (attr) {
868 859 while (attr->attr.name) {
869 860 if (sysfs_create_file(&e->kobj, &attr->attr))
870 861  
871 862  
872 863  
873 864  
874 865  
875 866  
876 867  
877 868  
... ... @@ -876,31 +867,55 @@
876 867 }
877 868 return error;
878 869 }
879   -EXPORT_SYMBOL(elv_register_queue);
880 870  
881   -static void __elv_unregister_queue(struct elevator_queue *e)
  871 +int elv_register_queue(struct request_queue *q)
882 872 {
883   - kobject_uevent(&e->kobj, KOBJ_REMOVE);
884   - kobject_del(&e->kobj);
885   - e->registered = 0;
  873 + return __elv_register_queue(q, q->elevator);
886 874 }
  875 +EXPORT_SYMBOL(elv_register_queue);
887 876  
888 877 void elv_unregister_queue(struct request_queue *q)
889 878 {
890   - if (q)
891   - __elv_unregister_queue(q->elevator);
  879 + if (q) {
  880 + struct elevator_queue *e = q->elevator;
  881 +
  882 + kobject_uevent(&e->kobj, KOBJ_REMOVE);
  883 + kobject_del(&e->kobj);
  884 + e->registered = 0;
  885 + }
892 886 }
893 887 EXPORT_SYMBOL(elv_unregister_queue);
894 888  
895   -void elv_register(struct elevator_type *e)
  889 +int elv_register(struct elevator_type *e)
896 890 {
897 891 char *def = "";
898 892  
  893 + /* create icq_cache if requested */
  894 + if (e->icq_size) {
  895 + if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
  896 + WARN_ON(e->icq_align < __alignof__(struct io_cq)))
  897 + return -EINVAL;
  898 +
  899 + snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
  900 + "%s_io_cq", e->elevator_name);
  901 + e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
  902 + e->icq_align, 0, NULL);
  903 + if (!e->icq_cache)
  904 + return -ENOMEM;
  905 + }
  906 +
  907 + /* register, don't allow duplicate names */
899 908 spin_lock(&elv_list_lock);
900   - BUG_ON(elevator_find(e->elevator_name));
  909 + if (elevator_find(e->elevator_name)) {
  910 + spin_unlock(&elv_list_lock);
  911 + if (e->icq_cache)
  912 + kmem_cache_destroy(e->icq_cache);
  913 + return -EBUSY;
  914 + }
901 915 list_add_tail(&e->list, &elv_list);
902 916 spin_unlock(&elv_list_lock);
903 917  
  918 + /* print pretty message */
904 919 if (!strcmp(e->elevator_name, chosen_elevator) ||
905 920 (!*chosen_elevator &&
906 921 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
907 922  
908 923  
909 924  
910 925  
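With elv_register() now able to fail (duplicate name, bad icq_size/icq_align, or icq_cache allocation failure), a scheduler's module init has to propagate the return value; deadline and noop simply return it, and cfq additionally unwinds its cfq_queue cache. A hypothetical scheduler with its own slab would follow the same pattern (all snail_* names are illustrative, matching the example in the iocontext.h comment later in this diff):

static struct kmem_cache *snail_queue_pool;

static int __init snail_init(void)
{
        int ret;

        snail_queue_pool = KMEM_CACHE(snail_queue, 0);
        if (!snail_queue_pool)
                return -ENOMEM;

        ret = elv_register(&snail_elv_type);
        if (ret)
                kmem_cache_destroy(snail_queue_pool);
        return ret;
}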
... ... @@ -908,30 +923,26 @@
908 923  
909 924 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
910 925 def);
  926 + return 0;
911 927 }
912 928 EXPORT_SYMBOL_GPL(elv_register);
913 929  
914 930 void elv_unregister(struct elevator_type *e)
915 931 {
916   - struct task_struct *g, *p;
  932 + /* unregister */
  933 + spin_lock(&elv_list_lock);
  934 + list_del_init(&e->list);
  935 + spin_unlock(&elv_list_lock);
917 936  
918 937 /*
919   - * Iterate every thread in the process to remove the io contexts.
  938 + * Destroy icq_cache if it exists. icq's are RCU managed. Make
  939 + * sure all RCU operations are complete before proceeding.
920 940 */
921   - if (e->ops.trim) {
922   - read_lock(&tasklist_lock);
923   - do_each_thread(g, p) {
924   - task_lock(p);
925   - if (p->io_context)
926   - e->ops.trim(p->io_context);
927   - task_unlock(p);
928   - } while_each_thread(g, p);
929   - read_unlock(&tasklist_lock);
  941 + if (e->icq_cache) {
  942 + rcu_barrier();
  943 + kmem_cache_destroy(e->icq_cache);
  944 + e->icq_cache = NULL;
930 945 }
931   -
932   - spin_lock(&elv_list_lock);
933   - list_del_init(&e->list);
934   - spin_unlock(&elv_list_lock);
935 946 }
936 947 EXPORT_SYMBOL_GPL(elv_unregister);
937 948  
938 949  
939 950  
940 951  
941 952  
942 953  
943 954  
944 955  
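The rcu_barrier() is needed because io_cq's are freed through call_rcu() into the per-elevator icq_cache (see the __rcu_icq_cache/__rcu_head union in the iocontext.h hunk later in this diff); destroying the cache before those callbacks have run would pull it out from under them, and synchronize_rcu() alone would not wait for the frees themselves. A rough sketch of the free path the barrier waits for (the real code lives in block/blk-ioc.c and is not shown here; names are assumed):

static void icq_free_icq_rcu(struct rcu_head *head)
{
        struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);

        kmem_cache_free(icq->__rcu_icq_cache, icq);
}

/* on unlink, after stashing the cache pointer in the union:
 *      icq->__rcu_icq_cache = icq->q->elevator->type->icq_cache;
 *      call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
 */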
... ... @@ -944,54 +955,41 @@
944 955 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
945 956 {
946 957 struct elevator_queue *old_elevator, *e;
947   - void *data;
948 958 int err;
949 959  
950   - /*
951   - * Allocate new elevator
952   - */
  960 + /* allocate new elevator */
953 961 e = elevator_alloc(q, new_e);
954 962 if (!e)
955 963 return -ENOMEM;
956 964  
957   - data = elevator_init_queue(q, e);
958   - if (!data) {
  965 + err = elevator_init_queue(q, e);
  966 + if (err) {
959 967 kobject_put(&e->kobj);
960   - return -ENOMEM;
  968 + return err;
961 969 }
962 970  
963   - /*
964   - * Turn on BYPASS and drain all requests w/ elevator private data
965   - */
  971 + /* turn on BYPASS and drain all requests w/ elevator private data */
966 972 elv_quiesce_start(q);
967 973  
968   - /*
969   - * Remember old elevator.
970   - */
971   - old_elevator = q->elevator;
972   -
973   - /*
974   - * attach and start new elevator
975   - */
976   - spin_lock_irq(q->queue_lock);
977   - elevator_attach(q, e, data);
978   - spin_unlock_irq(q->queue_lock);
979   -
980   - if (old_elevator->registered) {
981   - __elv_unregister_queue(old_elevator);
982   -
983   - err = elv_register_queue(q);
  974 + /* unregister old queue, register new one and kill old elevator */
  975 + if (q->elevator->registered) {
  976 + elv_unregister_queue(q);
  977 + err = __elv_register_queue(q, e);
984 978 if (err)
985 979 goto fail_register;
986 980 }
987 981  
988   - /*
989   - * finally exit old elevator and turn off BYPASS.
990   - */
  982 + /* done, clear io_cq's, switch elevators and turn off BYPASS */
  983 + spin_lock_irq(q->queue_lock);
  984 + ioc_clear_queue(q);
  985 + old_elevator = q->elevator;
  986 + q->elevator = e;
  987 + spin_unlock_irq(q->queue_lock);
  988 +
991 989 elevator_exit(old_elevator);
992 990 elv_quiesce_end(q);
993 991  
994   - blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name);
  992 + blk_add_trace_msg(q, "elv switch: %s", e->type->elevator_name);
995 993  
996 994 return 0;
997 995  
... ... @@ -1001,7 +999,6 @@
1001 999 * one again (along with re-adding the sysfs dir)
1002 1000 */
1003 1001 elevator_exit(e);
1004   - q->elevator = old_elevator;
1005 1002 elv_register_queue(q);
1006 1003 elv_quiesce_end(q);
1007 1004  
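The ioc_clear_queue() call added to the switch path is what keeps the simplified sequence safe: with the queue lock held and before q->elevator is repointed, it detaches every io_cq hanging off q->icq_list so nothing created by the outgoing elevator survives the swap. Roughly what it does (the real code is in block/blk-ioc.c, not in this hunk; the inner helper name is assumed):

void ioc_clear_queue(struct request_queue *q)
{
        lockdep_assert_held(q->queue_lock);

        while (!list_empty(&q->icq_list)) {
                struct io_cq *icq = list_entry(q->icq_list.next,
                                               struct io_cq, q_node);
                struct io_context *ioc = icq->ioc;

                spin_lock(&ioc->lock);  /* ioc lock nests inside q lock */
                ioc_exit_icq(icq);      /* exit_icq_fn, then unlink from both lists */
                spin_unlock(&ioc->lock);
        }
}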
... ... @@ -1026,7 +1023,7 @@
1026 1023 return -EINVAL;
1027 1024 }
1028 1025  
1029   - if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
  1026 + if (!strcmp(elevator_name, q->elevator->type->elevator_name)) {
1030 1027 elevator_put(e);
1031 1028 return 0;
1032 1029 }
... ... @@ -1061,7 +1058,7 @@
1061 1058 if (!q->elevator || !blk_queue_stackable(q))
1062 1059 return sprintf(name, "none\n");
1063 1060  
1064   - elv = e->elevator_type;
  1061 + elv = e->type;
1065 1062  
1066 1063 spin_lock(&elv_list_lock);
1067 1064 list_for_each_entry(__e, &elv_list, list) {
block/genhd.c
... ... @@ -614,7 +614,7 @@
614 614 * Take an extra ref on queue which will be put on disk_release()
615 615 * so that it sticks around as long as @disk is there.
616 616 */
617   - WARN_ON_ONCE(blk_get_queue(disk->queue));
  617 + WARN_ON_ONCE(!blk_get_queue(disk->queue));
618 618  
619 619 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
620 620 "bdi");
block/ioctl.c
... ... @@ -296,6 +296,8 @@
296 296 return put_uint(arg, bdev_discard_zeroes_data(bdev));
297 297 case BLKSECTGET:
298 298 return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
  299 + case BLKROTATIONAL:
  300 + return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev)));
299 301 case BLKRASET:
300 302 case BLKFRASET:
301 303 if(!capable(CAP_SYS_ADMIN))
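A minimal user-space sketch of the new ioctl (assumes a <linux/fs.h> carrying the BLKROTATIONAL definition added later in this diff; the kernel side reports !blk_queue_nonrot(), i.e. 1 for spinning media, 0 for SSDs):

#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
        unsigned short rot;
        int fd;

        if (argc != 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || ioctl(fd, BLKROTATIONAL, &rot) < 0) {
                perror("BLKROTATIONAL");
                return 1;
        }
        printf("%s: %srotational\n", argv[1], rot ? "" : "non-");
        return 0;
}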
block/noop-iosched.c
... ... @@ -94,9 +94,7 @@
94 94  
95 95 static int __init noop_init(void)
96 96 {
97   - elv_register(&elevator_noop);
98   -
99   - return 0;
  97 + return elv_register(&elevator_noop);
100 98 }
101 99  
102 100 static void __exit noop_exit(void)
drivers/block/sx8.c
... ... @@ -619,8 +619,10 @@
619 619 host->state == HST_DEV_SCAN);
620 620 spin_unlock_irq(&host->lock);
621 621  
622   - DPRINTK("blk_insert_request, tag == %u\n", idx);
623   - blk_insert_request(host->oob_q, crq->rq, 1, crq);
  622 + DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
  623 + crq->rq->cmd_type = REQ_TYPE_SPECIAL;
  624 + crq->rq->special = crq;
  625 + blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
624 626  
625 627 return 0;
626 628  
... ... @@ -658,8 +660,10 @@
658 660 BUG_ON(rc < 0);
659 661 crq->msg_bucket = (u32) rc;
660 662  
661   - DPRINTK("blk_insert_request, tag == %u\n", idx);
662   - blk_insert_request(host->oob_q, crq->rq, 1, crq);
  663 + DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
  664 + crq->rq->cmd_type = REQ_TYPE_SPECIAL;
  665 + crq->rq->special = crq;
  666 + blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
663 667  
664 668 return 0;
665 669 }
drivers/md/dm-table.c
... ... @@ -699,7 +699,7 @@
699 699 while (i < dm_table_get_num_targets(table)) {
700 700 ti = dm_table_get_target(table, i++);
701 701  
702   - blk_set_default_limits(&ti_limits);
  702 + blk_set_stacking_limits(&ti_limits);
703 703  
704 704 /* combine all target devices' limits */
705 705 if (ti->type->iterate_devices)
706 706  
... ... @@ -1221,10 +1221,10 @@
1221 1221 struct queue_limits ti_limits;
1222 1222 unsigned i = 0;
1223 1223  
1224   - blk_set_default_limits(limits);
  1224 + blk_set_stacking_limits(limits);
1225 1225  
1226 1226 while (i < dm_table_get_num_targets(table)) {
1227   - blk_set_default_limits(&ti_limits);
  1227 + blk_set_stacking_limits(&ti_limits);
1228 1228  
1229 1229 ti = dm_table_get_target(table, i++);
1230 1230  
drivers/md/md.c
... ... @@ -4666,6 +4666,7 @@
4666 4666 mddev->queue->queuedata = mddev;
4667 4667  
4668 4668 blk_queue_make_request(mddev->queue, md_make_request);
  4669 + blk_set_stacking_limits(&mddev->queue->limits);
4669 4670  
4670 4671 disk = alloc_disk(1 << shift);
4671 4672 if (!disk) {
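blk_set_stacking_limits() gives dm and md a permissive starting point so that combining member devices with blk_stack_limits()/bdev_stack_limits() only ever tightens the result, instead of being capped by the conservative defaults. An illustrative pattern for a stacking driver (stk_dev/stk_member are made-up types; bdev_stack_limits() is the existing helper declared in blkdev.h):

static void stk_compute_limits(struct stk_dev *stk, struct queue_limits *lim)
{
        struct stk_member *m;

        blk_set_stacking_limits(lim);           /* permissive seed */
        list_for_each_entry(m, &stk->members, node)
                if (bdev_stack_limits(lim, m->bdev, 0))
                        pr_warn("stk: member device has misaligned limits\n");
}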
drivers/scsi/scsi_scan.c
... ... @@ -297,7 +297,7 @@
297 297 kfree(sdev);
298 298 goto out;
299 299 }
300   - blk_get_queue(sdev->request_queue);
  300 + WARN_ON_ONCE(!blk_get_queue(sdev->request_queue));
301 301 sdev->request_queue->queuedata = sdev;
302 302 scsi_adjust_queue_depth(sdev, 0, sdev->host->cmd_per_lun);
303 303  
fs/ioprio.c
... ... @@ -48,28 +48,12 @@
48 48 if (err)
49 49 return err;
50 50  
51   - task_lock(task);
52   - do {
53   - ioc = task->io_context;
54   - /* see wmb() in current_io_context() */
55   - smp_read_barrier_depends();
56   - if (ioc)
57   - break;
58   -
59   - ioc = alloc_io_context(GFP_ATOMIC, -1);
60   - if (!ioc) {
61   - err = -ENOMEM;
62   - break;
63   - }
64   - task->io_context = ioc;
65   - } while (1);
66   -
67   - if (!err) {
68   - ioc->ioprio = ioprio;
69   - ioc->ioprio_changed = 1;
  51 + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
  52 + if (ioc) {
  53 + ioc_ioprio_changed(ioc, ioprio);
  54 + put_io_context(ioc, NULL);
70 55 }
71 56  
72   - task_unlock(task);
73 57 return err;
74 58 }
75 59 EXPORT_SYMBOL_GPL(set_task_ioprio);
fs/mpage.c
... ... @@ -371,10 +371,7 @@
371 371 sector_t last_block_in_bio = 0;
372 372 struct buffer_head map_bh;
373 373 unsigned long first_logical_block = 0;
374   - struct blk_plug plug;
375 374  
376   - blk_start_plug(&plug);
377   -
378 375 map_bh.b_state = 0;
379 376 map_bh.b_size = 0;
380 377 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
... ... @@ -395,7 +392,6 @@
395 392 BUG_ON(!list_empty(pages));
396 393 if (bio)
397 394 mpage_bio_submit(READ, bio);
398   - blk_finish_plug(&plug);
399 395 return 0;
400 396 }
401 397 EXPORT_SYMBOL(mpage_readpages);
include/linux/bio.h
... ... @@ -515,24 +515,64 @@
515 515  
516 516 #else /* CONFIG_BLK_DEV_INTEGRITY */
517 517  
518   -#define bio_integrity(a) (0)
519   -#define bioset_integrity_create(a, b) (0)
520   -#define bio_integrity_prep(a) (0)
521   -#define bio_integrity_enabled(a) (0)
  518 +static inline int bio_integrity(struct bio *bio)
  519 +{
  520 + return 0;
  521 +}
  522 +
  523 +static inline int bio_integrity_enabled(struct bio *bio)
  524 +{
  525 + return 0;
  526 +}
  527 +
  528 +static inline int bioset_integrity_create(struct bio_set *bs, int pool_size)
  529 +{
  530 + return 0;
  531 +}
  532 +
  533 +static inline void bioset_integrity_free (struct bio_set *bs)
  534 +{
  535 + return;
  536 +}
  537 +
  538 +static inline int bio_integrity_prep(struct bio *bio)
  539 +{
  540 + return 0;
  541 +}
  542 +
  543 +static inline void bio_integrity_free(struct bio *bio, struct bio_set *bs)
  544 +{
  545 + return;
  546 +}
  547 +
522 548 static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
523 549 gfp_t gfp_mask, struct bio_set *bs)
524 550 {
525 551 return 0;
526 552 }
527   -#define bioset_integrity_free(a) do { } while (0)
528   -#define bio_integrity_free(a, b) do { } while (0)
529   -#define bio_integrity_endio(a, b) do { } while (0)
530   -#define bio_integrity_advance(a, b) do { } while (0)
531   -#define bio_integrity_trim(a, b, c) do { } while (0)
532   -#define bio_integrity_split(a, b, c) do { } while (0)
533   -#define bio_integrity_set_tag(a, b, c) do { } while (0)
534   -#define bio_integrity_get_tag(a, b, c) do { } while (0)
535   -#define bio_integrity_init(a) do { } while (0)
  553 +
  554 +static inline void bio_integrity_split(struct bio *bio, struct bio_pair *bp,
  555 + int sectors)
  556 +{
  557 + return;
  558 +}
  559 +
  560 +static inline void bio_integrity_advance(struct bio *bio,
  561 + unsigned int bytes_done)
  562 +{
  563 + return;
  564 +}
  565 +
  566 +static inline void bio_integrity_trim(struct bio *bio, unsigned int offset,
  567 + unsigned int sectors)
  568 +{
  569 + return;
  570 +}
  571 +
  572 +static inline void bio_integrity_init(void)
  573 +{
  574 + return;
  575 +}
536 576  
537 577 #endif /* CONFIG_BLK_DEV_INTEGRITY */
538 578  
include/linux/blkdev.h
... ... @@ -111,11 +111,15 @@
111 111 * Three pointers are available for the IO schedulers, if they need
112 112 * more they have to dynamically allocate it. Flush requests are
113 113 * never put on the IO scheduler. So let the flush fields share
114   - * space with the three elevator_private pointers.
  114 + * space with the elevator data.
115 115 */
116 116 union {
117   - void *elevator_private[3];
118 117 struct {
  118 + struct io_cq *icq;
  119 + void *priv[2];
  120 + } elv;
  121 +
  122 + struct {
119 123 unsigned int seq;
120 124 struct list_head list;
121 125 rq_end_io_fn *saved_end_io;
... ... @@ -311,6 +315,12 @@
311 315 unsigned long queue_flags;
312 316  
313 317 /*
  318 + * ida allocated id for this queue. Used to index queues from
  319 + * ioctx.
  320 + */
  321 + int id;
  322 +
  323 + /*
314 324 * queue needs bounce pages for pages above this limit
315 325 */
316 326 gfp_t bounce_gfp;
... ... @@ -351,6 +361,8 @@
351 361 struct timer_list timeout;
352 362 struct list_head timeout_list;
353 363  
  364 + struct list_head icq_list;
  365 +
354 366 struct queue_limits limits;
355 367  
356 368 /*
... ... @@ -387,6 +399,9 @@
387 399 /* Throttle data */
388 400 struct throtl_data *td;
389 401 #endif
  402 +#ifdef CONFIG_LOCKDEP
  403 + int ioc_release_depth;
  404 +#endif
390 405 };
391 406  
392 407 #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
... ... @@ -481,6 +496,7 @@
481 496  
482 497 #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
483 498 #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
  499 +#define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
484 500 #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
485 501 #define blk_queue_noxmerges(q) \
486 502 test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
... ... @@ -660,7 +676,6 @@
660 676 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
661 677 extern struct request *blk_make_request(struct request_queue *, struct bio *,
662 678 gfp_t);
663   -extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
664 679 extern void blk_requeue_request(struct request_queue *, struct request *);
665 680 extern void blk_add_request_payload(struct request *rq, struct page *page,
666 681 unsigned int len);
... ... @@ -829,6 +844,7 @@
829 844 extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
830 845 extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
831 846 extern void blk_set_default_limits(struct queue_limits *lim);
  847 +extern void blk_set_stacking_limits(struct queue_limits *lim);
832 848 extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
833 849 sector_t offset);
834 850 extern int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev,
... ... @@ -859,7 +875,7 @@
859 875 extern void blk_dump_rq_flags(struct request *, char *);
860 876 extern long nr_blockdev_pages(void);
861 877  
862   -int blk_get_queue(struct request_queue *);
  878 +bool __must_check blk_get_queue(struct request_queue *);
863 879 struct request_queue *blk_alloc_queue(gfp_t);
864 880 struct request_queue *blk_alloc_queue_node(gfp_t, int);
865 881 extern void blk_put_queue(struct request_queue *);
... ... @@ -1282,19 +1298,70 @@
1282 1298  
1283 1299 #else /* CONFIG_BLK_DEV_INTEGRITY */
1284 1300  
1285   -#define blk_integrity_rq(rq) (0)
1286   -#define blk_rq_count_integrity_sg(a, b) (0)
1287   -#define blk_rq_map_integrity_sg(a, b, c) (0)
1288   -#define bdev_get_integrity(a) (0)
1289   -#define blk_get_integrity(a) (0)
1290   -#define blk_integrity_compare(a, b) (0)
1291   -#define blk_integrity_register(a, b) (0)
1292   -#define blk_integrity_unregister(a) do { } while (0)
1293   -#define blk_queue_max_integrity_segments(a, b) do { } while (0)
1294   -#define queue_max_integrity_segments(a) (0)
1295   -#define blk_integrity_merge_rq(a, b, c) (0)
1296   -#define blk_integrity_merge_bio(a, b, c) (0)
1297   -#define blk_integrity_is_initialized(a) (0)
  1301 +struct bio;
  1302 +struct block_device;
  1303 +struct gendisk;
  1304 +struct blk_integrity;
  1305 +
  1306 +static inline int blk_integrity_rq(struct request *rq)
  1307 +{
  1308 + return 0;
  1309 +}
  1310 +static inline int blk_rq_count_integrity_sg(struct request_queue *q,
  1311 + struct bio *b)
  1312 +{
  1313 + return 0;
  1314 +}
  1315 +static inline int blk_rq_map_integrity_sg(struct request_queue *q,
  1316 + struct bio *b,
  1317 + struct scatterlist *s)
  1318 +{
  1319 + return 0;
  1320 +}
  1321 +static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
  1322 +{
  1323 + return 0;
  1324 +}
  1325 +static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
  1326 +{
  1327 + return NULL;
  1328 +}
  1329 +static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b)
  1330 +{
  1331 + return 0;
  1332 +}
  1333 +static inline int blk_integrity_register(struct gendisk *d,
  1334 + struct blk_integrity *b)
  1335 +{
  1336 + return 0;
  1337 +}
  1338 +static inline void blk_integrity_unregister(struct gendisk *d)
  1339 +{
  1340 +}
  1341 +static inline void blk_queue_max_integrity_segments(struct request_queue *q,
  1342 + unsigned int segs)
  1343 +{
  1344 +}
  1345 +static inline unsigned short queue_max_integrity_segments(struct request_queue *q)
  1346 +{
  1347 + return 0;
  1348 +}
  1349 +static inline int blk_integrity_merge_rq(struct request_queue *rq,
  1350 + struct request *r1,
  1351 + struct request *r2)
  1352 +{
  1353 + return 0;
  1354 +}
  1355 +static inline int blk_integrity_merge_bio(struct request_queue *rq,
  1356 + struct request *r,
  1357 + struct bio *b)
  1358 +{
  1359 + return 0;
  1360 +}
  1361 +static inline bool blk_integrity_is_initialized(struct gendisk *g)
  1362 +{
  1363 + return 0;
  1364 +}
1298 1365  
1299 1366 #endif /* CONFIG_BLK_DEV_INTEGRITY */
1300 1367  
include/linux/elevator.h
... ... @@ -5,6 +5,8 @@
5 5  
6 6 #ifdef CONFIG_BLOCK
7 7  
  8 +struct io_cq;
  9 +
8 10 typedef int (elevator_merge_fn) (struct request_queue *, struct request **,
9 11 struct bio *);
10 12  
... ... @@ -24,6 +26,8 @@
24 26 typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *);
25 27 typedef int (elevator_may_queue_fn) (struct request_queue *, int);
26 28  
  29 +typedef void (elevator_init_icq_fn) (struct io_cq *);
  30 +typedef void (elevator_exit_icq_fn) (struct io_cq *);
27 31 typedef int (elevator_set_req_fn) (struct request_queue *, struct request *, gfp_t);
28 32 typedef void (elevator_put_req_fn) (struct request *);
29 33 typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
... ... @@ -56,6 +60,9 @@
56 60 elevator_request_list_fn *elevator_former_req_fn;
57 61 elevator_request_list_fn *elevator_latter_req_fn;
58 62  
  63 + elevator_init_icq_fn *elevator_init_icq_fn; /* see iocontext.h */
  64 + elevator_exit_icq_fn *elevator_exit_icq_fn; /* ditto */
  65 +
59 66 elevator_set_req_fn *elevator_set_req_fn;
60 67 elevator_put_req_fn *elevator_put_req_fn;
61 68  
... ... @@ -63,7 +70,6 @@
63 70  
64 71 elevator_init_fn *elevator_init_fn;
65 72 elevator_exit_fn *elevator_exit_fn;
66   - void (*trim)(struct io_context *);
67 73 };
68 74  
69 75 #define ELV_NAME_MAX (16)
70 76  
71 77  
... ... @@ -79,11 +85,20 @@
79 85 */
80 86 struct elevator_type
81 87 {
82   - struct list_head list;
  88 + /* managed by elevator core */
  89 + struct kmem_cache *icq_cache;
  90 +
  91 + /* fields provided by elevator implementation */
83 92 struct elevator_ops ops;
  93 + size_t icq_size; /* see iocontext.h */
  94 + size_t icq_align; /* ditto */
84 95 struct elv_fs_entry *elevator_attrs;
85 96 char elevator_name[ELV_NAME_MAX];
86 97 struct module *elevator_owner;
  98 +
  99 + /* managed by elevator core */
  100 + char icq_cache_name[ELV_NAME_MAX + 5]; /* elvname + "_io_cq" */
  101 + struct list_head list;
87 102 };
88 103  
89 104 /*
90 105  
... ... @@ -91,10 +106,9 @@
91 106 */
92 107 struct elevator_queue
93 108 {
94   - struct elevator_ops *ops;
  109 + struct elevator_type *type;
95 110 void *elevator_data;
96 111 struct kobject kobj;
97   - struct elevator_type *elevator_type;
98 112 struct mutex sysfs_lock;
99 113 struct hlist_head *hash;
100 114 unsigned int registered:1;
... ... @@ -129,7 +143,7 @@
129 143 /*
130 144 * io scheduler registration
131 145 */
132   -extern void elv_register(struct elevator_type *);
  146 +extern int elv_register(struct elevator_type *);
133 147 extern void elv_unregister(struct elevator_type *);
134 148  
135 149 /*
... ... @@ -196,23 +210,6 @@
196 210 list_del_init(&(rq)->queuelist); \
197 211 INIT_LIST_HEAD(&(rq)->csd.list); \
198 212 } while (0)
199   -
200   -/*
201   - * io context count accounting
202   - */
203   -#define elv_ioc_count_mod(name, __val) this_cpu_add(name, __val)
204   -#define elv_ioc_count_inc(name) this_cpu_inc(name)
205   -#define elv_ioc_count_dec(name) this_cpu_dec(name)
206   -
207   -#define elv_ioc_count_read(name) \
208   -({ \
209   - unsigned long __val = 0; \
210   - int __cpu; \
211   - smp_wmb(); \
212   - for_each_possible_cpu(__cpu) \
213   - __val += per_cpu(name, __cpu); \
214   - __val; \
215   -})
216 213  
217 214 #endif /* CONFIG_BLOCK */
218 215 #endif
include/linux/fs.h
... ... @@ -319,6 +319,7 @@
319 319 #define BLKPBSZGET _IO(0x12,123)
320 320 #define BLKDISCARDZEROES _IO(0x12,124)
321 321 #define BLKSECDISCARD _IO(0x12,125)
  322 +#define BLKROTATIONAL _IO(0x12,126)
322 323  
323 324 #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
324 325 #define FIBMAP _IO(0x00,1) /* bmap access */
include/linux/iocontext.h
... ... @@ -3,32 +3,92 @@
3 3  
4 4 #include <linux/radix-tree.h>
5 5 #include <linux/rcupdate.h>
  6 +#include <linux/workqueue.h>
6 7  
7   -struct cfq_queue;
8   -struct cfq_ttime {
9   - unsigned long last_end_request;
10   -
11   - unsigned long ttime_total;
12   - unsigned long ttime_samples;
13   - unsigned long ttime_mean;
  8 +enum {
  9 + ICQ_IOPRIO_CHANGED,
  10 + ICQ_CGROUP_CHANGED,
14 11 };
15 12  
16   -struct cfq_io_context {
17   - void *key;
  13 +/*
  14 + * An io_cq (icq) is association between an io_context (ioc) and a
  15 + * request_queue (q). This is used by elevators which need to track
  16 + * information per ioc - q pair.
  17 + *
  18 + * Elevator can request use of icq by setting elevator_type->icq_size and
  19 + * ->icq_align. Both size and align must be larger than that of struct
  20 + * io_cq and elevator can use the tail area for private information. The
  21 + * recommended way to do this is defining a struct which contains io_cq as
  22 + * the first member followed by private members and using its size and
  23 + * align. For example,
  24 + *
  25 + * struct snail_io_cq {
  26 + * struct io_cq icq;
  27 + * int poke_snail;
  28 + * int feed_snail;
  29 + * };
  30 + *
  31 + * struct elevator_type snail_elv_type {
  32 + * .ops = { ... },
  33 + * .icq_size = sizeof(struct snail_io_cq),
  34 + * .icq_align = __alignof__(struct snail_io_cq),
  35 + * ...
  36 + * };
  37 + *
  38 + * If icq_size is set, block core will manage icq's. All requests will
  39 + * have its ->elv.icq field set before elevator_ops->elevator_set_req_fn()
  40 + * is called and be holding a reference to the associated io_context.
  41 + *
  42 + * Whenever a new icq is created, elevator_ops->elevator_init_icq_fn() is
  43 + * called and, on destruction, ->elevator_exit_icq_fn(). Both functions
  44 + * are called with both the associated io_context and queue locks held.
  45 + *
  46 + * Elevator is allowed to lookup icq using ioc_lookup_icq() while holding
  47 + * queue lock but the returned icq is valid only until the queue lock is
  48 + * released. Elevators can not and should not try to create or destroy
  49 + * icq's.
  50 + *
  51 + * As icq's are linked from both ioc and q, the locking rules are a bit
  52 + * complex.
  53 + *
  54 + * - ioc lock nests inside q lock.
  55 + *
  56 + * - ioc->icq_list and icq->ioc_node are protected by ioc lock.
  57 + * q->icq_list and icq->q_node by q lock.
  58 + *
  59 + * - ioc->icq_tree and ioc->icq_hint are protected by ioc lock, while icq
  60 + * itself is protected by q lock. However, both the indexes and icq
  61 + * itself are also RCU managed and lookup can be performed holding only
  62 + * the q lock.
  63 + *
  64 + * - icq's are not reference counted. They are destroyed when either the
  65 + * ioc or q goes away. Each request with icq set holds an extra
  66 + * reference to ioc to ensure it stays until the request is completed.
  67 + *
  68 + * - Linking and unlinking icq's are performed while holding both ioc and q
  69 + * locks. Due to the lock ordering, q exit is simple but ioc exit
  70 + * requires reverse-order double lock dance.
  71 + */
  72 +struct io_cq {
  73 + struct request_queue *q;
  74 + struct io_context *ioc;
18 75  
19   - struct cfq_queue *cfqq[2];
  76 + /*
  77 + * q_node and ioc_node link io_cq through icq_list of q and ioc
  78 + * respectively. Both fields are unused once ioc_exit_icq() is
  79 + * called and shared with __rcu_icq_cache and __rcu_head which are
  80 + * used for RCU free of io_cq.
  81 + */
  82 + union {
  83 + struct list_head q_node;
  84 + struct kmem_cache *__rcu_icq_cache;
  85 + };
  86 + union {
  87 + struct hlist_node ioc_node;
  88 + struct rcu_head __rcu_head;
  89 + };
20 90  
21   - struct io_context *ioc;
22   -
23   - struct cfq_ttime ttime;
24   -
25   - struct list_head queue_list;
26   - struct hlist_node cic_list;
27   -
28   - void (*dtor)(struct io_context *); /* destructor */
29   - void (*exit)(struct io_context *); /* called on task exit */
30   -
31   - struct rcu_head rcu_head;
  91 + unsigned long changed;
32 92 };
33 93  
34 94 /*
35 95  
36 96  
... ... @@ -43,21 +103,18 @@
43 103 spinlock_t lock;
44 104  
45 105 unsigned short ioprio;
46   - unsigned short ioprio_changed;
47 106  
48   -#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
49   - unsigned short cgroup_changed;
50   -#endif
51   -
52 107 /*
53 108 * For request batching
54 109 */
55 110 int nr_batch_requests; /* Number of requests left in the batch */
56 111 unsigned long last_waited; /* Time last woken after wait for request */
57 112  
58   - struct radix_tree_root radix_root;
59   - struct hlist_head cic_list;
60   - void __rcu *ioc_data;
  113 + struct radix_tree_root icq_tree;
  114 + struct io_cq __rcu *icq_hint;
  115 + struct hlist_head icq_list;
  116 +
  117 + struct work_struct release_work;
61 118 };
62 119  
63 120 static inline struct io_context *ioc_task_link(struct io_context *ioc)
64 121  
65 122  
66 123  
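icq_tree is a radix tree keyed by the new request_queue->id (added in the blkdev.h hunk above), with icq_hint acting as a last-hit cache, so an elevator can find the current task's icq for its queue while holding only the queue lock. Roughly what the lookup helper in block/blk-ioc.c looks like (simplified; the real one also refreshes icq_hint under ioc->lock):

struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
{
        struct io_cq *icq;

        lockdep_assert_held(q->queue_lock);

        rcu_read_lock();
        icq = rcu_dereference(ioc->icq_hint);           /* last-hit cache */
        if (!icq || icq->q != q)
                icq = radix_tree_lookup(&ioc->icq_tree, q->id);
        if (icq && icq->q != q)
                icq = NULL;                             /* defensive: stale entry */
        rcu_read_unlock();
        return icq;
}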
... ... @@ -76,20 +133,17 @@
76 133  
77 134 struct task_struct;
78 135 #ifdef CONFIG_BLOCK
79   -int put_io_context(struct io_context *ioc);
  136 +void put_io_context(struct io_context *ioc, struct request_queue *locked_q);
80 137 void exit_io_context(struct task_struct *task);
81   -struct io_context *get_io_context(gfp_t gfp_flags, int node);
82   -struct io_context *alloc_io_context(gfp_t gfp_flags, int node);
  138 +struct io_context *get_task_io_context(struct task_struct *task,
  139 + gfp_t gfp_flags, int node);
  140 +void ioc_ioprio_changed(struct io_context *ioc, int ioprio);
  141 +void ioc_cgroup_changed(struct io_context *ioc);
83 142 #else
84   -static inline void exit_io_context(struct task_struct *task)
85   -{
86   -}
87   -
88 143 struct io_context;
89   -static inline int put_io_context(struct io_context *ioc)
90   -{
91   - return 1;
92   -}
  144 +static inline void put_io_context(struct io_context *ioc,
  145 + struct request_queue *locked_q) { }
  146 +static inline void exit_io_context(struct task_struct *task) { }
93 147 #endif
94 148  
95 149 #endif
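Because block core attaches the icq (and its io_context reference) to the request before elevator_set_req_fn() runs, an elevator that embeds io_cq as the first member of its private struct, as recommended in the comment above, only needs a container_of(). A minimal sketch continuing the snail example from that comment (snail_set_request() is hypothetical):

static int snail_set_request(struct request_queue *q, struct request *rq,
                             gfp_t gfp_mask)
{
        struct snail_io_cq *sic =
                container_of(rq->elv.icq, struct snail_io_cq, icq);

        sic->poke_snail++;      /* per ioc-queue pair state */
        return 0;
}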
kernel/fork.c
... ... @@ -873,6 +873,7 @@
873 873 {
874 874 #ifdef CONFIG_BLOCK
875 875 struct io_context *ioc = current->io_context;
  876 + struct io_context *new_ioc;
876 877  
877 878 if (!ioc)
878 879 return 0;
879 880  
... ... @@ -884,11 +885,12 @@
884 885 if (unlikely(!tsk->io_context))
885 886 return -ENOMEM;
886 887 } else if (ioprio_valid(ioc->ioprio)) {
887   - tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
888   - if (unlikely(!tsk->io_context))
  888 + new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
  889 + if (unlikely(!new_ioc))
889 890 return -ENOMEM;
890 891  
891   - tsk->io_context->ioprio = ioc->ioprio;
  892 + new_ioc->ioprio = ioc->ioprio;
  893 + put_io_context(new_ioc, NULL);
892 894 }
893 895 #endif
894 896 return 0;