commit bc1c116974a5c3f498112a6f175d3e4a8cd5bdbc
Author:    Jens Axboe
Committer: Linus Torvalds
Parent:    26e780e8ef

[PATCH] elevator switching race

There's a race between shutting down one io scheduler and firing up the
next, during which new io can enter the queue and cause the io scheduler
to be invoked with bad or NULL private data.

To fix this, we would need to hold the queue lock for a bit longer.
Unfortunately we cannot do that, since the elevator init must run without
the lock held; this isn't easily fixable without also changing the
mempool API.  So split the initialization into two parts: an alloc/init
operation and an attach operation.  Then we can preallocate the io
scheduler and related structures, and run the attach inside the lock
after we detach the old one (see the sketch below the diffstat).

This patch has survived 30 minutes of once-per-second io scheduler
switching under a very busy io load.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

6 files changed, 54 insertions(+), 46 deletions(-)
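In outline, the reworked elevator_switch() in block/elevator.c now works as sketched below: the sleeping alloc/init step runs first, without the queue lock, and the attach step (pure pointer assignment) runs under the lock, so no request can ever see a half-initialized elevator. This is a simplified sketch for orientation, not the literal patched function: the elevator_alloc() helper name is assumed, and the bypass/drain step and the register-failure rollback are abbreviated.

	static int elevator_switch(request_queue_t *q, struct elevator_type *new_e)
	{
		elevator_t *old_elevator, *e;
		void *data;

		e = elevator_alloc(new_e);	/* assumed helper: allocates the elevator_t */
		if (!e)
			return 0;

		/*
		 * Step 1: alloc/init.  May sleep (kmalloc, mempool_create),
		 * so it must run without the queue lock held.
		 */
		data = elevator_init_queue(q, e);
		if (!data) {
			kobject_put(&e->kobj);
			return 0;
		}

		spin_lock_irq(q->queue_lock);
		/* ... set QUEUE_FLAG_ELVSWITCH and drain the old elevator ... */
		old_elevator = q->elevator;

		/*
		 * Step 2: attach.  Just two pointer stores, safe under the
		 * lock, so io arriving now sees fully initialized data.
		 */
		elevator_attach(q, e, data);
		spin_unlock_irq(q->queue_lock);

		__elv_unregister_queue(old_elevator);

		/* on registration failure the patch rolls back to old_elevator (omitted) */
		return elv_register_queue(q) == 0;
	}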

block/as-iosched.c
@@ -1648,17 +1648,17 @@
  * initialize elevator private data (as_data), and alloc a arq for
  * each request on the free lists
  */
-static int as_init_queue(request_queue_t *q, elevator_t *e)
+static void *as_init_queue(request_queue_t *q, elevator_t *e)
 {
         struct as_data *ad;
         int i;
 
         if (!arq_pool)
-                return -ENOMEM;
+                return NULL;
 
         ad = kmalloc_node(sizeof(*ad), GFP_KERNEL, q->node);
         if (!ad)
-                return -ENOMEM;
+                return NULL;
         memset(ad, 0, sizeof(*ad));
 
         ad->q = q; /* Identify what queue the data belongs to */
@@ -1667,7 +1667,7 @@
                                 GFP_KERNEL, q->node);
         if (!ad->hash) {
                 kfree(ad);
-                return -ENOMEM;
+                return NULL;
         }
 
         ad->arq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
@@ -1675,7 +1675,7 @@
         if (!ad->arq_pool) {
                 kfree(ad->hash);
                 kfree(ad);
-                return -ENOMEM;
+                return NULL;
         }
 
         /* anticipatory scheduling helpers */
@@ -1696,14 +1696,13 @@
         ad->antic_expire = default_antic_expire;
         ad->batch_expire[REQ_SYNC] = default_read_batch_expire;
         ad->batch_expire[REQ_ASYNC] = default_write_batch_expire;
-        e->elevator_data = ad;
 
         ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC];
         ad->write_batch_count = ad->batch_expire[REQ_ASYNC] / 10;
         if (ad->write_batch_count < 2)
                 ad->write_batch_count = 2;
 
-        return 0;
+        return ad;
 }
 
 /*
block/cfq-iosched.c
@@ -2251,14 +2251,14 @@
         kfree(cfqd);
 }
 
-static int cfq_init_queue(request_queue_t *q, elevator_t *e)
+static void *cfq_init_queue(request_queue_t *q, elevator_t *e)
 {
         struct cfq_data *cfqd;
         int i;
 
         cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
         if (!cfqd)
-                return -ENOMEM;
+                return NULL;
 
         memset(cfqd, 0, sizeof(*cfqd));
 
@@ -2288,8 +2288,6 @@
         for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
                 INIT_HLIST_HEAD(&cfqd->cfq_hash[i]);
 
-        e->elevator_data = cfqd;
-
         cfqd->queue = q;
 
         cfqd->max_queued = q->nr_requests / 4;
@@ -2316,14 +2314,14 @@
         cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
         cfqd->cfq_slice_idle = cfq_slice_idle;
 
-        return 0;
+        return cfqd;
 out_crqpool:
         kfree(cfqd->cfq_hash);
 out_cfqhash:
         kfree(cfqd->crq_hash);
 out_crqhash:
         kfree(cfqd);
-        return -ENOMEM;
+        return NULL;
 }
 
 static void cfq_slab_kill(void)
block/deadline-iosched.c
@@ -613,24 +613,24 @@
  * initialize elevator private data (deadline_data), and alloc a drq for
  * each request on the free lists
  */
-static int deadline_init_queue(request_queue_t *q, elevator_t *e)
+static void *deadline_init_queue(request_queue_t *q, elevator_t *e)
 {
         struct deadline_data *dd;
         int i;
 
         if (!drq_pool)
-                return -ENOMEM;
+                return NULL;
 
         dd = kmalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
         if (!dd)
-                return -ENOMEM;
+                return NULL;
         memset(dd, 0, sizeof(*dd));
 
         dd->hash = kmalloc_node(sizeof(struct list_head)*DL_HASH_ENTRIES,
                                 GFP_KERNEL, q->node);
         if (!dd->hash) {
                 kfree(dd);
-                return -ENOMEM;
+                return NULL;
         }
 
         dd->drq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
@@ -638,7 +638,7 @@
         if (!dd->drq_pool) {
                 kfree(dd->hash);
                 kfree(dd);
-                return -ENOMEM;
+                return NULL;
         }
 
         for (i = 0; i < DL_HASH_ENTRIES; i++)
@@ -653,8 +653,7 @@
         dd->writes_starved = writes_starved;
         dd->front_merges = 1;
         dd->fifo_batch = fifo_batch;
-        e->elevator_data = dd;
-        return 0;
+        return dd;
 }
 
 static void deadline_put_request(request_queue_t *q, struct request *rq)
block/elevator.c
@@ -121,16 +121,16 @@
         return e;
 }
 
-static int elevator_attach(request_queue_t *q, struct elevator_queue *eq)
+static void *elevator_init_queue(request_queue_t *q, struct elevator_queue *eq)
 {
-        int ret = 0;
+        return eq->ops->elevator_init_fn(q, eq);
+}
 
+static void elevator_attach(request_queue_t *q, struct elevator_queue *eq,
+                            void *data)
+{
         q->elevator = eq;
-
-        if (eq->ops->elevator_init_fn)
-                ret = eq->ops->elevator_init_fn(q, eq);
-
-        return ret;
+        eq->elevator_data = data;
 }
 
 static char chosen_elevator[16];
@@ -181,6 +181,7 @@
         struct elevator_type *e = NULL;
         struct elevator_queue *eq;
         int ret = 0;
+        void *data;
 
         INIT_LIST_HEAD(&q->queue_head);
         q->last_merge = NULL;
@@ -202,10 +203,13 @@
         if (!eq)
                 return -ENOMEM;
 
-        ret = elevator_attach(q, eq);
-        if (ret)
+        data = elevator_init_queue(q, eq);
+        if (!data) {
                 kobject_put(&eq->kobj);
+                return -ENOMEM;
+        }
 
+        elevator_attach(q, eq, data);
         return ret;
 }
 
@@ -722,13 +726,16 @@
         return error;
 }
 
+static void __elv_unregister_queue(elevator_t *e)
+{
+        kobject_uevent(&e->kobj, KOBJ_REMOVE);
+        kobject_del(&e->kobj);
+}
+
 void elv_unregister_queue(struct request_queue *q)
 {
-        if (q) {
-                elevator_t *e = q->elevator;
-                kobject_uevent(&e->kobj, KOBJ_REMOVE);
-                kobject_del(&e->kobj);
-        }
+        if (q)
+                __elv_unregister_queue(q->elevator);
 }
 
 int elv_register(struct elevator_type *e)
@@ -780,6 +787,7 @@
 static int elevator_switch(request_queue_t *q, struct elevator_type *new_e)
 {
         elevator_t *old_elevator, *e;
+        void *data;
 
         /*
          * Allocate new elevator
@@ -788,6 +796,12 @@
         if (!e)
                 return 0;
 
+        data = elevator_init_queue(q, e);
+        if (!data) {
+                kobject_put(&e->kobj);
+                return 0;
+        }
+
         /*
          * Turn on BYPASS and drain all requests w/ elevator private data
          */
@@ -806,20 +820,20 @@
                 elv_drain_elevator(q);
         }
 
-        spin_unlock_irq(q->queue_lock);
-
         /*
-         * unregister old elevator data
+         * Remember old elevator.
          */
-        elv_unregister_queue(q);
         old_elevator = q->elevator;
 
         /*
          * attach and start new elevator
          */
-        if (elevator_attach(q, e))
-                goto fail;
+        elevator_attach(q, e, data);
 
+        spin_unlock_irq(q->queue_lock);
+
+        __elv_unregister_queue(old_elevator);
+
         if (elv_register_queue(q))
                 goto fail_register;
 
@@ -837,7 +851,6 @@
          */
         elevator_exit(e);
         e = NULL;
-fail:
         q->elevator = old_elevator;
         elv_register_queue(q);
         clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
block/noop-iosched.c
@@ -65,16 +65,15 @@
         return list_entry(rq->queuelist.next, struct request, queuelist);
 }
 
-static int noop_init_queue(request_queue_t *q, elevator_t *e)
+static void *noop_init_queue(request_queue_t *q, elevator_t *e)
 {
         struct noop_data *nd;
 
         nd = kmalloc(sizeof(*nd), GFP_KERNEL);
         if (!nd)
-                return -ENOMEM;
+                return NULL;
         INIT_LIST_HEAD(&nd->queue);
-        e->elevator_data = nd;
-        return 0;
+        return nd;
 }
 
 static void noop_exit_queue(elevator_t *e)
include/linux/elevator.h
@@ -21,7 +21,7 @@
 typedef void (elevator_activate_req_fn) (request_queue_t *, struct request *);
 typedef void (elevator_deactivate_req_fn) (request_queue_t *, struct request *);
 
-typedef int (elevator_init_fn) (request_queue_t *, elevator_t *);
+typedef void *(elevator_init_fn) (request_queue_t *, elevator_t *);
 typedef void (elevator_exit_fn) (elevator_t *);
 
 struct elevator_ops