Commit 275220f0fcff1adf28a717076e00f575edf05fda

Authored by Linus Torvalds

Merge branch 'for-2.6.38/core' of git://git.kernel.dk/linux-2.6-block

* 'for-2.6.38/core' of git://git.kernel.dk/linux-2.6-block: (43 commits)
  block: ensure that completion error gets properly traced
  blktrace: add missing probe argument to block_bio_complete
  block cfq: don't use atomic_t for cfq_group
  block cfq: don't use atomic_t for cfq_queue
  block: trace event block fix unassigned field
  block: add internal hd part table references
  block: fix accounting bug on cross partition merges
  kref: add kref_test_and_get
  bio-integrity: mark kintegrityd_wq highpri and CPU intensive
  block: make kblockd_workqueue smarter
  Revert "sd: implement sd_check_events()"
  block: Clean up exit_io_context() source code.
  Fix compile warnings due to missing removal of a 'ret' variable
  fs/block: type signature of major_to_index(int) to major_to_index(unsigned)
  block: convert !IS_ERR(p) && p to !IS_ERR_OR_NULL(p)
  cfq-iosched: don't check cfqg in choose_service_tree()
  fs/splice: Pull buf->ops->confirm() from splice_from_pipe actors
  cdrom: export cdrom_check_events()
  sd: implement sd_check_events()
  sr: implement sr_check_events()
  ...

Showing 53 changed files

Documentation/cgroups/blkio-controller.txt
... ... @@ -89,6 +89,33 @@
89 89  
90 90 Limits for writes can be put using blkio.write_bps_device file.
91 91  
  92 +Hierarchical Cgroups
  93 +====================
  94 +- Currently none of the IO control policies supports hierarchical groups. But
  95 + the cgroup interface does allow creation of hierarchical cgroups and
  96 + internally IO policies treat them as a flat hierarchy.
  97 +
  98 + So this patch will allow creation of a cgroup hierarchy, but at the backend
  99 + everything will be treated as flat. So if somebody creates a hierarchy as
  100 + follows:
  101 +
  102 + root
  103 + / \
  104 + test1 test2
  105 + |
  106 + test3
  107 +
  108 + CFQ and throttling will practically treat all groups at the same level.
  109 +
  110 + pivot
  111 + / | \ \
  112 + root test1 test2 test3
  113 +
  114 + Down the line we can implement hierarchical accounting/control support
  115 + and also introduce a new cgroup file "use_hierarchy" which will control
  116 + whether the cgroup hierarchy is viewed as flat or hierarchical by the
  117 + policy. This is how the memory controller has implemented it as well.
  118 +
92 119 Various user visible config options
93 120 ===================================
94 121 CONFIG_BLK_CGROUP
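
The flat-hierarchy behavior documented above is easy to demonstrate from userspace. A minimal sketch, assuming the blkio controller is mounted at /cgroup (the mount point is an assumption and varies by setup):

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	/* Build the test1/test2/test3 hierarchy from the text above. */
	if (mkdir("/cgroup/test1", 0755) ||
	    mkdir("/cgroup/test2", 0755) ||
	    mkdir("/cgroup/test1/test3", 0755)) {
		perror("mkdir");
		return 1;
	}
	/* With this patch the nested mkdir succeeds (the removed check in
	 * the next hunk used to return EPERM beyond one level), but CFQ
	 * and throttling still schedule test1, test2 and test3 as
	 * siblings under a flat pivot. */
	return 0;
}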
... ... @@ -1452,10 +1452,6 @@
1452 1452 goto done;
1453 1453 }
1454 1454  
1455   - /* Currently we do not support hierarchy deeper than two level (0,1) */
1456   - if (parent != cgroup->top_cgroup)
1457   - return ERR_PTR(-EPERM);
1458   -
1459 1455 blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
1460 1456 if (!blkcg)
1461 1457 return ERR_PTR(-ENOMEM);
... ... @@ -33,7 +33,7 @@
33 33  
34 34 #include "blk.h"
35 35  
36   -EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
  36 +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
37 37 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
38 38 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
39 39  
... ... @@ -64,13 +64,27 @@
64 64 return;
65 65  
66 66 cpu = part_stat_lock();
67   - part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
68 67  
69   - if (!new_io)
  68 + if (!new_io) {
  69 + part = rq->part;
70 70 part_stat_inc(cpu, part, merges[rw]);
71   - else {
  71 + } else {
  72 + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
  73 + if (!hd_struct_try_get(part)) {
  74 + /*
  75 + * The partition is already being removed,
  76 + * the request will be accounted on the disk only
  77 + *
  78 + * We take a reference on disk->part0 although that
  79 + * partition will never be deleted, so we can treat
  80 + * it as any other partition.
  81 + */
  82 + part = &rq->rq_disk->part0;
  83 + hd_struct_get(part);
  84 + }
72 85 part_round_stats(cpu, part);
73 86 part_inc_in_flight(part, rw);
  87 + rq->part = part;
74 88 }
75 89  
76 90 part_stat_unlock();
... ... @@ -128,6 +142,7 @@
128 142 rq->ref_count = 1;
129 143 rq->start_time = jiffies;
130 144 set_start_time_ns(rq);
  145 + rq->part = NULL;
131 146 }
132 147 EXPORT_SYMBOL(blk_rq_init);
133 148  
... ... @@ -1329,9 +1344,9 @@
1329 1344 bio->bi_sector += p->start_sect;
1330 1345 bio->bi_bdev = bdev->bd_contains;
1331 1346  
1332   - trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
1333   - bdev->bd_dev,
1334   - bio->bi_sector - p->start_sect);
  1347 + trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio,
  1348 + bdev->bd_dev,
  1349 + bio->bi_sector - p->start_sect);
1335 1350 }
1336 1351 }
1337 1352  
... ... @@ -1500,7 +1515,7 @@
1500 1515 goto end_io;
1501 1516  
1502 1517 if (old_sector != -1)
1503   - trace_block_remap(q, bio, old_dev, old_sector);
  1518 + trace_block_bio_remap(q, bio, old_dev, old_sector);
1504 1519  
1505 1520 old_sector = bio->bi_sector;
1506 1521 old_dev = bio->bi_bdev->bd_dev;
... ... @@ -1776,7 +1791,7 @@
1776 1791 int cpu;
1777 1792  
1778 1793 cpu = part_stat_lock();
1779   - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
  1794 + part = req->part;
1780 1795 part_stat_add(cpu, part, sectors[rw], bytes >> 9);
1781 1796 part_stat_unlock();
1782 1797 }
... ... @@ -1796,13 +1811,14 @@
1796 1811 int cpu;
1797 1812  
1798 1813 cpu = part_stat_lock();
1799   - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
  1814 + part = req->part;
1800 1815  
1801 1816 part_stat_inc(cpu, part, ios[rw]);
1802 1817 part_stat_add(cpu, part, ticks[rw], duration);
1803 1818 part_round_stats(cpu, part);
1804 1819 part_dec_in_flight(part, rw);
1805 1820  
  1821 + hd_struct_put(part);
1806 1822 part_stat_unlock();
1807 1823 }
1808 1824 }
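
Taken together, the accounting hunks above give the new rq->part field a simple ownership rule: the partition reference is pinned once when I/O accounting starts and dropped once when it completes, with part0 as the fallback when the partition is mid-removal. A condensed sketch of that contract (illustrative, not the literal kernel code):

/* Helper names are illustrative; the calls mirror the hunks above. */
static void account_start(struct request *rq, struct hd_struct *part)
{
	if (!hd_struct_try_get(part)) {
		/* Partition is being removed; account on the whole disk.
		 * part0 is never deleted, so it can be pinned like any
		 * other partition. */
		part = &rq->rq_disk->part0;
		hd_struct_get(part);
	}
	rq->part = part;		/* pinned for the request's lifetime */
}

static void account_done(struct request *rq)
{
	hd_struct_put(rq->part);	/* drop the pin taken at start */
}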
... ... @@ -2606,7 +2622,9 @@
2606 2622 BUILD_BUG_ON(__REQ_NR_BITS > 8 *
2607 2623 sizeof(((struct request *)0)->cmd_flags));
2608 2624  
2609   - kblockd_workqueue = create_workqueue("kblockd");
  2625 + /* used for unplugging and affects IO latency/throughput - HIGHPRI */
  2626 + kblockd_workqueue = alloc_workqueue("kblockd",
  2627 + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2610 2628 if (!kblockd_workqueue)
2611 2629 panic("Failed to create kblockd\n");
2612 2630  
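
The kblockd conversion above moves from create_workqueue() to alloc_workqueue(), which is what lets the queue be marked both rescuer-backed (WQ_MEM_RECLAIM) and high-priority (WQ_HIGHPRI). A hedged sketch of the same idiom in a driver, where "mydrv" and the init function are placeholders:

static struct workqueue_struct *mydrv_wq;

static int __init mydrv_init(void)
{
	/* alloc_workqueue(name, flags, max_active); 0 = default limit */
	mydrv_wq = alloc_workqueue("mydrv", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
	if (!mydrv_wq)
		return -ENOMEM;
	return 0;
}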
... ... @@ -64,7 +64,7 @@
64 64 rcu_read_unlock();
65 65 }
66 66  
67   -/* Called by the exitting task */
  67 +/* Called by the exiting task */
68 68 void exit_io_context(struct task_struct *task)
69 69 {
70 70 struct io_context *ioc;
... ... @@ -74,10 +74,9 @@
74 74 task->io_context = NULL;
75 75 task_unlock(task);
76 76  
77   - if (atomic_dec_and_test(&ioc->nr_tasks)) {
  77 + if (atomic_dec_and_test(&ioc->nr_tasks))
78 78 cfq_exit(ioc);
79 79  
80   - }
81 80 put_io_context(ioc);
82 81 }
83 82  
... ... @@ -351,11 +351,12 @@
351 351 int cpu;
352 352  
353 353 cpu = part_stat_lock();
354   - part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
  354 + part = req->part;
355 355  
356 356 part_round_stats(cpu, part);
357 357 part_dec_in_flight(part, rq_data_dir(req));
358 358  
  359 + hd_struct_put(part);
359 360 part_stat_unlock();
360 361 }
361 362 }
... ... @@ -87,7 +87,6 @@
87 87 unsigned count;
88 88 unsigned total_weight;
89 89 u64 min_vdisktime;
90   - struct rb_node *active;
91 90 };
92 91 #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
93 92 .count = 0, .min_vdisktime = 0, }
... ... @@ -97,7 +96,7 @@
97 96 */
98 97 struct cfq_queue {
99 98 /* reference count */
100   - atomic_t ref;
  99 + int ref;
101 100 /* various state flags, see below */
102 101 unsigned int flags;
103 102 /* parent cfq_data */
... ... @@ -180,7 +179,6 @@
180 179 /* group service_tree key */
181 180 u64 vdisktime;
182 181 unsigned int weight;
183   - bool on_st;
184 182  
185 183 /* number of cfqq currently on this group */
186 184 int nr_cfqq;
... ... @@ -209,7 +207,7 @@
209 207 struct blkio_group blkg;
210 208 #ifdef CONFIG_CFQ_GROUP_IOSCHED
211 209 struct hlist_node cfqd_node;
212   - atomic_t ref;
  210 + int ref;
213 211 #endif
214 212 /* number of requests that are on the dispatch list or inside driver */
215 213 int dispatched;
... ... @@ -563,11 +561,6 @@
563 561 u64 vdisktime = st->min_vdisktime;
564 562 struct cfq_group *cfqg;
565 563  
566   - if (st->active) {
567   - cfqg = rb_entry_cfqg(st->active);
568   - vdisktime = cfqg->vdisktime;
569   - }
570   -
571 564 if (st->left) {
572 565 cfqg = rb_entry_cfqg(st->left);
573 566 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
... ... @@ -646,11 +639,11 @@
646 639 static inline bool cfq_slice_used(struct cfq_queue *cfqq)
647 640 {
648 641 if (cfq_cfqq_slice_new(cfqq))
649   - return 0;
  642 + return false;
650 643 if (time_before(jiffies, cfqq->slice_end))
651   - return 0;
  644 + return false;
652 645  
653   - return 1;
  646 + return true;
654 647 }
655 648  
656 649 /*
... ... @@ -869,7 +862,7 @@
869 862 struct rb_node *n;
870 863  
871 864 cfqg->nr_cfqq++;
872   - if (cfqg->on_st)
  865 + if (!RB_EMPTY_NODE(&cfqg->rb_node))
873 866 return;
874 867  
875 868 /*
... ... @@ -885,7 +878,6 @@
885 878 cfqg->vdisktime = st->min_vdisktime;
886 879  
887 880 __cfq_group_service_tree_add(st, cfqg);
888   - cfqg->on_st = true;
889 881 st->total_weight += cfqg->weight;
890 882 }
891 883  
... ... @@ -894,9 +886,6 @@
894 886 {
895 887 struct cfq_rb_root *st = &cfqd->grp_service_tree;
896 888  
897   - if (st->active == &cfqg->rb_node)
898   - st->active = NULL;
899   -
900 889 BUG_ON(cfqg->nr_cfqq < 1);
901 890 cfqg->nr_cfqq--;
902 891  
... ... @@ -905,7 +894,6 @@
905 894 return;
906 895  
907 896 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
908   - cfqg->on_st = false;
909 897 st->total_weight -= cfqg->weight;
910 898 if (!RB_EMPTY_NODE(&cfqg->rb_node))
911 899 cfq_rb_erase(&cfqg->rb_node, st);
... ... @@ -1026,7 +1014,7 @@
1026 1014 * elevator which will be dropped by either elevator exit
1027 1015 * or cgroup deletion path depending on who is exiting first.
1028 1016 */
1029   - atomic_set(&cfqg->ref, 1);
  1017 + cfqg->ref = 1;
1030 1018  
1031 1019 /*
1032 1020 * Add group onto cgroup list. It might happen that bdi->dev is
... ... @@ -1071,7 +1059,7 @@
1071 1059  
1072 1060 static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
1073 1061 {
1074   - atomic_inc(&cfqg->ref);
  1062 + cfqg->ref++;
1075 1063 return cfqg;
1076 1064 }
1077 1065  
... ... @@ -1083,7 +1071,7 @@
1083 1071  
1084 1072 cfqq->cfqg = cfqg;
1085 1073 /* cfqq reference on cfqg */
1086   - atomic_inc(&cfqq->cfqg->ref);
  1074 + cfqq->cfqg->ref++;
1087 1075 }
1088 1076  
1089 1077 static void cfq_put_cfqg(struct cfq_group *cfqg)
... ... @@ -1091,11 +1079,12 @@
1091 1079 struct cfq_rb_root *st;
1092 1080 int i, j;
1093 1081  
1094   - BUG_ON(atomic_read(&cfqg->ref) <= 0);
1095   - if (!atomic_dec_and_test(&cfqg->ref))
  1082 + BUG_ON(cfqg->ref <= 0);
  1083 + cfqg->ref--;
  1084 + if (cfqg->ref)
1096 1085 return;
1097 1086 for_each_cfqg_st(cfqg, i, j, st)
1098   - BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
  1087 + BUG_ON(!RB_EMPTY_ROOT(&st->rb));
1099 1088 kfree(cfqg);
1100 1089 }
1101 1090  
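
Dropping atomic_t for a plain int is safe here only because every manipulation of cfqg->ref already happens under the queue lock. A sketch of the invariant the conversion relies on (the helper is hypothetical, not part of the patch):

static inline void cfqg_ref_get(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	/* assumption: callers hold the queue lock, so a bare int suffices */
	lockdep_assert_held(cfqd->queue->queue_lock);
	cfqg->ref++;
}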
... ... @@ -1200,7 +1189,7 @@
1200 1189 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1201 1190 cfqq->orig_cfqg = cfqq->cfqg;
1202 1191 cfqq->cfqg = &cfqd->root_group;
1203   - atomic_inc(&cfqd->root_group.ref);
  1192 + cfqd->root_group.ref++;
1204 1193 group_changed = 1;
1205 1194 } else if (!cfqd->cfq_group_isolation
1206 1195 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
... ... @@ -1687,9 +1676,6 @@
1687 1676 if (cfqq == cfqd->active_queue)
1688 1677 cfqd->active_queue = NULL;
1689 1678  
1690   - if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
1691   - cfqd->grp_service_tree.active = NULL;
1692   -
1693 1679 if (cfqd->active_cic) {
1694 1680 put_io_context(cfqd->active_cic->ioc);
1695 1681 cfqd->active_cic = NULL;
... ... @@ -1901,10 +1887,10 @@
1901 1887 * in their service tree.
1902 1888 */
1903 1889 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
1904   - return 1;
  1890 + return true;
1905 1891 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
1906 1892 service_tree->count);
1907   - return 0;
  1893 + return false;
1908 1894 }
1909 1895  
1910 1896 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
... ... @@ -2040,7 +2026,7 @@
2040 2026 int process_refs, io_refs;
2041 2027  
2042 2028 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
2043   - process_refs = atomic_read(&cfqq->ref) - io_refs;
  2029 + process_refs = cfqq->ref - io_refs;
2044 2030 BUG_ON(process_refs < 0);
2045 2031 return process_refs;
2046 2032 }
... ... @@ -2080,10 +2066,10 @@
2080 2066 */
2081 2067 if (new_process_refs >= process_refs) {
2082 2068 cfqq->new_cfqq = new_cfqq;
2083   - atomic_add(process_refs, &new_cfqq->ref);
  2069 + new_cfqq->ref += process_refs;
2084 2070 } else {
2085 2071 new_cfqq->new_cfqq = cfqq;
2086   - atomic_add(new_process_refs, &cfqq->ref);
  2072 + cfqq->ref += new_process_refs;
2087 2073 }
2088 2074 }
2089 2075  
... ... @@ -2116,13 +2102,8 @@
2116 2102 unsigned count;
2117 2103 struct cfq_rb_root *st;
2118 2104 unsigned group_slice;
  2105 + enum wl_prio_t original_prio = cfqd->serving_prio;
2119 2106  
2120   - if (!cfqg) {
2121   - cfqd->serving_prio = IDLE_WORKLOAD;
2122   - cfqd->workload_expires = jiffies + 1;
2123   - return;
2124   - }
2125   -
2126 2107 /* Choose next priority. RT > BE > IDLE */
2127 2108 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
2128 2109 cfqd->serving_prio = RT_WORKLOAD;
... ... @@ -2134,6 +2115,9 @@
2134 2115 return;
2135 2116 }
2136 2117  
  2118 + if (original_prio != cfqd->serving_prio)
  2119 + goto new_workload;
  2120 +
2137 2121 /*
2138 2122 * For RT and BE, we have to choose also the type
2139 2123 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
... ... @@ -2148,6 +2132,7 @@
2148 2132 if (count && !time_after(jiffies, cfqd->workload_expires))
2149 2133 return;
2150 2134  
  2135 +new_workload:
2151 2136 /* otherwise select new workload type */
2152 2137 cfqd->serving_type =
2153 2138 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
... ... @@ -2199,7 +2184,6 @@
2199 2184 if (RB_EMPTY_ROOT(&st->rb))
2200 2185 return NULL;
2201 2186 cfqg = cfq_rb_first_group(st);
2202   - st->active = &cfqg->rb_node;
2203 2187 update_min_vdisktime(st);
2204 2188 return cfqg;
2205 2189 }
... ... @@ -2293,6 +2277,17 @@
2293 2277 goto keep_queue;
2294 2278 }
2295 2279  
  2280 + /*
  2281 + * This is a deep seek queue, but the device is much faster than
  2282 + * the queue can deliver; don't idle.
  2283 + */
  2284 + if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
  2285 + (cfq_cfqq_slice_new(cfqq) ||
  2286 + (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
  2287 + cfq_clear_cfqq_deep(cfqq);
  2288 + cfq_clear_cfqq_idle_window(cfqq);
  2289 + }
  2290 +
2296 2291 if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2297 2292 cfqq = NULL;
2298 2293 goto keep_queue;
... ... @@ -2367,12 +2362,12 @@
2367 2362 {
2368 2363 /* the queue hasn't finished any request, can't estimate */
2369 2364 if (cfq_cfqq_slice_new(cfqq))
2370   - return 1;
  2365 + return true;
2371 2366 if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
2372 2367 cfqq->slice_end))
2373   - return 1;
  2368 + return true;
2374 2369  
2375   - return 0;
  2370 + return false;
2376 2371 }
2377 2372  
2378 2373 static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
... ... @@ -2538,9 +2533,10 @@
2538 2533 struct cfq_data *cfqd = cfqq->cfqd;
2539 2534 struct cfq_group *cfqg, *orig_cfqg;
2540 2535  
2541   - BUG_ON(atomic_read(&cfqq->ref) <= 0);
  2536 + BUG_ON(cfqq->ref <= 0);
2542 2537  
2543   - if (!atomic_dec_and_test(&cfqq->ref))
  2538 + cfqq->ref--;
  2539 + if (cfqq->ref)
2544 2540 return;
2545 2541  
2546 2542 cfq_log_cfqq(cfqd, cfqq, "put_queue");
... ... @@ -2843,7 +2839,7 @@
2843 2839 RB_CLEAR_NODE(&cfqq->p_node);
2844 2840 INIT_LIST_HEAD(&cfqq->fifo);
2845 2841  
2846   - atomic_set(&cfqq->ref, 0);
  2842 + cfqq->ref = 0;
2847 2843 cfqq->cfqd = cfqd;
2848 2844  
2849 2845 cfq_mark_cfqq_prio_changed(cfqq);
... ... @@ -2979,11 +2975,11 @@
2979 2975 * pin the queue now that it's allocated, scheduler exit will prune it
2980 2976 */
2981 2977 if (!is_sync && !(*async_cfqq)) {
2982   - atomic_inc(&cfqq->ref);
  2978 + cfqq->ref++;
2983 2979 *async_cfqq = cfqq;
2984 2980 }
2985 2981  
2986   - atomic_inc(&cfqq->ref);
  2982 + cfqq->ref++;
2987 2983 return cfqq;
2988 2984 }
2989 2985  
... ... @@ -3265,6 +3261,10 @@
3265 3261 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3266 3262 return true;
3267 3263  
  3264 + /* An idle queue should not be idle now for some reason */
  3265 + if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
  3266 + return true;
  3267 +
3268 3268 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3269 3269 return false;
3270 3270  
... ... @@ -3681,13 +3681,13 @@
3681 3681 }
3682 3682  
3683 3683 cfqq->allocated[rw]++;
3684   - atomic_inc(&cfqq->ref);
3685   -
3686   - spin_unlock_irqrestore(q->queue_lock, flags);
3687   -
  3684 + cfqq->ref++;
3688 3685 rq->elevator_private = cic;
3689 3686 rq->elevator_private2 = cfqq;
3690 3687 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
  3688 +
  3689 + spin_unlock_irqrestore(q->queue_lock, flags);
  3690 +
3691 3691 return 0;
3692 3692  
3693 3693 queue_fail:
... ... @@ -3862,6 +3862,10 @@
3862 3862 if (!cfqd)
3863 3863 return NULL;
3864 3864  
  3865 + /*
  3866 + * No need to take queue_lock in this routine, since we are
  3867 + * initializing the io scheduler and nobody else is using cfqd yet
  3868 + */
3865 3869 cfqd->cic_index = i;
3866 3870  
3867 3871 /* Init root service tree */
... ... @@ -3881,7 +3885,7 @@
3881 3885 * Take a reference to root group which we never drop. This is just
3882 3886 * to make sure that cfq_put_cfqg() does not try to kfree root group
3883 3887 */
3884   - atomic_set(&cfqg->ref, 1);
  3888 + cfqg->ref = 1;
3885 3889 rcu_read_lock();
3886 3890 cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
3887 3891 (void *)cfqd, 0);
... ... @@ -3901,7 +3905,7 @@
3901 3905 * will not attempt to free it.
3902 3906 */
3903 3907 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3904   - atomic_inc(&cfqd->oom_cfqq.ref);
  3908 + cfqd->oom_cfqq.ref++;
3905 3909 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
3906 3910  
3907 3911 INIT_LIST_HEAD(&cfqd->cic_list);
... ... @@ -18,6 +18,7 @@
18 18 #include <linux/buffer_head.h>
19 19 #include <linux/mutex.h>
20 20 #include <linux/idr.h>
  21 +#include <linux/log2.h>
21 22  
22 23 #include "blk.h"
23 24  
... ... @@ -35,6 +36,10 @@
35 36  
36 37 static struct device_type disk_type;
37 38  
  39 +static void disk_add_events(struct gendisk *disk);
  40 +static void disk_del_events(struct gendisk *disk);
  41 +static void disk_release_events(struct gendisk *disk);
  42 +
38 43 /**
39 44 * disk_get_part - get partition
40 45 * @disk: disk to look partition from
... ... @@ -239,7 +244,7 @@
239 244 } *major_names[BLKDEV_MAJOR_HASH_SIZE];
240 245  
241 246 /* index in the above - for now: assume no multimajor ranges */
242   -static inline int major_to_index(int major)
  247 +static inline int major_to_index(unsigned major)
243 248 {
244 249 return major % BLKDEV_MAJOR_HASH_SIZE;
245 250 }
... ... @@ -502,6 +507,64 @@
502 507 return 0;
503 508 }
504 509  
  510 +void register_disk(struct gendisk *disk)
  511 +{
  512 + struct device *ddev = disk_to_dev(disk);
  513 + struct block_device *bdev;
  514 + struct disk_part_iter piter;
  515 + struct hd_struct *part;
  516 + int err;
  517 +
  518 + ddev->parent = disk->driverfs_dev;
  519 +
  520 + dev_set_name(ddev, disk->disk_name);
  521 +
  522 + /* delay uevents, until we scanned partition table */
  523 + dev_set_uevent_suppress(ddev, 1);
  524 +
  525 + if (device_add(ddev))
  526 + return;
  527 + if (!sysfs_deprecated) {
  528 + err = sysfs_create_link(block_depr, &ddev->kobj,
  529 + kobject_name(&ddev->kobj));
  530 + if (err) {
  531 + device_del(ddev);
  532 + return;
  533 + }
  534 + }
  535 + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
  536 + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
  537 +
  538 + /* No minors to use for partitions */
  539 + if (!disk_partitionable(disk))
  540 + goto exit;
  541 +
  542 + /* No such device (e.g., media were just removed) */
  543 + if (!get_capacity(disk))
  544 + goto exit;
  545 +
  546 + bdev = bdget_disk(disk, 0);
  547 + if (!bdev)
  548 + goto exit;
  549 +
  550 + bdev->bd_invalidated = 1;
  551 + err = blkdev_get(bdev, FMODE_READ, NULL);
  552 + if (err < 0)
  553 + goto exit;
  554 + blkdev_put(bdev, FMODE_READ);
  555 +
  556 +exit:
  557 + /* announce disk after possible partitions are created */
  558 + dev_set_uevent_suppress(ddev, 0);
  559 + kobject_uevent(&ddev->kobj, KOBJ_ADD);
  560 +
  561 + /* announce possible partitions */
  562 + disk_part_iter_init(&piter, disk, 0);
  563 + while ((part = disk_part_iter_next(&piter)))
  564 + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
  565 + disk_part_iter_exit(&piter);
  566 +}
  567 +
505 568 /**
506 569 * add_disk - add partitioning information to kernel list
507 570 * @disk: per-device partitioning information
... ... @@ -551,18 +614,48 @@
551 614 retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
552 615 "bdi");
553 616 WARN_ON(retval);
554   -}
555 617  
  618 + disk_add_events(disk);
  619 +}
556 620 EXPORT_SYMBOL(add_disk);
557   -EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */
558 621  
559   -void unlink_gendisk(struct gendisk *disk)
  622 +void del_gendisk(struct gendisk *disk)
560 623 {
  624 + struct disk_part_iter piter;
  625 + struct hd_struct *part;
  626 +
  627 + disk_del_events(disk);
  628 +
  629 + /* invalidate stuff */
  630 + disk_part_iter_init(&piter, disk,
  631 + DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
  632 + while ((part = disk_part_iter_next(&piter))) {
  633 + invalidate_partition(disk, part->partno);
  634 + delete_partition(disk, part->partno);
  635 + }
  636 + disk_part_iter_exit(&piter);
  637 +
  638 + invalidate_partition(disk, 0);
  639 + blk_free_devt(disk_to_dev(disk)->devt);
  640 + set_capacity(disk, 0);
  641 + disk->flags &= ~GENHD_FL_UP;
  642 +
561 643 sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
562 644 bdi_unregister(&disk->queue->backing_dev_info);
563 645 blk_unregister_queue(disk);
564 646 blk_unregister_region(disk_devt(disk), disk->minors);
  647 +
  648 + part_stat_set_all(&disk->part0, 0);
  649 + disk->part0.stamp = 0;
  650 +
  651 + kobject_put(disk->part0.holder_dir);
  652 + kobject_put(disk->slave_dir);
  653 + disk->driverfs_dev = NULL;
  654 + if (!sysfs_deprecated)
  655 + sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
  656 + device_del(disk_to_dev(disk));
565 657 }
  658 +EXPORT_SYMBOL(del_gendisk);
566 659  
567 660 /**
568 661 * get_gendisk - get partitioning information for a given device
... ... @@ -735,7 +828,7 @@
735 828 static void *p;
736 829  
737 830 p = disk_seqf_start(seqf, pos);
738   - if (!IS_ERR(p) && p && !*pos)
  831 + if (!IS_ERR_OR_NULL(p) && !*pos)
739 832 seq_puts(seqf, "major minor #blocks name\n\n");
740 833 return p;
741 834 }
... ... @@ -1005,6 +1098,7 @@
1005 1098 {
1006 1099 struct gendisk *disk = dev_to_disk(dev);
1007 1100  
  1101 + disk_release_events(disk);
1008 1102 kfree(disk->random);
1009 1103 disk_replace_part_tbl(disk, NULL);
1010 1104 free_part_stats(&disk->part0);
... ... @@ -1110,29 +1204,6 @@
1110 1204 module_init(proc_genhd_init);
1111 1205 #endif /* CONFIG_PROC_FS */
1112 1206  
1113   -static void media_change_notify_thread(struct work_struct *work)
1114   -{
1115   - struct gendisk *gd = container_of(work, struct gendisk, async_notify);
1116   - char event[] = "MEDIA_CHANGE=1";
1117   - char *envp[] = { event, NULL };
1118   -
1119   - /*
1120   - * set enviroment vars to indicate which event this is for
1121   - * so that user space will know to go check the media status.
1122   - */
1123   - kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
1124   - put_device(gd->driverfs_dev);
1125   -}
1126   -
1127   -#if 0
1128   -void genhd_media_change_notify(struct gendisk *disk)
1129   -{
1130   - get_device(disk->driverfs_dev);
1131   - schedule_work(&disk->async_notify);
1132   -}
1133   -EXPORT_SYMBOL_GPL(genhd_media_change_notify);
1134   -#endif /* 0 */
1135   -
1136 1207 dev_t blk_lookup_devt(const char *name, int partno)
1137 1208 {
1138 1209 dev_t devt = MKDEV(0, 0);
... ... @@ -1193,13 +1264,13 @@
1193 1264 }
1194 1265 disk->part_tbl->part[0] = &disk->part0;
1195 1266  
  1267 + hd_ref_init(&disk->part0);
  1268 +
1196 1269 disk->minors = minors;
1197 1270 rand_initialize_disk(disk);
1198 1271 disk_to_dev(disk)->class = &block_class;
1199 1272 disk_to_dev(disk)->type = &disk_type;
1200 1273 device_initialize(disk_to_dev(disk));
1201   - INIT_WORK(&disk->async_notify,
1202   - media_change_notify_thread);
1203 1274 }
1204 1275 return disk;
1205 1276 }
... ... @@ -1291,4 +1362,423 @@
1291 1362 }
1292 1363  
1293 1364 EXPORT_SYMBOL(invalidate_partition);
  1365 +
  1366 +/*
  1367 + * Disk events - monitor disk events like media change and eject request.
  1368 + */
  1369 +struct disk_events {
  1370 + struct list_head node; /* all disk_event's */
  1371 + struct gendisk *disk; /* the associated disk */
  1372 + spinlock_t lock;
  1373 +
  1374 + int block; /* event blocking depth */
  1375 + unsigned int pending; /* events already sent out */
  1376 + unsigned int clearing; /* events being cleared */
  1377 +
  1378 + long poll_msecs; /* interval, -1 for default */
  1379 + struct delayed_work dwork;
  1380 +};
  1381 +
  1382 +static const char *disk_events_strs[] = {
  1383 + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change",
  1384 + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request",
  1385 +};
  1386 +
  1387 +static char *disk_uevents[] = {
  1388 + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1",
  1389 + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1",
  1390 +};
  1391 +
  1392 +/* list of all disk_events */
  1393 +static DEFINE_MUTEX(disk_events_mutex);
  1394 +static LIST_HEAD(disk_events);
  1395 +
  1396 +/* disable in-kernel polling by default */
  1397 +static unsigned long disk_events_dfl_poll_msecs = 0;
  1398 +
  1399 +static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
  1400 +{
  1401 + struct disk_events *ev = disk->ev;
  1402 + long intv_msecs = 0;
  1403 +
  1404 + /*
  1405 + * If device-specific poll interval is set, always use it. If
  1406 + * the default is being used, poll iff there are events which
  1407 + * can't be monitored asynchronously.
  1408 + */
  1409 + if (ev->poll_msecs >= 0)
  1410 + intv_msecs = ev->poll_msecs;
  1411 + else if (disk->events & ~disk->async_events)
  1412 + intv_msecs = disk_events_dfl_poll_msecs;
  1413 +
  1414 + return msecs_to_jiffies(intv_msecs);
  1415 +}
  1416 +
  1417 +static void __disk_block_events(struct gendisk *disk, bool sync)
  1418 +{
  1419 + struct disk_events *ev = disk->ev;
  1420 + unsigned long flags;
  1421 + bool cancel;
  1422 +
  1423 + spin_lock_irqsave(&ev->lock, flags);
  1424 + cancel = !ev->block++;
  1425 + spin_unlock_irqrestore(&ev->lock, flags);
  1426 +
  1427 + if (cancel) {
  1428 + if (sync)
  1429 + cancel_delayed_work_sync(&disk->ev->dwork);
  1430 + else
  1431 + cancel_delayed_work(&disk->ev->dwork);
  1432 + }
  1433 +}
  1434 +
  1435 +static void __disk_unblock_events(struct gendisk *disk, bool check_now)
  1436 +{
  1437 + struct disk_events *ev = disk->ev;
  1438 + unsigned long intv;
  1439 + unsigned long flags;
  1440 +
  1441 + spin_lock_irqsave(&ev->lock, flags);
  1442 +
  1443 + if (WARN_ON_ONCE(ev->block <= 0))
  1444 + goto out_unlock;
  1445 +
  1446 + if (--ev->block)
  1447 + goto out_unlock;
  1448 +
  1449 + /*
  1450 + * Not exactly a latency critical operation, set poll timer
  1451 + * slack to 25% and kick event check.
  1452 + */
  1453 + intv = disk_events_poll_jiffies(disk);
  1454 + set_timer_slack(&ev->dwork.timer, intv / 4);
  1455 + if (check_now)
  1456 + queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
  1457 + else if (intv)
  1458 + queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
  1459 +out_unlock:
  1460 + spin_unlock_irqrestore(&ev->lock, flags);
  1461 +}
  1462 +
  1463 +/**
  1464 + * disk_block_events - block and flush disk event checking
  1465 + * @disk: disk to block events for
  1466 + *
  1467 + * On return from this function, it is guaranteed that event checking
  1468 + * isn't in progress and won't happen until unblocked by
  1469 + * disk_unblock_events(). Event blocking is counted and the actual
  1470 + * unblocking happens after the matching number of unblocks are done.
  1471 + *
  1472 + * Note that this intentionally does not block event checking from
  1473 + * disk_clear_events().
  1474 + *
  1475 + * CONTEXT:
  1476 + * Might sleep.
  1477 + */
  1478 +void disk_block_events(struct gendisk *disk)
  1479 +{
  1480 + if (disk->ev)
  1481 + __disk_block_events(disk, true);
  1482 +}
  1483 +
  1484 +/**
  1485 + * disk_unblock_events - unblock disk event checking
  1486 + * @disk: disk to unblock events for
  1487 + *
  1488 + * Undo disk_block_events(). When the block count reaches zero, it
  1489 + * starts events polling if configured.
  1490 + *
  1491 + * CONTEXT:
  1492 + * Don't care. Safe to call from irq context.
  1493 + */
  1494 +void disk_unblock_events(struct gendisk *disk)
  1495 +{
  1496 + if (disk->ev)
  1497 + __disk_unblock_events(disk, true);
  1498 +}
  1499 +
  1500 +/**
  1501 + * disk_check_events - schedule immediate event checking
  1502 + * @disk: disk to check events for
  1503 + *
  1504 + * Schedule immediate event checking on @disk if not blocked.
  1505 + *
  1506 + * CONTEXT:
  1507 + * Don't care. Safe to call from irq context.
  1508 + */
  1509 +void disk_check_events(struct gendisk *disk)
  1510 +{
  1511 + if (disk->ev) {
  1512 + __disk_block_events(disk, false);
  1513 + __disk_unblock_events(disk, true);
  1514 + }
  1515 +}
  1516 +EXPORT_SYMBOL_GPL(disk_check_events);
  1517 +
  1518 +/**
  1519 + * disk_clear_events - synchronously check, clear and return pending events
  1520 + * @disk: disk to fetch and clear events from
  1521 + * @mask: mask of events to be fetched and cleared
  1522 + *
  1523 + * Disk events are synchronously checked and pending events in @mask
  1524 + * are cleared and returned. This ignores the block count.
  1525 + *
  1526 + * CONTEXT:
  1527 + * Might sleep.
  1528 + */
  1529 +unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
  1530 +{
  1531 + const struct block_device_operations *bdops = disk->fops;
  1532 + struct disk_events *ev = disk->ev;
  1533 + unsigned int pending;
  1534 +
  1535 + if (!ev) {
  1536 + /* for drivers still using the old ->media_changed method */
  1537 + if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
  1538 + bdops->media_changed && bdops->media_changed(disk))
  1539 + return DISK_EVENT_MEDIA_CHANGE;
  1540 + return 0;
  1541 + }
  1542 +
  1543 + /* tell the workfn about the events being cleared */
  1544 + spin_lock_irq(&ev->lock);
  1545 + ev->clearing |= mask;
  1546 + spin_unlock_irq(&ev->lock);
  1547 +
  1548 + /* unconditionally schedule event check and wait for it to finish */
  1549 + __disk_block_events(disk, true);
  1550 + queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
  1551 + flush_delayed_work(&ev->dwork);
  1552 + __disk_unblock_events(disk, false);
  1553 +
  1554 + /* then, fetch and clear pending events */
  1555 + spin_lock_irq(&ev->lock);
  1556 + WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */
  1557 + pending = ev->pending & mask;
  1558 + ev->pending &= ~mask;
  1559 + spin_unlock_irq(&ev->lock);
  1560 +
  1561 + return pending;
  1562 +}
  1563 +
  1564 +static void disk_events_workfn(struct work_struct *work)
  1565 +{
  1566 + struct delayed_work *dwork = to_delayed_work(work);
  1567 + struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
  1568 + struct gendisk *disk = ev->disk;
  1569 + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
  1570 + unsigned int clearing = ev->clearing;
  1571 + unsigned int events;
  1572 + unsigned long intv;
  1573 + int nr_events = 0, i;
  1574 +
  1575 + /* check events */
  1576 + events = disk->fops->check_events(disk, clearing);
  1577 +
  1578 + /* accumulate pending events and schedule next poll if necessary */
  1579 + spin_lock_irq(&ev->lock);
  1580 +
  1581 + events &= ~ev->pending;
  1582 + ev->pending |= events;
  1583 + ev->clearing &= ~clearing;
  1584 +
  1585 + intv = disk_events_poll_jiffies(disk);
  1586 + if (!ev->block && intv)
  1587 + queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
  1588 +
  1589 + spin_unlock_irq(&ev->lock);
  1590 +
  1591 + /* tell userland about new events */
  1592 + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
  1593 + if (events & (1 << i))
  1594 + envp[nr_events++] = disk_uevents[i];
  1595 +
  1596 + if (nr_events)
  1597 + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
  1598 +}
  1599 +
  1600 +/*
  1601 + * A disk events enabled device has the following sysfs nodes under
  1602 + * its /sys/block/X/ directory.
  1603 + *
  1604 + * events : list of all supported events
  1605 + * events_async : list of events which can be detected w/o polling
  1606 + * events_poll_msecs : polling interval, 0: disable, -1: system default
  1607 + */
  1608 +static ssize_t __disk_events_show(unsigned int events, char *buf)
  1609 +{
  1610 + const char *delim = "";
  1611 + ssize_t pos = 0;
  1612 + int i;
  1613 +
  1614 + for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
  1615 + if (events & (1 << i)) {
  1616 + pos += sprintf(buf + pos, "%s%s",
  1617 + delim, disk_events_strs[i]);
  1618 + delim = " ";
  1619 + }
  1620 + if (pos)
  1621 + pos += sprintf(buf + pos, "\n");
  1622 + return pos;
  1623 +}
  1624 +
  1625 +static ssize_t disk_events_show(struct device *dev,
  1626 + struct device_attribute *attr, char *buf)
  1627 +{
  1628 + struct gendisk *disk = dev_to_disk(dev);
  1629 +
  1630 + return __disk_events_show(disk->events, buf);
  1631 +}
  1632 +
  1633 +static ssize_t disk_events_async_show(struct device *dev,
  1634 + struct device_attribute *attr, char *buf)
  1635 +{
  1636 + struct gendisk *disk = dev_to_disk(dev);
  1637 +
  1638 + return __disk_events_show(disk->async_events, buf);
  1639 +}
  1640 +
  1641 +static ssize_t disk_events_poll_msecs_show(struct device *dev,
  1642 + struct device_attribute *attr,
  1643 + char *buf)
  1644 +{
  1645 + struct gendisk *disk = dev_to_disk(dev);
  1646 +
  1647 + return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
  1648 +}
  1649 +
  1650 +static ssize_t disk_events_poll_msecs_store(struct device *dev,
  1651 + struct device_attribute *attr,
  1652 + const char *buf, size_t count)
  1653 +{
  1654 + struct gendisk *disk = dev_to_disk(dev);
  1655 + long intv;
  1656 +
  1657 + if (!count || !sscanf(buf, "%ld", &intv))
  1658 + return -EINVAL;
  1659 +
  1660 + if (intv < 0 && intv != -1)
  1661 + return -EINVAL;
  1662 +
  1663 + __disk_block_events(disk, true);
  1664 + disk->ev->poll_msecs = intv;
  1665 + __disk_unblock_events(disk, true);
  1666 +
  1667 + return count;
  1668 +}
  1669 +
  1670 +static const DEVICE_ATTR(events, S_IRUGO, disk_events_show, NULL);
  1671 +static const DEVICE_ATTR(events_async, S_IRUGO, disk_events_async_show, NULL);
  1672 +static const DEVICE_ATTR(events_poll_msecs, S_IRUGO|S_IWUSR,
  1673 + disk_events_poll_msecs_show,
  1674 + disk_events_poll_msecs_store);
  1675 +
  1676 +static const struct attribute *disk_events_attrs[] = {
  1677 + &dev_attr_events.attr,
  1678 + &dev_attr_events_async.attr,
  1679 + &dev_attr_events_poll_msecs.attr,
  1680 + NULL,
  1681 +};
  1682 +
  1683 +/*
  1684 + * The default polling interval can be specified by the kernel
  1685 + * parameter block.events_dfl_poll_msecs which defaults to 0
  1686 + * (disable). This can also be modified at runtime by writing to
  1687 + * /sys/module/block/events_dfl_poll_msecs.
  1688 + */
  1689 +static int disk_events_set_dfl_poll_msecs(const char *val,
  1690 + const struct kernel_param *kp)
  1691 +{
  1692 + struct disk_events *ev;
  1693 + int ret;
  1694 +
  1695 + ret = param_set_ulong(val, kp);
  1696 + if (ret < 0)
  1697 + return ret;
  1698 +
  1699 + mutex_lock(&disk_events_mutex);
  1700 +
  1701 + list_for_each_entry(ev, &disk_events, node)
  1702 + disk_check_events(ev->disk);
  1703 +
  1704 + mutex_unlock(&disk_events_mutex);
  1705 +
  1706 + return 0;
  1707 +}
  1708 +
  1709 +static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
  1710 + .set = disk_events_set_dfl_poll_msecs,
  1711 + .get = param_get_ulong,
  1712 +};
  1713 +
  1714 +#undef MODULE_PARAM_PREFIX
  1715 +#define MODULE_PARAM_PREFIX "block."
  1716 +
  1717 +module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
  1718 + &disk_events_dfl_poll_msecs, 0644);
  1719 +
  1720 +/*
  1721 + * disk_{add|del|release}_events - initialize and destroy disk_events.
  1722 + */
  1723 +static void disk_add_events(struct gendisk *disk)
  1724 +{
  1725 + struct disk_events *ev;
  1726 +
  1727 + if (!disk->fops->check_events || !(disk->events | disk->async_events))
  1728 + return;
  1729 +
  1730 + ev = kzalloc(sizeof(*ev), GFP_KERNEL);
  1731 + if (!ev) {
  1732 + pr_warn("%s: failed to initialize events\n", disk->disk_name);
  1733 + return;
  1734 + }
  1735 +
  1736 + if (sysfs_create_files(&disk_to_dev(disk)->kobj,
  1737 + disk_events_attrs) < 0) {
  1738 + pr_warn("%s: failed to create sysfs files for events\n",
  1739 + disk->disk_name);
  1740 + kfree(ev);
  1741 + return;
  1742 + }
  1743 +
  1744 + disk->ev = ev;
  1745 +
  1746 + INIT_LIST_HEAD(&ev->node);
  1747 + ev->disk = disk;
  1748 + spin_lock_init(&ev->lock);
  1749 + ev->block = 1;
  1750 + ev->poll_msecs = -1;
  1751 + INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
  1752 +
  1753 + mutex_lock(&disk_events_mutex);
  1754 + list_add_tail(&ev->node, &disk_events);
  1755 + mutex_unlock(&disk_events_mutex);
  1756 +
  1757 + /*
  1758 + * Block count is initialized to 1 and the following initial
  1759 + * unblock kicks it into action.
  1760 + */
  1761 + __disk_unblock_events(disk, true);
  1762 +}
  1763 +
  1764 +static void disk_del_events(struct gendisk *disk)
  1765 +{
  1766 + if (!disk->ev)
  1767 + return;
  1768 +
  1769 + __disk_block_events(disk, true);
  1770 +
  1771 + mutex_lock(&disk_events_mutex);
  1772 + list_del_init(&disk->ev->node);
  1773 + mutex_unlock(&disk_events_mutex);
  1774 +
  1775 + sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
  1776 +}
  1777 +
  1778 +static void disk_release_events(struct gendisk *disk)
  1779 +{
  1780 + /* the block count should be 1 from disk_del_events() */
  1781 + WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
  1782 + kfree(disk->ev);
  1783 +}
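
The disk_events machinery above is driven entirely through the new ->check_events() hook plus the disk->events mask. A minimal sketch of how a driver might wire it up; the foo_* names and driver data are hypothetical (sd/sr grow real implementations later in this series):

static unsigned int foo_check_events(struct gendisk *disk,
				     unsigned int clearing)
{
	struct foo_device *fd = disk->private_data;	/* assumed driver data */
	unsigned int events = 0;

	if (foo_poll_media_changed(fd))		/* assumed hardware query */
		events |= DISK_EVENT_MEDIA_CHANGE;
	return events;
}

static const struct block_device_operations foo_fops = {
	.owner		= THIS_MODULE,
	.check_events	= foo_check_events,
};

static void foo_setup_disk(struct gendisk *disk)
{
	disk->fops = &foo_fops;
	/* advertising the event is what makes add_disk() register it
	 * with disk_add_events() */
	disk->events = DISK_EVENT_MEDIA_CHANGE;
	add_disk(disk);
}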
... ... @@ -294,11 +294,12 @@
294 294 return -EINVAL;
295 295 if (get_user(n, (int __user *) arg))
296 296 return -EFAULT;
297   - if (!(mode & FMODE_EXCL) && bd_claim(bdev, &bdev) < 0)
  297 + if (!(mode & FMODE_EXCL) &&
  298 + blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0)
298 299 return -EBUSY;
299 300 ret = set_blocksize(bdev, n);
300 301 if (!(mode & FMODE_EXCL))
301   - bd_release(bdev);
  302 + blkdev_put(bdev, mode | FMODE_EXCL);
302 303 return ret;
303 304 case BLKPG:
304 305 ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);
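
This BLKBSZSET hunk is the first of many conversions below from open-coded bd_claim()/bd_release() pairs to the consolidated exclusive-open API: exclusivity is requested by adding FMODE_EXCL and a holder cookie at blkdev_get() time, and released by passing FMODE_EXCL back to blkdev_put(). A sketch of the idiom as the drbd/md/dm/block2mtd conversions below use it (claim_disk/release_disk are illustrative wrappers):

static struct block_device *claim_disk(const char *path, void *holder)
{
	/* 'holder' is any unique cookie identifying the owner; the
	 * callers below pass their own device structures. */
	return blkdev_get_by_path(path,
				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
				  holder);
}

static void release_disk(struct block_device *bdev)
{
	/* FMODE_EXCL must match the get, or the claim would leak */
	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}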
drivers/block/drbd/drbd_int.h
... ... @@ -911,8 +911,6 @@
911 911 struct drbd_backing_dev {
912 912 struct block_device *backing_bdev;
913 913 struct block_device *md_bdev;
914   - struct file *lo_file;
915   - struct file *md_file;
916 914 struct drbd_md md;
917 915 struct disk_conf dc; /* The user provided config... */
918 916 sector_t known_size; /* last known size of that backing device */
drivers/block/drbd/drbd_main.c
... ... @@ -3372,11 +3372,8 @@
3372 3372 if (ldev == NULL)
3373 3373 return;
3374 3374  
3375   - bd_release(ldev->backing_bdev);
3376   - bd_release(ldev->md_bdev);
3377   -
3378   - fput(ldev->lo_file);
3379   - fput(ldev->md_file);
  3375 + blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
  3376 + blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3380 3377  
3381 3378 kfree(ldev);
3382 3379 }
drivers/block/drbd/drbd_nl.c
... ... @@ -855,7 +855,7 @@
855 855 sector_t max_possible_sectors;
856 856 sector_t min_md_device_sectors;
857 857 struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
858   - struct inode *inode, *inode2;
  858 + struct block_device *bdev;
859 859 struct lru_cache *resync_lru = NULL;
860 860 union drbd_state ns, os;
861 861 unsigned int max_seg_s;
... ... @@ -907,78 +907,51 @@
907 907 }
908 908 }
909 909  
910   - nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
911   - if (IS_ERR(nbc->lo_file)) {
  910 + bdev = blkdev_get_by_path(nbc->dc.backing_dev,
  911 + FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev);
  912 + if (IS_ERR(bdev)) {
912 913 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
913   - PTR_ERR(nbc->lo_file));
914   - nbc->lo_file = NULL;
  914 + PTR_ERR(bdev));
915 915 retcode = ERR_OPEN_DISK;
916 916 goto fail;
917 917 }
  918 + nbc->backing_bdev = bdev;
918 919  
919   - inode = nbc->lo_file->f_dentry->d_inode;
920   -
921   - if (!S_ISBLK(inode->i_mode)) {
922   - retcode = ERR_DISK_NOT_BDEV;
923   - goto fail;
924   - }
925   -
926   - nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0);
927   - if (IS_ERR(nbc->md_file)) {
  920 + /*
  921 + * meta_dev_idx >= 0: external fixed size, possibly multiple
  922 + * drbd sharing one meta device. TODO in that case, paranoia
  923 + * check that [md_bdev, meta_dev_idx] is not yet used by some
  924 + * other drbd minor! (if you use drbd.conf + drbdadm, that
  925 + * should check it for you already; but if you don't, or
  926 + * someone fooled it, we need to double check here)
  927 + */
  928 + bdev = blkdev_get_by_path(nbc->dc.meta_dev,
  929 + FMODE_READ | FMODE_WRITE | FMODE_EXCL,
  930 + (nbc->dc.meta_dev_idx < 0) ?
  931 + (void *)mdev : (void *)drbd_m_holder);
  932 + if (IS_ERR(bdev)) {
928 933 dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
929   - PTR_ERR(nbc->md_file));
930   - nbc->md_file = NULL;
  934 + PTR_ERR(bdev));
931 935 retcode = ERR_OPEN_MD_DISK;
932 936 goto fail;
933 937 }
  938 + nbc->md_bdev = bdev;
934 939  
935   - inode2 = nbc->md_file->f_dentry->d_inode;
936   -
937   - if (!S_ISBLK(inode2->i_mode)) {
938   - retcode = ERR_MD_NOT_BDEV;
  940 + if ((nbc->backing_bdev == nbc->md_bdev) !=
  941 + (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
  942 + nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
  943 + retcode = ERR_MD_IDX_INVALID;
939 944 goto fail;
940 945 }
941 946  
942   - nbc->backing_bdev = inode->i_bdev;
943   - if (bd_claim(nbc->backing_bdev, mdev)) {
944   - printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n",
945   - nbc->backing_bdev, mdev,
946   - nbc->backing_bdev->bd_holder,
947   - nbc->backing_bdev->bd_contains->bd_holder,
948   - nbc->backing_bdev->bd_holders);
949   - retcode = ERR_BDCLAIM_DISK;
950   - goto fail;
951   - }
952   -
953 947 resync_lru = lc_create("resync", drbd_bm_ext_cache,
954 948 61, sizeof(struct bm_extent),
955 949 offsetof(struct bm_extent, lce));
956 950 if (!resync_lru) {
957 951 retcode = ERR_NOMEM;
958   - goto release_bdev_fail;
  952 + goto fail;
959 953 }
960 954  
961   - /* meta_dev_idx >= 0: external fixed size,
962   - * possibly multiple drbd sharing one meta device.
963   - * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is
964   - * not yet used by some other drbd minor!
965   - * (if you use drbd.conf + drbdadm,
966   - * that should check it for you already; but if you don't, or someone
967   - * fooled it, we need to double check here) */
968   - nbc->md_bdev = inode2->i_bdev;
969   - if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev
970   - : (void *) drbd_m_holder)) {
971   - retcode = ERR_BDCLAIM_MD_DISK;
972   - goto release_bdev_fail;
973   - }
974   -
975   - if ((nbc->backing_bdev == nbc->md_bdev) !=
976   - (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
977   - nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
978   - retcode = ERR_MD_IDX_INVALID;
979   - goto release_bdev2_fail;
980   - }
981   -
982 955 /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
983 956 drbd_md_set_sector_offsets(mdev, nbc);
984 957  
... ... @@ -987,7 +960,7 @@
987 960 (unsigned long long) drbd_get_max_capacity(nbc),
988 961 (unsigned long long) nbc->dc.disk_size);
989 962 retcode = ERR_DISK_TO_SMALL;
990   - goto release_bdev2_fail;
  963 + goto fail;
991 964 }
992 965  
993 966 if (nbc->dc.meta_dev_idx < 0) {
... ... @@ -1004,7 +977,7 @@
1004 977 dev_warn(DEV, "refusing attach: md-device too small, "
1005 978 "at least %llu sectors needed for this meta-disk type\n",
1006 979 (unsigned long long) min_md_device_sectors);
1007   - goto release_bdev2_fail;
  980 + goto fail;
1008 981 }
1009 982  
1010 983 /* Make sure the new disk is big enough
... ... @@ -1012,7 +985,7 @@
1012 985 if (drbd_get_max_capacity(nbc) <
1013 986 drbd_get_capacity(mdev->this_bdev)) {
1014 987 retcode = ERR_DISK_TO_SMALL;
1015   - goto release_bdev2_fail;
  988 + goto fail;
1016 989 }
1017 990  
1018 991 nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
... ... @@ -1035,7 +1008,7 @@
1035 1008 retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
1036 1009 drbd_resume_io(mdev);
1037 1010 if (retcode < SS_SUCCESS)
1038   - goto release_bdev2_fail;
  1011 + goto fail;
1039 1012  
1040 1013 if (!get_ldev_if_state(mdev, D_ATTACHING))
1041 1014 goto force_diskless;
1042 1015  
... ... @@ -1269,18 +1242,14 @@
1269 1242 force_diskless:
1270 1243 drbd_force_state(mdev, NS(disk, D_FAILED));
1271 1244 drbd_md_sync(mdev);
1272   - release_bdev2_fail:
1273   - if (nbc)
1274   - bd_release(nbc->md_bdev);
1275   - release_bdev_fail:
1276   - if (nbc)
1277   - bd_release(nbc->backing_bdev);
1278 1245 fail:
1279 1246 if (nbc) {
1280   - if (nbc->lo_file)
1281   - fput(nbc->lo_file);
1282   - if (nbc->md_file)
1283   - fput(nbc->md_file);
  1247 + if (nbc->backing_bdev)
  1248 + blkdev_put(nbc->backing_bdev,
  1249 + FMODE_READ | FMODE_WRITE | FMODE_EXCL);
  1250 + if (nbc->md_bdev)
  1251 + blkdev_put(nbc->md_bdev,
  1252 + FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1284 1253 kfree(nbc);
1285 1254 }
1286 1255 lc_destroy(resync_lru);
drivers/block/loop.c
... ... @@ -395,11 +395,7 @@
395 395 struct loop_device *lo = p->lo;
396 396 struct page *page = buf->page;
397 397 sector_t IV;
398   - int size, ret;
399   -
400   - ret = buf->ops->confirm(pipe, buf);
401   - if (unlikely(ret))
402   - return ret;
  398 + int size;
403 399  
404 400 IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) +
405 401 (buf->offset >> 9);
drivers/block/pktcdvd.c
... ... @@ -2296,15 +2296,12 @@
2296 2296 * so bdget() can't fail.
2297 2297 */
2298 2298 bdget(pd->bdev->bd_dev);
2299   - if ((ret = blkdev_get(pd->bdev, FMODE_READ)))
  2299 + if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd)))
2300 2300 goto out;
2301 2301  
2302   - if ((ret = bd_claim(pd->bdev, pd)))
2303   - goto out_putdev;
2304   -
2305 2302 if ((ret = pkt_get_last_written(pd, &lba))) {
2306 2303 printk(DRIVER_NAME": pkt_get_last_written failed\n");
2307   - goto out_unclaim;
  2304 + goto out_putdev;
2308 2305 }
2309 2306  
2310 2307 set_capacity(pd->disk, lba << 2);
... ... @@ -2314,7 +2311,7 @@
2314 2311 q = bdev_get_queue(pd->bdev);
2315 2312 if (write) {
2316 2313 if ((ret = pkt_open_write(pd)))
2317   - goto out_unclaim;
  2314 + goto out_putdev;
2318 2315 /*
2319 2316 * Some CDRW drives can not handle writes larger than one packet,
2320 2317 * even if the size is a multiple of the packet size.
... ... @@ -2329,23 +2326,21 @@
2329 2326 }
2330 2327  
2331 2328 if ((ret = pkt_set_segment_merging(pd, q)))
2332   - goto out_unclaim;
  2329 + goto out_putdev;
2333 2330  
2334 2331 if (write) {
2335 2332 if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) {
2336 2333 printk(DRIVER_NAME": not enough memory for buffers\n");
2337 2334 ret = -ENOMEM;
2338   - goto out_unclaim;
  2335 + goto out_putdev;
2339 2336 }
2340 2337 printk(DRIVER_NAME": %lukB available on disc\n", lba << 1);
2341 2338 }
2342 2339  
2343 2340 return 0;
2344 2341  
2345   -out_unclaim:
2346   - bd_release(pd->bdev);
2347 2342 out_putdev:
2348   - blkdev_put(pd->bdev, FMODE_READ);
  2343 + blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
2349 2344 out:
2350 2345 return ret;
2351 2346 }
... ... @@ -2362,8 +2357,7 @@
2362 2357 pkt_lock_door(pd, 0);
2363 2358  
2364 2359 pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
2365   - bd_release(pd->bdev);
2366   - blkdev_put(pd->bdev, FMODE_READ);
  2360 + blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
2367 2361  
2368 2362 pkt_shrink_pktlist(pd);
2369 2363 }
... ... @@ -2733,7 +2727,7 @@
2733 2727 bdev = bdget(dev);
2734 2728 if (!bdev)
2735 2729 return -ENOMEM;
2736   - ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY);
  2730 + ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
2737 2731 if (ret)
2738 2732 return ret;
2739 2733  
drivers/cdrom/cdrom.c
... ... @@ -1348,7 +1348,10 @@
1348 1348 if (!CDROM_CAN(CDC_SELECT_DISC))
1349 1349 return -EDRIVE_CANT_DO_THIS;
1350 1350  
1351   - (void) cdi->ops->media_changed(cdi, slot);
  1351 + if (cdi->ops->check_events)
  1352 + cdi->ops->check_events(cdi, 0, slot);
  1353 + else
  1354 + cdi->ops->media_changed(cdi, slot);
1352 1355  
1353 1356 if (slot == CDSL_NONE) {
1354 1357 /* set media changed bits, on both queues */
... ... @@ -1392,6 +1395,42 @@
1392 1395 return slot;
1393 1396 }
1394 1397  
  1398 +/*
  1399 + * As cdrom implements an extra ioctl consumer for media changed
  1400 + * events, it needs to buffer ->check_events() output, such that an
  1401 + * event is not lost for either the usual VFS path or the ioctl path.
  1402 + * cdi->{vfs|ioctl}_events are used to buffer pending events for each
  1403 + * path.
  1404 + *
  1405 + * XXX: Locking is non-existent. cdi->ops->check_events() can be
  1406 + * called in parallel and buffering fields are accessed without any
  1407 + * exclusion. The original media_changed code had the same problem.
  1408 + * It might be better to simply deprecate CDROM_MEDIA_CHANGED ioctl
  1409 + * and remove this cruft altogether. It doesn't have much usefulness
  1410 + * at this point.
  1411 + */
  1412 +static void cdrom_update_events(struct cdrom_device_info *cdi,
  1413 + unsigned int clearing)
  1414 +{
  1415 + unsigned int events;
  1416 +
  1417 + events = cdi->ops->check_events(cdi, clearing, CDSL_CURRENT);
  1418 + cdi->vfs_events |= events;
  1419 + cdi->ioctl_events |= events;
  1420 +}
  1421 +
  1422 +unsigned int cdrom_check_events(struct cdrom_device_info *cdi,
  1423 + unsigned int clearing)
  1424 +{
  1425 + unsigned int events;
  1426 +
  1427 + cdrom_update_events(cdi, clearing);
  1428 + events = cdi->vfs_events;
  1429 + cdi->vfs_events = 0;
  1430 + return events;
  1431 +}
  1432 +EXPORT_SYMBOL(cdrom_check_events);
  1433 +
1395 1434 /* We want to make media_changed accessible to the user through an
1396 1435 * ioctl. The main problem now is that we must double-buffer the
1397 1436 * low-level implementation, to assure that the VFS and the user both
... ... @@ -1403,15 +1442,26 @@
1403 1442 {
1404 1443 unsigned int mask = (1 << (queue & 1));
1405 1444 int ret = !!(cdi->mc_flags & mask);
  1445 + bool changed;
1406 1446  
1407 1447 if (!CDROM_CAN(CDC_MEDIA_CHANGED))
1408   - return ret;
  1448 + return ret;
  1449 +
1409 1450 /* changed since last call? */
1410   - if (cdi->ops->media_changed(cdi, CDSL_CURRENT)) {
  1451 + if (cdi->ops->check_events) {
  1452 + BUG_ON(!queue); /* shouldn't be called from VFS path */
  1453 + cdrom_update_events(cdi, DISK_EVENT_MEDIA_CHANGE);
  1454 + changed = cdi->ioctl_events & DISK_EVENT_MEDIA_CHANGE;
  1455 + cdi->ioctl_events = 0;
  1456 + } else
  1457 + changed = cdi->ops->media_changed(cdi, CDSL_CURRENT);
  1458 +
  1459 + if (changed) {
1411 1460 cdi->mc_flags = 0x3; /* set bit on both queues */
1412 1461 ret |= 1;
1413 1462 cdi->media_written = 0;
1414 1463 }
  1464 +
1415 1465 cdi->mc_flags &= ~mask; /* clear bit */
1416 1466 return ret;
1417 1467 }
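
With cdrom_check_events() exported, a cdrom-backed block driver can forward its gendisk hook straight to the cdrom layer, which performs the VFS/ioctl double-buffering described in the comment above. A sketch with hypothetical foo_* names (sr does essentially this later in the series):

static unsigned int foo_cd_check_events(struct gendisk *disk,
					unsigned int clearing)
{
	/* assumed driver data with an embedded cdrom_device_info cdi */
	struct foo_cd *cd = disk->private_data;

	return cdrom_check_events(&cd->cdi, clearing);
}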
... ... @@ -65,15 +65,12 @@
65 65 if (!bdev)
66 66 goto out;
67 67 igrab(bdev->bd_inode);
68   - err = blkdev_get(bdev, filp->f_mode);
  68 + err = blkdev_get(bdev, filp->f_mode | FMODE_EXCL, raw_open);
69 69 if (err)
70 70 goto out;
71   - err = bd_claim(bdev, raw_open);
72   - if (err)
73   - goto out1;
74 71 err = set_blocksize(bdev, bdev_logical_block_size(bdev));
75 72 if (err)
76   - goto out2;
  73 + goto out1;
77 74 filp->f_flags |= O_DIRECT;
78 75 filp->f_mapping = bdev->bd_inode->i_mapping;
79 76 if (++raw_devices[minor].inuse == 1)
... ... @@ -83,10 +80,8 @@
83 80 mutex_unlock(&raw_mutex);
84 81 return 0;
85 82  
86   -out2:
87   - bd_release(bdev);
88 83 out1:
89   - blkdev_put(bdev, filp->f_mode);
  84 + blkdev_put(bdev, filp->f_mode | FMODE_EXCL);
90 85 out:
91 86 mutex_unlock(&raw_mutex);
92 87 return err;
... ... @@ -110,8 +105,7 @@
110 105 }
111 106 mutex_unlock(&raw_mutex);
112 107  
113   - bd_release(bdev);
114   - blkdev_put(bdev, filp->f_mode);
  108 + blkdev_put(bdev, filp->f_mode | FMODE_EXCL);
115 109 return 0;
116 110 }
117 111  
drivers/md/dm-table.c
... ... @@ -325,15 +325,18 @@
325 325  
326 326 BUG_ON(d->dm_dev.bdev);
327 327  
328   - bdev = open_by_devnum(dev, d->dm_dev.mode);
  328 + bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr);
329 329 if (IS_ERR(bdev))
330 330 return PTR_ERR(bdev);
331   - r = bd_claim_by_disk(bdev, _claim_ptr, dm_disk(md));
332   - if (r)
333   - blkdev_put(bdev, d->dm_dev.mode);
334   - else
335   - d->dm_dev.bdev = bdev;
336   - return r;
  331 +
  332 + r = bd_link_disk_holder(bdev, dm_disk(md));
  333 + if (r) {
  334 + blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL);
  335 + return r;
  336 + }
  337 +
  338 + d->dm_dev.bdev = bdev;
  339 + return 0;
337 340 }
338 341  
339 342 /*
... ... @@ -344,8 +347,7 @@
344 347 if (!d->dm_dev.bdev)
345 348 return;
346 349  
347   - bd_release_from_disk(d->dm_dev.bdev, dm_disk(md));
348   - blkdev_put(d->dm_dev.bdev, d->dm_dev.mode);
  350 + blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL);
349 351 d->dm_dev.bdev = NULL;
350 352 }
351 353  
... ... @@ -630,7 +630,7 @@
630 630 queue_io(md, bio);
631 631 } else {
632 632 /* done with normal IO or empty flush */
633   - trace_block_bio_complete(md->queue, bio);
  633 + trace_block_bio_complete(md->queue, bio, io_error);
634 634 bio_endio(bio, io_error);
635 635 }
636 636 }
... ... @@ -990,8 +990,8 @@
990 990 if (r == DM_MAPIO_REMAPPED) {
991 991 /* the bio has been remapped so dispatch it */
992 992  
993   - trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
994   - tio->io->bio->bi_bdev->bd_dev, sector);
  993 + trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
  994 + tio->io->bio->bi_bdev->bd_dev, sector);
995 995  
996 996 generic_make_request(clone);
997 997 } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
... ... @@ -1879,7 +1879,7 @@
1879 1879 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
1880 1880  
1881 1881 list_add_rcu(&rdev->same_set, &mddev->disks);
1882   - bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
  1882 + bd_link_disk_holder(rdev->bdev, mddev->gendisk);
1883 1883  
1884 1884 /* May as well allow recovery to be retried once */
1885 1885 mddev->recovery_disabled = 0;
... ... @@ -1906,7 +1906,6 @@
1906 1906 MD_BUG();
1907 1907 return;
1908 1908 }
1909   - bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1910 1909 list_del_rcu(&rdev->same_set);
1911 1910 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1912 1911 rdev->mddev = NULL;
... ... @@ -1934,19 +1933,13 @@
1934 1933 struct block_device *bdev;
1935 1934 char b[BDEVNAME_SIZE];
1936 1935  
1937   - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
  1936 + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
  1937 + shared ? (mdk_rdev_t *)lock_rdev : rdev);
1938 1938 if (IS_ERR(bdev)) {
1939 1939 printk(KERN_ERR "md: could not open %s.\n",
1940 1940 __bdevname(dev, b));
1941 1941 return PTR_ERR(bdev);
1942 1942 }
1943   - err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
1944   - if (err) {
1945   - printk(KERN_ERR "md: could not bd_claim %s.\n",
1946   - bdevname(bdev, b));
1947   - blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1948   - return err;
1949   - }
1950 1943 if (!shared)
1951 1944 set_bit(AllReserved, &rdev->flags);
1952 1945 rdev->bdev = bdev;
... ... @@ -1959,8 +1952,7 @@
1959 1952 rdev->bdev = NULL;
1960 1953 if (!bdev)
1961 1954 MD_BUG();
1962   - bd_release(bdev);
1963   - blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
  1955 + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1964 1956 }
1965 1957  
1966 1958 void md_autodetect_dev(dev_t dev);
drivers/mtd/devices/block2mtd.c
... ... @@ -224,7 +224,7 @@
224 224 if (dev->blkdev) {
225 225 invalidate_mapping_pages(dev->blkdev->bd_inode->i_mapping,
226 226 0, -1);
227   - close_bdev_exclusive(dev->blkdev, FMODE_READ|FMODE_WRITE);
  227 + blkdev_put(dev->blkdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
228 228 }
229 229  
230 230 kfree(dev);
... ... @@ -234,6 +234,7 @@
234 234 /* FIXME: ensure that mtd->size % erase_size == 0 */
235 235 static struct block2mtd_dev *add_device(char *devname, int erase_size)
236 236 {
  237 + const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
237 238 struct block_device *bdev;
238 239 struct block2mtd_dev *dev;
239 240 char *name;
... ... @@ -246,7 +247,7 @@
246 247 return NULL;
247 248  
248 249 /* Get a handle on the device */
249   - bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, NULL);
  250 + bdev = blkdev_get_by_path(devname, mode, dev);
250 251 #ifndef MODULE
251 252 if (IS_ERR(bdev)) {
252 253  
... ... @@ -254,9 +255,8 @@
254 255 to resolve the device name by other means. */
255 256  
256 257 dev_t devt = name_to_dev_t(devname);
257   - if (devt) {
258   - bdev = open_by_devnum(devt, FMODE_WRITE | FMODE_READ);
259   - }
  258 + if (devt)
  259 + bdev = blkdev_get_by_dev(devt, mode, dev);
260 260 }
261 261 #endif
262 262  
drivers/s390/block/dasd_genhd.c
... ... @@ -103,7 +103,7 @@
103 103 struct block_device *bdev;
104 104  
105 105 bdev = bdget_disk(block->gdp, 0);
106   - if (!bdev || blkdev_get(bdev, FMODE_READ) < 0)
  106 + if (!bdev || blkdev_get(bdev, FMODE_READ, NULL) < 0)
107 107 return -ENODEV;
108 108 /*
109 109 * See fs/partition/check.c:register_disk,rescan_partitions
drivers/scsi/scsi_lib.c
... ... @@ -1977,8 +1977,7 @@
1977 1977 * in.
1978 1978 *
1979 1979 * Returns zero if unsuccessful or an error if TUR failed. For
1980   - * removable media, a return of NOT_READY or UNIT_ATTENTION is
1981   - * translated to success, with the ->changed flag updated.
  1980 + * removable media, UNIT_ATTENTION sets the ->changed flag.
1982 1981 **/
1983 1982 int
1984 1983 scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries,
... ... @@ -2005,16 +2004,6 @@
2005 2004 } while (scsi_sense_valid(sshdr) &&
2006 2005 sshdr->sense_key == UNIT_ATTENTION && --retries);
2007 2006  
2008   - if (!sshdr)
2009   - /* could not allocate sense buffer, so can't process it */
2010   - return result;
2011   -
2012   - if (sdev->removable && scsi_sense_valid(sshdr) &&
2013   - (sshdr->sense_key == UNIT_ATTENTION ||
2014   - sshdr->sense_key == NOT_READY)) {
2015   - sdev->changed = 1;
2016   - result = 0;
2017   - }
2018 2007 if (!sshdr_external)
2019 2008 kfree(sshdr);
2020 2009 return result;
... ... @@ -1043,15 +1043,7 @@
1043 1043 sshdr);
1044 1044 }
1045 1045  
1046   - /*
1047   - * Unable to test, unit probably not ready. This usually
1048   - * means there is no disc in the drive. Mark as changed,
1049   - * and we will figure it out later once the drive is
1050   - * available again.
1051   - */
1052   - if (retval || (scsi_sense_valid(sshdr) &&
1053   - /* 0x3a is medium not present */
1054   - sshdr->asc == 0x3a)) {
  1046 + if (retval) {
1055 1047 set_media_not_present(sdkp);
1056 1048 goto out;
1057 1049 }
... ... @@ -104,14 +104,15 @@
104 104 static void get_sectorsize(struct scsi_cd *);
105 105 static void get_capabilities(struct scsi_cd *);
106 106  
107   -static int sr_media_change(struct cdrom_device_info *, int);
  107 +static unsigned int sr_check_events(struct cdrom_device_info *cdi,
  108 + unsigned int clearing, int slot);
108 109 static int sr_packet(struct cdrom_device_info *, struct packet_command *);
109 110  
110 111 static struct cdrom_device_ops sr_dops = {
111 112 .open = sr_open,
112 113 .release = sr_release,
113 114 .drive_status = sr_drive_status,
114   - .media_changed = sr_media_change,
  115 + .check_events = sr_check_events,
115 116 .tray_move = sr_tray_move,
116 117 .lock_door = sr_lock_door,
117 118 .select_speed = sr_select_speed,
... ... @@ -165,90 +166,96 @@
165 166 mutex_unlock(&sr_ref_mutex);
166 167 }
167 168  
168   -/* identical to scsi_test_unit_ready except that it doesn't
169   - * eat the NOT_READY returns for removable media */
170   -int sr_test_unit_ready(struct scsi_device *sdev, struct scsi_sense_hdr *sshdr)
  169 +static unsigned int sr_get_events(struct scsi_device *sdev)
171 170 {
172   - int retries = MAX_RETRIES;
173   - int the_result;
174   - u8 cmd[] = {TEST_UNIT_READY, 0, 0, 0, 0, 0 };
  171 + u8 buf[8];
  172 + u8 cmd[] = { GET_EVENT_STATUS_NOTIFICATION,
  173 + 1, /* polled */
  174 + 0, 0, /* reserved */
  175 + 1 << 4, /* notification class: media */
  176 + 0, 0, /* reserved */
  177 + 0, sizeof(buf), /* allocation length */
  178 + 0, /* control */
  179 + };
  180 + struct event_header *eh = (void *)buf;
  181 + struct media_event_desc *med = (void *)(buf + 4);
  182 + struct scsi_sense_hdr sshdr;
  183 + int result;
175 184  
176   - /* issue TEST_UNIT_READY until the initial startup UNIT_ATTENTION
177   - * conditions are gone, or a timeout happens
178   - */
179   - do {
180   - the_result = scsi_execute_req(sdev, cmd, DMA_NONE, NULL,
181   - 0, sshdr, SR_TIMEOUT,
182   - retries--, NULL);
183   - if (scsi_sense_valid(sshdr) &&
184   - sshdr->sense_key == UNIT_ATTENTION)
185   - sdev->changed = 1;
  185 + result = scsi_execute_req(sdev, cmd, DMA_FROM_DEVICE, buf, sizeof(buf),
  186 + &sshdr, SR_TIMEOUT, MAX_RETRIES, NULL);
  187 + if (scsi_sense_valid(&sshdr) && sshdr.sense_key == UNIT_ATTENTION)
  188 + return DISK_EVENT_MEDIA_CHANGE;
186 189  
187   - } while (retries > 0 &&
188   - (!scsi_status_is_good(the_result) ||
189   - (scsi_sense_valid(sshdr) &&
190   - sshdr->sense_key == UNIT_ATTENTION)));
191   - return the_result;
  190 + if (result || be16_to_cpu(eh->data_len) < sizeof(*med))
  191 + return 0;
  192 +
  193 + if (eh->nea || eh->notification_class != 0x4)
  194 + return 0;
  195 +
  196 + if (med->media_event_code == 1)
  197 + return DISK_EVENT_EJECT_REQUEST;
  198 + else if (med->media_event_code == 2)
  199 + return DISK_EVENT_MEDIA_CHANGE;
  200 + return 0;
192 201 }
193 202  
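
sr_get_events() above decodes the 8-byte GET_EVENT_STATUS_NOTIFICATION reply through struct event_header and struct media_event_desc from include/linux/cdrom.h. As a rough orientation only, the layout the code consumes is sketched below assuming little-endian bit order; the real declarations use endian-conditional bitfields, so treat the exact field ordering here as an assumption:

    /* Sketch (assumption): MMC GESN reply layout as read by sr_get_events().
     * Simplified from include/linux/cdrom.h; little-endian bit order only. */
    struct event_header {
        __be16 data_len;                /* length of the data that follows */
        __u8 notification_class:3;      /* 0x4 == media class */
        __u8 reserved1:4;
        __u8 nea:1;                     /* "no event available" */
        __u8 supp_event_class;
    };

    struct media_event_desc {           /* starts at buf + 4 above */
        __u8 media_event_code:4;        /* 1 == eject request, 2 == new media */
        __u8 reserved1:4;
        __u8 door_open:1;
        __u8 media_present:1;
        __u8 reserved2:6;
        __u8 start_slot;
        __u8 end_slot;
    };
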
194 203 /*
195   - * This function checks to see if the media has been changed in the
196   - * CDROM drive. It is possible that we have already sensed a change,
197   - * or the drive may have sensed one and not yet reported it. We must
198   - * be ready for either case. This function always reports the current
199   - * value of the changed bit. If flag is 0, then the changed bit is reset.
200   - * This function could be done as an ioctl, but we would need to have
201   - * an inode for that to work, and we do not always have one.
  204 + * This function checks to see if the media has been changed or eject
  205 + * button has been pressed. It is possible that we have already
  206 + * sensed a change, or the drive may have sensed one and not yet
  207 + * reported it. The past events are accumulated in sdev->changed and
  208 + * returned together with the current state.
202 209 */
203   -
204   -static int sr_media_change(struct cdrom_device_info *cdi, int slot)
  210 +static unsigned int sr_check_events(struct cdrom_device_info *cdi,
  211 + unsigned int clearing, int slot)
205 212 {
206 213 struct scsi_cd *cd = cdi->handle;
207   - int retval;
208   - struct scsi_sense_hdr *sshdr;
  214 + bool last_present;
  215 + struct scsi_sense_hdr sshdr;
  216 + unsigned int events;
  217 + int ret;
209 218  
210   - if (CDSL_CURRENT != slot) {
211   - /* no changer support */
212   - return -EINVAL;
213   - }
  219 + /* no changer support */
  220 + if (CDSL_CURRENT != slot)
  221 + return 0;
214 222  
215   - sshdr = kzalloc(sizeof(*sshdr), GFP_KERNEL);
216   - retval = sr_test_unit_ready(cd->device, sshdr);
217   - if (retval || (scsi_sense_valid(sshdr) &&
218   - /* 0x3a is medium not present */
219   - sshdr->asc == 0x3a)) {
220   - /* Media not present or unable to test, unit probably not
221   - * ready. This usually means there is no disc in the drive.
222   - * Mark as changed, and we will figure it out later once
223   - * the drive is available again.
224   - */
225   - cd->device->changed = 1;
226   - /* This will force a flush, if called from check_disk_change */
227   - retval = 1;
228   - goto out;
229   - };
  223 + events = sr_get_events(cd->device);
  224 + /*
  225 + * GET_EVENT_STATUS_NOTIFICATION is enough unless MEDIA_CHANGE
  226 + * is being cleared. Note that there are devices which hang
  227 + * if asked to execute TUR repeatedly.
  228 + */
  229 + if (!(clearing & DISK_EVENT_MEDIA_CHANGE))
  230 + goto skip_tur;
230 231  
231   - retval = cd->device->changed;
232   - cd->device->changed = 0;
233   - /* If the disk changed, the capacity will now be different,
234   - * so we force a re-read of this information */
235   - if (retval) {
236   - /* check multisession offset etc */
237   - sr_cd_check(cdi);
238   - get_sectorsize(cd);
  232 + /* let's see whether the media is there with TUR */
  233 + last_present = cd->media_present;
  234 + ret = scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr);
  235 +
  236 + /*
  237 + * Media is considered to be present if TUR succeeds or fails with
  238 + * sense data indicating something other than media-not-present
  239 + * (ASC 0x3a).
  240 + */
  241 + cd->media_present = scsi_status_is_good(ret) ||
  242 + (scsi_sense_valid(&sshdr) && sshdr.asc != 0x3a);
  243 +
  244 + if (last_present != cd->media_present)
  245 + events |= DISK_EVENT_MEDIA_CHANGE;
  246 +skip_tur:
  247 + if (cd->device->changed) {
  248 + events |= DISK_EVENT_MEDIA_CHANGE;
  249 + cd->device->changed = 0;
239 250 }
240 251  
241   -out:
242   - /* Notify userspace, that media has changed. */
243   - if (retval != cd->previous_state)
  252 + /* for backward compatibility */
  253 + if (events & DISK_EVENT_MEDIA_CHANGE)
244 254 sdev_evt_send_simple(cd->device, SDEV_EVT_MEDIA_CHANGE,
245 255 GFP_KERNEL);
246   - cd->previous_state = retval;
247   - kfree(sshdr);
248   -
249   - return retval;
  256 + return events;
250 257 }
251   -
  258 +
252 259 /*
253 260 * sr_done is the interrupt routine for the device driver.
254 261 *
... ... @@ -533,19 +540,35 @@
533 540 return ret;
534 541 }
535 542  
536   -static int sr_block_media_changed(struct gendisk *disk)
  543 +static unsigned int sr_block_check_events(struct gendisk *disk,
  544 + unsigned int clearing)
537 545 {
538 546 struct scsi_cd *cd = scsi_cd(disk);
539   - return cdrom_media_changed(&cd->cdi);
  547 + return cdrom_check_events(&cd->cdi, clearing);
540 548 }
541 549  
  550 +static int sr_block_revalidate_disk(struct gendisk *disk)
  551 +{
  552 + struct scsi_cd *cd = scsi_cd(disk);
  553 + struct scsi_sense_hdr sshdr;
  554 +
  555 + /* if the unit is not ready, nothing more to do */
  556 + if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr))
  557 + return 0;
  558 +
  559 + sr_cd_check(&cd->cdi);
  560 + get_sectorsize(cd);
  561 + return 0;
  562 +}
  563 +
542 564 static const struct block_device_operations sr_bdops =
543 565 {
544 566 .owner = THIS_MODULE,
545 567 .open = sr_block_open,
546 568 .release = sr_block_release,
547 569 .ioctl = sr_block_ioctl,
548   - .media_changed = sr_block_media_changed,
  570 + .check_events = sr_block_check_events,
  571 + .revalidate_disk = sr_block_revalidate_disk,
549 572 /*
550 573 * No compat_ioctl for now because sr_block_ioctl never
551 574 * seems to pass arbitrary ioctls down to host drivers.
... ... @@ -618,6 +641,7 @@
618 641 sprintf(disk->disk_name, "sr%d", minor);
619 642 disk->fops = &sr_bdops;
620 643 disk->flags = GENHD_FL_CD;
  644 + disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST;
621 645  
622 646 blk_queue_rq_timeout(sdev->request_queue, SR_TIMEOUT);
623 647  
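
A driver converting to the new interface, as sr does here, declares which events it can report in disk->events and implements ->check_events() in place of ->media_changed(). The contract, sketched with invented names and under the assumption that the driver keeps its own pending-event mask:

    /* Hypothetical minimal ->check_events(): report accumulated events,
     * clearing only those the block layer asked to clear. my_dev and
     * my_pending_events are illustrative, not from this patch. */
    struct my_dev {
        unsigned int my_pending_events; /* events seen since last check */
    };

    static unsigned int my_check_events(struct gendisk *disk,
                                        unsigned int clearing)
    {
        struct my_dev *dev = disk->private_data;
        unsigned int pending = dev->my_pending_events;

        dev->my_pending_events = pending & ~clearing;
        return pending;
    }
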
... ... @@ -627,7 +651,7 @@
627 651 cd->disk = disk;
628 652 cd->capacity = 0x1fffff;
629 653 cd->device->changed = 1; /* force recheck CD type */
630   - cd->previous_state = 1;
  654 + cd->media_present = 1;
631 655 cd->use = 1;
632 656 cd->readcd_known = 0;
633 657 cd->readcd_cdda = 0;
... ... @@ -780,7 +804,7 @@
780 804 }
781 805  
782 806 /* eat unit attentions */
783   - sr_test_unit_ready(cd->device, &sshdr);
  807 + scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr);
784 808  
785 809 /* ask for mode page 0x2a */
786 810 rc = scsi_mode_sense(cd->device, 0, 0x2a, buffer, 128,
... ... @@ -40,7 +40,7 @@
40 40 unsigned xa_flag:1; /* CD has XA sectors ? */
41 41 unsigned readcd_known:1; /* drive supports READ_CD (0xbe) */
42 42 unsigned readcd_cdda:1; /* reading audio data using READ_CD */
43   - unsigned previous_state:1; /* media has changed */
  43 + unsigned media_present:1; /* media is present */
44 44 struct cdrom_device_info cdi;
45 45 /* We hold gendisk and scsi_device references on probe and use
46 46 * the refs on this kref to decide when to release them */
... ... @@ -61,7 +61,6 @@
61 61 int sr_audio_ioctl(struct cdrom_device_info *, unsigned int, void *);
62 62  
63 63 int sr_is_xa(Scsi_CD *);
64   -int sr_test_unit_ready(struct scsi_device *sdev, struct scsi_sense_hdr *sshdr);
65 64  
66 65 /* sr_vendor.c */
67 66 void sr_vendor_init(Scsi_CD *);
drivers/scsi/sr_ioctl.c
... ... @@ -307,7 +307,7 @@
307 307 /* we have no changer support */
308 308 return -EINVAL;
309 309 }
310   - if (0 == sr_test_unit_ready(cd->device, &sshdr))
  310 + if (!scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr))
311 311 return CDS_DISC_OK;
312 312  
313 313 /* SK/ASC/ASCQ of 2/4/1 means "unit is becoming ready" */
drivers/usb/gadget/storage_common.c
... ... @@ -543,7 +543,7 @@
543 543 ro = curlun->initially_ro;
544 544 if (!ro) {
545 545 filp = filp_open(filename, O_RDWR | O_LARGEFILE, 0);
546   - if (-EROFS == PTR_ERR(filp))
  546 + if (PTR_ERR(filp) == -EROFS || PTR_ERR(filp) == -EACCES)
547 547 ro = 1;
548 548 }
549 549 if (ro)
... ... @@ -558,10 +558,7 @@
558 558  
559 559 if (filp->f_path.dentry)
560 560 inode = filp->f_path.dentry->d_inode;
561   - if (inode && S_ISBLK(inode->i_mode)) {
562   - if (bdev_read_only(inode->i_bdev))
563   - ro = 1;
564   - } else if (!inode || !S_ISREG(inode->i_mode)) {
  561 + if (!inode || (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))) {
565 562 LINFO(curlun, "invalid file type: %s\n", filename);
566 563 goto out;
567 564 }
... ... @@ -782,7 +782,12 @@
782 782 {
783 783 unsigned int i;
784 784  
785   - kintegrityd_wq = create_workqueue("kintegrityd");
  785 + /*
  786 + * kintegrityd won't block much but may burn a lot of CPU cycles.
  787 + * Make it highpri CPU intensive wq with max concurrency of 1.
  788 + */
  789 + kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
  790 + WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
786 791 if (!kintegrityd_wq)
787 792 panic("Failed to create kintegrityd\n");
788 793  
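
The replacement comment spells out the reasoning: alloc_workqueue() takes a name, WQ_* flags, and a max_active limit, so the old one-size-fits-all create_workqueue() call can be tuned per user. A sketch of the same idiom with invented names:

    /* Hypothetical init of a reclaim-safe, high-priority, CPU-intensive
     * workqueue capped at one in-flight work item, mirroring kintegrityd. */
    #include <linux/workqueue.h>

    static struct workqueue_struct *my_wq;

    static int __init my_init(void)
    {
        my_wq = alloc_workqueue("my_wq", WQ_MEM_RECLAIM | WQ_HIGHPRI |
                                WQ_CPU_INTENSIVE, 1);
        if (!my_wq)
            return -ENOMEM;
        return 0;
    }
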
... ... @@ -432,9 +432,6 @@
432 432 mutex_init(&bdev->bd_mutex);
433 433 INIT_LIST_HEAD(&bdev->bd_inodes);
434 434 INIT_LIST_HEAD(&bdev->bd_list);
435   -#ifdef CONFIG_SYSFS
436   - INIT_LIST_HEAD(&bdev->bd_holder_list);
437   -#endif
438 435 inode_init_once(&ei->vfs_inode);
439 436 /* Initialize mutex for freeze. */
440 437 mutex_init(&bdev->bd_fsfreeze_mutex);
... ... @@ -669,7 +666,7 @@
669 666 else if (bdev->bd_contains == bdev)
670 667 return true; /* is a whole device which isn't held */
671 668  
672   - else if (whole->bd_holder == bd_claim)
  669 + else if (whole->bd_holder == bd_may_claim)
673 670 return true; /* is a partition of a device that is being partitioned */
674 671 else if (whole->bd_holder != NULL)
675 672 return false; /* is a partition of a held device */
... ... @@ -781,440 +778,88 @@
781 778 }
782 779 }
783 780  
784   -/* releases bdev_lock */
785   -static void __bd_abort_claiming(struct block_device *whole, void *holder)
786   -{
787   - BUG_ON(whole->bd_claiming != holder);
788   - whole->bd_claiming = NULL;
789   - wake_up_bit(&whole->bd_claiming, 0);
790   -
791   - spin_unlock(&bdev_lock);
792   - bdput(whole);
793   -}
794   -
795   -/**
796   - * bd_abort_claiming - abort claiming a block device
797   - * @whole: whole block device returned by bd_start_claiming()
798   - * @holder: holder trying to claim @bdev
799   - *
800   - * Abort a claiming block started by bd_start_claiming(). Note that
801   - * @whole is not the block device to be claimed but the whole device
802   - * returned by bd_start_claiming().
803   - *
804   - * CONTEXT:
805   - * Grabs and releases bdev_lock.
806   - */
807   -static void bd_abort_claiming(struct block_device *whole, void *holder)
808   -{
809   - spin_lock(&bdev_lock);
810   - __bd_abort_claiming(whole, holder); /* releases bdev_lock */
811   -}
812   -
813   -/* increment holders when we have a legitimate claim. requires bdev_lock */
814   -static void __bd_claim(struct block_device *bdev, struct block_device *whole,
815   - void *holder)
816   -{
817   - /* note that for a whole device bd_holders
818   - * will be incremented twice, and bd_holder will
819   - * be set to bd_claim before being set to holder
820   - */
821   - whole->bd_holders++;
822   - whole->bd_holder = bd_claim;
823   - bdev->bd_holders++;
824   - bdev->bd_holder = holder;
825   -}
826   -
827   -/**
828   - * bd_finish_claiming - finish claiming a block device
829   - * @bdev: block device of interest (passed to bd_start_claiming())
830   - * @whole: whole block device returned by bd_start_claiming()
831   - * @holder: holder trying to claim @bdev
832   - *
833   - * Finish a claiming block started by bd_start_claiming().
834   - *
835   - * CONTEXT:
836   - * Grabs and releases bdev_lock.
837   - */
838   -static void bd_finish_claiming(struct block_device *bdev,
839   - struct block_device *whole, void *holder)
840   -{
841   - spin_lock(&bdev_lock);
842   - BUG_ON(!bd_may_claim(bdev, whole, holder));
843   - __bd_claim(bdev, whole, holder);
844   - __bd_abort_claiming(whole, holder); /* not actually an abort */
845   -}
846   -
847   -/**
848   - * bd_claim - claim a block device
849   - * @bdev: block device to claim
850   - * @holder: holder trying to claim @bdev
851   - *
852   - * Try to claim @bdev which must have been opened successfully.
853   - *
854   - * CONTEXT:
855   - * Might sleep.
856   - *
857   - * RETURNS:
858   - * 0 if successful, -EBUSY if @bdev is already claimed.
859   - */
860   -int bd_claim(struct block_device *bdev, void *holder)
861   -{
862   - struct block_device *whole = bdev->bd_contains;
863   - int res;
864   -
865   - might_sleep();
866   -
867   - spin_lock(&bdev_lock);
868   - res = bd_prepare_to_claim(bdev, whole, holder);
869   - if (res == 0)
870   - __bd_claim(bdev, whole, holder);
871   - spin_unlock(&bdev_lock);
872   -
873   - return res;
874   -}
875   -EXPORT_SYMBOL(bd_claim);
876   -
877   -void bd_release(struct block_device *bdev)
878   -{
879   - spin_lock(&bdev_lock);
880   - if (!--bdev->bd_contains->bd_holders)
881   - bdev->bd_contains->bd_holder = NULL;
882   - if (!--bdev->bd_holders)
883   - bdev->bd_holder = NULL;
884   - spin_unlock(&bdev_lock);
885   -}
886   -
887   -EXPORT_SYMBOL(bd_release);
888   -
889 781 #ifdef CONFIG_SYSFS
890   -/*
891   - * Functions for bd_claim_by_kobject / bd_release_from_kobject
892   - *
893   - * If a kobject is passed to bd_claim_by_kobject()
894   - * and the kobject has a parent directory,
895   - * following symlinks are created:
896   - * o from the kobject to the claimed bdev
897   - * o from "holders" directory of the bdev to the parent of the kobject
898   - * bd_release_from_kobject() removes these symlinks.
899   - *
900   - * Example:
901   - * If /dev/dm-0 maps to /dev/sda, kobject corresponding to
902   - * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then:
903   - * /sys/block/dm-0/slaves/sda --> /sys/block/sda
904   - * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
905   - */
906   -
907 782 static int add_symlink(struct kobject *from, struct kobject *to)
908 783 {
909   - if (!from || !to)
910   - return 0;
911 784 return sysfs_create_link(from, to, kobject_name(to));
912 785 }
913 786  
914 787 static void del_symlink(struct kobject *from, struct kobject *to)
915 788 {
916   - if (!from || !to)
917   - return;
918 789 sysfs_remove_link(from, kobject_name(to));
919 790 }
920 791  
921   -/*
922   - * 'struct bd_holder' contains pointers to kobjects symlinked by
923   - * bd_claim_by_kobject.
924   - * It's connected to bd_holder_list which is protected by bdev->bd_sem.
925   - */
926   -struct bd_holder {
927   - struct list_head list; /* chain of holders of the bdev */
928   - int count; /* references from the holder */
929   - struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */
930   - struct kobject *hdev; /* e.g. "/block/dm-0" */
931   - struct kobject *hdir; /* e.g. "/block/sda/holders" */
932   - struct kobject *sdev; /* e.g. "/block/sda" */
933   -};
934   -
935   -/*
936   - * Get references of related kobjects at once.
937   - * Returns 1 on success. 0 on failure.
938   - *
939   - * Should call bd_holder_release_dirs() after successful use.
940   - */
941   -static int bd_holder_grab_dirs(struct block_device *bdev,
942   - struct bd_holder *bo)
943   -{
944   - if (!bdev || !bo)
945   - return 0;
946   -
947   - bo->sdir = kobject_get(bo->sdir);
948   - if (!bo->sdir)
949   - return 0;
950   -
951   - bo->hdev = kobject_get(bo->sdir->parent);
952   - if (!bo->hdev)
953   - goto fail_put_sdir;
954   -
955   - bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
956   - if (!bo->sdev)
957   - goto fail_put_hdev;
958   -
959   - bo->hdir = kobject_get(bdev->bd_part->holder_dir);
960   - if (!bo->hdir)
961   - goto fail_put_sdev;
962   -
963   - return 1;
964   -
965   -fail_put_sdev:
966   - kobject_put(bo->sdev);
967   -fail_put_hdev:
968   - kobject_put(bo->hdev);
969   -fail_put_sdir:
970   - kobject_put(bo->sdir);
971   -
972   - return 0;
973   -}
974   -
975   -/* Put references of related kobjects at once. */
976   -static void bd_holder_release_dirs(struct bd_holder *bo)
977   -{
978   - kobject_put(bo->hdir);
979   - kobject_put(bo->sdev);
980   - kobject_put(bo->hdev);
981   - kobject_put(bo->sdir);
982   -}
983   -
984   -static struct bd_holder *alloc_bd_holder(struct kobject *kobj)
985   -{
986   - struct bd_holder *bo;
987   -
988   - bo = kzalloc(sizeof(*bo), GFP_KERNEL);
989   - if (!bo)
990   - return NULL;
991   -
992   - bo->count = 1;
993   - bo->sdir = kobj;
994   -
995   - return bo;
996   -}
997   -
998   -static void free_bd_holder(struct bd_holder *bo)
999   -{
1000   - kfree(bo);
1001   -}
1002   -
1003 792 /**
1004   - * find_bd_holder - find matching struct bd_holder from the block device
  793 + * bd_link_disk_holder - create symlinks between holding disk and slave bdev
  794 + * @bdev: the claimed slave bdev
  795 + * @disk: the holding disk
1005 796 *
1006   - * @bdev: struct block device to be searched
1007   - * @bo: target struct bd_holder
  797 + * This function creates the following sysfs symlinks.
1008 798 *
1009   - * Returns matching entry with @bo in @bdev->bd_holder_list.
1010   - * If found, increment the reference count and return the pointer.
1011   - * If not found, returns NULL.
1012   - */
1013   -static struct bd_holder *find_bd_holder(struct block_device *bdev,
1014   - struct bd_holder *bo)
1015   -{
1016   - struct bd_holder *tmp;
1017   -
1018   - list_for_each_entry(tmp, &bdev->bd_holder_list, list)
1019   - if (tmp->sdir == bo->sdir) {
1020   - tmp->count++;
1021   - return tmp;
1022   - }
1023   -
1024   - return NULL;
1025   -}
1026   -
1027   -/**
1028   - * add_bd_holder - create sysfs symlinks for bd_claim() relationship
  799 + * - from "slaves" directory of the holder @disk to the claimed @bdev
  800 + * - from "holders" directory of the @bdev to the holder @disk
1029 801 *
1030   - * @bdev: block device to be bd_claimed
1031   - * @bo: preallocated and initialized by alloc_bd_holder()
  802 + * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
  803 + * passed to bd_link_disk_holder(), then:
1032 804 *
1033   - * Add @bo to @bdev->bd_holder_list, create symlinks.
  805 + * /sys/block/dm-0/slaves/sda --> /sys/block/sda
  806 + * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
1034 807 *
1035   - * Returns 0 if symlinks are created.
1036   - * Returns -ve if something fails.
1037   - */
1038   -static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo)
1039   -{
1040   - int err;
1041   -
1042   - if (!bo)
1043   - return -EINVAL;
1044   -
1045   - if (!bd_holder_grab_dirs(bdev, bo))
1046   - return -EBUSY;
1047   -
1048   - err = add_symlink(bo->sdir, bo->sdev);
1049   - if (err)
1050   - return err;
1051   -
1052   - err = add_symlink(bo->hdir, bo->hdev);
1053   - if (err) {
1054   - del_symlink(bo->sdir, bo->sdev);
1055   - return err;
1056   - }
1057   -
1058   - list_add_tail(&bo->list, &bdev->bd_holder_list);
1059   - return 0;
1060   -}
1061   -
1062   -/**
1063   - * del_bd_holder - delete sysfs symlinks for bd_claim() relationship
  808 + * The caller must have claimed @bdev before calling this function and
  809 + * ensure that both @bdev and @disk are valid during the creation and
  810 + * lifetime of these symlinks.
1064 811 *
1065   - * @bdev: block device to be bd_claimed
1066   - * @kobj: holder's kobject
  812 + * CONTEXT:
  813 + * Might sleep.
1067 814 *
1068   - * If there is matching entry with @kobj in @bdev->bd_holder_list
1069   - * and no other bd_claim() from the same kobject,
1070   - * remove the struct bd_holder from the list, delete symlinks for it.
1071   - *
1072   - * Returns a pointer to the struct bd_holder when it's removed from the list
1073   - * and ready to be freed.
1074   - * Returns NULL if matching claim isn't found or there is other bd_claim()
1075   - * by the same kobject.
  815 + * RETURNS:
  816 + * 0 on success, -errno on failure.
1076 817 */
1077   -static struct bd_holder *del_bd_holder(struct block_device *bdev,
1078   - struct kobject *kobj)
  818 +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
1079 819 {
1080   - struct bd_holder *bo;
  820 + int ret = 0;
1081 821  
1082   - list_for_each_entry(bo, &bdev->bd_holder_list, list) {
1083   - if (bo->sdir == kobj) {
1084   - bo->count--;
1085   - BUG_ON(bo->count < 0);
1086   - if (!bo->count) {
1087   - list_del(&bo->list);
1088   - del_symlink(bo->sdir, bo->sdev);
1089   - del_symlink(bo->hdir, bo->hdev);
1090   - bd_holder_release_dirs(bo);
1091   - return bo;
1092   - }
1093   - break;
1094   - }
1095   - }
  822 + mutex_lock(&bdev->bd_mutex);
1096 823  
1097   - return NULL;
1098   -}
  824 + WARN_ON_ONCE(!bdev->bd_holder || bdev->bd_holder_disk);
1099 825  
1100   -/**
1101   - * bd_claim_by_kobject - bd_claim() with additional kobject signature
1102   - *
1103   - * @bdev: block device to be claimed
1104   - * @holder: holder's signature
1105   - * @kobj: holder's kobject
1106   - *
1107   - * Do bd_claim() and if it succeeds, create sysfs symlinks between
1108   - * the bdev and the holder's kobject.
1109   - * Use bd_release_from_kobject() when releasing the claimed bdev.
1110   - *
1111   - * Returns 0 on success. (same as bd_claim())
1112   - * Returns errno on failure.
1113   - */
1114   -static int bd_claim_by_kobject(struct block_device *bdev, void *holder,
1115   - struct kobject *kobj)
1116   -{
1117   - int err;
1118   - struct bd_holder *bo, *found;
  826 + /* FIXME: remove the following once add_disk() handles errors */
  827 + if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir))
  828 + goto out_unlock;
1119 829  
1120   - if (!kobj)
1121   - return -EINVAL;
  830 + ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
  831 + if (ret)
  832 + goto out_unlock;
1122 833  
1123   - bo = alloc_bd_holder(kobj);
1124   - if (!bo)
1125   - return -ENOMEM;
  834 + ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
  835 + if (ret) {
  836 + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
  837 + goto out_unlock;
  838 + }
1126 839  
1127   - mutex_lock(&bdev->bd_mutex);
1128   -
1129   - err = bd_claim(bdev, holder);
1130   - if (err)
1131   - goto fail;
1132   -
1133   - found = find_bd_holder(bdev, bo);
1134   - if (found)
1135   - goto fail;
1136   -
1137   - err = add_bd_holder(bdev, bo);
1138   - if (err)
1139   - bd_release(bdev);
1140   - else
1141   - bo = NULL;
1142   -fail:
  840 + bdev->bd_holder_disk = disk;
  841 +out_unlock:
1143 842 mutex_unlock(&bdev->bd_mutex);
1144   - free_bd_holder(bo);
1145   - return err;
  843 + return ret;
1146 844 }
  845 +EXPORT_SYMBOL_GPL(bd_link_disk_holder);
1147 846  
1148   -/**
1149   - * bd_release_from_kobject - bd_release() with additional kobject signature
1150   - *
1151   - * @bdev: block device to be released
1152   - * @kobj: holder's kobject
1153   - *
1154   - * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject().
1155   - */
1156   -static void bd_release_from_kobject(struct block_device *bdev,
1157   - struct kobject *kobj)
  847 +static void bd_unlink_disk_holder(struct block_device *bdev)
1158 848 {
1159   - if (!kobj)
  849 + struct gendisk *disk = bdev->bd_holder_disk;
  850 +
  851 + bdev->bd_holder_disk = NULL;
  852 + if (!disk)
1160 853 return;
1161 854  
1162   - mutex_lock(&bdev->bd_mutex);
1163   - bd_release(bdev);
1164   - free_bd_holder(del_bd_holder(bdev, kobj));
1165   - mutex_unlock(&bdev->bd_mutex);
  855 + del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
  856 + del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
1166 857 }
1167   -
1168   -/**
1169   - * bd_claim_by_disk - wrapper function for bd_claim_by_kobject()
1170   - *
1171   - * @bdev: block device to be claimed
1172   - * @holder: holder's signature
1173   - * @disk: holder's gendisk
1174   - *
1175   - * Call bd_claim_by_kobject() with getting @disk->slave_dir.
1176   - */
1177   -int bd_claim_by_disk(struct block_device *bdev, void *holder,
1178   - struct gendisk *disk)
1179   -{
1180   - return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir));
1181   -}
1182   -EXPORT_SYMBOL_GPL(bd_claim_by_disk);
1183   -
1184   -/**
1185   - * bd_release_from_disk - wrapper function for bd_release_from_kobject()
1186   - *
1187   - * @bdev: block device to be claimed
1188   - * @disk: holder's gendisk
1189   - *
1190   - * Call bd_release_from_kobject() and put @disk->slave_dir.
1191   - */
1192   -void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk)
1193   -{
1194   - bd_release_from_kobject(bdev, disk->slave_dir);
1195   - kobject_put(disk->slave_dir);
1196   -}
1197   -EXPORT_SYMBOL_GPL(bd_release_from_disk);
  858 +#else
  859 +static inline void bd_unlink_disk_holder(struct block_device *bdev)
  860 +{ }
1198 861 #endif
1199 862  
1200   -/*
1201   - * Tries to open block device by device number. Use it ONLY if you
1202   - * really do not have anything better - i.e. when you are behind a
1203   - * truly sucky interface and all you are given is a device number. _Never_
1204   - * to be used for internal purposes. If you ever need it - reconsider
1205   - * your API.
1206   - */
1207   -struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
1208   -{
1209   - struct block_device *bdev = bdget(dev);
1210   - int err = -ENOMEM;
1211   - if (bdev)
1212   - err = blkdev_get(bdev, mode);
1213   - return err ? ERR_PTR(err) : bdev;
1214   -}
1215   -
1216   -EXPORT_SYMBOL(open_by_devnum);
1217   -
1218 863 /**
1219 864 * flush_disk - invalidates all buffer-cache entries on a disk
1220 865 *
... ... @@ -1309,11 +954,12 @@
1309 954 {
1310 955 struct gendisk *disk = bdev->bd_disk;
1311 956 const struct block_device_operations *bdops = disk->fops;
  957 + unsigned int events;
1312 958  
1313   - if (!bdops->media_changed)
  959 + events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
  960 + DISK_EVENT_EJECT_REQUEST);
  961 + if (!(events & DISK_EVENT_MEDIA_CHANGE))
1314 962 return 0;
1315   - if (!bdops->media_changed(bdev->bd_disk))
1316   - return 0;
1317 963  
1318 964 flush_disk(bdev);
1319 965 if (bdops->revalidate_disk)
... ... @@ -1475,17 +1121,171 @@
1475 1121 return ret;
1476 1122 }
1477 1123  
1478   -int blkdev_get(struct block_device *bdev, fmode_t mode)
  1124 +/**
  1125 + * blkdev_get - open a block device
  1126 + * @bdev: block_device to open
  1127 + * @mode: FMODE_* mask
  1128 + * @holder: exclusive holder identifier
  1129 + *
  1130 + * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is
  1131 + * open with exclusive access. Specifying %FMODE_EXCL with %NULL
  1132 + * @holder is invalid. Exclusive opens may nest for the same @holder.
  1133 + *
  1134 + * On success, the reference count of @bdev is unchanged. On failure,
  1135 + * @bdev is put.
  1136 + *
  1137 + * CONTEXT:
  1138 + * Might sleep.
  1139 + *
  1140 + * RETURNS:
  1141 + * 0 on success, -errno on failure.
  1142 + */
  1143 +int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1479 1144 {
1480   - return __blkdev_get(bdev, mode, 0);
  1145 + struct block_device *whole = NULL;
  1146 + int res;
  1147 +
  1148 + WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
  1149 +
  1150 + if ((mode & FMODE_EXCL) && holder) {
  1151 + whole = bd_start_claiming(bdev, holder);
  1152 + if (IS_ERR(whole)) {
  1153 + bdput(bdev);
  1154 + return PTR_ERR(whole);
  1155 + }
  1156 + }
  1157 +
  1158 + res = __blkdev_get(bdev, mode, 0);
  1159 +
  1160 + /* __blkdev_get() may alter read only status, check it afterwards */
  1161 + if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
  1162 + __blkdev_put(bdev, mode, 0);
  1163 + res = -EACCES;
  1164 + }
  1165 +
  1166 + if (whole) {
  1167 + /* finish claiming */
  1168 + mutex_lock(&bdev->bd_mutex);
  1169 + spin_lock(&bdev_lock);
  1170 +
  1171 + if (!res) {
  1172 + BUG_ON(!bd_may_claim(bdev, whole, holder));
  1173 + /*
  1174 + * Note that for a whole device bd_holders
  1175 + * will be incremented twice, and bd_holder
  1176 + * will be set to bd_may_claim before being
  1177 + * set to holder
  1178 + */
  1179 + whole->bd_holders++;
  1180 + whole->bd_holder = bd_may_claim;
  1181 + bdev->bd_holders++;
  1182 + bdev->bd_holder = holder;
  1183 + }
  1184 +
  1185 + /* tell others that we're done */
  1186 + BUG_ON(whole->bd_claiming != holder);
  1187 + whole->bd_claiming = NULL;
  1188 + wake_up_bit(&whole->bd_claiming, 0);
  1189 +
  1190 + spin_unlock(&bdev_lock);
  1191 +
  1192 + /*
  1193 + * Block event polling for write claims. Any write
  1194 + * holder makes the write_holder state stick until all
  1195 + * are released. This is good enough and tracking
  1196 + * individual writeable references is too fragile given
  1197 + * the way @mode is used in blkdev_get/put().
  1198 + */
  1199 + if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) {
  1200 + bdev->bd_write_holder = true;
  1201 + disk_block_events(bdev->bd_disk);
  1202 + }
  1203 +
  1204 + mutex_unlock(&bdev->bd_mutex);
  1205 + bdput(whole);
  1206 + }
  1207 +
  1208 + return res;
1481 1209 }
1482 1210 EXPORT_SYMBOL(blkdev_get);
1483 1211  
  1212 +/**
  1213 + * blkdev_get_by_path - open a block device by name
  1214 + * @path: path to the block device to open
  1215 + * @mode: FMODE_* mask
  1216 + * @holder: exclusive holder identifier
  1217 + *
  1218 + * Open the blockdevice described by the device file at @path. @mode
  1219 + * and @holder are identical to blkdev_get().
  1220 + *
  1221 + * On success, the returned block_device has reference count of one.
  1222 + *
  1223 + * CONTEXT:
  1224 + * Might sleep.
  1225 + *
  1226 + * RETURNS:
  1227 + * Pointer to block_device on success, ERR_PTR(-errno) on failure.
  1228 + */
  1229 +struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
  1230 + void *holder)
  1231 +{
  1232 + struct block_device *bdev;
  1233 + int err;
  1234 +
  1235 + bdev = lookup_bdev(path);
  1236 + if (IS_ERR(bdev))
  1237 + return bdev;
  1238 +
  1239 + err = blkdev_get(bdev, mode, holder);
  1240 + if (err)
  1241 + return ERR_PTR(err);
  1242 +
  1243 + return bdev;
  1244 +}
  1245 +EXPORT_SYMBOL(blkdev_get_by_path);
  1246 +
  1247 +/**
  1248 + * blkdev_get_by_dev - open a block device by device number
  1249 + * @dev: device number of block device to open
  1250 + * @mode: FMODE_* mask
  1251 + * @holder: exclusive holder identifier
  1252 + *
  1253 + * Open the blockdevice described by device number @dev. @mode and
  1254 + * @holder are identical to blkdev_get().
  1255 + *
  1256 + * Use it ONLY if you really do not have anything better - i.e. when
  1257 + * you are behind a truly sucky interface and all you are given is a
  1258 + * device number. _Never_ to be used for internal purposes. If you
  1259 + * ever need it - reconsider your API.
  1260 + *
  1261 + * On success, the returned block_device has reference count of one.
  1262 + *
  1263 + * CONTEXT:
  1264 + * Might sleep.
  1265 + *
  1266 + * RETURNS:
  1267 + * Pointer to block_device on success, ERR_PTR(-errno) on failure.
  1268 + */
  1269 +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
  1270 +{
  1271 + struct block_device *bdev;
  1272 + int err;
  1273 +
  1274 + bdev = bdget(dev);
  1275 + if (!bdev)
  1276 + return ERR_PTR(-ENOMEM);
  1277 +
  1278 + err = blkdev_get(bdev, mode, holder);
  1279 + if (err)
  1280 + return ERR_PTR(err);
  1281 +
  1282 + return bdev;
  1283 +}
  1284 +EXPORT_SYMBOL(blkdev_get_by_dev);
  1285 +
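
Both helpers pair with a blkdev_put() carrying the same mode bits, since the put path (reworked below) now uses FMODE_EXCL to decide whether a claim must be dropped. A hedged usage sketch; the path literal and holder choice are illustrative only:

    /* Hypothetical mount-path helper: exclusive open by path, matched put.
     * Filesystems converted later in this series pass their
     * file_system_type (or superblock) as the holder cookie. */
    static int my_open_bdev_excl(struct file_system_type *fs_type,
                                 struct block_device **bdevp)
    {
        const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
        struct block_device *bdev;

        bdev = blkdev_get_by_path("/dev/sdb1", mode, fs_type);
        if (IS_ERR(bdev))
            return PTR_ERR(bdev);

        *bdevp = bdev;
        return 0;   /* later: blkdev_put(bdev, mode) releases the claim */
    }
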
1484 1286 static int blkdev_open(struct inode * inode, struct file * filp)
1485 1287 {
1486   - struct block_device *whole = NULL;
1487 1288 struct block_device *bdev;
1488   - int res;
1489 1289  
1490 1290 /*
1491 1291 * Preserve backwards compatibility and allow large file access
1492 1292  
... ... @@ -1506,26 +1306,9 @@
1506 1306 if (bdev == NULL)
1507 1307 return -ENOMEM;
1508 1308  
1509   - if (filp->f_mode & FMODE_EXCL) {
1510   - whole = bd_start_claiming(bdev, filp);
1511   - if (IS_ERR(whole)) {
1512   - bdput(bdev);
1513   - return PTR_ERR(whole);
1514   - }
1515   - }
1516   -
1517 1309 filp->f_mapping = bdev->bd_inode->i_mapping;
1518 1310  
1519   - res = blkdev_get(bdev, filp->f_mode);
1520   -
1521   - if (whole) {
1522   - if (res == 0)
1523   - bd_finish_claiming(bdev, whole, filp);
1524   - else
1525   - bd_abort_claiming(whole, filp);
1526   - }
1527   -
1528   - return res;
  1311 + return blkdev_get(bdev, filp->f_mode, filp);
1529 1312 }
1530 1313  
1531 1314 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
... ... @@ -1539,6 +1322,7 @@
1539 1322 bdev->bd_part_count--;
1540 1323  
1541 1324 if (!--bdev->bd_openers) {
  1325 + WARN_ON_ONCE(bdev->bd_holders);
1542 1326 sync_blockdev(bdev);
1543 1327 kill_bdev(bdev);
1544 1328 }
... ... @@ -1569,6 +1353,45 @@
1569 1353  
1570 1354 int blkdev_put(struct block_device *bdev, fmode_t mode)
1571 1355 {
  1356 + if (mode & FMODE_EXCL) {
  1357 + bool bdev_free;
  1358 +
  1359 + /*
  1360 + * Release a claim on the device. The holder fields
  1361 + * are protected with bdev_lock. bd_mutex is to
  1362 + * synchronize disk_holder unlinking.
  1363 + */
  1364 + mutex_lock(&bdev->bd_mutex);
  1365 + spin_lock(&bdev_lock);
  1366 +
  1367 + WARN_ON_ONCE(--bdev->bd_holders < 0);
  1368 + WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0);
  1369 +
  1370 + /* bd_contains might point to self, check in a separate step */
  1371 + if ((bdev_free = !bdev->bd_holders))
  1372 + bdev->bd_holder = NULL;
  1373 + if (!bdev->bd_contains->bd_holders)
  1374 + bdev->bd_contains->bd_holder = NULL;
  1375 +
  1376 + spin_unlock(&bdev_lock);
  1377 +
  1378 + /*
  1379 + * If this was the last claim, remove holder link and
  1380 + * unblock evpoll if it was a write holder.
  1381 + */
  1382 + if (bdev_free) {
  1383 + bd_unlink_disk_holder(bdev);
  1384 + if (bdev->bd_write_holder) {
  1385 + disk_unblock_events(bdev->bd_disk);
  1386 + bdev->bd_write_holder = false;
  1387 + } else
  1388 + disk_check_events(bdev->bd_disk);
  1389 + }
  1390 +
  1391 + mutex_unlock(&bdev->bd_mutex);
  1392 + } else
  1393 + disk_check_events(bdev->bd_disk);
  1394 +
1572 1395 return __blkdev_put(bdev, mode, 0);
1573 1396 }
1574 1397 EXPORT_SYMBOL(blkdev_put);
... ... @@ -1576,8 +1399,7 @@
1576 1399 static int blkdev_close(struct inode * inode, struct file * filp)
1577 1400 {
1578 1401 struct block_device *bdev = I_BDEV(filp->f_mapping->host);
1579   - if (bdev->bd_holder == filp)
1580   - bd_release(bdev);
  1402 +
1581 1403 return blkdev_put(bdev, filp->f_mode);
1582 1404 }
1583 1405  
... ... @@ -1721,67 +1543,6 @@
1721 1543 goto out;
1722 1544 }
1723 1545 EXPORT_SYMBOL(lookup_bdev);
1724   -
1725   -/**
1726   - * open_bdev_exclusive - open a block device by name and set it up for use
1727   - *
1728   - * @path: special file representing the block device
1729   - * @mode: FMODE_... combination to pass be used
1730   - * @holder: owner for exclusion
1731   - *
1732   - * Open the blockdevice described by the special file at @path, claim it
1733   - * for the @holder.
1734   - */
1735   -struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
1736   -{
1737   - struct block_device *bdev, *whole;
1738   - int error;
1739   -
1740   - bdev = lookup_bdev(path);
1741   - if (IS_ERR(bdev))
1742   - return bdev;
1743   -
1744   - whole = bd_start_claiming(bdev, holder);
1745   - if (IS_ERR(whole)) {
1746   - bdput(bdev);
1747   - return whole;
1748   - }
1749   -
1750   - error = blkdev_get(bdev, mode);
1751   - if (error)
1752   - goto out_abort_claiming;
1753   -
1754   - error = -EACCES;
1755   - if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
1756   - goto out_blkdev_put;
1757   -
1758   - bd_finish_claiming(bdev, whole, holder);
1759   - return bdev;
1760   -
1761   -out_blkdev_put:
1762   - blkdev_put(bdev, mode);
1763   -out_abort_claiming:
1764   - bd_abort_claiming(whole, holder);
1765   - return ERR_PTR(error);
1766   -}
1767   -
1768   -EXPORT_SYMBOL(open_bdev_exclusive);
1769   -
1770   -/**
1771   - * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive()
1772   - *
1773   - * @bdev: blockdevice to close
1774   - * @mode: mode, must match that used to open.
1775   - *
1776   - * This is the counterpart to open_bdev_exclusive().
1777   - */
1778   -void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
1779   -{
1780   - bd_release(bdev);
1781   - blkdev_put(bdev, mode);
1782   -}
1783   -
1784   -EXPORT_SYMBOL(close_bdev_exclusive);
1785 1546  
1786 1547 int __invalidate_device(struct block_device *bdev)
1787 1548 {
... ... @@ -493,7 +493,7 @@
493 493 continue;
494 494  
495 495 if (device->bdev) {
496   - close_bdev_exclusive(device->bdev, device->mode);
  496 + blkdev_put(device->bdev, device->mode);
497 497 device->bdev = NULL;
498 498 fs_devices->open_devices--;
499 499 }
... ... @@ -527,7 +527,7 @@
527 527  
528 528 list_for_each_entry(device, &fs_devices->devices, dev_list) {
529 529 if (device->bdev) {
530   - close_bdev_exclusive(device->bdev, device->mode);
  530 + blkdev_put(device->bdev, device->mode);
531 531 fs_devices->open_devices--;
532 532 }
533 533 if (device->writeable) {
534 534  
... ... @@ -584,13 +584,15 @@
584 584 int seeding = 1;
585 585 int ret = 0;
586 586  
  587 + flags |= FMODE_EXCL;
  588 +
587 589 list_for_each_entry(device, head, dev_list) {
588 590 if (device->bdev)
589 591 continue;
590 592 if (!device->name)
591 593 continue;
592 594  
593   - bdev = open_bdev_exclusive(device->name, flags, holder);
  595 + bdev = blkdev_get_by_path(device->name, flags, holder);
594 596 if (IS_ERR(bdev)) {
595 597 printk(KERN_INFO "open %s failed\n", device->name);
596 598 goto error;
... ... @@ -642,7 +644,7 @@
642 644 error_brelse:
643 645 brelse(bh);
644 646 error_close:
645   - close_bdev_exclusive(bdev, FMODE_READ);
  647 + blkdev_put(bdev, flags);
646 648 error:
647 649 continue;
648 650 }
... ... @@ -688,7 +690,8 @@
688 690  
689 691 mutex_lock(&uuid_mutex);
690 692  
691   - bdev = open_bdev_exclusive(path, flags, holder);
  693 + flags |= FMODE_EXCL;
  694 + bdev = blkdev_get_by_path(path, flags, holder);
692 695  
693 696 if (IS_ERR(bdev)) {
694 697 ret = PTR_ERR(bdev);
... ... @@ -720,7 +723,7 @@
720 723  
721 724 brelse(bh);
722 725 error_close:
723   - close_bdev_exclusive(bdev, flags);
  726 + blkdev_put(bdev, flags);
724 727 error:
725 728 mutex_unlock(&uuid_mutex);
726 729 return ret;
... ... @@ -1183,8 +1186,8 @@
1183 1186 goto out;
1184 1187 }
1185 1188 } else {
1186   - bdev = open_bdev_exclusive(device_path, FMODE_READ,
1187   - root->fs_info->bdev_holder);
  1189 + bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
  1190 + root->fs_info->bdev_holder);
1188 1191 if (IS_ERR(bdev)) {
1189 1192 ret = PTR_ERR(bdev);
1190 1193 goto out;
... ... @@ -1251,7 +1254,7 @@
1251 1254 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1252 1255  
1253 1256 if (device->bdev) {
1254   - close_bdev_exclusive(device->bdev, device->mode);
  1257 + blkdev_put(device->bdev, device->mode);
1255 1258 device->bdev = NULL;
1256 1259 device->fs_devices->open_devices--;
1257 1260 }
... ... @@ -1294,7 +1297,7 @@
1294 1297 brelse(bh);
1295 1298 error_close:
1296 1299 if (bdev)
1297   - close_bdev_exclusive(bdev, FMODE_READ);
  1300 + blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1298 1301 out:
1299 1302 mutex_unlock(&root->fs_info->volume_mutex);
1300 1303 mutex_unlock(&uuid_mutex);
... ... @@ -1446,7 +1449,8 @@
1446 1449 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1447 1450 return -EINVAL;
1448 1451  
1449   - bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
  1452 + bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
  1453 + root->fs_info->bdev_holder);
1450 1454 if (IS_ERR(bdev))
1451 1455 return PTR_ERR(bdev);
1452 1456  
... ... @@ -1572,7 +1576,7 @@
1572 1576 mutex_unlock(&root->fs_info->volume_mutex);
1573 1577 return ret;
1574 1578 error:
1575   - close_bdev_exclusive(bdev, 0);
  1579 + blkdev_put(bdev, FMODE_EXCL);
1576 1580 if (seeding_dev) {
1577 1581 mutex_unlock(&uuid_mutex);
1578 1582 up_write(&sb->s_umount);
... ... @@ -50,7 +50,7 @@
50 50  
51 51 struct block_device *bdev;
52 52  
53   - /* the mode sent to open_bdev_exclusive */
  53 + /* the mode sent to blkdev_get */
54 54 fmode_t mode;
55 55  
56 56 char *name;
... ... @@ -59,7 +59,7 @@
59 59 } *chrdevs[CHRDEV_MAJOR_HASH_SIZE];
60 60  
61 61 /* index in the above */
62   -static inline int major_to_index(int major)
  62 +static inline int major_to_index(unsigned major)
63 63 {
64 64 return major % CHRDEV_MAJOR_HASH_SIZE;
65 65 }
... ... @@ -364,7 +364,7 @@
364 364 struct block_device *bdev;
365 365 char b[BDEVNAME_SIZE];
366 366  
367   - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
  367 + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
368 368 if (IS_ERR(bdev))
369 369 goto fail;
370 370 return bdev;
... ... @@ -381,8 +381,7 @@
381 381 */
382 382 static int ext3_blkdev_put(struct block_device *bdev)
383 383 {
384   - bd_release(bdev);
385   - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
  384 + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
386 385 }
387 386  
388 387 static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
... ... @@ -2161,13 +2160,6 @@
2161 2160 bdev = ext3_blkdev_get(j_dev, sb);
2162 2161 if (bdev == NULL)
2163 2162 return NULL;
2164   -
2165   - if (bd_claim(bdev, sb)) {
2166   - ext3_msg(sb, KERN_ERR,
2167   - "error: failed to claim external journal device");
2168   - blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
2169   - return NULL;
2170   - }
2171 2163  
2172 2164 blocksize = sb->s_blocksize;
2173 2165 hblock = bdev_logical_block_size(bdev);
... ... @@ -657,7 +657,7 @@
657 657 struct block_device *bdev;
658 658 char b[BDEVNAME_SIZE];
659 659  
660   - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
  660 + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
661 661 if (IS_ERR(bdev))
662 662 goto fail;
663 663 return bdev;
... ... @@ -673,8 +673,7 @@
673 673 */
674 674 static int ext4_blkdev_put(struct block_device *bdev)
675 675 {
676   - bd_release(bdev);
677   - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
  676 + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
678 677 }
679 678  
680 679 static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
... ... @@ -3777,13 +3776,6 @@
3777 3776 bdev = ext4_blkdev_get(j_dev, sb);
3778 3777 if (bdev == NULL)
3779 3778 return NULL;
3780   -
3781   - if (bd_claim(bdev, sb)) {
3782   - ext4_msg(sb, KERN_ERR,
3783   - "failed to claim external journal device");
3784   - blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
3785   - return NULL;
3786   - }
3787 3779  
3788 3780 blocksize = sb->s_blocksize;
3789 3781 hblock = bdev_logical_block_size(bdev);
fs/gfs2/ops_fstype.c
... ... @@ -1268,7 +1268,7 @@
1268 1268 {
1269 1269 struct block_device *bdev;
1270 1270 struct super_block *s;
1271   - fmode_t mode = FMODE_READ;
  1271 + fmode_t mode = FMODE_READ | FMODE_EXCL;
1272 1272 int error;
1273 1273 struct gfs2_args args;
1274 1274 struct gfs2_sbd *sdp;
... ... @@ -1276,7 +1276,7 @@
1276 1276 if (!(flags & MS_RDONLY))
1277 1277 mode |= FMODE_WRITE;
1278 1278  
1279   - bdev = open_bdev_exclusive(dev_name, mode, fs_type);
  1279 + bdev = blkdev_get_by_path(dev_name, mode, fs_type);
1280 1280 if (IS_ERR(bdev))
1281 1281 return ERR_CAST(bdev);
1282 1282  
... ... @@ -1298,7 +1298,7 @@
1298 1298 goto error_bdev;
1299 1299  
1300 1300 if (s->s_root)
1301   - close_bdev_exclusive(bdev, mode);
  1301 + blkdev_put(bdev, mode);
1302 1302  
1303 1303 memset(&args, 0, sizeof(args));
1304 1304 args.ar_quota = GFS2_QUOTA_DEFAULT;
... ... @@ -1342,7 +1342,7 @@
1342 1342 deactivate_locked_super(s);
1343 1343 return ERR_PTR(error);
1344 1344 error_bdev:
1345   - close_bdev_exclusive(bdev, mode);
  1345 + blkdev_put(bdev, mode);
1346 1346 return ERR_PTR(error);
1347 1347 }
1348 1348  
... ... @@ -1120,16 +1120,13 @@
1120 1120 * file systems to log may have n-to-1 relationship;
1121 1121 */
1122 1122  
1123   - bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE);
  1123 + bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
  1124 + log);
1124 1125 if (IS_ERR(bdev)) {
1125 1126 rc = -PTR_ERR(bdev);
1126 1127 goto free;
1127 1128 }
1128 1129  
1129   - if ((rc = bd_claim(bdev, log))) {
1130   - goto close;
1131   - }
1132   -
1133 1130 log->bdev = bdev;
1134 1131 memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));
1135 1132  
... ... @@ -1137,7 +1134,7 @@
1137 1134 * initialize log:
1138 1135 */
1139 1136 if ((rc = lmLogInit(log)))
1140   - goto unclaim;
  1137 + goto close;
1141 1138  
1142 1139 list_add(&log->journal_list, &jfs_external_logs);
1143 1140  
1144 1141  
... ... @@ -1163,11 +1160,8 @@
1163 1160 list_del(&log->journal_list);
1164 1161 lbmLogShutdown(log);
1165 1162  
1166   - unclaim:
1167   - bd_release(bdev);
1168   -
1169 1163 close: /* close external log device */
1170   - blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
  1164 + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1171 1165  
1172 1166 free: /* free log descriptor */
1173 1167 mutex_unlock(&jfs_log_mutex);
... ... @@ -1512,8 +1506,7 @@
1512 1506 bdev = log->bdev;
1513 1507 rc = lmLogShutdown(log);
1514 1508  
1515   - bd_release(bdev);
1516   - blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
  1509 + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1517 1510  
1518 1511 kfree(log);
1519 1512  
... ... @@ -300,7 +300,7 @@
300 300  
301 301 static void bdev_put_device(struct logfs_super *s)
302 302 {
303   - close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE);
  303 + blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
304 304 }
305 305  
306 306 static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
307 307  
... ... @@ -325,13 +325,14 @@
325 325 {
326 326 struct block_device *bdev;
327 327  
328   - bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
  328 + bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
  329 + type);
329 330 if (IS_ERR(bdev))
330 331 return PTR_ERR(bdev);
331 332  
332 333 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
333 334 int mtdnr = MINOR(bdev->bd_dev);
334   - close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
  335 + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
335 336 return logfs_get_sb_mtd(p, mtdnr);
336 337 }
337 338  
... ... @@ -845,11 +845,6 @@
845 845 struct page **pp = rqstp->rq_respages + rqstp->rq_resused;
846 846 struct page *page = buf->page;
847 847 size_t size;
848   - int ret;
849   -
850   - ret = buf->ops->confirm(pipe, buf);
851   - if (unlikely(ret))
852   - return ret;
853 848  
854 849 size = sd->len;
855 850  
... ... @@ -1163,14 +1163,14 @@
1163 1163 {
1164 1164 struct nilfs_super_data sd;
1165 1165 struct super_block *s;
1166   - fmode_t mode = FMODE_READ;
  1166 + fmode_t mode = FMODE_READ | FMODE_EXCL;
1167 1167 struct dentry *root_dentry;
1168 1168 int err, s_new = false;
1169 1169  
1170 1170 if (!(flags & MS_RDONLY))
1171 1171 mode |= FMODE_WRITE;
1172 1172  
1173   - sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
  1173 + sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type);
1174 1174 if (IS_ERR(sd.bdev))
1175 1175 return ERR_CAST(sd.bdev);
1176 1176  
... ... @@ -1249,7 +1249,7 @@
1249 1249 }
1250 1250  
1251 1251 if (!s_new)
1252   - close_bdev_exclusive(sd.bdev, mode);
  1252 + blkdev_put(sd.bdev, mode);
1253 1253  
1254 1254 return root_dentry;
1255 1255  
... ... @@ -1258,7 +1258,7 @@
1258 1258  
1259 1259 failed:
1260 1260 if (!s_new)
1261   - close_bdev_exclusive(sd.bdev, mode);
  1261 + blkdev_put(sd.bdev, mode);
1262 1262 return ERR_PTR(err);
1263 1263 }
1264 1264  
fs/ocfs2/cluster/heartbeat.c
... ... @@ -1729,7 +1729,7 @@
1729 1729 goto out;
1730 1730  
1731 1731 reg->hr_bdev = I_BDEV(filp->f_mapping->host);
1732   - ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ);
  1732 + ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
1733 1733 if (ret) {
1734 1734 reg->hr_bdev = NULL;
1735 1735 goto out;
fs/partitions/check.c
... ... @@ -237,6 +237,13 @@
237 237 return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
238 238 }
239 239  
  240 +ssize_t part_ro_show(struct device *dev,
  241 + struct device_attribute *attr, char *buf)
  242 +{
  243 + struct hd_struct *p = dev_to_part(dev);
  244 + return sprintf(buf, "%d\n", p->policy ? 1 : 0);
  245 +}
  246 +
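
part_ro_show() above backs a new per-partition "ro" attribute, registered in the attribute list a few hunks below, mirroring the read-only flag whole disks already expose. A small userspace sketch of consuming it; the sysfs path is illustrative:

    /* Hypothetical userspace reader of the new per-partition "ro" file. */
    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/sys/block/sda/sda1/ro", "r"); /* illustrative path */
        int ro;

        if (f && fscanf(f, "%d", &ro) == 1)
            printf("sda1 is %s\n", ro ? "read-only" : "read-write");
        if (f)
            fclose(f);
        return 0;
    }
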
240 247 ssize_t part_alignment_offset_show(struct device *dev,
241 248 struct device_attribute *attr, char *buf)
242 249 {
... ... @@ -312,6 +319,7 @@
312 319 static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
313 320 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
314 321 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
  322 +static DEVICE_ATTR(ro, S_IRUGO, part_ro_show, NULL);
315 323 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
316 324 static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show,
317 325 NULL);
... ... @@ -326,6 +334,7 @@
326 334 &dev_attr_partition.attr,
327 335 &dev_attr_start.attr,
328 336 &dev_attr_size.attr,
  337 + &dev_attr_ro.attr,
329 338 &dev_attr_alignment_offset.attr,
330 339 &dev_attr_discard_alignment.attr,
331 340 &dev_attr_stat.attr,
... ... @@ -372,6 +381,11 @@
372 381 put_device(part_to_dev(part));
373 382 }
374 383  
  384 +void __delete_partition(struct hd_struct *part)
  385 +{
  386 + call_rcu(&part->rcu_head, delete_partition_rcu_cb);
  387 +}
  388 +
375 389 void delete_partition(struct gendisk *disk, int partno)
376 390 {
377 391 struct disk_part_tbl *ptbl = disk->part_tbl;
... ... @@ -390,7 +404,7 @@
390 404 kobject_put(part->holder_dir);
391 405 device_del(part_to_dev(part));
392 406  
393   - call_rcu(&part->rcu_head, delete_partition_rcu_cb);
  407 + hd_struct_put(part);
394 408 }
395 409  
396 410 static ssize_t whole_disk_show(struct device *dev,
... ... @@ -489,6 +503,7 @@
489 503 if (!dev_get_uevent_suppress(ddev))
490 504 kobject_uevent(&pdev->kobj, KOBJ_ADD);
491 505  
  506 + hd_ref_init(p);
492 507 return p;
493 508  
494 509 out_free_info:
... ... @@ -507,65 +522,6 @@
507 522 return ERR_PTR(err);
508 523 }
509 524  
510   -/* Not exported, helper to add_disk(). */
511   -void register_disk(struct gendisk *disk)
512   -{
513   - struct device *ddev = disk_to_dev(disk);
514   - struct block_device *bdev;
515   - struct disk_part_iter piter;
516   - struct hd_struct *part;
517   - int err;
518   -
519   - ddev->parent = disk->driverfs_dev;
520   -
521   - dev_set_name(ddev, disk->disk_name);
522   -
523   - /* delay uevents, until we scanned partition table */
524   - dev_set_uevent_suppress(ddev, 1);
525   -
526   - if (device_add(ddev))
527   - return;
528   - if (!sysfs_deprecated) {
529   - err = sysfs_create_link(block_depr, &ddev->kobj,
530   - kobject_name(&ddev->kobj));
531   - if (err) {
532   - device_del(ddev);
533   - return;
534   - }
535   - }
536   - disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
537   - disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
538   -
539   - /* No minors to use for partitions */
540   - if (!disk_partitionable(disk))
541   - goto exit;
542   -
543   - /* No such device (e.g., media were just removed) */
544   - if (!get_capacity(disk))
545   - goto exit;
546   -
547   - bdev = bdget_disk(disk, 0);
548   - if (!bdev)
549   - goto exit;
550   -
551   - bdev->bd_invalidated = 1;
552   - err = blkdev_get(bdev, FMODE_READ);
553   - if (err < 0)
554   - goto exit;
555   - blkdev_put(bdev, FMODE_READ);
556   -
557   -exit:
558   - /* announce disk after possible partitions are created */
559   - dev_set_uevent_suppress(ddev, 0);
560   - kobject_uevent(&ddev->kobj, KOBJ_ADD);
561   -
562   - /* announce possible partitions */
563   - disk_part_iter_init(&piter, disk, 0);
564   - while ((part = disk_part_iter_next(&piter)))
565   - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
566   - disk_part_iter_exit(&piter);
567   -}
568   -
569 525 static bool disk_unlock_native_capacity(struct gendisk *disk)
570 526 {
571 527 const struct block_device_operations *bdops = disk->fops;
... ... @@ -728,34 +684,4 @@
728 684 }
729 685  
730 686 EXPORT_SYMBOL(read_dev_sector);
731   -
732   -void del_gendisk(struct gendisk *disk)
733   -{
734   - struct disk_part_iter piter;
735   - struct hd_struct *part;
736   -
737   - /* invalidate stuff */
738   - disk_part_iter_init(&piter, disk,
739   - DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
740   - while ((part = disk_part_iter_next(&piter))) {
741   - invalidate_partition(disk, part->partno);
742   - delete_partition(disk, part->partno);
743   - }
744   - disk_part_iter_exit(&piter);
745   -
746   - invalidate_partition(disk, 0);
747   - blk_free_devt(disk_to_dev(disk)->devt);
748   - set_capacity(disk, 0);
749   - disk->flags &= ~GENHD_FL_UP;
750   - unlink_gendisk(disk);
751   - part_stat_set_all(&disk->part0, 0);
752   - disk->part0.stamp = 0;
753   -
754   - kobject_put(disk->part0.holder_dir);
755   - kobject_put(disk->slave_dir);
756   - disk->driverfs_dev = NULL;
757   - if (!sysfs_deprecated)
758   - sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
759   - device_del(disk_to_dev(disk));
760   -}
fs/reiserfs/journal.c
... ... @@ -2551,8 +2551,6 @@
2551 2551 result = 0;
2552 2552  
2553 2553 if (journal->j_dev_bd != NULL) {
2554   - if (journal->j_dev_bd->bd_dev != super->s_dev)
2555   - bd_release(journal->j_dev_bd);
2556 2554 result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
2557 2555 journal->j_dev_bd = NULL;
2558 2556 }
... ... @@ -2570,7 +2568,7 @@
2570 2568 {
2571 2569 int result;
2572 2570 dev_t jdev;
2573   - fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE;
  2571 + fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
2574 2572 char b[BDEVNAME_SIZE];
2575 2573  
2576 2574 result = 0;
... ... @@ -2584,7 +2582,10 @@
2584 2582  
2585 2583 /* there is no "jdev" option and journal is on separate device */
2586 2584 if ((!jdev_name || !jdev_name[0])) {
2587   - journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode);
  2585 + if (jdev == super->s_dev)
  2586 + blkdev_mode &= ~FMODE_EXCL;
  2587 + journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode,
  2588 + journal);
2588 2589 journal->j_dev_mode = blkdev_mode;
2589 2590 if (IS_ERR(journal->j_dev_bd)) {
2590 2591 result = PTR_ERR(journal->j_dev_bd);
... ... @@ -2593,22 +2594,14 @@
2593 2594 "cannot init journal device '%s': %i",
2594 2595 __bdevname(jdev, b), result);
2595 2596 return result;
2596   - } else if (jdev != super->s_dev) {
2597   - result = bd_claim(journal->j_dev_bd, journal);
2598   - if (result) {
2599   - blkdev_put(journal->j_dev_bd, blkdev_mode);
2600   - return result;
2601   - }
2602   -
  2597 + } else if (jdev != super->s_dev)
2603 2598 set_blocksize(journal->j_dev_bd, super->s_blocksize);
2604   - }
2605 2599  
2606 2600 return 0;
2607 2601 }
2608 2602  
2609 2603 journal->j_dev_mode = blkdev_mode;
2610   - journal->j_dev_bd = open_bdev_exclusive(jdev_name,
2611   - blkdev_mode, journal);
  2604 + journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal);
2612 2605 if (IS_ERR(journal->j_dev_bd)) {
2613 2606 result = PTR_ERR(journal->j_dev_bd);
2614 2607 journal->j_dev_bd = NULL;
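blkdev_get_by_dev() is the by-device-number sibling of blkdev_get_by_path(): same FMODE_EXCL and holder semantics, but the device is looked up by dev_t instead of path. Roughly (the device number is illustrative):

    dev_t jdev = MKDEV(8, 16);              /* illustrative */
    struct block_device *bd;

    bd = blkdev_get_by_dev(jdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
                           journal);        /* holder, as bd_claim() took */
    if (IS_ERR(bd))
        return PTR_ERR(bd);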
... ... @@ -682,19 +682,14 @@
682 682 {
683 683 struct file *file = sd->u.file;
684 684 loff_t pos = sd->pos;
685   - int ret, more;
  685 + int more;
686 686  
687   - ret = buf->ops->confirm(pipe, buf);
688   - if (!ret) {
689   - more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
690   - if (file->f_op && file->f_op->sendpage)
691   - ret = file->f_op->sendpage(file, buf->page, buf->offset,
692   - sd->len, &pos, more);
693   - else
694   - ret = -EINVAL;
695   - }
  687 + if (!likely(file->f_op && file->f_op->sendpage))
  688 + return -EINVAL;
696 689  
697   - return ret;
  690 + more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
  691 + return file->f_op->sendpage(file, buf->page, buf->offset,
  692 + sd->len, &pos, more);
698 693 }
699 694  
700 695 /*
... ... @@ -727,13 +722,6 @@
727 722 void *fsdata;
728 723 int ret;
729 724  
730   - /*
731   - * make sure the data in this buffer is uptodate
732   - */
733   - ret = buf->ops->confirm(pipe, buf);
734   - if (unlikely(ret))
735   - return ret;
736   -
737 725 offset = sd->pos & ~PAGE_CACHE_MASK;
738 726  
739 727 this_len = sd->len;
740 728  
... ... @@ -805,12 +793,17 @@
805 793 if (sd->len > sd->total_len)
806 794 sd->len = sd->total_len;
807 795  
808   - ret = actor(pipe, buf, sd);
809   - if (ret <= 0) {
  796 + ret = buf->ops->confirm(pipe, buf);
  797 + if (unlikely(ret)) {
810 798 if (ret == -ENODATA)
811 799 ret = 0;
812 800 return ret;
813 801 }
  802 +
  803 + ret = actor(pipe, buf, sd);
  804 + if (ret <= 0)
  805 + return ret;
  806 +
814 807 buf->offset += ret;
815 808 buf->len -= ret;
816 809  
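With ->confirm() hoisted into splice_from_pipe_feed(), actors may assume the buffer is uptodate on entry. A minimal actor under the new contract might look like this (consume_data() is hypothetical):

    static int my_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
                        struct splice_desc *sd)
    {
        /* buffer already confirmed uptodate by splice_from_pipe_feed() */
        char *src = buf->ops->map(pipe, buf, 0);
        int ret = consume_data(src + buf->offset, sd->len); /* hypothetical */

        buf->ops->unmap(pipe, buf, src);
        return ret;
    }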
... ... @@ -1044,10 +1037,6 @@
1044 1037 int ret;
1045 1038 void *data;
1046 1039  
1047   - ret = buf->ops->confirm(pipe, buf);
1048   - if (ret)
1049   - return ret;
1050   -
1051 1040 data = buf->ops->map(pipe, buf, 0);
1052 1041 ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
1053 1042 buf->ops->unmap(pipe, buf, data);
... ... @@ -1494,10 +1483,6 @@
1494 1483 {
1495 1484 char *src;
1496 1485 int ret;
1497   -
1498   - ret = buf->ops->confirm(pipe, buf);
1499   - if (unlikely(ret))
1500   - return ret;
1501 1486  
1502 1487 /*
1503 1488 * See if we can use the atomic maps, by prefaulting in the
... ... @@ -767,13 +767,13 @@
767 767 {
768 768 struct block_device *bdev;
769 769 struct super_block *s;
770   - fmode_t mode = FMODE_READ;
  770 + fmode_t mode = FMODE_READ | FMODE_EXCL;
771 771 int error = 0;
772 772  
773 773 if (!(flags & MS_RDONLY))
774 774 mode |= FMODE_WRITE;
775 775  
776   - bdev = open_bdev_exclusive(dev_name, mode, fs_type);
  776 + bdev = blkdev_get_by_path(dev_name, mode, fs_type);
777 777 if (IS_ERR(bdev))
778 778 return ERR_CAST(bdev);
... ... @@ -802,13 +802,13 @@
802 802  
803 803 /*
804 804 * s_umount nests inside bd_mutex during
805   - * __invalidate_device(). close_bdev_exclusive()
806   - * acquires bd_mutex and can't be called under
807   - * s_umount. Drop s_umount temporarily. This is safe
808   - * as we're holding an active reference.
  805 + * __invalidate_device(). blkdev_put() acquires
  806 + * bd_mutex and can't be called under s_umount. Drop
  807 + * s_umount temporarily. This is safe as we're
  808 + * holding an active reference.
809 809 */
810 810 up_write(&s->s_umount);
811   - close_bdev_exclusive(bdev, mode);
  811 + blkdev_put(bdev, mode);
812 812 down_write(&s->s_umount);
813 813 } else {
814 814 char b[BDEVNAME_SIZE];
... ... @@ -832,7 +832,7 @@
832 832 error_s:
833 833 error = PTR_ERR(s);
834 834 error_bdev:
835   - close_bdev_exclusive(bdev, mode);
  835 + blkdev_put(bdev, mode);
836 836 error:
837 837 return ERR_PTR(error);
838 838 }
... ... @@ -863,7 +863,8 @@
863 863 bdev->bd_super = NULL;
864 864 generic_shutdown_super(sb);
865 865 sync_blockdev(bdev);
866   - close_bdev_exclusive(bdev, mode);
  866 + WARN_ON_ONCE(!(mode & FMODE_EXCL));
  867 + blkdev_put(bdev, mode | FMODE_EXCL);
867 868 }
868 869  
869 870 EXPORT_SYMBOL(kill_block_super);
fs/xfs/linux-2.6/xfs_super.c
... ... @@ -606,7 +606,8 @@
606 606 {
607 607 int error = 0;
608 608  
609   - *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp);
  609 + *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
  610 + mp);
610 611 if (IS_ERR(*bdevp)) {
611 612 error = PTR_ERR(*bdevp);
612 613 printk("XFS: Invalid device [%s], error=%d\n", name, error);
... ... @@ -620,7 +621,7 @@
620 621 struct block_device *bdev)
621 622 {
622 623 if (bdev)
623   - close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
  624 + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
624 625 }
625 626  
626 627 /*
include/linux/blkdev.h
... ... @@ -115,6 +115,7 @@
115 115 void *elevator_private3;
116 116  
117 117 struct gendisk *rq_disk;
  118 + struct hd_struct *part;
118 119 unsigned long start_time;
119 120 #ifdef CONFIG_BLK_CGROUP
120 121 unsigned long long start_time_ns;
... ... @@ -646,7 +647,6 @@
646 647  
647 648 extern int blk_register_queue(struct gendisk *disk);
648 649 extern void blk_unregister_queue(struct gendisk *disk);
649   -extern void register_disk(struct gendisk *dev);
650 650 extern void generic_make_request(struct bio *bio);
651 651 extern void blk_rq_init(struct request_queue *q, struct request *rq);
652 652 extern void blk_put_request(struct request *);
... ... @@ -1256,6 +1256,9 @@
1256 1256 int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
1257 1257 int (*direct_access) (struct block_device *, sector_t,
1258 1258 void **, unsigned long *);
  1259 + unsigned int (*check_events) (struct gendisk *disk,
  1260 + unsigned int clearing);
  1261 + /* ->media_changed() is DEPRECATED, use ->check_events() instead */
1259 1262 int (*media_changed) (struct gendisk *);
1260 1263 void (*unlock_native_capacity) (struct gendisk *);
1261 1264 int (*revalidate_disk) (struct gendisk *);
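A driver converting from ->media_changed() to ->check_events() reports pending events as a mask; @clearing tells it which events the caller intends to consume. A rough skeleton (my_media_changed() stands in for the driver's own state check):

    static unsigned int my_check_events(struct gendisk *disk,
                                        unsigned int clearing)
    {
        if ((clearing & DISK_EVENT_MEDIA_CHANGE) && my_media_changed(disk))
            return DISK_EVENT_MEDIA_CHANGE;
        return 0;
    }

    static const struct block_device_operations my_fops = {
        .owner          = THIS_MODULE,
        .check_events   = my_check_events,
        /* .media_changed intentionally unset: it is deprecated */
    };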
include/linux/cdrom.h
... ... @@ -946,6 +946,8 @@
946 946 /* device-related storage */
947 947 unsigned int options : 30; /* options flags */
948 948 unsigned mc_flags : 2; /* media change buffer flags */
  949 + unsigned int vfs_events; /* cached events for vfs path */
  950 + unsigned int ioctl_events; /* cached events for ioctl path */
949 951 int use_count; /* number of times device opened */
950 952 char name[20]; /* name of the device type */
951 953 /* per-device flags */
... ... @@ -965,6 +967,8 @@
965 967 int (*open) (struct cdrom_device_info *, int);
966 968 void (*release) (struct cdrom_device_info *);
967 969 int (*drive_status) (struct cdrom_device_info *, int);
  970 + unsigned int (*check_events) (struct cdrom_device_info *cdi,
  971 + unsigned int clearing, int slot);
968 972 int (*media_changed) (struct cdrom_device_info *, int);
969 973 int (*tray_move) (struct cdrom_device_info *, int);
970 974 int (*lock_door) (struct cdrom_device_info *, int);
... ... @@ -993,6 +997,8 @@
993 997 extern void cdrom_release(struct cdrom_device_info *cdi, fmode_t mode);
994 998 extern int cdrom_ioctl(struct cdrom_device_info *cdi, struct block_device *bdev,
995 999 fmode_t mode, unsigned int cmd, unsigned long arg);
  1000 +extern unsigned int cdrom_check_events(struct cdrom_device_info *cdi,
  1001 + unsigned int clearing);
996 1002 extern int cdrom_media_changed(struct cdrom_device_info *);
997 1003  
998 1004 extern int register_cdrom(struct cdrom_device_info *cdi);
... ... @@ -664,8 +664,9 @@
664 664 void * bd_claiming;
665 665 void * bd_holder;
666 666 int bd_holders;
  667 + bool bd_write_holder;
667 668 #ifdef CONFIG_SYSFS
668   - struct list_head bd_holder_list;
  669 + struct gendisk * bd_holder_disk; /* for sysfs slave linking */
669 670 #endif
670 671 struct block_device * bd_contains;
671 672 unsigned bd_block_size;
... ... @@ -2019,7 +2020,6 @@
2019 2020 extern void bd_set_size(struct block_device *, loff_t size);
2020 2021 extern void bd_forget(struct inode *inode);
2021 2022 extern void bdput(struct block_device *);
2022   -extern struct block_device *open_by_devnum(dev_t, fmode_t);
2023 2023 extern void invalidate_bdev(struct block_device *);
2024 2024 extern int sync_blockdev(struct block_device *bdev);
2025 2025 extern struct super_block *freeze_bdev(struct block_device *);
... ... @@ -2050,16 +2050,20 @@
2050 2050 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
2051 2051 extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
2052 2052 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
2053   -extern int blkdev_get(struct block_device *, fmode_t);
2054   -extern int blkdev_put(struct block_device *, fmode_t);
2055   -extern int bd_claim(struct block_device *, void *);
2056   -extern void bd_release(struct block_device *);
  2053 +extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder);
  2054 +extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
  2055 + void *holder);
  2056 +extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode,
  2057 + void *holder);
  2058 +extern int blkdev_put(struct block_device *bdev, fmode_t mode);
2057 2059 #ifdef CONFIG_SYSFS
2058   -extern int bd_claim_by_disk(struct block_device *, void *, struct gendisk *);
2059   -extern void bd_release_from_disk(struct block_device *, struct gendisk *);
  2060 +extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
2060 2061 #else
2061   -#define bd_claim_by_disk(bdev, holder, disk) bd_claim(bdev, holder)
2062   -#define bd_release_from_disk(bdev, disk) bd_release(bdev)
  2062 +static inline int bd_link_disk_holder(struct block_device *bdev,
  2063 + struct gendisk *disk)
  2064 +{
  2065 + return 0;
  2066 +}
2063 2067 #endif
2064 2068 #endif
2065 2069  
... ... @@ -2095,8 +2099,6 @@
2095 2099 extern const char *__bdevname(dev_t, char *buffer);
2096 2100 extern const char *bdevname(struct block_device *bdev, char *buffer);
2097 2101 extern struct block_device *lookup_bdev(const char *);
2098   -extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *);
2099   -extern void close_bdev_exclusive(struct block_device *, fmode_t);
2100 2102 extern void blkdev_show(struct seq_file *,off_t);
2101 2103  
2102 2104 #else
include/linux/genhd.h
... ... @@ -115,6 +115,7 @@
115 115 #else
116 116 struct disk_stats dkstats;
117 117 #endif
  118 + atomic_t ref;
118 119 struct rcu_head rcu_head;
119 120 };
120 121  
... ... @@ -127,6 +128,11 @@
127 128 #define GENHD_FL_EXT_DEVT 64 /* allow extended devt */
128 129 #define GENHD_FL_NATIVE_CAPACITY 128
129 130  
  131 +enum {
  132 + DISK_EVENT_MEDIA_CHANGE = 1 << 0, /* media changed */
  133 + DISK_EVENT_EJECT_REQUEST = 1 << 1, /* eject requested */
  134 +};
  135 +
130 136 #define BLK_SCSI_MAX_CMDS (256)
131 137 #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
132 138  
... ... @@ -143,6 +149,8 @@
143 149 struct hd_struct __rcu *part[];
144 150 };
145 151  
  152 +struct disk_events;
  153 +
146 154 struct gendisk {
147 155 /* major, first_minor and minors are input parameters only,
148 156 * don't use directly. Use disk_devt() and disk_max_parts().
... ... @@ -154,6 +162,10 @@
154 162  
155 163 char disk_name[DISK_NAME_LEN]; /* name of major driver */
156 164 char *(*devnode)(struct gendisk *gd, mode_t *mode);
  165 +
  166 + unsigned int events; /* supported events */
  167 + unsigned int async_events; /* async events, subset of all */
  168 +
157 169 /* Array of pointers to partitions indexed by partno.
158 170 * Protected with matching bdev lock but stat and other
159 171 * non-critical accesses use RCU. Always access through
160 172  
... ... @@ -171,9 +183,8 @@
171 183 struct kobject *slave_dir;
172 184  
173 185 struct timer_rand_state *random;
174   -
175 186 atomic_t sync_io; /* RAID */
176   - struct work_struct async_notify;
  187 + struct disk_events *ev;
177 188 #ifdef CONFIG_BLK_DEV_INTEGRITY
178 189 struct blk_integrity *integrity;
179 190 #endif
... ... @@ -395,7 +406,6 @@
395 406 /* block/genhd.c */
396 407 extern void add_disk(struct gendisk *disk);
397 408 extern void del_gendisk(struct gendisk *gp);
398   -extern void unlink_gendisk(struct gendisk *gp);
399 409 extern struct gendisk *get_gendisk(dev_t dev, int *partno);
400 410 extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
401 411  
... ... @@ -407,6 +417,11 @@
407 417 return disk->part0.policy;
408 418 }
409 419  
  420 +extern void disk_block_events(struct gendisk *disk);
  421 +extern void disk_unblock_events(struct gendisk *disk);
  422 +extern void disk_check_events(struct gendisk *disk);
  423 +extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask);
  424 +
410 425 /* drivers/char/random.c */
411 426 extern void add_disk_randomness(struct gendisk *disk);
412 427 extern void rand_initialize_disk(struct gendisk *disk);
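On the consumer side, callers bracket sections that must not race with event handling and collect pending events with disk_clear_events(); a hedged sketch:

    unsigned int events;

    disk_block_events(disk);        /* suspend event checking */
    /* ... work that must not race with media events ... */
    disk_unblock_events(disk);

    /* fetch and clear pending media-change events */
    events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE);
    if (events & DISK_EVENT_MEDIA_CHANGE)
        ;   /* revalidate the disk, invalidate the page cache, etc. */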
... ... @@ -583,6 +598,7 @@
583 598 sector_t len, int flags,
584 599 struct partition_meta_info
585 600 *info);
  601 +extern void __delete_partition(struct hd_struct *);
586 602 extern void delete_partition(struct gendisk *, int);
587 603 extern void printk_all_partitions(void);
588 604  
... ... @@ -610,6 +626,29 @@
610 626 struct device_attribute *attr,
611 627 const char *buf, size_t count);
612 628 #endif /* CONFIG_FAIL_MAKE_REQUEST */
  629 +
  630 +static inline void hd_ref_init(struct hd_struct *part)
  631 +{
  632 + atomic_set(&part->ref, 1);
  633 + smp_mb();
  634 +}
  635 +
  636 +static inline void hd_struct_get(struct hd_struct *part)
  637 +{
  638 + atomic_inc(&part->ref);
  639 + smp_mb__after_atomic_inc();
  640 +}
  641 +
  642 +static inline int hd_struct_try_get(struct hd_struct *part)
  643 +{
  644 + return atomic_inc_not_zero(&part->ref);
  645 +}
  646 +
  647 +static inline void hd_struct_put(struct hd_struct *part)
  648 +{
  649 + if (atomic_dec_and_test(&part->ref))
  650 + __delete_partition(part);
  651 +}
613 652  
614 653 #else /* CONFIG_BLOCK */
615 654  
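The new ref pins an hd_struct across its RCU-deferred free: accounting paths take a reference at dispatch and drop it at completion, with the last hd_struct_put() freeing via __delete_partition(). Against the rq->part field added in blkdev.h above, usage is roughly:

    /* at dispatch: pin the partition the request is accounted to */
    if (hd_struct_try_get(part))
        rq->part = part;

    /* at completion: drop the pin; the last put frees the partition */
    hd_struct_put(rq->part);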
... ... @@ -104,6 +104,7 @@
104 104 #define UNMAP 0x42
105 105 #define READ_TOC 0x43
106 106 #define READ_HEADER 0x44
  107 +#define GET_EVENT_STATUS_NOTIFICATION 0x4a
107 108 #define LOG_SELECT 0x4c
108 109 #define LOG_SENSE 0x4d
109 110 #define XDWRITEREAD_10 0x53
include/trace/events/block.h
... ... @@ -206,15 +206,16 @@
206 206 * block_bio_complete - completed all work on the block operation
207 207 * @q: queue holding the block operation
208 208 * @bio: block operation completed
  209 + * @error: io error value
209 210 *
210 211 * This tracepoint indicates there is no further work to do on this
211 212 * block IO operation @bio.
212 213 */
213 214 TRACE_EVENT(block_bio_complete,
214 215  
215   - TP_PROTO(struct request_queue *q, struct bio *bio),
  216 + TP_PROTO(struct request_queue *q, struct bio *bio, int error),
216 217  
217   - TP_ARGS(q, bio),
  218 + TP_ARGS(q, bio, error),
218 219  
219 220 TP_STRUCT__entry(
220 221 __field( dev_t, dev )
... ... @@ -228,6 +229,7 @@
228 229 __entry->dev = bio->bi_bdev->bd_dev;
229 230 __entry->sector = bio->bi_sector;
230 231 __entry->nr_sector = bio->bi_size >> 9;
  232 + __entry->error = error;
231 233 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
232 234 ),
... ... @@ -486,16 +488,16 @@
486 488 );
487 489  
488 490 /**
489   - * block_remap - map request for a partition to the raw device
  491 + * block_bio_remap - map request for a logical device to the raw device
490 492 * @q: queue holding the operation
491 493 * @bio: revised operation
492 494 * @dev: device for the operation
493 495 * @from: original sector for the operation
494 496 *
495   - * An operation for a partition on a block device has been mapped to the
  497 + * An operation for a logical device has been mapped to the
496 498 * raw block device.
497 499 */
498   -TRACE_EVENT(block_remap,
  500 +TRACE_EVENT(block_bio_remap,
499 501  
500 502 TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
501 503 sector_t from),
... ... @@ -224,7 +224,7 @@
224 224 return res;
225 225  
226 226 root_swap = res;
227   - res = blkdev_get(hib_resume_bdev, FMODE_WRITE);
  227 + res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
228 228 if (res)
229 229 return res;
230 230  
... ... @@ -930,7 +930,8 @@
930 930 {
931 931 int error;
932 932  
933   - hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
  933 + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
  934 + FMODE_READ, NULL);
934 935 if (!IS_ERR(hib_resume_bdev)) {
935 936 set_blocksize(hib_resume_bdev, PAGE_SIZE);
936 937 clear_page(swsusp_header);
kernel/trace/blktrace.c
... ... @@ -758,53 +758,58 @@
758 758 * @q: queue the io is for
759 759 * @bio: the source bio
760 760 * @what: the action
  761 + * @error: error, if any
761 762 *
762 763 * Description:
763 764 * Records an action against a bio. Will log the bio offset + size.
764 765 *
765 766 **/
766 767 static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
767   - u32 what)
  768 + u32 what, int error)
768 769 {
769 770 struct blk_trace *bt = q->blk_trace;
770 771  
771 772 if (likely(!bt))
772 773 return;
773 774  
  775 + if (!error && !bio_flagged(bio, BIO_UPTODATE))
  776 + error = EIO;
  777 +
774 778 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
775   - !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
  779 + error, 0, NULL);
776 780 }
777 781  
778 782 static void blk_add_trace_bio_bounce(void *ignore,
779 783 struct request_queue *q, struct bio *bio)
780 784 {
781   - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
  785 + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
782 786 }
783 787  
784 788 static void blk_add_trace_bio_complete(void *ignore,
785   - struct request_queue *q, struct bio *bio)
  789 + struct request_queue *q, struct bio *bio,
  790 + int error)
786 791 {
787   - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
  792 + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
788 793 }
789 794  
790 795 static void blk_add_trace_bio_backmerge(void *ignore,
791 796 struct request_queue *q,
792 797 struct bio *bio)
793 798 {
794   - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
  799 + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
795 800 }
796 801  
797 802 static void blk_add_trace_bio_frontmerge(void *ignore,
798 803 struct request_queue *q,
799 804 struct bio *bio)
800 805 {
801   - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
  806 + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
802 807 }
803 808  
804 809 static void blk_add_trace_bio_queue(void *ignore,
805 810 struct request_queue *q, struct bio *bio)
806 811 {
807   - blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
  812 + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
808 813 }
809 814  
810 815 static void blk_add_trace_getrq(void *ignore,
... ... @@ -812,7 +817,7 @@
812 817 struct bio *bio, int rw)
813 818 {
814 819 if (bio)
815   - blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
  820 + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
816 821 else {
817 822 struct blk_trace *bt = q->blk_trace;
818 823  
... ... @@ -827,7 +832,7 @@
827 832 struct bio *bio, int rw)
828 833 {
829 834 if (bio)
830   - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
  835 + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
831 836 else {
832 837 struct blk_trace *bt = q->blk_trace;
833 838  
... ... @@ -887,7 +892,7 @@
887 892 }
888 893  
889 894 /**
890   - * blk_add_trace_remap - Add a trace for a remap operation
  895 + * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
891 896 * @ignore: trace callback data parameter (not used)
892 897 * @q: queue the io is for
893 898 * @bio: the source bio
... ... @@ -899,9 +904,9 @@
899 904 * it spans a stripe (or similar). Add a trace for that action.
900 905 *
901 906 **/
902   -static void blk_add_trace_remap(void *ignore,
903   - struct request_queue *q, struct bio *bio,
904   - dev_t dev, sector_t from)
  907 +static void blk_add_trace_bio_remap(void *ignore,
  908 + struct request_queue *q, struct bio *bio,
  909 + dev_t dev, sector_t from)
905 910 {
906 911 struct blk_trace *bt = q->blk_trace;
907 912 struct blk_io_trace_remap r;
... ... @@ -1016,7 +1021,7 @@
1016 1021 WARN_ON(ret);
1017 1022 ret = register_trace_block_split(blk_add_trace_split, NULL);
1018 1023 WARN_ON(ret);
1019   - ret = register_trace_block_remap(blk_add_trace_remap, NULL);
  1024 + ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1020 1025 WARN_ON(ret);
1021 1026 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1022 1027 WARN_ON(ret);
... ... @@ -1025,7 +1030,7 @@
1025 1030 static void blk_unregister_tracepoints(void)
1026 1031 {
1027 1032 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1028   - unregister_trace_block_remap(blk_add_trace_remap, NULL);
  1033 + unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1029 1034 unregister_trace_block_split(blk_add_trace_split, NULL);
1030 1035 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
1031 1036 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
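Out-of-tree probes on block_bio_complete must grow the new error argument to keep matching the tracepoint's TP_PROTO; a sketch under the new signature (naming illustrative):

    static void my_bio_complete(void *ignore, struct request_queue *q,
                                struct bio *bio, int error)
    {
        /* error now arrives explicitly instead of via BIO_UPTODATE */
        pr_debug("bio done: sector %llu, error %d\n",
                 (unsigned long long)bio->bi_sector, error);
    }

    /* register_trace_block_bio_complete(my_bio_complete, NULL); */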
... ... @@ -1677,7 +1677,7 @@
1677 1677 if (S_ISBLK(inode->i_mode)) {
1678 1678 struct block_device *bdev = I_BDEV(inode);
1679 1679 set_blocksize(bdev, p->old_block_size);
1680   - bd_release(bdev);
  1680 + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1681 1681 } else {
1682 1682 mutex_lock(&inode->i_mutex);
1683 1683 inode->i_flags &= ~S_SWAPFILE;
... ... @@ -1939,7 +1939,8 @@
1939 1939 error = -EINVAL;
1940 1940 if (S_ISBLK(inode->i_mode)) {
1941 1941 bdev = I_BDEV(inode);
1942   - error = bd_claim(bdev, sys_swapon);
  1942 + error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
  1943 + sys_swapon);
1943 1944 if (error < 0) {
1944 1945 bdev = NULL;
1945 1946 error = -EINVAL;
... ... @@ -2136,7 +2137,7 @@
2136 2137 bad_swap:
2137 2138 if (bdev) {
2138 2139 set_blocksize(bdev, p->old_block_size);
2139   - bd_release(bdev);
  2140 + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2140 2141 }
2141 2142 destroy_swap_extents(p);
2142 2143 swap_cgroup_swapoff(type);