Commit 6f0d7a9eb60d70f22d71f00b2c762e255881ab31
Exists in ti-lsk-linux-4.1.y and in 10 other branches
Merge branch 'for-linus' of git://git.kernel.dk/linux-block
Pull block layer fixes from Jens Axboe:
 "Four small fixes that should be merged for the current 3.18-rc series.
  This pull request contains:

   - a minor bugfix for computation of best IO priority given two
     merging requests. From Jan Kara.

   - the final (final) merge count issue that has been plaguing
     virtio-blk. From Ming Lei.

   - enable parallel reinit notify for blk-mq queues, to combine the
     cost of an RCU grace period across lots of devices. From Tejun Heo.

   - an error handling fix for the SCSI_IOCTL_SEND_COMMAND ioctl. From
     Tony Battersby"

* 'for-linus' of git://git.kernel.dk/linux-block:
  block: blk-merge: fix blk_recount_segments()
  scsi: Fix more error handling in SCSI_IOCTL_SEND_COMMAND
  blk-mq: make mq_queue_reinit_notify() freeze queues in parallel
  block: Fix computation of merged request priority
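Background for the priority fix listed above ("block: Fix computation of merged request priority"): when two requests are merged, attempt_merge() keeps the stronger of the two I/O priorities via ioprio_best(req->ioprio, next->ioprio), as visible near the end of attempt_merge() in the blk-merge.c diff below. The following is a minimal, self-contained sketch of the intended semantics; the macro values mirror the kernel's ioprio encoding, but merged_ioprio() and main() are illustrative stand-ins, not the code changed by this series.

/*
 * Illustrative sketch only -- not the code touched by this pull request.
 * Lower numeric ioprio value == stronger priority; the class lives in the
 * top bits. A request that never had a priority set (class NONE == 0)
 * would otherwise compare as "strongest", so it is normalised to the
 * default best-effort priority before the two values are compared.
 */
#include <stdio.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_CLASS_NONE	0
#define IOPRIO_CLASS_RT		1
#define IOPRIO_CLASS_BE		2
#define IOPRIO_NORM		4

#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_PRIO_CLASS(prio)		((prio) >> IOPRIO_CLASS_SHIFT)
#define ioprio_valid(prio)		(IOPRIO_PRIO_CLASS(prio) != IOPRIO_CLASS_NONE)

static unsigned short merged_ioprio(unsigned short aprio, unsigned short bprio)
{
	/* Normalise "no priority set" to the default best-effort priority. */
	if (!ioprio_valid(aprio))
		aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
	if (!ioprio_valid(bprio))
		bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);

	/* Lower value wins: RT beats BE, and explicit BE beats "none". */
	return aprio < bprio ? aprio : bprio;
}

int main(void)
{
	unsigned short rt = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0);
	unsigned short none = 0;	/* request with no priority ever set */

	/* The merged request keeps the explicit RT priority, not "none". */
	printf("merged ioprio class: %u\n", IOPRIO_PRIO_CLASS(merged_ioprio(rt, none)));
	return 0;
}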
Showing 4 changed files (inline diff)
block/blk-merge.c
1 | /* | 1 | /* |
2 | * Functions related to segment and merge handling | 2 | * Functions related to segment and merge handling |
3 | */ | 3 | */ |
4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
5 | #include <linux/module.h> | 5 | #include <linux/module.h> |
6 | #include <linux/bio.h> | 6 | #include <linux/bio.h> |
7 | #include <linux/blkdev.h> | 7 | #include <linux/blkdev.h> |
8 | #include <linux/scatterlist.h> | 8 | #include <linux/scatterlist.h> |
9 | 9 | ||
10 | #include "blk.h" | 10 | #include "blk.h" |
11 | 11 | ||
12 | static unsigned int __blk_recalc_rq_segments(struct request_queue *q, | 12 | static unsigned int __blk_recalc_rq_segments(struct request_queue *q, |
13 | struct bio *bio, | 13 | struct bio *bio, |
14 | bool no_sg_merge) | 14 | bool no_sg_merge) |
15 | { | 15 | { |
16 | struct bio_vec bv, bvprv = { NULL }; | 16 | struct bio_vec bv, bvprv = { NULL }; |
17 | int cluster, high, highprv = 1; | 17 | int cluster, high, highprv = 1; |
18 | unsigned int seg_size, nr_phys_segs; | 18 | unsigned int seg_size, nr_phys_segs; |
19 | struct bio *fbio, *bbio; | 19 | struct bio *fbio, *bbio; |
20 | struct bvec_iter iter; | 20 | struct bvec_iter iter; |
21 | 21 | ||
22 | if (!bio) | 22 | if (!bio) |
23 | return 0; | 23 | return 0; |
24 | 24 | ||
25 | /* | 25 | /* |
26 | * This should probably be returning 0, but blk_add_request_payload() | 26 | * This should probably be returning 0, but blk_add_request_payload() |
27 | * (Christoph!!!!) | 27 | * (Christoph!!!!) |
28 | */ | 28 | */ |
29 | if (bio->bi_rw & REQ_DISCARD) | 29 | if (bio->bi_rw & REQ_DISCARD) |
30 | return 1; | 30 | return 1; |
31 | 31 | ||
32 | if (bio->bi_rw & REQ_WRITE_SAME) | 32 | if (bio->bi_rw & REQ_WRITE_SAME) |
33 | return 1; | 33 | return 1; |
34 | 34 | ||
35 | fbio = bio; | 35 | fbio = bio; |
36 | cluster = blk_queue_cluster(q); | 36 | cluster = blk_queue_cluster(q); |
37 | seg_size = 0; | 37 | seg_size = 0; |
38 | nr_phys_segs = 0; | 38 | nr_phys_segs = 0; |
39 | high = 0; | 39 | high = 0; |
40 | for_each_bio(bio) { | 40 | for_each_bio(bio) { |
41 | bio_for_each_segment(bv, bio, iter) { | 41 | bio_for_each_segment(bv, bio, iter) { |
42 | /* | 42 | /* |
43 | * If SG merging is disabled, each bio vector is | 43 | * If SG merging is disabled, each bio vector is |
44 | * a segment | 44 | * a segment |
45 | */ | 45 | */ |
46 | if (no_sg_merge) | 46 | if (no_sg_merge) |
47 | goto new_segment; | 47 | goto new_segment; |
48 | 48 | ||
49 | /* | 49 | /* |
50 | * the trick here is making sure that a high page is | 50 | * the trick here is making sure that a high page is |
51 | * never considered part of another segment, since | 51 | * never considered part of another segment, since |
52 | * that might change with the bounce page. | 52 | * that might change with the bounce page. |
53 | */ | 53 | */ |
54 | high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q); | 54 | high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q); |
55 | if (!high && !highprv && cluster) { | 55 | if (!high && !highprv && cluster) { |
56 | if (seg_size + bv.bv_len | 56 | if (seg_size + bv.bv_len |
57 | > queue_max_segment_size(q)) | 57 | > queue_max_segment_size(q)) |
58 | goto new_segment; | 58 | goto new_segment; |
59 | if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv)) | 59 | if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv)) |
60 | goto new_segment; | 60 | goto new_segment; |
61 | if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv)) | 61 | if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv)) |
62 | goto new_segment; | 62 | goto new_segment; |
63 | 63 | ||
64 | seg_size += bv.bv_len; | 64 | seg_size += bv.bv_len; |
65 | bvprv = bv; | 65 | bvprv = bv; |
66 | continue; | 66 | continue; |
67 | } | 67 | } |
68 | new_segment: | 68 | new_segment: |
69 | if (nr_phys_segs == 1 && seg_size > | 69 | if (nr_phys_segs == 1 && seg_size > |
70 | fbio->bi_seg_front_size) | 70 | fbio->bi_seg_front_size) |
71 | fbio->bi_seg_front_size = seg_size; | 71 | fbio->bi_seg_front_size = seg_size; |
72 | 72 | ||
73 | nr_phys_segs++; | 73 | nr_phys_segs++; |
74 | bvprv = bv; | 74 | bvprv = bv; |
75 | seg_size = bv.bv_len; | 75 | seg_size = bv.bv_len; |
76 | highprv = high; | 76 | highprv = high; |
77 | } | 77 | } |
78 | bbio = bio; | 78 | bbio = bio; |
79 | } | 79 | } |
80 | 80 | ||
81 | if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size) | 81 | if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size) |
82 | fbio->bi_seg_front_size = seg_size; | 82 | fbio->bi_seg_front_size = seg_size; |
83 | if (seg_size > bbio->bi_seg_back_size) | 83 | if (seg_size > bbio->bi_seg_back_size) |
84 | bbio->bi_seg_back_size = seg_size; | 84 | bbio->bi_seg_back_size = seg_size; |
85 | 85 | ||
86 | return nr_phys_segs; | 86 | return nr_phys_segs; |
87 | } | 87 | } |
88 | 88 | ||
89 | void blk_recalc_rq_segments(struct request *rq) | 89 | void blk_recalc_rq_segments(struct request *rq) |
90 | { | 90 | { |
91 | bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE, | 91 | bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE, |
92 | &rq->q->queue_flags); | 92 | &rq->q->queue_flags); |
93 | 93 | ||
94 | rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio, | 94 | rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio, |
95 | no_sg_merge); | 95 | no_sg_merge); |
96 | } | 96 | } |
97 | 97 | ||
98 | void blk_recount_segments(struct request_queue *q, struct bio *bio) | 98 | void blk_recount_segments(struct request_queue *q, struct bio *bio) |
99 | { | 99 | { |
100 | bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE, | 100 | unsigned short seg_cnt; |
101 | &q->queue_flags); | ||
102 | bool merge_not_need = bio->bi_vcnt < queue_max_segments(q); | ||
103 | 101 | ||
104 | if (no_sg_merge && !bio_flagged(bio, BIO_CLONED) && | 102 | /* estimate segment number by bi_vcnt for non-cloned bio */ |
105 | merge_not_need) | 103 | if (bio_flagged(bio, BIO_CLONED)) |
106 | bio->bi_phys_segments = bio->bi_vcnt; | 104 | seg_cnt = bio_segments(bio); |
105 | else | ||
106 | seg_cnt = bio->bi_vcnt; | ||
107 | |||
108 | if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) && | ||
109 | (seg_cnt < queue_max_segments(q))) | ||
110 | bio->bi_phys_segments = seg_cnt; | ||
107 | else { | 111 | else { |
108 | struct bio *nxt = bio->bi_next; | 112 | struct bio *nxt = bio->bi_next; |
109 | 113 | ||
110 | bio->bi_next = NULL; | 114 | bio->bi_next = NULL; |
111 | bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, | 115 | bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false); |
112 | no_sg_merge && merge_not_need); | ||
113 | bio->bi_next = nxt; | 116 | bio->bi_next = nxt; |
114 | } | 117 | } |
115 | 118 | ||
116 | bio->bi_flags |= (1 << BIO_SEG_VALID); | 119 | bio->bi_flags |= (1 << BIO_SEG_VALID); |
117 | } | 120 | } |
118 | EXPORT_SYMBOL(blk_recount_segments); | 121 | EXPORT_SYMBOL(blk_recount_segments); |
119 | 122 | ||
120 | static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, | 123 | static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, |
121 | struct bio *nxt) | 124 | struct bio *nxt) |
122 | { | 125 | { |
123 | struct bio_vec end_bv = { NULL }, nxt_bv; | 126 | struct bio_vec end_bv = { NULL }, nxt_bv; |
124 | struct bvec_iter iter; | 127 | struct bvec_iter iter; |
125 | 128 | ||
126 | if (!blk_queue_cluster(q)) | 129 | if (!blk_queue_cluster(q)) |
127 | return 0; | 130 | return 0; |
128 | 131 | ||
129 | if (bio->bi_seg_back_size + nxt->bi_seg_front_size > | 132 | if (bio->bi_seg_back_size + nxt->bi_seg_front_size > |
130 | queue_max_segment_size(q)) | 133 | queue_max_segment_size(q)) |
131 | return 0; | 134 | return 0; |
132 | 135 | ||
133 | if (!bio_has_data(bio)) | 136 | if (!bio_has_data(bio)) |
134 | return 1; | 137 | return 1; |
135 | 138 | ||
136 | bio_for_each_segment(end_bv, bio, iter) | 139 | bio_for_each_segment(end_bv, bio, iter) |
137 | if (end_bv.bv_len == iter.bi_size) | 140 | if (end_bv.bv_len == iter.bi_size) |
138 | break; | 141 | break; |
139 | 142 | ||
140 | nxt_bv = bio_iovec(nxt); | 143 | nxt_bv = bio_iovec(nxt); |
141 | 144 | ||
142 | if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv)) | 145 | if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv)) |
143 | return 0; | 146 | return 0; |
144 | 147 | ||
145 | /* | 148 | /* |
146 | * bio and nxt are contiguous in memory; check if the queue allows | 149 | * bio and nxt are contiguous in memory; check if the queue allows |
147 | * these two to be merged into one | 150 | * these two to be merged into one |
148 | */ | 151 | */ |
149 | if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv)) | 152 | if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv)) |
150 | return 1; | 153 | return 1; |
151 | 154 | ||
152 | return 0; | 155 | return 0; |
153 | } | 156 | } |
154 | 157 | ||
155 | static inline void | 158 | static inline void |
156 | __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, | 159 | __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, |
157 | struct scatterlist *sglist, struct bio_vec *bvprv, | 160 | struct scatterlist *sglist, struct bio_vec *bvprv, |
158 | struct scatterlist **sg, int *nsegs, int *cluster) | 161 | struct scatterlist **sg, int *nsegs, int *cluster) |
159 | { | 162 | { |
160 | 163 | ||
161 | int nbytes = bvec->bv_len; | 164 | int nbytes = bvec->bv_len; |
162 | 165 | ||
163 | if (*sg && *cluster) { | 166 | if (*sg && *cluster) { |
164 | if ((*sg)->length + nbytes > queue_max_segment_size(q)) | 167 | if ((*sg)->length + nbytes > queue_max_segment_size(q)) |
165 | goto new_segment; | 168 | goto new_segment; |
166 | 169 | ||
167 | if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) | 170 | if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) |
168 | goto new_segment; | 171 | goto new_segment; |
169 | if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) | 172 | if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) |
170 | goto new_segment; | 173 | goto new_segment; |
171 | 174 | ||
172 | (*sg)->length += nbytes; | 175 | (*sg)->length += nbytes; |
173 | } else { | 176 | } else { |
174 | new_segment: | 177 | new_segment: |
175 | if (!*sg) | 178 | if (!*sg) |
176 | *sg = sglist; | 179 | *sg = sglist; |
177 | else { | 180 | else { |
178 | /* | 181 | /* |
179 | * If the driver previously mapped a shorter | 182 | * If the driver previously mapped a shorter |
180 | * list, we could see a termination bit | 183 | * list, we could see a termination bit |
181 | * prematurely unless it fully inits the sg | 184 | * prematurely unless it fully inits the sg |
182 | * table on each mapping. We KNOW that there | 185 | * table on each mapping. We KNOW that there |
183 | * must be more entries here or the driver | 186 | * must be more entries here or the driver |
184 | * would be buggy, so force clear the | 187 | * would be buggy, so force clear the |
185 | * termination bit to avoid doing a full | 188 | * termination bit to avoid doing a full |
186 | * sg_init_table() in drivers for each command. | 189 | * sg_init_table() in drivers for each command. |
187 | */ | 190 | */ |
188 | sg_unmark_end(*sg); | 191 | sg_unmark_end(*sg); |
189 | *sg = sg_next(*sg); | 192 | *sg = sg_next(*sg); |
190 | } | 193 | } |
191 | 194 | ||
192 | sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset); | 195 | sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset); |
193 | (*nsegs)++; | 196 | (*nsegs)++; |
194 | } | 197 | } |
195 | *bvprv = *bvec; | 198 | *bvprv = *bvec; |
196 | } | 199 | } |
197 | 200 | ||
198 | static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, | 201 | static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, |
199 | struct scatterlist *sglist, | 202 | struct scatterlist *sglist, |
200 | struct scatterlist **sg) | 203 | struct scatterlist **sg) |
201 | { | 204 | { |
202 | struct bio_vec bvec, bvprv = { NULL }; | 205 | struct bio_vec bvec, bvprv = { NULL }; |
203 | struct bvec_iter iter; | 206 | struct bvec_iter iter; |
204 | int nsegs, cluster; | 207 | int nsegs, cluster; |
205 | 208 | ||
206 | nsegs = 0; | 209 | nsegs = 0; |
207 | cluster = blk_queue_cluster(q); | 210 | cluster = blk_queue_cluster(q); |
208 | 211 | ||
209 | if (bio->bi_rw & REQ_DISCARD) { | 212 | if (bio->bi_rw & REQ_DISCARD) { |
210 | /* | 213 | /* |
211 | * This is a hack - drivers should be neither modifying the | 214 | * This is a hack - drivers should be neither modifying the |
212 | * biovec, nor relying on bi_vcnt - but because of | 215 | * biovec, nor relying on bi_vcnt - but because of |
213 | * blk_add_request_payload(), a discard bio may or may not have | 216 | * blk_add_request_payload(), a discard bio may or may not have |
214 | * a payload we need to set up here (thank you Christoph) and | 217 | * a payload we need to set up here (thank you Christoph) and |
215 | * bi_vcnt is really the only way of telling if we need to. | 218 | * bi_vcnt is really the only way of telling if we need to. |
216 | */ | 219 | */ |
217 | 220 | ||
218 | if (bio->bi_vcnt) | 221 | if (bio->bi_vcnt) |
219 | goto single_segment; | 222 | goto single_segment; |
220 | 223 | ||
221 | return 0; | 224 | return 0; |
222 | } | 225 | } |
223 | 226 | ||
224 | if (bio->bi_rw & REQ_WRITE_SAME) { | 227 | if (bio->bi_rw & REQ_WRITE_SAME) { |
225 | single_segment: | 228 | single_segment: |
226 | *sg = sglist; | 229 | *sg = sglist; |
227 | bvec = bio_iovec(bio); | 230 | bvec = bio_iovec(bio); |
228 | sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); | 231 | sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); |
229 | return 1; | 232 | return 1; |
230 | } | 233 | } |
231 | 234 | ||
232 | for_each_bio(bio) | 235 | for_each_bio(bio) |
233 | bio_for_each_segment(bvec, bio, iter) | 236 | bio_for_each_segment(bvec, bio, iter) |
234 | __blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg, | 237 | __blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg, |
235 | &nsegs, &cluster); | 238 | &nsegs, &cluster); |
236 | 239 | ||
237 | return nsegs; | 240 | return nsegs; |
238 | } | 241 | } |
239 | 242 | ||
240 | /* | 243 | /* |
241 | * map a request to scatterlist, return number of sg entries setup. Caller | 244 | * map a request to scatterlist, return number of sg entries setup. Caller |
242 | * must make sure sg can hold rq->nr_phys_segments entries | 245 | * must make sure sg can hold rq->nr_phys_segments entries |
243 | */ | 246 | */ |
244 | int blk_rq_map_sg(struct request_queue *q, struct request *rq, | 247 | int blk_rq_map_sg(struct request_queue *q, struct request *rq, |
245 | struct scatterlist *sglist) | 248 | struct scatterlist *sglist) |
246 | { | 249 | { |
247 | struct scatterlist *sg = NULL; | 250 | struct scatterlist *sg = NULL; |
248 | int nsegs = 0; | 251 | int nsegs = 0; |
249 | 252 | ||
250 | if (rq->bio) | 253 | if (rq->bio) |
251 | nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); | 254 | nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); |
252 | 255 | ||
253 | if (unlikely(rq->cmd_flags & REQ_COPY_USER) && | 256 | if (unlikely(rq->cmd_flags & REQ_COPY_USER) && |
254 | (blk_rq_bytes(rq) & q->dma_pad_mask)) { | 257 | (blk_rq_bytes(rq) & q->dma_pad_mask)) { |
255 | unsigned int pad_len = | 258 | unsigned int pad_len = |
256 | (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; | 259 | (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; |
257 | 260 | ||
258 | sg->length += pad_len; | 261 | sg->length += pad_len; |
259 | rq->extra_len += pad_len; | 262 | rq->extra_len += pad_len; |
260 | } | 263 | } |
261 | 264 | ||
262 | if (q->dma_drain_size && q->dma_drain_needed(rq)) { | 265 | if (q->dma_drain_size && q->dma_drain_needed(rq)) { |
263 | if (rq->cmd_flags & REQ_WRITE) | 266 | if (rq->cmd_flags & REQ_WRITE) |
264 | memset(q->dma_drain_buffer, 0, q->dma_drain_size); | 267 | memset(q->dma_drain_buffer, 0, q->dma_drain_size); |
265 | 268 | ||
266 | sg->page_link &= ~0x02; | 269 | sg->page_link &= ~0x02; |
267 | sg = sg_next(sg); | 270 | sg = sg_next(sg); |
268 | sg_set_page(sg, virt_to_page(q->dma_drain_buffer), | 271 | sg_set_page(sg, virt_to_page(q->dma_drain_buffer), |
269 | q->dma_drain_size, | 272 | q->dma_drain_size, |
270 | ((unsigned long)q->dma_drain_buffer) & | 273 | ((unsigned long)q->dma_drain_buffer) & |
271 | (PAGE_SIZE - 1)); | 274 | (PAGE_SIZE - 1)); |
272 | nsegs++; | 275 | nsegs++; |
273 | rq->extra_len += q->dma_drain_size; | 276 | rq->extra_len += q->dma_drain_size; |
274 | } | 277 | } |
275 | 278 | ||
276 | if (sg) | 279 | if (sg) |
277 | sg_mark_end(sg); | 280 | sg_mark_end(sg); |
278 | 281 | ||
279 | return nsegs; | 282 | return nsegs; |
280 | } | 283 | } |
281 | EXPORT_SYMBOL(blk_rq_map_sg); | 284 | EXPORT_SYMBOL(blk_rq_map_sg); |
282 | 285 | ||
283 | /** | 286 | /** |
284 | * blk_bio_map_sg - map a bio to a scatterlist | 287 | * blk_bio_map_sg - map a bio to a scatterlist |
285 | * @q: request_queue in question | 288 | * @q: request_queue in question |
286 | * @bio: bio being mapped | 289 | * @bio: bio being mapped |
287 | * @sglist: scatterlist being mapped | 290 | * @sglist: scatterlist being mapped |
288 | * | 291 | * |
289 | * Note: | 292 | * Note: |
290 | * Caller must make sure sg can hold bio->bi_phys_segments entries | 293 | * Caller must make sure sg can hold bio->bi_phys_segments entries |
291 | * | 294 | * |
292 | * Will return the number of sg entries setup | 295 | * Will return the number of sg entries setup |
293 | */ | 296 | */ |
294 | int blk_bio_map_sg(struct request_queue *q, struct bio *bio, | 297 | int blk_bio_map_sg(struct request_queue *q, struct bio *bio, |
295 | struct scatterlist *sglist) | 298 | struct scatterlist *sglist) |
296 | { | 299 | { |
297 | struct scatterlist *sg = NULL; | 300 | struct scatterlist *sg = NULL; |
298 | int nsegs; | 301 | int nsegs; |
299 | struct bio *next = bio->bi_next; | 302 | struct bio *next = bio->bi_next; |
300 | bio->bi_next = NULL; | 303 | bio->bi_next = NULL; |
301 | 304 | ||
302 | nsegs = __blk_bios_map_sg(q, bio, sglist, &sg); | 305 | nsegs = __blk_bios_map_sg(q, bio, sglist, &sg); |
303 | bio->bi_next = next; | 306 | bio->bi_next = next; |
304 | if (sg) | 307 | if (sg) |
305 | sg_mark_end(sg); | 308 | sg_mark_end(sg); |
306 | 309 | ||
307 | BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); | 310 | BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); |
308 | return nsegs; | 311 | return nsegs; |
309 | } | 312 | } |
310 | EXPORT_SYMBOL(blk_bio_map_sg); | 313 | EXPORT_SYMBOL(blk_bio_map_sg); |
311 | 314 | ||
312 | static inline int ll_new_hw_segment(struct request_queue *q, | 315 | static inline int ll_new_hw_segment(struct request_queue *q, |
313 | struct request *req, | 316 | struct request *req, |
314 | struct bio *bio) | 317 | struct bio *bio) |
315 | { | 318 | { |
316 | int nr_phys_segs = bio_phys_segments(q, bio); | 319 | int nr_phys_segs = bio_phys_segments(q, bio); |
317 | 320 | ||
318 | if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) | 321 | if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) |
319 | goto no_merge; | 322 | goto no_merge; |
320 | 323 | ||
321 | if (blk_integrity_merge_bio(q, req, bio) == false) | 324 | if (blk_integrity_merge_bio(q, req, bio) == false) |
322 | goto no_merge; | 325 | goto no_merge; |
323 | 326 | ||
324 | /* | 327 | /* |
325 | * This will form the start of a new hw segment. Bump both | 328 | * This will form the start of a new hw segment. Bump both |
326 | * counters. | 329 | * counters. |
327 | */ | 330 | */ |
328 | req->nr_phys_segments += nr_phys_segs; | 331 | req->nr_phys_segments += nr_phys_segs; |
329 | return 1; | 332 | return 1; |
330 | 333 | ||
331 | no_merge: | 334 | no_merge: |
332 | req->cmd_flags |= REQ_NOMERGE; | 335 | req->cmd_flags |= REQ_NOMERGE; |
333 | if (req == q->last_merge) | 336 | if (req == q->last_merge) |
334 | q->last_merge = NULL; | 337 | q->last_merge = NULL; |
335 | return 0; | 338 | return 0; |
336 | } | 339 | } |
337 | 340 | ||
338 | int ll_back_merge_fn(struct request_queue *q, struct request *req, | 341 | int ll_back_merge_fn(struct request_queue *q, struct request *req, |
339 | struct bio *bio) | 342 | struct bio *bio) |
340 | { | 343 | { |
341 | if (blk_rq_sectors(req) + bio_sectors(bio) > | 344 | if (blk_rq_sectors(req) + bio_sectors(bio) > |
342 | blk_rq_get_max_sectors(req)) { | 345 | blk_rq_get_max_sectors(req)) { |
343 | req->cmd_flags |= REQ_NOMERGE; | 346 | req->cmd_flags |= REQ_NOMERGE; |
344 | if (req == q->last_merge) | 347 | if (req == q->last_merge) |
345 | q->last_merge = NULL; | 348 | q->last_merge = NULL; |
346 | return 0; | 349 | return 0; |
347 | } | 350 | } |
348 | if (!bio_flagged(req->biotail, BIO_SEG_VALID)) | 351 | if (!bio_flagged(req->biotail, BIO_SEG_VALID)) |
349 | blk_recount_segments(q, req->biotail); | 352 | blk_recount_segments(q, req->biotail); |
350 | if (!bio_flagged(bio, BIO_SEG_VALID)) | 353 | if (!bio_flagged(bio, BIO_SEG_VALID)) |
351 | blk_recount_segments(q, bio); | 354 | blk_recount_segments(q, bio); |
352 | 355 | ||
353 | return ll_new_hw_segment(q, req, bio); | 356 | return ll_new_hw_segment(q, req, bio); |
354 | } | 357 | } |
355 | 358 | ||
356 | int ll_front_merge_fn(struct request_queue *q, struct request *req, | 359 | int ll_front_merge_fn(struct request_queue *q, struct request *req, |
357 | struct bio *bio) | 360 | struct bio *bio) |
358 | { | 361 | { |
359 | if (blk_rq_sectors(req) + bio_sectors(bio) > | 362 | if (blk_rq_sectors(req) + bio_sectors(bio) > |
360 | blk_rq_get_max_sectors(req)) { | 363 | blk_rq_get_max_sectors(req)) { |
361 | req->cmd_flags |= REQ_NOMERGE; | 364 | req->cmd_flags |= REQ_NOMERGE; |
362 | if (req == q->last_merge) | 365 | if (req == q->last_merge) |
363 | q->last_merge = NULL; | 366 | q->last_merge = NULL; |
364 | return 0; | 367 | return 0; |
365 | } | 368 | } |
366 | if (!bio_flagged(bio, BIO_SEG_VALID)) | 369 | if (!bio_flagged(bio, BIO_SEG_VALID)) |
367 | blk_recount_segments(q, bio); | 370 | blk_recount_segments(q, bio); |
368 | if (!bio_flagged(req->bio, BIO_SEG_VALID)) | 371 | if (!bio_flagged(req->bio, BIO_SEG_VALID)) |
369 | blk_recount_segments(q, req->bio); | 372 | blk_recount_segments(q, req->bio); |
370 | 373 | ||
371 | return ll_new_hw_segment(q, req, bio); | 374 | return ll_new_hw_segment(q, req, bio); |
372 | } | 375 | } |
373 | 376 | ||
374 | /* | 377 | /* |
375 | * blk-mq uses req->special to carry normal driver per-request payload, it | 378 | * blk-mq uses req->special to carry normal driver per-request payload, it |
376 | * does not indicate a prepared command that we cannot merge with. | 379 | * does not indicate a prepared command that we cannot merge with. |
377 | */ | 380 | */ |
378 | static bool req_no_special_merge(struct request *req) | 381 | static bool req_no_special_merge(struct request *req) |
379 | { | 382 | { |
380 | struct request_queue *q = req->q; | 383 | struct request_queue *q = req->q; |
381 | 384 | ||
382 | return !q->mq_ops && req->special; | 385 | return !q->mq_ops && req->special; |
383 | } | 386 | } |
384 | 387 | ||
385 | static int ll_merge_requests_fn(struct request_queue *q, struct request *req, | 388 | static int ll_merge_requests_fn(struct request_queue *q, struct request *req, |
386 | struct request *next) | 389 | struct request *next) |
387 | { | 390 | { |
388 | int total_phys_segments; | 391 | int total_phys_segments; |
389 | unsigned int seg_size = | 392 | unsigned int seg_size = |
390 | req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size; | 393 | req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size; |
391 | 394 | ||
392 | /* | 395 | /* |
393 | * First check if the either of the requests are re-queued | 396 | * First check if the either of the requests are re-queued |
394 | * requests. Can't merge them if they are. | 397 | * requests. Can't merge them if they are. |
395 | */ | 398 | */ |
396 | if (req_no_special_merge(req) || req_no_special_merge(next)) | 399 | if (req_no_special_merge(req) || req_no_special_merge(next)) |
397 | return 0; | 400 | return 0; |
398 | 401 | ||
399 | /* | 402 | /* |
400 | * Will it become too large? | 403 | * Will it become too large? |
401 | */ | 404 | */ |
402 | if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > | 405 | if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > |
403 | blk_rq_get_max_sectors(req)) | 406 | blk_rq_get_max_sectors(req)) |
404 | return 0; | 407 | return 0; |
405 | 408 | ||
406 | total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; | 409 | total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; |
407 | if (blk_phys_contig_segment(q, req->biotail, next->bio)) { | 410 | if (blk_phys_contig_segment(q, req->biotail, next->bio)) { |
408 | if (req->nr_phys_segments == 1) | 411 | if (req->nr_phys_segments == 1) |
409 | req->bio->bi_seg_front_size = seg_size; | 412 | req->bio->bi_seg_front_size = seg_size; |
410 | if (next->nr_phys_segments == 1) | 413 | if (next->nr_phys_segments == 1) |
411 | next->biotail->bi_seg_back_size = seg_size; | 414 | next->biotail->bi_seg_back_size = seg_size; |
412 | total_phys_segments--; | 415 | total_phys_segments--; |
413 | } | 416 | } |
414 | 417 | ||
415 | if (total_phys_segments > queue_max_segments(q)) | 418 | if (total_phys_segments > queue_max_segments(q)) |
416 | return 0; | 419 | return 0; |
417 | 420 | ||
418 | if (blk_integrity_merge_rq(q, req, next) == false) | 421 | if (blk_integrity_merge_rq(q, req, next) == false) |
419 | return 0; | 422 | return 0; |
420 | 423 | ||
421 | /* Merge is OK... */ | 424 | /* Merge is OK... */ |
422 | req->nr_phys_segments = total_phys_segments; | 425 | req->nr_phys_segments = total_phys_segments; |
423 | return 1; | 426 | return 1; |
424 | } | 427 | } |
425 | 428 | ||
426 | /** | 429 | /** |
427 | * blk_rq_set_mixed_merge - mark a request as mixed merge | 430 | * blk_rq_set_mixed_merge - mark a request as mixed merge |
428 | * @rq: request to mark as mixed merge | 431 | * @rq: request to mark as mixed merge |
429 | * | 432 | * |
430 | * Description: | 433 | * Description: |
431 | * @rq is about to be mixed merged. Make sure the attributes | 434 | * @rq is about to be mixed merged. Make sure the attributes |
432 | * which can be mixed are set in each bio and mark @rq as mixed | 435 | * which can be mixed are set in each bio and mark @rq as mixed |
433 | * merged. | 436 | * merged. |
434 | */ | 437 | */ |
435 | void blk_rq_set_mixed_merge(struct request *rq) | 438 | void blk_rq_set_mixed_merge(struct request *rq) |
436 | { | 439 | { |
437 | unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; | 440 | unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; |
438 | struct bio *bio; | 441 | struct bio *bio; |
439 | 442 | ||
440 | if (rq->cmd_flags & REQ_MIXED_MERGE) | 443 | if (rq->cmd_flags & REQ_MIXED_MERGE) |
441 | return; | 444 | return; |
442 | 445 | ||
443 | /* | 446 | /* |
444 | * @rq will no longer represent mixable attributes for all the | 447 | * @rq will no longer represent mixable attributes for all the |
445 | * contained bios. It will just track those of the first one. | 448 | * contained bios. It will just track those of the first one. |
446 | * Distributes the attributs to each bio. | 449 | * Distributes the attributs to each bio. |
447 | */ | 450 | */ |
448 | for (bio = rq->bio; bio; bio = bio->bi_next) { | 451 | for (bio = rq->bio; bio; bio = bio->bi_next) { |
449 | WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) && | 452 | WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) && |
450 | (bio->bi_rw & REQ_FAILFAST_MASK) != ff); | 453 | (bio->bi_rw & REQ_FAILFAST_MASK) != ff); |
451 | bio->bi_rw |= ff; | 454 | bio->bi_rw |= ff; |
452 | } | 455 | } |
453 | rq->cmd_flags |= REQ_MIXED_MERGE; | 456 | rq->cmd_flags |= REQ_MIXED_MERGE; |
454 | } | 457 | } |
455 | 458 | ||
456 | static void blk_account_io_merge(struct request *req) | 459 | static void blk_account_io_merge(struct request *req) |
457 | { | 460 | { |
458 | if (blk_do_io_stat(req)) { | 461 | if (blk_do_io_stat(req)) { |
459 | struct hd_struct *part; | 462 | struct hd_struct *part; |
460 | int cpu; | 463 | int cpu; |
461 | 464 | ||
462 | cpu = part_stat_lock(); | 465 | cpu = part_stat_lock(); |
463 | part = req->part; | 466 | part = req->part; |
464 | 467 | ||
465 | part_round_stats(cpu, part); | 468 | part_round_stats(cpu, part); |
466 | part_dec_in_flight(part, rq_data_dir(req)); | 469 | part_dec_in_flight(part, rq_data_dir(req)); |
467 | 470 | ||
468 | hd_struct_put(part); | 471 | hd_struct_put(part); |
469 | part_stat_unlock(); | 472 | part_stat_unlock(); |
470 | } | 473 | } |
471 | } | 474 | } |
472 | 475 | ||
473 | /* | 476 | /* |
474 | * Has to be called with the request spinlock acquired | 477 | * Has to be called with the request spinlock acquired |
475 | */ | 478 | */ |
476 | static int attempt_merge(struct request_queue *q, struct request *req, | 479 | static int attempt_merge(struct request_queue *q, struct request *req, |
477 | struct request *next) | 480 | struct request *next) |
478 | { | 481 | { |
479 | if (!rq_mergeable(req) || !rq_mergeable(next)) | 482 | if (!rq_mergeable(req) || !rq_mergeable(next)) |
480 | return 0; | 483 | return 0; |
481 | 484 | ||
482 | if (!blk_check_merge_flags(req->cmd_flags, next->cmd_flags)) | 485 | if (!blk_check_merge_flags(req->cmd_flags, next->cmd_flags)) |
483 | return 0; | 486 | return 0; |
484 | 487 | ||
485 | /* | 488 | /* |
486 | * not contiguous | 489 | * not contiguous |
487 | */ | 490 | */ |
488 | if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next)) | 491 | if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next)) |
489 | return 0; | 492 | return 0; |
490 | 493 | ||
491 | if (rq_data_dir(req) != rq_data_dir(next) | 494 | if (rq_data_dir(req) != rq_data_dir(next) |
492 | || req->rq_disk != next->rq_disk | 495 | || req->rq_disk != next->rq_disk |
493 | || req_no_special_merge(next)) | 496 | || req_no_special_merge(next)) |
494 | return 0; | 497 | return 0; |
495 | 498 | ||
496 | if (req->cmd_flags & REQ_WRITE_SAME && | 499 | if (req->cmd_flags & REQ_WRITE_SAME && |
497 | !blk_write_same_mergeable(req->bio, next->bio)) | 500 | !blk_write_same_mergeable(req->bio, next->bio)) |
498 | return 0; | 501 | return 0; |
499 | 502 | ||
500 | /* | 503 | /* |
501 | * If we are allowed to merge, then append bio list | 504 | * If we are allowed to merge, then append bio list |
502 | * from next to rq and release next. merge_requests_fn | 505 | * from next to rq and release next. merge_requests_fn |
503 | * will have updated segment counts, update sector | 506 | * will have updated segment counts, update sector |
504 | * counts here. | 507 | * counts here. |
505 | */ | 508 | */ |
506 | if (!ll_merge_requests_fn(q, req, next)) | 509 | if (!ll_merge_requests_fn(q, req, next)) |
507 | return 0; | 510 | return 0; |
508 | 511 | ||
509 | /* | 512 | /* |
510 | * If failfast settings disagree or any of the two is already | 513 | * If failfast settings disagree or any of the two is already |
511 | * a mixed merge, mark both as mixed before proceeding. This | 514 | * a mixed merge, mark both as mixed before proceeding. This |
512 | * makes sure that all involved bios have mixable attributes | 515 | * makes sure that all involved bios have mixable attributes |
513 | * set properly. | 516 | * set properly. |
514 | */ | 517 | */ |
515 | if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE || | 518 | if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE || |
516 | (req->cmd_flags & REQ_FAILFAST_MASK) != | 519 | (req->cmd_flags & REQ_FAILFAST_MASK) != |
517 | (next->cmd_flags & REQ_FAILFAST_MASK)) { | 520 | (next->cmd_flags & REQ_FAILFAST_MASK)) { |
518 | blk_rq_set_mixed_merge(req); | 521 | blk_rq_set_mixed_merge(req); |
519 | blk_rq_set_mixed_merge(next); | 522 | blk_rq_set_mixed_merge(next); |
520 | } | 523 | } |
521 | 524 | ||
522 | /* | 525 | /* |
523 | * At this point we have either done a back merge | 526 | * At this point we have either done a back merge |
524 | * or front merge. We need the smaller start_time of | 527 | * or front merge. We need the smaller start_time of |
525 | * the merged requests to be the current request | 528 | * the merged requests to be the current request |
526 | * for accounting purposes. | 529 | * for accounting purposes. |
527 | */ | 530 | */ |
528 | if (time_after(req->start_time, next->start_time)) | 531 | if (time_after(req->start_time, next->start_time)) |
529 | req->start_time = next->start_time; | 532 | req->start_time = next->start_time; |
530 | 533 | ||
531 | req->biotail->bi_next = next->bio; | 534 | req->biotail->bi_next = next->bio; |
532 | req->biotail = next->biotail; | 535 | req->biotail = next->biotail; |
533 | 536 | ||
534 | req->__data_len += blk_rq_bytes(next); | 537 | req->__data_len += blk_rq_bytes(next); |
535 | 538 | ||
536 | elv_merge_requests(q, req, next); | 539 | elv_merge_requests(q, req, next); |
537 | 540 | ||
538 | /* | 541 | /* |
539 | * 'next' is going away, so update stats accordingly | 542 | * 'next' is going away, so update stats accordingly |
540 | */ | 543 | */ |
541 | blk_account_io_merge(next); | 544 | blk_account_io_merge(next); |
542 | 545 | ||
543 | req->ioprio = ioprio_best(req->ioprio, next->ioprio); | 546 | req->ioprio = ioprio_best(req->ioprio, next->ioprio); |
544 | if (blk_rq_cpu_valid(next)) | 547 | if (blk_rq_cpu_valid(next)) |
545 | req->cpu = next->cpu; | 548 | req->cpu = next->cpu; |
546 | 549 | ||
547 | /* owner-ship of bio passed from next to req */ | 550 | /* owner-ship of bio passed from next to req */ |
548 | next->bio = NULL; | 551 | next->bio = NULL; |
549 | __blk_put_request(q, next); | 552 | __blk_put_request(q, next); |
550 | return 1; | 553 | return 1; |
551 | } | 554 | } |
552 | 555 | ||
553 | int attempt_back_merge(struct request_queue *q, struct request *rq) | 556 | int attempt_back_merge(struct request_queue *q, struct request *rq) |
554 | { | 557 | { |
555 | struct request *next = elv_latter_request(q, rq); | 558 | struct request *next = elv_latter_request(q, rq); |
556 | 559 | ||
557 | if (next) | 560 | if (next) |
558 | return attempt_merge(q, rq, next); | 561 | return attempt_merge(q, rq, next); |
559 | 562 | ||
560 | return 0; | 563 | return 0; |
561 | } | 564 | } |
562 | 565 | ||
563 | int attempt_front_merge(struct request_queue *q, struct request *rq) | 566 | int attempt_front_merge(struct request_queue *q, struct request *rq) |
564 | { | 567 | { |
565 | struct request *prev = elv_former_request(q, rq); | 568 | struct request *prev = elv_former_request(q, rq); |
566 | 569 | ||
567 | if (prev) | 570 | if (prev) |
568 | return attempt_merge(q, prev, rq); | 571 | return attempt_merge(q, prev, rq); |
569 | 572 | ||
570 | return 0; | 573 | return 0; |
571 | } | 574 | } |
572 | 575 | ||
573 | int blk_attempt_req_merge(struct request_queue *q, struct request *rq, | 576 | int blk_attempt_req_merge(struct request_queue *q, struct request *rq, |
574 | struct request *next) | 577 | struct request *next) |
575 | { | 578 | { |
576 | return attempt_merge(q, rq, next); | 579 | return attempt_merge(q, rq, next); |
577 | } | 580 | } |
578 | 581 | ||
579 | bool blk_rq_merge_ok(struct request *rq, struct bio *bio) | 582 | bool blk_rq_merge_ok(struct request *rq, struct bio *bio) |
580 | { | 583 | { |
581 | struct request_queue *q = rq->q; | 584 | struct request_queue *q = rq->q; |
582 | 585 | ||
583 | if (!rq_mergeable(rq) || !bio_mergeable(bio)) | 586 | if (!rq_mergeable(rq) || !bio_mergeable(bio)) |
584 | return false; | 587 | return false; |
585 | 588 | ||
586 | if (!blk_check_merge_flags(rq->cmd_flags, bio->bi_rw)) | 589 | if (!blk_check_merge_flags(rq->cmd_flags, bio->bi_rw)) |
587 | return false; | 590 | return false; |
588 | 591 | ||
589 | /* different data direction or already started, don't merge */ | 592 | /* different data direction or already started, don't merge */ |
590 | if (bio_data_dir(bio) != rq_data_dir(rq)) | 593 | if (bio_data_dir(bio) != rq_data_dir(rq)) |
591 | return false; | 594 | return false; |
592 | 595 | ||
593 | /* must be same device and not a special request */ | 596 | /* must be same device and not a special request */ |
594 | if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq)) | 597 | if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq)) |
595 | return false; | 598 | return false; |
596 | 599 | ||
597 | /* only merge integrity protected bio into ditto rq */ | 600 | /* only merge integrity protected bio into ditto rq */ |
598 | if (blk_integrity_merge_bio(rq->q, rq, bio) == false) | 601 | if (blk_integrity_merge_bio(rq->q, rq, bio) == false) |
599 | return false; | 602 | return false; |
600 | 603 | ||
601 | /* must be using the same buffer */ | 604 | /* must be using the same buffer */ |
602 | if (rq->cmd_flags & REQ_WRITE_SAME && | 605 | if (rq->cmd_flags & REQ_WRITE_SAME && |
603 | !blk_write_same_mergeable(rq->bio, bio)) | 606 | !blk_write_same_mergeable(rq->bio, bio)) |
604 | return false; | 607 | return false; |
605 | 608 | ||
606 | if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) { | 609 | if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) { |
607 | struct bio_vec *bprev; | 610 | struct bio_vec *bprev; |
608 | 611 | ||
609 | bprev = &rq->biotail->bi_io_vec[bio->bi_vcnt - 1]; | 612 | bprev = &rq->biotail->bi_io_vec[bio->bi_vcnt - 1]; |
610 | if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset)) | 613 | if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset)) |
611 | return false; | 614 | return false; |
612 | } | 615 | } |
613 | 616 | ||
614 | return true; | 617 | return true; |
615 | } | 618 | } |
616 | 619 | ||
617 | int blk_try_merge(struct request *rq, struct bio *bio) | 620 | int blk_try_merge(struct request *rq, struct bio *bio) |
618 | { | 621 | { |
619 | if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector) | 622 | if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector) |
620 | return ELEVATOR_BACK_MERGE; | 623 | return ELEVATOR_BACK_MERGE; |
621 | else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector) | 624 | else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector) |
622 | return ELEVATOR_FRONT_MERGE; | 625 | return ELEVATOR_FRONT_MERGE; |
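A note on the blk_recount_segments() change above: the fast path (bio->bi_phys_segments = seg_cnt) is only taken when QUEUE_FLAG_NO_SG_MERGE is set and the estimated count fits under queue_max_segments(); for a cloned bio the estimate now comes from bio_segments() rather than bi_vcnt, the usual reason being that a clone shares its parent's bvec array, so its bi_vcnt need not describe the clone's own range. Below is a minimal sketch of that decision with stand-in types and values instead of the real struct bio and request_queue; it is illustrative only.

/*
 * Illustrative sketch of the fixed decision in blk_recount_segments().
 * The struct and helpers here are stand-ins, not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_bio {
	bool cloned;              /* stands in for bio_flagged(bio, BIO_CLONED) */
	unsigned short vcnt;      /* stands in for bio->bi_vcnt                 */
	unsigned short iter_segs; /* what walking the iterator would count      */
};

static unsigned int full_recalc(const struct fake_bio *bio)
{
	/* stands in for __blk_recalc_rq_segments(q, bio, false) */
	return bio->iter_segs;
}

static unsigned int recount_segments(const struct fake_bio *bio,
				     bool no_sg_merge, unsigned int max_segs)
{
	/* estimate segment number by bi_vcnt only for non-cloned bios */
	unsigned int seg_cnt = bio->cloned ? bio->iter_segs : bio->vcnt;

	if (no_sg_merge && seg_cnt < max_segs)
		return seg_cnt;		/* fast path */

	return full_recalc(bio);	/* slow path with SG merging */
}

int main(void)
{
	struct fake_bio clone = { .cloned = true, .vcnt = 0, .iter_segs = 3 };

	/* A cloned bio must not report 0 segments just because bi_vcnt is 0. */
	printf("segments: %u\n", recount_segments(&clone, true, 128));
	return 0;
}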
block/blk-mq.c
1 | /* | 1 | /* |
2 | * Block multiqueue core code | 2 | * Block multiqueue core code |
3 | * | 3 | * |
4 | * Copyright (C) 2013-2014 Jens Axboe | 4 | * Copyright (C) 2013-2014 Jens Axboe |
5 | * Copyright (C) 2013-2014 Christoph Hellwig | 5 | * Copyright (C) 2013-2014 Christoph Hellwig |
6 | */ | 6 | */ |
7 | #include <linux/kernel.h> | 7 | #include <linux/kernel.h> |
8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
9 | #include <linux/backing-dev.h> | 9 | #include <linux/backing-dev.h> |
10 | #include <linux/bio.h> | 10 | #include <linux/bio.h> |
11 | #include <linux/blkdev.h> | 11 | #include <linux/blkdev.h> |
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/init.h> | 13 | #include <linux/init.h> |
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/workqueue.h> | 15 | #include <linux/workqueue.h> |
16 | #include <linux/smp.h> | 16 | #include <linux/smp.h> |
17 | #include <linux/llist.h> | 17 | #include <linux/llist.h> |
18 | #include <linux/list_sort.h> | 18 | #include <linux/list_sort.h> |
19 | #include <linux/cpu.h> | 19 | #include <linux/cpu.h> |
20 | #include <linux/cache.h> | 20 | #include <linux/cache.h> |
21 | #include <linux/sched/sysctl.h> | 21 | #include <linux/sched/sysctl.h> |
22 | #include <linux/delay.h> | 22 | #include <linux/delay.h> |
23 | #include <linux/crash_dump.h> | 23 | #include <linux/crash_dump.h> |
24 | 24 | ||
25 | #include <trace/events/block.h> | 25 | #include <trace/events/block.h> |
26 | 26 | ||
27 | #include <linux/blk-mq.h> | 27 | #include <linux/blk-mq.h> |
28 | #include "blk.h" | 28 | #include "blk.h" |
29 | #include "blk-mq.h" | 29 | #include "blk-mq.h" |
30 | #include "blk-mq-tag.h" | 30 | #include "blk-mq-tag.h" |
31 | 31 | ||
32 | static DEFINE_MUTEX(all_q_mutex); | 32 | static DEFINE_MUTEX(all_q_mutex); |
33 | static LIST_HEAD(all_q_list); | 33 | static LIST_HEAD(all_q_list); |
34 | 34 | ||
35 | static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); | 35 | static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); |
36 | 36 | ||
37 | /* | 37 | /* |
38 | * Check if any of the ctx's have pending work in this hardware queue | 38 | * Check if any of the ctx's have pending work in this hardware queue |
39 | */ | 39 | */ |
40 | static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) | 40 | static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) |
41 | { | 41 | { |
42 | unsigned int i; | 42 | unsigned int i; |
43 | 43 | ||
44 | for (i = 0; i < hctx->ctx_map.map_size; i++) | 44 | for (i = 0; i < hctx->ctx_map.map_size; i++) |
45 | if (hctx->ctx_map.map[i].word) | 45 | if (hctx->ctx_map.map[i].word) |
46 | return true; | 46 | return true; |
47 | 47 | ||
48 | return false; | 48 | return false; |
49 | } | 49 | } |
50 | 50 | ||
51 | static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, | 51 | static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, |
52 | struct blk_mq_ctx *ctx) | 52 | struct blk_mq_ctx *ctx) |
53 | { | 53 | { |
54 | return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; | 54 | return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; |
55 | } | 55 | } |
56 | 56 | ||
57 | #define CTX_TO_BIT(hctx, ctx) \ | 57 | #define CTX_TO_BIT(hctx, ctx) \ |
58 | ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) | 58 | ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) |
59 | 59 | ||
60 | /* | 60 | /* |
61 | * Mark this ctx as having pending work in this hardware queue | 61 | * Mark this ctx as having pending work in this hardware queue |
62 | */ | 62 | */ |
63 | static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, | 63 | static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, |
64 | struct blk_mq_ctx *ctx) | 64 | struct blk_mq_ctx *ctx) |
65 | { | 65 | { |
66 | struct blk_align_bitmap *bm = get_bm(hctx, ctx); | 66 | struct blk_align_bitmap *bm = get_bm(hctx, ctx); |
67 | 67 | ||
68 | if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) | 68 | if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) |
69 | set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); | 69 | set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); |
70 | } | 70 | } |
71 | 71 | ||
72 | static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, | 72 | static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, |
73 | struct blk_mq_ctx *ctx) | 73 | struct blk_mq_ctx *ctx) |
74 | { | 74 | { |
75 | struct blk_align_bitmap *bm = get_bm(hctx, ctx); | 75 | struct blk_align_bitmap *bm = get_bm(hctx, ctx); |
76 | 76 | ||
77 | clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); | 77 | clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); |
78 | } | 78 | } |
79 | 79 | ||
80 | static int blk_mq_queue_enter(struct request_queue *q) | 80 | static int blk_mq_queue_enter(struct request_queue *q) |
81 | { | 81 | { |
82 | while (true) { | 82 | while (true) { |
83 | int ret; | 83 | int ret; |
84 | 84 | ||
85 | if (percpu_ref_tryget_live(&q->mq_usage_counter)) | 85 | if (percpu_ref_tryget_live(&q->mq_usage_counter)) |
86 | return 0; | 86 | return 0; |
87 | 87 | ||
88 | ret = wait_event_interruptible(q->mq_freeze_wq, | 88 | ret = wait_event_interruptible(q->mq_freeze_wq, |
89 | !q->mq_freeze_depth || blk_queue_dying(q)); | 89 | !q->mq_freeze_depth || blk_queue_dying(q)); |
90 | if (blk_queue_dying(q)) | 90 | if (blk_queue_dying(q)) |
91 | return -ENODEV; | 91 | return -ENODEV; |
92 | if (ret) | 92 | if (ret) |
93 | return ret; | 93 | return ret; |
94 | } | 94 | } |
95 | } | 95 | } |
96 | 96 | ||
97 | static void blk_mq_queue_exit(struct request_queue *q) | 97 | static void blk_mq_queue_exit(struct request_queue *q) |
98 | { | 98 | { |
99 | percpu_ref_put(&q->mq_usage_counter); | 99 | percpu_ref_put(&q->mq_usage_counter); |
100 | } | 100 | } |
101 | 101 | ||
102 | static void blk_mq_usage_counter_release(struct percpu_ref *ref) | 102 | static void blk_mq_usage_counter_release(struct percpu_ref *ref) |
103 | { | 103 | { |
104 | struct request_queue *q = | 104 | struct request_queue *q = |
105 | container_of(ref, struct request_queue, mq_usage_counter); | 105 | container_of(ref, struct request_queue, mq_usage_counter); |
106 | 106 | ||
107 | wake_up_all(&q->mq_freeze_wq); | 107 | wake_up_all(&q->mq_freeze_wq); |
108 | } | 108 | } |
109 | 109 | ||
110 | /* | 110 | static void blk_mq_freeze_queue_start(struct request_queue *q) |
111 | * Guarantee no request is in use, so we can change any data structure of | ||
112 | * the queue afterward. | ||
113 | */ | ||
114 | void blk_mq_freeze_queue(struct request_queue *q) | ||
115 | { | 111 | { |
116 | bool freeze; | 112 | bool freeze; |
117 | 113 | ||
118 | spin_lock_irq(q->queue_lock); | 114 | spin_lock_irq(q->queue_lock); |
119 | freeze = !q->mq_freeze_depth++; | 115 | freeze = !q->mq_freeze_depth++; |
120 | spin_unlock_irq(q->queue_lock); | 116 | spin_unlock_irq(q->queue_lock); |
121 | 117 | ||
122 | if (freeze) { | 118 | if (freeze) { |
123 | percpu_ref_kill(&q->mq_usage_counter); | 119 | percpu_ref_kill(&q->mq_usage_counter); |
124 | blk_mq_run_queues(q, false); | 120 | blk_mq_run_queues(q, false); |
125 | } | 121 | } |
122 | } | ||
123 | |||
124 | static void blk_mq_freeze_queue_wait(struct request_queue *q) | ||
125 | { | ||
126 | wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); | 126 | wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); |
127 | } | 127 | } |
128 | 128 | ||
129 | /* | ||
130 | * Guarantee no request is in use, so we can change any data structure of | ||
131 | * the queue afterward. | ||
132 | */ | ||
133 | void blk_mq_freeze_queue(struct request_queue *q) | ||
134 | { | ||
135 | blk_mq_freeze_queue_start(q); | ||
136 | blk_mq_freeze_queue_wait(q); | ||
137 | } | ||
138 | |||
129 | static void blk_mq_unfreeze_queue(struct request_queue *q) | 139 | static void blk_mq_unfreeze_queue(struct request_queue *q) |
130 | { | 140 | { |
131 | bool wake; | 141 | bool wake; |
132 | 142 | ||
133 | spin_lock_irq(q->queue_lock); | 143 | spin_lock_irq(q->queue_lock); |
134 | wake = !--q->mq_freeze_depth; | 144 | wake = !--q->mq_freeze_depth; |
135 | WARN_ON_ONCE(q->mq_freeze_depth < 0); | 145 | WARN_ON_ONCE(q->mq_freeze_depth < 0); |
136 | spin_unlock_irq(q->queue_lock); | 146 | spin_unlock_irq(q->queue_lock); |
137 | if (wake) { | 147 | if (wake) { |
138 | percpu_ref_reinit(&q->mq_usage_counter); | 148 | percpu_ref_reinit(&q->mq_usage_counter); |
139 | wake_up_all(&q->mq_freeze_wq); | 149 | wake_up_all(&q->mq_freeze_wq); |
140 | } | 150 | } |
141 | } | 151 | } |
142 | 152 | ||
143 | bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) | 153 | bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) |
144 | { | 154 | { |
145 | return blk_mq_has_free_tags(hctx->tags); | 155 | return blk_mq_has_free_tags(hctx->tags); |
146 | } | 156 | } |
147 | EXPORT_SYMBOL(blk_mq_can_queue); | 157 | EXPORT_SYMBOL(blk_mq_can_queue); |
148 | 158 | ||
149 | static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, | 159 | static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, |
150 | struct request *rq, unsigned int rw_flags) | 160 | struct request *rq, unsigned int rw_flags) |
151 | { | 161 | { |
152 | if (blk_queue_io_stat(q)) | 162 | if (blk_queue_io_stat(q)) |
153 | rw_flags |= REQ_IO_STAT; | 163 | rw_flags |= REQ_IO_STAT; |
154 | 164 | ||
155 | INIT_LIST_HEAD(&rq->queuelist); | 165 | INIT_LIST_HEAD(&rq->queuelist); |
156 | /* csd/requeue_work/fifo_time is initialized before use */ | 166 | /* csd/requeue_work/fifo_time is initialized before use */ |
157 | rq->q = q; | 167 | rq->q = q; |
158 | rq->mq_ctx = ctx; | 168 | rq->mq_ctx = ctx; |
159 | rq->cmd_flags |= rw_flags; | 169 | rq->cmd_flags |= rw_flags; |
160 | /* do not touch atomic flags, it needs atomic ops against the timer */ | 170 | /* do not touch atomic flags, it needs atomic ops against the timer */ |
161 | rq->cpu = -1; | 171 | rq->cpu = -1; |
162 | INIT_HLIST_NODE(&rq->hash); | 172 | INIT_HLIST_NODE(&rq->hash); |
163 | RB_CLEAR_NODE(&rq->rb_node); | 173 | RB_CLEAR_NODE(&rq->rb_node); |
164 | rq->rq_disk = NULL; | 174 | rq->rq_disk = NULL; |
165 | rq->part = NULL; | 175 | rq->part = NULL; |
166 | rq->start_time = jiffies; | 176 | rq->start_time = jiffies; |
167 | #ifdef CONFIG_BLK_CGROUP | 177 | #ifdef CONFIG_BLK_CGROUP |
168 | rq->rl = NULL; | 178 | rq->rl = NULL; |
169 | set_start_time_ns(rq); | 179 | set_start_time_ns(rq); |
170 | rq->io_start_time_ns = 0; | 180 | rq->io_start_time_ns = 0; |
171 | #endif | 181 | #endif |
172 | rq->nr_phys_segments = 0; | 182 | rq->nr_phys_segments = 0; |
173 | #if defined(CONFIG_BLK_DEV_INTEGRITY) | 183 | #if defined(CONFIG_BLK_DEV_INTEGRITY) |
174 | rq->nr_integrity_segments = 0; | 184 | rq->nr_integrity_segments = 0; |
175 | #endif | 185 | #endif |
176 | rq->special = NULL; | 186 | rq->special = NULL; |
177 | /* tag was already set */ | 187 | /* tag was already set */ |
178 | rq->errors = 0; | 188 | rq->errors = 0; |
179 | 189 | ||
180 | rq->cmd = rq->__cmd; | 190 | rq->cmd = rq->__cmd; |
181 | 191 | ||
182 | rq->extra_len = 0; | 192 | rq->extra_len = 0; |
183 | rq->sense_len = 0; | 193 | rq->sense_len = 0; |
184 | rq->resid_len = 0; | 194 | rq->resid_len = 0; |
185 | rq->sense = NULL; | 195 | rq->sense = NULL; |
186 | 196 | ||
187 | INIT_LIST_HEAD(&rq->timeout_list); | 197 | INIT_LIST_HEAD(&rq->timeout_list); |
188 | rq->timeout = 0; | 198 | rq->timeout = 0; |
189 | 199 | ||
190 | rq->end_io = NULL; | 200 | rq->end_io = NULL; |
191 | rq->end_io_data = NULL; | 201 | rq->end_io_data = NULL; |
192 | rq->next_rq = NULL; | 202 | rq->next_rq = NULL; |
193 | 203 | ||
194 | ctx->rq_dispatched[rw_is_sync(rw_flags)]++; | 204 | ctx->rq_dispatched[rw_is_sync(rw_flags)]++; |
195 | } | 205 | } |
196 | 206 | ||
197 | static struct request * | 207 | static struct request * |
198 | __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw) | 208 | __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw) |
199 | { | 209 | { |
200 | struct request *rq; | 210 | struct request *rq; |
201 | unsigned int tag; | 211 | unsigned int tag; |
202 | 212 | ||
203 | tag = blk_mq_get_tag(data); | 213 | tag = blk_mq_get_tag(data); |
204 | if (tag != BLK_MQ_TAG_FAIL) { | 214 | if (tag != BLK_MQ_TAG_FAIL) { |
205 | rq = data->hctx->tags->rqs[tag]; | 215 | rq = data->hctx->tags->rqs[tag]; |
206 | 216 | ||
207 | if (blk_mq_tag_busy(data->hctx)) { | 217 | if (blk_mq_tag_busy(data->hctx)) { |
208 | rq->cmd_flags = REQ_MQ_INFLIGHT; | 218 | rq->cmd_flags = REQ_MQ_INFLIGHT; |
209 | atomic_inc(&data->hctx->nr_active); | 219 | atomic_inc(&data->hctx->nr_active); |
210 | } | 220 | } |
211 | 221 | ||
212 | rq->tag = tag; | 222 | rq->tag = tag; |
213 | blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw); | 223 | blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw); |
214 | return rq; | 224 | return rq; |
215 | } | 225 | } |
216 | 226 | ||
217 | return NULL; | 227 | return NULL; |
218 | } | 228 | } |
219 | 229 | ||
220 | struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, | 230 | struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, |
221 | bool reserved) | 231 | bool reserved) |
222 | { | 232 | { |
223 | struct blk_mq_ctx *ctx; | 233 | struct blk_mq_ctx *ctx; |
224 | struct blk_mq_hw_ctx *hctx; | 234 | struct blk_mq_hw_ctx *hctx; |
225 | struct request *rq; | 235 | struct request *rq; |
226 | struct blk_mq_alloc_data alloc_data; | 236 | struct blk_mq_alloc_data alloc_data; |
227 | int ret; | 237 | int ret; |
228 | 238 | ||
229 | ret = blk_mq_queue_enter(q); | 239 | ret = blk_mq_queue_enter(q); |
230 | if (ret) | 240 | if (ret) |
231 | return ERR_PTR(ret); | 241 | return ERR_PTR(ret); |
232 | 242 | ||
233 | ctx = blk_mq_get_ctx(q); | 243 | ctx = blk_mq_get_ctx(q); |
234 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | 244 | hctx = q->mq_ops->map_queue(q, ctx->cpu); |
235 | blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT, | 245 | blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT, |
236 | reserved, ctx, hctx); | 246 | reserved, ctx, hctx); |
237 | 247 | ||
238 | rq = __blk_mq_alloc_request(&alloc_data, rw); | 248 | rq = __blk_mq_alloc_request(&alloc_data, rw); |
239 | if (!rq && (gfp & __GFP_WAIT)) { | 249 | if (!rq && (gfp & __GFP_WAIT)) { |
240 | __blk_mq_run_hw_queue(hctx); | 250 | __blk_mq_run_hw_queue(hctx); |
241 | blk_mq_put_ctx(ctx); | 251 | blk_mq_put_ctx(ctx); |
242 | 252 | ||
243 | ctx = blk_mq_get_ctx(q); | 253 | ctx = blk_mq_get_ctx(q); |
244 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | 254 | hctx = q->mq_ops->map_queue(q, ctx->cpu); |
245 | blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx, | 255 | blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx, |
246 | hctx); | 256 | hctx); |
247 | rq = __blk_mq_alloc_request(&alloc_data, rw); | 257 | rq = __blk_mq_alloc_request(&alloc_data, rw); |
248 | ctx = alloc_data.ctx; | 258 | ctx = alloc_data.ctx; |
249 | } | 259 | } |
250 | blk_mq_put_ctx(ctx); | 260 | blk_mq_put_ctx(ctx); |
251 | if (!rq) | 261 | if (!rq) |
252 | return ERR_PTR(-EWOULDBLOCK); | 262 | return ERR_PTR(-EWOULDBLOCK); |
253 | return rq; | 263 | return rq; |
254 | } | 264 | } |
255 | EXPORT_SYMBOL(blk_mq_alloc_request); | 265 | EXPORT_SYMBOL(blk_mq_alloc_request); |
256 | 266 | ||
257 | static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, | 267 | static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, |
258 | struct blk_mq_ctx *ctx, struct request *rq) | 268 | struct blk_mq_ctx *ctx, struct request *rq) |
259 | { | 269 | { |
260 | const int tag = rq->tag; | 270 | const int tag = rq->tag; |
261 | struct request_queue *q = rq->q; | 271 | struct request_queue *q = rq->q; |
262 | 272 | ||
263 | if (rq->cmd_flags & REQ_MQ_INFLIGHT) | 273 | if (rq->cmd_flags & REQ_MQ_INFLIGHT) |
264 | atomic_dec(&hctx->nr_active); | 274 | atomic_dec(&hctx->nr_active); |
265 | rq->cmd_flags = 0; | 275 | rq->cmd_flags = 0; |
266 | 276 | ||
267 | clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); | 277 | clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); |
268 | blk_mq_put_tag(hctx, tag, &ctx->last_tag); | 278 | blk_mq_put_tag(hctx, tag, &ctx->last_tag); |
269 | blk_mq_queue_exit(q); | 279 | blk_mq_queue_exit(q); |
270 | } | 280 | } |
271 | 281 | ||
272 | void blk_mq_free_request(struct request *rq) | 282 | void blk_mq_free_request(struct request *rq) |
273 | { | 283 | { |
274 | struct blk_mq_ctx *ctx = rq->mq_ctx; | 284 | struct blk_mq_ctx *ctx = rq->mq_ctx; |
275 | struct blk_mq_hw_ctx *hctx; | 285 | struct blk_mq_hw_ctx *hctx; |
276 | struct request_queue *q = rq->q; | 286 | struct request_queue *q = rq->q; |
277 | 287 | ||
278 | ctx->rq_completed[rq_is_sync(rq)]++; | 288 | ctx->rq_completed[rq_is_sync(rq)]++; |
279 | 289 | ||
280 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | 290 | hctx = q->mq_ops->map_queue(q, ctx->cpu); |
281 | __blk_mq_free_request(hctx, ctx, rq); | 291 | __blk_mq_free_request(hctx, ctx, rq); |
282 | } | 292 | } |
283 | 293 | ||
284 | inline void __blk_mq_end_request(struct request *rq, int error) | 294 | inline void __blk_mq_end_request(struct request *rq, int error) |
285 | { | 295 | { |
286 | blk_account_io_done(rq); | 296 | blk_account_io_done(rq); |
287 | 297 | ||
288 | if (rq->end_io) { | 298 | if (rq->end_io) { |
289 | rq->end_io(rq, error); | 299 | rq->end_io(rq, error); |
290 | } else { | 300 | } else { |
291 | if (unlikely(blk_bidi_rq(rq))) | 301 | if (unlikely(blk_bidi_rq(rq))) |
292 | blk_mq_free_request(rq->next_rq); | 302 | blk_mq_free_request(rq->next_rq); |
293 | blk_mq_free_request(rq); | 303 | blk_mq_free_request(rq); |
294 | } | 304 | } |
295 | } | 305 | } |
296 | EXPORT_SYMBOL(__blk_mq_end_request); | 306 | EXPORT_SYMBOL(__blk_mq_end_request); |
297 | 307 | ||
298 | void blk_mq_end_request(struct request *rq, int error) | 308 | void blk_mq_end_request(struct request *rq, int error) |
299 | { | 309 | { |
300 | if (blk_update_request(rq, error, blk_rq_bytes(rq))) | 310 | if (blk_update_request(rq, error, blk_rq_bytes(rq))) |
301 | BUG(); | 311 | BUG(); |
302 | __blk_mq_end_request(rq, error); | 312 | __blk_mq_end_request(rq, error); |
303 | } | 313 | } |
304 | EXPORT_SYMBOL(blk_mq_end_request); | 314 | EXPORT_SYMBOL(blk_mq_end_request); |
305 | 315 | ||
306 | static void __blk_mq_complete_request_remote(void *data) | 316 | static void __blk_mq_complete_request_remote(void *data) |
307 | { | 317 | { |
308 | struct request *rq = data; | 318 | struct request *rq = data; |
309 | 319 | ||
310 | rq->q->softirq_done_fn(rq); | 320 | rq->q->softirq_done_fn(rq); |
311 | } | 321 | } |
312 | 322 | ||
313 | static void blk_mq_ipi_complete_request(struct request *rq) | 323 | static void blk_mq_ipi_complete_request(struct request *rq) |
314 | { | 324 | { |
315 | struct blk_mq_ctx *ctx = rq->mq_ctx; | 325 | struct blk_mq_ctx *ctx = rq->mq_ctx; |
316 | bool shared = false; | 326 | bool shared = false; |
317 | int cpu; | 327 | int cpu; |
318 | 328 | ||
319 | if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { | 329 | if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { |
320 | rq->q->softirq_done_fn(rq); | 330 | rq->q->softirq_done_fn(rq); |
321 | return; | 331 | return; |
322 | } | 332 | } |
323 | 333 | ||
324 | cpu = get_cpu(); | 334 | cpu = get_cpu(); |
325 | if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) | 335 | if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) |
326 | shared = cpus_share_cache(cpu, ctx->cpu); | 336 | shared = cpus_share_cache(cpu, ctx->cpu); |
327 | 337 | ||
328 | if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { | 338 | if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { |
329 | rq->csd.func = __blk_mq_complete_request_remote; | 339 | rq->csd.func = __blk_mq_complete_request_remote; |
330 | rq->csd.info = rq; | 340 | rq->csd.info = rq; |
331 | rq->csd.flags = 0; | 341 | rq->csd.flags = 0; |
332 | smp_call_function_single_async(ctx->cpu, &rq->csd); | 342 | smp_call_function_single_async(ctx->cpu, &rq->csd); |
333 | } else { | 343 | } else { |
334 | rq->q->softirq_done_fn(rq); | 344 | rq->q->softirq_done_fn(rq); |
335 | } | 345 | } |
336 | put_cpu(); | 346 | put_cpu(); |
337 | } | 347 | } |
338 | 348 | ||
339 | void __blk_mq_complete_request(struct request *rq) | 349 | void __blk_mq_complete_request(struct request *rq) |
340 | { | 350 | { |
341 | struct request_queue *q = rq->q; | 351 | struct request_queue *q = rq->q; |
342 | 352 | ||
343 | if (!q->softirq_done_fn) | 353 | if (!q->softirq_done_fn) |
344 | blk_mq_end_request(rq, rq->errors); | 354 | blk_mq_end_request(rq, rq->errors); |
345 | else | 355 | else |
346 | blk_mq_ipi_complete_request(rq); | 356 | blk_mq_ipi_complete_request(rq); |
347 | } | 357 | } |
348 | 358 | ||
349 | /** | 359 | /** |
350 | * blk_mq_complete_request - end I/O on a request | 360 | * blk_mq_complete_request - end I/O on a request |
351 | * @rq: the request being processed | 361 | * @rq: the request being processed |
352 | * | 362 | * |
353 | * Description: | 363 | * Description: |
354 | * Ends all I/O on a request. It does not handle partial completions. | 364 | * Ends all I/O on a request. It does not handle partial completions. |
355 | * The actual completion happens out-of-order, through an IPI handler. | 365 | * The actual completion happens out-of-order, through an IPI handler. |
356 | **/ | 366 | **/ |
357 | void blk_mq_complete_request(struct request *rq) | 367 | void blk_mq_complete_request(struct request *rq) |
358 | { | 368 | { |
359 | struct request_queue *q = rq->q; | 369 | struct request_queue *q = rq->q; |
360 | 370 | ||
361 | if (unlikely(blk_should_fake_timeout(q))) | 371 | if (unlikely(blk_should_fake_timeout(q))) |
362 | return; | 372 | return; |
363 | if (!blk_mark_rq_complete(rq)) | 373 | if (!blk_mark_rq_complete(rq)) |
364 | __blk_mq_complete_request(rq); | 374 | __blk_mq_complete_request(rq); |
365 | } | 375 | } |
366 | EXPORT_SYMBOL(blk_mq_complete_request); | 376 | EXPORT_SYMBOL(blk_mq_complete_request); |
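Driver side of this pair, as a hedged sketch (not part of this commit; my_irq_handler and the single-request lookup are hypothetical): the hard-IRQ path only calls blk_mq_complete_request(), and the ->complete callback that the core invokes later finishes the request with blk_mq_end_request():

#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/interrupt.h>

/* Wired up as .complete in struct blk_mq_ops; runs from IPI/softirq context. */
static void my_softirq_done(struct request *rq)
{
        blk_mq_end_request(rq, rq->errors);
}

/* Hard-IRQ handler: just mark the request complete and defer the rest. */
static irqreturn_t my_irq_handler(int irq, void *data)
{
        struct request *rq = data;      /* hypothetical: one request in flight */

        blk_mq_complete_request(rq);
        return IRQ_HANDLED;
}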
367 | 377 | ||
368 | void blk_mq_start_request(struct request *rq) | 378 | void blk_mq_start_request(struct request *rq) |
369 | { | 379 | { |
370 | struct request_queue *q = rq->q; | 380 | struct request_queue *q = rq->q; |
371 | 381 | ||
372 | trace_block_rq_issue(q, rq); | 382 | trace_block_rq_issue(q, rq); |
373 | 383 | ||
374 | rq->resid_len = blk_rq_bytes(rq); | 384 | rq->resid_len = blk_rq_bytes(rq); |
375 | if (unlikely(blk_bidi_rq(rq))) | 385 | if (unlikely(blk_bidi_rq(rq))) |
376 | rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); | 386 | rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); |
377 | 387 | ||
378 | blk_add_timer(rq); | 388 | blk_add_timer(rq); |
379 | 389 | ||
380 | /* | 390 | /* |
381 | * Ensure that ->deadline is visible before we set the started | 391 | * Ensure that ->deadline is visible before we set the started |
382 | * flag and clear the completed flag. | 392 | * flag and clear the completed flag. |
383 | */ | 393 | */ |
384 | smp_mb__before_atomic(); | 394 | smp_mb__before_atomic(); |
385 | 395 | ||
386 | /* | 396 | /* |
387 | * Mark us as started and clear complete. Complete might have been | 397 | * Mark us as started and clear complete. Complete might have been |
388 | * set if requeue raced with timeout, which then marked it as | 398 | * set if requeue raced with timeout, which then marked it as |
389 | * complete. So be sure to clear complete again when we start | 399 | * complete. So be sure to clear complete again when we start |
390 | * the request, otherwise we'll ignore the completion event. | 400 | * the request, otherwise we'll ignore the completion event. |
391 | */ | 401 | */ |
392 | if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) | 402 | if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) |
393 | set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); | 403 | set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); |
394 | if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) | 404 | if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) |
395 | clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); | 405 | clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); |
396 | 406 | ||
397 | if (q->dma_drain_size && blk_rq_bytes(rq)) { | 407 | if (q->dma_drain_size && blk_rq_bytes(rq)) { |
398 | /* | 408 | /* |
399 | * Make sure space for the drain appears. We know we can do | 409 | * Make sure space for the drain appears. We know we can do |
400 | * this because max_hw_segments has been adjusted to be one | 410 | * this because max_hw_segments has been adjusted to be one |
401 | * fewer than the device can handle. | 411 | * fewer than the device can handle. |
402 | */ | 412 | */ |
403 | rq->nr_phys_segments++; | 413 | rq->nr_phys_segments++; |
404 | } | 414 | } |
405 | } | 415 | } |
406 | EXPORT_SYMBOL(blk_mq_start_request); | 416 | EXPORT_SYMBOL(blk_mq_start_request); |
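For reference, a minimal ->queue_rq sketch that pairs with blk_mq_start_request() above. my_hw_submit() and the use of hctx->driver_data are assumptions; only the blk-mq calls and return codes come from this file:

#include <linux/blk-mq.h>

bool my_hw_submit(void *hw, struct request *rq, bool last);    /* hypothetical */

static int my_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, bool last)
{
        blk_mq_start_request(rq);       /* sets STARTED, arms the timeout */

        if (!my_hw_submit(hctx->driver_data, rq, last))
                return BLK_MQ_RQ_QUEUE_BUSY;    /* dispatch loop re-queues it */

        return BLK_MQ_RQ_QUEUE_OK;
}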
407 | 417 | ||
408 | static void __blk_mq_requeue_request(struct request *rq) | 418 | static void __blk_mq_requeue_request(struct request *rq) |
409 | { | 419 | { |
410 | struct request_queue *q = rq->q; | 420 | struct request_queue *q = rq->q; |
411 | 421 | ||
412 | trace_block_rq_requeue(q, rq); | 422 | trace_block_rq_requeue(q, rq); |
413 | 423 | ||
414 | if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { | 424 | if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { |
415 | if (q->dma_drain_size && blk_rq_bytes(rq)) | 425 | if (q->dma_drain_size && blk_rq_bytes(rq)) |
416 | rq->nr_phys_segments--; | 426 | rq->nr_phys_segments--; |
417 | } | 427 | } |
418 | } | 428 | } |
419 | 429 | ||
420 | void blk_mq_requeue_request(struct request *rq) | 430 | void blk_mq_requeue_request(struct request *rq) |
421 | { | 431 | { |
422 | __blk_mq_requeue_request(rq); | 432 | __blk_mq_requeue_request(rq); |
423 | 433 | ||
424 | BUG_ON(blk_queued_rq(rq)); | 434 | BUG_ON(blk_queued_rq(rq)); |
425 | blk_mq_add_to_requeue_list(rq, true); | 435 | blk_mq_add_to_requeue_list(rq, true); |
426 | } | 436 | } |
427 | EXPORT_SYMBOL(blk_mq_requeue_request); | 437 | EXPORT_SYMBOL(blk_mq_requeue_request); |
428 | 438 | ||
429 | static void blk_mq_requeue_work(struct work_struct *work) | 439 | static void blk_mq_requeue_work(struct work_struct *work) |
430 | { | 440 | { |
431 | struct request_queue *q = | 441 | struct request_queue *q = |
432 | container_of(work, struct request_queue, requeue_work); | 442 | container_of(work, struct request_queue, requeue_work); |
433 | LIST_HEAD(rq_list); | 443 | LIST_HEAD(rq_list); |
434 | struct request *rq, *next; | 444 | struct request *rq, *next; |
435 | unsigned long flags; | 445 | unsigned long flags; |
436 | 446 | ||
437 | spin_lock_irqsave(&q->requeue_lock, flags); | 447 | spin_lock_irqsave(&q->requeue_lock, flags); |
438 | list_splice_init(&q->requeue_list, &rq_list); | 448 | list_splice_init(&q->requeue_list, &rq_list); |
439 | spin_unlock_irqrestore(&q->requeue_lock, flags); | 449 | spin_unlock_irqrestore(&q->requeue_lock, flags); |
440 | 450 | ||
441 | list_for_each_entry_safe(rq, next, &rq_list, queuelist) { | 451 | list_for_each_entry_safe(rq, next, &rq_list, queuelist) { |
442 | if (!(rq->cmd_flags & REQ_SOFTBARRIER)) | 452 | if (!(rq->cmd_flags & REQ_SOFTBARRIER)) |
443 | continue; | 453 | continue; |
444 | 454 | ||
445 | rq->cmd_flags &= ~REQ_SOFTBARRIER; | 455 | rq->cmd_flags &= ~REQ_SOFTBARRIER; |
446 | list_del_init(&rq->queuelist); | 456 | list_del_init(&rq->queuelist); |
447 | blk_mq_insert_request(rq, true, false, false); | 457 | blk_mq_insert_request(rq, true, false, false); |
448 | } | 458 | } |
449 | 459 | ||
450 | while (!list_empty(&rq_list)) { | 460 | while (!list_empty(&rq_list)) { |
451 | rq = list_entry(rq_list.next, struct request, queuelist); | 461 | rq = list_entry(rq_list.next, struct request, queuelist); |
452 | list_del_init(&rq->queuelist); | 462 | list_del_init(&rq->queuelist); |
453 | blk_mq_insert_request(rq, false, false, false); | 463 | blk_mq_insert_request(rq, false, false, false); |
454 | } | 464 | } |
455 | 465 | ||
456 | /* | 466 | /* |
457 | * Use the start variant of queue running here, so that running | 467 | * Use the start variant of queue running here, so that running |
458 | * the requeue work will kick stopped queues. | 468 | * the requeue work will kick stopped queues. |
459 | */ | 469 | */ |
460 | blk_mq_start_hw_queues(q); | 470 | blk_mq_start_hw_queues(q); |
461 | } | 471 | } |
462 | 472 | ||
463 | void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) | 473 | void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) |
464 | { | 474 | { |
465 | struct request_queue *q = rq->q; | 475 | struct request_queue *q = rq->q; |
466 | unsigned long flags; | 476 | unsigned long flags; |
467 | 477 | ||
468 | /* | 478 | /* |
469 | * We abuse this flag that is otherwise used by the I/O scheduler to | 479 | * We abuse this flag that is otherwise used by the I/O scheduler to |
470 | * request head insertion from the workqueue. | 480 | * request head insertion from the workqueue. |
471 | */ | 481 | */ |
472 | BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); | 482 | BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); |
473 | 483 | ||
474 | spin_lock_irqsave(&q->requeue_lock, flags); | 484 | spin_lock_irqsave(&q->requeue_lock, flags); |
475 | if (at_head) { | 485 | if (at_head) { |
476 | rq->cmd_flags |= REQ_SOFTBARRIER; | 486 | rq->cmd_flags |= REQ_SOFTBARRIER; |
477 | list_add(&rq->queuelist, &q->requeue_list); | 487 | list_add(&rq->queuelist, &q->requeue_list); |
478 | } else { | 488 | } else { |
479 | list_add_tail(&rq->queuelist, &q->requeue_list); | 489 | list_add_tail(&rq->queuelist, &q->requeue_list); |
480 | } | 490 | } |
481 | spin_unlock_irqrestore(&q->requeue_lock, flags); | 491 | spin_unlock_irqrestore(&q->requeue_lock, flags); |
482 | } | 492 | } |
483 | EXPORT_SYMBOL(blk_mq_add_to_requeue_list); | 493 | EXPORT_SYMBOL(blk_mq_add_to_requeue_list); |
484 | 494 | ||
485 | void blk_mq_kick_requeue_list(struct request_queue *q) | 495 | void blk_mq_kick_requeue_list(struct request_queue *q) |
486 | { | 496 | { |
487 | kblockd_schedule_work(&q->requeue_work); | 497 | kblockd_schedule_work(&q->requeue_work); |
488 | } | 498 | } |
489 | EXPORT_SYMBOL(blk_mq_kick_requeue_list); | 499 | EXPORT_SYMBOL(blk_mq_kick_requeue_list); |
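Typical driver usage of the two exports above (a hedged sketch, not from this commit): requeue the request, then kick the list so blk_mq_requeue_work() actually runs:

static void my_retry_later(struct request *rq)
{
        blk_mq_requeue_request(rq);        /* un-start and park on q->requeue_list */
        blk_mq_kick_requeue_list(rq->q);   /* schedule blk_mq_requeue_work() */
}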
490 | 500 | ||
491 | static inline bool is_flush_request(struct request *rq, | 501 | static inline bool is_flush_request(struct request *rq, |
492 | struct blk_flush_queue *fq, unsigned int tag) | 502 | struct blk_flush_queue *fq, unsigned int tag) |
493 | { | 503 | { |
494 | return ((rq->cmd_flags & REQ_FLUSH_SEQ) && | 504 | return ((rq->cmd_flags & REQ_FLUSH_SEQ) && |
495 | fq->flush_rq->tag == tag); | 505 | fq->flush_rq->tag == tag); |
496 | } | 506 | } |
497 | 507 | ||
498 | struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) | 508 | struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) |
499 | { | 509 | { |
500 | struct request *rq = tags->rqs[tag]; | 510 | struct request *rq = tags->rqs[tag]; |
501 | /* mq_ctx of flush rq is always cloned from the corresponding req */ | 511 | /* mq_ctx of flush rq is always cloned from the corresponding req */ |
502 | struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx); | 512 | struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx); |
503 | 513 | ||
504 | if (!is_flush_request(rq, fq, tag)) | 514 | if (!is_flush_request(rq, fq, tag)) |
505 | return rq; | 515 | return rq; |
506 | 516 | ||
507 | return fq->flush_rq; | 517 | return fq->flush_rq; |
508 | } | 518 | } |
509 | EXPORT_SYMBOL(blk_mq_tag_to_rq); | 519 | EXPORT_SYMBOL(blk_mq_tag_to_rq); |
510 | 520 | ||
511 | struct blk_mq_timeout_data { | 521 | struct blk_mq_timeout_data { |
512 | unsigned long next; | 522 | unsigned long next; |
513 | unsigned int next_set; | 523 | unsigned int next_set; |
514 | }; | 524 | }; |
515 | 525 | ||
516 | void blk_mq_rq_timed_out(struct request *req, bool reserved) | 526 | void blk_mq_rq_timed_out(struct request *req, bool reserved) |
517 | { | 527 | { |
518 | struct blk_mq_ops *ops = req->q->mq_ops; | 528 | struct blk_mq_ops *ops = req->q->mq_ops; |
519 | enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; | 529 | enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; |
520 | 530 | ||
521 | /* | 531 | /* |
522 | * We know that complete is set at this point. If STARTED isn't set | 532 | * We know that complete is set at this point. If STARTED isn't set |
523 | * anymore, then the request isn't active and the "timeout" should | 533 | * anymore, then the request isn't active and the "timeout" should |
524 | * just be ignored. This can happen due to the bitflag ordering. | 534 | * just be ignored. This can happen due to the bitflag ordering. |
525 | * Timeout first checks if STARTED is set, and if it is, assumes | 535 | * Timeout first checks if STARTED is set, and if it is, assumes |
526 | * the request is active. But if we race with completion, then | 536 | * the request is active. But if we race with completion, then |
527 | * both flags will get cleared. So check here again, and ignore | 537 | * both flags will get cleared. So check here again, and ignore |
528 | * a timeout event with a request that isn't active. | 538 | * a timeout event with a request that isn't active. |
529 | */ | 539 | */ |
530 | if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) | 540 | if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) |
531 | return; | 541 | return; |
532 | 542 | ||
533 | if (ops->timeout) | 543 | if (ops->timeout) |
534 | ret = ops->timeout(req, reserved); | 544 | ret = ops->timeout(req, reserved); |
535 | 545 | ||
536 | switch (ret) { | 546 | switch (ret) { |
537 | case BLK_EH_HANDLED: | 547 | case BLK_EH_HANDLED: |
538 | __blk_mq_complete_request(req); | 548 | __blk_mq_complete_request(req); |
539 | break; | 549 | break; |
540 | case BLK_EH_RESET_TIMER: | 550 | case BLK_EH_RESET_TIMER: |
541 | blk_add_timer(req); | 551 | blk_add_timer(req); |
542 | blk_clear_rq_complete(req); | 552 | blk_clear_rq_complete(req); |
543 | break; | 553 | break; |
544 | case BLK_EH_NOT_HANDLED: | 554 | case BLK_EH_NOT_HANDLED: |
545 | break; | 555 | break; |
546 | default: | 556 | default: |
547 | printk(KERN_ERR "block: bad eh return: %d\n", ret); | 557 | printk(KERN_ERR "block: bad eh return: %d\n", ret); |
548 | break; | 558 | break; |
549 | } | 559 | } |
550 | } | 560 | } |
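A sketch of the driver ->timeout hook this switch consumes; my_hw_abort() is hypothetical, while the return values and their effects match the cases above:

#include <linux/blk-mq.h>

bool my_hw_abort(struct request *rq);           /* hypothetical abort helper */

static enum blk_eh_timer_return my_timeout(struct request *rq, bool reserved)
{
        if (my_hw_abort(rq)) {
                rq->errors = -ETIMEDOUT;
                return BLK_EH_HANDLED;          /* core completes the request */
        }
        return BLK_EH_RESET_TIMER;              /* re-arm and keep waiting */
}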
551 | 561 | ||
552 | static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, | 562 | static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, |
553 | struct request *rq, void *priv, bool reserved) | 563 | struct request *rq, void *priv, bool reserved) |
554 | { | 564 | { |
555 | struct blk_mq_timeout_data *data = priv; | 565 | struct blk_mq_timeout_data *data = priv; |
556 | 566 | ||
557 | if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) | 567 | if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) |
558 | return; | 568 | return; |
559 | 569 | ||
560 | if (time_after_eq(jiffies, rq->deadline)) { | 570 | if (time_after_eq(jiffies, rq->deadline)) { |
561 | if (!blk_mark_rq_complete(rq)) | 571 | if (!blk_mark_rq_complete(rq)) |
562 | blk_mq_rq_timed_out(rq, reserved); | 572 | blk_mq_rq_timed_out(rq, reserved); |
563 | } else if (!data->next_set || time_after(data->next, rq->deadline)) { | 573 | } else if (!data->next_set || time_after(data->next, rq->deadline)) { |
564 | data->next = rq->deadline; | 574 | data->next = rq->deadline; |
565 | data->next_set = 1; | 575 | data->next_set = 1; |
566 | } | 576 | } |
567 | } | 577 | } |
568 | 578 | ||
569 | static void blk_mq_rq_timer(unsigned long priv) | 579 | static void blk_mq_rq_timer(unsigned long priv) |
570 | { | 580 | { |
571 | struct request_queue *q = (struct request_queue *)priv; | 581 | struct request_queue *q = (struct request_queue *)priv; |
572 | struct blk_mq_timeout_data data = { | 582 | struct blk_mq_timeout_data data = { |
573 | .next = 0, | 583 | .next = 0, |
574 | .next_set = 0, | 584 | .next_set = 0, |
575 | }; | 585 | }; |
576 | struct blk_mq_hw_ctx *hctx; | 586 | struct blk_mq_hw_ctx *hctx; |
577 | int i; | 587 | int i; |
578 | 588 | ||
579 | queue_for_each_hw_ctx(q, hctx, i) { | 589 | queue_for_each_hw_ctx(q, hctx, i) { |
580 | /* | 590 | /* |
581 | * If no software queues are currently mapped to this | 591 | * If no software queues are currently mapped to this |
582 | * hardware queue, there's nothing to check | 592 | * hardware queue, there's nothing to check |
583 | */ | 593 | */ |
584 | if (!hctx->nr_ctx || !hctx->tags) | 594 | if (!hctx->nr_ctx || !hctx->tags) |
585 | continue; | 595 | continue; |
586 | 596 | ||
587 | blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); | 597 | blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); |
588 | } | 598 | } |
589 | 599 | ||
590 | if (data.next_set) { | 600 | if (data.next_set) { |
591 | data.next = blk_rq_timeout(round_jiffies_up(data.next)); | 601 | data.next = blk_rq_timeout(round_jiffies_up(data.next)); |
592 | mod_timer(&q->timeout, data.next); | 602 | mod_timer(&q->timeout, data.next); |
593 | } else { | 603 | } else { |
594 | queue_for_each_hw_ctx(q, hctx, i) | 604 | queue_for_each_hw_ctx(q, hctx, i) |
595 | blk_mq_tag_idle(hctx); | 605 | blk_mq_tag_idle(hctx); |
596 | } | 606 | } |
597 | } | 607 | } |
598 | 608 | ||
599 | /* | 609 | /* |
600 | * Reverse check our software queue for entries that we could potentially | 610 | * Reverse check our software queue for entries that we could potentially |
601 | * merge with. Currently includes a hand-wavy stop count of 8, to not spend | 611 | * merge with. Currently includes a hand-wavy stop count of 8, to not spend |
602 | * too much time checking for merges. | 612 | * too much time checking for merges. |
603 | */ | 613 | */ |
604 | static bool blk_mq_attempt_merge(struct request_queue *q, | 614 | static bool blk_mq_attempt_merge(struct request_queue *q, |
605 | struct blk_mq_ctx *ctx, struct bio *bio) | 615 | struct blk_mq_ctx *ctx, struct bio *bio) |
606 | { | 616 | { |
607 | struct request *rq; | 617 | struct request *rq; |
608 | int checked = 8; | 618 | int checked = 8; |
609 | 619 | ||
610 | list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { | 620 | list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { |
611 | int el_ret; | 621 | int el_ret; |
612 | 622 | ||
613 | if (!checked--) | 623 | if (!checked--) |
614 | break; | 624 | break; |
615 | 625 | ||
616 | if (!blk_rq_merge_ok(rq, bio)) | 626 | if (!blk_rq_merge_ok(rq, bio)) |
617 | continue; | 627 | continue; |
618 | 628 | ||
619 | el_ret = blk_try_merge(rq, bio); | 629 | el_ret = blk_try_merge(rq, bio); |
620 | if (el_ret == ELEVATOR_BACK_MERGE) { | 630 | if (el_ret == ELEVATOR_BACK_MERGE) { |
621 | if (bio_attempt_back_merge(q, rq, bio)) { | 631 | if (bio_attempt_back_merge(q, rq, bio)) { |
622 | ctx->rq_merged++; | 632 | ctx->rq_merged++; |
623 | return true; | 633 | return true; |
624 | } | 634 | } |
625 | break; | 635 | break; |
626 | } else if (el_ret == ELEVATOR_FRONT_MERGE) { | 636 | } else if (el_ret == ELEVATOR_FRONT_MERGE) { |
627 | if (bio_attempt_front_merge(q, rq, bio)) { | 637 | if (bio_attempt_front_merge(q, rq, bio)) { |
628 | ctx->rq_merged++; | 638 | ctx->rq_merged++; |
629 | return true; | 639 | return true; |
630 | } | 640 | } |
631 | break; | 641 | break; |
632 | } | 642 | } |
633 | } | 643 | } |
634 | 644 | ||
635 | return false; | 645 | return false; |
636 | } | 646 | } |
637 | 647 | ||
638 | /* | 648 | /* |
639 | * Process software queues that have been marked busy, splicing them | 649 | * Process software queues that have been marked busy, splicing them |
640 | * to the for-dispatch list. | 650 | * to the for-dispatch list. |
641 | */ | 651 | */ |
642 | static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) | 652 | static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) |
643 | { | 653 | { |
644 | struct blk_mq_ctx *ctx; | 654 | struct blk_mq_ctx *ctx; |
645 | int i; | 655 | int i; |
646 | 656 | ||
647 | for (i = 0; i < hctx->ctx_map.map_size; i++) { | 657 | for (i = 0; i < hctx->ctx_map.map_size; i++) { |
648 | struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; | 658 | struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; |
649 | unsigned int off, bit; | 659 | unsigned int off, bit; |
650 | 660 | ||
651 | if (!bm->word) | 661 | if (!bm->word) |
652 | continue; | 662 | continue; |
653 | 663 | ||
654 | bit = 0; | 664 | bit = 0; |
655 | off = i * hctx->ctx_map.bits_per_word; | 665 | off = i * hctx->ctx_map.bits_per_word; |
656 | do { | 666 | do { |
657 | bit = find_next_bit(&bm->word, bm->depth, bit); | 667 | bit = find_next_bit(&bm->word, bm->depth, bit); |
658 | if (bit >= bm->depth) | 668 | if (bit >= bm->depth) |
659 | break; | 669 | break; |
660 | 670 | ||
661 | ctx = hctx->ctxs[bit + off]; | 671 | ctx = hctx->ctxs[bit + off]; |
662 | clear_bit(bit, &bm->word); | 672 | clear_bit(bit, &bm->word); |
663 | spin_lock(&ctx->lock); | 673 | spin_lock(&ctx->lock); |
664 | list_splice_tail_init(&ctx->rq_list, list); | 674 | list_splice_tail_init(&ctx->rq_list, list); |
665 | spin_unlock(&ctx->lock); | 675 | spin_unlock(&ctx->lock); |
666 | 676 | ||
667 | bit++; | 677 | bit++; |
668 | } while (1); | 678 | } while (1); |
669 | } | 679 | } |
670 | } | 680 | } |
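The loop above clears one pending bit per software queue and splices that ctx's requests. A standalone userspace analogue of the word walk, purely illustrative (toy_find_next_bit stands in for find_next_bit):

#include <stdio.h>

/* Toy stand-in for find_next_bit(): first set bit >= start, or size if none. */
static unsigned int toy_find_next_bit(unsigned long word, unsigned int size,
                                      unsigned int start)
{
        unsigned int bit;

        for (bit = start; bit < size; bit++)
                if (word & (1UL << bit))
                        return bit;
        return size;
}

int main(void)
{
        unsigned long word = 0x92;              /* ctxs 1, 4 and 7 have work */
        unsigned int depth = 8, bit = 0;

        do {
                bit = toy_find_next_bit(word, depth, bit);
                if (bit >= depth)
                        break;
                printf("splice requests from ctx %u\n", bit);
                word &= ~(1UL << bit);          /* clear_bit() equivalent */
                bit++;
        } while (1);
        return 0;
}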
671 | 681 | ||
672 | /* | 682 | /* |
673 | * Run this hardware queue, pulling any software queues mapped to it in. | 683 | * Run this hardware queue, pulling any software queues mapped to it in. |
674 | * Note that this function currently has various problems around ordering | 684 | * Note that this function currently has various problems around ordering |
675 | * of IO. In particular, we'd like FIFO behaviour on handling existing | 685 | * of IO. In particular, we'd like FIFO behaviour on handling existing |
676 | * items on the hctx->dispatch list. Ignore that for now. | 686 | * items on the hctx->dispatch list. Ignore that for now. |
677 | */ | 687 | */ |
678 | static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) | 688 | static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) |
679 | { | 689 | { |
680 | struct request_queue *q = hctx->queue; | 690 | struct request_queue *q = hctx->queue; |
681 | struct request *rq; | 691 | struct request *rq; |
682 | LIST_HEAD(rq_list); | 692 | LIST_HEAD(rq_list); |
683 | int queued; | 693 | int queued; |
684 | 694 | ||
685 | WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); | 695 | WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); |
686 | 696 | ||
687 | if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) | 697 | if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) |
688 | return; | 698 | return; |
689 | 699 | ||
690 | hctx->run++; | 700 | hctx->run++; |
691 | 701 | ||
692 | /* | 702 | /* |
693 | * Touch any software queue that has pending entries. | 703 | * Touch any software queue that has pending entries. |
694 | */ | 704 | */ |
695 | flush_busy_ctxs(hctx, &rq_list); | 705 | flush_busy_ctxs(hctx, &rq_list); |
696 | 706 | ||
697 | /* | 707 | /* |
698 | * If we have previous entries on our dispatch list, grab them | 708 | * If we have previous entries on our dispatch list, grab them |
699 | * and stuff them at the front for more fair dispatch. | 709 | * and stuff them at the front for more fair dispatch. |
700 | */ | 710 | */ |
701 | if (!list_empty_careful(&hctx->dispatch)) { | 711 | if (!list_empty_careful(&hctx->dispatch)) { |
702 | spin_lock(&hctx->lock); | 712 | spin_lock(&hctx->lock); |
703 | if (!list_empty(&hctx->dispatch)) | 713 | if (!list_empty(&hctx->dispatch)) |
704 | list_splice_init(&hctx->dispatch, &rq_list); | 714 | list_splice_init(&hctx->dispatch, &rq_list); |
705 | spin_unlock(&hctx->lock); | 715 | spin_unlock(&hctx->lock); |
706 | } | 716 | } |
707 | 717 | ||
708 | /* | 718 | /* |
709 | * Now process all the entries, sending them to the driver. | 719 | * Now process all the entries, sending them to the driver. |
710 | */ | 720 | */ |
711 | queued = 0; | 721 | queued = 0; |
712 | while (!list_empty(&rq_list)) { | 722 | while (!list_empty(&rq_list)) { |
713 | int ret; | 723 | int ret; |
714 | 724 | ||
715 | rq = list_first_entry(&rq_list, struct request, queuelist); | 725 | rq = list_first_entry(&rq_list, struct request, queuelist); |
716 | list_del_init(&rq->queuelist); | 726 | list_del_init(&rq->queuelist); |
717 | 727 | ||
718 | ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list)); | 728 | ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list)); |
719 | switch (ret) { | 729 | switch (ret) { |
720 | case BLK_MQ_RQ_QUEUE_OK: | 730 | case BLK_MQ_RQ_QUEUE_OK: |
721 | queued++; | 731 | queued++; |
722 | continue; | 732 | continue; |
723 | case BLK_MQ_RQ_QUEUE_BUSY: | 733 | case BLK_MQ_RQ_QUEUE_BUSY: |
724 | list_add(&rq->queuelist, &rq_list); | 734 | list_add(&rq->queuelist, &rq_list); |
725 | __blk_mq_requeue_request(rq); | 735 | __blk_mq_requeue_request(rq); |
726 | break; | 736 | break; |
727 | default: | 737 | default: |
728 | pr_err("blk-mq: bad return on queue: %d\n", ret); | 738 | pr_err("blk-mq: bad return on queue: %d\n", ret); |
729 | case BLK_MQ_RQ_QUEUE_ERROR: | 739 | case BLK_MQ_RQ_QUEUE_ERROR: |
730 | rq->errors = -EIO; | 740 | rq->errors = -EIO; |
731 | blk_mq_end_request(rq, rq->errors); | 741 | blk_mq_end_request(rq, rq->errors); |
732 | break; | 742 | break; |
733 | } | 743 | } |
734 | 744 | ||
735 | if (ret == BLK_MQ_RQ_QUEUE_BUSY) | 745 | if (ret == BLK_MQ_RQ_QUEUE_BUSY) |
736 | break; | 746 | break; |
737 | } | 747 | } |
738 | 748 | ||
739 | if (!queued) | 749 | if (!queued) |
740 | hctx->dispatched[0]++; | 750 | hctx->dispatched[0]++; |
741 | else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) | 751 | else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) |
742 | hctx->dispatched[ilog2(queued) + 1]++; | 752 | hctx->dispatched[ilog2(queued) + 1]++; |
743 | 753 | ||
744 | /* | 754 | /* |
745 | * Any items that need requeuing? Stuff them into hctx->dispatch, | 755 | * Any items that need requeuing? Stuff them into hctx->dispatch, |
746 | * that is where we will continue on next queue run. | 756 | * that is where we will continue on next queue run. |
747 | */ | 757 | */ |
748 | if (!list_empty(&rq_list)) { | 758 | if (!list_empty(&rq_list)) { |
749 | spin_lock(&hctx->lock); | 759 | spin_lock(&hctx->lock); |
750 | list_splice(&rq_list, &hctx->dispatch); | 760 | list_splice(&rq_list, &hctx->dispatch); |
751 | spin_unlock(&hctx->lock); | 761 | spin_unlock(&hctx->lock); |
752 | } | 762 | } |
753 | } | 763 | } |
754 | 764 | ||
755 | /* | 765 | /* |
756 | * It'd be great if the workqueue API had a way to pass | 766 | * It'd be great if the workqueue API had a way to pass |
757 | * in a mask and had some smarts for more clever placement. | 767 | * in a mask and had some smarts for more clever placement. |
758 | * For now we just round-robin here, switching for every | 768 | * For now we just round-robin here, switching for every |
759 | * BLK_MQ_CPU_WORK_BATCH queued items. | 769 | * BLK_MQ_CPU_WORK_BATCH queued items. |
760 | */ | 770 | */ |
761 | static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) | 771 | static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) |
762 | { | 772 | { |
763 | int cpu = hctx->next_cpu; | 773 | int cpu = hctx->next_cpu; |
764 | 774 | ||
765 | if (--hctx->next_cpu_batch <= 0) { | 775 | if (--hctx->next_cpu_batch <= 0) { |
766 | int next_cpu; | 776 | int next_cpu; |
767 | 777 | ||
768 | next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); | 778 | next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); |
769 | if (next_cpu >= nr_cpu_ids) | 779 | if (next_cpu >= nr_cpu_ids) |
770 | next_cpu = cpumask_first(hctx->cpumask); | 780 | next_cpu = cpumask_first(hctx->cpumask); |
771 | 781 | ||
772 | hctx->next_cpu = next_cpu; | 782 | hctx->next_cpu = next_cpu; |
773 | hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; | 783 | hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; |
774 | } | 784 | } |
775 | 785 | ||
776 | return cpu; | 786 | return cpu; |
777 | } | 787 | } |
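The batching is easiest to see in isolation. A runnable userspace model of the same policy, where WORK_BATCH stands in for BLK_MQ_CPU_WORK_BATCH and a plain modulo replaces the cpumask walk:

#include <stdio.h>

#define WORK_BATCH 8                    /* stand-in for BLK_MQ_CPU_WORK_BATCH */

struct toy_hctx {
        int next_cpu;
        int next_cpu_batch;
        int nr_cpus;                    /* stand-in for the hctx cpumask */
};

/* Same shape as blk_mq_hctx_next_cpu(): return the current CPU, and only
 * advance to the next one after WORK_BATCH picks. */
static int toy_next_cpu(struct toy_hctx *hctx)
{
        int cpu = hctx->next_cpu;

        if (--hctx->next_cpu_batch <= 0) {
                hctx->next_cpu = (hctx->next_cpu + 1) % hctx->nr_cpus;
                hctx->next_cpu_batch = WORK_BATCH;
        }
        return cpu;
}

int main(void)
{
        struct toy_hctx hctx = { .next_cpu = 0, .next_cpu_batch = WORK_BATCH,
                                 .nr_cpus = 4 };
        int i;

        for (i = 0; i < 20; i++)
                printf("%d ", toy_next_cpu(&hctx));
        printf("\n");                   /* 8 picks on CPU 0, then 8 on CPU 1, ... */
        return 0;
}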
778 | 788 | ||
779 | void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) | 789 | void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) |
780 | { | 790 | { |
781 | if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) | 791 | if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) |
782 | return; | 792 | return; |
783 | 793 | ||
784 | if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) | 794 | if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) |
785 | __blk_mq_run_hw_queue(hctx); | 795 | __blk_mq_run_hw_queue(hctx); |
786 | else if (hctx->queue->nr_hw_queues == 1) | 796 | else if (hctx->queue->nr_hw_queues == 1) |
787 | kblockd_schedule_delayed_work(&hctx->run_work, 0); | 797 | kblockd_schedule_delayed_work(&hctx->run_work, 0); |
788 | else { | 798 | else { |
789 | unsigned int cpu; | 799 | unsigned int cpu; |
790 | 800 | ||
791 | cpu = blk_mq_hctx_next_cpu(hctx); | 801 | cpu = blk_mq_hctx_next_cpu(hctx); |
792 | kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0); | 802 | kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0); |
793 | } | 803 | } |
794 | } | 804 | } |
795 | 805 | ||
796 | void blk_mq_run_queues(struct request_queue *q, bool async) | 806 | void blk_mq_run_queues(struct request_queue *q, bool async) |
797 | { | 807 | { |
798 | struct blk_mq_hw_ctx *hctx; | 808 | struct blk_mq_hw_ctx *hctx; |
799 | int i; | 809 | int i; |
800 | 810 | ||
801 | queue_for_each_hw_ctx(q, hctx, i) { | 811 | queue_for_each_hw_ctx(q, hctx, i) { |
802 | if ((!blk_mq_hctx_has_pending(hctx) && | 812 | if ((!blk_mq_hctx_has_pending(hctx) && |
803 | list_empty_careful(&hctx->dispatch)) || | 813 | list_empty_careful(&hctx->dispatch)) || |
804 | test_bit(BLK_MQ_S_STOPPED, &hctx->state)) | 814 | test_bit(BLK_MQ_S_STOPPED, &hctx->state)) |
805 | continue; | 815 | continue; |
806 | 816 | ||
807 | preempt_disable(); | 817 | preempt_disable(); |
808 | blk_mq_run_hw_queue(hctx, async); | 818 | blk_mq_run_hw_queue(hctx, async); |
809 | preempt_enable(); | 819 | preempt_enable(); |
810 | } | 820 | } |
811 | } | 821 | } |
812 | EXPORT_SYMBOL(blk_mq_run_queues); | 822 | EXPORT_SYMBOL(blk_mq_run_queues); |
813 | 823 | ||
814 | void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) | 824 | void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) |
815 | { | 825 | { |
816 | cancel_delayed_work(&hctx->run_work); | 826 | cancel_delayed_work(&hctx->run_work); |
817 | cancel_delayed_work(&hctx->delay_work); | 827 | cancel_delayed_work(&hctx->delay_work); |
818 | set_bit(BLK_MQ_S_STOPPED, &hctx->state); | 828 | set_bit(BLK_MQ_S_STOPPED, &hctx->state); |
819 | } | 829 | } |
820 | EXPORT_SYMBOL(blk_mq_stop_hw_queue); | 830 | EXPORT_SYMBOL(blk_mq_stop_hw_queue); |
821 | 831 | ||
822 | void blk_mq_stop_hw_queues(struct request_queue *q) | 832 | void blk_mq_stop_hw_queues(struct request_queue *q) |
823 | { | 833 | { |
824 | struct blk_mq_hw_ctx *hctx; | 834 | struct blk_mq_hw_ctx *hctx; |
825 | int i; | 835 | int i; |
826 | 836 | ||
827 | queue_for_each_hw_ctx(q, hctx, i) | 837 | queue_for_each_hw_ctx(q, hctx, i) |
828 | blk_mq_stop_hw_queue(hctx); | 838 | blk_mq_stop_hw_queue(hctx); |
829 | } | 839 | } |
830 | EXPORT_SYMBOL(blk_mq_stop_hw_queues); | 840 | EXPORT_SYMBOL(blk_mq_stop_hw_queues); |
831 | 841 | ||
832 | void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) | 842 | void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) |
833 | { | 843 | { |
834 | clear_bit(BLK_MQ_S_STOPPED, &hctx->state); | 844 | clear_bit(BLK_MQ_S_STOPPED, &hctx->state); |
835 | 845 | ||
836 | preempt_disable(); | 846 | preempt_disable(); |
837 | blk_mq_run_hw_queue(hctx, false); | 847 | blk_mq_run_hw_queue(hctx, false); |
838 | preempt_enable(); | 848 | preempt_enable(); |
839 | } | 849 | } |
840 | EXPORT_SYMBOL(blk_mq_start_hw_queue); | 850 | EXPORT_SYMBOL(blk_mq_start_hw_queue); |
841 | 851 | ||
842 | void blk_mq_start_hw_queues(struct request_queue *q) | 852 | void blk_mq_start_hw_queues(struct request_queue *q) |
843 | { | 853 | { |
844 | struct blk_mq_hw_ctx *hctx; | 854 | struct blk_mq_hw_ctx *hctx; |
845 | int i; | 855 | int i; |
846 | 856 | ||
847 | queue_for_each_hw_ctx(q, hctx, i) | 857 | queue_for_each_hw_ctx(q, hctx, i) |
848 | blk_mq_start_hw_queue(hctx); | 858 | blk_mq_start_hw_queue(hctx); |
849 | } | 859 | } |
850 | EXPORT_SYMBOL(blk_mq_start_hw_queues); | 860 | EXPORT_SYMBOL(blk_mq_start_hw_queues); |
851 | 861 | ||
852 | 862 | ||
853 | void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) | 863 | void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) |
854 | { | 864 | { |
855 | struct blk_mq_hw_ctx *hctx; | 865 | struct blk_mq_hw_ctx *hctx; |
856 | int i; | 866 | int i; |
857 | 867 | ||
858 | queue_for_each_hw_ctx(q, hctx, i) { | 868 | queue_for_each_hw_ctx(q, hctx, i) { |
859 | if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) | 869 | if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) |
860 | continue; | 870 | continue; |
861 | 871 | ||
862 | clear_bit(BLK_MQ_S_STOPPED, &hctx->state); | 872 | clear_bit(BLK_MQ_S_STOPPED, &hctx->state); |
863 | preempt_disable(); | 873 | preempt_disable(); |
864 | blk_mq_run_hw_queue(hctx, async); | 874 | blk_mq_run_hw_queue(hctx, async); |
865 | preempt_enable(); | 875 | preempt_enable(); |
866 | } | 876 | } |
867 | } | 877 | } |
868 | EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); | 878 | EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); |
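The stop/start exports above are normally used in pairs around device reset or error recovery. A hedged sketch, with my_hw_reset() as a hypothetical placeholder:

void my_hw_reset(void);                 /* hypothetical device reset */

static void my_reset_controller(struct request_queue *q)
{
        blk_mq_stop_hw_queues(q);       /* no further ->queue_rq calls */

        my_hw_reset();

        /* Clear STOPPED on every hctx and run them asynchronously. */
        blk_mq_start_stopped_hw_queues(q, true);
}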
869 | 879 | ||
870 | static void blk_mq_run_work_fn(struct work_struct *work) | 880 | static void blk_mq_run_work_fn(struct work_struct *work) |
871 | { | 881 | { |
872 | struct blk_mq_hw_ctx *hctx; | 882 | struct blk_mq_hw_ctx *hctx; |
873 | 883 | ||
874 | hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); | 884 | hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); |
875 | 885 | ||
876 | __blk_mq_run_hw_queue(hctx); | 886 | __blk_mq_run_hw_queue(hctx); |
877 | } | 887 | } |
878 | 888 | ||
879 | static void blk_mq_delay_work_fn(struct work_struct *work) | 889 | static void blk_mq_delay_work_fn(struct work_struct *work) |
880 | { | 890 | { |
881 | struct blk_mq_hw_ctx *hctx; | 891 | struct blk_mq_hw_ctx *hctx; |
882 | 892 | ||
883 | hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work); | 893 | hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work); |
884 | 894 | ||
885 | if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state)) | 895 | if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state)) |
886 | __blk_mq_run_hw_queue(hctx); | 896 | __blk_mq_run_hw_queue(hctx); |
887 | } | 897 | } |
888 | 898 | ||
889 | void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) | 899 | void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) |
890 | { | 900 | { |
891 | unsigned long tmo = msecs_to_jiffies(msecs); | 901 | unsigned long tmo = msecs_to_jiffies(msecs); |
892 | 902 | ||
893 | if (hctx->queue->nr_hw_queues == 1) | 903 | if (hctx->queue->nr_hw_queues == 1) |
894 | kblockd_schedule_delayed_work(&hctx->delay_work, tmo); | 904 | kblockd_schedule_delayed_work(&hctx->delay_work, tmo); |
895 | else { | 905 | else { |
896 | unsigned int cpu; | 906 | unsigned int cpu; |
897 | 907 | ||
898 | cpu = blk_mq_hctx_next_cpu(hctx); | 908 | cpu = blk_mq_hctx_next_cpu(hctx); |
899 | kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); | 909 | kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); |
900 | } | 910 | } |
901 | } | 911 | } |
902 | EXPORT_SYMBOL(blk_mq_delay_queue); | 912 | EXPORT_SYMBOL(blk_mq_delay_queue); |
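blk_mq_delay_queue() only runs the queue if the hctx is still stopped when the delayed work fires (see blk_mq_delay_work_fn() above), so the usual caller pattern is a busy back-off inside ->queue_rq. A hedged sketch; my_device_full() and my_hw_issue() are hypothetical:

bool my_device_full(void *hw);                          /* hypothetical */
void my_hw_issue(void *hw, struct request *rq);         /* hypothetical */

static int my_queue_rq_with_backoff(struct blk_mq_hw_ctx *hctx,
                                    struct request *rq, bool last)
{
        if (my_device_full(hctx->driver_data)) {
                blk_mq_stop_hw_queue(hctx);
                blk_mq_delay_queue(hctx, 3);    /* re-run this hctx in 3 msecs */
                return BLK_MQ_RQ_QUEUE_BUSY;
        }

        blk_mq_start_request(rq);
        my_hw_issue(hctx->driver_data, rq);
        return BLK_MQ_RQ_QUEUE_OK;
}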
903 | 913 | ||
904 | static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, | 914 | static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, |
905 | struct request *rq, bool at_head) | 915 | struct request *rq, bool at_head) |
906 | { | 916 | { |
907 | struct blk_mq_ctx *ctx = rq->mq_ctx; | 917 | struct blk_mq_ctx *ctx = rq->mq_ctx; |
908 | 918 | ||
909 | trace_block_rq_insert(hctx->queue, rq); | 919 | trace_block_rq_insert(hctx->queue, rq); |
910 | 920 | ||
911 | if (at_head) | 921 | if (at_head) |
912 | list_add(&rq->queuelist, &ctx->rq_list); | 922 | list_add(&rq->queuelist, &ctx->rq_list); |
913 | else | 923 | else |
914 | list_add_tail(&rq->queuelist, &ctx->rq_list); | 924 | list_add_tail(&rq->queuelist, &ctx->rq_list); |
915 | 925 | ||
916 | blk_mq_hctx_mark_pending(hctx, ctx); | 926 | blk_mq_hctx_mark_pending(hctx, ctx); |
917 | } | 927 | } |
918 | 928 | ||
919 | void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, | 929 | void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, |
920 | bool async) | 930 | bool async) |
921 | { | 931 | { |
922 | struct request_queue *q = rq->q; | 932 | struct request_queue *q = rq->q; |
923 | struct blk_mq_hw_ctx *hctx; | 933 | struct blk_mq_hw_ctx *hctx; |
924 | struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx; | 934 | struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx; |
925 | 935 | ||
926 | current_ctx = blk_mq_get_ctx(q); | 936 | current_ctx = blk_mq_get_ctx(q); |
927 | if (!cpu_online(ctx->cpu)) | 937 | if (!cpu_online(ctx->cpu)) |
928 | rq->mq_ctx = ctx = current_ctx; | 938 | rq->mq_ctx = ctx = current_ctx; |
929 | 939 | ||
930 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | 940 | hctx = q->mq_ops->map_queue(q, ctx->cpu); |
931 | 941 | ||
932 | spin_lock(&ctx->lock); | 942 | spin_lock(&ctx->lock); |
933 | __blk_mq_insert_request(hctx, rq, at_head); | 943 | __blk_mq_insert_request(hctx, rq, at_head); |
934 | spin_unlock(&ctx->lock); | 944 | spin_unlock(&ctx->lock); |
935 | 945 | ||
936 | if (run_queue) | 946 | if (run_queue) |
937 | blk_mq_run_hw_queue(hctx, async); | 947 | blk_mq_run_hw_queue(hctx, async); |
938 | 948 | ||
939 | blk_mq_put_ctx(current_ctx); | 949 | blk_mq_put_ctx(current_ctx); |
940 | } | 950 | } |
941 | 951 | ||
942 | static void blk_mq_insert_requests(struct request_queue *q, | 952 | static void blk_mq_insert_requests(struct request_queue *q, |
943 | struct blk_mq_ctx *ctx, | 953 | struct blk_mq_ctx *ctx, |
944 | struct list_head *list, | 954 | struct list_head *list, |
945 | int depth, | 955 | int depth, |
946 | bool from_schedule) | 956 | bool from_schedule) |
947 | 957 | ||
948 | { | 958 | { |
949 | struct blk_mq_hw_ctx *hctx; | 959 | struct blk_mq_hw_ctx *hctx; |
950 | struct blk_mq_ctx *current_ctx; | 960 | struct blk_mq_ctx *current_ctx; |
951 | 961 | ||
952 | trace_block_unplug(q, depth, !from_schedule); | 962 | trace_block_unplug(q, depth, !from_schedule); |
953 | 963 | ||
954 | current_ctx = blk_mq_get_ctx(q); | 964 | current_ctx = blk_mq_get_ctx(q); |
955 | 965 | ||
956 | if (!cpu_online(ctx->cpu)) | 966 | if (!cpu_online(ctx->cpu)) |
957 | ctx = current_ctx; | 967 | ctx = current_ctx; |
958 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | 968 | hctx = q->mq_ops->map_queue(q, ctx->cpu); |
959 | 969 | ||
960 | /* | 970 | /* |
961 | * preemption doesn't flush plug list, so it's possible ctx->cpu is | 971 | * preemption doesn't flush plug list, so it's possible ctx->cpu is |
962 | * offline now | 972 | * offline now |
963 | */ | 973 | */ |
964 | spin_lock(&ctx->lock); | 974 | spin_lock(&ctx->lock); |
965 | while (!list_empty(list)) { | 975 | while (!list_empty(list)) { |
966 | struct request *rq; | 976 | struct request *rq; |
967 | 977 | ||
968 | rq = list_first_entry(list, struct request, queuelist); | 978 | rq = list_first_entry(list, struct request, queuelist); |
969 | list_del_init(&rq->queuelist); | 979 | list_del_init(&rq->queuelist); |
970 | rq->mq_ctx = ctx; | 980 | rq->mq_ctx = ctx; |
971 | __blk_mq_insert_request(hctx, rq, false); | 981 | __blk_mq_insert_request(hctx, rq, false); |
972 | } | 982 | } |
973 | spin_unlock(&ctx->lock); | 983 | spin_unlock(&ctx->lock); |
974 | 984 | ||
975 | blk_mq_run_hw_queue(hctx, from_schedule); | 985 | blk_mq_run_hw_queue(hctx, from_schedule); |
976 | blk_mq_put_ctx(current_ctx); | 986 | blk_mq_put_ctx(current_ctx); |
977 | } | 987 | } |
978 | 988 | ||
979 | static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) | 989 | static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) |
980 | { | 990 | { |
981 | struct request *rqa = container_of(a, struct request, queuelist); | 991 | struct request *rqa = container_of(a, struct request, queuelist); |
982 | struct request *rqb = container_of(b, struct request, queuelist); | 992 | struct request *rqb = container_of(b, struct request, queuelist); |
983 | 993 | ||
984 | return !(rqa->mq_ctx < rqb->mq_ctx || | 994 | return !(rqa->mq_ctx < rqb->mq_ctx || |
985 | (rqa->mq_ctx == rqb->mq_ctx && | 995 | (rqa->mq_ctx == rqb->mq_ctx && |
986 | blk_rq_pos(rqa) < blk_rq_pos(rqb))); | 996 | blk_rq_pos(rqa) < blk_rq_pos(rqb))); |
987 | } | 997 | } |
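plug_ctx_cmp() is a list_sort() comparator: it returns non-zero when rqa should sort after rqb, which groups plugged requests by software queue and orders them by start sector within each group. A userspace analogue using qsort(), with toy types and pointer values compared via uintptr_t:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct toy_rq {
        const void *mq_ctx;             /* grouping key, compared by address */
        unsigned long long pos;         /* starting sector */
};

/* qsort()-style analogue of plug_ctx_cmp(): group by ctx, then by sector. */
static int toy_cmp(const void *pa, const void *pb)
{
        const struct toy_rq *a = pa, *b = pb;
        uintptr_t ca = (uintptr_t)a->mq_ctx, cb = (uintptr_t)b->mq_ctx;

        if (ca != cb)
                return ca < cb ? -1 : 1;
        return (a->pos > b->pos) - (a->pos < b->pos);
}

int main(void)
{
        static int ctx0, ctx1;
        struct toy_rq rqs[] = {
                { &ctx1, 100 }, { &ctx0, 50 }, { &ctx1, 10 }, { &ctx0, 200 },
        };
        size_t i, n = sizeof(rqs) / sizeof(rqs[0]);

        qsort(rqs, n, sizeof(rqs[0]), toy_cmp);
        for (i = 0; i < n; i++)
                printf("ctx=%p pos=%llu\n", rqs[i].mq_ctx, rqs[i].pos);
        return 0;
}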
988 | 998 | ||
989 | void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) | 999 | void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) |
990 | { | 1000 | { |
991 | struct blk_mq_ctx *this_ctx; | 1001 | struct blk_mq_ctx *this_ctx; |
992 | struct request_queue *this_q; | 1002 | struct request_queue *this_q; |
993 | struct request *rq; | 1003 | struct request *rq; |
994 | LIST_HEAD(list); | 1004 | LIST_HEAD(list); |
995 | LIST_HEAD(ctx_list); | 1005 | LIST_HEAD(ctx_list); |
996 | unsigned int depth; | 1006 | unsigned int depth; |
997 | 1007 | ||
998 | list_splice_init(&plug->mq_list, &list); | 1008 | list_splice_init(&plug->mq_list, &list); |
999 | 1009 | ||
1000 | list_sort(NULL, &list, plug_ctx_cmp); | 1010 | list_sort(NULL, &list, plug_ctx_cmp); |
1001 | 1011 | ||
1002 | this_q = NULL; | 1012 | this_q = NULL; |
1003 | this_ctx = NULL; | 1013 | this_ctx = NULL; |
1004 | depth = 0; | 1014 | depth = 0; |
1005 | 1015 | ||
1006 | while (!list_empty(&list)) { | 1016 | while (!list_empty(&list)) { |
1007 | rq = list_entry_rq(list.next); | 1017 | rq = list_entry_rq(list.next); |
1008 | list_del_init(&rq->queuelist); | 1018 | list_del_init(&rq->queuelist); |
1009 | BUG_ON(!rq->q); | 1019 | BUG_ON(!rq->q); |
1010 | if (rq->mq_ctx != this_ctx) { | 1020 | if (rq->mq_ctx != this_ctx) { |
1011 | if (this_ctx) { | 1021 | if (this_ctx) { |
1012 | blk_mq_insert_requests(this_q, this_ctx, | 1022 | blk_mq_insert_requests(this_q, this_ctx, |
1013 | &ctx_list, depth, | 1023 | &ctx_list, depth, |
1014 | from_schedule); | 1024 | from_schedule); |
1015 | } | 1025 | } |
1016 | 1026 | ||
1017 | this_ctx = rq->mq_ctx; | 1027 | this_ctx = rq->mq_ctx; |
1018 | this_q = rq->q; | 1028 | this_q = rq->q; |
1019 | depth = 0; | 1029 | depth = 0; |
1020 | } | 1030 | } |
1021 | 1031 | ||
1022 | depth++; | 1032 | depth++; |
1023 | list_add_tail(&rq->queuelist, &ctx_list); | 1033 | list_add_tail(&rq->queuelist, &ctx_list); |
1024 | } | 1034 | } |
1025 | 1035 | ||
1026 | /* | 1036 | /* |
1027 | * If 'this_ctx' is set, we know we have entries to complete | 1037 | * If 'this_ctx' is set, we know we have entries to complete |
1028 | * on 'ctx_list'. Do those. | 1038 | * on 'ctx_list'. Do those. |
1029 | */ | 1039 | */ |
1030 | if (this_ctx) { | 1040 | if (this_ctx) { |
1031 | blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, | 1041 | blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, |
1032 | from_schedule); | 1042 | from_schedule); |
1033 | } | 1043 | } |
1034 | } | 1044 | } |
1035 | 1045 | ||
1036 | static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) | 1046 | static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) |
1037 | { | 1047 | { |
1038 | init_request_from_bio(rq, bio); | 1048 | init_request_from_bio(rq, bio); |
1039 | 1049 | ||
1040 | if (blk_do_io_stat(rq)) | 1050 | if (blk_do_io_stat(rq)) |
1041 | blk_account_io_start(rq, 1); | 1051 | blk_account_io_start(rq, 1); |
1042 | } | 1052 | } |
1043 | 1053 | ||
1044 | static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) | 1054 | static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) |
1045 | { | 1055 | { |
1046 | return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && | 1056 | return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && |
1047 | !blk_queue_nomerges(hctx->queue); | 1057 | !blk_queue_nomerges(hctx->queue); |
1048 | } | 1058 | } |
1049 | 1059 | ||
1050 | static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, | 1060 | static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, |
1051 | struct blk_mq_ctx *ctx, | 1061 | struct blk_mq_ctx *ctx, |
1052 | struct request *rq, struct bio *bio) | 1062 | struct request *rq, struct bio *bio) |
1053 | { | 1063 | { |
1054 | if (!hctx_allow_merges(hctx)) { | 1064 | if (!hctx_allow_merges(hctx)) { |
1055 | blk_mq_bio_to_request(rq, bio); | 1065 | blk_mq_bio_to_request(rq, bio); |
1056 | spin_lock(&ctx->lock); | 1066 | spin_lock(&ctx->lock); |
1057 | insert_rq: | 1067 | insert_rq: |
1058 | __blk_mq_insert_request(hctx, rq, false); | 1068 | __blk_mq_insert_request(hctx, rq, false); |
1059 | spin_unlock(&ctx->lock); | 1069 | spin_unlock(&ctx->lock); |
1060 | return false; | 1070 | return false; |
1061 | } else { | 1071 | } else { |
1062 | struct request_queue *q = hctx->queue; | 1072 | struct request_queue *q = hctx->queue; |
1063 | 1073 | ||
1064 | spin_lock(&ctx->lock); | 1074 | spin_lock(&ctx->lock); |
1065 | if (!blk_mq_attempt_merge(q, ctx, bio)) { | 1075 | if (!blk_mq_attempt_merge(q, ctx, bio)) { |
1066 | blk_mq_bio_to_request(rq, bio); | 1076 | blk_mq_bio_to_request(rq, bio); |
1067 | goto insert_rq; | 1077 | goto insert_rq; |
1068 | } | 1078 | } |
1069 | 1079 | ||
1070 | spin_unlock(&ctx->lock); | 1080 | spin_unlock(&ctx->lock); |
1071 | __blk_mq_free_request(hctx, ctx, rq); | 1081 | __blk_mq_free_request(hctx, ctx, rq); |
1072 | return true; | 1082 | return true; |
1073 | } | 1083 | } |
1074 | } | 1084 | } |
1075 | 1085 | ||
1076 | struct blk_map_ctx { | 1086 | struct blk_map_ctx { |
1077 | struct blk_mq_hw_ctx *hctx; | 1087 | struct blk_mq_hw_ctx *hctx; |
1078 | struct blk_mq_ctx *ctx; | 1088 | struct blk_mq_ctx *ctx; |
1079 | }; | 1089 | }; |
1080 | 1090 | ||
1081 | static struct request *blk_mq_map_request(struct request_queue *q, | 1091 | static struct request *blk_mq_map_request(struct request_queue *q, |
1082 | struct bio *bio, | 1092 | struct bio *bio, |
1083 | struct blk_map_ctx *data) | 1093 | struct blk_map_ctx *data) |
1084 | { | 1094 | { |
1085 | struct blk_mq_hw_ctx *hctx; | 1095 | struct blk_mq_hw_ctx *hctx; |
1086 | struct blk_mq_ctx *ctx; | 1096 | struct blk_mq_ctx *ctx; |
1087 | struct request *rq; | 1097 | struct request *rq; |
1088 | int rw = bio_data_dir(bio); | 1098 | int rw = bio_data_dir(bio); |
1089 | struct blk_mq_alloc_data alloc_data; | 1099 | struct blk_mq_alloc_data alloc_data; |
1090 | 1100 | ||
1091 | if (unlikely(blk_mq_queue_enter(q))) { | 1101 | if (unlikely(blk_mq_queue_enter(q))) { |
1092 | bio_endio(bio, -EIO); | 1102 | bio_endio(bio, -EIO); |
1093 | return NULL; | 1103 | return NULL; |
1094 | } | 1104 | } |
1095 | 1105 | ||
1096 | ctx = blk_mq_get_ctx(q); | 1106 | ctx = blk_mq_get_ctx(q); |
1097 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | 1107 | hctx = q->mq_ops->map_queue(q, ctx->cpu); |
1098 | 1108 | ||
1099 | if (rw_is_sync(bio->bi_rw)) | 1109 | if (rw_is_sync(bio->bi_rw)) |
1100 | rw |= REQ_SYNC; | 1110 | rw |= REQ_SYNC; |
1101 | 1111 | ||
1102 | trace_block_getrq(q, bio, rw); | 1112 | trace_block_getrq(q, bio, rw); |
1103 | blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx, | 1113 | blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx, |
1104 | hctx); | 1114 | hctx); |
1105 | rq = __blk_mq_alloc_request(&alloc_data, rw); | 1115 | rq = __blk_mq_alloc_request(&alloc_data, rw); |
1106 | if (unlikely(!rq)) { | 1116 | if (unlikely(!rq)) { |
1107 | __blk_mq_run_hw_queue(hctx); | 1117 | __blk_mq_run_hw_queue(hctx); |
1108 | blk_mq_put_ctx(ctx); | 1118 | blk_mq_put_ctx(ctx); |
1109 | trace_block_sleeprq(q, bio, rw); | 1119 | trace_block_sleeprq(q, bio, rw); |
1110 | 1120 | ||
1111 | ctx = blk_mq_get_ctx(q); | 1121 | ctx = blk_mq_get_ctx(q); |
1112 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | 1122 | hctx = q->mq_ops->map_queue(q, ctx->cpu); |
1113 | blk_mq_set_alloc_data(&alloc_data, q, | 1123 | blk_mq_set_alloc_data(&alloc_data, q, |
1114 | __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx); | 1124 | __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx); |
1115 | rq = __blk_mq_alloc_request(&alloc_data, rw); | 1125 | rq = __blk_mq_alloc_request(&alloc_data, rw); |
1116 | ctx = alloc_data.ctx; | 1126 | ctx = alloc_data.ctx; |
1117 | hctx = alloc_data.hctx; | 1127 | hctx = alloc_data.hctx; |
1118 | } | 1128 | } |
1119 | 1129 | ||
1120 | hctx->queued++; | 1130 | hctx->queued++; |
1121 | data->hctx = hctx; | 1131 | data->hctx = hctx; |
1122 | data->ctx = ctx; | 1132 | data->ctx = ctx; |
1123 | return rq; | 1133 | return rq; |
1124 | } | 1134 | } |
1125 | 1135 | ||
1126 | /* | 1136 | /* |
1127 | * Multiple hardware queue variant. This will not use per-process plugs, | 1137 | * Multiple hardware queue variant. This will not use per-process plugs, |
1128 | * but will attempt to bypass the hctx queueing if we can go straight to | 1138 | * but will attempt to bypass the hctx queueing if we can go straight to |
1129 | * hardware for SYNC IO. | 1139 | * hardware for SYNC IO. |
1130 | */ | 1140 | */ |
1131 | static void blk_mq_make_request(struct request_queue *q, struct bio *bio) | 1141 | static void blk_mq_make_request(struct request_queue *q, struct bio *bio) |
1132 | { | 1142 | { |
1133 | const int is_sync = rw_is_sync(bio->bi_rw); | 1143 | const int is_sync = rw_is_sync(bio->bi_rw); |
1134 | const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); | 1144 | const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); |
1135 | struct blk_map_ctx data; | 1145 | struct blk_map_ctx data; |
1136 | struct request *rq; | 1146 | struct request *rq; |
1137 | 1147 | ||
1138 | blk_queue_bounce(q, &bio); | 1148 | blk_queue_bounce(q, &bio); |
1139 | 1149 | ||
1140 | if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { | 1150 | if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { |
1141 | bio_endio(bio, -EIO); | 1151 | bio_endio(bio, -EIO); |
1142 | return; | 1152 | return; |
1143 | } | 1153 | } |
1144 | 1154 | ||
1145 | rq = blk_mq_map_request(q, bio, &data); | 1155 | rq = blk_mq_map_request(q, bio, &data); |
1146 | if (unlikely(!rq)) | 1156 | if (unlikely(!rq)) |
1147 | return; | 1157 | return; |
1148 | 1158 | ||
1149 | if (unlikely(is_flush_fua)) { | 1159 | if (unlikely(is_flush_fua)) { |
1150 | blk_mq_bio_to_request(rq, bio); | 1160 | blk_mq_bio_to_request(rq, bio); |
1151 | blk_insert_flush(rq); | 1161 | blk_insert_flush(rq); |
1152 | goto run_queue; | 1162 | goto run_queue; |
1153 | } | 1163 | } |
1154 | 1164 | ||
1155 | if (is_sync) { | 1165 | if (is_sync) { |
1156 | int ret; | 1166 | int ret; |
1157 | 1167 | ||
1158 | blk_mq_bio_to_request(rq, bio); | 1168 | blk_mq_bio_to_request(rq, bio); |
1159 | 1169 | ||
1160 | /* | 1170 | /* |
1161 | * For OK queue, we are done. For error, kill it. Any other | 1171 | * For OK queue, we are done. For error, kill it. Any other |
1162 | * error (busy), just add it to our list as we previously | 1172 | * error (busy), just add it to our list as we previously |
1163 | * would have done | 1173 | * would have done |
1164 | */ | 1174 | */ |
1165 | ret = q->mq_ops->queue_rq(data.hctx, rq, true); | 1175 | ret = q->mq_ops->queue_rq(data.hctx, rq, true); |
1166 | if (ret == BLK_MQ_RQ_QUEUE_OK) | 1176 | if (ret == BLK_MQ_RQ_QUEUE_OK) |
1167 | goto done; | 1177 | goto done; |
1168 | else { | 1178 | else { |
1169 | __blk_mq_requeue_request(rq); | 1179 | __blk_mq_requeue_request(rq); |
1170 | 1180 | ||
1171 | if (ret == BLK_MQ_RQ_QUEUE_ERROR) { | 1181 | if (ret == BLK_MQ_RQ_QUEUE_ERROR) { |
1172 | rq->errors = -EIO; | 1182 | rq->errors = -EIO; |
1173 | blk_mq_end_request(rq, rq->errors); | 1183 | blk_mq_end_request(rq, rq->errors); |
1174 | goto done; | 1184 | goto done; |
1175 | } | 1185 | } |
1176 | } | 1186 | } |
1177 | } | 1187 | } |
1178 | 1188 | ||
1179 | if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { | 1189 | if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { |
1180 | /* | 1190 | /* |
1181 | * For a SYNC request, send it to the hardware immediately. For | 1191 | * For a SYNC request, send it to the hardware immediately. For |
1182 | * an ASYNC request, just ensure that we run it later on. The | 1192 | * an ASYNC request, just ensure that we run it later on. The |
1183 | * latter allows for merging opportunities and more efficient | 1193 | * latter allows for merging opportunities and more efficient |
1184 | * dispatching. | 1194 | * dispatching. |
1185 | */ | 1195 | */ |
1186 | run_queue: | 1196 | run_queue: |
1187 | blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); | 1197 | blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); |
1188 | } | 1198 | } |
1189 | done: | 1199 | done: |
1190 | blk_mq_put_ctx(data.ctx); | 1200 | blk_mq_put_ctx(data.ctx); |
1191 | } | 1201 | } |
1192 | 1202 | ||
1193 | /* | 1203 | /* |
1194 | * Single hardware queue variant. This will attempt to use any per-process | 1204 | * Single hardware queue variant. This will attempt to use any per-process |
1195 | * plug for merging and IO deferral. | 1205 | * plug for merging and IO deferral. |
1196 | */ | 1206 | */ |
1197 | static void blk_sq_make_request(struct request_queue *q, struct bio *bio) | 1207 | static void blk_sq_make_request(struct request_queue *q, struct bio *bio) |
1198 | { | 1208 | { |
1199 | const int is_sync = rw_is_sync(bio->bi_rw); | 1209 | const int is_sync = rw_is_sync(bio->bi_rw); |
1200 | const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); | 1210 | const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); |
1201 | unsigned int use_plug, request_count = 0; | 1211 | unsigned int use_plug, request_count = 0; |
1202 | struct blk_map_ctx data; | 1212 | struct blk_map_ctx data; |
1203 | struct request *rq; | 1213 | struct request *rq; |
1204 | 1214 | ||
1205 | /* | 1215 | /* |
1206 | * If we have multiple hardware queues, just go directly to | 1216 | * If we have multiple hardware queues, just go directly to |
1207 | * one of those for sync IO. | 1217 | * one of those for sync IO. |
1208 | */ | 1218 | */ |
1209 | use_plug = !is_flush_fua && !is_sync; | 1219 | use_plug = !is_flush_fua && !is_sync; |
1210 | 1220 | ||
1211 | blk_queue_bounce(q, &bio); | 1221 | blk_queue_bounce(q, &bio); |
1212 | 1222 | ||
1213 | if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { | 1223 | if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { |
1214 | bio_endio(bio, -EIO); | 1224 | bio_endio(bio, -EIO); |
1215 | return; | 1225 | return; |
1216 | } | 1226 | } |
1217 | 1227 | ||
1218 | if (use_plug && !blk_queue_nomerges(q) && | 1228 | if (use_plug && !blk_queue_nomerges(q) && |
1219 | blk_attempt_plug_merge(q, bio, &request_count)) | 1229 | blk_attempt_plug_merge(q, bio, &request_count)) |
1220 | return; | 1230 | return; |
1221 | 1231 | ||
1222 | rq = blk_mq_map_request(q, bio, &data); | 1232 | rq = blk_mq_map_request(q, bio, &data); |
1223 | if (unlikely(!rq)) | 1233 | if (unlikely(!rq)) |
1224 | return; | 1234 | return; |
1225 | 1235 | ||
1226 | if (unlikely(is_flush_fua)) { | 1236 | if (unlikely(is_flush_fua)) { |
1227 | blk_mq_bio_to_request(rq, bio); | 1237 | blk_mq_bio_to_request(rq, bio); |
1228 | blk_insert_flush(rq); | 1238 | blk_insert_flush(rq); |
1229 | goto run_queue; | 1239 | goto run_queue; |
1230 | } | 1240 | } |
1231 | 1241 | ||
1232 | /* | 1242 | /* |
1233 | * A task plug currently exists. Since this is completely lockless, | 1243 | * A task plug currently exists. Since this is completely lockless, |
1234 | * utilize that to temporarily store requests until the task is | 1244 | * utilize that to temporarily store requests until the task is |
1235 | * either done or scheduled away. | 1245 | * either done or scheduled away. |
1236 | */ | 1246 | */ |
1237 | if (use_plug) { | 1247 | if (use_plug) { |
1238 | struct blk_plug *plug = current->plug; | 1248 | struct blk_plug *plug = current->plug; |
1239 | 1249 | ||
1240 | if (plug) { | 1250 | if (plug) { |
1241 | blk_mq_bio_to_request(rq, bio); | 1251 | blk_mq_bio_to_request(rq, bio); |
1242 | if (list_empty(&plug->mq_list)) | 1252 | if (list_empty(&plug->mq_list)) |
1243 | trace_block_plug(q); | 1253 | trace_block_plug(q); |
1244 | else if (request_count >= BLK_MAX_REQUEST_COUNT) { | 1254 | else if (request_count >= BLK_MAX_REQUEST_COUNT) { |
1245 | blk_flush_plug_list(plug, false); | 1255 | blk_flush_plug_list(plug, false); |
1246 | trace_block_plug(q); | 1256 | trace_block_plug(q); |
1247 | } | 1257 | } |
1248 | list_add_tail(&rq->queuelist, &plug->mq_list); | 1258 | list_add_tail(&rq->queuelist, &plug->mq_list); |
1249 | blk_mq_put_ctx(data.ctx); | 1259 | blk_mq_put_ctx(data.ctx); |
1250 | return; | 1260 | return; |
1251 | } | 1261 | } |
1252 | } | 1262 | } |
1253 | 1263 | ||
1254 | if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { | 1264 | if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { |
1255 | /* | 1265 | /* |
1256 | * For a SYNC request, send it to the hardware immediately. For | 1266 | * For a SYNC request, send it to the hardware immediately. For |
1257 | * an ASYNC request, just ensure that we run it later on. The | 1267 | * an ASYNC request, just ensure that we run it later on. The |
1258 | * latter allows for merging opportunities and more efficient | 1268 | * latter allows for merging opportunities and more efficient |
1259 | * dispatching. | 1269 | * dispatching. |
1260 | */ | 1270 | */ |
1261 | run_queue: | 1271 | run_queue: |
1262 | blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); | 1272 | blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); |
1263 | } | 1273 | } |
1264 | 1274 | ||
1265 | blk_mq_put_ctx(data.ctx); | 1275 | blk_mq_put_ctx(data.ctx); |
1266 | } | 1276 | } |
1267 | 1277 | ||
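A minimal sketch of the caller side of this plug path (illustrative only: have_more_work() and next_bio() are hypothetical helpers, and the two-argument submit_bio() matches this kernel generation):

	struct blk_plug plug;

	blk_start_plug(&plug);			/* current->plug now points at this on-stack plug */
	while (have_more_work())		/* hypothetical loop over pending submissions */
		submit_bio(WRITE, next_bio());	/* bios queue up on plug->mq_list instead of dispatching */
	blk_finish_plug(&plug);			/* flush the batched requests toward the hardware queue */

Batching like this is what gives blk_sq_make_request() the chance to merge adjacent requests before they reach the driver.
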
1268 | /* | 1278 | /* |
1269 | * Default mapping to a software queue, since we use one per CPU. | 1279 | * Default mapping to a software queue, since we use one per CPU. |
1270 | */ | 1280 | */ |
1271 | struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) | 1281 | struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) |
1272 | { | 1282 | { |
1273 | return q->queue_hw_ctx[q->mq_map[cpu]]; | 1283 | return q->queue_hw_ctx[q->mq_map[cpu]]; |
1274 | } | 1284 | } |
1275 | EXPORT_SYMBOL(blk_mq_map_queue); | 1285 | EXPORT_SYMBOL(blk_mq_map_queue); |
1276 | 1286 | ||
1277 | static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, | 1287 | static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, |
1278 | struct blk_mq_tags *tags, unsigned int hctx_idx) | 1288 | struct blk_mq_tags *tags, unsigned int hctx_idx) |
1279 | { | 1289 | { |
1280 | struct page *page; | 1290 | struct page *page; |
1281 | 1291 | ||
1282 | if (tags->rqs && set->ops->exit_request) { | 1292 | if (tags->rqs && set->ops->exit_request) { |
1283 | int i; | 1293 | int i; |
1284 | 1294 | ||
1285 | for (i = 0; i < tags->nr_tags; i++) { | 1295 | for (i = 0; i < tags->nr_tags; i++) { |
1286 | if (!tags->rqs[i]) | 1296 | if (!tags->rqs[i]) |
1287 | continue; | 1297 | continue; |
1288 | set->ops->exit_request(set->driver_data, tags->rqs[i], | 1298 | set->ops->exit_request(set->driver_data, tags->rqs[i], |
1289 | hctx_idx, i); | 1299 | hctx_idx, i); |
1290 | tags->rqs[i] = NULL; | 1300 | tags->rqs[i] = NULL; |
1291 | } | 1301 | } |
1292 | } | 1302 | } |
1293 | 1303 | ||
1294 | while (!list_empty(&tags->page_list)) { | 1304 | while (!list_empty(&tags->page_list)) { |
1295 | page = list_first_entry(&tags->page_list, struct page, lru); | 1305 | page = list_first_entry(&tags->page_list, struct page, lru); |
1296 | list_del_init(&page->lru); | 1306 | list_del_init(&page->lru); |
1297 | __free_pages(page, page->private); | 1307 | __free_pages(page, page->private); |
1298 | } | 1308 | } |
1299 | 1309 | ||
1300 | kfree(tags->rqs); | 1310 | kfree(tags->rqs); |
1301 | 1311 | ||
1302 | blk_mq_free_tags(tags); | 1312 | blk_mq_free_tags(tags); |
1303 | } | 1313 | } |
1304 | 1314 | ||
1305 | static size_t order_to_size(unsigned int order) | 1315 | static size_t order_to_size(unsigned int order) |
1306 | { | 1316 | { |
1307 | return (size_t)PAGE_SIZE << order; | 1317 | return (size_t)PAGE_SIZE << order; |
1308 | } | 1318 | } |
1309 | 1319 | ||
1310 | static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, | 1320 | static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, |
1311 | unsigned int hctx_idx) | 1321 | unsigned int hctx_idx) |
1312 | { | 1322 | { |
1313 | struct blk_mq_tags *tags; | 1323 | struct blk_mq_tags *tags; |
1314 | unsigned int i, j, entries_per_page, max_order = 4; | 1324 | unsigned int i, j, entries_per_page, max_order = 4; |
1315 | size_t rq_size, left; | 1325 | size_t rq_size, left; |
1316 | 1326 | ||
1317 | tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, | 1327 | tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, |
1318 | set->numa_node); | 1328 | set->numa_node); |
1319 | if (!tags) | 1329 | if (!tags) |
1320 | return NULL; | 1330 | return NULL; |
1321 | 1331 | ||
1322 | INIT_LIST_HEAD(&tags->page_list); | 1332 | INIT_LIST_HEAD(&tags->page_list); |
1323 | 1333 | ||
1324 | tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), | 1334 | tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), |
1325 | GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, | 1335 | GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, |
1326 | set->numa_node); | 1336 | set->numa_node); |
1327 | if (!tags->rqs) { | 1337 | if (!tags->rqs) { |
1328 | blk_mq_free_tags(tags); | 1338 | blk_mq_free_tags(tags); |
1329 | return NULL; | 1339 | return NULL; |
1330 | } | 1340 | } |
1331 | 1341 | ||
1332 | /* | 1342 | /* |
1333 | * rq_size is the size of the request plus driver payload, rounded | 1343 | * rq_size is the size of the request plus driver payload, rounded |
1334 | * to the cacheline size | 1344 | * to the cacheline size |
1335 | */ | 1345 | */ |
1336 | rq_size = round_up(sizeof(struct request) + set->cmd_size, | 1346 | rq_size = round_up(sizeof(struct request) + set->cmd_size, |
1337 | cache_line_size()); | 1347 | cache_line_size()); |
1338 | left = rq_size * set->queue_depth; | 1348 | left = rq_size * set->queue_depth; |
1339 | 1349 | ||
1340 | for (i = 0; i < set->queue_depth; ) { | 1350 | for (i = 0; i < set->queue_depth; ) { |
1341 | int this_order = max_order; | 1351 | int this_order = max_order; |
1342 | struct page *page; | 1352 | struct page *page; |
1343 | int to_do; | 1353 | int to_do; |
1344 | void *p; | 1354 | void *p; |
1345 | 1355 | ||
1346 | while (left < order_to_size(this_order - 1) && this_order) | 1356 | while (left < order_to_size(this_order - 1) && this_order) |
1347 | this_order--; | 1357 | this_order--; |
1348 | 1358 | ||
1349 | do { | 1359 | do { |
1350 | page = alloc_pages_node(set->numa_node, | 1360 | page = alloc_pages_node(set->numa_node, |
1351 | GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, | 1361 | GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, |
1352 | this_order); | 1362 | this_order); |
1353 | if (page) | 1363 | if (page) |
1354 | break; | 1364 | break; |
1355 | if (!this_order--) | 1365 | if (!this_order--) |
1356 | break; | 1366 | break; |
1357 | if (order_to_size(this_order) < rq_size) | 1367 | if (order_to_size(this_order) < rq_size) |
1358 | break; | 1368 | break; |
1359 | } while (1); | 1369 | } while (1); |
1360 | 1370 | ||
1361 | if (!page) | 1371 | if (!page) |
1362 | goto fail; | 1372 | goto fail; |
1363 | 1373 | ||
1364 | page->private = this_order; | 1374 | page->private = this_order; |
1365 | list_add_tail(&page->lru, &tags->page_list); | 1375 | list_add_tail(&page->lru, &tags->page_list); |
1366 | 1376 | ||
1367 | p = page_address(page); | 1377 | p = page_address(page); |
1368 | entries_per_page = order_to_size(this_order) / rq_size; | 1378 | entries_per_page = order_to_size(this_order) / rq_size; |
1369 | to_do = min(entries_per_page, set->queue_depth - i); | 1379 | to_do = min(entries_per_page, set->queue_depth - i); |
1370 | left -= to_do * rq_size; | 1380 | left -= to_do * rq_size; |
1371 | for (j = 0; j < to_do; j++) { | 1381 | for (j = 0; j < to_do; j++) { |
1372 | tags->rqs[i] = p; | 1382 | tags->rqs[i] = p; |
1373 | tags->rqs[i]->atomic_flags = 0; | 1383 | tags->rqs[i]->atomic_flags = 0; |
1374 | tags->rqs[i]->cmd_flags = 0; | 1384 | tags->rqs[i]->cmd_flags = 0; |
1375 | if (set->ops->init_request) { | 1385 | if (set->ops->init_request) { |
1376 | if (set->ops->init_request(set->driver_data, | 1386 | if (set->ops->init_request(set->driver_data, |
1377 | tags->rqs[i], hctx_idx, i, | 1387 | tags->rqs[i], hctx_idx, i, |
1378 | set->numa_node)) { | 1388 | set->numa_node)) { |
1379 | tags->rqs[i] = NULL; | 1389 | tags->rqs[i] = NULL; |
1380 | goto fail; | 1390 | goto fail; |
1381 | } | 1391 | } |
1382 | } | 1392 | } |
1383 | 1393 | ||
1384 | p += rq_size; | 1394 | p += rq_size; |
1385 | i++; | 1395 | i++; |
1386 | } | 1396 | } |
1387 | } | 1397 | } |
1388 | 1398 | ||
1389 | return tags; | 1399 | return tags; |
1390 | 1400 | ||
1391 | fail: | 1401 | fail: |
1392 | blk_mq_free_rq_map(set, tags, hctx_idx); | 1402 | blk_mq_free_rq_map(set, tags, hctx_idx); |
1393 | return NULL; | 1403 | return NULL; |
1394 | } | 1404 | } |
1395 | 1405 | ||
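As a rough numeric illustration of the allocation loop in blk_mq_init_rq_map() (the sizes are hypothetical): with a 64-byte cache line, a struct request of 312 bytes and cmd_size of 32, rq_size rounds up to 384 bytes; an order-4 chunk (16 pages of 4 KiB) then holds 65536 / 384 = 170 requests, so a queue_depth of 256 consumes two such chunks, and this_order is stepped down only when the requests still left to allocate no longer justify a chunk that large.
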
1396 | static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) | 1406 | static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) |
1397 | { | 1407 | { |
1398 | kfree(bitmap->map); | 1408 | kfree(bitmap->map); |
1399 | } | 1409 | } |
1400 | 1410 | ||
1401 | static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) | 1411 | static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) |
1402 | { | 1412 | { |
1403 | unsigned int bpw = 8, total, num_maps, i; | 1413 | unsigned int bpw = 8, total, num_maps, i; |
1404 | 1414 | ||
1405 | bitmap->bits_per_word = bpw; | 1415 | bitmap->bits_per_word = bpw; |
1406 | 1416 | ||
1407 | num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; | 1417 | num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; |
1408 | bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), | 1418 | bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), |
1409 | GFP_KERNEL, node); | 1419 | GFP_KERNEL, node); |
1410 | if (!bitmap->map) | 1420 | if (!bitmap->map) |
1411 | return -ENOMEM; | 1421 | return -ENOMEM; |
1412 | 1422 | ||
1413 | bitmap->map_size = num_maps; | 1423 | bitmap->map_size = num_maps; |
1414 | 1424 | ||
1415 | total = nr_cpu_ids; | 1425 | total = nr_cpu_ids; |
1416 | for (i = 0; i < num_maps; i++) { | 1426 | for (i = 0; i < num_maps; i++) { |
1417 | bitmap->map[i].depth = min(total, bitmap->bits_per_word); | 1427 | bitmap->map[i].depth = min(total, bitmap->bits_per_word); |
1418 | total -= bitmap->map[i].depth; | 1428 | total -= bitmap->map[i].depth; |
1419 | } | 1429 | } |
1420 | 1430 | ||
1421 | return 0; | 1431 | return 0; |
1422 | } | 1432 | } |
1423 | 1433 | ||
1424 | static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) | 1434 | static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) |
1425 | { | 1435 | { |
1426 | struct request_queue *q = hctx->queue; | 1436 | struct request_queue *q = hctx->queue; |
1427 | struct blk_mq_ctx *ctx; | 1437 | struct blk_mq_ctx *ctx; |
1428 | LIST_HEAD(tmp); | 1438 | LIST_HEAD(tmp); |
1429 | 1439 | ||
1430 | /* | 1440 | /* |
1431 | * Move ctx entries to new CPU, if this one is going away. | 1441 | * Move ctx entries to new CPU, if this one is going away. |
1432 | */ | 1442 | */ |
1433 | ctx = __blk_mq_get_ctx(q, cpu); | 1443 | ctx = __blk_mq_get_ctx(q, cpu); |
1434 | 1444 | ||
1435 | spin_lock(&ctx->lock); | 1445 | spin_lock(&ctx->lock); |
1436 | if (!list_empty(&ctx->rq_list)) { | 1446 | if (!list_empty(&ctx->rq_list)) { |
1437 | list_splice_init(&ctx->rq_list, &tmp); | 1447 | list_splice_init(&ctx->rq_list, &tmp); |
1438 | blk_mq_hctx_clear_pending(hctx, ctx); | 1448 | blk_mq_hctx_clear_pending(hctx, ctx); |
1439 | } | 1449 | } |
1440 | spin_unlock(&ctx->lock); | 1450 | spin_unlock(&ctx->lock); |
1441 | 1451 | ||
1442 | if (list_empty(&tmp)) | 1452 | if (list_empty(&tmp)) |
1443 | return NOTIFY_OK; | 1453 | return NOTIFY_OK; |
1444 | 1454 | ||
1445 | ctx = blk_mq_get_ctx(q); | 1455 | ctx = blk_mq_get_ctx(q); |
1446 | spin_lock(&ctx->lock); | 1456 | spin_lock(&ctx->lock); |
1447 | 1457 | ||
1448 | while (!list_empty(&tmp)) { | 1458 | while (!list_empty(&tmp)) { |
1449 | struct request *rq; | 1459 | struct request *rq; |
1450 | 1460 | ||
1451 | rq = list_first_entry(&tmp, struct request, queuelist); | 1461 | rq = list_first_entry(&tmp, struct request, queuelist); |
1452 | rq->mq_ctx = ctx; | 1462 | rq->mq_ctx = ctx; |
1453 | list_move_tail(&rq->queuelist, &ctx->rq_list); | 1463 | list_move_tail(&rq->queuelist, &ctx->rq_list); |
1454 | } | 1464 | } |
1455 | 1465 | ||
1456 | hctx = q->mq_ops->map_queue(q, ctx->cpu); | 1466 | hctx = q->mq_ops->map_queue(q, ctx->cpu); |
1457 | blk_mq_hctx_mark_pending(hctx, ctx); | 1467 | blk_mq_hctx_mark_pending(hctx, ctx); |
1458 | 1468 | ||
1459 | spin_unlock(&ctx->lock); | 1469 | spin_unlock(&ctx->lock); |
1460 | 1470 | ||
1461 | blk_mq_run_hw_queue(hctx, true); | 1471 | blk_mq_run_hw_queue(hctx, true); |
1462 | blk_mq_put_ctx(ctx); | 1472 | blk_mq_put_ctx(ctx); |
1463 | return NOTIFY_OK; | 1473 | return NOTIFY_OK; |
1464 | } | 1474 | } |
1465 | 1475 | ||
1466 | static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu) | 1476 | static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu) |
1467 | { | 1477 | { |
1468 | struct request_queue *q = hctx->queue; | 1478 | struct request_queue *q = hctx->queue; |
1469 | struct blk_mq_tag_set *set = q->tag_set; | 1479 | struct blk_mq_tag_set *set = q->tag_set; |
1470 | 1480 | ||
1471 | if (set->tags[hctx->queue_num]) | 1481 | if (set->tags[hctx->queue_num]) |
1472 | return NOTIFY_OK; | 1482 | return NOTIFY_OK; |
1473 | 1483 | ||
1474 | set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num); | 1484 | set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num); |
1475 | if (!set->tags[hctx->queue_num]) | 1485 | if (!set->tags[hctx->queue_num]) |
1476 | return NOTIFY_STOP; | 1486 | return NOTIFY_STOP; |
1477 | 1487 | ||
1478 | hctx->tags = set->tags[hctx->queue_num]; | 1488 | hctx->tags = set->tags[hctx->queue_num]; |
1479 | return NOTIFY_OK; | 1489 | return NOTIFY_OK; |
1480 | } | 1490 | } |
1481 | 1491 | ||
1482 | static int blk_mq_hctx_notify(void *data, unsigned long action, | 1492 | static int blk_mq_hctx_notify(void *data, unsigned long action, |
1483 | unsigned int cpu) | 1493 | unsigned int cpu) |
1484 | { | 1494 | { |
1485 | struct blk_mq_hw_ctx *hctx = data; | 1495 | struct blk_mq_hw_ctx *hctx = data; |
1486 | 1496 | ||
1487 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) | 1497 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) |
1488 | return blk_mq_hctx_cpu_offline(hctx, cpu); | 1498 | return blk_mq_hctx_cpu_offline(hctx, cpu); |
1489 | else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) | 1499 | else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) |
1490 | return blk_mq_hctx_cpu_online(hctx, cpu); | 1500 | return blk_mq_hctx_cpu_online(hctx, cpu); |
1491 | 1501 | ||
1492 | return NOTIFY_OK; | 1502 | return NOTIFY_OK; |
1493 | } | 1503 | } |
1494 | 1504 | ||
1495 | static void blk_mq_exit_hctx(struct request_queue *q, | 1505 | static void blk_mq_exit_hctx(struct request_queue *q, |
1496 | struct blk_mq_tag_set *set, | 1506 | struct blk_mq_tag_set *set, |
1497 | struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) | 1507 | struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) |
1498 | { | 1508 | { |
1499 | unsigned flush_start_tag = set->queue_depth; | 1509 | unsigned flush_start_tag = set->queue_depth; |
1500 | 1510 | ||
1501 | blk_mq_tag_idle(hctx); | 1511 | blk_mq_tag_idle(hctx); |
1502 | 1512 | ||
1503 | if (set->ops->exit_request) | 1513 | if (set->ops->exit_request) |
1504 | set->ops->exit_request(set->driver_data, | 1514 | set->ops->exit_request(set->driver_data, |
1505 | hctx->fq->flush_rq, hctx_idx, | 1515 | hctx->fq->flush_rq, hctx_idx, |
1506 | flush_start_tag + hctx_idx); | 1516 | flush_start_tag + hctx_idx); |
1507 | 1517 | ||
1508 | if (set->ops->exit_hctx) | 1518 | if (set->ops->exit_hctx) |
1509 | set->ops->exit_hctx(hctx, hctx_idx); | 1519 | set->ops->exit_hctx(hctx, hctx_idx); |
1510 | 1520 | ||
1511 | blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); | 1521 | blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); |
1512 | blk_free_flush_queue(hctx->fq); | 1522 | blk_free_flush_queue(hctx->fq); |
1513 | kfree(hctx->ctxs); | 1523 | kfree(hctx->ctxs); |
1514 | blk_mq_free_bitmap(&hctx->ctx_map); | 1524 | blk_mq_free_bitmap(&hctx->ctx_map); |
1515 | } | 1525 | } |
1516 | 1526 | ||
1517 | static void blk_mq_exit_hw_queues(struct request_queue *q, | 1527 | static void blk_mq_exit_hw_queues(struct request_queue *q, |
1518 | struct blk_mq_tag_set *set, int nr_queue) | 1528 | struct blk_mq_tag_set *set, int nr_queue) |
1519 | { | 1529 | { |
1520 | struct blk_mq_hw_ctx *hctx; | 1530 | struct blk_mq_hw_ctx *hctx; |
1521 | unsigned int i; | 1531 | unsigned int i; |
1522 | 1532 | ||
1523 | queue_for_each_hw_ctx(q, hctx, i) { | 1533 | queue_for_each_hw_ctx(q, hctx, i) { |
1524 | if (i == nr_queue) | 1534 | if (i == nr_queue) |
1525 | break; | 1535 | break; |
1526 | blk_mq_exit_hctx(q, set, hctx, i); | 1536 | blk_mq_exit_hctx(q, set, hctx, i); |
1527 | } | 1537 | } |
1528 | } | 1538 | } |
1529 | 1539 | ||
1530 | static void blk_mq_free_hw_queues(struct request_queue *q, | 1540 | static void blk_mq_free_hw_queues(struct request_queue *q, |
1531 | struct blk_mq_tag_set *set) | 1541 | struct blk_mq_tag_set *set) |
1532 | { | 1542 | { |
1533 | struct blk_mq_hw_ctx *hctx; | 1543 | struct blk_mq_hw_ctx *hctx; |
1534 | unsigned int i; | 1544 | unsigned int i; |
1535 | 1545 | ||
1536 | queue_for_each_hw_ctx(q, hctx, i) { | 1546 | queue_for_each_hw_ctx(q, hctx, i) { |
1537 | free_cpumask_var(hctx->cpumask); | 1547 | free_cpumask_var(hctx->cpumask); |
1538 | kfree(hctx); | 1548 | kfree(hctx); |
1539 | } | 1549 | } |
1540 | } | 1550 | } |
1541 | 1551 | ||
1542 | static int blk_mq_init_hctx(struct request_queue *q, | 1552 | static int blk_mq_init_hctx(struct request_queue *q, |
1543 | struct blk_mq_tag_set *set, | 1553 | struct blk_mq_tag_set *set, |
1544 | struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) | 1554 | struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) |
1545 | { | 1555 | { |
1546 | int node; | 1556 | int node; |
1547 | unsigned flush_start_tag = set->queue_depth; | 1557 | unsigned flush_start_tag = set->queue_depth; |
1548 | 1558 | ||
1549 | node = hctx->numa_node; | 1559 | node = hctx->numa_node; |
1550 | if (node == NUMA_NO_NODE) | 1560 | if (node == NUMA_NO_NODE) |
1551 | node = hctx->numa_node = set->numa_node; | 1561 | node = hctx->numa_node = set->numa_node; |
1552 | 1562 | ||
1553 | INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); | 1563 | INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); |
1554 | INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); | 1564 | INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); |
1555 | spin_lock_init(&hctx->lock); | 1565 | spin_lock_init(&hctx->lock); |
1556 | INIT_LIST_HEAD(&hctx->dispatch); | 1566 | INIT_LIST_HEAD(&hctx->dispatch); |
1557 | hctx->queue = q; | 1567 | hctx->queue = q; |
1558 | hctx->queue_num = hctx_idx; | 1568 | hctx->queue_num = hctx_idx; |
1559 | hctx->flags = set->flags; | 1569 | hctx->flags = set->flags; |
1560 | hctx->cmd_size = set->cmd_size; | 1570 | hctx->cmd_size = set->cmd_size; |
1561 | 1571 | ||
1562 | blk_mq_init_cpu_notifier(&hctx->cpu_notifier, | 1572 | blk_mq_init_cpu_notifier(&hctx->cpu_notifier, |
1563 | blk_mq_hctx_notify, hctx); | 1573 | blk_mq_hctx_notify, hctx); |
1564 | blk_mq_register_cpu_notifier(&hctx->cpu_notifier); | 1574 | blk_mq_register_cpu_notifier(&hctx->cpu_notifier); |
1565 | 1575 | ||
1566 | hctx->tags = set->tags[hctx_idx]; | 1576 | hctx->tags = set->tags[hctx_idx]; |
1567 | 1577 | ||
1568 | /* | 1578 | /* |
1569 | * Allocate space for all possible cpus to avoid allocation at | 1579 | * Allocate space for all possible cpus to avoid allocation at |
1570 | * runtime | 1580 | * runtime |
1571 | */ | 1581 | */ |
1572 | hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), | 1582 | hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), |
1573 | GFP_KERNEL, node); | 1583 | GFP_KERNEL, node); |
1574 | if (!hctx->ctxs) | 1584 | if (!hctx->ctxs) |
1575 | goto unregister_cpu_notifier; | 1585 | goto unregister_cpu_notifier; |
1576 | 1586 | ||
1577 | if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) | 1587 | if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) |
1578 | goto free_ctxs; | 1588 | goto free_ctxs; |
1579 | 1589 | ||
1580 | hctx->nr_ctx = 0; | 1590 | hctx->nr_ctx = 0; |
1581 | 1591 | ||
1582 | if (set->ops->init_hctx && | 1592 | if (set->ops->init_hctx && |
1583 | set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) | 1593 | set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) |
1584 | goto free_bitmap; | 1594 | goto free_bitmap; |
1585 | 1595 | ||
1586 | hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); | 1596 | hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); |
1587 | if (!hctx->fq) | 1597 | if (!hctx->fq) |
1588 | goto exit_hctx; | 1598 | goto exit_hctx; |
1589 | 1599 | ||
1590 | if (set->ops->init_request && | 1600 | if (set->ops->init_request && |
1591 | set->ops->init_request(set->driver_data, | 1601 | set->ops->init_request(set->driver_data, |
1592 | hctx->fq->flush_rq, hctx_idx, | 1602 | hctx->fq->flush_rq, hctx_idx, |
1593 | flush_start_tag + hctx_idx, node)) | 1603 | flush_start_tag + hctx_idx, node)) |
1594 | goto free_fq; | 1604 | goto free_fq; |
1595 | 1605 | ||
1596 | return 0; | 1606 | return 0; |
1597 | 1607 | ||
1598 | free_fq: | 1608 | free_fq: |
1599 | kfree(hctx->fq); | 1609 | kfree(hctx->fq); |
1600 | exit_hctx: | 1610 | exit_hctx: |
1601 | if (set->ops->exit_hctx) | 1611 | if (set->ops->exit_hctx) |
1602 | set->ops->exit_hctx(hctx, hctx_idx); | 1612 | set->ops->exit_hctx(hctx, hctx_idx); |
1603 | free_bitmap: | 1613 | free_bitmap: |
1604 | blk_mq_free_bitmap(&hctx->ctx_map); | 1614 | blk_mq_free_bitmap(&hctx->ctx_map); |
1605 | free_ctxs: | 1615 | free_ctxs: |
1606 | kfree(hctx->ctxs); | 1616 | kfree(hctx->ctxs); |
1607 | unregister_cpu_notifier: | 1617 | unregister_cpu_notifier: |
1608 | blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); | 1618 | blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); |
1609 | 1619 | ||
1610 | return -1; | 1620 | return -1; |
1611 | } | 1621 | } |
1612 | 1622 | ||
1613 | static int blk_mq_init_hw_queues(struct request_queue *q, | 1623 | static int blk_mq_init_hw_queues(struct request_queue *q, |
1614 | struct blk_mq_tag_set *set) | 1624 | struct blk_mq_tag_set *set) |
1615 | { | 1625 | { |
1616 | struct blk_mq_hw_ctx *hctx; | 1626 | struct blk_mq_hw_ctx *hctx; |
1617 | unsigned int i; | 1627 | unsigned int i; |
1618 | 1628 | ||
1619 | /* | 1629 | /* |
1620 | * Initialize hardware queues | 1630 | * Initialize hardware queues |
1621 | */ | 1631 | */ |
1622 | queue_for_each_hw_ctx(q, hctx, i) { | 1632 | queue_for_each_hw_ctx(q, hctx, i) { |
1623 | if (blk_mq_init_hctx(q, set, hctx, i)) | 1633 | if (blk_mq_init_hctx(q, set, hctx, i)) |
1624 | break; | 1634 | break; |
1625 | } | 1635 | } |
1626 | 1636 | ||
1627 | if (i == q->nr_hw_queues) | 1637 | if (i == q->nr_hw_queues) |
1628 | return 0; | 1638 | return 0; |
1629 | 1639 | ||
1630 | /* | 1640 | /* |
1631 | * Init failed | 1641 | * Init failed |
1632 | */ | 1642 | */ |
1633 | blk_mq_exit_hw_queues(q, set, i); | 1643 | blk_mq_exit_hw_queues(q, set, i); |
1634 | 1644 | ||
1635 | return 1; | 1645 | return 1; |
1636 | } | 1646 | } |
1637 | 1647 | ||
1638 | static void blk_mq_init_cpu_queues(struct request_queue *q, | 1648 | static void blk_mq_init_cpu_queues(struct request_queue *q, |
1639 | unsigned int nr_hw_queues) | 1649 | unsigned int nr_hw_queues) |
1640 | { | 1650 | { |
1641 | unsigned int i; | 1651 | unsigned int i; |
1642 | 1652 | ||
1643 | for_each_possible_cpu(i) { | 1653 | for_each_possible_cpu(i) { |
1644 | struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); | 1654 | struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); |
1645 | struct blk_mq_hw_ctx *hctx; | 1655 | struct blk_mq_hw_ctx *hctx; |
1646 | 1656 | ||
1647 | memset(__ctx, 0, sizeof(*__ctx)); | 1657 | memset(__ctx, 0, sizeof(*__ctx)); |
1648 | __ctx->cpu = i; | 1658 | __ctx->cpu = i; |
1649 | spin_lock_init(&__ctx->lock); | 1659 | spin_lock_init(&__ctx->lock); |
1650 | INIT_LIST_HEAD(&__ctx->rq_list); | 1660 | INIT_LIST_HEAD(&__ctx->rq_list); |
1651 | __ctx->queue = q; | 1661 | __ctx->queue = q; |
1652 | 1662 | ||
1653 | /* If the cpu isn't online, the cpu is mapped to first hctx */ | 1663 | /* If the cpu isn't online, the cpu is mapped to first hctx */ |
1654 | if (!cpu_online(i)) | 1664 | if (!cpu_online(i)) |
1655 | continue; | 1665 | continue; |
1656 | 1666 | ||
1657 | hctx = q->mq_ops->map_queue(q, i); | 1667 | hctx = q->mq_ops->map_queue(q, i); |
1658 | cpumask_set_cpu(i, hctx->cpumask); | 1668 | cpumask_set_cpu(i, hctx->cpumask); |
1659 | hctx->nr_ctx++; | 1669 | hctx->nr_ctx++; |
1660 | 1670 | ||
1661 | /* | 1671 | /* |
1662 | * Set local node, IFF we have more than one hw queue. If | 1672 | * Set local node, IFF we have more than one hw queue. If |
1663 | * not, we remain on the home node of the device | 1673 | * not, we remain on the home node of the device |
1664 | */ | 1674 | */ |
1665 | if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) | 1675 | if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) |
1666 | hctx->numa_node = cpu_to_node(i); | 1676 | hctx->numa_node = cpu_to_node(i); |
1667 | } | 1677 | } |
1668 | } | 1678 | } |
1669 | 1679 | ||
1670 | static void blk_mq_map_swqueue(struct request_queue *q) | 1680 | static void blk_mq_map_swqueue(struct request_queue *q) |
1671 | { | 1681 | { |
1672 | unsigned int i; | 1682 | unsigned int i; |
1673 | struct blk_mq_hw_ctx *hctx; | 1683 | struct blk_mq_hw_ctx *hctx; |
1674 | struct blk_mq_ctx *ctx; | 1684 | struct blk_mq_ctx *ctx; |
1675 | 1685 | ||
1676 | queue_for_each_hw_ctx(q, hctx, i) { | 1686 | queue_for_each_hw_ctx(q, hctx, i) { |
1677 | cpumask_clear(hctx->cpumask); | 1687 | cpumask_clear(hctx->cpumask); |
1678 | hctx->nr_ctx = 0; | 1688 | hctx->nr_ctx = 0; |
1679 | } | 1689 | } |
1680 | 1690 | ||
1681 | /* | 1691 | /* |
1682 | * Map software to hardware queues | 1692 | * Map software to hardware queues |
1683 | */ | 1693 | */ |
1684 | queue_for_each_ctx(q, ctx, i) { | 1694 | queue_for_each_ctx(q, ctx, i) { |
1685 | /* If the cpu isn't online, the cpu is mapped to first hctx */ | 1695 | /* If the cpu isn't online, the cpu is mapped to first hctx */ |
1686 | if (!cpu_online(i)) | 1696 | if (!cpu_online(i)) |
1687 | continue; | 1697 | continue; |
1688 | 1698 | ||
1689 | hctx = q->mq_ops->map_queue(q, i); | 1699 | hctx = q->mq_ops->map_queue(q, i); |
1690 | cpumask_set_cpu(i, hctx->cpumask); | 1700 | cpumask_set_cpu(i, hctx->cpumask); |
1691 | ctx->index_hw = hctx->nr_ctx; | 1701 | ctx->index_hw = hctx->nr_ctx; |
1692 | hctx->ctxs[hctx->nr_ctx++] = ctx; | 1702 | hctx->ctxs[hctx->nr_ctx++] = ctx; |
1693 | } | 1703 | } |
1694 | 1704 | ||
1695 | queue_for_each_hw_ctx(q, hctx, i) { | 1705 | queue_for_each_hw_ctx(q, hctx, i) { |
1696 | /* | 1706 | /* |
1697 | * If no software queues are mapped to this hardware queue, | 1707 | * If no software queues are mapped to this hardware queue, |
1698 | * disable it and free the request entries. | 1708 | * disable it and free the request entries. |
1699 | */ | 1709 | */ |
1700 | if (!hctx->nr_ctx) { | 1710 | if (!hctx->nr_ctx) { |
1701 | struct blk_mq_tag_set *set = q->tag_set; | 1711 | struct blk_mq_tag_set *set = q->tag_set; |
1702 | 1712 | ||
1703 | if (set->tags[i]) { | 1713 | if (set->tags[i]) { |
1704 | blk_mq_free_rq_map(set, set->tags[i], i); | 1714 | blk_mq_free_rq_map(set, set->tags[i], i); |
1705 | set->tags[i] = NULL; | 1715 | set->tags[i] = NULL; |
1706 | hctx->tags = NULL; | 1716 | hctx->tags = NULL; |
1707 | } | 1717 | } |
1708 | continue; | 1718 | continue; |
1709 | } | 1719 | } |
1710 | 1720 | ||
1711 | /* | 1721 | /* |
1712 | * Initialize batch roundrobin counts | 1722 | * Initialize batch roundrobin counts |
1713 | */ | 1723 | */ |
1714 | hctx->next_cpu = cpumask_first(hctx->cpumask); | 1724 | hctx->next_cpu = cpumask_first(hctx->cpumask); |
1715 | hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; | 1725 | hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; |
1716 | } | 1726 | } |
1717 | } | 1727 | } |
1718 | 1728 | ||
1719 | static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) | 1729 | static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) |
1720 | { | 1730 | { |
1721 | struct blk_mq_hw_ctx *hctx; | 1731 | struct blk_mq_hw_ctx *hctx; |
1722 | struct request_queue *q; | 1732 | struct request_queue *q; |
1723 | bool shared; | 1733 | bool shared; |
1724 | int i; | 1734 | int i; |
1725 | 1735 | ||
1726 | if (set->tag_list.next == set->tag_list.prev) | 1736 | if (set->tag_list.next == set->tag_list.prev) |
1727 | shared = false; | 1737 | shared = false; |
1728 | else | 1738 | else |
1729 | shared = true; | 1739 | shared = true; |
1730 | 1740 | ||
1731 | list_for_each_entry(q, &set->tag_list, tag_set_list) { | 1741 | list_for_each_entry(q, &set->tag_list, tag_set_list) { |
1732 | blk_mq_freeze_queue(q); | 1742 | blk_mq_freeze_queue(q); |
1733 | 1743 | ||
1734 | queue_for_each_hw_ctx(q, hctx, i) { | 1744 | queue_for_each_hw_ctx(q, hctx, i) { |
1735 | if (shared) | 1745 | if (shared) |
1736 | hctx->flags |= BLK_MQ_F_TAG_SHARED; | 1746 | hctx->flags |= BLK_MQ_F_TAG_SHARED; |
1737 | else | 1747 | else |
1738 | hctx->flags &= ~BLK_MQ_F_TAG_SHARED; | 1748 | hctx->flags &= ~BLK_MQ_F_TAG_SHARED; |
1739 | } | 1749 | } |
1740 | blk_mq_unfreeze_queue(q); | 1750 | blk_mq_unfreeze_queue(q); |
1741 | } | 1751 | } |
1742 | } | 1752 | } |
1743 | 1753 | ||
1744 | static void blk_mq_del_queue_tag_set(struct request_queue *q) | 1754 | static void blk_mq_del_queue_tag_set(struct request_queue *q) |
1745 | { | 1755 | { |
1746 | struct blk_mq_tag_set *set = q->tag_set; | 1756 | struct blk_mq_tag_set *set = q->tag_set; |
1747 | 1757 | ||
1748 | mutex_lock(&set->tag_list_lock); | 1758 | mutex_lock(&set->tag_list_lock); |
1749 | list_del_init(&q->tag_set_list); | 1759 | list_del_init(&q->tag_set_list); |
1750 | blk_mq_update_tag_set_depth(set); | 1760 | blk_mq_update_tag_set_depth(set); |
1751 | mutex_unlock(&set->tag_list_lock); | 1761 | mutex_unlock(&set->tag_list_lock); |
1752 | } | 1762 | } |
1753 | 1763 | ||
1754 | static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, | 1764 | static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, |
1755 | struct request_queue *q) | 1765 | struct request_queue *q) |
1756 | { | 1766 | { |
1757 | q->tag_set = set; | 1767 | q->tag_set = set; |
1758 | 1768 | ||
1759 | mutex_lock(&set->tag_list_lock); | 1769 | mutex_lock(&set->tag_list_lock); |
1760 | list_add_tail(&q->tag_set_list, &set->tag_list); | 1770 | list_add_tail(&q->tag_set_list, &set->tag_list); |
1761 | blk_mq_update_tag_set_depth(set); | 1771 | blk_mq_update_tag_set_depth(set); |
1762 | mutex_unlock(&set->tag_list_lock); | 1772 | mutex_unlock(&set->tag_list_lock); |
1763 | } | 1773 | } |
1764 | 1774 | ||
1765 | struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) | 1775 | struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) |
1766 | { | 1776 | { |
1767 | struct blk_mq_hw_ctx **hctxs; | 1777 | struct blk_mq_hw_ctx **hctxs; |
1768 | struct blk_mq_ctx __percpu *ctx; | 1778 | struct blk_mq_ctx __percpu *ctx; |
1769 | struct request_queue *q; | 1779 | struct request_queue *q; |
1770 | unsigned int *map; | 1780 | unsigned int *map; |
1771 | int i; | 1781 | int i; |
1772 | 1782 | ||
1773 | ctx = alloc_percpu(struct blk_mq_ctx); | 1783 | ctx = alloc_percpu(struct blk_mq_ctx); |
1774 | if (!ctx) | 1784 | if (!ctx) |
1775 | return ERR_PTR(-ENOMEM); | 1785 | return ERR_PTR(-ENOMEM); |
1776 | 1786 | ||
1777 | /* | 1787 | /* |
1778 | * If a crashdump is active, then we are potentially in a very | 1788 | * If a crashdump is active, then we are potentially in a very |
1779 | * memory constrained environment. Limit us to 1 queue and | 1789 | * memory constrained environment. Limit us to 1 queue and |
1780 | * 64 tags to prevent using too much memory. | 1790 | * 64 tags to prevent using too much memory. |
1781 | */ | 1791 | */ |
1782 | if (is_kdump_kernel()) { | 1792 | if (is_kdump_kernel()) { |
1783 | set->nr_hw_queues = 1; | 1793 | set->nr_hw_queues = 1; |
1784 | set->queue_depth = min(64U, set->queue_depth); | 1794 | set->queue_depth = min(64U, set->queue_depth); |
1785 | } | 1795 | } |
1786 | 1796 | ||
1787 | hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, | 1797 | hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, |
1788 | set->numa_node); | 1798 | set->numa_node); |
1789 | 1799 | ||
1790 | if (!hctxs) | 1800 | if (!hctxs) |
1791 | goto err_percpu; | 1801 | goto err_percpu; |
1792 | 1802 | ||
1793 | map = blk_mq_make_queue_map(set); | 1803 | map = blk_mq_make_queue_map(set); |
1794 | if (!map) | 1804 | if (!map) |
1795 | goto err_map; | 1805 | goto err_map; |
1796 | 1806 | ||
1797 | for (i = 0; i < set->nr_hw_queues; i++) { | 1807 | for (i = 0; i < set->nr_hw_queues; i++) { |
1798 | int node = blk_mq_hw_queue_to_node(map, i); | 1808 | int node = blk_mq_hw_queue_to_node(map, i); |
1799 | 1809 | ||
1800 | hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), | 1810 | hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), |
1801 | GFP_KERNEL, node); | 1811 | GFP_KERNEL, node); |
1802 | if (!hctxs[i]) | 1812 | if (!hctxs[i]) |
1803 | goto err_hctxs; | 1813 | goto err_hctxs; |
1804 | 1814 | ||
1805 | if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, | 1815 | if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, |
1806 | node)) | 1816 | node)) |
1807 | goto err_hctxs; | 1817 | goto err_hctxs; |
1808 | 1818 | ||
1809 | atomic_set(&hctxs[i]->nr_active, 0); | 1819 | atomic_set(&hctxs[i]->nr_active, 0); |
1810 | hctxs[i]->numa_node = node; | 1820 | hctxs[i]->numa_node = node; |
1811 | hctxs[i]->queue_num = i; | 1821 | hctxs[i]->queue_num = i; |
1812 | } | 1822 | } |
1813 | 1823 | ||
1814 | q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); | 1824 | q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); |
1815 | if (!q) | 1825 | if (!q) |
1816 | goto err_hctxs; | 1826 | goto err_hctxs; |
1817 | 1827 | ||
1818 | /* | 1828 | /* |
1819 | * Init percpu_ref in atomic mode so that it's faster to shutdown. | 1829 | * Init percpu_ref in atomic mode so that it's faster to shutdown. |
1820 | * See blk_register_queue() for details. | 1830 | * See blk_register_queue() for details. |
1821 | */ | 1831 | */ |
1822 | if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, | 1832 | if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, |
1823 | PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) | 1833 | PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) |
1824 | goto err_map; | 1834 | goto err_map; |
1825 | 1835 | ||
1826 | setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); | 1836 | setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); |
1827 | blk_queue_rq_timeout(q, 30000); | 1837 | blk_queue_rq_timeout(q, 30000); |
1828 | 1838 | ||
1829 | q->nr_queues = nr_cpu_ids; | 1839 | q->nr_queues = nr_cpu_ids; |
1830 | q->nr_hw_queues = set->nr_hw_queues; | 1840 | q->nr_hw_queues = set->nr_hw_queues; |
1831 | q->mq_map = map; | 1841 | q->mq_map = map; |
1832 | 1842 | ||
1833 | q->queue_ctx = ctx; | 1843 | q->queue_ctx = ctx; |
1834 | q->queue_hw_ctx = hctxs; | 1844 | q->queue_hw_ctx = hctxs; |
1835 | 1845 | ||
1836 | q->mq_ops = set->ops; | 1846 | q->mq_ops = set->ops; |
1837 | q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; | 1847 | q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; |
1838 | 1848 | ||
1839 | if (!(set->flags & BLK_MQ_F_SG_MERGE)) | 1849 | if (!(set->flags & BLK_MQ_F_SG_MERGE)) |
1840 | q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE; | 1850 | q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE; |
1841 | 1851 | ||
1842 | q->sg_reserved_size = INT_MAX; | 1852 | q->sg_reserved_size = INT_MAX; |
1843 | 1853 | ||
1844 | INIT_WORK(&q->requeue_work, blk_mq_requeue_work); | 1854 | INIT_WORK(&q->requeue_work, blk_mq_requeue_work); |
1845 | INIT_LIST_HEAD(&q->requeue_list); | 1855 | INIT_LIST_HEAD(&q->requeue_list); |
1846 | spin_lock_init(&q->requeue_lock); | 1856 | spin_lock_init(&q->requeue_lock); |
1847 | 1857 | ||
1848 | if (q->nr_hw_queues > 1) | 1858 | if (q->nr_hw_queues > 1) |
1849 | blk_queue_make_request(q, blk_mq_make_request); | 1859 | blk_queue_make_request(q, blk_mq_make_request); |
1850 | else | 1860 | else |
1851 | blk_queue_make_request(q, blk_sq_make_request); | 1861 | blk_queue_make_request(q, blk_sq_make_request); |
1852 | 1862 | ||
1853 | if (set->timeout) | 1863 | if (set->timeout) |
1854 | blk_queue_rq_timeout(q, set->timeout); | 1864 | blk_queue_rq_timeout(q, set->timeout); |
1855 | 1865 | ||
1856 | /* | 1866 | /* |
1857 | * Do this after blk_queue_make_request() overrides it... | 1867 | * Do this after blk_queue_make_request() overrides it... |
1858 | */ | 1868 | */ |
1859 | q->nr_requests = set->queue_depth; | 1869 | q->nr_requests = set->queue_depth; |
1860 | 1870 | ||
1861 | if (set->ops->complete) | 1871 | if (set->ops->complete) |
1862 | blk_queue_softirq_done(q, set->ops->complete); | 1872 | blk_queue_softirq_done(q, set->ops->complete); |
1863 | 1873 | ||
1864 | blk_mq_init_cpu_queues(q, set->nr_hw_queues); | 1874 | blk_mq_init_cpu_queues(q, set->nr_hw_queues); |
1865 | 1875 | ||
1866 | if (blk_mq_init_hw_queues(q, set)) | 1876 | if (blk_mq_init_hw_queues(q, set)) |
1867 | goto err_hw; | 1877 | goto err_hw; |
1868 | 1878 | ||
1869 | mutex_lock(&all_q_mutex); | 1879 | mutex_lock(&all_q_mutex); |
1870 | list_add_tail(&q->all_q_node, &all_q_list); | 1880 | list_add_tail(&q->all_q_node, &all_q_list); |
1871 | mutex_unlock(&all_q_mutex); | 1881 | mutex_unlock(&all_q_mutex); |
1872 | 1882 | ||
1873 | blk_mq_add_queue_tag_set(set, q); | 1883 | blk_mq_add_queue_tag_set(set, q); |
1874 | 1884 | ||
1875 | blk_mq_map_swqueue(q); | 1885 | blk_mq_map_swqueue(q); |
1876 | 1886 | ||
1877 | return q; | 1887 | return q; |
1878 | 1888 | ||
1879 | err_hw: | 1889 | err_hw: |
1880 | blk_cleanup_queue(q); | 1890 | blk_cleanup_queue(q); |
1881 | err_hctxs: | 1891 | err_hctxs: |
1882 | kfree(map); | 1892 | kfree(map); |
1883 | for (i = 0; i < set->nr_hw_queues; i++) { | 1893 | for (i = 0; i < set->nr_hw_queues; i++) { |
1884 | if (!hctxs[i]) | 1894 | if (!hctxs[i]) |
1885 | break; | 1895 | break; |
1886 | free_cpumask_var(hctxs[i]->cpumask); | 1896 | free_cpumask_var(hctxs[i]->cpumask); |
1887 | kfree(hctxs[i]); | 1897 | kfree(hctxs[i]); |
1888 | } | 1898 | } |
1889 | err_map: | 1899 | err_map: |
1890 | kfree(hctxs); | 1900 | kfree(hctxs); |
1891 | err_percpu: | 1901 | err_percpu: |
1892 | free_percpu(ctx); | 1902 | free_percpu(ctx); |
1893 | return ERR_PTR(-ENOMEM); | 1903 | return ERR_PTR(-ENOMEM); |
1894 | } | 1904 | } |
1895 | EXPORT_SYMBOL(blk_mq_init_queue); | 1905 | EXPORT_SYMBOL(blk_mq_init_queue); |
1896 | 1906 | ||
1897 | void blk_mq_free_queue(struct request_queue *q) | 1907 | void blk_mq_free_queue(struct request_queue *q) |
1898 | { | 1908 | { |
1899 | struct blk_mq_tag_set *set = q->tag_set; | 1909 | struct blk_mq_tag_set *set = q->tag_set; |
1900 | 1910 | ||
1901 | blk_mq_del_queue_tag_set(q); | 1911 | blk_mq_del_queue_tag_set(q); |
1902 | 1912 | ||
1903 | blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); | 1913 | blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); |
1904 | blk_mq_free_hw_queues(q, set); | 1914 | blk_mq_free_hw_queues(q, set); |
1905 | 1915 | ||
1906 | percpu_ref_exit(&q->mq_usage_counter); | 1916 | percpu_ref_exit(&q->mq_usage_counter); |
1907 | 1917 | ||
1908 | free_percpu(q->queue_ctx); | 1918 | free_percpu(q->queue_ctx); |
1909 | kfree(q->queue_hw_ctx); | 1919 | kfree(q->queue_hw_ctx); |
1910 | kfree(q->mq_map); | 1920 | kfree(q->mq_map); |
1911 | 1921 | ||
1912 | q->queue_ctx = NULL; | 1922 | q->queue_ctx = NULL; |
1913 | q->queue_hw_ctx = NULL; | 1923 | q->queue_hw_ctx = NULL; |
1914 | q->mq_map = NULL; | 1924 | q->mq_map = NULL; |
1915 | 1925 | ||
1916 | mutex_lock(&all_q_mutex); | 1926 | mutex_lock(&all_q_mutex); |
1917 | list_del_init(&q->all_q_node); | 1927 | list_del_init(&q->all_q_node); |
1918 | mutex_unlock(&all_q_mutex); | 1928 | mutex_unlock(&all_q_mutex); |
1919 | } | 1929 | } |
1920 | 1930 | ||
1921 | /* Basically redo blk_mq_init_queue with queue frozen */ | 1931 | /* Basically redo blk_mq_init_queue with queue frozen */ |
1922 | static void blk_mq_queue_reinit(struct request_queue *q) | 1932 | static void blk_mq_queue_reinit(struct request_queue *q) |
1923 | { | 1933 | { |
1924 | blk_mq_freeze_queue(q); | 1934 | WARN_ON_ONCE(!q->mq_freeze_depth); |
1925 | 1935 | ||
1926 | blk_mq_sysfs_unregister(q); | 1936 | blk_mq_sysfs_unregister(q); |
1927 | 1937 | ||
1928 | blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); | 1938 | blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); |
1929 | 1939 | ||
1930 | /* | 1940 | /* |
1931 | * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe | 1941 | * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe |
1932 | * we should change hctx numa_node according to new topology (this | 1942 | * we should change hctx numa_node according to new topology (this |
1933 | * involves freeing and re-allocating memory; worth doing?) | 1943 | * involves freeing and re-allocating memory; worth doing?) |
1934 | */ | 1944 | */ |
1935 | 1945 | ||
1936 | blk_mq_map_swqueue(q); | 1946 | blk_mq_map_swqueue(q); |
1937 | 1947 | ||
1938 | blk_mq_sysfs_register(q); | 1948 | blk_mq_sysfs_register(q); |
1939 | |||
1940 | blk_mq_unfreeze_queue(q); | ||
1941 | } | 1949 | } |
1942 | 1950 | ||
1943 | static int blk_mq_queue_reinit_notify(struct notifier_block *nb, | 1951 | static int blk_mq_queue_reinit_notify(struct notifier_block *nb, |
1944 | unsigned long action, void *hcpu) | 1952 | unsigned long action, void *hcpu) |
1945 | { | 1953 | { |
1946 | struct request_queue *q; | 1954 | struct request_queue *q; |
1947 | 1955 | ||
1948 | /* | 1956 | /* |
1949 | * Before new mappings are established, hotadded cpu might already | 1957 | * Before new mappings are established, hotadded cpu might already |
1950 | * start handling requests. This doesn't break anything as we map | 1958 | * start handling requests. This doesn't break anything as we map |
1951 | * offline CPUs to first hardware queue. We will re-init the queue | 1959 | * offline CPUs to first hardware queue. We will re-init the queue |
1952 | * below to get optimal settings. | 1960 | * below to get optimal settings. |
1953 | */ | 1961 | */ |
1954 | if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && | 1962 | if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && |
1955 | action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) | 1963 | action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) |
1956 | return NOTIFY_OK; | 1964 | return NOTIFY_OK; |
1957 | 1965 | ||
1958 | mutex_lock(&all_q_mutex); | 1966 | mutex_lock(&all_q_mutex); |
1967 | |||
1968 | /* | ||
1969 | * We need to freeze and reinit all existing queues. Freezing | ||
1970 | * involves synchronous wait for an RCU grace period and doing it | ||
1971 | * one by one may take a long time. Start freezing all queues in | ||
1972 | * one swoop and then wait for the completions so that freezing can | ||
1973 | * take place in parallel. | ||
1974 | */ | ||
1959 | list_for_each_entry(q, &all_q_list, all_q_node) | 1975 | list_for_each_entry(q, &all_q_list, all_q_node) |
1976 | blk_mq_freeze_queue_start(q); | ||
1977 | list_for_each_entry(q, &all_q_list, all_q_node) | ||
1978 | blk_mq_freeze_queue_wait(q); | ||
1979 | |||
1980 | list_for_each_entry(q, &all_q_list, all_q_node) | ||
1960 | blk_mq_queue_reinit(q); | 1981 | blk_mq_queue_reinit(q); |
1982 | |||
1983 | list_for_each_entry(q, &all_q_list, all_q_node) | ||
1984 | blk_mq_unfreeze_queue(q); | ||
1985 | |||
1961 | mutex_unlock(&all_q_mutex); | 1986 | mutex_unlock(&all_q_mutex); |
1962 | return NOTIFY_OK; | 1987 | return NOTIFY_OK; |
1963 | } | 1988 | } |
1964 | 1989 | ||
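The notifier above is what makes queue reinit cheap when many devices are present: blk_mq_freeze_queue() waits synchronously for an RCU grace period, so freezing N queues one at a time costs roughly N grace periods, while starting every freeze first and only then waiting lets the grace periods overlap so the whole pass costs roughly one. A condensed sketch of the contrast, using only the helpers visible above:

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_freeze_queue(q);		/* serial: one full grace-period wait per queue */

versus the split used in blk_mq_queue_reinit_notify():

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_freeze_queue_start(q);	/* begin freezing every queue first */
	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_freeze_queue_wait(q);	/* then wait once; the grace periods overlap */

blk_mq_queue_reinit() is correspondingly changed to assume the queue is already frozen (hence the WARN_ON_ONCE on mq_freeze_depth), with the unfreeze done in a final pass over all_q_list.
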
1965 | static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) | 1990 | static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) |
1966 | { | 1991 | { |
1967 | int i; | 1992 | int i; |
1968 | 1993 | ||
1969 | for (i = 0; i < set->nr_hw_queues; i++) { | 1994 | for (i = 0; i < set->nr_hw_queues; i++) { |
1970 | set->tags[i] = blk_mq_init_rq_map(set, i); | 1995 | set->tags[i] = blk_mq_init_rq_map(set, i); |
1971 | if (!set->tags[i]) | 1996 | if (!set->tags[i]) |
1972 | goto out_unwind; | 1997 | goto out_unwind; |
1973 | } | 1998 | } |
1974 | 1999 | ||
1975 | return 0; | 2000 | return 0; |
1976 | 2001 | ||
1977 | out_unwind: | 2002 | out_unwind: |
1978 | while (--i >= 0) | 2003 | while (--i >= 0) |
1979 | blk_mq_free_rq_map(set, set->tags[i], i); | 2004 | blk_mq_free_rq_map(set, set->tags[i], i); |
1980 | 2005 | ||
1981 | return -ENOMEM; | 2006 | return -ENOMEM; |
1982 | } | 2007 | } |
1983 | 2008 | ||
1984 | /* | 2009 | /* |
1985 | * Allocate the request maps associated with this tag_set. Note that this | 2010 | * Allocate the request maps associated with this tag_set. Note that this |
1986 | * may reduce the depth asked for, if memory is tight. set->queue_depth | 2011 | * may reduce the depth asked for, if memory is tight. set->queue_depth |
1987 | * will be updated to reflect the allocated depth. | 2012 | * will be updated to reflect the allocated depth. |
1988 | */ | 2013 | */ |
1989 | static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) | 2014 | static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) |
1990 | { | 2015 | { |
1991 | unsigned int depth; | 2016 | unsigned int depth; |
1992 | int err; | 2017 | int err; |
1993 | 2018 | ||
1994 | depth = set->queue_depth; | 2019 | depth = set->queue_depth; |
1995 | do { | 2020 | do { |
1996 | err = __blk_mq_alloc_rq_maps(set); | 2021 | err = __blk_mq_alloc_rq_maps(set); |
1997 | if (!err) | 2022 | if (!err) |
1998 | break; | 2023 | break; |
1999 | 2024 | ||
2000 | set->queue_depth >>= 1; | 2025 | set->queue_depth >>= 1; |
2001 | if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { | 2026 | if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { |
2002 | err = -ENOMEM; | 2027 | err = -ENOMEM; |
2003 | break; | 2028 | break; |
2004 | } | 2029 | } |
2005 | } while (set->queue_depth); | 2030 | } while (set->queue_depth); |
2006 | 2031 | ||
2007 | if (!set->queue_depth || err) { | 2032 | if (!set->queue_depth || err) { |
2008 | pr_err("blk-mq: failed to allocate request map\n"); | 2033 | pr_err("blk-mq: failed to allocate request map\n"); |
2009 | return -ENOMEM; | 2034 | return -ENOMEM; |
2010 | } | 2035 | } |
2011 | 2036 | ||
2012 | if (depth != set->queue_depth) | 2037 | if (depth != set->queue_depth) |
2013 | pr_info("blk-mq: reduced tag depth (%u -> %u)\n", | 2038 | pr_info("blk-mq: reduced tag depth (%u -> %u)\n", |
2014 | depth, set->queue_depth); | 2039 | depth, set->queue_depth); |
2015 | 2040 | ||
2016 | return 0; | 2041 | return 0; |
2017 | } | 2042 | } |
2018 | 2043 | ||
2019 | /* | 2044 | /* |
2020 | * Alloc a tag set to be associated with one or more request queues. | 2045 | * Alloc a tag set to be associated with one or more request queues. |
2021 | * May fail with EINVAL for various error conditions. May adjust the | 2046 | * May fail with EINVAL for various error conditions. May adjust the |
2022 | * requested depth down, if it is too large. In that case, the set | 2047 | * requested depth down, if it is too large. In that case, the set |
2023 | * value will be stored in set->queue_depth. | 2048 | * value will be stored in set->queue_depth. |
2024 | */ | 2049 | */ |
2025 | int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) | 2050 | int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) |
2026 | { | 2051 | { |
2027 | if (!set->nr_hw_queues) | 2052 | if (!set->nr_hw_queues) |
2028 | return -EINVAL; | 2053 | return -EINVAL; |
2029 | if (!set->queue_depth) | 2054 | if (!set->queue_depth) |
2030 | return -EINVAL; | 2055 | return -EINVAL; |
2031 | if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) | 2056 | if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) |
2032 | return -EINVAL; | 2057 | return -EINVAL; |
2033 | 2058 | ||
2034 | if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue) | 2059 | if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue) |
2035 | return -EINVAL; | 2060 | return -EINVAL; |
2036 | 2061 | ||
2037 | if (set->queue_depth > BLK_MQ_MAX_DEPTH) { | 2062 | if (set->queue_depth > BLK_MQ_MAX_DEPTH) { |
2038 | pr_info("blk-mq: reduced tag depth to %u\n", | 2063 | pr_info("blk-mq: reduced tag depth to %u\n", |
2039 | BLK_MQ_MAX_DEPTH); | 2064 | BLK_MQ_MAX_DEPTH); |
2040 | set->queue_depth = BLK_MQ_MAX_DEPTH; | 2065 | set->queue_depth = BLK_MQ_MAX_DEPTH; |
2041 | } | 2066 | } |
2042 | 2067 | ||
2043 | set->tags = kmalloc_node(set->nr_hw_queues * | 2068 | set->tags = kmalloc_node(set->nr_hw_queues * |
2044 | sizeof(struct blk_mq_tags *), | 2069 | sizeof(struct blk_mq_tags *), |
2045 | GFP_KERNEL, set->numa_node); | 2070 | GFP_KERNEL, set->numa_node); |
2046 | if (!set->tags) | 2071 | if (!set->tags) |
2047 | return -ENOMEM; | 2072 | return -ENOMEM; |
2048 | 2073 | ||
2049 | if (blk_mq_alloc_rq_maps(set)) | 2074 | if (blk_mq_alloc_rq_maps(set)) |
2050 | goto enomem; | 2075 | goto enomem; |
2051 | 2076 | ||
2052 | mutex_init(&set->tag_list_lock); | 2077 | mutex_init(&set->tag_list_lock); |
2053 | INIT_LIST_HEAD(&set->tag_list); | 2078 | INIT_LIST_HEAD(&set->tag_list); |
2054 | 2079 | ||
2055 | return 0; | 2080 | return 0; |
2056 | enomem: | 2081 | enomem: |
2057 | kfree(set->tags); | 2082 | kfree(set->tags); |
2058 | set->tags = NULL; | 2083 | set->tags = NULL; |
2059 | return -ENOMEM; | 2084 | return -ENOMEM; |
2060 | } | 2085 | } |
2061 | EXPORT_SYMBOL(blk_mq_alloc_tag_set); | 2086 | EXPORT_SYMBOL(blk_mq_alloc_tag_set); |
2062 | 2087 | ||
2063 | void blk_mq_free_tag_set(struct blk_mq_tag_set *set) | 2088 | void blk_mq_free_tag_set(struct blk_mq_tag_set *set) |
2064 | { | 2089 | { |
2065 | int i; | 2090 | int i; |
2066 | 2091 | ||
2067 | for (i = 0; i < set->nr_hw_queues; i++) { | 2092 | for (i = 0; i < set->nr_hw_queues; i++) { |
2068 | if (set->tags[i]) | 2093 | if (set->tags[i]) |
2069 | blk_mq_free_rq_map(set, set->tags[i], i); | 2094 | blk_mq_free_rq_map(set, set->tags[i], i); |
2070 | } | 2095 | } |
2071 | 2096 | ||
2072 | kfree(set->tags); | 2097 | kfree(set->tags); |
2073 | set->tags = NULL; | 2098 | set->tags = NULL; |
2074 | } | 2099 | } |
2075 | EXPORT_SYMBOL(blk_mq_free_tag_set); | 2100 | EXPORT_SYMBOL(blk_mq_free_tag_set); |
2076 | 2101 | ||
2077 | int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) | 2102 | int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) |
2078 | { | 2103 | { |
2079 | struct blk_mq_tag_set *set = q->tag_set; | 2104 | struct blk_mq_tag_set *set = q->tag_set; |
2080 | struct blk_mq_hw_ctx *hctx; | 2105 | struct blk_mq_hw_ctx *hctx; |
2081 | int i, ret; | 2106 | int i, ret; |
2082 | 2107 | ||
2083 | if (!set || nr > set->queue_depth) | 2108 | if (!set || nr > set->queue_depth) |
2084 | return -EINVAL; | 2109 | return -EINVAL; |
2085 | 2110 | ||
2086 | ret = 0; | 2111 | ret = 0; |
2087 | queue_for_each_hw_ctx(q, hctx, i) { | 2112 | queue_for_each_hw_ctx(q, hctx, i) { |
2088 | ret = blk_mq_tag_update_depth(hctx->tags, nr); | 2113 | ret = blk_mq_tag_update_depth(hctx->tags, nr); |
2089 | if (ret) | 2114 | if (ret) |
2090 | break; | 2115 | break; |
2091 | } | 2116 | } |
2092 | 2117 | ||
2093 | if (!ret) | 2118 | if (!ret) |
2094 | q->nr_requests = nr; | 2119 | q->nr_requests = nr; |
2095 | 2120 | ||
2096 | return ret; | 2121 | return ret; |
2097 | } | 2122 | } |
2098 | 2123 | ||
2099 | void blk_mq_disable_hotplug(void) | 2124 | void blk_mq_disable_hotplug(void) |
2100 | { | 2125 | { |
2101 | mutex_lock(&all_q_mutex); | 2126 | mutex_lock(&all_q_mutex); |
2102 | } | 2127 | } |
2103 | 2128 | ||
2104 | void blk_mq_enable_hotplug(void) | 2129 | void blk_mq_enable_hotplug(void) |
2105 | { | 2130 | { |
2106 | mutex_unlock(&all_q_mutex); | 2131 | mutex_unlock(&all_q_mutex); |
2107 | } | 2132 | } |
2108 | 2133 | ||
2109 | static int __init blk_mq_init(void) | 2134 | static int __init blk_mq_init(void) |
2110 | { | 2135 | { |
2111 | blk_mq_cpu_init(); | 2136 | blk_mq_cpu_init(); |
2112 | 2137 |
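For context on how the tag-set API above is consumed, a hedged sketch of a driver's setup (my_queue_rq and struct my_cmd are hypothetical driver pieces; the field values are illustrative only):

	static struct blk_mq_ops my_mq_ops = {
		.queue_rq	= my_queue_rq,		/* hypothetical per-request dispatch callback */
		.map_queue	= blk_mq_map_queue,	/* default CPU-to-hctx mapping exported above */
	};

	struct blk_mq_tag_set set = {
		.ops		= &my_mq_ops,
		.nr_hw_queues	= 1,
		.queue_depth	= 64,
		.cmd_size	= sizeof(struct my_cmd),	/* driver payload allocated behind each request */
		.numa_node	= NUMA_NO_NODE,
		.flags		= BLK_MQ_F_SHOULD_MERGE,
	};
	struct request_queue *q;
	int err;

	err = blk_mq_alloc_tag_set(&set);	/* may shrink queue_depth if memory is tight */
	if (err)
		return err;
	q = blk_mq_init_queue(&set);		/* returns an ERR_PTR() on failure */

On teardown, blk_mq_free_queue() and blk_mq_free_tag_set() undo the two calls in reverse order.
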
block/ioprio.c
1 | /* | 1 | /* |
2 | * fs/ioprio.c | 2 | * fs/ioprio.c |
3 | * | 3 | * |
4 | * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk> | 4 | * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk> |
5 | * | 5 | * |
6 | * Helper functions for setting/querying io priorities of processes. The | 6 | * Helper functions for setting/querying io priorities of processes. The |
7 | * system calls closely mimic getpriority/setpriority, see the man page for | 7 | * system calls closely mimic getpriority/setpriority, see the man page for |
8 | * those. The prio argument is a composite of prio class and prio data, where | 8 | * those. The prio argument is a composite of prio class and prio data, where |
9 | * the data argument has meaning within that class. The standard scheduling | 9 | * the data argument has meaning within that class. The standard scheduling |
10 | * classes have 8 distinct prio levels, with 0 being the highest prio and 7 | 10 | * classes have 8 distinct prio levels, with 0 being the highest prio and 7 |
11 | * being the lowest. | 11 | * being the lowest. |
12 | * | 12 | * |
13 | * IOW, setting BE scheduling class with prio 2 is done ala: | 13 | * IOW, setting BE scheduling class with prio 2 is done ala: |
14 | * | 14 | * |
15 | * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; | 15 | * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; |
16 | * | 16 | * |
17 | * ioprio_set(PRIO_PROCESS, pid, prio); | 17 | * ioprio_set(PRIO_PROCESS, pid, prio); |
18 | * | 18 | * |
19 | * See also Documentation/block/ioprio.txt | 19 | * See also Documentation/block/ioprio.txt |
20 | * | 20 | * |
21 | */ | 21 | */ |
22 | #include <linux/gfp.h> | 22 | #include <linux/gfp.h> |
23 | #include <linux/kernel.h> | 23 | #include <linux/kernel.h> |
24 | #include <linux/export.h> | 24 | #include <linux/export.h> |
25 | #include <linux/ioprio.h> | 25 | #include <linux/ioprio.h> |
26 | #include <linux/blkdev.h> | 26 | #include <linux/blkdev.h> |
27 | #include <linux/capability.h> | 27 | #include <linux/capability.h> |
28 | #include <linux/syscalls.h> | 28 | #include <linux/syscalls.h> |
29 | #include <linux/security.h> | 29 | #include <linux/security.h> |
30 | #include <linux/pid_namespace.h> | 30 | #include <linux/pid_namespace.h> |
31 | 31 | ||
32 | int set_task_ioprio(struct task_struct *task, int ioprio) | 32 | int set_task_ioprio(struct task_struct *task, int ioprio) |
33 | { | 33 | { |
34 | int err; | 34 | int err; |
35 | struct io_context *ioc; | 35 | struct io_context *ioc; |
36 | const struct cred *cred = current_cred(), *tcred; | 36 | const struct cred *cred = current_cred(), *tcred; |
37 | 37 | ||
38 | rcu_read_lock(); | 38 | rcu_read_lock(); |
39 | tcred = __task_cred(task); | 39 | tcred = __task_cred(task); |
40 | if (!uid_eq(tcred->uid, cred->euid) && | 40 | if (!uid_eq(tcred->uid, cred->euid) && |
41 | !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { | 41 | !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { |
42 | rcu_read_unlock(); | 42 | rcu_read_unlock(); |
43 | return -EPERM; | 43 | return -EPERM; |
44 | } | 44 | } |
45 | rcu_read_unlock(); | 45 | rcu_read_unlock(); |
46 | 46 | ||
47 | err = security_task_setioprio(task, ioprio); | 47 | err = security_task_setioprio(task, ioprio); |
48 | if (err) | 48 | if (err) |
49 | return err; | 49 | return err; |
50 | 50 | ||
51 | ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); | 51 | ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); |
52 | if (ioc) { | 52 | if (ioc) { |
53 | ioc->ioprio = ioprio; | 53 | ioc->ioprio = ioprio; |
54 | put_io_context(ioc); | 54 | put_io_context(ioc); |
55 | } | 55 | } |
56 | 56 | ||
57 | return err; | 57 | return err; |
58 | } | 58 | } |
59 | EXPORT_SYMBOL_GPL(set_task_ioprio); | 59 | EXPORT_SYMBOL_GPL(set_task_ioprio); |
60 | 60 | ||
61 | SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) | 61 | SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) |
62 | { | 62 | { |
63 | int class = IOPRIO_PRIO_CLASS(ioprio); | 63 | int class = IOPRIO_PRIO_CLASS(ioprio); |
64 | int data = IOPRIO_PRIO_DATA(ioprio); | 64 | int data = IOPRIO_PRIO_DATA(ioprio); |
65 | struct task_struct *p, *g; | 65 | struct task_struct *p, *g; |
66 | struct user_struct *user; | 66 | struct user_struct *user; |
67 | struct pid *pgrp; | 67 | struct pid *pgrp; |
68 | kuid_t uid; | 68 | kuid_t uid; |
69 | int ret; | 69 | int ret; |
70 | 70 | ||
71 | switch (class) { | 71 | switch (class) { |
72 | case IOPRIO_CLASS_RT: | 72 | case IOPRIO_CLASS_RT: |
73 | if (!capable(CAP_SYS_ADMIN)) | 73 | if (!capable(CAP_SYS_ADMIN)) |
74 | return -EPERM; | 74 | return -EPERM; |
75 | /* fall through, rt has prio field too */ | 75 | /* fall through, rt has prio field too */ |
76 | case IOPRIO_CLASS_BE: | 76 | case IOPRIO_CLASS_BE: |
77 | if (data >= IOPRIO_BE_NR || data < 0) | 77 | if (data >= IOPRIO_BE_NR || data < 0) |
78 | return -EINVAL; | 78 | return -EINVAL; |
79 | 79 | ||
80 | break; | 80 | break; |
81 | case IOPRIO_CLASS_IDLE: | 81 | case IOPRIO_CLASS_IDLE: |
82 | break; | 82 | break; |
83 | case IOPRIO_CLASS_NONE: | 83 | case IOPRIO_CLASS_NONE: |
84 | if (data) | 84 | if (data) |
85 | return -EINVAL; | 85 | return -EINVAL; |
86 | break; | 86 | break; |
87 | default: | 87 | default: |
88 | return -EINVAL; | 88 | return -EINVAL; |
89 | } | 89 | } |
90 | 90 | ||
91 | ret = -ESRCH; | 91 | ret = -ESRCH; |
92 | rcu_read_lock(); | 92 | rcu_read_lock(); |
93 | switch (which) { | 93 | switch (which) { |
94 | case IOPRIO_WHO_PROCESS: | 94 | case IOPRIO_WHO_PROCESS: |
95 | if (!who) | 95 | if (!who) |
96 | p = current; | 96 | p = current; |
97 | else | 97 | else |
98 | p = find_task_by_vpid(who); | 98 | p = find_task_by_vpid(who); |
99 | if (p) | 99 | if (p) |
100 | ret = set_task_ioprio(p, ioprio); | 100 | ret = set_task_ioprio(p, ioprio); |
101 | break; | 101 | break; |
102 | case IOPRIO_WHO_PGRP: | 102 | case IOPRIO_WHO_PGRP: |
103 | if (!who) | 103 | if (!who) |
104 | pgrp = task_pgrp(current); | 104 | pgrp = task_pgrp(current); |
105 | else | 105 | else |
106 | pgrp = find_vpid(who); | 106 | pgrp = find_vpid(who); |
107 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { | 107 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { |
108 | ret = set_task_ioprio(p, ioprio); | 108 | ret = set_task_ioprio(p, ioprio); |
109 | if (ret) | 109 | if (ret) |
110 | break; | 110 | break; |
111 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | 111 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
112 | break; | 112 | break; |
113 | case IOPRIO_WHO_USER: | 113 | case IOPRIO_WHO_USER: |
114 | uid = make_kuid(current_user_ns(), who); | 114 | uid = make_kuid(current_user_ns(), who); |
115 | if (!uid_valid(uid)) | 115 | if (!uid_valid(uid)) |
116 | break; | 116 | break; |
117 | if (!who) | 117 | if (!who) |
118 | user = current_user(); | 118 | user = current_user(); |
119 | else | 119 | else |
120 | user = find_user(uid); | 120 | user = find_user(uid); |
121 | 121 | ||
122 | if (!user) | 122 | if (!user) |
123 | break; | 123 | break; |
124 | 124 | ||
125 | do_each_thread(g, p) { | 125 | do_each_thread(g, p) { |
126 | if (!uid_eq(task_uid(p), uid)) | 126 | if (!uid_eq(task_uid(p), uid)) |
127 | continue; | 127 | continue; |
128 | ret = set_task_ioprio(p, ioprio); | 128 | ret = set_task_ioprio(p, ioprio); |
129 | if (ret) | 129 | if (ret) |
130 | goto free_uid; | 130 | goto free_uid; |
131 | } while_each_thread(g, p); | 131 | } while_each_thread(g, p); |
132 | free_uid: | 132 | free_uid: |
133 | if (who) | 133 | if (who) |
134 | free_uid(user); | 134 | free_uid(user); |
135 | break; | 135 | break; |
136 | default: | 136 | default: |
137 | ret = -EINVAL; | 137 | ret = -EINVAL; |
138 | } | 138 | } |
139 | 139 | ||
140 | rcu_read_unlock(); | 140 | rcu_read_unlock(); |
141 | return ret; | 141 | return ret; |
142 | } | 142 | } |
143 | 143 | ||
144 | static int get_task_ioprio(struct task_struct *p) | 144 | static int get_task_ioprio(struct task_struct *p) |
145 | { | 145 | { |
146 | int ret; | 146 | int ret; |
147 | 147 | ||
148 | ret = security_task_getioprio(p); | 148 | ret = security_task_getioprio(p); |
149 | if (ret) | 149 | if (ret) |
150 | goto out; | 150 | goto out; |
151 | ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); | 151 | ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); |
152 | if (p->io_context) | 152 | if (p->io_context) |
153 | ret = p->io_context->ioprio; | 153 | ret = p->io_context->ioprio; |
154 | out: | 154 | out: |
155 | return ret; | 155 | return ret; |
156 | } | 156 | } |
157 | 157 | ||
158 | int ioprio_best(unsigned short aprio, unsigned short bprio) | 158 | int ioprio_best(unsigned short aprio, unsigned short bprio) |
159 | { | 159 | { |
160 | unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); | 160 | unsigned short aclass; |
161 | unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); | 161 | unsigned short bclass; |
162 | 162 | ||
163 | if (aclass == IOPRIO_CLASS_NONE) | 163 | if (!ioprio_valid(aprio)) |
164 | aclass = IOPRIO_CLASS_BE; | 164 | aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); |
165 | if (bclass == IOPRIO_CLASS_NONE) | 165 | if (!ioprio_valid(bprio)) |
166 | bclass = IOPRIO_CLASS_BE; | 166 | bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); |
167 | 167 | ||
168 | aclass = IOPRIO_PRIO_CLASS(aprio); | ||
169 | bclass = IOPRIO_PRIO_CLASS(bprio); | ||
168 | if (aclass == bclass) | 170 | if (aclass == bclass) |
169 | return min(aprio, bprio); | 171 | return min(aprio, bprio); |
170 | if (aclass > bclass) | 172 | if (aclass > bclass) |
171 | return bprio; | 173 | return bprio; |
172 | else | 174 | else |
173 | return aprio; | 175 | return aprio; |
174 | } | 176 | } |
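To see what the rewrite of ioprio_best() above changes, the standalone sketch below (not kernel code) re-states the fixed logic with the macro values written out (class shift 13, BE class 2, IOPRIO_NORM 4). Before the fix only the class variable was defaulted, so an unset priority kept its raw value 0 and min() handed that bogus value to the merged request; normalising the whole value first makes the same merge come out as BE/4.

    #include <stdio.h>

    /* hand-copied values from include/linux/ioprio.h */
    #define CLASS_SHIFT 13
    #define CLASS_BE    2
    #define NORM        4
    #define PRIO(c, d)  (((c) << CLASS_SHIFT) | (d))
    #define CLASS(p)    ((p) >> CLASS_SHIFT)
    #define VALID(p)    (CLASS(p) != 0)

    static unsigned short best(unsigned short a, unsigned short b)
    {
        if (!VALID(a))                 /* normalise the whole value, */
            a = PRIO(CLASS_BE, NORM);  /* not just the class         */
        if (!VALID(b))
            b = PRIO(CLASS_BE, NORM);
        if (CLASS(a) == CLASS(b))
            return a < b ? a : b;
        return CLASS(a) > CLASS(b) ? b : a;
    }

    int main(void)
    {
        unsigned short unset = 0, be6 = PRIO(CLASS_BE, 6);
        unsigned short merged = best(unset, be6);

        /* prints "class 2, data 4": the unset side now acts as BE/4 */
        printf("class %u, data %u\n",
               CLASS(merged), merged & ((1 << CLASS_SHIFT) - 1));
        return 0;
    }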
175 | 177 | ||
176 | SYSCALL_DEFINE2(ioprio_get, int, which, int, who) | 178 | SYSCALL_DEFINE2(ioprio_get, int, which, int, who) |
177 | { | 179 | { |
178 | struct task_struct *g, *p; | 180 | struct task_struct *g, *p; |
179 | struct user_struct *user; | 181 | struct user_struct *user; |
180 | struct pid *pgrp; | 182 | struct pid *pgrp; |
181 | kuid_t uid; | 183 | kuid_t uid; |
182 | int ret = -ESRCH; | 184 | int ret = -ESRCH; |
183 | int tmpio; | 185 | int tmpio; |
184 | 186 | ||
185 | rcu_read_lock(); | 187 | rcu_read_lock(); |
186 | switch (which) { | 188 | switch (which) { |
187 | case IOPRIO_WHO_PROCESS: | 189 | case IOPRIO_WHO_PROCESS: |
188 | if (!who) | 190 | if (!who) |
189 | p = current; | 191 | p = current; |
190 | else | 192 | else |
191 | p = find_task_by_vpid(who); | 193 | p = find_task_by_vpid(who); |
192 | if (p) | 194 | if (p) |
193 | ret = get_task_ioprio(p); | 195 | ret = get_task_ioprio(p); |
194 | break; | 196 | break; |
195 | case IOPRIO_WHO_PGRP: | 197 | case IOPRIO_WHO_PGRP: |
196 | if (!who) | 198 | if (!who) |
197 | pgrp = task_pgrp(current); | 199 | pgrp = task_pgrp(current); |
198 | else | 200 | else |
199 | pgrp = find_vpid(who); | 201 | pgrp = find_vpid(who); |
200 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { | 202 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { |
201 | tmpio = get_task_ioprio(p); | 203 | tmpio = get_task_ioprio(p); |
202 | if (tmpio < 0) | 204 | if (tmpio < 0) |
203 | continue; | 205 | continue; |
204 | if (ret == -ESRCH) | 206 | if (ret == -ESRCH) |
205 | ret = tmpio; | 207 | ret = tmpio; |
206 | else | 208 | else |
207 | ret = ioprio_best(ret, tmpio); | 209 | ret = ioprio_best(ret, tmpio); |
208 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | 210 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
209 | break; | 211 | break; |
210 | case IOPRIO_WHO_USER: | 212 | case IOPRIO_WHO_USER: |
211 | uid = make_kuid(current_user_ns(), who); | 213 | uid = make_kuid(current_user_ns(), who); |
212 | if (!who) | 214 | if (!who) |
213 | user = current_user(); | 215 | user = current_user(); |
214 | else | 216 | else |
215 | user = find_user(uid); | 217 | user = find_user(uid); |
216 | 218 | ||
217 | if (!user) | 219 | if (!user) |
218 | break; | 220 | break; |
219 | 221 | ||
220 | do_each_thread(g, p) { | 222 | do_each_thread(g, p) { |
221 | if (!uid_eq(task_uid(p), user->uid)) | 223 | if (!uid_eq(task_uid(p), user->uid)) |
222 | continue; | 224 | continue; |
223 | tmpio = get_task_ioprio(p); | 225 | tmpio = get_task_ioprio(p); |
224 | if (tmpio < 0) | 226 | if (tmpio < 0) |
225 | continue; | 227 | continue; |
226 | if (ret == -ESRCH) | 228 | if (ret == -ESRCH) |
227 | ret = tmpio; | 229 | ret = tmpio; |
228 | else | 230 | else |
229 | ret = ioprio_best(ret, tmpio); | 231 | ret = ioprio_best(ret, tmpio); |
230 | } while_each_thread(g, p); | 232 | } while_each_thread(g, p); |
231 | 233 | ||
232 | if (who) | 234 | if (who) |
233 | free_uid(user); | 235 | free_uid(user); |
234 | break; | 236 | break; |
235 | default: | 237 | default: |
236 | ret = -EINVAL; | 238 | ret = -EINVAL; |
237 | } | 239 | } |
238 | 240 | ||
239 | rcu_read_unlock(); | 241 | rcu_read_unlock(); |
240 | return ret; | 242 | return ret; |
241 | } | 243 | } |
242 | 244 |
block/scsi_ioctl.c
1 | /* | 1 | /* |
2 | * Copyright (C) 2001 Jens Axboe <axboe@suse.de> | 2 | * Copyright (C) 2001 Jens Axboe <axboe@suse.de> |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify | 4 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License version 2 as | 5 | * it under the terms of the GNU General Public License version 2 as |
6 | * published by the Free Software Foundation. | 6 | * published by the Free Software Foundation. |
7 | * | 7 | * |
8 | * This program is distributed in the hope that it will be useful, | 8 | * This program is distributed in the hope that it will be useful, |
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | * | 10 | * |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | * GNU General Public License for more details. | 12 | * GNU General Public License for more details. |
13 | * | 13 | * |
14 | * You should have received a copy of the GNU General Public License | 14 | * You should have received a copy of the GNU General Public License |
15 | * along with this program; if not, write to the Free Software | 15 | * along with this program; if not, write to the Free Software |
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- | 16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- |
17 | * | 17 | * |
18 | */ | 18 | */ |
19 | #include <linux/kernel.h> | 19 | #include <linux/kernel.h> |
20 | #include <linux/errno.h> | 20 | #include <linux/errno.h> |
21 | #include <linux/string.h> | 21 | #include <linux/string.h> |
22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
23 | #include <linux/blkdev.h> | 23 | #include <linux/blkdev.h> |
24 | #include <linux/capability.h> | 24 | #include <linux/capability.h> |
25 | #include <linux/completion.h> | 25 | #include <linux/completion.h> |
26 | #include <linux/cdrom.h> | 26 | #include <linux/cdrom.h> |
27 | #include <linux/ratelimit.h> | 27 | #include <linux/ratelimit.h> |
28 | #include <linux/slab.h> | 28 | #include <linux/slab.h> |
29 | #include <linux/times.h> | 29 | #include <linux/times.h> |
30 | #include <linux/uio.h> | 30 | #include <linux/uio.h> |
31 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
32 | 32 | ||
33 | #include <scsi/scsi.h> | 33 | #include <scsi/scsi.h> |
34 | #include <scsi/scsi_ioctl.h> | 34 | #include <scsi/scsi_ioctl.h> |
35 | #include <scsi/scsi_cmnd.h> | 35 | #include <scsi/scsi_cmnd.h> |
36 | 36 | ||
37 | struct blk_cmd_filter { | 37 | struct blk_cmd_filter { |
38 | unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; | 38 | unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; |
39 | unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; | 39 | unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; |
40 | }; | 40 | }; |
41 | 41 | ||
42 | static struct blk_cmd_filter blk_default_cmd_filter; | 42 | static struct blk_cmd_filter blk_default_cmd_filter; |
43 | 43 | ||
44 | /* Command group 3 is reserved and should never be used. */ | 44 | /* Command group 3 is reserved and should never be used. */ |
45 | const unsigned char scsi_command_size_tbl[8] = | 45 | const unsigned char scsi_command_size_tbl[8] = |
46 | { | 46 | { |
47 | 6, 10, 10, 12, | 47 | 6, 10, 10, 12, |
48 | 16, 12, 10, 10 | 48 | 16, 12, 10, 10 |
49 | }; | 49 | }; |
50 | EXPORT_SYMBOL(scsi_command_size_tbl); | 50 | EXPORT_SYMBOL(scsi_command_size_tbl); |
51 | 51 | ||
52 | #include <scsi/sg.h> | 52 | #include <scsi/sg.h> |
53 | 53 | ||
54 | static int sg_get_version(int __user *p) | 54 | static int sg_get_version(int __user *p) |
55 | { | 55 | { |
56 | static const int sg_version_num = 30527; | 56 | static const int sg_version_num = 30527; |
57 | return put_user(sg_version_num, p); | 57 | return put_user(sg_version_num, p); |
58 | } | 58 | } |
59 | 59 | ||
60 | static int scsi_get_idlun(struct request_queue *q, int __user *p) | 60 | static int scsi_get_idlun(struct request_queue *q, int __user *p) |
61 | { | 61 | { |
62 | return put_user(0, p); | 62 | return put_user(0, p); |
63 | } | 63 | } |
64 | 64 | ||
65 | static int scsi_get_bus(struct request_queue *q, int __user *p) | 65 | static int scsi_get_bus(struct request_queue *q, int __user *p) |
66 | { | 66 | { |
67 | return put_user(0, p); | 67 | return put_user(0, p); |
68 | } | 68 | } |
69 | 69 | ||
70 | static int sg_get_timeout(struct request_queue *q) | 70 | static int sg_get_timeout(struct request_queue *q) |
71 | { | 71 | { |
72 | return jiffies_to_clock_t(q->sg_timeout); | 72 | return jiffies_to_clock_t(q->sg_timeout); |
73 | } | 73 | } |
74 | 74 | ||
75 | static int sg_set_timeout(struct request_queue *q, int __user *p) | 75 | static int sg_set_timeout(struct request_queue *q, int __user *p) |
76 | { | 76 | { |
77 | int timeout, err = get_user(timeout, p); | 77 | int timeout, err = get_user(timeout, p); |
78 | 78 | ||
79 | if (!err) | 79 | if (!err) |
80 | q->sg_timeout = clock_t_to_jiffies(timeout); | 80 | q->sg_timeout = clock_t_to_jiffies(timeout); |
81 | 81 | ||
82 | return err; | 82 | return err; |
83 | } | 83 | } |
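Note that sg_get_timeout()/sg_set_timeout() above exchange the value in clock_t ticks (hence the clock_t_to_jiffies()/jiffies_to_clock_t() conversions), so a caller scales by sysconf(_SC_CLK_TCK). A minimal sketch, not part of this diff, with /dev/sg0 as a placeholder device node:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <scsi/sg.h>            /* SG_SET_TIMEOUT, SG_GET_TIMEOUT */

    int main(void)
    {
        int fd = open("/dev/sg0", O_RDONLY);    /* placeholder device node */
        if (fd < 0) { perror("open"); return 1; }

        /* 60 seconds, expressed in clock ticks as the code above expects */
        int timeout = 60 * sysconf(_SC_CLK_TCK);
        if (ioctl(fd, SG_SET_TIMEOUT, &timeout) < 0)
            perror("SG_SET_TIMEOUT");

        /* SG_GET_TIMEOUT reports the value via the ioctl return code */
        printf("timeout: %d ticks\n", ioctl(fd, SG_GET_TIMEOUT, NULL));
        close(fd);
        return 0;
    }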
84 | 84 | ||
85 | static int max_sectors_bytes(struct request_queue *q) | 85 | static int max_sectors_bytes(struct request_queue *q) |
86 | { | 86 | { |
87 | unsigned int max_sectors = queue_max_sectors(q); | 87 | unsigned int max_sectors = queue_max_sectors(q); |
88 | 88 | ||
89 | max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9); | 89 | max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9); |
90 | 90 | ||
91 | return max_sectors << 9; | 91 | return max_sectors << 9; |
92 | } | 92 | } |
93 | 93 | ||
94 | static int sg_get_reserved_size(struct request_queue *q, int __user *p) | 94 | static int sg_get_reserved_size(struct request_queue *q, int __user *p) |
95 | { | 95 | { |
96 | int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q)); | 96 | int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q)); |
97 | 97 | ||
98 | return put_user(val, p); | 98 | return put_user(val, p); |
99 | } | 99 | } |
100 | 100 | ||
101 | static int sg_set_reserved_size(struct request_queue *q, int __user *p) | 101 | static int sg_set_reserved_size(struct request_queue *q, int __user *p) |
102 | { | 102 | { |
103 | int size, err = get_user(size, p); | 103 | int size, err = get_user(size, p); |
104 | 104 | ||
105 | if (err) | 105 | if (err) |
106 | return err; | 106 | return err; |
107 | 107 | ||
108 | if (size < 0) | 108 | if (size < 0) |
109 | return -EINVAL; | 109 | return -EINVAL; |
110 | 110 | ||
111 | q->sg_reserved_size = min(size, max_sectors_bytes(q)); | 111 | q->sg_reserved_size = min(size, max_sectors_bytes(q)); |
112 | return 0; | 112 | return 0; |
113 | } | 113 | } |
114 | 114 | ||
115 | /* | 115 | /* |
116 | * will always return that we are ATAPI even for a real SCSI drive, I'm not | 116 | * will always return that we are ATAPI even for a real SCSI drive, I'm not |
117 | * so sure this is worth doing anything about (why would you care??) | 117 | * so sure this is worth doing anything about (why would you care??) |
118 | */ | 118 | */ |
119 | static int sg_emulated_host(struct request_queue *q, int __user *p) | 119 | static int sg_emulated_host(struct request_queue *q, int __user *p) |
120 | { | 120 | { |
121 | return put_user(1, p); | 121 | return put_user(1, p); |
122 | } | 122 | } |
123 | 123 | ||
124 | static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) | 124 | static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) |
125 | { | 125 | { |
126 | /* Basic read-only commands */ | 126 | /* Basic read-only commands */ |
127 | __set_bit(TEST_UNIT_READY, filter->read_ok); | 127 | __set_bit(TEST_UNIT_READY, filter->read_ok); |
128 | __set_bit(REQUEST_SENSE, filter->read_ok); | 128 | __set_bit(REQUEST_SENSE, filter->read_ok); |
129 | __set_bit(READ_6, filter->read_ok); | 129 | __set_bit(READ_6, filter->read_ok); |
130 | __set_bit(READ_10, filter->read_ok); | 130 | __set_bit(READ_10, filter->read_ok); |
131 | __set_bit(READ_12, filter->read_ok); | 131 | __set_bit(READ_12, filter->read_ok); |
132 | __set_bit(READ_16, filter->read_ok); | 132 | __set_bit(READ_16, filter->read_ok); |
133 | __set_bit(READ_BUFFER, filter->read_ok); | 133 | __set_bit(READ_BUFFER, filter->read_ok); |
134 | __set_bit(READ_DEFECT_DATA, filter->read_ok); | 134 | __set_bit(READ_DEFECT_DATA, filter->read_ok); |
135 | __set_bit(READ_CAPACITY, filter->read_ok); | 135 | __set_bit(READ_CAPACITY, filter->read_ok); |
136 | __set_bit(READ_LONG, filter->read_ok); | 136 | __set_bit(READ_LONG, filter->read_ok); |
137 | __set_bit(INQUIRY, filter->read_ok); | 137 | __set_bit(INQUIRY, filter->read_ok); |
138 | __set_bit(MODE_SENSE, filter->read_ok); | 138 | __set_bit(MODE_SENSE, filter->read_ok); |
139 | __set_bit(MODE_SENSE_10, filter->read_ok); | 139 | __set_bit(MODE_SENSE_10, filter->read_ok); |
140 | __set_bit(LOG_SENSE, filter->read_ok); | 140 | __set_bit(LOG_SENSE, filter->read_ok); |
141 | __set_bit(START_STOP, filter->read_ok); | 141 | __set_bit(START_STOP, filter->read_ok); |
142 | __set_bit(GPCMD_VERIFY_10, filter->read_ok); | 142 | __set_bit(GPCMD_VERIFY_10, filter->read_ok); |
143 | __set_bit(VERIFY_16, filter->read_ok); | 143 | __set_bit(VERIFY_16, filter->read_ok); |
144 | __set_bit(REPORT_LUNS, filter->read_ok); | 144 | __set_bit(REPORT_LUNS, filter->read_ok); |
145 | __set_bit(SERVICE_ACTION_IN, filter->read_ok); | 145 | __set_bit(SERVICE_ACTION_IN, filter->read_ok); |
146 | __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok); | 146 | __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok); |
147 | __set_bit(MAINTENANCE_IN, filter->read_ok); | 147 | __set_bit(MAINTENANCE_IN, filter->read_ok); |
148 | __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); | 148 | __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); |
149 | 149 | ||
150 | /* Audio CD commands */ | 150 | /* Audio CD commands */ |
151 | __set_bit(GPCMD_PLAY_CD, filter->read_ok); | 151 | __set_bit(GPCMD_PLAY_CD, filter->read_ok); |
152 | __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok); | 152 | __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok); |
153 | __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok); | 153 | __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok); |
154 | __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok); | 154 | __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok); |
155 | __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok); | 155 | __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok); |
156 | 156 | ||
157 | /* CD/DVD data reading */ | 157 | /* CD/DVD data reading */ |
158 | __set_bit(GPCMD_READ_CD, filter->read_ok); | 158 | __set_bit(GPCMD_READ_CD, filter->read_ok); |
159 | __set_bit(GPCMD_READ_CD_MSF, filter->read_ok); | 159 | __set_bit(GPCMD_READ_CD_MSF, filter->read_ok); |
160 | __set_bit(GPCMD_READ_DISC_INFO, filter->read_ok); | 160 | __set_bit(GPCMD_READ_DISC_INFO, filter->read_ok); |
161 | __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok); | 161 | __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok); |
162 | __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok); | 162 | __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok); |
163 | __set_bit(GPCMD_READ_HEADER, filter->read_ok); | 163 | __set_bit(GPCMD_READ_HEADER, filter->read_ok); |
164 | __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok); | 164 | __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok); |
165 | __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok); | 165 | __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok); |
166 | __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok); | 166 | __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok); |
167 | __set_bit(GPCMD_REPORT_KEY, filter->read_ok); | 167 | __set_bit(GPCMD_REPORT_KEY, filter->read_ok); |
168 | __set_bit(GPCMD_SCAN, filter->read_ok); | 168 | __set_bit(GPCMD_SCAN, filter->read_ok); |
169 | __set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok); | 169 | __set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok); |
170 | __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok); | 170 | __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok); |
171 | __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok); | 171 | __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok); |
172 | __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok); | 172 | __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok); |
173 | __set_bit(GPCMD_SEEK, filter->read_ok); | 173 | __set_bit(GPCMD_SEEK, filter->read_ok); |
174 | __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok); | 174 | __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok); |
175 | 175 | ||
176 | /* Basic writing commands */ | 176 | /* Basic writing commands */ |
177 | __set_bit(WRITE_6, filter->write_ok); | 177 | __set_bit(WRITE_6, filter->write_ok); |
178 | __set_bit(WRITE_10, filter->write_ok); | 178 | __set_bit(WRITE_10, filter->write_ok); |
179 | __set_bit(WRITE_VERIFY, filter->write_ok); | 179 | __set_bit(WRITE_VERIFY, filter->write_ok); |
180 | __set_bit(WRITE_12, filter->write_ok); | 180 | __set_bit(WRITE_12, filter->write_ok); |
181 | __set_bit(WRITE_VERIFY_12, filter->write_ok); | 181 | __set_bit(WRITE_VERIFY_12, filter->write_ok); |
182 | __set_bit(WRITE_16, filter->write_ok); | 182 | __set_bit(WRITE_16, filter->write_ok); |
183 | __set_bit(WRITE_LONG, filter->write_ok); | 183 | __set_bit(WRITE_LONG, filter->write_ok); |
184 | __set_bit(WRITE_LONG_2, filter->write_ok); | 184 | __set_bit(WRITE_LONG_2, filter->write_ok); |
185 | __set_bit(ERASE, filter->write_ok); | 185 | __set_bit(ERASE, filter->write_ok); |
186 | __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok); | 186 | __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok); |
187 | __set_bit(MODE_SELECT, filter->write_ok); | 187 | __set_bit(MODE_SELECT, filter->write_ok); |
188 | __set_bit(LOG_SELECT, filter->write_ok); | 188 | __set_bit(LOG_SELECT, filter->write_ok); |
189 | __set_bit(GPCMD_BLANK, filter->write_ok); | 189 | __set_bit(GPCMD_BLANK, filter->write_ok); |
190 | __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok); | 190 | __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok); |
191 | __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok); | 191 | __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok); |
192 | __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok); | 192 | __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok); |
193 | __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok); | 193 | __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok); |
194 | __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok); | 194 | __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok); |
195 | __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok); | 195 | __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok); |
196 | __set_bit(GPCMD_SEND_EVENT, filter->write_ok); | 196 | __set_bit(GPCMD_SEND_EVENT, filter->write_ok); |
197 | __set_bit(GPCMD_SEND_KEY, filter->write_ok); | 197 | __set_bit(GPCMD_SEND_KEY, filter->write_ok); |
198 | __set_bit(GPCMD_SEND_OPC, filter->write_ok); | 198 | __set_bit(GPCMD_SEND_OPC, filter->write_ok); |
199 | __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok); | 199 | __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok); |
200 | __set_bit(GPCMD_SET_SPEED, filter->write_ok); | 200 | __set_bit(GPCMD_SET_SPEED, filter->write_ok); |
201 | __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok); | 201 | __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok); |
202 | __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok); | 202 | __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok); |
203 | __set_bit(GPCMD_SET_STREAMING, filter->write_ok); | 203 | __set_bit(GPCMD_SET_STREAMING, filter->write_ok); |
204 | __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); | 204 | __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); |
205 | } | 205 | } |
206 | 206 | ||
207 | int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) | 207 | int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) |
208 | { | 208 | { |
209 | struct blk_cmd_filter *filter = &blk_default_cmd_filter; | 209 | struct blk_cmd_filter *filter = &blk_default_cmd_filter; |
210 | 210 | ||
211 | /* root can do any command. */ | 211 | /* root can do any command. */ |
212 | if (capable(CAP_SYS_RAWIO)) | 212 | if (capable(CAP_SYS_RAWIO)) |
213 | return 0; | 213 | return 0; |
214 | 214 | ||
215 | /* Anybody who can open the device can do a read-safe command */ | 215 | /* Anybody who can open the device can do a read-safe command */ |
216 | if (test_bit(cmd[0], filter->read_ok)) | 216 | if (test_bit(cmd[0], filter->read_ok)) |
217 | return 0; | 217 | return 0; |
218 | 218 | ||
219 | /* Write-safe commands require a writable open */ | 219 | /* Write-safe commands require a writable open */ |
220 | if (test_bit(cmd[0], filter->write_ok) && has_write_perm) | 220 | if (test_bit(cmd[0], filter->write_ok) && has_write_perm) |
221 | return 0; | 221 | return 0; |
222 | 222 | ||
223 | return -EPERM; | 223 | return -EPERM; |
224 | } | 224 | } |
225 | EXPORT_SYMBOL(blk_verify_command); | 225 | EXPORT_SYMBOL(blk_verify_command); |
226 | 226 | ||
227 | static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, | 227 | static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, |
228 | struct sg_io_hdr *hdr, fmode_t mode) | 228 | struct sg_io_hdr *hdr, fmode_t mode) |
229 | { | 229 | { |
230 | if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len)) | 230 | if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len)) |
231 | return -EFAULT; | 231 | return -EFAULT; |
232 | if (blk_verify_command(rq->cmd, mode & FMODE_WRITE)) | 232 | if (blk_verify_command(rq->cmd, mode & FMODE_WRITE)) |
233 | return -EPERM; | 233 | return -EPERM; |
234 | 234 | ||
235 | /* | 235 | /* |
236 | * fill in request structure | 236 | * fill in request structure |
237 | */ | 237 | */ |
238 | rq->cmd_len = hdr->cmd_len; | 238 | rq->cmd_len = hdr->cmd_len; |
239 | 239 | ||
240 | rq->timeout = msecs_to_jiffies(hdr->timeout); | 240 | rq->timeout = msecs_to_jiffies(hdr->timeout); |
241 | if (!rq->timeout) | 241 | if (!rq->timeout) |
242 | rq->timeout = q->sg_timeout; | 242 | rq->timeout = q->sg_timeout; |
243 | if (!rq->timeout) | 243 | if (!rq->timeout) |
244 | rq->timeout = BLK_DEFAULT_SG_TIMEOUT; | 244 | rq->timeout = BLK_DEFAULT_SG_TIMEOUT; |
245 | if (rq->timeout < BLK_MIN_SG_TIMEOUT) | 245 | if (rq->timeout < BLK_MIN_SG_TIMEOUT) |
246 | rq->timeout = BLK_MIN_SG_TIMEOUT; | 246 | rq->timeout = BLK_MIN_SG_TIMEOUT; |
247 | 247 | ||
248 | return 0; | 248 | return 0; |
249 | } | 249 | } |
250 | 250 | ||
251 | static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, | 251 | static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, |
252 | struct bio *bio) | 252 | struct bio *bio) |
253 | { | 253 | { |
254 | int r, ret = 0; | 254 | int r, ret = 0; |
255 | 255 | ||
256 | /* | 256 | /* |
257 | * fill in all the output members | 257 | * fill in all the output members |
258 | */ | 258 | */ |
259 | hdr->status = rq->errors & 0xff; | 259 | hdr->status = rq->errors & 0xff; |
260 | hdr->masked_status = status_byte(rq->errors); | 260 | hdr->masked_status = status_byte(rq->errors); |
261 | hdr->msg_status = msg_byte(rq->errors); | 261 | hdr->msg_status = msg_byte(rq->errors); |
262 | hdr->host_status = host_byte(rq->errors); | 262 | hdr->host_status = host_byte(rq->errors); |
263 | hdr->driver_status = driver_byte(rq->errors); | 263 | hdr->driver_status = driver_byte(rq->errors); |
264 | hdr->info = 0; | 264 | hdr->info = 0; |
265 | if (hdr->masked_status || hdr->host_status || hdr->driver_status) | 265 | if (hdr->masked_status || hdr->host_status || hdr->driver_status) |
266 | hdr->info |= SG_INFO_CHECK; | 266 | hdr->info |= SG_INFO_CHECK; |
267 | hdr->resid = rq->resid_len; | 267 | hdr->resid = rq->resid_len; |
268 | hdr->sb_len_wr = 0; | 268 | hdr->sb_len_wr = 0; |
269 | 269 | ||
270 | if (rq->sense_len && hdr->sbp) { | 270 | if (rq->sense_len && hdr->sbp) { |
271 | int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len); | 271 | int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len); |
272 | 272 | ||
273 | if (!copy_to_user(hdr->sbp, rq->sense, len)) | 273 | if (!copy_to_user(hdr->sbp, rq->sense, len)) |
274 | hdr->sb_len_wr = len; | 274 | hdr->sb_len_wr = len; |
275 | else | 275 | else |
276 | ret = -EFAULT; | 276 | ret = -EFAULT; |
277 | } | 277 | } |
278 | 278 | ||
279 | r = blk_rq_unmap_user(bio); | 279 | r = blk_rq_unmap_user(bio); |
280 | if (!ret) | 280 | if (!ret) |
281 | ret = r; | 281 | ret = r; |
282 | 282 | ||
283 | return ret; | 283 | return ret; |
284 | } | 284 | } |
285 | 285 | ||
286 | static int sg_io(struct request_queue *q, struct gendisk *bd_disk, | 286 | static int sg_io(struct request_queue *q, struct gendisk *bd_disk, |
287 | struct sg_io_hdr *hdr, fmode_t mode) | 287 | struct sg_io_hdr *hdr, fmode_t mode) |
288 | { | 288 | { |
289 | unsigned long start_time; | 289 | unsigned long start_time; |
290 | ssize_t ret = 0; | 290 | ssize_t ret = 0; |
291 | int writing = 0; | 291 | int writing = 0; |
292 | int at_head = 0; | 292 | int at_head = 0; |
293 | struct request *rq; | 293 | struct request *rq; |
294 | char sense[SCSI_SENSE_BUFFERSIZE]; | 294 | char sense[SCSI_SENSE_BUFFERSIZE]; |
295 | struct bio *bio; | 295 | struct bio *bio; |
296 | 296 | ||
297 | if (hdr->interface_id != 'S') | 297 | if (hdr->interface_id != 'S') |
298 | return -EINVAL; | 298 | return -EINVAL; |
299 | 299 | ||
300 | if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9)) | 300 | if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9)) |
301 | return -EIO; | 301 | return -EIO; |
302 | 302 | ||
303 | if (hdr->dxfer_len) | 303 | if (hdr->dxfer_len) |
304 | switch (hdr->dxfer_direction) { | 304 | switch (hdr->dxfer_direction) { |
305 | default: | 305 | default: |
306 | return -EINVAL; | 306 | return -EINVAL; |
307 | case SG_DXFER_TO_DEV: | 307 | case SG_DXFER_TO_DEV: |
308 | writing = 1; | 308 | writing = 1; |
309 | break; | 309 | break; |
310 | case SG_DXFER_TO_FROM_DEV: | 310 | case SG_DXFER_TO_FROM_DEV: |
311 | case SG_DXFER_FROM_DEV: | 311 | case SG_DXFER_FROM_DEV: |
312 | break; | 312 | break; |
313 | } | 313 | } |
314 | if (hdr->flags & SG_FLAG_Q_AT_HEAD) | 314 | if (hdr->flags & SG_FLAG_Q_AT_HEAD) |
315 | at_head = 1; | 315 | at_head = 1; |
316 | 316 | ||
317 | ret = -ENOMEM; | 317 | ret = -ENOMEM; |
318 | rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); | 318 | rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); |
319 | if (IS_ERR(rq)) | 319 | if (IS_ERR(rq)) |
320 | return PTR_ERR(rq); | 320 | return PTR_ERR(rq); |
321 | blk_rq_set_block_pc(rq); | 321 | blk_rq_set_block_pc(rq); |
322 | 322 | ||
323 | if (hdr->cmd_len > BLK_MAX_CDB) { | 323 | if (hdr->cmd_len > BLK_MAX_CDB) { |
324 | rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); | 324 | rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); |
325 | if (!rq->cmd) | 325 | if (!rq->cmd) |
326 | goto out_put_request; | 326 | goto out_put_request; |
327 | } | 327 | } |
328 | 328 | ||
329 | ret = -EFAULT; | 329 | ret = -EFAULT; |
330 | if (blk_fill_sghdr_rq(q, rq, hdr, mode)) | 330 | if (blk_fill_sghdr_rq(q, rq, hdr, mode)) |
331 | goto out_free_cdb; | 331 | goto out_free_cdb; |
332 | 332 | ||
333 | ret = 0; | 333 | ret = 0; |
334 | if (hdr->iovec_count) { | 334 | if (hdr->iovec_count) { |
335 | size_t iov_data_len; | 335 | size_t iov_data_len; |
336 | struct iovec *iov = NULL; | 336 | struct iovec *iov = NULL; |
337 | 337 | ||
338 | ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count, | 338 | ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count, |
339 | 0, NULL, &iov); | 339 | 0, NULL, &iov); |
340 | if (ret < 0) { | 340 | if (ret < 0) { |
341 | kfree(iov); | 341 | kfree(iov); |
342 | goto out_free_cdb; | 342 | goto out_free_cdb; |
343 | } | 343 | } |
344 | 344 | ||
345 | iov_data_len = ret; | 345 | iov_data_len = ret; |
346 | ret = 0; | 346 | ret = 0; |
347 | 347 | ||
348 | /* SG_IO howto says that the shorter of the two wins */ | 348 | /* SG_IO howto says that the shorter of the two wins */ |
349 | if (hdr->dxfer_len < iov_data_len) { | 349 | if (hdr->dxfer_len < iov_data_len) { |
350 | hdr->iovec_count = iov_shorten(iov, | 350 | hdr->iovec_count = iov_shorten(iov, |
351 | hdr->iovec_count, | 351 | hdr->iovec_count, |
352 | hdr->dxfer_len); | 352 | hdr->dxfer_len); |
353 | iov_data_len = hdr->dxfer_len; | 353 | iov_data_len = hdr->dxfer_len; |
354 | } | 354 | } |
355 | 355 | ||
356 | ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov, | 356 | ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov, |
357 | hdr->iovec_count, | 357 | hdr->iovec_count, |
358 | iov_data_len, GFP_KERNEL); | 358 | iov_data_len, GFP_KERNEL); |
359 | kfree(iov); | 359 | kfree(iov); |
360 | } else if (hdr->dxfer_len) | 360 | } else if (hdr->dxfer_len) |
361 | ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, | 361 | ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, |
362 | GFP_KERNEL); | 362 | GFP_KERNEL); |
363 | 363 | ||
364 | if (ret) | 364 | if (ret) |
365 | goto out_free_cdb; | 365 | goto out_free_cdb; |
366 | 366 | ||
367 | bio = rq->bio; | 367 | bio = rq->bio; |
368 | memset(sense, 0, sizeof(sense)); | 368 | memset(sense, 0, sizeof(sense)); |
369 | rq->sense = sense; | 369 | rq->sense = sense; |
370 | rq->sense_len = 0; | 370 | rq->sense_len = 0; |
371 | rq->retries = 0; | 371 | rq->retries = 0; |
372 | 372 | ||
373 | start_time = jiffies; | 373 | start_time = jiffies; |
374 | 374 | ||
375 | /* ignore return value. All information is passed back to caller | 375 | /* ignore return value. All information is passed back to caller |
376 | * (if he doesn't check that is his problem). | 376 | * (if he doesn't check that is his problem). |
377 | * N.B. a non-zero SCSI status is _not_ necessarily an error. | 377 | * N.B. a non-zero SCSI status is _not_ necessarily an error. |
378 | */ | 378 | */ |
379 | blk_execute_rq(q, bd_disk, rq, at_head); | 379 | blk_execute_rq(q, bd_disk, rq, at_head); |
380 | 380 | ||
381 | hdr->duration = jiffies_to_msecs(jiffies - start_time); | 381 | hdr->duration = jiffies_to_msecs(jiffies - start_time); |
382 | 382 | ||
383 | ret = blk_complete_sghdr_rq(rq, hdr, bio); | 383 | ret = blk_complete_sghdr_rq(rq, hdr, bio); |
384 | 384 | ||
385 | out_free_cdb: | 385 | out_free_cdb: |
386 | if (rq->cmd != rq->__cmd) | 386 | if (rq->cmd != rq->__cmd) |
387 | kfree(rq->cmd); | 387 | kfree(rq->cmd); |
388 | out_put_request: | 388 | out_put_request: |
389 | blk_put_request(rq); | 389 | blk_put_request(rq); |
390 | return ret; | 390 | return ret; |
391 | } | 391 | } |
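For reference, the SG_IO path handled by sg_io() above is driven from userspace roughly as in the sketch below, which issues a 6-byte INQUIRY and reads the 36-byte standard data back. It is not part of this diff; /dev/sg0 is a placeholder node, and the header fields map directly onto what sg_io() and blk_fill_sghdr_rq() consume.

    #include <stdio.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <scsi/sg.h>

    int main(void)
    {
        unsigned char cdb[6] = { 0x12, 0, 0, 0, 36, 0 };   /* INQUIRY */
        unsigned char buf[36], sense[32];
        struct sg_io_hdr hdr;

        int fd = open("/dev/sg0", O_RDONLY);    /* placeholder device node */
        if (fd < 0) { perror("open"); return 1; }

        memset(&hdr, 0, sizeof(hdr));
        hdr.interface_id    = 'S';              /* checked first by sg_io() */
        hdr.cmd_len         = sizeof(cdb);
        hdr.cmdp            = cdb;
        hdr.dxfer_direction = SG_DXFER_FROM_DEV;
        hdr.dxfer_len       = sizeof(buf);
        hdr.dxferp          = buf;
        hdr.sbp             = sense;
        hdr.mx_sb_len       = sizeof(sense);
        hdr.timeout         = 5000;             /* milliseconds */

        if (ioctl(fd, SG_IO, &hdr) < 0 || (hdr.info & SG_INFO_CHECK))
            fprintf(stderr, "INQUIRY failed (status 0x%x)\n", hdr.status);
        else
            printf("vendor/product: %.24s\n", buf + 8);

        close(fd);
        return 0;
    }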
392 | 392 | ||
393 | /** | 393 | /** |
394 | * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl | 394 | * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl |
395 | * @file: file this ioctl operates on (optional) | 395 | * @file: file this ioctl operates on (optional) |
396 | * @q: request queue to send scsi commands down | 396 | * @q: request queue to send scsi commands down |
397 | * @disk: gendisk to operate on (optional) | 397 | * @disk: gendisk to operate on (optional) |
398 | * @sic: userspace structure describing the command to perform | 398 | * @sic: userspace structure describing the command to perform |
399 | * | 399 | * |
400 | * Send down the scsi command described by @sic to the device below | 400 | * Send down the scsi command described by @sic to the device below |
401 | * the request queue @q. If @file is non-NULL it's used to perform | 401 | * the request queue @q. If @file is non-NULL it's used to perform |
402 | * fine-grained permission checks that allow users to send down | 402 | * fine-grained permission checks that allow users to send down |
403 | * non-destructive SCSI commands. If the caller has a struct gendisk | 403 | * non-destructive SCSI commands. If the caller has a struct gendisk |
404 | * available it should be passed in as @disk to allow the low level | 404 | * available it should be passed in as @disk to allow the low level |
405 | * driver to use the information contained in it. A non-NULL @disk | 405 | * driver to use the information contained in it. A non-NULL @disk |
406 | * is only allowed if the caller knows that the low level driver doesn't | 406 | * is only allowed if the caller knows that the low level driver doesn't |
407 | * need it (e.g. in the scsi subsystem). | 407 | * need it (e.g. in the scsi subsystem). |
408 | * | 408 | * |
409 | * Notes: | 409 | * Notes: |
410 | * - This interface is deprecated - users should use the SG_IO | 410 | * - This interface is deprecated - users should use the SG_IO |
411 | * interface instead, as this is a more flexible approach to | 411 | * interface instead, as this is a more flexible approach to |
412 | * performing SCSI commands on a device. | 412 | * performing SCSI commands on a device. |
413 | * - The SCSI command length is determined by examining the 1st byte | 413 | * - The SCSI command length is determined by examining the 1st byte |
414 | * of the given command. There is no way to override this. | 414 | * of the given command. There is no way to override this. |
415 | * - Data transfers are limited to PAGE_SIZE | 415 | * - Data transfers are limited to PAGE_SIZE |
416 | * - The length (x + y) must be at least OMAX_SB_LEN bytes long to | 416 | * - The length (x + y) must be at least OMAX_SB_LEN bytes long to |
417 | * accommodate the sense buffer when an error occurs. | 417 | * accommodate the sense buffer when an error occurs. |
418 | * The sense buffer is truncated to OMAX_SB_LEN (16) bytes so that | 418 | * The sense buffer is truncated to OMAX_SB_LEN (16) bytes so that |
419 | * old code will not be surprised. | 419 | * old code will not be surprised. |
420 | * - If a Unix error occurs (e.g. ENOMEM) then the user will receive | 420 | * - If a Unix error occurs (e.g. ENOMEM) then the user will receive |
421 | * a negative return and the Unix error code in 'errno'. | 421 | * a negative return and the Unix error code in 'errno'. |
422 | * If the SCSI command succeeds then 0 is returned. | 422 | * If the SCSI command succeeds then 0 is returned. |
423 | * Positive numbers returned are the compacted SCSI error codes (4 | 423 | * Positive numbers returned are the compacted SCSI error codes (4 |
424 | * bytes in one int) where the lowest byte is the SCSI status. | 424 | * bytes in one int) where the lowest byte is the SCSI status. |
425 | */ | 425 | */ |
426 | #define OMAX_SB_LEN 16 /* For backward compatibility */ | 426 | #define OMAX_SB_LEN 16 /* For backward compatibility */ |
427 | int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, | 427 | int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, |
428 | struct scsi_ioctl_command __user *sic) | 428 | struct scsi_ioctl_command __user *sic) |
429 | { | 429 | { |
430 | struct request *rq; | 430 | struct request *rq; |
431 | int err; | 431 | int err; |
432 | unsigned int in_len, out_len, bytes, opcode, cmdlen; | 432 | unsigned int in_len, out_len, bytes, opcode, cmdlen; |
433 | char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE]; | 433 | char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE]; |
434 | 434 | ||
435 | if (!sic) | 435 | if (!sic) |
436 | return -EINVAL; | 436 | return -EINVAL; |
437 | 437 | ||
438 | /* | 438 | /* |
439 | * get in and out lengths, verify they don't exceed a page worth of data | 439 | * get in and out lengths, verify they don't exceed a page worth of data |
440 | */ | 440 | */ |
441 | if (get_user(in_len, &sic->inlen)) | 441 | if (get_user(in_len, &sic->inlen)) |
442 | return -EFAULT; | 442 | return -EFAULT; |
443 | if (get_user(out_len, &sic->outlen)) | 443 | if (get_user(out_len, &sic->outlen)) |
444 | return -EFAULT; | 444 | return -EFAULT; |
445 | if (in_len > PAGE_SIZE || out_len > PAGE_SIZE) | 445 | if (in_len > PAGE_SIZE || out_len > PAGE_SIZE) |
446 | return -EINVAL; | 446 | return -EINVAL; |
447 | if (get_user(opcode, sic->data)) | 447 | if (get_user(opcode, sic->data)) |
448 | return -EFAULT; | 448 | return -EFAULT; |
449 | 449 | ||
450 | bytes = max(in_len, out_len); | 450 | bytes = max(in_len, out_len); |
451 | if (bytes) { | 451 | if (bytes) { |
452 | buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER| __GFP_NOWARN); | 452 | buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER| __GFP_NOWARN); |
453 | if (!buffer) | 453 | if (!buffer) |
454 | return -ENOMEM; | 454 | return -ENOMEM; |
455 | 455 | ||
456 | } | 456 | } |
457 | 457 | ||
458 | rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); | 458 | rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); |
459 | if (IS_ERR(rq)) { | 459 | if (IS_ERR(rq)) { |
460 | err = PTR_ERR(rq); | 460 | err = PTR_ERR(rq); |
461 | goto error; | 461 | goto error_free_buffer; |
462 | } | 462 | } |
463 | blk_rq_set_block_pc(rq); | 463 | blk_rq_set_block_pc(rq); |
464 | 464 | ||
465 | cmdlen = COMMAND_SIZE(opcode); | 465 | cmdlen = COMMAND_SIZE(opcode); |
466 | 466 | ||
467 | /* | 467 | /* |
468 | * get command and data to send to device, if any | 468 | * get command and data to send to device, if any |
469 | */ | 469 | */ |
470 | err = -EFAULT; | 470 | err = -EFAULT; |
471 | rq->cmd_len = cmdlen; | 471 | rq->cmd_len = cmdlen; |
472 | if (copy_from_user(rq->cmd, sic->data, cmdlen)) | 472 | if (copy_from_user(rq->cmd, sic->data, cmdlen)) |
473 | goto error; | 473 | goto error; |
474 | 474 | ||
475 | if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) | 475 | if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) |
476 | goto error; | 476 | goto error; |
477 | 477 | ||
478 | err = blk_verify_command(rq->cmd, mode & FMODE_WRITE); | 478 | err = blk_verify_command(rq->cmd, mode & FMODE_WRITE); |
479 | if (err) | 479 | if (err) |
480 | goto error; | 480 | goto error; |
481 | 481 | ||
482 | /* default, possibly overridden later */ | 482 | /* default, possibly overridden later */ |
483 | rq->retries = 5; | 483 | rq->retries = 5; |
484 | 484 | ||
485 | switch (opcode) { | 485 | switch (opcode) { |
486 | case SEND_DIAGNOSTIC: | 486 | case SEND_DIAGNOSTIC: |
487 | case FORMAT_UNIT: | 487 | case FORMAT_UNIT: |
488 | rq->timeout = FORMAT_UNIT_TIMEOUT; | 488 | rq->timeout = FORMAT_UNIT_TIMEOUT; |
489 | rq->retries = 1; | 489 | rq->retries = 1; |
490 | break; | 490 | break; |
491 | case START_STOP: | 491 | case START_STOP: |
492 | rq->timeout = START_STOP_TIMEOUT; | 492 | rq->timeout = START_STOP_TIMEOUT; |
493 | break; | 493 | break; |
494 | case MOVE_MEDIUM: | 494 | case MOVE_MEDIUM: |
495 | rq->timeout = MOVE_MEDIUM_TIMEOUT; | 495 | rq->timeout = MOVE_MEDIUM_TIMEOUT; |
496 | break; | 496 | break; |
497 | case READ_ELEMENT_STATUS: | 497 | case READ_ELEMENT_STATUS: |
498 | rq->timeout = READ_ELEMENT_STATUS_TIMEOUT; | 498 | rq->timeout = READ_ELEMENT_STATUS_TIMEOUT; |
499 | break; | 499 | break; |
500 | case READ_DEFECT_DATA: | 500 | case READ_DEFECT_DATA: |
501 | rq->timeout = READ_DEFECT_DATA_TIMEOUT; | 501 | rq->timeout = READ_DEFECT_DATA_TIMEOUT; |
502 | rq->retries = 1; | 502 | rq->retries = 1; |
503 | break; | 503 | break; |
504 | default: | 504 | default: |
505 | rq->timeout = BLK_DEFAULT_SG_TIMEOUT; | 505 | rq->timeout = BLK_DEFAULT_SG_TIMEOUT; |
506 | break; | 506 | break; |
507 | } | 507 | } |
508 | 508 | ||
509 | if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) { | 509 | if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) { |
510 | err = DRIVER_ERROR << 24; | 510 | err = DRIVER_ERROR << 24; |
511 | goto error; | 511 | goto error; |
512 | } | 512 | } |
513 | 513 | ||
514 | memset(sense, 0, sizeof(sense)); | 514 | memset(sense, 0, sizeof(sense)); |
515 | rq->sense = sense; | 515 | rq->sense = sense; |
516 | rq->sense_len = 0; | 516 | rq->sense_len = 0; |
517 | 517 | ||
518 | blk_execute_rq(q, disk, rq, 0); | 518 | blk_execute_rq(q, disk, rq, 0); |
519 | 519 | ||
520 | err = rq->errors & 0xff; /* only 8 bit SCSI status */ | 520 | err = rq->errors & 0xff; /* only 8 bit SCSI status */ |
521 | if (err) { | 521 | if (err) { |
522 | if (rq->sense_len && rq->sense) { | 522 | if (rq->sense_len && rq->sense) { |
523 | bytes = (OMAX_SB_LEN > rq->sense_len) ? | 523 | bytes = (OMAX_SB_LEN > rq->sense_len) ? |
524 | rq->sense_len : OMAX_SB_LEN; | 524 | rq->sense_len : OMAX_SB_LEN; |
525 | if (copy_to_user(sic->data, rq->sense, bytes)) | 525 | if (copy_to_user(sic->data, rq->sense, bytes)) |
526 | err = -EFAULT; | 526 | err = -EFAULT; |
527 | } | 527 | } |
528 | } else { | 528 | } else { |
529 | if (copy_to_user(sic->data, buffer, out_len)) | 529 | if (copy_to_user(sic->data, buffer, out_len)) |
530 | err = -EFAULT; | 530 | err = -EFAULT; |
531 | } | 531 | } |
532 | 532 | ||
533 | error: | 533 | error: |
534 | blk_put_request(rq); | ||
535 | |||
536 | error_free_buffer: | ||
534 | kfree(buffer); | 537 | kfree(buffer); |
535 | if (rq) | 538 | |
536 | blk_put_request(rq); | ||
537 | return err; | 539 | return err; |
538 | } | 540 | } |
539 | EXPORT_SYMBOL_GPL(sg_scsi_ioctl); | 541 | EXPORT_SYMBOL_GPL(sg_scsi_ioctl); |
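The buffer layout the deprecated ioctl expects can be read off the code above: two length words followed by the CDB and any write data, with the reply (or, on error, up to 16 bytes of sense data) copied back over the start of the same buffer. A minimal sketch, not part of this diff, assuming SCSI_IOCTL_SEND_COMMAND comes from <scsi/scsi_ioctl.h> and using /dev/sda as a placeholder node:

    #include <stdio.h>
    #include <string.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <scsi/scsi_ioctl.h>    /* SCSI_IOCTL_SEND_COMMAND */

    struct send_cmd {
        unsigned int  inlen;        /* bytes written to the device */
        unsigned int  outlen;       /* bytes expected back */
        unsigned char data[512];    /* CDB, then in-data; reply on return */
    };

    int main(void)
    {
        struct send_cmd sc;
        int fd = open("/dev/sda", O_RDONLY);    /* placeholder device node */
        if (fd < 0) { perror("open"); return 1; }

        memset(&sc, 0, sizeof(sc));
        sc.outlen  = 36;                        /* INQUIRY standard data */
        sc.data[0] = 0x12;                      /* INQUIRY opcode */
        sc.data[4] = 36;                        /* allocation length */

        int err = ioctl(fd, SCSI_IOCTL_SEND_COMMAND, &sc);
        if (err < 0)
            perror("SCSI_IOCTL_SEND_COMMAND");
        else if (err > 0)
            fprintf(stderr, "SCSI status 0x%x, sense in sc.data\n", err & 0xff);
        else
            printf("vendor/product: %.24s\n", sc.data + 8);

        close(fd);
        return 0;
    }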
540 | 542 | ||
541 | /* Send basic block requests */ | 543 | /* Send basic block requests */ |
542 | static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, | 544 | static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, |
543 | int cmd, int data) | 545 | int cmd, int data) |
544 | { | 546 | { |
545 | struct request *rq; | 547 | struct request *rq; |
546 | int err; | 548 | int err; |
547 | 549 | ||
548 | rq = blk_get_request(q, WRITE, __GFP_WAIT); | 550 | rq = blk_get_request(q, WRITE, __GFP_WAIT); |
549 | if (IS_ERR(rq)) | 551 | if (IS_ERR(rq)) |
550 | return PTR_ERR(rq); | 552 | return PTR_ERR(rq); |
551 | blk_rq_set_block_pc(rq); | 553 | blk_rq_set_block_pc(rq); |
552 | rq->timeout = BLK_DEFAULT_SG_TIMEOUT; | 554 | rq->timeout = BLK_DEFAULT_SG_TIMEOUT; |
553 | rq->cmd[0] = cmd; | 555 | rq->cmd[0] = cmd; |
554 | rq->cmd[4] = data; | 556 | rq->cmd[4] = data; |
555 | rq->cmd_len = 6; | 557 | rq->cmd_len = 6; |
556 | err = blk_execute_rq(q, bd_disk, rq, 0); | 558 | err = blk_execute_rq(q, bd_disk, rq, 0); |
557 | blk_put_request(rq); | 559 | blk_put_request(rq); |
558 | 560 | ||
559 | return err; | 561 | return err; |
560 | } | 562 | } |
561 | 563 | ||
562 | static inline int blk_send_start_stop(struct request_queue *q, | 564 | static inline int blk_send_start_stop(struct request_queue *q, |
563 | struct gendisk *bd_disk, int data) | 565 | struct gendisk *bd_disk, int data) |
564 | { | 566 | { |
565 | return __blk_send_generic(q, bd_disk, GPCMD_START_STOP_UNIT, data); | 567 | return __blk_send_generic(q, bd_disk, GPCMD_START_STOP_UNIT, data); |
566 | } | 568 | } |
567 | 569 | ||
568 | int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mode, | 570 | int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mode, |
569 | unsigned int cmd, void __user *arg) | 571 | unsigned int cmd, void __user *arg) |
570 | { | 572 | { |
571 | int err; | 573 | int err; |
572 | 574 | ||
573 | if (!q) | 575 | if (!q) |
574 | return -ENXIO; | 576 | return -ENXIO; |
575 | 577 | ||
576 | switch (cmd) { | 578 | switch (cmd) { |
577 | /* | 579 | /* |
578 | * new sgv3 interface | 580 | * new sgv3 interface |
579 | */ | 581 | */ |
580 | case SG_GET_VERSION_NUM: | 582 | case SG_GET_VERSION_NUM: |
581 | err = sg_get_version(arg); | 583 | err = sg_get_version(arg); |
582 | break; | 584 | break; |
583 | case SCSI_IOCTL_GET_IDLUN: | 585 | case SCSI_IOCTL_GET_IDLUN: |
584 | err = scsi_get_idlun(q, arg); | 586 | err = scsi_get_idlun(q, arg); |
585 | break; | 587 | break; |
586 | case SCSI_IOCTL_GET_BUS_NUMBER: | 588 | case SCSI_IOCTL_GET_BUS_NUMBER: |
587 | err = scsi_get_bus(q, arg); | 589 | err = scsi_get_bus(q, arg); |
588 | break; | 590 | break; |
589 | case SG_SET_TIMEOUT: | 591 | case SG_SET_TIMEOUT: |
590 | err = sg_set_timeout(q, arg); | 592 | err = sg_set_timeout(q, arg); |
591 | break; | 593 | break; |
592 | case SG_GET_TIMEOUT: | 594 | case SG_GET_TIMEOUT: |
593 | err = sg_get_timeout(q); | 595 | err = sg_get_timeout(q); |
594 | break; | 596 | break; |
595 | case SG_GET_RESERVED_SIZE: | 597 | case SG_GET_RESERVED_SIZE: |
596 | err = sg_get_reserved_size(q, arg); | 598 | err = sg_get_reserved_size(q, arg); |
597 | break; | 599 | break; |
598 | case SG_SET_RESERVED_SIZE: | 600 | case SG_SET_RESERVED_SIZE: |
599 | err = sg_set_reserved_size(q, arg); | 601 | err = sg_set_reserved_size(q, arg); |
600 | break; | 602 | break; |
601 | case SG_EMULATED_HOST: | 603 | case SG_EMULATED_HOST: |
602 | err = sg_emulated_host(q, arg); | 604 | err = sg_emulated_host(q, arg); |
603 | break; | 605 | break; |
604 | case SG_IO: { | 606 | case SG_IO: { |
605 | struct sg_io_hdr hdr; | 607 | struct sg_io_hdr hdr; |
606 | 608 | ||
607 | err = -EFAULT; | 609 | err = -EFAULT; |
608 | if (copy_from_user(&hdr, arg, sizeof(hdr))) | 610 | if (copy_from_user(&hdr, arg, sizeof(hdr))) |
609 | break; | 611 | break; |
610 | err = sg_io(q, bd_disk, &hdr, mode); | 612 | err = sg_io(q, bd_disk, &hdr, mode); |
611 | if (err == -EFAULT) | 613 | if (err == -EFAULT) |
612 | break; | 614 | break; |
613 | 615 | ||
614 | if (copy_to_user(arg, &hdr, sizeof(hdr))) | 616 | if (copy_to_user(arg, &hdr, sizeof(hdr))) |
615 | err = -EFAULT; | 617 | err = -EFAULT; |
616 | break; | 618 | break; |
617 | } | 619 | } |
618 | case CDROM_SEND_PACKET: { | 620 | case CDROM_SEND_PACKET: { |
619 | struct cdrom_generic_command cgc; | 621 | struct cdrom_generic_command cgc; |
620 | struct sg_io_hdr hdr; | 622 | struct sg_io_hdr hdr; |
621 | 623 | ||
622 | err = -EFAULT; | 624 | err = -EFAULT; |
623 | if (copy_from_user(&cgc, arg, sizeof(cgc))) | 625 | if (copy_from_user(&cgc, arg, sizeof(cgc))) |
624 | break; | 626 | break; |
625 | cgc.timeout = clock_t_to_jiffies(cgc.timeout); | 627 | cgc.timeout = clock_t_to_jiffies(cgc.timeout); |
626 | memset(&hdr, 0, sizeof(hdr)); | 628 | memset(&hdr, 0, sizeof(hdr)); |
627 | hdr.interface_id = 'S'; | 629 | hdr.interface_id = 'S'; |
628 | hdr.cmd_len = sizeof(cgc.cmd); | 630 | hdr.cmd_len = sizeof(cgc.cmd); |
629 | hdr.dxfer_len = cgc.buflen; | 631 | hdr.dxfer_len = cgc.buflen; |
630 | err = 0; | 632 | err = 0; |
631 | switch (cgc.data_direction) { | 633 | switch (cgc.data_direction) { |
632 | case CGC_DATA_UNKNOWN: | 634 | case CGC_DATA_UNKNOWN: |
633 | hdr.dxfer_direction = SG_DXFER_UNKNOWN; | 635 | hdr.dxfer_direction = SG_DXFER_UNKNOWN; |
634 | break; | 636 | break; |
635 | case CGC_DATA_WRITE: | 637 | case CGC_DATA_WRITE: |
636 | hdr.dxfer_direction = SG_DXFER_TO_DEV; | 638 | hdr.dxfer_direction = SG_DXFER_TO_DEV; |
637 | break; | 639 | break; |
638 | case CGC_DATA_READ: | 640 | case CGC_DATA_READ: |
639 | hdr.dxfer_direction = SG_DXFER_FROM_DEV; | 641 | hdr.dxfer_direction = SG_DXFER_FROM_DEV; |
640 | break; | 642 | break; |
641 | case CGC_DATA_NONE: | 643 | case CGC_DATA_NONE: |
642 | hdr.dxfer_direction = SG_DXFER_NONE; | 644 | hdr.dxfer_direction = SG_DXFER_NONE; |
643 | break; | 645 | break; |
644 | default: | 646 | default: |
645 | err = -EINVAL; | 647 | err = -EINVAL; |
646 | } | 648 | } |
647 | if (err) | 649 | if (err) |
648 | break; | 650 | break; |
649 | 651 | ||
650 | hdr.dxferp = cgc.buffer; | 652 | hdr.dxferp = cgc.buffer; |
651 | hdr.sbp = cgc.sense; | 653 | hdr.sbp = cgc.sense; |
652 | if (hdr.sbp) | 654 | if (hdr.sbp) |
653 | hdr.mx_sb_len = sizeof(struct request_sense); | 655 | hdr.mx_sb_len = sizeof(struct request_sense); |
654 | hdr.timeout = jiffies_to_msecs(cgc.timeout); | 656 | hdr.timeout = jiffies_to_msecs(cgc.timeout); |
655 | hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd; | 657 | hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd; |
656 | hdr.cmd_len = sizeof(cgc.cmd); | 658 | hdr.cmd_len = sizeof(cgc.cmd); |
657 | 659 | ||
658 | err = sg_io(q, bd_disk, &hdr, mode); | 660 | err = sg_io(q, bd_disk, &hdr, mode); |
659 | if (err == -EFAULT) | 661 | if (err == -EFAULT) |
660 | break; | 662 | break; |
661 | 663 | ||
662 | if (hdr.status) | 664 | if (hdr.status) |
663 | err = -EIO; | 665 | err = -EIO; |
664 | 666 | ||
665 | cgc.stat = err; | 667 | cgc.stat = err; |
666 | cgc.buflen = hdr.resid; | 668 | cgc.buflen = hdr.resid; |
667 | if (copy_to_user(arg, &cgc, sizeof(cgc))) | 669 | if (copy_to_user(arg, &cgc, sizeof(cgc))) |
668 | err = -EFAULT; | 670 | err = -EFAULT; |
669 | 671 | ||
670 | break; | 672 | break; |
671 | } | 673 | } |
672 | 674 | ||
673 | /* | 675 | /* |
674 | * old junk scsi send command ioctl | 676 | * old junk scsi send command ioctl |
675 | */ | 677 | */ |
676 | case SCSI_IOCTL_SEND_COMMAND: | 678 | case SCSI_IOCTL_SEND_COMMAND: |
677 | printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm); | 679 | printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm); |
678 | err = -EINVAL; | 680 | err = -EINVAL; |
679 | if (!arg) | 681 | if (!arg) |
680 | break; | 682 | break; |
681 | 683 | ||
682 | err = sg_scsi_ioctl(q, bd_disk, mode, arg); | 684 | err = sg_scsi_ioctl(q, bd_disk, mode, arg); |
683 | break; | 685 | break; |
684 | case CDROMCLOSETRAY: | 686 | case CDROMCLOSETRAY: |
685 | err = blk_send_start_stop(q, bd_disk, 0x03); | 687 | err = blk_send_start_stop(q, bd_disk, 0x03); |
686 | break; | 688 | break; |
687 | case CDROMEJECT: | 689 | case CDROMEJECT: |
688 | err = blk_send_start_stop(q, bd_disk, 0x02); | 690 | err = blk_send_start_stop(q, bd_disk, 0x02); |
689 | break; | 691 | break; |
690 | default: | 692 | default: |
691 | err = -ENOTTY; | 693 | err = -ENOTTY; |
692 | } | 694 | } |
693 | 695 | ||
694 | return err; | 696 | return err; |
695 | } | 697 | } |
696 | EXPORT_SYMBOL(scsi_cmd_ioctl); | 698 | EXPORT_SYMBOL(scsi_cmd_ioctl); |
697 | 699 | ||
698 | int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) | 700 | int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) |
699 | { | 701 | { |
700 | if (bd && bd == bd->bd_contains) | 702 | if (bd && bd == bd->bd_contains) |
701 | return 0; | 703 | return 0; |
702 | 704 | ||
703 | /* Actually none of these is particularly useful on a partition, | 705 | /* Actually none of these is particularly useful on a partition, |
704 | * but they are safe. | 706 | * but they are safe. |
705 | */ | 707 | */ |
706 | switch (cmd) { | 708 | switch (cmd) { |
707 | case SCSI_IOCTL_GET_IDLUN: | 709 | case SCSI_IOCTL_GET_IDLUN: |
708 | case SCSI_IOCTL_GET_BUS_NUMBER: | 710 | case SCSI_IOCTL_GET_BUS_NUMBER: |
709 | case SCSI_IOCTL_GET_PCI: | 711 | case SCSI_IOCTL_GET_PCI: |
710 | case SCSI_IOCTL_PROBE_HOST: | 712 | case SCSI_IOCTL_PROBE_HOST: |
711 | case SG_GET_VERSION_NUM: | 713 | case SG_GET_VERSION_NUM: |
712 | case SG_SET_TIMEOUT: | 714 | case SG_SET_TIMEOUT: |
713 | case SG_GET_TIMEOUT: | 715 | case SG_GET_TIMEOUT: |
714 | case SG_GET_RESERVED_SIZE: | 716 | case SG_GET_RESERVED_SIZE: |
715 | case SG_SET_RESERVED_SIZE: | 717 | case SG_SET_RESERVED_SIZE: |
716 | case SG_EMULATED_HOST: | 718 | case SG_EMULATED_HOST: |
717 | return 0; | 719 | return 0; |
718 | case CDROM_GET_CAPABILITY: | 720 | case CDROM_GET_CAPABILITY: |
719 | /* Keep this until we remove the printk below. udev sends it | 721 | /* Keep this until we remove the printk below. udev sends it |
720 | * and we do not want to spam dmesg about it. CD-ROMs do | 722 | * and we do not want to spam dmesg about it. CD-ROMs do |
721 | * not have partitions, so we get here only for disks. | 723 | * not have partitions, so we get here only for disks. |
722 | */ | 724 | */ |
723 | return -ENOIOCTLCMD; | 725 | return -ENOIOCTLCMD; |
724 | default: | 726 | default: |
725 | break; | 727 | break; |
726 | } | 728 | } |
727 | 729 | ||
728 | if (capable(CAP_SYS_RAWIO)) | 730 | if (capable(CAP_SYS_RAWIO)) |
729 | return 0; | 731 | return 0; |
730 | 732 | ||
731 | /* In particular, rule out all resets and host-specific ioctls. */ | 733 | /* In particular, rule out all resets and host-specific ioctls. */ |
732 | printk_ratelimited(KERN_WARNING | 734 | printk_ratelimited(KERN_WARNING |
733 | "%s: sending ioctl %x to a partition!\n", current->comm, cmd); | 735 | "%s: sending ioctl %x to a partition!\n", current->comm, cmd); |
734 | 736 | ||
735 | return -ENOIOCTLCMD; | 737 | return -ENOIOCTLCMD; |
736 | } | 738 | } |
737 | EXPORT_SYMBOL(scsi_verify_blk_ioctl); | 739 | EXPORT_SYMBOL(scsi_verify_blk_ioctl); |
738 | 740 | ||
739 | int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode, | 741 | int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode, |
740 | unsigned int cmd, void __user *arg) | 742 | unsigned int cmd, void __user *arg) |
741 | { | 743 | { |
742 | int ret; | 744 | int ret; |
743 | 745 | ||
744 | ret = scsi_verify_blk_ioctl(bd, cmd); | 746 | ret = scsi_verify_blk_ioctl(bd, cmd); |
745 | if (ret < 0) | 747 | if (ret < 0) |
746 | return ret; | 748 | return ret; |
747 | 749 | ||
748 | return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg); | 750 | return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg); |
749 | } | 751 | } |
750 | EXPORT_SYMBOL(scsi_cmd_blk_ioctl); | 752 | EXPORT_SYMBOL(scsi_cmd_blk_ioctl); |
751 | 753 | ||
752 | static int __init blk_scsi_ioctl_init(void) | 754 | static int __init blk_scsi_ioctl_init(void) |
753 | { | 755 | { |
754 | blk_set_cmd_filter_defaults(&blk_default_cmd_filter); | 756 | blk_set_cmd_filter_defaults(&blk_default_cmd_filter); |
755 | return 0; | 757 | return 0; |
756 | } | 758 | } |
757 | fs_initcall(blk_scsi_ioctl_init); | 759 | fs_initcall(blk_scsi_ioctl_init); |