Commit 6f0d7a9eb60d70f22d71f00b2c762e255881ab31

Authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block layer fixes from Jens Axboe:
 "Four small fixes that should be merged for the current 3.18-rc series.
  This pull request contains:

   - a minor bugfix for computation of best IO priority given two
     merging requests.  From Jan Kara.

   - the final (final) merge count issue that has been plaguing
     virtio-blk.  From Ming Lei.

   - enable parallel reinit notify for blk-mq queues, to combine the
     cost of an RCU grace period across lots of devices.  From Tejun
     Heo.

   - an error handling fix for the SCSI_IOCTL_SEND_COMMAND ioctl.  From
     Tony Battersby"

* 'for-linus' of git://git.kernel.dk/linux-block:
  block: blk-merge: fix blk_recount_segments()
  scsi: Fix more error handling in SCSI_IOCTL_SEND_COMMAND
  blk-mq: make mq_queue_reinit_notify() freeze queues in parallel
  block: Fix computation of merged request priority

Showing 4 changed files (inline diff)

1 /* 1 /*
2 * Functions related to segment and merge handling 2 * Functions related to segment and merge handling
3 */ 3 */
4 #include <linux/kernel.h> 4 #include <linux/kernel.h>
5 #include <linux/module.h> 5 #include <linux/module.h>
6 #include <linux/bio.h> 6 #include <linux/bio.h>
7 #include <linux/blkdev.h> 7 #include <linux/blkdev.h>
8 #include <linux/scatterlist.h> 8 #include <linux/scatterlist.h>
9 9
10 #include "blk.h" 10 #include "blk.h"
11 11
12 static unsigned int __blk_recalc_rq_segments(struct request_queue *q, 12 static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
13 struct bio *bio, 13 struct bio *bio,
14 bool no_sg_merge) 14 bool no_sg_merge)
15 { 15 {
16 struct bio_vec bv, bvprv = { NULL }; 16 struct bio_vec bv, bvprv = { NULL };
17 int cluster, high, highprv = 1; 17 int cluster, high, highprv = 1;
18 unsigned int seg_size, nr_phys_segs; 18 unsigned int seg_size, nr_phys_segs;
19 struct bio *fbio, *bbio; 19 struct bio *fbio, *bbio;
20 struct bvec_iter iter; 20 struct bvec_iter iter;
21 21
22 if (!bio) 22 if (!bio)
23 return 0; 23 return 0;
24 24
25 /* 25 /*
26 * This should probably be returning 0, but blk_add_request_payload() 26 * This should probably be returning 0, but blk_add_request_payload()
27 * (Christoph!!!!) 27 * (Christoph!!!!)
28 */ 28 */
29 if (bio->bi_rw & REQ_DISCARD) 29 if (bio->bi_rw & REQ_DISCARD)
30 return 1; 30 return 1;
31 31
32 if (bio->bi_rw & REQ_WRITE_SAME) 32 if (bio->bi_rw & REQ_WRITE_SAME)
33 return 1; 33 return 1;
34 34
35 fbio = bio; 35 fbio = bio;
36 cluster = blk_queue_cluster(q); 36 cluster = blk_queue_cluster(q);
37 seg_size = 0; 37 seg_size = 0;
38 nr_phys_segs = 0; 38 nr_phys_segs = 0;
39 high = 0; 39 high = 0;
40 for_each_bio(bio) { 40 for_each_bio(bio) {
41 bio_for_each_segment(bv, bio, iter) { 41 bio_for_each_segment(bv, bio, iter) {
42 /* 42 /*
43 * If SG merging is disabled, each bio vector is 43 * If SG merging is disabled, each bio vector is
44 * a segment 44 * a segment
45 */ 45 */
46 if (no_sg_merge) 46 if (no_sg_merge)
47 goto new_segment; 47 goto new_segment;
48 48
49 /* 49 /*
50 * the trick here is making sure that a high page is 50 * the trick here is making sure that a high page is
51 * never considered part of another segment, since 51 * never considered part of another segment, since
52 * that might change with the bounce page. 52 * that might change with the bounce page.
53 */ 53 */
54 high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q); 54 high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q);
55 if (!high && !highprv && cluster) { 55 if (!high && !highprv && cluster) {
56 if (seg_size + bv.bv_len 56 if (seg_size + bv.bv_len
57 > queue_max_segment_size(q)) 57 > queue_max_segment_size(q))
58 goto new_segment; 58 goto new_segment;
59 if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv)) 59 if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv))
60 goto new_segment; 60 goto new_segment;
61 if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv)) 61 if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))
62 goto new_segment; 62 goto new_segment;
63 63
64 seg_size += bv.bv_len; 64 seg_size += bv.bv_len;
65 bvprv = bv; 65 bvprv = bv;
66 continue; 66 continue;
67 } 67 }
68 new_segment: 68 new_segment:
69 if (nr_phys_segs == 1 && seg_size > 69 if (nr_phys_segs == 1 && seg_size >
70 fbio->bi_seg_front_size) 70 fbio->bi_seg_front_size)
71 fbio->bi_seg_front_size = seg_size; 71 fbio->bi_seg_front_size = seg_size;
72 72
73 nr_phys_segs++; 73 nr_phys_segs++;
74 bvprv = bv; 74 bvprv = bv;
75 seg_size = bv.bv_len; 75 seg_size = bv.bv_len;
76 highprv = high; 76 highprv = high;
77 } 77 }
78 bbio = bio; 78 bbio = bio;
79 } 79 }
80 80
81 if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size) 81 if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size)
82 fbio->bi_seg_front_size = seg_size; 82 fbio->bi_seg_front_size = seg_size;
83 if (seg_size > bbio->bi_seg_back_size) 83 if (seg_size > bbio->bi_seg_back_size)
84 bbio->bi_seg_back_size = seg_size; 84 bbio->bi_seg_back_size = seg_size;
85 85
86 return nr_phys_segs; 86 return nr_phys_segs;
87 } 87 }
88 88
89 void blk_recalc_rq_segments(struct request *rq) 89 void blk_recalc_rq_segments(struct request *rq)
90 { 90 {
91 bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE, 91 bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
92 &rq->q->queue_flags); 92 &rq->q->queue_flags);
93 93
94 rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio, 94 rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio,
95 no_sg_merge); 95 no_sg_merge);
96 } 96 }
97 97
98 void blk_recount_segments(struct request_queue *q, struct bio *bio) 98 void blk_recount_segments(struct request_queue *q, struct bio *bio)
99 { 99 {
-100 bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
-101 &q->queue_flags);
-102 bool merge_not_need = bio->bi_vcnt < queue_max_segments(q);
-103
-104 if (no_sg_merge && !bio_flagged(bio, BIO_CLONED) &&
-105 merge_not_need)
-106 bio->bi_phys_segments = bio->bi_vcnt;
+100 unsigned short seg_cnt;
+101
+102 /* estimate segment number by bi_vcnt for non-cloned bio */
+103 if (bio_flagged(bio, BIO_CLONED))
+104 seg_cnt = bio_segments(bio);
+105 else
+106 seg_cnt = bio->bi_vcnt;
+107
+108 if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
+109 (seg_cnt < queue_max_segments(q)))
+110 bio->bi_phys_segments = seg_cnt;
107 else { 111 else {
108 struct bio *nxt = bio->bi_next; 112 struct bio *nxt = bio->bi_next;
109 113
110 bio->bi_next = NULL; 114 bio->bi_next = NULL;
-111 bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio,
-112 no_sg_merge && merge_not_need);
+115 bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
113 bio->bi_next = nxt; 116 bio->bi_next = nxt;
114 } 117 }
115 118
116 bio->bi_flags |= (1 << BIO_SEG_VALID); 119 bio->bi_flags |= (1 << BIO_SEG_VALID);
117 } 120 }
118 EXPORT_SYMBOL(blk_recount_segments); 121 EXPORT_SYMBOL(blk_recount_segments);
119 122
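
The blk_recount_segments() change above is the "block: blk-merge: fix blk_recount_segments()" entry from the shortlog, i.e. the virtio-blk merge-count fix from Ming Lei named in the pull description: a cloned bio shares its parent's bio_vec array and covers only part of it through its iterator, so bi_vcnt cannot be trusted as its segment count and bio_segments() must be used instead. The standalone sketch below models only that estimate; fake_bio, vec and seg_estimate are hypothetical names, not the kernel's struct bio or API.

    #include <stdio.h>

    struct vec { unsigned int len; };

    struct fake_bio {
            struct vec *bv;         /* bio_vec array, shared with the parent when cloned */
            unsigned int vcnt;      /* entries in that array (like bi_vcnt) */
            unsigned int iter_cnt;  /* entries this bio actually covers via its iterator */
            int cloned;             /* like BIO_CLONED */
    };

    /* mirrors the fixed estimate: trust vcnt only for a non-cloned bio,
     * otherwise count what the iterator covers (like bio_segments()) */
    static unsigned int seg_estimate(const struct fake_bio *bio)
    {
            return bio->cloned ? bio->iter_cnt : bio->vcnt;
    }

    int main(void)
    {
            struct vec pages[4] = { {4096}, {4096}, {4096}, {4096} };
            struct fake_bio parent = { pages, 4, 4, 0 };
            struct fake_bio clone  = { pages, 4, 2, 1 };  /* covers only half the vector */

            printf("parent estimate: %u\n", seg_estimate(&parent)); /* 4 */
            printf("clone estimate:  %u\n", seg_estimate(&clone));  /* 2, not 4 */
            return 0;
    }
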
120 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, 123 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
121 struct bio *nxt) 124 struct bio *nxt)
122 { 125 {
123 struct bio_vec end_bv = { NULL }, nxt_bv; 126 struct bio_vec end_bv = { NULL }, nxt_bv;
124 struct bvec_iter iter; 127 struct bvec_iter iter;
125 128
126 if (!blk_queue_cluster(q)) 129 if (!blk_queue_cluster(q))
127 return 0; 130 return 0;
128 131
129 if (bio->bi_seg_back_size + nxt->bi_seg_front_size > 132 if (bio->bi_seg_back_size + nxt->bi_seg_front_size >
130 queue_max_segment_size(q)) 133 queue_max_segment_size(q))
131 return 0; 134 return 0;
132 135
133 if (!bio_has_data(bio)) 136 if (!bio_has_data(bio))
134 return 1; 137 return 1;
135 138
136 bio_for_each_segment(end_bv, bio, iter) 139 bio_for_each_segment(end_bv, bio, iter)
137 if (end_bv.bv_len == iter.bi_size) 140 if (end_bv.bv_len == iter.bi_size)
138 break; 141 break;
139 142
140 nxt_bv = bio_iovec(nxt); 143 nxt_bv = bio_iovec(nxt);
141 144
142 if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv)) 145 if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv))
143 return 0; 146 return 0;
144 147
145 /* 148 /*
146 * bio and nxt are contiguous in memory; check if the queue allows 149 * bio and nxt are contiguous in memory; check if the queue allows
147 * these two to be merged into one 150 * these two to be merged into one
148 */ 151 */
149 if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv)) 152 if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv))
150 return 1; 153 return 1;
151 154
152 return 0; 155 return 0;
153 } 156 }
154 157
155 static inline void 158 static inline void
156 __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, 159 __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
157 struct scatterlist *sglist, struct bio_vec *bvprv, 160 struct scatterlist *sglist, struct bio_vec *bvprv,
158 struct scatterlist **sg, int *nsegs, int *cluster) 161 struct scatterlist **sg, int *nsegs, int *cluster)
159 { 162 {
160 163
161 int nbytes = bvec->bv_len; 164 int nbytes = bvec->bv_len;
162 165
163 if (*sg && *cluster) { 166 if (*sg && *cluster) {
164 if ((*sg)->length + nbytes > queue_max_segment_size(q)) 167 if ((*sg)->length + nbytes > queue_max_segment_size(q))
165 goto new_segment; 168 goto new_segment;
166 169
167 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) 170 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
168 goto new_segment; 171 goto new_segment;
169 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) 172 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
170 goto new_segment; 173 goto new_segment;
171 174
172 (*sg)->length += nbytes; 175 (*sg)->length += nbytes;
173 } else { 176 } else {
174 new_segment: 177 new_segment:
175 if (!*sg) 178 if (!*sg)
176 *sg = sglist; 179 *sg = sglist;
177 else { 180 else {
178 /* 181 /*
179 * If the driver previously mapped a shorter 182 * If the driver previously mapped a shorter
180 * list, we could see a termination bit 183 * list, we could see a termination bit
181 * prematurely unless it fully inits the sg 184 * prematurely unless it fully inits the sg
182 * table on each mapping. We KNOW that there 185 * table on each mapping. We KNOW that there
183 * must be more entries here or the driver 186 * must be more entries here or the driver
184 * would be buggy, so force clear the 187 * would be buggy, so force clear the
185 * termination bit to avoid doing a full 188 * termination bit to avoid doing a full
186 * sg_init_table() in drivers for each command. 189 * sg_init_table() in drivers for each command.
187 */ 190 */
188 sg_unmark_end(*sg); 191 sg_unmark_end(*sg);
189 *sg = sg_next(*sg); 192 *sg = sg_next(*sg);
190 } 193 }
191 194
192 sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset); 195 sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
193 (*nsegs)++; 196 (*nsegs)++;
194 } 197 }
195 *bvprv = *bvec; 198 *bvprv = *bvec;
196 } 199 }
197 200
198 static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, 201 static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
199 struct scatterlist *sglist, 202 struct scatterlist *sglist,
200 struct scatterlist **sg) 203 struct scatterlist **sg)
201 { 204 {
202 struct bio_vec bvec, bvprv = { NULL }; 205 struct bio_vec bvec, bvprv = { NULL };
203 struct bvec_iter iter; 206 struct bvec_iter iter;
204 int nsegs, cluster; 207 int nsegs, cluster;
205 208
206 nsegs = 0; 209 nsegs = 0;
207 cluster = blk_queue_cluster(q); 210 cluster = blk_queue_cluster(q);
208 211
209 if (bio->bi_rw & REQ_DISCARD) { 212 if (bio->bi_rw & REQ_DISCARD) {
210 /* 213 /*
211 * This is a hack - drivers should be neither modifying the 214 * This is a hack - drivers should be neither modifying the
212 * biovec, nor relying on bi_vcnt - but because of 215 * biovec, nor relying on bi_vcnt - but because of
213 * blk_add_request_payload(), a discard bio may or may not have 216 * blk_add_request_payload(), a discard bio may or may not have
214 * a payload we need to set up here (thank you Christoph) and 217 * a payload we need to set up here (thank you Christoph) and
215 * bi_vcnt is really the only way of telling if we need to. 218 * bi_vcnt is really the only way of telling if we need to.
216 */ 219 */
217 220
218 if (bio->bi_vcnt) 221 if (bio->bi_vcnt)
219 goto single_segment; 222 goto single_segment;
220 223
221 return 0; 224 return 0;
222 } 225 }
223 226
224 if (bio->bi_rw & REQ_WRITE_SAME) { 227 if (bio->bi_rw & REQ_WRITE_SAME) {
225 single_segment: 228 single_segment:
226 *sg = sglist; 229 *sg = sglist;
227 bvec = bio_iovec(bio); 230 bvec = bio_iovec(bio);
228 sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); 231 sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
229 return 1; 232 return 1;
230 } 233 }
231 234
232 for_each_bio(bio) 235 for_each_bio(bio)
233 bio_for_each_segment(bvec, bio, iter) 236 bio_for_each_segment(bvec, bio, iter)
234 __blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg, 237 __blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
235 &nsegs, &cluster); 238 &nsegs, &cluster);
236 239
237 return nsegs; 240 return nsegs;
238 } 241 }
239 242
240 /* 243 /*
241 * map a request to scatterlist, return number of sg entries setup. Caller 244 * map a request to scatterlist, return number of sg entries setup. Caller
242 * must make sure sg can hold rq->nr_phys_segments entries 245 * must make sure sg can hold rq->nr_phys_segments entries
243 */ 246 */
244 int blk_rq_map_sg(struct request_queue *q, struct request *rq, 247 int blk_rq_map_sg(struct request_queue *q, struct request *rq,
245 struct scatterlist *sglist) 248 struct scatterlist *sglist)
246 { 249 {
247 struct scatterlist *sg = NULL; 250 struct scatterlist *sg = NULL;
248 int nsegs = 0; 251 int nsegs = 0;
249 252
250 if (rq->bio) 253 if (rq->bio)
251 nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); 254 nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
252 255
253 if (unlikely(rq->cmd_flags & REQ_COPY_USER) && 256 if (unlikely(rq->cmd_flags & REQ_COPY_USER) &&
254 (blk_rq_bytes(rq) & q->dma_pad_mask)) { 257 (blk_rq_bytes(rq) & q->dma_pad_mask)) {
255 unsigned int pad_len = 258 unsigned int pad_len =
256 (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; 259 (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;
257 260
258 sg->length += pad_len; 261 sg->length += pad_len;
259 rq->extra_len += pad_len; 262 rq->extra_len += pad_len;
260 } 263 }
261 264
262 if (q->dma_drain_size && q->dma_drain_needed(rq)) { 265 if (q->dma_drain_size && q->dma_drain_needed(rq)) {
263 if (rq->cmd_flags & REQ_WRITE) 266 if (rq->cmd_flags & REQ_WRITE)
264 memset(q->dma_drain_buffer, 0, q->dma_drain_size); 267 memset(q->dma_drain_buffer, 0, q->dma_drain_size);
265 268
266 sg->page_link &= ~0x02; 269 sg->page_link &= ~0x02;
267 sg = sg_next(sg); 270 sg = sg_next(sg);
268 sg_set_page(sg, virt_to_page(q->dma_drain_buffer), 271 sg_set_page(sg, virt_to_page(q->dma_drain_buffer),
269 q->dma_drain_size, 272 q->dma_drain_size,
270 ((unsigned long)q->dma_drain_buffer) & 273 ((unsigned long)q->dma_drain_buffer) &
271 (PAGE_SIZE - 1)); 274 (PAGE_SIZE - 1));
272 nsegs++; 275 nsegs++;
273 rq->extra_len += q->dma_drain_size; 276 rq->extra_len += q->dma_drain_size;
274 } 277 }
275 278
276 if (sg) 279 if (sg)
277 sg_mark_end(sg); 280 sg_mark_end(sg);
278 281
279 return nsegs; 282 return nsegs;
280 } 283 }
281 EXPORT_SYMBOL(blk_rq_map_sg); 284 EXPORT_SYMBOL(blk_rq_map_sg);
282 285
283 /** 286 /**
284 * blk_bio_map_sg - map a bio to a scatterlist 287 * blk_bio_map_sg - map a bio to a scatterlist
285 * @q: request_queue in question 288 * @q: request_queue in question
286 * @bio: bio being mapped 289 * @bio: bio being mapped
287 * @sglist: scatterlist being mapped 290 * @sglist: scatterlist being mapped
288 * 291 *
289 * Note: 292 * Note:
290 * Caller must make sure sg can hold bio->bi_phys_segments entries 293 * Caller must make sure sg can hold bio->bi_phys_segments entries
291 * 294 *
292 * Will return the number of sg entries setup 295 * Will return the number of sg entries setup
293 */ 296 */
294 int blk_bio_map_sg(struct request_queue *q, struct bio *bio, 297 int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
295 struct scatterlist *sglist) 298 struct scatterlist *sglist)
296 { 299 {
297 struct scatterlist *sg = NULL; 300 struct scatterlist *sg = NULL;
298 int nsegs; 301 int nsegs;
299 struct bio *next = bio->bi_next; 302 struct bio *next = bio->bi_next;
300 bio->bi_next = NULL; 303 bio->bi_next = NULL;
301 304
302 nsegs = __blk_bios_map_sg(q, bio, sglist, &sg); 305 nsegs = __blk_bios_map_sg(q, bio, sglist, &sg);
303 bio->bi_next = next; 306 bio->bi_next = next;
304 if (sg) 307 if (sg)
305 sg_mark_end(sg); 308 sg_mark_end(sg);
306 309
307 BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); 310 BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments);
308 return nsegs; 311 return nsegs;
309 } 312 }
310 EXPORT_SYMBOL(blk_bio_map_sg); 313 EXPORT_SYMBOL(blk_bio_map_sg);
311 314
312 static inline int ll_new_hw_segment(struct request_queue *q, 315 static inline int ll_new_hw_segment(struct request_queue *q,
313 struct request *req, 316 struct request *req,
314 struct bio *bio) 317 struct bio *bio)
315 { 318 {
316 int nr_phys_segs = bio_phys_segments(q, bio); 319 int nr_phys_segs = bio_phys_segments(q, bio);
317 320
318 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) 321 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
319 goto no_merge; 322 goto no_merge;
320 323
321 if (blk_integrity_merge_bio(q, req, bio) == false) 324 if (blk_integrity_merge_bio(q, req, bio) == false)
322 goto no_merge; 325 goto no_merge;
323 326
324 /* 327 /*
325 * This will form the start of a new hw segment. Bump both 328 * This will form the start of a new hw segment. Bump both
326 * counters. 329 * counters.
327 */ 330 */
328 req->nr_phys_segments += nr_phys_segs; 331 req->nr_phys_segments += nr_phys_segs;
329 return 1; 332 return 1;
330 333
331 no_merge: 334 no_merge:
332 req->cmd_flags |= REQ_NOMERGE; 335 req->cmd_flags |= REQ_NOMERGE;
333 if (req == q->last_merge) 336 if (req == q->last_merge)
334 q->last_merge = NULL; 337 q->last_merge = NULL;
335 return 0; 338 return 0;
336 } 339 }
337 340
338 int ll_back_merge_fn(struct request_queue *q, struct request *req, 341 int ll_back_merge_fn(struct request_queue *q, struct request *req,
339 struct bio *bio) 342 struct bio *bio)
340 { 343 {
341 if (blk_rq_sectors(req) + bio_sectors(bio) > 344 if (blk_rq_sectors(req) + bio_sectors(bio) >
342 blk_rq_get_max_sectors(req)) { 345 blk_rq_get_max_sectors(req)) {
343 req->cmd_flags |= REQ_NOMERGE; 346 req->cmd_flags |= REQ_NOMERGE;
344 if (req == q->last_merge) 347 if (req == q->last_merge)
345 q->last_merge = NULL; 348 q->last_merge = NULL;
346 return 0; 349 return 0;
347 } 350 }
348 if (!bio_flagged(req->biotail, BIO_SEG_VALID)) 351 if (!bio_flagged(req->biotail, BIO_SEG_VALID))
349 blk_recount_segments(q, req->biotail); 352 blk_recount_segments(q, req->biotail);
350 if (!bio_flagged(bio, BIO_SEG_VALID)) 353 if (!bio_flagged(bio, BIO_SEG_VALID))
351 blk_recount_segments(q, bio); 354 blk_recount_segments(q, bio);
352 355
353 return ll_new_hw_segment(q, req, bio); 356 return ll_new_hw_segment(q, req, bio);
354 } 357 }
355 358
356 int ll_front_merge_fn(struct request_queue *q, struct request *req, 359 int ll_front_merge_fn(struct request_queue *q, struct request *req,
357 struct bio *bio) 360 struct bio *bio)
358 { 361 {
359 if (blk_rq_sectors(req) + bio_sectors(bio) > 362 if (blk_rq_sectors(req) + bio_sectors(bio) >
360 blk_rq_get_max_sectors(req)) { 363 blk_rq_get_max_sectors(req)) {
361 req->cmd_flags |= REQ_NOMERGE; 364 req->cmd_flags |= REQ_NOMERGE;
362 if (req == q->last_merge) 365 if (req == q->last_merge)
363 q->last_merge = NULL; 366 q->last_merge = NULL;
364 return 0; 367 return 0;
365 } 368 }
366 if (!bio_flagged(bio, BIO_SEG_VALID)) 369 if (!bio_flagged(bio, BIO_SEG_VALID))
367 blk_recount_segments(q, bio); 370 blk_recount_segments(q, bio);
368 if (!bio_flagged(req->bio, BIO_SEG_VALID)) 371 if (!bio_flagged(req->bio, BIO_SEG_VALID))
369 blk_recount_segments(q, req->bio); 372 blk_recount_segments(q, req->bio);
370 373
371 return ll_new_hw_segment(q, req, bio); 374 return ll_new_hw_segment(q, req, bio);
372 } 375 }
373 376
374 /* 377 /*
375 * blk-mq uses req->special to carry normal driver per-request payload, it 378 * blk-mq uses req->special to carry normal driver per-request payload, it
376 * does not indicate a prepared command that we cannot merge with. 379 * does not indicate a prepared command that we cannot merge with.
377 */ 380 */
378 static bool req_no_special_merge(struct request *req) 381 static bool req_no_special_merge(struct request *req)
379 { 382 {
380 struct request_queue *q = req->q; 383 struct request_queue *q = req->q;
381 384
382 return !q->mq_ops && req->special; 385 return !q->mq_ops && req->special;
383 } 386 }
384 387
385 static int ll_merge_requests_fn(struct request_queue *q, struct request *req, 388 static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
386 struct request *next) 389 struct request *next)
387 { 390 {
388 int total_phys_segments; 391 int total_phys_segments;
389 unsigned int seg_size = 392 unsigned int seg_size =
390 req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size; 393 req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size;
391 394
392 /* 395 /*
393 * First check if the either of the requests are re-queued 396 * First check if the either of the requests are re-queued
394 * requests. Can't merge them if they are. 397 * requests. Can't merge them if they are.
395 */ 398 */
396 if (req_no_special_merge(req) || req_no_special_merge(next)) 399 if (req_no_special_merge(req) || req_no_special_merge(next))
397 return 0; 400 return 0;
398 401
399 /* 402 /*
400 * Will it become too large? 403 * Will it become too large?
401 */ 404 */
402 if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > 405 if ((blk_rq_sectors(req) + blk_rq_sectors(next)) >
403 blk_rq_get_max_sectors(req)) 406 blk_rq_get_max_sectors(req))
404 return 0; 407 return 0;
405 408
406 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; 409 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
407 if (blk_phys_contig_segment(q, req->biotail, next->bio)) { 410 if (blk_phys_contig_segment(q, req->biotail, next->bio)) {
408 if (req->nr_phys_segments == 1) 411 if (req->nr_phys_segments == 1)
409 req->bio->bi_seg_front_size = seg_size; 412 req->bio->bi_seg_front_size = seg_size;
410 if (next->nr_phys_segments == 1) 413 if (next->nr_phys_segments == 1)
411 next->biotail->bi_seg_back_size = seg_size; 414 next->biotail->bi_seg_back_size = seg_size;
412 total_phys_segments--; 415 total_phys_segments--;
413 } 416 }
414 417
415 if (total_phys_segments > queue_max_segments(q)) 418 if (total_phys_segments > queue_max_segments(q))
416 return 0; 419 return 0;
417 420
418 if (blk_integrity_merge_rq(q, req, next) == false) 421 if (blk_integrity_merge_rq(q, req, next) == false)
419 return 0; 422 return 0;
420 423
421 /* Merge is OK... */ 424 /* Merge is OK... */
422 req->nr_phys_segments = total_phys_segments; 425 req->nr_phys_segments = total_phys_segments;
423 return 1; 426 return 1;
424 } 427 }
425 428
426 /** 429 /**
427 * blk_rq_set_mixed_merge - mark a request as mixed merge 430 * blk_rq_set_mixed_merge - mark a request as mixed merge
428 * @rq: request to mark as mixed merge 431 * @rq: request to mark as mixed merge
429 * 432 *
430 * Description: 433 * Description:
431 * @rq is about to be mixed merged. Make sure the attributes 434 * @rq is about to be mixed merged. Make sure the attributes
432 * which can be mixed are set in each bio and mark @rq as mixed 435 * which can be mixed are set in each bio and mark @rq as mixed
433 * merged. 436 * merged.
434 */ 437 */
435 void blk_rq_set_mixed_merge(struct request *rq) 438 void blk_rq_set_mixed_merge(struct request *rq)
436 { 439 {
437 unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; 440 unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
438 struct bio *bio; 441 struct bio *bio;
439 442
440 if (rq->cmd_flags & REQ_MIXED_MERGE) 443 if (rq->cmd_flags & REQ_MIXED_MERGE)
441 return; 444 return;
442 445
443 /* 446 /*
444 * @rq will no longer represent mixable attributes for all the 447 * @rq will no longer represent mixable attributes for all the
445 * contained bios. It will just track those of the first one. 448 * contained bios. It will just track those of the first one.
446 * Distributes the attributs to each bio. 449 * Distributes the attributs to each bio.
447 */ 450 */
448 for (bio = rq->bio; bio; bio = bio->bi_next) { 451 for (bio = rq->bio; bio; bio = bio->bi_next) {
449 WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) && 452 WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) &&
450 (bio->bi_rw & REQ_FAILFAST_MASK) != ff); 453 (bio->bi_rw & REQ_FAILFAST_MASK) != ff);
451 bio->bi_rw |= ff; 454 bio->bi_rw |= ff;
452 } 455 }
453 rq->cmd_flags |= REQ_MIXED_MERGE; 456 rq->cmd_flags |= REQ_MIXED_MERGE;
454 } 457 }
455 458
456 static void blk_account_io_merge(struct request *req) 459 static void blk_account_io_merge(struct request *req)
457 { 460 {
458 if (blk_do_io_stat(req)) { 461 if (blk_do_io_stat(req)) {
459 struct hd_struct *part; 462 struct hd_struct *part;
460 int cpu; 463 int cpu;
461 464
462 cpu = part_stat_lock(); 465 cpu = part_stat_lock();
463 part = req->part; 466 part = req->part;
464 467
465 part_round_stats(cpu, part); 468 part_round_stats(cpu, part);
466 part_dec_in_flight(part, rq_data_dir(req)); 469 part_dec_in_flight(part, rq_data_dir(req));
467 470
468 hd_struct_put(part); 471 hd_struct_put(part);
469 part_stat_unlock(); 472 part_stat_unlock();
470 } 473 }
471 } 474 }
472 475
473 /* 476 /*
474 * Has to be called with the request spinlock acquired 477 * Has to be called with the request spinlock acquired
475 */ 478 */
476 static int attempt_merge(struct request_queue *q, struct request *req, 479 static int attempt_merge(struct request_queue *q, struct request *req,
477 struct request *next) 480 struct request *next)
478 { 481 {
479 if (!rq_mergeable(req) || !rq_mergeable(next)) 482 if (!rq_mergeable(req) || !rq_mergeable(next))
480 return 0; 483 return 0;
481 484
482 if (!blk_check_merge_flags(req->cmd_flags, next->cmd_flags)) 485 if (!blk_check_merge_flags(req->cmd_flags, next->cmd_flags))
483 return 0; 486 return 0;
484 487
485 /* 488 /*
486 * not contiguous 489 * not contiguous
487 */ 490 */
488 if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next)) 491 if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next))
489 return 0; 492 return 0;
490 493
491 if (rq_data_dir(req) != rq_data_dir(next) 494 if (rq_data_dir(req) != rq_data_dir(next)
492 || req->rq_disk != next->rq_disk 495 || req->rq_disk != next->rq_disk
493 || req_no_special_merge(next)) 496 || req_no_special_merge(next))
494 return 0; 497 return 0;
495 498
496 if (req->cmd_flags & REQ_WRITE_SAME && 499 if (req->cmd_flags & REQ_WRITE_SAME &&
497 !blk_write_same_mergeable(req->bio, next->bio)) 500 !blk_write_same_mergeable(req->bio, next->bio))
498 return 0; 501 return 0;
499 502
500 /* 503 /*
501 * If we are allowed to merge, then append bio list 504 * If we are allowed to merge, then append bio list
502 * from next to rq and release next. merge_requests_fn 505 * from next to rq and release next. merge_requests_fn
503 * will have updated segment counts, update sector 506 * will have updated segment counts, update sector
504 * counts here. 507 * counts here.
505 */ 508 */
506 if (!ll_merge_requests_fn(q, req, next)) 509 if (!ll_merge_requests_fn(q, req, next))
507 return 0; 510 return 0;
508 511
509 /* 512 /*
510 * If failfast settings disagree or any of the two is already 513 * If failfast settings disagree or any of the two is already
511 * a mixed merge, mark both as mixed before proceeding. This 514 * a mixed merge, mark both as mixed before proceeding. This
512 * makes sure that all involved bios have mixable attributes 515 * makes sure that all involved bios have mixable attributes
513 * set properly. 516 * set properly.
514 */ 517 */
515 if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE || 518 if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE ||
516 (req->cmd_flags & REQ_FAILFAST_MASK) != 519 (req->cmd_flags & REQ_FAILFAST_MASK) !=
517 (next->cmd_flags & REQ_FAILFAST_MASK)) { 520 (next->cmd_flags & REQ_FAILFAST_MASK)) {
518 blk_rq_set_mixed_merge(req); 521 blk_rq_set_mixed_merge(req);
519 blk_rq_set_mixed_merge(next); 522 blk_rq_set_mixed_merge(next);
520 } 523 }
521 524
522 /* 525 /*
523 * At this point we have either done a back merge 526 * At this point we have either done a back merge
524 * or front merge. We need the smaller start_time of 527 * or front merge. We need the smaller start_time of
525 * the merged requests to be the current request 528 * the merged requests to be the current request
526 * for accounting purposes. 529 * for accounting purposes.
527 */ 530 */
528 if (time_after(req->start_time, next->start_time)) 531 if (time_after(req->start_time, next->start_time))
529 req->start_time = next->start_time; 532 req->start_time = next->start_time;
530 533
531 req->biotail->bi_next = next->bio; 534 req->biotail->bi_next = next->bio;
532 req->biotail = next->biotail; 535 req->biotail = next->biotail;
533 536
534 req->__data_len += blk_rq_bytes(next); 537 req->__data_len += blk_rq_bytes(next);
535 538
536 elv_merge_requests(q, req, next); 539 elv_merge_requests(q, req, next);
537 540
538 /* 541 /*
539 * 'next' is going away, so update stats accordingly 542 * 'next' is going away, so update stats accordingly
540 */ 543 */
541 blk_account_io_merge(next); 544 blk_account_io_merge(next);
542 545
543 req->ioprio = ioprio_best(req->ioprio, next->ioprio); 546 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
544 if (blk_rq_cpu_valid(next)) 547 if (blk_rq_cpu_valid(next))
545 req->cpu = next->cpu; 548 req->cpu = next->cpu;
546 549
547 /* owner-ship of bio passed from next to req */ 550 /* owner-ship of bio passed from next to req */
548 next->bio = NULL; 551 next->bio = NULL;
549 __blk_put_request(q, next); 552 __blk_put_request(q, next);
550 return 1; 553 return 1;
551 } 554 }
552 555
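
The call just above, req->ioprio = ioprio_best(req->ioprio, next->ioprio), is where a merged request's priority is computed; the "Fix computation of merged request priority" change from Jan Kara mentioned in the pull description lands in ioprio_best() itself, outside this excerpt. Below is a standalone model of the intended semantics only, with hypothetical names and a made-up priority encoding (not the kernel's IOPRIO values): a request whose priority was never set is treated as the best-effort default before comparing, so it cannot "win" over an explicitly set priority.

    #include <stdio.h>

    /* hypothetical classes, smaller value = stronger class; NONE = never set */
    enum prio_class { CLASS_NONE, CLASS_RT, CLASS_BE, CLASS_IDLE };

    struct prio { enum prio_class class; int level; /* lower level = higher prio */ };

    /* pick the effective priority for a merged request: map an unset priority
     * to the best-effort default first, then compare class, then level */
    static struct prio prio_best(struct prio a, struct prio b)
    {
            const struct prio def = { CLASS_BE, 4 };

            if (a.class == CLASS_NONE)
                    a = def;
            if (b.class == CLASS_NONE)
                    b = def;

            if (a.class != b.class)
                    return a.class < b.class ? a : b;
            return a.level <= b.level ? a : b;
    }

    int main(void)
    {
            struct prio unset = { CLASS_NONE, 0 };
            struct prio rt0   = { CLASS_RT, 0 };
            struct prio best  = prio_best(unset, rt0);

            /* the explicit realtime priority wins over the unset one */
            printf("class %d level %d\n", best.class, best.level);
            return 0;
    }
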
553 int attempt_back_merge(struct request_queue *q, struct request *rq) 556 int attempt_back_merge(struct request_queue *q, struct request *rq)
554 { 557 {
555 struct request *next = elv_latter_request(q, rq); 558 struct request *next = elv_latter_request(q, rq);
556 559
557 if (next) 560 if (next)
558 return attempt_merge(q, rq, next); 561 return attempt_merge(q, rq, next);
559 562
560 return 0; 563 return 0;
561 } 564 }
562 565
563 int attempt_front_merge(struct request_queue *q, struct request *rq) 566 int attempt_front_merge(struct request_queue *q, struct request *rq)
564 { 567 {
565 struct request *prev = elv_former_request(q, rq); 568 struct request *prev = elv_former_request(q, rq);
566 569
567 if (prev) 570 if (prev)
568 return attempt_merge(q, prev, rq); 571 return attempt_merge(q, prev, rq);
569 572
570 return 0; 573 return 0;
571 } 574 }
572 575
573 int blk_attempt_req_merge(struct request_queue *q, struct request *rq, 576 int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
574 struct request *next) 577 struct request *next)
575 { 578 {
576 return attempt_merge(q, rq, next); 579 return attempt_merge(q, rq, next);
577 } 580 }
578 581
579 bool blk_rq_merge_ok(struct request *rq, struct bio *bio) 582 bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
580 { 583 {
581 struct request_queue *q = rq->q; 584 struct request_queue *q = rq->q;
582 585
583 if (!rq_mergeable(rq) || !bio_mergeable(bio)) 586 if (!rq_mergeable(rq) || !bio_mergeable(bio))
584 return false; 587 return false;
585 588
586 if (!blk_check_merge_flags(rq->cmd_flags, bio->bi_rw)) 589 if (!blk_check_merge_flags(rq->cmd_flags, bio->bi_rw))
587 return false; 590 return false;
588 591
589 /* different data direction or already started, don't merge */ 592 /* different data direction or already started, don't merge */
590 if (bio_data_dir(bio) != rq_data_dir(rq)) 593 if (bio_data_dir(bio) != rq_data_dir(rq))
591 return false; 594 return false;
592 595
593 /* must be same device and not a special request */ 596 /* must be same device and not a special request */
594 if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq)) 597 if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq))
595 return false; 598 return false;
596 599
597 /* only merge integrity protected bio into ditto rq */ 600 /* only merge integrity protected bio into ditto rq */
598 if (blk_integrity_merge_bio(rq->q, rq, bio) == false) 601 if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
599 return false; 602 return false;
600 603
601 /* must be using the same buffer */ 604 /* must be using the same buffer */
602 if (rq->cmd_flags & REQ_WRITE_SAME && 605 if (rq->cmd_flags & REQ_WRITE_SAME &&
603 !blk_write_same_mergeable(rq->bio, bio)) 606 !blk_write_same_mergeable(rq->bio, bio))
604 return false; 607 return false;
605 608
606 if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) { 609 if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) {
607 struct bio_vec *bprev; 610 struct bio_vec *bprev;
608 611
609 bprev = &rq->biotail->bi_io_vec[bio->bi_vcnt - 1]; 612 bprev = &rq->biotail->bi_io_vec[bio->bi_vcnt - 1];
610 if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset)) 613 if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset))
611 return false; 614 return false;
612 } 615 }
613 616
614 return true; 617 return true;
615 } 618 }
616 619
617 int blk_try_merge(struct request *rq, struct bio *bio) 620 int blk_try_merge(struct request *rq, struct bio *bio)
618 { 621 {
619 if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector) 622 if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
620 return ELEVATOR_BACK_MERGE; 623 return ELEVATOR_BACK_MERGE;
621 else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector) 624 else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
622 return ELEVATOR_FRONT_MERGE; 625 return ELEVATOR_FRONT_MERGE;
1 /* 1 /*
2 * Block multiqueue core code 2 * Block multiqueue core code
3 * 3 *
4 * Copyright (C) 2013-2014 Jens Axboe 4 * Copyright (C) 2013-2014 Jens Axboe
5 * Copyright (C) 2013-2014 Christoph Hellwig 5 * Copyright (C) 2013-2014 Christoph Hellwig
6 */ 6 */
7 #include <linux/kernel.h> 7 #include <linux/kernel.h>
8 #include <linux/module.h> 8 #include <linux/module.h>
9 #include <linux/backing-dev.h> 9 #include <linux/backing-dev.h>
10 #include <linux/bio.h> 10 #include <linux/bio.h>
11 #include <linux/blkdev.h> 11 #include <linux/blkdev.h>
12 #include <linux/mm.h> 12 #include <linux/mm.h>
13 #include <linux/init.h> 13 #include <linux/init.h>
14 #include <linux/slab.h> 14 #include <linux/slab.h>
15 #include <linux/workqueue.h> 15 #include <linux/workqueue.h>
16 #include <linux/smp.h> 16 #include <linux/smp.h>
17 #include <linux/llist.h> 17 #include <linux/llist.h>
18 #include <linux/list_sort.h> 18 #include <linux/list_sort.h>
19 #include <linux/cpu.h> 19 #include <linux/cpu.h>
20 #include <linux/cache.h> 20 #include <linux/cache.h>
21 #include <linux/sched/sysctl.h> 21 #include <linux/sched/sysctl.h>
22 #include <linux/delay.h> 22 #include <linux/delay.h>
23 #include <linux/crash_dump.h> 23 #include <linux/crash_dump.h>
24 24
25 #include <trace/events/block.h> 25 #include <trace/events/block.h>
26 26
27 #include <linux/blk-mq.h> 27 #include <linux/blk-mq.h>
28 #include "blk.h" 28 #include "blk.h"
29 #include "blk-mq.h" 29 #include "blk-mq.h"
30 #include "blk-mq-tag.h" 30 #include "blk-mq-tag.h"
31 31
32 static DEFINE_MUTEX(all_q_mutex); 32 static DEFINE_MUTEX(all_q_mutex);
33 static LIST_HEAD(all_q_list); 33 static LIST_HEAD(all_q_list);
34 34
35 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); 35 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
36 36
37 /* 37 /*
38 * Check if any of the ctx's have pending work in this hardware queue 38 * Check if any of the ctx's have pending work in this hardware queue
39 */ 39 */
40 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 40 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
41 { 41 {
42 unsigned int i; 42 unsigned int i;
43 43
44 for (i = 0; i < hctx->ctx_map.map_size; i++) 44 for (i = 0; i < hctx->ctx_map.map_size; i++)
45 if (hctx->ctx_map.map[i].word) 45 if (hctx->ctx_map.map[i].word)
46 return true; 46 return true;
47 47
48 return false; 48 return false;
49 } 49 }
50 50
51 static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, 51 static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
52 struct blk_mq_ctx *ctx) 52 struct blk_mq_ctx *ctx)
53 { 53 {
54 return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; 54 return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
55 } 55 }
56 56
57 #define CTX_TO_BIT(hctx, ctx) \ 57 #define CTX_TO_BIT(hctx, ctx) \
58 ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) 58 ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
59 59
60 /* 60 /*
61 * Mark this ctx as having pending work in this hardware queue 61 * Mark this ctx as having pending work in this hardware queue
62 */ 62 */
63 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 63 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
64 struct blk_mq_ctx *ctx) 64 struct blk_mq_ctx *ctx)
65 { 65 {
66 struct blk_align_bitmap *bm = get_bm(hctx, ctx); 66 struct blk_align_bitmap *bm = get_bm(hctx, ctx);
67 67
68 if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) 68 if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
69 set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); 69 set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
70 } 70 }
71 71
72 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, 72 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
73 struct blk_mq_ctx *ctx) 73 struct blk_mq_ctx *ctx)
74 { 74 {
75 struct blk_align_bitmap *bm = get_bm(hctx, ctx); 75 struct blk_align_bitmap *bm = get_bm(hctx, ctx);
76 76
77 clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); 77 clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
78 } 78 }
79 79
80 static int blk_mq_queue_enter(struct request_queue *q) 80 static int blk_mq_queue_enter(struct request_queue *q)
81 { 81 {
82 while (true) { 82 while (true) {
83 int ret; 83 int ret;
84 84
85 if (percpu_ref_tryget_live(&q->mq_usage_counter)) 85 if (percpu_ref_tryget_live(&q->mq_usage_counter))
86 return 0; 86 return 0;
87 87
88 ret = wait_event_interruptible(q->mq_freeze_wq, 88 ret = wait_event_interruptible(q->mq_freeze_wq,
89 !q->mq_freeze_depth || blk_queue_dying(q)); 89 !q->mq_freeze_depth || blk_queue_dying(q));
90 if (blk_queue_dying(q)) 90 if (blk_queue_dying(q))
91 return -ENODEV; 91 return -ENODEV;
92 if (ret) 92 if (ret)
93 return ret; 93 return ret;
94 } 94 }
95 } 95 }
96 96
97 static void blk_mq_queue_exit(struct request_queue *q) 97 static void blk_mq_queue_exit(struct request_queue *q)
98 { 98 {
99 percpu_ref_put(&q->mq_usage_counter); 99 percpu_ref_put(&q->mq_usage_counter);
100 } 100 }
101 101
102 static void blk_mq_usage_counter_release(struct percpu_ref *ref) 102 static void blk_mq_usage_counter_release(struct percpu_ref *ref)
103 { 103 {
104 struct request_queue *q = 104 struct request_queue *q =
105 container_of(ref, struct request_queue, mq_usage_counter); 105 container_of(ref, struct request_queue, mq_usage_counter);
106 106
107 wake_up_all(&q->mq_freeze_wq); 107 wake_up_all(&q->mq_freeze_wq);
108 } 108 }
109 109
-110 /*
-111 * Guarantee no request is in use, so we can change any data structure of
-112 * the queue afterward.
-113 */
-114 void blk_mq_freeze_queue(struct request_queue *q)
+110 static void blk_mq_freeze_queue_start(struct request_queue *q)
115 { 111 {
116 bool freeze; 112 bool freeze;
117 113
118 spin_lock_irq(q->queue_lock); 114 spin_lock_irq(q->queue_lock);
119 freeze = !q->mq_freeze_depth++; 115 freeze = !q->mq_freeze_depth++;
120 spin_unlock_irq(q->queue_lock); 116 spin_unlock_irq(q->queue_lock);
121 117
122 if (freeze) { 118 if (freeze) {
123 percpu_ref_kill(&q->mq_usage_counter); 119 percpu_ref_kill(&q->mq_usage_counter);
124 blk_mq_run_queues(q, false); 120 blk_mq_run_queues(q, false);
125 } 121 }
+122 }
+123
+124 static void blk_mq_freeze_queue_wait(struct request_queue *q)
+125 {
126 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); 126 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
127 } 127 }
128 128
+129 /*
+130 * Guarantee no request is in use, so we can change any data structure of
+131 * the queue afterward.
+132 */
+133 void blk_mq_freeze_queue(struct request_queue *q)
+134 {
+135 blk_mq_freeze_queue_start(q);
+136 blk_mq_freeze_queue_wait(q);
+137 }
+138
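
The hunk above splits blk_mq_freeze_queue() into a cheap start step and an expensive wait step, which is what allows the "freeze queues in parallel" change from Tejun Heo described in the pull request: start every freeze first, then wait for each, so the RCU-grace-period-sized cost is shared across all devices instead of being paid once per queue. The standalone model below uses hypothetical queue and function names, not the kernel API, and only illustrates the loop restructuring.

    #include <stdio.h>

    #define NQUEUES 4

    struct queue { int id; };

    /* cheap part: mark the queue as draining (the kernel kills a percpu ref here) */
    static void freeze_start(struct queue *q)
    {
            printf("queue %d: freeze started\n", q->id);
    }

    /* expensive part: wait for in-flight requests to drain; in the kernel this
     * is where the grace-period-sized wait happens */
    static void freeze_wait(struct queue *q)
    {
            printf("queue %d: freeze finished\n", q->id);
    }

    int main(void)
    {
            struct queue qs[NQUEUES];
            int i;

            for (i = 0; i < NQUEUES; i++)
                    qs[i].id = i;

            /* start every freeze first, then wait for each one: the expensive
             * waits overlap instead of running NQUEUES times in series */
            for (i = 0; i < NQUEUES; i++)
                    freeze_start(&qs[i]);
            for (i = 0; i < NQUEUES; i++)
                    freeze_wait(&qs[i]);

            return 0;
    }
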
129 static void blk_mq_unfreeze_queue(struct request_queue *q) 139 static void blk_mq_unfreeze_queue(struct request_queue *q)
130 { 140 {
131 bool wake; 141 bool wake;
132 142
133 spin_lock_irq(q->queue_lock); 143 spin_lock_irq(q->queue_lock);
134 wake = !--q->mq_freeze_depth; 144 wake = !--q->mq_freeze_depth;
135 WARN_ON_ONCE(q->mq_freeze_depth < 0); 145 WARN_ON_ONCE(q->mq_freeze_depth < 0);
136 spin_unlock_irq(q->queue_lock); 146 spin_unlock_irq(q->queue_lock);
137 if (wake) { 147 if (wake) {
138 percpu_ref_reinit(&q->mq_usage_counter); 148 percpu_ref_reinit(&q->mq_usage_counter);
139 wake_up_all(&q->mq_freeze_wq); 149 wake_up_all(&q->mq_freeze_wq);
140 } 150 }
141 } 151 }
142 152
143 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) 153 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
144 { 154 {
145 return blk_mq_has_free_tags(hctx->tags); 155 return blk_mq_has_free_tags(hctx->tags);
146 } 156 }
147 EXPORT_SYMBOL(blk_mq_can_queue); 157 EXPORT_SYMBOL(blk_mq_can_queue);
148 158
149 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, 159 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
150 struct request *rq, unsigned int rw_flags) 160 struct request *rq, unsigned int rw_flags)
151 { 161 {
152 if (blk_queue_io_stat(q)) 162 if (blk_queue_io_stat(q))
153 rw_flags |= REQ_IO_STAT; 163 rw_flags |= REQ_IO_STAT;
154 164
155 INIT_LIST_HEAD(&rq->queuelist); 165 INIT_LIST_HEAD(&rq->queuelist);
156 /* csd/requeue_work/fifo_time is initialized before use */ 166 /* csd/requeue_work/fifo_time is initialized before use */
157 rq->q = q; 167 rq->q = q;
158 rq->mq_ctx = ctx; 168 rq->mq_ctx = ctx;
159 rq->cmd_flags |= rw_flags; 169 rq->cmd_flags |= rw_flags;
160 /* do not touch atomic flags, it needs atomic ops against the timer */ 170 /* do not touch atomic flags, it needs atomic ops against the timer */
161 rq->cpu = -1; 171 rq->cpu = -1;
162 INIT_HLIST_NODE(&rq->hash); 172 INIT_HLIST_NODE(&rq->hash);
163 RB_CLEAR_NODE(&rq->rb_node); 173 RB_CLEAR_NODE(&rq->rb_node);
164 rq->rq_disk = NULL; 174 rq->rq_disk = NULL;
165 rq->part = NULL; 175 rq->part = NULL;
166 rq->start_time = jiffies; 176 rq->start_time = jiffies;
167 #ifdef CONFIG_BLK_CGROUP 177 #ifdef CONFIG_BLK_CGROUP
168 rq->rl = NULL; 178 rq->rl = NULL;
169 set_start_time_ns(rq); 179 set_start_time_ns(rq);
170 rq->io_start_time_ns = 0; 180 rq->io_start_time_ns = 0;
171 #endif 181 #endif
172 rq->nr_phys_segments = 0; 182 rq->nr_phys_segments = 0;
173 #if defined(CONFIG_BLK_DEV_INTEGRITY) 183 #if defined(CONFIG_BLK_DEV_INTEGRITY)
174 rq->nr_integrity_segments = 0; 184 rq->nr_integrity_segments = 0;
175 #endif 185 #endif
176 rq->special = NULL; 186 rq->special = NULL;
177 /* tag was already set */ 187 /* tag was already set */
178 rq->errors = 0; 188 rq->errors = 0;
179 189
180 rq->cmd = rq->__cmd; 190 rq->cmd = rq->__cmd;
181 191
182 rq->extra_len = 0; 192 rq->extra_len = 0;
183 rq->sense_len = 0; 193 rq->sense_len = 0;
184 rq->resid_len = 0; 194 rq->resid_len = 0;
185 rq->sense = NULL; 195 rq->sense = NULL;
186 196
187 INIT_LIST_HEAD(&rq->timeout_list); 197 INIT_LIST_HEAD(&rq->timeout_list);
188 rq->timeout = 0; 198 rq->timeout = 0;
189 199
190 rq->end_io = NULL; 200 rq->end_io = NULL;
191 rq->end_io_data = NULL; 201 rq->end_io_data = NULL;
192 rq->next_rq = NULL; 202 rq->next_rq = NULL;
193 203
194 ctx->rq_dispatched[rw_is_sync(rw_flags)]++; 204 ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
195 } 205 }
196 206
197 static struct request * 207 static struct request *
198 __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw) 208 __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
199 { 209 {
200 struct request *rq; 210 struct request *rq;
201 unsigned int tag; 211 unsigned int tag;
202 212
203 tag = blk_mq_get_tag(data); 213 tag = blk_mq_get_tag(data);
204 if (tag != BLK_MQ_TAG_FAIL) { 214 if (tag != BLK_MQ_TAG_FAIL) {
205 rq = data->hctx->tags->rqs[tag]; 215 rq = data->hctx->tags->rqs[tag];
206 216
207 if (blk_mq_tag_busy(data->hctx)) { 217 if (blk_mq_tag_busy(data->hctx)) {
208 rq->cmd_flags = REQ_MQ_INFLIGHT; 218 rq->cmd_flags = REQ_MQ_INFLIGHT;
209 atomic_inc(&data->hctx->nr_active); 219 atomic_inc(&data->hctx->nr_active);
210 } 220 }
211 221
212 rq->tag = tag; 222 rq->tag = tag;
213 blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw); 223 blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw);
214 return rq; 224 return rq;
215 } 225 }
216 226
217 return NULL; 227 return NULL;
218 } 228 }
219 229
220 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, 230 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
221 bool reserved) 231 bool reserved)
222 { 232 {
223 struct blk_mq_ctx *ctx; 233 struct blk_mq_ctx *ctx;
224 struct blk_mq_hw_ctx *hctx; 234 struct blk_mq_hw_ctx *hctx;
225 struct request *rq; 235 struct request *rq;
226 struct blk_mq_alloc_data alloc_data; 236 struct blk_mq_alloc_data alloc_data;
227 int ret; 237 int ret;
228 238
229 ret = blk_mq_queue_enter(q); 239 ret = blk_mq_queue_enter(q);
230 if (ret) 240 if (ret)
231 return ERR_PTR(ret); 241 return ERR_PTR(ret);
232 242
233 ctx = blk_mq_get_ctx(q); 243 ctx = blk_mq_get_ctx(q);
234 hctx = q->mq_ops->map_queue(q, ctx->cpu); 244 hctx = q->mq_ops->map_queue(q, ctx->cpu);
235 blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT, 245 blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
236 reserved, ctx, hctx); 246 reserved, ctx, hctx);
237 247
238 rq = __blk_mq_alloc_request(&alloc_data, rw); 248 rq = __blk_mq_alloc_request(&alloc_data, rw);
239 if (!rq && (gfp & __GFP_WAIT)) { 249 if (!rq && (gfp & __GFP_WAIT)) {
240 __blk_mq_run_hw_queue(hctx); 250 __blk_mq_run_hw_queue(hctx);
241 blk_mq_put_ctx(ctx); 251 blk_mq_put_ctx(ctx);
242 252
243 ctx = blk_mq_get_ctx(q); 253 ctx = blk_mq_get_ctx(q);
244 hctx = q->mq_ops->map_queue(q, ctx->cpu); 254 hctx = q->mq_ops->map_queue(q, ctx->cpu);
245 blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx, 255 blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
246 hctx); 256 hctx);
247 rq = __blk_mq_alloc_request(&alloc_data, rw); 257 rq = __blk_mq_alloc_request(&alloc_data, rw);
248 ctx = alloc_data.ctx; 258 ctx = alloc_data.ctx;
249 } 259 }
250 blk_mq_put_ctx(ctx); 260 blk_mq_put_ctx(ctx);
251 if (!rq) 261 if (!rq)
252 return ERR_PTR(-EWOULDBLOCK); 262 return ERR_PTR(-EWOULDBLOCK);
253 return rq; 263 return rq;
254 } 264 }
255 EXPORT_SYMBOL(blk_mq_alloc_request); 265 EXPORT_SYMBOL(blk_mq_alloc_request);
256 266
257 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, 267 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
258 struct blk_mq_ctx *ctx, struct request *rq) 268 struct blk_mq_ctx *ctx, struct request *rq)
259 { 269 {
260 const int tag = rq->tag; 270 const int tag = rq->tag;
261 struct request_queue *q = rq->q; 271 struct request_queue *q = rq->q;
262 272
263 if (rq->cmd_flags & REQ_MQ_INFLIGHT) 273 if (rq->cmd_flags & REQ_MQ_INFLIGHT)
264 atomic_dec(&hctx->nr_active); 274 atomic_dec(&hctx->nr_active);
265 rq->cmd_flags = 0; 275 rq->cmd_flags = 0;
266 276
267 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 277 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
268 blk_mq_put_tag(hctx, tag, &ctx->last_tag); 278 blk_mq_put_tag(hctx, tag, &ctx->last_tag);
269 blk_mq_queue_exit(q); 279 blk_mq_queue_exit(q);
270 } 280 }
271 281
272 void blk_mq_free_request(struct request *rq) 282 void blk_mq_free_request(struct request *rq)
273 { 283 {
274 struct blk_mq_ctx *ctx = rq->mq_ctx; 284 struct blk_mq_ctx *ctx = rq->mq_ctx;
275 struct blk_mq_hw_ctx *hctx; 285 struct blk_mq_hw_ctx *hctx;
276 struct request_queue *q = rq->q; 286 struct request_queue *q = rq->q;
277 287
278 ctx->rq_completed[rq_is_sync(rq)]++; 288 ctx->rq_completed[rq_is_sync(rq)]++;
279 289
280 hctx = q->mq_ops->map_queue(q, ctx->cpu); 290 hctx = q->mq_ops->map_queue(q, ctx->cpu);
281 __blk_mq_free_request(hctx, ctx, rq); 291 __blk_mq_free_request(hctx, ctx, rq);
282 } 292 }
283 293
284 inline void __blk_mq_end_request(struct request *rq, int error) 294 inline void __blk_mq_end_request(struct request *rq, int error)
285 { 295 {
286 blk_account_io_done(rq); 296 blk_account_io_done(rq);
287 297
288 if (rq->end_io) { 298 if (rq->end_io) {
289 rq->end_io(rq, error); 299 rq->end_io(rq, error);
290 } else { 300 } else {
291 if (unlikely(blk_bidi_rq(rq))) 301 if (unlikely(blk_bidi_rq(rq)))
292 blk_mq_free_request(rq->next_rq); 302 blk_mq_free_request(rq->next_rq);
293 blk_mq_free_request(rq); 303 blk_mq_free_request(rq);
294 } 304 }
295 } 305 }
296 EXPORT_SYMBOL(__blk_mq_end_request); 306 EXPORT_SYMBOL(__blk_mq_end_request);
297 307
298 void blk_mq_end_request(struct request *rq, int error) 308 void blk_mq_end_request(struct request *rq, int error)
299 { 309 {
300 if (blk_update_request(rq, error, blk_rq_bytes(rq))) 310 if (blk_update_request(rq, error, blk_rq_bytes(rq)))
301 BUG(); 311 BUG();
302 __blk_mq_end_request(rq, error); 312 __blk_mq_end_request(rq, error);
303 } 313 }
304 EXPORT_SYMBOL(blk_mq_end_request); 314 EXPORT_SYMBOL(blk_mq_end_request);
305 315
306 static void __blk_mq_complete_request_remote(void *data) 316 static void __blk_mq_complete_request_remote(void *data)
307 { 317 {
308 struct request *rq = data; 318 struct request *rq = data;
309 319
310 rq->q->softirq_done_fn(rq); 320 rq->q->softirq_done_fn(rq);
311 } 321 }
312 322
313 static void blk_mq_ipi_complete_request(struct request *rq) 323 static void blk_mq_ipi_complete_request(struct request *rq)
314 { 324 {
315 struct blk_mq_ctx *ctx = rq->mq_ctx; 325 struct blk_mq_ctx *ctx = rq->mq_ctx;
316 bool shared = false; 326 bool shared = false;
317 int cpu; 327 int cpu;
318 328
319 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { 329 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
320 rq->q->softirq_done_fn(rq); 330 rq->q->softirq_done_fn(rq);
321 return; 331 return;
322 } 332 }
323 333
324 cpu = get_cpu(); 334 cpu = get_cpu();
325 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) 335 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
326 shared = cpus_share_cache(cpu, ctx->cpu); 336 shared = cpus_share_cache(cpu, ctx->cpu);
327 337
328 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { 338 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
329 rq->csd.func = __blk_mq_complete_request_remote; 339 rq->csd.func = __blk_mq_complete_request_remote;
330 rq->csd.info = rq; 340 rq->csd.info = rq;
331 rq->csd.flags = 0; 341 rq->csd.flags = 0;
332 smp_call_function_single_async(ctx->cpu, &rq->csd); 342 smp_call_function_single_async(ctx->cpu, &rq->csd);
333 } else { 343 } else {
334 rq->q->softirq_done_fn(rq); 344 rq->q->softirq_done_fn(rq);
335 } 345 }
336 put_cpu(); 346 put_cpu();
337 } 347 }
338 348
339 void __blk_mq_complete_request(struct request *rq) 349 void __blk_mq_complete_request(struct request *rq)
340 { 350 {
341 struct request_queue *q = rq->q; 351 struct request_queue *q = rq->q;
342 352
343 if (!q->softirq_done_fn) 353 if (!q->softirq_done_fn)
344 blk_mq_end_request(rq, rq->errors); 354 blk_mq_end_request(rq, rq->errors);
345 else 355 else
346 blk_mq_ipi_complete_request(rq); 356 blk_mq_ipi_complete_request(rq);
347 } 357 }
348 358
349 /** 359 /**
350 * blk_mq_complete_request - end I/O on a request 360 * blk_mq_complete_request - end I/O on a request
351 * @rq: the request being processed 361 * @rq: the request being processed
352 * 362 *
353 * Description: 363 * Description:
354 * Ends all I/O on a request. It does not handle partial completions. 364 * Ends all I/O on a request. It does not handle partial completions.
355 * The actual completion happens out-of-order, through a IPI handler. 365 * The actual completion happens out-of-order, through a IPI handler.
356 **/ 366 **/
357 void blk_mq_complete_request(struct request *rq) 367 void blk_mq_complete_request(struct request *rq)
358 { 368 {
359 struct request_queue *q = rq->q; 369 struct request_queue *q = rq->q;
360 370
361 if (unlikely(blk_should_fake_timeout(q))) 371 if (unlikely(blk_should_fake_timeout(q)))
362 return; 372 return;
363 if (!blk_mark_rq_complete(rq)) 373 if (!blk_mark_rq_complete(rq))
364 __blk_mq_complete_request(rq); 374 __blk_mq_complete_request(rq);
365 } 375 }
366 EXPORT_SYMBOL(blk_mq_complete_request); 376 EXPORT_SYMBOL(blk_mq_complete_request);
367 377
368 void blk_mq_start_request(struct request *rq) 378 void blk_mq_start_request(struct request *rq)
369 { 379 {
370 struct request_queue *q = rq->q; 380 struct request_queue *q = rq->q;
371 381
372 trace_block_rq_issue(q, rq); 382 trace_block_rq_issue(q, rq);
373 383
374 rq->resid_len = blk_rq_bytes(rq); 384 rq->resid_len = blk_rq_bytes(rq);
375 if (unlikely(blk_bidi_rq(rq))) 385 if (unlikely(blk_bidi_rq(rq)))
376 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); 386 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
377 387
378 blk_add_timer(rq); 388 blk_add_timer(rq);
379 389
380 /* 390 /*
381 * Ensure that ->deadline is visible before set the started 391 * Ensure that ->deadline is visible before set the started
382 * flag and clear the completed flag. 392 * flag and clear the completed flag.
383 */ 393 */
384 smp_mb__before_atomic(); 394 smp_mb__before_atomic();
385 395
386 /* 396 /*
387 * Mark us as started and clear complete. Complete might have been 397 * Mark us as started and clear complete. Complete might have been
388 * set if requeue raced with timeout, which then marked it as 398 * set if requeue raced with timeout, which then marked it as
389 * complete. So be sure to clear complete again when we start 399 * complete. So be sure to clear complete again when we start
390 * the request, otherwise we'll ignore the completion event. 400 * the request, otherwise we'll ignore the completion event.
391 */ 401 */
392 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 402 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
393 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 403 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
394 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) 404 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
395 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); 405 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
396 406
397 if (q->dma_drain_size && blk_rq_bytes(rq)) { 407 if (q->dma_drain_size && blk_rq_bytes(rq)) {
398 /* 408 /*
399 * Make sure space for the drain appears. We know we can do 409 * Make sure space for the drain appears. We know we can do
400 * this because max_hw_segments has been adjusted to be one 410 * this because max_hw_segments has been adjusted to be one
401 * fewer than the device can handle. 411 * fewer than the device can handle.
402 */ 412 */
403 rq->nr_phys_segments++; 413 rq->nr_phys_segments++;
404 } 414 }
405 } 415 }
406 EXPORT_SYMBOL(blk_mq_start_request); 416 EXPORT_SYMBOL(blk_mq_start_request);
407 417
408 static void __blk_mq_requeue_request(struct request *rq) 418 static void __blk_mq_requeue_request(struct request *rq)
409 { 419 {
410 struct request_queue *q = rq->q; 420 struct request_queue *q = rq->q;
411 421
412 trace_block_rq_requeue(q, rq); 422 trace_block_rq_requeue(q, rq);
413 423
414 if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { 424 if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
415 if (q->dma_drain_size && blk_rq_bytes(rq)) 425 if (q->dma_drain_size && blk_rq_bytes(rq))
416 rq->nr_phys_segments--; 426 rq->nr_phys_segments--;
417 } 427 }
418 } 428 }
419 429
420 void blk_mq_requeue_request(struct request *rq) 430 void blk_mq_requeue_request(struct request *rq)
421 { 431 {
422 __blk_mq_requeue_request(rq); 432 __blk_mq_requeue_request(rq);
423 433
424 BUG_ON(blk_queued_rq(rq)); 434 BUG_ON(blk_queued_rq(rq));
425 blk_mq_add_to_requeue_list(rq, true); 435 blk_mq_add_to_requeue_list(rq, true);
426 } 436 }
427 EXPORT_SYMBOL(blk_mq_requeue_request); 437 EXPORT_SYMBOL(blk_mq_requeue_request);
428 438
429 static void blk_mq_requeue_work(struct work_struct *work) 439 static void blk_mq_requeue_work(struct work_struct *work)
430 { 440 {
431 struct request_queue *q = 441 struct request_queue *q =
432 container_of(work, struct request_queue, requeue_work); 442 container_of(work, struct request_queue, requeue_work);
433 LIST_HEAD(rq_list); 443 LIST_HEAD(rq_list);
434 struct request *rq, *next; 444 struct request *rq, *next;
435 unsigned long flags; 445 unsigned long flags;
436 446
437 spin_lock_irqsave(&q->requeue_lock, flags); 447 spin_lock_irqsave(&q->requeue_lock, flags);
438 list_splice_init(&q->requeue_list, &rq_list); 448 list_splice_init(&q->requeue_list, &rq_list);
439 spin_unlock_irqrestore(&q->requeue_lock, flags); 449 spin_unlock_irqrestore(&q->requeue_lock, flags);
440 450
441 list_for_each_entry_safe(rq, next, &rq_list, queuelist) { 451 list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
442 if (!(rq->cmd_flags & REQ_SOFTBARRIER)) 452 if (!(rq->cmd_flags & REQ_SOFTBARRIER))
443 continue; 453 continue;
444 454
445 rq->cmd_flags &= ~REQ_SOFTBARRIER; 455 rq->cmd_flags &= ~REQ_SOFTBARRIER;
446 list_del_init(&rq->queuelist); 456 list_del_init(&rq->queuelist);
447 blk_mq_insert_request(rq, true, false, false); 457 blk_mq_insert_request(rq, true, false, false);
448 } 458 }
449 459
450 while (!list_empty(&rq_list)) { 460 while (!list_empty(&rq_list)) {
451 rq = list_entry(rq_list.next, struct request, queuelist); 461 rq = list_entry(rq_list.next, struct request, queuelist);
452 list_del_init(&rq->queuelist); 462 list_del_init(&rq->queuelist);
453 blk_mq_insert_request(rq, false, false, false); 463 blk_mq_insert_request(rq, false, false, false);
454 } 464 }
455 465
456 /* 466 /*
457 * Use the start variant of queue running here, so that running 467 * Use the start variant of queue running here, so that running
458 * the requeue work will kick stopped queues. 468 * the requeue work will kick stopped queues.
459 */ 469 */
460 blk_mq_start_hw_queues(q); 470 blk_mq_start_hw_queues(q);
461 } 471 }
462 472
463 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) 473 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
464 { 474 {
465 struct request_queue *q = rq->q; 475 struct request_queue *q = rq->q;
466 unsigned long flags; 476 unsigned long flags;
467 477
468 /* 478 /*
469 * We abuse this flag that is otherwise used by the I/O scheduler to 479 * We abuse this flag that is otherwise used by the I/O scheduler to
470 * request head insertion from the workqueue. 480 * request head insertion from the workqueue.
471 */ 481 */
472 BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); 482 BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
473 483
474 spin_lock_irqsave(&q->requeue_lock, flags); 484 spin_lock_irqsave(&q->requeue_lock, flags);
475 if (at_head) { 485 if (at_head) {
476 rq->cmd_flags |= REQ_SOFTBARRIER; 486 rq->cmd_flags |= REQ_SOFTBARRIER;
477 list_add(&rq->queuelist, &q->requeue_list); 487 list_add(&rq->queuelist, &q->requeue_list);
478 } else { 488 } else {
479 list_add_tail(&rq->queuelist, &q->requeue_list); 489 list_add_tail(&rq->queuelist, &q->requeue_list);
480 } 490 }
481 spin_unlock_irqrestore(&q->requeue_lock, flags); 491 spin_unlock_irqrestore(&q->requeue_lock, flags);
482 } 492 }
483 EXPORT_SYMBOL(blk_mq_add_to_requeue_list); 493 EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
484 494
485 void blk_mq_kick_requeue_list(struct request_queue *q) 495 void blk_mq_kick_requeue_list(struct request_queue *q)
486 { 496 {
487 kblockd_schedule_work(&q->requeue_work); 497 kblockd_schedule_work(&q->requeue_work);
488 } 498 }
489 EXPORT_SYMBOL(blk_mq_kick_requeue_list); 499 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
490 500
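Illustrative note, not part of this patch: the requeue interface above is expected to be driven from a driver's completion/retry path roughly as below; mydrv_requeue() is a hypothetical name.

	static void mydrv_requeue(struct request *rq)
	{
		/*
		 * blk_mq_requeue_request() clears STARTED and parks rq at the
		 * head of q->requeue_list (REQ_SOFTBARRIER marks the head add);
		 * kicking the list schedules blk_mq_requeue_work() via kblockd.
		 */
		blk_mq_requeue_request(rq);
		blk_mq_kick_requeue_list(rq->q);
	}
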
491 static inline bool is_flush_request(struct request *rq, 501 static inline bool is_flush_request(struct request *rq,
492 struct blk_flush_queue *fq, unsigned int tag) 502 struct blk_flush_queue *fq, unsigned int tag)
493 { 503 {
494 return ((rq->cmd_flags & REQ_FLUSH_SEQ) && 504 return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
495 fq->flush_rq->tag == tag); 505 fq->flush_rq->tag == tag);
496 } 506 }
497 507
498 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) 508 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
499 { 509 {
500 struct request *rq = tags->rqs[tag]; 510 struct request *rq = tags->rqs[tag];
501 /* mq_ctx of flush rq is always cloned from the corresponding req */ 511 /* mq_ctx of flush rq is always cloned from the corresponding req */
502 struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx); 512 struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx);
503 513
504 if (!is_flush_request(rq, fq, tag)) 514 if (!is_flush_request(rq, fq, tag))
505 return rq; 515 return rq;
506 516
507 return fq->flush_rq; 517 return fq->flush_rq;
508 } 518 }
509 EXPORT_SYMBOL(blk_mq_tag_to_rq); 519 EXPORT_SYMBOL(blk_mq_tag_to_rq);
510 520
511 struct blk_mq_timeout_data { 521 struct blk_mq_timeout_data {
512 unsigned long next; 522 unsigned long next;
513 unsigned int next_set; 523 unsigned int next_set;
514 }; 524 };
515 525
516 void blk_mq_rq_timed_out(struct request *req, bool reserved) 526 void blk_mq_rq_timed_out(struct request *req, bool reserved)
517 { 527 {
518 struct blk_mq_ops *ops = req->q->mq_ops; 528 struct blk_mq_ops *ops = req->q->mq_ops;
519 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; 529 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
520 530
521 /* 531 /*
522 * We know that complete is set at this point. If STARTED isn't set 532 * We know that complete is set at this point. If STARTED isn't set
523 * anymore, then the request isn't active and the "timeout" should 533 * anymore, then the request isn't active and the "timeout" should
524 * just be ignored. This can happen due to the bitflag ordering. 534 * just be ignored. This can happen due to the bitflag ordering.
525 * Timeout first checks if STARTED is set, and if it is, assumes 535 * Timeout first checks if STARTED is set, and if it is, assumes
526 * the request is active. But if we race with completion, then 536 * the request is active. But if we race with completion, then
527 * both flags will get cleared. So check here again, and ignore 537 * both flags will get cleared. So check here again, and ignore
528 * a timeout event with a request that isn't active. 538 * a timeout event with a request that isn't active.
529 */ 539 */
530 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) 540 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
531 return; 541 return;
532 542
533 if (ops->timeout) 543 if (ops->timeout)
534 ret = ops->timeout(req, reserved); 544 ret = ops->timeout(req, reserved);
535 545
536 switch (ret) { 546 switch (ret) {
537 case BLK_EH_HANDLED: 547 case BLK_EH_HANDLED:
538 __blk_mq_complete_request(req); 548 __blk_mq_complete_request(req);
539 break; 549 break;
540 case BLK_EH_RESET_TIMER: 550 case BLK_EH_RESET_TIMER:
541 blk_add_timer(req); 551 blk_add_timer(req);
542 blk_clear_rq_complete(req); 552 blk_clear_rq_complete(req);
543 break; 553 break;
544 case BLK_EH_NOT_HANDLED: 554 case BLK_EH_NOT_HANDLED:
545 break; 555 break;
546 default: 556 default:
547 printk(KERN_ERR "block: bad eh return: %d\n", ret); 557 printk(KERN_ERR "block: bad eh return: %d\n", ret);
548 break; 558 break;
549 } 559 }
550 } 560 }
551 561
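Illustrative only, not from this patch: a minimal sketch of a driver ->timeout callback, showing where the return values handled by the switch above come from; the mydrv_* names are assumptions.

	static enum blk_eh_timer_return mydrv_timeout(struct request *rq, bool reserved)
	{
		/* blk_mq_rq_to_pdu() returns the per-request driver payload */
		if (mydrv_abort_cmd(blk_mq_rq_to_pdu(rq)))
			return BLK_EH_HANDLED;		/* core completes the request */

		return BLK_EH_RESET_TIMER;		/* core re-arms the timer and clears complete */
	}
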
552 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, 562 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
553 struct request *rq, void *priv, bool reserved) 563 struct request *rq, void *priv, bool reserved)
554 { 564 {
555 struct blk_mq_timeout_data *data = priv; 565 struct blk_mq_timeout_data *data = priv;
556 566
557 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 567 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
558 return; 568 return;
559 569
560 if (time_after_eq(jiffies, rq->deadline)) { 570 if (time_after_eq(jiffies, rq->deadline)) {
561 if (!blk_mark_rq_complete(rq)) 571 if (!blk_mark_rq_complete(rq))
562 blk_mq_rq_timed_out(rq, reserved); 572 blk_mq_rq_timed_out(rq, reserved);
563 } else if (!data->next_set || time_after(data->next, rq->deadline)) { 573 } else if (!data->next_set || time_after(data->next, rq->deadline)) {
564 data->next = rq->deadline; 574 data->next = rq->deadline;
565 data->next_set = 1; 575 data->next_set = 1;
566 } 576 }
567 } 577 }
568 578
569 static void blk_mq_rq_timer(unsigned long priv) 579 static void blk_mq_rq_timer(unsigned long priv)
570 { 580 {
571 struct request_queue *q = (struct request_queue *)priv; 581 struct request_queue *q = (struct request_queue *)priv;
572 struct blk_mq_timeout_data data = { 582 struct blk_mq_timeout_data data = {
573 .next = 0, 583 .next = 0,
574 .next_set = 0, 584 .next_set = 0,
575 }; 585 };
576 struct blk_mq_hw_ctx *hctx; 586 struct blk_mq_hw_ctx *hctx;
577 int i; 587 int i;
578 588
579 queue_for_each_hw_ctx(q, hctx, i) { 589 queue_for_each_hw_ctx(q, hctx, i) {
580 /* 590 /*
581 * If no software queues are currently mapped to this 591 * If no software queues are currently mapped to this
582 * hardware queue, there's nothing to check 592 * hardware queue, there's nothing to check
583 */ 593 */
584 if (!hctx->nr_ctx || !hctx->tags) 594 if (!hctx->nr_ctx || !hctx->tags)
585 continue; 595 continue;
586 596
587 blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); 597 blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
588 } 598 }
589 599
590 if (data.next_set) { 600 if (data.next_set) {
591 data.next = blk_rq_timeout(round_jiffies_up(data.next)); 601 data.next = blk_rq_timeout(round_jiffies_up(data.next));
592 mod_timer(&q->timeout, data.next); 602 mod_timer(&q->timeout, data.next);
593 } else { 603 } else {
594 queue_for_each_hw_ctx(q, hctx, i) 604 queue_for_each_hw_ctx(q, hctx, i)
595 blk_mq_tag_idle(hctx); 605 blk_mq_tag_idle(hctx);
596 } 606 }
597 } 607 }
598 608
599 /* 609 /*
600 * Reverse check our software queue for entries that we could potentially 610 * Reverse check our software queue for entries that we could potentially
601 * merge with. Currently includes a hand-wavy stop count of 8, to not spend 611 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
602 * too much time checking for merges. 612 * too much time checking for merges.
603 */ 613 */
604 static bool blk_mq_attempt_merge(struct request_queue *q, 614 static bool blk_mq_attempt_merge(struct request_queue *q,
605 struct blk_mq_ctx *ctx, struct bio *bio) 615 struct blk_mq_ctx *ctx, struct bio *bio)
606 { 616 {
607 struct request *rq; 617 struct request *rq;
608 int checked = 8; 618 int checked = 8;
609 619
610 list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { 620 list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
611 int el_ret; 621 int el_ret;
612 622
613 if (!checked--) 623 if (!checked--)
614 break; 624 break;
615 625
616 if (!blk_rq_merge_ok(rq, bio)) 626 if (!blk_rq_merge_ok(rq, bio))
617 continue; 627 continue;
618 628
619 el_ret = blk_try_merge(rq, bio); 629 el_ret = blk_try_merge(rq, bio);
620 if (el_ret == ELEVATOR_BACK_MERGE) { 630 if (el_ret == ELEVATOR_BACK_MERGE) {
621 if (bio_attempt_back_merge(q, rq, bio)) { 631 if (bio_attempt_back_merge(q, rq, bio)) {
622 ctx->rq_merged++; 632 ctx->rq_merged++;
623 return true; 633 return true;
624 } 634 }
625 break; 635 break;
626 } else if (el_ret == ELEVATOR_FRONT_MERGE) { 636 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
627 if (bio_attempt_front_merge(q, rq, bio)) { 637 if (bio_attempt_front_merge(q, rq, bio)) {
628 ctx->rq_merged++; 638 ctx->rq_merged++;
629 return true; 639 return true;
630 } 640 }
631 break; 641 break;
632 } 642 }
633 } 643 }
634 644
635 return false; 645 return false;
636 } 646 }
637 647
638 /* 648 /*
639 * Process software queues that have been marked busy, splicing them 649 * Process software queues that have been marked busy, splicing them
640 * to the for-dispatch list. 650 * to the for-dispatch list.
641 */ 651 */
642 static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) 652 static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
643 { 653 {
644 struct blk_mq_ctx *ctx; 654 struct blk_mq_ctx *ctx;
645 int i; 655 int i;
646 656
647 for (i = 0; i < hctx->ctx_map.map_size; i++) { 657 for (i = 0; i < hctx->ctx_map.map_size; i++) {
648 struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; 658 struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
649 unsigned int off, bit; 659 unsigned int off, bit;
650 660
651 if (!bm->word) 661 if (!bm->word)
652 continue; 662 continue;
653 663
654 bit = 0; 664 bit = 0;
655 off = i * hctx->ctx_map.bits_per_word; 665 off = i * hctx->ctx_map.bits_per_word;
656 do { 666 do {
657 bit = find_next_bit(&bm->word, bm->depth, bit); 667 bit = find_next_bit(&bm->word, bm->depth, bit);
658 if (bit >= bm->depth) 668 if (bit >= bm->depth)
659 break; 669 break;
660 670
661 ctx = hctx->ctxs[bit + off]; 671 ctx = hctx->ctxs[bit + off];
662 clear_bit(bit, &bm->word); 672 clear_bit(bit, &bm->word);
663 spin_lock(&ctx->lock); 673 spin_lock(&ctx->lock);
664 list_splice_tail_init(&ctx->rq_list, list); 674 list_splice_tail_init(&ctx->rq_list, list);
665 spin_unlock(&ctx->lock); 675 spin_unlock(&ctx->lock);
666 676
667 bit++; 677 bit++;
668 } while (1); 678 } while (1);
669 } 679 }
670 } 680 }
671 681
672 /* 682 /*
673 * Run this hardware queue, pulling any software queues mapped to it in. 683 * Run this hardware queue, pulling any software queues mapped to it in.
674 * Note that this function currently has various problems around ordering 684 * Note that this function currently has various problems around ordering
675 * of IO. In particular, we'd like FIFO behaviour on handling existing 685 * of IO. In particular, we'd like FIFO behaviour on handling existing
676 * items on the hctx->dispatch list. Ignore that for now. 686 * items on the hctx->dispatch list. Ignore that for now.
677 */ 687 */
678 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 688 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
679 { 689 {
680 struct request_queue *q = hctx->queue; 690 struct request_queue *q = hctx->queue;
681 struct request *rq; 691 struct request *rq;
682 LIST_HEAD(rq_list); 692 LIST_HEAD(rq_list);
683 int queued; 693 int queued;
684 694
685 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); 695 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
686 696
687 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 697 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
688 return; 698 return;
689 699
690 hctx->run++; 700 hctx->run++;
691 701
692 /* 702 /*
693 * Touch any software queue that has pending entries. 703 * Touch any software queue that has pending entries.
694 */ 704 */
695 flush_busy_ctxs(hctx, &rq_list); 705 flush_busy_ctxs(hctx, &rq_list);
696 706
697 /* 707 /*
698 * If we have previous entries on our dispatch list, grab them 708 * If we have previous entries on our dispatch list, grab them
699 * and stuff them at the front for more fair dispatch. 709 * and stuff them at the front for more fair dispatch.
700 */ 710 */
701 if (!list_empty_careful(&hctx->dispatch)) { 711 if (!list_empty_careful(&hctx->dispatch)) {
702 spin_lock(&hctx->lock); 712 spin_lock(&hctx->lock);
703 if (!list_empty(&hctx->dispatch)) 713 if (!list_empty(&hctx->dispatch))
704 list_splice_init(&hctx->dispatch, &rq_list); 714 list_splice_init(&hctx->dispatch, &rq_list);
705 spin_unlock(&hctx->lock); 715 spin_unlock(&hctx->lock);
706 } 716 }
707 717
708 /* 718 /*
709 * Now process all the entries, sending them to the driver. 719 * Now process all the entries, sending them to the driver.
710 */ 720 */
711 queued = 0; 721 queued = 0;
712 while (!list_empty(&rq_list)) { 722 while (!list_empty(&rq_list)) {
713 int ret; 723 int ret;
714 724
715 rq = list_first_entry(&rq_list, struct request, queuelist); 725 rq = list_first_entry(&rq_list, struct request, queuelist);
716 list_del_init(&rq->queuelist); 726 list_del_init(&rq->queuelist);
717 727
718 ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list)); 728 ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
719 switch (ret) { 729 switch (ret) {
720 case BLK_MQ_RQ_QUEUE_OK: 730 case BLK_MQ_RQ_QUEUE_OK:
721 queued++; 731 queued++;
722 continue; 732 continue;
723 case BLK_MQ_RQ_QUEUE_BUSY: 733 case BLK_MQ_RQ_QUEUE_BUSY:
724 list_add(&rq->queuelist, &rq_list); 734 list_add(&rq->queuelist, &rq_list);
725 __blk_mq_requeue_request(rq); 735 __blk_mq_requeue_request(rq);
726 break; 736 break;
727 default: 737 default:
728 pr_err("blk-mq: bad return on queue: %d\n", ret); 738 pr_err("blk-mq: bad return on queue: %d\n", ret);
729 case BLK_MQ_RQ_QUEUE_ERROR: 739 case BLK_MQ_RQ_QUEUE_ERROR:
730 rq->errors = -EIO; 740 rq->errors = -EIO;
731 blk_mq_end_request(rq, rq->errors); 741 blk_mq_end_request(rq, rq->errors);
732 break; 742 break;
733 } 743 }
734 744
735 if (ret == BLK_MQ_RQ_QUEUE_BUSY) 745 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
736 break; 746 break;
737 } 747 }
738 748
739 if (!queued) 749 if (!queued)
740 hctx->dispatched[0]++; 750 hctx->dispatched[0]++;
741 else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) 751 else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
742 hctx->dispatched[ilog2(queued) + 1]++; 752 hctx->dispatched[ilog2(queued) + 1]++;
743 753
744 /* 754 /*
745 * Any items that need requeuing? Stuff them into hctx->dispatch, 755 * Any items that need requeuing? Stuff them into hctx->dispatch,
746 * that is where we will continue on the next queue run. 756 * that is where we will continue on the next queue run.
747 */ 757 */
748 if (!list_empty(&rq_list)) { 758 if (!list_empty(&rq_list)) {
749 spin_lock(&hctx->lock); 759 spin_lock(&hctx->lock);
750 list_splice(&rq_list, &hctx->dispatch); 760 list_splice(&rq_list, &hctx->dispatch);
751 spin_unlock(&hctx->lock); 761 spin_unlock(&hctx->lock);
752 } 762 }
753 } 763 }
754 764
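Illustrative only, not from this patch: a minimal, hypothetical ->queue_rq implementation (mydrv_* names are assumptions), to show where the three return values dispatched on above originate; note that in this kernel the driver itself calls the exported blk_mq_start_request().

	static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, bool last)
	{
		struct mydrv_hw *hw = hctx->driver_data;	/* set up in ->init_hctx */

		if (!mydrv_can_queue(hw))
			return BLK_MQ_RQ_QUEUE_BUSY;	/* rq is put back and dispatch stops */

		blk_mq_start_request(rq);		/* arms the timeout, marks STARTED */

		if (mydrv_issue(hw, rq))
			return BLK_MQ_RQ_QUEUE_ERROR;	/* rq->errors = -EIO, request is ended */

		return BLK_MQ_RQ_QUEUE_OK;
	}

A driver returning BLK_MQ_RQ_QUEUE_BUSY may additionally stop the queue or use blk_mq_delay_queue() (further down) to make sure the hardware queue is run again later.
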
755 /* 765 /*
756 * It'd be great if the workqueue API had a way to pass 766 * It'd be great if the workqueue API had a way to pass
757 * in a mask and had some smarts for more clever placement. 767 * in a mask and had some smarts for more clever placement.
758 * For now we just round-robin here, switching for every 768 * For now we just round-robin here, switching for every
759 * BLK_MQ_CPU_WORK_BATCH queued items. 769 * BLK_MQ_CPU_WORK_BATCH queued items.
760 */ 770 */
761 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) 771 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
762 { 772 {
763 int cpu = hctx->next_cpu; 773 int cpu = hctx->next_cpu;
764 774
765 if (--hctx->next_cpu_batch <= 0) { 775 if (--hctx->next_cpu_batch <= 0) {
766 int next_cpu; 776 int next_cpu;
767 777
768 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); 778 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
769 if (next_cpu >= nr_cpu_ids) 779 if (next_cpu >= nr_cpu_ids)
770 next_cpu = cpumask_first(hctx->cpumask); 780 next_cpu = cpumask_first(hctx->cpumask);
771 781
772 hctx->next_cpu = next_cpu; 782 hctx->next_cpu = next_cpu;
773 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 783 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
774 } 784 }
775 785
776 return cpu; 786 return cpu;
777 } 787 }
778 788
779 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 789 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
780 { 790 {
781 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 791 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
782 return; 792 return;
783 793
784 if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) 794 if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
785 __blk_mq_run_hw_queue(hctx); 795 __blk_mq_run_hw_queue(hctx);
786 else if (hctx->queue->nr_hw_queues == 1) 796 else if (hctx->queue->nr_hw_queues == 1)
787 kblockd_schedule_delayed_work(&hctx->run_work, 0); 797 kblockd_schedule_delayed_work(&hctx->run_work, 0);
788 else { 798 else {
789 unsigned int cpu; 799 unsigned int cpu;
790 800
791 cpu = blk_mq_hctx_next_cpu(hctx); 801 cpu = blk_mq_hctx_next_cpu(hctx);
792 kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0); 802 kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
793 } 803 }
794 } 804 }
795 805
796 void blk_mq_run_queues(struct request_queue *q, bool async) 806 void blk_mq_run_queues(struct request_queue *q, bool async)
797 { 807 {
798 struct blk_mq_hw_ctx *hctx; 808 struct blk_mq_hw_ctx *hctx;
799 int i; 809 int i;
800 810
801 queue_for_each_hw_ctx(q, hctx, i) { 811 queue_for_each_hw_ctx(q, hctx, i) {
802 if ((!blk_mq_hctx_has_pending(hctx) && 812 if ((!blk_mq_hctx_has_pending(hctx) &&
803 list_empty_careful(&hctx->dispatch)) || 813 list_empty_careful(&hctx->dispatch)) ||
804 test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 814 test_bit(BLK_MQ_S_STOPPED, &hctx->state))
805 continue; 815 continue;
806 816
807 preempt_disable(); 817 preempt_disable();
808 blk_mq_run_hw_queue(hctx, async); 818 blk_mq_run_hw_queue(hctx, async);
809 preempt_enable(); 819 preempt_enable();
810 } 820 }
811 } 821 }
812 EXPORT_SYMBOL(blk_mq_run_queues); 822 EXPORT_SYMBOL(blk_mq_run_queues);
813 823
814 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 824 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
815 { 825 {
816 cancel_delayed_work(&hctx->run_work); 826 cancel_delayed_work(&hctx->run_work);
817 cancel_delayed_work(&hctx->delay_work); 827 cancel_delayed_work(&hctx->delay_work);
818 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 828 set_bit(BLK_MQ_S_STOPPED, &hctx->state);
819 } 829 }
820 EXPORT_SYMBOL(blk_mq_stop_hw_queue); 830 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
821 831
822 void blk_mq_stop_hw_queues(struct request_queue *q) 832 void blk_mq_stop_hw_queues(struct request_queue *q)
823 { 833 {
824 struct blk_mq_hw_ctx *hctx; 834 struct blk_mq_hw_ctx *hctx;
825 int i; 835 int i;
826 836
827 queue_for_each_hw_ctx(q, hctx, i) 837 queue_for_each_hw_ctx(q, hctx, i)
828 blk_mq_stop_hw_queue(hctx); 838 blk_mq_stop_hw_queue(hctx);
829 } 839 }
830 EXPORT_SYMBOL(blk_mq_stop_hw_queues); 840 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
831 841
832 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 842 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
833 { 843 {
834 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 844 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
835 845
836 preempt_disable(); 846 preempt_disable();
837 blk_mq_run_hw_queue(hctx, false); 847 blk_mq_run_hw_queue(hctx, false);
838 preempt_enable(); 848 preempt_enable();
839 } 849 }
840 EXPORT_SYMBOL(blk_mq_start_hw_queue); 850 EXPORT_SYMBOL(blk_mq_start_hw_queue);
841 851
842 void blk_mq_start_hw_queues(struct request_queue *q) 852 void blk_mq_start_hw_queues(struct request_queue *q)
843 { 853 {
844 struct blk_mq_hw_ctx *hctx; 854 struct blk_mq_hw_ctx *hctx;
845 int i; 855 int i;
846 856
847 queue_for_each_hw_ctx(q, hctx, i) 857 queue_for_each_hw_ctx(q, hctx, i)
848 blk_mq_start_hw_queue(hctx); 858 blk_mq_start_hw_queue(hctx);
849 } 859 }
850 EXPORT_SYMBOL(blk_mq_start_hw_queues); 860 EXPORT_SYMBOL(blk_mq_start_hw_queues);
851 861
852 862
853 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) 863 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
854 { 864 {
855 struct blk_mq_hw_ctx *hctx; 865 struct blk_mq_hw_ctx *hctx;
856 int i; 866 int i;
857 867
858 queue_for_each_hw_ctx(q, hctx, i) { 868 queue_for_each_hw_ctx(q, hctx, i) {
859 if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 869 if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
860 continue; 870 continue;
861 871
862 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 872 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
863 preempt_disable(); 873 preempt_disable();
864 blk_mq_run_hw_queue(hctx, async); 874 blk_mq_run_hw_queue(hctx, async);
865 preempt_enable(); 875 preempt_enable();
866 } 876 }
867 } 877 }
868 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 878 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
869 879
870 static void blk_mq_run_work_fn(struct work_struct *work) 880 static void blk_mq_run_work_fn(struct work_struct *work)
871 { 881 {
872 struct blk_mq_hw_ctx *hctx; 882 struct blk_mq_hw_ctx *hctx;
873 883
874 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); 884 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
875 885
876 __blk_mq_run_hw_queue(hctx); 886 __blk_mq_run_hw_queue(hctx);
877 } 887 }
878 888
879 static void blk_mq_delay_work_fn(struct work_struct *work) 889 static void blk_mq_delay_work_fn(struct work_struct *work)
880 { 890 {
881 struct blk_mq_hw_ctx *hctx; 891 struct blk_mq_hw_ctx *hctx;
882 892
883 hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work); 893 hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
884 894
885 if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state)) 895 if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
886 __blk_mq_run_hw_queue(hctx); 896 __blk_mq_run_hw_queue(hctx);
887 } 897 }
888 898
889 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 899 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
890 { 900 {
891 unsigned long tmo = msecs_to_jiffies(msecs); 901 unsigned long tmo = msecs_to_jiffies(msecs);
892 902
893 if (hctx->queue->nr_hw_queues == 1) 903 if (hctx->queue->nr_hw_queues == 1)
894 kblockd_schedule_delayed_work(&hctx->delay_work, tmo); 904 kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
895 else { 905 else {
896 unsigned int cpu; 906 unsigned int cpu;
897 907
898 cpu = blk_mq_hctx_next_cpu(hctx); 908 cpu = blk_mq_hctx_next_cpu(hctx);
899 kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); 909 kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
900 } 910 }
901 } 911 }
902 EXPORT_SYMBOL(blk_mq_delay_queue); 912 EXPORT_SYMBOL(blk_mq_delay_queue);
903 913
904 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, 914 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
905 struct request *rq, bool at_head) 915 struct request *rq, bool at_head)
906 { 916 {
907 struct blk_mq_ctx *ctx = rq->mq_ctx; 917 struct blk_mq_ctx *ctx = rq->mq_ctx;
908 918
909 trace_block_rq_insert(hctx->queue, rq); 919 trace_block_rq_insert(hctx->queue, rq);
910 920
911 if (at_head) 921 if (at_head)
912 list_add(&rq->queuelist, &ctx->rq_list); 922 list_add(&rq->queuelist, &ctx->rq_list);
913 else 923 else
914 list_add_tail(&rq->queuelist, &ctx->rq_list); 924 list_add_tail(&rq->queuelist, &ctx->rq_list);
915 925
916 blk_mq_hctx_mark_pending(hctx, ctx); 926 blk_mq_hctx_mark_pending(hctx, ctx);
917 } 927 }
918 928
919 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, 929 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
920 bool async) 930 bool async)
921 { 931 {
922 struct request_queue *q = rq->q; 932 struct request_queue *q = rq->q;
923 struct blk_mq_hw_ctx *hctx; 933 struct blk_mq_hw_ctx *hctx;
924 struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx; 934 struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;
925 935
926 current_ctx = blk_mq_get_ctx(q); 936 current_ctx = blk_mq_get_ctx(q);
927 if (!cpu_online(ctx->cpu)) 937 if (!cpu_online(ctx->cpu))
928 rq->mq_ctx = ctx = current_ctx; 938 rq->mq_ctx = ctx = current_ctx;
929 939
930 hctx = q->mq_ops->map_queue(q, ctx->cpu); 940 hctx = q->mq_ops->map_queue(q, ctx->cpu);
931 941
932 spin_lock(&ctx->lock); 942 spin_lock(&ctx->lock);
933 __blk_mq_insert_request(hctx, rq, at_head); 943 __blk_mq_insert_request(hctx, rq, at_head);
934 spin_unlock(&ctx->lock); 944 spin_unlock(&ctx->lock);
935 945
936 if (run_queue) 946 if (run_queue)
937 blk_mq_run_hw_queue(hctx, async); 947 blk_mq_run_hw_queue(hctx, async);
938 948
939 blk_mq_put_ctx(current_ctx); 949 blk_mq_put_ctx(current_ctx);
940 } 950 }
941 951
942 static void blk_mq_insert_requests(struct request_queue *q, 952 static void blk_mq_insert_requests(struct request_queue *q,
943 struct blk_mq_ctx *ctx, 953 struct blk_mq_ctx *ctx,
944 struct list_head *list, 954 struct list_head *list,
945 int depth, 955 int depth,
946 bool from_schedule) 956 bool from_schedule)
947 957
948 { 958 {
949 struct blk_mq_hw_ctx *hctx; 959 struct blk_mq_hw_ctx *hctx;
950 struct blk_mq_ctx *current_ctx; 960 struct blk_mq_ctx *current_ctx;
951 961
952 trace_block_unplug(q, depth, !from_schedule); 962 trace_block_unplug(q, depth, !from_schedule);
953 963
954 current_ctx = blk_mq_get_ctx(q); 964 current_ctx = blk_mq_get_ctx(q);
955 965
956 if (!cpu_online(ctx->cpu)) 966 if (!cpu_online(ctx->cpu))
957 ctx = current_ctx; 967 ctx = current_ctx;
958 hctx = q->mq_ops->map_queue(q, ctx->cpu); 968 hctx = q->mq_ops->map_queue(q, ctx->cpu);
959 969
960 /* 970 /*
961 * Preemption doesn't flush the plug list, so it's possible ctx->cpu is 971 * Preemption doesn't flush the plug list, so it's possible ctx->cpu is
962 * offline now. 972 * offline now.
963 */ 973 */
964 spin_lock(&ctx->lock); 974 spin_lock(&ctx->lock);
965 while (!list_empty(list)) { 975 while (!list_empty(list)) {
966 struct request *rq; 976 struct request *rq;
967 977
968 rq = list_first_entry(list, struct request, queuelist); 978 rq = list_first_entry(list, struct request, queuelist);
969 list_del_init(&rq->queuelist); 979 list_del_init(&rq->queuelist);
970 rq->mq_ctx = ctx; 980 rq->mq_ctx = ctx;
971 __blk_mq_insert_request(hctx, rq, false); 981 __blk_mq_insert_request(hctx, rq, false);
972 } 982 }
973 spin_unlock(&ctx->lock); 983 spin_unlock(&ctx->lock);
974 984
975 blk_mq_run_hw_queue(hctx, from_schedule); 985 blk_mq_run_hw_queue(hctx, from_schedule);
976 blk_mq_put_ctx(current_ctx); 986 blk_mq_put_ctx(current_ctx);
977 } 987 }
978 988
979 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 989 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
980 { 990 {
981 struct request *rqa = container_of(a, struct request, queuelist); 991 struct request *rqa = container_of(a, struct request, queuelist);
982 struct request *rqb = container_of(b, struct request, queuelist); 992 struct request *rqb = container_of(b, struct request, queuelist);
983 993
984 return !(rqa->mq_ctx < rqb->mq_ctx || 994 return !(rqa->mq_ctx < rqb->mq_ctx ||
985 (rqa->mq_ctx == rqb->mq_ctx && 995 (rqa->mq_ctx == rqb->mq_ctx &&
986 blk_rq_pos(rqa) < blk_rq_pos(rqb))); 996 blk_rq_pos(rqa) < blk_rq_pos(rqb)));
987 } 997 }
988 998
989 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 999 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
990 { 1000 {
991 struct blk_mq_ctx *this_ctx; 1001 struct blk_mq_ctx *this_ctx;
992 struct request_queue *this_q; 1002 struct request_queue *this_q;
993 struct request *rq; 1003 struct request *rq;
994 LIST_HEAD(list); 1004 LIST_HEAD(list);
995 LIST_HEAD(ctx_list); 1005 LIST_HEAD(ctx_list);
996 unsigned int depth; 1006 unsigned int depth;
997 1007
998 list_splice_init(&plug->mq_list, &list); 1008 list_splice_init(&plug->mq_list, &list);
999 1009
1000 list_sort(NULL, &list, plug_ctx_cmp); 1010 list_sort(NULL, &list, plug_ctx_cmp);
1001 1011
1002 this_q = NULL; 1012 this_q = NULL;
1003 this_ctx = NULL; 1013 this_ctx = NULL;
1004 depth = 0; 1014 depth = 0;
1005 1015
1006 while (!list_empty(&list)) { 1016 while (!list_empty(&list)) {
1007 rq = list_entry_rq(list.next); 1017 rq = list_entry_rq(list.next);
1008 list_del_init(&rq->queuelist); 1018 list_del_init(&rq->queuelist);
1009 BUG_ON(!rq->q); 1019 BUG_ON(!rq->q);
1010 if (rq->mq_ctx != this_ctx) { 1020 if (rq->mq_ctx != this_ctx) {
1011 if (this_ctx) { 1021 if (this_ctx) {
1012 blk_mq_insert_requests(this_q, this_ctx, 1022 blk_mq_insert_requests(this_q, this_ctx,
1013 &ctx_list, depth, 1023 &ctx_list, depth,
1014 from_schedule); 1024 from_schedule);
1015 } 1025 }
1016 1026
1017 this_ctx = rq->mq_ctx; 1027 this_ctx = rq->mq_ctx;
1018 this_q = rq->q; 1028 this_q = rq->q;
1019 depth = 0; 1029 depth = 0;
1020 } 1030 }
1021 1031
1022 depth++; 1032 depth++;
1023 list_add_tail(&rq->queuelist, &ctx_list); 1033 list_add_tail(&rq->queuelist, &ctx_list);
1024 } 1034 }
1025 1035
1026 /* 1036 /*
1027 * If 'this_ctx' is set, we know we have entries to complete 1037 * If 'this_ctx' is set, we know we have entries to complete
1028 * on 'ctx_list'. Do those. 1038 * on 'ctx_list'. Do those.
1029 */ 1039 */
1030 if (this_ctx) { 1040 if (this_ctx) {
1031 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, 1041 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
1032 from_schedule); 1042 from_schedule);
1033 } 1043 }
1034 } 1044 }
1035 1045
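Illustrative only, not from this patch: the mq_list flushed above is filled by submitters that plug around their I/O, roughly as below (bio is assumed to be an already-built struct bio *); for blk-mq queues blk_finish_plug() reaches this code via blk_flush_plug_list().

	struct blk_plug plug;

	blk_start_plug(&plug);
	submit_bio(WRITE, bio);		/* blk_sq_make_request() parks the request on plug->mq_list */
	blk_finish_plug(&plug);		/* flushes the plug, ending up in blk_mq_flush_plug_list() */
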
1036 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 1046 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1037 { 1047 {
1038 init_request_from_bio(rq, bio); 1048 init_request_from_bio(rq, bio);
1039 1049
1040 if (blk_do_io_stat(rq)) 1050 if (blk_do_io_stat(rq))
1041 blk_account_io_start(rq, 1); 1051 blk_account_io_start(rq, 1);
1042 } 1052 }
1043 1053
1044 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) 1054 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
1045 { 1055 {
1046 return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 1056 return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
1047 !blk_queue_nomerges(hctx->queue); 1057 !blk_queue_nomerges(hctx->queue);
1048 } 1058 }
1049 1059
1050 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, 1060 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
1051 struct blk_mq_ctx *ctx, 1061 struct blk_mq_ctx *ctx,
1052 struct request *rq, struct bio *bio) 1062 struct request *rq, struct bio *bio)
1053 { 1063 {
1054 if (!hctx_allow_merges(hctx)) { 1064 if (!hctx_allow_merges(hctx)) {
1055 blk_mq_bio_to_request(rq, bio); 1065 blk_mq_bio_to_request(rq, bio);
1056 spin_lock(&ctx->lock); 1066 spin_lock(&ctx->lock);
1057 insert_rq: 1067 insert_rq:
1058 __blk_mq_insert_request(hctx, rq, false); 1068 __blk_mq_insert_request(hctx, rq, false);
1059 spin_unlock(&ctx->lock); 1069 spin_unlock(&ctx->lock);
1060 return false; 1070 return false;
1061 } else { 1071 } else {
1062 struct request_queue *q = hctx->queue; 1072 struct request_queue *q = hctx->queue;
1063 1073
1064 spin_lock(&ctx->lock); 1074 spin_lock(&ctx->lock);
1065 if (!blk_mq_attempt_merge(q, ctx, bio)) { 1075 if (!blk_mq_attempt_merge(q, ctx, bio)) {
1066 blk_mq_bio_to_request(rq, bio); 1076 blk_mq_bio_to_request(rq, bio);
1067 goto insert_rq; 1077 goto insert_rq;
1068 } 1078 }
1069 1079
1070 spin_unlock(&ctx->lock); 1080 spin_unlock(&ctx->lock);
1071 __blk_mq_free_request(hctx, ctx, rq); 1081 __blk_mq_free_request(hctx, ctx, rq);
1072 return true; 1082 return true;
1073 } 1083 }
1074 } 1084 }
1075 1085
1076 struct blk_map_ctx { 1086 struct blk_map_ctx {
1077 struct blk_mq_hw_ctx *hctx; 1087 struct blk_mq_hw_ctx *hctx;
1078 struct blk_mq_ctx *ctx; 1088 struct blk_mq_ctx *ctx;
1079 }; 1089 };
1080 1090
1081 static struct request *blk_mq_map_request(struct request_queue *q, 1091 static struct request *blk_mq_map_request(struct request_queue *q,
1082 struct bio *bio, 1092 struct bio *bio,
1083 struct blk_map_ctx *data) 1093 struct blk_map_ctx *data)
1084 { 1094 {
1085 struct blk_mq_hw_ctx *hctx; 1095 struct blk_mq_hw_ctx *hctx;
1086 struct blk_mq_ctx *ctx; 1096 struct blk_mq_ctx *ctx;
1087 struct request *rq; 1097 struct request *rq;
1088 int rw = bio_data_dir(bio); 1098 int rw = bio_data_dir(bio);
1089 struct blk_mq_alloc_data alloc_data; 1099 struct blk_mq_alloc_data alloc_data;
1090 1100
1091 if (unlikely(blk_mq_queue_enter(q))) { 1101 if (unlikely(blk_mq_queue_enter(q))) {
1092 bio_endio(bio, -EIO); 1102 bio_endio(bio, -EIO);
1093 return NULL; 1103 return NULL;
1094 } 1104 }
1095 1105
1096 ctx = blk_mq_get_ctx(q); 1106 ctx = blk_mq_get_ctx(q);
1097 hctx = q->mq_ops->map_queue(q, ctx->cpu); 1107 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1098 1108
1099 if (rw_is_sync(bio->bi_rw)) 1109 if (rw_is_sync(bio->bi_rw))
1100 rw |= REQ_SYNC; 1110 rw |= REQ_SYNC;
1101 1111
1102 trace_block_getrq(q, bio, rw); 1112 trace_block_getrq(q, bio, rw);
1103 blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx, 1113 blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
1104 hctx); 1114 hctx);
1105 rq = __blk_mq_alloc_request(&alloc_data, rw); 1115 rq = __blk_mq_alloc_request(&alloc_data, rw);
1106 if (unlikely(!rq)) { 1116 if (unlikely(!rq)) {
1107 __blk_mq_run_hw_queue(hctx); 1117 __blk_mq_run_hw_queue(hctx);
1108 blk_mq_put_ctx(ctx); 1118 blk_mq_put_ctx(ctx);
1109 trace_block_sleeprq(q, bio, rw); 1119 trace_block_sleeprq(q, bio, rw);
1110 1120
1111 ctx = blk_mq_get_ctx(q); 1121 ctx = blk_mq_get_ctx(q);
1112 hctx = q->mq_ops->map_queue(q, ctx->cpu); 1122 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1113 blk_mq_set_alloc_data(&alloc_data, q, 1123 blk_mq_set_alloc_data(&alloc_data, q,
1114 __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx); 1124 __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
1115 rq = __blk_mq_alloc_request(&alloc_data, rw); 1125 rq = __blk_mq_alloc_request(&alloc_data, rw);
1116 ctx = alloc_data.ctx; 1126 ctx = alloc_data.ctx;
1117 hctx = alloc_data.hctx; 1127 hctx = alloc_data.hctx;
1118 } 1128 }
1119 1129
1120 hctx->queued++; 1130 hctx->queued++;
1121 data->hctx = hctx; 1131 data->hctx = hctx;
1122 data->ctx = ctx; 1132 data->ctx = ctx;
1123 return rq; 1133 return rq;
1124 } 1134 }
1125 1135
1126 /* 1136 /*
1127 * Multiple hardware queue variant. This will not use per-process plugs, 1137 * Multiple hardware queue variant. This will not use per-process plugs,
1128 * but will attempt to bypass the hctx queueing if we can go straight to 1138 * but will attempt to bypass the hctx queueing if we can go straight to
1129 * hardware for SYNC IO. 1139 * hardware for SYNC IO.
1130 */ 1140 */
1131 static void blk_mq_make_request(struct request_queue *q, struct bio *bio) 1141 static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1132 { 1142 {
1133 const int is_sync = rw_is_sync(bio->bi_rw); 1143 const int is_sync = rw_is_sync(bio->bi_rw);
1134 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 1144 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1135 struct blk_map_ctx data; 1145 struct blk_map_ctx data;
1136 struct request *rq; 1146 struct request *rq;
1137 1147
1138 blk_queue_bounce(q, &bio); 1148 blk_queue_bounce(q, &bio);
1139 1149
1140 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1150 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1141 bio_endio(bio, -EIO); 1151 bio_endio(bio, -EIO);
1142 return; 1152 return;
1143 } 1153 }
1144 1154
1145 rq = blk_mq_map_request(q, bio, &data); 1155 rq = blk_mq_map_request(q, bio, &data);
1146 if (unlikely(!rq)) 1156 if (unlikely(!rq))
1147 return; 1157 return;
1148 1158
1149 if (unlikely(is_flush_fua)) { 1159 if (unlikely(is_flush_fua)) {
1150 blk_mq_bio_to_request(rq, bio); 1160 blk_mq_bio_to_request(rq, bio);
1151 blk_insert_flush(rq); 1161 blk_insert_flush(rq);
1152 goto run_queue; 1162 goto run_queue;
1153 } 1163 }
1154 1164
1155 if (is_sync) { 1165 if (is_sync) {
1156 int ret; 1166 int ret;
1157 1167
1158 blk_mq_bio_to_request(rq, bio); 1168 blk_mq_bio_to_request(rq, bio);
1159 1169
1160 /* 1170 /*
1161 * For an OK return we are done; for an error, kill it. For any other 1171 * For an OK return we are done; for an error, kill it. For any other
1162 * return (busy), just add it to our list as we previously 1172 * return (busy), just add it to our list as we previously
1163 * would have done. 1173 * would have done.
1164 */ 1174 */
1165 ret = q->mq_ops->queue_rq(data.hctx, rq, true); 1175 ret = q->mq_ops->queue_rq(data.hctx, rq, true);
1166 if (ret == BLK_MQ_RQ_QUEUE_OK) 1176 if (ret == BLK_MQ_RQ_QUEUE_OK)
1167 goto done; 1177 goto done;
1168 else { 1178 else {
1169 __blk_mq_requeue_request(rq); 1179 __blk_mq_requeue_request(rq);
1170 1180
1171 if (ret == BLK_MQ_RQ_QUEUE_ERROR) { 1181 if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1172 rq->errors = -EIO; 1182 rq->errors = -EIO;
1173 blk_mq_end_request(rq, rq->errors); 1183 blk_mq_end_request(rq, rq->errors);
1174 goto done; 1184 goto done;
1175 } 1185 }
1176 } 1186 }
1177 } 1187 }
1178 1188
1179 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1189 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1180 /* 1190 /*
1181 * For a SYNC request, send it to the hardware immediately. For 1191 * For a SYNC request, send it to the hardware immediately. For
1182 * an ASYNC request, just ensure that we run it later on. The 1192 * an ASYNC request, just ensure that we run it later on. The
1183 * latter allows for merging opportunities and more efficient 1193 * latter allows for merging opportunities and more efficient
1184 * dispatching. 1194 * dispatching.
1185 */ 1195 */
1186 run_queue: 1196 run_queue:
1187 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); 1197 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1188 } 1198 }
1189 done: 1199 done:
1190 blk_mq_put_ctx(data.ctx); 1200 blk_mq_put_ctx(data.ctx);
1191 } 1201 }
1192 1202
1193 /* 1203 /*
1194 * Single hardware queue variant. This will attempt to use any per-process 1204 * Single hardware queue variant. This will attempt to use any per-process
1195 * plug for merging and IO deferral. 1205 * plug for merging and IO deferral.
1196 */ 1206 */
1197 static void blk_sq_make_request(struct request_queue *q, struct bio *bio) 1207 static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
1198 { 1208 {
1199 const int is_sync = rw_is_sync(bio->bi_rw); 1209 const int is_sync = rw_is_sync(bio->bi_rw);
1200 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 1210 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1201 unsigned int use_plug, request_count = 0; 1211 unsigned int use_plug, request_count = 0;
1202 struct blk_map_ctx data; 1212 struct blk_map_ctx data;
1203 struct request *rq; 1213 struct request *rq;
1204 1214
1205 /* 1215 /*
1206 * Only use the per-process plug for async, non-flush IO; sync and 1216 * Only use the per-process plug for async, non-flush IO; sync and
1207 * flush/FUA requests bypass the plug and go to the hardware queue. 1217 * flush/FUA requests bypass the plug and go to the hardware queue.
1208 */ 1218 */
1209 use_plug = !is_flush_fua && !is_sync; 1219 use_plug = !is_flush_fua && !is_sync;
1210 1220
1211 blk_queue_bounce(q, &bio); 1221 blk_queue_bounce(q, &bio);
1212 1222
1213 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1223 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1214 bio_endio(bio, -EIO); 1224 bio_endio(bio, -EIO);
1215 return; 1225 return;
1216 } 1226 }
1217 1227
1218 if (use_plug && !blk_queue_nomerges(q) && 1228 if (use_plug && !blk_queue_nomerges(q) &&
1219 blk_attempt_plug_merge(q, bio, &request_count)) 1229 blk_attempt_plug_merge(q, bio, &request_count))
1220 return; 1230 return;
1221 1231
1222 rq = blk_mq_map_request(q, bio, &data); 1232 rq = blk_mq_map_request(q, bio, &data);
1223 if (unlikely(!rq)) 1233 if (unlikely(!rq))
1224 return; 1234 return;
1225 1235
1226 if (unlikely(is_flush_fua)) { 1236 if (unlikely(is_flush_fua)) {
1227 blk_mq_bio_to_request(rq, bio); 1237 blk_mq_bio_to_request(rq, bio);
1228 blk_insert_flush(rq); 1238 blk_insert_flush(rq);
1229 goto run_queue; 1239 goto run_queue;
1230 } 1240 }
1231 1241
1232 /* 1242 /*
1233 * A task plug currently exists. Since this is completely lockless, 1243 * A task plug currently exists. Since this is completely lockless,
1234 * utilize that to temporarily store requests until the task is 1244 * utilize that to temporarily store requests until the task is
1235 * either done or scheduled away. 1245 * either done or scheduled away.
1236 */ 1246 */
1237 if (use_plug) { 1247 if (use_plug) {
1238 struct blk_plug *plug = current->plug; 1248 struct blk_plug *plug = current->plug;
1239 1249
1240 if (plug) { 1250 if (plug) {
1241 blk_mq_bio_to_request(rq, bio); 1251 blk_mq_bio_to_request(rq, bio);
1242 if (list_empty(&plug->mq_list)) 1252 if (list_empty(&plug->mq_list))
1243 trace_block_plug(q); 1253 trace_block_plug(q);
1244 else if (request_count >= BLK_MAX_REQUEST_COUNT) { 1254 else if (request_count >= BLK_MAX_REQUEST_COUNT) {
1245 blk_flush_plug_list(plug, false); 1255 blk_flush_plug_list(plug, false);
1246 trace_block_plug(q); 1256 trace_block_plug(q);
1247 } 1257 }
1248 list_add_tail(&rq->queuelist, &plug->mq_list); 1258 list_add_tail(&rq->queuelist, &plug->mq_list);
1249 blk_mq_put_ctx(data.ctx); 1259 blk_mq_put_ctx(data.ctx);
1250 return; 1260 return;
1251 } 1261 }
1252 } 1262 }
1253 1263
1254 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1264 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1255 /* 1265 /*
1256 * For a SYNC request, send it to the hardware immediately. For 1266 * For a SYNC request, send it to the hardware immediately. For
1257 * an ASYNC request, just ensure that we run it later on. The 1267 * an ASYNC request, just ensure that we run it later on. The
1258 * latter allows for merging opportunities and more efficient 1268 * latter allows for merging opportunities and more efficient
1259 * dispatching. 1269 * dispatching.
1260 */ 1270 */
1261 run_queue: 1271 run_queue:
1262 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); 1272 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1263 } 1273 }
1264 1274
1265 blk_mq_put_ctx(data.ctx); 1275 blk_mq_put_ctx(data.ctx);
1266 } 1276 }
1267 1277
1268 /* 1278 /*
1269 * Default mapping to a software queue, since we use one per CPU. 1279 * Default mapping to a software queue, since we use one per CPU.
1270 */ 1280 */
1271 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) 1281 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
1272 { 1282 {
1273 return q->queue_hw_ctx[q->mq_map[cpu]]; 1283 return q->queue_hw_ctx[q->mq_map[cpu]];
1274 } 1284 }
1275 EXPORT_SYMBOL(blk_mq_map_queue); 1285 EXPORT_SYMBOL(blk_mq_map_queue);
1276 1286
1277 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, 1287 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
1278 struct blk_mq_tags *tags, unsigned int hctx_idx) 1288 struct blk_mq_tags *tags, unsigned int hctx_idx)
1279 { 1289 {
1280 struct page *page; 1290 struct page *page;
1281 1291
1282 if (tags->rqs && set->ops->exit_request) { 1292 if (tags->rqs && set->ops->exit_request) {
1283 int i; 1293 int i;
1284 1294
1285 for (i = 0; i < tags->nr_tags; i++) { 1295 for (i = 0; i < tags->nr_tags; i++) {
1286 if (!tags->rqs[i]) 1296 if (!tags->rqs[i])
1287 continue; 1297 continue;
1288 set->ops->exit_request(set->driver_data, tags->rqs[i], 1298 set->ops->exit_request(set->driver_data, tags->rqs[i],
1289 hctx_idx, i); 1299 hctx_idx, i);
1290 tags->rqs[i] = NULL; 1300 tags->rqs[i] = NULL;
1291 } 1301 }
1292 } 1302 }
1293 1303
1294 while (!list_empty(&tags->page_list)) { 1304 while (!list_empty(&tags->page_list)) {
1295 page = list_first_entry(&tags->page_list, struct page, lru); 1305 page = list_first_entry(&tags->page_list, struct page, lru);
1296 list_del_init(&page->lru); 1306 list_del_init(&page->lru);
1297 __free_pages(page, page->private); 1307 __free_pages(page, page->private);
1298 } 1308 }
1299 1309
1300 kfree(tags->rqs); 1310 kfree(tags->rqs);
1301 1311
1302 blk_mq_free_tags(tags); 1312 blk_mq_free_tags(tags);
1303 } 1313 }
1304 1314
1305 static size_t order_to_size(unsigned int order) 1315 static size_t order_to_size(unsigned int order)
1306 { 1316 {
1307 return (size_t)PAGE_SIZE << order; 1317 return (size_t)PAGE_SIZE << order;
1308 } 1318 }
1309 1319
1310 static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, 1320 static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1311 unsigned int hctx_idx) 1321 unsigned int hctx_idx)
1312 { 1322 {
1313 struct blk_mq_tags *tags; 1323 struct blk_mq_tags *tags;
1314 unsigned int i, j, entries_per_page, max_order = 4; 1324 unsigned int i, j, entries_per_page, max_order = 4;
1315 size_t rq_size, left; 1325 size_t rq_size, left;
1316 1326
1317 tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, 1327 tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
1318 set->numa_node); 1328 set->numa_node);
1319 if (!tags) 1329 if (!tags)
1320 return NULL; 1330 return NULL;
1321 1331
1322 INIT_LIST_HEAD(&tags->page_list); 1332 INIT_LIST_HEAD(&tags->page_list);
1323 1333
1324 tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), 1334 tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
1325 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, 1335 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
1326 set->numa_node); 1336 set->numa_node);
1327 if (!tags->rqs) { 1337 if (!tags->rqs) {
1328 blk_mq_free_tags(tags); 1338 blk_mq_free_tags(tags);
1329 return NULL; 1339 return NULL;
1330 } 1340 }
1331 1341
1332 /* 1342 /*
1333 * rq_size is the size of the request plus driver payload, rounded 1343 * rq_size is the size of the request plus driver payload, rounded
1334 * to the cacheline size 1344 * to the cacheline size
1335 */ 1345 */
1336 rq_size = round_up(sizeof(struct request) + set->cmd_size, 1346 rq_size = round_up(sizeof(struct request) + set->cmd_size,
1337 cache_line_size()); 1347 cache_line_size());
1338 left = rq_size * set->queue_depth; 1348 left = rq_size * set->queue_depth;
1339 1349
1340 for (i = 0; i < set->queue_depth; ) { 1350 for (i = 0; i < set->queue_depth; ) {
1341 int this_order = max_order; 1351 int this_order = max_order;
1342 struct page *page; 1352 struct page *page;
1343 int to_do; 1353 int to_do;
1344 void *p; 1354 void *p;
1345 1355
1346 while (left < order_to_size(this_order - 1) && this_order) 1356 while (left < order_to_size(this_order - 1) && this_order)
1347 this_order--; 1357 this_order--;
1348 1358
1349 do { 1359 do {
1350 page = alloc_pages_node(set->numa_node, 1360 page = alloc_pages_node(set->numa_node,
1351 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, 1361 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
1352 this_order); 1362 this_order);
1353 if (page) 1363 if (page)
1354 break; 1364 break;
1355 if (!this_order--) 1365 if (!this_order--)
1356 break; 1366 break;
1357 if (order_to_size(this_order) < rq_size) 1367 if (order_to_size(this_order) < rq_size)
1358 break; 1368 break;
1359 } while (1); 1369 } while (1);
1360 1370
1361 if (!page) 1371 if (!page)
1362 goto fail; 1372 goto fail;
1363 1373
1364 page->private = this_order; 1374 page->private = this_order;
1365 list_add_tail(&page->lru, &tags->page_list); 1375 list_add_tail(&page->lru, &tags->page_list);
1366 1376
1367 p = page_address(page); 1377 p = page_address(page);
1368 entries_per_page = order_to_size(this_order) / rq_size; 1378 entries_per_page = order_to_size(this_order) / rq_size;
1369 to_do = min(entries_per_page, set->queue_depth - i); 1379 to_do = min(entries_per_page, set->queue_depth - i);
1370 left -= to_do * rq_size; 1380 left -= to_do * rq_size;
1371 for (j = 0; j < to_do; j++) { 1381 for (j = 0; j < to_do; j++) {
1372 tags->rqs[i] = p; 1382 tags->rqs[i] = p;
1373 tags->rqs[i]->atomic_flags = 0; 1383 tags->rqs[i]->atomic_flags = 0;
1374 tags->rqs[i]->cmd_flags = 0; 1384 tags->rqs[i]->cmd_flags = 0;
1375 if (set->ops->init_request) { 1385 if (set->ops->init_request) {
1376 if (set->ops->init_request(set->driver_data, 1386 if (set->ops->init_request(set->driver_data,
1377 tags->rqs[i], hctx_idx, i, 1387 tags->rqs[i], hctx_idx, i,
1378 set->numa_node)) { 1388 set->numa_node)) {
1379 tags->rqs[i] = NULL; 1389 tags->rqs[i] = NULL;
1380 goto fail; 1390 goto fail;
1381 } 1391 }
1382 } 1392 }
1383 1393
1384 p += rq_size; 1394 p += rq_size;
1385 i++; 1395 i++;
1386 } 1396 }
1387 } 1397 }
1388 1398
1389 return tags; 1399 return tags;
1390 1400
1391 fail: 1401 fail:
1392 blk_mq_free_rq_map(set, tags, hctx_idx); 1402 blk_mq_free_rq_map(set, tags, hctx_idx);
1393 return NULL; 1403 return NULL;
1394 } 1404 }
1395 1405
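Worked example with hypothetical numbers, not from this patch: if sizeof(struct request) + cmd_size rounds up to 512 bytes and queue_depth is 128, left starts at 64 KiB; the inner while loop leaves this_order at 4 (64 KiB is not smaller than the 32 KiB an order-3 allocation would provide), and a single order-4 block holds 65536 / 512 = 128 requests, so the whole map fits in one allocation. When an allocation fails, the code retries at progressively lower orders until order_to_size(this_order) would no longer hold even a single rq_size.
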
1396 static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) 1406 static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
1397 { 1407 {
1398 kfree(bitmap->map); 1408 kfree(bitmap->map);
1399 } 1409 }
1400 1410
1401 static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) 1411 static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
1402 { 1412 {
1403 unsigned int bpw = 8, total, num_maps, i; 1413 unsigned int bpw = 8, total, num_maps, i;
1404 1414
1405 bitmap->bits_per_word = bpw; 1415 bitmap->bits_per_word = bpw;
1406 1416
1407 num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; 1417 num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
1408 bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), 1418 bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
1409 GFP_KERNEL, node); 1419 GFP_KERNEL, node);
1410 if (!bitmap->map) 1420 if (!bitmap->map)
1411 return -ENOMEM; 1421 return -ENOMEM;
1412 1422
1413 bitmap->map_size = num_maps; 1423 bitmap->map_size = num_maps;
1414 1424
1415 total = nr_cpu_ids; 1425 total = nr_cpu_ids;
1416 for (i = 0; i < num_maps; i++) { 1426 for (i = 0; i < num_maps; i++) {
1417 bitmap->map[i].depth = min(total, bitmap->bits_per_word); 1427 bitmap->map[i].depth = min(total, bitmap->bits_per_word);
1418 total -= bitmap->map[i].depth; 1428 total -= bitmap->map[i].depth;
1419 } 1429 }
1420 1430
1421 return 0; 1431 return 0;
1422 } 1432 }
1423 1433
1424 static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) 1434 static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
1425 { 1435 {
1426 struct request_queue *q = hctx->queue; 1436 struct request_queue *q = hctx->queue;
1427 struct blk_mq_ctx *ctx; 1437 struct blk_mq_ctx *ctx;
1428 LIST_HEAD(tmp); 1438 LIST_HEAD(tmp);
1429 1439
1430 /* 1440 /*
1431 * Move ctx entries to new CPU, if this one is going away. 1441 * Move ctx entries to new CPU, if this one is going away.
1432 */ 1442 */
1433 ctx = __blk_mq_get_ctx(q, cpu); 1443 ctx = __blk_mq_get_ctx(q, cpu);
1434 1444
1435 spin_lock(&ctx->lock); 1445 spin_lock(&ctx->lock);
1436 if (!list_empty(&ctx->rq_list)) { 1446 if (!list_empty(&ctx->rq_list)) {
1437 list_splice_init(&ctx->rq_list, &tmp); 1447 list_splice_init(&ctx->rq_list, &tmp);
1438 blk_mq_hctx_clear_pending(hctx, ctx); 1448 blk_mq_hctx_clear_pending(hctx, ctx);
1439 } 1449 }
1440 spin_unlock(&ctx->lock); 1450 spin_unlock(&ctx->lock);
1441 1451
1442 if (list_empty(&tmp)) 1452 if (list_empty(&tmp))
1443 return NOTIFY_OK; 1453 return NOTIFY_OK;
1444 1454
1445 ctx = blk_mq_get_ctx(q); 1455 ctx = blk_mq_get_ctx(q);
1446 spin_lock(&ctx->lock); 1456 spin_lock(&ctx->lock);
1447 1457
1448 while (!list_empty(&tmp)) { 1458 while (!list_empty(&tmp)) {
1449 struct request *rq; 1459 struct request *rq;
1450 1460
1451 rq = list_first_entry(&tmp, struct request, queuelist); 1461 rq = list_first_entry(&tmp, struct request, queuelist);
1452 rq->mq_ctx = ctx; 1462 rq->mq_ctx = ctx;
1453 list_move_tail(&rq->queuelist, &ctx->rq_list); 1463 list_move_tail(&rq->queuelist, &ctx->rq_list);
1454 } 1464 }
1455 1465
1456 hctx = q->mq_ops->map_queue(q, ctx->cpu); 1466 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1457 blk_mq_hctx_mark_pending(hctx, ctx); 1467 blk_mq_hctx_mark_pending(hctx, ctx);
1458 1468
1459 spin_unlock(&ctx->lock); 1469 spin_unlock(&ctx->lock);
1460 1470
1461 blk_mq_run_hw_queue(hctx, true); 1471 blk_mq_run_hw_queue(hctx, true);
1462 blk_mq_put_ctx(ctx); 1472 blk_mq_put_ctx(ctx);
1463 return NOTIFY_OK; 1473 return NOTIFY_OK;
1464 } 1474 }
1465 1475
1466 static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu) 1476 static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
1467 { 1477 {
1468 struct request_queue *q = hctx->queue; 1478 struct request_queue *q = hctx->queue;
1469 struct blk_mq_tag_set *set = q->tag_set; 1479 struct blk_mq_tag_set *set = q->tag_set;
1470 1480
1471 if (set->tags[hctx->queue_num]) 1481 if (set->tags[hctx->queue_num])
1472 return NOTIFY_OK; 1482 return NOTIFY_OK;
1473 1483
1474 set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num); 1484 set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
1475 if (!set->tags[hctx->queue_num]) 1485 if (!set->tags[hctx->queue_num])
1476 return NOTIFY_STOP; 1486 return NOTIFY_STOP;
1477 1487
1478 hctx->tags = set->tags[hctx->queue_num]; 1488 hctx->tags = set->tags[hctx->queue_num];
1479 return NOTIFY_OK; 1489 return NOTIFY_OK;
1480 } 1490 }
1481 1491
1482 static int blk_mq_hctx_notify(void *data, unsigned long action, 1492 static int blk_mq_hctx_notify(void *data, unsigned long action,
1483 unsigned int cpu) 1493 unsigned int cpu)
1484 { 1494 {
1485 struct blk_mq_hw_ctx *hctx = data; 1495 struct blk_mq_hw_ctx *hctx = data;
1486 1496
1487 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) 1497 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1488 return blk_mq_hctx_cpu_offline(hctx, cpu); 1498 return blk_mq_hctx_cpu_offline(hctx, cpu);
1489 else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) 1499 else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
1490 return blk_mq_hctx_cpu_online(hctx, cpu); 1500 return blk_mq_hctx_cpu_online(hctx, cpu);
1491 1501
1492 return NOTIFY_OK; 1502 return NOTIFY_OK;
1493 } 1503 }
1494 1504
1495 static void blk_mq_exit_hctx(struct request_queue *q, 1505 static void blk_mq_exit_hctx(struct request_queue *q,
1496 struct blk_mq_tag_set *set, 1506 struct blk_mq_tag_set *set,
1497 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 1507 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
1498 { 1508 {
1499 unsigned flush_start_tag = set->queue_depth; 1509 unsigned flush_start_tag = set->queue_depth;
1500 1510
1501 blk_mq_tag_idle(hctx); 1511 blk_mq_tag_idle(hctx);
1502 1512
1503 if (set->ops->exit_request) 1513 if (set->ops->exit_request)
1504 set->ops->exit_request(set->driver_data, 1514 set->ops->exit_request(set->driver_data,
1505 hctx->fq->flush_rq, hctx_idx, 1515 hctx->fq->flush_rq, hctx_idx,
1506 flush_start_tag + hctx_idx); 1516 flush_start_tag + hctx_idx);
1507 1517
1508 if (set->ops->exit_hctx) 1518 if (set->ops->exit_hctx)
1509 set->ops->exit_hctx(hctx, hctx_idx); 1519 set->ops->exit_hctx(hctx, hctx_idx);
1510 1520
1511 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1521 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1512 blk_free_flush_queue(hctx->fq); 1522 blk_free_flush_queue(hctx->fq);
1513 kfree(hctx->ctxs); 1523 kfree(hctx->ctxs);
1514 blk_mq_free_bitmap(&hctx->ctx_map); 1524 blk_mq_free_bitmap(&hctx->ctx_map);
1515 } 1525 }
1516 1526
1517 static void blk_mq_exit_hw_queues(struct request_queue *q, 1527 static void blk_mq_exit_hw_queues(struct request_queue *q,
1518 struct blk_mq_tag_set *set, int nr_queue) 1528 struct blk_mq_tag_set *set, int nr_queue)
1519 { 1529 {
1520 struct blk_mq_hw_ctx *hctx; 1530 struct blk_mq_hw_ctx *hctx;
1521 unsigned int i; 1531 unsigned int i;
1522 1532
1523 queue_for_each_hw_ctx(q, hctx, i) { 1533 queue_for_each_hw_ctx(q, hctx, i) {
1524 if (i == nr_queue) 1534 if (i == nr_queue)
1525 break; 1535 break;
1526 blk_mq_exit_hctx(q, set, hctx, i); 1536 blk_mq_exit_hctx(q, set, hctx, i);
1527 } 1537 }
1528 } 1538 }
1529 1539
1530 static void blk_mq_free_hw_queues(struct request_queue *q, 1540 static void blk_mq_free_hw_queues(struct request_queue *q,
1531 struct blk_mq_tag_set *set) 1541 struct blk_mq_tag_set *set)
1532 { 1542 {
1533 struct blk_mq_hw_ctx *hctx; 1543 struct blk_mq_hw_ctx *hctx;
1534 unsigned int i; 1544 unsigned int i;
1535 1545
1536 queue_for_each_hw_ctx(q, hctx, i) { 1546 queue_for_each_hw_ctx(q, hctx, i) {
1537 free_cpumask_var(hctx->cpumask); 1547 free_cpumask_var(hctx->cpumask);
1538 kfree(hctx); 1548 kfree(hctx);
1539 } 1549 }
1540 } 1550 }
1541 1551
1542 static int blk_mq_init_hctx(struct request_queue *q, 1552 static int blk_mq_init_hctx(struct request_queue *q,
1543 struct blk_mq_tag_set *set, 1553 struct blk_mq_tag_set *set,
1544 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) 1554 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
1545 { 1555 {
1546 int node; 1556 int node;
1547 unsigned flush_start_tag = set->queue_depth; 1557 unsigned flush_start_tag = set->queue_depth;
1548 1558
1549 node = hctx->numa_node; 1559 node = hctx->numa_node;
1550 if (node == NUMA_NO_NODE) 1560 if (node == NUMA_NO_NODE)
1551 node = hctx->numa_node = set->numa_node; 1561 node = hctx->numa_node = set->numa_node;
1552 1562
1553 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); 1563 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1554 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); 1564 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1555 spin_lock_init(&hctx->lock); 1565 spin_lock_init(&hctx->lock);
1556 INIT_LIST_HEAD(&hctx->dispatch); 1566 INIT_LIST_HEAD(&hctx->dispatch);
1557 hctx->queue = q; 1567 hctx->queue = q;
1558 hctx->queue_num = hctx_idx; 1568 hctx->queue_num = hctx_idx;
1559 hctx->flags = set->flags; 1569 hctx->flags = set->flags;
1560 hctx->cmd_size = set->cmd_size; 1570 hctx->cmd_size = set->cmd_size;
1561 1571
1562 blk_mq_init_cpu_notifier(&hctx->cpu_notifier, 1572 blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1563 blk_mq_hctx_notify, hctx); 1573 blk_mq_hctx_notify, hctx);
1564 blk_mq_register_cpu_notifier(&hctx->cpu_notifier); 1574 blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1565 1575
1566 hctx->tags = set->tags[hctx_idx]; 1576 hctx->tags = set->tags[hctx_idx];
1567 1577
1568 /* 1578 /*
1569 * Allocate space for all possible cpus to avoid allocation at 1579 * Allocate space for all possible cpus to avoid allocation at
1570 * runtime 1580 * runtime
1571 */ 1581 */
1572 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), 1582 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1573 GFP_KERNEL, node); 1583 GFP_KERNEL, node);
1574 if (!hctx->ctxs) 1584 if (!hctx->ctxs)
1575 goto unregister_cpu_notifier; 1585 goto unregister_cpu_notifier;
1576 1586
1577 if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) 1587 if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
1578 goto free_ctxs; 1588 goto free_ctxs;
1579 1589
1580 hctx->nr_ctx = 0; 1590 hctx->nr_ctx = 0;
1581 1591
1582 if (set->ops->init_hctx && 1592 if (set->ops->init_hctx &&
1583 set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) 1593 set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
1584 goto free_bitmap; 1594 goto free_bitmap;
1585 1595
1586 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); 1596 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
1587 if (!hctx->fq) 1597 if (!hctx->fq)
1588 goto exit_hctx; 1598 goto exit_hctx;
1589 1599
1590 if (set->ops->init_request && 1600 if (set->ops->init_request &&
1591 set->ops->init_request(set->driver_data, 1601 set->ops->init_request(set->driver_data,
1592 hctx->fq->flush_rq, hctx_idx, 1602 hctx->fq->flush_rq, hctx_idx,
1593 flush_start_tag + hctx_idx, node)) 1603 flush_start_tag + hctx_idx, node))
1594 goto free_fq; 1604 goto free_fq;
1595 1605
1596 return 0; 1606 return 0;
1597 1607
1598 free_fq: 1608 free_fq:
1599 kfree(hctx->fq); 1609 kfree(hctx->fq);
1600 exit_hctx: 1610 exit_hctx:
1601 if (set->ops->exit_hctx) 1611 if (set->ops->exit_hctx)
1602 set->ops->exit_hctx(hctx, hctx_idx); 1612 set->ops->exit_hctx(hctx, hctx_idx);
1603 free_bitmap: 1613 free_bitmap:
1604 blk_mq_free_bitmap(&hctx->ctx_map); 1614 blk_mq_free_bitmap(&hctx->ctx_map);
1605 free_ctxs: 1615 free_ctxs:
1606 kfree(hctx->ctxs); 1616 kfree(hctx->ctxs);
1607 unregister_cpu_notifier: 1617 unregister_cpu_notifier:
1608 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1618 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1609 1619
1610 return -1; 1620 return -1;
1611 } 1621 }
1612 1622
1613 static int blk_mq_init_hw_queues(struct request_queue *q, 1623 static int blk_mq_init_hw_queues(struct request_queue *q,
1614 struct blk_mq_tag_set *set) 1624 struct blk_mq_tag_set *set)
1615 { 1625 {
1616 struct blk_mq_hw_ctx *hctx; 1626 struct blk_mq_hw_ctx *hctx;
1617 unsigned int i; 1627 unsigned int i;
1618 1628
1619 /* 1629 /*
1620 * Initialize hardware queues 1630 * Initialize hardware queues
1621 */ 1631 */
1622 queue_for_each_hw_ctx(q, hctx, i) { 1632 queue_for_each_hw_ctx(q, hctx, i) {
1623 if (blk_mq_init_hctx(q, set, hctx, i)) 1633 if (blk_mq_init_hctx(q, set, hctx, i))
1624 break; 1634 break;
1625 } 1635 }
1626 1636
1627 if (i == q->nr_hw_queues) 1637 if (i == q->nr_hw_queues)
1628 return 0; 1638 return 0;
1629 1639
1630 /* 1640 /*
1631 * Init failed 1641 * Init failed
1632 */ 1642 */
1633 blk_mq_exit_hw_queues(q, set, i); 1643 blk_mq_exit_hw_queues(q, set, i);
1634 1644
1635 return 1; 1645 return 1;
1636 } 1646 }
1637 1647
1638 static void blk_mq_init_cpu_queues(struct request_queue *q, 1648 static void blk_mq_init_cpu_queues(struct request_queue *q,
1639 unsigned int nr_hw_queues) 1649 unsigned int nr_hw_queues)
1640 { 1650 {
1641 unsigned int i; 1651 unsigned int i;
1642 1652
1643 for_each_possible_cpu(i) { 1653 for_each_possible_cpu(i) {
1644 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 1654 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1645 struct blk_mq_hw_ctx *hctx; 1655 struct blk_mq_hw_ctx *hctx;
1646 1656
1647 memset(__ctx, 0, sizeof(*__ctx)); 1657 memset(__ctx, 0, sizeof(*__ctx));
1648 __ctx->cpu = i; 1658 __ctx->cpu = i;
1649 spin_lock_init(&__ctx->lock); 1659 spin_lock_init(&__ctx->lock);
1650 INIT_LIST_HEAD(&__ctx->rq_list); 1660 INIT_LIST_HEAD(&__ctx->rq_list);
1651 __ctx->queue = q; 1661 __ctx->queue = q;
1652 1662
1653 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1663 /* If the cpu isn't online, the cpu is mapped to first hctx */
1654 if (!cpu_online(i)) 1664 if (!cpu_online(i))
1655 continue; 1665 continue;
1656 1666
1657 hctx = q->mq_ops->map_queue(q, i); 1667 hctx = q->mq_ops->map_queue(q, i);
1658 cpumask_set_cpu(i, hctx->cpumask); 1668 cpumask_set_cpu(i, hctx->cpumask);
1659 hctx->nr_ctx++; 1669 hctx->nr_ctx++;
1660 1670
1661 /* 1671 /*
1662 * Set local node, IFF we have more than one hw queue. If 1672 * Set local node, IFF we have more than one hw queue. If
1663 * not, we remain on the home node of the device 1673 * not, we remain on the home node of the device
1664 */ 1674 */
1665 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 1675 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1666 hctx->numa_node = cpu_to_node(i); 1676 hctx->numa_node = cpu_to_node(i);
1667 } 1677 }
1668 } 1678 }
1669 1679
1670 static void blk_mq_map_swqueue(struct request_queue *q) 1680 static void blk_mq_map_swqueue(struct request_queue *q)
1671 { 1681 {
1672 unsigned int i; 1682 unsigned int i;
1673 struct blk_mq_hw_ctx *hctx; 1683 struct blk_mq_hw_ctx *hctx;
1674 struct blk_mq_ctx *ctx; 1684 struct blk_mq_ctx *ctx;
1675 1685
1676 queue_for_each_hw_ctx(q, hctx, i) { 1686 queue_for_each_hw_ctx(q, hctx, i) {
1677 cpumask_clear(hctx->cpumask); 1687 cpumask_clear(hctx->cpumask);
1678 hctx->nr_ctx = 0; 1688 hctx->nr_ctx = 0;
1679 } 1689 }
1680 1690
1681 /* 1691 /*
1682 * Map software to hardware queues 1692 * Map software to hardware queues
1683 */ 1693 */
1684 queue_for_each_ctx(q, ctx, i) { 1694 queue_for_each_ctx(q, ctx, i) {
1685 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1695 /* If the cpu isn't online, the cpu is mapped to first hctx */
1686 if (!cpu_online(i)) 1696 if (!cpu_online(i))
1687 continue; 1697 continue;
1688 1698
1689 hctx = q->mq_ops->map_queue(q, i); 1699 hctx = q->mq_ops->map_queue(q, i);
1690 cpumask_set_cpu(i, hctx->cpumask); 1700 cpumask_set_cpu(i, hctx->cpumask);
1691 ctx->index_hw = hctx->nr_ctx; 1701 ctx->index_hw = hctx->nr_ctx;
1692 hctx->ctxs[hctx->nr_ctx++] = ctx; 1702 hctx->ctxs[hctx->nr_ctx++] = ctx;
1693 } 1703 }
1694 1704
1695 queue_for_each_hw_ctx(q, hctx, i) { 1705 queue_for_each_hw_ctx(q, hctx, i) {
1696 /* 1706 /*
1697 * If no software queues are mapped to this hardware queue, 1707 * If no software queues are mapped to this hardware queue,
1698 * disable it and free the request entries. 1708 * disable it and free the request entries.
1699 */ 1709 */
1700 if (!hctx->nr_ctx) { 1710 if (!hctx->nr_ctx) {
1701 struct blk_mq_tag_set *set = q->tag_set; 1711 struct blk_mq_tag_set *set = q->tag_set;
1702 1712
1703 if (set->tags[i]) { 1713 if (set->tags[i]) {
1704 blk_mq_free_rq_map(set, set->tags[i], i); 1714 blk_mq_free_rq_map(set, set->tags[i], i);
1705 set->tags[i] = NULL; 1715 set->tags[i] = NULL;
1706 hctx->tags = NULL; 1716 hctx->tags = NULL;
1707 } 1717 }
1708 continue; 1718 continue;
1709 } 1719 }
1710 1720
1711 /* 1721 /*
1712 * Initialize batch round-robin counts 1722 * Initialize batch round-robin counts
1713 */ 1723 */
1714 hctx->next_cpu = cpumask_first(hctx->cpumask); 1724 hctx->next_cpu = cpumask_first(hctx->cpumask);
1715 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 1725 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1716 } 1726 }
1717 } 1727 }
1718 1728
1719 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) 1729 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
1720 { 1730 {
1721 struct blk_mq_hw_ctx *hctx; 1731 struct blk_mq_hw_ctx *hctx;
1722 struct request_queue *q; 1732 struct request_queue *q;
1723 bool shared; 1733 bool shared;
1724 int i; 1734 int i;
1725 1735
1726 if (set->tag_list.next == set->tag_list.prev) 1736 if (set->tag_list.next == set->tag_list.prev)
1727 shared = false; 1737 shared = false;
1728 else 1738 else
1729 shared = true; 1739 shared = true;
1730 1740
1731 list_for_each_entry(q, &set->tag_list, tag_set_list) { 1741 list_for_each_entry(q, &set->tag_list, tag_set_list) {
1732 blk_mq_freeze_queue(q); 1742 blk_mq_freeze_queue(q);
1733 1743
1734 queue_for_each_hw_ctx(q, hctx, i) { 1744 queue_for_each_hw_ctx(q, hctx, i) {
1735 if (shared) 1745 if (shared)
1736 hctx->flags |= BLK_MQ_F_TAG_SHARED; 1746 hctx->flags |= BLK_MQ_F_TAG_SHARED;
1737 else 1747 else
1738 hctx->flags &= ~BLK_MQ_F_TAG_SHARED; 1748 hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
1739 } 1749 }
1740 blk_mq_unfreeze_queue(q); 1750 blk_mq_unfreeze_queue(q);
1741 } 1751 }
1742 } 1752 }
1743 1753
1744 static void blk_mq_del_queue_tag_set(struct request_queue *q) 1754 static void blk_mq_del_queue_tag_set(struct request_queue *q)
1745 { 1755 {
1746 struct blk_mq_tag_set *set = q->tag_set; 1756 struct blk_mq_tag_set *set = q->tag_set;
1747 1757
1748 mutex_lock(&set->tag_list_lock); 1758 mutex_lock(&set->tag_list_lock);
1749 list_del_init(&q->tag_set_list); 1759 list_del_init(&q->tag_set_list);
1750 blk_mq_update_tag_set_depth(set); 1760 blk_mq_update_tag_set_depth(set);
1751 mutex_unlock(&set->tag_list_lock); 1761 mutex_unlock(&set->tag_list_lock);
1752 } 1762 }
1753 1763
1754 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 1764 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
1755 struct request_queue *q) 1765 struct request_queue *q)
1756 { 1766 {
1757 q->tag_set = set; 1767 q->tag_set = set;
1758 1768
1759 mutex_lock(&set->tag_list_lock); 1769 mutex_lock(&set->tag_list_lock);
1760 list_add_tail(&q->tag_set_list, &set->tag_list); 1770 list_add_tail(&q->tag_set_list, &set->tag_list);
1761 blk_mq_update_tag_set_depth(set); 1771 blk_mq_update_tag_set_depth(set);
1762 mutex_unlock(&set->tag_list_lock); 1772 mutex_unlock(&set->tag_list_lock);
1763 } 1773 }
1764 1774
1765 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) 1775 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
1766 { 1776 {
1767 struct blk_mq_hw_ctx **hctxs; 1777 struct blk_mq_hw_ctx **hctxs;
1768 struct blk_mq_ctx __percpu *ctx; 1778 struct blk_mq_ctx __percpu *ctx;
1769 struct request_queue *q; 1779 struct request_queue *q;
1770 unsigned int *map; 1780 unsigned int *map;
1771 int i; 1781 int i;
1772 1782
1773 ctx = alloc_percpu(struct blk_mq_ctx); 1783 ctx = alloc_percpu(struct blk_mq_ctx);
1774 if (!ctx) 1784 if (!ctx)
1775 return ERR_PTR(-ENOMEM); 1785 return ERR_PTR(-ENOMEM);
1776 1786
1777 /* 1787 /*
1778 * If a crashdump is active, then we are potentially in a very 1788 * If a crashdump is active, then we are potentially in a very
1779 * memory constrained environment. Limit us to 1 queue and 1789 * memory constrained environment. Limit us to 1 queue and
1780 * 64 tags to prevent using too much memory. 1790 * 64 tags to prevent using too much memory.
1781 */ 1791 */
1782 if (is_kdump_kernel()) { 1792 if (is_kdump_kernel()) {
1783 set->nr_hw_queues = 1; 1793 set->nr_hw_queues = 1;
1784 set->queue_depth = min(64U, set->queue_depth); 1794 set->queue_depth = min(64U, set->queue_depth);
1785 } 1795 }
1786 1796
1787 hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, 1797 hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
1788 set->numa_node); 1798 set->numa_node);
1789 1799
1790 if (!hctxs) 1800 if (!hctxs)
1791 goto err_percpu; 1801 goto err_percpu;
1792 1802
1793 map = blk_mq_make_queue_map(set); 1803 map = blk_mq_make_queue_map(set);
1794 if (!map) 1804 if (!map)
1795 goto err_map; 1805 goto err_map;
1796 1806
1797 for (i = 0; i < set->nr_hw_queues; i++) { 1807 for (i = 0; i < set->nr_hw_queues; i++) {
1798 int node = blk_mq_hw_queue_to_node(map, i); 1808 int node = blk_mq_hw_queue_to_node(map, i);
1799 1809
1800 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), 1810 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
1801 GFP_KERNEL, node); 1811 GFP_KERNEL, node);
1802 if (!hctxs[i]) 1812 if (!hctxs[i])
1803 goto err_hctxs; 1813 goto err_hctxs;
1804 1814
1805 if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, 1815 if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
1806 node)) 1816 node))
1807 goto err_hctxs; 1817 goto err_hctxs;
1808 1818
1809 atomic_set(&hctxs[i]->nr_active, 0); 1819 atomic_set(&hctxs[i]->nr_active, 0);
1810 hctxs[i]->numa_node = node; 1820 hctxs[i]->numa_node = node;
1811 hctxs[i]->queue_num = i; 1821 hctxs[i]->queue_num = i;
1812 } 1822 }
1813 1823
1814 q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); 1824 q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
1815 if (!q) 1825 if (!q)
1816 goto err_hctxs; 1826 goto err_hctxs;
1817 1827
1818 /* 1828 /*
1819 * Init percpu_ref in atomic mode so that it's faster to shutdown. 1829 * Init percpu_ref in atomic mode so that it's faster to shutdown.
1820 * See blk_register_queue() for details. 1830 * See blk_register_queue() for details.
1821 */ 1831 */
1822 if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, 1832 if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
1823 PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) 1833 PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
1824 goto err_map; 1834 goto err_map;
1825 1835
1826 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); 1836 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
1827 blk_queue_rq_timeout(q, 30000); 1837 blk_queue_rq_timeout(q, 30000);
1828 1838
1829 q->nr_queues = nr_cpu_ids; 1839 q->nr_queues = nr_cpu_ids;
1830 q->nr_hw_queues = set->nr_hw_queues; 1840 q->nr_hw_queues = set->nr_hw_queues;
1831 q->mq_map = map; 1841 q->mq_map = map;
1832 1842
1833 q->queue_ctx = ctx; 1843 q->queue_ctx = ctx;
1834 q->queue_hw_ctx = hctxs; 1844 q->queue_hw_ctx = hctxs;
1835 1845
1836 q->mq_ops = set->ops; 1846 q->mq_ops = set->ops;
1837 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 1847 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
1838 1848
1839 if (!(set->flags & BLK_MQ_F_SG_MERGE)) 1849 if (!(set->flags & BLK_MQ_F_SG_MERGE))
1840 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE; 1850 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
1841 1851
1842 q->sg_reserved_size = INT_MAX; 1852 q->sg_reserved_size = INT_MAX;
1843 1853
1844 INIT_WORK(&q->requeue_work, blk_mq_requeue_work); 1854 INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
1845 INIT_LIST_HEAD(&q->requeue_list); 1855 INIT_LIST_HEAD(&q->requeue_list);
1846 spin_lock_init(&q->requeue_lock); 1856 spin_lock_init(&q->requeue_lock);
1847 1857
1848 if (q->nr_hw_queues > 1) 1858 if (q->nr_hw_queues > 1)
1849 blk_queue_make_request(q, blk_mq_make_request); 1859 blk_queue_make_request(q, blk_mq_make_request);
1850 else 1860 else
1851 blk_queue_make_request(q, blk_sq_make_request); 1861 blk_queue_make_request(q, blk_sq_make_request);
1852 1862
1853 if (set->timeout) 1863 if (set->timeout)
1854 blk_queue_rq_timeout(q, set->timeout); 1864 blk_queue_rq_timeout(q, set->timeout);
1855 1865
1856 /* 1866 /*
1857 * Do this after blk_queue_make_request() overrides it... 1867 * Do this after blk_queue_make_request() overrides it...
1858 */ 1868 */
1859 q->nr_requests = set->queue_depth; 1869 q->nr_requests = set->queue_depth;
1860 1870
1861 if (set->ops->complete) 1871 if (set->ops->complete)
1862 blk_queue_softirq_done(q, set->ops->complete); 1872 blk_queue_softirq_done(q, set->ops->complete);
1863 1873
1864 blk_mq_init_cpu_queues(q, set->nr_hw_queues); 1874 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
1865 1875
1866 if (blk_mq_init_hw_queues(q, set)) 1876 if (blk_mq_init_hw_queues(q, set))
1867 goto err_hw; 1877 goto err_hw;
1868 1878
1869 mutex_lock(&all_q_mutex); 1879 mutex_lock(&all_q_mutex);
1870 list_add_tail(&q->all_q_node, &all_q_list); 1880 list_add_tail(&q->all_q_node, &all_q_list);
1871 mutex_unlock(&all_q_mutex); 1881 mutex_unlock(&all_q_mutex);
1872 1882
1873 blk_mq_add_queue_tag_set(set, q); 1883 blk_mq_add_queue_tag_set(set, q);
1874 1884
1875 blk_mq_map_swqueue(q); 1885 blk_mq_map_swqueue(q);
1876 1886
1877 return q; 1887 return q;
1878 1888
1879 err_hw: 1889 err_hw:
1880 blk_cleanup_queue(q); 1890 blk_cleanup_queue(q);
1881 err_hctxs: 1891 err_hctxs:
1882 kfree(map); 1892 kfree(map);
1883 for (i = 0; i < set->nr_hw_queues; i++) { 1893 for (i = 0; i < set->nr_hw_queues; i++) {
1884 if (!hctxs[i]) 1894 if (!hctxs[i])
1885 break; 1895 break;
1886 free_cpumask_var(hctxs[i]->cpumask); 1896 free_cpumask_var(hctxs[i]->cpumask);
1887 kfree(hctxs[i]); 1897 kfree(hctxs[i]);
1888 } 1898 }
1889 err_map: 1899 err_map:
1890 kfree(hctxs); 1900 kfree(hctxs);
1891 err_percpu: 1901 err_percpu:
1892 free_percpu(ctx); 1902 free_percpu(ctx);
1893 return ERR_PTR(-ENOMEM); 1903 return ERR_PTR(-ENOMEM);
1894 } 1904 }
1895 EXPORT_SYMBOL(blk_mq_init_queue); 1905 EXPORT_SYMBOL(blk_mq_init_queue);
1896 1906
1897 void blk_mq_free_queue(struct request_queue *q) 1907 void blk_mq_free_queue(struct request_queue *q)
1898 { 1908 {
1899 struct blk_mq_tag_set *set = q->tag_set; 1909 struct blk_mq_tag_set *set = q->tag_set;
1900 1910
1901 blk_mq_del_queue_tag_set(q); 1911 blk_mq_del_queue_tag_set(q);
1902 1912
1903 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); 1913 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
1904 blk_mq_free_hw_queues(q, set); 1914 blk_mq_free_hw_queues(q, set);
1905 1915
1906 percpu_ref_exit(&q->mq_usage_counter); 1916 percpu_ref_exit(&q->mq_usage_counter);
1907 1917
1908 free_percpu(q->queue_ctx); 1918 free_percpu(q->queue_ctx);
1909 kfree(q->queue_hw_ctx); 1919 kfree(q->queue_hw_ctx);
1910 kfree(q->mq_map); 1920 kfree(q->mq_map);
1911 1921
1912 q->queue_ctx = NULL; 1922 q->queue_ctx = NULL;
1913 q->queue_hw_ctx = NULL; 1923 q->queue_hw_ctx = NULL;
1914 q->mq_map = NULL; 1924 q->mq_map = NULL;
1915 1925
1916 mutex_lock(&all_q_mutex); 1926 mutex_lock(&all_q_mutex);
1917 list_del_init(&q->all_q_node); 1927 list_del_init(&q->all_q_node);
1918 mutex_unlock(&all_q_mutex); 1928 mutex_unlock(&all_q_mutex);
1919 } 1929 }
1920 1930
1921 /* Basically redo blk_mq_init_queue with queue frozen */ 1931 /* Basically redo blk_mq_init_queue with queue frozen */
1922 static void blk_mq_queue_reinit(struct request_queue *q) 1932 static void blk_mq_queue_reinit(struct request_queue *q)
1923 { 1933 {
1924 blk_mq_freeze_queue(q); 1934 WARN_ON_ONCE(!q->mq_freeze_depth);
1925 1935
1926 blk_mq_sysfs_unregister(q); 1936 blk_mq_sysfs_unregister(q);
1927 1937
1928 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); 1938 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
1929 1939
1930 /* 1940 /*
1931 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe 1941 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
1932 * we should change hctx numa_node according to new topology (this 1942 * we should change hctx numa_node according to new topology (this
1933 * involves freeing and re-allocating memory; is it worth doing?) 1943 * involves freeing and re-allocating memory; is it worth doing?)
1934 */ 1944 */
1935 1945
1936 blk_mq_map_swqueue(q); 1946 blk_mq_map_swqueue(q);
1937 1947
1938 blk_mq_sysfs_register(q); 1948 blk_mq_sysfs_register(q);
1939
1940 blk_mq_unfreeze_queue(q);
1941 } 1949 }
1942 1950
1943 static int blk_mq_queue_reinit_notify(struct notifier_block *nb, 1951 static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1944 unsigned long action, void *hcpu) 1952 unsigned long action, void *hcpu)
1945 { 1953 {
1946 struct request_queue *q; 1954 struct request_queue *q;
1947 1955
1948 /* 1956 /*
1949 * Before new mappings are established, a hot-added CPU might already 1957 * Before new mappings are established, a hot-added CPU might already
1950 * start handling requests. This doesn't break anything, as we map 1958 * start handling requests. This doesn't break anything, as we map
1951 * offline CPUs to the first hardware queue. We will re-init the queue 1959 * offline CPUs to the first hardware queue. We will re-init the queue
1952 * below to get optimal settings. 1960 * below to get optimal settings.
1953 */ 1961 */
1954 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && 1962 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
1955 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) 1963 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
1956 return NOTIFY_OK; 1964 return NOTIFY_OK;
1957 1965
1958 mutex_lock(&all_q_mutex); 1966 mutex_lock(&all_q_mutex);
1967
1968 /*
1969 * We need to freeze and reinit all existing queues. Freezing
1970 * involves synchronous wait for an RCU grace period and doing it
1971 * one by one may take a long time. Start freezing all queues in
1972 * one swoop and then wait for the completions so that freezing can
1973 * take place in parallel.
1974 */
1959 list_for_each_entry(q, &all_q_list, all_q_node) 1975 list_for_each_entry(q, &all_q_list, all_q_node)
1976 blk_mq_freeze_queue_start(q);
1977 list_for_each_entry(q, &all_q_list, all_q_node)
1978 blk_mq_freeze_queue_wait(q);
1979
1980 list_for_each_entry(q, &all_q_list, all_q_node)
1960 blk_mq_queue_reinit(q); 1981 blk_mq_queue_reinit(q);
1982
1983 list_for_each_entry(q, &all_q_list, all_q_node)
1984 blk_mq_unfreeze_queue(q);
1985
1961 mutex_unlock(&all_q_mutex); 1986 mutex_unlock(&all_q_mutex);
1962 return NOTIFY_OK; 1987 return NOTIFY_OK;
1963 } 1988 }
1964 1989
1965 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 1990 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
1966 { 1991 {
1967 int i; 1992 int i;
1968 1993
1969 for (i = 0; i < set->nr_hw_queues; i++) { 1994 for (i = 0; i < set->nr_hw_queues; i++) {
1970 set->tags[i] = blk_mq_init_rq_map(set, i); 1995 set->tags[i] = blk_mq_init_rq_map(set, i);
1971 if (!set->tags[i]) 1996 if (!set->tags[i])
1972 goto out_unwind; 1997 goto out_unwind;
1973 } 1998 }
1974 1999
1975 return 0; 2000 return 0;
1976 2001
1977 out_unwind: 2002 out_unwind:
1978 while (--i >= 0) 2003 while (--i >= 0)
1979 blk_mq_free_rq_map(set, set->tags[i], i); 2004 blk_mq_free_rq_map(set, set->tags[i], i);
1980 2005
1981 return -ENOMEM; 2006 return -ENOMEM;
1982 } 2007 }
1983 2008
1984 /* 2009 /*
1985 * Allocate the request maps associated with this tag_set. Note that this 2010 * Allocate the request maps associated with this tag_set. Note that this
1986 * may reduce the depth asked for, if memory is tight. set->queue_depth 2011 * may reduce the depth asked for, if memory is tight. set->queue_depth
1987 * will be updated to reflect the allocated depth. 2012 * will be updated to reflect the allocated depth.
1988 */ 2013 */
1989 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 2014 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
1990 { 2015 {
1991 unsigned int depth; 2016 unsigned int depth;
1992 int err; 2017 int err;
1993 2018
1994 depth = set->queue_depth; 2019 depth = set->queue_depth;
1995 do { 2020 do {
1996 err = __blk_mq_alloc_rq_maps(set); 2021 err = __blk_mq_alloc_rq_maps(set);
1997 if (!err) 2022 if (!err)
1998 break; 2023 break;
1999 2024
2000 set->queue_depth >>= 1; 2025 set->queue_depth >>= 1;
2001 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { 2026 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
2002 err = -ENOMEM; 2027 err = -ENOMEM;
2003 break; 2028 break;
2004 } 2029 }
2005 } while (set->queue_depth); 2030 } while (set->queue_depth);
2006 2031
2007 if (!set->queue_depth || err) { 2032 if (!set->queue_depth || err) {
2008 pr_err("blk-mq: failed to allocate request map\n"); 2033 pr_err("blk-mq: failed to allocate request map\n");
2009 return -ENOMEM; 2034 return -ENOMEM;
2010 } 2035 }
2011 2036
2012 if (depth != set->queue_depth) 2037 if (depth != set->queue_depth)
2013 pr_info("blk-mq: reduced tag depth (%u -> %u)\n", 2038 pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
2014 depth, set->queue_depth); 2039 depth, set->queue_depth);
2015 2040
2016 return 0; 2041 return 0;
2017 } 2042 }
2018 2043
2019 /* 2044 /*
2020 * Alloc a tag set to be associated with one or more request queues. 2045 * Alloc a tag set to be associated with one or more request queues.
2021 * May fail with EINVAL for various error conditions. May adjust the 2046 * May fail with EINVAL for various error conditions. May adjust the
2022 * requested depth down, if it is too large. In that case, the set 2047 * requested depth down, if it is too large. In that case, the set
2023 * value will be stored in set->queue_depth. 2048 * value will be stored in set->queue_depth.
2024 */ 2049 */
2025 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) 2050 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2026 { 2051 {
2027 if (!set->nr_hw_queues) 2052 if (!set->nr_hw_queues)
2028 return -EINVAL; 2053 return -EINVAL;
2029 if (!set->queue_depth) 2054 if (!set->queue_depth)
2030 return -EINVAL; 2055 return -EINVAL;
2031 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) 2056 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
2032 return -EINVAL; 2057 return -EINVAL;
2033 2058
2034 if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue) 2059 if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
2035 return -EINVAL; 2060 return -EINVAL;
2036 2061
2037 if (set->queue_depth > BLK_MQ_MAX_DEPTH) { 2062 if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
2038 pr_info("blk-mq: reduced tag depth to %u\n", 2063 pr_info("blk-mq: reduced tag depth to %u\n",
2039 BLK_MQ_MAX_DEPTH); 2064 BLK_MQ_MAX_DEPTH);
2040 set->queue_depth = BLK_MQ_MAX_DEPTH; 2065 set->queue_depth = BLK_MQ_MAX_DEPTH;
2041 } 2066 }
2042 2067
2043 set->tags = kmalloc_node(set->nr_hw_queues * 2068 set->tags = kmalloc_node(set->nr_hw_queues *
2044 sizeof(struct blk_mq_tags *), 2069 sizeof(struct blk_mq_tags *),
2045 GFP_KERNEL, set->numa_node); 2070 GFP_KERNEL, set->numa_node);
2046 if (!set->tags) 2071 if (!set->tags)
2047 return -ENOMEM; 2072 return -ENOMEM;
2048 2073
2049 if (blk_mq_alloc_rq_maps(set)) 2074 if (blk_mq_alloc_rq_maps(set))
2050 goto enomem; 2075 goto enomem;
2051 2076
2052 mutex_init(&set->tag_list_lock); 2077 mutex_init(&set->tag_list_lock);
2053 INIT_LIST_HEAD(&set->tag_list); 2078 INIT_LIST_HEAD(&set->tag_list);
2054 2079
2055 return 0; 2080 return 0;
2056 enomem: 2081 enomem:
2057 kfree(set->tags); 2082 kfree(set->tags);
2058 set->tags = NULL; 2083 set->tags = NULL;
2059 return -ENOMEM; 2084 return -ENOMEM;
2060 } 2085 }
2061 EXPORT_SYMBOL(blk_mq_alloc_tag_set); 2086 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
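
For context, a minimal driver-side sketch of the contract documented above: fill in a blk_mq_tag_set, let blk_mq_alloc_tag_set() possibly shrink the requested depth, then build the queue with blk_mq_init_queue(). The example_* names and the extern ops table are hypothetical; only the blk_mq_tag_set fields and the blk_mq_*() calls are taken from this file and its header.

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/string.h>

/* Hypothetical: defined elsewhere with the driver's queue_rq/map_queue. */
extern struct blk_mq_ops example_mq_ops;

struct example_dev {
	struct blk_mq_tag_set tag_set;
	struct request_queue *queue;
};

static int example_setup_queue(struct example_dev *dev)
{
	int ret;

	memset(&dev->tag_set, 0, sizeof(dev->tag_set));
	dev->tag_set.ops = &example_mq_ops;
	dev->tag_set.nr_hw_queues = 1;
	dev->tag_set.queue_depth = 128;	/* may be reduced, see above */
	dev->tag_set.numa_node = NUMA_NO_NODE;
	dev->tag_set.cmd_size = 0;	/* optional per-request driver payload */
	dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;

	ret = blk_mq_alloc_tag_set(&dev->tag_set);
	if (ret)
		return ret;

	dev->queue = blk_mq_init_queue(&dev->tag_set);
	if (IS_ERR(dev->queue)) {
		blk_mq_free_tag_set(&dev->tag_set);
		return PTR_ERR(dev->queue);
	}
	return 0;
}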
2062 2087
2063 void blk_mq_free_tag_set(struct blk_mq_tag_set *set) 2088 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2064 { 2089 {
2065 int i; 2090 int i;
2066 2091
2067 for (i = 0; i < set->nr_hw_queues; i++) { 2092 for (i = 0; i < set->nr_hw_queues; i++) {
2068 if (set->tags[i]) 2093 if (set->tags[i])
2069 blk_mq_free_rq_map(set, set->tags[i], i); 2094 blk_mq_free_rq_map(set, set->tags[i], i);
2070 } 2095 }
2071 2096
2072 kfree(set->tags); 2097 kfree(set->tags);
2073 set->tags = NULL; 2098 set->tags = NULL;
2074 } 2099 }
2075 EXPORT_SYMBOL(blk_mq_free_tag_set); 2100 EXPORT_SYMBOL(blk_mq_free_tag_set);
2076 2101
2077 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) 2102 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2078 { 2103 {
2079 struct blk_mq_tag_set *set = q->tag_set; 2104 struct blk_mq_tag_set *set = q->tag_set;
2080 struct blk_mq_hw_ctx *hctx; 2105 struct blk_mq_hw_ctx *hctx;
2081 int i, ret; 2106 int i, ret;
2082 2107
2083 if (!set || nr > set->queue_depth) 2108 if (!set || nr > set->queue_depth)
2084 return -EINVAL; 2109 return -EINVAL;
2085 2110
2086 ret = 0; 2111 ret = 0;
2087 queue_for_each_hw_ctx(q, hctx, i) { 2112 queue_for_each_hw_ctx(q, hctx, i) {
2088 ret = blk_mq_tag_update_depth(hctx->tags, nr); 2113 ret = blk_mq_tag_update_depth(hctx->tags, nr);
2089 if (ret) 2114 if (ret)
2090 break; 2115 break;
2091 } 2116 }
2092 2117
2093 if (!ret) 2118 if (!ret)
2094 q->nr_requests = nr; 2119 q->nr_requests = nr;
2095 2120
2096 return ret; 2121 return ret;
2097 } 2122 }
2098 2123
2099 void blk_mq_disable_hotplug(void) 2124 void blk_mq_disable_hotplug(void)
2100 { 2125 {
2101 mutex_lock(&all_q_mutex); 2126 mutex_lock(&all_q_mutex);
2102 } 2127 }
2103 2128
2104 void blk_mq_enable_hotplug(void) 2129 void blk_mq_enable_hotplug(void)
2105 { 2130 {
2106 mutex_unlock(&all_q_mutex); 2131 mutex_unlock(&all_q_mutex);
2107 } 2132 }
2108 2133
2109 static int __init blk_mq_init(void) 2134 static int __init blk_mq_init(void)
2110 { 2135 {
2111 blk_mq_cpu_init(); 2136 blk_mq_cpu_init();
2112 2137
1 /* 1 /*
2 * fs/ioprio.c 2 * fs/ioprio.c
3 * 3 *
4 * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk> 4 * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk>
5 * 5 *
6 * Helper functions for setting/querying io priorities of processes. The 6 * Helper functions for setting/querying io priorities of processes. The
7 * system calls closely mimic getpriority/setpriority; see the man page for 7 * system calls closely mimic getpriority/setpriority; see the man page for
8 * those. The prio argument is a composite of prio class and prio data, where 8 * those. The prio argument is a composite of prio class and prio data, where
9 * the data argument has meaning within that class. The standard scheduling 9 * the data argument has meaning within that class. The standard scheduling
10 * classes have 8 distinct prio levels, with 0 being the highest prio and 7 10 * classes have 8 distinct prio levels, with 0 being the highest prio and 7
11 * being the lowest. 11 * being the lowest.
12 * 12 *
13 * IOW, setting BE scheduling class with prio 2 is done ala: 13 * IOW, setting BE scheduling class with prio 2 is done ala:
14 * 14 *
15 * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; 15 * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
16 * 16 *
17 * ioprio_set(PRIO_PROCESS, pid, prio); 17 * ioprio_set(PRIO_PROCESS, pid, prio);
18 * 18 *
19 * See also Documentation/block/ioprio.txt 19 * See also Documentation/block/ioprio.txt
20 * 20 *
21 */ 21 */
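
Glibc does not ship an ioprio_set() wrapper, so a userspace caller goes through syscall(2). The sketch below is a compilable variant of the example above; the IOPRIO_* constants are restated locally and their values are assumed to follow include/linux/ioprio.h.

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_CLASS_BE		2
#define IOPRIO_WHO_PROCESS	1

int main(void)
{
	int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;

	/* who == 0 means the calling process */
	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0, prio) < 0) {
		perror("ioprio_set");
		return 1;
	}
	printf("ioprio set to 0x%x (BE, level 2)\n", prio);
	return 0;
}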
22 #include <linux/gfp.h> 22 #include <linux/gfp.h>
23 #include <linux/kernel.h> 23 #include <linux/kernel.h>
24 #include <linux/export.h> 24 #include <linux/export.h>
25 #include <linux/ioprio.h> 25 #include <linux/ioprio.h>
26 #include <linux/blkdev.h> 26 #include <linux/blkdev.h>
27 #include <linux/capability.h> 27 #include <linux/capability.h>
28 #include <linux/syscalls.h> 28 #include <linux/syscalls.h>
29 #include <linux/security.h> 29 #include <linux/security.h>
30 #include <linux/pid_namespace.h> 30 #include <linux/pid_namespace.h>
31 31
32 int set_task_ioprio(struct task_struct *task, int ioprio) 32 int set_task_ioprio(struct task_struct *task, int ioprio)
33 { 33 {
34 int err; 34 int err;
35 struct io_context *ioc; 35 struct io_context *ioc;
36 const struct cred *cred = current_cred(), *tcred; 36 const struct cred *cred = current_cred(), *tcred;
37 37
38 rcu_read_lock(); 38 rcu_read_lock();
39 tcred = __task_cred(task); 39 tcred = __task_cred(task);
40 if (!uid_eq(tcred->uid, cred->euid) && 40 if (!uid_eq(tcred->uid, cred->euid) &&
41 !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { 41 !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
42 rcu_read_unlock(); 42 rcu_read_unlock();
43 return -EPERM; 43 return -EPERM;
44 } 44 }
45 rcu_read_unlock(); 45 rcu_read_unlock();
46 46
47 err = security_task_setioprio(task, ioprio); 47 err = security_task_setioprio(task, ioprio);
48 if (err) 48 if (err)
49 return err; 49 return err;
50 50
51 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); 51 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
52 if (ioc) { 52 if (ioc) {
53 ioc->ioprio = ioprio; 53 ioc->ioprio = ioprio;
54 put_io_context(ioc); 54 put_io_context(ioc);
55 } 55 }
56 56
57 return err; 57 return err;
58 } 58 }
59 EXPORT_SYMBOL_GPL(set_task_ioprio); 59 EXPORT_SYMBOL_GPL(set_task_ioprio);
60 60
61 SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) 61 SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
62 { 62 {
63 int class = IOPRIO_PRIO_CLASS(ioprio); 63 int class = IOPRIO_PRIO_CLASS(ioprio);
64 int data = IOPRIO_PRIO_DATA(ioprio); 64 int data = IOPRIO_PRIO_DATA(ioprio);
65 struct task_struct *p, *g; 65 struct task_struct *p, *g;
66 struct user_struct *user; 66 struct user_struct *user;
67 struct pid *pgrp; 67 struct pid *pgrp;
68 kuid_t uid; 68 kuid_t uid;
69 int ret; 69 int ret;
70 70
71 switch (class) { 71 switch (class) {
72 case IOPRIO_CLASS_RT: 72 case IOPRIO_CLASS_RT:
73 if (!capable(CAP_SYS_ADMIN)) 73 if (!capable(CAP_SYS_ADMIN))
74 return -EPERM; 74 return -EPERM;
75 /* fall through, rt has prio field too */ 75 /* fall through, rt has prio field too */
76 case IOPRIO_CLASS_BE: 76 case IOPRIO_CLASS_BE:
77 if (data >= IOPRIO_BE_NR || data < 0) 77 if (data >= IOPRIO_BE_NR || data < 0)
78 return -EINVAL; 78 return -EINVAL;
79 79
80 break; 80 break;
81 case IOPRIO_CLASS_IDLE: 81 case IOPRIO_CLASS_IDLE:
82 break; 82 break;
83 case IOPRIO_CLASS_NONE: 83 case IOPRIO_CLASS_NONE:
84 if (data) 84 if (data)
85 return -EINVAL; 85 return -EINVAL;
86 break; 86 break;
87 default: 87 default:
88 return -EINVAL; 88 return -EINVAL;
89 } 89 }
90 90
91 ret = -ESRCH; 91 ret = -ESRCH;
92 rcu_read_lock(); 92 rcu_read_lock();
93 switch (which) { 93 switch (which) {
94 case IOPRIO_WHO_PROCESS: 94 case IOPRIO_WHO_PROCESS:
95 if (!who) 95 if (!who)
96 p = current; 96 p = current;
97 else 97 else
98 p = find_task_by_vpid(who); 98 p = find_task_by_vpid(who);
99 if (p) 99 if (p)
100 ret = set_task_ioprio(p, ioprio); 100 ret = set_task_ioprio(p, ioprio);
101 break; 101 break;
102 case IOPRIO_WHO_PGRP: 102 case IOPRIO_WHO_PGRP:
103 if (!who) 103 if (!who)
104 pgrp = task_pgrp(current); 104 pgrp = task_pgrp(current);
105 else 105 else
106 pgrp = find_vpid(who); 106 pgrp = find_vpid(who);
107 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 107 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
108 ret = set_task_ioprio(p, ioprio); 108 ret = set_task_ioprio(p, ioprio);
109 if (ret) 109 if (ret)
110 break; 110 break;
111 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 111 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
112 break; 112 break;
113 case IOPRIO_WHO_USER: 113 case IOPRIO_WHO_USER:
114 uid = make_kuid(current_user_ns(), who); 114 uid = make_kuid(current_user_ns(), who);
115 if (!uid_valid(uid)) 115 if (!uid_valid(uid))
116 break; 116 break;
117 if (!who) 117 if (!who)
118 user = current_user(); 118 user = current_user();
119 else 119 else
120 user = find_user(uid); 120 user = find_user(uid);
121 121
122 if (!user) 122 if (!user)
123 break; 123 break;
124 124
125 do_each_thread(g, p) { 125 do_each_thread(g, p) {
126 if (!uid_eq(task_uid(p), uid)) 126 if (!uid_eq(task_uid(p), uid))
127 continue; 127 continue;
128 ret = set_task_ioprio(p, ioprio); 128 ret = set_task_ioprio(p, ioprio);
129 if (ret) 129 if (ret)
130 goto free_uid; 130 goto free_uid;
131 } while_each_thread(g, p); 131 } while_each_thread(g, p);
132 free_uid: 132 free_uid:
133 if (who) 133 if (who)
134 free_uid(user); 134 free_uid(user);
135 break; 135 break;
136 default: 136 default:
137 ret = -EINVAL; 137 ret = -EINVAL;
138 } 138 }
139 139
140 rcu_read_unlock(); 140 rcu_read_unlock();
141 return ret; 141 return ret;
142 } 142 }
143 143
144 static int get_task_ioprio(struct task_struct *p) 144 static int get_task_ioprio(struct task_struct *p)
145 { 145 {
146 int ret; 146 int ret;
147 147
148 ret = security_task_getioprio(p); 148 ret = security_task_getioprio(p);
149 if (ret) 149 if (ret)
150 goto out; 150 goto out;
151 ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); 151 ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
152 if (p->io_context) 152 if (p->io_context)
153 ret = p->io_context->ioprio; 153 ret = p->io_context->ioprio;
154 out: 154 out:
155 return ret; 155 return ret;
156 } 156 }
157 157
158 int ioprio_best(unsigned short aprio, unsigned short bprio) 158 int ioprio_best(unsigned short aprio, unsigned short bprio)
159 { 159 {
160 unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); 160 unsigned short aclass;
161 unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); 161 unsigned short bclass;
162 162
163 if (aclass == IOPRIO_CLASS_NONE) 163 if (!ioprio_valid(aprio))
164 aclass = IOPRIO_CLASS_BE; 164 aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
165 if (bclass == IOPRIO_CLASS_NONE) 165 if (!ioprio_valid(bprio))
166 bclass = IOPRIO_CLASS_BE; 166 bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
167 167
168 aclass = IOPRIO_PRIO_CLASS(aprio);
169 bclass = IOPRIO_PRIO_CLASS(bprio);
168 if (aclass == bclass) 170 if (aclass == bclass)
169 return min(aprio, bprio); 171 return min(aprio, bprio);
170 if (aclass > bclass) 172 if (aclass > bclass)
171 return bprio; 173 return bprio;
172 else 174 else
173 return aprio; 175 return aprio;
174 } 176 }
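
A worked example of why the normalisation above matters when two merging requests carry different priorities (constants as in include/linux/ioprio.h: IOPRIO_CLASS_SHIFT is 13, IOPRIO_NORM is 4). Take aprio = 0 (class NONE, i.e. never set) and bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 2) = (2 << 13) | 2 = 16386. The old code only patched up aclass, so both classes compared as BE and min(0, 16386) returned 0: the merged request silently lost its BE/2 priority. The new code first rewrites aprio itself to IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM) = 16388, so min(16388, 16386) = 16386 and the explicit BE/2 priority survives the merge.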
175 177
176 SYSCALL_DEFINE2(ioprio_get, int, which, int, who) 178 SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
177 { 179 {
178 struct task_struct *g, *p; 180 struct task_struct *g, *p;
179 struct user_struct *user; 181 struct user_struct *user;
180 struct pid *pgrp; 182 struct pid *pgrp;
181 kuid_t uid; 183 kuid_t uid;
182 int ret = -ESRCH; 184 int ret = -ESRCH;
183 int tmpio; 185 int tmpio;
184 186
185 rcu_read_lock(); 187 rcu_read_lock();
186 switch (which) { 188 switch (which) {
187 case IOPRIO_WHO_PROCESS: 189 case IOPRIO_WHO_PROCESS:
188 if (!who) 190 if (!who)
189 p = current; 191 p = current;
190 else 192 else
191 p = find_task_by_vpid(who); 193 p = find_task_by_vpid(who);
192 if (p) 194 if (p)
193 ret = get_task_ioprio(p); 195 ret = get_task_ioprio(p);
194 break; 196 break;
195 case IOPRIO_WHO_PGRP: 197 case IOPRIO_WHO_PGRP:
196 if (!who) 198 if (!who)
197 pgrp = task_pgrp(current); 199 pgrp = task_pgrp(current);
198 else 200 else
199 pgrp = find_vpid(who); 201 pgrp = find_vpid(who);
200 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 202 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
201 tmpio = get_task_ioprio(p); 203 tmpio = get_task_ioprio(p);
202 if (tmpio < 0) 204 if (tmpio < 0)
203 continue; 205 continue;
204 if (ret == -ESRCH) 206 if (ret == -ESRCH)
205 ret = tmpio; 207 ret = tmpio;
206 else 208 else
207 ret = ioprio_best(ret, tmpio); 209 ret = ioprio_best(ret, tmpio);
208 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 210 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
209 break; 211 break;
210 case IOPRIO_WHO_USER: 212 case IOPRIO_WHO_USER:
211 uid = make_kuid(current_user_ns(), who); 213 uid = make_kuid(current_user_ns(), who);
212 if (!who) 214 if (!who)
213 user = current_user(); 215 user = current_user();
214 else 216 else
215 user = find_user(uid); 217 user = find_user(uid);
216 218
217 if (!user) 219 if (!user)
218 break; 220 break;
219 221
220 do_each_thread(g, p) { 222 do_each_thread(g, p) {
221 if (!uid_eq(task_uid(p), user->uid)) 223 if (!uid_eq(task_uid(p), user->uid))
222 continue; 224 continue;
223 tmpio = get_task_ioprio(p); 225 tmpio = get_task_ioprio(p);
224 if (tmpio < 0) 226 if (tmpio < 0)
225 continue; 227 continue;
226 if (ret == -ESRCH) 228 if (ret == -ESRCH)
227 ret = tmpio; 229 ret = tmpio;
228 else 230 else
229 ret = ioprio_best(ret, tmpio); 231 ret = ioprio_best(ret, tmpio);
230 } while_each_thread(g, p); 232 } while_each_thread(g, p);
231 233
232 if (who) 234 if (who)
233 free_uid(user); 235 free_uid(user);
234 break; 236 break;
235 default: 237 default:
236 ret = -EINVAL; 238 ret = -EINVAL;
237 } 239 }
238 240
239 rcu_read_unlock(); 241 rcu_read_unlock();
240 return ret; 242 return ret;
241 } 243 }
242 244
1 /* 1 /*
2 * Copyright (C) 2001 Jens Axboe <axboe@suse.de> 2 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation. 6 * published by the Free Software Foundation.
7 * 7 *
8 * This program is distributed in the hope that it will be useful, 8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * 10 *
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
17 * 17 *
18 */ 18 */
19 #include <linux/kernel.h> 19 #include <linux/kernel.h>
20 #include <linux/errno.h> 20 #include <linux/errno.h>
21 #include <linux/string.h> 21 #include <linux/string.h>
22 #include <linux/module.h> 22 #include <linux/module.h>
23 #include <linux/blkdev.h> 23 #include <linux/blkdev.h>
24 #include <linux/capability.h> 24 #include <linux/capability.h>
25 #include <linux/completion.h> 25 #include <linux/completion.h>
26 #include <linux/cdrom.h> 26 #include <linux/cdrom.h>
27 #include <linux/ratelimit.h> 27 #include <linux/ratelimit.h>
28 #include <linux/slab.h> 28 #include <linux/slab.h>
29 #include <linux/times.h> 29 #include <linux/times.h>
30 #include <linux/uio.h> 30 #include <linux/uio.h>
31 #include <asm/uaccess.h> 31 #include <asm/uaccess.h>
32 32
33 #include <scsi/scsi.h> 33 #include <scsi/scsi.h>
34 #include <scsi/scsi_ioctl.h> 34 #include <scsi/scsi_ioctl.h>
35 #include <scsi/scsi_cmnd.h> 35 #include <scsi/scsi_cmnd.h>
36 36
37 struct blk_cmd_filter { 37 struct blk_cmd_filter {
38 unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; 38 unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
39 unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; 39 unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
40 }; 40 };
41 41
42 static struct blk_cmd_filter blk_default_cmd_filter; 42 static struct blk_cmd_filter blk_default_cmd_filter;
43 43
44 /* Command group 3 is reserved and should never be used. */ 44 /* Command group 3 is reserved and should never be used. */
45 const unsigned char scsi_command_size_tbl[8] = 45 const unsigned char scsi_command_size_tbl[8] =
46 { 46 {
47 6, 10, 10, 12, 47 6, 10, 10, 12,
48 16, 12, 10, 10 48 16, 12, 10, 10
49 }; 49 };
50 EXPORT_SYMBOL(scsi_command_size_tbl); 50 EXPORT_SYMBOL(scsi_command_size_tbl);
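
The table above is indexed by the command group, i.e. the top three bits of the opcode; the COMMAND_SIZE() macro in <scsi/scsi.h> expands to scsi_command_size_tbl[((opcode) >> 5) & 7]. As a worked example, READ_10 (0x28) sits in group 1, so COMMAND_SIZE(READ_10) is 10 bytes, while READ_16 (0x88) sits in group 4 and yields 16.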
51 51
52 #include <scsi/sg.h> 52 #include <scsi/sg.h>
53 53
54 static int sg_get_version(int __user *p) 54 static int sg_get_version(int __user *p)
55 { 55 {
56 static const int sg_version_num = 30527; 56 static const int sg_version_num = 30527;
57 return put_user(sg_version_num, p); 57 return put_user(sg_version_num, p);
58 } 58 }
59 59
60 static int scsi_get_idlun(struct request_queue *q, int __user *p) 60 static int scsi_get_idlun(struct request_queue *q, int __user *p)
61 { 61 {
62 return put_user(0, p); 62 return put_user(0, p);
63 } 63 }
64 64
65 static int scsi_get_bus(struct request_queue *q, int __user *p) 65 static int scsi_get_bus(struct request_queue *q, int __user *p)
66 { 66 {
67 return put_user(0, p); 67 return put_user(0, p);
68 } 68 }
69 69
70 static int sg_get_timeout(struct request_queue *q) 70 static int sg_get_timeout(struct request_queue *q)
71 { 71 {
72 return jiffies_to_clock_t(q->sg_timeout); 72 return jiffies_to_clock_t(q->sg_timeout);
73 } 73 }
74 74
75 static int sg_set_timeout(struct request_queue *q, int __user *p) 75 static int sg_set_timeout(struct request_queue *q, int __user *p)
76 { 76 {
77 int timeout, err = get_user(timeout, p); 77 int timeout, err = get_user(timeout, p);
78 78
79 if (!err) 79 if (!err)
80 q->sg_timeout = clock_t_to_jiffies(timeout); 80 q->sg_timeout = clock_t_to_jiffies(timeout);
81 81
82 return err; 82 return err;
83 } 83 }
84 84
85 static int max_sectors_bytes(struct request_queue *q) 85 static int max_sectors_bytes(struct request_queue *q)
86 { 86 {
87 unsigned int max_sectors = queue_max_sectors(q); 87 unsigned int max_sectors = queue_max_sectors(q);
88 88
89 max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9); 89 max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9);
90 90
91 return max_sectors << 9; 91 return max_sectors << 9;
92 } 92 }
93 93
94 static int sg_get_reserved_size(struct request_queue *q, int __user *p) 94 static int sg_get_reserved_size(struct request_queue *q, int __user *p)
95 { 95 {
96 int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q)); 96 int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q));
97 97
98 return put_user(val, p); 98 return put_user(val, p);
99 } 99 }
100 100
101 static int sg_set_reserved_size(struct request_queue *q, int __user *p) 101 static int sg_set_reserved_size(struct request_queue *q, int __user *p)
102 { 102 {
103 int size, err = get_user(size, p); 103 int size, err = get_user(size, p);
104 104
105 if (err) 105 if (err)
106 return err; 106 return err;
107 107
108 if (size < 0) 108 if (size < 0)
109 return -EINVAL; 109 return -EINVAL;
110 110
111 q->sg_reserved_size = min(size, max_sectors_bytes(q)); 111 q->sg_reserved_size = min(size, max_sectors_bytes(q));
112 return 0; 112 return 0;
113 } 113 }
114 114
115 /* 115 /*
116 * will always return that we are ATAPI even for a real SCSI drive; I'm not 116 * will always return that we are ATAPI even for a real SCSI drive; I'm not
117 * so sure this is worth doing anything about (why would you care??) 117 * so sure this is worth doing anything about (why would you care??)
118 */ 118 */
119 static int sg_emulated_host(struct request_queue *q, int __user *p) 119 static int sg_emulated_host(struct request_queue *q, int __user *p)
120 { 120 {
121 return put_user(1, p); 121 return put_user(1, p);
122 } 122 }
123 123
124 static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) 124 static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter)
125 { 125 {
126 /* Basic read-only commands */ 126 /* Basic read-only commands */
127 __set_bit(TEST_UNIT_READY, filter->read_ok); 127 __set_bit(TEST_UNIT_READY, filter->read_ok);
128 __set_bit(REQUEST_SENSE, filter->read_ok); 128 __set_bit(REQUEST_SENSE, filter->read_ok);
129 __set_bit(READ_6, filter->read_ok); 129 __set_bit(READ_6, filter->read_ok);
130 __set_bit(READ_10, filter->read_ok); 130 __set_bit(READ_10, filter->read_ok);
131 __set_bit(READ_12, filter->read_ok); 131 __set_bit(READ_12, filter->read_ok);
132 __set_bit(READ_16, filter->read_ok); 132 __set_bit(READ_16, filter->read_ok);
133 __set_bit(READ_BUFFER, filter->read_ok); 133 __set_bit(READ_BUFFER, filter->read_ok);
134 __set_bit(READ_DEFECT_DATA, filter->read_ok); 134 __set_bit(READ_DEFECT_DATA, filter->read_ok);
135 __set_bit(READ_CAPACITY, filter->read_ok); 135 __set_bit(READ_CAPACITY, filter->read_ok);
136 __set_bit(READ_LONG, filter->read_ok); 136 __set_bit(READ_LONG, filter->read_ok);
137 __set_bit(INQUIRY, filter->read_ok); 137 __set_bit(INQUIRY, filter->read_ok);
138 __set_bit(MODE_SENSE, filter->read_ok); 138 __set_bit(MODE_SENSE, filter->read_ok);
139 __set_bit(MODE_SENSE_10, filter->read_ok); 139 __set_bit(MODE_SENSE_10, filter->read_ok);
140 __set_bit(LOG_SENSE, filter->read_ok); 140 __set_bit(LOG_SENSE, filter->read_ok);
141 __set_bit(START_STOP, filter->read_ok); 141 __set_bit(START_STOP, filter->read_ok);
142 __set_bit(GPCMD_VERIFY_10, filter->read_ok); 142 __set_bit(GPCMD_VERIFY_10, filter->read_ok);
143 __set_bit(VERIFY_16, filter->read_ok); 143 __set_bit(VERIFY_16, filter->read_ok);
144 __set_bit(REPORT_LUNS, filter->read_ok); 144 __set_bit(REPORT_LUNS, filter->read_ok);
145 __set_bit(SERVICE_ACTION_IN, filter->read_ok); 145 __set_bit(SERVICE_ACTION_IN, filter->read_ok);
146 __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok); 146 __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok);
147 __set_bit(MAINTENANCE_IN, filter->read_ok); 147 __set_bit(MAINTENANCE_IN, filter->read_ok);
148 __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); 148 __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok);
149 149
150 /* Audio CD commands */ 150 /* Audio CD commands */
151 __set_bit(GPCMD_PLAY_CD, filter->read_ok); 151 __set_bit(GPCMD_PLAY_CD, filter->read_ok);
152 __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok); 152 __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok);
153 __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok); 153 __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok);
154 __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok); 154 __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok);
155 __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok); 155 __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok);
156 156
157 /* CD/DVD data reading */ 157 /* CD/DVD data reading */
158 __set_bit(GPCMD_READ_CD, filter->read_ok); 158 __set_bit(GPCMD_READ_CD, filter->read_ok);
159 __set_bit(GPCMD_READ_CD_MSF, filter->read_ok); 159 __set_bit(GPCMD_READ_CD_MSF, filter->read_ok);
160 __set_bit(GPCMD_READ_DISC_INFO, filter->read_ok); 160 __set_bit(GPCMD_READ_DISC_INFO, filter->read_ok);
161 __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok); 161 __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok);
162 __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok); 162 __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok);
163 __set_bit(GPCMD_READ_HEADER, filter->read_ok); 163 __set_bit(GPCMD_READ_HEADER, filter->read_ok);
164 __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok); 164 __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok);
165 __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok); 165 __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok);
166 __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok); 166 __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok);
167 __set_bit(GPCMD_REPORT_KEY, filter->read_ok); 167 __set_bit(GPCMD_REPORT_KEY, filter->read_ok);
168 __set_bit(GPCMD_SCAN, filter->read_ok); 168 __set_bit(GPCMD_SCAN, filter->read_ok);
169 __set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok); 169 __set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok);
170 __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok); 170 __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok);
171 __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok); 171 __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok);
172 __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok); 172 __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok);
173 __set_bit(GPCMD_SEEK, filter->read_ok); 173 __set_bit(GPCMD_SEEK, filter->read_ok);
174 __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok); 174 __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok);
175 175
176 /* Basic writing commands */ 176 /* Basic writing commands */
177 __set_bit(WRITE_6, filter->write_ok); 177 __set_bit(WRITE_6, filter->write_ok);
178 __set_bit(WRITE_10, filter->write_ok); 178 __set_bit(WRITE_10, filter->write_ok);
179 __set_bit(WRITE_VERIFY, filter->write_ok); 179 __set_bit(WRITE_VERIFY, filter->write_ok);
180 __set_bit(WRITE_12, filter->write_ok); 180 __set_bit(WRITE_12, filter->write_ok);
181 __set_bit(WRITE_VERIFY_12, filter->write_ok); 181 __set_bit(WRITE_VERIFY_12, filter->write_ok);
182 __set_bit(WRITE_16, filter->write_ok); 182 __set_bit(WRITE_16, filter->write_ok);
183 __set_bit(WRITE_LONG, filter->write_ok); 183 __set_bit(WRITE_LONG, filter->write_ok);
184 __set_bit(WRITE_LONG_2, filter->write_ok); 184 __set_bit(WRITE_LONG_2, filter->write_ok);
185 __set_bit(ERASE, filter->write_ok); 185 __set_bit(ERASE, filter->write_ok);
186 __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok); 186 __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok);
187 __set_bit(MODE_SELECT, filter->write_ok); 187 __set_bit(MODE_SELECT, filter->write_ok);
188 __set_bit(LOG_SELECT, filter->write_ok); 188 __set_bit(LOG_SELECT, filter->write_ok);
189 __set_bit(GPCMD_BLANK, filter->write_ok); 189 __set_bit(GPCMD_BLANK, filter->write_ok);
190 __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok); 190 __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok);
191 __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok); 191 __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok);
192 __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok); 192 __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok);
193 __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok); 193 __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok);
194 __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok); 194 __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok);
195 __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok); 195 __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok);
196 __set_bit(GPCMD_SEND_EVENT, filter->write_ok); 196 __set_bit(GPCMD_SEND_EVENT, filter->write_ok);
197 __set_bit(GPCMD_SEND_KEY, filter->write_ok); 197 __set_bit(GPCMD_SEND_KEY, filter->write_ok);
198 __set_bit(GPCMD_SEND_OPC, filter->write_ok); 198 __set_bit(GPCMD_SEND_OPC, filter->write_ok);
199 __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok); 199 __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok);
200 __set_bit(GPCMD_SET_SPEED, filter->write_ok); 200 __set_bit(GPCMD_SET_SPEED, filter->write_ok);
201 __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok); 201 __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok);
202 __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok); 202 __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok);
203 __set_bit(GPCMD_SET_STREAMING, filter->write_ok); 203 __set_bit(GPCMD_SET_STREAMING, filter->write_ok);
204 __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); 204 __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok);
205 } 205 }
206 206
207 int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) 207 int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm)
208 { 208 {
209 struct blk_cmd_filter *filter = &blk_default_cmd_filter; 209 struct blk_cmd_filter *filter = &blk_default_cmd_filter;
210 210
211 /* root can do any command. */ 211 /* root can do any command. */
212 if (capable(CAP_SYS_RAWIO)) 212 if (capable(CAP_SYS_RAWIO))
213 return 0; 213 return 0;
214 214
215 /* Anybody who can open the device can do a read-safe command */ 215 /* Anybody who can open the device can do a read-safe command */
216 if (test_bit(cmd[0], filter->read_ok)) 216 if (test_bit(cmd[0], filter->read_ok))
217 return 0; 217 return 0;
218 218
219 /* Write-safe commands require a writable open */ 219 /* Write-safe commands require a writable open */
220 if (test_bit(cmd[0], filter->write_ok) && has_write_perm) 220 if (test_bit(cmd[0], filter->write_ok) && has_write_perm)
221 return 0; 221 return 0;
222 222
223 return -EPERM; 223 return -EPERM;
224 } 224 }
225 EXPORT_SYMBOL(blk_verify_command); 225 EXPORT_SYMBOL(blk_verify_command);
226 226
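blk_verify_command() above encodes a three-tier policy: CAP_SYS_RAWIO bypasses the filter, read-safe opcodes only require an open descriptor, and write-safe opcodes additionally require a writable open. A minimal sketch of how a driver-style caller might use it before queueing a user-supplied CDB is shown below, mirroring what blk_fill_sghdr_rq() does next; the helper name and its argument list are hypothetical.

/* Hypothetical helper (not part of this file): copy a user-supplied CDB
 * into a request and let the default command filter decide whether the
 * current open mode may issue it. */
static int example_fill_user_cdb(struct request *rq,
				 const unsigned char __user *ucdb,
				 unsigned int len, fmode_t mode)
{
	if (len > BLK_MAX_CDB)
		return -EINVAL;
	if (copy_from_user(rq->cmd, ucdb, len))
		return -EFAULT;
	rq->cmd_len = len;

	/* 0 if permitted, -EPERM otherwise */
	return blk_verify_command(rq->cmd, mode & FMODE_WRITE);
}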
227 static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, 227 static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
228 struct sg_io_hdr *hdr, fmode_t mode) 228 struct sg_io_hdr *hdr, fmode_t mode)
229 { 229 {
230 if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len)) 230 if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len))
231 return -EFAULT; 231 return -EFAULT;
232 if (blk_verify_command(rq->cmd, mode & FMODE_WRITE)) 232 if (blk_verify_command(rq->cmd, mode & FMODE_WRITE))
233 return -EPERM; 233 return -EPERM;
234 234
235 /* 235 /*
236 * fill in request structure 236 * fill in request structure
237 */ 237 */
238 rq->cmd_len = hdr->cmd_len; 238 rq->cmd_len = hdr->cmd_len;
239 239
240 rq->timeout = msecs_to_jiffies(hdr->timeout); 240 rq->timeout = msecs_to_jiffies(hdr->timeout);
241 if (!rq->timeout) 241 if (!rq->timeout)
242 rq->timeout = q->sg_timeout; 242 rq->timeout = q->sg_timeout;
243 if (!rq->timeout) 243 if (!rq->timeout)
244 rq->timeout = BLK_DEFAULT_SG_TIMEOUT; 244 rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
245 if (rq->timeout < BLK_MIN_SG_TIMEOUT) 245 if (rq->timeout < BLK_MIN_SG_TIMEOUT)
246 rq->timeout = BLK_MIN_SG_TIMEOUT; 246 rq->timeout = BLK_MIN_SG_TIMEOUT;
247 247
248 return 0; 248 return 0;
249 } 249 }
250 250
251 static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, 251 static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
252 struct bio *bio) 252 struct bio *bio)
253 { 253 {
254 int r, ret = 0; 254 int r, ret = 0;
255 255
256 /* 256 /*
257 * fill in all the output members 257 * fill in all the output members
258 */ 258 */
259 hdr->status = rq->errors & 0xff; 259 hdr->status = rq->errors & 0xff;
260 hdr->masked_status = status_byte(rq->errors); 260 hdr->masked_status = status_byte(rq->errors);
261 hdr->msg_status = msg_byte(rq->errors); 261 hdr->msg_status = msg_byte(rq->errors);
262 hdr->host_status = host_byte(rq->errors); 262 hdr->host_status = host_byte(rq->errors);
263 hdr->driver_status = driver_byte(rq->errors); 263 hdr->driver_status = driver_byte(rq->errors);
264 hdr->info = 0; 264 hdr->info = 0;
265 if (hdr->masked_status || hdr->host_status || hdr->driver_status) 265 if (hdr->masked_status || hdr->host_status || hdr->driver_status)
266 hdr->info |= SG_INFO_CHECK; 266 hdr->info |= SG_INFO_CHECK;
267 hdr->resid = rq->resid_len; 267 hdr->resid = rq->resid_len;
268 hdr->sb_len_wr = 0; 268 hdr->sb_len_wr = 0;
269 269
270 if (rq->sense_len && hdr->sbp) { 270 if (rq->sense_len && hdr->sbp) {
271 int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len); 271 int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len);
272 272
273 if (!copy_to_user(hdr->sbp, rq->sense, len)) 273 if (!copy_to_user(hdr->sbp, rq->sense, len))
274 hdr->sb_len_wr = len; 274 hdr->sb_len_wr = len;
275 else 275 else
276 ret = -EFAULT; 276 ret = -EFAULT;
277 } 277 }
278 278
279 r = blk_rq_unmap_user(bio); 279 r = blk_rq_unmap_user(bio);
280 if (!ret) 280 if (!ret)
281 ret = r; 281 ret = r;
282 282
283 return ret; 283 return ret;
284 } 284 }
285 285
286 static int sg_io(struct request_queue *q, struct gendisk *bd_disk, 286 static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
287 struct sg_io_hdr *hdr, fmode_t mode) 287 struct sg_io_hdr *hdr, fmode_t mode)
288 { 288 {
289 unsigned long start_time; 289 unsigned long start_time;
290 ssize_t ret = 0; 290 ssize_t ret = 0;
291 int writing = 0; 291 int writing = 0;
292 int at_head = 0; 292 int at_head = 0;
293 struct request *rq; 293 struct request *rq;
294 char sense[SCSI_SENSE_BUFFERSIZE]; 294 char sense[SCSI_SENSE_BUFFERSIZE];
295 struct bio *bio; 295 struct bio *bio;
296 296
297 if (hdr->interface_id != 'S') 297 if (hdr->interface_id != 'S')
298 return -EINVAL; 298 return -EINVAL;
299 299
300 if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9)) 300 if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9))
301 return -EIO; 301 return -EIO;
302 302
303 if (hdr->dxfer_len) 303 if (hdr->dxfer_len)
304 switch (hdr->dxfer_direction) { 304 switch (hdr->dxfer_direction) {
305 default: 305 default:
306 return -EINVAL; 306 return -EINVAL;
307 case SG_DXFER_TO_DEV: 307 case SG_DXFER_TO_DEV:
308 writing = 1; 308 writing = 1;
309 break; 309 break;
310 case SG_DXFER_TO_FROM_DEV: 310 case SG_DXFER_TO_FROM_DEV:
311 case SG_DXFER_FROM_DEV: 311 case SG_DXFER_FROM_DEV:
312 break; 312 break;
313 } 313 }
314 if (hdr->flags & SG_FLAG_Q_AT_HEAD) 314 if (hdr->flags & SG_FLAG_Q_AT_HEAD)
315 at_head = 1; 315 at_head = 1;
316 316
317 ret = -ENOMEM; 317 ret = -ENOMEM;
318 rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); 318 rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
319 if (IS_ERR(rq)) 319 if (IS_ERR(rq))
320 return PTR_ERR(rq); 320 return PTR_ERR(rq);
321 blk_rq_set_block_pc(rq); 321 blk_rq_set_block_pc(rq);
322 322
323 if (hdr->cmd_len > BLK_MAX_CDB) { 323 if (hdr->cmd_len > BLK_MAX_CDB) {
324 rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); 324 rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL);
325 if (!rq->cmd) 325 if (!rq->cmd)
326 goto out_put_request; 326 goto out_put_request;
327 } 327 }
328 328
329 ret = -EFAULT; 329 ret = -EFAULT;
330 if (blk_fill_sghdr_rq(q, rq, hdr, mode)) 330 if (blk_fill_sghdr_rq(q, rq, hdr, mode))
331 goto out_free_cdb; 331 goto out_free_cdb;
332 332
333 ret = 0; 333 ret = 0;
334 if (hdr->iovec_count) { 334 if (hdr->iovec_count) {
335 size_t iov_data_len; 335 size_t iov_data_len;
336 struct iovec *iov = NULL; 336 struct iovec *iov = NULL;
337 337
338 ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count, 338 ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count,
339 0, NULL, &iov); 339 0, NULL, &iov);
340 if (ret < 0) { 340 if (ret < 0) {
341 kfree(iov); 341 kfree(iov);
342 goto out_free_cdb; 342 goto out_free_cdb;
343 } 343 }
344 344
345 iov_data_len = ret; 345 iov_data_len = ret;
346 ret = 0; 346 ret = 0;
347 347
348 /* SG_IO howto says that the shorter of the two wins */ 348 /* SG_IO howto says that the shorter of the two wins */
349 if (hdr->dxfer_len < iov_data_len) { 349 if (hdr->dxfer_len < iov_data_len) {
350 hdr->iovec_count = iov_shorten(iov, 350 hdr->iovec_count = iov_shorten(iov,
351 hdr->iovec_count, 351 hdr->iovec_count,
352 hdr->dxfer_len); 352 hdr->dxfer_len);
353 iov_data_len = hdr->dxfer_len; 353 iov_data_len = hdr->dxfer_len;
354 } 354 }
355 355
356 ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov, 356 ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov,
357 hdr->iovec_count, 357 hdr->iovec_count,
358 iov_data_len, GFP_KERNEL); 358 iov_data_len, GFP_KERNEL);
359 kfree(iov); 359 kfree(iov);
360 } else if (hdr->dxfer_len) 360 } else if (hdr->dxfer_len)
361 ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, 361 ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,
362 GFP_KERNEL); 362 GFP_KERNEL);
363 363
364 if (ret) 364 if (ret)
365 goto out_free_cdb; 365 goto out_free_cdb;
366 366
367 bio = rq->bio; 367 bio = rq->bio;
368 memset(sense, 0, sizeof(sense)); 368 memset(sense, 0, sizeof(sense));
369 rq->sense = sense; 369 rq->sense = sense;
370 rq->sense_len = 0; 370 rq->sense_len = 0;
371 rq->retries = 0; 371 rq->retries = 0;
372 372
373 start_time = jiffies; 373 start_time = jiffies;
374 374
375 /* ignore return value. All information is passed back to caller 375 /* ignore return value. All information is passed back to caller
376 * (if the caller doesn't check, that is their problem). 376 * (if the caller doesn't check, that is their problem).
377 * N.B. a non-zero SCSI status is _not_ necessarily an error. 377 * N.B. a non-zero SCSI status is _not_ necessarily an error.
378 */ 378 */
379 blk_execute_rq(q, bd_disk, rq, at_head); 379 blk_execute_rq(q, bd_disk, rq, at_head);
380 380
381 hdr->duration = jiffies_to_msecs(jiffies - start_time); 381 hdr->duration = jiffies_to_msecs(jiffies - start_time);
382 382
383 ret = blk_complete_sghdr_rq(rq, hdr, bio); 383 ret = blk_complete_sghdr_rq(rq, hdr, bio);
384 384
385 out_free_cdb: 385 out_free_cdb:
386 if (rq->cmd != rq->__cmd) 386 if (rq->cmd != rq->__cmd)
387 kfree(rq->cmd); 387 kfree(rq->cmd);
388 out_put_request: 388 out_put_request:
389 blk_put_request(rq); 389 blk_put_request(rq);
390 return ret; 390 return ret;
391 } 391 }
392 392
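The sg_io() helper above is what ultimately services the SG_IO case of scsi_cmd_ioctl() further down: it validates interface_id and the transfer direction, maps the user buffer, runs the request and copies status and sense back via blk_complete_sghdr_rq(). As context only, here is a minimal userspace sketch of driving this path; the device node, the INQUIRY command, the buffer sizes and the timeout are illustrative assumptions, and error handling is kept to a bare minimum.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <scsi/sg.h>

int main(void)
{
	unsigned char cdb[6] = { 0x12, 0, 0, 0, 96, 0 };	/* INQUIRY, 96-byte allocation */
	unsigned char data[96], sense[32];
	struct sg_io_hdr hdr;
	int fd = open("/dev/sda", O_RDONLY);	/* arbitrary device node */

	if (fd < 0)
		return 1;

	memset(&hdr, 0, sizeof(hdr));
	hdr.interface_id = 'S';			/* checked first by sg_io() above */
	hdr.cmd_len = sizeof(cdb);
	hdr.cmdp = cdb;
	hdr.dxfer_direction = SG_DXFER_FROM_DEV;
	hdr.dxfer_len = sizeof(data);
	hdr.dxferp = data;
	hdr.mx_sb_len = sizeof(sense);
	hdr.sbp = sense;
	hdr.timeout = 5000;			/* ms, copied into rq->timeout */

	if (ioctl(fd, SG_IO, &hdr) < 0)
		perror("SG_IO");
	else
		printf("status 0x%x, %d bytes of sense\n", hdr.status, hdr.sb_len_wr);

	close(fd);
	return 0;
}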
393 /** 393 /**
394 * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl 394 * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl
395 * @mode: open mode of the file this ioctl operates on 395 * @mode: open mode of the file this ioctl operates on
396 * @q: request queue to send scsi commands down 396 * @q: request queue to send scsi commands down
397 * @disk: gendisk to operate on (optional) 397 * @disk: gendisk to operate on (optional)
398 * @sic: userspace structure describing the command to perform 398 * @sic: userspace structure describing the command to perform
399 * 399 *
400 * Send down the scsi command described by @sic to the device below 400 * Send down the scsi command described by @sic to the device below
401 * the request queue @q. @mode is used to perform fine-grained 401 * the request queue @q. @mode is used to perform fine-grained
402 * permission checks: callers without FMODE_WRITE may only send down 402 * permission checks: callers without FMODE_WRITE may only send down
403 * non-destructive SCSI commands. If the caller has a struct gendisk 403 * non-destructive SCSI commands. If the caller has a struct gendisk
404 * available it should be passed in as @disk to allow the low level 404 * available it should be passed in as @disk to allow the low level
405 * driver to use the information contained in it. A NULL @disk 405 * driver to use the information contained in it. A NULL @disk
406 * is only allowed if the caller knows that the low level driver doesn't 406 * is only allowed if the caller knows that the low level driver doesn't
407 * need it (e.g. in the scsi subsystem). 407 * need it (e.g. in the scsi subsystem).
408 * 408 *
409 * Notes: 409 * Notes:
410 * - This interface is deprecated - users should use the SG_IO 410 * - This interface is deprecated - users should use the SG_IO
411 * interface instead, as this is a more flexible approach to 411 * interface instead, as this is a more flexible approach to
412 * performing SCSI commands on a device. 412 * performing SCSI commands on a device.
413 * - The SCSI command length is determined by examining the 1st byte 413 * - The SCSI command length is determined by examining the 1st byte
414 * of the given command. There is no way to override this. 414 * of the given command. There is no way to override this.
415 * - Data transfers are limited to PAGE_SIZE 415 * - Data transfers are limited to PAGE_SIZE
416 * - The command-plus-data buffer at @sic->data must be at least OMAX_SB_LEN 416 * - The command-plus-data buffer at @sic->data must be at least OMAX_SB_LEN
417 * bytes long to accommodate the sense buffer when an error occurs. 417 * bytes long to accommodate the sense buffer when an error occurs.
418 * The sense buffer is truncated to OMAX_SB_LEN (16) bytes so that 418 * The sense buffer is truncated to OMAX_SB_LEN (16) bytes so that
419 * old code will not be surprised. 419 * old code will not be surprised.
420 * - If a Unix error occurs (e.g. ENOMEM) then the user will receive 420 * - If a Unix error occurs (e.g. ENOMEM) then the user will receive
421 * a negative return and the Unix error code in 'errno'. 421 * a negative return and the Unix error code in 'errno'.
422 * If the SCSI command succeeds then 0 is returned. 422 * If the SCSI command succeeds then 0 is returned.
423 * Positive numbers returned are the compacted SCSI error codes (4 423 * Positive numbers returned are the compacted SCSI error codes (4
424 * bytes in one int) where the lowest byte is the SCSI status. 424 * bytes in one int) where the lowest byte is the SCSI status.
425 */ 425 */
426 #define OMAX_SB_LEN 16 /* For backward compatibility */ 426 #define OMAX_SB_LEN 16 /* For backward compatibility */
427 int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, 427 int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
428 struct scsi_ioctl_command __user *sic) 428 struct scsi_ioctl_command __user *sic)
429 { 429 {
430 struct request *rq; 430 struct request *rq;
431 int err; 431 int err;
432 unsigned int in_len, out_len, bytes, opcode, cmdlen; 432 unsigned int in_len, out_len, bytes, opcode, cmdlen;
433 char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE]; 433 char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE];
434 434
435 if (!sic) 435 if (!sic)
436 return -EINVAL; 436 return -EINVAL;
437 437
438 /* 438 /*
439 * get in and out lengths, verify they don't exceed a page worth of data 439 * get in and out lengths, verify they don't exceed a page worth of data
440 */ 440 */
441 if (get_user(in_len, &sic->inlen)) 441 if (get_user(in_len, &sic->inlen))
442 return -EFAULT; 442 return -EFAULT;
443 if (get_user(out_len, &sic->outlen)) 443 if (get_user(out_len, &sic->outlen))
444 return -EFAULT; 444 return -EFAULT;
445 if (in_len > PAGE_SIZE || out_len > PAGE_SIZE) 445 if (in_len > PAGE_SIZE || out_len > PAGE_SIZE)
446 return -EINVAL; 446 return -EINVAL;
447 if (get_user(opcode, sic->data)) 447 if (get_user(opcode, sic->data))
448 return -EFAULT; 448 return -EFAULT;
449 449
450 bytes = max(in_len, out_len); 450 bytes = max(in_len, out_len);
451 if (bytes) { 451 if (bytes) {
452 buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER | __GFP_NOWARN); 452 buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER | __GFP_NOWARN);
453 if (!buffer) 453 if (!buffer)
454 return -ENOMEM; 454 return -ENOMEM;
455 455
456 } 456 }
457 457
458 rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); 458 rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT);
459 if (IS_ERR(rq)) { 459 if (IS_ERR(rq)) {
460 err = PTR_ERR(rq); 460 err = PTR_ERR(rq);
461 goto error; 461 goto error_free_buffer;
462 } 462 }
463 blk_rq_set_block_pc(rq); 463 blk_rq_set_block_pc(rq);
464 464
465 cmdlen = COMMAND_SIZE(opcode); 465 cmdlen = COMMAND_SIZE(opcode);
466 466
467 /* 467 /*
468 * get command and data to send to device, if any 468 * get command and data to send to device, if any
469 */ 469 */
470 err = -EFAULT; 470 err = -EFAULT;
471 rq->cmd_len = cmdlen; 471 rq->cmd_len = cmdlen;
472 if (copy_from_user(rq->cmd, sic->data, cmdlen)) 472 if (copy_from_user(rq->cmd, sic->data, cmdlen))
473 goto error; 473 goto error;
474 474
475 if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) 475 if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
476 goto error; 476 goto error;
477 477
478 err = blk_verify_command(rq->cmd, mode & FMODE_WRITE); 478 err = blk_verify_command(rq->cmd, mode & FMODE_WRITE);
479 if (err) 479 if (err)
480 goto error; 480 goto error;
481 481
482 /* default, possibly overridden later */ 482 /* default, possibly overridden later */
483 rq->retries = 5; 483 rq->retries = 5;
484 484
485 switch (opcode) { 485 switch (opcode) {
486 case SEND_DIAGNOSTIC: 486 case SEND_DIAGNOSTIC:
487 case FORMAT_UNIT: 487 case FORMAT_UNIT:
488 rq->timeout = FORMAT_UNIT_TIMEOUT; 488 rq->timeout = FORMAT_UNIT_TIMEOUT;
489 rq->retries = 1; 489 rq->retries = 1;
490 break; 490 break;
491 case START_STOP: 491 case START_STOP:
492 rq->timeout = START_STOP_TIMEOUT; 492 rq->timeout = START_STOP_TIMEOUT;
493 break; 493 break;
494 case MOVE_MEDIUM: 494 case MOVE_MEDIUM:
495 rq->timeout = MOVE_MEDIUM_TIMEOUT; 495 rq->timeout = MOVE_MEDIUM_TIMEOUT;
496 break; 496 break;
497 case READ_ELEMENT_STATUS: 497 case READ_ELEMENT_STATUS:
498 rq->timeout = READ_ELEMENT_STATUS_TIMEOUT; 498 rq->timeout = READ_ELEMENT_STATUS_TIMEOUT;
499 break; 499 break;
500 case READ_DEFECT_DATA: 500 case READ_DEFECT_DATA:
501 rq->timeout = READ_DEFECT_DATA_TIMEOUT; 501 rq->timeout = READ_DEFECT_DATA_TIMEOUT;
502 rq->retries = 1; 502 rq->retries = 1;
503 break; 503 break;
504 default: 504 default:
505 rq->timeout = BLK_DEFAULT_SG_TIMEOUT; 505 rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
506 break; 506 break;
507 } 507 }
508 508
509 if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) { 509 if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) {
510 err = DRIVER_ERROR << 24; 510 err = DRIVER_ERROR << 24;
511 goto error; 511 goto error;
512 } 512 }
513 513
514 memset(sense, 0, sizeof(sense)); 514 memset(sense, 0, sizeof(sense));
515 rq->sense = sense; 515 rq->sense = sense;
516 rq->sense_len = 0; 516 rq->sense_len = 0;
517 517
518 blk_execute_rq(q, disk, rq, 0); 518 blk_execute_rq(q, disk, rq, 0);
519 519
520 err = rq->errors & 0xff; /* only 8 bit SCSI status */ 520 err = rq->errors & 0xff; /* only 8 bit SCSI status */
521 if (err) { 521 if (err) {
522 if (rq->sense_len && rq->sense) { 522 if (rq->sense_len && rq->sense) {
523 bytes = (OMAX_SB_LEN > rq->sense_len) ? 523 bytes = (OMAX_SB_LEN > rq->sense_len) ?
524 rq->sense_len : OMAX_SB_LEN; 524 rq->sense_len : OMAX_SB_LEN;
525 if (copy_to_user(sic->data, rq->sense, bytes)) 525 if (copy_to_user(sic->data, rq->sense, bytes))
526 err = -EFAULT; 526 err = -EFAULT;
527 } 527 }
528 } else { 528 } else {
529 if (copy_to_user(sic->data, buffer, out_len)) 529 if (copy_to_user(sic->data, buffer, out_len))
530 err = -EFAULT; 530 err = -EFAULT;
531 } 531 }
532 532
533 error: 533 error:
534 blk_put_request(rq);
535
536 error_free_buffer:
534 kfree(buffer); 537 kfree(buffer);
535 if (rq) 538
536 blk_put_request(rq);
537 return err; 539 return err;
538 } 540 }
539 EXPORT_SYMBOL_GPL(sg_scsi_ioctl); 541 EXPORT_SYMBOL_GPL(sg_scsi_ioctl);
540 542
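As the notes above explain, the deprecated interface packs everything into one buffer: two length words followed by the CDB, any out-data, and (on error) up to OMAX_SB_LEN bytes of sense data. A hedged userspace sketch of that layout follows; the struct is declared locally because the layout, not a particular header, is the point, and the device node and buffer size are arbitrary assumptions.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <scsi/scsi_ioctl.h>		/* SCSI_IOCTL_SEND_COMMAND */

/* Local copy of the layout consumed above: two length words followed by
 * the CDB plus any in/out data (and, on error, up to OMAX_SB_LEN bytes
 * of sense data). */
struct send_command_buf {
	unsigned int  inlen;		/* bytes of data sent to the device */
	unsigned int  outlen;		/* bytes of data expected back */
	unsigned char data[64];		/* CDB, then data / sense */
};

int main(void)
{
	struct send_command_buf c;
	int fd, ret;

	memset(&c, 0, sizeof(c));
	c.data[0] = 0x00;		/* TEST UNIT READY: 6-byte CDB, no data */

	fd = open("/dev/sda", O_RDONLY);	/* arbitrary device node */
	if (fd < 0)
		return 1;

	/* 0 on success, <0 with errno set, or the compacted SCSI result
	 * (lowest byte = SCSI status) as documented above. */
	ret = ioctl(fd, SCSI_IOCTL_SEND_COMMAND, &c);
	printf("SCSI_IOCTL_SEND_COMMAND returned %d\n", ret);
	close(fd);
	return 0;
}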
541 /* Send basic block requests */ 543 /* Send basic block requests */
542 static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, 544 static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
543 int cmd, int data) 545 int cmd, int data)
544 { 546 {
545 struct request *rq; 547 struct request *rq;
546 int err; 548 int err;
547 549
548 rq = blk_get_request(q, WRITE, __GFP_WAIT); 550 rq = blk_get_request(q, WRITE, __GFP_WAIT);
549 if (IS_ERR(rq)) 551 if (IS_ERR(rq))
550 return PTR_ERR(rq); 552 return PTR_ERR(rq);
551 blk_rq_set_block_pc(rq); 553 blk_rq_set_block_pc(rq);
552 rq->timeout = BLK_DEFAULT_SG_TIMEOUT; 554 rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
553 rq->cmd[0] = cmd; 555 rq->cmd[0] = cmd;
554 rq->cmd[4] = data; 556 rq->cmd[4] = data;
555 rq->cmd_len = 6; 557 rq->cmd_len = 6;
556 err = blk_execute_rq(q, bd_disk, rq, 0); 558 err = blk_execute_rq(q, bd_disk, rq, 0);
557 blk_put_request(rq); 559 blk_put_request(rq);
558 560
559 return err; 561 return err;
560 } 562 }
561 563
562 static inline int blk_send_start_stop(struct request_queue *q, 564 static inline int blk_send_start_stop(struct request_queue *q,
563 struct gendisk *bd_disk, int data) 565 struct gendisk *bd_disk, int data)
564 { 566 {
565 return __blk_send_generic(q, bd_disk, GPCMD_START_STOP_UNIT, data); 567 return __blk_send_generic(q, bd_disk, GPCMD_START_STOP_UNIT, data);
566 } 568 }
567 569
568 int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mode, 570 int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mode,
569 unsigned int cmd, void __user *arg) 571 unsigned int cmd, void __user *arg)
570 { 572 {
571 int err; 573 int err;
572 574
573 if (!q) 575 if (!q)
574 return -ENXIO; 576 return -ENXIO;
575 577
576 switch (cmd) { 578 switch (cmd) {
577 /* 579 /*
578 * new sgv3 interface 580 * new sgv3 interface
579 */ 581 */
580 case SG_GET_VERSION_NUM: 582 case SG_GET_VERSION_NUM:
581 err = sg_get_version(arg); 583 err = sg_get_version(arg);
582 break; 584 break;
583 case SCSI_IOCTL_GET_IDLUN: 585 case SCSI_IOCTL_GET_IDLUN:
584 err = scsi_get_idlun(q, arg); 586 err = scsi_get_idlun(q, arg);
585 break; 587 break;
586 case SCSI_IOCTL_GET_BUS_NUMBER: 588 case SCSI_IOCTL_GET_BUS_NUMBER:
587 err = scsi_get_bus(q, arg); 589 err = scsi_get_bus(q, arg);
588 break; 590 break;
589 case SG_SET_TIMEOUT: 591 case SG_SET_TIMEOUT:
590 err = sg_set_timeout(q, arg); 592 err = sg_set_timeout(q, arg);
591 break; 593 break;
592 case SG_GET_TIMEOUT: 594 case SG_GET_TIMEOUT:
593 err = sg_get_timeout(q); 595 err = sg_get_timeout(q);
594 break; 596 break;
595 case SG_GET_RESERVED_SIZE: 597 case SG_GET_RESERVED_SIZE:
596 err = sg_get_reserved_size(q, arg); 598 err = sg_get_reserved_size(q, arg);
597 break; 599 break;
598 case SG_SET_RESERVED_SIZE: 600 case SG_SET_RESERVED_SIZE:
599 err = sg_set_reserved_size(q, arg); 601 err = sg_set_reserved_size(q, arg);
600 break; 602 break;
601 case SG_EMULATED_HOST: 603 case SG_EMULATED_HOST:
602 err = sg_emulated_host(q, arg); 604 err = sg_emulated_host(q, arg);
603 break; 605 break;
604 case SG_IO: { 606 case SG_IO: {
605 struct sg_io_hdr hdr; 607 struct sg_io_hdr hdr;
606 608
607 err = -EFAULT; 609 err = -EFAULT;
608 if (copy_from_user(&hdr, arg, sizeof(hdr))) 610 if (copy_from_user(&hdr, arg, sizeof(hdr)))
609 break; 611 break;
610 err = sg_io(q, bd_disk, &hdr, mode); 612 err = sg_io(q, bd_disk, &hdr, mode);
611 if (err == -EFAULT) 613 if (err == -EFAULT)
612 break; 614 break;
613 615
614 if (copy_to_user(arg, &hdr, sizeof(hdr))) 616 if (copy_to_user(arg, &hdr, sizeof(hdr)))
615 err = -EFAULT; 617 err = -EFAULT;
616 break; 618 break;
617 } 619 }
618 case CDROM_SEND_PACKET: { 620 case CDROM_SEND_PACKET: {
619 struct cdrom_generic_command cgc; 621 struct cdrom_generic_command cgc;
620 struct sg_io_hdr hdr; 622 struct sg_io_hdr hdr;
621 623
622 err = -EFAULT; 624 err = -EFAULT;
623 if (copy_from_user(&cgc, arg, sizeof(cgc))) 625 if (copy_from_user(&cgc, arg, sizeof(cgc)))
624 break; 626 break;
625 cgc.timeout = clock_t_to_jiffies(cgc.timeout); 627 cgc.timeout = clock_t_to_jiffies(cgc.timeout);
626 memset(&hdr, 0, sizeof(hdr)); 628 memset(&hdr, 0, sizeof(hdr));
627 hdr.interface_id = 'S'; 629 hdr.interface_id = 'S';
628 hdr.cmd_len = sizeof(cgc.cmd); 630 hdr.cmd_len = sizeof(cgc.cmd);
629 hdr.dxfer_len = cgc.buflen; 631 hdr.dxfer_len = cgc.buflen;
630 err = 0; 632 err = 0;
631 switch (cgc.data_direction) { 633 switch (cgc.data_direction) {
632 case CGC_DATA_UNKNOWN: 634 case CGC_DATA_UNKNOWN:
633 hdr.dxfer_direction = SG_DXFER_UNKNOWN; 635 hdr.dxfer_direction = SG_DXFER_UNKNOWN;
634 break; 636 break;
635 case CGC_DATA_WRITE: 637 case CGC_DATA_WRITE:
636 hdr.dxfer_direction = SG_DXFER_TO_DEV; 638 hdr.dxfer_direction = SG_DXFER_TO_DEV;
637 break; 639 break;
638 case CGC_DATA_READ: 640 case CGC_DATA_READ:
639 hdr.dxfer_direction = SG_DXFER_FROM_DEV; 641 hdr.dxfer_direction = SG_DXFER_FROM_DEV;
640 break; 642 break;
641 case CGC_DATA_NONE: 643 case CGC_DATA_NONE:
642 hdr.dxfer_direction = SG_DXFER_NONE; 644 hdr.dxfer_direction = SG_DXFER_NONE;
643 break; 645 break;
644 default: 646 default:
645 err = -EINVAL; 647 err = -EINVAL;
646 } 648 }
647 if (err) 649 if (err)
648 break; 650 break;
649 651
650 hdr.dxferp = cgc.buffer; 652 hdr.dxferp = cgc.buffer;
651 hdr.sbp = cgc.sense; 653 hdr.sbp = cgc.sense;
652 if (hdr.sbp) 654 if (hdr.sbp)
653 hdr.mx_sb_len = sizeof(struct request_sense); 655 hdr.mx_sb_len = sizeof(struct request_sense);
654 hdr.timeout = jiffies_to_msecs(cgc.timeout); 656 hdr.timeout = jiffies_to_msecs(cgc.timeout);
655 hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd; 657 hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd;
656 hdr.cmd_len = sizeof(cgc.cmd); 658 hdr.cmd_len = sizeof(cgc.cmd);
657 659
658 err = sg_io(q, bd_disk, &hdr, mode); 660 err = sg_io(q, bd_disk, &hdr, mode);
659 if (err == -EFAULT) 661 if (err == -EFAULT)
660 break; 662 break;
661 663
662 if (hdr.status) 664 if (hdr.status)
663 err = -EIO; 665 err = -EIO;
664 666
665 cgc.stat = err; 667 cgc.stat = err;
666 cgc.buflen = hdr.resid; 668 cgc.buflen = hdr.resid;
667 if (copy_to_user(arg, &cgc, sizeof(cgc))) 669 if (copy_to_user(arg, &cgc, sizeof(cgc)))
668 err = -EFAULT; 670 err = -EFAULT;
669 671
670 break; 672 break;
671 } 673 }
672 674
673 /* 675 /*
674 * old junk scsi send command ioctl 676 * old junk scsi send command ioctl
675 */ 677 */
676 case SCSI_IOCTL_SEND_COMMAND: 678 case SCSI_IOCTL_SEND_COMMAND:
677 printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm); 679 printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm);
678 err = -EINVAL; 680 err = -EINVAL;
679 if (!arg) 681 if (!arg)
680 break; 682 break;
681 683
682 err = sg_scsi_ioctl(q, bd_disk, mode, arg); 684 err = sg_scsi_ioctl(q, bd_disk, mode, arg);
683 break; 685 break;
684 case CDROMCLOSETRAY: 686 case CDROMCLOSETRAY:
685 err = blk_send_start_stop(q, bd_disk, 0x03); 687 err = blk_send_start_stop(q, bd_disk, 0x03);
686 break; 688 break;
687 case CDROMEJECT: 689 case CDROMEJECT:
688 err = blk_send_start_stop(q, bd_disk, 0x02); 690 err = blk_send_start_stop(q, bd_disk, 0x02);
689 break; 691 break;
690 default: 692 default:
691 err = -ENOTTY; 693 err = -ENOTTY;
692 } 694 }
693 695
694 return err; 696 return err;
695 } 697 }
696 EXPORT_SYMBOL(scsi_cmd_ioctl); 698 EXPORT_SYMBOL(scsi_cmd_ioctl);
697 699
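The CDROM_SEND_PACKET branch above is just a translation layer: it converts a cdrom_generic_command into an sg_io_hdr (direction, timeout, sense pointer) and reuses sg_io(). A short userspace sketch of the caller's side follows; the /dev/sr0 node, the TEST UNIT READY packet and the timeout value are illustrative assumptions.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/cdrom.h>

int main(void)
{
	struct cdrom_generic_command cgc;
	int fd = open("/dev/sr0", O_RDONLY | O_NONBLOCK);	/* arbitrary device node */

	if (fd < 0)
		return 1;

	memset(&cgc, 0, sizeof(cgc));
	cgc.cmd[0] = 0x00;			/* TEST UNIT READY, no payload */
	cgc.data_direction = CGC_DATA_NONE;	/* mapped to SG_DXFER_NONE above */
	cgc.timeout = 500;			/* USER_HZ ticks; see clock_t_to_jiffies() above */

	if (ioctl(fd, CDROM_SEND_PACKET, &cgc) < 0)
		perror("CDROM_SEND_PACKET");
	else
		printf("stat=%d residual=%u\n", cgc.stat, cgc.buflen);

	close(fd);
	return 0;
}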
698 int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) 700 int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
699 { 701 {
700 if (bd && bd == bd->bd_contains) 702 if (bd && bd == bd->bd_contains)
701 return 0; 703 return 0;
702 704
703 /* Actually none of these is particularly useful on a partition, 705 /* Actually none of these is particularly useful on a partition,
704 * but they are safe. 706 * but they are safe.
705 */ 707 */
706 switch (cmd) { 708 switch (cmd) {
707 case SCSI_IOCTL_GET_IDLUN: 709 case SCSI_IOCTL_GET_IDLUN:
708 case SCSI_IOCTL_GET_BUS_NUMBER: 710 case SCSI_IOCTL_GET_BUS_NUMBER:
709 case SCSI_IOCTL_GET_PCI: 711 case SCSI_IOCTL_GET_PCI:
710 case SCSI_IOCTL_PROBE_HOST: 712 case SCSI_IOCTL_PROBE_HOST:
711 case SG_GET_VERSION_NUM: 713 case SG_GET_VERSION_NUM:
712 case SG_SET_TIMEOUT: 714 case SG_SET_TIMEOUT:
713 case SG_GET_TIMEOUT: 715 case SG_GET_TIMEOUT:
714 case SG_GET_RESERVED_SIZE: 716 case SG_GET_RESERVED_SIZE:
715 case SG_SET_RESERVED_SIZE: 717 case SG_SET_RESERVED_SIZE:
716 case SG_EMULATED_HOST: 718 case SG_EMULATED_HOST:
717 return 0; 719 return 0;
718 case CDROM_GET_CAPABILITY: 720 case CDROM_GET_CAPABILITY:
719 /* Keep this until we remove the printk below. udev sends it 721 /* Keep this until we remove the printk below. udev sends it
720 * and we do not want to spam dmesg about it. CD-ROMs do 722 * and we do not want to spam dmesg about it. CD-ROMs do
721 * not have partitions, so we get here only for disks. 723 * not have partitions, so we get here only for disks.
722 */ 724 */
723 return -ENOIOCTLCMD; 725 return -ENOIOCTLCMD;
724 default: 726 default:
725 break; 727 break;
726 } 728 }
727 729
728 if (capable(CAP_SYS_RAWIO)) 730 if (capable(CAP_SYS_RAWIO))
729 return 0; 731 return 0;
730 732
731 /* In particular, rule out all resets and host-specific ioctls. */ 733 /* In particular, rule out all resets and host-specific ioctls. */
732 printk_ratelimited(KERN_WARNING 734 printk_ratelimited(KERN_WARNING
733 "%s: sending ioctl %x to a partition!\n", current->comm, cmd); 735 "%s: sending ioctl %x to a partition!\n", current->comm, cmd);
734 736
735 return -ENOIOCTLCMD; 737 return -ENOIOCTLCMD;
736 } 738 }
737 EXPORT_SYMBOL(scsi_verify_blk_ioctl); 739 EXPORT_SYMBOL(scsi_verify_blk_ioctl);
738 740
739 int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode, 741 int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
740 unsigned int cmd, void __user *arg) 742 unsigned int cmd, void __user *arg)
741 { 743 {
742 int ret; 744 int ret;
743 745
744 ret = scsi_verify_blk_ioctl(bd, cmd); 746 ret = scsi_verify_blk_ioctl(bd, cmd);
745 if (ret < 0) 747 if (ret < 0)
746 return ret; 748 return ret;
747 749
748 return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg); 750 return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg);
749 } 751 }
750 EXPORT_SYMBOL(scsi_cmd_blk_ioctl); 752 EXPORT_SYMBOL(scsi_cmd_blk_ioctl);
751 753
752 static int __init blk_scsi_ioctl_init(void) 754 static int __init blk_scsi_ioctl_init(void)
753 { 755 {
754 blk_set_cmd_filter_defaults(&blk_default_cmd_filter); 756 blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
755 return 0; 757 return 0;
756 } 758 }
757 fs_initcall(blk_scsi_ioctl_init); 759 fs_initcall(blk_scsi_ioctl_init);