Commit c7149d6bce2561aeaa48caaa1700aa8b3b22008f

Authored by Alan D. Brunelle
Committed by Jens Axboe
1 parent ec05b297f9

Fix remap handling by blktrace

This patch provides more information concerning REMAP operations on block
IOs. The additional information gives clearer details at the user level
and supports post-processing analysis in btt.

o  Adds remap traces for partition remaps on the same device.
o  Fixes up the remap information in DM so that it is in the right order.
o  Sends up both mapped-from and mapped-to device information (see the
   sketch below).
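To make the last point concrete, here is a small, self-contained userspace
sketch. The struct, field and function names below are illustrative
assumptions made for this example, not the identifiers touched by the patch;
it only shows the kind of data a REMAP trace event has to carry so that
user-level tools such as blkparse and btt can follow an I/O from the device
it was submitted on to the device it was remapped onto.

/*
 * Illustrative sketch only -- names are assumptions, not the patch's code.
 * A remap record needs both the mapped-from and mapped-to device, plus the
 * sector on each side, for post-processing to stitch the I/O path together.
 */
#include <stdint.h>
#include <stdio.h>

struct remap_event {
        uint32_t dev_from;    /* device the I/O was originally issued to */
        uint32_t dev_to;      /* device the I/O has been remapped onto   */
        uint64_t sector_from; /* sector before the remap                 */
        uint64_t sector_to;   /* sector after the remap                  */
};

/* Print a line roughly in the major:minor form trace consumers display. */
static void print_remap(const struct remap_event *r)
{
        printf("REMAP %u:%u sector %llu -> %u:%u sector %llu\n",
               r->dev_from >> 20, r->dev_from & 0xfffffu,
               (unsigned long long)r->sector_from,
               r->dev_to >> 20, r->dev_to & 0xfffffu,
               (unsigned long long)r->sector_to);
}

int main(void)
{
        /* e.g. a partition remap on the same disk: 8:1 -> 8:0 */
        struct remap_event ev = {
                .dev_from    = (8u << 20) | 1,
                .dev_to      = (8u << 20) | 0,
                .sector_from = 0,
                .sector_to   = 63,
        };

        print_remap(&ev);
        return 0;
}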

Signed-off-by: Alan D. Brunelle <alan.brunelle@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

Showing 3 changed files with 8 additions and 3 deletions

1 /* 1 /*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics 3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> 5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000 6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000
7 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 7 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
8 */ 8 */
9 9
10 /* 10 /*
11 * This handles all read/write requests to block devices 11 * This handles all read/write requests to block devices
12 */ 12 */
13 #include <linux/kernel.h> 13 #include <linux/kernel.h>
14 #include <linux/module.h> 14 #include <linux/module.h>
15 #include <linux/backing-dev.h> 15 #include <linux/backing-dev.h>
16 #include <linux/bio.h> 16 #include <linux/bio.h>
17 #include <linux/blkdev.h> 17 #include <linux/blkdev.h>
18 #include <linux/highmem.h> 18 #include <linux/highmem.h>
19 #include <linux/mm.h> 19 #include <linux/mm.h>
20 #include <linux/kernel_stat.h> 20 #include <linux/kernel_stat.h>
21 #include <linux/string.h> 21 #include <linux/string.h>
22 #include <linux/init.h> 22 #include <linux/init.h>
23 #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 23 #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
24 #include <linux/completion.h> 24 #include <linux/completion.h>
25 #include <linux/slab.h> 25 #include <linux/slab.h>
26 #include <linux/swap.h> 26 #include <linux/swap.h>
27 #include <linux/writeback.h> 27 #include <linux/writeback.h>
28 #include <linux/task_io_accounting_ops.h> 28 #include <linux/task_io_accounting_ops.h>
29 #include <linux/interrupt.h> 29 #include <linux/interrupt.h>
30 #include <linux/cpu.h> 30 #include <linux/cpu.h>
31 #include <linux/blktrace_api.h> 31 #include <linux/blktrace_api.h>
32 #include <linux/fault-inject.h> 32 #include <linux/fault-inject.h>
33 33
34 /* 34 /*
35 * for max sense size 35 * for max sense size
36 */ 36 */
37 #include <scsi/scsi_cmnd.h> 37 #include <scsi/scsi_cmnd.h>
38 38
39 static void blk_unplug_work(struct work_struct *work); 39 static void blk_unplug_work(struct work_struct *work);
40 static void blk_unplug_timeout(unsigned long data); 40 static void blk_unplug_timeout(unsigned long data);
41 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io); 41 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
42 static void init_request_from_bio(struct request *req, struct bio *bio); 42 static void init_request_from_bio(struct request *req, struct bio *bio);
43 static int __make_request(struct request_queue *q, struct bio *bio); 43 static int __make_request(struct request_queue *q, struct bio *bio);
44 static struct io_context *current_io_context(gfp_t gfp_flags, int node); 44 static struct io_context *current_io_context(gfp_t gfp_flags, int node);
45 45
46 /* 46 /*
47 * For the allocated request tables 47 * For the allocated request tables
48 */ 48 */
49 static struct kmem_cache *request_cachep; 49 static struct kmem_cache *request_cachep;
50 50
51 /* 51 /*
52 * For queue allocation 52 * For queue allocation
53 */ 53 */
54 static struct kmem_cache *requestq_cachep; 54 static struct kmem_cache *requestq_cachep;
55 55
56 /* 56 /*
57 * For io context allocations 57 * For io context allocations
58 */ 58 */
59 static struct kmem_cache *iocontext_cachep; 59 static struct kmem_cache *iocontext_cachep;
60 60
61 /* 61 /*
62 * Controlling structure to kblockd 62 * Controlling structure to kblockd
63 */ 63 */
64 static struct workqueue_struct *kblockd_workqueue; 64 static struct workqueue_struct *kblockd_workqueue;
65 65
66 unsigned long blk_max_low_pfn, blk_max_pfn; 66 unsigned long blk_max_low_pfn, blk_max_pfn;
67 67
68 EXPORT_SYMBOL(blk_max_low_pfn); 68 EXPORT_SYMBOL(blk_max_low_pfn);
69 EXPORT_SYMBOL(blk_max_pfn); 69 EXPORT_SYMBOL(blk_max_pfn);
70 70
71 static DEFINE_PER_CPU(struct list_head, blk_cpu_done); 71 static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
72 72
73 /* Amount of time in which a process may batch requests */ 73 /* Amount of time in which a process may batch requests */
74 #define BLK_BATCH_TIME (HZ/50UL) 74 #define BLK_BATCH_TIME (HZ/50UL)
75 75
76 /* Number of requests a "batching" process may submit */ 76 /* Number of requests a "batching" process may submit */
77 #define BLK_BATCH_REQ 32 77 #define BLK_BATCH_REQ 32
78 78
79 /* 79 /*
80 * Return the threshold (number of used requests) at which the queue is 80 * Return the threshold (number of used requests) at which the queue is
81 * considered to be congested. It include a little hysteresis to keep the 81 * considered to be congested. It include a little hysteresis to keep the
82 * context switch rate down. 82 * context switch rate down.
83 */ 83 */
84 static inline int queue_congestion_on_threshold(struct request_queue *q) 84 static inline int queue_congestion_on_threshold(struct request_queue *q)
85 { 85 {
86 return q->nr_congestion_on; 86 return q->nr_congestion_on;
87 } 87 }
88 88
89 /* 89 /*
90 * The threshold at which a queue is considered to be uncongested 90 * The threshold at which a queue is considered to be uncongested
91 */ 91 */
92 static inline int queue_congestion_off_threshold(struct request_queue *q) 92 static inline int queue_congestion_off_threshold(struct request_queue *q)
93 { 93 {
94 return q->nr_congestion_off; 94 return q->nr_congestion_off;
95 } 95 }
96 96
97 static void blk_queue_congestion_threshold(struct request_queue *q) 97 static void blk_queue_congestion_threshold(struct request_queue *q)
98 { 98 {
99 int nr; 99 int nr;
100 100
101 nr = q->nr_requests - (q->nr_requests / 8) + 1; 101 nr = q->nr_requests - (q->nr_requests / 8) + 1;
102 if (nr > q->nr_requests) 102 if (nr > q->nr_requests)
103 nr = q->nr_requests; 103 nr = q->nr_requests;
104 q->nr_congestion_on = nr; 104 q->nr_congestion_on = nr;
105 105
106 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; 106 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
107 if (nr < 1) 107 if (nr < 1)
108 nr = 1; 108 nr = 1;
109 q->nr_congestion_off = nr; 109 q->nr_congestion_off = nr;
110 } 110 }
111 111
112 /** 112 /**
113 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info 113 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
114 * @bdev: device 114 * @bdev: device
115 * 115 *
116 * Locates the passed device's request queue and returns the address of its 116 * Locates the passed device's request queue and returns the address of its
117 * backing_dev_info 117 * backing_dev_info
118 * 118 *
119 * Will return NULL if the request queue cannot be located. 119 * Will return NULL if the request queue cannot be located.
120 */ 120 */
121 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) 121 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
122 { 122 {
123 struct backing_dev_info *ret = NULL; 123 struct backing_dev_info *ret = NULL;
124 struct request_queue *q = bdev_get_queue(bdev); 124 struct request_queue *q = bdev_get_queue(bdev);
125 125
126 if (q) 126 if (q)
127 ret = &q->backing_dev_info; 127 ret = &q->backing_dev_info;
128 return ret; 128 return ret;
129 } 129 }
130 EXPORT_SYMBOL(blk_get_backing_dev_info); 130 EXPORT_SYMBOL(blk_get_backing_dev_info);
131 131
132 /** 132 /**
133 * blk_queue_prep_rq - set a prepare_request function for queue 133 * blk_queue_prep_rq - set a prepare_request function for queue
134 * @q: queue 134 * @q: queue
135 * @pfn: prepare_request function 135 * @pfn: prepare_request function
136 * 136 *
137 * It's possible for a queue to register a prepare_request callback which 137 * It's possible for a queue to register a prepare_request callback which
138 * is invoked before the request is handed to the request_fn. The goal of 138 * is invoked before the request is handed to the request_fn. The goal of
139 * the function is to prepare a request for I/O, it can be used to build a 139 * the function is to prepare a request for I/O, it can be used to build a
140 * cdb from the request data for instance. 140 * cdb from the request data for instance.
141 * 141 *
142 */ 142 */
143 void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) 143 void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
144 { 144 {
145 q->prep_rq_fn = pfn; 145 q->prep_rq_fn = pfn;
146 } 146 }
147 147
148 EXPORT_SYMBOL(blk_queue_prep_rq); 148 EXPORT_SYMBOL(blk_queue_prep_rq);
149 149
150 /** 150 /**
151 * blk_queue_merge_bvec - set a merge_bvec function for queue 151 * blk_queue_merge_bvec - set a merge_bvec function for queue
152 * @q: queue 152 * @q: queue
153 * @mbfn: merge_bvec_fn 153 * @mbfn: merge_bvec_fn
154 * 154 *
155 * Usually queues have static limitations on the max sectors or segments that 155 * Usually queues have static limitations on the max sectors or segments that
156 * we can put in a request. Stacking drivers may have some settings that 156 * we can put in a request. Stacking drivers may have some settings that
157 * are dynamic, and thus we have to query the queue whether it is ok to 157 * are dynamic, and thus we have to query the queue whether it is ok to
158 * add a new bio_vec to a bio at a given offset or not. If the block device 158 * add a new bio_vec to a bio at a given offset or not. If the block device
159 * has such limitations, it needs to register a merge_bvec_fn to control 159 * has such limitations, it needs to register a merge_bvec_fn to control
160 * the size of bio's sent to it. Note that a block device *must* allow a 160 * the size of bio's sent to it. Note that a block device *must* allow a
161 * single page to be added to an empty bio. The block device driver may want 161 * single page to be added to an empty bio. The block device driver may want
162 * to use the bio_split() function to deal with these bio's. By default 162 * to use the bio_split() function to deal with these bio's. By default
163 * no merge_bvec_fn is defined for a queue, and only the fixed limits are 163 * no merge_bvec_fn is defined for a queue, and only the fixed limits are
164 * honored. 164 * honored.
165 */ 165 */
166 void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) 166 void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn)
167 { 167 {
168 q->merge_bvec_fn = mbfn; 168 q->merge_bvec_fn = mbfn;
169 } 169 }
170 170
171 EXPORT_SYMBOL(blk_queue_merge_bvec); 171 EXPORT_SYMBOL(blk_queue_merge_bvec);
172 172
173 void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) 173 void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
174 { 174 {
175 q->softirq_done_fn = fn; 175 q->softirq_done_fn = fn;
176 } 176 }
177 177
178 EXPORT_SYMBOL(blk_queue_softirq_done); 178 EXPORT_SYMBOL(blk_queue_softirq_done);
179 179
180 /** 180 /**
181 * blk_queue_make_request - define an alternate make_request function for a device 181 * blk_queue_make_request - define an alternate make_request function for a device
182 * @q: the request queue for the device to be affected 182 * @q: the request queue for the device to be affected
183 * @mfn: the alternate make_request function 183 * @mfn: the alternate make_request function
184 * 184 *
185 * Description: 185 * Description:
186 * The normal way for &struct bios to be passed to a device 186 * The normal way for &struct bios to be passed to a device
187 * driver is for them to be collected into requests on a request 187 * driver is for them to be collected into requests on a request
188 * queue, and then to allow the device driver to select requests 188 * queue, and then to allow the device driver to select requests
189 * off that queue when it is ready. This works well for many block 189 * off that queue when it is ready. This works well for many block
190 * devices. However some block devices (typically virtual devices 190 * devices. However some block devices (typically virtual devices
191 * such as md or lvm) do not benefit from the processing on the 191 * such as md or lvm) do not benefit from the processing on the
192 * request queue, and are served best by having the requests passed 192 * request queue, and are served best by having the requests passed
193 * directly to them. This can be achieved by providing a function 193 * directly to them. This can be achieved by providing a function
194 * to blk_queue_make_request(). 194 * to blk_queue_make_request().
195 * 195 *
196 * Caveat: 196 * Caveat:
197 * The driver that does this *must* be able to deal appropriately 197 * The driver that does this *must* be able to deal appropriately
198 * with buffers in "highmemory". This can be accomplished by either calling 198 * with buffers in "highmemory". This can be accomplished by either calling
199 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling 199 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
200 * blk_queue_bounce() to create a buffer in normal memory. 200 * blk_queue_bounce() to create a buffer in normal memory.
201 **/ 201 **/
202 void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn) 202 void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn)
203 { 203 {
204 /* 204 /*
205 * set defaults 205 * set defaults
206 */ 206 */
207 q->nr_requests = BLKDEV_MAX_RQ; 207 q->nr_requests = BLKDEV_MAX_RQ;
208 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 208 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
209 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 209 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
210 q->make_request_fn = mfn; 210 q->make_request_fn = mfn;
211 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 211 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
212 q->backing_dev_info.state = 0; 212 q->backing_dev_info.state = 0;
213 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; 213 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
214 blk_queue_max_sectors(q, SAFE_MAX_SECTORS); 214 blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
215 blk_queue_hardsect_size(q, 512); 215 blk_queue_hardsect_size(q, 512);
216 blk_queue_dma_alignment(q, 511); 216 blk_queue_dma_alignment(q, 511);
217 blk_queue_congestion_threshold(q); 217 blk_queue_congestion_threshold(q);
218 q->nr_batching = BLK_BATCH_REQ; 218 q->nr_batching = BLK_BATCH_REQ;
219 219
220 q->unplug_thresh = 4; /* hmm */ 220 q->unplug_thresh = 4; /* hmm */
221 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ 221 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
222 if (q->unplug_delay == 0) 222 if (q->unplug_delay == 0)
223 q->unplug_delay = 1; 223 q->unplug_delay = 1;
224 224
225 INIT_WORK(&q->unplug_work, blk_unplug_work); 225 INIT_WORK(&q->unplug_work, blk_unplug_work);
226 226
227 q->unplug_timer.function = blk_unplug_timeout; 227 q->unplug_timer.function = blk_unplug_timeout;
228 q->unplug_timer.data = (unsigned long)q; 228 q->unplug_timer.data = (unsigned long)q;
229 229
230 /* 230 /*
231 * by default assume old behaviour and bounce for any highmem page 231 * by default assume old behaviour and bounce for any highmem page
232 */ 232 */
233 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 233 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
234 } 234 }
235 235
236 EXPORT_SYMBOL(blk_queue_make_request); 236 EXPORT_SYMBOL(blk_queue_make_request);
237 237
238 static void rq_init(struct request_queue *q, struct request *rq) 238 static void rq_init(struct request_queue *q, struct request *rq)
239 { 239 {
240 INIT_LIST_HEAD(&rq->queuelist); 240 INIT_LIST_HEAD(&rq->queuelist);
241 INIT_LIST_HEAD(&rq->donelist); 241 INIT_LIST_HEAD(&rq->donelist);
242 242
243 rq->errors = 0; 243 rq->errors = 0;
244 rq->bio = rq->biotail = NULL; 244 rq->bio = rq->biotail = NULL;
245 INIT_HLIST_NODE(&rq->hash); 245 INIT_HLIST_NODE(&rq->hash);
246 RB_CLEAR_NODE(&rq->rb_node); 246 RB_CLEAR_NODE(&rq->rb_node);
247 rq->ioprio = 0; 247 rq->ioprio = 0;
248 rq->buffer = NULL; 248 rq->buffer = NULL;
249 rq->ref_count = 1; 249 rq->ref_count = 1;
250 rq->q = q; 250 rq->q = q;
251 rq->special = NULL; 251 rq->special = NULL;
252 rq->data_len = 0; 252 rq->data_len = 0;
253 rq->data = NULL; 253 rq->data = NULL;
254 rq->nr_phys_segments = 0; 254 rq->nr_phys_segments = 0;
255 rq->sense = NULL; 255 rq->sense = NULL;
256 rq->end_io = NULL; 256 rq->end_io = NULL;
257 rq->end_io_data = NULL; 257 rq->end_io_data = NULL;
258 rq->completion_data = NULL; 258 rq->completion_data = NULL;
259 rq->next_rq = NULL; 259 rq->next_rq = NULL;
260 } 260 }
261 261
262 /** 262 /**
263 * blk_queue_ordered - does this queue support ordered writes 263 * blk_queue_ordered - does this queue support ordered writes
264 * @q: the request queue 264 * @q: the request queue
265 * @ordered: one of QUEUE_ORDERED_* 265 * @ordered: one of QUEUE_ORDERED_*
266 * @prepare_flush_fn: rq setup helper for cache flush ordered writes 266 * @prepare_flush_fn: rq setup helper for cache flush ordered writes
267 * 267 *
268 * Description: 268 * Description:
269 * For journalled file systems, doing ordered writes on a commit 269 * For journalled file systems, doing ordered writes on a commit
270 * block instead of explicitly doing wait_on_buffer (which is bad 270 * block instead of explicitly doing wait_on_buffer (which is bad
271 * for performance) can be a big win. Block drivers supporting this 271 * for performance) can be a big win. Block drivers supporting this
272 * feature should call this function and indicate so. 272 * feature should call this function and indicate so.
273 * 273 *
274 **/ 274 **/
275 int blk_queue_ordered(struct request_queue *q, unsigned ordered, 275 int blk_queue_ordered(struct request_queue *q, unsigned ordered,
276 prepare_flush_fn *prepare_flush_fn) 276 prepare_flush_fn *prepare_flush_fn)
277 { 277 {
278 if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) && 278 if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
279 prepare_flush_fn == NULL) { 279 prepare_flush_fn == NULL) {
280 printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n"); 280 printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n");
281 return -EINVAL; 281 return -EINVAL;
282 } 282 }
283 283
284 if (ordered != QUEUE_ORDERED_NONE && 284 if (ordered != QUEUE_ORDERED_NONE &&
285 ordered != QUEUE_ORDERED_DRAIN && 285 ordered != QUEUE_ORDERED_DRAIN &&
286 ordered != QUEUE_ORDERED_DRAIN_FLUSH && 286 ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
287 ordered != QUEUE_ORDERED_DRAIN_FUA && 287 ordered != QUEUE_ORDERED_DRAIN_FUA &&
288 ordered != QUEUE_ORDERED_TAG && 288 ordered != QUEUE_ORDERED_TAG &&
289 ordered != QUEUE_ORDERED_TAG_FLUSH && 289 ordered != QUEUE_ORDERED_TAG_FLUSH &&
290 ordered != QUEUE_ORDERED_TAG_FUA) { 290 ordered != QUEUE_ORDERED_TAG_FUA) {
291 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); 291 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
292 return -EINVAL; 292 return -EINVAL;
293 } 293 }
294 294
295 q->ordered = ordered; 295 q->ordered = ordered;
296 q->next_ordered = ordered; 296 q->next_ordered = ordered;
297 q->prepare_flush_fn = prepare_flush_fn; 297 q->prepare_flush_fn = prepare_flush_fn;
298 298
299 return 0; 299 return 0;
300 } 300 }
301 301
302 EXPORT_SYMBOL(blk_queue_ordered); 302 EXPORT_SYMBOL(blk_queue_ordered);
303 303
304 /** 304 /**
305 * blk_queue_issue_flush_fn - set function for issuing a flush 305 * blk_queue_issue_flush_fn - set function for issuing a flush
306 * @q: the request queue 306 * @q: the request queue
307 * @iff: the function to be called issuing the flush 307 * @iff: the function to be called issuing the flush
308 * 308 *
309 * Description: 309 * Description:
310 * If a driver supports issuing a flush command, the support is notified 310 * If a driver supports issuing a flush command, the support is notified
311 * to the block layer by defining it through this call. 311 * to the block layer by defining it through this call.
312 * 312 *
313 **/ 313 **/
314 void blk_queue_issue_flush_fn(struct request_queue *q, issue_flush_fn *iff) 314 void blk_queue_issue_flush_fn(struct request_queue *q, issue_flush_fn *iff)
315 { 315 {
316 q->issue_flush_fn = iff; 316 q->issue_flush_fn = iff;
317 } 317 }
318 318
319 EXPORT_SYMBOL(blk_queue_issue_flush_fn); 319 EXPORT_SYMBOL(blk_queue_issue_flush_fn);
320 320
321 /* 321 /*
322 * Cache flushing for ordered writes handling 322 * Cache flushing for ordered writes handling
323 */ 323 */
324 inline unsigned blk_ordered_cur_seq(struct request_queue *q) 324 inline unsigned blk_ordered_cur_seq(struct request_queue *q)
325 { 325 {
326 if (!q->ordseq) 326 if (!q->ordseq)
327 return 0; 327 return 0;
328 return 1 << ffz(q->ordseq); 328 return 1 << ffz(q->ordseq);
329 } 329 }
330 330
331 unsigned blk_ordered_req_seq(struct request *rq) 331 unsigned blk_ordered_req_seq(struct request *rq)
332 { 332 {
333 struct request_queue *q = rq->q; 333 struct request_queue *q = rq->q;
334 334
335 BUG_ON(q->ordseq == 0); 335 BUG_ON(q->ordseq == 0);
336 336
337 if (rq == &q->pre_flush_rq) 337 if (rq == &q->pre_flush_rq)
338 return QUEUE_ORDSEQ_PREFLUSH; 338 return QUEUE_ORDSEQ_PREFLUSH;
339 if (rq == &q->bar_rq) 339 if (rq == &q->bar_rq)
340 return QUEUE_ORDSEQ_BAR; 340 return QUEUE_ORDSEQ_BAR;
341 if (rq == &q->post_flush_rq) 341 if (rq == &q->post_flush_rq)
342 return QUEUE_ORDSEQ_POSTFLUSH; 342 return QUEUE_ORDSEQ_POSTFLUSH;
343 343
344 /* 344 /*
345 * !fs requests don't need to follow barrier ordering. Always 345 * !fs requests don't need to follow barrier ordering. Always
346 * put them at the front. This fixes the following deadlock. 346 * put them at the front. This fixes the following deadlock.
347 * 347 *
348 * http://thread.gmane.org/gmane.linux.kernel/537473 348 * http://thread.gmane.org/gmane.linux.kernel/537473
349 */ 349 */
350 if (!blk_fs_request(rq)) 350 if (!blk_fs_request(rq))
351 return QUEUE_ORDSEQ_DRAIN; 351 return QUEUE_ORDSEQ_DRAIN;
352 352
353 if ((rq->cmd_flags & REQ_ORDERED_COLOR) == 353 if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
354 (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) 354 (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
355 return QUEUE_ORDSEQ_DRAIN; 355 return QUEUE_ORDSEQ_DRAIN;
356 else 356 else
357 return QUEUE_ORDSEQ_DONE; 357 return QUEUE_ORDSEQ_DONE;
358 } 358 }
359 359
360 void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) 360 void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
361 { 361 {
362 struct request *rq; 362 struct request *rq;
363 int uptodate; 363 int uptodate;
364 364
365 if (error && !q->orderr) 365 if (error && !q->orderr)
366 q->orderr = error; 366 q->orderr = error;
367 367
368 BUG_ON(q->ordseq & seq); 368 BUG_ON(q->ordseq & seq);
369 q->ordseq |= seq; 369 q->ordseq |= seq;
370 370
371 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) 371 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
372 return; 372 return;
373 373
374 /* 374 /*
375 * Okay, sequence complete. 375 * Okay, sequence complete.
376 */ 376 */
377 rq = q->orig_bar_rq; 377 rq = q->orig_bar_rq;
378 uptodate = q->orderr ? q->orderr : 1; 378 uptodate = q->orderr ? q->orderr : 1;
379 379
380 q->ordseq = 0; 380 q->ordseq = 0;
381 381
382 end_that_request_first(rq, uptodate, rq->hard_nr_sectors); 382 end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
383 end_that_request_last(rq, uptodate); 383 end_that_request_last(rq, uptodate);
384 } 384 }
385 385
386 static void pre_flush_end_io(struct request *rq, int error) 386 static void pre_flush_end_io(struct request *rq, int error)
387 { 387 {
388 elv_completed_request(rq->q, rq); 388 elv_completed_request(rq->q, rq);
389 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); 389 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
390 } 390 }
391 391
392 static void bar_end_io(struct request *rq, int error) 392 static void bar_end_io(struct request *rq, int error)
393 { 393 {
394 elv_completed_request(rq->q, rq); 394 elv_completed_request(rq->q, rq);
395 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); 395 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
396 } 396 }
397 397
398 static void post_flush_end_io(struct request *rq, int error) 398 static void post_flush_end_io(struct request *rq, int error)
399 { 399 {
400 elv_completed_request(rq->q, rq); 400 elv_completed_request(rq->q, rq);
401 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); 401 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
402 } 402 }
403 403
404 static void queue_flush(struct request_queue *q, unsigned which) 404 static void queue_flush(struct request_queue *q, unsigned which)
405 { 405 {
406 struct request *rq; 406 struct request *rq;
407 rq_end_io_fn *end_io; 407 rq_end_io_fn *end_io;
408 408
409 if (which == QUEUE_ORDERED_PREFLUSH) { 409 if (which == QUEUE_ORDERED_PREFLUSH) {
410 rq = &q->pre_flush_rq; 410 rq = &q->pre_flush_rq;
411 end_io = pre_flush_end_io; 411 end_io = pre_flush_end_io;
412 } else { 412 } else {
413 rq = &q->post_flush_rq; 413 rq = &q->post_flush_rq;
414 end_io = post_flush_end_io; 414 end_io = post_flush_end_io;
415 } 415 }
416 416
417 rq->cmd_flags = REQ_HARDBARRIER; 417 rq->cmd_flags = REQ_HARDBARRIER;
418 rq_init(q, rq); 418 rq_init(q, rq);
419 rq->elevator_private = NULL; 419 rq->elevator_private = NULL;
420 rq->elevator_private2 = NULL; 420 rq->elevator_private2 = NULL;
421 rq->rq_disk = q->bar_rq.rq_disk; 421 rq->rq_disk = q->bar_rq.rq_disk;
422 rq->end_io = end_io; 422 rq->end_io = end_io;
423 q->prepare_flush_fn(q, rq); 423 q->prepare_flush_fn(q, rq);
424 424
425 elv_insert(q, rq, ELEVATOR_INSERT_FRONT); 425 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
426 } 426 }
427 427
428 static inline struct request *start_ordered(struct request_queue *q, 428 static inline struct request *start_ordered(struct request_queue *q,
429 struct request *rq) 429 struct request *rq)
430 { 430 {
431 q->bi_size = 0; 431 q->bi_size = 0;
432 q->orderr = 0; 432 q->orderr = 0;
433 q->ordered = q->next_ordered; 433 q->ordered = q->next_ordered;
434 q->ordseq |= QUEUE_ORDSEQ_STARTED; 434 q->ordseq |= QUEUE_ORDSEQ_STARTED;
435 435
436 /* 436 /*
437 * Prep proxy barrier request. 437 * Prep proxy barrier request.
438 */ 438 */
439 blkdev_dequeue_request(rq); 439 blkdev_dequeue_request(rq);
440 q->orig_bar_rq = rq; 440 q->orig_bar_rq = rq;
441 rq = &q->bar_rq; 441 rq = &q->bar_rq;
442 rq->cmd_flags = 0; 442 rq->cmd_flags = 0;
443 rq_init(q, rq); 443 rq_init(q, rq);
444 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) 444 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
445 rq->cmd_flags |= REQ_RW; 445 rq->cmd_flags |= REQ_RW;
446 rq->cmd_flags |= q->ordered & QUEUE_ORDERED_FUA ? REQ_FUA : 0; 446 rq->cmd_flags |= q->ordered & QUEUE_ORDERED_FUA ? REQ_FUA : 0;
447 rq->elevator_private = NULL; 447 rq->elevator_private = NULL;
448 rq->elevator_private2 = NULL; 448 rq->elevator_private2 = NULL;
449 init_request_from_bio(rq, q->orig_bar_rq->bio); 449 init_request_from_bio(rq, q->orig_bar_rq->bio);
450 rq->end_io = bar_end_io; 450 rq->end_io = bar_end_io;
451 451
452 /* 452 /*
453 * Queue ordered sequence. As we stack them at the head, we 453 * Queue ordered sequence. As we stack them at the head, we
454 * need to queue in reverse order. Note that we rely on that 454 * need to queue in reverse order. Note that we rely on that
455 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs 455 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
456 * request gets inbetween ordered sequence. 456 * request gets inbetween ordered sequence.
457 */ 457 */
458 if (q->ordered & QUEUE_ORDERED_POSTFLUSH) 458 if (q->ordered & QUEUE_ORDERED_POSTFLUSH)
459 queue_flush(q, QUEUE_ORDERED_POSTFLUSH); 459 queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
460 else 460 else
461 q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; 461 q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
462 462
463 elv_insert(q, rq, ELEVATOR_INSERT_FRONT); 463 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
464 464
465 if (q->ordered & QUEUE_ORDERED_PREFLUSH) { 465 if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
466 queue_flush(q, QUEUE_ORDERED_PREFLUSH); 466 queue_flush(q, QUEUE_ORDERED_PREFLUSH);
467 rq = &q->pre_flush_rq; 467 rq = &q->pre_flush_rq;
468 } else 468 } else
469 q->ordseq |= QUEUE_ORDSEQ_PREFLUSH; 469 q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
470 470
471 if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0) 471 if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
472 q->ordseq |= QUEUE_ORDSEQ_DRAIN; 472 q->ordseq |= QUEUE_ORDSEQ_DRAIN;
473 else 473 else
474 rq = NULL; 474 rq = NULL;
475 475
476 return rq; 476 return rq;
477 } 477 }
478 478
479 int blk_do_ordered(struct request_queue *q, struct request **rqp) 479 int blk_do_ordered(struct request_queue *q, struct request **rqp)
480 { 480 {
481 struct request *rq = *rqp; 481 struct request *rq = *rqp;
482 int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); 482 int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
483 483
484 if (!q->ordseq) { 484 if (!q->ordseq) {
485 if (!is_barrier) 485 if (!is_barrier)
486 return 1; 486 return 1;
487 487
488 if (q->next_ordered != QUEUE_ORDERED_NONE) { 488 if (q->next_ordered != QUEUE_ORDERED_NONE) {
489 *rqp = start_ordered(q, rq); 489 *rqp = start_ordered(q, rq);
490 return 1; 490 return 1;
491 } else { 491 } else {
492 /* 492 /*
493 * This can happen when the queue switches to 493 * This can happen when the queue switches to
494 * ORDERED_NONE while this request is on it. 494 * ORDERED_NONE while this request is on it.
495 */ 495 */
496 blkdev_dequeue_request(rq); 496 blkdev_dequeue_request(rq);
497 end_that_request_first(rq, -EOPNOTSUPP, 497 end_that_request_first(rq, -EOPNOTSUPP,
498 rq->hard_nr_sectors); 498 rq->hard_nr_sectors);
499 end_that_request_last(rq, -EOPNOTSUPP); 499 end_that_request_last(rq, -EOPNOTSUPP);
500 *rqp = NULL; 500 *rqp = NULL;
501 return 0; 501 return 0;
502 } 502 }
503 } 503 }
504 504
505 /* 505 /*
506 * Ordered sequence in progress 506 * Ordered sequence in progress
507 */ 507 */
508 508
509 /* Special requests are not subject to ordering rules. */ 509 /* Special requests are not subject to ordering rules. */
510 if (!blk_fs_request(rq) && 510 if (!blk_fs_request(rq) &&
511 rq != &q->pre_flush_rq && rq != &q->post_flush_rq) 511 rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
512 return 1; 512 return 1;
513 513
514 if (q->ordered & QUEUE_ORDERED_TAG) { 514 if (q->ordered & QUEUE_ORDERED_TAG) {
515 /* Ordered by tag. Blocking the next barrier is enough. */ 515 /* Ordered by tag. Blocking the next barrier is enough. */
516 if (is_barrier && rq != &q->bar_rq) 516 if (is_barrier && rq != &q->bar_rq)
517 *rqp = NULL; 517 *rqp = NULL;
518 } else { 518 } else {
519 /* Ordered by draining. Wait for turn. */ 519 /* Ordered by draining. Wait for turn. */
520 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); 520 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
521 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) 521 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
522 *rqp = NULL; 522 *rqp = NULL;
523 } 523 }
524 524
525 return 1; 525 return 1;
526 } 526 }
527 527
528 static int flush_dry_bio_endio(struct bio *bio, unsigned int bytes, int error) 528 static int flush_dry_bio_endio(struct bio *bio, unsigned int bytes, int error)
529 { 529 {
530 struct request_queue *q = bio->bi_private; 530 struct request_queue *q = bio->bi_private;
531 531
532 /* 532 /*
533 * This is dry run, restore bio_sector and size. We'll finish 533 * This is dry run, restore bio_sector and size. We'll finish
534 * this request again with the original bi_end_io after an 534 * this request again with the original bi_end_io after an
535 * error occurs or post flush is complete. 535 * error occurs or post flush is complete.
536 */ 536 */
537 q->bi_size += bytes; 537 q->bi_size += bytes;
538 538
539 if (bio->bi_size) 539 if (bio->bi_size)
540 return 1; 540 return 1;
541 541
542 /* Reset bio */ 542 /* Reset bio */
543 set_bit(BIO_UPTODATE, &bio->bi_flags); 543 set_bit(BIO_UPTODATE, &bio->bi_flags);
544 bio->bi_size = q->bi_size; 544 bio->bi_size = q->bi_size;
545 bio->bi_sector -= (q->bi_size >> 9); 545 bio->bi_sector -= (q->bi_size >> 9);
546 q->bi_size = 0; 546 q->bi_size = 0;
547 547
548 return 0; 548 return 0;
549 } 549 }
550 550
551 static int ordered_bio_endio(struct request *rq, struct bio *bio, 551 static int ordered_bio_endio(struct request *rq, struct bio *bio,
552 unsigned int nbytes, int error) 552 unsigned int nbytes, int error)
553 { 553 {
554 struct request_queue *q = rq->q; 554 struct request_queue *q = rq->q;
555 bio_end_io_t *endio; 555 bio_end_io_t *endio;
556 void *private; 556 void *private;
557 557
558 if (&q->bar_rq != rq) 558 if (&q->bar_rq != rq)
559 return 0; 559 return 0;
560 560
561 /* 561 /*
562 * Okay, this is the barrier request in progress, dry finish it. 562 * Okay, this is the barrier request in progress, dry finish it.
563 */ 563 */
564 if (error && !q->orderr) 564 if (error && !q->orderr)
565 q->orderr = error; 565 q->orderr = error;
566 566
567 endio = bio->bi_end_io; 567 endio = bio->bi_end_io;
568 private = bio->bi_private; 568 private = bio->bi_private;
569 bio->bi_end_io = flush_dry_bio_endio; 569 bio->bi_end_io = flush_dry_bio_endio;
570 bio->bi_private = q; 570 bio->bi_private = q;
571 571
572 bio_endio(bio, nbytes, error); 572 bio_endio(bio, nbytes, error);
573 573
574 bio->bi_end_io = endio; 574 bio->bi_end_io = endio;
575 bio->bi_private = private; 575 bio->bi_private = private;
576 576
577 return 1; 577 return 1;
578 } 578 }
579 579
580 /** 580 /**
581 * blk_queue_bounce_limit - set bounce buffer limit for queue 581 * blk_queue_bounce_limit - set bounce buffer limit for queue
582 * @q: the request queue for the device 582 * @q: the request queue for the device
583 * @dma_addr: bus address limit 583 * @dma_addr: bus address limit
584 * 584 *
585 * Description: 585 * Description:
586 * Different hardware can have different requirements as to what pages 586 * Different hardware can have different requirements as to what pages
587 * it can do I/O directly to. A low level driver can call 587 * it can do I/O directly to. A low level driver can call
588 * blk_queue_bounce_limit to have lower memory pages allocated as bounce 588 * blk_queue_bounce_limit to have lower memory pages allocated as bounce
589 * buffers for doing I/O to pages residing above @page. 589 * buffers for doing I/O to pages residing above @page.
590 **/ 590 **/
591 void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr) 591 void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr)
592 { 592 {
593 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; 593 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
594 int dma = 0; 594 int dma = 0;
595 595
596 q->bounce_gfp = GFP_NOIO; 596 q->bounce_gfp = GFP_NOIO;
597 #if BITS_PER_LONG == 64 597 #if BITS_PER_LONG == 64
598 /* Assume anything <= 4GB can be handled by IOMMU. 598 /* Assume anything <= 4GB can be handled by IOMMU.
599 Actually some IOMMUs can handle everything, but I don't 599 Actually some IOMMUs can handle everything, but I don't
600 know of a way to test this here. */ 600 know of a way to test this here. */
601 if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) 601 if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
602 dma = 1; 602 dma = 1;
603 q->bounce_pfn = max_low_pfn; 603 q->bounce_pfn = max_low_pfn;
604 #else 604 #else
605 if (bounce_pfn < blk_max_low_pfn) 605 if (bounce_pfn < blk_max_low_pfn)
606 dma = 1; 606 dma = 1;
607 q->bounce_pfn = bounce_pfn; 607 q->bounce_pfn = bounce_pfn;
608 #endif 608 #endif
609 if (dma) { 609 if (dma) {
610 init_emergency_isa_pool(); 610 init_emergency_isa_pool();
611 q->bounce_gfp = GFP_NOIO | GFP_DMA; 611 q->bounce_gfp = GFP_NOIO | GFP_DMA;
612 q->bounce_pfn = bounce_pfn; 612 q->bounce_pfn = bounce_pfn;
613 } 613 }
614 } 614 }
615 615
616 EXPORT_SYMBOL(blk_queue_bounce_limit); 616 EXPORT_SYMBOL(blk_queue_bounce_limit);
617 617
618 /** 618 /**
619 * blk_queue_max_sectors - set max sectors for a request for this queue 619 * blk_queue_max_sectors - set max sectors for a request for this queue
620 * @q: the request queue for the device 620 * @q: the request queue for the device
621 * @max_sectors: max sectors in the usual 512b unit 621 * @max_sectors: max sectors in the usual 512b unit
622 * 622 *
623 * Description: 623 * Description:
624 * Enables a low level driver to set an upper limit on the size of 624 * Enables a low level driver to set an upper limit on the size of
625 * received requests. 625 * received requests.
626 **/ 626 **/
627 void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) 627 void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
628 { 628 {
629 if ((max_sectors << 9) < PAGE_CACHE_SIZE) { 629 if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
630 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); 630 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
631 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); 631 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
632 } 632 }
633 633
634 if (BLK_DEF_MAX_SECTORS > max_sectors) 634 if (BLK_DEF_MAX_SECTORS > max_sectors)
635 q->max_hw_sectors = q->max_sectors = max_sectors; 635 q->max_hw_sectors = q->max_sectors = max_sectors;
636 else { 636 else {
637 q->max_sectors = BLK_DEF_MAX_SECTORS; 637 q->max_sectors = BLK_DEF_MAX_SECTORS;
638 q->max_hw_sectors = max_sectors; 638 q->max_hw_sectors = max_sectors;
639 } 639 }
640 } 640 }
641 641
642 EXPORT_SYMBOL(blk_queue_max_sectors); 642 EXPORT_SYMBOL(blk_queue_max_sectors);
643 643
644 /** 644 /**
645 * blk_queue_max_phys_segments - set max phys segments for a request for this queue 645 * blk_queue_max_phys_segments - set max phys segments for a request for this queue
646 * @q: the request queue for the device 646 * @q: the request queue for the device
647 * @max_segments: max number of segments 647 * @max_segments: max number of segments
648 * 648 *
649 * Description: 649 * Description:
650 * Enables a low level driver to set an upper limit on the number of 650 * Enables a low level driver to set an upper limit on the number of
651 * physical data segments in a request. This would be the largest sized 651 * physical data segments in a request. This would be the largest sized
652 * scatter list the driver could handle. 652 * scatter list the driver could handle.
653 **/ 653 **/
654 void blk_queue_max_phys_segments(struct request_queue *q, 654 void blk_queue_max_phys_segments(struct request_queue *q,
655 unsigned short max_segments) 655 unsigned short max_segments)
656 { 656 {
657 if (!max_segments) { 657 if (!max_segments) {
658 max_segments = 1; 658 max_segments = 1;
659 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 659 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
660 } 660 }
661 661
662 q->max_phys_segments = max_segments; 662 q->max_phys_segments = max_segments;
663 } 663 }
664 664
665 EXPORT_SYMBOL(blk_queue_max_phys_segments); 665 EXPORT_SYMBOL(blk_queue_max_phys_segments);
666 666
667 /** 667 /**
668 * blk_queue_max_hw_segments - set max hw segments for a request for this queue 668 * blk_queue_max_hw_segments - set max hw segments for a request for this queue
669 * @q: the request queue for the device 669 * @q: the request queue for the device
670 * @max_segments: max number of segments 670 * @max_segments: max number of segments
671 * 671 *
672 * Description: 672 * Description:
673 * Enables a low level driver to set an upper limit on the number of 673 * Enables a low level driver to set an upper limit on the number of
674 * hw data segments in a request. This would be the largest number of 674 * hw data segments in a request. This would be the largest number of
675 * address/length pairs the host adapter can actually give as once 675 * address/length pairs the host adapter can actually give as once
676 * to the device. 676 * to the device.
677 **/ 677 **/
678 void blk_queue_max_hw_segments(struct request_queue *q, 678 void blk_queue_max_hw_segments(struct request_queue *q,
679 unsigned short max_segments) 679 unsigned short max_segments)
680 { 680 {
681 if (!max_segments) { 681 if (!max_segments) {
682 max_segments = 1; 682 max_segments = 1;
683 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 683 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
684 } 684 }
685 685
686 q->max_hw_segments = max_segments; 686 q->max_hw_segments = max_segments;
687 } 687 }
688 688
689 EXPORT_SYMBOL(blk_queue_max_hw_segments); 689 EXPORT_SYMBOL(blk_queue_max_hw_segments);
690 690
691 /** 691 /**
692 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg 692 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
693 * @q: the request queue for the device 693 * @q: the request queue for the device
694 * @max_size: max size of segment in bytes 694 * @max_size: max size of segment in bytes
695 * 695 *
696 * Description: 696 * Description:
697 * Enables a low level driver to set an upper limit on the size of a 697 * Enables a low level driver to set an upper limit on the size of a
698 * coalesced segment 698 * coalesced segment
699 **/ 699 **/
700 void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) 700 void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
701 { 701 {
702 if (max_size < PAGE_CACHE_SIZE) { 702 if (max_size < PAGE_CACHE_SIZE) {
703 max_size = PAGE_CACHE_SIZE; 703 max_size = PAGE_CACHE_SIZE;
704 printk("%s: set to minimum %d\n", __FUNCTION__, max_size); 704 printk("%s: set to minimum %d\n", __FUNCTION__, max_size);
705 } 705 }
706 706
707 q->max_segment_size = max_size; 707 q->max_segment_size = max_size;
708 } 708 }
709 709
710 EXPORT_SYMBOL(blk_queue_max_segment_size); 710 EXPORT_SYMBOL(blk_queue_max_segment_size);
711 711
712 /** 712 /**
713 * blk_queue_hardsect_size - set hardware sector size for the queue 713 * blk_queue_hardsect_size - set hardware sector size for the queue
714 * @q: the request queue for the device 714 * @q: the request queue for the device
715 * @size: the hardware sector size, in bytes 715 * @size: the hardware sector size, in bytes
716 * 716 *
717 * Description: 717 * Description:
718 * This should typically be set to the lowest possible sector size 718 * This should typically be set to the lowest possible sector size
719 * that the hardware can operate on (possible without reverting to 719 * that the hardware can operate on (possible without reverting to
720 * even internal read-modify-write operations). Usually the default 720 * even internal read-modify-write operations). Usually the default
721 * of 512 covers most hardware. 721 * of 512 covers most hardware.
722 **/ 722 **/
723 void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) 723 void blk_queue_hardsect_size(struct request_queue *q, unsigned short size)
724 { 724 {
725 q->hardsect_size = size; 725 q->hardsect_size = size;
726 } 726 }
727 727
728 EXPORT_SYMBOL(blk_queue_hardsect_size); 728 EXPORT_SYMBOL(blk_queue_hardsect_size);
729 729
730 /* 730 /*
731 * Returns the minimum that is _not_ zero, unless both are zero. 731 * Returns the minimum that is _not_ zero, unless both are zero.
732 */ 732 */
733 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) 733 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
734 734
735 /** 735 /**
736 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers 736 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
737 * @t: the stacking driver (top) 737 * @t: the stacking driver (top)
738 * @b: the underlying device (bottom) 738 * @b: the underlying device (bottom)
739 **/ 739 **/
740 void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) 740 void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
741 { 741 {
742 /* zero is "infinity" */ 742 /* zero is "infinity" */
743 t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors); 743 t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors);
744 t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors); 744 t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors);
745 745
746 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); 746 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
747 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); 747 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
748 t->max_segment_size = min(t->max_segment_size,b->max_segment_size); 748 t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
749 t->hardsect_size = max(t->hardsect_size,b->hardsect_size); 749 t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
750 if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) 750 if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
751 clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags); 751 clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags);
752 } 752 }
753 753
754 EXPORT_SYMBOL(blk_queue_stack_limits); 754 EXPORT_SYMBOL(blk_queue_stack_limits);
755 755
756 /** 756 /**
757 * blk_queue_segment_boundary - set boundary rules for segment merging 757 * blk_queue_segment_boundary - set boundary rules for segment merging
758 * @q: the request queue for the device 758 * @q: the request queue for the device
759 * @mask: the memory boundary mask 759 * @mask: the memory boundary mask
760 **/ 760 **/
761 void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) 761 void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
762 { 762 {
763 if (mask < PAGE_CACHE_SIZE - 1) { 763 if (mask < PAGE_CACHE_SIZE - 1) {
764 mask = PAGE_CACHE_SIZE - 1; 764 mask = PAGE_CACHE_SIZE - 1;
765 printk("%s: set to minimum %lx\n", __FUNCTION__, mask); 765 printk("%s: set to minimum %lx\n", __FUNCTION__, mask);
766 } 766 }
767 767
768 q->seg_boundary_mask = mask; 768 q->seg_boundary_mask = mask;
769 } 769 }
770 770
771 EXPORT_SYMBOL(blk_queue_segment_boundary); 771 EXPORT_SYMBOL(blk_queue_segment_boundary);
772 772
773 /** 773 /**
774 * blk_queue_dma_alignment - set dma length and memory alignment 774 * blk_queue_dma_alignment - set dma length and memory alignment
775 * @q: the request queue for the device 775 * @q: the request queue for the device
776 * @mask: alignment mask 776 * @mask: alignment mask
777 * 777 *
778 * description: 778 * description:
779 * set required memory and length aligment for direct dma transactions. 779 * set required memory and length aligment for direct dma transactions.
780 * this is used when buiding direct io requests for the queue. 780 * this is used when buiding direct io requests for the queue.
781 * 781 *
782 **/ 782 **/
783 void blk_queue_dma_alignment(struct request_queue *q, int mask) 783 void blk_queue_dma_alignment(struct request_queue *q, int mask)
784 { 784 {
785 q->dma_alignment = mask; 785 q->dma_alignment = mask;
786 } 786 }
787 787
788 EXPORT_SYMBOL(blk_queue_dma_alignment); 788 EXPORT_SYMBOL(blk_queue_dma_alignment);
789 789
790 /** 790 /**
791 * blk_queue_find_tag - find a request by its tag and queue 791 * blk_queue_find_tag - find a request by its tag and queue
792 * @q: The request queue for the device 792 * @q: The request queue for the device
793 * @tag: The tag of the request 793 * @tag: The tag of the request
794 * 794 *
795 * Notes: 795 * Notes:
796 * Should be used when a device returns a tag and you want to match 796 * Should be used when a device returns a tag and you want to match
797 * it with a request. 797 * it with a request.
798 * 798 *
799 * no locks need be held. 799 * no locks need be held.
800 **/ 800 **/
801 struct request *blk_queue_find_tag(struct request_queue *q, int tag) 801 struct request *blk_queue_find_tag(struct request_queue *q, int tag)
802 { 802 {
803 return blk_map_queue_find_tag(q->queue_tags, tag); 803 return blk_map_queue_find_tag(q->queue_tags, tag);
804 } 804 }
805 805
806 EXPORT_SYMBOL(blk_queue_find_tag); 806 EXPORT_SYMBOL(blk_queue_find_tag);
807 807
808 /** 808 /**
809 * __blk_free_tags - release a given set of tag maintenance info 809 * __blk_free_tags - release a given set of tag maintenance info
810 * @bqt: the tag map to free 810 * @bqt: the tag map to free
811 * 811 *
812 * Tries to free the specified @bqt@. Returns true if it was 812 * Tries to free the specified @bqt@. Returns true if it was
813 * actually freed and false if there are still references using it 813 * actually freed and false if there are still references using it
814 */ 814 */
815 static int __blk_free_tags(struct blk_queue_tag *bqt) 815 static int __blk_free_tags(struct blk_queue_tag *bqt)
816 { 816 {
817 int retval; 817 int retval;
818 818
819 retval = atomic_dec_and_test(&bqt->refcnt); 819 retval = atomic_dec_and_test(&bqt->refcnt);
820 if (retval) { 820 if (retval) {
821 BUG_ON(bqt->busy); 821 BUG_ON(bqt->busy);
822 BUG_ON(!list_empty(&bqt->busy_list)); 822 BUG_ON(!list_empty(&bqt->busy_list));
823 823
824 kfree(bqt->tag_index); 824 kfree(bqt->tag_index);
825 bqt->tag_index = NULL; 825 bqt->tag_index = NULL;
826 826
827 kfree(bqt->tag_map); 827 kfree(bqt->tag_map);
828 bqt->tag_map = NULL; 828 bqt->tag_map = NULL;
829 829
830 kfree(bqt); 830 kfree(bqt);
831 831
832 } 832 }
833 833
834 return retval; 834 return retval;
835 } 835 }
836 836
837 /** 837 /**
838 * __blk_queue_free_tags - release tag maintenance info 838 * __blk_queue_free_tags - release tag maintenance info
839 * @q: the request queue for the device 839 * @q: the request queue for the device
840 * 840 *
841 * Notes: 841 * Notes:
842 * blk_cleanup_queue() will take care of calling this function, if tagging 842 * blk_cleanup_queue() will take care of calling this function, if tagging
843 * has been used. So there's no need to call this directly. 843 * has been used. So there's no need to call this directly.
844 **/ 844 **/
845 static void __blk_queue_free_tags(struct request_queue *q) 845 static void __blk_queue_free_tags(struct request_queue *q)
846 { 846 {
847 struct blk_queue_tag *bqt = q->queue_tags; 847 struct blk_queue_tag *bqt = q->queue_tags;
848 848
849 if (!bqt) 849 if (!bqt)
850 return; 850 return;
851 851
852 __blk_free_tags(bqt); 852 __blk_free_tags(bqt);
853 853
854 q->queue_tags = NULL; 854 q->queue_tags = NULL;
855 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); 855 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
856 } 856 }
857 857
858 858
859 /** 859 /**
860 * blk_free_tags - release a given set of tag maintenance info 860 * blk_free_tags - release a given set of tag maintenance info
861 * @bqt: the tag map to free 861 * @bqt: the tag map to free
862 * 862 *
863 * For externally managed @bqt@ frees the map. Callers of this 863 * For externally managed @bqt@ frees the map. Callers of this
864 * function must guarantee to have released all the queues that 864 * function must guarantee to have released all the queues that
865 * might have been using this tag map. 865 * might have been using this tag map.
866 */ 866 */
867 void blk_free_tags(struct blk_queue_tag *bqt) 867 void blk_free_tags(struct blk_queue_tag *bqt)
868 { 868 {
869 if (unlikely(!__blk_free_tags(bqt))) 869 if (unlikely(!__blk_free_tags(bqt)))
870 BUG(); 870 BUG();
871 } 871 }
872 EXPORT_SYMBOL(blk_free_tags); 872 EXPORT_SYMBOL(blk_free_tags);
873 873
874 /** 874 /**
875 * blk_queue_free_tags - release tag maintenance info 875 * blk_queue_free_tags - release tag maintenance info
876 * @q: the request queue for the device 876 * @q: the request queue for the device
877 * 877 *
878 * Notes: 878 * Notes:
879 * This is used to disabled tagged queuing to a device, yet leave 879 * This is used to disabled tagged queuing to a device, yet leave
880 * queue in function. 880 * queue in function.
881 **/ 881 **/
882 void blk_queue_free_tags(struct request_queue *q) 882 void blk_queue_free_tags(struct request_queue *q)
883 { 883 {
884 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 884 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
885 } 885 }
886 886
887 EXPORT_SYMBOL(blk_queue_free_tags); 887 EXPORT_SYMBOL(blk_queue_free_tags);
888 888
889 static int 889 static int
890 init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth) 890 init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
891 { 891 {
892 struct request **tag_index; 892 struct request **tag_index;
893 unsigned long *tag_map; 893 unsigned long *tag_map;
894 int nr_ulongs; 894 int nr_ulongs;
895 895
896 if (q && depth > q->nr_requests * 2) { 896 if (q && depth > q->nr_requests * 2) {
897 depth = q->nr_requests * 2; 897 depth = q->nr_requests * 2;
898 printk(KERN_ERR "%s: adjusted depth to %d\n", 898 printk(KERN_ERR "%s: adjusted depth to %d\n",
899 __FUNCTION__, depth); 899 __FUNCTION__, depth);
900 } 900 }
901 901
902 tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC); 902 tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC);
903 if (!tag_index) 903 if (!tag_index)
904 goto fail; 904 goto fail;
905 905
906 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; 906 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
907 tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); 907 tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
908 if (!tag_map) 908 if (!tag_map)
909 goto fail; 909 goto fail;
910 910
911 tags->real_max_depth = depth; 911 tags->real_max_depth = depth;
912 tags->max_depth = depth; 912 tags->max_depth = depth;
913 tags->tag_index = tag_index; 913 tags->tag_index = tag_index;
914 tags->tag_map = tag_map; 914 tags->tag_map = tag_map;
915 915
916 return 0; 916 return 0;
917 fail: 917 fail:
918 kfree(tag_index); 918 kfree(tag_index);
919 return -ENOMEM; 919 return -ENOMEM;
920 } 920 }
921 921
922 static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, 922 static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
923 int depth) 923 int depth)
924 { 924 {
925 struct blk_queue_tag *tags; 925 struct blk_queue_tag *tags;
926 926
927 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); 927 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
928 if (!tags) 928 if (!tags)
929 goto fail; 929 goto fail;
930 930
931 if (init_tag_map(q, tags, depth)) 931 if (init_tag_map(q, tags, depth))
932 goto fail; 932 goto fail;
933 933
934 INIT_LIST_HEAD(&tags->busy_list); 934 INIT_LIST_HEAD(&tags->busy_list);
935 tags->busy = 0; 935 tags->busy = 0;
936 atomic_set(&tags->refcnt, 1); 936 atomic_set(&tags->refcnt, 1);
937 return tags; 937 return tags;
938 fail: 938 fail:
939 kfree(tags); 939 kfree(tags);
940 return NULL; 940 return NULL;
941 } 941 }
942 942
943 /** 943 /**
944 * blk_init_tags - initialize the tag info for an external tag map 944 * blk_init_tags - initialize the tag info for an external tag map
945 * @depth: the maximum queue depth supported 945 * @depth: the maximum queue depth supported
946 * @tags: the tag to use 946 * @tags: the tag to use
947 **/ 947 **/
948 struct blk_queue_tag *blk_init_tags(int depth) 948 struct blk_queue_tag *blk_init_tags(int depth)
949 { 949 {
950 return __blk_queue_init_tags(NULL, depth); 950 return __blk_queue_init_tags(NULL, depth);
951 } 951 }
952 EXPORT_SYMBOL(blk_init_tags); 952 EXPORT_SYMBOL(blk_init_tags);
953 953
954 /** 954 /**
955 * blk_queue_init_tags - initialize the queue tag info 955 * blk_queue_init_tags - initialize the queue tag info
956 * @q: the request queue for the device 956 * @q: the request queue for the device
957 * @depth: the maximum queue depth supported 957 * @depth: the maximum queue depth supported
958 * @tags: the tag to use 958 * @tags: the tag to use
959 **/ 959 **/
960 int blk_queue_init_tags(struct request_queue *q, int depth, 960 int blk_queue_init_tags(struct request_queue *q, int depth,
961 struct blk_queue_tag *tags) 961 struct blk_queue_tag *tags)
962 { 962 {
963 int rc; 963 int rc;
964 964
965 BUG_ON(tags && q->queue_tags && tags != q->queue_tags); 965 BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
966 966
967 if (!tags && !q->queue_tags) { 967 if (!tags && !q->queue_tags) {
968 tags = __blk_queue_init_tags(q, depth); 968 tags = __blk_queue_init_tags(q, depth);
969 969
970 if (!tags) 970 if (!tags)
971 goto fail; 971 goto fail;
972 } else if (q->queue_tags) { 972 } else if (q->queue_tags) {
973 if ((rc = blk_queue_resize_tags(q, depth))) 973 if ((rc = blk_queue_resize_tags(q, depth)))
974 return rc; 974 return rc;
975 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 975 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
976 return 0; 976 return 0;
977 } else 977 } else
978 atomic_inc(&tags->refcnt); 978 atomic_inc(&tags->refcnt);
979 979
980 /* 980 /*
981 * assign it, all done 981 * assign it, all done
982 */ 982 */
983 q->queue_tags = tags; 983 q->queue_tags = tags;
984 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED); 984 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
985 return 0; 985 return 0;
986 fail: 986 fail:
987 kfree(tags); 987 kfree(tags);
988 return -ENOMEM; 988 return -ENOMEM;
989 } 989 }
990 990
991 EXPORT_SYMBOL(blk_queue_init_tags); 991 EXPORT_SYMBOL(blk_queue_init_tags);
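As a hedged illustration only (not part of this change), a driver with a private tag map might enable tagged queueing roughly as below; my_enable_tcq() and MY_QUEUE_DEPTH are hypothetical names, and a map obtained from blk_init_tags() would be passed instead of NULL when several queues share one controller-wide map.

	/* sketch: enable tagged queueing with a per-queue tag map */
	static int my_enable_tcq(struct request_queue *q)
	{
		/* NULL lets blk_queue_init_tags() allocate a private map */
		if (blk_queue_init_tags(q, MY_QUEUE_DEPTH, NULL))
			return -ENOMEM;
		return 0;
	}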
992 992
993 /** 993 /**
994 * blk_queue_resize_tags - change the queueing depth 994 * blk_queue_resize_tags - change the queueing depth
995 * @q: the request queue for the device 995 * @q: the request queue for the device
996 * @new_depth: the new max command queueing depth 996 * @new_depth: the new max command queueing depth
997 * 997 *
998 * Notes: 998 * Notes:
999 * Must be called with the queue lock held. 999 * Must be called with the queue lock held.
1000 **/ 1000 **/
1001 int blk_queue_resize_tags(struct request_queue *q, int new_depth) 1001 int blk_queue_resize_tags(struct request_queue *q, int new_depth)
1002 { 1002 {
1003 struct blk_queue_tag *bqt = q->queue_tags; 1003 struct blk_queue_tag *bqt = q->queue_tags;
1004 struct request **tag_index; 1004 struct request **tag_index;
1005 unsigned long *tag_map; 1005 unsigned long *tag_map;
1006 int max_depth, nr_ulongs; 1006 int max_depth, nr_ulongs;
1007 1007
1008 if (!bqt) 1008 if (!bqt)
1009 return -ENXIO; 1009 return -ENXIO;
1010 1010
1011 /* 1011 /*
1012 * if we already have a large enough real_max_depth, just 1012 * if we already have a large enough real_max_depth, just
1013 * adjust max_depth. *NOTE* as requests with tag values 1013 * adjust max_depth. *NOTE* as requests with tag values
1014 * between new_depth and real_max_depth can be in-flight, the tag 1014 * between new_depth and real_max_depth can be in-flight, the tag
1015 * map cannot be shrunk blindly here. 1015 * map cannot be shrunk blindly here.
1016 */ 1016 */
1017 if (new_depth <= bqt->real_max_depth) { 1017 if (new_depth <= bqt->real_max_depth) {
1018 bqt->max_depth = new_depth; 1018 bqt->max_depth = new_depth;
1019 return 0; 1019 return 0;
1020 } 1020 }
1021 1021
1022 /* 1022 /*
1023 * Currently cannot replace a shared tag map with a new 1023 * Currently cannot replace a shared tag map with a new
1024 * one, so error out if this is the case 1024 * one, so error out if this is the case
1025 */ 1025 */
1026 if (atomic_read(&bqt->refcnt) != 1) 1026 if (atomic_read(&bqt->refcnt) != 1)
1027 return -EBUSY; 1027 return -EBUSY;
1028 1028
1029 /* 1029 /*
1030 * save the old state info, so we can copy it back 1030 * save the old state info, so we can copy it back
1031 */ 1031 */
1032 tag_index = bqt->tag_index; 1032 tag_index = bqt->tag_index;
1033 tag_map = bqt->tag_map; 1033 tag_map = bqt->tag_map;
1034 max_depth = bqt->real_max_depth; 1034 max_depth = bqt->real_max_depth;
1035 1035
1036 if (init_tag_map(q, bqt, new_depth)) 1036 if (init_tag_map(q, bqt, new_depth))
1037 return -ENOMEM; 1037 return -ENOMEM;
1038 1038
1039 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); 1039 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
1040 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; 1040 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
1041 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); 1041 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
1042 1042
1043 kfree(tag_index); 1043 kfree(tag_index);
1044 kfree(tag_map); 1044 kfree(tag_map);
1045 return 0; 1045 return 0;
1046 } 1046 }
1047 1047
1048 EXPORT_SYMBOL(blk_queue_resize_tags); 1048 EXPORT_SYMBOL(blk_queue_resize_tags);
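Since the notes above require the queue lock, a caller would typically wrap the resize as in this sketch; my_change_depth() is a hypothetical helper.

	static int my_change_depth(struct request_queue *q, int new_depth)
	{
		unsigned long flags;
		int ret;

		spin_lock_irqsave(q->queue_lock, flags);
		ret = blk_queue_resize_tags(q, new_depth);
		spin_unlock_irqrestore(q->queue_lock, flags);
		return ret;
	}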
1049 1049
1050 /** 1050 /**
1051 * blk_queue_end_tag - end tag operations for a request 1051 * blk_queue_end_tag - end tag operations for a request
1052 * @q: the request queue for the device 1052 * @q: the request queue for the device
1053 * @rq: the request that has completed 1053 * @rq: the request that has completed
1054 * 1054 *
1055 * Description: 1055 * Description:
1056 * Typically called when end_that_request_first() returns 0, meaning 1056 * Typically called when end_that_request_first() returns 0, meaning
1057 * all transfers have been done for a request. It's important to call 1057 * all transfers have been done for a request. It's important to call
1058 * this function before end_that_request_last(), as that will put the 1058 * this function before end_that_request_last(), as that will put the
1059 * request back on the free list thus corrupting the internal tag list. 1059 * request back on the free list thus corrupting the internal tag list.
1060 * 1060 *
1061 * Notes: 1061 * Notes:
1062 * queue lock must be held. 1062 * queue lock must be held.
1063 **/ 1063 **/
1064 void blk_queue_end_tag(struct request_queue *q, struct request *rq) 1064 void blk_queue_end_tag(struct request_queue *q, struct request *rq)
1065 { 1065 {
1066 struct blk_queue_tag *bqt = q->queue_tags; 1066 struct blk_queue_tag *bqt = q->queue_tags;
1067 int tag = rq->tag; 1067 int tag = rq->tag;
1068 1068
1069 BUG_ON(tag == -1); 1069 BUG_ON(tag == -1);
1070 1070
1071 if (unlikely(tag >= bqt->real_max_depth)) 1071 if (unlikely(tag >= bqt->real_max_depth))
1072 /* 1072 /*
1073 * This can happen after tag depth has been reduced. 1073 * This can happen after tag depth has been reduced.
1074 * FIXME: how about a warning or info message here? 1074 * FIXME: how about a warning or info message here?
1075 */ 1075 */
1076 return; 1076 return;
1077 1077
1078 if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) { 1078 if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) {
1079 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", 1079 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
1080 __FUNCTION__, tag); 1080 __FUNCTION__, tag);
1081 return; 1081 return;
1082 } 1082 }
1083 1083
1084 list_del_init(&rq->queuelist); 1084 list_del_init(&rq->queuelist);
1085 rq->cmd_flags &= ~REQ_QUEUED; 1085 rq->cmd_flags &= ~REQ_QUEUED;
1086 rq->tag = -1; 1086 rq->tag = -1;
1087 1087
1088 if (unlikely(bqt->tag_index[tag] == NULL)) 1088 if (unlikely(bqt->tag_index[tag] == NULL))
1089 printk(KERN_ERR "%s: tag %d is missing\n", 1089 printk(KERN_ERR "%s: tag %d is missing\n",
1090 __FUNCTION__, tag); 1090 __FUNCTION__, tag);
1091 1091
1092 bqt->tag_index[tag] = NULL; 1092 bqt->tag_index[tag] = NULL;
1093 bqt->busy--; 1093 bqt->busy--;
1094 } 1094 }
1095 1095
1096 EXPORT_SYMBOL(blk_queue_end_tag); 1096 EXPORT_SYMBOL(blk_queue_end_tag);
1097 1097
1098 /** 1098 /**
1099 * blk_queue_start_tag - find a free tag and assign it 1099 * blk_queue_start_tag - find a free tag and assign it
1100 * @q: the request queue for the device 1100 * @q: the request queue for the device
1101 * @rq: the block request that needs tagging 1101 * @rq: the block request that needs tagging
1102 * 1102 *
1103 * Description: 1103 * Description:
1104 * This can either be used as a stand-alone helper, or possibly be 1104 * This can either be used as a stand-alone helper, or possibly be
1105 * assigned as the queue &prep_rq_fn (in which case &struct request 1105 * assigned as the queue &prep_rq_fn (in which case &struct request
1106 * automagically gets a tag assigned). Note that this function 1106 * automagically gets a tag assigned). Note that this function
1107 * assumes that any type of request can be queued! if this is not 1107 * assumes that any type of request can be queued! if this is not
1108 * true for your device, you must check the request type before 1108 * true for your device, you must check the request type before
1109 * calling this function. The request will also be removed from 1109 * calling this function. The request will also be removed from
1110 * the request queue, so it is the driver's responsibility to re-add 1110 * the request queue, so it is the driver's responsibility to re-add
1111 * it if it should need to be restarted for some reason. 1111 * it if it should need to be restarted for some reason.
1112 * 1112 *
1113 * Notes: 1113 * Notes:
1114 * queue lock must be held. 1114 * queue lock must be held.
1115 **/ 1115 **/
1116 int blk_queue_start_tag(struct request_queue *q, struct request *rq) 1116 int blk_queue_start_tag(struct request_queue *q, struct request *rq)
1117 { 1117 {
1118 struct blk_queue_tag *bqt = q->queue_tags; 1118 struct blk_queue_tag *bqt = q->queue_tags;
1119 int tag; 1119 int tag;
1120 1120
1121 if (unlikely((rq->cmd_flags & REQ_QUEUED))) { 1121 if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
1122 printk(KERN_ERR 1122 printk(KERN_ERR
1123 "%s: request %p for device [%s] already tagged %d", 1123 "%s: request %p for device [%s] already tagged %d",
1124 __FUNCTION__, rq, 1124 __FUNCTION__, rq,
1125 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); 1125 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
1126 BUG(); 1126 BUG();
1127 } 1127 }
1128 1128
1129 /* 1129 /*
1130 * Protect against shared tag maps, as we may not have exclusive 1130 * Protect against shared tag maps, as we may not have exclusive
1131 * access to the tag map. 1131 * access to the tag map.
1132 */ 1132 */
1133 do { 1133 do {
1134 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); 1134 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
1135 if (tag >= bqt->max_depth) 1135 if (tag >= bqt->max_depth)
1136 return 1; 1136 return 1;
1137 1137
1138 } while (test_and_set_bit(tag, bqt->tag_map)); 1138 } while (test_and_set_bit(tag, bqt->tag_map));
1139 1139
1140 rq->cmd_flags |= REQ_QUEUED; 1140 rq->cmd_flags |= REQ_QUEUED;
1141 rq->tag = tag; 1141 rq->tag = tag;
1142 bqt->tag_index[tag] = rq; 1142 bqt->tag_index[tag] = rq;
1143 blkdev_dequeue_request(rq); 1143 blkdev_dequeue_request(rq);
1144 list_add(&rq->queuelist, &bqt->busy_list); 1144 list_add(&rq->queuelist, &bqt->busy_list);
1145 bqt->busy++; 1145 bqt->busy++;
1146 return 0; 1146 return 0;
1147 } 1147 }
1148 1148
1149 EXPORT_SYMBOL(blk_queue_start_tag); 1149 EXPORT_SYMBOL(blk_queue_start_tag);
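Taken together with blk_queue_end_tag() above, the usual driver pattern looks roughly like the sketch below. It is only an illustration under the stated locking rules; my_request_fn(), my_issue_to_hw() and my_complete() are hypothetical, and both functions run with the queue lock held.

	static void my_request_fn(struct request_queue *q)
	{
		struct request *rq;

		while ((rq = elv_next_request(q)) != NULL) {
			/* non-zero means no free tag: try again when one frees up */
			if (blk_queue_start_tag(q, rq))
				break;
			my_issue_to_hw(rq);	/* rq->tag now identifies the command */
		}
	}

	/* hardware completion path */
	static void my_complete(struct request_queue *q, struct request *rq)
	{
		if (!end_that_request_first(rq, 1, rq->nr_sectors)) {
			/* must run before end_that_request_last(), see above */
			blk_queue_end_tag(q, rq);
			end_that_request_last(rq, 1);
		}
	}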
1150 1150
1151 /** 1151 /**
1152 * blk_queue_invalidate_tags - invalidate all pending tags 1152 * blk_queue_invalidate_tags - invalidate all pending tags
1153 * @q: the request queue for the device 1153 * @q: the request queue for the device
1154 * 1154 *
1155 * Description: 1155 * Description:
1156 * Hardware conditions may dictate a need to stop all pending requests. 1156 * Hardware conditions may dictate a need to stop all pending requests.
1157 * In this case, we will safely clear the block side of the tag queue and 1157 * In this case, we will safely clear the block side of the tag queue and
1158 * re-add all requests to the request queue in the right order. 1158 * re-add all requests to the request queue in the right order.
1159 * 1159 *
1160 * Notes: 1160 * Notes:
1161 * queue lock must be held. 1161 * queue lock must be held.
1162 **/ 1162 **/
1163 void blk_queue_invalidate_tags(struct request_queue *q) 1163 void blk_queue_invalidate_tags(struct request_queue *q)
1164 { 1164 {
1165 struct blk_queue_tag *bqt = q->queue_tags; 1165 struct blk_queue_tag *bqt = q->queue_tags;
1166 struct list_head *tmp, *n; 1166 struct list_head *tmp, *n;
1167 struct request *rq; 1167 struct request *rq;
1168 1168
1169 list_for_each_safe(tmp, n, &bqt->busy_list) { 1169 list_for_each_safe(tmp, n, &bqt->busy_list) {
1170 rq = list_entry_rq(tmp); 1170 rq = list_entry_rq(tmp);
1171 1171
1172 if (rq->tag == -1) { 1172 if (rq->tag == -1) {
1173 printk(KERN_ERR 1173 printk(KERN_ERR
1174 "%s: bad tag found on list\n", __FUNCTION__); 1174 "%s: bad tag found on list\n", __FUNCTION__);
1175 list_del_init(&rq->queuelist); 1175 list_del_init(&rq->queuelist);
1176 rq->cmd_flags &= ~REQ_QUEUED; 1176 rq->cmd_flags &= ~REQ_QUEUED;
1177 } else 1177 } else
1178 blk_queue_end_tag(q, rq); 1178 blk_queue_end_tag(q, rq);
1179 1179
1180 rq->cmd_flags &= ~REQ_STARTED; 1180 rq->cmd_flags &= ~REQ_STARTED;
1181 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); 1181 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
1182 } 1182 }
1183 } 1183 }
1184 1184
1185 EXPORT_SYMBOL(blk_queue_invalidate_tags); 1185 EXPORT_SYMBOL(blk_queue_invalidate_tags);
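For example, a driver recovering from a controller reset could hand every outstanding command back to the block layer as sketched here; my_recover() and my_reset_hw() are assumed helpers.

	static void my_recover(struct request_queue *q)
	{
		unsigned long flags;

		spin_lock_irqsave(q->queue_lock, flags);
		blk_queue_invalidate_tags(q);	/* requeues all tagged requests, in order */
		spin_unlock_irqrestore(q->queue_lock, flags);

		my_reset_hw();
	}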
1186 1186
1187 void blk_dump_rq_flags(struct request *rq, char *msg) 1187 void blk_dump_rq_flags(struct request *rq, char *msg)
1188 { 1188 {
1189 int bit; 1189 int bit;
1190 1190
1191 printk("%s: dev %s: type=%x, flags=%x\n", msg, 1191 printk("%s: dev %s: type=%x, flags=%x\n", msg,
1192 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, 1192 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
1193 rq->cmd_flags); 1193 rq->cmd_flags);
1194 1194
1195 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector, 1195 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector,
1196 rq->nr_sectors, 1196 rq->nr_sectors,
1197 rq->current_nr_sectors); 1197 rq->current_nr_sectors);
1198 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len); 1198 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len);
1199 1199
1200 if (blk_pc_request(rq)) { 1200 if (blk_pc_request(rq)) {
1201 printk("cdb: "); 1201 printk("cdb: ");
1202 for (bit = 0; bit < sizeof(rq->cmd); bit++) 1202 for (bit = 0; bit < sizeof(rq->cmd); bit++)
1203 printk("%02x ", rq->cmd[bit]); 1203 printk("%02x ", rq->cmd[bit]);
1204 printk("\n"); 1204 printk("\n");
1205 } 1205 }
1206 } 1206 }
1207 1207
1208 EXPORT_SYMBOL(blk_dump_rq_flags); 1208 EXPORT_SYMBOL(blk_dump_rq_flags);
1209 1209
1210 void blk_recount_segments(struct request_queue *q, struct bio *bio) 1210 void blk_recount_segments(struct request_queue *q, struct bio *bio)
1211 { 1211 {
1212 struct bio_vec *bv, *bvprv = NULL; 1212 struct bio_vec *bv, *bvprv = NULL;
1213 int i, nr_phys_segs, nr_hw_segs, seg_size, hw_seg_size, cluster; 1213 int i, nr_phys_segs, nr_hw_segs, seg_size, hw_seg_size, cluster;
1214 int high, highprv = 1; 1214 int high, highprv = 1;
1215 1215
1216 if (unlikely(!bio->bi_io_vec)) 1216 if (unlikely(!bio->bi_io_vec))
1217 return; 1217 return;
1218 1218
1219 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1219 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1220 hw_seg_size = seg_size = nr_phys_segs = nr_hw_segs = 0; 1220 hw_seg_size = seg_size = nr_phys_segs = nr_hw_segs = 0;
1221 bio_for_each_segment(bv, bio, i) { 1221 bio_for_each_segment(bv, bio, i) {
1222 /* 1222 /*
1223 * the trick here is making sure that a high page is never 1223 * the trick here is making sure that a high page is never
1224 * considered part of another segment, since that might 1224 * considered part of another segment, since that might
1225 * change with the bounce page. 1225 * change with the bounce page.
1226 */ 1226 */
1227 high = page_to_pfn(bv->bv_page) > q->bounce_pfn; 1227 high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
1228 if (high || highprv) 1228 if (high || highprv)
1229 goto new_hw_segment; 1229 goto new_hw_segment;
1230 if (cluster) { 1230 if (cluster) {
1231 if (seg_size + bv->bv_len > q->max_segment_size) 1231 if (seg_size + bv->bv_len > q->max_segment_size)
1232 goto new_segment; 1232 goto new_segment;
1233 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) 1233 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
1234 goto new_segment; 1234 goto new_segment;
1235 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) 1235 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
1236 goto new_segment; 1236 goto new_segment;
1237 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) 1237 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
1238 goto new_hw_segment; 1238 goto new_hw_segment;
1239 1239
1240 seg_size += bv->bv_len; 1240 seg_size += bv->bv_len;
1241 hw_seg_size += bv->bv_len; 1241 hw_seg_size += bv->bv_len;
1242 bvprv = bv; 1242 bvprv = bv;
1243 continue; 1243 continue;
1244 } 1244 }
1245 new_segment: 1245 new_segment:
1246 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) && 1246 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
1247 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) { 1247 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) {
1248 hw_seg_size += bv->bv_len; 1248 hw_seg_size += bv->bv_len;
1249 } else { 1249 } else {
1250 new_hw_segment: 1250 new_hw_segment:
1251 if (hw_seg_size > bio->bi_hw_front_size) 1251 if (hw_seg_size > bio->bi_hw_front_size)
1252 bio->bi_hw_front_size = hw_seg_size; 1252 bio->bi_hw_front_size = hw_seg_size;
1253 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len; 1253 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
1254 nr_hw_segs++; 1254 nr_hw_segs++;
1255 } 1255 }
1256 1256
1257 nr_phys_segs++; 1257 nr_phys_segs++;
1258 bvprv = bv; 1258 bvprv = bv;
1259 seg_size = bv->bv_len; 1259 seg_size = bv->bv_len;
1260 highprv = high; 1260 highprv = high;
1261 } 1261 }
1262 if (hw_seg_size > bio->bi_hw_back_size) 1262 if (hw_seg_size > bio->bi_hw_back_size)
1263 bio->bi_hw_back_size = hw_seg_size; 1263 bio->bi_hw_back_size = hw_seg_size;
1264 if (nr_hw_segs == 1 && hw_seg_size > bio->bi_hw_front_size) 1264 if (nr_hw_segs == 1 && hw_seg_size > bio->bi_hw_front_size)
1265 bio->bi_hw_front_size = hw_seg_size; 1265 bio->bi_hw_front_size = hw_seg_size;
1266 bio->bi_phys_segments = nr_phys_segs; 1266 bio->bi_phys_segments = nr_phys_segs;
1267 bio->bi_hw_segments = nr_hw_segs; 1267 bio->bi_hw_segments = nr_hw_segs;
1268 bio->bi_flags |= (1 << BIO_SEG_VALID); 1268 bio->bi_flags |= (1 << BIO_SEG_VALID);
1269 } 1269 }
1270 EXPORT_SYMBOL(blk_recount_segments); 1270 EXPORT_SYMBOL(blk_recount_segments);
1271 1271
1272 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, 1272 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
1273 struct bio *nxt) 1273 struct bio *nxt)
1274 { 1274 {
1275 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER))) 1275 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
1276 return 0; 1276 return 0;
1277 1277
1278 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt))) 1278 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
1279 return 0; 1279 return 0;
1280 if (bio->bi_size + nxt->bi_size > q->max_segment_size) 1280 if (bio->bi_size + nxt->bi_size > q->max_segment_size)
1281 return 0; 1281 return 0;
1282 1282
1283 /* 1283 /*
1284 * bio and nxt are contiguous in memory, check if the queue allows 1284 * bio and nxt are contiguous in memory, check if the queue allows
1285 * these two to be merged into one 1285 * these two to be merged into one
1286 */ 1286 */
1287 if (BIO_SEG_BOUNDARY(q, bio, nxt)) 1287 if (BIO_SEG_BOUNDARY(q, bio, nxt))
1288 return 1; 1288 return 1;
1289 1289
1290 return 0; 1290 return 0;
1291 } 1291 }
1292 1292
1293 static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio, 1293 static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio,
1294 struct bio *nxt) 1294 struct bio *nxt)
1295 { 1295 {
1296 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1296 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1297 blk_recount_segments(q, bio); 1297 blk_recount_segments(q, bio);
1298 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID))) 1298 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID)))
1299 blk_recount_segments(q, nxt); 1299 blk_recount_segments(q, nxt);
1300 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) || 1300 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
1301 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size)) 1301 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size))
1302 return 0; 1302 return 0;
1303 if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size) 1303 if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size)
1304 return 0; 1304 return 0;
1305 1305
1306 return 1; 1306 return 1;
1307 } 1307 }
1308 1308
1309 /* 1309 /*
1310 * map a request to scatterlist, return number of sg entries setup. Caller 1310 * map a request to scatterlist, return number of sg entries setup. Caller
1311 * must make sure sg can hold rq->nr_phys_segments entries 1311 * must make sure sg can hold rq->nr_phys_segments entries
1312 */ 1312 */
1313 int blk_rq_map_sg(struct request_queue *q, struct request *rq, 1313 int blk_rq_map_sg(struct request_queue *q, struct request *rq,
1314 struct scatterlist *sg) 1314 struct scatterlist *sg)
1315 { 1315 {
1316 struct bio_vec *bvec, *bvprv; 1316 struct bio_vec *bvec, *bvprv;
1317 struct bio *bio; 1317 struct bio *bio;
1318 int nsegs, i, cluster; 1318 int nsegs, i, cluster;
1319 1319
1320 nsegs = 0; 1320 nsegs = 0;
1321 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1321 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1322 1322
1323 /* 1323 /*
1324 * for each bio in rq 1324 * for each bio in rq
1325 */ 1325 */
1326 bvprv = NULL; 1326 bvprv = NULL;
1327 rq_for_each_bio(bio, rq) { 1327 rq_for_each_bio(bio, rq) {
1328 /* 1328 /*
1329 * for each segment in bio 1329 * for each segment in bio
1330 */ 1330 */
1331 bio_for_each_segment(bvec, bio, i) { 1331 bio_for_each_segment(bvec, bio, i) {
1332 int nbytes = bvec->bv_len; 1332 int nbytes = bvec->bv_len;
1333 1333
1334 if (bvprv && cluster) { 1334 if (bvprv && cluster) {
1335 if (sg[nsegs - 1].length + nbytes > q->max_segment_size) 1335 if (sg[nsegs - 1].length + nbytes > q->max_segment_size)
1336 goto new_segment; 1336 goto new_segment;
1337 1337
1338 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) 1338 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
1339 goto new_segment; 1339 goto new_segment;
1340 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) 1340 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
1341 goto new_segment; 1341 goto new_segment;
1342 1342
1343 sg[nsegs - 1].length += nbytes; 1343 sg[nsegs - 1].length += nbytes;
1344 } else { 1344 } else {
1345 new_segment: 1345 new_segment:
1346 memset(&sg[nsegs],0,sizeof(struct scatterlist)); 1346 memset(&sg[nsegs],0,sizeof(struct scatterlist));
1347 sg[nsegs].page = bvec->bv_page; 1347 sg[nsegs].page = bvec->bv_page;
1348 sg[nsegs].length = nbytes; 1348 sg[nsegs].length = nbytes;
1349 sg[nsegs].offset = bvec->bv_offset; 1349 sg[nsegs].offset = bvec->bv_offset;
1350 1350
1351 nsegs++; 1351 nsegs++;
1352 } 1352 }
1353 bvprv = bvec; 1353 bvprv = bvec;
1354 } /* segments in bio */ 1354 } /* segments in bio */
1355 } /* bios in rq */ 1355 } /* bios in rq */
1356 1356
1357 return nsegs; 1357 return nsegs;
1358 } 1358 }
1359 1359
1360 EXPORT_SYMBOL(blk_rq_map_sg); 1360 EXPORT_SYMBOL(blk_rq_map_sg);
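A typical caller follows the pattern sketched below: size the table from rq->nr_phys_segments, map the request, then hand the resulting entries to the DMA API. This is a hedged sketch only; my_prep_dma(), my_dev (a struct device pointer) and my_sg are hypothetical driver state.

	static int my_prep_dma(struct request_queue *q, struct request *rq,
			       struct scatterlist *my_sg)
	{
		int count;

		/* my_sg must hold at least rq->nr_phys_segments entries */
		count = blk_rq_map_sg(q, rq, my_sg);

		return dma_map_sg(my_dev, my_sg, count,
				  rq_data_dir(rq) == WRITE ?
					DMA_TO_DEVICE : DMA_FROM_DEVICE);
	}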
1361 1361
1362 /* 1362 /*
1363 * the standard queue merge functions, can be overridden with device 1363 * the standard queue merge functions, can be overridden with device
1364 * specific ones if so desired 1364 * specific ones if so desired
1365 */ 1365 */
1366 1366
1367 static inline int ll_new_mergeable(struct request_queue *q, 1367 static inline int ll_new_mergeable(struct request_queue *q,
1368 struct request *req, 1368 struct request *req,
1369 struct bio *bio) 1369 struct bio *bio)
1370 { 1370 {
1371 int nr_phys_segs = bio_phys_segments(q, bio); 1371 int nr_phys_segs = bio_phys_segments(q, bio);
1372 1372
1373 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1373 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1374 req->cmd_flags |= REQ_NOMERGE; 1374 req->cmd_flags |= REQ_NOMERGE;
1375 if (req == q->last_merge) 1375 if (req == q->last_merge)
1376 q->last_merge = NULL; 1376 q->last_merge = NULL;
1377 return 0; 1377 return 0;
1378 } 1378 }
1379 1379
1380 /* 1380 /*
1381 * A hw segment is just getting larger, bump just the phys 1381 * A hw segment is just getting larger, bump just the phys
1382 * counter. 1382 * counter.
1383 */ 1383 */
1384 req->nr_phys_segments += nr_phys_segs; 1384 req->nr_phys_segments += nr_phys_segs;
1385 return 1; 1385 return 1;
1386 } 1386 }
1387 1387
1388 static inline int ll_new_hw_segment(struct request_queue *q, 1388 static inline int ll_new_hw_segment(struct request_queue *q,
1389 struct request *req, 1389 struct request *req,
1390 struct bio *bio) 1390 struct bio *bio)
1391 { 1391 {
1392 int nr_hw_segs = bio_hw_segments(q, bio); 1392 int nr_hw_segs = bio_hw_segments(q, bio);
1393 int nr_phys_segs = bio_phys_segments(q, bio); 1393 int nr_phys_segs = bio_phys_segments(q, bio);
1394 1394
1395 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments 1395 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
1396 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1396 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1397 req->cmd_flags |= REQ_NOMERGE; 1397 req->cmd_flags |= REQ_NOMERGE;
1398 if (req == q->last_merge) 1398 if (req == q->last_merge)
1399 q->last_merge = NULL; 1399 q->last_merge = NULL;
1400 return 0; 1400 return 0;
1401 } 1401 }
1402 1402
1403 /* 1403 /*
1404 * This will form the start of a new hw segment. Bump both 1404 * This will form the start of a new hw segment. Bump both
1405 * counters. 1405 * counters.
1406 */ 1406 */
1407 req->nr_hw_segments += nr_hw_segs; 1407 req->nr_hw_segments += nr_hw_segs;
1408 req->nr_phys_segments += nr_phys_segs; 1408 req->nr_phys_segments += nr_phys_segs;
1409 return 1; 1409 return 1;
1410 } 1410 }
1411 1411
1412 int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) 1412 int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio)
1413 { 1413 {
1414 unsigned short max_sectors; 1414 unsigned short max_sectors;
1415 int len; 1415 int len;
1416 1416
1417 if (unlikely(blk_pc_request(req))) 1417 if (unlikely(blk_pc_request(req)))
1418 max_sectors = q->max_hw_sectors; 1418 max_sectors = q->max_hw_sectors;
1419 else 1419 else
1420 max_sectors = q->max_sectors; 1420 max_sectors = q->max_sectors;
1421 1421
1422 if (req->nr_sectors + bio_sectors(bio) > max_sectors) { 1422 if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
1423 req->cmd_flags |= REQ_NOMERGE; 1423 req->cmd_flags |= REQ_NOMERGE;
1424 if (req == q->last_merge) 1424 if (req == q->last_merge)
1425 q->last_merge = NULL; 1425 q->last_merge = NULL;
1426 return 0; 1426 return 0;
1427 } 1427 }
1428 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID))) 1428 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID)))
1429 blk_recount_segments(q, req->biotail); 1429 blk_recount_segments(q, req->biotail);
1430 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1430 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1431 blk_recount_segments(q, bio); 1431 blk_recount_segments(q, bio);
1432 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size; 1432 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
1433 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) && 1433 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) &&
1434 !BIOVEC_VIRT_OVERSIZE(len)) { 1434 !BIOVEC_VIRT_OVERSIZE(len)) {
1435 int mergeable = ll_new_mergeable(q, req, bio); 1435 int mergeable = ll_new_mergeable(q, req, bio);
1436 1436
1437 if (mergeable) { 1437 if (mergeable) {
1438 if (req->nr_hw_segments == 1) 1438 if (req->nr_hw_segments == 1)
1439 req->bio->bi_hw_front_size = len; 1439 req->bio->bi_hw_front_size = len;
1440 if (bio->bi_hw_segments == 1) 1440 if (bio->bi_hw_segments == 1)
1441 bio->bi_hw_back_size = len; 1441 bio->bi_hw_back_size = len;
1442 } 1442 }
1443 return mergeable; 1443 return mergeable;
1444 } 1444 }
1445 1445
1446 return ll_new_hw_segment(q, req, bio); 1446 return ll_new_hw_segment(q, req, bio);
1447 } 1447 }
1448 EXPORT_SYMBOL(ll_back_merge_fn); 1448 EXPORT_SYMBOL(ll_back_merge_fn);
1449 1449
1450 static int ll_front_merge_fn(struct request_queue *q, struct request *req, 1450 static int ll_front_merge_fn(struct request_queue *q, struct request *req,
1451 struct bio *bio) 1451 struct bio *bio)
1452 { 1452 {
1453 unsigned short max_sectors; 1453 unsigned short max_sectors;
1454 int len; 1454 int len;
1455 1455
1456 if (unlikely(blk_pc_request(req))) 1456 if (unlikely(blk_pc_request(req)))
1457 max_sectors = q->max_hw_sectors; 1457 max_sectors = q->max_hw_sectors;
1458 else 1458 else
1459 max_sectors = q->max_sectors; 1459 max_sectors = q->max_sectors;
1460 1460
1461 1461
1462 if (req->nr_sectors + bio_sectors(bio) > max_sectors) { 1462 if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
1463 req->cmd_flags |= REQ_NOMERGE; 1463 req->cmd_flags |= REQ_NOMERGE;
1464 if (req == q->last_merge) 1464 if (req == q->last_merge)
1465 q->last_merge = NULL; 1465 q->last_merge = NULL;
1466 return 0; 1466 return 0;
1467 } 1467 }
1468 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size; 1468 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
1469 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1469 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1470 blk_recount_segments(q, bio); 1470 blk_recount_segments(q, bio);
1471 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID))) 1471 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID)))
1472 blk_recount_segments(q, req->bio); 1472 blk_recount_segments(q, req->bio);
1473 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) && 1473 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
1474 !BIOVEC_VIRT_OVERSIZE(len)) { 1474 !BIOVEC_VIRT_OVERSIZE(len)) {
1475 int mergeable = ll_new_mergeable(q, req, bio); 1475 int mergeable = ll_new_mergeable(q, req, bio);
1476 1476
1477 if (mergeable) { 1477 if (mergeable) {
1478 if (bio->bi_hw_segments == 1) 1478 if (bio->bi_hw_segments == 1)
1479 bio->bi_hw_front_size = len; 1479 bio->bi_hw_front_size = len;
1480 if (req->nr_hw_segments == 1) 1480 if (req->nr_hw_segments == 1)
1481 req->biotail->bi_hw_back_size = len; 1481 req->biotail->bi_hw_back_size = len;
1482 } 1482 }
1483 return mergeable; 1483 return mergeable;
1484 } 1484 }
1485 1485
1486 return ll_new_hw_segment(q, req, bio); 1486 return ll_new_hw_segment(q, req, bio);
1487 } 1487 }
1488 1488
1489 static int ll_merge_requests_fn(struct request_queue *q, struct request *req, 1489 static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
1490 struct request *next) 1490 struct request *next)
1491 { 1491 {
1492 int total_phys_segments; 1492 int total_phys_segments;
1493 int total_hw_segments; 1493 int total_hw_segments;
1494 1494
1495 /* 1495 /*
1496 * First check if either of the requests is a re-queued 1496 * First check if either of the requests is a re-queued
1497 * request. Can't merge them if they are. 1497 * request. Can't merge them if they are.
1498 */ 1498 */
1499 if (req->special || next->special) 1499 if (req->special || next->special)
1500 return 0; 1500 return 0;
1501 1501
1502 /* 1502 /*
1503 * Will it become too large? 1503 * Will it become too large?
1504 */ 1504 */
1505 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors) 1505 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
1506 return 0; 1506 return 0;
1507 1507
1508 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; 1508 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
1509 if (blk_phys_contig_segment(q, req->biotail, next->bio)) 1509 if (blk_phys_contig_segment(q, req->biotail, next->bio))
1510 total_phys_segments--; 1510 total_phys_segments--;
1511 1511
1512 if (total_phys_segments > q->max_phys_segments) 1512 if (total_phys_segments > q->max_phys_segments)
1513 return 0; 1513 return 0;
1514 1514
1515 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments; 1515 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
1516 if (blk_hw_contig_segment(q, req->biotail, next->bio)) { 1516 if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
1517 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size; 1517 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size;
1518 /* 1518 /*
1519 * propagate the combined length to the end of the requests 1519 * propagate the combined length to the end of the requests
1520 */ 1520 */
1521 if (req->nr_hw_segments == 1) 1521 if (req->nr_hw_segments == 1)
1522 req->bio->bi_hw_front_size = len; 1522 req->bio->bi_hw_front_size = len;
1523 if (next->nr_hw_segments == 1) 1523 if (next->nr_hw_segments == 1)
1524 next->biotail->bi_hw_back_size = len; 1524 next->biotail->bi_hw_back_size = len;
1525 total_hw_segments--; 1525 total_hw_segments--;
1526 } 1526 }
1527 1527
1528 if (total_hw_segments > q->max_hw_segments) 1528 if (total_hw_segments > q->max_hw_segments)
1529 return 0; 1529 return 0;
1530 1530
1531 /* Merge is OK... */ 1531 /* Merge is OK... */
1532 req->nr_phys_segments = total_phys_segments; 1532 req->nr_phys_segments = total_phys_segments;
1533 req->nr_hw_segments = total_hw_segments; 1533 req->nr_hw_segments = total_hw_segments;
1534 return 1; 1534 return 1;
1535 } 1535 }
1536 1536
1537 /* 1537 /*
1538 * "plug" the device if there are no outstanding requests: this will 1538 * "plug" the device if there are no outstanding requests: this will
1539 * force the transfer to start only after we have put all the requests 1539 * force the transfer to start only after we have put all the requests
1540 * on the list. 1540 * on the list.
1541 * 1541 *
1542 * This is called with interrupts off and no requests on the queue and 1542 * This is called with interrupts off and no requests on the queue and
1543 * with the queue lock held. 1543 * with the queue lock held.
1544 */ 1544 */
1545 void blk_plug_device(struct request_queue *q) 1545 void blk_plug_device(struct request_queue *q)
1546 { 1546 {
1547 WARN_ON(!irqs_disabled()); 1547 WARN_ON(!irqs_disabled());
1548 1548
1549 /* 1549 /*
1550 * don't plug a stopped queue, it must be paired with blk_start_queue() 1550 * don't plug a stopped queue, it must be paired with blk_start_queue()
1551 * which will restart the queueing 1551 * which will restart the queueing
1552 */ 1552 */
1553 if (blk_queue_stopped(q)) 1553 if (blk_queue_stopped(q))
1554 return; 1554 return;
1555 1555
1556 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) { 1556 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
1557 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 1557 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
1558 blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); 1558 blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
1559 } 1559 }
1560 } 1560 }
1561 1561
1562 EXPORT_SYMBOL(blk_plug_device); 1562 EXPORT_SYMBOL(blk_plug_device);
1563 1563
1564 /* 1564 /*
1565 * remove the queue from the plugged list, if present. called with 1565 * remove the queue from the plugged list, if present. called with
1566 * queue lock held and interrupts disabled. 1566 * queue lock held and interrupts disabled.
1567 */ 1567 */
1568 int blk_remove_plug(struct request_queue *q) 1568 int blk_remove_plug(struct request_queue *q)
1569 { 1569 {
1570 WARN_ON(!irqs_disabled()); 1570 WARN_ON(!irqs_disabled());
1571 1571
1572 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) 1572 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
1573 return 0; 1573 return 0;
1574 1574
1575 del_timer(&q->unplug_timer); 1575 del_timer(&q->unplug_timer);
1576 return 1; 1576 return 1;
1577 } 1577 }
1578 1578
1579 EXPORT_SYMBOL(blk_remove_plug); 1579 EXPORT_SYMBOL(blk_remove_plug);
1580 1580
1581 /* 1581 /*
1582 * remove the plug and let it rip.. 1582 * remove the plug and let it rip..
1583 */ 1583 */
1584 void __generic_unplug_device(struct request_queue *q) 1584 void __generic_unplug_device(struct request_queue *q)
1585 { 1585 {
1586 if (unlikely(blk_queue_stopped(q))) 1586 if (unlikely(blk_queue_stopped(q)))
1587 return; 1587 return;
1588 1588
1589 if (!blk_remove_plug(q)) 1589 if (!blk_remove_plug(q))
1590 return; 1590 return;
1591 1591
1592 q->request_fn(q); 1592 q->request_fn(q);
1593 } 1593 }
1594 EXPORT_SYMBOL(__generic_unplug_device); 1594 EXPORT_SYMBOL(__generic_unplug_device);
1595 1595
1596 /** 1596 /**
1597 * generic_unplug_device - fire a request queue 1597 * generic_unplug_device - fire a request queue
1598 * @q: The &struct request_queue in question 1598 * @q: The &struct request_queue in question
1599 * 1599 *
1600 * Description: 1600 * Description:
1601 * Linux uses plugging to build bigger request queues before letting 1601 * Linux uses plugging to build bigger request queues before letting
1602 * the device have at them. If a queue is plugged, the I/O scheduler 1602 * the device have at them. If a queue is plugged, the I/O scheduler
1603 * is still adding and merging requests on the queue. Once the queue 1603 * is still adding and merging requests on the queue. Once the queue
1604 * gets unplugged, the request_fn defined for the queue is invoked and 1604 * gets unplugged, the request_fn defined for the queue is invoked and
1605 * transfers started. 1605 * transfers started.
1606 **/ 1606 **/
1607 void generic_unplug_device(struct request_queue *q) 1607 void generic_unplug_device(struct request_queue *q)
1608 { 1608 {
1609 spin_lock_irq(q->queue_lock); 1609 spin_lock_irq(q->queue_lock);
1610 __generic_unplug_device(q); 1610 __generic_unplug_device(q);
1611 spin_unlock_irq(q->queue_lock); 1611 spin_unlock_irq(q->queue_lock);
1612 } 1612 }
1613 EXPORT_SYMBOL(generic_unplug_device); 1613 EXPORT_SYMBOL(generic_unplug_device);
1614 1614
1615 static void blk_backing_dev_unplug(struct backing_dev_info *bdi, 1615 static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
1616 struct page *page) 1616 struct page *page)
1617 { 1617 {
1618 struct request_queue *q = bdi->unplug_io_data; 1618 struct request_queue *q = bdi->unplug_io_data;
1619 1619
1620 /* 1620 /*
1621 * devices don't necessarily have an ->unplug_fn defined 1621 * devices don't necessarily have an ->unplug_fn defined
1622 */ 1622 */
1623 if (q->unplug_fn) { 1623 if (q->unplug_fn) {
1624 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, 1624 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
1625 q->rq.count[READ] + q->rq.count[WRITE]); 1625 q->rq.count[READ] + q->rq.count[WRITE]);
1626 1626
1627 q->unplug_fn(q); 1627 q->unplug_fn(q);
1628 } 1628 }
1629 } 1629 }
1630 1630
1631 static void blk_unplug_work(struct work_struct *work) 1631 static void blk_unplug_work(struct work_struct *work)
1632 { 1632 {
1633 struct request_queue *q = 1633 struct request_queue *q =
1634 container_of(work, struct request_queue, unplug_work); 1634 container_of(work, struct request_queue, unplug_work);
1635 1635
1636 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, 1636 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
1637 q->rq.count[READ] + q->rq.count[WRITE]); 1637 q->rq.count[READ] + q->rq.count[WRITE]);
1638 1638
1639 q->unplug_fn(q); 1639 q->unplug_fn(q);
1640 } 1640 }
1641 1641
1642 static void blk_unplug_timeout(unsigned long data) 1642 static void blk_unplug_timeout(unsigned long data)
1643 { 1643 {
1644 struct request_queue *q = (struct request_queue *)data; 1644 struct request_queue *q = (struct request_queue *)data;
1645 1645
1646 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL, 1646 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
1647 q->rq.count[READ] + q->rq.count[WRITE]); 1647 q->rq.count[READ] + q->rq.count[WRITE]);
1648 1648
1649 kblockd_schedule_work(&q->unplug_work); 1649 kblockd_schedule_work(&q->unplug_work);
1650 } 1650 }
1651 1651
1652 /** 1652 /**
1653 * blk_start_queue - restart a previously stopped queue 1653 * blk_start_queue - restart a previously stopped queue
1654 * @q: The &struct request_queue in question 1654 * @q: The &struct request_queue in question
1655 * 1655 *
1656 * Description: 1656 * Description:
1657 * blk_start_queue() will clear the stop flag on the queue, and call 1657 * blk_start_queue() will clear the stop flag on the queue, and call
1658 * the request_fn for the queue if it was in a stopped state when 1658 * the request_fn for the queue if it was in a stopped state when
1659 * entered. Also see blk_stop_queue(). Queue lock must be held. 1659 * entered. Also see blk_stop_queue(). Queue lock must be held.
1660 **/ 1660 **/
1661 void blk_start_queue(struct request_queue *q) 1661 void blk_start_queue(struct request_queue *q)
1662 { 1662 {
1663 WARN_ON(!irqs_disabled()); 1663 WARN_ON(!irqs_disabled());
1664 1664
1665 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1665 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1666 1666
1667 /* 1667 /*
1668 * one level of recursion is ok and is much faster than kicking 1668 * one level of recursion is ok and is much faster than kicking
1669 * the unplug handling 1669 * the unplug handling
1670 */ 1670 */
1671 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { 1671 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
1672 q->request_fn(q); 1672 q->request_fn(q);
1673 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); 1673 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
1674 } else { 1674 } else {
1675 blk_plug_device(q); 1675 blk_plug_device(q);
1676 kblockd_schedule_work(&q->unplug_work); 1676 kblockd_schedule_work(&q->unplug_work);
1677 } 1677 }
1678 } 1678 }
1679 1679
1680 EXPORT_SYMBOL(blk_start_queue); 1680 EXPORT_SYMBOL(blk_start_queue);
1681 1681
1682 /** 1682 /**
1683 * blk_stop_queue - stop a queue 1683 * blk_stop_queue - stop a queue
1684 * @q: The &struct request_queue in question 1684 * @q: The &struct request_queue in question
1685 * 1685 *
1686 * Description: 1686 * Description:
1687 * The Linux block layer assumes that a block driver will consume all 1687 * The Linux block layer assumes that a block driver will consume all
1688 * entries on the request queue when the request_fn strategy is called. 1688 * entries on the request queue when the request_fn strategy is called.
1689 * Often this will not happen, because of hardware limitations (queue 1689 * Often this will not happen, because of hardware limitations (queue
1690 * depth settings). If a device driver gets a 'queue full' response, 1690 * depth settings). If a device driver gets a 'queue full' response,
1691 * or if it simply chooses not to queue more I/O at one point, it can 1691 * or if it simply chooses not to queue more I/O at one point, it can
1692 * call this function to prevent the request_fn from being called until 1692 * call this function to prevent the request_fn from being called until
1693 * the driver has signalled it's ready to go again. This happens by calling 1693 * the driver has signalled it's ready to go again. This happens by calling
1694 * blk_start_queue() to restart queue operations. Queue lock must be held. 1694 * blk_start_queue() to restart queue operations. Queue lock must be held.
1695 **/ 1695 **/
1696 void blk_stop_queue(struct request_queue *q) 1696 void blk_stop_queue(struct request_queue *q)
1697 { 1697 {
1698 blk_remove_plug(q); 1698 blk_remove_plug(q);
1699 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1699 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1700 } 1700 }
1701 EXPORT_SYMBOL(blk_stop_queue); 1701 EXPORT_SYMBOL(blk_stop_queue);
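The 'queue full' scenario described above usually pairs the two helpers as in this sketch; my_strategy(), my_hw_full(), my_issue() and my_hw_has_room() are hypothetical, with my_strategy() acting as the request_fn (entered with the queue lock held).

	static void my_strategy(struct request_queue *q)
	{
		struct request *rq;

		while ((rq = elv_next_request(q)) != NULL) {
			if (my_hw_full()) {
				blk_stop_queue(q);	/* no further request_fn calls */
				break;
			}
			blkdev_dequeue_request(rq);
			my_issue(rq);
		}
	}

	/* later, from the completion handler, once the hardware has room again */
	static void my_hw_has_room(struct request_queue *q)
	{
		unsigned long flags;

		spin_lock_irqsave(q->queue_lock, flags);
		blk_start_queue(q);	/* clears the stop flag and reruns the queue */
		spin_unlock_irqrestore(q->queue_lock, flags);
	}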
1702 1702
1703 /** 1703 /**
1704 * blk_sync_queue - cancel any pending callbacks on a queue 1704 * blk_sync_queue - cancel any pending callbacks on a queue
1705 * @q: the queue 1705 * @q: the queue
1706 * 1706 *
1707 * Description: 1707 * Description:
1708 * The block layer may perform asynchronous callback activity 1708 * The block layer may perform asynchronous callback activity
1709 * on a queue, such as calling the unplug function after a timeout. 1709 * on a queue, such as calling the unplug function after a timeout.
1710 * A block device may call blk_sync_queue to ensure that any 1710 * A block device may call blk_sync_queue to ensure that any
1711 * such activity is cancelled, thus allowing it to release resources 1711 * such activity is cancelled, thus allowing it to release resources
1712 * that the callbacks might use. The caller must already have made sure 1712 * that the callbacks might use. The caller must already have made sure
1713 * that its ->make_request_fn will not re-add plugging prior to calling 1713 * that its ->make_request_fn will not re-add plugging prior to calling
1714 * this function. 1714 * this function.
1715 * 1715 *
1716 */ 1716 */
1717 void blk_sync_queue(struct request_queue *q) 1717 void blk_sync_queue(struct request_queue *q)
1718 { 1718 {
1719 del_timer_sync(&q->unplug_timer); 1719 del_timer_sync(&q->unplug_timer);
1720 } 1720 }
1721 EXPORT_SYMBOL(blk_sync_queue); 1721 EXPORT_SYMBOL(blk_sync_queue);
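In practice this tends to be the last thing a driver does with the queue before tearing down the state its unplug callback touches, as in this small sketch; struct mydev, its queue member and mydev_release() are hypothetical.

	static void mydev_release(struct mydev *dev)
	{
		blk_sync_queue(dev->queue);	/* no unplug timer or work left running */
		/* now safe to free DMA buffers and other state used by ->unplug_fn */
	}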
1722 1722
1723 /** 1723 /**
1724 * blk_run_queue - run a single device queue 1724 * blk_run_queue - run a single device queue
1725 * @q: The queue to run 1725 * @q: The queue to run
1726 */ 1726 */
1727 void blk_run_queue(struct request_queue *q) 1727 void blk_run_queue(struct request_queue *q)
1728 { 1728 {
1729 unsigned long flags; 1729 unsigned long flags;
1730 1730
1731 spin_lock_irqsave(q->queue_lock, flags); 1731 spin_lock_irqsave(q->queue_lock, flags);
1732 blk_remove_plug(q); 1732 blk_remove_plug(q);
1733 1733
1734 /* 1734 /*
1735 * Only recurse once to avoid overrunning the stack, let the unplug 1735 * Only recurse once to avoid overrunning the stack, let the unplug
1736 * handling reinvoke the handler shortly if we already got there. 1736 * handling reinvoke the handler shortly if we already got there.
1737 */ 1737 */
1738 if (!elv_queue_empty(q)) { 1738 if (!elv_queue_empty(q)) {
1739 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { 1739 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
1740 q->request_fn(q); 1740 q->request_fn(q);
1741 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); 1741 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
1742 } else { 1742 } else {
1743 blk_plug_device(q); 1743 blk_plug_device(q);
1744 kblockd_schedule_work(&q->unplug_work); 1744 kblockd_schedule_work(&q->unplug_work);
1745 } 1745 }
1746 } 1746 }
1747 1747
1748 spin_unlock_irqrestore(q->queue_lock, flags); 1748 spin_unlock_irqrestore(q->queue_lock, flags);
1749 } 1749 }
1750 EXPORT_SYMBOL(blk_run_queue); 1750 EXPORT_SYMBOL(blk_run_queue);
1751 1751
1752 /** 1752 /**
1753 * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed 1753 * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed
1754 * @kobj: the kobj belonging to the request queue to be released 1754 * @kobj: the kobj belonging to the request queue to be released
1755 * 1755 *
1756 * Description: 1756 * Description:
1757 * blk_cleanup_queue is the pair to blk_init_queue() or 1757 * blk_cleanup_queue is the pair to blk_init_queue() or
1758 * blk_queue_make_request(). It should be called when a request queue is 1758 * blk_queue_make_request(). It should be called when a request queue is
1759 * being released; typically when a block device is being de-registered. 1759 * being released; typically when a block device is being de-registered.
1760 * Currently, its primary task is to free all the &struct request 1760 * Currently, its primary task is to free all the &struct request
1761 * structures that were allocated to the queue and the queue itself. 1761 * structures that were allocated to the queue and the queue itself.
1762 * 1762 *
1763 * Caveat: 1763 * Caveat:
1764 * Hopefully the low level driver will have finished any 1764 * Hopefully the low level driver will have finished any
1765 * outstanding requests first... 1765 * outstanding requests first...
1766 **/ 1766 **/
1767 static void blk_release_queue(struct kobject *kobj) 1767 static void blk_release_queue(struct kobject *kobj)
1768 { 1768 {
1769 struct request_queue *q = 1769 struct request_queue *q =
1770 container_of(kobj, struct request_queue, kobj); 1770 container_of(kobj, struct request_queue, kobj);
1771 struct request_list *rl = &q->rq; 1771 struct request_list *rl = &q->rq;
1772 1772
1773 blk_sync_queue(q); 1773 blk_sync_queue(q);
1774 1774
1775 if (rl->rq_pool) 1775 if (rl->rq_pool)
1776 mempool_destroy(rl->rq_pool); 1776 mempool_destroy(rl->rq_pool);
1777 1777
1778 if (q->queue_tags) 1778 if (q->queue_tags)
1779 __blk_queue_free_tags(q); 1779 __blk_queue_free_tags(q);
1780 1780
1781 blk_trace_shutdown(q); 1781 blk_trace_shutdown(q);
1782 1782
1783 kmem_cache_free(requestq_cachep, q); 1783 kmem_cache_free(requestq_cachep, q);
1784 } 1784 }
1785 1785
1786 void blk_put_queue(struct request_queue *q) 1786 void blk_put_queue(struct request_queue *q)
1787 { 1787 {
1788 kobject_put(&q->kobj); 1788 kobject_put(&q->kobj);
1789 } 1789 }
1790 EXPORT_SYMBOL(blk_put_queue); 1790 EXPORT_SYMBOL(blk_put_queue);
1791 1791
1792 void blk_cleanup_queue(struct request_queue * q) 1792 void blk_cleanup_queue(struct request_queue * q)
1793 { 1793 {
1794 mutex_lock(&q->sysfs_lock); 1794 mutex_lock(&q->sysfs_lock);
1795 set_bit(QUEUE_FLAG_DEAD, &q->queue_flags); 1795 set_bit(QUEUE_FLAG_DEAD, &q->queue_flags);
1796 mutex_unlock(&q->sysfs_lock); 1796 mutex_unlock(&q->sysfs_lock);
1797 1797
1798 if (q->elevator) 1798 if (q->elevator)
1799 elevator_exit(q->elevator); 1799 elevator_exit(q->elevator);
1800 1800
1801 blk_put_queue(q); 1801 blk_put_queue(q);
1802 } 1802 }
1803 1803
1804 EXPORT_SYMBOL(blk_cleanup_queue); 1804 EXPORT_SYMBOL(blk_cleanup_queue);
1805 1805
1806 static int blk_init_free_list(struct request_queue *q) 1806 static int blk_init_free_list(struct request_queue *q)
1807 { 1807 {
1808 struct request_list *rl = &q->rq; 1808 struct request_list *rl = &q->rq;
1809 1809
1810 rl->count[READ] = rl->count[WRITE] = 0; 1810 rl->count[READ] = rl->count[WRITE] = 0;
1811 rl->starved[READ] = rl->starved[WRITE] = 0; 1811 rl->starved[READ] = rl->starved[WRITE] = 0;
1812 rl->elvpriv = 0; 1812 rl->elvpriv = 0;
1813 init_waitqueue_head(&rl->wait[READ]); 1813 init_waitqueue_head(&rl->wait[READ]);
1814 init_waitqueue_head(&rl->wait[WRITE]); 1814 init_waitqueue_head(&rl->wait[WRITE]);
1815 1815
1816 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 1816 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
1817 mempool_free_slab, request_cachep, q->node); 1817 mempool_free_slab, request_cachep, q->node);
1818 1818
1819 if (!rl->rq_pool) 1819 if (!rl->rq_pool)
1820 return -ENOMEM; 1820 return -ENOMEM;
1821 1821
1822 return 0; 1822 return 0;
1823 } 1823 }
1824 1824
1825 struct request_queue *blk_alloc_queue(gfp_t gfp_mask) 1825 struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
1826 { 1826 {
1827 return blk_alloc_queue_node(gfp_mask, -1); 1827 return blk_alloc_queue_node(gfp_mask, -1);
1828 } 1828 }
1829 EXPORT_SYMBOL(blk_alloc_queue); 1829 EXPORT_SYMBOL(blk_alloc_queue);
1830 1830
1831 static struct kobj_type queue_ktype; 1831 static struct kobj_type queue_ktype;
1832 1832
1833 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) 1833 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
1834 { 1834 {
1835 struct request_queue *q; 1835 struct request_queue *q;
1836 1836
1837 q = kmem_cache_alloc_node(requestq_cachep, 1837 q = kmem_cache_alloc_node(requestq_cachep,
1838 gfp_mask | __GFP_ZERO, node_id); 1838 gfp_mask | __GFP_ZERO, node_id);
1839 if (!q) 1839 if (!q)
1840 return NULL; 1840 return NULL;
1841 1841
1842 init_timer(&q->unplug_timer); 1842 init_timer(&q->unplug_timer);
1843 1843
1844 snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue"); 1844 snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue");
1845 q->kobj.ktype = &queue_ktype; 1845 q->kobj.ktype = &queue_ktype;
1846 kobject_init(&q->kobj); 1846 kobject_init(&q->kobj);
1847 1847
1848 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; 1848 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
1849 q->backing_dev_info.unplug_io_data = q; 1849 q->backing_dev_info.unplug_io_data = q;
1850 1850
1851 mutex_init(&q->sysfs_lock); 1851 mutex_init(&q->sysfs_lock);
1852 1852
1853 return q; 1853 return q;
1854 } 1854 }
1855 EXPORT_SYMBOL(blk_alloc_queue_node); 1855 EXPORT_SYMBOL(blk_alloc_queue_node);
1856 1856
1857 /** 1857 /**
1858 * blk_init_queue - prepare a request queue for use with a block device 1858 * blk_init_queue - prepare a request queue for use with a block device
1859 * @rfn: The function to be called to process requests that have been 1859 * @rfn: The function to be called to process requests that have been
1860 * placed on the queue. 1860 * placed on the queue.
1861 * @lock: Request queue spin lock 1861 * @lock: Request queue spin lock
1862 * 1862 *
1863 * Description: 1863 * Description:
1864 * If a block device wishes to use the standard request handling procedures, 1864 * If a block device wishes to use the standard request handling procedures,
1865 * which sorts requests and coalesces adjacent requests, then it must 1865 * which sorts requests and coalesces adjacent requests, then it must
1866 * call blk_init_queue(). The function @rfn will be called when there 1866 * call blk_init_queue(). The function @rfn will be called when there
1867 * are requests on the queue that need to be processed. If the device 1867 * are requests on the queue that need to be processed. If the device
1868 * supports plugging, then @rfn may not be called immediately when requests 1868 * supports plugging, then @rfn may not be called immediately when requests
1869 * are available on the queue, but may be called at some time later instead. 1869 * are available on the queue, but may be called at some time later instead.
1870 * Plugged queues are generally unplugged when a buffer belonging to one 1870 * Plugged queues are generally unplugged when a buffer belonging to one
1871 * of the requests on the queue is needed, or due to memory pressure. 1871 * of the requests on the queue is needed, or due to memory pressure.
1872 * 1872 *
1873 * @rfn is not required, or even expected, to remove all requests off the 1873 * @rfn is not required, or even expected, to remove all requests off the
1874 * queue, but only as many as it can handle at a time. If it does leave 1874 * queue, but only as many as it can handle at a time. If it does leave
1875 * requests on the queue, it is responsible for arranging that the requests 1875 * requests on the queue, it is responsible for arranging that the requests
1876 * get dealt with eventually. 1876 * get dealt with eventually.
1877 * 1877 *
1878 * The queue spin lock must be held while manipulating the requests on the 1878 * The queue spin lock must be held while manipulating the requests on the
1879 * request queue; this lock will be taken also from interrupt context, so irq 1879 * request queue; this lock will be taken also from interrupt context, so irq
1880 * disabling is needed for it. 1880 * disabling is needed for it.
1881 * 1881 *
1882 * Function returns a pointer to the initialized request queue, or NULL if 1882 * Function returns a pointer to the initialized request queue, or NULL if
1883 * it didn't succeed. 1883 * it didn't succeed.
1884 * 1884 *
1885 * Note: 1885 * Note:
1886 * blk_init_queue() must be paired with a blk_cleanup_queue() call 1886 * blk_init_queue() must be paired with a blk_cleanup_queue() call
1887 * when the block device is deactivated (such as at module unload). 1887 * when the block device is deactivated (such as at module unload).
1888 **/ 1888 **/
1889 1889
1890 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 1890 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
1891 { 1891 {
1892 return blk_init_queue_node(rfn, lock, -1); 1892 return blk_init_queue_node(rfn, lock, -1);
1893 } 1893 }
1894 EXPORT_SYMBOL(blk_init_queue); 1894 EXPORT_SYMBOL(blk_init_queue);
1895 1895
1896 struct request_queue * 1896 struct request_queue *
1897 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 1897 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
1898 { 1898 {
1899 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); 1899 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
1900 1900
1901 if (!q) 1901 if (!q)
1902 return NULL; 1902 return NULL;
1903 1903
1904 q->node = node_id; 1904 q->node = node_id;
1905 if (blk_init_free_list(q)) { 1905 if (blk_init_free_list(q)) {
1906 kmem_cache_free(requestq_cachep, q); 1906 kmem_cache_free(requestq_cachep, q);
1907 return NULL; 1907 return NULL;
1908 } 1908 }
1909 1909
1910 /* 1910 /*
1911 * if caller didn't supply a lock, they get per-queue locking with 1911 * if caller didn't supply a lock, they get per-queue locking with
1912 * our embedded lock 1912 * our embedded lock
1913 */ 1913 */
1914 if (!lock) { 1914 if (!lock) {
1915 spin_lock_init(&q->__queue_lock); 1915 spin_lock_init(&q->__queue_lock);
1916 lock = &q->__queue_lock; 1916 lock = &q->__queue_lock;
1917 } 1917 }
1918 1918
1919 q->request_fn = rfn; 1919 q->request_fn = rfn;
1920 q->prep_rq_fn = NULL; 1920 q->prep_rq_fn = NULL;
1921 q->unplug_fn = generic_unplug_device; 1921 q->unplug_fn = generic_unplug_device;
1922 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER); 1922 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER);
1923 q->queue_lock = lock; 1923 q->queue_lock = lock;
1924 1924
1925 blk_queue_segment_boundary(q, 0xffffffff); 1925 blk_queue_segment_boundary(q, 0xffffffff);
1926 1926
1927 blk_queue_make_request(q, __make_request); 1927 blk_queue_make_request(q, __make_request);
1928 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); 1928 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
1929 1929
1930 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 1930 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
1931 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 1931 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
1932 1932
1933 q->sg_reserved_size = INT_MAX; 1933 q->sg_reserved_size = INT_MAX;
1934 1934
1935 /* 1935 /*
1936 * all done 1936 * all done
1937 */ 1937 */
1938 if (!elevator_init(q, NULL)) { 1938 if (!elevator_init(q, NULL)) {
1939 blk_queue_congestion_threshold(q); 1939 blk_queue_congestion_threshold(q);
1940 return q; 1940 return q;
1941 } 1941 }
1942 1942
1943 blk_put_queue(q); 1943 blk_put_queue(q);
1944 return NULL; 1944 return NULL;
1945 } 1945 }
1946 EXPORT_SYMBOL(blk_init_queue_node); 1946 EXPORT_SYMBOL(blk_init_queue_node);
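/*
 * Illustrative sketch (not part of this commit): how a simple driver might
 * pair blk_init_queue() with blk_cleanup_queue(), as the kernel-doc above
 * requires.  The names mydev_lock, mydev_request_fn and mydev_queue are
 * hypothetical.
 */
static DEFINE_SPINLOCK(mydev_lock);
static struct request_queue *mydev_queue;

static void mydev_request_fn(struct request_queue *q)
{
	/* drain as many queued requests as the hardware can take right now */
}

static int __init mydev_init(void)
{
	mydev_queue = blk_init_queue(mydev_request_fn, &mydev_lock);
	if (!mydev_queue)
		return -ENOMEM;
	return 0;
}

static void __exit mydev_exit(void)
{
	blk_cleanup_queue(mydev_queue);	/* must pair with blk_init_queue() */
}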
1947 1947
1948 int blk_get_queue(struct request_queue *q) 1948 int blk_get_queue(struct request_queue *q)
1949 { 1949 {
1950 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 1950 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
1951 kobject_get(&q->kobj); 1951 kobject_get(&q->kobj);
1952 return 0; 1952 return 0;
1953 } 1953 }
1954 1954
1955 return 1; 1955 return 1;
1956 } 1956 }
1957 1957
1958 EXPORT_SYMBOL(blk_get_queue); 1958 EXPORT_SYMBOL(blk_get_queue);
1959 1959
1960 static inline void blk_free_request(struct request_queue *q, struct request *rq) 1960 static inline void blk_free_request(struct request_queue *q, struct request *rq)
1961 { 1961 {
1962 if (rq->cmd_flags & REQ_ELVPRIV) 1962 if (rq->cmd_flags & REQ_ELVPRIV)
1963 elv_put_request(q, rq); 1963 elv_put_request(q, rq);
1964 mempool_free(rq, q->rq.rq_pool); 1964 mempool_free(rq, q->rq.rq_pool);
1965 } 1965 }
1966 1966
1967 static struct request * 1967 static struct request *
1968 blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) 1968 blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
1969 { 1969 {
1970 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 1970 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
1971 1971
1972 if (!rq) 1972 if (!rq)
1973 return NULL; 1973 return NULL;
1974 1974
1975 /* 1975 /*
1976 * first three bits are identical in rq->cmd_flags and bio->bi_rw, 1976 * first three bits are identical in rq->cmd_flags and bio->bi_rw,
1977 * see bio.h and blkdev.h 1977 * see bio.h and blkdev.h
1978 */ 1978 */
1979 rq->cmd_flags = rw | REQ_ALLOCED; 1979 rq->cmd_flags = rw | REQ_ALLOCED;
1980 1980
1981 if (priv) { 1981 if (priv) {
1982 if (unlikely(elv_set_request(q, rq, gfp_mask))) { 1982 if (unlikely(elv_set_request(q, rq, gfp_mask))) {
1983 mempool_free(rq, q->rq.rq_pool); 1983 mempool_free(rq, q->rq.rq_pool);
1984 return NULL; 1984 return NULL;
1985 } 1985 }
1986 rq->cmd_flags |= REQ_ELVPRIV; 1986 rq->cmd_flags |= REQ_ELVPRIV;
1987 } 1987 }
1988 1988
1989 return rq; 1989 return rq;
1990 } 1990 }
1991 1991
1992 /* 1992 /*
 1993 * ioc_batching returns true if the ioc is a valid batching context and 1993 * ioc_batching returns true if the ioc is a valid batching context and
1994 * should be given priority access to a request. 1994 * should be given priority access to a request.
1995 */ 1995 */
1996 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) 1996 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
1997 { 1997 {
1998 if (!ioc) 1998 if (!ioc)
1999 return 0; 1999 return 0;
2000 2000
2001 /* 2001 /*
2002 * Make sure the process is able to allocate at least 1 request 2002 * Make sure the process is able to allocate at least 1 request
2003 * even if the batch times out, otherwise we could theoretically 2003 * even if the batch times out, otherwise we could theoretically
2004 * lose wakeups. 2004 * lose wakeups.
2005 */ 2005 */
2006 return ioc->nr_batch_requests == q->nr_batching || 2006 return ioc->nr_batch_requests == q->nr_batching ||
2007 (ioc->nr_batch_requests > 0 2007 (ioc->nr_batch_requests > 0
2008 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); 2008 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
2009 } 2009 }
2010 2010
2011 /* 2011 /*
2012 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This 2012 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
2013 * will cause the process to be a "batcher" on all queues in the system. This 2013 * will cause the process to be a "batcher" on all queues in the system. This
2014 * is the behaviour we want though - once it gets a wakeup it should be given 2014 * is the behaviour we want though - once it gets a wakeup it should be given
2015 * a nice run. 2015 * a nice run.
2016 */ 2016 */
2017 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) 2017 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
2018 { 2018 {
2019 if (!ioc || ioc_batching(q, ioc)) 2019 if (!ioc || ioc_batching(q, ioc))
2020 return; 2020 return;
2021 2021
2022 ioc->nr_batch_requests = q->nr_batching; 2022 ioc->nr_batch_requests = q->nr_batching;
2023 ioc->last_waited = jiffies; 2023 ioc->last_waited = jiffies;
2024 } 2024 }
2025 2025
2026 static void __freed_request(struct request_queue *q, int rw) 2026 static void __freed_request(struct request_queue *q, int rw)
2027 { 2027 {
2028 struct request_list *rl = &q->rq; 2028 struct request_list *rl = &q->rq;
2029 2029
2030 if (rl->count[rw] < queue_congestion_off_threshold(q)) 2030 if (rl->count[rw] < queue_congestion_off_threshold(q))
2031 blk_clear_queue_congested(q, rw); 2031 blk_clear_queue_congested(q, rw);
2032 2032
2033 if (rl->count[rw] + 1 <= q->nr_requests) { 2033 if (rl->count[rw] + 1 <= q->nr_requests) {
2034 if (waitqueue_active(&rl->wait[rw])) 2034 if (waitqueue_active(&rl->wait[rw]))
2035 wake_up(&rl->wait[rw]); 2035 wake_up(&rl->wait[rw]);
2036 2036
2037 blk_clear_queue_full(q, rw); 2037 blk_clear_queue_full(q, rw);
2038 } 2038 }
2039 } 2039 }
2040 2040
2041 /* 2041 /*
2042 * A request has just been released. Account for it, update the full and 2042 * A request has just been released. Account for it, update the full and
2043 * congestion status, wake up any waiters. Called under q->queue_lock. 2043 * congestion status, wake up any waiters. Called under q->queue_lock.
2044 */ 2044 */
2045 static void freed_request(struct request_queue *q, int rw, int priv) 2045 static void freed_request(struct request_queue *q, int rw, int priv)
2046 { 2046 {
2047 struct request_list *rl = &q->rq; 2047 struct request_list *rl = &q->rq;
2048 2048
2049 rl->count[rw]--; 2049 rl->count[rw]--;
2050 if (priv) 2050 if (priv)
2051 rl->elvpriv--; 2051 rl->elvpriv--;
2052 2052
2053 __freed_request(q, rw); 2053 __freed_request(q, rw);
2054 2054
2055 if (unlikely(rl->starved[rw ^ 1])) 2055 if (unlikely(rl->starved[rw ^ 1]))
2056 __freed_request(q, rw ^ 1); 2056 __freed_request(q, rw ^ 1);
2057 } 2057 }
2058 2058
2059 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) 2059 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
2060 /* 2060 /*
2061 * Get a free request, queue_lock must be held. 2061 * Get a free request, queue_lock must be held.
2062 * Returns NULL on failure, with queue_lock held. 2062 * Returns NULL on failure, with queue_lock held.
2063 * Returns !NULL on success, with queue_lock *not held*. 2063 * Returns !NULL on success, with queue_lock *not held*.
2064 */ 2064 */
2065 static struct request *get_request(struct request_queue *q, int rw_flags, 2065 static struct request *get_request(struct request_queue *q, int rw_flags,
2066 struct bio *bio, gfp_t gfp_mask) 2066 struct bio *bio, gfp_t gfp_mask)
2067 { 2067 {
2068 struct request *rq = NULL; 2068 struct request *rq = NULL;
2069 struct request_list *rl = &q->rq; 2069 struct request_list *rl = &q->rq;
2070 struct io_context *ioc = NULL; 2070 struct io_context *ioc = NULL;
2071 const int rw = rw_flags & 0x01; 2071 const int rw = rw_flags & 0x01;
2072 int may_queue, priv; 2072 int may_queue, priv;
2073 2073
2074 may_queue = elv_may_queue(q, rw_flags); 2074 may_queue = elv_may_queue(q, rw_flags);
2075 if (may_queue == ELV_MQUEUE_NO) 2075 if (may_queue == ELV_MQUEUE_NO)
2076 goto rq_starved; 2076 goto rq_starved;
2077 2077
2078 if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { 2078 if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
2079 if (rl->count[rw]+1 >= q->nr_requests) { 2079 if (rl->count[rw]+1 >= q->nr_requests) {
2080 ioc = current_io_context(GFP_ATOMIC, q->node); 2080 ioc = current_io_context(GFP_ATOMIC, q->node);
2081 /* 2081 /*
2082 * The queue will fill after this allocation, so set 2082 * The queue will fill after this allocation, so set
2083 * it as full, and mark this process as "batching". 2083 * it as full, and mark this process as "batching".
2084 * This process will be allowed to complete a batch of 2084 * This process will be allowed to complete a batch of
2085 * requests, others will be blocked. 2085 * requests, others will be blocked.
2086 */ 2086 */
2087 if (!blk_queue_full(q, rw)) { 2087 if (!blk_queue_full(q, rw)) {
2088 ioc_set_batching(q, ioc); 2088 ioc_set_batching(q, ioc);
2089 blk_set_queue_full(q, rw); 2089 blk_set_queue_full(q, rw);
2090 } else { 2090 } else {
2091 if (may_queue != ELV_MQUEUE_MUST 2091 if (may_queue != ELV_MQUEUE_MUST
2092 && !ioc_batching(q, ioc)) { 2092 && !ioc_batching(q, ioc)) {
2093 /* 2093 /*
2094 * The queue is full and the allocating 2094 * The queue is full and the allocating
2095 * process is not a "batcher", and not 2095 * process is not a "batcher", and not
2096 * exempted by the IO scheduler 2096 * exempted by the IO scheduler
2097 */ 2097 */
2098 goto out; 2098 goto out;
2099 } 2099 }
2100 } 2100 }
2101 } 2101 }
2102 blk_set_queue_congested(q, rw); 2102 blk_set_queue_congested(q, rw);
2103 } 2103 }
2104 2104
2105 /* 2105 /*
2106 * Only allow batching queuers to allocate up to 50% over the defined 2106 * Only allow batching queuers to allocate up to 50% over the defined
2107 * limit of requests, otherwise we could have thousands of requests 2107 * limit of requests, otherwise we could have thousands of requests
2108 * allocated with any setting of ->nr_requests 2108 * allocated with any setting of ->nr_requests
2109 */ 2109 */
2110 if (rl->count[rw] >= (3 * q->nr_requests / 2)) 2110 if (rl->count[rw] >= (3 * q->nr_requests / 2))
2111 goto out; 2111 goto out;
2112 2112
2113 rl->count[rw]++; 2113 rl->count[rw]++;
2114 rl->starved[rw] = 0; 2114 rl->starved[rw] = 0;
2115 2115
2116 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 2116 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
2117 if (priv) 2117 if (priv)
2118 rl->elvpriv++; 2118 rl->elvpriv++;
2119 2119
2120 spin_unlock_irq(q->queue_lock); 2120 spin_unlock_irq(q->queue_lock);
2121 2121
2122 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); 2122 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
2123 if (unlikely(!rq)) { 2123 if (unlikely(!rq)) {
2124 /* 2124 /*
2125 * Allocation failed presumably due to memory. Undo anything 2125 * Allocation failed presumably due to memory. Undo anything
2126 * we might have messed up. 2126 * we might have messed up.
2127 * 2127 *
2128 * Allocating task should really be put onto the front of the 2128 * Allocating task should really be put onto the front of the
2129 * wait queue, but this is pretty rare. 2129 * wait queue, but this is pretty rare.
2130 */ 2130 */
2131 spin_lock_irq(q->queue_lock); 2131 spin_lock_irq(q->queue_lock);
2132 freed_request(q, rw, priv); 2132 freed_request(q, rw, priv);
2133 2133
2134 /* 2134 /*
2135 * in the very unlikely event that allocation failed and no 2135 * in the very unlikely event that allocation failed and no
 2136 * requests for this direction were pending, mark us starved 2136 * requests for this direction were pending, mark us starved
2137 * so that freeing of a request in the other direction will 2137 * so that freeing of a request in the other direction will
2138 * notice us. another possible fix would be to split the 2138 * notice us. another possible fix would be to split the
2139 * rq mempool into READ and WRITE 2139 * rq mempool into READ and WRITE
2140 */ 2140 */
2141 rq_starved: 2141 rq_starved:
2142 if (unlikely(rl->count[rw] == 0)) 2142 if (unlikely(rl->count[rw] == 0))
2143 rl->starved[rw] = 1; 2143 rl->starved[rw] = 1;
2144 2144
2145 goto out; 2145 goto out;
2146 } 2146 }
2147 2147
2148 /* 2148 /*
2149 * ioc may be NULL here, and ioc_batching will be false. That's 2149 * ioc may be NULL here, and ioc_batching will be false. That's
2150 * OK, if the queue is under the request limit then requests need 2150 * OK, if the queue is under the request limit then requests need
2151 * not count toward the nr_batch_requests limit. There will always 2151 * not count toward the nr_batch_requests limit. There will always
2152 * be some limit enforced by BLK_BATCH_TIME. 2152 * be some limit enforced by BLK_BATCH_TIME.
2153 */ 2153 */
2154 if (ioc_batching(q, ioc)) 2154 if (ioc_batching(q, ioc))
2155 ioc->nr_batch_requests--; 2155 ioc->nr_batch_requests--;
2156 2156
2157 rq_init(q, rq); 2157 rq_init(q, rq);
2158 2158
2159 blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); 2159 blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
2160 out: 2160 out:
2161 return rq; 2161 return rq;
2162 } 2162 }
2163 2163
2164 /* 2164 /*
2165 * No available requests for this queue, unplug the device and wait for some 2165 * No available requests for this queue, unplug the device and wait for some
2166 * requests to become available. 2166 * requests to become available.
2167 * 2167 *
2168 * Called with q->queue_lock held, and returns with it unlocked. 2168 * Called with q->queue_lock held, and returns with it unlocked.
2169 */ 2169 */
2170 static struct request *get_request_wait(struct request_queue *q, int rw_flags, 2170 static struct request *get_request_wait(struct request_queue *q, int rw_flags,
2171 struct bio *bio) 2171 struct bio *bio)
2172 { 2172 {
2173 const int rw = rw_flags & 0x01; 2173 const int rw = rw_flags & 0x01;
2174 struct request *rq; 2174 struct request *rq;
2175 2175
2176 rq = get_request(q, rw_flags, bio, GFP_NOIO); 2176 rq = get_request(q, rw_flags, bio, GFP_NOIO);
2177 while (!rq) { 2177 while (!rq) {
2178 DEFINE_WAIT(wait); 2178 DEFINE_WAIT(wait);
2179 struct request_list *rl = &q->rq; 2179 struct request_list *rl = &q->rq;
2180 2180
2181 prepare_to_wait_exclusive(&rl->wait[rw], &wait, 2181 prepare_to_wait_exclusive(&rl->wait[rw], &wait,
2182 TASK_UNINTERRUPTIBLE); 2182 TASK_UNINTERRUPTIBLE);
2183 2183
2184 rq = get_request(q, rw_flags, bio, GFP_NOIO); 2184 rq = get_request(q, rw_flags, bio, GFP_NOIO);
2185 2185
2186 if (!rq) { 2186 if (!rq) {
2187 struct io_context *ioc; 2187 struct io_context *ioc;
2188 2188
2189 blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); 2189 blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
2190 2190
2191 __generic_unplug_device(q); 2191 __generic_unplug_device(q);
2192 spin_unlock_irq(q->queue_lock); 2192 spin_unlock_irq(q->queue_lock);
2193 io_schedule(); 2193 io_schedule();
2194 2194
2195 /* 2195 /*
2196 * After sleeping, we become a "batching" process and 2196 * After sleeping, we become a "batching" process and
2197 * will be able to allocate at least one request, and 2197 * will be able to allocate at least one request, and
 2198 * up to a big batch of them for a small period of time. 2198 * up to a big batch of them for a small period of time.
2199 * See ioc_batching, ioc_set_batching 2199 * See ioc_batching, ioc_set_batching
2200 */ 2200 */
2201 ioc = current_io_context(GFP_NOIO, q->node); 2201 ioc = current_io_context(GFP_NOIO, q->node);
2202 ioc_set_batching(q, ioc); 2202 ioc_set_batching(q, ioc);
2203 2203
2204 spin_lock_irq(q->queue_lock); 2204 spin_lock_irq(q->queue_lock);
2205 } 2205 }
2206 finish_wait(&rl->wait[rw], &wait); 2206 finish_wait(&rl->wait[rw], &wait);
2207 } 2207 }
2208 2208
2209 return rq; 2209 return rq;
2210 } 2210 }
2211 2211
2212 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 2212 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
2213 { 2213 {
2214 struct request *rq; 2214 struct request *rq;
2215 2215
2216 BUG_ON(rw != READ && rw != WRITE); 2216 BUG_ON(rw != READ && rw != WRITE);
2217 2217
2218 spin_lock_irq(q->queue_lock); 2218 spin_lock_irq(q->queue_lock);
2219 if (gfp_mask & __GFP_WAIT) { 2219 if (gfp_mask & __GFP_WAIT) {
2220 rq = get_request_wait(q, rw, NULL); 2220 rq = get_request_wait(q, rw, NULL);
2221 } else { 2221 } else {
2222 rq = get_request(q, rw, NULL, gfp_mask); 2222 rq = get_request(q, rw, NULL, gfp_mask);
2223 if (!rq) 2223 if (!rq)
2224 spin_unlock_irq(q->queue_lock); 2224 spin_unlock_irq(q->queue_lock);
2225 } 2225 }
2226 /* q->queue_lock is unlocked at this point */ 2226 /* q->queue_lock is unlocked at this point */
2227 2227
2228 return rq; 2228 return rq;
2229 } 2229 }
2230 EXPORT_SYMBOL(blk_get_request); 2230 EXPORT_SYMBOL(blk_get_request);
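/*
 * Illustrative sketch (not part of this commit): allocating a request with
 * blk_get_request() and releasing it with blk_put_request().  example_get_put
 * is a hypothetical name; "q" is assumed to be a live request queue obtained
 * elsewhere.
 */
static int example_get_put(struct request_queue *q)
{
	struct request *rq;

	rq = blk_get_request(q, READ, GFP_KERNEL);	/* may sleep (__GFP_WAIT) */
	if (!rq)
		return -ENOMEM;

	/* ... fill in the request here ... */

	blk_put_request(rq);
	return 0;
}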
2231 2231
2232 /** 2232 /**
2233 * blk_start_queueing - initiate dispatch of requests to device 2233 * blk_start_queueing - initiate dispatch of requests to device
2234 * @q: request queue to kick into gear 2234 * @q: request queue to kick into gear
2235 * 2235 *
2236 * This is basically a helper to remove the need to know whether a queue 2236 * This is basically a helper to remove the need to know whether a queue
2237 * is plugged or not if someone just wants to initiate dispatch of requests 2237 * is plugged or not if someone just wants to initiate dispatch of requests
2238 * for this queue. 2238 * for this queue.
2239 * 2239 *
2240 * The queue lock must be held with interrupts disabled. 2240 * The queue lock must be held with interrupts disabled.
2241 */ 2241 */
2242 void blk_start_queueing(struct request_queue *q) 2242 void blk_start_queueing(struct request_queue *q)
2243 { 2243 {
2244 if (!blk_queue_plugged(q)) 2244 if (!blk_queue_plugged(q))
2245 q->request_fn(q); 2245 q->request_fn(q);
2246 else 2246 else
2247 __generic_unplug_device(q); 2247 __generic_unplug_device(q);
2248 } 2248 }
2249 EXPORT_SYMBOL(blk_start_queueing); 2249 EXPORT_SYMBOL(blk_start_queueing);
2250 2250
2251 /** 2251 /**
2252 * blk_requeue_request - put a request back on queue 2252 * blk_requeue_request - put a request back on queue
2253 * @q: request queue where request should be inserted 2253 * @q: request queue where request should be inserted
2254 * @rq: request to be inserted 2254 * @rq: request to be inserted
2255 * 2255 *
2256 * Description: 2256 * Description:
2257 * Drivers often keep queueing requests until the hardware cannot accept 2257 * Drivers often keep queueing requests until the hardware cannot accept
2258 * more, when that condition happens we need to put the request back 2258 * more, when that condition happens we need to put the request back
2259 * on the queue. Must be called with queue lock held. 2259 * on the queue. Must be called with queue lock held.
2260 */ 2260 */
2261 void blk_requeue_request(struct request_queue *q, struct request *rq) 2261 void blk_requeue_request(struct request_queue *q, struct request *rq)
2262 { 2262 {
2263 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 2263 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
2264 2264
2265 if (blk_rq_tagged(rq)) 2265 if (blk_rq_tagged(rq))
2266 blk_queue_end_tag(q, rq); 2266 blk_queue_end_tag(q, rq);
2267 2267
2268 elv_requeue_request(q, rq); 2268 elv_requeue_request(q, rq);
2269 } 2269 }
2270 2270
2271 EXPORT_SYMBOL(blk_requeue_request); 2271 EXPORT_SYMBOL(blk_requeue_request);
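/*
 * Illustrative sketch (not part of this commit): a request_fn pushing a
 * request back with blk_requeue_request() when the hardware cannot accept
 * more work.  elv_next_request()/blkdev_dequeue_request() follow the driver
 * conventions of this era; mydev_busy and mydev_requeue_rfn are hypothetical.
 */
static int mydev_busy;	/* hypothetical "hardware busy" flag */

static void mydev_requeue_rfn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		blkdev_dequeue_request(rq);
		if (mydev_busy) {
			/* q->queue_lock is already held inside a request_fn */
			blk_requeue_request(q, rq);
			break;
		}
		/* ... hand rq to the hardware here ... */
	}
}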
2272 2272
2273 /** 2273 /**
 2274 * blk_insert_request - insert a special request into a request queue 2274 * blk_insert_request - insert a special request into a request queue
2275 * @q: request queue where request should be inserted 2275 * @q: request queue where request should be inserted
2276 * @rq: request to be inserted 2276 * @rq: request to be inserted
2277 * @at_head: insert request at head or tail of queue 2277 * @at_head: insert request at head or tail of queue
2278 * @data: private data 2278 * @data: private data
2279 * 2279 *
2280 * Description: 2280 * Description:
2281 * Many block devices need to execute commands asynchronously, so they don't 2281 * Many block devices need to execute commands asynchronously, so they don't
2282 * block the whole kernel from preemption during request execution. This is 2282 * block the whole kernel from preemption during request execution. This is
 2283 * accomplished normally by inserting artificial requests tagged as 2283 * accomplished normally by inserting artificial requests tagged as
 2284 * REQ_SPECIAL into the corresponding request queue, and letting them be 2284 * REQ_SPECIAL into the corresponding request queue, and letting them be
2285 * scheduled for actual execution by the request queue. 2285 * scheduled for actual execution by the request queue.
2286 * 2286 *
 2287 * We have the option of inserting at the head or the tail of the queue. 2287 * We have the option of inserting at the head or the tail of the queue.
2288 * Typically we use the tail for new ioctls and so forth. We use the head 2288 * Typically we use the tail for new ioctls and so forth. We use the head
2289 * of the queue for things like a QUEUE_FULL message from a device, or a 2289 * of the queue for things like a QUEUE_FULL message from a device, or a
2290 * host that is unable to accept a particular command. 2290 * host that is unable to accept a particular command.
2291 */ 2291 */
2292 void blk_insert_request(struct request_queue *q, struct request *rq, 2292 void blk_insert_request(struct request_queue *q, struct request *rq,
2293 int at_head, void *data) 2293 int at_head, void *data)
2294 { 2294 {
2295 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2295 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2296 unsigned long flags; 2296 unsigned long flags;
2297 2297
2298 /* 2298 /*
2299 * tell I/O scheduler that this isn't a regular read/write (ie it 2299 * tell I/O scheduler that this isn't a regular read/write (ie it
2300 * must not attempt merges on this) and that it acts as a soft 2300 * must not attempt merges on this) and that it acts as a soft
2301 * barrier 2301 * barrier
2302 */ 2302 */
2303 rq->cmd_type = REQ_TYPE_SPECIAL; 2303 rq->cmd_type = REQ_TYPE_SPECIAL;
2304 rq->cmd_flags |= REQ_SOFTBARRIER; 2304 rq->cmd_flags |= REQ_SOFTBARRIER;
2305 2305
2306 rq->special = data; 2306 rq->special = data;
2307 2307
2308 spin_lock_irqsave(q->queue_lock, flags); 2308 spin_lock_irqsave(q->queue_lock, flags);
2309 2309
2310 /* 2310 /*
2311 * If command is tagged, release the tag 2311 * If command is tagged, release the tag
2312 */ 2312 */
2313 if (blk_rq_tagged(rq)) 2313 if (blk_rq_tagged(rq))
2314 blk_queue_end_tag(q, rq); 2314 blk_queue_end_tag(q, rq);
2315 2315
2316 drive_stat_acct(rq, rq->nr_sectors, 1); 2316 drive_stat_acct(rq, rq->nr_sectors, 1);
2317 __elv_add_request(q, rq, where, 0); 2317 __elv_add_request(q, rq, where, 0);
2318 blk_start_queueing(q); 2318 blk_start_queueing(q);
2319 spin_unlock_irqrestore(q->queue_lock, flags); 2319 spin_unlock_irqrestore(q->queue_lock, flags);
2320 } 2320 }
2321 2321
2322 EXPORT_SYMBOL(blk_insert_request); 2322 EXPORT_SYMBOL(blk_insert_request);
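/*
 * Illustrative sketch (not part of this commit): queueing a driver-private
 * command at the head of the queue with blk_insert_request().  example_insert
 * is a hypothetical name and mydev_cmd is hypothetical driver data that is
 * handed back via rq->special.
 */
static int example_insert(struct request_queue *q, void *mydev_cmd)
{
	struct request *rq = blk_get_request(q, READ, GFP_KERNEL);

	if (!rq)
		return -ENOMEM;

	/* at_head = 1: jump the queue; mydev_cmd comes back via rq->special */
	blk_insert_request(q, rq, 1, mydev_cmd);
	return 0;
}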
2323 2323
2324 static int __blk_rq_unmap_user(struct bio *bio) 2324 static int __blk_rq_unmap_user(struct bio *bio)
2325 { 2325 {
2326 int ret = 0; 2326 int ret = 0;
2327 2327
2328 if (bio) { 2328 if (bio) {
2329 if (bio_flagged(bio, BIO_USER_MAPPED)) 2329 if (bio_flagged(bio, BIO_USER_MAPPED))
2330 bio_unmap_user(bio); 2330 bio_unmap_user(bio);
2331 else 2331 else
2332 ret = bio_uncopy_user(bio); 2332 ret = bio_uncopy_user(bio);
2333 } 2333 }
2334 2334
2335 return ret; 2335 return ret;
2336 } 2336 }
2337 2337
2338 static int __blk_rq_map_user(struct request_queue *q, struct request *rq, 2338 static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
2339 void __user *ubuf, unsigned int len) 2339 void __user *ubuf, unsigned int len)
2340 { 2340 {
2341 unsigned long uaddr; 2341 unsigned long uaddr;
2342 struct bio *bio, *orig_bio; 2342 struct bio *bio, *orig_bio;
2343 int reading, ret; 2343 int reading, ret;
2344 2344
2345 reading = rq_data_dir(rq) == READ; 2345 reading = rq_data_dir(rq) == READ;
2346 2346
2347 /* 2347 /*
2348 * if alignment requirement is satisfied, map in user pages for 2348 * if alignment requirement is satisfied, map in user pages for
2349 * direct dma. else, set up kernel bounce buffers 2349 * direct dma. else, set up kernel bounce buffers
2350 */ 2350 */
2351 uaddr = (unsigned long) ubuf; 2351 uaddr = (unsigned long) ubuf;
2352 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q))) 2352 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q)))
2353 bio = bio_map_user(q, NULL, uaddr, len, reading); 2353 bio = bio_map_user(q, NULL, uaddr, len, reading);
2354 else 2354 else
2355 bio = bio_copy_user(q, uaddr, len, reading); 2355 bio = bio_copy_user(q, uaddr, len, reading);
2356 2356
2357 if (IS_ERR(bio)) 2357 if (IS_ERR(bio))
2358 return PTR_ERR(bio); 2358 return PTR_ERR(bio);
2359 2359
2360 orig_bio = bio; 2360 orig_bio = bio;
2361 blk_queue_bounce(q, &bio); 2361 blk_queue_bounce(q, &bio);
2362 2362
2363 /* 2363 /*
2364 * We link the bounce buffer in and could have to traverse it 2364 * We link the bounce buffer in and could have to traverse it
2365 * later so we have to get a ref to prevent it from being freed 2365 * later so we have to get a ref to prevent it from being freed
2366 */ 2366 */
2367 bio_get(bio); 2367 bio_get(bio);
2368 2368
2369 if (!rq->bio) 2369 if (!rq->bio)
2370 blk_rq_bio_prep(q, rq, bio); 2370 blk_rq_bio_prep(q, rq, bio);
2371 else if (!ll_back_merge_fn(q, rq, bio)) { 2371 else if (!ll_back_merge_fn(q, rq, bio)) {
2372 ret = -EINVAL; 2372 ret = -EINVAL;
2373 goto unmap_bio; 2373 goto unmap_bio;
2374 } else { 2374 } else {
2375 rq->biotail->bi_next = bio; 2375 rq->biotail->bi_next = bio;
2376 rq->biotail = bio; 2376 rq->biotail = bio;
2377 2377
2378 rq->data_len += bio->bi_size; 2378 rq->data_len += bio->bi_size;
2379 } 2379 }
2380 2380
2381 return bio->bi_size; 2381 return bio->bi_size;
2382 2382
2383 unmap_bio: 2383 unmap_bio:
 2384 /* if it was bounced we must call the end io function */ 2384 /* if it was bounced we must call the end io function */
2385 bio_endio(bio, bio->bi_size, 0); 2385 bio_endio(bio, bio->bi_size, 0);
2386 __blk_rq_unmap_user(orig_bio); 2386 __blk_rq_unmap_user(orig_bio);
2387 bio_put(bio); 2387 bio_put(bio);
2388 return ret; 2388 return ret;
2389 } 2389 }
2390 2390
2391 /** 2391 /**
2392 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage 2392 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
2393 * @q: request queue where request should be inserted 2393 * @q: request queue where request should be inserted
2394 * @rq: request structure to fill 2394 * @rq: request structure to fill
2395 * @ubuf: the user buffer 2395 * @ubuf: the user buffer
2396 * @len: length of user data 2396 * @len: length of user data
2397 * 2397 *
2398 * Description: 2398 * Description:
2399 * Data will be mapped directly for zero copy io, if possible. Otherwise 2399 * Data will be mapped directly for zero copy io, if possible. Otherwise
2400 * a kernel bounce buffer is used. 2400 * a kernel bounce buffer is used.
2401 * 2401 *
2402 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2402 * A matching blk_rq_unmap_user() must be issued at the end of io, while
2403 * still in process context. 2403 * still in process context.
2404 * 2404 *
2405 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2405 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
2406 * before being submitted to the device, as pages mapped may be out of 2406 * before being submitted to the device, as pages mapped may be out of
 2407 * reach. It's the caller's responsibility to make sure this happens. The 2407 * reach. It's the caller's responsibility to make sure this happens. The
2408 * original bio must be passed back in to blk_rq_unmap_user() for proper 2408 * original bio must be passed back in to blk_rq_unmap_user() for proper
2409 * unmapping. 2409 * unmapping.
2410 */ 2410 */
2411 int blk_rq_map_user(struct request_queue *q, struct request *rq, 2411 int blk_rq_map_user(struct request_queue *q, struct request *rq,
2412 void __user *ubuf, unsigned long len) 2412 void __user *ubuf, unsigned long len)
2413 { 2413 {
2414 unsigned long bytes_read = 0; 2414 unsigned long bytes_read = 0;
2415 struct bio *bio = NULL; 2415 struct bio *bio = NULL;
2416 int ret; 2416 int ret;
2417 2417
2418 if (len > (q->max_hw_sectors << 9)) 2418 if (len > (q->max_hw_sectors << 9))
2419 return -EINVAL; 2419 return -EINVAL;
2420 if (!len || !ubuf) 2420 if (!len || !ubuf)
2421 return -EINVAL; 2421 return -EINVAL;
2422 2422
2423 while (bytes_read != len) { 2423 while (bytes_read != len) {
2424 unsigned long map_len, end, start; 2424 unsigned long map_len, end, start;
2425 2425
2426 map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); 2426 map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE);
2427 end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) 2427 end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1)
2428 >> PAGE_SHIFT; 2428 >> PAGE_SHIFT;
2429 start = (unsigned long)ubuf >> PAGE_SHIFT; 2429 start = (unsigned long)ubuf >> PAGE_SHIFT;
2430 2430
2431 /* 2431 /*
2432 * A bad offset could cause us to require BIO_MAX_PAGES + 1 2432 * A bad offset could cause us to require BIO_MAX_PAGES + 1
2433 * pages. If this happens we just lower the requested 2433 * pages. If this happens we just lower the requested
2434 * mapping len by a page so that we can fit 2434 * mapping len by a page so that we can fit
2435 */ 2435 */
2436 if (end - start > BIO_MAX_PAGES) 2436 if (end - start > BIO_MAX_PAGES)
2437 map_len -= PAGE_SIZE; 2437 map_len -= PAGE_SIZE;
2438 2438
2439 ret = __blk_rq_map_user(q, rq, ubuf, map_len); 2439 ret = __blk_rq_map_user(q, rq, ubuf, map_len);
2440 if (ret < 0) 2440 if (ret < 0)
2441 goto unmap_rq; 2441 goto unmap_rq;
2442 if (!bio) 2442 if (!bio)
2443 bio = rq->bio; 2443 bio = rq->bio;
2444 bytes_read += ret; 2444 bytes_read += ret;
2445 ubuf += ret; 2445 ubuf += ret;
2446 } 2446 }
2447 2447
2448 rq->buffer = rq->data = NULL; 2448 rq->buffer = rq->data = NULL;
2449 return 0; 2449 return 0;
2450 unmap_rq: 2450 unmap_rq:
2451 blk_rq_unmap_user(bio); 2451 blk_rq_unmap_user(bio);
2452 return ret; 2452 return ret;
2453 } 2453 }
2454 2454
2455 EXPORT_SYMBOL(blk_rq_map_user); 2455 EXPORT_SYMBOL(blk_rq_map_user);
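/*
 * Illustrative sketch (not part of this commit): mapping a user buffer into a
 * REQ_BLOCK_PC request with blk_rq_map_user(), executing it, and undoing the
 * mapping with blk_rq_unmap_user() using the original rq->bio as the
 * kernel-doc above requires.  example_sg_io is a hypothetical name; "disk",
 * "ubuf" and "len" are assumed to come from the caller, and the command
 * bytes a real passthrough would set are omitted.
 */
static int example_sg_io(struct request_queue *q, struct gendisk *disk,
			 void __user *ubuf, unsigned long len)
{
	struct request *rq;
	struct bio *bio;
	int err;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	rq->cmd_type = REQ_TYPE_BLOCK_PC;

	err = blk_rq_map_user(q, rq, ubuf, len);
	if (err) {
		blk_put_request(rq);
		return err;
	}

	bio = rq->bio;		/* keep the original bio for unmapping */
	err = blk_execute_rq(q, disk, rq, 0);

	if (blk_rq_unmap_user(bio) && !err)
		err = -EFAULT;

	blk_put_request(rq);
	return err;
}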
2456 2456
2457 /** 2457 /**
2458 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage 2458 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
2459 * @q: request queue where request should be inserted 2459 * @q: request queue where request should be inserted
2460 * @rq: request to map data to 2460 * @rq: request to map data to
2461 * @iov: pointer to the iovec 2461 * @iov: pointer to the iovec
2462 * @iov_count: number of elements in the iovec 2462 * @iov_count: number of elements in the iovec
2463 * @len: I/O byte count 2463 * @len: I/O byte count
2464 * 2464 *
2465 * Description: 2465 * Description:
2466 * Data will be mapped directly for zero copy io, if possible. Otherwise 2466 * Data will be mapped directly for zero copy io, if possible. Otherwise
2467 * a kernel bounce buffer is used. 2467 * a kernel bounce buffer is used.
2468 * 2468 *
2469 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2469 * A matching blk_rq_unmap_user() must be issued at the end of io, while
2470 * still in process context. 2470 * still in process context.
2471 * 2471 *
2472 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2472 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
2473 * before being submitted to the device, as pages mapped may be out of 2473 * before being submitted to the device, as pages mapped may be out of
 2474 * reach. It's the caller's responsibility to make sure this happens. The 2474 * reach. It's the caller's responsibility to make sure this happens. The
2475 * original bio must be passed back in to blk_rq_unmap_user() for proper 2475 * original bio must be passed back in to blk_rq_unmap_user() for proper
2476 * unmapping. 2476 * unmapping.
2477 */ 2477 */
2478 int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, 2478 int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
2479 struct sg_iovec *iov, int iov_count, unsigned int len) 2479 struct sg_iovec *iov, int iov_count, unsigned int len)
2480 { 2480 {
2481 struct bio *bio; 2481 struct bio *bio;
2482 2482
2483 if (!iov || iov_count <= 0) 2483 if (!iov || iov_count <= 0)
2484 return -EINVAL; 2484 return -EINVAL;
2485 2485
2486 /* we don't allow misaligned data like bio_map_user() does. If the 2486 /* we don't allow misaligned data like bio_map_user() does. If the
2487 * user is using sg, they're expected to know the alignment constraints 2487 * user is using sg, they're expected to know the alignment constraints
2488 * and respect them accordingly */ 2488 * and respect them accordingly */
2489 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ); 2489 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ);
2490 if (IS_ERR(bio)) 2490 if (IS_ERR(bio))
2491 return PTR_ERR(bio); 2491 return PTR_ERR(bio);
2492 2492
2493 if (bio->bi_size != len) { 2493 if (bio->bi_size != len) {
2494 bio_endio(bio, bio->bi_size, 0); 2494 bio_endio(bio, bio->bi_size, 0);
2495 bio_unmap_user(bio); 2495 bio_unmap_user(bio);
2496 return -EINVAL; 2496 return -EINVAL;
2497 } 2497 }
2498 2498
2499 bio_get(bio); 2499 bio_get(bio);
2500 blk_rq_bio_prep(q, rq, bio); 2500 blk_rq_bio_prep(q, rq, bio);
2501 rq->buffer = rq->data = NULL; 2501 rq->buffer = rq->data = NULL;
2502 return 0; 2502 return 0;
2503 } 2503 }
2504 2504
2505 EXPORT_SYMBOL(blk_rq_map_user_iov); 2505 EXPORT_SYMBOL(blk_rq_map_user_iov);
2506 2506
2507 /** 2507 /**
2508 * blk_rq_unmap_user - unmap a request with user data 2508 * blk_rq_unmap_user - unmap a request with user data
2509 * @bio: start of bio list 2509 * @bio: start of bio list
2510 * 2510 *
2511 * Description: 2511 * Description:
2512 * Unmap a rq previously mapped by blk_rq_map_user(). The caller must 2512 * Unmap a rq previously mapped by blk_rq_map_user(). The caller must
2513 * supply the original rq->bio from the blk_rq_map_user() return, since 2513 * supply the original rq->bio from the blk_rq_map_user() return, since
2514 * the io completion may have changed rq->bio. 2514 * the io completion may have changed rq->bio.
2515 */ 2515 */
2516 int blk_rq_unmap_user(struct bio *bio) 2516 int blk_rq_unmap_user(struct bio *bio)
2517 { 2517 {
2518 struct bio *mapped_bio; 2518 struct bio *mapped_bio;
2519 int ret = 0, ret2; 2519 int ret = 0, ret2;
2520 2520
2521 while (bio) { 2521 while (bio) {
2522 mapped_bio = bio; 2522 mapped_bio = bio;
2523 if (unlikely(bio_flagged(bio, BIO_BOUNCED))) 2523 if (unlikely(bio_flagged(bio, BIO_BOUNCED)))
2524 mapped_bio = bio->bi_private; 2524 mapped_bio = bio->bi_private;
2525 2525
2526 ret2 = __blk_rq_unmap_user(mapped_bio); 2526 ret2 = __blk_rq_unmap_user(mapped_bio);
2527 if (ret2 && !ret) 2527 if (ret2 && !ret)
2528 ret = ret2; 2528 ret = ret2;
2529 2529
2530 mapped_bio = bio; 2530 mapped_bio = bio;
2531 bio = bio->bi_next; 2531 bio = bio->bi_next;
2532 bio_put(mapped_bio); 2532 bio_put(mapped_bio);
2533 } 2533 }
2534 2534
2535 return ret; 2535 return ret;
2536 } 2536 }
2537 2537
2538 EXPORT_SYMBOL(blk_rq_unmap_user); 2538 EXPORT_SYMBOL(blk_rq_unmap_user);
2539 2539
2540 /** 2540 /**
2541 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage 2541 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage
2542 * @q: request queue where request should be inserted 2542 * @q: request queue where request should be inserted
2543 * @rq: request to fill 2543 * @rq: request to fill
2544 * @kbuf: the kernel buffer 2544 * @kbuf: the kernel buffer
 2545 * @len: length of kernel data 2545 * @len: length of kernel data
2546 * @gfp_mask: memory allocation flags 2546 * @gfp_mask: memory allocation flags
2547 */ 2547 */
2548 int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, 2548 int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
2549 unsigned int len, gfp_t gfp_mask) 2549 unsigned int len, gfp_t gfp_mask)
2550 { 2550 {
2551 struct bio *bio; 2551 struct bio *bio;
2552 2552
2553 if (len > (q->max_hw_sectors << 9)) 2553 if (len > (q->max_hw_sectors << 9))
2554 return -EINVAL; 2554 return -EINVAL;
2555 if (!len || !kbuf) 2555 if (!len || !kbuf)
2556 return -EINVAL; 2556 return -EINVAL;
2557 2557
2558 bio = bio_map_kern(q, kbuf, len, gfp_mask); 2558 bio = bio_map_kern(q, kbuf, len, gfp_mask);
2559 if (IS_ERR(bio)) 2559 if (IS_ERR(bio))
2560 return PTR_ERR(bio); 2560 return PTR_ERR(bio);
2561 2561
2562 if (rq_data_dir(rq) == WRITE) 2562 if (rq_data_dir(rq) == WRITE)
2563 bio->bi_rw |= (1 << BIO_RW); 2563 bio->bi_rw |= (1 << BIO_RW);
2564 2564
2565 blk_rq_bio_prep(q, rq, bio); 2565 blk_rq_bio_prep(q, rq, bio);
2566 blk_queue_bounce(q, &rq->bio); 2566 blk_queue_bounce(q, &rq->bio);
2567 rq->buffer = rq->data = NULL; 2567 rq->buffer = rq->data = NULL;
2568 return 0; 2568 return 0;
2569 } 2569 }
2570 2570
2571 EXPORT_SYMBOL(blk_rq_map_kern); 2571 EXPORT_SYMBOL(blk_rq_map_kern);
2572 2572
2573 /** 2573 /**
2574 * blk_execute_rq_nowait - insert a request into queue for execution 2574 * blk_execute_rq_nowait - insert a request into queue for execution
2575 * @q: queue to insert the request in 2575 * @q: queue to insert the request in
2576 * @bd_disk: matching gendisk 2576 * @bd_disk: matching gendisk
2577 * @rq: request to insert 2577 * @rq: request to insert
2578 * @at_head: insert request at head or tail of queue 2578 * @at_head: insert request at head or tail of queue
2579 * @done: I/O completion handler 2579 * @done: I/O completion handler
2580 * 2580 *
2581 * Description: 2581 * Description:
2582 * Insert a fully prepared request at the back of the io scheduler queue 2582 * Insert a fully prepared request at the back of the io scheduler queue
2583 * for execution. Don't wait for completion. 2583 * for execution. Don't wait for completion.
2584 */ 2584 */
2585 void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, 2585 void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
2586 struct request *rq, int at_head, 2586 struct request *rq, int at_head,
2587 rq_end_io_fn *done) 2587 rq_end_io_fn *done)
2588 { 2588 {
2589 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2589 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2590 2590
2591 rq->rq_disk = bd_disk; 2591 rq->rq_disk = bd_disk;
2592 rq->cmd_flags |= REQ_NOMERGE; 2592 rq->cmd_flags |= REQ_NOMERGE;
2593 rq->end_io = done; 2593 rq->end_io = done;
2594 WARN_ON(irqs_disabled()); 2594 WARN_ON(irqs_disabled());
2595 spin_lock_irq(q->queue_lock); 2595 spin_lock_irq(q->queue_lock);
2596 __elv_add_request(q, rq, where, 1); 2596 __elv_add_request(q, rq, where, 1);
2597 __generic_unplug_device(q); 2597 __generic_unplug_device(q);
2598 spin_unlock_irq(q->queue_lock); 2598 spin_unlock_irq(q->queue_lock);
2599 } 2599 }
2600 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 2600 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
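/*
 * Illustrative sketch (not part of this commit): firing an already prepared
 * request asynchronously with blk_execute_rq_nowait() and cleaning up in the
 * completion hook, mirroring what blk_end_sync_rq() below does.  mydev_end_io
 * and example_execute_nowait are hypothetical names; "rq" is assumed to be a
 * fully prepared request holding a reference the callback may drop.
 */
static void mydev_end_io(struct request *rq, int error)
{
	/* invoked on completion; rq->errors holds the low-level status */
	__blk_put_request(rq->q, rq);
}

static void example_execute_nowait(struct request_queue *q,
				   struct gendisk *disk, struct request *rq)
{
	blk_execute_rq_nowait(q, disk, rq, 0, mydev_end_io);
}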
2601 2601
2602 /** 2602 /**
2603 * blk_execute_rq - insert a request into queue for execution 2603 * blk_execute_rq - insert a request into queue for execution
2604 * @q: queue to insert the request in 2604 * @q: queue to insert the request in
2605 * @bd_disk: matching gendisk 2605 * @bd_disk: matching gendisk
2606 * @rq: request to insert 2606 * @rq: request to insert
2607 * @at_head: insert request at head or tail of queue 2607 * @at_head: insert request at head or tail of queue
2608 * 2608 *
2609 * Description: 2609 * Description:
2610 * Insert a fully prepared request at the back of the io scheduler queue 2610 * Insert a fully prepared request at the back of the io scheduler queue
2611 * for execution and wait for completion. 2611 * for execution and wait for completion.
2612 */ 2612 */
2613 int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, 2613 int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
2614 struct request *rq, int at_head) 2614 struct request *rq, int at_head)
2615 { 2615 {
2616 DECLARE_COMPLETION_ONSTACK(wait); 2616 DECLARE_COMPLETION_ONSTACK(wait);
2617 char sense[SCSI_SENSE_BUFFERSIZE]; 2617 char sense[SCSI_SENSE_BUFFERSIZE];
2618 int err = 0; 2618 int err = 0;
2619 2619
2620 /* 2620 /*
2621 * we need an extra reference to the request, so we can look at 2621 * we need an extra reference to the request, so we can look at
2622 * it after io completion 2622 * it after io completion
2623 */ 2623 */
2624 rq->ref_count++; 2624 rq->ref_count++;
2625 2625
2626 if (!rq->sense) { 2626 if (!rq->sense) {
2627 memset(sense, 0, sizeof(sense)); 2627 memset(sense, 0, sizeof(sense));
2628 rq->sense = sense; 2628 rq->sense = sense;
2629 rq->sense_len = 0; 2629 rq->sense_len = 0;
2630 } 2630 }
2631 2631
2632 rq->end_io_data = &wait; 2632 rq->end_io_data = &wait;
2633 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); 2633 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
2634 wait_for_completion(&wait); 2634 wait_for_completion(&wait);
2635 2635
2636 if (rq->errors) 2636 if (rq->errors)
2637 err = -EIO; 2637 err = -EIO;
2638 2638
2639 return err; 2639 return err;
2640 } 2640 }
2641 2641
2642 EXPORT_SYMBOL(blk_execute_rq); 2642 EXPORT_SYMBOL(blk_execute_rq);
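/*
 * Illustrative sketch (not part of this commit): building a fully prepared
 * passthrough request from a kernel buffer with blk_rq_map_kern() and running
 * it synchronously with blk_execute_rq().  example_execute is a hypothetical
 * name; "disk", "kbuf" and "len" are assumed to come from the caller, and
 * real users would also fill in rq->cmd[].
 */
static int example_execute(struct request_queue *q, struct gendisk *disk,
			   void *kbuf, unsigned int len)
{
	struct request *rq;
	int err;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	rq->cmd_type = REQ_TYPE_BLOCK_PC;

	err = blk_rq_map_kern(q, rq, kbuf, len, GFP_KERNEL);
	if (!err)
		err = blk_execute_rq(q, disk, rq, 0);	/* sleeps until done */

	blk_put_request(rq);
	return err;
}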
2643 2643
2644 /** 2644 /**
2645 * blkdev_issue_flush - queue a flush 2645 * blkdev_issue_flush - queue a flush
2646 * @bdev: blockdev to issue flush for 2646 * @bdev: blockdev to issue flush for
2647 * @error_sector: error sector 2647 * @error_sector: error sector
2648 * 2648 *
2649 * Description: 2649 * Description:
2650 * Issue a flush for the block device in question. Caller can supply 2650 * Issue a flush for the block device in question. Caller can supply
2651 * room for storing the error offset in case of a flush error, if they 2651 * room for storing the error offset in case of a flush error, if they
2652 * wish to. Caller must run wait_for_completion() on its own. 2652 * wish to. Caller must run wait_for_completion() on its own.
2653 */ 2653 */
2654 int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) 2654 int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
2655 { 2655 {
2656 struct request_queue *q; 2656 struct request_queue *q;
2657 2657
2658 if (bdev->bd_disk == NULL) 2658 if (bdev->bd_disk == NULL)
2659 return -ENXIO; 2659 return -ENXIO;
2660 2660
2661 q = bdev_get_queue(bdev); 2661 q = bdev_get_queue(bdev);
2662 if (!q) 2662 if (!q)
2663 return -ENXIO; 2663 return -ENXIO;
2664 if (!q->issue_flush_fn) 2664 if (!q->issue_flush_fn)
2665 return -EOPNOTSUPP; 2665 return -EOPNOTSUPP;
2666 2666
2667 return q->issue_flush_fn(q, bdev->bd_disk, error_sector); 2667 return q->issue_flush_fn(q, bdev->bd_disk, error_sector);
2668 } 2668 }
2669 2669
2670 EXPORT_SYMBOL(blkdev_issue_flush); 2670 EXPORT_SYMBOL(blkdev_issue_flush);
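/*
 * Illustrative sketch (not part of this commit): asking a block device to
 * flush its write cache with blkdev_issue_flush().  example_flush is a
 * hypothetical name; "bdev" is assumed to be an open block device, and the
 * optional error-sector slot is simply not used here.
 */
static int example_flush(struct block_device *bdev)
{
	int err = blkdev_issue_flush(bdev, NULL);

	if (err == -EOPNOTSUPP)	/* queue has no issue_flush_fn */
		err = 0;
	return err;
}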
2671 2671
2672 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) 2672 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
2673 { 2673 {
2674 int rw = rq_data_dir(rq); 2674 int rw = rq_data_dir(rq);
2675 2675
2676 if (!blk_fs_request(rq) || !rq->rq_disk) 2676 if (!blk_fs_request(rq) || !rq->rq_disk)
2677 return; 2677 return;
2678 2678
2679 if (!new_io) { 2679 if (!new_io) {
2680 __disk_stat_inc(rq->rq_disk, merges[rw]); 2680 __disk_stat_inc(rq->rq_disk, merges[rw]);
2681 } else { 2681 } else {
2682 disk_round_stats(rq->rq_disk); 2682 disk_round_stats(rq->rq_disk);
2683 rq->rq_disk->in_flight++; 2683 rq->rq_disk->in_flight++;
2684 } 2684 }
2685 } 2685 }
2686 2686
2687 /* 2687 /*
2688 * add-request adds a request to the linked list. 2688 * add-request adds a request to the linked list.
2689 * queue lock is held and interrupts disabled, as we muck with the 2689 * queue lock is held and interrupts disabled, as we muck with the
2690 * request queue list. 2690 * request queue list.
2691 */ 2691 */
2692 static inline void add_request(struct request_queue * q, struct request * req) 2692 static inline void add_request(struct request_queue * q, struct request * req)
2693 { 2693 {
2694 drive_stat_acct(req, req->nr_sectors, 1); 2694 drive_stat_acct(req, req->nr_sectors, 1);
2695 2695
2696 /* 2696 /*
2697 * elevator indicated where it wants this request to be 2697 * elevator indicated where it wants this request to be
2698 * inserted at elevator_merge time 2698 * inserted at elevator_merge time
2699 */ 2699 */
2700 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); 2700 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
2701 } 2701 }
2702 2702
2703 /* 2703 /*
2704 * disk_round_stats() - Round off the performance stats on a struct 2704 * disk_round_stats() - Round off the performance stats on a struct
2705 * disk_stats. 2705 * disk_stats.
2706 * 2706 *
2707 * The average IO queue length and utilisation statistics are maintained 2707 * The average IO queue length and utilisation statistics are maintained
2708 * by observing the current state of the queue length and the amount of 2708 * by observing the current state of the queue length and the amount of
2709 * time it has been in this state for. 2709 * time it has been in this state for.
2710 * 2710 *
2711 * Normally, that accounting is done on IO completion, but that can result 2711 * Normally, that accounting is done on IO completion, but that can result
2712 * in more than a second's worth of IO being accounted for within any one 2712 * in more than a second's worth of IO being accounted for within any one
2713 * second, leading to >100% utilisation. To deal with that, we call this 2713 * second, leading to >100% utilisation. To deal with that, we call this
2714 * function to do a round-off before returning the results when reading 2714 * function to do a round-off before returning the results when reading
2715 * /proc/diskstats. This accounts immediately for all queue usage up to 2715 * /proc/diskstats. This accounts immediately for all queue usage up to
2716 * the current jiffies and restarts the counters again. 2716 * the current jiffies and restarts the counters again.
2717 */ 2717 */
2718 void disk_round_stats(struct gendisk *disk) 2718 void disk_round_stats(struct gendisk *disk)
2719 { 2719 {
2720 unsigned long now = jiffies; 2720 unsigned long now = jiffies;
2721 2721
2722 if (now == disk->stamp) 2722 if (now == disk->stamp)
2723 return; 2723 return;
2724 2724
2725 if (disk->in_flight) { 2725 if (disk->in_flight) {
2726 __disk_stat_add(disk, time_in_queue, 2726 __disk_stat_add(disk, time_in_queue,
2727 disk->in_flight * (now - disk->stamp)); 2727 disk->in_flight * (now - disk->stamp));
2728 __disk_stat_add(disk, io_ticks, (now - disk->stamp)); 2728 __disk_stat_add(disk, io_ticks, (now - disk->stamp));
2729 } 2729 }
2730 disk->stamp = now; 2730 disk->stamp = now;
2731 } 2731 }
2732 2732
2733 EXPORT_SYMBOL_GPL(disk_round_stats); 2733 EXPORT_SYMBOL_GPL(disk_round_stats);
2734 2734
2735 /* 2735 /*
2736 * queue lock must be held 2736 * queue lock must be held
2737 */ 2737 */
2738 void __blk_put_request(struct request_queue *q, struct request *req) 2738 void __blk_put_request(struct request_queue *q, struct request *req)
2739 { 2739 {
2740 if (unlikely(!q)) 2740 if (unlikely(!q))
2741 return; 2741 return;
2742 if (unlikely(--req->ref_count)) 2742 if (unlikely(--req->ref_count))
2743 return; 2743 return;
2744 2744
2745 elv_completed_request(q, req); 2745 elv_completed_request(q, req);
2746 2746
2747 /* 2747 /*
2748 * Request may not have originated from ll_rw_blk. if not, 2748 * Request may not have originated from ll_rw_blk. if not,
2749 * it didn't come out of our reserved rq pools 2749 * it didn't come out of our reserved rq pools
2750 */ 2750 */
2751 if (req->cmd_flags & REQ_ALLOCED) { 2751 if (req->cmd_flags & REQ_ALLOCED) {
2752 int rw = rq_data_dir(req); 2752 int rw = rq_data_dir(req);
2753 int priv = req->cmd_flags & REQ_ELVPRIV; 2753 int priv = req->cmd_flags & REQ_ELVPRIV;
2754 2754
2755 BUG_ON(!list_empty(&req->queuelist)); 2755 BUG_ON(!list_empty(&req->queuelist));
2756 BUG_ON(!hlist_unhashed(&req->hash)); 2756 BUG_ON(!hlist_unhashed(&req->hash));
2757 2757
2758 blk_free_request(q, req); 2758 blk_free_request(q, req);
2759 freed_request(q, rw, priv); 2759 freed_request(q, rw, priv);
2760 } 2760 }
2761 } 2761 }
2762 2762
2763 EXPORT_SYMBOL_GPL(__blk_put_request); 2763 EXPORT_SYMBOL_GPL(__blk_put_request);
2764 2764
2765 void blk_put_request(struct request *req) 2765 void blk_put_request(struct request *req)
2766 { 2766 {
2767 unsigned long flags; 2767 unsigned long flags;
2768 struct request_queue *q = req->q; 2768 struct request_queue *q = req->q;
2769 2769
2770 /* 2770 /*
2771 * Gee, IDE calls in w/ NULL q. Fix IDE and remove the 2771 * Gee, IDE calls in w/ NULL q. Fix IDE and remove the
2772 * following if (q) test. 2772 * following if (q) test.
2773 */ 2773 */
2774 if (q) { 2774 if (q) {
2775 spin_lock_irqsave(q->queue_lock, flags); 2775 spin_lock_irqsave(q->queue_lock, flags);
2776 __blk_put_request(q, req); 2776 __blk_put_request(q, req);
2777 spin_unlock_irqrestore(q->queue_lock, flags); 2777 spin_unlock_irqrestore(q->queue_lock, flags);
2778 } 2778 }
2779 } 2779 }
2780 2780
2781 EXPORT_SYMBOL(blk_put_request); 2781 EXPORT_SYMBOL(blk_put_request);
2782 2782
2783 /** 2783 /**
2784 * blk_end_sync_rq - executes a completion event on a request 2784 * blk_end_sync_rq - executes a completion event on a request
2785 * @rq: request to complete 2785 * @rq: request to complete
2786 * @error: end io status of the request 2786 * @error: end io status of the request
2787 */ 2787 */
2788 void blk_end_sync_rq(struct request *rq, int error) 2788 void blk_end_sync_rq(struct request *rq, int error)
2789 { 2789 {
2790 struct completion *waiting = rq->end_io_data; 2790 struct completion *waiting = rq->end_io_data;
2791 2791
2792 rq->end_io_data = NULL; 2792 rq->end_io_data = NULL;
2793 __blk_put_request(rq->q, rq); 2793 __blk_put_request(rq->q, rq);
2794 2794
2795 /* 2795 /*
2796 * complete last, if this is a stack request the process (and thus 2796 * complete last, if this is a stack request the process (and thus
2797 * the rq pointer) could be invalid right after this complete() 2797 * the rq pointer) could be invalid right after this complete()
2798 */ 2798 */
2799 complete(waiting); 2799 complete(waiting);
2800 } 2800 }
2801 EXPORT_SYMBOL(blk_end_sync_rq); 2801 EXPORT_SYMBOL(blk_end_sync_rq);
2802 2802
2803 /* 2803 /*
2804 * Has to be called with the request spinlock acquired 2804 * Has to be called with the request spinlock acquired
2805 */ 2805 */
2806 static int attempt_merge(struct request_queue *q, struct request *req, 2806 static int attempt_merge(struct request_queue *q, struct request *req,
2807 struct request *next) 2807 struct request *next)
2808 { 2808 {
2809 if (!rq_mergeable(req) || !rq_mergeable(next)) 2809 if (!rq_mergeable(req) || !rq_mergeable(next))
2810 return 0; 2810 return 0;
2811 2811
2812 /* 2812 /*
2813 * not contiguous 2813 * not contiguous
2814 */ 2814 */
2815 if (req->sector + req->nr_sectors != next->sector) 2815 if (req->sector + req->nr_sectors != next->sector)
2816 return 0; 2816 return 0;
2817 2817
2818 if (rq_data_dir(req) != rq_data_dir(next) 2818 if (rq_data_dir(req) != rq_data_dir(next)
2819 || req->rq_disk != next->rq_disk 2819 || req->rq_disk != next->rq_disk
2820 || next->special) 2820 || next->special)
2821 return 0; 2821 return 0;
2822 2822
2823 /* 2823 /*
2824 * If we are allowed to merge, then append bio list 2824 * If we are allowed to merge, then append bio list
2825 * from next to rq and release next. merge_requests_fn 2825 * from next to rq and release next. merge_requests_fn
2826 * will have updated segment counts, update sector 2826 * will have updated segment counts, update sector
2827 * counts here. 2827 * counts here.
2828 */ 2828 */
2829 if (!ll_merge_requests_fn(q, req, next)) 2829 if (!ll_merge_requests_fn(q, req, next))
2830 return 0; 2830 return 0;
2831 2831
2832 /* 2832 /*
2833 * At this point we have either done a back merge 2833 * At this point we have either done a back merge
2834 * or front merge. We need the smaller start_time of 2834 * or front merge. We need the smaller start_time of
2835 * the merged requests to be the current request 2835 * the merged requests to be the current request
2836 * for accounting purposes. 2836 * for accounting purposes.
2837 */ 2837 */
2838 if (time_after(req->start_time, next->start_time)) 2838 if (time_after(req->start_time, next->start_time))
2839 req->start_time = next->start_time; 2839 req->start_time = next->start_time;
2840 2840
2841 req->biotail->bi_next = next->bio; 2841 req->biotail->bi_next = next->bio;
2842 req->biotail = next->biotail; 2842 req->biotail = next->biotail;
2843 2843
2844 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; 2844 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
2845 2845
2846 elv_merge_requests(q, req, next); 2846 elv_merge_requests(q, req, next);
2847 2847
2848 if (req->rq_disk) { 2848 if (req->rq_disk) {
2849 disk_round_stats(req->rq_disk); 2849 disk_round_stats(req->rq_disk);
2850 req->rq_disk->in_flight--; 2850 req->rq_disk->in_flight--;
2851 } 2851 }
2852 2852
2853 req->ioprio = ioprio_best(req->ioprio, next->ioprio); 2853 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
2854 2854
2855 __blk_put_request(q, next); 2855 __blk_put_request(q, next);
2856 return 1; 2856 return 1;
2857 } 2857 }
2858 2858
2859 static inline int attempt_back_merge(struct request_queue *q, 2859 static inline int attempt_back_merge(struct request_queue *q,
2860 struct request *rq) 2860 struct request *rq)
2861 { 2861 {
2862 struct request *next = elv_latter_request(q, rq); 2862 struct request *next = elv_latter_request(q, rq);
2863 2863
2864 if (next) 2864 if (next)
2865 return attempt_merge(q, rq, next); 2865 return attempt_merge(q, rq, next);
2866 2866
2867 return 0; 2867 return 0;
2868 } 2868 }
2869 2869
2870 static inline int attempt_front_merge(struct request_queue *q, 2870 static inline int attempt_front_merge(struct request_queue *q,
2871 struct request *rq) 2871 struct request *rq)
2872 { 2872 {
2873 struct request *prev = elv_former_request(q, rq); 2873 struct request *prev = elv_former_request(q, rq);
2874 2874
2875 if (prev) 2875 if (prev)
2876 return attempt_merge(q, prev, rq); 2876 return attempt_merge(q, prev, rq);
2877 2877
2878 return 0; 2878 return 0;
2879 } 2879 }
2880 2880
2881 static void init_request_from_bio(struct request *req, struct bio *bio) 2881 static void init_request_from_bio(struct request *req, struct bio *bio)
2882 { 2882 {
2883 req->cmd_type = REQ_TYPE_FS; 2883 req->cmd_type = REQ_TYPE_FS;
2884 2884
2885 /* 2885 /*
2886 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) 2886 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
2887 */ 2887 */
2888 if (bio_rw_ahead(bio) || bio_failfast(bio)) 2888 if (bio_rw_ahead(bio) || bio_failfast(bio))
2889 req->cmd_flags |= REQ_FAILFAST; 2889 req->cmd_flags |= REQ_FAILFAST;
2890 2890
2891 /* 2891 /*
2892 * REQ_BARRIER implies no merging, but let's make it explicit 2892 * REQ_BARRIER implies no merging, but let's make it explicit
2893 */ 2893 */
2894 if (unlikely(bio_barrier(bio))) 2894 if (unlikely(bio_barrier(bio)))
2895 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); 2895 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
2896 2896
2897 if (bio_sync(bio)) 2897 if (bio_sync(bio))
2898 req->cmd_flags |= REQ_RW_SYNC; 2898 req->cmd_flags |= REQ_RW_SYNC;
2899 if (bio_rw_meta(bio)) 2899 if (bio_rw_meta(bio))
2900 req->cmd_flags |= REQ_RW_META; 2900 req->cmd_flags |= REQ_RW_META;
2901 2901
2902 req->errors = 0; 2902 req->errors = 0;
2903 req->hard_sector = req->sector = bio->bi_sector; 2903 req->hard_sector = req->sector = bio->bi_sector;
2904 req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio); 2904 req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio);
2905 req->current_nr_sectors = req->hard_cur_sectors = bio_cur_sectors(bio); 2905 req->current_nr_sectors = req->hard_cur_sectors = bio_cur_sectors(bio);
2906 req->nr_phys_segments = bio_phys_segments(req->q, bio); 2906 req->nr_phys_segments = bio_phys_segments(req->q, bio);
2907 req->nr_hw_segments = bio_hw_segments(req->q, bio); 2907 req->nr_hw_segments = bio_hw_segments(req->q, bio);
2908 req->buffer = bio_data(bio); /* see ->buffer comment above */ 2908 req->buffer = bio_data(bio); /* see ->buffer comment above */
2909 req->bio = req->biotail = bio; 2909 req->bio = req->biotail = bio;
2910 req->ioprio = bio_prio(bio); 2910 req->ioprio = bio_prio(bio);
2911 req->rq_disk = bio->bi_bdev->bd_disk; 2911 req->rq_disk = bio->bi_bdev->bd_disk;
2912 req->start_time = jiffies; 2912 req->start_time = jiffies;
2913 } 2913 }
2914 2914
2915 static int __make_request(struct request_queue *q, struct bio *bio) 2915 static int __make_request(struct request_queue *q, struct bio *bio)
2916 { 2916 {
2917 struct request *req; 2917 struct request *req;
2918 int el_ret, nr_sectors, barrier, err; 2918 int el_ret, nr_sectors, barrier, err;
2919 const unsigned short prio = bio_prio(bio); 2919 const unsigned short prio = bio_prio(bio);
2920 const int sync = bio_sync(bio); 2920 const int sync = bio_sync(bio);
2921 int rw_flags; 2921 int rw_flags;
2922 2922
2923 nr_sectors = bio_sectors(bio); 2923 nr_sectors = bio_sectors(bio);
2924 2924
2925 /* 2925 /*
2926 * low level driver can indicate that it wants pages above a 2926 * low level driver can indicate that it wants pages above a
2927 * certain limit bounced to low memory (ie for highmem, or even 2927 * certain limit bounced to low memory (ie for highmem, or even
2928 * ISA dma in theory) 2928 * ISA dma in theory)
2929 */ 2929 */
2930 blk_queue_bounce(q, &bio); 2930 blk_queue_bounce(q, &bio);
2931 2931
2932 barrier = bio_barrier(bio); 2932 barrier = bio_barrier(bio);
2933 if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) { 2933 if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
2934 err = -EOPNOTSUPP; 2934 err = -EOPNOTSUPP;
2935 goto end_io; 2935 goto end_io;
2936 } 2936 }
2937 2937
2938 spin_lock_irq(q->queue_lock); 2938 spin_lock_irq(q->queue_lock);
2939 2939
2940 if (unlikely(barrier) || elv_queue_empty(q)) 2940 if (unlikely(barrier) || elv_queue_empty(q))
2941 goto get_rq; 2941 goto get_rq;
2942 2942
2943 el_ret = elv_merge(q, &req, bio); 2943 el_ret = elv_merge(q, &req, bio);
2944 switch (el_ret) { 2944 switch (el_ret) {
2945 case ELEVATOR_BACK_MERGE: 2945 case ELEVATOR_BACK_MERGE:
2946 BUG_ON(!rq_mergeable(req)); 2946 BUG_ON(!rq_mergeable(req));
2947 2947
2948 if (!ll_back_merge_fn(q, req, bio)) 2948 if (!ll_back_merge_fn(q, req, bio))
2949 break; 2949 break;
2950 2950
2951 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 2951 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
2952 2952
2953 req->biotail->bi_next = bio; 2953 req->biotail->bi_next = bio;
2954 req->biotail = bio; 2954 req->biotail = bio;
2955 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2955 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2956 req->ioprio = ioprio_best(req->ioprio, prio); 2956 req->ioprio = ioprio_best(req->ioprio, prio);
2957 drive_stat_acct(req, nr_sectors, 0); 2957 drive_stat_acct(req, nr_sectors, 0);
2958 if (!attempt_back_merge(q, req)) 2958 if (!attempt_back_merge(q, req))
2959 elv_merged_request(q, req, el_ret); 2959 elv_merged_request(q, req, el_ret);
2960 goto out; 2960 goto out;
2961 2961
2962 case ELEVATOR_FRONT_MERGE: 2962 case ELEVATOR_FRONT_MERGE:
2963 BUG_ON(!rq_mergeable(req)); 2963 BUG_ON(!rq_mergeable(req));
2964 2964
2965 if (!ll_front_merge_fn(q, req, bio)) 2965 if (!ll_front_merge_fn(q, req, bio))
2966 break; 2966 break;
2967 2967
2968 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 2968 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
2969 2969
2970 bio->bi_next = req->bio; 2970 bio->bi_next = req->bio;
2971 req->bio = bio; 2971 req->bio = bio;
2972 2972
2973 /* 2973 /*
2974 * may not be valid. if the low level driver said 2974 * may not be valid. if the low level driver said
2975 * it didn't need a bounce buffer then it better 2975 * it didn't need a bounce buffer then it better
2976 * not touch req->buffer either... 2976 * not touch req->buffer either...
2977 */ 2977 */
2978 req->buffer = bio_data(bio); 2978 req->buffer = bio_data(bio);
2979 req->current_nr_sectors = bio_cur_sectors(bio); 2979 req->current_nr_sectors = bio_cur_sectors(bio);
2980 req->hard_cur_sectors = req->current_nr_sectors; 2980 req->hard_cur_sectors = req->current_nr_sectors;
2981 req->sector = req->hard_sector = bio->bi_sector; 2981 req->sector = req->hard_sector = bio->bi_sector;
2982 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2982 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2983 req->ioprio = ioprio_best(req->ioprio, prio); 2983 req->ioprio = ioprio_best(req->ioprio, prio);
2984 drive_stat_acct(req, nr_sectors, 0); 2984 drive_stat_acct(req, nr_sectors, 0);
2985 if (!attempt_front_merge(q, req)) 2985 if (!attempt_front_merge(q, req))
2986 elv_merged_request(q, req, el_ret); 2986 elv_merged_request(q, req, el_ret);
2987 goto out; 2987 goto out;
2988 2988
2989 /* ELV_NO_MERGE: elevator says don't/can't merge. */ 2989 /* ELV_NO_MERGE: elevator says don't/can't merge. */
2990 default: 2990 default:
2991 ; 2991 ;
2992 } 2992 }
2993 2993
2994 get_rq: 2994 get_rq:
2995 /* 2995 /*
2996 * This sync check and mask will be re-done in init_request_from_bio(), 2996 * This sync check and mask will be re-done in init_request_from_bio(),
2997 * but we need to set it earlier to expose the sync flag to the 2997 * but we need to set it earlier to expose the sync flag to the
2998 * rq allocator and io schedulers. 2998 * rq allocator and io schedulers.
2999 */ 2999 */
3000 rw_flags = bio_data_dir(bio); 3000 rw_flags = bio_data_dir(bio);
3001 if (sync) 3001 if (sync)
3002 rw_flags |= REQ_RW_SYNC; 3002 rw_flags |= REQ_RW_SYNC;
3003 3003
3004 /* 3004 /*
3005 * Grab a free request. This may sleep but cannot fail. 3005 * Grab a free request. This may sleep but cannot fail.
3006 * Returns with the queue unlocked. 3006 * Returns with the queue unlocked.
3007 */ 3007 */
3008 req = get_request_wait(q, rw_flags, bio); 3008 req = get_request_wait(q, rw_flags, bio);
3009 3009
3010 /* 3010 /*
3011 * After dropping the lock and possibly sleeping here, our request 3011 * After dropping the lock and possibly sleeping here, our request
3012 * may now be mergeable after it had proven unmergeable (above). 3012 * may now be mergeable after it had proven unmergeable (above).
3013 * We don't worry about that case for efficiency. It won't happen 3013 * We don't worry about that case for efficiency. It won't happen
3014 * often, and the elevators are able to handle it. 3014 * often, and the elevators are able to handle it.
3015 */ 3015 */
3016 init_request_from_bio(req, bio); 3016 init_request_from_bio(req, bio);
3017 3017
3018 spin_lock_irq(q->queue_lock); 3018 spin_lock_irq(q->queue_lock);
3019 if (elv_queue_empty(q)) 3019 if (elv_queue_empty(q))
3020 blk_plug_device(q); 3020 blk_plug_device(q);
3021 add_request(q, req); 3021 add_request(q, req);
3022 out: 3022 out:
3023 if (sync) 3023 if (sync)
3024 __generic_unplug_device(q); 3024 __generic_unplug_device(q);
3025 3025
3026 spin_unlock_irq(q->queue_lock); 3026 spin_unlock_irq(q->queue_lock);
3027 return 0; 3027 return 0;
3028 3028
3029 end_io: 3029 end_io:
3030 bio_endio(bio, nr_sectors << 9, err); 3030 bio_endio(bio, nr_sectors << 9, err);
3031 return 0; 3031 return 0;
3032 } 3032 }
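In __make_request() above, a back merge only grows the request's sector count (the new bio lands after the existing data), while a front merge also rewinds the request's starting sector to the new bio's. Below is a minimal user-space sketch of just that bookkeeping, using toy structures rather than the kernel's request/bio types; the names and values are illustrative only.

#include <stdio.h>

struct toy_bio { unsigned long long sector; unsigned int nr_sectors; };
struct toy_req { unsigned long long sector; unsigned int nr_sectors; };

/* New bio starts exactly where the request currently ends. */
static void back_merge(struct toy_req *rq, const struct toy_bio *bio)
{
    rq->nr_sectors += bio->nr_sectors;            /* start stays put */
}

/* New bio ends exactly where the request currently starts. */
static void front_merge(struct toy_req *rq, const struct toy_bio *bio)
{
    rq->sector = bio->sector;                     /* start rewinds */
    rq->nr_sectors += bio->nr_sectors;
}

int main(void)
{
    struct toy_req rq   = { .sector = 1000, .nr_sectors = 8 };
    struct toy_bio tail = { .sector = 1008, .nr_sectors = 8 };
    struct toy_bio head = { .sector = 992,  .nr_sectors = 8 };

    back_merge(&rq, &tail);     /* request now covers 1000..1015 */
    front_merge(&rq, &head);    /* request now covers  992..1015 */
    printf("request: sector %llu, %u sectors\n", rq.sector, rq.nr_sectors);
    return 0;
}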
3033 3033
3034 /* 3034 /*
3035 * If bio->bi_bdev is a partition, remap the location 3035 * If bio->bi_bdev is a partition, remap the location
3036 */ 3036 */
3037 static inline void blk_partition_remap(struct bio *bio) 3037 static inline void blk_partition_remap(struct bio *bio)
3038 { 3038 {
3039 struct block_device *bdev = bio->bi_bdev; 3039 struct block_device *bdev = bio->bi_bdev;
3040 3040
3041 if (bdev != bdev->bd_contains) { 3041 if (bdev != bdev->bd_contains) {
3042 struct hd_struct *p = bdev->bd_part; 3042 struct hd_struct *p = bdev->bd_part;
3043 const int rw = bio_data_dir(bio); 3043 const int rw = bio_data_dir(bio);
3044 3044
3045 p->sectors[rw] += bio_sectors(bio); 3045 p->sectors[rw] += bio_sectors(bio);
3046 p->ios[rw]++; 3046 p->ios[rw]++;
3047 3047
3048 bio->bi_sector += p->start_sect; 3048 bio->bi_sector += p->start_sect;
3049 bio->bi_bdev = bdev->bd_contains; 3049 bio->bi_bdev = bdev->bd_contains;
3050
3051 blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
3052 bdev->bd_dev, bio->bi_sector,
3053 bio->bi_sector - p->start_sect);
3050 } 3054 }
3051 } 3055 }
3052 3056
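The blk_add_trace_remap() call added above records, for each partitioned I/O, the mapped-from device together with the sector before and after it is offset by the partition's start sector. The following is a minimal user-space sketch of that arithmetic; the struct and field names are illustrative stand-ins, not the kernel's hd_struct/bio types.

#include <stdio.h>

/* Illustrative stand-ins for the partition and in-flight I/O state. */
struct part_sketch {
    unsigned long long start_sect;   /* first sector of the partition on the disk */
    unsigned int       dev;          /* partition's device number (mapped-from) */
};

struct io_sketch {
    unsigned long long sector;       /* partition-relative before the remap */
    unsigned int       dev;          /* device the I/O is currently aimed at */
};

/* Mirror of the remap step: shift the sector, retarget the whole disk,
 * and report old device/sector alongside the new absolute sector, which
 * is the information the added trace call sends up. */
static void remap_sketch(struct io_sketch *io, const struct part_sketch *p,
                         unsigned int whole_disk_dev)
{
    unsigned int old_dev = io->dev;
    unsigned long long old_sector = io->sector;

    io->sector += p->start_sect;
    io->dev = whole_disk_dev;

    printf("remap: from dev %u sector %llu -> dev %u sector %llu\n",
           old_dev, old_sector, io->dev, io->sector);
}

int main(void)
{
    struct part_sketch part = { .start_sect = 2048, .dev = 0x802 };
    struct io_sketch io = { .sector = 100, .dev = part.dev };

    remap_sketch(&io, &part, 0x800);   /* sector 100 on the partition -> 2148 on the disk */
    return 0;
}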
3053 static void handle_bad_sector(struct bio *bio) 3057 static void handle_bad_sector(struct bio *bio)
3054 { 3058 {
3055 char b[BDEVNAME_SIZE]; 3059 char b[BDEVNAME_SIZE];
3056 3060
3057 printk(KERN_INFO "attempt to access beyond end of device\n"); 3061 printk(KERN_INFO "attempt to access beyond end of device\n");
3058 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", 3062 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
3059 bdevname(bio->bi_bdev, b), 3063 bdevname(bio->bi_bdev, b),
3060 bio->bi_rw, 3064 bio->bi_rw,
3061 (unsigned long long)bio->bi_sector + bio_sectors(bio), 3065 (unsigned long long)bio->bi_sector + bio_sectors(bio),
3062 (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); 3066 (long long)(bio->bi_bdev->bd_inode->i_size >> 9));
3063 3067
3064 set_bit(BIO_EOF, &bio->bi_flags); 3068 set_bit(BIO_EOF, &bio->bi_flags);
3065 } 3069 }
3066 3070
3067 #ifdef CONFIG_FAIL_MAKE_REQUEST 3071 #ifdef CONFIG_FAIL_MAKE_REQUEST
3068 3072
3069 static DECLARE_FAULT_ATTR(fail_make_request); 3073 static DECLARE_FAULT_ATTR(fail_make_request);
3070 3074
3071 static int __init setup_fail_make_request(char *str) 3075 static int __init setup_fail_make_request(char *str)
3072 { 3076 {
3073 return setup_fault_attr(&fail_make_request, str); 3077 return setup_fault_attr(&fail_make_request, str);
3074 } 3078 }
3075 __setup("fail_make_request=", setup_fail_make_request); 3079 __setup("fail_make_request=", setup_fail_make_request);
3076 3080
3077 static int should_fail_request(struct bio *bio) 3081 static int should_fail_request(struct bio *bio)
3078 { 3082 {
3079 if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) || 3083 if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) ||
3080 (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail)) 3084 (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail))
3081 return should_fail(&fail_make_request, bio->bi_size); 3085 return should_fail(&fail_make_request, bio->bi_size);
3082 3086
3083 return 0; 3087 return 0;
3084 } 3088 }
3085 3089
3086 static int __init fail_make_request_debugfs(void) 3090 static int __init fail_make_request_debugfs(void)
3087 { 3091 {
3088 return init_fault_attr_dentries(&fail_make_request, 3092 return init_fault_attr_dentries(&fail_make_request,
3089 "fail_make_request"); 3093 "fail_make_request");
3090 } 3094 }
3091 3095
3092 late_initcall(fail_make_request_debugfs); 3096 late_initcall(fail_make_request_debugfs);
3093 3097
3094 #else /* CONFIG_FAIL_MAKE_REQUEST */ 3098 #else /* CONFIG_FAIL_MAKE_REQUEST */
3095 3099
3096 static inline int should_fail_request(struct bio *bio) 3100 static inline int should_fail_request(struct bio *bio)
3097 { 3101 {
3098 return 0; 3102 return 0;
3099 } 3103 }
3100 3104
3101 #endif /* CONFIG_FAIL_MAKE_REQUEST */ 3105 #endif /* CONFIG_FAIL_MAKE_REQUEST */
3102 3106
3103 /** 3107 /**
3104 * generic_make_request: hand a buffer to its device driver for I/O 3108 * generic_make_request: hand a buffer to its device driver for I/O
3105 * @bio: The bio describing the location in memory and on the device. 3109 * @bio: The bio describing the location in memory and on the device.
3106 * 3110 *
3107 * generic_make_request() is used to make I/O requests of block 3111 * generic_make_request() is used to make I/O requests of block
3108 * devices. It is passed a &struct bio, which describes the I/O that needs 3112 * devices. It is passed a &struct bio, which describes the I/O that needs
3109 * to be done. 3113 * to be done.
3110 * 3114 *
3111 * generic_make_request() does not return any status. The 3115 * generic_make_request() does not return any status. The
3112 * success/failure status of the request, along with notification of 3116 * success/failure status of the request, along with notification of
3113 * completion, is delivered asynchronously through the bio->bi_end_io 3117 * completion, is delivered asynchronously through the bio->bi_end_io
3114 * function described (one day) elsewhere. 3118 * function described (one day) elsewhere.
3115 * 3119 *
3116 * The caller of generic_make_request must make sure that bi_io_vec 3120 * The caller of generic_make_request must make sure that bi_io_vec
3117 * are set to describe the memory buffer, and that bi_dev and bi_sector are 3121 * are set to describe the memory buffer, and that bi_dev and bi_sector are
3118 * set to describe the device address, and the 3122 * set to describe the device address, and the
3119 * bi_end_io and optionally bi_private are set to describe how 3123 * bi_end_io and optionally bi_private are set to describe how
3120 * completion notification should be signaled. 3124 * completion notification should be signaled.
3121 * 3125 *
3122 * generic_make_request and the drivers it calls may use bi_next if this 3126 * generic_make_request and the drivers it calls may use bi_next if this
3123 * bio happens to be merged with someone else, and may change bi_dev and 3127 * bio happens to be merged with someone else, and may change bi_dev and
3124 * bi_sector for remaps as it sees fit. So the values of these fields 3128 * bi_sector for remaps as it sees fit. So the values of these fields
3125 * should NOT be depended on after the call to generic_make_request. 3129 * should NOT be depended on after the call to generic_make_request.
3126 */ 3130 */
3127 static inline void __generic_make_request(struct bio *bio) 3131 static inline void __generic_make_request(struct bio *bio)
3128 { 3132 {
3129 struct request_queue *q; 3133 struct request_queue *q;
3130 sector_t maxsector; 3134 sector_t maxsector;
3131 sector_t old_sector; 3135 sector_t old_sector;
3132 int ret, nr_sectors = bio_sectors(bio); 3136 int ret, nr_sectors = bio_sectors(bio);
3133 dev_t old_dev; 3137 dev_t old_dev;
3134 3138
3135 might_sleep(); 3139 might_sleep();
3136 /* Test device or partition size, when known. */ 3140 /* Test device or partition size, when known. */
3137 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 3141 maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
3138 if (maxsector) { 3142 if (maxsector) {
3139 sector_t sector = bio->bi_sector; 3143 sector_t sector = bio->bi_sector;
3140 3144
3141 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { 3145 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
3142 /* 3146 /*
3143 * This may well happen - the kernel calls bread() 3147 * This may well happen - the kernel calls bread()
3144 * without checking the size of the device, e.g., when 3148 * without checking the size of the device, e.g., when
3145 * mounting a device. 3149 * mounting a device.
3146 */ 3150 */
3147 handle_bad_sector(bio); 3151 handle_bad_sector(bio);
3148 goto end_io; 3152 goto end_io;
3149 } 3153 }
3150 } 3154 }
3151 3155
3152 /* 3156 /*
3153 * Resolve the mapping until finished. (drivers are 3157 * Resolve the mapping until finished. (drivers are
3154 * still free to implement/resolve their own stacking 3158 * still free to implement/resolve their own stacking
3155 * by explicitly returning 0) 3159 * by explicitly returning 0)
3156 * 3160 *
3157 * NOTE: we don't repeat the blk_size check for each new device. 3161 * NOTE: we don't repeat the blk_size check for each new device.
3158 * Stacking drivers are expected to know what they are doing. 3162 * Stacking drivers are expected to know what they are doing.
3159 */ 3163 */
3160 old_sector = -1; 3164 old_sector = -1;
3161 old_dev = 0; 3165 old_dev = 0;
3162 do { 3166 do {
3163 char b[BDEVNAME_SIZE]; 3167 char b[BDEVNAME_SIZE];
3164 3168
3165 q = bdev_get_queue(bio->bi_bdev); 3169 q = bdev_get_queue(bio->bi_bdev);
3166 if (!q) { 3170 if (!q) {
3167 printk(KERN_ERR 3171 printk(KERN_ERR
3168 "generic_make_request: Trying to access " 3172 "generic_make_request: Trying to access "
3169 "nonexistent block-device %s (%Lu)\n", 3173 "nonexistent block-device %s (%Lu)\n",
3170 bdevname(bio->bi_bdev, b), 3174 bdevname(bio->bi_bdev, b),
3171 (long long) bio->bi_sector); 3175 (long long) bio->bi_sector);
3172 end_io: 3176 end_io:
3173 bio_endio(bio, bio->bi_size, -EIO); 3177 bio_endio(bio, bio->bi_size, -EIO);
3174 break; 3178 break;
3175 } 3179 }
3176 3180
3177 if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) { 3181 if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) {
3178 printk("bio too big device %s (%u > %u)\n", 3182 printk("bio too big device %s (%u > %u)\n",
3179 bdevname(bio->bi_bdev, b), 3183 bdevname(bio->bi_bdev, b),
3180 bio_sectors(bio), 3184 bio_sectors(bio),
3181 q->max_hw_sectors); 3185 q->max_hw_sectors);
3182 goto end_io; 3186 goto end_io;
3183 } 3187 }
3184 3188
3185 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 3189 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
3186 goto end_io; 3190 goto end_io;
3187 3191
3188 if (should_fail_request(bio)) 3192 if (should_fail_request(bio))
3189 goto end_io; 3193 goto end_io;
3190 3194
3191 /* 3195 /*
3192 * If this device has partitions, remap block n 3196 * If this device has partitions, remap block n
3193 * of partition p to block n+start(p) of the disk. 3197 * of partition p to block n+start(p) of the disk.
3194 */ 3198 */
3195 blk_partition_remap(bio); 3199 blk_partition_remap(bio);
3196 3200
3197 if (old_sector != -1) 3201 if (old_sector != -1)
3198 blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 3202 blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
3199 old_sector); 3203 old_sector);
3200 3204
3201 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 3205 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
3202 3206
3203 old_sector = bio->bi_sector; 3207 old_sector = bio->bi_sector;
3204 old_dev = bio->bi_bdev->bd_dev; 3208 old_dev = bio->bi_bdev->bd_dev;
3205 3209
3206 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 3210 maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
3207 if (maxsector) { 3211 if (maxsector) {
3208 sector_t sector = bio->bi_sector; 3212 sector_t sector = bio->bi_sector;
3209 3213
3210 if (maxsector < nr_sectors || 3214 if (maxsector < nr_sectors ||
3211 maxsector - nr_sectors < sector) { 3215 maxsector - nr_sectors < sector) {
3212 /* 3216 /*
3213 * This may well happen - partitions are not 3217 * This may well happen - partitions are not
3214 * checked to make sure they are within the size 3218 * checked to make sure they are within the size
3215 * of the whole device. 3219 * of the whole device.
3216 */ 3220 */
3217 handle_bad_sector(bio); 3221 handle_bad_sector(bio);
3218 goto end_io; 3222 goto end_io;
3219 } 3223 }
3220 } 3224 }
3221 3225
3222 ret = q->make_request_fn(q, bio); 3226 ret = q->make_request_fn(q, bio);
3223 } while (ret); 3227 } while (ret);
3224 } 3228 }
3225 3229
3226 /* 3230 /*
3227 * We only want one ->make_request_fn to be active at a time, 3231 * We only want one ->make_request_fn to be active at a time,
3228 * else stack usage with stacked devices could be a problem. 3232 * else stack usage with stacked devices could be a problem.
3229 * So use current->bio_{list,tail} to keep a list of requests 3233 * So use current->bio_{list,tail} to keep a list of requests
3230 * submitted by a make_request_fn function. 3234 * submitted by a make_request_fn function.
3231 * current->bio_tail is also used as a flag to say if 3235 * current->bio_tail is also used as a flag to say if
3232 * generic_make_request is currently active in this task or not. 3236 * generic_make_request is currently active in this task or not.
3233 * If it is NULL, then no make_request is active. If it is non-NULL, 3237 * If it is NULL, then no make_request is active. If it is non-NULL,
3234 * then a make_request is active, and new requests should be added 3238 * then a make_request is active, and new requests should be added
3235 * at the tail 3239 * at the tail
3236 */ 3240 */
3237 void generic_make_request(struct bio *bio) 3241 void generic_make_request(struct bio *bio)
3238 { 3242 {
3239 if (current->bio_tail) { 3243 if (current->bio_tail) {
3240 /* make_request is active */ 3244 /* make_request is active */
3241 *(current->bio_tail) = bio; 3245 *(current->bio_tail) = bio;
3242 bio->bi_next = NULL; 3246 bio->bi_next = NULL;
3243 current->bio_tail = &bio->bi_next; 3247 current->bio_tail = &bio->bi_next;
3244 return; 3248 return;
3245 } 3249 }
3246 /* following loop may be a bit non-obvious, and so deserves some 3250 /* following loop may be a bit non-obvious, and so deserves some
3247 * explanation. 3251 * explanation.
3248 * Before entering the loop, bio->bi_next is NULL (as all callers 3252 * Before entering the loop, bio->bi_next is NULL (as all callers
3249 * ensure that) so we have a list with a single bio. 3253 * ensure that) so we have a list with a single bio.
3250 * We pretend that we have just taken it off a longer list, so 3254 * We pretend that we have just taken it off a longer list, so
3251 * we assign bio_list to the next (which is NULL) and bio_tail 3255 * we assign bio_list to the next (which is NULL) and bio_tail
3252 * to &bio_list, thus initialising the bio_list of new bios to be 3256 * to &bio_list, thus initialising the bio_list of new bios to be
3253 * added. __generic_make_request may indeed add some more bios 3257 * added. __generic_make_request may indeed add some more bios
3254 * through a recursive call to generic_make_request. If it 3258 * through a recursive call to generic_make_request. If it
3255 * did, we find a non-NULL value in bio_list and re-enter the loop 3259 * did, we find a non-NULL value in bio_list and re-enter the loop
3256 * from the top. In this case we really did just take the bio 3260 * from the top. In this case we really did just take the bio
3257 * of the top of the list (no pretending) and so fixup bio_list and 3261 * of the top of the list (no pretending) and so fixup bio_list and
3258 * bio_tail or bi_next, and call into __generic_make_request again. 3262 * bio_tail or bi_next, and call into __generic_make_request again.
3259 * 3263 *
3260 * The loop was structured like this to make only one call to 3264 * The loop was structured like this to make only one call to
3261 * __generic_make_request (which is important as it is large and 3265 * __generic_make_request (which is important as it is large and
3262 * inlined) and to keep the structure simple. 3266 * inlined) and to keep the structure simple.
3263 */ 3267 */
3264 BUG_ON(bio->bi_next); 3268 BUG_ON(bio->bi_next);
3265 do { 3269 do {
3266 current->bio_list = bio->bi_next; 3270 current->bio_list = bio->bi_next;
3267 if (bio->bi_next == NULL) 3271 if (bio->bi_next == NULL)
3268 current->bio_tail = &current->bio_list; 3272 current->bio_tail = &current->bio_list;
3269 else 3273 else
3270 bio->bi_next = NULL; 3274 bio->bi_next = NULL;
3271 __generic_make_request(bio); 3275 __generic_make_request(bio);
3272 bio = current->bio_list; 3276 bio = current->bio_list;
3273 } while (bio); 3277 } while (bio);
3274 current->bio_tail = NULL; /* deactivate */ 3278 current->bio_tail = NULL; /* deactivate */
3275 } 3279 }
3276 3280
3277 EXPORT_SYMBOL(generic_make_request); 3281 EXPORT_SYMBOL(generic_make_request);
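The comment block above generic_make_request() describes how recursion from stacked drivers is flattened into a per-task singly linked list with a tail pointer, so only one __generic_make_request() frame is ever on the stack. The sketch below models that pattern in plain user-space C; the names (work_item, submit, dispatch) are hypothetical, not kernel identifiers. The first caller owns the loop, and "recursive" submissions are simply appended at the tail and picked up on the next iteration.

#include <stdio.h>
#include <stddef.h>

struct work_item {
    int id;
    struct work_item *next;
};

/* Per-task state, analogous to current->bio_list / current->bio_tail. */
static struct work_item *list_head;
static struct work_item **list_tail;   /* NULL means "no dispatch loop active" */

static void dispatch(struct work_item *w);   /* forward declaration */

static void submit(struct work_item *w)
{
    if (list_tail) {                    /* a loop is already active: just queue */
        *list_tail = w;
        w->next = NULL;
        list_tail = &w->next;
        return;
    }

    /* First entry: become the dispatch loop for everything queued meanwhile. */
    do {
        list_head = w->next;
        if (!list_head)
            list_tail = &list_head;
        else
            w->next = NULL;
        dispatch(w);
        w = list_head;
    } while (w);
    list_tail = NULL;                   /* deactivate */
}

/* A "stacked driver": item 1 resubmits two children instead of recursing. */
static struct work_item child_a = { .id = 2 }, child_b = { .id = 3 };

static void dispatch(struct work_item *w)
{
    printf("dispatching item %d\n", w->id);
    if (w->id == 1) {
        submit(&child_a);
        submit(&child_b);
    }
}

int main(void)
{
    struct work_item top = { .id = 1, .next = NULL };

    submit(&top);      /* prints 1, 2, 3 with bounded stack depth */
    return 0;
}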
3278 3282
3279 /** 3283 /**
3280 * submit_bio: submit a bio to the block device layer for I/O 3284 * submit_bio: submit a bio to the block device layer for I/O
3281 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) 3285 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
3282 * @bio: The &struct bio which describes the I/O 3286 * @bio: The &struct bio which describes the I/O
3283 * 3287 *
3284 * submit_bio() is very similar in purpose to generic_make_request(), and 3288 * submit_bio() is very similar in purpose to generic_make_request(), and
3285 * uses that function to do most of the work. Both are fairly rough 3289 * uses that function to do most of the work. Both are fairly rough
3286 * interfaces, @bio must be set up and ready for I/O. 3290 * interfaces, @bio must be set up and ready for I/O.
3287 * 3291 *
3288 */ 3292 */
3289 void submit_bio(int rw, struct bio *bio) 3293 void submit_bio(int rw, struct bio *bio)
3290 { 3294 {
3291 int count = bio_sectors(bio); 3295 int count = bio_sectors(bio);
3292 3296
3293 BIO_BUG_ON(!bio->bi_size); 3297 BIO_BUG_ON(!bio->bi_size);
3294 BIO_BUG_ON(!bio->bi_io_vec); 3298 BIO_BUG_ON(!bio->bi_io_vec);
3295 bio->bi_rw |= rw; 3299 bio->bi_rw |= rw;
3296 if (rw & WRITE) { 3300 if (rw & WRITE) {
3297 count_vm_events(PGPGOUT, count); 3301 count_vm_events(PGPGOUT, count);
3298 } else { 3302 } else {
3299 task_io_account_read(bio->bi_size); 3303 task_io_account_read(bio->bi_size);
3300 count_vm_events(PGPGIN, count); 3304 count_vm_events(PGPGIN, count);
3301 } 3305 }
3302 3306
3303 if (unlikely(block_dump)) { 3307 if (unlikely(block_dump)) {
3304 char b[BDEVNAME_SIZE]; 3308 char b[BDEVNAME_SIZE];
3305 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 3309 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
3306 current->comm, current->pid, 3310 current->comm, current->pid,
3307 (rw & WRITE) ? "WRITE" : "READ", 3311 (rw & WRITE) ? "WRITE" : "READ",
3308 (unsigned long long)bio->bi_sector, 3312 (unsigned long long)bio->bi_sector,
3309 bdevname(bio->bi_bdev,b)); 3313 bdevname(bio->bi_bdev,b));
3310 } 3314 }
3311 3315
3312 generic_make_request(bio); 3316 generic_make_request(bio);
3313 } 3317 }
3314 3318
3315 EXPORT_SYMBOL(submit_bio); 3319 EXPORT_SYMBOL(submit_bio);
3316 3320
3317 static void blk_recalc_rq_segments(struct request *rq) 3321 static void blk_recalc_rq_segments(struct request *rq)
3318 { 3322 {
3319 struct bio *bio, *prevbio = NULL; 3323 struct bio *bio, *prevbio = NULL;
3320 int nr_phys_segs, nr_hw_segs; 3324 int nr_phys_segs, nr_hw_segs;
3321 unsigned int phys_size, hw_size; 3325 unsigned int phys_size, hw_size;
3322 struct request_queue *q = rq->q; 3326 struct request_queue *q = rq->q;
3323 3327
3324 if (!rq->bio) 3328 if (!rq->bio)
3325 return; 3329 return;
3326 3330
3327 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0; 3331 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
3328 rq_for_each_bio(bio, rq) { 3332 rq_for_each_bio(bio, rq) {
3329 /* Force bio hw/phys segs to be recalculated. */ 3333 /* Force bio hw/phys segs to be recalculated. */
3330 bio->bi_flags &= ~(1 << BIO_SEG_VALID); 3334 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
3331 3335
3332 nr_phys_segs += bio_phys_segments(q, bio); 3336 nr_phys_segs += bio_phys_segments(q, bio);
3333 nr_hw_segs += bio_hw_segments(q, bio); 3337 nr_hw_segs += bio_hw_segments(q, bio);
3334 if (prevbio) { 3338 if (prevbio) {
3335 int pseg = phys_size + prevbio->bi_size + bio->bi_size; 3339 int pseg = phys_size + prevbio->bi_size + bio->bi_size;
3336 int hseg = hw_size + prevbio->bi_size + bio->bi_size; 3340 int hseg = hw_size + prevbio->bi_size + bio->bi_size;
3337 3341
3338 if (blk_phys_contig_segment(q, prevbio, bio) && 3342 if (blk_phys_contig_segment(q, prevbio, bio) &&
3339 pseg <= q->max_segment_size) { 3343 pseg <= q->max_segment_size) {
3340 nr_phys_segs--; 3344 nr_phys_segs--;
3341 phys_size += prevbio->bi_size + bio->bi_size; 3345 phys_size += prevbio->bi_size + bio->bi_size;
3342 } else 3346 } else
3343 phys_size = 0; 3347 phys_size = 0;
3344 3348
3345 if (blk_hw_contig_segment(q, prevbio, bio) && 3349 if (blk_hw_contig_segment(q, prevbio, bio) &&
3346 hseg <= q->max_segment_size) { 3350 hseg <= q->max_segment_size) {
3347 nr_hw_segs--; 3351 nr_hw_segs--;
3348 hw_size += prevbio->bi_size + bio->bi_size; 3352 hw_size += prevbio->bi_size + bio->bi_size;
3349 } else 3353 } else
3350 hw_size = 0; 3354 hw_size = 0;
3351 } 3355 }
3352 prevbio = bio; 3356 prevbio = bio;
3353 } 3357 }
3354 3358
3355 rq->nr_phys_segments = nr_phys_segs; 3359 rq->nr_phys_segments = nr_phys_segs;
3356 rq->nr_hw_segments = nr_hw_segs; 3360 rq->nr_hw_segments = nr_hw_segs;
3357 } 3361 }
3358 3362
3359 static void blk_recalc_rq_sectors(struct request *rq, int nsect) 3363 static void blk_recalc_rq_sectors(struct request *rq, int nsect)
3360 { 3364 {
3361 if (blk_fs_request(rq)) { 3365 if (blk_fs_request(rq)) {
3362 rq->hard_sector += nsect; 3366 rq->hard_sector += nsect;
3363 rq->hard_nr_sectors -= nsect; 3367 rq->hard_nr_sectors -= nsect;
3364 3368
3365 /* 3369 /*
3366 * Move the I/O submission pointers ahead if required. 3370 * Move the I/O submission pointers ahead if required.
3367 */ 3371 */
3368 if ((rq->nr_sectors >= rq->hard_nr_sectors) && 3372 if ((rq->nr_sectors >= rq->hard_nr_sectors) &&
3369 (rq->sector <= rq->hard_sector)) { 3373 (rq->sector <= rq->hard_sector)) {
3370 rq->sector = rq->hard_sector; 3374 rq->sector = rq->hard_sector;
3371 rq->nr_sectors = rq->hard_nr_sectors; 3375 rq->nr_sectors = rq->hard_nr_sectors;
3372 rq->hard_cur_sectors = bio_cur_sectors(rq->bio); 3376 rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
3373 rq->current_nr_sectors = rq->hard_cur_sectors; 3377 rq->current_nr_sectors = rq->hard_cur_sectors;
3374 rq->buffer = bio_data(rq->bio); 3378 rq->buffer = bio_data(rq->bio);
3375 } 3379 }
3376 3380
3377 /* 3381 /*
3378 * if total number of sectors is less than the first segment 3382 * if total number of sectors is less than the first segment
3379 * size, something has gone terribly wrong 3383 * size, something has gone terribly wrong
3380 */ 3384 */
3381 if (rq->nr_sectors < rq->current_nr_sectors) { 3385 if (rq->nr_sectors < rq->current_nr_sectors) {
3382 printk("blk: request botched\n"); 3386 printk("blk: request botched\n");
3383 rq->nr_sectors = rq->current_nr_sectors; 3387 rq->nr_sectors = rq->current_nr_sectors;
3384 } 3388 }
3385 } 3389 }
3386 } 3390 }
3387 3391
3388 static int __end_that_request_first(struct request *req, int uptodate, 3392 static int __end_that_request_first(struct request *req, int uptodate,
3389 int nr_bytes) 3393 int nr_bytes)
3390 { 3394 {
3391 int total_bytes, bio_nbytes, error, next_idx = 0; 3395 int total_bytes, bio_nbytes, error, next_idx = 0;
3392 struct bio *bio; 3396 struct bio *bio;
3393 3397
3394 blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE); 3398 blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
3395 3399
3396 /* 3400 /*
3397 * extend uptodate bool to allow < 0 value to be direct io error 3401 * extend uptodate bool to allow < 0 value to be direct io error
3398 */ 3402 */
3399 error = 0; 3403 error = 0;
3400 if (end_io_error(uptodate)) 3404 if (end_io_error(uptodate))
3401 error = !uptodate ? -EIO : uptodate; 3405 error = !uptodate ? -EIO : uptodate;
3402 3406
3403 /* 3407 /*
3404 * for a REQ_BLOCK_PC request, we want to carry any eventual 3408 * for a REQ_BLOCK_PC request, we want to carry any eventual
3405 * sense key with us all the way through 3409 * sense key with us all the way through
3406 */ 3410 */
3407 if (!blk_pc_request(req)) 3411 if (!blk_pc_request(req))
3408 req->errors = 0; 3412 req->errors = 0;
3409 3413
3410 if (!uptodate) { 3414 if (!uptodate) {
3411 if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET)) 3415 if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))
3412 printk("end_request: I/O error, dev %s, sector %llu\n", 3416 printk("end_request: I/O error, dev %s, sector %llu\n",
3413 req->rq_disk ? req->rq_disk->disk_name : "?", 3417 req->rq_disk ? req->rq_disk->disk_name : "?",
3414 (unsigned long long)req->sector); 3418 (unsigned long long)req->sector);
3415 } 3419 }
3416 3420
3417 if (blk_fs_request(req) && req->rq_disk) { 3421 if (blk_fs_request(req) && req->rq_disk) {
3418 const int rw = rq_data_dir(req); 3422 const int rw = rq_data_dir(req);
3419 3423
3420 disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9); 3424 disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9);
3421 } 3425 }
3422 3426
3423 total_bytes = bio_nbytes = 0; 3427 total_bytes = bio_nbytes = 0;
3424 while ((bio = req->bio) != NULL) { 3428 while ((bio = req->bio) != NULL) {
3425 int nbytes; 3429 int nbytes;
3426 3430
3427 if (nr_bytes >= bio->bi_size) { 3431 if (nr_bytes >= bio->bi_size) {
3428 req->bio = bio->bi_next; 3432 req->bio = bio->bi_next;
3429 nbytes = bio->bi_size; 3433 nbytes = bio->bi_size;
3430 if (!ordered_bio_endio(req, bio, nbytes, error)) 3434 if (!ordered_bio_endio(req, bio, nbytes, error))
3431 bio_endio(bio, nbytes, error); 3435 bio_endio(bio, nbytes, error);
3432 next_idx = 0; 3436 next_idx = 0;
3433 bio_nbytes = 0; 3437 bio_nbytes = 0;
3434 } else { 3438 } else {
3435 int idx = bio->bi_idx + next_idx; 3439 int idx = bio->bi_idx + next_idx;
3436 3440
3437 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { 3441 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
3438 blk_dump_rq_flags(req, "__end_that"); 3442 blk_dump_rq_flags(req, "__end_that");
3439 printk("%s: bio idx %d >= vcnt %d\n", 3443 printk("%s: bio idx %d >= vcnt %d\n",
3440 __FUNCTION__, 3444 __FUNCTION__,
3441 bio->bi_idx, bio->bi_vcnt); 3445 bio->bi_idx, bio->bi_vcnt);
3442 break; 3446 break;
3443 } 3447 }
3444 3448
3445 nbytes = bio_iovec_idx(bio, idx)->bv_len; 3449 nbytes = bio_iovec_idx(bio, idx)->bv_len;
3446 BIO_BUG_ON(nbytes > bio->bi_size); 3450 BIO_BUG_ON(nbytes > bio->bi_size);
3447 3451
3448 /* 3452 /*
3449 * not a complete bvec done 3453 * not a complete bvec done
3450 */ 3454 */
3451 if (unlikely(nbytes > nr_bytes)) { 3455 if (unlikely(nbytes > nr_bytes)) {
3452 bio_nbytes += nr_bytes; 3456 bio_nbytes += nr_bytes;
3453 total_bytes += nr_bytes; 3457 total_bytes += nr_bytes;
3454 break; 3458 break;
3455 } 3459 }
3456 3460
3457 /* 3461 /*
3458 * advance to the next vector 3462 * advance to the next vector
3459 */ 3463 */
3460 next_idx++; 3464 next_idx++;
3461 bio_nbytes += nbytes; 3465 bio_nbytes += nbytes;
3462 } 3466 }
3463 3467
3464 total_bytes += nbytes; 3468 total_bytes += nbytes;
3465 nr_bytes -= nbytes; 3469 nr_bytes -= nbytes;
3466 3470
3467 if ((bio = req->bio)) { 3471 if ((bio = req->bio)) {
3468 /* 3472 /*
3469 * end more in this run, or just return 'not-done' 3473 * end more in this run, or just return 'not-done'
3470 */ 3474 */
3471 if (unlikely(nr_bytes <= 0)) 3475 if (unlikely(nr_bytes <= 0))
3472 break; 3476 break;
3473 } 3477 }
3474 } 3478 }
3475 3479
3476 /* 3480 /*
3477 * completely done 3481 * completely done
3478 */ 3482 */
3479 if (!req->bio) 3483 if (!req->bio)
3480 return 0; 3484 return 0;
3481 3485
3482 /* 3486 /*
3483 * if the request wasn't completed, update state 3487 * if the request wasn't completed, update state
3484 */ 3488 */
3485 if (bio_nbytes) { 3489 if (bio_nbytes) {
3486 if (!ordered_bio_endio(req, bio, bio_nbytes, error)) 3490 if (!ordered_bio_endio(req, bio, bio_nbytes, error))
3487 bio_endio(bio, bio_nbytes, error); 3491 bio_endio(bio, bio_nbytes, error);
3488 bio->bi_idx += next_idx; 3492 bio->bi_idx += next_idx;
3489 bio_iovec(bio)->bv_offset += nr_bytes; 3493 bio_iovec(bio)->bv_offset += nr_bytes;
3490 bio_iovec(bio)->bv_len -= nr_bytes; 3494 bio_iovec(bio)->bv_len -= nr_bytes;
3491 } 3495 }
3492 3496
3493 blk_recalc_rq_sectors(req, total_bytes >> 9); 3497 blk_recalc_rq_sectors(req, total_bytes >> 9);
3494 blk_recalc_rq_segments(req); 3498 blk_recalc_rq_segments(req);
3495 return 1; 3499 return 1;
3496 } 3500 }
3497 3501
3498 /** 3502 /**
3499 * end_that_request_first - end I/O on a request 3503 * end_that_request_first - end I/O on a request
3500 * @req: the request being processed 3504 * @req: the request being processed
3501 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3505 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3502 * @nr_sectors: number of sectors to end I/O on 3506 * @nr_sectors: number of sectors to end I/O on
3503 * 3507 *
3504 * Description: 3508 * Description:
3505 * Ends I/O on a number of sectors attached to @req, and sets it up 3509 * Ends I/O on a number of sectors attached to @req, and sets it up
3506 * for the next range of segments (if any) in the cluster. 3510 * for the next range of segments (if any) in the cluster.
3507 * 3511 *
3508 * Return: 3512 * Return:
3509 * 0 - we are done with this request, call end_that_request_last() 3513 * 0 - we are done with this request, call end_that_request_last()
3510 * 1 - still buffers pending for this request 3514 * 1 - still buffers pending for this request
3511 **/ 3515 **/
3512 int end_that_request_first(struct request *req, int uptodate, int nr_sectors) 3516 int end_that_request_first(struct request *req, int uptodate, int nr_sectors)
3513 { 3517 {
3514 return __end_that_request_first(req, uptodate, nr_sectors << 9); 3518 return __end_that_request_first(req, uptodate, nr_sectors << 9);
3515 } 3519 }
3516 3520
3517 EXPORT_SYMBOL(end_that_request_first); 3521 EXPORT_SYMBOL(end_that_request_first);
3518 3522
3519 /** 3523 /**
3520 * end_that_request_chunk - end I/O on a request 3524 * end_that_request_chunk - end I/O on a request
3521 * @req: the request being processed 3525 * @req: the request being processed
3522 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3526 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3523 * @nr_bytes: number of bytes to complete 3527 * @nr_bytes: number of bytes to complete
3524 * 3528 *
3525 * Description: 3529 * Description:
3526 * Ends I/O on a number of bytes attached to @req, and sets it up 3530 * Ends I/O on a number of bytes attached to @req, and sets it up
3527 * for the next range of segments (if any). Like end_that_request_first(), 3531 * for the next range of segments (if any). Like end_that_request_first(),
3528 * but deals with bytes instead of sectors. 3532 * but deals with bytes instead of sectors.
3529 * 3533 *
3530 * Return: 3534 * Return:
3531 * 0 - we are done with this request, call end_that_request_last() 3535 * 0 - we are done with this request, call end_that_request_last()
3532 * 1 - still buffers pending for this request 3536 * 1 - still buffers pending for this request
3533 **/ 3537 **/
3534 int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes) 3538 int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes)
3535 { 3539 {
3536 return __end_that_request_first(req, uptodate, nr_bytes); 3540 return __end_that_request_first(req, uptodate, nr_bytes);
3537 } 3541 }
3538 3542
3539 EXPORT_SYMBOL(end_that_request_chunk); 3543 EXPORT_SYMBOL(end_that_request_chunk);
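end_that_request_first() and end_that_request_chunk() differ only in units: the former shifts sectors by 9 (512-byte sectors) before handing off to the common byte-based helper, which retires whole bios while the byte budget covers them and shrinks the current vector when a completion ends mid-segment. The sketch below models just that accounting over a flat array of segment lengths; it is plain user-space C with made-up names, not the kernel's bio/bvec machinery.

#include <stdio.h>

/* Remaining length of each pending segment, front of the "request" first. */
static unsigned int segs[] = { 4096, 4096, 2048 };
static unsigned int nsegs = 3;
static unsigned int cur;            /* index of the first un-retired segment */

/* Retire nr_bytes of the request; returns 1 if data is still pending,
 * 0 if the whole request is done (mirroring the 0/1 return convention). */
static int complete_chunk(unsigned int nr_bytes)
{
    while (cur < nsegs && nr_bytes) {
        if (nr_bytes >= segs[cur]) {        /* whole segment finished */
            nr_bytes -= segs[cur];
            segs[cur] = 0;
            cur++;
        } else {                            /* partial: shrink in place */
            segs[cur] -= nr_bytes;
            nr_bytes = 0;
        }
    }
    return cur < nsegs;
}

int main(void)
{
    printf("pending after 4096: %d\n", complete_chunk(4096));  /* 1 */
    printf("pending after 5120: %d\n", complete_chunk(5120));  /* 1, ends mid-segment */
    printf("pending after 1024: %d\n", complete_chunk(1024));  /* 0, all done */
    return 0;
}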
3540 3544
3541 /* 3545 /*
3542 * splice the completion data to a local structure and hand off to 3546 * splice the completion data to a local structure and hand off to
3543 * process_completion_queue() to complete the requests 3547 * process_completion_queue() to complete the requests
3544 */ 3548 */
3545 static void blk_done_softirq(struct softirq_action *h) 3549 static void blk_done_softirq(struct softirq_action *h)
3546 { 3550 {
3547 struct list_head *cpu_list, local_list; 3551 struct list_head *cpu_list, local_list;
3548 3552
3549 local_irq_disable(); 3553 local_irq_disable();
3550 cpu_list = &__get_cpu_var(blk_cpu_done); 3554 cpu_list = &__get_cpu_var(blk_cpu_done);
3551 list_replace_init(cpu_list, &local_list); 3555 list_replace_init(cpu_list, &local_list);
3552 local_irq_enable(); 3556 local_irq_enable();
3553 3557
3554 while (!list_empty(&local_list)) { 3558 while (!list_empty(&local_list)) {
3555 struct request *rq = list_entry(local_list.next, struct request, donelist); 3559 struct request *rq = list_entry(local_list.next, struct request, donelist);
3556 3560
3557 list_del_init(&rq->donelist); 3561 list_del_init(&rq->donelist);
3558 rq->q->softirq_done_fn(rq); 3562 rq->q->softirq_done_fn(rq);
3559 } 3563 }
3560 } 3564 }
3561 3565
3562 static int blk_cpu_notify(struct notifier_block *self, unsigned long action, 3566 static int blk_cpu_notify(struct notifier_block *self, unsigned long action,
3563 void *hcpu) 3567 void *hcpu)
3564 { 3568 {
3565 /* 3569 /*
3566 * If a CPU goes away, splice its entries to the current CPU 3570 * If a CPU goes away, splice its entries to the current CPU
3567 * and trigger a run of the softirq 3571 * and trigger a run of the softirq
3568 */ 3572 */
3569 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 3573 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
3570 int cpu = (unsigned long) hcpu; 3574 int cpu = (unsigned long) hcpu;
3571 3575
3572 local_irq_disable(); 3576 local_irq_disable();
3573 list_splice_init(&per_cpu(blk_cpu_done, cpu), 3577 list_splice_init(&per_cpu(blk_cpu_done, cpu),
3574 &__get_cpu_var(blk_cpu_done)); 3578 &__get_cpu_var(blk_cpu_done));
3575 raise_softirq_irqoff(BLOCK_SOFTIRQ); 3579 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3576 local_irq_enable(); 3580 local_irq_enable();
3577 } 3581 }
3578 3582
3579 return NOTIFY_OK; 3583 return NOTIFY_OK;
3580 } 3584 }
3581 3585
3582 3586
3583 static struct notifier_block __devinitdata blk_cpu_notifier = { 3587 static struct notifier_block __devinitdata blk_cpu_notifier = {
3584 .notifier_call = blk_cpu_notify, 3588 .notifier_call = blk_cpu_notify,
3585 }; 3589 };
3586 3590
3587 /** 3591 /**
3588 * blk_complete_request - end I/O on a request 3592 * blk_complete_request - end I/O on a request
3589 * @req: the request being processed 3593 * @req: the request being processed
3590 * 3594 *
3591 * Description: 3595 * Description:
3592 * Ends all I/O on a request. It does not handle partial completions, 3596 * Ends all I/O on a request. It does not handle partial completions,
3593 * unless the driver actually implements this in its completion callback 3597 * unless the driver actually implements this in its completion callback
3594 * through requeueing. The actual completion happens out-of-order, 3598 * through requeueing. The actual completion happens out-of-order,
3595 * through a softirq handler. The user must have registered a completion 3599 * through a softirq handler. The user must have registered a completion
3596 * callback through blk_queue_softirq_done(). 3600 * callback through blk_queue_softirq_done().
3597 **/ 3601 **/
3598 3602
3599 void blk_complete_request(struct request *req) 3603 void blk_complete_request(struct request *req)
3600 { 3604 {
3601 struct list_head *cpu_list; 3605 struct list_head *cpu_list;
3602 unsigned long flags; 3606 unsigned long flags;
3603 3607
3604 BUG_ON(!req->q->softirq_done_fn); 3608 BUG_ON(!req->q->softirq_done_fn);
3605 3609
3606 local_irq_save(flags); 3610 local_irq_save(flags);
3607 3611
3608 cpu_list = &__get_cpu_var(blk_cpu_done); 3612 cpu_list = &__get_cpu_var(blk_cpu_done);
3609 list_add_tail(&req->donelist, cpu_list); 3613 list_add_tail(&req->donelist, cpu_list);
3610 raise_softirq_irqoff(BLOCK_SOFTIRQ); 3614 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3611 3615
3612 local_irq_restore(flags); 3616 local_irq_restore(flags);
3613 } 3617 }
3614 3618
3615 EXPORT_SYMBOL(blk_complete_request); 3619 EXPORT_SYMBOL(blk_complete_request);
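blk_complete_request() queues the finished request on a per-CPU list and raises a softirq; blk_done_softirq() then splices that list to a local head with interrupts disabled and walks it with interrupts enabled. The sketch below models the splice-then-process pattern in user space, with a pthread mutex standing in for the local_irq_disable()/enable() pair and a head-push instead of the kernel's tail-add; names are illustrative only.

#include <stdio.h>
#include <pthread.h>

struct done_item {
    int id;
    struct done_item *next;
};

/* Shared "per-CPU" completion list; the mutex stands in for interrupt
 * protection around the real per-CPU list. */
static struct done_item *done_list;
static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;

/* Producer side, analogous to blk_complete_request(): queue and return. */
static void complete_later(struct done_item *it)
{
    pthread_mutex_lock(&done_lock);
    it->next = done_list;
    done_list = it;
    pthread_mutex_unlock(&done_lock);
}

/* Consumer side, analogous to blk_done_softirq(): splice the whole list
 * to a local pointer under the lock, then process it unlocked. */
static void run_completions(void)
{
    struct done_item *local;

    pthread_mutex_lock(&done_lock);
    local = done_list;
    done_list = NULL;
    pthread_mutex_unlock(&done_lock);

    while (local) {
        struct done_item *it = local;
        local = it->next;
        printf("completing request %d\n", it->id);
    }
}

int main(void)
{
    struct done_item a = { .id = 1 }, b = { .id = 2 };

    complete_later(&a);
    complete_later(&b);
    run_completions();   /* prints 2 then 1 (head push here; order is immaterial) */
    return 0;
}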
3616 3620
3617 /* 3621 /*
3618 * queue lock must be held 3622 * queue lock must be held
3619 */ 3623 */
3620 void end_that_request_last(struct request *req, int uptodate) 3624 void end_that_request_last(struct request *req, int uptodate)
3621 { 3625 {
3622 struct gendisk *disk = req->rq_disk; 3626 struct gendisk *disk = req->rq_disk;
3623 int error; 3627 int error;
3624 3628
3625 /* 3629 /*
3626 * extend uptodate bool to allow < 0 value to be direct io error 3630 * extend uptodate bool to allow < 0 value to be direct io error
3627 */ 3631 */
3628 error = 0; 3632 error = 0;
3629 if (end_io_error(uptodate)) 3633 if (end_io_error(uptodate))
3630 error = !uptodate ? -EIO : uptodate; 3634 error = !uptodate ? -EIO : uptodate;
3631 3635
3632 if (unlikely(laptop_mode) && blk_fs_request(req)) 3636 if (unlikely(laptop_mode) && blk_fs_request(req))
3633 laptop_io_completion(); 3637 laptop_io_completion();
3634 3638
3635 /* 3639 /*
3636 * Account IO completion. bar_rq isn't accounted as a normal 3640 * Account IO completion. bar_rq isn't accounted as a normal
3637 * IO on queueing nor completion. Accounting the containing 3641 * IO on queueing nor completion. Accounting the containing
3638 * request is enough. 3642 * request is enough.
3639 */ 3643 */
3640 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { 3644 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
3641 unsigned long duration = jiffies - req->start_time; 3645 unsigned long duration = jiffies - req->start_time;
3642 const int rw = rq_data_dir(req); 3646 const int rw = rq_data_dir(req);
3643 3647
3644 __disk_stat_inc(disk, ios[rw]); 3648 __disk_stat_inc(disk, ios[rw]);
3645 __disk_stat_add(disk, ticks[rw], duration); 3649 __disk_stat_add(disk, ticks[rw], duration);
3646 disk_round_stats(disk); 3650 disk_round_stats(disk);
3647 disk->in_flight--; 3651 disk->in_flight--;
3648 } 3652 }
3649 if (req->end_io) 3653 if (req->end_io)
3650 req->end_io(req, error); 3654 req->end_io(req, error);
3651 else 3655 else
3652 __blk_put_request(req->q, req); 3656 __blk_put_request(req->q, req);
3653 } 3657 }
3654 3658
3655 EXPORT_SYMBOL(end_that_request_last); 3659 EXPORT_SYMBOL(end_that_request_last);
3656 3660
3657 void end_request(struct request *req, int uptodate) 3661 void end_request(struct request *req, int uptodate)
3658 { 3662 {
3659 if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) { 3663 if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) {
3660 add_disk_randomness(req->rq_disk); 3664 add_disk_randomness(req->rq_disk);
3661 blkdev_dequeue_request(req); 3665 blkdev_dequeue_request(req);
3662 end_that_request_last(req, uptodate); 3666 end_that_request_last(req, uptodate);
3663 } 3667 }
3664 } 3668 }
3665 3669
3666 EXPORT_SYMBOL(end_request); 3670 EXPORT_SYMBOL(end_request);
3667 3671
3668 void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 3672 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
3669 struct bio *bio) 3673 struct bio *bio)
3670 { 3674 {
3671 /* first two bits are identical in rq->cmd_flags and bio->bi_rw */ 3675 /* first two bits are identical in rq->cmd_flags and bio->bi_rw */
3672 rq->cmd_flags |= (bio->bi_rw & 3); 3676 rq->cmd_flags |= (bio->bi_rw & 3);
3673 3677
3674 rq->nr_phys_segments = bio_phys_segments(q, bio); 3678 rq->nr_phys_segments = bio_phys_segments(q, bio);
3675 rq->nr_hw_segments = bio_hw_segments(q, bio); 3679 rq->nr_hw_segments = bio_hw_segments(q, bio);
3676 rq->current_nr_sectors = bio_cur_sectors(bio); 3680 rq->current_nr_sectors = bio_cur_sectors(bio);
3677 rq->hard_cur_sectors = rq->current_nr_sectors; 3681 rq->hard_cur_sectors = rq->current_nr_sectors;
3678 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); 3682 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
3679 rq->buffer = bio_data(bio); 3683 rq->buffer = bio_data(bio);
3680 rq->data_len = bio->bi_size; 3684 rq->data_len = bio->bi_size;
3681 3685
3682 rq->bio = rq->biotail = bio; 3686 rq->bio = rq->biotail = bio;
3683 } 3687 }
3684 3688
3685 EXPORT_SYMBOL(blk_rq_bio_prep); 3689 EXPORT_SYMBOL(blk_rq_bio_prep);
3686 3690
3687 int kblockd_schedule_work(struct work_struct *work) 3691 int kblockd_schedule_work(struct work_struct *work)
3688 { 3692 {
3689 return queue_work(kblockd_workqueue, work); 3693 return queue_work(kblockd_workqueue, work);
3690 } 3694 }
3691 3695
3692 EXPORT_SYMBOL(kblockd_schedule_work); 3696 EXPORT_SYMBOL(kblockd_schedule_work);
3693 3697
3694 void kblockd_flush_work(struct work_struct *work) 3698 void kblockd_flush_work(struct work_struct *work)
3695 { 3699 {
3696 cancel_work_sync(work); 3700 cancel_work_sync(work);
3697 } 3701 }
3698 EXPORT_SYMBOL(kblockd_flush_work); 3702 EXPORT_SYMBOL(kblockd_flush_work);
3699 3703
3700 int __init blk_dev_init(void) 3704 int __init blk_dev_init(void)
3701 { 3705 {
3702 int i; 3706 int i;
3703 3707
3704 kblockd_workqueue = create_workqueue("kblockd"); 3708 kblockd_workqueue = create_workqueue("kblockd");
3705 if (!kblockd_workqueue) 3709 if (!kblockd_workqueue)
3706 panic("Failed to create kblockd\n"); 3710 panic("Failed to create kblockd\n");
3707 3711
3708 request_cachep = kmem_cache_create("blkdev_requests", 3712 request_cachep = kmem_cache_create("blkdev_requests",
3709 sizeof(struct request), 0, SLAB_PANIC, NULL); 3713 sizeof(struct request), 0, SLAB_PANIC, NULL);
3710 3714
3711 requestq_cachep = kmem_cache_create("blkdev_queue", 3715 requestq_cachep = kmem_cache_create("blkdev_queue",
3712 sizeof(struct request_queue), 0, SLAB_PANIC, NULL); 3716 sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
3713 3717
3714 iocontext_cachep = kmem_cache_create("blkdev_ioc", 3718 iocontext_cachep = kmem_cache_create("blkdev_ioc",
3715 sizeof(struct io_context), 0, SLAB_PANIC, NULL); 3719 sizeof(struct io_context), 0, SLAB_PANIC, NULL);
3716 3720
3717 for_each_possible_cpu(i) 3721 for_each_possible_cpu(i)
3718 INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); 3722 INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
3719 3723
3720 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); 3724 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
3721 register_hotcpu_notifier(&blk_cpu_notifier); 3725 register_hotcpu_notifier(&blk_cpu_notifier);
3722 3726
3723 blk_max_low_pfn = max_low_pfn - 1; 3727 blk_max_low_pfn = max_low_pfn - 1;
3724 blk_max_pfn = max_pfn - 1; 3728 blk_max_pfn = max_pfn - 1;
3725 3729
3726 return 0; 3730 return 0;
3727 } 3731 }
3728 3732
3729 /* 3733 /*
3730 * IO Context helper functions 3734 * IO Context helper functions
3731 */ 3735 */
3732 void put_io_context(struct io_context *ioc) 3736 void put_io_context(struct io_context *ioc)
3733 { 3737 {
3734 if (ioc == NULL) 3738 if (ioc == NULL)
3735 return; 3739 return;
3736 3740
3737 BUG_ON(atomic_read(&ioc->refcount) == 0); 3741 BUG_ON(atomic_read(&ioc->refcount) == 0);
3738 3742
3739 if (atomic_dec_and_test(&ioc->refcount)) { 3743 if (atomic_dec_and_test(&ioc->refcount)) {
3740 struct cfq_io_context *cic; 3744 struct cfq_io_context *cic;
3741 3745
3742 rcu_read_lock(); 3746 rcu_read_lock();
3743 if (ioc->aic && ioc->aic->dtor) 3747 if (ioc->aic && ioc->aic->dtor)
3744 ioc->aic->dtor(ioc->aic); 3748 ioc->aic->dtor(ioc->aic);
3745 if (ioc->cic_root.rb_node != NULL) { 3749 if (ioc->cic_root.rb_node != NULL) {
3746 struct rb_node *n = rb_first(&ioc->cic_root); 3750 struct rb_node *n = rb_first(&ioc->cic_root);
3747 3751
3748 cic = rb_entry(n, struct cfq_io_context, rb_node); 3752 cic = rb_entry(n, struct cfq_io_context, rb_node);
3749 cic->dtor(ioc); 3753 cic->dtor(ioc);
3750 } 3754 }
3751 rcu_read_unlock(); 3755 rcu_read_unlock();
3752 3756
3753 kmem_cache_free(iocontext_cachep, ioc); 3757 kmem_cache_free(iocontext_cachep, ioc);
3754 } 3758 }
3755 } 3759 }
3756 EXPORT_SYMBOL(put_io_context); 3760 EXPORT_SYMBOL(put_io_context);
3757 3761
3758 /* Called by the exiting task */ 3762 /* Called by the exiting task */
3759 void exit_io_context(void) 3763 void exit_io_context(void)
3760 { 3764 {
3761 struct io_context *ioc; 3765 struct io_context *ioc;
3762 struct cfq_io_context *cic; 3766 struct cfq_io_context *cic;
3763 3767
3764 task_lock(current); 3768 task_lock(current);
3765 ioc = current->io_context; 3769 ioc = current->io_context;
3766 current->io_context = NULL; 3770 current->io_context = NULL;
3767 task_unlock(current); 3771 task_unlock(current);
3768 3772
3769 ioc->task = NULL; 3773 ioc->task = NULL;
3770 if (ioc->aic && ioc->aic->exit) 3774 if (ioc->aic && ioc->aic->exit)
3771 ioc->aic->exit(ioc->aic); 3775 ioc->aic->exit(ioc->aic);
3772 if (ioc->cic_root.rb_node != NULL) { 3776 if (ioc->cic_root.rb_node != NULL) {
3773 cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node); 3777 cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node);
3774 cic->exit(ioc); 3778 cic->exit(ioc);
3775 } 3779 }
3776 3780
3777 put_io_context(ioc); 3781 put_io_context(ioc);
3778 } 3782 }
3779 3783
3780 /* 3784 /*
3781 * If the current task has no IO context then create one and initialise it. 3785 * If the current task has no IO context then create one and initialise it.
3782 * Otherwise, return its existing IO context. 3786 * Otherwise, return its existing IO context.
3783 * 3787 *
3784 * This returned IO context doesn't have a specifically elevated refcount, 3788 * This returned IO context doesn't have a specifically elevated refcount,
3785 * but since the current task itself holds a reference, the context can be 3789 * but since the current task itself holds a reference, the context can be
3786 * used in general code, so long as it stays within `current` context. 3790 * used in general code, so long as it stays within `current` context.
3787 */ 3791 */
3788 static struct io_context *current_io_context(gfp_t gfp_flags, int node) 3792 static struct io_context *current_io_context(gfp_t gfp_flags, int node)
3789 { 3793 {
3790 struct task_struct *tsk = current; 3794 struct task_struct *tsk = current;
3791 struct io_context *ret; 3795 struct io_context *ret;
3792 3796
3793 ret = tsk->io_context; 3797 ret = tsk->io_context;
3794 if (likely(ret)) 3798 if (likely(ret))
3795 return ret; 3799 return ret;
3796 3800
3797 ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); 3801 ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
3798 if (ret) { 3802 if (ret) {
3799 atomic_set(&ret->refcount, 1); 3803 atomic_set(&ret->refcount, 1);
3800 ret->task = current; 3804 ret->task = current;
3801 ret->ioprio_changed = 0; 3805 ret->ioprio_changed = 0;
3802 ret->last_waited = jiffies; /* doesn't matter... */ 3806 ret->last_waited = jiffies; /* doesn't matter... */
3803 ret->nr_batch_requests = 0; /* because this is 0 */ 3807 ret->nr_batch_requests = 0; /* because this is 0 */
3804 ret->aic = NULL; 3808 ret->aic = NULL;
3805 ret->cic_root.rb_node = NULL; 3809 ret->cic_root.rb_node = NULL;
3806 ret->ioc_data = NULL; 3810 ret->ioc_data = NULL;
3807 /* make sure set_task_ioprio() sees the settings above */ 3811 /* make sure set_task_ioprio() sees the settings above */
3808 smp_wmb(); 3812 smp_wmb();
3809 tsk->io_context = ret; 3813 tsk->io_context = ret;
3810 } 3814 }
3811 3815
3812 return ret; 3816 return ret;
3813 } 3817 }
3814 3818
3815 /* 3819 /*
3816 * If the current task has no IO context then create one and initialise it. 3820 * If the current task has no IO context then create one and initialise it.
3817 * If it does have a context, take a ref on it. 3821 * If it does have a context, take a ref on it.
3818 * 3822 *
3819 * This is always called in the context of the task which submitted the I/O. 3823 * This is always called in the context of the task which submitted the I/O.
3820 */ 3824 */
3821 struct io_context *get_io_context(gfp_t gfp_flags, int node) 3825 struct io_context *get_io_context(gfp_t gfp_flags, int node)
3822 { 3826 {
3823 struct io_context *ret; 3827 struct io_context *ret;
3824 ret = current_io_context(gfp_flags, node); 3828 ret = current_io_context(gfp_flags, node);
3825 if (likely(ret)) 3829 if (likely(ret))
3826 atomic_inc(&ret->refcount); 3830 atomic_inc(&ret->refcount);
3827 return ret; 3831 return ret;
3828 } 3832 }
3829 EXPORT_SYMBOL(get_io_context); 3833 EXPORT_SYMBOL(get_io_context);
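current_io_context() lazily allocates a per-task context whose initial reference is held by the task itself, while get_io_context() additionally takes a reference for the caller and put_io_context() drops one. The following is a hedged single-threaded user-space model of that ownership scheme with a plain counter instead of atomics; the struct and function names are invented for the example.

#include <stdio.h>
#include <stdlib.h>

struct ctx {
    int refcount;
};

/* Stand-in for current->io_context: one slot for a single "task". */
static struct ctx *task_ctx;

/* Lazily create the task's context; the task itself owns one reference. */
static struct ctx *current_ctx(void)
{
    if (!task_ctx) {
        task_ctx = calloc(1, sizeof(*task_ctx));
        if (task_ctx)
            task_ctx->refcount = 1;
    }
    return task_ctx;
}

/* get/put pair mirroring the get_io_context()/put_io_context() contract. */
static struct ctx *get_ctx(void)
{
    struct ctx *c = current_ctx();

    if (c)
        c->refcount++;
    return c;
}

static void put_ctx(struct ctx *c)
{
    if (!c)
        return;
    if (--c->refcount == 0) {
        if (c == task_ctx)
            task_ctx = NULL;
        free(c);
    }
}

int main(void)
{
    struct ctx *c = get_ctx();           /* task holds 1, caller holds 1 -> 2 */

    printf("refcount after get: %d\n", c->refcount);
    put_ctx(c);                          /* caller drops its reference */
    put_ctx(task_ctx);                   /* "task exit" drops the last one */
    return 0;
}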
3830 3834
3831 void copy_io_context(struct io_context **pdst, struct io_context **psrc) 3835 void copy_io_context(struct io_context **pdst, struct io_context **psrc)
3832 { 3836 {
3833 struct io_context *src = *psrc; 3837 struct io_context *src = *psrc;
3834 struct io_context *dst = *pdst; 3838 struct io_context *dst = *pdst;
3835 3839
3836 if (src) { 3840 if (src) {
3837 BUG_ON(atomic_read(&src->refcount) == 0); 3841 BUG_ON(atomic_read(&src->refcount) == 0);
3838 atomic_inc(&src->refcount); 3842 atomic_inc(&src->refcount);
3839 put_io_context(dst); 3843 put_io_context(dst);
3840 *pdst = src; 3844 *pdst = src;
3841 } 3845 }
3842 } 3846 }
3843 EXPORT_SYMBOL(copy_io_context); 3847 EXPORT_SYMBOL(copy_io_context);
3844 3848
3845 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) 3849 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2)
3846 { 3850 {
3847 struct io_context *temp; 3851 struct io_context *temp;
3848 temp = *ioc1; 3852 temp = *ioc1;
3849 *ioc1 = *ioc2; 3853 *ioc1 = *ioc2;
3850 *ioc2 = temp; 3854 *ioc2 = temp;
3851 } 3855 }
3852 EXPORT_SYMBOL(swap_io_context); 3856 EXPORT_SYMBOL(swap_io_context);
3853 3857
3854 /* 3858 /*
3855 * sysfs parts below 3859 * sysfs parts below
3856 */ 3860 */
3857 struct queue_sysfs_entry { 3861 struct queue_sysfs_entry {
3858 struct attribute attr; 3862 struct attribute attr;
3859 ssize_t (*show)(struct request_queue *, char *); 3863 ssize_t (*show)(struct request_queue *, char *);
3860 ssize_t (*store)(struct request_queue *, const char *, size_t); 3864 ssize_t (*store)(struct request_queue *, const char *, size_t);
3861 }; 3865 };
3862 3866
3863 static ssize_t 3867 static ssize_t
3864 queue_var_show(unsigned int var, char *page) 3868 queue_var_show(unsigned int var, char *page)
3865 { 3869 {
3866 return sprintf(page, "%d\n", var); 3870 return sprintf(page, "%d\n", var);
3867 } 3871 }
3868 3872
3869 static ssize_t 3873 static ssize_t
3870 queue_var_store(unsigned long *var, const char *page, size_t count) 3874 queue_var_store(unsigned long *var, const char *page, size_t count)
3871 { 3875 {
3872 char *p = (char *) page; 3876 char *p = (char *) page;
3873 3877
3874 *var = simple_strtoul(p, &p, 10); 3878 *var = simple_strtoul(p, &p, 10);
3875 return count; 3879 return count;
3876 } 3880 }
3877 3881
3878 static ssize_t queue_requests_show(struct request_queue *q, char *page) 3882 static ssize_t queue_requests_show(struct request_queue *q, char *page)
3879 { 3883 {
3880 return queue_var_show(q->nr_requests, (page)); 3884 return queue_var_show(q->nr_requests, (page));
3881 } 3885 }
3882 3886
3883 static ssize_t 3887 static ssize_t
3884 queue_requests_store(struct request_queue *q, const char *page, size_t count) 3888 queue_requests_store(struct request_queue *q, const char *page, size_t count)
3885 { 3889 {
3886 struct request_list *rl = &q->rq; 3890 struct request_list *rl = &q->rq;
3887 unsigned long nr; 3891 unsigned long nr;
3888 int ret = queue_var_store(&nr, page, count); 3892 int ret = queue_var_store(&nr, page, count);
3889 if (nr < BLKDEV_MIN_RQ) 3893 if (nr < BLKDEV_MIN_RQ)
3890 nr = BLKDEV_MIN_RQ; 3894 nr = BLKDEV_MIN_RQ;
3891 3895
3892 spin_lock_irq(q->queue_lock); 3896 spin_lock_irq(q->queue_lock);
3893 q->nr_requests = nr; 3897 q->nr_requests = nr;
3894 blk_queue_congestion_threshold(q); 3898 blk_queue_congestion_threshold(q);
3895 3899
3896 if (rl->count[READ] >= queue_congestion_on_threshold(q)) 3900 if (rl->count[READ] >= queue_congestion_on_threshold(q))
3897 blk_set_queue_congested(q, READ); 3901 blk_set_queue_congested(q, READ);
3898 else if (rl->count[READ] < queue_congestion_off_threshold(q)) 3902 else if (rl->count[READ] < queue_congestion_off_threshold(q))
3899 blk_clear_queue_congested(q, READ); 3903 blk_clear_queue_congested(q, READ);
3900 3904
3901 if (rl->count[WRITE] >= queue_congestion_on_threshold(q)) 3905 if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
3902 blk_set_queue_congested(q, WRITE); 3906 blk_set_queue_congested(q, WRITE);
3903 else if (rl->count[WRITE] < queue_congestion_off_threshold(q)) 3907 else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
3904 blk_clear_queue_congested(q, WRITE); 3908 blk_clear_queue_congested(q, WRITE);
3905 3909
3906 if (rl->count[READ] >= q->nr_requests) { 3910 if (rl->count[READ] >= q->nr_requests) {
3907 blk_set_queue_full(q, READ); 3911 blk_set_queue_full(q, READ);
3908 } else if (rl->count[READ]+1 <= q->nr_requests) { 3912 } else if (rl->count[READ]+1 <= q->nr_requests) {
3909 blk_clear_queue_full(q, READ); 3913 blk_clear_queue_full(q, READ);
3910 wake_up(&rl->wait[READ]); 3914 wake_up(&rl->wait[READ]);
3911 } 3915 }
3912 3916
3913 if (rl->count[WRITE] >= q->nr_requests) { 3917 if (rl->count[WRITE] >= q->nr_requests) {
3914 blk_set_queue_full(q, WRITE); 3918 blk_set_queue_full(q, WRITE);
3915 } else if (rl->count[WRITE]+1 <= q->nr_requests) { 3919 } else if (rl->count[WRITE]+1 <= q->nr_requests) {
3916 blk_clear_queue_full(q, WRITE); 3920 blk_clear_queue_full(q, WRITE);
3917 wake_up(&rl->wait[WRITE]); 3921 wake_up(&rl->wait[WRITE]);
3918 } 3922 }
3919 spin_unlock_irq(q->queue_lock); 3923 spin_unlock_irq(q->queue_lock);
3920 return ret; 3924 return ret;
3921 } 3925 }
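queue_requests_store() is the kernel side of a write to /sys/block/<dev>/queue/nr_requests: the value is clamped to BLKDEV_MIN_RQ and the congestion and queue-full state are recomputed under the queue lock. For illustration, a small userspace program exercising that round trip (the device name "sda" is an assumption, and the write requires root):

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/block/sda/queue/nr_requests";	/* assumed device */
	unsigned long nr = 0;
	FILE *f;

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%lu", &nr) != 1) {
		fclose(f);
		fprintf(stderr, "could not parse %s\n", path);
		return 1;
	}
	fclose(f);
	printf("current nr_requests: %lu\n", nr);

	/* double the queue depth; queue_requests_store() parses the string,
	 * clamps it and recomputes the congestion thresholds */
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "%lu\n", nr * 2);
	fclose(f);
	return 0;
}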
3922 3926
3923 static ssize_t queue_ra_show(struct request_queue *q, char *page) 3927 static ssize_t queue_ra_show(struct request_queue *q, char *page)
3924 { 3928 {
3925 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 3929 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
3926 3930
3927 return queue_var_show(ra_kb, (page)); 3931 return queue_var_show(ra_kb, (page));
3928 } 3932 }
3929 3933
3930 static ssize_t 3934 static ssize_t
3931 queue_ra_store(struct request_queue *q, const char *page, size_t count) 3935 queue_ra_store(struct request_queue *q, const char *page, size_t count)
3932 { 3936 {
3933 unsigned long ra_kb; 3937 unsigned long ra_kb;
3934 ssize_t ret = queue_var_store(&ra_kb, page, count); 3938 ssize_t ret = queue_var_store(&ra_kb, page, count);
3935 3939
3936 spin_lock_irq(q->queue_lock); 3940 spin_lock_irq(q->queue_lock);
3937 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); 3941 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
3938 spin_unlock_irq(q->queue_lock); 3942 spin_unlock_irq(q->queue_lock);
3939 3943
3940 return ret; 3944 return ret;
3941 } 3945 }
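read_ahead_kb is stored internally as a page count, so queue_ra_store() shifts by (PAGE_CACHE_SHIFT - 10) to convert kilobytes to pages and queue_ra_show() shifts back. A quick check of that arithmetic, assuming 4 KiB pages (PAGE_CACHE_SHIFT == 12):

#include <assert.h>

int main(void)
{
	unsigned long page_cache_shift = 12;	/* assumed 4 KiB pages */
	unsigned long ra_kb = 128;

	unsigned long ra_pages = ra_kb >> (page_cache_shift - 10);	/* 128 KiB -> 32 pages */
	assert(ra_pages == 32);
	assert((ra_pages << (page_cache_shift - 10)) == ra_kb);		/* and back again */
	return 0;
}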
3942 3946
3943 static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) 3947 static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
3944 { 3948 {
3945 int max_sectors_kb = q->max_sectors >> 1; 3949 int max_sectors_kb = q->max_sectors >> 1;
3946 3950
3947 return queue_var_show(max_sectors_kb, (page)); 3951 return queue_var_show(max_sectors_kb, (page));
3948 } 3952 }
3949 3953
3950 static ssize_t 3954 static ssize_t
3951 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 3955 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
3952 { 3956 {
3953 unsigned long max_sectors_kb, 3957 unsigned long max_sectors_kb,
3954 max_hw_sectors_kb = q->max_hw_sectors >> 1, 3958 max_hw_sectors_kb = q->max_hw_sectors >> 1,
3955 page_kb = 1 << (PAGE_CACHE_SHIFT - 10); 3959 page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
3956 ssize_t ret = queue_var_store(&max_sectors_kb, page, count); 3960 ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
3957 int ra_kb; 3961 int ra_kb;
3958 3962
3959 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) 3963 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
3960 return -EINVAL; 3964 return -EINVAL;
3961 /* 3965 /*
3962 * Take the queue lock to update the readahead and max_sectors 3966 * Take the queue lock to update the readahead and max_sectors
3963 * values synchronously: 3967 * values synchronously:
3964 */ 3968 */
3965 spin_lock_irq(q->queue_lock); 3969 spin_lock_irq(q->queue_lock);
3966 /* 3970 /*
3967 * Trim readahead window as well, if necessary: 3971 * Trim readahead window as well, if necessary:
3968 */ 3972 */
3969 ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 3973 ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
3970 if (ra_kb > max_sectors_kb) 3974 if (ra_kb > max_sectors_kb)
3971 q->backing_dev_info.ra_pages = 3975 q->backing_dev_info.ra_pages =
3972 max_sectors_kb >> (PAGE_CACHE_SHIFT - 10); 3976 max_sectors_kb >> (PAGE_CACHE_SHIFT - 10);
3973 3977
3974 q->max_sectors = max_sectors_kb << 1; 3978 q->max_sectors = max_sectors_kb << 1;
3975 spin_unlock_irq(q->queue_lock); 3979 spin_unlock_irq(q->queue_lock);
3976 3980
3977 return ret; 3981 return ret;
3978 } 3982 }
3979 3983
3980 static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) 3984 static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
3981 { 3985 {
3982 int max_hw_sectors_kb = q->max_hw_sectors >> 1; 3986 int max_hw_sectors_kb = q->max_hw_sectors >> 1;
3983 3987
3984 return queue_var_show(max_hw_sectors_kb, (page)); 3988 return queue_var_show(max_hw_sectors_kb, (page));
3985 } 3989 }
3986 3990
3987 3991
3988 static struct queue_sysfs_entry queue_requests_entry = { 3992 static struct queue_sysfs_entry queue_requests_entry = {
3989 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, 3993 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
3990 .show = queue_requests_show, 3994 .show = queue_requests_show,
3991 .store = queue_requests_store, 3995 .store = queue_requests_store,
3992 }; 3996 };
3993 3997
3994 static struct queue_sysfs_entry queue_ra_entry = { 3998 static struct queue_sysfs_entry queue_ra_entry = {
3995 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, 3999 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
3996 .show = queue_ra_show, 4000 .show = queue_ra_show,
3997 .store = queue_ra_store, 4001 .store = queue_ra_store,
3998 }; 4002 };
3999 4003
4000 static struct queue_sysfs_entry queue_max_sectors_entry = { 4004 static struct queue_sysfs_entry queue_max_sectors_entry = {
4001 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR }, 4005 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
4002 .show = queue_max_sectors_show, 4006 .show = queue_max_sectors_show,
4003 .store = queue_max_sectors_store, 4007 .store = queue_max_sectors_store,
4004 }; 4008 };
4005 4009
4006 static struct queue_sysfs_entry queue_max_hw_sectors_entry = { 4010 static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
4007 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO }, 4011 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
4008 .show = queue_max_hw_sectors_show, 4012 .show = queue_max_hw_sectors_show,
4009 }; 4013 };
4010 4014
4011 static struct queue_sysfs_entry queue_iosched_entry = { 4015 static struct queue_sysfs_entry queue_iosched_entry = {
4012 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, 4016 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
4013 .show = elv_iosched_show, 4017 .show = elv_iosched_show,
4014 .store = elv_iosched_store, 4018 .store = elv_iosched_store,
4015 }; 4019 };
4016 4020
4017 static struct attribute *default_attrs[] = { 4021 static struct attribute *default_attrs[] = {
4018 &queue_requests_entry.attr, 4022 &queue_requests_entry.attr,
4019 &queue_ra_entry.attr, 4023 &queue_ra_entry.attr,
4020 &queue_max_hw_sectors_entry.attr, 4024 &queue_max_hw_sectors_entry.attr,
4021 &queue_max_sectors_entry.attr, 4025 &queue_max_sectors_entry.attr,
4022 &queue_iosched_entry.attr, 4026 &queue_iosched_entry.attr,
4023 NULL, 4027 NULL,
4024 }; 4028 };
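Each tunable above follows the same recipe: a show (and optionally store) helper built on queue_var_show()/queue_var_store(), a struct queue_sysfs_entry naming the sysfs file, and a pointer in default_attrs[]. As a sketch only (this attribute does not exist in the tree), a read-only file exposing the congestion-on threshold would reuse the same pieces:

/* Sketch only -- not part of this patch. */
static ssize_t queue_congestion_on_show(struct request_queue *q, char *page)
{
	return queue_var_show(queue_congestion_on_threshold(q), page);
}

static struct queue_sysfs_entry queue_congestion_on_entry = {
	.attr = {.name = "congestion_on_threshold", .mode = S_IRUGO },
	.show = queue_congestion_on_show,
};

/* ...and &queue_congestion_on_entry.attr would be added to default_attrs[]
 * just before the terminating NULL. */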
4025 4029
4026 #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) 4030 #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
4027 4031
4028 static ssize_t 4032 static ssize_t
4029 queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4033 queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4030 { 4034 {
4031 struct queue_sysfs_entry *entry = to_queue(attr); 4035 struct queue_sysfs_entry *entry = to_queue(attr);
4032 struct request_queue *q = 4036 struct request_queue *q =
4033 container_of(kobj, struct request_queue, kobj); 4037 container_of(kobj, struct request_queue, kobj);
4034 ssize_t res; 4038 ssize_t res;
4035 4039
4036 if (!entry->show) 4040 if (!entry->show)
4037 return -EIO; 4041 return -EIO;
4038 mutex_lock(&q->sysfs_lock); 4042 mutex_lock(&q->sysfs_lock);
4039 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { 4043 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
4040 mutex_unlock(&q->sysfs_lock); 4044 mutex_unlock(&q->sysfs_lock);
4041 return -ENOENT; 4045 return -ENOENT;
4042 } 4046 }
4043 res = entry->show(q, page); 4047 res = entry->show(q, page);
4044 mutex_unlock(&q->sysfs_lock); 4048 mutex_unlock(&q->sysfs_lock);
4045 return res; 4049 return res;
4046 } 4050 }
4047 4051
4048 static ssize_t 4052 static ssize_t
4049 queue_attr_store(struct kobject *kobj, struct attribute *attr, 4053 queue_attr_store(struct kobject *kobj, struct attribute *attr,
4050 const char *page, size_t length) 4054 const char *page, size_t length)
4051 { 4055 {
4052 struct queue_sysfs_entry *entry = to_queue(attr); 4056 struct queue_sysfs_entry *entry = to_queue(attr);
4053 struct request_queue *q = container_of(kobj, struct request_queue, kobj); 4057 struct request_queue *q = container_of(kobj, struct request_queue, kobj);
4054 4058
4055 ssize_t res; 4059 ssize_t res;
4056 4060
4057 if (!entry->store) 4061 if (!entry->store)
4058 return -EIO; 4062 return -EIO;
4059 mutex_lock(&q->sysfs_lock); 4063 mutex_lock(&q->sysfs_lock);
4060 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { 4064 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
4061 mutex_unlock(&q->sysfs_lock); 4065 mutex_unlock(&q->sysfs_lock);
4062 return -ENOENT; 4066 return -ENOENT;
4063 } 4067 }
4064 res = entry->store(q, page, length); 4068 res = entry->store(q, page, length);
4065 mutex_unlock(&q->sysfs_lock); 4069 mutex_unlock(&q->sysfs_lock);
4066 return res; 4070 return res;
4067 } 4071 }
4068 4072
4069 static struct sysfs_ops queue_sysfs_ops = { 4073 static struct sysfs_ops queue_sysfs_ops = {
4070 .show = queue_attr_show, 4074 .show = queue_attr_show,
4071 .store = queue_attr_store, 4075 .store = queue_attr_store,
4072 }; 4076 };
4073 4077
4074 static struct kobj_type queue_ktype = { 4078 static struct kobj_type queue_ktype = {
4075 .sysfs_ops = &queue_sysfs_ops, 4079 .sysfs_ops = &queue_sysfs_ops,
4076 .default_attrs = default_attrs, 4080 .default_attrs = default_attrs,
4077 .release = blk_release_queue, 4081 .release = blk_release_queue,
4078 }; 4082 };
4079 4083
4080 int blk_register_queue(struct gendisk *disk) 4084 int blk_register_queue(struct gendisk *disk)
4081 { 4085 {
4082 int ret; 4086 int ret;
4083 4087
4084 struct request_queue *q = disk->queue; 4088 struct request_queue *q = disk->queue;
4085 4089
4086 if (!q || !q->request_fn) 4090 if (!q || !q->request_fn)
4087 return -ENXIO; 4091 return -ENXIO;
4088 4092
4089 q->kobj.parent = kobject_get(&disk->kobj); 4093 q->kobj.parent = kobject_get(&disk->kobj);
4090 4094
4091 ret = kobject_add(&q->kobj); 4095 ret = kobject_add(&q->kobj);
4092 if (ret < 0) 4096 if (ret < 0)
4093 return ret; 4097 return ret;
4094 4098
4095 kobject_uevent(&q->kobj, KOBJ_ADD); 4099 kobject_uevent(&q->kobj, KOBJ_ADD);
4096 4100
4097 ret = elv_register_queue(q); 4101 ret = elv_register_queue(q);
4098 if (ret) { 4102 if (ret) {
4099 kobject_uevent(&q->kobj, KOBJ_REMOVE); 4103 kobject_uevent(&q->kobj, KOBJ_REMOVE);
4100 kobject_del(&q->kobj); 4104 kobject_del(&q->kobj);
4101 return ret; 4105 return ret;
4102 } 4106 }
4103 4107
4104 return 0; 4108 return 0;
4105 } 4109 }
4106 4110
4107 void blk_unregister_queue(struct gendisk *disk) 4111 void blk_unregister_queue(struct gendisk *disk)
4108 { 4112 {
4109 struct request_queue *q = disk->queue; 4113 struct request_queue *q = disk->queue;
4110 4114
4111 if (q && q->request_fn) { 4115 if (q && q->request_fn) {
4112 elv_unregister_queue(q); 4116 elv_unregister_queue(q);
4113 4117
4114 kobject_uevent(&q->kobj, KOBJ_REMOVE); 4118 kobject_uevent(&q->kobj, KOBJ_REMOVE);
4115 kobject_del(&q->kobj); 4119 kobject_del(&q->kobj);
4116 kobject_put(&disk->kobj); 4120 kobject_put(&disk->kobj);
4117 } 4121 }
4118 } 4122 }
4119 4123
1 /* 1 /*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8 #include "dm.h" 8 #include "dm.h"
9 #include "dm-bio-list.h" 9 #include "dm-bio-list.h"
10 10
11 #include <linux/init.h> 11 #include <linux/init.h>
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/mutex.h> 13 #include <linux/mutex.h>
14 #include <linux/moduleparam.h> 14 #include <linux/moduleparam.h>
15 #include <linux/blkpg.h> 15 #include <linux/blkpg.h>
16 #include <linux/bio.h> 16 #include <linux/bio.h>
17 #include <linux/buffer_head.h> 17 #include <linux/buffer_head.h>
18 #include <linux/mempool.h> 18 #include <linux/mempool.h>
19 #include <linux/slab.h> 19 #include <linux/slab.h>
20 #include <linux/idr.h> 20 #include <linux/idr.h>
21 #include <linux/hdreg.h> 21 #include <linux/hdreg.h>
22 #include <linux/blktrace_api.h> 22 #include <linux/blktrace_api.h>
23 #include <linux/smp_lock.h> 23 #include <linux/smp_lock.h>
24 24
25 #define DM_MSG_PREFIX "core" 25 #define DM_MSG_PREFIX "core"
26 26
27 static const char *_name = DM_NAME; 27 static const char *_name = DM_NAME;
28 28
29 static unsigned int major = 0; 29 static unsigned int major = 0;
30 static unsigned int _major = 0; 30 static unsigned int _major = 0;
31 31
32 static DEFINE_SPINLOCK(_minor_lock); 32 static DEFINE_SPINLOCK(_minor_lock);
33 /* 33 /*
34 * One of these is allocated per bio. 34 * One of these is allocated per bio.
35 */ 35 */
36 struct dm_io { 36 struct dm_io {
37 struct mapped_device *md; 37 struct mapped_device *md;
38 int error; 38 int error;
39 struct bio *bio; 39 struct bio *bio;
40 atomic_t io_count; 40 atomic_t io_count;
41 unsigned long start_time; 41 unsigned long start_time;
42 }; 42 };
43 43
44 /* 44 /*
45 * One of these is allocated per target within a bio. Hopefully 45 * One of these is allocated per target within a bio. Hopefully
46 * this will be simplified out one day. 46 * this will be simplified out one day.
47 */ 47 */
48 struct dm_target_io { 48 struct dm_target_io {
49 struct dm_io *io; 49 struct dm_io *io;
50 struct dm_target *ti; 50 struct dm_target *ti;
51 union map_info info; 51 union map_info info;
52 }; 52 };
53 53
54 union map_info *dm_get_mapinfo(struct bio *bio) 54 union map_info *dm_get_mapinfo(struct bio *bio)
55 { 55 {
56 if (bio && bio->bi_private) 56 if (bio && bio->bi_private)
57 return &((struct dm_target_io *)bio->bi_private)->info; 57 return &((struct dm_target_io *)bio->bi_private)->info;
58 return NULL; 58 return NULL;
59 } 59 }
60 60
61 #define MINOR_ALLOCED ((void *)-1) 61 #define MINOR_ALLOCED ((void *)-1)
62 62
63 /* 63 /*
64 * Bits for the md->flags field. 64 * Bits for the md->flags field.
65 */ 65 */
66 #define DMF_BLOCK_IO 0 66 #define DMF_BLOCK_IO 0
67 #define DMF_SUSPENDED 1 67 #define DMF_SUSPENDED 1
68 #define DMF_FROZEN 2 68 #define DMF_FROZEN 2
69 #define DMF_FREEING 3 69 #define DMF_FREEING 3
70 #define DMF_DELETING 4 70 #define DMF_DELETING 4
71 #define DMF_NOFLUSH_SUSPENDING 5 71 #define DMF_NOFLUSH_SUSPENDING 5
72 72
73 struct mapped_device { 73 struct mapped_device {
74 struct rw_semaphore io_lock; 74 struct rw_semaphore io_lock;
75 struct semaphore suspend_lock; 75 struct semaphore suspend_lock;
76 spinlock_t pushback_lock; 76 spinlock_t pushback_lock;
77 rwlock_t map_lock; 77 rwlock_t map_lock;
78 atomic_t holders; 78 atomic_t holders;
79 atomic_t open_count; 79 atomic_t open_count;
80 80
81 unsigned long flags; 81 unsigned long flags;
82 82
83 struct request_queue *queue; 83 struct request_queue *queue;
84 struct gendisk *disk; 84 struct gendisk *disk;
85 char name[16]; 85 char name[16];
86 86
87 void *interface_ptr; 87 void *interface_ptr;
88 88
89 /* 89 /*
90 * A list of ios that arrived while we were suspended. 90 * A list of ios that arrived while we were suspended.
91 */ 91 */
92 atomic_t pending; 92 atomic_t pending;
93 wait_queue_head_t wait; 93 wait_queue_head_t wait;
94 struct bio_list deferred; 94 struct bio_list deferred;
95 struct bio_list pushback; 95 struct bio_list pushback;
96 96
97 /* 97 /*
98 * The current mapping. 98 * The current mapping.
99 */ 99 */
100 struct dm_table *map; 100 struct dm_table *map;
101 101
102 /* 102 /*
103 * io objects are allocated from here. 103 * io objects are allocated from here.
104 */ 104 */
105 mempool_t *io_pool; 105 mempool_t *io_pool;
106 mempool_t *tio_pool; 106 mempool_t *tio_pool;
107 107
108 struct bio_set *bs; 108 struct bio_set *bs;
109 109
110 /* 110 /*
111 * Event handling. 111 * Event handling.
112 */ 112 */
113 atomic_t event_nr; 113 atomic_t event_nr;
114 wait_queue_head_t eventq; 114 wait_queue_head_t eventq;
115 115
116 /* 116 /*
117 * freeze/thaw support requires holding onto a super block 117 * freeze/thaw support requires holding onto a super block
118 */ 118 */
119 struct super_block *frozen_sb; 119 struct super_block *frozen_sb;
120 struct block_device *suspended_bdev; 120 struct block_device *suspended_bdev;
121 121
122 /* forced geometry settings */ 122 /* forced geometry settings */
123 struct hd_geometry geometry; 123 struct hd_geometry geometry;
124 }; 124 };
125 125
126 #define MIN_IOS 256 126 #define MIN_IOS 256
127 static struct kmem_cache *_io_cache; 127 static struct kmem_cache *_io_cache;
128 static struct kmem_cache *_tio_cache; 128 static struct kmem_cache *_tio_cache;
129 129
130 static int __init local_init(void) 130 static int __init local_init(void)
131 { 131 {
132 int r; 132 int r;
133 133
134 /* allocate a slab for the dm_ios */ 134 /* allocate a slab for the dm_ios */
135 _io_cache = KMEM_CACHE(dm_io, 0); 135 _io_cache = KMEM_CACHE(dm_io, 0);
136 if (!_io_cache) 136 if (!_io_cache)
137 return -ENOMEM; 137 return -ENOMEM;
138 138
139 /* allocate a slab for the target ios */ 139 /* allocate a slab for the target ios */
140 _tio_cache = KMEM_CACHE(dm_target_io, 0); 140 _tio_cache = KMEM_CACHE(dm_target_io, 0);
141 if (!_tio_cache) { 141 if (!_tio_cache) {
142 kmem_cache_destroy(_io_cache); 142 kmem_cache_destroy(_io_cache);
143 return -ENOMEM; 143 return -ENOMEM;
144 } 144 }
145 145
146 _major = major; 146 _major = major;
147 r = register_blkdev(_major, _name); 147 r = register_blkdev(_major, _name);
148 if (r < 0) { 148 if (r < 0) {
149 kmem_cache_destroy(_tio_cache); 149 kmem_cache_destroy(_tio_cache);
150 kmem_cache_destroy(_io_cache); 150 kmem_cache_destroy(_io_cache);
151 return r; 151 return r;
152 } 152 }
153 153
154 if (!_major) 154 if (!_major)
155 _major = r; 155 _major = r;
156 156
157 return 0; 157 return 0;
158 } 158 }
159 159
160 static void local_exit(void) 160 static void local_exit(void)
161 { 161 {
162 kmem_cache_destroy(_tio_cache); 162 kmem_cache_destroy(_tio_cache);
163 kmem_cache_destroy(_io_cache); 163 kmem_cache_destroy(_io_cache);
164 unregister_blkdev(_major, _name); 164 unregister_blkdev(_major, _name);
165 165
166 _major = 0; 166 _major = 0;
167 167
168 DMINFO("cleaned up"); 168 DMINFO("cleaned up");
169 } 169 }
170 170
171 int (*_inits[])(void) __initdata = { 171 int (*_inits[])(void) __initdata = {
172 local_init, 172 local_init,
173 dm_target_init, 173 dm_target_init,
174 dm_linear_init, 174 dm_linear_init,
175 dm_stripe_init, 175 dm_stripe_init,
176 dm_interface_init, 176 dm_interface_init,
177 }; 177 };
178 178
179 void (*_exits[])(void) = { 179 void (*_exits[])(void) = {
180 local_exit, 180 local_exit,
181 dm_target_exit, 181 dm_target_exit,
182 dm_linear_exit, 182 dm_linear_exit,
183 dm_stripe_exit, 183 dm_stripe_exit,
184 dm_interface_exit, 184 dm_interface_exit,
185 }; 185 };
186 186
187 static int __init dm_init(void) 187 static int __init dm_init(void)
188 { 188 {
189 const int count = ARRAY_SIZE(_inits); 189 const int count = ARRAY_SIZE(_inits);
190 190
191 int r, i; 191 int r, i;
192 192
193 for (i = 0; i < count; i++) { 193 for (i = 0; i < count; i++) {
194 r = _inits[i](); 194 r = _inits[i]();
195 if (r) 195 if (r)
196 goto bad; 196 goto bad;
197 } 197 }
198 198
199 return 0; 199 return 0;
200 200
201 bad: 201 bad:
202 while (i--) 202 while (i--)
203 _exits[i](); 203 _exits[i]();
204 204
205 return r; 205 return r;
206 } 206 }
207 207
208 static void __exit dm_exit(void) 208 static void __exit dm_exit(void)
209 { 209 {
210 int i = ARRAY_SIZE(_exits); 210 int i = ARRAY_SIZE(_exits);
211 211
212 while (i--) 212 while (i--)
213 _exits[i](); 213 _exits[i]();
214 } 214 }
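_inits[] and _exits[] are kept in the same order so that dm_init() can unwind exactly the steps that succeeded: on failure, `while (i--)` walks the exit functions backwards starting just below the failing index. The same rollback pattern, reduced to a self-contained userspace sketch (the init/exit functions are made up):

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static int  init_a(void) { printf("init a\n"); return 0; }
static int  init_b(void) { printf("init b\n"); return -1; }	/* simulated failure */
static int  init_c(void) { printf("init c\n"); return 0; }

static void exit_a(void) { printf("exit a\n"); }
static void exit_b(void) { printf("exit b\n"); }
static void exit_c(void) { printf("exit c\n"); }

static int  (*inits[])(void) = { init_a, init_b, init_c };
static void (*exits[])(void) = { exit_a, exit_b, exit_c };

int main(void)
{
	int i, r;

	for (i = 0; i < (int)ARRAY_SIZE(inits); i++) {
		r = inits[i]();
		if (r)
			goto bad;
	}
	return 0;

bad:
	/* unwind only the steps that completed, in reverse order */
	while (i--)
		exits[i]();
	return r;
}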
215 215
216 /* 216 /*
217 * Block device functions 217 * Block device functions
218 */ 218 */
219 static int dm_blk_open(struct inode *inode, struct file *file) 219 static int dm_blk_open(struct inode *inode, struct file *file)
220 { 220 {
221 struct mapped_device *md; 221 struct mapped_device *md;
222 222
223 spin_lock(&_minor_lock); 223 spin_lock(&_minor_lock);
224 224
225 md = inode->i_bdev->bd_disk->private_data; 225 md = inode->i_bdev->bd_disk->private_data;
226 if (!md) 226 if (!md)
227 goto out; 227 goto out;
228 228
229 if (test_bit(DMF_FREEING, &md->flags) || 229 if (test_bit(DMF_FREEING, &md->flags) ||
230 test_bit(DMF_DELETING, &md->flags)) { 230 test_bit(DMF_DELETING, &md->flags)) {
231 md = NULL; 231 md = NULL;
232 goto out; 232 goto out;
233 } 233 }
234 234
235 dm_get(md); 235 dm_get(md);
236 atomic_inc(&md->open_count); 236 atomic_inc(&md->open_count);
237 237
238 out: 238 out:
239 spin_unlock(&_minor_lock); 239 spin_unlock(&_minor_lock);
240 240
241 return md ? 0 : -ENXIO; 241 return md ? 0 : -ENXIO;
242 } 242 }
243 243
244 static int dm_blk_close(struct inode *inode, struct file *file) 244 static int dm_blk_close(struct inode *inode, struct file *file)
245 { 245 {
246 struct mapped_device *md; 246 struct mapped_device *md;
247 247
248 md = inode->i_bdev->bd_disk->private_data; 248 md = inode->i_bdev->bd_disk->private_data;
249 atomic_dec(&md->open_count); 249 atomic_dec(&md->open_count);
250 dm_put(md); 250 dm_put(md);
251 return 0; 251 return 0;
252 } 252 }
253 253
254 int dm_open_count(struct mapped_device *md) 254 int dm_open_count(struct mapped_device *md)
255 { 255 {
256 return atomic_read(&md->open_count); 256 return atomic_read(&md->open_count);
257 } 257 }
258 258
259 /* 259 /*
260 * Guarantees nothing is using the device before it's deleted. 260 * Guarantees nothing is using the device before it's deleted.
261 */ 261 */
262 int dm_lock_for_deletion(struct mapped_device *md) 262 int dm_lock_for_deletion(struct mapped_device *md)
263 { 263 {
264 int r = 0; 264 int r = 0;
265 265
266 spin_lock(&_minor_lock); 266 spin_lock(&_minor_lock);
267 267
268 if (dm_open_count(md)) 268 if (dm_open_count(md))
269 r = -EBUSY; 269 r = -EBUSY;
270 else 270 else
271 set_bit(DMF_DELETING, &md->flags); 271 set_bit(DMF_DELETING, &md->flags);
272 272
273 spin_unlock(&_minor_lock); 273 spin_unlock(&_minor_lock);
274 274
275 return r; 275 return r;
276 } 276 }
277 277
278 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 278 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
279 { 279 {
280 struct mapped_device *md = bdev->bd_disk->private_data; 280 struct mapped_device *md = bdev->bd_disk->private_data;
281 281
282 return dm_get_geometry(md, geo); 282 return dm_get_geometry(md, geo);
283 } 283 }
284 284
285 static int dm_blk_ioctl(struct inode *inode, struct file *file, 285 static int dm_blk_ioctl(struct inode *inode, struct file *file,
286 unsigned int cmd, unsigned long arg) 286 unsigned int cmd, unsigned long arg)
287 { 287 {
288 struct mapped_device *md; 288 struct mapped_device *md;
289 struct dm_table *map; 289 struct dm_table *map;
290 struct dm_target *tgt; 290 struct dm_target *tgt;
291 int r = -ENOTTY; 291 int r = -ENOTTY;
292 292
293 /* We don't really need this lock, but we do need 'inode'. */ 293 /* We don't really need this lock, but we do need 'inode'. */
294 unlock_kernel(); 294 unlock_kernel();
295 295
296 md = inode->i_bdev->bd_disk->private_data; 296 md = inode->i_bdev->bd_disk->private_data;
297 297
298 map = dm_get_table(md); 298 map = dm_get_table(md);
299 299
300 if (!map || !dm_table_get_size(map)) 300 if (!map || !dm_table_get_size(map))
301 goto out; 301 goto out;
302 302
303 /* We only support devices that have a single target */ 303 /* We only support devices that have a single target */
304 if (dm_table_get_num_targets(map) != 1) 304 if (dm_table_get_num_targets(map) != 1)
305 goto out; 305 goto out;
306 306
307 tgt = dm_table_get_target(map, 0); 307 tgt = dm_table_get_target(map, 0);
308 308
309 if (dm_suspended(md)) { 309 if (dm_suspended(md)) {
310 r = -EAGAIN; 310 r = -EAGAIN;
311 goto out; 311 goto out;
312 } 312 }
313 313
314 if (tgt->type->ioctl) 314 if (tgt->type->ioctl)
315 r = tgt->type->ioctl(tgt, inode, file, cmd, arg); 315 r = tgt->type->ioctl(tgt, inode, file, cmd, arg);
316 316
317 out: 317 out:
318 dm_table_put(map); 318 dm_table_put(map);
319 319
320 lock_kernel(); 320 lock_kernel();
321 return r; 321 return r;
322 } 322 }
323 323
324 static struct dm_io *alloc_io(struct mapped_device *md) 324 static struct dm_io *alloc_io(struct mapped_device *md)
325 { 325 {
326 return mempool_alloc(md->io_pool, GFP_NOIO); 326 return mempool_alloc(md->io_pool, GFP_NOIO);
327 } 327 }
328 328
329 static void free_io(struct mapped_device *md, struct dm_io *io) 329 static void free_io(struct mapped_device *md, struct dm_io *io)
330 { 330 {
331 mempool_free(io, md->io_pool); 331 mempool_free(io, md->io_pool);
332 } 332 }
333 333
334 static struct dm_target_io *alloc_tio(struct mapped_device *md) 334 static struct dm_target_io *alloc_tio(struct mapped_device *md)
335 { 335 {
336 return mempool_alloc(md->tio_pool, GFP_NOIO); 336 return mempool_alloc(md->tio_pool, GFP_NOIO);
337 } 337 }
338 338
339 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 339 static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
340 { 340 {
341 mempool_free(tio, md->tio_pool); 341 mempool_free(tio, md->tio_pool);
342 } 342 }
343 343
344 static void start_io_acct(struct dm_io *io) 344 static void start_io_acct(struct dm_io *io)
345 { 345 {
346 struct mapped_device *md = io->md; 346 struct mapped_device *md = io->md;
347 347
348 io->start_time = jiffies; 348 io->start_time = jiffies;
349 349
350 preempt_disable(); 350 preempt_disable();
351 disk_round_stats(dm_disk(md)); 351 disk_round_stats(dm_disk(md));
352 preempt_enable(); 352 preempt_enable();
353 dm_disk(md)->in_flight = atomic_inc_return(&md->pending); 353 dm_disk(md)->in_flight = atomic_inc_return(&md->pending);
354 } 354 }
355 355
356 static int end_io_acct(struct dm_io *io) 356 static int end_io_acct(struct dm_io *io)
357 { 357 {
358 struct mapped_device *md = io->md; 358 struct mapped_device *md = io->md;
359 struct bio *bio = io->bio; 359 struct bio *bio = io->bio;
360 unsigned long duration = jiffies - io->start_time; 360 unsigned long duration = jiffies - io->start_time;
361 int pending; 361 int pending;
362 int rw = bio_data_dir(bio); 362 int rw = bio_data_dir(bio);
363 363
364 preempt_disable(); 364 preempt_disable();
365 disk_round_stats(dm_disk(md)); 365 disk_round_stats(dm_disk(md));
366 preempt_enable(); 366 preempt_enable();
367 dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending); 367 dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending);
368 368
369 disk_stat_add(dm_disk(md), ticks[rw], duration); 369 disk_stat_add(dm_disk(md), ticks[rw], duration);
370 370
371 return !pending; 371 return !pending;
372 } 372 }
373 373
374 /* 374 /*
375 * Add the bio to the list of deferred io. 375 * Add the bio to the list of deferred io.
376 */ 376 */
377 static int queue_io(struct mapped_device *md, struct bio *bio) 377 static int queue_io(struct mapped_device *md, struct bio *bio)
378 { 378 {
379 down_write(&md->io_lock); 379 down_write(&md->io_lock);
380 380
381 if (!test_bit(DMF_BLOCK_IO, &md->flags)) { 381 if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
382 up_write(&md->io_lock); 382 up_write(&md->io_lock);
383 return 1; 383 return 1;
384 } 384 }
385 385
386 bio_list_add(&md->deferred, bio); 386 bio_list_add(&md->deferred, bio);
387 387
388 up_write(&md->io_lock); 388 up_write(&md->io_lock);
389 return 0; /* deferred successfully */ 389 return 0; /* deferred successfully */
390 } 390 }
391 391
392 /* 392 /*
393 * Everyone (including functions in this file) should use this 393 * Everyone (including functions in this file) should use this
394 * function to access the md->map field, and make sure they call 394 * function to access the md->map field, and make sure they call
395 * dm_table_put() when finished. 395 * dm_table_put() when finished.
396 */ 396 */
397 struct dm_table *dm_get_table(struct mapped_device *md) 397 struct dm_table *dm_get_table(struct mapped_device *md)
398 { 398 {
399 struct dm_table *t; 399 struct dm_table *t;
400 400
401 read_lock(&md->map_lock); 401 read_lock(&md->map_lock);
402 t = md->map; 402 t = md->map;
403 if (t) 403 if (t)
404 dm_table_get(t); 404 dm_table_get(t);
405 read_unlock(&md->map_lock); 405 read_unlock(&md->map_lock);
406 406
407 return t; 407 return t;
408 } 408 }
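Because md->map can be replaced while I/O is in flight, every reader goes through dm_get_table(), which takes a table reference under map_lock, and must drop it again with dm_table_put(). A hypothetical helper showing the required pairing (dm_table_get_size() is the real accessor used elsewhere in this file):

/* Hypothetical helper -- not in the tree. */
static sector_t my_mapped_size(struct mapped_device *md)
{
	struct dm_table *t = dm_get_table(md);
	sector_t size = 0;

	if (t) {
		size = dm_table_get_size(t);
		dm_table_put(t);	/* every dm_get_table() needs a matching put */
	}

	return size;
}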
409 409
410 /* 410 /*
411 * Get the geometry associated with a dm device 411 * Get the geometry associated with a dm device
412 */ 412 */
413 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 413 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
414 { 414 {
415 *geo = md->geometry; 415 *geo = md->geometry;
416 416
417 return 0; 417 return 0;
418 } 418 }
419 419
420 /* 420 /*
421 * Set the geometry of a device. 421 * Set the geometry of a device.
422 */ 422 */
423 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 423 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
424 { 424 {
425 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 425 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
426 426
427 if (geo->start > sz) { 427 if (geo->start > sz) {
428 DMWARN("Start sector is beyond the geometry limits."); 428 DMWARN("Start sector is beyond the geometry limits.");
429 return -EINVAL; 429 return -EINVAL;
430 } 430 }
431 431
432 md->geometry = *geo; 432 md->geometry = *geo;
433 433
434 return 0; 434 return 0;
435 } 435 }
436 436
437 /*----------------------------------------------------------------- 437 /*-----------------------------------------------------------------
438 * CRUD START: 438 * CRUD START:
439 * A more elegant soln is in the works that uses the queue 439 * A more elegant soln is in the works that uses the queue
440 * merge fn, unfortunately there are a couple of changes to 440 * merge fn, unfortunately there are a couple of changes to
441 * the block layer that I want to make for this. So in the 441 * the block layer that I want to make for this. So in the
442 * interests of getting something for people to use I give 442 * interests of getting something for people to use I give
443 * you this clearly demarcated crap. 443 * you this clearly demarcated crap.
444 *---------------------------------------------------------------*/ 444 *---------------------------------------------------------------*/
445 445
446 static int __noflush_suspending(struct mapped_device *md) 446 static int __noflush_suspending(struct mapped_device *md)
447 { 447 {
448 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 448 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
449 } 449 }
450 450
451 /* 451 /*
452 * Decrements the number of outstanding ios that a bio has been 452 * Decrements the number of outstanding ios that a bio has been
453 * cloned into, completing the original io if necessary. 453 * cloned into, completing the original io if necessary.
454 */ 454 */
455 static void dec_pending(struct dm_io *io, int error) 455 static void dec_pending(struct dm_io *io, int error)
456 { 456 {
457 unsigned long flags; 457 unsigned long flags;
458 458
459 /* Push-back supersedes any I/O errors */ 459 /* Push-back supersedes any I/O errors */
460 if (error && !(io->error > 0 && __noflush_suspending(io->md))) 460 if (error && !(io->error > 0 && __noflush_suspending(io->md)))
461 io->error = error; 461 io->error = error;
462 462
463 if (atomic_dec_and_test(&io->io_count)) { 463 if (atomic_dec_and_test(&io->io_count)) {
464 if (io->error == DM_ENDIO_REQUEUE) { 464 if (io->error == DM_ENDIO_REQUEUE) {
465 /* 465 /*
466 * Target requested pushing back the I/O. 466 * Target requested pushing back the I/O.
467 * This must be handled before the sleeper on 467 * This must be handled before the sleeper on
468 * suspend queue merges the pushback list. 468 * suspend queue merges the pushback list.
469 */ 469 */
470 spin_lock_irqsave(&io->md->pushback_lock, flags); 470 spin_lock_irqsave(&io->md->pushback_lock, flags);
471 if (__noflush_suspending(io->md)) 471 if (__noflush_suspending(io->md))
472 bio_list_add(&io->md->pushback, io->bio); 472 bio_list_add(&io->md->pushback, io->bio);
473 else 473 else
474 /* noflush suspend was interrupted. */ 474 /* noflush suspend was interrupted. */
475 io->error = -EIO; 475 io->error = -EIO;
476 spin_unlock_irqrestore(&io->md->pushback_lock, flags); 476 spin_unlock_irqrestore(&io->md->pushback_lock, flags);
477 } 477 }
478 478
479 if (end_io_acct(io)) 479 if (end_io_acct(io))
480 /* nudge anyone waiting on suspend queue */ 480 /* nudge anyone waiting on suspend queue */
481 wake_up(&io->md->wait); 481 wake_up(&io->md->wait);
482 482
483 if (io->error != DM_ENDIO_REQUEUE) { 483 if (io->error != DM_ENDIO_REQUEUE) {
484 blk_add_trace_bio(io->md->queue, io->bio, 484 blk_add_trace_bio(io->md->queue, io->bio,
485 BLK_TA_COMPLETE); 485 BLK_TA_COMPLETE);
486 486
487 bio_endio(io->bio, io->bio->bi_size, io->error); 487 bio_endio(io->bio, io->bio->bi_size, io->error);
488 } 488 }
489 489
490 free_io(io->md, io); 490 free_io(io->md, io);
491 } 491 }
492 } 492 }
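dec_pending() implements the usual split-I/O completion count: the dm_io starts with one reference held by the submitter, __map_bio() adds one per clone, and __split_bio() drops the submitter's reference last, so the original bio can only complete once every clone has finished. The counting discipline in a runnable userspace sketch (single-threaded, so a plain int stands in for the atomic_t):

#include <stdio.h>

struct parent_io {
	int io_count;		/* stands in for dm_io's atomic io_count */
	int completed;
};

static void dec_pending_sketch(struct parent_io *io)
{
	if (--io->io_count == 0) {	/* atomic_dec_and_test() in the kernel */
		io->completed = 1;
		printf("parent completed\n");
	}
}

int main(void)
{
	struct parent_io io = { .io_count = 1 };	/* submitter's reference */
	int clones = 3, i;

	for (i = 0; i < clones; i++)
		io.io_count++;			/* __map_bio(): one ref per clone */

	for (i = 0; i < clones; i++)
		dec_pending_sketch(&io);	/* each clone's end_io */

	/* parent must not complete until the submitter drops its extra ref */
	printf("after clones: completed=%d\n", io.completed);	/* prints 0 */
	dec_pending_sketch(&io);		/* __split_bio(): drop extra ref */
	return 0;
}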
493 493
494 static int clone_endio(struct bio *bio, unsigned int done, int error) 494 static int clone_endio(struct bio *bio, unsigned int done, int error)
495 { 495 {
496 int r = 0; 496 int r = 0;
497 struct dm_target_io *tio = bio->bi_private; 497 struct dm_target_io *tio = bio->bi_private;
498 struct mapped_device *md = tio->io->md; 498 struct mapped_device *md = tio->io->md;
499 dm_endio_fn endio = tio->ti->type->end_io; 499 dm_endio_fn endio = tio->ti->type->end_io;
500 500
501 if (bio->bi_size) 501 if (bio->bi_size)
502 return 1; 502 return 1;
503 503
504 if (!bio_flagged(bio, BIO_UPTODATE) && !error) 504 if (!bio_flagged(bio, BIO_UPTODATE) && !error)
505 error = -EIO; 505 error = -EIO;
506 506
507 if (endio) { 507 if (endio) {
508 r = endio(tio->ti, bio, error, &tio->info); 508 r = endio(tio->ti, bio, error, &tio->info);
509 if (r < 0 || r == DM_ENDIO_REQUEUE) 509 if (r < 0 || r == DM_ENDIO_REQUEUE)
510 /* 510 /*
511 * error and requeue request are handled 511 * error and requeue request are handled
512 * in dec_pending(). 512 * in dec_pending().
513 */ 513 */
514 error = r; 514 error = r;
515 else if (r == DM_ENDIO_INCOMPLETE) 515 else if (r == DM_ENDIO_INCOMPLETE)
516 /* The target will handle the io */ 516 /* The target will handle the io */
517 return 1; 517 return 1;
518 else if (r) { 518 else if (r) {
519 DMWARN("unimplemented target endio return value: %d", r); 519 DMWARN("unimplemented target endio return value: %d", r);
520 BUG(); 520 BUG();
521 } 521 }
522 } 522 }
523 523
524 dec_pending(tio->io, error); 524 dec_pending(tio->io, error);
525 525
526 /* 526 /*
527 * Store md for cleanup instead of tio which is about to get freed. 527 * Store md for cleanup instead of tio which is about to get freed.
528 */ 528 */
529 bio->bi_private = md->bs; 529 bio->bi_private = md->bs;
530 530
531 bio_put(bio); 531 bio_put(bio);
532 free_tio(md, tio); 532 free_tio(md, tio);
533 return r; 533 return r;
534 } 534 }
535 535
536 static sector_t max_io_len(struct mapped_device *md, 536 static sector_t max_io_len(struct mapped_device *md,
537 sector_t sector, struct dm_target *ti) 537 sector_t sector, struct dm_target *ti)
538 { 538 {
539 sector_t offset = sector - ti->begin; 539 sector_t offset = sector - ti->begin;
540 sector_t len = ti->len - offset; 540 sector_t len = ti->len - offset;
541 541
542 /* 542 /*
543 * Does the target need to split even further? 543 * Does the target need to split even further?
544 */ 544 */
545 if (ti->split_io) { 545 if (ti->split_io) {
546 sector_t boundary; 546 sector_t boundary;
547 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) 547 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
548 - offset; 548 - offset;
549 if (len > boundary) 549 if (len > boundary)
550 len = boundary; 550 len = boundary;
551 } 551 }
552 552
553 return len; 553 return len;
554 } 554 }
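The boundary computation in max_io_len() clips a clone so it never crosses a ti->split_io boundary; note that the mask form assumes split_io is a power of two. A worked example: with an 8-sector split_io and an I/O starting 5 sectors past the start of the target, only 3 sectors fit before the next boundary.

#include <assert.h>

int main(void)
{
	unsigned long split_io = 8;	/* sectors per chunk (power of two) */
	unsigned long offset   = 5;	/* sectors into the target */
	unsigned long len      = 100;	/* sectors the caller asked for */

	/* distance from 'offset' to the next chunk boundary */
	unsigned long boundary = ((offset + split_io) & ~(split_io - 1)) - offset;

	if (len > boundary)
		len = boundary;

	assert(boundary == 3);	/* 5 -> next boundary at 8 */
	assert(len == 3);	/* the clone is clipped to 3 sectors */
	return 0;
}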
555 555
556 static void __map_bio(struct dm_target *ti, struct bio *clone, 556 static void __map_bio(struct dm_target *ti, struct bio *clone,
557 struct dm_target_io *tio) 557 struct dm_target_io *tio)
558 { 558 {
559 int r; 559 int r;
560 sector_t sector; 560 sector_t sector;
561 struct mapped_device *md; 561 struct mapped_device *md;
562 562
563 /* 563 /*
564 * Sanity checks. 564 * Sanity checks.
565 */ 565 */
566 BUG_ON(!clone->bi_size); 566 BUG_ON(!clone->bi_size);
567 567
568 clone->bi_end_io = clone_endio; 568 clone->bi_end_io = clone_endio;
569 clone->bi_private = tio; 569 clone->bi_private = tio;
570 570
571 /* 571 /*
572 * Map the clone. If r == 0 we don't need to do 572 * Map the clone. If r == 0 we don't need to do
573 * anything, the target has assumed ownership of 573 * anything, the target has assumed ownership of
574 * this io. 574 * this io.
575 */ 575 */
576 atomic_inc(&tio->io->io_count); 576 atomic_inc(&tio->io->io_count);
577 sector = clone->bi_sector; 577 sector = clone->bi_sector;
578 r = ti->type->map(ti, clone, &tio->info); 578 r = ti->type->map(ti, clone, &tio->info);
579 if (r == DM_MAPIO_REMAPPED) { 579 if (r == DM_MAPIO_REMAPPED) {
580 /* the bio has been remapped so dispatch it */ 580 /* the bio has been remapped so dispatch it */
581 581
582 blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, 582 blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
583 tio->io->bio->bi_bdev->bd_dev, sector, 583 tio->io->bio->bi_bdev->bd_dev,
584 clone->bi_sector); 584 clone->bi_sector, sector);
585 585
586 generic_make_request(clone); 586 generic_make_request(clone);
587 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 587 } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
588 /* error the io and bail out, or requeue it if needed */ 588 /* error the io and bail out, or requeue it if needed */
589 md = tio->io->md; 589 md = tio->io->md;
590 dec_pending(tio->io, r); 590 dec_pending(tio->io, r);
591 /* 591 /*
592 * Store bio_set for cleanup. 592 * Store bio_set for cleanup.
593 */ 593 */
594 clone->bi_private = md->bs; 594 clone->bi_private = md->bs;
595 bio_put(clone); 595 bio_put(clone);
596 free_tio(md, tio); 596 free_tio(md, tio);
597 } else if (r) { 597 } else if (r) {
598 DMWARN("unimplemented target map return value: %d", r); 598 DMWARN("unimplemented target map return value: %d", r);
599 BUG(); 599 BUG();
600 } 600 }
601 } 601 }
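The hunk above is the behavioural change in this file: the remap trace now names the mapped-from device (the original bio's bdev) and passes the post-remap sector before the saved pre-remap sector, which is the ordering btt relies on to stitch the two halves of the I/O together. Mirroring the call exactly as it reads after this patch, a hypothetical remapping driver would emit the event like so (the function and variable names are invented for illustration):

#include <linux/blkdev.h>
#include <linux/blktrace_api.h>

/* Illustrative only: 'orig_bio' is the bio as submitted to the virtual
 * device, 'clone' has already been redirected by the target, and
 * 'orig_sector' was saved before the remap. */
static void emit_remap_trace(struct bio *orig_bio, struct bio *clone,
			     sector_t orig_sector)
{
	blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
			    orig_bio->bi_bdev->bd_dev,	/* mapped-from device */
			    clone->bi_sector,		/* sector after the remap */
			    orig_sector);		/* sector before the remap */
}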
602 602
603 struct clone_info { 603 struct clone_info {
604 struct mapped_device *md; 604 struct mapped_device *md;
605 struct dm_table *map; 605 struct dm_table *map;
606 struct bio *bio; 606 struct bio *bio;
607 struct dm_io *io; 607 struct dm_io *io;
608 sector_t sector; 608 sector_t sector;
609 sector_t sector_count; 609 sector_t sector_count;
610 unsigned short idx; 610 unsigned short idx;
611 }; 611 };
612 612
613 static void dm_bio_destructor(struct bio *bio) 613 static void dm_bio_destructor(struct bio *bio)
614 { 614 {
615 struct bio_set *bs = bio->bi_private; 615 struct bio_set *bs = bio->bi_private;
616 616
617 bio_free(bio, bs); 617 bio_free(bio, bs);
618 } 618 }
619 619
620 /* 620 /*
621 * Creates a little bio that just does part of a bvec. 621 * Creates a little bio that just does part of a bvec.
622 */ 622 */
623 static struct bio *split_bvec(struct bio *bio, sector_t sector, 623 static struct bio *split_bvec(struct bio *bio, sector_t sector,
624 unsigned short idx, unsigned int offset, 624 unsigned short idx, unsigned int offset,
625 unsigned int len, struct bio_set *bs) 625 unsigned int len, struct bio_set *bs)
626 { 626 {
627 struct bio *clone; 627 struct bio *clone;
628 struct bio_vec *bv = bio->bi_io_vec + idx; 628 struct bio_vec *bv = bio->bi_io_vec + idx;
629 629
630 clone = bio_alloc_bioset(GFP_NOIO, 1, bs); 630 clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
631 clone->bi_destructor = dm_bio_destructor; 631 clone->bi_destructor = dm_bio_destructor;
632 *clone->bi_io_vec = *bv; 632 *clone->bi_io_vec = *bv;
633 633
634 clone->bi_sector = sector; 634 clone->bi_sector = sector;
635 clone->bi_bdev = bio->bi_bdev; 635 clone->bi_bdev = bio->bi_bdev;
636 clone->bi_rw = bio->bi_rw; 636 clone->bi_rw = bio->bi_rw;
637 clone->bi_vcnt = 1; 637 clone->bi_vcnt = 1;
638 clone->bi_size = to_bytes(len); 638 clone->bi_size = to_bytes(len);
639 clone->bi_io_vec->bv_offset = offset; 639 clone->bi_io_vec->bv_offset = offset;
640 clone->bi_io_vec->bv_len = clone->bi_size; 640 clone->bi_io_vec->bv_len = clone->bi_size;
641 641
642 return clone; 642 return clone;
643 } 643 }
644 644
645 /* 645 /*
646 * Creates a bio that consists of a range of complete bvecs. 646 * Creates a bio that consists of a range of complete bvecs.
647 */ 647 */
648 static struct bio *clone_bio(struct bio *bio, sector_t sector, 648 static struct bio *clone_bio(struct bio *bio, sector_t sector,
649 unsigned short idx, unsigned short bv_count, 649 unsigned short idx, unsigned short bv_count,
650 unsigned int len, struct bio_set *bs) 650 unsigned int len, struct bio_set *bs)
651 { 651 {
652 struct bio *clone; 652 struct bio *clone;
653 653
654 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); 654 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
655 __bio_clone(clone, bio); 655 __bio_clone(clone, bio);
656 clone->bi_destructor = dm_bio_destructor; 656 clone->bi_destructor = dm_bio_destructor;
657 clone->bi_sector = sector; 657 clone->bi_sector = sector;
658 clone->bi_idx = idx; 658 clone->bi_idx = idx;
659 clone->bi_vcnt = idx + bv_count; 659 clone->bi_vcnt = idx + bv_count;
660 clone->bi_size = to_bytes(len); 660 clone->bi_size = to_bytes(len);
661 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 661 clone->bi_flags &= ~(1 << BIO_SEG_VALID);
662 662
663 return clone; 663 return clone;
664 } 664 }
665 665
666 static void __clone_and_map(struct clone_info *ci) 666 static void __clone_and_map(struct clone_info *ci)
667 { 667 {
668 struct bio *clone, *bio = ci->bio; 668 struct bio *clone, *bio = ci->bio;
669 struct dm_target *ti = dm_table_find_target(ci->map, ci->sector); 669 struct dm_target *ti = dm_table_find_target(ci->map, ci->sector);
670 sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti); 670 sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);
671 struct dm_target_io *tio; 671 struct dm_target_io *tio;
672 672
673 /* 673 /*
674 * Allocate a target io object. 674 * Allocate a target io object.
675 */ 675 */
676 tio = alloc_tio(ci->md); 676 tio = alloc_tio(ci->md);
677 tio->io = ci->io; 677 tio->io = ci->io;
678 tio->ti = ti; 678 tio->ti = ti;
679 memset(&tio->info, 0, sizeof(tio->info)); 679 memset(&tio->info, 0, sizeof(tio->info));
680 680
681 if (ci->sector_count <= max) { 681 if (ci->sector_count <= max) {
682 /* 682 /*
683 * Optimise for the simple case where we can do all of 683 * Optimise for the simple case where we can do all of
684 * the remaining io with a single clone. 684 * the remaining io with a single clone.
685 */ 685 */
686 clone = clone_bio(bio, ci->sector, ci->idx, 686 clone = clone_bio(bio, ci->sector, ci->idx,
687 bio->bi_vcnt - ci->idx, ci->sector_count, 687 bio->bi_vcnt - ci->idx, ci->sector_count,
688 ci->md->bs); 688 ci->md->bs);
689 __map_bio(ti, clone, tio); 689 __map_bio(ti, clone, tio);
690 ci->sector_count = 0; 690 ci->sector_count = 0;
691 691
692 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { 692 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
693 /* 693 /*
694 * There are some bvecs that don't span targets. 694 * There are some bvecs that don't span targets.
695 * Do as many of these as possible. 695 * Do as many of these as possible.
696 */ 696 */
697 int i; 697 int i;
698 sector_t remaining = max; 698 sector_t remaining = max;
699 sector_t bv_len; 699 sector_t bv_len;
700 700
701 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { 701 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
702 bv_len = to_sector(bio->bi_io_vec[i].bv_len); 702 bv_len = to_sector(bio->bi_io_vec[i].bv_len);
703 703
704 if (bv_len > remaining) 704 if (bv_len > remaining)
705 break; 705 break;
706 706
707 remaining -= bv_len; 707 remaining -= bv_len;
708 len += bv_len; 708 len += bv_len;
709 } 709 }
710 710
711 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, 711 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
712 ci->md->bs); 712 ci->md->bs);
713 __map_bio(ti, clone, tio); 713 __map_bio(ti, clone, tio);
714 714
715 ci->sector += len; 715 ci->sector += len;
716 ci->sector_count -= len; 716 ci->sector_count -= len;
717 ci->idx = i; 717 ci->idx = i;
718 718
719 } else { 719 } else {
720 /* 720 /*
721 * Handle a bvec that must be split between two or more targets. 721 * Handle a bvec that must be split between two or more targets.
722 */ 722 */
723 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 723 struct bio_vec *bv = bio->bi_io_vec + ci->idx;
724 sector_t remaining = to_sector(bv->bv_len); 724 sector_t remaining = to_sector(bv->bv_len);
725 unsigned int offset = 0; 725 unsigned int offset = 0;
726 726
727 do { 727 do {
728 if (offset) { 728 if (offset) {
729 ti = dm_table_find_target(ci->map, ci->sector); 729 ti = dm_table_find_target(ci->map, ci->sector);
730 max = max_io_len(ci->md, ci->sector, ti); 730 max = max_io_len(ci->md, ci->sector, ti);
731 731
732 tio = alloc_tio(ci->md); 732 tio = alloc_tio(ci->md);
733 tio->io = ci->io; 733 tio->io = ci->io;
734 tio->ti = ti; 734 tio->ti = ti;
735 memset(&tio->info, 0, sizeof(tio->info)); 735 memset(&tio->info, 0, sizeof(tio->info));
736 } 736 }
737 737
738 len = min(remaining, max); 738 len = min(remaining, max);
739 739
740 clone = split_bvec(bio, ci->sector, ci->idx, 740 clone = split_bvec(bio, ci->sector, ci->idx,
741 bv->bv_offset + offset, len, 741 bv->bv_offset + offset, len,
742 ci->md->bs); 742 ci->md->bs);
743 743
744 __map_bio(ti, clone, tio); 744 __map_bio(ti, clone, tio);
745 745
746 ci->sector += len; 746 ci->sector += len;
747 ci->sector_count -= len; 747 ci->sector_count -= len;
748 offset += to_bytes(len); 748 offset += to_bytes(len);
749 } while (remaining -= len); 749 } while (remaining -= len);
750 750
751 ci->idx++; 751 ci->idx++;
752 } 752 }
753 } 753 }
754 754
755 /* 755 /*
756 * Split the bio into several clones. 756 * Split the bio into several clones.
757 */ 757 */
758 static void __split_bio(struct mapped_device *md, struct bio *bio) 758 static void __split_bio(struct mapped_device *md, struct bio *bio)
759 { 759 {
760 struct clone_info ci; 760 struct clone_info ci;
761 761
762 ci.map = dm_get_table(md); 762 ci.map = dm_get_table(md);
763 if (!ci.map) { 763 if (!ci.map) {
764 bio_io_error(bio, bio->bi_size); 764 bio_io_error(bio, bio->bi_size);
765 return; 765 return;
766 } 766 }
767 767
768 ci.md = md; 768 ci.md = md;
769 ci.bio = bio; 769 ci.bio = bio;
770 ci.io = alloc_io(md); 770 ci.io = alloc_io(md);
771 ci.io->error = 0; 771 ci.io->error = 0;
772 atomic_set(&ci.io->io_count, 1); 772 atomic_set(&ci.io->io_count, 1);
773 ci.io->bio = bio; 773 ci.io->bio = bio;
774 ci.io->md = md; 774 ci.io->md = md;
775 ci.sector = bio->bi_sector; 775 ci.sector = bio->bi_sector;
776 ci.sector_count = bio_sectors(bio); 776 ci.sector_count = bio_sectors(bio);
777 ci.idx = bio->bi_idx; 777 ci.idx = bio->bi_idx;
778 778
779 start_io_acct(ci.io); 779 start_io_acct(ci.io);
780 while (ci.sector_count) 780 while (ci.sector_count)
781 __clone_and_map(&ci); 781 __clone_and_map(&ci);
782 782
783 /* drop the extra reference count */ 783 /* drop the extra reference count */
784 dec_pending(ci.io, 0); 784 dec_pending(ci.io, 0);
785 dm_table_put(ci.map); 785 dm_table_put(ci.map);
786 } 786 }
787 /*----------------------------------------------------------------- 787 /*-----------------------------------------------------------------
788 * CRUD END 788 * CRUD END
789 *---------------------------------------------------------------*/ 789 *---------------------------------------------------------------*/
790 790
791 /* 791 /*
792 * The request function that just remaps the bio built up by 792 * The request function that just remaps the bio built up by
793 * dm_merge_bvec. 793 * dm_merge_bvec.
794 */ 794 */
795 static int dm_request(struct request_queue *q, struct bio *bio) 795 static int dm_request(struct request_queue *q, struct bio *bio)
796 { 796 {
797 int r; 797 int r;
798 int rw = bio_data_dir(bio); 798 int rw = bio_data_dir(bio);
799 struct mapped_device *md = q->queuedata; 799 struct mapped_device *md = q->queuedata;
800 800
801 /* 801 /*
802 * There is no use in forwarding any barrier request since we can't 802 * There is no use in forwarding any barrier request since we can't
803 * guarantee it is (or can be) handled by the targets correctly. 803 * guarantee it is (or can be) handled by the targets correctly.
804 */ 804 */
805 if (unlikely(bio_barrier(bio))) { 805 if (unlikely(bio_barrier(bio))) {
806 bio_endio(bio, bio->bi_size, -EOPNOTSUPP); 806 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
807 return 0; 807 return 0;
808 } 808 }
809 809
810 down_read(&md->io_lock); 810 down_read(&md->io_lock);
811 811
812 disk_stat_inc(dm_disk(md), ios[rw]); 812 disk_stat_inc(dm_disk(md), ios[rw]);
813 disk_stat_add(dm_disk(md), sectors[rw], bio_sectors(bio)); 813 disk_stat_add(dm_disk(md), sectors[rw], bio_sectors(bio));
814 814
815 /* 815 /*
816 * If we're suspended we have to queue 816 * If we're suspended we have to queue
817 * this io for later. 817 * this io for later.
818 */ 818 */
819 while (test_bit(DMF_BLOCK_IO, &md->flags)) { 819 while (test_bit(DMF_BLOCK_IO, &md->flags)) {
820 up_read(&md->io_lock); 820 up_read(&md->io_lock);
821 821
822 if (bio_rw(bio) == READA) { 822 if (bio_rw(bio) == READA) {
823 bio_io_error(bio, bio->bi_size); 823 bio_io_error(bio, bio->bi_size);
824 return 0; 824 return 0;
825 } 825 }
826 826
827 r = queue_io(md, bio); 827 r = queue_io(md, bio);
828 if (r < 0) { 828 if (r < 0) {
829 bio_io_error(bio, bio->bi_size); 829 bio_io_error(bio, bio->bi_size);
830 return 0; 830 return 0;
831 831
832 } else if (r == 0) 832 } else if (r == 0)
833 return 0; /* deferred successfully */ 833 return 0; /* deferred successfully */
834 834
835 /* 835 /*
836 * We're in a while loop, because someone could suspend 836 * We're in a while loop, because someone could suspend
837 * before we get to the following read lock. 837 * before we get to the following read lock.
838 */ 838 */
839 down_read(&md->io_lock); 839 down_read(&md->io_lock);
840 } 840 }
841 841
842 __split_bio(md, bio); 842 __split_bio(md, bio);
843 up_read(&md->io_lock); 843 up_read(&md->io_lock);
844 return 0; 844 return 0;
845 } 845 }
846 846
847 static int dm_flush_all(struct request_queue *q, struct gendisk *disk, 847 static int dm_flush_all(struct request_queue *q, struct gendisk *disk,
848 sector_t *error_sector) 848 sector_t *error_sector)
849 { 849 {
850 struct mapped_device *md = q->queuedata; 850 struct mapped_device *md = q->queuedata;
851 struct dm_table *map = dm_get_table(md); 851 struct dm_table *map = dm_get_table(md);
852 int ret = -ENXIO; 852 int ret = -ENXIO;
853 853
854 if (map) { 854 if (map) {
855 ret = dm_table_flush_all(map); 855 ret = dm_table_flush_all(map);
856 dm_table_put(map); 856 dm_table_put(map);
857 } 857 }
858 858
859 return ret; 859 return ret;
860 } 860 }
861 861
862 static void dm_unplug_all(struct request_queue *q) 862 static void dm_unplug_all(struct request_queue *q)
863 { 863 {
864 struct mapped_device *md = q->queuedata; 864 struct mapped_device *md = q->queuedata;
865 struct dm_table *map = dm_get_table(md); 865 struct dm_table *map = dm_get_table(md);
866 866
867 if (map) { 867 if (map) {
868 dm_table_unplug_all(map); 868 dm_table_unplug_all(map);
869 dm_table_put(map); 869 dm_table_put(map);
870 } 870 }
871 } 871 }
872 872
873 static int dm_any_congested(void *congested_data, int bdi_bits) 873 static int dm_any_congested(void *congested_data, int bdi_bits)
874 { 874 {
875 int r; 875 int r;
876 struct mapped_device *md = (struct mapped_device *) congested_data; 876 struct mapped_device *md = (struct mapped_device *) congested_data;
877 struct dm_table *map = dm_get_table(md); 877 struct dm_table *map = dm_get_table(md);
878 878
879 if (!map || test_bit(DMF_BLOCK_IO, &md->flags)) 879 if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
880 r = bdi_bits; 880 r = bdi_bits;
881 else 881 else
882 r = dm_table_any_congested(map, bdi_bits); 882 r = dm_table_any_congested(map, bdi_bits);
883 883
884 dm_table_put(map); 884 dm_table_put(map);
885 return r; 885 return r;
886 } 886 }
887 887
888 /*----------------------------------------------------------------- 888 /*-----------------------------------------------------------------
889 * An IDR is used to keep track of allocated minor numbers. 889 * An IDR is used to keep track of allocated minor numbers.
890 *---------------------------------------------------------------*/ 890 *---------------------------------------------------------------*/
891 static DEFINE_IDR(_minor_idr); 891 static DEFINE_IDR(_minor_idr);
892 892
893 static void free_minor(int minor) 893 static void free_minor(int minor)
894 { 894 {
895 spin_lock(&_minor_lock); 895 spin_lock(&_minor_lock);
896 idr_remove(&_minor_idr, minor); 896 idr_remove(&_minor_idr, minor);
897 spin_unlock(&_minor_lock); 897 spin_unlock(&_minor_lock);
898 } 898 }
899 899
900 /* 900 /*
901 * See if the device with a specific minor # is free. 901 * See if the device with a specific minor # is free.
902 */ 902 */
903 static int specific_minor(struct mapped_device *md, int minor) 903 static int specific_minor(struct mapped_device *md, int minor)
904 { 904 {
905 int r, m; 905 int r, m;
906 906
907 if (minor >= (1 << MINORBITS)) 907 if (minor >= (1 << MINORBITS))
908 return -EINVAL; 908 return -EINVAL;
909 909
910 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 910 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
911 if (!r) 911 if (!r)
912 return -ENOMEM; 912 return -ENOMEM;
913 913
914 spin_lock(&_minor_lock); 914 spin_lock(&_minor_lock);
915 915
916 if (idr_find(&_minor_idr, minor)) { 916 if (idr_find(&_minor_idr, minor)) {
917 r = -EBUSY; 917 r = -EBUSY;
918 goto out; 918 goto out;
919 } 919 }
920 920
921 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); 921 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
922 if (r) 922 if (r)
923 goto out; 923 goto out;
924 924
925 if (m != minor) { 925 if (m != minor) {
926 idr_remove(&_minor_idr, m); 926 idr_remove(&_minor_idr, m);
927 r = -EBUSY; 927 r = -EBUSY;
928 goto out; 928 goto out;
929 } 929 }
930 930
931 out: 931 out:
932 spin_unlock(&_minor_lock); 932 spin_unlock(&_minor_lock);
933 return r; 933 return r;
934 } 934 }
935 935
936 static int next_free_minor(struct mapped_device *md, int *minor) 936 static int next_free_minor(struct mapped_device *md, int *minor)
937 { 937 {
938 int r, m; 938 int r, m;
939 939
940 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 940 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
941 if (!r) 941 if (!r)
942 return -ENOMEM; 942 return -ENOMEM;
943 943
944 spin_lock(&_minor_lock); 944 spin_lock(&_minor_lock);
945 945
946 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); 946 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
947 if (r) { 947 if (r) {
948 goto out; 948 goto out;
949 } 949 }
950 950
951 if (m >= (1 << MINORBITS)) { 951 if (m >= (1 << MINORBITS)) {
952 idr_remove(&_minor_idr, m); 952 idr_remove(&_minor_idr, m);
953 r = -ENOSPC; 953 r = -ENOSPC;
954 goto out; 954 goto out;
955 } 955 }
956 956
957 *minor = m; 957 *minor = m;
958 958
959 out: 959 out:
960 spin_unlock(&_minor_lock); 960 spin_unlock(&_minor_lock);
961 return r; 961 return r;
962 } 962 }
963 963
964 static struct block_device_operations dm_blk_dops; 964 static struct block_device_operations dm_blk_dops;
965 965
966 /* 966 /*
967 * Allocate and initialise a blank device with a given minor. 967 * Allocate and initialise a blank device with a given minor.
968 */ 968 */
969 static struct mapped_device *alloc_dev(int minor) 969 static struct mapped_device *alloc_dev(int minor)
970 { 970 {
971 int r; 971 int r;
972 struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); 972 struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
973 void *old_md; 973 void *old_md;
974 974
975 if (!md) { 975 if (!md) {
976 DMWARN("unable to allocate device, out of memory."); 976 DMWARN("unable to allocate device, out of memory.");
977 return NULL; 977 return NULL;
978 } 978 }
979 979
980 if (!try_module_get(THIS_MODULE)) 980 if (!try_module_get(THIS_MODULE))
981 goto bad0; 981 goto bad0;
982 982
983 /* get a minor number for the dev */ 983 /* get a minor number for the dev */
984 if (minor == DM_ANY_MINOR) 984 if (minor == DM_ANY_MINOR)
985 r = next_free_minor(md, &minor); 985 r = next_free_minor(md, &minor);
986 else 986 else
987 r = specific_minor(md, minor); 987 r = specific_minor(md, minor);
988 if (r < 0) 988 if (r < 0)
989 goto bad1; 989 goto bad1;
990 990
991 memset(md, 0, sizeof(*md)); 991 memset(md, 0, sizeof(*md));
992 init_rwsem(&md->io_lock); 992 init_rwsem(&md->io_lock);
993 init_MUTEX(&md->suspend_lock); 993 init_MUTEX(&md->suspend_lock);
994 spin_lock_init(&md->pushback_lock); 994 spin_lock_init(&md->pushback_lock);
995 rwlock_init(&md->map_lock); 995 rwlock_init(&md->map_lock);
996 atomic_set(&md->holders, 1); 996 atomic_set(&md->holders, 1);
997 atomic_set(&md->open_count, 0); 997 atomic_set(&md->open_count, 0);
998 atomic_set(&md->event_nr, 0); 998 atomic_set(&md->event_nr, 0);
999 999
1000 md->queue = blk_alloc_queue(GFP_KERNEL); 1000 md->queue = blk_alloc_queue(GFP_KERNEL);
1001 if (!md->queue) 1001 if (!md->queue)
1002 goto bad1_free_minor; 1002 goto bad1_free_minor;
1003 1003
1004 md->queue->queuedata = md; 1004 md->queue->queuedata = md;
1005 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1005 md->queue->backing_dev_info.congested_fn = dm_any_congested;
1006 md->queue->backing_dev_info.congested_data = md; 1006 md->queue->backing_dev_info.congested_data = md;
1007 blk_queue_make_request(md->queue, dm_request); 1007 blk_queue_make_request(md->queue, dm_request);
1008 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1008 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1009 md->queue->unplug_fn = dm_unplug_all; 1009 md->queue->unplug_fn = dm_unplug_all;
1010 md->queue->issue_flush_fn = dm_flush_all; 1010 md->queue->issue_flush_fn = dm_flush_all;
1011 1011
1012 md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); 1012 md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
1013 if (!md->io_pool) 1013 if (!md->io_pool)
1014 goto bad2; 1014 goto bad2;
1015 1015
1016 md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); 1016 md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
1017 if (!md->tio_pool) 1017 if (!md->tio_pool)
1018 goto bad3; 1018 goto bad3;
1019 1019
1020 md->bs = bioset_create(16, 16); 1020 md->bs = bioset_create(16, 16);
1021 if (!md->bs) 1021 if (!md->bs)
1022 goto bad_no_bioset; 1022 goto bad_no_bioset;
1023 1023
1024 md->disk = alloc_disk(1); 1024 md->disk = alloc_disk(1);
1025 if (!md->disk) 1025 if (!md->disk)
1026 goto bad4; 1026 goto bad4;
1027 1027
1028 atomic_set(&md->pending, 0); 1028 atomic_set(&md->pending, 0);
1029 init_waitqueue_head(&md->wait); 1029 init_waitqueue_head(&md->wait);
1030 init_waitqueue_head(&md->eventq); 1030 init_waitqueue_head(&md->eventq);
1031 1031
1032 md->disk->major = _major; 1032 md->disk->major = _major;
1033 md->disk->first_minor = minor; 1033 md->disk->first_minor = minor;
1034 md->disk->fops = &dm_blk_dops; 1034 md->disk->fops = &dm_blk_dops;
1035 md->disk->queue = md->queue; 1035 md->disk->queue = md->queue;
1036 md->disk->private_data = md; 1036 md->disk->private_data = md;
1037 sprintf(md->disk->disk_name, "dm-%d", minor); 1037 sprintf(md->disk->disk_name, "dm-%d", minor);
1038 add_disk(md->disk); 1038 add_disk(md->disk);
1039 format_dev_t(md->name, MKDEV(_major, minor)); 1039 format_dev_t(md->name, MKDEV(_major, minor));
1040 1040
1041 /* Populate the mapping, nobody knows we exist yet */ 1041 /* Populate the mapping, nobody knows we exist yet */
1042 spin_lock(&_minor_lock); 1042 spin_lock(&_minor_lock);
1043 old_md = idr_replace(&_minor_idr, md, minor); 1043 old_md = idr_replace(&_minor_idr, md, minor);
1044 spin_unlock(&_minor_lock); 1044 spin_unlock(&_minor_lock);
1045 1045
1046 BUG_ON(old_md != MINOR_ALLOCED); 1046 BUG_ON(old_md != MINOR_ALLOCED);
1047 1047
1048 return md; 1048 return md;
1049 1049
1050 bad4: 1050 bad4:
1051 bioset_free(md->bs); 1051 bioset_free(md->bs);
1052 bad_no_bioset: 1052 bad_no_bioset:
1053 mempool_destroy(md->tio_pool); 1053 mempool_destroy(md->tio_pool);
1054 bad3: 1054 bad3:
1055 mempool_destroy(md->io_pool); 1055 mempool_destroy(md->io_pool);
1056 bad2: 1056 bad2:
1057 blk_cleanup_queue(md->queue); 1057 blk_cleanup_queue(md->queue);
1058 bad1_free_minor: 1058 bad1_free_minor:
1059 free_minor(minor); 1059 free_minor(minor);
1060 bad1: 1060 bad1:
1061 module_put(THIS_MODULE); 1061 module_put(THIS_MODULE);
1062 bad0: 1062 bad0:
1063 kfree(md); 1063 kfree(md);
1064 return NULL; 1064 return NULL;
1065 } 1065 }
1066 1066
1067 static void free_dev(struct mapped_device *md) 1067 static void free_dev(struct mapped_device *md)
1068 { 1068 {
1069 int minor = md->disk->first_minor; 1069 int minor = md->disk->first_minor;
1070 1070
1071 if (md->suspended_bdev) { 1071 if (md->suspended_bdev) {
1072 thaw_bdev(md->suspended_bdev, NULL); 1072 thaw_bdev(md->suspended_bdev, NULL);
1073 bdput(md->suspended_bdev); 1073 bdput(md->suspended_bdev);
1074 } 1074 }
1075 mempool_destroy(md->tio_pool); 1075 mempool_destroy(md->tio_pool);
1076 mempool_destroy(md->io_pool); 1076 mempool_destroy(md->io_pool);
1077 bioset_free(md->bs); 1077 bioset_free(md->bs);
1078 del_gendisk(md->disk); 1078 del_gendisk(md->disk);
1079 free_minor(minor); 1079 free_minor(minor);
1080 1080
1081 spin_lock(&_minor_lock); 1081 spin_lock(&_minor_lock);
1082 md->disk->private_data = NULL; 1082 md->disk->private_data = NULL;
1083 spin_unlock(&_minor_lock); 1083 spin_unlock(&_minor_lock);
1084 1084
1085 put_disk(md->disk); 1085 put_disk(md->disk);
1086 blk_cleanup_queue(md->queue); 1086 blk_cleanup_queue(md->queue);
1087 module_put(THIS_MODULE); 1087 module_put(THIS_MODULE);
1088 kfree(md); 1088 kfree(md);
1089 } 1089 }
1090 1090
1091 /* 1091 /*
1092 * Bind a table to the device. 1092 * Bind a table to the device.
1093 */ 1093 */
1094 static void event_callback(void *context) 1094 static void event_callback(void *context)
1095 { 1095 {
1096 struct mapped_device *md = (struct mapped_device *) context; 1096 struct mapped_device *md = (struct mapped_device *) context;
1097 1097
1098 atomic_inc(&md->event_nr); 1098 atomic_inc(&md->event_nr);
1099 wake_up(&md->eventq); 1099 wake_up(&md->eventq);
1100 } 1100 }
1101 1101
1102 static void __set_size(struct mapped_device *md, sector_t size) 1102 static void __set_size(struct mapped_device *md, sector_t size)
1103 { 1103 {
1104 set_capacity(md->disk, size); 1104 set_capacity(md->disk, size);
1105 1105
1106 mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); 1106 mutex_lock(&md->suspended_bdev->bd_inode->i_mutex);
1107 i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 1107 i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1108 mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); 1108 mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex);
1109 } 1109 }
1110 1110
1111 static int __bind(struct mapped_device *md, struct dm_table *t) 1111 static int __bind(struct mapped_device *md, struct dm_table *t)
1112 { 1112 {
1113 struct request_queue *q = md->queue; 1113 struct request_queue *q = md->queue;
1114 sector_t size; 1114 sector_t size;
1115 1115
1116 size = dm_table_get_size(t); 1116 size = dm_table_get_size(t);
1117 1117
1118 /* 1118 /*
1119 * Wipe any geometry if the size of the table changed. 1119 * Wipe any geometry if the size of the table changed.
1120 */ 1120 */
1121 if (size != get_capacity(md->disk)) 1121 if (size != get_capacity(md->disk))
1122 memset(&md->geometry, 0, sizeof(md->geometry)); 1122 memset(&md->geometry, 0, sizeof(md->geometry));
1123 1123
1124 if (md->suspended_bdev) 1124 if (md->suspended_bdev)
1125 __set_size(md, size); 1125 __set_size(md, size);
1126 if (size == 0) 1126 if (size == 0)
1127 return 0; 1127 return 0;
1128 1128
1129 dm_table_get(t); 1129 dm_table_get(t);
1130 dm_table_event_callback(t, event_callback, md); 1130 dm_table_event_callback(t, event_callback, md);
1131 1131
1132 write_lock(&md->map_lock); 1132 write_lock(&md->map_lock);
1133 md->map = t; 1133 md->map = t;
1134 dm_table_set_restrictions(t, q); 1134 dm_table_set_restrictions(t, q);
1135 write_unlock(&md->map_lock); 1135 write_unlock(&md->map_lock);
1136 1136
1137 return 0; 1137 return 0;
1138 } 1138 }
1139 1139
1140 static void __unbind(struct mapped_device *md) 1140 static void __unbind(struct mapped_device *md)
1141 { 1141 {
1142 struct dm_table *map = md->map; 1142 struct dm_table *map = md->map;
1143 1143
1144 if (!map) 1144 if (!map)
1145 return; 1145 return;
1146 1146
1147 dm_table_event_callback(map, NULL, NULL); 1147 dm_table_event_callback(map, NULL, NULL);
1148 write_lock(&md->map_lock); 1148 write_lock(&md->map_lock);
1149 md->map = NULL; 1149 md->map = NULL;
1150 write_unlock(&md->map_lock); 1150 write_unlock(&md->map_lock);
1151 dm_table_put(map); 1151 dm_table_put(map);
1152 } 1152 }
1153 1153
1154 /* 1154 /*
1155 * Constructor for a new device. 1155 * Constructor for a new device.
1156 */ 1156 */
1157 int dm_create(int minor, struct mapped_device **result) 1157 int dm_create(int minor, struct mapped_device **result)
1158 { 1158 {
1159 struct mapped_device *md; 1159 struct mapped_device *md;
1160 1160
1161 md = alloc_dev(minor); 1161 md = alloc_dev(minor);
1162 if (!md) 1162 if (!md)
1163 return -ENXIO; 1163 return -ENXIO;
1164 1164
1165 *result = md; 1165 *result = md;
1166 return 0; 1166 return 0;
1167 } 1167 }
1168 1168
1169 static struct mapped_device *dm_find_md(dev_t dev) 1169 static struct mapped_device *dm_find_md(dev_t dev)
1170 { 1170 {
1171 struct mapped_device *md; 1171 struct mapped_device *md;
1172 unsigned minor = MINOR(dev); 1172 unsigned minor = MINOR(dev);
1173 1173
1174 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 1174 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
1175 return NULL; 1175 return NULL;
1176 1176
1177 spin_lock(&_minor_lock); 1177 spin_lock(&_minor_lock);
1178 1178
1179 md = idr_find(&_minor_idr, minor); 1179 md = idr_find(&_minor_idr, minor);
1180 if (md && (md == MINOR_ALLOCED || 1180 if (md && (md == MINOR_ALLOCED ||
1181 (dm_disk(md)->first_minor != minor) || 1181 (dm_disk(md)->first_minor != minor) ||
1182 test_bit(DMF_FREEING, &md->flags))) { 1182 test_bit(DMF_FREEING, &md->flags))) {
1183 md = NULL; 1183 md = NULL;
1184 goto out; 1184 goto out;
1185 } 1185 }
1186 1186
1187 out: 1187 out:
1188 spin_unlock(&_minor_lock); 1188 spin_unlock(&_minor_lock);
1189 1189
1190 return md; 1190 return md;
1191 } 1191 }
1192 1192
1193 struct mapped_device *dm_get_md(dev_t dev) 1193 struct mapped_device *dm_get_md(dev_t dev)
1194 { 1194 {
1195 struct mapped_device *md = dm_find_md(dev); 1195 struct mapped_device *md = dm_find_md(dev);
1196 1196
1197 if (md) 1197 if (md)
1198 dm_get(md); 1198 dm_get(md);
1199 1199
1200 return md; 1200 return md;
1201 } 1201 }
1202 1202
1203 void *dm_get_mdptr(struct mapped_device *md) 1203 void *dm_get_mdptr(struct mapped_device *md)
1204 { 1204 {
1205 return md->interface_ptr; 1205 return md->interface_ptr;
1206 } 1206 }
1207 1207
1208 void dm_set_mdptr(struct mapped_device *md, void *ptr) 1208 void dm_set_mdptr(struct mapped_device *md, void *ptr)
1209 { 1209 {
1210 md->interface_ptr = ptr; 1210 md->interface_ptr = ptr;
1211 } 1211 }
1212 1212
1213 void dm_get(struct mapped_device *md) 1213 void dm_get(struct mapped_device *md)
1214 { 1214 {
1215 atomic_inc(&md->holders); 1215 atomic_inc(&md->holders);
1216 } 1216 }
1217 1217
1218 const char *dm_device_name(struct mapped_device *md) 1218 const char *dm_device_name(struct mapped_device *md)
1219 { 1219 {
1220 return md->name; 1220 return md->name;
1221 } 1221 }
1222 EXPORT_SYMBOL_GPL(dm_device_name); 1222 EXPORT_SYMBOL_GPL(dm_device_name);
1223 1223
1224 void dm_put(struct mapped_device *md) 1224 void dm_put(struct mapped_device *md)
1225 { 1225 {
1226 struct dm_table *map; 1226 struct dm_table *map;
1227 1227
1228 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 1228 BUG_ON(test_bit(DMF_FREEING, &md->flags));
1229 1229
1230 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { 1230 if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
1231 map = dm_get_table(md); 1231 map = dm_get_table(md);
1232 idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor); 1232 idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor);
1233 set_bit(DMF_FREEING, &md->flags); 1233 set_bit(DMF_FREEING, &md->flags);
1234 spin_unlock(&_minor_lock); 1234 spin_unlock(&_minor_lock);
1235 if (!dm_suspended(md)) { 1235 if (!dm_suspended(md)) {
1236 dm_table_presuspend_targets(map); 1236 dm_table_presuspend_targets(map);
1237 dm_table_postsuspend_targets(map); 1237 dm_table_postsuspend_targets(map);
1238 } 1238 }
1239 __unbind(md); 1239 __unbind(md);
1240 dm_table_put(map); 1240 dm_table_put(map);
1241 free_dev(md); 1241 free_dev(md);
1242 } 1242 }
1243 } 1243 }
1244 EXPORT_SYMBOL_GPL(dm_put); 1244 EXPORT_SYMBOL_GPL(dm_put);
1245 1245
1246 /* 1246 /*
1247 * Process the deferred bios 1247 * Process the deferred bios
1248 */ 1248 */
1249 static void __flush_deferred_io(struct mapped_device *md, struct bio *c) 1249 static void __flush_deferred_io(struct mapped_device *md, struct bio *c)
1250 { 1250 {
1251 struct bio *n; 1251 struct bio *n;
1252 1252
1253 while (c) { 1253 while (c) {
1254 n = c->bi_next; 1254 n = c->bi_next;
1255 c->bi_next = NULL; 1255 c->bi_next = NULL;
1256 __split_bio(md, c); 1256 __split_bio(md, c);
1257 c = n; 1257 c = n;
1258 } 1258 }
1259 } 1259 }
1260 1260
1261 /* 1261 /*
1262 * Swap in a new table (destroying old one). 1262 * Swap in a new table (destroying old one).
1263 */ 1263 */
1264 int dm_swap_table(struct mapped_device *md, struct dm_table *table) 1264 int dm_swap_table(struct mapped_device *md, struct dm_table *table)
1265 { 1265 {
1266 int r = -EINVAL; 1266 int r = -EINVAL;
1267 1267
1268 down(&md->suspend_lock); 1268 down(&md->suspend_lock);
1269 1269
1270 /* device must be suspended */ 1270 /* device must be suspended */
1271 if (!dm_suspended(md)) 1271 if (!dm_suspended(md))
1272 goto out; 1272 goto out;
1273 1273
1274 /* without bdev, the device size cannot be changed */ 1274 /* without bdev, the device size cannot be changed */
1275 if (!md->suspended_bdev) 1275 if (!md->suspended_bdev)
1276 if (get_capacity(md->disk) != dm_table_get_size(table)) 1276 if (get_capacity(md->disk) != dm_table_get_size(table))
1277 goto out; 1277 goto out;
1278 1278
1279 __unbind(md); 1279 __unbind(md);
1280 r = __bind(md, table); 1280 r = __bind(md, table);
1281 1281
1282 out: 1282 out:
1283 up(&md->suspend_lock); 1283 up(&md->suspend_lock);
1284 return r; 1284 return r;
1285 } 1285 }
1286 1286
1287 /* 1287 /*
1288 * Functions to lock and unlock any filesystem running on the 1288 * Functions to lock and unlock any filesystem running on the
1289 * device. 1289 * device.
1290 */ 1290 */
1291 static int lock_fs(struct mapped_device *md) 1291 static int lock_fs(struct mapped_device *md)
1292 { 1292 {
1293 int r; 1293 int r;
1294 1294
1295 WARN_ON(md->frozen_sb); 1295 WARN_ON(md->frozen_sb);
1296 1296
1297 md->frozen_sb = freeze_bdev(md->suspended_bdev); 1297 md->frozen_sb = freeze_bdev(md->suspended_bdev);
1298 if (IS_ERR(md->frozen_sb)) { 1298 if (IS_ERR(md->frozen_sb)) {
1299 r = PTR_ERR(md->frozen_sb); 1299 r = PTR_ERR(md->frozen_sb);
1300 md->frozen_sb = NULL; 1300 md->frozen_sb = NULL;
1301 return r; 1301 return r;
1302 } 1302 }
1303 1303
1304 set_bit(DMF_FROZEN, &md->flags); 1304 set_bit(DMF_FROZEN, &md->flags);
1305 1305
1306 /* don't bdput right now, we don't want the bdev 1306 /* don't bdput right now, we don't want the bdev
1307 * to go away while it is locked. 1307 * to go away while it is locked.
1308 */ 1308 */
1309 return 0; 1309 return 0;
1310 } 1310 }
1311 1311
1312 static void unlock_fs(struct mapped_device *md) 1312 static void unlock_fs(struct mapped_device *md)
1313 { 1313 {
1314 if (!test_bit(DMF_FROZEN, &md->flags)) 1314 if (!test_bit(DMF_FROZEN, &md->flags))
1315 return; 1315 return;
1316 1316
1317 thaw_bdev(md->suspended_bdev, md->frozen_sb); 1317 thaw_bdev(md->suspended_bdev, md->frozen_sb);
1318 md->frozen_sb = NULL; 1318 md->frozen_sb = NULL;
1319 clear_bit(DMF_FROZEN, &md->flags); 1319 clear_bit(DMF_FROZEN, &md->flags);
1320 } 1320 }
1321 1321
1322 /* 1322 /*
1323 * We need to be able to change a mapping table under a mounted 1323 * We need to be able to change a mapping table under a mounted
1324 * filesystem. For example, we might want to move some data in 1324 * filesystem. For example, we might want to move some data in
1325 * the background. Before the table can be swapped with 1325 * the background. Before the table can be swapped with
1326 * dm_swap_table, dm_suspend must be called to flush any in 1326 * dm_swap_table, dm_suspend must be called to flush any in
1327 * flight bios and ensure that any further io gets deferred. 1327 * flight bios and ensure that any further io gets deferred.
1328 */ 1328 */
1329 int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 1329 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
1330 { 1330 {
1331 struct dm_table *map = NULL; 1331 struct dm_table *map = NULL;
1332 unsigned long flags; 1332 unsigned long flags;
1333 DECLARE_WAITQUEUE(wait, current); 1333 DECLARE_WAITQUEUE(wait, current);
1334 struct bio *def; 1334 struct bio *def;
1335 int r = -EINVAL; 1335 int r = -EINVAL;
1336 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; 1336 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
1337 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; 1337 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
1338 1338
1339 down(&md->suspend_lock); 1339 down(&md->suspend_lock);
1340 1340
1341 if (dm_suspended(md)) 1341 if (dm_suspended(md))
1342 goto out_unlock; 1342 goto out_unlock;
1343 1343
1344 map = dm_get_table(md); 1344 map = dm_get_table(md);
1345 1345
1346 /* 1346 /*
1347 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 1347 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
1348 * This flag is cleared before dm_suspend returns. 1348 * This flag is cleared before dm_suspend returns.
1349 */ 1349 */
1350 if (noflush) 1350 if (noflush)
1351 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 1351 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
1352 1352
1353 /* This does not get reverted if there's an error later. */ 1353 /* This does not get reverted if there's an error later. */
1354 dm_table_presuspend_targets(map); 1354 dm_table_presuspend_targets(map);
1355 1355
1356 /* bdget() can stall if the pending I/Os are not flushed */ 1356 /* bdget() can stall if the pending I/Os are not flushed */
1357 if (!noflush) { 1357 if (!noflush) {
1358 md->suspended_bdev = bdget_disk(md->disk, 0); 1358 md->suspended_bdev = bdget_disk(md->disk, 0);
1359 if (!md->suspended_bdev) { 1359 if (!md->suspended_bdev) {
1360 DMWARN("bdget failed in dm_suspend"); 1360 DMWARN("bdget failed in dm_suspend");
1361 r = -ENOMEM; 1361 r = -ENOMEM;
1362 goto flush_and_out; 1362 goto flush_and_out;
1363 } 1363 }
1364 } 1364 }
1365 1365
1366 /* 1366 /*
1367 * Flush I/O to the device. 1367 * Flush I/O to the device.
1368 * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os. 1368 * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os.
1369 */ 1369 */
1370 if (do_lockfs && !noflush) { 1370 if (do_lockfs && !noflush) {
1371 r = lock_fs(md); 1371 r = lock_fs(md);
1372 if (r) 1372 if (r)
1373 goto out; 1373 goto out;
1374 } 1374 }
1375 1375
1376 /* 1376 /*
1377 * First we set the BLOCK_IO flag so no more ios will be mapped. 1377 * First we set the BLOCK_IO flag so no more ios will be mapped.
1378 */ 1378 */
1379 down_write(&md->io_lock); 1379 down_write(&md->io_lock);
1380 set_bit(DMF_BLOCK_IO, &md->flags); 1380 set_bit(DMF_BLOCK_IO, &md->flags);
1381 1381
1382 add_wait_queue(&md->wait, &wait); 1382 add_wait_queue(&md->wait, &wait);
1383 up_write(&md->io_lock); 1383 up_write(&md->io_lock);
1384 1384
1385 /* unplug */ 1385 /* unplug */
1386 if (map) 1386 if (map)
1387 dm_table_unplug_all(map); 1387 dm_table_unplug_all(map);
1388 1388
1389 /* 1389 /*
1390 * Then we wait for the already mapped ios to 1390 * Then we wait for the already mapped ios to
1391 * complete. 1391 * complete.
1392 */ 1392 */
1393 while (1) { 1393 while (1) {
1394 set_current_state(TASK_INTERRUPTIBLE); 1394 set_current_state(TASK_INTERRUPTIBLE);
1395 1395
1396 if (!atomic_read(&md->pending) || signal_pending(current)) 1396 if (!atomic_read(&md->pending) || signal_pending(current))
1397 break; 1397 break;
1398 1398
1399 io_schedule(); 1399 io_schedule();
1400 } 1400 }
1401 set_current_state(TASK_RUNNING); 1401 set_current_state(TASK_RUNNING);
1402 1402
1403 down_write(&md->io_lock); 1403 down_write(&md->io_lock);
1404 remove_wait_queue(&md->wait, &wait); 1404 remove_wait_queue(&md->wait, &wait);
1405 1405
1406 if (noflush) { 1406 if (noflush) {
1407 spin_lock_irqsave(&md->pushback_lock, flags); 1407 spin_lock_irqsave(&md->pushback_lock, flags);
1408 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 1408 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
1409 bio_list_merge_head(&md->deferred, &md->pushback); 1409 bio_list_merge_head(&md->deferred, &md->pushback);
1410 bio_list_init(&md->pushback); 1410 bio_list_init(&md->pushback);
1411 spin_unlock_irqrestore(&md->pushback_lock, flags); 1411 spin_unlock_irqrestore(&md->pushback_lock, flags);
1412 } 1412 }
1413 1413
1414 /* were we interrupted ? */ 1414 /* were we interrupted ? */
1415 r = -EINTR; 1415 r = -EINTR;
1416 if (atomic_read(&md->pending)) { 1416 if (atomic_read(&md->pending)) {
1417 clear_bit(DMF_BLOCK_IO, &md->flags); 1417 clear_bit(DMF_BLOCK_IO, &md->flags);
1418 def = bio_list_get(&md->deferred); 1418 def = bio_list_get(&md->deferred);
1419 __flush_deferred_io(md, def); 1419 __flush_deferred_io(md, def);
1420 up_write(&md->io_lock); 1420 up_write(&md->io_lock);
1421 unlock_fs(md); 1421 unlock_fs(md);
1422 goto out; /* pushback list is already flushed, so skip flush */ 1422 goto out; /* pushback list is already flushed, so skip flush */
1423 } 1423 }
1424 up_write(&md->io_lock); 1424 up_write(&md->io_lock);
1425 1425
1426 dm_table_postsuspend_targets(map); 1426 dm_table_postsuspend_targets(map);
1427 1427
1428 set_bit(DMF_SUSPENDED, &md->flags); 1428 set_bit(DMF_SUSPENDED, &md->flags);
1429 1429
1430 r = 0; 1430 r = 0;
1431 1431
1432 flush_and_out: 1432 flush_and_out:
1433 if (r && noflush) { 1433 if (r && noflush) {
1434 /* 1434 /*
1435 * Because there may already be I/Os in the pushback list, 1435 * Because there may already be I/Os in the pushback list,
1436 * flush them before returning. 1436 * flush them before returning.
1437 */ 1437 */
1438 down_write(&md->io_lock); 1438 down_write(&md->io_lock);
1439 1439
1440 spin_lock_irqsave(&md->pushback_lock, flags); 1440 spin_lock_irqsave(&md->pushback_lock, flags);
1441 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 1441 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
1442 bio_list_merge_head(&md->deferred, &md->pushback); 1442 bio_list_merge_head(&md->deferred, &md->pushback);
1443 bio_list_init(&md->pushback); 1443 bio_list_init(&md->pushback);
1444 spin_unlock_irqrestore(&md->pushback_lock, flags); 1444 spin_unlock_irqrestore(&md->pushback_lock, flags);
1445 1445
1446 def = bio_list_get(&md->deferred); 1446 def = bio_list_get(&md->deferred);
1447 __flush_deferred_io(md, def); 1447 __flush_deferred_io(md, def);
1448 up_write(&md->io_lock); 1448 up_write(&md->io_lock);
1449 } 1449 }
1450 1450
1451 out: 1451 out:
1452 if (r && md->suspended_bdev) { 1452 if (r && md->suspended_bdev) {
1453 bdput(md->suspended_bdev); 1453 bdput(md->suspended_bdev);
1454 md->suspended_bdev = NULL; 1454 md->suspended_bdev = NULL;
1455 } 1455 }
1456 1456
1457 dm_table_put(map); 1457 dm_table_put(map);
1458 1458
1459 out_unlock: 1459 out_unlock:
1460 up(&md->suspend_lock); 1460 up(&md->suspend_lock);
1461 return r; 1461 return r;
1462 } 1462 }
1463 1463
1464 int dm_resume(struct mapped_device *md) 1464 int dm_resume(struct mapped_device *md)
1465 { 1465 {
1466 int r = -EINVAL; 1466 int r = -EINVAL;
1467 struct bio *def; 1467 struct bio *def;
1468 struct dm_table *map = NULL; 1468 struct dm_table *map = NULL;
1469 1469
1470 down(&md->suspend_lock); 1470 down(&md->suspend_lock);
1471 if (!dm_suspended(md)) 1471 if (!dm_suspended(md))
1472 goto out; 1472 goto out;
1473 1473
1474 map = dm_get_table(md); 1474 map = dm_get_table(md);
1475 if (!map || !dm_table_get_size(map)) 1475 if (!map || !dm_table_get_size(map))
1476 goto out; 1476 goto out;
1477 1477
1478 r = dm_table_resume_targets(map); 1478 r = dm_table_resume_targets(map);
1479 if (r) 1479 if (r)
1480 goto out; 1480 goto out;
1481 1481
1482 down_write(&md->io_lock); 1482 down_write(&md->io_lock);
1483 clear_bit(DMF_BLOCK_IO, &md->flags); 1483 clear_bit(DMF_BLOCK_IO, &md->flags);
1484 1484
1485 def = bio_list_get(&md->deferred); 1485 def = bio_list_get(&md->deferred);
1486 __flush_deferred_io(md, def); 1486 __flush_deferred_io(md, def);
1487 up_write(&md->io_lock); 1487 up_write(&md->io_lock);
1488 1488
1489 unlock_fs(md); 1489 unlock_fs(md);
1490 1490
1491 if (md->suspended_bdev) { 1491 if (md->suspended_bdev) {
1492 bdput(md->suspended_bdev); 1492 bdput(md->suspended_bdev);
1493 md->suspended_bdev = NULL; 1493 md->suspended_bdev = NULL;
1494 } 1494 }
1495 1495
1496 clear_bit(DMF_SUSPENDED, &md->flags); 1496 clear_bit(DMF_SUSPENDED, &md->flags);
1497 1497
1498 dm_table_unplug_all(map); 1498 dm_table_unplug_all(map);
1499 1499
1500 kobject_uevent(&md->disk->kobj, KOBJ_CHANGE); 1500 kobject_uevent(&md->disk->kobj, KOBJ_CHANGE);
1501 1501
1502 r = 0; 1502 r = 0;
1503 1503
1504 out: 1504 out:
1505 dm_table_put(map); 1505 dm_table_put(map);
1506 up(&md->suspend_lock); 1506 up(&md->suspend_lock);
1507 1507
1508 return r; 1508 return r;
1509 } 1509 }
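
dm_suspend(), dm_swap_table() and dm_resume() above are intended to be driven in that order whenever a table is replaced. The following is a minimal sketch of that sequence under stated assumptions: example_replace_table() is a hypothetical caller (the real driver of this sequence is the DM ioctl layer, which is not part of this hunk), and noflush handling and the construction of new_table are omitted.

/*
 * Sketch only: drive a table swap using the interfaces defined above.
 * 'example_replace_table' is hypothetical; the noflush/lockfs policy
 * decisions and the building of 'new_table' are left out.
 */
static int example_replace_table(struct mapped_device *md,
                                 struct dm_table *new_table)
{
        int r;

        /* Defer new bios and wait for in-flight ones to complete. */
        r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
        if (r)
                return r;

        /* Bind the new table; only legal while suspended. */
        r = dm_swap_table(md, new_table);
        if (r) {
                dm_resume(md);  /* best effort: resume with the old table */
                return r;
        }

        /* Clear DMF_BLOCK_IO and replay any deferred bios. */
        return dm_resume(md);
}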
1510 1510
1511 /*----------------------------------------------------------------- 1511 /*-----------------------------------------------------------------
1512 * Event notification. 1512 * Event notification.
1513 *---------------------------------------------------------------*/ 1513 *---------------------------------------------------------------*/
1514 uint32_t dm_get_event_nr(struct mapped_device *md) 1514 uint32_t dm_get_event_nr(struct mapped_device *md)
1515 { 1515 {
1516 return atomic_read(&md->event_nr); 1516 return atomic_read(&md->event_nr);
1517 } 1517 }
1518 1518
1519 int dm_wait_event(struct mapped_device *md, int event_nr) 1519 int dm_wait_event(struct mapped_device *md, int event_nr)
1520 { 1520 {
1521 return wait_event_interruptible(md->eventq, 1521 return wait_event_interruptible(md->eventq,
1522 (event_nr != atomic_read(&md->event_nr))); 1522 (event_nr != atomic_read(&md->event_nr)));
1523 } 1523 }
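
A short sketch of the intended use of the two helpers above, with a hypothetical caller name: sample the counter first, then sleep until a table event bumps it.

/* Sketch only: typical pattern for dm_get_event_nr()/dm_wait_event(). */
static int example_wait_for_table_event(struct mapped_device *md)
{
        uint32_t nr = dm_get_event_nr(md);

        /* ... examine device state here; a racing event bumps event_nr ... */

        /* Returns 0 once a new event has occurred, -ERESTARTSYS if interrupted. */
        return dm_wait_event(md, nr);
}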
1524 1524
1525 /* 1525 /*
1526 * The gendisk is only valid as long as you have a reference 1526 * The gendisk is only valid as long as you have a reference
1527 * count on 'md'. 1527 * count on 'md'.
1528 */ 1528 */
1529 struct gendisk *dm_disk(struct mapped_device *md) 1529 struct gendisk *dm_disk(struct mapped_device *md)
1530 { 1530 {
1531 return md->disk; 1531 return md->disk;
1532 } 1532 }
1533 1533
1534 int dm_suspended(struct mapped_device *md) 1534 int dm_suspended(struct mapped_device *md)
1535 { 1535 {
1536 return test_bit(DMF_SUSPENDED, &md->flags); 1536 return test_bit(DMF_SUSPENDED, &md->flags);
1537 } 1537 }
1538 1538
1539 int dm_noflush_suspending(struct dm_target *ti) 1539 int dm_noflush_suspending(struct dm_target *ti)
1540 { 1540 {
1541 struct mapped_device *md = dm_table_get_md(ti->table); 1541 struct mapped_device *md = dm_table_get_md(ti->table);
1542 int r = __noflush_suspending(md); 1542 int r = __noflush_suspending(md);
1543 1543
1544 dm_put(md); 1544 dm_put(md);
1545 1545
1546 return r; 1546 return r;
1547 } 1547 }
1548 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 1548 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
1549 1549
1550 static struct block_device_operations dm_blk_dops = { 1550 static struct block_device_operations dm_blk_dops = {
1551 .open = dm_blk_open, 1551 .open = dm_blk_open,
1552 .release = dm_blk_close, 1552 .release = dm_blk_close,
1553 .ioctl = dm_blk_ioctl, 1553 .ioctl = dm_blk_ioctl,
1554 .getgeo = dm_blk_getgeo, 1554 .getgeo = dm_blk_getgeo,
1555 .owner = THIS_MODULE 1555 .owner = THIS_MODULE
1556 }; 1556 };
1557 1557
1558 EXPORT_SYMBOL(dm_get_mapinfo); 1558 EXPORT_SYMBOL(dm_get_mapinfo);
1559 1559
1560 /* 1560 /*
1561 * module hooks 1561 * module hooks
1562 */ 1562 */
1563 module_init(dm_init); 1563 module_init(dm_init);
1564 module_exit(dm_exit); 1564 module_exit(dm_exit);
1565 1565
1566 module_param(major, uint, 0); 1566 module_param(major, uint, 0);
1567 MODULE_PARM_DESC(major, "The major number of the device mapper"); 1567 MODULE_PARM_DESC(major, "The major number of the device mapper");
1568 MODULE_DESCRIPTION(DM_NAME " driver"); 1568 MODULE_DESCRIPTION(DM_NAME " driver");
1569 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 1569 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
1570 MODULE_LICENSE("GPL"); 1570 MODULE_LICENSE("GPL");
1571 1571
include/linux/blktrace_api.h
1 #ifndef BLKTRACE_H 1 #ifndef BLKTRACE_H
2 #define BLKTRACE_H 2 #define BLKTRACE_H
3 3
4 #include <linux/blkdev.h> 4 #include <linux/blkdev.h>
5 #include <linux/relay.h> 5 #include <linux/relay.h>
6 6
7 /* 7 /*
8 * Trace categories 8 * Trace categories
9 */ 9 */
10 enum blktrace_cat { 10 enum blktrace_cat {
11 BLK_TC_READ = 1 << 0, /* reads */ 11 BLK_TC_READ = 1 << 0, /* reads */
12 BLK_TC_WRITE = 1 << 1, /* writes */ 12 BLK_TC_WRITE = 1 << 1, /* writes */
13 BLK_TC_BARRIER = 1 << 2, /* barrier */ 13 BLK_TC_BARRIER = 1 << 2, /* barrier */
14 BLK_TC_SYNC = 1 << 3, /* sync IO */ 14 BLK_TC_SYNC = 1 << 3, /* sync IO */
15 BLK_TC_QUEUE = 1 << 4, /* queueing/merging */ 15 BLK_TC_QUEUE = 1 << 4, /* queueing/merging */
16 BLK_TC_REQUEUE = 1 << 5, /* requeueing */ 16 BLK_TC_REQUEUE = 1 << 5, /* requeueing */
17 BLK_TC_ISSUE = 1 << 6, /* issue */ 17 BLK_TC_ISSUE = 1 << 6, /* issue */
18 BLK_TC_COMPLETE = 1 << 7, /* completions */ 18 BLK_TC_COMPLETE = 1 << 7, /* completions */
19 BLK_TC_FS = 1 << 8, /* fs requests */ 19 BLK_TC_FS = 1 << 8, /* fs requests */
20 BLK_TC_PC = 1 << 9, /* pc requests */ 20 BLK_TC_PC = 1 << 9, /* pc requests */
21 BLK_TC_NOTIFY = 1 << 10, /* special message */ 21 BLK_TC_NOTIFY = 1 << 10, /* special message */
22 BLK_TC_AHEAD = 1 << 11, /* readahead */ 22 BLK_TC_AHEAD = 1 << 11, /* readahead */
23 BLK_TC_META = 1 << 12, /* metadata */ 23 BLK_TC_META = 1 << 12, /* metadata */
24 24
25 BLK_TC_END = 1 << 15, /* only 16-bits, reminder */ 25 BLK_TC_END = 1 << 15, /* only 16-bits, reminder */
26 }; 26 };
27 27
28 #define BLK_TC_SHIFT (16) 28 #define BLK_TC_SHIFT (16)
29 #define BLK_TC_ACT(act) ((act) << BLK_TC_SHIFT) 29 #define BLK_TC_ACT(act) ((act) << BLK_TC_SHIFT)
30 30
31 /* 31 /*
32 * Basic trace actions 32 * Basic trace actions
33 */ 33 */
34 enum blktrace_act { 34 enum blktrace_act {
35 __BLK_TA_QUEUE = 1, /* queued */ 35 __BLK_TA_QUEUE = 1, /* queued */
36 __BLK_TA_BACKMERGE, /* back merged to existing rq */ 36 __BLK_TA_BACKMERGE, /* back merged to existing rq */
37 __BLK_TA_FRONTMERGE, /* front merge to existing rq */ 37 __BLK_TA_FRONTMERGE, /* front merge to existing rq */
38 __BLK_TA_GETRQ, /* allocated new request */ 38 __BLK_TA_GETRQ, /* allocated new request */
39 __BLK_TA_SLEEPRQ, /* sleeping on rq allocation */ 39 __BLK_TA_SLEEPRQ, /* sleeping on rq allocation */
40 __BLK_TA_REQUEUE, /* request requeued */ 40 __BLK_TA_REQUEUE, /* request requeued */
41 __BLK_TA_ISSUE, /* sent to driver */ 41 __BLK_TA_ISSUE, /* sent to driver */
42 __BLK_TA_COMPLETE, /* completed by driver */ 42 __BLK_TA_COMPLETE, /* completed by driver */
43 __BLK_TA_PLUG, /* queue was plugged */ 43 __BLK_TA_PLUG, /* queue was plugged */
44 __BLK_TA_UNPLUG_IO, /* queue was unplugged by io */ 44 __BLK_TA_UNPLUG_IO, /* queue was unplugged by io */
45 __BLK_TA_UNPLUG_TIMER, /* queue was unplugged by timer */ 45 __BLK_TA_UNPLUG_TIMER, /* queue was unplugged by timer */
46 __BLK_TA_INSERT, /* insert request */ 46 __BLK_TA_INSERT, /* insert request */
47 __BLK_TA_SPLIT, /* bio was split */ 47 __BLK_TA_SPLIT, /* bio was split */
48 __BLK_TA_BOUNCE, /* bio was bounced */ 48 __BLK_TA_BOUNCE, /* bio was bounced */
49 __BLK_TA_REMAP, /* bio was remapped */ 49 __BLK_TA_REMAP, /* bio was remapped */
50 }; 50 };
51 51
52 /* 52 /*
53 * Notify events. 53 * Notify events.
54 */ 54 */
55 enum blktrace_notify { 55 enum blktrace_notify {
56 __BLK_TN_PROCESS = 0, /* establish pid/name mapping */ 56 __BLK_TN_PROCESS = 0, /* establish pid/name mapping */
57 __BLK_TN_TIMESTAMP, /* include system clock */ 57 __BLK_TN_TIMESTAMP, /* include system clock */
58 }; 58 };
59 59
60 60
61 /* 61 /*
62 * Trace actions in full. Additionally, read or write is masked 62 * Trace actions in full. Additionally, read or write is masked
63 */ 63 */
64 #define BLK_TA_QUEUE (__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE)) 64 #define BLK_TA_QUEUE (__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE))
65 #define BLK_TA_BACKMERGE (__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE)) 65 #define BLK_TA_BACKMERGE (__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
66 #define BLK_TA_FRONTMERGE (__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE)) 66 #define BLK_TA_FRONTMERGE (__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
67 #define BLK_TA_GETRQ (__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE)) 67 #define BLK_TA_GETRQ (__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE))
68 #define BLK_TA_SLEEPRQ (__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE)) 68 #define BLK_TA_SLEEPRQ (__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE))
69 #define BLK_TA_REQUEUE (__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE)) 69 #define BLK_TA_REQUEUE (__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE))
70 #define BLK_TA_ISSUE (__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE)) 70 #define BLK_TA_ISSUE (__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE))
71 #define BLK_TA_COMPLETE (__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE)) 71 #define BLK_TA_COMPLETE (__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE))
72 #define BLK_TA_PLUG (__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE)) 72 #define BLK_TA_PLUG (__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE))
73 #define BLK_TA_UNPLUG_IO (__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE)) 73 #define BLK_TA_UNPLUG_IO (__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE))
74 #define BLK_TA_UNPLUG_TIMER (__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE)) 74 #define BLK_TA_UNPLUG_TIMER (__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE))
75 #define BLK_TA_INSERT (__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE)) 75 #define BLK_TA_INSERT (__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE))
76 #define BLK_TA_SPLIT (__BLK_TA_SPLIT) 76 #define BLK_TA_SPLIT (__BLK_TA_SPLIT)
77 #define BLK_TA_BOUNCE (__BLK_TA_BOUNCE) 77 #define BLK_TA_BOUNCE (__BLK_TA_BOUNCE)
78 #define BLK_TA_REMAP (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE)) 78 #define BLK_TA_REMAP (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
79 79
80 #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) 80 #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY))
81 #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) 81 #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY))
82 82
83 #define BLK_IO_TRACE_MAGIC 0x65617400 83 #define BLK_IO_TRACE_MAGIC 0x65617400
84 #define BLK_IO_TRACE_VERSION 0x07 84 #define BLK_IO_TRACE_VERSION 0x07
85 85
86 /* 86 /*
87 * The trace itself 87 * The trace itself
88 */ 88 */
89 struct blk_io_trace { 89 struct blk_io_trace {
90 u32 magic; /* MAGIC << 8 | version */ 90 u32 magic; /* MAGIC << 8 | version */
91 u32 sequence; /* event number */ 91 u32 sequence; /* event number */
92 u64 time; /* in microseconds */ 92 u64 time; /* in microseconds */
93 u64 sector; /* disk offset */ 93 u64 sector; /* disk offset */
94 u32 bytes; /* transfer length */ 94 u32 bytes; /* transfer length */
95 u32 action; /* what happened */ 95 u32 action; /* what happened */
96 u32 pid; /* who did it */ 96 u32 pid; /* who did it */
97 u32 device; /* device number */ 97 u32 device; /* device number */
98 u32 cpu; /* on what cpu did it happen */ 98 u32 cpu; /* on what cpu did it happen */
99 u16 error; /* completion error */ 99 u16 error; /* completion error */
100 u16 pdu_len; /* length of data after this trace */ 100 u16 pdu_len; /* length of data after this trace */
101 }; 101 };
102 102
103 /* 103 /*
104 * The remap event 104 * The remap event
105 */ 105 */
106 struct blk_io_trace_remap { 106 struct blk_io_trace_remap {
107 __be32 device; 107 __be32 device;
108 u32 __pad; 108 __be32 device_from;
109 __be64 sector; 109 __be64 sector;
110 }; 110 };
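
With __pad replaced by device_from, the remap payload now carries both device numbers (big-endian) plus the target sector; the source sector travels in the enclosing blk_io_trace record's own sector field, as blk_add_trace_remap() below shows. The following is a minimal user-space decoding sketch, assuming glibc's <endian.h> for be32toh()/be64toh() and the kernel-internal dev_t packing (12-bit major, 20-bit minor); struct remap_pdu and decode_remap() are illustrative names, not part of the exported ABI.

/*
 * Sketch only (user space): decode the payload that follows a
 * struct blk_io_trace whose action is a remap.  The layout mirrors
 * struct blk_io_trace_remap above; all three fields are big-endian.
 */
#include <stdint.h>
#include <stdio.h>
#include <endian.h>

struct remap_pdu {
        uint32_t device;        /* the dev argument to blk_add_trace_remap() */
        uint32_t device_from;   /* filled from bio->bi_bdev by blk_add_trace_remap() */
        uint64_t sector;        /* the 'to' sector */
};

static void decode_remap(const struct remap_pdu *r)
{
        uint32_t dev      = be32toh(r->device);
        uint32_t dev_from = be32toh(r->device_from);

        /* Assumes kernel dev_t packing: major = dev >> 20, minor = low 20 bits. */
        printf("remap: device %u,%u device_from %u,%u sector %llu\n",
               dev >> 20, dev & ((1U << 20) - 1),
               dev_from >> 20, dev_from & ((1U << 20) - 1),
               (unsigned long long)be64toh(r->sector));
}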
111 111
112 enum { 112 enum {
113 Blktrace_setup = 1, 113 Blktrace_setup = 1,
114 Blktrace_running, 114 Blktrace_running,
115 Blktrace_stopped, 115 Blktrace_stopped,
116 }; 116 };
117 117
118 struct blk_trace { 118 struct blk_trace {
119 int trace_state; 119 int trace_state;
120 struct rchan *rchan; 120 struct rchan *rchan;
121 unsigned long *sequence; 121 unsigned long *sequence;
122 u16 act_mask; 122 u16 act_mask;
123 u64 start_lba; 123 u64 start_lba;
124 u64 end_lba; 124 u64 end_lba;
125 u32 pid; 125 u32 pid;
126 u32 dev; 126 u32 dev;
127 struct dentry *dir; 127 struct dentry *dir;
128 struct dentry *dropped_file; 128 struct dentry *dropped_file;
129 atomic_t dropped; 129 atomic_t dropped;
130 }; 130 };
131 131
132 /* 132 /*
133 * User setup structure passed with BLKTRACESTART 133 * User setup structure passed with BLKTRACESTART
134 */ 134 */
135 struct blk_user_trace_setup { 135 struct blk_user_trace_setup {
136 char name[BDEVNAME_SIZE]; /* output */ 136 char name[BDEVNAME_SIZE]; /* output */
137 u16 act_mask; /* input */ 137 u16 act_mask; /* input */
138 u32 buf_size; /* input */ 138 u32 buf_size; /* input */
139 u32 buf_nr; /* input */ 139 u32 buf_nr; /* input */
140 u64 start_lba; 140 u64 start_lba;
141 u64 end_lba; 141 u64 end_lba;
142 u32 pid; 142 u32 pid;
143 }; 143 };
144 144
145 #if defined(CONFIG_BLK_DEV_IO_TRACE) 145 #if defined(CONFIG_BLK_DEV_IO_TRACE)
146 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); 146 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
147 extern void blk_trace_shutdown(struct request_queue *); 147 extern void blk_trace_shutdown(struct request_queue *);
148 extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *); 148 extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
149 149
150 /** 150 /**
151 * blk_add_trace_rq - Add a trace for a request oriented action 151 * blk_add_trace_rq - Add a trace for a request oriented action
152 * @q: queue the io is for 152 * @q: queue the io is for
153 * @rq: the source request 153 * @rq: the source request
154 * @what: the action 154 * @what: the action
155 * 155 *
156 * Description: 156 * Description:
157 * Records an action against a request. Will log the bio offset + size. 157 * Records an action against a request. Will log the bio offset + size.
158 * 158 *
159 **/ 159 **/
160 static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq, 160 static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
161 u32 what) 161 u32 what)
162 { 162 {
163 struct blk_trace *bt = q->blk_trace; 163 struct blk_trace *bt = q->blk_trace;
164 int rw = rq->cmd_flags & 0x03; 164 int rw = rq->cmd_flags & 0x03;
165 165
166 if (likely(!bt)) 166 if (likely(!bt))
167 return; 167 return;
168 168
169 if (blk_pc_request(rq)) { 169 if (blk_pc_request(rq)) {
170 what |= BLK_TC_ACT(BLK_TC_PC); 170 what |= BLK_TC_ACT(BLK_TC_PC);
171 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd); 171 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
172 } else { 172 } else {
173 what |= BLK_TC_ACT(BLK_TC_FS); 173 what |= BLK_TC_ACT(BLK_TC_FS);
174 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL); 174 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
175 } 175 }
176 } 176 }
177 177
178 /** 178 /**
179 * blk_add_trace_bio - Add a trace for a bio oriented action 179 * blk_add_trace_bio - Add a trace for a bio oriented action
180 * @q: queue the io is for 180 * @q: queue the io is for
181 * @bio: the source bio 181 * @bio: the source bio
182 * @what: the action 182 * @what: the action
183 * 183 *
184 * Description: 184 * Description:
185 * Records an action against a bio. Will log the bio offset + size. 185 * Records an action against a bio. Will log the bio offset + size.
186 * 186 *
187 **/ 187 **/
188 static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio, 188 static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
189 u32 what) 189 u32 what)
190 { 190 {
191 struct blk_trace *bt = q->blk_trace; 191 struct blk_trace *bt = q->blk_trace;
192 192
193 if (likely(!bt)) 193 if (likely(!bt))
194 return; 194 return;
195 195
196 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 196 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
197 } 197 }
198 198
199 /** 199 /**
200 * blk_add_trace_generic - Add a trace for a generic action 200 * blk_add_trace_generic - Add a trace for a generic action
201 * @q: queue the io is for 201 * @q: queue the io is for
202 * @bio: the source bio 202 * @bio: the source bio
203 * @rw: the data direction 203 * @rw: the data direction
204 * @what: the action 204 * @what: the action
205 * 205 *
206 * Description: 206 * Description:
207 * Records a simple trace 207 * Records a simple trace
208 * 208 *
209 **/ 209 **/
210 static inline void blk_add_trace_generic(struct request_queue *q, 210 static inline void blk_add_trace_generic(struct request_queue *q,
211 struct bio *bio, int rw, u32 what) 211 struct bio *bio, int rw, u32 what)
212 { 212 {
213 struct blk_trace *bt = q->blk_trace; 213 struct blk_trace *bt = q->blk_trace;
214 214
215 if (likely(!bt)) 215 if (likely(!bt))
216 return; 216 return;
217 217
218 if (bio) 218 if (bio)
219 blk_add_trace_bio(q, bio, what); 219 blk_add_trace_bio(q, bio, what);
220 else 220 else
221 __blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL); 221 __blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
222 } 222 }
223 223
224 /** 224 /**
225 * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload 225 * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
226 * @q: queue the io is for 226 * @q: queue the io is for
227 * @what: the action 227 * @what: the action
228 * @bio: the source bio 228 * @bio: the source bio
229 * @pdu: the integer payload 229 * @pdu: the integer payload
230 * 230 *
231 * Description: 231 * Description:
232 * Adds a trace with some integer payload. This might be an unplug 232 * Adds a trace with some integer payload. This might be an unplug
233 * option given as the action, with the depth at unplug time given 233 * option given as the action, with the depth at unplug time given
234 * as the payload 234 * as the payload
235 * 235 *
236 **/ 236 **/
237 static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what, 237 static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
238 struct bio *bio, unsigned int pdu) 238 struct bio *bio, unsigned int pdu)
239 { 239 {
240 struct blk_trace *bt = q->blk_trace; 240 struct blk_trace *bt = q->blk_trace;
241 __be64 rpdu = cpu_to_be64(pdu); 241 __be64 rpdu = cpu_to_be64(pdu);
242 242
243 if (likely(!bt)) 243 if (likely(!bt))
244 return; 244 return;
245 245
246 if (bio) 246 if (bio)
247 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu); 247 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
248 else 248 else
249 __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); 249 __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
250 } 250 }
251 251
252 /** 252 /**
253 * blk_add_trace_remap - Add a trace for a remap operation 253 * blk_add_trace_remap - Add a trace for a remap operation
254 * @q: queue the io is for 254 * @q: queue the io is for
255 * @bio: the source bio 255 * @bio: the source bio
256 * @dev: target device 256 * @dev: target device
257 * @from: source sector 257 * @from: source sector
258 * @to: target sector 258 * @to: target sector
259 * 259 *
260 * Description: 260 * Description:
261 * Device mapper or raid targets sometimes need to split a bio because 261 * Device mapper or raid targets sometimes need to split a bio because
262 * it spans a stripe (or similar). Add a trace for that action. 262 * it spans a stripe (or similar). Add a trace for that action.
263 * 263 *
264 **/ 264 **/
265 static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio, 265 static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
266 dev_t dev, sector_t from, sector_t to) 266 dev_t dev, sector_t from, sector_t to)
267 { 267 {
268 struct blk_trace *bt = q->blk_trace; 268 struct blk_trace *bt = q->blk_trace;
269 struct blk_io_trace_remap r; 269 struct blk_io_trace_remap r;
270 270
271 if (likely(!bt)) 271 if (likely(!bt))
272 return; 272 return;
273 273
274 r.device = cpu_to_be32(dev); 274 r.device = cpu_to_be32(dev);
275 r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
275 r.sector = cpu_to_be64(to); 276 r.sector = cpu_to_be64(to);
276 277
277 __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); 278 __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
278 } 279 }
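
For illustration only, here is a sketch of how a stacking driver that has just redirected a cloned bio onto a lower device might call this helper so that both device numbers reach the payload: 'device' comes from the dev argument and 'device_from' from the clone's current bi_bdev, as the function body above shows. The names example_trace_stacked_remap, orig_bio, clone and orig_sector are hypothetical; the real call sites are in the block core and device-mapper code, outside this header.

/*
 * Sketch only: 'clone' now points at a lower device, 'orig_bio' is the bio
 * as submitted to the upper device, 'orig_sector' its original sector.
 */
static inline void example_trace_stacked_remap(struct bio *orig_bio,
                                               struct bio *clone,
                                               sector_t orig_sector)
{
        blk_add_trace_remap(bdev_get_queue(orig_bio->bi_bdev), clone,
                            orig_bio->bi_bdev->bd_dev,
                            orig_sector, clone->bi_sector);
}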
279 280
280 #else /* !CONFIG_BLK_DEV_IO_TRACE */ 281 #else /* !CONFIG_BLK_DEV_IO_TRACE */
281 #define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) 282 #define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY)
282 #define blk_trace_shutdown(q) do { } while (0) 283 #define blk_trace_shutdown(q) do { } while (0)
283 #define blk_add_trace_rq(q, rq, what) do { } while (0) 284 #define blk_add_trace_rq(q, rq, what) do { } while (0)
284 #define blk_add_trace_bio(q, rq, what) do { } while (0) 285 #define blk_add_trace_bio(q, rq, what) do { } while (0)
285 #define blk_add_trace_generic(q, rq, rw, what) do { } while (0) 286 #define blk_add_trace_generic(q, rq, rw, what) do { } while (0)
286 #define blk_add_trace_pdu_int(q, what, bio, pdu) do { } while (0) 287 #define blk_add_trace_pdu_int(q, what, bio, pdu) do { } while (0)
287 #define blk_add_trace_remap(q, bio, dev, f, t) do {} while (0) 288 #define blk_add_trace_remap(q, bio, dev, f, t) do {} while (0)
288 #endif /* CONFIG_BLK_DEV_IO_TRACE */ 289 #endif /* CONFIG_BLK_DEV_IO_TRACE */
289 290
290 #endif 291 #endif
291 292