Commit c7149d6bce2561aeaa48caaa1700aa8b3b22008f
Committed by Jens Axboe
1 parent ec05b297f9
Fix remap handling by blktrace
This patch provides more information concerning REMAP operations on block IOs. The additional information gives clearer details at the user level and supports post-processing analysis in btt.

o Adds in partition remaps on the same device.
o Fixes up the remap information in DM to be in the right order.
o Sends up mapped-from and mapped-to device information.

Signed-off-by: Alan D. Brunelle <alan.brunelle@hp.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
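For orientation, the sketch below illustrates the kind of call this patch is concerned with: when a bio submitted to a partition is remapped onto the whole-disk device, a REMAP event is reported to blktrace together with the mapped-from device and sector, so that user-level tools such as btt can follow the IO across the remap. The helper name example_partition_remap() and the exact blk_add_trace_remap() argument order shown here are assumptions for illustration, not the literal contents of this commit.

/*
 * Illustrative sketch only: the surrounding code and the
 * blk_add_trace_remap() argument order are assumed for the example,
 * not quoted from the patch.
 */
static inline void example_partition_remap(struct bio *bio)
{
        struct block_device *bdev = bio->bi_bdev;

        if (bdev != bdev->bd_contains) {
                struct hd_struct *p = bdev->bd_part;

                /* Remap the bio from the partition onto the whole disk. */
                bio->bi_sector += p->start_sect;
                bio->bi_bdev = bdev->bd_contains;

                /*
                 * Emit a REMAP trace event carrying the mapped-from device
                 * (the partition) and its original sector, so user-level
                 * tools can reconstruct where the IO came from.
                 */
                blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
                                    bdev->bd_dev, bio->bi_sector,
                                    bio->bi_sector - p->start_sect);
        }
}

DM targets emit an analogous event when they map a bio from the mapped device onto an underlying device, which is where the argument-order fix mentioned above applies.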
Showing 3 changed files with 8 additions and 3 deletions Inline Diff
block/ll_rw_blk.c
1 | /* | 1 | /* |
2 | * Copyright (C) 1991, 1992 Linus Torvalds | 2 | * Copyright (C) 1991, 1992 Linus Torvalds |
3 | * Copyright (C) 1994, Karl Keyte: Added support for disk statistics | 3 | * Copyright (C) 1994, Karl Keyte: Added support for disk statistics |
4 | * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE | 4 | * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE |
5 | * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> | 5 | * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> |
6 | * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000 | 6 | * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000 |
7 | * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 | 7 | * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 |
8 | */ | 8 | */ |
9 | 9 | ||
10 | /* | 10 | /* |
11 | * This handles all read/write requests to block devices | 11 | * This handles all read/write requests to block devices |
12 | */ | 12 | */ |
13 | #include <linux/kernel.h> | 13 | #include <linux/kernel.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/backing-dev.h> | 15 | #include <linux/backing-dev.h> |
16 | #include <linux/bio.h> | 16 | #include <linux/bio.h> |
17 | #include <linux/blkdev.h> | 17 | #include <linux/blkdev.h> |
18 | #include <linux/highmem.h> | 18 | #include <linux/highmem.h> |
19 | #include <linux/mm.h> | 19 | #include <linux/mm.h> |
20 | #include <linux/kernel_stat.h> | 20 | #include <linux/kernel_stat.h> |
21 | #include <linux/string.h> | 21 | #include <linux/string.h> |
22 | #include <linux/init.h> | 22 | #include <linux/init.h> |
23 | #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ | 23 | #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ |
24 | #include <linux/completion.h> | 24 | #include <linux/completion.h> |
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <linux/swap.h> | 26 | #include <linux/swap.h> |
27 | #include <linux/writeback.h> | 27 | #include <linux/writeback.h> |
28 | #include <linux/task_io_accounting_ops.h> | 28 | #include <linux/task_io_accounting_ops.h> |
29 | #include <linux/interrupt.h> | 29 | #include <linux/interrupt.h> |
30 | #include <linux/cpu.h> | 30 | #include <linux/cpu.h> |
31 | #include <linux/blktrace_api.h> | 31 | #include <linux/blktrace_api.h> |
32 | #include <linux/fault-inject.h> | 32 | #include <linux/fault-inject.h> |
33 | 33 | ||
34 | /* | 34 | /* |
35 | * for max sense size | 35 | * for max sense size |
36 | */ | 36 | */ |
37 | #include <scsi/scsi_cmnd.h> | 37 | #include <scsi/scsi_cmnd.h> |
38 | 38 | ||
39 | static void blk_unplug_work(struct work_struct *work); | 39 | static void blk_unplug_work(struct work_struct *work); |
40 | static void blk_unplug_timeout(unsigned long data); | 40 | static void blk_unplug_timeout(unsigned long data); |
41 | static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io); | 41 | static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io); |
42 | static void init_request_from_bio(struct request *req, struct bio *bio); | 42 | static void init_request_from_bio(struct request *req, struct bio *bio); |
43 | static int __make_request(struct request_queue *q, struct bio *bio); | 43 | static int __make_request(struct request_queue *q, struct bio *bio); |
44 | static struct io_context *current_io_context(gfp_t gfp_flags, int node); | 44 | static struct io_context *current_io_context(gfp_t gfp_flags, int node); |
45 | 45 | ||
46 | /* | 46 | /* |
47 | * For the allocated request tables | 47 | * For the allocated request tables |
48 | */ | 48 | */ |
49 | static struct kmem_cache *request_cachep; | 49 | static struct kmem_cache *request_cachep; |
50 | 50 | ||
51 | /* | 51 | /* |
52 | * For queue allocation | 52 | * For queue allocation |
53 | */ | 53 | */ |
54 | static struct kmem_cache *requestq_cachep; | 54 | static struct kmem_cache *requestq_cachep; |
55 | 55 | ||
56 | /* | 56 | /* |
57 | * For io context allocations | 57 | * For io context allocations |
58 | */ | 58 | */ |
59 | static struct kmem_cache *iocontext_cachep; | 59 | static struct kmem_cache *iocontext_cachep; |
60 | 60 | ||
61 | /* | 61 | /* |
62 | * Controlling structure to kblockd | 62 | * Controlling structure to kblockd |
63 | */ | 63 | */ |
64 | static struct workqueue_struct *kblockd_workqueue; | 64 | static struct workqueue_struct *kblockd_workqueue; |
65 | 65 | ||
66 | unsigned long blk_max_low_pfn, blk_max_pfn; | 66 | unsigned long blk_max_low_pfn, blk_max_pfn; |
67 | 67 | ||
68 | EXPORT_SYMBOL(blk_max_low_pfn); | 68 | EXPORT_SYMBOL(blk_max_low_pfn); |
69 | EXPORT_SYMBOL(blk_max_pfn); | 69 | EXPORT_SYMBOL(blk_max_pfn); |
70 | 70 | ||
71 | static DEFINE_PER_CPU(struct list_head, blk_cpu_done); | 71 | static DEFINE_PER_CPU(struct list_head, blk_cpu_done); |
72 | 72 | ||
73 | /* Amount of time in which a process may batch requests */ | 73 | /* Amount of time in which a process may batch requests */ |
74 | #define BLK_BATCH_TIME (HZ/50UL) | 74 | #define BLK_BATCH_TIME (HZ/50UL) |
75 | 75 | ||
76 | /* Number of requests a "batching" process may submit */ | 76 | /* Number of requests a "batching" process may submit */ |
77 | #define BLK_BATCH_REQ 32 | 77 | #define BLK_BATCH_REQ 32 |
78 | 78 | ||
79 | /* | 79 | /* |
80 | * Return the threshold (number of used requests) at which the queue is | 80 | * Return the threshold (number of used requests) at which the queue is |
81 | * considered to be congested. It include a little hysteresis to keep the | 81 | * considered to be congested. It include a little hysteresis to keep the |
82 | * context switch rate down. | 82 | * context switch rate down. |
83 | */ | 83 | */ |
84 | static inline int queue_congestion_on_threshold(struct request_queue *q) | 84 | static inline int queue_congestion_on_threshold(struct request_queue *q) |
85 | { | 85 | { |
86 | return q->nr_congestion_on; | 86 | return q->nr_congestion_on; |
87 | } | 87 | } |
88 | 88 | ||
89 | /* | 89 | /* |
90 | * The threshold at which a queue is considered to be uncongested | 90 | * The threshold at which a queue is considered to be uncongested |
91 | */ | 91 | */ |
92 | static inline int queue_congestion_off_threshold(struct request_queue *q) | 92 | static inline int queue_congestion_off_threshold(struct request_queue *q) |
93 | { | 93 | { |
94 | return q->nr_congestion_off; | 94 | return q->nr_congestion_off; |
95 | } | 95 | } |
96 | 96 | ||
97 | static void blk_queue_congestion_threshold(struct request_queue *q) | 97 | static void blk_queue_congestion_threshold(struct request_queue *q) |
98 | { | 98 | { |
99 | int nr; | 99 | int nr; |
100 | 100 | ||
101 | nr = q->nr_requests - (q->nr_requests / 8) + 1; | 101 | nr = q->nr_requests - (q->nr_requests / 8) + 1; |
102 | if (nr > q->nr_requests) | 102 | if (nr > q->nr_requests) |
103 | nr = q->nr_requests; | 103 | nr = q->nr_requests; |
104 | q->nr_congestion_on = nr; | 104 | q->nr_congestion_on = nr; |
105 | 105 | ||
106 | nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; | 106 | nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; |
107 | if (nr < 1) | 107 | if (nr < 1) |
108 | nr = 1; | 108 | nr = 1; |
109 | q->nr_congestion_off = nr; | 109 | q->nr_congestion_off = nr; |
110 | } | 110 | } |
111 | 111 | ||
112 | /** | 112 | /** |
113 | * blk_get_backing_dev_info - get the address of a queue's backing_dev_info | 113 | * blk_get_backing_dev_info - get the address of a queue's backing_dev_info |
114 | * @bdev: device | 114 | * @bdev: device |
115 | * | 115 | * |
116 | * Locates the passed device's request queue and returns the address of its | 116 | * Locates the passed device's request queue and returns the address of its |
117 | * backing_dev_info | 117 | * backing_dev_info |
118 | * | 118 | * |
119 | * Will return NULL if the request queue cannot be located. | 119 | * Will return NULL if the request queue cannot be located. |
120 | */ | 120 | */ |
121 | struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) | 121 | struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) |
122 | { | 122 | { |
123 | struct backing_dev_info *ret = NULL; | 123 | struct backing_dev_info *ret = NULL; |
124 | struct request_queue *q = bdev_get_queue(bdev); | 124 | struct request_queue *q = bdev_get_queue(bdev); |
125 | 125 | ||
126 | if (q) | 126 | if (q) |
127 | ret = &q->backing_dev_info; | 127 | ret = &q->backing_dev_info; |
128 | return ret; | 128 | return ret; |
129 | } | 129 | } |
130 | EXPORT_SYMBOL(blk_get_backing_dev_info); | 130 | EXPORT_SYMBOL(blk_get_backing_dev_info); |
131 | 131 | ||
132 | /** | 132 | /** |
133 | * blk_queue_prep_rq - set a prepare_request function for queue | 133 | * blk_queue_prep_rq - set a prepare_request function for queue |
134 | * @q: queue | 134 | * @q: queue |
135 | * @pfn: prepare_request function | 135 | * @pfn: prepare_request function |
136 | * | 136 | * |
137 | * It's possible for a queue to register a prepare_request callback which | 137 | * It's possible for a queue to register a prepare_request callback which |
138 | * is invoked before the request is handed to the request_fn. The goal of | 138 | * is invoked before the request is handed to the request_fn. The goal of |
139 | * the function is to prepare a request for I/O, it can be used to build a | 139 | * the function is to prepare a request for I/O, it can be used to build a |
140 | * cdb from the request data for instance. | 140 | * cdb from the request data for instance. |
141 | * | 141 | * |
142 | */ | 142 | */ |
143 | void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) | 143 | void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) |
144 | { | 144 | { |
145 | q->prep_rq_fn = pfn; | 145 | q->prep_rq_fn = pfn; |
146 | } | 146 | } |
147 | 147 | ||
148 | EXPORT_SYMBOL(blk_queue_prep_rq); | 148 | EXPORT_SYMBOL(blk_queue_prep_rq); |
149 | 149 | ||
150 | /** | 150 | /** |
151 | * blk_queue_merge_bvec - set a merge_bvec function for queue | 151 | * blk_queue_merge_bvec - set a merge_bvec function for queue |
152 | * @q: queue | 152 | * @q: queue |
153 | * @mbfn: merge_bvec_fn | 153 | * @mbfn: merge_bvec_fn |
154 | * | 154 | * |
155 | * Usually queues have static limitations on the max sectors or segments that | 155 | * Usually queues have static limitations on the max sectors or segments that |
156 | * we can put in a request. Stacking drivers may have some settings that | 156 | * we can put in a request. Stacking drivers may have some settings that |
157 | * are dynamic, and thus we have to query the queue whether it is ok to | 157 | * are dynamic, and thus we have to query the queue whether it is ok to |
158 | * add a new bio_vec to a bio at a given offset or not. If the block device | 158 | * add a new bio_vec to a bio at a given offset or not. If the block device |
159 | * has such limitations, it needs to register a merge_bvec_fn to control | 159 | * has such limitations, it needs to register a merge_bvec_fn to control |
160 | * the size of bio's sent to it. Note that a block device *must* allow a | 160 | * the size of bio's sent to it. Note that a block device *must* allow a |
161 | * single page to be added to an empty bio. The block device driver may want | 161 | * single page to be added to an empty bio. The block device driver may want |
162 | * to use the bio_split() function to deal with these bio's. By default | 162 | * to use the bio_split() function to deal with these bio's. By default |
163 | * no merge_bvec_fn is defined for a queue, and only the fixed limits are | 163 | * no merge_bvec_fn is defined for a queue, and only the fixed limits are |
164 | * honored. | 164 | * honored. |
165 | */ | 165 | */ |
166 | void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) | 166 | void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) |
167 | { | 167 | { |
168 | q->merge_bvec_fn = mbfn; | 168 | q->merge_bvec_fn = mbfn; |
169 | } | 169 | } |
170 | 170 | ||
171 | EXPORT_SYMBOL(blk_queue_merge_bvec); | 171 | EXPORT_SYMBOL(blk_queue_merge_bvec); |
172 | 172 | ||
173 | void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) | 173 | void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) |
174 | { | 174 | { |
175 | q->softirq_done_fn = fn; | 175 | q->softirq_done_fn = fn; |
176 | } | 176 | } |
177 | 177 | ||
178 | EXPORT_SYMBOL(blk_queue_softirq_done); | 178 | EXPORT_SYMBOL(blk_queue_softirq_done); |
179 | 179 | ||
180 | /** | 180 | /** |
181 | * blk_queue_make_request - define an alternate make_request function for a device | 181 | * blk_queue_make_request - define an alternate make_request function for a device |
182 | * @q: the request queue for the device to be affected | 182 | * @q: the request queue for the device to be affected |
183 | * @mfn: the alternate make_request function | 183 | * @mfn: the alternate make_request function |
184 | * | 184 | * |
185 | * Description: | 185 | * Description: |
186 | * The normal way for &struct bios to be passed to a device | 186 | * The normal way for &struct bios to be passed to a device |
187 | * driver is for them to be collected into requests on a request | 187 | * driver is for them to be collected into requests on a request |
188 | * queue, and then to allow the device driver to select requests | 188 | * queue, and then to allow the device driver to select requests |
189 | * off that queue when it is ready. This works well for many block | 189 | * off that queue when it is ready. This works well for many block |
190 | * devices. However some block devices (typically virtual devices | 190 | * devices. However some block devices (typically virtual devices |
191 | * such as md or lvm) do not benefit from the processing on the | 191 | * such as md or lvm) do not benefit from the processing on the |
192 | * request queue, and are served best by having the requests passed | 192 | * request queue, and are served best by having the requests passed |
193 | * directly to them. This can be achieved by providing a function | 193 | * directly to them. This can be achieved by providing a function |
194 | * to blk_queue_make_request(). | 194 | * to blk_queue_make_request(). |
195 | * | 195 | * |
196 | * Caveat: | 196 | * Caveat: |
197 | * The driver that does this *must* be able to deal appropriately | 197 | * The driver that does this *must* be able to deal appropriately |
198 | * with buffers in "highmemory". This can be accomplished by either calling | 198 | * with buffers in "highmemory". This can be accomplished by either calling |
199 | * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling | 199 | * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling |
200 | * blk_queue_bounce() to create a buffer in normal memory. | 200 | * blk_queue_bounce() to create a buffer in normal memory. |
201 | **/ | 201 | **/ |
202 | void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn) | 202 | void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn) |
203 | { | 203 | { |
204 | /* | 204 | /* |
205 | * set defaults | 205 | * set defaults |
206 | */ | 206 | */ |
207 | q->nr_requests = BLKDEV_MAX_RQ; | 207 | q->nr_requests = BLKDEV_MAX_RQ; |
208 | blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); | 208 | blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); |
209 | blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); | 209 | blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); |
210 | q->make_request_fn = mfn; | 210 | q->make_request_fn = mfn; |
211 | q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; | 211 | q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; |
212 | q->backing_dev_info.state = 0; | 212 | q->backing_dev_info.state = 0; |
213 | q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; | 213 | q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; |
214 | blk_queue_max_sectors(q, SAFE_MAX_SECTORS); | 214 | blk_queue_max_sectors(q, SAFE_MAX_SECTORS); |
215 | blk_queue_hardsect_size(q, 512); | 215 | blk_queue_hardsect_size(q, 512); |
216 | blk_queue_dma_alignment(q, 511); | 216 | blk_queue_dma_alignment(q, 511); |
217 | blk_queue_congestion_threshold(q); | 217 | blk_queue_congestion_threshold(q); |
218 | q->nr_batching = BLK_BATCH_REQ; | 218 | q->nr_batching = BLK_BATCH_REQ; |
219 | 219 | ||
220 | q->unplug_thresh = 4; /* hmm */ | 220 | q->unplug_thresh = 4; /* hmm */ |
221 | q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ | 221 | q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ |
222 | if (q->unplug_delay == 0) | 222 | if (q->unplug_delay == 0) |
223 | q->unplug_delay = 1; | 223 | q->unplug_delay = 1; |
224 | 224 | ||
225 | INIT_WORK(&q->unplug_work, blk_unplug_work); | 225 | INIT_WORK(&q->unplug_work, blk_unplug_work); |
226 | 226 | ||
227 | q->unplug_timer.function = blk_unplug_timeout; | 227 | q->unplug_timer.function = blk_unplug_timeout; |
228 | q->unplug_timer.data = (unsigned long)q; | 228 | q->unplug_timer.data = (unsigned long)q; |
229 | 229 | ||
230 | /* | 230 | /* |
231 | * by default assume old behaviour and bounce for any highmem page | 231 | * by default assume old behaviour and bounce for any highmem page |
232 | */ | 232 | */ |
233 | blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); | 233 | blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); |
234 | } | 234 | } |
235 | 235 | ||
236 | EXPORT_SYMBOL(blk_queue_make_request); | 236 | EXPORT_SYMBOL(blk_queue_make_request); |
237 | 237 | ||
238 | static void rq_init(struct request_queue *q, struct request *rq) | 238 | static void rq_init(struct request_queue *q, struct request *rq) |
239 | { | 239 | { |
240 | INIT_LIST_HEAD(&rq->queuelist); | 240 | INIT_LIST_HEAD(&rq->queuelist); |
241 | INIT_LIST_HEAD(&rq->donelist); | 241 | INIT_LIST_HEAD(&rq->donelist); |
242 | 242 | ||
243 | rq->errors = 0; | 243 | rq->errors = 0; |
244 | rq->bio = rq->biotail = NULL; | 244 | rq->bio = rq->biotail = NULL; |
245 | INIT_HLIST_NODE(&rq->hash); | 245 | INIT_HLIST_NODE(&rq->hash); |
246 | RB_CLEAR_NODE(&rq->rb_node); | 246 | RB_CLEAR_NODE(&rq->rb_node); |
247 | rq->ioprio = 0; | 247 | rq->ioprio = 0; |
248 | rq->buffer = NULL; | 248 | rq->buffer = NULL; |
249 | rq->ref_count = 1; | 249 | rq->ref_count = 1; |
250 | rq->q = q; | 250 | rq->q = q; |
251 | rq->special = NULL; | 251 | rq->special = NULL; |
252 | rq->data_len = 0; | 252 | rq->data_len = 0; |
253 | rq->data = NULL; | 253 | rq->data = NULL; |
254 | rq->nr_phys_segments = 0; | 254 | rq->nr_phys_segments = 0; |
255 | rq->sense = NULL; | 255 | rq->sense = NULL; |
256 | rq->end_io = NULL; | 256 | rq->end_io = NULL; |
257 | rq->end_io_data = NULL; | 257 | rq->end_io_data = NULL; |
258 | rq->completion_data = NULL; | 258 | rq->completion_data = NULL; |
259 | rq->next_rq = NULL; | 259 | rq->next_rq = NULL; |
260 | } | 260 | } |
261 | 261 | ||
262 | /** | 262 | /** |
263 | * blk_queue_ordered - does this queue support ordered writes | 263 | * blk_queue_ordered - does this queue support ordered writes |
264 | * @q: the request queue | 264 | * @q: the request queue |
265 | * @ordered: one of QUEUE_ORDERED_* | 265 | * @ordered: one of QUEUE_ORDERED_* |
266 | * @prepare_flush_fn: rq setup helper for cache flush ordered writes | 266 | * @prepare_flush_fn: rq setup helper for cache flush ordered writes |
267 | * | 267 | * |
268 | * Description: | 268 | * Description: |
269 | * For journalled file systems, doing ordered writes on a commit | 269 | * For journalled file systems, doing ordered writes on a commit |
270 | * block instead of explicitly doing wait_on_buffer (which is bad | 270 | * block instead of explicitly doing wait_on_buffer (which is bad |
271 | * for performance) can be a big win. Block drivers supporting this | 271 | * for performance) can be a big win. Block drivers supporting this |
272 | * feature should call this function and indicate so. | 272 | * feature should call this function and indicate so. |
273 | * | 273 | * |
274 | **/ | 274 | **/ |
275 | int blk_queue_ordered(struct request_queue *q, unsigned ordered, | 275 | int blk_queue_ordered(struct request_queue *q, unsigned ordered, |
276 | prepare_flush_fn *prepare_flush_fn) | 276 | prepare_flush_fn *prepare_flush_fn) |
277 | { | 277 | { |
278 | if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) && | 278 | if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) && |
279 | prepare_flush_fn == NULL) { | 279 | prepare_flush_fn == NULL) { |
280 | printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n"); | 280 | printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n"); |
281 | return -EINVAL; | 281 | return -EINVAL; |
282 | } | 282 | } |
283 | 283 | ||
284 | if (ordered != QUEUE_ORDERED_NONE && | 284 | if (ordered != QUEUE_ORDERED_NONE && |
285 | ordered != QUEUE_ORDERED_DRAIN && | 285 | ordered != QUEUE_ORDERED_DRAIN && |
286 | ordered != QUEUE_ORDERED_DRAIN_FLUSH && | 286 | ordered != QUEUE_ORDERED_DRAIN_FLUSH && |
287 | ordered != QUEUE_ORDERED_DRAIN_FUA && | 287 | ordered != QUEUE_ORDERED_DRAIN_FUA && |
288 | ordered != QUEUE_ORDERED_TAG && | 288 | ordered != QUEUE_ORDERED_TAG && |
289 | ordered != QUEUE_ORDERED_TAG_FLUSH && | 289 | ordered != QUEUE_ORDERED_TAG_FLUSH && |
290 | ordered != QUEUE_ORDERED_TAG_FUA) { | 290 | ordered != QUEUE_ORDERED_TAG_FUA) { |
291 | printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); | 291 | printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); |
292 | return -EINVAL; | 292 | return -EINVAL; |
293 | } | 293 | } |
294 | 294 | ||
295 | q->ordered = ordered; | 295 | q->ordered = ordered; |
296 | q->next_ordered = ordered; | 296 | q->next_ordered = ordered; |
297 | q->prepare_flush_fn = prepare_flush_fn; | 297 | q->prepare_flush_fn = prepare_flush_fn; |
298 | 298 | ||
299 | return 0; | 299 | return 0; |
300 | } | 300 | } |
301 | 301 | ||
302 | EXPORT_SYMBOL(blk_queue_ordered); | 302 | EXPORT_SYMBOL(blk_queue_ordered); |
303 | 303 | ||
304 | /** | 304 | /** |
305 | * blk_queue_issue_flush_fn - set function for issuing a flush | 305 | * blk_queue_issue_flush_fn - set function for issuing a flush |
306 | * @q: the request queue | 306 | * @q: the request queue |
307 | * @iff: the function to be called issuing the flush | 307 | * @iff: the function to be called issuing the flush |
308 | * | 308 | * |
309 | * Description: | 309 | * Description: |
310 | * If a driver supports issuing a flush command, the support is notified | 310 | * If a driver supports issuing a flush command, the support is notified |
311 | * to the block layer by defining it through this call. | 311 | * to the block layer by defining it through this call. |
312 | * | 312 | * |
313 | **/ | 313 | **/ |
314 | void blk_queue_issue_flush_fn(struct request_queue *q, issue_flush_fn *iff) | 314 | void blk_queue_issue_flush_fn(struct request_queue *q, issue_flush_fn *iff) |
315 | { | 315 | { |
316 | q->issue_flush_fn = iff; | 316 | q->issue_flush_fn = iff; |
317 | } | 317 | } |
318 | 318 | ||
319 | EXPORT_SYMBOL(blk_queue_issue_flush_fn); | 319 | EXPORT_SYMBOL(blk_queue_issue_flush_fn); |
320 | 320 | ||
321 | /* | 321 | /* |
322 | * Cache flushing for ordered writes handling | 322 | * Cache flushing for ordered writes handling |
323 | */ | 323 | */ |
324 | inline unsigned blk_ordered_cur_seq(struct request_queue *q) | 324 | inline unsigned blk_ordered_cur_seq(struct request_queue *q) |
325 | { | 325 | { |
326 | if (!q->ordseq) | 326 | if (!q->ordseq) |
327 | return 0; | 327 | return 0; |
328 | return 1 << ffz(q->ordseq); | 328 | return 1 << ffz(q->ordseq); |
329 | } | 329 | } |
330 | 330 | ||
331 | unsigned blk_ordered_req_seq(struct request *rq) | 331 | unsigned blk_ordered_req_seq(struct request *rq) |
332 | { | 332 | { |
333 | struct request_queue *q = rq->q; | 333 | struct request_queue *q = rq->q; |
334 | 334 | ||
335 | BUG_ON(q->ordseq == 0); | 335 | BUG_ON(q->ordseq == 0); |
336 | 336 | ||
337 | if (rq == &q->pre_flush_rq) | 337 | if (rq == &q->pre_flush_rq) |
338 | return QUEUE_ORDSEQ_PREFLUSH; | 338 | return QUEUE_ORDSEQ_PREFLUSH; |
339 | if (rq == &q->bar_rq) | 339 | if (rq == &q->bar_rq) |
340 | return QUEUE_ORDSEQ_BAR; | 340 | return QUEUE_ORDSEQ_BAR; |
341 | if (rq == &q->post_flush_rq) | 341 | if (rq == &q->post_flush_rq) |
342 | return QUEUE_ORDSEQ_POSTFLUSH; | 342 | return QUEUE_ORDSEQ_POSTFLUSH; |
343 | 343 | ||
344 | /* | 344 | /* |
345 | * !fs requests don't need to follow barrier ordering. Always | 345 | * !fs requests don't need to follow barrier ordering. Always |
346 | * put them at the front. This fixes the following deadlock. | 346 | * put them at the front. This fixes the following deadlock. |
347 | * | 347 | * |
348 | * http://thread.gmane.org/gmane.linux.kernel/537473 | 348 | * http://thread.gmane.org/gmane.linux.kernel/537473 |
349 | */ | 349 | */ |
350 | if (!blk_fs_request(rq)) | 350 | if (!blk_fs_request(rq)) |
351 | return QUEUE_ORDSEQ_DRAIN; | 351 | return QUEUE_ORDSEQ_DRAIN; |
352 | 352 | ||
353 | if ((rq->cmd_flags & REQ_ORDERED_COLOR) == | 353 | if ((rq->cmd_flags & REQ_ORDERED_COLOR) == |
354 | (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) | 354 | (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) |
355 | return QUEUE_ORDSEQ_DRAIN; | 355 | return QUEUE_ORDSEQ_DRAIN; |
356 | else | 356 | else |
357 | return QUEUE_ORDSEQ_DONE; | 357 | return QUEUE_ORDSEQ_DONE; |
358 | } | 358 | } |
359 | 359 | ||
360 | void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) | 360 | void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) |
361 | { | 361 | { |
362 | struct request *rq; | 362 | struct request *rq; |
363 | int uptodate; | 363 | int uptodate; |
364 | 364 | ||
365 | if (error && !q->orderr) | 365 | if (error && !q->orderr) |
366 | q->orderr = error; | 366 | q->orderr = error; |
367 | 367 | ||
368 | BUG_ON(q->ordseq & seq); | 368 | BUG_ON(q->ordseq & seq); |
369 | q->ordseq |= seq; | 369 | q->ordseq |= seq; |
370 | 370 | ||
371 | if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) | 371 | if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) |
372 | return; | 372 | return; |
373 | 373 | ||
374 | /* | 374 | /* |
375 | * Okay, sequence complete. | 375 | * Okay, sequence complete. |
376 | */ | 376 | */ |
377 | rq = q->orig_bar_rq; | 377 | rq = q->orig_bar_rq; |
378 | uptodate = q->orderr ? q->orderr : 1; | 378 | uptodate = q->orderr ? q->orderr : 1; |
379 | 379 | ||
380 | q->ordseq = 0; | 380 | q->ordseq = 0; |
381 | 381 | ||
382 | end_that_request_first(rq, uptodate, rq->hard_nr_sectors); | 382 | end_that_request_first(rq, uptodate, rq->hard_nr_sectors); |
383 | end_that_request_last(rq, uptodate); | 383 | end_that_request_last(rq, uptodate); |
384 | } | 384 | } |
385 | 385 | ||
386 | static void pre_flush_end_io(struct request *rq, int error) | 386 | static void pre_flush_end_io(struct request *rq, int error) |
387 | { | 387 | { |
388 | elv_completed_request(rq->q, rq); | 388 | elv_completed_request(rq->q, rq); |
389 | blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); | 389 | blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); |
390 | } | 390 | } |
391 | 391 | ||
392 | static void bar_end_io(struct request *rq, int error) | 392 | static void bar_end_io(struct request *rq, int error) |
393 | { | 393 | { |
394 | elv_completed_request(rq->q, rq); | 394 | elv_completed_request(rq->q, rq); |
395 | blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); | 395 | blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); |
396 | } | 396 | } |
397 | 397 | ||
398 | static void post_flush_end_io(struct request *rq, int error) | 398 | static void post_flush_end_io(struct request *rq, int error) |
399 | { | 399 | { |
400 | elv_completed_request(rq->q, rq); | 400 | elv_completed_request(rq->q, rq); |
401 | blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); | 401 | blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); |
402 | } | 402 | } |
403 | 403 | ||
404 | static void queue_flush(struct request_queue *q, unsigned which) | 404 | static void queue_flush(struct request_queue *q, unsigned which) |
405 | { | 405 | { |
406 | struct request *rq; | 406 | struct request *rq; |
407 | rq_end_io_fn *end_io; | 407 | rq_end_io_fn *end_io; |
408 | 408 | ||
409 | if (which == QUEUE_ORDERED_PREFLUSH) { | 409 | if (which == QUEUE_ORDERED_PREFLUSH) { |
410 | rq = &q->pre_flush_rq; | 410 | rq = &q->pre_flush_rq; |
411 | end_io = pre_flush_end_io; | 411 | end_io = pre_flush_end_io; |
412 | } else { | 412 | } else { |
413 | rq = &q->post_flush_rq; | 413 | rq = &q->post_flush_rq; |
414 | end_io = post_flush_end_io; | 414 | end_io = post_flush_end_io; |
415 | } | 415 | } |
416 | 416 | ||
417 | rq->cmd_flags = REQ_HARDBARRIER; | 417 | rq->cmd_flags = REQ_HARDBARRIER; |
418 | rq_init(q, rq); | 418 | rq_init(q, rq); |
419 | rq->elevator_private = NULL; | 419 | rq->elevator_private = NULL; |
420 | rq->elevator_private2 = NULL; | 420 | rq->elevator_private2 = NULL; |
421 | rq->rq_disk = q->bar_rq.rq_disk; | 421 | rq->rq_disk = q->bar_rq.rq_disk; |
422 | rq->end_io = end_io; | 422 | rq->end_io = end_io; |
423 | q->prepare_flush_fn(q, rq); | 423 | q->prepare_flush_fn(q, rq); |
424 | 424 | ||
425 | elv_insert(q, rq, ELEVATOR_INSERT_FRONT); | 425 | elv_insert(q, rq, ELEVATOR_INSERT_FRONT); |
426 | } | 426 | } |
427 | 427 | ||
428 | static inline struct request *start_ordered(struct request_queue *q, | 428 | static inline struct request *start_ordered(struct request_queue *q, |
429 | struct request *rq) | 429 | struct request *rq) |
430 | { | 430 | { |
431 | q->bi_size = 0; | 431 | q->bi_size = 0; |
432 | q->orderr = 0; | 432 | q->orderr = 0; |
433 | q->ordered = q->next_ordered; | 433 | q->ordered = q->next_ordered; |
434 | q->ordseq |= QUEUE_ORDSEQ_STARTED; | 434 | q->ordseq |= QUEUE_ORDSEQ_STARTED; |
435 | 435 | ||
436 | /* | 436 | /* |
437 | * Prep proxy barrier request. | 437 | * Prep proxy barrier request. |
438 | */ | 438 | */ |
439 | blkdev_dequeue_request(rq); | 439 | blkdev_dequeue_request(rq); |
440 | q->orig_bar_rq = rq; | 440 | q->orig_bar_rq = rq; |
441 | rq = &q->bar_rq; | 441 | rq = &q->bar_rq; |
442 | rq->cmd_flags = 0; | 442 | rq->cmd_flags = 0; |
443 | rq_init(q, rq); | 443 | rq_init(q, rq); |
444 | if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) | 444 | if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) |
445 | rq->cmd_flags |= REQ_RW; | 445 | rq->cmd_flags |= REQ_RW; |
446 | rq->cmd_flags |= q->ordered & QUEUE_ORDERED_FUA ? REQ_FUA : 0; | 446 | rq->cmd_flags |= q->ordered & QUEUE_ORDERED_FUA ? REQ_FUA : 0; |
447 | rq->elevator_private = NULL; | 447 | rq->elevator_private = NULL; |
448 | rq->elevator_private2 = NULL; | 448 | rq->elevator_private2 = NULL; |
449 | init_request_from_bio(rq, q->orig_bar_rq->bio); | 449 | init_request_from_bio(rq, q->orig_bar_rq->bio); |
450 | rq->end_io = bar_end_io; | 450 | rq->end_io = bar_end_io; |
451 | 451 | ||
452 | /* | 452 | /* |
453 | * Queue ordered sequence. As we stack them at the head, we | 453 | * Queue ordered sequence. As we stack them at the head, we |
454 | * need to queue in reverse order. Note that we rely on that | 454 | * need to queue in reverse order. Note that we rely on that |
455 | * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs | 455 | * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs |
456 | * request gets inbetween ordered sequence. | 456 | * request gets inbetween ordered sequence. |
457 | */ | 457 | */ |
458 | if (q->ordered & QUEUE_ORDERED_POSTFLUSH) | 458 | if (q->ordered & QUEUE_ORDERED_POSTFLUSH) |
459 | queue_flush(q, QUEUE_ORDERED_POSTFLUSH); | 459 | queue_flush(q, QUEUE_ORDERED_POSTFLUSH); |
460 | else | 460 | else |
461 | q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; | 461 | q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; |
462 | 462 | ||
463 | elv_insert(q, rq, ELEVATOR_INSERT_FRONT); | 463 | elv_insert(q, rq, ELEVATOR_INSERT_FRONT); |
464 | 464 | ||
465 | if (q->ordered & QUEUE_ORDERED_PREFLUSH) { | 465 | if (q->ordered & QUEUE_ORDERED_PREFLUSH) { |
466 | queue_flush(q, QUEUE_ORDERED_PREFLUSH); | 466 | queue_flush(q, QUEUE_ORDERED_PREFLUSH); |
467 | rq = &q->pre_flush_rq; | 467 | rq = &q->pre_flush_rq; |
468 | } else | 468 | } else |
469 | q->ordseq |= QUEUE_ORDSEQ_PREFLUSH; | 469 | q->ordseq |= QUEUE_ORDSEQ_PREFLUSH; |
470 | 470 | ||
471 | if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0) | 471 | if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0) |
472 | q->ordseq |= QUEUE_ORDSEQ_DRAIN; | 472 | q->ordseq |= QUEUE_ORDSEQ_DRAIN; |
473 | else | 473 | else |
474 | rq = NULL; | 474 | rq = NULL; |
475 | 475 | ||
476 | return rq; | 476 | return rq; |
477 | } | 477 | } |
478 | 478 | ||
479 | int blk_do_ordered(struct request_queue *q, struct request **rqp) | 479 | int blk_do_ordered(struct request_queue *q, struct request **rqp) |
480 | { | 480 | { |
481 | struct request *rq = *rqp; | 481 | struct request *rq = *rqp; |
482 | int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); | 482 | int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); |
483 | 483 | ||
484 | if (!q->ordseq) { | 484 | if (!q->ordseq) { |
485 | if (!is_barrier) | 485 | if (!is_barrier) |
486 | return 1; | 486 | return 1; |
487 | 487 | ||
488 | if (q->next_ordered != QUEUE_ORDERED_NONE) { | 488 | if (q->next_ordered != QUEUE_ORDERED_NONE) { |
489 | *rqp = start_ordered(q, rq); | 489 | *rqp = start_ordered(q, rq); |
490 | return 1; | 490 | return 1; |
491 | } else { | 491 | } else { |
492 | /* | 492 | /* |
493 | * This can happen when the queue switches to | 493 | * This can happen when the queue switches to |
494 | * ORDERED_NONE while this request is on it. | 494 | * ORDERED_NONE while this request is on it. |
495 | */ | 495 | */ |
496 | blkdev_dequeue_request(rq); | 496 | blkdev_dequeue_request(rq); |
497 | end_that_request_first(rq, -EOPNOTSUPP, | 497 | end_that_request_first(rq, -EOPNOTSUPP, |
498 | rq->hard_nr_sectors); | 498 | rq->hard_nr_sectors); |
499 | end_that_request_last(rq, -EOPNOTSUPP); | 499 | end_that_request_last(rq, -EOPNOTSUPP); |
500 | *rqp = NULL; | 500 | *rqp = NULL; |
501 | return 0; | 501 | return 0; |
502 | } | 502 | } |
503 | } | 503 | } |
504 | 504 | ||
505 | /* | 505 | /* |
506 | * Ordered sequence in progress | 506 | * Ordered sequence in progress |
507 | */ | 507 | */ |
508 | 508 | ||
509 | /* Special requests are not subject to ordering rules. */ | 509 | /* Special requests are not subject to ordering rules. */ |
510 | if (!blk_fs_request(rq) && | 510 | if (!blk_fs_request(rq) && |
511 | rq != &q->pre_flush_rq && rq != &q->post_flush_rq) | 511 | rq != &q->pre_flush_rq && rq != &q->post_flush_rq) |
512 | return 1; | 512 | return 1; |
513 | 513 | ||
514 | if (q->ordered & QUEUE_ORDERED_TAG) { | 514 | if (q->ordered & QUEUE_ORDERED_TAG) { |
515 | /* Ordered by tag. Blocking the next barrier is enough. */ | 515 | /* Ordered by tag. Blocking the next barrier is enough. */ |
516 | if (is_barrier && rq != &q->bar_rq) | 516 | if (is_barrier && rq != &q->bar_rq) |
517 | *rqp = NULL; | 517 | *rqp = NULL; |
518 | } else { | 518 | } else { |
519 | /* Ordered by draining. Wait for turn. */ | 519 | /* Ordered by draining. Wait for turn. */ |
520 | WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); | 520 | WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); |
521 | if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) | 521 | if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) |
522 | *rqp = NULL; | 522 | *rqp = NULL; |
523 | } | 523 | } |
524 | 524 | ||
525 | return 1; | 525 | return 1; |
526 | } | 526 | } |
527 | 527 | ||
528 | static int flush_dry_bio_endio(struct bio *bio, unsigned int bytes, int error) | 528 | static int flush_dry_bio_endio(struct bio *bio, unsigned int bytes, int error) |
529 | { | 529 | { |
530 | struct request_queue *q = bio->bi_private; | 530 | struct request_queue *q = bio->bi_private; |
531 | 531 | ||
532 | /* | 532 | /* |
533 | * This is dry run, restore bio_sector and size. We'll finish | 533 | * This is dry run, restore bio_sector and size. We'll finish |
534 | * this request again with the original bi_end_io after an | 534 | * this request again with the original bi_end_io after an |
535 | * error occurs or post flush is complete. | 535 | * error occurs or post flush is complete. |
536 | */ | 536 | */ |
537 | q->bi_size += bytes; | 537 | q->bi_size += bytes; |
538 | 538 | ||
539 | if (bio->bi_size) | 539 | if (bio->bi_size) |
540 | return 1; | 540 | return 1; |
541 | 541 | ||
542 | /* Reset bio */ | 542 | /* Reset bio */ |
543 | set_bit(BIO_UPTODATE, &bio->bi_flags); | 543 | set_bit(BIO_UPTODATE, &bio->bi_flags); |
544 | bio->bi_size = q->bi_size; | 544 | bio->bi_size = q->bi_size; |
545 | bio->bi_sector -= (q->bi_size >> 9); | 545 | bio->bi_sector -= (q->bi_size >> 9); |
546 | q->bi_size = 0; | 546 | q->bi_size = 0; |
547 | 547 | ||
548 | return 0; | 548 | return 0; |
549 | } | 549 | } |
550 | 550 | ||
551 | static int ordered_bio_endio(struct request *rq, struct bio *bio, | 551 | static int ordered_bio_endio(struct request *rq, struct bio *bio, |
552 | unsigned int nbytes, int error) | 552 | unsigned int nbytes, int error) |
553 | { | 553 | { |
554 | struct request_queue *q = rq->q; | 554 | struct request_queue *q = rq->q; |
555 | bio_end_io_t *endio; | 555 | bio_end_io_t *endio; |
556 | void *private; | 556 | void *private; |
557 | 557 | ||
558 | if (&q->bar_rq != rq) | 558 | if (&q->bar_rq != rq) |
559 | return 0; | 559 | return 0; |
560 | 560 | ||
561 | /* | 561 | /* |
562 | * Okay, this is the barrier request in progress, dry finish it. | 562 | * Okay, this is the barrier request in progress, dry finish it. |
563 | */ | 563 | */ |
564 | if (error && !q->orderr) | 564 | if (error && !q->orderr) |
565 | q->orderr = error; | 565 | q->orderr = error; |
566 | 566 | ||
567 | endio = bio->bi_end_io; | 567 | endio = bio->bi_end_io; |
568 | private = bio->bi_private; | 568 | private = bio->bi_private; |
569 | bio->bi_end_io = flush_dry_bio_endio; | 569 | bio->bi_end_io = flush_dry_bio_endio; |
570 | bio->bi_private = q; | 570 | bio->bi_private = q; |
571 | 571 | ||
572 | bio_endio(bio, nbytes, error); | 572 | bio_endio(bio, nbytes, error); |
573 | 573 | ||
574 | bio->bi_end_io = endio; | 574 | bio->bi_end_io = endio; |
575 | bio->bi_private = private; | 575 | bio->bi_private = private; |
576 | 576 | ||
577 | return 1; | 577 | return 1; |
578 | } | 578 | } |
579 | 579 | ||
580 | /** | 580 | /** |
581 | * blk_queue_bounce_limit - set bounce buffer limit for queue | 581 | * blk_queue_bounce_limit - set bounce buffer limit for queue |
582 | * @q: the request queue for the device | 582 | * @q: the request queue for the device |
583 | * @dma_addr: bus address limit | 583 | * @dma_addr: bus address limit |
584 | * | 584 | * |
585 | * Description: | 585 | * Description: |
586 | * Different hardware can have different requirements as to what pages | 586 | * Different hardware can have different requirements as to what pages |
587 | * it can do I/O directly to. A low level driver can call | 587 | * it can do I/O directly to. A low level driver can call |
588 | * blk_queue_bounce_limit to have lower memory pages allocated as bounce | 588 | * blk_queue_bounce_limit to have lower memory pages allocated as bounce |
589 | * buffers for doing I/O to pages residing above @page. | 589 | * buffers for doing I/O to pages residing above @page. |
590 | **/ | 590 | **/ |
591 | void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr) | 591 | void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr) |
592 | { | 592 | { |
593 | unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; | 593 | unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; |
594 | int dma = 0; | 594 | int dma = 0; |
595 | 595 | ||
596 | q->bounce_gfp = GFP_NOIO; | 596 | q->bounce_gfp = GFP_NOIO; |
597 | #if BITS_PER_LONG == 64 | 597 | #if BITS_PER_LONG == 64 |
598 | /* Assume anything <= 4GB can be handled by IOMMU. | 598 | /* Assume anything <= 4GB can be handled by IOMMU. |
599 | Actually some IOMMUs can handle everything, but I don't | 599 | Actually some IOMMUs can handle everything, but I don't |
600 | know of a way to test this here. */ | 600 | know of a way to test this here. */ |
601 | if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) | 601 | if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) |
602 | dma = 1; | 602 | dma = 1; |
603 | q->bounce_pfn = max_low_pfn; | 603 | q->bounce_pfn = max_low_pfn; |
604 | #else | 604 | #else |
605 | if (bounce_pfn < blk_max_low_pfn) | 605 | if (bounce_pfn < blk_max_low_pfn) |
606 | dma = 1; | 606 | dma = 1; |
607 | q->bounce_pfn = bounce_pfn; | 607 | q->bounce_pfn = bounce_pfn; |
608 | #endif | 608 | #endif |
609 | if (dma) { | 609 | if (dma) { |
610 | init_emergency_isa_pool(); | 610 | init_emergency_isa_pool(); |
611 | q->bounce_gfp = GFP_NOIO | GFP_DMA; | 611 | q->bounce_gfp = GFP_NOIO | GFP_DMA; |
612 | q->bounce_pfn = bounce_pfn; | 612 | q->bounce_pfn = bounce_pfn; |
613 | } | 613 | } |
614 | } | 614 | } |
615 | 615 | ||
616 | EXPORT_SYMBOL(blk_queue_bounce_limit); | 616 | EXPORT_SYMBOL(blk_queue_bounce_limit); |
617 | 617 | ||
618 | /** | 618 | /** |
619 | * blk_queue_max_sectors - set max sectors for a request for this queue | 619 | * blk_queue_max_sectors - set max sectors for a request for this queue |
620 | * @q: the request queue for the device | 620 | * @q: the request queue for the device |
621 | * @max_sectors: max sectors in the usual 512b unit | 621 | * @max_sectors: max sectors in the usual 512b unit |
622 | * | 622 | * |
623 | * Description: | 623 | * Description: |
624 | * Enables a low level driver to set an upper limit on the size of | 624 | * Enables a low level driver to set an upper limit on the size of |
625 | * received requests. | 625 | * received requests. |
626 | **/ | 626 | **/ |
627 | void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) | 627 | void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) |
628 | { | 628 | { |
629 | if ((max_sectors << 9) < PAGE_CACHE_SIZE) { | 629 | if ((max_sectors << 9) < PAGE_CACHE_SIZE) { |
630 | max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); | 630 | max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); |
631 | printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); | 631 | printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); |
632 | } | 632 | } |
633 | 633 | ||
634 | if (BLK_DEF_MAX_SECTORS > max_sectors) | 634 | if (BLK_DEF_MAX_SECTORS > max_sectors) |
635 | q->max_hw_sectors = q->max_sectors = max_sectors; | 635 | q->max_hw_sectors = q->max_sectors = max_sectors; |
636 | else { | 636 | else { |
637 | q->max_sectors = BLK_DEF_MAX_SECTORS; | 637 | q->max_sectors = BLK_DEF_MAX_SECTORS; |
638 | q->max_hw_sectors = max_sectors; | 638 | q->max_hw_sectors = max_sectors; |
639 | } | 639 | } |
640 | } | 640 | } |
641 | 641 | ||
642 | EXPORT_SYMBOL(blk_queue_max_sectors); | 642 | EXPORT_SYMBOL(blk_queue_max_sectors); |
643 | 643 | ||
644 | /** | 644 | /** |
645 | * blk_queue_max_phys_segments - set max phys segments for a request for this queue | 645 | * blk_queue_max_phys_segments - set max phys segments for a request for this queue |
646 | * @q: the request queue for the device | 646 | * @q: the request queue for the device |
647 | * @max_segments: max number of segments | 647 | * @max_segments: max number of segments |
648 | * | 648 | * |
649 | * Description: | 649 | * Description: |
650 | * Enables a low level driver to set an upper limit on the number of | 650 | * Enables a low level driver to set an upper limit on the number of |
651 | * physical data segments in a request. This would be the largest sized | 651 | * physical data segments in a request. This would be the largest sized |
652 | * scatter list the driver could handle. | 652 | * scatter list the driver could handle. |
653 | **/ | 653 | **/ |
654 | void blk_queue_max_phys_segments(struct request_queue *q, | 654 | void blk_queue_max_phys_segments(struct request_queue *q, |
655 | unsigned short max_segments) | 655 | unsigned short max_segments) |
656 | { | 656 | { |
657 | if (!max_segments) { | 657 | if (!max_segments) { |
658 | max_segments = 1; | 658 | max_segments = 1; |
659 | printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); | 659 | printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); |
660 | } | 660 | } |
661 | 661 | ||
662 | q->max_phys_segments = max_segments; | 662 | q->max_phys_segments = max_segments; |
663 | } | 663 | } |
664 | 664 | ||
665 | EXPORT_SYMBOL(blk_queue_max_phys_segments); | 665 | EXPORT_SYMBOL(blk_queue_max_phys_segments); |
666 | 666 | ||
667 | /** | 667 | /** |
668 | * blk_queue_max_hw_segments - set max hw segments for a request for this queue | 668 | * blk_queue_max_hw_segments - set max hw segments for a request for this queue |
669 | * @q: the request queue for the device | 669 | * @q: the request queue for the device |
670 | * @max_segments: max number of segments | 670 | * @max_segments: max number of segments |
671 | * | 671 | * |
672 | * Description: | 672 | * Description: |
673 | * Enables a low level driver to set an upper limit on the number of | 673 | * Enables a low level driver to set an upper limit on the number of |
674 | * hw data segments in a request. This would be the largest number of | 674 | * hw data segments in a request. This would be the largest number of |
675 | * address/length pairs the host adapter can actually give as once | 675 | * address/length pairs the host adapter can actually give as once |
676 | * to the device. | 676 | * to the device. |
677 | **/ | 677 | **/ |
678 | void blk_queue_max_hw_segments(struct request_queue *q, | 678 | void blk_queue_max_hw_segments(struct request_queue *q, |
679 | unsigned short max_segments) | 679 | unsigned short max_segments) |
680 | { | 680 | { |
681 | if (!max_segments) { | 681 | if (!max_segments) { |
682 | max_segments = 1; | 682 | max_segments = 1; |
683 | printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); | 683 | printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); |
684 | } | 684 | } |
685 | 685 | ||
686 | q->max_hw_segments = max_segments; | 686 | q->max_hw_segments = max_segments; |
687 | } | 687 | } |
688 | 688 | ||
689 | EXPORT_SYMBOL(blk_queue_max_hw_segments); | 689 | EXPORT_SYMBOL(blk_queue_max_hw_segments); |
690 | 690 | ||
691 | /** | 691 | /** |
692 | * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg | 692 | * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg |
693 | * @q: the request queue for the device | 693 | * @q: the request queue for the device |
694 | * @max_size: max size of segment in bytes | 694 | * @max_size: max size of segment in bytes |
695 | * | 695 | * |
696 | * Description: | 696 | * Description: |
697 | * Enables a low level driver to set an upper limit on the size of a | 697 | * Enables a low level driver to set an upper limit on the size of a |
698 | * coalesced segment | 698 | * coalesced segment |
699 | **/ | 699 | **/ |
700 | void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) | 700 | void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) |
701 | { | 701 | { |
702 | if (max_size < PAGE_CACHE_SIZE) { | 702 | if (max_size < PAGE_CACHE_SIZE) { |
703 | max_size = PAGE_CACHE_SIZE; | 703 | max_size = PAGE_CACHE_SIZE; |
704 | printk("%s: set to minimum %d\n", __FUNCTION__, max_size); | 704 | printk("%s: set to minimum %d\n", __FUNCTION__, max_size); |
705 | } | 705 | } |
706 | 706 | ||
707 | q->max_segment_size = max_size; | 707 | q->max_segment_size = max_size; |
708 | } | 708 | } |
709 | 709 | ||
710 | EXPORT_SYMBOL(blk_queue_max_segment_size); | 710 | EXPORT_SYMBOL(blk_queue_max_segment_size); |
711 | 711 | ||
712 | /** | 712 | /** |
713 | * blk_queue_hardsect_size - set hardware sector size for the queue | 713 | * blk_queue_hardsect_size - set hardware sector size for the queue |
714 | * @q: the request queue for the device | 714 | * @q: the request queue for the device |
715 | * @size: the hardware sector size, in bytes | 715 | * @size: the hardware sector size, in bytes |
716 | * | 716 | * |
717 | * Description: | 717 | * Description: |
718 | * This should typically be set to the lowest possible sector size | 718 | * This should typically be set to the lowest possible sector size |
719 | * that the hardware can operate on (possible without reverting to | 719 | * that the hardware can operate on (possible without reverting to |
720 | * even internal read-modify-write operations). Usually the default | 720 | * even internal read-modify-write operations). Usually the default |
721 | * of 512 covers most hardware. | 721 | * of 512 covers most hardware. |
722 | **/ | 722 | **/ |
723 | void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) | 723 | void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) |
724 | { | 724 | { |
725 | q->hardsect_size = size; | 725 | q->hardsect_size = size; |
726 | } | 726 | } |
727 | 727 | ||
728 | EXPORT_SYMBOL(blk_queue_hardsect_size); | 728 | EXPORT_SYMBOL(blk_queue_hardsect_size); |
729 | 729 | ||
730 | /* | 730 | /* |
731 | * Returns the minimum that is _not_ zero, unless both are zero. | 731 | * Returns the minimum that is _not_ zero, unless both are zero. |
732 | */ | 732 | */ |
733 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) | 733 | #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) |
734 | 734 | ||
735 | /** | 735 | /** |
736 | * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers | 736 | * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers |
737 | * @t: the stacking driver (top) | 737 | * @t: the stacking driver (top) |
738 | * @b: the underlying device (bottom) | 738 | * @b: the underlying device (bottom) |
739 | **/ | 739 | **/ |
740 | void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) | 740 | void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) |
741 | { | 741 | { |
742 | /* zero is "infinity" */ | 742 | /* zero is "infinity" */ |
743 | t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors); | 743 | t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors); |
744 | t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors); | 744 | t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors); |
745 | 745 | ||
746 | t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); | 746 | t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); |
747 | t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); | 747 | t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); |
748 | t->max_segment_size = min(t->max_segment_size,b->max_segment_size); | 748 | t->max_segment_size = min(t->max_segment_size,b->max_segment_size); |
749 | t->hardsect_size = max(t->hardsect_size,b->hardsect_size); | 749 | t->hardsect_size = max(t->hardsect_size,b->hardsect_size); |
750 | if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) | 750 | if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) |
751 | clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags); | 751 | clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags); |
752 | } | 752 | } |
753 | 753 | ||
754 | EXPORT_SYMBOL(blk_queue_stack_limits); | 754 | EXPORT_SYMBOL(blk_queue_stack_limits); |
755 | 755 | ||
756 | /** | 756 | /** |
757 | * blk_queue_segment_boundary - set boundary rules for segment merging | 757 | * blk_queue_segment_boundary - set boundary rules for segment merging |
758 | * @q: the request queue for the device | 758 | * @q: the request queue for the device |
759 | * @mask: the memory boundary mask | 759 | * @mask: the memory boundary mask |
760 | **/ | 760 | **/ |
761 | void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) | 761 | void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) |
762 | { | 762 | { |
763 | if (mask < PAGE_CACHE_SIZE - 1) { | 763 | if (mask < PAGE_CACHE_SIZE - 1) { |
764 | mask = PAGE_CACHE_SIZE - 1; | 764 | mask = PAGE_CACHE_SIZE - 1; |
765 | printk("%s: set to minimum %lx\n", __FUNCTION__, mask); | 765 | printk("%s: set to minimum %lx\n", __FUNCTION__, mask); |
766 | } | 766 | } |
767 | 767 | ||
768 | q->seg_boundary_mask = mask; | 768 | q->seg_boundary_mask = mask; |
769 | } | 769 | } |
770 | 770 | ||
771 | EXPORT_SYMBOL(blk_queue_segment_boundary); | 771 | EXPORT_SYMBOL(blk_queue_segment_boundary); |
772 | 772 | ||
773 | /** | 773 | /** |
774 | * blk_queue_dma_alignment - set dma length and memory alignment | 774 | * blk_queue_dma_alignment - set dma length and memory alignment |
775 | * @q: the request queue for the device | 775 | * @q: the request queue for the device |
776 | * @mask: alignment mask | 776 | * @mask: alignment mask |
777 | * | 777 | * |
778 | * description: | 778 | * description: |
779 | * set required memory and length aligment for direct dma transactions. | 779 | * set required memory and length aligment for direct dma transactions. |
780 | * this is used when buiding direct io requests for the queue. | 780 | * this is used when buiding direct io requests for the queue. |
781 | * | 781 | * |
782 | **/ | 782 | **/ |
783 | void blk_queue_dma_alignment(struct request_queue *q, int mask) | 783 | void blk_queue_dma_alignment(struct request_queue *q, int mask) |
784 | { | 784 | { |
785 | q->dma_alignment = mask; | 785 | q->dma_alignment = mask; |
786 | } | 786 | } |
787 | 787 | ||
788 | EXPORT_SYMBOL(blk_queue_dma_alignment); | 788 | EXPORT_SYMBOL(blk_queue_dma_alignment); |
789 | 789 | ||
790 | /** | 790 | /** |
791 | * blk_queue_find_tag - find a request by its tag and queue | 791 | * blk_queue_find_tag - find a request by its tag and queue |
792 | * @q: The request queue for the device | 792 | * @q: The request queue for the device |
793 | * @tag: The tag of the request | 793 | * @tag: The tag of the request |
794 | * | 794 | * |
795 | * Notes: | 795 | * Notes: |
796 | * Should be used when a device returns a tag and you want to match | 796 | * Should be used when a device returns a tag and you want to match |
797 | * it with a request. | 797 | * it with a request. |
798 | * | 798 | * |
799 | * no locks need be held. | 799 | * no locks need be held. |
800 | **/ | 800 | **/ |
801 | struct request *blk_queue_find_tag(struct request_queue *q, int tag) | 801 | struct request *blk_queue_find_tag(struct request_queue *q, int tag) |
802 | { | 802 | { |
803 | return blk_map_queue_find_tag(q->queue_tags, tag); | 803 | return blk_map_queue_find_tag(q->queue_tags, tag); |
804 | } | 804 | } |
805 | 805 | ||
806 | EXPORT_SYMBOL(blk_queue_find_tag); | 806 | EXPORT_SYMBOL(blk_queue_find_tag); |
807 | 807 | ||
808 | /** | 808 | /** |
809 | * __blk_free_tags - release a given set of tag maintenance info | 809 | * __blk_free_tags - release a given set of tag maintenance info |
810 | * @bqt: the tag map to free | 810 | * @bqt: the tag map to free |
811 | * | 811 | * |
812 | * Tries to free the specified @bqt@. Returns true if it was | 812 | * Tries to free the specified @bqt@. Returns true if it was |
813 | * actually freed and false if there are still references using it | 813 | * actually freed and false if there are still references using it |
814 | */ | 814 | */ |
815 | static int __blk_free_tags(struct blk_queue_tag *bqt) | 815 | static int __blk_free_tags(struct blk_queue_tag *bqt) |
816 | { | 816 | { |
817 | int retval; | 817 | int retval; |
818 | 818 | ||
819 | retval = atomic_dec_and_test(&bqt->refcnt); | 819 | retval = atomic_dec_and_test(&bqt->refcnt); |
820 | if (retval) { | 820 | if (retval) { |
821 | BUG_ON(bqt->busy); | 821 | BUG_ON(bqt->busy); |
822 | BUG_ON(!list_empty(&bqt->busy_list)); | 822 | BUG_ON(!list_empty(&bqt->busy_list)); |
823 | 823 | ||
824 | kfree(bqt->tag_index); | 824 | kfree(bqt->tag_index); |
825 | bqt->tag_index = NULL; | 825 | bqt->tag_index = NULL; |
826 | 826 | ||
827 | kfree(bqt->tag_map); | 827 | kfree(bqt->tag_map); |
828 | bqt->tag_map = NULL; | 828 | bqt->tag_map = NULL; |
829 | 829 | ||
830 | kfree(bqt); | 830 | kfree(bqt); |
831 | 831 | ||
832 | } | 832 | } |
833 | 833 | ||
834 | return retval; | 834 | return retval; |
835 | } | 835 | } |
836 | 836 | ||
837 | /** | 837 | /** |
838 | * __blk_queue_free_tags - release tag maintenance info | 838 | * __blk_queue_free_tags - release tag maintenance info |
839 | * @q: the request queue for the device | 839 | * @q: the request queue for the device |
840 | * | 840 | * |
841 | * Notes: | 841 | * Notes: |
842 | * blk_cleanup_queue() will take care of calling this function, if tagging | 842 | * blk_cleanup_queue() will take care of calling this function, if tagging |
843 | * has been used. So there's no need to call this directly. | 843 | * has been used. So there's no need to call this directly. |
844 | **/ | 844 | **/ |
845 | static void __blk_queue_free_tags(struct request_queue *q) | 845 | static void __blk_queue_free_tags(struct request_queue *q) |
846 | { | 846 | { |
847 | struct blk_queue_tag *bqt = q->queue_tags; | 847 | struct blk_queue_tag *bqt = q->queue_tags; |
848 | 848 | ||
849 | if (!bqt) | 849 | if (!bqt) |
850 | return; | 850 | return; |
851 | 851 | ||
852 | __blk_free_tags(bqt); | 852 | __blk_free_tags(bqt); |
853 | 853 | ||
854 | q->queue_tags = NULL; | 854 | q->queue_tags = NULL; |
855 | q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); | 855 | q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); |
856 | } | 856 | } |
857 | 857 | ||
858 | 858 | ||
859 | /** | 859 | /** |
860 | * blk_free_tags - release a given set of tag maintenance info | 860 | * blk_free_tags - release a given set of tag maintenance info |
861 | * @bqt: the tag map to free | 861 | * @bqt: the tag map to free |
862 | * | 862 | * |
863 | * For an externally managed @bqt@, this frees the map. Callers of this | 863 | * For an externally managed @bqt@, this frees the map. Callers of this |
864 | * function must guarantee to have released all the queues that | 864 | * function must guarantee to have released all the queues that |
865 | * might have been using this tag map. | 865 | * might have been using this tag map. |
866 | */ | 866 | */ |
867 | void blk_free_tags(struct blk_queue_tag *bqt) | 867 | void blk_free_tags(struct blk_queue_tag *bqt) |
868 | { | 868 | { |
869 | if (unlikely(!__blk_free_tags(bqt))) | 869 | if (unlikely(!__blk_free_tags(bqt))) |
870 | BUG(); | 870 | BUG(); |
871 | } | 871 | } |
872 | EXPORT_SYMBOL(blk_free_tags); | 872 | EXPORT_SYMBOL(blk_free_tags); |
873 | 873 | ||
874 | /** | 874 | /** |
875 | * blk_queue_free_tags - release tag maintenance info | 875 | * blk_queue_free_tags - release tag maintenance info |
876 | * @q: the request queue for the device | 876 | * @q: the request queue for the device |
877 | * | 877 | * |
878 | * Notes: | 878 | * Notes: |
879 | * This is used to disable tagged queuing on a device, yet leave the | 879 | * This is used to disable tagged queuing on a device, yet leave the |
880 | * queue in function. | 880 | * queue in function. |
881 | **/ | 881 | **/ |
882 | void blk_queue_free_tags(struct request_queue *q) | 882 | void blk_queue_free_tags(struct request_queue *q) |
883 | { | 883 | { |
884 | clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); | 884 | clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); |
885 | } | 885 | } |
886 | 886 | ||
887 | EXPORT_SYMBOL(blk_queue_free_tags); | 887 | EXPORT_SYMBOL(blk_queue_free_tags); |
888 | 888 | ||
889 | static int | 889 | static int |
890 | init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth) | 890 | init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth) |
891 | { | 891 | { |
892 | struct request **tag_index; | 892 | struct request **tag_index; |
893 | unsigned long *tag_map; | 893 | unsigned long *tag_map; |
894 | int nr_ulongs; | 894 | int nr_ulongs; |
895 | 895 | ||
896 | if (q && depth > q->nr_requests * 2) { | 896 | if (q && depth > q->nr_requests * 2) { |
897 | depth = q->nr_requests * 2; | 897 | depth = q->nr_requests * 2; |
898 | printk(KERN_ERR "%s: adjusted depth to %d\n", | 898 | printk(KERN_ERR "%s: adjusted depth to %d\n", |
899 | __FUNCTION__, depth); | 899 | __FUNCTION__, depth); |
900 | } | 900 | } |
901 | 901 | ||
902 | tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC); | 902 | tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC); |
903 | if (!tag_index) | 903 | if (!tag_index) |
904 | goto fail; | 904 | goto fail; |
905 | 905 | ||
906 | nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; | 906 | nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; |
907 | tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); | 907 | tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); |
908 | if (!tag_map) | 908 | if (!tag_map) |
909 | goto fail; | 909 | goto fail; |
910 | 910 | ||
911 | tags->real_max_depth = depth; | 911 | tags->real_max_depth = depth; |
912 | tags->max_depth = depth; | 912 | tags->max_depth = depth; |
913 | tags->tag_index = tag_index; | 913 | tags->tag_index = tag_index; |
914 | tags->tag_map = tag_map; | 914 | tags->tag_map = tag_map; |
915 | 915 | ||
916 | return 0; | 916 | return 0; |
917 | fail: | 917 | fail: |
918 | kfree(tag_index); | 918 | kfree(tag_index); |
919 | return -ENOMEM; | 919 | return -ENOMEM; |
920 | } | 920 | } |
921 | 921 | ||
922 | static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, | 922 | static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, |
923 | int depth) | 923 | int depth) |
924 | { | 924 | { |
925 | struct blk_queue_tag *tags; | 925 | struct blk_queue_tag *tags; |
926 | 926 | ||
927 | tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); | 927 | tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); |
928 | if (!tags) | 928 | if (!tags) |
929 | goto fail; | 929 | goto fail; |
930 | 930 | ||
931 | if (init_tag_map(q, tags, depth)) | 931 | if (init_tag_map(q, tags, depth)) |
932 | goto fail; | 932 | goto fail; |
933 | 933 | ||
934 | INIT_LIST_HEAD(&tags->busy_list); | 934 | INIT_LIST_HEAD(&tags->busy_list); |
935 | tags->busy = 0; | 935 | tags->busy = 0; |
936 | atomic_set(&tags->refcnt, 1); | 936 | atomic_set(&tags->refcnt, 1); |
937 | return tags; | 937 | return tags; |
938 | fail: | 938 | fail: |
939 | kfree(tags); | 939 | kfree(tags); |
940 | return NULL; | 940 | return NULL; |
941 | } | 941 | } |
942 | 942 | ||
943 | /** | 943 | /** |
944 | * blk_init_tags - initialize the tag info for an external tag map | 944 | * blk_init_tags - initialize the tag info for an external tag map |
945 | * @depth: the maximum queue depth supported | 945 | * @depth: the maximum queue depth supported |
947 | **/ | 947 | **/ |
948 | struct blk_queue_tag *blk_init_tags(int depth) | 948 | struct blk_queue_tag *blk_init_tags(int depth) |
949 | { | 949 | { |
950 | return __blk_queue_init_tags(NULL, depth); | 950 | return __blk_queue_init_tags(NULL, depth); |
951 | } | 951 | } |
952 | EXPORT_SYMBOL(blk_init_tags); | 952 | EXPORT_SYMBOL(blk_init_tags); |
953 | 953 | ||
954 | /** | 954 | /** |
955 | * blk_queue_init_tags - initialize the queue tag info | 955 | * blk_queue_init_tags - initialize the queue tag info |
956 | * @q: the request queue for the device | 956 | * @q: the request queue for the device |
957 | * @depth: the maximum queue depth supported | 957 | * @depth: the maximum queue depth supported |
958 | * @tags: the tag to use | 958 | * @tags: the tag to use |
959 | **/ | 959 | **/ |
960 | int blk_queue_init_tags(struct request_queue *q, int depth, | 960 | int blk_queue_init_tags(struct request_queue *q, int depth, |
961 | struct blk_queue_tag *tags) | 961 | struct blk_queue_tag *tags) |
962 | { | 962 | { |
963 | int rc; | 963 | int rc; |
964 | 964 | ||
965 | BUG_ON(tags && q->queue_tags && tags != q->queue_tags); | 965 | BUG_ON(tags && q->queue_tags && tags != q->queue_tags); |
966 | 966 | ||
967 | if (!tags && !q->queue_tags) { | 967 | if (!tags && !q->queue_tags) { |
968 | tags = __blk_queue_init_tags(q, depth); | 968 | tags = __blk_queue_init_tags(q, depth); |
969 | 969 | ||
970 | if (!tags) | 970 | if (!tags) |
971 | goto fail; | 971 | goto fail; |
972 | } else if (q->queue_tags) { | 972 | } else if (q->queue_tags) { |
973 | if ((rc = blk_queue_resize_tags(q, depth))) | 973 | if ((rc = blk_queue_resize_tags(q, depth))) |
974 | return rc; | 974 | return rc; |
975 | set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); | 975 | set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); |
976 | return 0; | 976 | return 0; |
977 | } else | 977 | } else |
978 | atomic_inc(&tags->refcnt); | 978 | atomic_inc(&tags->refcnt); |
979 | 979 | ||
980 | /* | 980 | /* |
981 | * assign it, all done | 981 | * assign it, all done |
982 | */ | 982 | */ |
983 | q->queue_tags = tags; | 983 | q->queue_tags = tags; |
984 | q->queue_flags |= (1 << QUEUE_FLAG_QUEUED); | 984 | q->queue_flags |= (1 << QUEUE_FLAG_QUEUED); |
985 | return 0; | 985 | return 0; |
986 | fail: | 986 | fail: |
987 | kfree(tags); | 987 | kfree(tags); |
988 | return -ENOMEM; | 988 | return -ENOMEM; |
989 | } | 989 | } |
990 | 990 | ||
991 | EXPORT_SYMBOL(blk_queue_init_tags); | 991 | EXPORT_SYMBOL(blk_queue_init_tags); |
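A minimal sketch of enabling tagged queuing from a driver probe path (mydrv_enable_tcq is an assumed name): passing NULL for @tags lets the block layer allocate a queue-private tag map, while a map returned by blk_init_tags() could instead be shared between several queues.

	#include <linux/blkdev.h>

	/* Illustrative only: turn on tagged queuing with the depth the hardware
	 * advertises. A NULL tag map requests a private one for this queue.
	 */
	static int mydrv_enable_tcq(struct request_queue *q, int hw_depth)
	{
		return blk_queue_init_tags(q, hw_depth, NULL);
	}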
992 | 992 | ||
993 | /** | 993 | /** |
994 | * blk_queue_resize_tags - change the queueing depth | 994 | * blk_queue_resize_tags - change the queueing depth |
995 | * @q: the request queue for the device | 995 | * @q: the request queue for the device |
996 | * @new_depth: the new max command queueing depth | 996 | * @new_depth: the new max command queueing depth |
997 | * | 997 | * |
998 | * Notes: | 998 | * Notes: |
999 | * Must be called with the queue lock held. | 999 | * Must be called with the queue lock held. |
1000 | **/ | 1000 | **/ |
1001 | int blk_queue_resize_tags(struct request_queue *q, int new_depth) | 1001 | int blk_queue_resize_tags(struct request_queue *q, int new_depth) |
1002 | { | 1002 | { |
1003 | struct blk_queue_tag *bqt = q->queue_tags; | 1003 | struct blk_queue_tag *bqt = q->queue_tags; |
1004 | struct request **tag_index; | 1004 | struct request **tag_index; |
1005 | unsigned long *tag_map; | 1005 | unsigned long *tag_map; |
1006 | int max_depth, nr_ulongs; | 1006 | int max_depth, nr_ulongs; |
1007 | 1007 | ||
1008 | if (!bqt) | 1008 | if (!bqt) |
1009 | return -ENXIO; | 1009 | return -ENXIO; |
1010 | 1010 | ||
1011 | /* | 1011 | /* |
1012 | * if we already have a large enough real_max_depth, just | 1012 | * if we already have a large enough real_max_depth, just |
1013 | * adjust max_depth. *NOTE* as requests with tag value | 1013 | * adjust max_depth. *NOTE* as requests with tag value |
1014 | * between new_depth and real_max_depth can be in-flight, the tag | 1014 | * between new_depth and real_max_depth can be in-flight, the tag |
1015 | * map cannot be shrunk blindly here. | 1015 | * map cannot be shrunk blindly here. |
1016 | */ | 1016 | */ |
1017 | if (new_depth <= bqt->real_max_depth) { | 1017 | if (new_depth <= bqt->real_max_depth) { |
1018 | bqt->max_depth = new_depth; | 1018 | bqt->max_depth = new_depth; |
1019 | return 0; | 1019 | return 0; |
1020 | } | 1020 | } |
1021 | 1021 | ||
1022 | /* | 1022 | /* |
1023 | * Currently cannot replace a shared tag map with a new | 1023 | * Currently cannot replace a shared tag map with a new |
1024 | * one, so error out if this is the case | 1024 | * one, so error out if this is the case |
1025 | */ | 1025 | */ |
1026 | if (atomic_read(&bqt->refcnt) != 1) | 1026 | if (atomic_read(&bqt->refcnt) != 1) |
1027 | return -EBUSY; | 1027 | return -EBUSY; |
1028 | 1028 | ||
1029 | /* | 1029 | /* |
1030 | * save the old state info, so we can copy it back | 1030 | * save the old state info, so we can copy it back |
1031 | */ | 1031 | */ |
1032 | tag_index = bqt->tag_index; | 1032 | tag_index = bqt->tag_index; |
1033 | tag_map = bqt->tag_map; | 1033 | tag_map = bqt->tag_map; |
1034 | max_depth = bqt->real_max_depth; | 1034 | max_depth = bqt->real_max_depth; |
1035 | 1035 | ||
1036 | if (init_tag_map(q, bqt, new_depth)) | 1036 | if (init_tag_map(q, bqt, new_depth)) |
1037 | return -ENOMEM; | 1037 | return -ENOMEM; |
1038 | 1038 | ||
1039 | memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); | 1039 | memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); |
1040 | nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; | 1040 | nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; |
1041 | memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); | 1041 | memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); |
1042 | 1042 | ||
1043 | kfree(tag_index); | 1043 | kfree(tag_index); |
1044 | kfree(tag_map); | 1044 | kfree(tag_map); |
1045 | return 0; | 1045 | return 0; |
1046 | } | 1046 | } |
1047 | 1047 | ||
1048 | EXPORT_SYMBOL(blk_queue_resize_tags); | 1048 | EXPORT_SYMBOL(blk_queue_resize_tags); |
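Since the Notes require the queue lock, a caller outside the request function would typically wrap the resize as in this sketch (mydrv_set_depth is an assumed name):

	#include <linux/blkdev.h>
	#include <linux/spinlock.h>

	/* Illustrative only: change the usable tag depth under the queue lock. */
	static int mydrv_set_depth(struct request_queue *q, int new_depth)
	{
		unsigned long flags;
		int ret;

		spin_lock_irqsave(q->queue_lock, flags);
		ret = blk_queue_resize_tags(q, new_depth);
		spin_unlock_irqrestore(q->queue_lock, flags);

		return ret;
	}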
1049 | 1049 | ||
1050 | /** | 1050 | /** |
1051 | * blk_queue_end_tag - end tag operations for a request | 1051 | * blk_queue_end_tag - end tag operations for a request |
1052 | * @q: the request queue for the device | 1052 | * @q: the request queue for the device |
1053 | * @rq: the request that has completed | 1053 | * @rq: the request that has completed |
1054 | * | 1054 | * |
1055 | * Description: | 1055 | * Description: |
1056 | * Typically called when end_that_request_first() returns 0, meaning | 1056 | * Typically called when end_that_request_first() returns 0, meaning |
1057 | * all transfers have been done for a request. It's important to call | 1057 | * all transfers have been done for a request. It's important to call |
1058 | * this function before end_that_request_last(), as that will put the | 1058 | * this function before end_that_request_last(), as that will put the |
1059 | * request back on the free list thus corrupting the internal tag list. | 1059 | * request back on the free list thus corrupting the internal tag list. |
1060 | * | 1060 | * |
1061 | * Notes: | 1061 | * Notes: |
1062 | * queue lock must be held. | 1062 | * queue lock must be held. |
1063 | **/ | 1063 | **/ |
1064 | void blk_queue_end_tag(struct request_queue *q, struct request *rq) | 1064 | void blk_queue_end_tag(struct request_queue *q, struct request *rq) |
1065 | { | 1065 | { |
1066 | struct blk_queue_tag *bqt = q->queue_tags; | 1066 | struct blk_queue_tag *bqt = q->queue_tags; |
1067 | int tag = rq->tag; | 1067 | int tag = rq->tag; |
1068 | 1068 | ||
1069 | BUG_ON(tag == -1); | 1069 | BUG_ON(tag == -1); |
1070 | 1070 | ||
1071 | if (unlikely(tag >= bqt->real_max_depth)) | 1071 | if (unlikely(tag >= bqt->real_max_depth)) |
1072 | /* | 1072 | /* |
1073 | * This can happen after tag depth has been reduced. | 1073 | * This can happen after tag depth has been reduced. |
1074 | * FIXME: how about a warning or info message here? | 1074 | * FIXME: how about a warning or info message here? |
1075 | */ | 1075 | */ |
1076 | return; | 1076 | return; |
1077 | 1077 | ||
1078 | if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) { | 1078 | if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) { |
1079 | printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", | 1079 | printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", |
1080 | __FUNCTION__, tag); | 1080 | __FUNCTION__, tag); |
1081 | return; | 1081 | return; |
1082 | } | 1082 | } |
1083 | 1083 | ||
1084 | list_del_init(&rq->queuelist); | 1084 | list_del_init(&rq->queuelist); |
1085 | rq->cmd_flags &= ~REQ_QUEUED; | 1085 | rq->cmd_flags &= ~REQ_QUEUED; |
1086 | rq->tag = -1; | 1086 | rq->tag = -1; |
1087 | 1087 | ||
1088 | if (unlikely(bqt->tag_index[tag] == NULL)) | 1088 | if (unlikely(bqt->tag_index[tag] == NULL)) |
1089 | printk(KERN_ERR "%s: tag %d is missing\n", | 1089 | printk(KERN_ERR "%s: tag %d is missing\n", |
1090 | __FUNCTION__, tag); | 1090 | __FUNCTION__, tag); |
1091 | 1091 | ||
1092 | bqt->tag_index[tag] = NULL; | 1092 | bqt->tag_index[tag] = NULL; |
1093 | bqt->busy--; | 1093 | bqt->busy--; |
1094 | } | 1094 | } |
1095 | 1095 | ||
1096 | EXPORT_SYMBOL(blk_queue_end_tag); | 1096 | EXPORT_SYMBOL(blk_queue_end_tag); |
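A hedged sketch of the completion ordering the Description insists on, assuming a hypothetical mydrv_end_request called with the queue lock held: the tag is released only once end_that_request_first() reports that all transfers are done, and before end_that_request_last() recycles the request.

	#include <linux/blkdev.h>

	/* Illustrative only: complete a tagged request in the documented order. */
	static void mydrv_end_request(struct request_queue *q, struct request *rq,
				      int uptodate)
	{
		if (!end_that_request_first(rq, uptodate, rq->hard_nr_sectors)) {
			blk_queue_end_tag(q, rq);	/* free the tag first */
			end_that_request_last(rq, uptodate);
		}
	}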
1097 | 1097 | ||
1098 | /** | 1098 | /** |
1099 | * blk_queue_start_tag - find a free tag and assign it | 1099 | * blk_queue_start_tag - find a free tag and assign it |
1100 | * @q: the request queue for the device | 1100 | * @q: the request queue for the device |
1101 | * @rq: the block request that needs tagging | 1101 | * @rq: the block request that needs tagging |
1102 | * | 1102 | * |
1103 | * Description: | 1103 | * Description: |
1104 | * This can either be used as a stand-alone helper, or possibly be | 1104 | * This can either be used as a stand-alone helper, or possibly be |
1105 | * assigned as the queue &prep_rq_fn (in which case &struct request | 1105 | * assigned as the queue &prep_rq_fn (in which case &struct request |
1106 | * automagically gets a tag assigned). Note that this function | 1106 | * automagically gets a tag assigned). Note that this function |
1107 | * assumes that any type of request can be queued! If this is not | 1107 | * assumes that any type of request can be queued! If this is not |
1108 | * true for your device, you must check the request type before | 1108 | * true for your device, you must check the request type before |
1109 | * calling this function. The request will also be removed from | 1109 | * calling this function. The request will also be removed from |
1110 | * the request queue, so it is the driver's responsibility to re-add | 1110 | * the request queue, so it is the driver's responsibility to re-add |
1111 | * it if it should need to be restarted for some reason. | 1111 | * it if it should need to be restarted for some reason. |
1112 | * | 1112 | * |
1113 | * Notes: | 1113 | * Notes: |
1114 | * queue lock must be held. | 1114 | * queue lock must be held. |
1115 | **/ | 1115 | **/ |
1116 | int blk_queue_start_tag(struct request_queue *q, struct request *rq) | 1116 | int blk_queue_start_tag(struct request_queue *q, struct request *rq) |
1117 | { | 1117 | { |
1118 | struct blk_queue_tag *bqt = q->queue_tags; | 1118 | struct blk_queue_tag *bqt = q->queue_tags; |
1119 | int tag; | 1119 | int tag; |
1120 | 1120 | ||
1121 | if (unlikely((rq->cmd_flags & REQ_QUEUED))) { | 1121 | if (unlikely((rq->cmd_flags & REQ_QUEUED))) { |
1122 | printk(KERN_ERR | 1122 | printk(KERN_ERR |
1123 | "%s: request %p for device [%s] already tagged %d", | 1123 | "%s: request %p for device [%s] already tagged %d", |
1124 | __FUNCTION__, rq, | 1124 | __FUNCTION__, rq, |
1125 | rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); | 1125 | rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); |
1126 | BUG(); | 1126 | BUG(); |
1127 | } | 1127 | } |
1128 | 1128 | ||
1129 | /* | 1129 | /* |
1130 | * Protect against shared tag maps, as we may not have exclusive | 1130 | * Protect against shared tag maps, as we may not have exclusive |
1131 | * access to the tag map. | 1131 | * access to the tag map. |
1132 | */ | 1132 | */ |
1133 | do { | 1133 | do { |
1134 | tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); | 1134 | tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); |
1135 | if (tag >= bqt->max_depth) | 1135 | if (tag >= bqt->max_depth) |
1136 | return 1; | 1136 | return 1; |
1137 | 1137 | ||
1138 | } while (test_and_set_bit(tag, bqt->tag_map)); | 1138 | } while (test_and_set_bit(tag, bqt->tag_map)); |
1139 | 1139 | ||
1140 | rq->cmd_flags |= REQ_QUEUED; | 1140 | rq->cmd_flags |= REQ_QUEUED; |
1141 | rq->tag = tag; | 1141 | rq->tag = tag; |
1142 | bqt->tag_index[tag] = rq; | 1142 | bqt->tag_index[tag] = rq; |
1143 | blkdev_dequeue_request(rq); | 1143 | blkdev_dequeue_request(rq); |
1144 | list_add(&rq->queuelist, &bqt->busy_list); | 1144 | list_add(&rq->queuelist, &bqt->busy_list); |
1145 | bqt->busy++; | 1145 | bqt->busy++; |
1146 | return 0; | 1146 | return 0; |
1147 | } | 1147 | } |
1148 | 1148 | ||
1149 | EXPORT_SYMBOL(blk_queue_start_tag); | 1149 | EXPORT_SYMBOL(blk_queue_start_tag); |
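An illustrative request-function fragment using the stand-alone form described above (mydrv_request_fn is an assumed name; the issue step is left as a comment): blk_queue_start_tag() both assigns rq->tag and dequeues the request, and a non-zero return means the tag map is currently exhausted.

	#include <linux/blkdev.h>

	/* Illustrative only: tag and dispatch requests until tags run out. */
	static void mydrv_request_fn(struct request_queue *q)
	{
		struct request *rq;

		while ((rq = elv_next_request(q)) != NULL) {
			if (blk_queue_start_tag(q, rq))
				break;	/* no free tag, the request stays queued */

			/* ... issue the command identified by rq->tag to hardware ... */
		}
	}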
1150 | 1150 | ||
1151 | /** | 1151 | /** |
1152 | * blk_queue_invalidate_tags - invalidate all pending tags | 1152 | * blk_queue_invalidate_tags - invalidate all pending tags |
1153 | * @q: the request queue for the device | 1153 | * @q: the request queue for the device |
1154 | * | 1154 | * |
1155 | * Description: | 1155 | * Description: |
1156 | * Hardware conditions may dictate a need to stop all pending requests. | 1156 | * Hardware conditions may dictate a need to stop all pending requests. |
1157 | * In this case, we will safely clear the block side of the tag queue and | 1157 | * In this case, we will safely clear the block side of the tag queue and |
1158 | * re-add all requests to the request queue in the right order. | 1158 | * re-add all requests to the request queue in the right order. |
1159 | * | 1159 | * |
1160 | * Notes: | 1160 | * Notes: |
1161 | * queue lock must be held. | 1161 | * queue lock must be held. |
1162 | **/ | 1162 | **/ |
1163 | void blk_queue_invalidate_tags(struct request_queue *q) | 1163 | void blk_queue_invalidate_tags(struct request_queue *q) |
1164 | { | 1164 | { |
1165 | struct blk_queue_tag *bqt = q->queue_tags; | 1165 | struct blk_queue_tag *bqt = q->queue_tags; |
1166 | struct list_head *tmp, *n; | 1166 | struct list_head *tmp, *n; |
1167 | struct request *rq; | 1167 | struct request *rq; |
1168 | 1168 | ||
1169 | list_for_each_safe(tmp, n, &bqt->busy_list) { | 1169 | list_for_each_safe(tmp, n, &bqt->busy_list) { |
1170 | rq = list_entry_rq(tmp); | 1170 | rq = list_entry_rq(tmp); |
1171 | 1171 | ||
1172 | if (rq->tag == -1) { | 1172 | if (rq->tag == -1) { |
1173 | printk(KERN_ERR | 1173 | printk(KERN_ERR |
1174 | "%s: bad tag found on list\n", __FUNCTION__); | 1174 | "%s: bad tag found on list\n", __FUNCTION__); |
1175 | list_del_init(&rq->queuelist); | 1175 | list_del_init(&rq->queuelist); |
1176 | rq->cmd_flags &= ~REQ_QUEUED; | 1176 | rq->cmd_flags &= ~REQ_QUEUED; |
1177 | } else | 1177 | } else |
1178 | blk_queue_end_tag(q, rq); | 1178 | blk_queue_end_tag(q, rq); |
1179 | 1179 | ||
1180 | rq->cmd_flags &= ~REQ_STARTED; | 1180 | rq->cmd_flags &= ~REQ_STARTED; |
1181 | __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); | 1181 | __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); |
1182 | } | 1182 | } |
1183 | } | 1183 | } |
1184 | 1184 | ||
1185 | EXPORT_SYMBOL(blk_queue_invalidate_tags); | 1185 | EXPORT_SYMBOL(blk_queue_invalidate_tags); |
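A short sketch of a plausible caller, assuming a hypothetical controller-reset path that already holds the queue lock as the Notes require:

	#include <linux/blkdev.h>

	/* Illustrative only: after a reset, requeue everything that was in flight. */
	static void mydrv_reset_done(struct request_queue *q)
	{
		blk_queue_invalidate_tags(q);
	}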
1186 | 1186 | ||
1187 | void blk_dump_rq_flags(struct request *rq, char *msg) | 1187 | void blk_dump_rq_flags(struct request *rq, char *msg) |
1188 | { | 1188 | { |
1189 | int bit; | 1189 | int bit; |
1190 | 1190 | ||
1191 | printk("%s: dev %s: type=%x, flags=%x\n", msg, | 1191 | printk("%s: dev %s: type=%x, flags=%x\n", msg, |
1192 | rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, | 1192 | rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, |
1193 | rq->cmd_flags); | 1193 | rq->cmd_flags); |
1194 | 1194 | ||
1195 | printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector, | 1195 | printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector, |
1196 | rq->nr_sectors, | 1196 | rq->nr_sectors, |
1197 | rq->current_nr_sectors); | 1197 | rq->current_nr_sectors); |
1198 | printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len); | 1198 | printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len); |
1199 | 1199 | ||
1200 | if (blk_pc_request(rq)) { | 1200 | if (blk_pc_request(rq)) { |
1201 | printk("cdb: "); | 1201 | printk("cdb: "); |
1202 | for (bit = 0; bit < sizeof(rq->cmd); bit++) | 1202 | for (bit = 0; bit < sizeof(rq->cmd); bit++) |
1203 | printk("%02x ", rq->cmd[bit]); | 1203 | printk("%02x ", rq->cmd[bit]); |
1204 | printk("\n"); | 1204 | printk("\n"); |
1205 | } | 1205 | } |
1206 | } | 1206 | } |
1207 | 1207 | ||
1208 | EXPORT_SYMBOL(blk_dump_rq_flags); | 1208 | EXPORT_SYMBOL(blk_dump_rq_flags); |
1209 | 1209 | ||
1210 | void blk_recount_segments(struct request_queue *q, struct bio *bio) | 1210 | void blk_recount_segments(struct request_queue *q, struct bio *bio) |
1211 | { | 1211 | { |
1212 | struct bio_vec *bv, *bvprv = NULL; | 1212 | struct bio_vec *bv, *bvprv = NULL; |
1213 | int i, nr_phys_segs, nr_hw_segs, seg_size, hw_seg_size, cluster; | 1213 | int i, nr_phys_segs, nr_hw_segs, seg_size, hw_seg_size, cluster; |
1214 | int high, highprv = 1; | 1214 | int high, highprv = 1; |
1215 | 1215 | ||
1216 | if (unlikely(!bio->bi_io_vec)) | 1216 | if (unlikely(!bio->bi_io_vec)) |
1217 | return; | 1217 | return; |
1218 | 1218 | ||
1219 | cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); | 1219 | cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); |
1220 | hw_seg_size = seg_size = nr_phys_segs = nr_hw_segs = 0; | 1220 | hw_seg_size = seg_size = nr_phys_segs = nr_hw_segs = 0; |
1221 | bio_for_each_segment(bv, bio, i) { | 1221 | bio_for_each_segment(bv, bio, i) { |
1222 | /* | 1222 | /* |
1223 | * the trick here is making sure that a high page is never | 1223 | * the trick here is making sure that a high page is never |
1224 | * considered part of another segment, since that might | 1224 | * considered part of another segment, since that might |
1225 | * change with the bounce page. | 1225 | * change with the bounce page. |
1226 | */ | 1226 | */ |
1227 | high = page_to_pfn(bv->bv_page) > q->bounce_pfn; | 1227 | high = page_to_pfn(bv->bv_page) > q->bounce_pfn; |
1228 | if (high || highprv) | 1228 | if (high || highprv) |
1229 | goto new_hw_segment; | 1229 | goto new_hw_segment; |
1230 | if (cluster) { | 1230 | if (cluster) { |
1231 | if (seg_size + bv->bv_len > q->max_segment_size) | 1231 | if (seg_size + bv->bv_len > q->max_segment_size) |
1232 | goto new_segment; | 1232 | goto new_segment; |
1233 | if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) | 1233 | if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) |
1234 | goto new_segment; | 1234 | goto new_segment; |
1235 | if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) | 1235 | if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) |
1236 | goto new_segment; | 1236 | goto new_segment; |
1237 | if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) | 1237 | if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) |
1238 | goto new_hw_segment; | 1238 | goto new_hw_segment; |
1239 | 1239 | ||
1240 | seg_size += bv->bv_len; | 1240 | seg_size += bv->bv_len; |
1241 | hw_seg_size += bv->bv_len; | 1241 | hw_seg_size += bv->bv_len; |
1242 | bvprv = bv; | 1242 | bvprv = bv; |
1243 | continue; | 1243 | continue; |
1244 | } | 1244 | } |
1245 | new_segment: | 1245 | new_segment: |
1246 | if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) && | 1246 | if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) && |
1247 | !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) { | 1247 | !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) { |
1248 | hw_seg_size += bv->bv_len; | 1248 | hw_seg_size += bv->bv_len; |
1249 | } else { | 1249 | } else { |
1250 | new_hw_segment: | 1250 | new_hw_segment: |
1251 | if (hw_seg_size > bio->bi_hw_front_size) | 1251 | if (hw_seg_size > bio->bi_hw_front_size) |
1252 | bio->bi_hw_front_size = hw_seg_size; | 1252 | bio->bi_hw_front_size = hw_seg_size; |
1253 | hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len; | 1253 | hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len; |
1254 | nr_hw_segs++; | 1254 | nr_hw_segs++; |
1255 | } | 1255 | } |
1256 | 1256 | ||
1257 | nr_phys_segs++; | 1257 | nr_phys_segs++; |
1258 | bvprv = bv; | 1258 | bvprv = bv; |
1259 | seg_size = bv->bv_len; | 1259 | seg_size = bv->bv_len; |
1260 | highprv = high; | 1260 | highprv = high; |
1261 | } | 1261 | } |
1262 | if (hw_seg_size > bio->bi_hw_back_size) | 1262 | if (hw_seg_size > bio->bi_hw_back_size) |
1263 | bio->bi_hw_back_size = hw_seg_size; | 1263 | bio->bi_hw_back_size = hw_seg_size; |
1264 | if (nr_hw_segs == 1 && hw_seg_size > bio->bi_hw_front_size) | 1264 | if (nr_hw_segs == 1 && hw_seg_size > bio->bi_hw_front_size) |
1265 | bio->bi_hw_front_size = hw_seg_size; | 1265 | bio->bi_hw_front_size = hw_seg_size; |
1266 | bio->bi_phys_segments = nr_phys_segs; | 1266 | bio->bi_phys_segments = nr_phys_segs; |
1267 | bio->bi_hw_segments = nr_hw_segs; | 1267 | bio->bi_hw_segments = nr_hw_segs; |
1268 | bio->bi_flags |= (1 << BIO_SEG_VALID); | 1268 | bio->bi_flags |= (1 << BIO_SEG_VALID); |
1269 | } | 1269 | } |
1270 | EXPORT_SYMBOL(blk_recount_segments); | 1270 | EXPORT_SYMBOL(blk_recount_segments); |
1271 | 1271 | ||
1272 | static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, | 1272 | static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, |
1273 | struct bio *nxt) | 1273 | struct bio *nxt) |
1274 | { | 1274 | { |
1275 | if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER))) | 1275 | if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER))) |
1276 | return 0; | 1276 | return 0; |
1277 | 1277 | ||
1278 | if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt))) | 1278 | if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt))) |
1279 | return 0; | 1279 | return 0; |
1280 | if (bio->bi_size + nxt->bi_size > q->max_segment_size) | 1280 | if (bio->bi_size + nxt->bi_size > q->max_segment_size) |
1281 | return 0; | 1281 | return 0; |
1282 | 1282 | ||
1283 | /* | 1283 | /* |
1284 | * bio and nxt are contiguous in memory, check if the queue allows | 1284 | * bio and nxt are contiguous in memory, check if the queue allows |
1285 | * these two to be merged into one | 1285 | * these two to be merged into one |
1286 | */ | 1286 | */ |
1287 | if (BIO_SEG_BOUNDARY(q, bio, nxt)) | 1287 | if (BIO_SEG_BOUNDARY(q, bio, nxt)) |
1288 | return 1; | 1288 | return 1; |
1289 | 1289 | ||
1290 | return 0; | 1290 | return 0; |
1291 | } | 1291 | } |
1292 | 1292 | ||
1293 | static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio, | 1293 | static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio, |
1294 | struct bio *nxt) | 1294 | struct bio *nxt) |
1295 | { | 1295 | { |
1296 | if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) | 1296 | if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) |
1297 | blk_recount_segments(q, bio); | 1297 | blk_recount_segments(q, bio); |
1298 | if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID))) | 1298 | if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID))) |
1299 | blk_recount_segments(q, nxt); | 1299 | blk_recount_segments(q, nxt); |
1300 | if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) || | 1300 | if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) || |
1301 | BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size)) | 1301 | BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size)) |
1302 | return 0; | 1302 | return 0; |
1303 | if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size) | 1303 | if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size) |
1304 | return 0; | 1304 | return 0; |
1305 | 1305 | ||
1306 | return 1; | 1306 | return 1; |
1307 | } | 1307 | } |
1308 | 1308 | ||
1309 | /* | 1309 | /* |
1310 | * map a request to scatterlist, return number of sg entries setup. Caller | 1310 | * map a request to scatterlist, return number of sg entries setup. Caller |
1311 | * must make sure sg can hold rq->nr_phys_segments entries | 1311 | * must make sure sg can hold rq->nr_phys_segments entries |
1312 | */ | 1312 | */ |
1313 | int blk_rq_map_sg(struct request_queue *q, struct request *rq, | 1313 | int blk_rq_map_sg(struct request_queue *q, struct request *rq, |
1314 | struct scatterlist *sg) | 1314 | struct scatterlist *sg) |
1315 | { | 1315 | { |
1316 | struct bio_vec *bvec, *bvprv; | 1316 | struct bio_vec *bvec, *bvprv; |
1317 | struct bio *bio; | 1317 | struct bio *bio; |
1318 | int nsegs, i, cluster; | 1318 | int nsegs, i, cluster; |
1319 | 1319 | ||
1320 | nsegs = 0; | 1320 | nsegs = 0; |
1321 | cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); | 1321 | cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); |
1322 | 1322 | ||
1323 | /* | 1323 | /* |
1324 | * for each bio in rq | 1324 | * for each bio in rq |
1325 | */ | 1325 | */ |
1326 | bvprv = NULL; | 1326 | bvprv = NULL; |
1327 | rq_for_each_bio(bio, rq) { | 1327 | rq_for_each_bio(bio, rq) { |
1328 | /* | 1328 | /* |
1329 | * for each segment in bio | 1329 | * for each segment in bio |
1330 | */ | 1330 | */ |
1331 | bio_for_each_segment(bvec, bio, i) { | 1331 | bio_for_each_segment(bvec, bio, i) { |
1332 | int nbytes = bvec->bv_len; | 1332 | int nbytes = bvec->bv_len; |
1333 | 1333 | ||
1334 | if (bvprv && cluster) { | 1334 | if (bvprv && cluster) { |
1335 | if (sg[nsegs - 1].length + nbytes > q->max_segment_size) | 1335 | if (sg[nsegs - 1].length + nbytes > q->max_segment_size) |
1336 | goto new_segment; | 1336 | goto new_segment; |
1337 | 1337 | ||
1338 | if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) | 1338 | if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) |
1339 | goto new_segment; | 1339 | goto new_segment; |
1340 | if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) | 1340 | if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) |
1341 | goto new_segment; | 1341 | goto new_segment; |
1342 | 1342 | ||
1343 | sg[nsegs - 1].length += nbytes; | 1343 | sg[nsegs - 1].length += nbytes; |
1344 | } else { | 1344 | } else { |
1345 | new_segment: | 1345 | new_segment: |
1346 | memset(&sg[nsegs],0,sizeof(struct scatterlist)); | 1346 | memset(&sg[nsegs],0,sizeof(struct scatterlist)); |
1347 | sg[nsegs].page = bvec->bv_page; | 1347 | sg[nsegs].page = bvec->bv_page; |
1348 | sg[nsegs].length = nbytes; | 1348 | sg[nsegs].length = nbytes; |
1349 | sg[nsegs].offset = bvec->bv_offset; | 1349 | sg[nsegs].offset = bvec->bv_offset; |
1350 | 1350 | ||
1351 | nsegs++; | 1351 | nsegs++; |
1352 | } | 1352 | } |
1353 | bvprv = bvec; | 1353 | bvprv = bvec; |
1354 | } /* segments in bio */ | 1354 | } /* segments in bio */ |
1355 | } /* bios in rq */ | 1355 | } /* bios in rq */ |
1356 | 1356 | ||
1357 | return nsegs; | 1357 | return nsegs; |
1358 | } | 1358 | } |
1359 | 1359 | ||
1360 | EXPORT_SYMBOL(blk_rq_map_sg); | 1360 | EXPORT_SYMBOL(blk_rq_map_sg); |
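A hedged sketch of the usual caller, assuming a hypothetical driver-owned scatterlist table: per the comment above, the table must be sized for rq->nr_phys_segments entries, and the returned count may be smaller once adjacent segments are clustered.

	#include <linux/blkdev.h>
	#include <linux/scatterlist.h>

	/* Illustrative only: build the scatter/gather table for a request. */
	static int mydrv_map_request(struct request_queue *q, struct request *rq,
				     struct scatterlist *sg_table)
	{
		int nsegs = blk_rq_map_sg(q, rq, sg_table);

		/* sg_table[0..nsegs-1] is now ready for dma_map_sg() by the driver */
		return nsegs;
	}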
1361 | 1361 | ||
1362 | /* | 1362 | /* |
1363 | * the standard queue merge functions, can be overridden with device | 1363 | * the standard queue merge functions, can be overridden with device |
1364 | * specific ones if so desired | 1364 | * specific ones if so desired |
1365 | */ | 1365 | */ |
1366 | 1366 | ||
1367 | static inline int ll_new_mergeable(struct request_queue *q, | 1367 | static inline int ll_new_mergeable(struct request_queue *q, |
1368 | struct request *req, | 1368 | struct request *req, |
1369 | struct bio *bio) | 1369 | struct bio *bio) |
1370 | { | 1370 | { |
1371 | int nr_phys_segs = bio_phys_segments(q, bio); | 1371 | int nr_phys_segs = bio_phys_segments(q, bio); |
1372 | 1372 | ||
1373 | if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { | 1373 | if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { |
1374 | req->cmd_flags |= REQ_NOMERGE; | 1374 | req->cmd_flags |= REQ_NOMERGE; |
1375 | if (req == q->last_merge) | 1375 | if (req == q->last_merge) |
1376 | q->last_merge = NULL; | 1376 | q->last_merge = NULL; |
1377 | return 0; | 1377 | return 0; |
1378 | } | 1378 | } |
1379 | 1379 | ||
1380 | /* | 1380 | /* |
1381 | * A hw segment is just getting larger, bump just the phys | 1381 | * A hw segment is just getting larger, bump just the phys |
1382 | * counter. | 1382 | * counter. |
1383 | */ | 1383 | */ |
1384 | req->nr_phys_segments += nr_phys_segs; | 1384 | req->nr_phys_segments += nr_phys_segs; |
1385 | return 1; | 1385 | return 1; |
1386 | } | 1386 | } |
1387 | 1387 | ||
1388 | static inline int ll_new_hw_segment(struct request_queue *q, | 1388 | static inline int ll_new_hw_segment(struct request_queue *q, |
1389 | struct request *req, | 1389 | struct request *req, |
1390 | struct bio *bio) | 1390 | struct bio *bio) |
1391 | { | 1391 | { |
1392 | int nr_hw_segs = bio_hw_segments(q, bio); | 1392 | int nr_hw_segs = bio_hw_segments(q, bio); |
1393 | int nr_phys_segs = bio_phys_segments(q, bio); | 1393 | int nr_phys_segs = bio_phys_segments(q, bio); |
1394 | 1394 | ||
1395 | if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments | 1395 | if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments |
1396 | || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { | 1396 | || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { |
1397 | req->cmd_flags |= REQ_NOMERGE; | 1397 | req->cmd_flags |= REQ_NOMERGE; |
1398 | if (req == q->last_merge) | 1398 | if (req == q->last_merge) |
1399 | q->last_merge = NULL; | 1399 | q->last_merge = NULL; |
1400 | return 0; | 1400 | return 0; |
1401 | } | 1401 | } |
1402 | 1402 | ||
1403 | /* | 1403 | /* |
1404 | * This will form the start of a new hw segment. Bump both | 1404 | * This will form the start of a new hw segment. Bump both |
1405 | * counters. | 1405 | * counters. |
1406 | */ | 1406 | */ |
1407 | req->nr_hw_segments += nr_hw_segs; | 1407 | req->nr_hw_segments += nr_hw_segs; |
1408 | req->nr_phys_segments += nr_phys_segs; | 1408 | req->nr_phys_segments += nr_phys_segs; |
1409 | return 1; | 1409 | return 1; |
1410 | } | 1410 | } |
1411 | 1411 | ||
1412 | int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) | 1412 | int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) |
1413 | { | 1413 | { |
1414 | unsigned short max_sectors; | 1414 | unsigned short max_sectors; |
1415 | int len; | 1415 | int len; |
1416 | 1416 | ||
1417 | if (unlikely(blk_pc_request(req))) | 1417 | if (unlikely(blk_pc_request(req))) |
1418 | max_sectors = q->max_hw_sectors; | 1418 | max_sectors = q->max_hw_sectors; |
1419 | else | 1419 | else |
1420 | max_sectors = q->max_sectors; | 1420 | max_sectors = q->max_sectors; |
1421 | 1421 | ||
1422 | if (req->nr_sectors + bio_sectors(bio) > max_sectors) { | 1422 | if (req->nr_sectors + bio_sectors(bio) > max_sectors) { |
1423 | req->cmd_flags |= REQ_NOMERGE; | 1423 | req->cmd_flags |= REQ_NOMERGE; |
1424 | if (req == q->last_merge) | 1424 | if (req == q->last_merge) |
1425 | q->last_merge = NULL; | 1425 | q->last_merge = NULL; |
1426 | return 0; | 1426 | return 0; |
1427 | } | 1427 | } |
1428 | if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID))) | 1428 | if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID))) |
1429 | blk_recount_segments(q, req->biotail); | 1429 | blk_recount_segments(q, req->biotail); |
1430 | if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) | 1430 | if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) |
1431 | blk_recount_segments(q, bio); | 1431 | blk_recount_segments(q, bio); |
1432 | len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size; | 1432 | len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size; |
1433 | if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) && | 1433 | if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) && |
1434 | !BIOVEC_VIRT_OVERSIZE(len)) { | 1434 | !BIOVEC_VIRT_OVERSIZE(len)) { |
1435 | int mergeable = ll_new_mergeable(q, req, bio); | 1435 | int mergeable = ll_new_mergeable(q, req, bio); |
1436 | 1436 | ||
1437 | if (mergeable) { | 1437 | if (mergeable) { |
1438 | if (req->nr_hw_segments == 1) | 1438 | if (req->nr_hw_segments == 1) |
1439 | req->bio->bi_hw_front_size = len; | 1439 | req->bio->bi_hw_front_size = len; |
1440 | if (bio->bi_hw_segments == 1) | 1440 | if (bio->bi_hw_segments == 1) |
1441 | bio->bi_hw_back_size = len; | 1441 | bio->bi_hw_back_size = len; |
1442 | } | 1442 | } |
1443 | return mergeable; | 1443 | return mergeable; |
1444 | } | 1444 | } |
1445 | 1445 | ||
1446 | return ll_new_hw_segment(q, req, bio); | 1446 | return ll_new_hw_segment(q, req, bio); |
1447 | } | 1447 | } |
1448 | EXPORT_SYMBOL(ll_back_merge_fn); | 1448 | EXPORT_SYMBOL(ll_back_merge_fn); |
1449 | 1449 | ||
1450 | static int ll_front_merge_fn(struct request_queue *q, struct request *req, | 1450 | static int ll_front_merge_fn(struct request_queue *q, struct request *req, |
1451 | struct bio *bio) | 1451 | struct bio *bio) |
1452 | { | 1452 | { |
1453 | unsigned short max_sectors; | 1453 | unsigned short max_sectors; |
1454 | int len; | 1454 | int len; |
1455 | 1455 | ||
1456 | if (unlikely(blk_pc_request(req))) | 1456 | if (unlikely(blk_pc_request(req))) |
1457 | max_sectors = q->max_hw_sectors; | 1457 | max_sectors = q->max_hw_sectors; |
1458 | else | 1458 | else |
1459 | max_sectors = q->max_sectors; | 1459 | max_sectors = q->max_sectors; |
1460 | 1460 | ||
1461 | 1461 | ||
1462 | if (req->nr_sectors + bio_sectors(bio) > max_sectors) { | 1462 | if (req->nr_sectors + bio_sectors(bio) > max_sectors) { |
1463 | req->cmd_flags |= REQ_NOMERGE; | 1463 | req->cmd_flags |= REQ_NOMERGE; |
1464 | if (req == q->last_merge) | 1464 | if (req == q->last_merge) |
1465 | q->last_merge = NULL; | 1465 | q->last_merge = NULL; |
1466 | return 0; | 1466 | return 0; |
1467 | } | 1467 | } |
1468 | len = bio->bi_hw_back_size + req->bio->bi_hw_front_size; | 1468 | len = bio->bi_hw_back_size + req->bio->bi_hw_front_size; |
1469 | if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) | 1469 | if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) |
1470 | blk_recount_segments(q, bio); | 1470 | blk_recount_segments(q, bio); |
1471 | if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID))) | 1471 | if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID))) |
1472 | blk_recount_segments(q, req->bio); | 1472 | blk_recount_segments(q, req->bio); |
1473 | if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) && | 1473 | if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) && |
1474 | !BIOVEC_VIRT_OVERSIZE(len)) { | 1474 | !BIOVEC_VIRT_OVERSIZE(len)) { |
1475 | int mergeable = ll_new_mergeable(q, req, bio); | 1475 | int mergeable = ll_new_mergeable(q, req, bio); |
1476 | 1476 | ||
1477 | if (mergeable) { | 1477 | if (mergeable) { |
1478 | if (bio->bi_hw_segments == 1) | 1478 | if (bio->bi_hw_segments == 1) |
1479 | bio->bi_hw_front_size = len; | 1479 | bio->bi_hw_front_size = len; |
1480 | if (req->nr_hw_segments == 1) | 1480 | if (req->nr_hw_segments == 1) |
1481 | req->biotail->bi_hw_back_size = len; | 1481 | req->biotail->bi_hw_back_size = len; |
1482 | } | 1482 | } |
1483 | return mergeable; | 1483 | return mergeable; |
1484 | } | 1484 | } |
1485 | 1485 | ||
1486 | return ll_new_hw_segment(q, req, bio); | 1486 | return ll_new_hw_segment(q, req, bio); |
1487 | } | 1487 | } |
1488 | 1488 | ||
1489 | static int ll_merge_requests_fn(struct request_queue *q, struct request *req, | 1489 | static int ll_merge_requests_fn(struct request_queue *q, struct request *req, |
1490 | struct request *next) | 1490 | struct request *next) |
1491 | { | 1491 | { |
1492 | int total_phys_segments; | 1492 | int total_phys_segments; |
1493 | int total_hw_segments; | 1493 | int total_hw_segments; |
1494 | 1494 | ||
1495 | /* | 1495 | /* |
1496 | * First check if either of the requests is a re-queued | 1496 | * First check if either of the requests is a re-queued |
1497 | * requests. Can't merge them if they are. | 1497 | * requests. Can't merge them if they are. |
1498 | */ | 1498 | */ |
1499 | if (req->special || next->special) | 1499 | if (req->special || next->special) |
1500 | return 0; | 1500 | return 0; |
1501 | 1501 | ||
1502 | /* | 1502 | /* |
1503 | * Will it become too large? | 1503 | * Will it become too large? |
1504 | */ | 1504 | */ |
1505 | if ((req->nr_sectors + next->nr_sectors) > q->max_sectors) | 1505 | if ((req->nr_sectors + next->nr_sectors) > q->max_sectors) |
1506 | return 0; | 1506 | return 0; |
1507 | 1507 | ||
1508 | total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; | 1508 | total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; |
1509 | if (blk_phys_contig_segment(q, req->biotail, next->bio)) | 1509 | if (blk_phys_contig_segment(q, req->biotail, next->bio)) |
1510 | total_phys_segments--; | 1510 | total_phys_segments--; |
1511 | 1511 | ||
1512 | if (total_phys_segments > q->max_phys_segments) | 1512 | if (total_phys_segments > q->max_phys_segments) |
1513 | return 0; | 1513 | return 0; |
1514 | 1514 | ||
1515 | total_hw_segments = req->nr_hw_segments + next->nr_hw_segments; | 1515 | total_hw_segments = req->nr_hw_segments + next->nr_hw_segments; |
1516 | if (blk_hw_contig_segment(q, req->biotail, next->bio)) { | 1516 | if (blk_hw_contig_segment(q, req->biotail, next->bio)) { |
1517 | int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size; | 1517 | int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size; |
1518 | /* | 1518 | /* |
1519 | * propagate the combined length to the end of the requests | 1519 | * propagate the combined length to the end of the requests |
1520 | */ | 1520 | */ |
1521 | if (req->nr_hw_segments == 1) | 1521 | if (req->nr_hw_segments == 1) |
1522 | req->bio->bi_hw_front_size = len; | 1522 | req->bio->bi_hw_front_size = len; |
1523 | if (next->nr_hw_segments == 1) | 1523 | if (next->nr_hw_segments == 1) |
1524 | next->biotail->bi_hw_back_size = len; | 1524 | next->biotail->bi_hw_back_size = len; |
1525 | total_hw_segments--; | 1525 | total_hw_segments--; |
1526 | } | 1526 | } |
1527 | 1527 | ||
1528 | if (total_hw_segments > q->max_hw_segments) | 1528 | if (total_hw_segments > q->max_hw_segments) |
1529 | return 0; | 1529 | return 0; |
1530 | 1530 | ||
1531 | /* Merge is OK... */ | 1531 | /* Merge is OK... */ |
1532 | req->nr_phys_segments = total_phys_segments; | 1532 | req->nr_phys_segments = total_phys_segments; |
1533 | req->nr_hw_segments = total_hw_segments; | 1533 | req->nr_hw_segments = total_hw_segments; |
1534 | return 1; | 1534 | return 1; |
1535 | } | 1535 | } |
1536 | 1536 | ||
1537 | /* | 1537 | /* |
1538 | * "plug" the device if there are no outstanding requests: this will | 1538 | * "plug" the device if there are no outstanding requests: this will |
1539 | * force the transfer to start only after we have put all the requests | 1539 | * force the transfer to start only after we have put all the requests |
1540 | * on the list. | 1540 | * on the list. |
1541 | * | 1541 | * |
1542 | * This is called with interrupts off and no requests on the queue and | 1542 | * This is called with interrupts off and no requests on the queue and |
1543 | * with the queue lock held. | 1543 | * with the queue lock held. |
1544 | */ | 1544 | */ |
1545 | void blk_plug_device(struct request_queue *q) | 1545 | void blk_plug_device(struct request_queue *q) |
1546 | { | 1546 | { |
1547 | WARN_ON(!irqs_disabled()); | 1547 | WARN_ON(!irqs_disabled()); |
1548 | 1548 | ||
1549 | /* | 1549 | /* |
1550 | * don't plug a stopped queue, it must be paired with blk_start_queue() | 1550 | * don't plug a stopped queue, it must be paired with blk_start_queue() |
1551 | * which will restart the queueing | 1551 | * which will restart the queueing |
1552 | */ | 1552 | */ |
1553 | if (blk_queue_stopped(q)) | 1553 | if (blk_queue_stopped(q)) |
1554 | return; | 1554 | return; |
1555 | 1555 | ||
1556 | if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) { | 1556 | if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) { |
1557 | mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); | 1557 | mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); |
1558 | blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); | 1558 | blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); |
1559 | } | 1559 | } |
1560 | } | 1560 | } |
1561 | 1561 | ||
1562 | EXPORT_SYMBOL(blk_plug_device); | 1562 | EXPORT_SYMBOL(blk_plug_device); |
1563 | 1563 | ||
1564 | /* | 1564 | /* |
1565 | * remove the queue from the plugged list, if present. called with | 1565 | * remove the queue from the plugged list, if present. called with |
1566 | * queue lock held and interrupts disabled. | 1566 | * queue lock held and interrupts disabled. |
1567 | */ | 1567 | */ |
1568 | int blk_remove_plug(struct request_queue *q) | 1568 | int blk_remove_plug(struct request_queue *q) |
1569 | { | 1569 | { |
1570 | WARN_ON(!irqs_disabled()); | 1570 | WARN_ON(!irqs_disabled()); |
1571 | 1571 | ||
1572 | if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) | 1572 | if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) |
1573 | return 0; | 1573 | return 0; |
1574 | 1574 | ||
1575 | del_timer(&q->unplug_timer); | 1575 | del_timer(&q->unplug_timer); |
1576 | return 1; | 1576 | return 1; |
1577 | } | 1577 | } |
1578 | 1578 | ||
1579 | EXPORT_SYMBOL(blk_remove_plug); | 1579 | EXPORT_SYMBOL(blk_remove_plug); |
1580 | 1580 | ||
1581 | /* | 1581 | /* |
1582 | * remove the plug and let it rip.. | 1582 | * remove the plug and let it rip.. |
1583 | */ | 1583 | */ |
1584 | void __generic_unplug_device(struct request_queue *q) | 1584 | void __generic_unplug_device(struct request_queue *q) |
1585 | { | 1585 | { |
1586 | if (unlikely(blk_queue_stopped(q))) | 1586 | if (unlikely(blk_queue_stopped(q))) |
1587 | return; | 1587 | return; |
1588 | 1588 | ||
1589 | if (!blk_remove_plug(q)) | 1589 | if (!blk_remove_plug(q)) |
1590 | return; | 1590 | return; |
1591 | 1591 | ||
1592 | q->request_fn(q); | 1592 | q->request_fn(q); |
1593 | } | 1593 | } |
1594 | EXPORT_SYMBOL(__generic_unplug_device); | 1594 | EXPORT_SYMBOL(__generic_unplug_device); |
1595 | 1595 | ||
1596 | /** | 1596 | /** |
1597 | * generic_unplug_device - fire a request queue | 1597 | * generic_unplug_device - fire a request queue |
1598 | * @q: The &struct request_queue in question | 1598 | * @q: The &struct request_queue in question |
1599 | * | 1599 | * |
1600 | * Description: | 1600 | * Description: |
1601 | * Linux uses plugging to build bigger request queues before letting | 1601 | * Linux uses plugging to build bigger request queues before letting |
1602 | * the device have at them. If a queue is plugged, the I/O scheduler | 1602 | * the device have at them. If a queue is plugged, the I/O scheduler |
1603 | * is still adding and merging requests on the queue. Once the queue | 1603 | * is still adding and merging requests on the queue. Once the queue |
1604 | * gets unplugged, the request_fn defined for the queue is invoked and | 1604 | * gets unplugged, the request_fn defined for the queue is invoked and |
1605 | * transfers started. | 1605 | * transfers started. |
1606 | **/ | 1606 | **/ |
1607 | void generic_unplug_device(struct request_queue *q) | 1607 | void generic_unplug_device(struct request_queue *q) |
1608 | { | 1608 | { |
1609 | spin_lock_irq(q->queue_lock); | 1609 | spin_lock_irq(q->queue_lock); |
1610 | __generic_unplug_device(q); | 1610 | __generic_unplug_device(q); |
1611 | spin_unlock_irq(q->queue_lock); | 1611 | spin_unlock_irq(q->queue_lock); |
1612 | } | 1612 | } |
1613 | EXPORT_SYMBOL(generic_unplug_device); | 1613 | EXPORT_SYMBOL(generic_unplug_device); |
1614 | 1614 | ||
1615 | static void blk_backing_dev_unplug(struct backing_dev_info *bdi, | 1615 | static void blk_backing_dev_unplug(struct backing_dev_info *bdi, |
1616 | struct page *page) | 1616 | struct page *page) |
1617 | { | 1617 | { |
1618 | struct request_queue *q = bdi->unplug_io_data; | 1618 | struct request_queue *q = bdi->unplug_io_data; |
1619 | 1619 | ||
1620 | /* | 1620 | /* |
1621 | * devices don't necessarily have an ->unplug_fn defined | 1621 | * devices don't necessarily have an ->unplug_fn defined |
1622 | */ | 1622 | */ |
1623 | if (q->unplug_fn) { | 1623 | if (q->unplug_fn) { |
1624 | blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, | 1624 | blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, |
1625 | q->rq.count[READ] + q->rq.count[WRITE]); | 1625 | q->rq.count[READ] + q->rq.count[WRITE]); |
1626 | 1626 | ||
1627 | q->unplug_fn(q); | 1627 | q->unplug_fn(q); |
1628 | } | 1628 | } |
1629 | } | 1629 | } |
1630 | 1630 | ||
1631 | static void blk_unplug_work(struct work_struct *work) | 1631 | static void blk_unplug_work(struct work_struct *work) |
1632 | { | 1632 | { |
1633 | struct request_queue *q = | 1633 | struct request_queue *q = |
1634 | container_of(work, struct request_queue, unplug_work); | 1634 | container_of(work, struct request_queue, unplug_work); |
1635 | 1635 | ||
1636 | blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, | 1636 | blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, |
1637 | q->rq.count[READ] + q->rq.count[WRITE]); | 1637 | q->rq.count[READ] + q->rq.count[WRITE]); |
1638 | 1638 | ||
1639 | q->unplug_fn(q); | 1639 | q->unplug_fn(q); |
1640 | } | 1640 | } |
1641 | 1641 | ||
1642 | static void blk_unplug_timeout(unsigned long data) | 1642 | static void blk_unplug_timeout(unsigned long data) |
1643 | { | 1643 | { |
1644 | struct request_queue *q = (struct request_queue *)data; | 1644 | struct request_queue *q = (struct request_queue *)data; |
1645 | 1645 | ||
1646 | blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL, | 1646 | blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL, |
1647 | q->rq.count[READ] + q->rq.count[WRITE]); | 1647 | q->rq.count[READ] + q->rq.count[WRITE]); |
1648 | 1648 | ||
1649 | kblockd_schedule_work(&q->unplug_work); | 1649 | kblockd_schedule_work(&q->unplug_work); |
1650 | } | 1650 | } |
1651 | 1651 | ||
1652 | /** | 1652 | /** |
1653 | * blk_start_queue - restart a previously stopped queue | 1653 | * blk_start_queue - restart a previously stopped queue |
1654 | * @q: The &struct request_queue in question | 1654 | * @q: The &struct request_queue in question |
1655 | * | 1655 | * |
1656 | * Description: | 1656 | * Description: |
1657 | * blk_start_queue() will clear the stop flag on the queue, and call | 1657 | * blk_start_queue() will clear the stop flag on the queue, and call |
1658 | * the request_fn for the queue if it was in a stopped state when | 1658 | * the request_fn for the queue if it was in a stopped state when |
1659 | * entered. Also see blk_stop_queue(). Queue lock must be held. | 1659 | * entered. Also see blk_stop_queue(). Queue lock must be held. |
1660 | **/ | 1660 | **/ |
1661 | void blk_start_queue(struct request_queue *q) | 1661 | void blk_start_queue(struct request_queue *q) |
1662 | { | 1662 | { |
1663 | WARN_ON(!irqs_disabled()); | 1663 | WARN_ON(!irqs_disabled()); |
1664 | 1664 | ||
1665 | clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); | 1665 | clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); |
1666 | 1666 | ||
1667 | /* | 1667 | /* |
1668 | * one level of recursion is ok and is much faster than kicking | 1668 | * one level of recursion is ok and is much faster than kicking |
1669 | * the unplug handling | 1669 | * the unplug handling |
1670 | */ | 1670 | */ |
1671 | if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { | 1671 | if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { |
1672 | q->request_fn(q); | 1672 | q->request_fn(q); |
1673 | clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); | 1673 | clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); |
1674 | } else { | 1674 | } else { |
1675 | blk_plug_device(q); | 1675 | blk_plug_device(q); |
1676 | kblockd_schedule_work(&q->unplug_work); | 1676 | kblockd_schedule_work(&q->unplug_work); |
1677 | } | 1677 | } |
1678 | } | 1678 | } |
1679 | 1679 | ||
1680 | EXPORT_SYMBOL(blk_start_queue); | 1680 | EXPORT_SYMBOL(blk_start_queue); |
1681 | 1681 | ||
1682 | /** | 1682 | /** |
1683 | * blk_stop_queue - stop a queue | 1683 | * blk_stop_queue - stop a queue |
1684 | * @q: The &struct request_queue in question | 1684 | * @q: The &struct request_queue in question |
1685 | * | 1685 | * |
1686 | * Description: | 1686 | * Description: |
1687 | * The Linux block layer assumes that a block driver will consume all | 1687 | * The Linux block layer assumes that a block driver will consume all |
1688 | * entries on the request queue when the request_fn strategy is called. | 1688 | * entries on the request queue when the request_fn strategy is called. |
1689 | * Often this will not happen, because of hardware limitations (queue | 1689 | * Often this will not happen, because of hardware limitations (queue |
1690 | * depth settings). If a device driver gets a 'queue full' response, | 1690 | * depth settings). If a device driver gets a 'queue full' response, |
1691 | * or if it simply chooses not to queue more I/O at one point, it can | 1691 | * or if it simply chooses not to queue more I/O at one point, it can |
1692 | * call this function to prevent the request_fn from being called until | 1692 | * call this function to prevent the request_fn from being called until |
1693 | * the driver has signalled it's ready to go again. This happens by calling | 1693 | * the driver has signalled it's ready to go again. This happens by calling |
1694 | * blk_start_queue() to restart queue operations. Queue lock must be held. | 1694 | * blk_start_queue() to restart queue operations. Queue lock must be held. |
1695 | **/ | 1695 | **/ |
1696 | void blk_stop_queue(struct request_queue *q) | 1696 | void blk_stop_queue(struct request_queue *q) |
1697 | { | 1697 | { |
1698 | blk_remove_plug(q); | 1698 | blk_remove_plug(q); |
1699 | set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); | 1699 | set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); |
1700 | } | 1700 | } |
1701 | EXPORT_SYMBOL(blk_stop_queue); | 1701 | EXPORT_SYMBOL(blk_stop_queue); |
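The stop/start pair above is normally used from a driver whose request_fn runs out of hardware slots: it calls blk_stop_queue() while still holding the queue lock, and its completion interrupt later calls blk_start_queue() under the same lock with interrupts disabled. A minimal sketch of that pattern, assuming a hypothetical my_dev structure and my_hw_issue() helper (neither is part of this patch or of the block layer API):

    #include <linux/blkdev.h>
    #include <linux/interrupt.h>
    #include <linux/spinlock.h>

    struct my_dev {                         /* hypothetical driver state */
            struct request_queue *queue;
            struct gendisk *disk;
            spinlock_t lock;                /* also used as the queue lock */
            int hw_slots_free;              /* commands the hardware can still accept */
    };

    void my_hw_issue(struct my_dev *dev, struct request *rq);  /* hypothetical */

    /* request_fn: entered with dev->lock held and interrupts disabled */
    static void my_request_fn(struct request_queue *q)
    {
            struct my_dev *dev = q->queuedata;
            struct request *rq;

            while ((rq = elv_next_request(q)) != NULL) {
                    if (!dev->hw_slots_free) {
                            /* 'queue full': stop dispatch until the IRQ below */
                            blk_stop_queue(q);
                            break;
                    }
                    blkdev_dequeue_request(rq);
                    dev->hw_slots_free--;
                    my_hw_issue(dev, rq);   /* hand the request to the hardware */
            }
    }

    /* completion interrupt: a slot freed up, let the queue run again */
    static irqreturn_t my_irq(int irq, void *data)
    {
            struct my_dev *dev = data;
            unsigned long flags;

            /* (completion of the finished request omitted for brevity) */
            spin_lock_irqsave(&dev->lock, flags);
            dev->hw_slots_free++;
            blk_start_queue(dev->queue);    /* lock held, irqs off, as required */
            spin_unlock_irqrestore(&dev->lock, flags);
            return IRQ_HANDLED;
    }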
1702 | 1702 | ||
1703 | /** | 1703 | /** |
1704 | * blk_sync_queue - cancel any pending callbacks on a queue | 1704 | * blk_sync_queue - cancel any pending callbacks on a queue |
1705 | * @q: the queue | 1705 | * @q: the queue |
1706 | * | 1706 | * |
1707 | * Description: | 1707 | * Description: |
1708 | * The block layer may perform asynchronous callback activity | 1708 | * The block layer may perform asynchronous callback activity |
1709 | * on a queue, such as calling the unplug function after a timeout. | 1709 | * on a queue, such as calling the unplug function after a timeout. |
1710 | * A block device may call blk_sync_queue to ensure that any | 1710 | * A block device may call blk_sync_queue to ensure that any |
1711 | * such activity is cancelled, thus allowing it to release resources | 1711 | * such activity is cancelled, thus allowing it to release resources |
1712 | * that the callbacks might use. The caller must already have made sure | 1712 | * that the callbacks might use. The caller must already have made sure |
1713 | * that its ->make_request_fn will not re-add plugging prior to calling | 1713 | * that its ->make_request_fn will not re-add plugging prior to calling |
1714 | * this function. | 1714 | * this function. |
1715 | * | 1715 | * |
1716 | */ | 1716 | */ |
1717 | void blk_sync_queue(struct request_queue *q) | 1717 | void blk_sync_queue(struct request_queue *q) |
1718 | { | 1718 | { |
1719 | del_timer_sync(&q->unplug_timer); | 1719 | del_timer_sync(&q->unplug_timer); |
1720 | } | 1720 | } |
1721 | EXPORT_SYMBOL(blk_sync_queue); | 1721 | EXPORT_SYMBOL(blk_sync_queue); |
1722 | 1722 | ||
1723 | /** | 1723 | /** |
1724 | * blk_run_queue - run a single device queue | 1724 | * blk_run_queue - run a single device queue |
1725 | * @q: The queue to run | 1725 | * @q: The queue to run |
1726 | */ | 1726 | */ |
1727 | void blk_run_queue(struct request_queue *q) | 1727 | void blk_run_queue(struct request_queue *q) |
1728 | { | 1728 | { |
1729 | unsigned long flags; | 1729 | unsigned long flags; |
1730 | 1730 | ||
1731 | spin_lock_irqsave(q->queue_lock, flags); | 1731 | spin_lock_irqsave(q->queue_lock, flags); |
1732 | blk_remove_plug(q); | 1732 | blk_remove_plug(q); |
1733 | 1733 | ||
1734 | /* | 1734 | /* |
1735 | * Only recurse once to avoid overrunning the stack; let the unplug | 1735 | * Only recurse once to avoid overrunning the stack; let the unplug |
1736 | * handling reinvoke the handler shortly if we already got there. | 1736 | * handling reinvoke the handler shortly if we already got there. |
1737 | */ | 1737 | */ |
1738 | if (!elv_queue_empty(q)) { | 1738 | if (!elv_queue_empty(q)) { |
1739 | if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { | 1739 | if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { |
1740 | q->request_fn(q); | 1740 | q->request_fn(q); |
1741 | clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); | 1741 | clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); |
1742 | } else { | 1742 | } else { |
1743 | blk_plug_device(q); | 1743 | blk_plug_device(q); |
1744 | kblockd_schedule_work(&q->unplug_work); | 1744 | kblockd_schedule_work(&q->unplug_work); |
1745 | } | 1745 | } |
1746 | } | 1746 | } |
1747 | 1747 | ||
1748 | spin_unlock_irqrestore(q->queue_lock, flags); | 1748 | spin_unlock_irqrestore(q->queue_lock, flags); |
1749 | } | 1749 | } |
1750 | EXPORT_SYMBOL(blk_run_queue); | 1750 | EXPORT_SYMBOL(blk_run_queue); |
1751 | 1751 | ||
1752 | /** | 1752 | /** |
1753 | * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed | 1753 | * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed |
1754 | * @kobj: the kobj belonging to the request queue to be released | 1754 | * @kobj: the kobj belonging to the request queue to be released |
1755 | * | 1755 | * |
1756 | * Description: | 1756 | * Description: |
1757 | * blk_cleanup_queue is the pair to blk_init_queue() or | 1757 | * blk_cleanup_queue is the pair to blk_init_queue() or |
1758 | * blk_queue_make_request(). It should be called when a request queue is | 1758 | * blk_queue_make_request(). It should be called when a request queue is |
1759 | * being released; typically when a block device is being de-registered. | 1759 | * being released; typically when a block device is being de-registered. |
1760 | * Currently, its primary task is to free all the &struct request | 1760 | * Currently, its primary task is to free all the &struct request |
1761 | * structures that were allocated to the queue and the queue itself. | 1761 | * structures that were allocated to the queue and the queue itself. |
1762 | * | 1762 | * |
1763 | * Caveat: | 1763 | * Caveat: |
1764 | * Hopefully the low level driver will have finished any | 1764 | * Hopefully the low level driver will have finished any |
1765 | * outstanding requests first... | 1765 | * outstanding requests first... |
1766 | **/ | 1766 | **/ |
1767 | static void blk_release_queue(struct kobject *kobj) | 1767 | static void blk_release_queue(struct kobject *kobj) |
1768 | { | 1768 | { |
1769 | struct request_queue *q = | 1769 | struct request_queue *q = |
1770 | container_of(kobj, struct request_queue, kobj); | 1770 | container_of(kobj, struct request_queue, kobj); |
1771 | struct request_list *rl = &q->rq; | 1771 | struct request_list *rl = &q->rq; |
1772 | 1772 | ||
1773 | blk_sync_queue(q); | 1773 | blk_sync_queue(q); |
1774 | 1774 | ||
1775 | if (rl->rq_pool) | 1775 | if (rl->rq_pool) |
1776 | mempool_destroy(rl->rq_pool); | 1776 | mempool_destroy(rl->rq_pool); |
1777 | 1777 | ||
1778 | if (q->queue_tags) | 1778 | if (q->queue_tags) |
1779 | __blk_queue_free_tags(q); | 1779 | __blk_queue_free_tags(q); |
1780 | 1780 | ||
1781 | blk_trace_shutdown(q); | 1781 | blk_trace_shutdown(q); |
1782 | 1782 | ||
1783 | kmem_cache_free(requestq_cachep, q); | 1783 | kmem_cache_free(requestq_cachep, q); |
1784 | } | 1784 | } |
1785 | 1785 | ||
1786 | void blk_put_queue(struct request_queue *q) | 1786 | void blk_put_queue(struct request_queue *q) |
1787 | { | 1787 | { |
1788 | kobject_put(&q->kobj); | 1788 | kobject_put(&q->kobj); |
1789 | } | 1789 | } |
1790 | EXPORT_SYMBOL(blk_put_queue); | 1790 | EXPORT_SYMBOL(blk_put_queue); |
1791 | 1791 | ||
1792 | void blk_cleanup_queue(struct request_queue * q) | 1792 | void blk_cleanup_queue(struct request_queue * q) |
1793 | { | 1793 | { |
1794 | mutex_lock(&q->sysfs_lock); | 1794 | mutex_lock(&q->sysfs_lock); |
1795 | set_bit(QUEUE_FLAG_DEAD, &q->queue_flags); | 1795 | set_bit(QUEUE_FLAG_DEAD, &q->queue_flags); |
1796 | mutex_unlock(&q->sysfs_lock); | 1796 | mutex_unlock(&q->sysfs_lock); |
1797 | 1797 | ||
1798 | if (q->elevator) | 1798 | if (q->elevator) |
1799 | elevator_exit(q->elevator); | 1799 | elevator_exit(q->elevator); |
1800 | 1800 | ||
1801 | blk_put_queue(q); | 1801 | blk_put_queue(q); |
1802 | } | 1802 | } |
1803 | 1803 | ||
1804 | EXPORT_SYMBOL(blk_cleanup_queue); | 1804 | EXPORT_SYMBOL(blk_cleanup_queue); |
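On the teardown side, blk_cleanup_queue() is the counterpart of blk_init_queue() below: the driver quiesces the hardware, removes the gendisk so no new I/O arrives, and only then drops the queue; blk_sync_queue() and blk_release_queue() then run from the final kobject reference drop. A sketch of that ordering, continuing the hypothetical my_dev from the earlier fragment (my_hw_quiesce() is likewise an assumed helper):

    static void my_remove(struct my_dev *dev)
    {
            my_hw_quiesce(dev);             /* hypothetical: wait for outstanding I/O */
            del_gendisk(dev->disk);         /* no new requests after this */
            blk_cleanup_queue(dev->queue);  /* pairs with blk_init_queue() */
            put_disk(dev->disk);
            kfree(dev);
    }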
1805 | 1805 | ||
1806 | static int blk_init_free_list(struct request_queue *q) | 1806 | static int blk_init_free_list(struct request_queue *q) |
1807 | { | 1807 | { |
1808 | struct request_list *rl = &q->rq; | 1808 | struct request_list *rl = &q->rq; |
1809 | 1809 | ||
1810 | rl->count[READ] = rl->count[WRITE] = 0; | 1810 | rl->count[READ] = rl->count[WRITE] = 0; |
1811 | rl->starved[READ] = rl->starved[WRITE] = 0; | 1811 | rl->starved[READ] = rl->starved[WRITE] = 0; |
1812 | rl->elvpriv = 0; | 1812 | rl->elvpriv = 0; |
1813 | init_waitqueue_head(&rl->wait[READ]); | 1813 | init_waitqueue_head(&rl->wait[READ]); |
1814 | init_waitqueue_head(&rl->wait[WRITE]); | 1814 | init_waitqueue_head(&rl->wait[WRITE]); |
1815 | 1815 | ||
1816 | rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, | 1816 | rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, |
1817 | mempool_free_slab, request_cachep, q->node); | 1817 | mempool_free_slab, request_cachep, q->node); |
1818 | 1818 | ||
1819 | if (!rl->rq_pool) | 1819 | if (!rl->rq_pool) |
1820 | return -ENOMEM; | 1820 | return -ENOMEM; |
1821 | 1821 | ||
1822 | return 0; | 1822 | return 0; |
1823 | } | 1823 | } |
1824 | 1824 | ||
1825 | struct request_queue *blk_alloc_queue(gfp_t gfp_mask) | 1825 | struct request_queue *blk_alloc_queue(gfp_t gfp_mask) |
1826 | { | 1826 | { |
1827 | return blk_alloc_queue_node(gfp_mask, -1); | 1827 | return blk_alloc_queue_node(gfp_mask, -1); |
1828 | } | 1828 | } |
1829 | EXPORT_SYMBOL(blk_alloc_queue); | 1829 | EXPORT_SYMBOL(blk_alloc_queue); |
1830 | 1830 | ||
1831 | static struct kobj_type queue_ktype; | 1831 | static struct kobj_type queue_ktype; |
1832 | 1832 | ||
1833 | struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) | 1833 | struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) |
1834 | { | 1834 | { |
1835 | struct request_queue *q; | 1835 | struct request_queue *q; |
1836 | 1836 | ||
1837 | q = kmem_cache_alloc_node(requestq_cachep, | 1837 | q = kmem_cache_alloc_node(requestq_cachep, |
1838 | gfp_mask | __GFP_ZERO, node_id); | 1838 | gfp_mask | __GFP_ZERO, node_id); |
1839 | if (!q) | 1839 | if (!q) |
1840 | return NULL; | 1840 | return NULL; |
1841 | 1841 | ||
1842 | init_timer(&q->unplug_timer); | 1842 | init_timer(&q->unplug_timer); |
1843 | 1843 | ||
1844 | snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue"); | 1844 | snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue"); |
1845 | q->kobj.ktype = &queue_ktype; | 1845 | q->kobj.ktype = &queue_ktype; |
1846 | kobject_init(&q->kobj); | 1846 | kobject_init(&q->kobj); |
1847 | 1847 | ||
1848 | q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; | 1848 | q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; |
1849 | q->backing_dev_info.unplug_io_data = q; | 1849 | q->backing_dev_info.unplug_io_data = q; |
1850 | 1850 | ||
1851 | mutex_init(&q->sysfs_lock); | 1851 | mutex_init(&q->sysfs_lock); |
1852 | 1852 | ||
1853 | return q; | 1853 | return q; |
1854 | } | 1854 | } |
1855 | EXPORT_SYMBOL(blk_alloc_queue_node); | 1855 | EXPORT_SYMBOL(blk_alloc_queue_node); |
1856 | 1856 | ||
1857 | /** | 1857 | /** |
1858 | * blk_init_queue - prepare a request queue for use with a block device | 1858 | * blk_init_queue - prepare a request queue for use with a block device |
1859 | * @rfn: The function to be called to process requests that have been | 1859 | * @rfn: The function to be called to process requests that have been |
1860 | * placed on the queue. | 1860 | * placed on the queue. |
1861 | * @lock: Request queue spin lock | 1861 | * @lock: Request queue spin lock |
1862 | * | 1862 | * |
1863 | * Description: | 1863 | * Description: |
1864 | * If a block device wishes to use the standard request handling procedures, | 1864 | * If a block device wishes to use the standard request handling procedures, |
1865 | * which sorts requests and coalesces adjacent requests, then it must | 1865 | * which sorts requests and coalesces adjacent requests, then it must |
1866 | * call blk_init_queue(). The function @rfn will be called when there | 1866 | * call blk_init_queue(). The function @rfn will be called when there |
1867 | * are requests on the queue that need to be processed. If the device | 1867 | * are requests on the queue that need to be processed. If the device |
1868 | * supports plugging, then @rfn may not be called immediately when requests | 1868 | * supports plugging, then @rfn may not be called immediately when requests |
1869 | * are available on the queue, but may be called at some time later instead. | 1869 | * are available on the queue, but may be called at some time later instead. |
1870 | * Plugged queues are generally unplugged when a buffer belonging to one | 1870 | * Plugged queues are generally unplugged when a buffer belonging to one |
1871 | * of the requests on the queue is needed, or due to memory pressure. | 1871 | * of the requests on the queue is needed, or due to memory pressure. |
1872 | * | 1872 | * |
1873 | * @rfn is not required, or even expected, to remove all requests off the | 1873 | * @rfn is not required, or even expected, to remove all requests off the |
1874 | * queue, but only as many as it can handle at a time. If it does leave | 1874 | * queue, but only as many as it can handle at a time. If it does leave |
1875 | * requests on the queue, it is responsible for arranging that the requests | 1875 | * requests on the queue, it is responsible for arranging that the requests |
1876 | * get dealt with eventually. | 1876 | * get dealt with eventually. |
1877 | * | 1877 | * |
1878 | * The queue spin lock must be held while manipulating the requests on the | 1878 | * The queue spin lock must be held while manipulating the requests on the |
1879 | * request queue; this lock will also be taken from interrupt context, so irq | 1879 | * request queue; this lock will also be taken from interrupt context, so irq |
1880 | * disabling is needed for it. | 1880 | * disabling is needed for it. |
1881 | * | 1881 | * |
1882 | * Function returns a pointer to the initialized request queue, or NULL if | 1882 | * Function returns a pointer to the initialized request queue, or NULL if |
1883 | * it didn't succeed. | 1883 | * it didn't succeed. |
1884 | * | 1884 | * |
1885 | * Note: | 1885 | * Note: |
1886 | * blk_init_queue() must be paired with a blk_cleanup_queue() call | 1886 | * blk_init_queue() must be paired with a blk_cleanup_queue() call |
1887 | * when the block device is deactivated (such as at module unload). | 1887 | * when the block device is deactivated (such as at module unload). |
1888 | **/ | 1888 | **/ |
1889 | 1889 | ||
1890 | struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) | 1890 | struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) |
1891 | { | 1891 | { |
1892 | return blk_init_queue_node(rfn, lock, -1); | 1892 | return blk_init_queue_node(rfn, lock, -1); |
1893 | } | 1893 | } |
1894 | EXPORT_SYMBOL(blk_init_queue); | 1894 | EXPORT_SYMBOL(blk_init_queue); |
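The matching initialization side, as the description above lays out, passes the driver's own spinlock so that the request_fn and the interrupt handler share one queue lock, and checks for the NULL return. A sketch using the hypothetical my_dev/my_request_fn from the earlier fragments (alloc_disk(1) and the omitted gendisk fields are illustrative, not prescriptive):

    static int my_probe(struct my_dev *dev)
    {
            spin_lock_init(&dev->lock);

            dev->queue = blk_init_queue(my_request_fn, &dev->lock);
            if (!dev->queue)                /* NULL on allocation/elevator failure */
                    return -ENOMEM;
            dev->queue->queuedata = dev;

            dev->disk = alloc_disk(1);
            if (!dev->disk) {
                    blk_cleanup_queue(dev->queue);
                    return -ENOMEM;
            }
            dev->disk->queue = dev->queue;
            /* ... major, first_minor, fops, disk name, capacity ... */
            add_disk(dev->disk);
            return 0;
    }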
1895 | 1895 | ||
1896 | struct request_queue * | 1896 | struct request_queue * |
1897 | blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) | 1897 | blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) |
1898 | { | 1898 | { |
1899 | struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); | 1899 | struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); |
1900 | 1900 | ||
1901 | if (!q) | 1901 | if (!q) |
1902 | return NULL; | 1902 | return NULL; |
1903 | 1903 | ||
1904 | q->node = node_id; | 1904 | q->node = node_id; |
1905 | if (blk_init_free_list(q)) { | 1905 | if (blk_init_free_list(q)) { |
1906 | kmem_cache_free(requestq_cachep, q); | 1906 | kmem_cache_free(requestq_cachep, q); |
1907 | return NULL; | 1907 | return NULL; |
1908 | } | 1908 | } |
1909 | 1909 | ||
1910 | /* | 1910 | /* |
1911 | * if caller didn't supply a lock, they get per-queue locking with | 1911 | * if caller didn't supply a lock, they get per-queue locking with |
1912 | * our embedded lock | 1912 | * our embedded lock |
1913 | */ | 1913 | */ |
1914 | if (!lock) { | 1914 | if (!lock) { |
1915 | spin_lock_init(&q->__queue_lock); | 1915 | spin_lock_init(&q->__queue_lock); |
1916 | lock = &q->__queue_lock; | 1916 | lock = &q->__queue_lock; |
1917 | } | 1917 | } |
1918 | 1918 | ||
1919 | q->request_fn = rfn; | 1919 | q->request_fn = rfn; |
1920 | q->prep_rq_fn = NULL; | 1920 | q->prep_rq_fn = NULL; |
1921 | q->unplug_fn = generic_unplug_device; | 1921 | q->unplug_fn = generic_unplug_device; |
1922 | q->queue_flags = (1 << QUEUE_FLAG_CLUSTER); | 1922 | q->queue_flags = (1 << QUEUE_FLAG_CLUSTER); |
1923 | q->queue_lock = lock; | 1923 | q->queue_lock = lock; |
1924 | 1924 | ||
1925 | blk_queue_segment_boundary(q, 0xffffffff); | 1925 | blk_queue_segment_boundary(q, 0xffffffff); |
1926 | 1926 | ||
1927 | blk_queue_make_request(q, __make_request); | 1927 | blk_queue_make_request(q, __make_request); |
1928 | blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); | 1928 | blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); |
1929 | 1929 | ||
1930 | blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); | 1930 | blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); |
1931 | blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); | 1931 | blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); |
1932 | 1932 | ||
1933 | q->sg_reserved_size = INT_MAX; | 1933 | q->sg_reserved_size = INT_MAX; |
1934 | 1934 | ||
1935 | /* | 1935 | /* |
1936 | * all done | 1936 | * all done |
1937 | */ | 1937 | */ |
1938 | if (!elevator_init(q, NULL)) { | 1938 | if (!elevator_init(q, NULL)) { |
1939 | blk_queue_congestion_threshold(q); | 1939 | blk_queue_congestion_threshold(q); |
1940 | return q; | 1940 | return q; |
1941 | } | 1941 | } |
1942 | 1942 | ||
1943 | blk_put_queue(q); | 1943 | blk_put_queue(q); |
1944 | return NULL; | 1944 | return NULL; |
1945 | } | 1945 | } |
1946 | EXPORT_SYMBOL(blk_init_queue_node); | 1946 | EXPORT_SYMBOL(blk_init_queue_node); |
1947 | 1947 | ||
1948 | int blk_get_queue(struct request_queue *q) | 1948 | int blk_get_queue(struct request_queue *q) |
1949 | { | 1949 | { |
1950 | if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { | 1950 | if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { |
1951 | kobject_get(&q->kobj); | 1951 | kobject_get(&q->kobj); |
1952 | return 0; | 1952 | return 0; |
1953 | } | 1953 | } |
1954 | 1954 | ||
1955 | return 1; | 1955 | return 1; |
1956 | } | 1956 | } |
1957 | 1957 | ||
1958 | EXPORT_SYMBOL(blk_get_queue); | 1958 | EXPORT_SYMBOL(blk_get_queue); |
1959 | 1959 | ||
1960 | static inline void blk_free_request(struct request_queue *q, struct request *rq) | 1960 | static inline void blk_free_request(struct request_queue *q, struct request *rq) |
1961 | { | 1961 | { |
1962 | if (rq->cmd_flags & REQ_ELVPRIV) | 1962 | if (rq->cmd_flags & REQ_ELVPRIV) |
1963 | elv_put_request(q, rq); | 1963 | elv_put_request(q, rq); |
1964 | mempool_free(rq, q->rq.rq_pool); | 1964 | mempool_free(rq, q->rq.rq_pool); |
1965 | } | 1965 | } |
1966 | 1966 | ||
1967 | static struct request * | 1967 | static struct request * |
1968 | blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) | 1968 | blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) |
1969 | { | 1969 | { |
1970 | struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); | 1970 | struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); |
1971 | 1971 | ||
1972 | if (!rq) | 1972 | if (!rq) |
1973 | return NULL; | 1973 | return NULL; |
1974 | 1974 | ||
1975 | /* | 1975 | /* |
1976 | * first three bits are identical in rq->cmd_flags and bio->bi_rw, | 1976 | * first three bits are identical in rq->cmd_flags and bio->bi_rw, |
1977 | * see bio.h and blkdev.h | 1977 | * see bio.h and blkdev.h |
1978 | */ | 1978 | */ |
1979 | rq->cmd_flags = rw | REQ_ALLOCED; | 1979 | rq->cmd_flags = rw | REQ_ALLOCED; |
1980 | 1980 | ||
1981 | if (priv) { | 1981 | if (priv) { |
1982 | if (unlikely(elv_set_request(q, rq, gfp_mask))) { | 1982 | if (unlikely(elv_set_request(q, rq, gfp_mask))) { |
1983 | mempool_free(rq, q->rq.rq_pool); | 1983 | mempool_free(rq, q->rq.rq_pool); |
1984 | return NULL; | 1984 | return NULL; |
1985 | } | 1985 | } |
1986 | rq->cmd_flags |= REQ_ELVPRIV; | 1986 | rq->cmd_flags |= REQ_ELVPRIV; |
1987 | } | 1987 | } |
1988 | 1988 | ||
1989 | return rq; | 1989 | return rq; |
1990 | } | 1990 | } |
1991 | 1991 | ||
1992 | /* | 1992 | /* |
1993 | * ioc_batching returns true if the ioc is a valid batching context and | 1993 | * ioc_batching returns true if the ioc is a valid batching context and |
1994 | * should be given priority access to a request. | 1994 | * should be given priority access to a request. |
1995 | */ | 1995 | */ |
1996 | static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) | 1996 | static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) |
1997 | { | 1997 | { |
1998 | if (!ioc) | 1998 | if (!ioc) |
1999 | return 0; | 1999 | return 0; |
2000 | 2000 | ||
2001 | /* | 2001 | /* |
2002 | * Make sure the process is able to allocate at least 1 request | 2002 | * Make sure the process is able to allocate at least 1 request |
2003 | * even if the batch times out, otherwise we could theoretically | 2003 | * even if the batch times out, otherwise we could theoretically |
2004 | * lose wakeups. | 2004 | * lose wakeups. |
2005 | */ | 2005 | */ |
2006 | return ioc->nr_batch_requests == q->nr_batching || | 2006 | return ioc->nr_batch_requests == q->nr_batching || |
2007 | (ioc->nr_batch_requests > 0 | 2007 | (ioc->nr_batch_requests > 0 |
2008 | && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); | 2008 | && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); |
2009 | } | 2009 | } |
2010 | 2010 | ||
2011 | /* | 2011 | /* |
2012 | * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This | 2012 | * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This |
2013 | * will cause the process to be a "batcher" on all queues in the system. This | 2013 | * will cause the process to be a "batcher" on all queues in the system. This |
2014 | * is the behaviour we want though - once it gets a wakeup it should be given | 2014 | * is the behaviour we want though - once it gets a wakeup it should be given |
2015 | * a nice run. | 2015 | * a nice run. |
2016 | */ | 2016 | */ |
2017 | static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) | 2017 | static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) |
2018 | { | 2018 | { |
2019 | if (!ioc || ioc_batching(q, ioc)) | 2019 | if (!ioc || ioc_batching(q, ioc)) |
2020 | return; | 2020 | return; |
2021 | 2021 | ||
2022 | ioc->nr_batch_requests = q->nr_batching; | 2022 | ioc->nr_batch_requests = q->nr_batching; |
2023 | ioc->last_waited = jiffies; | 2023 | ioc->last_waited = jiffies; |
2024 | } | 2024 | } |
2025 | 2025 | ||
2026 | static void __freed_request(struct request_queue *q, int rw) | 2026 | static void __freed_request(struct request_queue *q, int rw) |
2027 | { | 2027 | { |
2028 | struct request_list *rl = &q->rq; | 2028 | struct request_list *rl = &q->rq; |
2029 | 2029 | ||
2030 | if (rl->count[rw] < queue_congestion_off_threshold(q)) | 2030 | if (rl->count[rw] < queue_congestion_off_threshold(q)) |
2031 | blk_clear_queue_congested(q, rw); | 2031 | blk_clear_queue_congested(q, rw); |
2032 | 2032 | ||
2033 | if (rl->count[rw] + 1 <= q->nr_requests) { | 2033 | if (rl->count[rw] + 1 <= q->nr_requests) { |
2034 | if (waitqueue_active(&rl->wait[rw])) | 2034 | if (waitqueue_active(&rl->wait[rw])) |
2035 | wake_up(&rl->wait[rw]); | 2035 | wake_up(&rl->wait[rw]); |
2036 | 2036 | ||
2037 | blk_clear_queue_full(q, rw); | 2037 | blk_clear_queue_full(q, rw); |
2038 | } | 2038 | } |
2039 | } | 2039 | } |
2040 | 2040 | ||
2041 | /* | 2041 | /* |
2042 | * A request has just been released. Account for it, update the full and | 2042 | * A request has just been released. Account for it, update the full and |
2043 | * congestion status, wake up any waiters. Called under q->queue_lock. | 2043 | * congestion status, wake up any waiters. Called under q->queue_lock. |
2044 | */ | 2044 | */ |
2045 | static void freed_request(struct request_queue *q, int rw, int priv) | 2045 | static void freed_request(struct request_queue *q, int rw, int priv) |
2046 | { | 2046 | { |
2047 | struct request_list *rl = &q->rq; | 2047 | struct request_list *rl = &q->rq; |
2048 | 2048 | ||
2049 | rl->count[rw]--; | 2049 | rl->count[rw]--; |
2050 | if (priv) | 2050 | if (priv) |
2051 | rl->elvpriv--; | 2051 | rl->elvpriv--; |
2052 | 2052 | ||
2053 | __freed_request(q, rw); | 2053 | __freed_request(q, rw); |
2054 | 2054 | ||
2055 | if (unlikely(rl->starved[rw ^ 1])) | 2055 | if (unlikely(rl->starved[rw ^ 1])) |
2056 | __freed_request(q, rw ^ 1); | 2056 | __freed_request(q, rw ^ 1); |
2057 | } | 2057 | } |
2058 | 2058 | ||
2059 | #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) | 2059 | #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) |
2060 | /* | 2060 | /* |
2061 | * Get a free request, queue_lock must be held. | 2061 | * Get a free request, queue_lock must be held. |
2062 | * Returns NULL on failure, with queue_lock held. | 2062 | * Returns NULL on failure, with queue_lock held. |
2063 | * Returns !NULL on success, with queue_lock *not held*. | 2063 | * Returns !NULL on success, with queue_lock *not held*. |
2064 | */ | 2064 | */ |
2065 | static struct request *get_request(struct request_queue *q, int rw_flags, | 2065 | static struct request *get_request(struct request_queue *q, int rw_flags, |
2066 | struct bio *bio, gfp_t gfp_mask) | 2066 | struct bio *bio, gfp_t gfp_mask) |
2067 | { | 2067 | { |
2068 | struct request *rq = NULL; | 2068 | struct request *rq = NULL; |
2069 | struct request_list *rl = &q->rq; | 2069 | struct request_list *rl = &q->rq; |
2070 | struct io_context *ioc = NULL; | 2070 | struct io_context *ioc = NULL; |
2071 | const int rw = rw_flags & 0x01; | 2071 | const int rw = rw_flags & 0x01; |
2072 | int may_queue, priv; | 2072 | int may_queue, priv; |
2073 | 2073 | ||
2074 | may_queue = elv_may_queue(q, rw_flags); | 2074 | may_queue = elv_may_queue(q, rw_flags); |
2075 | if (may_queue == ELV_MQUEUE_NO) | 2075 | if (may_queue == ELV_MQUEUE_NO) |
2076 | goto rq_starved; | 2076 | goto rq_starved; |
2077 | 2077 | ||
2078 | if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { | 2078 | if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { |
2079 | if (rl->count[rw]+1 >= q->nr_requests) { | 2079 | if (rl->count[rw]+1 >= q->nr_requests) { |
2080 | ioc = current_io_context(GFP_ATOMIC, q->node); | 2080 | ioc = current_io_context(GFP_ATOMIC, q->node); |
2081 | /* | 2081 | /* |
2082 | * The queue will fill after this allocation, so set | 2082 | * The queue will fill after this allocation, so set |
2083 | * it as full, and mark this process as "batching". | 2083 | * it as full, and mark this process as "batching". |
2084 | * This process will be allowed to complete a batch of | 2084 | * This process will be allowed to complete a batch of |
2085 | * requests, others will be blocked. | 2085 | * requests, others will be blocked. |
2086 | */ | 2086 | */ |
2087 | if (!blk_queue_full(q, rw)) { | 2087 | if (!blk_queue_full(q, rw)) { |
2088 | ioc_set_batching(q, ioc); | 2088 | ioc_set_batching(q, ioc); |
2089 | blk_set_queue_full(q, rw); | 2089 | blk_set_queue_full(q, rw); |
2090 | } else { | 2090 | } else { |
2091 | if (may_queue != ELV_MQUEUE_MUST | 2091 | if (may_queue != ELV_MQUEUE_MUST |
2092 | && !ioc_batching(q, ioc)) { | 2092 | && !ioc_batching(q, ioc)) { |
2093 | /* | 2093 | /* |
2094 | * The queue is full and the allocating | 2094 | * The queue is full and the allocating |
2095 | * process is not a "batcher", and not | 2095 | * process is not a "batcher", and not |
2096 | * exempted by the IO scheduler | 2096 | * exempted by the IO scheduler |
2097 | */ | 2097 | */ |
2098 | goto out; | 2098 | goto out; |
2099 | } | 2099 | } |
2100 | } | 2100 | } |
2101 | } | 2101 | } |
2102 | blk_set_queue_congested(q, rw); | 2102 | blk_set_queue_congested(q, rw); |
2103 | } | 2103 | } |
2104 | 2104 | ||
2105 | /* | 2105 | /* |
2106 | * Only allow batching queuers to allocate up to 50% over the defined | 2106 | * Only allow batching queuers to allocate up to 50% over the defined |
2107 | * limit of requests, otherwise we could have thousands of requests | 2107 | * limit of requests, otherwise we could have thousands of requests |
2108 | * allocated with any setting of ->nr_requests | 2108 | * allocated with any setting of ->nr_requests |
2109 | */ | 2109 | */ |
2110 | if (rl->count[rw] >= (3 * q->nr_requests / 2)) | 2110 | if (rl->count[rw] >= (3 * q->nr_requests / 2)) |
2111 | goto out; | 2111 | goto out; |
2112 | 2112 | ||
2113 | rl->count[rw]++; | 2113 | rl->count[rw]++; |
2114 | rl->starved[rw] = 0; | 2114 | rl->starved[rw] = 0; |
2115 | 2115 | ||
2116 | priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); | 2116 | priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); |
2117 | if (priv) | 2117 | if (priv) |
2118 | rl->elvpriv++; | 2118 | rl->elvpriv++; |
2119 | 2119 | ||
2120 | spin_unlock_irq(q->queue_lock); | 2120 | spin_unlock_irq(q->queue_lock); |
2121 | 2121 | ||
2122 | rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); | 2122 | rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); |
2123 | if (unlikely(!rq)) { | 2123 | if (unlikely(!rq)) { |
2124 | /* | 2124 | /* |
2125 | * Allocation failed presumably due to memory. Undo anything | 2125 | * Allocation failed presumably due to memory. Undo anything |
2126 | * we might have messed up. | 2126 | * we might have messed up. |
2127 | * | 2127 | * |
2128 | * Allocating task should really be put onto the front of the | 2128 | * Allocating task should really be put onto the front of the |
2129 | * wait queue, but this is pretty rare. | 2129 | * wait queue, but this is pretty rare. |
2130 | */ | 2130 | */ |
2131 | spin_lock_irq(q->queue_lock); | 2131 | spin_lock_irq(q->queue_lock); |
2132 | freed_request(q, rw, priv); | 2132 | freed_request(q, rw, priv); |
2133 | 2133 | ||
2134 | /* | 2134 | /* |
2135 | * in the very unlikely event that allocation failed and no | 2135 | * in the very unlikely event that allocation failed and no |
2136 | * requests for this direction were pending, mark us starved | 2136 | * requests for this direction were pending, mark us starved |
2137 | * so that freeing of a request in the other direction will | 2137 | * so that freeing of a request in the other direction will |
2138 | * notice us. another possible fix would be to split the | 2138 | * notice us. another possible fix would be to split the |
2139 | * rq mempool into READ and WRITE | 2139 | * rq mempool into READ and WRITE |
2140 | */ | 2140 | */ |
2141 | rq_starved: | 2141 | rq_starved: |
2142 | if (unlikely(rl->count[rw] == 0)) | 2142 | if (unlikely(rl->count[rw] == 0)) |
2143 | rl->starved[rw] = 1; | 2143 | rl->starved[rw] = 1; |
2144 | 2144 | ||
2145 | goto out; | 2145 | goto out; |
2146 | } | 2146 | } |
2147 | 2147 | ||
2148 | /* | 2148 | /* |
2149 | * ioc may be NULL here, and ioc_batching will be false. That's | 2149 | * ioc may be NULL here, and ioc_batching will be false. That's |
2150 | * OK; if the queue is under the request limit then requests need | 2150 | * OK; if the queue is under the request limit then requests need |
2151 | * not count toward the nr_batch_requests limit. There will always | 2151 | * not count toward the nr_batch_requests limit. There will always |
2152 | * be some limit enforced by BLK_BATCH_TIME. | 2152 | * be some limit enforced by BLK_BATCH_TIME. |
2153 | */ | 2153 | */ |
2154 | if (ioc_batching(q, ioc)) | 2154 | if (ioc_batching(q, ioc)) |
2155 | ioc->nr_batch_requests--; | 2155 | ioc->nr_batch_requests--; |
2156 | 2156 | ||
2157 | rq_init(q, rq); | 2157 | rq_init(q, rq); |
2158 | 2158 | ||
2159 | blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); | 2159 | blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); |
2160 | out: | 2160 | out: |
2161 | return rq; | 2161 | return rq; |
2162 | } | 2162 | } |
2163 | 2163 | ||
2164 | /* | 2164 | /* |
2165 | * No available requests for this queue, unplug the device and wait for some | 2165 | * No available requests for this queue, unplug the device and wait for some |
2166 | * requests to become available. | 2166 | * requests to become available. |
2167 | * | 2167 | * |
2168 | * Called with q->queue_lock held, and returns with it unlocked. | 2168 | * Called with q->queue_lock held, and returns with it unlocked. |
2169 | */ | 2169 | */ |
2170 | static struct request *get_request_wait(struct request_queue *q, int rw_flags, | 2170 | static struct request *get_request_wait(struct request_queue *q, int rw_flags, |
2171 | struct bio *bio) | 2171 | struct bio *bio) |
2172 | { | 2172 | { |
2173 | const int rw = rw_flags & 0x01; | 2173 | const int rw = rw_flags & 0x01; |
2174 | struct request *rq; | 2174 | struct request *rq; |
2175 | 2175 | ||
2176 | rq = get_request(q, rw_flags, bio, GFP_NOIO); | 2176 | rq = get_request(q, rw_flags, bio, GFP_NOIO); |
2177 | while (!rq) { | 2177 | while (!rq) { |
2178 | DEFINE_WAIT(wait); | 2178 | DEFINE_WAIT(wait); |
2179 | struct request_list *rl = &q->rq; | 2179 | struct request_list *rl = &q->rq; |
2180 | 2180 | ||
2181 | prepare_to_wait_exclusive(&rl->wait[rw], &wait, | 2181 | prepare_to_wait_exclusive(&rl->wait[rw], &wait, |
2182 | TASK_UNINTERRUPTIBLE); | 2182 | TASK_UNINTERRUPTIBLE); |
2183 | 2183 | ||
2184 | rq = get_request(q, rw_flags, bio, GFP_NOIO); | 2184 | rq = get_request(q, rw_flags, bio, GFP_NOIO); |
2185 | 2185 | ||
2186 | if (!rq) { | 2186 | if (!rq) { |
2187 | struct io_context *ioc; | 2187 | struct io_context *ioc; |
2188 | 2188 | ||
2189 | blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); | 2189 | blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); |
2190 | 2190 | ||
2191 | __generic_unplug_device(q); | 2191 | __generic_unplug_device(q); |
2192 | spin_unlock_irq(q->queue_lock); | 2192 | spin_unlock_irq(q->queue_lock); |
2193 | io_schedule(); | 2193 | io_schedule(); |
2194 | 2194 | ||
2195 | /* | 2195 | /* |
2196 | * After sleeping, we become a "batching" process and | 2196 | * After sleeping, we become a "batching" process and |
2197 | * will be able to allocate at least one request, and | 2197 | * will be able to allocate at least one request, and |
2198 | * up to a big batch of them for a small period of time. | 2198 | * up to a big batch of them for a small period of time. |
2199 | * See ioc_batching, ioc_set_batching | 2199 | * See ioc_batching, ioc_set_batching |
2200 | */ | 2200 | */ |
2201 | ioc = current_io_context(GFP_NOIO, q->node); | 2201 | ioc = current_io_context(GFP_NOIO, q->node); |
2202 | ioc_set_batching(q, ioc); | 2202 | ioc_set_batching(q, ioc); |
2203 | 2203 | ||
2204 | spin_lock_irq(q->queue_lock); | 2204 | spin_lock_irq(q->queue_lock); |
2205 | } | 2205 | } |
2206 | finish_wait(&rl->wait[rw], &wait); | 2206 | finish_wait(&rl->wait[rw], &wait); |
2207 | } | 2207 | } |
2208 | 2208 | ||
2209 | return rq; | 2209 | return rq; |
2210 | } | 2210 | } |
2211 | 2211 | ||
2212 | struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) | 2212 | struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) |
2213 | { | 2213 | { |
2214 | struct request *rq; | 2214 | struct request *rq; |
2215 | 2215 | ||
2216 | BUG_ON(rw != READ && rw != WRITE); | 2216 | BUG_ON(rw != READ && rw != WRITE); |
2217 | 2217 | ||
2218 | spin_lock_irq(q->queue_lock); | 2218 | spin_lock_irq(q->queue_lock); |
2219 | if (gfp_mask & __GFP_WAIT) { | 2219 | if (gfp_mask & __GFP_WAIT) { |
2220 | rq = get_request_wait(q, rw, NULL); | 2220 | rq = get_request_wait(q, rw, NULL); |
2221 | } else { | 2221 | } else { |
2222 | rq = get_request(q, rw, NULL, gfp_mask); | 2222 | rq = get_request(q, rw, NULL, gfp_mask); |
2223 | if (!rq) | 2223 | if (!rq) |
2224 | spin_unlock_irq(q->queue_lock); | 2224 | spin_unlock_irq(q->queue_lock); |
2225 | } | 2225 | } |
2226 | /* q->queue_lock is unlocked at this point */ | 2226 | /* q->queue_lock is unlocked at this point */ |
2227 | 2227 | ||
2228 | return rq; | 2228 | return rq; |
2229 | } | 2229 | } |
2230 | EXPORT_SYMBOL(blk_get_request); | 2230 | EXPORT_SYMBOL(blk_get_request); |
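The gfp_mask is what decides whether blk_get_request() can sleep: GFP_KERNEL includes __GFP_WAIT, so the call blocks in get_request_wait() until a request frees up, while GFP_ATOMIC does not, and a NULL return must then be handled. A small illustrative helper (the name and the can_sleep flag are assumptions, not kernel API):

    static struct request *my_alloc_rq(struct request_queue *q, int can_sleep)
    {
            struct request *rq;

            rq = blk_get_request(q, WRITE, can_sleep ? GFP_KERNEL : GFP_ATOMIC);
            if (!rq)
                    return NULL;            /* only possible in the atomic case */
            /* ... fill in cmd/cmd_len, timeout, buffer as needed ... */
            return rq;                      /* caller releases with blk_put_request() */
    }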
2231 | 2231 | ||
2232 | /** | 2232 | /** |
2233 | * blk_start_queueing - initiate dispatch of requests to device | 2233 | * blk_start_queueing - initiate dispatch of requests to device |
2234 | * @q: request queue to kick into gear | 2234 | * @q: request queue to kick into gear |
2235 | * | 2235 | * |
2236 | * This is basically a helper to remove the need to know whether a queue | 2236 | * This is basically a helper to remove the need to know whether a queue |
2237 | * is plugged or not if someone just wants to initiate dispatch of requests | 2237 | * is plugged or not if someone just wants to initiate dispatch of requests |
2238 | * for this queue. | 2238 | * for this queue. |
2239 | * | 2239 | * |
2240 | * The queue lock must be held with interrupts disabled. | 2240 | * The queue lock must be held with interrupts disabled. |
2241 | */ | 2241 | */ |
2242 | void blk_start_queueing(struct request_queue *q) | 2242 | void blk_start_queueing(struct request_queue *q) |
2243 | { | 2243 | { |
2244 | if (!blk_queue_plugged(q)) | 2244 | if (!blk_queue_plugged(q)) |
2245 | q->request_fn(q); | 2245 | q->request_fn(q); |
2246 | else | 2246 | else |
2247 | __generic_unplug_device(q); | 2247 | __generic_unplug_device(q); |
2248 | } | 2248 | } |
2249 | EXPORT_SYMBOL(blk_start_queueing); | 2249 | EXPORT_SYMBOL(blk_start_queueing); |
2250 | 2250 | ||
2251 | /** | 2251 | /** |
2252 | * blk_requeue_request - put a request back on queue | 2252 | * blk_requeue_request - put a request back on queue |
2253 | * @q: request queue where request should be inserted | 2253 | * @q: request queue where request should be inserted |
2254 | * @rq: request to be inserted | 2254 | * @rq: request to be inserted |
2255 | * | 2255 | * |
2256 | * Description: | 2256 | * Description: |
2257 | * Drivers often keep queueing requests until the hardware cannot accept | 2257 | * Drivers often keep queueing requests until the hardware cannot accept |
2258 | * more. When that condition happens we need to put the request back | 2258 | * more. When that condition happens we need to put the request back |
2259 | * on the queue. Must be called with queue lock held. | 2259 | * on the queue. Must be called with queue lock held. |
2260 | */ | 2260 | */ |
2261 | void blk_requeue_request(struct request_queue *q, struct request *rq) | 2261 | void blk_requeue_request(struct request_queue *q, struct request *rq) |
2262 | { | 2262 | { |
2263 | blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); | 2263 | blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); |
2264 | 2264 | ||
2265 | if (blk_rq_tagged(rq)) | 2265 | if (blk_rq_tagged(rq)) |
2266 | blk_queue_end_tag(q, rq); | 2266 | blk_queue_end_tag(q, rq); |
2267 | 2267 | ||
2268 | elv_requeue_request(q, rq); | 2268 | elv_requeue_request(q, rq); |
2269 | } | 2269 | } |
2270 | 2270 | ||
2271 | EXPORT_SYMBOL(blk_requeue_request); | 2271 | EXPORT_SYMBOL(blk_requeue_request); |
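The "put it back" case described above usually looks like the fragment below inside a request_fn, with the queue lock held: the request has already been taken off the queue, the hardware refuses it, so it is requeued and dispatch is stopped until the driver restarts the queue. my_hw_try_issue() is a hypothetical helper, assumed to return -EBUSY when the device is full:

    int my_hw_try_issue(struct my_dev *dev, struct request *rq);  /* hypothetical */

    static void my_dispatch_one(struct request_queue *q, struct my_dev *dev)
    {
            struct request *rq = elv_next_request(q);

            if (!rq)
                    return;

            blkdev_dequeue_request(rq);
            if (my_hw_try_issue(dev, rq) == -EBUSY) {
                    /* back onto the queue; blk_start_queue() will retry it later */
                    blk_requeue_request(q, rq);
                    blk_stop_queue(q);
            }
    }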
2272 | 2272 | ||
2273 | /** | 2273 | /** |
2274 | * blk_insert_request - insert a special request in to a request queue | 2274 | * blk_insert_request - insert a special request in to a request queue |
2275 | * @q: request queue where request should be inserted | 2275 | * @q: request queue where request should be inserted |
2276 | * @rq: request to be inserted | 2276 | * @rq: request to be inserted |
2277 | * @at_head: insert request at head or tail of queue | 2277 | * @at_head: insert request at head or tail of queue |
2278 | * @data: private data | 2278 | * @data: private data |
2279 | * | 2279 | * |
2280 | * Description: | 2280 | * Description: |
2281 | * Many block devices need to execute commands asynchronously, so they don't | 2281 | * Many block devices need to execute commands asynchronously, so they don't |
2282 | * block the whole kernel from preemption during request execution. This is | 2282 | * block the whole kernel from preemption during request execution. This is |
2283 | * accomplished normally by inserting artificial requests tagged as | 2283 | * accomplished normally by inserting artificial requests tagged as |
2284 | * REQ_SPECIAL in to the corresponding request queue, and letting them be | 2284 | * REQ_SPECIAL in to the corresponding request queue, and letting them be |
2285 | * scheduled for actual execution by the request queue. | 2285 | * scheduled for actual execution by the request queue. |
2286 | * | 2286 | * |
2287 | * We have the option of inserting at the head or the tail of the queue. | 2287 | * We have the option of inserting at the head or the tail of the queue. |
2288 | * Typically we use the tail for new ioctls and so forth. We use the head | 2288 | * Typically we use the tail for new ioctls and so forth. We use the head |
2289 | * of the queue for things like a QUEUE_FULL message from a device, or a | 2289 | * of the queue for things like a QUEUE_FULL message from a device, or a |
2290 | * host that is unable to accept a particular command. | 2290 | * host that is unable to accept a particular command. |
2291 | */ | 2291 | */ |
2292 | void blk_insert_request(struct request_queue *q, struct request *rq, | 2292 | void blk_insert_request(struct request_queue *q, struct request *rq, |
2293 | int at_head, void *data) | 2293 | int at_head, void *data) |
2294 | { | 2294 | { |
2295 | int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; | 2295 | int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; |
2296 | unsigned long flags; | 2296 | unsigned long flags; |
2297 | 2297 | ||
2298 | /* | 2298 | /* |
2299 | * tell I/O scheduler that this isn't a regular read/write (ie it | 2299 | * tell I/O scheduler that this isn't a regular read/write (ie it |
2300 | * must not attempt merges on this) and that it acts as a soft | 2300 | * must not attempt merges on this) and that it acts as a soft |
2301 | * barrier | 2301 | * barrier |
2302 | */ | 2302 | */ |
2303 | rq->cmd_type = REQ_TYPE_SPECIAL; | 2303 | rq->cmd_type = REQ_TYPE_SPECIAL; |
2304 | rq->cmd_flags |= REQ_SOFTBARRIER; | 2304 | rq->cmd_flags |= REQ_SOFTBARRIER; |
2305 | 2305 | ||
2306 | rq->special = data; | 2306 | rq->special = data; |
2307 | 2307 | ||
2308 | spin_lock_irqsave(q->queue_lock, flags); | 2308 | spin_lock_irqsave(q->queue_lock, flags); |
2309 | 2309 | ||
2310 | /* | 2310 | /* |
2311 | * If command is tagged, release the tag | 2311 | * If command is tagged, release the tag |
2312 | */ | 2312 | */ |
2313 | if (blk_rq_tagged(rq)) | 2313 | if (blk_rq_tagged(rq)) |
2314 | blk_queue_end_tag(q, rq); | 2314 | blk_queue_end_tag(q, rq); |
2315 | 2315 | ||
2316 | drive_stat_acct(rq, rq->nr_sectors, 1); | 2316 | drive_stat_acct(rq, rq->nr_sectors, 1); |
2317 | __elv_add_request(q, rq, where, 0); | 2317 | __elv_add_request(q, rq, where, 0); |
2318 | blk_start_queueing(q); | 2318 | blk_start_queueing(q); |
2319 | spin_unlock_irqrestore(q->queue_lock, flags); | 2319 | spin_unlock_irqrestore(q->queue_lock, flags); |
2320 | } | 2320 | } |
2321 | 2321 | ||
2322 | EXPORT_SYMBOL(blk_insert_request); | 2322 | EXPORT_SYMBOL(blk_insert_request); |
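A driver typically feeds such a special request in with a freshly allocated request; blk_insert_request() itself marks it REQ_TYPE_SPECIAL, sets the soft barrier, and stashes the @data pointer in rq->special for the request_fn to recognise. A sketch, where my_cmd is a hypothetical per-driver command structure:

    static int my_send_special(struct request_queue *q, struct my_cmd *cmd)
    {
            struct request *rq;

            rq = blk_get_request(q, WRITE, GFP_KERNEL);
            if (!rq)
                    return -ENOMEM;

            /* head insertion (at_head = 1); cmd retrievable later via rq->special */
            blk_insert_request(q, rq, 1, cmd);
            return 0;
    }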
2323 | 2323 | ||
2324 | static int __blk_rq_unmap_user(struct bio *bio) | 2324 | static int __blk_rq_unmap_user(struct bio *bio) |
2325 | { | 2325 | { |
2326 | int ret = 0; | 2326 | int ret = 0; |
2327 | 2327 | ||
2328 | if (bio) { | 2328 | if (bio) { |
2329 | if (bio_flagged(bio, BIO_USER_MAPPED)) | 2329 | if (bio_flagged(bio, BIO_USER_MAPPED)) |
2330 | bio_unmap_user(bio); | 2330 | bio_unmap_user(bio); |
2331 | else | 2331 | else |
2332 | ret = bio_uncopy_user(bio); | 2332 | ret = bio_uncopy_user(bio); |
2333 | } | 2333 | } |
2334 | 2334 | ||
2335 | return ret; | 2335 | return ret; |
2336 | } | 2336 | } |
2337 | 2337 | ||
2338 | static int __blk_rq_map_user(struct request_queue *q, struct request *rq, | 2338 | static int __blk_rq_map_user(struct request_queue *q, struct request *rq, |
2339 | void __user *ubuf, unsigned int len) | 2339 | void __user *ubuf, unsigned int len) |
2340 | { | 2340 | { |
2341 | unsigned long uaddr; | 2341 | unsigned long uaddr; |
2342 | struct bio *bio, *orig_bio; | 2342 | struct bio *bio, *orig_bio; |
2343 | int reading, ret; | 2343 | int reading, ret; |
2344 | 2344 | ||
2345 | reading = rq_data_dir(rq) == READ; | 2345 | reading = rq_data_dir(rq) == READ; |
2346 | 2346 | ||
2347 | /* | 2347 | /* |
2348 | * if alignment requirement is satisfied, map in user pages for | 2348 | * if alignment requirement is satisfied, map in user pages for |
2349 | * direct dma. else, set up kernel bounce buffers | 2349 | * direct dma. else, set up kernel bounce buffers |
2350 | */ | 2350 | */ |
2351 | uaddr = (unsigned long) ubuf; | 2351 | uaddr = (unsigned long) ubuf; |
2352 | if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q))) | 2352 | if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q))) |
2353 | bio = bio_map_user(q, NULL, uaddr, len, reading); | 2353 | bio = bio_map_user(q, NULL, uaddr, len, reading); |
2354 | else | 2354 | else |
2355 | bio = bio_copy_user(q, uaddr, len, reading); | 2355 | bio = bio_copy_user(q, uaddr, len, reading); |
2356 | 2356 | ||
2357 | if (IS_ERR(bio)) | 2357 | if (IS_ERR(bio)) |
2358 | return PTR_ERR(bio); | 2358 | return PTR_ERR(bio); |
2359 | 2359 | ||
2360 | orig_bio = bio; | 2360 | orig_bio = bio; |
2361 | blk_queue_bounce(q, &bio); | 2361 | blk_queue_bounce(q, &bio); |
2362 | 2362 | ||
2363 | /* | 2363 | /* |
2364 | * We link the bounce buffer in and could have to traverse it | 2364 | * We link the bounce buffer in and could have to traverse it |
2365 | * later so we have to get a ref to prevent it from being freed | 2365 | * later so we have to get a ref to prevent it from being freed |
2366 | */ | 2366 | */ |
2367 | bio_get(bio); | 2367 | bio_get(bio); |
2368 | 2368 | ||
2369 | if (!rq->bio) | 2369 | if (!rq->bio) |
2370 | blk_rq_bio_prep(q, rq, bio); | 2370 | blk_rq_bio_prep(q, rq, bio); |
2371 | else if (!ll_back_merge_fn(q, rq, bio)) { | 2371 | else if (!ll_back_merge_fn(q, rq, bio)) { |
2372 | ret = -EINVAL; | 2372 | ret = -EINVAL; |
2373 | goto unmap_bio; | 2373 | goto unmap_bio; |
2374 | } else { | 2374 | } else { |
2375 | rq->biotail->bi_next = bio; | 2375 | rq->biotail->bi_next = bio; |
2376 | rq->biotail = bio; | 2376 | rq->biotail = bio; |
2377 | 2377 | ||
2378 | rq->data_len += bio->bi_size; | 2378 | rq->data_len += bio->bi_size; |
2379 | } | 2379 | } |
2380 | 2380 | ||
2381 | return bio->bi_size; | 2381 | return bio->bi_size; |
2382 | 2382 | ||
2383 | unmap_bio: | 2383 | unmap_bio: |
2384 | /* if it was bounced we must call the end io function */ | 2384 | /* if it was bounced we must call the end io function */ |
2385 | bio_endio(bio, bio->bi_size, 0); | 2385 | bio_endio(bio, bio->bi_size, 0); |
2386 | __blk_rq_unmap_user(orig_bio); | 2386 | __blk_rq_unmap_user(orig_bio); |
2387 | bio_put(bio); | 2387 | bio_put(bio); |
2388 | return ret; | 2388 | return ret; |
2389 | } | 2389 | } |
2390 | 2390 | ||
2391 | /** | 2391 | /** |
2392 | * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage | 2392 | * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage |
2393 | * @q: request queue where request should be inserted | 2393 | * @q: request queue where request should be inserted |
2394 | * @rq: request structure to fill | 2394 | * @rq: request structure to fill |
2395 | * @ubuf: the user buffer | 2395 | * @ubuf: the user buffer |
2396 | * @len: length of user data | 2396 | * @len: length of user data |
2397 | * | 2397 | * |
2398 | * Description: | 2398 | * Description: |
2399 | * Data will be mapped directly for zero copy io, if possible. Otherwise | 2399 | * Data will be mapped directly for zero copy io, if possible. Otherwise |
2400 | * a kernel bounce buffer is used. | 2400 | * a kernel bounce buffer is used. |
2401 | * | 2401 | * |
2402 | * A matching blk_rq_unmap_user() must be issued at the end of io, while | 2402 | * A matching blk_rq_unmap_user() must be issued at the end of io, while |
2403 | * still in process context. | 2403 | * still in process context. |
2404 | * | 2404 | * |
2405 | * Note: The mapped bio may need to be bounced through blk_queue_bounce() | 2405 | * Note: The mapped bio may need to be bounced through blk_queue_bounce() |
2406 | * before being submitted to the device, as pages mapped may be out of | 2406 | * before being submitted to the device, as pages mapped may be out of |
2407 | * reach. It's the caller's responsibility to make sure this happens. The | 2407 | * reach. It's the caller's responsibility to make sure this happens. The |
2408 | * original bio must be passed back in to blk_rq_unmap_user() for proper | 2408 | * original bio must be passed back in to blk_rq_unmap_user() for proper |
2409 | * unmapping. | 2409 | * unmapping. |
2410 | */ | 2410 | */ |
2411 | int blk_rq_map_user(struct request_queue *q, struct request *rq, | 2411 | int blk_rq_map_user(struct request_queue *q, struct request *rq, |
2412 | void __user *ubuf, unsigned long len) | 2412 | void __user *ubuf, unsigned long len) |
2413 | { | 2413 | { |
2414 | unsigned long bytes_read = 0; | 2414 | unsigned long bytes_read = 0; |
2415 | struct bio *bio = NULL; | 2415 | struct bio *bio = NULL; |
2416 | int ret; | 2416 | int ret; |
2417 | 2417 | ||
2418 | if (len > (q->max_hw_sectors << 9)) | 2418 | if (len > (q->max_hw_sectors << 9)) |
2419 | return -EINVAL; | 2419 | return -EINVAL; |
2420 | if (!len || !ubuf) | 2420 | if (!len || !ubuf) |
2421 | return -EINVAL; | 2421 | return -EINVAL; |
2422 | 2422 | ||
2423 | while (bytes_read != len) { | 2423 | while (bytes_read != len) { |
2424 | unsigned long map_len, end, start; | 2424 | unsigned long map_len, end, start; |
2425 | 2425 | ||
2426 | map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); | 2426 | map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); |
2427 | end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) | 2427 | end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) |
2428 | >> PAGE_SHIFT; | 2428 | >> PAGE_SHIFT; |
2429 | start = (unsigned long)ubuf >> PAGE_SHIFT; | 2429 | start = (unsigned long)ubuf >> PAGE_SHIFT; |
2430 | 2430 | ||
2431 | /* | 2431 | /* |
2432 | * A bad offset could cause us to require BIO_MAX_PAGES + 1 | 2432 | * A bad offset could cause us to require BIO_MAX_PAGES + 1 |
2433 | * pages. If this happens we just lower the requested | 2433 | * pages. If this happens we just lower the requested |
2434 | * mapping len by a page so that we can fit | 2434 | * mapping len by a page so that we can fit |
2435 | */ | 2435 | */ |
2436 | if (end - start > BIO_MAX_PAGES) | 2436 | if (end - start > BIO_MAX_PAGES) |
2437 | map_len -= PAGE_SIZE; | 2437 | map_len -= PAGE_SIZE; |
2438 | 2438 | ||
2439 | ret = __blk_rq_map_user(q, rq, ubuf, map_len); | 2439 | ret = __blk_rq_map_user(q, rq, ubuf, map_len); |
2440 | if (ret < 0) | 2440 | if (ret < 0) |
2441 | goto unmap_rq; | 2441 | goto unmap_rq; |
2442 | if (!bio) | 2442 | if (!bio) |
2443 | bio = rq->bio; | 2443 | bio = rq->bio; |
2444 | bytes_read += ret; | 2444 | bytes_read += ret; |
2445 | ubuf += ret; | 2445 | ubuf += ret; |
2446 | } | 2446 | } |
2447 | 2447 | ||
2448 | rq->buffer = rq->data = NULL; | 2448 | rq->buffer = rq->data = NULL; |
2449 | return 0; | 2449 | return 0; |
2450 | unmap_rq: | 2450 | unmap_rq: |
2451 | blk_rq_unmap_user(bio); | 2451 | blk_rq_unmap_user(bio); |
2452 | return ret; | 2452 | return ret; |
2453 | } | 2453 | } |
2454 | 2454 | ||
2455 | EXPORT_SYMBOL(blk_rq_map_user); | 2455 | EXPORT_SYMBOL(blk_rq_map_user); |
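Taken together with blk_execute_rq() and blk_rq_unmap_user() further down, the usual SG_IO-style sequence built on this helper looks roughly like the sketch below: allocate a BLOCK_PC request, map the user buffer, execute synchronously, then unmap using the original rq->bio as the unmap description requires. The 6-byte CDB, the timeout value and the function name are illustrative assumptions only:

    #include <linux/blkdev.h>
    #include <linux/string.h>

    static int my_send_user_cmd(struct request_queue *q, struct gendisk *disk,
                                unsigned char *cdb, void __user *ubuf,
                                unsigned long len)
    {
            struct request *rq;
            struct bio *bio;
            int err;

            rq = blk_get_request(q, READ, GFP_KERNEL);
            if (!rq)
                    return -ENOMEM;

            rq->cmd_type = REQ_TYPE_BLOCK_PC;
            memcpy(rq->cmd, cdb, 6);
            rq->cmd_len = 6;
            rq->timeout = 60 * HZ;

            err = blk_rq_map_user(q, rq, ubuf, len);
            if (err)
                    goto out_put;

            bio = rq->bio;                  /* keep the original bio for unmapping */
            err = blk_execute_rq(q, disk, rq, 0);

            if (blk_rq_unmap_user(bio))
                    err = err ? err : -EFAULT;
    out_put:
            blk_put_request(rq);
            return err;
    }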
2456 | 2456 | ||
2457 | /** | 2457 | /** |
2458 | * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage | 2458 | * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage |
2459 | * @q: request queue where request should be inserted | 2459 | * @q: request queue where request should be inserted |
2460 | * @rq: request to map data to | 2460 | * @rq: request to map data to |
2461 | * @iov: pointer to the iovec | 2461 | * @iov: pointer to the iovec |
2462 | * @iov_count: number of elements in the iovec | 2462 | * @iov_count: number of elements in the iovec |
2463 | * @len: I/O byte count | 2463 | * @len: I/O byte count |
2464 | * | 2464 | * |
2465 | * Description: | 2465 | * Description: |
2466 | * Data will be mapped directly for zero copy io, if possible. Otherwise | 2466 | * Data will be mapped directly for zero copy io, if possible. Otherwise |
2467 | * a kernel bounce buffer is used. | 2467 | * a kernel bounce buffer is used. |
2468 | * | 2468 | * |
2469 | * A matching blk_rq_unmap_user() must be issued at the end of io, while | 2469 | * A matching blk_rq_unmap_user() must be issued at the end of io, while |
2470 | * still in process context. | 2470 | * still in process context. |
2471 | * | 2471 | * |
2472 | * Note: The mapped bio may need to be bounced through blk_queue_bounce() | 2472 | * Note: The mapped bio may need to be bounced through blk_queue_bounce() |
2473 | * before being submitted to the device, as pages mapped may be out of | 2473 | * before being submitted to the device, as pages mapped may be out of |
2474 | * reach. It's the caller's responsibility to make sure this happens. The | 2474 | * reach. It's the caller's responsibility to make sure this happens. The |
2475 | * original bio must be passed back in to blk_rq_unmap_user() for proper | 2475 | * original bio must be passed back in to blk_rq_unmap_user() for proper |
2476 | * unmapping. | 2476 | * unmapping. |
2477 | */ | 2477 | */ |
2478 | int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, | 2478 | int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, |
2479 | struct sg_iovec *iov, int iov_count, unsigned int len) | 2479 | struct sg_iovec *iov, int iov_count, unsigned int len) |
2480 | { | 2480 | { |
2481 | struct bio *bio; | 2481 | struct bio *bio; |
2482 | 2482 | ||
2483 | if (!iov || iov_count <= 0) | 2483 | if (!iov || iov_count <= 0) |
2484 | return -EINVAL; | 2484 | return -EINVAL; |
2485 | 2485 | ||
2486 | /* we don't allow misaligned data like bio_map_user() does. If the | 2486 | /* we don't allow misaligned data like bio_map_user() does. If the |
2487 | * user is using sg, they're expected to know the alignment constraints | 2487 | * user is using sg, they're expected to know the alignment constraints |
2488 | * and respect them accordingly */ | 2488 | * and respect them accordingly */ |
2489 | bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ); | 2489 | bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ); |
2490 | if (IS_ERR(bio)) | 2490 | if (IS_ERR(bio)) |
2491 | return PTR_ERR(bio); | 2491 | return PTR_ERR(bio); |
2492 | 2492 | ||
2493 | if (bio->bi_size != len) { | 2493 | if (bio->bi_size != len) { |
2494 | bio_endio(bio, bio->bi_size, 0); | 2494 | bio_endio(bio, bio->bi_size, 0); |
2495 | bio_unmap_user(bio); | 2495 | bio_unmap_user(bio); |
2496 | return -EINVAL; | 2496 | return -EINVAL; |
2497 | } | 2497 | } |
2498 | 2498 | ||
2499 | bio_get(bio); | 2499 | bio_get(bio); |
2500 | blk_rq_bio_prep(q, rq, bio); | 2500 | blk_rq_bio_prep(q, rq, bio); |
2501 | rq->buffer = rq->data = NULL; | 2501 | rq->buffer = rq->data = NULL; |
2502 | return 0; | 2502 | return 0; |
2503 | } | 2503 | } |
2504 | 2504 | ||
2505 | EXPORT_SYMBOL(blk_rq_map_user_iov); | 2505 | EXPORT_SYMBOL(blk_rq_map_user_iov); |
2506 | 2506 | ||
2507 | /** | 2507 | /** |
2508 | * blk_rq_unmap_user - unmap a request with user data | 2508 | * blk_rq_unmap_user - unmap a request with user data |
2509 | * @bio: start of bio list | 2509 | * @bio: start of bio list |
2510 | * | 2510 | * |
2511 | * Description: | 2511 | * Description: |
2512 | * Unmap a rq previously mapped by blk_rq_map_user(). The caller must | 2512 | * Unmap a rq previously mapped by blk_rq_map_user(). The caller must |
2513 | * supply the original rq->bio from the blk_rq_map_user() return, since | 2513 | * supply the original rq->bio from the blk_rq_map_user() return, since |
2514 | * the io completion may have changed rq->bio. | 2514 | * the io completion may have changed rq->bio. |
2515 | */ | 2515 | */ |
2516 | int blk_rq_unmap_user(struct bio *bio) | 2516 | int blk_rq_unmap_user(struct bio *bio) |
2517 | { | 2517 | { |
2518 | struct bio *mapped_bio; | 2518 | struct bio *mapped_bio; |
2519 | int ret = 0, ret2; | 2519 | int ret = 0, ret2; |
2520 | 2520 | ||
2521 | while (bio) { | 2521 | while (bio) { |
2522 | mapped_bio = bio; | 2522 | mapped_bio = bio; |
2523 | if (unlikely(bio_flagged(bio, BIO_BOUNCED))) | 2523 | if (unlikely(bio_flagged(bio, BIO_BOUNCED))) |
2524 | mapped_bio = bio->bi_private; | 2524 | mapped_bio = bio->bi_private; |
2525 | 2525 | ||
2526 | ret2 = __blk_rq_unmap_user(mapped_bio); | 2526 | ret2 = __blk_rq_unmap_user(mapped_bio); |
2527 | if (ret2 && !ret) | 2527 | if (ret2 && !ret) |
2528 | ret = ret2; | 2528 | ret = ret2; |
2529 | 2529 | ||
2530 | mapped_bio = bio; | 2530 | mapped_bio = bio; |
2531 | bio = bio->bi_next; | 2531 | bio = bio->bi_next; |
2532 | bio_put(mapped_bio); | 2532 | bio_put(mapped_bio); |
2533 | } | 2533 | } |
2534 | 2534 | ||
2535 | return ret; | 2535 | return ret; |
2536 | } | 2536 | } |
2537 | 2537 | ||
2538 | EXPORT_SYMBOL(blk_rq_unmap_user); | 2538 | EXPORT_SYMBOL(blk_rq_unmap_user); |
2539 | 2539 | ||
2540 | /** | 2540 | /** |
2541 | * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage | 2541 | * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage |
2542 | * @q: request queue where request should be inserted | 2542 | * @q: request queue where request should be inserted |
2543 | * @rq: request to fill | 2543 | * @rq: request to fill |
2544 | * @kbuf: the kernel buffer | 2544 | * @kbuf: the kernel buffer |
2545 | * @len: length of user data | 2545 | * @len: length of user data |
2546 | * @gfp_mask: memory allocation flags | 2546 | * @gfp_mask: memory allocation flags |
2547 | */ | 2547 | */ |
2548 | int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, | 2548 | int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, |
2549 | unsigned int len, gfp_t gfp_mask) | 2549 | unsigned int len, gfp_t gfp_mask) |
2550 | { | 2550 | { |
2551 | struct bio *bio; | 2551 | struct bio *bio; |
2552 | 2552 | ||
2553 | if (len > (q->max_hw_sectors << 9)) | 2553 | if (len > (q->max_hw_sectors << 9)) |
2554 | return -EINVAL; | 2554 | return -EINVAL; |
2555 | if (!len || !kbuf) | 2555 | if (!len || !kbuf) |
2556 | return -EINVAL; | 2556 | return -EINVAL; |
2557 | 2557 | ||
2558 | bio = bio_map_kern(q, kbuf, len, gfp_mask); | 2558 | bio = bio_map_kern(q, kbuf, len, gfp_mask); |
2559 | if (IS_ERR(bio)) | 2559 | if (IS_ERR(bio)) |
2560 | return PTR_ERR(bio); | 2560 | return PTR_ERR(bio); |
2561 | 2561 | ||
2562 | if (rq_data_dir(rq) == WRITE) | 2562 | if (rq_data_dir(rq) == WRITE) |
2563 | bio->bi_rw |= (1 << BIO_RW); | 2563 | bio->bi_rw |= (1 << BIO_RW); |
2564 | 2564 | ||
2565 | blk_rq_bio_prep(q, rq, bio); | 2565 | blk_rq_bio_prep(q, rq, bio); |
2566 | blk_queue_bounce(q, &rq->bio); | 2566 | blk_queue_bounce(q, &rq->bio); |
2567 | rq->buffer = rq->data = NULL; | 2567 | rq->buffer = rq->data = NULL; |
2568 | return 0; | 2568 | return 0; |
2569 | } | 2569 | } |
2570 | 2570 | ||
2571 | EXPORT_SYMBOL(blk_rq_map_kern); | 2571 | EXPORT_SYMBOL(blk_rq_map_kern); |
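By contrast with the user-space mapping above, a kernel buffer attached with blk_rq_map_kern() needs no explicit unmap step: the bio built for it is torn down when the request completes. A minimal sketch, again with the request allocation, the omitted command setup and the function name being assumptions for illustration:

#include <linux/blkdev.h>

/* Sketch: attach a kernel buffer to a passthrough request and run it
 * synchronously.  Filling in rq->cmd[] with a real command is omitted. */
static int issue_kern_buf(struct request_queue *q, struct gendisk *disk,
			  void *buf, unsigned int len)
{
	struct request *rq;
	int ret;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;
	rq->cmd_type = REQ_TYPE_BLOCK_PC;

	ret = blk_rq_map_kern(q, rq, buf, len, GFP_KERNEL);
	if (!ret)
		ret = blk_execute_rq(q, disk, rq, 0);

	blk_put_request(rq);
	return ret;
}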
2572 | 2572 | ||
2573 | /** | 2573 | /** |
2574 | * blk_execute_rq_nowait - insert a request into queue for execution | 2574 | * blk_execute_rq_nowait - insert a request into queue for execution |
2575 | * @q: queue to insert the request in | 2575 | * @q: queue to insert the request in |
2576 | * @bd_disk: matching gendisk | 2576 | * @bd_disk: matching gendisk |
2577 | * @rq: request to insert | 2577 | * @rq: request to insert |
2578 | * @at_head: insert request at head or tail of queue | 2578 | * @at_head: insert request at head or tail of queue |
2579 | * @done: I/O completion handler | 2579 | * @done: I/O completion handler |
2580 | * | 2580 | * |
2581 | * Description: | 2581 | * Description: |
2582 | * Insert a fully prepared request at the back of the io scheduler queue | 2582 | * Insert a fully prepared request at the back of the io scheduler queue |
2583 | * for execution. Don't wait for completion. | 2583 | * for execution. Don't wait for completion. |
2584 | */ | 2584 | */ |
2585 | void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, | 2585 | void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, |
2586 | struct request *rq, int at_head, | 2586 | struct request *rq, int at_head, |
2587 | rq_end_io_fn *done) | 2587 | rq_end_io_fn *done) |
2588 | { | 2588 | { |
2589 | int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; | 2589 | int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; |
2590 | 2590 | ||
2591 | rq->rq_disk = bd_disk; | 2591 | rq->rq_disk = bd_disk; |
2592 | rq->cmd_flags |= REQ_NOMERGE; | 2592 | rq->cmd_flags |= REQ_NOMERGE; |
2593 | rq->end_io = done; | 2593 | rq->end_io = done; |
2594 | WARN_ON(irqs_disabled()); | 2594 | WARN_ON(irqs_disabled()); |
2595 | spin_lock_irq(q->queue_lock); | 2595 | spin_lock_irq(q->queue_lock); |
2596 | __elv_add_request(q, rq, where, 1); | 2596 | __elv_add_request(q, rq, where, 1); |
2597 | __generic_unplug_device(q); | 2597 | __generic_unplug_device(q); |
2598 | spin_unlock_irq(q->queue_lock); | 2598 | spin_unlock_irq(q->queue_lock); |
2599 | } | 2599 | } |
2600 | EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); | 2600 | EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); |
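When the submitter cannot block, blk_execute_rq_nowait() takes an rq_end_io_fn instead of a completion. A minimal sketch follows; the names are assumptions, and the callback relies on the same convention blk_end_sync_rq() uses further down: the end_io hook runs from the completion path with the queue lock held, so __blk_put_request() is the appropriate way to drop the last reference.

#include <linux/blkdev.h>
#include <linux/kernel.h>

/* Sketch: fire-and-forget execution.  Nothing may touch 'rq' after
 * submission; the callback inspects and releases it. */
static void demo_rq_done(struct request *rq, int error)
{
	if (error)
		printk(KERN_ERR "passthrough request failed: %d\n", error);
	__blk_put_request(rq->q, rq);	/* queue lock held by the completion path */
}

static void issue_async(struct request_queue *q, struct gendisk *disk,
			struct request *rq)
{
	blk_execute_rq_nowait(q, disk, rq, 0, demo_rq_done);
}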
2601 | 2601 | ||
2602 | /** | 2602 | /** |
2603 | * blk_execute_rq - insert a request into queue for execution | 2603 | * blk_execute_rq - insert a request into queue for execution |
2604 | * @q: queue to insert the request in | 2604 | * @q: queue to insert the request in |
2605 | * @bd_disk: matching gendisk | 2605 | * @bd_disk: matching gendisk |
2606 | * @rq: request to insert | 2606 | * @rq: request to insert |
2607 | * @at_head: insert request at head or tail of queue | 2607 | * @at_head: insert request at head or tail of queue |
2608 | * | 2608 | * |
2609 | * Description: | 2609 | * Description: |
2610 | * Insert a fully prepared request at the back of the io scheduler queue | 2610 | * Insert a fully prepared request at the back of the io scheduler queue |
2611 | * for execution and wait for completion. | 2611 | * for execution and wait for completion. |
2612 | */ | 2612 | */ |
2613 | int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, | 2613 | int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, |
2614 | struct request *rq, int at_head) | 2614 | struct request *rq, int at_head) |
2615 | { | 2615 | { |
2616 | DECLARE_COMPLETION_ONSTACK(wait); | 2616 | DECLARE_COMPLETION_ONSTACK(wait); |
2617 | char sense[SCSI_SENSE_BUFFERSIZE]; | 2617 | char sense[SCSI_SENSE_BUFFERSIZE]; |
2618 | int err = 0; | 2618 | int err = 0; |
2619 | 2619 | ||
2620 | /* | 2620 | /* |
2621 | * we need an extra reference to the request, so we can look at | 2621 | * we need an extra reference to the request, so we can look at |
2622 | * it after io completion | 2622 | * it after io completion |
2623 | */ | 2623 | */ |
2624 | rq->ref_count++; | 2624 | rq->ref_count++; |
2625 | 2625 | ||
2626 | if (!rq->sense) { | 2626 | if (!rq->sense) { |
2627 | memset(sense, 0, sizeof(sense)); | 2627 | memset(sense, 0, sizeof(sense)); |
2628 | rq->sense = sense; | 2628 | rq->sense = sense; |
2629 | rq->sense_len = 0; | 2629 | rq->sense_len = 0; |
2630 | } | 2630 | } |
2631 | 2631 | ||
2632 | rq->end_io_data = &wait; | 2632 | rq->end_io_data = &wait; |
2633 | blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); | 2633 | blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); |
2634 | wait_for_completion(&wait); | 2634 | wait_for_completion(&wait); |
2635 | 2635 | ||
2636 | if (rq->errors) | 2636 | if (rq->errors) |
2637 | err = -EIO; | 2637 | err = -EIO; |
2638 | 2638 | ||
2639 | return err; | 2639 | return err; |
2640 | } | 2640 | } |
2641 | 2641 | ||
2642 | EXPORT_SYMBOL(blk_execute_rq); | 2642 | EXPORT_SYMBOL(blk_execute_rq); |
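The extra reference taken at the top of blk_execute_rq() is what makes the synchronous pattern safe: blk_end_sync_rq() (below) drops one reference when the request completes, so the count falls back to one rather than zero, and the caller can still examine rq->errors and rq->sense after wait_for_completion() returns before releasing the request with blk_put_request().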
2643 | 2643 | ||
2644 | /** | 2644 | /** |
2645 | * blkdev_issue_flush - queue a flush | 2645 | * blkdev_issue_flush - queue a flush |
2646 | * @bdev: blockdev to issue flush for | 2646 | * @bdev: blockdev to issue flush for |
2647 | * @error_sector: error sector | 2647 | * @error_sector: error sector |
2648 | * | 2648 | * |
2649 | * Description: | 2649 | * Description: |
2650 | * Issue a flush for the block device in question. Caller can supply | 2650 | * Issue a flush for the block device in question. Caller can supply |
2651 | * room for storing the error offset in case of a flush error, if they | 2651 | * room for storing the error offset in case of a flush error, if they |
2652 | * wish to. Caller must run wait_for_completion() on its own. | 2652 | * wish to. Caller must run wait_for_completion() on its own. |
2653 | */ | 2653 | */ |
2654 | int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) | 2654 | int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) |
2655 | { | 2655 | { |
2656 | struct request_queue *q; | 2656 | struct request_queue *q; |
2657 | 2657 | ||
2658 | if (bdev->bd_disk == NULL) | 2658 | if (bdev->bd_disk == NULL) |
2659 | return -ENXIO; | 2659 | return -ENXIO; |
2660 | 2660 | ||
2661 | q = bdev_get_queue(bdev); | 2661 | q = bdev_get_queue(bdev); |
2662 | if (!q) | 2662 | if (!q) |
2663 | return -ENXIO; | 2663 | return -ENXIO; |
2664 | if (!q->issue_flush_fn) | 2664 | if (!q->issue_flush_fn) |
2665 | return -EOPNOTSUPP; | 2665 | return -EOPNOTSUPP; |
2666 | 2666 | ||
2667 | return q->issue_flush_fn(q, bdev->bd_disk, error_sector); | 2667 | return q->issue_flush_fn(q, bdev->bd_disk, error_sector); |
2668 | } | 2668 | } |
2669 | 2669 | ||
2670 | EXPORT_SYMBOL(blkdev_issue_flush); | 2670 | EXPORT_SYMBOL(blkdev_issue_flush); |
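A short usage sketch for the flush helper above; the caller and the choice to treat -EOPNOTSUPP as success are assumptions for illustration (a device with no flush method has nothing to flush), and whether any extra waiting is needed depends on the driver's issue_flush_fn.

#include <linux/blkdev.h>
#include <linux/kernel.h>

/* Sketch: flush a device's volatile write cache, reporting the error
 * offset when the driver can provide one. */
static int demo_flush(struct block_device *bdev)
{
	sector_t error_sector = 0;
	int ret = blkdev_issue_flush(bdev, &error_sector);

	if (ret == -EOPNOTSUPP)
		return 0;
	if (ret)
		printk(KERN_ERR "flush failed near sector %llu\n",
		       (unsigned long long)error_sector);
	return ret;
}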
2671 | 2671 | ||
2672 | static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) | 2672 | static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) |
2673 | { | 2673 | { |
2674 | int rw = rq_data_dir(rq); | 2674 | int rw = rq_data_dir(rq); |
2675 | 2675 | ||
2676 | if (!blk_fs_request(rq) || !rq->rq_disk) | 2676 | if (!blk_fs_request(rq) || !rq->rq_disk) |
2677 | return; | 2677 | return; |
2678 | 2678 | ||
2679 | if (!new_io) { | 2679 | if (!new_io) { |
2680 | __disk_stat_inc(rq->rq_disk, merges[rw]); | 2680 | __disk_stat_inc(rq->rq_disk, merges[rw]); |
2681 | } else { | 2681 | } else { |
2682 | disk_round_stats(rq->rq_disk); | 2682 | disk_round_stats(rq->rq_disk); |
2683 | rq->rq_disk->in_flight++; | 2683 | rq->rq_disk->in_flight++; |
2684 | } | 2684 | } |
2685 | } | 2685 | } |
2686 | 2686 | ||
2687 | /* | 2687 | /* |
2688 | * add-request adds a request to the linked list. | 2688 | * add-request adds a request to the linked list. |
2689 | * queue lock is held and interrupts disabled, as we muck with the | 2689 | * queue lock is held and interrupts disabled, as we muck with the |
2690 | * request queue list. | 2690 | * request queue list. |
2691 | */ | 2691 | */ |
2692 | static inline void add_request(struct request_queue * q, struct request * req) | 2692 | static inline void add_request(struct request_queue * q, struct request * req) |
2693 | { | 2693 | { |
2694 | drive_stat_acct(req, req->nr_sectors, 1); | 2694 | drive_stat_acct(req, req->nr_sectors, 1); |
2695 | 2695 | ||
2696 | /* | 2696 | /* |
2697 | * elevator indicated where it wants this request to be | 2697 | * elevator indicated where it wants this request to be |
2698 | * inserted at elevator_merge time | 2698 | * inserted at elevator_merge time |
2699 | */ | 2699 | */ |
2700 | __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); | 2700 | __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); |
2701 | } | 2701 | } |
2702 | 2702 | ||
2703 | /* | 2703 | /* |
2704 | * disk_round_stats() - Round off the performance stats on a struct | 2704 | * disk_round_stats() - Round off the performance stats on a struct |
2705 | * disk_stats. | 2705 | * disk_stats. |
2706 | * | 2706 | * |
2707 | * The average IO queue length and utilisation statistics are maintained | 2707 | * The average IO queue length and utilisation statistics are maintained |
2708 | * by observing the current state of the queue length and the amount of | 2708 | * by observing the current state of the queue length and the amount of |
2709 | * time it has been in this state for. | 2709 | * time it has been in this state for. |
2710 | * | 2710 | * |
2711 | * Normally, that accounting is done on IO completion, but that can result | 2711 | * Normally, that accounting is done on IO completion, but that can result |
2712 | * in more than a second's worth of IO being accounted for within any one | 2712 | * in more than a second's worth of IO being accounted for within any one |
2713 | * second, leading to >100% utilisation. To deal with that, we call this | 2713 | * second, leading to >100% utilisation. To deal with that, we call this |
2714 | * function to do a round-off before returning the results when reading | 2714 | * function to do a round-off before returning the results when reading |
2715 | * /proc/diskstats. This accounts immediately for all queue usage up to | 2715 | * /proc/diskstats. This accounts immediately for all queue usage up to |
2716 | * the current jiffies and restarts the counters again. | 2716 | * the current jiffies and restarts the counters again. |
2717 | */ | 2717 | */ |
2718 | void disk_round_stats(struct gendisk *disk) | 2718 | void disk_round_stats(struct gendisk *disk) |
2719 | { | 2719 | { |
2720 | unsigned long now = jiffies; | 2720 | unsigned long now = jiffies; |
2721 | 2721 | ||
2722 | if (now == disk->stamp) | 2722 | if (now == disk->stamp) |
2723 | return; | 2723 | return; |
2724 | 2724 | ||
2725 | if (disk->in_flight) { | 2725 | if (disk->in_flight) { |
2726 | __disk_stat_add(disk, time_in_queue, | 2726 | __disk_stat_add(disk, time_in_queue, |
2727 | disk->in_flight * (now - disk->stamp)); | 2727 | disk->in_flight * (now - disk->stamp)); |
2728 | __disk_stat_add(disk, io_ticks, (now - disk->stamp)); | 2728 | __disk_stat_add(disk, io_ticks, (now - disk->stamp)); |
2729 | } | 2729 | } |
2730 | disk->stamp = now; | 2730 | disk->stamp = now; |
2731 | } | 2731 | } |
2732 | 2732 | ||
2733 | EXPORT_SYMBOL_GPL(disk_round_stats); | 2733 | EXPORT_SYMBOL_GPL(disk_round_stats); |
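As a worked example of the round-off above: if three requests have been in flight for the last 4 jiffies when /proc/diskstats is read, disk_round_stats() adds 3 * 4 = 12 jiffies to time_in_queue and 4 jiffies to io_ticks, then moves disk->stamp up to the current jiffy so the same interval is not counted a second time when those requests complete. Without this, a burst of completions could credit more than a second of io_ticks to a one-second sampling window, which is exactly the >100% utilisation case described above.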
2734 | 2734 | ||
2735 | /* | 2735 | /* |
2736 | * queue lock must be held | 2736 | * queue lock must be held |
2737 | */ | 2737 | */ |
2738 | void __blk_put_request(struct request_queue *q, struct request *req) | 2738 | void __blk_put_request(struct request_queue *q, struct request *req) |
2739 | { | 2739 | { |
2740 | if (unlikely(!q)) | 2740 | if (unlikely(!q)) |
2741 | return; | 2741 | return; |
2742 | if (unlikely(--req->ref_count)) | 2742 | if (unlikely(--req->ref_count)) |
2743 | return; | 2743 | return; |
2744 | 2744 | ||
2745 | elv_completed_request(q, req); | 2745 | elv_completed_request(q, req); |
2746 | 2746 | ||
2747 | /* | 2747 | /* |
2748 | * Request may not have originated from ll_rw_blk. If not, | 2748 | * Request may not have originated from ll_rw_blk. If not, |
2749 | * it didn't come out of our reserved rq pools | 2749 | * it didn't come out of our reserved rq pools |
2750 | */ | 2750 | */ |
2751 | if (req->cmd_flags & REQ_ALLOCED) { | 2751 | if (req->cmd_flags & REQ_ALLOCED) { |
2752 | int rw = rq_data_dir(req); | 2752 | int rw = rq_data_dir(req); |
2753 | int priv = req->cmd_flags & REQ_ELVPRIV; | 2753 | int priv = req->cmd_flags & REQ_ELVPRIV; |
2754 | 2754 | ||
2755 | BUG_ON(!list_empty(&req->queuelist)); | 2755 | BUG_ON(!list_empty(&req->queuelist)); |
2756 | BUG_ON(!hlist_unhashed(&req->hash)); | 2756 | BUG_ON(!hlist_unhashed(&req->hash)); |
2757 | 2757 | ||
2758 | blk_free_request(q, req); | 2758 | blk_free_request(q, req); |
2759 | freed_request(q, rw, priv); | 2759 | freed_request(q, rw, priv); |
2760 | } | 2760 | } |
2761 | } | 2761 | } |
2762 | 2762 | ||
2763 | EXPORT_SYMBOL_GPL(__blk_put_request); | 2763 | EXPORT_SYMBOL_GPL(__blk_put_request); |
2764 | 2764 | ||
2765 | void blk_put_request(struct request *req) | 2765 | void blk_put_request(struct request *req) |
2766 | { | 2766 | { |
2767 | unsigned long flags; | 2767 | unsigned long flags; |
2768 | struct request_queue *q = req->q; | 2768 | struct request_queue *q = req->q; |
2769 | 2769 | ||
2770 | /* | 2770 | /* |
2771 | * Gee, IDE calls in w/ NULL q. Fix IDE and remove the | 2771 | * Gee, IDE calls in w/ NULL q. Fix IDE and remove the |
2772 | * following if (q) test. | 2772 | * following if (q) test. |
2773 | */ | 2773 | */ |
2774 | if (q) { | 2774 | if (q) { |
2775 | spin_lock_irqsave(q->queue_lock, flags); | 2775 | spin_lock_irqsave(q->queue_lock, flags); |
2776 | __blk_put_request(q, req); | 2776 | __blk_put_request(q, req); |
2777 | spin_unlock_irqrestore(q->queue_lock, flags); | 2777 | spin_unlock_irqrestore(q->queue_lock, flags); |
2778 | } | 2778 | } |
2779 | } | 2779 | } |
2780 | 2780 | ||
2781 | EXPORT_SYMBOL(blk_put_request); | 2781 | EXPORT_SYMBOL(blk_put_request); |
2782 | 2782 | ||
2783 | /** | 2783 | /** |
2784 | * blk_end_sync_rq - executes a completion event on a request | 2784 | * blk_end_sync_rq - executes a completion event on a request |
2785 | * @rq: request to complete | 2785 | * @rq: request to complete |
2786 | * @error: end io status of the request | 2786 | * @error: end io status of the request |
2787 | */ | 2787 | */ |
2788 | void blk_end_sync_rq(struct request *rq, int error) | 2788 | void blk_end_sync_rq(struct request *rq, int error) |
2789 | { | 2789 | { |
2790 | struct completion *waiting = rq->end_io_data; | 2790 | struct completion *waiting = rq->end_io_data; |
2791 | 2791 | ||
2792 | rq->end_io_data = NULL; | 2792 | rq->end_io_data = NULL; |
2793 | __blk_put_request(rq->q, rq); | 2793 | __blk_put_request(rq->q, rq); |
2794 | 2794 | ||
2795 | /* | 2795 | /* |
2796 | * complete last, if this is a stack request the process (and thus | 2796 | * complete last, if this is a stack request the process (and thus |
2797 | * the rq pointer) could be invalid right after this complete() | 2797 | * the rq pointer) could be invalid right after this complete() |
2798 | */ | 2798 | */ |
2799 | complete(waiting); | 2799 | complete(waiting); |
2800 | } | 2800 | } |
2801 | EXPORT_SYMBOL(blk_end_sync_rq); | 2801 | EXPORT_SYMBOL(blk_end_sync_rq); |
2802 | 2802 | ||
2803 | /* | 2803 | /* |
2804 | * Has to be called with the request spinlock acquired | 2804 | * Has to be called with the request spinlock acquired |
2805 | */ | 2805 | */ |
2806 | static int attempt_merge(struct request_queue *q, struct request *req, | 2806 | static int attempt_merge(struct request_queue *q, struct request *req, |
2807 | struct request *next) | 2807 | struct request *next) |
2808 | { | 2808 | { |
2809 | if (!rq_mergeable(req) || !rq_mergeable(next)) | 2809 | if (!rq_mergeable(req) || !rq_mergeable(next)) |
2810 | return 0; | 2810 | return 0; |
2811 | 2811 | ||
2812 | /* | 2812 | /* |
2813 | * not contiguous | 2813 | * not contiguous |
2814 | */ | 2814 | */ |
2815 | if (req->sector + req->nr_sectors != next->sector) | 2815 | if (req->sector + req->nr_sectors != next->sector) |
2816 | return 0; | 2816 | return 0; |
2817 | 2817 | ||
2818 | if (rq_data_dir(req) != rq_data_dir(next) | 2818 | if (rq_data_dir(req) != rq_data_dir(next) |
2819 | || req->rq_disk != next->rq_disk | 2819 | || req->rq_disk != next->rq_disk |
2820 | || next->special) | 2820 | || next->special) |
2821 | return 0; | 2821 | return 0; |
2822 | 2822 | ||
2823 | /* | 2823 | /* |
2824 | * If we are allowed to merge, then append bio list | 2824 | * If we are allowed to merge, then append bio list |
2825 | * from next to rq and release next. merge_requests_fn | 2825 | * from next to rq and release next. merge_requests_fn |
2826 | * will have updated segment counts, update sector | 2826 | * will have updated segment counts, update sector |
2827 | * counts here. | 2827 | * counts here. |
2828 | */ | 2828 | */ |
2829 | if (!ll_merge_requests_fn(q, req, next)) | 2829 | if (!ll_merge_requests_fn(q, req, next)) |
2830 | return 0; | 2830 | return 0; |
2831 | 2831 | ||
2832 | /* | 2832 | /* |
2833 | * At this point we have either done a back merge | 2833 | * At this point we have either done a back merge |
2834 | * or front merge. We need the smaller start_time of | 2834 | * or front merge. We need the smaller start_time of |
2835 | * the merged requests to be the current request | 2835 | * the merged requests to be the current request |
2836 | * for accounting purposes. | 2836 | * for accounting purposes. |
2837 | */ | 2837 | */ |
2838 | if (time_after(req->start_time, next->start_time)) | 2838 | if (time_after(req->start_time, next->start_time)) |
2839 | req->start_time = next->start_time; | 2839 | req->start_time = next->start_time; |
2840 | 2840 | ||
2841 | req->biotail->bi_next = next->bio; | 2841 | req->biotail->bi_next = next->bio; |
2842 | req->biotail = next->biotail; | 2842 | req->biotail = next->biotail; |
2843 | 2843 | ||
2844 | req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; | 2844 | req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; |
2845 | 2845 | ||
2846 | elv_merge_requests(q, req, next); | 2846 | elv_merge_requests(q, req, next); |
2847 | 2847 | ||
2848 | if (req->rq_disk) { | 2848 | if (req->rq_disk) { |
2849 | disk_round_stats(req->rq_disk); | 2849 | disk_round_stats(req->rq_disk); |
2850 | req->rq_disk->in_flight--; | 2850 | req->rq_disk->in_flight--; |
2851 | } | 2851 | } |
2852 | 2852 | ||
2853 | req->ioprio = ioprio_best(req->ioprio, next->ioprio); | 2853 | req->ioprio = ioprio_best(req->ioprio, next->ioprio); |
2854 | 2854 | ||
2855 | __blk_put_request(q, next); | 2855 | __blk_put_request(q, next); |
2856 | return 1; | 2856 | return 1; |
2857 | } | 2857 | } |
2858 | 2858 | ||
2859 | static inline int attempt_back_merge(struct request_queue *q, | 2859 | static inline int attempt_back_merge(struct request_queue *q, |
2860 | struct request *rq) | 2860 | struct request *rq) |
2861 | { | 2861 | { |
2862 | struct request *next = elv_latter_request(q, rq); | 2862 | struct request *next = elv_latter_request(q, rq); |
2863 | 2863 | ||
2864 | if (next) | 2864 | if (next) |
2865 | return attempt_merge(q, rq, next); | 2865 | return attempt_merge(q, rq, next); |
2866 | 2866 | ||
2867 | return 0; | 2867 | return 0; |
2868 | } | 2868 | } |
2869 | 2869 | ||
2870 | static inline int attempt_front_merge(struct request_queue *q, | 2870 | static inline int attempt_front_merge(struct request_queue *q, |
2871 | struct request *rq) | 2871 | struct request *rq) |
2872 | { | 2872 | { |
2873 | struct request *prev = elv_former_request(q, rq); | 2873 | struct request *prev = elv_former_request(q, rq); |
2874 | 2874 | ||
2875 | if (prev) | 2875 | if (prev) |
2876 | return attempt_merge(q, prev, rq); | 2876 | return attempt_merge(q, prev, rq); |
2877 | 2877 | ||
2878 | return 0; | 2878 | return 0; |
2879 | } | 2879 | } |
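A concrete illustration of the two merge directions handled here and in __make_request() below: if a queued request already covers sectors 100-107 and a new 8-sector bio starts at sector 108, the elevator reports a back merge and the bio is appended after req->biotail; if instead the new bio covers sectors 92-99, it is a front merge, the bio becomes the new req->bio and req->sector is rewound to 92. After either merge, attempt_back_merge() or attempt_front_merge() asks the elevator for the request's neighbour in the sort list and, when the grown request has become contiguous with it, attempt_merge() collapses the two requests into one.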
2880 | 2880 | ||
2881 | static void init_request_from_bio(struct request *req, struct bio *bio) | 2881 | static void init_request_from_bio(struct request *req, struct bio *bio) |
2882 | { | 2882 | { |
2883 | req->cmd_type = REQ_TYPE_FS; | 2883 | req->cmd_type = REQ_TYPE_FS; |
2884 | 2884 | ||
2885 | /* | 2885 | /* |
2886 | * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) | 2886 | * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) |
2887 | */ | 2887 | */ |
2888 | if (bio_rw_ahead(bio) || bio_failfast(bio)) | 2888 | if (bio_rw_ahead(bio) || bio_failfast(bio)) |
2889 | req->cmd_flags |= REQ_FAILFAST; | 2889 | req->cmd_flags |= REQ_FAILFAST; |
2890 | 2890 | ||
2891 | /* | 2891 | /* |
2892 | * REQ_BARRIER implies no merging, but let's make it explicit | 2892 | * REQ_BARRIER implies no merging, but let's make it explicit |
2893 | */ | 2893 | */ |
2894 | if (unlikely(bio_barrier(bio))) | 2894 | if (unlikely(bio_barrier(bio))) |
2895 | req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); | 2895 | req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); |
2896 | 2896 | ||
2897 | if (bio_sync(bio)) | 2897 | if (bio_sync(bio)) |
2898 | req->cmd_flags |= REQ_RW_SYNC; | 2898 | req->cmd_flags |= REQ_RW_SYNC; |
2899 | if (bio_rw_meta(bio)) | 2899 | if (bio_rw_meta(bio)) |
2900 | req->cmd_flags |= REQ_RW_META; | 2900 | req->cmd_flags |= REQ_RW_META; |
2901 | 2901 | ||
2902 | req->errors = 0; | 2902 | req->errors = 0; |
2903 | req->hard_sector = req->sector = bio->bi_sector; | 2903 | req->hard_sector = req->sector = bio->bi_sector; |
2904 | req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio); | 2904 | req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio); |
2905 | req->current_nr_sectors = req->hard_cur_sectors = bio_cur_sectors(bio); | 2905 | req->current_nr_sectors = req->hard_cur_sectors = bio_cur_sectors(bio); |
2906 | req->nr_phys_segments = bio_phys_segments(req->q, bio); | 2906 | req->nr_phys_segments = bio_phys_segments(req->q, bio); |
2907 | req->nr_hw_segments = bio_hw_segments(req->q, bio); | 2907 | req->nr_hw_segments = bio_hw_segments(req->q, bio); |
2908 | req->buffer = bio_data(bio); /* see ->buffer comment above */ | 2908 | req->buffer = bio_data(bio); /* see ->buffer comment above */ |
2909 | req->bio = req->biotail = bio; | 2909 | req->bio = req->biotail = bio; |
2910 | req->ioprio = bio_prio(bio); | 2910 | req->ioprio = bio_prio(bio); |
2911 | req->rq_disk = bio->bi_bdev->bd_disk; | 2911 | req->rq_disk = bio->bi_bdev->bd_disk; |
2912 | req->start_time = jiffies; | 2912 | req->start_time = jiffies; |
2913 | } | 2913 | } |
2914 | 2914 | ||
2915 | static int __make_request(struct request_queue *q, struct bio *bio) | 2915 | static int __make_request(struct request_queue *q, struct bio *bio) |
2916 | { | 2916 | { |
2917 | struct request *req; | 2917 | struct request *req; |
2918 | int el_ret, nr_sectors, barrier, err; | 2918 | int el_ret, nr_sectors, barrier, err; |
2919 | const unsigned short prio = bio_prio(bio); | 2919 | const unsigned short prio = bio_prio(bio); |
2920 | const int sync = bio_sync(bio); | 2920 | const int sync = bio_sync(bio); |
2921 | int rw_flags; | 2921 | int rw_flags; |
2922 | 2922 | ||
2923 | nr_sectors = bio_sectors(bio); | 2923 | nr_sectors = bio_sectors(bio); |
2924 | 2924 | ||
2925 | /* | 2925 | /* |
2926 | * low level driver can indicate that it wants pages above a | 2926 | * low level driver can indicate that it wants pages above a |
2927 | * certain limit bounced to low memory (ie for highmem, or even | 2927 | * certain limit bounced to low memory (ie for highmem, or even |
2928 | * ISA dma in theory) | 2928 | * ISA dma in theory) |
2929 | */ | 2929 | */ |
2930 | blk_queue_bounce(q, &bio); | 2930 | blk_queue_bounce(q, &bio); |
2931 | 2931 | ||
2932 | barrier = bio_barrier(bio); | 2932 | barrier = bio_barrier(bio); |
2933 | if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) { | 2933 | if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) { |
2934 | err = -EOPNOTSUPP; | 2934 | err = -EOPNOTSUPP; |
2935 | goto end_io; | 2935 | goto end_io; |
2936 | } | 2936 | } |
2937 | 2937 | ||
2938 | spin_lock_irq(q->queue_lock); | 2938 | spin_lock_irq(q->queue_lock); |
2939 | 2939 | ||
2940 | if (unlikely(barrier) || elv_queue_empty(q)) | 2940 | if (unlikely(barrier) || elv_queue_empty(q)) |
2941 | goto get_rq; | 2941 | goto get_rq; |
2942 | 2942 | ||
2943 | el_ret = elv_merge(q, &req, bio); | 2943 | el_ret = elv_merge(q, &req, bio); |
2944 | switch (el_ret) { | 2944 | switch (el_ret) { |
2945 | case ELEVATOR_BACK_MERGE: | 2945 | case ELEVATOR_BACK_MERGE: |
2946 | BUG_ON(!rq_mergeable(req)); | 2946 | BUG_ON(!rq_mergeable(req)); |
2947 | 2947 | ||
2948 | if (!ll_back_merge_fn(q, req, bio)) | 2948 | if (!ll_back_merge_fn(q, req, bio)) |
2949 | break; | 2949 | break; |
2950 | 2950 | ||
2951 | blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); | 2951 | blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); |
2952 | 2952 | ||
2953 | req->biotail->bi_next = bio; | 2953 | req->biotail->bi_next = bio; |
2954 | req->biotail = bio; | 2954 | req->biotail = bio; |
2955 | req->nr_sectors = req->hard_nr_sectors += nr_sectors; | 2955 | req->nr_sectors = req->hard_nr_sectors += nr_sectors; |
2956 | req->ioprio = ioprio_best(req->ioprio, prio); | 2956 | req->ioprio = ioprio_best(req->ioprio, prio); |
2957 | drive_stat_acct(req, nr_sectors, 0); | 2957 | drive_stat_acct(req, nr_sectors, 0); |
2958 | if (!attempt_back_merge(q, req)) | 2958 | if (!attempt_back_merge(q, req)) |
2959 | elv_merged_request(q, req, el_ret); | 2959 | elv_merged_request(q, req, el_ret); |
2960 | goto out; | 2960 | goto out; |
2961 | 2961 | ||
2962 | case ELEVATOR_FRONT_MERGE: | 2962 | case ELEVATOR_FRONT_MERGE: |
2963 | BUG_ON(!rq_mergeable(req)); | 2963 | BUG_ON(!rq_mergeable(req)); |
2964 | 2964 | ||
2965 | if (!ll_front_merge_fn(q, req, bio)) | 2965 | if (!ll_front_merge_fn(q, req, bio)) |
2966 | break; | 2966 | break; |
2967 | 2967 | ||
2968 | blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); | 2968 | blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); |
2969 | 2969 | ||
2970 | bio->bi_next = req->bio; | 2970 | bio->bi_next = req->bio; |
2971 | req->bio = bio; | 2971 | req->bio = bio; |
2972 | 2972 | ||
2973 | /* | 2973 | /* |
2974 | * may not be valid. If the low level driver said | 2974 | * may not be valid. If the low level driver said |
2975 | * it didn't need a bounce buffer then it better | 2975 | * it didn't need a bounce buffer then it better |
2976 | * not touch req->buffer either... | 2976 | * not touch req->buffer either... |
2977 | */ | 2977 | */ |
2978 | req->buffer = bio_data(bio); | 2978 | req->buffer = bio_data(bio); |
2979 | req->current_nr_sectors = bio_cur_sectors(bio); | 2979 | req->current_nr_sectors = bio_cur_sectors(bio); |
2980 | req->hard_cur_sectors = req->current_nr_sectors; | 2980 | req->hard_cur_sectors = req->current_nr_sectors; |
2981 | req->sector = req->hard_sector = bio->bi_sector; | 2981 | req->sector = req->hard_sector = bio->bi_sector; |
2982 | req->nr_sectors = req->hard_nr_sectors += nr_sectors; | 2982 | req->nr_sectors = req->hard_nr_sectors += nr_sectors; |
2983 | req->ioprio = ioprio_best(req->ioprio, prio); | 2983 | req->ioprio = ioprio_best(req->ioprio, prio); |
2984 | drive_stat_acct(req, nr_sectors, 0); | 2984 | drive_stat_acct(req, nr_sectors, 0); |
2985 | if (!attempt_front_merge(q, req)) | 2985 | if (!attempt_front_merge(q, req)) |
2986 | elv_merged_request(q, req, el_ret); | 2986 | elv_merged_request(q, req, el_ret); |
2987 | goto out; | 2987 | goto out; |
2988 | 2988 | ||
2989 | /* ELV_NO_MERGE: elevator says don't/can't merge. */ | 2989 | /* ELV_NO_MERGE: elevator says don't/can't merge. */ |
2990 | default: | 2990 | default: |
2991 | ; | 2991 | ; |
2992 | } | 2992 | } |
2993 | 2993 | ||
2994 | get_rq: | 2994 | get_rq: |
2995 | /* | 2995 | /* |
2996 | * This sync check and mask will be re-done in init_request_from_bio(), | 2996 | * This sync check and mask will be re-done in init_request_from_bio(), |
2997 | * but we need to set it earlier to expose the sync flag to the | 2997 | * but we need to set it earlier to expose the sync flag to the |
2998 | * rq allocator and io schedulers. | 2998 | * rq allocator and io schedulers. |
2999 | */ | 2999 | */ |
3000 | rw_flags = bio_data_dir(bio); | 3000 | rw_flags = bio_data_dir(bio); |
3001 | if (sync) | 3001 | if (sync) |
3002 | rw_flags |= REQ_RW_SYNC; | 3002 | rw_flags |= REQ_RW_SYNC; |
3003 | 3003 | ||
3004 | /* | 3004 | /* |
3005 | * Grab a free request. This might sleep but cannot fail. | 3005 | * Grab a free request. This might sleep but cannot fail. |
3006 | * Returns with the queue unlocked. | 3006 | * Returns with the queue unlocked. |
3007 | */ | 3007 | */ |
3008 | req = get_request_wait(q, rw_flags, bio); | 3008 | req = get_request_wait(q, rw_flags, bio); |
3009 | 3009 | ||
3010 | /* | 3010 | /* |
3011 | * After dropping the lock and possibly sleeping here, our request | 3011 | * After dropping the lock and possibly sleeping here, our request |
3012 | * may now be mergeable after it had proven unmergeable (above). | 3012 | * may now be mergeable after it had proven unmergeable (above). |
3013 | * We don't worry about that case for efficiency. It won't happen | 3013 | * We don't worry about that case for efficiency. It won't happen |
3014 | * often, and the elevators are able to handle it. | 3014 | * often, and the elevators are able to handle it. |
3015 | */ | 3015 | */ |
3016 | init_request_from_bio(req, bio); | 3016 | init_request_from_bio(req, bio); |
3017 | 3017 | ||
3018 | spin_lock_irq(q->queue_lock); | 3018 | spin_lock_irq(q->queue_lock); |
3019 | if (elv_queue_empty(q)) | 3019 | if (elv_queue_empty(q)) |
3020 | blk_plug_device(q); | 3020 | blk_plug_device(q); |
3021 | add_request(q, req); | 3021 | add_request(q, req); |
3022 | out: | 3022 | out: |
3023 | if (sync) | 3023 | if (sync) |
3024 | __generic_unplug_device(q); | 3024 | __generic_unplug_device(q); |
3025 | 3025 | ||
3026 | spin_unlock_irq(q->queue_lock); | 3026 | spin_unlock_irq(q->queue_lock); |
3027 | return 0; | 3027 | return 0; |
3028 | 3028 | ||
3029 | end_io: | 3029 | end_io: |
3030 | bio_endio(bio, nr_sectors << 9, err); | 3030 | bio_endio(bio, nr_sectors << 9, err); |
3031 | return 0; | 3031 | return 0; |
3032 | } | 3032 | } |
3033 | 3033 | ||
3034 | /* | 3034 | /* |
3035 | * If bio->bi_bdev is a partition, remap the location | 3035 | * If bio->bi_bdev is a partition, remap the location |
3036 | */ | 3036 | */ |
3037 | static inline void blk_partition_remap(struct bio *bio) | 3037 | static inline void blk_partition_remap(struct bio *bio) |
3038 | { | 3038 | { |
3039 | struct block_device *bdev = bio->bi_bdev; | 3039 | struct block_device *bdev = bio->bi_bdev; |
3040 | 3040 | ||
3041 | if (bdev != bdev->bd_contains) { | 3041 | if (bdev != bdev->bd_contains) { |
3042 | struct hd_struct *p = bdev->bd_part; | 3042 | struct hd_struct *p = bdev->bd_part; |
3043 | const int rw = bio_data_dir(bio); | 3043 | const int rw = bio_data_dir(bio); |
3044 | 3044 | ||
3045 | p->sectors[rw] += bio_sectors(bio); | 3045 | p->sectors[rw] += bio_sectors(bio); |
3046 | p->ios[rw]++; | 3046 | p->ios[rw]++; |
3047 | 3047 | ||
3048 | bio->bi_sector += p->start_sect; | 3048 | bio->bi_sector += p->start_sect; |
3049 | bio->bi_bdev = bdev->bd_contains; | 3049 | bio->bi_bdev = bdev->bd_contains; |
3050 | |||
3051 | blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio, | ||
3052 | bdev->bd_dev, bio->bi_sector, | ||
3053 | bio->bi_sector - p->start_sect); | ||
3050 | } | 3054 | } |
3051 | } | 3055 | } |
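A short numeric example of the remap trace added above (the figures are invented): for a bio addressed to sector 100 of a partition that starts at sector 2048 of its disk, the remap adds the partition offset so bi_sector becomes 2148 and bi_bdev is switched to the whole-disk device. Because blk_add_trace_remap() is called after that adjustment, it can pass the partition's dev_t, the remapped sector 2148 and the original partition-relative sector recovered as 2148 - 2048 = 100, so trace post-processing can correlate the partition-level request with its final location on the underlying disk.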
3052 | 3056 | ||
3053 | static void handle_bad_sector(struct bio *bio) | 3057 | static void handle_bad_sector(struct bio *bio) |
3054 | { | 3058 | { |
3055 | char b[BDEVNAME_SIZE]; | 3059 | char b[BDEVNAME_SIZE]; |
3056 | 3060 | ||
3057 | printk(KERN_INFO "attempt to access beyond end of device\n"); | 3061 | printk(KERN_INFO "attempt to access beyond end of device\n"); |
3058 | printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", | 3062 | printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", |
3059 | bdevname(bio->bi_bdev, b), | 3063 | bdevname(bio->bi_bdev, b), |
3060 | bio->bi_rw, | 3064 | bio->bi_rw, |
3061 | (unsigned long long)bio->bi_sector + bio_sectors(bio), | 3065 | (unsigned long long)bio->bi_sector + bio_sectors(bio), |
3062 | (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); | 3066 | (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); |
3063 | 3067 | ||
3064 | set_bit(BIO_EOF, &bio->bi_flags); | 3068 | set_bit(BIO_EOF, &bio->bi_flags); |
3065 | } | 3069 | } |
3066 | 3070 | ||
3067 | #ifdef CONFIG_FAIL_MAKE_REQUEST | 3071 | #ifdef CONFIG_FAIL_MAKE_REQUEST |
3068 | 3072 | ||
3069 | static DECLARE_FAULT_ATTR(fail_make_request); | 3073 | static DECLARE_FAULT_ATTR(fail_make_request); |
3070 | 3074 | ||
3071 | static int __init setup_fail_make_request(char *str) | 3075 | static int __init setup_fail_make_request(char *str) |
3072 | { | 3076 | { |
3073 | return setup_fault_attr(&fail_make_request, str); | 3077 | return setup_fault_attr(&fail_make_request, str); |
3074 | } | 3078 | } |
3075 | __setup("fail_make_request=", setup_fail_make_request); | 3079 | __setup("fail_make_request=", setup_fail_make_request); |
3076 | 3080 | ||
3077 | static int should_fail_request(struct bio *bio) | 3081 | static int should_fail_request(struct bio *bio) |
3078 | { | 3082 | { |
3079 | if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) || | 3083 | if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) || |
3080 | (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail)) | 3084 | (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail)) |
3081 | return should_fail(&fail_make_request, bio->bi_size); | 3085 | return should_fail(&fail_make_request, bio->bi_size); |
3082 | 3086 | ||
3083 | return 0; | 3087 | return 0; |
3084 | } | 3088 | } |
3085 | 3089 | ||
3086 | static int __init fail_make_request_debugfs(void) | 3090 | static int __init fail_make_request_debugfs(void) |
3087 | { | 3091 | { |
3088 | return init_fault_attr_dentries(&fail_make_request, | 3092 | return init_fault_attr_dentries(&fail_make_request, |
3089 | "fail_make_request"); | 3093 | "fail_make_request"); |
3090 | } | 3094 | } |
3091 | 3095 | ||
3092 | late_initcall(fail_make_request_debugfs); | 3096 | late_initcall(fail_make_request_debugfs); |
3093 | 3097 | ||
3094 | #else /* CONFIG_FAIL_MAKE_REQUEST */ | 3098 | #else /* CONFIG_FAIL_MAKE_REQUEST */ |
3095 | 3099 | ||
3096 | static inline int should_fail_request(struct bio *bio) | 3100 | static inline int should_fail_request(struct bio *bio) |
3097 | { | 3101 | { |
3098 | return 0; | 3102 | return 0; |
3099 | } | 3103 | } |
3100 | 3104 | ||
3101 | #endif /* CONFIG_FAIL_MAKE_REQUEST */ | 3105 | #endif /* CONFIG_FAIL_MAKE_REQUEST */ |
3102 | 3106 | ||
3103 | /** | 3107 | /** |
3104 | * generic_make_request: hand a buffer to its device driver for I/O | 3108 | * generic_make_request: hand a buffer to its device driver for I/O |
3105 | * @bio: The bio describing the location in memory and on the device. | 3109 | * @bio: The bio describing the location in memory and on the device. |
3106 | * | 3110 | * |
3107 | * generic_make_request() is used to make I/O requests of block | 3111 | * generic_make_request() is used to make I/O requests of block |
3108 | * devices. It is passed a &struct bio, which describes the I/O that needs | 3112 | * devices. It is passed a &struct bio, which describes the I/O that needs |
3109 | * to be done. | 3113 | * to be done. |
3110 | * | 3114 | * |
3111 | * generic_make_request() does not return any status. The | 3115 | * generic_make_request() does not return any status. The |
3112 | * success/failure status of the request, along with notification of | 3116 | * success/failure status of the request, along with notification of |
3113 | * completion, is delivered asynchronously through the bio->bi_end_io | 3117 | * completion, is delivered asynchronously through the bio->bi_end_io |
3114 | * function described (one day) elsewhere. | 3118 | * function described (one day) elsewhere. |
3115 | * | 3119 | * |
3116 | * The caller of generic_make_request must make sure that bi_io_vec | 3120 | * The caller of generic_make_request must make sure that bi_io_vec |
3117 | * are set to describe the memory buffer, and that bi_dev and bi_sector are | 3121 | * are set to describe the memory buffer, and that bi_dev and bi_sector are |
3118 | * set to describe the device address, and the | 3122 | * set to describe the device address, and the |
3119 | * bi_end_io and optionally bi_private are set to describe how | 3123 | * bi_end_io and optionally bi_private are set to describe how |
3120 | * completion notification should be signaled. | 3124 | * completion notification should be signaled. |
3121 | * | 3125 | * |
3122 | * generic_make_request and the drivers it calls may use bi_next if this | 3126 | * generic_make_request and the drivers it calls may use bi_next if this |
3123 | * bio happens to be merged with someone else, and may change bi_dev and | 3127 | * bio happens to be merged with someone else, and may change bi_dev and |
3124 | * bi_sector for remaps as it sees fit. So the values of these fields | 3128 | * bi_sector for remaps as it sees fit. So the values of these fields |
3125 | * should NOT be depended on after the call to generic_make_request. | 3129 | * should NOT be depended on after the call to generic_make_request. |
3126 | */ | 3130 | */ |
3127 | static inline void __generic_make_request(struct bio *bio) | 3131 | static inline void __generic_make_request(struct bio *bio) |
3128 | { | 3132 | { |
3129 | struct request_queue *q; | 3133 | struct request_queue *q; |
3130 | sector_t maxsector; | 3134 | sector_t maxsector; |
3131 | sector_t old_sector; | 3135 | sector_t old_sector; |
3132 | int ret, nr_sectors = bio_sectors(bio); | 3136 | int ret, nr_sectors = bio_sectors(bio); |
3133 | dev_t old_dev; | 3137 | dev_t old_dev; |
3134 | 3138 | ||
3135 | might_sleep(); | 3139 | might_sleep(); |
3136 | /* Test device or partition size, when known. */ | 3140 | /* Test device or partition size, when known. */ |
3137 | maxsector = bio->bi_bdev->bd_inode->i_size >> 9; | 3141 | maxsector = bio->bi_bdev->bd_inode->i_size >> 9; |
3138 | if (maxsector) { | 3142 | if (maxsector) { |
3139 | sector_t sector = bio->bi_sector; | 3143 | sector_t sector = bio->bi_sector; |
3140 | 3144 | ||
3141 | if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { | 3145 | if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { |
3142 | /* | 3146 | /* |
3143 | * This may well happen - the kernel calls bread() | 3147 | * This may well happen - the kernel calls bread() |
3144 | * without checking the size of the device, e.g., when | 3148 | * without checking the size of the device, e.g., when |
3145 | * mounting a device. | 3149 | * mounting a device. |
3146 | */ | 3150 | */ |
3147 | handle_bad_sector(bio); | 3151 | handle_bad_sector(bio); |
3148 | goto end_io; | 3152 | goto end_io; |
3149 | } | 3153 | } |
3150 | } | 3154 | } |
3151 | 3155 | ||
3152 | /* | 3156 | /* |
3153 | * Resolve the mapping until finished. (drivers are | 3157 | * Resolve the mapping until finished. (drivers are |
3154 | * still free to implement/resolve their own stacking | 3158 | * still free to implement/resolve their own stacking |
3155 | * by explicitly returning 0) | 3159 | * by explicitly returning 0) |
3156 | * | 3160 | * |
3157 | * NOTE: we don't repeat the blk_size check for each new device. | 3161 | * NOTE: we don't repeat the blk_size check for each new device. |
3158 | * Stacking drivers are expected to know what they are doing. | 3162 | * Stacking drivers are expected to know what they are doing. |
3159 | */ | 3163 | */ |
3160 | old_sector = -1; | 3164 | old_sector = -1; |
3161 | old_dev = 0; | 3165 | old_dev = 0; |
3162 | do { | 3166 | do { |
3163 | char b[BDEVNAME_SIZE]; | 3167 | char b[BDEVNAME_SIZE]; |
3164 | 3168 | ||
3165 | q = bdev_get_queue(bio->bi_bdev); | 3169 | q = bdev_get_queue(bio->bi_bdev); |
3166 | if (!q) { | 3170 | if (!q) { |
3167 | printk(KERN_ERR | 3171 | printk(KERN_ERR |
3168 | "generic_make_request: Trying to access " | 3172 | "generic_make_request: Trying to access " |
3169 | "nonexistent block-device %s (%Lu)\n", | 3173 | "nonexistent block-device %s (%Lu)\n", |
3170 | bdevname(bio->bi_bdev, b), | 3174 | bdevname(bio->bi_bdev, b), |
3171 | (long long) bio->bi_sector); | 3175 | (long long) bio->bi_sector); |
3172 | end_io: | 3176 | end_io: |
3173 | bio_endio(bio, bio->bi_size, -EIO); | 3177 | bio_endio(bio, bio->bi_size, -EIO); |
3174 | break; | 3178 | break; |
3175 | } | 3179 | } |
3176 | 3180 | ||
3177 | if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) { | 3181 | if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) { |
3178 | printk("bio too big device %s (%u > %u)\n", | 3182 | printk("bio too big device %s (%u > %u)\n", |
3179 | bdevname(bio->bi_bdev, b), | 3183 | bdevname(bio->bi_bdev, b), |
3180 | bio_sectors(bio), | 3184 | bio_sectors(bio), |
3181 | q->max_hw_sectors); | 3185 | q->max_hw_sectors); |
3182 | goto end_io; | 3186 | goto end_io; |
3183 | } | 3187 | } |
3184 | 3188 | ||
3185 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) | 3189 | if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) |
3186 | goto end_io; | 3190 | goto end_io; |
3187 | 3191 | ||
3188 | if (should_fail_request(bio)) | 3192 | if (should_fail_request(bio)) |
3189 | goto end_io; | 3193 | goto end_io; |
3190 | 3194 | ||
3191 | /* | 3195 | /* |
3192 | * If this device has partitions, remap block n | 3196 | * If this device has partitions, remap block n |
3193 | * of partition p to block n+start(p) of the disk. | 3197 | * of partition p to block n+start(p) of the disk. |
3194 | */ | 3198 | */ |
3195 | blk_partition_remap(bio); | 3199 | blk_partition_remap(bio); |
3196 | 3200 | ||
3197 | if (old_sector != -1) | 3201 | if (old_sector != -1) |
3198 | blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, | 3202 | blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, |
3199 | old_sector); | 3203 | old_sector); |
3200 | 3204 | ||
3201 | blk_add_trace_bio(q, bio, BLK_TA_QUEUE); | 3205 | blk_add_trace_bio(q, bio, BLK_TA_QUEUE); |
3202 | 3206 | ||
3203 | old_sector = bio->bi_sector; | 3207 | old_sector = bio->bi_sector; |
3204 | old_dev = bio->bi_bdev->bd_dev; | 3208 | old_dev = bio->bi_bdev->bd_dev; |
3205 | 3209 | ||
3206 | maxsector = bio->bi_bdev->bd_inode->i_size >> 9; | 3210 | maxsector = bio->bi_bdev->bd_inode->i_size >> 9; |
3207 | if (maxsector) { | 3211 | if (maxsector) { |
3208 | sector_t sector = bio->bi_sector; | 3212 | sector_t sector = bio->bi_sector; |
3209 | 3213 | ||
3210 | if (maxsector < nr_sectors || | 3214 | if (maxsector < nr_sectors || |
3211 | maxsector - nr_sectors < sector) { | 3215 | maxsector - nr_sectors < sector) { |
3212 | /* | 3216 | /* |
3213 | * This may well happen - partitions are not | 3217 | * This may well happen - partitions are not |
3214 | * checked to make sure they are within the size | 3218 | * checked to make sure they are within the size |
3215 | * of the whole device. | 3219 | * of the whole device. |
3216 | */ | 3220 | */ |
3217 | handle_bad_sector(bio); | 3221 | handle_bad_sector(bio); |
3218 | goto end_io; | 3222 | goto end_io; |
3219 | } | 3223 | } |
3220 | } | 3224 | } |
3221 | 3225 | ||
3222 | ret = q->make_request_fn(q, bio); | 3226 | ret = q->make_request_fn(q, bio); |
3223 | } while (ret); | 3227 | } while (ret); |
3224 | } | 3228 | } |
3225 | 3229 | ||
3226 | /* | 3230 | /* |
3227 | * We only want one ->make_request_fn to be active at a time, | 3231 | * We only want one ->make_request_fn to be active at a time, |
3228 | * else stack usage with stacked devices could be a problem. | 3232 | * else stack usage with stacked devices could be a problem. |
3229 | * So use current->bio_{list,tail} to keep a list of requests | 3233 | * So use current->bio_{list,tail} to keep a list of requests |
3230 | * submitted by a make_request_fn function. | 3234 | * submitted by a make_request_fn function. |
3231 | * current->bio_tail is also used as a flag to say if | 3235 | * current->bio_tail is also used as a flag to say if |
3232 | * generic_make_request is currently active in this task or not. | 3236 | * generic_make_request is currently active in this task or not. |
3233 | * If it is NULL, then no make_request is active. If it is non-NULL, | 3237 | * If it is NULL, then no make_request is active. If it is non-NULL, |
3234 | * then a make_request is active, and new requests should be added | 3238 | * then a make_request is active, and new requests should be added |
3235 | * at the tail | 3239 | * at the tail |
3236 | */ | 3240 | */ |
3237 | void generic_make_request(struct bio *bio) | 3241 | void generic_make_request(struct bio *bio) |
3238 | { | 3242 | { |
3239 | if (current->bio_tail) { | 3243 | if (current->bio_tail) { |
3240 | /* make_request is active */ | 3244 | /* make_request is active */ |
3241 | *(current->bio_tail) = bio; | 3245 | *(current->bio_tail) = bio; |
3242 | bio->bi_next = NULL; | 3246 | bio->bi_next = NULL; |
3243 | current->bio_tail = &bio->bi_next; | 3247 | current->bio_tail = &bio->bi_next; |
3244 | return; | 3248 | return; |
3245 | } | 3249 | } |
3246 | /* following loop may be a bit non-obvious, and so deserves some | 3250 | /* following loop may be a bit non-obvious, and so deserves some |
3247 | * explanation. | 3251 | * explanation. |
3248 | * Before entering the loop, bio->bi_next is NULL (as all callers | 3252 | * Before entering the loop, bio->bi_next is NULL (as all callers |
3249 | * ensure that) so we have a list with a single bio. | 3253 | * ensure that) so we have a list with a single bio. |
3250 | * We pretend that we have just taken it off a longer list, so | 3254 | * We pretend that we have just taken it off a longer list, so |
3251 | * we assign bio_list to the next (which is NULL) and bio_tail | 3255 | * we assign bio_list to the next (which is NULL) and bio_tail |
3252 | * to &bio_list, thus initialising the bio_list of new bios to be | 3256 | * to &bio_list, thus initialising the bio_list of new bios to be |
3253 | * added. __generic_make_request may indeed add some more bios | 3257 | * added. __generic_make_request may indeed add some more bios |
3254 | * through a recursive call to generic_make_request. If it | 3258 | * through a recursive call to generic_make_request. If it |
3255 | * did, we find a non-NULL value in bio_list and re-enter the loop | 3259 | * did, we find a non-NULL value in bio_list and re-enter the loop |
3256 | * from the top. In this case we really did just take the bio | 3260 | * from the top. In this case we really did just take the bio |
3257 | * of the top of the list (no pretending) and so fixup bio_list and | 3261 | * of the top of the list (no pretending) and so fixup bio_list and |
3258 | * bio_tail or bi_next, and call into __generic_make_request again. | 3262 | * bio_tail or bi_next, and call into __generic_make_request again. |
3259 | * | 3263 | * |
3260 | * The loop was structured like this to make only one call to | 3264 | * The loop was structured like this to make only one call to |
3261 | * __generic_make_request (which is important as it is large and | 3265 | * __generic_make_request (which is important as it is large and |
3262 | * inlined) and to keep the structure simple. | 3266 | * inlined) and to keep the structure simple. |
3263 | */ | 3267 | */ |
3264 | BUG_ON(bio->bi_next); | 3268 | BUG_ON(bio->bi_next); |
3265 | do { | 3269 | do { |
3266 | current->bio_list = bio->bi_next; | 3270 | current->bio_list = bio->bi_next; |
3267 | if (bio->bi_next == NULL) | 3271 | if (bio->bi_next == NULL) |
3268 | current->bio_tail = &current->bio_list; | 3272 | current->bio_tail = &current->bio_list; |
3269 | else | 3273 | else |
3270 | bio->bi_next = NULL; | 3274 | bio->bi_next = NULL; |
3271 | __generic_make_request(bio); | 3275 | __generic_make_request(bio); |
3272 | bio = current->bio_list; | 3276 | bio = current->bio_list; |
3273 | } while (bio); | 3277 | } while (bio); |
3274 | current->bio_tail = NULL; /* deactivate */ | 3278 | current->bio_tail = NULL; /* deactivate */ |
3275 | } | 3279 | } |
3276 | 3280 | ||
3277 | EXPORT_SYMBOL(generic_make_request); | 3281 | EXPORT_SYMBOL(generic_make_request); |
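To see the comment above in action: suppose a stacked driver's make_request_fn splits an incoming bio and calls generic_make_request() twice for the two halves. Both recursive calls find current->bio_tail non-NULL, append their bio to current->bio_list and return immediately; only when the outer __generic_make_request() finishes does the loop pick the first half off the list and process it, then the second. The call depth therefore stays constant no matter how many layers of stacking are involved, at the cost of handling the deferred bios strictly one at a time.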
3278 | 3282 | ||
3279 | /** | 3283 | /** |
3280 | * submit_bio: submit a bio to the block device layer for I/O | 3284 | * submit_bio: submit a bio to the block device layer for I/O |
3281 | * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) | 3285 | * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) |
3282 | * @bio: The &struct bio which describes the I/O | 3286 | * @bio: The &struct bio which describes the I/O |
3283 | * | 3287 | * |
3284 | * submit_bio() is very similar in purpose to generic_make_request(), and | 3288 | * submit_bio() is very similar in purpose to generic_make_request(), and |
3285 | * uses that function to do most of the work. Both are fairly rough | 3289 | * uses that function to do most of the work. Both are fairly rough |
3286 | * interfaces, @bio must be presetup and ready for I/O. | 3290 | * interfaces, @bio must be presetup and ready for I/O. |
3287 | * | 3291 | * |
3288 | */ | 3292 | */ |
3289 | void submit_bio(int rw, struct bio *bio) | 3293 | void submit_bio(int rw, struct bio *bio) |
3290 | { | 3294 | { |
3291 | int count = bio_sectors(bio); | 3295 | int count = bio_sectors(bio); |
3292 | 3296 | ||
3293 | BIO_BUG_ON(!bio->bi_size); | 3297 | BIO_BUG_ON(!bio->bi_size); |
3294 | BIO_BUG_ON(!bio->bi_io_vec); | 3298 | BIO_BUG_ON(!bio->bi_io_vec); |
3295 | bio->bi_rw |= rw; | 3299 | bio->bi_rw |= rw; |
3296 | if (rw & WRITE) { | 3300 | if (rw & WRITE) { |
3297 | count_vm_events(PGPGOUT, count); | 3301 | count_vm_events(PGPGOUT, count); |
3298 | } else { | 3302 | } else { |
3299 | task_io_account_read(bio->bi_size); | 3303 | task_io_account_read(bio->bi_size); |
3300 | count_vm_events(PGPGIN, count); | 3304 | count_vm_events(PGPGIN, count); |
3301 | } | 3305 | } |
3302 | 3306 | ||
3303 | if (unlikely(block_dump)) { | 3307 | if (unlikely(block_dump)) { |
3304 | char b[BDEVNAME_SIZE]; | 3308 | char b[BDEVNAME_SIZE]; |
3305 | printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", | 3309 | printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", |
3306 | current->comm, current->pid, | 3310 | current->comm, current->pid, |
3307 | (rw & WRITE) ? "WRITE" : "READ", | 3311 | (rw & WRITE) ? "WRITE" : "READ", |
3308 | (unsigned long long)bio->bi_sector, | 3312 | (unsigned long long)bio->bi_sector, |
3309 | bdevname(bio->bi_bdev,b)); | 3313 | bdevname(bio->bi_bdev,b)); |
3310 | } | 3314 | } |
3311 | 3315 | ||
3312 | generic_make_request(bio); | 3316 | generic_make_request(bio); |
3313 | } | 3317 | } |
3314 | 3318 | ||
3315 | EXPORT_SYMBOL(submit_bio); | 3319 | EXPORT_SYMBOL(submit_bio); |
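To tie the submission interfaces together, below is a minimal sketch of building and submitting a single-page read. The helper names are invented; the end_io signature and the "return 1 while bi_size is non-zero" convention follow the three-argument bio_endio() style used throughout this file, and error handling is deliberately thin.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/completion.h>

/* Sketch: synchronously read one page at 'sector' from 'bdev'. */
static int demo_end_io(struct bio *bio, unsigned int bytes_done, int err)
{
	if (bio->bi_size)
		return 1;		/* partial completion, keep waiting */

	complete((struct completion *)bio->bi_private);
	return 0;
}

static int demo_read_page(struct block_device *bdev, sector_t sector,
			  struct page *page)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct bio *bio = bio_alloc(GFP_NOIO, 1);
	int err;

	if (!bio)
		return -ENOMEM;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = demo_end_io;
	bio->bi_private = &done;
	bio_add_page(bio, page, PAGE_SIZE, 0);

	submit_bio(READ, bio);
	wait_for_completion(&done);

	err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
	bio_put(bio);
	return err;
}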
3316 | 3320 | ||
3317 | static void blk_recalc_rq_segments(struct request *rq) | 3321 | static void blk_recalc_rq_segments(struct request *rq) |
3318 | { | 3322 | { |
3319 | struct bio *bio, *prevbio = NULL; | 3323 | struct bio *bio, *prevbio = NULL; |
3320 | int nr_phys_segs, nr_hw_segs; | 3324 | int nr_phys_segs, nr_hw_segs; |
3321 | unsigned int phys_size, hw_size; | 3325 | unsigned int phys_size, hw_size; |
3322 | struct request_queue *q = rq->q; | 3326 | struct request_queue *q = rq->q; |
3323 | 3327 | ||
3324 | if (!rq->bio) | 3328 | if (!rq->bio) |
3325 | return; | 3329 | return; |
3326 | 3330 | ||
3327 | phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0; | 3331 | phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0; |
3328 | rq_for_each_bio(bio, rq) { | 3332 | rq_for_each_bio(bio, rq) { |
3329 | /* Force bio hw/phys segs to be recalculated. */ | 3333 | /* Force bio hw/phys segs to be recalculated. */ |
3330 | bio->bi_flags &= ~(1 << BIO_SEG_VALID); | 3334 | bio->bi_flags &= ~(1 << BIO_SEG_VALID); |
3331 | 3335 | ||
3332 | nr_phys_segs += bio_phys_segments(q, bio); | 3336 | nr_phys_segs += bio_phys_segments(q, bio); |
3333 | nr_hw_segs += bio_hw_segments(q, bio); | 3337 | nr_hw_segs += bio_hw_segments(q, bio); |
3334 | if (prevbio) { | 3338 | if (prevbio) { |
3335 | int pseg = phys_size + prevbio->bi_size + bio->bi_size; | 3339 | int pseg = phys_size + prevbio->bi_size + bio->bi_size; |
3336 | int hseg = hw_size + prevbio->bi_size + bio->bi_size; | 3340 | int hseg = hw_size + prevbio->bi_size + bio->bi_size; |
3337 | 3341 | ||
3338 | if (blk_phys_contig_segment(q, prevbio, bio) && | 3342 | if (blk_phys_contig_segment(q, prevbio, bio) && |
3339 | pseg <= q->max_segment_size) { | 3343 | pseg <= q->max_segment_size) { |
3340 | nr_phys_segs--; | 3344 | nr_phys_segs--; |
3341 | phys_size += prevbio->bi_size + bio->bi_size; | 3345 | phys_size += prevbio->bi_size + bio->bi_size; |
3342 | } else | 3346 | } else |
3343 | phys_size = 0; | 3347 | phys_size = 0; |
3344 | 3348 | ||
3345 | if (blk_hw_contig_segment(q, prevbio, bio) && | 3349 | if (blk_hw_contig_segment(q, prevbio, bio) && |
3346 | hseg <= q->max_segment_size) { | 3350 | hseg <= q->max_segment_size) { |
3347 | nr_hw_segs--; | 3351 | nr_hw_segs--; |
3348 | hw_size += prevbio->bi_size + bio->bi_size; | 3352 | hw_size += prevbio->bi_size + bio->bi_size; |
3349 | } else | 3353 | } else |
3350 | hw_size = 0; | 3354 | hw_size = 0; |
3351 | } | 3355 | } |
3352 | prevbio = bio; | 3356 | prevbio = bio; |
3353 | } | 3357 | } |
3354 | 3358 | ||
3355 | rq->nr_phys_segments = nr_phys_segs; | 3359 | rq->nr_phys_segments = nr_phys_segs; |
3356 | rq->nr_hw_segments = nr_hw_segs; | 3360 | rq->nr_hw_segments = nr_hw_segs; |
3357 | } | 3361 | } |
3358 | 3362 | ||
3359 | static void blk_recalc_rq_sectors(struct request *rq, int nsect) | 3363 | static void blk_recalc_rq_sectors(struct request *rq, int nsect) |
3360 | { | 3364 | { |
3361 | if (blk_fs_request(rq)) { | 3365 | if (blk_fs_request(rq)) { |
3362 | rq->hard_sector += nsect; | 3366 | rq->hard_sector += nsect; |
3363 | rq->hard_nr_sectors -= nsect; | 3367 | rq->hard_nr_sectors -= nsect; |
3364 | 3368 | ||
3365 | /* | 3369 | /* |
3366 | * Move the I/O submission pointers ahead if required. | 3370 | * Move the I/O submission pointers ahead if required. |
3367 | */ | 3371 | */ |
3368 | if ((rq->nr_sectors >= rq->hard_nr_sectors) && | 3372 | if ((rq->nr_sectors >= rq->hard_nr_sectors) && |
3369 | (rq->sector <= rq->hard_sector)) { | 3373 | (rq->sector <= rq->hard_sector)) { |
3370 | rq->sector = rq->hard_sector; | 3374 | rq->sector = rq->hard_sector; |
3371 | rq->nr_sectors = rq->hard_nr_sectors; | 3375 | rq->nr_sectors = rq->hard_nr_sectors; |
3372 | rq->hard_cur_sectors = bio_cur_sectors(rq->bio); | 3376 | rq->hard_cur_sectors = bio_cur_sectors(rq->bio); |
3373 | rq->current_nr_sectors = rq->hard_cur_sectors; | 3377 | rq->current_nr_sectors = rq->hard_cur_sectors; |
3374 | rq->buffer = bio_data(rq->bio); | 3378 | rq->buffer = bio_data(rq->bio); |
3375 | } | 3379 | } |
3376 | 3380 | ||
3377 | /* | 3381 | /* |
3378 | * if total number of sectors is less than the first segment | 3382 | * if total number of sectors is less than the first segment |
3379 | * size, something has gone terribly wrong | 3383 | * size, something has gone terribly wrong |
3380 | */ | 3384 | */ |
3381 | if (rq->nr_sectors < rq->current_nr_sectors) { | 3385 | if (rq->nr_sectors < rq->current_nr_sectors) { |
3382 | printk("blk: request botched\n"); | 3386 | printk("blk: request botched\n"); |
3383 | rq->nr_sectors = rq->current_nr_sectors; | 3387 | rq->nr_sectors = rq->current_nr_sectors; |
3384 | } | 3388 | } |
3385 | } | 3389 | } |
3386 | } | 3390 | } |
3387 | 3391 | ||
3388 | static int __end_that_request_first(struct request *req, int uptodate, | 3392 | static int __end_that_request_first(struct request *req, int uptodate, |
3389 | int nr_bytes) | 3393 | int nr_bytes) |
3390 | { | 3394 | { |
3391 | int total_bytes, bio_nbytes, error, next_idx = 0; | 3395 | int total_bytes, bio_nbytes, error, next_idx = 0; |
3392 | struct bio *bio; | 3396 | struct bio *bio; |
3393 | 3397 | ||
3394 | blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE); | 3398 | blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE); |
3395 | 3399 | ||
3396 | /* | 3400 | /* |
3397 | * extend uptodate bool to allow < 0 value to be direct io error | 3401 | * extend uptodate bool to allow < 0 value to be direct io error |
3398 | */ | 3402 | */ |
3399 | error = 0; | 3403 | error = 0; |
3400 | if (end_io_error(uptodate)) | 3404 | if (end_io_error(uptodate)) |
3401 | error = !uptodate ? -EIO : uptodate; | 3405 | error = !uptodate ? -EIO : uptodate; |
3402 | 3406 | ||
3403 | /* | 3407 | /* |
3404 | * for a REQ_BLOCK_PC request, we want to carry any eventual | 3408 | * for a REQ_BLOCK_PC request, we want to carry any eventual |
3405 | * sense key with us all the way through | 3409 | * sense key with us all the way through |
3406 | */ | 3410 | */ |
3407 | if (!blk_pc_request(req)) | 3411 | if (!blk_pc_request(req)) |
3408 | req->errors = 0; | 3412 | req->errors = 0; |
3409 | 3413 | ||
3410 | if (!uptodate) { | 3414 | if (!uptodate) { |
3411 | if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET)) | 3415 | if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET)) |
3412 | printk("end_request: I/O error, dev %s, sector %llu\n", | 3416 | printk("end_request: I/O error, dev %s, sector %llu\n", |
3413 | req->rq_disk ? req->rq_disk->disk_name : "?", | 3417 | req->rq_disk ? req->rq_disk->disk_name : "?", |
3414 | (unsigned long long)req->sector); | 3418 | (unsigned long long)req->sector); |
3415 | } | 3419 | } |
3416 | 3420 | ||
3417 | if (blk_fs_request(req) && req->rq_disk) { | 3421 | if (blk_fs_request(req) && req->rq_disk) { |
3418 | const int rw = rq_data_dir(req); | 3422 | const int rw = rq_data_dir(req); |
3419 | 3423 | ||
3420 | disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9); | 3424 | disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9); |
3421 | } | 3425 | } |
3422 | 3426 | ||
3423 | total_bytes = bio_nbytes = 0; | 3427 | total_bytes = bio_nbytes = 0; |
3424 | while ((bio = req->bio) != NULL) { | 3428 | while ((bio = req->bio) != NULL) { |
3425 | int nbytes; | 3429 | int nbytes; |
3426 | 3430 | ||
3427 | if (nr_bytes >= bio->bi_size) { | 3431 | if (nr_bytes >= bio->bi_size) { |
3428 | req->bio = bio->bi_next; | 3432 | req->bio = bio->bi_next; |
3429 | nbytes = bio->bi_size; | 3433 | nbytes = bio->bi_size; |
3430 | if (!ordered_bio_endio(req, bio, nbytes, error)) | 3434 | if (!ordered_bio_endio(req, bio, nbytes, error)) |
3431 | bio_endio(bio, nbytes, error); | 3435 | bio_endio(bio, nbytes, error); |
3432 | next_idx = 0; | 3436 | next_idx = 0; |
3433 | bio_nbytes = 0; | 3437 | bio_nbytes = 0; |
3434 | } else { | 3438 | } else { |
3435 | int idx = bio->bi_idx + next_idx; | 3439 | int idx = bio->bi_idx + next_idx; |
3436 | 3440 | ||
3437 | if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { | 3441 | if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { |
3438 | blk_dump_rq_flags(req, "__end_that"); | 3442 | blk_dump_rq_flags(req, "__end_that"); |
3439 | printk("%s: bio idx %d >= vcnt %d\n", | 3443 | printk("%s: bio idx %d >= vcnt %d\n", |
3440 | __FUNCTION__, | 3444 | __FUNCTION__, |
3441 | bio->bi_idx, bio->bi_vcnt); | 3445 | bio->bi_idx, bio->bi_vcnt); |
3442 | break; | 3446 | break; |
3443 | } | 3447 | } |
3444 | 3448 | ||
3445 | nbytes = bio_iovec_idx(bio, idx)->bv_len; | 3449 | nbytes = bio_iovec_idx(bio, idx)->bv_len; |
3446 | BIO_BUG_ON(nbytes > bio->bi_size); | 3450 | BIO_BUG_ON(nbytes > bio->bi_size); |
3447 | 3451 | ||
3448 | /* | 3452 | /* |
3449 | * not a complete bvec done | 3453 | * not a complete bvec done |
3450 | */ | 3454 | */ |
3451 | if (unlikely(nbytes > nr_bytes)) { | 3455 | if (unlikely(nbytes > nr_bytes)) { |
3452 | bio_nbytes += nr_bytes; | 3456 | bio_nbytes += nr_bytes; |
3453 | total_bytes += nr_bytes; | 3457 | total_bytes += nr_bytes; |
3454 | break; | 3458 | break; |
3455 | } | 3459 | } |
3456 | 3460 | ||
3457 | /* | 3461 | /* |
3458 | * advance to the next vector | 3462 | * advance to the next vector |
3459 | */ | 3463 | */ |
3460 | next_idx++; | 3464 | next_idx++; |
3461 | bio_nbytes += nbytes; | 3465 | bio_nbytes += nbytes; |
3462 | } | 3466 | } |
3463 | 3467 | ||
3464 | total_bytes += nbytes; | 3468 | total_bytes += nbytes; |
3465 | nr_bytes -= nbytes; | 3469 | nr_bytes -= nbytes; |
3466 | 3470 | ||
3467 | if ((bio = req->bio)) { | 3471 | if ((bio = req->bio)) { |
3468 | /* | 3472 | /* |
3469 | * end more in this run, or just return 'not-done' | 3473 | * end more in this run, or just return 'not-done' |
3470 | */ | 3474 | */ |
3471 | if (unlikely(nr_bytes <= 0)) | 3475 | if (unlikely(nr_bytes <= 0)) |
3472 | break; | 3476 | break; |
3473 | } | 3477 | } |
3474 | } | 3478 | } |
3475 | 3479 | ||
3476 | /* | 3480 | /* |
3477 | * completely done | 3481 | * completely done |
3478 | */ | 3482 | */ |
3479 | if (!req->bio) | 3483 | if (!req->bio) |
3480 | return 0; | 3484 | return 0; |
3481 | 3485 | ||
3482 | /* | 3486 | /* |
3483 | * if the request wasn't completed, update state | 3487 | * if the request wasn't completed, update state |
3484 | */ | 3488 | */ |
3485 | if (bio_nbytes) { | 3489 | if (bio_nbytes) { |
3486 | if (!ordered_bio_endio(req, bio, bio_nbytes, error)) | 3490 | if (!ordered_bio_endio(req, bio, bio_nbytes, error)) |
3487 | bio_endio(bio, bio_nbytes, error); | 3491 | bio_endio(bio, bio_nbytes, error); |
3488 | bio->bi_idx += next_idx; | 3492 | bio->bi_idx += next_idx; |
3489 | bio_iovec(bio)->bv_offset += nr_bytes; | 3493 | bio_iovec(bio)->bv_offset += nr_bytes; |
3490 | bio_iovec(bio)->bv_len -= nr_bytes; | 3494 | bio_iovec(bio)->bv_len -= nr_bytes; |
3491 | } | 3495 | } |
3492 | 3496 | ||
3493 | blk_recalc_rq_sectors(req, total_bytes >> 9); | 3497 | blk_recalc_rq_sectors(req, total_bytes >> 9); |
3494 | blk_recalc_rq_segments(req); | 3498 | blk_recalc_rq_segments(req); |
3495 | return 1; | 3499 | return 1; |
3496 | } | 3500 | } |
3497 | 3501 | ||
3498 | /** | 3502 | /** |
3499 | * end_that_request_first - end I/O on a request | 3503 | * end_that_request_first - end I/O on a request |
3500 | * @req: the request being processed | 3504 | * @req: the request being processed |
3501 | * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error | 3505 | * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error |
3502 | * @nr_sectors: number of sectors to end I/O on | 3506 | * @nr_sectors: number of sectors to end I/O on |
3503 | * | 3507 | * |
3504 | * Description: | 3508 | * Description: |
3505 | * Ends I/O on a number of sectors attached to @req, and sets it up | 3509 | * Ends I/O on a number of sectors attached to @req, and sets it up |
3506 | * for the next range of segments (if any) in the cluster. | 3510 | * for the next range of segments (if any) in the cluster. |
3507 | * | 3511 | * |
3508 | * Return: | 3512 | * Return: |
3509 | * 0 - we are done with this request, call end_that_request_last() | 3513 | * 0 - we are done with this request, call end_that_request_last() |
3510 | * 1 - still buffers pending for this request | 3514 | * 1 - still buffers pending for this request |
3511 | **/ | 3515 | **/ |
3512 | int end_that_request_first(struct request *req, int uptodate, int nr_sectors) | 3516 | int end_that_request_first(struct request *req, int uptodate, int nr_sectors) |
3513 | { | 3517 | { |
3514 | return __end_that_request_first(req, uptodate, nr_sectors << 9); | 3518 | return __end_that_request_first(req, uptodate, nr_sectors << 9); |
3515 | } | 3519 | } |
3516 | 3520 | ||
3517 | EXPORT_SYMBOL(end_that_request_first); | 3521 | EXPORT_SYMBOL(end_that_request_first); |
3518 | 3522 | ||
3519 | /** | 3523 | /** |
3520 | * end_that_request_chunk - end I/O on a request | 3524 | * end_that_request_chunk - end I/O on a request |
3521 | * @req: the request being processed | 3525 | * @req: the request being processed |
3522 | * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error | 3526 | * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error |
3523 | * @nr_bytes: number of bytes to complete | 3527 | * @nr_bytes: number of bytes to complete |
3524 | * | 3528 | * |
3525 | * Description: | 3529 | * Description: |
3526 | * Ends I/O on a number of bytes attached to @req, and sets it up | 3530 | * Ends I/O on a number of bytes attached to @req, and sets it up |
3527 | * for the next range of segments (if any). Like end_that_request_first(), | 3531 | * for the next range of segments (if any). Like end_that_request_first(), |
3528 | * but deals with bytes instead of sectors. | 3532 | * but deals with bytes instead of sectors. |
3529 | * | 3533 | * |
3530 | * Return: | 3534 | * Return: |
3531 | * 0 - we are done with this request, call end_that_request_last() | 3535 | * 0 - we are done with this request, call end_that_request_last() |
3532 | * 1 - still buffers pending for this request | 3536 | * 1 - still buffers pending for this request |
3533 | **/ | 3537 | **/ |
3534 | int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes) | 3538 | int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes) |
3535 | { | 3539 | { |
3536 | return __end_that_request_first(req, uptodate, nr_bytes); | 3540 | return __end_that_request_first(req, uptodate, nr_bytes); |
3537 | } | 3541 | } |
3538 | 3542 | ||
3539 | EXPORT_SYMBOL(end_that_request_chunk); | 3543 | EXPORT_SYMBOL(end_that_request_chunk); |
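[Editor's note] The two helpers above are meant to be called repeatedly: while they return 1 the request still has data outstanding, and once they return 0 the driver finishes with end_that_request_last(). A minimal sketch of that protocol, assuming a hypothetical driver helper mydrv_complete_bytes() (not part of this patch) that is called with the queue lock held and with the request already dequeued:

static void mydrv_complete_bytes(struct request *rq, int error, unsigned int bytes)
{
	/* uptodate: 1 = success, 0 = I/O error, < 0 = specific error */
	int uptodate = error ? error : 1;

	if (end_that_request_chunk(rq, uptodate, bytes))
		return;			/* part of the request is still pending */

	/*
	 * Everything completed. Assumes the caller holds rq->q->queue_lock,
	 * as end_that_request_last() requires, and dequeued rq earlier.
	 */
	add_disk_randomness(rq->rq_disk);
	end_that_request_last(rq, uptodate);
}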
3540 | 3544 | ||
3541 | /* | 3545 | /* |
3542 | * splice the completion data to a local structure and hand off to | 3546 | * splice the completion data to a local structure and hand off to |
3543 | * the queue's softirq_done_fn() to complete the requests | 3547 | * the queue's softirq_done_fn() to complete the requests |
3544 | */ | 3548 | */ |
3545 | static void blk_done_softirq(struct softirq_action *h) | 3549 | static void blk_done_softirq(struct softirq_action *h) |
3546 | { | 3550 | { |
3547 | struct list_head *cpu_list, local_list; | 3551 | struct list_head *cpu_list, local_list; |
3548 | 3552 | ||
3549 | local_irq_disable(); | 3553 | local_irq_disable(); |
3550 | cpu_list = &__get_cpu_var(blk_cpu_done); | 3554 | cpu_list = &__get_cpu_var(blk_cpu_done); |
3551 | list_replace_init(cpu_list, &local_list); | 3555 | list_replace_init(cpu_list, &local_list); |
3552 | local_irq_enable(); | 3556 | local_irq_enable(); |
3553 | 3557 | ||
3554 | while (!list_empty(&local_list)) { | 3558 | while (!list_empty(&local_list)) { |
3555 | struct request *rq = list_entry(local_list.next, struct request, donelist); | 3559 | struct request *rq = list_entry(local_list.next, struct request, donelist); |
3556 | 3560 | ||
3557 | list_del_init(&rq->donelist); | 3561 | list_del_init(&rq->donelist); |
3558 | rq->q->softirq_done_fn(rq); | 3562 | rq->q->softirq_done_fn(rq); |
3559 | } | 3563 | } |
3560 | } | 3564 | } |
3561 | 3565 | ||
3562 | static int blk_cpu_notify(struct notifier_block *self, unsigned long action, | 3566 | static int blk_cpu_notify(struct notifier_block *self, unsigned long action, |
3563 | void *hcpu) | 3567 | void *hcpu) |
3564 | { | 3568 | { |
3565 | /* | 3569 | /* |
3566 | * If a CPU goes away, splice its entries to the current CPU | 3570 | * If a CPU goes away, splice its entries to the current CPU |
3567 | * and trigger a run of the softirq | 3571 | * and trigger a run of the softirq |
3568 | */ | 3572 | */ |
3569 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | 3573 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
3570 | int cpu = (unsigned long) hcpu; | 3574 | int cpu = (unsigned long) hcpu; |
3571 | 3575 | ||
3572 | local_irq_disable(); | 3576 | local_irq_disable(); |
3573 | list_splice_init(&per_cpu(blk_cpu_done, cpu), | 3577 | list_splice_init(&per_cpu(blk_cpu_done, cpu), |
3574 | &__get_cpu_var(blk_cpu_done)); | 3578 | &__get_cpu_var(blk_cpu_done)); |
3575 | raise_softirq_irqoff(BLOCK_SOFTIRQ); | 3579 | raise_softirq_irqoff(BLOCK_SOFTIRQ); |
3576 | local_irq_enable(); | 3580 | local_irq_enable(); |
3577 | } | 3581 | } |
3578 | 3582 | ||
3579 | return NOTIFY_OK; | 3583 | return NOTIFY_OK; |
3580 | } | 3584 | } |
3581 | 3585 | ||
3582 | 3586 | ||
3583 | static struct notifier_block __devinitdata blk_cpu_notifier = { | 3587 | static struct notifier_block __devinitdata blk_cpu_notifier = { |
3584 | .notifier_call = blk_cpu_notify, | 3588 | .notifier_call = blk_cpu_notify, |
3585 | }; | 3589 | }; |
3586 | 3590 | ||
3587 | /** | 3591 | /** |
3588 | * blk_complete_request - end I/O on a request | 3592 | * blk_complete_request - end I/O on a request |
3589 | * @req: the request being processed | 3593 | * @req: the request being processed |
3590 | * | 3594 | * |
3591 | * Description: | 3595 | * Description: |
3592 | * Ends all I/O on a request. It does not handle partial completions, | 3596 | * Ends all I/O on a request. It does not handle partial completions, |
3593 | * unless the driver actually implements this in its completion callback | 3597 | * unless the driver actually implements this in its completion callback |
3594 | * through requeueing. The actual completion happens out-of-order, | 3598 | * through requeueing. The actual completion happens out-of-order, |
3595 | * through a softirq handler. The user must have registered a completion | 3599 | * through a softirq handler. The user must have registered a completion |
3596 | * callback through blk_queue_softirq_done(). | 3600 | * callback through blk_queue_softirq_done(). |
3597 | **/ | 3601 | **/ |
3598 | 3602 | ||
3599 | void blk_complete_request(struct request *req) | 3603 | void blk_complete_request(struct request *req) |
3600 | { | 3604 | { |
3601 | struct list_head *cpu_list; | 3605 | struct list_head *cpu_list; |
3602 | unsigned long flags; | 3606 | unsigned long flags; |
3603 | 3607 | ||
3604 | BUG_ON(!req->q->softirq_done_fn); | 3608 | BUG_ON(!req->q->softirq_done_fn); |
3605 | 3609 | ||
3606 | local_irq_save(flags); | 3610 | local_irq_save(flags); |
3607 | 3611 | ||
3608 | cpu_list = &__get_cpu_var(blk_cpu_done); | 3612 | cpu_list = &__get_cpu_var(blk_cpu_done); |
3609 | list_add_tail(&req->donelist, cpu_list); | 3613 | list_add_tail(&req->donelist, cpu_list); |
3610 | raise_softirq_irqoff(BLOCK_SOFTIRQ); | 3614 | raise_softirq_irqoff(BLOCK_SOFTIRQ); |
3611 | 3615 | ||
3612 | local_irq_restore(flags); | 3616 | local_irq_restore(flags); |
3613 | } | 3617 | } |
3614 | 3618 | ||
3615 | EXPORT_SYMBOL(blk_complete_request); | 3619 | EXPORT_SYMBOL(blk_complete_request); |
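[Editor's note] To use this path a driver registers its completion callback at queue setup time and then calls blk_complete_request() from its interrupt handler; the real end-of-request work runs later in BLOCK_SOFTIRQ context. A hedged sketch, with the mydrv_* names purely illustrative:

static void mydrv_softirq_done(struct request *rq)
{
	struct request_queue *q = rq->q;
	unsigned long flags;
	int uptodate = rq->errors ? -EIO : 1;

	/* runs in BLOCK_SOFTIRQ context, not in the hardware interrupt */
	if (!end_that_request_first(rq, uptodate, rq->hard_nr_sectors)) {
		spin_lock_irqsave(q->queue_lock, flags);
		end_that_request_last(rq, uptodate);	/* needs the queue lock */
		spin_unlock_irqrestore(q->queue_lock, flags);
	}
}

static void mydrv_init_queue(struct request_queue *q)
{
	/* register the callback that blk_complete_request() will invoke */
	blk_queue_softirq_done(q, mydrv_softirq_done);
}

static void mydrv_hw_irq_done(struct request *rq)
{
	/* from the interrupt handler, once the hardware reports completion */
	blk_complete_request(rq);
}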
3616 | 3620 | ||
3617 | /* | 3621 | /* |
3618 | * queue lock must be held | 3622 | * queue lock must be held |
3619 | */ | 3623 | */ |
3620 | void end_that_request_last(struct request *req, int uptodate) | 3624 | void end_that_request_last(struct request *req, int uptodate) |
3621 | { | 3625 | { |
3622 | struct gendisk *disk = req->rq_disk; | 3626 | struct gendisk *disk = req->rq_disk; |
3623 | int error; | 3627 | int error; |
3624 | 3628 | ||
3625 | /* | 3629 | /* |
3626 | * extend uptodate bool to allow < 0 value to be direct io error | 3630 | * extend uptodate bool to allow < 0 value to be direct io error |
3627 | */ | 3631 | */ |
3628 | error = 0; | 3632 | error = 0; |
3629 | if (end_io_error(uptodate)) | 3633 | if (end_io_error(uptodate)) |
3630 | error = !uptodate ? -EIO : uptodate; | 3634 | error = !uptodate ? -EIO : uptodate; |
3631 | 3635 | ||
3632 | if (unlikely(laptop_mode) && blk_fs_request(req)) | 3636 | if (unlikely(laptop_mode) && blk_fs_request(req)) |
3633 | laptop_io_completion(); | 3637 | laptop_io_completion(); |
3634 | 3638 | ||
3635 | /* | 3639 | /* |
3636 | * Account IO completion. bar_rq isn't accounted as a normal | 3640 | * Account IO completion. bar_rq isn't accounted as a normal |
3637 | * IO either on queueing or on completion. Accounting the containing | 3641 | * IO either on queueing or on completion. Accounting the containing |
3638 | * request is enough. | 3642 | * request is enough. |
3639 | */ | 3643 | */ |
3640 | if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { | 3644 | if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { |
3641 | unsigned long duration = jiffies - req->start_time; | 3645 | unsigned long duration = jiffies - req->start_time; |
3642 | const int rw = rq_data_dir(req); | 3646 | const int rw = rq_data_dir(req); |
3643 | 3647 | ||
3644 | __disk_stat_inc(disk, ios[rw]); | 3648 | __disk_stat_inc(disk, ios[rw]); |
3645 | __disk_stat_add(disk, ticks[rw], duration); | 3649 | __disk_stat_add(disk, ticks[rw], duration); |
3646 | disk_round_stats(disk); | 3650 | disk_round_stats(disk); |
3647 | disk->in_flight--; | 3651 | disk->in_flight--; |
3648 | } | 3652 | } |
3649 | if (req->end_io) | 3653 | if (req->end_io) |
3650 | req->end_io(req, error); | 3654 | req->end_io(req, error); |
3651 | else | 3655 | else |
3652 | __blk_put_request(req->q, req); | 3656 | __blk_put_request(req->q, req); |
3653 | } | 3657 | } |
3654 | 3658 | ||
3655 | EXPORT_SYMBOL(end_that_request_last); | 3659 | EXPORT_SYMBOL(end_that_request_last); |
3656 | 3660 | ||
3657 | void end_request(struct request *req, int uptodate) | 3661 | void end_request(struct request *req, int uptodate) |
3658 | { | 3662 | { |
3659 | if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) { | 3663 | if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) { |
3660 | add_disk_randomness(req->rq_disk); | 3664 | add_disk_randomness(req->rq_disk); |
3661 | blkdev_dequeue_request(req); | 3665 | blkdev_dequeue_request(req); |
3662 | end_that_request_last(req, uptodate); | 3666 | end_that_request_last(req, uptodate); |
3663 | } | 3667 | } |
3664 | } | 3668 | } |
3665 | 3669 | ||
3666 | EXPORT_SYMBOL(end_request); | 3670 | EXPORT_SYMBOL(end_request); |
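[Editor's note] end_request() is the convenience form used by simple request_fn drivers of this era: it completes the current segment (hard_cur_sectors) and, once the whole request is done, dequeues it and calls end_that_request_last(). A sketch of the classic loop, assuming a hypothetical mydrv_request() registered via blk_init_queue() so that it runs with the queue lock held:

static void mydrv_request(struct request_queue *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		if (!blk_fs_request(rq)) {
			end_request(rq, 0);	/* fail non-fs requests */
			continue;
		}
		/*
		 * ... transfer rq->current_nr_sectors sectors starting at
		 * rq->sector to/from rq->buffer here ...
		 */
		end_request(rq, 1);		/* completes hard_cur_sectors */
	}
}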
3667 | 3671 | ||
3668 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, | 3672 | void blk_rq_bio_prep(struct request_queue *q, struct request *rq, |
3669 | struct bio *bio) | 3673 | struct bio *bio) |
3670 | { | 3674 | { |
3671 | /* first two bits are identical in rq->cmd_flags and bio->bi_rw */ | 3675 | /* first two bits are identical in rq->cmd_flags and bio->bi_rw */ |
3672 | rq->cmd_flags |= (bio->bi_rw & 3); | 3676 | rq->cmd_flags |= (bio->bi_rw & 3); |
3673 | 3677 | ||
3674 | rq->nr_phys_segments = bio_phys_segments(q, bio); | 3678 | rq->nr_phys_segments = bio_phys_segments(q, bio); |
3675 | rq->nr_hw_segments = bio_hw_segments(q, bio); | 3679 | rq->nr_hw_segments = bio_hw_segments(q, bio); |
3676 | rq->current_nr_sectors = bio_cur_sectors(bio); | 3680 | rq->current_nr_sectors = bio_cur_sectors(bio); |
3677 | rq->hard_cur_sectors = rq->current_nr_sectors; | 3681 | rq->hard_cur_sectors = rq->current_nr_sectors; |
3678 | rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); | 3682 | rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); |
3679 | rq->buffer = bio_data(bio); | 3683 | rq->buffer = bio_data(bio); |
3680 | rq->data_len = bio->bi_size; | 3684 | rq->data_len = bio->bi_size; |
3681 | 3685 | ||
3682 | rq->bio = rq->biotail = bio; | 3686 | rq->bio = rq->biotail = bio; |
3683 | } | 3687 | } |
3684 | 3688 | ||
3685 | EXPORT_SYMBOL(blk_rq_bio_prep); | 3689 | EXPORT_SYMBOL(blk_rq_bio_prep); |
3686 | 3690 | ||
3687 | int kblockd_schedule_work(struct work_struct *work) | 3691 | int kblockd_schedule_work(struct work_struct *work) |
3688 | { | 3692 | { |
3689 | return queue_work(kblockd_workqueue, work); | 3693 | return queue_work(kblockd_workqueue, work); |
3690 | } | 3694 | } |
3691 | 3695 | ||
3692 | EXPORT_SYMBOL(kblockd_schedule_work); | 3696 | EXPORT_SYMBOL(kblockd_schedule_work); |
3693 | 3697 | ||
3694 | void kblockd_flush_work(struct work_struct *work) | 3698 | void kblockd_flush_work(struct work_struct *work) |
3695 | { | 3699 | { |
3696 | cancel_work_sync(work); | 3700 | cancel_work_sync(work); |
3697 | } | 3701 | } |
3698 | EXPORT_SYMBOL(kblockd_flush_work); | 3702 | EXPORT_SYMBOL(kblockd_flush_work); |
3699 | 3703 | ||
3700 | int __init blk_dev_init(void) | 3704 | int __init blk_dev_init(void) |
3701 | { | 3705 | { |
3702 | int i; | 3706 | int i; |
3703 | 3707 | ||
3704 | kblockd_workqueue = create_workqueue("kblockd"); | 3708 | kblockd_workqueue = create_workqueue("kblockd"); |
3705 | if (!kblockd_workqueue) | 3709 | if (!kblockd_workqueue) |
3706 | panic("Failed to create kblockd\n"); | 3710 | panic("Failed to create kblockd\n"); |
3707 | 3711 | ||
3708 | request_cachep = kmem_cache_create("blkdev_requests", | 3712 | request_cachep = kmem_cache_create("blkdev_requests", |
3709 | sizeof(struct request), 0, SLAB_PANIC, NULL); | 3713 | sizeof(struct request), 0, SLAB_PANIC, NULL); |
3710 | 3714 | ||
3711 | requestq_cachep = kmem_cache_create("blkdev_queue", | 3715 | requestq_cachep = kmem_cache_create("blkdev_queue", |
3712 | sizeof(struct request_queue), 0, SLAB_PANIC, NULL); | 3716 | sizeof(struct request_queue), 0, SLAB_PANIC, NULL); |
3713 | 3717 | ||
3714 | iocontext_cachep = kmem_cache_create("blkdev_ioc", | 3718 | iocontext_cachep = kmem_cache_create("blkdev_ioc", |
3715 | sizeof(struct io_context), 0, SLAB_PANIC, NULL); | 3719 | sizeof(struct io_context), 0, SLAB_PANIC, NULL); |
3716 | 3720 | ||
3717 | for_each_possible_cpu(i) | 3721 | for_each_possible_cpu(i) |
3718 | INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); | 3722 | INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); |
3719 | 3723 | ||
3720 | open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); | 3724 | open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); |
3721 | register_hotcpu_notifier(&blk_cpu_notifier); | 3725 | register_hotcpu_notifier(&blk_cpu_notifier); |
3722 | 3726 | ||
3723 | blk_max_low_pfn = max_low_pfn - 1; | 3727 | blk_max_low_pfn = max_low_pfn - 1; |
3724 | blk_max_pfn = max_pfn - 1; | 3728 | blk_max_pfn = max_pfn - 1; |
3725 | 3729 | ||
3726 | return 0; | 3730 | return 0; |
3727 | } | 3731 | } |
3728 | 3732 | ||
3729 | /* | 3733 | /* |
3730 | * IO Context helper functions | 3734 | * IO Context helper functions |
3731 | */ | 3735 | */ |
3732 | void put_io_context(struct io_context *ioc) | 3736 | void put_io_context(struct io_context *ioc) |
3733 | { | 3737 | { |
3734 | if (ioc == NULL) | 3738 | if (ioc == NULL) |
3735 | return; | 3739 | return; |
3736 | 3740 | ||
3737 | BUG_ON(atomic_read(&ioc->refcount) == 0); | 3741 | BUG_ON(atomic_read(&ioc->refcount) == 0); |
3738 | 3742 | ||
3739 | if (atomic_dec_and_test(&ioc->refcount)) { | 3743 | if (atomic_dec_and_test(&ioc->refcount)) { |
3740 | struct cfq_io_context *cic; | 3744 | struct cfq_io_context *cic; |
3741 | 3745 | ||
3742 | rcu_read_lock(); | 3746 | rcu_read_lock(); |
3743 | if (ioc->aic && ioc->aic->dtor) | 3747 | if (ioc->aic && ioc->aic->dtor) |
3744 | ioc->aic->dtor(ioc->aic); | 3748 | ioc->aic->dtor(ioc->aic); |
3745 | if (ioc->cic_root.rb_node != NULL) { | 3749 | if (ioc->cic_root.rb_node != NULL) { |
3746 | struct rb_node *n = rb_first(&ioc->cic_root); | 3750 | struct rb_node *n = rb_first(&ioc->cic_root); |
3747 | 3751 | ||
3748 | cic = rb_entry(n, struct cfq_io_context, rb_node); | 3752 | cic = rb_entry(n, struct cfq_io_context, rb_node); |
3749 | cic->dtor(ioc); | 3753 | cic->dtor(ioc); |
3750 | } | 3754 | } |
3751 | rcu_read_unlock(); | 3755 | rcu_read_unlock(); |
3752 | 3756 | ||
3753 | kmem_cache_free(iocontext_cachep, ioc); | 3757 | kmem_cache_free(iocontext_cachep, ioc); |
3754 | } | 3758 | } |
3755 | } | 3759 | } |
3756 | EXPORT_SYMBOL(put_io_context); | 3760 | EXPORT_SYMBOL(put_io_context); |
3757 | 3761 | ||
3758 | /* Called by the exiting task */ | 3762 | /* Called by the exiting task */ |
3759 | void exit_io_context(void) | 3763 | void exit_io_context(void) |
3760 | { | 3764 | { |
3761 | struct io_context *ioc; | 3765 | struct io_context *ioc; |
3762 | struct cfq_io_context *cic; | 3766 | struct cfq_io_context *cic; |
3763 | 3767 | ||
3764 | task_lock(current); | 3768 | task_lock(current); |
3765 | ioc = current->io_context; | 3769 | ioc = current->io_context; |
3766 | current->io_context = NULL; | 3770 | current->io_context = NULL; |
3767 | task_unlock(current); | 3771 | task_unlock(current); |
3768 | 3772 | ||
3769 | ioc->task = NULL; | 3773 | ioc->task = NULL; |
3770 | if (ioc->aic && ioc->aic->exit) | 3774 | if (ioc->aic && ioc->aic->exit) |
3771 | ioc->aic->exit(ioc->aic); | 3775 | ioc->aic->exit(ioc->aic); |
3772 | if (ioc->cic_root.rb_node != NULL) { | 3776 | if (ioc->cic_root.rb_node != NULL) { |
3773 | cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node); | 3777 | cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node); |
3774 | cic->exit(ioc); | 3778 | cic->exit(ioc); |
3775 | } | 3779 | } |
3776 | 3780 | ||
3777 | put_io_context(ioc); | 3781 | put_io_context(ioc); |
3778 | } | 3782 | } |
3779 | 3783 | ||
3780 | /* | 3784 | /* |
3781 | * If the current task has no IO context then create one and initialise it. | 3785 | * If the current task has no IO context then create one and initialise it. |
3782 | * Otherwise, return its existing IO context. | 3786 | * Otherwise, return its existing IO context. |
3783 | * | 3787 | * |
3784 | * This returned IO context doesn't have a specifically elevated refcount, | 3788 | * This returned IO context doesn't have a specifically elevated refcount, |
3785 | * but since the current task itself holds a reference, the context can be | 3789 | * but since the current task itself holds a reference, the context can be |
3786 | * used in general code, so long as it stays within `current` context. | 3790 | * used in general code, so long as it stays within `current` context. |
3787 | */ | 3791 | */ |
3788 | static struct io_context *current_io_context(gfp_t gfp_flags, int node) | 3792 | static struct io_context *current_io_context(gfp_t gfp_flags, int node) |
3789 | { | 3793 | { |
3790 | struct task_struct *tsk = current; | 3794 | struct task_struct *tsk = current; |
3791 | struct io_context *ret; | 3795 | struct io_context *ret; |
3792 | 3796 | ||
3793 | ret = tsk->io_context; | 3797 | ret = tsk->io_context; |
3794 | if (likely(ret)) | 3798 | if (likely(ret)) |
3795 | return ret; | 3799 | return ret; |
3796 | 3800 | ||
3797 | ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); | 3801 | ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); |
3798 | if (ret) { | 3802 | if (ret) { |
3799 | atomic_set(&ret->refcount, 1); | 3803 | atomic_set(&ret->refcount, 1); |
3800 | ret->task = current; | 3804 | ret->task = current; |
3801 | ret->ioprio_changed = 0; | 3805 | ret->ioprio_changed = 0; |
3802 | ret->last_waited = jiffies; /* doesn't matter... */ | 3806 | ret->last_waited = jiffies; /* doesn't matter... */ |
3803 | ret->nr_batch_requests = 0; /* because this is 0 */ | 3807 | ret->nr_batch_requests = 0; /* because this is 0 */ |
3804 | ret->aic = NULL; | 3808 | ret->aic = NULL; |
3805 | ret->cic_root.rb_node = NULL; | 3809 | ret->cic_root.rb_node = NULL; |
3806 | ret->ioc_data = NULL; | 3810 | ret->ioc_data = NULL; |
3807 | /* make sure set_task_ioprio() sees the settings above */ | 3811 | /* make sure set_task_ioprio() sees the settings above */ |
3808 | smp_wmb(); | 3812 | smp_wmb(); |
3809 | tsk->io_context = ret; | 3813 | tsk->io_context = ret; |
3810 | } | 3814 | } |
3811 | 3815 | ||
3812 | return ret; | 3816 | return ret; |
3813 | } | 3817 | } |
3814 | 3818 | ||
3815 | /* | 3819 | /* |
3816 | * If the current task has no IO context then create one and initialise it. | 3820 | * If the current task has no IO context then create one and initialise it. |
3817 | * If it does have a context, take a ref on it. | 3821 | * If it does have a context, take a ref on it. |
3818 | * | 3822 | * |
3819 | * This is always called in the context of the task which submitted the I/O. | 3823 | * This is always called in the context of the task which submitted the I/O. |
3820 | */ | 3824 | */ |
3821 | struct io_context *get_io_context(gfp_t gfp_flags, int node) | 3825 | struct io_context *get_io_context(gfp_t gfp_flags, int node) |
3822 | { | 3826 | { |
3823 | struct io_context *ret; | 3827 | struct io_context *ret; |
3824 | ret = current_io_context(gfp_flags, node); | 3828 | ret = current_io_context(gfp_flags, node); |
3825 | if (likely(ret)) | 3829 | if (likely(ret)) |
3826 | atomic_inc(&ret->refcount); | 3830 | atomic_inc(&ret->refcount); |
3827 | return ret; | 3831 | return ret; |
3828 | } | 3832 | } |
3829 | EXPORT_SYMBOL(get_io_context); | 3833 | EXPORT_SYMBOL(get_io_context); |
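[Editor's note] The difference between the two helpers matters for lifetime: current_io_context() leans on the task's own reference, while get_io_context() bumps the refcount and must be balanced with put_io_context(). A small sketch of the get/put pairing, using an illustrative my_private structure that is not part of this file:

struct my_private {
	struct io_context *ioc;
};

static void my_attach_ioc(struct my_private *p, struct request_queue *q)
{
	/* takes a reference (allocating an io_context if the task has none) */
	p->ioc = get_io_context(GFP_ATOMIC, q->node);
}

static void my_detach_ioc(struct my_private *p)
{
	put_io_context(p->ioc);		/* drop the reference; NULL is handled */
	p->ioc = NULL;
}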
3830 | 3834 | ||
3831 | void copy_io_context(struct io_context **pdst, struct io_context **psrc) | 3835 | void copy_io_context(struct io_context **pdst, struct io_context **psrc) |
3832 | { | 3836 | { |
3833 | struct io_context *src = *psrc; | 3837 | struct io_context *src = *psrc; |
3834 | struct io_context *dst = *pdst; | 3838 | struct io_context *dst = *pdst; |
3835 | 3839 | ||
3836 | if (src) { | 3840 | if (src) { |
3837 | BUG_ON(atomic_read(&src->refcount) == 0); | 3841 | BUG_ON(atomic_read(&src->refcount) == 0); |
3838 | atomic_inc(&src->refcount); | 3842 | atomic_inc(&src->refcount); |
3839 | put_io_context(dst); | 3843 | put_io_context(dst); |
3840 | *pdst = src; | 3844 | *pdst = src; |
3841 | } | 3845 | } |
3842 | } | 3846 | } |
3843 | EXPORT_SYMBOL(copy_io_context); | 3847 | EXPORT_SYMBOL(copy_io_context); |
3844 | 3848 | ||
3845 | void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) | 3849 | void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) |
3846 | { | 3850 | { |
3847 | struct io_context *temp; | 3851 | struct io_context *temp; |
3848 | temp = *ioc1; | 3852 | temp = *ioc1; |
3849 | *ioc1 = *ioc2; | 3853 | *ioc1 = *ioc2; |
3850 | *ioc2 = temp; | 3854 | *ioc2 = temp; |
3851 | } | 3855 | } |
3852 | EXPORT_SYMBOL(swap_io_context); | 3856 | EXPORT_SYMBOL(swap_io_context); |
3853 | 3857 | ||
3854 | /* | 3858 | /* |
3855 | * sysfs parts below | 3859 | * sysfs parts below |
3856 | */ | 3860 | */ |
3857 | struct queue_sysfs_entry { | 3861 | struct queue_sysfs_entry { |
3858 | struct attribute attr; | 3862 | struct attribute attr; |
3859 | ssize_t (*show)(struct request_queue *, char *); | 3863 | ssize_t (*show)(struct request_queue *, char *); |
3860 | ssize_t (*store)(struct request_queue *, const char *, size_t); | 3864 | ssize_t (*store)(struct request_queue *, const char *, size_t); |
3861 | }; | 3865 | }; |
3862 | 3866 | ||
3863 | static ssize_t | 3867 | static ssize_t |
3864 | queue_var_show(unsigned int var, char *page) | 3868 | queue_var_show(unsigned int var, char *page) |
3865 | { | 3869 | { |
3866 | return sprintf(page, "%d\n", var); | 3870 | return sprintf(page, "%d\n", var); |
3867 | } | 3871 | } |
3868 | 3872 | ||
3869 | static ssize_t | 3873 | static ssize_t |
3870 | queue_var_store(unsigned long *var, const char *page, size_t count) | 3874 | queue_var_store(unsigned long *var, const char *page, size_t count) |
3871 | { | 3875 | { |
3872 | char *p = (char *) page; | 3876 | char *p = (char *) page; |
3873 | 3877 | ||
3874 | *var = simple_strtoul(p, &p, 10); | 3878 | *var = simple_strtoul(p, &p, 10); |
3875 | return count; | 3879 | return count; |
3876 | } | 3880 | } |
3877 | 3881 | ||
3878 | static ssize_t queue_requests_show(struct request_queue *q, char *page) | 3882 | static ssize_t queue_requests_show(struct request_queue *q, char *page) |
3879 | { | 3883 | { |
3880 | return queue_var_show(q->nr_requests, (page)); | 3884 | return queue_var_show(q->nr_requests, (page)); |
3881 | } | 3885 | } |
3882 | 3886 | ||
3883 | static ssize_t | 3887 | static ssize_t |
3884 | queue_requests_store(struct request_queue *q, const char *page, size_t count) | 3888 | queue_requests_store(struct request_queue *q, const char *page, size_t count) |
3885 | { | 3889 | { |
3886 | struct request_list *rl = &q->rq; | 3890 | struct request_list *rl = &q->rq; |
3887 | unsigned long nr; | 3891 | unsigned long nr; |
3888 | int ret = queue_var_store(&nr, page, count); | 3892 | int ret = queue_var_store(&nr, page, count); |
3889 | if (nr < BLKDEV_MIN_RQ) | 3893 | if (nr < BLKDEV_MIN_RQ) |
3890 | nr = BLKDEV_MIN_RQ; | 3894 | nr = BLKDEV_MIN_RQ; |
3891 | 3895 | ||
3892 | spin_lock_irq(q->queue_lock); | 3896 | spin_lock_irq(q->queue_lock); |
3893 | q->nr_requests = nr; | 3897 | q->nr_requests = nr; |
3894 | blk_queue_congestion_threshold(q); | 3898 | blk_queue_congestion_threshold(q); |
3895 | 3899 | ||
3896 | if (rl->count[READ] >= queue_congestion_on_threshold(q)) | 3900 | if (rl->count[READ] >= queue_congestion_on_threshold(q)) |
3897 | blk_set_queue_congested(q, READ); | 3901 | blk_set_queue_congested(q, READ); |
3898 | else if (rl->count[READ] < queue_congestion_off_threshold(q)) | 3902 | else if (rl->count[READ] < queue_congestion_off_threshold(q)) |
3899 | blk_clear_queue_congested(q, READ); | 3903 | blk_clear_queue_congested(q, READ); |
3900 | 3904 | ||
3901 | if (rl->count[WRITE] >= queue_congestion_on_threshold(q)) | 3905 | if (rl->count[WRITE] >= queue_congestion_on_threshold(q)) |
3902 | blk_set_queue_congested(q, WRITE); | 3906 | blk_set_queue_congested(q, WRITE); |
3903 | else if (rl->count[WRITE] < queue_congestion_off_threshold(q)) | 3907 | else if (rl->count[WRITE] < queue_congestion_off_threshold(q)) |
3904 | blk_clear_queue_congested(q, WRITE); | 3908 | blk_clear_queue_congested(q, WRITE); |
3905 | 3909 | ||
3906 | if (rl->count[READ] >= q->nr_requests) { | 3910 | if (rl->count[READ] >= q->nr_requests) { |
3907 | blk_set_queue_full(q, READ); | 3911 | blk_set_queue_full(q, READ); |
3908 | } else if (rl->count[READ]+1 <= q->nr_requests) { | 3912 | } else if (rl->count[READ]+1 <= q->nr_requests) { |
3909 | blk_clear_queue_full(q, READ); | 3913 | blk_clear_queue_full(q, READ); |
3910 | wake_up(&rl->wait[READ]); | 3914 | wake_up(&rl->wait[READ]); |
3911 | } | 3915 | } |
3912 | 3916 | ||
3913 | if (rl->count[WRITE] >= q->nr_requests) { | 3917 | if (rl->count[WRITE] >= q->nr_requests) { |
3914 | blk_set_queue_full(q, WRITE); | 3918 | blk_set_queue_full(q, WRITE); |
3915 | } else if (rl->count[WRITE]+1 <= q->nr_requests) { | 3919 | } else if (rl->count[WRITE]+1 <= q->nr_requests) { |
3916 | blk_clear_queue_full(q, WRITE); | 3920 | blk_clear_queue_full(q, WRITE); |
3917 | wake_up(&rl->wait[WRITE]); | 3921 | wake_up(&rl->wait[WRITE]); |
3918 | } | 3922 | } |
3919 | spin_unlock_irq(q->queue_lock); | 3923 | spin_unlock_irq(q->queue_lock); |
3920 | return ret; | 3924 | return ret; |
3921 | } | 3925 | } |
3922 | 3926 | ||
3923 | static ssize_t queue_ra_show(struct request_queue *q, char *page) | 3927 | static ssize_t queue_ra_show(struct request_queue *q, char *page) |
3924 | { | 3928 | { |
3925 | int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); | 3929 | int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); |
3926 | 3930 | ||
3927 | return queue_var_show(ra_kb, (page)); | 3931 | return queue_var_show(ra_kb, (page)); |
3928 | } | 3932 | } |
3929 | 3933 | ||
3930 | static ssize_t | 3934 | static ssize_t |
3931 | queue_ra_store(struct request_queue *q, const char *page, size_t count) | 3935 | queue_ra_store(struct request_queue *q, const char *page, size_t count) |
3932 | { | 3936 | { |
3933 | unsigned long ra_kb; | 3937 | unsigned long ra_kb; |
3934 | ssize_t ret = queue_var_store(&ra_kb, page, count); | 3938 | ssize_t ret = queue_var_store(&ra_kb, page, count); |
3935 | 3939 | ||
3936 | spin_lock_irq(q->queue_lock); | 3940 | spin_lock_irq(q->queue_lock); |
3937 | q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); | 3941 | q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); |
3938 | spin_unlock_irq(q->queue_lock); | 3942 | spin_unlock_irq(q->queue_lock); |
3939 | 3943 | ||
3940 | return ret; | 3944 | return ret; |
3941 | } | 3945 | } |
3942 | 3946 | ||
3943 | static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) | 3947 | static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) |
3944 | { | 3948 | { |
3945 | int max_sectors_kb = q->max_sectors >> 1; | 3949 | int max_sectors_kb = q->max_sectors >> 1; |
3946 | 3950 | ||
3947 | return queue_var_show(max_sectors_kb, (page)); | 3951 | return queue_var_show(max_sectors_kb, (page)); |
3948 | } | 3952 | } |
3949 | 3953 | ||
3950 | static ssize_t | 3954 | static ssize_t |
3951 | queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) | 3955 | queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) |
3952 | { | 3956 | { |
3953 | unsigned long max_sectors_kb, | 3957 | unsigned long max_sectors_kb, |
3954 | max_hw_sectors_kb = q->max_hw_sectors >> 1, | 3958 | max_hw_sectors_kb = q->max_hw_sectors >> 1, |
3955 | page_kb = 1 << (PAGE_CACHE_SHIFT - 10); | 3959 | page_kb = 1 << (PAGE_CACHE_SHIFT - 10); |
3956 | ssize_t ret = queue_var_store(&max_sectors_kb, page, count); | 3960 | ssize_t ret = queue_var_store(&max_sectors_kb, page, count); |
3957 | int ra_kb; | 3961 | int ra_kb; |
3958 | 3962 | ||
3959 | if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) | 3963 | if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) |
3960 | return -EINVAL; | 3964 | return -EINVAL; |
3961 | /* | 3965 | /* |
3962 | * Take the queue lock to update the readahead and max_sectors | 3966 | * Take the queue lock to update the readahead and max_sectors |
3963 | * values synchronously: | 3967 | * values synchronously: |
3964 | */ | 3968 | */ |
3965 | spin_lock_irq(q->queue_lock); | 3969 | spin_lock_irq(q->queue_lock); |
3966 | /* | 3970 | /* |
3967 | * Trim readahead window as well, if necessary: | 3971 | * Trim readahead window as well, if necessary: |
3968 | */ | 3972 | */ |
3969 | ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); | 3973 | ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); |
3970 | if (ra_kb > max_sectors_kb) | 3974 | if (ra_kb > max_sectors_kb) |
3971 | q->backing_dev_info.ra_pages = | 3975 | q->backing_dev_info.ra_pages = |
3972 | max_sectors_kb >> (PAGE_CACHE_SHIFT - 10); | 3976 | max_sectors_kb >> (PAGE_CACHE_SHIFT - 10); |
3973 | 3977 | ||
3974 | q->max_sectors = max_sectors_kb << 1; | 3978 | q->max_sectors = max_sectors_kb << 1; |
3975 | spin_unlock_irq(q->queue_lock); | 3979 | spin_unlock_irq(q->queue_lock); |
3976 | 3980 | ||
3977 | return ret; | 3981 | return ret; |
3978 | } | 3982 | } |
3979 | 3983 | ||
3980 | static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) | 3984 | static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) |
3981 | { | 3985 | { |
3982 | int max_hw_sectors_kb = q->max_hw_sectors >> 1; | 3986 | int max_hw_sectors_kb = q->max_hw_sectors >> 1; |
3983 | 3987 | ||
3984 | return queue_var_show(max_hw_sectors_kb, (page)); | 3988 | return queue_var_show(max_hw_sectors_kb, (page)); |
3985 | } | 3989 | } |
3986 | 3990 | ||
3987 | 3991 | ||
3988 | static struct queue_sysfs_entry queue_requests_entry = { | 3992 | static struct queue_sysfs_entry queue_requests_entry = { |
3989 | .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, | 3993 | .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, |
3990 | .show = queue_requests_show, | 3994 | .show = queue_requests_show, |
3991 | .store = queue_requests_store, | 3995 | .store = queue_requests_store, |
3992 | }; | 3996 | }; |
3993 | 3997 | ||
3994 | static struct queue_sysfs_entry queue_ra_entry = { | 3998 | static struct queue_sysfs_entry queue_ra_entry = { |
3995 | .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, | 3999 | .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, |
3996 | .show = queue_ra_show, | 4000 | .show = queue_ra_show, |
3997 | .store = queue_ra_store, | 4001 | .store = queue_ra_store, |
3998 | }; | 4002 | }; |
3999 | 4003 | ||
4000 | static struct queue_sysfs_entry queue_max_sectors_entry = { | 4004 | static struct queue_sysfs_entry queue_max_sectors_entry = { |
4001 | .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR }, | 4005 | .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR }, |
4002 | .show = queue_max_sectors_show, | 4006 | .show = queue_max_sectors_show, |
4003 | .store = queue_max_sectors_store, | 4007 | .store = queue_max_sectors_store, |
4004 | }; | 4008 | }; |
4005 | 4009 | ||
4006 | static struct queue_sysfs_entry queue_max_hw_sectors_entry = { | 4010 | static struct queue_sysfs_entry queue_max_hw_sectors_entry = { |
4007 | .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO }, | 4011 | .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO }, |
4008 | .show = queue_max_hw_sectors_show, | 4012 | .show = queue_max_hw_sectors_show, |
4009 | }; | 4013 | }; |
4010 | 4014 | ||
4011 | static struct queue_sysfs_entry queue_iosched_entry = { | 4015 | static struct queue_sysfs_entry queue_iosched_entry = { |
4012 | .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, | 4016 | .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, |
4013 | .show = elv_iosched_show, | 4017 | .show = elv_iosched_show, |
4014 | .store = elv_iosched_store, | 4018 | .store = elv_iosched_store, |
4015 | }; | 4019 | }; |
4016 | 4020 | ||
4017 | static struct attribute *default_attrs[] = { | 4021 | static struct attribute *default_attrs[] = { |
4018 | &queue_requests_entry.attr, | 4022 | &queue_requests_entry.attr, |
4019 | &queue_ra_entry.attr, | 4023 | &queue_ra_entry.attr, |
4020 | &queue_max_hw_sectors_entry.attr, | 4024 | &queue_max_hw_sectors_entry.attr, |
4021 | &queue_max_sectors_entry.attr, | 4025 | &queue_max_sectors_entry.attr, |
4022 | &queue_iosched_entry.attr, | 4026 | &queue_iosched_entry.attr, |
4023 | NULL, | 4027 | NULL, |
4024 | }; | 4028 | }; |
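[Editor's note] New attributes follow the same recipe: a show (and optionally store) helper plus a queue_sysfs_entry, which then has to be listed in default_attrs[] before the terminating NULL. As a purely illustrative example (the "hw_sector_size" name is not part of this file), a read-only attribute exporting the hardware sector size might look like:

static ssize_t queue_hw_sector_size_show(struct request_queue *q, char *page)
{
	/* queue_hardsect_size() returns the queue's hardware sector size */
	return queue_var_show(queue_hardsect_size(q), page);
}

static struct queue_sysfs_entry queue_hw_sector_size_entry = {
	.attr = {.name = "hw_sector_size", .mode = S_IRUGO },
	.show = queue_hw_sector_size_show,
};

Once added to default_attrs[], it would appear under /sys/block/<disk>/queue/ next to the entries above.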
4025 | 4029 | ||
4026 | #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) | 4030 | #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) |
4027 | 4031 | ||
4028 | static ssize_t | 4032 | static ssize_t |
4029 | queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) | 4033 | queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) |
4030 | { | 4034 | { |
4031 | struct queue_sysfs_entry *entry = to_queue(attr); | 4035 | struct queue_sysfs_entry *entry = to_queue(attr); |
4032 | struct request_queue *q = | 4036 | struct request_queue *q = |
4033 | container_of(kobj, struct request_queue, kobj); | 4037 | container_of(kobj, struct request_queue, kobj); |
4034 | ssize_t res; | 4038 | ssize_t res; |
4035 | 4039 | ||
4036 | if (!entry->show) | 4040 | if (!entry->show) |
4037 | return -EIO; | 4041 | return -EIO; |
4038 | mutex_lock(&q->sysfs_lock); | 4042 | mutex_lock(&q->sysfs_lock); |
4039 | if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { | 4043 | if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { |
4040 | mutex_unlock(&q->sysfs_lock); | 4044 | mutex_unlock(&q->sysfs_lock); |
4041 | return -ENOENT; | 4045 | return -ENOENT; |
4042 | } | 4046 | } |
4043 | res = entry->show(q, page); | 4047 | res = entry->show(q, page); |
4044 | mutex_unlock(&q->sysfs_lock); | 4048 | mutex_unlock(&q->sysfs_lock); |
4045 | return res; | 4049 | return res; |
4046 | } | 4050 | } |
4047 | 4051 | ||
4048 | static ssize_t | 4052 | static ssize_t |
4049 | queue_attr_store(struct kobject *kobj, struct attribute *attr, | 4053 | queue_attr_store(struct kobject *kobj, struct attribute *attr, |
4050 | const char *page, size_t length) | 4054 | const char *page, size_t length) |
4051 | { | 4055 | { |
4052 | struct queue_sysfs_entry *entry = to_queue(attr); | 4056 | struct queue_sysfs_entry *entry = to_queue(attr); |
4053 | struct request_queue *q = container_of(kobj, struct request_queue, kobj); | 4057 | struct request_queue *q = container_of(kobj, struct request_queue, kobj); |
4054 | 4058 | ||
4055 | ssize_t res; | 4059 | ssize_t res; |
4056 | 4060 | ||
4057 | if (!entry->store) | 4061 | if (!entry->store) |
4058 | return -EIO; | 4062 | return -EIO; |
4059 | mutex_lock(&q->sysfs_lock); | 4063 | mutex_lock(&q->sysfs_lock); |
4060 | if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { | 4064 | if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { |
4061 | mutex_unlock(&q->sysfs_lock); | 4065 | mutex_unlock(&q->sysfs_lock); |
4062 | return -ENOENT; | 4066 | return -ENOENT; |
4063 | } | 4067 | } |
4064 | res = entry->store(q, page, length); | 4068 | res = entry->store(q, page, length); |
4065 | mutex_unlock(&q->sysfs_lock); | 4069 | mutex_unlock(&q->sysfs_lock); |
4066 | return res; | 4070 | return res; |
4067 | } | 4071 | } |
4068 | 4072 | ||
4069 | static struct sysfs_ops queue_sysfs_ops = { | 4073 | static struct sysfs_ops queue_sysfs_ops = { |
4070 | .show = queue_attr_show, | 4074 | .show = queue_attr_show, |
4071 | .store = queue_attr_store, | 4075 | .store = queue_attr_store, |
4072 | }; | 4076 | }; |
4073 | 4077 | ||
4074 | static struct kobj_type queue_ktype = { | 4078 | static struct kobj_type queue_ktype = { |
4075 | .sysfs_ops = &queue_sysfs_ops, | 4079 | .sysfs_ops = &queue_sysfs_ops, |
4076 | .default_attrs = default_attrs, | 4080 | .default_attrs = default_attrs, |
4077 | .release = blk_release_queue, | 4081 | .release = blk_release_queue, |
4078 | }; | 4082 | }; |
4079 | 4083 | ||
4080 | int blk_register_queue(struct gendisk *disk) | 4084 | int blk_register_queue(struct gendisk *disk) |
4081 | { | 4085 | { |
4082 | int ret; | 4086 | int ret; |
4083 | 4087 | ||
4084 | struct request_queue *q = disk->queue; | 4088 | struct request_queue *q = disk->queue; |
4085 | 4089 | ||
4086 | if (!q || !q->request_fn) | 4090 | if (!q || !q->request_fn) |
4087 | return -ENXIO; | 4091 | return -ENXIO; |
4088 | 4092 | ||
4089 | q->kobj.parent = kobject_get(&disk->kobj); | 4093 | q->kobj.parent = kobject_get(&disk->kobj); |
4090 | 4094 | ||
4091 | ret = kobject_add(&q->kobj); | 4095 | ret = kobject_add(&q->kobj); |
4092 | if (ret < 0) | 4096 | if (ret < 0) |
4093 | return ret; | 4097 | return ret; |
4094 | 4098 | ||
4095 | kobject_uevent(&q->kobj, KOBJ_ADD); | 4099 | kobject_uevent(&q->kobj, KOBJ_ADD); |
4096 | 4100 | ||
4097 | ret = elv_register_queue(q); | 4101 | ret = elv_register_queue(q); |
4098 | if (ret) { | 4102 | if (ret) { |
4099 | kobject_uevent(&q->kobj, KOBJ_REMOVE); | 4103 | kobject_uevent(&q->kobj, KOBJ_REMOVE); |
4100 | kobject_del(&q->kobj); | 4104 | kobject_del(&q->kobj); |
4101 | return ret; | 4105 | return ret; |
4102 | } | 4106 | } |
4103 | 4107 | ||
4104 | return 0; | 4108 | return 0; |
4105 | } | 4109 | } |
4106 | 4110 | ||
4107 | void blk_unregister_queue(struct gendisk *disk) | 4111 | void blk_unregister_queue(struct gendisk *disk) |
4108 | { | 4112 | { |
4109 | struct request_queue *q = disk->queue; | 4113 | struct request_queue *q = disk->queue; |
4110 | 4114 | ||
4111 | if (q && q->request_fn) { | 4115 | if (q && q->request_fn) { |
4112 | elv_unregister_queue(q); | 4116 | elv_unregister_queue(q); |
4113 | 4117 | ||
4114 | kobject_uevent(&q->kobj, KOBJ_REMOVE); | 4118 | kobject_uevent(&q->kobj, KOBJ_REMOVE); |
4115 | kobject_del(&q->kobj); | 4119 | kobject_del(&q->kobj); |
4116 | kobject_put(&disk->kobj); | 4120 | kobject_put(&disk->kobj); |
4117 | } | 4121 | } |
4118 | } | 4122 | } |
4119 | 4123 |
drivers/md/dm.c
1 | /* | 1 | /* |
2 | * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. | 2 | * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. |
3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | 3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. |
4 | * | 4 | * |
5 | * This file is released under the GPL. | 5 | * This file is released under the GPL. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include "dm.h" | 8 | #include "dm.h" |
9 | #include "dm-bio-list.h" | 9 | #include "dm-bio-list.h" |
10 | 10 | ||
11 | #include <linux/init.h> | 11 | #include <linux/init.h> |
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/mutex.h> | 13 | #include <linux/mutex.h> |
14 | #include <linux/moduleparam.h> | 14 | #include <linux/moduleparam.h> |
15 | #include <linux/blkpg.h> | 15 | #include <linux/blkpg.h> |
16 | #include <linux/bio.h> | 16 | #include <linux/bio.h> |
17 | #include <linux/buffer_head.h> | 17 | #include <linux/buffer_head.h> |
18 | #include <linux/mempool.h> | 18 | #include <linux/mempool.h> |
19 | #include <linux/slab.h> | 19 | #include <linux/slab.h> |
20 | #include <linux/idr.h> | 20 | #include <linux/idr.h> |
21 | #include <linux/hdreg.h> | 21 | #include <linux/hdreg.h> |
22 | #include <linux/blktrace_api.h> | 22 | #include <linux/blktrace_api.h> |
23 | #include <linux/smp_lock.h> | 23 | #include <linux/smp_lock.h> |
24 | 24 | ||
25 | #define DM_MSG_PREFIX "core" | 25 | #define DM_MSG_PREFIX "core" |
26 | 26 | ||
27 | static const char *_name = DM_NAME; | 27 | static const char *_name = DM_NAME; |
28 | 28 | ||
29 | static unsigned int major = 0; | 29 | static unsigned int major = 0; |
30 | static unsigned int _major = 0; | 30 | static unsigned int _major = 0; |
31 | 31 | ||
32 | static DEFINE_SPINLOCK(_minor_lock); | 32 | static DEFINE_SPINLOCK(_minor_lock); |
33 | /* | 33 | /* |
34 | * One of these is allocated per bio. | 34 | * One of these is allocated per bio. |
35 | */ | 35 | */ |
36 | struct dm_io { | 36 | struct dm_io { |
37 | struct mapped_device *md; | 37 | struct mapped_device *md; |
38 | int error; | 38 | int error; |
39 | struct bio *bio; | 39 | struct bio *bio; |
40 | atomic_t io_count; | 40 | atomic_t io_count; |
41 | unsigned long start_time; | 41 | unsigned long start_time; |
42 | }; | 42 | }; |
43 | 43 | ||
44 | /* | 44 | /* |
45 | * One of these is allocated per target within a bio. Hopefully | 45 | * One of these is allocated per target within a bio. Hopefully |
46 | * this will be simplified out one day. | 46 | * this will be simplified out one day. |
47 | */ | 47 | */ |
48 | struct dm_target_io { | 48 | struct dm_target_io { |
49 | struct dm_io *io; | 49 | struct dm_io *io; |
50 | struct dm_target *ti; | 50 | struct dm_target *ti; |
51 | union map_info info; | 51 | union map_info info; |
52 | }; | 52 | }; |
53 | 53 | ||
54 | union map_info *dm_get_mapinfo(struct bio *bio) | 54 | union map_info *dm_get_mapinfo(struct bio *bio) |
55 | { | 55 | { |
56 | if (bio && bio->bi_private) | 56 | if (bio && bio->bi_private) |
57 | return &((struct dm_target_io *)bio->bi_private)->info; | 57 | return &((struct dm_target_io *)bio->bi_private)->info; |
58 | return NULL; | 58 | return NULL; |
59 | } | 59 | } |
60 | 60 | ||
61 | #define MINOR_ALLOCED ((void *)-1) | 61 | #define MINOR_ALLOCED ((void *)-1) |
62 | 62 | ||
63 | /* | 63 | /* |
64 | * Bits for the md->flags field. | 64 | * Bits for the md->flags field. |
65 | */ | 65 | */ |
66 | #define DMF_BLOCK_IO 0 | 66 | #define DMF_BLOCK_IO 0 |
67 | #define DMF_SUSPENDED 1 | 67 | #define DMF_SUSPENDED 1 |
68 | #define DMF_FROZEN 2 | 68 | #define DMF_FROZEN 2 |
69 | #define DMF_FREEING 3 | 69 | #define DMF_FREEING 3 |
70 | #define DMF_DELETING 4 | 70 | #define DMF_DELETING 4 |
71 | #define DMF_NOFLUSH_SUSPENDING 5 | 71 | #define DMF_NOFLUSH_SUSPENDING 5 |
72 | 72 | ||
73 | struct mapped_device { | 73 | struct mapped_device { |
74 | struct rw_semaphore io_lock; | 74 | struct rw_semaphore io_lock; |
75 | struct semaphore suspend_lock; | 75 | struct semaphore suspend_lock; |
76 | spinlock_t pushback_lock; | 76 | spinlock_t pushback_lock; |
77 | rwlock_t map_lock; | 77 | rwlock_t map_lock; |
78 | atomic_t holders; | 78 | atomic_t holders; |
79 | atomic_t open_count; | 79 | atomic_t open_count; |
80 | 80 | ||
81 | unsigned long flags; | 81 | unsigned long flags; |
82 | 82 | ||
83 | struct request_queue *queue; | 83 | struct request_queue *queue; |
84 | struct gendisk *disk; | 84 | struct gendisk *disk; |
85 | char name[16]; | 85 | char name[16]; |
86 | 86 | ||
87 | void *interface_ptr; | 87 | void *interface_ptr; |
88 | 88 | ||
89 | /* | 89 | /* |
90 | * A list of ios that arrived while we were suspended. | 90 | * A list of ios that arrived while we were suspended. |
91 | */ | 91 | */ |
92 | atomic_t pending; | 92 | atomic_t pending; |
93 | wait_queue_head_t wait; | 93 | wait_queue_head_t wait; |
94 | struct bio_list deferred; | 94 | struct bio_list deferred; |
95 | struct bio_list pushback; | 95 | struct bio_list pushback; |
96 | 96 | ||
97 | /* | 97 | /* |
98 | * The current mapping. | 98 | * The current mapping. |
99 | */ | 99 | */ |
100 | struct dm_table *map; | 100 | struct dm_table *map; |
101 | 101 | ||
102 | /* | 102 | /* |
103 | * io objects are allocated from here. | 103 | * io objects are allocated from here. |
104 | */ | 104 | */ |
105 | mempool_t *io_pool; | 105 | mempool_t *io_pool; |
106 | mempool_t *tio_pool; | 106 | mempool_t *tio_pool; |
107 | 107 | ||
108 | struct bio_set *bs; | 108 | struct bio_set *bs; |
109 | 109 | ||
110 | /* | 110 | /* |
111 | * Event handling. | 111 | * Event handling. |
112 | */ | 112 | */ |
113 | atomic_t event_nr; | 113 | atomic_t event_nr; |
114 | wait_queue_head_t eventq; | 114 | wait_queue_head_t eventq; |
115 | 115 | ||
116 | /* | 116 | /* |
117 | * freeze/thaw support requires holding onto a super block | 117 | * freeze/thaw support requires holding onto a super block |
118 | */ | 118 | */ |
119 | struct super_block *frozen_sb; | 119 | struct super_block *frozen_sb; |
120 | struct block_device *suspended_bdev; | 120 | struct block_device *suspended_bdev; |
121 | 121 | ||
122 | /* forced geometry settings */ | 122 | /* forced geometry settings */ |
123 | struct hd_geometry geometry; | 123 | struct hd_geometry geometry; |
124 | }; | 124 | }; |
125 | 125 | ||
126 | #define MIN_IOS 256 | 126 | #define MIN_IOS 256 |
127 | static struct kmem_cache *_io_cache; | 127 | static struct kmem_cache *_io_cache; |
128 | static struct kmem_cache *_tio_cache; | 128 | static struct kmem_cache *_tio_cache; |
129 | 129 | ||
130 | static int __init local_init(void) | 130 | static int __init local_init(void) |
131 | { | 131 | { |
132 | int r; | 132 | int r; |
133 | 133 | ||
134 | /* allocate a slab for the dm_ios */ | 134 | /* allocate a slab for the dm_ios */ |
135 | _io_cache = KMEM_CACHE(dm_io, 0); | 135 | _io_cache = KMEM_CACHE(dm_io, 0); |
136 | if (!_io_cache) | 136 | if (!_io_cache) |
137 | return -ENOMEM; | 137 | return -ENOMEM; |
138 | 138 | ||
139 | /* allocate a slab for the target ios */ | 139 | /* allocate a slab for the target ios */ |
140 | _tio_cache = KMEM_CACHE(dm_target_io, 0); | 140 | _tio_cache = KMEM_CACHE(dm_target_io, 0); |
141 | if (!_tio_cache) { | 141 | if (!_tio_cache) { |
142 | kmem_cache_destroy(_io_cache); | 142 | kmem_cache_destroy(_io_cache); |
143 | return -ENOMEM; | 143 | return -ENOMEM; |
144 | } | 144 | } |
145 | 145 | ||
146 | _major = major; | 146 | _major = major; |
147 | r = register_blkdev(_major, _name); | 147 | r = register_blkdev(_major, _name); |
148 | if (r < 0) { | 148 | if (r < 0) { |
149 | kmem_cache_destroy(_tio_cache); | 149 | kmem_cache_destroy(_tio_cache); |
150 | kmem_cache_destroy(_io_cache); | 150 | kmem_cache_destroy(_io_cache); |
151 | return r; | 151 | return r; |
152 | } | 152 | } |
153 | 153 | ||
154 | if (!_major) | 154 | if (!_major) |
155 | _major = r; | 155 | _major = r; |
156 | 156 | ||
157 | return 0; | 157 | return 0; |
158 | } | 158 | } |
159 | 159 | ||
160 | static void local_exit(void) | 160 | static void local_exit(void) |
161 | { | 161 | { |
162 | kmem_cache_destroy(_tio_cache); | 162 | kmem_cache_destroy(_tio_cache); |
163 | kmem_cache_destroy(_io_cache); | 163 | kmem_cache_destroy(_io_cache); |
164 | unregister_blkdev(_major, _name); | 164 | unregister_blkdev(_major, _name); |
165 | 165 | ||
166 | _major = 0; | 166 | _major = 0; |
167 | 167 | ||
168 | DMINFO("cleaned up"); | 168 | DMINFO("cleaned up"); |
169 | } | 169 | } |
170 | 170 | ||
171 | int (*_inits[])(void) __initdata = { | 171 | int (*_inits[])(void) __initdata = { |
172 | local_init, | 172 | local_init, |
173 | dm_target_init, | 173 | dm_target_init, |
174 | dm_linear_init, | 174 | dm_linear_init, |
175 | dm_stripe_init, | 175 | dm_stripe_init, |
176 | dm_interface_init, | 176 | dm_interface_init, |
177 | }; | 177 | }; |
178 | 178 | ||
179 | void (*_exits[])(void) = { | 179 | void (*_exits[])(void) = { |
180 | local_exit, | 180 | local_exit, |
181 | dm_target_exit, | 181 | dm_target_exit, |
182 | dm_linear_exit, | 182 | dm_linear_exit, |
183 | dm_stripe_exit, | 183 | dm_stripe_exit, |
184 | dm_interface_exit, | 184 | dm_interface_exit, |
185 | }; | 185 | }; |
186 | 186 | ||
187 | static int __init dm_init(void) | 187 | static int __init dm_init(void) |
188 | { | 188 | { |
189 | const int count = ARRAY_SIZE(_inits); | 189 | const int count = ARRAY_SIZE(_inits); |
190 | 190 | ||
191 | int r, i; | 191 | int r, i; |
192 | 192 | ||
193 | for (i = 0; i < count; i++) { | 193 | for (i = 0; i < count; i++) { |
194 | r = _inits[i](); | 194 | r = _inits[i](); |
195 | if (r) | 195 | if (r) |
196 | goto bad; | 196 | goto bad; |
197 | } | 197 | } |
198 | 198 | ||
199 | return 0; | 199 | return 0; |
200 | 200 | ||
201 | bad: | 201 | bad: |
202 | while (i--) | 202 | while (i--) |
203 | _exits[i](); | 203 | _exits[i](); |
204 | 204 | ||
205 | return r; | 205 | return r; |
206 | } | 206 | } |
207 | 207 | ||
208 | static void __exit dm_exit(void) | 208 | static void __exit dm_exit(void) |
209 | { | 209 | { |
210 | int i = ARRAY_SIZE(_exits); | 210 | int i = ARRAY_SIZE(_exits); |
211 | 211 | ||
212 | while (i--) | 212 | while (i--) |
213 | _exits[i](); | 213 | _exits[i](); |
214 | } | 214 | } |
215 | 215 | ||
216 | /* | 216 | /* |
217 | * Block device functions | 217 | * Block device functions |
218 | */ | 218 | */ |
219 | static int dm_blk_open(struct inode *inode, struct file *file) | 219 | static int dm_blk_open(struct inode *inode, struct file *file) |
220 | { | 220 | { |
221 | struct mapped_device *md; | 221 | struct mapped_device *md; |
222 | 222 | ||
223 | spin_lock(&_minor_lock); | 223 | spin_lock(&_minor_lock); |
224 | 224 | ||
225 | md = inode->i_bdev->bd_disk->private_data; | 225 | md = inode->i_bdev->bd_disk->private_data; |
226 | if (!md) | 226 | if (!md) |
227 | goto out; | 227 | goto out; |
228 | 228 | ||
229 | if (test_bit(DMF_FREEING, &md->flags) || | 229 | if (test_bit(DMF_FREEING, &md->flags) || |
230 | test_bit(DMF_DELETING, &md->flags)) { | 230 | test_bit(DMF_DELETING, &md->flags)) { |
231 | md = NULL; | 231 | md = NULL; |
232 | goto out; | 232 | goto out; |
233 | } | 233 | } |
234 | 234 | ||
235 | dm_get(md); | 235 | dm_get(md); |
236 | atomic_inc(&md->open_count); | 236 | atomic_inc(&md->open_count); |
237 | 237 | ||
238 | out: | 238 | out: |
239 | spin_unlock(&_minor_lock); | 239 | spin_unlock(&_minor_lock); |
240 | 240 | ||
241 | return md ? 0 : -ENXIO; | 241 | return md ? 0 : -ENXIO; |
242 | } | 242 | } |
243 | 243 | ||
244 | static int dm_blk_close(struct inode *inode, struct file *file) | 244 | static int dm_blk_close(struct inode *inode, struct file *file) |
245 | { | 245 | { |
246 | struct mapped_device *md; | 246 | struct mapped_device *md; |
247 | 247 | ||
248 | md = inode->i_bdev->bd_disk->private_data; | 248 | md = inode->i_bdev->bd_disk->private_data; |
249 | atomic_dec(&md->open_count); | 249 | atomic_dec(&md->open_count); |
250 | dm_put(md); | 250 | dm_put(md); |
251 | return 0; | 251 | return 0; |
252 | } | 252 | } |
253 | 253 | ||
254 | int dm_open_count(struct mapped_device *md) | 254 | int dm_open_count(struct mapped_device *md) |
255 | { | 255 | { |
256 | return atomic_read(&md->open_count); | 256 | return atomic_read(&md->open_count); |
257 | } | 257 | } |
258 | 258 | ||
259 | /* | 259 | /* |
260 | * Guarantees nothing is using the device before it's deleted. | 260 | * Guarantees nothing is using the device before it's deleted. |
261 | */ | 261 | */ |
262 | int dm_lock_for_deletion(struct mapped_device *md) | 262 | int dm_lock_for_deletion(struct mapped_device *md) |
263 | { | 263 | { |
264 | int r = 0; | 264 | int r = 0; |
265 | 265 | ||
266 | spin_lock(&_minor_lock); | 266 | spin_lock(&_minor_lock); |
267 | 267 | ||
268 | if (dm_open_count(md)) | 268 | if (dm_open_count(md)) |
269 | r = -EBUSY; | 269 | r = -EBUSY; |
270 | else | 270 | else |
271 | set_bit(DMF_DELETING, &md->flags); | 271 | set_bit(DMF_DELETING, &md->flags); |
272 | 272 | ||
273 | spin_unlock(&_minor_lock); | 273 | spin_unlock(&_minor_lock); |
274 | 274 | ||
275 | return r; | 275 | return r; |
276 | } | 276 | } |
277 | 277 | ||
278 | static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) | 278 | static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) |
279 | { | 279 | { |
280 | struct mapped_device *md = bdev->bd_disk->private_data; | 280 | struct mapped_device *md = bdev->bd_disk->private_data; |
281 | 281 | ||
282 | return dm_get_geometry(md, geo); | 282 | return dm_get_geometry(md, geo); |
283 | } | 283 | } |
284 | 284 | ||
285 | static int dm_blk_ioctl(struct inode *inode, struct file *file, | 285 | static int dm_blk_ioctl(struct inode *inode, struct file *file, |
286 | unsigned int cmd, unsigned long arg) | 286 | unsigned int cmd, unsigned long arg) |
287 | { | 287 | { |
288 | struct mapped_device *md; | 288 | struct mapped_device *md; |
289 | struct dm_table *map; | 289 | struct dm_table *map; |
290 | struct dm_target *tgt; | 290 | struct dm_target *tgt; |
291 | int r = -ENOTTY; | 291 | int r = -ENOTTY; |
292 | 292 | ||
293 | /* We don't really need this lock, but we do need 'inode'. */ | 293 | /* We don't really need this lock, but we do need 'inode'. */ |
294 | unlock_kernel(); | 294 | unlock_kernel(); |
295 | 295 | ||
296 | md = inode->i_bdev->bd_disk->private_data; | 296 | md = inode->i_bdev->bd_disk->private_data; |
297 | 297 | ||
298 | map = dm_get_table(md); | 298 | map = dm_get_table(md); |
299 | 299 | ||
300 | if (!map || !dm_table_get_size(map)) | 300 | if (!map || !dm_table_get_size(map)) |
301 | goto out; | 301 | goto out; |
302 | 302 | ||
303 | /* We only support devices that have a single target */ | 303 | /* We only support devices that have a single target */ |
304 | if (dm_table_get_num_targets(map) != 1) | 304 | if (dm_table_get_num_targets(map) != 1) |
305 | goto out; | 305 | goto out; |
306 | 306 | ||
307 | tgt = dm_table_get_target(map, 0); | 307 | tgt = dm_table_get_target(map, 0); |
308 | 308 | ||
309 | if (dm_suspended(md)) { | 309 | if (dm_suspended(md)) { |
310 | r = -EAGAIN; | 310 | r = -EAGAIN; |
311 | goto out; | 311 | goto out; |
312 | } | 312 | } |
313 | 313 | ||
314 | if (tgt->type->ioctl) | 314 | if (tgt->type->ioctl) |
315 | r = tgt->type->ioctl(tgt, inode, file, cmd, arg); | 315 | r = tgt->type->ioctl(tgt, inode, file, cmd, arg); |
316 | 316 | ||
317 | out: | 317 | out: |
318 | dm_table_put(map); | 318 | dm_table_put(map); |
319 | 319 | ||
320 | lock_kernel(); | 320 | lock_kernel(); |
321 | return r; | 321 | return r; |
322 | } | 322 | } |
323 | 323 | ||
324 | static struct dm_io *alloc_io(struct mapped_device *md) | 324 | static struct dm_io *alloc_io(struct mapped_device *md) |
325 | { | 325 | { |
326 | return mempool_alloc(md->io_pool, GFP_NOIO); | 326 | return mempool_alloc(md->io_pool, GFP_NOIO); |
327 | } | 327 | } |
328 | 328 | ||
329 | static void free_io(struct mapped_device *md, struct dm_io *io) | 329 | static void free_io(struct mapped_device *md, struct dm_io *io) |
330 | { | 330 | { |
331 | mempool_free(io, md->io_pool); | 331 | mempool_free(io, md->io_pool); |
332 | } | 332 | } |
333 | 333 | ||
334 | static struct dm_target_io *alloc_tio(struct mapped_device *md) | 334 | static struct dm_target_io *alloc_tio(struct mapped_device *md) |
335 | { | 335 | { |
336 | return mempool_alloc(md->tio_pool, GFP_NOIO); | 336 | return mempool_alloc(md->tio_pool, GFP_NOIO); |
337 | } | 337 | } |
338 | 338 | ||
339 | static void free_tio(struct mapped_device *md, struct dm_target_io *tio) | 339 | static void free_tio(struct mapped_device *md, struct dm_target_io *tio) |
340 | { | 340 | { |
341 | mempool_free(tio, md->tio_pool); | 341 | mempool_free(tio, md->tio_pool); |
342 | } | 342 | } |
343 | 343 | ||
344 | static void start_io_acct(struct dm_io *io) | 344 | static void start_io_acct(struct dm_io *io) |
345 | { | 345 | { |
346 | struct mapped_device *md = io->md; | 346 | struct mapped_device *md = io->md; |
347 | 347 | ||
348 | io->start_time = jiffies; | 348 | io->start_time = jiffies; |
349 | 349 | ||
350 | preempt_disable(); | 350 | preempt_disable(); |
351 | disk_round_stats(dm_disk(md)); | 351 | disk_round_stats(dm_disk(md)); |
352 | preempt_enable(); | 352 | preempt_enable(); |
353 | dm_disk(md)->in_flight = atomic_inc_return(&md->pending); | 353 | dm_disk(md)->in_flight = atomic_inc_return(&md->pending); |
354 | } | 354 | } |
355 | 355 | ||
356 | static int end_io_acct(struct dm_io *io) | 356 | static int end_io_acct(struct dm_io *io) |
357 | { | 357 | { |
358 | struct mapped_device *md = io->md; | 358 | struct mapped_device *md = io->md; |
359 | struct bio *bio = io->bio; | 359 | struct bio *bio = io->bio; |
360 | unsigned long duration = jiffies - io->start_time; | 360 | unsigned long duration = jiffies - io->start_time; |
361 | int pending; | 361 | int pending; |
362 | int rw = bio_data_dir(bio); | 362 | int rw = bio_data_dir(bio); |
363 | 363 | ||
364 | preempt_disable(); | 364 | preempt_disable(); |
365 | disk_round_stats(dm_disk(md)); | 365 | disk_round_stats(dm_disk(md)); |
366 | preempt_enable(); | 366 | preempt_enable(); |
367 | dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending); | 367 | dm_disk(md)->in_flight = pending = atomic_dec_return(&md->pending); |
368 | 368 | ||
369 | disk_stat_add(dm_disk(md), ticks[rw], duration); | 369 | disk_stat_add(dm_disk(md), ticks[rw], duration); |
370 | 370 | ||
371 | return !pending; | 371 | return !pending; |
372 | } | 372 | } |
373 | 373 | ||
374 | /* | 374 | /* |
375 | * Add the bio to the list of deferred io. | 375 | * Add the bio to the list of deferred io. |
376 | */ | 376 | */ |
377 | static int queue_io(struct mapped_device *md, struct bio *bio) | 377 | static int queue_io(struct mapped_device *md, struct bio *bio) |
378 | { | 378 | { |
379 | down_write(&md->io_lock); | 379 | down_write(&md->io_lock); |
380 | 380 | ||
381 | if (!test_bit(DMF_BLOCK_IO, &md->flags)) { | 381 | if (!test_bit(DMF_BLOCK_IO, &md->flags)) { |
382 | up_write(&md->io_lock); | 382 | up_write(&md->io_lock); |
383 | return 1; | 383 | return 1; |
384 | } | 384 | } |
385 | 385 | ||
386 | bio_list_add(&md->deferred, bio); | 386 | bio_list_add(&md->deferred, bio); |
387 | 387 | ||
388 | up_write(&md->io_lock); | 388 | up_write(&md->io_lock); |
389 | return 0; /* deferred successfully */ | 389 | return 0; /* deferred successfully */ |
390 | } | 390 | } |
391 | 391 | ||
392 | /* | 392 | /* |
393 | * Everyone (including functions in this file) should use this | 393 | * Everyone (including functions in this file) should use this |
394 | * function to access the md->map field, and make sure they call | 394 | * function to access the md->map field, and make sure they call |
395 | * dm_table_put() when finished. | 395 | * dm_table_put() when finished. |
396 | */ | 396 | */ |
397 | struct dm_table *dm_get_table(struct mapped_device *md) | 397 | struct dm_table *dm_get_table(struct mapped_device *md) |
398 | { | 398 | { |
399 | struct dm_table *t; | 399 | struct dm_table *t; |
400 | 400 | ||
401 | read_lock(&md->map_lock); | 401 | read_lock(&md->map_lock); |
402 | t = md->map; | 402 | t = md->map; |
403 | if (t) | 403 | if (t) |
404 | dm_table_get(t); | 404 | dm_table_get(t); |
405 | read_unlock(&md->map_lock); | 405 | read_unlock(&md->map_lock); |
406 | 406 | ||
407 | return t; | 407 | return t; |
408 | } | 408 | } |
409 | 409 | ||
410 | /* | 410 | /* |
411 | * Get the geometry associated with a dm device | 411 | * Get the geometry associated with a dm device |
412 | */ | 412 | */ |
413 | int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) | 413 | int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) |
414 | { | 414 | { |
415 | *geo = md->geometry; | 415 | *geo = md->geometry; |
416 | 416 | ||
417 | return 0; | 417 | return 0; |
418 | } | 418 | } |
419 | 419 | ||
420 | /* | 420 | /* |
421 | * Set the geometry of a device. | 421 | * Set the geometry of a device. |
422 | */ | 422 | */ |
423 | int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) | 423 | int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) |
424 | { | 424 | { |
425 | sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; | 425 | sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; |
426 | 426 | ||
427 | if (geo->start > sz) { | 427 | if (geo->start > sz) { |
428 | DMWARN("Start sector is beyond the geometry limits."); | 428 | DMWARN("Start sector is beyond the geometry limits."); |
429 | return -EINVAL; | 429 | return -EINVAL; |
430 | } | 430 | } |
431 | 431 | ||
432 | md->geometry = *geo; | 432 | md->geometry = *geo; |
433 | 433 | ||
434 | return 0; | 434 | return 0; |
435 | } | 435 | } |
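
dm_set_geometry() above only sanity-checks that the forced geometry's start sector fits within cylinders * heads * sectors. A standalone sketch of that arithmetic (not part of this patch; the 1024/255/63 CHS values are purely illustrative):

#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	/* same check as dm_set_geometry(): reject a start sector beyond
	 * the capacity implied by the forced geometry */
	sector_t cylinders = 1024, heads = 255, sectors = 63;
	sector_t start = 20000000;
	sector_t sz = cylinders * heads * sectors;	/* 16450560 */

	if (start > sz)
		printf("start %llu beyond geometry limit %llu: -EINVAL\n",
		       start, sz);
	return 0;
}
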
436 | 436 | ||
437 | /*----------------------------------------------------------------- | 437 | /*----------------------------------------------------------------- |
438 | * CRUD START: | 438 | * CRUD START: |
439 | * A more elegant solution is in the works that uses the queue | 439 | * A more elegant solution is in the works that uses the queue |
440 | * merge fn; unfortunately there are a couple of changes to | 440 | * merge fn; unfortunately there are a couple of changes to |
441 | * the block layer that I want to make for this. So in the | 441 | * the block layer that I want to make for this. So in the |
442 | * interests of getting something for people to use I give | 442 | * interests of getting something for people to use I give |
443 | * you this clearly demarcated crap. | 443 | * you this clearly demarcated crap. |
444 | *---------------------------------------------------------------*/ | 444 | *---------------------------------------------------------------*/ |
445 | 445 | ||
446 | static int __noflush_suspending(struct mapped_device *md) | 446 | static int __noflush_suspending(struct mapped_device *md) |
447 | { | 447 | { |
448 | return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); | 448 | return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); |
449 | } | 449 | } |
450 | 450 | ||
451 | /* | 451 | /* |
452 | * Decrements the number of outstanding ios that a bio has been | 452 | * Decrements the number of outstanding ios that a bio has been |
453 | * cloned into, completing the original io if necessary. | 453 | * cloned into, completing the original io if necessary. |
454 | */ | 454 | */ |
455 | static void dec_pending(struct dm_io *io, int error) | 455 | static void dec_pending(struct dm_io *io, int error) |
456 | { | 456 | { |
457 | unsigned long flags; | 457 | unsigned long flags; |
458 | 458 | ||
459 | /* Push-back supersedes any I/O errors */ | 459 | /* Push-back supersedes any I/O errors */ |
460 | if (error && !(io->error > 0 && __noflush_suspending(io->md))) | 460 | if (error && !(io->error > 0 && __noflush_suspending(io->md))) |
461 | io->error = error; | 461 | io->error = error; |
462 | 462 | ||
463 | if (atomic_dec_and_test(&io->io_count)) { | 463 | if (atomic_dec_and_test(&io->io_count)) { |
464 | if (io->error == DM_ENDIO_REQUEUE) { | 464 | if (io->error == DM_ENDIO_REQUEUE) { |
465 | /* | 465 | /* |
466 | * Target requested pushing back the I/O. | 466 | * Target requested pushing back the I/O. |
467 | * This must be handled before the sleeper on | 467 | * This must be handled before the sleeper on |
468 | * suspend queue merges the pushback list. | 468 | * suspend queue merges the pushback list. |
469 | */ | 469 | */ |
470 | spin_lock_irqsave(&io->md->pushback_lock, flags); | 470 | spin_lock_irqsave(&io->md->pushback_lock, flags); |
471 | if (__noflush_suspending(io->md)) | 471 | if (__noflush_suspending(io->md)) |
472 | bio_list_add(&io->md->pushback, io->bio); | 472 | bio_list_add(&io->md->pushback, io->bio); |
473 | else | 473 | else |
474 | /* noflush suspend was interrupted. */ | 474 | /* noflush suspend was interrupted. */ |
475 | io->error = -EIO; | 475 | io->error = -EIO; |
476 | spin_unlock_irqrestore(&io->md->pushback_lock, flags); | 476 | spin_unlock_irqrestore(&io->md->pushback_lock, flags); |
477 | } | 477 | } |
478 | 478 | ||
479 | if (end_io_acct(io)) | 479 | if (end_io_acct(io)) |
480 | /* nudge anyone waiting on suspend queue */ | 480 | /* nudge anyone waiting on suspend queue */ |
481 | wake_up(&io->md->wait); | 481 | wake_up(&io->md->wait); |
482 | 482 | ||
483 | if (io->error != DM_ENDIO_REQUEUE) { | 483 | if (io->error != DM_ENDIO_REQUEUE) { |
484 | blk_add_trace_bio(io->md->queue, io->bio, | 484 | blk_add_trace_bio(io->md->queue, io->bio, |
485 | BLK_TA_COMPLETE); | 485 | BLK_TA_COMPLETE); |
486 | 486 | ||
487 | bio_endio(io->bio, io->bio->bi_size, io->error); | 487 | bio_endio(io->bio, io->bio->bi_size, io->error); |
488 | } | 488 | } |
489 | 489 | ||
490 | free_io(io->md, io); | 490 | free_io(io->md, io); |
491 | } | 491 | } |
492 | } | 492 | } |
493 | 493 | ||
494 | static int clone_endio(struct bio *bio, unsigned int done, int error) | 494 | static int clone_endio(struct bio *bio, unsigned int done, int error) |
495 | { | 495 | { |
496 | int r = 0; | 496 | int r = 0; |
497 | struct dm_target_io *tio = bio->bi_private; | 497 | struct dm_target_io *tio = bio->bi_private; |
498 | struct mapped_device *md = tio->io->md; | 498 | struct mapped_device *md = tio->io->md; |
499 | dm_endio_fn endio = tio->ti->type->end_io; | 499 | dm_endio_fn endio = tio->ti->type->end_io; |
500 | 500 | ||
501 | if (bio->bi_size) | 501 | if (bio->bi_size) |
502 | return 1; | 502 | return 1; |
503 | 503 | ||
504 | if (!bio_flagged(bio, BIO_UPTODATE) && !error) | 504 | if (!bio_flagged(bio, BIO_UPTODATE) && !error) |
505 | error = -EIO; | 505 | error = -EIO; |
506 | 506 | ||
507 | if (endio) { | 507 | if (endio) { |
508 | r = endio(tio->ti, bio, error, &tio->info); | 508 | r = endio(tio->ti, bio, error, &tio->info); |
509 | if (r < 0 || r == DM_ENDIO_REQUEUE) | 509 | if (r < 0 || r == DM_ENDIO_REQUEUE) |
510 | /* | 510 | /* |
511 | * error and requeue request are handled | 511 | * error and requeue request are handled |
512 | * in dec_pending(). | 512 | * in dec_pending(). |
513 | */ | 513 | */ |
514 | error = r; | 514 | error = r; |
515 | else if (r == DM_ENDIO_INCOMPLETE) | 515 | else if (r == DM_ENDIO_INCOMPLETE) |
516 | /* The target will handle the io */ | 516 | /* The target will handle the io */ |
517 | return 1; | 517 | return 1; |
518 | else if (r) { | 518 | else if (r) { |
519 | DMWARN("unimplemented target endio return value: %d", r); | 519 | DMWARN("unimplemented target endio return value: %d", r); |
520 | BUG(); | 520 | BUG(); |
521 | } | 521 | } |
522 | } | 522 | } |
523 | 523 | ||
524 | dec_pending(tio->io, error); | 524 | dec_pending(tio->io, error); |
525 | 525 | ||
526 | /* | 526 | /* |
527 | * Store md for cleanup instead of tio which is about to get freed. | 527 | * Store md for cleanup instead of tio which is about to get freed. |
528 | */ | 528 | */ |
529 | bio->bi_private = md->bs; | 529 | bio->bi_private = md->bs; |
530 | 530 | ||
531 | bio_put(bio); | 531 | bio_put(bio); |
532 | free_tio(md, tio); | 532 | free_tio(md, tio); |
533 | return r; | 533 | return r; |
534 | } | 534 | } |
535 | 535 | ||
536 | static sector_t max_io_len(struct mapped_device *md, | 536 | static sector_t max_io_len(struct mapped_device *md, |
537 | sector_t sector, struct dm_target *ti) | 537 | sector_t sector, struct dm_target *ti) |
538 | { | 538 | { |
539 | sector_t offset = sector - ti->begin; | 539 | sector_t offset = sector - ti->begin; |
540 | sector_t len = ti->len - offset; | 540 | sector_t len = ti->len - offset; |
541 | 541 | ||
542 | /* | 542 | /* |
543 | * Does the target need to split even further? | 543 | * Does the target need to split even further? |
544 | */ | 544 | */ |
545 | if (ti->split_io) { | 545 | if (ti->split_io) { |
546 | sector_t boundary; | 546 | sector_t boundary; |
547 | boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) | 547 | boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) |
548 | - offset; | 548 | - offset; |
549 | if (len > boundary) | 549 | if (len > boundary) |
550 | len = boundary; | 550 | len = boundary; |
551 | } | 551 | } |
552 | 552 | ||
553 | return len; | 553 | return len; |
554 | } | 554 | } |
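
The mask expression in max_io_len() rounds the offset up to the next ti->split_io boundary and caps the clone length there, so no clone crosses a target chunk; the & ~(split_io - 1) trick assumes split_io is a power of two. A standalone sketch of the same arithmetic (not part of this patch):

#include <stdio.h>

typedef unsigned long long sector_t;

static sector_t capped_len(sector_t offset, sector_t len, sector_t split_io)
{
	/* distance from 'offset' to the next multiple of split_io */
	sector_t boundary = ((offset + split_io) & ~(split_io - 1)) - offset;

	return len > boundary ? boundary : len;
}

int main(void)
{
	/* a 10-sector io at offset 5 with split_io = 8 is trimmed to 3
	 * sectors, stopping exactly on the 8-sector boundary */
	printf("%llu\n", capped_len(5, 10, 8));	/* prints 3 */
	return 0;
}
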
555 | 555 | ||
556 | static void __map_bio(struct dm_target *ti, struct bio *clone, | 556 | static void __map_bio(struct dm_target *ti, struct bio *clone, |
557 | struct dm_target_io *tio) | 557 | struct dm_target_io *tio) |
558 | { | 558 | { |
559 | int r; | 559 | int r; |
560 | sector_t sector; | 560 | sector_t sector; |
561 | struct mapped_device *md; | 561 | struct mapped_device *md; |
562 | 562 | ||
563 | /* | 563 | /* |
564 | * Sanity checks. | 564 | * Sanity checks. |
565 | */ | 565 | */ |
566 | BUG_ON(!clone->bi_size); | 566 | BUG_ON(!clone->bi_size); |
567 | 567 | ||
568 | clone->bi_end_io = clone_endio; | 568 | clone->bi_end_io = clone_endio; |
569 | clone->bi_private = tio; | 569 | clone->bi_private = tio; |
570 | 570 | ||
571 | /* | 571 | /* |
572 | * Map the clone. If r == 0 we don't need to do | 572 | * Map the clone. If r == 0 we don't need to do |
573 | * anything, the target has assumed ownership of | 573 | * anything, the target has assumed ownership of |
574 | * this io. | 574 | * this io. |
575 | */ | 575 | */ |
576 | atomic_inc(&tio->io->io_count); | 576 | atomic_inc(&tio->io->io_count); |
577 | sector = clone->bi_sector; | 577 | sector = clone->bi_sector; |
578 | r = ti->type->map(ti, clone, &tio->info); | 578 | r = ti->type->map(ti, clone, &tio->info); |
579 | if (r == DM_MAPIO_REMAPPED) { | 579 | if (r == DM_MAPIO_REMAPPED) { |
580 | /* the bio has been remapped so dispatch it */ | 580 | /* the bio has been remapped so dispatch it */ |
581 | 581 | ||
582 | blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, | 582 | blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, |
583 | tio->io->bio->bi_bdev->bd_dev, sector, | 583 | tio->io->bio->bi_bdev->bd_dev, |
584 | clone->bi_sector); | 584 | clone->bi_sector, sector); |
585 | 585 | ||
586 | generic_make_request(clone); | 586 | generic_make_request(clone); |
587 | } else if (r < 0 || r == DM_MAPIO_REQUEUE) { | 587 | } else if (r < 0 || r == DM_MAPIO_REQUEUE) { |
588 | /* error the io and bail out, or requeue it if needed */ | 588 | /* error the io and bail out, or requeue it if needed */ |
589 | md = tio->io->md; | 589 | md = tio->io->md; |
590 | dec_pending(tio->io, r); | 590 | dec_pending(tio->io, r); |
591 | /* | 591 | /* |
592 | * Store bio_set for cleanup. | 592 | * Store bio_set for cleanup. |
593 | */ | 593 | */ |
594 | clone->bi_private = md->bs; | 594 | clone->bi_private = md->bs; |
595 | bio_put(clone); | 595 | bio_put(clone); |
596 | free_tio(md, tio); | 596 | free_tio(md, tio); |
597 | } else if (r) { | 597 | } else if (r) { |
598 | DMWARN("unimplemented target map return value: %d", r); | 598 | DMWARN("unimplemented target map return value: %d", r); |
599 | BUG(); | 599 | BUG(); |
600 | } | 600 | } |
601 | } | 601 | } |
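
The hunk above is the DM half of this commit: 'sector' is saved before ti->type->map() runs and clone->bi_sector is read after it returns, so the pair describes the io before and after the target's remap. The patch swaps those two arguments so blk_add_trace_remap() receives them in the corrected order, and the original bio's bd_dev is passed so the trace can carry both mapped-from and mapped-to device information, per the commit message. A standalone sketch, in the style of a linear target, showing how ->map() produces that before/after pair (the struct and field names here are illustrative, not the real dm-linear code):

#include <stdio.h>

typedef unsigned long long sector_t;

/* hypothetical linear-style target: sectors [begin, begin + len) on the dm
 * device are shifted onto the backing device starting at 'start' */
struct linear_target {
	sector_t begin;		/* ti->begin: first sector this target owns */
	sector_t start;		/* first sector used on the backing device  */
};

static sector_t linear_map(const struct linear_target *lt, sector_t sector)
{
	return lt->start + (sector - lt->begin);
}

int main(void)
{
	struct linear_target lt = { .begin = 0, .start = 384 };
	sector_t sector = 1000;				/* clone->bi_sector before ->map() */
	sector_t remapped = linear_map(&lt, sector);	/* clone->bi_sector after ->map(): 1384 */

	/* old call order: (..., bd_dev, sector, remapped)
	 * new call order: (..., bd_dev, remapped, sector) */
	printf("remap %llu -> %llu\n", sector, remapped);
	return 0;
}
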
602 | 602 | ||
603 | struct clone_info { | 603 | struct clone_info { |
604 | struct mapped_device *md; | 604 | struct mapped_device *md; |
605 | struct dm_table *map; | 605 | struct dm_table *map; |
606 | struct bio *bio; | 606 | struct bio *bio; |
607 | struct dm_io *io; | 607 | struct dm_io *io; |
608 | sector_t sector; | 608 | sector_t sector; |
609 | sector_t sector_count; | 609 | sector_t sector_count; |
610 | unsigned short idx; | 610 | unsigned short idx; |
611 | }; | 611 | }; |
612 | 612 | ||
613 | static void dm_bio_destructor(struct bio *bio) | 613 | static void dm_bio_destructor(struct bio *bio) |
614 | { | 614 | { |
615 | struct bio_set *bs = bio->bi_private; | 615 | struct bio_set *bs = bio->bi_private; |
616 | 616 | ||
617 | bio_free(bio, bs); | 617 | bio_free(bio, bs); |
618 | } | 618 | } |
619 | 619 | ||
620 | /* | 620 | /* |
621 | * Creates a little bio that just does part of a bvec. | 621 | * Creates a little bio that just does part of a bvec. |
622 | */ | 622 | */ |
623 | static struct bio *split_bvec(struct bio *bio, sector_t sector, | 623 | static struct bio *split_bvec(struct bio *bio, sector_t sector, |
624 | unsigned short idx, unsigned int offset, | 624 | unsigned short idx, unsigned int offset, |
625 | unsigned int len, struct bio_set *bs) | 625 | unsigned int len, struct bio_set *bs) |
626 | { | 626 | { |
627 | struct bio *clone; | 627 | struct bio *clone; |
628 | struct bio_vec *bv = bio->bi_io_vec + idx; | 628 | struct bio_vec *bv = bio->bi_io_vec + idx; |
629 | 629 | ||
630 | clone = bio_alloc_bioset(GFP_NOIO, 1, bs); | 630 | clone = bio_alloc_bioset(GFP_NOIO, 1, bs); |
631 | clone->bi_destructor = dm_bio_destructor; | 631 | clone->bi_destructor = dm_bio_destructor; |
632 | *clone->bi_io_vec = *bv; | 632 | *clone->bi_io_vec = *bv; |
633 | 633 | ||
634 | clone->bi_sector = sector; | 634 | clone->bi_sector = sector; |
635 | clone->bi_bdev = bio->bi_bdev; | 635 | clone->bi_bdev = bio->bi_bdev; |
636 | clone->bi_rw = bio->bi_rw; | 636 | clone->bi_rw = bio->bi_rw; |
637 | clone->bi_vcnt = 1; | 637 | clone->bi_vcnt = 1; |
638 | clone->bi_size = to_bytes(len); | 638 | clone->bi_size = to_bytes(len); |
639 | clone->bi_io_vec->bv_offset = offset; | 639 | clone->bi_io_vec->bv_offset = offset; |
640 | clone->bi_io_vec->bv_len = clone->bi_size; | 640 | clone->bi_io_vec->bv_len = clone->bi_size; |
641 | 641 | ||
642 | return clone; | 642 | return clone; |
643 | } | 643 | } |
644 | 644 | ||
645 | /* | 645 | /* |
646 | * Creates a bio that consists of a range of complete bvecs. | 646 | * Creates a bio that consists of a range of complete bvecs. |
647 | */ | 647 | */ |
648 | static struct bio *clone_bio(struct bio *bio, sector_t sector, | 648 | static struct bio *clone_bio(struct bio *bio, sector_t sector, |
649 | unsigned short idx, unsigned short bv_count, | 649 | unsigned short idx, unsigned short bv_count, |
650 | unsigned int len, struct bio_set *bs) | 650 | unsigned int len, struct bio_set *bs) |
651 | { | 651 | { |
652 | struct bio *clone; | 652 | struct bio *clone; |
653 | 653 | ||
654 | clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); | 654 | clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); |
655 | __bio_clone(clone, bio); | 655 | __bio_clone(clone, bio); |
656 | clone->bi_destructor = dm_bio_destructor; | 656 | clone->bi_destructor = dm_bio_destructor; |
657 | clone->bi_sector = sector; | 657 | clone->bi_sector = sector; |
658 | clone->bi_idx = idx; | 658 | clone->bi_idx = idx; |
659 | clone->bi_vcnt = idx + bv_count; | 659 | clone->bi_vcnt = idx + bv_count; |
660 | clone->bi_size = to_bytes(len); | 660 | clone->bi_size = to_bytes(len); |
661 | clone->bi_flags &= ~(1 << BIO_SEG_VALID); | 661 | clone->bi_flags &= ~(1 << BIO_SEG_VALID); |
662 | 662 | ||
663 | return clone; | 663 | return clone; |
664 | } | 664 | } |
665 | 665 | ||
666 | static void __clone_and_map(struct clone_info *ci) | 666 | static void __clone_and_map(struct clone_info *ci) |
667 | { | 667 | { |
668 | struct bio *clone, *bio = ci->bio; | 668 | struct bio *clone, *bio = ci->bio; |
669 | struct dm_target *ti = dm_table_find_target(ci->map, ci->sector); | 669 | struct dm_target *ti = dm_table_find_target(ci->map, ci->sector); |
670 | sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti); | 670 | sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti); |
671 | struct dm_target_io *tio; | 671 | struct dm_target_io *tio; |
672 | 672 | ||
673 | /* | 673 | /* |
674 | * Allocate a target io object. | 674 | * Allocate a target io object. |
675 | */ | 675 | */ |
676 | tio = alloc_tio(ci->md); | 676 | tio = alloc_tio(ci->md); |
677 | tio->io = ci->io; | 677 | tio->io = ci->io; |
678 | tio->ti = ti; | 678 | tio->ti = ti; |
679 | memset(&tio->info, 0, sizeof(tio->info)); | 679 | memset(&tio->info, 0, sizeof(tio->info)); |
680 | 680 | ||
681 | if (ci->sector_count <= max) { | 681 | if (ci->sector_count <= max) { |
682 | /* | 682 | /* |
683 | * Optimise for the simple case where we can do all of | 683 | * Optimise for the simple case where we can do all of |
684 | * the remaining io with a single clone. | 684 | * the remaining io with a single clone. |
685 | */ | 685 | */ |
686 | clone = clone_bio(bio, ci->sector, ci->idx, | 686 | clone = clone_bio(bio, ci->sector, ci->idx, |
687 | bio->bi_vcnt - ci->idx, ci->sector_count, | 687 | bio->bi_vcnt - ci->idx, ci->sector_count, |
688 | ci->md->bs); | 688 | ci->md->bs); |
689 | __map_bio(ti, clone, tio); | 689 | __map_bio(ti, clone, tio); |
690 | ci->sector_count = 0; | 690 | ci->sector_count = 0; |
691 | 691 | ||
692 | } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { | 692 | } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { |
693 | /* | 693 | /* |
694 | * There are some bvecs that don't span targets. | 694 | * There are some bvecs that don't span targets. |
695 | * Do as many of these as possible. | 695 | * Do as many of these as possible. |
696 | */ | 696 | */ |
697 | int i; | 697 | int i; |
698 | sector_t remaining = max; | 698 | sector_t remaining = max; |
699 | sector_t bv_len; | 699 | sector_t bv_len; |
700 | 700 | ||
701 | for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { | 701 | for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { |
702 | bv_len = to_sector(bio->bi_io_vec[i].bv_len); | 702 | bv_len = to_sector(bio->bi_io_vec[i].bv_len); |
703 | 703 | ||
704 | if (bv_len > remaining) | 704 | if (bv_len > remaining) |
705 | break; | 705 | break; |
706 | 706 | ||
707 | remaining -= bv_len; | 707 | remaining -= bv_len; |
708 | len += bv_len; | 708 | len += bv_len; |
709 | } | 709 | } |
710 | 710 | ||
711 | clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, | 711 | clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, |
712 | ci->md->bs); | 712 | ci->md->bs); |
713 | __map_bio(ti, clone, tio); | 713 | __map_bio(ti, clone, tio); |
714 | 714 | ||
715 | ci->sector += len; | 715 | ci->sector += len; |
716 | ci->sector_count -= len; | 716 | ci->sector_count -= len; |
717 | ci->idx = i; | 717 | ci->idx = i; |
718 | 718 | ||
719 | } else { | 719 | } else { |
720 | /* | 720 | /* |
721 | * Handle a bvec that must be split between two or more targets. | 721 | * Handle a bvec that must be split between two or more targets. |
722 | */ | 722 | */ |
723 | struct bio_vec *bv = bio->bi_io_vec + ci->idx; | 723 | struct bio_vec *bv = bio->bi_io_vec + ci->idx; |
724 | sector_t remaining = to_sector(bv->bv_len); | 724 | sector_t remaining = to_sector(bv->bv_len); |
725 | unsigned int offset = 0; | 725 | unsigned int offset = 0; |
726 | 726 | ||
727 | do { | 727 | do { |
728 | if (offset) { | 728 | if (offset) { |
729 | ti = dm_table_find_target(ci->map, ci->sector); | 729 | ti = dm_table_find_target(ci->map, ci->sector); |
730 | max = max_io_len(ci->md, ci->sector, ti); | 730 | max = max_io_len(ci->md, ci->sector, ti); |
731 | 731 | ||
732 | tio = alloc_tio(ci->md); | 732 | tio = alloc_tio(ci->md); |
733 | tio->io = ci->io; | 733 | tio->io = ci->io; |
734 | tio->ti = ti; | 734 | tio->ti = ti; |
735 | memset(&tio->info, 0, sizeof(tio->info)); | 735 | memset(&tio->info, 0, sizeof(tio->info)); |
736 | } | 736 | } |
737 | 737 | ||
738 | len = min(remaining, max); | 738 | len = min(remaining, max); |
739 | 739 | ||
740 | clone = split_bvec(bio, ci->sector, ci->idx, | 740 | clone = split_bvec(bio, ci->sector, ci->idx, |
741 | bv->bv_offset + offset, len, | 741 | bv->bv_offset + offset, len, |
742 | ci->md->bs); | 742 | ci->md->bs); |
743 | 743 | ||
744 | __map_bio(ti, clone, tio); | 744 | __map_bio(ti, clone, tio); |
745 | 745 | ||
746 | ci->sector += len; | 746 | ci->sector += len; |
747 | ci->sector_count -= len; | 747 | ci->sector_count -= len; |
748 | offset += to_bytes(len); | 748 | offset += to_bytes(len); |
749 | } while (remaining -= len); | 749 | } while (remaining -= len); |
750 | 750 | ||
751 | ci->idx++; | 751 | ci->idx++; |
752 | } | 752 | } |
753 | } | 753 | } |
754 | 754 | ||
755 | /* | 755 | /* |
756 | * Split the bio into several clones. | 756 | * Split the bio into several clones. |
757 | */ | 757 | */ |
758 | static void __split_bio(struct mapped_device *md, struct bio *bio) | 758 | static void __split_bio(struct mapped_device *md, struct bio *bio) |
759 | { | 759 | { |
760 | struct clone_info ci; | 760 | struct clone_info ci; |
761 | 761 | ||
762 | ci.map = dm_get_table(md); | 762 | ci.map = dm_get_table(md); |
763 | if (!ci.map) { | 763 | if (!ci.map) { |
764 | bio_io_error(bio, bio->bi_size); | 764 | bio_io_error(bio, bio->bi_size); |
765 | return; | 765 | return; |
766 | } | 766 | } |
767 | 767 | ||
768 | ci.md = md; | 768 | ci.md = md; |
769 | ci.bio = bio; | 769 | ci.bio = bio; |
770 | ci.io = alloc_io(md); | 770 | ci.io = alloc_io(md); |
771 | ci.io->error = 0; | 771 | ci.io->error = 0; |
772 | atomic_set(&ci.io->io_count, 1); | 772 | atomic_set(&ci.io->io_count, 1); |
773 | ci.io->bio = bio; | 773 | ci.io->bio = bio; |
774 | ci.io->md = md; | 774 | ci.io->md = md; |
775 | ci.sector = bio->bi_sector; | 775 | ci.sector = bio->bi_sector; |
776 | ci.sector_count = bio_sectors(bio); | 776 | ci.sector_count = bio_sectors(bio); |
777 | ci.idx = bio->bi_idx; | 777 | ci.idx = bio->bi_idx; |
778 | 778 | ||
779 | start_io_acct(ci.io); | 779 | start_io_acct(ci.io); |
780 | while (ci.sector_count) | 780 | while (ci.sector_count) |
781 | __clone_and_map(&ci); | 781 | __clone_and_map(&ci); |
782 | 782 | ||
783 | /* drop the extra reference count */ | 783 | /* drop the extra reference count */ |
784 | dec_pending(ci.io, 0); | 784 | dec_pending(ci.io, 0); |
785 | dm_table_put(ci.map); | 785 | dm_table_put(ci.map); |
786 | } | 786 | } |
787 | /*----------------------------------------------------------------- | 787 | /*----------------------------------------------------------------- |
788 | * CRUD END | 788 | * CRUD END |
789 | *---------------------------------------------------------------*/ | 789 | *---------------------------------------------------------------*/ |
790 | 790 | ||
791 | /* | 791 | /* |
792 | * The request function that just remaps the bio built up by | 792 | * The request function that just remaps the bio built up by |
793 | * dm_merge_bvec. | 793 | * dm_merge_bvec. |
794 | */ | 794 | */ |
795 | static int dm_request(struct request_queue *q, struct bio *bio) | 795 | static int dm_request(struct request_queue *q, struct bio *bio) |
796 | { | 796 | { |
797 | int r; | 797 | int r; |
798 | int rw = bio_data_dir(bio); | 798 | int rw = bio_data_dir(bio); |
799 | struct mapped_device *md = q->queuedata; | 799 | struct mapped_device *md = q->queuedata; |
800 | 800 | ||
801 | /* | 801 | /* |
802 | * There is no use in forwarding any barrier request since we can't | 802 | * There is no use in forwarding any barrier request since we can't |
803 | * guarantee it is (or can be) handled by the targets correctly. | 803 | * guarantee it is (or can be) handled by the targets correctly. |
804 | */ | 804 | */ |
805 | if (unlikely(bio_barrier(bio))) { | 805 | if (unlikely(bio_barrier(bio))) { |
806 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); | 806 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); |
807 | return 0; | 807 | return 0; |
808 | } | 808 | } |
809 | 809 | ||
810 | down_read(&md->io_lock); | 810 | down_read(&md->io_lock); |
811 | 811 | ||
812 | disk_stat_inc(dm_disk(md), ios[rw]); | 812 | disk_stat_inc(dm_disk(md), ios[rw]); |
813 | disk_stat_add(dm_disk(md), sectors[rw], bio_sectors(bio)); | 813 | disk_stat_add(dm_disk(md), sectors[rw], bio_sectors(bio)); |
814 | 814 | ||
815 | /* | 815 | /* |
816 | * If we're suspended we have to queue | 816 | * If we're suspended we have to queue |
817 | * this io for later. | 817 | * this io for later. |
818 | */ | 818 | */ |
819 | while (test_bit(DMF_BLOCK_IO, &md->flags)) { | 819 | while (test_bit(DMF_BLOCK_IO, &md->flags)) { |
820 | up_read(&md->io_lock); | 820 | up_read(&md->io_lock); |
821 | 821 | ||
822 | if (bio_rw(bio) == READA) { | 822 | if (bio_rw(bio) == READA) { |
823 | bio_io_error(bio, bio->bi_size); | 823 | bio_io_error(bio, bio->bi_size); |
824 | return 0; | 824 | return 0; |
825 | } | 825 | } |
826 | 826 | ||
827 | r = queue_io(md, bio); | 827 | r = queue_io(md, bio); |
828 | if (r < 0) { | 828 | if (r < 0) { |
829 | bio_io_error(bio, bio->bi_size); | 829 | bio_io_error(bio, bio->bi_size); |
830 | return 0; | 830 | return 0; |
831 | 831 | ||
832 | } else if (r == 0) | 832 | } else if (r == 0) |
833 | return 0; /* deferred successfully */ | 833 | return 0; /* deferred successfully */ |
834 | 834 | ||
835 | /* | 835 | /* |
836 | * We're in a while loop, because someone could suspend | 836 | * We're in a while loop, because someone could suspend |
837 | * before we get to the following read lock. | 837 | * before we get to the following read lock. |
838 | */ | 838 | */ |
839 | down_read(&md->io_lock); | 839 | down_read(&md->io_lock); |
840 | } | 840 | } |
841 | 841 | ||
842 | __split_bio(md, bio); | 842 | __split_bio(md, bio); |
843 | up_read(&md->io_lock); | 843 | up_read(&md->io_lock); |
844 | return 0; | 844 | return 0; |
845 | } | 845 | } |
846 | 846 | ||
847 | static int dm_flush_all(struct request_queue *q, struct gendisk *disk, | 847 | static int dm_flush_all(struct request_queue *q, struct gendisk *disk, |
848 | sector_t *error_sector) | 848 | sector_t *error_sector) |
849 | { | 849 | { |
850 | struct mapped_device *md = q->queuedata; | 850 | struct mapped_device *md = q->queuedata; |
851 | struct dm_table *map = dm_get_table(md); | 851 | struct dm_table *map = dm_get_table(md); |
852 | int ret = -ENXIO; | 852 | int ret = -ENXIO; |
853 | 853 | ||
854 | if (map) { | 854 | if (map) { |
855 | ret = dm_table_flush_all(map); | 855 | ret = dm_table_flush_all(map); |
856 | dm_table_put(map); | 856 | dm_table_put(map); |
857 | } | 857 | } |
858 | 858 | ||
859 | return ret; | 859 | return ret; |
860 | } | 860 | } |
861 | 861 | ||
862 | static void dm_unplug_all(struct request_queue *q) | 862 | static void dm_unplug_all(struct request_queue *q) |
863 | { | 863 | { |
864 | struct mapped_device *md = q->queuedata; | 864 | struct mapped_device *md = q->queuedata; |
865 | struct dm_table *map = dm_get_table(md); | 865 | struct dm_table *map = dm_get_table(md); |
866 | 866 | ||
867 | if (map) { | 867 | if (map) { |
868 | dm_table_unplug_all(map); | 868 | dm_table_unplug_all(map); |
869 | dm_table_put(map); | 869 | dm_table_put(map); |
870 | } | 870 | } |
871 | } | 871 | } |
872 | 872 | ||
873 | static int dm_any_congested(void *congested_data, int bdi_bits) | 873 | static int dm_any_congested(void *congested_data, int bdi_bits) |
874 | { | 874 | { |
875 | int r; | 875 | int r; |
876 | struct mapped_device *md = (struct mapped_device *) congested_data; | 876 | struct mapped_device *md = (struct mapped_device *) congested_data; |
877 | struct dm_table *map = dm_get_table(md); | 877 | struct dm_table *map = dm_get_table(md); |
878 | 878 | ||
879 | if (!map || test_bit(DMF_BLOCK_IO, &md->flags)) | 879 | if (!map || test_bit(DMF_BLOCK_IO, &md->flags)) |
880 | r = bdi_bits; | 880 | r = bdi_bits; |
881 | else | 881 | else |
882 | r = dm_table_any_congested(map, bdi_bits); | 882 | r = dm_table_any_congested(map, bdi_bits); |
883 | 883 | ||
884 | dm_table_put(map); | 884 | dm_table_put(map); |
885 | return r; | 885 | return r; |
886 | } | 886 | } |
887 | 887 | ||
888 | /*----------------------------------------------------------------- | 888 | /*----------------------------------------------------------------- |
889 | * An IDR is used to keep track of allocated minor numbers. | 889 | * An IDR is used to keep track of allocated minor numbers. |
890 | *---------------------------------------------------------------*/ | 890 | *---------------------------------------------------------------*/ |
891 | static DEFINE_IDR(_minor_idr); | 891 | static DEFINE_IDR(_minor_idr); |
892 | 892 | ||
893 | static void free_minor(int minor) | 893 | static void free_minor(int minor) |
894 | { | 894 | { |
895 | spin_lock(&_minor_lock); | 895 | spin_lock(&_minor_lock); |
896 | idr_remove(&_minor_idr, minor); | 896 | idr_remove(&_minor_idr, minor); |
897 | spin_unlock(&_minor_lock); | 897 | spin_unlock(&_minor_lock); |
898 | } | 898 | } |
899 | 899 | ||
900 | /* | 900 | /* |
901 | * See if the device with a specific minor # is free. | 901 | * See if the device with a specific minor # is free. |
902 | */ | 902 | */ |
903 | static int specific_minor(struct mapped_device *md, int minor) | 903 | static int specific_minor(struct mapped_device *md, int minor) |
904 | { | 904 | { |
905 | int r, m; | 905 | int r, m; |
906 | 906 | ||
907 | if (minor >= (1 << MINORBITS)) | 907 | if (minor >= (1 << MINORBITS)) |
908 | return -EINVAL; | 908 | return -EINVAL; |
909 | 909 | ||
910 | r = idr_pre_get(&_minor_idr, GFP_KERNEL); | 910 | r = idr_pre_get(&_minor_idr, GFP_KERNEL); |
911 | if (!r) | 911 | if (!r) |
912 | return -ENOMEM; | 912 | return -ENOMEM; |
913 | 913 | ||
914 | spin_lock(&_minor_lock); | 914 | spin_lock(&_minor_lock); |
915 | 915 | ||
916 | if (idr_find(&_minor_idr, minor)) { | 916 | if (idr_find(&_minor_idr, minor)) { |
917 | r = -EBUSY; | 917 | r = -EBUSY; |
918 | goto out; | 918 | goto out; |
919 | } | 919 | } |
920 | 920 | ||
921 | r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); | 921 | r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); |
922 | if (r) | 922 | if (r) |
923 | goto out; | 923 | goto out; |
924 | 924 | ||
925 | if (m != minor) { | 925 | if (m != minor) { |
926 | idr_remove(&_minor_idr, m); | 926 | idr_remove(&_minor_idr, m); |
927 | r = -EBUSY; | 927 | r = -EBUSY; |
928 | goto out; | 928 | goto out; |
929 | } | 929 | } |
930 | 930 | ||
931 | out: | 931 | out: |
932 | spin_unlock(&_minor_lock); | 932 | spin_unlock(&_minor_lock); |
933 | return r; | 933 | return r; |
934 | } | 934 | } |
935 | 935 | ||
936 | static int next_free_minor(struct mapped_device *md, int *minor) | 936 | static int next_free_minor(struct mapped_device *md, int *minor) |
937 | { | 937 | { |
938 | int r, m; | 938 | int r, m; |
939 | 939 | ||
940 | r = idr_pre_get(&_minor_idr, GFP_KERNEL); | 940 | r = idr_pre_get(&_minor_idr, GFP_KERNEL); |
941 | if (!r) | 941 | if (!r) |
942 | return -ENOMEM; | 942 | return -ENOMEM; |
943 | 943 | ||
944 | spin_lock(&_minor_lock); | 944 | spin_lock(&_minor_lock); |
945 | 945 | ||
946 | r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); | 946 | r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); |
947 | if (r) { | 947 | if (r) { |
948 | goto out; | 948 | goto out; |
949 | } | 949 | } |
950 | 950 | ||
951 | if (m >= (1 << MINORBITS)) { | 951 | if (m >= (1 << MINORBITS)) { |
952 | idr_remove(&_minor_idr, m); | 952 | idr_remove(&_minor_idr, m); |
953 | r = -ENOSPC; | 953 | r = -ENOSPC; |
954 | goto out; | 954 | goto out; |
955 | } | 955 | } |
956 | 956 | ||
957 | *minor = m; | 957 | *minor = m; |
958 | 958 | ||
959 | out: | 959 | out: |
960 | spin_unlock(&_minor_lock); | 960 | spin_unlock(&_minor_lock); |
961 | return r; | 961 | return r; |
962 | } | 962 | } |
963 | 963 | ||
964 | static struct block_device_operations dm_blk_dops; | 964 | static struct block_device_operations dm_blk_dops; |
965 | 965 | ||
966 | /* | 966 | /* |
967 | * Allocate and initialise a blank device with a given minor. | 967 | * Allocate and initialise a blank device with a given minor. |
968 | */ | 968 | */ |
969 | static struct mapped_device *alloc_dev(int minor) | 969 | static struct mapped_device *alloc_dev(int minor) |
970 | { | 970 | { |
971 | int r; | 971 | int r; |
972 | struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); | 972 | struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL); |
973 | void *old_md; | 973 | void *old_md; |
974 | 974 | ||
975 | if (!md) { | 975 | if (!md) { |
976 | DMWARN("unable to allocate device, out of memory."); | 976 | DMWARN("unable to allocate device, out of memory."); |
977 | return NULL; | 977 | return NULL; |
978 | } | 978 | } |
979 | 979 | ||
980 | if (!try_module_get(THIS_MODULE)) | 980 | if (!try_module_get(THIS_MODULE)) |
981 | goto bad0; | 981 | goto bad0; |
982 | 982 | ||
983 | /* get a minor number for the dev */ | 983 | /* get a minor number for the dev */ |
984 | if (minor == DM_ANY_MINOR) | 984 | if (minor == DM_ANY_MINOR) |
985 | r = next_free_minor(md, &minor); | 985 | r = next_free_minor(md, &minor); |
986 | else | 986 | else |
987 | r = specific_minor(md, minor); | 987 | r = specific_minor(md, minor); |
988 | if (r < 0) | 988 | if (r < 0) |
989 | goto bad1; | 989 | goto bad1; |
990 | 990 | ||
991 | memset(md, 0, sizeof(*md)); | 991 | memset(md, 0, sizeof(*md)); |
992 | init_rwsem(&md->io_lock); | 992 | init_rwsem(&md->io_lock); |
993 | init_MUTEX(&md->suspend_lock); | 993 | init_MUTEX(&md->suspend_lock); |
994 | spin_lock_init(&md->pushback_lock); | 994 | spin_lock_init(&md->pushback_lock); |
995 | rwlock_init(&md->map_lock); | 995 | rwlock_init(&md->map_lock); |
996 | atomic_set(&md->holders, 1); | 996 | atomic_set(&md->holders, 1); |
997 | atomic_set(&md->open_count, 0); | 997 | atomic_set(&md->open_count, 0); |
998 | atomic_set(&md->event_nr, 0); | 998 | atomic_set(&md->event_nr, 0); |
999 | 999 | ||
1000 | md->queue = blk_alloc_queue(GFP_KERNEL); | 1000 | md->queue = blk_alloc_queue(GFP_KERNEL); |
1001 | if (!md->queue) | 1001 | if (!md->queue) |
1002 | goto bad1_free_minor; | 1002 | goto bad1_free_minor; |
1003 | 1003 | ||
1004 | md->queue->queuedata = md; | 1004 | md->queue->queuedata = md; |
1005 | md->queue->backing_dev_info.congested_fn = dm_any_congested; | 1005 | md->queue->backing_dev_info.congested_fn = dm_any_congested; |
1006 | md->queue->backing_dev_info.congested_data = md; | 1006 | md->queue->backing_dev_info.congested_data = md; |
1007 | blk_queue_make_request(md->queue, dm_request); | 1007 | blk_queue_make_request(md->queue, dm_request); |
1008 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); | 1008 | blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); |
1009 | md->queue->unplug_fn = dm_unplug_all; | 1009 | md->queue->unplug_fn = dm_unplug_all; |
1010 | md->queue->issue_flush_fn = dm_flush_all; | 1010 | md->queue->issue_flush_fn = dm_flush_all; |
1011 | 1011 | ||
1012 | md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); | 1012 | md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache); |
1013 | if (!md->io_pool) | 1013 | if (!md->io_pool) |
1014 | goto bad2; | 1014 | goto bad2; |
1015 | 1015 | ||
1016 | md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); | 1016 | md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache); |
1017 | if (!md->tio_pool) | 1017 | if (!md->tio_pool) |
1018 | goto bad3; | 1018 | goto bad3; |
1019 | 1019 | ||
1020 | md->bs = bioset_create(16, 16); | 1020 | md->bs = bioset_create(16, 16); |
1021 | if (!md->bs) | 1021 | if (!md->bs) |
1022 | goto bad_no_bioset; | 1022 | goto bad_no_bioset; |
1023 | 1023 | ||
1024 | md->disk = alloc_disk(1); | 1024 | md->disk = alloc_disk(1); |
1025 | if (!md->disk) | 1025 | if (!md->disk) |
1026 | goto bad4; | 1026 | goto bad4; |
1027 | 1027 | ||
1028 | atomic_set(&md->pending, 0); | 1028 | atomic_set(&md->pending, 0); |
1029 | init_waitqueue_head(&md->wait); | 1029 | init_waitqueue_head(&md->wait); |
1030 | init_waitqueue_head(&md->eventq); | 1030 | init_waitqueue_head(&md->eventq); |
1031 | 1031 | ||
1032 | md->disk->major = _major; | 1032 | md->disk->major = _major; |
1033 | md->disk->first_minor = minor; | 1033 | md->disk->first_minor = minor; |
1034 | md->disk->fops = &dm_blk_dops; | 1034 | md->disk->fops = &dm_blk_dops; |
1035 | md->disk->queue = md->queue; | 1035 | md->disk->queue = md->queue; |
1036 | md->disk->private_data = md; | 1036 | md->disk->private_data = md; |
1037 | sprintf(md->disk->disk_name, "dm-%d", minor); | 1037 | sprintf(md->disk->disk_name, "dm-%d", minor); |
1038 | add_disk(md->disk); | 1038 | add_disk(md->disk); |
1039 | format_dev_t(md->name, MKDEV(_major, minor)); | 1039 | format_dev_t(md->name, MKDEV(_major, minor)); |
1040 | 1040 | ||
1041 | /* Populate the mapping, nobody knows we exist yet */ | 1041 | /* Populate the mapping, nobody knows we exist yet */ |
1042 | spin_lock(&_minor_lock); | 1042 | spin_lock(&_minor_lock); |
1043 | old_md = idr_replace(&_minor_idr, md, minor); | 1043 | old_md = idr_replace(&_minor_idr, md, minor); |
1044 | spin_unlock(&_minor_lock); | 1044 | spin_unlock(&_minor_lock); |
1045 | 1045 | ||
1046 | BUG_ON(old_md != MINOR_ALLOCED); | 1046 | BUG_ON(old_md != MINOR_ALLOCED); |
1047 | 1047 | ||
1048 | return md; | 1048 | return md; |
1049 | 1049 | ||
1050 | bad4: | 1050 | bad4: |
1051 | bioset_free(md->bs); | 1051 | bioset_free(md->bs); |
1052 | bad_no_bioset: | 1052 | bad_no_bioset: |
1053 | mempool_destroy(md->tio_pool); | 1053 | mempool_destroy(md->tio_pool); |
1054 | bad3: | 1054 | bad3: |
1055 | mempool_destroy(md->io_pool); | 1055 | mempool_destroy(md->io_pool); |
1056 | bad2: | 1056 | bad2: |
1057 | blk_cleanup_queue(md->queue); | 1057 | blk_cleanup_queue(md->queue); |
1058 | bad1_free_minor: | 1058 | bad1_free_minor: |
1059 | free_minor(minor); | 1059 | free_minor(minor); |
1060 | bad1: | 1060 | bad1: |
1061 | module_put(THIS_MODULE); | 1061 | module_put(THIS_MODULE); |
1062 | bad0: | 1062 | bad0: |
1063 | kfree(md); | 1063 | kfree(md); |
1064 | return NULL; | 1064 | return NULL; |
1065 | } | 1065 | } |
1066 | 1066 | ||
1067 | static void free_dev(struct mapped_device *md) | 1067 | static void free_dev(struct mapped_device *md) |
1068 | { | 1068 | { |
1069 | int minor = md->disk->first_minor; | 1069 | int minor = md->disk->first_minor; |
1070 | 1070 | ||
1071 | if (md->suspended_bdev) { | 1071 | if (md->suspended_bdev) { |
1072 | thaw_bdev(md->suspended_bdev, NULL); | 1072 | thaw_bdev(md->suspended_bdev, NULL); |
1073 | bdput(md->suspended_bdev); | 1073 | bdput(md->suspended_bdev); |
1074 | } | 1074 | } |
1075 | mempool_destroy(md->tio_pool); | 1075 | mempool_destroy(md->tio_pool); |
1076 | mempool_destroy(md->io_pool); | 1076 | mempool_destroy(md->io_pool); |
1077 | bioset_free(md->bs); | 1077 | bioset_free(md->bs); |
1078 | del_gendisk(md->disk); | 1078 | del_gendisk(md->disk); |
1079 | free_minor(minor); | 1079 | free_minor(minor); |
1080 | 1080 | ||
1081 | spin_lock(&_minor_lock); | 1081 | spin_lock(&_minor_lock); |
1082 | md->disk->private_data = NULL; | 1082 | md->disk->private_data = NULL; |
1083 | spin_unlock(&_minor_lock); | 1083 | spin_unlock(&_minor_lock); |
1084 | 1084 | ||
1085 | put_disk(md->disk); | 1085 | put_disk(md->disk); |
1086 | blk_cleanup_queue(md->queue); | 1086 | blk_cleanup_queue(md->queue); |
1087 | module_put(THIS_MODULE); | 1087 | module_put(THIS_MODULE); |
1088 | kfree(md); | 1088 | kfree(md); |
1089 | } | 1089 | } |
1090 | 1090 | ||
1091 | /* | 1091 | /* |
1092 | * Bind a table to the device. | 1092 | * Bind a table to the device. |
1093 | */ | 1093 | */ |
1094 | static void event_callback(void *context) | 1094 | static void event_callback(void *context) |
1095 | { | 1095 | { |
1096 | struct mapped_device *md = (struct mapped_device *) context; | 1096 | struct mapped_device *md = (struct mapped_device *) context; |
1097 | 1097 | ||
1098 | atomic_inc(&md->event_nr); | 1098 | atomic_inc(&md->event_nr); |
1099 | wake_up(&md->eventq); | 1099 | wake_up(&md->eventq); |
1100 | } | 1100 | } |
1101 | 1101 | ||
1102 | static void __set_size(struct mapped_device *md, sector_t size) | 1102 | static void __set_size(struct mapped_device *md, sector_t size) |
1103 | { | 1103 | { |
1104 | set_capacity(md->disk, size); | 1104 | set_capacity(md->disk, size); |
1105 | 1105 | ||
1106 | mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); | 1106 | mutex_lock(&md->suspended_bdev->bd_inode->i_mutex); |
1107 | i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); | 1107 | i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); |
1108 | mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); | 1108 | mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex); |
1109 | } | 1109 | } |
1110 | 1110 | ||
1111 | static int __bind(struct mapped_device *md, struct dm_table *t) | 1111 | static int __bind(struct mapped_device *md, struct dm_table *t) |
1112 | { | 1112 | { |
1113 | struct request_queue *q = md->queue; | 1113 | struct request_queue *q = md->queue; |
1114 | sector_t size; | 1114 | sector_t size; |
1115 | 1115 | ||
1116 | size = dm_table_get_size(t); | 1116 | size = dm_table_get_size(t); |
1117 | 1117 | ||
1118 | /* | 1118 | /* |
1119 | * Wipe any geometry if the size of the table changed. | 1119 | * Wipe any geometry if the size of the table changed. |
1120 | */ | 1120 | */ |
1121 | if (size != get_capacity(md->disk)) | 1121 | if (size != get_capacity(md->disk)) |
1122 | memset(&md->geometry, 0, sizeof(md->geometry)); | 1122 | memset(&md->geometry, 0, sizeof(md->geometry)); |
1123 | 1123 | ||
1124 | if (md->suspended_bdev) | 1124 | if (md->suspended_bdev) |
1125 | __set_size(md, size); | 1125 | __set_size(md, size); |
1126 | if (size == 0) | 1126 | if (size == 0) |
1127 | return 0; | 1127 | return 0; |
1128 | 1128 | ||
1129 | dm_table_get(t); | 1129 | dm_table_get(t); |
1130 | dm_table_event_callback(t, event_callback, md); | 1130 | dm_table_event_callback(t, event_callback, md); |
1131 | 1131 | ||
1132 | write_lock(&md->map_lock); | 1132 | write_lock(&md->map_lock); |
1133 | md->map = t; | 1133 | md->map = t; |
1134 | dm_table_set_restrictions(t, q); | 1134 | dm_table_set_restrictions(t, q); |
1135 | write_unlock(&md->map_lock); | 1135 | write_unlock(&md->map_lock); |
1136 | 1136 | ||
1137 | return 0; | 1137 | return 0; |
1138 | } | 1138 | } |
1139 | 1139 | ||
1140 | static void __unbind(struct mapped_device *md) | 1140 | static void __unbind(struct mapped_device *md) |
1141 | { | 1141 | { |
1142 | struct dm_table *map = md->map; | 1142 | struct dm_table *map = md->map; |
1143 | 1143 | ||
1144 | if (!map) | 1144 | if (!map) |
1145 | return; | 1145 | return; |
1146 | 1146 | ||
1147 | dm_table_event_callback(map, NULL, NULL); | 1147 | dm_table_event_callback(map, NULL, NULL); |
1148 | write_lock(&md->map_lock); | 1148 | write_lock(&md->map_lock); |
1149 | md->map = NULL; | 1149 | md->map = NULL; |
1150 | write_unlock(&md->map_lock); | 1150 | write_unlock(&md->map_lock); |
1151 | dm_table_put(map); | 1151 | dm_table_put(map); |
1152 | } | 1152 | } |
1153 | 1153 | ||
1154 | /* | 1154 | /* |
1155 | * Constructor for a new device. | 1155 | * Constructor for a new device. |
1156 | */ | 1156 | */ |
1157 | int dm_create(int minor, struct mapped_device **result) | 1157 | int dm_create(int minor, struct mapped_device **result) |
1158 | { | 1158 | { |
1159 | struct mapped_device *md; | 1159 | struct mapped_device *md; |
1160 | 1160 | ||
1161 | md = alloc_dev(minor); | 1161 | md = alloc_dev(minor); |
1162 | if (!md) | 1162 | if (!md) |
1163 | return -ENXIO; | 1163 | return -ENXIO; |
1164 | 1164 | ||
1165 | *result = md; | 1165 | *result = md; |
1166 | return 0; | 1166 | return 0; |
1167 | } | 1167 | } |
1168 | 1168 | ||
1169 | static struct mapped_device *dm_find_md(dev_t dev) | 1169 | static struct mapped_device *dm_find_md(dev_t dev) |
1170 | { | 1170 | { |
1171 | struct mapped_device *md; | 1171 | struct mapped_device *md; |
1172 | unsigned minor = MINOR(dev); | 1172 | unsigned minor = MINOR(dev); |
1173 | 1173 | ||
1174 | if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) | 1174 | if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) |
1175 | return NULL; | 1175 | return NULL; |
1176 | 1176 | ||
1177 | spin_lock(&_minor_lock); | 1177 | spin_lock(&_minor_lock); |
1178 | 1178 | ||
1179 | md = idr_find(&_minor_idr, minor); | 1179 | md = idr_find(&_minor_idr, minor); |
1180 | if (md && (md == MINOR_ALLOCED || | 1180 | if (md && (md == MINOR_ALLOCED || |
1181 | (dm_disk(md)->first_minor != minor) || | 1181 | (dm_disk(md)->first_minor != minor) || |
1182 | test_bit(DMF_FREEING, &md->flags))) { | 1182 | test_bit(DMF_FREEING, &md->flags))) { |
1183 | md = NULL; | 1183 | md = NULL; |
1184 | goto out; | 1184 | goto out; |
1185 | } | 1185 | } |
1186 | 1186 | ||
1187 | out: | 1187 | out: |
1188 | spin_unlock(&_minor_lock); | 1188 | spin_unlock(&_minor_lock); |
1189 | 1189 | ||
1190 | return md; | 1190 | return md; |
1191 | } | 1191 | } |
1192 | 1192 | ||
1193 | struct mapped_device *dm_get_md(dev_t dev) | 1193 | struct mapped_device *dm_get_md(dev_t dev) |
1194 | { | 1194 | { |
1195 | struct mapped_device *md = dm_find_md(dev); | 1195 | struct mapped_device *md = dm_find_md(dev); |
1196 | 1196 | ||
1197 | if (md) | 1197 | if (md) |
1198 | dm_get(md); | 1198 | dm_get(md); |
1199 | 1199 | ||
1200 | return md; | 1200 | return md; |
1201 | } | 1201 | } |
1202 | 1202 | ||
1203 | void *dm_get_mdptr(struct mapped_device *md) | 1203 | void *dm_get_mdptr(struct mapped_device *md) |
1204 | { | 1204 | { |
1205 | return md->interface_ptr; | 1205 | return md->interface_ptr; |
1206 | } | 1206 | } |
1207 | 1207 | ||
1208 | void dm_set_mdptr(struct mapped_device *md, void *ptr) | 1208 | void dm_set_mdptr(struct mapped_device *md, void *ptr) |
1209 | { | 1209 | { |
1210 | md->interface_ptr = ptr; | 1210 | md->interface_ptr = ptr; |
1211 | } | 1211 | } |
1212 | 1212 | ||
1213 | void dm_get(struct mapped_device *md) | 1213 | void dm_get(struct mapped_device *md) |
1214 | { | 1214 | { |
1215 | atomic_inc(&md->holders); | 1215 | atomic_inc(&md->holders); |
1216 | } | 1216 | } |
1217 | 1217 | ||
1218 | const char *dm_device_name(struct mapped_device *md) | 1218 | const char *dm_device_name(struct mapped_device *md) |
1219 | { | 1219 | { |
1220 | return md->name; | 1220 | return md->name; |
1221 | } | 1221 | } |
1222 | EXPORT_SYMBOL_GPL(dm_device_name); | 1222 | EXPORT_SYMBOL_GPL(dm_device_name); |
1223 | 1223 | ||
1224 | void dm_put(struct mapped_device *md) | 1224 | void dm_put(struct mapped_device *md) |
1225 | { | 1225 | { |
1226 | struct dm_table *map; | 1226 | struct dm_table *map; |
1227 | 1227 | ||
1228 | BUG_ON(test_bit(DMF_FREEING, &md->flags)); | 1228 | BUG_ON(test_bit(DMF_FREEING, &md->flags)); |
1229 | 1229 | ||
1230 | if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { | 1230 | if (atomic_dec_and_lock(&md->holders, &_minor_lock)) { |
1231 | map = dm_get_table(md); | 1231 | map = dm_get_table(md); |
1232 | idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor); | 1232 | idr_replace(&_minor_idr, MINOR_ALLOCED, dm_disk(md)->first_minor); |
1233 | set_bit(DMF_FREEING, &md->flags); | 1233 | set_bit(DMF_FREEING, &md->flags); |
1234 | spin_unlock(&_minor_lock); | 1234 | spin_unlock(&_minor_lock); |
1235 | if (!dm_suspended(md)) { | 1235 | if (!dm_suspended(md)) { |
1236 | dm_table_presuspend_targets(map); | 1236 | dm_table_presuspend_targets(map); |
1237 | dm_table_postsuspend_targets(map); | 1237 | dm_table_postsuspend_targets(map); |
1238 | } | 1238 | } |
1239 | __unbind(md); | 1239 | __unbind(md); |
1240 | dm_table_put(map); | 1240 | dm_table_put(map); |
1241 | free_dev(md); | 1241 | free_dev(md); |
1242 | } | 1242 | } |
1243 | } | 1243 | } |
1244 | EXPORT_SYMBOL_GPL(dm_put); | 1244 | EXPORT_SYMBOL_GPL(dm_put); |
1245 | 1245 | ||
1246 | /* | 1246 | /* |
1247 | * Process the deferred bios | 1247 | * Process the deferred bios |
1248 | */ | 1248 | */ |
1249 | static void __flush_deferred_io(struct mapped_device *md, struct bio *c) | 1249 | static void __flush_deferred_io(struct mapped_device *md, struct bio *c) |
1250 | { | 1250 | { |
1251 | struct bio *n; | 1251 | struct bio *n; |
1252 | 1252 | ||
1253 | while (c) { | 1253 | while (c) { |
1254 | n = c->bi_next; | 1254 | n = c->bi_next; |
1255 | c->bi_next = NULL; | 1255 | c->bi_next = NULL; |
1256 | __split_bio(md, c); | 1256 | __split_bio(md, c); |
1257 | c = n; | 1257 | c = n; |
1258 | } | 1258 | } |
1259 | } | 1259 | } |
1260 | 1260 | ||
1261 | /* | 1261 | /* |
1262 | * Swap in a new table (destroying old one). | 1262 | * Swap in a new table (destroying old one). |
1263 | */ | 1263 | */ |
1264 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) | 1264 | int dm_swap_table(struct mapped_device *md, struct dm_table *table) |
1265 | { | 1265 | { |
1266 | int r = -EINVAL; | 1266 | int r = -EINVAL; |
1267 | 1267 | ||
1268 | down(&md->suspend_lock); | 1268 | down(&md->suspend_lock); |
1269 | 1269 | ||
1270 | /* device must be suspended */ | 1270 | /* device must be suspended */ |
1271 | if (!dm_suspended(md)) | 1271 | if (!dm_suspended(md)) |
1272 | goto out; | 1272 | goto out; |
1273 | 1273 | ||
1274 | /* without bdev, the device size cannot be changed */ | 1274 | /* without bdev, the device size cannot be changed */ |
1275 | if (!md->suspended_bdev) | 1275 | if (!md->suspended_bdev) |
1276 | if (get_capacity(md->disk) != dm_table_get_size(table)) | 1276 | if (get_capacity(md->disk) != dm_table_get_size(table)) |
1277 | goto out; | 1277 | goto out; |
1278 | 1278 | ||
1279 | __unbind(md); | 1279 | __unbind(md); |
1280 | r = __bind(md, table); | 1280 | r = __bind(md, table); |
1281 | 1281 | ||
1282 | out: | 1282 | out: |
1283 | up(&md->suspend_lock); | 1283 | up(&md->suspend_lock); |
1284 | return r; | 1284 | return r; |
1285 | } | 1285 | } |
1286 | 1286 | ||
1287 | /* | 1287 | /* |
1288 | * Functions to lock and unlock any filesystem running on the | 1288 | * Functions to lock and unlock any filesystem running on the |
1289 | * device. | 1289 | * device. |
1290 | */ | 1290 | */ |
1291 | static int lock_fs(struct mapped_device *md) | 1291 | static int lock_fs(struct mapped_device *md) |
1292 | { | 1292 | { |
1293 | int r; | 1293 | int r; |
1294 | 1294 | ||
1295 | WARN_ON(md->frozen_sb); | 1295 | WARN_ON(md->frozen_sb); |
1296 | 1296 | ||
1297 | md->frozen_sb = freeze_bdev(md->suspended_bdev); | 1297 | md->frozen_sb = freeze_bdev(md->suspended_bdev); |
1298 | if (IS_ERR(md->frozen_sb)) { | 1298 | if (IS_ERR(md->frozen_sb)) { |
1299 | r = PTR_ERR(md->frozen_sb); | 1299 | r = PTR_ERR(md->frozen_sb); |
1300 | md->frozen_sb = NULL; | 1300 | md->frozen_sb = NULL; |
1301 | return r; | 1301 | return r; |
1302 | } | 1302 | } |
1303 | 1303 | ||
1304 | set_bit(DMF_FROZEN, &md->flags); | 1304 | set_bit(DMF_FROZEN, &md->flags); |
1305 | 1305 | ||
1306 | /* don't bdput right now, we don't want the bdev | 1306 | /* don't bdput right now, we don't want the bdev |
1307 | * to go away while it is locked. | 1307 | * to go away while it is locked. |
1308 | */ | 1308 | */ |
1309 | return 0; | 1309 | return 0; |
1310 | } | 1310 | } |
1311 | 1311 | ||
1312 | static void unlock_fs(struct mapped_device *md) | 1312 | static void unlock_fs(struct mapped_device *md) |
1313 | { | 1313 | { |
1314 | if (!test_bit(DMF_FROZEN, &md->flags)) | 1314 | if (!test_bit(DMF_FROZEN, &md->flags)) |
1315 | return; | 1315 | return; |
1316 | 1316 | ||
1317 | thaw_bdev(md->suspended_bdev, md->frozen_sb); | 1317 | thaw_bdev(md->suspended_bdev, md->frozen_sb); |
1318 | md->frozen_sb = NULL; | 1318 | md->frozen_sb = NULL; |
1319 | clear_bit(DMF_FROZEN, &md->flags); | 1319 | clear_bit(DMF_FROZEN, &md->flags); |
1320 | } | 1320 | } |
1321 | 1321 | ||
1322 | /* | 1322 | /* |
1323 | * We need to be able to change a mapping table under a mounted | 1323 | * We need to be able to change a mapping table under a mounted |
1324 | * filesystem. For example we might want to move some data in | 1324 | * filesystem. For example we might want to move some data in |
1325 | * the background. Before the table can be swapped with | 1325 | * the background. Before the table can be swapped with |
1326 | * dm_bind_table, dm_suspend must be called to flush any in | 1326 | * dm_bind_table, dm_suspend must be called to flush any in |
1327 | * flight bios and ensure that any further io gets deferred. | 1327 | * flight bios and ensure that any further io gets deferred. |
1328 | */ | 1328 | */ |
1329 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) | 1329 | int dm_suspend(struct mapped_device *md, unsigned suspend_flags) |
1330 | { | 1330 | { |
1331 | struct dm_table *map = NULL; | 1331 | struct dm_table *map = NULL; |
1332 | unsigned long flags; | 1332 | unsigned long flags; |
1333 | DECLARE_WAITQUEUE(wait, current); | 1333 | DECLARE_WAITQUEUE(wait, current); |
1334 | struct bio *def; | 1334 | struct bio *def; |
1335 | int r = -EINVAL; | 1335 | int r = -EINVAL; |
1336 | int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; | 1336 | int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; |
1337 | int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; | 1337 | int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; |
1338 | 1338 | ||
1339 | down(&md->suspend_lock); | 1339 | down(&md->suspend_lock); |
1340 | 1340 | ||
1341 | if (dm_suspended(md)) | 1341 | if (dm_suspended(md)) |
1342 | goto out_unlock; | 1342 | goto out_unlock; |
1343 | 1343 | ||
1344 | map = dm_get_table(md); | 1344 | map = dm_get_table(md); |
1345 | 1345 | ||
1346 | /* | 1346 | /* |
1347 | * DMF_NOFLUSH_SUSPENDING must be set before presuspend. | 1347 | * DMF_NOFLUSH_SUSPENDING must be set before presuspend. |
1348 | * This flag is cleared before dm_suspend returns. | 1348 | * This flag is cleared before dm_suspend returns. |
1349 | */ | 1349 | */ |
1350 | if (noflush) | 1350 | if (noflush) |
1351 | set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); | 1351 | set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); |
1352 | 1352 | ||
1353 | /* This does not get reverted if there's an error later. */ | 1353 | /* This does not get reverted if there's an error later. */ |
1354 | dm_table_presuspend_targets(map); | 1354 | dm_table_presuspend_targets(map); |
1355 | 1355 | ||
1356 | /* bdget() can stall if the pending I/Os are not flushed */ | 1356 | /* bdget() can stall if the pending I/Os are not flushed */ |
1357 | if (!noflush) { | 1357 | if (!noflush) { |
1358 | md->suspended_bdev = bdget_disk(md->disk, 0); | 1358 | md->suspended_bdev = bdget_disk(md->disk, 0); |
1359 | if (!md->suspended_bdev) { | 1359 | if (!md->suspended_bdev) { |
1360 | DMWARN("bdget failed in dm_suspend"); | 1360 | DMWARN("bdget failed in dm_suspend"); |
1361 | r = -ENOMEM; | 1361 | r = -ENOMEM; |
1362 | goto flush_and_out; | 1362 | goto flush_and_out; |
1363 | } | 1363 | } |
1364 | } | 1364 | } |
1365 | 1365 | ||
1366 | /* | 1366 | /* |
1367 | * Flush I/O to the device. | 1367 | * Flush I/O to the device. |
1368 | * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os. | 1368 | * noflush supersedes do_lockfs, because lock_fs() needs to flush I/Os. |
1369 | */ | 1369 | */ |
1370 | if (do_lockfs && !noflush) { | 1370 | if (do_lockfs && !noflush) { |
1371 | r = lock_fs(md); | 1371 | r = lock_fs(md); |
1372 | if (r) | 1372 | if (r) |
1373 | goto out; | 1373 | goto out; |
1374 | } | 1374 | } |
1375 | 1375 | ||
1376 | /* | 1376 | /* |
1377 | * First we set the BLOCK_IO flag so no more ios will be mapped. | 1377 | * First we set the BLOCK_IO flag so no more ios will be mapped. |
1378 | */ | 1378 | */ |
1379 | down_write(&md->io_lock); | 1379 | down_write(&md->io_lock); |
1380 | set_bit(DMF_BLOCK_IO, &md->flags); | 1380 | set_bit(DMF_BLOCK_IO, &md->flags); |
1381 | 1381 | ||
1382 | add_wait_queue(&md->wait, &wait); | 1382 | add_wait_queue(&md->wait, &wait); |
1383 | up_write(&md->io_lock); | 1383 | up_write(&md->io_lock); |
1384 | 1384 | ||
1385 | /* unplug */ | 1385 | /* unplug */ |
1386 | if (map) | 1386 | if (map) |
1387 | dm_table_unplug_all(map); | 1387 | dm_table_unplug_all(map); |
1388 | 1388 | ||
1389 | /* | 1389 | /* |
1390 | * Then we wait for the already mapped ios to | 1390 | * Then we wait for the already mapped ios to |
1391 | * complete. | 1391 | * complete. |
1392 | */ | 1392 | */ |
1393 | while (1) { | 1393 | while (1) { |
1394 | set_current_state(TASK_INTERRUPTIBLE); | 1394 | set_current_state(TASK_INTERRUPTIBLE); |
1395 | 1395 | ||
1396 | if (!atomic_read(&md->pending) || signal_pending(current)) | 1396 | if (!atomic_read(&md->pending) || signal_pending(current)) |
1397 | break; | 1397 | break; |
1398 | 1398 | ||
1399 | io_schedule(); | 1399 | io_schedule(); |
1400 | } | 1400 | } |
1401 | set_current_state(TASK_RUNNING); | 1401 | set_current_state(TASK_RUNNING); |
1402 | 1402 | ||
1403 | down_write(&md->io_lock); | 1403 | down_write(&md->io_lock); |
1404 | remove_wait_queue(&md->wait, &wait); | 1404 | remove_wait_queue(&md->wait, &wait); |
1405 | 1405 | ||
1406 | if (noflush) { | 1406 | if (noflush) { |
1407 | spin_lock_irqsave(&md->pushback_lock, flags); | 1407 | spin_lock_irqsave(&md->pushback_lock, flags); |
1408 | clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); | 1408 | clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); |
1409 | bio_list_merge_head(&md->deferred, &md->pushback); | 1409 | bio_list_merge_head(&md->deferred, &md->pushback); |
1410 | bio_list_init(&md->pushback); | 1410 | bio_list_init(&md->pushback); |
1411 | spin_unlock_irqrestore(&md->pushback_lock, flags); | 1411 | spin_unlock_irqrestore(&md->pushback_lock, flags); |
1412 | } | 1412 | } |
1413 | 1413 | ||
1414 | /* were we interrupted ? */ | 1414 | /* were we interrupted ? */ |
1415 | r = -EINTR; | 1415 | r = -EINTR; |
1416 | if (atomic_read(&md->pending)) { | 1416 | if (atomic_read(&md->pending)) { |
1417 | clear_bit(DMF_BLOCK_IO, &md->flags); | 1417 | clear_bit(DMF_BLOCK_IO, &md->flags); |
1418 | def = bio_list_get(&md->deferred); | 1418 | def = bio_list_get(&md->deferred); |
1419 | __flush_deferred_io(md, def); | 1419 | __flush_deferred_io(md, def); |
1420 | up_write(&md->io_lock); | 1420 | up_write(&md->io_lock); |
1421 | unlock_fs(md); | 1421 | unlock_fs(md); |
1422 | goto out; /* pushback list is already flushed, so skip flush */ | 1422 | goto out; /* pushback list is already flushed, so skip flush */ |
1423 | } | 1423 | } |
1424 | up_write(&md->io_lock); | 1424 | up_write(&md->io_lock); |
1425 | 1425 | ||
1426 | dm_table_postsuspend_targets(map); | 1426 | dm_table_postsuspend_targets(map); |
1427 | 1427 | ||
1428 | set_bit(DMF_SUSPENDED, &md->flags); | 1428 | set_bit(DMF_SUSPENDED, &md->flags); |
1429 | 1429 | ||
1430 | r = 0; | 1430 | r = 0; |
1431 | 1431 | ||
1432 | flush_and_out: | 1432 | flush_and_out: |
1433 | if (r && noflush) { | 1433 | if (r && noflush) { |
1434 | /* | 1434 | /* |
1435 | * Because there may already be I/Os in the pushback list, | 1435 | * Because there may already be I/Os in the pushback list, |
1436 | * flush them before returning. | 1436 | * flush them before returning. |
1437 | */ | 1437 | */ |
1438 | down_write(&md->io_lock); | 1438 | down_write(&md->io_lock); |
1439 | 1439 | ||
1440 | spin_lock_irqsave(&md->pushback_lock, flags); | 1440 | spin_lock_irqsave(&md->pushback_lock, flags); |
1441 | clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); | 1441 | clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); |
1442 | bio_list_merge_head(&md->deferred, &md->pushback); | 1442 | bio_list_merge_head(&md->deferred, &md->pushback); |
1443 | bio_list_init(&md->pushback); | 1443 | bio_list_init(&md->pushback); |
1444 | spin_unlock_irqrestore(&md->pushback_lock, flags); | 1444 | spin_unlock_irqrestore(&md->pushback_lock, flags); |
1445 | 1445 | ||
1446 | def = bio_list_get(&md->deferred); | 1446 | def = bio_list_get(&md->deferred); |
1447 | __flush_deferred_io(md, def); | 1447 | __flush_deferred_io(md, def); |
1448 | up_write(&md->io_lock); | 1448 | up_write(&md->io_lock); |
1449 | } | 1449 | } |
1450 | 1450 | ||
1451 | out: | 1451 | out: |
1452 | if (r && md->suspended_bdev) { | 1452 | if (r && md->suspended_bdev) { |
1453 | bdput(md->suspended_bdev); | 1453 | bdput(md->suspended_bdev); |
1454 | md->suspended_bdev = NULL; | 1454 | md->suspended_bdev = NULL; |
1455 | } | 1455 | } |
1456 | 1456 | ||
1457 | dm_table_put(map); | 1457 | dm_table_put(map); |
1458 | 1458 | ||
1459 | out_unlock: | 1459 | out_unlock: |
1460 | up(&md->suspend_lock); | 1460 | up(&md->suspend_lock); |
1461 | return r; | 1461 | return r; |
1462 | } | 1462 | } |
1463 | 1463 | ||
1464 | int dm_resume(struct mapped_device *md) | 1464 | int dm_resume(struct mapped_device *md) |
1465 | { | 1465 | { |
1466 | int r = -EINVAL; | 1466 | int r = -EINVAL; |
1467 | struct bio *def; | 1467 | struct bio *def; |
1468 | struct dm_table *map = NULL; | 1468 | struct dm_table *map = NULL; |
1469 | 1469 | ||
1470 | down(&md->suspend_lock); | 1470 | down(&md->suspend_lock); |
1471 | if (!dm_suspended(md)) | 1471 | if (!dm_suspended(md)) |
1472 | goto out; | 1472 | goto out; |
1473 | 1473 | ||
1474 | map = dm_get_table(md); | 1474 | map = dm_get_table(md); |
1475 | if (!map || !dm_table_get_size(map)) | 1475 | if (!map || !dm_table_get_size(map)) |
1476 | goto out; | 1476 | goto out; |
1477 | 1477 | ||
1478 | r = dm_table_resume_targets(map); | 1478 | r = dm_table_resume_targets(map); |
1479 | if (r) | 1479 | if (r) |
1480 | goto out; | 1480 | goto out; |
1481 | 1481 | ||
1482 | down_write(&md->io_lock); | 1482 | down_write(&md->io_lock); |
1483 | clear_bit(DMF_BLOCK_IO, &md->flags); | 1483 | clear_bit(DMF_BLOCK_IO, &md->flags); |
1484 | 1484 | ||
1485 | def = bio_list_get(&md->deferred); | 1485 | def = bio_list_get(&md->deferred); |
1486 | __flush_deferred_io(md, def); | 1486 | __flush_deferred_io(md, def); |
1487 | up_write(&md->io_lock); | 1487 | up_write(&md->io_lock); |
1488 | 1488 | ||
1489 | unlock_fs(md); | 1489 | unlock_fs(md); |
1490 | 1490 | ||
1491 | if (md->suspended_bdev) { | 1491 | if (md->suspended_bdev) { |
1492 | bdput(md->suspended_bdev); | 1492 | bdput(md->suspended_bdev); |
1493 | md->suspended_bdev = NULL; | 1493 | md->suspended_bdev = NULL; |
1494 | } | 1494 | } |
1495 | 1495 | ||
1496 | clear_bit(DMF_SUSPENDED, &md->flags); | 1496 | clear_bit(DMF_SUSPENDED, &md->flags); |
1497 | 1497 | ||
1498 | dm_table_unplug_all(map); | 1498 | dm_table_unplug_all(map); |
1499 | 1499 | ||
1500 | kobject_uevent(&md->disk->kobj, KOBJ_CHANGE); | 1500 | kobject_uevent(&md->disk->kobj, KOBJ_CHANGE); |
1501 | 1501 | ||
1502 | r = 0; | 1502 | r = 0; |
1503 | 1503 | ||
1504 | out: | 1504 | out: |
1505 | dm_table_put(map); | 1505 | dm_table_put(map); |
1506 | up(&md->suspend_lock); | 1506 | up(&md->suspend_lock); |
1507 | 1507 | ||
1508 | return r; | 1508 | return r; |
1509 | } | 1509 | } |
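The comment above dm_suspend() lays out the table-swap protocol: suspend to flush in-flight bios and defer new ones, swap the table, then resume. A minimal sketch of that sequence as the DM ioctl layer might drive it, using only the entry points shown in this file; "new_table" is a hypothetical, already-constructed dm_table and is not part of this patch:

static int example_replace_table(dev_t dev, struct dm_table *new_table)
{
	struct mapped_device *md = dm_get_md(dev);	/* takes a reference */
	int r;

	if (!md)
		return -ENXIO;

	/* flush in-flight bios and defer any further io */
	r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
	if (r)
		goto out;

	/* the device is now suspended, so the swap is permitted */
	r = dm_swap_table(md, new_table);
	if (!r)
		r = dm_resume(md);
out:
	dm_put(md);
	return r;
}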
1510 | 1510 | ||
1511 | /*----------------------------------------------------------------- | 1511 | /*----------------------------------------------------------------- |
1512 | * Event notification. | 1512 | * Event notification. |
1513 | *---------------------------------------------------------------*/ | 1513 | *---------------------------------------------------------------*/ |
1514 | uint32_t dm_get_event_nr(struct mapped_device *md) | 1514 | uint32_t dm_get_event_nr(struct mapped_device *md) |
1515 | { | 1515 | { |
1516 | return atomic_read(&md->event_nr); | 1516 | return atomic_read(&md->event_nr); |
1517 | } | 1517 | } |
1518 | 1518 | ||
1519 | int dm_wait_event(struct mapped_device *md, int event_nr) | 1519 | int dm_wait_event(struct mapped_device *md, int event_nr) |
1520 | { | 1520 | { |
1521 | return wait_event_interruptible(md->eventq, | 1521 | return wait_event_interruptible(md->eventq, |
1522 | (event_nr != atomic_read(&md->event_nr))); | 1522 | (event_nr != atomic_read(&md->event_nr))); |
1523 | } | 1523 | } |
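dm_get_event_nr() and dm_wait_event() are meant to pair up: sample the counter, inspect whatever state the caller cares about, then sleep until the counter changes. A small hedged sketch of that pattern (the surrounding caller and its state check are assumed, not taken from this patch):

static int example_wait_for_event(struct mapped_device *md)
{
	uint32_t seen = dm_get_event_nr(md);

	/* ... inspect current state here ... */

	/* returns 0 once the counter no longer equals 'seen', -ERESTARTSYS on a signal */
	return dm_wait_event(md, seen);
}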
1524 | 1524 | ||
1525 | /* | 1525 | /* |
1526 | * The gendisk is only valid as long as you have a reference | 1526 | * The gendisk is only valid as long as you have a reference |
1527 | * count on 'md'. | 1527 | * count on 'md'. |
1528 | */ | 1528 | */ |
1529 | struct gendisk *dm_disk(struct mapped_device *md) | 1529 | struct gendisk *dm_disk(struct mapped_device *md) |
1530 | { | 1530 | { |
1531 | return md->disk; | 1531 | return md->disk; |
1532 | } | 1532 | } |
1533 | 1533 | ||
1534 | int dm_suspended(struct mapped_device *md) | 1534 | int dm_suspended(struct mapped_device *md) |
1535 | { | 1535 | { |
1536 | return test_bit(DMF_SUSPENDED, &md->flags); | 1536 | return test_bit(DMF_SUSPENDED, &md->flags); |
1537 | } | 1537 | } |
1538 | 1538 | ||
1539 | int dm_noflush_suspending(struct dm_target *ti) | 1539 | int dm_noflush_suspending(struct dm_target *ti) |
1540 | { | 1540 | { |
1541 | struct mapped_device *md = dm_table_get_md(ti->table); | 1541 | struct mapped_device *md = dm_table_get_md(ti->table); |
1542 | int r = __noflush_suspending(md); | 1542 | int r = __noflush_suspending(md); |
1543 | 1543 | ||
1544 | dm_put(md); | 1544 | dm_put(md); |
1545 | 1545 | ||
1546 | return r; | 1546 | return r; |
1547 | } | 1547 | } |
1548 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); | 1548 | EXPORT_SYMBOL_GPL(dm_noflush_suspending); |
1549 | 1549 | ||
1550 | static struct block_device_operations dm_blk_dops = { | 1550 | static struct block_device_operations dm_blk_dops = { |
1551 | .open = dm_blk_open, | 1551 | .open = dm_blk_open, |
1552 | .release = dm_blk_close, | 1552 | .release = dm_blk_close, |
1553 | .ioctl = dm_blk_ioctl, | 1553 | .ioctl = dm_blk_ioctl, |
1554 | .getgeo = dm_blk_getgeo, | 1554 | .getgeo = dm_blk_getgeo, |
1555 | .owner = THIS_MODULE | 1555 | .owner = THIS_MODULE |
1556 | }; | 1556 | }; |
1557 | 1557 | ||
1558 | EXPORT_SYMBOL(dm_get_mapinfo); | 1558 | EXPORT_SYMBOL(dm_get_mapinfo); |
1559 | 1559 | ||
1560 | /* | 1560 | /* |
1561 | * module hooks | 1561 | * module hooks |
1562 | */ | 1562 | */ |
1563 | module_init(dm_init); | 1563 | module_init(dm_init); |
1564 | module_exit(dm_exit); | 1564 | module_exit(dm_exit); |
1565 | 1565 | ||
1566 | module_param(major, uint, 0); | 1566 | module_param(major, uint, 0); |
1567 | MODULE_PARM_DESC(major, "The major number of the device mapper"); | 1567 | MODULE_PARM_DESC(major, "The major number of the device mapper"); |
1568 | MODULE_DESCRIPTION(DM_NAME " driver"); | 1568 | MODULE_DESCRIPTION(DM_NAME " driver"); |
1569 | MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); | 1569 | MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); |
1570 | MODULE_LICENSE("GPL"); | 1570 | MODULE_LICENSE("GPL"); |
1571 | 1571 |
include/linux/blktrace_api.h
1 | #ifndef BLKTRACE_H | 1 | #ifndef BLKTRACE_H |
2 | #define BLKTRACE_H | 2 | #define BLKTRACE_H |
3 | 3 | ||
4 | #include <linux/blkdev.h> | 4 | #include <linux/blkdev.h> |
5 | #include <linux/relay.h> | 5 | #include <linux/relay.h> |
6 | 6 | ||
7 | /* | 7 | /* |
8 | * Trace categories | 8 | * Trace categories |
9 | */ | 9 | */ |
10 | enum blktrace_cat { | 10 | enum blktrace_cat { |
11 | BLK_TC_READ = 1 << 0, /* reads */ | 11 | BLK_TC_READ = 1 << 0, /* reads */ |
12 | BLK_TC_WRITE = 1 << 1, /* writes */ | 12 | BLK_TC_WRITE = 1 << 1, /* writes */ |
13 | BLK_TC_BARRIER = 1 << 2, /* barrier */ | 13 | BLK_TC_BARRIER = 1 << 2, /* barrier */ |
14 | BLK_TC_SYNC = 1 << 3, /* sync IO */ | 14 | BLK_TC_SYNC = 1 << 3, /* sync IO */ |
15 | BLK_TC_QUEUE = 1 << 4, /* queueing/merging */ | 15 | BLK_TC_QUEUE = 1 << 4, /* queueing/merging */ |
16 | BLK_TC_REQUEUE = 1 << 5, /* requeueing */ | 16 | BLK_TC_REQUEUE = 1 << 5, /* requeueing */ |
17 | BLK_TC_ISSUE = 1 << 6, /* issue */ | 17 | BLK_TC_ISSUE = 1 << 6, /* issue */ |
18 | BLK_TC_COMPLETE = 1 << 7, /* completions */ | 18 | BLK_TC_COMPLETE = 1 << 7, /* completions */ |
19 | BLK_TC_FS = 1 << 8, /* fs requests */ | 19 | BLK_TC_FS = 1 << 8, /* fs requests */ |
20 | BLK_TC_PC = 1 << 9, /* pc requests */ | 20 | BLK_TC_PC = 1 << 9, /* pc requests */ |
21 | BLK_TC_NOTIFY = 1 << 10, /* special message */ | 21 | BLK_TC_NOTIFY = 1 << 10, /* special message */ |
22 | BLK_TC_AHEAD = 1 << 11, /* readahead */ | 22 | BLK_TC_AHEAD = 1 << 11, /* readahead */ |
23 | BLK_TC_META = 1 << 12, /* metadata */ | 23 | BLK_TC_META = 1 << 12, /* metadata */ |
24 | 24 | ||
25 | BLK_TC_END = 1 << 15, /* only 16-bits, reminder */ | 25 | BLK_TC_END = 1 << 15, /* only 16-bits, reminder */ |
26 | }; | 26 | }; |
27 | 27 | ||
28 | #define BLK_TC_SHIFT (16) | 28 | #define BLK_TC_SHIFT (16) |
29 | #define BLK_TC_ACT(act) ((act) << BLK_TC_SHIFT) | 29 | #define BLK_TC_ACT(act) ((act) << BLK_TC_SHIFT) |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * Basic trace actions | 32 | * Basic trace actions |
33 | */ | 33 | */ |
34 | enum blktrace_act { | 34 | enum blktrace_act { |
35 | __BLK_TA_QUEUE = 1, /* queued */ | 35 | __BLK_TA_QUEUE = 1, /* queued */ |
36 | __BLK_TA_BACKMERGE, /* back merged to existing rq */ | 36 | __BLK_TA_BACKMERGE, /* back merged to existing rq */ |
37 | __BLK_TA_FRONTMERGE, /* front merge to existing rq */ | 37 | __BLK_TA_FRONTMERGE, /* front merge to existing rq */ |
38 | __BLK_TA_GETRQ, /* allocated new request */ | 38 | __BLK_TA_GETRQ, /* allocated new request */ |
39 | __BLK_TA_SLEEPRQ, /* sleeping on rq allocation */ | 39 | __BLK_TA_SLEEPRQ, /* sleeping on rq allocation */ |
40 | __BLK_TA_REQUEUE, /* request requeued */ | 40 | __BLK_TA_REQUEUE, /* request requeued */ |
41 | __BLK_TA_ISSUE, /* sent to driver */ | 41 | __BLK_TA_ISSUE, /* sent to driver */ |
42 | __BLK_TA_COMPLETE, /* completed by driver */ | 42 | __BLK_TA_COMPLETE, /* completed by driver */ |
43 | __BLK_TA_PLUG, /* queue was plugged */ | 43 | __BLK_TA_PLUG, /* queue was plugged */ |
44 | __BLK_TA_UNPLUG_IO, /* queue was unplugged by io */ | 44 | __BLK_TA_UNPLUG_IO, /* queue was unplugged by io */ |
45 | __BLK_TA_UNPLUG_TIMER, /* queue was unplugged by timer */ | 45 | __BLK_TA_UNPLUG_TIMER, /* queue was unplugged by timer */ |
46 | __BLK_TA_INSERT, /* insert request */ | 46 | __BLK_TA_INSERT, /* insert request */ |
47 | __BLK_TA_SPLIT, /* bio was split */ | 47 | __BLK_TA_SPLIT, /* bio was split */ |
48 | __BLK_TA_BOUNCE, /* bio was bounced */ | 48 | __BLK_TA_BOUNCE, /* bio was bounced */ |
49 | __BLK_TA_REMAP, /* bio was remapped */ | 49 | __BLK_TA_REMAP, /* bio was remapped */ |
50 | }; | 50 | }; |
51 | 51 | ||
52 | /* | 52 | /* |
53 | * Notify events. | 53 | * Notify events. |
54 | */ | 54 | */ |
55 | enum blktrace_notify { | 55 | enum blktrace_notify { |
56 | __BLK_TN_PROCESS = 0, /* establish pid/name mapping */ | 56 | __BLK_TN_PROCESS = 0, /* establish pid/name mapping */ |
57 | __BLK_TN_TIMESTAMP, /* include system clock */ | 57 | __BLK_TN_TIMESTAMP, /* include system clock */ |
58 | }; | 58 | }; |
59 | 59 | ||
60 | 60 | ||
61 | /* | 61 | /* |
62 | * Trace actions in full. Additionally, read or write is masked | 62 | * Trace actions in full. Additionally, read or write is masked |
63 | */ | 63 | */ |
64 | #define BLK_TA_QUEUE (__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE)) | 64 | #define BLK_TA_QUEUE (__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE)) |
65 | #define BLK_TA_BACKMERGE (__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE)) | 65 | #define BLK_TA_BACKMERGE (__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE)) |
66 | #define BLK_TA_FRONTMERGE (__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE)) | 66 | #define BLK_TA_FRONTMERGE (__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE)) |
67 | #define BLK_TA_GETRQ (__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE)) | 67 | #define BLK_TA_GETRQ (__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE)) |
68 | #define BLK_TA_SLEEPRQ (__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE)) | 68 | #define BLK_TA_SLEEPRQ (__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE)) |
69 | #define BLK_TA_REQUEUE (__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE)) | 69 | #define BLK_TA_REQUEUE (__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE)) |
70 | #define BLK_TA_ISSUE (__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE)) | 70 | #define BLK_TA_ISSUE (__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE)) |
71 | #define BLK_TA_COMPLETE (__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE)) | 71 | #define BLK_TA_COMPLETE (__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE)) |
72 | #define BLK_TA_PLUG (__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE)) | 72 | #define BLK_TA_PLUG (__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE)) |
73 | #define BLK_TA_UNPLUG_IO (__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE)) | 73 | #define BLK_TA_UNPLUG_IO (__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE)) |
74 | #define BLK_TA_UNPLUG_TIMER (__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE)) | 74 | #define BLK_TA_UNPLUG_TIMER (__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE)) |
75 | #define BLK_TA_INSERT (__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE)) | 75 | #define BLK_TA_INSERT (__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE)) |
76 | #define BLK_TA_SPLIT (__BLK_TA_SPLIT) | 76 | #define BLK_TA_SPLIT (__BLK_TA_SPLIT) |
77 | #define BLK_TA_BOUNCE (__BLK_TA_BOUNCE) | 77 | #define BLK_TA_BOUNCE (__BLK_TA_BOUNCE) |
78 | #define BLK_TA_REMAP (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE)) | 78 | #define BLK_TA_REMAP (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE)) |
79 | 79 | ||
80 | #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) | 80 | #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) |
81 | #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) | 81 | #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) |
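Per the comment above, each full action word combines the low-order action code with a category bit shifted up by BLK_TC_SHIFT. Worked out for the remap action this patch is concerned with:

/*
 * BLK_TC_QUEUE             = 1 << 4             = 0x00000010
 * BLK_TC_ACT(BLK_TC_QUEUE) = 0x00000010 << 16   = 0x00100000
 * __BLK_TA_REMAP           = 15                 = 0x0000000f
 * BLK_TA_REMAP             = 0x00100000 | 0x0f  = 0x0010000f
 *
 * A consumer recovers the action code with (action & ((1 << BLK_TC_SHIFT) - 1))
 * and the category bits with (action >> BLK_TC_SHIFT).
 */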
82 | 82 | ||
83 | #define BLK_IO_TRACE_MAGIC 0x65617400 | 83 | #define BLK_IO_TRACE_MAGIC 0x65617400 |
84 | #define BLK_IO_TRACE_VERSION 0x07 | 84 | #define BLK_IO_TRACE_VERSION 0x07 |
85 | 85 | ||
86 | /* | 86 | /* |
87 | * The trace itself | 87 | * The trace itself |
88 | */ | 88 | */ |
89 | struct blk_io_trace { | 89 | struct blk_io_trace { |
90 | u32 magic; /* MAGIC << 8 | version */ | 90 | u32 magic; /* MAGIC << 8 | version */ |
91 | u32 sequence; /* event number */ | 91 | u32 sequence; /* event number */ |
92 | u64 time; /* in microseconds */ | 92 | u64 time; /* in microseconds */ |
93 | u64 sector; /* disk offset */ | 93 | u64 sector; /* disk offset */ |
94 | u32 bytes; /* transfer length */ | 94 | u32 bytes; /* transfer length */ |
95 | u32 action; /* what happened */ | 95 | u32 action; /* what happened */ |
96 | u32 pid; /* who did it */ | 96 | u32 pid; /* who did it */ |
97 | u32 device; /* device number */ | 97 | u32 device; /* device number */ |
98 | u32 cpu; /* on what cpu did it happen */ | 98 | u32 cpu; /* on what cpu did it happen */ |
99 | u16 error; /* completion error */ | 99 | u16 error; /* completion error */ |
100 | u16 pdu_len; /* length of data after this trace */ | 100 | u16 pdu_len; /* length of data after this trace */ |
101 | }; | 101 | }; |
102 | 102 | ||
103 | /* | 103 | /* |
104 | * The remap event | 104 | * The remap event |
105 | */ | 105 | */ |
106 | struct blk_io_trace_remap { | 106 | struct blk_io_trace_remap { |
107 | __be32 device; | 107 | __be32 device; |
108 | u32 __pad; | 108 | __be32 device_from; |
109 | __be64 sector; | 109 | __be64 sector; |
110 | }; | 110 | }; |
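With this patch the remap payload carries both ends of the mapping: device_from records the device the bio was mapped from, while device and sector describe the mapped-to side (per the blk_add_trace_remap() kerneldoc below), and the trace's own sector field holds the mapped-from sector. A hedged decode sketch, written with the kernel's be32_to_cpu()/be64_to_cpu() helpers for brevity; a real consumer such as blkparse or btt does its own byte swapping:

static void example_show_remap(const struct blk_io_trace *t,
			       const struct blk_io_trace_remap *r)
{
	u32 dev_from = be32_to_cpu(r->device_from);	/* mapped-from device */
	u32 dev_to   = be32_to_cpu(r->device);		/* mapped-to device */
	u64 sec_to   = be64_to_cpu(r->sector);		/* mapped-to sector */

	/* t->sector holds the sector passed as 'from' to blk_add_trace_remap() */
	printk(KERN_DEBUG "remap: dev %u -> %u, sector %llu -> %llu\n",
	       dev_from, dev_to,
	       (unsigned long long)t->sector,
	       (unsigned long long)sec_to);
}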
111 | 111 | ||
112 | enum { | 112 | enum { |
113 | Blktrace_setup = 1, | 113 | Blktrace_setup = 1, |
114 | Blktrace_running, | 114 | Blktrace_running, |
115 | Blktrace_stopped, | 115 | Blktrace_stopped, |
116 | }; | 116 | }; |
117 | 117 | ||
118 | struct blk_trace { | 118 | struct blk_trace { |
119 | int trace_state; | 119 | int trace_state; |
120 | struct rchan *rchan; | 120 | struct rchan *rchan; |
121 | unsigned long *sequence; | 121 | unsigned long *sequence; |
122 | u16 act_mask; | 122 | u16 act_mask; |
123 | u64 start_lba; | 123 | u64 start_lba; |
124 | u64 end_lba; | 124 | u64 end_lba; |
125 | u32 pid; | 125 | u32 pid; |
126 | u32 dev; | 126 | u32 dev; |
127 | struct dentry *dir; | 127 | struct dentry *dir; |
128 | struct dentry *dropped_file; | 128 | struct dentry *dropped_file; |
129 | atomic_t dropped; | 129 | atomic_t dropped; |
130 | }; | 130 | }; |
131 | 131 | ||
132 | /* | 132 | /* |
133 | * User setup structure passed with BLKTRACESTART | 133 | * User setup structure passed with BLKTRACESTART |
134 | */ | 134 | */ |
135 | struct blk_user_trace_setup { | 135 | struct blk_user_trace_setup { |
136 | char name[BDEVNAME_SIZE]; /* output */ | 136 | char name[BDEVNAME_SIZE]; /* output */ |
137 | u16 act_mask; /* input */ | 137 | u16 act_mask; /* input */ |
138 | u32 buf_size; /* input */ | 138 | u32 buf_size; /* input */ |
139 | u32 buf_nr; /* input */ | 139 | u32 buf_nr; /* input */ |
140 | u64 start_lba; | 140 | u64 start_lba; |
141 | u64 end_lba; | 141 | u64 end_lba; |
142 | u32 pid; | 142 | u32 pid; |
143 | }; | 143 | }; |
144 | 144 | ||
145 | #if defined(CONFIG_BLK_DEV_IO_TRACE) | 145 | #if defined(CONFIG_BLK_DEV_IO_TRACE) |
146 | extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); | 146 | extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); |
147 | extern void blk_trace_shutdown(struct request_queue *); | 147 | extern void blk_trace_shutdown(struct request_queue *); |
148 | extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *); | 148 | extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *); |
149 | 149 | ||
150 | /** | 150 | /** |
151 | * blk_add_trace_rq - Add a trace for a request oriented action | 151 | * blk_add_trace_rq - Add a trace for a request oriented action |
152 | * @q: queue the io is for | 152 | * @q: queue the io is for |
153 | * @rq: the source request | 153 | * @rq: the source request |
154 | * @what: the action | 154 | * @what: the action |
155 | * | 155 | * |
156 | * Description: | 156 | * Description: |
157 | * Records an action against a request. Will log the bio offset + size. | 157 | * Records an action against a request. Will log the bio offset + size. |
158 | * | 158 | * |
159 | **/ | 159 | **/ |
160 | static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq, | 160 | static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq, |
161 | u32 what) | 161 | u32 what) |
162 | { | 162 | { |
163 | struct blk_trace *bt = q->blk_trace; | 163 | struct blk_trace *bt = q->blk_trace; |
164 | int rw = rq->cmd_flags & 0x03; | 164 | int rw = rq->cmd_flags & 0x03; |
165 | 165 | ||
166 | if (likely(!bt)) | 166 | if (likely(!bt)) |
167 | return; | 167 | return; |
168 | 168 | ||
169 | if (blk_pc_request(rq)) { | 169 | if (blk_pc_request(rq)) { |
170 | what |= BLK_TC_ACT(BLK_TC_PC); | 170 | what |= BLK_TC_ACT(BLK_TC_PC); |
171 | __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd); | 171 | __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd); |
172 | } else { | 172 | } else { |
173 | what |= BLK_TC_ACT(BLK_TC_FS); | 173 | what |= BLK_TC_ACT(BLK_TC_FS); |
174 | __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL); | 174 | __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL); |
175 | } | 175 | } |
176 | } | 176 | } |
177 | 177 | ||
178 | /** | 178 | /** |
179 | * blk_add_trace_bio - Add a trace for a bio oriented action | 179 | * blk_add_trace_bio - Add a trace for a bio oriented action |
180 | * @q: queue the io is for | 180 | * @q: queue the io is for |
181 | * @bio: the source bio | 181 | * @bio: the source bio |
182 | * @what: the action | 182 | * @what: the action |
183 | * | 183 | * |
184 | * Description: | 184 | * Description: |
185 | * Records an action against a bio. Will log the bio offset + size. | 185 | * Records an action against a bio. Will log the bio offset + size. |
186 | * | 186 | * |
187 | **/ | 187 | **/ |
188 | static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio, | 188 | static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio, |
189 | u32 what) | 189 | u32 what) |
190 | { | 190 | { |
191 | struct blk_trace *bt = q->blk_trace; | 191 | struct blk_trace *bt = q->blk_trace; |
192 | 192 | ||
193 | if (likely(!bt)) | 193 | if (likely(!bt)) |
194 | return; | 194 | return; |
195 | 195 | ||
196 | __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL); | 196 | __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL); |
197 | } | 197 | } |
198 | 198 | ||
199 | /** | 199 | /** |
200 | * blk_add_trace_generic - Add a trace for a generic action | 200 | * blk_add_trace_generic - Add a trace for a generic action |
201 | * @q: queue the io is for | 201 | * @q: queue the io is for |
202 | * @bio: the source bio | 202 | * @bio: the source bio |
203 | * @rw: the data direction | 203 | * @rw: the data direction |
204 | * @what: the action | 204 | * @what: the action |
205 | * | 205 | * |
206 | * Description: | 206 | * Description: |
207 | * Records a simple trace | 207 | * Records a simple trace |
208 | * | 208 | * |
209 | **/ | 209 | **/ |
210 | static inline void blk_add_trace_generic(struct request_queue *q, | 210 | static inline void blk_add_trace_generic(struct request_queue *q, |
211 | struct bio *bio, int rw, u32 what) | 211 | struct bio *bio, int rw, u32 what) |
212 | { | 212 | { |
213 | struct blk_trace *bt = q->blk_trace; | 213 | struct blk_trace *bt = q->blk_trace; |
214 | 214 | ||
215 | if (likely(!bt)) | 215 | if (likely(!bt)) |
216 | return; | 216 | return; |
217 | 217 | ||
218 | if (bio) | 218 | if (bio) |
219 | blk_add_trace_bio(q, bio, what); | 219 | blk_add_trace_bio(q, bio, what); |
220 | else | 220 | else |
221 | __blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL); | 221 | __blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL); |
222 | } | 222 | } |
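For completeness, a hedged example of the no-payload case described above: recording a queue-level event such as request allocation (BLK_TA_GETRQ) against a bio; the wrapper itself is illustrative, not part of this patch:

static inline void example_trace_getrq(struct request_queue *q,
				       struct bio *bio, int rw)
{
	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
}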
223 | 223 | ||
224 | /** | 224 | /** |
225 | * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload | 225 | * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload |
226 | * @q: queue the io is for | 226 | * @q: queue the io is for |
227 | * @what: the action | 227 | * @what: the action |
228 | * @bio: the source bio | 228 | * @bio: the source bio |
229 | * @pdu: the integer payload | 229 | * @pdu: the integer payload |
230 | * | 230 | * |
231 | * Description: | 231 | * Description: |
232 | * Adds a trace with some integer payload. This might be an unplug | 232 | * Adds a trace with some integer payload. This might be an unplug |
233 | * option given as the action, with the depth at unplug time given | 233 | * option given as the action, with the depth at unplug time given |
234 | * as the payload | 234 | * as the payload |
235 | * | 235 | * |
236 | **/ | 236 | **/ |
237 | static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what, | 237 | static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what, |
238 | struct bio *bio, unsigned int pdu) | 238 | struct bio *bio, unsigned int pdu) |
239 | { | 239 | { |
240 | struct blk_trace *bt = q->blk_trace; | 240 | struct blk_trace *bt = q->blk_trace; |
241 | __be64 rpdu = cpu_to_be64(pdu); | 241 | __be64 rpdu = cpu_to_be64(pdu); |
242 | 242 | ||
243 | if (likely(!bt)) | 243 | if (likely(!bt)) |
244 | return; | 244 | return; |
245 | 245 | ||
246 | if (bio) | 246 | if (bio) |
247 | __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu); | 247 | __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu); |
248 | else | 248 | else |
249 | __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); | 249 | __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); |
250 | } | 250 | } |
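The unplug case the comment mentions is the typical caller: the action is one of the unplug events and the integer payload is the queue depth at unplug time. A hedged sketch; how the caller measures depth is up to it:

static inline void example_trace_unplug(struct request_queue *q,
					unsigned int depth)
{
	/* no bio: the event stands on its own, depth rides in the pdu */
	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, depth);
}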
251 | 251 | ||
252 | /** | 252 | /** |
253 | * blk_add_trace_remap - Add a trace for a remap operation | 253 | * blk_add_trace_remap - Add a trace for a remap operation |
254 | * @q: queue the io is for | 254 | * @q: queue the io is for |
255 | * @bio: the source bio | 255 | * @bio: the source bio |
256 | * @dev: target device | 256 | * @dev: target device |
257 | * @from: source sector | 257 | * @from: source sector |
258 | * @to: target sector | 258 | * @to: target sector |
259 | * | 259 | * |
260 | * Description: | 260 | * Description: |
261 | * A device mapper or raid target sometimes needs to split a bio because | 261 | * A device mapper or raid target sometimes needs to split a bio because |
262 | * it spans a stripe (or similar). Add a trace for that action. | 262 | * it spans a stripe (or similar). Add a trace for that action. |
263 | * | 263 | * |
264 | **/ | 264 | **/ |
265 | static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio, | 265 | static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio, |
266 | dev_t dev, sector_t from, sector_t to) | 266 | dev_t dev, sector_t from, sector_t to) |
267 | { | 267 | { |
268 | struct blk_trace *bt = q->blk_trace; | 268 | struct blk_trace *bt = q->blk_trace; |
269 | struct blk_io_trace_remap r; | 269 | struct blk_io_trace_remap r; |
270 | 270 | ||
271 | if (likely(!bt)) | 271 | if (likely(!bt)) |
272 | return; | 272 | return; |
273 | 273 | ||
274 | r.device = cpu_to_be32(dev); | 274 | r.device = cpu_to_be32(dev); |
| | 275 | r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); |
275 | r.sector = cpu_to_be64(to); | 276 | r.sector = cpu_to_be64(to); |
276 | 277 | ||
277 | __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); | 278 | __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); |
278 | } | 279 | } |
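A hedged example of how a stacking driver might call the helper above when it redirects a bio: dev and to describe the target, from is the bio's current sector, and the bio's bi_bdev (assumed not yet updated at this point) supplies device_from inside blk_add_trace_remap(). target_dev and target_sector are hypothetical values produced by the driver's own mapping:

static inline void example_trace_remap(struct request_queue *q,
				       struct bio *bio,
				       dev_t target_dev,
				       sector_t target_sector)
{
	/* from = the bio's current (source) sector, to = sector on the target */
	blk_add_trace_remap(q, bio, target_dev, bio->bi_sector, target_sector);
}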
279 | 280 | ||
280 | #else /* !CONFIG_BLK_DEV_IO_TRACE */ | 281 | #else /* !CONFIG_BLK_DEV_IO_TRACE */ |
281 | #define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) | 282 | #define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) |
282 | #define blk_trace_shutdown(q) do { } while (0) | 283 | #define blk_trace_shutdown(q) do { } while (0) |
283 | #define blk_add_trace_rq(q, rq, what) do { } while (0) | 284 | #define blk_add_trace_rq(q, rq, what) do { } while (0) |
284 | #define blk_add_trace_bio(q, rq, what) do { } while (0) | 285 | #define blk_add_trace_bio(q, rq, what) do { } while (0) |
285 | #define blk_add_trace_generic(q, rq, rw, what) do { } while (0) | 286 | #define blk_add_trace_generic(q, rq, rw, what) do { } while (0) |
286 | #define blk_add_trace_pdu_int(q, what, bio, pdu) do { } while (0) | 287 | #define blk_add_trace_pdu_int(q, what, bio, pdu) do { } while (0) |
287 | #define blk_add_trace_remap(q, bio, dev, f, t) do {} while (0) | 288 | #define blk_add_trace_remap(q, bio, dev, f, t) do {} while (0) |
288 | #endif /* CONFIG_BLK_DEV_IO_TRACE */ | 289 | #endif /* CONFIG_BLK_DEV_IO_TRACE */ |
289 | 290 | ||
290 | #endif | 291 | #endif |
291 | 292 |