Commit bf2de6f5a4faf0197268f18d08969b003b87b6e8

Authored by Jens Axboe
1 parent c07e2b4129

block: Initial support for data-less (or empty) barrier support

This implements functionality to pass down or insert a barrier
in a queue, without having any data attached to it. The ->prepare_flush_fn()
infrastructure from data barriers is reused to provide this
functionality.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
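
For context, the point of a data-less barrier is to let a caller ask for ordering or a cache flush without writing any payload. Below is a minimal sketch of the caller side, using the era's existing blkdev_issue_flush() API; the conversion of such flushes to empty barriers is completed by follow-up patches in this series, not by this commit alone.

#include <linux/blkdev.h>

/*
 * Hedged sketch: ask the block layer to flush a device's write cache.
 * With empty-barrier support in place, a request like this no longer
 * needs a data payload attached to it.
 */
static int flush_device_cache(struct block_device *bdev)
{
	sector_t error_sector;
	int ret;

	ret = blkdev_issue_flush(bdev, &error_sector);
	if (ret == -EOPNOTSUPP)
		ret = 0;	/* queue advertises no ordering; nothing to flush */

	return ret;
}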

Showing 5 changed files with 71 additions and 21 deletions (inline diff: original line numbers and text on the left, updated on the right)

block/elevator.c

1 /* 1 /*
2 * Block device elevator/IO-scheduler. 2 * Block device elevator/IO-scheduler.
3 * 3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * 5 *
6 * 30042000 Jens Axboe <axboe@kernel.dk> : 6 * 30042000 Jens Axboe <axboe@kernel.dk> :
7 * 7 *
8 * Split the elevator a bit so that it is possible to choose a different 8 * Split the elevator a bit so that it is possible to choose a different
9 * one or even write a new "plug in". There are three pieces: 9 * one or even write a new "plug in". There are three pieces:
10 * - elevator_fn, inserts a new request in the queue list 10 * - elevator_fn, inserts a new request in the queue list
11 * - elevator_merge_fn, decides whether a new buffer can be merged with 11 * - elevator_merge_fn, decides whether a new buffer can be merged with
12 * an existing request 12 * an existing request
13 * - elevator_dequeue_fn, called when a request is taken off the active list 13 * - elevator_dequeue_fn, called when a request is taken off the active list
14 * 14 *
15 * 20082000 Dave Jones <davej@suse.de> : 15 * 20082000 Dave Jones <davej@suse.de> :
16 * Removed tests for max-bomb-segments, which was breaking elvtune 16 * Removed tests for max-bomb-segments, which was breaking elvtune
17 * when run without -bN 17 * when run without -bN
18 * 18 *
19 * Jens: 19 * Jens:
20 * - Rework again to work with bio instead of buffer_heads 20 * - Rework again to work with bio instead of buffer_heads
21 * - loose bi_dev comparisons, partition handling is right now 21 * - loose bi_dev comparisons, partition handling is right now
22 * - completely modularize elevator setup and teardown 22 * - completely modularize elevator setup and teardown
23 * 23 *
24 */ 24 */
25 #include <linux/kernel.h> 25 #include <linux/kernel.h>
26 #include <linux/fs.h> 26 #include <linux/fs.h>
27 #include <linux/blkdev.h> 27 #include <linux/blkdev.h>
28 #include <linux/elevator.h> 28 #include <linux/elevator.h>
29 #include <linux/bio.h> 29 #include <linux/bio.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/slab.h> 31 #include <linux/slab.h>
32 #include <linux/init.h> 32 #include <linux/init.h>
33 #include <linux/compiler.h> 33 #include <linux/compiler.h>
34 #include <linux/delay.h> 34 #include <linux/delay.h>
35 #include <linux/blktrace_api.h> 35 #include <linux/blktrace_api.h>
36 #include <linux/hash.h> 36 #include <linux/hash.h>
37 37
38 #include <asm/uaccess.h> 38 #include <asm/uaccess.h>
39 39
40 static DEFINE_SPINLOCK(elv_list_lock); 40 static DEFINE_SPINLOCK(elv_list_lock);
41 static LIST_HEAD(elv_list); 41 static LIST_HEAD(elv_list);
42 42
43 /* 43 /*
44 * Merge hash stuff. 44 * Merge hash stuff.
45 */ 45 */
46 static const int elv_hash_shift = 6; 46 static const int elv_hash_shift = 6;
47 #define ELV_HASH_BLOCK(sec) ((sec) >> 3) 47 #define ELV_HASH_BLOCK(sec) ((sec) >> 3)
48 #define ELV_HASH_FN(sec) (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift)) 48 #define ELV_HASH_FN(sec) (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift))
49 #define ELV_HASH_ENTRIES (1 << elv_hash_shift) 49 #define ELV_HASH_ENTRIES (1 << elv_hash_shift)
50 #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) 50 #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
51 #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) 51 #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
52 52
53 /* 53 /*
54 * Query io scheduler to see if the current process issuing bio may be 54 * Query io scheduler to see if the current process issuing bio may be
55 * merged with rq. 55 * merged with rq.
56 */ 56 */
57 static int elv_iosched_allow_merge(struct request *rq, struct bio *bio) 57 static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
58 { 58 {
59 struct request_queue *q = rq->q; 59 struct request_queue *q = rq->q;
60 elevator_t *e = q->elevator; 60 elevator_t *e = q->elevator;
61 61
62 if (e->ops->elevator_allow_merge_fn) 62 if (e->ops->elevator_allow_merge_fn)
63 return e->ops->elevator_allow_merge_fn(q, rq, bio); 63 return e->ops->elevator_allow_merge_fn(q, rq, bio);
64 64
65 return 1; 65 return 1;
66 } 66 }
67 67
68 /* 68 /*
69 * can we safely merge with this request? 69 * can we safely merge with this request?
70 */ 70 */
71 inline int elv_rq_merge_ok(struct request *rq, struct bio *bio) 71 inline int elv_rq_merge_ok(struct request *rq, struct bio *bio)
72 { 72 {
73 if (!rq_mergeable(rq)) 73 if (!rq_mergeable(rq))
74 return 0; 74 return 0;
75 75
76 /* 76 /*
77 * different data direction or already started, don't merge 77 * different data direction or already started, don't merge
78 */ 78 */
79 if (bio_data_dir(bio) != rq_data_dir(rq)) 79 if (bio_data_dir(bio) != rq_data_dir(rq))
80 return 0; 80 return 0;
81 81
82 /* 82 /*
83 * must be same device and not a special request 83 * must be same device and not a special request
84 */ 84 */
85 if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) 85 if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
86 return 0; 86 return 0;
87 87
88 if (!elv_iosched_allow_merge(rq, bio)) 88 if (!elv_iosched_allow_merge(rq, bio))
89 return 0; 89 return 0;
90 90
91 return 1; 91 return 1;
92 } 92 }
93 EXPORT_SYMBOL(elv_rq_merge_ok); 93 EXPORT_SYMBOL(elv_rq_merge_ok);
94 94
95 static inline int elv_try_merge(struct request *__rq, struct bio *bio) 95 static inline int elv_try_merge(struct request *__rq, struct bio *bio)
96 { 96 {
97 int ret = ELEVATOR_NO_MERGE; 97 int ret = ELEVATOR_NO_MERGE;
98 98
99 /* 99 /*
100 * we can merge and sequence is ok, check if it's possible 100 * we can merge and sequence is ok, check if it's possible
101 */ 101 */
102 if (elv_rq_merge_ok(__rq, bio)) { 102 if (elv_rq_merge_ok(__rq, bio)) {
103 if (__rq->sector + __rq->nr_sectors == bio->bi_sector) 103 if (__rq->sector + __rq->nr_sectors == bio->bi_sector)
104 ret = ELEVATOR_BACK_MERGE; 104 ret = ELEVATOR_BACK_MERGE;
105 else if (__rq->sector - bio_sectors(bio) == bio->bi_sector) 105 else if (__rq->sector - bio_sectors(bio) == bio->bi_sector)
106 ret = ELEVATOR_FRONT_MERGE; 106 ret = ELEVATOR_FRONT_MERGE;
107 } 107 }
108 108
109 return ret; 109 return ret;
110 } 110 }
111 111
112 static struct elevator_type *elevator_find(const char *name) 112 static struct elevator_type *elevator_find(const char *name)
113 { 113 {
114 struct elevator_type *e; 114 struct elevator_type *e;
115 115
116 list_for_each_entry(e, &elv_list, list) { 116 list_for_each_entry(e, &elv_list, list) {
117 if (!strcmp(e->elevator_name, name)) 117 if (!strcmp(e->elevator_name, name))
118 return e; 118 return e;
119 } 119 }
120 120
121 return NULL; 121 return NULL;
122 } 122 }
123 123
124 static void elevator_put(struct elevator_type *e) 124 static void elevator_put(struct elevator_type *e)
125 { 125 {
126 module_put(e->elevator_owner); 126 module_put(e->elevator_owner);
127 } 127 }
128 128
129 static struct elevator_type *elevator_get(const char *name) 129 static struct elevator_type *elevator_get(const char *name)
130 { 130 {
131 struct elevator_type *e; 131 struct elevator_type *e;
132 132
133 spin_lock(&elv_list_lock); 133 spin_lock(&elv_list_lock);
134 134
135 e = elevator_find(name); 135 e = elevator_find(name);
136 if (e && !try_module_get(e->elevator_owner)) 136 if (e && !try_module_get(e->elevator_owner))
137 e = NULL; 137 e = NULL;
138 138
139 spin_unlock(&elv_list_lock); 139 spin_unlock(&elv_list_lock);
140 140
141 return e; 141 return e;
142 } 142 }
143 143
144 static void *elevator_init_queue(struct request_queue *q, 144 static void *elevator_init_queue(struct request_queue *q,
145 struct elevator_queue *eq) 145 struct elevator_queue *eq)
146 { 146 {
147 return eq->ops->elevator_init_fn(q); 147 return eq->ops->elevator_init_fn(q);
148 } 148 }
149 149
150 static void elevator_attach(struct request_queue *q, struct elevator_queue *eq, 150 static void elevator_attach(struct request_queue *q, struct elevator_queue *eq,
151 void *data) 151 void *data)
152 { 152 {
153 q->elevator = eq; 153 q->elevator = eq;
154 eq->elevator_data = data; 154 eq->elevator_data = data;
155 } 155 }
156 156
157 static char chosen_elevator[16]; 157 static char chosen_elevator[16];
158 158
159 static int __init elevator_setup(char *str) 159 static int __init elevator_setup(char *str)
160 { 160 {
161 /* 161 /*
162 * Be backwards-compatible with previous kernels, so users 162 * Be backwards-compatible with previous kernels, so users
163 * won't get the wrong elevator. 163 * won't get the wrong elevator.
164 */ 164 */
165 if (!strcmp(str, "as")) 165 if (!strcmp(str, "as"))
166 strcpy(chosen_elevator, "anticipatory"); 166 strcpy(chosen_elevator, "anticipatory");
167 else 167 else
168 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1); 168 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
169 return 1; 169 return 1;
170 } 170 }
171 171
172 __setup("elevator=", elevator_setup); 172 __setup("elevator=", elevator_setup);
173 173
174 static struct kobj_type elv_ktype; 174 static struct kobj_type elv_ktype;
175 175
176 static elevator_t *elevator_alloc(struct request_queue *q, 176 static elevator_t *elevator_alloc(struct request_queue *q,
177 struct elevator_type *e) 177 struct elevator_type *e)
178 { 178 {
179 elevator_t *eq; 179 elevator_t *eq;
180 int i; 180 int i;
181 181
182 eq = kmalloc_node(sizeof(elevator_t), GFP_KERNEL | __GFP_ZERO, q->node); 182 eq = kmalloc_node(sizeof(elevator_t), GFP_KERNEL | __GFP_ZERO, q->node);
183 if (unlikely(!eq)) 183 if (unlikely(!eq))
184 goto err; 184 goto err;
185 185
186 eq->ops = &e->ops; 186 eq->ops = &e->ops;
187 eq->elevator_type = e; 187 eq->elevator_type = e;
188 kobject_init(&eq->kobj); 188 kobject_init(&eq->kobj);
189 kobject_set_name(&eq->kobj, "%s", "iosched"); 189 kobject_set_name(&eq->kobj, "%s", "iosched");
190 eq->kobj.ktype = &elv_ktype; 190 eq->kobj.ktype = &elv_ktype;
191 mutex_init(&eq->sysfs_lock); 191 mutex_init(&eq->sysfs_lock);
192 192
193 eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES, 193 eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES,
194 GFP_KERNEL, q->node); 194 GFP_KERNEL, q->node);
195 if (!eq->hash) 195 if (!eq->hash)
196 goto err; 196 goto err;
197 197
198 for (i = 0; i < ELV_HASH_ENTRIES; i++) 198 for (i = 0; i < ELV_HASH_ENTRIES; i++)
199 INIT_HLIST_HEAD(&eq->hash[i]); 199 INIT_HLIST_HEAD(&eq->hash[i]);
200 200
201 return eq; 201 return eq;
202 err: 202 err:
203 kfree(eq); 203 kfree(eq);
204 elevator_put(e); 204 elevator_put(e);
205 return NULL; 205 return NULL;
206 } 206 }
207 207
208 static void elevator_release(struct kobject *kobj) 208 static void elevator_release(struct kobject *kobj)
209 { 209 {
210 elevator_t *e = container_of(kobj, elevator_t, kobj); 210 elevator_t *e = container_of(kobj, elevator_t, kobj);
211 211
212 elevator_put(e->elevator_type); 212 elevator_put(e->elevator_type);
213 kfree(e->hash); 213 kfree(e->hash);
214 kfree(e); 214 kfree(e);
215 } 215 }
216 216
217 int elevator_init(struct request_queue *q, char *name) 217 int elevator_init(struct request_queue *q, char *name)
218 { 218 {
219 struct elevator_type *e = NULL; 219 struct elevator_type *e = NULL;
220 struct elevator_queue *eq; 220 struct elevator_queue *eq;
221 int ret = 0; 221 int ret = 0;
222 void *data; 222 void *data;
223 223
224 INIT_LIST_HEAD(&q->queue_head); 224 INIT_LIST_HEAD(&q->queue_head);
225 q->last_merge = NULL; 225 q->last_merge = NULL;
226 q->end_sector = 0; 226 q->end_sector = 0;
227 q->boundary_rq = NULL; 227 q->boundary_rq = NULL;
228 228
229 if (name && !(e = elevator_get(name))) 229 if (name && !(e = elevator_get(name)))
230 return -EINVAL; 230 return -EINVAL;
231 231
232 if (!e && *chosen_elevator && !(e = elevator_get(chosen_elevator))) 232 if (!e && *chosen_elevator && !(e = elevator_get(chosen_elevator)))
233 printk("I/O scheduler %s not found\n", chosen_elevator); 233 printk("I/O scheduler %s not found\n", chosen_elevator);
234 234
235 if (!e && !(e = elevator_get(CONFIG_DEFAULT_IOSCHED))) { 235 if (!e && !(e = elevator_get(CONFIG_DEFAULT_IOSCHED))) {
236 printk("Default I/O scheduler not found, using no-op\n"); 236 printk("Default I/O scheduler not found, using no-op\n");
237 e = elevator_get("noop"); 237 e = elevator_get("noop");
238 } 238 }
239 239
240 eq = elevator_alloc(q, e); 240 eq = elevator_alloc(q, e);
241 if (!eq) 241 if (!eq)
242 return -ENOMEM; 242 return -ENOMEM;
243 243
244 data = elevator_init_queue(q, eq); 244 data = elevator_init_queue(q, eq);
245 if (!data) { 245 if (!data) {
246 kobject_put(&eq->kobj); 246 kobject_put(&eq->kobj);
247 return -ENOMEM; 247 return -ENOMEM;
248 } 248 }
249 249
250 elevator_attach(q, eq, data); 250 elevator_attach(q, eq, data);
251 return ret; 251 return ret;
252 } 252 }
253 253
254 EXPORT_SYMBOL(elevator_init); 254 EXPORT_SYMBOL(elevator_init);
255 255
256 void elevator_exit(elevator_t *e) 256 void elevator_exit(elevator_t *e)
257 { 257 {
258 mutex_lock(&e->sysfs_lock); 258 mutex_lock(&e->sysfs_lock);
259 if (e->ops->elevator_exit_fn) 259 if (e->ops->elevator_exit_fn)
260 e->ops->elevator_exit_fn(e); 260 e->ops->elevator_exit_fn(e);
261 e->ops = NULL; 261 e->ops = NULL;
262 mutex_unlock(&e->sysfs_lock); 262 mutex_unlock(&e->sysfs_lock);
263 263
264 kobject_put(&e->kobj); 264 kobject_put(&e->kobj);
265 } 265 }
266 266
267 EXPORT_SYMBOL(elevator_exit); 267 EXPORT_SYMBOL(elevator_exit);
268 268
269 static void elv_activate_rq(struct request_queue *q, struct request *rq) 269 static void elv_activate_rq(struct request_queue *q, struct request *rq)
270 { 270 {
271 elevator_t *e = q->elevator; 271 elevator_t *e = q->elevator;
272 272
273 if (e->ops->elevator_activate_req_fn) 273 if (e->ops->elevator_activate_req_fn)
274 e->ops->elevator_activate_req_fn(q, rq); 274 e->ops->elevator_activate_req_fn(q, rq);
275 } 275 }
276 276
277 static void elv_deactivate_rq(struct request_queue *q, struct request *rq) 277 static void elv_deactivate_rq(struct request_queue *q, struct request *rq)
278 { 278 {
279 elevator_t *e = q->elevator; 279 elevator_t *e = q->elevator;
280 280
281 if (e->ops->elevator_deactivate_req_fn) 281 if (e->ops->elevator_deactivate_req_fn)
282 e->ops->elevator_deactivate_req_fn(q, rq); 282 e->ops->elevator_deactivate_req_fn(q, rq);
283 } 283 }
284 284
285 static inline void __elv_rqhash_del(struct request *rq) 285 static inline void __elv_rqhash_del(struct request *rq)
286 { 286 {
287 hlist_del_init(&rq->hash); 287 hlist_del_init(&rq->hash);
288 } 288 }
289 289
290 static void elv_rqhash_del(struct request_queue *q, struct request *rq) 290 static void elv_rqhash_del(struct request_queue *q, struct request *rq)
291 { 291 {
292 if (ELV_ON_HASH(rq)) 292 if (ELV_ON_HASH(rq))
293 __elv_rqhash_del(rq); 293 __elv_rqhash_del(rq);
294 } 294 }
295 295
296 static void elv_rqhash_add(struct request_queue *q, struct request *rq) 296 static void elv_rqhash_add(struct request_queue *q, struct request *rq)
297 { 297 {
298 elevator_t *e = q->elevator; 298 elevator_t *e = q->elevator;
299 299
300 BUG_ON(ELV_ON_HASH(rq)); 300 BUG_ON(ELV_ON_HASH(rq));
301 hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]); 301 hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]);
302 } 302 }
303 303
304 static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) 304 static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
305 { 305 {
306 __elv_rqhash_del(rq); 306 __elv_rqhash_del(rq);
307 elv_rqhash_add(q, rq); 307 elv_rqhash_add(q, rq);
308 } 308 }
309 309
310 static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) 310 static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
311 { 311 {
312 elevator_t *e = q->elevator; 312 elevator_t *e = q->elevator;
313 struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)]; 313 struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)];
314 struct hlist_node *entry, *next; 314 struct hlist_node *entry, *next;
315 struct request *rq; 315 struct request *rq;
316 316
317 hlist_for_each_entry_safe(rq, entry, next, hash_list, hash) { 317 hlist_for_each_entry_safe(rq, entry, next, hash_list, hash) {
318 BUG_ON(!ELV_ON_HASH(rq)); 318 BUG_ON(!ELV_ON_HASH(rq));
319 319
320 if (unlikely(!rq_mergeable(rq))) { 320 if (unlikely(!rq_mergeable(rq))) {
321 __elv_rqhash_del(rq); 321 __elv_rqhash_del(rq);
322 continue; 322 continue;
323 } 323 }
324 324
325 if (rq_hash_key(rq) == offset) 325 if (rq_hash_key(rq) == offset)
326 return rq; 326 return rq;
327 } 327 }
328 328
329 return NULL; 329 return NULL;
330 } 330 }
331 331
332 /* 332 /*
333 * RB-tree support functions for inserting/lookup/removal of requests 333 * RB-tree support functions for inserting/lookup/removal of requests
334 * in a sorted RB tree. 334 * in a sorted RB tree.
335 */ 335 */
336 struct request *elv_rb_add(struct rb_root *root, struct request *rq) 336 struct request *elv_rb_add(struct rb_root *root, struct request *rq)
337 { 337 {
338 struct rb_node **p = &root->rb_node; 338 struct rb_node **p = &root->rb_node;
339 struct rb_node *parent = NULL; 339 struct rb_node *parent = NULL;
340 struct request *__rq; 340 struct request *__rq;
341 341
342 while (*p) { 342 while (*p) {
343 parent = *p; 343 parent = *p;
344 __rq = rb_entry(parent, struct request, rb_node); 344 __rq = rb_entry(parent, struct request, rb_node);
345 345
346 if (rq->sector < __rq->sector) 346 if (rq->sector < __rq->sector)
347 p = &(*p)->rb_left; 347 p = &(*p)->rb_left;
348 else if (rq->sector > __rq->sector) 348 else if (rq->sector > __rq->sector)
349 p = &(*p)->rb_right; 349 p = &(*p)->rb_right;
350 else 350 else
351 return __rq; 351 return __rq;
352 } 352 }
353 353
354 rb_link_node(&rq->rb_node, parent, p); 354 rb_link_node(&rq->rb_node, parent, p);
355 rb_insert_color(&rq->rb_node, root); 355 rb_insert_color(&rq->rb_node, root);
356 return NULL; 356 return NULL;
357 } 357 }
358 358
359 EXPORT_SYMBOL(elv_rb_add); 359 EXPORT_SYMBOL(elv_rb_add);
360 360
361 void elv_rb_del(struct rb_root *root, struct request *rq) 361 void elv_rb_del(struct rb_root *root, struct request *rq)
362 { 362 {
363 BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); 363 BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
364 rb_erase(&rq->rb_node, root); 364 rb_erase(&rq->rb_node, root);
365 RB_CLEAR_NODE(&rq->rb_node); 365 RB_CLEAR_NODE(&rq->rb_node);
366 } 366 }
367 367
368 EXPORT_SYMBOL(elv_rb_del); 368 EXPORT_SYMBOL(elv_rb_del);
369 369
370 struct request *elv_rb_find(struct rb_root *root, sector_t sector) 370 struct request *elv_rb_find(struct rb_root *root, sector_t sector)
371 { 371 {
372 struct rb_node *n = root->rb_node; 372 struct rb_node *n = root->rb_node;
373 struct request *rq; 373 struct request *rq;
374 374
375 while (n) { 375 while (n) {
376 rq = rb_entry(n, struct request, rb_node); 376 rq = rb_entry(n, struct request, rb_node);
377 377
378 if (sector < rq->sector) 378 if (sector < rq->sector)
379 n = n->rb_left; 379 n = n->rb_left;
380 else if (sector > rq->sector) 380 else if (sector > rq->sector)
381 n = n->rb_right; 381 n = n->rb_right;
382 else 382 else
383 return rq; 383 return rq;
384 } 384 }
385 385
386 return NULL; 386 return NULL;
387 } 387 }
388 388
389 EXPORT_SYMBOL(elv_rb_find); 389 EXPORT_SYMBOL(elv_rb_find);
390 390
391 /* 391 /*
392 * Insert rq into dispatch queue of q. Queue lock must be held on 392 * Insert rq into dispatch queue of q. Queue lock must be held on
393 * entry. rq is sort insted into the dispatch queue. To be used by 393 * entry. rq is sort insted into the dispatch queue. To be used by
394 * specific elevators. 394 * specific elevators.
395 */ 395 */
396 void elv_dispatch_sort(struct request_queue *q, struct request *rq) 396 void elv_dispatch_sort(struct request_queue *q, struct request *rq)
397 { 397 {
398 sector_t boundary; 398 sector_t boundary;
399 struct list_head *entry; 399 struct list_head *entry;
400 400
401 if (q->last_merge == rq) 401 if (q->last_merge == rq)
402 q->last_merge = NULL; 402 q->last_merge = NULL;
403 403
404 elv_rqhash_del(q, rq); 404 elv_rqhash_del(q, rq);
405 405
406 q->nr_sorted--; 406 q->nr_sorted--;
407 407
408 boundary = q->end_sector; 408 boundary = q->end_sector;
409 409
410 list_for_each_prev(entry, &q->queue_head) { 410 list_for_each_prev(entry, &q->queue_head) {
411 struct request *pos = list_entry_rq(entry); 411 struct request *pos = list_entry_rq(entry);
412 412
413 if (rq_data_dir(rq) != rq_data_dir(pos)) 413 if (rq_data_dir(rq) != rq_data_dir(pos))
414 break; 414 break;
415 if (pos->cmd_flags & (REQ_SOFTBARRIER|REQ_HARDBARRIER|REQ_STARTED)) 415 if (pos->cmd_flags & (REQ_SOFTBARRIER|REQ_HARDBARRIER|REQ_STARTED))
416 break; 416 break;
417 if (rq->sector >= boundary) { 417 if (rq->sector >= boundary) {
418 if (pos->sector < boundary) 418 if (pos->sector < boundary)
419 continue; 419 continue;
420 } else { 420 } else {
421 if (pos->sector >= boundary) 421 if (pos->sector >= boundary)
422 break; 422 break;
423 } 423 }
424 if (rq->sector >= pos->sector) 424 if (rq->sector >= pos->sector)
425 break; 425 break;
426 } 426 }
427 427
428 list_add(&rq->queuelist, entry); 428 list_add(&rq->queuelist, entry);
429 } 429 }
430 430
431 EXPORT_SYMBOL(elv_dispatch_sort); 431 EXPORT_SYMBOL(elv_dispatch_sort);
432 432
433 /* 433 /*
434 * Insert rq into dispatch queue of q. Queue lock must be held on 434 * Insert rq into dispatch queue of q. Queue lock must be held on
435 * entry. rq is added to the back of the dispatch queue. To be used by 435 * entry. rq is added to the back of the dispatch queue. To be used by
436 * specific elevators. 436 * specific elevators.
437 */ 437 */
438 void elv_dispatch_add_tail(struct request_queue *q, struct request *rq) 438 void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
439 { 439 {
440 if (q->last_merge == rq) 440 if (q->last_merge == rq)
441 q->last_merge = NULL; 441 q->last_merge = NULL;
442 442
443 elv_rqhash_del(q, rq); 443 elv_rqhash_del(q, rq);
444 444
445 q->nr_sorted--; 445 q->nr_sorted--;
446 446
447 q->end_sector = rq_end_sector(rq); 447 q->end_sector = rq_end_sector(rq);
448 q->boundary_rq = rq; 448 q->boundary_rq = rq;
449 list_add_tail(&rq->queuelist, &q->queue_head); 449 list_add_tail(&rq->queuelist, &q->queue_head);
450 } 450 }
451 451
452 EXPORT_SYMBOL(elv_dispatch_add_tail); 452 EXPORT_SYMBOL(elv_dispatch_add_tail);
453 453
454 int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) 454 int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
455 { 455 {
456 elevator_t *e = q->elevator; 456 elevator_t *e = q->elevator;
457 struct request *__rq; 457 struct request *__rq;
458 int ret; 458 int ret;
459 459
460 /* 460 /*
461 * First try one-hit cache. 461 * First try one-hit cache.
462 */ 462 */
463 if (q->last_merge) { 463 if (q->last_merge) {
464 ret = elv_try_merge(q->last_merge, bio); 464 ret = elv_try_merge(q->last_merge, bio);
465 if (ret != ELEVATOR_NO_MERGE) { 465 if (ret != ELEVATOR_NO_MERGE) {
466 *req = q->last_merge; 466 *req = q->last_merge;
467 return ret; 467 return ret;
468 } 468 }
469 } 469 }
470 470
471 /* 471 /*
472 * See if our hash lookup can find a potential backmerge. 472 * See if our hash lookup can find a potential backmerge.
473 */ 473 */
474 __rq = elv_rqhash_find(q, bio->bi_sector); 474 __rq = elv_rqhash_find(q, bio->bi_sector);
475 if (__rq && elv_rq_merge_ok(__rq, bio)) { 475 if (__rq && elv_rq_merge_ok(__rq, bio)) {
476 *req = __rq; 476 *req = __rq;
477 return ELEVATOR_BACK_MERGE; 477 return ELEVATOR_BACK_MERGE;
478 } 478 }
479 479
480 if (e->ops->elevator_merge_fn) 480 if (e->ops->elevator_merge_fn)
481 return e->ops->elevator_merge_fn(q, req, bio); 481 return e->ops->elevator_merge_fn(q, req, bio);
482 482
483 return ELEVATOR_NO_MERGE; 483 return ELEVATOR_NO_MERGE;
484 } 484 }
485 485
486 void elv_merged_request(struct request_queue *q, struct request *rq, int type) 486 void elv_merged_request(struct request_queue *q, struct request *rq, int type)
487 { 487 {
488 elevator_t *e = q->elevator; 488 elevator_t *e = q->elevator;
489 489
490 if (e->ops->elevator_merged_fn) 490 if (e->ops->elevator_merged_fn)
491 e->ops->elevator_merged_fn(q, rq, type); 491 e->ops->elevator_merged_fn(q, rq, type);
492 492
493 if (type == ELEVATOR_BACK_MERGE) 493 if (type == ELEVATOR_BACK_MERGE)
494 elv_rqhash_reposition(q, rq); 494 elv_rqhash_reposition(q, rq);
495 495
496 q->last_merge = rq; 496 q->last_merge = rq;
497 } 497 }
498 498
499 void elv_merge_requests(struct request_queue *q, struct request *rq, 499 void elv_merge_requests(struct request_queue *q, struct request *rq,
500 struct request *next) 500 struct request *next)
501 { 501 {
502 elevator_t *e = q->elevator; 502 elevator_t *e = q->elevator;
503 503
504 if (e->ops->elevator_merge_req_fn) 504 if (e->ops->elevator_merge_req_fn)
505 e->ops->elevator_merge_req_fn(q, rq, next); 505 e->ops->elevator_merge_req_fn(q, rq, next);
506 506
507 elv_rqhash_reposition(q, rq); 507 elv_rqhash_reposition(q, rq);
508 elv_rqhash_del(q, next); 508 elv_rqhash_del(q, next);
509 509
510 q->nr_sorted--; 510 q->nr_sorted--;
511 q->last_merge = rq; 511 q->last_merge = rq;
512 } 512 }
513 513
514 void elv_requeue_request(struct request_queue *q, struct request *rq) 514 void elv_requeue_request(struct request_queue *q, struct request *rq)
515 { 515 {
516 /* 516 /*
517 * it already went through dequeue, we need to decrement the 517 * it already went through dequeue, we need to decrement the
518 * in_flight count again 518 * in_flight count again
519 */ 519 */
520 if (blk_account_rq(rq)) { 520 if (blk_account_rq(rq)) {
521 q->in_flight--; 521 q->in_flight--;
522 if (blk_sorted_rq(rq)) 522 if (blk_sorted_rq(rq))
523 elv_deactivate_rq(q, rq); 523 elv_deactivate_rq(q, rq);
524 } 524 }
525 525
526 rq->cmd_flags &= ~REQ_STARTED; 526 rq->cmd_flags &= ~REQ_STARTED;
527 527
528 elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); 528 elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE);
529 } 529 }
530 530
531 static void elv_drain_elevator(struct request_queue *q) 531 static void elv_drain_elevator(struct request_queue *q)
532 { 532 {
533 static int printed; 533 static int printed;
534 while (q->elevator->ops->elevator_dispatch_fn(q, 1)) 534 while (q->elevator->ops->elevator_dispatch_fn(q, 1))
535 ; 535 ;
536 if (q->nr_sorted == 0) 536 if (q->nr_sorted == 0)
537 return; 537 return;
538 if (printed++ < 10) { 538 if (printed++ < 10) {
539 printk(KERN_ERR "%s: forced dispatching is broken " 539 printk(KERN_ERR "%s: forced dispatching is broken "
540 "(nr_sorted=%u), please report this\n", 540 "(nr_sorted=%u), please report this\n",
541 q->elevator->elevator_type->elevator_name, q->nr_sorted); 541 q->elevator->elevator_type->elevator_name, q->nr_sorted);
542 } 542 }
543 } 543 }
544 544
545 void elv_insert(struct request_queue *q, struct request *rq, int where) 545 void elv_insert(struct request_queue *q, struct request *rq, int where)
546 { 546 {
547 struct list_head *pos; 547 struct list_head *pos;
548 unsigned ordseq; 548 unsigned ordseq;
549 int unplug_it = 1; 549 int unplug_it = 1;
550 550
551 blk_add_trace_rq(q, rq, BLK_TA_INSERT); 551 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
552 552
553 rq->q = q; 553 rq->q = q;
554 554
555 switch (where) { 555 switch (where) {
556 case ELEVATOR_INSERT_FRONT: 556 case ELEVATOR_INSERT_FRONT:
557 rq->cmd_flags |= REQ_SOFTBARRIER; 557 rq->cmd_flags |= REQ_SOFTBARRIER;
558 558
559 list_add(&rq->queuelist, &q->queue_head); 559 list_add(&rq->queuelist, &q->queue_head);
560 break; 560 break;
561 561
562 case ELEVATOR_INSERT_BACK: 562 case ELEVATOR_INSERT_BACK:
563 rq->cmd_flags |= REQ_SOFTBARRIER; 563 rq->cmd_flags |= REQ_SOFTBARRIER;
564 elv_drain_elevator(q); 564 elv_drain_elevator(q);
565 list_add_tail(&rq->queuelist, &q->queue_head); 565 list_add_tail(&rq->queuelist, &q->queue_head);
566 /* 566 /*
567 * We kick the queue here for the following reasons. 567 * We kick the queue here for the following reasons.
568 * - The elevator might have returned NULL previously 568 * - The elevator might have returned NULL previously
569 * to delay requests and returned them now. As the 569 * to delay requests and returned them now. As the
570 * queue wasn't empty before this request, ll_rw_blk 570 * queue wasn't empty before this request, ll_rw_blk
571 * won't run the queue on return, resulting in hang. 571 * won't run the queue on return, resulting in hang.
572 * - Usually, back inserted requests won't be merged 572 * - Usually, back inserted requests won't be merged
573 * with anything. There's no point in delaying queue 573 * with anything. There's no point in delaying queue
574 * processing. 574 * processing.
575 */ 575 */
576 blk_remove_plug(q); 576 blk_remove_plug(q);
577 q->request_fn(q); 577 q->request_fn(q);
578 break; 578 break;
579 579
580 case ELEVATOR_INSERT_SORT: 580 case ELEVATOR_INSERT_SORT:
581 BUG_ON(!blk_fs_request(rq)); 581 BUG_ON(!blk_fs_request(rq));
582 rq->cmd_flags |= REQ_SORTED; 582 rq->cmd_flags |= REQ_SORTED;
583 q->nr_sorted++; 583 q->nr_sorted++;
584 if (rq_mergeable(rq)) { 584 if (rq_mergeable(rq)) {
585 elv_rqhash_add(q, rq); 585 elv_rqhash_add(q, rq);
586 if (!q->last_merge) 586 if (!q->last_merge)
587 q->last_merge = rq; 587 q->last_merge = rq;
588 } 588 }
589 589
590 /* 590 /*
591 * Some ioscheds (cfq) run q->request_fn directly, so 591 * Some ioscheds (cfq) run q->request_fn directly, so
592 * rq cannot be accessed after calling 592 * rq cannot be accessed after calling
593 * elevator_add_req_fn. 593 * elevator_add_req_fn.
594 */ 594 */
595 q->elevator->ops->elevator_add_req_fn(q, rq); 595 q->elevator->ops->elevator_add_req_fn(q, rq);
596 break; 596 break;
597 597
598 case ELEVATOR_INSERT_REQUEUE: 598 case ELEVATOR_INSERT_REQUEUE:
599 /* 599 /*
600 * If ordered flush isn't in progress, we do front 600 * If ordered flush isn't in progress, we do front
601 * insertion; otherwise, requests should be requeued 601 * insertion; otherwise, requests should be requeued
602 * in ordseq order. 602 * in ordseq order.
603 */ 603 */
604 rq->cmd_flags |= REQ_SOFTBARRIER; 604 rq->cmd_flags |= REQ_SOFTBARRIER;
605 605
606 /* 606 /*
607 * Most requeues happen because of a busy condition, 607 * Most requeues happen because of a busy condition,
608 * don't force unplug of the queue for that case. 608 * don't force unplug of the queue for that case.
609 */ 609 */
610 unplug_it = 0; 610 unplug_it = 0;
611 611
612 if (q->ordseq == 0) { 612 if (q->ordseq == 0) {
613 list_add(&rq->queuelist, &q->queue_head); 613 list_add(&rq->queuelist, &q->queue_head);
614 break; 614 break;
615 } 615 }
616 616
617 ordseq = blk_ordered_req_seq(rq); 617 ordseq = blk_ordered_req_seq(rq);
618 618
619 list_for_each(pos, &q->queue_head) { 619 list_for_each(pos, &q->queue_head) {
620 struct request *pos_rq = list_entry_rq(pos); 620 struct request *pos_rq = list_entry_rq(pos);
621 if (ordseq <= blk_ordered_req_seq(pos_rq)) 621 if (ordseq <= blk_ordered_req_seq(pos_rq))
622 break; 622 break;
623 } 623 }
624 624
625 list_add_tail(&rq->queuelist, pos); 625 list_add_tail(&rq->queuelist, pos);
626 break; 626 break;
627 627
628 default: 628 default:
629 printk(KERN_ERR "%s: bad insertion point %d\n", 629 printk(KERN_ERR "%s: bad insertion point %d\n",
630 __FUNCTION__, where); 630 __FUNCTION__, where);
631 BUG(); 631 BUG();
632 } 632 }
633 633
634 if (unplug_it && blk_queue_plugged(q)) { 634 if (unplug_it && blk_queue_plugged(q)) {
635 int nrq = q->rq.count[READ] + q->rq.count[WRITE] 635 int nrq = q->rq.count[READ] + q->rq.count[WRITE]
636 - q->in_flight; 636 - q->in_flight;
637 637
638 if (nrq >= q->unplug_thresh) 638 if (nrq >= q->unplug_thresh)
639 __generic_unplug_device(q); 639 __generic_unplug_device(q);
640 } 640 }
641 } 641 }
642 642
643 void __elv_add_request(struct request_queue *q, struct request *rq, int where, 643 void __elv_add_request(struct request_queue *q, struct request *rq, int where,
644 int plug) 644 int plug)
645 { 645 {
646 if (q->ordcolor) 646 if (q->ordcolor)
647 rq->cmd_flags |= REQ_ORDERED_COLOR; 647 rq->cmd_flags |= REQ_ORDERED_COLOR;
648 648
649 if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { 649 if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
650 /* 650 /*
651 * toggle ordered color 651 * toggle ordered color
652 */ 652 */
653 if (blk_barrier_rq(rq)) 653 if (blk_barrier_rq(rq))
654 q->ordcolor ^= 1; 654 q->ordcolor ^= 1;
655 655
656 /* 656 /*
657 * barriers implicitly indicate back insertion 657 * barriers implicitly indicate back insertion
658 */ 658 */
659 if (where == ELEVATOR_INSERT_SORT) 659 if (where == ELEVATOR_INSERT_SORT)
660 where = ELEVATOR_INSERT_BACK; 660 where = ELEVATOR_INSERT_BACK;
661 661
662 /* 662 /*
663 * this request is scheduling boundary, update 663 * this request is scheduling boundary, update
664 * end_sector 664 * end_sector
665 */ 665 */
666 if (blk_fs_request(rq)) { 666 if (blk_fs_request(rq)) {
667 q->end_sector = rq_end_sector(rq); 667 q->end_sector = rq_end_sector(rq);
668 q->boundary_rq = rq; 668 q->boundary_rq = rq;
669 } 669 }
670 } else if (!(rq->cmd_flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT) 670 } else if (!(rq->cmd_flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT)
671 where = ELEVATOR_INSERT_BACK; 671 where = ELEVATOR_INSERT_BACK;
672 672
673 if (plug) 673 if (plug)
674 blk_plug_device(q); 674 blk_plug_device(q);
675 675
676 elv_insert(q, rq, where); 676 elv_insert(q, rq, where);
677 } 677 }
678 678
679 EXPORT_SYMBOL(__elv_add_request); 679 EXPORT_SYMBOL(__elv_add_request);
680 680
681 void elv_add_request(struct request_queue *q, struct request *rq, int where, 681 void elv_add_request(struct request_queue *q, struct request *rq, int where,
682 int plug) 682 int plug)
683 { 683 {
684 unsigned long flags; 684 unsigned long flags;
685 685
686 spin_lock_irqsave(q->queue_lock, flags); 686 spin_lock_irqsave(q->queue_lock, flags);
687 __elv_add_request(q, rq, where, plug); 687 __elv_add_request(q, rq, where, plug);
688 spin_unlock_irqrestore(q->queue_lock, flags); 688 spin_unlock_irqrestore(q->queue_lock, flags);
689 } 689 }
690 690
691 EXPORT_SYMBOL(elv_add_request); 691 EXPORT_SYMBOL(elv_add_request);
692 692
693 static inline struct request *__elv_next_request(struct request_queue *q) 693 static inline struct request *__elv_next_request(struct request_queue *q)
694 { 694 {
695 struct request *rq; 695 struct request *rq;
696 696
697 while (1) { 697 while (1) {
698 while (!list_empty(&q->queue_head)) { 698 while (!list_empty(&q->queue_head)) {
699 rq = list_entry_rq(q->queue_head.next); 699 rq = list_entry_rq(q->queue_head.next);
700 if (blk_do_ordered(q, &rq)) 700 if (blk_do_ordered(q, &rq))
701 return rq; 701 return rq;
702 } 702 }
703 703
704 if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) 704 if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
705 return NULL; 705 return NULL;
706 } 706 }
707 } 707 }
708 708
709 struct request *elv_next_request(struct request_queue *q) 709 struct request *elv_next_request(struct request_queue *q)
710 { 710 {
711 struct request *rq; 711 struct request *rq;
712 int ret; 712 int ret;
713 713
714 while ((rq = __elv_next_request(q)) != NULL) { 714 while ((rq = __elv_next_request(q)) != NULL) {
715 /*
716 * Kill the empty barrier place holder, the driver must
717 * not ever see it.
718 */
719 if (blk_empty_barrier(rq)) {
720 end_queued_request(rq, 1);
721 continue;
722 }
715 if (!(rq->cmd_flags & REQ_STARTED)) { 723 if (!(rq->cmd_flags & REQ_STARTED)) {
716 /* 724 /*
717 * This is the first time the device driver 725 * This is the first time the device driver
718 * sees this request (possibly after 726 * sees this request (possibly after
719 * requeueing). Notify IO scheduler. 727 * requeueing). Notify IO scheduler.
720 */ 728 */
721 if (blk_sorted_rq(rq)) 729 if (blk_sorted_rq(rq))
722 elv_activate_rq(q, rq); 730 elv_activate_rq(q, rq);
723 731
724 /* 732 /*
725 * just mark as started even if we don't start 733 * just mark as started even if we don't start
726 * it, a request that has been delayed should 734 * it, a request that has been delayed should
727 * not be passed by new incoming requests 735 * not be passed by new incoming requests
728 */ 736 */
729 rq->cmd_flags |= REQ_STARTED; 737 rq->cmd_flags |= REQ_STARTED;
730 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 738 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
731 } 739 }
732 740
733 if (!q->boundary_rq || q->boundary_rq == rq) { 741 if (!q->boundary_rq || q->boundary_rq == rq) {
734 q->end_sector = rq_end_sector(rq); 742 q->end_sector = rq_end_sector(rq);
735 q->boundary_rq = NULL; 743 q->boundary_rq = NULL;
736 } 744 }
737 745
738 if ((rq->cmd_flags & REQ_DONTPREP) || !q->prep_rq_fn) 746 if ((rq->cmd_flags & REQ_DONTPREP) || !q->prep_rq_fn)
739 break; 747 break;
740 748
741 ret = q->prep_rq_fn(q, rq); 749 ret = q->prep_rq_fn(q, rq);
742 if (ret == BLKPREP_OK) { 750 if (ret == BLKPREP_OK) {
743 break; 751 break;
744 } else if (ret == BLKPREP_DEFER) { 752 } else if (ret == BLKPREP_DEFER) {
745 /* 753 /*
746 * the request may have been (partially) prepped. 754 * the request may have been (partially) prepped.
747 * we need to keep this request in the front to 755 * we need to keep this request in the front to
748 * avoid resource deadlock. REQ_STARTED will 756 * avoid resource deadlock. REQ_STARTED will
749 * prevent other fs requests from passing this one. 757 * prevent other fs requests from passing this one.
750 */ 758 */
751 rq = NULL; 759 rq = NULL;
752 break; 760 break;
753 } else if (ret == BLKPREP_KILL) { 761 } else if (ret == BLKPREP_KILL) {
754 rq->cmd_flags |= REQ_QUIET; 762 rq->cmd_flags |= REQ_QUIET;
755 end_queued_request(rq, 0); 763 end_queued_request(rq, 0);
756 } else { 764 } else {
757 printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__, 765 printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__,
758 ret); 766 ret);
759 break; 767 break;
760 } 768 }
761 } 769 }
762 770
763 return rq; 771 return rq;
764 } 772 }
765 773
766 EXPORT_SYMBOL(elv_next_request); 774 EXPORT_SYMBOL(elv_next_request);
767 775
768 void elv_dequeue_request(struct request_queue *q, struct request *rq) 776 void elv_dequeue_request(struct request_queue *q, struct request *rq)
769 { 777 {
770 BUG_ON(list_empty(&rq->queuelist)); 778 BUG_ON(list_empty(&rq->queuelist));
771 BUG_ON(ELV_ON_HASH(rq)); 779 BUG_ON(ELV_ON_HASH(rq));
772 780
773 list_del_init(&rq->queuelist); 781 list_del_init(&rq->queuelist);
774 782
775 /* 783 /*
776 * the time frame between a request being removed from the lists 784 * the time frame between a request being removed from the lists
777 * and to it is freed is accounted as io that is in progress at 785 * and to it is freed is accounted as io that is in progress at
778 * the driver side. 786 * the driver side.
779 */ 787 */
780 if (blk_account_rq(rq)) 788 if (blk_account_rq(rq))
781 q->in_flight++; 789 q->in_flight++;
782 } 790 }
783 791
784 EXPORT_SYMBOL(elv_dequeue_request); 792 EXPORT_SYMBOL(elv_dequeue_request);
785 793
786 int elv_queue_empty(struct request_queue *q) 794 int elv_queue_empty(struct request_queue *q)
787 { 795 {
788 elevator_t *e = q->elevator; 796 elevator_t *e = q->elevator;
789 797
790 if (!list_empty(&q->queue_head)) 798 if (!list_empty(&q->queue_head))
791 return 0; 799 return 0;
792 800
793 if (e->ops->elevator_queue_empty_fn) 801 if (e->ops->elevator_queue_empty_fn)
794 return e->ops->elevator_queue_empty_fn(q); 802 return e->ops->elevator_queue_empty_fn(q);
795 803
796 return 1; 804 return 1;
797 } 805 }
798 806
799 EXPORT_SYMBOL(elv_queue_empty); 807 EXPORT_SYMBOL(elv_queue_empty);
800 808
801 struct request *elv_latter_request(struct request_queue *q, struct request *rq) 809 struct request *elv_latter_request(struct request_queue *q, struct request *rq)
802 { 810 {
803 elevator_t *e = q->elevator; 811 elevator_t *e = q->elevator;
804 812
805 if (e->ops->elevator_latter_req_fn) 813 if (e->ops->elevator_latter_req_fn)
806 return e->ops->elevator_latter_req_fn(q, rq); 814 return e->ops->elevator_latter_req_fn(q, rq);
807 return NULL; 815 return NULL;
808 } 816 }
809 817
810 struct request *elv_former_request(struct request_queue *q, struct request *rq) 818 struct request *elv_former_request(struct request_queue *q, struct request *rq)
811 { 819 {
812 elevator_t *e = q->elevator; 820 elevator_t *e = q->elevator;
813 821
814 if (e->ops->elevator_former_req_fn) 822 if (e->ops->elevator_former_req_fn)
815 return e->ops->elevator_former_req_fn(q, rq); 823 return e->ops->elevator_former_req_fn(q, rq);
816 return NULL; 824 return NULL;
817 } 825 }
818 826
819 int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 827 int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
820 { 828 {
821 elevator_t *e = q->elevator; 829 elevator_t *e = q->elevator;
822 830
823 if (e->ops->elevator_set_req_fn) 831 if (e->ops->elevator_set_req_fn)
824 return e->ops->elevator_set_req_fn(q, rq, gfp_mask); 832 return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
825 833
826 rq->elevator_private = NULL; 834 rq->elevator_private = NULL;
827 return 0; 835 return 0;
828 } 836 }
829 837
830 void elv_put_request(struct request_queue *q, struct request *rq) 838 void elv_put_request(struct request_queue *q, struct request *rq)
831 { 839 {
832 elevator_t *e = q->elevator; 840 elevator_t *e = q->elevator;
833 841
834 if (e->ops->elevator_put_req_fn) 842 if (e->ops->elevator_put_req_fn)
835 e->ops->elevator_put_req_fn(rq); 843 e->ops->elevator_put_req_fn(rq);
836 } 844 }
837 845
838 int elv_may_queue(struct request_queue *q, int rw) 846 int elv_may_queue(struct request_queue *q, int rw)
839 { 847 {
840 elevator_t *e = q->elevator; 848 elevator_t *e = q->elevator;
841 849
842 if (e->ops->elevator_may_queue_fn) 850 if (e->ops->elevator_may_queue_fn)
843 return e->ops->elevator_may_queue_fn(q, rw); 851 return e->ops->elevator_may_queue_fn(q, rw);
844 852
845 return ELV_MQUEUE_MAY; 853 return ELV_MQUEUE_MAY;
846 } 854 }
847 855
848 void elv_completed_request(struct request_queue *q, struct request *rq) 856 void elv_completed_request(struct request_queue *q, struct request *rq)
849 { 857 {
850 elevator_t *e = q->elevator; 858 elevator_t *e = q->elevator;
851 859
852 /* 860 /*
853 * request is released from the driver, io must be done 861 * request is released from the driver, io must be done
854 */ 862 */
855 if (blk_account_rq(rq)) { 863 if (blk_account_rq(rq)) {
856 q->in_flight--; 864 q->in_flight--;
857 if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn) 865 if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
858 e->ops->elevator_completed_req_fn(q, rq); 866 e->ops->elevator_completed_req_fn(q, rq);
859 } 867 }
860 868
861 /* 869 /*
862 * Check if the queue is waiting for fs requests to be 870 * Check if the queue is waiting for fs requests to be
863 * drained for flush sequence. 871 * drained for flush sequence.
864 */ 872 */
865 if (unlikely(q->ordseq)) { 873 if (unlikely(q->ordseq)) {
866 struct request *first_rq = list_entry_rq(q->queue_head.next); 874 struct request *first_rq = list_entry_rq(q->queue_head.next);
867 if (q->in_flight == 0 && 875 if (q->in_flight == 0 &&
868 blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN && 876 blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
869 blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) { 877 blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) {
870 blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0); 878 blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
871 q->request_fn(q); 879 q->request_fn(q);
872 } 880 }
873 } 881 }
874 } 882 }
875 883
876 #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) 884 #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
877 885
878 static ssize_t 886 static ssize_t
879 elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 887 elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
880 { 888 {
881 elevator_t *e = container_of(kobj, elevator_t, kobj); 889 elevator_t *e = container_of(kobj, elevator_t, kobj);
882 struct elv_fs_entry *entry = to_elv(attr); 890 struct elv_fs_entry *entry = to_elv(attr);
883 ssize_t error; 891 ssize_t error;
884 892
885 if (!entry->show) 893 if (!entry->show)
886 return -EIO; 894 return -EIO;
887 895
888 mutex_lock(&e->sysfs_lock); 896 mutex_lock(&e->sysfs_lock);
889 error = e->ops ? entry->show(e, page) : -ENOENT; 897 error = e->ops ? entry->show(e, page) : -ENOENT;
890 mutex_unlock(&e->sysfs_lock); 898 mutex_unlock(&e->sysfs_lock);
891 return error; 899 return error;
892 } 900 }
893 901
894 static ssize_t 902 static ssize_t
895 elv_attr_store(struct kobject *kobj, struct attribute *attr, 903 elv_attr_store(struct kobject *kobj, struct attribute *attr,
896 const char *page, size_t length) 904 const char *page, size_t length)
897 { 905 {
898 elevator_t *e = container_of(kobj, elevator_t, kobj); 906 elevator_t *e = container_of(kobj, elevator_t, kobj);
899 struct elv_fs_entry *entry = to_elv(attr); 907 struct elv_fs_entry *entry = to_elv(attr);
900 ssize_t error; 908 ssize_t error;
901 909
902 if (!entry->store) 910 if (!entry->store)
903 return -EIO; 911 return -EIO;
904 912
905 mutex_lock(&e->sysfs_lock); 913 mutex_lock(&e->sysfs_lock);
906 error = e->ops ? entry->store(e, page, length) : -ENOENT; 914 error = e->ops ? entry->store(e, page, length) : -ENOENT;
907 mutex_unlock(&e->sysfs_lock); 915 mutex_unlock(&e->sysfs_lock);
908 return error; 916 return error;
909 } 917 }
910 918
911 static struct sysfs_ops elv_sysfs_ops = { 919 static struct sysfs_ops elv_sysfs_ops = {
912 .show = elv_attr_show, 920 .show = elv_attr_show,
913 .store = elv_attr_store, 921 .store = elv_attr_store,
914 }; 922 };
915 923
916 static struct kobj_type elv_ktype = { 924 static struct kobj_type elv_ktype = {
917 .sysfs_ops = &elv_sysfs_ops, 925 .sysfs_ops = &elv_sysfs_ops,
918 .release = elevator_release, 926 .release = elevator_release,
919 }; 927 };
920 928
921 int elv_register_queue(struct request_queue *q) 929 int elv_register_queue(struct request_queue *q)
922 { 930 {
923 elevator_t *e = q->elevator; 931 elevator_t *e = q->elevator;
924 int error; 932 int error;
925 933
926 e->kobj.parent = &q->kobj; 934 e->kobj.parent = &q->kobj;
927 935
928 error = kobject_add(&e->kobj); 936 error = kobject_add(&e->kobj);
929 if (!error) { 937 if (!error) {
930 struct elv_fs_entry *attr = e->elevator_type->elevator_attrs; 938 struct elv_fs_entry *attr = e->elevator_type->elevator_attrs;
931 if (attr) { 939 if (attr) {
932 while (attr->attr.name) { 940 while (attr->attr.name) {
933 if (sysfs_create_file(&e->kobj, &attr->attr)) 941 if (sysfs_create_file(&e->kobj, &attr->attr))
934 break; 942 break;
935 attr++; 943 attr++;
936 } 944 }
937 } 945 }
938 kobject_uevent(&e->kobj, KOBJ_ADD); 946 kobject_uevent(&e->kobj, KOBJ_ADD);
939 } 947 }
940 return error; 948 return error;
941 } 949 }
942 950
943 static void __elv_unregister_queue(elevator_t *e) 951 static void __elv_unregister_queue(elevator_t *e)
944 { 952 {
945 kobject_uevent(&e->kobj, KOBJ_REMOVE); 953 kobject_uevent(&e->kobj, KOBJ_REMOVE);
946 kobject_del(&e->kobj); 954 kobject_del(&e->kobj);
947 } 955 }
948 956
949 void elv_unregister_queue(struct request_queue *q) 957 void elv_unregister_queue(struct request_queue *q)
950 { 958 {
951 if (q) 959 if (q)
952 __elv_unregister_queue(q->elevator); 960 __elv_unregister_queue(q->elevator);
953 } 961 }
954 962
955 int elv_register(struct elevator_type *e) 963 int elv_register(struct elevator_type *e)
956 { 964 {
957 char *def = ""; 965 char *def = "";
958 966
959 spin_lock(&elv_list_lock); 967 spin_lock(&elv_list_lock);
960 BUG_ON(elevator_find(e->elevator_name)); 968 BUG_ON(elevator_find(e->elevator_name));
961 list_add_tail(&e->list, &elv_list); 969 list_add_tail(&e->list, &elv_list);
962 spin_unlock(&elv_list_lock); 970 spin_unlock(&elv_list_lock);
963 971
964 if (!strcmp(e->elevator_name, chosen_elevator) || 972 if (!strcmp(e->elevator_name, chosen_elevator) ||
965 (!*chosen_elevator && 973 (!*chosen_elevator &&
966 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) 974 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
967 def = " (default)"; 975 def = " (default)";
968 976
969 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, def); 977 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, def);
970 return 0; 978 return 0;
971 } 979 }
972 EXPORT_SYMBOL_GPL(elv_register); 980 EXPORT_SYMBOL_GPL(elv_register);
973 981
974 void elv_unregister(struct elevator_type *e) 982 void elv_unregister(struct elevator_type *e)
975 { 983 {
976 struct task_struct *g, *p; 984 struct task_struct *g, *p;
977 985
978 /* 986 /*
979 * Iterate every thread in the process to remove the io contexts. 987 * Iterate every thread in the process to remove the io contexts.
980 */ 988 */
981 if (e->ops.trim) { 989 if (e->ops.trim) {
982 read_lock(&tasklist_lock); 990 read_lock(&tasklist_lock);
983 do_each_thread(g, p) { 991 do_each_thread(g, p) {
984 task_lock(p); 992 task_lock(p);
985 if (p->io_context) 993 if (p->io_context)
986 e->ops.trim(p->io_context); 994 e->ops.trim(p->io_context);
987 task_unlock(p); 995 task_unlock(p);
988 } while_each_thread(g, p); 996 } while_each_thread(g, p);
989 read_unlock(&tasklist_lock); 997 read_unlock(&tasklist_lock);
990 } 998 }
991 999
992 spin_lock(&elv_list_lock); 1000 spin_lock(&elv_list_lock);
993 list_del_init(&e->list); 1001 list_del_init(&e->list);
994 spin_unlock(&elv_list_lock); 1002 spin_unlock(&elv_list_lock);
995 } 1003 }
996 EXPORT_SYMBOL_GPL(elv_unregister); 1004 EXPORT_SYMBOL_GPL(elv_unregister);
997 1005
998 /* 1006 /*
999 * switch to new_e io scheduler. be careful not to introduce deadlocks - 1007 * switch to new_e io scheduler. be careful not to introduce deadlocks -
1000 * we don't free the old io scheduler, before we have allocated what we 1008 * we don't free the old io scheduler, before we have allocated what we
1001 * need for the new one. this way we have a chance of going back to the old 1009 * need for the new one. this way we have a chance of going back to the old
1002 * one, if the new one fails init for some reason. 1010 * one, if the new one fails init for some reason.
1003 */ 1011 */
1004 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) 1012 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1005 { 1013 {
1006 elevator_t *old_elevator, *e; 1014 elevator_t *old_elevator, *e;
1007 void *data; 1015 void *data;
1008 1016
1009 /* 1017 /*
1010 * Allocate new elevator 1018 * Allocate new elevator
1011 */ 1019 */
1012 e = elevator_alloc(q, new_e); 1020 e = elevator_alloc(q, new_e);
1013 if (!e) 1021 if (!e)
1014 return 0; 1022 return 0;
1015 1023
1016 data = elevator_init_queue(q, e); 1024 data = elevator_init_queue(q, e);
1017 if (!data) { 1025 if (!data) {
1018 kobject_put(&e->kobj); 1026 kobject_put(&e->kobj);
1019 return 0; 1027 return 0;
1020 } 1028 }
1021 1029
1022 /* 1030 /*
1023 * Turn on BYPASS and drain all requests w/ elevator private data 1031 * Turn on BYPASS and drain all requests w/ elevator private data
1024 */ 1032 */
1025 spin_lock_irq(q->queue_lock); 1033 spin_lock_irq(q->queue_lock);
1026 1034
1027 set_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 1035 set_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
1028 1036
1029 elv_drain_elevator(q); 1037 elv_drain_elevator(q);
1030 1038
1031 while (q->rq.elvpriv) { 1039 while (q->rq.elvpriv) {
1032 blk_remove_plug(q); 1040 blk_remove_plug(q);
1033 q->request_fn(q); 1041 q->request_fn(q);
1034 spin_unlock_irq(q->queue_lock); 1042 spin_unlock_irq(q->queue_lock);
1035 msleep(10); 1043 msleep(10);
1036 spin_lock_irq(q->queue_lock); 1044 spin_lock_irq(q->queue_lock);
1037 elv_drain_elevator(q); 1045 elv_drain_elevator(q);
1038 } 1046 }
1039 1047
1040 /* 1048 /*
1041 * Remember old elevator. 1049 * Remember old elevator.
1042 */ 1050 */
1043 old_elevator = q->elevator; 1051 old_elevator = q->elevator;
1044 1052
1045 /* 1053 /*
1046 * attach and start new elevator 1054 * attach and start new elevator
1047 */ 1055 */
1048 elevator_attach(q, e, data); 1056 elevator_attach(q, e, data);
1049 1057
1050 spin_unlock_irq(q->queue_lock); 1058 spin_unlock_irq(q->queue_lock);
1051 1059
1052 __elv_unregister_queue(old_elevator); 1060 __elv_unregister_queue(old_elevator);
1053 1061
1054 if (elv_register_queue(q)) 1062 if (elv_register_queue(q))
1055 goto fail_register; 1063 goto fail_register;
1056 1064
1057 /* 1065 /*
1058 * finally exit old elevator and turn off BYPASS. 1066 * finally exit old elevator and turn off BYPASS.
1059 */ 1067 */
1060 elevator_exit(old_elevator); 1068 elevator_exit(old_elevator);
1061 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 1069 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
1062 return 1; 1070 return 1;
1063 1071
1064 fail_register: 1072 fail_register:
1065 /* 1073 /*
1066 * switch failed, exit the new io scheduler and reattach the old 1074 * switch failed, exit the new io scheduler and reattach the old
1067 * one again (along with re-adding the sysfs dir) 1075 * one again (along with re-adding the sysfs dir)
1068 */ 1076 */
1069 elevator_exit(e); 1077 elevator_exit(e);
1070 q->elevator = old_elevator; 1078 q->elevator = old_elevator;
1071 elv_register_queue(q); 1079 elv_register_queue(q);
1072 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 1080 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
1073 return 0; 1081 return 0;
1074 } 1082 }
1075 1083
1076 ssize_t elv_iosched_store(struct request_queue *q, const char *name, 1084 ssize_t elv_iosched_store(struct request_queue *q, const char *name,
1077 size_t count) 1085 size_t count)
1078 { 1086 {
1079 char elevator_name[ELV_NAME_MAX]; 1087 char elevator_name[ELV_NAME_MAX];
1080 size_t len; 1088 size_t len;
1081 struct elevator_type *e; 1089 struct elevator_type *e;
1082 1090
1083 elevator_name[sizeof(elevator_name) - 1] = '\0'; 1091 elevator_name[sizeof(elevator_name) - 1] = '\0';
1084 strncpy(elevator_name, name, sizeof(elevator_name) - 1); 1092 strncpy(elevator_name, name, sizeof(elevator_name) - 1);
1085 len = strlen(elevator_name); 1093 len = strlen(elevator_name);
1086 1094
1087 if (len && elevator_name[len - 1] == '\n') 1095 if (len && elevator_name[len - 1] == '\n')
1088 elevator_name[len - 1] = '\0'; 1096 elevator_name[len - 1] = '\0';
1089 1097
1090 e = elevator_get(elevator_name); 1098 e = elevator_get(elevator_name);
1091 if (!e) { 1099 if (!e) {
1092 printk(KERN_ERR "elevator: type %s not found\n", elevator_name); 1100 printk(KERN_ERR "elevator: type %s not found\n", elevator_name);
1093 return -EINVAL; 1101 return -EINVAL;
1094 } 1102 }
1095 1103
1096 if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) { 1104 if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
1097 elevator_put(e); 1105 elevator_put(e);
1098 return count; 1106 return count;
1099 } 1107 }
1100 1108
1101 if (!elevator_switch(q, e)) 1109 if (!elevator_switch(q, e))
1102 printk(KERN_ERR "elevator: switch to %s failed\n",elevator_name); 1110 printk(KERN_ERR "elevator: switch to %s failed\n",elevator_name);
1103 return count; 1111 return count;
1104 } 1112 }
1105 1113
1106 ssize_t elv_iosched_show(struct request_queue *q, char *name) 1114 ssize_t elv_iosched_show(struct request_queue *q, char *name)
1107 { 1115 {
1108 elevator_t *e = q->elevator; 1116 elevator_t *e = q->elevator;
1109 struct elevator_type *elv = e->elevator_type; 1117 struct elevator_type *elv = e->elevator_type;
1110 struct elevator_type *__e; 1118 struct elevator_type *__e;
1111 int len = 0; 1119 int len = 0;
1112 1120
1113 spin_lock(&elv_list_lock); 1121 spin_lock(&elv_list_lock);
1114 list_for_each_entry(__e, &elv_list, list) { 1122 list_for_each_entry(__e, &elv_list, list) {
1115 if (!strcmp(elv->elevator_name, __e->elevator_name)) 1123 if (!strcmp(elv->elevator_name, __e->elevator_name))
1116 len += sprintf(name+len, "[%s] ", elv->elevator_name); 1124 len += sprintf(name+len, "[%s] ", elv->elevator_name);
1117 else 1125 else
1118 len += sprintf(name+len, "%s ", __e->elevator_name); 1126 len += sprintf(name+len, "%s ", __e->elevator_name);
1119 } 1127 }
1120 spin_unlock(&elv_list_lock); 1128 spin_unlock(&elv_list_lock);
1121 1129
1122 len += sprintf(len+name, "\n"); 1130 len += sprintf(len+name, "\n");
1123 return len; 1131 return len;
1124 } 1132 }
1125 1133
1126 struct request *elv_rb_former_request(struct request_queue *q, 1134 struct request *elv_rb_former_request(struct request_queue *q,
1127 struct request *rq) 1135 struct request *rq)
1128 { 1136 {
1129 struct rb_node *rbprev = rb_prev(&rq->rb_node); 1137 struct rb_node *rbprev = rb_prev(&rq->rb_node);
1130 1138
1131 if (rbprev) 1139 if (rbprev)
1132 return rb_entry_rq(rbprev); 1140 return rb_entry_rq(rbprev);
1133 1141
1134 return NULL; 1142 return NULL;
1135 } 1143 }
1136 1144
1137 EXPORT_SYMBOL(elv_rb_former_request); 1145 EXPORT_SYMBOL(elv_rb_former_request);
1138 1146
1139 struct request *elv_rb_latter_request(struct request_queue *q, 1147 struct request *elv_rb_latter_request(struct request_queue *q,
1140 struct request *rq) 1148 struct request *rq)
1141 { 1149 {
1142 struct rb_node *rbnext = rb_next(&rq->rb_node); 1150 struct rb_node *rbnext = rb_next(&rq->rb_node);
1143 1151
1144 if (rbnext) 1152 if (rbnext)
1145 return rb_entry_rq(rbnext); 1153 return rb_entry_rq(rbnext);
1146 1154
1147 return NULL; 1155 return NULL;
1148 } 1156 }
1149 1157
1150 EXPORT_SYMBOL(elv_rb_latter_request); 1158 EXPORT_SYMBOL(elv_rb_latter_request);
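As a usage sketch only (not part of this diff; the hook names follow the elevator_ops layout of this kernel generation), an rbtree-based scheduler wires these two helpers straight into its elevator_type so the core can find the requests adjacent to a given one in sector order:

static struct elevator_type iosched_example = {
	.ops = {
		/* ... merge/dispatch hooks omitted ... */
		.elevator_former_req_fn	= elv_rb_former_request,
		.elevator_latter_req_fn	= elv_rb_latter_request,
	},
	.elevator_name	= "example",
	.elevator_owner	= THIS_MODULE,
};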
1151 1159
1 /* 1 /*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics 3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> 5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000 6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000
7 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 7 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
8 */ 8 */
9 9
10 /* 10 /*
11 * This handles all read/write requests to block devices 11 * This handles all read/write requests to block devices
12 */ 12 */
13 #include <linux/kernel.h> 13 #include <linux/kernel.h>
14 #include <linux/module.h> 14 #include <linux/module.h>
15 #include <linux/backing-dev.h> 15 #include <linux/backing-dev.h>
16 #include <linux/bio.h> 16 #include <linux/bio.h>
17 #include <linux/blkdev.h> 17 #include <linux/blkdev.h>
18 #include <linux/highmem.h> 18 #include <linux/highmem.h>
19 #include <linux/mm.h> 19 #include <linux/mm.h>
20 #include <linux/kernel_stat.h> 20 #include <linux/kernel_stat.h>
21 #include <linux/string.h> 21 #include <linux/string.h>
22 #include <linux/init.h> 22 #include <linux/init.h>
23 #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 23 #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
24 #include <linux/completion.h> 24 #include <linux/completion.h>
25 #include <linux/slab.h> 25 #include <linux/slab.h>
26 #include <linux/swap.h> 26 #include <linux/swap.h>
27 #include <linux/writeback.h> 27 #include <linux/writeback.h>
28 #include <linux/task_io_accounting_ops.h> 28 #include <linux/task_io_accounting_ops.h>
29 #include <linux/interrupt.h> 29 #include <linux/interrupt.h>
30 #include <linux/cpu.h> 30 #include <linux/cpu.h>
31 #include <linux/blktrace_api.h> 31 #include <linux/blktrace_api.h>
32 #include <linux/fault-inject.h> 32 #include <linux/fault-inject.h>
33 33
34 /* 34 /*
35 * for max sense size 35 * for max sense size
36 */ 36 */
37 #include <scsi/scsi_cmnd.h> 37 #include <scsi/scsi_cmnd.h>
38 38
39 static void blk_unplug_work(struct work_struct *work); 39 static void blk_unplug_work(struct work_struct *work);
40 static void blk_unplug_timeout(unsigned long data); 40 static void blk_unplug_timeout(unsigned long data);
41 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io); 41 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
42 static void init_request_from_bio(struct request *req, struct bio *bio); 42 static void init_request_from_bio(struct request *req, struct bio *bio);
43 static int __make_request(struct request_queue *q, struct bio *bio); 43 static int __make_request(struct request_queue *q, struct bio *bio);
44 static struct io_context *current_io_context(gfp_t gfp_flags, int node); 44 static struct io_context *current_io_context(gfp_t gfp_flags, int node);
45 static void blk_recalc_rq_segments(struct request *rq); 45 static void blk_recalc_rq_segments(struct request *rq);
46 static void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 46 static void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
47 struct bio *bio); 47 struct bio *bio);
48 48
49 /* 49 /*
50 * For the allocated request tables 50 * For the allocated request tables
51 */ 51 */
52 static struct kmem_cache *request_cachep; 52 static struct kmem_cache *request_cachep;
53 53
54 /* 54 /*
55 * For queue allocation 55 * For queue allocation
56 */ 56 */
57 static struct kmem_cache *requestq_cachep; 57 static struct kmem_cache *requestq_cachep;
58 58
59 /* 59 /*
60 * For io context allocations 60 * For io context allocations
61 */ 61 */
62 static struct kmem_cache *iocontext_cachep; 62 static struct kmem_cache *iocontext_cachep;
63 63
64 /* 64 /*
65 * Controlling structure to kblockd 65 * Controlling structure to kblockd
66 */ 66 */
67 static struct workqueue_struct *kblockd_workqueue; 67 static struct workqueue_struct *kblockd_workqueue;
68 68
69 unsigned long blk_max_low_pfn, blk_max_pfn; 69 unsigned long blk_max_low_pfn, blk_max_pfn;
70 70
71 EXPORT_SYMBOL(blk_max_low_pfn); 71 EXPORT_SYMBOL(blk_max_low_pfn);
72 EXPORT_SYMBOL(blk_max_pfn); 72 EXPORT_SYMBOL(blk_max_pfn);
73 73
74 static DEFINE_PER_CPU(struct list_head, blk_cpu_done); 74 static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
75 75
76 /* Amount of time in which a process may batch requests */ 76 /* Amount of time in which a process may batch requests */
77 #define BLK_BATCH_TIME (HZ/50UL) 77 #define BLK_BATCH_TIME (HZ/50UL)
78 78
79 /* Number of requests a "batching" process may submit */ 79 /* Number of requests a "batching" process may submit */
80 #define BLK_BATCH_REQ 32 80 #define BLK_BATCH_REQ 32
81 81
82 /* 82 /*
83 * Return the threshold (number of used requests) at which the queue is 83 * Return the threshold (number of used requests) at which the queue is
84 * considered to be congested. It includes a little hysteresis to keep the 84 * considered to be congested. It includes a little hysteresis to keep the
85 * context switch rate down. 85 * context switch rate down.
86 */ 86 */
87 static inline int queue_congestion_on_threshold(struct request_queue *q) 87 static inline int queue_congestion_on_threshold(struct request_queue *q)
88 { 88 {
89 return q->nr_congestion_on; 89 return q->nr_congestion_on;
90 } 90 }
91 91
92 /* 92 /*
93 * The threshold at which a queue is considered to be uncongested 93 * The threshold at which a queue is considered to be uncongested
94 */ 94 */
95 static inline int queue_congestion_off_threshold(struct request_queue *q) 95 static inline int queue_congestion_off_threshold(struct request_queue *q)
96 { 96 {
97 return q->nr_congestion_off; 97 return q->nr_congestion_off;
98 } 98 }
99 99
100 static void blk_queue_congestion_threshold(struct request_queue *q) 100 static void blk_queue_congestion_threshold(struct request_queue *q)
101 { 101 {
102 int nr; 102 int nr;
103 103
104 nr = q->nr_requests - (q->nr_requests / 8) + 1; 104 nr = q->nr_requests - (q->nr_requests / 8) + 1;
105 if (nr > q->nr_requests) 105 if (nr > q->nr_requests)
106 nr = q->nr_requests; 106 nr = q->nr_requests;
107 q->nr_congestion_on = nr; 107 q->nr_congestion_on = nr;
108 108
109 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; 109 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
110 if (nr < 1) 110 if (nr < 1)
111 nr = 1; 111 nr = 1;
112 q->nr_congestion_off = nr; 112 q->nr_congestion_off = nr;
113 } 113 }
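As a worked example, with the usual BLKDEV_MAX_RQ of 128 requests (the default installed by blk_queue_make_request() below) this gives nr_congestion_on = 128 - 16 + 1 = 113 and nr_congestion_off = 128 - 16 - 8 - 1 = 103, so a queue reported congested at 113 used requests is not reported uncongested again until it drops back to 103.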
114 114
115 /** 115 /**
116 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info 116 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
117 * @bdev: device 117 * @bdev: device
118 * 118 *
119 * Locates the passed device's request queue and returns the address of its 119 * Locates the passed device's request queue and returns the address of its
120 * backing_dev_info 120 * backing_dev_info
121 * 121 *
122 * Will return NULL if the request queue cannot be located. 122 * Will return NULL if the request queue cannot be located.
123 */ 123 */
124 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) 124 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
125 { 125 {
126 struct backing_dev_info *ret = NULL; 126 struct backing_dev_info *ret = NULL;
127 struct request_queue *q = bdev_get_queue(bdev); 127 struct request_queue *q = bdev_get_queue(bdev);
128 128
129 if (q) 129 if (q)
130 ret = &q->backing_dev_info; 130 ret = &q->backing_dev_info;
131 return ret; 131 return ret;
132 } 132 }
133 EXPORT_SYMBOL(blk_get_backing_dev_info); 133 EXPORT_SYMBOL(blk_get_backing_dev_info);
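As an illustrative caller (not part of this change; the helper name is made up), a holder of a block_device can reach the queue's readahead setting through this accessor:

static unsigned long example_ra_pages(struct block_device *bdev)
{
	struct backing_dev_info *bdi = blk_get_backing_dev_info(bdev);

	/* 0 if no request queue is attached to the device */
	return bdi ? bdi->ra_pages : 0;
}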
134 134
135 /** 135 /**
136 * blk_queue_prep_rq - set a prepare_request function for queue 136 * blk_queue_prep_rq - set a prepare_request function for queue
137 * @q: queue 137 * @q: queue
138 * @pfn: prepare_request function 138 * @pfn: prepare_request function
139 * 139 *
140 * It's possible for a queue to register a prepare_request callback which 140 * It's possible for a queue to register a prepare_request callback which
141 * is invoked before the request is handed to the request_fn. The goal of 141 * is invoked before the request is handed to the request_fn. The goal of
142 * the function is to prepare a request for I/O, it can be used to build a 142 * the function is to prepare a request for I/O, it can be used to build a
143 * cdb from the request data for instance. 143 * cdb from the request data for instance.
144 * 144 *
145 */ 145 */
146 void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) 146 void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
147 { 147 {
148 q->prep_rq_fn = pfn; 148 q->prep_rq_fn = pfn;
149 } 149 }
150 150
151 EXPORT_SYMBOL(blk_queue_prep_rq); 151 EXPORT_SYMBOL(blk_queue_prep_rq);
152 152
153 /** 153 /**
154 * blk_queue_merge_bvec - set a merge_bvec function for queue 154 * blk_queue_merge_bvec - set a merge_bvec function for queue
155 * @q: queue 155 * @q: queue
156 * @mbfn: merge_bvec_fn 156 * @mbfn: merge_bvec_fn
157 * 157 *
158 * Usually queues have static limitations on the max sectors or segments that 158 * Usually queues have static limitations on the max sectors or segments that
159 * we can put in a request. Stacking drivers may have some settings that 159 * we can put in a request. Stacking drivers may have some settings that
160 * are dynamic, and thus we have to query the queue whether it is ok to 160 * are dynamic, and thus we have to query the queue whether it is ok to
161 * add a new bio_vec to a bio at a given offset or not. If the block device 161 * add a new bio_vec to a bio at a given offset or not. If the block device
162 * has such limitations, it needs to register a merge_bvec_fn to control 162 * has such limitations, it needs to register a merge_bvec_fn to control
163 * the size of bio's sent to it. Note that a block device *must* allow a 163 * the size of bio's sent to it. Note that a block device *must* allow a
164 * single page to be added to an empty bio. The block device driver may want 164 * single page to be added to an empty bio. The block device driver may want
165 * to use the bio_split() function to deal with these bio's. By default 165 * to use the bio_split() function to deal with these bio's. By default
166 * no merge_bvec_fn is defined for a queue, and only the fixed limits are 166 * no merge_bvec_fn is defined for a queue, and only the fixed limits are
167 * honored. 167 * honored.
168 */ 168 */
169 void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) 169 void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn)
170 { 170 {
171 q->merge_bvec_fn = mbfn; 171 q->merge_bvec_fn = mbfn;
172 } 172 }
173 173
174 EXPORT_SYMBOL(blk_queue_merge_bvec); 174 EXPORT_SYMBOL(blk_queue_merge_bvec);
175 175
176 void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) 176 void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
177 { 177 {
178 q->softirq_done_fn = fn; 178 q->softirq_done_fn = fn;
179 } 179 }
180 180
181 EXPORT_SYMBOL(blk_queue_softirq_done); 181 EXPORT_SYMBOL(blk_queue_softirq_done);
182 182
183 /** 183 /**
184 * blk_queue_make_request - define an alternate make_request function for a device 184 * blk_queue_make_request - define an alternate make_request function for a device
185 * @q: the request queue for the device to be affected 185 * @q: the request queue for the device to be affected
186 * @mfn: the alternate make_request function 186 * @mfn: the alternate make_request function
187 * 187 *
188 * Description: 188 * Description:
189 * The normal way for &struct bios to be passed to a device 189 * The normal way for &struct bios to be passed to a device
190 * driver is for them to be collected into requests on a request 190 * driver is for them to be collected into requests on a request
191 * queue, and then to allow the device driver to select requests 191 * queue, and then to allow the device driver to select requests
192 * off that queue when it is ready. This works well for many block 192 * off that queue when it is ready. This works well for many block
193 * devices. However some block devices (typically virtual devices 193 * devices. However some block devices (typically virtual devices
194 * such as md or lvm) do not benefit from the processing on the 194 * such as md or lvm) do not benefit from the processing on the
195 * request queue, and are served best by having the requests passed 195 * request queue, and are served best by having the requests passed
196 * directly to them. This can be achieved by providing a function 196 * directly to them. This can be achieved by providing a function
197 * to blk_queue_make_request(). 197 * to blk_queue_make_request().
198 * 198 *
199 * Caveat: 199 * Caveat:
200 * The driver that does this *must* be able to deal appropriately 200 * The driver that does this *must* be able to deal appropriately
201 * with buffers in "highmemory". This can be accomplished by either calling 201 * with buffers in "highmemory". This can be accomplished by either calling
202 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling 202 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
203 * blk_queue_bounce() to create a buffer in normal memory. 203 * blk_queue_bounce() to create a buffer in normal memory.
204 **/ 204 **/
205 void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn) 205 void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn)
206 { 206 {
207 /* 207 /*
208 * set defaults 208 * set defaults
209 */ 209 */
210 q->nr_requests = BLKDEV_MAX_RQ; 210 q->nr_requests = BLKDEV_MAX_RQ;
211 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 211 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
212 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 212 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
213 q->make_request_fn = mfn; 213 q->make_request_fn = mfn;
214 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 214 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
215 q->backing_dev_info.state = 0; 215 q->backing_dev_info.state = 0;
216 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; 216 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
217 blk_queue_max_sectors(q, SAFE_MAX_SECTORS); 217 blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
218 blk_queue_hardsect_size(q, 512); 218 blk_queue_hardsect_size(q, 512);
219 blk_queue_dma_alignment(q, 511); 219 blk_queue_dma_alignment(q, 511);
220 blk_queue_congestion_threshold(q); 220 blk_queue_congestion_threshold(q);
221 q->nr_batching = BLK_BATCH_REQ; 221 q->nr_batching = BLK_BATCH_REQ;
222 222
223 q->unplug_thresh = 4; /* hmm */ 223 q->unplug_thresh = 4; /* hmm */
224 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ 224 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
225 if (q->unplug_delay == 0) 225 if (q->unplug_delay == 0)
226 q->unplug_delay = 1; 226 q->unplug_delay = 1;
227 227
228 INIT_WORK(&q->unplug_work, blk_unplug_work); 228 INIT_WORK(&q->unplug_work, blk_unplug_work);
229 229
230 q->unplug_timer.function = blk_unplug_timeout; 230 q->unplug_timer.function = blk_unplug_timeout;
231 q->unplug_timer.data = (unsigned long)q; 231 q->unplug_timer.data = (unsigned long)q;
232 232
233 /* 233 /*
234 * by default assume old behaviour and bounce for any highmem page 234 * by default assume old behaviour and bounce for any highmem page
235 */ 235 */
236 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 236 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
237 } 237 }
238 238
239 EXPORT_SYMBOL(blk_queue_make_request); 239 EXPORT_SYMBOL(blk_queue_make_request);
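A minimal sketch of the stacking-driver pattern described above (function names are illustrative, not from this patch): allocate a bare queue and point it at a make_request function that handles each bio directly instead of queueing requests.

static int example_make_request(struct request_queue *q, struct bio *bio)
{
	/*
	 * A real virtual driver (md/lvm style) would remap bi_sector and
	 * bi_bdev and resubmit the bio; here it is simply completed.
	 */
	bio_endio(bio, 0);
	return 0;
}

static struct request_queue *example_init_queue(void)
{
	struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

	if (q)
		blk_queue_make_request(q, example_make_request);
	return q;
}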
240 240
241 static void rq_init(struct request_queue *q, struct request *rq) 241 static void rq_init(struct request_queue *q, struct request *rq)
242 { 242 {
243 INIT_LIST_HEAD(&rq->queuelist); 243 INIT_LIST_HEAD(&rq->queuelist);
244 INIT_LIST_HEAD(&rq->donelist); 244 INIT_LIST_HEAD(&rq->donelist);
245 245
246 rq->errors = 0; 246 rq->errors = 0;
247 rq->bio = rq->biotail = NULL; 247 rq->bio = rq->biotail = NULL;
248 INIT_HLIST_NODE(&rq->hash); 248 INIT_HLIST_NODE(&rq->hash);
249 RB_CLEAR_NODE(&rq->rb_node); 249 RB_CLEAR_NODE(&rq->rb_node);
250 rq->ioprio = 0; 250 rq->ioprio = 0;
251 rq->buffer = NULL; 251 rq->buffer = NULL;
252 rq->ref_count = 1; 252 rq->ref_count = 1;
253 rq->q = q; 253 rq->q = q;
254 rq->special = NULL; 254 rq->special = NULL;
255 rq->data_len = 0; 255 rq->data_len = 0;
256 rq->data = NULL; 256 rq->data = NULL;
257 rq->nr_phys_segments = 0; 257 rq->nr_phys_segments = 0;
258 rq->sense = NULL; 258 rq->sense = NULL;
259 rq->end_io = NULL; 259 rq->end_io = NULL;
260 rq->end_io_data = NULL; 260 rq->end_io_data = NULL;
261 rq->completion_data = NULL; 261 rq->completion_data = NULL;
262 rq->next_rq = NULL; 262 rq->next_rq = NULL;
263 } 263 }
264 264
265 /** 265 /**
266 * blk_queue_ordered - does this queue support ordered writes 266 * blk_queue_ordered - does this queue support ordered writes
267 * @q: the request queue 267 * @q: the request queue
268 * @ordered: one of QUEUE_ORDERED_* 268 * @ordered: one of QUEUE_ORDERED_*
269 * @prepare_flush_fn: rq setup helper for cache flush ordered writes 269 * @prepare_flush_fn: rq setup helper for cache flush ordered writes
270 * 270 *
271 * Description: 271 * Description:
272 * For journalled file systems, doing ordered writes on a commit 272 * For journalled file systems, doing ordered writes on a commit
273 * block instead of explicitly doing wait_on_buffer (which is bad 273 * block instead of explicitly doing wait_on_buffer (which is bad
274 * for performance) can be a big win. Block drivers supporting this 274 * for performance) can be a big win. Block drivers supporting this
275 * feature should call this function and indicate so. 275 * feature should call this function and indicate so.
276 * 276 *
277 **/ 277 **/
278 int blk_queue_ordered(struct request_queue *q, unsigned ordered, 278 int blk_queue_ordered(struct request_queue *q, unsigned ordered,
279 prepare_flush_fn *prepare_flush_fn) 279 prepare_flush_fn *prepare_flush_fn)
280 { 280 {
281 if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) && 281 if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
282 prepare_flush_fn == NULL) { 282 prepare_flush_fn == NULL) {
283 printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n"); 283 printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n");
284 return -EINVAL; 284 return -EINVAL;
285 } 285 }
286 286
287 if (ordered != QUEUE_ORDERED_NONE && 287 if (ordered != QUEUE_ORDERED_NONE &&
288 ordered != QUEUE_ORDERED_DRAIN && 288 ordered != QUEUE_ORDERED_DRAIN &&
289 ordered != QUEUE_ORDERED_DRAIN_FLUSH && 289 ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
290 ordered != QUEUE_ORDERED_DRAIN_FUA && 290 ordered != QUEUE_ORDERED_DRAIN_FUA &&
291 ordered != QUEUE_ORDERED_TAG && 291 ordered != QUEUE_ORDERED_TAG &&
292 ordered != QUEUE_ORDERED_TAG_FLUSH && 292 ordered != QUEUE_ORDERED_TAG_FLUSH &&
293 ordered != QUEUE_ORDERED_TAG_FUA) { 293 ordered != QUEUE_ORDERED_TAG_FUA) {
294 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); 294 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
295 return -EINVAL; 295 return -EINVAL;
296 } 296 }
297 297
298 q->ordered = ordered; 298 q->ordered = ordered;
299 q->next_ordered = ordered; 299 q->next_ordered = ordered;
300 q->prepare_flush_fn = prepare_flush_fn; 300 q->prepare_flush_fn = prepare_flush_fn;
301 301
302 return 0; 302 return 0;
303 } 303 }
304 304
305 EXPORT_SYMBOL(blk_queue_ordered); 305 EXPORT_SYMBOL(blk_queue_ordered);
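As an illustrative sketch (modeled loosely on SCSI-disk-style drivers of this period; the opcode, timeout and function names are assumptions, not taken from this patch), a driver with a write-back cache advertises ordered support by passing a prepare_flush_fn that turns the pre/post flush request into a cache-flush command:

static void example_prepare_flush(struct request_queue *q, struct request *rq)
{
	memset(rq->cmd, 0, sizeof(rq->cmd));
	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	rq->timeout = 60 * HZ;
	rq->cmd[0] = 0x35;	/* SYNCHRONIZE CACHE(10), SCSI-style example */
	rq->cmd_len = 10;
}

static void example_enable_barriers(struct request_queue *q)
{
	/* drain the queue and flush the cache around each barrier write */
	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, example_prepare_flush);
}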
306 306
307 /** 307 /**
308 * blk_queue_issue_flush_fn - set function for issuing a flush 308 * blk_queue_issue_flush_fn - set function for issuing a flush
309 * @q: the request queue 309 * @q: the request queue
310 * @iff: the function to be called issuing the flush 310 * @iff: the function to be called issuing the flush
311 * 311 *
312 * Description: 312 * Description:
313 * If a driver supports issuing a flush command, the support is notified 313 * If a driver supports issuing a flush command, the support is notified
314 * to the block layer by defining it through this call. 314 * to the block layer by defining it through this call.
315 * 315 *
316 **/ 316 **/
317 void blk_queue_issue_flush_fn(struct request_queue *q, issue_flush_fn *iff) 317 void blk_queue_issue_flush_fn(struct request_queue *q, issue_flush_fn *iff)
318 { 318 {
319 q->issue_flush_fn = iff; 319 q->issue_flush_fn = iff;
320 } 320 }
321 321
322 EXPORT_SYMBOL(blk_queue_issue_flush_fn); 322 EXPORT_SYMBOL(blk_queue_issue_flush_fn);
323 323
324 /* 324 /*
325 * Cache flushing for ordered writes handling 325 * Cache flushing for ordered writes handling
326 */ 326 */
327 inline unsigned blk_ordered_cur_seq(struct request_queue *q) 327 inline unsigned blk_ordered_cur_seq(struct request_queue *q)
328 { 328 {
329 if (!q->ordseq) 329 if (!q->ordseq)
330 return 0; 330 return 0;
331 return 1 << ffz(q->ordseq); 331 return 1 << ffz(q->ordseq);
332 } 332 }
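Put another way, q->ordseq accumulates one bit per completed stage and ffz() returns the lowest bit still clear, so the value handed back is the first stage that has not finished yet; once STARTED and DRAIN have been recorded, for instance, the current sequence is PREFLUSH (assuming, as elsewhere in this tree, that the QUEUE_ORDSEQ_* flags are ascending single bits).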
333 333
334 unsigned blk_ordered_req_seq(struct request *rq) 334 unsigned blk_ordered_req_seq(struct request *rq)
335 { 335 {
336 struct request_queue *q = rq->q; 336 struct request_queue *q = rq->q;
337 337
338 BUG_ON(q->ordseq == 0); 338 BUG_ON(q->ordseq == 0);
339 339
340 if (rq == &q->pre_flush_rq) 340 if (rq == &q->pre_flush_rq)
341 return QUEUE_ORDSEQ_PREFLUSH; 341 return QUEUE_ORDSEQ_PREFLUSH;
342 if (rq == &q->bar_rq) 342 if (rq == &q->bar_rq)
343 return QUEUE_ORDSEQ_BAR; 343 return QUEUE_ORDSEQ_BAR;
344 if (rq == &q->post_flush_rq) 344 if (rq == &q->post_flush_rq)
345 return QUEUE_ORDSEQ_POSTFLUSH; 345 return QUEUE_ORDSEQ_POSTFLUSH;
346 346
347 /* 347 /*
348 * !fs requests don't need to follow barrier ordering. Always 348 * !fs requests don't need to follow barrier ordering. Always
349 * put them at the front. This fixes the following deadlock. 349 * put them at the front. This fixes the following deadlock.
350 * 350 *
351 * http://thread.gmane.org/gmane.linux.kernel/537473 351 * http://thread.gmane.org/gmane.linux.kernel/537473
352 */ 352 */
353 if (!blk_fs_request(rq)) 353 if (!blk_fs_request(rq))
354 return QUEUE_ORDSEQ_DRAIN; 354 return QUEUE_ORDSEQ_DRAIN;
355 355
356 if ((rq->cmd_flags & REQ_ORDERED_COLOR) == 356 if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
357 (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) 357 (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
358 return QUEUE_ORDSEQ_DRAIN; 358 return QUEUE_ORDSEQ_DRAIN;
359 else 359 else
360 return QUEUE_ORDSEQ_DONE; 360 return QUEUE_ORDSEQ_DONE;
361 } 361 }
362 362
363 void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) 363 void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
364 { 364 {
365 struct request *rq; 365 struct request *rq;
366 int uptodate; 366 int uptodate;
367 367
368 if (error && !q->orderr) 368 if (error && !q->orderr)
369 q->orderr = error; 369 q->orderr = error;
370 370
371 BUG_ON(q->ordseq & seq); 371 BUG_ON(q->ordseq & seq);
372 q->ordseq |= seq; 372 q->ordseq |= seq;
373 373
374 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) 374 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
375 return; 375 return;
376 376
377 /* 377 /*
378 * Okay, sequence complete. 378 * Okay, sequence complete.
379 */ 379 */
380 uptodate = 1; 380 uptodate = 1;
381 if (q->orderr) 381 if (q->orderr)
382 uptodate = q->orderr; 382 uptodate = q->orderr;
383 383
384 q->ordseq = 0; 384 q->ordseq = 0;
385 rq = q->orig_bar_rq; 385 rq = q->orig_bar_rq;
386 386
387 end_that_request_first(rq, uptodate, rq->hard_nr_sectors); 387 end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
388 end_that_request_last(rq, uptodate); 388 end_that_request_last(rq, uptodate);
389 } 389 }
390 390
391 static void pre_flush_end_io(struct request *rq, int error) 391 static void pre_flush_end_io(struct request *rq, int error)
392 { 392 {
393 elv_completed_request(rq->q, rq); 393 elv_completed_request(rq->q, rq);
394 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); 394 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
395 } 395 }
396 396
397 static void bar_end_io(struct request *rq, int error) 397 static void bar_end_io(struct request *rq, int error)
398 { 398 {
399 elv_completed_request(rq->q, rq); 399 elv_completed_request(rq->q, rq);
400 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); 400 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
401 } 401 }
402 402
403 static void post_flush_end_io(struct request *rq, int error) 403 static void post_flush_end_io(struct request *rq, int error)
404 { 404 {
405 elv_completed_request(rq->q, rq); 405 elv_completed_request(rq->q, rq);
406 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); 406 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
407 } 407 }
408 408
409 static void queue_flush(struct request_queue *q, unsigned which) 409 static void queue_flush(struct request_queue *q, unsigned which)
410 { 410 {
411 struct request *rq; 411 struct request *rq;
412 rq_end_io_fn *end_io; 412 rq_end_io_fn *end_io;
413 413
414 if (which == QUEUE_ORDERED_PREFLUSH) { 414 if (which == QUEUE_ORDERED_PREFLUSH) {
415 rq = &q->pre_flush_rq; 415 rq = &q->pre_flush_rq;
416 end_io = pre_flush_end_io; 416 end_io = pre_flush_end_io;
417 } else { 417 } else {
418 rq = &q->post_flush_rq; 418 rq = &q->post_flush_rq;
419 end_io = post_flush_end_io; 419 end_io = post_flush_end_io;
420 } 420 }
421 421
422 rq->cmd_flags = REQ_HARDBARRIER; 422 rq->cmd_flags = REQ_HARDBARRIER;
423 rq_init(q, rq); 423 rq_init(q, rq);
424 rq->elevator_private = NULL; 424 rq->elevator_private = NULL;
425 rq->elevator_private2 = NULL; 425 rq->elevator_private2 = NULL;
426 rq->rq_disk = q->bar_rq.rq_disk; 426 rq->rq_disk = q->bar_rq.rq_disk;
427 rq->end_io = end_io; 427 rq->end_io = end_io;
428 q->prepare_flush_fn(q, rq); 428 q->prepare_flush_fn(q, rq);
429 429
430 elv_insert(q, rq, ELEVATOR_INSERT_FRONT); 430 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
431 } 431 }
432 432
433 static inline struct request *start_ordered(struct request_queue *q, 433 static inline struct request *start_ordered(struct request_queue *q,
434 struct request *rq) 434 struct request *rq)
435 { 435 {
436 q->orderr = 0; 436 q->orderr = 0;
437 q->ordered = q->next_ordered; 437 q->ordered = q->next_ordered;
438 q->ordseq |= QUEUE_ORDSEQ_STARTED; 438 q->ordseq |= QUEUE_ORDSEQ_STARTED;
439 439
440 /* 440 /*
441 * Prep proxy barrier request. 441 * Prep proxy barrier request.
442 */ 442 */
443 blkdev_dequeue_request(rq); 443 blkdev_dequeue_request(rq);
444 q->orig_bar_rq = rq; 444 q->orig_bar_rq = rq;
445 rq = &q->bar_rq; 445 rq = &q->bar_rq;
446 rq->cmd_flags = 0; 446 rq->cmd_flags = 0;
447 rq_init(q, rq); 447 rq_init(q, rq);
448 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) 448 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
449 rq->cmd_flags |= REQ_RW; 449 rq->cmd_flags |= REQ_RW;
450 if (q->ordered & QUEUE_ORDERED_FUA) 450 if (q->ordered & QUEUE_ORDERED_FUA)
451 rq->cmd_flags |= REQ_FUA; 451 rq->cmd_flags |= REQ_FUA;
452 rq->elevator_private = NULL; 452 rq->elevator_private = NULL;
453 rq->elevator_private2 = NULL; 453 rq->elevator_private2 = NULL;
454 init_request_from_bio(rq, q->orig_bar_rq->bio); 454 init_request_from_bio(rq, q->orig_bar_rq->bio);
455 rq->end_io = bar_end_io; 455 rq->end_io = bar_end_io;
456 456
457 /* 457 /*
458 * Queue ordered sequence. As we stack them at the head, we 458 * Queue ordered sequence. As we stack them at the head, we
459 * need to queue in reverse order. Note that we rely on that 459 * need to queue in reverse order. Note that we rely on that
460 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs 460 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
461 * request gets inbetween ordered sequence. 461 * request gets inbetween ordered sequence. If this request is
462 * an empty barrier, we don't need to do a postflush ever since
463 * there will be no data written between the pre and post flush.
464 * Hence a single flush will suffice.
462 */ 465 */
463 if (q->ordered & QUEUE_ORDERED_POSTFLUSH) 466 if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq))
464 queue_flush(q, QUEUE_ORDERED_POSTFLUSH); 467 queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
465 else 468 else
466 q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; 469 q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
467 470
468 elv_insert(q, rq, ELEVATOR_INSERT_FRONT); 471 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
469 472
470 if (q->ordered & QUEUE_ORDERED_PREFLUSH) { 473 if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
471 queue_flush(q, QUEUE_ORDERED_PREFLUSH); 474 queue_flush(q, QUEUE_ORDERED_PREFLUSH);
472 rq = &q->pre_flush_rq; 475 rq = &q->pre_flush_rq;
473 } else 476 } else
474 q->ordseq |= QUEUE_ORDSEQ_PREFLUSH; 477 q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
475 478
476 if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0) 479 if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
477 q->ordseq |= QUEUE_ORDSEQ_DRAIN; 480 q->ordseq |= QUEUE_ORDSEQ_DRAIN;
478 else 481 else
479 rq = NULL; 482 rq = NULL;
480 483
481 return rq; 484 return rq;
482 } 485 }
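The !blk_empty_barrier(rq) test above carries the point of this patch: a barrier with no data attached needs no separate post-flush, because no data is written between the pre- and post-flush, so a single flush suffices. The helper itself is added by one of the header changes in this commit; a sketch of what it has to check (built only from fields already used in this file) would be:

/* sketch only -- the real helper is introduced elsewhere in this commit */
static inline int blk_empty_barrier_sketch(struct request *rq)
{
	return blk_fs_request(rq) && blk_barrier_rq(rq) &&
		!rq->hard_nr_sectors;
}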
483 486
484 int blk_do_ordered(struct request_queue *q, struct request **rqp) 487 int blk_do_ordered(struct request_queue *q, struct request **rqp)
485 { 488 {
486 struct request *rq = *rqp; 489 struct request *rq = *rqp;
487 int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); 490 const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
488 491
489 if (!q->ordseq) { 492 if (!q->ordseq) {
490 if (!is_barrier) 493 if (!is_barrier)
491 return 1; 494 return 1;
492 495
493 if (q->next_ordered != QUEUE_ORDERED_NONE) { 496 if (q->next_ordered != QUEUE_ORDERED_NONE) {
494 *rqp = start_ordered(q, rq); 497 *rqp = start_ordered(q, rq);
495 return 1; 498 return 1;
496 } else { 499 } else {
497 /* 500 /*
498 * This can happen when the queue switches to 501 * This can happen when the queue switches to
499 * ORDERED_NONE while this request is on it. 502 * ORDERED_NONE while this request is on it.
500 */ 503 */
501 blkdev_dequeue_request(rq); 504 blkdev_dequeue_request(rq);
502 end_that_request_first(rq, -EOPNOTSUPP, 505 end_that_request_first(rq, -EOPNOTSUPP,
503 rq->hard_nr_sectors); 506 rq->hard_nr_sectors);
504 end_that_request_last(rq, -EOPNOTSUPP); 507 end_that_request_last(rq, -EOPNOTSUPP);
505 *rqp = NULL; 508 *rqp = NULL;
506 return 0; 509 return 0;
507 } 510 }
508 } 511 }
509 512
510 /* 513 /*
511 * Ordered sequence in progress 514 * Ordered sequence in progress
512 */ 515 */
513 516
514 /* Special requests are not subject to ordering rules. */ 517 /* Special requests are not subject to ordering rules. */
515 if (!blk_fs_request(rq) && 518 if (!blk_fs_request(rq) &&
516 rq != &q->pre_flush_rq && rq != &q->post_flush_rq) 519 rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
517 return 1; 520 return 1;
518 521
519 if (q->ordered & QUEUE_ORDERED_TAG) { 522 if (q->ordered & QUEUE_ORDERED_TAG) {
520 /* Ordered by tag. Blocking the next barrier is enough. */ 523 /* Ordered by tag. Blocking the next barrier is enough. */
521 if (is_barrier && rq != &q->bar_rq) 524 if (is_barrier && rq != &q->bar_rq)
522 *rqp = NULL; 525 *rqp = NULL;
523 } else { 526 } else {
524 /* Ordered by draining. Wait for turn. */ 527 /* Ordered by draining. Wait for turn. */
525 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); 528 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
526 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) 529 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
527 *rqp = NULL; 530 *rqp = NULL;
528 } 531 }
529 532
530 return 1; 533 return 1;
531 } 534 }
532 535
533 static void req_bio_endio(struct request *rq, struct bio *bio, 536 static void req_bio_endio(struct request *rq, struct bio *bio,
534 unsigned int nbytes, int error) 537 unsigned int nbytes, int error)
535 { 538 {
536 struct request_queue *q = rq->q; 539 struct request_queue *q = rq->q;
537 540
538 if (&q->bar_rq != rq) { 541 if (&q->bar_rq != rq) {
539 if (error) 542 if (error)
540 clear_bit(BIO_UPTODATE, &bio->bi_flags); 543 clear_bit(BIO_UPTODATE, &bio->bi_flags);
541 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 544 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
542 error = -EIO; 545 error = -EIO;
543 546
544 if (unlikely(nbytes > bio->bi_size)) { 547 if (unlikely(nbytes > bio->bi_size)) {
545 printk("%s: want %u bytes done, only %u left\n", 548 printk("%s: want %u bytes done, only %u left\n",
546 __FUNCTION__, nbytes, bio->bi_size); 549 __FUNCTION__, nbytes, bio->bi_size);
547 nbytes = bio->bi_size; 550 nbytes = bio->bi_size;
548 } 551 }
549 552
550 bio->bi_size -= nbytes; 553 bio->bi_size -= nbytes;
551 bio->bi_sector += (nbytes >> 9); 554 bio->bi_sector += (nbytes >> 9);
552 if (bio->bi_size == 0) 555 if (bio->bi_size == 0)
553 bio_endio(bio, error); 556 bio_endio(bio, error);
554 } else { 557 } else {
555 558
556 /* 559 /*
557 * Okay, this is the barrier request in progress, just 560 * Okay, this is the barrier request in progress, just
558 * record the error; 561 * record the error;
559 */ 562 */
560 if (error && !q->orderr) 563 if (error && !q->orderr)
561 q->orderr = error; 564 q->orderr = error;
562 } 565 }
563 } 566 }
564 567
565 /** 568 /**
566 * blk_queue_bounce_limit - set bounce buffer limit for queue 569 * blk_queue_bounce_limit - set bounce buffer limit for queue
567 * @q: the request queue for the device 570 * @q: the request queue for the device
568 * @dma_addr: bus address limit 571 * @dma_addr: bus address limit
569 * 572 *
570 * Description: 573 * Description:
571 * Different hardware can have different requirements as to what pages 574 * Different hardware can have different requirements as to what pages
572 * it can do I/O directly to. A low level driver can call 575 * it can do I/O directly to. A low level driver can call
573 * blk_queue_bounce_limit to have lower memory pages allocated as bounce 576 * blk_queue_bounce_limit to have lower memory pages allocated as bounce
574 * buffers for doing I/O to pages residing above @page. 577 * buffers for doing I/O to pages residing above @page.
575 **/ 578 **/
576 void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr) 579 void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr)
577 { 580 {
578 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; 581 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
579 int dma = 0; 582 int dma = 0;
580 583
581 q->bounce_gfp = GFP_NOIO; 584 q->bounce_gfp = GFP_NOIO;
582 #if BITS_PER_LONG == 64 585 #if BITS_PER_LONG == 64
583 /* Assume anything <= 4GB can be handled by IOMMU. 586 /* Assume anything <= 4GB can be handled by IOMMU.
584 Actually some IOMMUs can handle everything, but I don't 587 Actually some IOMMUs can handle everything, but I don't
585 know of a way to test this here. */ 588 know of a way to test this here. */
586 if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) 589 if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
587 dma = 1; 590 dma = 1;
588 q->bounce_pfn = max_low_pfn; 591 q->bounce_pfn = max_low_pfn;
589 #else 592 #else
590 if (bounce_pfn < blk_max_low_pfn) 593 if (bounce_pfn < blk_max_low_pfn)
591 dma = 1; 594 dma = 1;
592 q->bounce_pfn = bounce_pfn; 595 q->bounce_pfn = bounce_pfn;
593 #endif 596 #endif
594 if (dma) { 597 if (dma) {
595 init_emergency_isa_pool(); 598 init_emergency_isa_pool();
596 q->bounce_gfp = GFP_NOIO | GFP_DMA; 599 q->bounce_gfp = GFP_NOIO | GFP_DMA;
597 q->bounce_pfn = bounce_pfn; 600 q->bounce_pfn = bounce_pfn;
598 } 601 }
599 } 602 }
600 603
601 EXPORT_SYMBOL(blk_queue_bounce_limit); 604 EXPORT_SYMBOL(blk_queue_bounce_limit);
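A usage sketch (illustrative, not from this patch): a device that can only DMA below 4GB caps its bounce limit accordingly, and the block layer then bounces any page above that boundary through GFP_NOIO buffers as set up above.

static void example_set_dma_limit(struct request_queue *q)
{
	/* bounce anything above the low 4GB the device can address */
	blk_queue_bounce_limit(q, 0xffffffffULL);
}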
602 605
603 /** 606 /**
604 * blk_queue_max_sectors - set max sectors for a request for this queue 607 * blk_queue_max_sectors - set max sectors for a request for this queue
605 * @q: the request queue for the device 608 * @q: the request queue for the device
606 * @max_sectors: max sectors in the usual 512b unit 609 * @max_sectors: max sectors in the usual 512b unit
607 * 610 *
608 * Description: 611 * Description:
609 * Enables a low level driver to set an upper limit on the size of 612 * Enables a low level driver to set an upper limit on the size of
610 * received requests. 613 * received requests.
611 **/ 614 **/
612 void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) 615 void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
613 { 616 {
614 if ((max_sectors << 9) < PAGE_CACHE_SIZE) { 617 if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
615 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); 618 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
616 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); 619 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
617 } 620 }
618 621
619 if (BLK_DEF_MAX_SECTORS > max_sectors) 622 if (BLK_DEF_MAX_SECTORS > max_sectors)
620 q->max_hw_sectors = q->max_sectors = max_sectors; 623 q->max_hw_sectors = q->max_sectors = max_sectors;
621 else { 624 else {
622 q->max_sectors = BLK_DEF_MAX_SECTORS; 625 q->max_sectors = BLK_DEF_MAX_SECTORS;
623 q->max_hw_sectors = max_sectors; 626 q->max_hw_sectors = max_sectors;
624 } 627 }
625 } 628 }
626 629
627 EXPORT_SYMBOL(blk_queue_max_sectors); 630 EXPORT_SYMBOL(blk_queue_max_sectors);
628 631
629 /** 632 /**
630 * blk_queue_max_phys_segments - set max phys segments for a request for this queue 633 * blk_queue_max_phys_segments - set max phys segments for a request for this queue
631 * @q: the request queue for the device 634 * @q: the request queue for the device
632 * @max_segments: max number of segments 635 * @max_segments: max number of segments
633 * 636 *
634 * Description: 637 * Description:
635 * Enables a low level driver to set an upper limit on the number of 638 * Enables a low level driver to set an upper limit on the number of
636 * physical data segments in a request. This would be the largest sized 639 * physical data segments in a request. This would be the largest sized
637 * scatter list the driver could handle. 640 * scatter list the driver could handle.
638 **/ 641 **/
639 void blk_queue_max_phys_segments(struct request_queue *q, 642 void blk_queue_max_phys_segments(struct request_queue *q,
640 unsigned short max_segments) 643 unsigned short max_segments)
641 { 644 {
642 if (!max_segments) { 645 if (!max_segments) {
643 max_segments = 1; 646 max_segments = 1;
644 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 647 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
645 } 648 }
646 649
647 q->max_phys_segments = max_segments; 650 q->max_phys_segments = max_segments;
648 } 651 }
649 652
650 EXPORT_SYMBOL(blk_queue_max_phys_segments); 653 EXPORT_SYMBOL(blk_queue_max_phys_segments);
651 654
652 /** 655 /**
653 * blk_queue_max_hw_segments - set max hw segments for a request for this queue 656 * blk_queue_max_hw_segments - set max hw segments for a request for this queue
654 * @q: the request queue for the device 657 * @q: the request queue for the device
655 * @max_segments: max number of segments 658 * @max_segments: max number of segments
656 * 659 *
657 * Description: 660 * Description:
658 * Enables a low level driver to set an upper limit on the number of 661 * Enables a low level driver to set an upper limit on the number of
659 * hw data segments in a request. This would be the largest number of 662 * hw data segments in a request. This would be the largest number of
660 * address/length pairs the host adapter can actually give at once 663 * address/length pairs the host adapter can actually give at once
661 * to the device. 664 * to the device.
662 **/ 665 **/
663 void blk_queue_max_hw_segments(struct request_queue *q, 666 void blk_queue_max_hw_segments(struct request_queue *q,
664 unsigned short max_segments) 667 unsigned short max_segments)
665 { 668 {
666 if (!max_segments) { 669 if (!max_segments) {
667 max_segments = 1; 670 max_segments = 1;
668 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 671 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
669 } 672 }
670 673
671 q->max_hw_segments = max_segments; 674 q->max_hw_segments = max_segments;
672 } 675 }
673 676
674 EXPORT_SYMBOL(blk_queue_max_hw_segments); 677 EXPORT_SYMBOL(blk_queue_max_hw_segments);
675 678
676 /** 679 /**
677 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg 680 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
678 * @q: the request queue for the device 681 * @q: the request queue for the device
679 * @max_size: max size of segment in bytes 682 * @max_size: max size of segment in bytes
680 * 683 *
681 * Description: 684 * Description:
682 * Enables a low level driver to set an upper limit on the size of a 685 * Enables a low level driver to set an upper limit on the size of a
683 * coalesced segment 686 * coalesced segment
684 **/ 687 **/
685 void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) 688 void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
686 { 689 {
687 if (max_size < PAGE_CACHE_SIZE) { 690 if (max_size < PAGE_CACHE_SIZE) {
688 max_size = PAGE_CACHE_SIZE; 691 max_size = PAGE_CACHE_SIZE;
689 printk("%s: set to minimum %d\n", __FUNCTION__, max_size); 692 printk("%s: set to minimum %d\n", __FUNCTION__, max_size);
690 } 693 }
691 694
692 q->max_segment_size = max_size; 695 q->max_segment_size = max_size;
693 } 696 }
694 697
695 EXPORT_SYMBOL(blk_queue_max_segment_size); 698 EXPORT_SYMBOL(blk_queue_max_segment_size);
696 699
697 /** 700 /**
698 * blk_queue_hardsect_size - set hardware sector size for the queue 701 * blk_queue_hardsect_size - set hardware sector size for the queue
699 * @q: the request queue for the device 702 * @q: the request queue for the device
700 * @size: the hardware sector size, in bytes 703 * @size: the hardware sector size, in bytes
701 * 704 *
702 * Description: 705 * Description:
703 * This should typically be set to the lowest possible sector size 706 * This should typically be set to the lowest possible sector size
704 * that the hardware can operate on (possibly without reverting to 707 * that the hardware can operate on (possibly without reverting to
705 * even internal read-modify-write operations). Usually the default 708 * even internal read-modify-write operations). Usually the default
706 * of 512 covers most hardware. 709 * of 512 covers most hardware.
707 **/ 710 **/
708 void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) 711 void blk_queue_hardsect_size(struct request_queue *q, unsigned short size)
709 { 712 {
710 q->hardsect_size = size; 713 q->hardsect_size = size;
711 } 714 }
712 715
713 EXPORT_SYMBOL(blk_queue_hardsect_size); 716 EXPORT_SYMBOL(blk_queue_hardsect_size);
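Pulling the setters above together, a typical low-level driver initialization (all values here are illustrative) looks roughly like:

static void example_set_limits(struct request_queue *q)
{
	blk_queue_max_sectors(q, 256);		/* 128KB per request */
	blk_queue_max_phys_segments(q, 32);
	blk_queue_max_hw_segments(q, 32);
	blk_queue_max_segment_size(q, 65536);
	blk_queue_hardsect_size(q, 512);
}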
714 717
715 /* 718 /*
716 * Returns the minimum that is _not_ zero, unless both are zero. 719 * Returns the minimum that is _not_ zero, unless both are zero.
717 */ 720 */
718 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) 721 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
719 722
720 /** 723 /**
721 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers 724 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
722 * @t: the stacking driver (top) 725 * @t: the stacking driver (top)
723 * @b: the underlying device (bottom) 726 * @b: the underlying device (bottom)
724 **/ 727 **/
725 void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) 728 void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
726 { 729 {
727 /* zero is "infinity" */ 730 /* zero is "infinity" */
728 t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors); 731 t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors);
729 t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors); 732 t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors);
730 733
731 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); 734 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
732 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); 735 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
733 t->max_segment_size = min(t->max_segment_size,b->max_segment_size); 736 t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
734 t->hardsect_size = max(t->hardsect_size,b->hardsect_size); 737 t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
735 if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) 738 if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
736 clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags); 739 clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags);
737 } 740 }
738 741
739 EXPORT_SYMBOL(blk_queue_stack_limits); 742 EXPORT_SYMBOL(blk_queue_stack_limits);
740 743
741 /** 744 /**
742 * blk_queue_segment_boundary - set boundary rules for segment merging 745 * blk_queue_segment_boundary - set boundary rules for segment merging
743 * @q: the request queue for the device 746 * @q: the request queue for the device
744 * @mask: the memory boundary mask 747 * @mask: the memory boundary mask
745 **/ 748 **/
746 void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) 749 void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
747 { 750 {
748 if (mask < PAGE_CACHE_SIZE - 1) { 751 if (mask < PAGE_CACHE_SIZE - 1) {
749 mask = PAGE_CACHE_SIZE - 1; 752 mask = PAGE_CACHE_SIZE - 1;
750 printk("%s: set to minimum %lx\n", __FUNCTION__, mask); 753 printk("%s: set to minimum %lx\n", __FUNCTION__, mask);
751 } 754 }
752 755
753 q->seg_boundary_mask = mask; 756 q->seg_boundary_mask = mask;
754 } 757 }
755 758
756 EXPORT_SYMBOL(blk_queue_segment_boundary); 759 EXPORT_SYMBOL(blk_queue_segment_boundary);
757 760
758 /** 761 /**
759 * blk_queue_dma_alignment - set dma length and memory alignment 762 * blk_queue_dma_alignment - set dma length and memory alignment
760 * @q: the request queue for the device 763 * @q: the request queue for the device
761 * @mask: alignment mask 764 * @mask: alignment mask
762 * 765 *
763 * description: 766 * description:
764 * set required memory and length alignment for direct dma transactions. 767 * set required memory and length alignment for direct dma transactions.
765 * this is used when building direct io requests for the queue. 768 * this is used when building direct io requests for the queue.
766 * 769 *
767 **/ 770 **/
768 void blk_queue_dma_alignment(struct request_queue *q, int mask) 771 void blk_queue_dma_alignment(struct request_queue *q, int mask)
769 { 772 {
770 q->dma_alignment = mask; 773 q->dma_alignment = mask;
771 } 774 }
772 775
773 EXPORT_SYMBOL(blk_queue_dma_alignment); 776 EXPORT_SYMBOL(blk_queue_dma_alignment);
774 777
775 /** 778 /**
776 * blk_queue_find_tag - find a request by its tag and queue 779 * blk_queue_find_tag - find a request by its tag and queue
777 * @q: The request queue for the device 780 * @q: The request queue for the device
778 * @tag: The tag of the request 781 * @tag: The tag of the request
779 * 782 *
780 * Notes: 783 * Notes:
781 * Should be used when a device returns a tag and you want to match 784 * Should be used when a device returns a tag and you want to match
782 * it with a request. 785 * it with a request.
783 * 786 *
784 * no locks need be held. 787 * no locks need be held.
785 **/ 788 **/
786 struct request *blk_queue_find_tag(struct request_queue *q, int tag) 789 struct request *blk_queue_find_tag(struct request_queue *q, int tag)
787 { 790 {
788 return blk_map_queue_find_tag(q->queue_tags, tag); 791 return blk_map_queue_find_tag(q->queue_tags, tag);
789 } 792 }
790 793
791 EXPORT_SYMBOL(blk_queue_find_tag); 794 EXPORT_SYMBOL(blk_queue_find_tag);
792 795
793 /** 796 /**
794 * __blk_free_tags - release a given set of tag maintenance info 797 * __blk_free_tags - release a given set of tag maintenance info
795 * @bqt: the tag map to free 798 * @bqt: the tag map to free
796 * 799 *
797 * Tries to free the specified @bqt@. Returns true if it was 800 * Tries to free the specified @bqt@. Returns true if it was
798 * actually freed and false if there are still references using it 801 * actually freed and false if there are still references using it
799 */ 802 */
800 static int __blk_free_tags(struct blk_queue_tag *bqt) 803 static int __blk_free_tags(struct blk_queue_tag *bqt)
801 { 804 {
802 int retval; 805 int retval;
803 806
804 retval = atomic_dec_and_test(&bqt->refcnt); 807 retval = atomic_dec_and_test(&bqt->refcnt);
805 if (retval) { 808 if (retval) {
806 BUG_ON(bqt->busy); 809 BUG_ON(bqt->busy);
807 BUG_ON(!list_empty(&bqt->busy_list)); 810 BUG_ON(!list_empty(&bqt->busy_list));
808 811
809 kfree(bqt->tag_index); 812 kfree(bqt->tag_index);
810 bqt->tag_index = NULL; 813 bqt->tag_index = NULL;
811 814
812 kfree(bqt->tag_map); 815 kfree(bqt->tag_map);
813 bqt->tag_map = NULL; 816 bqt->tag_map = NULL;
814 817
815 kfree(bqt); 818 kfree(bqt);
816 819
817 } 820 }
818 821
819 return retval; 822 return retval;
820 } 823 }
821 824
822 /** 825 /**
823 * __blk_queue_free_tags - release tag maintenance info 826 * __blk_queue_free_tags - release tag maintenance info
824 * @q: the request queue for the device 827 * @q: the request queue for the device
825 * 828 *
826 * Notes: 829 * Notes:
827 * blk_cleanup_queue() will take care of calling this function, if tagging 830 * blk_cleanup_queue() will take care of calling this function, if tagging
828 * has been used. So there's no need to call this directly. 831 * has been used. So there's no need to call this directly.
829 **/ 832 **/
830 static void __blk_queue_free_tags(struct request_queue *q) 833 static void __blk_queue_free_tags(struct request_queue *q)
831 { 834 {
832 struct blk_queue_tag *bqt = q->queue_tags; 835 struct blk_queue_tag *bqt = q->queue_tags;
833 836
834 if (!bqt) 837 if (!bqt)
835 return; 838 return;
836 839
837 __blk_free_tags(bqt); 840 __blk_free_tags(bqt);
838 841
839 q->queue_tags = NULL; 842 q->queue_tags = NULL;
840 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); 843 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
841 } 844 }
842 845
843 846
844 /** 847 /**
845 * blk_free_tags - release a given set of tag maintenance info 848 * blk_free_tags - release a given set of tag maintenance info
846 * @bqt: the tag map to free 849 * @bqt: the tag map to free
847 * 850 *
848 * For externally managed @bqt@ frees the map. Callers of this 851 * For externally managed @bqt@ frees the map. Callers of this
849 * function must guarantee to have released all the queues that 852 * function must guarantee to have released all the queues that
850 * might have been using this tag map. 853 * might have been using this tag map.
851 */ 854 */
852 void blk_free_tags(struct blk_queue_tag *bqt) 855 void blk_free_tags(struct blk_queue_tag *bqt)
853 { 856 {
854 if (unlikely(!__blk_free_tags(bqt))) 857 if (unlikely(!__blk_free_tags(bqt)))
855 BUG(); 858 BUG();
856 } 859 }
857 EXPORT_SYMBOL(blk_free_tags); 860 EXPORT_SYMBOL(blk_free_tags);
858 861
859 /** 862 /**
860 * blk_queue_free_tags - release tag maintenance info 863 * blk_queue_free_tags - release tag maintenance info
861 * @q: the request queue for the device 864 * @q: the request queue for the device
862 * 865 *
863 * Notes: 866 * Notes:
864 * This is used to disable tagged queuing to a device, yet leave 867 * This is used to disable tagged queuing to a device, yet leave
865 * queue in function. 868 * queue in function.
866 **/ 869 **/
867 void blk_queue_free_tags(struct request_queue *q) 870 void blk_queue_free_tags(struct request_queue *q)
868 { 871 {
869 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 872 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
870 } 873 }
871 874
872 EXPORT_SYMBOL(blk_queue_free_tags); 875 EXPORT_SYMBOL(blk_queue_free_tags);
873 876
874 static int 877 static int
875 init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth) 878 init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
876 { 879 {
877 struct request **tag_index; 880 struct request **tag_index;
878 unsigned long *tag_map; 881 unsigned long *tag_map;
879 int nr_ulongs; 882 int nr_ulongs;
880 883
881 if (q && depth > q->nr_requests * 2) { 884 if (q && depth > q->nr_requests * 2) {
882 depth = q->nr_requests * 2; 885 depth = q->nr_requests * 2;
883 printk(KERN_ERR "%s: adjusted depth to %d\n", 886 printk(KERN_ERR "%s: adjusted depth to %d\n",
884 __FUNCTION__, depth); 887 __FUNCTION__, depth);
885 } 888 }
886 889
887 tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC); 890 tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC);
888 if (!tag_index) 891 if (!tag_index)
889 goto fail; 892 goto fail;
890 893
891 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; 894 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
892 tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); 895 tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
893 if (!tag_map) 896 if (!tag_map)
894 goto fail; 897 goto fail;
895 898
896 tags->real_max_depth = depth; 899 tags->real_max_depth = depth;
897 tags->max_depth = depth; 900 tags->max_depth = depth;
898 tags->tag_index = tag_index; 901 tags->tag_index = tag_index;
899 tags->tag_map = tag_map; 902 tags->tag_map = tag_map;
900 903
901 return 0; 904 return 0;
902 fail: 905 fail:
903 kfree(tag_index); 906 kfree(tag_index);
904 return -ENOMEM; 907 return -ENOMEM;
905 } 908 }
906 909
907 static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, 910 static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
908 int depth) 911 int depth)
909 { 912 {
910 struct blk_queue_tag *tags; 913 struct blk_queue_tag *tags;
911 914
912 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); 915 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
913 if (!tags) 916 if (!tags)
914 goto fail; 917 goto fail;
915 918
916 if (init_tag_map(q, tags, depth)) 919 if (init_tag_map(q, tags, depth))
917 goto fail; 920 goto fail;
918 921
919 INIT_LIST_HEAD(&tags->busy_list); 922 INIT_LIST_HEAD(&tags->busy_list);
920 tags->busy = 0; 923 tags->busy = 0;
921 atomic_set(&tags->refcnt, 1); 924 atomic_set(&tags->refcnt, 1);
922 return tags; 925 return tags;
923 fail: 926 fail:
924 kfree(tags); 927 kfree(tags);
925 return NULL; 928 return NULL;
926 } 929 }
927 930
928 /** 931 /**
929 * blk_init_tags - initialize the tag info for an external tag map 932 * blk_init_tags - initialize the tag info for an external tag map
930 * @depth: the maximum queue depth supported 933 * @depth: the maximum queue depth supported
931 * @tags: the tag to use 934 * @tags: the tag to use
932 **/ 935 **/
933 struct blk_queue_tag *blk_init_tags(int depth) 936 struct blk_queue_tag *blk_init_tags(int depth)
934 { 937 {
935 return __blk_queue_init_tags(NULL, depth); 938 return __blk_queue_init_tags(NULL, depth);
936 } 939 }
937 EXPORT_SYMBOL(blk_init_tags); 940 EXPORT_SYMBOL(blk_init_tags);
938 941
939 /** 942 /**
940 * blk_queue_init_tags - initialize the queue tag info 943 * blk_queue_init_tags - initialize the queue tag info
941 * @q: the request queue for the device 944 * @q: the request queue for the device
942 * @depth: the maximum queue depth supported 945 * @depth: the maximum queue depth supported
943 * @tags: the tag to use 946 * @tags: the tag to use
944 **/ 947 **/
945 int blk_queue_init_tags(struct request_queue *q, int depth, 948 int blk_queue_init_tags(struct request_queue *q, int depth,
946 struct blk_queue_tag *tags) 949 struct blk_queue_tag *tags)
947 { 950 {
948 int rc; 951 int rc;
949 952
950 BUG_ON(tags && q->queue_tags && tags != q->queue_tags); 953 BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
951 954
952 if (!tags && !q->queue_tags) { 955 if (!tags && !q->queue_tags) {
953 tags = __blk_queue_init_tags(q, depth); 956 tags = __blk_queue_init_tags(q, depth);
954 957
955 if (!tags) 958 if (!tags)
956 goto fail; 959 goto fail;
957 } else if (q->queue_tags) { 960 } else if (q->queue_tags) {
958 if ((rc = blk_queue_resize_tags(q, depth))) 961 if ((rc = blk_queue_resize_tags(q, depth)))
959 return rc; 962 return rc;
960 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 963 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
961 return 0; 964 return 0;
962 } else 965 } else
963 atomic_inc(&tags->refcnt); 966 atomic_inc(&tags->refcnt);
964 967
965 /* 968 /*
966 * assign it, all done 969 * assign it, all done
967 */ 970 */
968 q->queue_tags = tags; 971 q->queue_tags = tags;
969 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED); 972 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
970 return 0; 973 return 0;
971 fail: 974 fail:
972 kfree(tags); 975 kfree(tags);
973 return -ENOMEM; 976 return -ENOMEM;
974 } 977 }
975 978
976 EXPORT_SYMBOL(blk_queue_init_tags); 979 EXPORT_SYMBOL(blk_queue_init_tags);
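
For illustration only (nothing below is part of this commit; the foo_* names and the depth are hypothetical), a controller driver would typically enable tagging right after setting up its queue, optionally sharing one tag map across several queues via blk_init_tags():

#include <linux/blkdev.h>

#define FOO_QUEUE_DEPTH	64	/* hypothetical hardware queue depth */

/* per-HBA tag map shared by all queues of the controller (assumption) */
static struct blk_queue_tag *foo_shared_tags;

static int foo_enable_tagging(struct request_queue *q)
{
	/* allocate the shared map once, reuse it for every queue */
	if (!foo_shared_tags) {
		foo_shared_tags = blk_init_tags(FOO_QUEUE_DEPTH);
		if (!foo_shared_tags)
			return -ENOMEM;
	}

	/* attach the map to this queue and mark it as tagged */
	return blk_queue_init_tags(q, FOO_QUEUE_DEPTH, foo_shared_tags);
}

With a shared map, the second and later calls only bump the reference count, as the q->queue_tags branch above shows.
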
977 980
978 /** 981 /**
979 * blk_queue_resize_tags - change the queueing depth 982 * blk_queue_resize_tags - change the queueing depth
980 * @q: the request queue for the device 983 * @q: the request queue for the device
981 * @new_depth: the new max command queueing depth 984 * @new_depth: the new max command queueing depth
982 * 985 *
983 * Notes: 986 * Notes:
984 * Must be called with the queue lock held. 987 * Must be called with the queue lock held.
985 **/ 988 **/
986 int blk_queue_resize_tags(struct request_queue *q, int new_depth) 989 int blk_queue_resize_tags(struct request_queue *q, int new_depth)
987 { 990 {
988 struct blk_queue_tag *bqt = q->queue_tags; 991 struct blk_queue_tag *bqt = q->queue_tags;
989 struct request **tag_index; 992 struct request **tag_index;
990 unsigned long *tag_map; 993 unsigned long *tag_map;
991 int max_depth, nr_ulongs; 994 int max_depth, nr_ulongs;
992 995
993 if (!bqt) 996 if (!bqt)
994 return -ENXIO; 997 return -ENXIO;
995 998
996 /* 999 /*
997 	 * if we already have a large enough real_max_depth, just 1000 	 * if we already have a large enough real_max_depth, just
998 * adjust max_depth. *NOTE* as requests with tag value 1001 * adjust max_depth. *NOTE* as requests with tag value
999 * between new_depth and real_max_depth can be in-flight, tag 1002 * between new_depth and real_max_depth can be in-flight, tag
1000 * map can not be shrunk blindly here. 1003 * map can not be shrunk blindly here.
1001 */ 1004 */
1002 if (new_depth <= bqt->real_max_depth) { 1005 if (new_depth <= bqt->real_max_depth) {
1003 bqt->max_depth = new_depth; 1006 bqt->max_depth = new_depth;
1004 return 0; 1007 return 0;
1005 } 1008 }
1006 1009
1007 /* 1010 /*
1008 * Currently cannot replace a shared tag map with a new 1011 * Currently cannot replace a shared tag map with a new
1009 * one, so error out if this is the case 1012 * one, so error out if this is the case
1010 */ 1013 */
1011 if (atomic_read(&bqt->refcnt) != 1) 1014 if (atomic_read(&bqt->refcnt) != 1)
1012 return -EBUSY; 1015 return -EBUSY;
1013 1016
1014 /* 1017 /*
1015 * save the old state info, so we can copy it back 1018 * save the old state info, so we can copy it back
1016 */ 1019 */
1017 tag_index = bqt->tag_index; 1020 tag_index = bqt->tag_index;
1018 tag_map = bqt->tag_map; 1021 tag_map = bqt->tag_map;
1019 max_depth = bqt->real_max_depth; 1022 max_depth = bqt->real_max_depth;
1020 1023
1021 if (init_tag_map(q, bqt, new_depth)) 1024 if (init_tag_map(q, bqt, new_depth))
1022 return -ENOMEM; 1025 return -ENOMEM;
1023 1026
1024 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); 1027 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
1025 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; 1028 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
1026 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); 1029 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
1027 1030
1028 kfree(tag_index); 1031 kfree(tag_index);
1029 kfree(tag_map); 1032 kfree(tag_map);
1030 return 0; 1033 return 0;
1031 } 1034 }
1032 1035
1033 EXPORT_SYMBOL(blk_queue_resize_tags); 1036 EXPORT_SYMBOL(blk_queue_resize_tags);
1034 1037
1035 /** 1038 /**
1036 * blk_queue_end_tag - end tag operations for a request 1039 * blk_queue_end_tag - end tag operations for a request
1037 * @q: the request queue for the device 1040 * @q: the request queue for the device
1038 * @rq: the request that has completed 1041 * @rq: the request that has completed
1039 * 1042 *
1040 * Description: 1043 * Description:
1041 * Typically called when end_that_request_first() returns 0, meaning 1044 * Typically called when end_that_request_first() returns 0, meaning
1042 * all transfers have been done for a request. It's important to call 1045 * all transfers have been done for a request. It's important to call
1043 * this function before end_that_request_last(), as that will put the 1046 * this function before end_that_request_last(), as that will put the
1044 * request back on the free list thus corrupting the internal tag list. 1047 * request back on the free list thus corrupting the internal tag list.
1045 * 1048 *
1046 * Notes: 1049 * Notes:
1047 * queue lock must be held. 1050 * queue lock must be held.
1048 **/ 1051 **/
1049 void blk_queue_end_tag(struct request_queue *q, struct request *rq) 1052 void blk_queue_end_tag(struct request_queue *q, struct request *rq)
1050 { 1053 {
1051 struct blk_queue_tag *bqt = q->queue_tags; 1054 struct blk_queue_tag *bqt = q->queue_tags;
1052 int tag = rq->tag; 1055 int tag = rq->tag;
1053 1056
1054 BUG_ON(tag == -1); 1057 BUG_ON(tag == -1);
1055 1058
1056 if (unlikely(tag >= bqt->real_max_depth)) 1059 if (unlikely(tag >= bqt->real_max_depth))
1057 /* 1060 /*
1058 * This can happen after tag depth has been reduced. 1061 * This can happen after tag depth has been reduced.
1059 * FIXME: how about a warning or info message here? 1062 * FIXME: how about a warning or info message here?
1060 */ 1063 */
1061 return; 1064 return;
1062 1065
1063 list_del_init(&rq->queuelist); 1066 list_del_init(&rq->queuelist);
1064 rq->cmd_flags &= ~REQ_QUEUED; 1067 rq->cmd_flags &= ~REQ_QUEUED;
1065 rq->tag = -1; 1068 rq->tag = -1;
1066 1069
1067 if (unlikely(bqt->tag_index[tag] == NULL)) 1070 if (unlikely(bqt->tag_index[tag] == NULL))
1068 printk(KERN_ERR "%s: tag %d is missing\n", 1071 printk(KERN_ERR "%s: tag %d is missing\n",
1069 __FUNCTION__, tag); 1072 __FUNCTION__, tag);
1070 1073
1071 bqt->tag_index[tag] = NULL; 1074 bqt->tag_index[tag] = NULL;
1072 1075
1073 /* 1076 /*
1074 * We use test_and_clear_bit's memory ordering properties here. 1077 * We use test_and_clear_bit's memory ordering properties here.
1075 * The tag_map bit acts as a lock for tag_index[bit], so we need 1078 * The tag_map bit acts as a lock for tag_index[bit], so we need
1076 	 * a barrier before clearing the bit (precisely: release semantics). 1079 	 * a barrier before clearing the bit (precisely: release semantics).
1077 * Could use clear_bit_unlock when it is merged. 1080 * Could use clear_bit_unlock when it is merged.
1078 */ 1081 */
1079 if (unlikely(!test_and_clear_bit(tag, bqt->tag_map))) { 1082 if (unlikely(!test_and_clear_bit(tag, bqt->tag_map))) {
1080 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", 1083 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
1081 __FUNCTION__, tag); 1084 __FUNCTION__, tag);
1082 return; 1085 return;
1083 } 1086 }
1084 1087
1085 bqt->busy--; 1088 bqt->busy--;
1086 } 1089 }
1087 1090
1088 EXPORT_SYMBOL(blk_queue_end_tag); 1091 EXPORT_SYMBOL(blk_queue_end_tag);
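
As a sketch of the ordering described above (hypothetical driver completion path, queue lock held; end_that_request_first()/end_that_request_last() are assumed to be the completion helpers of this kernel generation):

#include <linux/blkdev.h>

/* called with q->queue_lock held, e.g. from the driver's IRQ handler */
static void foo_complete_rq(struct request_queue *q, struct request *rq,
			    int uptodate)
{
	/* finish all sectors; returns 0 when nothing is left to transfer */
	if (!end_that_request_first(rq, uptodate, rq->hard_nr_sectors)) {
		/* release the tag while rq is still valid ... */
		blk_queue_end_tag(q, rq);
		/* ... and only then put the request back on the free list */
		end_that_request_last(rq, uptodate);
	}
}
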
1089 1092
1090 /** 1093 /**
1091 * blk_queue_start_tag - find a free tag and assign it 1094 * blk_queue_start_tag - find a free tag and assign it
1092 * @q: the request queue for the device 1095 * @q: the request queue for the device
1093 * @rq: the block request that needs tagging 1096 * @rq: the block request that needs tagging
1094 * 1097 *
1095 * Description: 1098 * Description:
1096 * This can either be used as a stand-alone helper, or possibly be 1099 * This can either be used as a stand-alone helper, or possibly be
1097 * assigned as the queue &prep_rq_fn (in which case &struct request 1100 * assigned as the queue &prep_rq_fn (in which case &struct request
1098 * automagically gets a tag assigned). Note that this function 1101 * automagically gets a tag assigned). Note that this function
1099 * assumes that any type of request can be queued! if this is not 1102 * assumes that any type of request can be queued! if this is not
1100 * true for your device, you must check the request type before 1103 * true for your device, you must check the request type before
1101 * calling this function. The request will also be removed from 1104 * calling this function. The request will also be removed from
1102 	 * the request queue, so it's the driver's responsibility to re-add 1105 	 * the request queue, so it's the driver's responsibility to re-add
1103 * it if it should need to be restarted for some reason. 1106 * it if it should need to be restarted for some reason.
1104 * 1107 *
1105 * Notes: 1108 * Notes:
1106 * queue lock must be held. 1109 * queue lock must be held.
1107 **/ 1110 **/
1108 int blk_queue_start_tag(struct request_queue *q, struct request *rq) 1111 int blk_queue_start_tag(struct request_queue *q, struct request *rq)
1109 { 1112 {
1110 struct blk_queue_tag *bqt = q->queue_tags; 1113 struct blk_queue_tag *bqt = q->queue_tags;
1111 int tag; 1114 int tag;
1112 1115
1113 if (unlikely((rq->cmd_flags & REQ_QUEUED))) { 1116 if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
1114 printk(KERN_ERR 1117 printk(KERN_ERR
1115 "%s: request %p for device [%s] already tagged %d", 1118 "%s: request %p for device [%s] already tagged %d",
1116 __FUNCTION__, rq, 1119 __FUNCTION__, rq,
1117 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); 1120 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
1118 BUG(); 1121 BUG();
1119 } 1122 }
1120 1123
1121 /* 1124 /*
1122 * Protect against shared tag maps, as we may not have exclusive 1125 * Protect against shared tag maps, as we may not have exclusive
1123 * access to the tag map. 1126 * access to the tag map.
1124 */ 1127 */
1125 do { 1128 do {
1126 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); 1129 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
1127 if (tag >= bqt->max_depth) 1130 if (tag >= bqt->max_depth)
1128 return 1; 1131 return 1;
1129 1132
1130 } while (test_and_set_bit(tag, bqt->tag_map)); 1133 } while (test_and_set_bit(tag, bqt->tag_map));
1131 /* 1134 /*
1132 * We rely on test_and_set_bit providing lock memory ordering semantics 1135 * We rely on test_and_set_bit providing lock memory ordering semantics
1133 * (could use test_and_set_bit_lock when it is merged). 1136 * (could use test_and_set_bit_lock when it is merged).
1134 */ 1137 */
1135 1138
1136 rq->cmd_flags |= REQ_QUEUED; 1139 rq->cmd_flags |= REQ_QUEUED;
1137 rq->tag = tag; 1140 rq->tag = tag;
1138 bqt->tag_index[tag] = rq; 1141 bqt->tag_index[tag] = rq;
1139 blkdev_dequeue_request(rq); 1142 blkdev_dequeue_request(rq);
1140 list_add(&rq->queuelist, &bqt->busy_list); 1143 list_add(&rq->queuelist, &bqt->busy_list);
1141 bqt->busy++; 1144 bqt->busy++;
1142 return 0; 1145 return 0;
1143 } 1146 }
1144 1147
1145 EXPORT_SYMBOL(blk_queue_start_tag); 1148 EXPORT_SYMBOL(blk_queue_start_tag);
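
The submission side, sketched as a request_fn for a hypothetical tagged driver (foo_issue_to_hardware() is a placeholder, not a real interface); a non-zero return from blk_queue_start_tag() simply means the tag space is currently exhausted:

#include <linux/blkdev.h>

/* placeholder for the driver's own submission routine (hypothetical) */
static void foo_issue_to_hardware(struct request_queue *q, struct request *rq);

/* request_fn for a tagged queue; the block layer holds q->queue_lock here */
static void foo_tagged_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		/*
		 * no free tag: leave rq on the queue, it is retried when a
		 * completion frees a tag and the queue is run again
		 */
		if (blk_queue_start_tag(q, rq))
			break;

		/* rq->tag is now set and rq has been dequeued for us */
		foo_issue_to_hardware(q, rq);
	}
}

Because blk_queue_start_tag() dequeues the request itself, the driver must not call blkdev_dequeue_request() again for it.
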
1146 1149
1147 /** 1150 /**
1148 * blk_queue_invalidate_tags - invalidate all pending tags 1151 * blk_queue_invalidate_tags - invalidate all pending tags
1149 * @q: the request queue for the device 1152 * @q: the request queue for the device
1150 * 1153 *
1151 * Description: 1154 * Description:
1152 * Hardware conditions may dictate a need to stop all pending requests. 1155 * Hardware conditions may dictate a need to stop all pending requests.
1153 * In this case, we will safely clear the block side of the tag queue and 1156 * In this case, we will safely clear the block side of the tag queue and
1154 	 * re-add all requests to the request queue in the right order. 1157 	 * re-add all requests to the request queue in the right order.
1155 * 1158 *
1156 * Notes: 1159 * Notes:
1157 * queue lock must be held. 1160 * queue lock must be held.
1158 **/ 1161 **/
1159 void blk_queue_invalidate_tags(struct request_queue *q) 1162 void blk_queue_invalidate_tags(struct request_queue *q)
1160 { 1163 {
1161 struct blk_queue_tag *bqt = q->queue_tags; 1164 struct blk_queue_tag *bqt = q->queue_tags;
1162 struct list_head *tmp, *n; 1165 struct list_head *tmp, *n;
1163 struct request *rq; 1166 struct request *rq;
1164 1167
1165 list_for_each_safe(tmp, n, &bqt->busy_list) { 1168 list_for_each_safe(tmp, n, &bqt->busy_list) {
1166 rq = list_entry_rq(tmp); 1169 rq = list_entry_rq(tmp);
1167 1170
1168 if (rq->tag == -1) { 1171 if (rq->tag == -1) {
1169 printk(KERN_ERR 1172 printk(KERN_ERR
1170 "%s: bad tag found on list\n", __FUNCTION__); 1173 "%s: bad tag found on list\n", __FUNCTION__);
1171 list_del_init(&rq->queuelist); 1174 list_del_init(&rq->queuelist);
1172 rq->cmd_flags &= ~REQ_QUEUED; 1175 rq->cmd_flags &= ~REQ_QUEUED;
1173 } else 1176 } else
1174 blk_queue_end_tag(q, rq); 1177 blk_queue_end_tag(q, rq);
1175 1178
1176 rq->cmd_flags &= ~REQ_STARTED; 1179 rq->cmd_flags &= ~REQ_STARTED;
1177 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); 1180 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
1178 } 1181 }
1179 } 1182 }
1180 1183
1181 EXPORT_SYMBOL(blk_queue_invalidate_tags); 1184 EXPORT_SYMBOL(blk_queue_invalidate_tags);
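
A sketch of the intended use after, say, a controller reset (the surrounding error-recovery context is assumed, not taken from this file):

#include <linux/blkdev.h>

/* hypothetical error-recovery path after a controller reset */
static void foo_after_controller_reset(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	/* return every tagged request to the queue, with tags cleared */
	blk_queue_invalidate_tags(q);
	spin_unlock_irqrestore(q->queue_lock, flags);

	/* reissue the work; blk_run_queue() takes the lock itself */
	blk_run_queue(q);
}
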
1182 1185
1183 void blk_dump_rq_flags(struct request *rq, char *msg) 1186 void blk_dump_rq_flags(struct request *rq, char *msg)
1184 { 1187 {
1185 int bit; 1188 int bit;
1186 1189
1187 printk("%s: dev %s: type=%x, flags=%x\n", msg, 1190 printk("%s: dev %s: type=%x, flags=%x\n", msg,
1188 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, 1191 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
1189 rq->cmd_flags); 1192 rq->cmd_flags);
1190 1193
1191 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector, 1194 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector,
1192 rq->nr_sectors, 1195 rq->nr_sectors,
1193 rq->current_nr_sectors); 1196 rq->current_nr_sectors);
1194 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len); 1197 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len);
1195 1198
1196 if (blk_pc_request(rq)) { 1199 if (blk_pc_request(rq)) {
1197 printk("cdb: "); 1200 printk("cdb: ");
1198 for (bit = 0; bit < sizeof(rq->cmd); bit++) 1201 for (bit = 0; bit < sizeof(rq->cmd); bit++)
1199 printk("%02x ", rq->cmd[bit]); 1202 printk("%02x ", rq->cmd[bit]);
1200 printk("\n"); 1203 printk("\n");
1201 } 1204 }
1202 } 1205 }
1203 1206
1204 EXPORT_SYMBOL(blk_dump_rq_flags); 1207 EXPORT_SYMBOL(blk_dump_rq_flags);
1205 1208
1206 void blk_recount_segments(struct request_queue *q, struct bio *bio) 1209 void blk_recount_segments(struct request_queue *q, struct bio *bio)
1207 { 1210 {
1208 struct request rq; 1211 struct request rq;
1209 struct bio *nxt = bio->bi_next; 1212 struct bio *nxt = bio->bi_next;
1210 rq.q = q; 1213 rq.q = q;
1211 rq.bio = rq.biotail = bio; 1214 rq.bio = rq.biotail = bio;
1212 bio->bi_next = NULL; 1215 bio->bi_next = NULL;
1213 blk_recalc_rq_segments(&rq); 1216 blk_recalc_rq_segments(&rq);
1214 bio->bi_next = nxt; 1217 bio->bi_next = nxt;
1215 bio->bi_phys_segments = rq.nr_phys_segments; 1218 bio->bi_phys_segments = rq.nr_phys_segments;
1216 bio->bi_hw_segments = rq.nr_hw_segments; 1219 bio->bi_hw_segments = rq.nr_hw_segments;
1217 bio->bi_flags |= (1 << BIO_SEG_VALID); 1220 bio->bi_flags |= (1 << BIO_SEG_VALID);
1218 } 1221 }
1219 EXPORT_SYMBOL(blk_recount_segments); 1222 EXPORT_SYMBOL(blk_recount_segments);
1220 1223
1221 static void blk_recalc_rq_segments(struct request *rq) 1224 static void blk_recalc_rq_segments(struct request *rq)
1222 { 1225 {
1223 int nr_phys_segs; 1226 int nr_phys_segs;
1224 int nr_hw_segs; 1227 int nr_hw_segs;
1225 unsigned int phys_size; 1228 unsigned int phys_size;
1226 unsigned int hw_size; 1229 unsigned int hw_size;
1227 struct bio_vec *bv, *bvprv = NULL; 1230 struct bio_vec *bv, *bvprv = NULL;
1228 int seg_size; 1231 int seg_size;
1229 int hw_seg_size; 1232 int hw_seg_size;
1230 int cluster; 1233 int cluster;
1231 struct req_iterator iter; 1234 struct req_iterator iter;
1232 int high, highprv = 1; 1235 int high, highprv = 1;
1233 struct request_queue *q = rq->q; 1236 struct request_queue *q = rq->q;
1234 1237
1235 if (!rq->bio) 1238 if (!rq->bio)
1236 return; 1239 return;
1237 1240
1238 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1241 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1239 hw_seg_size = seg_size = 0; 1242 hw_seg_size = seg_size = 0;
1240 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0; 1243 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
1241 rq_for_each_segment(bv, rq, iter) { 1244 rq_for_each_segment(bv, rq, iter) {
1242 /* 1245 /*
1243 * the trick here is making sure that a high page is never 1246 * the trick here is making sure that a high page is never
1244 * considered part of another segment, since that might 1247 * considered part of another segment, since that might
1245 * change with the bounce page. 1248 * change with the bounce page.
1246 */ 1249 */
1247 high = page_to_pfn(bv->bv_page) > q->bounce_pfn; 1250 high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
1248 if (high || highprv) 1251 if (high || highprv)
1249 goto new_hw_segment; 1252 goto new_hw_segment;
1250 if (cluster) { 1253 if (cluster) {
1251 if (seg_size + bv->bv_len > q->max_segment_size) 1254 if (seg_size + bv->bv_len > q->max_segment_size)
1252 goto new_segment; 1255 goto new_segment;
1253 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) 1256 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
1254 goto new_segment; 1257 goto new_segment;
1255 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) 1258 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
1256 goto new_segment; 1259 goto new_segment;
1257 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) 1260 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
1258 goto new_hw_segment; 1261 goto new_hw_segment;
1259 1262
1260 seg_size += bv->bv_len; 1263 seg_size += bv->bv_len;
1261 hw_seg_size += bv->bv_len; 1264 hw_seg_size += bv->bv_len;
1262 bvprv = bv; 1265 bvprv = bv;
1263 continue; 1266 continue;
1264 } 1267 }
1265 new_segment: 1268 new_segment:
1266 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) && 1269 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
1267 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) 1270 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
1268 hw_seg_size += bv->bv_len; 1271 hw_seg_size += bv->bv_len;
1269 else { 1272 else {
1270 new_hw_segment: 1273 new_hw_segment:
1271 if (nr_hw_segs == 1 && 1274 if (nr_hw_segs == 1 &&
1272 hw_seg_size > rq->bio->bi_hw_front_size) 1275 hw_seg_size > rq->bio->bi_hw_front_size)
1273 rq->bio->bi_hw_front_size = hw_seg_size; 1276 rq->bio->bi_hw_front_size = hw_seg_size;
1274 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len; 1277 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
1275 nr_hw_segs++; 1278 nr_hw_segs++;
1276 } 1279 }
1277 1280
1278 nr_phys_segs++; 1281 nr_phys_segs++;
1279 bvprv = bv; 1282 bvprv = bv;
1280 seg_size = bv->bv_len; 1283 seg_size = bv->bv_len;
1281 highprv = high; 1284 highprv = high;
1282 } 1285 }
1283 1286
1284 if (nr_hw_segs == 1 && 1287 if (nr_hw_segs == 1 &&
1285 hw_seg_size > rq->bio->bi_hw_front_size) 1288 hw_seg_size > rq->bio->bi_hw_front_size)
1286 rq->bio->bi_hw_front_size = hw_seg_size; 1289 rq->bio->bi_hw_front_size = hw_seg_size;
1287 if (hw_seg_size > rq->biotail->bi_hw_back_size) 1290 if (hw_seg_size > rq->biotail->bi_hw_back_size)
1288 rq->biotail->bi_hw_back_size = hw_seg_size; 1291 rq->biotail->bi_hw_back_size = hw_seg_size;
1289 rq->nr_phys_segments = nr_phys_segs; 1292 rq->nr_phys_segments = nr_phys_segs;
1290 rq->nr_hw_segments = nr_hw_segs; 1293 rq->nr_hw_segments = nr_hw_segs;
1291 } 1294 }
1292 1295
1293 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, 1296 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
1294 struct bio *nxt) 1297 struct bio *nxt)
1295 { 1298 {
1296 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER))) 1299 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
1297 return 0; 1300 return 0;
1298 1301
1299 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt))) 1302 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
1300 return 0; 1303 return 0;
1301 if (bio->bi_size + nxt->bi_size > q->max_segment_size) 1304 if (bio->bi_size + nxt->bi_size > q->max_segment_size)
1302 return 0; 1305 return 0;
1303 1306
1304 /* 1307 /*
1305 	 * bio and nxt are contiguous in memory, check if the queue allows 1308 	 * bio and nxt are contiguous in memory, check if the queue allows
1306 * these two to be merged into one 1309 * these two to be merged into one
1307 */ 1310 */
1308 if (BIO_SEG_BOUNDARY(q, bio, nxt)) 1311 if (BIO_SEG_BOUNDARY(q, bio, nxt))
1309 return 1; 1312 return 1;
1310 1313
1311 return 0; 1314 return 0;
1312 } 1315 }
1313 1316
1314 static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio, 1317 static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio,
1315 struct bio *nxt) 1318 struct bio *nxt)
1316 { 1319 {
1317 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1320 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1318 blk_recount_segments(q, bio); 1321 blk_recount_segments(q, bio);
1319 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID))) 1322 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID)))
1320 blk_recount_segments(q, nxt); 1323 blk_recount_segments(q, nxt);
1321 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) || 1324 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
1322 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size)) 1325 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size))
1323 return 0; 1326 return 0;
1324 if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size) 1327 if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size)
1325 return 0; 1328 return 0;
1326 1329
1327 return 1; 1330 return 1;
1328 } 1331 }
1329 1332
1330 /* 1333 /*
1331 * map a request to a scatterlist, return number of sg entries set up. Caller 1334 * map a request to a scatterlist, return number of sg entries set up. Caller
1332 * must make sure sg can hold rq->nr_phys_segments entries 1335 * must make sure sg can hold rq->nr_phys_segments entries
1333 */ 1336 */
1334 int blk_rq_map_sg(struct request_queue *q, struct request *rq, 1337 int blk_rq_map_sg(struct request_queue *q, struct request *rq,
1335 struct scatterlist *sg) 1338 struct scatterlist *sg)
1336 { 1339 {
1337 struct bio_vec *bvec, *bvprv; 1340 struct bio_vec *bvec, *bvprv;
1338 struct req_iterator iter; 1341 struct req_iterator iter;
1339 int nsegs, cluster; 1342 int nsegs, cluster;
1340 1343
1341 nsegs = 0; 1344 nsegs = 0;
1342 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1345 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1343 1346
1344 /* 1347 /*
1345 * for each bio in rq 1348 * for each bio in rq
1346 */ 1349 */
1347 bvprv = NULL; 1350 bvprv = NULL;
1348 rq_for_each_segment(bvec, rq, iter) { 1351 rq_for_each_segment(bvec, rq, iter) {
1349 int nbytes = bvec->bv_len; 1352 int nbytes = bvec->bv_len;
1350 1353
1351 if (bvprv && cluster) { 1354 if (bvprv && cluster) {
1352 if (sg[nsegs - 1].length + nbytes > q->max_segment_size) 1355 if (sg[nsegs - 1].length + nbytes > q->max_segment_size)
1353 goto new_segment; 1356 goto new_segment;
1354 1357
1355 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) 1358 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
1356 goto new_segment; 1359 goto new_segment;
1357 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) 1360 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
1358 goto new_segment; 1361 goto new_segment;
1359 1362
1360 sg[nsegs - 1].length += nbytes; 1363 sg[nsegs - 1].length += nbytes;
1361 } else { 1364 } else {
1362 new_segment: 1365 new_segment:
1363 memset(&sg[nsegs],0,sizeof(struct scatterlist)); 1366 memset(&sg[nsegs],0,sizeof(struct scatterlist));
1364 sg[nsegs].page = bvec->bv_page; 1367 sg[nsegs].page = bvec->bv_page;
1365 sg[nsegs].length = nbytes; 1368 sg[nsegs].length = nbytes;
1366 sg[nsegs].offset = bvec->bv_offset; 1369 sg[nsegs].offset = bvec->bv_offset;
1367 1370
1368 nsegs++; 1371 nsegs++;
1369 } 1372 }
1370 bvprv = bvec; 1373 bvprv = bvec;
1371 } /* segments in rq */ 1374 } /* segments in rq */
1372 1375
1373 return nsegs; 1376 return nsegs;
1374 } 1377 }
1375 1378
1376 EXPORT_SYMBOL(blk_rq_map_sg); 1379 EXPORT_SYMBOL(blk_rq_map_sg);
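
A sketch of the calling convention (hypothetical driver; foo_add_dma_descriptor() is a made-up hook, and the .page/.offset/.length fields match the 2.6-era struct scatterlist used above):

#include <linux/blkdev.h>
#include <linux/scatterlist.h>

/* hypothetical hook that programs one DMA descriptor */
static void foo_add_dma_descriptor(struct page *page, unsigned int offset,
				   unsigned int len);

/* caller provides an sg table with at least rq->nr_phys_segments entries */
static int foo_map_request(struct request_queue *q, struct request *rq,
			   struct scatterlist *sg)
{
	int i, nsegs;

	nsegs = blk_rq_map_sg(q, rq, sg);

	for (i = 0; i < nsegs; i++)
		foo_add_dma_descriptor(sg[i].page, sg[i].offset, sg[i].length);

	return nsegs;
}

Since rq->nr_phys_segments never exceeds the limit the driver configured with blk_queue_max_phys_segments(), sizing the table from that limit is sufficient.
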
1377 1380
1378 /* 1381 /*
1379 * the standard queue merge functions, can be overridden with device 1382 * the standard queue merge functions, can be overridden with device
1380 * specific ones if so desired 1383 * specific ones if so desired
1381 */ 1384 */
1382 1385
1383 static inline int ll_new_mergeable(struct request_queue *q, 1386 static inline int ll_new_mergeable(struct request_queue *q,
1384 struct request *req, 1387 struct request *req,
1385 struct bio *bio) 1388 struct bio *bio)
1386 { 1389 {
1387 int nr_phys_segs = bio_phys_segments(q, bio); 1390 int nr_phys_segs = bio_phys_segments(q, bio);
1388 1391
1389 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1392 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1390 req->cmd_flags |= REQ_NOMERGE; 1393 req->cmd_flags |= REQ_NOMERGE;
1391 if (req == q->last_merge) 1394 if (req == q->last_merge)
1392 q->last_merge = NULL; 1395 q->last_merge = NULL;
1393 return 0; 1396 return 0;
1394 } 1397 }
1395 1398
1396 /* 1399 /*
1397 * A hw segment is just getting larger, bump just the phys 1400 * A hw segment is just getting larger, bump just the phys
1398 * counter. 1401 * counter.
1399 */ 1402 */
1400 req->nr_phys_segments += nr_phys_segs; 1403 req->nr_phys_segments += nr_phys_segs;
1401 return 1; 1404 return 1;
1402 } 1405 }
1403 1406
1404 static inline int ll_new_hw_segment(struct request_queue *q, 1407 static inline int ll_new_hw_segment(struct request_queue *q,
1405 struct request *req, 1408 struct request *req,
1406 struct bio *bio) 1409 struct bio *bio)
1407 { 1410 {
1408 int nr_hw_segs = bio_hw_segments(q, bio); 1411 int nr_hw_segs = bio_hw_segments(q, bio);
1409 int nr_phys_segs = bio_phys_segments(q, bio); 1412 int nr_phys_segs = bio_phys_segments(q, bio);
1410 1413
1411 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments 1414 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
1412 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1415 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1413 req->cmd_flags |= REQ_NOMERGE; 1416 req->cmd_flags |= REQ_NOMERGE;
1414 if (req == q->last_merge) 1417 if (req == q->last_merge)
1415 q->last_merge = NULL; 1418 q->last_merge = NULL;
1416 return 0; 1419 return 0;
1417 } 1420 }
1418 1421
1419 /* 1422 /*
1420 * This will form the start of a new hw segment. Bump both 1423 * This will form the start of a new hw segment. Bump both
1421 * counters. 1424 * counters.
1422 */ 1425 */
1423 req->nr_hw_segments += nr_hw_segs; 1426 req->nr_hw_segments += nr_hw_segs;
1424 req->nr_phys_segments += nr_phys_segs; 1427 req->nr_phys_segments += nr_phys_segs;
1425 return 1; 1428 return 1;
1426 } 1429 }
1427 1430
1428 static int ll_back_merge_fn(struct request_queue *q, struct request *req, 1431 static int ll_back_merge_fn(struct request_queue *q, struct request *req,
1429 struct bio *bio) 1432 struct bio *bio)
1430 { 1433 {
1431 unsigned short max_sectors; 1434 unsigned short max_sectors;
1432 int len; 1435 int len;
1433 1436
1434 if (unlikely(blk_pc_request(req))) 1437 if (unlikely(blk_pc_request(req)))
1435 max_sectors = q->max_hw_sectors; 1438 max_sectors = q->max_hw_sectors;
1436 else 1439 else
1437 max_sectors = q->max_sectors; 1440 max_sectors = q->max_sectors;
1438 1441
1439 if (req->nr_sectors + bio_sectors(bio) > max_sectors) { 1442 if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
1440 req->cmd_flags |= REQ_NOMERGE; 1443 req->cmd_flags |= REQ_NOMERGE;
1441 if (req == q->last_merge) 1444 if (req == q->last_merge)
1442 q->last_merge = NULL; 1445 q->last_merge = NULL;
1443 return 0; 1446 return 0;
1444 } 1447 }
1445 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID))) 1448 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID)))
1446 blk_recount_segments(q, req->biotail); 1449 blk_recount_segments(q, req->biotail);
1447 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1450 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1448 blk_recount_segments(q, bio); 1451 blk_recount_segments(q, bio);
1449 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size; 1452 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
1450 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) && 1453 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) &&
1451 !BIOVEC_VIRT_OVERSIZE(len)) { 1454 !BIOVEC_VIRT_OVERSIZE(len)) {
1452 int mergeable = ll_new_mergeable(q, req, bio); 1455 int mergeable = ll_new_mergeable(q, req, bio);
1453 1456
1454 if (mergeable) { 1457 if (mergeable) {
1455 if (req->nr_hw_segments == 1) 1458 if (req->nr_hw_segments == 1)
1456 req->bio->bi_hw_front_size = len; 1459 req->bio->bi_hw_front_size = len;
1457 if (bio->bi_hw_segments == 1) 1460 if (bio->bi_hw_segments == 1)
1458 bio->bi_hw_back_size = len; 1461 bio->bi_hw_back_size = len;
1459 } 1462 }
1460 return mergeable; 1463 return mergeable;
1461 } 1464 }
1462 1465
1463 return ll_new_hw_segment(q, req, bio); 1466 return ll_new_hw_segment(q, req, bio);
1464 } 1467 }
1465 1468
1466 static int ll_front_merge_fn(struct request_queue *q, struct request *req, 1469 static int ll_front_merge_fn(struct request_queue *q, struct request *req,
1467 struct bio *bio) 1470 struct bio *bio)
1468 { 1471 {
1469 unsigned short max_sectors; 1472 unsigned short max_sectors;
1470 int len; 1473 int len;
1471 1474
1472 if (unlikely(blk_pc_request(req))) 1475 if (unlikely(blk_pc_request(req)))
1473 max_sectors = q->max_hw_sectors; 1476 max_sectors = q->max_hw_sectors;
1474 else 1477 else
1475 max_sectors = q->max_sectors; 1478 max_sectors = q->max_sectors;
1476 1479
1477 1480
1478 if (req->nr_sectors + bio_sectors(bio) > max_sectors) { 1481 if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
1479 req->cmd_flags |= REQ_NOMERGE; 1482 req->cmd_flags |= REQ_NOMERGE;
1480 if (req == q->last_merge) 1483 if (req == q->last_merge)
1481 q->last_merge = NULL; 1484 q->last_merge = NULL;
1482 return 0; 1485 return 0;
1483 } 1486 }
1484 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size; 1487 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
1485 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1488 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1486 blk_recount_segments(q, bio); 1489 blk_recount_segments(q, bio);
1487 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID))) 1490 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID)))
1488 blk_recount_segments(q, req->bio); 1491 blk_recount_segments(q, req->bio);
1489 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) && 1492 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
1490 !BIOVEC_VIRT_OVERSIZE(len)) { 1493 !BIOVEC_VIRT_OVERSIZE(len)) {
1491 int mergeable = ll_new_mergeable(q, req, bio); 1494 int mergeable = ll_new_mergeable(q, req, bio);
1492 1495
1493 if (mergeable) { 1496 if (mergeable) {
1494 if (bio->bi_hw_segments == 1) 1497 if (bio->bi_hw_segments == 1)
1495 bio->bi_hw_front_size = len; 1498 bio->bi_hw_front_size = len;
1496 if (req->nr_hw_segments == 1) 1499 if (req->nr_hw_segments == 1)
1497 req->biotail->bi_hw_back_size = len; 1500 req->biotail->bi_hw_back_size = len;
1498 } 1501 }
1499 return mergeable; 1502 return mergeable;
1500 } 1503 }
1501 1504
1502 return ll_new_hw_segment(q, req, bio); 1505 return ll_new_hw_segment(q, req, bio);
1503 } 1506 }
1504 1507
1505 static int ll_merge_requests_fn(struct request_queue *q, struct request *req, 1508 static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
1506 struct request *next) 1509 struct request *next)
1507 { 1510 {
1508 int total_phys_segments; 1511 int total_phys_segments;
1509 int total_hw_segments; 1512 int total_hw_segments;
1510 1513
1511 /* 1514 /*
1512 	 * First check if either of the requests are re-queued 1515 	 * First check if either of the requests are re-queued
1513 * requests. Can't merge them if they are. 1516 * requests. Can't merge them if they are.
1514 */ 1517 */
1515 if (req->special || next->special) 1518 if (req->special || next->special)
1516 return 0; 1519 return 0;
1517 1520
1518 /* 1521 /*
1519 * Will it become too large? 1522 * Will it become too large?
1520 */ 1523 */
1521 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors) 1524 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
1522 return 0; 1525 return 0;
1523 1526
1524 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; 1527 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
1525 if (blk_phys_contig_segment(q, req->biotail, next->bio)) 1528 if (blk_phys_contig_segment(q, req->biotail, next->bio))
1526 total_phys_segments--; 1529 total_phys_segments--;
1527 1530
1528 if (total_phys_segments > q->max_phys_segments) 1531 if (total_phys_segments > q->max_phys_segments)
1529 return 0; 1532 return 0;
1530 1533
1531 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments; 1534 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
1532 if (blk_hw_contig_segment(q, req->biotail, next->bio)) { 1535 if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
1533 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size; 1536 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size;
1534 /* 1537 /*
1535 * propagate the combined length to the end of the requests 1538 * propagate the combined length to the end of the requests
1536 */ 1539 */
1537 if (req->nr_hw_segments == 1) 1540 if (req->nr_hw_segments == 1)
1538 req->bio->bi_hw_front_size = len; 1541 req->bio->bi_hw_front_size = len;
1539 if (next->nr_hw_segments == 1) 1542 if (next->nr_hw_segments == 1)
1540 next->biotail->bi_hw_back_size = len; 1543 next->biotail->bi_hw_back_size = len;
1541 total_hw_segments--; 1544 total_hw_segments--;
1542 } 1545 }
1543 1546
1544 if (total_hw_segments > q->max_hw_segments) 1547 if (total_hw_segments > q->max_hw_segments)
1545 return 0; 1548 return 0;
1546 1549
1547 /* Merge is OK... */ 1550 /* Merge is OK... */
1548 req->nr_phys_segments = total_phys_segments; 1551 req->nr_phys_segments = total_phys_segments;
1549 req->nr_hw_segments = total_hw_segments; 1552 req->nr_hw_segments = total_hw_segments;
1550 return 1; 1553 return 1;
1551 } 1554 }
1552 1555
1553 /* 1556 /*
1554 * "plug" the device if there are no outstanding requests: this will 1557 * "plug" the device if there are no outstanding requests: this will
1555 * force the transfer to start only after we have put all the requests 1558 * force the transfer to start only after we have put all the requests
1556 * on the list. 1559 * on the list.
1557 * 1560 *
1558 * This is called with interrupts off and no requests on the queue and 1561 * This is called with interrupts off and no requests on the queue and
1559 * with the queue lock held. 1562 * with the queue lock held.
1560 */ 1563 */
1561 void blk_plug_device(struct request_queue *q) 1564 void blk_plug_device(struct request_queue *q)
1562 { 1565 {
1563 WARN_ON(!irqs_disabled()); 1566 WARN_ON(!irqs_disabled());
1564 1567
1565 /* 1568 /*
1566 * don't plug a stopped queue, it must be paired with blk_start_queue() 1569 * don't plug a stopped queue, it must be paired with blk_start_queue()
1567 * which will restart the queueing 1570 * which will restart the queueing
1568 */ 1571 */
1569 if (blk_queue_stopped(q)) 1572 if (blk_queue_stopped(q))
1570 return; 1573 return;
1571 1574
1572 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) { 1575 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
1573 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 1576 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
1574 blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); 1577 blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
1575 } 1578 }
1576 } 1579 }
1577 1580
1578 EXPORT_SYMBOL(blk_plug_device); 1581 EXPORT_SYMBOL(blk_plug_device);
1579 1582
1580 /* 1583 /*
1581 * remove the queue from the plugged list, if present. called with 1584 * remove the queue from the plugged list, if present. called with
1582 * queue lock held and interrupts disabled. 1585 * queue lock held and interrupts disabled.
1583 */ 1586 */
1584 int blk_remove_plug(struct request_queue *q) 1587 int blk_remove_plug(struct request_queue *q)
1585 { 1588 {
1586 WARN_ON(!irqs_disabled()); 1589 WARN_ON(!irqs_disabled());
1587 1590
1588 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) 1591 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
1589 return 0; 1592 return 0;
1590 1593
1591 del_timer(&q->unplug_timer); 1594 del_timer(&q->unplug_timer);
1592 return 1; 1595 return 1;
1593 } 1596 }
1594 1597
1595 EXPORT_SYMBOL(blk_remove_plug); 1598 EXPORT_SYMBOL(blk_remove_plug);
1596 1599
1597 /* 1600 /*
1598 * remove the plug and let it rip.. 1601 * remove the plug and let it rip..
1599 */ 1602 */
1600 void __generic_unplug_device(struct request_queue *q) 1603 void __generic_unplug_device(struct request_queue *q)
1601 { 1604 {
1602 if (unlikely(blk_queue_stopped(q))) 1605 if (unlikely(blk_queue_stopped(q)))
1603 return; 1606 return;
1604 1607
1605 if (!blk_remove_plug(q)) 1608 if (!blk_remove_plug(q))
1606 return; 1609 return;
1607 1610
1608 q->request_fn(q); 1611 q->request_fn(q);
1609 } 1612 }
1610 EXPORT_SYMBOL(__generic_unplug_device); 1613 EXPORT_SYMBOL(__generic_unplug_device);
1611 1614
1612 /** 1615 /**
1613 * generic_unplug_device - fire a request queue 1616 * generic_unplug_device - fire a request queue
1614 * @q: The &struct request_queue in question 1617 * @q: The &struct request_queue in question
1615 * 1618 *
1616 * Description: 1619 * Description:
1617 * Linux uses plugging to build bigger request queues before letting 1620 * Linux uses plugging to build bigger request queues before letting
1618 * the device have at them. If a queue is plugged, the I/O scheduler 1621 * the device have at them. If a queue is plugged, the I/O scheduler
1619 * is still adding and merging requests on the queue. Once the queue 1622 * is still adding and merging requests on the queue. Once the queue
1620 * gets unplugged, the request_fn defined for the queue is invoked and 1623 * gets unplugged, the request_fn defined for the queue is invoked and
1621 * transfers started. 1624 * transfers started.
1622 **/ 1625 **/
1623 void generic_unplug_device(struct request_queue *q) 1626 void generic_unplug_device(struct request_queue *q)
1624 { 1627 {
1625 spin_lock_irq(q->queue_lock); 1628 spin_lock_irq(q->queue_lock);
1626 __generic_unplug_device(q); 1629 __generic_unplug_device(q);
1627 spin_unlock_irq(q->queue_lock); 1630 spin_unlock_irq(q->queue_lock);
1628 } 1631 }
1629 EXPORT_SYMBOL(generic_unplug_device); 1632 EXPORT_SYMBOL(generic_unplug_device);
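
For callers outside the block layer, a minimal sketch of forcing an unplug after submitting a batch of I/O (the helper name is hypothetical):

#include <linux/fs.h>
#include <linux/blkdev.h>

/*
 * hypothetical: a submitter that wants its freshly queued bios dispatched
 * now instead of waiting for the unplug timer
 */
static void foo_kick_device(struct block_device *bdev)
{
	struct request_queue *q = bdev_get_queue(bdev);

	if (q)
		generic_unplug_device(q);	/* takes the queue lock itself */
}
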
1630 1633
1631 static void blk_backing_dev_unplug(struct backing_dev_info *bdi, 1634 static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
1632 struct page *page) 1635 struct page *page)
1633 { 1636 {
1634 struct request_queue *q = bdi->unplug_io_data; 1637 struct request_queue *q = bdi->unplug_io_data;
1635 1638
1636 /* 1639 /*
1637 * devices don't necessarily have an ->unplug_fn defined 1640 * devices don't necessarily have an ->unplug_fn defined
1638 */ 1641 */
1639 if (q->unplug_fn) { 1642 if (q->unplug_fn) {
1640 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, 1643 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
1641 q->rq.count[READ] + q->rq.count[WRITE]); 1644 q->rq.count[READ] + q->rq.count[WRITE]);
1642 1645
1643 q->unplug_fn(q); 1646 q->unplug_fn(q);
1644 } 1647 }
1645 } 1648 }
1646 1649
1647 static void blk_unplug_work(struct work_struct *work) 1650 static void blk_unplug_work(struct work_struct *work)
1648 { 1651 {
1649 struct request_queue *q = 1652 struct request_queue *q =
1650 container_of(work, struct request_queue, unplug_work); 1653 container_of(work, struct request_queue, unplug_work);
1651 1654
1652 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, 1655 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
1653 q->rq.count[READ] + q->rq.count[WRITE]); 1656 q->rq.count[READ] + q->rq.count[WRITE]);
1654 1657
1655 q->unplug_fn(q); 1658 q->unplug_fn(q);
1656 } 1659 }
1657 1660
1658 static void blk_unplug_timeout(unsigned long data) 1661 static void blk_unplug_timeout(unsigned long data)
1659 { 1662 {
1660 struct request_queue *q = (struct request_queue *)data; 1663 struct request_queue *q = (struct request_queue *)data;
1661 1664
1662 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL, 1665 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
1663 q->rq.count[READ] + q->rq.count[WRITE]); 1666 q->rq.count[READ] + q->rq.count[WRITE]);
1664 1667
1665 kblockd_schedule_work(&q->unplug_work); 1668 kblockd_schedule_work(&q->unplug_work);
1666 } 1669 }
1667 1670
1668 /** 1671 /**
1669 * blk_start_queue - restart a previously stopped queue 1672 * blk_start_queue - restart a previously stopped queue
1670 * @q: The &struct request_queue in question 1673 * @q: The &struct request_queue in question
1671 * 1674 *
1672 * Description: 1675 * Description:
1673 * blk_start_queue() will clear the stop flag on the queue, and call 1676 * blk_start_queue() will clear the stop flag on the queue, and call
1674 * the request_fn for the queue if it was in a stopped state when 1677 * the request_fn for the queue if it was in a stopped state when
1675 * entered. Also see blk_stop_queue(). Queue lock must be held. 1678 * entered. Also see blk_stop_queue(). Queue lock must be held.
1676 **/ 1679 **/
1677 void blk_start_queue(struct request_queue *q) 1680 void blk_start_queue(struct request_queue *q)
1678 { 1681 {
1679 WARN_ON(!irqs_disabled()); 1682 WARN_ON(!irqs_disabled());
1680 1683
1681 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1684 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1682 1685
1683 /* 1686 /*
1684 * one level of recursion is ok and is much faster than kicking 1687 * one level of recursion is ok and is much faster than kicking
1685 * the unplug handling 1688 * the unplug handling
1686 */ 1689 */
1687 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { 1690 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
1688 q->request_fn(q); 1691 q->request_fn(q);
1689 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); 1692 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
1690 } else { 1693 } else {
1691 blk_plug_device(q); 1694 blk_plug_device(q);
1692 kblockd_schedule_work(&q->unplug_work); 1695 kblockd_schedule_work(&q->unplug_work);
1693 } 1696 }
1694 } 1697 }
1695 1698
1696 EXPORT_SYMBOL(blk_start_queue); 1699 EXPORT_SYMBOL(blk_start_queue);
1697 1700
1698 /** 1701 /**
1699 * blk_stop_queue - stop a queue 1702 * blk_stop_queue - stop a queue
1700 * @q: The &struct request_queue in question 1703 * @q: The &struct request_queue in question
1701 * 1704 *
1702 * Description: 1705 * Description:
1703 * The Linux block layer assumes that a block driver will consume all 1706 * The Linux block layer assumes that a block driver will consume all
1704 * entries on the request queue when the request_fn strategy is called. 1707 * entries on the request queue when the request_fn strategy is called.
1705 * Often this will not happen, because of hardware limitations (queue 1708 * Often this will not happen, because of hardware limitations (queue
1706 * depth settings). If a device driver gets a 'queue full' response, 1709 * depth settings). If a device driver gets a 'queue full' response,
1707 * or if it simply chooses not to queue more I/O at one point, it can 1710 * or if it simply chooses not to queue more I/O at one point, it can
1708 * call this function to prevent the request_fn from being called until 1711 * call this function to prevent the request_fn from being called until
1709 * the driver has signalled it's ready to go again. This happens by calling 1712 * the driver has signalled it's ready to go again. This happens by calling
1710 * blk_start_queue() to restart queue operations. Queue lock must be held. 1713 * blk_start_queue() to restart queue operations. Queue lock must be held.
1711 **/ 1714 **/
1712 void blk_stop_queue(struct request_queue *q) 1715 void blk_stop_queue(struct request_queue *q)
1713 { 1716 {
1714 blk_remove_plug(q); 1717 blk_remove_plug(q);
1715 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1718 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1716 } 1719 }
1717 EXPORT_SYMBOL(blk_stop_queue); 1720 EXPORT_SYMBOL(blk_stop_queue);
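
A minimal sketch of the stop/restart pairing described above (hypothetical helpers, both assumed to run with the queue lock held):

#include <linux/blkdev.h>

static void foo_on_queue_full(struct request_queue *q)
{
	/* hardware reported "queue full": suppress further request_fn calls */
	blk_stop_queue(q);
}

static void foo_on_command_completed(struct request_queue *q, int hw_has_room)
{
	/* a slot freed up: let the block layer call request_fn again */
	if (hw_has_room)
		blk_start_queue(q);
}
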
1718 1721
1719 /** 1722 /**
1720 * blk_sync_queue - cancel any pending callbacks on a queue 1723 * blk_sync_queue - cancel any pending callbacks on a queue
1721 * @q: the queue 1724 * @q: the queue
1722 * 1725 *
1723 * Description: 1726 * Description:
1724 * The block layer may perform asynchronous callback activity 1727 * The block layer may perform asynchronous callback activity
1725 * on a queue, such as calling the unplug function after a timeout. 1728 * on a queue, such as calling the unplug function after a timeout.
1726 * A block device may call blk_sync_queue to ensure that any 1729 * A block device may call blk_sync_queue to ensure that any
1727 * such activity is cancelled, thus allowing it to release resources 1730 * such activity is cancelled, thus allowing it to release resources
1728 * that the callbacks might use. The caller must already have made sure 1731 * that the callbacks might use. The caller must already have made sure
1729 * that its ->make_request_fn will not re-add plugging prior to calling 1732 * that its ->make_request_fn will not re-add plugging prior to calling
1730 * this function. 1733 * this function.
1731 * 1734 *
1732 */ 1735 */
1733 void blk_sync_queue(struct request_queue *q) 1736 void blk_sync_queue(struct request_queue *q)
1734 { 1737 {
1735 del_timer_sync(&q->unplug_timer); 1738 del_timer_sync(&q->unplug_timer);
1736 } 1739 }
1737 EXPORT_SYMBOL(blk_sync_queue); 1740 EXPORT_SYMBOL(blk_sync_queue);
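
A teardown sketch along the lines of this description (struct foo_dev and the call site are hypothetical):

#include <linux/blkdev.h>
#include <linux/slab.h>

/* hypothetical per-device structure */
struct foo_dev {
	struct request_queue *queue;
};

static void foo_teardown(struct foo_dev *fd)
{
	/* stop new request_fn invocations */
	spin_lock_irq(fd->queue->queue_lock);
	blk_stop_queue(fd->queue);
	spin_unlock_irq(fd->queue->queue_lock);

	/* cancel any pending unplug timer/work before fd goes away */
	blk_sync_queue(fd->queue);

	blk_cleanup_queue(fd->queue);
	kfree(fd);
}
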
1738 1741
1739 /** 1742 /**
1740 * blk_run_queue - run a single device queue 1743 * blk_run_queue - run a single device queue
1741 * @q: The queue to run 1744 * @q: The queue to run
1742 */ 1745 */
1743 void blk_run_queue(struct request_queue *q) 1746 void blk_run_queue(struct request_queue *q)
1744 { 1747 {
1745 unsigned long flags; 1748 unsigned long flags;
1746 1749
1747 spin_lock_irqsave(q->queue_lock, flags); 1750 spin_lock_irqsave(q->queue_lock, flags);
1748 blk_remove_plug(q); 1751 blk_remove_plug(q);
1749 1752
1750 /* 1753 /*
1751 * Only recurse once to avoid overrunning the stack, let the unplug 1754 * Only recurse once to avoid overrunning the stack, let the unplug
1752 * handling reinvoke the handler shortly if we already got there. 1755 * handling reinvoke the handler shortly if we already got there.
1753 */ 1756 */
1754 if (!elv_queue_empty(q)) { 1757 if (!elv_queue_empty(q)) {
1755 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { 1758 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
1756 q->request_fn(q); 1759 q->request_fn(q);
1757 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); 1760 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
1758 } else { 1761 } else {
1759 blk_plug_device(q); 1762 blk_plug_device(q);
1760 kblockd_schedule_work(&q->unplug_work); 1763 kblockd_schedule_work(&q->unplug_work);
1761 } 1764 }
1762 } 1765 }
1763 1766
1764 spin_unlock_irqrestore(q->queue_lock, flags); 1767 spin_unlock_irqrestore(q->queue_lock, flags);
1765 } 1768 }
1766 EXPORT_SYMBOL(blk_run_queue); 1769 EXPORT_SYMBOL(blk_run_queue);
1767 1770
1768 /** 1771 /**
1769 * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed 1772 * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed
1770 * @kobj: the kobj belonging to the request queue to be released 1773 * @kobj: the kobj belonging to the request queue to be released
1771 * 1774 *
1772 * Description: 1775 * Description:
1773 * blk_cleanup_queue is the pair to blk_init_queue() or 1776 * blk_cleanup_queue is the pair to blk_init_queue() or
1774 * blk_queue_make_request(). It should be called when a request queue is 1777 * blk_queue_make_request(). It should be called when a request queue is
1775 * being released; typically when a block device is being de-registered. 1778 * being released; typically when a block device is being de-registered.
1776 * Currently, its primary task is to free all the &struct request 1779 * Currently, its primary task is to free all the &struct request
1777 * structures that were allocated to the queue and the queue itself. 1780 * structures that were allocated to the queue and the queue itself.
1778 * 1781 *
1779 * Caveat: 1782 * Caveat:
1780 * Hopefully the low level driver will have finished any 1783 * Hopefully the low level driver will have finished any
1781 * outstanding requests first... 1784 * outstanding requests first...
1782 **/ 1785 **/
1783 static void blk_release_queue(struct kobject *kobj) 1786 static void blk_release_queue(struct kobject *kobj)
1784 { 1787 {
1785 struct request_queue *q = 1788 struct request_queue *q =
1786 container_of(kobj, struct request_queue, kobj); 1789 container_of(kobj, struct request_queue, kobj);
1787 struct request_list *rl = &q->rq; 1790 struct request_list *rl = &q->rq;
1788 1791
1789 blk_sync_queue(q); 1792 blk_sync_queue(q);
1790 1793
1791 if (rl->rq_pool) 1794 if (rl->rq_pool)
1792 mempool_destroy(rl->rq_pool); 1795 mempool_destroy(rl->rq_pool);
1793 1796
1794 if (q->queue_tags) 1797 if (q->queue_tags)
1795 __blk_queue_free_tags(q); 1798 __blk_queue_free_tags(q);
1796 1799
1797 blk_trace_shutdown(q); 1800 blk_trace_shutdown(q);
1798 1801
1799 kmem_cache_free(requestq_cachep, q); 1802 kmem_cache_free(requestq_cachep, q);
1800 } 1803 }
1801 1804
1802 void blk_put_queue(struct request_queue *q) 1805 void blk_put_queue(struct request_queue *q)
1803 { 1806 {
1804 kobject_put(&q->kobj); 1807 kobject_put(&q->kobj);
1805 } 1808 }
1806 EXPORT_SYMBOL(blk_put_queue); 1809 EXPORT_SYMBOL(blk_put_queue);
1807 1810
1808 void blk_cleanup_queue(struct request_queue * q) 1811 void blk_cleanup_queue(struct request_queue * q)
1809 { 1812 {
1810 mutex_lock(&q->sysfs_lock); 1813 mutex_lock(&q->sysfs_lock);
1811 set_bit(QUEUE_FLAG_DEAD, &q->queue_flags); 1814 set_bit(QUEUE_FLAG_DEAD, &q->queue_flags);
1812 mutex_unlock(&q->sysfs_lock); 1815 mutex_unlock(&q->sysfs_lock);
1813 1816
1814 if (q->elevator) 1817 if (q->elevator)
1815 elevator_exit(q->elevator); 1818 elevator_exit(q->elevator);
1816 1819
1817 blk_put_queue(q); 1820 blk_put_queue(q);
1818 } 1821 }
1819 1822
1820 EXPORT_SYMBOL(blk_cleanup_queue); 1823 EXPORT_SYMBOL(blk_cleanup_queue);
1821 1824
1822 static int blk_init_free_list(struct request_queue *q) 1825 static int blk_init_free_list(struct request_queue *q)
1823 { 1826 {
1824 struct request_list *rl = &q->rq; 1827 struct request_list *rl = &q->rq;
1825 1828
1826 rl->count[READ] = rl->count[WRITE] = 0; 1829 rl->count[READ] = rl->count[WRITE] = 0;
1827 rl->starved[READ] = rl->starved[WRITE] = 0; 1830 rl->starved[READ] = rl->starved[WRITE] = 0;
1828 rl->elvpriv = 0; 1831 rl->elvpriv = 0;
1829 init_waitqueue_head(&rl->wait[READ]); 1832 init_waitqueue_head(&rl->wait[READ]);
1830 init_waitqueue_head(&rl->wait[WRITE]); 1833 init_waitqueue_head(&rl->wait[WRITE]);
1831 1834
1832 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 1835 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
1833 mempool_free_slab, request_cachep, q->node); 1836 mempool_free_slab, request_cachep, q->node);
1834 1837
1835 if (!rl->rq_pool) 1838 if (!rl->rq_pool)
1836 return -ENOMEM; 1839 return -ENOMEM;
1837 1840
1838 return 0; 1841 return 0;
1839 } 1842 }
1840 1843
1841 struct request_queue *blk_alloc_queue(gfp_t gfp_mask) 1844 struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
1842 { 1845 {
1843 return blk_alloc_queue_node(gfp_mask, -1); 1846 return blk_alloc_queue_node(gfp_mask, -1);
1844 } 1847 }
1845 EXPORT_SYMBOL(blk_alloc_queue); 1848 EXPORT_SYMBOL(blk_alloc_queue);
1846 1849
1847 static struct kobj_type queue_ktype; 1850 static struct kobj_type queue_ktype;
1848 1851
1849 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) 1852 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
1850 { 1853 {
1851 struct request_queue *q; 1854 struct request_queue *q;
1852 1855
1853 q = kmem_cache_alloc_node(requestq_cachep, 1856 q = kmem_cache_alloc_node(requestq_cachep,
1854 gfp_mask | __GFP_ZERO, node_id); 1857 gfp_mask | __GFP_ZERO, node_id);
1855 if (!q) 1858 if (!q)
1856 return NULL; 1859 return NULL;
1857 1860
1858 init_timer(&q->unplug_timer); 1861 init_timer(&q->unplug_timer);
1859 1862
1860 kobject_set_name(&q->kobj, "%s", "queue"); 1863 kobject_set_name(&q->kobj, "%s", "queue");
1861 q->kobj.ktype = &queue_ktype; 1864 q->kobj.ktype = &queue_ktype;
1862 kobject_init(&q->kobj); 1865 kobject_init(&q->kobj);
1863 1866
1864 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; 1867 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
1865 q->backing_dev_info.unplug_io_data = q; 1868 q->backing_dev_info.unplug_io_data = q;
1866 1869
1867 mutex_init(&q->sysfs_lock); 1870 mutex_init(&q->sysfs_lock);
1868 1871
1869 return q; 1872 return q;
1870 } 1873 }
1871 EXPORT_SYMBOL(blk_alloc_queue_node); 1874 EXPORT_SYMBOL(blk_alloc_queue_node);
1872 1875
1873 /** 1876 /**
1874 * blk_init_queue - prepare a request queue for use with a block device 1877 * blk_init_queue - prepare a request queue for use with a block device
1875 * @rfn: The function to be called to process requests that have been 1878 * @rfn: The function to be called to process requests that have been
1876 * placed on the queue. 1879 * placed on the queue.
1877 * @lock: Request queue spin lock 1880 * @lock: Request queue spin lock
1878 * 1881 *
1879 * Description: 1882 * Description:
1880 * If a block device wishes to use the standard request handling procedures, 1883 * If a block device wishes to use the standard request handling procedures,
1881 * which sorts requests and coalesces adjacent requests, then it must 1884 * which sorts requests and coalesces adjacent requests, then it must
1882 * call blk_init_queue(). The function @rfn will be called when there 1885 * call blk_init_queue(). The function @rfn will be called when there
1883 * are requests on the queue that need to be processed. If the device 1886 * are requests on the queue that need to be processed. If the device
1884 * supports plugging, then @rfn may not be called immediately when requests 1887 * supports plugging, then @rfn may not be called immediately when requests
1885 * are available on the queue, but may be called at some time later instead. 1888 * are available on the queue, but may be called at some time later instead.
1886 * Plugged queues are generally unplugged when a buffer belonging to one 1889 * Plugged queues are generally unplugged when a buffer belonging to one
1887 * of the requests on the queue is needed, or due to memory pressure. 1890 * of the requests on the queue is needed, or due to memory pressure.
1888 * 1891 *
1889 * @rfn is not required, or even expected, to remove all requests off the 1892 * @rfn is not required, or even expected, to remove all requests off the
1890 * queue, but only as many as it can handle at a time. If it does leave 1893 * queue, but only as many as it can handle at a time. If it does leave
1891 * requests on the queue, it is responsible for arranging that the requests 1894 * requests on the queue, it is responsible for arranging that the requests
1892 * get dealt with eventually. 1895 * get dealt with eventually.
1893 * 1896 *
1894 * The queue spin lock must be held while manipulating the requests on the 1897 * The queue spin lock must be held while manipulating the requests on the
1895 * request queue; this lock will be taken also from interrupt context, so irq 1898 * request queue; this lock will be taken also from interrupt context, so irq
1896 * disabling is needed for it. 1899 * disabling is needed for it.
1897 * 1900 *
1898 * Function returns a pointer to the initialized request queue, or NULL if 1901 * Function returns a pointer to the initialized request queue, or NULL if
1899 * it didn't succeed. 1902 * it didn't succeed.
1900 * 1903 *
1901 * Note: 1904 * Note:
1902 * blk_init_queue() must be paired with a blk_cleanup_queue() call 1905 * blk_init_queue() must be paired with a blk_cleanup_queue() call
1903 * when the block device is deactivated (such as at module unload). 1906 * when the block device is deactivated (such as at module unload).
1904 **/ 1907 **/
1905 1908
1906 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 1909 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
1907 { 1910 {
1908 return blk_init_queue_node(rfn, lock, -1); 1911 return blk_init_queue_node(rfn, lock, -1);
1909 } 1912 }
1910 EXPORT_SYMBOL(blk_init_queue); 1913 EXPORT_SYMBOL(blk_init_queue);
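A minimal sketch (not part of this commit) of how a driver might pair blk_init_queue() with blk_cleanup_queue() in the way the kernel-doc above describes. All mydev_* names are hypothetical, and the request function simply completes every request in-line, in the classic sbull style for this era of the API.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>

static DEFINE_SPINLOCK(mydev_lock);
static struct request_queue *mydev_queue;

/* entered with mydev_lock held, possibly from interrupt context */
static void mydev_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		if (!blk_fs_request(rq)) {
			end_request(rq, 0);	/* fail non-fs requests */
			continue;
		}
		/* transfer rq->sector / rq->current_nr_sectors / rq->buffer here */
		end_request(rq, 1);		/* pretend the transfer succeeded */
	}
}

static int __init mydev_init(void)
{
	mydev_queue = blk_init_queue(mydev_request_fn, &mydev_lock);
	if (!mydev_queue)
		return -ENOMEM;
	return 0;
}

static void __exit mydev_exit(void)
{
	blk_cleanup_queue(mydev_queue);		/* must pair with blk_init_queue() */
}

module_init(mydev_init);
module_exit(mydev_exit);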
1911 1914
1912 struct request_queue * 1915 struct request_queue *
1913 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 1916 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
1914 { 1917 {
1915 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); 1918 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
1916 1919
1917 if (!q) 1920 if (!q)
1918 return NULL; 1921 return NULL;
1919 1922
1920 q->node = node_id; 1923 q->node = node_id;
1921 if (blk_init_free_list(q)) { 1924 if (blk_init_free_list(q)) {
1922 kmem_cache_free(requestq_cachep, q); 1925 kmem_cache_free(requestq_cachep, q);
1923 return NULL; 1926 return NULL;
1924 } 1927 }
1925 1928
1926 /* 1929 /*
1927 * if caller didn't supply a lock, they get per-queue locking with 1930 * if caller didn't supply a lock, they get per-queue locking with
1928 * our embedded lock 1931 * our embedded lock
1929 */ 1932 */
1930 if (!lock) { 1933 if (!lock) {
1931 spin_lock_init(&q->__queue_lock); 1934 spin_lock_init(&q->__queue_lock);
1932 lock = &q->__queue_lock; 1935 lock = &q->__queue_lock;
1933 } 1936 }
1934 1937
1935 q->request_fn = rfn; 1938 q->request_fn = rfn;
1936 q->prep_rq_fn = NULL; 1939 q->prep_rq_fn = NULL;
1937 q->unplug_fn = generic_unplug_device; 1940 q->unplug_fn = generic_unplug_device;
1938 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER); 1941 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER);
1939 q->queue_lock = lock; 1942 q->queue_lock = lock;
1940 1943
1941 blk_queue_segment_boundary(q, 0xffffffff); 1944 blk_queue_segment_boundary(q, 0xffffffff);
1942 1945
1943 blk_queue_make_request(q, __make_request); 1946 blk_queue_make_request(q, __make_request);
1944 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); 1947 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
1945 1948
1946 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 1949 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
1947 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 1950 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
1948 1951
1949 q->sg_reserved_size = INT_MAX; 1952 q->sg_reserved_size = INT_MAX;
1950 1953
1951 /* 1954 /*
1952 * all done 1955 * all done
1953 */ 1956 */
1954 if (!elevator_init(q, NULL)) { 1957 if (!elevator_init(q, NULL)) {
1955 blk_queue_congestion_threshold(q); 1958 blk_queue_congestion_threshold(q);
1956 return q; 1959 return q;
1957 } 1960 }
1958 1961
1959 blk_put_queue(q); 1962 blk_put_queue(q);
1960 return NULL; 1963 return NULL;
1961 } 1964 }
1962 EXPORT_SYMBOL(blk_init_queue_node); 1965 EXPORT_SYMBOL(blk_init_queue_node);
1963 1966
1964 int blk_get_queue(struct request_queue *q) 1967 int blk_get_queue(struct request_queue *q)
1965 { 1968 {
1966 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 1969 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
1967 kobject_get(&q->kobj); 1970 kobject_get(&q->kobj);
1968 return 0; 1971 return 0;
1969 } 1972 }
1970 1973
1971 return 1; 1974 return 1;
1972 } 1975 }
1973 1976
1974 EXPORT_SYMBOL(blk_get_queue); 1977 EXPORT_SYMBOL(blk_get_queue);
1975 1978
1976 static inline void blk_free_request(struct request_queue *q, struct request *rq) 1979 static inline void blk_free_request(struct request_queue *q, struct request *rq)
1977 { 1980 {
1978 if (rq->cmd_flags & REQ_ELVPRIV) 1981 if (rq->cmd_flags & REQ_ELVPRIV)
1979 elv_put_request(q, rq); 1982 elv_put_request(q, rq);
1980 mempool_free(rq, q->rq.rq_pool); 1983 mempool_free(rq, q->rq.rq_pool);
1981 } 1984 }
1982 1985
1983 static struct request * 1986 static struct request *
1984 blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) 1987 blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
1985 { 1988 {
1986 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 1989 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
1987 1990
1988 if (!rq) 1991 if (!rq)
1989 return NULL; 1992 return NULL;
1990 1993
1991 /* 1994 /*
1992 * first three bits are identical in rq->cmd_flags and bio->bi_rw, 1995 * first three bits are identical in rq->cmd_flags and bio->bi_rw,
1993 * see bio.h and blkdev.h 1996 * see bio.h and blkdev.h
1994 */ 1997 */
1995 rq->cmd_flags = rw | REQ_ALLOCED; 1998 rq->cmd_flags = rw | REQ_ALLOCED;
1996 1999
1997 if (priv) { 2000 if (priv) {
1998 if (unlikely(elv_set_request(q, rq, gfp_mask))) { 2001 if (unlikely(elv_set_request(q, rq, gfp_mask))) {
1999 mempool_free(rq, q->rq.rq_pool); 2002 mempool_free(rq, q->rq.rq_pool);
2000 return NULL; 2003 return NULL;
2001 } 2004 }
2002 rq->cmd_flags |= REQ_ELVPRIV; 2005 rq->cmd_flags |= REQ_ELVPRIV;
2003 } 2006 }
2004 2007
2005 return rq; 2008 return rq;
2006 } 2009 }
2007 2010
2008 /* 2011 /*
2009 * ioc_batching returns true if the ioc is a valid batching request and 2012 * ioc_batching returns true if the ioc is a valid batching request and
2010 * should be given priority access to a request. 2013 * should be given priority access to a request.
2011 */ 2014 */
2012 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) 2015 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
2013 { 2016 {
2014 if (!ioc) 2017 if (!ioc)
2015 return 0; 2018 return 0;
2016 2019
2017 /* 2020 /*
2018 * Make sure the process is able to allocate at least 1 request 2021 * Make sure the process is able to allocate at least 1 request
2019 * even if the batch times out, otherwise we could theoretically 2022 * even if the batch times out, otherwise we could theoretically
2020 * lose wakeups. 2023 * lose wakeups.
2021 */ 2024 */
2022 return ioc->nr_batch_requests == q->nr_batching || 2025 return ioc->nr_batch_requests == q->nr_batching ||
2023 (ioc->nr_batch_requests > 0 2026 (ioc->nr_batch_requests > 0
2024 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); 2027 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
2025 } 2028 }
2026 2029
2027 /* 2030 /*
2028 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This 2031 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
2029 * will cause the process to be a "batcher" on all queues in the system. This 2032 * will cause the process to be a "batcher" on all queues in the system. This
2030 * is the behaviour we want though - once it gets a wakeup it should be given 2033 * is the behaviour we want though - once it gets a wakeup it should be given
2031 * a nice run. 2034 * a nice run.
2032 */ 2035 */
2033 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) 2036 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
2034 { 2037 {
2035 if (!ioc || ioc_batching(q, ioc)) 2038 if (!ioc || ioc_batching(q, ioc))
2036 return; 2039 return;
2037 2040
2038 ioc->nr_batch_requests = q->nr_batching; 2041 ioc->nr_batch_requests = q->nr_batching;
2039 ioc->last_waited = jiffies; 2042 ioc->last_waited = jiffies;
2040 } 2043 }
2041 2044
2042 static void __freed_request(struct request_queue *q, int rw) 2045 static void __freed_request(struct request_queue *q, int rw)
2043 { 2046 {
2044 struct request_list *rl = &q->rq; 2047 struct request_list *rl = &q->rq;
2045 2048
2046 if (rl->count[rw] < queue_congestion_off_threshold(q)) 2049 if (rl->count[rw] < queue_congestion_off_threshold(q))
2047 blk_clear_queue_congested(q, rw); 2050 blk_clear_queue_congested(q, rw);
2048 2051
2049 if (rl->count[rw] + 1 <= q->nr_requests) { 2052 if (rl->count[rw] + 1 <= q->nr_requests) {
2050 if (waitqueue_active(&rl->wait[rw])) 2053 if (waitqueue_active(&rl->wait[rw]))
2051 wake_up(&rl->wait[rw]); 2054 wake_up(&rl->wait[rw]);
2052 2055
2053 blk_clear_queue_full(q, rw); 2056 blk_clear_queue_full(q, rw);
2054 } 2057 }
2055 } 2058 }
2056 2059
2057 /* 2060 /*
2058 * A request has just been released. Account for it, update the full and 2061 * A request has just been released. Account for it, update the full and
2059 * congestion status, wake up any waiters. Called under q->queue_lock. 2062 * congestion status, wake up any waiters. Called under q->queue_lock.
2060 */ 2063 */
2061 static void freed_request(struct request_queue *q, int rw, int priv) 2064 static void freed_request(struct request_queue *q, int rw, int priv)
2062 { 2065 {
2063 struct request_list *rl = &q->rq; 2066 struct request_list *rl = &q->rq;
2064 2067
2065 rl->count[rw]--; 2068 rl->count[rw]--;
2066 if (priv) 2069 if (priv)
2067 rl->elvpriv--; 2070 rl->elvpriv--;
2068 2071
2069 __freed_request(q, rw); 2072 __freed_request(q, rw);
2070 2073
2071 if (unlikely(rl->starved[rw ^ 1])) 2074 if (unlikely(rl->starved[rw ^ 1]))
2072 __freed_request(q, rw ^ 1); 2075 __freed_request(q, rw ^ 1);
2073 } 2076 }
2074 2077
2075 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) 2078 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
2076 /* 2079 /*
2077 * Get a free request, queue_lock must be held. 2080 * Get a free request, queue_lock must be held.
2078 * Returns NULL on failure, with queue_lock held. 2081 * Returns NULL on failure, with queue_lock held.
2079 * Returns !NULL on success, with queue_lock *not held*. 2082 * Returns !NULL on success, with queue_lock *not held*.
2080 */ 2083 */
2081 static struct request *get_request(struct request_queue *q, int rw_flags, 2084 static struct request *get_request(struct request_queue *q, int rw_flags,
2082 struct bio *bio, gfp_t gfp_mask) 2085 struct bio *bio, gfp_t gfp_mask)
2083 { 2086 {
2084 struct request *rq = NULL; 2087 struct request *rq = NULL;
2085 struct request_list *rl = &q->rq; 2088 struct request_list *rl = &q->rq;
2086 struct io_context *ioc = NULL; 2089 struct io_context *ioc = NULL;
2087 const int rw = rw_flags & 0x01; 2090 const int rw = rw_flags & 0x01;
2088 int may_queue, priv; 2091 int may_queue, priv;
2089 2092
2090 may_queue = elv_may_queue(q, rw_flags); 2093 may_queue = elv_may_queue(q, rw_flags);
2091 if (may_queue == ELV_MQUEUE_NO) 2094 if (may_queue == ELV_MQUEUE_NO)
2092 goto rq_starved; 2095 goto rq_starved;
2093 2096
2094 if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { 2097 if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
2095 if (rl->count[rw]+1 >= q->nr_requests) { 2098 if (rl->count[rw]+1 >= q->nr_requests) {
2096 ioc = current_io_context(GFP_ATOMIC, q->node); 2099 ioc = current_io_context(GFP_ATOMIC, q->node);
2097 /* 2100 /*
2098 * The queue will fill after this allocation, so set 2101 * The queue will fill after this allocation, so set
2099 * it as full, and mark this process as "batching". 2102 * it as full, and mark this process as "batching".
2100 * This process will be allowed to complete a batch of 2103 * This process will be allowed to complete a batch of
2101 * requests, others will be blocked. 2104 * requests, others will be blocked.
2102 */ 2105 */
2103 if (!blk_queue_full(q, rw)) { 2106 if (!blk_queue_full(q, rw)) {
2104 ioc_set_batching(q, ioc); 2107 ioc_set_batching(q, ioc);
2105 blk_set_queue_full(q, rw); 2108 blk_set_queue_full(q, rw);
2106 } else { 2109 } else {
2107 if (may_queue != ELV_MQUEUE_MUST 2110 if (may_queue != ELV_MQUEUE_MUST
2108 && !ioc_batching(q, ioc)) { 2111 && !ioc_batching(q, ioc)) {
2109 /* 2112 /*
2110 * The queue is full and the allocating 2113 * The queue is full and the allocating
2111 * process is not a "batcher", and not 2114 * process is not a "batcher", and not
2112 * exempted by the IO scheduler 2115 * exempted by the IO scheduler
2113 */ 2116 */
2114 goto out; 2117 goto out;
2115 } 2118 }
2116 } 2119 }
2117 } 2120 }
2118 blk_set_queue_congested(q, rw); 2121 blk_set_queue_congested(q, rw);
2119 } 2122 }
2120 2123
2121 /* 2124 /*
2122 * Only allow batching queuers to allocate up to 50% over the defined 2125 * Only allow batching queuers to allocate up to 50% over the defined
2123 * limit of requests, otherwise we could have thousands of requests 2126 * limit of requests, otherwise we could have thousands of requests
2124 * allocated with any setting of ->nr_requests 2127 * allocated with any setting of ->nr_requests
2125 */ 2128 */
2126 if (rl->count[rw] >= (3 * q->nr_requests / 2)) 2129 if (rl->count[rw] >= (3 * q->nr_requests / 2))
2127 goto out; 2130 goto out;
2128 2131
2129 rl->count[rw]++; 2132 rl->count[rw]++;
2130 rl->starved[rw] = 0; 2133 rl->starved[rw] = 0;
2131 2134
2132 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 2135 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
2133 if (priv) 2136 if (priv)
2134 rl->elvpriv++; 2137 rl->elvpriv++;
2135 2138
2136 spin_unlock_irq(q->queue_lock); 2139 spin_unlock_irq(q->queue_lock);
2137 2140
2138 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); 2141 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
2139 if (unlikely(!rq)) { 2142 if (unlikely(!rq)) {
2140 /* 2143 /*
2141 * Allocation failed presumably due to memory. Undo anything 2144 * Allocation failed presumably due to memory. Undo anything
2142 * we might have messed up. 2145 * we might have messed up.
2143 * 2146 *
2144 * Allocating task should really be put onto the front of the 2147 * Allocating task should really be put onto the front of the
2145 * wait queue, but this is pretty rare. 2148 * wait queue, but this is pretty rare.
2146 */ 2149 */
2147 spin_lock_irq(q->queue_lock); 2150 spin_lock_irq(q->queue_lock);
2148 freed_request(q, rw, priv); 2151 freed_request(q, rw, priv);
2149 2152
2150 /* 2153 /*
2151 * in the very unlikely event that allocation failed and no 2154 * in the very unlikely event that allocation failed and no
2152 * requests for this direction was pending, mark us starved 2155 * requests for this direction was pending, mark us starved
2153 * so that freeing of a request in the other direction will 2156 * so that freeing of a request in the other direction will
2154 * notice us. another possible fix would be to split the 2157 * notice us. another possible fix would be to split the
2155 * rq mempool into READ and WRITE 2158 * rq mempool into READ and WRITE
2156 */ 2159 */
2157 rq_starved: 2160 rq_starved:
2158 if (unlikely(rl->count[rw] == 0)) 2161 if (unlikely(rl->count[rw] == 0))
2159 rl->starved[rw] = 1; 2162 rl->starved[rw] = 1;
2160 2163
2161 goto out; 2164 goto out;
2162 } 2165 }
2163 2166
2164 /* 2167 /*
2165 * ioc may be NULL here, and ioc_batching will be false. That's 2168 * ioc may be NULL here, and ioc_batching will be false. That's
2166 * OK, if the queue is under the request limit then requests need 2169 * OK, if the queue is under the request limit then requests need
2167 * not count toward the nr_batch_requests limit. There will always 2170 * not count toward the nr_batch_requests limit. There will always
2168 * be some limit enforced by BLK_BATCH_TIME. 2171 * be some limit enforced by BLK_BATCH_TIME.
2169 */ 2172 */
2170 if (ioc_batching(q, ioc)) 2173 if (ioc_batching(q, ioc))
2171 ioc->nr_batch_requests--; 2174 ioc->nr_batch_requests--;
2172 2175
2173 rq_init(q, rq); 2176 rq_init(q, rq);
2174 2177
2175 blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); 2178 blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
2176 out: 2179 out:
2177 return rq; 2180 return rq;
2178 } 2181 }
2179 2182
2180 /* 2183 /*
2181 * No available requests for this queue, unplug the device and wait for some 2184 * No available requests for this queue, unplug the device and wait for some
2182 * requests to become available. 2185 * requests to become available.
2183 * 2186 *
2184 * Called with q->queue_lock held, and returns with it unlocked. 2187 * Called with q->queue_lock held, and returns with it unlocked.
2185 */ 2188 */
2186 static struct request *get_request_wait(struct request_queue *q, int rw_flags, 2189 static struct request *get_request_wait(struct request_queue *q, int rw_flags,
2187 struct bio *bio) 2190 struct bio *bio)
2188 { 2191 {
2189 const int rw = rw_flags & 0x01; 2192 const int rw = rw_flags & 0x01;
2190 struct request *rq; 2193 struct request *rq;
2191 2194
2192 rq = get_request(q, rw_flags, bio, GFP_NOIO); 2195 rq = get_request(q, rw_flags, bio, GFP_NOIO);
2193 while (!rq) { 2196 while (!rq) {
2194 DEFINE_WAIT(wait); 2197 DEFINE_WAIT(wait);
2195 struct request_list *rl = &q->rq; 2198 struct request_list *rl = &q->rq;
2196 2199
2197 prepare_to_wait_exclusive(&rl->wait[rw], &wait, 2200 prepare_to_wait_exclusive(&rl->wait[rw], &wait,
2198 TASK_UNINTERRUPTIBLE); 2201 TASK_UNINTERRUPTIBLE);
2199 2202
2200 rq = get_request(q, rw_flags, bio, GFP_NOIO); 2203 rq = get_request(q, rw_flags, bio, GFP_NOIO);
2201 2204
2202 if (!rq) { 2205 if (!rq) {
2203 struct io_context *ioc; 2206 struct io_context *ioc;
2204 2207
2205 blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); 2208 blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
2206 2209
2207 __generic_unplug_device(q); 2210 __generic_unplug_device(q);
2208 spin_unlock_irq(q->queue_lock); 2211 spin_unlock_irq(q->queue_lock);
2209 io_schedule(); 2212 io_schedule();
2210 2213
2211 /* 2214 /*
2212 * After sleeping, we become a "batching" process and 2215 * After sleeping, we become a "batching" process and
2213 * will be able to allocate at least one request, and 2216 * will be able to allocate at least one request, and
2214 * up to a big batch of them for a small period time. 2217 * up to a big batch of them for a small period time.
2215 * See ioc_batching, ioc_set_batching 2218 * See ioc_batching, ioc_set_batching
2216 */ 2219 */
2217 ioc = current_io_context(GFP_NOIO, q->node); 2220 ioc = current_io_context(GFP_NOIO, q->node);
2218 ioc_set_batching(q, ioc); 2221 ioc_set_batching(q, ioc);
2219 2222
2220 spin_lock_irq(q->queue_lock); 2223 spin_lock_irq(q->queue_lock);
2221 } 2224 }
2222 finish_wait(&rl->wait[rw], &wait); 2225 finish_wait(&rl->wait[rw], &wait);
2223 } 2226 }
2224 2227
2225 return rq; 2228 return rq;
2226 } 2229 }
2227 2230
2228 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 2231 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
2229 { 2232 {
2230 struct request *rq; 2233 struct request *rq;
2231 2234
2232 BUG_ON(rw != READ && rw != WRITE); 2235 BUG_ON(rw != READ && rw != WRITE);
2233 2236
2234 spin_lock_irq(q->queue_lock); 2237 spin_lock_irq(q->queue_lock);
2235 if (gfp_mask & __GFP_WAIT) { 2238 if (gfp_mask & __GFP_WAIT) {
2236 rq = get_request_wait(q, rw, NULL); 2239 rq = get_request_wait(q, rw, NULL);
2237 } else { 2240 } else {
2238 rq = get_request(q, rw, NULL, gfp_mask); 2241 rq = get_request(q, rw, NULL, gfp_mask);
2239 if (!rq) 2242 if (!rq)
2240 spin_unlock_irq(q->queue_lock); 2243 spin_unlock_irq(q->queue_lock);
2241 } 2244 }
2242 /* q->queue_lock is unlocked at this point */ 2245 /* q->queue_lock is unlocked at this point */
2243 2246
2244 return rq; 2247 return rq;
2245 } 2248 }
2246 EXPORT_SYMBOL(blk_get_request); 2249 EXPORT_SYMBOL(blk_get_request);
2247 2250
2248 /** 2251 /**
2249 * blk_start_queueing - initiate dispatch of requests to device 2252 * blk_start_queueing - initiate dispatch of requests to device
2250 * @q: request queue to kick into gear 2253 * @q: request queue to kick into gear
2251 * 2254 *
2252 * This is basically a helper to remove the need to know whether a queue 2255 * This is basically a helper to remove the need to know whether a queue
2253 * is plugged or not if someone just wants to initiate dispatch of requests 2256 * is plugged or not if someone just wants to initiate dispatch of requests
2254 * for this queue. 2257 * for this queue.
2255 * 2258 *
2256 * The queue lock must be held with interrupts disabled. 2259 * The queue lock must be held with interrupts disabled.
2257 */ 2260 */
2258 void blk_start_queueing(struct request_queue *q) 2261 void blk_start_queueing(struct request_queue *q)
2259 { 2262 {
2260 if (!blk_queue_plugged(q)) 2263 if (!blk_queue_plugged(q))
2261 q->request_fn(q); 2264 q->request_fn(q);
2262 else 2265 else
2263 __generic_unplug_device(q); 2266 __generic_unplug_device(q);
2264 } 2267 }
2265 EXPORT_SYMBOL(blk_start_queueing); 2268 EXPORT_SYMBOL(blk_start_queueing);
2266 2269
2267 /** 2270 /**
2268 * blk_requeue_request - put a request back on queue 2271 * blk_requeue_request - put a request back on queue
2269 * @q: request queue where request should be inserted 2272 * @q: request queue where request should be inserted
2270 * @rq: request to be inserted 2273 * @rq: request to be inserted
2271 * 2274 *
2272 * Description: 2275 * Description:
2273 * Drivers often keep queueing requests until the hardware cannot accept 2276 * Drivers often keep queueing requests until the hardware cannot accept
2274 * more, when that condition happens we need to put the request back 2277 * more, when that condition happens we need to put the request back
2275 * on the queue. Must be called with queue lock held. 2278 * on the queue. Must be called with queue lock held.
2276 */ 2279 */
2277 void blk_requeue_request(struct request_queue *q, struct request *rq) 2280 void blk_requeue_request(struct request_queue *q, struct request *rq)
2278 { 2281 {
2279 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 2282 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
2280 2283
2281 if (blk_rq_tagged(rq)) 2284 if (blk_rq_tagged(rq))
2282 blk_queue_end_tag(q, rq); 2285 blk_queue_end_tag(q, rq);
2283 2286
2284 elv_requeue_request(q, rq); 2287 elv_requeue_request(q, rq);
2285 } 2288 }
2286 2289
2287 EXPORT_SYMBOL(blk_requeue_request); 2290 EXPORT_SYMBOL(blk_requeue_request);
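A sketch of the requeue pattern the kernel-doc above describes: the strategy routine dequeues a request, the hardware refuses it, so the request goes back on the queue and the queue is stopped until the device frees up. mydev_hw_busy() and mydev_issue() are hypothetical hardware hooks.

#include <linux/blkdev.h>

/* hypothetical hardware hooks */
extern int mydev_hw_busy(void);
extern void mydev_issue(struct request *rq);

static void mydev_strategy(struct request_queue *q)
{
	struct request *rq;

	/* entered with q->queue_lock held */
	while ((rq = elv_next_request(q)) != NULL) {
		blkdev_dequeue_request(rq);

		if (mydev_hw_busy()) {
			/*
			 * hardware cannot accept more: put the request back,
			 * stop the queue, and restart it with blk_start_queue()
			 * from the completion interrupt
			 */
			blk_requeue_request(q, rq);
			blk_stop_queue(q);
			break;
		}

		mydev_issue(rq);	/* hand off to the hardware */
	}
}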
2288 2291
2289 /** 2292 /**
2290 * blk_insert_request - insert a special request in to a request queue 2293 * blk_insert_request - insert a special request in to a request queue
2291 * @q: request queue where request should be inserted 2294 * @q: request queue where request should be inserted
2292 * @rq: request to be inserted 2295 * @rq: request to be inserted
2293 * @at_head: insert request at head or tail of queue 2296 * @at_head: insert request at head or tail of queue
2294 * @data: private data 2297 * @data: private data
2295 * 2298 *
2296 * Description: 2299 * Description:
2297 * Many block devices need to execute commands asynchronously, so they don't 2300 * Many block devices need to execute commands asynchronously, so they don't
2298 * block the whole kernel from preemption during request execution. This is 2301 * block the whole kernel from preemption during request execution. This is
2299 * accomplished normally by inserting artificial requests tagged as 2302 * accomplished normally by inserting artificial requests tagged as
2300 * REQ_SPECIAL in to the corresponding request queue, and letting them be 2303 * REQ_SPECIAL in to the corresponding request queue, and letting them be
2301 * scheduled for actual execution by the request queue. 2304 * scheduled for actual execution by the request queue.
2302 * 2305 *
2303 * We have the option of inserting the head or the tail of the queue. 2306 * We have the option of inserting the head or the tail of the queue.
2304 * Typically we use the tail for new ioctls and so forth. We use the head 2307 * Typically we use the tail for new ioctls and so forth. We use the head
2305 * of the queue for things like a QUEUE_FULL message from a device, or a 2308 * of the queue for things like a QUEUE_FULL message from a device, or a
2306 * host that is unable to accept a particular command. 2309 * host that is unable to accept a particular command.
2307 */ 2310 */
2308 void blk_insert_request(struct request_queue *q, struct request *rq, 2311 void blk_insert_request(struct request_queue *q, struct request *rq,
2309 int at_head, void *data) 2312 int at_head, void *data)
2310 { 2313 {
2311 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2314 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2312 unsigned long flags; 2315 unsigned long flags;
2313 2316
2314 /* 2317 /*
2315 * tell I/O scheduler that this isn't a regular read/write (ie it 2318 * tell I/O scheduler that this isn't a regular read/write (ie it
2316 * must not attempt merges on this) and that it acts as a soft 2319 * must not attempt merges on this) and that it acts as a soft
2317 * barrier 2320 * barrier
2318 */ 2321 */
2319 rq->cmd_type = REQ_TYPE_SPECIAL; 2322 rq->cmd_type = REQ_TYPE_SPECIAL;
2320 rq->cmd_flags |= REQ_SOFTBARRIER; 2323 rq->cmd_flags |= REQ_SOFTBARRIER;
2321 2324
2322 rq->special = data; 2325 rq->special = data;
2323 2326
2324 spin_lock_irqsave(q->queue_lock, flags); 2327 spin_lock_irqsave(q->queue_lock, flags);
2325 2328
2326 /* 2329 /*
2327 * If command is tagged, release the tag 2330 * If command is tagged, release the tag
2328 */ 2331 */
2329 if (blk_rq_tagged(rq)) 2332 if (blk_rq_tagged(rq))
2330 blk_queue_end_tag(q, rq); 2333 blk_queue_end_tag(q, rq);
2331 2334
2332 drive_stat_acct(rq, rq->nr_sectors, 1); 2335 drive_stat_acct(rq, rq->nr_sectors, 1);
2333 __elv_add_request(q, rq, where, 0); 2336 __elv_add_request(q, rq, where, 0);
2334 blk_start_queueing(q); 2337 blk_start_queueing(q);
2335 spin_unlock_irqrestore(q->queue_lock, flags); 2338 spin_unlock_irqrestore(q->queue_lock, flags);
2336 } 2339 }
2337 2340
2338 EXPORT_SYMBOL(blk_insert_request); 2341 EXPORT_SYMBOL(blk_insert_request);
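A sketch of the use case described above: pushing a driver-private command to the head of the queue, e.g. in response to a QUEUE_FULL condition. The my_special_cmd cookie is hypothetical; the driver's request function would recognise it via rq->special.

#include <linux/blkdev.h>

static void mydev_insert_special(struct request_queue *q, void *my_special_cmd)
{
	struct request *rq;

	/* may sleep; see blk_get_request() above */
	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return;

	/*
	 * blk_insert_request() marks the request REQ_TYPE_SPECIAL and
	 * REQ_SOFTBARRIER, stores the cookie in rq->special and kicks
	 * the queue; at_head = 1 puts it at the front.
	 */
	blk_insert_request(q, rq, 1, my_special_cmd);
}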
2339 2342
2340 static int __blk_rq_unmap_user(struct bio *bio) 2343 static int __blk_rq_unmap_user(struct bio *bio)
2341 { 2344 {
2342 int ret = 0; 2345 int ret = 0;
2343 2346
2344 if (bio) { 2347 if (bio) {
2345 if (bio_flagged(bio, BIO_USER_MAPPED)) 2348 if (bio_flagged(bio, BIO_USER_MAPPED))
2346 bio_unmap_user(bio); 2349 bio_unmap_user(bio);
2347 else 2350 else
2348 ret = bio_uncopy_user(bio); 2351 ret = bio_uncopy_user(bio);
2349 } 2352 }
2350 2353
2351 return ret; 2354 return ret;
2352 } 2355 }
2353 2356
2354 int blk_rq_append_bio(struct request_queue *q, struct request *rq, 2357 int blk_rq_append_bio(struct request_queue *q, struct request *rq,
2355 struct bio *bio) 2358 struct bio *bio)
2356 { 2359 {
2357 if (!rq->bio) 2360 if (!rq->bio)
2358 blk_rq_bio_prep(q, rq, bio); 2361 blk_rq_bio_prep(q, rq, bio);
2359 else if (!ll_back_merge_fn(q, rq, bio)) 2362 else if (!ll_back_merge_fn(q, rq, bio))
2360 return -EINVAL; 2363 return -EINVAL;
2361 else { 2364 else {
2362 rq->biotail->bi_next = bio; 2365 rq->biotail->bi_next = bio;
2363 rq->biotail = bio; 2366 rq->biotail = bio;
2364 2367
2365 rq->data_len += bio->bi_size; 2368 rq->data_len += bio->bi_size;
2366 } 2369 }
2367 return 0; 2370 return 0;
2368 } 2371 }
2369 EXPORT_SYMBOL(blk_rq_append_bio); 2372 EXPORT_SYMBOL(blk_rq_append_bio);
2370 2373
2371 static int __blk_rq_map_user(struct request_queue *q, struct request *rq, 2374 static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
2372 void __user *ubuf, unsigned int len) 2375 void __user *ubuf, unsigned int len)
2373 { 2376 {
2374 unsigned long uaddr; 2377 unsigned long uaddr;
2375 struct bio *bio, *orig_bio; 2378 struct bio *bio, *orig_bio;
2376 int reading, ret; 2379 int reading, ret;
2377 2380
2378 reading = rq_data_dir(rq) == READ; 2381 reading = rq_data_dir(rq) == READ;
2379 2382
2380 /* 2383 /*
2381 * if alignment requirement is satisfied, map in user pages for 2384 * if alignment requirement is satisfied, map in user pages for
2382 * direct dma. else, set up kernel bounce buffers 2385 * direct dma. else, set up kernel bounce buffers
2383 */ 2386 */
2384 uaddr = (unsigned long) ubuf; 2387 uaddr = (unsigned long) ubuf;
2385 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q))) 2388 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q)))
2386 bio = bio_map_user(q, NULL, uaddr, len, reading); 2389 bio = bio_map_user(q, NULL, uaddr, len, reading);
2387 else 2390 else
2388 bio = bio_copy_user(q, uaddr, len, reading); 2391 bio = bio_copy_user(q, uaddr, len, reading);
2389 2392
2390 if (IS_ERR(bio)) 2393 if (IS_ERR(bio))
2391 return PTR_ERR(bio); 2394 return PTR_ERR(bio);
2392 2395
2393 orig_bio = bio; 2396 orig_bio = bio;
2394 blk_queue_bounce(q, &bio); 2397 blk_queue_bounce(q, &bio);
2395 2398
2396 /* 2399 /*
2397 * We link the bounce buffer in and could have to traverse it 2400 * We link the bounce buffer in and could have to traverse it
2398 * later so we have to get a ref to prevent it from being freed 2401 * later so we have to get a ref to prevent it from being freed
2399 */ 2402 */
2400 bio_get(bio); 2403 bio_get(bio);
2401 2404
2402 ret = blk_rq_append_bio(q, rq, bio); 2405 ret = blk_rq_append_bio(q, rq, bio);
2403 if (!ret) 2406 if (!ret)
2404 return bio->bi_size; 2407 return bio->bi_size;
2405 2408
2406 /* if it was bounced we must call the end io function */ 2409 /* if it was bounced we must call the end io function */
2407 bio_endio(bio, 0); 2410 bio_endio(bio, 0);
2408 __blk_rq_unmap_user(orig_bio); 2411 __blk_rq_unmap_user(orig_bio);
2409 bio_put(bio); 2412 bio_put(bio);
2410 return ret; 2413 return ret;
2411 } 2414 }
2412 2415
2413 /** 2416 /**
2414 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage 2417 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
2415 * @q: request queue where request should be inserted 2418 * @q: request queue where request should be inserted
2416 * @rq: request structure to fill 2419 * @rq: request structure to fill
2417 * @ubuf: the user buffer 2420 * @ubuf: the user buffer
2418 * @len: length of user data 2421 * @len: length of user data
2419 * 2422 *
2420 * Description: 2423 * Description:
2421 * Data will be mapped directly for zero copy io, if possible. Otherwise 2424 * Data will be mapped directly for zero copy io, if possible. Otherwise
2422 * a kernel bounce buffer is used. 2425 * a kernel bounce buffer is used.
2423 * 2426 *
2424 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2427 * A matching blk_rq_unmap_user() must be issued at the end of io, while
2425 * still in process context. 2428 * still in process context.
2426 * 2429 *
2427 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2430 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
2428 * before being submitted to the device, as pages mapped may be out of 2431 * before being submitted to the device, as pages mapped may be out of
2429 * reach. It's the caller's responsibility to make sure this happens. The 2432 * reach. It's the caller's responsibility to make sure this happens. The
2430 * original bio must be passed back in to blk_rq_unmap_user() for proper 2433 * original bio must be passed back in to blk_rq_unmap_user() for proper
2431 * unmapping. 2434 * unmapping.
2432 */ 2435 */
2433 int blk_rq_map_user(struct request_queue *q, struct request *rq, 2436 int blk_rq_map_user(struct request_queue *q, struct request *rq,
2434 void __user *ubuf, unsigned long len) 2437 void __user *ubuf, unsigned long len)
2435 { 2438 {
2436 unsigned long bytes_read = 0; 2439 unsigned long bytes_read = 0;
2437 struct bio *bio = NULL; 2440 struct bio *bio = NULL;
2438 int ret; 2441 int ret;
2439 2442
2440 if (len > (q->max_hw_sectors << 9)) 2443 if (len > (q->max_hw_sectors << 9))
2441 return -EINVAL; 2444 return -EINVAL;
2442 if (!len || !ubuf) 2445 if (!len || !ubuf)
2443 return -EINVAL; 2446 return -EINVAL;
2444 2447
2445 while (bytes_read != len) { 2448 while (bytes_read != len) {
2446 unsigned long map_len, end, start; 2449 unsigned long map_len, end, start;
2447 2450
2448 map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); 2451 map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE);
2449 end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) 2452 end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1)
2450 >> PAGE_SHIFT; 2453 >> PAGE_SHIFT;
2451 start = (unsigned long)ubuf >> PAGE_SHIFT; 2454 start = (unsigned long)ubuf >> PAGE_SHIFT;
2452 2455
2453 /* 2456 /*
2454 * A bad offset could cause us to require BIO_MAX_PAGES + 1 2457 * A bad offset could cause us to require BIO_MAX_PAGES + 1
2455 * pages. If this happens we just lower the requested 2458 * pages. If this happens we just lower the requested
2456 * mapping len by a page so that we can fit 2459 * mapping len by a page so that we can fit
2457 */ 2460 */
2458 if (end - start > BIO_MAX_PAGES) 2461 if (end - start > BIO_MAX_PAGES)
2459 map_len -= PAGE_SIZE; 2462 map_len -= PAGE_SIZE;
2460 2463
2461 ret = __blk_rq_map_user(q, rq, ubuf, map_len); 2464 ret = __blk_rq_map_user(q, rq, ubuf, map_len);
2462 if (ret < 0) 2465 if (ret < 0)
2463 goto unmap_rq; 2466 goto unmap_rq;
2464 if (!bio) 2467 if (!bio)
2465 bio = rq->bio; 2468 bio = rq->bio;
2466 bytes_read += ret; 2469 bytes_read += ret;
2467 ubuf += ret; 2470 ubuf += ret;
2468 } 2471 }
2469 2472
2470 rq->buffer = rq->data = NULL; 2473 rq->buffer = rq->data = NULL;
2471 return 0; 2474 return 0;
2472 unmap_rq: 2475 unmap_rq:
2473 blk_rq_unmap_user(bio); 2476 blk_rq_unmap_user(bio);
2474 return ret; 2477 return ret;
2475 } 2478 }
2476 2479
2477 EXPORT_SYMBOL(blk_rq_map_user); 2480 EXPORT_SYMBOL(blk_rq_map_user);
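A sketch of the full map/execute/unmap cycle the kernel-doc above calls for, in the style of an SG_IO passthrough. The INQUIRY CDB and the mydev_inquiry() name are illustrative; note that the bio pointer is saved right after mapping so the original chain can be handed back to blk_rq_unmap_user(), as the documentation requires.

#include <linux/blkdev.h>
#include <linux/genhd.h>

static int mydev_inquiry(struct request_queue *q, struct gendisk *disk,
			 void __user *ubuf, unsigned long len)
{
	struct request *rq;
	struct bio *bio;
	int err;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	rq->cmd[0] = 0x12;			/* INQUIRY */
	rq->cmd[4] = len;
	rq->cmd_len = 6;
	rq->timeout = 10 * HZ;

	err = blk_rq_map_user(q, rq, ubuf, len);
	if (err)
		goto out;

	bio = rq->bio;				/* io completion may change rq->bio */
	err = blk_execute_rq(q, disk, rq, 0);

	if (blk_rq_unmap_user(bio))
		err = err ? err : -EFAULT;
out:
	blk_put_request(rq);
	return err;
}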
2478 2481
2479 /** 2482 /**
2480 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage 2483 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
2481 * @q: request queue where request should be inserted 2484 * @q: request queue where request should be inserted
2482 * @rq: request to map data to 2485 * @rq: request to map data to
2483 * @iov: pointer to the iovec 2486 * @iov: pointer to the iovec
2484 * @iov_count: number of elements in the iovec 2487 * @iov_count: number of elements in the iovec
2485 * @len: I/O byte count 2488 * @len: I/O byte count
2486 * 2489 *
2487 * Description: 2490 * Description:
2488 * Data will be mapped directly for zero copy io, if possible. Otherwise 2491 * Data will be mapped directly for zero copy io, if possible. Otherwise
2489 * a kernel bounce buffer is used. 2492 * a kernel bounce buffer is used.
2490 * 2493 *
2491 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2494 * A matching blk_rq_unmap_user() must be issued at the end of io, while
2492 * still in process context. 2495 * still in process context.
2493 * 2496 *
2494 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2497 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
2495 * before being submitted to the device, as pages mapped may be out of 2498 * before being submitted to the device, as pages mapped may be out of
2496 * reach. It's the caller's responsibility to make sure this happens. The 2499 * reach. It's the caller's responsibility to make sure this happens. The
2497 * original bio must be passed back in to blk_rq_unmap_user() for proper 2500 * original bio must be passed back in to blk_rq_unmap_user() for proper
2498 * unmapping. 2501 * unmapping.
2499 */ 2502 */
2500 int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, 2503 int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
2501 struct sg_iovec *iov, int iov_count, unsigned int len) 2504 struct sg_iovec *iov, int iov_count, unsigned int len)
2502 { 2505 {
2503 struct bio *bio; 2506 struct bio *bio;
2504 2507
2505 if (!iov || iov_count <= 0) 2508 if (!iov || iov_count <= 0)
2506 return -EINVAL; 2509 return -EINVAL;
2507 2510
2508 /* we don't allow misaligned data like bio_map_user() does. If the 2511 /* we don't allow misaligned data like bio_map_user() does. If the
2509 * user is using sg, they're expected to know the alignment constraints 2512 * user is using sg, they're expected to know the alignment constraints
2510 * and respect them accordingly */ 2513 * and respect them accordingly */
2511 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ); 2514 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ);
2512 if (IS_ERR(bio)) 2515 if (IS_ERR(bio))
2513 return PTR_ERR(bio); 2516 return PTR_ERR(bio);
2514 2517
2515 if (bio->bi_size != len) { 2518 if (bio->bi_size != len) {
2516 bio_endio(bio, 0); 2519 bio_endio(bio, 0);
2517 bio_unmap_user(bio); 2520 bio_unmap_user(bio);
2518 return -EINVAL; 2521 return -EINVAL;
2519 } 2522 }
2520 2523
2521 bio_get(bio); 2524 bio_get(bio);
2522 blk_rq_bio_prep(q, rq, bio); 2525 blk_rq_bio_prep(q, rq, bio);
2523 rq->buffer = rq->data = NULL; 2526 rq->buffer = rq->data = NULL;
2524 return 0; 2527 return 0;
2525 } 2528 }
2526 2529
2527 EXPORT_SYMBOL(blk_rq_map_user_iov); 2530 EXPORT_SYMBOL(blk_rq_map_user_iov);
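A small sketch showing the extra constraint this variant has over blk_rq_map_user(): @len must equal the total size of the iovec, or the mapping is rejected with -EINVAL (see the bi_size check above). The wrapper name is hypothetical, and the iovec is assumed to have been copied into kernel memory already.

#include <linux/blkdev.h>
#include <scsi/sg.h>			/* struct sg_iovec */

static int mydev_map_iovec(struct request_queue *q, struct request *rq,
			   struct sg_iovec *iov, int count)
{
	unsigned int len = 0;
	int i;

	for (i = 0; i < count; i++)
		len += iov[i].iov_len;

	/* each iov entry must also respect queue_dma_alignment(q) */
	return blk_rq_map_user_iov(q, rq, iov, count, len);
}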
2528 2531
2529 /** 2532 /**
2530 * blk_rq_unmap_user - unmap a request with user data 2533 * blk_rq_unmap_user - unmap a request with user data
2531 * @bio: start of bio list 2534 * @bio: start of bio list
2532 * 2535 *
2533 * Description: 2536 * Description:
2534 * Unmap a rq previously mapped by blk_rq_map_user(). The caller must 2537 * Unmap a rq previously mapped by blk_rq_map_user(). The caller must
2535 * supply the original rq->bio from the blk_rq_map_user() return, since 2538 * supply the original rq->bio from the blk_rq_map_user() return, since
2536 * the io completion may have changed rq->bio. 2539 * the io completion may have changed rq->bio.
2537 */ 2540 */
2538 int blk_rq_unmap_user(struct bio *bio) 2541 int blk_rq_unmap_user(struct bio *bio)
2539 { 2542 {
2540 struct bio *mapped_bio; 2543 struct bio *mapped_bio;
2541 int ret = 0, ret2; 2544 int ret = 0, ret2;
2542 2545
2543 while (bio) { 2546 while (bio) {
2544 mapped_bio = bio; 2547 mapped_bio = bio;
2545 if (unlikely(bio_flagged(bio, BIO_BOUNCED))) 2548 if (unlikely(bio_flagged(bio, BIO_BOUNCED)))
2546 mapped_bio = bio->bi_private; 2549 mapped_bio = bio->bi_private;
2547 2550
2548 ret2 = __blk_rq_unmap_user(mapped_bio); 2551 ret2 = __blk_rq_unmap_user(mapped_bio);
2549 if (ret2 && !ret) 2552 if (ret2 && !ret)
2550 ret = ret2; 2553 ret = ret2;
2551 2554
2552 mapped_bio = bio; 2555 mapped_bio = bio;
2553 bio = bio->bi_next; 2556 bio = bio->bi_next;
2554 bio_put(mapped_bio); 2557 bio_put(mapped_bio);
2555 } 2558 }
2556 2559
2557 return ret; 2560 return ret;
2558 } 2561 }
2559 2562
2560 EXPORT_SYMBOL(blk_rq_unmap_user); 2563 EXPORT_SYMBOL(blk_rq_unmap_user);
2561 2564
2562 /** 2565 /**
2563 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage 2566 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage
2564 * @q: request queue where request should be inserted 2567 * @q: request queue where request should be inserted
2565 * @rq: request to fill 2568 * @rq: request to fill
2566 * @kbuf: the kernel buffer 2569 * @kbuf: the kernel buffer
2567 * @len: length of user data 2570 * @len: length of user data
2568 * @gfp_mask: memory allocation flags 2571 * @gfp_mask: memory allocation flags
2569 */ 2572 */
2570 int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, 2573 int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
2571 unsigned int len, gfp_t gfp_mask) 2574 unsigned int len, gfp_t gfp_mask)
2572 { 2575 {
2573 struct bio *bio; 2576 struct bio *bio;
2574 2577
2575 if (len > (q->max_hw_sectors << 9)) 2578 if (len > (q->max_hw_sectors << 9))
2576 return -EINVAL; 2579 return -EINVAL;
2577 if (!len || !kbuf) 2580 if (!len || !kbuf)
2578 return -EINVAL; 2581 return -EINVAL;
2579 2582
2580 bio = bio_map_kern(q, kbuf, len, gfp_mask); 2583 bio = bio_map_kern(q, kbuf, len, gfp_mask);
2581 if (IS_ERR(bio)) 2584 if (IS_ERR(bio))
2582 return PTR_ERR(bio); 2585 return PTR_ERR(bio);
2583 2586
2584 if (rq_data_dir(rq) == WRITE) 2587 if (rq_data_dir(rq) == WRITE)
2585 bio->bi_rw |= (1 << BIO_RW); 2588 bio->bi_rw |= (1 << BIO_RW);
2586 2589
2587 blk_rq_bio_prep(q, rq, bio); 2590 blk_rq_bio_prep(q, rq, bio);
2588 blk_queue_bounce(q, &rq->bio); 2591 blk_queue_bounce(q, &rq->bio);
2589 rq->buffer = rq->data = NULL; 2592 rq->buffer = rq->data = NULL;
2590 return 0; 2593 return 0;
2591 } 2594 }
2592 2595
2593 EXPORT_SYMBOL(blk_rq_map_kern); 2596 EXPORT_SYMBOL(blk_rq_map_kern);
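The same passthrough pattern with a kernel buffer, as a sketch only; the MODE SENSE helper and its CDB are hypothetical. The bio's data direction follows the rw value given to blk_get_request(), per the WRITE check in blk_rq_map_kern() above.

#include <linux/blkdev.h>
#include <linux/genhd.h>

static int mydev_mode_sense(struct request_queue *q, struct gendisk *disk,
			    void *buf, unsigned int len)
{
	struct request *rq;
	int err;

	rq = blk_get_request(q, READ, GFP_KERNEL);	/* READ: device -> buf */
	if (!rq)
		return -ENOMEM;

	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	rq->cmd[0] = 0x1a;			/* MODE SENSE(6) */
	rq->cmd[4] = len;
	rq->cmd_len = 6;
	rq->timeout = 10 * HZ;

	err = blk_rq_map_kern(q, rq, buf, len, GFP_KERNEL);
	if (!err)
		err = blk_execute_rq(q, disk, rq, 0);

	blk_put_request(rq);
	return err;
}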
2594 2597
2595 /** 2598 /**
2596 * blk_execute_rq_nowait - insert a request into queue for execution 2599 * blk_execute_rq_nowait - insert a request into queue for execution
2597 * @q: queue to insert the request in 2600 * @q: queue to insert the request in
2598 * @bd_disk: matching gendisk 2601 * @bd_disk: matching gendisk
2599 * @rq: request to insert 2602 * @rq: request to insert
2600 * @at_head: insert request at head or tail of queue 2603 * @at_head: insert request at head or tail of queue
2601 * @done: I/O completion handler 2604 * @done: I/O completion handler
2602 * 2605 *
2603 * Description: 2606 * Description:
2604 * Insert a fully prepared request at the back of the io scheduler queue 2607 * Insert a fully prepared request at the back of the io scheduler queue
2605 * for execution. Don't wait for completion. 2608 * for execution. Don't wait for completion.
2606 */ 2609 */
2607 void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, 2610 void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
2608 struct request *rq, int at_head, 2611 struct request *rq, int at_head,
2609 rq_end_io_fn *done) 2612 rq_end_io_fn *done)
2610 { 2613 {
2611 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2614 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2612 2615
2613 rq->rq_disk = bd_disk; 2616 rq->rq_disk = bd_disk;
2614 rq->cmd_flags |= REQ_NOMERGE; 2617 rq->cmd_flags |= REQ_NOMERGE;
2615 rq->end_io = done; 2618 rq->end_io = done;
2616 WARN_ON(irqs_disabled()); 2619 WARN_ON(irqs_disabled());
2617 spin_lock_irq(q->queue_lock); 2620 spin_lock_irq(q->queue_lock);
2618 __elv_add_request(q, rq, where, 1); 2621 __elv_add_request(q, rq, where, 1);
2619 __generic_unplug_device(q); 2622 __generic_unplug_device(q);
2620 spin_unlock_irq(q->queue_lock); 2623 spin_unlock_irq(q->queue_lock);
2621 } 2624 }
2622 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 2625 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
2623 2626
2624 /** 2627 /**
2625 * blk_execute_rq - insert a request into queue for execution 2628 * blk_execute_rq - insert a request into queue for execution
2626 * @q: queue to insert the request in 2629 * @q: queue to insert the request in
2627 * @bd_disk: matching gendisk 2630 * @bd_disk: matching gendisk
2628 * @rq: request to insert 2631 * @rq: request to insert
2629 * @at_head: insert request at head or tail of queue 2632 * @at_head: insert request at head or tail of queue
2630 * 2633 *
2631 * Description: 2634 * Description:
2632 * Insert a fully prepared request at the back of the io scheduler queue 2635 * Insert a fully prepared request at the back of the io scheduler queue
2633 * for execution and wait for completion. 2636 * for execution and wait for completion.
2634 */ 2637 */
2635 int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, 2638 int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
2636 struct request *rq, int at_head) 2639 struct request *rq, int at_head)
2637 { 2640 {
2638 DECLARE_COMPLETION_ONSTACK(wait); 2641 DECLARE_COMPLETION_ONSTACK(wait);
2639 char sense[SCSI_SENSE_BUFFERSIZE]; 2642 char sense[SCSI_SENSE_BUFFERSIZE];
2640 int err = 0; 2643 int err = 0;
2641 2644
2642 /* 2645 /*
2643 * we need an extra reference to the request, so we can look at 2646 * we need an extra reference to the request, so we can look at
2644 * it after io completion 2647 * it after io completion
2645 */ 2648 */
2646 rq->ref_count++; 2649 rq->ref_count++;
2647 2650
2648 if (!rq->sense) { 2651 if (!rq->sense) {
2649 memset(sense, 0, sizeof(sense)); 2652 memset(sense, 0, sizeof(sense));
2650 rq->sense = sense; 2653 rq->sense = sense;
2651 rq->sense_len = 0; 2654 rq->sense_len = 0;
2652 } 2655 }
2653 2656
2654 rq->end_io_data = &wait; 2657 rq->end_io_data = &wait;
2655 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); 2658 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
2656 wait_for_completion(&wait); 2659 wait_for_completion(&wait);
2657 2660
2658 if (rq->errors) 2661 if (rq->errors)
2659 err = -EIO; 2662 err = -EIO;
2660 2663
2661 return err; 2664 return err;
2662 } 2665 }
2663 2666
2664 EXPORT_SYMBOL(blk_execute_rq); 2667 EXPORT_SYMBOL(blk_execute_rq);
2665 2668
2666 /** 2669 /**
2667 * blkdev_issue_flush - queue a flush 2670 * blkdev_issue_flush - queue a flush
2668 * @bdev: blockdev to issue flush for 2671 * @bdev: blockdev to issue flush for
2669 * @error_sector: error sector 2672 * @error_sector: error sector
2670 * 2673 *
2671 * Description: 2674 * Description:
2672 * Issue a flush for the block device in question. Caller can supply 2675 * Issue a flush for the block device in question. Caller can supply
2673 * room for storing the error offset in case of a flush error, if they 2676 * room for storing the error offset in case of a flush error, if they
2674 * wish to. Caller must run wait_for_completion() on its own. 2677 * wish to. Caller must run wait_for_completion() on its own.
2675 */ 2678 */
2676 int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) 2679 int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
2677 { 2680 {
2678 struct request_queue *q; 2681 struct request_queue *q;
2679 2682
2680 if (bdev->bd_disk == NULL) 2683 if (bdev->bd_disk == NULL)
2681 return -ENXIO; 2684 return -ENXIO;
2682 2685
2683 q = bdev_get_queue(bdev); 2686 q = bdev_get_queue(bdev);
2684 if (!q) 2687 if (!q)
2685 return -ENXIO; 2688 return -ENXIO;
2686 if (!q->issue_flush_fn) 2689 if (!q->issue_flush_fn)
2687 return -EOPNOTSUPP; 2690 return -EOPNOTSUPP;
2688 2691
2689 return q->issue_flush_fn(q, bdev->bd_disk, error_sector); 2692 return q->issue_flush_fn(q, bdev->bd_disk, error_sector);
2690 } 2693 }
2691 2694
2692 EXPORT_SYMBOL(blkdev_issue_flush); 2695 EXPORT_SYMBOL(blkdev_issue_flush);
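A sketch of a typical caller, e.g. a filesystem flushing the device cache around a commit; the myfs_* name is hypothetical. Callers conventionally treat -EOPNOTSUPP (no ->issue_flush_fn) as "nothing to flush" rather than a hard error.

#include <linux/fs.h>
#include <linux/blkdev.h>

static int myfs_flush_device(struct block_device *bdev)
{
	sector_t error_sector;
	int err;

	err = blkdev_issue_flush(bdev, &error_sector);
	if (err == -EOPNOTSUPP)
		err = 0;	/* no write cache / no flush method: not fatal */

	return err;
}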
2693 2696
2694 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) 2697 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
2695 { 2698 {
2696 int rw = rq_data_dir(rq); 2699 int rw = rq_data_dir(rq);
2697 2700
2698 if (!blk_fs_request(rq) || !rq->rq_disk) 2701 if (!blk_fs_request(rq) || !rq->rq_disk)
2699 return; 2702 return;
2700 2703
2701 if (!new_io) { 2704 if (!new_io) {
2702 __disk_stat_inc(rq->rq_disk, merges[rw]); 2705 __disk_stat_inc(rq->rq_disk, merges[rw]);
2703 } else { 2706 } else {
2704 disk_round_stats(rq->rq_disk); 2707 disk_round_stats(rq->rq_disk);
2705 rq->rq_disk->in_flight++; 2708 rq->rq_disk->in_flight++;
2706 } 2709 }
2707 } 2710 }
2708 2711
2709 /* 2712 /*
2710 * add-request adds a request to the linked list. 2713 * add-request adds a request to the linked list.
2711 * queue lock is held and interrupts disabled, as we muck with the 2714 * queue lock is held and interrupts disabled, as we muck with the
2712 * request queue list. 2715 * request queue list.
2713 */ 2716 */
2714 static inline void add_request(struct request_queue * q, struct request * req) 2717 static inline void add_request(struct request_queue * q, struct request * req)
2715 { 2718 {
2716 drive_stat_acct(req, req->nr_sectors, 1); 2719 drive_stat_acct(req, req->nr_sectors, 1);
2717 2720
2718 /* 2721 /*
2719 * elevator indicated where it wants this request to be 2722 * elevator indicated where it wants this request to be
2720 * inserted at elevator_merge time 2723 * inserted at elevator_merge time
2721 */ 2724 */
2722 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); 2725 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
2723 } 2726 }
2724 2727
2725 /* 2728 /*
2726 * disk_round_stats() - Round off the performance stats on a struct 2729 * disk_round_stats() - Round off the performance stats on a struct
2727 * disk_stats. 2730 * disk_stats.
2728 * 2731 *
2729 * The average IO queue length and utilisation statistics are maintained 2732 * The average IO queue length and utilisation statistics are maintained
2730 * by observing the current state of the queue length and the amount of 2733 * by observing the current state of the queue length and the amount of
2731 * time it has been in this state for. 2734 * time it has been in this state for.
2732 * 2735 *
2733 * Normally, that accounting is done on IO completion, but that can result 2736 * Normally, that accounting is done on IO completion, but that can result
2734 * in more than a second's worth of IO being accounted for within any one 2737 * in more than a second's worth of IO being accounted for within any one
2735 * second, leading to >100% utilisation. To deal with that, we call this 2738 * second, leading to >100% utilisation. To deal with that, we call this
2736 * function to do a round-off before returning the results when reading 2739 * function to do a round-off before returning the results when reading
2737 * /proc/diskstats. This accounts immediately for all queue usage up to 2740 * /proc/diskstats. This accounts immediately for all queue usage up to
2738 * the current jiffies and restarts the counters again. 2741 * the current jiffies and restarts the counters again.
2739 */ 2742 */
2740 void disk_round_stats(struct gendisk *disk) 2743 void disk_round_stats(struct gendisk *disk)
2741 { 2744 {
2742 unsigned long now = jiffies; 2745 unsigned long now = jiffies;
2743 2746
2744 if (now == disk->stamp) 2747 if (now == disk->stamp)
2745 return; 2748 return;
2746 2749
2747 if (disk->in_flight) { 2750 if (disk->in_flight) {
2748 __disk_stat_add(disk, time_in_queue, 2751 __disk_stat_add(disk, time_in_queue,
2749 disk->in_flight * (now - disk->stamp)); 2752 disk->in_flight * (now - disk->stamp));
2750 __disk_stat_add(disk, io_ticks, (now - disk->stamp)); 2753 __disk_stat_add(disk, io_ticks, (now - disk->stamp));
2751 } 2754 }
2752 disk->stamp = now; 2755 disk->stamp = now;
2753 } 2756 }
2754 2757
2755 EXPORT_SYMBOL_GPL(disk_round_stats); 2758 EXPORT_SYMBOL_GPL(disk_round_stats);
2756 2759
2757 /* 2760 /*
2758 * queue lock must be held 2761 * queue lock must be held
2759 */ 2762 */
2760 void __blk_put_request(struct request_queue *q, struct request *req) 2763 void __blk_put_request(struct request_queue *q, struct request *req)
2761 { 2764 {
2762 if (unlikely(!q)) 2765 if (unlikely(!q))
2763 return; 2766 return;
2764 if (unlikely(--req->ref_count)) 2767 if (unlikely(--req->ref_count))
2765 return; 2768 return;
2766 2769
2767 elv_completed_request(q, req); 2770 elv_completed_request(q, req);
2768 2771
2769 /* 2772 /*
2770 * Request may not have originated from ll_rw_blk. if not, 2773 * Request may not have originated from ll_rw_blk. if not,
2771 * it didn't come out of our reserved rq pools 2774 * it didn't come out of our reserved rq pools
2772 */ 2775 */
2773 if (req->cmd_flags & REQ_ALLOCED) { 2776 if (req->cmd_flags & REQ_ALLOCED) {
2774 int rw = rq_data_dir(req); 2777 int rw = rq_data_dir(req);
2775 int priv = req->cmd_flags & REQ_ELVPRIV; 2778 int priv = req->cmd_flags & REQ_ELVPRIV;
2776 2779
2777 BUG_ON(!list_empty(&req->queuelist)); 2780 BUG_ON(!list_empty(&req->queuelist));
2778 BUG_ON(!hlist_unhashed(&req->hash)); 2781 BUG_ON(!hlist_unhashed(&req->hash));
2779 2782
2780 blk_free_request(q, req); 2783 blk_free_request(q, req);
2781 freed_request(q, rw, priv); 2784 freed_request(q, rw, priv);
2782 } 2785 }
2783 } 2786 }
2784 2787
2785 EXPORT_SYMBOL_GPL(__blk_put_request); 2788 EXPORT_SYMBOL_GPL(__blk_put_request);
2786 2789
2787 void blk_put_request(struct request *req) 2790 void blk_put_request(struct request *req)
2788 { 2791 {
2789 unsigned long flags; 2792 unsigned long flags;
2790 struct request_queue *q = req->q; 2793 struct request_queue *q = req->q;
2791 2794
2792 /* 2795 /*
2793 * Gee, IDE calls in w/ NULL q. Fix IDE and remove the 2796 * Gee, IDE calls in w/ NULL q. Fix IDE and remove the
2794 * following if (q) test. 2797 * following if (q) test.
2795 */ 2798 */
2796 if (q) { 2799 if (q) {
2797 spin_lock_irqsave(q->queue_lock, flags); 2800 spin_lock_irqsave(q->queue_lock, flags);
2798 __blk_put_request(q, req); 2801 __blk_put_request(q, req);
2799 spin_unlock_irqrestore(q->queue_lock, flags); 2802 spin_unlock_irqrestore(q->queue_lock, flags);
2800 } 2803 }
2801 } 2804 }
2802 2805
2803 EXPORT_SYMBOL(blk_put_request); 2806 EXPORT_SYMBOL(blk_put_request);
2804 2807
2805 /** 2808 /**
2806 * blk_end_sync_rq - executes a completion event on a request 2809 * blk_end_sync_rq - executes a completion event on a request
2807 * @rq: request to complete 2810 * @rq: request to complete
2808 * @error: end io status of the request 2811 * @error: end io status of the request
2809 */ 2812 */
2810 void blk_end_sync_rq(struct request *rq, int error) 2813 void blk_end_sync_rq(struct request *rq, int error)
2811 { 2814 {
2812 struct completion *waiting = rq->end_io_data; 2815 struct completion *waiting = rq->end_io_data;
2813 2816
2814 rq->end_io_data = NULL; 2817 rq->end_io_data = NULL;
2815 __blk_put_request(rq->q, rq); 2818 __blk_put_request(rq->q, rq);
2816 2819
2817 /* 2820 /*
2818 * complete last, if this is a stack request the process (and thus 2821 * complete last, if this is a stack request the process (and thus
2819 * the rq pointer) could be invalid right after this complete() 2822 * the rq pointer) could be invalid right after this complete()
2820 */ 2823 */
2821 complete(waiting); 2824 complete(waiting);
2822 } 2825 }
2823 EXPORT_SYMBOL(blk_end_sync_rq); 2826 EXPORT_SYMBOL(blk_end_sync_rq);
2824 2827
2825 /* 2828 /*
2826 * Has to be called with the request spinlock acquired 2829 * Has to be called with the request spinlock acquired
2827 */ 2830 */
2828 static int attempt_merge(struct request_queue *q, struct request *req, 2831 static int attempt_merge(struct request_queue *q, struct request *req,
2829 struct request *next) 2832 struct request *next)
2830 { 2833 {
2831 if (!rq_mergeable(req) || !rq_mergeable(next)) 2834 if (!rq_mergeable(req) || !rq_mergeable(next))
2832 return 0; 2835 return 0;
2833 2836
2834 /* 2837 /*
2835 * not contiguous 2838 * not contiguous
2836 */ 2839 */
2837 if (req->sector + req->nr_sectors != next->sector) 2840 if (req->sector + req->nr_sectors != next->sector)
2838 return 0; 2841 return 0;
2839 2842
2840 if (rq_data_dir(req) != rq_data_dir(next) 2843 if (rq_data_dir(req) != rq_data_dir(next)
2841 || req->rq_disk != next->rq_disk 2844 || req->rq_disk != next->rq_disk
2842 || next->special) 2845 || next->special)
2843 return 0; 2846 return 0;
2844 2847
2845 /* 2848 /*
2846 * If we are allowed to merge, then append bio list 2849 * If we are allowed to merge, then append bio list
2847 * from next to rq and release next. merge_requests_fn 2850 * from next to rq and release next. merge_requests_fn
2848 * will have updated segment counts, update sector 2851 * will have updated segment counts, update sector
2849 * counts here. 2852 * counts here.
2850 */ 2853 */
2851 if (!ll_merge_requests_fn(q, req, next)) 2854 if (!ll_merge_requests_fn(q, req, next))
2852 return 0; 2855 return 0;
2853 2856
2854 /* 2857 /*
2855 * At this point we have either done a back merge 2858 * At this point we have either done a back merge
2856 * or front merge. We need the smaller start_time of 2859 * or front merge. We need the smaller start_time of
2857 * the merged requests to be the current request 2860 * the merged requests to be the current request
2858 * for accounting purposes. 2861 * for accounting purposes.
2859 */ 2862 */
2860 if (time_after(req->start_time, next->start_time)) 2863 if (time_after(req->start_time, next->start_time))
2861 req->start_time = next->start_time; 2864 req->start_time = next->start_time;
2862 2865
2863 req->biotail->bi_next = next->bio; 2866 req->biotail->bi_next = next->bio;
2864 req->biotail = next->biotail; 2867 req->biotail = next->biotail;
2865 2868
2866 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; 2869 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
2867 2870
2868 elv_merge_requests(q, req, next); 2871 elv_merge_requests(q, req, next);
2869 2872
2870 if (req->rq_disk) { 2873 if (req->rq_disk) {
2871 disk_round_stats(req->rq_disk); 2874 disk_round_stats(req->rq_disk);
2872 req->rq_disk->in_flight--; 2875 req->rq_disk->in_flight--;
2873 } 2876 }
2874 2877
2875 req->ioprio = ioprio_best(req->ioprio, next->ioprio); 2878 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
2876 2879
2877 __blk_put_request(q, next); 2880 __blk_put_request(q, next);
2878 return 1; 2881 return 1;
2879 } 2882 }
2880 2883
2881 static inline int attempt_back_merge(struct request_queue *q, 2884 static inline int attempt_back_merge(struct request_queue *q,
2882 struct request *rq) 2885 struct request *rq)
2883 { 2886 {
2884 struct request *next = elv_latter_request(q, rq); 2887 struct request *next = elv_latter_request(q, rq);
2885 2888
2886 if (next) 2889 if (next)
2887 return attempt_merge(q, rq, next); 2890 return attempt_merge(q, rq, next);
2888 2891
2889 return 0; 2892 return 0;
2890 } 2893 }
2891 2894
2892 static inline int attempt_front_merge(struct request_queue *q, 2895 static inline int attempt_front_merge(struct request_queue *q,
2893 struct request *rq) 2896 struct request *rq)
2894 { 2897 {
2895 struct request *prev = elv_former_request(q, rq); 2898 struct request *prev = elv_former_request(q, rq);
2896 2899
2897 if (prev) 2900 if (prev)
2898 return attempt_merge(q, prev, rq); 2901 return attempt_merge(q, prev, rq);
2899 2902
2900 return 0; 2903 return 0;
2901 } 2904 }
2902 2905
2903 static void init_request_from_bio(struct request *req, struct bio *bio) 2906 static void init_request_from_bio(struct request *req, struct bio *bio)
2904 { 2907 {
2905 req->cmd_type = REQ_TYPE_FS; 2908 req->cmd_type = REQ_TYPE_FS;
2906 2909
2907 /* 2910 /*
2908 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) 2911 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
2909 */ 2912 */
2910 if (bio_rw_ahead(bio) || bio_failfast(bio)) 2913 if (bio_rw_ahead(bio) || bio_failfast(bio))
2911 req->cmd_flags |= REQ_FAILFAST; 2914 req->cmd_flags |= REQ_FAILFAST;
2912 2915
2913 /* 2916 /*
2914 * REQ_BARRIER implies no merging, but let's make it explicit 2917 * REQ_BARRIER implies no merging, but let's make it explicit
2915 */ 2918 */
2916 if (unlikely(bio_barrier(bio))) 2919 if (unlikely(bio_barrier(bio)))
2917 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); 2920 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
2918 2921
2919 if (bio_sync(bio)) 2922 if (bio_sync(bio))
2920 req->cmd_flags |= REQ_RW_SYNC; 2923 req->cmd_flags |= REQ_RW_SYNC;
2921 if (bio_rw_meta(bio)) 2924 if (bio_rw_meta(bio))
2922 req->cmd_flags |= REQ_RW_META; 2925 req->cmd_flags |= REQ_RW_META;
2923 2926
2924 req->errors = 0; 2927 req->errors = 0;
2925 req->hard_sector = req->sector = bio->bi_sector; 2928 req->hard_sector = req->sector = bio->bi_sector;
2926 req->ioprio = bio_prio(bio); 2929 req->ioprio = bio_prio(bio);
2927 req->start_time = jiffies; 2930 req->start_time = jiffies;
2928 blk_rq_bio_prep(req->q, req, bio); 2931 blk_rq_bio_prep(req->q, req, bio);
2929 } 2932 }
2930 2933
2931 static int __make_request(struct request_queue *q, struct bio *bio) 2934 static int __make_request(struct request_queue *q, struct bio *bio)
2932 { 2935 {
2933 struct request *req; 2936 struct request *req;
2934 int el_ret, nr_sectors, barrier, err; 2937 int el_ret, nr_sectors, barrier, err;
2935 const unsigned short prio = bio_prio(bio); 2938 const unsigned short prio = bio_prio(bio);
2936 const int sync = bio_sync(bio); 2939 const int sync = bio_sync(bio);
2937 int rw_flags; 2940 int rw_flags;
2938 2941
2939 nr_sectors = bio_sectors(bio); 2942 nr_sectors = bio_sectors(bio);
2940 2943
2941 /* 2944 /*
2942 * low level driver can indicate that it wants pages above a 2945 * low level driver can indicate that it wants pages above a
2943 * certain limit bounced to low memory (ie for highmem, or even 2946 * certain limit bounced to low memory (ie for highmem, or even
2944 * ISA dma in theory) 2947 * ISA dma in theory)
2945 */ 2948 */
2946 blk_queue_bounce(q, &bio); 2949 blk_queue_bounce(q, &bio);
2947 2950
2948 barrier = bio_barrier(bio); 2951 barrier = bio_barrier(bio);
2949 if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) { 2952 if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
2950 err = -EOPNOTSUPP; 2953 err = -EOPNOTSUPP;
2951 goto end_io; 2954 goto end_io;
2952 } 2955 }
2953 2956
2954 spin_lock_irq(q->queue_lock); 2957 spin_lock_irq(q->queue_lock);
2955 2958
2956 if (unlikely(barrier) || elv_queue_empty(q)) 2959 if (unlikely(barrier) || elv_queue_empty(q))
2957 goto get_rq; 2960 goto get_rq;
2958 2961
2959 el_ret = elv_merge(q, &req, bio); 2962 el_ret = elv_merge(q, &req, bio);
2960 switch (el_ret) { 2963 switch (el_ret) {
2961 case ELEVATOR_BACK_MERGE: 2964 case ELEVATOR_BACK_MERGE:
2962 BUG_ON(!rq_mergeable(req)); 2965 BUG_ON(!rq_mergeable(req));
2963 2966
2964 if (!ll_back_merge_fn(q, req, bio)) 2967 if (!ll_back_merge_fn(q, req, bio))
2965 break; 2968 break;
2966 2969
2967 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 2970 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
2968 2971
2969 req->biotail->bi_next = bio; 2972 req->biotail->bi_next = bio;
2970 req->biotail = bio; 2973 req->biotail = bio;
2971 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2974 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2972 req->ioprio = ioprio_best(req->ioprio, prio); 2975 req->ioprio = ioprio_best(req->ioprio, prio);
2973 drive_stat_acct(req, nr_sectors, 0); 2976 drive_stat_acct(req, nr_sectors, 0);
2974 if (!attempt_back_merge(q, req)) 2977 if (!attempt_back_merge(q, req))
2975 elv_merged_request(q, req, el_ret); 2978 elv_merged_request(q, req, el_ret);
2976 goto out; 2979 goto out;
2977 2980
2978 case ELEVATOR_FRONT_MERGE: 2981 case ELEVATOR_FRONT_MERGE:
2979 BUG_ON(!rq_mergeable(req)); 2982 BUG_ON(!rq_mergeable(req));
2980 2983
2981 if (!ll_front_merge_fn(q, req, bio)) 2984 if (!ll_front_merge_fn(q, req, bio))
2982 break; 2985 break;
2983 2986
2984 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 2987 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
2985 2988
2986 bio->bi_next = req->bio; 2989 bio->bi_next = req->bio;
2987 req->bio = bio; 2990 req->bio = bio;
2988 2991
2989 /* 2992 /*
2990 * may not be valid. if the low level driver said 2993 * may not be valid. if the low level driver said
2991 * it didn't need a bounce buffer then it better 2994 * it didn't need a bounce buffer then it better
2992 * not touch req->buffer either... 2995 * not touch req->buffer either...
2993 */ 2996 */
2994 req->buffer = bio_data(bio); 2997 req->buffer = bio_data(bio);
2995 req->current_nr_sectors = bio_cur_sectors(bio); 2998 req->current_nr_sectors = bio_cur_sectors(bio);
2996 req->hard_cur_sectors = req->current_nr_sectors; 2999 req->hard_cur_sectors = req->current_nr_sectors;
2997 req->sector = req->hard_sector = bio->bi_sector; 3000 req->sector = req->hard_sector = bio->bi_sector;
2998 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 3001 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2999 req->ioprio = ioprio_best(req->ioprio, prio); 3002 req->ioprio = ioprio_best(req->ioprio, prio);
3000 drive_stat_acct(req, nr_sectors, 0); 3003 drive_stat_acct(req, nr_sectors, 0);
3001 if (!attempt_front_merge(q, req)) 3004 if (!attempt_front_merge(q, req))
3002 elv_merged_request(q, req, el_ret); 3005 elv_merged_request(q, req, el_ret);
3003 goto out; 3006 goto out;
3004 3007
3005 /* ELV_NO_MERGE: elevator says don't/can't merge. */ 3008 /* ELV_NO_MERGE: elevator says don't/can't merge. */
3006 default: 3009 default:
3007 ; 3010 ;
3008 } 3011 }
3009 3012
3010 get_rq: 3013 get_rq:
3011 /* 3014 /*
3012 * This sync check and mask will be re-done in init_request_from_bio(), 3015 * This sync check and mask will be re-done in init_request_from_bio(),
3013 * but we need to set it earlier to expose the sync flag to the 3016 * but we need to set it earlier to expose the sync flag to the
3014 * rq allocator and io schedulers. 3017 * rq allocator and io schedulers.
3015 */ 3018 */
3016 rw_flags = bio_data_dir(bio); 3019 rw_flags = bio_data_dir(bio);
3017 if (sync) 3020 if (sync)
3018 rw_flags |= REQ_RW_SYNC; 3021 rw_flags |= REQ_RW_SYNC;
3019 3022
3020 /* 3023 /*
3021 * Grab a free request. This may sleep but cannot fail. 3024 * Grab a free request. This may sleep but cannot fail.
3022 * Returns with the queue unlocked. 3025 * Returns with the queue unlocked.
3023 */ 3026 */
3024 req = get_request_wait(q, rw_flags, bio); 3027 req = get_request_wait(q, rw_flags, bio);
3025 3028
3026 /* 3029 /*
3027 * After dropping the lock and possibly sleeping here, our request 3030 * After dropping the lock and possibly sleeping here, our request
3028 * may now be mergeable after it had proven unmergeable (above). 3031 * may now be mergeable after it had proven unmergeable (above).
3029 * We don't worry about that case for efficiency. It won't happen 3032 * We don't worry about that case for efficiency. It won't happen
3030 * often, and the elevators are able to handle it. 3033 * often, and the elevators are able to handle it.
3031 */ 3034 */
3032 init_request_from_bio(req, bio); 3035 init_request_from_bio(req, bio);
3033 3036
3034 spin_lock_irq(q->queue_lock); 3037 spin_lock_irq(q->queue_lock);
3035 if (elv_queue_empty(q)) 3038 if (elv_queue_empty(q))
3036 blk_plug_device(q); 3039 blk_plug_device(q);
3037 add_request(q, req); 3040 add_request(q, req);
3038 out: 3041 out:
3039 if (sync) 3042 if (sync)
3040 __generic_unplug_device(q); 3043 __generic_unplug_device(q);
3041 3044
3042 spin_unlock_irq(q->queue_lock); 3045 spin_unlock_irq(q->queue_lock);
3043 return 0; 3046 return 0;
3044 3047
3045 end_io: 3048 end_io:
3046 bio_endio(bio, err); 3049 bio_endio(bio, err);
3047 return 0; 3050 return 0;
3048 } 3051 }
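Editor's note: __make_request() still fails any barrier with -EOPNOTSUPP while q->next_ordered is QUEUE_ORDERED_NONE, so empty barriers only get through on queues that registered an ordered mode and a prepare_flush_fn. A hedged sketch of that registration; my_prepare_flush/my_init_queue are invented names and the 0x35 SYNCHRONIZE CACHE opcode is merely one example of what a driver might issue.

#include <linux/blkdev.h>

/* Illustrative: turn the queue's inserted flush request into a cache flush */
static void my_prepare_flush(struct request_queue *q, struct request *rq)
{
	memset(rq->cmd, 0, sizeof(rq->cmd));
	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	rq->timeout = 60 * HZ;		/* arbitrary example timeout */
	rq->cmd[0] = 0x35;		/* e.g. SCSI SYNCHRONIZE CACHE */
	rq->cmd_len = 10;
}

static void my_init_queue(struct request_queue *q)
{
	/* advertise ordered support so barriers are not rejected above */
	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, my_prepare_flush);
}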
3049 3052
3050 /* 3053 /*
3051 * If bio->bi_dev is a partition, remap the location 3054 * If bio->bi_dev is a partition, remap the location
3052 */ 3055 */
3053 static inline void blk_partition_remap(struct bio *bio) 3056 static inline void blk_partition_remap(struct bio *bio)
3054 { 3057 {
3055 struct block_device *bdev = bio->bi_bdev; 3058 struct block_device *bdev = bio->bi_bdev;
3056 3059
3057 if (bdev != bdev->bd_contains) { 3060 if (bio_sectors(bio) && bdev != bdev->bd_contains) {
3058 struct hd_struct *p = bdev->bd_part; 3061 struct hd_struct *p = bdev->bd_part;
3059 const int rw = bio_data_dir(bio); 3062 const int rw = bio_data_dir(bio);
3060 3063
3061 p->sectors[rw] += bio_sectors(bio); 3064 p->sectors[rw] += bio_sectors(bio);
3062 p->ios[rw]++; 3065 p->ios[rw]++;
3063 3066
3064 bio->bi_sector += p->start_sect; 3067 bio->bi_sector += p->start_sect;
3065 bio->bi_bdev = bdev->bd_contains; 3068 bio->bi_bdev = bdev->bd_contains;
3066 3069
3067 blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio, 3070 blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
3068 bdev->bd_dev, bio->bi_sector, 3071 bdev->bd_dev, bio->bi_sector,
3069 bio->bi_sector - p->start_sect); 3072 bio->bi_sector - p->start_sect);
3070 } 3073 }
3071 } 3074 }
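Editor's note: the new bio_sectors() test above keeps a zero-length barrier from being remapped or charged to per-partition statistics, since it carries no sectors to account. The predicates used further down, bio_empty_barrier() on bios and blk_empty_barrier() on requests, are presumably defined in the header half of this diff; roughly, they test for a barrier that carries no data, along these lines (a sketch, not the exact header text).

/* rough equivalents of the helpers this file relies on */
#define bio_empty_barrier(bio)	(bio_barrier(bio) && !bio_sectors(bio))
#define blk_empty_barrier(rq)	(blk_barrier_rq(rq) && blk_fs_request(rq) && \
				 !(rq)->hard_nr_sectors)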
3072 3075
3073 static void handle_bad_sector(struct bio *bio) 3076 static void handle_bad_sector(struct bio *bio)
3074 { 3077 {
3075 char b[BDEVNAME_SIZE]; 3078 char b[BDEVNAME_SIZE];
3076 3079
3077 printk(KERN_INFO "attempt to access beyond end of device\n"); 3080 printk(KERN_INFO "attempt to access beyond end of device\n");
3078 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", 3081 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
3079 bdevname(bio->bi_bdev, b), 3082 bdevname(bio->bi_bdev, b),
3080 bio->bi_rw, 3083 bio->bi_rw,
3081 (unsigned long long)bio->bi_sector + bio_sectors(bio), 3084 (unsigned long long)bio->bi_sector + bio_sectors(bio),
3082 (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); 3085 (long long)(bio->bi_bdev->bd_inode->i_size >> 9));
3083 3086
3084 set_bit(BIO_EOF, &bio->bi_flags); 3087 set_bit(BIO_EOF, &bio->bi_flags);
3085 } 3088 }
3086 3089
3087 #ifdef CONFIG_FAIL_MAKE_REQUEST 3090 #ifdef CONFIG_FAIL_MAKE_REQUEST
3088 3091
3089 static DECLARE_FAULT_ATTR(fail_make_request); 3092 static DECLARE_FAULT_ATTR(fail_make_request);
3090 3093
3091 static int __init setup_fail_make_request(char *str) 3094 static int __init setup_fail_make_request(char *str)
3092 { 3095 {
3093 return setup_fault_attr(&fail_make_request, str); 3096 return setup_fault_attr(&fail_make_request, str);
3094 } 3097 }
3095 __setup("fail_make_request=", setup_fail_make_request); 3098 __setup("fail_make_request=", setup_fail_make_request);
3096 3099
3097 static int should_fail_request(struct bio *bio) 3100 static int should_fail_request(struct bio *bio)
3098 { 3101 {
3099 if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) || 3102 if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) ||
3100 (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail)) 3103 (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail))
3101 return should_fail(&fail_make_request, bio->bi_size); 3104 return should_fail(&fail_make_request, bio->bi_size);
3102 3105
3103 return 0; 3106 return 0;
3104 } 3107 }
3105 3108
3106 static int __init fail_make_request_debugfs(void) 3109 static int __init fail_make_request_debugfs(void)
3107 { 3110 {
3108 return init_fault_attr_dentries(&fail_make_request, 3111 return init_fault_attr_dentries(&fail_make_request,
3109 "fail_make_request"); 3112 "fail_make_request");
3110 } 3113 }
3111 3114
3112 late_initcall(fail_make_request_debugfs); 3115 late_initcall(fail_make_request_debugfs);
3113 3116
3114 #else /* CONFIG_FAIL_MAKE_REQUEST */ 3117 #else /* CONFIG_FAIL_MAKE_REQUEST */
3115 3118
3116 static inline int should_fail_request(struct bio *bio) 3119 static inline int should_fail_request(struct bio *bio)
3117 { 3120 {
3118 return 0; 3121 return 0;
3119 } 3122 }
3120 3123
3121 #endif /* CONFIG_FAIL_MAKE_REQUEST */ 3124 #endif /* CONFIG_FAIL_MAKE_REQUEST */
3122 3125
3123 /* 3126 /*
3124 * Check whether this bio extends beyond the end of the device. 3127 * Check whether this bio extends beyond the end of the device.
3125 */ 3128 */
3126 static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) 3129 static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
3127 { 3130 {
3128 sector_t maxsector; 3131 sector_t maxsector;
3129 3132
3130 if (!nr_sectors) 3133 if (!nr_sectors)
3131 return 0; 3134 return 0;
3132 3135
3133 /* Test device or partition size, when known. */ 3136 /* Test device or partition size, when known. */
3134 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 3137 maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
3135 if (maxsector) { 3138 if (maxsector) {
3136 sector_t sector = bio->bi_sector; 3139 sector_t sector = bio->bi_sector;
3137 3140
3138 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { 3141 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
3139 /* 3142 /*
3140 * This may well happen - the kernel calls bread() 3143 * This may well happen - the kernel calls bread()
3141 * without checking the size of the device, e.g., when 3144 * without checking the size of the device, e.g., when
3142 * mounting a device. 3145 * mounting a device.
3143 */ 3146 */
3144 handle_bad_sector(bio); 3147 handle_bad_sector(bio);
3145 return 1; 3148 return 1;
3146 } 3149 }
3147 } 3150 }
3148 3151
3149 return 0; 3152 return 0;
3150 } 3153 }
3151 3154
3152 /** 3155 /**
3153 * generic_make_request: hand a buffer to its device driver for I/O 3156 * generic_make_request: hand a buffer to its device driver for I/O
3154 * @bio: The bio describing the location in memory and on the device. 3157 * @bio: The bio describing the location in memory and on the device.
3155 * 3158 *
3156 * generic_make_request() is used to make I/O requests of block 3159 * generic_make_request() is used to make I/O requests of block
3157 * devices. It is passed a &struct bio, which describes the I/O that needs 3160 * devices. It is passed a &struct bio, which describes the I/O that needs
3158 * to be done. 3161 * to be done.
3159 * 3162 *
3160 * generic_make_request() does not return any status. The 3163 * generic_make_request() does not return any status. The
3161 * success/failure status of the request, along with notification of 3164 * success/failure status of the request, along with notification of
3162 * completion, is delivered asynchronously through the bio->bi_end_io 3165 * completion, is delivered asynchronously through the bio->bi_end_io
3163 * function described (one day) elsewhere. 3166 * function described (one day) elsewhere.
3164 * 3167 *
3165 * The caller of generic_make_request must make sure that bi_io_vec 3168 * The caller of generic_make_request must make sure that bi_io_vec
3166 * are set to describe the memory buffer, and that bi_dev and bi_sector are 3169 * are set to describe the memory buffer, and that bi_dev and bi_sector are
3167 * set to describe the device address, and the 3170 * set to describe the device address, and the
3168 * bi_end_io and optionally bi_private are set to describe how 3171 * bi_end_io and optionally bi_private are set to describe how
3169 * completion notification should be signaled. 3172 * completion notification should be signaled.
3170 * 3173 *
3171 * generic_make_request and the drivers it calls may use bi_next if this 3174 * generic_make_request and the drivers it calls may use bi_next if this
3172 * bio happens to be merged with someone else, and may change bi_dev and 3175 * bio happens to be merged with someone else, and may change bi_dev and
3173 * bi_sector for remaps as it sees fit. So the values of these fields 3176 * bi_sector for remaps as it sees fit. So the values of these fields
3174 * should NOT be depended on after the call to generic_make_request. 3177 * should NOT be depended on after the call to generic_make_request.
3175 */ 3178 */
3176 static inline void __generic_make_request(struct bio *bio) 3179 static inline void __generic_make_request(struct bio *bio)
3177 { 3180 {
3178 struct request_queue *q; 3181 struct request_queue *q;
3179 sector_t old_sector; 3182 sector_t old_sector;
3180 int ret, nr_sectors = bio_sectors(bio); 3183 int ret, nr_sectors = bio_sectors(bio);
3181 dev_t old_dev; 3184 dev_t old_dev;
3182 3185
3183 might_sleep(); 3186 might_sleep();
3184 3187
3185 if (bio_check_eod(bio, nr_sectors)) 3188 if (bio_check_eod(bio, nr_sectors))
3186 goto end_io; 3189 goto end_io;
3187 3190
3188 /* 3191 /*
3189 * Resolve the mapping until finished. (drivers are 3192 * Resolve the mapping until finished. (drivers are
3190 * still free to implement/resolve their own stacking 3193 * still free to implement/resolve their own stacking
3191 * by explicitly returning 0) 3194 * by explicitly returning 0)
3192 * 3195 *
3193 * NOTE: we don't repeat the blk_size check for each new device. 3196 * NOTE: we don't repeat the blk_size check for each new device.
3194 * Stacking drivers are expected to know what they are doing. 3197 * Stacking drivers are expected to know what they are doing.
3195 */ 3198 */
3196 old_sector = -1; 3199 old_sector = -1;
3197 old_dev = 0; 3200 old_dev = 0;
3198 do { 3201 do {
3199 char b[BDEVNAME_SIZE]; 3202 char b[BDEVNAME_SIZE];
3200 3203
3201 q = bdev_get_queue(bio->bi_bdev); 3204 q = bdev_get_queue(bio->bi_bdev);
3202 if (!q) { 3205 if (!q) {
3203 printk(KERN_ERR 3206 printk(KERN_ERR
3204 "generic_make_request: Trying to access " 3207 "generic_make_request: Trying to access "
3205 "nonexistent block-device %s (%Lu)\n", 3208 "nonexistent block-device %s (%Lu)\n",
3206 bdevname(bio->bi_bdev, b), 3209 bdevname(bio->bi_bdev, b),
3207 (long long) bio->bi_sector); 3210 (long long) bio->bi_sector);
3208 end_io: 3211 end_io:
3209 bio_endio(bio, -EIO); 3212 bio_endio(bio, -EIO);
3210 break; 3213 break;
3211 } 3214 }
3212 3215
3213 if (unlikely(nr_sectors > q->max_hw_sectors)) { 3216 if (unlikely(nr_sectors > q->max_hw_sectors)) {
3214 printk("bio too big device %s (%u > %u)\n", 3217 printk("bio too big device %s (%u > %u)\n",
3215 bdevname(bio->bi_bdev, b), 3218 bdevname(bio->bi_bdev, b),
3216 bio_sectors(bio), 3219 bio_sectors(bio),
3217 q->max_hw_sectors); 3220 q->max_hw_sectors);
3218 goto end_io; 3221 goto end_io;
3219 } 3222 }
3220 3223
3221 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 3224 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
3222 goto end_io; 3225 goto end_io;
3223 3226
3224 if (should_fail_request(bio)) 3227 if (should_fail_request(bio))
3225 goto end_io; 3228 goto end_io;
3226 3229
3227 /* 3230 /*
3228 * If this device has partitions, remap block n 3231 * If this device has partitions, remap block n
3229 * of partition p to block n+start(p) of the disk. 3232 * of partition p to block n+start(p) of the disk.
3230 */ 3233 */
3231 blk_partition_remap(bio); 3234 blk_partition_remap(bio);
3232 3235
3233 if (old_sector != -1) 3236 if (old_sector != -1)
3234 blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 3237 blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
3235 old_sector); 3238 old_sector);
3236 3239
3237 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 3240 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
3238 3241
3239 old_sector = bio->bi_sector; 3242 old_sector = bio->bi_sector;
3240 old_dev = bio->bi_bdev->bd_dev; 3243 old_dev = bio->bi_bdev->bd_dev;
3241 3244
3242 if (bio_check_eod(bio, nr_sectors)) 3245 if (bio_check_eod(bio, nr_sectors))
3243 goto end_io; 3246 goto end_io;
3244 3247
3245 ret = q->make_request_fn(q, bio); 3248 ret = q->make_request_fn(q, bio);
3246 } while (ret); 3249 } while (ret);
3247 } 3250 }
3248 3251
3249 /* 3252 /*
3250 * We only want one ->make_request_fn to be active at a time, 3253 * We only want one ->make_request_fn to be active at a time,
3251 * else stack usage with stacked devices could be a problem. 3254 * else stack usage with stacked devices could be a problem.
3252 * So use current->bio_{list,tail} to keep a list of requests 3255 * So use current->bio_{list,tail} to keep a list of requests
3253 * submitted by a make_request_fn function. 3256 * submitted by a make_request_fn function.
3254 * current->bio_tail is also used as a flag to say if 3257 * current->bio_tail is also used as a flag to say if
3255 * generic_make_request is currently active in this task or not. 3258 * generic_make_request is currently active in this task or not.
3256 * If it is NULL, then no make_request is active. If it is non-NULL, 3259 * If it is NULL, then no make_request is active. If it is non-NULL,
3257 * then a make_request is active, and new requests should be added 3260 * then a make_request is active, and new requests should be added
3258 * at the tail 3261 * at the tail
3259 */ 3262 */
3260 void generic_make_request(struct bio *bio) 3263 void generic_make_request(struct bio *bio)
3261 { 3264 {
3262 if (current->bio_tail) { 3265 if (current->bio_tail) {
3263 /* make_request is active */ 3266 /* make_request is active */
3264 *(current->bio_tail) = bio; 3267 *(current->bio_tail) = bio;
3265 bio->bi_next = NULL; 3268 bio->bi_next = NULL;
3266 current->bio_tail = &bio->bi_next; 3269 current->bio_tail = &bio->bi_next;
3267 return; 3270 return;
3268 } 3271 }
3269 /* following loop may be a bit non-obvious, and so deserves some 3272 /* following loop may be a bit non-obvious, and so deserves some
3270 * explanation. 3273 * explanation.
3271 * Before entering the loop, bio->bi_next is NULL (as all callers 3274 * Before entering the loop, bio->bi_next is NULL (as all callers
3272 * ensure that) so we have a list with a single bio. 3275 * ensure that) so we have a list with a single bio.
3273 * We pretend that we have just taken it off a longer list, so 3276 * We pretend that we have just taken it off a longer list, so
3274 * we assign bio_list to the next (which is NULL) and bio_tail 3277 * we assign bio_list to the next (which is NULL) and bio_tail
3275 * to &bio_list, thus initialising the bio_list of new bios to be 3278 * to &bio_list, thus initialising the bio_list of new bios to be
3276 * added. __generic_make_request may indeed add some more bios 3279 * added. __generic_make_request may indeed add some more bios
3277 * through a recursive call to generic_make_request. If it 3280 * through a recursive call to generic_make_request. If it
3278 * did, we find a non-NULL value in bio_list and re-enter the loop 3281 * did, we find a non-NULL value in bio_list and re-enter the loop
3279 * from the top. In this case we really did just take the bio 3282 * from the top. In this case we really did just take the bio
3280 * off the top of the list (no pretending) and so fixup bio_list and 3283 * off the top of the list (no pretending) and so fixup bio_list and
3281 * bio_tail or bi_next, and call into __generic_make_request again. 3284 * bio_tail or bi_next, and call into __generic_make_request again.
3282 * 3285 *
3283 * The loop was structured like this to make only one call to 3286 * The loop was structured like this to make only one call to
3284 * __generic_make_request (which is important as it is large and 3287 * __generic_make_request (which is important as it is large and
3285 * inlined) and to keep the structure simple. 3288 * inlined) and to keep the structure simple.
3286 */ 3289 */
3287 BUG_ON(bio->bi_next); 3290 BUG_ON(bio->bi_next);
3288 do { 3291 do {
3289 current->bio_list = bio->bi_next; 3292 current->bio_list = bio->bi_next;
3290 if (bio->bi_next == NULL) 3293 if (bio->bi_next == NULL)
3291 current->bio_tail = &current->bio_list; 3294 current->bio_tail = &current->bio_list;
3292 else 3295 else
3293 bio->bi_next = NULL; 3296 bio->bi_next = NULL;
3294 __generic_make_request(bio); 3297 __generic_make_request(bio);
3295 bio = current->bio_list; 3298 bio = current->bio_list;
3296 } while (bio); 3299 } while (bio);
3297 current->bio_tail = NULL; /* deactivate */ 3300 current->bio_tail = NULL; /* deactivate */
3298 } 3301 }
3299 3302
3300 EXPORT_SYMBOL(generic_make_request); 3303 EXPORT_SYMBOL(generic_make_request);
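Editor's note: for stacked devices the comment above is the whole point: a make_request_fn may itself call generic_make_request(), and the bio simply joins current->bio_list instead of growing the stack. A minimal sketch of such a remapping driver, with a hypothetical struct my_target standing in for whatever per-device state a real driver keeps.

struct my_target {
	struct block_device *backing_bdev;
	sector_t start_sect;
};

/* Illustrative stacking driver: remap the bio and resubmit it */
static int my_stacked_make_request(struct request_queue *q, struct bio *bio)
{
	struct my_target *t = q->queuedata;

	bio->bi_bdev = t->backing_bdev;
	bio->bi_sector += t->start_sect;

	/*
	 * No deep recursion here: generic_make_request() is already active
	 * for this task, so the bio is queued on current->bio_list and picked
	 * up by the loop above once this function returns.
	 */
	generic_make_request(bio);
	return 0;
}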
3301 3304
3302 /** 3305 /**
3303 * submit_bio: submit a bio to the block device layer for I/O 3306 * submit_bio: submit a bio to the block device layer for I/O
3304 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) 3307 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
3305 * @bio: The &struct bio which describes the I/O 3308 * @bio: The &struct bio which describes the I/O
3306 * 3309 *
3307 * submit_bio() is very similar in purpose to generic_make_request(), and 3310 * submit_bio() is very similar in purpose to generic_make_request(), and
3308 * uses that function to do most of the work. Both are fairly rough 3311 * uses that function to do most of the work. Both are fairly rough
3309 * interfaces; @bio must be set up and ready for I/O. 3312 * interfaces; @bio must be set up and ready for I/O.
3310 * 3313 *
3311 */ 3314 */
3312 void submit_bio(int rw, struct bio *bio) 3315 void submit_bio(int rw, struct bio *bio)
3313 { 3316 {
3314 int count = bio_sectors(bio); 3317 int count = bio_sectors(bio);
3315 3318
3316 BIO_BUG_ON(!bio->bi_size);
3317 BIO_BUG_ON(!bio->bi_io_vec);
3318 bio->bi_rw |= rw; 3319 bio->bi_rw |= rw;
3319 if (rw & WRITE) {
3320 count_vm_events(PGPGOUT, count);
3321 } else {
3322 task_io_account_read(bio->bi_size);
3323 count_vm_events(PGPGIN, count);
3324 }
3325 3320
3326 if (unlikely(block_dump)) { 3321 /*
3327 char b[BDEVNAME_SIZE]; 3322 * If it's a regular read/write or a barrier with data attached,
3328 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 3323 * go through the normal accounting stuff before submission.
3329 current->comm, current->pid, 3324 */
3330 (rw & WRITE) ? "WRITE" : "READ", 3325 if (!bio_empty_barrier(bio)) {
3331 (unsigned long long)bio->bi_sector, 3326
3332 bdevname(bio->bi_bdev,b)); 3327 BIO_BUG_ON(!bio->bi_size);
3328 BIO_BUG_ON(!bio->bi_io_vec);
3329
3330 if (rw & WRITE) {
3331 count_vm_events(PGPGOUT, count);
3332 } else {
3333 task_io_account_read(bio->bi_size);
3334 count_vm_events(PGPGIN, count);
3335 }
3336
3337 if (unlikely(block_dump)) {
3338 char b[BDEVNAME_SIZE];
3339 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
3340 current->comm, current->pid,
3341 (rw & WRITE) ? "WRITE" : "READ",
3342 (unsigned long long)bio->bi_sector,
3343 bdevname(bio->bi_bdev,b));
3344 }
3333 } 3345 }
3334 3346
3335 generic_make_request(bio); 3347 generic_make_request(bio);
3336 } 3348 }
3337 3349
3338 EXPORT_SYMBOL(submit_bio); 3350 EXPORT_SYMBOL(submit_bio);
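Editor's note: with the empty-barrier path above, a cache flush can be pushed down the stack as a bio that has the barrier bit set and no data at all. A sketch of how a caller might do that; the helper and end_io names are invented, and the BIO_UPTODATE/bi_sector handling follows the convention this patch establishes in the completion path further down.

#include <linux/bio.h>
#include <linux/completion.h>

static void my_empty_barrier_end_io(struct bio *bio, int err)
{
	complete((struct completion *)bio->bi_private);
}

/* Illustrative: flush @bdev by submitting a data-less barrier */
static int my_issue_empty_barrier(struct block_device *bdev)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	struct bio *bio;
	int ret = 0;

	bio = bio_alloc(GFP_KERNEL, 0);		/* zero bio_vecs: no payload */
	bio->bi_end_io = my_empty_barrier_end_io;
	bio->bi_private = &wait;
	bio->bi_bdev = bdev;

	submit_bio(1 << BIO_RW_BARRIER, bio);
	wait_for_completion(&wait);

	if (!bio_flagged(bio, BIO_UPTODATE))
		ret = -EIO;	/* bio->bi_sector may now hold the error location */

	bio_put(bio);
	return ret;
}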
3339 3351
3340 static void blk_recalc_rq_sectors(struct request *rq, int nsect) 3352 static void blk_recalc_rq_sectors(struct request *rq, int nsect)
3341 { 3353 {
3342 if (blk_fs_request(rq)) { 3354 if (blk_fs_request(rq)) {
3343 rq->hard_sector += nsect; 3355 rq->hard_sector += nsect;
3344 rq->hard_nr_sectors -= nsect; 3356 rq->hard_nr_sectors -= nsect;
3345 3357
3346 /* 3358 /*
3347 * Move the I/O submission pointers ahead if required. 3359 * Move the I/O submission pointers ahead if required.
3348 */ 3360 */
3349 if ((rq->nr_sectors >= rq->hard_nr_sectors) && 3361 if ((rq->nr_sectors >= rq->hard_nr_sectors) &&
3350 (rq->sector <= rq->hard_sector)) { 3362 (rq->sector <= rq->hard_sector)) {
3351 rq->sector = rq->hard_sector; 3363 rq->sector = rq->hard_sector;
3352 rq->nr_sectors = rq->hard_nr_sectors; 3364 rq->nr_sectors = rq->hard_nr_sectors;
3353 rq->hard_cur_sectors = bio_cur_sectors(rq->bio); 3365 rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
3354 rq->current_nr_sectors = rq->hard_cur_sectors; 3366 rq->current_nr_sectors = rq->hard_cur_sectors;
3355 rq->buffer = bio_data(rq->bio); 3367 rq->buffer = bio_data(rq->bio);
3356 } 3368 }
3357 3369
3358 /* 3370 /*
3359 * if total number of sectors is less than the first segment 3371 * if total number of sectors is less than the first segment
3360 * size, something has gone terribly wrong 3372 * size, something has gone terribly wrong
3361 */ 3373 */
3362 if (rq->nr_sectors < rq->current_nr_sectors) { 3374 if (rq->nr_sectors < rq->current_nr_sectors) {
3363 printk("blk: request botched\n"); 3375 printk("blk: request botched\n");
3364 rq->nr_sectors = rq->current_nr_sectors; 3376 rq->nr_sectors = rq->current_nr_sectors;
3365 } 3377 }
3366 } 3378 }
3367 } 3379 }
3368 3380
3369 static int __end_that_request_first(struct request *req, int uptodate, 3381 static int __end_that_request_first(struct request *req, int uptodate,
3370 int nr_bytes) 3382 int nr_bytes)
3371 { 3383 {
3372 int total_bytes, bio_nbytes, error, next_idx = 0; 3384 int total_bytes, bio_nbytes, error, next_idx = 0;
3373 struct bio *bio; 3385 struct bio *bio;
3374 3386
3375 blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE); 3387 blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
3376 3388
3377 /* 3389 /*
3378 * extend uptodate bool to allow < 0 value to be direct io error 3390 * extend uptodate bool to allow < 0 value to be direct io error
3379 */ 3391 */
3380 error = 0; 3392 error = 0;
3381 if (end_io_error(uptodate)) 3393 if (end_io_error(uptodate))
3382 error = !uptodate ? -EIO : uptodate; 3394 error = !uptodate ? -EIO : uptodate;
3383 3395
3384 /* 3396 /*
3385 * for a REQ_BLOCK_PC request, we want to carry any eventual 3397 * for a REQ_BLOCK_PC request, we want to carry any eventual
3386 * sense key with us all the way through 3398 * sense key with us all the way through
3387 */ 3399 */
3388 if (!blk_pc_request(req)) 3400 if (!blk_pc_request(req))
3389 req->errors = 0; 3401 req->errors = 0;
3390 3402
3391 if (!uptodate) { 3403 if (!uptodate) {
3392 if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET)) 3404 if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))
3393 printk("end_request: I/O error, dev %s, sector %llu\n", 3405 printk("end_request: I/O error, dev %s, sector %llu\n",
3394 req->rq_disk ? req->rq_disk->disk_name : "?", 3406 req->rq_disk ? req->rq_disk->disk_name : "?",
3395 (unsigned long long)req->sector); 3407 (unsigned long long)req->sector);
3396 } 3408 }
3397 3409
3398 if (blk_fs_request(req) && req->rq_disk) { 3410 if (blk_fs_request(req) && req->rq_disk) {
3399 const int rw = rq_data_dir(req); 3411 const int rw = rq_data_dir(req);
3400 3412
3401 disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9); 3413 disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9);
3402 } 3414 }
3403 3415
3404 total_bytes = bio_nbytes = 0; 3416 total_bytes = bio_nbytes = 0;
3405 while ((bio = req->bio) != NULL) { 3417 while ((bio = req->bio) != NULL) {
3406 int nbytes; 3418 int nbytes;
3419
3420 /*
3421 * For an empty barrier request, the low level driver must
3422 * store a potential error location in ->sector. We pass
3423 * that back up in ->bi_sector.
3424 */
3425 if (blk_empty_barrier(req))
3426 bio->bi_sector = req->sector;
3407 3427
3408 if (nr_bytes >= bio->bi_size) { 3428 if (nr_bytes >= bio->bi_size) {
3409 req->bio = bio->bi_next; 3429 req->bio = bio->bi_next;
3410 nbytes = bio->bi_size; 3430 nbytes = bio->bi_size;
3411 req_bio_endio(req, bio, nbytes, error); 3431 req_bio_endio(req, bio, nbytes, error);
3412 next_idx = 0; 3432 next_idx = 0;
3413 bio_nbytes = 0; 3433 bio_nbytes = 0;
3414 } else { 3434 } else {
3415 int idx = bio->bi_idx + next_idx; 3435 int idx = bio->bi_idx + next_idx;
3416 3436
3417 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { 3437 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
3418 blk_dump_rq_flags(req, "__end_that"); 3438 blk_dump_rq_flags(req, "__end_that");
3419 printk("%s: bio idx %d >= vcnt %d\n", 3439 printk("%s: bio idx %d >= vcnt %d\n",
3420 __FUNCTION__, 3440 __FUNCTION__,
3421 bio->bi_idx, bio->bi_vcnt); 3441 bio->bi_idx, bio->bi_vcnt);
3422 break; 3442 break;
3423 } 3443 }
3424 3444
3425 nbytes = bio_iovec_idx(bio, idx)->bv_len; 3445 nbytes = bio_iovec_idx(bio, idx)->bv_len;
3426 BIO_BUG_ON(nbytes > bio->bi_size); 3446 BIO_BUG_ON(nbytes > bio->bi_size);
3427 3447
3428 /* 3448 /*
3429 * not a complete bvec done 3449 * not a complete bvec done
3430 */ 3450 */
3431 if (unlikely(nbytes > nr_bytes)) { 3451 if (unlikely(nbytes > nr_bytes)) {
3432 bio_nbytes += nr_bytes; 3452 bio_nbytes += nr_bytes;
3433 total_bytes += nr_bytes; 3453 total_bytes += nr_bytes;
3434 break; 3454 break;
3435 } 3455 }
3436 3456
3437 /* 3457 /*
3438 * advance to the next vector 3458 * advance to the next vector
3439 */ 3459 */
3440 next_idx++; 3460 next_idx++;
3441 bio_nbytes += nbytes; 3461 bio_nbytes += nbytes;
3442 } 3462 }
3443 3463
3444 total_bytes += nbytes; 3464 total_bytes += nbytes;
3445 nr_bytes -= nbytes; 3465 nr_bytes -= nbytes;
3446 3466
3447 if ((bio = req->bio)) { 3467 if ((bio = req->bio)) {
3448 /* 3468 /*
3449 * end more in this run, or just return 'not-done' 3469 * end more in this run, or just return 'not-done'
3450 */ 3470 */
3451 if (unlikely(nr_bytes <= 0)) 3471 if (unlikely(nr_bytes <= 0))
3452 break; 3472 break;
3453 } 3473 }
3454 } 3474 }
3455 3475
3456 /* 3476 /*
3457 * completely done 3477 * completely done
3458 */ 3478 */
3459 if (!req->bio) 3479 if (!req->bio)
3460 return 0; 3480 return 0;
3461 3481
3462 /* 3482 /*
3463 * if the request wasn't completed, update state 3483 * if the request wasn't completed, update state
3464 */ 3484 */
3465 if (bio_nbytes) { 3485 if (bio_nbytes) {
3466 req_bio_endio(req, bio, bio_nbytes, error); 3486 req_bio_endio(req, bio, bio_nbytes, error);
3467 bio->bi_idx += next_idx; 3487 bio->bi_idx += next_idx;
3468 bio_iovec(bio)->bv_offset += nr_bytes; 3488 bio_iovec(bio)->bv_offset += nr_bytes;
3469 bio_iovec(bio)->bv_len -= nr_bytes; 3489 bio_iovec(bio)->bv_len -= nr_bytes;
3470 } 3490 }
3471 3491
3472 blk_recalc_rq_sectors(req, total_bytes >> 9); 3492 blk_recalc_rq_sectors(req, total_bytes >> 9);
3473 blk_recalc_rq_segments(req); 3493 blk_recalc_rq_segments(req);
3474 return 1; 3494 return 1;
3475 } 3495 }
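Editor's note: the empty-barrier hunk above is the completion half of the error reporting: whatever a driver left in req->sector is copied into bio->bi_sector, so the submitter can see roughly where a failed flush went wrong. A hedged sketch of the driver side; the function name is illustrative, and end_queued_request() (defined below) expects the queue lock to be held.

/* Illustrative: fail an empty barrier and report the offending sector */
static void my_fail_empty_barrier(struct request *rq, sector_t bad_sector)
{
	if (blk_empty_barrier(rq))
		rq->sector = bad_sector;	/* surfaces as bio->bi_sector above */

	end_queued_request(rq, -EIO);		/* queue lock held by the caller */
}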
3476 3496
3477 /** 3497 /**
3478 * end_that_request_first - end I/O on a request 3498 * end_that_request_first - end I/O on a request
3479 * @req: the request being processed 3499 * @req: the request being processed
3480 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3500 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3481 * @nr_sectors: number of sectors to end I/O on 3501 * @nr_sectors: number of sectors to end I/O on
3482 * 3502 *
3483 * Description: 3503 * Description:
3484 * Ends I/O on a number of sectors attached to @req, and sets it up 3504 * Ends I/O on a number of sectors attached to @req, and sets it up
3485 * for the next range of segments (if any) in the cluster. 3505 * for the next range of segments (if any) in the cluster.
3486 * 3506 *
3487 * Return: 3507 * Return:
3488 * 0 - we are done with this request, call end_that_request_last() 3508 * 0 - we are done with this request, call end_that_request_last()
3489 * 1 - still buffers pending for this request 3509 * 1 - still buffers pending for this request
3490 **/ 3510 **/
3491 int end_that_request_first(struct request *req, int uptodate, int nr_sectors) 3511 int end_that_request_first(struct request *req, int uptodate, int nr_sectors)
3492 { 3512 {
3493 return __end_that_request_first(req, uptodate, nr_sectors << 9); 3513 return __end_that_request_first(req, uptodate, nr_sectors << 9);
3494 } 3514 }
3495 3515
3496 EXPORT_SYMBOL(end_that_request_first); 3516 EXPORT_SYMBOL(end_that_request_first);
3497 3517
3498 /** 3518 /**
3499 * end_that_request_chunk - end I/O on a request 3519 * end_that_request_chunk - end I/O on a request
3500 * @req: the request being processed 3520 * @req: the request being processed
3501 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3521 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3502 * @nr_bytes: number of bytes to complete 3522 * @nr_bytes: number of bytes to complete
3503 * 3523 *
3504 * Description: 3524 * Description:
3505 * Ends I/O on a number of bytes attached to @req, and sets it up 3525 * Ends I/O on a number of bytes attached to @req, and sets it up
3506 * for the next range of segments (if any). Like end_that_request_first(), 3526 * for the next range of segments (if any). Like end_that_request_first(),
3507 * but deals with bytes instead of sectors. 3527 * but deals with bytes instead of sectors.
3508 * 3528 *
3509 * Return: 3529 * Return:
3510 * 0 - we are done with this request, call end_that_request_last() 3530 * 0 - we are done with this request, call end_that_request_last()
3511 * 1 - still buffers pending for this request 3531 * 1 - still buffers pending for this request
3512 **/ 3532 **/
3513 int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes) 3533 int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes)
3514 { 3534 {
3515 return __end_that_request_first(req, uptodate, nr_bytes); 3535 return __end_that_request_first(req, uptodate, nr_bytes);
3516 } 3536 }
3517 3537
3518 EXPORT_SYMBOL(end_that_request_chunk); 3538 EXPORT_SYMBOL(end_that_request_chunk);
3519 3539
3520 /* 3540 /*
3521 * splice the completion data to a local structure and hand off to 3541 * splice the completion data to a local structure and hand off to
3522 * process_completion_queue() to complete the requests 3542 * process_completion_queue() to complete the requests
3523 */ 3543 */
3524 static void blk_done_softirq(struct softirq_action *h) 3544 static void blk_done_softirq(struct softirq_action *h)
3525 { 3545 {
3526 struct list_head *cpu_list, local_list; 3546 struct list_head *cpu_list, local_list;
3527 3547
3528 local_irq_disable(); 3548 local_irq_disable();
3529 cpu_list = &__get_cpu_var(blk_cpu_done); 3549 cpu_list = &__get_cpu_var(blk_cpu_done);
3530 list_replace_init(cpu_list, &local_list); 3550 list_replace_init(cpu_list, &local_list);
3531 local_irq_enable(); 3551 local_irq_enable();
3532 3552
3533 while (!list_empty(&local_list)) { 3553 while (!list_empty(&local_list)) {
3534 struct request *rq = list_entry(local_list.next, struct request, donelist); 3554 struct request *rq = list_entry(local_list.next, struct request, donelist);
3535 3555
3536 list_del_init(&rq->donelist); 3556 list_del_init(&rq->donelist);
3537 rq->q->softirq_done_fn(rq); 3557 rq->q->softirq_done_fn(rq);
3538 } 3558 }
3539 } 3559 }
3540 3560
3541 static int __cpuinit blk_cpu_notify(struct notifier_block *self, unsigned long action, 3561 static int __cpuinit blk_cpu_notify(struct notifier_block *self, unsigned long action,
3542 void *hcpu) 3562 void *hcpu)
3543 { 3563 {
3544 /* 3564 /*
3545 * If a CPU goes away, splice its entries to the current CPU 3565 * If a CPU goes away, splice its entries to the current CPU
3546 * and trigger a run of the softirq 3566 * and trigger a run of the softirq
3547 */ 3567 */
3548 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 3568 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
3549 int cpu = (unsigned long) hcpu; 3569 int cpu = (unsigned long) hcpu;
3550 3570
3551 local_irq_disable(); 3571 local_irq_disable();
3552 list_splice_init(&per_cpu(blk_cpu_done, cpu), 3572 list_splice_init(&per_cpu(blk_cpu_done, cpu),
3553 &__get_cpu_var(blk_cpu_done)); 3573 &__get_cpu_var(blk_cpu_done));
3554 raise_softirq_irqoff(BLOCK_SOFTIRQ); 3574 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3555 local_irq_enable(); 3575 local_irq_enable();
3556 } 3576 }
3557 3577
3558 return NOTIFY_OK; 3578 return NOTIFY_OK;
3559 } 3579 }
3560 3580
3561 3581
3562 static struct notifier_block blk_cpu_notifier __cpuinitdata = { 3582 static struct notifier_block blk_cpu_notifier __cpuinitdata = {
3563 .notifier_call = blk_cpu_notify, 3583 .notifier_call = blk_cpu_notify,
3564 }; 3584 };
3565 3585
3566 /** 3586 /**
3567 * blk_complete_request - end I/O on a request 3587 * blk_complete_request - end I/O on a request
3568 * @req: the request being processed 3588 * @req: the request being processed
3569 * 3589 *
3570 * Description: 3590 * Description:
3571 * Ends all I/O on a request. It does not handle partial completions, 3591 * Ends all I/O on a request. It does not handle partial completions,
3572 * unless the driver actually implements this in its completion callback 3592 * unless the driver actually implements this in its completion callback
3573 * through requeueing. The actual completion happens out-of-order, 3593 * through requeueing. The actual completion happens out-of-order,
3574 * through a softirq handler. The user must have registered a completion 3594 * through a softirq handler. The user must have registered a completion
3575 * callback through blk_queue_softirq_done(). 3595 * callback through blk_queue_softirq_done().
3576 **/ 3596 **/
3577 3597
3578 void blk_complete_request(struct request *req) 3598 void blk_complete_request(struct request *req)
3579 { 3599 {
3580 struct list_head *cpu_list; 3600 struct list_head *cpu_list;
3581 unsigned long flags; 3601 unsigned long flags;
3582 3602
3583 BUG_ON(!req->q->softirq_done_fn); 3603 BUG_ON(!req->q->softirq_done_fn);
3584 3604
3585 local_irq_save(flags); 3605 local_irq_save(flags);
3586 3606
3587 cpu_list = &__get_cpu_var(blk_cpu_done); 3607 cpu_list = &__get_cpu_var(blk_cpu_done);
3588 list_add_tail(&req->donelist, cpu_list); 3608 list_add_tail(&req->donelist, cpu_list);
3589 raise_softirq_irqoff(BLOCK_SOFTIRQ); 3609 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3590 3610
3591 local_irq_restore(flags); 3611 local_irq_restore(flags);
3592 } 3612 }
3593 3613
3594 EXPORT_SYMBOL(blk_complete_request); 3614 EXPORT_SYMBOL(blk_complete_request);
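Editor's note: a sketch of how a driver ties into this softirq completion path; my_softirq_done/my_register_softirq_done are invented names, the byte count mirrors rq_byte_size() further down, and partial-completion/requeue handling is omitted.

#include <linux/blkdev.h>

/* Illustrative: completion work deferred from hard irq to BLOCK_SOFTIRQ */
static void my_softirq_done(struct request *rq)
{
	struct request_queue *q = rq->q;
	unsigned int nbytes = blk_fs_request(rq) ?
				rq->hard_nr_sectors << 9 : rq->data_len;
	unsigned long flags;

	if (!end_that_request_chunk(rq, 1, nbytes)) {
		spin_lock_irqsave(q->queue_lock, flags);
		end_that_request_last(rq, 1);
		spin_unlock_irqrestore(q->queue_lock, flags);
	}
}

static void my_register_softirq_done(struct request_queue *q)
{
	blk_queue_softirq_done(q, my_softirq_done);
}

/*
 * The interrupt handler then only calls blk_complete_request(rq) for a
 * request the hardware has finished and the driver has already dequeued.
 */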
3595 3615
3596 /* 3616 /*
3597 * queue lock must be held 3617 * queue lock must be held
3598 */ 3618 */
3599 void end_that_request_last(struct request *req, int uptodate) 3619 void end_that_request_last(struct request *req, int uptodate)
3600 { 3620 {
3601 struct gendisk *disk = req->rq_disk; 3621 struct gendisk *disk = req->rq_disk;
3602 int error; 3622 int error;
3603 3623
3604 /* 3624 /*
3605 * extend uptodate bool to allow < 0 value to be direct io error 3625 * extend uptodate bool to allow < 0 value to be direct io error
3606 */ 3626 */
3607 error = 0; 3627 error = 0;
3608 if (end_io_error(uptodate)) 3628 if (end_io_error(uptodate))
3609 error = !uptodate ? -EIO : uptodate; 3629 error = !uptodate ? -EIO : uptodate;
3610 3630
3611 if (unlikely(laptop_mode) && blk_fs_request(req)) 3631 if (unlikely(laptop_mode) && blk_fs_request(req))
3612 laptop_io_completion(); 3632 laptop_io_completion();
3613 3633
3614 /* 3634 /*
3615 * Account IO completion. bar_rq isn't accounted as a normal 3635 * Account IO completion. bar_rq isn't accounted as a normal
3616 * IO on queueing nor completion. Accounting the containing 3636 * IO on queueing nor completion. Accounting the containing
3617 * request is enough. 3637 * request is enough.
3618 */ 3638 */
3619 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { 3639 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
3620 unsigned long duration = jiffies - req->start_time; 3640 unsigned long duration = jiffies - req->start_time;
3621 const int rw = rq_data_dir(req); 3641 const int rw = rq_data_dir(req);
3622 3642
3623 __disk_stat_inc(disk, ios[rw]); 3643 __disk_stat_inc(disk, ios[rw]);
3624 __disk_stat_add(disk, ticks[rw], duration); 3644 __disk_stat_add(disk, ticks[rw], duration);
3625 disk_round_stats(disk); 3645 disk_round_stats(disk);
3626 disk->in_flight--; 3646 disk->in_flight--;
3627 } 3647 }
3628 if (req->end_io) 3648 if (req->end_io)
3629 req->end_io(req, error); 3649 req->end_io(req, error);
3630 else 3650 else
3631 __blk_put_request(req->q, req); 3651 __blk_put_request(req->q, req);
3632 } 3652 }
3633 3653
3634 EXPORT_SYMBOL(end_that_request_last); 3654 EXPORT_SYMBOL(end_that_request_last);
3635 3655
3636 static inline void __end_request(struct request *rq, int uptodate, 3656 static inline void __end_request(struct request *rq, int uptodate,
3637 unsigned int nr_bytes, int dequeue) 3657 unsigned int nr_bytes, int dequeue)
3638 { 3658 {
3639 if (!end_that_request_chunk(rq, uptodate, nr_bytes)) { 3659 if (!end_that_request_chunk(rq, uptodate, nr_bytes)) {
3640 if (dequeue) 3660 if (dequeue)
3641 blkdev_dequeue_request(rq); 3661 blkdev_dequeue_request(rq);
3642 add_disk_randomness(rq->rq_disk); 3662 add_disk_randomness(rq->rq_disk);
3643 end_that_request_last(rq, uptodate); 3663 end_that_request_last(rq, uptodate);
3644 } 3664 }
3645 } 3665 }
3646 3666
3647 static unsigned int rq_byte_size(struct request *rq) 3667 static unsigned int rq_byte_size(struct request *rq)
3648 { 3668 {
3649 if (blk_fs_request(rq)) 3669 if (blk_fs_request(rq))
3650 return rq->hard_nr_sectors << 9; 3670 return rq->hard_nr_sectors << 9;
3651 3671
3652 return rq->data_len; 3672 return rq->data_len;
3653 } 3673 }
3654 3674
3655 /** 3675 /**
3656 * end_queued_request - end all I/O on a queued request 3676 * end_queued_request - end all I/O on a queued request
3657 * @rq: the request being processed 3677 * @rq: the request being processed
3658 * @uptodate: error value or 0/1 uptodate flag 3678 * @uptodate: error value or 0/1 uptodate flag
3659 * 3679 *
3660 * Description: 3680 * Description:
3661 * Ends all I/O on a request, and removes it from the block layer queues. 3681 * Ends all I/O on a request, and removes it from the block layer queues.
3662 * Not suitable for normal IO completion, unless the driver still has 3682 * Not suitable for normal IO completion, unless the driver still has
3663 * the request attached to the block layer. 3683 * the request attached to the block layer.
3664 * 3684 *
3665 **/ 3685 **/
3666 void end_queued_request(struct request *rq, int uptodate) 3686 void end_queued_request(struct request *rq, int uptodate)
3667 { 3687 {
3668 __end_request(rq, uptodate, rq_byte_size(rq), 1); 3688 __end_request(rq, uptodate, rq_byte_size(rq), 1);
3669 } 3689 }
3670 EXPORT_SYMBOL(end_queued_request); 3690 EXPORT_SYMBOL(end_queued_request);
3671 3691
3672 /** 3692 /**
3673 * end_dequeued_request - end all I/O on a dequeued request 3693 * end_dequeued_request - end all I/O on a dequeued request
3674 * @rq: the request being processed 3694 * @rq: the request being processed
3675 * @uptodate: error value or 0/1 uptodate flag 3695 * @uptodate: error value or 0/1 uptodate flag
3676 * 3696 *
3677 * Description: 3697 * Description:
3678 * Ends all I/O on a request. The request must already have been 3698 * Ends all I/O on a request. The request must already have been
3679 * dequeued using blkdev_dequeue_request(), as is normally the case 3699 * dequeued using blkdev_dequeue_request(), as is normally the case
3680 * for most drivers. 3700 * for most drivers.
3681 * 3701 *
3682 **/ 3702 **/
3683 void end_dequeued_request(struct request *rq, int uptodate) 3703 void end_dequeued_request(struct request *rq, int uptodate)
3684 { 3704 {
3685 __end_request(rq, uptodate, rq_byte_size(rq), 0); 3705 __end_request(rq, uptodate, rq_byte_size(rq), 0);
3686 } 3706 }
3687 EXPORT_SYMBOL(end_dequeued_request); 3707 EXPORT_SYMBOL(end_dequeued_request);
3688 3708
3689 3709
3690 /** 3710 /**
3691 * end_request - end I/O on the current segment of the request 3711 * end_request - end I/O on the current segment of the request
3692 * @rq: the request being processed 3712 * @rq: the request being processed
3693 * @uptodate: error value or 0/1 uptodate flag 3713 * @uptodate: error value or 0/1 uptodate flag
3694 * 3714 *
3695 * Description: 3715 * Description:
3696 * Ends I/O on the current segment of a request. If that is the only 3716 * Ends I/O on the current segment of a request. If that is the only
3697 * remaining segment, the request is also completed and freed. 3717 * remaining segment, the request is also completed and freed.
3698 * 3718 *
3699 * This is a remnant of how older block drivers handled IO completions. 3719 * This is a remnant of how older block drivers handled IO completions.
3700 * Modern drivers typically end IO on the full request in one go, unless 3720 * Modern drivers typically end IO on the full request in one go, unless
3701 * they have a residual value to account for. For that case this function 3721 * they have a residual value to account for. For that case this function
3702 * isn't really useful, unless the residual just happens to be the 3722 * isn't really useful, unless the residual just happens to be the
3703 * full current segment. In other words, don't use this function in new 3723 * full current segment. In other words, don't use this function in new
3704 * code. Either use end_request_completely(), or the 3724 * code. Either use end_request_completely(), or the
3705 * end_that_request_chunk() (along with end_that_request_last()) for 3725 * end_that_request_chunk() (along with end_that_request_last()) for
3706 * partial completions. 3726 * partial completions.
3707 * 3727 *
3708 **/ 3728 **/
3709 void end_request(struct request *req, int uptodate) 3729 void end_request(struct request *req, int uptodate)
3710 { 3730 {
3711 __end_request(req, uptodate, req->hard_cur_sectors << 9, 1); 3731 __end_request(req, uptodate, req->hard_cur_sectors << 9, 1);
3712 } 3732 }
3713 EXPORT_SYMBOL(end_request); 3733 EXPORT_SYMBOL(end_request);
3714 3734
3715 static void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 3735 static void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
3716 struct bio *bio) 3736 struct bio *bio)
3717 { 3737 {
3718 /* first two bits are identical in rq->cmd_flags and bio->bi_rw */ 3738 /* first two bits are identical in rq->cmd_flags and bio->bi_rw */
3719 rq->cmd_flags |= (bio->bi_rw & 3); 3739 rq->cmd_flags |= (bio->bi_rw & 3);
3720 3740
3721 rq->nr_phys_segments = bio_phys_segments(q, bio); 3741 rq->nr_phys_segments = bio_phys_segments(q, bio);
3722 rq->nr_hw_segments = bio_hw_segments(q, bio); 3742 rq->nr_hw_segments = bio_hw_segments(q, bio);
3723 rq->current_nr_sectors = bio_cur_sectors(bio); 3743 rq->current_nr_sectors = bio_cur_sectors(bio);
3724 rq->hard_cur_sectors = rq->current_nr_sectors; 3744 rq->hard_cur_sectors = rq->current_nr_sectors;
3725 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); 3745 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
3726 rq->buffer = bio_data(bio); 3746 rq->buffer = bio_data(bio);
3727 rq->data_len = bio->bi_size; 3747 rq->data_len = bio->bi_size;
3728 3748
3729 rq->bio = rq->biotail = bio; 3749 rq->bio = rq->biotail = bio;
3730 3750
3731 if (bio->bi_bdev) 3751 if (bio->bi_bdev)
3732 rq->rq_disk = bio->bi_bdev->bd_disk; 3752 rq->rq_disk = bio->bi_bdev->bd_disk;
3733 } 3753 }
3734 3754
3735 int kblockd_schedule_work(struct work_struct *work) 3755 int kblockd_schedule_work(struct work_struct *work)
3736 { 3756 {
3737 return queue_work(kblockd_workqueue, work); 3757 return queue_work(kblockd_workqueue, work);
3738 } 3758 }
3739 3759
3740 EXPORT_SYMBOL(kblockd_schedule_work); 3760 EXPORT_SYMBOL(kblockd_schedule_work);
3741 3761
3742 void kblockd_flush_work(struct work_struct *work) 3762 void kblockd_flush_work(struct work_struct *work)
3743 { 3763 {
3744 cancel_work_sync(work); 3764 cancel_work_sync(work);
3745 } 3765 }
3746 EXPORT_SYMBOL(kblockd_flush_work); 3766 EXPORT_SYMBOL(kblockd_flush_work);
3747 3767
3748 int __init blk_dev_init(void) 3768 int __init blk_dev_init(void)
3749 { 3769 {
3750 int i; 3770 int i;
3751 3771
3752 kblockd_workqueue = create_workqueue("kblockd"); 3772 kblockd_workqueue = create_workqueue("kblockd");
3753 if (!kblockd_workqueue) 3773 if (!kblockd_workqueue)
3754 panic("Failed to create kblockd\n"); 3774 panic("Failed to create kblockd\n");
3755 3775
3756 request_cachep = kmem_cache_create("blkdev_requests", 3776 request_cachep = kmem_cache_create("blkdev_requests",
3757 sizeof(struct request), 0, SLAB_PANIC, NULL); 3777 sizeof(struct request), 0, SLAB_PANIC, NULL);
3758 3778
3759 requestq_cachep = kmem_cache_create("blkdev_queue", 3779 requestq_cachep = kmem_cache_create("blkdev_queue",
3760 sizeof(struct request_queue), 0, SLAB_PANIC, NULL); 3780 sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
3761 3781
3762 iocontext_cachep = kmem_cache_create("blkdev_ioc", 3782 iocontext_cachep = kmem_cache_create("blkdev_ioc",
3763 sizeof(struct io_context), 0, SLAB_PANIC, NULL); 3783 sizeof(struct io_context), 0, SLAB_PANIC, NULL);
3764 3784
3765 for_each_possible_cpu(i) 3785 for_each_possible_cpu(i)
3766 INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); 3786 INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
3767 3787
3768 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); 3788 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
3769 register_hotcpu_notifier(&blk_cpu_notifier); 3789 register_hotcpu_notifier(&blk_cpu_notifier);
3770 3790
3771 blk_max_low_pfn = max_low_pfn - 1; 3791 blk_max_low_pfn = max_low_pfn - 1;
3772 blk_max_pfn = max_pfn - 1; 3792 blk_max_pfn = max_pfn - 1;
3773 3793
3774 return 0; 3794 return 0;
3775 } 3795 }
3776 3796
3777 /* 3797 /*
3778 * IO Context helper functions 3798 * IO Context helper functions
3779 */ 3799 */
3780 void put_io_context(struct io_context *ioc) 3800 void put_io_context(struct io_context *ioc)
3781 { 3801 {
3782 if (ioc == NULL) 3802 if (ioc == NULL)
3783 return; 3803 return;
3784 3804
3785 BUG_ON(atomic_read(&ioc->refcount) == 0); 3805 BUG_ON(atomic_read(&ioc->refcount) == 0);
3786 3806
3787 if (atomic_dec_and_test(&ioc->refcount)) { 3807 if (atomic_dec_and_test(&ioc->refcount)) {
3788 struct cfq_io_context *cic; 3808 struct cfq_io_context *cic;
3789 3809
3790 rcu_read_lock(); 3810 rcu_read_lock();
3791 if (ioc->aic && ioc->aic->dtor) 3811 if (ioc->aic && ioc->aic->dtor)
3792 ioc->aic->dtor(ioc->aic); 3812 ioc->aic->dtor(ioc->aic);
3793 if (ioc->cic_root.rb_node != NULL) { 3813 if (ioc->cic_root.rb_node != NULL) {
3794 struct rb_node *n = rb_first(&ioc->cic_root); 3814 struct rb_node *n = rb_first(&ioc->cic_root);
3795 3815
3796 cic = rb_entry(n, struct cfq_io_context, rb_node); 3816 cic = rb_entry(n, struct cfq_io_context, rb_node);
3797 cic->dtor(ioc); 3817 cic->dtor(ioc);
3798 } 3818 }
3799 rcu_read_unlock(); 3819 rcu_read_unlock();
3800 3820
3801 kmem_cache_free(iocontext_cachep, ioc); 3821 kmem_cache_free(iocontext_cachep, ioc);
3802 } 3822 }
3803 } 3823 }
3804 EXPORT_SYMBOL(put_io_context); 3824 EXPORT_SYMBOL(put_io_context);
3805 3825
3806 /* Called by the exiting task */ 3826 /* Called by the exiting task */
3807 void exit_io_context(void) 3827 void exit_io_context(void)
3808 { 3828 {
3809 struct io_context *ioc; 3829 struct io_context *ioc;
3810 struct cfq_io_context *cic; 3830 struct cfq_io_context *cic;
3811 3831
3812 task_lock(current); 3832 task_lock(current);
3813 ioc = current->io_context; 3833 ioc = current->io_context;
3814 current->io_context = NULL; 3834 current->io_context = NULL;
3815 task_unlock(current); 3835 task_unlock(current);
3816 3836
3817 ioc->task = NULL; 3837 ioc->task = NULL;
3818 if (ioc->aic && ioc->aic->exit) 3838 if (ioc->aic && ioc->aic->exit)
3819 ioc->aic->exit(ioc->aic); 3839 ioc->aic->exit(ioc->aic);
3820 if (ioc->cic_root.rb_node != NULL) { 3840 if (ioc->cic_root.rb_node != NULL) {
3821 cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node); 3841 cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node);
3822 cic->exit(ioc); 3842 cic->exit(ioc);
3823 } 3843 }
3824 3844
3825 put_io_context(ioc); 3845 put_io_context(ioc);
3826 } 3846 }
3827 3847
3828 /* 3848 /*
3829 * If the current task has no IO context then create one and initialise it. 3849 * If the current task has no IO context then create one and initialise it.
3830 * Otherwise, return its existing IO context. 3850 * Otherwise, return its existing IO context.
3831 * 3851 *
3832 * This returned IO context doesn't have a specifically elevated refcount, 3852 * This returned IO context doesn't have a specifically elevated refcount,
3833 * but since the current task itself holds a reference, the context can be 3853 * but since the current task itself holds a reference, the context can be
3834 * used in general code, so long as it stays within `current` context. 3854 * used in general code, so long as it stays within `current` context.
3835 */ 3855 */
3836 static struct io_context *current_io_context(gfp_t gfp_flags, int node) 3856 static struct io_context *current_io_context(gfp_t gfp_flags, int node)
3837 { 3857 {
3838 struct task_struct *tsk = current; 3858 struct task_struct *tsk = current;
3839 struct io_context *ret; 3859 struct io_context *ret;
3840 3860
3841 ret = tsk->io_context; 3861 ret = tsk->io_context;
3842 if (likely(ret)) 3862 if (likely(ret))
3843 return ret; 3863 return ret;
3844 3864
3845 ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); 3865 ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
3846 if (ret) { 3866 if (ret) {
3847 atomic_set(&ret->refcount, 1); 3867 atomic_set(&ret->refcount, 1);
3848 ret->task = current; 3868 ret->task = current;
3849 ret->ioprio_changed = 0; 3869 ret->ioprio_changed = 0;
3850 ret->last_waited = jiffies; /* doesn't matter... */ 3870 ret->last_waited = jiffies; /* doesn't matter... */
3851 ret->nr_batch_requests = 0; /* because this is 0 */ 3871 ret->nr_batch_requests = 0; /* because this is 0 */
3852 ret->aic = NULL; 3872 ret->aic = NULL;
3853 ret->cic_root.rb_node = NULL; 3873 ret->cic_root.rb_node = NULL;
3854 ret->ioc_data = NULL; 3874 ret->ioc_data = NULL;
3855 /* make sure set_task_ioprio() sees the settings above */ 3875 /* make sure set_task_ioprio() sees the settings above */
3856 smp_wmb(); 3876 smp_wmb();
3857 tsk->io_context = ret; 3877 tsk->io_context = ret;
3858 } 3878 }
3859 3879
3860 return ret; 3880 return ret;
3861 } 3881 }
3862 3882
3863 /* 3883 /*
3864 * If the current task has no IO context then create one and initialise it. 3884 * If the current task has no IO context then create one and initialise it.
3865 * If it does have a context, take a ref on it. 3885 * If it does have a context, take a ref on it.
3866 * 3886 *
3867 * This is always called in the context of the task which submitted the I/O. 3887 * This is always called in the context of the task which submitted the I/O.
3868 */ 3888 */
3869 struct io_context *get_io_context(gfp_t gfp_flags, int node) 3889 struct io_context *get_io_context(gfp_t gfp_flags, int node)
3870 { 3890 {
3871 struct io_context *ret; 3891 struct io_context *ret;
3872 ret = current_io_context(gfp_flags, node); 3892 ret = current_io_context(gfp_flags, node);
3873 if (likely(ret)) 3893 if (likely(ret))
3874 atomic_inc(&ret->refcount); 3894 atomic_inc(&ret->refcount);
3875 return ret; 3895 return ret;
3876 } 3896 }
3877 EXPORT_SYMBOL(get_io_context); 3897 EXPORT_SYMBOL(get_io_context);
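The put_io_context()/get_io_context() pair above defines the io_context lifetime rule: take a reference when stashing the pointer, drop it when done, and put_io_context(NULL) is a no-op. A minimal sketch of that pairing, assuming a hypothetical struct example_op that caches the submitter's io_context across an asynchronous operation:

    #include <linux/blkdev.h>

    struct example_op {
        struct io_context *ioc;
    };

    static void example_start_op(struct example_op *op)
    {
        /* may sleep and allocate; -1 means no NUMA node preference */
        op->ioc = get_io_context(GFP_KERNEL, -1);
    }

    static void example_finish_op(struct example_op *op)
    {
        put_io_context(op->ioc);    /* tolerates a NULL pointer */
        op->ioc = NULL;
    }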
3878 3898
3879 void copy_io_context(struct io_context **pdst, struct io_context **psrc) 3899 void copy_io_context(struct io_context **pdst, struct io_context **psrc)
3880 { 3900 {
3881 struct io_context *src = *psrc; 3901 struct io_context *src = *psrc;
3882 struct io_context *dst = *pdst; 3902 struct io_context *dst = *pdst;
3883 3903
3884 if (src) { 3904 if (src) {
3885 BUG_ON(atomic_read(&src->refcount) == 0); 3905 BUG_ON(atomic_read(&src->refcount) == 0);
3886 atomic_inc(&src->refcount); 3906 atomic_inc(&src->refcount);
3887 put_io_context(dst); 3907 put_io_context(dst);
3888 *pdst = src; 3908 *pdst = src;
3889 } 3909 }
3890 } 3910 }
3891 EXPORT_SYMBOL(copy_io_context); 3911 EXPORT_SYMBOL(copy_io_context);
3892 3912
3893 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) 3913 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2)
3894 { 3914 {
3895 struct io_context *temp; 3915 struct io_context *temp;
3896 temp = *ioc1; 3916 temp = *ioc1;
3897 *ioc1 = *ioc2; 3917 *ioc1 = *ioc2;
3898 *ioc2 = temp; 3918 *ioc2 = temp;
3899 } 3919 }
3900 EXPORT_SYMBOL(swap_io_context); 3920 EXPORT_SYMBOL(swap_io_context);
3901 3921
3902 /* 3922 /*
3903 * sysfs parts below 3923 * sysfs parts below
3904 */ 3924 */
3905 struct queue_sysfs_entry { 3925 struct queue_sysfs_entry {
3906 struct attribute attr; 3926 struct attribute attr;
3907 ssize_t (*show)(struct request_queue *, char *); 3927 ssize_t (*show)(struct request_queue *, char *);
3908 ssize_t (*store)(struct request_queue *, const char *, size_t); 3928 ssize_t (*store)(struct request_queue *, const char *, size_t);
3909 }; 3929 };
3910 3930
3911 static ssize_t 3931 static ssize_t
3912 queue_var_show(unsigned int var, char *page) 3932 queue_var_show(unsigned int var, char *page)
3913 { 3933 {
3914 return sprintf(page, "%d\n", var); 3934 return sprintf(page, "%d\n", var);
3915 } 3935 }
3916 3936
3917 static ssize_t 3937 static ssize_t
3918 queue_var_store(unsigned long *var, const char *page, size_t count) 3938 queue_var_store(unsigned long *var, const char *page, size_t count)
3919 { 3939 {
3920 char *p = (char *) page; 3940 char *p = (char *) page;
3921 3941
3922 *var = simple_strtoul(p, &p, 10); 3942 *var = simple_strtoul(p, &p, 10);
3923 return count; 3943 return count;
3924 } 3944 }
3925 3945
3926 static ssize_t queue_requests_show(struct request_queue *q, char *page) 3946 static ssize_t queue_requests_show(struct request_queue *q, char *page)
3927 { 3947 {
3928 return queue_var_show(q->nr_requests, (page)); 3948 return queue_var_show(q->nr_requests, (page));
3929 } 3949 }
3930 3950
3931 static ssize_t 3951 static ssize_t
3932 queue_requests_store(struct request_queue *q, const char *page, size_t count) 3952 queue_requests_store(struct request_queue *q, const char *page, size_t count)
3933 { 3953 {
3934 struct request_list *rl = &q->rq; 3954 struct request_list *rl = &q->rq;
3935 unsigned long nr; 3955 unsigned long nr;
3936 int ret = queue_var_store(&nr, page, count); 3956 int ret = queue_var_store(&nr, page, count);
3937 if (nr < BLKDEV_MIN_RQ) 3957 if (nr < BLKDEV_MIN_RQ)
3938 nr = BLKDEV_MIN_RQ; 3958 nr = BLKDEV_MIN_RQ;
3939 3959
3940 spin_lock_irq(q->queue_lock); 3960 spin_lock_irq(q->queue_lock);
3941 q->nr_requests = nr; 3961 q->nr_requests = nr;
3942 blk_queue_congestion_threshold(q); 3962 blk_queue_congestion_threshold(q);
3943 3963
3944 if (rl->count[READ] >= queue_congestion_on_threshold(q)) 3964 if (rl->count[READ] >= queue_congestion_on_threshold(q))
3945 blk_set_queue_congested(q, READ); 3965 blk_set_queue_congested(q, READ);
3946 else if (rl->count[READ] < queue_congestion_off_threshold(q)) 3966 else if (rl->count[READ] < queue_congestion_off_threshold(q))
3947 blk_clear_queue_congested(q, READ); 3967 blk_clear_queue_congested(q, READ);
3948 3968
3949 if (rl->count[WRITE] >= queue_congestion_on_threshold(q)) 3969 if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
3950 blk_set_queue_congested(q, WRITE); 3970 blk_set_queue_congested(q, WRITE);
3951 else if (rl->count[WRITE] < queue_congestion_off_threshold(q)) 3971 else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
3952 blk_clear_queue_congested(q, WRITE); 3972 blk_clear_queue_congested(q, WRITE);
3953 3973
3954 if (rl->count[READ] >= q->nr_requests) { 3974 if (rl->count[READ] >= q->nr_requests) {
3955 blk_set_queue_full(q, READ); 3975 blk_set_queue_full(q, READ);
3956 } else if (rl->count[READ]+1 <= q->nr_requests) { 3976 } else if (rl->count[READ]+1 <= q->nr_requests) {
3957 blk_clear_queue_full(q, READ); 3977 blk_clear_queue_full(q, READ);
3958 wake_up(&rl->wait[READ]); 3978 wake_up(&rl->wait[READ]);
3959 } 3979 }
3960 3980
3961 if (rl->count[WRITE] >= q->nr_requests) { 3981 if (rl->count[WRITE] >= q->nr_requests) {
3962 blk_set_queue_full(q, WRITE); 3982 blk_set_queue_full(q, WRITE);
3963 } else if (rl->count[WRITE]+1 <= q->nr_requests) { 3983 } else if (rl->count[WRITE]+1 <= q->nr_requests) {
3964 blk_clear_queue_full(q, WRITE); 3984 blk_clear_queue_full(q, WRITE);
3965 wake_up(&rl->wait[WRITE]); 3985 wake_up(&rl->wait[WRITE]);
3966 } 3986 }
3967 spin_unlock_irq(q->queue_lock); 3987 spin_unlock_irq(q->queue_lock);
3968 return ret; 3988 return ret;
3969 } 3989 }
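queue_requests_store() is reached by writing to the nr_requests sysfs attribute; the new value is clamped to BLKDEV_MIN_RQ and the congestion and queue-full thresholds are recomputed immediately, as above. A hedged userspace sketch that exercises this store path (the device path is only an example):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        /* example device; substitute the queue you actually want to tune */
        const char *path = "/sys/block/sda/queue/nr_requests";
        const char *val = "256\n";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (write(fd, val, strlen(val)) < 0)
            perror("write");
        close(fd);
        return 0;
    }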
3970 3990
3971 static ssize_t queue_ra_show(struct request_queue *q, char *page) 3991 static ssize_t queue_ra_show(struct request_queue *q, char *page)
3972 { 3992 {
3973 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 3993 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
3974 3994
3975 return queue_var_show(ra_kb, (page)); 3995 return queue_var_show(ra_kb, (page));
3976 } 3996 }
3977 3997
3978 static ssize_t 3998 static ssize_t
3979 queue_ra_store(struct request_queue *q, const char *page, size_t count) 3999 queue_ra_store(struct request_queue *q, const char *page, size_t count)
3980 { 4000 {
3981 unsigned long ra_kb; 4001 unsigned long ra_kb;
3982 ssize_t ret = queue_var_store(&ra_kb, page, count); 4002 ssize_t ret = queue_var_store(&ra_kb, page, count);
3983 4003
3984 spin_lock_irq(q->queue_lock); 4004 spin_lock_irq(q->queue_lock);
3985 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); 4005 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
3986 spin_unlock_irq(q->queue_lock); 4006 spin_unlock_irq(q->queue_lock);
3987 4007
3988 return ret; 4008 return ret;
3989 } 4009 }
3990 4010
3991 static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) 4011 static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
3992 { 4012 {
3993 int max_sectors_kb = q->max_sectors >> 1; 4013 int max_sectors_kb = q->max_sectors >> 1;
3994 4014
3995 return queue_var_show(max_sectors_kb, (page)); 4015 return queue_var_show(max_sectors_kb, (page));
3996 } 4016 }
3997 4017
3998 static ssize_t 4018 static ssize_t
3999 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 4019 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
4000 { 4020 {
4001 unsigned long max_sectors_kb, 4021 unsigned long max_sectors_kb,
4002 max_hw_sectors_kb = q->max_hw_sectors >> 1, 4022 max_hw_sectors_kb = q->max_hw_sectors >> 1,
4003 page_kb = 1 << (PAGE_CACHE_SHIFT - 10); 4023 page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
4004 ssize_t ret = queue_var_store(&max_sectors_kb, page, count); 4024 ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
4005 int ra_kb; 4025 int ra_kb;
4006 4026
4007 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) 4027 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
4008 return -EINVAL; 4028 return -EINVAL;
4009 /* 4029 /*
4010 * Take the queue lock to update the readahead and max_sectors 4030 * Take the queue lock to update the readahead and max_sectors
4011 * values synchronously: 4031 * values synchronously:
4012 */ 4032 */
4013 spin_lock_irq(q->queue_lock); 4033 spin_lock_irq(q->queue_lock);
4014 /* 4034 /*
4015 * Trim readahead window as well, if necessary: 4035 * Trim readahead window as well, if necessary:
4016 */ 4036 */
4017 ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 4037 ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
4018 if (ra_kb > max_sectors_kb) 4038 if (ra_kb > max_sectors_kb)
4019 q->backing_dev_info.ra_pages = 4039 q->backing_dev_info.ra_pages =
4020 max_sectors_kb >> (PAGE_CACHE_SHIFT - 10); 4040 max_sectors_kb >> (PAGE_CACHE_SHIFT - 10);
4021 4041
4022 q->max_sectors = max_sectors_kb << 1; 4042 q->max_sectors = max_sectors_kb << 1;
4023 spin_unlock_irq(q->queue_lock); 4043 spin_unlock_irq(q->queue_lock);
4024 4044
4025 return ret; 4045 return ret;
4026 } 4046 }
4027 4047
4028 static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) 4048 static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
4029 { 4049 {
4030 int max_hw_sectors_kb = q->max_hw_sectors >> 1; 4050 int max_hw_sectors_kb = q->max_hw_sectors >> 1;
4031 4051
4032 return queue_var_show(max_hw_sectors_kb, (page)); 4052 return queue_var_show(max_hw_sectors_kb, (page));
4033 } 4053 }
4034 4054
4035 4055
4036 static struct queue_sysfs_entry queue_requests_entry = { 4056 static struct queue_sysfs_entry queue_requests_entry = {
4037 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, 4057 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
4038 .show = queue_requests_show, 4058 .show = queue_requests_show,
4039 .store = queue_requests_store, 4059 .store = queue_requests_store,
4040 }; 4060 };
4041 4061
4042 static struct queue_sysfs_entry queue_ra_entry = { 4062 static struct queue_sysfs_entry queue_ra_entry = {
4043 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, 4063 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
4044 .show = queue_ra_show, 4064 .show = queue_ra_show,
4045 .store = queue_ra_store, 4065 .store = queue_ra_store,
4046 }; 4066 };
4047 4067
4048 static struct queue_sysfs_entry queue_max_sectors_entry = { 4068 static struct queue_sysfs_entry queue_max_sectors_entry = {
4049 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR }, 4069 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
4050 .show = queue_max_sectors_show, 4070 .show = queue_max_sectors_show,
4051 .store = queue_max_sectors_store, 4071 .store = queue_max_sectors_store,
4052 }; 4072 };
4053 4073
4054 static struct queue_sysfs_entry queue_max_hw_sectors_entry = { 4074 static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
4055 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO }, 4075 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
4056 .show = queue_max_hw_sectors_show, 4076 .show = queue_max_hw_sectors_show,
4057 }; 4077 };
4058 4078
4059 static struct queue_sysfs_entry queue_iosched_entry = { 4079 static struct queue_sysfs_entry queue_iosched_entry = {
4060 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, 4080 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
4061 .show = elv_iosched_show, 4081 .show = elv_iosched_show,
4062 .store = elv_iosched_store, 4082 .store = elv_iosched_store,
4063 }; 4083 };
4064 4084
4065 static struct attribute *default_attrs[] = { 4085 static struct attribute *default_attrs[] = {
4066 &queue_requests_entry.attr, 4086 &queue_requests_entry.attr,
4067 &queue_ra_entry.attr, 4087 &queue_ra_entry.attr,
4068 &queue_max_hw_sectors_entry.attr, 4088 &queue_max_hw_sectors_entry.attr,
4069 &queue_max_sectors_entry.attr, 4089 &queue_max_sectors_entry.attr,
4070 &queue_iosched_entry.attr, 4090 &queue_iosched_entry.attr,
4071 NULL, 4091 NULL,
4072 }; 4092 };
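default_attrs[] is the extension point for queue tunables: each queue_sysfs_entry pairs a sysfs file name with show/store callbacks, and queue_attr_show()/queue_attr_store() below dispatch to them under q->sysfs_lock. A sketch of what a new read-only attribute could look like, following the same pattern (the attribute and its helper are invented for illustration and are not part of this change):

    /* Hypothetical read-only attribute exposing the hardware segment limit. */
    static ssize_t queue_max_hw_segments_show(struct request_queue *q, char *page)
    {
        return queue_var_show(q->max_hw_segments, page);
    }

    static struct queue_sysfs_entry queue_max_hw_segments_entry = {
        .attr = { .name = "max_hw_segments", .mode = S_IRUGO },
        .show = queue_max_hw_segments_show,
    };

    /* the entry would then be listed in default_attrs[] before the NULL */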
4073 4093
4074 #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) 4094 #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
4075 4095
4076 static ssize_t 4096 static ssize_t
4077 queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4097 queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4078 { 4098 {
4079 struct queue_sysfs_entry *entry = to_queue(attr); 4099 struct queue_sysfs_entry *entry = to_queue(attr);
4080 struct request_queue *q = 4100 struct request_queue *q =
4081 container_of(kobj, struct request_queue, kobj); 4101 container_of(kobj, struct request_queue, kobj);
4082 ssize_t res; 4102 ssize_t res;
4083 4103
4084 if (!entry->show) 4104 if (!entry->show)
4085 return -EIO; 4105 return -EIO;
4086 mutex_lock(&q->sysfs_lock); 4106 mutex_lock(&q->sysfs_lock);
4087 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { 4107 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
4088 mutex_unlock(&q->sysfs_lock); 4108 mutex_unlock(&q->sysfs_lock);
4089 return -ENOENT; 4109 return -ENOENT;
4090 } 4110 }
4091 res = entry->show(q, page); 4111 res = entry->show(q, page);
4092 mutex_unlock(&q->sysfs_lock); 4112 mutex_unlock(&q->sysfs_lock);
4093 return res; 4113 return res;
4094 } 4114 }
4095 4115
4096 static ssize_t 4116 static ssize_t
4097 queue_attr_store(struct kobject *kobj, struct attribute *attr, 4117 queue_attr_store(struct kobject *kobj, struct attribute *attr,
4098 const char *page, size_t length) 4118 const char *page, size_t length)
4099 { 4119 {
4100 struct queue_sysfs_entry *entry = to_queue(attr); 4120 struct queue_sysfs_entry *entry = to_queue(attr);
4101 struct request_queue *q = container_of(kobj, struct request_queue, kobj); 4121 struct request_queue *q = container_of(kobj, struct request_queue, kobj);
4102 4122
4103 ssize_t res; 4123 ssize_t res;
4104 4124
4105 if (!entry->store) 4125 if (!entry->store)
4106 return -EIO; 4126 return -EIO;
4107 mutex_lock(&q->sysfs_lock); 4127 mutex_lock(&q->sysfs_lock);
4108 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { 4128 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
4109 mutex_unlock(&q->sysfs_lock); 4129 mutex_unlock(&q->sysfs_lock);
4110 return -ENOENT; 4130 return -ENOENT;
4111 } 4131 }
4112 res = entry->store(q, page, length); 4132 res = entry->store(q, page, length);
4113 mutex_unlock(&q->sysfs_lock); 4133 mutex_unlock(&q->sysfs_lock);
4114 return res; 4134 return res;
4115 } 4135 }
4116 4136
4117 static struct sysfs_ops queue_sysfs_ops = { 4137 static struct sysfs_ops queue_sysfs_ops = {
4118 .show = queue_attr_show, 4138 .show = queue_attr_show,
4119 .store = queue_attr_store, 4139 .store = queue_attr_store,
4120 }; 4140 };
4121 4141
4122 static struct kobj_type queue_ktype = { 4142 static struct kobj_type queue_ktype = {
4123 .sysfs_ops = &queue_sysfs_ops, 4143 .sysfs_ops = &queue_sysfs_ops,
4124 .default_attrs = default_attrs, 4144 .default_attrs = default_attrs,
4125 .release = blk_release_queue, 4145 .release = blk_release_queue,
4126 }; 4146 };
4127 4147
4128 int blk_register_queue(struct gendisk *disk) 4148 int blk_register_queue(struct gendisk *disk)
4129 { 4149 {
4130 int ret; 4150 int ret;
4131 4151
4132 struct request_queue *q = disk->queue; 4152 struct request_queue *q = disk->queue;
4133 4153
4134 if (!q || !q->request_fn) 4154 if (!q || !q->request_fn)
4135 return -ENXIO; 4155 return -ENXIO;
4136 4156
4137 q->kobj.parent = kobject_get(&disk->kobj); 4157 q->kobj.parent = kobject_get(&disk->kobj);
4138 4158
4139 ret = kobject_add(&q->kobj); 4159 ret = kobject_add(&q->kobj);
4140 if (ret < 0) 4160 if (ret < 0)
4141 return ret; 4161 return ret;
4142 4162
4143 kobject_uevent(&q->kobj, KOBJ_ADD); 4163 kobject_uevent(&q->kobj, KOBJ_ADD);
4144 4164
4145 ret = elv_register_queue(q); 4165 ret = elv_register_queue(q);
4146 if (ret) { 4166 if (ret) {
4147 kobject_uevent(&q->kobj, KOBJ_REMOVE); 4167 kobject_uevent(&q->kobj, KOBJ_REMOVE);
4148 kobject_del(&q->kobj); 4168 kobject_del(&q->kobj);
4149 return ret; 4169 return ret;
4150 } 4170 }
4151 4171
4152 return 0; 4172 return 0;
4153 } 4173 }
4154 4174
4155 void blk_unregister_queue(struct gendisk *disk) 4175 void blk_unregister_queue(struct gendisk *disk)
4156 { 4176 {
4157 struct request_queue *q = disk->queue; 4177 struct request_queue *q = disk->queue;
4158 4178
4159 if (q && q->request_fn) { 4179 if (q && q->request_fn) {
include/linux/bio.h
1 /* 1 /*
2 * 2.5 block I/O model 2 * 2.5 block I/O model
3 * 3 *
4 * Copyright (C) 2001 Jens Axboe <axboe@suse.de> 4 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 * 9 *
10 * This program is distributed in the hope that it will be useful, 10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 12
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public Licens 16 * You should have received a copy of the GNU General Public Licens
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
19 */ 19 */
20 #ifndef __LINUX_BIO_H 20 #ifndef __LINUX_BIO_H
21 #define __LINUX_BIO_H 21 #define __LINUX_BIO_H
22 22
23 #include <linux/highmem.h> 23 #include <linux/highmem.h>
24 #include <linux/mempool.h> 24 #include <linux/mempool.h>
25 #include <linux/ioprio.h> 25 #include <linux/ioprio.h>
26 26
27 #ifdef CONFIG_BLOCK 27 #ifdef CONFIG_BLOCK
28 28
29 /* Platforms may set this to teach the BIO layer about IOMMU hardware. */ 29 /* Platforms may set this to teach the BIO layer about IOMMU hardware. */
30 #include <asm/io.h> 30 #include <asm/io.h>
31 31
32 #if defined(BIO_VMERGE_MAX_SIZE) && defined(BIO_VMERGE_BOUNDARY) 32 #if defined(BIO_VMERGE_MAX_SIZE) && defined(BIO_VMERGE_BOUNDARY)
33 #define BIOVEC_VIRT_START_SIZE(x) (bvec_to_phys(x) & (BIO_VMERGE_BOUNDARY - 1)) 33 #define BIOVEC_VIRT_START_SIZE(x) (bvec_to_phys(x) & (BIO_VMERGE_BOUNDARY - 1))
34 #define BIOVEC_VIRT_OVERSIZE(x) ((x) > BIO_VMERGE_MAX_SIZE) 34 #define BIOVEC_VIRT_OVERSIZE(x) ((x) > BIO_VMERGE_MAX_SIZE)
35 #else 35 #else
36 #define BIOVEC_VIRT_START_SIZE(x) 0 36 #define BIOVEC_VIRT_START_SIZE(x) 0
37 #define BIOVEC_VIRT_OVERSIZE(x) 0 37 #define BIOVEC_VIRT_OVERSIZE(x) 0
38 #endif 38 #endif
39 39
40 #ifndef BIO_VMERGE_BOUNDARY 40 #ifndef BIO_VMERGE_BOUNDARY
41 #define BIO_VMERGE_BOUNDARY 0 41 #define BIO_VMERGE_BOUNDARY 0
42 #endif 42 #endif
43 43
44 #define BIO_DEBUG 44 #define BIO_DEBUG
45 45
46 #ifdef BIO_DEBUG 46 #ifdef BIO_DEBUG
47 #define BIO_BUG_ON BUG_ON 47 #define BIO_BUG_ON BUG_ON
48 #else 48 #else
49 #define BIO_BUG_ON 49 #define BIO_BUG_ON
50 #endif 50 #endif
51 51
52 #define BIO_MAX_PAGES 256 52 #define BIO_MAX_PAGES 256
53 #define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT) 53 #define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT)
54 #define BIO_MAX_SECTORS (BIO_MAX_SIZE >> 9) 54 #define BIO_MAX_SECTORS (BIO_MAX_SIZE >> 9)
55 55
56 /* 56 /*
57 * was unsigned short, but we might as well be ready for > 64kB I/O pages 57 * was unsigned short, but we might as well be ready for > 64kB I/O pages
58 */ 58 */
59 struct bio_vec { 59 struct bio_vec {
60 struct page *bv_page; 60 struct page *bv_page;
61 unsigned int bv_len; 61 unsigned int bv_len;
62 unsigned int bv_offset; 62 unsigned int bv_offset;
63 }; 63 };
64 64
65 struct bio_set; 65 struct bio_set;
66 struct bio; 66 struct bio;
67 typedef void (bio_end_io_t) (struct bio *, int); 67 typedef void (bio_end_io_t) (struct bio *, int);
68 typedef void (bio_destructor_t) (struct bio *); 68 typedef void (bio_destructor_t) (struct bio *);
69 69
70 /* 70 /*
71 * main unit of I/O for the block layer and lower layers (ie drivers and 71 * main unit of I/O for the block layer and lower layers (ie drivers and
72 * stacking drivers) 72 * stacking drivers)
73 */ 73 */
74 struct bio { 74 struct bio {
75 sector_t bi_sector; /* device address in 512 byte 75 sector_t bi_sector; /* device address in 512 byte
76 sectors */ 76 sectors */
77 struct bio *bi_next; /* request queue link */ 77 struct bio *bi_next; /* request queue link */
78 struct block_device *bi_bdev; 78 struct block_device *bi_bdev;
79 unsigned long bi_flags; /* status, command, etc */ 79 unsigned long bi_flags; /* status, command, etc */
80 unsigned long bi_rw; /* bottom bits READ/WRITE, 80 unsigned long bi_rw; /* bottom bits READ/WRITE,
81 * top bits priority 81 * top bits priority
82 */ 82 */
83 83
84 unsigned short bi_vcnt; /* how many bio_vec's */ 84 unsigned short bi_vcnt; /* how many bio_vec's */
85 unsigned short bi_idx; /* current index into bvl_vec */ 85 unsigned short bi_idx; /* current index into bvl_vec */
86 86
87 /* Number of segments in this BIO after 87 /* Number of segments in this BIO after
88 * physical address coalescing is performed. 88 * physical address coalescing is performed.
89 */ 89 */
90 unsigned short bi_phys_segments; 90 unsigned short bi_phys_segments;
91 91
92 /* Number of segments after physical and DMA remapping 92 /* Number of segments after physical and DMA remapping
93 * hardware coalescing is performed. 93 * hardware coalescing is performed.
94 */ 94 */
95 unsigned short bi_hw_segments; 95 unsigned short bi_hw_segments;
96 96
97 unsigned int bi_size; /* residual I/O count */ 97 unsigned int bi_size; /* residual I/O count */
98 98
99 /* 99 /*
100 * To keep track of the max hw size, we account for the 100 * To keep track of the max hw size, we account for the
101 * sizes of the first and last virtually mergeable segments 101 * sizes of the first and last virtually mergeable segments
102 * in this bio 102 * in this bio
103 */ 103 */
104 unsigned int bi_hw_front_size; 104 unsigned int bi_hw_front_size;
105 unsigned int bi_hw_back_size; 105 unsigned int bi_hw_back_size;
106 106
107 unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ 107 unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
108 108
109 struct bio_vec *bi_io_vec; /* the actual vec list */ 109 struct bio_vec *bi_io_vec; /* the actual vec list */
110 110
111 bio_end_io_t *bi_end_io; 111 bio_end_io_t *bi_end_io;
112 atomic_t bi_cnt; /* pin count */ 112 atomic_t bi_cnt; /* pin count */
113 113
114 void *bi_private; 114 void *bi_private;
115 115
116 bio_destructor_t *bi_destructor; /* destructor */ 116 bio_destructor_t *bi_destructor; /* destructor */
117 }; 117 };
118 118
119 /* 119 /*
120 * bio flags 120 * bio flags
121 */ 121 */
122 #define BIO_UPTODATE 0 /* ok after I/O completion */ 122 #define BIO_UPTODATE 0 /* ok after I/O completion */
123 #define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */ 123 #define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */
124 #define BIO_EOF 2 /* out-of-bounds error */ 124 #define BIO_EOF 2 /* out-of-bounds error */
125 #define BIO_SEG_VALID 3 /* nr_hw_seg valid */ 125 #define BIO_SEG_VALID 3 /* nr_hw_seg valid */
126 #define BIO_CLONED 4 /* doesn't own data */ 126 #define BIO_CLONED 4 /* doesn't own data */
127 #define BIO_BOUNCED 5 /* bio is a bounce bio */ 127 #define BIO_BOUNCED 5 /* bio is a bounce bio */
128 #define BIO_USER_MAPPED 6 /* contains user pages */ 128 #define BIO_USER_MAPPED 6 /* contains user pages */
129 #define BIO_EOPNOTSUPP 7 /* not supported */ 129 #define BIO_EOPNOTSUPP 7 /* not supported */
130 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) 130 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
131 131
132 /* 132 /*
133 * top 4 bits of bio flags indicate the pool this bio came from 133 * top 4 bits of bio flags indicate the pool this bio came from
134 */ 134 */
135 #define BIO_POOL_BITS (4) 135 #define BIO_POOL_BITS (4)
136 #define BIO_POOL_OFFSET (BITS_PER_LONG - BIO_POOL_BITS) 136 #define BIO_POOL_OFFSET (BITS_PER_LONG - BIO_POOL_BITS)
137 #define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET) 137 #define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET)
138 #define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET) 138 #define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET)
139 139
140 /* 140 /*
141 * bio bi_rw flags 141 * bio bi_rw flags
142 * 142 *
143 * bit 0 -- read (not set) or write (set) 143 * bit 0 -- read (not set) or write (set)
144 * bit 1 -- rw-ahead when set 144 * bit 1 -- rw-ahead when set
145 * bit 2 -- barrier 145 * bit 2 -- barrier
146 * bit 3 -- fail fast, don't want low level driver retries 146 * bit 3 -- fail fast, don't want low level driver retries
147 * bit 4 -- synchronous I/O hint: the block layer will unplug immediately 147 * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
148 */ 148 */
149 #define BIO_RW 0 149 #define BIO_RW 0
150 #define BIO_RW_AHEAD 1 150 #define BIO_RW_AHEAD 1
151 #define BIO_RW_BARRIER 2 151 #define BIO_RW_BARRIER 2
152 #define BIO_RW_FAILFAST 3 152 #define BIO_RW_FAILFAST 3
153 #define BIO_RW_SYNC 4 153 #define BIO_RW_SYNC 4
154 #define BIO_RW_META 5 154 #define BIO_RW_META 5
155 155
156 /* 156 /*
157 * upper 16 bits of bi_rw define the io priority of this bio 157 * upper 16 bits of bi_rw define the io priority of this bio
158 */ 158 */
159 #define BIO_PRIO_SHIFT (8 * sizeof(unsigned long) - IOPRIO_BITS) 159 #define BIO_PRIO_SHIFT (8 * sizeof(unsigned long) - IOPRIO_BITS)
160 #define bio_prio(bio) ((bio)->bi_rw >> BIO_PRIO_SHIFT) 160 #define bio_prio(bio) ((bio)->bi_rw >> BIO_PRIO_SHIFT)
161 #define bio_prio_valid(bio) ioprio_valid(bio_prio(bio)) 161 #define bio_prio_valid(bio) ioprio_valid(bio_prio(bio))
162 162
163 #define bio_set_prio(bio, prio) do { \ 163 #define bio_set_prio(bio, prio) do { \
164 WARN_ON(prio >= (1 << IOPRIO_BITS)); \ 164 WARN_ON(prio >= (1 << IOPRIO_BITS)); \
165 (bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1); \ 165 (bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1); \
166 (bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT); \ 166 (bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT); \
167 } while (0) 167 } while (0)
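bio_prio()/bio_set_prio() pack an ioprio value (class plus class data, IOPRIO_BITS wide) into the top bits of bi_rw, leaving the low bits for the RW/barrier/sync flags. A small sketch, assuming the helpers from <linux/ioprio.h> (example_tag_bio is illustrative):

    #include <linux/kernel.h>
    #include <linux/bio.h>
    #include <linux/ioprio.h>

    /* Tag a bio as best-effort priority level 4 and read the value back. */
    static void example_tag_bio(struct bio *bio)
    {
        bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4));

        if (bio_prio_valid(bio))
            printk(KERN_DEBUG "ioprio class %d, data %d\n",
                   IOPRIO_PRIO_CLASS(bio_prio(bio)),
                   IOPRIO_PRIO_DATA(bio_prio(bio)));
    }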
168 168
169 /* 169 /*
170 * various member access, note that bio_data should of course not be used 170 * various member access, note that bio_data should of course not be used
171 * on highmem page vectors 171 * on highmem page vectors
172 */ 172 */
173 #define bio_iovec_idx(bio, idx) (&((bio)->bi_io_vec[(idx)])) 173 #define bio_iovec_idx(bio, idx) (&((bio)->bi_io_vec[(idx)]))
174 #define bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_idx) 174 #define bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_idx)
175 #define bio_page(bio) bio_iovec((bio))->bv_page 175 #define bio_page(bio) bio_iovec((bio))->bv_page
176 #define bio_offset(bio) bio_iovec((bio))->bv_offset 176 #define bio_offset(bio) bio_iovec((bio))->bv_offset
177 #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) 177 #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx)
178 #define bio_sectors(bio) ((bio)->bi_size >> 9) 178 #define bio_sectors(bio) ((bio)->bi_size >> 9)
179 #define bio_cur_sectors(bio) (bio_iovec(bio)->bv_len >> 9)
180 #define bio_data(bio) (page_address(bio_page((bio))) + bio_offset((bio)))
181 #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) 179 #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER))
182 #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) 180 #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC))
183 #define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST)) 181 #define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
184 #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) 182 #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD))
185 #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) 183 #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META))
184 #define bio_empty_barrier(bio) (bio_barrier(bio) && !(bio)->bi_size)
185
186 static inline unsigned int bio_cur_sectors(struct bio *bio)
187 {
188 if (bio->bi_vcnt)
189 return bio_iovec(bio)->bv_len >> 9;
190
191 return 0;
192 }
193
194 static inline void *bio_data(struct bio *bio)
195 {
196 if (bio->bi_vcnt)
197 return page_address(bio_page(bio)) + bio_offset(bio);
198
199 return NULL;
200 }
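bio_empty_barrier() together with the bi_vcnt checks now in bio_cur_sectors() and bio_data() is what lets a barrier bio carry no payload at all. A hedged sketch of how a submitter might issue such a data-less barrier (the function and its completion callback are illustrative; error handling is minimal):

    #include <linux/bio.h>
    #include <linux/fs.h>
    #include <linux/errno.h>

    static void example_empty_barrier_end_io(struct bio *bio, int err)
    {
        /* err of -EOPNOTSUPP would mean the queue cannot do barriers */
        bio_put(bio);
    }

    /* Submit a data-less barrier to bdev; returns 0 or -ENOMEM. */
    static int example_issue_empty_barrier(struct block_device *bdev)
    {
        struct bio *bio = bio_alloc(GFP_KERNEL, 0);    /* zero biovecs */

        if (!bio)
            return -ENOMEM;

        bio->bi_bdev = bdev;
        bio->bi_end_io = example_empty_barrier_end_io;
        submit_bio(1 << BIO_RW_BARRIER, bio);
        return 0;
    }

Such a bio has bi_size == 0, so bio_empty_barrier() is true and bio_cur_sectors()/bio_data() return 0 and NULL instead of dereferencing a first segment that does not exist.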
186 201
187 /* 202 /*
188 * will die 203 * will die
189 */ 204 */
190 #define bio_to_phys(bio) (page_to_phys(bio_page((bio))) + (unsigned long) bio_offset((bio))) 205 #define bio_to_phys(bio) (page_to_phys(bio_page((bio))) + (unsigned long) bio_offset((bio)))
191 #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) 206 #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset)
192 207
193 /* 208 /*
194 * queues that have highmem support enabled may still need to revert to 209 * queues that have highmem support enabled may still need to revert to
195 * PIO transfers occasionally and thus map high pages temporarily. For 210 * PIO transfers occasionally and thus map high pages temporarily. For
196 * permanent PIO fall back, user is probably better off disabling highmem 211 * permanent PIO fall back, user is probably better off disabling highmem
197 * I/O completely on that queue (see ide-dma for example) 212 * I/O completely on that queue (see ide-dma for example)
198 */ 213 */
199 #define __bio_kmap_atomic(bio, idx, kmtype) \ 214 #define __bio_kmap_atomic(bio, idx, kmtype) \
200 (kmap_atomic(bio_iovec_idx((bio), (idx))->bv_page, kmtype) + \ 215 (kmap_atomic(bio_iovec_idx((bio), (idx))->bv_page, kmtype) + \
201 bio_iovec_idx((bio), (idx))->bv_offset) 216 bio_iovec_idx((bio), (idx))->bv_offset)
202 217
203 #define __bio_kunmap_atomic(addr, kmtype) kunmap_atomic(addr, kmtype) 218 #define __bio_kunmap_atomic(addr, kmtype) kunmap_atomic(addr, kmtype)
204 219
205 /* 220 /*
206 * merge helpers etc 221 * merge helpers etc
207 */ 222 */
208 223
209 #define __BVEC_END(bio) bio_iovec_idx((bio), (bio)->bi_vcnt - 1) 224 #define __BVEC_END(bio) bio_iovec_idx((bio), (bio)->bi_vcnt - 1)
210 #define __BVEC_START(bio) bio_iovec_idx((bio), (bio)->bi_idx) 225 #define __BVEC_START(bio) bio_iovec_idx((bio), (bio)->bi_idx)
211 226
212 /* 227 /*
213 * allow arch override, for eg virtualized architectures (put in asm/io.h) 228 * allow arch override, for eg virtualized architectures (put in asm/io.h)
214 */ 229 */
215 #ifndef BIOVEC_PHYS_MERGEABLE 230 #ifndef BIOVEC_PHYS_MERGEABLE
216 #define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ 231 #define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
217 ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) 232 ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2)))
218 #endif 233 #endif
219 234
220 #define BIOVEC_VIRT_MERGEABLE(vec1, vec2) \ 235 #define BIOVEC_VIRT_MERGEABLE(vec1, vec2) \
221 ((((bvec_to_phys((vec1)) + (vec1)->bv_len) | bvec_to_phys((vec2))) & (BIO_VMERGE_BOUNDARY - 1)) == 0) 236 ((((bvec_to_phys((vec1)) + (vec1)->bv_len) | bvec_to_phys((vec2))) & (BIO_VMERGE_BOUNDARY - 1)) == 0)
222 #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \ 237 #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \
223 (((addr1) | (mask)) == (((addr2) - 1) | (mask))) 238 (((addr1) | (mask)) == (((addr2) - 1) | (mask)))
224 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ 239 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
225 __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask) 240 __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask)
226 #define BIO_SEG_BOUNDARY(q, b1, b2) \ 241 #define BIO_SEG_BOUNDARY(q, b1, b2) \
227 BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2))) 242 BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2)))
228 243
229 #define bio_io_error(bio) bio_endio((bio), -EIO) 244 #define bio_io_error(bio) bio_endio((bio), -EIO)
230 245
231 /* 246 /*
232 * drivers should not use the __ version unless they _really_ want to 247 * drivers should not use the __ version unless they _really_ want to
233 * run through the entire bio and not just pending pieces 248 * run through the entire bio and not just pending pieces
234 */ 249 */
235 #define __bio_for_each_segment(bvl, bio, i, start_idx) \ 250 #define __bio_for_each_segment(bvl, bio, i, start_idx) \
236 for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \ 251 for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \
237 i < (bio)->bi_vcnt; \ 252 i < (bio)->bi_vcnt; \
238 bvl++, i++) 253 bvl++, i++)
239 254
240 #define bio_for_each_segment(bvl, bio, i) \ 255 #define bio_for_each_segment(bvl, bio, i) \
241 __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx) 256 __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx)
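bio_for_each_segment() walks the still-pending bio_vecs starting at bi_idx, handing back a pointer to each segment and its index; the __ variant starts at an explicit index and so can revisit already-completed pieces. A brief sketch (example_pending_bytes is illustrative):

    #include <linux/bio.h>

    /* Count the payload bytes still pending in a bio. */
    static unsigned int example_pending_bytes(struct bio *bio)
    {
        struct bio_vec *bvec;
        unsigned int bytes = 0;
        int i;

        bio_for_each_segment(bvec, bio, i)
            bytes += bvec->bv_len;

        return bytes;    /* matches bio->bi_size for a freshly built bio */
    }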
242 257
243 /* 258 /*
244 * get a reference to a bio, so it won't disappear. the intended use is 259 * get a reference to a bio, so it won't disappear. the intended use is
245 * something like: 260 * something like:
246 * 261 *
247 * bio_get(bio); 262 * bio_get(bio);
248 * submit_bio(rw, bio); 263 * submit_bio(rw, bio);
249 * if (bio->bi_flags ...) 264 * if (bio->bi_flags ...)
250 * do_something 265 * do_something
251 * bio_put(bio); 266 * bio_put(bio);
252 * 267 *
253 * without the bio_get(), it could potentially complete I/O before submit_bio 268 * without the bio_get(), it could potentially complete I/O before submit_bio
254 * returns. and then bio would be freed memory when if (bio->bi_flags ...) 269 * returns. and then bio would be freed memory when if (bio->bi_flags ...)
255 * runs 270 * runs
256 */ 271 */
257 #define bio_get(bio) atomic_inc(&(bio)->bi_cnt) 272 #define bio_get(bio) atomic_inc(&(bio)->bi_cnt)
258 273
259 274
260 /* 275 /*
261 * A bio_pair is used when we need to split a bio. 276 * A bio_pair is used when we need to split a bio.
262 * This can only happen for a bio that refers to just one 277 * This can only happen for a bio that refers to just one
263 * page of data, and in the unusual situation when the 278 * page of data, and in the unusual situation when the
264 * page crosses a chunk/device boundary 279 * page crosses a chunk/device boundary
265 * 280 *
266 * The address of the master bio is stored in bio1.bi_private 281 * The address of the master bio is stored in bio1.bi_private
267 * The address of the pool the pair was allocated from is stored 282 * The address of the pool the pair was allocated from is stored
268 * in bio2.bi_private 283 * in bio2.bi_private
269 */ 284 */
270 struct bio_pair { 285 struct bio_pair {
271 struct bio bio1, bio2; 286 struct bio bio1, bio2;
272 struct bio_vec bv1, bv2; 287 struct bio_vec bv1, bv2;
273 atomic_t cnt; 288 atomic_t cnt;
274 int error; 289 int error;
275 }; 290 };
276 extern struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, 291 extern struct bio_pair *bio_split(struct bio *bi, mempool_t *pool,
277 int first_sectors); 292 int first_sectors);
278 extern mempool_t *bio_split_pool; 293 extern mempool_t *bio_split_pool;
279 extern void bio_pair_release(struct bio_pair *dbio); 294 extern void bio_pair_release(struct bio_pair *dbio);
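A sketch of the split-and-resubmit pattern the bio_pair comment describes, roughly as a striping driver might use it when a single-page bio straddles a chunk boundary (the function and the boundary computation are illustrative; chunk_sects is assumed to be a power of two):

    #include <linux/bio.h>
    #include <linux/blkdev.h>

    /*
     * Split 'bio' so that the first fragment ends at the next chunk_sects
     * boundary, then resubmit both halves and drop the pair reference.
     */
    static void example_split_and_resubmit(struct bio *bio, unsigned chunk_sects)
    {
        struct bio_pair *bp = bio_split(bio, bio_split_pool,
                chunk_sects - (bio->bi_sector & (chunk_sects - 1)));

        generic_make_request(&bp->bio1);
        generic_make_request(&bp->bio2);
        bio_pair_release(bp);
    }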
280 295
281 extern struct bio_set *bioset_create(int, int); 296 extern struct bio_set *bioset_create(int, int);
282 extern void bioset_free(struct bio_set *); 297 extern void bioset_free(struct bio_set *);
283 298
284 extern struct bio *bio_alloc(gfp_t, int); 299 extern struct bio *bio_alloc(gfp_t, int);
285 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); 300 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
286 extern void bio_put(struct bio *); 301 extern void bio_put(struct bio *);
287 extern void bio_free(struct bio *, struct bio_set *); 302 extern void bio_free(struct bio *, struct bio_set *);
288 303
289 extern void bio_endio(struct bio *, int); 304 extern void bio_endio(struct bio *, int);
290 struct request_queue; 305 struct request_queue;
291 extern int bio_phys_segments(struct request_queue *, struct bio *); 306 extern int bio_phys_segments(struct request_queue *, struct bio *);
292 extern int bio_hw_segments(struct request_queue *, struct bio *); 307 extern int bio_hw_segments(struct request_queue *, struct bio *);
293 308
294 extern void __bio_clone(struct bio *, struct bio *); 309 extern void __bio_clone(struct bio *, struct bio *);
295 extern struct bio *bio_clone(struct bio *, gfp_t); 310 extern struct bio *bio_clone(struct bio *, gfp_t);
296 311
297 extern void bio_init(struct bio *); 312 extern void bio_init(struct bio *);
298 313
299 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); 314 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
300 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, 315 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
301 unsigned int, unsigned int); 316 unsigned int, unsigned int);
302 extern int bio_get_nr_vecs(struct block_device *); 317 extern int bio_get_nr_vecs(struct block_device *);
303 extern struct bio *bio_map_user(struct request_queue *, struct block_device *, 318 extern struct bio *bio_map_user(struct request_queue *, struct block_device *,
304 unsigned long, unsigned int, int); 319 unsigned long, unsigned int, int);
305 struct sg_iovec; 320 struct sg_iovec;
306 extern struct bio *bio_map_user_iov(struct request_queue *, 321 extern struct bio *bio_map_user_iov(struct request_queue *,
307 struct block_device *, 322 struct block_device *,
308 struct sg_iovec *, int, int); 323 struct sg_iovec *, int, int);
309 extern void bio_unmap_user(struct bio *); 324 extern void bio_unmap_user(struct bio *);
310 extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, 325 extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int,
311 gfp_t); 326 gfp_t);
312 extern void bio_set_pages_dirty(struct bio *bio); 327 extern void bio_set_pages_dirty(struct bio *bio);
313 extern void bio_check_pages_dirty(struct bio *bio); 328 extern void bio_check_pages_dirty(struct bio *bio);
314 extern void bio_release_pages(struct bio *bio); 329 extern void bio_release_pages(struct bio *bio);
315 extern struct bio *bio_copy_user(struct request_queue *, unsigned long, unsigned int, int); 330 extern struct bio *bio_copy_user(struct request_queue *, unsigned long, unsigned int, int);
316 extern int bio_uncopy_user(struct bio *); 331 extern int bio_uncopy_user(struct bio *);
317 void zero_fill_bio(struct bio *bio); 332 void zero_fill_bio(struct bio *bio);
318 333
319 #ifdef CONFIG_HIGHMEM 334 #ifdef CONFIG_HIGHMEM
320 /* 335 /*
321 * remember to add offset! and never ever reenable interrupts between a 336 * remember to add offset! and never ever reenable interrupts between a
322 * bvec_kmap_irq and bvec_kunmap_irq!! 337 * bvec_kmap_irq and bvec_kunmap_irq!!
323 * 338 *
324 * This function MUST be inlined - it plays with the CPU interrupt flags. 339 * This function MUST be inlined - it plays with the CPU interrupt flags.
325 */ 340 */
326 static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) 341 static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
327 { 342 {
328 unsigned long addr; 343 unsigned long addr;
329 344
330 /* 345 /*
331 * might not be a highmem page, but the preempt/irq count 346 * might not be a highmem page, but the preempt/irq count
332 * balancing is a lot nicer this way 347 * balancing is a lot nicer this way
333 */ 348 */
334 local_irq_save(*flags); 349 local_irq_save(*flags);
335 addr = (unsigned long) kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ); 350 addr = (unsigned long) kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ);
336 351
337 BUG_ON(addr & ~PAGE_MASK); 352 BUG_ON(addr & ~PAGE_MASK);
338 353
339 return (char *) addr + bvec->bv_offset; 354 return (char *) addr + bvec->bv_offset;
340 } 355 }
341 356
342 static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) 357 static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
343 { 358 {
344 unsigned long ptr = (unsigned long) buffer & PAGE_MASK; 359 unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
345 360
346 kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ); 361 kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ);
347 local_irq_restore(*flags); 362 local_irq_restore(*flags);
348 } 363 }
349 364
350 #else 365 #else
351 #define bvec_kmap_irq(bvec, flags) (page_address((bvec)->bv_page) + (bvec)->bv_offset) 366 #define bvec_kmap_irq(bvec, flags) (page_address((bvec)->bv_page) + (bvec)->bv_offset)
352 #define bvec_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0) 367 #define bvec_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0)
353 #endif 368 #endif
354 369
355 static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, 370 static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx,
356 unsigned long *flags) 371 unsigned long *flags)
357 { 372 {
358 return bvec_kmap_irq(bio_iovec_idx(bio, idx), flags); 373 return bvec_kmap_irq(bio_iovec_idx(bio, idx), flags);
359 } 374 }
360 #define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags) 375 #define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags)
361 376
362 #define bio_kmap_irq(bio, flags) \ 377 #define bio_kmap_irq(bio, flags) \
363 __bio_kmap_irq((bio), (bio)->bi_idx, (flags)) 378 __bio_kmap_irq((bio), (bio)->bi_idx, (flags))
364 #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) 379 #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags)
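On highmem configurations the mapping must stay atomic, which is why the helpers above save and restore the interrupt flags around kmap_atomic(); the lowmem fallbacks reduce to plain page_address() arithmetic. A short sketch, assuming the caller only needs a quick peek at the current segment (example_peek_first_byte is illustrative):

    #include <linux/bio.h>

    /* Read the first byte of the current segment of a bio. */
    static unsigned char example_peek_first_byte(struct bio *bio)
    {
        unsigned long flags;
        char *buf = bio_kmap_irq(bio, &flags);
        unsigned char first = buf[0];

        bio_kunmap_irq(buf, &flags);
        return first;
    }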
365 380
366 #endif /* CONFIG_BLOCK */ 381 #endif /* CONFIG_BLOCK */
include/linux/blkdev.h
1 #ifndef _LINUX_BLKDEV_H 1 #ifndef _LINUX_BLKDEV_H
2 #define _LINUX_BLKDEV_H 2 #define _LINUX_BLKDEV_H
3 3
4 #ifdef CONFIG_BLOCK 4 #ifdef CONFIG_BLOCK
5 5
6 #include <linux/sched.h> 6 #include <linux/sched.h>
7 #include <linux/major.h> 7 #include <linux/major.h>
8 #include <linux/genhd.h> 8 #include <linux/genhd.h>
9 #include <linux/list.h> 9 #include <linux/list.h>
10 #include <linux/timer.h> 10 #include <linux/timer.h>
11 #include <linux/workqueue.h> 11 #include <linux/workqueue.h>
12 #include <linux/pagemap.h> 12 #include <linux/pagemap.h>
13 #include <linux/backing-dev.h> 13 #include <linux/backing-dev.h>
14 #include <linux/wait.h> 14 #include <linux/wait.h>
15 #include <linux/mempool.h> 15 #include <linux/mempool.h>
16 #include <linux/bio.h> 16 #include <linux/bio.h>
17 #include <linux/module.h> 17 #include <linux/module.h>
18 #include <linux/stringify.h> 18 #include <linux/stringify.h>
19 #include <linux/bsg.h> 19 #include <linux/bsg.h>
20 20
21 #include <asm/scatterlist.h> 21 #include <asm/scatterlist.h>
22 22
23 struct scsi_ioctl_command; 23 struct scsi_ioctl_command;
24 24
25 struct request_queue; 25 struct request_queue;
26 typedef struct request_queue request_queue_t __deprecated; 26 typedef struct request_queue request_queue_t __deprecated;
27 struct elevator_queue; 27 struct elevator_queue;
28 typedef struct elevator_queue elevator_t; 28 typedef struct elevator_queue elevator_t;
29 struct request_pm_state; 29 struct request_pm_state;
30 struct blk_trace; 30 struct blk_trace;
31 struct request; 31 struct request;
32 struct sg_io_hdr; 32 struct sg_io_hdr;
33 33
34 #define BLKDEV_MIN_RQ 4 34 #define BLKDEV_MIN_RQ 4
35 #define BLKDEV_MAX_RQ 128 /* Default maximum */ 35 #define BLKDEV_MAX_RQ 128 /* Default maximum */
36 36
37 /* 37 /*
38 * This is the per-process anticipatory I/O scheduler state. 38 * This is the per-process anticipatory I/O scheduler state.
39 */ 39 */
40 struct as_io_context { 40 struct as_io_context {
41 spinlock_t lock; 41 spinlock_t lock;
42 42
43 void (*dtor)(struct as_io_context *aic); /* destructor */ 43 void (*dtor)(struct as_io_context *aic); /* destructor */
44 void (*exit)(struct as_io_context *aic); /* called on task exit */ 44 void (*exit)(struct as_io_context *aic); /* called on task exit */
45 45
46 unsigned long state; 46 unsigned long state;
47 atomic_t nr_queued; /* queued reads & sync writes */ 47 atomic_t nr_queued; /* queued reads & sync writes */
48 atomic_t nr_dispatched; /* number of requests gone to the drivers */ 48 atomic_t nr_dispatched; /* number of requests gone to the drivers */
49 49
50 /* IO History tracking */ 50 /* IO History tracking */
51 /* Thinktime */ 51 /* Thinktime */
52 unsigned long last_end_request; 52 unsigned long last_end_request;
53 unsigned long ttime_total; 53 unsigned long ttime_total;
54 unsigned long ttime_samples; 54 unsigned long ttime_samples;
55 unsigned long ttime_mean; 55 unsigned long ttime_mean;
56 /* Layout pattern */ 56 /* Layout pattern */
57 unsigned int seek_samples; 57 unsigned int seek_samples;
58 sector_t last_request_pos; 58 sector_t last_request_pos;
59 u64 seek_total; 59 u64 seek_total;
60 sector_t seek_mean; 60 sector_t seek_mean;
61 }; 61 };
62 62
63 struct cfq_queue; 63 struct cfq_queue;
64 struct cfq_io_context { 64 struct cfq_io_context {
65 struct rb_node rb_node; 65 struct rb_node rb_node;
66 void *key; 66 void *key;
67 67
68 struct cfq_queue *cfqq[2]; 68 struct cfq_queue *cfqq[2];
69 69
70 struct io_context *ioc; 70 struct io_context *ioc;
71 71
72 unsigned long last_end_request; 72 unsigned long last_end_request;
73 sector_t last_request_pos; 73 sector_t last_request_pos;
74 74
75 unsigned long ttime_total; 75 unsigned long ttime_total;
76 unsigned long ttime_samples; 76 unsigned long ttime_samples;
77 unsigned long ttime_mean; 77 unsigned long ttime_mean;
78 78
79 unsigned int seek_samples; 79 unsigned int seek_samples;
80 u64 seek_total; 80 u64 seek_total;
81 sector_t seek_mean; 81 sector_t seek_mean;
82 82
83 struct list_head queue_list; 83 struct list_head queue_list;
84 84
85 void (*dtor)(struct io_context *); /* destructor */ 85 void (*dtor)(struct io_context *); /* destructor */
86 void (*exit)(struct io_context *); /* called on task exit */ 86 void (*exit)(struct io_context *); /* called on task exit */
87 }; 87 };
88 88
89 /* 89 /*
90 * This is the per-process I/O subsystem state. It is refcounted and 90 * This is the per-process I/O subsystem state. It is refcounted and
91 * kmalloc'ed. Currently all fields are modified in process io context 91 * kmalloc'ed. Currently all fields are modified in process io context
92 * (apart from the atomic refcount), so require no locking. 92 * (apart from the atomic refcount), so require no locking.
93 */ 93 */
94 struct io_context { 94 struct io_context {
95 atomic_t refcount; 95 atomic_t refcount;
96 struct task_struct *task; 96 struct task_struct *task;
97 97
98 unsigned int ioprio_changed; 98 unsigned int ioprio_changed;
99 99
100 /* 100 /*
101 * For request batching 101 * For request batching
102 */ 102 */
103 unsigned long last_waited; /* Time last woken after wait for request */ 103 unsigned long last_waited; /* Time last woken after wait for request */
104 int nr_batch_requests; /* Number of requests left in the batch */ 104 int nr_batch_requests; /* Number of requests left in the batch */
105 105
106 struct as_io_context *aic; 106 struct as_io_context *aic;
107 struct rb_root cic_root; 107 struct rb_root cic_root;
108 void *ioc_data; 108 void *ioc_data;
109 }; 109 };
110 110
111 void put_io_context(struct io_context *ioc); 111 void put_io_context(struct io_context *ioc);
112 void exit_io_context(void); 112 void exit_io_context(void);
113 struct io_context *get_io_context(gfp_t gfp_flags, int node); 113 struct io_context *get_io_context(gfp_t gfp_flags, int node);
114 void copy_io_context(struct io_context **pdst, struct io_context **psrc); 114 void copy_io_context(struct io_context **pdst, struct io_context **psrc);
115 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2); 115 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2);
116 116
117 struct request; 117 struct request;
118 typedef void (rq_end_io_fn)(struct request *, int); 118 typedef void (rq_end_io_fn)(struct request *, int);
119 119
120 struct request_list { 120 struct request_list {
121 int count[2]; 121 int count[2];
122 int starved[2]; 122 int starved[2];
123 int elvpriv; 123 int elvpriv;
124 mempool_t *rq_pool; 124 mempool_t *rq_pool;
125 wait_queue_head_t wait[2]; 125 wait_queue_head_t wait[2];
126 }; 126 };
127 127
128 /* 128 /*
129 * request command types 129 * request command types
130 */ 130 */
131 enum rq_cmd_type_bits { 131 enum rq_cmd_type_bits {
132 REQ_TYPE_FS = 1, /* fs request */ 132 REQ_TYPE_FS = 1, /* fs request */
133 REQ_TYPE_BLOCK_PC, /* scsi command */ 133 REQ_TYPE_BLOCK_PC, /* scsi command */
134 REQ_TYPE_SENSE, /* sense request */ 134 REQ_TYPE_SENSE, /* sense request */
135 REQ_TYPE_PM_SUSPEND, /* suspend request */ 135 REQ_TYPE_PM_SUSPEND, /* suspend request */
136 REQ_TYPE_PM_RESUME, /* resume request */ 136 REQ_TYPE_PM_RESUME, /* resume request */
137 REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ 137 REQ_TYPE_PM_SHUTDOWN, /* shutdown request */
138 REQ_TYPE_FLUSH, /* flush request */ 138 REQ_TYPE_FLUSH, /* flush request */
139 REQ_TYPE_SPECIAL, /* driver defined type */ 139 REQ_TYPE_SPECIAL, /* driver defined type */
140 REQ_TYPE_LINUX_BLOCK, /* generic block layer message */ 140 REQ_TYPE_LINUX_BLOCK, /* generic block layer message */
141 /* 141 /*
142 * for ATA/ATAPI devices. this really doesn't belong here, ide should 142 * for ATA/ATAPI devices. this really doesn't belong here, ide should
143 * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver 143 * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver
144 * private REQ_LB opcodes to differentiate what type of request this is 144 * private REQ_LB opcodes to differentiate what type of request this is
145 */ 145 */
146 REQ_TYPE_ATA_CMD, 146 REQ_TYPE_ATA_CMD,
147 REQ_TYPE_ATA_TASK, 147 REQ_TYPE_ATA_TASK,
148 REQ_TYPE_ATA_TASKFILE, 148 REQ_TYPE_ATA_TASKFILE,
149 REQ_TYPE_ATA_PC, 149 REQ_TYPE_ATA_PC,
150 }; 150 };
151 151
152 /* 152 /*
153 * For request of type REQ_TYPE_LINUX_BLOCK, rq->cmd[0] is the opcode being 153 * For request of type REQ_TYPE_LINUX_BLOCK, rq->cmd[0] is the opcode being
154 * sent down (similar to how REQ_TYPE_BLOCK_PC means that ->cmd[] holds a 154 * sent down (similar to how REQ_TYPE_BLOCK_PC means that ->cmd[] holds a
155 * SCSI cdb). 155 * SCSI cdb).
156 * 156 *
157 * 0x00 -> 0x3f are driver private, to be used for whatever purpose they need, 157 * 0x00 -> 0x3f are driver private, to be used for whatever purpose they need,
158 * typically to differentiate REQ_TYPE_SPECIAL requests. 158 * typically to differentiate REQ_TYPE_SPECIAL requests.
159 * 159 *
160 */ 160 */
161 enum { 161 enum {
162 /* 162 /*
163 * just examples for now 163 * just examples for now
164 */ 164 */
165 REQ_LB_OP_EJECT = 0x40, /* eject request */ 165 REQ_LB_OP_EJECT = 0x40, /* eject request */
166 REQ_LB_OP_FLUSH = 0x41, /* flush device */ 166 REQ_LB_OP_FLUSH = 0x41, /* flush device */
167 }; 167 };
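For REQ_TYPE_LINUX_BLOCK requests the driver is expected to look at rq->cmd[0] for the generic opcode, as the comment above explains. A hedged sketch of how a driver's dispatch path might recognise the example opcodes (the handler names are invented):

    #include <linux/blkdev.h>

    /* Illustrative dispatch of a generic block-layer message request. */
    static void example_handle_lb_request(struct request *rq)
    {
        if (rq->cmd_type != REQ_TYPE_LINUX_BLOCK)
            return;

        switch (rq->cmd[0]) {
        case REQ_LB_OP_EJECT:
            /* example_do_eject(rq); */
            break;
        case REQ_LB_OP_FLUSH:
            /* example_do_cache_flush(rq); */
            break;
        }
    }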
168 168
169 /* 169 /*
170 * request type modified bits. first three bits match BIO_RW* bits, important 170 * request type modified bits. first three bits match BIO_RW* bits, important
171 */ 171 */
172 enum rq_flag_bits { 172 enum rq_flag_bits {
173 __REQ_RW, /* not set, read. set, write */ 173 __REQ_RW, /* not set, read. set, write */
174 __REQ_FAILFAST, /* no low level driver retries */ 174 __REQ_FAILFAST, /* no low level driver retries */
175 __REQ_SORTED, /* elevator knows about this request */ 175 __REQ_SORTED, /* elevator knows about this request */
176 __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ 176 __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */
177 __REQ_HARDBARRIER, /* may not be passed by drive either */ 177 __REQ_HARDBARRIER, /* may not be passed by drive either */
178 __REQ_FUA, /* forced unit access */ 178 __REQ_FUA, /* forced unit access */
179 __REQ_NOMERGE, /* don't touch this for merging */ 179 __REQ_NOMERGE, /* don't touch this for merging */
180 __REQ_STARTED, /* drive already may have started this one */ 180 __REQ_STARTED, /* drive already may have started this one */
181 __REQ_DONTPREP, /* don't call prep for this one */ 181 __REQ_DONTPREP, /* don't call prep for this one */
182 __REQ_QUEUED, /* uses queueing */ 182 __REQ_QUEUED, /* uses queueing */
183 __REQ_ELVPRIV, /* elevator private data attached */ 183 __REQ_ELVPRIV, /* elevator private data attached */
184 __REQ_FAILED, /* set if the request failed */ 184 __REQ_FAILED, /* set if the request failed */
185 __REQ_QUIET, /* don't worry about errors */ 185 __REQ_QUIET, /* don't worry about errors */
186 __REQ_PREEMPT, /* set for "ide_preempt" requests */ 186 __REQ_PREEMPT, /* set for "ide_preempt" requests */
187 __REQ_ORDERED_COLOR, /* is before or after barrier */ 187 __REQ_ORDERED_COLOR, /* is before or after barrier */
188 __REQ_RW_SYNC, /* request is sync (O_DIRECT) */ 188 __REQ_RW_SYNC, /* request is sync (O_DIRECT) */
189 __REQ_ALLOCED, /* request came from our alloc pool */ 189 __REQ_ALLOCED, /* request came from our alloc pool */
190 __REQ_RW_META, /* metadata io request */ 190 __REQ_RW_META, /* metadata io request */
191 __REQ_NR_BITS, /* stops here */ 191 __REQ_NR_BITS, /* stops here */
192 }; 192 };
193 193
194 #define REQ_RW (1 << __REQ_RW) 194 #define REQ_RW (1 << __REQ_RW)
195 #define REQ_FAILFAST (1 << __REQ_FAILFAST) 195 #define REQ_FAILFAST (1 << __REQ_FAILFAST)
196 #define REQ_SORTED (1 << __REQ_SORTED) 196 #define REQ_SORTED (1 << __REQ_SORTED)
197 #define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) 197 #define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER)
198 #define REQ_HARDBARRIER (1 << __REQ_HARDBARRIER) 198 #define REQ_HARDBARRIER (1 << __REQ_HARDBARRIER)
199 #define REQ_FUA (1 << __REQ_FUA) 199 #define REQ_FUA (1 << __REQ_FUA)
200 #define REQ_NOMERGE (1 << __REQ_NOMERGE) 200 #define REQ_NOMERGE (1 << __REQ_NOMERGE)
201 #define REQ_STARTED (1 << __REQ_STARTED) 201 #define REQ_STARTED (1 << __REQ_STARTED)
202 #define REQ_DONTPREP (1 << __REQ_DONTPREP) 202 #define REQ_DONTPREP (1 << __REQ_DONTPREP)
203 #define REQ_QUEUED (1 << __REQ_QUEUED) 203 #define REQ_QUEUED (1 << __REQ_QUEUED)
204 #define REQ_ELVPRIV (1 << __REQ_ELVPRIV) 204 #define REQ_ELVPRIV (1 << __REQ_ELVPRIV)
205 #define REQ_FAILED (1 << __REQ_FAILED) 205 #define REQ_FAILED (1 << __REQ_FAILED)
206 #define REQ_QUIET (1 << __REQ_QUIET) 206 #define REQ_QUIET (1 << __REQ_QUIET)
207 #define REQ_PREEMPT (1 << __REQ_PREEMPT) 207 #define REQ_PREEMPT (1 << __REQ_PREEMPT)
208 #define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR) 208 #define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR)
209 #define REQ_RW_SYNC (1 << __REQ_RW_SYNC) 209 #define REQ_RW_SYNC (1 << __REQ_RW_SYNC)
210 #define REQ_ALLOCED (1 << __REQ_ALLOCED) 210 #define REQ_ALLOCED (1 << __REQ_ALLOCED)
211 #define REQ_RW_META (1 << __REQ_RW_META) 211 #define REQ_RW_META (1 << __REQ_RW_META)
212 212
213 #define BLK_MAX_CDB 16 213 #define BLK_MAX_CDB 16
214 214
215 /* 215 /*
216 * try to put the fields that are referenced together in the same cacheline 216 * try to put the fields that are referenced together in the same cacheline
217 */ 217 */
218 struct request { 218 struct request {
219 struct list_head queuelist; 219 struct list_head queuelist;
220 struct list_head donelist; 220 struct list_head donelist;
221 221
222 struct request_queue *q; 222 struct request_queue *q;
223 223
224 unsigned int cmd_flags; 224 unsigned int cmd_flags;
225 enum rq_cmd_type_bits cmd_type; 225 enum rq_cmd_type_bits cmd_type;
226 226
227 /* Maintain bio traversal state for part by part I/O submission. 227 /* Maintain bio traversal state for part by part I/O submission.
228 * hard_* are block layer internals, no driver should touch them! 228 * hard_* are block layer internals, no driver should touch them!
229 */ 229 */
230 230
231 sector_t sector; /* next sector to submit */ 231 sector_t sector; /* next sector to submit */
232 sector_t hard_sector; /* next sector to complete */ 232 sector_t hard_sector; /* next sector to complete */
233 unsigned long nr_sectors; /* no. of sectors left to submit */ 233 unsigned long nr_sectors; /* no. of sectors left to submit */
234 unsigned long hard_nr_sectors; /* no. of sectors left to complete */ 234 unsigned long hard_nr_sectors; /* no. of sectors left to complete */
235 /* no. of sectors left to submit in the current segment */ 235 /* no. of sectors left to submit in the current segment */
236 unsigned int current_nr_sectors; 236 unsigned int current_nr_sectors;
237 237
238 /* no. of sectors left to complete in the current segment */ 238 /* no. of sectors left to complete in the current segment */
239 unsigned int hard_cur_sectors; 239 unsigned int hard_cur_sectors;
240 240
241 struct bio *bio; 241 struct bio *bio;
242 struct bio *biotail; 242 struct bio *biotail;
243 243
244 struct hlist_node hash; /* merge hash */ 244 struct hlist_node hash; /* merge hash */
245 /* 245 /*
246 * The rb_node is only used inside the io scheduler, requests 246 * The rb_node is only used inside the io scheduler, requests
247 * are pruned when moved to the dispatch queue. So let the 247 * are pruned when moved to the dispatch queue. So let the
248 * completion_data share space with the rb_node. 248 * completion_data share space with the rb_node.
249 */ 249 */
250 union { 250 union {
251 struct rb_node rb_node; /* sort/lookup */ 251 struct rb_node rb_node; /* sort/lookup */
252 void *completion_data; 252 void *completion_data;
253 }; 253 };
254 254
255 /* 255 /*
256 * two pointers are available for the IO schedulers, if they need 256 * two pointers are available for the IO schedulers, if they need
257 * more they have to dynamically allocate it. 257 * more they have to dynamically allocate it.
258 */ 258 */
259 void *elevator_private; 259 void *elevator_private;
260 void *elevator_private2; 260 void *elevator_private2;
261 261
262 struct gendisk *rq_disk; 262 struct gendisk *rq_disk;
263 unsigned long start_time; 263 unsigned long start_time;
264 264
265 /* Number of scatter-gather DMA addr+len pairs after 265 /* Number of scatter-gather DMA addr+len pairs after
266 * physical address coalescing is performed. 266 * physical address coalescing is performed.
267 */ 267 */
268 unsigned short nr_phys_segments; 268 unsigned short nr_phys_segments;
269 269
270 /* Number of scatter-gather addr+len pairs after 270 /* Number of scatter-gather addr+len pairs after
271 * physical and DMA remapping hardware coalescing is performed. 271 * physical and DMA remapping hardware coalescing is performed.
272 * This is the number of scatter-gather entries the driver 272 * This is the number of scatter-gather entries the driver
273 * will actually have to deal with after DMA mapping is done. 273 * will actually have to deal with after DMA mapping is done.
274 */ 274 */
275 unsigned short nr_hw_segments; 275 unsigned short nr_hw_segments;
276 276
277 unsigned short ioprio; 277 unsigned short ioprio;
278 278
279 void *special; 279 void *special;
280 char *buffer; 280 char *buffer;
281 281
282 int tag; 282 int tag;
283 int errors; 283 int errors;
284 284
285 int ref_count; 285 int ref_count;
286 286
287 /* 287 /*
288 * when request is used as a packet command carrier 288 * when request is used as a packet command carrier
289 */ 289 */
290 unsigned int cmd_len; 290 unsigned int cmd_len;
291 unsigned char cmd[BLK_MAX_CDB]; 291 unsigned char cmd[BLK_MAX_CDB];
292 292
293 unsigned int data_len; 293 unsigned int data_len;
294 unsigned int sense_len; 294 unsigned int sense_len;
295 void *data; 295 void *data;
296 void *sense; 296 void *sense;
297 297
298 unsigned int timeout; 298 unsigned int timeout;
299 int retries; 299 int retries;
300 300
301 /* 301 /*
302 * completion callback. 302 * completion callback.
303 */ 303 */
304 rq_end_io_fn *end_io; 304 rq_end_io_fn *end_io;
305 void *end_io_data; 305 void *end_io_data;
306 306
307 /* for bidi */ 307 /* for bidi */
308 struct request *next_rq; 308 struct request *next_rq;
309 }; 309 };
310 310
311 /* 311 /*
312 * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME 312 * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME
313 * requests. Some step values could eventually be made generic. 313 * requests. Some step values could eventually be made generic.
314 */ 314 */
315 struct request_pm_state 315 struct request_pm_state
316 { 316 {
317 /* PM state machine step value, currently driver specific */ 317 /* PM state machine step value, currently driver specific */
318 int pm_step; 318 int pm_step;
319 /* requested PM state value (S1, S2, S3, S4, ...) */ 319 /* requested PM state value (S1, S2, S3, S4, ...) */
320 u32 pm_state; 320 u32 pm_state;
321 void* data; /* for driver use */ 321 void* data; /* for driver use */
322 }; 322 };
323 323
324 #include <linux/elevator.h> 324 #include <linux/elevator.h>
325 325
326 typedef void (request_fn_proc) (struct request_queue *q); 326 typedef void (request_fn_proc) (struct request_queue *q);
327 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio); 327 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
328 typedef int (prep_rq_fn) (struct request_queue *, struct request *); 328 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
329 typedef void (unplug_fn) (struct request_queue *); 329 typedef void (unplug_fn) (struct request_queue *);
330 330
331 struct bio_vec; 331 struct bio_vec;
332 typedef int (merge_bvec_fn) (struct request_queue *, struct bio *, struct bio_vec *); 332 typedef int (merge_bvec_fn) (struct request_queue *, struct bio *, struct bio_vec *);
333 typedef int (issue_flush_fn) (struct request_queue *, struct gendisk *, sector_t *); 333 typedef int (issue_flush_fn) (struct request_queue *, struct gendisk *, sector_t *);
334 typedef void (prepare_flush_fn) (struct request_queue *, struct request *); 334 typedef void (prepare_flush_fn) (struct request_queue *, struct request *);
335 typedef void (softirq_done_fn)(struct request *); 335 typedef void (softirq_done_fn)(struct request *);
336 336
337 enum blk_queue_state { 337 enum blk_queue_state {
338 Queue_down, 338 Queue_down,
339 Queue_up, 339 Queue_up,
340 }; 340 };
341 341
342 struct blk_queue_tag { 342 struct blk_queue_tag {
343 struct request **tag_index; /* map of busy tags */ 343 struct request **tag_index; /* map of busy tags */
344 unsigned long *tag_map; /* bit map of free/busy tags */ 344 unsigned long *tag_map; /* bit map of free/busy tags */
345 struct list_head busy_list; /* fifo list of busy tags */ 345 struct list_head busy_list; /* fifo list of busy tags */
346 int busy; /* current depth */ 346 int busy; /* current depth */
347 int max_depth; /* what we will send to device */ 347 int max_depth; /* what we will send to device */
348 int real_max_depth; /* what the array can hold */ 348 int real_max_depth; /* what the array can hold */
349 atomic_t refcnt; /* map can be shared */ 349 atomic_t refcnt; /* map can be shared */
350 }; 350 };
351 351
352 struct request_queue 352 struct request_queue
353 { 353 {
354 /* 354 /*
355 * Together with queue_head for cacheline sharing 355 * Together with queue_head for cacheline sharing
356 */ 356 */
357 struct list_head queue_head; 357 struct list_head queue_head;
358 struct request *last_merge; 358 struct request *last_merge;
359 elevator_t *elevator; 359 elevator_t *elevator;
360 360
361 /* 361 /*
362 * the queue request freelist, one for reads and one for writes 362 * the queue request freelist, one for reads and one for writes
363 */ 363 */
364 struct request_list rq; 364 struct request_list rq;
365 365
366 request_fn_proc *request_fn; 366 request_fn_proc *request_fn;
367 make_request_fn *make_request_fn; 367 make_request_fn *make_request_fn;
368 prep_rq_fn *prep_rq_fn; 368 prep_rq_fn *prep_rq_fn;
369 unplug_fn *unplug_fn; 369 unplug_fn *unplug_fn;
370 merge_bvec_fn *merge_bvec_fn; 370 merge_bvec_fn *merge_bvec_fn;
371 issue_flush_fn *issue_flush_fn; 371 issue_flush_fn *issue_flush_fn;
372 prepare_flush_fn *prepare_flush_fn; 372 prepare_flush_fn *prepare_flush_fn;
373 softirq_done_fn *softirq_done_fn; 373 softirq_done_fn *softirq_done_fn;
374 374
375 /* 375 /*
376 * Dispatch queue sorting 376 * Dispatch queue sorting
377 */ 377 */
378 sector_t end_sector; 378 sector_t end_sector;
379 struct request *boundary_rq; 379 struct request *boundary_rq;
380 380
381 /* 381 /*
382 * Auto-unplugging state 382 * Auto-unplugging state
383 */ 383 */
384 struct timer_list unplug_timer; 384 struct timer_list unplug_timer;
385 int unplug_thresh; /* After this many requests */ 385 int unplug_thresh; /* After this many requests */
386 unsigned long unplug_delay; /* After this many jiffies */ 386 unsigned long unplug_delay; /* After this many jiffies */
387 struct work_struct unplug_work; 387 struct work_struct unplug_work;
388 388
389 struct backing_dev_info backing_dev_info; 389 struct backing_dev_info backing_dev_info;
390 390
391 /* 391 /*
392 * The queue owner gets to use this for whatever they like. 392 * The queue owner gets to use this for whatever they like.
393 * ll_rw_blk doesn't touch it. 393 * ll_rw_blk doesn't touch it.
394 */ 394 */
395 void *queuedata; 395 void *queuedata;
396 396
397 /* 397 /*
398 * queue needs bounce pages for pages above this limit 398 * queue needs bounce pages for pages above this limit
399 */ 399 */
400 unsigned long bounce_pfn; 400 unsigned long bounce_pfn;
401 gfp_t bounce_gfp; 401 gfp_t bounce_gfp;
402 402
403 /* 403 /*
404 * various queue flags, see QUEUE_* below 404 * various queue flags, see QUEUE_* below
405 */ 405 */
406 unsigned long queue_flags; 406 unsigned long queue_flags;
407 407
408 /* 408 /*
409 * protects queue structures from reentrancy. ->__queue_lock should 409 * protects queue structures from reentrancy. ->__queue_lock should
410 * _never_ be used directly, it is queue private. always use 410 * _never_ be used directly, it is queue private. always use
411 * ->queue_lock. 411 * ->queue_lock.
412 */ 412 */
413 spinlock_t __queue_lock; 413 spinlock_t __queue_lock;
414 spinlock_t *queue_lock; 414 spinlock_t *queue_lock;
415 415
416 /* 416 /*
417 * queue kobject 417 * queue kobject
418 */ 418 */
419 struct kobject kobj; 419 struct kobject kobj;
420 420
421 /* 421 /*
422 * queue settings 422 * queue settings
423 */ 423 */
424 unsigned long nr_requests; /* Max # of requests */ 424 unsigned long nr_requests; /* Max # of requests */
425 unsigned int nr_congestion_on; 425 unsigned int nr_congestion_on;
426 unsigned int nr_congestion_off; 426 unsigned int nr_congestion_off;
427 unsigned int nr_batching; 427 unsigned int nr_batching;
428 428
429 unsigned int max_sectors; 429 unsigned int max_sectors;
430 unsigned int max_hw_sectors; 430 unsigned int max_hw_sectors;
431 unsigned short max_phys_segments; 431 unsigned short max_phys_segments;
432 unsigned short max_hw_segments; 432 unsigned short max_hw_segments;
433 unsigned short hardsect_size; 433 unsigned short hardsect_size;
434 unsigned int max_segment_size; 434 unsigned int max_segment_size;
435 435
436 unsigned long seg_boundary_mask; 436 unsigned long seg_boundary_mask;
437 unsigned int dma_alignment; 437 unsigned int dma_alignment;
438 438
439 struct blk_queue_tag *queue_tags; 439 struct blk_queue_tag *queue_tags;
440 440
441 unsigned int nr_sorted; 441 unsigned int nr_sorted;
442 unsigned int in_flight; 442 unsigned int in_flight;
443 443
444 /* 444 /*
445 * sg stuff 445 * sg stuff
446 */ 446 */
447 unsigned int sg_timeout; 447 unsigned int sg_timeout;
448 unsigned int sg_reserved_size; 448 unsigned int sg_reserved_size;
449 int node; 449 int node;
450 #ifdef CONFIG_BLK_DEV_IO_TRACE 450 #ifdef CONFIG_BLK_DEV_IO_TRACE
451 struct blk_trace *blk_trace; 451 struct blk_trace *blk_trace;
452 #endif 452 #endif
453 /* 453 /*
454 * reserved for flush operations 454 * reserved for flush operations
455 */ 455 */
456 unsigned int ordered, next_ordered, ordseq; 456 unsigned int ordered, next_ordered, ordseq;
457 int orderr, ordcolor; 457 int orderr, ordcolor;
458 struct request pre_flush_rq, bar_rq, post_flush_rq; 458 struct request pre_flush_rq, bar_rq, post_flush_rq;
459 struct request *orig_bar_rq; 459 struct request *orig_bar_rq;
460 460
461 struct mutex sysfs_lock; 461 struct mutex sysfs_lock;
462 462
463 #if defined(CONFIG_BLK_DEV_BSG) 463 #if defined(CONFIG_BLK_DEV_BSG)
464 struct bsg_class_device bsg_dev; 464 struct bsg_class_device bsg_dev;
465 #endif 465 #endif
466 }; 466 };
467 467
468 #define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ 468 #define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */
469 #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ 469 #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
470 #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */ 470 #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */
471 #define QUEUE_FLAG_READFULL 3 /* read queue has been filled */ 471 #define QUEUE_FLAG_READFULL 3 /* read queue has been filled */
472 #define QUEUE_FLAG_WRITEFULL 4 /* write queue has been filled */ 472 #define QUEUE_FLAG_WRITEFULL 4 /* write queue has been filled */
473 #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ 473 #define QUEUE_FLAG_DEAD 5 /* queue being torn down */
474 #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ 474 #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */
475 #define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ 475 #define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */
476 #define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */ 476 #define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */
477 #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */ 477 #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */
478 478
479 enum { 479 enum {
480 /* 480 /*
481 * Hardbarrier is supported with one of the following methods. 481 * Hardbarrier is supported with one of the following methods.
482 * 482 *
483 * NONE : hardbarrier unsupported 483 * NONE : hardbarrier unsupported
484 * DRAIN : ordering by draining is enough 484 * DRAIN : ordering by draining is enough
485 * DRAIN_FLUSH : ordering by draining w/ pre and post flushes 485 * DRAIN_FLUSH : ordering by draining w/ pre and post flushes
486 * DRAIN_FUA : ordering by draining w/ pre flush and FUA write 486 * DRAIN_FUA : ordering by draining w/ pre flush and FUA write
487 * TAG : ordering by tag is enough 487 * TAG : ordering by tag is enough
488 * TAG_FLUSH : ordering by tag w/ pre and post flushes 488 * TAG_FLUSH : ordering by tag w/ pre and post flushes
489 * TAG_FUA : ordering by tag w/ pre flush and FUA write 489 * TAG_FUA : ordering by tag w/ pre flush and FUA write
490 */ 490 */
491 QUEUE_ORDERED_NONE = 0x00, 491 QUEUE_ORDERED_NONE = 0x00,
492 QUEUE_ORDERED_DRAIN = 0x01, 492 QUEUE_ORDERED_DRAIN = 0x01,
493 QUEUE_ORDERED_TAG = 0x02, 493 QUEUE_ORDERED_TAG = 0x02,
494 494
495 QUEUE_ORDERED_PREFLUSH = 0x10, 495 QUEUE_ORDERED_PREFLUSH = 0x10,
496 QUEUE_ORDERED_POSTFLUSH = 0x20, 496 QUEUE_ORDERED_POSTFLUSH = 0x20,
497 QUEUE_ORDERED_FUA = 0x40, 497 QUEUE_ORDERED_FUA = 0x40,
498 498
499 QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | 499 QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN |
500 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH, 500 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
501 QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN | 501 QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN |
502 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA, 502 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
503 QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | 503 QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG |
504 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH, 504 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
505 QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG | 505 QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG |
506 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA, 506 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
507 507
508 /* 508 /*
509 * Ordered operation sequence 509 * Ordered operation sequence
510 */ 510 */
511 QUEUE_ORDSEQ_STARTED = 0x01, /* flushing in progress */ 511 QUEUE_ORDSEQ_STARTED = 0x01, /* flushing in progress */
512 QUEUE_ORDSEQ_DRAIN = 0x02, /* waiting for the queue to be drained */ 512 QUEUE_ORDSEQ_DRAIN = 0x02, /* waiting for the queue to be drained */
513 QUEUE_ORDSEQ_PREFLUSH = 0x04, /* pre-flushing in progress */ 513 QUEUE_ORDSEQ_PREFLUSH = 0x04, /* pre-flushing in progress */
514 QUEUE_ORDSEQ_BAR = 0x08, /* original barrier req in progress */ 514 QUEUE_ORDSEQ_BAR = 0x08, /* original barrier req in progress */
515 QUEUE_ORDSEQ_POSTFLUSH = 0x10, /* post-flushing in progress */ 515 QUEUE_ORDSEQ_POSTFLUSH = 0x10, /* post-flushing in progress */
516 QUEUE_ORDSEQ_DONE = 0x20, 516 QUEUE_ORDSEQ_DONE = 0x20,
517 }; 517 };
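As a usage sketch (not taken from this patch): a driver for a device with a write-back cache would register one of the modes above at probe time together with a prepare_flush_fn that tags the generated flush requests, for instance with the REQ_LB_OP_FLUSH opcode defined earlier. All my_* names are hypothetical.

/* hypothetical flush callback: mark the flush request so the driver's
 * request_fn can recognize it as a cache-flush command */
static void my_prepare_flush(struct request_queue *q, struct request *rq)
{
	rq->cmd_type = REQ_TYPE_LINUX_BLOCK;
	rq->cmd[0] = REQ_LB_OP_FLUSH;
}

/* hypothetical probe-time setup: order barriers by draining the queue
 * and bracketing the barrier with explicit pre- and post-flushes */
static int my_setup_barriers(struct request_queue *q)
{
	return blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH,
				 my_prepare_flush);
}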
518 518
519 #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) 519 #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
520 #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) 520 #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
521 #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) 521 #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
522 #define blk_queue_flushing(q) ((q)->ordseq) 522 #define blk_queue_flushing(q) ((q)->ordseq)
523 523
524 #define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS) 524 #define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS)
525 #define blk_pc_request(rq) ((rq)->cmd_type == REQ_TYPE_BLOCK_PC) 525 #define blk_pc_request(rq) ((rq)->cmd_type == REQ_TYPE_BLOCK_PC)
526 #define blk_special_request(rq) ((rq)->cmd_type == REQ_TYPE_SPECIAL) 526 #define blk_special_request(rq) ((rq)->cmd_type == REQ_TYPE_SPECIAL)
527 #define blk_sense_request(rq) ((rq)->cmd_type == REQ_TYPE_SENSE) 527 #define blk_sense_request(rq) ((rq)->cmd_type == REQ_TYPE_SENSE)
528 528
529 #define blk_noretry_request(rq) ((rq)->cmd_flags & REQ_FAILFAST) 529 #define blk_noretry_request(rq) ((rq)->cmd_flags & REQ_FAILFAST)
530 #define blk_rq_started(rq) ((rq)->cmd_flags & REQ_STARTED) 530 #define blk_rq_started(rq) ((rq)->cmd_flags & REQ_STARTED)
531 531
532 #define blk_account_rq(rq) (blk_rq_started(rq) && blk_fs_request(rq)) 532 #define blk_account_rq(rq) (blk_rq_started(rq) && blk_fs_request(rq))
533 533
534 #define blk_pm_suspend_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND) 534 #define blk_pm_suspend_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND)
535 #define blk_pm_resume_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_RESUME) 535 #define blk_pm_resume_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_RESUME)
536 #define blk_pm_request(rq) \ 536 #define blk_pm_request(rq) \
537 (blk_pm_suspend_request(rq) || blk_pm_resume_request(rq)) 537 (blk_pm_suspend_request(rq) || blk_pm_resume_request(rq))
538 538
539 #define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED) 539 #define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED)
540 #define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER) 540 #define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER)
541 #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA) 541 #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA)
542 #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) 542 #define blk_bidi_rq(rq) ((rq)->next_rq != NULL)
543 #define blk_empty_barrier(rq) (blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
543 544
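blk_empty_barrier() is the test introduced by this change: true for a filesystem barrier request that carries no sectors. Purely as an illustration, for a driver that handles barriers itself rather than via the generic ordered-sequence code, a request_fn might turn such a request straight into a cache flush; my_flush_device_cache and my_start_data_io are hypothetical helpers.

/* illustrative request_fn fragment, queue lock held by the caller */
static void my_barrier_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		blkdev_dequeue_request(rq);

		if (blk_empty_barrier(rq)) {
			my_flush_device_cache();	/* hypothetical */
			end_dequeued_request(rq, 1);	/* done, no data moved */
			continue;
		}

		my_start_data_io(rq);			/* hypothetical */
	}
}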
544 #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) 545 #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist)
545 546
546 #define rq_data_dir(rq) ((rq)->cmd_flags & 1) 547 #define rq_data_dir(rq) ((rq)->cmd_flags & 1)
547 548
548 /* 549 /*
549 * We regard a request as sync if it's a READ or a SYNC write. 550 * We regard a request as sync if it's a READ or a SYNC write.
550 */ 551 */
551 #define rq_is_sync(rq) (rq_data_dir((rq)) == READ || (rq)->cmd_flags & REQ_RW_SYNC) 552 #define rq_is_sync(rq) (rq_data_dir((rq)) == READ || (rq)->cmd_flags & REQ_RW_SYNC)
552 #define rq_is_meta(rq) ((rq)->cmd_flags & REQ_RW_META) 553 #define rq_is_meta(rq) ((rq)->cmd_flags & REQ_RW_META)
553 554
554 static inline int blk_queue_full(struct request_queue *q, int rw) 555 static inline int blk_queue_full(struct request_queue *q, int rw)
555 { 556 {
556 if (rw == READ) 557 if (rw == READ)
557 return test_bit(QUEUE_FLAG_READFULL, &q->queue_flags); 558 return test_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
558 return test_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); 559 return test_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
559 } 560 }
560 561
561 static inline void blk_set_queue_full(struct request_queue *q, int rw) 562 static inline void blk_set_queue_full(struct request_queue *q, int rw)
562 { 563 {
563 if (rw == READ) 564 if (rw == READ)
564 set_bit(QUEUE_FLAG_READFULL, &q->queue_flags); 565 set_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
565 else 566 else
566 set_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); 567 set_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
567 } 568 }
568 569
569 static inline void blk_clear_queue_full(struct request_queue *q, int rw) 570 static inline void blk_clear_queue_full(struct request_queue *q, int rw)
570 { 571 {
571 if (rw == READ) 572 if (rw == READ)
572 clear_bit(QUEUE_FLAG_READFULL, &q->queue_flags); 573 clear_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
573 else 574 else
574 clear_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); 575 clear_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
575 } 576 }
576 577
577 578
578 /* 579 /*
579 * mergeable request must not have _NOMERGE or _BARRIER bit set, nor may 580 * mergeable request must not have _NOMERGE or _BARRIER bit set, nor may
580 * it already be started by driver. 581 * it already be started by driver.
581 */ 582 */
582 #define RQ_NOMERGE_FLAGS \ 583 #define RQ_NOMERGE_FLAGS \
583 (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER) 584 (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER)
584 #define rq_mergeable(rq) \ 585 #define rq_mergeable(rq) \
585 (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && blk_fs_request((rq))) 586 (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && blk_fs_request((rq)))
586 587
587 /* 588 /*
588 * q->prep_rq_fn return values 589 * q->prep_rq_fn return values
589 */ 590 */
590 #define BLKPREP_OK 0 /* serve it */ 591 #define BLKPREP_OK 0 /* serve it */
591 #define BLKPREP_KILL 1 /* fatal error, kill */ 592 #define BLKPREP_KILL 1 /* fatal error, kill */
592 #define BLKPREP_DEFER 2 /* leave on queue */ 593 #define BLKPREP_DEFER 2 /* leave on queue */
593 594
594 extern unsigned long blk_max_low_pfn, blk_max_pfn; 595 extern unsigned long blk_max_low_pfn, blk_max_pfn;
595 596
596 /* 597 /*
597 * standard bounce addresses: 598 * standard bounce addresses:
598 * 599 *
599 * BLK_BOUNCE_HIGH : bounce all highmem pages 600 * BLK_BOUNCE_HIGH : bounce all highmem pages
600 * BLK_BOUNCE_ANY : don't bounce anything 601 * BLK_BOUNCE_ANY : don't bounce anything
601 * BLK_BOUNCE_ISA : bounce pages above ISA DMA boundary 602 * BLK_BOUNCE_ISA : bounce pages above ISA DMA boundary
602 */ 603 */
603 #define BLK_BOUNCE_HIGH ((u64)blk_max_low_pfn << PAGE_SHIFT) 604 #define BLK_BOUNCE_HIGH ((u64)blk_max_low_pfn << PAGE_SHIFT)
604 #define BLK_BOUNCE_ANY ((u64)blk_max_pfn << PAGE_SHIFT) 605 #define BLK_BOUNCE_ANY ((u64)blk_max_pfn << PAGE_SHIFT)
605 #define BLK_BOUNCE_ISA (ISA_DMA_THRESHOLD) 606 #define BLK_BOUNCE_ISA (ISA_DMA_THRESHOLD)
606 607
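For context, a driver whose hardware cannot reach highmem picks one of these limits when it sets up its queue; the block layer then bounces any offending pages through blk_queue_bounce() on the submission path. A minimal sketch with a hypothetical function name:

/* hypothetical: restrict DMA to lowmem, bouncing anything above it */
static void my_set_dma_limits(struct request_queue *q)
{
	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}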
607 /* 608 /*
608 * default timeout for SG_IO if none specified 609 * default timeout for SG_IO if none specified
609 */ 610 */
610 #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) 611 #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ)
611 612
612 #ifdef CONFIG_BOUNCE 613 #ifdef CONFIG_BOUNCE
613 extern int init_emergency_isa_pool(void); 614 extern int init_emergency_isa_pool(void);
614 extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); 615 extern void blk_queue_bounce(struct request_queue *q, struct bio **bio);
615 #else 616 #else
616 static inline int init_emergency_isa_pool(void) 617 static inline int init_emergency_isa_pool(void)
617 { 618 {
618 return 0; 619 return 0;
619 } 620 }
620 static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) 621 static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
621 { 622 {
622 } 623 }
623 #endif /* CONFIG_BOUNCE */ 624 #endif /* CONFIG_BOUNCE */
624 625
625 struct req_iterator { 626 struct req_iterator {
626 int i; 627 int i;
627 struct bio *bio; 628 struct bio *bio;
628 }; 629 };
629 630
630 /* This should not be used directly - use rq_for_each_segment */ 631 /* This should not be used directly - use rq_for_each_segment */
631 #define __rq_for_each_bio(_bio, rq) \ 632 #define __rq_for_each_bio(_bio, rq) \
632 if ((rq->bio)) \ 633 if ((rq->bio)) \
633 for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) 634 for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
634 635
635 #define rq_for_each_segment(bvl, _rq, _iter) \ 636 #define rq_for_each_segment(bvl, _rq, _iter) \
636 __rq_for_each_bio(_iter.bio, _rq) \ 637 __rq_for_each_bio(_iter.bio, _rq) \
637 bio_for_each_segment(bvl, _iter.bio, _iter.i) 638 bio_for_each_segment(bvl, _iter.bio, _iter.i)
638 639
639 #define rq_iter_last(rq, _iter) \ 640 #define rq_iter_last(rq, _iter) \
640 (_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1) 641 (_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1)
641 642
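The iterator above is used roughly as follows. This is a sketch that assumes the request's pages are lowmem-mappable (for instance after a bounce limit like the one set earlier); my_copy_segment is a hypothetical per-segment worker.

/* walk every bio_vec of a request, segment by segment */
static void my_transfer_request(struct request *rq)
{
	struct req_iterator iter;
	struct bio_vec *bvec;

	rq_for_each_segment(bvec, rq, iter) {
		void *addr = page_address(bvec->bv_page) + bvec->bv_offset;

		my_copy_segment(addr, bvec->bv_len);	/* hypothetical */
	}
}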
642 extern int blk_register_queue(struct gendisk *disk); 643 extern int blk_register_queue(struct gendisk *disk);
643 extern void blk_unregister_queue(struct gendisk *disk); 644 extern void blk_unregister_queue(struct gendisk *disk);
644 extern void register_disk(struct gendisk *dev); 645 extern void register_disk(struct gendisk *dev);
645 extern void generic_make_request(struct bio *bio); 646 extern void generic_make_request(struct bio *bio);
646 extern void blk_put_request(struct request *); 647 extern void blk_put_request(struct request *);
647 extern void __blk_put_request(struct request_queue *, struct request *); 648 extern void __blk_put_request(struct request_queue *, struct request *);
648 extern void blk_end_sync_rq(struct request *rq, int error); 649 extern void blk_end_sync_rq(struct request *rq, int error);
649 extern struct request *blk_get_request(struct request_queue *, int, gfp_t); 650 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
650 extern void blk_insert_request(struct request_queue *, struct request *, int, void *); 651 extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
651 extern void blk_requeue_request(struct request_queue *, struct request *); 652 extern void blk_requeue_request(struct request_queue *, struct request *);
652 extern void blk_plug_device(struct request_queue *); 653 extern void blk_plug_device(struct request_queue *);
653 extern int blk_remove_plug(struct request_queue *); 654 extern int blk_remove_plug(struct request_queue *);
654 extern void blk_recount_segments(struct request_queue *, struct bio *); 655 extern void blk_recount_segments(struct request_queue *, struct bio *);
655 extern int scsi_cmd_ioctl(struct file *, struct request_queue *, 656 extern int scsi_cmd_ioctl(struct file *, struct request_queue *,
656 struct gendisk *, unsigned int, void __user *); 657 struct gendisk *, unsigned int, void __user *);
657 extern int sg_scsi_ioctl(struct file *, struct request_queue *, 658 extern int sg_scsi_ioctl(struct file *, struct request_queue *,
658 struct gendisk *, struct scsi_ioctl_command __user *); 659 struct gendisk *, struct scsi_ioctl_command __user *);
659 660
660 /* 661 /*
661 * Temporary export, until SCSI gets fixed up. 662 * Temporary export, until SCSI gets fixed up.
662 */ 663 */
663 extern int blk_rq_append_bio(struct request_queue *q, struct request *rq, 664 extern int blk_rq_append_bio(struct request_queue *q, struct request *rq,
664 struct bio *bio); 665 struct bio *bio);
665 666
666 /* 667 /*
667 * A queue has just exited congestion. Note this in the global counter of 668 * A queue has just exited congestion. Note this in the global counter of
668 * congested queues, and wake up anyone who was waiting for requests to be 669 * congested queues, and wake up anyone who was waiting for requests to be
669 * put back. 670 * put back.
670 */ 671 */
671 static inline void blk_clear_queue_congested(struct request_queue *q, int rw) 672 static inline void blk_clear_queue_congested(struct request_queue *q, int rw)
672 { 673 {
673 clear_bdi_congested(&q->backing_dev_info, rw); 674 clear_bdi_congested(&q->backing_dev_info, rw);
674 } 675 }
675 676
676 /* 677 /*
677 * A queue has just entered congestion. Flag that in the queue's VM-visible 678 * A queue has just entered congestion. Flag that in the queue's VM-visible
678 * state flags and increment the global counter of congested queues. 679 * state flags and increment the global counter of congested queues.
679 */ 680 */
680 static inline void blk_set_queue_congested(struct request_queue *q, int rw) 681 static inline void blk_set_queue_congested(struct request_queue *q, int rw)
681 { 682 {
682 set_bdi_congested(&q->backing_dev_info, rw); 683 set_bdi_congested(&q->backing_dev_info, rw);
683 } 684 }
684 685
685 extern void blk_start_queue(struct request_queue *q); 686 extern void blk_start_queue(struct request_queue *q);
686 extern void blk_stop_queue(struct request_queue *q); 687 extern void blk_stop_queue(struct request_queue *q);
687 extern void blk_sync_queue(struct request_queue *q); 688 extern void blk_sync_queue(struct request_queue *q);
688 extern void __blk_stop_queue(struct request_queue *q); 689 extern void __blk_stop_queue(struct request_queue *q);
689 extern void blk_run_queue(struct request_queue *); 690 extern void blk_run_queue(struct request_queue *);
690 extern void blk_start_queueing(struct request_queue *); 691 extern void blk_start_queueing(struct request_queue *);
691 extern int blk_rq_map_user(struct request_queue *, struct request *, void __user *, unsigned long); 692 extern int blk_rq_map_user(struct request_queue *, struct request *, void __user *, unsigned long);
692 extern int blk_rq_unmap_user(struct bio *); 693 extern int blk_rq_unmap_user(struct bio *);
693 extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); 694 extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t);
694 extern int blk_rq_map_user_iov(struct request_queue *, struct request *, 695 extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
695 struct sg_iovec *, int, unsigned int); 696 struct sg_iovec *, int, unsigned int);
696 extern int blk_execute_rq(struct request_queue *, struct gendisk *, 697 extern int blk_execute_rq(struct request_queue *, struct gendisk *,
697 struct request *, int); 698 struct request *, int);
698 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, 699 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
699 struct request *, int, rq_end_io_fn *); 700 struct request *, int, rq_end_io_fn *);
700 extern int blk_verify_command(unsigned char *, int); 701 extern int blk_verify_command(unsigned char *, int);
701 702
702 static inline struct request_queue *bdev_get_queue(struct block_device *bdev) 703 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
703 { 704 {
704 return bdev->bd_disk->queue; 705 return bdev->bd_disk->queue;
705 } 706 }
706 707
707 static inline void blk_run_backing_dev(struct backing_dev_info *bdi, 708 static inline void blk_run_backing_dev(struct backing_dev_info *bdi,
708 struct page *page) 709 struct page *page)
709 { 710 {
710 if (bdi && bdi->unplug_io_fn) 711 if (bdi && bdi->unplug_io_fn)
711 bdi->unplug_io_fn(bdi, page); 712 bdi->unplug_io_fn(bdi, page);
712 } 713 }
713 714
714 static inline void blk_run_address_space(struct address_space *mapping) 715 static inline void blk_run_address_space(struct address_space *mapping)
715 { 716 {
716 if (mapping) 717 if (mapping)
717 blk_run_backing_dev(mapping->backing_dev_info, NULL); 718 blk_run_backing_dev(mapping->backing_dev_info, NULL);
718 } 719 }
719 720
720 /* 721 /*
721 * end_request() and friends. Must be called with the request queue spinlock 722 * end_request() and friends. Must be called with the request queue spinlock
722 * acquired. All functions called within end_request() _must_be_ atomic. 723 * acquired. All functions called within end_request() _must_be_ atomic.
723 * 724 *
724 * Several drivers define their own end_request and call 725 * Several drivers define their own end_request and call
725 * end_that_request_first() and end_that_request_last() 726 * end_that_request_first() and end_that_request_last()
726 * for parts of the original function. This prevents 727 * for parts of the original function. This prevents
727 * code duplication in drivers. 728 * code duplication in drivers.
728 */ 729 */
729 extern int end_that_request_first(struct request *, int, int); 730 extern int end_that_request_first(struct request *, int, int);
730 extern int end_that_request_chunk(struct request *, int, int); 731 extern int end_that_request_chunk(struct request *, int, int);
731 extern void end_that_request_last(struct request *, int); 732 extern void end_that_request_last(struct request *, int);
732 extern void end_request(struct request *, int); 733 extern void end_request(struct request *, int);
733 extern void end_queued_request(struct request *, int); 734 extern void end_queued_request(struct request *, int);
734 extern void end_dequeued_request(struct request *, int); 735 extern void end_dequeued_request(struct request *, int);
735 extern void blk_complete_request(struct request *); 736 extern void blk_complete_request(struct request *);
736 737
737 /* 738 /*
738 * end_that_request_first/chunk() takes an uptodate argument. we account 739 * end_that_request_first/chunk() takes an uptodate argument. we account
739 * any value <= 0 as an io error. 0 means -EIO for compatibility reasons, 740 * any value <= 0 as an io error. 0 means -EIO for compatibility reasons,
740 * any other < 0 value is the direct error type. An uptodate value of 741 * any other < 0 value is the direct error type. An uptodate value of
741 * 1 indicates successful io completion 742 * 1 indicates successful io completion
742 */ 743 */
743 #define end_io_error(uptodate) (unlikely((uptodate) <= 0)) 744 #define end_io_error(uptodate) (unlikely((uptodate) <= 0))
744 745
745 static inline void blkdev_dequeue_request(struct request *req) 746 static inline void blkdev_dequeue_request(struct request *req)
746 { 747 {
747 elv_dequeue_request(req->q, req); 748 elv_dequeue_request(req->q, req);
748 } 749 }
749 750
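Tying the completion helpers above together: for a request that is still on the queue, a driver-private routine typically finishes all remaining sectors and then retires the request, with the queue lock held as required. my_complete_request is a hypothetical name; end_request() itself follows the same pattern, one hard_cur_sectors chunk at a time.

/* complete the whole request; uptodate follows the convention above
 * (1 = success, <= 0 = error), caller holds the queue lock */
static void my_complete_request(struct request *rq, int uptodate)
{
	if (!end_that_request_first(rq, uptodate, rq->hard_nr_sectors)) {
		blkdev_dequeue_request(rq);
		end_that_request_last(rq, uptodate);
	}
}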
750 /* 751 /*
751 * Access functions for manipulating queue properties 752 * Access functions for manipulating queue properties
752 */ 753 */
753 extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn, 754 extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
754 spinlock_t *lock, int node_id); 755 spinlock_t *lock, int node_id);
755 extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *); 756 extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
756 extern void blk_cleanup_queue(struct request_queue *); 757 extern void blk_cleanup_queue(struct request_queue *);
757 extern void blk_queue_make_request(struct request_queue *, make_request_fn *); 758 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
758 extern void blk_queue_bounce_limit(struct request_queue *, u64); 759 extern void blk_queue_bounce_limit(struct request_queue *, u64);
759 extern void blk_queue_max_sectors(struct request_queue *, unsigned int); 760 extern void blk_queue_max_sectors(struct request_queue *, unsigned int);
760 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short); 761 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
761 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); 762 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
762 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); 763 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
763 extern void blk_queue_hardsect_size(struct request_queue *, unsigned short); 764 extern void blk_queue_hardsect_size(struct request_queue *, unsigned short);
764 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); 765 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
765 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); 766 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
766 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn); 767 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
767 extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *); 768 extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
768 extern void blk_queue_dma_alignment(struct request_queue *, int); 769 extern void blk_queue_dma_alignment(struct request_queue *, int);
769 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); 770 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
770 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); 771 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
771 extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *); 772 extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
772 extern void blk_queue_issue_flush_fn(struct request_queue *, issue_flush_fn *); 773 extern void blk_queue_issue_flush_fn(struct request_queue *, issue_flush_fn *);
773 extern int blk_do_ordered(struct request_queue *, struct request **); 774 extern int blk_do_ordered(struct request_queue *, struct request **);
774 extern unsigned blk_ordered_cur_seq(struct request_queue *); 775 extern unsigned blk_ordered_cur_seq(struct request_queue *);
775 extern unsigned blk_ordered_req_seq(struct request *); 776 extern unsigned blk_ordered_req_seq(struct request *);
776 extern void blk_ordered_complete_seq(struct request_queue *, unsigned, int); 777 extern void blk_ordered_complete_seq(struct request_queue *, unsigned, int);
777 778
778 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); 779 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
779 extern void blk_dump_rq_flags(struct request *, char *); 780 extern void blk_dump_rq_flags(struct request *, char *);
780 extern void generic_unplug_device(struct request_queue *); 781 extern void generic_unplug_device(struct request_queue *);
781 extern void __generic_unplug_device(struct request_queue *); 782 extern void __generic_unplug_device(struct request_queue *);
782 extern long nr_blockdev_pages(void); 783 extern long nr_blockdev_pages(void);
783 784
784 int blk_get_queue(struct request_queue *); 785 int blk_get_queue(struct request_queue *);
785 struct request_queue *blk_alloc_queue(gfp_t); 786 struct request_queue *blk_alloc_queue(gfp_t);
786 struct request_queue *blk_alloc_queue_node(gfp_t, int); 787 struct request_queue *blk_alloc_queue_node(gfp_t, int);
787 extern void blk_put_queue(struct request_queue *); 788 extern void blk_put_queue(struct request_queue *);
788 789
789 /* 790 /*
790 * tag stuff 791 * tag stuff
791 */ 792 */
792 #define blk_queue_tag_depth(q) ((q)->queue_tags->busy) 793 #define blk_queue_tag_depth(q) ((q)->queue_tags->busy)
793 #define blk_queue_tag_queue(q) ((q)->queue_tags->busy < (q)->queue_tags->max_depth) 794 #define blk_queue_tag_queue(q) ((q)->queue_tags->busy < (q)->queue_tags->max_depth)
794 #define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED) 795 #define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED)
795 extern int blk_queue_start_tag(struct request_queue *, struct request *); 796 extern int blk_queue_start_tag(struct request_queue *, struct request *);
796 extern struct request *blk_queue_find_tag(struct request_queue *, int); 797 extern struct request *blk_queue_find_tag(struct request_queue *, int);
797 extern void blk_queue_end_tag(struct request_queue *, struct request *); 798 extern void blk_queue_end_tag(struct request_queue *, struct request *);
798 extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *); 799 extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *);
799 extern void blk_queue_free_tags(struct request_queue *); 800 extern void blk_queue_free_tags(struct request_queue *);
800 extern int blk_queue_resize_tags(struct request_queue *, int); 801 extern int blk_queue_resize_tags(struct request_queue *, int);
801 extern void blk_queue_invalidate_tags(struct request_queue *); 802 extern void blk_queue_invalidate_tags(struct request_queue *);
802 extern struct blk_queue_tag *blk_init_tags(int); 803 extern struct blk_queue_tag *blk_init_tags(int);
803 extern void blk_free_tags(struct blk_queue_tag *); 804 extern void blk_free_tags(struct blk_queue_tag *);
804 805
805 static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, 806 static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
806 int tag) 807 int tag)
807 { 808 {
808 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth)) 809 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth))
809 return NULL; 810 return NULL;
810 return bqt->tag_index[tag]; 811 return bqt->tag_index[tag];
811 } 812 }
812 813
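As a usage sketch of the tagging interface (hypothetical names, not from this patch): after a one-time blk_queue_init_tags(q, depth, NULL) at setup, the request_fn asks for a tag per request before handing it to the hardware; blk_queue_start_tag() assigns rq->tag and dequeues the request on success.

/* hypothetical request_fn for hardware that completes commands by tag */
static void my_tag_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		/* non-zero return: no free tag right now, retry later */
		if (blk_queue_start_tag(q, rq))
			break;

		my_hw_submit(rq);	/* hypothetical; rq->tag is valid here */
	}
}

On completion the driver calls blk_queue_end_tag() with the queue lock held, then finishes the request as sketched earlier.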
813 extern int blkdev_issue_flush(struct block_device *, sector_t *); 814 extern int blkdev_issue_flush(struct block_device *, sector_t *);
814 815
815 #define MAX_PHYS_SEGMENTS 128 816 #define MAX_PHYS_SEGMENTS 128
816 #define MAX_HW_SEGMENTS 128 817 #define MAX_HW_SEGMENTS 128
817 #define SAFE_MAX_SECTORS 255 818 #define SAFE_MAX_SECTORS 255
818 #define BLK_DEF_MAX_SECTORS 1024 819 #define BLK_DEF_MAX_SECTORS 1024
819 820
820 #define MAX_SEGMENT_SIZE 65536 821 #define MAX_SEGMENT_SIZE 65536
821 822
822 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) 823 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
823 824
824 static inline int queue_hardsect_size(struct request_queue *q) 825 static inline int queue_hardsect_size(struct request_queue *q)
825 { 826 {
826 int retval = 512; 827 int retval = 512;
827 828
828 if (q && q->hardsect_size) 829 if (q && q->hardsect_size)
829 retval = q->hardsect_size; 830 retval = q->hardsect_size;
830 831
831 return retval; 832 return retval;
832 } 833 }
833 834
834 static inline int bdev_hardsect_size(struct block_device *bdev) 835 static inline int bdev_hardsect_size(struct block_device *bdev)
835 { 836 {
836 return queue_hardsect_size(bdev_get_queue(bdev)); 837 return queue_hardsect_size(bdev_get_queue(bdev));
837 } 838 }
838 839
839 static inline int queue_dma_alignment(struct request_queue *q) 840 static inline int queue_dma_alignment(struct request_queue *q)
840 { 841 {
841 int retval = 511; 842 int retval = 511;
842 843
843 if (q && q->dma_alignment) 844 if (q && q->dma_alignment)
844 retval = q->dma_alignment; 845 retval = q->dma_alignment;
845 846
846 return retval; 847 return retval;
847 } 848 }
848 849
849 /* assumes size > 256 */ 850 /* assumes size > 256 */
850 static inline unsigned int blksize_bits(unsigned int size) 851 static inline unsigned int blksize_bits(unsigned int size)
851 { 852 {
852 unsigned int bits = 8; 853 unsigned int bits = 8;
853 do { 854 do {
854 bits++; 855 bits++;
855 size >>= 1; 856 size >>= 1;
856 } while (size > 256); 857 } while (size > 256);
857 return bits; 858 return bits;
858 } 859 }
859 860
860 static inline unsigned int block_size(struct block_device *bdev) 861 static inline unsigned int block_size(struct block_device *bdev)
861 { 862 {
862 return bdev->bd_block_size; 863 return bdev->bd_block_size;
863 } 864 }
864 865
865 typedef struct {struct page *v;} Sector; 866 typedef struct {struct page *v;} Sector;
866 867
867 unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *); 868 unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *);
868 869
869 static inline void put_dev_sector(Sector p) 870 static inline void put_dev_sector(Sector p)
870 { 871 {
871 page_cache_release(p.v); 872 page_cache_release(p.v);
872 } 873 }
873 874
874 struct work_struct; 875 struct work_struct;
875 int kblockd_schedule_work(struct work_struct *work); 876 int kblockd_schedule_work(struct work_struct *work);
876 void kblockd_flush_work(struct work_struct *work); 877 void kblockd_flush_work(struct work_struct *work);
877 878
878 #define MODULE_ALIAS_BLOCKDEV(major,minor) \ 879 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
879 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) 880 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
880 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ 881 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
881 MODULE_ALIAS("block-major-" __stringify(major) "-*") 882 MODULE_ALIAS("block-major-" __stringify(major) "-*")
882 883
883 884
884 #else /* CONFIG_BLOCK */ 885 #else /* CONFIG_BLOCK */
885 /* 886 /*
886 * stubs for when the block layer is configured out 887 * stubs for when the block layer is configured out
887 */ 888 */
888 #define buffer_heads_over_limit 0 889 #define buffer_heads_over_limit 0
889 890
890 static inline long nr_blockdev_pages(void) 891 static inline long nr_blockdev_pages(void)
891 { 892 {
892 return 0; 893 return 0;
893 } 894 }
894 895
895 static inline void exit_io_context(void) 896 static inline void exit_io_context(void)
896 { 897 {
897 } 898 }
898 899
899 #endif /* CONFIG_BLOCK */ 900 #endif /* CONFIG_BLOCK */
900 901
901 #endif 902 #endif
902 903
1 /* bounce buffer handling for block devices 1 /* bounce buffer handling for block devices
2 * 2 *
3 * - Split from highmem.c 3 * - Split from highmem.c
4 */ 4 */
5 5
6 #include <linux/mm.h> 6 #include <linux/mm.h>
7 #include <linux/module.h> 7 #include <linux/module.h>
8 #include <linux/swap.h> 8 #include <linux/swap.h>
9 #include <linux/bio.h> 9 #include <linux/bio.h>
10 #include <linux/pagemap.h> 10 #include <linux/pagemap.h>
11 #include <linux/mempool.h> 11 #include <linux/mempool.h>
12 #include <linux/blkdev.h> 12 #include <linux/blkdev.h>
13 #include <linux/init.h> 13 #include <linux/init.h>
14 #include <linux/hash.h> 14 #include <linux/hash.h>
15 #include <linux/highmem.h> 15 #include <linux/highmem.h>
16 #include <linux/blktrace_api.h> 16 #include <linux/blktrace_api.h>
17 #include <asm/tlbflush.h> 17 #include <asm/tlbflush.h>
18 18
19 #define POOL_SIZE 64 19 #define POOL_SIZE 64
20 #define ISA_POOL_SIZE 16 20 #define ISA_POOL_SIZE 16
21 21
22 static mempool_t *page_pool, *isa_page_pool; 22 static mempool_t *page_pool, *isa_page_pool;
23 23
24 #ifdef CONFIG_HIGHMEM 24 #ifdef CONFIG_HIGHMEM
25 static __init int init_emergency_pool(void) 25 static __init int init_emergency_pool(void)
26 { 26 {
27 struct sysinfo i; 27 struct sysinfo i;
28 si_meminfo(&i); 28 si_meminfo(&i);
29 si_swapinfo(&i); 29 si_swapinfo(&i);
30 30
31 if (!i.totalhigh) 31 if (!i.totalhigh)
32 return 0; 32 return 0;
33 33
34 page_pool = mempool_create_page_pool(POOL_SIZE, 0); 34 page_pool = mempool_create_page_pool(POOL_SIZE, 0);
35 BUG_ON(!page_pool); 35 BUG_ON(!page_pool);
36 printk("highmem bounce pool size: %d pages\n", POOL_SIZE); 36 printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
37 37
38 return 0; 38 return 0;
39 } 39 }
40 40
41 __initcall(init_emergency_pool); 41 __initcall(init_emergency_pool);
42 42
43 /* 43 /*
44 * highmem version, map in to vec 44 * highmem version, map in to vec
45 */ 45 */
46 static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) 46 static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
47 { 47 {
48 unsigned long flags; 48 unsigned long flags;
49 unsigned char *vto; 49 unsigned char *vto;
50 50
51 local_irq_save(flags); 51 local_irq_save(flags);
52 vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); 52 vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
53 memcpy(vto + to->bv_offset, vfrom, to->bv_len); 53 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
54 kunmap_atomic(vto, KM_BOUNCE_READ); 54 kunmap_atomic(vto, KM_BOUNCE_READ);
55 local_irq_restore(flags); 55 local_irq_restore(flags);
56 } 56 }
57 57
58 #else /* CONFIG_HIGHMEM */ 58 #else /* CONFIG_HIGHMEM */
59 59
60 #define bounce_copy_vec(to, vfrom) \ 60 #define bounce_copy_vec(to, vfrom) \
61 memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) 61 memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
62 62
63 #endif /* CONFIG_HIGHMEM */ 63 #endif /* CONFIG_HIGHMEM */
64 64
65 /* 65 /*
66 * allocate pages in the DMA region for the ISA pool 66 * allocate pages in the DMA region for the ISA pool
67 */ 67 */
68 static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) 68 static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
69 { 69 {
70 return mempool_alloc_pages(gfp_mask | GFP_DMA, data); 70 return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
71 } 71 }
72 72
73 /* 73 /*
74 * gets called "every" time someone initializes a queue with BLK_BOUNCE_ISA 74 * gets called "every" time someone initializes a queue with BLK_BOUNCE_ISA
75 * as the max address, so check if the pool has already been created. 75 * as the max address, so check if the pool has already been created.
76 */ 76 */
77 int init_emergency_isa_pool(void) 77 int init_emergency_isa_pool(void)
78 { 78 {
79 if (isa_page_pool) 79 if (isa_page_pool)
80 return 0; 80 return 0;
81 81
82 isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, 82 isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
83 mempool_free_pages, (void *) 0); 83 mempool_free_pages, (void *) 0);
84 BUG_ON(!isa_page_pool); 84 BUG_ON(!isa_page_pool);
85 85
86 printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); 86 printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
87 return 0; 87 return 0;
88 } 88 }
89 89
90 /* 90 /*
91 * Simple bounce buffer support for highmem pages. Depending on the 91 * Simple bounce buffer support for highmem pages. Depending on the
92 * queue gfp mask set, *to may or may not be a highmem page. Always 92 * queue gfp mask set, *to may or may not be a highmem page. Always
93 * kmap it; it will do the Right Thing. 93 * kmap it; it will do the Right Thing.
94 */ 94 */
95 static void copy_to_high_bio_irq(struct bio *to, struct bio *from) 95 static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
96 { 96 {
97 unsigned char *vfrom; 97 unsigned char *vfrom;
98 struct bio_vec *tovec, *fromvec; 98 struct bio_vec *tovec, *fromvec;
99 int i; 99 int i;
100 100
101 __bio_for_each_segment(tovec, to, i, 0) { 101 __bio_for_each_segment(tovec, to, i, 0) {
102 fromvec = from->bi_io_vec + i; 102 fromvec = from->bi_io_vec + i;
103 103
104 /* 104 /*
105 * not bounced 105 * not bounced
106 */ 106 */
107 if (tovec->bv_page == fromvec->bv_page) 107 if (tovec->bv_page == fromvec->bv_page)
108 continue; 108 continue;
109 109
110 /* 110 /*
111 * fromvec->bv_offset and fromvec->bv_len might have been 111 * fromvec->bv_offset and fromvec->bv_len might have been
112 * modified by the block layer, so use the original copy, 112 * modified by the block layer, so use the original copy,
113 * bounce_copy_vec already uses tovec->bv_len 113 * bounce_copy_vec already uses tovec->bv_len
114 */ 114 */
115 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; 115 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
116 116
117 flush_dcache_page(tovec->bv_page); 117 flush_dcache_page(tovec->bv_page);
118 bounce_copy_vec(tovec, vfrom); 118 bounce_copy_vec(tovec, vfrom);
119 } 119 }
120 } 120 }
121 121
122 static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) 122 static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
123 { 123 {
124 struct bio *bio_orig = bio->bi_private; 124 struct bio *bio_orig = bio->bi_private;
125 struct bio_vec *bvec, *org_vec; 125 struct bio_vec *bvec, *org_vec;
126 int i; 126 int i;
127 127
128 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) 128 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
129 set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); 129 set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
130 130
131 /* 131 /*
132 * free up bounce indirect pages used 132 * free up bounce indirect pages used
133 */ 133 */
134 __bio_for_each_segment(bvec, bio, i, 0) { 134 __bio_for_each_segment(bvec, bio, i, 0) {
135 org_vec = bio_orig->bi_io_vec + i; 135 org_vec = bio_orig->bi_io_vec + i;
136 if (bvec->bv_page == org_vec->bv_page) 136 if (bvec->bv_page == org_vec->bv_page)
137 continue; 137 continue;
138 138
139 dec_zone_page_state(bvec->bv_page, NR_BOUNCE); 139 dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
140 mempool_free(bvec->bv_page, pool); 140 mempool_free(bvec->bv_page, pool);
141 } 141 }
142 142
143 bio_endio(bio_orig, err); 143 bio_endio(bio_orig, err);
144 bio_put(bio); 144 bio_put(bio);
145 } 145 }
146 146
147 static void bounce_end_io_write(struct bio *bio, int err) 147 static void bounce_end_io_write(struct bio *bio, int err)
148 { 148 {
149 bounce_end_io(bio, page_pool, err); 149 bounce_end_io(bio, page_pool, err);
150 } 150 }
151 151
152 static void bounce_end_io_write_isa(struct bio *bio, int err) 152 static void bounce_end_io_write_isa(struct bio *bio, int err)
153 { 153 {
154 154
155 bounce_end_io(bio, isa_page_pool, err); 155 bounce_end_io(bio, isa_page_pool, err);
156 } 156 }
157 157
158 static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) 158 static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
159 { 159 {
160 struct bio *bio_orig = bio->bi_private; 160 struct bio *bio_orig = bio->bi_private;
161 161
162 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 162 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
163 copy_to_high_bio_irq(bio_orig, bio); 163 copy_to_high_bio_irq(bio_orig, bio);
164 164
165 bounce_end_io(bio, pool, err); 165 bounce_end_io(bio, pool, err);
166 } 166 }
167 167
168 static void bounce_end_io_read(struct bio *bio, int err) 168 static void bounce_end_io_read(struct bio *bio, int err)
169 { 169 {
170 __bounce_end_io_read(bio, page_pool, err); 170 __bounce_end_io_read(bio, page_pool, err);
171 } 171 }
172 172
173 static void bounce_end_io_read_isa(struct bio *bio, int err) 173 static void bounce_end_io_read_isa(struct bio *bio, int err)
174 { 174 {
175 __bounce_end_io_read(bio, isa_page_pool, err); 175 __bounce_end_io_read(bio, isa_page_pool, err);
176 } 176 }
177 177
178 static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, 178 static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
179 mempool_t *pool) 179 mempool_t *pool)
180 { 180 {
181 struct page *page; 181 struct page *page;
182 struct bio *bio = NULL; 182 struct bio *bio = NULL;
183 int i, rw = bio_data_dir(*bio_orig); 183 int i, rw = bio_data_dir(*bio_orig);
184 struct bio_vec *to, *from; 184 struct bio_vec *to, *from;
185 185
186 bio_for_each_segment(from, *bio_orig, i) { 186 bio_for_each_segment(from, *bio_orig, i) {
187 page = from->bv_page; 187 page = from->bv_page;
188 188
189 /* 189 /*
190 * is destination page below bounce pfn? 190 * is destination page below bounce pfn?
191 */ 191 */
192 if (page_to_pfn(page) <= q->bounce_pfn) 192 if (page_to_pfn(page) <= q->bounce_pfn)
193 continue; 193 continue;
194 194
195 /* 195 /*
196 * irk, bounce it 196 * irk, bounce it
197 */ 197 */
198 if (!bio) 198 if (!bio)
199 bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt); 199 bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
200 200
201 to = bio->bi_io_vec + i; 201 to = bio->bi_io_vec + i;
202 202
203 to->bv_page = mempool_alloc(pool, q->bounce_gfp); 203 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
204 to->bv_len = from->bv_len; 204 to->bv_len = from->bv_len;
205 to->bv_offset = from->bv_offset; 205 to->bv_offset = from->bv_offset;
206 inc_zone_page_state(to->bv_page, NR_BOUNCE); 206 inc_zone_page_state(to->bv_page, NR_BOUNCE);
207 207
208 if (rw == WRITE) { 208 if (rw == WRITE) {
209 char *vto, *vfrom; 209 char *vto, *vfrom;
210 210
211 flush_dcache_page(from->bv_page); 211 flush_dcache_page(from->bv_page);
212 vto = page_address(to->bv_page) + to->bv_offset; 212 vto = page_address(to->bv_page) + to->bv_offset;
213 vfrom = kmap(from->bv_page) + from->bv_offset; 213 vfrom = kmap(from->bv_page) + from->bv_offset;
214 memcpy(vto, vfrom, to->bv_len); 214 memcpy(vto, vfrom, to->bv_len);
215 kunmap(from->bv_page); 215 kunmap(from->bv_page);
216 } 216 }
217 } 217 }
218 218
219 /* 219 /*
220 * no pages bounced 220 * no pages bounced
221 */ 221 */
222 if (!bio) 222 if (!bio)
223 return; 223 return;
224 224
225 blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); 225 blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
226 226
227 /* 227 /*
228 * at least one page was bounced, fill in possible non-highmem 228 * at least one page was bounced, fill in possible non-highmem
229 * pages 229 * pages
230 */ 230 */
231 __bio_for_each_segment(from, *bio_orig, i, 0) { 231 __bio_for_each_segment(from, *bio_orig, i, 0) {
232 to = bio_iovec_idx(bio, i); 232 to = bio_iovec_idx(bio, i);
233 if (!to->bv_page) { 233 if (!to->bv_page) {
234 to->bv_page = from->bv_page; 234 to->bv_page = from->bv_page;
235 to->bv_len = from->bv_len; 235 to->bv_len = from->bv_len;
236 to->bv_offset = from->bv_offset; 236 to->bv_offset = from->bv_offset;
237 } 237 }
238 } 238 }
239 239
240 bio->bi_bdev = (*bio_orig)->bi_bdev; 240 bio->bi_bdev = (*bio_orig)->bi_bdev;
241 bio->bi_flags |= (1 << BIO_BOUNCED); 241 bio->bi_flags |= (1 << BIO_BOUNCED);
242 bio->bi_sector = (*bio_orig)->bi_sector; 242 bio->bi_sector = (*bio_orig)->bi_sector;
243 bio->bi_rw = (*bio_orig)->bi_rw; 243 bio->bi_rw = (*bio_orig)->bi_rw;
244 244
245 bio->bi_vcnt = (*bio_orig)->bi_vcnt; 245 bio->bi_vcnt = (*bio_orig)->bi_vcnt;
246 bio->bi_idx = (*bio_orig)->bi_idx; 246 bio->bi_idx = (*bio_orig)->bi_idx;
247 bio->bi_size = (*bio_orig)->bi_size; 247 bio->bi_size = (*bio_orig)->bi_size;
248 248
249 if (pool == page_pool) { 249 if (pool == page_pool) {
250 bio->bi_end_io = bounce_end_io_write; 250 bio->bi_end_io = bounce_end_io_write;
251 if (rw == READ) 251 if (rw == READ)
252 bio->bi_end_io = bounce_end_io_read; 252 bio->bi_end_io = bounce_end_io_read;
253 } else { 253 } else {
254 bio->bi_end_io = bounce_end_io_write_isa; 254 bio->bi_end_io = bounce_end_io_write_isa;
255 if (rw == READ) 255 if (rw == READ)
256 bio->bi_end_io = bounce_end_io_read_isa; 256 bio->bi_end_io = bounce_end_io_read_isa;
257 } 257 }
258 258
259 bio->bi_private = *bio_orig; 259 bio->bi_private = *bio_orig;
260 *bio_orig = bio; 260 *bio_orig = bio;
261 } 261 }
262 262
263 void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) 263 void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
264 { 264 {
265 mempool_t *pool; 265 mempool_t *pool;
266 266
267 /* 267 /*
268 * Data-less bio, nothing to bounce
269 */
270 if (bio_empty_barrier(*bio_orig))
271 return;
272
273 /*
268 * for non-isa bounce case, just check if the bounce pfn is equal 274 * for non-isa bounce case, just check if the bounce pfn is equal
269 * to or bigger than the highest pfn in the system -- in that case, 275 * to or bigger than the highest pfn in the system -- in that case,
270 * don't waste time iterating over bio segments 276 * don't waste time iterating over bio segments
271 */ 277 */
272 if (!(q->bounce_gfp & GFP_DMA)) { 278 if (!(q->bounce_gfp & GFP_DMA)) {
273 if (q->bounce_pfn >= blk_max_pfn) 279 if (q->bounce_pfn >= blk_max_pfn)
274 return; 280 return;
275 pool = page_pool; 281 pool = page_pool;
276 } else { 282 } else {
277 BUG_ON(!isa_page_pool); 283 BUG_ON(!isa_page_pool);
278 pool = isa_page_pool; 284 pool = isa_page_pool;
279 } 285 }
280 286
281 /* 287 /*
282 * slow path 288 * slow path
283 */ 289 */
284 __blk_queue_bounce(q, bio_orig, pool); 290 __blk_queue_bounce(q, bio_orig, pool);
285 } 291 }
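The check added at the top of blk_queue_bounce() is the part of this commit that matters in this file: an empty barrier carries no data pages, so there is nothing to bounce and the bio is passed through untouched. bio_empty_barrier() is introduced elsewhere in this commit; assuming it simply tests for a barrier bio with no payload, it would be roughly:

/*
 * Assumed shape of the new helper (defined in the bio header, not in
 * this file): a barrier request that carries no data.
 */
#define bio_empty_barrier(bio)	(bio_barrier(bio) && !(bio)->bi_size)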
286 292
287 EXPORT_SYMBOL(blk_queue_bounce); 293 EXPORT_SYMBOL(blk_queue_bounce);
288 294
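For context, blk_queue_bounce() is called while a bio is being turned into a request, before any DMA mapping; the bio pointer is passed by reference so it can be swapped for the bounce clone. A sketch of a hypothetical call site (the function name is illustrative; the pattern matches how the exported function above is meant to be used):

#include <linux/blkdev.h>
#include <linux/bio.h>

static int example_make_request(struct request_queue *q, struct bio *bio)
{
	/*
	 * May replace bio with a bounced clone whose pages all sit at or
	 * below q->bounce_pfn; the clone's end_io copies data back for
	 * reads and frees the bounce pages on completion.
	 */
	blk_queue_bounce(q, &bio);

	/* continue building/queueing the request with the (possibly new) bio */
	return 0;
}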