Commit 08bafc0341f2f7920e9045bc32c40299cac8c21b

Authored by Keith Mannthey
Committed by Jens Axboe
1 parent 7c239517d9

block: Suppress Buffer I/O errors when SCSI REQ_QUIET flag set

Allow the SCSI request REQ_QUIET flag to be propagated to the buffer
(file system) layer. The basic idea is to pass the flag from the SCSI
request to the bio (block I/O) and then to the buffer layer, which can
then suppress needless printks.
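
For illustration, here is a minimal sketch of the buffer-layer half of
the idea, i.e. mirroring the bio flag onto the buffer_head and consulting
it before logging. quiet_error() is the helper referenced later in this
message; BH_Quiet and bh_note_quiet() are names assumed for the sketch:

/* Sketch: when a bio completes, carry its quiet flag over to the
 * buffer_head so the buffer error path can see it. */
static void bh_note_quiet(struct buffer_head *bh, struct bio *bio)
{
	if (unlikely(test_bit(BIO_QUIET, &bio->bi_flags)))
		set_bit(BH_Quiet, &bh->b_state);
}

/* Sketch: decide whether a "Buffer I/O error" printk should be emitted;
 * stay quiet if the flag is set or we are being rate limited. */
static int quiet_error(struct buffer_head *bh)
{
	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
		return 0;
	return 1;
}

Callers in the buffer layer would then do
"if (!quiet_error(bh)) buffer_io_error(bh);" instead of printing the
error unconditionally.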

This patch declutters the kernel log by removing the 40-50 (per LUN)
buffer I/O error messages seen during a boot in my multipath setup.
Without this patch there is a good chance that any real errors will be
missed in the "noise" in the logs.

During boot I see blocks of messages like
"
__ratelimit: 211 callbacks suppressed
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242847
Buffer I/O error on device sdm, logical block 1
Buffer I/O error on device sdm, logical block 5242878
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242879
Buffer I/O error on device sdm, logical block 5242872
"
in my logs.

My disk environment is multipath Fibre Channel using the SCSI_DH_RDAC
code and multipathd. This topology includes an "active" and a "ghost"
path for each LUN. I/Os to the "ghost" path will never complete, and the
SCSI layer, via the RDAC SCSI device handler code, quickly returns the
I/Os issued to these paths and sets the REQ_QUIET flag to suppress the
SCSI layer messages.
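
The device-handler end of this chain only needs to tag the request before
failing it. As a rough sketch (the prep_fn shape, get_rdac_data() and the
state names are assumptions for illustration, not a quote of the
scsi_dh_rdac code):

static int rdac_prep_fn(struct scsi_device *sdev, struct request *req)
{
	struct rdac_dh_data *h = get_rdac_data(sdev);	/* assumed helper */

	if (h->state != RDAC_STATE_ACTIVE) {
		/* Quietly fail I/O sent down the passive ("ghost") path. */
		req->cmd_flags |= REQ_QUIET;
		return BLKPREP_KILL;
	}
	return BLKPREP_OK;
}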

I want to extend the QUIET behavior to the buffer (file system) layer so
that it handles these errors as well. I have been running this patch for
a while now on several boxes without issue. A few runs of bonnie++ show
no noticeable difference in performance in my setup.

Thanks to John Stultz for the quiet_error finalization.

Submitted-by:  Keith Mannthey <kmannth@us.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

Showing 4 changed files with 20 additions and 4 deletions. Inline diff of block/blk-core.c:

1 /* 1 /*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics 3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> 5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> 6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
7 * - July2000 7 * - July2000
8 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 8 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
9 */ 9 */
10 10
11 /* 11 /*
12 * This handles all read/write requests to block devices 12 * This handles all read/write requests to block devices
13 */ 13 */
14 #include <linux/kernel.h> 14 #include <linux/kernel.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/backing-dev.h> 16 #include <linux/backing-dev.h>
17 #include <linux/bio.h> 17 #include <linux/bio.h>
18 #include <linux/blkdev.h> 18 #include <linux/blkdev.h>
19 #include <linux/highmem.h> 19 #include <linux/highmem.h>
20 #include <linux/mm.h> 20 #include <linux/mm.h>
21 #include <linux/kernel_stat.h> 21 #include <linux/kernel_stat.h>
22 #include <linux/string.h> 22 #include <linux/string.h>
23 #include <linux/init.h> 23 #include <linux/init.h>
24 #include <linux/completion.h> 24 #include <linux/completion.h>
25 #include <linux/slab.h> 25 #include <linux/slab.h>
26 #include <linux/swap.h> 26 #include <linux/swap.h>
27 #include <linux/writeback.h> 27 #include <linux/writeback.h>
28 #include <linux/task_io_accounting_ops.h> 28 #include <linux/task_io_accounting_ops.h>
29 #include <linux/blktrace_api.h> 29 #include <linux/blktrace_api.h>
30 #include <linux/fault-inject.h> 30 #include <linux/fault-inject.h>
31 #include <trace/block.h> 31 #include <trace/block.h>
32 32
33 #include "blk.h" 33 #include "blk.h"
34 34
35 DEFINE_TRACE(block_plug); 35 DEFINE_TRACE(block_plug);
36 DEFINE_TRACE(block_unplug_io); 36 DEFINE_TRACE(block_unplug_io);
37 DEFINE_TRACE(block_unplug_timer); 37 DEFINE_TRACE(block_unplug_timer);
38 DEFINE_TRACE(block_getrq); 38 DEFINE_TRACE(block_getrq);
39 DEFINE_TRACE(block_sleeprq); 39 DEFINE_TRACE(block_sleeprq);
40 DEFINE_TRACE(block_rq_requeue); 40 DEFINE_TRACE(block_rq_requeue);
41 DEFINE_TRACE(block_bio_backmerge); 41 DEFINE_TRACE(block_bio_backmerge);
42 DEFINE_TRACE(block_bio_frontmerge); 42 DEFINE_TRACE(block_bio_frontmerge);
43 DEFINE_TRACE(block_bio_queue); 43 DEFINE_TRACE(block_bio_queue);
44 DEFINE_TRACE(block_rq_complete); 44 DEFINE_TRACE(block_rq_complete);
45 DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */ 45 DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */
46 EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); 46 EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
47 47
48 static int __make_request(struct request_queue *q, struct bio *bio); 48 static int __make_request(struct request_queue *q, struct bio *bio);
49 49
50 /* 50 /*
51 * For the allocated request tables 51 * For the allocated request tables
52 */ 52 */
53 static struct kmem_cache *request_cachep; 53 static struct kmem_cache *request_cachep;
54 54
55 /* 55 /*
56 * For queue allocation 56 * For queue allocation
57 */ 57 */
58 struct kmem_cache *blk_requestq_cachep; 58 struct kmem_cache *blk_requestq_cachep;
59 59
60 /* 60 /*
61 * Controlling structure to kblockd 61 * Controlling structure to kblockd
62 */ 62 */
63 static struct workqueue_struct *kblockd_workqueue; 63 static struct workqueue_struct *kblockd_workqueue;
64 64
65 static void drive_stat_acct(struct request *rq, int new_io) 65 static void drive_stat_acct(struct request *rq, int new_io)
66 { 66 {
67 struct hd_struct *part; 67 struct hd_struct *part;
68 int rw = rq_data_dir(rq); 68 int rw = rq_data_dir(rq);
69 int cpu; 69 int cpu;
70 70
71 if (!blk_fs_request(rq) || !rq->rq_disk) 71 if (!blk_fs_request(rq) || !rq->rq_disk)
72 return; 72 return;
73 73
74 cpu = part_stat_lock(); 74 cpu = part_stat_lock();
75 part = disk_map_sector_rcu(rq->rq_disk, rq->sector); 75 part = disk_map_sector_rcu(rq->rq_disk, rq->sector);
76 76
77 if (!new_io) 77 if (!new_io)
78 part_stat_inc(cpu, part, merges[rw]); 78 part_stat_inc(cpu, part, merges[rw]);
79 else { 79 else {
80 part_round_stats(cpu, part); 80 part_round_stats(cpu, part);
81 part_inc_in_flight(part); 81 part_inc_in_flight(part);
82 } 82 }
83 83
84 part_stat_unlock(); 84 part_stat_unlock();
85 } 85 }
86 86
87 void blk_queue_congestion_threshold(struct request_queue *q) 87 void blk_queue_congestion_threshold(struct request_queue *q)
88 { 88 {
89 int nr; 89 int nr;
90 90
91 nr = q->nr_requests - (q->nr_requests / 8) + 1; 91 nr = q->nr_requests - (q->nr_requests / 8) + 1;
92 if (nr > q->nr_requests) 92 if (nr > q->nr_requests)
93 nr = q->nr_requests; 93 nr = q->nr_requests;
94 q->nr_congestion_on = nr; 94 q->nr_congestion_on = nr;
95 95
96 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; 96 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
97 if (nr < 1) 97 if (nr < 1)
98 nr = 1; 98 nr = 1;
99 q->nr_congestion_off = nr; 99 q->nr_congestion_off = nr;
100 } 100 }
101 101
102 /** 102 /**
103 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info 103 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
104 * @bdev: device 104 * @bdev: device
105 * 105 *
106 * Locates the passed device's request queue and returns the address of its 106 * Locates the passed device's request queue and returns the address of its
107 * backing_dev_info 107 * backing_dev_info
108 * 108 *
109 * Will return NULL if the request queue cannot be located. 109 * Will return NULL if the request queue cannot be located.
110 */ 110 */
111 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) 111 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
112 { 112 {
113 struct backing_dev_info *ret = NULL; 113 struct backing_dev_info *ret = NULL;
114 struct request_queue *q = bdev_get_queue(bdev); 114 struct request_queue *q = bdev_get_queue(bdev);
115 115
116 if (q) 116 if (q)
117 ret = &q->backing_dev_info; 117 ret = &q->backing_dev_info;
118 return ret; 118 return ret;
119 } 119 }
120 EXPORT_SYMBOL(blk_get_backing_dev_info); 120 EXPORT_SYMBOL(blk_get_backing_dev_info);
121 121
122 void blk_rq_init(struct request_queue *q, struct request *rq) 122 void blk_rq_init(struct request_queue *q, struct request *rq)
123 { 123 {
124 memset(rq, 0, sizeof(*rq)); 124 memset(rq, 0, sizeof(*rq));
125 125
126 INIT_LIST_HEAD(&rq->queuelist); 126 INIT_LIST_HEAD(&rq->queuelist);
127 INIT_LIST_HEAD(&rq->timeout_list); 127 INIT_LIST_HEAD(&rq->timeout_list);
128 rq->cpu = -1; 128 rq->cpu = -1;
129 rq->q = q; 129 rq->q = q;
130 rq->sector = rq->hard_sector = (sector_t) -1; 130 rq->sector = rq->hard_sector = (sector_t) -1;
131 INIT_HLIST_NODE(&rq->hash); 131 INIT_HLIST_NODE(&rq->hash);
132 RB_CLEAR_NODE(&rq->rb_node); 132 RB_CLEAR_NODE(&rq->rb_node);
133 rq->cmd = rq->__cmd; 133 rq->cmd = rq->__cmd;
134 rq->tag = -1; 134 rq->tag = -1;
135 rq->ref_count = 1; 135 rq->ref_count = 1;
136 } 136 }
137 EXPORT_SYMBOL(blk_rq_init); 137 EXPORT_SYMBOL(blk_rq_init);
138 138
139 static void req_bio_endio(struct request *rq, struct bio *bio, 139 static void req_bio_endio(struct request *rq, struct bio *bio,
140 unsigned int nbytes, int error) 140 unsigned int nbytes, int error)
141 { 141 {
142 struct request_queue *q = rq->q; 142 struct request_queue *q = rq->q;
143 143
144 if (&q->bar_rq != rq) { 144 if (&q->bar_rq != rq) {
145 if (error) 145 if (error)
146 clear_bit(BIO_UPTODATE, &bio->bi_flags); 146 clear_bit(BIO_UPTODATE, &bio->bi_flags);
147 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 147 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
148 error = -EIO; 148 error = -EIO;
149 149
150 if (unlikely(nbytes > bio->bi_size)) { 150 if (unlikely(nbytes > bio->bi_size)) {
151 printk(KERN_ERR "%s: want %u bytes done, %u left\n", 151 printk(KERN_ERR "%s: want %u bytes done, %u left\n",
152 __func__, nbytes, bio->bi_size); 152 __func__, nbytes, bio->bi_size);
153 nbytes = bio->bi_size; 153 nbytes = bio->bi_size;
154 } 154 }
155 155
156 if (unlikely(rq->cmd_flags & REQ_QUIET))
157 set_bit(BIO_QUIET, &bio->bi_flags);
158
156 bio->bi_size -= nbytes; 159 bio->bi_size -= nbytes;
157 bio->bi_sector += (nbytes >> 9); 160 bio->bi_sector += (nbytes >> 9);
158 161
159 if (bio_integrity(bio)) 162 if (bio_integrity(bio))
160 bio_integrity_advance(bio, nbytes); 163 bio_integrity_advance(bio, nbytes);
161 164
162 if (bio->bi_size == 0) 165 if (bio->bi_size == 0)
163 bio_endio(bio, error); 166 bio_endio(bio, error);
164 } else { 167 } else {
165 168
166 /* 169 /*
167 * Okay, this is the barrier request in progress, just 170 * Okay, this is the barrier request in progress, just
168 * record the error; 171 * record the error;
169 */ 172 */
170 if (error && !q->orderr) 173 if (error && !q->orderr)
171 q->orderr = error; 174 q->orderr = error;
172 } 175 }
173 } 176 }
174 177
175 void blk_dump_rq_flags(struct request *rq, char *msg) 178 void blk_dump_rq_flags(struct request *rq, char *msg)
176 { 179 {
177 int bit; 180 int bit;
178 181
179 printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, 182 printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg,
180 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, 183 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
181 rq->cmd_flags); 184 rq->cmd_flags);
182 185
183 printk(KERN_INFO " sector %llu, nr/cnr %lu/%u\n", 186 printk(KERN_INFO " sector %llu, nr/cnr %lu/%u\n",
184 (unsigned long long)rq->sector, 187 (unsigned long long)rq->sector,
185 rq->nr_sectors, 188 rq->nr_sectors,
186 rq->current_nr_sectors); 189 rq->current_nr_sectors);
187 printk(KERN_INFO " bio %p, biotail %p, buffer %p, data %p, len %u\n", 190 printk(KERN_INFO " bio %p, biotail %p, buffer %p, data %p, len %u\n",
188 rq->bio, rq->biotail, 191 rq->bio, rq->biotail,
189 rq->buffer, rq->data, 192 rq->buffer, rq->data,
190 rq->data_len); 193 rq->data_len);
191 194
192 if (blk_pc_request(rq)) { 195 if (blk_pc_request(rq)) {
193 printk(KERN_INFO " cdb: "); 196 printk(KERN_INFO " cdb: ");
194 for (bit = 0; bit < BLK_MAX_CDB; bit++) 197 for (bit = 0; bit < BLK_MAX_CDB; bit++)
195 printk("%02x ", rq->cmd[bit]); 198 printk("%02x ", rq->cmd[bit]);
196 printk("\n"); 199 printk("\n");
197 } 200 }
198 } 201 }
199 EXPORT_SYMBOL(blk_dump_rq_flags); 202 EXPORT_SYMBOL(blk_dump_rq_flags);
200 203
201 /* 204 /*
202 * "plug" the device if there are no outstanding requests: this will 205 * "plug" the device if there are no outstanding requests: this will
203 * force the transfer to start only after we have put all the requests 206 * force the transfer to start only after we have put all the requests
204 * on the list. 207 * on the list.
205 * 208 *
206 * This is called with interrupts off and no requests on the queue and 209 * This is called with interrupts off and no requests on the queue and
207 * with the queue lock held. 210 * with the queue lock held.
208 */ 211 */
209 void blk_plug_device(struct request_queue *q) 212 void blk_plug_device(struct request_queue *q)
210 { 213 {
211 WARN_ON(!irqs_disabled()); 214 WARN_ON(!irqs_disabled());
212 215
213 /* 216 /*
214 * don't plug a stopped queue, it must be paired with blk_start_queue() 217 * don't plug a stopped queue, it must be paired with blk_start_queue()
215 * which will restart the queueing 218 * which will restart the queueing
216 */ 219 */
217 if (blk_queue_stopped(q)) 220 if (blk_queue_stopped(q))
218 return; 221 return;
219 222
220 if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { 223 if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) {
221 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 224 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
222 trace_block_plug(q); 225 trace_block_plug(q);
223 } 226 }
224 } 227 }
225 EXPORT_SYMBOL(blk_plug_device); 228 EXPORT_SYMBOL(blk_plug_device);
226 229
227 /** 230 /**
228 * blk_plug_device_unlocked - plug a device without queue lock held 231 * blk_plug_device_unlocked - plug a device without queue lock held
229 * @q: The &struct request_queue to plug 232 * @q: The &struct request_queue to plug
230 * 233 *
231 * Description: 234 * Description:
232 * Like @blk_plug_device(), but grabs the queue lock and disables 235 * Like @blk_plug_device(), but grabs the queue lock and disables
233 * interrupts. 236 * interrupts.
234 **/ 237 **/
235 void blk_plug_device_unlocked(struct request_queue *q) 238 void blk_plug_device_unlocked(struct request_queue *q)
236 { 239 {
237 unsigned long flags; 240 unsigned long flags;
238 241
239 spin_lock_irqsave(q->queue_lock, flags); 242 spin_lock_irqsave(q->queue_lock, flags);
240 blk_plug_device(q); 243 blk_plug_device(q);
241 spin_unlock_irqrestore(q->queue_lock, flags); 244 spin_unlock_irqrestore(q->queue_lock, flags);
242 } 245 }
243 EXPORT_SYMBOL(blk_plug_device_unlocked); 246 EXPORT_SYMBOL(blk_plug_device_unlocked);
244 247
245 /* 248 /*
246 * remove the queue from the plugged list, if present. called with 249 * remove the queue from the plugged list, if present. called with
247 * queue lock held and interrupts disabled. 250 * queue lock held and interrupts disabled.
248 */ 251 */
249 int blk_remove_plug(struct request_queue *q) 252 int blk_remove_plug(struct request_queue *q)
250 { 253 {
251 WARN_ON(!irqs_disabled()); 254 WARN_ON(!irqs_disabled());
252 255
253 if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) 256 if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q))
254 return 0; 257 return 0;
255 258
256 del_timer(&q->unplug_timer); 259 del_timer(&q->unplug_timer);
257 return 1; 260 return 1;
258 } 261 }
259 EXPORT_SYMBOL(blk_remove_plug); 262 EXPORT_SYMBOL(blk_remove_plug);
260 263
261 /* 264 /*
262 * remove the plug and let it rip.. 265 * remove the plug and let it rip..
263 */ 266 */
264 void __generic_unplug_device(struct request_queue *q) 267 void __generic_unplug_device(struct request_queue *q)
265 { 268 {
266 if (unlikely(blk_queue_stopped(q))) 269 if (unlikely(blk_queue_stopped(q)))
267 return; 270 return;
268 271
269 if (!blk_remove_plug(q)) 272 if (!blk_remove_plug(q))
270 return; 273 return;
271 274
272 q->request_fn(q); 275 q->request_fn(q);
273 } 276 }
274 277
275 /** 278 /**
276 * generic_unplug_device - fire a request queue 279 * generic_unplug_device - fire a request queue
277 * @q: The &struct request_queue in question 280 * @q: The &struct request_queue in question
278 * 281 *
279 * Description: 282 * Description:
280 * Linux uses plugging to build bigger requests queues before letting 283 * Linux uses plugging to build bigger requests queues before letting
281 * the device have at them. If a queue is plugged, the I/O scheduler 284 * the device have at them. If a queue is plugged, the I/O scheduler
282 * is still adding and merging requests on the queue. Once the queue 285 * is still adding and merging requests on the queue. Once the queue
283 * gets unplugged, the request_fn defined for the queue is invoked and 286 * gets unplugged, the request_fn defined for the queue is invoked and
284 * transfers started. 287 * transfers started.
285 **/ 288 **/
286 void generic_unplug_device(struct request_queue *q) 289 void generic_unplug_device(struct request_queue *q)
287 { 290 {
288 if (blk_queue_plugged(q)) { 291 if (blk_queue_plugged(q)) {
289 spin_lock_irq(q->queue_lock); 292 spin_lock_irq(q->queue_lock);
290 __generic_unplug_device(q); 293 __generic_unplug_device(q);
291 spin_unlock_irq(q->queue_lock); 294 spin_unlock_irq(q->queue_lock);
292 } 295 }
293 } 296 }
294 EXPORT_SYMBOL(generic_unplug_device); 297 EXPORT_SYMBOL(generic_unplug_device);
295 298
296 static void blk_backing_dev_unplug(struct backing_dev_info *bdi, 299 static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
297 struct page *page) 300 struct page *page)
298 { 301 {
299 struct request_queue *q = bdi->unplug_io_data; 302 struct request_queue *q = bdi->unplug_io_data;
300 303
301 blk_unplug(q); 304 blk_unplug(q);
302 } 305 }
303 306
304 void blk_unplug_work(struct work_struct *work) 307 void blk_unplug_work(struct work_struct *work)
305 { 308 {
306 struct request_queue *q = 309 struct request_queue *q =
307 container_of(work, struct request_queue, unplug_work); 310 container_of(work, struct request_queue, unplug_work);
308 311
309 trace_block_unplug_io(q); 312 trace_block_unplug_io(q);
310 q->unplug_fn(q); 313 q->unplug_fn(q);
311 } 314 }
312 315
313 void blk_unplug_timeout(unsigned long data) 316 void blk_unplug_timeout(unsigned long data)
314 { 317 {
315 struct request_queue *q = (struct request_queue *)data; 318 struct request_queue *q = (struct request_queue *)data;
316 319
317 trace_block_unplug_timer(q); 320 trace_block_unplug_timer(q);
318 kblockd_schedule_work(q, &q->unplug_work); 321 kblockd_schedule_work(q, &q->unplug_work);
319 } 322 }
320 323
321 void blk_unplug(struct request_queue *q) 324 void blk_unplug(struct request_queue *q)
322 { 325 {
323 /* 326 /*
324 * devices don't necessarily have an ->unplug_fn defined 327 * devices don't necessarily have an ->unplug_fn defined
325 */ 328 */
326 if (q->unplug_fn) { 329 if (q->unplug_fn) {
327 trace_block_unplug_io(q); 330 trace_block_unplug_io(q);
328 q->unplug_fn(q); 331 q->unplug_fn(q);
329 } 332 }
330 } 333 }
331 EXPORT_SYMBOL(blk_unplug); 334 EXPORT_SYMBOL(blk_unplug);
332 335
333 static void blk_invoke_request_fn(struct request_queue *q) 336 static void blk_invoke_request_fn(struct request_queue *q)
334 { 337 {
335 if (unlikely(blk_queue_stopped(q))) 338 if (unlikely(blk_queue_stopped(q)))
336 return; 339 return;
337 340
338 /* 341 /*
339 * one level of recursion is ok and is much faster than kicking 342 * one level of recursion is ok and is much faster than kicking
340 * the unplug handling 343 * the unplug handling
341 */ 344 */
342 if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { 345 if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
343 q->request_fn(q); 346 q->request_fn(q);
344 queue_flag_clear(QUEUE_FLAG_REENTER, q); 347 queue_flag_clear(QUEUE_FLAG_REENTER, q);
345 } else { 348 } else {
346 queue_flag_set(QUEUE_FLAG_PLUGGED, q); 349 queue_flag_set(QUEUE_FLAG_PLUGGED, q);
347 kblockd_schedule_work(q, &q->unplug_work); 350 kblockd_schedule_work(q, &q->unplug_work);
348 } 351 }
349 } 352 }
350 353
351 /** 354 /**
352 * blk_start_queue - restart a previously stopped queue 355 * blk_start_queue - restart a previously stopped queue
353 * @q: The &struct request_queue in question 356 * @q: The &struct request_queue in question
354 * 357 *
355 * Description: 358 * Description:
356 * blk_start_queue() will clear the stop flag on the queue, and call 359 * blk_start_queue() will clear the stop flag on the queue, and call
357 * the request_fn for the queue if it was in a stopped state when 360 * the request_fn for the queue if it was in a stopped state when
358 * entered. Also see blk_stop_queue(). Queue lock must be held. 361 * entered. Also see blk_stop_queue(). Queue lock must be held.
359 **/ 362 **/
360 void blk_start_queue(struct request_queue *q) 363 void blk_start_queue(struct request_queue *q)
361 { 364 {
362 WARN_ON(!irqs_disabled()); 365 WARN_ON(!irqs_disabled());
363 366
364 queue_flag_clear(QUEUE_FLAG_STOPPED, q); 367 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
365 blk_invoke_request_fn(q); 368 blk_invoke_request_fn(q);
366 } 369 }
367 EXPORT_SYMBOL(blk_start_queue); 370 EXPORT_SYMBOL(blk_start_queue);
368 371
369 /** 372 /**
370 * blk_stop_queue - stop a queue 373 * blk_stop_queue - stop a queue
371 * @q: The &struct request_queue in question 374 * @q: The &struct request_queue in question
372 * 375 *
373 * Description: 376 * Description:
374 * The Linux block layer assumes that a block driver will consume all 377 * The Linux block layer assumes that a block driver will consume all
375 * entries on the request queue when the request_fn strategy is called. 378 * entries on the request queue when the request_fn strategy is called.
376 * Often this will not happen, because of hardware limitations (queue 379 * Often this will not happen, because of hardware limitations (queue
377 * depth settings). If a device driver gets a 'queue full' response, 380 * depth settings). If a device driver gets a 'queue full' response,
378 * or if it simply chooses not to queue more I/O at one point, it can 381 * or if it simply chooses not to queue more I/O at one point, it can
379 * call this function to prevent the request_fn from being called until 382 * call this function to prevent the request_fn from being called until
380 * the driver has signalled it's ready to go again. This happens by calling 383 * the driver has signalled it's ready to go again. This happens by calling
381 * blk_start_queue() to restart queue operations. Queue lock must be held. 384 * blk_start_queue() to restart queue operations. Queue lock must be held.
382 **/ 385 **/
383 void blk_stop_queue(struct request_queue *q) 386 void blk_stop_queue(struct request_queue *q)
384 { 387 {
385 blk_remove_plug(q); 388 blk_remove_plug(q);
386 queue_flag_set(QUEUE_FLAG_STOPPED, q); 389 queue_flag_set(QUEUE_FLAG_STOPPED, q);
387 } 390 }
388 EXPORT_SYMBOL(blk_stop_queue); 391 EXPORT_SYMBOL(blk_stop_queue);
389 392
390 /** 393 /**
391 * blk_sync_queue - cancel any pending callbacks on a queue 394 * blk_sync_queue - cancel any pending callbacks on a queue
392 * @q: the queue 395 * @q: the queue
393 * 396 *
394 * Description: 397 * Description:
395 * The block layer may perform asynchronous callback activity 398 * The block layer may perform asynchronous callback activity
396 * on a queue, such as calling the unplug function after a timeout. 399 * on a queue, such as calling the unplug function after a timeout.
397 * A block device may call blk_sync_queue to ensure that any 400 * A block device may call blk_sync_queue to ensure that any
398 * such activity is cancelled, thus allowing it to release resources 401 * such activity is cancelled, thus allowing it to release resources
399 * that the callbacks might use. The caller must already have made sure 402 * that the callbacks might use. The caller must already have made sure
400 * that its ->make_request_fn will not re-add plugging prior to calling 403 * that its ->make_request_fn will not re-add plugging prior to calling
401 * this function. 404 * this function.
402 * 405 *
403 */ 406 */
404 void blk_sync_queue(struct request_queue *q) 407 void blk_sync_queue(struct request_queue *q)
405 { 408 {
406 del_timer_sync(&q->unplug_timer); 409 del_timer_sync(&q->unplug_timer);
407 del_timer_sync(&q->timeout); 410 del_timer_sync(&q->timeout);
408 kblockd_flush_work(&q->unplug_work); 411 kblockd_flush_work(&q->unplug_work);
409 } 412 }
410 EXPORT_SYMBOL(blk_sync_queue); 413 EXPORT_SYMBOL(blk_sync_queue);
411 414
412 /** 415 /**
413 * __blk_run_queue - run a single device queue 416 * __blk_run_queue - run a single device queue
414 * @q: The queue to run 417 * @q: The queue to run
415 * 418 *
416 * Description: 419 * Description:
417 * See @blk_run_queue. This variant must be called with the queue lock 420 * See @blk_run_queue. This variant must be called with the queue lock
418 * held and interrupts disabled. 421 * held and interrupts disabled.
419 * 422 *
420 */ 423 */
421 void __blk_run_queue(struct request_queue *q) 424 void __blk_run_queue(struct request_queue *q)
422 { 425 {
423 blk_remove_plug(q); 426 blk_remove_plug(q);
424 427
425 /* 428 /*
426 * Only recurse once to avoid overrunning the stack, let the unplug 429 * Only recurse once to avoid overrunning the stack, let the unplug
427 * handling reinvoke the handler shortly if we already got there. 430 * handling reinvoke the handler shortly if we already got there.
428 */ 431 */
429 if (!elv_queue_empty(q)) 432 if (!elv_queue_empty(q))
430 blk_invoke_request_fn(q); 433 blk_invoke_request_fn(q);
431 } 434 }
432 EXPORT_SYMBOL(__blk_run_queue); 435 EXPORT_SYMBOL(__blk_run_queue);
433 436
434 /** 437 /**
435 * blk_run_queue - run a single device queue 438 * blk_run_queue - run a single device queue
436 * @q: The queue to run 439 * @q: The queue to run
437 * 440 *
438 * Description: 441 * Description:
439 * Invoke request handling on this queue, if it has pending work to do. 442 * Invoke request handling on this queue, if it has pending work to do.
440 * May be used to restart queueing when a request has completed. Also 443 * May be used to restart queueing when a request has completed. Also
441 * See @blk_start_queueing. 444 * See @blk_start_queueing.
442 * 445 *
443 */ 446 */
444 void blk_run_queue(struct request_queue *q) 447 void blk_run_queue(struct request_queue *q)
445 { 448 {
446 unsigned long flags; 449 unsigned long flags;
447 450
448 spin_lock_irqsave(q->queue_lock, flags); 451 spin_lock_irqsave(q->queue_lock, flags);
449 __blk_run_queue(q); 452 __blk_run_queue(q);
450 spin_unlock_irqrestore(q->queue_lock, flags); 453 spin_unlock_irqrestore(q->queue_lock, flags);
451 } 454 }
452 EXPORT_SYMBOL(blk_run_queue); 455 EXPORT_SYMBOL(blk_run_queue);
453 456
454 void blk_put_queue(struct request_queue *q) 457 void blk_put_queue(struct request_queue *q)
455 { 458 {
456 kobject_put(&q->kobj); 459 kobject_put(&q->kobj);
457 } 460 }
458 461
459 void blk_cleanup_queue(struct request_queue *q) 462 void blk_cleanup_queue(struct request_queue *q)
460 { 463 {
461 /* 464 /*
462 * We know we have process context here, so we can be a little 465 * We know we have process context here, so we can be a little
463 * cautious and ensure that pending block actions on this device 466 * cautious and ensure that pending block actions on this device
464 * are done before moving on. Going into this function, we should 467 * are done before moving on. Going into this function, we should
465 * not have processes doing IO to this device. 468 * not have processes doing IO to this device.
466 */ 469 */
467 blk_sync_queue(q); 470 blk_sync_queue(q);
468 471
469 mutex_lock(&q->sysfs_lock); 472 mutex_lock(&q->sysfs_lock);
470 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); 473 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
471 mutex_unlock(&q->sysfs_lock); 474 mutex_unlock(&q->sysfs_lock);
472 475
473 if (q->elevator) 476 if (q->elevator)
474 elevator_exit(q->elevator); 477 elevator_exit(q->elevator);
475 478
476 blk_put_queue(q); 479 blk_put_queue(q);
477 } 480 }
478 EXPORT_SYMBOL(blk_cleanup_queue); 481 EXPORT_SYMBOL(blk_cleanup_queue);
479 482
480 static int blk_init_free_list(struct request_queue *q) 483 static int blk_init_free_list(struct request_queue *q)
481 { 484 {
482 struct request_list *rl = &q->rq; 485 struct request_list *rl = &q->rq;
483 486
484 rl->count[READ] = rl->count[WRITE] = 0; 487 rl->count[READ] = rl->count[WRITE] = 0;
485 rl->starved[READ] = rl->starved[WRITE] = 0; 488 rl->starved[READ] = rl->starved[WRITE] = 0;
486 rl->elvpriv = 0; 489 rl->elvpriv = 0;
487 init_waitqueue_head(&rl->wait[READ]); 490 init_waitqueue_head(&rl->wait[READ]);
488 init_waitqueue_head(&rl->wait[WRITE]); 491 init_waitqueue_head(&rl->wait[WRITE]);
489 492
490 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 493 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
491 mempool_free_slab, request_cachep, q->node); 494 mempool_free_slab, request_cachep, q->node);
492 495
493 if (!rl->rq_pool) 496 if (!rl->rq_pool)
494 return -ENOMEM; 497 return -ENOMEM;
495 498
496 return 0; 499 return 0;
497 } 500 }
498 501
499 struct request_queue *blk_alloc_queue(gfp_t gfp_mask) 502 struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
500 { 503 {
501 return blk_alloc_queue_node(gfp_mask, -1); 504 return blk_alloc_queue_node(gfp_mask, -1);
502 } 505 }
503 EXPORT_SYMBOL(blk_alloc_queue); 506 EXPORT_SYMBOL(blk_alloc_queue);
504 507
505 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) 508 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
506 { 509 {
507 struct request_queue *q; 510 struct request_queue *q;
508 int err; 511 int err;
509 512
510 q = kmem_cache_alloc_node(blk_requestq_cachep, 513 q = kmem_cache_alloc_node(blk_requestq_cachep,
511 gfp_mask | __GFP_ZERO, node_id); 514 gfp_mask | __GFP_ZERO, node_id);
512 if (!q) 515 if (!q)
513 return NULL; 516 return NULL;
514 517
515 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; 518 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
516 q->backing_dev_info.unplug_io_data = q; 519 q->backing_dev_info.unplug_io_data = q;
517 err = bdi_init(&q->backing_dev_info); 520 err = bdi_init(&q->backing_dev_info);
518 if (err) { 521 if (err) {
519 kmem_cache_free(blk_requestq_cachep, q); 522 kmem_cache_free(blk_requestq_cachep, q);
520 return NULL; 523 return NULL;
521 } 524 }
522 525
523 init_timer(&q->unplug_timer); 526 init_timer(&q->unplug_timer);
524 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 527 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
525 INIT_LIST_HEAD(&q->timeout_list); 528 INIT_LIST_HEAD(&q->timeout_list);
526 INIT_WORK(&q->unplug_work, blk_unplug_work); 529 INIT_WORK(&q->unplug_work, blk_unplug_work);
527 530
528 kobject_init(&q->kobj, &blk_queue_ktype); 531 kobject_init(&q->kobj, &blk_queue_ktype);
529 532
530 mutex_init(&q->sysfs_lock); 533 mutex_init(&q->sysfs_lock);
531 spin_lock_init(&q->__queue_lock); 534 spin_lock_init(&q->__queue_lock);
532 535
533 return q; 536 return q;
534 } 537 }
535 EXPORT_SYMBOL(blk_alloc_queue_node); 538 EXPORT_SYMBOL(blk_alloc_queue_node);
536 539
537 /** 540 /**
538 * blk_init_queue - prepare a request queue for use with a block device 541 * blk_init_queue - prepare a request queue for use with a block device
539 * @rfn: The function to be called to process requests that have been 542 * @rfn: The function to be called to process requests that have been
540 * placed on the queue. 543 * placed on the queue.
541 * @lock: Request queue spin lock 544 * @lock: Request queue spin lock
542 * 545 *
543 * Description: 546 * Description:
544 * If a block device wishes to use the standard request handling procedures, 547 * If a block device wishes to use the standard request handling procedures,
545 * which sorts requests and coalesces adjacent requests, then it must 548 * which sorts requests and coalesces adjacent requests, then it must
546 * call blk_init_queue(). The function @rfn will be called when there 549 * call blk_init_queue(). The function @rfn will be called when there
547 * are requests on the queue that need to be processed. If the device 550 * are requests on the queue that need to be processed. If the device
548 * supports plugging, then @rfn may not be called immediately when requests 551 * supports plugging, then @rfn may not be called immediately when requests
549 * are available on the queue, but may be called at some time later instead. 552 * are available on the queue, but may be called at some time later instead.
550 * Plugged queues are generally unplugged when a buffer belonging to one 553 * Plugged queues are generally unplugged when a buffer belonging to one
551 * of the requests on the queue is needed, or due to memory pressure. 554 * of the requests on the queue is needed, or due to memory pressure.
552 * 555 *
553 * @rfn is not required, or even expected, to remove all requests off the 556 * @rfn is not required, or even expected, to remove all requests off the
554 * queue, but only as many as it can handle at a time. If it does leave 557 * queue, but only as many as it can handle at a time. If it does leave
555 * requests on the queue, it is responsible for arranging that the requests 558 * requests on the queue, it is responsible for arranging that the requests
556 * get dealt with eventually. 559 * get dealt with eventually.
557 * 560 *
558 * The queue spin lock must be held while manipulating the requests on the 561 * The queue spin lock must be held while manipulating the requests on the
559 * request queue; this lock will be taken also from interrupt context, so irq 562 * request queue; this lock will be taken also from interrupt context, so irq
560 * disabling is needed for it. 563 * disabling is needed for it.
561 * 564 *
562 * Function returns a pointer to the initialized request queue, or %NULL if 565 * Function returns a pointer to the initialized request queue, or %NULL if
563 * it didn't succeed. 566 * it didn't succeed.
564 * 567 *
565 * Note: 568 * Note:
566 * blk_init_queue() must be paired with a blk_cleanup_queue() call 569 * blk_init_queue() must be paired with a blk_cleanup_queue() call
567 * when the block device is deactivated (such as at module unload). 570 * when the block device is deactivated (such as at module unload).
568 **/ 571 **/
569 572
570 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 573 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
571 { 574 {
572 return blk_init_queue_node(rfn, lock, -1); 575 return blk_init_queue_node(rfn, lock, -1);
573 } 576 }
574 EXPORT_SYMBOL(blk_init_queue); 577 EXPORT_SYMBOL(blk_init_queue);
575 578
576 struct request_queue * 579 struct request_queue *
577 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 580 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
578 { 581 {
579 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); 582 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
580 583
581 if (!q) 584 if (!q)
582 return NULL; 585 return NULL;
583 586
584 q->node = node_id; 587 q->node = node_id;
585 if (blk_init_free_list(q)) { 588 if (blk_init_free_list(q)) {
586 kmem_cache_free(blk_requestq_cachep, q); 589 kmem_cache_free(blk_requestq_cachep, q);
587 return NULL; 590 return NULL;
588 } 591 }
589 592
590 /* 593 /*
591 * if caller didn't supply a lock, they get per-queue locking with 594 * if caller didn't supply a lock, they get per-queue locking with
592 * our embedded lock 595 * our embedded lock
593 */ 596 */
594 if (!lock) 597 if (!lock)
595 lock = &q->__queue_lock; 598 lock = &q->__queue_lock;
596 599
597 q->request_fn = rfn; 600 q->request_fn = rfn;
598 q->prep_rq_fn = NULL; 601 q->prep_rq_fn = NULL;
599 q->unplug_fn = generic_unplug_device; 602 q->unplug_fn = generic_unplug_device;
600 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER | 603 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER |
601 1 << QUEUE_FLAG_STACKABLE); 604 1 << QUEUE_FLAG_STACKABLE);
602 q->queue_lock = lock; 605 q->queue_lock = lock;
603 606
604 blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK); 607 blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK);
605 608
606 blk_queue_make_request(q, __make_request); 609 blk_queue_make_request(q, __make_request);
607 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); 610 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
608 611
609 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 612 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
610 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 613 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
611 614
612 q->sg_reserved_size = INT_MAX; 615 q->sg_reserved_size = INT_MAX;
613 616
614 blk_set_cmd_filter_defaults(&q->cmd_filter); 617 blk_set_cmd_filter_defaults(&q->cmd_filter);
615 618
616 /* 619 /*
617 * all done 620 * all done
618 */ 621 */
619 if (!elevator_init(q, NULL)) { 622 if (!elevator_init(q, NULL)) {
620 blk_queue_congestion_threshold(q); 623 blk_queue_congestion_threshold(q);
621 return q; 624 return q;
622 } 625 }
623 626
624 blk_put_queue(q); 627 blk_put_queue(q);
625 return NULL; 628 return NULL;
626 } 629 }
627 EXPORT_SYMBOL(blk_init_queue_node); 630 EXPORT_SYMBOL(blk_init_queue_node);
628 631
629 int blk_get_queue(struct request_queue *q) 632 int blk_get_queue(struct request_queue *q)
630 { 633 {
631 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 634 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
632 kobject_get(&q->kobj); 635 kobject_get(&q->kobj);
633 return 0; 636 return 0;
634 } 637 }
635 638
636 return 1; 639 return 1;
637 } 640 }
638 641
639 static inline void blk_free_request(struct request_queue *q, struct request *rq) 642 static inline void blk_free_request(struct request_queue *q, struct request *rq)
640 { 643 {
641 if (rq->cmd_flags & REQ_ELVPRIV) 644 if (rq->cmd_flags & REQ_ELVPRIV)
642 elv_put_request(q, rq); 645 elv_put_request(q, rq);
643 mempool_free(rq, q->rq.rq_pool); 646 mempool_free(rq, q->rq.rq_pool);
644 } 647 }
645 648
646 static struct request * 649 static struct request *
647 blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) 650 blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
648 { 651 {
649 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 652 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
650 653
651 if (!rq) 654 if (!rq)
652 return NULL; 655 return NULL;
653 656
654 blk_rq_init(q, rq); 657 blk_rq_init(q, rq);
655 658
656 rq->cmd_flags = rw | REQ_ALLOCED; 659 rq->cmd_flags = rw | REQ_ALLOCED;
657 660
658 if (priv) { 661 if (priv) {
659 if (unlikely(elv_set_request(q, rq, gfp_mask))) { 662 if (unlikely(elv_set_request(q, rq, gfp_mask))) {
660 mempool_free(rq, q->rq.rq_pool); 663 mempool_free(rq, q->rq.rq_pool);
661 return NULL; 664 return NULL;
662 } 665 }
663 rq->cmd_flags |= REQ_ELVPRIV; 666 rq->cmd_flags |= REQ_ELVPRIV;
664 } 667 }
665 668
666 return rq; 669 return rq;
667 } 670 }
668 671
669 /* 672 /*
670 * ioc_batching returns true if the ioc is a valid batching request and 673 * ioc_batching returns true if the ioc is a valid batching request and
671 * should be given priority access to a request. 674 * should be given priority access to a request.
672 */ 675 */
673 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) 676 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
674 { 677 {
675 if (!ioc) 678 if (!ioc)
676 return 0; 679 return 0;
677 680
678 /* 681 /*
679 * Make sure the process is able to allocate at least 1 request 682 * Make sure the process is able to allocate at least 1 request
680 * even if the batch times out, otherwise we could theoretically 683 * even if the batch times out, otherwise we could theoretically
681 * lose wakeups. 684 * lose wakeups.
682 */ 685 */
683 return ioc->nr_batch_requests == q->nr_batching || 686 return ioc->nr_batch_requests == q->nr_batching ||
684 (ioc->nr_batch_requests > 0 687 (ioc->nr_batch_requests > 0
685 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); 688 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
686 } 689 }
687 690
688 /* 691 /*
689 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This 692 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
690 * will cause the process to be a "batcher" on all queues in the system. This 693 * will cause the process to be a "batcher" on all queues in the system. This
691 * is the behaviour we want though - once it gets a wakeup it should be given 694 * is the behaviour we want though - once it gets a wakeup it should be given
692 * a nice run. 695 * a nice run.
693 */ 696 */
694 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) 697 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
695 { 698 {
696 if (!ioc || ioc_batching(q, ioc)) 699 if (!ioc || ioc_batching(q, ioc))
697 return; 700 return;
698 701
699 ioc->nr_batch_requests = q->nr_batching; 702 ioc->nr_batch_requests = q->nr_batching;
700 ioc->last_waited = jiffies; 703 ioc->last_waited = jiffies;
701 } 704 }
702 705
703 static void __freed_request(struct request_queue *q, int rw) 706 static void __freed_request(struct request_queue *q, int rw)
704 { 707 {
705 struct request_list *rl = &q->rq; 708 struct request_list *rl = &q->rq;
706 709
707 if (rl->count[rw] < queue_congestion_off_threshold(q)) 710 if (rl->count[rw] < queue_congestion_off_threshold(q))
708 blk_clear_queue_congested(q, rw); 711 blk_clear_queue_congested(q, rw);
709 712
710 if (rl->count[rw] + 1 <= q->nr_requests) { 713 if (rl->count[rw] + 1 <= q->nr_requests) {
711 if (waitqueue_active(&rl->wait[rw])) 714 if (waitqueue_active(&rl->wait[rw]))
712 wake_up(&rl->wait[rw]); 715 wake_up(&rl->wait[rw]);
713 716
714 blk_clear_queue_full(q, rw); 717 blk_clear_queue_full(q, rw);
715 } 718 }
716 } 719 }
717 720
718 /* 721 /*
719 * A request has just been released. Account for it, update the full and 722 * A request has just been released. Account for it, update the full and
720 * congestion status, wake up any waiters. Called under q->queue_lock. 723 * congestion status, wake up any waiters. Called under q->queue_lock.
721 */ 724 */
722 static void freed_request(struct request_queue *q, int rw, int priv) 725 static void freed_request(struct request_queue *q, int rw, int priv)
723 { 726 {
724 struct request_list *rl = &q->rq; 727 struct request_list *rl = &q->rq;
725 728
726 rl->count[rw]--; 729 rl->count[rw]--;
727 if (priv) 730 if (priv)
728 rl->elvpriv--; 731 rl->elvpriv--;
729 732
730 __freed_request(q, rw); 733 __freed_request(q, rw);
731 734
732 if (unlikely(rl->starved[rw ^ 1])) 735 if (unlikely(rl->starved[rw ^ 1]))
733 __freed_request(q, rw ^ 1); 736 __freed_request(q, rw ^ 1);
734 } 737 }
735 738
736 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) 739 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
737 /* 740 /*
738 * Get a free request, queue_lock must be held. 741 * Get a free request, queue_lock must be held.
739 * Returns NULL on failure, with queue_lock held. 742 * Returns NULL on failure, with queue_lock held.
740 * Returns !NULL on success, with queue_lock *not held*. 743 * Returns !NULL on success, with queue_lock *not held*.
741 */ 744 */
742 static struct request *get_request(struct request_queue *q, int rw_flags, 745 static struct request *get_request(struct request_queue *q, int rw_flags,
743 struct bio *bio, gfp_t gfp_mask) 746 struct bio *bio, gfp_t gfp_mask)
744 { 747 {
745 struct request *rq = NULL; 748 struct request *rq = NULL;
746 struct request_list *rl = &q->rq; 749 struct request_list *rl = &q->rq;
747 struct io_context *ioc = NULL; 750 struct io_context *ioc = NULL;
748 const int rw = rw_flags & 0x01; 751 const int rw = rw_flags & 0x01;
749 int may_queue, priv; 752 int may_queue, priv;
750 753
751 may_queue = elv_may_queue(q, rw_flags); 754 may_queue = elv_may_queue(q, rw_flags);
752 if (may_queue == ELV_MQUEUE_NO) 755 if (may_queue == ELV_MQUEUE_NO)
753 goto rq_starved; 756 goto rq_starved;
754 757
755 if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { 758 if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
756 if (rl->count[rw]+1 >= q->nr_requests) { 759 if (rl->count[rw]+1 >= q->nr_requests) {
757 ioc = current_io_context(GFP_ATOMIC, q->node); 760 ioc = current_io_context(GFP_ATOMIC, q->node);
758 /* 761 /*
759 * The queue will fill after this allocation, so set 762 * The queue will fill after this allocation, so set
760 * it as full, and mark this process as "batching". 763 * it as full, and mark this process as "batching".
761 * This process will be allowed to complete a batch of 764 * This process will be allowed to complete a batch of
762 * requests, others will be blocked. 765 * requests, others will be blocked.
763 */ 766 */
764 if (!blk_queue_full(q, rw)) { 767 if (!blk_queue_full(q, rw)) {
765 ioc_set_batching(q, ioc); 768 ioc_set_batching(q, ioc);
766 blk_set_queue_full(q, rw); 769 blk_set_queue_full(q, rw);
767 } else { 770 } else {
768 if (may_queue != ELV_MQUEUE_MUST 771 if (may_queue != ELV_MQUEUE_MUST
769 && !ioc_batching(q, ioc)) { 772 && !ioc_batching(q, ioc)) {
770 /* 773 /*
771 * The queue is full and the allocating 774 * The queue is full and the allocating
772 * process is not a "batcher", and not 775 * process is not a "batcher", and not
773 * exempted by the IO scheduler 776 * exempted by the IO scheduler
774 */ 777 */
775 goto out; 778 goto out;
776 } 779 }
777 } 780 }
778 } 781 }
779 blk_set_queue_congested(q, rw); 782 blk_set_queue_congested(q, rw);
780 } 783 }
781 784
782 /* 785 /*
783 * Only allow batching queuers to allocate up to 50% over the defined 786 * Only allow batching queuers to allocate up to 50% over the defined
784 * limit of requests, otherwise we could have thousands of requests 787 * limit of requests, otherwise we could have thousands of requests
785 * allocated with any setting of ->nr_requests 788 * allocated with any setting of ->nr_requests
786 */ 789 */
787 if (rl->count[rw] >= (3 * q->nr_requests / 2)) 790 if (rl->count[rw] >= (3 * q->nr_requests / 2))
788 goto out; 791 goto out;
789 792
790 rl->count[rw]++; 793 rl->count[rw]++;
791 rl->starved[rw] = 0; 794 rl->starved[rw] = 0;
792 795
793 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 796 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
794 if (priv) 797 if (priv)
795 rl->elvpriv++; 798 rl->elvpriv++;
796 799
797 spin_unlock_irq(q->queue_lock); 800 spin_unlock_irq(q->queue_lock);
798 801
799 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); 802 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
800 if (unlikely(!rq)) { 803 if (unlikely(!rq)) {
801 /* 804 /*
802 * Allocation failed presumably due to memory. Undo anything 805 * Allocation failed presumably due to memory. Undo anything
803 * we might have messed up. 806 * we might have messed up.
804 * 807 *
805 * Allocating task should really be put onto the front of the 808 * Allocating task should really be put onto the front of the
806 * wait queue, but this is pretty rare. 809 * wait queue, but this is pretty rare.
807 */ 810 */
808 spin_lock_irq(q->queue_lock); 811 spin_lock_irq(q->queue_lock);
809 freed_request(q, rw, priv); 812 freed_request(q, rw, priv);
810 813
811 /* 814 /*
812 * in the very unlikely event that allocation failed and no 815 * in the very unlikely event that allocation failed and no
813 * requests for this direction was pending, mark us starved 816 * requests for this direction was pending, mark us starved
814 * so that freeing of a request in the other direction will 817 * so that freeing of a request in the other direction will
815 * notice us. another possible fix would be to split the 818 * notice us. another possible fix would be to split the
816 * rq mempool into READ and WRITE 819 * rq mempool into READ and WRITE
817 */ 820 */
818 rq_starved: 821 rq_starved:
819 if (unlikely(rl->count[rw] == 0)) 822 if (unlikely(rl->count[rw] == 0))
820 rl->starved[rw] = 1; 823 rl->starved[rw] = 1;
821 824
822 goto out; 825 goto out;
823 } 826 }
824 827
825 /* 828 /*
826 * ioc may be NULL here, and ioc_batching will be false. That's 829 * ioc may be NULL here, and ioc_batching will be false. That's
827 * OK, if the queue is under the request limit then requests need 830 * OK, if the queue is under the request limit then requests need
828 * not count toward the nr_batch_requests limit. There will always 831 * not count toward the nr_batch_requests limit. There will always
829 * be some limit enforced by BLK_BATCH_TIME. 832 * be some limit enforced by BLK_BATCH_TIME.
830 */ 833 */
831 if (ioc_batching(q, ioc)) 834 if (ioc_batching(q, ioc))
832 ioc->nr_batch_requests--; 835 ioc->nr_batch_requests--;
833 836
834 trace_block_getrq(q, bio, rw); 837 trace_block_getrq(q, bio, rw);
835 out: 838 out:
836 return rq; 839 return rq;
837 } 840 }
838 841
839 /* 842 /*
840 * No available requests for this queue, unplug the device and wait for some 843 * No available requests for this queue, unplug the device and wait for some
841 * requests to become available. 844 * requests to become available.
842 * 845 *
843 * Called with q->queue_lock held, and returns with it unlocked. 846 * Called with q->queue_lock held, and returns with it unlocked.
844 */ 847 */
845 static struct request *get_request_wait(struct request_queue *q, int rw_flags, 848 static struct request *get_request_wait(struct request_queue *q, int rw_flags,
846 struct bio *bio) 849 struct bio *bio)
847 { 850 {
848 const int rw = rw_flags & 0x01; 851 const int rw = rw_flags & 0x01;
849 struct request *rq; 852 struct request *rq;
850 853
851 rq = get_request(q, rw_flags, bio, GFP_NOIO); 854 rq = get_request(q, rw_flags, bio, GFP_NOIO);
852 while (!rq) { 855 while (!rq) {
853 DEFINE_WAIT(wait); 856 DEFINE_WAIT(wait);
854 struct io_context *ioc; 857 struct io_context *ioc;
855 struct request_list *rl = &q->rq; 858 struct request_list *rl = &q->rq;
856 859
857 prepare_to_wait_exclusive(&rl->wait[rw], &wait, 860 prepare_to_wait_exclusive(&rl->wait[rw], &wait,
858 TASK_UNINTERRUPTIBLE); 861 TASK_UNINTERRUPTIBLE);
859 862
860 trace_block_sleeprq(q, bio, rw); 863 trace_block_sleeprq(q, bio, rw);
861 864
862 __generic_unplug_device(q); 865 __generic_unplug_device(q);
863 spin_unlock_irq(q->queue_lock); 866 spin_unlock_irq(q->queue_lock);
864 io_schedule(); 867 io_schedule();
865 868
866 /* 869 /*
867 * After sleeping, we become a "batching" process and 870 * After sleeping, we become a "batching" process and
868 * will be able to allocate at least one request, and 871 * will be able to allocate at least one request, and
869 * up to a big batch of them for a small period time. 872 * up to a big batch of them for a small period time.
870 * See ioc_batching, ioc_set_batching 873 * See ioc_batching, ioc_set_batching
871 */ 874 */
872 ioc = current_io_context(GFP_NOIO, q->node); 875 ioc = current_io_context(GFP_NOIO, q->node);
873 ioc_set_batching(q, ioc); 876 ioc_set_batching(q, ioc);
874 877
875 spin_lock_irq(q->queue_lock); 878 spin_lock_irq(q->queue_lock);
876 finish_wait(&rl->wait[rw], &wait); 879 finish_wait(&rl->wait[rw], &wait);
877 880
878 rq = get_request(q, rw_flags, bio, GFP_NOIO); 881 rq = get_request(q, rw_flags, bio, GFP_NOIO);
879 }; 882 };
880 883
881 return rq; 884 return rq;
882 } 885 }
883 886
884 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 887 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
885 { 888 {
886 struct request *rq; 889 struct request *rq;
887 890
888 BUG_ON(rw != READ && rw != WRITE); 891 BUG_ON(rw != READ && rw != WRITE);
889 892
890 spin_lock_irq(q->queue_lock); 893 spin_lock_irq(q->queue_lock);
891 if (gfp_mask & __GFP_WAIT) { 894 if (gfp_mask & __GFP_WAIT) {
892 rq = get_request_wait(q, rw, NULL); 895 rq = get_request_wait(q, rw, NULL);
893 } else { 896 } else {
894 rq = get_request(q, rw, NULL, gfp_mask); 897 rq = get_request(q, rw, NULL, gfp_mask);
895 if (!rq) 898 if (!rq)
896 spin_unlock_irq(q->queue_lock); 899 spin_unlock_irq(q->queue_lock);
897 } 900 }
898 /* q->queue_lock is unlocked at this point */ 901 /* q->queue_lock is unlocked at this point */
899 902
900 return rq; 903 return rq;
901 } 904 }
902 EXPORT_SYMBOL(blk_get_request); 905 EXPORT_SYMBOL(blk_get_request);
903 906
904 /** 907 /**
905 * blk_start_queueing - initiate dispatch of requests to device 908 * blk_start_queueing - initiate dispatch of requests to device
906 * @q: request queue to kick into gear 909 * @q: request queue to kick into gear
907 * 910 *
908 * This is basically a helper to remove the need to know whether a queue 911 * This is basically a helper to remove the need to know whether a queue
909 * is plugged or not if someone just wants to initiate dispatch of requests 912 * is plugged or not if someone just wants to initiate dispatch of requests
910 * for this queue. Should be used to start queueing on a device outside 913 * for this queue. Should be used to start queueing on a device outside
911 * of ->request_fn() context. Also see @blk_run_queue. 914 * of ->request_fn() context. Also see @blk_run_queue.
912 * 915 *
913 * The queue lock must be held with interrupts disabled. 916 * The queue lock must be held with interrupts disabled.
914 */ 917 */
915 void blk_start_queueing(struct request_queue *q) 918 void blk_start_queueing(struct request_queue *q)
916 { 919 {
917 if (!blk_queue_plugged(q)) { 920 if (!blk_queue_plugged(q)) {
918 if (unlikely(blk_queue_stopped(q))) 921 if (unlikely(blk_queue_stopped(q)))
919 return; 922 return;
920 q->request_fn(q); 923 q->request_fn(q);
921 } else 924 } else
922 __generic_unplug_device(q); 925 __generic_unplug_device(q);
923 } 926 }
924 EXPORT_SYMBOL(blk_start_queueing); 927 EXPORT_SYMBOL(blk_start_queueing);
925 928
926 /** 929 /**
927 * blk_requeue_request - put a request back on queue 930 * blk_requeue_request - put a request back on queue
928 * @q: request queue where request should be inserted 931 * @q: request queue where request should be inserted
929 * @rq: request to be inserted 932 * @rq: request to be inserted
930 * 933 *
931 * Description: 934 * Description:
932 * Drivers often keep queueing requests until the hardware cannot accept 935 * Drivers often keep queueing requests until the hardware cannot accept
933 * more, when that condition happens we need to put the request back 936 * more, when that condition happens we need to put the request back
934 * on the queue. Must be called with queue lock held. 937 * on the queue. Must be called with queue lock held.
935 */ 938 */
936 void blk_requeue_request(struct request_queue *q, struct request *rq) 939 void blk_requeue_request(struct request_queue *q, struct request *rq)
937 { 940 {
938 blk_delete_timer(rq); 941 blk_delete_timer(rq);
939 blk_clear_rq_complete(rq); 942 blk_clear_rq_complete(rq);
940 trace_block_rq_requeue(q, rq); 943 trace_block_rq_requeue(q, rq);
941 944
942 if (blk_rq_tagged(rq)) 945 if (blk_rq_tagged(rq))
943 blk_queue_end_tag(q, rq); 946 blk_queue_end_tag(q, rq);
944 947
945 elv_requeue_request(q, rq); 948 elv_requeue_request(q, rq);
946 } 949 }
947 EXPORT_SYMBOL(blk_requeue_request); 950 EXPORT_SYMBOL(blk_requeue_request);
948 951
949 /** 952 /**
950 * blk_insert_request - insert a special request into a request queue 953 * blk_insert_request - insert a special request into a request queue
951 * @q: request queue where request should be inserted 954 * @q: request queue where request should be inserted
952 * @rq: request to be inserted 955 * @rq: request to be inserted
953 * @at_head: insert request at head or tail of queue 956 * @at_head: insert request at head or tail of queue
954 * @data: private data 957 * @data: private data
955 * 958 *
956 * Description: 959 * Description:
957 * Many block devices need to execute commands asynchronously, so they don't 960 * Many block devices need to execute commands asynchronously, so they don't
958 * block the whole kernel from preemption during request execution. This is 961 * block the whole kernel from preemption during request execution. This is
959 * accomplished normally by inserting artificial requests tagged as 962 * accomplished normally by inserting artificial requests tagged as
960 * REQ_TYPE_SPECIAL into the corresponding request queue, and letting them 963 * REQ_TYPE_SPECIAL into the corresponding request queue, and letting them
961 * be scheduled for actual execution by the request queue. 964 * be scheduled for actual execution by the request queue.
962 * 965 *
963 * We have the option of inserting at the head or the tail of the queue. 966 * We have the option of inserting at the head or the tail of the queue.
964 * Typically we use the tail for new ioctls and so forth. We use the head 967 * Typically we use the tail for new ioctls and so forth. We use the head
965 * of the queue for things like a QUEUE_FULL message from a device, or a 968 * of the queue for things like a QUEUE_FULL message from a device, or a
966 * host that is unable to accept a particular command. 969 * host that is unable to accept a particular command.
967 */ 970 */
968 void blk_insert_request(struct request_queue *q, struct request *rq, 971 void blk_insert_request(struct request_queue *q, struct request *rq,
969 int at_head, void *data) 972 int at_head, void *data)
970 { 973 {
971 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 974 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
972 unsigned long flags; 975 unsigned long flags;
973 976
974 /* 977 /*
975 * tell I/O scheduler that this isn't a regular read/write (ie it 978 * tell I/O scheduler that this isn't a regular read/write (ie it
976 * must not attempt merges on this) and that it acts as a soft 979 * must not attempt merges on this) and that it acts as a soft
977 * barrier 980 * barrier
978 */ 981 */
979 rq->cmd_type = REQ_TYPE_SPECIAL; 982 rq->cmd_type = REQ_TYPE_SPECIAL;
980 rq->cmd_flags |= REQ_SOFTBARRIER; 983 rq->cmd_flags |= REQ_SOFTBARRIER;
981 984
982 rq->special = data; 985 rq->special = data;
983 986
984 spin_lock_irqsave(q->queue_lock, flags); 987 spin_lock_irqsave(q->queue_lock, flags);
985 988
986 /* 989 /*
987 * If command is tagged, release the tag 990 * If command is tagged, release the tag
988 */ 991 */
989 if (blk_rq_tagged(rq)) 992 if (blk_rq_tagged(rq))
990 blk_queue_end_tag(q, rq); 993 blk_queue_end_tag(q, rq);
991 994
992 drive_stat_acct(rq, 1); 995 drive_stat_acct(rq, 1);
993 __elv_add_request(q, rq, where, 0); 996 __elv_add_request(q, rq, where, 0);
994 blk_start_queueing(q); 997 blk_start_queueing(q);
995 spin_unlock_irqrestore(q->queue_lock, flags); 998 spin_unlock_irqrestore(q->queue_lock, flags);
996 } 999 }
997 EXPORT_SYMBOL(blk_insert_request); 1000 EXPORT_SYMBOL(blk_insert_request);
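A hedged sketch of the pattern described in the comment above: a driver allocates a request, then lets blk_insert_request() tag it REQ_TYPE_SPECIAL and push it to the head of the queue (e.g. after a QUEUE_FULL-style condition). Names prefixed example_ are invented for illustration.

/* illustrative sketch -- not part of this change */
static int example_issue_special(struct request_queue *q, void *drv_data)
{
	struct request *rq;

	/* may sleep; must not be called with the queue lock held */
	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	/* marks rq REQ_TYPE_SPECIAL, stores drv_data in rq->special,
	 * queues it at the head and kicks the queue */
	blk_insert_request(q, rq, 1, drv_data);
	return 0;
}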
998 1001
999 /* 1002 /*
1000 * add-request adds a request to the linked list. 1003 * add-request adds a request to the linked list.
1001 * queue lock is held and interrupts disabled, as we muck with the 1004 * queue lock is held and interrupts disabled, as we muck with the
1002 * request queue list. 1005 * request queue list.
1003 */ 1006 */
1004 static inline void add_request(struct request_queue *q, struct request *req) 1007 static inline void add_request(struct request_queue *q, struct request *req)
1005 { 1008 {
1006 drive_stat_acct(req, 1); 1009 drive_stat_acct(req, 1);
1007 1010
1008 /* 1011 /*
1009 * elevator indicated where it wants this request to be 1012 * elevator indicated where it wants this request to be
1010 * inserted at elevator_merge time 1013 * inserted at elevator_merge time
1011 */ 1014 */
1012 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); 1015 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
1013 } 1016 }
1014 1017
1015 static void part_round_stats_single(int cpu, struct hd_struct *part, 1018 static void part_round_stats_single(int cpu, struct hd_struct *part,
1016 unsigned long now) 1019 unsigned long now)
1017 { 1020 {
1018 if (now == part->stamp) 1021 if (now == part->stamp)
1019 return; 1022 return;
1020 1023
1021 if (part->in_flight) { 1024 if (part->in_flight) {
1022 __part_stat_add(cpu, part, time_in_queue, 1025 __part_stat_add(cpu, part, time_in_queue,
1023 part->in_flight * (now - part->stamp)); 1026 part->in_flight * (now - part->stamp));
1024 __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); 1027 __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
1025 } 1028 }
1026 part->stamp = now; 1029 part->stamp = now;
1027 } 1030 }
1028 1031
1029 /** 1032 /**
1030 * part_round_stats() - Round off the performance stats on a struct disk_stats. 1033 * part_round_stats() - Round off the performance stats on a struct disk_stats.
1031 * @cpu: cpu number for stats access 1034 * @cpu: cpu number for stats access
1032 * @part: target partition 1035 * @part: target partition
1033 * 1036 *
1034 * The average IO queue length and utilisation statistics are maintained 1037 * The average IO queue length and utilisation statistics are maintained
1035 * by observing the current state of the queue length and the amount of 1038 * by observing the current state of the queue length and the amount of
1036 * time it has been in this state for. 1039 * time it has been in this state for.
1037 * 1040 *
1038 * Normally, that accounting is done on IO completion, but that can result 1041 * Normally, that accounting is done on IO completion, but that can result
1039 * in more than a second's worth of IO being accounted for within any one 1042 * in more than a second's worth of IO being accounted for within any one
1040 * second, leading to >100% utilisation. To deal with that, we call this 1043 * second, leading to >100% utilisation. To deal with that, we call this
1041 * function to do a round-off before returning the results when reading 1044 * function to do a round-off before returning the results when reading
1042 * /proc/diskstats. This accounts immediately for all queue usage up to 1045 * /proc/diskstats. This accounts immediately for all queue usage up to
1043 * the current jiffies and restarts the counters again. 1046 * the current jiffies and restarts the counters again.
1044 */ 1047 */
1045 void part_round_stats(int cpu, struct hd_struct *part) 1048 void part_round_stats(int cpu, struct hd_struct *part)
1046 { 1049 {
1047 unsigned long now = jiffies; 1050 unsigned long now = jiffies;
1048 1051
1049 if (part->partno) 1052 if (part->partno)
1050 part_round_stats_single(cpu, &part_to_disk(part)->part0, now); 1053 part_round_stats_single(cpu, &part_to_disk(part)->part0, now);
1051 part_round_stats_single(cpu, part, now); 1054 part_round_stats_single(cpu, part, now);
1052 } 1055 }
1053 EXPORT_SYMBOL_GPL(part_round_stats); 1056 EXPORT_SYMBOL_GPL(part_round_stats);
1054 1057
1055 /* 1058 /*
1056 * queue lock must be held 1059 * queue lock must be held
1057 */ 1060 */
1058 void __blk_put_request(struct request_queue *q, struct request *req) 1061 void __blk_put_request(struct request_queue *q, struct request *req)
1059 { 1062 {
1060 if (unlikely(!q)) 1063 if (unlikely(!q))
1061 return; 1064 return;
1062 if (unlikely(--req->ref_count)) 1065 if (unlikely(--req->ref_count))
1063 return; 1066 return;
1064 1067
1065 elv_completed_request(q, req); 1068 elv_completed_request(q, req);
1066 1069
1067 /* 1070 /*
1068 * Request may not have originated from ll_rw_blk. If not, 1071 * Request may not have originated from ll_rw_blk. If not,
1069 * it didn't come out of our reserved rq pools 1072 * it didn't come out of our reserved rq pools
1070 */ 1073 */
1071 if (req->cmd_flags & REQ_ALLOCED) { 1074 if (req->cmd_flags & REQ_ALLOCED) {
1072 int rw = rq_data_dir(req); 1075 int rw = rq_data_dir(req);
1073 int priv = req->cmd_flags & REQ_ELVPRIV; 1076 int priv = req->cmd_flags & REQ_ELVPRIV;
1074 1077
1075 BUG_ON(!list_empty(&req->queuelist)); 1078 BUG_ON(!list_empty(&req->queuelist));
1076 BUG_ON(!hlist_unhashed(&req->hash)); 1079 BUG_ON(!hlist_unhashed(&req->hash));
1077 1080
1078 blk_free_request(q, req); 1081 blk_free_request(q, req);
1079 freed_request(q, rw, priv); 1082 freed_request(q, rw, priv);
1080 } 1083 }
1081 } 1084 }
1082 EXPORT_SYMBOL_GPL(__blk_put_request); 1085 EXPORT_SYMBOL_GPL(__blk_put_request);
1083 1086
1084 void blk_put_request(struct request *req) 1087 void blk_put_request(struct request *req)
1085 { 1088 {
1086 unsigned long flags; 1089 unsigned long flags;
1087 struct request_queue *q = req->q; 1090 struct request_queue *q = req->q;
1088 1091
1089 spin_lock_irqsave(q->queue_lock, flags); 1092 spin_lock_irqsave(q->queue_lock, flags);
1090 __blk_put_request(q, req); 1093 __blk_put_request(q, req);
1091 spin_unlock_irqrestore(q->queue_lock, flags); 1094 spin_unlock_irqrestore(q->queue_lock, flags);
1092 } 1095 }
1093 EXPORT_SYMBOL(blk_put_request); 1096 EXPORT_SYMBOL(blk_put_request);
1094 1097
1095 void init_request_from_bio(struct request *req, struct bio *bio) 1098 void init_request_from_bio(struct request *req, struct bio *bio)
1096 { 1099 {
1097 req->cpu = bio->bi_comp_cpu; 1100 req->cpu = bio->bi_comp_cpu;
1098 req->cmd_type = REQ_TYPE_FS; 1101 req->cmd_type = REQ_TYPE_FS;
1099 1102
1100 /* 1103 /*
1101 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) 1104 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
1102 */ 1105 */
1103 if (bio_rw_ahead(bio)) 1106 if (bio_rw_ahead(bio))
1104 req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | 1107 req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
1105 REQ_FAILFAST_DRIVER); 1108 REQ_FAILFAST_DRIVER);
1106 if (bio_failfast_dev(bio)) 1109 if (bio_failfast_dev(bio))
1107 req->cmd_flags |= REQ_FAILFAST_DEV; 1110 req->cmd_flags |= REQ_FAILFAST_DEV;
1108 if (bio_failfast_transport(bio)) 1111 if (bio_failfast_transport(bio))
1109 req->cmd_flags |= REQ_FAILFAST_TRANSPORT; 1112 req->cmd_flags |= REQ_FAILFAST_TRANSPORT;
1110 if (bio_failfast_driver(bio)) 1113 if (bio_failfast_driver(bio))
1111 req->cmd_flags |= REQ_FAILFAST_DRIVER; 1114 req->cmd_flags |= REQ_FAILFAST_DRIVER;
1112 1115
1113 /* 1116 /*
1114 * REQ_BARRIER implies no merging, but let's make it explicit 1117 * REQ_BARRIER implies no merging, but let's make it explicit
1115 */ 1118 */
1116 if (unlikely(bio_discard(bio))) { 1119 if (unlikely(bio_discard(bio))) {
1117 req->cmd_flags |= REQ_DISCARD; 1120 req->cmd_flags |= REQ_DISCARD;
1118 if (bio_barrier(bio)) 1121 if (bio_barrier(bio))
1119 req->cmd_flags |= REQ_SOFTBARRIER; 1122 req->cmd_flags |= REQ_SOFTBARRIER;
1120 req->q->prepare_discard_fn(req->q, req); 1123 req->q->prepare_discard_fn(req->q, req);
1121 } else if (unlikely(bio_barrier(bio))) 1124 } else if (unlikely(bio_barrier(bio)))
1122 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); 1125 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
1123 1126
1124 if (bio_sync(bio)) 1127 if (bio_sync(bio))
1125 req->cmd_flags |= REQ_RW_SYNC; 1128 req->cmd_flags |= REQ_RW_SYNC;
1126 if (bio_rw_meta(bio)) 1129 if (bio_rw_meta(bio))
1127 req->cmd_flags |= REQ_RW_META; 1130 req->cmd_flags |= REQ_RW_META;
1128 1131
1129 req->errors = 0; 1132 req->errors = 0;
1130 req->hard_sector = req->sector = bio->bi_sector; 1133 req->hard_sector = req->sector = bio->bi_sector;
1131 req->ioprio = bio_prio(bio); 1134 req->ioprio = bio_prio(bio);
1132 req->start_time = jiffies; 1135 req->start_time = jiffies;
1133 blk_rq_bio_prep(req->q, req, bio); 1136 blk_rq_bio_prep(req->q, req, bio);
1134 } 1137 }
1135 1138
1136 static int __make_request(struct request_queue *q, struct bio *bio) 1139 static int __make_request(struct request_queue *q, struct bio *bio)
1137 { 1140 {
1138 struct request *req; 1141 struct request *req;
1139 int el_ret, nr_sectors, barrier, discard, err; 1142 int el_ret, nr_sectors, barrier, discard, err;
1140 const unsigned short prio = bio_prio(bio); 1143 const unsigned short prio = bio_prio(bio);
1141 const int sync = bio_sync(bio); 1144 const int sync = bio_sync(bio);
1142 int rw_flags; 1145 int rw_flags;
1143 1146
1144 nr_sectors = bio_sectors(bio); 1147 nr_sectors = bio_sectors(bio);
1145 1148
1146 /* 1149 /*
1147 * low level driver can indicate that it wants pages above a 1150 * low level driver can indicate that it wants pages above a
1148 * certain limit bounced to low memory (ie for highmem, or even 1151 * certain limit bounced to low memory (ie for highmem, or even
1149 * ISA dma in theory) 1152 * ISA dma in theory)
1150 */ 1153 */
1151 blk_queue_bounce(q, &bio); 1154 blk_queue_bounce(q, &bio);
1152 1155
1153 barrier = bio_barrier(bio); 1156 barrier = bio_barrier(bio);
1154 if (unlikely(barrier) && bio_has_data(bio) && 1157 if (unlikely(barrier) && bio_has_data(bio) &&
1155 (q->next_ordered == QUEUE_ORDERED_NONE)) { 1158 (q->next_ordered == QUEUE_ORDERED_NONE)) {
1156 err = -EOPNOTSUPP; 1159 err = -EOPNOTSUPP;
1157 goto end_io; 1160 goto end_io;
1158 } 1161 }
1159 1162
1160 discard = bio_discard(bio); 1163 discard = bio_discard(bio);
1161 if (unlikely(discard) && !q->prepare_discard_fn) { 1164 if (unlikely(discard) && !q->prepare_discard_fn) {
1162 err = -EOPNOTSUPP; 1165 err = -EOPNOTSUPP;
1163 goto end_io; 1166 goto end_io;
1164 } 1167 }
1165 1168
1166 spin_lock_irq(q->queue_lock); 1169 spin_lock_irq(q->queue_lock);
1167 1170
1168 if (unlikely(barrier) || elv_queue_empty(q)) 1171 if (unlikely(barrier) || elv_queue_empty(q))
1169 goto get_rq; 1172 goto get_rq;
1170 1173
1171 el_ret = elv_merge(q, &req, bio); 1174 el_ret = elv_merge(q, &req, bio);
1172 switch (el_ret) { 1175 switch (el_ret) {
1173 case ELEVATOR_BACK_MERGE: 1176 case ELEVATOR_BACK_MERGE:
1174 BUG_ON(!rq_mergeable(req)); 1177 BUG_ON(!rq_mergeable(req));
1175 1178
1176 if (!ll_back_merge_fn(q, req, bio)) 1179 if (!ll_back_merge_fn(q, req, bio))
1177 break; 1180 break;
1178 1181
1179 trace_block_bio_backmerge(q, bio); 1182 trace_block_bio_backmerge(q, bio);
1180 1183
1181 req->biotail->bi_next = bio; 1184 req->biotail->bi_next = bio;
1182 req->biotail = bio; 1185 req->biotail = bio;
1183 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 1186 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
1184 req->ioprio = ioprio_best(req->ioprio, prio); 1187 req->ioprio = ioprio_best(req->ioprio, prio);
1185 if (!blk_rq_cpu_valid(req)) 1188 if (!blk_rq_cpu_valid(req))
1186 req->cpu = bio->bi_comp_cpu; 1189 req->cpu = bio->bi_comp_cpu;
1187 drive_stat_acct(req, 0); 1190 drive_stat_acct(req, 0);
1188 if (!attempt_back_merge(q, req)) 1191 if (!attempt_back_merge(q, req))
1189 elv_merged_request(q, req, el_ret); 1192 elv_merged_request(q, req, el_ret);
1190 goto out; 1193 goto out;
1191 1194
1192 case ELEVATOR_FRONT_MERGE: 1195 case ELEVATOR_FRONT_MERGE:
1193 BUG_ON(!rq_mergeable(req)); 1196 BUG_ON(!rq_mergeable(req));
1194 1197
1195 if (!ll_front_merge_fn(q, req, bio)) 1198 if (!ll_front_merge_fn(q, req, bio))
1196 break; 1199 break;
1197 1200
1198 trace_block_bio_frontmerge(q, bio); 1201 trace_block_bio_frontmerge(q, bio);
1199 1202
1200 bio->bi_next = req->bio; 1203 bio->bi_next = req->bio;
1201 req->bio = bio; 1204 req->bio = bio;
1202 1205
1203 /* 1206 /*
1204 * may not be valid. if the low level driver said 1207 * may not be valid. if the low level driver said
1205 * it didn't need a bounce buffer then it better 1208 * it didn't need a bounce buffer then it better
1206 * not touch req->buffer either... 1209 * not touch req->buffer either...
1207 */ 1210 */
1208 req->buffer = bio_data(bio); 1211 req->buffer = bio_data(bio);
1209 req->current_nr_sectors = bio_cur_sectors(bio); 1212 req->current_nr_sectors = bio_cur_sectors(bio);
1210 req->hard_cur_sectors = req->current_nr_sectors; 1213 req->hard_cur_sectors = req->current_nr_sectors;
1211 req->sector = req->hard_sector = bio->bi_sector; 1214 req->sector = req->hard_sector = bio->bi_sector;
1212 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 1215 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
1213 req->ioprio = ioprio_best(req->ioprio, prio); 1216 req->ioprio = ioprio_best(req->ioprio, prio);
1214 if (!blk_rq_cpu_valid(req)) 1217 if (!blk_rq_cpu_valid(req))
1215 req->cpu = bio->bi_comp_cpu; 1218 req->cpu = bio->bi_comp_cpu;
1216 drive_stat_acct(req, 0); 1219 drive_stat_acct(req, 0);
1217 if (!attempt_front_merge(q, req)) 1220 if (!attempt_front_merge(q, req))
1218 elv_merged_request(q, req, el_ret); 1221 elv_merged_request(q, req, el_ret);
1219 goto out; 1222 goto out;
1220 1223
1221 /* ELV_NO_MERGE: elevator says don't/can't merge. */ 1224 /* ELV_NO_MERGE: elevator says don't/can't merge. */
1222 default: 1225 default:
1223 ; 1226 ;
1224 } 1227 }
1225 1228
1226 get_rq: 1229 get_rq:
1227 /* 1230 /*
1228 * This sync check and mask will be re-done in init_request_from_bio(), 1231 * This sync check and mask will be re-done in init_request_from_bio(),
1229 * but we need to set it earlier to expose the sync flag to the 1232 * but we need to set it earlier to expose the sync flag to the
1230 * rq allocator and io schedulers. 1233 * rq allocator and io schedulers.
1231 */ 1234 */
1232 rw_flags = bio_data_dir(bio); 1235 rw_flags = bio_data_dir(bio);
1233 if (sync) 1236 if (sync)
1234 rw_flags |= REQ_RW_SYNC; 1237 rw_flags |= REQ_RW_SYNC;
1235 1238
1236 /* 1239 /*
1237 * Grab a free request. This might sleep but cannot fail. 1240 * Grab a free request. This might sleep but cannot fail.
1238 * Returns with the queue unlocked. 1241 * Returns with the queue unlocked.
1239 */ 1242 */
1240 req = get_request_wait(q, rw_flags, bio); 1243 req = get_request_wait(q, rw_flags, bio);
1241 1244
1242 /* 1245 /*
1243 * After dropping the lock and possibly sleeping here, our request 1246 * After dropping the lock and possibly sleeping here, our request
1244 * may now be mergeable after it had proven unmergeable (above). 1247 * may now be mergeable after it had proven unmergeable (above).
1245 * We don't worry about that case for efficiency. It won't happen 1248 * We don't worry about that case for efficiency. It won't happen
1246 * often, and the elevators are able to handle it. 1249 * often, and the elevators are able to handle it.
1247 */ 1250 */
1248 init_request_from_bio(req, bio); 1251 init_request_from_bio(req, bio);
1249 1252
1250 spin_lock_irq(q->queue_lock); 1253 spin_lock_irq(q->queue_lock);
1251 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || 1254 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
1252 bio_flagged(bio, BIO_CPU_AFFINE)) 1255 bio_flagged(bio, BIO_CPU_AFFINE))
1253 req->cpu = blk_cpu_to_group(smp_processor_id()); 1256 req->cpu = blk_cpu_to_group(smp_processor_id());
1254 if (elv_queue_empty(q)) 1257 if (elv_queue_empty(q))
1255 blk_plug_device(q); 1258 blk_plug_device(q);
1256 add_request(q, req); 1259 add_request(q, req);
1257 out: 1260 out:
1258 if (sync) 1261 if (sync)
1259 __generic_unplug_device(q); 1262 __generic_unplug_device(q);
1260 spin_unlock_irq(q->queue_lock); 1263 spin_unlock_irq(q->queue_lock);
1261 return 0; 1264 return 0;
1262 1265
1263 end_io: 1266 end_io:
1264 bio_endio(bio, err); 1267 bio_endio(bio, err);
1265 return 0; 1268 return 0;
1266 } 1269 }
1267 1270
1268 /* 1271 /*
1269 * If bio->bi_dev is a partition, remap the location 1272 * If bio->bi_dev is a partition, remap the location
1270 */ 1273 */
1271 static inline void blk_partition_remap(struct bio *bio) 1274 static inline void blk_partition_remap(struct bio *bio)
1272 { 1275 {
1273 struct block_device *bdev = bio->bi_bdev; 1276 struct block_device *bdev = bio->bi_bdev;
1274 1277
1275 if (bio_sectors(bio) && bdev != bdev->bd_contains) { 1278 if (bio_sectors(bio) && bdev != bdev->bd_contains) {
1276 struct hd_struct *p = bdev->bd_part; 1279 struct hd_struct *p = bdev->bd_part;
1277 1280
1278 bio->bi_sector += p->start_sect; 1281 bio->bi_sector += p->start_sect;
1279 bio->bi_bdev = bdev->bd_contains; 1282 bio->bi_bdev = bdev->bd_contains;
1280 1283
1281 trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, 1284 trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
1282 bdev->bd_dev, bio->bi_sector, 1285 bdev->bd_dev, bio->bi_sector,
1283 bio->bi_sector - p->start_sect); 1286 bio->bi_sector - p->start_sect);
1284 } 1287 }
1285 } 1288 }
1286 1289
1287 static void handle_bad_sector(struct bio *bio) 1290 static void handle_bad_sector(struct bio *bio)
1288 { 1291 {
1289 char b[BDEVNAME_SIZE]; 1292 char b[BDEVNAME_SIZE];
1290 1293
1291 printk(KERN_INFO "attempt to access beyond end of device\n"); 1294 printk(KERN_INFO "attempt to access beyond end of device\n");
1292 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", 1295 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
1293 bdevname(bio->bi_bdev, b), 1296 bdevname(bio->bi_bdev, b),
1294 bio->bi_rw, 1297 bio->bi_rw,
1295 (unsigned long long)bio->bi_sector + bio_sectors(bio), 1298 (unsigned long long)bio->bi_sector + bio_sectors(bio),
1296 (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); 1299 (long long)(bio->bi_bdev->bd_inode->i_size >> 9));
1297 1300
1298 set_bit(BIO_EOF, &bio->bi_flags); 1301 set_bit(BIO_EOF, &bio->bi_flags);
1299 } 1302 }
1300 1303
1301 #ifdef CONFIG_FAIL_MAKE_REQUEST 1304 #ifdef CONFIG_FAIL_MAKE_REQUEST
1302 1305
1303 static DECLARE_FAULT_ATTR(fail_make_request); 1306 static DECLARE_FAULT_ATTR(fail_make_request);
1304 1307
1305 static int __init setup_fail_make_request(char *str) 1308 static int __init setup_fail_make_request(char *str)
1306 { 1309 {
1307 return setup_fault_attr(&fail_make_request, str); 1310 return setup_fault_attr(&fail_make_request, str);
1308 } 1311 }
1309 __setup("fail_make_request=", setup_fail_make_request); 1312 __setup("fail_make_request=", setup_fail_make_request);
1310 1313
1311 static int should_fail_request(struct bio *bio) 1314 static int should_fail_request(struct bio *bio)
1312 { 1315 {
1313 struct hd_struct *part = bio->bi_bdev->bd_part; 1316 struct hd_struct *part = bio->bi_bdev->bd_part;
1314 1317
1315 if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail) 1318 if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail)
1316 return should_fail(&fail_make_request, bio->bi_size); 1319 return should_fail(&fail_make_request, bio->bi_size);
1317 1320
1318 return 0; 1321 return 0;
1319 } 1322 }
1320 1323
1321 static int __init fail_make_request_debugfs(void) 1324 static int __init fail_make_request_debugfs(void)
1322 { 1325 {
1323 return init_fault_attr_dentries(&fail_make_request, 1326 return init_fault_attr_dentries(&fail_make_request,
1324 "fail_make_request"); 1327 "fail_make_request");
1325 } 1328 }
1326 1329
1327 late_initcall(fail_make_request_debugfs); 1330 late_initcall(fail_make_request_debugfs);
1328 1331
1329 #else /* CONFIG_FAIL_MAKE_REQUEST */ 1332 #else /* CONFIG_FAIL_MAKE_REQUEST */
1330 1333
1331 static inline int should_fail_request(struct bio *bio) 1334 static inline int should_fail_request(struct bio *bio)
1332 { 1335 {
1333 return 0; 1336 return 0;
1334 } 1337 }
1335 1338
1336 #endif /* CONFIG_FAIL_MAKE_REQUEST */ 1339 #endif /* CONFIG_FAIL_MAKE_REQUEST */
1337 1340
1338 /* 1341 /*
1339 * Check whether this bio extends beyond the end of the device. 1342 * Check whether this bio extends beyond the end of the device.
1340 */ 1343 */
1341 static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) 1344 static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
1342 { 1345 {
1343 sector_t maxsector; 1346 sector_t maxsector;
1344 1347
1345 if (!nr_sectors) 1348 if (!nr_sectors)
1346 return 0; 1349 return 0;
1347 1350
1348 /* Test device or partition size, when known. */ 1351 /* Test device or partition size, when known. */
1349 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 1352 maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
1350 if (maxsector) { 1353 if (maxsector) {
1351 sector_t sector = bio->bi_sector; 1354 sector_t sector = bio->bi_sector;
1352 1355
1353 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { 1356 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
1354 /* 1357 /*
1355 * This may well happen - the kernel calls bread() 1358 * This may well happen - the kernel calls bread()
1356 * without checking the size of the device, e.g., when 1359 * without checking the size of the device, e.g., when
1357 * mounting a device. 1360 * mounting a device.
1358 */ 1361 */
1359 handle_bad_sector(bio); 1362 handle_bad_sector(bio);
1360 return 1; 1363 return 1;
1361 } 1364 }
1362 } 1365 }
1363 1366
1364 return 0; 1367 return 0;
1365 } 1368 }
1366 1369
1367 /** 1370 /**
1368 * generic_make_request - hand a buffer to its device driver for I/O 1371 * generic_make_request - hand a buffer to its device driver for I/O
1369 * @bio: The bio describing the location in memory and on the device. 1372 * @bio: The bio describing the location in memory and on the device.
1370 * 1373 *
1371 * generic_make_request() is used to make I/O requests of block 1374 * generic_make_request() is used to make I/O requests of block
1372 * devices. It is passed a &struct bio, which describes the I/O that needs 1375 * devices. It is passed a &struct bio, which describes the I/O that needs
1373 * to be done. 1376 * to be done.
1374 * 1377 *
1375 * generic_make_request() does not return any status. The 1378 * generic_make_request() does not return any status. The
1376 * success/failure status of the request, along with notification of 1379 * success/failure status of the request, along with notification of
1377 * completion, is delivered asynchronously through the bio->bi_end_io 1380 * completion, is delivered asynchronously through the bio->bi_end_io
1378 * function described (one day) elsewhere. 1381 * function described (one day) elsewhere.
1379 * 1382 *
1380 * The caller of generic_make_request must make sure that bi_io_vec 1383 * The caller of generic_make_request must make sure that bi_io_vec
1381 * are set to describe the memory buffer, and that bi_dev and bi_sector are 1384 * are set to describe the memory buffer, and that bi_dev and bi_sector are
1382 * set to describe the device address, and the 1385 * set to describe the device address, and the
1383 * bi_end_io and optionally bi_private are set to describe how 1386 * bi_end_io and optionally bi_private are set to describe how
1384 * completion notification should be signaled. 1387 * completion notification should be signaled.
1385 * 1388 *
1386 * generic_make_request and the drivers it calls may use bi_next if this 1389 * generic_make_request and the drivers it calls may use bi_next if this
1387 * bio happens to be merged with someone else, and may change bi_dev and 1390 * bio happens to be merged with someone else, and may change bi_dev and
1388 * bi_sector for remaps as it sees fit. So the values of these fields 1391 * bi_sector for remaps as it sees fit. So the values of these fields
1389 * should NOT be depended on after the call to generic_make_request. 1392 * should NOT be depended on after the call to generic_make_request.
1390 */ 1393 */
1391 static inline void __generic_make_request(struct bio *bio) 1394 static inline void __generic_make_request(struct bio *bio)
1392 { 1395 {
1393 struct request_queue *q; 1396 struct request_queue *q;
1394 sector_t old_sector; 1397 sector_t old_sector;
1395 int ret, nr_sectors = bio_sectors(bio); 1398 int ret, nr_sectors = bio_sectors(bio);
1396 dev_t old_dev; 1399 dev_t old_dev;
1397 int err = -EIO; 1400 int err = -EIO;
1398 1401
1399 might_sleep(); 1402 might_sleep();
1400 1403
1401 if (bio_check_eod(bio, nr_sectors)) 1404 if (bio_check_eod(bio, nr_sectors))
1402 goto end_io; 1405 goto end_io;
1403 1406
1404 /* 1407 /*
1405 * Resolve the mapping until finished. (drivers are 1408 * Resolve the mapping until finished. (drivers are
1406 * still free to implement/resolve their own stacking 1409 * still free to implement/resolve their own stacking
1407 * by explicitly returning 0) 1410 * by explicitly returning 0)
1408 * 1411 *
1409 * NOTE: we don't repeat the blk_size check for each new device. 1412 * NOTE: we don't repeat the blk_size check for each new device.
1410 * Stacking drivers are expected to know what they are doing. 1413 * Stacking drivers are expected to know what they are doing.
1411 */ 1414 */
1412 old_sector = -1; 1415 old_sector = -1;
1413 old_dev = 0; 1416 old_dev = 0;
1414 do { 1417 do {
1415 char b[BDEVNAME_SIZE]; 1418 char b[BDEVNAME_SIZE];
1416 1419
1417 q = bdev_get_queue(bio->bi_bdev); 1420 q = bdev_get_queue(bio->bi_bdev);
1418 if (!q) { 1421 if (!q) {
1419 printk(KERN_ERR 1422 printk(KERN_ERR
1420 "generic_make_request: Trying to access " 1423 "generic_make_request: Trying to access "
1421 "nonexistent block-device %s (%Lu)\n", 1424 "nonexistent block-device %s (%Lu)\n",
1422 bdevname(bio->bi_bdev, b), 1425 bdevname(bio->bi_bdev, b),
1423 (long long) bio->bi_sector); 1426 (long long) bio->bi_sector);
1424 end_io: 1427 end_io:
1425 bio_endio(bio, err); 1428 bio_endio(bio, err);
1426 break; 1429 break;
1427 } 1430 }
1428 1431
1429 if (unlikely(nr_sectors > q->max_hw_sectors)) { 1432 if (unlikely(nr_sectors > q->max_hw_sectors)) {
1430 printk(KERN_ERR "bio too big device %s (%u > %u)\n", 1433 printk(KERN_ERR "bio too big device %s (%u > %u)\n",
1431 bdevname(bio->bi_bdev, b), 1434 bdevname(bio->bi_bdev, b),
1432 bio_sectors(bio), 1435 bio_sectors(bio),
1433 q->max_hw_sectors); 1436 q->max_hw_sectors);
1434 goto end_io; 1437 goto end_io;
1435 } 1438 }
1436 1439
1437 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 1440 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
1438 goto end_io; 1441 goto end_io;
1439 1442
1440 if (should_fail_request(bio)) 1443 if (should_fail_request(bio))
1441 goto end_io; 1444 goto end_io;
1442 1445
1443 /* 1446 /*
1444 * If this device has partitions, remap block n 1447 * If this device has partitions, remap block n
1445 * of partition p to block n+start(p) of the disk. 1448 * of partition p to block n+start(p) of the disk.
1446 */ 1449 */
1447 blk_partition_remap(bio); 1450 blk_partition_remap(bio);
1448 1451
1449 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) 1452 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
1450 goto end_io; 1453 goto end_io;
1451 1454
1452 if (old_sector != -1) 1455 if (old_sector != -1)
1453 trace_block_remap(q, bio, old_dev, bio->bi_sector, 1456 trace_block_remap(q, bio, old_dev, bio->bi_sector,
1454 old_sector); 1457 old_sector);
1455 1458
1456 trace_block_bio_queue(q, bio); 1459 trace_block_bio_queue(q, bio);
1457 1460
1458 old_sector = bio->bi_sector; 1461 old_sector = bio->bi_sector;
1459 old_dev = bio->bi_bdev->bd_dev; 1462 old_dev = bio->bi_bdev->bd_dev;
1460 1463
1461 if (bio_check_eod(bio, nr_sectors)) 1464 if (bio_check_eod(bio, nr_sectors))
1462 goto end_io; 1465 goto end_io;
1463 if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) || 1466 if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) ||
1464 (bio_discard(bio) && !q->prepare_discard_fn)) { 1467 (bio_discard(bio) && !q->prepare_discard_fn)) {
1465 err = -EOPNOTSUPP; 1468 err = -EOPNOTSUPP;
1466 goto end_io; 1469 goto end_io;
1467 } 1470 }
1468 1471
1469 ret = q->make_request_fn(q, bio); 1472 ret = q->make_request_fn(q, bio);
1470 } while (ret); 1473 } while (ret);
1471 } 1474 }
1472 1475
1473 /* 1476 /*
1474 * We only want one ->make_request_fn to be active at a time, 1477 * We only want one ->make_request_fn to be active at a time,
1475 * else stack usage with stacked devices could be a problem. 1478 * else stack usage with stacked devices could be a problem.
1476 * So use current->bio_{list,tail} to keep a list of requests 1479 * So use current->bio_{list,tail} to keep a list of requests
1477 * submitted by a make_request_fn function. 1480 * submitted by a make_request_fn function.
1478 * current->bio_tail is also used as a flag to say if 1481 * current->bio_tail is also used as a flag to say if
1479 * generic_make_request is currently active in this task or not. 1482 * generic_make_request is currently active in this task or not.
1480 * If it is NULL, then no make_request is active. If it is non-NULL, 1483 * If it is NULL, then no make_request is active. If it is non-NULL,
1481 * then a make_request is active, and new requests should be added 1484 * then a make_request is active, and new requests should be added
1482 * at the tail 1485 * at the tail
1483 */ 1486 */
1484 void generic_make_request(struct bio *bio) 1487 void generic_make_request(struct bio *bio)
1485 { 1488 {
1486 if (current->bio_tail) { 1489 if (current->bio_tail) {
1487 /* make_request is active */ 1490 /* make_request is active */
1488 *(current->bio_tail) = bio; 1491 *(current->bio_tail) = bio;
1489 bio->bi_next = NULL; 1492 bio->bi_next = NULL;
1490 current->bio_tail = &bio->bi_next; 1493 current->bio_tail = &bio->bi_next;
1491 return; 1494 return;
1492 } 1495 }
1493 /* following loop may be a bit non-obvious, and so deserves some 1496 /* following loop may be a bit non-obvious, and so deserves some
1494 * explanation. 1497 * explanation.
1495 * Before entering the loop, bio->bi_next is NULL (as all callers 1498 * Before entering the loop, bio->bi_next is NULL (as all callers
1496 * ensure that) so we have a list with a single bio. 1499 * ensure that) so we have a list with a single bio.
1497 * We pretend that we have just taken it off a longer list, so 1500 * We pretend that we have just taken it off a longer list, so
1498 * we assign bio_list to the next (which is NULL) and bio_tail 1501 * we assign bio_list to the next (which is NULL) and bio_tail
1499 * to &bio_list, thus initialising the bio_list of new bios to be 1502 * to &bio_list, thus initialising the bio_list of new bios to be
1500 * added. __generic_make_request may indeed add some more bios 1503 * added. __generic_make_request may indeed add some more bios
1501 * through a recursive call to generic_make_request. If it 1504 * through a recursive call to generic_make_request. If it
1502 * did, we find a non-NULL value in bio_list and re-enter the loop 1505 * did, we find a non-NULL value in bio_list and re-enter the loop
1503 * from the top. In this case we really did just take the bio 1506 * from the top. In this case we really did just take the bio
1504 * of the top of the list (no pretending) and so fixup bio_list and 1507 * of the top of the list (no pretending) and so fixup bio_list and
1505 * bio_tail or bi_next, and call into __generic_make_request again. 1508 * bio_tail or bi_next, and call into __generic_make_request again.
1506 * 1509 *
1507 * The loop was structured like this to make only one call to 1510 * The loop was structured like this to make only one call to
1508 * __generic_make_request (which is important as it is large and 1511 * __generic_make_request (which is important as it is large and
1509 * inlined) and to keep the structure simple. 1512 * inlined) and to keep the structure simple.
1510 */ 1513 */
1511 BUG_ON(bio->bi_next); 1514 BUG_ON(bio->bi_next);
1512 do { 1515 do {
1513 current->bio_list = bio->bi_next; 1516 current->bio_list = bio->bi_next;
1514 if (bio->bi_next == NULL) 1517 if (bio->bi_next == NULL)
1515 current->bio_tail = &current->bio_list; 1518 current->bio_tail = &current->bio_list;
1516 else 1519 else
1517 bio->bi_next = NULL; 1520 bio->bi_next = NULL;
1518 __generic_make_request(bio); 1521 __generic_make_request(bio);
1519 bio = current->bio_list; 1522 bio = current->bio_list;
1520 } while (bio); 1523 } while (bio);
1521 current->bio_tail = NULL; /* deactivate */ 1524 current->bio_tail = NULL; /* deactivate */
1522 } 1525 }
1523 EXPORT_SYMBOL(generic_make_request); 1526 EXPORT_SYMBOL(generic_make_request);
1524 1527
1525 /** 1528 /**
1526 * submit_bio - submit a bio to the block device layer for I/O 1529 * submit_bio - submit a bio to the block device layer for I/O
1527 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) 1530 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
1528 * @bio: The &struct bio which describes the I/O 1531 * @bio: The &struct bio which describes the I/O
1529 * 1532 *
1530 * submit_bio() is very similar in purpose to generic_make_request(), and 1533 * submit_bio() is very similar in purpose to generic_make_request(), and
1531 * uses that function to do most of the work. Both are fairly rough 1534 * uses that function to do most of the work. Both are fairly rough
1532 * interfaces; @bio must be presetup and ready for I/O. 1535 * interfaces; @bio must be presetup and ready for I/O.
1533 * 1536 *
1534 */ 1537 */
1535 void submit_bio(int rw, struct bio *bio) 1538 void submit_bio(int rw, struct bio *bio)
1536 { 1539 {
1537 int count = bio_sectors(bio); 1540 int count = bio_sectors(bio);
1538 1541
1539 bio->bi_rw |= rw; 1542 bio->bi_rw |= rw;
1540 1543
1541 /* 1544 /*
1542 * If it's a regular read/write or a barrier with data attached, 1545 * If it's a regular read/write or a barrier with data attached,
1543 * go through the normal accounting stuff before submission. 1546 * go through the normal accounting stuff before submission.
1544 */ 1547 */
1545 if (bio_has_data(bio)) { 1548 if (bio_has_data(bio)) {
1546 if (rw & WRITE) { 1549 if (rw & WRITE) {
1547 count_vm_events(PGPGOUT, count); 1550 count_vm_events(PGPGOUT, count);
1548 } else { 1551 } else {
1549 task_io_account_read(bio->bi_size); 1552 task_io_account_read(bio->bi_size);
1550 count_vm_events(PGPGIN, count); 1553 count_vm_events(PGPGIN, count);
1551 } 1554 }
1552 1555
1553 if (unlikely(block_dump)) { 1556 if (unlikely(block_dump)) {
1554 char b[BDEVNAME_SIZE]; 1557 char b[BDEVNAME_SIZE];
1555 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 1558 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
1556 current->comm, task_pid_nr(current), 1559 current->comm, task_pid_nr(current),
1557 (rw & WRITE) ? "WRITE" : "READ", 1560 (rw & WRITE) ? "WRITE" : "READ",
1558 (unsigned long long)bio->bi_sector, 1561 (unsigned long long)bio->bi_sector,
1559 bdevname(bio->bi_bdev, b)); 1562 bdevname(bio->bi_bdev, b));
1560 } 1563 }
1561 } 1564 }
1562 1565
1563 generic_make_request(bio); 1566 generic_make_request(bio);
1564 } 1567 }
1565 EXPORT_SYMBOL(submit_bio); 1568 EXPORT_SYMBOL(submit_bio);
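For illustration only: a minimal read using the fields generic_make_request() documents as mandatory -- bi_bdev, bi_sector, bi_io_vec (via bio_add_page()) and bi_end_io -- handed off through submit_bio(). The example_ names are hypothetical and error handling is kept minimal; the usual <linux/bio.h>/<linux/blkdev.h> includes are assumed.

/* illustrative sketch -- not part of this change */
static void example_end_io(struct bio *bio, int error)
{
	/* completion runs asynchronously; error is 0 or a negative errno */
	bio_put(bio);
}

static int example_read_sector(struct block_device *bdev, sector_t sector,
			       struct page *page)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);

	if (!bio)
		return -ENOMEM;
	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = example_end_io;
	if (bio_add_page(bio, page, 512, 0) != 512) {
		bio_put(bio);
		return -EIO;
	}
	submit_bio(READ, bio);
	return 0;
}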
1566 1569
1567 /** 1570 /**
1568 * blk_rq_check_limits - Helper function to check a request for the queue limit 1571 * blk_rq_check_limits - Helper function to check a request for the queue limit
1569 * @q: the queue 1572 * @q: the queue
1570 * @rq: the request being checked 1573 * @rq: the request being checked
1571 * 1574 *
1572 * Description: 1575 * Description:
1573 * @rq may have been made based on weaker limitations of upper-level queues 1576 * @rq may have been made based on weaker limitations of upper-level queues
1574 * in request stacking drivers, and it may violate the limitation of @q. 1577 * in request stacking drivers, and it may violate the limitation of @q.
1575 * Since the block layer and the underlying device driver trust @rq 1578 * Since the block layer and the underlying device driver trust @rq
1576 * after it is inserted to @q, it should be checked against @q before 1579 * after it is inserted to @q, it should be checked against @q before
1577 * the insertion using this generic function. 1580 * the insertion using this generic function.
1578 * 1581 *
1579 * This function should also be useful for request stacking drivers 1582 * This function should also be useful for request stacking drivers
1580 * in some cases below, so export this function. 1583 * in some cases below, so export this function.
1581 * Request stacking drivers like request-based dm may change the queue 1584 * Request stacking drivers like request-based dm may change the queue
1582 * limits while requests are in the queue (e.g. dm's table swapping). 1585 * limits while requests are in the queue (e.g. dm's table swapping).
1583 * Such request stacking drivers should check those requests against 1586 * Such request stacking drivers should check those requests against
1584 * the new queue limits again when they dispatch those requests, 1587 * the new queue limits again when they dispatch those requests,
1585 * although such checks are also done against the old queue limits 1588 * although such checks are also done against the old queue limits
1586 * when submitting requests. 1589 * when submitting requests.
1587 */ 1590 */
1588 int blk_rq_check_limits(struct request_queue *q, struct request *rq) 1591 int blk_rq_check_limits(struct request_queue *q, struct request *rq)
1589 { 1592 {
1590 if (rq->nr_sectors > q->max_sectors || 1593 if (rq->nr_sectors > q->max_sectors ||
1591 rq->data_len > q->max_hw_sectors << 9) { 1594 rq->data_len > q->max_hw_sectors << 9) {
1592 printk(KERN_ERR "%s: over max size limit.\n", __func__); 1595 printk(KERN_ERR "%s: over max size limit.\n", __func__);
1593 return -EIO; 1596 return -EIO;
1594 } 1597 }
1595 1598
1596 /* 1599 /*
1597 * queue's settings related to segment counting like q->bounce_pfn 1600 * queue's settings related to segment counting like q->bounce_pfn
1598 * may differ from that of other stacking queues. 1601 * may differ from that of other stacking queues.
1599 * Recalculate it to check the request correctly on this queue's 1602 * Recalculate it to check the request correctly on this queue's
1600 * limitation. 1603 * limitation.
1601 */ 1604 */
1602 blk_recalc_rq_segments(rq); 1605 blk_recalc_rq_segments(rq);
1603 if (rq->nr_phys_segments > q->max_phys_segments || 1606 if (rq->nr_phys_segments > q->max_phys_segments ||
1604 rq->nr_phys_segments > q->max_hw_segments) { 1607 rq->nr_phys_segments > q->max_hw_segments) {
1605 printk(KERN_ERR "%s: over max segments limit.\n", __func__); 1608 printk(KERN_ERR "%s: over max segments limit.\n", __func__);
1606 return -EIO; 1609 return -EIO;
1607 } 1610 }
1608 1611
1609 return 0; 1612 return 0;
1610 } 1613 }
1611 EXPORT_SYMBOL_GPL(blk_rq_check_limits); 1614 EXPORT_SYMBOL_GPL(blk_rq_check_limits);
1612 1615
1613 /** 1616 /**
1614 * blk_insert_cloned_request - Helper for stacking drivers to submit a request 1617 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
1615 * @q: the queue to submit the request 1618 * @q: the queue to submit the request
1616 * @rq: the request being queued 1619 * @rq: the request being queued
1617 */ 1620 */
1618 int blk_insert_cloned_request(struct request_queue *q, struct request *rq) 1621 int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1619 { 1622 {
1620 unsigned long flags; 1623 unsigned long flags;
1621 1624
1622 if (blk_rq_check_limits(q, rq)) 1625 if (blk_rq_check_limits(q, rq))
1623 return -EIO; 1626 return -EIO;
1624 1627
1625 #ifdef CONFIG_FAIL_MAKE_REQUEST 1628 #ifdef CONFIG_FAIL_MAKE_REQUEST
1626 if (rq->rq_disk && rq->rq_disk->part0.make_it_fail && 1629 if (rq->rq_disk && rq->rq_disk->part0.make_it_fail &&
1627 should_fail(&fail_make_request, blk_rq_bytes(rq))) 1630 should_fail(&fail_make_request, blk_rq_bytes(rq)))
1628 return -EIO; 1631 return -EIO;
1629 #endif 1632 #endif
1630 1633
1631 spin_lock_irqsave(q->queue_lock, flags); 1634 spin_lock_irqsave(q->queue_lock, flags);
1632 1635
1633 /* 1636 /*
1634 * Submitting request must be dequeued before calling this function 1637 * Submitting request must be dequeued before calling this function
1635 * because it will be linked to another request_queue 1638 * because it will be linked to another request_queue
1636 */ 1639 */
1637 BUG_ON(blk_queued_rq(rq)); 1640 BUG_ON(blk_queued_rq(rq));
1638 1641
1639 drive_stat_acct(rq, 1); 1642 drive_stat_acct(rq, 1);
1640 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); 1643 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
1641 1644
1642 spin_unlock_irqrestore(q->queue_lock, flags); 1645 spin_unlock_irqrestore(q->queue_lock, flags);
1643 1646
1644 return 0; 1647 return 0;
1645 } 1648 }
1646 EXPORT_SYMBOL_GPL(blk_insert_cloned_request); 1649 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
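A hedged sketch of how a request-stacking driver (request-based dm is the example the comments above give) might hand an already-prepared clone to a lower-level queue; clone setup and retry policy are elided, and the function name is invented.

/* illustrative sketch -- not part of this change */
static int example_dispatch_clone(struct request_queue *lower_q,
				  struct request *clone)
{
	int ret;

	/* re-checks the clone against lower_q's limits, then queues it */
	ret = blk_insert_cloned_request(lower_q, clone);
	if (ret)
		printk(KERN_ERR "example: clone rejected by lower queue\n");
	return ret;
}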
1647 1650
1648 /** 1651 /**
1649 * blkdev_dequeue_request - dequeue request and start timeout timer 1652 * blkdev_dequeue_request - dequeue request and start timeout timer
1650 * @req: request to dequeue 1653 * @req: request to dequeue
1651 * 1654 *
1652 * Dequeue @req and start timeout timer on it. This hands off the 1655 * Dequeue @req and start timeout timer on it. This hands off the
1653 * request to the driver. 1656 * request to the driver.
1654 * 1657 *
1655 * Block internal functions which don't want to start timer should 1658 * Block internal functions which don't want to start timer should
1656 * call elv_dequeue_request(). 1659 * call elv_dequeue_request().
1657 */ 1660 */
1658 void blkdev_dequeue_request(struct request *req) 1661 void blkdev_dequeue_request(struct request *req)
1659 { 1662 {
1660 elv_dequeue_request(req->q, req); 1663 elv_dequeue_request(req->q, req);
1661 1664
1662 /* 1665 /*
1663 * We are now handing the request to the hardware, add the 1666 * We are now handing the request to the hardware, add the
1664 * timeout handler. 1667 * timeout handler.
1665 */ 1668 */
1666 blk_add_timer(req); 1669 blk_add_timer(req);
1667 } 1670 }
1668 EXPORT_SYMBOL(blkdev_dequeue_request); 1671 EXPORT_SYMBOL(blkdev_dequeue_request);
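A minimal, hypothetical ->request_fn() loop showing where blkdev_dequeue_request() sits: the request comes off the queue (which also arms its timeout) immediately before being handed to the hardware. elv_next_request() is the peek helper of this kernel era; the hardware hand-off itself is left as a comment.

/* illustrative sketch -- not part of this change; runs with q->queue_lock held */
static void example_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		blkdev_dequeue_request(rq);	/* starts the timeout timer */
		/* hand rq to the hardware here; complete it later with
		 * blk_end_request() from the completion path */
	}
}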
1669 1672
1670 /** 1673 /**
1671 * __end_that_request_first - end I/O on a request 1674 * __end_that_request_first - end I/O on a request
1672 * @req: the request being processed 1675 * @req: the request being processed
1673 * @error: %0 for success, < %0 for error 1676 * @error: %0 for success, < %0 for error
1674 * @nr_bytes: number of bytes to complete 1677 * @nr_bytes: number of bytes to complete
1675 * 1678 *
1676 * Description: 1679 * Description:
1677 * Ends I/O on a number of bytes attached to @req, and sets it up 1680 * Ends I/O on a number of bytes attached to @req, and sets it up
1678 * for the next range of segments (if any) in the cluster. 1681 * for the next range of segments (if any) in the cluster.
1679 * 1682 *
1680 * Return: 1683 * Return:
1681 * %0 - we are done with this request, call end_that_request_last() 1684 * %0 - we are done with this request, call end_that_request_last()
1682 * %1 - still buffers pending for this request 1685 * %1 - still buffers pending for this request
1683 **/ 1686 **/
1684 static int __end_that_request_first(struct request *req, int error, 1687 static int __end_that_request_first(struct request *req, int error,
1685 int nr_bytes) 1688 int nr_bytes)
1686 { 1689 {
1687 int total_bytes, bio_nbytes, next_idx = 0; 1690 int total_bytes, bio_nbytes, next_idx = 0;
1688 struct bio *bio; 1691 struct bio *bio;
1689 1692
1690 trace_block_rq_complete(req->q, req); 1693 trace_block_rq_complete(req->q, req);
1691 1694
1692 /* 1695 /*
1693 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual 1696 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
1694 * sense key with us all the way through 1697 * sense key with us all the way through
1695 */ 1698 */
1696 if (!blk_pc_request(req)) 1699 if (!blk_pc_request(req))
1697 req->errors = 0; 1700 req->errors = 0;
1698 1701
1699 if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) { 1702 if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) {
1700 printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", 1703 printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n",
1701 req->rq_disk ? req->rq_disk->disk_name : "?", 1704 req->rq_disk ? req->rq_disk->disk_name : "?",
1702 (unsigned long long)req->sector); 1705 (unsigned long long)req->sector);
1703 } 1706 }
1704 1707
1705 if (blk_fs_request(req) && req->rq_disk) { 1708 if (blk_fs_request(req) && req->rq_disk) {
1706 const int rw = rq_data_dir(req); 1709 const int rw = rq_data_dir(req);
1707 struct hd_struct *part; 1710 struct hd_struct *part;
1708 int cpu; 1711 int cpu;
1709 1712
1710 cpu = part_stat_lock(); 1713 cpu = part_stat_lock();
1711 part = disk_map_sector_rcu(req->rq_disk, req->sector); 1714 part = disk_map_sector_rcu(req->rq_disk, req->sector);
1712 part_stat_add(cpu, part, sectors[rw], nr_bytes >> 9); 1715 part_stat_add(cpu, part, sectors[rw], nr_bytes >> 9);
1713 part_stat_unlock(); 1716 part_stat_unlock();
1714 } 1717 }
1715 1718
1716 total_bytes = bio_nbytes = 0; 1719 total_bytes = bio_nbytes = 0;
1717 while ((bio = req->bio) != NULL) { 1720 while ((bio = req->bio) != NULL) {
1718 int nbytes; 1721 int nbytes;
1719 1722
1720 /* 1723 /*
1721 * For an empty barrier request, the low level driver must 1724 * For an empty barrier request, the low level driver must
1722 * store a potential error location in ->sector. We pass 1725 * store a potential error location in ->sector. We pass
1723 * that back up in ->bi_sector. 1726 * that back up in ->bi_sector.
1724 */ 1727 */
1725 if (blk_empty_barrier(req)) 1728 if (blk_empty_barrier(req))
1726 bio->bi_sector = req->sector; 1729 bio->bi_sector = req->sector;
1727 1730
1728 if (nr_bytes >= bio->bi_size) { 1731 if (nr_bytes >= bio->bi_size) {
1729 req->bio = bio->bi_next; 1732 req->bio = bio->bi_next;
1730 nbytes = bio->bi_size; 1733 nbytes = bio->bi_size;
1731 req_bio_endio(req, bio, nbytes, error); 1734 req_bio_endio(req, bio, nbytes, error);
1732 next_idx = 0; 1735 next_idx = 0;
1733 bio_nbytes = 0; 1736 bio_nbytes = 0;
1734 } else { 1737 } else {
1735 int idx = bio->bi_idx + next_idx; 1738 int idx = bio->bi_idx + next_idx;
1736 1739
1737 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { 1740 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
1738 blk_dump_rq_flags(req, "__end_that"); 1741 blk_dump_rq_flags(req, "__end_that");
1739 printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n", 1742 printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n",
1740 __func__, bio->bi_idx, bio->bi_vcnt); 1743 __func__, bio->bi_idx, bio->bi_vcnt);
1741 break; 1744 break;
1742 } 1745 }
1743 1746
1744 nbytes = bio_iovec_idx(bio, idx)->bv_len; 1747 nbytes = bio_iovec_idx(bio, idx)->bv_len;
1745 BIO_BUG_ON(nbytes > bio->bi_size); 1748 BIO_BUG_ON(nbytes > bio->bi_size);
1746 1749
1747 /* 1750 /*
1748 * not a complete bvec done 1751 * not a complete bvec done
1749 */ 1752 */
1750 if (unlikely(nbytes > nr_bytes)) { 1753 if (unlikely(nbytes > nr_bytes)) {
1751 bio_nbytes += nr_bytes; 1754 bio_nbytes += nr_bytes;
1752 total_bytes += nr_bytes; 1755 total_bytes += nr_bytes;
1753 break; 1756 break;
1754 } 1757 }
1755 1758
1756 /* 1759 /*
1757 * advance to the next vector 1760 * advance to the next vector
1758 */ 1761 */
1759 next_idx++; 1762 next_idx++;
1760 bio_nbytes += nbytes; 1763 bio_nbytes += nbytes;
1761 } 1764 }
1762 1765
1763 total_bytes += nbytes; 1766 total_bytes += nbytes;
1764 nr_bytes -= nbytes; 1767 nr_bytes -= nbytes;
1765 1768
1766 bio = req->bio; 1769 bio = req->bio;
1767 if (bio) { 1770 if (bio) {
1768 /* 1771 /*
1769 * end more in this run, or just return 'not-done' 1772 * end more in this run, or just return 'not-done'
1770 */ 1773 */
1771 if (unlikely(nr_bytes <= 0)) 1774 if (unlikely(nr_bytes <= 0))
1772 break; 1775 break;
1773 } 1776 }
1774 } 1777 }
1775 1778
1776 /* 1779 /*
1777 * completely done 1780 * completely done
1778 */ 1781 */
1779 if (!req->bio) 1782 if (!req->bio)
1780 return 0; 1783 return 0;
1781 1784
1782 /* 1785 /*
1783 * if the request wasn't completed, update state 1786 * if the request wasn't completed, update state
1784 */ 1787 */
1785 if (bio_nbytes) { 1788 if (bio_nbytes) {
1786 req_bio_endio(req, bio, bio_nbytes, error); 1789 req_bio_endio(req, bio, bio_nbytes, error);
1787 bio->bi_idx += next_idx; 1790 bio->bi_idx += next_idx;
1788 bio_iovec(bio)->bv_offset += nr_bytes; 1791 bio_iovec(bio)->bv_offset += nr_bytes;
1789 bio_iovec(bio)->bv_len -= nr_bytes; 1792 bio_iovec(bio)->bv_len -= nr_bytes;
1790 } 1793 }
1791 1794
1792 blk_recalc_rq_sectors(req, total_bytes >> 9); 1795 blk_recalc_rq_sectors(req, total_bytes >> 9);
1793 blk_recalc_rq_segments(req); 1796 blk_recalc_rq_segments(req);
1794 return 1; 1797 return 1;
1795 } 1798 }
1796 1799
1797 /* 1800 /*
1798 * queue lock must be held 1801 * queue lock must be held
1799 */ 1802 */
1800 static void end_that_request_last(struct request *req, int error) 1803 static void end_that_request_last(struct request *req, int error)
1801 { 1804 {
1802 struct gendisk *disk = req->rq_disk; 1805 struct gendisk *disk = req->rq_disk;
1803 1806
1804 if (blk_rq_tagged(req)) 1807 if (blk_rq_tagged(req))
1805 blk_queue_end_tag(req->q, req); 1808 blk_queue_end_tag(req->q, req);
1806 1809
1807 if (blk_queued_rq(req)) 1810 if (blk_queued_rq(req))
1808 elv_dequeue_request(req->q, req); 1811 elv_dequeue_request(req->q, req);
1809 1812
1810 if (unlikely(laptop_mode) && blk_fs_request(req)) 1813 if (unlikely(laptop_mode) && blk_fs_request(req))
1811 laptop_io_completion(); 1814 laptop_io_completion();
1812 1815
1813 blk_delete_timer(req); 1816 blk_delete_timer(req);
1814 1817
1815 /* 1818 /*
1816 * Account IO completion. bar_rq isn't accounted as a normal 1819 * Account IO completion. bar_rq isn't accounted as a normal
1817 * IO on queueing nor completion. Accounting the containing 1820 * IO on queueing nor completion. Accounting the containing
1818 * request is enough. 1821 * request is enough.
1819 */ 1822 */
1820 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { 1823 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
1821 unsigned long duration = jiffies - req->start_time; 1824 unsigned long duration = jiffies - req->start_time;
1822 const int rw = rq_data_dir(req); 1825 const int rw = rq_data_dir(req);
1823 struct hd_struct *part; 1826 struct hd_struct *part;
1824 int cpu; 1827 int cpu;
1825 1828
1826 cpu = part_stat_lock(); 1829 cpu = part_stat_lock();
1827 part = disk_map_sector_rcu(disk, req->sector); 1830 part = disk_map_sector_rcu(disk, req->sector);
1828 1831
1829 part_stat_inc(cpu, part, ios[rw]); 1832 part_stat_inc(cpu, part, ios[rw]);
1830 part_stat_add(cpu, part, ticks[rw], duration); 1833 part_stat_add(cpu, part, ticks[rw], duration);
1831 part_round_stats(cpu, part); 1834 part_round_stats(cpu, part);
1832 part_dec_in_flight(part); 1835 part_dec_in_flight(part);
1833 1836
1834 part_stat_unlock(); 1837 part_stat_unlock();
1835 } 1838 }
1836 1839
1837 if (req->end_io) 1840 if (req->end_io)
1838 req->end_io(req, error); 1841 req->end_io(req, error);
1839 else { 1842 else {
1840 if (blk_bidi_rq(req)) 1843 if (blk_bidi_rq(req))
1841 __blk_put_request(req->next_rq->q, req->next_rq); 1844 __blk_put_request(req->next_rq->q, req->next_rq);
1842 1845
1843 __blk_put_request(req->q, req); 1846 __blk_put_request(req->q, req);
1844 } 1847 }
1845 } 1848 }
1846 1849
1847 /** 1850 /**
1848 * blk_rq_bytes - Returns bytes left to complete in the entire request 1851 * blk_rq_bytes - Returns bytes left to complete in the entire request
1849 * @rq: the request being processed 1852 * @rq: the request being processed
1850 **/ 1853 **/
1851 unsigned int blk_rq_bytes(struct request *rq) 1854 unsigned int blk_rq_bytes(struct request *rq)
1852 { 1855 {
1853 if (blk_fs_request(rq)) 1856 if (blk_fs_request(rq))
1854 return rq->hard_nr_sectors << 9; 1857 return rq->hard_nr_sectors << 9;
1855 1858
1856 return rq->data_len; 1859 return rq->data_len;
1857 } 1860 }
1858 EXPORT_SYMBOL_GPL(blk_rq_bytes); 1861 EXPORT_SYMBOL_GPL(blk_rq_bytes);
1859 1862
1860 /** 1863 /**
1861 * blk_rq_cur_bytes - Returns bytes left to complete in the current segment 1864 * blk_rq_cur_bytes - Returns bytes left to complete in the current segment
1862 * @rq: the request being processed 1865 * @rq: the request being processed
1863 **/ 1866 **/
1864 unsigned int blk_rq_cur_bytes(struct request *rq) 1867 unsigned int blk_rq_cur_bytes(struct request *rq)
1865 { 1868 {
1866 if (blk_fs_request(rq)) 1869 if (blk_fs_request(rq))
1867 return rq->current_nr_sectors << 9; 1870 return rq->current_nr_sectors << 9;
1868 1871
1869 if (rq->bio) 1872 if (rq->bio)
1870 return rq->bio->bi_size; 1873 return rq->bio->bi_size;
1871 1874
1872 return rq->data_len; 1875 return rq->data_len;
1873 } 1876 }
1874 EXPORT_SYMBOL_GPL(blk_rq_cur_bytes); 1877 EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
1875 1878
1876 /** 1879 /**
1877 * end_request - end I/O on the current segment of the request 1880 * end_request - end I/O on the current segment of the request
1878 * @req: the request being processed 1881 * @req: the request being processed
1879 * @uptodate: error value or %0/%1 uptodate flag 1882 * @uptodate: error value or %0/%1 uptodate flag
1880 * 1883 *
1881 * Description: 1884 * Description:
1882 * Ends I/O on the current segment of a request. If that is the only 1885 * Ends I/O on the current segment of a request. If that is the only
1883 * remaining segment, the request is also completed and freed. 1886 * remaining segment, the request is also completed and freed.
1884 * 1887 *
1885 * This is a remnant of how older block drivers handled I/O completions. 1888 * This is a remnant of how older block drivers handled I/O completions.
1886 * Modern drivers typically end I/O on the full request in one go, unless 1889 * Modern drivers typically end I/O on the full request in one go, unless
1887 * they have a residual value to account for. For that case this function 1890 * they have a residual value to account for. For that case this function
1888 * isn't really useful, unless the residual just happens to be the 1891 * isn't really useful, unless the residual just happens to be the
1889 * full current segment. In other words, don't use this function in new 1892 * full current segment. In other words, don't use this function in new
1890 * code. Use blk_end_request() or __blk_end_request() to end a request. 1893 * code. Use blk_end_request() or __blk_end_request() to end a request.
1891 **/ 1894 **/
1892 void end_request(struct request *req, int uptodate) 1895 void end_request(struct request *req, int uptodate)
1893 { 1896 {
1894 int error = 0; 1897 int error = 0;
1895 1898
1896 if (uptodate <= 0) 1899 if (uptodate <= 0)
1897 error = uptodate ? uptodate : -EIO; 1900 error = uptodate ? uptodate : -EIO;
1898 1901
1899 __blk_end_request(req, error, req->hard_cur_sectors << 9); 1902 __blk_end_request(req, error, req->hard_cur_sectors << 9);
1900 } 1903 }
1901 EXPORT_SYMBOL(end_request); 1904 EXPORT_SYMBOL(end_request);
1902 1905
1903 static int end_that_request_data(struct request *rq, int error, 1906 static int end_that_request_data(struct request *rq, int error,
1904 unsigned int nr_bytes, unsigned int bidi_bytes) 1907 unsigned int nr_bytes, unsigned int bidi_bytes)
1905 { 1908 {
1906 if (rq->bio) { 1909 if (rq->bio) {
1907 if (__end_that_request_first(rq, error, nr_bytes)) 1910 if (__end_that_request_first(rq, error, nr_bytes))
1908 return 1; 1911 return 1;
1909 1912
1910 /* Bidi request must be completed as a whole */ 1913 /* Bidi request must be completed as a whole */
1911 if (blk_bidi_rq(rq) && 1914 if (blk_bidi_rq(rq) &&
1912 __end_that_request_first(rq->next_rq, error, bidi_bytes)) 1915 __end_that_request_first(rq->next_rq, error, bidi_bytes))
1913 return 1; 1916 return 1;
1914 } 1917 }
1915 1918
1916 return 0; 1919 return 0;
1917 } 1920 }
1918 1921
1919 /** 1922 /**
1920 * blk_end_io - Generic end_io function to complete a request. 1923 * blk_end_io - Generic end_io function to complete a request.
1921 * @rq: the request being processed 1924 * @rq: the request being processed
1922 * @error: %0 for success, < %0 for error 1925 * @error: %0 for success, < %0 for error
1923 * @nr_bytes: number of bytes to complete @rq 1926 * @nr_bytes: number of bytes to complete @rq
1924 * @bidi_bytes: number of bytes to complete @rq->next_rq 1927 * @bidi_bytes: number of bytes to complete @rq->next_rq
1925 * @drv_callback: function called between completion of bios in the request 1928 * @drv_callback: function called between completion of bios in the request
1926 * and completion of the request. 1929 * and completion of the request.
1927 * If the callback returns non %0, this helper returns without 1930 * If the callback returns non %0, this helper returns without
1928 * completion of the request. 1931 * completion of the request.
1929 * 1932 *
1930 * Description: 1933 * Description:
1931 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. 1934 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
1932 * If @rq has leftover, sets it up for the next range of segments. 1935 * If @rq has leftover, sets it up for the next range of segments.
1933 * 1936 *
1934 * Return: 1937 * Return:
1935 * %0 - we are done with this request 1938 * %0 - we are done with this request
1936 * %1 - this request is not freed yet, it still has pending buffers. 1939 * %1 - this request is not freed yet, it still has pending buffers.
1937 **/ 1940 **/
1938 static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, 1941 static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
1939 unsigned int bidi_bytes, 1942 unsigned int bidi_bytes,
1940 int (drv_callback)(struct request *)) 1943 int (drv_callback)(struct request *))
1941 { 1944 {
1942 struct request_queue *q = rq->q; 1945 struct request_queue *q = rq->q;
1943 unsigned long flags = 0UL; 1946 unsigned long flags = 0UL;
1944 1947
1945 if (end_that_request_data(rq, error, nr_bytes, bidi_bytes)) 1948 if (end_that_request_data(rq, error, nr_bytes, bidi_bytes))
1946 return 1; 1949 return 1;
1947 1950
1948 /* Special feature for tricky drivers */ 1951 /* Special feature for tricky drivers */
1949 if (drv_callback && drv_callback(rq)) 1952 if (drv_callback && drv_callback(rq))
1950 return 1; 1953 return 1;
1951 1954
1952 add_disk_randomness(rq->rq_disk); 1955 add_disk_randomness(rq->rq_disk);
1953 1956
1954 spin_lock_irqsave(q->queue_lock, flags); 1957 spin_lock_irqsave(q->queue_lock, flags);
1955 end_that_request_last(rq, error); 1958 end_that_request_last(rq, error);
1956 spin_unlock_irqrestore(q->queue_lock, flags); 1959 spin_unlock_irqrestore(q->queue_lock, flags);
1957 1960
1958 return 0; 1961 return 0;
1959 } 1962 }
1960 1963
1961 /** 1964 /**
1962 * blk_end_request - Helper function for drivers to complete the request. 1965 * blk_end_request - Helper function for drivers to complete the request.
1963 * @rq: the request being processed 1966 * @rq: the request being processed
1964 * @error: %0 for success, < %0 for error 1967 * @error: %0 for success, < %0 for error
1965 * @nr_bytes: number of bytes to complete 1968 * @nr_bytes: number of bytes to complete
1966 * 1969 *
1967 * Description: 1970 * Description:
1968 * Ends I/O on a number of bytes attached to @rq. 1971 * Ends I/O on a number of bytes attached to @rq.
1969 * If @rq has leftover, sets it up for the next range of segments. 1972 * If @rq has leftover, sets it up for the next range of segments.
1970 * 1973 *
1971 * Return: 1974 * Return:
1972 * %0 - we are done with this request 1975 * %0 - we are done with this request
1973 * %1 - still buffers pending for this request 1976 * %1 - still buffers pending for this request
1974 **/ 1977 **/
1975 int blk_end_request(struct request *rq, int error, unsigned int nr_bytes) 1978 int blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
1976 { 1979 {
1977 return blk_end_io(rq, error, nr_bytes, 0, NULL); 1980 return blk_end_io(rq, error, nr_bytes, 0, NULL);
1978 } 1981 }
1979 EXPORT_SYMBOL_GPL(blk_end_request); 1982 EXPORT_SYMBOL_GPL(blk_end_request);
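For context only: a minimal sketch of how a low-level driver's completion path might call this helper. The mydrv_* names, the cmd structure and its fields are hypothetical and not part of this patch.

static void mydrv_complete_cmd(struct mydrv_cmd *cmd, int error)
{
	/*
	 * Complete as many bytes as the hardware reported. If the
	 * request still has bytes pending, blk_end_request() returns 1
	 * and the request stays around for the residual.
	 */
	if (blk_end_request(cmd->rq, error, cmd->bytes_done))
		mydrv_handle_residual(cmd);	/* hypothetical */
}

blk_end_request() takes the queue lock itself, which is why it can be called like this from a completion context; __blk_end_request() below is the variant for callers that already hold the lock.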
1980 1983
1981 /** 1984 /**
1982 * __blk_end_request - Helper function for drivers to complete the request. 1985 * __blk_end_request - Helper function for drivers to complete the request.
1983 * @rq: the request being processed 1986 * @rq: the request being processed
1984 * @error: %0 for success, < %0 for error 1987 * @error: %0 for success, < %0 for error
1985 * @nr_bytes: number of bytes to complete 1988 * @nr_bytes: number of bytes to complete
1986 * 1989 *
1987 * Description: 1990 * Description:
1988 * Must be called with queue lock held unlike blk_end_request(). 1991 * Must be called with queue lock held unlike blk_end_request().
1989 * 1992 *
1990 * Return: 1993 * Return:
1991 * %0 - we are done with this request 1994 * %0 - we are done with this request
1992 * %1 - still buffers pending for this request 1995 * %1 - still buffers pending for this request
1993 **/ 1996 **/
1994 int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) 1997 int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
1995 { 1998 {
1996 if (rq->bio && __end_that_request_first(rq, error, nr_bytes)) 1999 if (rq->bio && __end_that_request_first(rq, error, nr_bytes))
1997 return 1; 2000 return 1;
1998 2001
1999 add_disk_randomness(rq->rq_disk); 2002 add_disk_randomness(rq->rq_disk);
2000 2003
2001 end_that_request_last(rq, error); 2004 end_that_request_last(rq, error);
2002 2005
2003 return 0; 2006 return 0;
2004 } 2007 }
2005 EXPORT_SYMBOL_GPL(__blk_end_request); 2008 EXPORT_SYMBOL_GPL(__blk_end_request);
2006 2009
2007 /** 2010 /**
2008 * blk_end_bidi_request - Helper function for drivers to complete bidi request. 2011 * blk_end_bidi_request - Helper function for drivers to complete bidi request.
2009 * @rq: the bidi request being processed 2012 * @rq: the bidi request being processed
2010 * @error: %0 for success, < %0 for error 2013 * @error: %0 for success, < %0 for error
2011 * @nr_bytes: number of bytes to complete @rq 2014 * @nr_bytes: number of bytes to complete @rq
2012 * @bidi_bytes: number of bytes to complete @rq->next_rq 2015 * @bidi_bytes: number of bytes to complete @rq->next_rq
2013 * 2016 *
2014 * Description: 2017 * Description:
2015 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. 2018 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
2016 * 2019 *
2017 * Return: 2020 * Return:
2018 * %0 - we are done with this request 2021 * %0 - we are done with this request
2019 * %1 - still buffers pending for this request 2022 * %1 - still buffers pending for this request
2020 **/ 2023 **/
2021 int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, 2024 int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
2022 unsigned int bidi_bytes) 2025 unsigned int bidi_bytes)
2023 { 2026 {
2024 return blk_end_io(rq, error, nr_bytes, bidi_bytes, NULL); 2027 return blk_end_io(rq, error, nr_bytes, bidi_bytes, NULL);
2025 } 2028 }
2026 EXPORT_SYMBOL_GPL(blk_end_bidi_request); 2029 EXPORT_SYMBOL_GPL(blk_end_bidi_request);
2027 2030
2028 /** 2031 /**
2029 * blk_update_request - Special helper function for request stacking drivers 2032 * blk_update_request - Special helper function for request stacking drivers
2030 * @rq: the request being processed 2033 * @rq: the request being processed
2031 * @error: %0 for success, < %0 for error 2034 * @error: %0 for success, < %0 for error
2032 * @nr_bytes: number of bytes to complete @rq 2035 * @nr_bytes: number of bytes to complete @rq
2033 * 2036 *
2034 * Description: 2037 * Description:
2035 * Ends I/O on a number of bytes attached to @rq, but doesn't complete 2038 * Ends I/O on a number of bytes attached to @rq, but doesn't complete
2036 * the request structure even if @rq doesn't have leftover. 2039 * the request structure even if @rq doesn't have leftover.
2037 * If @rq has leftover, sets it up for the next range of segments. 2040 * If @rq has leftover, sets it up for the next range of segments.
2038 * 2041 *
2039 * This special helper function is only for request stacking drivers 2042 * This special helper function is only for request stacking drivers
2040 * (e.g. request-based dm) so that they can handle partial completion. 2043 * (e.g. request-based dm) so that they can handle partial completion.
2041 * Actual device drivers should use blk_end_request instead. 2044 * Actual device drivers should use blk_end_request instead.
2042 */ 2045 */
2043 void blk_update_request(struct request *rq, int error, unsigned int nr_bytes) 2046 void blk_update_request(struct request *rq, int error, unsigned int nr_bytes)
2044 { 2047 {
2045 if (!end_that_request_data(rq, error, nr_bytes, 0)) { 2048 if (!end_that_request_data(rq, error, nr_bytes, 0)) {
2046 /* 2049 /*
2047 * These members are not updated in end_that_request_data() 2050 * These members are not updated in end_that_request_data()
2048 * when all bios are completed. 2051 * when all bios are completed.
2049 * Update them so that the request stacking driver can find 2052 * Update them so that the request stacking driver can find
2050 * how many bytes remain in the request later. 2053 * how many bytes remain in the request later.
2051 */ 2054 */
2052 rq->nr_sectors = rq->hard_nr_sectors = 0; 2055 rq->nr_sectors = rq->hard_nr_sectors = 0;
2053 rq->current_nr_sectors = rq->hard_cur_sectors = 0; 2056 rq->current_nr_sectors = rq->hard_cur_sectors = 0;
2054 } 2057 }
2055 } 2058 }
2056 EXPORT_SYMBOL_GPL(blk_update_request); 2059 EXPORT_SYMBOL_GPL(blk_update_request);
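A rough sketch of the intended caller: a request stacking driver mirroring the progress of a cloned request back onto the original without freeing it. The names here are illustrative and not taken from the request-based dm code.

static void stacking_advance_orig(struct request *orig, int error,
				  unsigned int bytes_done)
{
	/* account bytes_done against the original request, but keep the
	 * request structure alive even when everything has completed */
	blk_update_request(orig, error, bytes_done);

	/* blk_rq_bytes() now reports what is still outstanding */
	if (!blk_rq_bytes(orig))
		pr_debug("original request fully transferred\n");
}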
2057 2060
2058 /** 2061 /**
2059 * blk_end_request_callback - Special helper function for tricky drivers 2062 * blk_end_request_callback - Special helper function for tricky drivers
2060 * @rq: the request being processed 2063 * @rq: the request being processed
2061 * @error: %0 for success, < %0 for error 2064 * @error: %0 for success, < %0 for error
2062 * @nr_bytes: number of bytes to complete 2065 * @nr_bytes: number of bytes to complete
2063 * @drv_callback: function called between completion of bios in the request 2066 * @drv_callback: function called between completion of bios in the request
2064 * and completion of the request. 2067 * and completion of the request.
2065 * If the callback returns non %0, this helper returns without 2068 * If the callback returns non %0, this helper returns without
2066 * completion of the request. 2069 * completion of the request.
2067 * 2070 *
2068 * Description: 2071 * Description:
2069 * Ends I/O on a number of bytes attached to @rq. 2072 * Ends I/O on a number of bytes attached to @rq.
2070 * If @rq has leftover, sets it up for the next range of segments. 2073 * If @rq has leftover, sets it up for the next range of segments.
2071 * 2074 *
2072 * This special helper function is used only for existing tricky drivers. 2075 * This special helper function is used only for existing tricky drivers.
2073 * (e.g. cdrom_newpc_intr() of ide-cd) 2076 * (e.g. cdrom_newpc_intr() of ide-cd)
2074 * This interface will be removed when such drivers are rewritten. 2077 * This interface will be removed when such drivers are rewritten.
2075 * Don't use this interface in other places anymore. 2078 * Don't use this interface in other places anymore.
2076 * 2079 *
2077 * Return: 2080 * Return:
2078 * %0 - we are done with this request 2081 * %0 - we are done with this request
2079 * %1 - this request is not freed yet. 2082 * %1 - this request is not freed yet.
2080 * this request still has pending buffers or 2083 * this request still has pending buffers or
2081 * the driver doesn't want to finish this request yet. 2084 * the driver doesn't want to finish this request yet.
2082 **/ 2085 **/
2083 int blk_end_request_callback(struct request *rq, int error, 2086 int blk_end_request_callback(struct request *rq, int error,
2084 unsigned int nr_bytes, 2087 unsigned int nr_bytes,
2085 int (drv_callback)(struct request *)) 2088 int (drv_callback)(struct request *))
2086 { 2089 {
2087 return blk_end_io(rq, error, nr_bytes, 0, drv_callback); 2090 return blk_end_io(rq, error, nr_bytes, 0, drv_callback);
2088 } 2091 }
2089 EXPORT_SYMBOL_GPL(blk_end_request_callback); 2092 EXPORT_SYMBOL_GPL(blk_end_request_callback);

2090 2093
2091 void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 2094 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2092 struct bio *bio) 2095 struct bio *bio)
2093 { 2096 {
2094 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and 2097 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and
2095 we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */ 2098 we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
2096 rq->cmd_flags |= (bio->bi_rw & 3); 2099 rq->cmd_flags |= (bio->bi_rw & 3);
2097 2100
2098 if (bio_has_data(bio)) { 2101 if (bio_has_data(bio)) {
2099 rq->nr_phys_segments = bio_phys_segments(q, bio); 2102 rq->nr_phys_segments = bio_phys_segments(q, bio);
2100 rq->buffer = bio_data(bio); 2103 rq->buffer = bio_data(bio);
2101 } 2104 }
2102 rq->current_nr_sectors = bio_cur_sectors(bio); 2105 rq->current_nr_sectors = bio_cur_sectors(bio);
2103 rq->hard_cur_sectors = rq->current_nr_sectors; 2106 rq->hard_cur_sectors = rq->current_nr_sectors;
2104 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); 2107 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
2105 rq->data_len = bio->bi_size; 2108 rq->data_len = bio->bi_size;
2106 2109
2107 rq->bio = rq->biotail = bio; 2110 rq->bio = rq->biotail = bio;
2108 2111
2109 if (bio->bi_bdev) 2112 if (bio->bi_bdev)
2110 rq->rq_disk = bio->bi_bdev->bd_disk; 2113 rq->rq_disk = bio->bi_bdev->bd_disk;
2111 } 2114 }
2112 2115
2113 /** 2116 /**
2114 * blk_lld_busy - Check if underlying low-level drivers of a device are busy 2117 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
2115 * @q : the queue of the device being checked 2118 * @q : the queue of the device being checked
2116 * 2119 *
2117 * Description: 2120 * Description:
2118 * Check if underlying low-level drivers of a device are busy. 2121 * Check if underlying low-level drivers of a device are busy.
2119 * If the drivers want to export their busy state, they must set their own 2122 * If the drivers want to export their busy state, they must set their own
2120 * exporting function using blk_queue_lld_busy() first. 2123 * exporting function using blk_queue_lld_busy() first.
2121 * 2124 *
2122 * Basically, this function is used only by request stacking drivers 2125 * Basically, this function is used only by request stacking drivers
2123 * to stop dispatching requests to underlying devices when underlying 2126 * to stop dispatching requests to underlying devices when underlying
2124 * devices are busy. This behavior allows more I/O merging on the queue 2127 * devices are busy. This behavior allows more I/O merging on the queue
2125 * of the request stacking driver and prevents I/O throughput regression 2128 * of the request stacking driver and prevents I/O throughput regression
2126 * on burst I/O load. 2129 * on burst I/O load.
2127 * 2130 *
2128 * Return: 2131 * Return:
2129 * 0 - Not busy (The request stacking driver should dispatch request) 2132 * 0 - Not busy (The request stacking driver should dispatch request)
2130 * 1 - Busy (The request stacking driver should stop dispatching request) 2133 * 1 - Busy (The request stacking driver should stop dispatching request)
2131 */ 2134 */
2132 int blk_lld_busy(struct request_queue *q) 2135 int blk_lld_busy(struct request_queue *q)
2133 { 2136 {
2134 if (q->lld_busy_fn) 2137 if (q->lld_busy_fn)
2135 return q->lld_busy_fn(q); 2138 return q->lld_busy_fn(q);
2136 2139
2137 return 0; 2140 return 0;
2138 } 2141 }
2139 EXPORT_SYMBOL_GPL(blk_lld_busy); 2142 EXPORT_SYMBOL_GPL(blk_lld_busy);
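A sketch of both sides of the hook, assuming the blk_queue_lld_busy() setter named in the comment above; the lower_* and stacking_* names and the congestion predicate are invented for illustration.

static int lower_lld_busy(struct request_queue *q)
{
	struct lower_dev *dev = q->queuedata;	/* hypothetical private data */

	return lower_dev_congested(dev);	/* hypothetical predicate */
}

static void lower_init_queue(struct request_queue *q)
{
	/* export the busy state so stacking drivers can query it */
	blk_queue_lld_busy(q, lower_lld_busy);
}

static int stacking_should_dispatch(struct request_queue *lower_q)
{
	/* hold requests back while the underlying device reports busy */
	return !blk_lld_busy(lower_q);
}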
2140 2143
2141 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) 2144 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
2142 { 2145 {
2143 return queue_work(kblockd_workqueue, work); 2146 return queue_work(kblockd_workqueue, work);
2144 } 2147 }
2145 EXPORT_SYMBOL(kblockd_schedule_work); 2148 EXPORT_SYMBOL(kblockd_schedule_work);
2146 2149
2147 void kblockd_flush_work(struct work_struct *work) 2150 void kblockd_flush_work(struct work_struct *work)
2148 { 2151 {
2149 cancel_work_sync(work); 2152 cancel_work_sync(work);
2150 } 2153 }
2151 EXPORT_SYMBOL(kblockd_flush_work); 2154 EXPORT_SYMBOL(kblockd_flush_work);
2152 2155
2153 int __init blk_dev_init(void) 2156 int __init blk_dev_init(void)
2154 { 2157 {
2155 kblockd_workqueue = create_workqueue("kblockd"); 2158 kblockd_workqueue = create_workqueue("kblockd");
2156 if (!kblockd_workqueue) 2159 if (!kblockd_workqueue)
2157 panic("Failed to create kblockd\n"); 2160 panic("Failed to create kblockd\n");
2158 2161
2159 request_cachep = kmem_cache_create("blkdev_requests", 2162 request_cachep = kmem_cache_create("blkdev_requests",
2160 sizeof(struct request), 0, SLAB_PANIC, NULL); 2163 sizeof(struct request), 0, SLAB_PANIC, NULL);
2161 2164
2162 blk_requestq_cachep = kmem_cache_create("blkdev_queue", 2165 blk_requestq_cachep = kmem_cache_create("blkdev_queue",
2163 sizeof(struct request_queue), 0, SLAB_PANIC, NULL); 2166 sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
2164 2167
2165 return 0; 2168 return 0;
2166 } 2169 }
2167 2170
2168 2171
1 /* 1 /*
2 * linux/fs/buffer.c 2 * linux/fs/buffer.c
3 * 3 *
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds 4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */ 5 */
6 6
7 /* 7 /*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9 * 9 *
10 * Removed a lot of unnecessary code and simplified things now that 10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96 11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12 * 12 *
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating 13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM 14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
15 * 15 *
16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK 16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17 * 17 *
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> 18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19 */ 19 */
20 20
21 #include <linux/kernel.h> 21 #include <linux/kernel.h>
22 #include <linux/syscalls.h> 22 #include <linux/syscalls.h>
23 #include <linux/fs.h> 23 #include <linux/fs.h>
24 #include <linux/mm.h> 24 #include <linux/mm.h>
25 #include <linux/percpu.h> 25 #include <linux/percpu.h>
26 #include <linux/slab.h> 26 #include <linux/slab.h>
27 #include <linux/capability.h> 27 #include <linux/capability.h>
28 #include <linux/blkdev.h> 28 #include <linux/blkdev.h>
29 #include <linux/file.h> 29 #include <linux/file.h>
30 #include <linux/quotaops.h> 30 #include <linux/quotaops.h>
31 #include <linux/highmem.h> 31 #include <linux/highmem.h>
32 #include <linux/module.h> 32 #include <linux/module.h>
33 #include <linux/writeback.h> 33 #include <linux/writeback.h>
34 #include <linux/hash.h> 34 #include <linux/hash.h>
35 #include <linux/suspend.h> 35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h> 36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h> 37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h> 38 #include <linux/bio.h>
39 #include <linux/notifier.h> 39 #include <linux/notifier.h>
40 #include <linux/cpu.h> 40 #include <linux/cpu.h>
41 #include <linux/bitops.h> 41 #include <linux/bitops.h>
42 #include <linux/mpage.h> 42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h> 43 #include <linux/bit_spinlock.h>
44 44
45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); 45 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
46 46
47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) 47 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48 48
49 inline void 49 inline void
50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private) 50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51 { 51 {
52 bh->b_end_io = handler; 52 bh->b_end_io = handler;
53 bh->b_private = private; 53 bh->b_private = private;
54 } 54 }
55 55
56 static int sync_buffer(void *word) 56 static int sync_buffer(void *word)
57 { 57 {
58 struct block_device *bd; 58 struct block_device *bd;
59 struct buffer_head *bh 59 struct buffer_head *bh
60 = container_of(word, struct buffer_head, b_state); 60 = container_of(word, struct buffer_head, b_state);
61 61
62 smp_mb(); 62 smp_mb();
63 bd = bh->b_bdev; 63 bd = bh->b_bdev;
64 if (bd) 64 if (bd)
65 blk_run_address_space(bd->bd_inode->i_mapping); 65 blk_run_address_space(bd->bd_inode->i_mapping);
66 io_schedule(); 66 io_schedule();
67 return 0; 67 return 0;
68 } 68 }
69 69
70 void __lock_buffer(struct buffer_head *bh) 70 void __lock_buffer(struct buffer_head *bh)
71 { 71 {
72 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, 72 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
73 TASK_UNINTERRUPTIBLE); 73 TASK_UNINTERRUPTIBLE);
74 } 74 }
75 EXPORT_SYMBOL(__lock_buffer); 75 EXPORT_SYMBOL(__lock_buffer);
76 76
77 void unlock_buffer(struct buffer_head *bh) 77 void unlock_buffer(struct buffer_head *bh)
78 { 78 {
79 clear_bit_unlock(BH_Lock, &bh->b_state); 79 clear_bit_unlock(BH_Lock, &bh->b_state);
80 smp_mb__after_clear_bit(); 80 smp_mb__after_clear_bit();
81 wake_up_bit(&bh->b_state, BH_Lock); 81 wake_up_bit(&bh->b_state, BH_Lock);
82 } 82 }
83 83
84 /* 84 /*
85 * Block until a buffer comes unlocked. This doesn't stop it 85 * Block until a buffer comes unlocked. This doesn't stop it
86 * from becoming locked again - you have to lock it yourself 86 * from becoming locked again - you have to lock it yourself
87 * if you want to preserve its state. 87 * if you want to preserve its state.
88 */ 88 */
89 void __wait_on_buffer(struct buffer_head * bh) 89 void __wait_on_buffer(struct buffer_head * bh)
90 { 90 {
91 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); 91 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
92 } 92 }
93 93
94 static void 94 static void
95 __clear_page_buffers(struct page *page) 95 __clear_page_buffers(struct page *page)
96 { 96 {
97 ClearPagePrivate(page); 97 ClearPagePrivate(page);
98 set_page_private(page, 0); 98 set_page_private(page, 0);
99 page_cache_release(page); 99 page_cache_release(page);
100 } 100 }
101 101
102
103 static int quiet_error(struct buffer_head *bh)
104 {
105 if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
106 return 0;
107 return 1;
108 }
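quiet_error() only reads BH_Quiet; the flag has to be set when a quiet bio completes. The hunk that does this in the bh end_io path is elsewhere in the patch and not shown here, but the propagation looks roughly like the following sketch (the function name is hypothetical, BIO_QUIET being the bio-side flag this patch introduces):

static void example_bh_end_io(struct bio *bio, int err)
{
	struct buffer_head *bh = bio->bi_private;

	/* carry the bio's "quiet" marking over to the buffer head so
	 * that buffer_io_error() stays silent for this bh */
	if (bio_flagged(bio, BIO_QUIET))
		set_bit(BH_Quiet, &bh->b_state);

	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
	bio_put(bio);
}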
109
110
102 static void buffer_io_error(struct buffer_head *bh) 111 static void buffer_io_error(struct buffer_head *bh)
103 { 112 {
104 char b[BDEVNAME_SIZE]; 113 char b[BDEVNAME_SIZE];
105
106 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", 114 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
107 bdevname(bh->b_bdev, b), 115 bdevname(bh->b_bdev, b),
108 (unsigned long long)bh->b_blocknr); 116 (unsigned long long)bh->b_blocknr);
109 } 117 }
110 118
111 /* 119 /*
112 * End-of-IO handler helper function which does not touch the bh after 120 * End-of-IO handler helper function which does not touch the bh after
113 * unlocking it. 121 * unlocking it.
114 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but 122 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
115 * a race there is benign: unlock_buffer() only uses the bh's address for 123 * a race there is benign: unlock_buffer() only uses the bh's address for
116 * hashing after unlocking the buffer, so it doesn't actually touch the bh 124 * hashing after unlocking the buffer, so it doesn't actually touch the bh
117 * itself. 125 * itself.
118 */ 126 */
119 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate) 127 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
120 { 128 {
121 if (uptodate) { 129 if (uptodate) {
122 set_buffer_uptodate(bh); 130 set_buffer_uptodate(bh);
123 } else { 131 } else {
124 /* This happens, due to failed READA attempts. */ 132 /* This happens, due to failed READA attempts. */
125 clear_buffer_uptodate(bh); 133 clear_buffer_uptodate(bh);
126 } 134 }
127 unlock_buffer(bh); 135 unlock_buffer(bh);
128 } 136 }
129 137
130 /* 138 /*
131 * Default synchronous end-of-IO handler. Just mark it up-to-date and 139 * Default synchronous end-of-IO handler. Just mark it up-to-date and
132 * unlock the buffer. This is what ll_rw_block uses too. 140 * unlock the buffer. This is what ll_rw_block uses too.
133 */ 141 */
134 void end_buffer_read_sync(struct buffer_head *bh, int uptodate) 142 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
135 { 143 {
136 __end_buffer_read_notouch(bh, uptodate); 144 __end_buffer_read_notouch(bh, uptodate);
137 put_bh(bh); 145 put_bh(bh);
138 } 146 }
139 147
140 void end_buffer_write_sync(struct buffer_head *bh, int uptodate) 148 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
141 { 149 {
142 char b[BDEVNAME_SIZE]; 150 char b[BDEVNAME_SIZE];
143 151
144 if (uptodate) { 152 if (uptodate) {
145 set_buffer_uptodate(bh); 153 set_buffer_uptodate(bh);
146 } else { 154 } else {
147 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { 155 if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
148 buffer_io_error(bh); 156 buffer_io_error(bh);
149 printk(KERN_WARNING "lost page write due to " 157 printk(KERN_WARNING "lost page write due to "
150 "I/O error on %s\n", 158 "I/O error on %s\n",
151 bdevname(bh->b_bdev, b)); 159 bdevname(bh->b_bdev, b));
152 } 160 }
153 set_buffer_write_io_error(bh); 161 set_buffer_write_io_error(bh);
154 clear_buffer_uptodate(bh); 162 clear_buffer_uptodate(bh);
155 } 163 }
156 unlock_buffer(bh); 164 unlock_buffer(bh);
157 put_bh(bh); 165 put_bh(bh);
158 } 166 }
159 167
160 /* 168 /*
161 * Write out and wait upon all the dirty data associated with a block 169 * Write out and wait upon all the dirty data associated with a block
162 * device via its mapping. Does not take the superblock lock. 170 * device via its mapping. Does not take the superblock lock.
163 */ 171 */
164 int sync_blockdev(struct block_device *bdev) 172 int sync_blockdev(struct block_device *bdev)
165 { 173 {
166 int ret = 0; 174 int ret = 0;
167 175
168 if (bdev) 176 if (bdev)
169 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); 177 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
170 return ret; 178 return ret;
171 } 179 }
172 EXPORT_SYMBOL(sync_blockdev); 180 EXPORT_SYMBOL(sync_blockdev);
173 181
174 /* 182 /*
175 * Write out and wait upon all dirty data associated with this 183 * Write out and wait upon all dirty data associated with this
176 * device. Filesystem data as well as the underlying block 184 * device. Filesystem data as well as the underlying block
177 * device. Takes the superblock lock. 185 * device. Takes the superblock lock.
178 */ 186 */
179 int fsync_bdev(struct block_device *bdev) 187 int fsync_bdev(struct block_device *bdev)
180 { 188 {
181 struct super_block *sb = get_super(bdev); 189 struct super_block *sb = get_super(bdev);
182 if (sb) { 190 if (sb) {
183 int res = fsync_super(sb); 191 int res = fsync_super(sb);
184 drop_super(sb); 192 drop_super(sb);
185 return res; 193 return res;
186 } 194 }
187 return sync_blockdev(bdev); 195 return sync_blockdev(bdev);
188 } 196 }
189 197
190 /** 198 /**
191 * freeze_bdev -- lock a filesystem and force it into a consistent state 199 * freeze_bdev -- lock a filesystem and force it into a consistent state
192 * @bdev: blockdevice to lock 200 * @bdev: blockdevice to lock
193 * 201 *
194 * This takes the block device bd_mount_sem to make sure no new mounts 202 * This takes the block device bd_mount_sem to make sure no new mounts
195 * happen on bdev until thaw_bdev() is called. 203 * happen on bdev until thaw_bdev() is called.
196 * If a superblock is found on this device, we take the s_umount semaphore 204 * If a superblock is found on this device, we take the s_umount semaphore
197 * on it to make sure nobody unmounts until the snapshot creation is done. 205 * on it to make sure nobody unmounts until the snapshot creation is done.
198 */ 206 */
199 struct super_block *freeze_bdev(struct block_device *bdev) 207 struct super_block *freeze_bdev(struct block_device *bdev)
200 { 208 {
201 struct super_block *sb; 209 struct super_block *sb;
202 210
203 down(&bdev->bd_mount_sem); 211 down(&bdev->bd_mount_sem);
204 sb = get_super(bdev); 212 sb = get_super(bdev);
205 if (sb && !(sb->s_flags & MS_RDONLY)) { 213 if (sb && !(sb->s_flags & MS_RDONLY)) {
206 sb->s_frozen = SB_FREEZE_WRITE; 214 sb->s_frozen = SB_FREEZE_WRITE;
207 smp_wmb(); 215 smp_wmb();
208 216
209 __fsync_super(sb); 217 __fsync_super(sb);
210 218
211 sb->s_frozen = SB_FREEZE_TRANS; 219 sb->s_frozen = SB_FREEZE_TRANS;
212 smp_wmb(); 220 smp_wmb();
213 221
214 sync_blockdev(sb->s_bdev); 222 sync_blockdev(sb->s_bdev);
215 223
216 if (sb->s_op->write_super_lockfs) 224 if (sb->s_op->write_super_lockfs)
217 sb->s_op->write_super_lockfs(sb); 225 sb->s_op->write_super_lockfs(sb);
218 } 226 }
219 227
220 sync_blockdev(bdev); 228 sync_blockdev(bdev);
221 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ 229 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
222 } 230 }
223 EXPORT_SYMBOL(freeze_bdev); 231 EXPORT_SYMBOL(freeze_bdev);
224 232
225 /** 233 /**
226 * thaw_bdev -- unlock filesystem 234 * thaw_bdev -- unlock filesystem
227 * @bdev: blockdevice to unlock 235 * @bdev: blockdevice to unlock
228 * @sb: associated superblock 236 * @sb: associated superblock
229 * 237 *
230 * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 238 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
231 */ 239 */
232 void thaw_bdev(struct block_device *bdev, struct super_block *sb) 240 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
233 { 241 {
234 if (sb) { 242 if (sb) {
235 BUG_ON(sb->s_bdev != bdev); 243 BUG_ON(sb->s_bdev != bdev);
236 244
237 if (sb->s_op->unlockfs) 245 if (sb->s_op->unlockfs)
238 sb->s_op->unlockfs(sb); 246 sb->s_op->unlockfs(sb);
239 sb->s_frozen = SB_UNFROZEN; 247 sb->s_frozen = SB_UNFROZEN;
240 smp_wmb(); 248 smp_wmb();
241 wake_up(&sb->s_wait_unfrozen); 249 wake_up(&sb->s_wait_unfrozen);
242 drop_super(sb); 250 drop_super(sb);
243 } 251 }
244 252
245 up(&bdev->bd_mount_sem); 253 up(&bdev->bd_mount_sem);
246 } 254 }
247 EXPORT_SYMBOL(thaw_bdev); 255 EXPORT_SYMBOL(thaw_bdev);
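Usage of the pair is symmetric; a snapshot implementation would bracket its work roughly as below (take_snapshot() is a placeholder, not a real kernel function):

static void snapshot_blockdev(struct block_device *bdev)
{
	struct super_block *sb;

	sb = freeze_bdev(bdev);		/* blocks new mounts, syncs the fs */
	take_snapshot(bdev);		/* hypothetical snapshot operation */
	thaw_bdev(bdev, sb);		/* marks the fs writeable again */
}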
248 256
249 /* 257 /*
250 * Various filesystems appear to want __find_get_block to be non-blocking. 258 * Various filesystems appear to want __find_get_block to be non-blocking.
251 * But it's the page lock which protects the buffers. To get around this, 259 * But it's the page lock which protects the buffers. To get around this,
252 * we get exclusion from try_to_free_buffers with the blockdev mapping's 260 * we get exclusion from try_to_free_buffers with the blockdev mapping's
253 * private_lock. 261 * private_lock.
254 * 262 *
255 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention 263 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
256 * may be quite high. This code could TryLock the page, and if that 264 * may be quite high. This code could TryLock the page, and if that
257 * succeeds, there is no need to take private_lock. (But if 265 * succeeds, there is no need to take private_lock. (But if
258 * private_lock is contended then so is mapping->tree_lock). 266 * private_lock is contended then so is mapping->tree_lock).
259 */ 267 */
260 static struct buffer_head * 268 static struct buffer_head *
261 __find_get_block_slow(struct block_device *bdev, sector_t block) 269 __find_get_block_slow(struct block_device *bdev, sector_t block)
262 { 270 {
263 struct inode *bd_inode = bdev->bd_inode; 271 struct inode *bd_inode = bdev->bd_inode;
264 struct address_space *bd_mapping = bd_inode->i_mapping; 272 struct address_space *bd_mapping = bd_inode->i_mapping;
265 struct buffer_head *ret = NULL; 273 struct buffer_head *ret = NULL;
266 pgoff_t index; 274 pgoff_t index;
267 struct buffer_head *bh; 275 struct buffer_head *bh;
268 struct buffer_head *head; 276 struct buffer_head *head;
269 struct page *page; 277 struct page *page;
270 int all_mapped = 1; 278 int all_mapped = 1;
271 279
272 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); 280 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
273 page = find_get_page(bd_mapping, index); 281 page = find_get_page(bd_mapping, index);
274 if (!page) 282 if (!page)
275 goto out; 283 goto out;
276 284
277 spin_lock(&bd_mapping->private_lock); 285 spin_lock(&bd_mapping->private_lock);
278 if (!page_has_buffers(page)) 286 if (!page_has_buffers(page))
279 goto out_unlock; 287 goto out_unlock;
280 head = page_buffers(page); 288 head = page_buffers(page);
281 bh = head; 289 bh = head;
282 do { 290 do {
283 if (bh->b_blocknr == block) { 291 if (bh->b_blocknr == block) {
284 ret = bh; 292 ret = bh;
285 get_bh(bh); 293 get_bh(bh);
286 goto out_unlock; 294 goto out_unlock;
287 } 295 }
288 if (!buffer_mapped(bh)) 296 if (!buffer_mapped(bh))
289 all_mapped = 0; 297 all_mapped = 0;
290 bh = bh->b_this_page; 298 bh = bh->b_this_page;
291 } while (bh != head); 299 } while (bh != head);
292 300
293 /* we might be here because some of the buffers on this page are 301 /* we might be here because some of the buffers on this page are
294 * not mapped. This is due to various races between 302 * not mapped. This is due to various races between
295 * file io on the block device and getblk. It gets dealt with 303 * file io on the block device and getblk. It gets dealt with
296 * elsewhere, don't buffer_error if we had some unmapped buffers 304 * elsewhere, don't buffer_error if we had some unmapped buffers
297 */ 305 */
298 if (all_mapped) { 306 if (all_mapped) {
299 printk("__find_get_block_slow() failed. " 307 printk("__find_get_block_slow() failed. "
300 "block=%llu, b_blocknr=%llu\n", 308 "block=%llu, b_blocknr=%llu\n",
301 (unsigned long long)block, 309 (unsigned long long)block,
302 (unsigned long long)bh->b_blocknr); 310 (unsigned long long)bh->b_blocknr);
303 printk("b_state=0x%08lx, b_size=%zu\n", 311 printk("b_state=0x%08lx, b_size=%zu\n",
304 bh->b_state, bh->b_size); 312 bh->b_state, bh->b_size);
305 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits); 313 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
306 } 314 }
307 out_unlock: 315 out_unlock:
308 spin_unlock(&bd_mapping->private_lock); 316 spin_unlock(&bd_mapping->private_lock);
309 page_cache_release(page); 317 page_cache_release(page);
310 out: 318 out:
311 return ret; 319 return ret;
312 } 320 }
313 321
314 /* If invalidate_buffers() will trash dirty buffers, it means some kind 322 /* If invalidate_buffers() will trash dirty buffers, it means some kind
315 of fs corruption is going on. Trashing dirty data always implies losing 323 of fs corruption is going on. Trashing dirty data always implies losing
316 information that was supposed to be just stored on the physical layer 324 information that was supposed to be just stored on the physical layer
317 by the user. 325 by the user.
318 326
319 Thus invalidate_buffers in general usage is not allowed to trash 327 Thus invalidate_buffers in general usage is not allowed to trash
320 dirty buffers. For example, ioctl(BLKFLSBUF) expects dirty data to 328 dirty buffers. For example, ioctl(BLKFLSBUF) expects dirty data to
321 be preserved. These buffers are simply skipped. 329 be preserved. These buffers are simply skipped.
322 330
323 We also skip buffers which are still in use. For example this can 331 We also skip buffers which are still in use. For example this can
324 happen if a userspace program is reading the block device. 332 happen if a userspace program is reading the block device.
325 333
326 NOTE: In the case where the user removed a removable-media disk even if 334 NOTE: In the case where the user removed a removable-media disk even if
327 there's still dirty data not synced on disk (due to a bug in the device driver 335 there's still dirty data not synced on disk (due to a bug in the device driver
328 or due to an error by the user), by not destroying the dirty buffers we could 336 or due to an error by the user), by not destroying the dirty buffers we could
329 generate corruption also on the next media inserted, thus a parameter is 337 generate corruption also on the next media inserted, thus a parameter is
330 necessary to handle this case in the safest way possible (trying 338 necessary to handle this case in the safest way possible (trying
331 to not corrupt also the new disk inserted with the data belonging to 339 to not corrupt also the new disk inserted with the data belonging to
332 the old now corrupted disk). Also for the ramdisk the natural thing 340 the old now corrupted disk). Also for the ramdisk the natural thing
333 to do in order to release the ramdisk memory is to destroy dirty buffers. 341 to do in order to release the ramdisk memory is to destroy dirty buffers.
334 342
335 These are two special cases. Normal usage implies that the device driver 343 These are two special cases. Normal usage implies that the device driver
336 issues a sync on the device (without waiting for I/O completion) and 344 issues a sync on the device (without waiting for I/O completion) and
337 then an invalidate_buffers call that doesn't trash dirty buffers. 345 then an invalidate_buffers call that doesn't trash dirty buffers.
338 346
339 For handling cache coherency with the blkdev pagecache the 'update' case 347 For handling cache coherency with the blkdev pagecache the 'update' case
340 has been introduced. It is needed to re-read from disk any pinned 348 has been introduced. It is needed to re-read from disk any pinned
341 buffer. NOTE: re-reading from disk is destructive so we can do it only 349 buffer. NOTE: re-reading from disk is destructive so we can do it only
342 when we assume nobody is changing the buffercache under our I/O and when 350 when we assume nobody is changing the buffercache under our I/O and when
343 we think the disk contains more recent information than the buffercache. 351 we think the disk contains more recent information than the buffercache.
344 The update == 1 pass marks the buffers we need to update, the update == 2 352 The update == 1 pass marks the buffers we need to update, the update == 2
345 pass does the actual I/O. */ 353 pass does the actual I/O. */
346 void invalidate_bdev(struct block_device *bdev) 354 void invalidate_bdev(struct block_device *bdev)
347 { 355 {
348 struct address_space *mapping = bdev->bd_inode->i_mapping; 356 struct address_space *mapping = bdev->bd_inode->i_mapping;
349 357
350 if (mapping->nrpages == 0) 358 if (mapping->nrpages == 0)
351 return; 359 return;
352 360
353 invalidate_bh_lrus(); 361 invalidate_bh_lrus();
354 invalidate_mapping_pages(mapping, 0, -1); 362 invalidate_mapping_pages(mapping, 0, -1);
355 } 363 }
356 364
357 /* 365 /*
358 * Kick pdflush then try to free up some ZONE_NORMAL memory. 366 * Kick pdflush then try to free up some ZONE_NORMAL memory.
359 */ 367 */
360 static void free_more_memory(void) 368 static void free_more_memory(void)
361 { 369 {
362 struct zone *zone; 370 struct zone *zone;
363 int nid; 371 int nid;
364 372
365 wakeup_pdflush(1024); 373 wakeup_pdflush(1024);
366 yield(); 374 yield();
367 375
368 for_each_online_node(nid) { 376 for_each_online_node(nid) {
369 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), 377 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
370 gfp_zone(GFP_NOFS), NULL, 378 gfp_zone(GFP_NOFS), NULL,
371 &zone); 379 &zone);
372 if (zone) 380 if (zone)
373 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, 381 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
374 GFP_NOFS); 382 GFP_NOFS);
375 } 383 }
376 } 384 }
377 385
378 /* 386 /*
379 * I/O completion handler for block_read_full_page() - pages 387 * I/O completion handler for block_read_full_page() - pages
380 * which come unlocked at the end of I/O. 388 * which come unlocked at the end of I/O.
381 */ 389 */
382 static void end_buffer_async_read(struct buffer_head *bh, int uptodate) 390 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
383 { 391 {
384 unsigned long flags; 392 unsigned long flags;
385 struct buffer_head *first; 393 struct buffer_head *first;
386 struct buffer_head *tmp; 394 struct buffer_head *tmp;
387 struct page *page; 395 struct page *page;
388 int page_uptodate = 1; 396 int page_uptodate = 1;
389 397
390 BUG_ON(!buffer_async_read(bh)); 398 BUG_ON(!buffer_async_read(bh));
391 399
392 page = bh->b_page; 400 page = bh->b_page;
393 if (uptodate) { 401 if (uptodate) {
394 set_buffer_uptodate(bh); 402 set_buffer_uptodate(bh);
395 } else { 403 } else {
396 clear_buffer_uptodate(bh); 404 clear_buffer_uptodate(bh);
397 if (printk_ratelimit()) 405 if (!quiet_error(bh))
398 buffer_io_error(bh); 406 buffer_io_error(bh);
399 SetPageError(page); 407 SetPageError(page);
400 } 408 }
401 409
402 /* 410 /*
403 * Be _very_ careful from here on. Bad things can happen if 411 * Be _very_ careful from here on. Bad things can happen if
404 * two buffer heads end IO at almost the same time and both 412 * two buffer heads end IO at almost the same time and both
405 * decide that the page is now completely done. 413 * decide that the page is now completely done.
406 */ 414 */
407 first = page_buffers(page); 415 first = page_buffers(page);
408 local_irq_save(flags); 416 local_irq_save(flags);
409 bit_spin_lock(BH_Uptodate_Lock, &first->b_state); 417 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
410 clear_buffer_async_read(bh); 418 clear_buffer_async_read(bh);
411 unlock_buffer(bh); 419 unlock_buffer(bh);
412 tmp = bh; 420 tmp = bh;
413 do { 421 do {
414 if (!buffer_uptodate(tmp)) 422 if (!buffer_uptodate(tmp))
415 page_uptodate = 0; 423 page_uptodate = 0;
416 if (buffer_async_read(tmp)) { 424 if (buffer_async_read(tmp)) {
417 BUG_ON(!buffer_locked(tmp)); 425 BUG_ON(!buffer_locked(tmp));
418 goto still_busy; 426 goto still_busy;
419 } 427 }
420 tmp = tmp->b_this_page; 428 tmp = tmp->b_this_page;
421 } while (tmp != bh); 429 } while (tmp != bh);
422 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 430 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
423 local_irq_restore(flags); 431 local_irq_restore(flags);
424 432
425 /* 433 /*
426 * If none of the buffers had errors and they are all 434 * If none of the buffers had errors and they are all
427 * uptodate then we can set the page uptodate. 435 * uptodate then we can set the page uptodate.
428 */ 436 */
429 if (page_uptodate && !PageError(page)) 437 if (page_uptodate && !PageError(page))
430 SetPageUptodate(page); 438 SetPageUptodate(page);
431 unlock_page(page); 439 unlock_page(page);
432 return; 440 return;
433 441
434 still_busy: 442 still_busy:
435 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 443 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
436 local_irq_restore(flags); 444 local_irq_restore(flags);
437 return; 445 return;
438 } 446 }
439 447
440 /* 448 /*
441 * Completion handler for block_write_full_page() - pages which are unlocked 449 * Completion handler for block_write_full_page() - pages which are unlocked
442 * during I/O, and which have PageWriteback cleared upon I/O completion. 450 * during I/O, and which have PageWriteback cleared upon I/O completion.
443 */ 451 */
444 static void end_buffer_async_write(struct buffer_head *bh, int uptodate) 452 static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
445 { 453 {
446 char b[BDEVNAME_SIZE]; 454 char b[BDEVNAME_SIZE];
447 unsigned long flags; 455 unsigned long flags;
448 struct buffer_head *first; 456 struct buffer_head *first;
449 struct buffer_head *tmp; 457 struct buffer_head *tmp;
450 struct page *page; 458 struct page *page;
451 459
452 BUG_ON(!buffer_async_write(bh)); 460 BUG_ON(!buffer_async_write(bh));
453 461
454 page = bh->b_page; 462 page = bh->b_page;
455 if (uptodate) { 463 if (uptodate) {
456 set_buffer_uptodate(bh); 464 set_buffer_uptodate(bh);
457 } else { 465 } else {
458 if (printk_ratelimit()) { 466 if (!quiet_error(bh)) {
459 buffer_io_error(bh); 467 buffer_io_error(bh);
460 printk(KERN_WARNING "lost page write due to " 468 printk(KERN_WARNING "lost page write due to "
461 "I/O error on %s\n", 469 "I/O error on %s\n",
462 bdevname(bh->b_bdev, b)); 470 bdevname(bh->b_bdev, b));
463 } 471 }
464 set_bit(AS_EIO, &page->mapping->flags); 472 set_bit(AS_EIO, &page->mapping->flags);
465 set_buffer_write_io_error(bh); 473 set_buffer_write_io_error(bh);
466 clear_buffer_uptodate(bh); 474 clear_buffer_uptodate(bh);
467 SetPageError(page); 475 SetPageError(page);
468 } 476 }
469 477
470 first = page_buffers(page); 478 first = page_buffers(page);
471 local_irq_save(flags); 479 local_irq_save(flags);
472 bit_spin_lock(BH_Uptodate_Lock, &first->b_state); 480 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
473 481
474 clear_buffer_async_write(bh); 482 clear_buffer_async_write(bh);
475 unlock_buffer(bh); 483 unlock_buffer(bh);
476 tmp = bh->b_this_page; 484 tmp = bh->b_this_page;
477 while (tmp != bh) { 485 while (tmp != bh) {
478 if (buffer_async_write(tmp)) { 486 if (buffer_async_write(tmp)) {
479 BUG_ON(!buffer_locked(tmp)); 487 BUG_ON(!buffer_locked(tmp));
480 goto still_busy; 488 goto still_busy;
481 } 489 }
482 tmp = tmp->b_this_page; 490 tmp = tmp->b_this_page;
483 } 491 }
484 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 492 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
485 local_irq_restore(flags); 493 local_irq_restore(flags);
486 end_page_writeback(page); 494 end_page_writeback(page);
487 return; 495 return;
488 496
489 still_busy: 497 still_busy:
490 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); 498 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
491 local_irq_restore(flags); 499 local_irq_restore(flags);
492 return; 500 return;
493 } 501 }
494 502
495 /* 503 /*
496 * If a page's buffers are under async readin (end_buffer_async_read 504 * If a page's buffers are under async readin (end_buffer_async_read
497 * completion) then there is a possibility that another thread of 505 * completion) then there is a possibility that another thread of
498 * control could lock one of the buffers after it has completed 506 * control could lock one of the buffers after it has completed
499 * but while some of the other buffers have not completed. This 507 * but while some of the other buffers have not completed. This
500 * locked buffer would confuse end_buffer_async_read() into not unlocking 508 * locked buffer would confuse end_buffer_async_read() into not unlocking
501 * the page. So the absence of BH_Async_Read tells end_buffer_async_read() 509 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
502 * that this buffer is not under async I/O. 510 * that this buffer is not under async I/O.
503 * 511 *
504 * The page comes unlocked when it has no locked buffer_async buffers 512 * The page comes unlocked when it has no locked buffer_async buffers
505 * left. 513 * left.
506 * 514 *
507 * PageLocked prevents anyone from starting new async I/O reads against any of 515 * PageLocked prevents anyone from starting new async I/O reads against any of
508 * the buffers. 516 * the buffers.
509 * 517 *
510 * PageWriteback is used to prevent simultaneous writeout of the same 518 * PageWriteback is used to prevent simultaneous writeout of the same
511 * page. 519 * page.
512 * 520 *
513 * PageLocked prevents anyone from starting writeback of a page which is 521 * PageLocked prevents anyone from starting writeback of a page which is
514 * under read I/O (PageWriteback is only ever set against a locked page). 522 * under read I/O (PageWriteback is only ever set against a locked page).
515 */ 523 */
516 static void mark_buffer_async_read(struct buffer_head *bh) 524 static void mark_buffer_async_read(struct buffer_head *bh)
517 { 525 {
518 bh->b_end_io = end_buffer_async_read; 526 bh->b_end_io = end_buffer_async_read;
519 set_buffer_async_read(bh); 527 set_buffer_async_read(bh);
520 } 528 }
521 529
522 void mark_buffer_async_write(struct buffer_head *bh) 530 void mark_buffer_async_write(struct buffer_head *bh)
523 { 531 {
524 bh->b_end_io = end_buffer_async_write; 532 bh->b_end_io = end_buffer_async_write;
525 set_buffer_async_write(bh); 533 set_buffer_async_write(bh);
526 } 534 }
527 EXPORT_SYMBOL(mark_buffer_async_write); 535 EXPORT_SYMBOL(mark_buffer_async_write);
528 536
529 537
530 /* 538 /*
531 * fs/buffer.c contains helper functions for buffer-backed address space's 539 * fs/buffer.c contains helper functions for buffer-backed address space's
532 * fsync functions. A common requirement for buffer-based filesystems is 540 * fsync functions. A common requirement for buffer-based filesystems is
533 * that certain data from the backing blockdev needs to be written out for 541 * that certain data from the backing blockdev needs to be written out for
534 * a successful fsync(). For example, ext2 indirect blocks need to be 542 * a successful fsync(). For example, ext2 indirect blocks need to be
535 * written back and waited upon before fsync() returns. 543 * written back and waited upon before fsync() returns.
536 * 544 *
537 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), 545 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
538 * inode_has_buffers() and invalidate_inode_buffers() are provided for the 546 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
539 * management of a list of dependent buffers at ->i_mapping->private_list. 547 * management of a list of dependent buffers at ->i_mapping->private_list.
540 * 548 *
541 * Locking is a little subtle: try_to_free_buffers() will remove buffers 549 * Locking is a little subtle: try_to_free_buffers() will remove buffers
542 * from their controlling inode's queue when they are being freed. But 550 * from their controlling inode's queue when they are being freed. But
543 * try_to_free_buffers() will be operating against the *blockdev* mapping 551 * try_to_free_buffers() will be operating against the *blockdev* mapping
544 * at the time, not against the S_ISREG file which depends on those buffers. 552 * at the time, not against the S_ISREG file which depends on those buffers.
545 * So the locking for private_list is via the private_lock in the address_space 553 * So the locking for private_list is via the private_lock in the address_space
546 * which backs the buffers. Which is different from the address_space 554 * which backs the buffers. Which is different from the address_space
547 * against which the buffers are listed. So for a particular address_space, 555 * against which the buffers are listed. So for a particular address_space,
548 * mapping->private_lock does *not* protect mapping->private_list! In fact, 556 * mapping->private_lock does *not* protect mapping->private_list! In fact,
549 * mapping->private_list will always be protected by the backing blockdev's 557 * mapping->private_list will always be protected by the backing blockdev's
550 * ->private_lock. 558 * ->private_lock.
551 * 559 *
552 * Which introduces a requirement: all buffers on an address_space's 560 * Which introduces a requirement: all buffers on an address_space's
553 * ->private_list must be from the same address_space: the blockdev's. 561 * ->private_list must be from the same address_space: the blockdev's.
554 * 562 *
555 * address_spaces which do not place buffers at ->private_list via these 563 * address_spaces which do not place buffers at ->private_list via these
556 * utility functions are free to use private_lock and private_list for 564 * utility functions are free to use private_lock and private_list for
557 * whatever they want. The only requirement is that list_empty(private_list) 565 * whatever they want. The only requirement is that list_empty(private_list)
558 * be true at clear_inode() time. 566 * be true at clear_inode() time.
559 * 567 *
560 * FIXME: clear_inode should not call invalidate_inode_buffers(). The 568 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
561 * filesystems should do that. invalidate_inode_buffers() should just go 569 * filesystems should do that. invalidate_inode_buffers() should just go
562 * BUG_ON(!list_empty). 570 * BUG_ON(!list_empty).
563 * 571 *
564 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should 572 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
565 * take an address_space, not an inode. And it should be called 573 * take an address_space, not an inode. And it should be called
566 * mark_buffer_dirty_fsync() to clearly define why those buffers are being 574 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
567 * queued up. 575 * queued up.
568 * 576 *
569 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the 577 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
570 * list if it is already on a list. Because if the buffer is on a list, 578 * list if it is already on a list. Because if the buffer is on a list,
571 * it *must* already be on the right one. If not, the filesystem is being 579 * it *must* already be on the right one. If not, the filesystem is being
572 * silly. This will save a ton of locking. But first we have to ensure 580 * silly. This will save a ton of locking. But first we have to ensure
573 * that buffers are taken *off* the old inode's list when they are freed 581 * that buffers are taken *off* the old inode's list when they are freed
574 * (presumably in truncate). That requires careful auditing of all 582 * (presumably in truncate). That requires careful auditing of all
575 * filesystems (do it inside bforget()). It could also be done by bringing 583 * filesystems (do it inside bforget()). It could also be done by bringing
576 * b_inode back. 584 * b_inode back.
577 */ 585 */
578 586
579 /* 587 /*
580 * The buffer's backing address_space's private_lock must be held 588 * The buffer's backing address_space's private_lock must be held
581 */ 589 */
582 static void __remove_assoc_queue(struct buffer_head *bh) 590 static void __remove_assoc_queue(struct buffer_head *bh)
583 { 591 {
584 list_del_init(&bh->b_assoc_buffers); 592 list_del_init(&bh->b_assoc_buffers);
585 WARN_ON(!bh->b_assoc_map); 593 WARN_ON(!bh->b_assoc_map);
586 if (buffer_write_io_error(bh)) 594 if (buffer_write_io_error(bh))
587 set_bit(AS_EIO, &bh->b_assoc_map->flags); 595 set_bit(AS_EIO, &bh->b_assoc_map->flags);
588 bh->b_assoc_map = NULL; 596 bh->b_assoc_map = NULL;
589 } 597 }
590 598
591 int inode_has_buffers(struct inode *inode) 599 int inode_has_buffers(struct inode *inode)
592 { 600 {
593 return !list_empty(&inode->i_data.private_list); 601 return !list_empty(&inode->i_data.private_list);
594 } 602 }
595 603
596 /* 604 /*
597 * osync is designed to support O_SYNC io. It waits synchronously for 605 * osync is designed to support O_SYNC io. It waits synchronously for
598 * all already-submitted IO to complete, but does not queue any new 606 * all already-submitted IO to complete, but does not queue any new
599 * writes to the disk. 607 * writes to the disk.
600 * 608 *
601 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as 609 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
602 * you dirty the buffers, and then use osync_inode_buffers to wait for 610 * you dirty the buffers, and then use osync_inode_buffers to wait for
603 * completion. Any other dirty buffers which are not yet queued for 611 * completion. Any other dirty buffers which are not yet queued for
604 * write will not be flushed to disk by the osync. 612 * write will not be flushed to disk by the osync.
605 */ 613 */
606 static int osync_buffers_list(spinlock_t *lock, struct list_head *list) 614 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
607 { 615 {
608 struct buffer_head *bh; 616 struct buffer_head *bh;
609 struct list_head *p; 617 struct list_head *p;
610 int err = 0; 618 int err = 0;
611 619
612 spin_lock(lock); 620 spin_lock(lock);
613 repeat: 621 repeat:
614 list_for_each_prev(p, list) { 622 list_for_each_prev(p, list) {
615 bh = BH_ENTRY(p); 623 bh = BH_ENTRY(p);
616 if (buffer_locked(bh)) { 624 if (buffer_locked(bh)) {
617 get_bh(bh); 625 get_bh(bh);
618 spin_unlock(lock); 626 spin_unlock(lock);
619 wait_on_buffer(bh); 627 wait_on_buffer(bh);
620 if (!buffer_uptodate(bh)) 628 if (!buffer_uptodate(bh))
621 err = -EIO; 629 err = -EIO;
622 brelse(bh); 630 brelse(bh);
623 spin_lock(lock); 631 spin_lock(lock);
624 goto repeat; 632 goto repeat;
625 } 633 }
626 } 634 }
627 spin_unlock(lock); 635 spin_unlock(lock);
628 return err; 636 return err;
629 } 637 }
630 638
631 /** 639 /**
632 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers 640 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
633 * @mapping: the mapping which wants those buffers written 641 * @mapping: the mapping which wants those buffers written
634 * 642 *
635 * Starts I/O against the buffers at mapping->private_list, and waits upon 643 * Starts I/O against the buffers at mapping->private_list, and waits upon
636 * that I/O. 644 * that I/O.
637 * 645 *
638 * Basically, this is a convenience function for fsync(). 646 * Basically, this is a convenience function for fsync().
639 * @mapping is a file or directory which needs those buffers to be written for 647 * @mapping is a file or directory which needs those buffers to be written for
640 * a successful fsync(). 648 * a successful fsync().
641 */ 649 */
642 int sync_mapping_buffers(struct address_space *mapping) 650 int sync_mapping_buffers(struct address_space *mapping)
643 { 651 {
644 struct address_space *buffer_mapping = mapping->assoc_mapping; 652 struct address_space *buffer_mapping = mapping->assoc_mapping;
645 653
646 if (buffer_mapping == NULL || list_empty(&mapping->private_list)) 654 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
647 return 0; 655 return 0;
648 656
649 return fsync_buffers_list(&buffer_mapping->private_lock, 657 return fsync_buffers_list(&buffer_mapping->private_lock,
650 &mapping->private_list); 658 &mapping->private_list);
651 } 659 }
652 EXPORT_SYMBOL(sync_mapping_buffers); 660 EXPORT_SYMBOL(sync_mapping_buffers);
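A minimal usage sketch (illustrative only, not part of this change): a simple filesystem's ->fsync() can be little more than a call to sync_mapping_buffers() on the inode's mapping. The examplefs_fsync name and the 2.6-era fsync prototype it fills are assumptions.

static int examplefs_fsync(struct file *file, struct dentry *dentry,
			   int datasync)
{
	struct inode *inode = dentry->d_inode;

	/* write out & wait upon everything on inode->i_mapping->private_list */
	return sync_mapping_buffers(inode->i_mapping);
}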
653 661
654 /* 662 /*
655 * Called when we've recently written block `bblock', and it is known that 663 * Called when we've recently written block `bblock', and it is known that
656 * `bblock' was for a buffer_boundary() buffer. This means that the block at 664 * `bblock' was for a buffer_boundary() buffer. This means that the block at
657 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's 665 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
658 * dirty, schedule it for IO. So that indirects merge nicely with their data. 666 * dirty, schedule it for IO. So that indirects merge nicely with their data.
659 */ 667 */
660 void write_boundary_block(struct block_device *bdev, 668 void write_boundary_block(struct block_device *bdev,
661 sector_t bblock, unsigned blocksize) 669 sector_t bblock, unsigned blocksize)
662 { 670 {
663 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); 671 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
664 if (bh) { 672 if (bh) {
665 if (buffer_dirty(bh)) 673 if (buffer_dirty(bh))
666 ll_rw_block(WRITE, 1, &bh); 674 ll_rw_block(WRITE, 1, &bh);
667 put_bh(bh); 675 put_bh(bh);
668 } 676 }
669 } 677 }
670 678
671 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) 679 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
672 { 680 {
673 struct address_space *mapping = inode->i_mapping; 681 struct address_space *mapping = inode->i_mapping;
674 struct address_space *buffer_mapping = bh->b_page->mapping; 682 struct address_space *buffer_mapping = bh->b_page->mapping;
675 683
676 mark_buffer_dirty(bh); 684 mark_buffer_dirty(bh);
677 if (!mapping->assoc_mapping) { 685 if (!mapping->assoc_mapping) {
678 mapping->assoc_mapping = buffer_mapping; 686 mapping->assoc_mapping = buffer_mapping;
679 } else { 687 } else {
680 BUG_ON(mapping->assoc_mapping != buffer_mapping); 688 BUG_ON(mapping->assoc_mapping != buffer_mapping);
681 } 689 }
682 if (!bh->b_assoc_map) { 690 if (!bh->b_assoc_map) {
683 spin_lock(&buffer_mapping->private_lock); 691 spin_lock(&buffer_mapping->private_lock);
684 list_move_tail(&bh->b_assoc_buffers, 692 list_move_tail(&bh->b_assoc_buffers,
685 &mapping->private_list); 693 &mapping->private_list);
686 bh->b_assoc_map = mapping; 694 bh->b_assoc_map = mapping;
687 spin_unlock(&buffer_mapping->private_lock); 695 spin_unlock(&buffer_mapping->private_lock);
688 } 696 }
689 } 697 }
690 EXPORT_SYMBOL(mark_buffer_dirty_inode); 698 EXPORT_SYMBOL(mark_buffer_dirty_inode);
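A hedged sketch of the intended calling pattern: after modifying a metadata buffer on behalf of an inode, associate it with that inode so a later fsync() of the inode writes it. examplefs_update_indirect and the little-endian slot layout are hypothetical.

static void examplefs_update_indirect(struct inode *inode,
				      struct buffer_head *bh,
				      int slot, u32 new_block)
{
	__le32 *table = (__le32 *)bh->b_data;

	lock_buffer(bh);
	table[slot] = cpu_to_le32(new_block);	/* hypothetical on-disk format */
	unlock_buffer(bh);

	/* dirty the buffer and queue it on inode->i_mapping's private_list */
	mark_buffer_dirty_inode(bh, inode);
}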
691 699
692 /* 700 /*
693 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode 701 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
694 * dirty. 702 * dirty.
695 * 703 *
696 * If warn is true, then emit a warning if the page is not uptodate and has 704 * If warn is true, then emit a warning if the page is not uptodate and has
697 * not been truncated. 705 * not been truncated.
698 */ 706 */
699 static int __set_page_dirty(struct page *page, 707 static int __set_page_dirty(struct page *page,
700 struct address_space *mapping, int warn) 708 struct address_space *mapping, int warn)
701 { 709 {
702 if (unlikely(!mapping)) 710 if (unlikely(!mapping))
703 return !TestSetPageDirty(page); 711 return !TestSetPageDirty(page);
704 712
705 if (TestSetPageDirty(page)) 713 if (TestSetPageDirty(page))
706 return 0; 714 return 0;
707 715
708 spin_lock_irq(&mapping->tree_lock); 716 spin_lock_irq(&mapping->tree_lock);
709 if (page->mapping) { /* Race with truncate? */ 717 if (page->mapping) { /* Race with truncate? */
710 WARN_ON_ONCE(warn && !PageUptodate(page)); 718 WARN_ON_ONCE(warn && !PageUptodate(page));
711 719
712 if (mapping_cap_account_dirty(mapping)) { 720 if (mapping_cap_account_dirty(mapping)) {
713 __inc_zone_page_state(page, NR_FILE_DIRTY); 721 __inc_zone_page_state(page, NR_FILE_DIRTY);
714 __inc_bdi_stat(mapping->backing_dev_info, 722 __inc_bdi_stat(mapping->backing_dev_info,
715 BDI_RECLAIMABLE); 723 BDI_RECLAIMABLE);
716 task_io_account_write(PAGE_CACHE_SIZE); 724 task_io_account_write(PAGE_CACHE_SIZE);
717 } 725 }
718 radix_tree_tag_set(&mapping->page_tree, 726 radix_tree_tag_set(&mapping->page_tree,
719 page_index(page), PAGECACHE_TAG_DIRTY); 727 page_index(page), PAGECACHE_TAG_DIRTY);
720 } 728 }
721 spin_unlock_irq(&mapping->tree_lock); 729 spin_unlock_irq(&mapping->tree_lock);
722 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 730 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
723 731
724 return 1; 732 return 1;
725 } 733 }
726 734
727 /* 735 /*
728 * Add a page to the dirty page list. 736 * Add a page to the dirty page list.
729 * 737 *
730 * It is a sad fact of life that this function is called from several places 738 * It is a sad fact of life that this function is called from several places
731 * deeply under spinlocking. It may not sleep. 739 * deeply under spinlocking. It may not sleep.
732 * 740 *
733 * If the page has buffers, the uptodate buffers are set dirty, to preserve 741 * If the page has buffers, the uptodate buffers are set dirty, to preserve
734 * dirty-state coherency between the page and the buffers. If the page does 742 * dirty-state coherency between the page and the buffers. If the page does
735 * not have buffers then when they are later attached they will all be set 743 * not have buffers then when they are later attached they will all be set
736 * dirty. 744 * dirty.
737 * 745 *
738 * The buffers are dirtied before the page is dirtied. There's a small race 746 * The buffers are dirtied before the page is dirtied. There's a small race
739 * window in which a writepage caller may see the page cleanness but not the 747 * window in which a writepage caller may see the page cleanness but not the
740 * buffer dirtiness. That's fine. If this code were to set the page dirty 748 * buffer dirtiness. That's fine. If this code were to set the page dirty
741 * before the buffers, a concurrent writepage caller could clear the page dirty 749 * before the buffers, a concurrent writepage caller could clear the page dirty
742 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean 750 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
743 * page on the dirty page list. 751 * page on the dirty page list.
744 * 752 *
745 * We use private_lock to lock against try_to_free_buffers while using the 753 * We use private_lock to lock against try_to_free_buffers while using the
746 * page's buffer list. Also use this to protect against clean buffers being 754 * page's buffer list. Also use this to protect against clean buffers being
747 * added to the page after it was set dirty. 755 * added to the page after it was set dirty.
748 * 756 *
749 * FIXME: may need to call ->reservepage here as well. That's rather up to the 757 * FIXME: may need to call ->reservepage here as well. That's rather up to the
750 * address_space though. 758 * address_space though.
751 */ 759 */
752 int __set_page_dirty_buffers(struct page *page) 760 int __set_page_dirty_buffers(struct page *page)
753 { 761 {
754 struct address_space *mapping = page_mapping(page); 762 struct address_space *mapping = page_mapping(page);
755 763
756 if (unlikely(!mapping)) 764 if (unlikely(!mapping))
757 return !TestSetPageDirty(page); 765 return !TestSetPageDirty(page);
758 766
759 spin_lock(&mapping->private_lock); 767 spin_lock(&mapping->private_lock);
760 if (page_has_buffers(page)) { 768 if (page_has_buffers(page)) {
761 struct buffer_head *head = page_buffers(page); 769 struct buffer_head *head = page_buffers(page);
762 struct buffer_head *bh = head; 770 struct buffer_head *bh = head;
763 771
764 do { 772 do {
765 set_buffer_dirty(bh); 773 set_buffer_dirty(bh);
766 bh = bh->b_this_page; 774 bh = bh->b_this_page;
767 } while (bh != head); 775 } while (bh != head);
768 } 776 }
769 spin_unlock(&mapping->private_lock); 777 spin_unlock(&mapping->private_lock);
770 778
771 return __set_page_dirty(page, mapping, 1); 779 return __set_page_dirty(page, mapping, 1);
772 } 780 }
773 EXPORT_SYMBOL(__set_page_dirty_buffers); 781 EXPORT_SYMBOL(__set_page_dirty_buffers);
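For block-backed filesystems this function is typically just plugged into the address_space_operations, so that dirtying a page dirties its buffers as described above. A hedged sketch, with the other methods elided and examplefs_aops hypothetical:

static const struct address_space_operations examplefs_aops = {
	.set_page_dirty	= __set_page_dirty_buffers,
	/* .readpage, .writepage, ... */
};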
774 782
775 /* 783 /*
776 * Write out and wait upon a list of buffers. 784 * Write out and wait upon a list of buffers.
777 * 785 *
778 * We have conflicting pressures: we want to make sure that all 786 * We have conflicting pressures: we want to make sure that all
779 * initially dirty buffers get waited on, but that any subsequently 787 * initially dirty buffers get waited on, but that any subsequently
780 * dirtied buffers don't. After all, we don't want fsync to last 788 * dirtied buffers don't. After all, we don't want fsync to last
781 * forever if somebody is actively writing to the file. 789 * forever if somebody is actively writing to the file.
782 * 790 *
783 * Do this in two main stages: first we copy dirty buffers to a 791 * Do this in two main stages: first we copy dirty buffers to a
784 * temporary inode list, queueing the writes as we go. Then we clean 792 * temporary inode list, queueing the writes as we go. Then we clean
785 * up, waiting for those writes to complete. 793 * up, waiting for those writes to complete.
786 * 794 *
787 * During this second stage, any subsequent updates to the file may end 795 * During this second stage, any subsequent updates to the file may end
788 * up refiling the buffer on the original inode's dirty list again, so 796 * up refiling the buffer on the original inode's dirty list again, so
789 * there is a chance we will end up with a buffer queued for write but 797 * there is a chance we will end up with a buffer queued for write but
790 * not yet completed on that list. So, as a final cleanup we go through 798 * not yet completed on that list. So, as a final cleanup we go through
791 * the osync code to catch these locked, dirty buffers without requeuing 799 * the osync code to catch these locked, dirty buffers without requeuing
792 * any newly dirty buffers for write. 800 * any newly dirty buffers for write.
793 */ 801 */
794 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) 802 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
795 { 803 {
796 struct buffer_head *bh; 804 struct buffer_head *bh;
797 struct list_head tmp; 805 struct list_head tmp;
798 struct address_space *mapping; 806 struct address_space *mapping;
799 int err = 0, err2; 807 int err = 0, err2;
800 808
801 INIT_LIST_HEAD(&tmp); 809 INIT_LIST_HEAD(&tmp);
802 810
803 spin_lock(lock); 811 spin_lock(lock);
804 while (!list_empty(list)) { 812 while (!list_empty(list)) {
805 bh = BH_ENTRY(list->next); 813 bh = BH_ENTRY(list->next);
806 mapping = bh->b_assoc_map; 814 mapping = bh->b_assoc_map;
807 __remove_assoc_queue(bh); 815 __remove_assoc_queue(bh);
808 /* Avoid race with mark_buffer_dirty_inode() which does 816 /* Avoid race with mark_buffer_dirty_inode() which does
809 * a lockless check and we rely on seeing the dirty bit */ 817 * a lockless check and we rely on seeing the dirty bit */
810 smp_mb(); 818 smp_mb();
811 if (buffer_dirty(bh) || buffer_locked(bh)) { 819 if (buffer_dirty(bh) || buffer_locked(bh)) {
812 list_add(&bh->b_assoc_buffers, &tmp); 820 list_add(&bh->b_assoc_buffers, &tmp);
813 bh->b_assoc_map = mapping; 821 bh->b_assoc_map = mapping;
814 if (buffer_dirty(bh)) { 822 if (buffer_dirty(bh)) {
815 get_bh(bh); 823 get_bh(bh);
816 spin_unlock(lock); 824 spin_unlock(lock);
817 /* 825 /*
818 * Ensure any pending I/O completes so that 826 * Ensure any pending I/O completes so that
819 * ll_rw_block() actually writes the current 827 * ll_rw_block() actually writes the current
820 * contents - it is a noop if I/O is still in 828 * contents - it is a noop if I/O is still in
821 * flight on potentially older contents. 829 * flight on potentially older contents.
822 */ 830 */
823 ll_rw_block(SWRITE_SYNC, 1, &bh); 831 ll_rw_block(SWRITE_SYNC, 1, &bh);
824 brelse(bh); 832 brelse(bh);
825 spin_lock(lock); 833 spin_lock(lock);
826 } 834 }
827 } 835 }
828 } 836 }
829 837
830 while (!list_empty(&tmp)) { 838 while (!list_empty(&tmp)) {
831 bh = BH_ENTRY(tmp.prev); 839 bh = BH_ENTRY(tmp.prev);
832 get_bh(bh); 840 get_bh(bh);
833 mapping = bh->b_assoc_map; 841 mapping = bh->b_assoc_map;
834 __remove_assoc_queue(bh); 842 __remove_assoc_queue(bh);
835 /* Avoid race with mark_buffer_dirty_inode() which does 843 /* Avoid race with mark_buffer_dirty_inode() which does
836 * a lockless check and we rely on seeing the dirty bit */ 844 * a lockless check and we rely on seeing the dirty bit */
837 smp_mb(); 845 smp_mb();
838 if (buffer_dirty(bh)) { 846 if (buffer_dirty(bh)) {
839 list_add(&bh->b_assoc_buffers, 847 list_add(&bh->b_assoc_buffers,
840 &mapping->private_list); 848 &mapping->private_list);
841 bh->b_assoc_map = mapping; 849 bh->b_assoc_map = mapping;
842 } 850 }
843 spin_unlock(lock); 851 spin_unlock(lock);
844 wait_on_buffer(bh); 852 wait_on_buffer(bh);
845 if (!buffer_uptodate(bh)) 853 if (!buffer_uptodate(bh))
846 err = -EIO; 854 err = -EIO;
847 brelse(bh); 855 brelse(bh);
848 spin_lock(lock); 856 spin_lock(lock);
849 } 857 }
850 858
851 spin_unlock(lock); 859 spin_unlock(lock);
852 err2 = osync_buffers_list(lock, list); 860 err2 = osync_buffers_list(lock, list);
853 if (err) 861 if (err)
854 return err; 862 return err;
855 else 863 else
856 return err2; 864 return err2;
857 } 865 }
858 866
859 /* 867 /*
860 * Invalidate any and all dirty buffers on a given inode. We are 868 * Invalidate any and all dirty buffers on a given inode. We are
861 * probably unmounting the fs, but that doesn't mean we have already 869 * probably unmounting the fs, but that doesn't mean we have already
862 * done a sync(). Just drop the buffers from the inode list. 870 * done a sync(). Just drop the buffers from the inode list.
863 * 871 *
864 * NOTE: we take the inode's blockdev's mapping's private_lock. Which 872 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
865 * assumes that all the buffers are against the blockdev. Not true 873 * assumes that all the buffers are against the blockdev. Not true
866 * for reiserfs. 874 * for reiserfs.
867 */ 875 */
868 void invalidate_inode_buffers(struct inode *inode) 876 void invalidate_inode_buffers(struct inode *inode)
869 { 877 {
870 if (inode_has_buffers(inode)) { 878 if (inode_has_buffers(inode)) {
871 struct address_space *mapping = &inode->i_data; 879 struct address_space *mapping = &inode->i_data;
872 struct list_head *list = &mapping->private_list; 880 struct list_head *list = &mapping->private_list;
873 struct address_space *buffer_mapping = mapping->assoc_mapping; 881 struct address_space *buffer_mapping = mapping->assoc_mapping;
874 882
875 spin_lock(&buffer_mapping->private_lock); 883 spin_lock(&buffer_mapping->private_lock);
876 while (!list_empty(list)) 884 while (!list_empty(list))
877 __remove_assoc_queue(BH_ENTRY(list->next)); 885 __remove_assoc_queue(BH_ENTRY(list->next));
878 spin_unlock(&buffer_mapping->private_lock); 886 spin_unlock(&buffer_mapping->private_lock);
879 } 887 }
880 } 888 }
881 EXPORT_SYMBOL(invalidate_inode_buffers); 889 EXPORT_SYMBOL(invalidate_inode_buffers);
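A hedged sketch of a direct caller: a filesystem tearing down an inode can drop any still-associated buffers itself, which is what clear_inode() currently does on its behalf per the FIXME near the top of this section. examplefs_delete_inode is hypothetical.

static void examplefs_delete_inode(struct inode *inode)
{
	/* drop everything queued on inode->i_data.private_list */
	invalidate_inode_buffers(inode);
	/* ... release the on-disk blocks ... */
	clear_inode(inode);
}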
882 890
883 /* 891 /*
884 * Remove any clean buffers from the inode's buffer list. This is called 892 * Remove any clean buffers from the inode's buffer list. This is called
885 * when we're trying to free the inode itself. Those buffers can pin it. 893 * when we're trying to free the inode itself. Those buffers can pin it.
886 * 894 *
887 * Returns true if all buffers were removed. 895 * Returns true if all buffers were removed.
888 */ 896 */
889 int remove_inode_buffers(struct inode *inode) 897 int remove_inode_buffers(struct inode *inode)
890 { 898 {
891 int ret = 1; 899 int ret = 1;
892 900
893 if (inode_has_buffers(inode)) { 901 if (inode_has_buffers(inode)) {
894 struct address_space *mapping = &inode->i_data; 902 struct address_space *mapping = &inode->i_data;
895 struct list_head *list = &mapping->private_list; 903 struct list_head *list = &mapping->private_list;
896 struct address_space *buffer_mapping = mapping->assoc_mapping; 904 struct address_space *buffer_mapping = mapping->assoc_mapping;
897 905
898 spin_lock(&buffer_mapping->private_lock); 906 spin_lock(&buffer_mapping->private_lock);
899 while (!list_empty(list)) { 907 while (!list_empty(list)) {
900 struct buffer_head *bh = BH_ENTRY(list->next); 908 struct buffer_head *bh = BH_ENTRY(list->next);
901 if (buffer_dirty(bh)) { 909 if (buffer_dirty(bh)) {
902 ret = 0; 910 ret = 0;
903 break; 911 break;
904 } 912 }
905 __remove_assoc_queue(bh); 913 __remove_assoc_queue(bh);
906 } 914 }
907 spin_unlock(&buffer_mapping->private_lock); 915 spin_unlock(&buffer_mapping->private_lock);
908 } 916 }
909 return ret; 917 return ret;
910 } 918 }
911 919
912 /* 920 /*
913 * Create the appropriate buffers when given a page for data area and 921 * Create the appropriate buffers when given a page for data area and
914 * the size of each buffer.. Use the bh->b_this_page linked list to 922 * the size of each buffer.. Use the bh->b_this_page linked list to
915 * follow the buffers created. Return NULL if unable to create more 923 * follow the buffers created. Return NULL if unable to create more
916 * buffers. 924 * buffers.
917 * 925 *
918 * The retry flag is used to differentiate async IO (paging, swapping) 926 * The retry flag is used to differentiate async IO (paging, swapping)
919 * which may not fail from ordinary buffer allocations. 927 * which may not fail from ordinary buffer allocations.
920 */ 928 */
921 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, 929 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
922 int retry) 930 int retry)
923 { 931 {
924 struct buffer_head *bh, *head; 932 struct buffer_head *bh, *head;
925 long offset; 933 long offset;
926 934
927 try_again: 935 try_again:
928 head = NULL; 936 head = NULL;
929 offset = PAGE_SIZE; 937 offset = PAGE_SIZE;
930 while ((offset -= size) >= 0) { 938 while ((offset -= size) >= 0) {
931 bh = alloc_buffer_head(GFP_NOFS); 939 bh = alloc_buffer_head(GFP_NOFS);
932 if (!bh) 940 if (!bh)
933 goto no_grow; 941 goto no_grow;
934 942
935 bh->b_bdev = NULL; 943 bh->b_bdev = NULL;
936 bh->b_this_page = head; 944 bh->b_this_page = head;
937 bh->b_blocknr = -1; 945 bh->b_blocknr = -1;
938 head = bh; 946 head = bh;
939 947
940 bh->b_state = 0; 948 bh->b_state = 0;
941 atomic_set(&bh->b_count, 0); 949 atomic_set(&bh->b_count, 0);
942 bh->b_private = NULL; 950 bh->b_private = NULL;
943 bh->b_size = size; 951 bh->b_size = size;
944 952
945 /* Link the buffer to its page */ 953 /* Link the buffer to its page */
946 set_bh_page(bh, page, offset); 954 set_bh_page(bh, page, offset);
947 955
948 init_buffer(bh, NULL, NULL); 956 init_buffer(bh, NULL, NULL);
949 } 957 }
950 return head; 958 return head;
951 /* 959 /*
952 * In case anything failed, we just free everything we got. 960 * In case anything failed, we just free everything we got.
953 */ 961 */
954 no_grow: 962 no_grow:
955 if (head) { 963 if (head) {
956 do { 964 do {
957 bh = head; 965 bh = head;
958 head = head->b_this_page; 966 head = head->b_this_page;
959 free_buffer_head(bh); 967 free_buffer_head(bh);
960 } while (head); 968 } while (head);
961 } 969 }
962 970
963 /* 971 /*
964 * Return failure for non-async IO requests. Async IO requests 972 * Return failure for non-async IO requests. Async IO requests
965 * are not allowed to fail, so we have to wait until buffer heads 973 * are not allowed to fail, so we have to wait until buffer heads
966 * become available. But we don't want tasks sleeping with 974 * become available. But we don't want tasks sleeping with
967 * partially complete buffers, so all were released above. 975 * partially complete buffers, so all were released above.
968 */ 976 */
969 if (!retry) 977 if (!retry)
970 return NULL; 978 return NULL;
971 979
972 /* We're _really_ low on memory. Now we just 980 /* We're _really_ low on memory. Now we just
973 * wait for old buffer heads to become free due to 981 * wait for old buffer heads to become free due to
974 * finishing IO. Since this is an async request and 982 * finishing IO. Since this is an async request and
975 * the reserve list is empty, we're sure there are 983 * the reserve list is empty, we're sure there are
976 * async buffer heads in use. 984 * async buffer heads in use.
977 */ 985 */
978 free_more_memory(); 986 free_more_memory();
979 goto try_again; 987 goto try_again;
980 } 988 }
981 EXPORT_SYMBOL_GPL(alloc_page_buffers); 989 EXPORT_SYMBOL_GPL(alloc_page_buffers);
982 990
983 static inline void 991 static inline void
984 link_dev_buffers(struct page *page, struct buffer_head *head) 992 link_dev_buffers(struct page *page, struct buffer_head *head)
985 { 993 {
986 struct buffer_head *bh, *tail; 994 struct buffer_head *bh, *tail;
987 995
988 bh = head; 996 bh = head;
989 do { 997 do {
990 tail = bh; 998 tail = bh;
991 bh = bh->b_this_page; 999 bh = bh->b_this_page;
992 } while (bh); 1000 } while (bh);
993 tail->b_this_page = head; 1001 tail->b_this_page = head;
994 attach_page_buffers(page, head); 1002 attach_page_buffers(page, head);
995 } 1003 }
996 1004
997 /* 1005 /*
998 * Initialise the state of a blockdev page's buffers. 1006 * Initialise the state of a blockdev page's buffers.
999 */ 1007 */
1000 static void 1008 static void
1001 init_page_buffers(struct page *page, struct block_device *bdev, 1009 init_page_buffers(struct page *page, struct block_device *bdev,
1002 sector_t block, int size) 1010 sector_t block, int size)
1003 { 1011 {
1004 struct buffer_head *head = page_buffers(page); 1012 struct buffer_head *head = page_buffers(page);
1005 struct buffer_head *bh = head; 1013 struct buffer_head *bh = head;
1006 int uptodate = PageUptodate(page); 1014 int uptodate = PageUptodate(page);
1007 1015
1008 do { 1016 do {
1009 if (!buffer_mapped(bh)) { 1017 if (!buffer_mapped(bh)) {
1010 init_buffer(bh, NULL, NULL); 1018 init_buffer(bh, NULL, NULL);
1011 bh->b_bdev = bdev; 1019 bh->b_bdev = bdev;
1012 bh->b_blocknr = block; 1020 bh->b_blocknr = block;
1013 if (uptodate) 1021 if (uptodate)
1014 set_buffer_uptodate(bh); 1022 set_buffer_uptodate(bh);
1015 set_buffer_mapped(bh); 1023 set_buffer_mapped(bh);
1016 } 1024 }
1017 block++; 1025 block++;
1018 bh = bh->b_this_page; 1026 bh = bh->b_this_page;
1019 } while (bh != head); 1027 } while (bh != head);
1020 } 1028 }
1021 1029
1022 /* 1030 /*
1023 * Create the page-cache page that contains the requested block. 1031 * Create the page-cache page that contains the requested block.
1024 * 1032 *
1025 * This is used purely for blockdev mappings. 1033 * This is used purely for blockdev mappings.
1026 */ 1034 */
1027 static struct page * 1035 static struct page *
1028 grow_dev_page(struct block_device *bdev, sector_t block, 1036 grow_dev_page(struct block_device *bdev, sector_t block,
1029 pgoff_t index, int size) 1037 pgoff_t index, int size)
1030 { 1038 {
1031 struct inode *inode = bdev->bd_inode; 1039 struct inode *inode = bdev->bd_inode;
1032 struct page *page; 1040 struct page *page;
1033 struct buffer_head *bh; 1041 struct buffer_head *bh;
1034 1042
1035 page = find_or_create_page(inode->i_mapping, index, 1043 page = find_or_create_page(inode->i_mapping, index,
1036 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); 1044 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1037 if (!page) 1045 if (!page)
1038 return NULL; 1046 return NULL;
1039 1047
1040 BUG_ON(!PageLocked(page)); 1048 BUG_ON(!PageLocked(page));
1041 1049
1042 if (page_has_buffers(page)) { 1050 if (page_has_buffers(page)) {
1043 bh = page_buffers(page); 1051 bh = page_buffers(page);
1044 if (bh->b_size == size) { 1052 if (bh->b_size == size) {
1045 init_page_buffers(page, bdev, block, size); 1053 init_page_buffers(page, bdev, block, size);
1046 return page; 1054 return page;
1047 } 1055 }
1048 if (!try_to_free_buffers(page)) 1056 if (!try_to_free_buffers(page))
1049 goto failed; 1057 goto failed;
1050 } 1058 }
1051 1059
1052 /* 1060 /*
1053 * Allocate some buffers for this page 1061 * Allocate some buffers for this page
1054 */ 1062 */
1055 bh = alloc_page_buffers(page, size, 0); 1063 bh = alloc_page_buffers(page, size, 0);
1056 if (!bh) 1064 if (!bh)
1057 goto failed; 1065 goto failed;
1058 1066
1059 /* 1067 /*
1060 * Link the page to the buffers and initialise them. Take the 1068 * Link the page to the buffers and initialise them. Take the
1061 * lock to be atomic wrt __find_get_block(), which does not 1069 * lock to be atomic wrt __find_get_block(), which does not
1062 * run under the page lock. 1070 * run under the page lock.
1063 */ 1071 */
1064 spin_lock(&inode->i_mapping->private_lock); 1072 spin_lock(&inode->i_mapping->private_lock);
1065 link_dev_buffers(page, bh); 1073 link_dev_buffers(page, bh);
1066 init_page_buffers(page, bdev, block, size); 1074 init_page_buffers(page, bdev, block, size);
1067 spin_unlock(&inode->i_mapping->private_lock); 1075 spin_unlock(&inode->i_mapping->private_lock);
1068 return page; 1076 return page;
1069 1077
1070 failed: 1078 failed:
1071 BUG(); 1079 BUG();
1072 unlock_page(page); 1080 unlock_page(page);
1073 page_cache_release(page); 1081 page_cache_release(page);
1074 return NULL; 1082 return NULL;
1075 } 1083 }
1076 1084
1077 /* 1085 /*
1078 * Create buffers for the specified block device block's page. If 1086 * Create buffers for the specified block device block's page. If
1079 * that page was dirty, the buffers are set dirty also. 1087 * that page was dirty, the buffers are set dirty also.
1080 */ 1088 */
1081 static int 1089 static int
1082 grow_buffers(struct block_device *bdev, sector_t block, int size) 1090 grow_buffers(struct block_device *bdev, sector_t block, int size)
1083 { 1091 {
1084 struct page *page; 1092 struct page *page;
1085 pgoff_t index; 1093 pgoff_t index;
1086 int sizebits; 1094 int sizebits;
1087 1095
1088 sizebits = -1; 1096 sizebits = -1;
1089 do { 1097 do {
1090 sizebits++; 1098 sizebits++;
1091 } while ((size << sizebits) < PAGE_SIZE); 1099 } while ((size << sizebits) < PAGE_SIZE);
1092 1100
1093 index = block >> sizebits; 1101 index = block >> sizebits;
1094 1102
1095 /* 1103 /*
1096 * Check for a block which wants to lie outside our maximum possible 1104 * Check for a block which wants to lie outside our maximum possible
1097 * pagecache index. (this comparison is done using sector_t types). 1105 * pagecache index. (this comparison is done using sector_t types).
1098 */ 1106 */
1099 if (unlikely(index != block >> sizebits)) { 1107 if (unlikely(index != block >> sizebits)) {
1100 char b[BDEVNAME_SIZE]; 1108 char b[BDEVNAME_SIZE];
1101 1109
1102 printk(KERN_ERR "%s: requested out-of-range block %llu for " 1110 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1103 "device %s\n", 1111 "device %s\n",
1104 __func__, (unsigned long long)block, 1112 __func__, (unsigned long long)block,
1105 bdevname(bdev, b)); 1113 bdevname(bdev, b));
1106 return -EIO; 1114 return -EIO;
1107 } 1115 }
1108 block = index << sizebits; 1116 block = index << sizebits;
1109 /* Create a page with the proper size buffers.. */ 1117 /* Create a page with the proper size buffers.. */
1110 page = grow_dev_page(bdev, block, index, size); 1118 page = grow_dev_page(bdev, block, index, size);
1111 if (!page) 1119 if (!page)
1112 return 0; 1120 return 0;
1113 unlock_page(page); 1121 unlock_page(page);
1114 page_cache_release(page); 1122 page_cache_release(page);
1115 return 1; 1123 return 1;
1116 } 1124 }
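A hedged worked example of the index arithmetic in grow_buffers(), assuming PAGE_SIZE is 4096 and a 1024-byte block size:

/*
 * size == 1024, PAGE_SIZE == 4096  =>  sizebits == 2 (4 blocks per page)
 * block 4102:  index = 4102 >> 2 = 1025
 *              block is rounded down to 1025 << 2 = 4100,
 *              so grow_dev_page() builds the page covering blocks 4100-4103.
 */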
1117 1125
1118 static struct buffer_head * 1126 static struct buffer_head *
1119 __getblk_slow(struct block_device *bdev, sector_t block, int size) 1127 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1120 { 1128 {
1121 /* Size must be multiple of hard sectorsize */ 1129 /* Size must be multiple of hard sectorsize */
1122 if (unlikely(size & (bdev_hardsect_size(bdev)-1) || 1130 if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1123 (size < 512 || size > PAGE_SIZE))) { 1131 (size < 512 || size > PAGE_SIZE))) {
1124 printk(KERN_ERR "getblk(): invalid block size %d requested\n", 1132 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1125 size); 1133 size);
1126 printk(KERN_ERR "hardsect size: %d\n", 1134 printk(KERN_ERR "hardsect size: %d\n",
1127 bdev_hardsect_size(bdev)); 1135 bdev_hardsect_size(bdev));
1128 1136
1129 dump_stack(); 1137 dump_stack();
1130 return NULL; 1138 return NULL;
1131 } 1139 }
1132 1140
1133 for (;;) { 1141 for (;;) {
1134 struct buffer_head * bh; 1142 struct buffer_head * bh;
1135 int ret; 1143 int ret;
1136 1144
1137 bh = __find_get_block(bdev, block, size); 1145 bh = __find_get_block(bdev, block, size);
1138 if (bh) 1146 if (bh)
1139 return bh; 1147 return bh;
1140 1148
1141 ret = grow_buffers(bdev, block, size); 1149 ret = grow_buffers(bdev, block, size);
1142 if (ret < 0) 1150 if (ret < 0)
1143 return NULL; 1151 return NULL;
1144 if (ret == 0) 1152 if (ret == 0)
1145 free_more_memory(); 1153 free_more_memory();
1146 } 1154 }
1147 } 1155 }
1148 1156
1149 /* 1157 /*
1150 * The relationship between dirty buffers and dirty pages: 1158 * The relationship between dirty buffers and dirty pages:
1151 * 1159 *
1152 * Whenever a page has any dirty buffers, the page's dirty bit is set, and 1160 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1153 * the page is tagged dirty in its radix tree. 1161 * the page is tagged dirty in its radix tree.
1154 * 1162 *
1155 * At all times, the dirtiness of the buffers represents the dirtiness of 1163 * At all times, the dirtiness of the buffers represents the dirtiness of
1156 * subsections of the page. If the page has buffers, the page dirty bit is 1164 * subsections of the page. If the page has buffers, the page dirty bit is
1157 * merely a hint about the true dirty state. 1165 * merely a hint about the true dirty state.
1158 * 1166 *
1159 * When a page is set dirty in its entirety, all its buffers are marked dirty 1167 * When a page is set dirty in its entirety, all its buffers are marked dirty
1160 * (if the page has buffers). 1168 * (if the page has buffers).
1161 * 1169 *
1162 * When a buffer is marked dirty, its page is dirtied, but the page's other 1170 * When a buffer is marked dirty, its page is dirtied, but the page's other
1163 * buffers are not. 1171 * buffers are not.
1164 * 1172 *
1165 * Also. When blockdev buffers are explicitly read with bread(), they 1173 * Also. When blockdev buffers are explicitly read with bread(), they
1166 * individually become uptodate. But their backing page remains not 1174 * individually become uptodate. But their backing page remains not
1167 * uptodate - even if all of its buffers are uptodate. A subsequent 1175 * uptodate - even if all of its buffers are uptodate. A subsequent
1168 * block_read_full_page() against that page will discover all the uptodate 1176 * block_read_full_page() against that page will discover all the uptodate
1169 * buffers, will set the page uptodate and will perform no I/O. 1177 * buffers, will set the page uptodate and will perform no I/O.
1170 */ 1178 */
1171 1179
1172 /** 1180 /**
1173 * mark_buffer_dirty - mark a buffer_head as needing writeout 1181 * mark_buffer_dirty - mark a buffer_head as needing writeout
1174 * @bh: the buffer_head to mark dirty 1182 * @bh: the buffer_head to mark dirty
1175 * 1183 *
1176 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its 1184 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1177 * backing page dirty, then tag the page as dirty in its address_space's radix 1185 * backing page dirty, then tag the page as dirty in its address_space's radix
1178 * tree and then attach the address_space's inode to its superblock's dirty 1186 * tree and then attach the address_space's inode to its superblock's dirty
1179 * inode list. 1187 * inode list.
1180 * 1188 *
1181 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, 1189 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1182 * mapping->tree_lock and the global inode_lock. 1190 * mapping->tree_lock and the global inode_lock.
1183 */ 1191 */
1184 void mark_buffer_dirty(struct buffer_head *bh) 1192 void mark_buffer_dirty(struct buffer_head *bh)
1185 { 1193 {
1186 WARN_ON_ONCE(!buffer_uptodate(bh)); 1194 WARN_ON_ONCE(!buffer_uptodate(bh));
1187 1195
1188 /* 1196 /*
1189 * Very *carefully* optimize the it-is-already-dirty case. 1197 * Very *carefully* optimize the it-is-already-dirty case.
1190 * 1198 *
1191 * Don't let the final "is it dirty" escape to before we 1199 * Don't let the final "is it dirty" escape to before we
1192 * perhaps modified the buffer. 1200 * perhaps modified the buffer.
1193 */ 1201 */
1194 if (buffer_dirty(bh)) { 1202 if (buffer_dirty(bh)) {
1195 smp_mb(); 1203 smp_mb();
1196 if (buffer_dirty(bh)) 1204 if (buffer_dirty(bh))
1197 return; 1205 return;
1198 } 1206 }
1199 1207
1200 if (!test_set_buffer_dirty(bh)) 1208 if (!test_set_buffer_dirty(bh))
1201 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0); 1209 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
1202 } 1210 }
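A hedged sketch of the modify-then-dirty pattern mark_buffer_dirty() is built for; the examplefs_super_block layout and its label field are hypothetical.

struct examplefs_super_block {			/* hypothetical on-disk layout */
	char	s_label[16];
};

static void examplefs_set_label(struct buffer_head *sb_bh, const char *label)
{
	struct examplefs_super_block *es =
		(struct examplefs_super_block *)sb_bh->b_data;

	/* modify the buffer contents first ... */
	strncpy(es->s_label, label, sizeof(es->s_label));
	/* ... then dirty the buffer, its page, the radix tree and the inode */
	mark_buffer_dirty(sb_bh);
}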
1203 1211
1204 /* 1212 /*
1205 * Decrement a buffer_head's reference count. If all buffers against a page 1213 * Decrement a buffer_head's reference count. If all buffers against a page
1206 * have zero reference count, are clean and unlocked, and if the page is clean 1214 * have zero reference count, are clean and unlocked, and if the page is clean
1207 * and unlocked then try_to_free_buffers() may strip the buffers from the page 1215 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1208 * in preparation for freeing it (sometimes, rarely, buffers are removed from 1216 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1209 * a page but it ends up not being freed, and buffers may later be reattached). 1217 * a page but it ends up not being freed, and buffers may later be reattached).
1210 */ 1218 */
1211 void __brelse(struct buffer_head * buf) 1219 void __brelse(struct buffer_head * buf)
1212 { 1220 {
1213 if (atomic_read(&buf->b_count)) { 1221 if (atomic_read(&buf->b_count)) {
1214 put_bh(buf); 1222 put_bh(buf);
1215 return; 1223 return;
1216 } 1224 }
1217 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n"); 1225 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1218 } 1226 }
1219 1227
1220 /* 1228 /*
1221 * bforget() is like brelse(), except it discards any 1229 * bforget() is like brelse(), except it discards any
1222 * potentially dirty data. 1230 * potentially dirty data.
1223 */ 1231 */
1224 void __bforget(struct buffer_head *bh) 1232 void __bforget(struct buffer_head *bh)
1225 { 1233 {
1226 clear_buffer_dirty(bh); 1234 clear_buffer_dirty(bh);
1227 if (bh->b_assoc_map) { 1235 if (bh->b_assoc_map) {
1228 struct address_space *buffer_mapping = bh->b_page->mapping; 1236 struct address_space *buffer_mapping = bh->b_page->mapping;
1229 1237
1230 spin_lock(&buffer_mapping->private_lock); 1238 spin_lock(&buffer_mapping->private_lock);
1231 list_del_init(&bh->b_assoc_buffers); 1239 list_del_init(&bh->b_assoc_buffers);
1232 bh->b_assoc_map = NULL; 1240 bh->b_assoc_map = NULL;
1233 spin_unlock(&buffer_mapping->private_lock); 1241 spin_unlock(&buffer_mapping->private_lock);
1234 } 1242 }
1235 __brelse(bh); 1243 __brelse(bh);
1236 } 1244 }
1237 1245
1238 static struct buffer_head *__bread_slow(struct buffer_head *bh) 1246 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1239 { 1247 {
1240 lock_buffer(bh); 1248 lock_buffer(bh);
1241 if (buffer_uptodate(bh)) { 1249 if (buffer_uptodate(bh)) {
1242 unlock_buffer(bh); 1250 unlock_buffer(bh);
1243 return bh; 1251 return bh;
1244 } else { 1252 } else {
1245 get_bh(bh); 1253 get_bh(bh);
1246 bh->b_end_io = end_buffer_read_sync; 1254 bh->b_end_io = end_buffer_read_sync;
1247 submit_bh(READ, bh); 1255 submit_bh(READ, bh);
1248 wait_on_buffer(bh); 1256 wait_on_buffer(bh);
1249 if (buffer_uptodate(bh)) 1257 if (buffer_uptodate(bh))
1250 return bh; 1258 return bh;
1251 } 1259 }
1252 brelse(bh); 1260 brelse(bh);
1253 return NULL; 1261 return NULL;
1254 } 1262 }
1255 1263
1256 /* 1264 /*
1257 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). 1265 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1258 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their 1266 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1259 * refcount elevated by one when they're in an LRU. A buffer can only appear 1267 * refcount elevated by one when they're in an LRU. A buffer can only appear
1260 * once in a particular CPU's LRU. A single buffer can be present in multiple 1268 * once in a particular CPU's LRU. A single buffer can be present in multiple
1261 * CPU's LRUs at the same time. 1269 * CPU's LRUs at the same time.
1262 * 1270 *
1263 * This is a transparent caching front-end to sb_bread(), sb_getblk() and 1271 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1264 * sb_find_get_block(). 1272 * sb_find_get_block().
1265 * 1273 *
1266 * The LRUs themselves only need locking against invalidate_bh_lrus. We use 1274 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1267 * a local interrupt disable for that. 1275 * a local interrupt disable for that.
1268 */ 1276 */
1269 1277
1270 #define BH_LRU_SIZE 8 1278 #define BH_LRU_SIZE 8
1271 1279
1272 struct bh_lru { 1280 struct bh_lru {
1273 struct buffer_head *bhs[BH_LRU_SIZE]; 1281 struct buffer_head *bhs[BH_LRU_SIZE];
1274 }; 1282 };
1275 1283
1276 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }}; 1284 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1277 1285
1278 #ifdef CONFIG_SMP 1286 #ifdef CONFIG_SMP
1279 #define bh_lru_lock() local_irq_disable() 1287 #define bh_lru_lock() local_irq_disable()
1280 #define bh_lru_unlock() local_irq_enable() 1288 #define bh_lru_unlock() local_irq_enable()
1281 #else 1289 #else
1282 #define bh_lru_lock() preempt_disable() 1290 #define bh_lru_lock() preempt_disable()
1283 #define bh_lru_unlock() preempt_enable() 1291 #define bh_lru_unlock() preempt_enable()
1284 #endif 1292 #endif
1285 1293
1286 static inline void check_irqs_on(void) 1294 static inline void check_irqs_on(void)
1287 { 1295 {
1288 #ifdef irqs_disabled 1296 #ifdef irqs_disabled
1289 BUG_ON(irqs_disabled()); 1297 BUG_ON(irqs_disabled());
1290 #endif 1298 #endif
1291 } 1299 }
1292 1300
1293 /* 1301 /*
1294 * The LRU management algorithm is dopey-but-simple. Sorry. 1302 * The LRU management algorithm is dopey-but-simple. Sorry.
1295 */ 1303 */
1296 static void bh_lru_install(struct buffer_head *bh) 1304 static void bh_lru_install(struct buffer_head *bh)
1297 { 1305 {
1298 struct buffer_head *evictee = NULL; 1306 struct buffer_head *evictee = NULL;
1299 struct bh_lru *lru; 1307 struct bh_lru *lru;
1300 1308
1301 check_irqs_on(); 1309 check_irqs_on();
1302 bh_lru_lock(); 1310 bh_lru_lock();
1303 lru = &__get_cpu_var(bh_lrus); 1311 lru = &__get_cpu_var(bh_lrus);
1304 if (lru->bhs[0] != bh) { 1312 if (lru->bhs[0] != bh) {
1305 struct buffer_head *bhs[BH_LRU_SIZE]; 1313 struct buffer_head *bhs[BH_LRU_SIZE];
1306 int in; 1314 int in;
1307 int out = 0; 1315 int out = 0;
1308 1316
1309 get_bh(bh); 1317 get_bh(bh);
1310 bhs[out++] = bh; 1318 bhs[out++] = bh;
1311 for (in = 0; in < BH_LRU_SIZE; in++) { 1319 for (in = 0; in < BH_LRU_SIZE; in++) {
1312 struct buffer_head *bh2 = lru->bhs[in]; 1320 struct buffer_head *bh2 = lru->bhs[in];
1313 1321
1314 if (bh2 == bh) { 1322 if (bh2 == bh) {
1315 __brelse(bh2); 1323 __brelse(bh2);
1316 } else { 1324 } else {
1317 if (out >= BH_LRU_SIZE) { 1325 if (out >= BH_LRU_SIZE) {
1318 BUG_ON(evictee != NULL); 1326 BUG_ON(evictee != NULL);
1319 evictee = bh2; 1327 evictee = bh2;
1320 } else { 1328 } else {
1321 bhs[out++] = bh2; 1329 bhs[out++] = bh2;
1322 } 1330 }
1323 } 1331 }
1324 } 1332 }
1325 while (out < BH_LRU_SIZE) 1333 while (out < BH_LRU_SIZE)
1326 bhs[out++] = NULL; 1334 bhs[out++] = NULL;
1327 memcpy(lru->bhs, bhs, sizeof(bhs)); 1335 memcpy(lru->bhs, bhs, sizeof(bhs));
1328 } 1336 }
1329 bh_lru_unlock(); 1337 bh_lru_unlock();
1330 1338
1331 if (evictee) 1339 if (evictee)
1332 __brelse(evictee); 1340 __brelse(evictee);
1333 } 1341 }
1334 1342
1335 /* 1343 /*
1336 * Look up the bh in this cpu's LRU. If it's there, move it to the head. 1344 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1337 */ 1345 */
1338 static struct buffer_head * 1346 static struct buffer_head *
1339 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) 1347 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1340 { 1348 {
1341 struct buffer_head *ret = NULL; 1349 struct buffer_head *ret = NULL;
1342 struct bh_lru *lru; 1350 struct bh_lru *lru;
1343 unsigned int i; 1351 unsigned int i;
1344 1352
1345 check_irqs_on(); 1353 check_irqs_on();
1346 bh_lru_lock(); 1354 bh_lru_lock();
1347 lru = &__get_cpu_var(bh_lrus); 1355 lru = &__get_cpu_var(bh_lrus);
1348 for (i = 0; i < BH_LRU_SIZE; i++) { 1356 for (i = 0; i < BH_LRU_SIZE; i++) {
1349 struct buffer_head *bh = lru->bhs[i]; 1357 struct buffer_head *bh = lru->bhs[i];
1350 1358
1351 if (bh && bh->b_bdev == bdev && 1359 if (bh && bh->b_bdev == bdev &&
1352 bh->b_blocknr == block && bh->b_size == size) { 1360 bh->b_blocknr == block && bh->b_size == size) {
1353 if (i) { 1361 if (i) {
1354 while (i) { 1362 while (i) {
1355 lru->bhs[i] = lru->bhs[i - 1]; 1363 lru->bhs[i] = lru->bhs[i - 1];
1356 i--; 1364 i--;
1357 } 1365 }
1358 lru->bhs[0] = bh; 1366 lru->bhs[0] = bh;
1359 } 1367 }
1360 get_bh(bh); 1368 get_bh(bh);
1361 ret = bh; 1369 ret = bh;
1362 break; 1370 break;
1363 } 1371 }
1364 } 1372 }
1365 bh_lru_unlock(); 1373 bh_lru_unlock();
1366 return ret; 1374 return ret;
1367 } 1375 }
1368 1376
1369 /* 1377 /*
1370 * Perform a pagecache lookup for the matching buffer. If it's there, refresh 1378 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1371 * it in the LRU and mark it as accessed. If it is not present then return 1379 * it in the LRU and mark it as accessed. If it is not present then return
1372 * NULL 1380 * NULL
1373 */ 1381 */
1374 struct buffer_head * 1382 struct buffer_head *
1375 __find_get_block(struct block_device *bdev, sector_t block, unsigned size) 1383 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1376 { 1384 {
1377 struct buffer_head *bh = lookup_bh_lru(bdev, block, size); 1385 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1378 1386
1379 if (bh == NULL) { 1387 if (bh == NULL) {
1380 bh = __find_get_block_slow(bdev, block); 1388 bh = __find_get_block_slow(bdev, block);
1381 if (bh) 1389 if (bh)
1382 bh_lru_install(bh); 1390 bh_lru_install(bh);
1383 } 1391 }
1384 if (bh) 1392 if (bh)
1385 touch_buffer(bh); 1393 touch_buffer(bh);
1386 return bh; 1394 return bh;
1387 } 1395 }
1388 EXPORT_SYMBOL(__find_get_block); 1396 EXPORT_SYMBOL(__find_get_block);
1389 1397
1390 /* 1398 /*
1391 * __getblk will locate (and, if necessary, create) the buffer_head 1399 * __getblk will locate (and, if necessary, create) the buffer_head
1392 * which corresponds to the passed block_device, block and size. The 1400 * which corresponds to the passed block_device, block and size. The
1393 * returned buffer has its reference count incremented. 1401 * returned buffer has its reference count incremented.
1394 * 1402 *
1395 * __getblk() cannot fail - it just keeps trying. If you pass it an 1403 * __getblk() cannot fail - it just keeps trying. If you pass it an
1396 * illegal block number, __getblk() will happily return a buffer_head 1404 * illegal block number, __getblk() will happily return a buffer_head
1397 * which represents the non-existent block. Very weird. 1405 * which represents the non-existent block. Very weird.
1398 * 1406 *
1399 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() 1407 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1400 * attempt is failing. FIXME, perhaps? 1408 * attempt is failing. FIXME, perhaps?
1401 */ 1409 */
1402 struct buffer_head * 1410 struct buffer_head *
1403 __getblk(struct block_device *bdev, sector_t block, unsigned size) 1411 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1404 { 1412 {
1405 struct buffer_head *bh = __find_get_block(bdev, block, size); 1413 struct buffer_head *bh = __find_get_block(bdev, block, size);
1406 1414
1407 might_sleep(); 1415 might_sleep();
1408 if (bh == NULL) 1416 if (bh == NULL)
1409 bh = __getblk_slow(bdev, block, size); 1417 bh = __getblk_slow(bdev, block, size);
1410 return bh; 1418 return bh;
1411 } 1419 }
1412 EXPORT_SYMBOL(__getblk); 1420 EXPORT_SYMBOL(__getblk);
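A hedged sketch of a common __getblk() pattern: a block that is about to be completely overwritten needs no prior read, so the buffer can be filled, marked uptodate and dirtied directly. The helper name and parameters are illustrative.

static void examplefs_zero_block(struct block_device *bdev,
				 sector_t block, unsigned size)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	/* __getblk() cannot fail, so no NULL check is needed */
	lock_buffer(bh);
	memset(bh->b_data, 0, bh->b_size);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
	brelse(bh);
}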
1413 1421
1414 /* 1422 /*
1415 * Do async read-ahead on a buffer.. 1423 * Do async read-ahead on a buffer..
1416 */ 1424 */
1417 void __breadahead(struct block_device *bdev, sector_t block, unsigned size) 1425 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1418 { 1426 {
1419 struct buffer_head *bh = __getblk(bdev, block, size); 1427 struct buffer_head *bh = __getblk(bdev, block, size);
1420 if (likely(bh)) { 1428 if (likely(bh)) {
1421 ll_rw_block(READA, 1, &bh); 1429 ll_rw_block(READA, 1, &bh);
1422 brelse(bh); 1430 brelse(bh);
1423 } 1431 }
1424 } 1432 }
1425 EXPORT_SYMBOL(__breadahead); 1433 EXPORT_SYMBOL(__breadahead);
1426 1434
1427 /** 1435 /**
1428 * __bread() - reads a specified block and returns the bh 1436 * __bread() - reads a specified block and returns the bh
1429 * @bdev: the block_device to read from 1437 * @bdev: the block_device to read from
1430 * @block: number of block 1438 * @block: number of block
1431 * @size: size (in bytes) to read 1439 * @size: size (in bytes) to read
1432 * 1440 *
1433 * Reads a specified block, and returns buffer head that contains it. 1441 * Reads a specified block, and returns buffer head that contains it.
1434 * It returns NULL if the block was unreadable. 1442 * It returns NULL if the block was unreadable.
1435 */ 1443 */
1436 struct buffer_head * 1444 struct buffer_head *
1437 __bread(struct block_device *bdev, sector_t block, unsigned size) 1445 __bread(struct block_device *bdev, sector_t block, unsigned size)
1438 { 1446 {
1439 struct buffer_head *bh = __getblk(bdev, block, size); 1447 struct buffer_head *bh = __getblk(bdev, block, size);
1440 1448
1441 if (likely(bh) && !buffer_uptodate(bh)) 1449 if (likely(bh) && !buffer_uptodate(bh))
1442 bh = __bread_slow(bh); 1450 bh = __bread_slow(bh);
1443 return bh; 1451 return bh;
1444 } 1452 }
1445 EXPORT_SYMBOL(__bread); 1453 EXPORT_SYMBOL(__bread);
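A hedged sketch of a synchronous metadata read with __bread(); the block number, the 1024-byte block size and the way b_data is consumed are assumptions.

static int examplefs_check_block_one(struct block_device *bdev)
{
	struct buffer_head *bh;

	bh = __bread(bdev, 1, 1024);	/* block 1, 1024-byte blocks */
	if (!bh)
		return -EIO;		/* the block was unreadable */
	/* ... inspect bh->b_data ... */
	brelse(bh);
	return 0;
}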
1446 1454
1447 /* 1455 /*
1448 * invalidate_bh_lrus() is called rarely - but not only at unmount. 1456 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1449 * This doesn't race because it runs in each cpu either in irq 1457 * This doesn't race because it runs in each cpu either in irq
1450 * or with preempt disabled. 1458 * or with preempt disabled.
1451 */ 1459 */
1452 static void invalidate_bh_lru(void *arg) 1460 static void invalidate_bh_lru(void *arg)
1453 { 1461 {
1454 struct bh_lru *b = &get_cpu_var(bh_lrus); 1462 struct bh_lru *b = &get_cpu_var(bh_lrus);
1455 int i; 1463 int i;
1456 1464
1457 for (i = 0; i < BH_LRU_SIZE; i++) { 1465 for (i = 0; i < BH_LRU_SIZE; i++) {
1458 brelse(b->bhs[i]); 1466 brelse(b->bhs[i]);
1459 b->bhs[i] = NULL; 1467 b->bhs[i] = NULL;
1460 } 1468 }
1461 put_cpu_var(bh_lrus); 1469 put_cpu_var(bh_lrus);
1462 } 1470 }
1463 1471
1464 void invalidate_bh_lrus(void) 1472 void invalidate_bh_lrus(void)
1465 { 1473 {
1466 on_each_cpu(invalidate_bh_lru, NULL, 1); 1474 on_each_cpu(invalidate_bh_lru, NULL, 1);
1467 } 1475 }
1468 EXPORT_SYMBOL_GPL(invalidate_bh_lrus); 1476 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1469 1477
1470 void set_bh_page(struct buffer_head *bh, 1478 void set_bh_page(struct buffer_head *bh,
1471 struct page *page, unsigned long offset) 1479 struct page *page, unsigned long offset)
1472 { 1480 {
1473 bh->b_page = page; 1481 bh->b_page = page;
1474 BUG_ON(offset >= PAGE_SIZE); 1482 BUG_ON(offset >= PAGE_SIZE);
1475 if (PageHighMem(page)) 1483 if (PageHighMem(page))
1476 /* 1484 /*
1477 * This catches illegal uses and preserves the offset: 1485 * This catches illegal uses and preserves the offset:
1478 */ 1486 */
1479 bh->b_data = (char *)(0 + offset); 1487 bh->b_data = (char *)(0 + offset);
1480 else 1488 else
1481 bh->b_data = page_address(page) + offset; 1489 bh->b_data = page_address(page) + offset;
1482 } 1490 }
1483 EXPORT_SYMBOL(set_bh_page); 1491 EXPORT_SYMBOL(set_bh_page);
1484 1492
1485 /* 1493 /*
1486 * Called when truncating a buffer on a page completely. 1494 * Called when truncating a buffer on a page completely.
1487 */ 1495 */
1488 static void discard_buffer(struct buffer_head * bh) 1496 static void discard_buffer(struct buffer_head * bh)
1489 { 1497 {
1490 lock_buffer(bh); 1498 lock_buffer(bh);
1491 clear_buffer_dirty(bh); 1499 clear_buffer_dirty(bh);
1492 bh->b_bdev = NULL; 1500 bh->b_bdev = NULL;
1493 clear_buffer_mapped(bh); 1501 clear_buffer_mapped(bh);
1494 clear_buffer_req(bh); 1502 clear_buffer_req(bh);
1495 clear_buffer_new(bh); 1503 clear_buffer_new(bh);
1496 clear_buffer_delay(bh); 1504 clear_buffer_delay(bh);
1497 clear_buffer_unwritten(bh); 1505 clear_buffer_unwritten(bh);
1498 unlock_buffer(bh); 1506 unlock_buffer(bh);
1499 } 1507 }
1500 1508
1501 /** 1509 /**
1502 * block_invalidatepage - invalidate part or all of a buffer-backed page 1510 * block_invalidatepage - invalidate part or all of a buffer-backed page
1503 * 1511 *
1504 * @page: the page which is affected 1512 * @page: the page which is affected
1505 * @offset: the index of the truncation point 1513 * @offset: the index of the truncation point
1506 * 1514 *
1507 * block_invalidatepage() is called when all or part of the page has become 1515 * block_invalidatepage() is called when all or part of the page has become
1508 * invalidated by a truncate operation. 1516 * invalidated by a truncate operation.
1509 * 1517 *
1510 * block_invalidatepage() does not have to release all buffers, but it must 1518 * block_invalidatepage() does not have to release all buffers, but it must
1511 * ensure that no dirty buffer is left outside @offset and that no I/O 1519 * ensure that no dirty buffer is left outside @offset and that no I/O
1512 * is underway against any of the blocks which are outside the truncation 1520 * is underway against any of the blocks which are outside the truncation
1513 * point. Because the caller is about to free (and possibly reuse) those 1521 * point. Because the caller is about to free (and possibly reuse) those
1514 * blocks on-disk. 1522 * blocks on-disk.
1515 */ 1523 */
1516 void block_invalidatepage(struct page *page, unsigned long offset) 1524 void block_invalidatepage(struct page *page, unsigned long offset)
1517 { 1525 {
1518 struct buffer_head *head, *bh, *next; 1526 struct buffer_head *head, *bh, *next;
1519 unsigned int curr_off = 0; 1527 unsigned int curr_off = 0;
1520 1528
1521 BUG_ON(!PageLocked(page)); 1529 BUG_ON(!PageLocked(page));
1522 if (!page_has_buffers(page)) 1530 if (!page_has_buffers(page))
1523 goto out; 1531 goto out;
1524 1532
1525 head = page_buffers(page); 1533 head = page_buffers(page);
1526 bh = head; 1534 bh = head;
1527 do { 1535 do {
1528 unsigned int next_off = curr_off + bh->b_size; 1536 unsigned int next_off = curr_off + bh->b_size;
1529 next = bh->b_this_page; 1537 next = bh->b_this_page;
1530 1538
1531 /* 1539 /*
1532 * is this block fully invalidated? 1540 * is this block fully invalidated?
1533 */ 1541 */
1534 if (offset <= curr_off) 1542 if (offset <= curr_off)
1535 discard_buffer(bh); 1543 discard_buffer(bh);
1536 curr_off = next_off; 1544 curr_off = next_off;
1537 bh = next; 1545 bh = next;
1538 } while (bh != head); 1546 } while (bh != head);
1539 1547
1540 /* 1548 /*
1541 * We release buffers only if the entire page is being invalidated. 1549 * We release buffers only if the entire page is being invalidated.
1542 * The get_block cached value has been unconditionally invalidated, 1550 * The get_block cached value has been unconditionally invalidated,
1543 * so real IO is not possible anymore. 1551 * so real IO is not possible anymore.
1544 */ 1552 */
1545 if (offset == 0) 1553 if (offset == 0)
1546 try_to_release_page(page, 0); 1554 try_to_release_page(page, 0);
1547 out: 1555 out:
1548 return; 1556 return;
1549 } 1557 }
1550 EXPORT_SYMBOL(block_invalidatepage); 1558 EXPORT_SYMBOL(block_invalidatepage);
1551 1559
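For context, block_invalidatepage() is the fallback the truncate path uses when a filesystem does not install its own ->invalidatepage operation. The dispatch in mm/truncate.c of this kernel generation looks roughly like the sketch below; it is paraphrased here for orientation only and is not part of this diff.

/* paraphrased sketch of the mm/truncate.c dispatch; details may differ */
void do_invalidatepage(struct page *page, unsigned long offset)
{
	void (*invalidatepage)(struct page *, unsigned long);

	invalidatepage = page->mapping->a_ops->invalidatepage;
#ifdef CONFIG_BLOCK
	if (!invalidatepage)
		invalidatepage = block_invalidatepage;
#endif
	if (invalidatepage)
		(*invalidatepage)(page, offset);
}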
1552 /* 1560 /*
1553 * We attach and possibly dirty the buffers atomically wrt 1561 * We attach and possibly dirty the buffers atomically wrt
1554 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers 1562 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1555 * is already excluded via the page lock. 1563 * is already excluded via the page lock.
1556 */ 1564 */
1557 void create_empty_buffers(struct page *page, 1565 void create_empty_buffers(struct page *page,
1558 unsigned long blocksize, unsigned long b_state) 1566 unsigned long blocksize, unsigned long b_state)
1559 { 1567 {
1560 struct buffer_head *bh, *head, *tail; 1568 struct buffer_head *bh, *head, *tail;
1561 1569
1562 head = alloc_page_buffers(page, blocksize, 1); 1570 head = alloc_page_buffers(page, blocksize, 1);
1563 bh = head; 1571 bh = head;
1564 do { 1572 do {
1565 bh->b_state |= b_state; 1573 bh->b_state |= b_state;
1566 tail = bh; 1574 tail = bh;
1567 bh = bh->b_this_page; 1575 bh = bh->b_this_page;
1568 } while (bh); 1576 } while (bh);
1569 tail->b_this_page = head; 1577 tail->b_this_page = head;
1570 1578
1571 spin_lock(&page->mapping->private_lock); 1579 spin_lock(&page->mapping->private_lock);
1572 if (PageUptodate(page) || PageDirty(page)) { 1580 if (PageUptodate(page) || PageDirty(page)) {
1573 bh = head; 1581 bh = head;
1574 do { 1582 do {
1575 if (PageDirty(page)) 1583 if (PageDirty(page))
1576 set_buffer_dirty(bh); 1584 set_buffer_dirty(bh);
1577 if (PageUptodate(page)) 1585 if (PageUptodate(page))
1578 set_buffer_uptodate(bh); 1586 set_buffer_uptodate(bh);
1579 bh = bh->b_this_page; 1587 bh = bh->b_this_page;
1580 } while (bh != head); 1588 } while (bh != head);
1581 } 1589 }
1582 attach_page_buffers(page, head); 1590 attach_page_buffers(page, head);
1583 spin_unlock(&page->mapping->private_lock); 1591 spin_unlock(&page->mapping->private_lock);
1584 } 1592 }
1585 EXPORT_SYMBOL(create_empty_buffers); 1593 EXPORT_SYMBOL(create_empty_buffers);
1586 1594
1587 /* 1595 /*
1588 * We are taking a block for data and we don't want any output from any 1596 * We are taking a block for data and we don't want any output from any
1589 * buffer-cache aliases starting from return from that function and 1597 * buffer-cache aliases starting from return from that function and
1590 * until the moment when something will explicitly mark the buffer 1598 * until the moment when something will explicitly mark the buffer
1591 * dirty (hopefully that will not happen until we free that block ;-) 1599 * dirty (hopefully that will not happen until we free that block ;-)
1592 * We don't even need to mark it not-uptodate - nobody can expect 1600 * We don't even need to mark it not-uptodate - nobody can expect
1593 * anything from a newly allocated buffer anyway. We used to use 1601 * anything from a newly allocated buffer anyway. We used to use
1594 * unmap_buffer() for such invalidation, but that was wrong. We definitely 1602 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1595 * don't want to mark the alias unmapped, for example - it would confuse 1603 * don't want to mark the alias unmapped, for example - it would confuse
1596 * anyone who might pick it with bread() afterwards... 1604 * anyone who might pick it with bread() afterwards...
1597 * 1605 *
1598 * Also, note that bforget() doesn't lock the buffer, so there can 1606 * Also, note that bforget() doesn't lock the buffer, so there can
1599 * be writeout I/O going on against recently-freed buffers. We don't 1607 * be writeout I/O going on against recently-freed buffers. We don't
1600 * wait on that I/O in bforget() - it's more efficient to wait on the I/O 1608 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1601 * only if we really need to. That happens here. 1609 * only if we really need to. That happens here.
1602 */ 1610 */
1603 void unmap_underlying_metadata(struct block_device *bdev, sector_t block) 1611 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1604 { 1612 {
1605 struct buffer_head *old_bh; 1613 struct buffer_head *old_bh;
1606 1614
1607 might_sleep(); 1615 might_sleep();
1608 1616
1609 old_bh = __find_get_block_slow(bdev, block); 1617 old_bh = __find_get_block_slow(bdev, block);
1610 if (old_bh) { 1618 if (old_bh) {
1611 clear_buffer_dirty(old_bh); 1619 clear_buffer_dirty(old_bh);
1612 wait_on_buffer(old_bh); 1620 wait_on_buffer(old_bh);
1613 clear_buffer_req(old_bh); 1621 clear_buffer_req(old_bh);
1614 __brelse(old_bh); 1622 __brelse(old_bh);
1615 } 1623 }
1616 } 1624 }
1617 EXPORT_SYMBOL(unmap_underlying_metadata); 1625 EXPORT_SYMBOL(unmap_underlying_metadata);
1618 1626
1619 /* 1627 /*
1620 * NOTE! All mapped/uptodate combinations are valid: 1628 * NOTE! All mapped/uptodate combinations are valid:
1621 * 1629 *
1622 * Mapped Uptodate Meaning 1630 * Mapped Uptodate Meaning
1623 * 1631 *
1624 * No No "unknown" - must do get_block() 1632 * No No "unknown" - must do get_block()
1625 * No Yes "hole" - zero-filled 1633 * No Yes "hole" - zero-filled
1626 * Yes No "allocated" - allocated on disk, not read in 1634 * Yes No "allocated" - allocated on disk, not read in
1627 * Yes Yes "valid" - allocated and up-to-date in memory. 1635 * Yes Yes "valid" - allocated and up-to-date in memory.
1628 * 1636 *
1629 * "Dirty" is valid only with the last case (mapped+uptodate). 1637 * "Dirty" is valid only with the last case (mapped+uptodate).
1630 */ 1638 */
1631 1639
1632 /* 1640 /*
1633 * While block_write_full_page is writing back the dirty buffers under 1641 * While block_write_full_page is writing back the dirty buffers under
1634 * the page lock, whoever dirtied the buffers may decide to clean them 1642 * the page lock, whoever dirtied the buffers may decide to clean them
1635 * again at any time. We handle that by only looking at the buffer 1643 * again at any time. We handle that by only looking at the buffer
1636 * state inside lock_buffer(). 1644 * state inside lock_buffer().
1637 * 1645 *
1638 * If block_write_full_page() is called for regular writeback 1646 * If block_write_full_page() is called for regular writeback
1639 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a 1647 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1640 * locked buffer. This can only happen if someone has written the buffer 1648 * locked buffer. This can only happen if someone has written the buffer
1641 * directly, with submit_bh(). At the address_space level PageWriteback 1649 * directly, with submit_bh(). At the address_space level PageWriteback
1642 * prevents this contention from occurring. 1650 * prevents this contention from occurring.
1643 */ 1651 */
1644 static int __block_write_full_page(struct inode *inode, struct page *page, 1652 static int __block_write_full_page(struct inode *inode, struct page *page,
1645 get_block_t *get_block, struct writeback_control *wbc) 1653 get_block_t *get_block, struct writeback_control *wbc)
1646 { 1654 {
1647 int err; 1655 int err;
1648 sector_t block; 1656 sector_t block;
1649 sector_t last_block; 1657 sector_t last_block;
1650 struct buffer_head *bh, *head; 1658 struct buffer_head *bh, *head;
1651 const unsigned blocksize = 1 << inode->i_blkbits; 1659 const unsigned blocksize = 1 << inode->i_blkbits;
1652 int nr_underway = 0; 1660 int nr_underway = 0;
1653 1661
1654 BUG_ON(!PageLocked(page)); 1662 BUG_ON(!PageLocked(page));
1655 1663
1656 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; 1664 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1657 1665
1658 if (!page_has_buffers(page)) { 1666 if (!page_has_buffers(page)) {
1659 create_empty_buffers(page, blocksize, 1667 create_empty_buffers(page, blocksize,
1660 (1 << BH_Dirty)|(1 << BH_Uptodate)); 1668 (1 << BH_Dirty)|(1 << BH_Uptodate));
1661 } 1669 }
1662 1670
1663 /* 1671 /*
1664 * Be very careful. We have no exclusion from __set_page_dirty_buffers 1672 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1665 * here, and the (potentially unmapped) buffers may become dirty at 1673 * here, and the (potentially unmapped) buffers may become dirty at
1666 * any time. If a buffer becomes dirty here after we've inspected it 1674 * any time. If a buffer becomes dirty here after we've inspected it
1667 * then we just miss that fact, and the page stays dirty. 1675 * then we just miss that fact, and the page stays dirty.
1668 * 1676 *
1669 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; 1677 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1670 * handle that here by just cleaning them. 1678 * handle that here by just cleaning them.
1671 */ 1679 */
1672 1680
1673 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1681 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1674 head = page_buffers(page); 1682 head = page_buffers(page);
1675 bh = head; 1683 bh = head;
1676 1684
1677 /* 1685 /*
1678 * Get all the dirty buffers mapped to disk addresses and 1686 * Get all the dirty buffers mapped to disk addresses and
1679 * handle any aliases from the underlying blockdev's mapping. 1687 * handle any aliases from the underlying blockdev's mapping.
1680 */ 1688 */
1681 do { 1689 do {
1682 if (block > last_block) { 1690 if (block > last_block) {
1683 /* 1691 /*
1684 * mapped buffers outside i_size will occur, because 1692 * mapped buffers outside i_size will occur, because
1685 * this page can be outside i_size when there is a 1693 * this page can be outside i_size when there is a
1686 * truncate in progress. 1694 * truncate in progress.
1687 */ 1695 */
1688 /* 1696 /*
1689 * The buffer was zeroed by block_write_full_page() 1697 * The buffer was zeroed by block_write_full_page()
1690 */ 1698 */
1691 clear_buffer_dirty(bh); 1699 clear_buffer_dirty(bh);
1692 set_buffer_uptodate(bh); 1700 set_buffer_uptodate(bh);
1693 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && 1701 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1694 buffer_dirty(bh)) { 1702 buffer_dirty(bh)) {
1695 WARN_ON(bh->b_size != blocksize); 1703 WARN_ON(bh->b_size != blocksize);
1696 err = get_block(inode, block, bh, 1); 1704 err = get_block(inode, block, bh, 1);
1697 if (err) 1705 if (err)
1698 goto recover; 1706 goto recover;
1699 clear_buffer_delay(bh); 1707 clear_buffer_delay(bh);
1700 if (buffer_new(bh)) { 1708 if (buffer_new(bh)) {
1701 /* blockdev mappings never come here */ 1709 /* blockdev mappings never come here */
1702 clear_buffer_new(bh); 1710 clear_buffer_new(bh);
1703 unmap_underlying_metadata(bh->b_bdev, 1711 unmap_underlying_metadata(bh->b_bdev,
1704 bh->b_blocknr); 1712 bh->b_blocknr);
1705 } 1713 }
1706 } 1714 }
1707 bh = bh->b_this_page; 1715 bh = bh->b_this_page;
1708 block++; 1716 block++;
1709 } while (bh != head); 1717 } while (bh != head);
1710 1718
1711 do { 1719 do {
1712 if (!buffer_mapped(bh)) 1720 if (!buffer_mapped(bh))
1713 continue; 1721 continue;
1714 /* 1722 /*
1715 * If it's a fully non-blocking write attempt and we cannot 1723 * If it's a fully non-blocking write attempt and we cannot
1716 * lock the buffer then redirty the page. Note that this can 1724 * lock the buffer then redirty the page. Note that this can
1717 * potentially cause a busy-wait loop from pdflush and kswapd 1725 * potentially cause a busy-wait loop from pdflush and kswapd
1718 * activity, but those code paths have their own higher-level 1726 * activity, but those code paths have their own higher-level
1719 * throttling. 1727 * throttling.
1720 */ 1728 */
1721 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { 1729 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1722 lock_buffer(bh); 1730 lock_buffer(bh);
1723 } else if (!trylock_buffer(bh)) { 1731 } else if (!trylock_buffer(bh)) {
1724 redirty_page_for_writepage(wbc, page); 1732 redirty_page_for_writepage(wbc, page);
1725 continue; 1733 continue;
1726 } 1734 }
1727 if (test_clear_buffer_dirty(bh)) { 1735 if (test_clear_buffer_dirty(bh)) {
1728 mark_buffer_async_write(bh); 1736 mark_buffer_async_write(bh);
1729 } else { 1737 } else {
1730 unlock_buffer(bh); 1738 unlock_buffer(bh);
1731 } 1739 }
1732 } while ((bh = bh->b_this_page) != head); 1740 } while ((bh = bh->b_this_page) != head);
1733 1741
1734 /* 1742 /*
1735 * The page and its buffers are protected by PageWriteback(), so we can 1743 * The page and its buffers are protected by PageWriteback(), so we can
1736 * drop the bh refcounts early. 1744 * drop the bh refcounts early.
1737 */ 1745 */
1738 BUG_ON(PageWriteback(page)); 1746 BUG_ON(PageWriteback(page));
1739 set_page_writeback(page); 1747 set_page_writeback(page);
1740 1748
1741 do { 1749 do {
1742 struct buffer_head *next = bh->b_this_page; 1750 struct buffer_head *next = bh->b_this_page;
1743 if (buffer_async_write(bh)) { 1751 if (buffer_async_write(bh)) {
1744 submit_bh(WRITE, bh); 1752 submit_bh(WRITE, bh);
1745 nr_underway++; 1753 nr_underway++;
1746 } 1754 }
1747 bh = next; 1755 bh = next;
1748 } while (bh != head); 1756 } while (bh != head);
1749 unlock_page(page); 1757 unlock_page(page);
1750 1758
1751 err = 0; 1759 err = 0;
1752 done: 1760 done:
1753 if (nr_underway == 0) { 1761 if (nr_underway == 0) {
1754 /* 1762 /*
1755 * The page was marked dirty, but the buffers were 1763 * The page was marked dirty, but the buffers were
1756 * clean. Someone wrote them back by hand with 1764 * clean. Someone wrote them back by hand with
1757 * ll_rw_block/submit_bh. A rare case. 1765 * ll_rw_block/submit_bh. A rare case.
1758 */ 1766 */
1759 end_page_writeback(page); 1767 end_page_writeback(page);
1760 1768
1761 /* 1769 /*
1762 * The page and buffer_heads can be released at any time from 1770 * The page and buffer_heads can be released at any time from
1763 * here on. 1771 * here on.
1764 */ 1772 */
1765 } 1773 }
1766 return err; 1774 return err;
1767 1775
1768 recover: 1776 recover:
1769 /* 1777 /*
1770 * ENOSPC, or some other error. We may already have added some 1778 * ENOSPC, or some other error. We may already have added some
1771 * blocks to the file, so we need to write these out to avoid 1779 * blocks to the file, so we need to write these out to avoid
1772 * exposing stale data. 1780 * exposing stale data.
1773 * The page is currently locked and not marked for writeback 1781 * The page is currently locked and not marked for writeback
1774 */ 1782 */
1775 bh = head; 1783 bh = head;
1776 /* Recovery: lock and submit the mapped buffers */ 1784 /* Recovery: lock and submit the mapped buffers */
1777 do { 1785 do {
1778 if (buffer_mapped(bh) && buffer_dirty(bh) && 1786 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1779 !buffer_delay(bh)) { 1787 !buffer_delay(bh)) {
1780 lock_buffer(bh); 1788 lock_buffer(bh);
1781 mark_buffer_async_write(bh); 1789 mark_buffer_async_write(bh);
1782 } else { 1790 } else {
1783 /* 1791 /*
1784 * The buffer may have been set dirty during 1792 * The buffer may have been set dirty during
1785 * attachment to a dirty page. 1793 * attachment to a dirty page.
1786 */ 1794 */
1787 clear_buffer_dirty(bh); 1795 clear_buffer_dirty(bh);
1788 } 1796 }
1789 } while ((bh = bh->b_this_page) != head); 1797 } while ((bh = bh->b_this_page) != head);
1790 SetPageError(page); 1798 SetPageError(page);
1791 BUG_ON(PageWriteback(page)); 1799 BUG_ON(PageWriteback(page));
1792 mapping_set_error(page->mapping, err); 1800 mapping_set_error(page->mapping, err);
1793 set_page_writeback(page); 1801 set_page_writeback(page);
1794 do { 1802 do {
1795 struct buffer_head *next = bh->b_this_page; 1803 struct buffer_head *next = bh->b_this_page;
1796 if (buffer_async_write(bh)) { 1804 if (buffer_async_write(bh)) {
1797 clear_buffer_dirty(bh); 1805 clear_buffer_dirty(bh);
1798 submit_bh(WRITE, bh); 1806 submit_bh(WRITE, bh);
1799 nr_underway++; 1807 nr_underway++;
1800 } 1808 }
1801 bh = next; 1809 bh = next;
1802 } while (bh != head); 1810 } while (bh != head);
1803 unlock_page(page); 1811 unlock_page(page);
1804 goto done; 1812 goto done;
1805 } 1813 }
1806 1814
1807 /* 1815 /*
1808 * If a page has any new buffers, zero them out here, and mark them uptodate 1816 * If a page has any new buffers, zero them out here, and mark them uptodate
1809 * and dirty so they'll be written out (in order to prevent uninitialised 1817 * and dirty so they'll be written out (in order to prevent uninitialised
1810 * block data from leaking). And clear the new bit. 1818 * block data from leaking). And clear the new bit.
1811 */ 1819 */
1812 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) 1820 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1813 { 1821 {
1814 unsigned int block_start, block_end; 1822 unsigned int block_start, block_end;
1815 struct buffer_head *head, *bh; 1823 struct buffer_head *head, *bh;
1816 1824
1817 BUG_ON(!PageLocked(page)); 1825 BUG_ON(!PageLocked(page));
1818 if (!page_has_buffers(page)) 1826 if (!page_has_buffers(page))
1819 return; 1827 return;
1820 1828
1821 bh = head = page_buffers(page); 1829 bh = head = page_buffers(page);
1822 block_start = 0; 1830 block_start = 0;
1823 do { 1831 do {
1824 block_end = block_start + bh->b_size; 1832 block_end = block_start + bh->b_size;
1825 1833
1826 if (buffer_new(bh)) { 1834 if (buffer_new(bh)) {
1827 if (block_end > from && block_start < to) { 1835 if (block_end > from && block_start < to) {
1828 if (!PageUptodate(page)) { 1836 if (!PageUptodate(page)) {
1829 unsigned start, size; 1837 unsigned start, size;
1830 1838
1831 start = max(from, block_start); 1839 start = max(from, block_start);
1832 size = min(to, block_end) - start; 1840 size = min(to, block_end) - start;
1833 1841
1834 zero_user(page, start, size); 1842 zero_user(page, start, size);
1835 set_buffer_uptodate(bh); 1843 set_buffer_uptodate(bh);
1836 } 1844 }
1837 1845
1838 clear_buffer_new(bh); 1846 clear_buffer_new(bh);
1839 mark_buffer_dirty(bh); 1847 mark_buffer_dirty(bh);
1840 } 1848 }
1841 } 1849 }
1842 1850
1843 block_start = block_end; 1851 block_start = block_end;
1844 bh = bh->b_this_page; 1852 bh = bh->b_this_page;
1845 } while (bh != head); 1853 } while (bh != head);
1846 } 1854 }
1847 EXPORT_SYMBOL(page_zero_new_buffers); 1855 EXPORT_SYMBOL(page_zero_new_buffers);
1848 1856
1849 static int __block_prepare_write(struct inode *inode, struct page *page, 1857 static int __block_prepare_write(struct inode *inode, struct page *page,
1850 unsigned from, unsigned to, get_block_t *get_block) 1858 unsigned from, unsigned to, get_block_t *get_block)
1851 { 1859 {
1852 unsigned block_start, block_end; 1860 unsigned block_start, block_end;
1853 sector_t block; 1861 sector_t block;
1854 int err = 0; 1862 int err = 0;
1855 unsigned blocksize, bbits; 1863 unsigned blocksize, bbits;
1856 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; 1864 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1857 1865
1858 BUG_ON(!PageLocked(page)); 1866 BUG_ON(!PageLocked(page));
1859 BUG_ON(from > PAGE_CACHE_SIZE); 1867 BUG_ON(from > PAGE_CACHE_SIZE);
1860 BUG_ON(to > PAGE_CACHE_SIZE); 1868 BUG_ON(to > PAGE_CACHE_SIZE);
1861 BUG_ON(from > to); 1869 BUG_ON(from > to);
1862 1870
1863 blocksize = 1 << inode->i_blkbits; 1871 blocksize = 1 << inode->i_blkbits;
1864 if (!page_has_buffers(page)) 1872 if (!page_has_buffers(page))
1865 create_empty_buffers(page, blocksize, 0); 1873 create_empty_buffers(page, blocksize, 0);
1866 head = page_buffers(page); 1874 head = page_buffers(page);
1867 1875
1868 bbits = inode->i_blkbits; 1876 bbits = inode->i_blkbits;
1869 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); 1877 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1870 1878
1871 for(bh = head, block_start = 0; bh != head || !block_start; 1879 for(bh = head, block_start = 0; bh != head || !block_start;
1872 block++, block_start=block_end, bh = bh->b_this_page) { 1880 block++, block_start=block_end, bh = bh->b_this_page) {
1873 block_end = block_start + blocksize; 1881 block_end = block_start + blocksize;
1874 if (block_end <= from || block_start >= to) { 1882 if (block_end <= from || block_start >= to) {
1875 if (PageUptodate(page)) { 1883 if (PageUptodate(page)) {
1876 if (!buffer_uptodate(bh)) 1884 if (!buffer_uptodate(bh))
1877 set_buffer_uptodate(bh); 1885 set_buffer_uptodate(bh);
1878 } 1886 }
1879 continue; 1887 continue;
1880 } 1888 }
1881 if (buffer_new(bh)) 1889 if (buffer_new(bh))
1882 clear_buffer_new(bh); 1890 clear_buffer_new(bh);
1883 if (!buffer_mapped(bh)) { 1891 if (!buffer_mapped(bh)) {
1884 WARN_ON(bh->b_size != blocksize); 1892 WARN_ON(bh->b_size != blocksize);
1885 err = get_block(inode, block, bh, 1); 1893 err = get_block(inode, block, bh, 1);
1886 if (err) 1894 if (err)
1887 break; 1895 break;
1888 if (buffer_new(bh)) { 1896 if (buffer_new(bh)) {
1889 unmap_underlying_metadata(bh->b_bdev, 1897 unmap_underlying_metadata(bh->b_bdev,
1890 bh->b_blocknr); 1898 bh->b_blocknr);
1891 if (PageUptodate(page)) { 1899 if (PageUptodate(page)) {
1892 clear_buffer_new(bh); 1900 clear_buffer_new(bh);
1893 set_buffer_uptodate(bh); 1901 set_buffer_uptodate(bh);
1894 mark_buffer_dirty(bh); 1902 mark_buffer_dirty(bh);
1895 continue; 1903 continue;
1896 } 1904 }
1897 if (block_end > to || block_start < from) 1905 if (block_end > to || block_start < from)
1898 zero_user_segments(page, 1906 zero_user_segments(page,
1899 to, block_end, 1907 to, block_end,
1900 block_start, from); 1908 block_start, from);
1901 continue; 1909 continue;
1902 } 1910 }
1903 } 1911 }
1904 if (PageUptodate(page)) { 1912 if (PageUptodate(page)) {
1905 if (!buffer_uptodate(bh)) 1913 if (!buffer_uptodate(bh))
1906 set_buffer_uptodate(bh); 1914 set_buffer_uptodate(bh);
1907 continue; 1915 continue;
1908 } 1916 }
1909 if (!buffer_uptodate(bh) && !buffer_delay(bh) && 1917 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1910 !buffer_unwritten(bh) && 1918 !buffer_unwritten(bh) &&
1911 (block_start < from || block_end > to)) { 1919 (block_start < from || block_end > to)) {
1912 ll_rw_block(READ, 1, &bh); 1920 ll_rw_block(READ, 1, &bh);
1913 *wait_bh++=bh; 1921 *wait_bh++=bh;
1914 } 1922 }
1915 } 1923 }
1916 /* 1924 /*
1917 * If we issued read requests - let them complete. 1925 * If we issued read requests - let them complete.
1918 */ 1926 */
1919 while(wait_bh > wait) { 1927 while(wait_bh > wait) {
1920 wait_on_buffer(*--wait_bh); 1928 wait_on_buffer(*--wait_bh);
1921 if (!buffer_uptodate(*wait_bh)) 1929 if (!buffer_uptodate(*wait_bh))
1922 err = -EIO; 1930 err = -EIO;
1923 } 1931 }
1924 if (unlikely(err)) 1932 if (unlikely(err))
1925 page_zero_new_buffers(page, from, to); 1933 page_zero_new_buffers(page, from, to);
1926 return err; 1934 return err;
1927 } 1935 }
1928 1936
1929 static int __block_commit_write(struct inode *inode, struct page *page, 1937 static int __block_commit_write(struct inode *inode, struct page *page,
1930 unsigned from, unsigned to) 1938 unsigned from, unsigned to)
1931 { 1939 {
1932 unsigned block_start, block_end; 1940 unsigned block_start, block_end;
1933 int partial = 0; 1941 int partial = 0;
1934 unsigned blocksize; 1942 unsigned blocksize;
1935 struct buffer_head *bh, *head; 1943 struct buffer_head *bh, *head;
1936 1944
1937 blocksize = 1 << inode->i_blkbits; 1945 blocksize = 1 << inode->i_blkbits;
1938 1946
1939 for(bh = head = page_buffers(page), block_start = 0; 1947 for(bh = head = page_buffers(page), block_start = 0;
1940 bh != head || !block_start; 1948 bh != head || !block_start;
1941 block_start=block_end, bh = bh->b_this_page) { 1949 block_start=block_end, bh = bh->b_this_page) {
1942 block_end = block_start + blocksize; 1950 block_end = block_start + blocksize;
1943 if (block_end <= from || block_start >= to) { 1951 if (block_end <= from || block_start >= to) {
1944 if (!buffer_uptodate(bh)) 1952 if (!buffer_uptodate(bh))
1945 partial = 1; 1953 partial = 1;
1946 } else { 1954 } else {
1947 set_buffer_uptodate(bh); 1955 set_buffer_uptodate(bh);
1948 mark_buffer_dirty(bh); 1956 mark_buffer_dirty(bh);
1949 } 1957 }
1950 clear_buffer_new(bh); 1958 clear_buffer_new(bh);
1951 } 1959 }
1952 1960
1953 /* 1961 /*
1954 * If this is a partial write which happened to make all buffers 1962 * If this is a partial write which happened to make all buffers
1955 * uptodate then we can optimize away a bogus readpage() for 1963 * uptodate then we can optimize away a bogus readpage() for
1956 * the next read(). Here we 'discover' whether the page went 1964 * the next read(). Here we 'discover' whether the page went
1957 * uptodate as a result of this (potentially partial) write. 1965 * uptodate as a result of this (potentially partial) write.
1958 */ 1966 */
1959 if (!partial) 1967 if (!partial)
1960 SetPageUptodate(page); 1968 SetPageUptodate(page);
1961 return 0; 1969 return 0;
1962 } 1970 }
1963 1971
1964 /* 1972 /*
1965 * block_write_begin takes care of the basic task of block allocation and 1973 * block_write_begin takes care of the basic task of block allocation and
1966 * bringing partial write blocks uptodate first. 1974 * bringing partial write blocks uptodate first.
1967 * 1975 *
1968 * If *pagep is not NULL, then block_write_begin uses the locked page 1976 * If *pagep is not NULL, then block_write_begin uses the locked page
1969 * at *pagep rather than allocating its own. In this case, the page will 1977 * at *pagep rather than allocating its own. In this case, the page will
1970 * not be unlocked or deallocated on failure. 1978 * not be unlocked or deallocated on failure.
1971 */ 1979 */
1972 int block_write_begin(struct file *file, struct address_space *mapping, 1980 int block_write_begin(struct file *file, struct address_space *mapping,
1973 loff_t pos, unsigned len, unsigned flags, 1981 loff_t pos, unsigned len, unsigned flags,
1974 struct page **pagep, void **fsdata, 1982 struct page **pagep, void **fsdata,
1975 get_block_t *get_block) 1983 get_block_t *get_block)
1976 { 1984 {
1977 struct inode *inode = mapping->host; 1985 struct inode *inode = mapping->host;
1978 int status = 0; 1986 int status = 0;
1979 struct page *page; 1987 struct page *page;
1980 pgoff_t index; 1988 pgoff_t index;
1981 unsigned start, end; 1989 unsigned start, end;
1982 int ownpage = 0; 1990 int ownpage = 0;
1983 1991
1984 index = pos >> PAGE_CACHE_SHIFT; 1992 index = pos >> PAGE_CACHE_SHIFT;
1985 start = pos & (PAGE_CACHE_SIZE - 1); 1993 start = pos & (PAGE_CACHE_SIZE - 1);
1986 end = start + len; 1994 end = start + len;
1987 1995
1988 page = *pagep; 1996 page = *pagep;
1989 if (page == NULL) { 1997 if (page == NULL) {
1990 ownpage = 1; 1998 ownpage = 1;
1991 page = __grab_cache_page(mapping, index); 1999 page = __grab_cache_page(mapping, index);
1992 if (!page) { 2000 if (!page) {
1993 status = -ENOMEM; 2001 status = -ENOMEM;
1994 goto out; 2002 goto out;
1995 } 2003 }
1996 *pagep = page; 2004 *pagep = page;
1997 } else 2005 } else
1998 BUG_ON(!PageLocked(page)); 2006 BUG_ON(!PageLocked(page));
1999 2007
2000 status = __block_prepare_write(inode, page, start, end, get_block); 2008 status = __block_prepare_write(inode, page, start, end, get_block);
2001 if (unlikely(status)) { 2009 if (unlikely(status)) {
2002 ClearPageUptodate(page); 2010 ClearPageUptodate(page);
2003 2011
2004 if (ownpage) { 2012 if (ownpage) {
2005 unlock_page(page); 2013 unlock_page(page);
2006 page_cache_release(page); 2014 page_cache_release(page);
2007 *pagep = NULL; 2015 *pagep = NULL;
2008 2016
2009 /* 2017 /*
2010 * prepare_write() may have instantiated a few blocks 2018 * prepare_write() may have instantiated a few blocks
2011 * outside i_size. Trim these off again. Don't need 2019 * outside i_size. Trim these off again. Don't need
2012 * i_size_read because we hold i_mutex. 2020 * i_size_read because we hold i_mutex.
2013 */ 2021 */
2014 if (pos + len > inode->i_size) 2022 if (pos + len > inode->i_size)
2015 vmtruncate(inode, inode->i_size); 2023 vmtruncate(inode, inode->i_size);
2016 } 2024 }
2017 goto out; 2025 goto out;
2018 } 2026 }
2019 2027
2020 out: 2028 out:
2021 return status; 2029 return status;
2022 } 2030 }
2023 EXPORT_SYMBOL(block_write_begin); 2031 EXPORT_SYMBOL(block_write_begin);
2024 2032
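A typical user of block_write_begin() is a filesystem ->write_begin method that simply supplies its own block-mapping callback. A minimal sketch, assuming a hypothetical foo_get_block() with the usual get_block_t signature:

/* foo_get_block() is a hypothetical get_block_t supplied by the filesystem */
static int foo_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	/* NULL asks block_write_begin() to find and lock the page itself */
	*pagep = NULL;
	return block_write_begin(file, mapping, pos, len, flags,
				 pagep, fsdata, foo_get_block);
}

Setting *pagep to NULL lets the helper allocate and lock the page; a caller that already holds a locked page passes it in instead, as the comment above notes.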
2025 int block_write_end(struct file *file, struct address_space *mapping, 2033 int block_write_end(struct file *file, struct address_space *mapping,
2026 loff_t pos, unsigned len, unsigned copied, 2034 loff_t pos, unsigned len, unsigned copied,
2027 struct page *page, void *fsdata) 2035 struct page *page, void *fsdata)
2028 { 2036 {
2029 struct inode *inode = mapping->host; 2037 struct inode *inode = mapping->host;
2030 unsigned start; 2038 unsigned start;
2031 2039
2032 start = pos & (PAGE_CACHE_SIZE - 1); 2040 start = pos & (PAGE_CACHE_SIZE - 1);
2033 2041
2034 if (unlikely(copied < len)) { 2042 if (unlikely(copied < len)) {
2035 /* 2043 /*
2036 * The buffers that were written will now be uptodate, so we 2044 * The buffers that were written will now be uptodate, so we
2037 * don't have to worry about a readpage reading them and 2045 * don't have to worry about a readpage reading them and
2038 * overwriting a partial write. However if we have encountered 2046 * overwriting a partial write. However if we have encountered
2039 * a short write and only partially written into a buffer, it 2047 * a short write and only partially written into a buffer, it
2040 * will not be marked uptodate, so a readpage might come in and 2048 * will not be marked uptodate, so a readpage might come in and
2041 * destroy our partial write. 2049 * destroy our partial write.
2042 * 2050 *
2043 * Do the simplest thing, and just treat any short write to a 2051 * Do the simplest thing, and just treat any short write to a
2044 * non uptodate page as a zero-length write, and force the 2052 * non uptodate page as a zero-length write, and force the
2045 * caller to redo the whole thing. 2053 * caller to redo the whole thing.
2046 */ 2054 */
2047 if (!PageUptodate(page)) 2055 if (!PageUptodate(page))
2048 copied = 0; 2056 copied = 0;
2049 2057
2050 page_zero_new_buffers(page, start+copied, start+len); 2058 page_zero_new_buffers(page, start+copied, start+len);
2051 } 2059 }
2052 flush_dcache_page(page); 2060 flush_dcache_page(page);
2053 2061
2054 /* This could be a short (even 0-length) commit */ 2062 /* This could be a short (even 0-length) commit */
2055 __block_commit_write(inode, page, start, start+copied); 2063 __block_commit_write(inode, page, start, start+copied);
2056 2064
2057 return copied; 2065 return copied;
2058 } 2066 }
2059 EXPORT_SYMBOL(block_write_end); 2067 EXPORT_SYMBOL(block_write_end);
2060 2068
2061 int generic_write_end(struct file *file, struct address_space *mapping, 2069 int generic_write_end(struct file *file, struct address_space *mapping,
2062 loff_t pos, unsigned len, unsigned copied, 2070 loff_t pos, unsigned len, unsigned copied,
2063 struct page *page, void *fsdata) 2071 struct page *page, void *fsdata)
2064 { 2072 {
2065 struct inode *inode = mapping->host; 2073 struct inode *inode = mapping->host;
2066 int i_size_changed = 0; 2074 int i_size_changed = 0;
2067 2075
2068 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); 2076 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2069 2077
2070 /* 2078 /*
2071 * No need to use i_size_read() here, the i_size 2079 * No need to use i_size_read() here, the i_size
2072 * cannot change under us because we hold i_mutex. 2080 * cannot change under us because we hold i_mutex.
2073 * 2081 *
2074 * But it's important to update i_size while still holding page lock: 2082 * But it's important to update i_size while still holding page lock:
2075 * page writeout could otherwise come in and zero beyond i_size. 2083 * page writeout could otherwise come in and zero beyond i_size.
2076 */ 2084 */
2077 if (pos+copied > inode->i_size) { 2085 if (pos+copied > inode->i_size) {
2078 i_size_write(inode, pos+copied); 2086 i_size_write(inode, pos+copied);
2079 i_size_changed = 1; 2087 i_size_changed = 1;
2080 } 2088 }
2081 2089
2082 unlock_page(page); 2090 unlock_page(page);
2083 page_cache_release(page); 2091 page_cache_release(page);
2084 2092
2085 /* 2093 /*
2086 * Don't mark the inode dirty under page lock. First, it unnecessarily 2094 * Don't mark the inode dirty under page lock. First, it unnecessarily
2087 * makes the holding time of page lock longer. Second, it forces lock 2095 * makes the holding time of page lock longer. Second, it forces lock
2088 * ordering of page lock and transaction start for journaling 2096 * ordering of page lock and transaction start for journaling
2089 * filesystems. 2097 * filesystems.
2090 */ 2098 */
2091 if (i_size_changed) 2099 if (i_size_changed)
2092 mark_inode_dirty(inode); 2100 mark_inode_dirty(inode);
2093 2101
2094 return copied; 2102 return copied;
2095 } 2103 }
2096 EXPORT_SYMBOL(generic_write_end); 2104 EXPORT_SYMBOL(generic_write_end);
2097 2105
2098 /* 2106 /*
2099 * block_is_partially_uptodate checks whether buffers within a page are 2107 * block_is_partially_uptodate checks whether buffers within a page are
2100 * uptodate or not. 2108 * uptodate or not.
2101 * 2109 *
2102 * Returns true if all buffers which correspond to a file portion 2110 * Returns true if all buffers which correspond to a file portion
2103 * we want to read are uptodate. 2111 * we want to read are uptodate.
2104 */ 2112 */
2105 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, 2113 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2106 unsigned long from) 2114 unsigned long from)
2107 { 2115 {
2108 struct inode *inode = page->mapping->host; 2116 struct inode *inode = page->mapping->host;
2109 unsigned block_start, block_end, blocksize; 2117 unsigned block_start, block_end, blocksize;
2110 unsigned to; 2118 unsigned to;
2111 struct buffer_head *bh, *head; 2119 struct buffer_head *bh, *head;
2112 int ret = 1; 2120 int ret = 1;
2113 2121
2114 if (!page_has_buffers(page)) 2122 if (!page_has_buffers(page))
2115 return 0; 2123 return 0;
2116 2124
2117 blocksize = 1 << inode->i_blkbits; 2125 blocksize = 1 << inode->i_blkbits;
2118 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); 2126 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2119 to = from + to; 2127 to = from + to;
2120 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) 2128 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2121 return 0; 2129 return 0;
2122 2130
2123 head = page_buffers(page); 2131 head = page_buffers(page);
2124 bh = head; 2132 bh = head;
2125 block_start = 0; 2133 block_start = 0;
2126 do { 2134 do {
2127 block_end = block_start + blocksize; 2135 block_end = block_start + blocksize;
2128 if (block_end > from && block_start < to) { 2136 if (block_end > from && block_start < to) {
2129 if (!buffer_uptodate(bh)) { 2137 if (!buffer_uptodate(bh)) {
2130 ret = 0; 2138 ret = 0;
2131 break; 2139 break;
2132 } 2140 }
2133 if (block_end >= to) 2141 if (block_end >= to)
2134 break; 2142 break;
2135 } 2143 }
2136 block_start = block_end; 2144 block_start = block_end;
2137 bh = bh->b_this_page; 2145 bh = bh->b_this_page;
2138 } while (bh != head); 2146 } while (bh != head);
2139 2147
2140 return ret; 2148 return ret;
2141 } 2149 }
2142 EXPORT_SYMBOL(block_is_partially_uptodate); 2150 EXPORT_SYMBOL(block_is_partially_uptodate);
2143 2151
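Because block_is_partially_uptodate() takes the standard (page, read_descriptor_t, offset) arguments, a filesystem can plug it straight into its address_space_operations. In the sketch below the foo_* entries are hypothetical stand-ins for the filesystem's own methods (a delegating foo_readpage is sketched after block_read_full_page below):

/* foo_readpage/foo_writepage/foo_write_begin are hypothetical callbacks */
static const struct address_space_operations foo_aops = {
	.readpage		= foo_readpage,
	.writepage		= foo_writepage,
	.write_begin		= foo_write_begin,
	.write_end		= generic_write_end,
	.is_partially_uptodate	= block_is_partially_uptodate,
};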
2144 /* 2152 /*
2145 * Generic "read page" function for block devices that have the normal 2153 * Generic "read page" function for block devices that have the normal
2146 * get_block functionality. This is most of the block device filesystems. 2154 * get_block functionality. This is most of the block device filesystems.
2147 * Reads the page asynchronously --- the unlock_buffer() and 2155 * Reads the page asynchronously --- the unlock_buffer() and
2148 * set/clear_buffer_uptodate() functions propagate buffer state into the 2156 * set/clear_buffer_uptodate() functions propagate buffer state into the
2149 * page struct once IO has completed. 2157 * page struct once IO has completed.
2150 */ 2158 */
2151 int block_read_full_page(struct page *page, get_block_t *get_block) 2159 int block_read_full_page(struct page *page, get_block_t *get_block)
2152 { 2160 {
2153 struct inode *inode = page->mapping->host; 2161 struct inode *inode = page->mapping->host;
2154 sector_t iblock, lblock; 2162 sector_t iblock, lblock;
2155 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; 2163 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2156 unsigned int blocksize; 2164 unsigned int blocksize;
2157 int nr, i; 2165 int nr, i;
2158 int fully_mapped = 1; 2166 int fully_mapped = 1;
2159 2167
2160 BUG_ON(!PageLocked(page)); 2168 BUG_ON(!PageLocked(page));
2161 blocksize = 1 << inode->i_blkbits; 2169 blocksize = 1 << inode->i_blkbits;
2162 if (!page_has_buffers(page)) 2170 if (!page_has_buffers(page))
2163 create_empty_buffers(page, blocksize, 0); 2171 create_empty_buffers(page, blocksize, 0);
2164 head = page_buffers(page); 2172 head = page_buffers(page);
2165 2173
2166 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2174 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2167 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; 2175 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2168 bh = head; 2176 bh = head;
2169 nr = 0; 2177 nr = 0;
2170 i = 0; 2178 i = 0;
2171 2179
2172 do { 2180 do {
2173 if (buffer_uptodate(bh)) 2181 if (buffer_uptodate(bh))
2174 continue; 2182 continue;
2175 2183
2176 if (!buffer_mapped(bh)) { 2184 if (!buffer_mapped(bh)) {
2177 int err = 0; 2185 int err = 0;
2178 2186
2179 fully_mapped = 0; 2187 fully_mapped = 0;
2180 if (iblock < lblock) { 2188 if (iblock < lblock) {
2181 WARN_ON(bh->b_size != blocksize); 2189 WARN_ON(bh->b_size != blocksize);
2182 err = get_block(inode, iblock, bh, 0); 2190 err = get_block(inode, iblock, bh, 0);
2183 if (err) 2191 if (err)
2184 SetPageError(page); 2192 SetPageError(page);
2185 } 2193 }
2186 if (!buffer_mapped(bh)) { 2194 if (!buffer_mapped(bh)) {
2187 zero_user(page, i * blocksize, blocksize); 2195 zero_user(page, i * blocksize, blocksize);
2188 if (!err) 2196 if (!err)
2189 set_buffer_uptodate(bh); 2197 set_buffer_uptodate(bh);
2190 continue; 2198 continue;
2191 } 2199 }
2192 /* 2200 /*
2193 * get_block() might have updated the buffer 2201 * get_block() might have updated the buffer
2194 * synchronously 2202 * synchronously
2195 */ 2203 */
2196 if (buffer_uptodate(bh)) 2204 if (buffer_uptodate(bh))
2197 continue; 2205 continue;
2198 } 2206 }
2199 arr[nr++] = bh; 2207 arr[nr++] = bh;
2200 } while (i++, iblock++, (bh = bh->b_this_page) != head); 2208 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2201 2209
2202 if (fully_mapped) 2210 if (fully_mapped)
2203 SetPageMappedToDisk(page); 2211 SetPageMappedToDisk(page);
2204 2212
2205 if (!nr) { 2213 if (!nr) {
2206 /* 2214 /*
2207 * All buffers are uptodate - we can set the page uptodate 2215 * All buffers are uptodate - we can set the page uptodate
2208 * as well. But not if get_block() returned an error. 2216 * as well. But not if get_block() returned an error.
2209 */ 2217 */
2210 if (!PageError(page)) 2218 if (!PageError(page))
2211 SetPageUptodate(page); 2219 SetPageUptodate(page);
2212 unlock_page(page); 2220 unlock_page(page);
2213 return 0; 2221 return 0;
2214 } 2222 }
2215 2223
2216 /* Stage two: lock the buffers */ 2224 /* Stage two: lock the buffers */
2217 for (i = 0; i < nr; i++) { 2225 for (i = 0; i < nr; i++) {
2218 bh = arr[i]; 2226 bh = arr[i];
2219 lock_buffer(bh); 2227 lock_buffer(bh);
2220 mark_buffer_async_read(bh); 2228 mark_buffer_async_read(bh);
2221 } 2229 }
2222 2230
2223 /* 2231 /*
2224 * Stage 3: start the IO. Check for uptodateness 2232 * Stage 3: start the IO. Check for uptodateness
2225 * inside the buffer lock in case another process reading 2233 * inside the buffer lock in case another process reading
2226 * the underlying blockdev brought it uptodate (the sct fix). 2234 * the underlying blockdev brought it uptodate (the sct fix).
2227 */ 2235 */
2228 for (i = 0; i < nr; i++) { 2236 for (i = 0; i < nr; i++) {
2229 bh = arr[i]; 2237 bh = arr[i];
2230 if (buffer_uptodate(bh)) 2238 if (buffer_uptodate(bh))
2231 end_buffer_async_read(bh, 1); 2239 end_buffer_async_read(bh, 1);
2232 else 2240 else
2233 submit_bh(READ, bh); 2241 submit_bh(READ, bh);
2234 } 2242 }
2235 return 0; 2243 return 0;
2236 } 2244 }
2237 2245
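In practice block_read_full_page() serves as the body of a filesystem's ->readpage; a minimal sketch, again assuming the hypothetical foo_get_block():

/* foo_get_block() is a hypothetical get_block_t for this sketch */
static int foo_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, foo_get_block);
}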
2238 /* utility function for filesystems that need to do work on expanding 2246 /* utility function for filesystems that need to do work on expanding
2239 * truncates. Uses filesystem pagecache writes to allow the filesystem to 2247 * truncates. Uses filesystem pagecache writes to allow the filesystem to
2240 * deal with the hole. 2248 * deal with the hole.
2241 */ 2249 */
2242 int generic_cont_expand_simple(struct inode *inode, loff_t size) 2250 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2243 { 2251 {
2244 struct address_space *mapping = inode->i_mapping; 2252 struct address_space *mapping = inode->i_mapping;
2245 struct page *page; 2253 struct page *page;
2246 void *fsdata; 2254 void *fsdata;
2247 unsigned long limit; 2255 unsigned long limit;
2248 int err; 2256 int err;
2249 2257
2250 err = -EFBIG; 2258 err = -EFBIG;
2251 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 2259 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2252 if (limit != RLIM_INFINITY && size > (loff_t)limit) { 2260 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2253 send_sig(SIGXFSZ, current, 0); 2261 send_sig(SIGXFSZ, current, 0);
2254 goto out; 2262 goto out;
2255 } 2263 }
2256 if (size > inode->i_sb->s_maxbytes) 2264 if (size > inode->i_sb->s_maxbytes)
2257 goto out; 2265 goto out;
2258 2266
2259 err = pagecache_write_begin(NULL, mapping, size, 0, 2267 err = pagecache_write_begin(NULL, mapping, size, 0,
2260 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, 2268 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2261 &page, &fsdata); 2269 &page, &fsdata);
2262 if (err) 2270 if (err)
2263 goto out; 2271 goto out;
2264 2272
2265 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata); 2273 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2266 BUG_ON(err > 0); 2274 BUG_ON(err > 0);
2267 2275
2268 out: 2276 out:
2269 return err; 2277 return err;
2270 } 2278 }
2271 2279
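A plausible caller is a filesystem ->setattr method that must materialize the new tail before moving i_size on an expanding truncate; a sketch under that assumption, with foo_setattr hypothetical and the inode_* helpers following the 2.6.2x VFS conventions:

/* hypothetical ->setattr for a filesystem that cannot leave holes */
static int foo_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	/* write zeroes out to the new size before i_size is raised */
	if ((attr->ia_valid & ATTR_SIZE) &&
	    attr->ia_size > i_size_read(inode)) {
		error = generic_cont_expand_simple(inode, attr->ia_size);
		if (error)
			return error;
	}
	return inode_setattr(inode, attr);
}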
2272 static int cont_expand_zero(struct file *file, struct address_space *mapping, 2280 static int cont_expand_zero(struct file *file, struct address_space *mapping,
2273 loff_t pos, loff_t *bytes) 2281 loff_t pos, loff_t *bytes)
2274 { 2282 {
2275 struct inode *inode = mapping->host; 2283 struct inode *inode = mapping->host;
2276 unsigned blocksize = 1 << inode->i_blkbits; 2284 unsigned blocksize = 1 << inode->i_blkbits;
2277 struct page *page; 2285 struct page *page;
2278 void *fsdata; 2286 void *fsdata;
2279 pgoff_t index, curidx; 2287 pgoff_t index, curidx;
2280 loff_t curpos; 2288 loff_t curpos;
2281 unsigned zerofrom, offset, len; 2289 unsigned zerofrom, offset, len;
2282 int err = 0; 2290 int err = 0;
2283 2291
2284 index = pos >> PAGE_CACHE_SHIFT; 2292 index = pos >> PAGE_CACHE_SHIFT;
2285 offset = pos & ~PAGE_CACHE_MASK; 2293 offset = pos & ~PAGE_CACHE_MASK;
2286 2294
2287 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) { 2295 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2288 zerofrom = curpos & ~PAGE_CACHE_MASK; 2296 zerofrom = curpos & ~PAGE_CACHE_MASK;
2289 if (zerofrom & (blocksize-1)) { 2297 if (zerofrom & (blocksize-1)) {
2290 *bytes |= (blocksize-1); 2298 *bytes |= (blocksize-1);
2291 (*bytes)++; 2299 (*bytes)++;
2292 } 2300 }
2293 len = PAGE_CACHE_SIZE - zerofrom; 2301 len = PAGE_CACHE_SIZE - zerofrom;
2294 2302
2295 err = pagecache_write_begin(file, mapping, curpos, len, 2303 err = pagecache_write_begin(file, mapping, curpos, len,
2296 AOP_FLAG_UNINTERRUPTIBLE, 2304 AOP_FLAG_UNINTERRUPTIBLE,
2297 &page, &fsdata); 2305 &page, &fsdata);
2298 if (err) 2306 if (err)
2299 goto out; 2307 goto out;
2300 zero_user(page, zerofrom, len); 2308 zero_user(page, zerofrom, len);
2301 err = pagecache_write_end(file, mapping, curpos, len, len, 2309 err = pagecache_write_end(file, mapping, curpos, len, len,
2302 page, fsdata); 2310 page, fsdata);
2303 if (err < 0) 2311 if (err < 0)
2304 goto out; 2312 goto out;
2305 BUG_ON(err != len); 2313 BUG_ON(err != len);
2306 err = 0; 2314 err = 0;
2307 2315
2308 balance_dirty_pages_ratelimited(mapping); 2316 balance_dirty_pages_ratelimited(mapping);
2309 } 2317 }
2310 2318
2311 /* page covers the boundary, find the boundary offset */ 2319 /* page covers the boundary, find the boundary offset */
2312 if (index == curidx) { 2320 if (index == curidx) {
2313 zerofrom = curpos & ~PAGE_CACHE_MASK; 2321 zerofrom = curpos & ~PAGE_CACHE_MASK;
2314 /* if we will expand the thing last block will be filled */ 2322 /* if we will expand the thing last block will be filled */
2315 if (offset <= zerofrom) { 2323 if (offset <= zerofrom) {
2316 goto out; 2324 goto out;
2317 } 2325 }
2318 if (zerofrom & (blocksize-1)) { 2326 if (zerofrom & (blocksize-1)) {
2319 *bytes |= (blocksize-1); 2327 *bytes |= (blocksize-1);
2320 (*bytes)++; 2328 (*bytes)++;
2321 } 2329 }
2322 len = offset - zerofrom; 2330 len = offset - zerofrom;
2323 2331
2324 err = pagecache_write_begin(file, mapping, curpos, len, 2332 err = pagecache_write_begin(file, mapping, curpos, len,
2325 AOP_FLAG_UNINTERRUPTIBLE, 2333 AOP_FLAG_UNINTERRUPTIBLE,
2326 &page, &fsdata); 2334 &page, &fsdata);
2327 if (err) 2335 if (err)
2328 goto out; 2336 goto out;
2329 zero_user(page, zerofrom, len); 2337 zero_user(page, zerofrom, len);
2330 err = pagecache_write_end(file, mapping, curpos, len, len, 2338 err = pagecache_write_end(file, mapping, curpos, len, len,
2331 page, fsdata); 2339 page, fsdata);
2332 if (err < 0) 2340 if (err < 0)
2333 goto out; 2341 goto out;
2334 BUG_ON(err != len); 2342 BUG_ON(err != len);
2335 err = 0; 2343 err = 0;
2336 } 2344 }
2337 out: 2345 out:
2338 return err; 2346 return err;
2339 } 2347 }
2340 2348
2341 /* 2349 /*
2342 * For moronic filesystems that do not allow holes in files. 2350 * For moronic filesystems that do not allow holes in files.
2343 * We may have to extend the file. 2351 * We may have to extend the file.
2344 */ 2352 */
2345 int cont_write_begin(struct file *file, struct address_space *mapping, 2353 int cont_write_begin(struct file *file, struct address_space *mapping,
2346 loff_t pos, unsigned len, unsigned flags, 2354 loff_t pos, unsigned len, unsigned flags,
2347 struct page **pagep, void **fsdata, 2355 struct page **pagep, void **fsdata,
2348 get_block_t *get_block, loff_t *bytes) 2356 get_block_t *get_block, loff_t *bytes)
2349 { 2357 {
2350 struct inode *inode = mapping->host; 2358 struct inode *inode = mapping->host;
2351 unsigned blocksize = 1 << inode->i_blkbits; 2359 unsigned blocksize = 1 << inode->i_blkbits;
2352 unsigned zerofrom; 2360 unsigned zerofrom;
2353 int err; 2361 int err;
2354 2362
2355 err = cont_expand_zero(file, mapping, pos, bytes); 2363 err = cont_expand_zero(file, mapping, pos, bytes);
2356 if (err) 2364 if (err)
2357 goto out; 2365 goto out;
2358 2366
2359 zerofrom = *bytes & ~PAGE_CACHE_MASK; 2367 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2360 if (pos+len > *bytes && zerofrom & (blocksize-1)) { 2368 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2361 *bytes |= (blocksize-1); 2369 *bytes |= (blocksize-1);
2362 (*bytes)++; 2370 (*bytes)++;
2363 } 2371 }
2364 2372
2365 *pagep = NULL; 2373 *pagep = NULL;
2366 err = block_write_begin(file, mapping, pos, len, 2374 err = block_write_begin(file, mapping, pos, len,
2367 flags, pagep, fsdata, get_block); 2375 flags, pagep, fsdata, get_block);
2368 out: 2376 out:
2369 return err; 2377 return err;
2370 } 2378 }
2371 2379
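A filesystem in that situation keeps a per-inode count of how many bytes have been materialized on disk and hands its address to cont_write_begin(); a sketch, with foo_inode_info, FOO_I() and foo_get_block() all hypothetical:

/* hypothetical per-inode bookkeeping for a hole-less filesystem */
struct foo_inode_info {
	loff_t		mmu_private;	/* bytes initialised on disk */
	struct inode	vfs_inode;
};

static inline struct foo_inode_info *FOO_I(struct inode *inode)
{
	return container_of(inode, struct foo_inode_info, vfs_inode);
}

static int foo_cont_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	*pagep = NULL;
	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				foo_get_block, &FOO_I(mapping->host)->mmu_private);
}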
2372 int block_prepare_write(struct page *page, unsigned from, unsigned to, 2380 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2373 get_block_t *get_block) 2381 get_block_t *get_block)
2374 { 2382 {
2375 struct inode *inode = page->mapping->host; 2383 struct inode *inode = page->mapping->host;
2376 int err = __block_prepare_write(inode, page, from, to, get_block); 2384 int err = __block_prepare_write(inode, page, from, to, get_block);
2377 if (err) 2385 if (err)
2378 ClearPageUptodate(page); 2386 ClearPageUptodate(page);
2379 return err; 2387 return err;
2380 } 2388 }
2381 2389
2382 int block_commit_write(struct page *page, unsigned from, unsigned to) 2390 int block_commit_write(struct page *page, unsigned from, unsigned to)
2383 { 2391 {
2384 struct inode *inode = page->mapping->host; 2392 struct inode *inode = page->mapping->host;
2385 __block_commit_write(inode,page,from,to); 2393 __block_commit_write(inode,page,from,to);
2386 return 0; 2394 return 0;
2387 } 2395 }
2388 2396
2389 /* 2397 /*
2390 * block_page_mkwrite() is not allowed to change the file size as it gets 2398 * block_page_mkwrite() is not allowed to change the file size as it gets
2391 * called from a page fault handler when a page is first dirtied. Hence we must 2399 * called from a page fault handler when a page is first dirtied. Hence we must
2392 * be careful to check for EOF conditions here. We set the page up correctly 2400 * be careful to check for EOF conditions here. We set the page up correctly
2393 * for a written page which means we get ENOSPC checking when writing into 2401 * for a written page which means we get ENOSPC checking when writing into
2394 * holes and correct delalloc and unwritten extent mapping on filesystems that 2402 * holes and correct delalloc and unwritten extent mapping on filesystems that
2395 * support these features. 2403 * support these features.
2396 * 2404 *
2397 * We are not allowed to take the i_mutex here so we have to play games to 2405 * We are not allowed to take the i_mutex here so we have to play games to
2398 * protect against truncate races as the page could now be beyond EOF. Because 2406 * protect against truncate races as the page could now be beyond EOF. Because
2399 * vmtruncate() writes the inode size before removing pages, once we have the 2407 * vmtruncate() writes the inode size before removing pages, once we have the
2400 * page lock we can determine safely if the page is beyond EOF. If it is not 2408 * page lock we can determine safely if the page is beyond EOF. If it is not
2401 * beyond EOF, then the page is guaranteed safe against truncation until we 2409 * beyond EOF, then the page is guaranteed safe against truncation until we
2402 * unlock the page. 2410 * unlock the page.
2403 */ 2411 */
2404 int 2412 int
2405 block_page_mkwrite(struct vm_area_struct *vma, struct page *page, 2413 block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2406 get_block_t get_block) 2414 get_block_t get_block)
2407 { 2415 {
2408 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 2416 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2409 unsigned long end; 2417 unsigned long end;
2410 loff_t size; 2418 loff_t size;
2411 int ret = -EINVAL; 2419 int ret = -EINVAL;
2412 2420
2413 lock_page(page); 2421 lock_page(page);
2414 size = i_size_read(inode); 2422 size = i_size_read(inode);
2415 if ((page->mapping != inode->i_mapping) || 2423 if ((page->mapping != inode->i_mapping) ||
2416 (page_offset(page) > size)) { 2424 (page_offset(page) > size)) {
2417 /* page got truncated out from underneath us */ 2425 /* page got truncated out from underneath us */
2418 goto out_unlock; 2426 goto out_unlock;
2419 } 2427 }
2420 2428
2421 /* page is wholly or partially inside EOF */ 2429 /* page is wholly or partially inside EOF */
2422 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size) 2430 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2423 end = size & ~PAGE_CACHE_MASK; 2431 end = size & ~PAGE_CACHE_MASK;
2424 else 2432 else
2425 end = PAGE_CACHE_SIZE; 2433 end = PAGE_CACHE_SIZE;
2426 2434
2427 ret = block_prepare_write(page, 0, end, get_block); 2435 ret = block_prepare_write(page, 0, end, get_block);
2428 if (!ret) 2436 if (!ret)
2429 ret = block_commit_write(page, 0, end); 2437 ret = block_commit_write(page, 0, end);
2430 2438
2431 out_unlock: 2439 out_unlock:
2432 unlock_page(page); 2440 unlock_page(page);
2433 return ret; 2441 return ret;
2434 } 2442 }
2435 2443
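The expected caller is a filesystem's mmap path: the ->page_mkwrite handler of its vm_operations passes the faulting page here together with its get_block callback. A sketch under the 2.6.2x vm_operations layout, with foo_get_block() hypothetical:

/* hypothetical wiring of block_page_mkwrite() into a file's vm_ops */
static int foo_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	return block_page_mkwrite(vma, page, foo_get_block);
}

static struct vm_operations_struct foo_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= foo_page_mkwrite,
};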
2436 /* 2444 /*
2437 * nobh_write_begin()'s prereads are special: the buffer_heads are freed 2445 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2438 * immediately, while under the page lock. So it needs a special end_io 2446 * immediately, while under the page lock. So it needs a special end_io
2439 * handler which does not touch the bh after unlocking it. 2447 * handler which does not touch the bh after unlocking it.
2440 */ 2448 */
2441 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) 2449 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2442 { 2450 {
2443 __end_buffer_read_notouch(bh, uptodate); 2451 __end_buffer_read_notouch(bh, uptodate);
2444 } 2452 }
2445 2453
2446 /* 2454 /*
2447 * Attach the singly-linked list of buffers created by nobh_write_begin, to 2455 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2448 * the page (converting it to circular linked list and taking care of page 2456 * the page (converting it to circular linked list and taking care of page
2449 * dirty races). 2457 * dirty races).
2450 */ 2458 */
2451 static void attach_nobh_buffers(struct page *page, struct buffer_head *head) 2459 static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2452 { 2460 {
2453 struct buffer_head *bh; 2461 struct buffer_head *bh;
2454 2462
2455 BUG_ON(!PageLocked(page)); 2463 BUG_ON(!PageLocked(page));
2456 2464
2457 spin_lock(&page->mapping->private_lock); 2465 spin_lock(&page->mapping->private_lock);
2458 bh = head; 2466 bh = head;
2459 do { 2467 do {
2460 if (PageDirty(page)) 2468 if (PageDirty(page))
2461 set_buffer_dirty(bh); 2469 set_buffer_dirty(bh);
2462 if (!bh->b_this_page) 2470 if (!bh->b_this_page)
2463 bh->b_this_page = head; 2471 bh->b_this_page = head;
2464 bh = bh->b_this_page; 2472 bh = bh->b_this_page;
2465 } while (bh != head); 2473 } while (bh != head);
2466 attach_page_buffers(page, head); 2474 attach_page_buffers(page, head);
2467 spin_unlock(&page->mapping->private_lock); 2475 spin_unlock(&page->mapping->private_lock);
2468 } 2476 }
2469 2477
2470 /* 2478 /*
2471 * On entry, the page is fully not uptodate. 2479 * On entry, the page is fully not uptodate.
2472 * On exit the page is fully uptodate in the areas outside (from,to) 2480 * On exit the page is fully uptodate in the areas outside (from,to)
2473 */ 2481 */
2474 int nobh_write_begin(struct file *file, struct address_space *mapping, 2482 int nobh_write_begin(struct file *file, struct address_space *mapping,
2475 loff_t pos, unsigned len, unsigned flags, 2483 loff_t pos, unsigned len, unsigned flags,
2476 struct page **pagep, void **fsdata, 2484 struct page **pagep, void **fsdata,
2477 get_block_t *get_block) 2485 get_block_t *get_block)
2478 { 2486 {
2479 struct inode *inode = mapping->host; 2487 struct inode *inode = mapping->host;
2480 const unsigned blkbits = inode->i_blkbits; 2488 const unsigned blkbits = inode->i_blkbits;
2481 const unsigned blocksize = 1 << blkbits; 2489 const unsigned blocksize = 1 << blkbits;
2482 struct buffer_head *head, *bh; 2490 struct buffer_head *head, *bh;
2483 struct page *page; 2491 struct page *page;
2484 pgoff_t index; 2492 pgoff_t index;
2485 unsigned from, to; 2493 unsigned from, to;
2486 unsigned block_in_page; 2494 unsigned block_in_page;
2487 unsigned block_start, block_end; 2495 unsigned block_start, block_end;
2488 sector_t block_in_file; 2496 sector_t block_in_file;
2489 int nr_reads = 0; 2497 int nr_reads = 0;
2490 int ret = 0; 2498 int ret = 0;
2491 int is_mapped_to_disk = 1; 2499 int is_mapped_to_disk = 1;
2492 2500
2493 index = pos >> PAGE_CACHE_SHIFT; 2501 index = pos >> PAGE_CACHE_SHIFT;
2494 from = pos & (PAGE_CACHE_SIZE - 1); 2502 from = pos & (PAGE_CACHE_SIZE - 1);
2495 to = from + len; 2503 to = from + len;
2496 2504
2497 page = __grab_cache_page(mapping, index); 2505 page = __grab_cache_page(mapping, index);
2498 if (!page) 2506 if (!page)
2499 return -ENOMEM; 2507 return -ENOMEM;
2500 *pagep = page; 2508 *pagep = page;
2501 *fsdata = NULL; 2509 *fsdata = NULL;
2502 2510
2503 if (page_has_buffers(page)) { 2511 if (page_has_buffers(page)) {
2504 unlock_page(page); 2512 unlock_page(page);
2505 page_cache_release(page); 2513 page_cache_release(page);
2506 *pagep = NULL; 2514 *pagep = NULL;
2507 return block_write_begin(file, mapping, pos, len, flags, pagep, 2515 return block_write_begin(file, mapping, pos, len, flags, pagep,
2508 fsdata, get_block); 2516 fsdata, get_block);
2509 } 2517 }
2510 2518
2511 if (PageMappedToDisk(page)) 2519 if (PageMappedToDisk(page))
2512 return 0; 2520 return 0;
2513 2521
2514 /* 2522 /*
2515 * Allocate buffers so that we can keep track of state, and potentially 2523 * Allocate buffers so that we can keep track of state, and potentially
2516 * attach them to the page if an error occurs. In the common case of 2524 * attach them to the page if an error occurs. In the common case of
2517 * no error, they will just be freed again without ever being attached 2525 * no error, they will just be freed again without ever being attached
2518 * to the page (which is all OK, because we're under the page lock). 2526 * to the page (which is all OK, because we're under the page lock).
2519 * 2527 *
2520 * Be careful: the buffer linked list is a NULL terminated one, rather 2528 * Be careful: the buffer linked list is a NULL terminated one, rather
2521 * than the circular one we're used to. 2529 * than the circular one we're used to.
2522 */ 2530 */
2523 head = alloc_page_buffers(page, blocksize, 0); 2531 head = alloc_page_buffers(page, blocksize, 0);
2524 if (!head) { 2532 if (!head) {
2525 ret = -ENOMEM; 2533 ret = -ENOMEM;
2526 goto out_release; 2534 goto out_release;
2527 } 2535 }
2528 2536
2529 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); 2537 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2530 2538
2531 /* 2539 /*
2532 * We loop across all blocks in the page, whether or not they are 2540 * We loop across all blocks in the page, whether or not they are
2533 * part of the affected region. This is so we can discover if the 2541 * part of the affected region. This is so we can discover if the
2534 * page is fully mapped-to-disk. 2542 * page is fully mapped-to-disk.
2535 */ 2543 */
2536 for (block_start = 0, block_in_page = 0, bh = head; 2544 for (block_start = 0, block_in_page = 0, bh = head;
2537 block_start < PAGE_CACHE_SIZE; 2545 block_start < PAGE_CACHE_SIZE;
2538 block_in_page++, block_start += blocksize, bh = bh->b_this_page) { 2546 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2539 int create; 2547 int create;
2540 2548
2541 block_end = block_start + blocksize; 2549 block_end = block_start + blocksize;
2542 bh->b_state = 0; 2550 bh->b_state = 0;
2543 create = 1; 2551 create = 1;
2544 if (block_start >= to) 2552 if (block_start >= to)
2545 create = 0; 2553 create = 0;
2546 ret = get_block(inode, block_in_file + block_in_page, 2554 ret = get_block(inode, block_in_file + block_in_page,
2547 bh, create); 2555 bh, create);
2548 if (ret) 2556 if (ret)
2549 goto failed; 2557 goto failed;
2550 if (!buffer_mapped(bh)) 2558 if (!buffer_mapped(bh))
2551 is_mapped_to_disk = 0; 2559 is_mapped_to_disk = 0;
2552 if (buffer_new(bh)) 2560 if (buffer_new(bh))
2553 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 2561 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2554 if (PageUptodate(page)) { 2562 if (PageUptodate(page)) {
2555 set_buffer_uptodate(bh); 2563 set_buffer_uptodate(bh);
2556 continue; 2564 continue;
2557 } 2565 }
2558 if (buffer_new(bh) || !buffer_mapped(bh)) { 2566 if (buffer_new(bh) || !buffer_mapped(bh)) {
2559 zero_user_segments(page, block_start, from, 2567 zero_user_segments(page, block_start, from,
2560 to, block_end); 2568 to, block_end);
2561 continue; 2569 continue;
2562 } 2570 }
2563 if (buffer_uptodate(bh)) 2571 if (buffer_uptodate(bh))
2564 continue; /* reiserfs does this */ 2572 continue; /* reiserfs does this */
2565 if (block_start < from || block_end > to) { 2573 if (block_start < from || block_end > to) {
2566 lock_buffer(bh); 2574 lock_buffer(bh);
2567 bh->b_end_io = end_buffer_read_nobh; 2575 bh->b_end_io = end_buffer_read_nobh;
2568 submit_bh(READ, bh); 2576 submit_bh(READ, bh);
2569 nr_reads++; 2577 nr_reads++;
2570 } 2578 }
2571 } 2579 }
2572 2580
2573 if (nr_reads) { 2581 if (nr_reads) {
2574 /* 2582 /*
2575 * The page is locked, so these buffers are protected from 2583 * The page is locked, so these buffers are protected from
2576 * any VM or truncate activity. Hence we don't need to care 2584 * any VM or truncate activity. Hence we don't need to care
2577 * for the buffer_head refcounts. 2585 * for the buffer_head refcounts.
2578 */ 2586 */
2579 for (bh = head; bh; bh = bh->b_this_page) { 2587 for (bh = head; bh; bh = bh->b_this_page) {
2580 wait_on_buffer(bh); 2588 wait_on_buffer(bh);
2581 if (!buffer_uptodate(bh)) 2589 if (!buffer_uptodate(bh))
2582 ret = -EIO; 2590 ret = -EIO;
2583 } 2591 }
2584 if (ret) 2592 if (ret)
2585 goto failed; 2593 goto failed;
2586 } 2594 }
2587 2595
2588 if (is_mapped_to_disk) 2596 if (is_mapped_to_disk)
2589 SetPageMappedToDisk(page); 2597 SetPageMappedToDisk(page);
2590 2598
2591 *fsdata = head; /* to be released by nobh_write_end */ 2599 *fsdata = head; /* to be released by nobh_write_end */
2592 2600
2593 return 0; 2601 return 0;
2594 2602
2595 failed: 2603 failed:
2596 BUG_ON(!ret); 2604 BUG_ON(!ret);
2597 /* 2605 /*
2598 * Error recovery is a bit difficult. We need to zero out blocks that 2606 * Error recovery is a bit difficult. We need to zero out blocks that
2599 * were newly allocated, and dirty them to ensure they get written out. 2607 * were newly allocated, and dirty them to ensure they get written out.
2600 * Buffers need to be attached to the page at this point, otherwise 2608 * Buffers need to be attached to the page at this point, otherwise
2601 * the handling of potential IO errors during writeout would be hard 2609 * the handling of potential IO errors during writeout would be hard
2602 * (could try doing synchronous writeout, but what if that fails too?) 2610 * (could try doing synchronous writeout, but what if that fails too?)
2603 */ 2611 */
2604 attach_nobh_buffers(page, head); 2612 attach_nobh_buffers(page, head);
2605 page_zero_new_buffers(page, from, to); 2613 page_zero_new_buffers(page, from, to);
2606 2614
2607 out_release: 2615 out_release:
2608 unlock_page(page); 2616 unlock_page(page);
2609 page_cache_release(page); 2617 page_cache_release(page);
2610 *pagep = NULL; 2618 *pagep = NULL;
2611 2619
2612 if (pos + len > inode->i_size) 2620 if (pos + len > inode->i_size)
2613 vmtruncate(inode, inode->i_size); 2621 vmtruncate(inode, inode->i_size);
2614 2622
2615 return ret; 2623 return ret;
2616 } 2624 }
2617 EXPORT_SYMBOL(nobh_write_begin); 2625 EXPORT_SYMBOL(nobh_write_begin);
2618 2626
2619 int nobh_write_end(struct file *file, struct address_space *mapping, 2627 int nobh_write_end(struct file *file, struct address_space *mapping,
2620 loff_t pos, unsigned len, unsigned copied, 2628 loff_t pos, unsigned len, unsigned copied,
2621 struct page *page, void *fsdata) 2629 struct page *page, void *fsdata)
2622 { 2630 {
2623 struct inode *inode = page->mapping->host; 2631 struct inode *inode = page->mapping->host;
2624 struct buffer_head *head = fsdata; 2632 struct buffer_head *head = fsdata;
2625 struct buffer_head *bh; 2633 struct buffer_head *bh;
2626 BUG_ON(fsdata != NULL && page_has_buffers(page)); 2634 BUG_ON(fsdata != NULL && page_has_buffers(page));
2627 2635
2628 if (unlikely(copied < len) && !page_has_buffers(page)) 2636 if (unlikely(copied < len) && !page_has_buffers(page))
2629 attach_nobh_buffers(page, head); 2637 attach_nobh_buffers(page, head);
2630 if (page_has_buffers(page)) 2638 if (page_has_buffers(page))
2631 return generic_write_end(file, mapping, pos, len, 2639 return generic_write_end(file, mapping, pos, len,
2632 copied, page, fsdata); 2640 copied, page, fsdata);
2633 2641
2634 SetPageUptodate(page); 2642 SetPageUptodate(page);
2635 set_page_dirty(page); 2643 set_page_dirty(page);
2636 if (pos+copied > inode->i_size) { 2644 if (pos+copied > inode->i_size) {
2637 i_size_write(inode, pos+copied); 2645 i_size_write(inode, pos+copied);
2638 mark_inode_dirty(inode); 2646 mark_inode_dirty(inode);
2639 } 2647 }
2640 2648
2641 unlock_page(page); 2649 unlock_page(page);
2642 page_cache_release(page); 2650 page_cache_release(page);
2643 2651
2644 while (head) { 2652 while (head) {
2645 bh = head; 2653 bh = head;
2646 head = head->b_this_page; 2654 head = head->b_this_page;
2647 free_buffer_head(bh); 2655 free_buffer_head(bh);
2648 } 2656 }
2649 2657
2650 return copied; 2658 return copied;
2651 } 2659 }
2652 EXPORT_SYMBOL(nobh_write_end); 2660 EXPORT_SYMBOL(nobh_write_end);
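
A minimal sketch (editor's illustration, not part of this patch) of how a filesystem typically wires the nobh helpers above into its address_space_operations. The myfs_ names and myfs_get_block are hypothetical; the signatures follow the definitions shown in this hunk.

#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>

/* hypothetical filesystem block-mapping callback (a get_block_t) */
extern int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create);

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	/* delegate the buffer-less write preparation to the generic helper */
	return nobh_write_begin(file, mapping, pos, len, flags, pagep,
				fsdata, myfs_get_block);
}

static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return nobh_writepage(page, myfs_get_block, wbc);
}

static const struct address_space_operations myfs_aops = {
	.write_begin	= myfs_write_begin,
	.write_end	= nobh_write_end,	/* matches the ->write_end signature directly */
	.writepage	= myfs_writepage,
};
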
2653 2661
2654 /* 2662 /*
2655 * nobh_writepage() - based on block_write_full_page() except 2663 * nobh_writepage() - based on block_write_full_page() except
2656 * that it tries to operate without attaching bufferheads to 2664 * that it tries to operate without attaching bufferheads to
2657 * the page. 2665 * the page.
2658 */ 2666 */
2659 int nobh_writepage(struct page *page, get_block_t *get_block, 2667 int nobh_writepage(struct page *page, get_block_t *get_block,
2660 struct writeback_control *wbc) 2668 struct writeback_control *wbc)
2661 { 2669 {
2662 struct inode * const inode = page->mapping->host; 2670 struct inode * const inode = page->mapping->host;
2663 loff_t i_size = i_size_read(inode); 2671 loff_t i_size = i_size_read(inode);
2664 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 2672 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2665 unsigned offset; 2673 unsigned offset;
2666 int ret; 2674 int ret;
2667 2675
2668 /* Is the page fully inside i_size? */ 2676 /* Is the page fully inside i_size? */
2669 if (page->index < end_index) 2677 if (page->index < end_index)
2670 goto out; 2678 goto out;
2671 2679
2672 /* Is the page fully outside i_size? (truncate in progress) */ 2680 /* Is the page fully outside i_size? (truncate in progress) */
2673 offset = i_size & (PAGE_CACHE_SIZE-1); 2681 offset = i_size & (PAGE_CACHE_SIZE-1);
2674 if (page->index >= end_index+1 || !offset) { 2682 if (page->index >= end_index+1 || !offset) {
2675 /* 2683 /*
2676 * The page may have dirty, unmapped buffers. For example, 2684 * The page may have dirty, unmapped buffers. For example,
2677 * they may have been added in ext3_writepage(). Make them 2685 * they may have been added in ext3_writepage(). Make them
2678 * freeable here, so the page does not leak. 2686 * freeable here, so the page does not leak.
2679 */ 2687 */
2680 #if 0 2688 #if 0
2681 /* Not really sure about this - do we need this ? */ 2689 /* Not really sure about this - do we need this ? */
2682 if (page->mapping->a_ops->invalidatepage) 2690 if (page->mapping->a_ops->invalidatepage)
2683 page->mapping->a_ops->invalidatepage(page, offset); 2691 page->mapping->a_ops->invalidatepage(page, offset);
2684 #endif 2692 #endif
2685 unlock_page(page); 2693 unlock_page(page);
2686 return 0; /* don't care */ 2694 return 0; /* don't care */
2687 } 2695 }
2688 2696
2689 /* 2697 /*
2690 * The page straddles i_size. It must be zeroed out on each and every 2698 * The page straddles i_size. It must be zeroed out on each and every
2691 * writepage invocation because it may be mmapped. "A file is mapped 2699 * writepage invocation because it may be mmapped. "A file is mapped
2692 * in multiples of the page size. For a file that is not a multiple of 2700 * in multiples of the page size. For a file that is not a multiple of
2693 * the page size, the remaining memory is zeroed when mapped, and 2701 * the page size, the remaining memory is zeroed when mapped, and
2694 * writes to that region are not written out to the file." 2702 * writes to that region are not written out to the file."
2695 */ 2703 */
2696 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2704 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2697 out: 2705 out:
2698 ret = mpage_writepage(page, get_block, wbc); 2706 ret = mpage_writepage(page, get_block, wbc);
2699 if (ret == -EAGAIN) 2707 if (ret == -EAGAIN)
2700 ret = __block_write_full_page(inode, page, get_block, wbc); 2708 ret = __block_write_full_page(inode, page, get_block, wbc);
2701 return ret; 2709 return ret;
2702 } 2710 }
2703 EXPORT_SYMBOL(nobh_writepage); 2711 EXPORT_SYMBOL(nobh_writepage);
2704 2712
2705 int nobh_truncate_page(struct address_space *mapping, 2713 int nobh_truncate_page(struct address_space *mapping,
2706 loff_t from, get_block_t *get_block) 2714 loff_t from, get_block_t *get_block)
2707 { 2715 {
2708 pgoff_t index = from >> PAGE_CACHE_SHIFT; 2716 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2709 unsigned offset = from & (PAGE_CACHE_SIZE-1); 2717 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2710 unsigned blocksize; 2718 unsigned blocksize;
2711 sector_t iblock; 2719 sector_t iblock;
2712 unsigned length, pos; 2720 unsigned length, pos;
2713 struct inode *inode = mapping->host; 2721 struct inode *inode = mapping->host;
2714 struct page *page; 2722 struct page *page;
2715 struct buffer_head map_bh; 2723 struct buffer_head map_bh;
2716 int err; 2724 int err;
2717 2725
2718 blocksize = 1 << inode->i_blkbits; 2726 blocksize = 1 << inode->i_blkbits;
2719 length = offset & (blocksize - 1); 2727 length = offset & (blocksize - 1);
2720 2728
2721 /* Block boundary? Nothing to do */ 2729 /* Block boundary? Nothing to do */
2722 if (!length) 2730 if (!length)
2723 return 0; 2731 return 0;
2724 2732
2725 length = blocksize - length; 2733 length = blocksize - length;
2726 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2734 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2727 2735
2728 page = grab_cache_page(mapping, index); 2736 page = grab_cache_page(mapping, index);
2729 err = -ENOMEM; 2737 err = -ENOMEM;
2730 if (!page) 2738 if (!page)
2731 goto out; 2739 goto out;
2732 2740
2733 if (page_has_buffers(page)) { 2741 if (page_has_buffers(page)) {
2734 has_buffers: 2742 has_buffers:
2735 unlock_page(page); 2743 unlock_page(page);
2736 page_cache_release(page); 2744 page_cache_release(page);
2737 return block_truncate_page(mapping, from, get_block); 2745 return block_truncate_page(mapping, from, get_block);
2738 } 2746 }
2739 2747
2740 /* Find the buffer that contains "offset" */ 2748 /* Find the buffer that contains "offset" */
2741 pos = blocksize; 2749 pos = blocksize;
2742 while (offset >= pos) { 2750 while (offset >= pos) {
2743 iblock++; 2751 iblock++;
2744 pos += blocksize; 2752 pos += blocksize;
2745 } 2753 }
2746 2754
2747 err = get_block(inode, iblock, &map_bh, 0); 2755 err = get_block(inode, iblock, &map_bh, 0);
2748 if (err) 2756 if (err)
2749 goto unlock; 2757 goto unlock;
2750 /* unmapped? It's a hole - nothing to do */ 2758 /* unmapped? It's a hole - nothing to do */
2751 if (!buffer_mapped(&map_bh)) 2759 if (!buffer_mapped(&map_bh))
2752 goto unlock; 2760 goto unlock;
2753 2761
2754 /* Ok, it's mapped. Make sure it's up-to-date */ 2762 /* Ok, it's mapped. Make sure it's up-to-date */
2755 if (!PageUptodate(page)) { 2763 if (!PageUptodate(page)) {
2756 err = mapping->a_ops->readpage(NULL, page); 2764 err = mapping->a_ops->readpage(NULL, page);
2757 if (err) { 2765 if (err) {
2758 page_cache_release(page); 2766 page_cache_release(page);
2759 goto out; 2767 goto out;
2760 } 2768 }
2761 lock_page(page); 2769 lock_page(page);
2762 if (!PageUptodate(page)) { 2770 if (!PageUptodate(page)) {
2763 err = -EIO; 2771 err = -EIO;
2764 goto unlock; 2772 goto unlock;
2765 } 2773 }
2766 if (page_has_buffers(page)) 2774 if (page_has_buffers(page))
2767 goto has_buffers; 2775 goto has_buffers;
2768 } 2776 }
2769 zero_user(page, offset, length); 2777 zero_user(page, offset, length);
2770 set_page_dirty(page); 2778 set_page_dirty(page);
2771 err = 0; 2779 err = 0;
2772 2780
2773 unlock: 2781 unlock:
2774 unlock_page(page); 2782 unlock_page(page);
2775 page_cache_release(page); 2783 page_cache_release(page);
2776 out: 2784 out:
2777 return err; 2785 return err;
2778 } 2786 }
2779 EXPORT_SYMBOL(nobh_truncate_page); 2787 EXPORT_SYMBOL(nobh_truncate_page);
2780 2788
2781 int block_truncate_page(struct address_space *mapping, 2789 int block_truncate_page(struct address_space *mapping,
2782 loff_t from, get_block_t *get_block) 2790 loff_t from, get_block_t *get_block)
2783 { 2791 {
2784 pgoff_t index = from >> PAGE_CACHE_SHIFT; 2792 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2785 unsigned offset = from & (PAGE_CACHE_SIZE-1); 2793 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2786 unsigned blocksize; 2794 unsigned blocksize;
2787 sector_t iblock; 2795 sector_t iblock;
2788 unsigned length, pos; 2796 unsigned length, pos;
2789 struct inode *inode = mapping->host; 2797 struct inode *inode = mapping->host;
2790 struct page *page; 2798 struct page *page;
2791 struct buffer_head *bh; 2799 struct buffer_head *bh;
2792 int err; 2800 int err;
2793 2801
2794 blocksize = 1 << inode->i_blkbits; 2802 blocksize = 1 << inode->i_blkbits;
2795 length = offset & (blocksize - 1); 2803 length = offset & (blocksize - 1);
2796 2804
2797 /* Block boundary? Nothing to do */ 2805 /* Block boundary? Nothing to do */
2798 if (!length) 2806 if (!length)
2799 return 0; 2807 return 0;
2800 2808
2801 length = blocksize - length; 2809 length = blocksize - length;
2802 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2810 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2803 2811
2804 page = grab_cache_page(mapping, index); 2812 page = grab_cache_page(mapping, index);
2805 err = -ENOMEM; 2813 err = -ENOMEM;
2806 if (!page) 2814 if (!page)
2807 goto out; 2815 goto out;
2808 2816
2809 if (!page_has_buffers(page)) 2817 if (!page_has_buffers(page))
2810 create_empty_buffers(page, blocksize, 0); 2818 create_empty_buffers(page, blocksize, 0);
2811 2819
2812 /* Find the buffer that contains "offset" */ 2820 /* Find the buffer that contains "offset" */
2813 bh = page_buffers(page); 2821 bh = page_buffers(page);
2814 pos = blocksize; 2822 pos = blocksize;
2815 while (offset >= pos) { 2823 while (offset >= pos) {
2816 bh = bh->b_this_page; 2824 bh = bh->b_this_page;
2817 iblock++; 2825 iblock++;
2818 pos += blocksize; 2826 pos += blocksize;
2819 } 2827 }
2820 2828
2821 err = 0; 2829 err = 0;
2822 if (!buffer_mapped(bh)) { 2830 if (!buffer_mapped(bh)) {
2823 WARN_ON(bh->b_size != blocksize); 2831 WARN_ON(bh->b_size != blocksize);
2824 err = get_block(inode, iblock, bh, 0); 2832 err = get_block(inode, iblock, bh, 0);
2825 if (err) 2833 if (err)
2826 goto unlock; 2834 goto unlock;
2827 /* unmapped? It's a hole - nothing to do */ 2835 /* unmapped? It's a hole - nothing to do */
2828 if (!buffer_mapped(bh)) 2836 if (!buffer_mapped(bh))
2829 goto unlock; 2837 goto unlock;
2830 } 2838 }
2831 2839
2832 /* Ok, it's mapped. Make sure it's up-to-date */ 2840 /* Ok, it's mapped. Make sure it's up-to-date */
2833 if (PageUptodate(page)) 2841 if (PageUptodate(page))
2834 set_buffer_uptodate(bh); 2842 set_buffer_uptodate(bh);
2835 2843
2836 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) { 2844 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2837 err = -EIO; 2845 err = -EIO;
2838 ll_rw_block(READ, 1, &bh); 2846 ll_rw_block(READ, 1, &bh);
2839 wait_on_buffer(bh); 2847 wait_on_buffer(bh);
2840 /* Uhhuh. Read error. Complain and punt. */ 2848 /* Uhhuh. Read error. Complain and punt. */
2841 if (!buffer_uptodate(bh)) 2849 if (!buffer_uptodate(bh))
2842 goto unlock; 2850 goto unlock;
2843 } 2851 }
2844 2852
2845 zero_user(page, offset, length); 2853 zero_user(page, offset, length);
2846 mark_buffer_dirty(bh); 2854 mark_buffer_dirty(bh);
2847 err = 0; 2855 err = 0;
2848 2856
2849 unlock: 2857 unlock:
2850 unlock_page(page); 2858 unlock_page(page);
2851 page_cache_release(page); 2859 page_cache_release(page);
2852 out: 2860 out:
2853 return err; 2861 return err;
2854 } 2862 }
2855 2863
2856 /* 2864 /*
2857 * The generic ->writepage function for buffer-backed address_spaces 2865 * The generic ->writepage function for buffer-backed address_spaces
2858 */ 2866 */
2859 int block_write_full_page(struct page *page, get_block_t *get_block, 2867 int block_write_full_page(struct page *page, get_block_t *get_block,
2860 struct writeback_control *wbc) 2868 struct writeback_control *wbc)
2861 { 2869 {
2862 struct inode * const inode = page->mapping->host; 2870 struct inode * const inode = page->mapping->host;
2863 loff_t i_size = i_size_read(inode); 2871 loff_t i_size = i_size_read(inode);
2864 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 2872 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2865 unsigned offset; 2873 unsigned offset;
2866 2874
2867 /* Is the page fully inside i_size? */ 2875 /* Is the page fully inside i_size? */
2868 if (page->index < end_index) 2876 if (page->index < end_index)
2869 return __block_write_full_page(inode, page, get_block, wbc); 2877 return __block_write_full_page(inode, page, get_block, wbc);
2870 2878
2871 /* Is the page fully outside i_size? (truncate in progress) */ 2879 /* Is the page fully outside i_size? (truncate in progress) */
2872 offset = i_size & (PAGE_CACHE_SIZE-1); 2880 offset = i_size & (PAGE_CACHE_SIZE-1);
2873 if (page->index >= end_index+1 || !offset) { 2881 if (page->index >= end_index+1 || !offset) {
2874 /* 2882 /*
2875 * The page may have dirty, unmapped buffers. For example, 2883 * The page may have dirty, unmapped buffers. For example,
2876 * they may have been added in ext3_writepage(). Make them 2884 * they may have been added in ext3_writepage(). Make them
2877 * freeable here, so the page does not leak. 2885 * freeable here, so the page does not leak.
2878 */ 2886 */
2879 do_invalidatepage(page, 0); 2887 do_invalidatepage(page, 0);
2880 unlock_page(page); 2888 unlock_page(page);
2881 return 0; /* don't care */ 2889 return 0; /* don't care */
2882 } 2890 }
2883 2891
2884 /* 2892 /*
2885 * The page straddles i_size. It must be zeroed out on each and every 2893 * The page straddles i_size. It must be zeroed out on each and every
2886 * writepage invocation because it may be mmapped. "A file is mapped 2894 * writepage invocation because it may be mmapped. "A file is mapped
2887 * in multiples of the page size. For a file that is not a multiple of 2895 * in multiples of the page size. For a file that is not a multiple of
2888 * the page size, the remaining memory is zeroed when mapped, and 2896 * the page size, the remaining memory is zeroed when mapped, and
2889 * writes to that region are not written out to the file." 2897 * writes to that region are not written out to the file."
2890 */ 2898 */
2891 zero_user_segment(page, offset, PAGE_CACHE_SIZE); 2899 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2892 return __block_write_full_page(inode, page, get_block, wbc); 2900 return __block_write_full_page(inode, page, get_block, wbc);
2893 } 2901 }
2894 2902
2895 sector_t generic_block_bmap(struct address_space *mapping, sector_t block, 2903 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2896 get_block_t *get_block) 2904 get_block_t *get_block)
2897 { 2905 {
2898 struct buffer_head tmp; 2906 struct buffer_head tmp;
2899 struct inode *inode = mapping->host; 2907 struct inode *inode = mapping->host;
2900 tmp.b_state = 0; 2908 tmp.b_state = 0;
2901 tmp.b_blocknr = 0; 2909 tmp.b_blocknr = 0;
2902 tmp.b_size = 1 << inode->i_blkbits; 2910 tmp.b_size = 1 << inode->i_blkbits;
2903 get_block(inode, block, &tmp, 0); 2911 get_block(inode, block, &tmp, 0);
2904 return tmp.b_blocknr; 2912 return tmp.b_blocknr;
2905 } 2913 }
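
A minimal usage sketch (editor's note, not part of the patch): generic_block_bmap() is normally exposed through a one-line wrapper installed as the ->bmap address_space operation. myfs_get_block is again a hypothetical get_block_t callback.

static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	/* hypothetical wrapper around the generic helper defined above */
	return generic_block_bmap(mapping, block, myfs_get_block);
}
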
2906 2914
2907 static void end_bio_bh_io_sync(struct bio *bio, int err) 2915 static void end_bio_bh_io_sync(struct bio *bio, int err)
2908 { 2916 {
2909 struct buffer_head *bh = bio->bi_private; 2917 struct buffer_head *bh = bio->bi_private;
2910 2918
2911 if (err == -EOPNOTSUPP) { 2919 if (err == -EOPNOTSUPP) {
2912 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 2920 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2913 set_bit(BH_Eopnotsupp, &bh->b_state); 2921 set_bit(BH_Eopnotsupp, &bh->b_state);
2914 } 2922 }
2923
2924 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2925 set_bit(BH_Quiet, &bh->b_state);
2915 2926
2916 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags)); 2927 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2917 bio_put(bio); 2928 bio_put(bio);
2918 } 2929 }
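
The BH_Quiet bit propagated in end_bio_bh_io_sync() above is what a buffer-head completion handler can test before logging. A hedged sketch (editor's illustration; this helper is hypothetical and not part of this hunk):

static void myfs_end_buffer_read(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		/* stay quiet when the lower layers asked for it via BH_Quiet */
		if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
			printk(KERN_WARNING "myfs: read error, logical block %llu\n",
			       (unsigned long long)bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}
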
2919 2930
2920 int submit_bh(int rw, struct buffer_head * bh) 2931 int submit_bh(int rw, struct buffer_head * bh)
2921 { 2932 {
2922 struct bio *bio; 2933 struct bio *bio;
2923 int ret = 0; 2934 int ret = 0;
2924 2935
2925 BUG_ON(!buffer_locked(bh)); 2936 BUG_ON(!buffer_locked(bh));
2926 BUG_ON(!buffer_mapped(bh)); 2937 BUG_ON(!buffer_mapped(bh));
2927 BUG_ON(!bh->b_end_io); 2938 BUG_ON(!bh->b_end_io);
2928 2939
2929 /* 2940 /*
2930 * Mask in barrier bit for a write (could be either a WRITE or a 2941 * Mask in barrier bit for a write (could be either a WRITE or a
2931 * WRITE_SYNC 2942 * WRITE_SYNC
2932 */ 2943 */
2933 if (buffer_ordered(bh) && (rw & WRITE)) 2944 if (buffer_ordered(bh) && (rw & WRITE))
2934 rw |= WRITE_BARRIER; 2945 rw |= WRITE_BARRIER;
2935 2946
2936 /* 2947 /*
2937 * Only clear out a write error when rewriting 2948 * Only clear out a write error when rewriting
2938 */ 2949 */
2939 if (test_set_buffer_req(bh) && (rw & WRITE)) 2950 if (test_set_buffer_req(bh) && (rw & WRITE))
2940 clear_buffer_write_io_error(bh); 2951 clear_buffer_write_io_error(bh);
2941 2952
2942 /* 2953 /*
2943 * from here on down, it's all bio -- do the initial mapping, 2954 * from here on down, it's all bio -- do the initial mapping,
2944 * submit_bio -> generic_make_request may further map this bio around 2955 * submit_bio -> generic_make_request may further map this bio around
2945 */ 2956 */
2946 bio = bio_alloc(GFP_NOIO, 1); 2957 bio = bio_alloc(GFP_NOIO, 1);
2947 2958
2948 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 2959 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2949 bio->bi_bdev = bh->b_bdev; 2960 bio->bi_bdev = bh->b_bdev;
2950 bio->bi_io_vec[0].bv_page = bh->b_page; 2961 bio->bi_io_vec[0].bv_page = bh->b_page;
2951 bio->bi_io_vec[0].bv_len = bh->b_size; 2962 bio->bi_io_vec[0].bv_len = bh->b_size;
2952 bio->bi_io_vec[0].bv_offset = bh_offset(bh); 2963 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2953 2964
2954 bio->bi_vcnt = 1; 2965 bio->bi_vcnt = 1;
2955 bio->bi_idx = 0; 2966 bio->bi_idx = 0;
2956 bio->bi_size = bh->b_size; 2967 bio->bi_size = bh->b_size;
2957 2968
2958 bio->bi_end_io = end_bio_bh_io_sync; 2969 bio->bi_end_io = end_bio_bh_io_sync;
2959 bio->bi_private = bh; 2970 bio->bi_private = bh;
2960 2971
2961 bio_get(bio); 2972 bio_get(bio);
2962 submit_bio(rw, bio); 2973 submit_bio(rw, bio);
2963 2974
2964 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2975 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2965 ret = -EOPNOTSUPP; 2976 ret = -EOPNOTSUPP;
2966 2977
2967 bio_put(bio); 2978 bio_put(bio);
2968 return ret; 2979 return ret;
2969 } 2980 }
2970 2981
2971 /** 2982 /**
2972 * ll_rw_block: low-level access to block devices (DEPRECATED) 2983 * ll_rw_block: low-level access to block devices (DEPRECATED)
2973 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) 2984 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2974 * @nr: number of &struct buffer_heads in the array 2985 * @nr: number of &struct buffer_heads in the array
2975 * @bhs: array of pointers to &struct buffer_head 2986 * @bhs: array of pointers to &struct buffer_head
2976 * 2987 *
2977 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and 2988 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2978 * requests an I/O operation on them, either a %READ or a %WRITE. The third 2989 * requests an I/O operation on them, either a %READ or a %WRITE. The third
2979 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers 2990 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2980 * are sent to disk. The fourth %READA option is described in the documentation 2991 * are sent to disk. The fourth %READA option is described in the documentation
2981 * for generic_make_request() which ll_rw_block() calls. 2992 * for generic_make_request() which ll_rw_block() calls.
2982 * 2993 *
2983 * This function drops any buffer that it cannot get a lock on (with the 2994 * This function drops any buffer that it cannot get a lock on (with the
2984 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be 2995 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2985 * clean when doing a write request, and any buffer that appears to be 2996 * clean when doing a write request, and any buffer that appears to be
2986 * up-to-date when doing a read request. Further it marks as clean buffers that 2997 * up-to-date when doing a read request. Further it marks as clean buffers that
2987 * are processed for writing (the buffer cache won't assume that they are 2998 * are processed for writing (the buffer cache won't assume that they are
2988 * actually clean until the buffer gets unlocked). 2999 * actually clean until the buffer gets unlocked).
2989 * 3000 *
2990 * ll_rw_block sets b_end_io to simple completion handler that marks 3001 * ll_rw_block sets b_end_io to simple completion handler that marks
2991 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes 3002 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2992 * any waiters. 3003 * any waiters.
2993 * 3004 *
2994 * All of the buffers must be for the same device, and must also be a 3005 * All of the buffers must be for the same device, and must also be a
2995 * multiple of the current approved size for the device. 3006 * multiple of the current approved size for the device.
2996 */ 3007 */
2997 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) 3008 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2998 { 3009 {
2999 int i; 3010 int i;
3000 3011
3001 for (i = 0; i < nr; i++) { 3012 for (i = 0; i < nr; i++) {
3002 struct buffer_head *bh = bhs[i]; 3013 struct buffer_head *bh = bhs[i];
3003 3014
3004 if (rw == SWRITE || rw == SWRITE_SYNC) 3015 if (rw == SWRITE || rw == SWRITE_SYNC)
3005 lock_buffer(bh); 3016 lock_buffer(bh);
3006 else if (!trylock_buffer(bh)) 3017 else if (!trylock_buffer(bh))
3007 continue; 3018 continue;
3008 3019
3009 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) { 3020 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
3010 if (test_clear_buffer_dirty(bh)) { 3021 if (test_clear_buffer_dirty(bh)) {
3011 bh->b_end_io = end_buffer_write_sync; 3022 bh->b_end_io = end_buffer_write_sync;
3012 get_bh(bh); 3023 get_bh(bh);
3013 if (rw == SWRITE_SYNC) 3024 if (rw == SWRITE_SYNC)
3014 submit_bh(WRITE_SYNC, bh); 3025 submit_bh(WRITE_SYNC, bh);
3015 else 3026 else
3016 submit_bh(WRITE, bh); 3027 submit_bh(WRITE, bh);
3017 continue; 3028 continue;
3018 } 3029 }
3019 } else { 3030 } else {
3020 if (!buffer_uptodate(bh)) { 3031 if (!buffer_uptodate(bh)) {
3021 bh->b_end_io = end_buffer_read_sync; 3032 bh->b_end_io = end_buffer_read_sync;
3022 get_bh(bh); 3033 get_bh(bh);
3023 submit_bh(rw, bh); 3034 submit_bh(rw, bh);
3024 continue; 3035 continue;
3025 } 3036 }
3026 } 3037 }
3027 unlock_buffer(bh); 3038 unlock_buffer(bh);
3028 } 3039 }
3029 } 3040 }
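
A minimal usage sketch for ll_rw_block() (editor's illustration; the helper is hypothetical): start reads on a batch of buffer heads, then wait for and check each one.

static int myfs_read_bhs(struct buffer_head *bhs[], int nr)
{
	int i;

	ll_rw_block(READ, nr, bhs);		/* already-uptodate buffers are skipped */
	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			return -EIO;		/* the read failed on this buffer */
	}
	return 0;
}
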
3030 3041
3031 /* 3042 /*
3032 * For a data-integrity writeout, we need to wait upon any in-progress I/O 3043 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3033 * and then start new I/O and then wait upon it. The caller must have a ref on 3044 * and then start new I/O and then wait upon it. The caller must have a ref on
3034 * the buffer_head. 3045 * the buffer_head.
3035 */ 3046 */
3036 int sync_dirty_buffer(struct buffer_head *bh) 3047 int sync_dirty_buffer(struct buffer_head *bh)
3037 { 3048 {
3038 int ret = 0; 3049 int ret = 0;
3039 3050
3040 WARN_ON(atomic_read(&bh->b_count) < 1); 3051 WARN_ON(atomic_read(&bh->b_count) < 1);
3041 lock_buffer(bh); 3052 lock_buffer(bh);
3042 if (test_clear_buffer_dirty(bh)) { 3053 if (test_clear_buffer_dirty(bh)) {
3043 get_bh(bh); 3054 get_bh(bh);
3044 bh->b_end_io = end_buffer_write_sync; 3055 bh->b_end_io = end_buffer_write_sync;
3045 ret = submit_bh(WRITE_SYNC, bh); 3056 ret = submit_bh(WRITE_SYNC, bh);
3046 wait_on_buffer(bh); 3057 wait_on_buffer(bh);
3047 if (buffer_eopnotsupp(bh)) { 3058 if (buffer_eopnotsupp(bh)) {
3048 clear_buffer_eopnotsupp(bh); 3059 clear_buffer_eopnotsupp(bh);
3049 ret = -EOPNOTSUPP; 3060 ret = -EOPNOTSUPP;
3050 } 3061 }
3051 if (!ret && !buffer_uptodate(bh)) 3062 if (!ret && !buffer_uptodate(bh))
3052 ret = -EIO; 3063 ret = -EIO;
3053 } else { 3064 } else {
3054 unlock_buffer(bh); 3065 unlock_buffer(bh);
3055 } 3066 }
3056 return ret; 3067 return ret;
3057 } 3068 }
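
A minimal usage sketch for sync_dirty_buffer() (editor's illustration; the helper is hypothetical): dirty a metadata buffer and wait for it to reach disk.

static int myfs_sync_super(struct buffer_head *sb_bh)
{
	lock_buffer(sb_bh);
	/* ... update the on-disk structure in sb_bh->b_data ... */
	unlock_buffer(sb_bh);

	mark_buffer_dirty(sb_bh);
	return sync_dirty_buffer(sb_bh);	/* submits WRITE_SYNC and waits for completion */
}
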
3058 3069
3059 /* 3070 /*
3060 * try_to_free_buffers() checks if all the buffers on this particular page 3071 * try_to_free_buffers() checks if all the buffers on this particular page
3061 * are unused, and releases them if so. 3072 * are unused, and releases them if so.
3062 * 3073 *
3063 * Exclusion against try_to_free_buffers may be obtained by either 3074 * Exclusion against try_to_free_buffers may be obtained by either
3064 * locking the page or by holding its mapping's private_lock. 3075 * locking the page or by holding its mapping's private_lock.
3065 * 3076 *
3066 * If the page is dirty but all the buffers are clean then we need to 3077 * If the page is dirty but all the buffers are clean then we need to
3067 * be sure to mark the page clean as well. This is because the page 3078 * be sure to mark the page clean as well. This is because the page
3068 * may be against a block device, and a later reattachment of buffers 3079 * may be against a block device, and a later reattachment of buffers
3069 * to a dirty page will set *all* buffers dirty. Which would corrupt 3080 * to a dirty page will set *all* buffers dirty. Which would corrupt
3070 * filesystem data on the same device. 3081 * filesystem data on the same device.
3071 * 3082 *
3072 * The same applies to regular filesystem pages: if all the buffers are 3083 * The same applies to regular filesystem pages: if all the buffers are
3073 * clean then we set the page clean and proceed. To do that, we require 3084 * clean then we set the page clean and proceed. To do that, we require
3074 * total exclusion from __set_page_dirty_buffers(). That is obtained with 3085 * total exclusion from __set_page_dirty_buffers(). That is obtained with
3075 * private_lock. 3086 * private_lock.
3076 * 3087 *
3077 * try_to_free_buffers() is non-blocking. 3088 * try_to_free_buffers() is non-blocking.
3078 */ 3089 */
3079 static inline int buffer_busy(struct buffer_head *bh) 3090 static inline int buffer_busy(struct buffer_head *bh)
3080 { 3091 {
3081 return atomic_read(&bh->b_count) | 3092 return atomic_read(&bh->b_count) |
3082 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); 3093 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3083 } 3094 }
3084 3095
3085 static int 3096 static int
3086 drop_buffers(struct page *page, struct buffer_head **buffers_to_free) 3097 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3087 { 3098 {
3088 struct buffer_head *head = page_buffers(page); 3099 struct buffer_head *head = page_buffers(page);
3089 struct buffer_head *bh; 3100 struct buffer_head *bh;
3090 3101
3091 bh = head; 3102 bh = head;
3092 do { 3103 do {
3093 if (buffer_write_io_error(bh) && page->mapping) 3104 if (buffer_write_io_error(bh) && page->mapping)
3094 set_bit(AS_EIO, &page->mapping->flags); 3105 set_bit(AS_EIO, &page->mapping->flags);
3095 if (buffer_busy(bh)) 3106 if (buffer_busy(bh))
3096 goto failed; 3107 goto failed;
3097 bh = bh->b_this_page; 3108 bh = bh->b_this_page;
3098 } while (bh != head); 3109 } while (bh != head);
3099 3110
3100 do { 3111 do {
3101 struct buffer_head *next = bh->b_this_page; 3112 struct buffer_head *next = bh->b_this_page;
3102 3113
3103 if (bh->b_assoc_map) 3114 if (bh->b_assoc_map)
3104 __remove_assoc_queue(bh); 3115 __remove_assoc_queue(bh);
3105 bh = next; 3116 bh = next;
3106 } while (bh != head); 3117 } while (bh != head);
3107 *buffers_to_free = head; 3118 *buffers_to_free = head;
3108 __clear_page_buffers(page); 3119 __clear_page_buffers(page);
3109 return 1; 3120 return 1;
3110 failed: 3121 failed:
3111 return 0; 3122 return 0;
3112 } 3123 }
3113 3124
3114 int try_to_free_buffers(struct page *page) 3125 int try_to_free_buffers(struct page *page)
3115 { 3126 {
3116 struct address_space * const mapping = page->mapping; 3127 struct address_space * const mapping = page->mapping;
3117 struct buffer_head *buffers_to_free = NULL; 3128 struct buffer_head *buffers_to_free = NULL;
3118 int ret = 0; 3129 int ret = 0;
3119 3130
3120 BUG_ON(!PageLocked(page)); 3131 BUG_ON(!PageLocked(page));
3121 if (PageWriteback(page)) 3132 if (PageWriteback(page))
3122 return 0; 3133 return 0;
3123 3134
3124 if (mapping == NULL) { /* can this still happen? */ 3135 if (mapping == NULL) { /* can this still happen? */
3125 ret = drop_buffers(page, &buffers_to_free); 3136 ret = drop_buffers(page, &buffers_to_free);
3126 goto out; 3137 goto out;
3127 } 3138 }
3128 3139
3129 spin_lock(&mapping->private_lock); 3140 spin_lock(&mapping->private_lock);
3130 ret = drop_buffers(page, &buffers_to_free); 3141 ret = drop_buffers(page, &buffers_to_free);
3131 3142
3132 /* 3143 /*
3133 * If the filesystem writes its buffers by hand (eg ext3) 3144 * If the filesystem writes its buffers by hand (eg ext3)
3134 * then we can have clean buffers against a dirty page. We 3145 * then we can have clean buffers against a dirty page. We
3135 * clean the page here; otherwise the VM will never notice 3146 * clean the page here; otherwise the VM will never notice
3136 * that the filesystem did any IO at all. 3147 * that the filesystem did any IO at all.
3137 * 3148 *
3138 * Also, during truncate, discard_buffer will have marked all 3149 * Also, during truncate, discard_buffer will have marked all
3139 * the page's buffers clean. We discover that here and clean 3150 * the page's buffers clean. We discover that here and clean
3140 * the page also. 3151 * the page also.
3141 * 3152 *
3142 * private_lock must be held over this entire operation in order 3153 * private_lock must be held over this entire operation in order
3143 * to synchronise against __set_page_dirty_buffers and prevent the 3154 * to synchronise against __set_page_dirty_buffers and prevent the
3144 * dirty bit from being lost. 3155 * dirty bit from being lost.
3145 */ 3156 */
3146 if (ret) 3157 if (ret)
3147 cancel_dirty_page(page, PAGE_CACHE_SIZE); 3158 cancel_dirty_page(page, PAGE_CACHE_SIZE);
3148 spin_unlock(&mapping->private_lock); 3159 spin_unlock(&mapping->private_lock);
3149 out: 3160 out:
3150 if (buffers_to_free) { 3161 if (buffers_to_free) {
3151 struct buffer_head *bh = buffers_to_free; 3162 struct buffer_head *bh = buffers_to_free;
3152 3163
3153 do { 3164 do {
3154 struct buffer_head *next = bh->b_this_page; 3165 struct buffer_head *next = bh->b_this_page;
3155 free_buffer_head(bh); 3166 free_buffer_head(bh);
3156 bh = next; 3167 bh = next;
3157 } while (bh != buffers_to_free); 3168 } while (bh != buffers_to_free);
3158 } 3169 }
3159 return ret; 3170 return ret;
3160 } 3171 }
3161 EXPORT_SYMBOL(try_to_free_buffers); 3172 EXPORT_SYMBOL(try_to_free_buffers);
3162 3173
3163 void block_sync_page(struct page *page) 3174 void block_sync_page(struct page *page)
3164 { 3175 {
3165 struct address_space *mapping; 3176 struct address_space *mapping;
3166 3177
3167 smp_mb(); 3178 smp_mb();
3168 mapping = page_mapping(page); 3179 mapping = page_mapping(page);
3169 if (mapping) 3180 if (mapping)
3170 blk_run_backing_dev(mapping->backing_dev_info, page); 3181 blk_run_backing_dev(mapping->backing_dev_info, page);
3171 } 3182 }
3172 3183
3173 /* 3184 /*
3174 * There are no bdflush tunables left. But distributions are 3185 * There are no bdflush tunables left. But distributions are
3175 * still running obsolete flush daemons, so we terminate them here. 3186 * still running obsolete flush daemons, so we terminate them here.
3176 * 3187 *
3177 * Use of bdflush() is deprecated and will be removed in a future kernel. 3188 * Use of bdflush() is deprecated and will be removed in a future kernel.
3178 * The `pdflush' kernel threads fully replace bdflush daemons and this call. 3189 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3179 */ 3190 */
3180 asmlinkage long sys_bdflush(int func, long data) 3191 asmlinkage long sys_bdflush(int func, long data)
3181 { 3192 {
3182 static int msg_count; 3193 static int msg_count;
3183 3194
3184 if (!capable(CAP_SYS_ADMIN)) 3195 if (!capable(CAP_SYS_ADMIN))
3185 return -EPERM; 3196 return -EPERM;
3186 3197
3187 if (msg_count < 5) { 3198 if (msg_count < 5) {
3188 msg_count++; 3199 msg_count++;
3189 printk(KERN_INFO 3200 printk(KERN_INFO
3190 "warning: process `%s' used the obsolete bdflush" 3201 "warning: process `%s' used the obsolete bdflush"
3191 " system call\n", current->comm); 3202 " system call\n", current->comm);
3192 printk(KERN_INFO "Fix your initscripts?\n"); 3203 printk(KERN_INFO "Fix your initscripts?\n");
3193 } 3204 }
3194 3205
3195 if (func == 1) 3206 if (func == 1)
3196 do_exit(0); 3207 do_exit(0);
3197 return 0; 3208 return 0;
3198 } 3209 }
3199 3210
3200 /* 3211 /*
3201 * Buffer-head allocation 3212 * Buffer-head allocation
3202 */ 3213 */
3203 static struct kmem_cache *bh_cachep; 3214 static struct kmem_cache *bh_cachep;
3204 3215
3205 /* 3216 /*
3206 * Once the number of bh's in the machine exceeds this level, we start 3217 * Once the number of bh's in the machine exceeds this level, we start
3207 * stripping them in writeback. 3218 * stripping them in writeback.
3208 */ 3219 */
3209 static int max_buffer_heads; 3220 static int max_buffer_heads;
3210 3221
3211 int buffer_heads_over_limit; 3222 int buffer_heads_over_limit;
3212 3223
3213 struct bh_accounting { 3224 struct bh_accounting {
3214 int nr; /* Number of live bh's */ 3225 int nr; /* Number of live bh's */
3215 int ratelimit; /* Limit cacheline bouncing */ 3226 int ratelimit; /* Limit cacheline bouncing */
3216 }; 3227 };
3217 3228
3218 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; 3229 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3219 3230
3220 static void recalc_bh_state(void) 3231 static void recalc_bh_state(void)
3221 { 3232 {
3222 int i; 3233 int i;
3223 int tot = 0; 3234 int tot = 0;
3224 3235
3225 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) 3236 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3226 return; 3237 return;
3227 __get_cpu_var(bh_accounting).ratelimit = 0; 3238 __get_cpu_var(bh_accounting).ratelimit = 0;
3228 for_each_online_cpu(i) 3239 for_each_online_cpu(i)
3229 tot += per_cpu(bh_accounting, i).nr; 3240 tot += per_cpu(bh_accounting, i).nr;
3230 buffer_heads_over_limit = (tot > max_buffer_heads); 3241 buffer_heads_over_limit = (tot > max_buffer_heads);
3231 } 3242 }
3232 3243
3233 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) 3244 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3234 { 3245 {
3235 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); 3246 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3236 if (ret) { 3247 if (ret) {
3237 INIT_LIST_HEAD(&ret->b_assoc_buffers); 3248 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3238 get_cpu_var(bh_accounting).nr++; 3249 get_cpu_var(bh_accounting).nr++;
3239 recalc_bh_state(); 3250 recalc_bh_state();
3240 put_cpu_var(bh_accounting); 3251 put_cpu_var(bh_accounting);
3241 } 3252 }
3242 return ret; 3253 return ret;
3243 } 3254 }
3244 EXPORT_SYMBOL(alloc_buffer_head); 3255 EXPORT_SYMBOL(alloc_buffer_head);
3245 3256
3246 void free_buffer_head(struct buffer_head *bh) 3257 void free_buffer_head(struct buffer_head *bh)
3247 { 3258 {
3248 BUG_ON(!list_empty(&bh->b_assoc_buffers)); 3259 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3249 kmem_cache_free(bh_cachep, bh); 3260 kmem_cache_free(bh_cachep, bh);
3250 get_cpu_var(bh_accounting).nr--; 3261 get_cpu_var(bh_accounting).nr--;
3251 recalc_bh_state(); 3262 recalc_bh_state();
3252 put_cpu_var(bh_accounting); 3263 put_cpu_var(bh_accounting);
3253 } 3264 }
3254 EXPORT_SYMBOL(free_buffer_head); 3265 EXPORT_SYMBOL(free_buffer_head);
3255 3266
3256 static void buffer_exit_cpu(int cpu) 3267 static void buffer_exit_cpu(int cpu)
3257 { 3268 {
3258 int i; 3269 int i;
3259 struct bh_lru *b = &per_cpu(bh_lrus, cpu); 3270 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3260 3271
3261 for (i = 0; i < BH_LRU_SIZE; i++) { 3272 for (i = 0; i < BH_LRU_SIZE; i++) {
3262 brelse(b->bhs[i]); 3273 brelse(b->bhs[i]);
3263 b->bhs[i] = NULL; 3274 b->bhs[i] = NULL;
3264 } 3275 }
3265 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr; 3276 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3266 per_cpu(bh_accounting, cpu).nr = 0; 3277 per_cpu(bh_accounting, cpu).nr = 0;
3267 put_cpu_var(bh_accounting); 3278 put_cpu_var(bh_accounting);
3268 } 3279 }
3269 3280
3270 static int buffer_cpu_notify(struct notifier_block *self, 3281 static int buffer_cpu_notify(struct notifier_block *self,
3271 unsigned long action, void *hcpu) 3282 unsigned long action, void *hcpu)
3272 { 3283 {
3273 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) 3284 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3274 buffer_exit_cpu((unsigned long)hcpu); 3285 buffer_exit_cpu((unsigned long)hcpu);
3275 return NOTIFY_OK; 3286 return NOTIFY_OK;
3276 } 3287 }
3277 3288
3278 /** 3289 /**
3279 * bh_uptodate_or_lock - Test whether the buffer is uptodate 3290 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3280 * @bh: struct buffer_head 3291 * @bh: struct buffer_head
3281 * 3292 *
3282 * Return true if the buffer is up-to-date and false, 3293 * Return true if the buffer is up-to-date and false,
3283 * with the buffer locked, if not. 3294 * with the buffer locked, if not.
3284 */ 3295 */
3285 int bh_uptodate_or_lock(struct buffer_head *bh) 3296 int bh_uptodate_or_lock(struct buffer_head *bh)
3286 { 3297 {
3287 if (!buffer_uptodate(bh)) { 3298 if (!buffer_uptodate(bh)) {
3288 lock_buffer(bh); 3299 lock_buffer(bh);
3289 if (!buffer_uptodate(bh)) 3300 if (!buffer_uptodate(bh))
3290 return 0; 3301 return 0;
3291 unlock_buffer(bh); 3302 unlock_buffer(bh);
3292 } 3303 }
3293 return 1; 3304 return 1;
3294 } 3305 }
3295 EXPORT_SYMBOL(bh_uptodate_or_lock); 3306 EXPORT_SYMBOL(bh_uptodate_or_lock);
3296 3307
3297 /** 3308 /**
3298 * bh_submit_read - Submit a locked buffer for reading 3309 * bh_submit_read - Submit a locked buffer for reading
3299 * @bh: struct buffer_head 3310 * @bh: struct buffer_head
3300 * 3311 *
3301 * Returns zero on success and -EIO on error. 3312 * Returns zero on success and -EIO on error.
3302 */ 3313 */
3303 int bh_submit_read(struct buffer_head *bh) 3314 int bh_submit_read(struct buffer_head *bh)
3304 { 3315 {
3305 BUG_ON(!buffer_locked(bh)); 3316 BUG_ON(!buffer_locked(bh));
3306 3317
3307 if (buffer_uptodate(bh)) { 3318 if (buffer_uptodate(bh)) {
3308 unlock_buffer(bh); 3319 unlock_buffer(bh);
3309 return 0; 3320 return 0;
3310 } 3321 }
3311 3322
3312 get_bh(bh); 3323 get_bh(bh);
3313 bh->b_end_io = end_buffer_read_sync; 3324 bh->b_end_io = end_buffer_read_sync;
3314 submit_bh(READ, bh); 3325 submit_bh(READ, bh);
3315 wait_on_buffer(bh); 3326 wait_on_buffer(bh);
3316 if (buffer_uptodate(bh)) 3327 if (buffer_uptodate(bh))
3317 return 0; 3328 return 0;
3318 return -EIO; 3329 return -EIO;
3319 } 3330 }
3320 EXPORT_SYMBOL(bh_submit_read); 3331 EXPORT_SYMBOL(bh_submit_read);
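
A minimal sketch (editor's illustration; the wrapper name is hypothetical) of the pairing these two helpers are documented for: skip the read if the buffer is already uptodate, otherwise submit it with the buffer still locked.

static int myfs_read_block(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;		/* already uptodate, nothing to submit */
	return bh_submit_read(bh);	/* bh is locked here; submits READ and waits */
}
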
3321 3332
3322 static void 3333 static void
3323 init_buffer_head(void *data) 3334 init_buffer_head(void *data)
3324 { 3335 {
3325 struct buffer_head *bh = data; 3336 struct buffer_head *bh = data;
3326 3337
3327 memset(bh, 0, sizeof(*bh)); 3338 memset(bh, 0, sizeof(*bh));
3328 INIT_LIST_HEAD(&bh->b_assoc_buffers); 3339 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3329 } 3340 }
3330 3341
3331 void __init buffer_init(void) 3342 void __init buffer_init(void)
3332 { 3343 {
3333 int nrpages; 3344 int nrpages;
3334 3345
3335 bh_cachep = kmem_cache_create("buffer_head", 3346 bh_cachep = kmem_cache_create("buffer_head",
3336 sizeof(struct buffer_head), 0, 3347 sizeof(struct buffer_head), 0,
3337 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 3348 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3338 SLAB_MEM_SPREAD), 3349 SLAB_MEM_SPREAD),
3339 init_buffer_head); 3350 init_buffer_head);
3340 3351
3341 /* 3352 /*
3342 * Limit the bh occupancy to 10% of ZONE_NORMAL 3353 * Limit the bh occupancy to 10% of ZONE_NORMAL
3343 */ 3354 */
3344 nrpages = (nr_free_buffer_pages() * 10) / 100; 3355 nrpages = (nr_free_buffer_pages() * 10) / 100;
3345 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); 3356 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3346 hotcpu_notifier(buffer_cpu_notify, 0); 3357 hotcpu_notifier(buffer_cpu_notify, 0);
3347 } 3358 }
3348 3359
3349 EXPORT_SYMBOL(__bforget); 3360 EXPORT_SYMBOL(__bforget);
3350 EXPORT_SYMBOL(__brelse); 3361 EXPORT_SYMBOL(__brelse);
3351 EXPORT_SYMBOL(__wait_on_buffer); 3362 EXPORT_SYMBOL(__wait_on_buffer);
3352 EXPORT_SYMBOL(block_commit_write); 3363 EXPORT_SYMBOL(block_commit_write);
3353 EXPORT_SYMBOL(block_prepare_write); 3364 EXPORT_SYMBOL(block_prepare_write);
3354 EXPORT_SYMBOL(block_page_mkwrite); 3365 EXPORT_SYMBOL(block_page_mkwrite);
3355 EXPORT_SYMBOL(block_read_full_page); 3366 EXPORT_SYMBOL(block_read_full_page);
3356 EXPORT_SYMBOL(block_sync_page); 3367 EXPORT_SYMBOL(block_sync_page);
3357 EXPORT_SYMBOL(block_truncate_page); 3368 EXPORT_SYMBOL(block_truncate_page);
3358 EXPORT_SYMBOL(block_write_full_page); 3369 EXPORT_SYMBOL(block_write_full_page);
3359 EXPORT_SYMBOL(cont_write_begin); 3370 EXPORT_SYMBOL(cont_write_begin);
3360 EXPORT_SYMBOL(end_buffer_read_sync); 3371 EXPORT_SYMBOL(end_buffer_read_sync);
3361 EXPORT_SYMBOL(end_buffer_write_sync); 3372 EXPORT_SYMBOL(end_buffer_write_sync);
3362 EXPORT_SYMBOL(file_fsync); 3373 EXPORT_SYMBOL(file_fsync);
3363 EXPORT_SYMBOL(fsync_bdev); 3374 EXPORT_SYMBOL(fsync_bdev);
3364 EXPORT_SYMBOL(generic_block_bmap); 3375 EXPORT_SYMBOL(generic_block_bmap);
3365 EXPORT_SYMBOL(generic_cont_expand_simple); 3376 EXPORT_SYMBOL(generic_cont_expand_simple);
3366 EXPORT_SYMBOL(init_buffer); 3377 EXPORT_SYMBOL(init_buffer);
3367 EXPORT_SYMBOL(invalidate_bdev); 3378 EXPORT_SYMBOL(invalidate_bdev);
3368 EXPORT_SYMBOL(ll_rw_block); 3379 EXPORT_SYMBOL(ll_rw_block);
3369 EXPORT_SYMBOL(mark_buffer_dirty); 3380 EXPORT_SYMBOL(mark_buffer_dirty);
3370 EXPORT_SYMBOL(submit_bh); 3381 EXPORT_SYMBOL(submit_bh);
3371 EXPORT_SYMBOL(sync_dirty_buffer); 3382 EXPORT_SYMBOL(sync_dirty_buffer);
3372 EXPORT_SYMBOL(unlock_buffer); 3383 EXPORT_SYMBOL(unlock_buffer);
1 /* 1 /*
2 * 2.5 block I/O model 2 * 2.5 block I/O model
3 * 3 *
4 * Copyright (C) 2001 Jens Axboe <axboe@suse.de> 4 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 * 9 *
10 * This program is distributed in the hope that it will be useful, 10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 12
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public Licens 16 * You should have received a copy of the GNU General Public Licens
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
19 */ 19 */
20 #ifndef __LINUX_BIO_H 20 #ifndef __LINUX_BIO_H
21 #define __LINUX_BIO_H 21 #define __LINUX_BIO_H
22 22
23 #include <linux/highmem.h> 23 #include <linux/highmem.h>
24 #include <linux/mempool.h> 24 #include <linux/mempool.h>
25 #include <linux/ioprio.h> 25 #include <linux/ioprio.h>
26 26
27 #ifdef CONFIG_BLOCK 27 #ifdef CONFIG_BLOCK
28 28
29 #include <asm/io.h> 29 #include <asm/io.h>
30 30
31 #define BIO_DEBUG 31 #define BIO_DEBUG
32 32
33 #ifdef BIO_DEBUG 33 #ifdef BIO_DEBUG
34 #define BIO_BUG_ON BUG_ON 34 #define BIO_BUG_ON BUG_ON
35 #else 35 #else
36 #define BIO_BUG_ON 36 #define BIO_BUG_ON
37 #endif 37 #endif
38 38
39 #define BIO_MAX_PAGES 256 39 #define BIO_MAX_PAGES 256
40 #define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT) 40 #define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT)
41 #define BIO_MAX_SECTORS (BIO_MAX_SIZE >> 9) 41 #define BIO_MAX_SECTORS (BIO_MAX_SIZE >> 9)
42 42
43 /* 43 /*
44 * was unsigned short, but we might as well be ready for > 64kB I/O pages 44 * was unsigned short, but we might as well be ready for > 64kB I/O pages
45 */ 45 */
46 struct bio_vec { 46 struct bio_vec {
47 struct page *bv_page; 47 struct page *bv_page;
48 unsigned int bv_len; 48 unsigned int bv_len;
49 unsigned int bv_offset; 49 unsigned int bv_offset;
50 }; 50 };
51 51
52 struct bio_set; 52 struct bio_set;
53 struct bio; 53 struct bio;
54 struct bio_integrity_payload; 54 struct bio_integrity_payload;
55 typedef void (bio_end_io_t) (struct bio *, int); 55 typedef void (bio_end_io_t) (struct bio *, int);
56 typedef void (bio_destructor_t) (struct bio *); 56 typedef void (bio_destructor_t) (struct bio *);
57 57
58 /* 58 /*
59 * main unit of I/O for the block layer and lower layers (ie drivers and 59 * main unit of I/O for the block layer and lower layers (ie drivers and
60 * stacking drivers) 60 * stacking drivers)
61 */ 61 */
62 struct bio { 62 struct bio {
63 sector_t bi_sector; /* device address in 512 byte 63 sector_t bi_sector; /* device address in 512 byte
64 sectors */ 64 sectors */
65 struct bio *bi_next; /* request queue link */ 65 struct bio *bi_next; /* request queue link */
66 struct block_device *bi_bdev; 66 struct block_device *bi_bdev;
67 unsigned long bi_flags; /* status, command, etc */ 67 unsigned long bi_flags; /* status, command, etc */
68 unsigned long bi_rw; /* bottom bits READ/WRITE, 68 unsigned long bi_rw; /* bottom bits READ/WRITE,
69 * top bits priority 69 * top bits priority
70 */ 70 */
71 71
72 unsigned short bi_vcnt; /* how many bio_vec's */ 72 unsigned short bi_vcnt; /* how many bio_vec's */
73 unsigned short bi_idx; /* current index into bvl_vec */ 73 unsigned short bi_idx; /* current index into bvl_vec */
74 74
75 /* Number of segments in this BIO after 75 /* Number of segments in this BIO after
76 * physical address coalescing is performed. 76 * physical address coalescing is performed.
77 */ 77 */
78 unsigned int bi_phys_segments; 78 unsigned int bi_phys_segments;
79 79
80 unsigned int bi_size; /* residual I/O count */ 80 unsigned int bi_size; /* residual I/O count */
81 81
82 /* 82 /*
83 * To keep track of the max segment size, we account for the 83 * To keep track of the max segment size, we account for the
84 * sizes of the first and last mergeable segments in this bio. 84 * sizes of the first and last mergeable segments in this bio.
85 */ 85 */
86 unsigned int bi_seg_front_size; 86 unsigned int bi_seg_front_size;
87 unsigned int bi_seg_back_size; 87 unsigned int bi_seg_back_size;
88 88
89 unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ 89 unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
90 90
91 unsigned int bi_comp_cpu; /* completion CPU */ 91 unsigned int bi_comp_cpu; /* completion CPU */
92 92
93 struct bio_vec *bi_io_vec; /* the actual vec list */ 93 struct bio_vec *bi_io_vec; /* the actual vec list */
94 94
95 bio_end_io_t *bi_end_io; 95 bio_end_io_t *bi_end_io;
96 atomic_t bi_cnt; /* pin count */ 96 atomic_t bi_cnt; /* pin count */
97 97
98 void *bi_private; 98 void *bi_private;
99 #if defined(CONFIG_BLK_DEV_INTEGRITY) 99 #if defined(CONFIG_BLK_DEV_INTEGRITY)
100 struct bio_integrity_payload *bi_integrity; /* data integrity */ 100 struct bio_integrity_payload *bi_integrity; /* data integrity */
101 #endif 101 #endif
102 102
103 bio_destructor_t *bi_destructor; /* destructor */ 103 bio_destructor_t *bi_destructor; /* destructor */
104 }; 104 };
105 105
106 /* 106 /*
107 * bio flags 107 * bio flags
108 */ 108 */
109 #define BIO_UPTODATE 0 /* ok after I/O completion */ 109 #define BIO_UPTODATE 0 /* ok after I/O completion */
110 #define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */ 110 #define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */
111 #define BIO_EOF 2 /* out-of-bounds error */ 111 #define BIO_EOF 2 /* out-of-bounds error */
112 #define BIO_SEG_VALID 3 /* bi_phys_segments valid */ 112 #define BIO_SEG_VALID 3 /* bi_phys_segments valid */
113 #define BIO_CLONED 4 /* doesn't own data */ 113 #define BIO_CLONED 4 /* doesn't own data */
114 #define BIO_BOUNCED 5 /* bio is a bounce bio */ 114 #define BIO_BOUNCED 5 /* bio is a bounce bio */
115 #define BIO_USER_MAPPED 6 /* contains user pages */ 115 #define BIO_USER_MAPPED 6 /* contains user pages */
116 #define BIO_EOPNOTSUPP 7 /* not supported */ 116 #define BIO_EOPNOTSUPP 7 /* not supported */
117 #define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */ 117 #define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */
118 #define BIO_NULL_MAPPED 9 /* contains invalid user pages */ 118 #define BIO_NULL_MAPPED 9 /* contains invalid user pages */
119 #define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */ 119 #define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */
120 #define BIO_QUIET 11 /* Make BIO Quiet */
120 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) 121 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
121 122
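
The new BIO_QUIET bit above, together with the existing bio_flagged() helper, is all a completion path needs to decide whether to stay silent. Below is a minimal sketch, not code from this patch: the function names are made up, and marking the bio with set_bit() is an assumption about how a submitter might set the flag.

#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/kernel.h>

/* Hypothetical helpers, for illustration only -- not part of this patch. */
static void example_quiet_end_io(struct bio *bio, int error)
{
	/* Honour BIO_QUIET: skip the error printk for expected failures. */
	if (error && !bio_flagged(bio, BIO_QUIET))
		printk(KERN_ERR "example: I/O error %d at sector %llu\n",
		       error, (unsigned long long)bio->bi_sector);
	bio_put(bio);
}

static void example_submit_quiet(struct bio *bio, int rw)
{
	set_bit(BIO_QUIET, &bio->bi_flags);	/* assumed way of marking the bio */
	bio->bi_end_io = example_quiet_end_io;
	submit_bio(rw, bio);
}
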
122 /* 123 /*
123 * top 4 bits of bio flags indicate the pool this bio came from 124 * top 4 bits of bio flags indicate the pool this bio came from
124 */ 125 */
125 #define BIO_POOL_BITS (4) 126 #define BIO_POOL_BITS (4)
126 #define BIO_POOL_OFFSET (BITS_PER_LONG - BIO_POOL_BITS) 127 #define BIO_POOL_OFFSET (BITS_PER_LONG - BIO_POOL_BITS)
127 #define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET) 128 #define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET)
128 #define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET) 129 #define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET)
129 130
130 /* 131 /*
131 * bio bi_rw flags 132 * bio bi_rw flags
132 * 133 *
133 * bit 0 -- data direction 134 * bit 0 -- data direction
134 * If not set, bio is a read from device. If set, it's a write to device. 135 * If not set, bio is a read from device. If set, it's a write to device.
135 * bit 1 -- rw-ahead when set 136 * bit 1 -- rw-ahead when set
136 * bit 2 -- barrier 137 * bit 2 -- barrier
137 * Insert a serialization point in the IO queue, forcing previously 138 * Insert a serialization point in the IO queue, forcing previously
138 * submitted IO to be completed before this one is issued. 139 * submitted IO to be completed before this one is issued.
139 * bit 3 -- synchronous I/O hint: the block layer will unplug immediately 140 * bit 3 -- synchronous I/O hint: the block layer will unplug immediately
140 * Note that this does NOT indicate that the IO itself is sync, just 141 * Note that this does NOT indicate that the IO itself is sync, just
141 * that the block layer will not postpone issue of this IO by plugging. 142 * that the block layer will not postpone issue of this IO by plugging.
142 * bit 4 -- metadata request 143 * bit 4 -- metadata request
143 * Used for tracing to differentiate metadata and data IO. May also 144 * Used for tracing to differentiate metadata and data IO. May also
144 * get some preferential treatment in the IO scheduler 145 * get some preferential treatment in the IO scheduler
145 * bit 5 -- discard sectors 146 * bit 5 -- discard sectors
146 * Informs the lower level device that this range of sectors is no longer 147 * Informs the lower level device that this range of sectors is no longer
147 * used by the file system and may thus be freed by the device. Used 148 * used by the file system and may thus be freed by the device. Used
148 * for flash based storage. 149 * for flash based storage.
149 * bit 6 -- fail fast device errors 150 * bit 6 -- fail fast device errors
150 * bit 7 -- fail fast transport errors 151 * bit 7 -- fail fast transport errors
151 * bit 8 -- fail fast driver errors 152 * bit 8 -- fail fast driver errors
152 * Don't want driver retries for any fast fail whatever the reason. 153 * Don't want driver retries for any fast fail whatever the reason.
153 */ 154 */
154 #define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */ 155 #define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */
155 #define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */ 156 #define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */
156 #define BIO_RW_BARRIER 2 157 #define BIO_RW_BARRIER 2
157 #define BIO_RW_SYNC 3 158 #define BIO_RW_SYNC 3
158 #define BIO_RW_META 4 159 #define BIO_RW_META 4
159 #define BIO_RW_DISCARD 5 160 #define BIO_RW_DISCARD 5
160 #define BIO_RW_FAILFAST_DEV 6 161 #define BIO_RW_FAILFAST_DEV 6
161 #define BIO_RW_FAILFAST_TRANSPORT 7 162 #define BIO_RW_FAILFAST_TRANSPORT 7
162 #define BIO_RW_FAILFAST_DRIVER 8 163 #define BIO_RW_FAILFAST_DRIVER 8
163 164
164 /* 165 /*
165 * upper 16 bits of bi_rw define the io priority of this bio 166 * upper 16 bits of bi_rw define the io priority of this bio
166 */ 167 */
167 #define BIO_PRIO_SHIFT (8 * sizeof(unsigned long) - IOPRIO_BITS) 168 #define BIO_PRIO_SHIFT (8 * sizeof(unsigned long) - IOPRIO_BITS)
168 #define bio_prio(bio) ((bio)->bi_rw >> BIO_PRIO_SHIFT) 169 #define bio_prio(bio) ((bio)->bi_rw >> BIO_PRIO_SHIFT)
169 #define bio_prio_valid(bio) ioprio_valid(bio_prio(bio)) 170 #define bio_prio_valid(bio) ioprio_valid(bio_prio(bio))
170 171
171 #define bio_set_prio(bio, prio) do { \ 172 #define bio_set_prio(bio, prio) do { \
172 WARN_ON(prio >= (1 << IOPRIO_BITS)); \ 173 WARN_ON(prio >= (1 << IOPRIO_BITS)); \
173 (bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1); \ 174 (bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1); \
174 (bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT); \ 175 (bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT); \
175 } while (0) 176 } while (0)
176 177
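
As the comment above notes, the I/O priority lives in the top bits of bi_rw. A short illustration of tagging a bio and reading the priority back, assuming the usual IOPRIO_* helpers from linux/ioprio.h; the function name and the chosen class/level are made up:

#include <linux/bio.h>
#include <linux/ioprio.h>

/* Illustration only: give a bio a best-effort priority and read it back. */
static void example_tag_priority(struct bio *bio)
{
	bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4));

	if (bio_prio_valid(bio))
		pr_debug("bio ioprio class %d, data %d\n",
			 IOPRIO_PRIO_CLASS(bio_prio(bio)),
			 IOPRIO_PRIO_DATA(bio_prio(bio)));
}
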
177 /* 178 /*
178 * various member access, note that bio_data should of course not be used 179 * various member access, note that bio_data should of course not be used
179 * on highmem page vectors 180 * on highmem page vectors
180 */ 181 */
181 #define bio_iovec_idx(bio, idx) (&((bio)->bi_io_vec[(idx)])) 182 #define bio_iovec_idx(bio, idx) (&((bio)->bi_io_vec[(idx)]))
182 #define bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_idx) 183 #define bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_idx)
183 #define bio_page(bio) bio_iovec((bio))->bv_page 184 #define bio_page(bio) bio_iovec((bio))->bv_page
184 #define bio_offset(bio) bio_iovec((bio))->bv_offset 185 #define bio_offset(bio) bio_iovec((bio))->bv_offset
185 #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) 186 #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx)
186 #define bio_sectors(bio) ((bio)->bi_size >> 9) 187 #define bio_sectors(bio) ((bio)->bi_size >> 9)
187 #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) 188 #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER))
188 #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) 189 #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC))
189 #define bio_failfast_dev(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DEV)) 190 #define bio_failfast_dev(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DEV))
190 #define bio_failfast_transport(bio) \ 191 #define bio_failfast_transport(bio) \
191 ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_TRANSPORT)) 192 ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_TRANSPORT))
192 #define bio_failfast_driver(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DRIVER)) 193 #define bio_failfast_driver(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DRIVER))
193 #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) 194 #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD))
194 #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) 195 #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META))
195 #define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD)) 196 #define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD))
196 #define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio)) 197 #define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio))
197 198
198 static inline unsigned int bio_cur_sectors(struct bio *bio) 199 static inline unsigned int bio_cur_sectors(struct bio *bio)
199 { 200 {
200 if (bio->bi_vcnt) 201 if (bio->bi_vcnt)
201 return bio_iovec(bio)->bv_len >> 9; 202 return bio_iovec(bio)->bv_len >> 9;
202 else /* dataless requests such as discard */ 203 else /* dataless requests such as discard */
203 return bio->bi_size >> 9; 204 return bio->bi_size >> 9;
204 } 205 }
205 206
206 static inline void *bio_data(struct bio *bio) 207 static inline void *bio_data(struct bio *bio)
207 { 208 {
208 if (bio->bi_vcnt) 209 if (bio->bi_vcnt)
209 return page_address(bio_page(bio)) + bio_offset(bio); 210 return page_address(bio_page(bio)) + bio_offset(bio);
210 211
211 return NULL; 212 return NULL;
212 } 213 }
213 214
214 /* 215 /*
215 * will die 216 * will die
216 */ 217 */
217 #define bio_to_phys(bio) (page_to_phys(bio_page((bio))) + (unsigned long) bio_offset((bio))) 218 #define bio_to_phys(bio) (page_to_phys(bio_page((bio))) + (unsigned long) bio_offset((bio)))
218 #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) 219 #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset)
219 220
220 /* 221 /*
221 * queues that have highmem support enabled may still need to revert to 222 * queues that have highmem support enabled may still need to revert to
222 * PIO transfers occasionally and thus map high pages temporarily. For 223 * PIO transfers occasionally and thus map high pages temporarily. For
223 * permanent PIO fall back, user is probably better off disabling highmem 224 * permanent PIO fall back, user is probably better off disabling highmem
224 * I/O completely on that queue (see ide-dma for example) 225 * I/O completely on that queue (see ide-dma for example)
225 */ 226 */
226 #define __bio_kmap_atomic(bio, idx, kmtype) \ 227 #define __bio_kmap_atomic(bio, idx, kmtype) \
227 (kmap_atomic(bio_iovec_idx((bio), (idx))->bv_page, kmtype) + \ 228 (kmap_atomic(bio_iovec_idx((bio), (idx))->bv_page, kmtype) + \
228 bio_iovec_idx((bio), (idx))->bv_offset) 229 bio_iovec_idx((bio), (idx))->bv_offset)
229 230
230 #define __bio_kunmap_atomic(addr, kmtype) kunmap_atomic(addr, kmtype) 231 #define __bio_kunmap_atomic(addr, kmtype) kunmap_atomic(addr, kmtype)
231 232
232 /* 233 /*
233 * merge helpers etc 234 * merge helpers etc
234 */ 235 */
235 236
236 #define __BVEC_END(bio) bio_iovec_idx((bio), (bio)->bi_vcnt - 1) 237 #define __BVEC_END(bio) bio_iovec_idx((bio), (bio)->bi_vcnt - 1)
237 #define __BVEC_START(bio) bio_iovec_idx((bio), (bio)->bi_idx) 238 #define __BVEC_START(bio) bio_iovec_idx((bio), (bio)->bi_idx)
238 239
239 /* Default implementation of BIOVEC_PHYS_MERGEABLE */ 240 /* Default implementation of BIOVEC_PHYS_MERGEABLE */
240 #define __BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ 241 #define __BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
241 ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) 242 ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2)))
242 243
243 /* 244 /*
244 * allow arch override, for eg virtualized architectures (put in asm/io.h) 245 * allow arch override, for eg virtualized architectures (put in asm/io.h)
245 */ 246 */
246 #ifndef BIOVEC_PHYS_MERGEABLE 247 #ifndef BIOVEC_PHYS_MERGEABLE
247 #define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ 248 #define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
248 __BIOVEC_PHYS_MERGEABLE(vec1, vec2) 249 __BIOVEC_PHYS_MERGEABLE(vec1, vec2)
249 #endif 250 #endif
250 251
251 #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \ 252 #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \
252 (((addr1) | (mask)) == (((addr2) - 1) | (mask))) 253 (((addr1) | (mask)) == (((addr2) - 1) | (mask)))
253 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ 254 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
254 __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask) 255 __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask)
255 #define BIO_SEG_BOUNDARY(q, b1, b2) \ 256 #define BIO_SEG_BOUNDARY(q, b1, b2) \
256 BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2))) 257 BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2)))
257 258
258 #define bio_io_error(bio) bio_endio((bio), -EIO) 259 #define bio_io_error(bio) bio_endio((bio), -EIO)
259 260
260 /* 261 /*
261 * drivers should not use the __ version unless they _really_ want to 262 * drivers should not use the __ version unless they _really_ want to
262 * run through the entire bio and not just pending pieces 263 * run through the entire bio and not just pending pieces
263 */ 264 */
264 #define __bio_for_each_segment(bvl, bio, i, start_idx) \ 265 #define __bio_for_each_segment(bvl, bio, i, start_idx) \
265 for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \ 266 for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \
266 i < (bio)->bi_vcnt; \ 267 i < (bio)->bi_vcnt; \
267 bvl++, i++) 268 bvl++, i++)
268 269
269 #define bio_for_each_segment(bvl, bio, i) \ 270 #define bio_for_each_segment(bvl, bio, i) \
270 __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx) 271 __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx)
271 272
272 /* 273 /*
273 * get a reference to a bio, so it won't disappear. the intended use is 274 * get a reference to a bio, so it won't disappear. the intended use is
274 * something like: 275 * something like:
275 * 276 *
276 * bio_get(bio); 277 * bio_get(bio);
277 * submit_bio(rw, bio); 278 * submit_bio(rw, bio);
278 * if (bio->bi_flags ...) 279 * if (bio->bi_flags ...)
279 * do_something 280 * do_something
280 * bio_put(bio); 281 * bio_put(bio);
281 * 282 *
282 * without the bio_get(), it could potentially complete I/O before submit_bio 283 * without the bio_get(), it could potentially complete I/O before submit_bio
283 * returns. and then bio would be freed memory when if (bio->bi_flags ...) 284 * returns. and then bio would be freed memory when if (bio->bi_flags ...)
284 * runs 285 * runs
285 */ 286 */
286 #define bio_get(bio) atomic_inc(&(bio)->bi_cnt) 287 #define bio_get(bio) atomic_inc(&(bio)->bi_cnt)
287 288
288 #if defined(CONFIG_BLK_DEV_INTEGRITY) 289 #if defined(CONFIG_BLK_DEV_INTEGRITY)
289 /* 290 /*
290 * bio integrity payload 291 * bio integrity payload
291 */ 292 */
292 struct bio_integrity_payload { 293 struct bio_integrity_payload {
293 struct bio *bip_bio; /* parent bio */ 294 struct bio *bip_bio; /* parent bio */
294 struct bio_vec *bip_vec; /* integrity data vector */ 295 struct bio_vec *bip_vec; /* integrity data vector */
295 296
296 sector_t bip_sector; /* virtual start sector */ 297 sector_t bip_sector; /* virtual start sector */
297 298
298 void *bip_buf; /* generated integrity data */ 299 void *bip_buf; /* generated integrity data */
299 bio_end_io_t *bip_end_io; /* saved I/O completion fn */ 300 bio_end_io_t *bip_end_io; /* saved I/O completion fn */
300 301
301 int bip_error; /* saved I/O error */ 302 int bip_error; /* saved I/O error */
302 unsigned int bip_size; 303 unsigned int bip_size;
303 304
304 unsigned short bip_pool; /* pool the ivec came from */ 305 unsigned short bip_pool; /* pool the ivec came from */
305 unsigned short bip_vcnt; /* # of integrity bio_vecs */ 306 unsigned short bip_vcnt; /* # of integrity bio_vecs */
306 unsigned short bip_idx; /* current bip_vec index */ 307 unsigned short bip_idx; /* current bip_vec index */
307 308
308 struct work_struct bip_work; /* I/O completion */ 309 struct work_struct bip_work; /* I/O completion */
309 }; 310 };
310 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 311 #endif /* CONFIG_BLK_DEV_INTEGRITY */
311 312
312 /* 313 /*
313 * A bio_pair is used when we need to split a bio. 314 * A bio_pair is used when we need to split a bio.
314 * This can only happen for a bio that refers to just one 315 * This can only happen for a bio that refers to just one
315 * page of data, and in the unusual situation when the 316 * page of data, and in the unusual situation when the
316 * page crosses a chunk/device boundary 317 * page crosses a chunk/device boundary
317 * 318 *
318 * The address of the master bio is stored in bio1.bi_private 319 * The address of the master bio is stored in bio1.bi_private
319 * The address of the pool the pair was allocated from is stored 320 * The address of the pool the pair was allocated from is stored
320 * in bio2.bi_private 321 * in bio2.bi_private
321 */ 322 */
322 struct bio_pair { 323 struct bio_pair {
323 struct bio bio1, bio2; 324 struct bio bio1, bio2;
324 struct bio_vec bv1, bv2; 325 struct bio_vec bv1, bv2;
325 #if defined(CONFIG_BLK_DEV_INTEGRITY) 326 #if defined(CONFIG_BLK_DEV_INTEGRITY)
326 struct bio_integrity_payload bip1, bip2; 327 struct bio_integrity_payload bip1, bip2;
327 struct bio_vec iv1, iv2; 328 struct bio_vec iv1, iv2;
328 #endif 329 #endif
329 atomic_t cnt; 330 atomic_t cnt;
330 int error; 331 int error;
331 }; 332 };
332 extern struct bio_pair *bio_split(struct bio *bi, int first_sectors); 333 extern struct bio_pair *bio_split(struct bio *bi, int first_sectors);
333 extern void bio_pair_release(struct bio_pair *dbio); 334 extern void bio_pair_release(struct bio_pair *dbio);
334 335
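
bio_split() and bio_pair_release() above implement the single-page split described in the comment. A sketch of the usual pattern, loosely modelled on how striping drivers consume it; the boundary handling and function name are illustrative assumptions, not code from this file:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Illustration only: split a one-page bio that straddles a chunk boundary. */
static void example_split_and_resubmit(struct bio *bio, sector_t boundary)
{
	struct bio_pair *bp = bio_split(bio, boundary - bio->bi_sector);

	if (!bp)
		return;				/* split failed; caller must handle it */

	generic_make_request(&bp->bio1);	/* sectors below the boundary */
	generic_make_request(&bp->bio2);	/* sectors above the boundary */
	bio_pair_release(bp);			/* drop our reference on the pair */
}
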
335 extern struct bio_set *bioset_create(int, int); 336 extern struct bio_set *bioset_create(int, int);
336 extern void bioset_free(struct bio_set *); 337 extern void bioset_free(struct bio_set *);
337 338
338 extern struct bio *bio_alloc(gfp_t, int); 339 extern struct bio *bio_alloc(gfp_t, int);
339 extern struct bio *bio_kmalloc(gfp_t, int); 340 extern struct bio *bio_kmalloc(gfp_t, int);
340 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); 341 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
341 extern void bio_put(struct bio *); 342 extern void bio_put(struct bio *);
342 extern void bio_free(struct bio *, struct bio_set *); 343 extern void bio_free(struct bio *, struct bio_set *);
343 344
344 extern void bio_endio(struct bio *, int); 345 extern void bio_endio(struct bio *, int);
345 struct request_queue; 346 struct request_queue;
346 extern int bio_phys_segments(struct request_queue *, struct bio *); 347 extern int bio_phys_segments(struct request_queue *, struct bio *);
347 348
348 extern void __bio_clone(struct bio *, struct bio *); 349 extern void __bio_clone(struct bio *, struct bio *);
349 extern struct bio *bio_clone(struct bio *, gfp_t); 350 extern struct bio *bio_clone(struct bio *, gfp_t);
350 351
351 extern void bio_init(struct bio *); 352 extern void bio_init(struct bio *);
352 353
353 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); 354 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
354 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, 355 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
355 unsigned int, unsigned int); 356 unsigned int, unsigned int);
356 extern int bio_get_nr_vecs(struct block_device *); 357 extern int bio_get_nr_vecs(struct block_device *);
357 extern sector_t bio_sector_offset(struct bio *, unsigned short, unsigned int); 358 extern sector_t bio_sector_offset(struct bio *, unsigned short, unsigned int);
358 extern struct bio *bio_map_user(struct request_queue *, struct block_device *, 359 extern struct bio *bio_map_user(struct request_queue *, struct block_device *,
359 unsigned long, unsigned int, int, gfp_t); 360 unsigned long, unsigned int, int, gfp_t);
360 struct sg_iovec; 361 struct sg_iovec;
361 struct rq_map_data; 362 struct rq_map_data;
362 extern struct bio *bio_map_user_iov(struct request_queue *, 363 extern struct bio *bio_map_user_iov(struct request_queue *,
363 struct block_device *, 364 struct block_device *,
364 struct sg_iovec *, int, int, gfp_t); 365 struct sg_iovec *, int, int, gfp_t);
365 extern void bio_unmap_user(struct bio *); 366 extern void bio_unmap_user(struct bio *);
366 extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, 367 extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int,
367 gfp_t); 368 gfp_t);
368 extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, 369 extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
369 gfp_t, int); 370 gfp_t, int);
370 extern void bio_set_pages_dirty(struct bio *bio); 371 extern void bio_set_pages_dirty(struct bio *bio);
371 extern void bio_check_pages_dirty(struct bio *bio); 372 extern void bio_check_pages_dirty(struct bio *bio);
372 extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, 373 extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *,
373 unsigned long, unsigned int, int, gfp_t); 374 unsigned long, unsigned int, int, gfp_t);
374 extern struct bio *bio_copy_user_iov(struct request_queue *, 375 extern struct bio *bio_copy_user_iov(struct request_queue *,
375 struct rq_map_data *, struct sg_iovec *, 376 struct rq_map_data *, struct sg_iovec *,
376 int, int, gfp_t); 377 int, int, gfp_t);
377 extern int bio_uncopy_user(struct bio *); 378 extern int bio_uncopy_user(struct bio *);
378 void zero_fill_bio(struct bio *bio); 379 void zero_fill_bio(struct bio *bio);
379 extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); 380 extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *);
380 extern unsigned int bvec_nr_vecs(unsigned short idx); 381 extern unsigned int bvec_nr_vecs(unsigned short idx);
381 382
382 /* 383 /*
383 * Allow queuer to specify a completion CPU for this bio 384 * Allow queuer to specify a completion CPU for this bio
384 */ 385 */
385 static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) 386 static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu)
386 { 387 {
387 bio->bi_comp_cpu = cpu; 388 bio->bi_comp_cpu = cpu;
388 } 389 }
389 390
390 /* 391 /*
391 * bio_set is used to allow other portions of the IO system to 392 * bio_set is used to allow other portions of the IO system to
392 * allocate their own private memory pools for bio and iovec structures. 393 * allocate their own private memory pools for bio and iovec structures.
393 * These memory pools in turn all allocate from the bio_slab 394 * These memory pools in turn all allocate from the bio_slab
394 * and the bvec_slabs[]. 395 * and the bvec_slabs[].
395 */ 396 */
396 #define BIO_POOL_SIZE 2 397 #define BIO_POOL_SIZE 2
397 #define BIOVEC_NR_POOLS 6 398 #define BIOVEC_NR_POOLS 6
398 399
399 struct bio_set { 400 struct bio_set {
400 mempool_t *bio_pool; 401 mempool_t *bio_pool;
401 #if defined(CONFIG_BLK_DEV_INTEGRITY) 402 #if defined(CONFIG_BLK_DEV_INTEGRITY)
402 mempool_t *bio_integrity_pool; 403 mempool_t *bio_integrity_pool;
403 #endif 404 #endif
404 mempool_t *bvec_pools[BIOVEC_NR_POOLS]; 405 mempool_t *bvec_pools[BIOVEC_NR_POOLS];
405 }; 406 };
406 407
407 struct biovec_slab { 408 struct biovec_slab {
408 int nr_vecs; 409 int nr_vecs;
409 char *name; 410 char *name;
410 struct kmem_cache *slab; 411 struct kmem_cache *slab;
411 }; 412 };
412 413
413 extern struct bio_set *fs_bio_set; 414 extern struct bio_set *fs_bio_set;
414 415
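
Stacking drivers typically keep a private bio_set, as the comment above describes, so their allocations do not compete with fs_bio_set. A minimal sketch of creating one and allocating from it; the names, pool sizes, and the meaning assumed for bioset_create()'s two arguments are illustrative:

#include <linux/bio.h>
#include <linux/errno.h>

static struct bio_set *example_bs;	/* driver-private pool, illustration only */

static int example_pool_init(void)
{
	example_bs = bioset_create(BIO_POOL_SIZE, 2);
	return example_bs ? 0 : -ENOMEM;
}

static struct bio *example_alloc_bio(void)
{
	/* GFP_NOIO: this may run on the writeback path */
	return bio_alloc_bioset(GFP_NOIO, 1, example_bs);
}

static void example_pool_exit(void)
{
	bioset_free(example_bs);
}
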
415 /* 416 /*
416 * a small number of entries is fine, not going to be performance critical. 417 * a small number of entries is fine, not going to be performance critical.
417 * basically we just need to survive 418 * basically we just need to survive
418 */ 419 */
419 #define BIO_SPLIT_ENTRIES 2 420 #define BIO_SPLIT_ENTRIES 2
420 421
421 #ifdef CONFIG_HIGHMEM 422 #ifdef CONFIG_HIGHMEM
422 /* 423 /*
423 * remember to add offset! and never ever reenable interrupts between a 424 * remember to add offset! and never ever reenable interrupts between a
424 * bvec_kmap_irq and bvec_kunmap_irq!! 425 * bvec_kmap_irq and bvec_kunmap_irq!!
425 * 426 *
426 * This function MUST be inlined - it plays with the CPU interrupt flags. 427 * This function MUST be inlined - it plays with the CPU interrupt flags.
427 */ 428 */
428 static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) 429 static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
429 { 430 {
430 unsigned long addr; 431 unsigned long addr;
431 432
432 /* 433 /*
433 * might not be a highmem page, but the preempt/irq count 434 * might not be a highmem page, but the preempt/irq count
434 * balancing is a lot nicer this way 435 * balancing is a lot nicer this way
435 */ 436 */
436 local_irq_save(*flags); 437 local_irq_save(*flags);
437 addr = (unsigned long) kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ); 438 addr = (unsigned long) kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ);
438 439
439 BUG_ON(addr & ~PAGE_MASK); 440 BUG_ON(addr & ~PAGE_MASK);
440 441
441 return (char *) addr + bvec->bv_offset; 442 return (char *) addr + bvec->bv_offset;
442 } 443 }
443 444
444 static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) 445 static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
445 { 446 {
446 unsigned long ptr = (unsigned long) buffer & PAGE_MASK; 447 unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
447 448
448 kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ); 449 kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ);
449 local_irq_restore(*flags); 450 local_irq_restore(*flags);
450 } 451 }
451 452
452 #else 453 #else
453 #define bvec_kmap_irq(bvec, flags) (page_address((bvec)->bv_page) + (bvec)->bv_offset) 454 #define bvec_kmap_irq(bvec, flags) (page_address((bvec)->bv_page) + (bvec)->bv_offset)
454 #define bvec_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0) 455 #define bvec_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0)
455 #endif 456 #endif
456 457
457 static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, 458 static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx,
458 unsigned long *flags) 459 unsigned long *flags)
459 { 460 {
460 return bvec_kmap_irq(bio_iovec_idx(bio, idx), flags); 461 return bvec_kmap_irq(bio_iovec_idx(bio, idx), flags);
461 } 462 }
462 #define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags) 463 #define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags)
463 464
464 #define bio_kmap_irq(bio, flags) \ 465 #define bio_kmap_irq(bio, flags) \
465 __bio_kmap_irq((bio), (bio)->bi_idx, (flags)) 466 __bio_kmap_irq((bio), (bio)->bi_idx, (flags))
466 #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) 467 #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags)
467 468
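
The kmap_irq helpers above exist for the occasional PIO fallback on highmem queues. A short illustration of the copy pattern the comment warns about, keeping interrupts disabled between map and unmap; the function name is made up:

#include <linux/bio.h>
#include <linux/string.h>

/* Illustration only: copy the current segment of a bio into a bounce buffer. */
static void example_copy_current_segment(struct bio *bio, void *dst)
{
	unsigned long flags;
	char *src = bio_kmap_irq(bio, &flags);

	memcpy(dst, src, bio_iovec(bio)->bv_len);
	bio_kunmap_irq(src, &flags);	/* IRQs stay off between map and unmap */
}
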
468 /* 469 /*
469 * Check whether this bio carries any data or not. A NULL bio is allowed. 470 * Check whether this bio carries any data or not. A NULL bio is allowed.
470 */ 471 */
471 static inline int bio_has_data(struct bio *bio) 472 static inline int bio_has_data(struct bio *bio)
472 { 473 {
473 return bio && bio->bi_io_vec != NULL; 474 return bio && bio->bi_io_vec != NULL;
474 } 475 }
475 476
476 #if defined(CONFIG_BLK_DEV_INTEGRITY) 477 #if defined(CONFIG_BLK_DEV_INTEGRITY)
477 478
478 #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)])) 479 #define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)]))
479 #define bip_vec(bip) bip_vec_idx(bip, 0) 480 #define bip_vec(bip) bip_vec_idx(bip, 0)
480 481
481 #define __bip_for_each_vec(bvl, bip, i, start_idx) \ 482 #define __bip_for_each_vec(bvl, bip, i, start_idx) \
482 for (bvl = bip_vec_idx((bip), (start_idx)), i = (start_idx); \ 483 for (bvl = bip_vec_idx((bip), (start_idx)), i = (start_idx); \
483 i < (bip)->bip_vcnt; \ 484 i < (bip)->bip_vcnt; \
484 bvl++, i++) 485 bvl++, i++)
485 486
486 #define bip_for_each_vec(bvl, bip, i) \ 487 #define bip_for_each_vec(bvl, bip, i) \
487 __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx) 488 __bip_for_each_vec(bvl, bip, i, (bip)->bip_idx)
488 489
489 #define bio_integrity(bio) (bio->bi_integrity != NULL) 490 #define bio_integrity(bio) (bio->bi_integrity != NULL)
490 491
491 extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *); 492 extern struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *, gfp_t, unsigned int, struct bio_set *);
492 extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); 493 extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
493 extern void bio_integrity_free(struct bio *, struct bio_set *); 494 extern void bio_integrity_free(struct bio *, struct bio_set *);
494 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); 495 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
495 extern int bio_integrity_enabled(struct bio *bio); 496 extern int bio_integrity_enabled(struct bio *bio);
496 extern int bio_integrity_set_tag(struct bio *, void *, unsigned int); 497 extern int bio_integrity_set_tag(struct bio *, void *, unsigned int);
497 extern int bio_integrity_get_tag(struct bio *, void *, unsigned int); 498 extern int bio_integrity_get_tag(struct bio *, void *, unsigned int);
498 extern int bio_integrity_prep(struct bio *); 499 extern int bio_integrity_prep(struct bio *);
499 extern void bio_integrity_endio(struct bio *, int); 500 extern void bio_integrity_endio(struct bio *, int);
500 extern void bio_integrity_advance(struct bio *, unsigned int); 501 extern void bio_integrity_advance(struct bio *, unsigned int);
501 extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int); 502 extern void bio_integrity_trim(struct bio *, unsigned int, unsigned int);
502 extern void bio_integrity_split(struct bio *, struct bio_pair *, int); 503 extern void bio_integrity_split(struct bio *, struct bio_pair *, int);
503 extern int bio_integrity_clone(struct bio *, struct bio *, struct bio_set *); 504 extern int bio_integrity_clone(struct bio *, struct bio *, struct bio_set *);
504 extern int bioset_integrity_create(struct bio_set *, int); 505 extern int bioset_integrity_create(struct bio_set *, int);
505 extern void bioset_integrity_free(struct bio_set *); 506 extern void bioset_integrity_free(struct bio_set *);
506 extern void bio_integrity_init_slab(void); 507 extern void bio_integrity_init_slab(void);
507 508
508 #else /* CONFIG_BLK_DEV_INTEGRITY */ 509 #else /* CONFIG_BLK_DEV_INTEGRITY */
509 510
510 #define bio_integrity(a) (0) 511 #define bio_integrity(a) (0)
511 #define bioset_integrity_create(a, b) (0) 512 #define bioset_integrity_create(a, b) (0)
512 #define bio_integrity_prep(a) (0) 513 #define bio_integrity_prep(a) (0)
513 #define bio_integrity_enabled(a) (0) 514 #define bio_integrity_enabled(a) (0)
514 #define bio_integrity_clone(a, b, c) (0) 515 #define bio_integrity_clone(a, b, c) (0)
515 #define bioset_integrity_free(a) do { } while (0) 516 #define bioset_integrity_free(a) do { } while (0)
516 #define bio_integrity_free(a, b) do { } while (0) 517 #define bio_integrity_free(a, b) do { } while (0)
517 #define bio_integrity_endio(a, b) do { } while (0) 518 #define bio_integrity_endio(a, b) do { } while (0)
518 #define bio_integrity_advance(a, b) do { } while (0) 519 #define bio_integrity_advance(a, b) do { } while (0)
519 #define bio_integrity_trim(a, b, c) do { } while (0) 520 #define bio_integrity_trim(a, b, c) do { } while (0)
520 #define bio_integrity_split(a, b, c) do { } while (0) 521 #define bio_integrity_split(a, b, c) do { } while (0)
521 #define bio_integrity_set_tag(a, b, c) do { } while (0) 522 #define bio_integrity_set_tag(a, b, c) do { } while (0)
522 #define bio_integrity_get_tag(a, b, c) do { } while (0) 523 #define bio_integrity_get_tag(a, b, c) do { } while (0)
523 #define bio_integrity_init_slab(a) do { } while (0) 524 #define bio_integrity_init_slab(a) do { } while (0)
524 525
525 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 526 #endif /* CONFIG_BLK_DEV_INTEGRITY */
526 527
527 #endif /* CONFIG_BLOCK */ 528 #endif /* CONFIG_BLOCK */
528 #endif /* __LINUX_BIO_H */ 529 #endif /* __LINUX_BIO_H */
529 530
include/linux/buffer_head.h
1 /* 1 /*
2 * include/linux/buffer_head.h 2 * include/linux/buffer_head.h
3 * 3 *
4 * Everything to do with buffer_heads. 4 * Everything to do with buffer_heads.
5 */ 5 */
6 6
7 #ifndef _LINUX_BUFFER_HEAD_H 7 #ifndef _LINUX_BUFFER_HEAD_H
8 #define _LINUX_BUFFER_HEAD_H 8 #define _LINUX_BUFFER_HEAD_H
9 9
10 #include <linux/types.h> 10 #include <linux/types.h>
11 #include <linux/fs.h> 11 #include <linux/fs.h>
12 #include <linux/linkage.h> 12 #include <linux/linkage.h>
13 #include <linux/pagemap.h> 13 #include <linux/pagemap.h>
14 #include <linux/wait.h> 14 #include <linux/wait.h>
15 #include <asm/atomic.h> 15 #include <asm/atomic.h>
16 16
17 #ifdef CONFIG_BLOCK 17 #ifdef CONFIG_BLOCK
18 18
19 enum bh_state_bits { 19 enum bh_state_bits {
20 BH_Uptodate, /* Contains valid data */ 20 BH_Uptodate, /* Contains valid data */
21 BH_Dirty, /* Is dirty */ 21 BH_Dirty, /* Is dirty */
22 BH_Lock, /* Is locked */ 22 BH_Lock, /* Is locked */
23 BH_Req, /* Has been submitted for I/O */ 23 BH_Req, /* Has been submitted for I/O */
24 BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise 24 BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise
25 * IO completion of other buffers in the page 25 * IO completion of other buffers in the page
26 */ 26 */
27 27
28 BH_Mapped, /* Has a disk mapping */ 28 BH_Mapped, /* Has a disk mapping */
29 BH_New, /* Disk mapping was newly created by get_block */ 29 BH_New, /* Disk mapping was newly created by get_block */
30 BH_Async_Read, /* Is under end_buffer_async_read I/O */ 30 BH_Async_Read, /* Is under end_buffer_async_read I/O */
31 BH_Async_Write, /* Is under end_buffer_async_write I/O */ 31 BH_Async_Write, /* Is under end_buffer_async_write I/O */
32 BH_Delay, /* Buffer is not yet allocated on disk */ 32 BH_Delay, /* Buffer is not yet allocated on disk */
33 BH_Boundary, /* Block is followed by a discontiguity */ 33 BH_Boundary, /* Block is followed by a discontiguity */
34 BH_Write_EIO, /* I/O error on write */ 34 BH_Write_EIO, /* I/O error on write */
35 BH_Ordered, /* ordered write */ 35 BH_Ordered, /* ordered write */
36 BH_Eopnotsupp, /* operation not supported (barrier) */ 36 BH_Eopnotsupp, /* operation not supported (barrier) */
37 BH_Unwritten, /* Buffer is allocated on disk but not written */ 37 BH_Unwritten, /* Buffer is allocated on disk but not written */
38 BH_Quiet, /* Buffer Error Printks to be quiet */
38 39
39 BH_PrivateStart,/* not a state bit, but the first bit available 40 BH_PrivateStart,/* not a state bit, but the first bit available
40 * for private allocation by other entities 41 * for private allocation by other entities
41 */ 42 */
42 }; 43 };
43 44
44 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512) 45 #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
45 46
46 struct page; 47 struct page;
47 struct buffer_head; 48 struct buffer_head;
48 struct address_space; 49 struct address_space;
49 typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); 50 typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);
50 51
51 /* 52 /*
52 * Historically, a buffer_head was used to map a single block 53 * Historically, a buffer_head was used to map a single block
53 * within a page, and of course as the unit of I/O through the 54 * within a page, and of course as the unit of I/O through the
54 * filesystem and block layers. Nowadays the basic I/O unit 55 * filesystem and block layers. Nowadays the basic I/O unit
55 * is the bio, and buffer_heads are used for extracting block 56 * is the bio, and buffer_heads are used for extracting block
56 * mappings (via a get_block_t call), for tracking state within 57 * mappings (via a get_block_t call), for tracking state within
57 * a page (via a page_mapping) and for wrapping bio submission 58 * a page (via a page_mapping) and for wrapping bio submission
58 * for backward compatibility reasons (e.g. submit_bh). 59 * for backward compatibility reasons (e.g. submit_bh).
59 */ 60 */
60 struct buffer_head { 61 struct buffer_head {
61 unsigned long b_state; /* buffer state bitmap (see above) */ 62 unsigned long b_state; /* buffer state bitmap (see above) */
62 struct buffer_head *b_this_page;/* circular list of page's buffers */ 63 struct buffer_head *b_this_page;/* circular list of page's buffers */
63 struct page *b_page; /* the page this bh is mapped to */ 64 struct page *b_page; /* the page this bh is mapped to */
64 65
65 sector_t b_blocknr; /* start block number */ 66 sector_t b_blocknr; /* start block number */
66 size_t b_size; /* size of mapping */ 67 size_t b_size; /* size of mapping */
67 char *b_data; /* pointer to data within the page */ 68 char *b_data; /* pointer to data within the page */
68 69
69 struct block_device *b_bdev; 70 struct block_device *b_bdev;
70 bh_end_io_t *b_end_io; /* I/O completion */ 71 bh_end_io_t *b_end_io; /* I/O completion */
71 void *b_private; /* reserved for b_end_io */ 72 void *b_private; /* reserved for b_end_io */
72 struct list_head b_assoc_buffers; /* associated with another mapping */ 73 struct list_head b_assoc_buffers; /* associated with another mapping */
73 struct address_space *b_assoc_map; /* mapping this buffer is 74 struct address_space *b_assoc_map; /* mapping this buffer is
74 associated with */ 75 associated with */
75 atomic_t b_count; /* users using this buffer_head */ 76 atomic_t b_count; /* users using this buffer_head */
76 }; 77 };
77 78
78 /* 79 /*
79 * macro tricks to expand the set_buffer_foo(), clear_buffer_foo() 80 * macro tricks to expand the set_buffer_foo(), clear_buffer_foo()
80 * and buffer_foo() functions. 81 * and buffer_foo() functions.
81 */ 82 */
82 #define BUFFER_FNS(bit, name) \ 83 #define BUFFER_FNS(bit, name) \
83 static inline void set_buffer_##name(struct buffer_head *bh) \ 84 static inline void set_buffer_##name(struct buffer_head *bh) \
84 { \ 85 { \
85 set_bit(BH_##bit, &(bh)->b_state); \ 86 set_bit(BH_##bit, &(bh)->b_state); \
86 } \ 87 } \
87 static inline void clear_buffer_##name(struct buffer_head *bh) \ 88 static inline void clear_buffer_##name(struct buffer_head *bh) \
88 { \ 89 { \
89 clear_bit(BH_##bit, &(bh)->b_state); \ 90 clear_bit(BH_##bit, &(bh)->b_state); \
90 } \ 91 } \
91 static inline int buffer_##name(const struct buffer_head *bh) \ 92 static inline int buffer_##name(const struct buffer_head *bh) \
92 { \ 93 { \
93 return test_bit(BH_##bit, &(bh)->b_state); \ 94 return test_bit(BH_##bit, &(bh)->b_state); \
94 } 95 }
95 96
96 /* 97 /*
97 * test_set_buffer_foo() and test_clear_buffer_foo() 98 * test_set_buffer_foo() and test_clear_buffer_foo()
98 */ 99 */
99 #define TAS_BUFFER_FNS(bit, name) \ 100 #define TAS_BUFFER_FNS(bit, name) \
100 static inline int test_set_buffer_##name(struct buffer_head *bh) \ 101 static inline int test_set_buffer_##name(struct buffer_head *bh) \
101 { \ 102 { \
102 return test_and_set_bit(BH_##bit, &(bh)->b_state); \ 103 return test_and_set_bit(BH_##bit, &(bh)->b_state); \
103 } \ 104 } \
104 static inline int test_clear_buffer_##name(struct buffer_head *bh) \ 105 static inline int test_clear_buffer_##name(struct buffer_head *bh) \
105 { \ 106 { \
106 return test_and_clear_bit(BH_##bit, &(bh)->b_state); \ 107 return test_and_clear_bit(BH_##bit, &(bh)->b_state); \
107 } \ 108 } \
108 109
109 /* 110 /*
110 * Emit the buffer bitops functions. Note that there are also functions 111 * Emit the buffer bitops functions. Note that there are also functions
111 * of the form "mark_buffer_foo()". These are higher-level functions which 112 * of the form "mark_buffer_foo()". These are higher-level functions which
112 * do something in addition to setting a b_state bit. 113 * do something in addition to setting a b_state bit.
113 */ 114 */
114 BUFFER_FNS(Uptodate, uptodate) 115 BUFFER_FNS(Uptodate, uptodate)
115 BUFFER_FNS(Dirty, dirty) 116 BUFFER_FNS(Dirty, dirty)
116 TAS_BUFFER_FNS(Dirty, dirty) 117 TAS_BUFFER_FNS(Dirty, dirty)
117 BUFFER_FNS(Lock, locked) 118 BUFFER_FNS(Lock, locked)
118 BUFFER_FNS(Req, req) 119 BUFFER_FNS(Req, req)
119 TAS_BUFFER_FNS(Req, req) 120 TAS_BUFFER_FNS(Req, req)
120 BUFFER_FNS(Mapped, mapped) 121 BUFFER_FNS(Mapped, mapped)
121 BUFFER_FNS(New, new) 122 BUFFER_FNS(New, new)
122 BUFFER_FNS(Async_Read, async_read) 123 BUFFER_FNS(Async_Read, async_read)
123 BUFFER_FNS(Async_Write, async_write) 124 BUFFER_FNS(Async_Write, async_write)
124 BUFFER_FNS(Delay, delay) 125 BUFFER_FNS(Delay, delay)
125 BUFFER_FNS(Boundary, boundary) 126 BUFFER_FNS(Boundary, boundary)
126 BUFFER_FNS(Write_EIO, write_io_error) 127 BUFFER_FNS(Write_EIO, write_io_error)
127 BUFFER_FNS(Ordered, ordered) 128 BUFFER_FNS(Ordered, ordered)
128 BUFFER_FNS(Eopnotsupp, eopnotsupp) 129 BUFFER_FNS(Eopnotsupp, eopnotsupp)
129 BUFFER_FNS(Unwritten, unwritten) 130 BUFFER_FNS(Unwritten, unwritten)
130 131
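
For reference, this is what the BUFFER_FNS() macro above would generate for the new BH_Quiet bit if an accessor were emitted for it. This hunk only adds the state bit itself; the expansion below is shown purely for illustration of the macro mechanism:

/* Equivalent expansion of BUFFER_FNS(Quiet, quiet) -- illustration only. */
static inline void set_buffer_quiet(struct buffer_head *bh)
{
	set_bit(BH_Quiet, &bh->b_state);
}
static inline void clear_buffer_quiet(struct buffer_head *bh)
{
	clear_bit(BH_Quiet, &bh->b_state);
}
static inline int buffer_quiet(const struct buffer_head *bh)
{
	return test_bit(BH_Quiet, &bh->b_state);
}
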
131 #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) 132 #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
132 #define touch_buffer(bh) mark_page_accessed(bh->b_page) 133 #define touch_buffer(bh) mark_page_accessed(bh->b_page)
133 134
134 /* If we *know* page->private refers to buffer_heads */ 135 /* If we *know* page->private refers to buffer_heads */
135 #define page_buffers(page) \ 136 #define page_buffers(page) \
136 ({ \ 137 ({ \
137 BUG_ON(!PagePrivate(page)); \ 138 BUG_ON(!PagePrivate(page)); \
138 ((struct buffer_head *)page_private(page)); \ 139 ((struct buffer_head *)page_private(page)); \
139 }) 140 })
140 #define page_has_buffers(page) PagePrivate(page) 141 #define page_has_buffers(page) PagePrivate(page)
141 142
142 /* 143 /*
143 * Declarations 144 * Declarations
144 */ 145 */
145 146
146 void mark_buffer_dirty(struct buffer_head *bh); 147 void mark_buffer_dirty(struct buffer_head *bh);
147 void init_buffer(struct buffer_head *, bh_end_io_t *, void *); 148 void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
148 void set_bh_page(struct buffer_head *bh, 149 void set_bh_page(struct buffer_head *bh,
149 struct page *page, unsigned long offset); 150 struct page *page, unsigned long offset);
150 int try_to_free_buffers(struct page *); 151 int try_to_free_buffers(struct page *);
151 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, 152 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
152 int retry); 153 int retry);
153 void create_empty_buffers(struct page *, unsigned long, 154 void create_empty_buffers(struct page *, unsigned long,
154 unsigned long b_state); 155 unsigned long b_state);
155 void end_buffer_read_sync(struct buffer_head *bh, int uptodate); 156 void end_buffer_read_sync(struct buffer_head *bh, int uptodate);
156 void end_buffer_write_sync(struct buffer_head *bh, int uptodate); 157 void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
157 158
158 /* Things to do with buffers at mapping->private_list */ 159 /* Things to do with buffers at mapping->private_list */
159 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); 160 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
160 int inode_has_buffers(struct inode *); 161 int inode_has_buffers(struct inode *);
161 void invalidate_inode_buffers(struct inode *); 162 void invalidate_inode_buffers(struct inode *);
162 int remove_inode_buffers(struct inode *inode); 163 int remove_inode_buffers(struct inode *inode);
163 int sync_mapping_buffers(struct address_space *mapping); 164 int sync_mapping_buffers(struct address_space *mapping);
164 void unmap_underlying_metadata(struct block_device *bdev, sector_t block); 165 void unmap_underlying_metadata(struct block_device *bdev, sector_t block);
165 166
166 void mark_buffer_async_write(struct buffer_head *bh); 167 void mark_buffer_async_write(struct buffer_head *bh);
167 void invalidate_bdev(struct block_device *); 168 void invalidate_bdev(struct block_device *);
168 int sync_blockdev(struct block_device *bdev); 169 int sync_blockdev(struct block_device *bdev);
169 void __wait_on_buffer(struct buffer_head *); 170 void __wait_on_buffer(struct buffer_head *);
170 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); 171 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh);
171 int fsync_bdev(struct block_device *); 172 int fsync_bdev(struct block_device *);
172 struct super_block *freeze_bdev(struct block_device *); 173 struct super_block *freeze_bdev(struct block_device *);
173 void thaw_bdev(struct block_device *, struct super_block *); 174 void thaw_bdev(struct block_device *, struct super_block *);
174 int fsync_super(struct super_block *); 175 int fsync_super(struct super_block *);
175 int fsync_no_super(struct block_device *); 176 int fsync_no_super(struct block_device *);
176 struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, 177 struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block,
177 unsigned size); 178 unsigned size);
178 struct buffer_head *__getblk(struct block_device *bdev, sector_t block, 179 struct buffer_head *__getblk(struct block_device *bdev, sector_t block,
179 unsigned size); 180 unsigned size);
180 void __brelse(struct buffer_head *); 181 void __brelse(struct buffer_head *);
181 void __bforget(struct buffer_head *); 182 void __bforget(struct buffer_head *);
182 void __breadahead(struct block_device *, sector_t block, unsigned int size); 183 void __breadahead(struct block_device *, sector_t block, unsigned int size);
183 struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size); 184 struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size);
184 void invalidate_bh_lrus(void); 185 void invalidate_bh_lrus(void);
185 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); 186 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
186 void free_buffer_head(struct buffer_head * bh); 187 void free_buffer_head(struct buffer_head * bh);
187 void unlock_buffer(struct buffer_head *bh); 188 void unlock_buffer(struct buffer_head *bh);
188 void __lock_buffer(struct buffer_head *bh); 189 void __lock_buffer(struct buffer_head *bh);
189 void ll_rw_block(int, int, struct buffer_head * bh[]); 190 void ll_rw_block(int, int, struct buffer_head * bh[]);
190 int sync_dirty_buffer(struct buffer_head *bh); 191 int sync_dirty_buffer(struct buffer_head *bh);
191 int submit_bh(int, struct buffer_head *); 192 int submit_bh(int, struct buffer_head *);
192 void write_boundary_block(struct block_device *bdev, 193 void write_boundary_block(struct block_device *bdev,
193 sector_t bblock, unsigned blocksize); 194 sector_t bblock, unsigned blocksize);
194 int bh_uptodate_or_lock(struct buffer_head *bh); 195 int bh_uptodate_or_lock(struct buffer_head *bh);
195 int bh_submit_read(struct buffer_head *bh); 196 int bh_submit_read(struct buffer_head *bh);
196 197
197 extern int buffer_heads_over_limit; 198 extern int buffer_heads_over_limit;
198 199
199 /* 200 /*
200 * Generic address_space_operations implementations for buffer_head-backed 201 * Generic address_space_operations implementations for buffer_head-backed
201 * address_spaces. 202 * address_spaces.
202 */ 203 */
203 void block_invalidatepage(struct page *page, unsigned long offset); 204 void block_invalidatepage(struct page *page, unsigned long offset);
204 int block_write_full_page(struct page *page, get_block_t *get_block, 205 int block_write_full_page(struct page *page, get_block_t *get_block,
205 struct writeback_control *wbc); 206 struct writeback_control *wbc);
206 int block_read_full_page(struct page*, get_block_t*); 207 int block_read_full_page(struct page*, get_block_t*);
207 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, 208 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
208 unsigned long from); 209 unsigned long from);
209 int block_write_begin(struct file *, struct address_space *, 210 int block_write_begin(struct file *, struct address_space *,
210 loff_t, unsigned, unsigned, 211 loff_t, unsigned, unsigned,
211 struct page **, void **, get_block_t*); 212 struct page **, void **, get_block_t*);
212 int block_write_end(struct file *, struct address_space *, 213 int block_write_end(struct file *, struct address_space *,
213 loff_t, unsigned, unsigned, 214 loff_t, unsigned, unsigned,
214 struct page *, void *); 215 struct page *, void *);
215 int generic_write_end(struct file *, struct address_space *, 216 int generic_write_end(struct file *, struct address_space *,
216 loff_t, unsigned, unsigned, 217 loff_t, unsigned, unsigned,
217 struct page *, void *); 218 struct page *, void *);
218 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); 219 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to);
219 int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); 220 int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
220 int cont_write_begin(struct file *, struct address_space *, loff_t, 221 int cont_write_begin(struct file *, struct address_space *, loff_t,
221 unsigned, unsigned, struct page **, void **, 222 unsigned, unsigned, struct page **, void **,
222 get_block_t *, loff_t *); 223 get_block_t *, loff_t *);
223 int generic_cont_expand_simple(struct inode *inode, loff_t size); 224 int generic_cont_expand_simple(struct inode *inode, loff_t size);
224 int block_commit_write(struct page *page, unsigned from, unsigned to); 225 int block_commit_write(struct page *page, unsigned from, unsigned to);
225 int block_page_mkwrite(struct vm_area_struct *vma, struct page *page, 226 int block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
226 get_block_t get_block); 227 get_block_t get_block);
227 void block_sync_page(struct page *); 228 void block_sync_page(struct page *);
228 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); 229 sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
229 int block_truncate_page(struct address_space *, loff_t, get_block_t *); 230 int block_truncate_page(struct address_space *, loff_t, get_block_t *);
230 int file_fsync(struct file *, struct dentry *, int); 231 int file_fsync(struct file *, struct dentry *, int);
231 int nobh_write_begin(struct file *, struct address_space *, 232 int nobh_write_begin(struct file *, struct address_space *,
232 loff_t, unsigned, unsigned, 233 loff_t, unsigned, unsigned,
233 struct page **, void **, get_block_t*); 234 struct page **, void **, get_block_t*);
234 int nobh_write_end(struct file *, struct address_space *, 235 int nobh_write_end(struct file *, struct address_space *,
235 loff_t, unsigned, unsigned, 236 loff_t, unsigned, unsigned,
236 struct page *, void *); 237 struct page *, void *);
237 int nobh_truncate_page(struct address_space *, loff_t, get_block_t *); 238 int nobh_truncate_page(struct address_space *, loff_t, get_block_t *);
238 int nobh_writepage(struct page *page, get_block_t *get_block, 239 int nobh_writepage(struct page *page, get_block_t *get_block,
239 struct writeback_control *wbc); 240 struct writeback_control *wbc);
240 241
241 void buffer_init(void); 242 void buffer_init(void);
242 243
243 /* 244 /*
244 * inline definitions 245 * inline definitions
245 */ 246 */
246 247
247 static inline void attach_page_buffers(struct page *page, 248 static inline void attach_page_buffers(struct page *page,
248 struct buffer_head *head) 249 struct buffer_head *head)
249 { 250 {
250 page_cache_get(page); 251 page_cache_get(page);
251 SetPagePrivate(page); 252 SetPagePrivate(page);
252 set_page_private(page, (unsigned long)head); 253 set_page_private(page, (unsigned long)head);
253 } 254 }
254 255
255 static inline void get_bh(struct buffer_head *bh) 256 static inline void get_bh(struct buffer_head *bh)
256 { 257 {
257 atomic_inc(&bh->b_count); 258 atomic_inc(&bh->b_count);
258 } 259 }
259 260
260 static inline void put_bh(struct buffer_head *bh) 261 static inline void put_bh(struct buffer_head *bh)
261 { 262 {
262 smp_mb__before_atomic_dec(); 263 smp_mb__before_atomic_dec();
263 atomic_dec(&bh->b_count); 264 atomic_dec(&bh->b_count);
264 } 265 }
265 266
266 static inline void brelse(struct buffer_head *bh) 267 static inline void brelse(struct buffer_head *bh)
267 { 268 {
268 if (bh) 269 if (bh)
269 __brelse(bh); 270 __brelse(bh);
270 } 271 }
271 272
272 static inline void bforget(struct buffer_head *bh) 273 static inline void bforget(struct buffer_head *bh)
273 { 274 {
274 if (bh) 275 if (bh)
275 __bforget(bh); 276 __bforget(bh);
276 } 277 }
277 278
278 static inline struct buffer_head * 279 static inline struct buffer_head *
279 sb_bread(struct super_block *sb, sector_t block) 280 sb_bread(struct super_block *sb, sector_t block)
280 { 281 {
281 return __bread(sb->s_bdev, block, sb->s_blocksize); 282 return __bread(sb->s_bdev, block, sb->s_blocksize);
282 } 283 }
283 284
284 static inline void 285 static inline void
285 sb_breadahead(struct super_block *sb, sector_t block) 286 sb_breadahead(struct super_block *sb, sector_t block)
286 { 287 {
287 __breadahead(sb->s_bdev, block, sb->s_blocksize); 288 __breadahead(sb->s_bdev, block, sb->s_blocksize);
288 } 289 }
289 290
290 static inline struct buffer_head * 291 static inline struct buffer_head *
291 sb_getblk(struct super_block *sb, sector_t block) 292 sb_getblk(struct super_block *sb, sector_t block)
292 { 293 {
293 return __getblk(sb->s_bdev, block, sb->s_blocksize); 294 return __getblk(sb->s_bdev, block, sb->s_blocksize);
294 } 295 }
295 296
296 static inline struct buffer_head * 297 static inline struct buffer_head *
297 sb_find_get_block(struct super_block *sb, sector_t block) 298 sb_find_get_block(struct super_block *sb, sector_t block)
298 { 299 {
299 return __find_get_block(sb->s_bdev, block, sb->s_blocksize); 300 return __find_get_block(sb->s_bdev, block, sb->s_blocksize);
300 } 301 }
301 302
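
The sb_* wrappers above cover the common metadata access patterns. For completeness, the classic read-check-release sequence a filesystem would use with them; purely illustrative, the function name is made up:

#include <linux/buffer_head.h>

/* Illustration only: read one metadata block and release the reference. */
static int example_read_block(struct super_block *sb, sector_t block)
{
	struct buffer_head *bh = sb_bread(sb, block);

	if (!bh)
		return -EIO;	/* read failed or buffer never became uptodate */

	/* ... inspect bh->b_data here ... */

	brelse(bh);		/* drop the reference taken by sb_bread() */
	return 0;
}
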
302 static inline void 303 static inline void
303 map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block) 304 map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block)
304 { 305 {
305 set_buffer_mapped(bh); 306 set_buffer_mapped(bh);
306 bh->b_bdev = sb->s_bdev; 307 bh->b_bdev = sb->s_bdev;
307 bh->b_blocknr = block; 308 bh->b_blocknr = block;
308 bh->b_size = sb->s_blocksize; 309 bh->b_size = sb->s_blocksize;
309 } 310 }
310 311
311 /* 312 /*
312 * Calling wait_on_buffer() for a zero-ref buffer is illegal, so we call into 313 * Calling wait_on_buffer() for a zero-ref buffer is illegal, so we call into
313 * __wait_on_buffer() just to trip a debug check. Because debug code in inline 314 * __wait_on_buffer() just to trip a debug check. Because debug code in inline
314 * functions is bloaty. 315 * functions is bloaty.
315 */ 316 */
316 static inline void wait_on_buffer(struct buffer_head *bh) 317 static inline void wait_on_buffer(struct buffer_head *bh)
317 { 318 {
318 might_sleep(); 319 might_sleep();
319 if (buffer_locked(bh) || atomic_read(&bh->b_count) == 0) 320 if (buffer_locked(bh) || atomic_read(&bh->b_count) == 0)
320 __wait_on_buffer(bh); 321 __wait_on_buffer(bh);
321 } 322 }
322 323
323 static inline int trylock_buffer(struct buffer_head *bh) 324 static inline int trylock_buffer(struct buffer_head *bh)
324 { 325 {
325 return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state)); 326 return likely(!test_and_set_bit_lock(BH_Lock, &bh->b_state));
326 } 327 }
327 328
328 static inline void lock_buffer(struct buffer_head *bh) 329 static inline void lock_buffer(struct buffer_head *bh)
329 { 330 {
330 might_sleep(); 331 might_sleep();
331 if (!trylock_buffer(bh)) 332 if (!trylock_buffer(bh))
332 __lock_buffer(bh); 333 __lock_buffer(bh);
333 } 334 }
334 335
335 extern int __set_page_dirty_buffers(struct page *page); 336 extern int __set_page_dirty_buffers(struct page *page);
336 337
337 #else /* CONFIG_BLOCK */ 338 #else /* CONFIG_BLOCK */
338 339
339 static inline void buffer_init(void) {} 340 static inline void buffer_init(void) {}
340 static inline int try_to_free_buffers(struct page *page) { return 1; } 341 static inline int try_to_free_buffers(struct page *page) { return 1; }
341 static inline int sync_blockdev(struct block_device *bdev) { return 0; } 342 static inline int sync_blockdev(struct block_device *bdev) { return 0; }
342 static inline int inode_has_buffers(struct inode *inode) { return 0; } 343 static inline int inode_has_buffers(struct inode *inode) { return 0; }
343 static inline void invalidate_inode_buffers(struct inode *inode) {} 344 static inline void invalidate_inode_buffers(struct inode *inode) {}
344 static inline int remove_inode_buffers(struct inode *inode) { return 1; } 345 static inline int remove_inode_buffers(struct inode *inode) { return 1; }
345 static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } 346 static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
346 static inline void invalidate_bdev(struct block_device *bdev) {} 347 static inline void invalidate_bdev(struct block_device *bdev) {}
347 348
348 349
349 #endif /* CONFIG_BLOCK */ 350 #endif /* CONFIG_BLOCK */
350 #endif /* _LINUX_BUFFER_HEAD_H */ 351 #endif /* _LINUX_BUFFER_HEAD_H */
351 352
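
The hunks above only add the flag definitions; the code that consults them lives elsewhere. As a hedged sketch of the idea, a buffer-layer error path can check BH_Quiet before printing, roughly like the helper below. This is an illustration only, not the patch's own code, and the helper names are assumptions:

#include <linux/buffer_head.h>
#include <linux/kernel.h>

/*
 * Illustration only: decide whether a "Buffer I/O error" style message should
 * be printed for this buffer_head. Quiet buffers and ratelimited callers stay
 * silent.
 */
static int example_quiet_error(struct buffer_head *bh)
{
	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
		return 0;	/* not quiet and within the ratelimit: do print */
	return 1;		/* stay silent */
}

static void example_report_write_error(struct buffer_head *bh)
{
	char b[BDEVNAME_SIZE];

	if (!example_quiet_error(bh))
		printk(KERN_WARNING "lost page write due to I/O error on %s\n",
		       bdevname(bh->b_bdev, b));
}
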